BoilerpipeArticle 0.0.4 → 0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/boilerpipe_article.rb +153 -54
- metadata +18 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fccf00da5423dc69b01d5f7b43a0932574ac16e9
|
4
|
+
data.tar.gz: a9d9ae5187108e1d20d7d9a45926b811cde2f3cf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3c85b972589fd947fe9e2dd7606935fcd292e874e7e19ba1936d128eae26e03e8dc0e8e35afd40b82dac598925ba2e4de3d72da4d43f6a13ed36982a27c7ff7c
|
7
|
+
data.tar.gz: 10642f8bbf5573f8d4608945201cca270022b5e000bba26998670a75d235e5c234d5fd0047039d2a2afff38b5a696d4c82c10c68d2b9148104ce4cd24f09dcc3
|
data/lib/boilerpipe_article.rb
CHANGED
@@ -1,90 +1,189 @@
|
|
1
|
+
#Encoding: UTF-8
|
2
|
+
|
1
3
|
require 'nokogiri'
|
4
|
+
require 'mida'
|
2
5
|
|
3
6
|
class BoilerpipeArticle
|
4
7
|
def initialize(html)
|
5
|
-
@html = html
|
8
|
+
@html = html.gsub(/\s\s+/,' ')
|
9
|
+
@articlesStats = Hash.new
|
6
10
|
end
|
7
|
-
|
8
|
-
def getText(html = @html)
|
11
|
+
def removeBadHtmlTags(html = @html)
|
9
12
|
html = Nokogiri::HTML.parse(html).to_s
|
10
13
|
html.gsub!(/<!-[\s\S]*?->/, '')
|
11
14
|
html.gsub!(/\r?\n|\r/, '')
|
12
15
|
|
16
|
+
unwantedTags = ['strong','bold','i']
|
17
|
+
unwantedTags.each do |tag|
|
18
|
+
html.gsub!("<#{tag}>",'')
|
19
|
+
html.gsub!("</#{tag}>",'')
|
20
|
+
end
|
21
|
+
|
22
|
+
|
13
23
|
doc = Nokogiri::HTML(html)
|
14
|
-
|
24
|
+
|
25
|
+
badHtmlTags = ['script','style','head','nav','iframe','img','footer','ol','ul','li','a']
|
26
|
+
doc.css('*').each do |node|
|
27
|
+
node.remove if node.text.length < 3
|
28
|
+
end
|
15
29
|
badHtmlTags.each do |tag|
|
16
30
|
doc.search(tag).each do |src|
|
17
31
|
src.remove
|
18
32
|
end
|
19
33
|
end
|
20
|
-
|
34
|
+
# doc.css('a').each do |atag|
|
35
|
+
# atag = "#{atag.text}"
|
36
|
+
# puts atag
|
37
|
+
# end
|
21
38
|
html = doc.to_html.to_s
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
start = 0
|
27
|
-
close = 0
|
39
|
+
|
40
|
+
return html
|
41
|
+
end
|
42
|
+
def calculateDepth(html = @html)
|
28
43
|
articlesStats = Hash.new
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
44
|
+
doc = Nokogiri::HTML(html)
|
45
|
+
i = 0
|
46
|
+
doc.xpath('//text()').each do |node|
|
47
|
+
text = node.to_s
|
48
|
+
articlesStats.store(i,[node.text.to_s,node.ancestors.length.to_i,node.parent.name])
|
49
|
+
i+=1
|
50
|
+
end
|
51
|
+
return articlesStats
|
52
|
+
end
|
53
|
+
def removeSamePatterns(html)
|
54
|
+
doc = Nokogiri::HTML(html)
|
55
|
+
paths = Array.new
|
56
|
+
doc.css('*').each do |node|
|
57
|
+
s = node.path.gsub(/\[[\s\S]*?\]/, '')
|
58
|
+
paths.push(s)
|
59
|
+
end
|
60
|
+
final = []
|
61
|
+
(7..30).each do |i|
|
62
|
+
all = []
|
63
|
+
paths.each_with_index do |seq,a|
|
64
|
+
se = []
|
65
|
+
paths[a..-1].each_with_index do |s,ii|
|
66
|
+
se << s
|
67
|
+
break if ii == i-1
|
68
|
+
end
|
69
|
+
all << se
|
70
|
+
end
|
71
|
+
final << all
|
72
|
+
end
|
73
|
+
allDoubles = Hash.new
|
74
|
+
final.each_with_index do |seq,i|
|
75
|
+
counts = Hash.new(0)
|
76
|
+
seq.each do |name|
|
77
|
+
counts[name] += 1
|
78
|
+
end
|
79
|
+
counts = counts.sort_by{|k,v|v}.reverse.to_h
|
80
|
+
allDoubles.store(i,counts)
|
81
|
+
end
|
82
|
+
allDoubles.each do |i,doubles|
|
83
|
+
doubles.each do |path,count|
|
84
|
+
if count >= 7
|
85
|
+
doc.css('*').each do |node|
|
86
|
+
s = node.path.gsub(/\[[\s\S]*?\]/, '')
|
87
|
+
if path.include? s
|
88
|
+
node.remove
|
57
89
|
end
|
58
|
-
break
|
59
90
|
end
|
60
|
-
ii+=1
|
61
91
|
end
|
62
92
|
end
|
63
|
-
i+=1
|
64
93
|
end
|
94
|
+
return doc.to_s
|
95
|
+
end
|
96
|
+
def calculateBestDepth(articlesStats)
|
65
97
|
bestDepth = Hash.new(0)
|
66
98
|
articlesStats.each do |line,stats|
|
67
|
-
bestDepth[stats[1]]+=stats[0].
|
99
|
+
bestDepth[stats[1]]+=stats[0].length
|
68
100
|
end
|
69
|
-
|
101
|
+
bestvalues = bestDepth.sort_by {|key,value|value}.reverse.to_h
|
102
|
+
average = 0.0
|
103
|
+
bestDepth.each {|l,v|average+=v/bestDepth.keys.length.to_f}
|
104
|
+
texts = 0
|
105
|
+
bestDepth.each{|l,v|texts +=1 if v > average}
|
106
|
+
|
107
|
+
doubleTexts = false
|
108
|
+
doubleTexts = true if texts >= 2
|
109
|
+
best = bestvalues.keys[0]
|
110
|
+
|
111
|
+
return best,doubleTexts
|
112
|
+
end
|
113
|
+
|
114
|
+
def getTextOfBestDepth(articlesStats,best)
|
70
115
|
text = ''
|
71
116
|
articlesStats.each do |line,stats|
|
72
|
-
|
117
|
+
if stats[1] == best && (stats[-1].eql?('h1') || stats[-1].eql?('h2') || stats[-1].eql?('p'))
|
118
|
+
text = "#{text} <#{stats[-1]}>#{stats[0]}</#{stats[-1]}>" if stats[0].strip.length > 2
|
119
|
+
end
|
73
120
|
end
|
74
|
-
return
|
121
|
+
return text
|
75
122
|
end
|
76
123
|
|
77
|
-
def
|
124
|
+
def getMetas(html = @html)
|
78
125
|
metas = Hash.new
|
79
126
|
doc = Nokogiri.parse(html)
|
80
|
-
|
81
|
-
|
82
|
-
if
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
127
|
+
doc.xpath("//meta").each do |node|
|
128
|
+
name = node[node.attributes.keys[1]]
|
129
|
+
name = node[node.attributes.keys[0]] if node.attributes.keys[0] != 'content' && node.attributes.keys[0] != 'value'
|
130
|
+
content = node['content']
|
131
|
+
content = node['value'] if content == nil
|
132
|
+
|
133
|
+
metas.store(name,content)
|
87
134
|
end
|
88
135
|
return metas
|
89
136
|
end
|
137
|
+
def getOtherHTMLDescriptions(html = @html)
|
138
|
+
doc = Nokogiri.parse(html)
|
139
|
+
images = Array.new
|
140
|
+
headlines = Hash.new
|
141
|
+
links = Hash.new
|
142
|
+
5.times do |i|
|
143
|
+
hs = doc.xpath("//h#{i+1}")
|
144
|
+
texts = []
|
145
|
+
hs.each {|node| texts.push(node.text.to_s)}
|
146
|
+
headlines.store("h#{i+1}",texts)
|
147
|
+
end
|
148
|
+
|
149
|
+
imgs = doc.xpath('//img/@src')
|
150
|
+
imgs.each do |source|
|
151
|
+
images.push(source.text) if source.text.include?('http')
|
152
|
+
end
|
153
|
+
|
154
|
+
plinks = doc.xpath('//a/@href')
|
155
|
+
plinks.each do |source|
|
156
|
+
links.store(source.text,1) if source.text.strip.length > 2
|
157
|
+
end
|
158
|
+
|
159
|
+
return {'headlines'=>headlines,'images'=>images, 'links' => links.keys}
|
160
|
+
end
|
161
|
+
def getMicroData(html = @html)
|
162
|
+
doc = Mida::Document.new(html, "")
|
163
|
+
topLevel = Array.new
|
164
|
+
doc.items.each do |item|
|
165
|
+
topLevel.push(item.to_h)
|
166
|
+
end
|
167
|
+
return topLevel
|
168
|
+
end
|
169
|
+
def getAllText(html = @html)
|
170
|
+
doc = Nokogiri.parse(html)
|
171
|
+
doc.search('script').remove
|
172
|
+
doc.search('style').remove
|
173
|
+
return doc.text.gsub(/\s\s+/,' ')
|
174
|
+
end
|
175
|
+
def getArticle(html = @html)
|
176
|
+
html = removeBadHtmlTags(html)
|
177
|
+
articlesStats = calculateDepth(html)
|
178
|
+
best,doubleTexts = calculateBestDepth(articlesStats)
|
179
|
+
if doubleTexts
|
180
|
+
html = removeSamePatterns(html)
|
181
|
+
articlesStats,d = calculateDepth(html)
|
182
|
+
|
183
|
+
end
|
184
|
+
bestDepth,doubles = calculateBestDepth(articlesStats)
|
185
|
+
plainText = getTextOfBestDepth(articlesStats,bestDepth)
|
186
|
+
return plainText
|
187
|
+
end
|
188
|
+
|
90
189
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: BoilerpipeArticle
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: '0.1'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David Layer-Reiss
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-09-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,9 +24,24 @@ dependencies:
|
|
24
24
|
- - '='
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 1.6.8
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: mida
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.3.9
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.3.9
|
27
41
|
description: This gem removes the surplus “clutter” (boilerplate, templates) around
|
28
42
|
the main textual content of a web page (pure Ruby implementation). BoilerpipeArticle
|
29
|
-
can be also used to parse open graph meta data. Check GitHub for
|
43
|
+
can be also used to parse (open graph) meta data and microdata. Check GitHub for
|
44
|
+
usage examples.
|
30
45
|
email: layerreiss@gmail.com
|
31
46
|
executables: []
|
32
47
|
extensions: []
|