BoilerpipeArticle 0.0.4 → 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/boilerpipe_article.rb +153 -54
- metadata +18 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: fccf00da5423dc69b01d5f7b43a0932574ac16e9
|
4
|
+
data.tar.gz: a9d9ae5187108e1d20d7d9a45926b811cde2f3cf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3c85b972589fd947fe9e2dd7606935fcd292e874e7e19ba1936d128eae26e03e8dc0e8e35afd40b82dac598925ba2e4de3d72da4d43f6a13ed36982a27c7ff7c
|
7
|
+
data.tar.gz: 10642f8bbf5573f8d4608945201cca270022b5e000bba26998670a75d235e5c234d5fd0047039d2a2afff38b5a696d4c82c10c68d2b9148104ce4cd24f09dcc3
|
data/lib/boilerpipe_article.rb
CHANGED
@@ -1,90 +1,189 @@
|
|
1
|
+
#Encoding: UTF-8
|
2
|
+
|
1
3
|
require 'nokogiri'
|
4
|
+
require 'mida'
|
2
5
|
|
3
6
|
class BoilerpipeArticle
|
4
7
|
def initialize(html)
|
5
|
-
@html = html
|
8
|
+
@html = html.gsub(/\s\s+/,' ')
|
9
|
+
@articlesStats = Hash.new
|
6
10
|
end
|
7
|
-
|
8
|
-
def getText(html = @html)
|
11
|
+
def removeBadHtmlTags(html = @html)
|
9
12
|
html = Nokogiri::HTML.parse(html).to_s
|
10
13
|
html.gsub!(/<!-[\s\S]*?->/, '')
|
11
14
|
html.gsub!(/\r?\n|\r/, '')
|
12
15
|
|
16
|
+
unwantedTags = ['strong','bold','i']
|
17
|
+
unwantedTags.each do |tag|
|
18
|
+
html.gsub!("<#{tag}>",'')
|
19
|
+
html.gsub!("</#{tag}>",'')
|
20
|
+
end
|
21
|
+
|
22
|
+
|
13
23
|
doc = Nokogiri::HTML(html)
|
14
|
-
|
24
|
+
|
25
|
+
badHtmlTags = ['script','style','head','nav','iframe','img','footer','ol','ul','li','a']
|
26
|
+
doc.css('*').each do |node|
|
27
|
+
node.remove if node.text.length < 3
|
28
|
+
end
|
15
29
|
badHtmlTags.each do |tag|
|
16
30
|
doc.search(tag).each do |src|
|
17
31
|
src.remove
|
18
32
|
end
|
19
33
|
end
|
20
|
-
|
34
|
+
# doc.css('a').each do |atag|
|
35
|
+
# atag = "#{atag.text}"
|
36
|
+
# puts atag
|
37
|
+
# end
|
21
38
|
html = doc.to_html.to_s
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
start = 0
|
27
|
-
close = 0
|
39
|
+
|
40
|
+
return html
|
41
|
+
end
|
42
|
+
def calculateDepth(html = @html)
|
28
43
|
articlesStats = Hash.new
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
44
|
+
doc = Nokogiri::HTML(html)
|
45
|
+
i = 0
|
46
|
+
doc.xpath('//text()').each do |node|
|
47
|
+
text = node.to_s
|
48
|
+
articlesStats.store(i,[node.text.to_s,node.ancestors.length.to_i,node.parent.name])
|
49
|
+
i+=1
|
50
|
+
end
|
51
|
+
return articlesStats
|
52
|
+
end
|
53
|
+
def removeSamePatterns(html)
|
54
|
+
doc = Nokogiri::HTML(html)
|
55
|
+
paths = Array.new
|
56
|
+
doc.css('*').each do |node|
|
57
|
+
s = node.path.gsub(/\[[\s\S]*?\]/, '')
|
58
|
+
paths.push(s)
|
59
|
+
end
|
60
|
+
final = []
|
61
|
+
(7..30).each do |i|
|
62
|
+
all = []
|
63
|
+
paths.each_with_index do |seq,a|
|
64
|
+
se = []
|
65
|
+
paths[a..-1].each_with_index do |s,ii|
|
66
|
+
se << s
|
67
|
+
break if ii == i-1
|
68
|
+
end
|
69
|
+
all << se
|
70
|
+
end
|
71
|
+
final << all
|
72
|
+
end
|
73
|
+
allDoubles = Hash.new
|
74
|
+
final.each_with_index do |seq,i|
|
75
|
+
counts = Hash.new(0)
|
76
|
+
seq.each do |name|
|
77
|
+
counts[name] += 1
|
78
|
+
end
|
79
|
+
counts = counts.sort_by{|k,v|v}.reverse.to_h
|
80
|
+
allDoubles.store(i,counts)
|
81
|
+
end
|
82
|
+
allDoubles.each do |i,doubles|
|
83
|
+
doubles.each do |path,count|
|
84
|
+
if count >= 7
|
85
|
+
doc.css('*').each do |node|
|
86
|
+
s = node.path.gsub(/\[[\s\S]*?\]/, '')
|
87
|
+
if path.include? s
|
88
|
+
node.remove
|
57
89
|
end
|
58
|
-
break
|
59
90
|
end
|
60
|
-
ii+=1
|
61
91
|
end
|
62
92
|
end
|
63
|
-
i+=1
|
64
93
|
end
|
94
|
+
return doc.to_s
|
95
|
+
end
|
96
|
+
def calculateBestDepth(articlesStats)
|
65
97
|
bestDepth = Hash.new(0)
|
66
98
|
articlesStats.each do |line,stats|
|
67
|
-
bestDepth[stats[1]]+=stats[0].
|
99
|
+
bestDepth[stats[1]]+=stats[0].length
|
68
100
|
end
|
69
|
-
|
101
|
+
bestvalues = bestDepth.sort_by {|key,value|value}.reverse.to_h
|
102
|
+
average = 0.0
|
103
|
+
bestDepth.each {|l,v|average+=v/bestDepth.keys.length.to_f}
|
104
|
+
texts = 0
|
105
|
+
bestDepth.each{|l,v|texts +=1 if v > average}
|
106
|
+
|
107
|
+
doubleTexts = false
|
108
|
+
doubleTexts = true if texts >= 2
|
109
|
+
best = bestvalues.keys[0]
|
110
|
+
|
111
|
+
return best,doubleTexts
|
112
|
+
end
|
113
|
+
|
114
|
+
def getTextOfBestDepth(articlesStats,best)
|
70
115
|
text = ''
|
71
116
|
articlesStats.each do |line,stats|
|
72
|
-
|
117
|
+
if stats[1] == best && (stats[-1].eql?('h1') || stats[-1].eql?('h2') || stats[-1].eql?('p'))
|
118
|
+
text = "#{text} <#{stats[-1]}>#{stats[0]}</#{stats[-1]}>" if stats[0].strip.length > 2
|
119
|
+
end
|
73
120
|
end
|
74
|
-
return
|
121
|
+
return text
|
75
122
|
end
|
76
123
|
|
77
|
-
def
|
124
|
+
def getMetas(html = @html)
|
78
125
|
metas = Hash.new
|
79
126
|
doc = Nokogiri.parse(html)
|
80
|
-
|
81
|
-
|
82
|
-
if
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
127
|
+
doc.xpath("//meta").each do |node|
|
128
|
+
name = node[node.attributes.keys[1]]
|
129
|
+
name = node[node.attributes.keys[0]] if node.attributes.keys[0] != 'content' && node.attributes.keys[0] != 'value'
|
130
|
+
content = node['content']
|
131
|
+
content = node['value'] if content == nil
|
132
|
+
|
133
|
+
metas.store(name,content)
|
87
134
|
end
|
88
135
|
return metas
|
89
136
|
end
|
137
|
+
def getOtherHTMLDescriptions(html = @html)
|
138
|
+
doc = Nokogiri.parse(html)
|
139
|
+
images = Array.new
|
140
|
+
headlines = Hash.new
|
141
|
+
links = Hash.new
|
142
|
+
5.times do |i|
|
143
|
+
hs = doc.xpath("//h#{i+1}")
|
144
|
+
texts = []
|
145
|
+
hs.each {|node| texts.push(node.text.to_s)}
|
146
|
+
headlines.store("h#{i+1}",texts)
|
147
|
+
end
|
148
|
+
|
149
|
+
imgs = doc.xpath('//img/@src')
|
150
|
+
imgs.each do |source|
|
151
|
+
images.push(source.text) if source.text.include?('http')
|
152
|
+
end
|
153
|
+
|
154
|
+
plinks = doc.xpath('//a/@href')
|
155
|
+
plinks.each do |source|
|
156
|
+
links.store(source.text,1) if source.text.strip.length > 2
|
157
|
+
end
|
158
|
+
|
159
|
+
return {'headlines'=>headlines,'images'=>images, 'links' => links.keys}
|
160
|
+
end
|
161
|
+
def getMicroData(html = @html)
|
162
|
+
doc = Mida::Document.new(html, "")
|
163
|
+
topLevel = Array.new
|
164
|
+
doc.items.each do |item|
|
165
|
+
topLevel.push(item.to_h)
|
166
|
+
end
|
167
|
+
return topLevel
|
168
|
+
end
|
169
|
+
def getAllText(html = @html)
|
170
|
+
doc = Nokogiri.parse(html)
|
171
|
+
doc.search('script').remove
|
172
|
+
doc.search('style').remove
|
173
|
+
return doc.text.gsub(/\s\s+/,' ')
|
174
|
+
end
|
175
|
+
def getArticle(html = @html)
|
176
|
+
html = removeBadHtmlTags(html)
|
177
|
+
articlesStats = calculateDepth(html)
|
178
|
+
best,doubleTexts = calculateBestDepth(articlesStats)
|
179
|
+
if doubleTexts
|
180
|
+
html = removeSamePatterns(html)
|
181
|
+
articlesStats,d = calculateDepth(html)
|
182
|
+
|
183
|
+
end
|
184
|
+
bestDepth,doubles = calculateBestDepth(articlesStats)
|
185
|
+
plainText = getTextOfBestDepth(articlesStats,bestDepth)
|
186
|
+
return plainText
|
187
|
+
end
|
188
|
+
|
90
189
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: BoilerpipeArticle
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: '0.1'
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David Layer-Reiss
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-09-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -24,9 +24,24 @@ dependencies:
|
|
24
24
|
- - '='
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: 1.6.8
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: mida
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.3.9
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 0.3.9
|
27
41
|
description: This gem removes the surplus “clutter” (boilerplate, templates) around
|
28
42
|
the main textual content of a web page (pure Ruby implementation). BoilerpipeArticle
|
29
|
-
can be also used to parse open graph meta data. Check GitHub for
|
43
|
+
can be also used to parse (open graph) meta data and microdata. Check GitHub for
|
44
|
+
usage examples.
|
30
45
|
email: layerreiss@gmail.com
|
31
46
|
executables: []
|
32
47
|
extensions: []
|