BoilerpipeArticle 0.0.4 → 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. checksums.yaml +4 -4
  2. data/lib/boilerpipe_article.rb +153 -54
  3. metadata +18 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 86e9801c9b5e6eacf758aa4bbff2ec298313e163
- data.tar.gz: 58bdb8a6a7624e08f5f28f8bbbc094da2a3fc02a
+ metadata.gz: fccf00da5423dc69b01d5f7b43a0932574ac16e9
+ data.tar.gz: a9d9ae5187108e1d20d7d9a45926b811cde2f3cf
  SHA512:
- metadata.gz: f8dcba151737a254fb7f0840d24d187a757575ed87c8b0817ffb24dd3d3e3b28bebacf7ac4ee980bf8bc085317c9f3ae94a651c7c5b77fee43511f1a615077ab
- data.tar.gz: e3a3ed339a688ae3ed778cd4f4c2e2e426ea070b94e07dcc156d945731b38961540f7a46fa4d421dbdac21aa47022c70df6586f6458bf4f620b95e3628278cd5
+ metadata.gz: 3c85b972589fd947fe9e2dd7606935fcd292e874e7e19ba1936d128eae26e03e8dc0e8e35afd40b82dac598925ba2e4de3d72da4d43f6a13ed36982a27c7ff7c
+ data.tar.gz: 10642f8bbf5573f8d4608945201cca270022b5e000bba26998670a75d235e5c234d5fd0047039d2a2afff38b5a696d4c82c10c68d2b9148104ce4cd24f09dcc3
data/lib/boilerpipe_article.rb CHANGED
@@ -1,90 +1,189 @@
+ #Encoding: UTF-8
+
  require 'nokogiri'
+ require 'mida'

  class BoilerpipeArticle
  def initialize(html)
- @html = html
+ @html = html.gsub(/\s\s+/,' ')
+ @articlesStats = Hash.new
  end
-
- def getText(html = @html)
+ def removeBadHtmlTags(html = @html)
  html = Nokogiri::HTML.parse(html).to_s
  html.gsub!(/<!-[\s\S]*?->/, '')
  html.gsub!(/\r?\n|\r/, '')

+ unwantedTags = ['strong','bold','i']
+ unwantedTags.each do |tag|
+ html.gsub!("<#{tag}>",'')
+ html.gsub!("</#{tag}>",'')
+ end
+
+
  doc = Nokogiri::HTML(html)
- badHtmlTags = ['nav','head','script','style','a','img']
+
+ badHtmlTags = ['script','style','head','nav','iframe','img','footer','ol','ul','li','a']
+ doc.css('*').each do |node|
+ node.remove if node.text.length < 3
+ end
  badHtmlTags.each do |tag|
  doc.search(tag).each do |src|
  src.remove
  end
  end
-
+ # doc.css('a').each do |atag|
+ # atag = "#{atag.text}"
+ # puts atag
+ # end
  html = doc.to_html.to_s
- selfClosingTags = ['<area','<base','<br','<col','<command','<embed','<hr','<img','<input','<keygen','<link','<meta','<param','<source','<track','<wbr']
- time = Time.now.to_f
- depth = 1
- i = 0
- start = 0
- close = 0
+
+ return html
+ end
+ def calculateDepth(html = @html)
  articlesStats = Hash.new
- inPtag = false
- content = ''
- html.length.times do
- char = html[i]
- if char.eql? '<'
- start = i
- ii = start
- html.length.times do
- char2 = html[ii]
- if char2.eql? '>'
- tag = html[start..ii]
- tagname = "#{tag}"
- inPtag = true if tagname.eql?('<p>') || tagname.split(' ')[0].eql?('<p')
- content = html[close..start].gsub(/[<>]/,'')
- tagname = "#{tag}"
- text = ''
- text = content if inPtag
- articlesStats.store(i,[text,depth,tagname]) if content.gsub(/[^a-zA-Z]+/,'').length > 1
- close = ii
- inPtag = false if tagname.eql? '</p>'
- if !selfClosingTags.include?(tag.split(" ")[0]) && !tag.include?('<br')
- tag.gsub!(/"[\s\S]*?"/,'')
- tag.gsub!(/[^<>\/]+/,'')
- if tag.eql? '<>'
- depth+=1
- else
- depth-=1
- end
+ doc = Nokogiri::HTML(html)
+ i = 0
+ doc.xpath('//text()').each do |node|
+ text = node.to_s
+ articlesStats.store(i,[node.text.to_s,node.ancestors.length.to_i,node.parent.name])
+ i+=1
+ end
+ return articlesStats
+ end
+ def removeSamePatterns(html)
+ doc = Nokogiri::HTML(html)
+ paths = Array.new
+ doc.css('*').each do |node|
+ s = node.path.gsub(/\[[\s\S]*?\]/, '')
+ paths.push(s)
+ end
+ final = []
+ (7..30).each do |i|
+ all = []
+ paths.each_with_index do |seq,a|
+ se = []
+ paths[a..-1].each_with_index do |s,ii|
+ se << s
+ break if ii == i-1
+ end
+ all << se
+ end
+ final << all
+ end
+ allDoubles = Hash.new
+ final.each_with_index do |seq,i|
+ counts = Hash.new(0)
+ seq.each do |name|
+ counts[name] += 1
+ end
+ counts = counts.sort_by{|k,v|v}.reverse.to_h
+ allDoubles.store(i,counts)
+ end
+ allDoubles.each do |i,doubles|
+ doubles.each do |path,count|
+ if count >= 7
+ doc.css('*').each do |node|
+ s = node.path.gsub(/\[[\s\S]*?\]/, '')
+ if path.include? s
+ node.remove
  end
- break
  end
- ii+=1
  end
  end
- i+=1
  end
+ return doc.to_s
+ end
+ def calculateBestDepth(articlesStats)
  bestDepth = Hash.new(0)
  articlesStats.each do |line,stats|
- bestDepth[stats[1]]+=stats[0].gsub(/[^a-zA-Z]+/,'').length
+ bestDepth[stats[1]]+=stats[0].length
  end
- best = bestDepth.sort_by {|key,value|value}.reverse.to_h.keys[0]
+ bestvalues = bestDepth.sort_by {|key,value|value}.reverse.to_h
+ average = 0.0
+ bestDepth.each {|l,v|average+=v/bestDepth.keys.length.to_f}
+ texts = 0
+ bestDepth.each{|l,v|texts +=1 if v > average}
+
+ doubleTexts = false
+ doubleTexts = true if texts >= 2
+ best = bestvalues.keys[0]
+
+ return best,doubleTexts
+ end
+
+ def getTextOfBestDepth(articlesStats,best)
  text = ''
  articlesStats.each do |line,stats|
- text = "#{text} #{stats[0]}" if stats[1] == best
+ if stats[1] == best && (stats[-1].eql?('h1') || stats[-1].eql?('h2') || stats[-1].eql?('p'))
+ text = "#{text} <#{stats[-1]}>#{stats[0]}</#{stats[-1]}>" if stats[0].strip.length > 2
+ end
  end
- return Nokogiri::HTML.parse(text).text
+ return text
  end

- def getOgMetas(html = @html)
+ def getMetas(html = @html)
  metas = Hash.new
  doc = Nokogiri.parse(html)
- properties = ['title','type','url','description','image','type','updated_time','locale','url','site_name']
- properties.each do |prop|
- if doc.at("meta[property=\"og:#{prop}\"]") != nil
- metas.store(prop,doc.at("meta[property=\"og:#{prop}\"]")['content'])
- else
- metas.store(prop,' ')
- end
+ doc.xpath("//meta").each do |node|
+ name = node[node.attributes.keys[1]]
+ name = node[node.attributes.keys[0]] if node.attributes.keys[0] != 'content' && node.attributes.keys[0] != 'value'
+ content = node['content']
+ content = node['value'] if content == nil
+
+ metas.store(name,content)
  end
  return metas
  end
+ def getOtherHTMLDescriptions(html = @html)
+ doc = Nokogiri.parse(html)
+ images = Array.new
+ headlines = Hash.new
+ links = Hash.new
+ 5.times do |i|
+ hs = doc.xpath("//h#{i+1}")
+ texts = []
+ hs.each {|node| texts.push(node.text.to_s)}
+ headlines.store("h#{i+1}",texts)
+ end
+
+ imgs = doc.xpath('//img/@src')
+ imgs.each do |source|
+ images.push(source.text) if source.text.include?('http')
+ end
+
+ plinks = doc.xpath('//a/@href')
+ plinks.each do |source|
+ links.store(source.text,1) if source.text.strip.length > 2
+ end
+
+ return {'headlines'=>headlines,'images'=>images, 'links' => links.keys}
+ end
+ def getMicroData(html = @html)
+ doc = Mida::Document.new(html, "")
+ topLevel = Array.new
+ doc.items.each do |item|
+ topLevel.push(item.to_h)
+ end
+ return topLevel
+ end
+ def getAllText(html = @html)
+ doc = Nokogiri.parse(html)
+ doc.search('script').remove
+ doc.search('style').remove
+ return doc.text.gsub(/\s\s+/,' ')
+ end
+ def getArticle(html = @html)
+ html = removeBadHtmlTags(html)
+ articlesStats = calculateDepth(html)
+ best,doubleTexts = calculateBestDepth(articlesStats)
+ if doubleTexts
+ html = removeSamePatterns(html)
+ articlesStats,d = calculateDepth(html)
+
+ end
+ bestDepth,doubles = calculateBestDepth(articlesStats)
+ plainText = getTextOfBestDepth(articlesStats,bestDepth)
+ return plainText
+ end
+
  end
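Taken together, this release replaces the old getText/getOgMetas pair with a small pipeline of public methods (getArticle, getMetas, getMicroData, getOtherHTMLDescriptions, getAllText). The following is a minimal usage sketch based only on the method signatures visible in the diff above; the input file name and the printed output handling are illustrative assumptions, not part of the package.

# Minimal usage sketch, assuming BoilerpipeArticle 0.1 and its nokogiri and
# mida dependencies are installed. 'page.html' is a hypothetical local copy
# of a web page; only the BoilerpipeArticle methods come from the diff above.
require 'boilerpipe_article'

html = File.read('page.html')            # hypothetical input file
article = BoilerpipeArticle.new(html)

puts article.getArticle                  # main text, wrapped in <h1>/<h2>/<p> tags
p    article.getMetas                    # Hash of <meta> name => content pairs
p    article.getMicroData                # Array of microdata item hashes (via Mida)
p    article.getOtherHTMLDescriptions    # headlines, image URLs and links
p    article.getAllText                  # all visible text, scripts/styles removed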
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: BoilerpipeArticle
  version: !ruby/object:Gem::Version
- version: 0.0.4
+ version: '0.1'
  platform: ruby
  authors:
  - David Layer-Reiss
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2016-07-31 00:00:00.000000000 Z
+ date: 2016-09-18 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: nokogiri
@@ -24,9 +24,24 @@ dependencies:
  - - '='
  - !ruby/object:Gem::Version
  version: 1.6.8
+ - !ruby/object:Gem::Dependency
+ name: mida
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - '='
+ - !ruby/object:Gem::Version
+ version: 0.3.9
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - '='
+ - !ruby/object:Gem::Version
+ version: 0.3.9
  description: This gem removes the surplus “clutter” (boilerplate, templates) around
  the main textual content of a web page (pure Ruby implementation). BoilerpipeArticle
- can be also used to parse open graph meta data. Check GitHub for usage examples.
+ can be also used to parse (open graph) meta data and microdata. Check GitHub for
+ usage examples.
  email: layerreiss@gmail.com
  executables: []
  extensions: []