BoilerpipeArticle 0.0.4 → 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3) hide show
  1. checksums.yaml +4 -4
  2. data/lib/boilerpipe_article.rb +153 -54
  3. metadata +18 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 86e9801c9b5e6eacf758aa4bbff2ec298313e163
4
- data.tar.gz: 58bdb8a6a7624e08f5f28f8bbbc094da2a3fc02a
3
+ metadata.gz: fccf00da5423dc69b01d5f7b43a0932574ac16e9
4
+ data.tar.gz: a9d9ae5187108e1d20d7d9a45926b811cde2f3cf
5
5
  SHA512:
6
- metadata.gz: f8dcba151737a254fb7f0840d24d187a757575ed87c8b0817ffb24dd3d3e3b28bebacf7ac4ee980bf8bc085317c9f3ae94a651c7c5b77fee43511f1a615077ab
7
- data.tar.gz: e3a3ed339a688ae3ed778cd4f4c2e2e426ea070b94e07dcc156d945731b38961540f7a46fa4d421dbdac21aa47022c70df6586f6458bf4f620b95e3628278cd5
6
+ metadata.gz: 3c85b972589fd947fe9e2dd7606935fcd292e874e7e19ba1936d128eae26e03e8dc0e8e35afd40b82dac598925ba2e4de3d72da4d43f6a13ed36982a27c7ff7c
7
+ data.tar.gz: 10642f8bbf5573f8d4608945201cca270022b5e000bba26998670a75d235e5c234d5fd0047039d2a2afff38b5a696d4c82c10c68d2b9148104ce4cd24f09dcc3
@@ -1,90 +1,189 @@
1
+ #Encoding: UTF-8
2
+
1
3
  require 'nokogiri'
4
+ require 'mida'
2
5
 
3
6
  class BoilerpipeArticle
4
7
  def initialize(html)
5
- @html = html
8
+ @html = html.gsub(/\s\s+/,' ')
9
+ @articlesStats = Hash.new
6
10
  end
7
-
8
- def getText(html = @html)
11
+ def removeBadHtmlTags(html = @html)
9
12
  html = Nokogiri::HTML.parse(html).to_s
10
13
  html.gsub!(/<!-[\s\S]*?->/, '')
11
14
  html.gsub!(/\r?\n|\r/, '')
12
15
 
16
+ unwantedTags = ['strong','bold','i']
17
+ unwantedTags.each do |tag|
18
+ html.gsub!("<#{tag}>",'')
19
+ html.gsub!("</#{tag}>",'')
20
+ end
21
+
22
+
13
23
  doc = Nokogiri::HTML(html)
14
- badHtmlTags = ['nav','head','script','style','a','img']
24
+
25
+ badHtmlTags = ['script','style','head','nav','iframe','img','footer','ol','ul','li','a']
26
+ doc.css('*').each do |node|
27
+ node.remove if node.text.length < 3
28
+ end
15
29
  badHtmlTags.each do |tag|
16
30
  doc.search(tag).each do |src|
17
31
  src.remove
18
32
  end
19
33
  end
20
-
34
+ # doc.css('a').each do |atag|
35
+ # atag = "#{atag.text}"
36
+ # puts atag
37
+ # end
21
38
  html = doc.to_html.to_s
22
- selfClosingTags = ['<area','<base','<br','<col','<command','<embed','<hr','<img','<input','<keygen','<link','<meta','<param','<source','<track','<wbr']
23
- time = Time.now.to_f
24
- depth = 1
25
- i = 0
26
- start = 0
27
- close = 0
39
+
40
+ return html
41
+ end
42
+ def calculateDepth(html = @html)
28
43
  articlesStats = Hash.new
29
- inPtag = false
30
- content = ''
31
- html.length.times do
32
- char = html[i]
33
- if char.eql? '<'
34
- start = i
35
- ii = start
36
- html.length.times do
37
- char2 = html[ii]
38
- if char2.eql? '>'
39
- tag = html[start..ii]
40
- tagname = "#{tag}"
41
- inPtag = true if tagname.eql?('<p>') || tagname.split(' ')[0].eql?('<p')
42
- content = html[close..start].gsub(/[<>]/,'')
43
- tagname = "#{tag}"
44
- text = ''
45
- text = content if inPtag
46
- articlesStats.store(i,[text,depth,tagname]) if content.gsub(/[^a-zA-Z]+/,'').length > 1
47
- close = ii
48
- inPtag = false if tagname.eql? '</p>'
49
- if !selfClosingTags.include?(tag.split(" ")[0]) && !tag.include?('<br')
50
- tag.gsub!(/"[\s\S]*?"/,'')
51
- tag.gsub!(/[^<>\/]+/,'')
52
- if tag.eql? '<>'
53
- depth+=1
54
- else
55
- depth-=1
56
- end
44
+ doc = Nokogiri::HTML(html)
45
+ i = 0
46
+ doc.xpath('//text()').each do |node|
47
+ text = node.to_s
48
+ articlesStats.store(i,[node.text.to_s,node.ancestors.length.to_i,node.parent.name])
49
+ i+=1
50
+ end
51
+ return articlesStats
52
+ end
53
+ def removeSamePatterns(html)
54
+ doc = Nokogiri::HTML(html)
55
+ paths = Array.new
56
+ doc.css('*').each do |node|
57
+ s = node.path.gsub(/\[[\s\S]*?\]/, '')
58
+ paths.push(s)
59
+ end
60
+ final = []
61
+ (7..30).each do |i|
62
+ all = []
63
+ paths.each_with_index do |seq,a|
64
+ se = []
65
+ paths[a..-1].each_with_index do |s,ii|
66
+ se << s
67
+ break if ii == i-1
68
+ end
69
+ all << se
70
+ end
71
+ final << all
72
+ end
73
+ allDoubles = Hash.new
74
+ final.each_with_index do |seq,i|
75
+ counts = Hash.new(0)
76
+ seq.each do |name|
77
+ counts[name] += 1
78
+ end
79
+ counts = counts.sort_by{|k,v|v}.reverse.to_h
80
+ allDoubles.store(i,counts)
81
+ end
82
+ allDoubles.each do |i,doubles|
83
+ doubles.each do |path,count|
84
+ if count >= 7
85
+ doc.css('*').each do |node|
86
+ s = node.path.gsub(/\[[\s\S]*?\]/, '')
87
+ if path.include? s
88
+ node.remove
57
89
  end
58
- break
59
90
  end
60
- ii+=1
61
91
  end
62
92
  end
63
- i+=1
64
93
  end
94
+ return doc.to_s
95
+ end
96
+ def calculateBestDepth(articlesStats)
65
97
  bestDepth = Hash.new(0)
66
98
  articlesStats.each do |line,stats|
67
- bestDepth[stats[1]]+=stats[0].gsub(/[^a-zA-Z]+/,'').length
99
+ bestDepth[stats[1]]+=stats[0].length
68
100
  end
69
- best = bestDepth.sort_by {|key,value|value}.reverse.to_h.keys[0]
101
+ bestvalues = bestDepth.sort_by {|key,value|value}.reverse.to_h
102
+ average = 0.0
103
+ bestDepth.each {|l,v|average+=v/bestDepth.keys.length.to_f}
104
+ texts = 0
105
+ bestDepth.each{|l,v|texts +=1 if v > average}
106
+
107
+ doubleTexts = false
108
+ doubleTexts = true if texts >= 2
109
+ best = bestvalues.keys[0]
110
+
111
+ return best,doubleTexts
112
+ end
113
+
114
+ def getTextOfBestDepth(articlesStats,best)
70
115
  text = ''
71
116
  articlesStats.each do |line,stats|
72
- text = "#{text} #{stats[0]}" if stats[1] == best
117
+ if stats[1] == best && (stats[-1].eql?('h1') || stats[-1].eql?('h2') || stats[-1].eql?('p'))
118
+ text = "#{text} <#{stats[-1]}>#{stats[0]}</#{stats[-1]}>" if stats[0].strip.length > 2
119
+ end
73
120
  end
74
- return Nokogiri::HTML.parse(text).text
121
+ return text
75
122
  end
76
123
 
77
- def getOgMetas(html = @html)
124
+ def getMetas(html = @html)
78
125
  metas = Hash.new
79
126
  doc = Nokogiri.parse(html)
80
- properties = ['title','type','url','description','image','type','updated_time','locale','url','site_name']
81
- properties.each do |prop|
82
- if doc.at("meta[property=\"og:#{prop}\"]") != nil
83
- metas.store(prop,doc.at("meta[property=\"og:#{prop}\"]")['content'])
84
- else
85
- metas.store(prop,' ')
86
- end
127
+ doc.xpath("//meta").each do |node|
128
+ name = node[node.attributes.keys[1]]
129
+ name = node[node.attributes.keys[0]] if node.attributes.keys[0] != 'content' && node.attributes.keys[0] != 'value'
130
+ content = node['content']
131
+ content = node['value'] if content == nil
132
+
133
+ metas.store(name,content)
87
134
  end
88
135
  return metas
89
136
  end
137
+ def getOtherHTMLDescriptions(html = @html)
138
+ doc = Nokogiri.parse(html)
139
+ images = Array.new
140
+ headlines = Hash.new
141
+ links = Hash.new
142
+ 5.times do |i|
143
+ hs = doc.xpath("//h#{i+1}")
144
+ texts = []
145
+ hs.each {|node| texts.push(node.text.to_s)}
146
+ headlines.store("h#{i+1}",texts)
147
+ end
148
+
149
+ imgs = doc.xpath('//img/@src')
150
+ imgs.each do |source|
151
+ images.push(source.text) if source.text.include?('http')
152
+ end
153
+
154
+ plinks = doc.xpath('//a/@href')
155
+ plinks.each do |source|
156
+ links.store(source.text,1) if source.text.strip.length > 2
157
+ end
158
+
159
+ return {'headlines'=>headlines,'images'=>images, 'links' => links.keys}
160
+ end
161
+ def getMicroData(html = @html)
162
+ doc = Mida::Document.new(html, "")
163
+ topLevel = Array.new
164
+ doc.items.each do |item|
165
+ topLevel.push(item.to_h)
166
+ end
167
+ return topLevel
168
+ end
169
+ def getAllText(html = @html)
170
+ doc = Nokogiri.parse(html)
171
+ doc.search('script').remove
172
+ doc.search('style').remove
173
+ return doc.text.gsub(/\s\s+/,' ')
174
+ end
175
+ def getArticle(html = @html)
176
+ html = removeBadHtmlTags(html)
177
+ articlesStats = calculateDepth(html)
178
+ best,doubleTexts = calculateBestDepth(articlesStats)
179
+ if doubleTexts
180
+ html = removeSamePatterns(html)
181
+ articlesStats,d = calculateDepth(html)
182
+
183
+ end
184
+ bestDepth,doubles = calculateBestDepth(articlesStats)
185
+ plainText = getTextOfBestDepth(articlesStats,bestDepth)
186
+ return plainText
187
+ end
188
+
90
189
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: BoilerpipeArticle
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
4
+ version: '0.1'
5
5
  platform: ruby
6
6
  authors:
7
7
  - David Layer-Reiss
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-07-31 00:00:00.000000000 Z
11
+ date: 2016-09-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -24,9 +24,24 @@ dependencies:
24
24
  - - '='
25
25
  - !ruby/object:Gem::Version
26
26
  version: 1.6.8
27
+ - !ruby/object:Gem::Dependency
28
+ name: mida
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 0.3.9
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '='
39
+ - !ruby/object:Gem::Version
40
+ version: 0.3.9
27
41
  description: This gem removes the surplus “clutter” (boilerplate, templates) around
28
42
  the main textual content of a web page (pure Ruby implementation). BoilerpipeArticle
29
- can be also used to parse open graph meta data. Check GitHub for usage examples.
43
+ can be also used to parse (open graph) meta data and microdata. Check GitHub for
44
+ usage examples.
30
45
  email: layerreiss@gmail.com
31
46
  executables: []
32
47
  extensions: []