BoilerpipeArticle 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. checksums.yaml +7 -0
  2. data/lib/boilerpipe_article.rb +80 -0
  3. metadata +60 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: a2cbf7a8c4c979aecefeb76935ec831f8f7e010b
4
+ data.tar.gz: 427bb161caf9302c93739c52ffdda87795d08ab1
5
+ SHA512:
6
+ metadata.gz: d7268bf78155ffba0641a79c2aa83b36953f965587fe404ea96b41fbf7b960b7c70189a22690eedd9993ef2cd0f04d12408718a84e2a3ef5db445636be7f6fe7
7
+ data.tar.gz: 8366fd8c1ad057b7bb3718109c6e9070e60dd68e06fc4372e9a16f1ed72c9dd7527039e46c841a4f6aafa86d8ae72829f8f1775c1d85d76f1ad9a8ddd40fd24a
@@ -0,0 +1,80 @@
1
+ require 'nokogiri'
2
+
3
# Heuristic main-content extractor for HTML pages.
#
# The page is normalised through Nokogiri, boilerplate-heavy elements are
# dropped, and the remaining markup is scanned tag by tag while tracking the
# nesting depth. Text found inside <p> elements is grouped by depth; the depth
# that carries the most letters is taken to be the article body.
#
# Relies on Nokogiri being loaded by the surrounding file.
class BoilerpipeArticle
  # Elements that are removed (with their whole subtree) before scanning.
  REMOVED_TAGS = %w[li ol ul head script style a img].freeze

  # Void elements: they never get a closing tag, so they must not alter depth.
  VOID_ELEMENTS = %w[
    area base br col command embed hr img input keygen link meta param source track wbr
  ].freeze

  # A complete HTML comment, from "<!--" up to the first "-->".
  COMMENT = /<!--[\s\S]*?-->/
  # Any flavour of line break (CRLF, LF, CR).
  LINE_BREAK = /\r?\n|\r/
  # Runs of non-letters. POSIX brackets are Unicode-aware in Ruby, so
  # non-Latin scripts are counted as letters too.
  NON_LETTERS = /[^[:alpha:]]+/
  # Captures the element name of an opening or closing tag.
  TAG_NAME = %r{\A</?\s*([A-Za-z][A-Za-z0-9:-]*)}
  # A run of text followed by the next complete tag.
  TEXT_THEN_TAG = /([^<]*)(<[^>]*>)/

  # @param html [String] raw HTML of the page to analyse
  def initialize(html)
    @html = html
  end

  # Extracts the main textual content of the page.
  #
  # @return [String] text of the paragraph fragments found at the dominant
  #   nesting depth, passed through Nokogiri once more so that HTML entities
  #   are decoded; empty when no paragraph text was found
  def run
    fragments = collect_fragments(cleaned_markup)
    best = dominant_depth(fragments)
    # Each selected fragment is prefixed with a single space, as before.
    joined = fragments.select { |_, depth| depth == best }
                      .map { |text, _| " #{text}" }
                      .join
    Nokogiri::HTML.parse(joined).text
  end

  private

  # Normalises the input through Nokogiri, strips comments and line breaks and
  # removes every element listed in REMOVED_TAGS.
  #
  # @return [String] serialised markup ready for scanning
  def cleaned_markup
    markup = Nokogiri::HTML.parse(@html).to_s
    # Line breaks become a space (not nothing) so that words separated only by
    # a newline are not fused together.
    markup = markup.gsub(COMMENT, '').gsub(LINE_BREAK, ' ')

    doc = Nokogiri::HTML(markup)
    REMOVED_TAGS.each { |tag| doc.search(tag).each(&:remove) }
    doc.to_html.to_s
  end

  # Walks the markup one tag at a time and records every text run that holds
  # more than one letter, together with the depth it was found at.
  #
  # @param markup [String] serialised HTML
  # @return [Array<Array(String, Integer)>] [text, depth] pairs in document
  #   order; text is empty for runs that are not inside a <p> element
  def collect_fragments(markup)
    fragments = []
    depth = 1
    in_paragraph = false

    markup.scan(TEXT_THEN_TAG) do |raw, tag|
      # The flag is raised before the pending text is recorded, so the text
      # run immediately preceding a <p> tag is kept as well (at the depth of
      # the paragraph's parent).
      in_paragraph = true if paragraph_tag?(tag, false)

      content = raw.delete('<>')
      if content.gsub(NON_LETTERS, '').length > 1
        fragments << [in_paragraph ? content : '', depth]
      end

      in_paragraph = false if paragraph_tag?(tag, true)
      depth += depth_change(tag)
    end

    fragments
  end

  # @param tag [String] a complete tag such as "<p class=\"x\">" or "</p>"
  # @param closing [Boolean] whether to test for the closing form
  # @return [Boolean] true when the tag is a <p> tag of the requested form
  def paragraph_tag?(tag, closing)
    tag.start_with?('</') == closing && tag_name(tag) == 'p'
  end

  # @param tag [String] a complete tag
  # @return [Integer] +1 for an opening tag, -1 for a closing tag and 0 for
  #   void elements, self-closed tags and declarations such as <!DOCTYPE ...>
  def depth_change(tag)
    name = tag_name(tag)
    return 0 if name.nil? || VOID_ELEMENTS.include?(name) || tag.end_with?('/>')

    tag.start_with?('</') ? -1 : 1
  end

  # @param tag [String] a complete tag
  # @return [String, nil] lower-cased element name, or nil when the tag has
  #   none (doctype, processing instruction, stray "<>")
  def tag_name(tag)
    match = TAG_NAME.match(tag)
    match && match[1].downcase
  end

  # @param fragments [Array<Array(String, Integer)>] output of collect_fragments
  # @return [Integer, nil] depth whose paragraph text holds the most letters,
  #   or nil when there are no fragments
  def dominant_depth(fragments)
    totals = Hash.new(0)
    fragments.each do |text, depth|
      totals[depth] += text.gsub(NON_LETTERS, '').length
    end
    winner = totals.max_by { |_, letters| letters }
    winner && winner.first
  end
end
metadata ADDED
@@ -0,0 +1,60 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: BoilerpipeArticle
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.3
5
+ platform: ruby
6
+ authors:
7
+ - David Layer-Reiss
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-07-31 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '='
18
+ - !ruby/object:Gem::Version
19
+ version: 1.6.8
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '='
25
+ - !ruby/object:Gem::Version
26
+ version: 1.6.8
27
+ description: This gem removes the surplus “clutter” (boilerplate, templates) around
28
+ the main textual content of a web page (pure Ruby implementation). Check GitHub
29
+ for usage examples.
30
+ email: layerreiss@gmail.com
31
+ executables: []
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - lib/boilerpipe_article.rb
36
+ homepage: https://github.com/davidlr99/BoilerpipeArticle
37
+ licenses:
38
+ - MIT
39
+ metadata: {}
40
+ post_install_message:
41
+ rdoc_options: []
42
+ require_paths:
43
+ - lib
44
+ required_ruby_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: '1.9'
49
+ required_rubygems_version: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ requirements: []
55
+ rubyforge_project:
56
+ rubygems_version: 2.5.1
57
+ signing_key:
58
+ specification_version: 4
59
+ summary: This gem extract the main textual content of a HTML page (e.g. news articles).
60
+ test_files: []