boilerpipe-ruby 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49) hide show
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +6 -24
  3. data/.dockerignore +7 -0
  4. data/CHANGELOG.md +28 -1
  5. data/Dockerfile +14 -0
  6. data/README.md +13 -4
  7. data/Rakefile +3 -4
  8. data/bin/console +3 -3
  9. data/boilerpipe-ruby.gemspec +9 -9
  10. data/boilerpipe_flow.md +40 -0
  11. data/lib/boilerpipe.rb +4 -0
  12. data/lib/boilerpipe/document/text_block.rb +10 -12
  13. data/lib/boilerpipe/document/text_document.rb +4 -5
  14. data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
  15. data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
  16. data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
  17. data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +0 -1
  18. data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
  19. data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
  20. data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
  21. data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
  22. data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
  23. data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
  24. data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +10 -23
  25. data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
  26. data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
  27. data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
  28. data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
  29. data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
  30. data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
  31. data/lib/boilerpipe/filters/min_clause_words_filter.rb +8 -11
  32. data/lib/boilerpipe/filters/min_words_filter.rb +1 -3
  33. data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
  34. data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
  35. data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +0 -3
  36. data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
  37. data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
  38. data/lib/boilerpipe/labels/label_action.rb +1 -1
  39. data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -11
  40. data/lib/boilerpipe/sax/html_content_handler.rb +25 -22
  41. data/lib/boilerpipe/sax/preprocessor.rb +11 -0
  42. data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
  43. data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
  44. data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
  45. data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
  46. data/lib/boilerpipe/sax/tag_actions/body.rb +4 -4
  47. data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
  48. data/lib/boilerpipe/version.rb +1 -1
  49. metadata +28 -25
@@ -1,4 +1,3 @@
1
-
2
1
  # Marks trailing headlines TextBlocks that have the label :#HEADING
3
2
  # as boilerplate. Trailing means they are marked content and are
4
3
  # below any other content block.
@@ -6,7 +5,6 @@
6
5
  module Boilerpipe::Filters
7
6
  class TrailingHeadlineToBoilerplateFilter
8
7
  def self.process(doc)
9
-
10
8
  doc.text_blocks.each do |tb|
11
9
  next unless tb.is_content?
12
10
 
@@ -19,6 +17,5 @@ module Boilerpipe::Filters
19
17
 
20
18
  doc
21
19
  end
22
-
23
20
  end
24
21
  end
@@ -2,7 +2,7 @@ module Boilerpipe::Labels
2
2
  class LabelAction
3
3
  attr_reader :labels
4
4
 
5
- def initialize(labels=[])
5
+ def initialize(labels = [])
6
6
  @labels = labels
7
7
  end
8
8
 
@@ -1,20 +1,11 @@
1
- require 'nokogiri'
2
1
  module Boilerpipe::SAX
3
2
  class BoilerpipeHTMLParser
4
3
  def self.parse(text)
5
-
6
- #script bug - delete script tags
7
- text.gsub!(/\<script>.+?<\/script>/i, '')
8
-
9
- # nokogiri uses libxml for mri and nekohtml for jruby
10
- # mri doesn't remove &nbsp; when missing the semicolon
11
- text.gsub!(/(&nbsp) /, '\1; ')
12
-
4
+ # strip out tags that cause issues
5
+ text = Preprocessor.strip(text)
13
6
 
14
7
  # use nokogiri to fix any bad tags, errors - keep experimenting with this
15
8
  text = Nokogiri::HTML(text).to_html
16
-
17
-
18
9
  handler = HTMLContentHandler.new
19
10
  noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
20
11
  noko_parser.parse(text)
@@ -1,11 +1,8 @@
1
- require 'nokogiri'
2
- require 'set'
3
-
4
1
  module Boilerpipe::SAX
5
2
  class HTMLContentHandler < Nokogiri::XML::SAX::Document
6
3
  attr_reader :in_ignorable_element, :label_stacks, :last_start_tag
7
4
 
8
- attr_accessor :in_anchor_tag, :token_buffer ,:font_size_stack
5
+ attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack
9
6
  ANCHOR_TEXT_START = "$\ue00a<"
10
7
  ANCHOR_TEXT_END = ">\ue00a$"
11
8
 
@@ -20,7 +17,7 @@ module Boilerpipe::SAX
20
17
  @flush = false
21
18
  @block_tag_level = -1
22
19
 
23
- @in_body = 0
20
+ @in_body_tag = 0
24
21
  @in_anchor_tag = 0
25
22
  @in_ignorable_element = 0
26
23
  @in_anchor_text = false
@@ -34,7 +31,6 @@ module Boilerpipe::SAX
34
31
  @label_stacks << nil
35
32
  tag = name.upcase.intern
36
33
 
37
-
38
34
  tag_action = @tag_actions[tag]
39
35
  if tag_action
40
36
  @tag_level += 1 if tag_action.changes_tag_level?
@@ -51,15 +47,15 @@ module Boilerpipe::SAX
51
47
  def characters(text)
52
48
  flush_block if @flush
53
49
 
54
- return if @in_ignorable_element != 0
50
+ return if in_ignorable_element?
55
51
  return if text.empty?
56
52
 
57
53
  # replace all whitespace with simple space
58
54
  text.gsub!(/\s+/, ' ')
59
55
 
60
56
  # trim whitespace
61
- started_with_whitespace = text =~ /^\s/
62
- ended_with_whitespace = text =~ /\s$/
57
+ started_with_whitespace = text =~ /^\s/
58
+ ended_with_whitespace = text =~ /\s$/
63
59
  text.strip!
64
60
 
65
61
  # add a single space if the block was only whitespace
@@ -96,9 +92,15 @@ module Boilerpipe::SAX
96
92
  @label_stacks.pop
97
93
  end
98
94
 
95
+ def not_in_body_tag?
96
+ @in_body_tag == 0
97
+ end
98
+
99
99
  def flush_block
100
100
  @flush = false
101
- if @in_body == 0
101
+
102
+ # set title
103
+ if not_in_body_tag?
102
104
  @title = @token_buffer.strip if :TITLE == @last_start_tag
103
105
  clear_buffers
104
106
  return
@@ -158,10 +160,10 @@ module Boilerpipe::SAX
158
160
  end
159
161
 
160
162
  text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
161
- num_words,
162
- num_linked_words,
163
- num_words_in_wrapped_lines,
164
- num_wrapped_lines, @offset_blocks)
163
+ num_words,
164
+ num_linked_words,
165
+ num_words_in_wrapped_lines,
166
+ num_wrapped_lines, @offset_blocks)
165
167
 
166
168
  @offset_blocks += 1
167
169
  clear_buffers
@@ -187,10 +189,10 @@ module Boilerpipe::SAX
187
189
  # \p{No} -- a numeric character of other type
188
190
 
189
191
  def is_word?(word)
190
- word =~ VALID_WORD_CHARACTER
192
+ word =~ VALID_WORD_CHARACTER
191
193
  end
192
194
 
193
- #public void flushBlock() {
195
+ # public void flushBlock() {
194
196
  # int numWords = 0;
195
197
  # int numLinkedWords = 0;
196
198
  # int numWrappedLines = 0;
@@ -198,22 +200,23 @@ module Boilerpipe::SAX
198
200
  # final int maxLineLength = 80;
199
201
  # int numTokens = 0;
200
202
  # int numWordsCurrentLine = 0;
201
- #}
203
+ # }
202
204
 
203
205
  def increase_in_ignorable_element!
204
206
  @in_ignorable_element += 1
205
207
  end
206
208
 
209
+ # should we prevent less than zero here?
207
210
  def decrease_in_ignorable_element!
208
211
  @in_ignorable_element -= 1
209
212
  end
210
213
 
211
- def increase_in_body!
212
- @in_body += 1
214
+ def enter_body_tag!
215
+ @in_body_tag += 1
213
216
  end
214
217
 
215
- def decrease_in_body!
216
- @in_body -= 1
218
+ def exit_body_tag!
219
+ @in_body_tag -= 1
217
220
  end
218
221
 
219
222
  def in_ignorable_element?
@@ -224,7 +227,6 @@ module Boilerpipe::SAX
224
227
  @in_anchor_tag > 0
225
228
  end
226
229
 
227
-
228
230
  def add_text_block(text_block)
229
231
  @label_stacks.each do |stack|
230
232
  next unless stack
@@ -239,6 +241,7 @@ module Boilerpipe::SAX
239
241
  # append space if last character wasn't already one
240
242
  def append_space
241
243
  return if @sb_last_was_whitespace
244
+
242
245
  @sb_last_was_whitespace = true
243
246
 
244
247
  @text_buffer << ' '
@@ -0,0 +1,11 @@
1
+ module Boilerpipe::SAX
2
+ class Preprocessor
3
+ def self.strip(text)
4
+ # script bug - delete script tags
5
+ text = text.gsub(/\<script.+?<\/script>/im, '')
6
+ # nokogiri uses libxml for mri and nekohtml for jruby
7
+ # mri doesn't remove &nbsp; when missing the semicolon
8
+ text.gsub(/(&nbsp) /, '\1; ')
9
+ end
10
+ end
11
+ end
@@ -48,4 +48,3 @@ module Boilerpipe::SAX
48
48
  end
49
49
  end
50
50
  end
51
-
@@ -2,7 +2,7 @@ module Boilerpipe::SAX::TagActions
2
2
  class AnchorText
3
3
  # Marks this tag as "anchor" (this should usually only be set for the <A> tag). Anchor tags may not be nested.
4
4
  # There is a bug in certain versions of NekoHTML which still allows nested tags. If boilerpipe
5
- #* encounters such nestings, a SAXException is thrown.
5
+ # * encounters such nestings, a SAXException is thrown.
6
6
  def start(handler, name, attrs)
7
7
  if handler.in_anchor_tag?
8
8
  handler.in_anchor_tag += 1
@@ -42,7 +42,7 @@ module Boilerpipe::SAX::TagActions
42
42
  # - dunno about nokogiri???????
43
43
  # as nested A elements are not allowed per specification, we
44
44
  # are probably reaching this branch due to a bug in the XML parser
45
- #puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
45
+ # puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
46
46
  end_tag(handler, name)
47
47
  end
48
48
  end
@@ -1,6 +1,6 @@
1
1
  module Boilerpipe::SAX::TagActions
2
- # Explicitly marks this tag a simple "block-level" element,
3
- # which always generates whitespace
2
+ # Explicitly marks this tag a simple "block-level" element,
3
+ # which always generates whitespace
4
4
  class BlockLevel
5
5
  def start(handler, name, attrs)
6
6
  true
@@ -1,6 +1,6 @@
1
1
  module Boilerpipe::SAX::TagActions
2
- # for block-level elements, which triggers some LabelAction on
3
- # the generated TextBlock.
2
+ # for block-level elements, which triggers some LabelAction on
3
+ # the generated TextBlock.
4
4
  class BlockTagLabel
5
5
  def initialize(label_action)
6
6
  @label_action = label_action
@@ -1,16 +1,16 @@
1
1
  module Boilerpipe::SAX::TagActions
2
- # Marks this tag the body element (this should usually only
3
- # be set for the <BODY> tag).
2
+ # Marks this tag the body element (this should usually only
3
+ # be set for the <BODY> tag).
4
4
  class Body
5
5
  def start(handler, name, attrs)
6
6
  handler.flush_block
7
- handler.increase_in_body!
7
+ handler.enter_body_tag!
8
8
  false
9
9
  end
10
10
 
11
11
  def end_tag(handler, name)
12
12
  handler.flush_block
13
- handler.decrease_in_body!
13
+ handler.exit_body_tag!
14
14
  false
15
15
  end
16
16
 
@@ -10,7 +10,7 @@ module Boilerpipe::SAX::TagActions
10
10
  rel = m[1]
11
11
  val = m[2].to_i # absolute
12
12
  size = rel.empty? ? val : relative(handler.font_size_stack, rel, val)
13
- handler.font_size_stack << size
13
+ handler.font_size_stack << size
14
14
  else
15
15
  handler.font_size_stack << nil
16
16
  end
@@ -27,7 +27,7 @@ module Boilerpipe::SAX::TagActions
27
27
  end
28
28
 
29
29
  def relative(font_size_stack, rel, val)
30
- prev_size = font_size_stack.reverse_each.find{|s| s != nil}
30
+ prev_size = font_size_stack.reverse_each.find { |s| !s.nil? }
31
31
  prev_size = 3 if prev_size.nil?
32
32
 
33
33
  size = if rel == '+'
@@ -1,3 +1,3 @@
1
1
  module Boilerpipe
2
- VERSION = '0.4.0'
2
+ VERSION = '0.5.0'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boilerpipe-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregory Ostermayr
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-09-15 00:00:00.000000000 Z
11
+ date: 2021-02-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -16,71 +16,71 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.11'
19
+ version: '2.0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.11'
26
+ version: '2.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '10.0'
33
+ version: 12.3.3
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '10.0'
40
+ version: 12.3.3
41
41
  - !ruby/object:Gem::Dependency
42
- name: rspec
42
+ name: rickshaw
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '3.0'
47
+ version: 0.5.0
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '3.0'
54
+ version: 0.5.0
55
55
  - !ruby/object:Gem::Dependency
56
- name: rickshaw
56
+ name: rspec
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: 0.4.0
61
+ version: '3.10'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: 0.4.0
68
+ version: '3.10'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: nokogiri
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - ">="
73
+ - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: 1.6.6.2
75
+ version: '1.10'
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - ">="
80
+ - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: 1.6.6.2
83
- description: A pure ruby implementation of the boilerpipe algorithm
82
+ version: '1.10'
83
+ description: A pure ruby implementation of the boilerpipe web content extraction algorithm
84
84
  email:
85
85
  - "<gregory.ostermayr@gmail.com>"
86
86
  executables: []
@@ -88,9 +88,11 @@ extensions: []
88
88
  extra_rdoc_files: []
89
89
  files:
90
90
  - ".circleci/config.yml"
91
+ - ".dockerignore"
91
92
  - ".gitignore"
92
93
  - ".rspec"
93
94
  - CHANGELOG.md
95
+ - Dockerfile
94
96
  - Gemfile
95
97
  - LICENSE.txt
96
98
  - README.md
@@ -98,6 +100,7 @@ files:
98
100
  - bin/console
99
101
  - bin/setup
100
102
  - boilerpipe-ruby.gemspec
103
+ - boilerpipe_flow.md
101
104
  - lib/boilerpipe.rb
102
105
  - lib/boilerpipe/document/text_block.rb
103
106
  - lib/boilerpipe/document/text_document.rb
@@ -133,6 +136,7 @@ files:
133
136
  - lib/boilerpipe/labels/label_action.rb
134
137
  - lib/boilerpipe/sax/boilerpipe_html_parser.rb
135
138
  - lib/boilerpipe/sax/html_content_handler.rb
139
+ - lib/boilerpipe/sax/preprocessor.rb
136
140
  - lib/boilerpipe/sax/tag_action_map.rb
137
141
  - lib/boilerpipe/sax/tag_actions/anchor_text.rb
138
142
  - lib/boilerpipe/sax/tag_actions/block_level.rb
@@ -151,7 +155,7 @@ homepage: https://github.com/gregors/boilerpipe-ruby
151
155
  licenses:
152
156
  - Apache 2.0
153
157
  metadata: {}
154
- post_install_message:
158
+ post_install_message:
155
159
  rdoc_options: []
156
160
  require_paths:
157
161
  - lib
@@ -166,9 +170,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
166
170
  - !ruby/object:Gem::Version
167
171
  version: '0'
168
172
  requirements: []
169
- rubyforge_project:
170
- rubygems_version: 2.6.12
171
- signing_key:
173
+ rubygems_version: 3.0.8
174
+ signing_key:
172
175
  specification_version: 4
173
- summary: A pure ruby implemenation of the boilerpipe algorithm
176
+ summary: A pure ruby implementation of the boilerpipe web content extraction algorithm
174
177
  test_files: []