boilerpipe-ruby 0.2.0 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +6 -24
  3. data/.dockerignore +7 -0
  4. data/CHANGELOG.md +34 -1
  5. data/Dockerfile +14 -0
  6. data/README.md +32 -7
  7. data/Rakefile +3 -4
  8. data/bin/console +3 -3
  9. data/boilerpipe-ruby.gemspec +9 -9
  10. data/boilerpipe_flow.md +40 -0
  11. data/lib/boilerpipe.rb +14 -0
  12. data/lib/boilerpipe/document/text_block.rb +10 -12
  13. data/lib/boilerpipe/document/text_document.rb +4 -5
  14. data/lib/boilerpipe/extractors/article_sentence_extractor.rb +17 -0
  15. data/lib/boilerpipe/extractors/canola_extractor.rb +15 -0
  16. data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
  17. data/lib/boilerpipe/extractors/keep_everything_extractor.rb +16 -0
  18. data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +20 -0
  19. data/lib/boilerpipe/extractors/largest_content_extractor.rb +18 -0
  20. data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +14 -0
  21. data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
  22. data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
  23. data/lib/boilerpipe/filters/canola_classifier.rb +27 -0
  24. data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
  25. data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
  26. data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +0 -3
  27. data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
  28. data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
  29. data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
  30. data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
  31. data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
  32. data/lib/boilerpipe/filters/mark_everything_content_filter.rb +12 -0
  33. data/lib/boilerpipe/filters/min_clause_words_filter.rb +34 -0
  34. data/lib/boilerpipe/filters/min_words_filter.rb +14 -0
  35. data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
  36. data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
  37. data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +37 -0
  38. data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
  39. data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
  40. data/lib/boilerpipe/labels/label_action.rb +1 -1
  41. data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +3 -7
  42. data/lib/boilerpipe/sax/html_content_handler.rb +13 -16
  43. data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
  44. data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
  45. data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
  46. data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
  47. data/lib/boilerpipe/sax/tag_actions/body.rb +2 -2
  48. data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
  49. data/lib/boilerpipe/version.rb +1 -1
  50. metadata +38 -25
@@ -0,0 +1,14 @@
1
+ # Keeps only those content blocks which contain at least k words.
2
+
3
+ module Boilerpipe::Filters
4
+ class MinWordsFilter
5
+ def self.process(min_words, doc)
6
+ doc.text_blocks.each do |tb|
7
+ next if tb.is_not_content?
8
+
9
+ tb.content = false if tb.num_words < min_words
10
+ end
11
+ doc
12
+ end
13
+ end
14
+ end
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  # Classifies TextBlocks as content/not-content through rules that have been determined
4
2
  # using the C4.8 machine learning algorithm, as described in the paper
5
3
  # "Boilerplate Detection using Shallow Text Features" (WSDM 2010), particularly
@@ -7,7 +5,6 @@
7
5
 
8
6
  module Boilerpipe::Filters
9
7
  class NumWordsRulesClassifier
10
-
11
8
  def self.process(doc)
12
9
  empty = Boilerpipe::Document::TextBlock.empty_start
13
10
  text_blocks = [empty] + doc.text_blocks + [empty]
@@ -37,6 +34,5 @@ module Boilerpipe::Filters
37
34
 
38
35
  false
39
36
  end
40
-
41
37
  end
42
38
  end
@@ -1,4 +1,4 @@
1
- # Merges two subsequent blocks if their text densities are equal.
1
+ # Merges two subsequent blocks if their text densities are equal.
2
2
 
3
3
  module Boilerpipe::Filters
4
4
  class SimpleBlockFusionProcessor
@@ -17,7 +17,7 @@ module Boilerpipe::Filters
17
17
  end
18
18
  end
19
19
 
20
- doc.replace_text_blocks!( tbs - blocks_to_remove )
20
+ doc.replace_text_blocks!(tbs - blocks_to_remove)
21
21
  doc
22
22
  end
23
23
  end
@@ -0,0 +1,37 @@
1
+ # Splits TextBlocks at paragraph boundaries.
2
+ #
3
+ # NOTE: This is not fully supported (i.e., it will break highlighting support via
4
+ # #getContainedTextElements()), but this one probably is necessary for some other filters.
5
+ #
6
+ # see MinClauseWordsFilter
7
+
8
+ module Boilerpipe::Filters
9
+ class SplitParagraphBlocksFilter
10
+ def self.process(doc)
11
+ tbs = doc.text_blocks
12
+ new_blocks = []
13
+ changes = false
14
+ tbs.each do |tb|
15
+ paragraphs = tb.text.split(/[\n\r]+/)
16
+
17
+ if paragraphs.size < 2
18
+ new_blocks << tb
19
+ next
20
+ end
21
+
22
+ is_content = tb.is_content?
23
+ labels = tb.labels
24
+ paragraphs.each do |paragraph|
25
+ tbP = ::Boilerpipe::Document::TextBlock.new(paragraph)
26
+ tbP.content = is_content
27
+ tbP.add_labels(labels)
28
+ new_blocks << tbP
29
+ changes = true
30
+ end
31
+ end
32
+
33
+ doc.replace_text_blocks!(new_blocks) if changes
34
+ doc
35
+ end
36
+ end
37
+ end
@@ -1,15 +1,13 @@
1
- # encoding: utf-8
2
-
3
1
  # Finds blocks which are potentially indicating the end of an article
4
2
  # text and marks them with INDICATES_END_OF_TEXT. This can be used
5
3
  # in conjunction with a downstream IgnoreBlocksAfterContentFilter.
6
4
 
7
-
8
5
  module Boilerpipe::Filters
9
6
  class TerminatingBlocksFinder
10
7
  def self.process(doc)
11
8
  doc.text_blocks.each do |tb|
12
9
  next unless tb.num_words < 15
10
+
13
11
  if tb.text.length >= 8 && finds_match?(tb.text.downcase)
14
12
  tb.labels << :INDICATES_END_OF_TEXT
15
13
  elsif tb.link_density == 1.0 && tb.text == 'comment'
@@ -29,11 +27,11 @@ module Boilerpipe::Filters
29
27
  text.include?('what you think...') ||
30
28
  text.include?('add your comment') ||
31
29
  text.include?('add comment') ||
32
- #TODO add this and test
33
- #text.include?('leave a reply') ||
34
- #text.include?('leave a comment') ||
35
- #text.include?('show comments') ||
36
- #text.include?('Share this:') ||
30
+ # TODO add this and test
31
+ # text.include?('leave a reply') ||
32
+ # text.include?('leave a comment') ||
33
+ # text.include?('show comments') ||
34
+ # text.include?('Share this:') ||
37
35
  text.include?('reader views') ||
38
36
  text.include?('have your say') ||
39
37
  text.include?('reader comments') ||
@@ -1,4 +1,3 @@
1
-
2
1
  # Marks trailing headlines TextBlocks that have the label :#HEADING
3
2
  # as boilerplate. Trailing means they are marked content and are
4
3
  # below any other content block.
@@ -6,7 +5,6 @@
6
5
  module Boilerpipe::Filters
7
6
  class TrailingHeadlineToBoilerplateFilter
8
7
  def self.process(doc)
9
-
10
8
  doc.text_blocks.each do |tb|
11
9
  next unless tb.is_content?
12
10
 
@@ -19,6 +17,5 @@ module Boilerpipe::Filters
19
17
 
20
18
  doc
21
19
  end
22
-
23
20
  end
24
21
  end
@@ -2,7 +2,7 @@ module Boilerpipe::Labels
2
2
  class LabelAction
3
3
  attr_reader :labels
4
4
 
5
- def initialize(labels=[])
5
+ def initialize(labels = [])
6
6
  @labels = labels
7
7
  end
8
8
 
@@ -1,20 +1,16 @@
1
- require 'nokogiri'
2
1
  module Boilerpipe::SAX
3
2
  class BoilerpipeHTMLParser
4
3
  def self.parse(text)
5
-
6
- #script bug - delete script tags
7
- text = text.gsub(/\<script>.+?<\/script>/i, '')
4
+ # script bug - delete script tags
5
+ text.gsub!(/\<script>.+?<\/script>/i, '')
8
6
 
9
7
  # nokogiri uses libxml for mri and nekohtml for jruby
10
8
  # mri doesn't remove &nbsp; when missing the semicolon
11
- text = text.gsub(/(&nbsp) /, '\1; ')
12
-
9
+ text.gsub!(/(&nbsp) /, '\1; ')
13
10
 
14
11
  # use nokogiri to fix any bad tags, errors - keep experimenting with this
15
12
  text = Nokogiri::HTML(text).to_html
16
13
 
17
-
18
14
  handler = HTMLContentHandler.new
19
15
  noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
20
16
  noko_parser.parse(text)
@@ -1,11 +1,8 @@
1
- require 'nokogiri'
2
- require 'set'
3
-
4
1
  module Boilerpipe::SAX
5
2
  class HTMLContentHandler < Nokogiri::XML::SAX::Document
6
3
  attr_reader :in_ignorable_element, :label_stacks, :last_start_tag
7
4
 
8
- attr_accessor :in_anchor_tag, :token_buffer ,:font_size_stack
5
+ attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack
9
6
  ANCHOR_TEXT_START = "$\ue00a<"
10
7
  ANCHOR_TEXT_END = ">\ue00a$"
11
8
 
@@ -34,7 +31,6 @@ module Boilerpipe::SAX
34
31
  @label_stacks << nil
35
32
  tag = name.upcase.intern
36
33
 
37
-
38
34
  tag_action = @tag_actions[tag]
39
35
  if tag_action
40
36
  @tag_level += 1 if tag_action.changes_tag_level?
@@ -51,15 +47,15 @@ module Boilerpipe::SAX
51
47
  def characters(text)
52
48
  flush_block if @flush
53
49
 
54
- return if @in_ignorable_element != 0
50
+ return if in_ignorable_element?
55
51
  return if text.empty?
56
52
 
57
53
  # replace all whitespace with simple space
58
54
  text.gsub!(/\s+/, ' ')
59
55
 
60
56
  # trim whitespace
61
- started_with_whitespace = text =~ /^\s/
62
- ended_with_whitespace = text =~ /\s$/
57
+ started_with_whitespace = text =~ /^\s/
58
+ ended_with_whitespace = text =~ /\s$/
63
59
  text.strip!
64
60
 
65
61
  # add a single space if the block was only whitespace
@@ -158,10 +154,10 @@ module Boilerpipe::SAX
158
154
  end
159
155
 
160
156
  text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
161
- num_words,
162
- num_linked_words,
163
- num_words_in_wrapped_lines,
164
- num_wrapped_lines, @offset_blocks)
157
+ num_words,
158
+ num_linked_words,
159
+ num_words_in_wrapped_lines,
160
+ num_wrapped_lines, @offset_blocks)
165
161
 
166
162
  @offset_blocks += 1
167
163
  clear_buffers
@@ -187,10 +183,10 @@ module Boilerpipe::SAX
187
183
  # \p{No} -- a numeric character of other type
188
184
 
189
185
  def is_word?(word)
190
- word =~ VALID_WORD_CHARACTER
186
+ word =~ VALID_WORD_CHARACTER
191
187
  end
192
188
 
193
- #public void flushBlock() {
189
+ # public void flushBlock() {
194
190
  # int numWords = 0;
195
191
  # int numLinkedWords = 0;
196
192
  # int numWrappedLines = 0;
@@ -198,12 +194,13 @@ module Boilerpipe::SAX
198
194
  # final int maxLineLength = 80;
199
195
  # int numTokens = 0;
200
196
  # int numWordsCurrentLine = 0;
201
- #}
197
+ # }
202
198
 
203
199
  def increase_in_ignorable_element!
204
200
  @in_ignorable_element += 1
205
201
  end
206
202
 
203
+ # should we prevent less than zero here?
207
204
  def decrease_in_ignorable_element!
208
205
  @in_ignorable_element -= 1
209
206
  end
@@ -224,7 +221,6 @@ module Boilerpipe::SAX
224
221
  @in_anchor_tag > 0
225
222
  end
226
223
 
227
-
228
224
  def add_text_block(text_block)
229
225
  @label_stacks.each do |stack|
230
226
  next unless stack
@@ -239,6 +235,7 @@ module Boilerpipe::SAX
239
235
  # append space if last character wasn't already one
240
236
  def append_space
241
237
  return if @sb_last_was_whitespace
238
+
242
239
  @sb_last_was_whitespace = true
243
240
 
244
241
  @text_buffer << ' '
@@ -48,4 +48,3 @@ module Boilerpipe::SAX
48
48
  end
49
49
  end
50
50
  end
51
-
@@ -2,7 +2,7 @@ module Boilerpipe::SAX::TagActions
2
2
  class AnchorText
3
3
  # Marks this tag as "anchor" (this should usually only be set for the <A> tag). Anchor tags may not be nested.
4
4
  # There is a bug in certain versions of NekoHTML which still allows nested tags. If boilerpipe
5
- #* encounters such nestings, a SAXException is thrown.
5
+ # * encounters such nestings, a SAXException is thrown.
6
6
  def start(handler, name, attrs)
7
7
  if handler.in_anchor_tag?
8
8
  handler.in_anchor_tag += 1
@@ -42,7 +42,7 @@ module Boilerpipe::SAX::TagActions
42
42
  # - dunno about nokogiri???????
43
43
  # as nested A elements are not allowed per specification, we
44
44
  # are probably reaching this branch due to a bug in the XML parser
45
- #puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
45
+ # puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
46
46
  end_tag(handler, name)
47
47
  end
48
48
  end
@@ -1,6 +1,6 @@
1
1
  module Boilerpipe::SAX::TagActions
2
- # Explicitly marks this tag a simple "block-level" element,
3
- # which always generates whitespace
2
+ # Explicitly marks this tag a simple "block-level" element,
3
+ # which always generates whitespace
4
4
  class BlockLevel
5
5
  def start(handler, name, attrs)
6
6
  true
@@ -1,6 +1,6 @@
1
1
  module Boilerpipe::SAX::TagActions
2
- # for block-level elements, which triggers some LabelAction on
3
- # the generated TextBlock.
2
+ # for block-level elements, which triggers some LabelAction on
3
+ # the generated TextBlock.
4
4
  class BlockTagLabel
5
5
  def initialize(label_action)
6
6
  @label_action = label_action
@@ -1,6 +1,6 @@
1
1
  module Boilerpipe::SAX::TagActions
2
- # Marks this tag the body element (this should usually only
3
- # be set for the <BODY> tag).
2
+ # Marks this tag the body element (this should usually only
3
+ # be set for the <BODY> tag).
4
4
  class Body
5
5
  def start(handler, name, attrs)
6
6
  handler.flush_block
@@ -10,7 +10,7 @@ module Boilerpipe::SAX::TagActions
10
10
  rel = m[1]
11
11
  val = m[2].to_i # absolute
12
12
  size = rel.empty? ? val : relative(handler.font_size_stack, rel, val)
13
- handler.font_size_stack << size
13
+ handler.font_size_stack << size
14
14
  else
15
15
  handler.font_size_stack << nil
16
16
  end
@@ -27,7 +27,7 @@ module Boilerpipe::SAX::TagActions
27
27
  end
28
28
 
29
29
  def relative(font_size_stack, rel, val)
30
- prev_size = font_size_stack.reverse_each.find{|s| s != nil}
30
+ prev_size = font_size_stack.reverse_each.find { |s| !s.nil? }
31
31
  prev_size = 3 if prev_size.nil?
32
32
 
33
33
  size = if rel == '+'
@@ -1,3 +1,3 @@
1
1
  module Boilerpipe
2
- VERSION = '0.2.0'
2
+ VERSION = '0.4.3'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: boilerpipe-ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.4.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Gregory Ostermayr
8
- autorequire:
8
+ autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-09-11 00:00:00.000000000 Z
11
+ date: 2020-07-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -16,71 +16,71 @@ dependencies:
16
16
  requirements:
17
17
  - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '1.11'
19
+ version: '2.0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '1.11'
26
+ version: '2.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rake
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - "~>"
31
+ - - ">="
32
32
  - !ruby/object:Gem::Version
33
- version: '10.0'
33
+ version: 12.3.3
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - "~>"
38
+ - - ">="
39
39
  - !ruby/object:Gem::Version
40
- version: '10.0'
40
+ version: 12.3.3
41
41
  - !ruby/object:Gem::Dependency
42
- name: rspec
42
+ name: rickshaw
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '3.0'
47
+ version: 0.5.0
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
52
  - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '3.0'
54
+ version: 0.5.0
55
55
  - !ruby/object:Gem::Dependency
56
- name: rickshaw
56
+ name: rspec
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements:
59
59
  - - "~>"
60
60
  - !ruby/object:Gem::Version
61
- version: 0.4.0
61
+ version: '3.9'
62
62
  type: :development
63
63
  prerelease: false
64
64
  version_requirements: !ruby/object:Gem::Requirement
65
65
  requirements:
66
66
  - - "~>"
67
67
  - !ruby/object:Gem::Version
68
- version: 0.4.0
68
+ version: '3.9'
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: nokogiri
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
- - - ">="
73
+ - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: 1.6.6.2
75
+ version: '1.10'
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
- - - ">="
80
+ - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: 1.6.6.2
83
- description: A pure ruby implementation of the boilerpipe algorithm
82
+ version: '1.10'
83
+ description: A pure ruby implementation of the boilerpipe web content extraction algorithm
84
84
  email:
85
85
  - "<gregory.ostermayr@gmail.com>"
86
86
  executables: []
@@ -88,9 +88,11 @@ extensions: []
88
88
  extra_rdoc_files: []
89
89
  files:
90
90
  - ".circleci/config.yml"
91
+ - ".dockerignore"
91
92
  - ".gitignore"
92
93
  - ".rspec"
93
94
  - CHANGELOG.md
95
+ - Dockerfile
94
96
  - Gemfile
95
97
  - LICENSE.txt
96
98
  - README.md
@@ -98,14 +100,22 @@ files:
98
100
  - bin/console
99
101
  - bin/setup
100
102
  - boilerpipe-ruby.gemspec
103
+ - boilerpipe_flow.md
101
104
  - lib/boilerpipe.rb
102
105
  - lib/boilerpipe/document/text_block.rb
103
106
  - lib/boilerpipe/document/text_document.rb
104
107
  - lib/boilerpipe/errors.rb
105
108
  - lib/boilerpipe/extractors/article_extractor.rb
109
+ - lib/boilerpipe/extractors/article_sentence_extractor.rb
110
+ - lib/boilerpipe/extractors/canola_extractor.rb
106
111
  - lib/boilerpipe/extractors/default_extractor.rb
112
+ - lib/boilerpipe/extractors/keep_everything_extractor.rb
113
+ - lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb
114
+ - lib/boilerpipe/extractors/largest_content_extractor.rb
115
+ - lib/boilerpipe/extractors/num_words_rules_extractor.rb
107
116
  - lib/boilerpipe/filters/block_proximity_fusion.rb
108
117
  - lib/boilerpipe/filters/boilerplate_block_filter.rb
118
+ - lib/boilerpipe/filters/canola_classifier.rb
109
119
  - lib/boilerpipe/filters/density_rules_classifier.rb
110
120
  - lib/boilerpipe/filters/document_title_match_classifier.rb
111
121
  - lib/boilerpipe/filters/expand_title_to_content_filter.rb
@@ -114,8 +124,12 @@ files:
114
124
  - lib/boilerpipe/filters/keep_largest_block_filter.rb
115
125
  - lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
116
126
  - lib/boilerpipe/filters/list_at_end_filter.rb
127
+ - lib/boilerpipe/filters/mark_everything_content_filter.rb
128
+ - lib/boilerpipe/filters/min_clause_words_filter.rb
129
+ - lib/boilerpipe/filters/min_words_filter.rb
117
130
  - lib/boilerpipe/filters/num_words_rules_classifier.rb
118
131
  - lib/boilerpipe/filters/simple_block_fusion_processor.rb
132
+ - lib/boilerpipe/filters/split_paragraph_blocks_filter.rb
119
133
  - lib/boilerpipe/filters/terminating_blocks_finder.rb
120
134
  - lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb
121
135
  - lib/boilerpipe/labels/default.rb
@@ -140,7 +154,7 @@ homepage: https://github.com/gregors/boilerpipe-ruby
140
154
  licenses:
141
155
  - Apache 2.0
142
156
  metadata: {}
143
- post_install_message:
157
+ post_install_message:
144
158
  rdoc_options: []
145
159
  require_paths:
146
160
  - lib
@@ -155,9 +169,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
155
169
  - !ruby/object:Gem::Version
156
170
  version: '0'
157
171
  requirements: []
158
- rubyforge_project:
159
- rubygems_version: 2.6.12
160
- signing_key:
172
+ rubygems_version: 3.0.8
173
+ signing_key:
161
174
  specification_version: 4
162
- summary: A pure ruby implemenation of the boilerpipe algorithm
175
+ summary: A pure ruby implementation of the boilerpipe web content extraction algorithm
163
176
  test_files: []