boilerpipe-ruby 0.3.0 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +6 -24
  3. data/.dockerignore +7 -0
  4. data/CHANGELOG.md +30 -1
  5. data/Dockerfile +14 -0
  6. data/README.md +15 -4
  7. data/Rakefile +3 -4
  8. data/bin/console +3 -3
  9. data/boilerpipe-ruby.gemspec +9 -9
  10. data/boilerpipe_flow.md +40 -0
  11. data/lib/boilerpipe.rb +9 -0
  12. data/lib/boilerpipe/document/text_block.rb +10 -12
  13. data/lib/boilerpipe/document/text_document.rb +4 -5
  14. data/lib/boilerpipe/extractors/article_sentence_extractor.rb +17 -0
  15. data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
  16. data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
  17. data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
  18. data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +20 -0
  19. data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
  20. data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
  21. data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
  22. data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
  23. data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
  24. data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
  25. data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +0 -3
  26. data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
  27. data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
  28. data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
  29. data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
  30. data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
  31. data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
  32. data/lib/boilerpipe/filters/min_clause_words_filter.rb +34 -0
  33. data/lib/boilerpipe/filters/min_words_filter.rb +14 -0
  34. data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
  35. data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
  36. data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +37 -0
  37. data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
  38. data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
  39. data/lib/boilerpipe/labels/label_action.rb +1 -1
  40. data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -11
  41. data/lib/boilerpipe/sax/html_content_handler.rb +13 -16
  42. data/lib/boilerpipe/sax/preprocessor.rb +11 -0
  43. data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
  44. data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
  45. data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
  46. data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
  47. data/lib/boilerpipe/sax/tag_actions/body.rb +2 -2
  48. data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
  49. data/lib/boilerpipe/version.rb +1 -1
  50. metadata +33 -25
@@ -1,6 +1,5 @@
1
1
  module Boilerpipe::Extractors
2
2
  class DefaultExtractor
3
-
4
3
  def self.text(contents)
5
4
  doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
5
  ::Boilerpipe::Extractors::DefaultExtractor.process doc
@@ -1,4 +1,4 @@
1
- # Marks all blocks as content.
1
+ # Marks all blocks as content.
2
2
 
3
3
  module Boilerpipe::Extractors
4
4
  class KeepEverythingExtractor
@@ -0,0 +1,20 @@
1
+ # Keeps every text block as content, then drops the blocks that are too short:
2
+ # all blocks are marked as content and any block with fewer than `min` words
3
+ # is marked as non-content again.
4
+
5
+ module Boilerpipe::Extractors
6
+ class KeepEverythingWithKMinWordsExtractor
7
+ def self.text(min, contents)
8
+ doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
9
+ ::Boilerpipe::Extractors::KeepEverythingWithKMinWordsExtractor.process min, doc
10
+ doc.content
11
+ end
12
+
13
+ def self.process(min, doc)
14
+ ::Boilerpipe::Filters::SimpleBlockFusionProcessor.process doc
15
+ ::Boilerpipe::Filters::MarkEverythingContentFilter.process doc
16
+ ::Boilerpipe::Filters::MinWordsFilter.process min, doc
17
+ doc
18
+ end
19
+ end
20
+ end
@@ -1,6 +1,5 @@
1
1
  module Boilerpipe::Extractors
2
2
  class NumWordsRulesExtractor
3
-
4
3
  def self.text(contents)
5
4
  doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
5
  ::Boilerpipe::Extractors::NumWordsRulesExtractor.process doc
@@ -1,11 +1,8 @@
1
-
2
- # Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
3
- # probably makes sense only in cases where an upstream filter already has removed some blocks.
1
+ # Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
2
+ # probably makes sense only in cases where an upstream filter already has removed some blocks.
4
3
 
5
4
  module Boilerpipe::Filters
6
5
  class BlockProximityFusion
7
-
8
-
9
6
  def initialize(max_blocks_distance, content_only, same_tag_level_only)
10
7
  @max_blocks_distance = max_blocks_distance
11
8
  @content_only = content_only
@@ -13,8 +10,8 @@ module Boilerpipe::Filters
13
10
  end
14
11
 
15
12
  MAX_DISTANCE_1 = BlockProximityFusion.new(1, false, false)
16
- MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new( 1, false, true)
17
- MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new( 1, true, false)
13
+ MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(1, false, true)
14
+ MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(1, true, false)
18
15
  MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion.new(1, true, true)
19
16
 
20
17
  def process(doc)
@@ -22,7 +19,7 @@ module Boilerpipe::Filters
22
19
  return false if text_blocks.size < 2
23
20
 
24
21
  prev_block = if @content_only
25
- text_blocks.find{ |tb| tb.is_content? }
22
+ text_blocks.find { |tb| tb.is_content? }
26
23
  else
27
24
  text_blocks.first
28
25
  end
@@ -46,18 +43,16 @@ module Boilerpipe::Filters
46
43
  ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only
47
44
  ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only
48
45
 
49
- if ok
46
+ if ok
50
47
  prev_block.merge_next(tb)
51
48
  blocks_to_remove << tb
52
49
  else
53
50
  prev_block = tb
54
51
  end
55
52
  end
56
-
57
53
  end
58
- doc.replace_text_blocks!( text_blocks - blocks_to_remove )
54
+ doc.replace_text_blocks!(text_blocks - blocks_to_remove)
59
55
  doc
60
56
  end
61
-
62
57
  end
63
58
  end
@@ -1,9 +1,7 @@
1
-
2
- # Removes TextBlocks which have explicitly been marked as "not content".
1
+ # Removes TextBlocks which have explicitly been marked as "not content".
3
2
 
4
3
  module Boilerpipe::Filters
5
4
  class BoilerplateBlockFilter
6
-
7
5
  def initialize(label)
8
6
  @label_to_keep = label
9
7
  end
@@ -21,6 +19,5 @@ module Boilerpipe::Filters
21
19
  doc.replace_text_blocks!(combined)
22
20
  doc
23
21
  end
24
-
25
22
  end
26
23
  end
@@ -1,10 +1,9 @@
1
- # A full-text extractor trained on http://krdwrd.org/
2
- # https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
3
- # Works well with SimpleEstimator, too.
1
+ # A full-text extractor trained on http://krdwrd.org/
2
+ # https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
3
+ # Works well with SimpleEstimator, too.
4
4
 
5
5
  module Boilerpipe::Filters
6
6
  class CanolaClassifier
7
-
8
7
  def self.process(doc)
9
8
  return doc if doc.text_blocks.size < 1
10
9
 
@@ -22,7 +21,7 @@ module Boilerpipe::Filters
22
21
  def self.classify(prev, current, nxt)
23
22
  current.link_density > 0 && nxt.num_words > 11 \
24
23
  || current.num_words > 19 \
25
- || nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && ( current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19 )
24
+ || nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && (current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19)
26
25
  end
27
26
  end
28
27
  end
@@ -5,9 +5,8 @@
5
5
 
6
6
  module Boilerpipe::Filters
7
7
  class DensityRulesClassifier
8
-
9
8
  def self.process(doc)
10
- #return doc if doc.text_blocks.size < 2
9
+ # return doc if doc.text_blocks.size < 2
11
10
 
12
11
  empty = Boilerpipe::Document::TextBlock.empty_start
13
12
  text_blocks = [empty] + doc.text_blocks + [empty]
@@ -26,12 +25,14 @@ module Boilerpipe::Filters
26
25
  if prev.link_density <= 0.555556
27
26
  if current.text_density <= 9
28
27
  return true if nxt.text_density > 10
28
+
29
29
  return prev.text_density <= 4 ? false : true
30
30
  else
31
31
  return nxt.text_density == 0 ? false : true
32
32
  end
33
33
  else
34
34
  return false if nxt.text_density <= 11
35
+
35
36
  true
36
37
  end
37
38
  end
@@ -1,12 +1,9 @@
1
- # encoding: utf-8
2
- require 'set'
1
+ # Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
2
+ # some heuristics which are quite specific to the news domain.
3
3
 
4
- # Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
5
- # some heuristics which are quite specific to the news domain.
6
-
7
- # we create a list of potential titles from the page title
8
- # then we look at every text block and if the text block
9
- # contains a potential title - we set that text block label as :TITLE
4
+ # we create a list of potential titles from the page title
5
+ # then we look at every text block and if the text block
6
+ # contains a potential title - we set that text block label as :TITLE
10
7
 
11
8
  module Boilerpipe::Filters
12
9
  class DocumentTitleMatchClassifier
@@ -55,24 +52,24 @@ module Boilerpipe::Filters
55
52
  @potential_titles << title
56
53
 
57
54
  # unnecessary
58
- #p = longest_part(title, /[ ]*[|»-][ ]*/)
59
- #@potential_titles << p if p
55
+ # p = longest_part(title, /[ ]*[|»-][ ]*/)
56
+ # @potential_titles << p if p
60
57
 
61
- #p = longest_part(title, /[ ]*[|»:][ ]*/)
62
- #@potential_titles << p if p
58
+ # p = longest_part(title, /[ ]*[|»:][ ]*/)
59
+ # @potential_titles << p if p
63
60
 
64
- #p = longest_part(title, /[ ]*[|»:()][ ]*/)
65
- #@potential_titles << p if p
61
+ # p = longest_part(title, /[ ]*[|»:()][ ]*/)
62
+ # @potential_titles << p if p
66
63
 
67
- #p = longest_part(title, /[ ]*[|»:()-][ ]*/)
68
- #@potential_titles << p if p
64
+ # p = longest_part(title, /[ ]*[|»:()-][ ]*/)
65
+ # @potential_titles << p if p
69
66
 
70
67
  p = longest_part(title, /[ ]*[|»,:()-][ ]*/)
71
68
  @potential_titles << p if p
72
69
 
73
70
  # we replace \u00a0 so why check for it?
74
- #p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
75
- #@potential_titles << p if p
71
+ # p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
72
+ # @potential_titles << p if p
76
73
 
77
74
  add_potential_titles(title, /[ ]+[|][ ]+/, 4)
78
75
  add_potential_titles(title, /[ ]+[-][ ]+/, 4)
@@ -90,6 +87,7 @@ module Boilerpipe::Filters
90
87
 
91
88
  parts.each do |part|
92
89
  next if part =~ /[.]com/
90
+
93
91
  num_words = number_of_words(part)
94
92
 
95
93
  if num_words > longest_num_words || part.size > longest_part.size
@@ -107,6 +105,7 @@ module Boilerpipe::Filters
107
105
 
108
106
  parts.each do |part|
109
107
  next if part =~ /[.]com/
108
+
110
109
  num_words = number_of_words(part)
111
110
 
112
111
  @potential_titles << part if num_words >= min_words
@@ -116,6 +115,5 @@ module Boilerpipe::Filters
116
115
  def number_of_words(s)
117
116
  s.split(/[\b ]+/).size
118
117
  end
119
-
120
118
  end
121
119
  end
@@ -1,10 +1,8 @@
1
-
2
1
  # Marks all TextBlocks "content" which are between the headline and the part that has
3
2
  # already been marked content, if they are marked MIGHT_BE_CONTENT.
4
3
  # This filter is quite specific to the news domain.
5
4
  # used downstream of KeepLargestBlockFilter since that's what sets MIGHT_BE_CONTENT
6
5
 
7
-
8
6
  module Boilerpipe::Filters
9
7
  class ExpandTitleToContentFilter
10
8
  def self.process(doc)
@@ -38,6 +36,5 @@ module Boilerpipe::Filters
38
36
  def self.no_title_with_subsequent_content?(content_start, title)
39
37
  title.nil? || content_start.nil? || content_start <= title
40
38
  end
41
-
42
39
  end
43
40
  end
@@ -1,6 +1,6 @@
1
1
  module Boilerpipe::Filters
2
2
  class HeuristicFilterBase
3
- def self.num_full_text_words(tb, min_text_density=9.0)
3
+ def self.num_full_text_words(tb, min_text_density = 9.0)
4
4
  tb.text_density >= min_text_density ? tb.num_words : 0
5
5
  end
6
6
  end
@@ -1,12 +1,11 @@
1
- # Marks all blocks as "non-content" that occur after blocks that have been
2
- # marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
3
- # number of words in content blocks occur before this mark (default: 60).
4
- # This can be used in conjunction with an upstream TerminatingBlocksFinder.
1
+ # Marks all blocks as "non-content" that occur after blocks that have been
2
+ # marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
3
+ # number of words in content blocks occur before this mark (default: 60).
4
+ # This can be used in conjunction with an upstream TerminatingBlocksFinder.
5
5
 
6
6
  module Boilerpipe::Filters
7
7
  class IgnoreBlocksAfterContentFilter < HeuristicFilterBase
8
-
9
- def self.process(doc, min_num_words=60)
8
+ def self.process(doc, min_num_words = 60)
10
9
  found_end_of_text = false
11
10
  num_words = 0
12
11
 
@@ -19,6 +18,5 @@ module Boilerpipe::Filters
19
18
 
20
19
  doc
21
20
  end
22
-
23
21
  end
24
22
  end
@@ -1,4 +1,3 @@
1
-
2
1
  # Keeps the largest TextBlock only (by the number of words). In case of
3
2
  # more than one block with the same number of words, the first block is chosen.
4
3
  # All discarded blocks are marked "not content" and flagged as :MIGHT_BE_CONTENT.
@@ -8,7 +7,6 @@
8
7
 
9
8
  module Boilerpipe::Filters
10
9
  class KeepLargestBlockFilter
11
-
12
10
  def initialize(expand_to_same_level_text, min_words)
13
11
  @expand_to_same_level_text = expand_to_same_level_text
14
12
  @min_words = min_words
@@ -43,7 +41,7 @@ module Boilerpipe::Filters
43
41
  expand_tag_level(tbs[0...n].reverse, level, @min_words)
44
42
 
45
43
  # expand blocks to the right
46
- expand_tag_level(tbs[n+1..-1], level, @min_words)
44
+ expand_tag_level(tbs[n + 1..-1], level, @min_words)
47
45
  end
48
46
  end
49
47
 
@@ -57,6 +55,5 @@ module Boilerpipe::Filters
57
55
  end
58
56
  end
59
57
  end
60
-
61
58
  end
62
59
  end
@@ -1,4 +1,3 @@
1
-
2
1
  # Marks all blocks as content that:
3
2
  # are on the same tag-level as very likely main content
4
3
  # (usually the level of the largest block)
@@ -7,23 +6,22 @@
7
6
 
8
7
  module Boilerpipe::Filters
9
8
  class LargeBlockSameTagLevelToContentFilter
10
-
11
9
  def self.process(doc)
12
-
13
10
  largest = doc.text_blocks.find do |tb|
14
11
  tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
15
12
  end
16
13
 
17
14
  return doc if largest.nil?
15
+
18
16
  tag_level = largest.tag_level
19
17
 
20
18
  doc.text_blocks.each do |tb|
21
19
  next if tb.is_content?
20
+
22
21
  tb.content = true if tb.num_words >= 100 && tb.tag_level == tag_level
23
22
  end
24
23
 
25
24
  doc
26
25
  end
27
-
28
26
  end
29
27
  end
@@ -11,7 +11,7 @@ module Boilerpipe::Filters
11
11
  doc.text_blocks.each do |tb|
12
12
  if tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
13
13
  tag_level = tb.tag_level
14
- elsif (tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0)
14
+ elsif tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0
15
15
  tb.content = true
16
16
  else
17
17
  tag_level = MAX
@@ -20,6 +20,5 @@ module Boilerpipe::Filters
20
20
 
21
21
  doc
22
22
  end
23
-
24
23
  end
25
24
  end
@@ -1,14 +1,12 @@
1
- # Marks all blocks as content.
1
+ # Marks all blocks as content.
2
2
 
3
3
  module Boilerpipe::Filters
4
4
  class MarkEverythingContentFilter
5
-
6
5
  def self.process(doc)
7
6
  doc.text_blocks.each do |tb|
8
7
  tb.content = true if tb.is_not_content?
9
8
  end
10
9
  doc
11
10
  end
12
-
13
11
  end
14
12
  end
@@ -0,0 +1,34 @@
1
+ #
2
+ # Keeps only blocks that have at least one segment fragment ("clause") with at least k
3
+ # words (default: 5).
4
+ #
5
+ # NOTE: You might consider using the SplitParagraphBlocksFilter upstream.
6
+ #
7
+ # See also: SplitParagraphBlocksFilter
8
+
9
+ module Boilerpipe::Filters
10
+ class MinClauseWordsFilter
11
+ def self.process(doc, min_words = 5)
12
+ doc.text_blocks.each do |tb|
13
+ next if tb.is_not_content?
14
+
15
+ clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/
16
+ hasClause = false
17
+ tb.text.scan(clause_delimiter).each do |possible_clause|
18
+ hasClause |= is_clause? possible_clause
19
+ end
20
+
21
+ tb.content = false unless hasClause
22
+ end
23
+
24
+ doc
25
+ end
26
+
27
+ def self.is_clause?(text, min_words = 5)
28
+ return false if text.nil?
29
+
30
+ whitespace = /[ \n\r]+/
31
+ text.scan(whitespace).size >= min_words
32
+ end
33
+ end
34
+ end
@@ -0,0 +1,14 @@
1
+ # Keeps only those content blocks which contain at least k words.
2
+
3
+ module Boilerpipe::Filters
4
+ class MinWordsFilter
5
+ def self.process(min_words, doc)
6
+ doc.text_blocks.each do |tb|
7
+ next if tb.is_not_content?
8
+
9
+ tb.content = false if tb.num_words < min_words
10
+ end
11
+ doc
12
+ end
13
+ end
14
+ end
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  # Classifies TextBlocks as content/not-content through rules that have been determined
4
2
  # using the C4.8 machine learning algorithm, as described in the paper
5
3
  # "Boilerplate Detection using Shallow Text Features" (WSDM 2010), particularly
@@ -7,7 +5,6 @@
7
5
 
8
6
  module Boilerpipe::Filters
9
7
  class NumWordsRulesClassifier
10
-
11
8
  def self.process(doc)
12
9
  empty = Boilerpipe::Document::TextBlock.empty_start
13
10
  text_blocks = [empty] + doc.text_blocks + [empty]
@@ -37,6 +34,5 @@ module Boilerpipe::Filters
37
34
 
38
35
  false
39
36
  end
40
-
41
37
  end
42
38
  end