boilerpipe-ruby 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +6 -24
  3. data/.dockerignore +7 -0
  4. data/CHANGELOG.md +28 -1
  5. data/Dockerfile +14 -0
  6. data/README.md +13 -4
  7. data/Rakefile +3 -4
  8. data/bin/console +3 -3
  9. data/boilerpipe-ruby.gemspec +9 -9
  10. data/boilerpipe_flow.md +40 -0
  11. data/lib/boilerpipe.rb +4 -0
  12. data/lib/boilerpipe/document/text_block.rb +10 -12
  13. data/lib/boilerpipe/document/text_document.rb +4 -5
  14. data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
  15. data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
  16. data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
  17. data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +0 -1
  18. data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
  19. data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
  20. data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
  21. data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
  22. data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
  23. data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
  24. data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +10 -23
  25. data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
  26. data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
  27. data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
  28. data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
  29. data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
  30. data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
  31. data/lib/boilerpipe/filters/min_clause_words_filter.rb +8 -11
  32. data/lib/boilerpipe/filters/min_words_filter.rb +1 -3
  33. data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
  34. data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
  35. data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +0 -3
  36. data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
  37. data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
  38. data/lib/boilerpipe/labels/label_action.rb +1 -1
  39. data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -11
  40. data/lib/boilerpipe/sax/html_content_handler.rb +25 -22
  41. data/lib/boilerpipe/sax/preprocessor.rb +11 -0
  42. data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
  43. data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
  44. data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
  45. data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
  46. data/lib/boilerpipe/sax/tag_actions/body.rb +4 -4
  47. data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
  48. data/lib/boilerpipe/version.rb +1 -1
  49. metadata +28 -25
@@ -1,9 +1,7 @@
1
-
2
- # Removes TextBlocks which have explicitly been marked as "not content".
1
+ # Removes TextBlocks which have explicitly been marked as "not content".
3
2
 
4
3
  module Boilerpipe::Filters
5
4
  class BoilerplateBlockFilter
6
-
7
5
  def initialize(label)
8
6
  @label_to_keep = label
9
7
  end
@@ -21,6 +19,5 @@ module Boilerpipe::Filters
21
19
  doc.replace_text_blocks!(combined)
22
20
  doc
23
21
  end
24
-
25
22
  end
26
23
  end
@@ -1,10 +1,9 @@
1
- # A full-text extractor trained on http://krdwrd.org/
2
- # https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
3
- # Works well with SimpleEstimator, too.
1
+ # A full-text extractor trained on http://krdwrd.org/
2
+ # https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
3
+ # Works well with SimpleEstimator, too.
4
4
 
5
5
  module Boilerpipe::Filters
6
6
  class CanolaClassifier
7
-
8
7
  def self.process(doc)
9
8
  return doc if doc.text_blocks.size < 1
10
9
 
@@ -22,7 +21,7 @@ module Boilerpipe::Filters
22
21
  def self.classify(prev, current, nxt)
23
22
  current.link_density > 0 && nxt.num_words > 11 \
24
23
  || current.num_words > 19 \
25
- || nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && ( current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19 )
24
+ || nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && (current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19)
26
25
  end
27
26
  end
28
27
  end
@@ -5,9 +5,8 @@
5
5
 
6
6
  module Boilerpipe::Filters
7
7
  class DensityRulesClassifier
8
-
9
8
  def self.process(doc)
10
- #return doc if doc.text_blocks.size < 2
9
+ # return doc if doc.text_blocks.size < 2
11
10
 
12
11
  empty = Boilerpipe::Document::TextBlock.empty_start
13
12
  text_blocks = [empty] + doc.text_blocks + [empty]
@@ -26,12 +25,14 @@ module Boilerpipe::Filters
26
25
  if prev.link_density <= 0.555556
27
26
  if current.text_density <= 9
28
27
  return true if nxt.text_density > 10
28
+
29
29
  return prev.text_density <= 4 ? false : true
30
30
  else
31
31
  return nxt.text_density == 0 ? false : true
32
32
  end
33
33
  else
34
34
  return false if nxt.text_density <= 11
35
+
35
36
  true
36
37
  end
37
38
  end
@@ -1,12 +1,9 @@
1
- # encoding: utf-8
2
- require 'set'
1
+ # Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
2
+ # some heuristics which are quite specific to the news domain.
3
3
 
4
- # Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
5
- # some heuristics which are quite specific to the news domain.
6
-
7
- # we create a list of potential titles from the page title
8
- # then we look at every text block and if the text block
9
- # contains a potential title - we set that text block label as :TITLE
4
+ # we create a list of potential titles from the page title
5
+ # then we look at every text block and if the text block
6
+ # contains a potential title - we set that text block label as :TITLE
10
7
 
11
8
  module Boilerpipe::Filters
12
9
  class DocumentTitleMatchClassifier
@@ -55,24 +52,24 @@ module Boilerpipe::Filters
55
52
  @potential_titles << title
56
53
 
57
54
  # unnecessary
58
- #p = longest_part(title, /[ ]*[|»-][ ]*/)
59
- #@potential_titles << p if p
55
+ # p = longest_part(title, /[ ]*[|»-][ ]*/)
56
+ # @potential_titles << p if p
60
57
 
61
- #p = longest_part(title, /[ ]*[|»:][ ]*/)
62
- #@potential_titles << p if p
58
+ # p = longest_part(title, /[ ]*[|»:][ ]*/)
59
+ # @potential_titles << p if p
63
60
 
64
- #p = longest_part(title, /[ ]*[|»:()][ ]*/)
65
- #@potential_titles << p if p
61
+ # p = longest_part(title, /[ ]*[|»:()][ ]*/)
62
+ # @potential_titles << p if p
66
63
 
67
- #p = longest_part(title, /[ ]*[|»:()-][ ]*/)
68
- #@potential_titles << p if p
64
+ # p = longest_part(title, /[ ]*[|»:()-][ ]*/)
65
+ # @potential_titles << p if p
69
66
 
70
67
  p = longest_part(title, /[ ]*[|»,:()-][ ]*/)
71
68
  @potential_titles << p if p
72
69
 
73
70
  # we replace \u00a0 so why check for it?
74
- #p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
75
- #@potential_titles << p if p
71
+ # p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
72
+ # @potential_titles << p if p
76
73
 
77
74
  add_potential_titles(title, /[ ]+[|][ ]+/, 4)
78
75
  add_potential_titles(title, /[ ]+[-][ ]+/, 4)
@@ -90,6 +87,7 @@ module Boilerpipe::Filters
90
87
 
91
88
  parts.each do |part|
92
89
  next if part =~ /[.]com/
90
+
93
91
  num_words = number_of_words(part)
94
92
 
95
93
  if num_words > longest_num_words || part.size > longest_part.size
@@ -107,6 +105,7 @@ module Boilerpipe::Filters
107
105
 
108
106
  parts.each do |part|
109
107
  next if part =~ /[.]com/
108
+
110
109
  num_words = number_of_words(part)
111
110
 
112
111
  @potential_titles << part if num_words >= min_words
@@ -116,6 +115,5 @@ module Boilerpipe::Filters
116
115
  def number_of_words(s)
117
116
  s.split(/[\b ]+/).size
118
117
  end
119
-
120
118
  end
121
119
  end
@@ -1,43 +1,30 @@
1
-
2
1
  # Marks all TextBlocks "content" which are between the headline and the part that has
3
2
  # already been marked content, if they are marked MIGHT_BE_CONTENT.
4
3
  # This filter is quite specific to the news domain.
5
4
  # used downstream of KeepLargetBlockFilter since that's what sets MIGHT_BE_CONTENT
6
5
 
7
-
8
6
  module Boilerpipe::Filters
9
7
  class ExpandTitleToContentFilter
10
8
  def self.process(doc)
11
9
  tbs = doc.text_blocks
12
10
 
13
- # slower and more ruby-like
14
- # comeback and let's do some benchmarking
15
- # titles = tbs.select{ |tb| tb.has_label?(:TITLE) }
16
- # title = tbs.index(titles.last)
17
- # content_start = tbs.find_index(&:is_content?)
11
+ title = tbs.select{ |tb| tb.has_label?(:TITLE) }.last
12
+ title_idx = tbs.index(title)
18
13
 
19
- i = 0
20
- title = nil
21
- content_start = nil
14
+ content_start = tbs.find_index(&:is_content?)
22
15
 
23
- tbs.each do |tb|
24
- title = i if content_start.nil? && tb.has_label?(:TITLE)
25
- content_start = i if content_start.nil? && tb.is_content?
26
- i += 1
27
- end
16
+ return doc if no_title_with_subsequent_content?(content_start, title_idx)
28
17
 
29
- return doc if no_title_with_subsequent_content?(content_start, title)
30
-
31
- tbs.slice(title...content_start).each do |tb|
32
- tb.content = true if tb.has_label?(:MIGHT_BE_CONTENT)
33
- end
18
+ tbs.slice(title_idx...content_start)
19
+ .select{ |tb| tb.has_label?(:MIGHT_BE_CONTENT) }
20
+ .each{ |tb| tb.content = true }
34
21
 
35
22
  doc
36
23
  end
37
24
 
38
- def self.no_title_with_subsequent_content?(content_start, title)
39
- title.nil? || content_start.nil? || content_start <= title
25
+ def self.no_title_with_subsequent_content?(content_start, title_idx)
26
+ # title has to start before content
27
+ title_idx.nil? || content_start.nil? || title_idx >= content_start
40
28
  end
41
-
42
29
  end
43
30
  end
@@ -1,6 +1,6 @@
1
1
  module Boilerpipe::Filters
2
2
  class HeuristicFilterBase
3
- def self.num_full_text_words(tb, min_text_density=9.0)
3
+ def self.num_full_text_words(tb, min_text_density = 9.0)
4
4
  tb.text_density >= min_text_density ? tb.num_words : 0
5
5
  end
6
6
  end
@@ -1,12 +1,11 @@
1
- # Marks all blocks as "non-content" that occur after blocks that have been
2
- # marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
3
- # number of words in content blocks occur before this mark (default: 60).
4
- # This can be used in conjunction with an upstream TerminatingBlocksFinder.
1
+ # Marks all blocks as "non-content" that occur after blocks that have been
2
+ # marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
3
+ # number of words in content blocks occur before this mark (default: 60).
4
+ # This can be used in conjunction with an upstream TerminatingBlocksFinder.
5
5
 
6
6
  module Boilerpipe::Filters
7
7
  class IgnoreBlocksAfterContentFilter < HeuristicFilterBase
8
-
9
- def self.process(doc, min_num_words=60)
8
+ def self.process(doc, min_num_words = 60)
10
9
  found_end_of_text = false
11
10
  num_words = 0
12
11
 
@@ -19,6 +18,5 @@ module Boilerpipe::Filters
19
18
 
20
19
  doc
21
20
  end
22
-
23
21
  end
24
22
  end
@@ -1,4 +1,3 @@
1
-
2
1
  # Keeps the largest TextBlock only (by the number of words). In case of
3
2
  # more than one block with the same number of words, the first block is chosen.
4
3
  # All discarded blocks are marked "not content" and flagged as :MIGHT_BE_CONTENT.
@@ -8,7 +7,6 @@
8
7
 
9
8
  module Boilerpipe::Filters
10
9
  class KeepLargestBlockFilter
11
-
12
10
  def initialize(expand_to_same_level_text, min_words)
13
11
  @expand_to_same_level_text = expand_to_same_level_text
14
12
  @min_words = min_words
@@ -43,7 +41,7 @@ module Boilerpipe::Filters
43
41
  expand_tag_level(tbs[0...n].reverse, level, @min_words)
44
42
 
45
43
  # expand blocks to the right
46
- expand_tag_level(tbs[n+1..-1], level, @min_words)
44
+ expand_tag_level(tbs[n + 1..-1], level, @min_words)
47
45
  end
48
46
  end
49
47
 
@@ -57,6 +55,5 @@ module Boilerpipe::Filters
57
55
  end
58
56
  end
59
57
  end
60
-
61
58
  end
62
59
  end
@@ -1,4 +1,3 @@
1
-
2
1
  # Marks all blocks as content that:
3
2
  # are on the same tag-level as very likely main content
4
3
  # (usually the level of the largest block)
@@ -7,23 +6,22 @@
7
6
 
8
7
  module Boilerpipe::Filters
9
8
  class LargeBlockSameTagLevelToContentFilter
10
-
11
9
  def self.process(doc)
12
-
13
10
  largest = doc.text_blocks.find do |tb|
14
11
  tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
15
12
  end
16
13
 
17
14
  return doc if largest.nil?
15
+
18
16
  tag_level = largest.tag_level
19
17
 
20
18
  doc.text_blocks.each do |tb|
21
19
  next if tb.is_content?
20
+
22
21
  tb.content = true if tb.num_words >= 100 && tb.tag_level == tag_level
23
22
  end
24
23
 
25
24
  doc
26
25
  end
27
-
28
26
  end
29
27
  end
@@ -11,7 +11,7 @@ module Boilerpipe::Filters
11
11
  doc.text_blocks.each do |tb|
12
12
  if tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
13
13
  tag_level = tb.tag_level
14
- elsif (tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0)
14
+ elsif tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0
15
15
  tb.content = true
16
16
  else
17
17
  tag_level = MAX
@@ -20,6 +20,5 @@ module Boilerpipe::Filters
20
20
 
21
21
  doc
22
22
  end
23
-
24
23
  end
25
24
  end
@@ -1,14 +1,12 @@
1
- # Marks all blocks as content.
1
+ # Marks all blocks as content.
2
2
 
3
3
  module Boilerpipe::Filters
4
4
  class MarkEverythingContentFilter
5
-
6
5
  def self.process(doc)
7
6
  doc.text_blocks.each do |tb|
8
7
  tb.content = true if tb.is_not_content?
9
8
  end
10
9
  doc
11
10
  end
12
-
13
11
  end
14
12
  end
@@ -8,30 +8,27 @@
8
8
 
9
9
  module Boilerpipe::Filters
10
10
  class MinClauseWordsFilter
11
-
12
- def self.process(doc, min_words=5)
13
-
11
+ def self.process(doc, min_words = 5)
14
12
  doc.text_blocks.each do |tb|
15
13
  next if tb.is_not_content?
16
14
 
17
15
  clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/
16
+ hasClause = false
18
17
  tb.text.scan(clause_delimiter).each do |possible_clause|
19
- if is_clause? possible_clause
20
- break
21
- else
22
- tb.content = false
23
- end
18
+ hasClause |= is_clause? possible_clause
24
19
  end
20
+
21
+ tb.content = false unless hasClause
25
22
  end
26
23
 
27
24
  doc
28
25
  end
29
26
 
30
- def self.is_clause?(text, min_words=5)
31
- return false if text.nil?
27
+ def self.is_clause?(text, min_words = 5)
28
+ return false if text.nil?
29
+
32
30
  whitespace = /[ \n\r]+/
33
31
  text.scan(whitespace).size >= min_words
34
32
  end
35
-
36
33
  end
37
34
  end
@@ -1,16 +1,14 @@
1
-
2
1
  # Keeps only those content blocks which contain at least k words.
3
2
 
4
3
  module Boilerpipe::Filters
5
4
  class MinWordsFilter
6
-
7
5
  def self.process(min_words, doc)
8
6
  doc.text_blocks.each do |tb|
9
7
  next if tb.is_not_content?
8
+
10
9
  tb.content = false if tb.num_words < min_words
11
10
  end
12
11
  doc
13
12
  end
14
-
15
13
  end
16
14
  end
@@ -1,5 +1,3 @@
1
- # encoding: utf-8
2
-
3
1
  # Classifies TextBlocks as content/not-content through rules that have been determined
4
2
  # using the C4.8 machine learning algorithm, as described in the paper
5
3
  # "Boilerplate Detection using Shallow Text Features" (WSDM 2010), particularly
@@ -7,7 +5,6 @@
7
5
 
8
6
  module Boilerpipe::Filters
9
7
  class NumWordsRulesClassifier
10
-
11
8
  def self.process(doc)
12
9
  empty = Boilerpipe::Document::TextBlock.empty_start
13
10
  text_blocks = [empty] + doc.text_blocks + [empty]
@@ -37,6 +34,5 @@ module Boilerpipe::Filters
37
34
 
38
35
  false
39
36
  end
40
-
41
37
  end
42
38
  end
@@ -1,4 +1,4 @@
1
- # Merges two subsequent blocks if their text densities are equal.
1
+ # Merges two subsequent blocks if their text densities are equal.
2
2
 
3
3
  module Boilerpipe::Filters
4
4
  class SimpleBlockFusionProcessor
@@ -17,7 +17,7 @@ module Boilerpipe::Filters
17
17
  end
18
18
  end
19
19
 
20
- doc.replace_text_blocks!( tbs - blocks_to_remove )
20
+ doc.replace_text_blocks!(tbs - blocks_to_remove)
21
21
  doc
22
22
  end
23
23
  end
@@ -1,4 +1,3 @@
1
-
2
1
  # Splits TextBlocks at paragraph boundaries.
3
2
  #
4
3
  # NOTE: This is not fully supported (i.e., it will break highlighting support via
@@ -8,7 +7,6 @@
8
7
 
9
8
  module Boilerpipe::Filters
10
9
  class SplitParagraphBlocksFilter
11
-
12
10
  def self.process(doc)
13
11
  tbs = doc.text_blocks
14
12
  new_blocks = []
@@ -35,6 +33,5 @@ module Boilerpipe::Filters
35
33
  doc.replace_text_blocks!(new_blocks) if changes
36
34
  doc
37
35
  end
38
-
39
36
  end
40
37
  end
@@ -1,15 +1,13 @@
1
- # encoding: utf-8
2
-
3
1
  # Finds blocks which are potentially indicating the end of an article
4
2
  # text and marks them with INDICATES_END_OF_TEXT. This can be used
5
3
  # in conjunction with a downstream IgnoreBlocksAfterContentFilter.
6
4
 
7
-
8
5
  module Boilerpipe::Filters
9
6
  class TerminatingBlocksFinder
10
7
  def self.process(doc)
11
8
  doc.text_blocks.each do |tb|
12
9
  next unless tb.num_words < 15
10
+
13
11
  if tb.text.length >= 8 && finds_match?(tb.text.downcase)
14
12
  tb.labels << :INDICATES_END_OF_TEXT
15
13
  elsif tb.link_density == 1.0 && tb.text == 'comment'
@@ -29,11 +27,11 @@ module Boilerpipe::Filters
29
27
  text.include?('what you think...') ||
30
28
  text.include?('add your comment') ||
31
29
  text.include?('add comment') ||
32
- #TODO add this and test
33
- #text.include?('leave a reply') ||
34
- #text.include?('leave a comment') ||
35
- #text.include?('show comments') ||
36
- #text.include?('Share this:') ||
30
+ # TODO add this and test
31
+ # text.include?('leave a reply') ||
32
+ # text.include?('leave a comment') ||
33
+ # text.include?('show comments') ||
34
+ # text.include?('Share this:') ||
37
35
  text.include?('reader views') ||
38
36
  text.include?('have your say') ||
39
37
  text.include?('reader comments') ||