boilerpipe-ruby 0.2.0 → 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. checksums.yaml +5 -5
  2. data/.circleci/config.yml +6 -24
  3. data/.dockerignore +7 -0
  4. data/CHANGELOG.md +34 -1
  5. data/Dockerfile +14 -0
  6. data/README.md +32 -7
  7. data/Rakefile +3 -4
  8. data/bin/console +3 -3
  9. data/boilerpipe-ruby.gemspec +9 -9
  10. data/boilerpipe_flow.md +40 -0
  11. data/lib/boilerpipe.rb +14 -0
  12. data/lib/boilerpipe/document/text_block.rb +10 -12
  13. data/lib/boilerpipe/document/text_document.rb +4 -5
  14. data/lib/boilerpipe/extractors/article_sentence_extractor.rb +17 -0
  15. data/lib/boilerpipe/extractors/canola_extractor.rb +15 -0
  16. data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
  17. data/lib/boilerpipe/extractors/keep_everything_extractor.rb +16 -0
  18. data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +20 -0
  19. data/lib/boilerpipe/extractors/largest_content_extractor.rb +18 -0
  20. data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +14 -0
  21. data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
  22. data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
  23. data/lib/boilerpipe/filters/canola_classifier.rb +27 -0
  24. data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
  25. data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
  26. data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +0 -3
  27. data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
  28. data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
  29. data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
  30. data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
  31. data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
  32. data/lib/boilerpipe/filters/mark_everything_content_filter.rb +12 -0
  33. data/lib/boilerpipe/filters/min_clause_words_filter.rb +34 -0
  34. data/lib/boilerpipe/filters/min_words_filter.rb +14 -0
  35. data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
  36. data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
  37. data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +37 -0
  38. data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
  39. data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
  40. data/lib/boilerpipe/labels/label_action.rb +1 -1
  41. data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +3 -7
  42. data/lib/boilerpipe/sax/html_content_handler.rb +13 -16
  43. data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
  44. data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
  45. data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
  46. data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
  47. data/lib/boilerpipe/sax/tag_actions/body.rb +2 -2
  48. data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
  49. data/lib/boilerpipe/version.rb +1 -1
  50. metadata +38 -25
@@ -0,0 +1,15 @@
1
+ module Boilerpipe::Extractors
2
+ class CanolaExtractor
3
+ def self.text(contents)
4
+ doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
5
+ ::Boilerpipe::Extractors::CanolaExtractor.process doc
6
+ doc.content
7
+ end
8
+
9
+ def self.process(doc)
10
+ ::Boilerpipe::Filters::CanolaClassifier.process doc
11
+
12
+ doc
13
+ end
14
+ end
15
+ end
@@ -1,6 +1,5 @@
1
1
  module Boilerpipe::Extractors
2
2
  class DefaultExtractor
3
-
4
3
  def self.text(contents)
5
4
  doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
6
5
  ::Boilerpipe::Extractors::DefaultExtractor.process doc
@@ -0,0 +1,16 @@
1
+ # Marks all blocks as content.
2
+
3
+ module Boilerpipe::Extractors
4
+ class KeepEverythingExtractor
5
+ def self.text(contents)
6
+ doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
7
+ ::Boilerpipe::Extractors::KeepEverythingExtractor.process doc
8
+ doc.content
9
+ end
10
+
11
+ def self.process(doc)
12
+ ::Boilerpipe::Filters::MarkEverythingContentFilter.process doc
13
+ doc
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,20 @@
1
+ # A full-text extractor that keeps everything except short blocks: it runs
2
+ # SimpleBlockFusionProcessor, marks every block as content, and then applies
3
+ # MinWordsFilter with the caller-supplied minimum word count.
4
+
5
+ module Boilerpipe::Extractors
6
+ class KeepEverythingWithKMinWordsExtractor
7
+ def self.text(min, contents)
8
+ doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
9
+ ::Boilerpipe::Extractors::KeepEverythingWithKMinWordsExtractor.process min, doc
10
+ doc.content
11
+ end
12
+
13
+ def self.process(min, doc)
14
+ ::Boilerpipe::Filters::SimpleBlockFusionProcessor.process doc
15
+ ::Boilerpipe::Filters::MarkEverythingContentFilter.process doc
16
+ ::Boilerpipe::Filters::MinWordsFilter.process min, doc
17
+ doc
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,18 @@
1
+ module Boilerpipe::Extractors
2
+ class LargestContentExtractor
3
+ def self.text(contents)
4
+ doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
5
+ ::Boilerpipe::Extractors::LargestContentExtractor.process doc
6
+ doc.content
7
+ end
8
+
9
+ def self.process(doc)
10
+ filters = ::Boilerpipe::Filters
11
+ filters::NumWordsRulesClassifier.process doc
12
+ filters::BlockProximityFusion::MAX_DISTANCE_1.process doc
13
+ filters::KeepLargestBlockFilter::INSTANCE.process doc
14
+
15
+ doc
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,14 @@
1
+ module Boilerpipe::Extractors
2
+ class NumWordsRulesExtractor
3
+ def self.text(contents)
4
+ doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
5
+ ::Boilerpipe::Extractors::NumWordsRulesExtractor.process doc
6
+ doc.content
7
+ end
8
+
9
+ def self.process(doc)
10
+ ::Boilerpipe::Filters::NumWordsRulesClassifier.process doc
11
+ doc
12
+ end
13
+ end
14
+ end
@@ -1,11 +1,8 @@
1
-
2
- # Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
3
- # probably makes sense only in cases where an upstream filter already has removed some blocks.
1
+ # Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
2
+ # probably makes sense only in cases where an upstream filter already has removed some blocks.
4
3
 
5
4
  module Boilerpipe::Filters
6
5
  class BlockProximityFusion
7
-
8
-
9
6
  def initialize(max_blocks_distance, content_only, same_tag_level_only)
10
7
  @max_blocks_distance = max_blocks_distance
11
8
  @content_only = content_only
@@ -13,8 +10,8 @@ module Boilerpipe::Filters
13
10
  end
14
11
 
15
12
  MAX_DISTANCE_1 = BlockProximityFusion.new(1, false, false)
16
- MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new( 1, false, true)
17
- MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new( 1, true, false)
13
+ MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(1, false, true)
14
+ MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(1, true, false)
18
15
  MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion.new(1, true, true)
19
16
 
20
17
  def process(doc)
@@ -22,7 +19,7 @@ module Boilerpipe::Filters
22
19
  return false if text_blocks.size < 2
23
20
 
24
21
  prev_block = if @content_only
25
- text_blocks.find{ |tb| tb.is_content? }
22
+ text_blocks.find { |tb| tb.is_content? }
26
23
  else
27
24
  text_blocks.first
28
25
  end
@@ -46,18 +43,16 @@ module Boilerpipe::Filters
46
43
  ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only
47
44
  ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only
48
45
 
49
- if ok
46
+ if ok
50
47
  prev_block.merge_next(tb)
51
48
  blocks_to_remove << tb
52
49
  else
53
50
  prev_block = tb
54
51
  end
55
52
  end
56
-
57
53
  end
58
- doc.replace_text_blocks!( text_blocks - blocks_to_remove )
54
+ doc.replace_text_blocks!(text_blocks - blocks_to_remove)
59
55
  doc
60
56
  end
61
-
62
57
  end
63
58
  end
@@ -1,9 +1,7 @@
1
-
2
- # Removes TextBlocks which have explicitly been marked as "not content".
1
+ # Removes TextBlocks which have explicitly been marked as "not content".
3
2
 
4
3
  module Boilerpipe::Filters
5
4
  class BoilerplateBlockFilter
6
-
7
5
  def initialize(label)
8
6
  @label_to_keep = label
9
7
  end
@@ -21,6 +19,5 @@ module Boilerpipe::Filters
21
19
  doc.replace_text_blocks!(combined)
22
20
  doc
23
21
  end
24
-
25
22
  end
26
23
  end
@@ -0,0 +1,27 @@
1
+ # A full-text extractor trained on http://krdwrd.org/
2
+ # https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
3
+ # Works well with SimpleEstimator, too.
4
+
5
+ module Boilerpipe::Filters
6
+ class CanolaClassifier
7
+ def self.process(doc)
8
+ return doc if doc.text_blocks.size < 1
9
+
10
+ empty = Boilerpipe::Document::TextBlock.empty_start
11
+ text_blocks = [empty] + doc.text_blocks + [empty]
12
+
13
+ text_blocks.each_cons(3) do |slice|
14
+ prev, current, nxt = *slice
15
+ current.content = classify(prev, current, nxt)
16
+ end
17
+
18
+ doc
19
+ end
20
+
21
+ def self.classify(prev, current, nxt)
22
+ current.link_density > 0 && nxt.num_words > 11 \
23
+ || current.num_words > 19 \
24
+ || nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && (current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19)
25
+ end
26
+ end
27
+ end
@@ -5,9 +5,8 @@
5
5
 
6
6
  module Boilerpipe::Filters
7
7
  class DensityRulesClassifier
8
-
9
8
  def self.process(doc)
10
- #return doc if doc.text_blocks.size < 2
9
+ # return doc if doc.text_blocks.size < 2
11
10
 
12
11
  empty = Boilerpipe::Document::TextBlock.empty_start
13
12
  text_blocks = [empty] + doc.text_blocks + [empty]
@@ -26,12 +25,14 @@ module Boilerpipe::Filters
26
25
  if prev.link_density <= 0.555556
27
26
  if current.text_density <= 9
28
27
  return true if nxt.text_density > 10
28
+
29
29
  return prev.text_density <= 4 ? false : true
30
30
  else
31
31
  return nxt.text_density == 0 ? false : true
32
32
  end
33
33
  else
34
34
  return false if nxt.text_density <= 11
35
+
35
36
  true
36
37
  end
37
38
  end
@@ -1,12 +1,9 @@
1
- # encoding: utf-8
2
- require 'set'
1
+ # Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
2
+ # some heuristics which are quite specific to the news domain.
3
3
 
4
- # Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
5
- # some heuristics which are quite specific to the news domain.
6
-
7
- # we create a list of potential titles from the page title
8
- # then we look at every text block and if the text block
9
- # contains a potential title - we set that text block label as :TITLE
4
+ # we create a list of potential titles from the page title
5
+ # then we look at every text block and if the text block
6
+ # contains a potential title - we set that text block label as :TITLE
10
7
 
11
8
  module Boilerpipe::Filters
12
9
  class DocumentTitleMatchClassifier
@@ -55,24 +52,24 @@ module Boilerpipe::Filters
55
52
  @potential_titles << title
56
53
 
57
54
  # unnecessary
58
- #p = longest_part(title, /[ ]*[|»-][ ]*/)
59
- #@potential_titles << p if p
55
+ # p = longest_part(title, /[ ]*[|»-][ ]*/)
56
+ # @potential_titles << p if p
60
57
 
61
- #p = longest_part(title, /[ ]*[|»:][ ]*/)
62
- #@potential_titles << p if p
58
+ # p = longest_part(title, /[ ]*[|»:][ ]*/)
59
+ # @potential_titles << p if p
63
60
 
64
- #p = longest_part(title, /[ ]*[|»:()][ ]*/)
65
- #@potential_titles << p if p
61
+ # p = longest_part(title, /[ ]*[|»:()][ ]*/)
62
+ # @potential_titles << p if p
66
63
 
67
- #p = longest_part(title, /[ ]*[|»:()-][ ]*/)
68
- #@potential_titles << p if p
64
+ # p = longest_part(title, /[ ]*[|»:()-][ ]*/)
65
+ # @potential_titles << p if p
69
66
 
70
67
  p = longest_part(title, /[ ]*[|»,:()-][ ]*/)
71
68
  @potential_titles << p if p
72
69
 
73
70
  # we replace \u00a0 so why check for it?
74
- #p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
75
- #@potential_titles << p if p
71
+ # p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
72
+ # @potential_titles << p if p
76
73
 
77
74
  add_potential_titles(title, /[ ]+[|][ ]+/, 4)
78
75
  add_potential_titles(title, /[ ]+[-][ ]+/, 4)
@@ -90,6 +87,7 @@ module Boilerpipe::Filters
90
87
 
91
88
  parts.each do |part|
92
89
  next if part =~ /[.]com/
90
+
93
91
  num_words = number_of_words(part)
94
92
 
95
93
  if num_words > longest_num_words || part.size > longest_part.size
@@ -107,6 +105,7 @@ module Boilerpipe::Filters
107
105
 
108
106
  parts.each do |part|
109
107
  next if part =~ /[.]com/
108
+
110
109
  num_words = number_of_words(part)
111
110
 
112
111
  @potential_titles << part if num_words >= min_words
@@ -116,6 +115,5 @@ module Boilerpipe::Filters
116
115
  def number_of_words(s)
117
116
  s.split(/[\b ]+/).size
118
117
  end
119
-
120
118
  end
121
119
  end
@@ -1,10 +1,8 @@
1
-
2
1
  # Marks all TextBlocks "content" which are between the headline and the part that has
3
2
  # already been marked content, if they are marked MIGHT_BE_CONTENT.
4
3
  # This filter is quite specific to the news domain.
5
4
  # used downstream of KeepLargestBlockFilter since that's what sets MIGHT_BE_CONTENT
6
5
 
7
-
8
6
  module Boilerpipe::Filters
9
7
  class ExpandTitleToContentFilter
10
8
  def self.process(doc)
@@ -38,6 +36,5 @@ module Boilerpipe::Filters
38
36
  def self.no_title_with_subsequent_content?(content_start, title)
39
37
  title.nil? || content_start.nil? || content_start <= title
40
38
  end
41
-
42
39
  end
43
40
  end
@@ -1,6 +1,6 @@
1
1
  module Boilerpipe::Filters
2
2
  class HeuristicFilterBase
3
- def self.num_full_text_words(tb, min_text_density=9.0)
3
+ def self.num_full_text_words(tb, min_text_density = 9.0)
4
4
  tb.text_density >= min_text_density ? tb.num_words : 0
5
5
  end
6
6
  end
@@ -1,12 +1,11 @@
1
- # Marks all blocks as "non-content" that occur after blocks that have been
2
- # marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
3
- # number of words in content blocks occur before this mark (default: 60).
4
- # This can be used in conjunction with an upstream TerminatingBlocksFinder.
1
+ # Marks all blocks as "non-content" that occur after blocks that have been
2
+ # marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
3
+ # number of words in content blocks occur before this mark (default: 60).
4
+ # This can be used in conjunction with an upstream TerminatingBlocksFinder.
5
5
 
6
6
  module Boilerpipe::Filters
7
7
  class IgnoreBlocksAfterContentFilter < HeuristicFilterBase
8
-
9
- def self.process(doc, min_num_words=60)
8
+ def self.process(doc, min_num_words = 60)
10
9
  found_end_of_text = false
11
10
  num_words = 0
12
11
 
@@ -19,6 +18,5 @@ module Boilerpipe::Filters
19
18
 
20
19
  doc
21
20
  end
22
-
23
21
  end
24
22
  end
@@ -1,4 +1,3 @@
1
-
2
1
  # Keeps the largest TextBlock only (by the number of words). In case of
3
2
  # more than one block with the same number of words, the first block is chosen.
4
3
  # All discarded blocks are marked "not content" and flagged as :MIGHT_BE_CONTENT.
@@ -8,7 +7,6 @@
8
7
 
9
8
  module Boilerpipe::Filters
10
9
  class KeepLargestBlockFilter
11
-
12
10
  def initialize(expand_to_same_level_text, min_words)
13
11
  @expand_to_same_level_text = expand_to_same_level_text
14
12
  @min_words = min_words
@@ -43,7 +41,7 @@ module Boilerpipe::Filters
43
41
  expand_tag_level(tbs[0...n].reverse, level, @min_words)
44
42
 
45
43
  # expand blocks to the right
46
- expand_tag_level(tbs[n+1..-1], level, @min_words)
44
+ expand_tag_level(tbs[n + 1..-1], level, @min_words)
47
45
  end
48
46
  end
49
47
 
@@ -57,6 +55,5 @@ module Boilerpipe::Filters
57
55
  end
58
56
  end
59
57
  end
60
-
61
58
  end
62
59
  end
@@ -1,4 +1,3 @@
1
-
2
1
  # Marks all blocks as content that:
3
2
  # are on the same tag-level as very likely main content
4
3
  # (usually the level of the largest block)
@@ -7,23 +6,22 @@
7
6
 
8
7
  module Boilerpipe::Filters
9
8
  class LargeBlockSameTagLevelToContentFilter
10
-
11
9
  def self.process(doc)
12
-
13
10
  largest = doc.text_blocks.find do |tb|
14
11
  tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
15
12
  end
16
13
 
17
14
  return doc if largest.nil?
15
+
18
16
  tag_level = largest.tag_level
19
17
 
20
18
  doc.text_blocks.each do |tb|
21
19
  next if tb.is_content?
20
+
22
21
  tb.content = true if tb.num_words >= 100 && tb.tag_level == tag_level
23
22
  end
24
23
 
25
24
  doc
26
25
  end
27
-
28
26
  end
29
27
  end
@@ -11,7 +11,7 @@ module Boilerpipe::Filters
11
11
  doc.text_blocks.each do |tb|
12
12
  if tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
13
13
  tag_level = tb.tag_level
14
- elsif (tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0)
14
+ elsif tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0
15
15
  tb.content = true
16
16
  else
17
17
  tag_level = MAX
@@ -20,6 +20,5 @@ module Boilerpipe::Filters
20
20
 
21
21
  doc
22
22
  end
23
-
24
23
  end
25
24
  end
@@ -0,0 +1,12 @@
1
+ # Marks all blocks as content.
2
+
3
+ module Boilerpipe::Filters
4
+ class MarkEverythingContentFilter
5
+ def self.process(doc)
6
+ doc.text_blocks.each do |tb|
7
+ tb.content = true if tb.is_not_content?
8
+ end
9
+ doc
10
+ end
11
+ end
12
+ end
@@ -0,0 +1,34 @@
1
+ #
2
+ # Keeps only blocks that have at least one segment fragment ("clause") with at least k
3
+ # words (default: 5).
4
+ #
5
+ # NOTE: You might consider using the SplitParagraphBlocksFilter upstream.
6
+ #
7
+ # See also: SplitParagraphBlocksFilter
8
+
9
+ module Boilerpipe::Filters
10
+ class MinClauseWordsFilter
11
+ def self.process(doc, min_words = 5)
12
+ doc.text_blocks.each do |tb|
13
+ next if tb.is_not_content?
14
+
15
+ clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/
16
+ hasClause = false
17
+ tb.text.scan(clause_delimiter).each do |possible_clause|
18
+ hasClause |= is_clause? possible_clause
19
+ end
20
+
21
+ tb.content = false unless hasClause
22
+ end
23
+
24
+ doc
25
+ end
26
+
27
+ def self.is_clause?(text, min_words = 5)
28
+ return false if text.nil?
29
+
30
+ whitespace = /[ \n\r]+/
31
+ text.scan(whitespace).size >= min_words
32
+ end
33
+ end
34
+ end