RubyGems - boilerpipe-ruby - Versions diffs - 0.2.0 → 0.4.3 - Mend

boilerpipe-ruby 0.2.0 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

checksums.yaml +5 -5
data/.circleci/config.yml +6 -24
data/.dockerignore +7 -0
data/CHANGELOG.md +34 -1
data/Dockerfile +14 -0
data/README.md +32 -7
data/Rakefile +3 -4
data/bin/console +3 -3
data/boilerpipe-ruby.gemspec +9 -9
data/boilerpipe_flow.md +40 -0
data/lib/boilerpipe.rb +14 -0
data/lib/boilerpipe/document/text_block.rb +10 -12
data/lib/boilerpipe/document/text_document.rb +4 -5
data/lib/boilerpipe/extractors/article_sentence_extractor.rb +17 -0
data/lib/boilerpipe/extractors/canola_extractor.rb +15 -0
data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
data/lib/boilerpipe/extractors/keep_everything_extractor.rb +16 -0
data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +20 -0
data/lib/boilerpipe/extractors/largest_content_extractor.rb +18 -0
data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +14 -0
data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
data/lib/boilerpipe/filters/canola_classifier.rb +27 -0
data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +0 -3
data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
data/lib/boilerpipe/filters/mark_everything_content_filter.rb +12 -0
data/lib/boilerpipe/filters/min_clause_words_filter.rb +34 -0
data/lib/boilerpipe/filters/min_words_filter.rb +14 -0
data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +37 -0
data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
data/lib/boilerpipe/labels/label_action.rb +1 -1
data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +3 -7
data/lib/boilerpipe/sax/html_content_handler.rb +13 -16
data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/body.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
data/lib/boilerpipe/version.rb +1 -1
metadata +38 -25

data/lib/boilerpipe/extractors/canola_extractor.rb ADDED

@@ -0,0 +1,15 @@
+module Boilerpipe::Extractors
+  class CanolaExtractor
+    def self.text(contents)
+      doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
+      ::Boilerpipe::Extractors::CanolaExtractor.process doc
+      doc.content
+    end
+    def self.process(doc)
+      ::Boilerpipe::Filters::CanolaClassifier.process doc
+      doc
+    end
+  end
+end

data/lib/boilerpipe/extractors/default_extractor.rb CHANGED

@@ -1,6 +1,5 @@
 module Boilerpipe::Extractors
   class DefaultExtractor
     def self.text(contents)
       doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
       ::Boilerpipe::Extractors::DefaultExtractor.process doc

data/lib/boilerpipe/extractors/keep_everything_extractor.rb ADDED

@@ -0,0 +1,16 @@
+# Marks all blocks as content.
+module Boilerpipe::Extractors
+  class KeepEverythingExtractor
+    def self.text(contents)
+      doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
+      ::Boilerpipe::Extractors::KeepEverythingExtractor.process doc
+      doc.content
+    end
+    def self.process(doc)
+      ::Boilerpipe::Filters::MarkEverythingContentFilter.process doc
+      doc
+    end
+  end
+end

data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb ADDED

@@ -0,0 +1,20 @@
+# An extractor that fuses blocks with SimpleBlockFusionProcessor, marks
+# every block as content, and then applies MinWordsFilter with the
+# caller-supplied minimum word count.
+module Boilerpipe::Extractors
+  class KeepEverythingWithKMinWordsExtractor
+    def self.text(min, contents)
+      doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
+      ::Boilerpipe::Extractors::KeepEverythingWithKMinWordsExtractor.process min, doc
+      doc.content
+    end
+    def self.process(min, doc)
+      ::Boilerpipe::Filters::SimpleBlockFusionProcessor.process doc
+      ::Boilerpipe::Filters::MarkEverythingContentFilter.process doc
+      ::Boilerpipe::Filters::MinWordsFilter.process min, doc
+      doc
+    end
+  end
+end

data/lib/boilerpipe/extractors/largest_content_extractor.rb ADDED

@@ -0,0 +1,18 @@
+module Boilerpipe::Extractors
+  class LargestContentExtractor
+    def self.text(contents)
+      doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
+      ::Boilerpipe::Extractors::LargestContentExtractor.process doc
+      doc.content
+    end
+    def self.process(doc)
+      filters = ::Boilerpipe::Filters
+      filters::NumWordsRulesClassifier.process doc
+      filters::BlockProximityFusion::MAX_DISTANCE_1.process doc
+      filters::KeepLargestBlockFilter::INSTANCE.process doc
+      doc
+    end
+  end
+end

data/lib/boilerpipe/extractors/num_words_rules_extractor.rb ADDED

@@ -0,0 +1,14 @@
+module Boilerpipe::Extractors
+  class NumWordsRulesExtractor
+    def self.text(contents)
+      doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
+      ::Boilerpipe::Extractors::NumWordsRulesExtractor.process doc
+      doc.content
+    end
+    def self.process(doc)
+      ::Boilerpipe::Filters::NumWordsRulesClassifier.process doc
+      doc
+    end
+  end
+end

data/lib/boilerpipe/filters/block_proximity_fusion.rb CHANGED

@@ -1,11 +1,8 @@
-    # Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
-    # probably makes sense only in cases where an upstream filter already has removed some blocks.
+# Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
+# probably makes sense only in cases where an upstream filter already has removed some blocks.
 module Boilerpipe::Filters
   class BlockProximityFusion
     def initialize(max_blocks_distance, content_only, same_tag_level_only)
       @max_blocks_distance = max_blocks_distance
       @content_only = content_only
@@ -13,8 +10,8 @@ module Boilerpipe::Filters
     end
     MAX_DISTANCE_1 = BlockProximityFusion.new(1, false, false)
-    MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new( 1, false, true)
-    MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new( 1, true, false)
+    MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(1, false, true)
+    MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(1, true, false)
     MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion.new(1, true, true)
     def process(doc)
@@ -22,7 +19,7 @@ module Boilerpipe::Filters
       return false if text_blocks.size < 2
       prev_block = if @content_only
-                     text_blocks.find{ |tb| tb.is_content? }
+                     text_blocks.find { |tb| tb.is_content? }
                    else
                      text_blocks.first
                    end
@@ -46,18 +43,16 @@ module Boilerpipe::Filters
           ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only
           ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only
-          if  ok
+          if ok
             prev_block.merge_next(tb)
             blocks_to_remove << tb
           else
             prev_block = tb
           end
         end
       end
-      doc.replace_text_blocks!( text_blocks - blocks_to_remove )
+      doc.replace_text_blocks!(text_blocks - blocks_to_remove)
       doc
     end
   end
 end

data/lib/boilerpipe/filters/boilerplate_block_filter.rb CHANGED

@@ -1,9 +1,7 @@
- # Removes TextBlocks which have explicitly been marked as "not content".
+# Removes TextBlocks which have explicitly been marked as "not content".
 module Boilerpipe::Filters
   class BoilerplateBlockFilter
     def initialize(label)
       @label_to_keep = label
     end
@@ -21,6 +19,5 @@ module Boilerpipe::Filters
       doc.replace_text_blocks!(combined)
       doc
     end
   end
 end

data/lib/boilerpipe/filters/canola_classifier.rb ADDED

@@ -0,0 +1,27 @@
+# A full-text extractor trained on http://krdwrd.org/
+# https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
+# Works well with SimpleEstimator, too.
+module Boilerpipe::Filters
+  class CanolaClassifier
+    def self.process(doc)
+      return doc if doc.text_blocks.size < 1
+      empty = Boilerpipe::Document::TextBlock.empty_start
+      text_blocks = [empty] + doc.text_blocks + [empty]
+      text_blocks.each_cons(3) do |slice|
+        prev, current, nxt = *slice
+        current.content = classify(prev, current, nxt)
+      end
+      doc
+    end
+    def self.classify(prev, current, nxt)
+      current.link_density > 0 && nxt.num_words > 11 \
+        || current.num_words > 19 \
+        || nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && (current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19)
+    end
+  end
+end

data/lib/boilerpipe/filters/density_rules_classifier.rb CHANGED

@@ -5,9 +5,8 @@
 module Boilerpipe::Filters
   class DensityRulesClassifier
     def self.process(doc)
-      #return doc if doc.text_blocks.size < 2
+      # return doc if doc.text_blocks.size < 2
       empty = Boilerpipe::Document::TextBlock.empty_start
       text_blocks = [empty] + doc.text_blocks + [empty]
@@ -26,12 +25,14 @@ module Boilerpipe::Filters
       if prev.link_density <= 0.555556
         if current.text_density <= 9
           return true if nxt.text_density > 10
           return prev.text_density <= 4 ? false : true
         else
           return nxt.text_density == 0 ? false : true
         end
       else
         return false if nxt.text_density <= 11
         true
       end
     end

data/lib/boilerpipe/filters/document_title_match_classifier.rb CHANGED

@@ -1,12 +1,9 @@
-# encoding: utf-8
-require 'set'
+# Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
+# some heuristics which are quite specific to the news domain.
- # Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
- # some heuristics which are quite specific to the news domain.
-    # we create a list of potential titles from the page title
-    # then we look at every text block and if the text block
-    # contains a potential title - we set that text block label as :TITLE
+# we create a list of potential titles from the page title
+# then we look at every text block and if the text block
+# contains a potential title - we set that text block label as :TITLE
 module Boilerpipe::Filters
   class DocumentTitleMatchClassifier
@@ -55,24 +52,24 @@ module Boilerpipe::Filters
       @potential_titles << title
       # unnecessary
-      #p = longest_part(title, /[ ]*[|»-][ ]*/)
-      #@potential_titles << p if p
+      # p = longest_part(title, /[ ]*[|»-][ ]*/)
+      # @potential_titles << p if p
-      #p = longest_part(title, /[ ]*[|»:][ ]*/)
-      #@potential_titles << p if p
+      # p = longest_part(title, /[ ]*[|»:][ ]*/)
+      # @potential_titles << p if p
-      #p = longest_part(title, /[ ]*[|»:()][ ]*/)
-      #@potential_titles << p if p
+      # p = longest_part(title, /[ ]*[|»:()][ ]*/)
+      # @potential_titles << p if p
-      #p = longest_part(title, /[ ]*[|»:()-][ ]*/)
-      #@potential_titles << p if p
+      # p = longest_part(title, /[ ]*[|»:()-][ ]*/)
+      # @potential_titles << p if p
       p = longest_part(title, /[ ]*[|»,:()-][ ]*/)
       @potential_titles << p if p
       # we replace \u00a0 so why check for it?
-      #p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
-      #@potential_titles << p if p
+      # p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
+      # @potential_titles << p if p
       add_potential_titles(title, /[ ]+[|][ ]+/, 4)
       add_potential_titles(title, /[ ]+[-][ ]+/, 4)
@@ -90,6 +87,7 @@ module Boilerpipe::Filters
       parts.each do |part|
         next if part =~ /[.]com/
         num_words = number_of_words(part)
         if num_words > longest_num_words || part.size > longest_part.size
@@ -107,6 +105,7 @@ module Boilerpipe::Filters
       parts.each do |part|
         next if part =~ /[.]com/
         num_words = number_of_words(part)
         @potential_titles << part if num_words >= min_words
@@ -116,6 +115,5 @@ module Boilerpipe::Filters
     def number_of_words(s)
       s.split(/[\b ]+/).size
     end
   end
 end

data/lib/boilerpipe/filters/expand_title_to_content_filter.rb CHANGED

@@ -1,10 +1,8 @@
 # Marks all TextBlocks "content" which are between the headline and the part that has
 # already been marked content, if they are marked MIGHT_BE_CONTENT.
 # This filter is quite specific to the news domain.
 # used downstream of KeepLargestBlockFilter since that's what sets MIGHT_BE_CONTENT
 module Boilerpipe::Filters
   class ExpandTitleToContentFilter
     def self.process(doc)
@@ -38,6 +36,5 @@ module Boilerpipe::Filters
     def self.no_title_with_subsequent_content?(content_start, title)
       title.nil? || content_start.nil? || content_start <= title
     end
   end
 end

data/lib/boilerpipe/filters/heuristic_filter_base.rb CHANGED

@@ -1,6 +1,6 @@
 module Boilerpipe::Filters
   class HeuristicFilterBase
-    def self.num_full_text_words(tb, min_text_density=9.0)
+    def self.num_full_text_words(tb, min_text_density = 9.0)
       tb.text_density >= min_text_density ? tb.num_words : 0
     end
   end

data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb CHANGED

@@ -1,12 +1,11 @@
- # Marks all blocks as "non-content" that occur after blocks that have been
- # marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
- # number of words in content blocks occur before this mark (default: 60).
- # This can be used in conjunction with an upstream TerminatingBlocksFinder.
+# Marks all blocks as "non-content" that occur after blocks that have been
+# marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
+# number of words in content blocks occur before this mark (default: 60).
+# This can be used in conjunction with an upstream TerminatingBlocksFinder.
 module Boilerpipe::Filters
   class IgnoreBlocksAfterContentFilter < HeuristicFilterBase
-    def self.process(doc, min_num_words=60)
+    def self.process(doc, min_num_words = 60)
       found_end_of_text = false
       num_words = 0
@@ -19,6 +18,5 @@ module Boilerpipe::Filters
       doc
     end
   end
 end

data/lib/boilerpipe/filters/keep_largest_block_filter.rb CHANGED

@@ -1,4 +1,3 @@
 # Keeps the largest TextBlock only (by the number of words). In case of
 # more than one block with the same number of words, the first block is chosen.
 # All discarded blocks are marked "not content" and flagged as :MIGHT_BE_CONTENT.
@@ -8,7 +7,6 @@
 module Boilerpipe::Filters
   class KeepLargestBlockFilter
     def initialize(expand_to_same_level_text, min_words)
       @expand_to_same_level_text = expand_to_same_level_text
       @min_words = min_words
@@ -43,7 +41,7 @@ module Boilerpipe::Filters
         expand_tag_level(tbs[0...n].reverse, level, @min_words)
         # expand blocks to the right
-        expand_tag_level(tbs[n+1..-1], level, @min_words)
+        expand_tag_level(tbs[n + 1..-1], level, @min_words)
       end
     end
@@ -57,6 +55,5 @@ module Boilerpipe::Filters
         end
       end
     end
   end
 end

data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb CHANGED

@@ -1,4 +1,3 @@
 #  Marks all blocks as content that:
 #  are on the same tag-level as very likely main content
 #  (usually the level of the largest  block)
@@ -7,23 +6,22 @@
 module Boilerpipe::Filters
   # Promotes large non-content blocks that share the tag level of the first
   # content block labelled :VERY_LIKELY_CONTENT.
   class LargeBlockSameTagLevelToContentFilter
     # Returns doc. Nothing is changed when no content block carries the
     # :VERY_LIKELY_CONTENT label.
     def self.process(doc)
       anchor = doc.text_blocks.find { |tb| tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT) }
       return doc if anchor.nil?

       anchor_level = anchor.tag_level
       doc.text_blocks.each do |block|
         # Only blocks not yet content, with at least 100 words, on the anchor's level.
         promote = !block.is_content? && block.num_words >= 100 && block.tag_level == anchor_level
         block.content = true if promote
       end
       doc
     end
   end
 end

data/lib/boilerpipe/filters/list_at_end_filter.rb CHANGED

@@ -11,7 +11,7 @@ module Boilerpipe::Filters
       doc.text_blocks.each do |tb|
         if tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
           tag_level = tb.tag_level
-        elsif (tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0)
+        elsif tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0
           tb.content = true
         else
           tag_level = MAX
@@ -20,6 +20,5 @@ module Boilerpipe::Filters
       doc
     end
   end
 end

data/lib/boilerpipe/filters/mark_everything_content_filter.rb ADDED

@@ -0,0 +1,12 @@
+# Marks all blocks as content.
+module Boilerpipe::Filters
+  class MarkEverythingContentFilter
+    def self.process(doc)
+      doc.text_blocks.each do |tb|
+        tb.content = true if tb.is_not_content?
+      end
+      doc
+    end
+  end
+end

data/lib/boilerpipe/filters/min_clause_words_filter.rb ADDED

@@ -0,0 +1,34 @@
+#
+# Keeps only blocks that have at least one segment fragment ("clause") with at least k
+# words (default: 5).
+#
+# NOTE: You might consider using the SplitParagraphBlocksFilter upstream.
+#
+# See also: SplitParagraphBlocksFilter
+module Boilerpipe::Filters
+  class MinClauseWordsFilter
+    def self.process(doc, min_words = 5)
+      doc.text_blocks.each do |tb|
+        next if tb.is_not_content?
+        clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/
+        hasClause = false
+        tb.text.scan(clause_delimiter).each do |possible_clause|
+          hasClause |= is_clause? possible_clause
+        end
+        tb.content = false unless hasClause
+      end
+      doc
+    end
+    def self.is_clause?(text, min_words = 5)
+      return false if text.nil?
+      whitespace = /[ \n\r]+/
+      text.scan(whitespace).size >= min_words
+    end
+  end
+end