RubyGems - boilerpipe-ruby - Versions diffs - 0.4.0 → 0.5.0 - Mend

boilerpipe-ruby 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49)

checksums.yaml +5 -5
data/.circleci/config.yml +6 -24
data/.dockerignore +7 -0
data/CHANGELOG.md +28 -1
data/Dockerfile +14 -0
data/README.md +13 -4
data/Rakefile +3 -4
data/bin/console +3 -3
data/boilerpipe-ruby.gemspec +9 -9
data/boilerpipe_flow.md +40 -0
data/lib/boilerpipe.rb +4 -0
data/lib/boilerpipe/document/text_block.rb +10 -12
data/lib/boilerpipe/document/text_document.rb +4 -5
data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +0 -1
data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +10 -23
data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
data/lib/boilerpipe/filters/min_clause_words_filter.rb +8 -11
data/lib/boilerpipe/filters/min_words_filter.rb +1 -3
data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +0 -3
data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
data/lib/boilerpipe/labels/label_action.rb +1 -1
data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -11
data/lib/boilerpipe/sax/html_content_handler.rb +25 -22
data/lib/boilerpipe/sax/preprocessor.rb +11 -0
data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/body.rb +4 -4
data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
data/lib/boilerpipe/version.rb +1 -1
metadata +28 -25

data/lib/boilerpipe/filters/boilerplate_block_filter.rb CHANGED Viewed

@@ -1,9 +1,7 @@
- # Removes TextBlocks which have explicitly been marked as "not content".
+# Removes TextBlocks which have explicitly been marked as "not content".
 module Boilerpipe::Filters
   class BoilerplateBlockFilter
     def initialize(label)
       @label_to_keep = label
     end
@@ -21,6 +19,5 @@ module Boilerpipe::Filters
       doc.replace_text_blocks!(combined)
       doc
     end
   end
 end

data/lib/boilerpipe/filters/canola_classifier.rb CHANGED Viewed

@@ -1,10 +1,9 @@
- # A full-text extractor trained on http://krdwrd.org/
- # https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
- # Works well with SimpleEstimator, too.
+# A full-text extractor trained on http://krdwrd.org/
+# https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
+# Works well with SimpleEstimator, too.
 module Boilerpipe::Filters
   class CanolaClassifier
     def self.process(doc)
       return doc if doc.text_blocks.size < 1
@@ -22,7 +21,7 @@ module Boilerpipe::Filters
     def self.classify(prev, current, nxt)
       current.link_density > 0 && nxt.num_words > 11 \
         || current.num_words > 19 \
-        || nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && ( current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19 )
+        || nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && (current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19)
     end
   end
 end

data/lib/boilerpipe/filters/density_rules_classifier.rb CHANGED Viewed

@@ -5,9 +5,8 @@
 module Boilerpipe::Filters
   class DensityRulesClassifier
     def self.process(doc)
-      #return doc if doc.text_blocks.size < 2
+      # return doc if doc.text_blocks.size < 2
       empty = Boilerpipe::Document::TextBlock.empty_start
       text_blocks = [empty] + doc.text_blocks + [empty]
@@ -26,12 +25,14 @@ module Boilerpipe::Filters
       if prev.link_density <= 0.555556
         if current.text_density <= 9
           return true if nxt.text_density > 10
           return prev.text_density <= 4 ? false : true
         else
           return nxt.text_density == 0 ? false : true
         end
       else
         return false if nxt.text_density <= 11
         true
       end
     end

data/lib/boilerpipe/filters/document_title_match_classifier.rb CHANGED Viewed

@@ -1,12 +1,9 @@
-# encoding: utf-8
-require 'set'
+# Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
+# some heuristics which are quite specific to the news domain.
- # Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
- # some heuristics which are quite specific to the news domain.
-    # we create a list of potential titles from the page title
-    # then we look at every text block and if the text block
-    # contains a potential title - we set that text block label as :TITLE
+# we create a list of potential titles from the page title
+# then we look at every text block and if the text block
+# contains a potential title - we set that text block label as :TITLE
 module Boilerpipe::Filters
   class DocumentTitleMatchClassifier
@@ -55,24 +52,24 @@ module Boilerpipe::Filters
       @potential_titles << title
       # unnecessary
-      #p = longest_part(title, /[ ]*[|»-][ ]*/)
-      #@potential_titles << p if p
+      # p = longest_part(title, /[ ]*[|»-][ ]*/)
+      # @potential_titles << p if p
-      #p = longest_part(title, /[ ]*[|»:][ ]*/)
-      #@potential_titles << p if p
+      # p = longest_part(title, /[ ]*[|»:][ ]*/)
+      # @potential_titles << p if p
-      #p = longest_part(title, /[ ]*[|»:()][ ]*/)
-      #@potential_titles << p if p
+      # p = longest_part(title, /[ ]*[|»:()][ ]*/)
+      # @potential_titles << p if p
-      #p = longest_part(title, /[ ]*[|»:()-][ ]*/)
-      #@potential_titles << p if p
+      # p = longest_part(title, /[ ]*[|»:()-][ ]*/)
+      # @potential_titles << p if p
       p = longest_part(title, /[ ]*[|»,:()-][ ]*/)
       @potential_titles << p if p
       # we replace \u00a0 so why check for it?
-      #p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
-      #@potential_titles << p if p
+      # p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
+      # @potential_titles << p if p
       add_potential_titles(title, /[ ]+[|][ ]+/, 4)
       add_potential_titles(title, /[ ]+[-][ ]+/, 4)
@@ -90,6 +87,7 @@ module Boilerpipe::Filters
       parts.each do |part|
         next if part =~ /[.]com/
         num_words = number_of_words(part)
         if num_words > longest_num_words || part.size > longest_part.size
@@ -107,6 +105,7 @@ module Boilerpipe::Filters
       parts.each do |part|
         next if part =~ /[.]com/
         num_words = number_of_words(part)
         @potential_titles << part if num_words >= min_words
@@ -116,6 +115,5 @@ module Boilerpipe::Filters
     def number_of_words(s)
       s.split(/[\b ]+/).size
     end
   end
 end

data/lib/boilerpipe/filters/expand_title_to_content_filter.rb CHANGED Viewed

@@ -1,43 +1,30 @@
 # Marks all TextBlocks "content" which are between the headline and the part that has
 # already been marked content, if they are marked MIGHT_BE_CONTENT.
 # This filter is quite specific to the news domain.
 # used downstream of KeepLargestBlockFilter since that's what sets MIGHT_BE_CONTENT
 module Boilerpipe::Filters
   class ExpandTitleToContentFilter
     def self.process(doc)
       tbs = doc.text_blocks
-      #     slower and more ruby-like
-      #     comeback and let's do some benchmarking
-      #     titles = tbs.select{ |tb| tb.has_label?(:TITLE) }
-      #     title = tbs.index(titles.last)
-      #     content_start = tbs.find_index(&:is_content?)
+      title = tbs.select{ |tb| tb.has_label?(:TITLE) }.last
+      title_idx = tbs.index(title)
-      i = 0
-      title = nil
-      content_start = nil
+      content_start = tbs.find_index(&:is_content?)
-      tbs.each do |tb|
-        title = i if content_start.nil? && tb.has_label?(:TITLE)
-        content_start = i if content_start.nil? && tb.is_content?
-        i += 1
-      end
+      return doc if no_title_with_subsequent_content?(content_start, title_idx)
-      return doc if no_title_with_subsequent_content?(content_start, title)
-      tbs.slice(title...content_start).each do |tb|
-        tb.content = true if tb.has_label?(:MIGHT_BE_CONTENT)
-      end
+      tbs.slice(title_idx...content_start)
+        .select{ |tb| tb.has_label?(:MIGHT_BE_CONTENT) }
+        .each{ |tb| tb.content = true }
       doc
     end
-    def self.no_title_with_subsequent_content?(content_start, title)
-      title.nil? || content_start.nil? || content_start <= title
+    def self.no_title_with_subsequent_content?(content_start, title_idx)
+      # title has to start before content
+      title_idx.nil? || content_start.nil? || title_idx >= content_start
     end
   end
 end

data/lib/boilerpipe/filters/heuristic_filter_base.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Boilerpipe::Filters
   class HeuristicFilterBase
-    def self.num_full_text_words(tb, min_text_density=9.0)
+    def self.num_full_text_words(tb, min_text_density = 9.0)
       tb.text_density >= min_text_density ? tb.num_words : 0
     end
   end

data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb CHANGED Viewed

@@ -1,12 +1,11 @@
- # Marks all blocks as "non-content" that occur after blocks that have been
- # marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
- # number of words in content blocks occur before this mark (default: 60).
- # This can be used in conjunction with an upstream TerminatingBlocksFinder.
+# Marks all blocks as "non-content" that occur after blocks that have been
+# marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
+# number of words in content blocks occur before this mark (default: 60).
+# This can be used in conjunction with an upstream TerminatingBlocksFinder.
 module Boilerpipe::Filters
   class IgnoreBlocksAfterContentFilter < HeuristicFilterBase
-    def self.process(doc, min_num_words=60)
+    def self.process(doc, min_num_words = 60)
       found_end_of_text = false
       num_words = 0
@@ -19,6 +18,5 @@ module Boilerpipe::Filters
       doc
     end
   end
 end

data/lib/boilerpipe/filters/keep_largest_block_filter.rb CHANGED Viewed

@@ -1,4 +1,3 @@
 # Keeps the largest TextBlock only (by the number of words). In case of
 # more than one block with the same number of words, the first block is chosen.
 # All discarded blocks are marked "not content" and flagged as :MIGHT_BE_CONTENT.
@@ -8,7 +7,6 @@
 module Boilerpipe::Filters
   class KeepLargestBlockFilter
     def initialize(expand_to_same_level_text, min_words)
       @expand_to_same_level_text = expand_to_same_level_text
       @min_words = min_words
@@ -43,7 +41,7 @@ module Boilerpipe::Filters
         expand_tag_level(tbs[0...n].reverse, level, @min_words)
         # expand blocks to the right
-        expand_tag_level(tbs[n+1..-1], level, @min_words)
+        expand_tag_level(tbs[n + 1..-1], level, @min_words)
       end
     end
@@ -57,6 +55,5 @@ module Boilerpipe::Filters
         end
       end
     end
   end
 end

data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb CHANGED Viewed

@@ -1,4 +1,3 @@
 #  Marks all blocks as content that:
 #  are on the same tag-level as very likely main content
 #  (usually the level of the largest  block)
@@ -7,23 +6,22 @@
 module Boilerpipe::Filters
   class LargeBlockSameTagLevelToContentFilter
     def self.process(doc)
       largest = doc.text_blocks.find do |tb|
         tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
       end
       return doc if largest.nil?
       tag_level = largest.tag_level
       doc.text_blocks.each do |tb|
         next if tb.is_content?
         tb.content = true if tb.num_words >= 100 && tb.tag_level == tag_level
       end
       doc
     end
   end
 end

data/lib/boilerpipe/filters/list_at_end_filter.rb CHANGED Viewed

@@ -11,7 +11,7 @@ module Boilerpipe::Filters
       doc.text_blocks.each do |tb|
         if tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
           tag_level = tb.tag_level
-        elsif (tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0)
+        elsif tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0
           tb.content = true
         else
           tag_level = MAX
@@ -20,6 +20,5 @@ module Boilerpipe::Filters
       doc
     end
   end
 end

data/lib/boilerpipe/filters/mark_everything_content_filter.rb CHANGED Viewed

@@ -1,14 +1,12 @@
- # Marks all blocks as content.
+# Marks all blocks as content.
 module Boilerpipe::Filters
   class MarkEverythingContentFilter
     def self.process(doc)
       doc.text_blocks.each do |tb|
         tb.content = true if tb.is_not_content?
       end
       doc
     end
   end
 end

data/lib/boilerpipe/filters/min_clause_words_filter.rb CHANGED Viewed

@@ -8,30 +8,27 @@
 module Boilerpipe::Filters
   class MinClauseWordsFilter
-    def self.process(doc, min_words=5)
+    def self.process(doc, min_words = 5)
       doc.text_blocks.each do |tb|
         next if tb.is_not_content?
         clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/
+        hasClause = false
         tb.text.scan(clause_delimiter).each do |possible_clause|
-          if is_clause? possible_clause
-            break
-          else
-            tb.content = false
-          end
+          hasClause |= is_clause? possible_clause
         end
+        tb.content = false unless hasClause
       end
       doc
     end
-    def self.is_clause?(text, min_words=5)
-     return false if text.nil?
+    def self.is_clause?(text, min_words = 5)
+      return false if text.nil?
       whitespace = /[ \n\r]+/
       text.scan(whitespace).size >= min_words
     end
   end
 end

data/lib/boilerpipe/filters/min_words_filter.rb CHANGED Viewed

@@ -1,16 +1,14 @@
 # Keeps only those content blocks which contain at least k words.
 module Boilerpipe::Filters
   class MinWordsFilter
     def self.process(min_words, doc)
       doc.text_blocks.each do |tb|
         next if tb.is_not_content?
         tb.content = false if tb.num_words < min_words
       end
       doc
     end
   end
 end

data/lib/boilerpipe/filters/num_words_rules_classifier.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# encoding: utf-8
 #  Classifies TextBlocks as content/not-content through rules that have been determined
 #  using the C4.8 machine learning algorithm, as described in the paper
 #  "Boilerplate Detection using Shallow Text Features" (WSDM 2010), particularly
@@ -7,7 +5,6 @@
 module Boilerpipe::Filters
   class NumWordsRulesClassifier
     def self.process(doc)
       empty = Boilerpipe::Document::TextBlock.empty_start
       text_blocks = [empty] + doc.text_blocks + [empty]
@@ -37,6 +34,5 @@ module Boilerpipe::Filters
       false
     end
   end
 end

data/lib/boilerpipe/filters/simple_block_fusion_processor.rb CHANGED Viewed

@@ -1,4 +1,4 @@
- # Merges two subsequent blocks if their text densities are equal.
+# Merges two subsequent blocks if their text densities are equal.
 module Boilerpipe::Filters
   class SimpleBlockFusionProcessor
@@ -17,7 +17,7 @@ module Boilerpipe::Filters
         end
       end
-      doc.replace_text_blocks!( tbs - blocks_to_remove )
+      doc.replace_text_blocks!(tbs - blocks_to_remove)
       doc
     end
   end

data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb CHANGED Viewed

@@ -1,4 +1,3 @@
 # Splits TextBlocks at paragraph boundaries.
 #
 # NOTE: This is not fully supported (i.e., it will break highlighting support via
@@ -8,7 +7,6 @@
 module Boilerpipe::Filters
   class SplitParagraphBlocksFilter
     def self.process(doc)
       tbs = doc.text_blocks
       new_blocks = []
@@ -35,6 +33,5 @@ module Boilerpipe::Filters
       doc.replace_text_blocks!(new_blocks) if changes
       doc
     end
   end
 end

data/lib/boilerpipe/filters/terminating_blocks_finder.rb CHANGED Viewed

@@ -1,15 +1,13 @@
-# encoding: utf-8
 # Finds blocks which are potentially indicating the end of an article
 # text and marks them with INDICATES_END_OF_TEXT. This can be used
 # in conjunction with a downstream IgnoreBlocksAfterContentFilter.
 module Boilerpipe::Filters
   class TerminatingBlocksFinder
     def self.process(doc)
       doc.text_blocks.each do |tb|
         next unless tb.num_words < 15
         if tb.text.length >= 8 && finds_match?(tb.text.downcase)
           tb.labels << :INDICATES_END_OF_TEXT
         elsif tb.link_density == 1.0 && tb.text == 'comment'
@@ -29,11 +27,11 @@ module Boilerpipe::Filters
         text.include?('what you think...') ||
         text.include?('add your comment') ||
         text.include?('add comment') ||
-        #TODO add this and test
-        #text.include?('leave a reply') ||
-        #text.include?('leave a comment') ||
-        #text.include?('show comments') ||
-        #text.include?('Share this:') ||
+        # TODO add this and test
+        # text.include?('leave a reply') ||
+        # text.include?('leave a comment') ||
+        # text.include?('show comments') ||
+        # text.include?('Share this:') ||
         text.include?('reader views') ||
         text.include?('have your say') ||
         text.include?('reader comments') ||