RubyGems - boilerpipe-ruby - Versions diffs - 0.2.0 → 0.4.3 - Mend

boilerpipe-ruby 0.2.0 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50)

checksums.yaml +5 -5
data/.circleci/config.yml +6 -24
data/.dockerignore +7 -0
data/CHANGELOG.md +34 -1
data/Dockerfile +14 -0
data/README.md +32 -7
data/Rakefile +3 -4
data/bin/console +3 -3
data/boilerpipe-ruby.gemspec +9 -9
data/boilerpipe_flow.md +40 -0
data/lib/boilerpipe.rb +14 -0
data/lib/boilerpipe/document/text_block.rb +10 -12
data/lib/boilerpipe/document/text_document.rb +4 -5
data/lib/boilerpipe/extractors/article_sentence_extractor.rb +17 -0
data/lib/boilerpipe/extractors/canola_extractor.rb +15 -0
data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
data/lib/boilerpipe/extractors/keep_everything_extractor.rb +16 -0
data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +20 -0
data/lib/boilerpipe/extractors/largest_content_extractor.rb +18 -0
data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +14 -0
data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
data/lib/boilerpipe/filters/canola_classifier.rb +27 -0
data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +0 -3
data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
data/lib/boilerpipe/filters/mark_everything_content_filter.rb +12 -0
data/lib/boilerpipe/filters/min_clause_words_filter.rb +34 -0
data/lib/boilerpipe/filters/min_words_filter.rb +14 -0
data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +37 -0
data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
data/lib/boilerpipe/labels/label_action.rb +1 -1
data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +3 -7
data/lib/boilerpipe/sax/html_content_handler.rb +13 -16
data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/body.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
data/lib/boilerpipe/version.rb +1 -1
metadata +38 -25

data/lib/boilerpipe/filters/min_words_filter.rb ADDED

@@ -0,0 +1,14 @@
+# Keeps only those content blocks which contain at least k words.
+module Boilerpipe::Filters
+  class MinWordsFilter
+    def self.process(min_words, doc)
+      doc.text_blocks.each do |tb|
+        next if tb.is_not_content?
+        tb.content = false if tb.num_words < min_words
+      end
+      doc
+    end
+  end
+end

data/lib/boilerpipe/filters/num_words_rules_classifier.rb CHANGED

@@ -1,5 +1,3 @@
-# encoding: utf-8
 #  Classifies TextBlocks as content/not-content through rules that have been determined
 #  using the C4.8 machine learning algorithm, as described in the paper
 #  "Boilerplate Detection using Shallow Text Features" (WSDM 2010), particularly
@@ -7,7 +5,6 @@
 module Boilerpipe::Filters
   class NumWordsRulesClassifier
     def self.process(doc)
       empty = Boilerpipe::Document::TextBlock.empty_start
       text_blocks = [empty] + doc.text_blocks + [empty]
@@ -37,6 +34,5 @@ module Boilerpipe::Filters
       false
     end
   end
 end

data/lib/boilerpipe/filters/simple_block_fusion_processor.rb CHANGED

@@ -1,4 +1,4 @@
- # Merges two subsequent blocks if their text densities are equal.
+# Merges two subsequent blocks if their text densities are equal.
 module Boilerpipe::Filters
   class SimpleBlockFusionProcessor
@@ -17,7 +17,7 @@ module Boilerpipe::Filters
         end
       end
-      doc.replace_text_blocks!( tbs - blocks_to_remove )
+      doc.replace_text_blocks!(tbs - blocks_to_remove)
       doc
     end
   end

data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb ADDED

@@ -0,0 +1,37 @@
+# Splits TextBlocks at paragraph boundaries.
+#
+# NOTE: This is not fully supported (i.e., it will break highlighting support via
+# #getContainedTextElements()), but this one probably is necessary for some other filters.
+#
+# see MinClauseWordsFilter
+module Boilerpipe::Filters
+  class SplitParagraphBlocksFilter
+    def self.process(doc)
+      tbs = doc.text_blocks
+      new_blocks = []
+      changes = false
+      tbs.each do |tb|
+        paragraphs = tb.text.split(/[\n\r]+/)
+        if paragraphs.size < 2
+          new_blocks << tb
+          next
+        end
+        is_content = tb.is_content?
+        labels = tb.labels
+        paragraphs.each do |paragraph|
+          tbP = ::Boilerpipe::Document::TextBlock.new(paragraph)
+          tbP.content = is_content
+          tbP.add_labels(labels)
+          new_blocks << tbP
+          changes = true
+        end
+      end
+      doc.replace_text_blocks!(new_blocks) if changes
+      doc
+    end
+  end
+end

data/lib/boilerpipe/filters/terminating_blocks_finder.rb CHANGED

@@ -1,15 +1,13 @@
-# encoding: utf-8
 # Finds blocks which are potentially indicating the end of an article
 # text and marks them with INDICATES_END_OF_TEXT. This can be used
 # in conjunction with a downstream IgnoreBlocksAfterContentFilter.
 module Boilerpipe::Filters
   class TerminatingBlocksFinder
     def self.process(doc)
       doc.text_blocks.each do |tb|
         next unless tb.num_words < 15
         if tb.text.length >= 8 && finds_match?(tb.text.downcase)
           tb.labels << :INDICATES_END_OF_TEXT
         elsif tb.link_density == 1.0 && tb.text == 'comment'
@@ -29,11 +27,11 @@ module Boilerpipe::Filters
         text.include?('what you think...') ||
         text.include?('add your comment') ||
         text.include?('add comment') ||
-        #TODO add this and test
-        #text.include?('leave a reply') ||
-        #text.include?('leave a comment') ||
-        #text.include?('show comments') ||
-        #text.include?('Share this:') ||
+        # TODO add this and test
+        # text.include?('leave a reply') ||
+        # text.include?('leave a comment') ||
+        # text.include?('show comments') ||
+        # text.include?('Share this:') ||
         text.include?('reader views') ||
         text.include?('have your say') ||
         text.include?('reader comments') ||

data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb CHANGED

@@ -1,4 +1,3 @@
 # Marks trailing headlines TextBlocks that have the label :#HEADING
 # as boilerplate. Trailing means they are marked content and are
 # below any other content block.
@@ -6,7 +5,6 @@
 module Boilerpipe::Filters
   class TrailingHeadlineToBoilerplateFilter
     def self.process(doc)
       doc.text_blocks.each do |tb|
         next unless tb.is_content?
@@ -19,6 +17,5 @@ module Boilerpipe::Filters
       doc
     end
   end
 end

data/lib/boilerpipe/labels/label_action.rb CHANGED

@@ -2,7 +2,7 @@ module Boilerpipe::Labels
   class LabelAction
     attr_reader :labels
-    def initialize(labels=[])
+    def initialize(labels = [])
       @labels = labels
     end

data/lib/boilerpipe/sax/boilerpipe_html_parser.rb CHANGED

@@ -1,20 +1,16 @@
-require 'nokogiri'
 module Boilerpipe::SAX
   class BoilerpipeHTMLParser
     def self.parse(text)
-      #script bug - delete script tags
-      text  = text.gsub(/\<script>.+?<\/script>/i, '')
+      # script bug - delete script tags
+      text.gsub!(/\<script>.+?<\/script>/i, '')
       # nokogiri uses libxml for mri and nekohtml for jruby
       # mri doesn't remove &nbsp; when missing the semicolon
-      text = text.gsub(/(&nbsp) /, '\1; ')
+      text.gsub!(/(&nbsp) /, '\1; ')
       # use nokogiri to fix any bad tags, errors - keep experimenting with this
       text = Nokogiri::HTML(text).to_html
       handler = HTMLContentHandler.new
       noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
       noko_parser.parse(text)

data/lib/boilerpipe/sax/html_content_handler.rb CHANGED

@@ -1,11 +1,8 @@
-require 'nokogiri'
-require 'set'
 module Boilerpipe::SAX
   class HTMLContentHandler < Nokogiri::XML::SAX::Document
     attr_reader :in_ignorable_element, :label_stacks, :last_start_tag
-    attr_accessor :in_anchor_tag, :token_buffer ,:font_size_stack
+    attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack
     ANCHOR_TEXT_START = "$\ue00a<"
     ANCHOR_TEXT_END = ">\ue00a$"
@@ -34,7 +31,6 @@ module Boilerpipe::SAX
       @label_stacks << nil
       tag = name.upcase.intern
       tag_action = @tag_actions[tag]
       if tag_action
         @tag_level += 1 if tag_action.changes_tag_level?
@@ -51,15 +47,15 @@ module Boilerpipe::SAX
     def characters(text)
       flush_block if @flush
-      return if @in_ignorable_element != 0
+      return if in_ignorable_element?
       return if text.empty?
       # replace all whitespace with simple space
       text.gsub!(/\s+/, ' ')
       # trim whitespace
-      started_with_whitespace = text  =~ /^\s/
-      ended_with_whitespace = text  =~ /\s$/
+      started_with_whitespace = text =~ /^\s/
+      ended_with_whitespace = text =~ /\s$/
       text.strip!
       #  add a single space if the block was only whitespace
@@ -158,10 +154,10 @@ module Boilerpipe::SAX
       end
       text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
-                                 num_words,
-                                 num_linked_words,
-                                 num_words_in_wrapped_lines,
-                                 num_wrapped_lines, @offset_blocks)
+                                                         num_words,
+                                                         num_linked_words,
+                                                         num_words_in_wrapped_lines,
+                                                         num_wrapped_lines, @offset_blocks)
       @offset_blocks += 1
       clear_buffers
@@ -187,10 +183,10 @@ module Boilerpipe::SAX
     # \p{No}  -- a numeric character of other type
     def is_word?(word)
-       word =~ VALID_WORD_CHARACTER
+      word =~ VALID_WORD_CHARACTER
     end
-    #public void flushBlock() {
+    # public void flushBlock() {
     #    int numWords = 0;
     #    int numLinkedWords = 0;
     #    int numWrappedLines = 0;
@@ -198,12 +194,13 @@ module Boilerpipe::SAX
     #    final int maxLineLength = 80;
     #    int numTokens = 0;
     #    int numWordsCurrentLine = 0;
-    #}
+    # }
     def increase_in_ignorable_element!
       @in_ignorable_element += 1
     end
+    # should we prevent less than zero here?
     def decrease_in_ignorable_element!
       @in_ignorable_element -= 1
     end
@@ -224,7 +221,6 @@ module Boilerpipe::SAX
       @in_anchor_tag > 0
     end
     def add_text_block(text_block)
       @label_stacks.each do |stack|
         next unless stack
@@ -239,6 +235,7 @@ module Boilerpipe::SAX
     # append space if last character wasn't already one
     def append_space
       return if @sb_last_was_whitespace
       @sb_last_was_whitespace = true
       @text_buffer << ' '

data/lib/boilerpipe/sax/tag_action_map.rb CHANGED

@@ -48,4 +48,3 @@ module Boilerpipe::SAX
     end
   end
 end

data/lib/boilerpipe/sax/tag_actions/anchor_text.rb CHANGED

@@ -2,7 +2,7 @@ module Boilerpipe::SAX::TagActions
   class AnchorText
     # Marks this tag as "anchor" (this should usually only be set for the <A> tag). Anchor tags may not be nested.
     # There is a bug in certain versions of NekoHTML which still allows nested tags. If boilerpipe
-    #* encounters such nestings, a SAXException is thrown.
+    # * encounters such nestings, a SAXException is thrown.
     def start(handler, name, attrs)
       if handler.in_anchor_tag?
         handler.in_anchor_tag += 1
@@ -42,7 +42,7 @@ module Boilerpipe::SAX::TagActions
       # - dunno about nokogiri???????
       # as nested A elements are not allowed per specification, we
       # are probably reaching this branch due to a bug in the XML parser
-      #puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
+      # puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
       end_tag(handler, name)
     end
   end

data/lib/boilerpipe/sax/tag_actions/block_level.rb CHANGED

@@ -1,6 +1,6 @@
 module Boilerpipe::SAX::TagActions
-     # Explicitly marks this tag a simple "block-level" element,
-     # which always generates whitespace
+  # Explicitly marks this tag a simple "block-level" element,
+  # which always generates whitespace
   class BlockLevel
     def start(handler, name, attrs)
       true

data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb CHANGED

@@ -1,6 +1,6 @@
 module Boilerpipe::SAX::TagActions
-# for block-level elements, which triggers some LabelAction on
-# the generated TextBlock.
+  # for block-level elements, which triggers some LabelAction on
+  # the generated TextBlock.
   class BlockTagLabel
     def initialize(label_action)
       @label_action = label_action

data/lib/boilerpipe/sax/tag_actions/body.rb CHANGED

@@ -1,6 +1,6 @@
 module Boilerpipe::SAX::TagActions
-   # Marks this tag the body element (this should usually only
-   # be set for the <BODY> tag).
+  # Marks this tag the body element (this should usually only
+  # be set for the <BODY> tag).
   class Body
     def start(handler, name, attrs)
       handler.flush_block

data/lib/boilerpipe/sax/tag_actions/font.rb CHANGED

@@ -10,7 +10,7 @@ module Boilerpipe::SAX::TagActions
         rel = m[1]
         val = m[2].to_i # absolute
         size = rel.empty? ? val : relative(handler.font_size_stack, rel, val)
-        handler.font_size_stack <<  size
+        handler.font_size_stack << size
       else
         handler.font_size_stack << nil
       end
@@ -27,7 +27,7 @@ module Boilerpipe::SAX::TagActions
     end
     def relative(font_size_stack, rel, val)
-      prev_size = font_size_stack.reverse_each.find{|s| s != nil}
+      prev_size = font_size_stack.reverse_each.find { |s| !s.nil? }
       prev_size = 3 if prev_size.nil?
       size = if rel == '+'

data/lib/boilerpipe/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Boilerpipe
-  VERSION = '0.2.0'
+  VERSION = '0.4.3'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: boilerpipe-ruby
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.4.3
 platform: ruby
 authors:
 - Gregory Ostermayr
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-09-11 00:00:00.000000000 Z
+date: 2020-07-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -16,71 +16,71 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.11'
+        version: '2.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.11'
+        version: '2.0'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '10.0'
+        version: 12.3.3
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '10.0'
+        version: 12.3.3
 - !ruby/object:Gem::Dependency
-  name: rspec
+  name: rickshaw
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '3.0'
+        version: 0.5.0
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '3.0'
+        version: 0.5.0
 - !ruby/object:Gem::Dependency
-  name: rickshaw
+  name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.4.0
+        version: '3.9'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.4.0
+        version: '3.9'
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.6.6.2
+        version: '1.10'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.6.6.2
-description: A pure ruby implementation of the boilerpipe algorithm
+        version: '1.10'
+description: A pure ruby implementation of the boilerpipe web content extraction algorithm
 email:
 - "<gregory.ostermayr@gmail.com>"
 executables: []
@@ -88,9 +88,11 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".circleci/config.yml"
+- ".dockerignore"
 - ".gitignore"
 - ".rspec"
 - CHANGELOG.md
+- Dockerfile
 - Gemfile
 - LICENSE.txt
 - README.md
@@ -98,14 +100,22 @@ files:
 - bin/console
 - bin/setup
 - boilerpipe-ruby.gemspec
+- boilerpipe_flow.md
 - lib/boilerpipe.rb
 - lib/boilerpipe/document/text_block.rb
 - lib/boilerpipe/document/text_document.rb
 - lib/boilerpipe/errors.rb
 - lib/boilerpipe/extractors/article_extractor.rb
+- lib/boilerpipe/extractors/article_sentence_extractor.rb
+- lib/boilerpipe/extractors/canola_extractor.rb
 - lib/boilerpipe/extractors/default_extractor.rb
+- lib/boilerpipe/extractors/keep_everything_extractor.rb
+- lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb
+- lib/boilerpipe/extractors/largest_content_extractor.rb
+- lib/boilerpipe/extractors/num_words_rules_extractor.rb
 - lib/boilerpipe/filters/block_proximity_fusion.rb
 - lib/boilerpipe/filters/boilerplate_block_filter.rb
+- lib/boilerpipe/filters/canola_classifier.rb
 - lib/boilerpipe/filters/density_rules_classifier.rb
 - lib/boilerpipe/filters/document_title_match_classifier.rb
 - lib/boilerpipe/filters/expand_title_to_content_filter.rb
@@ -114,8 +124,12 @@ files:
 - lib/boilerpipe/filters/keep_largest_block_filter.rb
 - lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
 - lib/boilerpipe/filters/list_at_end_filter.rb
+- lib/boilerpipe/filters/mark_everything_content_filter.rb
+- lib/boilerpipe/filters/min_clause_words_filter.rb
+- lib/boilerpipe/filters/min_words_filter.rb
 - lib/boilerpipe/filters/num_words_rules_classifier.rb
 - lib/boilerpipe/filters/simple_block_fusion_processor.rb
+- lib/boilerpipe/filters/split_paragraph_blocks_filter.rb
 - lib/boilerpipe/filters/terminating_blocks_finder.rb
 - lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb
 - lib/boilerpipe/labels/default.rb
@@ -140,7 +154,7 @@ homepage: https://github.com/gregors/boilerpipe-ruby
 licenses:
 - Apache 2.0
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -155,9 +169,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.6.12
-signing_key:
+rubygems_version: 3.0.8
+signing_key:
 specification_version: 4
-summary: A pure ruby implemenation of the boilerpipe algorithm
+summary: A pure ruby implementation of the boilerpipe web content extraction algorithm
 test_files: []