RubyGems - boilerpipe-ruby - Versions diffs - 0.3.0 → 0.4.4 - Mend

boilerpipe-ruby 0.3.0 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50)

checksums.yaml +5 -5
data/.circleci/config.yml +6 -24
data/.dockerignore +7 -0
data/CHANGELOG.md +30 -1
data/Dockerfile +14 -0
data/README.md +15 -4
data/Rakefile +3 -4
data/bin/console +3 -3
data/boilerpipe-ruby.gemspec +9 -9
data/boilerpipe_flow.md +40 -0
data/lib/boilerpipe.rb +9 -0
data/lib/boilerpipe/document/text_block.rb +10 -12
data/lib/boilerpipe/document/text_document.rb +4 -5
data/lib/boilerpipe/extractors/article_sentence_extractor.rb +17 -0
data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +20 -0
data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +0 -3
data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
data/lib/boilerpipe/filters/min_clause_words_filter.rb +34 -0
data/lib/boilerpipe/filters/min_words_filter.rb +14 -0
data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +37 -0
data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
data/lib/boilerpipe/labels/label_action.rb +1 -1
data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -11
data/lib/boilerpipe/sax/html_content_handler.rb +13 -16
data/lib/boilerpipe/sax/preprocessor.rb +11 -0
data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/body.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
data/lib/boilerpipe/version.rb +1 -1
metadata +33 -25

data/lib/boilerpipe/filters/simple_block_fusion_processor.rb CHANGED Viewed

@@ -1,4 +1,4 @@
- # Merges two subsequent blocks if their text densities are equal.
+# Merges two subsequent blocks if their text densities are equal.
 module Boilerpipe::Filters
   class SimpleBlockFusionProcessor
@@ -17,7 +17,7 @@ module Boilerpipe::Filters
         end
       end
-      doc.replace_text_blocks!( tbs - blocks_to_remove )
+      doc.replace_text_blocks!(tbs - blocks_to_remove)
       doc
     end
   end

data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb ADDED Viewed

@@ -0,0 +1,37 @@
+# Splits TextBlocks at paragraph boundaries.
+#
+# NOTE: This is not fully supported (i.e., it will break highlighting support via
+# #getContainedTextElements()), but this one probably is necessary for some other filters.
+#
+# see MinClauseWordsFilter
+module Boilerpipe::Filters
+  class SplitParagraphBlocksFilter
+    def self.process(doc)
+      tbs = doc.text_blocks
+      new_blocks = []
+      changes = false
+      tbs.each do |tb|
+        paragraphs = tb.text.split(/[\n\r]+/)
+        if paragraphs.size < 2
+          new_blocks << tb
+          next
+        end
+        is_content = tb.is_content?
+        labels = tb.labels
+        paragraphs.each do |paragraph|
+          tbP = ::Boilerpipe::Document::TextBlock.new(paragraph)
+          tbP.content = is_content
+          tbP.add_labels(labels)
+          new_blocks << tbP
+          changes = true
+        end
+      end
+      doc.replace_text_blocks!(new_blocks) if changes
+      doc
+    end
+  end
+end

data/lib/boilerpipe/filters/terminating_blocks_finder.rb CHANGED Viewed

@@ -1,15 +1,13 @@
-# encoding: utf-8
 # Finds blocks which are potentially indicating the end of an article
 # text and marks them with INDICATES_END_OF_TEXT. This can be used
 # in conjunction with a downstream IgnoreBlocksAfterContentFilter.
 module Boilerpipe::Filters
   class TerminatingBlocksFinder
     def self.process(doc)
       doc.text_blocks.each do |tb|
         next unless tb.num_words < 15
         if tb.text.length >= 8 && finds_match?(tb.text.downcase)
           tb.labels << :INDICATES_END_OF_TEXT
         elsif tb.link_density == 1.0 && tb.text == 'comment'
@@ -29,11 +27,11 @@ module Boilerpipe::Filters
         text.include?('what you think...') ||
         text.include?('add your comment') ||
         text.include?('add comment') ||
-        #TODO add this and test
-        #text.include?('leave a reply') ||
-        #text.include?('leave a comment') ||
-        #text.include?('show comments') ||
-        #text.include?('Share this:') ||
+        # TODO add this and test
+        # text.include?('leave a reply') ||
+        # text.include?('leave a comment') ||
+        # text.include?('show comments') ||
+        # text.include?('Share this:') ||
         text.include?('reader views') ||
         text.include?('have your say') ||
         text.include?('reader comments') ||

data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb CHANGED Viewed

@@ -1,4 +1,3 @@
 # Marks trailing headlines TextBlocks that have the label :#HEADING
 # as boilerplate. Trailing means they are marked content and are
 # below any other content block.
@@ -6,7 +5,6 @@
 module Boilerpipe::Filters
   class TrailingHeadlineToBoilerplateFilter
     def self.process(doc)
       doc.text_blocks.each do |tb|
         next unless tb.is_content?
@@ -19,6 +17,5 @@ module Boilerpipe::Filters
       doc
     end
   end
 end

data/lib/boilerpipe/labels/label_action.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module Boilerpipe::Labels
   class LabelAction
     attr_reader :labels
-    def initialize(labels=[])
+    def initialize(labels = [])
       @labels = labels
     end

data/lib/boilerpipe/sax/boilerpipe_html_parser.rb CHANGED Viewed

@@ -1,20 +1,11 @@
-require 'nokogiri'
 module Boilerpipe::SAX
   class BoilerpipeHTMLParser
     def self.parse(text)
-      #script bug - delete script tags
-      text  = text.gsub(/\<script>.+?<\/script>/i, '')
-      # nokogiri uses libxml for mri and nekohtml for jruby
-      # mri doesn't remove &nbsp; when missing the semicolon
-      text = text.gsub(/(&nbsp) /, '\1; ')
+      # strip out tags that cause issues
+      text = Preprocessor.strip(text)
       # use nokogiri to fix any bad tags, errors - keep experimenting with this
       text = Nokogiri::HTML(text).to_html
       handler = HTMLContentHandler.new
       noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
       noko_parser.parse(text)

data/lib/boilerpipe/sax/html_content_handler.rb CHANGED Viewed

@@ -1,11 +1,8 @@
-require 'nokogiri'
-require 'set'
 module Boilerpipe::SAX
   class HTMLContentHandler < Nokogiri::XML::SAX::Document
     attr_reader :in_ignorable_element, :label_stacks, :last_start_tag
-    attr_accessor :in_anchor_tag, :token_buffer ,:font_size_stack
+    attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack
     ANCHOR_TEXT_START = "$\ue00a<"
     ANCHOR_TEXT_END = ">\ue00a$"
@@ -34,7 +31,6 @@ module Boilerpipe::SAX
       @label_stacks << nil
       tag = name.upcase.intern
       tag_action = @tag_actions[tag]
       if tag_action
         @tag_level += 1 if tag_action.changes_tag_level?
@@ -51,15 +47,15 @@ module Boilerpipe::SAX
     def characters(text)
       flush_block if @flush
-      return if @in_ignorable_element != 0
+      return if in_ignorable_element?
       return if text.empty?
       # replace all whitespace with simple space
       text.gsub!(/\s+/, ' ')
       # trim whitespace
-      started_with_whitespace = text  =~ /^\s/
-      ended_with_whitespace = text  =~ /\s$/
+      started_with_whitespace = text =~ /^\s/
+      ended_with_whitespace = text =~ /\s$/
       text.strip!
       #  add a single space if the block was only whitespace
@@ -158,10 +154,10 @@ module Boilerpipe::SAX
       end
       text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
-                                 num_words,
-                                 num_linked_words,
-                                 num_words_in_wrapped_lines,
-                                 num_wrapped_lines, @offset_blocks)
+                                                         num_words,
+                                                         num_linked_words,
+                                                         num_words_in_wrapped_lines,
+                                                         num_wrapped_lines, @offset_blocks)
       @offset_blocks += 1
       clear_buffers
@@ -187,10 +183,10 @@ module Boilerpipe::SAX
     # \p{No}  -- a numeric character of other type
     def is_word?(word)
-       word =~ VALID_WORD_CHARACTER
+      word =~ VALID_WORD_CHARACTER
     end
-    #public void flushBlock() {
+    # public void flushBlock() {
     #    int numWords = 0;
     #    int numLinkedWords = 0;
     #    int numWrappedLines = 0;
@@ -198,12 +194,13 @@ module Boilerpipe::SAX
     #    final int maxLineLength = 80;
     #    int numTokens = 0;
     #    int numWordsCurrentLine = 0;
-    #}
+    # }
     def increase_in_ignorable_element!
       @in_ignorable_element += 1
     end
+    # should we prevent less than zero here?
     def decrease_in_ignorable_element!
       @in_ignorable_element -= 1
     end
@@ -224,7 +221,6 @@ module Boilerpipe::SAX
       @in_anchor_tag > 0
     end
     def add_text_block(text_block)
       @label_stacks.each do |stack|
         next unless stack
@@ -239,6 +235,7 @@ module Boilerpipe::SAX
     # append space if last character wasn't already one
     def append_space
       return if @sb_last_was_whitespace
       @sb_last_was_whitespace = true
       @text_buffer << ' '

data/lib/boilerpipe/sax/preprocessor.rb ADDED Viewed

@@ -0,0 +1,11 @@
+module Boilerpipe::SAX
+  class Preprocessor
+    def self.strip(text)
+      # script bug - delete script tags
+      text = text.gsub(/\<script.+?<\/script>/im, '')
+      # nokogiri uses libxml for mri and nekohtml for jruby
+      # mri doesn't remove &nbsp; when missing the semicolon
+      text.gsub(/(&nbsp) /, '\1; ')
+    end
+  end
+end

data/lib/boilerpipe/sax/tag_action_map.rb CHANGED Viewed

@@ -48,4 +48,3 @@ module Boilerpipe::SAX
     end
   end
 end

data/lib/boilerpipe/sax/tag_actions/anchor_text.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module Boilerpipe::SAX::TagActions
   class AnchorText
     # Marks this tag as "anchor" (this should usually only be set for the <A> tag). Anchor tags may not be nested.
     # There is a bug in certain versions of NekoHTML which still allows nested tags. If boilerpipe
-    #* encounters such nestings, a SAXException is thrown.
+    # * encounters such nestings, a SAXException is thrown.
     def start(handler, name, attrs)
       if handler.in_anchor_tag?
         handler.in_anchor_tag += 1
@@ -42,7 +42,7 @@ module Boilerpipe::SAX::TagActions
       # - dunno about nokogiri???????
       # as nested A elements are not allowed per specification, we
       # are probably reaching this branch due to a bug in the XML parser
-      #puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
+      # puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
       end_tag(handler, name)
     end
   end

data/lib/boilerpipe/sax/tag_actions/block_level.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Boilerpipe::SAX::TagActions
-     # Explicitly marks this tag a simple "block-level" element,
-     # which always generates whitespace
+  # Explicitly marks this tag a simple "block-level" element,
+  # which always generates whitespace
   class BlockLevel
     def start(handler, name, attrs)
       true

data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Boilerpipe::SAX::TagActions
-# for block-level elements, which triggers some LabelAction on
-# the generated TextBlock.
+  # for block-level elements, which triggers some LabelAction on
+  # the generated TextBlock.
   class BlockTagLabel
     def initialize(label_action)
       @label_action = label_action

data/lib/boilerpipe/sax/tag_actions/body.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Boilerpipe::SAX::TagActions
-   # Marks this tag the body element (this should usually only
-   # be set for the <BODY> tag).
+  # Marks this tag the body element (this should usually only
+  # be set for the <BODY> tag).
   class Body
     def start(handler, name, attrs)
       handler.flush_block

data/lib/boilerpipe/sax/tag_actions/font.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module Boilerpipe::SAX::TagActions
         rel = m[1]
         val = m[2].to_i # absolute
         size = rel.empty? ? val : relative(handler.font_size_stack, rel, val)
-        handler.font_size_stack <<  size
+        handler.font_size_stack << size
       else
         handler.font_size_stack << nil
       end
@@ -27,7 +27,7 @@ module Boilerpipe::SAX::TagActions
     end
     def relative(font_size_stack, rel, val)
-      prev_size = font_size_stack.reverse_each.find{|s| s != nil}
+      prev_size = font_size_stack.reverse_each.find { |s| !s.nil? }
       prev_size = 3 if prev_size.nil?
       size = if rel == '+'

data/lib/boilerpipe/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Boilerpipe
-  VERSION = '0.3.0'
+  VERSION = '0.4.4'
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: boilerpipe-ruby
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.4.4
 platform: ruby
 authors:
 - Gregory Ostermayr
-autorequire:
+autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-09-12 00:00:00.000000000 Z
+date: 2021-02-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -16,71 +16,71 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.11'
+        version: '2.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.11'
+        version: '2.0'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '10.0'
+        version: 12.3.3
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '10.0'
+        version: 12.3.3
 - !ruby/object:Gem::Dependency
-  name: rspec
+  name: rickshaw
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '3.0'
+        version: 0.5.0
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '3.0'
+        version: 0.5.0
 - !ruby/object:Gem::Dependency
-  name: rickshaw
+  name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.4.0
+        version: '3.10'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.4.0
+        version: '3.10'
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.6.6.2
+        version: '1.10'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - ">="
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: 1.6.6.2
-description: A pure ruby implementation of the boilerpipe algorithm
+        version: '1.10'
+description: A pure ruby implementation of the boilerpipe web content extraction algorithm
 email:
 - "<gregory.ostermayr@gmail.com>"
 executables: []
@@ -88,9 +88,11 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".circleci/config.yml"
+- ".dockerignore"
 - ".gitignore"
 - ".rspec"
 - CHANGELOG.md
+- Dockerfile
 - Gemfile
 - LICENSE.txt
 - README.md
@@ -98,14 +100,17 @@ files:
 - bin/console
 - bin/setup
 - boilerpipe-ruby.gemspec
+- boilerpipe_flow.md
 - lib/boilerpipe.rb
 - lib/boilerpipe/document/text_block.rb
 - lib/boilerpipe/document/text_document.rb
 - lib/boilerpipe/errors.rb
 - lib/boilerpipe/extractors/article_extractor.rb
+- lib/boilerpipe/extractors/article_sentence_extractor.rb
 - lib/boilerpipe/extractors/canola_extractor.rb
 - lib/boilerpipe/extractors/default_extractor.rb
 - lib/boilerpipe/extractors/keep_everything_extractor.rb
+- lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb
 - lib/boilerpipe/extractors/largest_content_extractor.rb
 - lib/boilerpipe/extractors/num_words_rules_extractor.rb
 - lib/boilerpipe/filters/block_proximity_fusion.rb
@@ -120,14 +125,18 @@ files:
 - lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
 - lib/boilerpipe/filters/list_at_end_filter.rb
 - lib/boilerpipe/filters/mark_everything_content_filter.rb
+- lib/boilerpipe/filters/min_clause_words_filter.rb
+- lib/boilerpipe/filters/min_words_filter.rb
 - lib/boilerpipe/filters/num_words_rules_classifier.rb
 - lib/boilerpipe/filters/simple_block_fusion_processor.rb
+- lib/boilerpipe/filters/split_paragraph_blocks_filter.rb
 - lib/boilerpipe/filters/terminating_blocks_finder.rb
 - lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb
 - lib/boilerpipe/labels/default.rb
 - lib/boilerpipe/labels/label_action.rb
 - lib/boilerpipe/sax/boilerpipe_html_parser.rb
 - lib/boilerpipe/sax/html_content_handler.rb
+- lib/boilerpipe/sax/preprocessor.rb
 - lib/boilerpipe/sax/tag_action_map.rb
 - lib/boilerpipe/sax/tag_actions/anchor_text.rb
 - lib/boilerpipe/sax/tag_actions/block_level.rb
@@ -146,7 +155,7 @@ homepage: https://github.com/gregors/boilerpipe-ruby
 licenses:
 - Apache 2.0
 metadata: {}
-post_install_message:
+post_install_message:
 rdoc_options: []
 require_paths:
 - lib
@@ -161,9 +170,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.6.12
-signing_key:
+rubygems_version: 3.0.8
+signing_key:
 specification_version: 4
-summary: A pure ruby implemenation of the boilerpipe algorithm
+summary: A pure ruby implementation of the boilerpipe web content extraction algorithm
 test_files: []