RubyGems - boilerpipe-ruby - Versions diffs - 0.4.0 → 0.4.1 - Mend

boilerpipe-ruby 0.4.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48)

checksums.yaml +5 -5
data/.circleci/config.yml +6 -24
data/.dockerignore +7 -0
data/CHANGELOG.md +13 -0
data/Dockerfile +14 -0
data/README.md +12 -3
data/Rakefile +3 -4
data/bin/console +3 -3
data/boilerpipe-ruby.gemspec +6 -6
data/boilerpipe_flow.md +40 -0
data/lib/boilerpipe.rb +3 -0
data/lib/boilerpipe/document/text_block.rb +10 -12
data/lib/boilerpipe/document/text_document.rb +4 -3
data/lib/boilerpipe/extractors/canola_extractor.rb +0 -1
data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
data/lib/boilerpipe/extractors/keep_everything_extractor.rb +1 -1
data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +0 -1
data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +0 -1
data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
data/lib/boilerpipe/filters/canola_classifier.rb +4 -5
data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +0 -3
data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
data/lib/boilerpipe/filters/mark_everything_content_filter.rb +1 -3
data/lib/boilerpipe/filters/min_clause_words_filter.rb +8 -11
data/lib/boilerpipe/filters/min_words_filter.rb +1 -3
data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +0 -3
data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
data/lib/boilerpipe/labels/label_action.rb +1 -1
data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +1 -5
data/lib/boilerpipe/sax/html_content_handler.rb +13 -16
data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/body.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
data/lib/boilerpipe/version.rb +1 -1
metadata +16 -14

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: 68c2ea4ee42a6e1d76e85f7eaa3de9ca95f3a8d3
-  data.tar.gz: 42199704467cb7f20a8fff7c616be67bae1966e1
+SHA256:
+  metadata.gz: 65756911038bd486a08337188c3275ebd0c0c65e4b902edbd6b6667dda422740
+  data.tar.gz: ff83632d9cea8e4a0ede8609115e8282a856d2d7728c801e64ecec39a9399857
 SHA512:
-  metadata.gz: 875da6a2ddfdf517509e3ebb7b9c804fadf4a372b812df42bce92fccb82eb8ca31283f11e764c6796c15997bcb8fddf24301d821f4e24a962f40ba6f973f6f17
-  data.tar.gz: 5dfec7323587057c64b931725df2aa5b53dc5a2fadaef52374619fe5798826c98c4e7ae8bb1ea9267b5db9f40249d5822752700e57a145b058231088249d1ac6
+  metadata.gz: e5e902c81cea26252c41bc4b96d0faebe6682a0dc5ae2c09397762ef4c5a7f244c0c100f87923863b308d2ef9b5ecc732e674d2ca801e4087f99031d46776034
+  data.tar.gz: 6788183a0a4c9d01c764d17537c52edaf9d32b93fb42da8a013f9f0b14f6a4f757a8d3b5f77b73fef59f62fcf988a723abf7fe8305f5626b0337838c4eb31c7d

data/.circleci/config.yml CHANGED

@@ -6,48 +6,30 @@ version: 2
 jobs:
   build:
     docker:
-      # specify the version you desire here
-       - image: circleci/ruby:2.4.1-node-browsers
+       - image: circleci/ruby:2.5.5-node-browsers
       # Specify service dependencies here if necessary
       # CircleCI maintains a library of pre-built images
       # documented at https://circleci.com/docs/2.0/circleci-images/
-      # - image: circleci/postgres:9.4
     working_directory: ~/repo
     steps:
       - checkout
-      # Download and cache dependencies
-      - restore_cache:
-          keys:
-          - v1-dependencies-{{ checksum "Gemfile.lock" }}
-          # fallback to using the latest cache if no exact match is found
-          - v1-dependencies-
+      - run: gem install bundler
       - run:
           name: install dependencies
           command: |
-            bundle install --jobs=4 --retry=3 --path vendor/bundle
+            bundle install --jobs=4 --retry=3
-      - save_cache:
-          paths:
-            - ./vendor/bundle
-          key: v1-dependencies-{{ checksum "Gemfile.lock" }}
-      # Database setup
-      #- run: bundle exec rake db:create
-      #- run: bundle exec rake db:schema:load
-      # run tests!
       - run:
           name: run tests
           command: |
             mkdir /tmp/test-results
             TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb")"
-            rspec --format progress "spec"
+            bundle exec rspec --format progress "spec"
       # collect reports
       - store_test_results:

data/.dockerignore ADDED

@@ -0,0 +1,7 @@
+.git
+.gitignore
+ log/*
+ tmp/*
+ *.swp
+ *.swo
+ Gemfile.lock

data/CHANGELOG.md CHANGED

@@ -1,3 +1,16 @@
+# 0.4.1 / 2019-07-04
+* Fix bug in min_clause_words_filter ( used in article_sentence_extractor )
+* Allow tests to run in Docker
+* Update circle to continue to work
+* Add architecture flow
+* Code formatting
+* Add min words filter specs
+* Add label action specs
+* Add missing test case to ignorable element spec
+* Add merge_next case to text block spec
+* Dry up includes
 # 0.4.0 / 2017-09-15
 * Add KeepEverythingWithMinKWords Extractor

data/Dockerfile ADDED

@@ -0,0 +1,14 @@
+From ruby:2.5
+RUN gem install bundler
+COPY *gemspec /usr/src/app/
+COPY Gemfile /usr/src/app/
+COPY lib/boilerpipe/version.rb /usr/src/app/lib/boilerpipe/
+COPY bin /usr/src/app/
+COPY bin/* /usr/src/app/bin/
+WORKDIR /usr/src/app
+RUN bin/setup
+COPY . /usr/src/app/
+CMD ["bundle", "exec", "rspec", "--color", "--format", "doc"]

data/README.md CHANGED

@@ -1,5 +1,8 @@
 # Boilerpipe
+[![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
+[![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
 A pure ruby implemenation of the boilerpipe algorithm.
 This is a text extraction utility first written by Christian Kohlshutter - [presentation](http://videolectures.net/wsdm2010_kohlschutter_bdu/)
@@ -10,6 +13,8 @@ I saw other gems making use of boilerpipe via the [free api](http://boilerpipe-w
 This solution works great if you're using Jruby but I wanted a pure ruby solution to use on MRI. Open vim - start coding...
+Here's a high level [diagram](boilerpipe_flow.md) of how the system works.
 # TLDR
 Just use either ArticleExtractor, DefaultExtractor or KeepEverythingExtractor - try out the others when you feel like experimenting...
@@ -24,9 +29,6 @@ Presently the follow Extractors are implemented
 * [x] LargestContentExtractor
 * [x] NumWordsRulesExtractor
-[![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
-[![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
 ## Installation
@@ -71,6 +73,13 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
 To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+### Running Tests on Docker
+The default run command will run the tests
+    docker build -t boilerpipe .
+    docker run -it --rm boilerpipe
 ## Contributing
 Bug reports and pull requests are welcome on GitHub at https://github.com/gregors/boilerpipe-ruby.

data/Rakefile CHANGED

@@ -1,14 +1,13 @@
-require "bundler/gem_tasks"
-require "rspec/core/rake_task"
+require 'bundler/gem_tasks'
+require 'rspec/core/rake_task'
 RSpec::Core::RakeTask.new(:spec)
 task :default => :spec
 desc 'Downloads forked boilerpipe jar from Gregors github for sanity checks'
 task :download_boilerpipe_jar do
   FileUtils.mkdir_p 'spec/sanity_checks/jars/'
   Dir.chdir 'spec/sanity_checks/jars/'
- `wget 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'`
+  `wget 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'`
 end

data/bin/console CHANGED

@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
-require "bundler/setup"
-require "boilerpipe"
+require 'bundler/setup'
+require 'boilerpipe'
 # You can add fixtures and/or initialization code here to make experimenting
 # with your gem easier. You can also use a different console, if you like.
@@ -10,5 +10,5 @@ require "boilerpipe"
 # require "pry"
 # Pry.start
-require "irb"
+require 'irb'
 IRB.start

data/boilerpipe-ruby.gemspec CHANGED

@@ -10,18 +10,18 @@ Gem::Specification.new do |spec|
   spec.email         = ['<gregory.ostermayr@gmail.com>']
   spec.license       = 'Apache 2.0'
-  spec.summary       = %q{A pure ruby implemenation of the boilerpipe algorithm}
-  spec.description   = %q{A pure ruby implementation of the boilerpipe algorithm}
+  spec.summary       = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
+  spec.description   = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
   spec.homepage      = 'https://github.com/gregors/boilerpipe-ruby'
   spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
-  spec.bindir        = "exe"
+  spec.bindir        = 'exe'
   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
-  spec.require_paths = ["lib"]
+  spec.require_paths = ['lib']
-  spec.add_development_dependency 'bundler', '~> 1.11'
+  spec.add_development_dependency 'bundler', '~> 2.0'
   spec.add_development_dependency 'rake', '~> 10.0'
-  spec.add_development_dependency 'rspec', '~> 3.0'
   spec.add_development_dependency 'rickshaw', '~> 0.4.0'
+  spec.add_development_dependency 'rspec', '~> 3.0'
   spec.add_runtime_dependency 'nokogiri', '>= 1.6.6.2'
 end

data/boilerpipe_flow.md ADDED

@@ -0,0 +1,40 @@
+```
+raw html
+   |
+   |
+  sax input -> sax parser(html parser) ->  HTML Content handler -> tokenizer ---------
+                                                                                     |
+    -------------------------------------<------------------------------------<------|
+    |              |            |
+text blocks    text blocks  text blocks
+    |              |            |
+    |              |            |
+    -----------------------------
+          |
+          |
+     text document
+          |
+          |
+        filter
+          |
+        filter
+          |
+        filter
+          |
+        filter
+          |
+        filter
+          |
+        filter
+          |
+        filter
+          |
+        filter
+          |
+        filter
+          |
+          |
+     text document
+          |
+  outputs extracted text
+  ```

data/lib/boilerpipe.rb CHANGED

@@ -1,3 +1,6 @@
+require 'nokogiri'
+require 'set'
 require 'boilerpipe/version'
 require 'boilerpipe/util/unicode_tokenizer'

data/lib/boilerpipe/document/text_block.rb CHANGED

@@ -1,10 +1,7 @@
-require 'set'
 module Boilerpipe
   module Document
     class TextBlock
-       #EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
+      # EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
       attr_reader :text, :num_words, :num_words_in_wrapped_lines, :num_words_in_anchor_text,
                   :num_wrapped_lines, :offset_blocks_start, :offset_blocks_end, :text_density,
@@ -12,7 +9,7 @@ module Boilerpipe
       attr_accessor :content
-      def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=1, offset_blocks=0)
+      def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0)
         @labels = Set.new
         @text = text
         @num_words = num_words
@@ -32,9 +29,9 @@ module Boilerpipe
         new('', 0, 0, 0, 0, -1)
       end
-     def set_tag_level(level)
-       @tag_level = level
-     end
+      def set_tag_level(level)
+        @tag_level = level
+      end
       def is_content?
         @content
@@ -68,8 +65,8 @@ module Boilerpipe
         @num_words_in_anchor_text += other.num_words_in_anchor_text
         @num_words_in_wrapped_lines += other.num_words_in_wrapped_lines
         @num_wrapped_lines += other.num_wrapped_lines
-        @offset_blocks_start = [@offset_blocks_start , other.offset_blocks_start].min
-        @offset_blocks_end = [@offset_blocks_end , other.offset_blocks_end].max
+        @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
+        @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max
         init_densities
         @content |= other.is_content?
@@ -87,10 +84,10 @@ module Boilerpipe
       end
       def to_s
-        #"[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
+        # "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
         labels = 'null'
         if !@labels.empty?
-          labels ="[#{ @labels.to_a.join(',')}]"
+          labels = "[#{@labels.to_a.join(',')}]"
         end
         "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels}\n#{text}"
       end
@@ -100,6 +97,7 @@ module Boilerpipe
       end
       private
       def init_densities
         if @num_words_in_wrapped_lines == 0
           @num_words_in_wrapped_lines = @num_words

data/lib/boilerpipe/document/text_document.rb CHANGED

@@ -19,12 +19,14 @@ module Boilerpipe
           case text_block.is_content?
           when true
             next unless include_content
             s << text_block.text
             s << "\n"
           when false
             next unless include_noncontent
-           s << text_block.text
-           s << "\n"
+            s << text_block.text
+            s << "\n"
           end
         end
         s
@@ -38,7 +40,6 @@ module Boilerpipe
         @text_blocks.map(&:to_s).join("\n")
       end
       alias_method :debug_string, :debug_s
     end
   end
 end

data/lib/boilerpipe/extractors/canola_extractor.rb CHANGED

@@ -1,6 +1,5 @@
 module Boilerpipe::Extractors
   class CanolaExtractor
     def self.text(contents)
       doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
       ::Boilerpipe::Extractors::CanolaExtractor.process doc

data/lib/boilerpipe/extractors/default_extractor.rb CHANGED

@@ -1,6 +1,5 @@
 module Boilerpipe::Extractors
   class DefaultExtractor
     def self.text(contents)
       doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
       ::Boilerpipe::Extractors::DefaultExtractor.process doc

data/lib/boilerpipe/extractors/keep_everything_extractor.rb CHANGED

@@ -1,4 +1,4 @@
- # Marks all blocks as content.
+# Marks all blocks as content.
 module Boilerpipe::Extractors
   class KeepEverythingExtractor

data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb CHANGED

@@ -1,4 +1,3 @@
 # A full-text extractor which extracts the largest text component of a page.
 # For news articles, it may perform better than the DefaultExtractor, but
 # usually worse than ArticleExtractor.

data/lib/boilerpipe/extractors/num_words_rules_extractor.rb CHANGED

@@ -1,6 +1,5 @@
 module Boilerpipe::Extractors
   class NumWordsRulesExtractor
     def self.text(contents)
       doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
       ::Boilerpipe::Extractors::NumWordsRulesExtractor.process doc

data/lib/boilerpipe/filters/block_proximity_fusion.rb CHANGED

@@ -1,11 +1,8 @@
-    # Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
-    # probably makes sense only in cases where an upstream filter already has removed some blocks.
+# Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
+# probably makes sense only in cases where an upstream filter already has removed some blocks.
 module Boilerpipe::Filters
   class BlockProximityFusion
     def initialize(max_blocks_distance, content_only, same_tag_level_only)
       @max_blocks_distance = max_blocks_distance
       @content_only = content_only
@@ -13,8 +10,8 @@ module Boilerpipe::Filters
     end
     MAX_DISTANCE_1 = BlockProximityFusion.new(1, false, false)
-    MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new( 1, false, true)
-    MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new( 1, true, false)
+    MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new(1, false, true)
+    MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new(1, true, false)
     MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion.new(1, true, true)
     def process(doc)
@@ -22,7 +19,7 @@ module Boilerpipe::Filters
       return false if text_blocks.size < 2
       prev_block = if @content_only
-                     text_blocks.find{ |tb| tb.is_content? }
+                     text_blocks.find { |tb| tb.is_content? }
                    else
                      text_blocks.first
                    end
@@ -46,18 +43,16 @@ module Boilerpipe::Filters
           ok = false if (prev_block.is_not_content? || tb.is_not_content?) && @content_only
           ok = false if ok && prev_block.tag_level != tb.tag_level && @same_tag_level_only
-          if  ok
+          if ok
             prev_block.merge_next(tb)
             blocks_to_remove << tb
           else
             prev_block = tb
           end
         end
       end
-      doc.replace_text_blocks!( text_blocks - blocks_to_remove )
+      doc.replace_text_blocks!(text_blocks - blocks_to_remove)
       doc
     end
   end
 end

data/lib/boilerpipe/filters/boilerplate_block_filter.rb CHANGED

@@ -1,9 +1,7 @@
- # Removes TextBlocks which have explicitly been marked as "not content".
+# Removes TextBlocks which have explicitly been marked as "not content".
 module Boilerpipe::Filters
   class BoilerplateBlockFilter
     def initialize(label)
       @label_to_keep = label
     end
@@ -21,6 +19,5 @@ module Boilerpipe::Filters
       doc.replace_text_blocks!(combined)
       doc
     end
   end
 end

data/lib/boilerpipe/filters/canola_classifier.rb CHANGED

@@ -1,10 +1,9 @@
- # A full-text extractor trained on http://krdwrd.org/
- # https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
- # Works well with SimpleEstimator, too.
+# A full-text extractor trained on http://krdwrd.org/
+# https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
+# Works well with SimpleEstimator, too.
 module Boilerpipe::Filters
   class CanolaClassifier
     def self.process(doc)
       return doc if doc.text_blocks.size < 1
@@ -22,7 +21,7 @@ module Boilerpipe::Filters
     def self.classify(prev, current, nxt)
       current.link_density > 0 && nxt.num_words > 11 \
         || current.num_words > 19 \
-        || nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && ( current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19 )
+        || nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && (current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19)
     end
   end
 end

data/lib/boilerpipe/filters/density_rules_classifier.rb CHANGED

@@ -5,9 +5,8 @@
 module Boilerpipe::Filters
   class DensityRulesClassifier
     def self.process(doc)
-      #return doc if doc.text_blocks.size < 2
+      # return doc if doc.text_blocks.size < 2
       empty = Boilerpipe::Document::TextBlock.empty_start
       text_blocks = [empty] + doc.text_blocks + [empty]
@@ -26,12 +25,14 @@ module Boilerpipe::Filters
       if prev.link_density <= 0.555556
         if current.text_density <= 9
           return true if nxt.text_density > 10
           return prev.text_density <= 4 ? false : true
         else
           return nxt.text_density == 0 ? false : true
         end
       else
         return false if nxt.text_density <= 11
         true
       end
     end

data/lib/boilerpipe/filters/document_title_match_classifier.rb CHANGED

@@ -1,12 +1,9 @@
-# encoding: utf-8
-require 'set'
+# Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
+# some heuristics which are quite specific to the news domain.
- # Marks TextBlocks which contain parts of the HTML <TITLE> tag, using
- # some heuristics which are quite specific to the news domain.
-    # we create a list of potential titles from the page title
-    # then we look at every text block and if the text block
-    # contains a potential title - we set that text block label as :TITLE
+# we create a list of potential titles from the page title
+# then we look at every text block and if the text block
+# contains a potential title - we set that text block label as :TITLE
 module Boilerpipe::Filters
   class DocumentTitleMatchClassifier
@@ -55,24 +52,24 @@ module Boilerpipe::Filters
       @potential_titles << title
       # unnecessary
-      #p = longest_part(title, /[ ]*[|»-][ ]*/)
-      #@potential_titles << p if p
+      # p = longest_part(title, /[ ]*[|»-][ ]*/)
+      # @potential_titles << p if p
-      #p = longest_part(title, /[ ]*[|»:][ ]*/)
-      #@potential_titles << p if p
+      # p = longest_part(title, /[ ]*[|»:][ ]*/)
+      # @potential_titles << p if p
-      #p = longest_part(title, /[ ]*[|»:()][ ]*/)
-      #@potential_titles << p if p
+      # p = longest_part(title, /[ ]*[|»:()][ ]*/)
+      # @potential_titles << p if p
-      #p = longest_part(title, /[ ]*[|»:()-][ ]*/)
-      #@potential_titles << p if p
+      # p = longest_part(title, /[ ]*[|»:()-][ ]*/)
+      # @potential_titles << p if p
       p = longest_part(title, /[ ]*[|»,:()-][ ]*/)
       @potential_titles << p if p
       # we replace \u00a0 so why check for it?
-      #p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
-      #@potential_titles << p if p
+      # p = longest_part(title, /[ ]*[|»,:()-\u00a0][ ]*/)
+      # @potential_titles << p if p
       add_potential_titles(title, /[ ]+[|][ ]+/, 4)
       add_potential_titles(title, /[ ]+[-][ ]+/, 4)
@@ -90,6 +87,7 @@ module Boilerpipe::Filters
       parts.each do |part|
         next if part =~ /[.]com/
         num_words = number_of_words(part)
         if num_words > longest_num_words || part.size > longest_part.size
@@ -107,6 +105,7 @@ module Boilerpipe::Filters
       parts.each do |part|
         next if part =~ /[.]com/
         num_words = number_of_words(part)
         @potential_titles << part if num_words >= min_words
@@ -116,6 +115,5 @@ module Boilerpipe::Filters
     def number_of_words(s)
       s.split(/[\b ]+/).size
     end
   end
 end

data/lib/boilerpipe/filters/expand_title_to_content_filter.rb CHANGED

@@ -1,10 +1,8 @@
 # Marks all TextBlocks "content" which are between the headline and the part that has
 # already been marked content, if they are marked MIGHT_BE_CONTENT.
 # This filter is quite specific to the news domain.
 # used downstream of KeepLargetBlockFilter since that's what sets MIGHT_BE_CONTENT
 module Boilerpipe::Filters
   class ExpandTitleToContentFilter
     def self.process(doc)
@@ -38,6 +36,5 @@ module Boilerpipe::Filters
     def self.no_title_with_subsequent_content?(content_start, title)
       title.nil? || content_start.nil? || content_start <= title
     end
   end
 end

data/lib/boilerpipe/filters/heuristic_filter_base.rb CHANGED

@@ -1,6 +1,6 @@
 module Boilerpipe::Filters
   class HeuristicFilterBase
-    def self.num_full_text_words(tb, min_text_density=9.0)
+    def self.num_full_text_words(tb, min_text_density = 9.0)
       tb.text_density >= min_text_density ? tb.num_words : 0
     end
   end

data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb CHANGED

@@ -1,12 +1,11 @@
- # Marks all blocks as "non-content" that occur after blocks that have been
- # marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
- # number of words in content blocks occur before this mark (default: 60).
- # This can be used in conjunction with an upstream TerminatingBlocksFinder.
+# Marks all blocks as "non-content" that occur after blocks that have been
+# marked INDICATES_END_OF_TEXT. These marks are ignored unless a minimum
+# number of words in content blocks occur before this mark (default: 60).
+# This can be used in conjunction with an upstream TerminatingBlocksFinder.
 module Boilerpipe::Filters
   class IgnoreBlocksAfterContentFilter < HeuristicFilterBase
-    def self.process(doc, min_num_words=60)
+    def self.process(doc, min_num_words = 60)
       found_end_of_text = false
       num_words = 0
@@ -19,6 +18,5 @@ module Boilerpipe::Filters
       doc
     end
   end
 end

data/lib/boilerpipe/filters/keep_largest_block_filter.rb CHANGED

@@ -1,4 +1,3 @@
 # Keeps the largest TextBlock only (by the number of words). In case of
 # more than one block with the same number of words, the first block is chosen.
 # All discarded blocks are marked "not content" and flagged as :MIGHT_BE_CONTENT.
@@ -8,7 +7,6 @@
 module Boilerpipe::Filters
   class KeepLargestBlockFilter
     def initialize(expand_to_same_level_text, min_words)
       @expand_to_same_level_text = expand_to_same_level_text
       @min_words = min_words
@@ -43,7 +41,7 @@ module Boilerpipe::Filters
         expand_tag_level(tbs[0...n].reverse, level, @min_words)
         # expand blocks to the right
-        expand_tag_level(tbs[n+1..-1], level, @min_words)
+        expand_tag_level(tbs[n + 1..-1], level, @min_words)
       end
     end
@@ -57,6 +55,5 @@ module Boilerpipe::Filters
         end
       end
     end
   end
 end

data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb CHANGED

@@ -1,4 +1,3 @@
 #  Marks all blocks as content that:
 #  are on the same tag-level as very likely main content
 #  (usually the level of the largest  block)
@@ -7,23 +6,22 @@
 module Boilerpipe::Filters
   class LargeBlockSameTagLevelToContentFilter
     def self.process(doc)
       largest = doc.text_blocks.find do |tb|
         tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
       end
       return doc if largest.nil?
       tag_level = largest.tag_level
       doc.text_blocks.each do |tb|
         next if tb.is_content?
         tb.content = true if tb.num_words >= 100 && tb.tag_level == tag_level
       end
       doc
     end
   end
 end

data/lib/boilerpipe/filters/list_at_end_filter.rb CHANGED

@@ -11,7 +11,7 @@ module Boilerpipe::Filters
       doc.text_blocks.each do |tb|
         if tb.is_content? && tb.has_label?(:VERY_LIKELY_CONTENT)
           tag_level = tb.tag_level
-        elsif (tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0)
+        elsif tb.tag_level > tag_level && tb.has_label?(:MIGHT_BE_CONTENT) && tb.has_label?(:LI) && tb.link_density == 0
           tb.content = true
         else
           tag_level = MAX
@@ -20,6 +20,5 @@ module Boilerpipe::Filters
       doc
     end
   end
 end

data/lib/boilerpipe/filters/mark_everything_content_filter.rb CHANGED

@@ -1,14 +1,12 @@
- # Marks all blocks as content.
+# Marks all blocks as content.
 module Boilerpipe::Filters
   class MarkEverythingContentFilter
     def self.process(doc)
       doc.text_blocks.each do |tb|
         tb.content = true if tb.is_not_content?
       end
       doc
     end
   end
 end

data/lib/boilerpipe/filters/min_clause_words_filter.rb CHANGED

@@ -8,30 +8,27 @@
 module Boilerpipe::Filters
   class MinClauseWordsFilter
-    def self.process(doc, min_words=5)
+    def self.process(doc, min_words = 5)
       doc.text_blocks.each do |tb|
         next if tb.is_not_content?
         clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/
+        hasClause = false
         tb.text.scan(clause_delimiter).each do |possible_clause|
-          if is_clause? possible_clause
-            break
-          else
-            tb.content = false
-          end
+          hasClause |= is_clause? possible_clause
         end
+        tb.content = false unless hasClause
       end
       doc
     end
-    def self.is_clause?(text, min_words=5)
-     return false if text.nil?
+    def self.is_clause?(text, min_words = 5)
+      return false if text.nil?
       whitespace = /[ \n\r]+/
       text.scan(whitespace).size >= min_words
     end
   end
 end

data/lib/boilerpipe/filters/min_words_filter.rb CHANGED

@@ -1,16 +1,14 @@
 # Keeps only those content blocks which contain at least k words.
 module Boilerpipe::Filters
   class MinWordsFilter
     def self.process(min_words, doc)
       doc.text_blocks.each do |tb|
         next if tb.is_not_content?
         tb.content = false if tb.num_words < min_words
       end
       doc
     end
   end
 end

data/lib/boilerpipe/filters/num_words_rules_classifier.rb CHANGED

@@ -1,5 +1,3 @@
-# encoding: utf-8
 #  Classifies TextBlocks as content/not-content through rules that have been determined
 #  using the C4.8 machine learning algorithm, as described in the paper
 #  "Boilerplate Detection using Shallow Text Features" (WSDM 2010), particularly
@@ -7,7 +5,6 @@
 module Boilerpipe::Filters
   class NumWordsRulesClassifier
     def self.process(doc)
       empty = Boilerpipe::Document::TextBlock.empty_start
       text_blocks = [empty] + doc.text_blocks + [empty]
@@ -37,6 +34,5 @@ module Boilerpipe::Filters
       false
     end
   end
 end

data/lib/boilerpipe/filters/simple_block_fusion_processor.rb CHANGED

@@ -1,4 +1,4 @@
- # Merges two subsequent blocks if their text densities are equal.
+# Merges two subsequent blocks if their text densities are equal.
 module Boilerpipe::Filters
   class SimpleBlockFusionProcessor
@@ -17,7 +17,7 @@ module Boilerpipe::Filters
         end
       end
-      doc.replace_text_blocks!( tbs - blocks_to_remove )
+      doc.replace_text_blocks!(tbs - blocks_to_remove)
       doc
     end
   end

data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb CHANGED

@@ -1,4 +1,3 @@
 # Splits TextBlocks at paragraph boundaries.
 #
 # NOTE: This is not fully supported (i.e., it will break highlighting support via
@@ -8,7 +7,6 @@
 module Boilerpipe::Filters
   class SplitParagraphBlocksFilter
     def self.process(doc)
       tbs = doc.text_blocks
       new_blocks = []
@@ -35,6 +33,5 @@ module Boilerpipe::Filters
       doc.replace_text_blocks!(new_blocks) if changes
       doc
     end
   end
 end

data/lib/boilerpipe/filters/terminating_blocks_finder.rb CHANGED

@@ -1,15 +1,13 @@
-# encoding: utf-8
 # Finds blocks which are potentially indicating the end of an article
 # text and marks them with INDICATES_END_OF_TEXT. This can be used
 # in conjunction with a downstream IgnoreBlocksAfterContentFilter.
 module Boilerpipe::Filters
   class TerminatingBlocksFinder
     def self.process(doc)
       doc.text_blocks.each do |tb|
         next unless tb.num_words < 15
         if tb.text.length >= 8 && finds_match?(tb.text.downcase)
           tb.labels << :INDICATES_END_OF_TEXT
         elsif tb.link_density == 1.0 && tb.text == 'comment'
@@ -29,11 +27,11 @@ module Boilerpipe::Filters
         text.include?('what you think...') ||
         text.include?('add your comment') ||
         text.include?('add comment') ||
-        #TODO add this and test
-        #text.include?('leave a reply') ||
-        #text.include?('leave a comment') ||
-        #text.include?('show comments') ||
-        #text.include?('Share this:') ||
+        # TODO add this and test
+        # text.include?('leave a reply') ||
+        # text.include?('leave a comment') ||
+        # text.include?('show comments') ||
+        # text.include?('Share this:') ||
         text.include?('reader views') ||
         text.include?('have your say') ||
         text.include?('reader comments') ||

data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb CHANGED

@@ -1,4 +1,3 @@
 # Marks trailing headlines TextBlocks that have the label :#HEADING
 # as boilerplate. Trailing means they are marked content and are
 # below any other content block.
@@ -6,7 +5,6 @@
 module Boilerpipe::Filters
   class TrailingHeadlineToBoilerplateFilter
     def self.process(doc)
       doc.text_blocks.each do |tb|
         next unless tb.is_content?
@@ -19,6 +17,5 @@ module Boilerpipe::Filters
       doc
     end
   end
 end

data/lib/boilerpipe/labels/label_action.rb CHANGED

@@ -2,7 +2,7 @@ module Boilerpipe::Labels
   class LabelAction
     attr_reader :labels
-    def initialize(labels=[])
+    def initialize(labels = [])
       @labels = labels
     end

data/lib/boilerpipe/sax/boilerpipe_html_parser.rb CHANGED

@@ -1,20 +1,16 @@
-require 'nokogiri'
 module Boilerpipe::SAX
   class BoilerpipeHTMLParser
     def self.parse(text)
-      #script bug - delete script tags
+      # script bug - delete script tags
       text.gsub!(/\<script>.+?<\/script>/i, '')
       # nokogiri uses libxml for mri and nekohtml for jruby
       # mri doesn't remove &nbsp; when missing the semicolon
       text.gsub!(/(&nbsp) /, '\1; ')
       # use nokogiri to fix any bad tags, errors - keep experimenting with this
       text = Nokogiri::HTML(text).to_html
       handler = HTMLContentHandler.new
       noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
       noko_parser.parse(text)

data/lib/boilerpipe/sax/html_content_handler.rb CHANGED

@@ -1,11 +1,8 @@
-require 'nokogiri'
-require 'set'
 module Boilerpipe::SAX
   class HTMLContentHandler < Nokogiri::XML::SAX::Document
     attr_reader :in_ignorable_element, :label_stacks, :last_start_tag
-    attr_accessor :in_anchor_tag, :token_buffer ,:font_size_stack
+    attr_accessor :in_anchor_tag, :token_buffer, :font_size_stack
     ANCHOR_TEXT_START = "$\ue00a<"
     ANCHOR_TEXT_END = ">\ue00a$"
@@ -34,7 +31,6 @@ module Boilerpipe::SAX
       @label_stacks << nil
       tag = name.upcase.intern
       tag_action = @tag_actions[tag]
       if tag_action
         @tag_level += 1 if tag_action.changes_tag_level?
@@ -51,15 +47,15 @@ module Boilerpipe::SAX
     def characters(text)
       flush_block if @flush
-      return if @in_ignorable_element != 0
+      return if in_ignorable_element?
       return if text.empty?
       # replace all whitespace with simple space
       text.gsub!(/\s+/, ' ')
       # trim whitespace
-      started_with_whitespace = text  =~ /^\s/
-      ended_with_whitespace = text  =~ /\s$/
+      started_with_whitespace = text =~ /^\s/
+      ended_with_whitespace = text =~ /\s$/
       text.strip!
       #  add a single space if the block was only whitespace
@@ -158,10 +154,10 @@ module Boilerpipe::SAX
       end
       text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
-                                 num_words,
-                                 num_linked_words,
-                                 num_words_in_wrapped_lines,
-                                 num_wrapped_lines, @offset_blocks)
+                                                         num_words,
+                                                         num_linked_words,
+                                                         num_words_in_wrapped_lines,
+                                                         num_wrapped_lines, @offset_blocks)
       @offset_blocks += 1
       clear_buffers
@@ -187,10 +183,10 @@ module Boilerpipe::SAX
     # \p{No}  -- a numeric character of other type
     def is_word?(word)
-       word =~ VALID_WORD_CHARACTER
+      word =~ VALID_WORD_CHARACTER
     end
-    #public void flushBlock() {
+    # public void flushBlock() {
     #    int numWords = 0;
     #    int numLinkedWords = 0;
     #    int numWrappedLines = 0;
@@ -198,12 +194,13 @@ module Boilerpipe::SAX
     #    final int maxLineLength = 80;
     #    int numTokens = 0;
     #    int numWordsCurrentLine = 0;
-    #}
+    # }
     def increase_in_ignorable_element!
       @in_ignorable_element += 1
     end
+    # should we prevent less than zero here?
     def decrease_in_ignorable_element!
       @in_ignorable_element -= 1
     end
@@ -224,7 +221,6 @@ module Boilerpipe::SAX
       @in_anchor_tag > 0
     end
     def add_text_block(text_block)
       @label_stacks.each do |stack|
         next unless stack
@@ -239,6 +235,7 @@ module Boilerpipe::SAX
     # append space if last character wasn't already one
     def append_space
       return if @sb_last_was_whitespace
       @sb_last_was_whitespace = true
       @text_buffer << ' '

data/lib/boilerpipe/sax/tag_action_map.rb CHANGED

@@ -48,4 +48,3 @@ module Boilerpipe::SAX
     end
   end
 end

data/lib/boilerpipe/sax/tag_actions/anchor_text.rb CHANGED

@@ -2,7 +2,7 @@ module Boilerpipe::SAX::TagActions
   class AnchorText
     # Marks this tag as "anchor" (this should usually only be set for the <A> tag). Anchor tags may not be nested.
     # There is a bug in certain versions of NekoHTML which still allows nested tags. If boilerpipe
-    #* encounters such nestings, a SAXException is thrown.
+    # * encounters such nestings, a SAXException is thrown.
     def start(handler, name, attrs)
       if handler.in_anchor_tag?
         handler.in_anchor_tag += 1
@@ -42,7 +42,7 @@ module Boilerpipe::SAX::TagActions
       # - dunno about nokogiri???????
       # as nested A elements are not allowed per specification, we
       # are probably reaching this branch due to a bug in the XML parser
-      #puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
+      # puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
       end_tag(handler, name)
     end
   end

data/lib/boilerpipe/sax/tag_actions/block_level.rb CHANGED

@@ -1,6 +1,6 @@
 module Boilerpipe::SAX::TagActions
-     # Explicitly marks this tag a simple "block-level" element,
-     # which always generates whitespace
+  # Explicitly marks this tag a simple "block-level" element,
+  # which always generates whitespace
   class BlockLevel
     def start(handler, name, attrs)
       true

data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb CHANGED

@@ -1,6 +1,6 @@
 module Boilerpipe::SAX::TagActions
-# for block-level elements, which triggers some LabelAction on
-# the generated TextBlock.
+  # for block-level elements, which triggers some LabelAction on
+  # the generated TextBlock.
   class BlockTagLabel
     def initialize(label_action)
       @label_action = label_action

data/lib/boilerpipe/sax/tag_actions/body.rb CHANGED

@@ -1,6 +1,6 @@
 module Boilerpipe::SAX::TagActions
-   # Marks this tag the body element (this should usually only
-   # be set for the <BODY> tag).
+  # Marks this tag the body element (this should usually only
+  # be set for the <BODY> tag).
   class Body
     def start(handler, name, attrs)
       handler.flush_block

data/lib/boilerpipe/sax/tag_actions/font.rb CHANGED

@@ -10,7 +10,7 @@ module Boilerpipe::SAX::TagActions
         rel = m[1]
         val = m[2].to_i # absolute
         size = rel.empty? ? val : relative(handler.font_size_stack, rel, val)
-        handler.font_size_stack <<  size
+        handler.font_size_stack << size
       else
         handler.font_size_stack << nil
       end
@@ -27,7 +27,7 @@ module Boilerpipe::SAX::TagActions
     end
     def relative(font_size_stack, rel, val)
-      prev_size = font_size_stack.reverse_each.find{|s| s != nil}
+      prev_size = font_size_stack.reverse_each.find { |s| !s.nil? }
       prev_size = 3 if prev_size.nil?
       size = if rel == '+'

data/lib/boilerpipe/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Boilerpipe
-  VERSION = '0.4.0'
+  VERSION = '0.4.1'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: boilerpipe-ruby
 version: !ruby/object:Gem::Version
-  version: 0.4.0
+  version: 0.4.1
 platform: ruby
 authors:
 - Gregory Ostermayr
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-09-15 00:00:00.000000000 Z
+date: 2019-07-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -16,14 +16,14 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.11'
+        version: '2.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.11'
+        version: '2.0'
 - !ruby/object:Gem::Dependency
   name: rake
   requirement: !ruby/object:Gem::Requirement
@@ -39,33 +39,33 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '10.0'
 - !ruby/object:Gem::Dependency
-  name: rspec
+  name: rickshaw
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '3.0'
+        version: 0.4.0
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '3.0'
+        version: 0.4.0
 - !ruby/object:Gem::Dependency
-  name: rickshaw
+  name: rspec
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.4.0
+        version: '3.0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: 0.4.0
+        version: '3.0'
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
@@ -80,7 +80,7 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: 1.6.6.2
-description: A pure ruby implementation of the boilerpipe algorithm
+description: A pure ruby implementation of the boilerpipe web content extraction algorithm
 email:
 - "<gregory.ostermayr@gmail.com>"
 executables: []
@@ -88,9 +88,11 @@ extensions: []
 extra_rdoc_files: []
 files:
 - ".circleci/config.yml"
+- ".dockerignore"
 - ".gitignore"
 - ".rspec"
 - CHANGELOG.md
+- Dockerfile
 - Gemfile
 - LICENSE.txt
 - README.md
@@ -98,6 +100,7 @@ files:
 - bin/console
 - bin/setup
 - boilerpipe-ruby.gemspec
+- boilerpipe_flow.md
 - lib/boilerpipe.rb
 - lib/boilerpipe/document/text_block.rb
 - lib/boilerpipe/document/text_document.rb
@@ -166,9 +169,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.6.12
+rubygems_version: 3.0.1
 signing_key:
 specification_version: 4
-summary: A pure ruby implemenation of the boilerpipe algorithm
+summary: A pure ruby implementation of the boilerpipe web content extraction algorithm
 test_files: []