RubyGems - boilerpipe-ruby - Versions diffs - 0.2.0 → 0.4.3 - Mend

boilerpipe-ruby 0.2.0 → 0.4.3

Files changed (50) hide show

checksums.yaml +5 -5
data/.circleci/config.yml +6 -24
data/.dockerignore +7 -0
data/CHANGELOG.md +34 -1
data/Dockerfile +14 -0
data/README.md +32 -7
data/Rakefile +3 -4
data/bin/console +3 -3
data/boilerpipe-ruby.gemspec +9 -9
data/boilerpipe_flow.md +40 -0
data/lib/boilerpipe.rb +14 -0
data/lib/boilerpipe/document/text_block.rb +10 -12
data/lib/boilerpipe/document/text_document.rb +4 -5
data/lib/boilerpipe/extractors/article_sentence_extractor.rb +17 -0
data/lib/boilerpipe/extractors/canola_extractor.rb +15 -0
data/lib/boilerpipe/extractors/default_extractor.rb +0 -1
data/lib/boilerpipe/extractors/keep_everything_extractor.rb +16 -0
data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +20 -0
data/lib/boilerpipe/extractors/largest_content_extractor.rb +18 -0
data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +14 -0
data/lib/boilerpipe/filters/block_proximity_fusion.rb +7 -12
data/lib/boilerpipe/filters/boilerplate_block_filter.rb +1 -4
data/lib/boilerpipe/filters/canola_classifier.rb +27 -0
data/lib/boilerpipe/filters/density_rules_classifier.rb +3 -2
data/lib/boilerpipe/filters/document_title_match_classifier.rb +17 -19
data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +0 -3
data/lib/boilerpipe/filters/heuristic_filter_base.rb +1 -1
data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +5 -7
data/lib/boilerpipe/filters/keep_largest_block_filter.rb +1 -4
data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +2 -4
data/lib/boilerpipe/filters/list_at_end_filter.rb +1 -2
data/lib/boilerpipe/filters/mark_everything_content_filter.rb +12 -0
data/lib/boilerpipe/filters/min_clause_words_filter.rb +34 -0
data/lib/boilerpipe/filters/min_words_filter.rb +14 -0
data/lib/boilerpipe/filters/num_words_rules_classifier.rb +0 -4
data/lib/boilerpipe/filters/simple_block_fusion_processor.rb +2 -2
data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +37 -0
data/lib/boilerpipe/filters/terminating_blocks_finder.rb +6 -8
data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +0 -3
data/lib/boilerpipe/labels/label_action.rb +1 -1
data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +3 -7
data/lib/boilerpipe/sax/html_content_handler.rb +13 -16
data/lib/boilerpipe/sax/tag_action_map.rb +0 -1
data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/block_level.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/body.rb +2 -2
data/lib/boilerpipe/sax/tag_actions/font.rb +2 -2
data/lib/boilerpipe/version.rb +1 -1
metadata +38 -25

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
-SHA1:
-  metadata.gz: cbfe98ebb939cfdc50ff0fb7a6bdc4a78b91e880
-  data.tar.gz: fe8781a571918940a74f191b81316f5bf65aaa7a
+SHA256:
+  metadata.gz: da8bc0b8d74eea14b73e61812bbeba5fef75e8bae2330739e49b28e26f73d14d
+  data.tar.gz: 68fee529b501210cf3278eb2b045b09e6d27c7846355b7d430c05e60f39088e2
 SHA512:
-  metadata.gz: e8f92169b5b7b766f4fc635ffb5e423ab5493a4bc8df6d546af7373df3863266e7a4fb7ba4973b4b84f79985e1f6585bd98a3bd3547f8bb6c558113288c2549d
-  data.tar.gz: 3f4d9cb77ce06710dd744bc0be6cfe67cfc211631b95e670f5bdccc9ffa6ab48bfe167206f08a20b83adc249ed86afa6c0556e43f83cacc57444a44a0a539e1f
+  metadata.gz: 4202afab2a01ae588977fde351dfacc29551634077e794d028282666c43d3aeb09adf425d60608dc694c36f2fd8ed034ef89ba41ee07e6ad23f426c19d740931
+  data.tar.gz: a0dc75a0c5384e1eaf8b50dfb92bc9294b41f38e8b0e0cceb9e9a6aafdf436629df2b3420f61a33523a08d6788c02ade799cdf8fe29db4338c766c7b01523704

data/.circleci/config.yml CHANGED

@@ -6,48 +6,30 @@ version: 2
 jobs:
   build:
     docker:
-      # specify the version you desire here
-       - image: circleci/ruby:2.4.1-node-browsers
+       - image: circleci/ruby:2.5.5-node-browsers
       # Specify service dependencies here if necessary
       # CircleCI maintains a library of pre-built images
       # documented at https://circleci.com/docs/2.0/circleci-images/
-      # - image: circleci/postgres:9.4
     working_directory: ~/repo
     steps:
       - checkout
-      # Download and cache dependencies
-      - restore_cache:
-          keys:
-          - v1-dependencies-{{ checksum "Gemfile.lock" }}
-          # fallback to using the latest cache if no exact match is found
-          - v1-dependencies-
+      - run: gem install bundler
       - run:
           name: install dependencies
           command: |
-            bundle install --jobs=4 --retry=3 --path vendor/bundle
+            bundle install --jobs=4 --retry=3
-      - save_cache:
-          paths:
-            - ./vendor/bundle
-          key: v1-dependencies-{{ checksum "Gemfile.lock" }}
-      # Database setup
-      #- run: bundle exec rake db:create
-      #- run: bundle exec rake db:schema:load
-      # run tests!
       - run:
           name: run tests
           command: |
             mkdir /tmp/test-results
             TEST_FILES="$(circleci tests glob "spec/**/*_spec.rb")"
-            rspec --format progress "spec"
+            bundle exec rspec --format progress "spec"
       # collect reports
       - store_test_results:

data/.dockerignore ADDED

@@ -0,0 +1,7 @@
+.git
+.gitignore
+ log/*
+ tmp/*
+ *.swp
+ *.swo
+ Gemfile.lock

data/CHANGELOG.md CHANGED

@@ -1,3 +1,36 @@
+# 0.4.3 / 2020-07-18
+* update deps
+# 0.4.2 / 2020-03-11
+* update deps
+# 0.4.1 / 2019-07-04
+* Fix bug in min_clause_words_filter ( used in article_sentence_extractor )
+* Allow tests to run in Docker
+* Update circle to continue to work
+* Add architecture flow
+* Code formatting
+* Add min words filter specs
+* Add label action specs
+* Add missing test case to ignorable element spec
+* Add merge_next case to text block spec
+* Dry up includes
+# 0.4.0 / 2017-09-15
+* Add KeepEverythingWithMinKWords Extractor
+* Add ArticleSentence Extractor
+# 0.3.0 / 2017-09-12
+* Add LargestContent Extractor
+* Add KeepEverything Extractor
+* Add NumWordsRules Extractor
+* Add Canola Extractor
 # 0.2.0 / 2017-09-11
 * Add Default Extractor
@@ -10,4 +43,4 @@
 # 0.1.0 / 2017-09-08
-* Add Article Extractor
+* Add Article Extractor

data/Dockerfile ADDED

@@ -0,0 +1,14 @@
+# Test image for the gem: the default command runs the RSpec suite.
+FROM ruby:2.5
+RUN gem install bundler
+# Copy the gemspec, Gemfile, version file and bin scripts before running
+# bin/setup; the full source tree is only copied afterwards.
+COPY *gemspec /usr/src/app/
+COPY Gemfile /usr/src/app/
+COPY lib/boilerpipe/version.rb /usr/src/app/lib/boilerpipe/
+# Copy the scripts into bin/ only. (COPY of a directory copies its *contents*,
+# so copying `bin` to the app root would scatter the scripts there.)
+COPY bin/* /usr/src/app/bin/
+WORKDIR /usr/src/app
+# bin/setup installs the gem's dependencies (see the README "Development" section).
+RUN bin/setup
+COPY . /usr/src/app/
+CMD ["bundle", "exec", "rspec", "--color", "--format", "doc"]

data/README.md CHANGED

@@ -1,5 +1,8 @@
 # Boilerpipe
+[![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
+[![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
 A pure ruby implementation of the boilerpipe algorithm.
 This is a text extraction utility first written by Christian Kohlschütter - [presentation](http://videolectures.net/wsdm2010_kohlschutter_bdu/)
@@ -10,15 +13,22 @@ I saw other gems making use of boilerpipe via the [free api](http://boilerpipe-w
 This solution works great if you're using JRuby but I wanted a pure ruby solution to use on MRI. Open vim - start coding...
-I've only got the ArticleExtractor working but the others should be following quickly as the ArticleExtractor definitley has the most code behind it...
+Here's a high level [diagram](boilerpipe_flow.md) of how the system works.
+# TLDR
+Just use either ArticleExtractor, DefaultExtractor or KeepEverythingExtractor - try out the others when you feel like experimenting...
 Presently the following Extractors are implemented
 * [x] ArticleExtractor
+* [x] ArticleSentenceExtractor
+* [x] CanolaExtractor
 * [x] DefaultExtractor
-* [ ] LargestContentExtractor
-* [ ] KeepEverythingExtractor
+* [x] KeepEverythingExtractor
+* [x] KeepEverythingWithMinKWordsExtractor
+* [x] LargestContentExtractor
+* [x] NumWordsRulesExtractor
-[![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
 ## Installation
@@ -44,16 +54,31 @@ Or install it yourself as:
     > require 'open-uri'
       => true
     > content = open('https://blog.carbonfive.com/2017/08/28/always-squash-and-rebase-your-git-commits/').read; true;
-    > output = Boilerpipe::Extractors::ArticleExtractor.text(content).slice(0..40)
+    > Boilerpipe::Extractors::ArticleExtractor.text(content).slice(0..40)
      => "Always Squash and Rebase your Git Commits"
-    > output = Boilerpipe::Extractors::DefaultExtractor.text(content).slice(0..40)
+    > Boilerpipe::Extractors::DefaultExtractor.text(content).slice(0..40)
      => "Posted on\nWhat is the squash rebase workf"
+    > Boilerpipe::Extractors::LargestContentExtractor.text(content).slice(0, 40)
+     => "git push origin master\nWhy should you ad"
+    > Boilerpipe::Extractors::KeepEverythingExtractor.text(content).slice(0..40)
+     => "Toggle Navigation\nCarbon Five\nAbout\nWork\n"
 ## Development
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
-To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
+To install this gem onto your local machine, run `bundle exec rake install`.
+### Running Tests on Docker
+The default run command will run the tests
+    docker build -t boilerpipe .
+    docker run -it --rm boilerpipe
 ## Contributing

data/Rakefile CHANGED

@@ -1,14 +1,13 @@
-require "bundler/gem_tasks"
-require "rspec/core/rake_task"
+require 'bundler/gem_tasks'
+require 'rspec/core/rake_task'
 RSpec::Core::RakeTask.new(:spec)
 task :default => :spec
 desc 'Downloads forked boilerpipe jar from Gregors github for sanity checks'
 task :download_boilerpipe_jar do
   FileUtils.mkdir_p 'spec/sanity_checks/jars/'
   Dir.chdir 'spec/sanity_checks/jars/'
- `wget 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'`
+  `wget 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'`
 end

data/bin/console CHANGED

@@ -1,7 +1,7 @@
 #!/usr/bin/env ruby
-require "bundler/setup"
-require "boilerpipe"
+require 'bundler/setup'
+require 'boilerpipe'
 # You can add fixtures and/or initialization code here to make experimenting
 # with your gem easier. You can also use a different console, if you like.
@@ -10,5 +10,5 @@ require "boilerpipe"
 # require "pry"
 # Pry.start
-require "irb"
+require 'irb'
 IRB.start

data/boilerpipe-ruby.gemspec CHANGED

@@ -10,18 +10,18 @@ Gem::Specification.new do |spec|
   spec.email         = ['<gregory.ostermayr@gmail.com>']
   spec.license       = 'Apache 2.0'
-  spec.summary       = %q{A pure ruby implemenation of the boilerpipe algorithm}
-  spec.description   = %q{A pure ruby implementation of the boilerpipe algorithm}
+  spec.summary       = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
+  spec.description   = %q{A pure ruby implementation of the boilerpipe web content extraction algorithm}
   spec.homepage      = 'https://github.com/gregors/boilerpipe-ruby'
   spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
-  spec.bindir        = "exe"
+  spec.bindir        = 'exe'
   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
-  spec.require_paths = ["lib"]
+  spec.require_paths = ['lib']
-  spec.add_development_dependency 'bundler', '~> 1.11'
-  spec.add_development_dependency 'rake', '~> 10.0'
-  spec.add_development_dependency 'rspec', '~> 3.0'
-  spec.add_development_dependency 'rickshaw', '~> 0.4.0'
-  spec.add_runtime_dependency 'nokogiri', '>= 1.6.6.2'
+  spec.add_development_dependency 'bundler', '~> 2.0'
+  spec.add_development_dependency 'rake', '>= 12.3.3'
+  spec.add_development_dependency 'rickshaw', '~> 0.5.0'
+  spec.add_development_dependency 'rspec', '~> 3.9'
+  spec.add_runtime_dependency 'nokogiri', '~> 1.10'
 end

data/boilerpipe_flow.md ADDED

@@ -0,0 +1,40 @@
+```
+raw html
+   |
+   |
+  sax input -> sax parser(html parser) ->  HTML Content handler -> tokenizer ---------
+                                                                                     |
+    -------------------------------------<------------------------------------<------|
+    |              |            |
+text blocks    text blocks  text blocks
+    |              |            |
+    |              |            |
+    -----------------------------
+          |
+          |
+     text document
+          |
+          |
+        filter
+          |
+        filter
+          |
+        filter
+          |
+        filter
+          |
+        filter
+          |
+        filter
+          |
+        filter
+          |
+        filter
+          |
+        filter
+          |
+          |
+     text document
+          |
+  outputs extracted text
+  ```

data/lib/boilerpipe.rb CHANGED

@@ -1,3 +1,6 @@
+require 'nokogiri'
+require 'set'
 require 'boilerpipe/version'
 require 'boilerpipe/util/unicode_tokenizer'
@@ -6,10 +9,17 @@ require 'boilerpipe/document/text_document'
 require 'boilerpipe/document/text_block'
 require 'boilerpipe/extractors/article_extractor'
+require 'boilerpipe/extractors/article_sentence_extractor'
+require 'boilerpipe/extractors/canola_extractor'
 require 'boilerpipe/extractors/default_extractor'
+require 'boilerpipe/extractors/keep_everything_extractor'
+require 'boilerpipe/extractors/keep_everything_with_k_min_words_extractor'
+require 'boilerpipe/extractors/largest_content_extractor'
+require 'boilerpipe/extractors/num_words_rules_extractor'
 require 'boilerpipe/filters/block_proximity_fusion'
 require 'boilerpipe/filters/boilerplate_block_filter'
+require 'boilerpipe/filters/canola_classifier'
 require 'boilerpipe/filters/density_rules_classifier'
 require 'boilerpipe/filters/document_title_match_classifier'
 require 'boilerpipe/filters/expand_title_to_content_filter'
@@ -18,8 +28,12 @@ require 'boilerpipe/filters/ignore_blocks_after_content_filter'
 require 'boilerpipe/filters/keep_largest_block_filter'
 require 'boilerpipe/filters/large_block_same_tag_level_to_content_filter'
 require 'boilerpipe/filters/list_at_end_filter'
+require 'boilerpipe/filters/mark_everything_content_filter'
+require 'boilerpipe/filters/min_clause_words_filter'
+require 'boilerpipe/filters/min_words_filter'
 require 'boilerpipe/filters/num_words_rules_classifier'
 require 'boilerpipe/filters/simple_block_fusion_processor'
+require 'boilerpipe/filters/split_paragraph_blocks_filter'
 require 'boilerpipe/filters/terminating_blocks_finder'
 require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'

data/lib/boilerpipe/document/text_block.rb CHANGED

@@ -1,10 +1,7 @@
-require 'set'
 module Boilerpipe
   module Document
     class TextBlock
-       #EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
+      # EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
       attr_reader :text, :num_words, :num_words_in_wrapped_lines, :num_words_in_anchor_text,
                   :num_wrapped_lines, :offset_blocks_start, :offset_blocks_end, :text_density,
@@ -12,7 +9,7 @@ module Boilerpipe
       attr_accessor :content
-      def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=1, offset_blocks=0)
+      def initialize(text, num_words = 0, num_words_in_anchor_text = 0, num_words_in_wrapped_lines = 0, num_wrapped_lines = 1, offset_blocks = 0)
         @labels = Set.new
         @text = text
         @num_words = num_words
@@ -32,9 +29,9 @@ module Boilerpipe
         new('', 0, 0, 0, 0, -1)
       end
-     def set_tag_level(level)
-       @tag_level = level
-     end
+      def set_tag_level(level)
+        @tag_level = level
+      end
       def is_content?
         @content
@@ -68,8 +65,8 @@ module Boilerpipe
         @num_words_in_anchor_text += other.num_words_in_anchor_text
         @num_words_in_wrapped_lines += other.num_words_in_wrapped_lines
         @num_wrapped_lines += other.num_wrapped_lines
-        @offset_blocks_start = [@offset_blocks_start , other.offset_blocks_start].min
-        @offset_blocks_end = [@offset_blocks_end , other.offset_blocks_end].max
+        @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
+        @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max
         init_densities
         @content |= other.is_content?
@@ -87,10 +84,10 @@ module Boilerpipe
       end
       def to_s
-        #"[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
+        # "[" + offsetBlocksStart + "-" + offsetBlocksEnd + ";tl=" + tagLevel + "; nw=" + numWords + ";nwl=" + numWrappedLines + ";ld=" + linkDensity + "]\t" + (isContent ? "CONTENT" : "boilerplate") + "," + labels + "\n" + getText();
         labels = 'null'
         if !@labels.empty?
-          labels ="[#{ @labels.to_a.join(',')}]"
+          labels = "[#{@labels.to_a.join(',')}]"
         end
         "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels}\n#{text}"
       end
@@ -100,6 +97,7 @@ module Boilerpipe
       end
       private
       def init_densities
         if @num_words_in_wrapped_lines == 0
           @num_words_in_wrapped_lines = @num_words

data/lib/boilerpipe/document/text_document.rb CHANGED

@@ -19,14 +19,14 @@ module Boilerpipe
           case text_block.is_content?
           when true
             next unless include_content
-            s << text_block.text
-            s << "\n"
           when false
             next unless include_noncontent
-           s << text_block.text
-           s << "\n"
           end
+          s << text_block.text
+          s << "\n"
         end
         s
       end
@@ -38,7 +38,6 @@ module Boilerpipe
         @text_blocks.map(&:to_s).join("\n")
       end
       alias_method :debug_string, :debug_s
     end
   end
 end

data/lib/boilerpipe/extractors/article_sentence_extractor.rb ADDED

@@ -0,0 +1,17 @@
+# A full-text extractor which is tuned towards extracting sentences from news articles.
+module Boilerpipe::Extractors
+  class ArticleSentenceExtractor
+    # Convenience entry point: parses the given HTML, runs this extractor's
+    # pipeline on the parsed document, and returns the document's content.
+    #
+    # contents - the raw HTML, handed unchanged to BoilerpipeHTMLParser.parse
+    #
+    # Returns whatever doc.content yields after processing.
+    def self.text(contents)
+      doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
+      ::Boilerpipe::Extractors::ArticleSentenceExtractor.process(doc)
+      doc.content
+    end
+    # Runs the pipeline on an already-parsed document, in this order:
+    # ArticleExtractor.process, then SplitParagraphBlocksFilter, then
+    # MinClauseWordsFilter.
+    #
+    # NOTE(review): self.text ignores this method's return value and reads
+    # doc.content afterwards, so the steps presumably modify doc in place —
+    # confirm against the filter implementations.
+    #
+    # Returns the result of the final MinClauseWordsFilter.process call.
+    def self.process(doc)
+      ::Boilerpipe::Extractors::ArticleExtractor.process doc
+      ::Boilerpipe::Filters::SplitParagraphBlocksFilter.process doc
+      ::Boilerpipe::Filters::MinClauseWordsFilter.process doc
+    end
+  end
+end