RubyGems - boilerpipe-ruby - Versions diffs - 0.2.0 → 0.3.0 - Mend

boilerpipe-ruby 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +7 -0
data/README.md +19 -5
data/lib/boilerpipe.rb +6 -0
data/lib/boilerpipe/extractors/canola_extractor.rb +16 -0
data/lib/boilerpipe/extractors/keep_everything_extractor.rb +16 -0
data/lib/boilerpipe/extractors/largest_content_extractor.rb +18 -0
data/lib/boilerpipe/extractors/num_words_rules_extractor.rb +15 -0
data/lib/boilerpipe/filters/canola_classifier.rb +28 -0
data/lib/boilerpipe/filters/mark_everything_content_filter.rb +14 -0
data/lib/boilerpipe/version.rb +1 -1
metadata +8 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: cbfe98ebb939cfdc50ff0fb7a6bdc4a78b91e880
-  data.tar.gz: fe8781a571918940a74f191b81316f5bf65aaa7a
+  metadata.gz: 64511e224c1ec186c5c7cde62568dd0ba8cf1005
+  data.tar.gz: 455a8bac8eaadda62706d8f507854c6f6ccb6dba
 SHA512:
-  metadata.gz: e8f92169b5b7b766f4fc635ffb5e423ab5493a4bc8df6d546af7373df3863266e7a4fb7ba4973b4b84f79985e1f6585bd98a3bd3547f8bb6c558113288c2549d
-  data.tar.gz: 3f4d9cb77ce06710dd744bc0be6cfe67cfc211631b95e670f5bdccc9ffa6ab48bfe167206f08a20b83adc249ed86afa6c0556e43f83cacc57444a44a0a539e1f
+  metadata.gz: 8f2f769627e15ca0b8293122143304ef397ed789567cff61ceb48f45a657336442711c17d29d25fa8cfdf0dbcfd557030063e7b2a132550921ce601e2573e71f
+  data.tar.gz: 630f46c4d3a6e71933be0ba5f3bc98dc97fd34d10fb05d9e656e8f5f2f20fb4c28197122ef349398c79de7e57c795721307a5dadc7b4a796f907b2904d4816d0

data/CHANGELOG.md CHANGED

@@ -1,3 +1,10 @@
+# 0.3.0 / 2017-09-12
+* Add LargestContent Extractor
+* Add KeepEverything Extractor
+* Add NumWordsRules Extractor
+* Add Canola Extractor
 # 0.2.0 / 2017-09-11
 * Add Default Extractor

data/README.md CHANGED

@@ -10,13 +10,19 @@ I saw other gems making use of boilerpipe via the [free api](http://boilerpipe-w
 This solution works great if you're using Jruby but I wanted a pure ruby solution to use on MRI. Open vim - start coding...
-I've only got the ArticleExtractor working but the others should be following quickly as the ArticleExtractor definitely has the most code behind it...
+# TLDR
+Just use either ArticleExtractor, DefaultExtractor or KeepEverythingExtractor - try out the others when you feel like experimenting...
 Presently the following Extractors are implemented
 * [x] ArticleExtractor
+* [ ] ArticleSentenceExtractor
+* [x] CanolaExtractor
 * [x] DefaultExtractor
-* [ ] LargestContentExtractor
-* [ ] KeepEverythingExtractor
+* [x] KeepEverythingExtractor
+* [ ] KeepEverythingWithMinKWordsExtractor
+* [x] LargestContentExtractor
+* [x] NumWordsRulesExtractor
 [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
@@ -44,10 +50,18 @@ Or install it yourself as:
     > require 'open-uri'
       => true
     > content = open('https://blog.carbonfive.com/2017/08/28/always-squash-and-rebase-your-git-commits/').read; true;
-    > output = Boilerpipe::Extractors::ArticleExtractor.text(content).slice(0..40)
+    > Boilerpipe::Extractors::ArticleExtractor.text(content).slice(0..40)
      => "Always Squash and Rebase your Git Commits"
-    > output = Boilerpipe::Extractors::DefaultExtractor.text(content).slice(0..40)
+    > Boilerpipe::Extractors::DefaultExtractor.text(content).slice(0..40)
      => "Posted on\nWhat is the squash rebase workf"
+    > Boilerpipe::Extractors::LargestContentExtractor.text(content).slice(0, 40)
+     => "git push origin master\nWhy should you ad"
+    > Boilerpipe::Extractors::KeepEverythingExtractor.text(content).slice(0..40)
+     => "Toggle Navigation\nCarbon Five\nAbout\nWork\n"
 ## Development

data/lib/boilerpipe.rb CHANGED

@@ -6,10 +6,15 @@ require 'boilerpipe/document/text_document'
 require 'boilerpipe/document/text_block'
 require 'boilerpipe/extractors/article_extractor'
+require 'boilerpipe/extractors/canola_extractor'
 require 'boilerpipe/extractors/default_extractor'
+require 'boilerpipe/extractors/keep_everything_extractor'
+require 'boilerpipe/extractors/largest_content_extractor'
+require 'boilerpipe/extractors/num_words_rules_extractor'
 require 'boilerpipe/filters/block_proximity_fusion'
 require 'boilerpipe/filters/boilerplate_block_filter'
+require 'boilerpipe/filters/canola_classifier'
 require 'boilerpipe/filters/density_rules_classifier'
 require 'boilerpipe/filters/document_title_match_classifier'
 require 'boilerpipe/filters/expand_title_to_content_filter'
@@ -18,6 +23,7 @@ require 'boilerpipe/filters/ignore_blocks_after_content_filter'
 require 'boilerpipe/filters/keep_largest_block_filter'
 require 'boilerpipe/filters/large_block_same_tag_level_to_content_filter'
 require 'boilerpipe/filters/list_at_end_filter'
+require 'boilerpipe/filters/mark_everything_content_filter'
 require 'boilerpipe/filters/num_words_rules_classifier'
 require 'boilerpipe/filters/simple_block_fusion_processor'
 require 'boilerpipe/filters/terminating_blocks_finder'

data/lib/boilerpipe/extractors/canola_extractor.rb ADDED

@@ -0,0 +1,16 @@
+module Boilerpipe::Extractors
+  class CanolaExtractor
+    def self.text(contents)
+      doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
+      ::Boilerpipe::Extractors::CanolaExtractor.process doc
+      doc.content
+    end
+    def self.process(doc)
+      ::Boilerpipe::Filters::CanolaClassifier.process doc
+      doc
+    end
+  end
+end

data/lib/boilerpipe/extractors/keep_everything_extractor.rb ADDED

@@ -0,0 +1,16 @@
+ # Marks all blocks as content.
+module Boilerpipe::Extractors
+  class KeepEverythingExtractor
+    def self.text(contents)
+      doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
+      ::Boilerpipe::Extractors::KeepEverythingExtractor.process doc
+      doc.content
+    end
+    def self.process(doc)
+      ::Boilerpipe::Filters::MarkEverythingContentFilter.process doc
+      doc
+    end
+  end
+end

data/lib/boilerpipe/extractors/largest_content_extractor.rb ADDED

@@ -0,0 +1,18 @@
+module Boilerpipe::Extractors
+  class LargestContentExtractor
+    def self.text(contents)
+      doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
+      ::Boilerpipe::Extractors::LargestContentExtractor.process doc
+      doc.content
+    end
+    def self.process(doc)
+      filters = ::Boilerpipe::Filters
+      filters::NumWordsRulesClassifier.process doc
+      filters::BlockProximityFusion::MAX_DISTANCE_1.process doc
+      filters::KeepLargestBlockFilter::INSTANCE.process doc
+      doc
+    end
+  end
+end

data/lib/boilerpipe/extractors/num_words_rules_extractor.rb ADDED

@@ -0,0 +1,15 @@
+module Boilerpipe::Extractors
+  class NumWordsRulesExtractor
+    def self.text(contents)
+      doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
+      ::Boilerpipe::Extractors::NumWordsRulesExtractor.process doc
+      doc.content
+    end
+    def self.process(doc)
+      ::Boilerpipe::Filters::NumWordsRulesClassifier.process doc
+      doc
+    end
+  end
+end

data/lib/boilerpipe/filters/canola_classifier.rb ADDED

@@ -0,0 +1,28 @@
+ # A full-text extractor trained on http://krdwrd.org/
+ # https://krdwrd.org/trac/attachment/wiki/Corpora/Canola/CANOLA.pdf
+ # Works well with SimpleEstimator, too.
+module Boilerpipe::Filters
+  class CanolaClassifier
+    def self.process(doc)
+      return doc if doc.text_blocks.size < 1
+      empty = Boilerpipe::Document::TextBlock.empty_start
+      text_blocks = [empty] + doc.text_blocks + [empty]
+      text_blocks.each_cons(3) do |slice|
+        prev, current, nxt = *slice
+        current.content = classify(prev, current, nxt)
+      end
+      doc
+    end
+    def self.classify(prev, current, nxt)
+      current.link_density > 0 && nxt.num_words > 11 \
+        || current.num_words > 19 \
+        || nxt.num_words > 6 && nxt.link_density == 0 && prev.link_density == 0 && ( current.num_words > 6 || prev.num_words > 7 || nxt.num_words > 19 )
+    end
+  end
+end

data/lib/boilerpipe/filters/mark_everything_content_filter.rb ADDED

@@ -0,0 +1,14 @@
+ # Marks all blocks as content.
+module Boilerpipe::Filters
+  class MarkEverythingContentFilter
+    def self.process(doc)
+      doc.text_blocks.each do |tb|
+        tb.content = true if tb.is_not_content?
+      end
+      doc
+    end
+  end
+end

data/lib/boilerpipe/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Boilerpipe
-  VERSION = '0.2.0'
+  VERSION = '0.3.0'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: boilerpipe-ruby
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.3.0
 platform: ruby
 authors:
 - Gregory Ostermayr
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-09-11 00:00:00.000000000 Z
+date: 2017-09-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -103,9 +103,14 @@ files:
 - lib/boilerpipe/document/text_document.rb
 - lib/boilerpipe/errors.rb
 - lib/boilerpipe/extractors/article_extractor.rb
+- lib/boilerpipe/extractors/canola_extractor.rb
 - lib/boilerpipe/extractors/default_extractor.rb
+- lib/boilerpipe/extractors/keep_everything_extractor.rb
+- lib/boilerpipe/extractors/largest_content_extractor.rb
+- lib/boilerpipe/extractors/num_words_rules_extractor.rb
 - lib/boilerpipe/filters/block_proximity_fusion.rb
 - lib/boilerpipe/filters/boilerplate_block_filter.rb
+- lib/boilerpipe/filters/canola_classifier.rb
 - lib/boilerpipe/filters/density_rules_classifier.rb
 - lib/boilerpipe/filters/document_title_match_classifier.rb
 - lib/boilerpipe/filters/expand_title_to_content_filter.rb
@@ -114,6 +119,7 @@ files:
 - lib/boilerpipe/filters/keep_largest_block_filter.rb
 - lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
 - lib/boilerpipe/filters/list_at_end_filter.rb
+- lib/boilerpipe/filters/mark_everything_content_filter.rb
 - lib/boilerpipe/filters/num_words_rules_classifier.rb
 - lib/boilerpipe/filters/simple_block_fusion_processor.rb
 - lib/boilerpipe/filters/terminating_blocks_finder.rb