RubyGems - boilerpipe-ruby - Versions diffs - 0.3.0 → 0.4.0 - Mend

boilerpipe-ruby 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/README.md +4 -2
data/lib/boilerpipe.rb +5 -0
data/lib/boilerpipe/extractors/article_sentence_extractor.rb +17 -0
data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb +21 -0
data/lib/boilerpipe/filters/min_clause_words_filter.rb +37 -0
data/lib/boilerpipe/filters/min_words_filter.rb +16 -0
data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb +40 -0
data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +2 -2
data/lib/boilerpipe/version.rb +1 -1
metadata +7 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 64511e224c1ec186c5c7cde62568dd0ba8cf1005
-  data.tar.gz: 455a8bac8eaadda62706d8f507854c6f6ccb6dba
+  metadata.gz: 68c2ea4ee42a6e1d76e85f7eaa3de9ca95f3a8d3
+  data.tar.gz: 42199704467cb7f20a8fff7c616be67bae1966e1
 SHA512:
-  metadata.gz: 8f2f769627e15ca0b8293122143304ef397ed789567cff61ceb48f45a657336442711c17d29d25fa8cfdf0dbcfd557030063e7b2a132550921ce601e2573e71f
-  data.tar.gz: 630f46c4d3a6e71933be0ba5f3bc98dc97fd34d10fb05d9e656e8f5f2f20fb4c28197122ef349398c79de7e57c795721307a5dadc7b4a796f907b2904d4816d0
+  metadata.gz: 875da6a2ddfdf517509e3ebb7b9c804fadf4a372b812df42bce92fccb82eb8ca31283f11e764c6796c15997bcb8fddf24301d821f4e24a962f40ba6f973f6f17
+  data.tar.gz: 5dfec7323587057c64b931725df2aa5b53dc5a2fadaef52374619fe5798826c98c4e7ae8bb1ea9267b5db9f40249d5822752700e57a145b058231088249d1ac6

data/CHANGELOG.md CHANGED

@@ -1,3 +1,8 @@
+# 0.4.0 / 2017-09-15
+* Add KeepEverythingWithMinKWords Extractor
+* Add ArticleSentence Extractor
 # 0.3.0 / 2017-09-12
 * Add LargestContent Extractor

data/README.md CHANGED

@@ -16,16 +16,18 @@ Just use either ArticleExtractor, DefaultExtractor or KeepEverythingExtractor -
 Presently the follow Extractors are implemented
 * [x] ArticleExtractor
-* [ ] ArticleSentenceExtractor
+* [x] ArticleSentenceExtractor
 * [x] CanolaExtractor
 * [x] DefaultExtractor
 * [x] KeepEverythingExtractor
-* [ ] KeepEverythingWithMinKWordsExtractor
+* [x] KeepEverythingWithMinKWordsExtractor
 * [x] LargestContentExtractor
 * [x] NumWordsRulesExtractor
 [![CircleCI](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master.svg?style=shield)](https://circleci.com/gh/gregors/boilerpipe-ruby/tree/master)
+[![Gem Version](https://badge.fury.io/rb/boilerpipe-ruby.svg)](https://badge.fury.io/rb/boilerpipe-ruby)
 ## Installation
 Add this line to your application's Gemfile:

data/lib/boilerpipe.rb CHANGED

@@ -6,9 +6,11 @@ require 'boilerpipe/document/text_document'
 require 'boilerpipe/document/text_block'
 require 'boilerpipe/extractors/article_extractor'
+require 'boilerpipe/extractors/article_sentence_extractor'
 require 'boilerpipe/extractors/canola_extractor'
 require 'boilerpipe/extractors/default_extractor'
 require 'boilerpipe/extractors/keep_everything_extractor'
+require 'boilerpipe/extractors/keep_everything_with_k_min_words_extractor'
 require 'boilerpipe/extractors/largest_content_extractor'
 require 'boilerpipe/extractors/num_words_rules_extractor'
@@ -24,8 +26,11 @@ require 'boilerpipe/filters/keep_largest_block_filter'
 require 'boilerpipe/filters/large_block_same_tag_level_to_content_filter'
 require 'boilerpipe/filters/list_at_end_filter'
 require 'boilerpipe/filters/mark_everything_content_filter'
+require 'boilerpipe/filters/min_clause_words_filter'
+require 'boilerpipe/filters/min_words_filter'
 require 'boilerpipe/filters/num_words_rules_classifier'
 require 'boilerpipe/filters/simple_block_fusion_processor'
+require 'boilerpipe/filters/split_paragraph_blocks_filter'
 require 'boilerpipe/filters/terminating_blocks_finder'
 require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'

data/lib/boilerpipe/extractors/article_sentence_extractor.rb ADDED

@@ -0,0 +1,17 @@
+# A full-text extractor which is tuned towards extracting sentences from news articles.
+module Boilerpipe::Extractors
+  class ArticleSentenceExtractor
+    def self.text(contents)
+      doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
+      ::Boilerpipe::Extractors::ArticleSentenceExtractor.process(doc)
+      doc.content
+    end
+    def self.process(doc)
+      ::Boilerpipe::Extractors::ArticleExtractor.process doc
+      ::Boilerpipe::Filters::SplitParagraphBlocksFilter.process doc
+      ::Boilerpipe::Filters::MinClauseWordsFilter.process doc
+    end
+  end
+end

data/lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb ADDED

@@ -0,0 +1,21 @@
+# A full-text extractor which extracts the largest text component of a page.
+# For news articles, it may perform better than the DefaultExtractor, but
+# usually worse than ArticleExtractor.
+module Boilerpipe::Extractors
+  class KeepEverythingWithKMinWordsExtractor
+    def self.text(min, contents)
+      doc = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
+      ::Boilerpipe::Extractors::KeepEverythingWithKMinWordsExtractor.process min, doc
+      doc.content
+    end
+    def self.process(min, doc)
+      ::Boilerpipe::Filters::SimpleBlockFusionProcessor.process doc
+      ::Boilerpipe::Filters::MarkEverythingContentFilter.process doc
+      ::Boilerpipe::Filters::MinWordsFilter.process min, doc
+      doc
+    end
+  end
+end

data/lib/boilerpipe/filters/min_clause_words_filter.rb ADDED

@@ -0,0 +1,37 @@
+#
+# Keeps only blocks that have at least one segment fragment ("clause") with at least k
+# words (default: 5).
+#
+# NOTE: You might consider using the SplitParagraphBlocksFilter upstream.
+#
+# SplitParagraphBlocksFilter
+module Boilerpipe::Filters
+  class MinClauseWordsFilter
+    def self.process(doc, min_words=5)
+      doc.text_blocks.each do |tb|
+        next if tb.is_not_content?
+        clause_delimiter = /[\p{L}\d \u00a0]+[\,.:;!?]+(?:[ \n\r]+|$)/
+        tb.text.scan(clause_delimiter).each do |possible_clause|
+          if is_clause? possible_clause
+            break
+          else
+            tb.content = false
+          end
+        end
+      end
+      doc
+    end
+    def self.is_clause?(text, min_words=5)
+     return false if text.nil?
+      whitespace = /[ \n\r]+/
+      text.scan(whitespace).size >= min_words
+    end
+  end
+end

data/lib/boilerpipe/filters/min_words_filter.rb ADDED

@@ -0,0 +1,16 @@
+# Keeps only those content blocks which contain at least k words.
+module Boilerpipe::Filters
+  class MinWordsFilter
+    def self.process(min_words, doc)
+      doc.text_blocks.each do |tb|
+        next if tb.is_not_content?
+        tb.content = false if tb.num_words < min_words
+      end
+      doc
+    end
+  end
+end

data/lib/boilerpipe/filters/split_paragraph_blocks_filter.rb ADDED

@@ -0,0 +1,40 @@
+# Splits TextBlocks at paragraph boundaries.
+#
+# NOTE: This is not fully supported (i.e., it will break highlighting support via
+# #getContainedTextElements()), but this one probably is necessary for some other filters.
+#
+# see MinClauseWordsFilter
+module Boilerpipe::Filters
+  class SplitParagraphBlocksFilter
+    def self.process(doc)
+      tbs = doc.text_blocks
+      new_blocks = []
+      changes = false
+      tbs.each do |tb|
+        paragraphs = tb.text.split(/[\n\r]+/)
+        if paragraphs.size < 2
+          new_blocks << tb
+          next
+        end
+        is_content = tb.is_content?
+        labels = tb.labels
+        paragraphs.each do |paragraph|
+          tbP = ::Boilerpipe::Document::TextBlock.new(paragraph)
+          tbP.content = is_content
+          tbP.add_labels(labels)
+          new_blocks << tbP
+          changes = true
+        end
+      end
+      doc.replace_text_blocks!(new_blocks) if changes
+      doc
+    end
+  end
+end

data/lib/boilerpipe/sax/boilerpipe_html_parser.rb CHANGED

@@ -4,11 +4,11 @@ module Boilerpipe::SAX
     def self.parse(text)
       #script bug - delete script tags
-      text  = text.gsub(/\<script>.+?<\/script>/i, '')
+      text.gsub!(/\<script>.+?<\/script>/i, '')
       # nokogiri uses libxml for mri and nekohtml for jruby
       # mri doesn't remove &nbsp; when missing the semicolon
-      text = text.gsub(/(&nbsp) /, '\1; ')
+      text.gsub!(/(&nbsp) /, '\1; ')
       # use nokogiri to fix any bad tags, errors - keep experimenting with this

data/lib/boilerpipe/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Boilerpipe
-  VERSION = '0.3.0'
+  VERSION = '0.4.0'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: boilerpipe-ruby
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.4.0
 platform: ruby
 authors:
 - Gregory Ostermayr
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-09-12 00:00:00.000000000 Z
+date: 2017-09-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -103,9 +103,11 @@ files:
 - lib/boilerpipe/document/text_document.rb
 - lib/boilerpipe/errors.rb
 - lib/boilerpipe/extractors/article_extractor.rb
+- lib/boilerpipe/extractors/article_sentence_extractor.rb
 - lib/boilerpipe/extractors/canola_extractor.rb
 - lib/boilerpipe/extractors/default_extractor.rb
 - lib/boilerpipe/extractors/keep_everything_extractor.rb
+- lib/boilerpipe/extractors/keep_everything_with_k_min_words_extractor.rb
 - lib/boilerpipe/extractors/largest_content_extractor.rb
 - lib/boilerpipe/extractors/num_words_rules_extractor.rb
 - lib/boilerpipe/filters/block_proximity_fusion.rb
@@ -120,8 +122,11 @@ files:
 - lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb
 - lib/boilerpipe/filters/list_at_end_filter.rb
 - lib/boilerpipe/filters/mark_everything_content_filter.rb
+- lib/boilerpipe/filters/min_clause_words_filter.rb
+- lib/boilerpipe/filters/min_words_filter.rb
 - lib/boilerpipe/filters/num_words_rules_classifier.rb
 - lib/boilerpipe/filters/simple_block_fusion_processor.rb
+- lib/boilerpipe/filters/split_paragraph_blocks_filter.rb
 - lib/boilerpipe/filters/terminating_blocks_finder.rb
 - lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb
 - lib/boilerpipe/labels/default.rb