RubyGems - boilerpipe-ruby - Versions diffs - 0.0.1 → 0.1.0 - Mend

boilerpipe-ruby 0.0.1 → 0.1.0

Files changed (41) hide show

checksums.yaml +4 -4
data/.gitignore +1 -0
data/README.md +27 -6
data/Rakefile +8 -0
data/boilerpipe-ruby.gemspec +10 -9
data/lib/boilerpipe.rb +30 -0
data/lib/boilerpipe/document/text_block.rb +113 -0
data/lib/boilerpipe/document/text_document.rb +44 -0
data/lib/boilerpipe/errors.rb +1 -0
data/lib/boilerpipe/extractors/article_extractor.rb +52 -0
data/lib/boilerpipe/filters/block_proximity_fusion.rb +63 -0
data/lib/boilerpipe/filters/boilerplate_block_filter.rb +26 -0
data/lib/boilerpipe/filters/document_title_match_classifier.rb +121 -0
data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +43 -0
data/lib/boilerpipe/filters/heuristic_filter_base.rb +7 -0
data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +24 -0
data/lib/boilerpipe/filters/keep_largest_block_filter.rb +62 -0
data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +29 -0
data/lib/boilerpipe/filters/list_at_end_filter.rb +25 -0
data/lib/boilerpipe/filters/num_words_rules_classifier.rb +42 -0
data/lib/boilerpipe/filters/terminating_blocks_finder.rb +44 -0
data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +24 -0
data/lib/boilerpipe/labels/default.rb +17 -0
data/lib/boilerpipe/labels/label_action.rb +17 -0
data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +24 -0
data/lib/boilerpipe/sax/html_content_handler.rb +275 -0
data/lib/boilerpipe/sax/tag_action_map.rb +51 -0
data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +49 -0
data/lib/boilerpipe/sax/tag_actions/block_level.rb +17 -0
data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +22 -0
data/lib/boilerpipe/sax/tag_actions/body.rb +21 -0
data/lib/boilerpipe/sax/tag_actions/chained.rb +20 -0
data/lib/boilerpipe/sax/tag_actions/font.rb +40 -0
data/lib/boilerpipe/sax/tag_actions/ignorable_element.rb +18 -0
data/lib/boilerpipe/sax/tag_actions/inline_no_whitespace.rb +16 -0
data/lib/boilerpipe/sax/tag_actions/inline_tag_label.rb +24 -0
data/lib/boilerpipe/sax/tag_actions/inline_whitespace.rb +18 -0
data/lib/boilerpipe/util/unicode_tokenizer.rb +2 -2
data/lib/boilerpipe/version.rb +1 -1
data/stuff.txt +4 -0
metadata +61 -15

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: a3ddeb5197a36e243816c8a558826aa25a4127c8
-  data.tar.gz: 3ecf3ded094f75276c5308e48988fd682551c71c
+  metadata.gz: 5aed62a42276e97a3e40126609fc51bf624b091f
+  data.tar.gz: 357afb75e661083011013b0ebdca88103e4a188e
 SHA512:
-  metadata.gz: b3de289e4282a42c4acfa4dd2cc3b7998742c3432789d20404bdd34efc706bde7c205cbf83420b5fe9342e7e4a0cab46beec68a0f62f13c9e7a78f0964011273
-  data.tar.gz: a7617f372ee30014fb614e467d254600b760bdbd7654cbc151b1afd0ff34913ba3198cfa8c7da40a70bf21ab118908ecb5ac90cd493e73b649d7b5c6bd6e0277
+  metadata.gz: a64ef6c16a1f1aa5dc44f60c0c94f0ee8cd02876a549e6433b19db617207e64c258534f6b708bece4c0749c7fee50ec15f114c64cb8ee993b154ede4764c2e2b
+  data.tar.gz: d9d52e256767a6553a7d19fe6ad8711610cc3fa7beea5bef7fdc86c28d25a9f5ab00e3486aff7942251376907322f11f643c879d96a136b3e9a969fca8c23a25

data/.gitignore CHANGED Viewed

@@ -7,3 +7,4 @@
 /pkg/
 /spec/reports/
 /tmp/
+spec/sanity_checks/jars

data/README.md CHANGED Viewed

@@ -1,15 +1,29 @@
-# Boilerpipe::Ruby
+# Boilerpipe
-Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/boilerpipe/ruby`. To experiment with that code, run `bin/console` for an interactive prompt.
+A pure Ruby implementation of the boilerpipe algorithm.
-A pure ruby implemenation of the boilerpipe algorithm - in progress ......
+This is a text extraction utility first written by Christian Kohlschütter - [presentation](http://videolectures.net/wsdm2010_kohlschutter_bdu/)
+I went directly to the original author's github https://github.com/kohlschutter/boilerpipe and forked that code base here https://github.com/gregors/boilerpipe.
+I saw other gems making use of boilerpipe via the [free API](http://boilerpipe-web.appspot.com), but depending on the time of day the API goes down due to exceeding the hosting plan. I also checked out some gems making use of JRuby, but I had all kinds of dependency and bug issues. So I made some tweaks on my fork and created a new [jruby-boilerpipe gem](https://rubygems.org/gems/jruby-boilerpipe).
+This solution works great if you're using JRuby, but I wanted a pure Ruby solution to use on MRI. Open vim - start coding...
+I've only got the ArticleExtractor working, but the others should follow quickly, as the ArticleExtractor definitely has the most code behind it...
+Presently the following extractors are implemented:
+* [x] ArticleExtractor
+* [ ] DefaultExtractor
+* [ ] LargestContentExtractor
+* [ ] KeepEverythingExtractor
 ## Installation
 Add this line to your application's Gemfile:
 ```ruby
-gem 'boilerpipe-ruby'
+gem 'boilerpipe-ruby', require: 'boilerpipe'
 ```
 And then execute:
@@ -22,7 +36,14 @@ Or install it yourself as:
 ## Usage
-TODO: Write usage instructions here
+    gregors$ irb
+    > require 'boilerpipe'
+     => true
+    > require 'open-uri'
+      => true
+    > content = open('https://blog.carbonfive.com/2017/08/28/always-squash-and-rebase-your-git-commits/').read; true;
+    > output = Boilerpipe::Extractors::ArticleExtractor.text(content).slice(0..40)
+     => "Always Squash and Rebase your Git Commits"
 ## Development
@@ -32,5 +53,5 @@ To install this gem onto your local machine, run `bundle exec rake install`. To
 ## Contributing
-Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/boilerpipe-ruby.
+Bug reports and pull requests are welcome on GitHub at https://github.com/gregors/boilerpipe-ruby.

data/Rakefile CHANGED Viewed

@@ -4,3 +4,11 @@ require "rspec/core/rake_task"
 RSpec::Core::RakeTask.new(:spec)
 task :default => :spec
+desc 'Downloads forked boilerpipe jar from Gregors github for sanity checks'
+task :download_boilerpipe_jar do
+  jar_dir = 'spec/sanity_checks/jars/'
+  jar_url = 'https://github.com/gregors/jruby-boilerpipe/raw/master/lib/boilerpipe-common-2.0-SNAPSHOT-jar-with-dependencies.jar'
+  FileUtils.mkdir_p jar_dir
+  # Block form of chdir restores the previous working directory afterwards,
+  # so tasks that run later in the same rake process are not affected.
+  Dir.chdir(jar_dir) do
+    # Unlike backticks, `sh` fails the task when wget exits non-zero; passing
+    # the URL as a separate argument keeps it away from shell interpretation.
+    sh 'wget', jar_url
+  end
+end

data/boilerpipe-ruby.gemspec CHANGED Viewed

@@ -4,21 +4,22 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
 require 'boilerpipe/version'
 Gem::Specification.new do |spec|
-  spec.name          = "boilerpipe-ruby"
+  spec.name          = 'boilerpipe-ruby'
   spec.version       = Boilerpipe::VERSION
-  spec.authors       = ["Gregory Ostermayr"]
-  spec.email         = ["<gregory.ostermayr@gmail.com>"]
+  spec.authors       = ['Gregory Ostermayr']
+  spec.email         = ['<gregory.ostermayr@gmail.com>']
-  spec.summary       = %q{A pure ruby implemenation of the boilerpipe algorithm - in progress}
-  spec.description   = %q{A pure ruby implementation of the boilerpipe algorith - in progress}
-  spec.homepage      = "https://github.com/gregors/boilerpipe-ruby"
+  spec.summary       = %q{A pure ruby implemenation of the boilerpipe algorithm}
+  spec.description   = %q{A pure ruby implementation of the boilerpipe algorithm}
+  spec.homepage      = 'https://github.com/gregors/boilerpipe-ruby'
   spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
   spec.bindir        = "exe"
   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
   spec.require_paths = ["lib"]
-  spec.add_development_dependency "bundler", "~> 1.11"
-  spec.add_development_dependency "rake", "~> 10.0"
-  spec.add_development_dependency "rspec", "~> 3.0"
+  spec.add_development_dependency 'bundler', '~> 1.11'
+  spec.add_development_dependency 'rake', '~> 10.0'
+  spec.add_development_dependency 'rspec', '~> 3.0'
+  spec.add_runtime_dependency 'nokogiri', '1.6.6.2'
 end

data/lib/boilerpipe.rb CHANGED Viewed

@@ -1,2 +1,32 @@
 require 'boilerpipe/version'
 require 'boilerpipe/util/unicode_tokenizer'
+require 'boilerpipe/document/text_document'
+require 'boilerpipe/document/text_block'
+require 'boilerpipe/extractors/article_extractor'
+require 'boilerpipe/filters/block_proximity_fusion'
+require 'boilerpipe/filters/boilerplate_block_filter'
+require 'boilerpipe/filters/document_title_match_classifier'
+require 'boilerpipe/filters/expand_title_to_content_filter'
+require 'boilerpipe/filters/heuristic_filter_base'
+require 'boilerpipe/filters/ignore_blocks_after_content_filter'
+require 'boilerpipe/filters/keep_largest_block_filter'
+require 'boilerpipe/filters/large_block_same_tag_level_to_content_filter'
+require 'boilerpipe/filters/list_at_end_filter'
+require 'boilerpipe/filters/num_words_rules_classifier'
+require 'boilerpipe/filters/terminating_blocks_finder'
+require 'boilerpipe/filters/trailing_headline_to_boilerplate_filter'
+require 'boilerpipe/labels/default'
+require 'boilerpipe/labels/label_action'
+require 'boilerpipe/sax/html_content_handler'
+require 'boilerpipe/sax/boilerpipe_html_parser'
+require 'boilerpipe/sax/tag_action_map'
+require 'boilerpipe/sax/tag_actions/chained'
+require 'boilerpipe/sax/tag_actions/ignorable_element'
+require 'boilerpipe/sax/tag_actions/anchor_text'
+require 'boilerpipe/sax/tag_actions/body'
+require 'boilerpipe/sax/tag_actions/inline_whitespace'
+require 'boilerpipe/sax/tag_actions/inline_no_whitespace'
+require 'boilerpipe/sax/tag_actions/block_level'
+require 'boilerpipe/sax/tag_actions/font'
+require 'boilerpipe/sax/tag_actions/inline_tag_label'
+require 'boilerpipe/sax/tag_actions/block_tag_label'

data/lib/boilerpipe/document/text_block.rb ADDED Viewed

@@ -0,0 +1,113 @@
+require 'set'
+module Boilerpipe
+  module Document
+    # A block of text together with its word/line/link counters, block offsets,
+    # tag level, content flag and a set of labels.
+    class TextBlock
+      #EMPTY_END = TextBlock.new('', 0, 0, 0, 0, 999999999999999999999999999)
+      attr_reader :text, :num_words, :num_words_in_wrapped_lines, :num_words_in_anchor_text,
+                  :num_wrapped_lines, :offset_blocks_start, :offset_blocks_end, :text_density,
+                  :link_density, :labels, :tag_level, :num_full_text_words
+      # Content flag; starts out false.
+      attr_accessor :content
+      # text                       - the block's text
+      # num_words                  - number of words in the block
+      # num_words_in_anchor_text   - number of those words that are anchor text
+      # num_words_in_wrapped_lines - words in wrapped lines (0 means "use num_words")
+      # num_wrapped_lines          - number of wrapped lines
+      # offset_blocks              - used as both the start and the end block offset
+      def initialize(text, num_words=0, num_words_in_anchor_text=0, num_words_in_wrapped_lines=0, num_wrapped_lines=0, offset_blocks=0)
+        @labels = Set.new
+        @text = text
+        @num_words = num_words
+        @num_words_in_anchor_text = num_words_in_anchor_text
+        @num_words_in_wrapped_lines = num_words_in_wrapped_lines
+        @num_wrapped_lines = num_wrapped_lines
+        @num_full_text_words = 0
+        @offset_blocks_start = offset_blocks
+        @offset_blocks_end = offset_blocks
+        @content = false
+        @tag_level = 0
+        init_densities
+      end
+      # An empty block whose block offset is -1.
+      def self.empty_start
+        new('', 0, 0, 0, 0, -1)
+      end
+      def set_tag_level(level)
+        @tag_level = level
+      end
+      def is_content?
+        @content
+      end
+      def is_not_content?
+        !is_content?
+      end
+      def add_label(label)
+        @labels << label
+      end
+      def add_labels(labels)
+        labels.each { |label| add_label(label) }
+      end
+      def has_label?(label)
+        @labels.include?(label)
+      end
+      def remove_label(label)
+        @labels.delete(label)
+      end
+      # Absorbs +other+ into this block: texts are joined with a newline,
+      # counters are summed, the offset range is widened to cover both blocks,
+      # labels are unioned, the content flag is OR-ed and the lower tag level wins.
+      def merge_next(other)
+        @text = "#{@text}\n#{other.text}"
+        @num_words += other.num_words
+        @num_words_in_anchor_text += other.num_words_in_anchor_text
+        @num_words_in_wrapped_lines += other.num_words_in_wrapped_lines
+        @num_wrapped_lines += other.num_wrapped_lines
+        @offset_blocks_start = [@offset_blocks_start, other.offset_blocks_start].min
+        @offset_blocks_end = [@offset_blocks_end, other.offset_blocks_end].max
+        init_densities
+        @content |= other.is_content?
+        @num_full_text_words += other.num_full_text_words
+        # @labels is always a Set (created in initialize and only mutated since),
+        # and Set#merge does not modify its argument, so no nil check or copy is needed.
+        @labels.merge(other.labels) if other.labels
+        @tag_level = [@tag_level, other.tag_level].min
+      end
+      # One-line summary (offsets, tag level, word/line counts, link density,
+      # CONTENT/BOILERPLATE, labels) followed by the block's text.
+      def to_s
+        labels = @labels.empty? ? 'null' : "[#{@labels.to_a.join(',')}]"
+        "[#{@offset_blocks_start}-#{@offset_blocks_end};tl=#{@tag_level}; nw=#{@num_words};nwl=#{@num_wrapped_lines};ld=#{@link_density}]\t#{is_content? ? 'CONTENT' : 'BOILERPLATE'},#{labels}\n#{text}"
+      end
+      # Copying is not supported.
+      def clone
+        # `raise`, not `throw`: throw performs a non-local jump to a matching
+        # `catch` and, with no catch in place, reports an uncaught throw
+        # instead of raising the named exception.
+        raise NotImplementedError, 'TextBlock#clone is not implemented'
+      end
+      private
+      # Recomputes text_density (words per wrapped line) and link_density
+      # (anchor words per word, 0.0 for an empty block).
+      def init_densities
+        # No wrapped-line data: treat the whole block as a single line.
+        if @num_words_in_wrapped_lines == 0
+          @num_words_in_wrapped_lines = @num_words
+          @num_wrapped_lines = 1
+        end
+        @text_density = @num_words_in_wrapped_lines / @num_wrapped_lines.to_f
+        @link_density = @num_words == 0 ? 0.0 : @num_words_in_anchor_text / @num_words.to_f
+      end
+    end
+  end
+end

data/lib/boilerpipe/document/text_document.rb ADDED Viewed

@@ -0,0 +1,44 @@
+module Boilerpipe
+  module Document
+    # A document title plus the ordered list of text blocks that belong to it.
+    class TextDocument
+      attr_reader :text_blocks
+      attr_accessor :title
+      def initialize(title, text_blocks)
+        @text_blocks = text_blocks
+        @title = title
+      end
+      # Text of the blocks flagged as content only.
+      def content
+        text(true, false)
+      end
+      # Concatenates the text of the selected blocks, each followed by a newline.
+      #
+      # include_content    - include blocks whose is_content? is truthy
+      # include_noncontent - include blocks whose is_content? is falsy
+      def text(include_content, include_noncontent)
+        s = ''
+        @text_blocks.each do |text_block|
+          # Truthiness test rather than `when true / when false`, so a block
+          # whose flag is nil (or any non-boolean) is not silently dropped.
+          wanted = text_block.is_content? ? include_content : include_noncontent
+          next unless wanted
+          s << text_block.text
+          # Double quotes are required here: '\n' in single quotes is a literal
+          # backslash followed by "n", not a line break.
+          s << "\n"
+        end
+        s
+      end
+      # Swaps in a new list of text blocks.
+      def replace_text_blocks!(new_blocks)
+        @text_blocks = new_blocks
+      end
+      # Every block's to_s, joined by newlines.
+      def debug_s
+        @text_blocks.map(&:to_s).join("\n")
+      end
+      alias_method :debug_string, :debug_s
+    end
+  end
+end

data/lib/boilerpipe/errors.rb ADDED Viewed

@@ -0,0 +1 @@
+class BoilerPipeProcessingError < StandardError; end

data/lib/boilerpipe/extractors/article_extractor.rb ADDED Viewed

@@ -0,0 +1,52 @@
+module Boilerpipe::Extractors
+  # Runs a fixed chain of filters over a parsed document and returns `doc.content`.
+  class ArticleExtractor
+    # Parses `contents` with BoilerpipeHTMLParser and passes the result to .process.
+    def self.text(contents)
+      parsed = ::Boilerpipe::SAX::BoilerpipeHTMLParser.parse(contents)
+      ::Boilerpipe::Extractors::ArticleExtractor.process(parsed)
+    end
+    # Applies every filter to `doc`, in the order listed, then returns `doc.content`.
+    def self.process(doc)
+      f = ::Boilerpipe::Filters
+      pipeline = [
+        # marks text blocks as end of text with :INDICATES_END_OF_TEXT
+        f::TerminatingBlocksFinder,
+        # marks text blocks as title
+        f::DocumentTitleMatchClassifier.new(doc.title),
+        # marks text blocks as content / non-content using boilerpipe alg
+        f::NumWordsRulesClassifier,
+        # marks text blocks after INDICATES_END_OF_TEXT non-content
+        f::IgnoreBlocksAfterContentFilter,
+        # marks HEADING text blocks as non-content after existing content
+        f::TrailingHeadlineToBoilerplateFilter,
+        # merge text blocks next to each other
+        f::BlockProximityFusion::MAX_DISTANCE_1,
+        # removes non-content text blocks
+        f::BoilerplateBlockFilter::INSTANCE_KEEP_TITLE,
+        # merge text blocks next to each other if they are the same tag level
+        f::BlockProximityFusion::MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL,
+        # Keeps only the largest text block as content
+        f::KeepLargestBlockFilter::INSTANCE_EXPAND_TO_SAME_TAGLEVEL_MIN_WORDS,
+        # Marks all TextBlocks "content" which are between the headline and the part is already content
+        f::ExpandTitleToContentFilter,
+        # mark text blocks with a lot of text at the same tag level as the largest current content as additional content
+        f::LargeBlockSameTagLevelToContentFilter,
+        # Marks nested list-item blocks after the end of the main content as content.
+        f::ListAtEndFilter
+      ]
+      pipeline.each { |step| step.process(doc) }
+      doc.content
+    end
+  end
+end

data/lib/boilerpipe/filters/block_proximity_fusion.rb ADDED Viewed

@@ -0,0 +1,63 @@
+# Fuses adjacent blocks if their distance (in blocks) does not exceed a certain limit. This
+# probably makes sense only in cases where an upstream filter already has removed some blocks.
+module Boilerpipe::Filters
+  class BlockProximityFusion
+    # max_blocks_distance - largest gap (in blocks) across which two blocks are still fused
+    # content_only        - only fuse when both blocks are flagged as content; the scan
+    #                       then starts at the first content block
+    # same_tag_level_only - only fuse blocks that have the same tag level
+    def initialize(max_blocks_distance, content_only, same_tag_level_only)
+      @max_blocks_distance = max_blocks_distance
+      @content_only = content_only
+      @same_tag_level_only = same_tag_level_only
+    end
+    MAX_DISTANCE_1 = BlockProximityFusion.new(1, false, false)
+    MAX_DISTANCE_1_SAME_TAGLEVEL = BlockProximityFusion.new( 1, false, true)
+    MAX_DISTANCE_1_CONTENT_ONLY = BlockProximityFusion.new( 1, true, false)
+    MAX_DISTANCE_1_CONTENT_ONLY_SAME_TAGLEVEL = BlockProximityFusion.new(1, true, true)
+    # Merges each content block into the block before it when the two are close
+    # enough (and pass the content / tag-level restrictions), then drops the
+    # merged-away blocks from the document.
+    #
+    # Returns false when there is nothing to do (fewer than two blocks, or no
+    # content block in content-only mode); otherwise returns doc.
+    def process(doc)
+      text_blocks = doc.text_blocks
+      return false if text_blocks.size < 2
+      start_index = @content_only ? text_blocks.index(&:is_content?) : 0
+      return false if start_index.nil?
+      prev_block = text_blocks[start_index]
+      blocks_to_remove = []
+      text_blocks.drop(start_index + 1).each do |tb|
+        if tb.is_content? && fusable?(prev_block, tb)
+          prev_block.merge_next(tb)
+          blocks_to_remove << tb
+        else
+          # Always advance, including when the gap was too large. Otherwise
+          # every later block is measured against a stale predecessor and
+          # neighbouring content blocks after a gap are never fused.
+          prev_block = tb
+        end
+      end
+      doc.replace_text_blocks!(text_blocks - blocks_to_remove)
+      doc
+    end
+    private
+    # True when block may be merged into prev_block under this filter's settings.
+    def fusable?(prev_block, block)
+      gap = block.offset_blocks_start - prev_block.offset_blocks_end - 1
+      return false if gap > @max_blocks_distance
+      return false if @content_only && (prev_block.is_not_content? || block.is_not_content?)
+      return false if @same_tag_level_only && prev_block.tag_level != block.tag_level
+      true
+    end
+  end
+end

data/lib/boilerpipe/filters/boilerplate_block_filter.rb ADDED Viewed

@@ -0,0 +1,26 @@
+# Removes TextBlocks which have explicitly been marked as "not content".
+module Boilerpipe::Filters
+  class BoilerplateBlockFilter
+    # label - non-content blocks carrying this label are kept; pass nil to
+    #         remove every non-content block
+    def initialize(label)
+      @label_to_keep = label
+    end
+    INSTANCE_KEEP_TITLE = BoilerplateBlockFilter.new(:TITLE)
+    # Replaces the document's blocks with the ones that survive the filter and
+    # returns doc.
+    def process(doc)
+      # reject builds a new array instead of deleting from the document's own
+      # array in place.
+      kept = doc.text_blocks.reject { |tb| removable?(tb) }
+      doc.replace_text_blocks!(kept)
+      doc
+    end
+    private
+    # A block is removed when it is not content and does not carry the label
+    # this instance was configured to keep (previously :TITLE was hard-coded,
+    # so any other label passed to the constructor was ignored).
+    def removable?(tb)
+      tb.is_not_content? && (@label_to_keep.nil? || !tb.has_label?(@label_to_keep))
+    end
+  end
+end