RubyGems - boilerpipe-ruby - Versions diffs - 0.0.1 → 0.1.0 - Mend

boilerpipe-ruby 0.0.1 → 0.1.0

Files changed (41) hide show

checksums.yaml +4 -4
data/.gitignore +1 -0
data/README.md +27 -6
data/Rakefile +8 -0
data/boilerpipe-ruby.gemspec +10 -9
data/lib/boilerpipe.rb +30 -0
data/lib/boilerpipe/document/text_block.rb +113 -0
data/lib/boilerpipe/document/text_document.rb +44 -0
data/lib/boilerpipe/errors.rb +1 -0
data/lib/boilerpipe/extractors/article_extractor.rb +52 -0
data/lib/boilerpipe/filters/block_proximity_fusion.rb +63 -0
data/lib/boilerpipe/filters/boilerplate_block_filter.rb +26 -0
data/lib/boilerpipe/filters/document_title_match_classifier.rb +121 -0
data/lib/boilerpipe/filters/expand_title_to_content_filter.rb +43 -0
data/lib/boilerpipe/filters/heuristic_filter_base.rb +7 -0
data/lib/boilerpipe/filters/ignore_blocks_after_content_filter.rb +24 -0
data/lib/boilerpipe/filters/keep_largest_block_filter.rb +62 -0
data/lib/boilerpipe/filters/large_block_same_tag_level_to_content_filter.rb +29 -0
data/lib/boilerpipe/filters/list_at_end_filter.rb +25 -0
data/lib/boilerpipe/filters/num_words_rules_classifier.rb +42 -0
data/lib/boilerpipe/filters/terminating_blocks_finder.rb +44 -0
data/lib/boilerpipe/filters/trailing_headline_to_boilerplate_filter.rb +24 -0
data/lib/boilerpipe/labels/default.rb +17 -0
data/lib/boilerpipe/labels/label_action.rb +17 -0
data/lib/boilerpipe/sax/boilerpipe_html_parser.rb +24 -0
data/lib/boilerpipe/sax/html_content_handler.rb +275 -0
data/lib/boilerpipe/sax/tag_action_map.rb +51 -0
data/lib/boilerpipe/sax/tag_actions/anchor_text.rb +49 -0
data/lib/boilerpipe/sax/tag_actions/block_level.rb +17 -0
data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb +22 -0
data/lib/boilerpipe/sax/tag_actions/body.rb +21 -0
data/lib/boilerpipe/sax/tag_actions/chained.rb +20 -0
data/lib/boilerpipe/sax/tag_actions/font.rb +40 -0
data/lib/boilerpipe/sax/tag_actions/ignorable_element.rb +18 -0
data/lib/boilerpipe/sax/tag_actions/inline_no_whitespace.rb +16 -0
data/lib/boilerpipe/sax/tag_actions/inline_tag_label.rb +24 -0
data/lib/boilerpipe/sax/tag_actions/inline_whitespace.rb +18 -0
data/lib/boilerpipe/util/unicode_tokenizer.rb +2 -2
data/lib/boilerpipe/version.rb +1 -1
data/stuff.txt +4 -0
metadata +61 -15

data/lib/boilerpipe/labels/label_action.rb ADDED Viewed

@@ -0,0 +1,17 @@
+module Boilerpipe::Labels
+  class LabelAction
+    attr_reader :labels
+    def initialize(labels=[])
+      @labels = labels
+    end
+    def add_to(text_block)
+      text_block.add_labels(@labels)
+    end
+    def to_s
+      @labels.join(',')
+    end
+  end
+end

data/lib/boilerpipe/sax/boilerpipe_html_parser.rb ADDED Viewed

@@ -0,0 +1,24 @@
+require 'nokogiri'
+module Boilerpipe::SAX
+  class BoilerpipeHTMLParser
+    def self.parse(text)
+      #script bug - delete script tags
+      text  = text.gsub(/\<script>.+?<\/script>/i, '')
+      # nokogiri uses libxml for mri and nekohtml for jruby
+      # mri doesn't remove &nbsp; when missing the semicolon
+      text = text.gsub(/(&nbsp) /, '\1; ')
+      # use nokogiri to fix any bad tags, errors - keep experimenting with this
+      text = Nokogiri::HTML(text).to_html
+      handler = HTMLContentHandler.new
+      noko_parser = Nokogiri::HTML::SAX::Parser.new(handler)
+      noko_parser.parse(text)
+      handler.text_document
+    end
+  end
+end

data/lib/boilerpipe/sax/html_content_handler.rb ADDED Viewed

@@ -0,0 +1,275 @@
+require 'nokogiri'
+require 'set'
+module Boilerpipe::SAX
+  class HTMLContentHandler < Nokogiri::XML::SAX::Document
+    attr_reader :in_ignorable_element, :label_stacks, :last_start_tag
+    attr_accessor :in_anchor_tag, :token_buffer ,:font_size_stack
+    ANCHOR_TEXT_START = "$\ue00a<"
+    ANCHOR_TEXT_END = ">\ue00a$"
+    def initialize
+      @label_stacks = []
+      @tag_actions = ::Boilerpipe::SAX::TagActionMap.tag_actions
+      @tag_level = 0
+      @sb_last_was_whitespace = false
+      @text_buffer = ''
+      @token_buffer = ''
+      @offset_blocks = 0
+      @flush = false
+      @block_tag_level = -1
+      @in_body = 0
+      @in_anchor_tag = 0
+      @in_ignorable_element = 0
+      @in_anchor_text = false
+      @font_size_stack = []
+      @last_start_tag = ''
+      @title
+      @text_blocks = []
+    end
+    def start_element(name, attrs = [])
+      @label_stacks << nil
+      tag = name.upcase.intern
+      tag_action = @tag_actions[tag]
+      if tag_action
+        @tag_level += 1 if tag_action.changes_tag_level?
+        @flush = tag_action.start(self, name, attrs) | @flush
+      else
+        @tag_level += 1
+        @flush = true
+      end
+      @last_event = :START_TAG
+      @last_start_tag = tag
+    end
+    def characters(text)
+      flush_block if @flush
+      return if @in_ignorable_element != 0
+      return if text.empty?
+      # replace all whitespace with simple space
+      text.gsub!(/\s+/, ' ')
+      # trim whitespace
+      started_with_whitespace = text  =~ /^\s/
+      ended_with_whitespace = text  =~ /\s$/
+      text.strip!
+      #  add a single space if the block was only whitespace
+      if text.empty?
+        append_space
+        @last_event = :WHITESPACE
+        return
+      end
+      # set block levels
+      @block_tag_level = @tag_level if @block_tag_level == -1
+      append_space if started_with_whitespace
+      append_text(text)
+      append_space if ended_with_whitespace
+      @last_event = :CHARACTERS
+    end
+    def end_element(name)
+      tag = name.upcase.intern
+      tag_action = @tag_actions[tag]
+      if tag_action
+        @flush = tag_action.end_tag(self, name) | @flush
+      else
+        @flush = true
+      end
+      @tag_level -= 1 if tag_action.nil? || tag_action.changes_tag_level?
+      flush_block if @flush
+      @last_event = :END_TAG
+      @last_end_tag = tag
+      @label_stacks.pop
+    end
+    def flush_block
+      @flush = false
+      if @in_body == 0
+        @title = @token_buffer.strip if :TITLE == @last_start_tag
+        clear_buffers
+        return
+      end
+      # clear out if empty or just a space
+      length = @token_buffer.size
+      case length
+      when 0
+        return
+      when 1
+        clear_buffers if @sb_last_was_whitespace
+        return
+      end
+      num_tokens = 0
+      num_words = 0
+      num_words_current_line = 0
+      num_words_in_wrapped_lines = 0
+      num_wrapped_lines = 0
+      num_linked_words = 0
+      current_line_length = 0
+      max_line_length = 80
+      tokens = ::Boilerpipe::UnicodeTokenizer.tokenize(@token_buffer)
+      tokens.each do |token|
+        if ANCHOR_TEXT_START == token
+          @in_anchor_text = true
+        elsif ANCHOR_TEXT_END == token
+          @in_anchor_text = false
+        elsif is_word?(token)
+          num_tokens += 1
+          num_words += 1
+          num_words_current_line += 1
+          num_linked_words += 1 if @in_anchor_text
+          token_length = token.size
+          current_line_length += token_length + 1
+          if current_line_length > max_line_length
+            num_wrapped_lines += 1
+            current_line_length = token_length
+            num_words_current_line = 1
+          end
+        else
+          num_tokens += 1
+        end
+      end
+      return if num_tokens == 0
+      num_words_in_wrapped_lines = 0
+      if num_wrapped_lines == 0
+        num_words_in_wrapped_lines = num_words
+        num_wrapped_lines = 1
+      else
+        num_words_in_wrapped_lines = num_words - num_words_current_line
+      end
+      text_block = ::Boilerpipe::Document::TextBlock.new(@text_buffer.strip,
+                                 num_words,
+                                 num_linked_words,
+                                 num_words_in_wrapped_lines,
+                                 num_wrapped_lines, @offset_blocks)
+      @offset_blocks += 1
+      clear_buffers
+      text_block.set_tag_level(@block_tag_level)
+      add_text_block(text_block)
+      @block_tag_level = -1
+    end
+    def text_document
+      flush_block
+      ::Boilerpipe::Document::TextDocument.new(@title, @text_blocks)
+    end
+    def token_buffer_size
+      @token_buffer.size
+    end
+    VALID_WORD_CHARACTER = /[\p{L}\p{Nd}\p{Nl}\p{No}]/
+    # unicode regex - categories
+    # \p{L}   -- Letter
+    # \p{Nd}  -- a decimal digit
+    # \p{Nl}  -- a letterlike numeric character
+    # \p{No}  -- a numeric character of other type
+    def is_word?(word)
+       word =~ VALID_WORD_CHARACTER
+    end
+    #public void flushBlock() {
+    #    int numWords = 0;
+    #    int numLinkedWords = 0;
+    #    int numWrappedLines = 0;
+    #    int currentLineLength = -1; // don't count the first space
+    #    final int maxLineLength = 80;
+    #    int numTokens = 0;
+    #    int numWordsCurrentLine = 0;
+    #}
+    def increase_in_ignorable_element!
+      @in_ignorable_element += 1
+    end
+    def decrease_in_ignorable_element!
+      @in_ignorable_element -= 1
+    end
+    def increase_in_body!
+      @in_body += 1
+    end
+    def decrease_in_body!
+      @in_body -= 1
+    end
+    def in_ignorable_element?
+      @in_ignorable_element > 0
+    end
+    def in_anchor_tag?
+      @in_anchor_tag > 0
+    end
+    def add_text_block(text_block)
+      @label_stacks.each do |stack|
+        next unless stack
+        stack.each do |label_action|
+          text_block.add_label(label_action.labels) if label_action
+        end
+      end
+      @text_blocks << text_block
+    end
+    # append space if last character wasn't already one
+    def append_space
+      return if @sb_last_was_whitespace
+      @sb_last_was_whitespace = true
+      @text_buffer << ' '
+      @token_buffer << ' '
+    end
+    def append_text(text)
+      @sb_last_was_whitespace = false
+      @text_buffer << text
+      @token_buffer <<  text
+    end
+    def append_token(token)
+      @token_buffer <<  token
+    end
+    def add_label_action(label_action)
+      label_stack = @label_stacks.last
+      if label_stack.nil?
+        label_stack = []
+        @label_stacks.pop
+        @label_stacks << label_stack
+      end
+      label_stack << label_action
+    end
+    private
+    def clear_buffers
+      @token_buffer = ''
+      @text_buffer = ''
+    end
+  end
+end

data/lib/boilerpipe/sax/tag_action_map.rb ADDED Viewed

@@ -0,0 +1,51 @@
+module Boilerpipe::SAX
+  class TagActionMap
+    def self.tag_actions
+      labels = ::Boilerpipe::Labels
+      {
+        STYLE: TagActions::IgnorableElement.new,
+        SCRIPT: TagActions::IgnorableElement.new,
+        OPTION: TagActions::IgnorableElement.new,
+        OBJECT: TagActions::IgnorableElement.new,
+        EMBED: TagActions::IgnorableElement.new,
+        APPLET: TagActions::IgnorableElement.new,
+        LINK: TagActions::IgnorableElement.new,
+        A: TagActions::AnchorText.new,
+        BODY: TagActions::Body.new,
+        STRIKE: TagActions::InlineNoWhitespace.new,
+        U: TagActions::InlineNoWhitespace.new,
+        B: TagActions::InlineNoWhitespace.new,
+        I: TagActions::InlineNoWhitespace.new,
+        EM: TagActions::InlineNoWhitespace.new,
+        STRONG: TagActions::InlineNoWhitespace.new,
+        SPAN: TagActions::InlineNoWhitespace.new,
+        # New in 1.1 (especially to improve extraction quality from Wikipedia etc.)
+        SUP: TagActions::InlineNoWhitespace.new,
+        # New in 1.2
+        CODE: TagActions::InlineNoWhitespace.new,
+        TT: TagActions::InlineNoWhitespace.new,
+        SUB: TagActions::InlineNoWhitespace.new,
+        VAR: TagActions::InlineNoWhitespace.new,
+        ABBR: TagActions::InlineWhitespace.new,
+        ACRONYM: TagActions::InlineWhitespace.new,
+        FONT: TagActions::InlineNoWhitespace.new,
+        # added in 1.1.1
+        NOSCRIPT: TagActions::IgnorableElement.new,
+        # New in 1.3
+        LI: TagActions::BlockTagLabel.new(labels::LabelAction.new([:LI])),
+        H1: TagActions::BlockTagLabel.new(labels::LabelAction.new([:H1, :HEADING])),
+        H2: TagActions::BlockTagLabel.new(labels::LabelAction.new([:H2, :HEADING])),
+        H3: TagActions::BlockTagLabel.new(labels::LabelAction.new([:H3, :HEADING]))
+      }
+    end
+  end
+end

data/lib/boilerpipe/sax/tag_actions/anchor_text.rb ADDED Viewed

@@ -0,0 +1,49 @@
+module Boilerpipe::SAX::TagActions
+  class AnchorText
+    # Marks this tag as "anchor" (this should usually only be set for the <A> tag). Anchor tags may not be nested.
+    # There is a bug in certain versions of NekoHTML which still allows nested tags. If boilerpipe
+    #* encounters such nestings, a SAXException is thrown.
+    def start(handler, name, attrs)
+      if handler.in_anchor_tag?
+        handler.in_anchor_tag += 1
+        nested_achor_tag_error_recovering(handler, name)
+        return
+      else
+        handler.in_anchor_tag += 1
+      end
+      append_anchor_text_start(handler) unless handler.in_ignorable_element?
+      false
+    end
+    def end_tag(handler, name)
+      handler.in_anchor_tag -= 1
+      append_anchor_text_end(handler) unless handler.in_anchor_tag? || handler.in_ignorable_element?
+      false
+    end
+    def changes_tag_level?
+      true
+    end
+    def append_anchor_text_start(handler)
+      handler.append_space
+      handler.append_token(Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_START)
+      handler.append_token(' ')
+    end
+    def append_anchor_text_end(handler)
+      handler.append_space
+      handler.append_token(Boilerpipe::SAX::HTMLContentHandler::ANCHOR_TEXT_END)
+      handler.append_token(' ')
+    end
+    def nested_achor_tag_error_recovering(handler, name)
+      # - dunno about nokogiri???????
+      # as nested A elements are not allowed per specification, we
+      # are probably reaching this branch due to a bug in the XML parser
+      #puts "Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow..."
+      end_tag(handler, name)
+    end
+  end
+end

data/lib/boilerpipe/sax/tag_actions/block_level.rb ADDED Viewed

@@ -0,0 +1,17 @@
+module Boilerpipe::SAX::TagActions
+     # Explicitly marks this tag a simple "block-level" element,
+     # which always generates whitespace
+  class BlockLevel
+    def start(handler, name, attrs)
+      true
+    end
+    def end_tag(handler, name)
+      true
+    end
+    def changes_tag_level?
+      true
+    end
+  end
+end

data/lib/boilerpipe/sax/tag_actions/block_tag_label.rb ADDED Viewed

@@ -0,0 +1,22 @@
+module Boilerpipe::SAX::TagActions
+# for block-level elements, which triggers some LabelAction on
+# the generated TextBlock.
+  class BlockTagLabel
+    def initialize(label_action)
+      @label_action = label_action
+    end
+    def start(handler, name, attrs)
+      handler.add_label_action(@label_action)
+      true
+    end
+    def end_tag(handler, name)
+      true
+    end
+    def changes_tag_level?
+      true
+    end
+  end
+end