RubyGems - omnivore - Versions diffs - 0.0.4 → 0.1.0 - Mend

omnivore 0.0.4 → 0.1.0

Files changed (8) hide show

data/.gitignore CHANGED Viewed

@@ -1,4 +1,7 @@
+.rvmrc
+.yardoc
 *.gem
 .bundle
 Gemfile.lock
 pkg/*
+doc/*

data/README.md CHANGED Viewed

@@ -1 +1,18 @@
-Nothing to see here, move along.
+## Omnivore: a library for decrufting HTML documents
+Omnivore is a library for extracting "real" content from HTML documents.  Currently, the approach is limited to
+analysing text density to distinguish relevant sections from navigation, advertising, and other non-relevant elements. As
+such, the results are far from perfect but will hopefully improve as more sophisticated features are added.
+### INSTALL
+```
+sudo gem install omnivore
+```
+### EXAMPLE
+```ruby
+require 'omnivore'
+document = Omnivore::Document.from_url('http://www.slashgear.com/sennheiser-hd-700-hands-on-10208572')
+puts document.to_text
+```

data/lib/omnivore/document.rb CHANGED Viewed

@@ -3,17 +3,29 @@ require "omnivore/http_client"
 module Omnivore
+  # A class encapsulating an HTML document.
   class Document
     attr_reader :model
-    BLOCK_TAGS = %w[div p frame bod]
+    # The HTML tags signaling the start of a block or paragraph.
+    BLOCK_TAGS = %w[div p frame]
+    # A Struct describing a paragraph, including its :path in the document, :text,
+    # and various metrics, such as :text_density.
     Paragraph = Struct.new("Paragraph", :path, :text, :text_density)
+    # Creates an Omnivore::Document object from a url.
+    # @param [String] url the document's url
+    # @return [Document] A new Document object.
     def self.from_url(url)
       Document.new(HttpClient.get(url))
     end
+    # Creates an Omnivore::Document object from a string containing HTML.
+    # @param [String] html the HTML content
+    # @return [Document] A new Document object.
     def self.from_html(html)
       Document.new(html)
     end
@@ -26,16 +38,22 @@ module Omnivore
     end
+    # An HTML representation of the document.
+    # @return [String] An HTML representation of the document.
     def to_html
       self.model.to_html
     end
+    # Extracts the document title.
+    # @return [String] The document title.
     def title
       @title ||= self.model.xpath("/html/head/title").text.gsub(/\s+/, " ").strip
     end
+    # Extracts document metadata.
+    # @return [Hash] The metadata tags found in the document.
     def metadata
       @metadata ||= self.model.xpath("//meta").inject({ }) { |memo, el|
         memo[el.attr("name")] = el.attr("content") || "" if el.attr("name")
@@ -44,12 +62,19 @@ module Omnivore
     end
+    # Returns the actual content of the document, without navigation, advertising, etc.
+    # @return [String] The document's main content.
     def to_text
-      paragraphs = self.to_paragraphs.keep_if { |p| p.text_density > 0.5 }
-      paragraphs.map { |p| p.text }.join("\n")
+      self.to_paragraphs.inject([ ]) { |buffer, p|
+        buffer << p.text if p.text_density >= 0.25
+        buffer
+      }.join("\n")
     end
+    # Splits the document into paragraphs, assuming that each <div> or <p> tag represents
+    # a paragraph.
+    # @return [Array] An array of Paragraph objects.
     def to_paragraphs
       self.model.xpath("//div|//p").map { |block|
         html = block.to_html.gsub(/\s+/, " ").strip
@@ -64,15 +89,20 @@ module Omnivore
     private
-    def flatten(block)
+    # A convenience method that recursively iterates over a document node and returns
+    # an array of all of its children, with the exception of other block elements
+    # (e.g. div or p nodes).
+    # @param [Nokogiri::XML::Node] node the root node
+    # @return [Array] The Nokogiri::XML::Node objects contained in the root.
+    def flatten(node)
       elements = [ ]
-      return elements if block.nil?
-      return elements if block.respond_to?('cdata?') and block.cdata?
-      return elements if block.respond_to?('comment?') and block.comment?
-      if block.children.empty?
-        elements << block
+      return elements if node.nil?
+      return elements if node.respond_to?('cdata?') and node.cdata?
+      return elements if node.respond_to?('comment?') and node.comment?
+      if node.children.empty?
+        elements << node
       else
-        block.children.each { |child|
+        node.children.each { |child|
           unless BLOCK_TAGS.include?(child.name)
             elements += flatten(child)
           end

data/lib/omnivore/http_client.rb CHANGED Viewed

@@ -2,16 +2,22 @@ require 'net/http'
 require 'uri'
 module Omnivore
+  # A simple HTTP client with a redirect feature.
+  #
   class HttpClient
-    def self.get(url, attempts=3)
-      raise ArgumentError, 'HTTP redirect too deep' if attempts == 0
+    # Sends a `GET` request to the specified url, following at most the provided
+    # number of redirects.
+    #
+    # @param [String] url the url to be requested
+    # @param [Integer] redirects the number of redirects to follow
+    # @return [String] the response body of the request.
+    def self.get(url, redirects=3)
+      raise ArgumentError, 'HTTP redirect too deep' if redirects == 0
       response = Net::HTTP.get_response(URI.parse(url))
       case response
       when Net::HTTPSuccess then response.body
-      when Net::HTTPRedirection then HttpClient.get(response['location'], attempts - 1)
+      when Net::HTTPRedirection then HttpClient.get(response['location'], redirects - 1)
       else
         response.error!
       end

data/lib/omnivore/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Omnivore
-  VERSION = "0.0.4"
+  VERSION = "0.1.0"
 end

data/omnivore.gemspec CHANGED Viewed

@@ -5,10 +5,11 @@ require "omnivore/version"
 Gem::Specification.new do |s|
   s.name        = "omnivore"
   s.version     = Omnivore::VERSION
+  s.platform    = Gem::Platform::RUBY
   s.authors     = ["Matthias Eder"]
   s.email       = ["matthias@izume.com"]
-  s.homepage    = ""
-  s.summary     = %q{Content extraction and analysis}
+  s.homepage    = "http://github.com/matthiase/omnivore"
+  s.summary     = %q{Content Extraction and Analysis Library}
   s.description = %q{A library for extracting content from HTML documents.}
   s.rubyforge_project = "omnivore"
@@ -19,6 +20,8 @@ Gem::Specification.new do |s|
   s.require_paths = ["lib"]
   # specify any dependencies here; for example:
+  s.add_development_dependency "yard", "~> 0.7.4"
+  s.add_development_dependency "redcarpet", "~> 2.0.1"
   s.add_development_dependency "rspec", "~> 2.8.0"
   s.add_runtime_dependency "nokogiri", "~> 1.5.0"
 end

metadata CHANGED Viewed

@@ -1,8 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: omnivore
 version: !ruby/object:Gem::Version
-  prerelease:
-  version: 0.0.4
+  hash: 27
+  prerelease: false
+  segments:
+  - 0
+  - 1
+  - 0
+  version: 0.1.0
 platform: ruby
 authors:
 - Matthias Eder
@@ -14,27 +19,69 @@ date: 2012-01-11 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
-  name: rspec
+  name: yard
   prerelease: false
   requirement: &id001 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: 2.8.0
+        hash: 11
+        segments:
+        - 0
+        - 7
+        - 4
+        version: 0.7.4
   type: :development
   version_requirements: *id001
 - !ruby/object:Gem::Dependency
-  name: nokogiri
+  name: redcarpet
   prerelease: false
   requirement: &id002 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
+        hash: 13
+        segments:
+        - 2
+        - 0
+        - 1
+        version: 2.0.1
+  type: :development
+  version_requirements: *id002
+- !ruby/object:Gem::Dependency
+  name: rspec
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 47
+        segments:
+        - 2
+        - 8
+        - 0
+        version: 2.8.0
+  type: :development
+  version_requirements: *id003
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  prerelease: false
+  requirement: &id004 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 1
+        - 5
+        - 0
         version: 1.5.0
   type: :runtime
-  version_requirements: *id002
+  version_requirements: *id004
 description: A library for extracting content from HTML documents.
 email:
 - matthias@izume.com
@@ -46,7 +93,6 @@ extra_rdoc_files: []
 files:
 - .gitignore
-- .rvmrc
 - Gemfile
 - README.md
 - Rakefile
@@ -59,7 +105,7 @@ files:
 - spec/fixtures/thia-breen-interview
 - spec/http_client_spec.rb
 has_rdoc: true
-homepage: ""
+homepage: http://github.com/matthiase/omnivore
 licenses: []
 post_install_message:
@@ -72,19 +118,25 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
       version: "0"
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
       version: "0"
 requirements: []
 rubyforge_project: omnivore
-rubygems_version: 1.5.0
+rubygems_version: 1.3.7
 signing_key:
 specification_version: 3
-summary: Content extraction and analysis
+summary: Content Extraction and Analysis Library
 test_files: []

data/.rvmrc DELETED Viewed

@@ -1,7 +0,0 @@
-if [[ -d "${rvm_path:-$HOME/.rvm}/environments" \
-  && -s "${rvm_path:-$HOME/.rvm}/environments/ruby-1.9.2-p0@omnivore" ]] ; then
-  \. "${rvm_path:-$HOME/.rvm}/environments/ruby-1.9.2-p0@omnivore"
-else
-  rvm --create  "ruby-1.9.2-p0@omnivore"
-fi