RubyGems - omnivore - Versions diffs - 0.0.4 → 0.1.0 - Mend

omnivore 0.0.4 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

data/.gitignore CHANGED Viewed

@@ -1,4 +1,7 @@
+.rvmrc
+.yardoc
 *.gem
 .bundle
 Gemfile.lock
 pkg/*
+doc/*

data/README.md CHANGED Viewed

@@ -1 +1,18 @@
-Nothing to see here, move along.
+## Omnivore: a library for decrufting HTML documents
+Omnivore is a library for extracting "real" content from HTML documents.  Currently, the approach is limited to
+analysing text density to distinguish relevant sections from navigation, advertising, and other non-relevant elements. As
+such, the results are far from perfect but will hopefully improve as more sophisticated features are added.
+### INSTALL
+```
+sudo gem install omnivore
+```
+### EXAMPLE
+```ruby
+require 'omnivore'
+document = Omnivore::Document.from_url('http://www.slashgear.com/sennheiser-hd-700-hands-on-10208572')
+puts document.to_text
+```

data/lib/omnivore/document.rb CHANGED Viewed

@@ -3,17 +3,29 @@ require "omnivore/http_client"
 module Omnivore
+  # A class encapsulating an HTML document.
   class Document
     attr_reader :model
-    BLOCK_TAGS = %w[div p frame bod]
+    # The HTML tags signaling the start of a block or paragraph.
+    BLOCK_TAGS = %w[div p frame]
+    # A Struct describing a paragraph, including its :path in the document, :text,
+    # and various metrics, such as :text_density.
     Paragraph = Struct.new("Paragraph", :path, :text, :text_density)
+    # Creates an Omnivore::Document object from a url.
+    # @param [String] url the document's url
+    # @return [Document] A new Document object.
     def self.from_url(url)
       Document.new(HttpClient.get(url))
     end
+    # Creates an Omnivore::Document object from a string containing HTML.
+    # @param [String] html the HTML content
+    # @return [Document] A new Document object.
     def self.from_html(html)
       Document.new(html)
     end
@@ -26,16 +38,22 @@ module Omnivore
     end
+    # A HTML representation of the document.
+    # @return [String] A HTML representation of the document.
     def to_html
       self.model.to_html
     end
+    # Extracts the document title.
+    # @return [String] The document title.
     def title
       @title ||= self.model.xpath("/html/head/title").text.gsub(/\s+/, " ").strip
     end
+    # Extracts document metadata.
+    # @return [Hash] The metadata tags found in the document.
     def metadata
       @metadata ||= self.model.xpath("//meta").inject({ }) { |memo, el|
         memo[el.attr("name")] = el.attr("content") || "" if el.attr("name")
@@ -44,12 +62,19 @@ module Omnivore
     end
+    # Returns the actual content of the document, without navigation, advertising, etc.
+    # @return [String] The document's main content.
     def to_text
-      paragraphs = self.to_paragraphs.keep_if { |p| p.text_density > 0.5 }
-      paragraphs.map { |p| p.text }.join("\n")
+      self.to_paragraphs.inject([ ]) { |buffer, p|
+        buffer << p.text if p.text_density >= 0.25
+        buffer
+      }.join("\n")
     end
+    # Splits the document into paragraphs, assuming that each <div> or <p> tag represents
+    # a paragraph.
+    # @return [Array] An array of Paragraph objects.
     def to_paragraphs
       self.model.xpath("//div|//p").map { |block|
         html = block.to_html.gsub(/\s+/, " ").strip
@@ -64,15 +89,20 @@ module Omnivore
     private
-    def flatten(block)
+    # A convenience method that recursively iterates over a document node and returns
+    # an array of all of its children, with the exception of other block elements
+    # (e.g. div or p nodes).
+    # @param [Nokogiri::XML::Node] node the root node
+    # @return [Array] The Nokogiri::XML::Node objects contained in the root.
+    def flatten(node)
       elements = [ ]
-      return elements if block.nil?
-      return elements if block.respond_to?('cdata?') and block.cdata?
-      return elements if block.respond_to?('comment?') and block.comment?
-      if block.children.empty?
-        elements << block
+      return elements if node.nil?
+      return elements if node.respond_to?('cdata?') and node.cdata?
+      return elements if node.respond_to?('comment?') and node.comment?
+      if node.children.empty?
+        elements << node
       else
-        block.children.each { |child|
+        node.children.each { |child|
           unless BLOCK_TAGS.include?(child.name)
             elements += flatten(child)
           end

data/lib/omnivore/http_client.rb CHANGED Viewed

@@ -2,16 +2,22 @@ require 'net/http'
 require 'uri'
 module Omnivore
+  # A simple HTTP client with a redirect feature.
+  #
   class HttpClient
-    def self.get(url, attempts=3)
-      raise ArgumentError, 'HTTP redirect too deep' if attempts == 0
+    # Sends a `GET` request to the specified url, following the provided number of
+    # maximum redirects.
+    #
+    # @param [String] url the url to be requested
+    # @param [Integer] redirects the number of redirects to follow
+    # @return [String] the response body of the request.
+    def self.get(url, redirects=3)
+      raise ArgumentError, 'HTTP redirect too deep' if redirects == 0
       response = Net::HTTP.get_response(URI.parse(url))
       case response
       when Net::HTTPSuccess then response.body
-      when Net::HTTPRedirection then HttpClient.get(response['location'], attempts - 1)
+      when Net::HTTPRedirection then HttpClient.get(response['location'], redirects - 1)
       else
         response.error!
       end

data/lib/omnivore/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Omnivore
-  VERSION = "0.0.4"
+  VERSION = "0.1.0"
 end

data/omnivore.gemspec CHANGED Viewed

@@ -5,10 +5,11 @@ require "omnivore/version"
 Gem::Specification.new do |s|
   s.name        = "omnivore"
   s.version     = Omnivore::VERSION
+  s.platform    = Gem::Platform::RUBY
   s.authors     = ["Matthias Eder"]
   s.email       = ["matthias@izume.com"]
-  s.homepage    = ""
-  s.summary     = %q{Content extraction and analysis}
+  s.homepage    = "http://github.com/matthiase/omnivore"
+  s.summary     = %q{Content Extraction and Analysis Library}
   s.description = %q{A library for extracting content from HTML documents.}
   s.rubyforge_project = "omnivore"
@@ -19,6 +20,8 @@ Gem::Specification.new do |s|
   s.require_paths = ["lib"]
   # specify any dependencies here; for example:
+  s.add_development_dependency "yard", "~> 0.7.4"
+  s.add_development_dependency "redcarpet", "~> 2.0.1"
   s.add_development_dependency "rspec", "~> 2.8.0"
   s.add_runtime_dependency "nokogiri", "~> 1.5.0"
 end

metadata CHANGED Viewed

@@ -1,8 +1,13 @@
 --- !ruby/object:Gem::Specification
 name: omnivore
 version: !ruby/object:Gem::Version
-  prerelease:
-  version: 0.0.4
+  hash: 27
+  prerelease: false
+  segments:
+  - 0
+  - 1
+  - 0
+  version: 0.1.0
 platform: ruby
 authors:
 - Matthias Eder
@@ -14,27 +19,69 @@ date: 2012-01-11 00:00:00 -07:00
 default_executable:
 dependencies:
 - !ruby/object:Gem::Dependency
-  name: rspec
+  name: yard
   prerelease: false
   requirement: &id001 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
-        version: 2.8.0
+        hash: 11
+        segments:
+        - 0
+        - 7
+        - 4
+        version: 0.7.4
   type: :development
   version_requirements: *id001
 - !ruby/object:Gem::Dependency
-  name: nokogiri
+  name: redcarpet
   prerelease: false
   requirement: &id002 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ~>
       - !ruby/object:Gem::Version
+        hash: 13
+        segments:
+        - 2
+        - 0
+        - 1
+        version: 2.0.1
+  type: :development
+  version_requirements: *id002
+- !ruby/object:Gem::Dependency
+  name: rspec
+  prerelease: false
+  requirement: &id003 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 47
+        segments:
+        - 2
+        - 8
+        - 0
+        version: 2.8.0
+  type: :development
+  version_requirements: *id003
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  prerelease: false
+  requirement: &id004 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ~>
+      - !ruby/object:Gem::Version
+        hash: 3
+        segments:
+        - 1
+        - 5
+        - 0
         version: 1.5.0
   type: :runtime
-  version_requirements: *id002
+  version_requirements: *id004
 description: A library for extracting content from HTML documents.
 email:
 - matthias@izume.com
@@ -46,7 +93,6 @@ extra_rdoc_files: []
 files:
 - .gitignore
-- .rvmrc
 - Gemfile
 - README.md
 - Rakefile
@@ -59,7 +105,7 @@ files:
 - spec/fixtures/thia-breen-interview
 - spec/http_client_spec.rb
 has_rdoc: true
-homepage: ""
+homepage: http://github.com/matthiase/omnivore
 licenses: []
 post_install_message:
@@ -72,19 +118,25 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
       version: "0"
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
+      hash: 3
+      segments:
+      - 0
       version: "0"
 requirements: []
 rubyforge_project: omnivore
-rubygems_version: 1.5.0
+rubygems_version: 1.3.7
 signing_key:
 specification_version: 3
-summary: Content extraction and analysis
+summary: Content Extraction and Analysis Library
 test_files: []

data/.rvmrc DELETED Viewed

@@ -1,7 +0,0 @@
-if [[ -d "${rvm_path:-$HOME/.rvm}/environments" \
-  && -s "${rvm_path:-$HOME/.rvm}/environments/ruby-1.9.2-p0@omnivore" ]] ; then
-  \. "${rvm_path:-$HOME/.rvm}/environments/ruby-1.9.2-p0@omnivore"
-else
-  rvm --create  "ruby-1.9.2-p0@omnivore"
-fi