omnivore 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
 - data/.rvmrc +7 -0
 - data/Gemfile +4 -0
 - data/README.md +1 -0
 - data/Rakefile +1 -0
 - data/lib/omnivore/http_client.rb +21 -0
 - data/lib/omnivore/version.rb +3 -0
 - data/lib/omnivore/xpath_extractor.rb +12 -0
 - data/lib/omnivore.rb +7 -0
 - data/omnivore.gemspec +24 -0
 - data/spec/http_client_spec.rb +11 -0
 - data/spec/xpath_extractor_spec.rb +30 -0
 - metadata +78 -0
 
    
        data/.gitignore
    ADDED
    
    
    
        data/.rvmrc
    ADDED
    
    
    
        data/Gemfile
    ADDED
    
    
    
        data/README.md
    ADDED
    
    | 
         @@ -0,0 +1 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            Nothing to see here, move along.
         
     | 
    
        data/Rakefile
    ADDED
    
    | 
         @@ -0,0 +1 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require "bundler/gem_tasks"
         
     | 
| 
         @@ -0,0 +1,21 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'net/http'
         
     | 
| 
      
 2 
     | 
    
         
            +
            require 'uri'
         
     | 
| 
      
 3 
     | 
    
         
            +
             
     | 
| 
      
 4 
     | 
    
         
            +
            module Omnivore
              # Minimal HTTP fetcher built on Net::HTTP that follows redirects.
              class HttpClient

                # Issues a GET request for +url+ and returns the response body,
                # following redirects along the way.
                #
                # url      - String absolute URL to fetch.
                # attempts - Integer number of requests that may still be issued
                #            (the initial request plus every followed redirect).
                #
                # Returns the body of the first 2xx response.
                # Raises ArgumentError ('HTTP redirect too deep') once the request
                # budget is used up.
                # Raises whatever Net::HTTPResponse#error! raises for any other
                # response, including a redirect that carries no Location header.
                def self.get(url, attempts = 3)
                  # `<= 0` rather than `== 0`: a negative budget must not skip the
                  # guard and allow an unbounded redirect chain.
                  raise ArgumentError, 'HTTP redirect too deep' if attempts <= 0

                  response = Net::HTTP.get_response(URI.parse(url))

                  case response
                  when Net::HTTPSuccess
                    response.body
                  when Net::HTTPRedirection
                    location = response['location']
                    # Redirect-class responses such as 304 have no Location to
                    # follow; report them like any other non-success response.
                    response.error! if location.nil? || location.strip.empty?

                    # Location may be a relative reference, so resolve it against
                    # the URL that was just requested before following it.
                    HttpClient.get(URI.join(url, location.strip).to_s, attempts - 1)
                  else
                    response.error!
                  end
                end

              end
            end
         
     | 
    
        data/lib/omnivore.rb
    ADDED
    
    
    
        data/omnivore.gemspec
    ADDED
    
    | 
         @@ -0,0 +1,24 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            # -*- encoding: utf-8 -*-
         
     | 
| 
      
 2 
     | 
    
         
            +
            $:.push File.expand_path("../lib", __FILE__)
         
     | 
| 
      
 3 
     | 
    
         
            +
            require "omnivore/version"
         
     | 
| 
      
 4 
     | 
    
         
            +
             
     | 
| 
      
 5 
     | 
    
         
            +
            # Gem specification for omnivore. The version comes from
            # Omnivore::VERSION, loaded by the require above this block.
            Gem::Specification.new do |s|
              s.name        = "omnivore"
              s.version     = Omnivore::VERSION
              s.authors     = ["Matthias Eder"]
              s.email       = ["matthias@izume.com"]
              s.homepage    = ""
              s.summary     = %q{Content extraction and analysis}
              s.description = %q{A library for extracting content from HTML documents.}

              s.rubyforge_project = "omnivore"

              # Package every file tracked by git.
              s.files         = `git ls-files`.split("\n")
              # Derive the subsets from s.files instead of shelling out again.
              # The former `git ls-files -- {test,spec,features}/*` relied on the
              # shell expanding the braces; a POSIX sh passes the pattern through
              # literally, git matches nothing and test_files ends up empty.
              s.test_files    = s.files.grep(%r{\A(test|spec|features)/})
              s.executables   = s.files.grep(%r{\Abin/}).map { |f| File.basename(f) }
              s.require_paths = ["lib"]

              # specify any dependencies here; for example:
              s.add_development_dependency "rspec"
              # s.add_runtime_dependency "rest-client"
            end
         
     | 
| 
         @@ -0,0 +1,11 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require 'omnivore/http_client'
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            # Exercises Omnivore::HttpClient.get against a live URL; the example
            # only checks that some non-empty body comes back.
            describe Omnivore::HttpClient do

              it "should fetch the content of a url" do
                target = "http://blog.steveklabnik.com/posts/2011-09-28-real-modern-ruby-development"
                body = Omnivore::HttpClient.get(target)

                body.should_not be_nil
                body.should_not be_empty
              end

            end
         
     | 
| 
         @@ -0,0 +1,30 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            require "omnivore/xpath_extractor"
         
     | 
| 
      
 2 
     | 
    
         
            +
             
     | 
| 
      
 3 
     | 
    
         
            +
            # HTML fixture: a banner, a navigation list and one "content" div.
            CONTENT = %q{
              <html>
                <head></head>
                <body>
                  <div class="banner">
                    This is a banner
                  </div>
                  <div class="topnav">
                    <ul>
                      <li>Home</li>
                      <li>About</li>
                    </ul>
                  </div>
                  <div class="content">
                    This is where the real stuff is.
                  </div>
                </body>
              </html>
            }

            # Checks that an XPath query for the content div yields at least one match.
            describe Omnivore::XPathExtractor do

              it "should match the correct xpath" do
                content_xpath = '//div[@class="content"]'
                found = Omnivore::XPathExtractor.match(CONTENT, content_xpath)

                found.size.should be > 0
              end

            end
         
     | 
    
        metadata
    ADDED
    
    | 
         @@ -0,0 +1,78 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            --- !ruby/object:Gem::Specification 
         
     | 
| 
      
 2 
     | 
    
         
            +
            name: omnivore
         
     | 
| 
      
 3 
     | 
    
         
            +
            version: !ruby/object:Gem::Version 
         
     | 
| 
      
 4 
     | 
    
         
            +
              prerelease: 
         
     | 
| 
      
 5 
     | 
    
         
            +
              version: 0.0.1
         
     | 
| 
      
 6 
     | 
    
         
            +
            platform: ruby
         
     | 
| 
      
 7 
     | 
    
         
            +
            authors: 
         
     | 
| 
      
 8 
     | 
    
         
            +
            - Matthias Eder
         
     | 
| 
      
 9 
     | 
    
         
            +
            autorequire: 
         
     | 
| 
      
 10 
     | 
    
         
            +
            bindir: bin
         
     | 
| 
      
 11 
     | 
    
         
            +
            cert_chain: []
         
     | 
| 
      
 12 
     | 
    
         
            +
             
     | 
| 
      
 13 
     | 
    
         
            +
            date: 2012-01-05 00:00:00 -07:00
         
     | 
| 
      
 14 
     | 
    
         
            +
            default_executable: 
         
     | 
| 
      
 15 
     | 
    
         
            +
            dependencies: 
         
     | 
| 
      
 16 
     | 
    
         
            +
            - !ruby/object:Gem::Dependency 
         
     | 
| 
      
 17 
     | 
    
         
            +
              name: rspec
         
     | 
| 
      
 18 
     | 
    
         
            +
              prerelease: false
         
     | 
| 
      
 19 
     | 
    
         
            +
              requirement: &id001 !ruby/object:Gem::Requirement 
         
     | 
| 
      
 20 
     | 
    
         
            +
                none: false
         
     | 
| 
      
 21 
     | 
    
         
            +
                requirements: 
         
     | 
| 
      
 22 
     | 
    
         
            +
                - - ">="
         
     | 
| 
      
 23 
     | 
    
         
            +
                  - !ruby/object:Gem::Version 
         
     | 
| 
      
 24 
     | 
    
         
            +
                    version: "0"
         
     | 
| 
      
 25 
     | 
    
         
            +
              type: :development
         
     | 
| 
      
 26 
     | 
    
         
            +
              version_requirements: *id001
         
     | 
| 
      
 27 
     | 
    
         
            +
            description: A library for extracting content from HTML documents.
         
     | 
| 
      
 28 
     | 
    
         
            +
            email: 
         
     | 
| 
      
 29 
     | 
    
         
            +
            - matthias@izume.com
         
     | 
| 
      
 30 
     | 
    
         
            +
            executables: []
         
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
      
 32 
     | 
    
         
            +
            extensions: []
         
     | 
| 
      
 33 
     | 
    
         
            +
             
     | 
| 
      
 34 
     | 
    
         
            +
            extra_rdoc_files: []
         
     | 
| 
      
 35 
     | 
    
         
            +
             
     | 
| 
      
 36 
     | 
    
         
            +
            files: 
         
     | 
| 
      
 37 
     | 
    
         
            +
            - .gitignore
         
     | 
| 
      
 38 
     | 
    
         
            +
            - .rvmrc
         
     | 
| 
      
 39 
     | 
    
         
            +
            - Gemfile
         
     | 
| 
      
 40 
     | 
    
         
            +
            - README.md
         
     | 
| 
      
 41 
     | 
    
         
            +
            - Rakefile
         
     | 
| 
      
 42 
     | 
    
         
            +
            - lib/omnivore.rb
         
     | 
| 
      
 43 
     | 
    
         
            +
            - lib/omnivore/http_client.rb
         
     | 
| 
      
 44 
     | 
    
         
            +
            - lib/omnivore/version.rb
         
     | 
| 
      
 45 
     | 
    
         
            +
            - lib/omnivore/xpath_extractor.rb
         
     | 
| 
      
 46 
     | 
    
         
            +
            - omnivore.gemspec
         
     | 
| 
      
 47 
     | 
    
         
            +
            - spec/http_client_spec.rb
         
     | 
| 
      
 48 
     | 
    
         
            +
            - spec/xpath_extractor_spec.rb
         
     | 
| 
      
 49 
     | 
    
         
            +
            has_rdoc: true
         
     | 
| 
      
 50 
     | 
    
         
            +
            homepage: ""
         
     | 
| 
      
 51 
     | 
    
         
            +
            licenses: []
         
     | 
| 
      
 52 
     | 
    
         
            +
             
     | 
| 
      
 53 
     | 
    
         
            +
            post_install_message: 
         
     | 
| 
      
 54 
     | 
    
         
            +
            rdoc_options: []
         
     | 
| 
      
 55 
     | 
    
         
            +
             
     | 
| 
      
 56 
     | 
    
         
            +
            require_paths: 
         
     | 
| 
      
 57 
     | 
    
         
            +
            - lib
         
     | 
| 
      
 58 
     | 
    
         
            +
            required_ruby_version: !ruby/object:Gem::Requirement 
         
     | 
| 
      
 59 
     | 
    
         
            +
              none: false
         
     | 
| 
      
 60 
     | 
    
         
            +
              requirements: 
         
     | 
| 
      
 61 
     | 
    
         
            +
              - - ">="
         
     | 
| 
      
 62 
     | 
    
         
            +
                - !ruby/object:Gem::Version 
         
     | 
| 
      
 63 
     | 
    
         
            +
                  version: "0"
         
     | 
| 
      
 64 
     | 
    
         
            +
            required_rubygems_version: !ruby/object:Gem::Requirement 
         
     | 
| 
      
 65 
     | 
    
         
            +
              none: false
         
     | 
| 
      
 66 
     | 
    
         
            +
              requirements: 
         
     | 
| 
      
 67 
     | 
    
         
            +
              - - ">="
         
     | 
| 
      
 68 
     | 
    
         
            +
                - !ruby/object:Gem::Version 
         
     | 
| 
      
 69 
     | 
    
         
            +
                  version: "0"
         
     | 
| 
      
 70 
     | 
    
         
            +
            requirements: []
         
     | 
| 
      
 71 
     | 
    
         
            +
             
     | 
| 
      
 72 
     | 
    
         
            +
            rubyforge_project: omnivore
         
     | 
| 
      
 73 
     | 
    
         
            +
            rubygems_version: 1.5.0
         
     | 
| 
      
 74 
     | 
    
         
            +
            signing_key: 
         
     | 
| 
      
 75 
     | 
    
         
            +
            specification_version: 3
         
     | 
| 
      
 76 
     | 
    
         
            +
            summary: Content extraction and analysis
         
     | 
| 
      
 77 
     | 
    
         
            +
            test_files: []
         
     | 
| 
      
 78 
     | 
    
         
            +
             
     |