omnivore 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/.rvmrc ADDED
@@ -0,0 +1,7 @@
1
+
2
# Select the "ruby-1.9.2-p0@omnivore" RVM environment for this project.
# If the RVM environments directory exists and the environment file for this
# ruby/gemset is present and non-empty, source it; otherwise ask rvm to
# create it. ${rvm_path:-$HOME/.rvm} falls back to ~/.rvm when rvm_path is unset.
if [[ -d "${rvm_path:-$HOME/.rvm}/environments" \
  && -s "${rvm_path:-$HOME/.rvm}/environments/ruby-1.9.2-p0@omnivore" ]] ; then
  # "\." is the POSIX "source" builtin; the backslash presumably guards against an alias for "." — confirm.
  \. "${rvm_path:-$HOME/.rvm}/environments/ruby-1.9.2-p0@omnivore"
else
  # Environment file missing or empty: have rvm create the ruby/gemset combination.
  rvm --create "ruby-1.9.2-p0@omnivore"
fi
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so that downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in omnivore.gemspec
gemspec
data/README.md ADDED
@@ -0,0 +1 @@
1
+ Nothing to see here, move along.
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,21 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+
4
module Omnivore
  # Minimal HTTP GET helper built on Net::HTTP that follows redirects.
  class HttpClient
    # Fetches +url+ and returns the response body, following redirects.
    #
    # @param url [String] absolute URL to request
    # @param attempts [Integer] number of requests still allowed; each
    #   followed redirect consumes one
    # @return [String] body of the first successful (2xx) response
    # @raise [ArgumentError] when +attempts+ has reached zero
    # @raise [StandardError] whatever Net::HTTPResponse#error! raises for a
    #   response that is neither a success nor a followable redirect
    def self.get(url, attempts = 3)
      raise ArgumentError, 'HTTP redirect too deep' if attempts == 0

      response = Net::HTTP.get_response(URI.parse(url))
      case response
      when Net::HTTPSuccess
        response.body
      when Net::HTTPRedirection
        location = response['location']
        # A 3xx without a Location header (e.g. 304) cannot be followed;
        # report it as an HTTP error instead of failing inside URI parsing.
        response.error! if location.nil?
        # Location may be relative ("/path"); resolve it against the URL
        # that produced the redirect before requesting it.
        get(URI.join(url, location).to_s, attempts - 1)
      else
        response.error!
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
# Namespace of the omnivore gem.
module Omnivore
  # Release number of the gem; read by omnivore.gemspec.
  VERSION = '0.0.1'
end
@@ -0,0 +1,12 @@
1
+ require "rexml/document"
2
+
3
module Omnivore
  # Runs XPath queries against markup using REXML.
  class XPathExtractor
    class << self
      # Parses +html+ with REXML and returns every node matching +xpath+.
      #
      # @param html [String] markup to parse; it is handed to
      #   REXML::Document.new unchanged
      # @param xpath [String] XPath expression to evaluate
      # @return [Array] result of REXML::XPath.match on the parsed document
      def match(html, xpath)
        REXML::XPath.match(REXML::Document.new(html), xpath)
      end
    end
  end
end
data/lib/omnivore.rb ADDED
@@ -0,0 +1,7 @@
1
+ require "omnivore/version"
2
+ require "omnivore/http_client"
3
+ require "omnivore/xpath_extractor"
4
+
5
# Top-level namespace of the omnivore gem. The module body is empty here;
# requiring this file loads the version, HTTP client and XPath extractor.
module Omnivore
end
data/omnivore.gemspec ADDED
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "omnivore/version"
4
+
5
# Gem packaging metadata for omnivore.
Gem::Specification.new do |s|
  s.name        = "omnivore"
  s.version     = Omnivore::VERSION
  s.authors     = ["Matthias Eder"]
  s.email       = ["matthias@izume.com"]
  s.homepage    = ""
  s.summary     = %q{Content extraction and analysis}
  s.description = %q{A library for extracting content from HTML documents.}

  s.rubyforge_project = "omnivore"

  s.files         = `git ls-files`.split("\n")
  # Select test files from the tracked file list instead of shelling out with
  # a "{test,spec,features}/*" brace glob: backticks run through sh, which does
  # not necessarily expand braces, and the packaged 0.0.1 metadata shows an
  # empty test_files list even though spec/ files were shipped.
  s.test_files    = s.files.grep(%r{\A(?:test|spec|features)/})
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # specify any dependencies here; for example:
  s.add_development_dependency "rspec"
  # s.add_runtime_dependency "rest-client"
end
@@ -0,0 +1,11 @@
1
+ require 'omnivore/http_client'
2
+
3
# Calls HttpClient.get with a real URL and checks that a non-empty body comes back.
describe Omnivore::HttpClient do
  it "should fetch the content of a url" do
    url  = "http://blog.steveklabnik.com/posts/2011-09-28-real-modern-ruby-development"
    body = Omnivore::HttpClient.get(url)

    body.should_not be_nil
    body.should_not be_empty
  end
end
@@ -0,0 +1,30 @@
1
+ require "omnivore/xpath_extractor"
2
+
3
# Small well-formed document used as the fixture for the extractor spec.
CONTENT = %{
<html>
<head></head>
<body>
<div class="banner">
This is a banner
</div>
<div class="topnav">
<ul>
<li>Home</li>
<li>About</li>
</ul>
</div>
<div class="content">
This is where the real stuff is.
</div>
</body>
</html>
}

# Checks that an attribute-based XPath finds at least one node in the fixture.
describe Omnivore::XPathExtractor do
  it "should match the correct xpath" do
    nodes = Omnivore::XPathExtractor.match(CONTENT, '//div[@class="content"]')
    nodes.size.should be > 0
  end
end
metadata ADDED
@@ -0,0 +1,78 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: omnivore
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.1
6
+ platform: ruby
7
+ authors:
8
+ - Matthias Eder
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2012-01-05 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: rspec
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ">="
23
+ - !ruby/object:Gem::Version
24
+ version: "0"
25
+ type: :development
26
+ version_requirements: *id001
27
+ description: A library for extracting content from HTML documents.
28
+ email:
29
+ - matthias@izume.com
30
+ executables: []
31
+
32
+ extensions: []
33
+
34
+ extra_rdoc_files: []
35
+
36
+ files:
37
+ - .gitignore
38
+ - .rvmrc
39
+ - Gemfile
40
+ - README.md
41
+ - Rakefile
42
+ - lib/omnivore.rb
43
+ - lib/omnivore/http_client.rb
44
+ - lib/omnivore/version.rb
45
+ - lib/omnivore/xpath_extractor.rb
46
+ - omnivore.gemspec
47
+ - spec/http_client_spec.rb
48
+ - spec/xpath_extractor_spec.rb
49
+ has_rdoc: true
50
+ homepage: ""
51
+ licenses: []
52
+
53
+ post_install_message:
54
+ rdoc_options: []
55
+
56
+ require_paths:
57
+ - lib
58
+ required_ruby_version: !ruby/object:Gem::Requirement
59
+ none: false
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: "0"
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: "0"
70
+ requirements: []
71
+
72
+ rubyforge_project: omnivore
73
+ rubygems_version: 1.5.0
74
+ signing_key:
75
+ specification_version: 3
76
+ summary: Content extraction and analysis
77
+ test_files: []
78
+