omnivore 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/.rvmrc ADDED
@@ -0,0 +1,7 @@
1
+
2
# Source the RVM environment file for ruby-1.9.2-p0@omnivore when the
# environments directory exists and that file is non-empty; otherwise
# invoke rvm with --create for the same ruby@gemset string.
if [[ -d "${rvm_path:-$HOME/.rvm}/environments" ]] &&
   [[ -s "${rvm_path:-$HOME/.rvm}/environments/ruby-1.9.2-p0@omnivore" ]]
then
  # "\." is the POSIX "source" command; the backslash keeps an alias of "."
  # from being expanded.
  \. "${rvm_path:-$HOME/.rvm}/environments/ruby-1.9.2-p0@omnivore"
else
  rvm --create "ruby-1.9.2-p0@omnivore"
fi
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit
# (the plain-http endpoint offers no integrity or authenticity guarantee).
source "https://rubygems.org"

# Specify your gem's dependencies in omnivore.gemspec
gemspec
data/README.md ADDED
@@ -0,0 +1 @@
1
+ Nothing to see here, move along.
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,21 @@
1
+ require 'net/http'
2
+ require 'uri'
3
+
4
module Omnivore
  # Minimal HTTP GET helper built on Net::HTTP that follows redirects.
  class HttpClient
    # Fetches +url+ and returns the response body, following redirects.
    #
    # @param url [String] absolute URL to request
    # @param attempts [Integer] how many requests may still be issued
    #   (the initial request plus redirects); defaults to 3
    # @return [String] body of the first successful (2xx) response
    # @raise [ArgumentError] when the redirect budget is exhausted
    # @raise [Net::HTTPExceptions] (via Net::HTTPResponse#error!) for any
    #   non-success response that cannot be followed
    def self.get(url, attempts = 3)
      # `<= 0` rather than `== 0`: a negative budget must also stop, otherwise
      # the countdown never reaches zero and the redirect limit is void.
      raise ArgumentError, 'HTTP redirect too deep' if attempts <= 0

      response = Net::HTTP.get_response(URI.parse(url))
      case response
      when Net::HTTPSuccess
        response.body
      when Net::HTTPRedirection
        location = response['location']
        # Some 3xx responses (e.g. 304 Not Modified) carry no Location header;
        # there is nothing to follow, so report them like any other failure.
        response.error! if location.nil? || location.empty?
        # Location may be relative ("/path"); resolve it against the URL that
        # was just requested so the next hop always has a scheme and host.
        get(URI.join(url, location).to_s, attempts - 1)
      else
        response.error!
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Omnivore
  # Release version of the omnivore gem.
  # Frozen so the shared constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
@@ -0,0 +1,12 @@
1
+ require "rexml/document"
2
+
3
module Omnivore
  # Evaluates XPath expressions against markup using REXML.
  class XPathExtractor
    # Parses +html+ with REXML and returns every node matching +xpath+.
    #
    # NOTE: REXML is an XML parser, so the markup must be well-formed XML;
    # anything REXML cannot parse surfaces as an exception from
    # REXML::Document.new.
    #
    # @param html [String] markup to parse
    # @param xpath [String] XPath expression to evaluate against the document
    # @return [Array] the matching nodes (empty when nothing matches)
    def self.match(html, xpath)
      document = REXML::Document.new(html)
      REXML::XPath.match(document, xpath)
    end
  end
end
data/lib/omnivore.rb ADDED
@@ -0,0 +1,7 @@
1
+ require "omnivore/version"
2
+ require "omnivore/http_client"
3
+ require "omnivore/xpath_extractor"
4
+
5
+ module Omnivore
6
+ # Top-level namespace for the Omnivore content-extraction library; its components are loaded by the requires above.
7
+ end
data/omnivore.gemspec ADDED
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "omnivore/version"
4
+
5
# Gem specification for omnivore. File lists are taken from whatever git
# currently tracks, so this must be evaluated inside the git checkout.
Gem::Specification.new do |spec|
  # Identity
  spec.name    = "omnivore"
  spec.version = Omnivore::VERSION

  # Authorship and description
  spec.authors     = ["Matthias Eder"]
  spec.email       = ["matthias@izume.com"]
  spec.homepage    = ""
  spec.summary     = "Content extraction and analysis"
  spec.description = "A library for extracting content from HTML documents."

  spec.rubyforge_project = "omnivore"

  # Packaged files: everything tracked by git, with tests and executables
  # derived from their conventional directories.
  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Dependencies; a runtime dependency would be declared like the
  # commented-out example below.
  spec.add_development_dependency "rspec"
  # spec.add_runtime_dependency "rest-client"
end
@@ -0,0 +1,11 @@
1
+ require 'omnivore/http_client'
2
+
3
# Integration-style example: performs a real HTTP request, so it needs
# network access and the target page to be reachable.
describe Omnivore::HttpClient do
  it "should fetch the content of a url" do
    page_url = "http://blog.steveklabnik.com/posts/2011-09-28-real-modern-ruby-development"

    body = Omnivore::HttpClient.get(page_url)

    body.should_not be_nil
    body.should_not be_empty
  end
end
@@ -0,0 +1,30 @@
1
+ require "omnivore/xpath_extractor"
2
+
3
# Small well-formed page with banner, navigation and content sections,
# used as the fixture for the XPath example below.
CONTENT = %{
<html>
  <head></head>
  <body>
    <div class="banner">
      This is a banner
    </div>
    <div class="topnav">
      <ul>
        <li>Home</li>
        <li>About</li>
      </ul>
    </div>
    <div class="content">
      This is where the real stuff is.
    </div>
  </body>
</html>
}

describe Omnivore::XPathExtractor do

  it "should match the correct xpath" do
    matches = Omnivore::XPathExtractor.match(CONTENT, "//div[@class=\"content\"]")

    # Exactly one div has class="content"; verify that it is the node that
    # came back, not merely that something matched.
    matches.size.should == 1
    matches.first.text.strip.should == "This is where the real stuff is."
  end

end
metadata ADDED
@@ -0,0 +1,78 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: omnivore
3
+ version: !ruby/object:Gem::Version
4
+ prerelease:
5
+ version: 0.0.1
6
+ platform: ruby
7
+ authors:
8
+ - Matthias Eder
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+
13
+ date: 2012-01-05 00:00:00 -07:00
14
+ default_executable:
15
+ dependencies:
16
+ - !ruby/object:Gem::Dependency
17
+ name: rspec
18
+ prerelease: false
19
+ requirement: &id001 !ruby/object:Gem::Requirement
20
+ none: false
21
+ requirements:
22
+ - - ">="
23
+ - !ruby/object:Gem::Version
24
+ version: "0"
25
+ type: :development
26
+ version_requirements: *id001
27
+ description: A library for extracting content from HTML documents.
28
+ email:
29
+ - matthias@izume.com
30
+ executables: []
31
+
32
+ extensions: []
33
+
34
+ extra_rdoc_files: []
35
+
36
+ files:
37
+ - .gitignore
38
+ - .rvmrc
39
+ - Gemfile
40
+ - README.md
41
+ - Rakefile
42
+ - lib/omnivore.rb
43
+ - lib/omnivore/http_client.rb
44
+ - lib/omnivore/version.rb
45
+ - lib/omnivore/xpath_extractor.rb
46
+ - omnivore.gemspec
47
+ - spec/http_client_spec.rb
48
+ - spec/xpath_extractor_spec.rb
49
+ has_rdoc: true
50
+ homepage: ""
51
+ licenses: []
52
+
53
+ post_install_message:
54
+ rdoc_options: []
55
+
56
+ require_paths:
57
+ - lib
58
+ required_ruby_version: !ruby/object:Gem::Requirement
59
+ none: false
60
+ requirements:
61
+ - - ">="
62
+ - !ruby/object:Gem::Version
63
+ version: "0"
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ">="
68
+ - !ruby/object:Gem::Version
69
+ version: "0"
70
+ requirements: []
71
+
72
+ rubyforge_project: omnivore
73
+ rubygems_version: 1.5.0
74
+ signing_key:
75
+ specification_version: 3
76
+ summary: Content extraction and analysis
77
+ test_files: []
78
+