omnivore 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/.rvmrc +7 -0
- data/Gemfile +4 -0
- data/README.md +1 -0
- data/Rakefile +1 -0
- data/lib/omnivore/http_client.rb +21 -0
- data/lib/omnivore/version.rb +3 -0
- data/lib/omnivore/xpath_extractor.rb +12 -0
- data/lib/omnivore.rb +7 -0
- data/omnivore.gemspec +24 -0
- data/spec/http_client_spec.rb +11 -0
- data/spec/xpath_extractor_spec.rb +30 -0
- metadata +78 -0
data/.gitignore
ADDED
data/.rvmrc
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
Nothing to see here, move along.
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
require 'uri'
|
3
|
+
|
4
|
+
module Omnivore
  # Minimal HTTP GET helper built on Net::HTTP that follows redirects.
  class HttpClient
    # Fetches +url+ and returns the response body, following redirects.
    #
    # @param url [String] absolute URL to fetch
    # @param attempts [Integer] maximum number of requests to issue, counting
    #   the initial one (each followed redirect consumes one attempt)
    # @return [String] body of the first successful (2xx) response
    # @raise [ArgumentError] when the attempt budget is exhausted (zero or
    #   negative)
    # @raise [StandardError] whatever Net::HTTPResponse#error! raises for a
    #   response that is neither a success nor a followable redirect
    def self.get(url, attempts = 3)
      # `<=` rather than `==` so a negative budget cannot slip past the guard.
      raise ArgumentError, 'HTTP redirect too deep' if attempts <= 0

      uri = URI.parse(url)
      response = Net::HTTP.get_response(uri)
      location = response['location']

      case response
      when Net::HTTPSuccess
        response.body
      when Net::HTTPRedirection
        # A 3xx without a Location header (e.g. 304) cannot be followed;
        # report it as an HTTP error instead of trying to parse nil.
        response.error! if location.nil? || location.empty?

        # Location may be a relative reference; resolve it against the URL
        # that was just requested before following it.
        get(URI.join(uri, location).to_s, attempts - 1)
      else
        response.error!
      end
    end
  end
end
|
data/lib/omnivore.rb
ADDED
data/omnivore.gemspec
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "omnivore/version"
|
4
|
+
|
5
|
+
# Gem specification for omnivore. File lists are taken from the paths git
# tracks, so this must be evaluated inside the git checkout.
Gem::Specification.new do |spec|
  # Runs `git ls-files` with the given argument suffix and returns one entry
  # per output line.
  tracked = lambda { |args = ''| `git ls-files#{args}`.split("\n") }

  # Identity
  spec.name     = 'omnivore'
  spec.version  = Omnivore::VERSION
  spec.authors  = ['Matthias Eder']
  spec.email    = ['matthias@izume.com']
  spec.homepage = ''

  # Descriptions
  spec.summary     = 'Content extraction and analysis'
  spec.description = 'A library for extracting content from HTML documents.'

  spec.rubyforge_project = 'omnivore'

  # Packaged files
  spec.files         = tracked.call
  spec.test_files    = tracked.call(' -- {test,spec,features}/*')
  spec.executables   = tracked.call(' -- bin/*').map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # specify any dependencies here; for example:
  spec.add_development_dependency('rspec')
  # spec.add_runtime_dependency "rest-client"
end
|
@@ -0,0 +1,11 @@
|
|
1
|
+
require 'omnivore/http_client'
|
2
|
+
|
3
|
+
# Spec for Omnivore::HttpClient. It calls HttpClient.get against a live URL,
# so it needs network access to pass.
describe Omnivore::HttpClient do
  it "should fetch the content of a url" do
    url = "http://blog.steveklabnik.com/posts/2011-09-28-real-modern-ruby-development"
    body = Omnivore::HttpClient.get(url)

    # The fetched body must be present and non-empty.
    body.should_not be_nil
    body.should_not be_empty
  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require "omnivore/xpath_extractor"
|
2
|
+
|
3
|
+
# HTML fixture: a banner, a navigation list, and one div with class "content".
CONTENT = %{
<html>
<head></head>
<body>
<div class="banner">
This is a banner
</div>
<div class="topnav">
<ul>
<li>Home</li>
<li>About</li>
</ul>
</div>
<div class="content">
This is where the real stuff is.
</div>
</body>
</html>
}

# Spec for Omnivore::XPathExtractor.match run against the fixture above.
describe Omnivore::XPathExtractor do
  it "should match the correct xpath" do
    content_div = '//div[@class="content"]'
    found = Omnivore::XPathExtractor.match(CONTENT, content_div)

    # Only checks that the result reports a size greater than zero.
    found.size.should be > 0
  end
end
|
metadata
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: omnivore
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease:
|
5
|
+
version: 0.0.1
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Matthias Eder
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
|
13
|
+
date: 2012-01-05 00:00:00 -07:00
|
14
|
+
default_executable:
|
15
|
+
dependencies:
|
16
|
+
- !ruby/object:Gem::Dependency
|
17
|
+
name: rspec
|
18
|
+
prerelease: false
|
19
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
20
|
+
none: false
|
21
|
+
requirements:
|
22
|
+
- - ">="
|
23
|
+
- !ruby/object:Gem::Version
|
24
|
+
version: "0"
|
25
|
+
type: :development
|
26
|
+
version_requirements: *id001
|
27
|
+
description: A library for extracting content from HTML documents.
|
28
|
+
email:
|
29
|
+
- matthias@izume.com
|
30
|
+
executables: []
|
31
|
+
|
32
|
+
extensions: []
|
33
|
+
|
34
|
+
extra_rdoc_files: []
|
35
|
+
|
36
|
+
files:
|
37
|
+
- .gitignore
|
38
|
+
- .rvmrc
|
39
|
+
- Gemfile
|
40
|
+
- README.md
|
41
|
+
- Rakefile
|
42
|
+
- lib/omnivore.rb
|
43
|
+
- lib/omnivore/http_client.rb
|
44
|
+
- lib/omnivore/version.rb
|
45
|
+
- lib/omnivore/xpath_extractor.rb
|
46
|
+
- omnivore.gemspec
|
47
|
+
- spec/http_client_spec.rb
|
48
|
+
- spec/xpath_extractor_spec.rb
|
49
|
+
has_rdoc: true
|
50
|
+
homepage: ""
|
51
|
+
licenses: []
|
52
|
+
|
53
|
+
post_install_message:
|
54
|
+
rdoc_options: []
|
55
|
+
|
56
|
+
require_paths:
|
57
|
+
- lib
|
58
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
59
|
+
none: false
|
60
|
+
requirements:
|
61
|
+
- - ">="
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: "0"
|
64
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ">="
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: "0"
|
70
|
+
requirements: []
|
71
|
+
|
72
|
+
rubyforge_project: omnivore
|
73
|
+
rubygems_version: 1.5.0
|
74
|
+
signing_key:
|
75
|
+
specification_version: 3
|
76
|
+
summary: Content extraction and analysis
|
77
|
+
test_files: []
|
78
|
+
|