wombat 2.0.0 → 2.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +5 -9
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/examples/iterator.rb +46 -0
- data/examples/list.rb +44 -0
- data/examples/no_class.rb +41 -0
- data/examples/xml.rb +38 -0
- data/lib/wombat/processing/parser.rb +4 -0
- data/lib/wombat/property/locators/html.rb +15 -9
- data/spec/integration/integration_spec.rb +7 -8
- data/spec/property/locators/html_spec.rb +11 -0
- data/wombat.gemspec +7 -3
- metadata +7 -3
data/README.md
CHANGED
@@ -19,10 +19,6 @@ Obs: Requires ruby 1.9
|
|
19
19
|
The simplest way to use Wombat is by calling ``Wombat.crawl`` and passing it a block:
|
20
20
|
|
21
21
|
```ruby
|
22
|
-
|
23
|
-
# => github_scraper.rb
|
24
|
-
|
25
|
-
#coding: utf-8
|
26
22
|
require 'wombat'
|
27
23
|
|
28
24
|
Wombat.crawl do
|
@@ -30,8 +26,8 @@ Wombat.crawl do
|
|
30
26
|
path "/"
|
31
27
|
|
32
28
|
headline "xpath=//h1"
|
33
|
-
|
34
29
|
what_is "css=.column.secondary p", :html
|
30
|
+
repositories "css=a.repo", :list
|
35
31
|
|
36
32
|
explore "xpath=//ul/li[2]/a" do |e|
|
37
33
|
e.gsub(/Explore/, "LOVE")
|
@@ -39,7 +35,7 @@ Wombat.crawl do
|
|
39
35
|
|
40
36
|
benefits do
|
41
37
|
first_benefit "css=.column.leftmost h3"
|
42
|
-
|
38
|
+
second_benefit "css=.column.leftmid h3"
|
43
39
|
third_benefit "css=.column.rightmid h3"
|
44
40
|
fourth_benefit "css=.column.rightmost h3"
|
45
41
|
end
|
@@ -53,6 +49,7 @@ end
|
|
53
49
|
"headline" => "1,316,633 people hosting over 3,951,378 git repositories",
|
54
50
|
"what_is" => "GitHub is the best way to collaborate with others. Fork, send pull requests and manage all your <strong>public</strong> and <strong>private</strong> git repositories.",
|
55
51
|
"explore" => "LOVE GitHub",
|
52
|
+
"repositories" => ["jQuery", "reddit", "Sparkle", "curl", "Ruby on Rails", "node.js", "ClickToFlash", "Erlang/OTP", "CakePHP", "Redis"]
|
56
53
|
"benefits" => {
|
57
54
|
"first_benefit" => "Team management",
|
58
55
|
"second_benefit" => "Code review",
|
@@ -63,7 +60,7 @@ end
|
|
63
60
|
```
|
64
61
|
|
65
62
|
### This is just a sneak peek of what Wombat can do. For the complete documentation, please check the [project Wiki](http://github.com/felipecsl/wombat/wiki).
|
66
|
-
### [API Documentation](http://rubydoc.info/gems/wombat/
|
63
|
+
### [API Documentation](http://rubydoc.info/gems/wombat/2.0.0/frames)
|
67
64
|
### [Changelog](https://github.com/felipecsl/wombat/wiki/Changelog)
|
68
65
|
|
69
66
|
|
@@ -80,8 +77,7 @@ end
|
|
80
77
|
## Contributors
|
81
78
|
|
82
79
|
* Felipe Lima ([@felipecsl](https://github.com/felipecsl))
|
83
|
-
*
|
84
|
-
* [@sigi](https://github.com/sigi)
|
80
|
+
* [List of all contributors](https://github.com/felipecsl/wombat/wiki/Contributors)
|
85
81
|
|
86
82
|
## Copyright
|
87
83
|
|
data/Rakefile
CHANGED
@@ -10,7 +10,7 @@ require 'yard'
|
|
10
10
|
Jeweler::Tasks.new do |gem|
|
11
11
|
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
12
12
|
gem.name = "wombat"
|
13
|
-
gem.homepage = "http://github.com/
|
13
|
+
gem.homepage = "http://felipecsl.github.com/wombat"
|
14
14
|
gem.license = "MIT"
|
15
15
|
gem.summary = %Q{Ruby DSL to scrape web pages}
|
16
16
|
gem.description = %Q{Generic Web crawler with a DSL that parses structured data from web pages}
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.0.0
|
1
|
+
2.0.1
|
data/examples/iterator.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat'
|
3
|
+
|
4
|
+
class IteratorCrawler
|
5
|
+
include Wombat::Crawler
|
6
|
+
|
7
|
+
base_url "https://www.github.com"
|
8
|
+
path "/explore"
|
9
|
+
|
10
|
+
repos "css=ol.ranked-repositories>li", :iterator do
|
11
|
+
repo 'css=h3'
|
12
|
+
description 'css=p.description'
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
=begin
|
17
|
+
p IteratorCrawler.new.crawl
|
18
|
+
{"repos"=>
|
19
|
+
[
|
20
|
+
{
|
21
|
+
"repo"=>"bernii / gauge.js",
|
22
|
+
"description"=>"100% native and cool looking JavaScript gauge"
|
23
|
+
},
|
24
|
+
{
|
25
|
+
"repo"=>"ZeitOnline / briefkasten",
|
26
|
+
"description"=>"a reasonably secure web application for submitting content anonymously"
|
27
|
+
},
|
28
|
+
{
|
29
|
+
"repo"=>"nothingmagical / cheddar-ios",
|
30
|
+
"description"=>"Cheddar for iOS"
|
31
|
+
},
|
32
|
+
{
|
33
|
+
"repo"=>"nathanmarz / storm-mesos",
|
34
|
+
"description"=>"Run Storm on top of the Mesos cluster resource manager"
|
35
|
+
},
|
36
|
+
{
|
37
|
+
"repo"=>"Netflix / SimianArmy",
|
38
|
+
"description"=>"Tools for keeping your cloud operating in top form. Chaos Monkey is a resiliency tool that helps ..."
|
39
|
+
},
|
40
|
+
{
|
41
|
+
"repo"=>nil,
|
42
|
+
"description"=>nil
|
43
|
+
}
|
44
|
+
]
|
45
|
+
}
|
46
|
+
=end
|
data/examples/list.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat'
|
3
|
+
|
4
|
+
class ListCrawler
|
5
|
+
include Wombat::Crawler
|
6
|
+
|
7
|
+
base_url "http://www.rubygems.org"
|
8
|
+
path "/"
|
9
|
+
|
10
|
+
gems do
|
11
|
+
new "css=#new_gems li", :list
|
12
|
+
most_downloaded "css=#most_downloaded li", :list
|
13
|
+
just_updated "css=#just_updated li", :list
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
=begin
|
18
|
+
pp ListCrawler.new.crawl
|
19
|
+
{
|
20
|
+
"gems"=>{
|
21
|
+
"new"=>[
|
22
|
+
"buffer (0.0.1)",
|
23
|
+
"resque-telework (0.2.0)",
|
24
|
+
"my_string_extend_lyk (0.0.1)",
|
25
|
+
"specr (0.0.1)",
|
26
|
+
"array-frequency (1.0.0)"
|
27
|
+
],
|
28
|
+
"most_downloaded"=> [
|
29
|
+
"rake-0.9.2.2 (7,128)",
|
30
|
+
"mime-types-1.19 (5,331)",
|
31
|
+
"tilt-1.3.3 (5,146)",
|
32
|
+
"rack-1.4.1 (5,124)",
|
33
|
+
"multi_json-1.3.6 (5,093)"
|
34
|
+
],
|
35
|
+
"just_updated"=>[
|
36
|
+
"wombat (2.0.0)",
|
37
|
+
"pdf-reader-turtletext (0.2.1)",
|
38
|
+
"minitest-reporters (0.10.0)",
|
39
|
+
"cloudprint (0.1.3)",
|
40
|
+
"greenletters (0.2.0)"
|
41
|
+
]
|
42
|
+
}
|
43
|
+
}
|
44
|
+
=end
|
data/examples/no_class.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat'
|
3
|
+
|
4
|
+
data = Wombat.crawl do
|
5
|
+
base_url "http://www.github.com"
|
6
|
+
path "/"
|
7
|
+
|
8
|
+
headline "xpath=//h1"
|
9
|
+
what_is "css=.column.secondary p", :html
|
10
|
+
|
11
|
+
explore "xpath=//ul/li[2]/a" do |e|
|
12
|
+
e.gsub(/Explore/, "LOVE")
|
13
|
+
end
|
14
|
+
|
15
|
+
benefits do
|
16
|
+
team_mgmt "css=.column.leftmost h3"
|
17
|
+
code_review "css=.column.leftmid h3"
|
18
|
+
hosting "css=.column.rightmid h3"
|
19
|
+
collaboration "css=.column.rightmost h3"
|
20
|
+
|
21
|
+
links do
|
22
|
+
team_mgmt "xpath=//div[@class='column leftmost']//a/@href"
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
=begin
|
28
|
+
pp data
|
29
|
+
{
|
30
|
+
"headline"=>"1,900,094\n people hosting over\n 3,371,168\n repositories",
|
31
|
+
"what_is"=>"GitHub is the best way to collaborate with others. Fork, send pull requests and manage all your <strong>public</strong> and <strong>private</strong> git repositories.",
|
32
|
+
"explore"=>"LOVE GitHub",
|
33
|
+
"benefits"=> {
|
34
|
+
"team_mgmt"=>"Team management",
|
35
|
+
"code_review"=>"Code review",
|
36
|
+
"hosting"=>"Reliable code hosting",
|
37
|
+
"collaboration"=>"Open source collaboration",
|
38
|
+
"links"=>{"team_mgmt"=>"/features/projects/collaboration"}
|
39
|
+
}
|
40
|
+
}
|
41
|
+
=end
|
data/examples/xml.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat'
|
3
|
+
|
4
|
+
class XmlCrawler
|
5
|
+
include Wombat::Crawler
|
6
|
+
|
7
|
+
base_url "http://ws.audioscrobbler.com"
|
8
|
+
path "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=ENV['API_KEY']"
|
9
|
+
|
10
|
+
document_format :xml
|
11
|
+
|
12
|
+
title "xpath=//event/title"
|
13
|
+
|
14
|
+
locations 'xpath=//event', :iterator do
|
15
|
+
latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
|
16
|
+
longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
=begin
|
21
|
+
pp XmlCrawler.new.crawl
|
22
|
+
|
23
|
+
{
|
24
|
+
"title"=>"Sinéad O'Connor",
|
25
|
+
"locations"=>[
|
26
|
+
{"latitude"=>"37.807717", "longitude"=>"-122.270059"},
|
27
|
+
{"latitude"=>"37.76213", "longitude"=>"-122.419032"},
|
28
|
+
{"latitude"=>"37.771491", "longitude"=>"-122.413241"},
|
29
|
+
{"latitude"=>"37.776227", "longitude"=>"-122.42044"},
|
30
|
+
{"latitude"=>"37.766588", "longitude"=>"-122.430391"},
|
31
|
+
{"latitude"=>"37.788978", "longitude"=>"-122.40664"},
|
32
|
+
{"latitude"=>"37.769715", "longitude"=>"-122.420427"},
|
33
|
+
{"latitude"=>"37.78832", "longitude"=>"-122.446692"},
|
34
|
+
{"latitude"=>"37.787583", "longitude"=>"-122.421665"},
|
35
|
+
{"latitude"=>"37.776227", "longitude"=>"-122.42044"}
|
36
|
+
]
|
37
|
+
}
|
38
|
+
=end
|
data/lib/wombat/property/locators/html.rb
CHANGED
@@ -1,14 +1,20 @@
|
|
1
1
|
#coding: utf-8
|
2
2
|
|
3
3
|
module Wombat
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
4
|
+
module Property
|
5
|
+
module Locators
|
6
|
+
class Html < Base
|
7
|
+
def locate(context, page = nil)
|
8
|
+
node = locate_nodes(context).first
|
9
|
+
value =
|
10
|
+
unless node
|
11
|
+
nil
|
12
|
+
else
|
13
|
+
node.inner_html.strip
|
14
|
+
end
|
15
|
+
super { value }
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
13
19
|
end
|
14
20
|
end
|
data/spec/integration/integration_spec.rb
CHANGED
@@ -189,7 +189,6 @@ describe 'basic crawler setup' do
|
|
189
189
|
crawler = Class.new
|
190
190
|
crawler.send(:include, Wombat::Crawler)
|
191
191
|
|
192
|
-
crawler.document_format :html
|
193
192
|
crawler.base_url "https://www.github.com"
|
194
193
|
crawler.path "/"
|
195
194
|
|
@@ -202,13 +201,13 @@ describe 'basic crawler setup' do
|
|
202
201
|
|
203
202
|
results.should == {
|
204
203
|
"github" => [
|
205
|
-
{ "heading"=>"GitHub helps people build software together."},
|
206
|
-
{ "heading"=>nil},
|
207
|
-
{ "heading"=>"Features"},
|
208
|
-
{ "heading"=>"Contact GitHub"},
|
209
|
-
{ "heading"=>"GitHub Training — Git Training from the Experts"},
|
210
|
-
{ "heading"=>"GitHub on Your Servers"},
|
211
|
-
{ "heading"=>"Loading..."}
|
204
|
+
{ "heading"=>"GitHub helps people build software together." },
|
205
|
+
{ "heading"=>nil },
|
206
|
+
{ "heading"=>"Features" },
|
207
|
+
{ "heading"=>"Contact GitHub" },
|
208
|
+
{ "heading"=>"GitHub Training — Git Training from the Experts" },
|
209
|
+
{ "heading"=>"GitHub on Your Servers" },
|
210
|
+
{ "heading"=>"Loading..." }
|
212
211
|
]
|
213
212
|
}
|
214
213
|
end
|
data/spec/property/locators/html_spec.rb
CHANGED
@@ -12,4 +12,15 @@ describe Wombat::Property::Locators::Html do
|
|
12
12
|
|
13
13
|
locator.locate(context).should == { "data1" => "Something cool" }
|
14
14
|
end
|
15
|
+
|
16
|
+
it 'should return null if the property cannot be found' do
|
17
|
+
fake_elem = double :element
|
18
|
+
context = double :context
|
19
|
+
context.stub(:xpath).with("/abc", nil).and_return []
|
20
|
+
property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :html)
|
21
|
+
|
22
|
+
locator = Wombat::Property::Locators::Html.new(property)
|
23
|
+
|
24
|
+
locator.locate(context).should == { "data1" => nil }
|
25
|
+
end
|
15
26
|
end
|
data/wombat.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "2.0.0"
|
8
|
+
s.version = "2.0.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-09-06"
|
13
13
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
|
14
14
|
s.email = "felipe.lima@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -27,6 +27,10 @@ Gem::Specification.new do |s|
|
|
27
27
|
"README.md",
|
28
28
|
"Rakefile",
|
29
29
|
"VERSION",
|
30
|
+
"examples/iterator.rb",
|
31
|
+
"examples/list.rb",
|
32
|
+
"examples/no_class.rb",
|
33
|
+
"examples/xml.rb",
|
30
34
|
"fixtures/vcr_cassettes/basic_crawler_page.yml",
|
31
35
|
"fixtures/vcr_cassettes/broken_selector.yml",
|
32
36
|
"fixtures/vcr_cassettes/error_page.yml",
|
@@ -66,7 +70,7 @@ Gem::Specification.new do |s|
|
|
66
70
|
"spec/wombat_spec.rb",
|
67
71
|
"wombat.gemspec"
|
68
72
|
]
|
69
|
-
s.homepage = "http://github.com/
|
73
|
+
s.homepage = "http://felipecsl.github.com/wombat"
|
70
74
|
s.licenses = ["MIT"]
|
71
75
|
s.require_paths = ["lib"]
|
72
76
|
s.required_ruby_version = Gem::Requirement.new(">= 1.9")
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.0
|
4
|
+
version: 2.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-09-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
@@ -189,6 +189,10 @@ files:
|
|
189
189
|
- README.md
|
190
190
|
- Rakefile
|
191
191
|
- VERSION
|
192
|
+
- examples/iterator.rb
|
193
|
+
- examples/list.rb
|
194
|
+
- examples/no_class.rb
|
195
|
+
- examples/xml.rb
|
192
196
|
- fixtures/vcr_cassettes/basic_crawler_page.yml
|
193
197
|
- fixtures/vcr_cassettes/broken_selector.yml
|
194
198
|
- fixtures/vcr_cassettes/error_page.yml
|
@@ -227,7 +231,7 @@ files:
|
|
227
231
|
- spec/spec_helper.rb
|
228
232
|
- spec/wombat_spec.rb
|
229
233
|
- wombat.gemspec
|
230
|
-
homepage: http://github.com/
|
234
|
+
homepage: http://felipecsl.github.com/wombat
|
231
235
|
licenses:
|
232
236
|
- MIT
|
233
237
|
post_install_message:
|