wombat 2.0.0 → 2.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +5 -9
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/examples/iterator.rb +46 -0
- data/examples/list.rb +44 -0
- data/examples/no_class.rb +41 -0
- data/examples/xml.rb +38 -0
- data/lib/wombat/processing/parser.rb +4 -0
- data/lib/wombat/property/locators/html.rb +15 -9
- data/spec/integration/integration_spec.rb +7 -8
- data/spec/property/locators/html_spec.rb +11 -0
- data/wombat.gemspec +7 -3
- metadata +7 -3
data/README.md
CHANGED
@@ -19,10 +19,6 @@ Obs: Requires ruby 1.9
|
|
19
19
|
The simplest way to use Wombat is by calling ``Wombat.crawl`` and passing it a block:
|
20
20
|
|
21
21
|
```ruby
|
22
|
-
|
23
|
-
# => github_scraper.rb
|
24
|
-
|
25
|
-
#coding: utf-8
|
26
22
|
require 'wombat'
|
27
23
|
|
28
24
|
Wombat.crawl do
|
@@ -30,8 +26,8 @@ Wombat.crawl do
|
|
30
26
|
path "/"
|
31
27
|
|
32
28
|
headline "xpath=//h1"
|
33
|
-
|
34
29
|
what_is "css=.column.secondary p", :html
|
30
|
+
repositories "css=a.repo", :list
|
35
31
|
|
36
32
|
explore "xpath=//ul/li[2]/a" do |e|
|
37
33
|
e.gsub(/Explore/, "LOVE")
|
@@ -39,7 +35,7 @@ Wombat.crawl do
|
|
39
35
|
|
40
36
|
benefits do
|
41
37
|
first_benefit "css=.column.leftmost h3"
|
42
|
-
|
38
|
+
second_benefit "css=.column.leftmid h3"
|
43
39
|
third_benefit "css=.column.rightmid h3"
|
44
40
|
fourth_benefit "css=.column.rightmost h3"
|
45
41
|
end
|
@@ -53,6 +49,7 @@ end
|
|
53
49
|
"headline" => "1,316,633 people hosting over 3,951,378 git repositories",
|
54
50
|
"what_is" => "GitHub is the best way to collaborate with others. Fork, send pull requests and manage all your <strong>public</strong> and <strong>private</strong> git repositories.",
|
55
51
|
"explore" => "LOVE GitHub",
|
52
|
+
"repositories" => ["jQuery", "reddit", "Sparkle", "curl", "Ruby on Rails", "node.js", "ClickToFlash", "Erlang/OTP", "CakePHP", "Redis"]
|
56
53
|
"benefits" => {
|
57
54
|
"first_benefit" => "Team management",
|
58
55
|
"second_benefit" => "Code review",
|
@@ -63,7 +60,7 @@ end
|
|
63
60
|
```
|
64
61
|
|
65
62
|
### This is just a sneak peek of what Wombat can do. For the complete documentation, please check the [project Wiki](http://github.com/felipecsl/wombat/wiki).
|
66
|
-
### [API Documentation](http://rubydoc.info/gems/wombat/
|
63
|
+
### [API Documentation](http://rubydoc.info/gems/wombat/2.0.0/frames)
|
67
64
|
### [Changelog](https://github.com/felipecsl/wombat/wiki/Changelog)
|
68
65
|
|
69
66
|
|
@@ -80,8 +77,7 @@ end
|
|
80
77
|
## Contributors
|
81
78
|
|
82
79
|
* Felipe Lima ([@felipecsl](https://github.com/felipecsl))
|
83
|
-
*
|
84
|
-
* [@sigi](https://github.com/sigi)
|
80
|
+
* [List of all contributors](https://github.com/felipecsl/wombat/wiki/Contributors)
|
85
81
|
|
86
82
|
## Copyright
|
87
83
|
|
data/Rakefile
CHANGED
@@ -10,7 +10,7 @@ require 'yard'
|
|
10
10
|
Jeweler::Tasks.new do |gem|
|
11
11
|
# gem is a Gem::Specification... see http://docs.rubygems.org/read/chapter/20 for more options
|
12
12
|
gem.name = "wombat"
|
13
|
-
gem.homepage = "http://github.com/
|
13
|
+
gem.homepage = "http://felipecsl.github.com/wombat"
|
14
14
|
gem.license = "MIT"
|
15
15
|
gem.summary = %Q{Ruby DSL to scrape web pages}
|
16
16
|
gem.description = %Q{Generic Web crawler with a DSL that parses structured data from web pages}
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
2.0.0
|
1
|
+
2.0.1
|
data/examples/iterator.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat'
|
3
|
+
|
4
|
+
class IteratorCrawler
|
5
|
+
include Wombat::Crawler
|
6
|
+
|
7
|
+
base_url "https://www.github.com"
|
8
|
+
path "/explore"
|
9
|
+
|
10
|
+
repos "css=ol.ranked-repositories>li", :iterator do
|
11
|
+
repo 'css=h3'
|
12
|
+
description 'css=p.description'
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
=begin
|
17
|
+
p IteratorCrawler.new.crawl
|
18
|
+
{"repos"=>
|
19
|
+
[
|
20
|
+
{
|
21
|
+
"repo"=>"bernii / gauge.js",
|
22
|
+
"description"=>"100% native and cool looking JavaScript gauge"
|
23
|
+
},
|
24
|
+
{
|
25
|
+
"repo"=>"ZeitOnline / briefkasten",
|
26
|
+
"description"=>"a reasonably secure web application for submitting content anonymously"
|
27
|
+
},
|
28
|
+
{
|
29
|
+
"repo"=>"nothingmagical / cheddar-ios",
|
30
|
+
"description"=>"Cheddar for iOS"
|
31
|
+
},
|
32
|
+
{
|
33
|
+
"repo"=>"nathanmarz / storm-mesos",
|
34
|
+
"description"=>"Run Storm on top of the Mesos cluster resource manager"
|
35
|
+
},
|
36
|
+
{
|
37
|
+
"repo"=>"Netflix / SimianArmy",
|
38
|
+
"description"=>"Tools for keeping your cloud operating in top form. Chaos Monkey is a resiliency tool that helps ..."
|
39
|
+
},
|
40
|
+
{
|
41
|
+
"repo"=>nil,
|
42
|
+
"description"=>nil
|
43
|
+
}
|
44
|
+
]
|
45
|
+
}
|
46
|
+
=end
|
data/examples/list.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat'
|
3
|
+
|
4
|
+
class ListCrawler
|
5
|
+
include Wombat::Crawler
|
6
|
+
|
7
|
+
base_url "http://www.rubygems.org"
|
8
|
+
path "/"
|
9
|
+
|
10
|
+
gems do
|
11
|
+
new "css=#new_gems li", :list
|
12
|
+
most_downloaded "css=#most_downloaded li", :list
|
13
|
+
just_updated "css=#just_updated li", :list
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
=begin
|
18
|
+
pp ListCrawler.new.crawl
|
19
|
+
{
|
20
|
+
"gems"=>{
|
21
|
+
"new"=>[
|
22
|
+
"buffer (0.0.1)",
|
23
|
+
"resque-telework (0.2.0)",
|
24
|
+
"my_string_extend_lyk (0.0.1)",
|
25
|
+
"specr (0.0.1)",
|
26
|
+
"array-frequency (1.0.0)"
|
27
|
+
],
|
28
|
+
"most_downloaded"=> [
|
29
|
+
"rake-0.9.2.2 (7,128)",
|
30
|
+
"mime-types-1.19 (5,331)",
|
31
|
+
"tilt-1.3.3 (5,146)",
|
32
|
+
"rack-1.4.1 (5,124)",
|
33
|
+
"multi_json-1.3.6 (5,093)"
|
34
|
+
],
|
35
|
+
"just_updated"=>[
|
36
|
+
"wombat (2.0.0)",
|
37
|
+
"pdf-reader-turtletext (0.2.1)",
|
38
|
+
"minitest-reporters (0.10.0)",
|
39
|
+
"cloudprint (0.1.3)",
|
40
|
+
"greenletters (0.2.0)"
|
41
|
+
]
|
42
|
+
}
|
43
|
+
}
|
44
|
+
=end
|
data/examples/no_class.rb
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat'
|
3
|
+
|
4
|
+
data = Wombat.crawl do
|
5
|
+
base_url "http://www.github.com"
|
6
|
+
path "/"
|
7
|
+
|
8
|
+
headline "xpath=//h1"
|
9
|
+
what_is "css=.column.secondary p", :html
|
10
|
+
|
11
|
+
explore "xpath=//ul/li[2]/a" do |e|
|
12
|
+
e.gsub(/Explore/, "LOVE")
|
13
|
+
end
|
14
|
+
|
15
|
+
benefits do
|
16
|
+
team_mgmt "css=.column.leftmost h3"
|
17
|
+
code_review "css=.column.leftmid h3"
|
18
|
+
hosting "css=.column.rightmid h3"
|
19
|
+
collaboration "css=.column.rightmost h3"
|
20
|
+
|
21
|
+
links do
|
22
|
+
team_mgmt "xpath=//div[@class='column leftmost']//a/@href"
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
=begin
|
28
|
+
pp data
|
29
|
+
{
|
30
|
+
"headline"=>"1,900,094\n people hosting over\n 3,371,168\n repositories",
|
31
|
+
"what_is"=>"GitHub is the best way to collaborate with others. Fork, send pull requests and manage all your <strong>public</strong> and <strong>private</strong> git repositories.",
|
32
|
+
"explore"=>"LOVE GitHub",
|
33
|
+
"benefits"=> {
|
34
|
+
"team_mgmt"=>"Team management",
|
35
|
+
"code_review"=>"Code review",
|
36
|
+
"hosting"=>"Reliable code hosting",
|
37
|
+
"collaboration"=>"Open source collaboration",
|
38
|
+
"links"=>{"team_mgmt"=>"/features/projects/collaboration"}
|
39
|
+
}
|
40
|
+
}
|
41
|
+
=end
|
data/examples/xml.rb
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
#coding: utf-8
|
2
|
+
require 'wombat'
|
3
|
+
|
4
|
+
class XmlCrawler
|
5
|
+
include Wombat::Crawler
|
6
|
+
|
7
|
+
base_url "http://ws.audioscrobbler.com"
|
8
|
+
path "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=ENV['API_KEY']"
|
9
|
+
|
10
|
+
document_format :xml
|
11
|
+
|
12
|
+
title "xpath=//event/title"
|
13
|
+
|
14
|
+
locations 'xpath=//event', :iterator do
|
15
|
+
latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
|
16
|
+
longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
=begin
|
21
|
+
pp XmlCrawler.new.crawl
|
22
|
+
|
23
|
+
{
|
24
|
+
"title"=>"Sinéad O'Connor",
|
25
|
+
"locations"=>[
|
26
|
+
{"latitude"=>"37.807717", "longitude"=>"-122.270059"},
|
27
|
+
{"latitude"=>"37.76213", "longitude"=>"-122.419032"},
|
28
|
+
{"latitude"=>"37.771491", "longitude"=>"-122.413241"},
|
29
|
+
{"latitude"=>"37.776227", "longitude"=>"-122.42044"},
|
30
|
+
{"latitude"=>"37.766588", "longitude"=>"-122.430391"},
|
31
|
+
{"latitude"=>"37.788978", "longitude"=>"-122.40664"},
|
32
|
+
{"latitude"=>"37.769715", "longitude"=>"-122.420427"},
|
33
|
+
{"latitude"=>"37.78832", "longitude"=>"-122.446692"},
|
34
|
+
{"latitude"=>"37.787583", "longitude"=>"-122.421665"},
|
35
|
+
{"latitude"=>"37.776227", "longitude"=>"-122.42044"}
|
36
|
+
]
|
37
|
+
}
|
38
|
+
=end
|
data/lib/wombat/property/locators/html.rb
CHANGED
@@ -1,14 +1,20 @@
|
|
1
1
|
#coding: utf-8
|
2
2
|
|
3
3
|
module Wombat
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
4
|
+
module Property
|
5
|
+
module Locators
|
6
|
+
class Html < Base
|
7
|
+
def locate(context, page = nil)
|
8
|
+
node = locate_nodes(context).first
|
9
|
+
value =
|
10
|
+
unless node
|
11
|
+
nil
|
12
|
+
else
|
13
|
+
node.inner_html.strip
|
14
|
+
end
|
15
|
+
super { value }
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
13
19
|
end
|
14
20
|
end
|
data/spec/integration/integration_spec.rb
CHANGED
@@ -189,7 +189,6 @@ describe 'basic crawler setup' do
|
|
189
189
|
crawler = Class.new
|
190
190
|
crawler.send(:include, Wombat::Crawler)
|
191
191
|
|
192
|
-
crawler.document_format :html
|
193
192
|
crawler.base_url "https://www.github.com"
|
194
193
|
crawler.path "/"
|
195
194
|
|
@@ -202,13 +201,13 @@ describe 'basic crawler setup' do
|
|
202
201
|
|
203
202
|
results.should == {
|
204
203
|
"github" => [
|
205
|
-
{ "heading"=>"GitHub helps people build software together."},
|
206
|
-
{ "heading"=>nil},
|
207
|
-
{ "heading"=>"Features"},
|
208
|
-
{ "heading"=>"Contact GitHub"},
|
209
|
-
{ "heading"=>"GitHub Training — Git Training from the Experts"},
|
210
|
-
{ "heading"=>"GitHub on Your Servers"},
|
211
|
-
{ "heading"=>"Loading..."}
|
204
|
+
{ "heading"=>"GitHub helps people build software together." },
|
205
|
+
{ "heading"=>nil },
|
206
|
+
{ "heading"=>"Features" },
|
207
|
+
{ "heading"=>"Contact GitHub" },
|
208
|
+
{ "heading"=>"GitHub Training — Git Training from the Experts" },
|
209
|
+
{ "heading"=>"GitHub on Your Servers" },
|
210
|
+
{ "heading"=>"Loading..." }
|
212
211
|
]
|
213
212
|
}
|
214
213
|
end
|
data/spec/property/locators/html_spec.rb
CHANGED
@@ -12,4 +12,15 @@ describe Wombat::Property::Locators::Html do
|
|
12
12
|
|
13
13
|
locator.locate(context).should == { "data1" => "Something cool" }
|
14
14
|
end
|
15
|
+
|
16
|
+
it 'should return null if the property cannot be found' do
|
17
|
+
fake_elem = double :element
|
18
|
+
context = double :context
|
19
|
+
context.stub(:xpath).with("/abc", nil).and_return []
|
20
|
+
property = Wombat::DSL::Property.new('data1', 'xpath=/abc', :html)
|
21
|
+
|
22
|
+
locator = Wombat::Property::Locators::Html.new(property)
|
23
|
+
|
24
|
+
locator.locate(context).should == { "data1" => nil }
|
25
|
+
end
|
15
26
|
end
|
data/wombat.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "2.0.0"
|
8
|
+
s.version = "2.0.1"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-09-06"
|
13
13
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
|
14
14
|
s.email = "felipe.lima@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -27,6 +27,10 @@ Gem::Specification.new do |s|
|
|
27
27
|
"README.md",
|
28
28
|
"Rakefile",
|
29
29
|
"VERSION",
|
30
|
+
"examples/iterator.rb",
|
31
|
+
"examples/list.rb",
|
32
|
+
"examples/no_class.rb",
|
33
|
+
"examples/xml.rb",
|
30
34
|
"fixtures/vcr_cassettes/basic_crawler_page.yml",
|
31
35
|
"fixtures/vcr_cassettes/broken_selector.yml",
|
32
36
|
"fixtures/vcr_cassettes/error_page.yml",
|
@@ -66,7 +70,7 @@ Gem::Specification.new do |s|
|
|
66
70
|
"spec/wombat_spec.rb",
|
67
71
|
"wombat.gemspec"
|
68
72
|
]
|
69
|
-
s.homepage = "http://github.com/
|
73
|
+
s.homepage = "http://felipecsl.github.com/wombat"
|
70
74
|
s.licenses = ["MIT"]
|
71
75
|
s.require_paths = ["lib"]
|
72
76
|
s.required_ruby_version = Gem::Requirement.new(">= 1.9")
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.0
|
4
|
+
version: 2.0.1
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-09-06 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
@@ -189,6 +189,10 @@ files:
|
|
189
189
|
- README.md
|
190
190
|
- Rakefile
|
191
191
|
- VERSION
|
192
|
+
- examples/iterator.rb
|
193
|
+
- examples/list.rb
|
194
|
+
- examples/no_class.rb
|
195
|
+
- examples/xml.rb
|
192
196
|
- fixtures/vcr_cassettes/basic_crawler_page.yml
|
193
197
|
- fixtures/vcr_cassettes/broken_selector.yml
|
194
198
|
- fixtures/vcr_cassettes/error_page.yml
|
@@ -227,7 +231,7 @@ files:
|
|
227
231
|
- spec/spec_helper.rb
|
228
232
|
- spec/wombat_spec.rb
|
229
233
|
- wombat.gemspec
|
230
|
-
homepage: http://github.com/
|
234
|
+
homepage: http://felipecsl.github.com/wombat
|
231
235
|
licenses:
|
232
236
|
- MIT
|
233
237
|
post_install_message:
|