wombat 0.2.4 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile.lock +4 -0
- data/README.md +1 -1
- data/Rakefile +2 -1
- data/VERSION +1 -1
- data/lib/wombat/crawler.rb +2 -14
- data/lib/wombat/metadata.rb +5 -3
- data/lib/wombat/parser.rb +5 -2
- data/spec/crawler_spec.rb +1 -1
- data/spec/integration/integration_spec.rb +4 -4
- data/spec/parser_spec.rb +5 -1
- data/wombat.gemspec +4 -3
- metadata +75 -25
data/Gemfile.lock
CHANGED
@@ -18,11 +18,14 @@ GEM
|
|
18
18
|
nokogiri (~> 1.4)
|
19
19
|
ntlm-http (~> 0.1, >= 0.1.1)
|
20
20
|
webrobots (~> 0.0, >= 0.0.9)
|
21
|
+
mime-types (1.17.2)
|
21
22
|
net-http-digest_auth (1.2)
|
22
23
|
net-http-persistent (2.3.3)
|
23
24
|
nokogiri (1.5.0)
|
24
25
|
ntlm-http (0.1.1)
|
25
26
|
rake (0.9.2.2)
|
27
|
+
rest-client (1.6.7)
|
28
|
+
mime-types (>= 1.16)
|
26
29
|
rspec (2.7.0)
|
27
30
|
rspec-core (~> 2.7.0)
|
28
31
|
rspec-expectations (~> 2.7.0)
|
@@ -49,6 +52,7 @@ DEPENDENCIES
|
|
49
52
|
jeweler
|
50
53
|
mechanize
|
51
54
|
rake
|
55
|
+
rest-client
|
52
56
|
rspec
|
53
57
|
vcr (= 2.0.0.rc1)
|
54
58
|
yard
|
data/README.md
CHANGED
@@ -63,7 +63,7 @@ my_crawler.crawl
|
|
63
63
|
}
|
64
64
|
```
|
65
65
|
|
66
|
-
### For
|
66
|
+
### For the documentation, please check the project [Wiki](http://github.com/felipecsl/wombat/wiki).
|
67
67
|
|
68
68
|
|
69
69
|
## Contributing to Wombat
|
data/Rakefile
CHANGED
@@ -16,6 +16,7 @@ Jeweler::Tasks.new do |gem|
|
|
16
16
|
gem.description = %Q{Generic Web crawler with a DSL that parses structured data from web pages}
|
17
17
|
gem.email = "felipe.lima@gmail.com"
|
18
18
|
gem.authors = ["Felipe Lima"]
|
19
|
+
gem.required_ruby_version = ">= 1.9"
|
19
20
|
# dependencies defined in Gemfile
|
20
21
|
end
|
21
22
|
|
@@ -26,4 +27,4 @@ RSpec::Core::RakeTask.new(:spec)
|
|
26
27
|
task :test => :spec
|
27
28
|
task :default => :spec
|
28
29
|
|
29
|
-
YARD::Rake::YardocTask.new
|
30
|
+
YARD::Rake::YardocTask.new
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.4
|
1
|
+
0.2.5
|
data/lib/wombat/crawler.rb
CHANGED
@@ -10,13 +10,8 @@ module Wombat
|
|
10
10
|
include Parser
|
11
11
|
extend ActiveSupport::Concern
|
12
12
|
|
13
|
-
|
14
|
-
|
15
|
-
parse self.class.send(:metadata)
|
16
|
-
end
|
17
|
-
|
18
|
-
def supports_city?
|
19
|
-
end
|
13
|
+
def crawl
|
14
|
+
parse self.class.send(:metadata)
|
20
15
|
end
|
21
16
|
|
22
17
|
module ClassMethods
|
@@ -28,17 +23,10 @@ module Wombat
|
|
28
23
|
metadata.for_each(selector).instance_eval(&block) if block
|
29
24
|
end
|
30
25
|
|
31
|
-
def format type
|
32
|
-
metadata.document_format = type
|
33
|
-
end
|
34
|
-
|
35
26
|
def follow_links selector
|
36
27
|
|
37
28
|
end
|
38
29
|
|
39
|
-
def supported_cities
|
40
|
-
end
|
41
|
-
|
42
30
|
def to_ary
|
43
31
|
end
|
44
32
|
|
data/lib/wombat/metadata.rb
CHANGED
@@ -4,10 +4,8 @@ require 'wombat/iterator'
|
|
4
4
|
|
5
5
|
module Wombat
|
6
6
|
class Metadata < PropertyContainer
|
7
|
-
attr_accessor :document_format
|
8
|
-
|
9
7
|
def initialize
|
10
|
-
|
8
|
+
self[:format] = :html
|
11
9
|
super
|
12
10
|
end
|
13
11
|
|
@@ -18,5 +16,9 @@ module Wombat
|
|
18
16
|
def list_page url
|
19
17
|
self[:list_page] = url
|
20
18
|
end
|
19
|
+
|
20
|
+
def format format
|
21
|
+
self[:format] = format
|
22
|
+
end
|
21
23
|
end
|
22
24
|
end
|
data/lib/wombat/parser.rb
CHANGED
@@ -22,7 +22,10 @@ module Wombat
|
|
22
22
|
it.all_properties.each do |p|
|
23
23
|
p.result ||= []
|
24
24
|
result = locate(p)
|
25
|
-
|
25
|
+
if result
|
26
|
+
result = p.callback ? p.callback.call(result) : result
|
27
|
+
p.result << result
|
28
|
+
end
|
26
29
|
end
|
27
30
|
end
|
28
31
|
end
|
@@ -41,7 +44,7 @@ module Wombat
|
|
41
44
|
def get_parser metadata
|
42
45
|
url = "#{metadata[:base_url]}#{metadata[:list_page]}"
|
43
46
|
|
44
|
-
if metadata
|
47
|
+
if metadata[:format] == :html
|
45
48
|
@mechanize.get(url).parser
|
46
49
|
else
|
47
50
|
Nokogiri::XML RestClient.get(url)
|
data/spec/crawler_spec.rb
CHANGED
@@ -114,7 +114,7 @@ describe Wombat::Crawler do
|
|
114
114
|
|
115
115
|
it 'should assign metadata forma' do
|
116
116
|
@crawler_instance.should_receive(:parse) do |arg|
|
117
|
-
arg.document_format.should == :xml
|
117
|
+
arg[:format].should == :xml
|
118
118
|
end
|
119
119
|
@crawler.format :xml
|
120
120
|
@crawler_instance.crawl
|
data/spec/integration/integration_spec.rb
CHANGED
@@ -44,7 +44,7 @@ describe 'basic crawler setup' do
|
|
44
44
|
|
45
45
|
crawler.for_each "css=ol.ranked-repositories li" do
|
46
46
|
repo 'css=h3'
|
47
|
-
description
|
47
|
+
description('css=p.description') { |d| d.gsub(/for/, '') }
|
48
48
|
end
|
49
49
|
|
50
50
|
crawler_instance = crawler.new
|
@@ -52,11 +52,11 @@ describe 'basic crawler setup' do
|
|
52
52
|
|
53
53
|
results["repo"].should =~ ["jairajs89 / Touchy.js", "mcavage / node-restify", "notlion / streetview-stereographic", "twitter / bootstrap", "stolksdorf / Parallaxjs"]
|
54
54
|
results["description"].should =~ [
|
55
|
-
"node.js REST framework specifically meant for web service APIs",
|
56
|
-
"A simple light-weight JavaScript library for dealing with touch events",
|
55
|
+
"node.js REST framework specifically meant web service APIs",
|
56
|
+
"A simple light-weight JavaScript library dealing with touch events",
|
57
57
|
"Shader Toy + Google Map + Panoramic Explorer",
|
58
58
|
"HTML, CSS, and JS toolkit from Twitter",
|
59
|
-
"a Library for Javascript that allows easy page parallaxing"
|
59
|
+
"a Library Javascript that allows easy page parallaxing"
|
60
60
|
]
|
61
61
|
end
|
62
62
|
end
|
data/spec/parser_spec.rb
CHANGED
@@ -72,6 +72,10 @@ describe Wombat::Parser do
|
|
72
72
|
block_called.should be_true
|
73
73
|
end
|
74
74
|
|
75
|
+
it 'should invoke callback inside for_each block' do
|
76
|
+
|
77
|
+
end
|
78
|
+
|
75
79
|
it 'should return hash with requested properties' do
|
76
80
|
hash = double :results
|
77
81
|
fake_parser = double :parser
|
@@ -140,7 +144,7 @@ describe Wombat::Parser do
|
|
140
144
|
it 'should correctly parse xml documents' do
|
141
145
|
fake_document = double :xml
|
142
146
|
fake_parser = double :parser
|
143
|
-
@metadata.document_format = :xml
|
147
|
+
@metadata.format :xml
|
144
148
|
@parser.mechanize.should_not_receive(:get)
|
145
149
|
RestClient.should_receive(:get).and_return fake_document
|
146
150
|
Nokogiri.should_receive(:XML).with(fake_document).and_return fake_parser
|
data/wombat.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "0.2.4"
|
8
|
+
s.version = "0.2.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-03-21"
|
13
13
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
|
14
14
|
s.email = "felipe.lima@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -54,7 +54,8 @@ Gem::Specification.new do |s|
|
|
54
54
|
s.homepage = "http://github.com/felipecsl/wombat"
|
55
55
|
s.licenses = ["MIT"]
|
56
56
|
s.require_paths = ["lib"]
|
57
|
-
s.
|
57
|
+
s.required_ruby_version = Gem::Requirement.new(">= 1.9")
|
58
|
+
s.rubygems_version = "1.8.18"
|
58
59
|
s.summary = "Ruby DSL to crawl web pages"
|
59
60
|
|
60
61
|
if s.respond_to? :specification_version then
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.2.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-03-21 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
16
|
-
requirement:
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,15 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
25
30
|
- !ruby/object:Gem::Dependency
|
26
31
|
name: activesupport
|
27
|
-
requirement:
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
28
33
|
none: false
|
29
34
|
requirements:
|
30
35
|
- - ! '>='
|
@@ -32,10 +37,15 @@ dependencies:
|
|
32
37
|
version: '0'
|
33
38
|
type: :runtime
|
34
39
|
prerelease: false
|
35
|
-
version_requirements:
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
36
46
|
- !ruby/object:Gem::Dependency
|
37
47
|
name: rest-client
|
38
|
-
requirement:
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
39
49
|
none: false
|
40
50
|
requirements:
|
41
51
|
- - ! '>='
|
@@ -43,10 +53,15 @@ dependencies:
|
|
43
53
|
version: '0'
|
44
54
|
type: :runtime
|
45
55
|
prerelease: false
|
46
|
-
version_requirements:
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
47
62
|
- !ruby/object:Gem::Dependency
|
48
63
|
name: bundler
|
49
|
-
requirement:
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
50
65
|
none: false
|
51
66
|
requirements:
|
52
67
|
- - ! '>='
|
@@ -54,10 +69,15 @@ dependencies:
|
|
54
69
|
version: '0'
|
55
70
|
type: :development
|
56
71
|
prerelease: false
|
57
|
-
version_requirements:
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
58
78
|
- !ruby/object:Gem::Dependency
|
59
79
|
name: rake
|
60
|
-
requirement:
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
61
81
|
none: false
|
62
82
|
requirements:
|
63
83
|
- - ! '>='
|
@@ -65,10 +85,15 @@ dependencies:
|
|
65
85
|
version: '0'
|
66
86
|
type: :development
|
67
87
|
prerelease: false
|
68
|
-
version_requirements:
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
69
94
|
- !ruby/object:Gem::Dependency
|
70
95
|
name: yard
|
71
|
-
requirement:
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
72
97
|
none: false
|
73
98
|
requirements:
|
74
99
|
- - ! '>='
|
@@ -76,10 +101,15 @@ dependencies:
|
|
76
101
|
version: '0'
|
77
102
|
type: :development
|
78
103
|
prerelease: false
|
79
|
-
version_requirements:
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
80
110
|
- !ruby/object:Gem::Dependency
|
81
111
|
name: jeweler
|
82
|
-
requirement:
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
83
113
|
none: false
|
84
114
|
requirements:
|
85
115
|
- - ! '>='
|
@@ -87,10 +117,15 @@ dependencies:
|
|
87
117
|
version: '0'
|
88
118
|
type: :development
|
89
119
|
prerelease: false
|
90
|
-
version_requirements:
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
91
126
|
- !ruby/object:Gem::Dependency
|
92
127
|
name: rspec
|
93
|
-
requirement:
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
94
129
|
none: false
|
95
130
|
requirements:
|
96
131
|
- - ! '>='
|
@@ -98,21 +133,31 @@ dependencies:
|
|
98
133
|
version: '0'
|
99
134
|
type: :development
|
100
135
|
prerelease: false
|
101
|
-
version_requirements:
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
102
142
|
- !ruby/object:Gem::Dependency
|
103
143
|
name: vcr
|
104
|
-
requirement:
|
144
|
+
requirement: !ruby/object:Gem::Requirement
|
105
145
|
none: false
|
106
146
|
requirements:
|
107
|
-
- - =
|
147
|
+
- - '='
|
108
148
|
- !ruby/object:Gem::Version
|
109
149
|
version: 2.0.0.rc1
|
110
150
|
type: :development
|
111
151
|
prerelease: false
|
112
|
-
version_requirements:
|
152
|
+
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - '='
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: 2.0.0.rc1
|
113
158
|
- !ruby/object:Gem::Dependency
|
114
159
|
name: fakeweb
|
115
|
-
requirement:
|
160
|
+
requirement: !ruby/object:Gem::Requirement
|
116
161
|
none: false
|
117
162
|
requirements:
|
118
163
|
- - ! '>='
|
@@ -120,7 +165,12 @@ dependencies:
|
|
120
165
|
version: '0'
|
121
166
|
type: :development
|
122
167
|
prerelease: false
|
123
|
-
version_requirements:
|
168
|
+
version_requirements: !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
170
|
+
requirements:
|
171
|
+
- - ! '>='
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '0'
|
124
174
|
description: Generic Web crawler with a DSL that parses structured data from web pages
|
125
175
|
email: felipe.lima@gmail.com
|
126
176
|
executables: []
|
@@ -174,7 +224,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
174
224
|
requirements:
|
175
225
|
- - ! '>='
|
176
226
|
- !ruby/object:Gem::Version
|
177
|
-
version: '0'
|
227
|
+
version: '1.9'
|
178
228
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
179
229
|
none: false
|
180
230
|
requirements:
|
@@ -183,7 +233,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
183
233
|
version: '0'
|
184
234
|
requirements: []
|
185
235
|
rubyforge_project:
|
186
|
-
rubygems_version: 1.8.
|
236
|
+
rubygems_version: 1.8.18
|
187
237
|
signing_key:
|
188
238
|
specification_version: 3
|
189
239
|
summary: Ruby DSL to crawl web pages
|