wombat 0.2.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +4 -0
- data/README.md +1 -1
- data/Rakefile +2 -1
- data/VERSION +1 -1
- data/lib/wombat/crawler.rb +2 -14
- data/lib/wombat/metadata.rb +5 -3
- data/lib/wombat/parser.rb +5 -2
- data/spec/crawler_spec.rb +1 -1
- data/spec/integration/integration_spec.rb +4 -4
- data/spec/parser_spec.rb +5 -1
- data/wombat.gemspec +4 -3
- metadata +75 -25
data/Gemfile.lock
CHANGED
@@ -18,11 +18,14 @@ GEM
|
|
18
18
|
nokogiri (~> 1.4)
|
19
19
|
ntlm-http (~> 0.1, >= 0.1.1)
|
20
20
|
webrobots (~> 0.0, >= 0.0.9)
|
21
|
+
mime-types (1.17.2)
|
21
22
|
net-http-digest_auth (1.2)
|
22
23
|
net-http-persistent (2.3.3)
|
23
24
|
nokogiri (1.5.0)
|
24
25
|
ntlm-http (0.1.1)
|
25
26
|
rake (0.9.2.2)
|
27
|
+
rest-client (1.6.7)
|
28
|
+
mime-types (>= 1.16)
|
26
29
|
rspec (2.7.0)
|
27
30
|
rspec-core (~> 2.7.0)
|
28
31
|
rspec-expectations (~> 2.7.0)
|
@@ -49,6 +52,7 @@ DEPENDENCIES
|
|
49
52
|
jeweler
|
50
53
|
mechanize
|
51
54
|
rake
|
55
|
+
rest-client
|
52
56
|
rspec
|
53
57
|
vcr (= 2.0.0.rc1)
|
54
58
|
yard
|
data/README.md
CHANGED
@@ -63,7 +63,7 @@ my_crawler.crawl
|
|
63
63
|
}
|
64
64
|
```
|
65
65
|
|
66
|
-
### For
|
66
|
+
### For the documentation, please check the project [Wiki](http://github.com/felipecsl/wombat/wiki).
|
67
67
|
|
68
68
|
|
69
69
|
## Contributing to Wombat
|
data/Rakefile
CHANGED
@@ -16,6 +16,7 @@ Jeweler::Tasks.new do |gem|
|
|
16
16
|
gem.description = %Q{Generic Web crawler with a DSL that parses structured data from web pages}
|
17
17
|
gem.email = "felipe.lima@gmail.com"
|
18
18
|
gem.authors = ["Felipe Lima"]
|
19
|
+
gem.required_ruby_version = ">= 1.9"
|
19
20
|
# dependencies defined in Gemfile
|
20
21
|
end
|
21
22
|
|
@@ -26,4 +27,4 @@ RSpec::Core::RakeTask.new(:spec)
|
|
26
27
|
task :test => :spec
|
27
28
|
task :default => :spec
|
28
29
|
|
29
|
-
YARD::Rake::YardocTask.new
|
30
|
+
YARD::Rake::YardocTask.new
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.4
|
1
|
+
0.2.5
|
data/lib/wombat/crawler.rb
CHANGED
@@ -10,13 +10,8 @@ module Wombat
|
|
10
10
|
include Parser
|
11
11
|
extend ActiveSupport::Concern
|
12
12
|
|
13
|
-
|
14
|
-
|
15
|
-
parse self.class.send(:metadata)
|
16
|
-
end
|
17
|
-
|
18
|
-
def supports_city?
|
19
|
-
end
|
13
|
+
def crawl
|
14
|
+
parse self.class.send(:metadata)
|
20
15
|
end
|
21
16
|
|
22
17
|
module ClassMethods
|
@@ -28,17 +23,10 @@ module Wombat
|
|
28
23
|
metadata.for_each(selector).instance_eval(&block) if block
|
29
24
|
end
|
30
25
|
|
31
|
-
def format type
|
32
|
-
metadata.document_format = type
|
33
|
-
end
|
34
|
-
|
35
26
|
def follow_links selector
|
36
27
|
|
37
28
|
end
|
38
29
|
|
39
|
-
def supported_cities
|
40
|
-
end
|
41
|
-
|
42
30
|
def to_ary
|
43
31
|
end
|
44
32
|
|
data/lib/wombat/metadata.rb
CHANGED
@@ -4,10 +4,8 @@ require 'wombat/iterator'
|
|
4
4
|
|
5
5
|
module Wombat
|
6
6
|
class Metadata < PropertyContainer
|
7
|
-
attr_accessor :document_format
|
8
|
-
|
9
7
|
def initialize
|
10
|
-
|
8
|
+
self[:format] = :html
|
11
9
|
super
|
12
10
|
end
|
13
11
|
|
@@ -18,5 +16,9 @@ module Wombat
|
|
18
16
|
def list_page url
|
19
17
|
self[:list_page] = url
|
20
18
|
end
|
19
|
+
|
20
|
+
def format format
|
21
|
+
self[:format] = format
|
22
|
+
end
|
21
23
|
end
|
22
24
|
end
|
data/lib/wombat/parser.rb
CHANGED
@@ -22,7 +22,10 @@ module Wombat
|
|
22
22
|
it.all_properties.each do |p|
|
23
23
|
p.result ||= []
|
24
24
|
result = locate(p)
|
25
|
-
|
25
|
+
if result
|
26
|
+
result = p.callback ? p.callback.call(result) : result
|
27
|
+
p.result << result
|
28
|
+
end
|
26
29
|
end
|
27
30
|
end
|
28
31
|
end
|
@@ -41,7 +44,7 @@ module Wombat
|
|
41
44
|
def get_parser metadata
|
42
45
|
url = "#{metadata[:base_url]}#{metadata[:list_page]}"
|
43
46
|
|
44
|
-
if metadata
|
47
|
+
if metadata[:format] == :html
|
45
48
|
@mechanize.get(url).parser
|
46
49
|
else
|
47
50
|
Nokogiri::XML RestClient.get(url)
|
data/spec/crawler_spec.rb
CHANGED
@@ -114,7 +114,7 @@ describe Wombat::Crawler do
|
|
114
114
|
|
115
115
|
it 'should assign metadata forma' do
|
116
116
|
@crawler_instance.should_receive(:parse) do |arg|
|
117
|
-
arg.
|
117
|
+
arg[:format].should == :xml
|
118
118
|
end
|
119
119
|
@crawler.format :xml
|
120
120
|
@crawler_instance.crawl
|
@@ -44,7 +44,7 @@ describe 'basic crawler setup' do
|
|
44
44
|
|
45
45
|
crawler.for_each "css=ol.ranked-repositories li" do
|
46
46
|
repo 'css=h3'
|
47
|
-
description
|
47
|
+
description('css=p.description') { |d| d.gsub(/for/, '') }
|
48
48
|
end
|
49
49
|
|
50
50
|
crawler_instance = crawler.new
|
@@ -52,11 +52,11 @@ describe 'basic crawler setup' do
|
|
52
52
|
|
53
53
|
results["repo"].should =~ ["jairajs89 / Touchy.js", "mcavage / node-restify", "notlion / streetview-stereographic", "twitter / bootstrap", "stolksdorf / Parallaxjs"]
|
54
54
|
results["description"].should =~ [
|
55
|
-
"node.js REST framework specifically meant
|
56
|
-
"A simple light-weight JavaScript library
|
55
|
+
"node.js REST framework specifically meant web service APIs",
|
56
|
+
"A simple light-weight JavaScript library dealing with touch events",
|
57
57
|
"Shader Toy + Google Map + Panoramic Explorer",
|
58
58
|
"HTML, CSS, and JS toolkit from Twitter",
|
59
|
-
"a Library
|
59
|
+
"a Library Javascript that allows easy page parallaxing"
|
60
60
|
]
|
61
61
|
end
|
62
62
|
end
|
data/spec/parser_spec.rb
CHANGED
@@ -72,6 +72,10 @@ describe Wombat::Parser do
|
|
72
72
|
block_called.should be_true
|
73
73
|
end
|
74
74
|
|
75
|
+
it 'should invoke callback inside for_each block' do
|
76
|
+
|
77
|
+
end
|
78
|
+
|
75
79
|
it 'should return hash with requested properties' do
|
76
80
|
hash = double :results
|
77
81
|
fake_parser = double :parser
|
@@ -140,7 +144,7 @@ describe Wombat::Parser do
|
|
140
144
|
it 'should correctly parse xml documents' do
|
141
145
|
fake_document = double :xml
|
142
146
|
fake_parser = double :parser
|
143
|
-
@metadata.
|
147
|
+
@metadata.format :xml
|
144
148
|
@parser.mechanize.should_not_receive(:get)
|
145
149
|
RestClient.should_receive(:get).and_return fake_document
|
146
150
|
Nokogiri.should_receive(:XML).with(fake_document).and_return fake_parser
|
data/wombat.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "0.2.4"
|
8
|
+
s.version = "0.2.5"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
12
|
-
s.date = "2012-
|
12
|
+
s.date = "2012-03-21"
|
13
13
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
|
14
14
|
s.email = "felipe.lima@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -54,7 +54,8 @@ Gem::Specification.new do |s|
|
|
54
54
|
s.homepage = "http://github.com/felipecsl/wombat"
|
55
55
|
s.licenses = ["MIT"]
|
56
56
|
s.require_paths = ["lib"]
|
57
|
-
s.
|
57
|
+
s.required_ruby_version = Gem::Requirement.new(">= 1.9")
|
58
|
+
s.rubygems_version = "1.8.18"
|
58
59
|
s.summary = "Ruby DSL to crawl web pages"
|
59
60
|
|
60
61
|
if s.respond_to? :specification_version then
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.2.5
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-03-21 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
16
|
-
requirement:
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,15 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
25
30
|
- !ruby/object:Gem::Dependency
|
26
31
|
name: activesupport
|
27
|
-
requirement:
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
28
33
|
none: false
|
29
34
|
requirements:
|
30
35
|
- - ! '>='
|
@@ -32,10 +37,15 @@ dependencies:
|
|
32
37
|
version: '0'
|
33
38
|
type: :runtime
|
34
39
|
prerelease: false
|
35
|
-
version_requirements:
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
36
46
|
- !ruby/object:Gem::Dependency
|
37
47
|
name: rest-client
|
38
|
-
requirement:
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
39
49
|
none: false
|
40
50
|
requirements:
|
41
51
|
- - ! '>='
|
@@ -43,10 +53,15 @@ dependencies:
|
|
43
53
|
version: '0'
|
44
54
|
type: :runtime
|
45
55
|
prerelease: false
|
46
|
-
version_requirements:
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
47
62
|
- !ruby/object:Gem::Dependency
|
48
63
|
name: bundler
|
49
|
-
requirement:
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
50
65
|
none: false
|
51
66
|
requirements:
|
52
67
|
- - ! '>='
|
@@ -54,10 +69,15 @@ dependencies:
|
|
54
69
|
version: '0'
|
55
70
|
type: :development
|
56
71
|
prerelease: false
|
57
|
-
version_requirements:
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ! '>='
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: '0'
|
58
78
|
- !ruby/object:Gem::Dependency
|
59
79
|
name: rake
|
60
|
-
requirement:
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
61
81
|
none: false
|
62
82
|
requirements:
|
63
83
|
- - ! '>='
|
@@ -65,10 +85,15 @@ dependencies:
|
|
65
85
|
version: '0'
|
66
86
|
type: :development
|
67
87
|
prerelease: false
|
68
|
-
version_requirements:
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ! '>='
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: '0'
|
69
94
|
- !ruby/object:Gem::Dependency
|
70
95
|
name: yard
|
71
|
-
requirement:
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
72
97
|
none: false
|
73
98
|
requirements:
|
74
99
|
- - ! '>='
|
@@ -76,10 +101,15 @@ dependencies:
|
|
76
101
|
version: '0'
|
77
102
|
type: :development
|
78
103
|
prerelease: false
|
79
|
-
version_requirements:
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
105
|
+
none: false
|
106
|
+
requirements:
|
107
|
+
- - ! '>='
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
80
110
|
- !ruby/object:Gem::Dependency
|
81
111
|
name: jeweler
|
82
|
-
requirement:
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
83
113
|
none: false
|
84
114
|
requirements:
|
85
115
|
- - ! '>='
|
@@ -87,10 +117,15 @@ dependencies:
|
|
87
117
|
version: '0'
|
88
118
|
type: :development
|
89
119
|
prerelease: false
|
90
|
-
version_requirements:
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ! '>='
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '0'
|
91
126
|
- !ruby/object:Gem::Dependency
|
92
127
|
name: rspec
|
93
|
-
requirement:
|
128
|
+
requirement: !ruby/object:Gem::Requirement
|
94
129
|
none: false
|
95
130
|
requirements:
|
96
131
|
- - ! '>='
|
@@ -98,21 +133,31 @@ dependencies:
|
|
98
133
|
version: '0'
|
99
134
|
type: :development
|
100
135
|
prerelease: false
|
101
|
-
version_requirements:
|
136
|
+
version_requirements: !ruby/object:Gem::Requirement
|
137
|
+
none: false
|
138
|
+
requirements:
|
139
|
+
- - ! '>='
|
140
|
+
- !ruby/object:Gem::Version
|
141
|
+
version: '0'
|
102
142
|
- !ruby/object:Gem::Dependency
|
103
143
|
name: vcr
|
104
|
-
requirement:
|
144
|
+
requirement: !ruby/object:Gem::Requirement
|
105
145
|
none: false
|
106
146
|
requirements:
|
107
|
-
- - =
|
147
|
+
- - '='
|
108
148
|
- !ruby/object:Gem::Version
|
109
149
|
version: 2.0.0.rc1
|
110
150
|
type: :development
|
111
151
|
prerelease: false
|
112
|
-
version_requirements:
|
152
|
+
version_requirements: !ruby/object:Gem::Requirement
|
153
|
+
none: false
|
154
|
+
requirements:
|
155
|
+
- - '='
|
156
|
+
- !ruby/object:Gem::Version
|
157
|
+
version: 2.0.0.rc1
|
113
158
|
- !ruby/object:Gem::Dependency
|
114
159
|
name: fakeweb
|
115
|
-
requirement:
|
160
|
+
requirement: !ruby/object:Gem::Requirement
|
116
161
|
none: false
|
117
162
|
requirements:
|
118
163
|
- - ! '>='
|
@@ -120,7 +165,12 @@ dependencies:
|
|
120
165
|
version: '0'
|
121
166
|
type: :development
|
122
167
|
prerelease: false
|
123
|
-
version_requirements:
|
168
|
+
version_requirements: !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
170
|
+
requirements:
|
171
|
+
- - ! '>='
|
172
|
+
- !ruby/object:Gem::Version
|
173
|
+
version: '0'
|
124
174
|
description: Generic Web crawler with a DSL that parses structured data from web pages
|
125
175
|
email: felipe.lima@gmail.com
|
126
176
|
executables: []
|
@@ -174,7 +224,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
174
224
|
requirements:
|
175
225
|
- - ! '>='
|
176
226
|
- !ruby/object:Gem::Version
|
177
|
-
version: '
|
227
|
+
version: '1.9'
|
178
228
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
179
229
|
none: false
|
180
230
|
requirements:
|
@@ -183,7 +233,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
183
233
|
version: '0'
|
184
234
|
requirements: []
|
185
235
|
rubyforge_project:
|
186
|
-
rubygems_version: 1.8.
|
236
|
+
rubygems_version: 1.8.18
|
187
237
|
signing_key:
|
188
238
|
specification_version: 3
|
189
239
|
summary: Ruby DSL to crawl web pages
|