wombat 0.2.3 → 0.2.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +1 -0
- data/README.md +1 -1
- data/VERSION +1 -1
- data/fixtures/vcr_cassettes/xml_with_namespace.yml +139 -0
- data/lib/wombat/crawler.rb +4 -0
- data/lib/wombat/metadata.rb +7 -0
- data/lib/wombat/parser.rb +15 -3
- data/lib/wombat/property_locator.rb +5 -3
- data/spec/crawler_spec.rb +8 -0
- data/spec/integration/integration_spec.rb +25 -0
- data/spec/parser_spec.rb +19 -6
- data/spec/property_locator_spec.rb +13 -4
- data/wombat.gemspec +6 -2
- metadata +32 -20
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -63,7 +63,7 @@ my_crawler.crawl
|
|
63
63
|
}
|
64
64
|
```
|
65
65
|
|
66
|
-
For
|
66
|
+
### For additional documentation, please check the project [Wiki](http://github.com/felipecsl/wombat/wiki).
|
67
67
|
|
68
68
|
|
69
69
|
## Contributing to Wombat
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.3
|
1
|
+
0.2.4
|
data/fixtures/vcr_cassettes/xml_with_namespace.yml
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: http://ws.audioscrobbler.com/2.0/?method=geo.getevents&location=San%20Francisco&api_key=060decb474b73437d5bbec37f527ae7b
|
6
|
+
body: ''
|
7
|
+
headers:
|
8
|
+
accept:
|
9
|
+
- ! '*/*'
|
10
|
+
user-agent:
|
11
|
+
- Mechanize/2.1 Ruby/1.9.3p0 (http://github.com/tenderlove/mechanize/)
|
12
|
+
accept-encoding:
|
13
|
+
- gzip,deflate,identity
|
14
|
+
accept-charset:
|
15
|
+
- ISO-8859-1,utf-8;q=0.7,*;q=0.7
|
16
|
+
accept-language:
|
17
|
+
- en-us,en;q=0.5
|
18
|
+
host:
|
19
|
+
- ws.audioscrobbler.com
|
20
|
+
connection:
|
21
|
+
- keep-alive
|
22
|
+
keep-alive:
|
23
|
+
- 300
|
24
|
+
response:
|
25
|
+
status:
|
26
|
+
code: 200
|
27
|
+
message: OK
|
28
|
+
headers:
|
29
|
+
server:
|
30
|
+
- Apache/2.2.17 (Unix)
|
31
|
+
x-web-node:
|
32
|
+
- www57
|
33
|
+
access-control-allow-origin:
|
34
|
+
- ! '*'
|
35
|
+
access-control-allow-methods:
|
36
|
+
- POST, GET, OPTIONS
|
37
|
+
access-control-max-age:
|
38
|
+
- '86400'
|
39
|
+
cache-control:
|
40
|
+
- max-age=3600
|
41
|
+
expires:
|
42
|
+
- Tue, 14 Feb 2012 18:21:39 GMT
|
43
|
+
content-type:
|
44
|
+
- text/xml; charset=utf-8;
|
45
|
+
content-encoding:
|
46
|
+
- gzip
|
47
|
+
date:
|
48
|
+
- Tue, 14 Feb 2012 17:21:40 GMT
|
49
|
+
x-varnish:
|
50
|
+
- '2305093058'
|
51
|
+
age:
|
52
|
+
- '0'
|
53
|
+
via:
|
54
|
+
- 1.1 varnish
|
55
|
+
connection:
|
56
|
+
- close
|
57
|
+
body: !binary |-
|
58
|
+
H4sIAAAAAAAAA+1c+3PbNhL++fJXoLp2rh1W4vulysr5ESfNNXUmdpPpdG4y
|
59
|
+
EAlJrEhCR1KSlb/+FnzIlESKpEw5dx1nMhYJLJbAAtjvWwDk4OW956IlCUKH
|
60
|
+
+mcdsSd0EPEtajv+5KyziMZdo/Ny+GLgjj0URjhahGcdOutAClkSPwoRFPfD
|
61
|
+
/oTQs840iuZ9nl+tVr2V3KPBhJcEQeYFkYdsfjUJDeXznIZ/7yCXWjiKH3iL
|
62
|
+
fXQdYN9yQov+iH7znYjY6BaeRMIOmuMJgUrBBQneJ9dQv4hG2GW3UBdZS+/h
|
63
|
+
UjU6aEzCyFliN6S+uz7rMGkMDYEKI/iXVPq4OjMNA8ceyqJqmqo44OGaJUVO
|
64
|
+
5JLh7zcXAz65ZIk4iJwwCtOnJneJTHq9lXHhYmuGLukowMUCn7A7I/5O3pRg
|
65
|
+
23V8EiSKH25ZBfjtGkCzFyS9hnqbgq4Z0qYNkOhjjwzvpgT9SlboPQ6ccDrg
|
66
|
+
48QkP+ux5BYSLCdaD2/wzMW+PeDju00WXfhRsB5udSbIpMmZWBgFhERDVTeR
|
67
|
+
aERTkGP3Az5Nz8TA+tC9MCLJ8PIcmYomSgM+l5jJQX/159TxNyWzRBdHQ1nv
|
68
|
+
GYKu6+qAz5J2pag/GXZFSepJuqTLWirIUrMn8DuPGPDbVhksAneYG1AuDqPe
|
69
|
+
2ONj4/OpzTkwMgdG5jIjs0JJ8RUZhWCxvIpoSnyymseiPYt6Az4TSorMp9QH
|
70
|
+
7d4Iuv17VRR+QIqidHVFV8BGuby05z2YNCh0vsBECj3sup0hDAGWOHzxt61c
|
71
|
+
j9jOwivNdnEwIaW55D4K8GERj0xwLpPZMhmi6ciA0XsFo2Z4tyA/IlFB12SE
|
72
|
+
JEGU4Lovmn1BZMMkE2LFbRJagTOPO2PwzR+XV+d3538MbGeJLOgGcBSjERst
|
73
|
+
neG3osB/K8qDUYD4YfL3itIg7CO9LwvzJOV2Sld9ZPSFLCH5KxocugG/8s03
|
74
|
+
Ax5UD//9b2hC/skvio2cdugiBDdLgiXp4tlmcMQJvKzwmiGooiFpvT/nk7xh
|
75
|
+
irqlWqFWT2HaS9X6REmrpTDf99VaJVUq1YqjiPg2IAMZSgL4s4dblhuQpUNW
|
76
|
+
4RByssvYGePJkD1i7PVjV3+28dYshw2uF+XzNC7BpyU4cKocjnamK0d9TlQ4
|
77
|
+
GI7BAgdrjo3JzRTeTODdWRo51owkvpiBxOaG+UrWItclNmvJw01aDk/Cjfdh
|
78
|
+
DbAp9ZBHwPUlDcrnhe7CnpBcenyZPSdp3LBNEBQNeQcEfyG+v0b/CvDSib5U
|
79
|
+
wuGOdBHufcDzKSYuusXYdv5Tin87mpogoWGasinJRUh4Te8R/IKHCZ4ICkXA
|
80
|
+
KHRHXDIJoOHofEl6J4VDUa8Fh4Kgmq3DYWr4eH6BpbmNpUvxELBwTO9pYud6
|
81
|
+
YCgLUleSdL0mGNby04qgyoIh6lv+qhg9a7npGvoaeeka+po76VqNZpBere7z
|
82
|
+
Rhm/0/fsOr1MZ1Ouv3fQoS5dMPqiWEUXWoNwVVd1STb13txvB8LrKGw0OOoo
|
83
|
+
bD46yrTmMNt8FILHUNMIwaEEF8MCl8JChuX58fZ1wTzwR/soPl74s/1Un9Bu
|
84
|
+
SBcFqB9Qq0A+mK6jqYdg3qCRu2CIs8cWCrW5MZDONgD+NFRCF2VlN57+xYng
|
85
|
+
B71b2Pa6mklsCZeRhC2hJhxBFCRZl01llyS8XcydeuTgggQzgPX1j+gS9x5F
|
86
|
+
EQ7xgeN4gGbqhlKHB2i6zhxpyzwgMy63seY+ATgE9H/ROFcSIPiE/0fHubdz
|
87
|
+
YjnYRR8xTOoIRnqIQohqe73eCWJXWRF0VVPN1mLXOgobAV8dhc2Br0xrDurE
|
88
|
+
RwBf6hgbAF9cgktcHRe7OoZ76dT6yngHkOPbYOEiwAIHGjigAe9nOT54vIUH
|
89
|
+
7SuKesn9HEqW5Vow7j1wRdZhtagYRScLB+be08GgokvqDgxe4eUMV+JfKlUC
|
90
|
+
fGlus6hYN1RB2wW864A4k2kU84pb7C7Z1Lmk4zEh6A2FqdIMCc//x4BQFwxd
|
91
|
+
rgWEpizKJwiIY5tzqZE5MDKXGplLjMylRn4GSAaQUl8Vni6yUw1Da29ptlpd
|
92
|
+
s6iuUt0RMV2hzraALXF1TYCNleBiV8YQrWqSfGWom7nki0eCfUj5k6ziTZ/d
|
93
|
+
9OgLtp3Zk0GNqat7Edc7x0rWWq0p8WckQK8DuphXYk9ZsaLl3Nu5E49CLywF
|
94
|
+
qzJ1jdBL1w1N3VvTfUe+fME+6KgBU1vb0o/CKUVR0FsShg6pscFpKqIgH4Ve
|
95
|
+
uiHJilEDvRTBOMlybmJzLmfkgxubXiYXjquWchVRRZqkIsMwhDYXcmVB0yXR
|
96
|
+
aGsdt1pds4ClUt0R4UqNFtdbxFUFIdPGb7o89rKCW2+p9hOxAdDVB0AXjL6i
|
97
|
+
PhmgK5JuKrrRGqLX0NdsFb9a3xGr+MVKczD+mI3WFFQagHpcgktdPpe5fC52
|
98
|
+
+QzlH0YWg3P1q8H5FAd2SYwYRuCOgpJMAK3lumy/Nl6vcaySojupp6UDMkQ+
|
99
|
+
+g4dYDug52FEAurYIXoNA+1+je7oIqhkBAdKFpGCjzhBGbKb/wD5BzQ25AWm
|
100
|
+
bApFe70/+zaZw7AHYz4pO9AkA105SyfENrSuHkNgu7bHMARdVSWzDkOQQVQ/
|
101
|
+
BUNg1o83YbbMXXUAynkQTsgCf4gtXNB7dDMeOxZBXQTUAdot9kRFktqmDqKh
|
102
|
+
tkkdDqtrTB0OqzuOOlS1uO7+b6qL3xkJXK6jj+QQot6Xnm5RQIHZYgqS3Np2
|
103
|
+
bx2FzVhEDYVH0IgSrTnmoD2CR6Ro1IBHxCVix7JBCS5BCY6hRLb1mx9rX5dQ
|
104
|
+
wEiHeHBO5wUr4S60wAevuiRFi9ZQrmDTmMCoJWgJ3thl8FewvkARsCtrhugi
|
105
|
+
KnjmioTUK1Ac4RlBGLmUzhAuKDhe+AXEZuq4BXzHZmC5LFoU2bbDiVc+DFnc
|
106
|
+
pTqfgHcyBgBdch5F2JqFlRznNQ4Cx0cXxB87xLVLmcueXDO+YsiyKe3ylatf
|
107
|
+
z6FHFv7kaRcyZF1Fohif0z7REoYuCnotggKuTVBOQFBic3NgXy6z70FuYvs4
|
108
|
+
kaskJd8DEfkBaZLWFRXBbJOJSJokSJLaGhWpoa8RAtXQ1xyAajW6/kJGpo5/
|
109
|
+
6HkO+paVmjuWeDwXkR+1g/+eBGMaeI4/QS6gQT9/En2A0TQg4y0fmDXMW0BY
|
110
|
+
yb+nDJ6464XrcnTMfViTzvYDPie+B54TSyImiegYgST4pSHqdlHRU+ax9BiE
|
111
|
+
6ThYk55PIr6DAuKedXw6ppC86gwr5Jn+uu1IPCiXedCyRuw75NIWTGLReM4e
|
112
|
+
qnhOrEl9f2EgzL0BFlJW1VgCMYlDtWSey52C0Cg97XqwsvvSuTqHcwCBGNDO
|
113
|
+
Or/5M5+ufJRWZqeKiyS3M7x4dX3z4RW6e/MKXXw4//gKBjEoKa2s43vrEbED
|
114
|
+
Sj12yCSMaxASHFhT3sUj4vJvcUj97yQBBnWwht/biMF5eLBRx2tt1GHJYYlP
|
115
|
+
1F2yV85K+iw5PJYIFXZbmCofY4uMgC/FlXXjYqtUdUFLK0s9rhvfvDq/uoTu
|
116
|
+
+/BrWQcWVoC9dhfyb4CqXGJgbD7wKt0QAHRVU1JMTXrJikcNWlRTYa6x8d+E
|
117
|
+
JrOlsF6SYLM3ddA/kcpe1fF+ShLZ2IA0LU5LBQFAEXtmevutKCBsLxkt/ylL
|
118
|
+
URHT1qvyqtsgP3bXACU8c/C8IPGihnvTyHP3LLEpdXgkroGOL0aJ6hWOrOnL
|
119
|
+
5dnNvWdanwPx9tzaU5vK79ppr0ctGBc7NaeBTQL+JfAT70wFtBP3lN8lAU0b
|
120
|
+
yqU95R9/fo9uAXEdK6n9Cc6giYqoyO3t0dfQ12xVplrfEcsyxUrbWtFPgqUm
|
121
|
+
K/qsBMcCKi4OqLhNQPVksTWMuKIDZhDVhCUHwQpjaoh55w5b3yiUL1m1PxDX
|
122
|
+
lyz3nzzalXb3+Z+j3edo9znafY52n6Pd52j3Odp9jnafo93naPc52j1ttHvE
|
123
|
+
YGYOGYj6aOPi4xH9HDf/n8TNUsOTcKxErbi5dLJa7EU/GwfZdO1m0/UvGWkf
|
124
|
+
Pj1XFIef+uUtzdjdVwaWtkaXMC5n6C243hpR9pZ40WE5dlbt0xT6El0sxmPo
|
125
|
+
/kOx+Ja2xgfoTaPooNzrgOAInaev66F3jFSgN5ht+D9hdG6oJrr5xzVzkQB9
|
126
|
+
9c7Vm0eeqwfErvNWmCIaht7+69FpV8TnV2Lbc5ntudj2XGL7w2ftmeAU5Goc
|
127
|
+
n0tid8NQu4KutnrcXmVnhqS2IvdKbc1eoKrSdsT7U9XNrXteLlHFHxwCR4bt
|
128
|
+
klAjbG/xE2empmmG0dqBuToKG37irFrhMZ84K9aa/8TZo96mi7Gn0dt0UIJj
|
129
|
+
EMHFEMEBRGSn5EqH2Fc+M1fMNko+b5LiRsF5fWAaJOhCID1ZBcn3Jipf+84/
|
130
|
+
qoRwYDfqlj9z+1srJ17yl42Hz5llb5E7E2iS64Qe+v7qLUQm0Q/V75RvypS/
|
131
|
+
WJ4TaUQvDEnXlEJ68ZGEIXlqLoEuqTePqI/eu1UkwjiSRLCXu2u9nKdpJ1jq
|
132
|
+
T+wdz+7MwAcJwzIWqs0WVFPrqgLjiy2yBUPQtdbW+Su1NWMLVdqOYAvVza2/
|
133
|
+
wp8o4x+6+1hyID3yGzTJ09HtNeJ5pKLfCQ7Que877HPTgCA9dIXXSDzBmoYk
|
134
|
+
y6Ks6Hprixp1FDbbKaqh8IitohKtOWqhPGZdI8GWJusarAT3ABTcd5Jx9ZYD
|
135
|
+
BPqOvYtUsrixVHsr7KdkodAJtcYqiEusiO1N7H1YJsnwgQQdyqzA9fQ3hAt3
|
136
|
+
7A1f/BcWb0rTb10AAA==
|
137
|
+
http_version: '1.0'
|
138
|
+
recorded_at: Tue, 14 Feb 2012 17:21:40 GMT
|
139
|
+
recorded_with: VCR 2.0.0.rc1
|
data/lib/wombat/crawler.rb
CHANGED
data/lib/wombat/metadata.rb
CHANGED
data/lib/wombat/parser.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
#coding: utf-8
|
2
2
|
require 'wombat/property_locator'
|
3
3
|
require 'mechanize'
|
4
|
+
require 'restclient'
|
4
5
|
|
5
6
|
module Wombat
|
6
7
|
module Parser
|
@@ -12,7 +13,7 @@ module Wombat
|
|
12
13
|
end
|
13
14
|
|
14
15
|
def parse metadata
|
15
|
-
self.context =
|
16
|
+
self.context = get_parser metadata
|
16
17
|
original_context = self.context
|
17
18
|
|
18
19
|
metadata.iterators.each do |it|
|
@@ -20,7 +21,7 @@ module Wombat
|
|
20
21
|
self.context = n
|
21
22
|
it.all_properties.each do |p|
|
22
23
|
p.result ||= []
|
23
|
-
result =
|
24
|
+
result = locate(p)
|
24
25
|
p.result << result if result
|
25
26
|
end
|
26
27
|
end
|
@@ -29,11 +30,22 @@ module Wombat
|
|
29
30
|
self.context = original_context
|
30
31
|
|
31
32
|
metadata.all_properties.each do |p|
|
32
|
-
result =
|
33
|
+
result = locate p
|
33
34
|
p.result = p.callback ? p.callback.call(result) : result
|
34
35
|
end
|
35
36
|
|
36
37
|
metadata.flatten
|
37
38
|
end
|
39
|
+
|
40
|
+
private
|
41
|
+
def get_parser metadata
|
42
|
+
url = "#{metadata[:base_url]}#{metadata[:list_page]}"
|
43
|
+
|
44
|
+
if metadata.document_format == :html
|
45
|
+
@mechanize.get(url).parser
|
46
|
+
else
|
47
|
+
Nokogiri::XML RestClient.get(url)
|
48
|
+
end
|
49
|
+
end
|
38
50
|
end
|
39
51
|
end
|
data/lib/wombat/property_locator.rb
CHANGED
@@ -5,11 +5,13 @@ module Wombat
|
|
5
5
|
module PropertyLocator
|
6
6
|
include NodeSelector
|
7
7
|
|
8
|
-
def
|
9
|
-
|
8
|
+
def locate property
|
9
|
+
props = _locate property
|
10
|
+
property.format != :list ? props.first : props
|
10
11
|
end
|
11
12
|
|
12
|
-
|
13
|
+
private
|
14
|
+
def _locate property
|
13
15
|
result = select_nodes(property.selector, property.namespaces).to_a
|
14
16
|
result.map! {|r| r.inner_html.strip } if property.format == :html
|
15
17
|
result.map {|r| r.kind_of?(String) ? r : r.inner_text }.map(&:strip)
|
data/spec/crawler_spec.rb
CHANGED
@@ -111,4 +111,12 @@ describe Wombat::Crawler do
|
|
111
111
|
|
112
112
|
@crawler_instance.crawl
|
113
113
|
end
|
114
|
+
|
115
|
+
it 'should assign metadata forma' do
|
116
|
+
@crawler_instance.should_receive(:parse) do |arg|
|
117
|
+
arg.document_format.should == :xml
|
118
|
+
end
|
119
|
+
@crawler.format :xml
|
120
|
+
@crawler_instance.crawl
|
121
|
+
end
|
114
122
|
end
|
data/spec/integration/integration_spec.rb
CHANGED
@@ -60,4 +60,29 @@ describe 'basic crawler setup' do
|
|
60
60
|
]
|
61
61
|
end
|
62
62
|
end
|
63
|
+
|
64
|
+
it 'should crawl xml with namespaces' do
|
65
|
+
VCR.use_cassette('xml_with_namespace') do
|
66
|
+
crawler = Class.new
|
67
|
+
crawler.send(:include, Wombat::Crawler)
|
68
|
+
|
69
|
+
crawler.format :xml
|
70
|
+
crawler.base_url "http://ws.audioscrobbler.com"
|
71
|
+
crawler.list_page "/2.0/?method=geo.getevents&location=#{URI.escape('San Francisco')}&api_key=060decb474b73437d5bbec37f527ae7b"
|
72
|
+
|
73
|
+
crawler.artist "xpath=//title", :list
|
74
|
+
|
75
|
+
crawler.for_each 'xpath=//event' do
|
76
|
+
latitude "xpath=./venue/location/geo:point/geo:lat", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
|
77
|
+
longitude "xpath=./venue/location/geo:point/geo:long", :text, { 'geo' => 'http://www.w3.org/2003/01/geo/wgs84_pos#' }
|
78
|
+
end
|
79
|
+
|
80
|
+
crawler_instance = crawler.new
|
81
|
+
results = crawler_instance.crawl
|
82
|
+
|
83
|
+
results["latitude"].should =~ ["37.807775", "37.807717", "37.869784", "37.870873", "37.782348", "37.775529", "37.771079", "37.771079", "37.784963", "37.788978"]
|
84
|
+
results["longitude"].should =~ ["-122.272736", "-122.270059", "-122.267701", "-122.269313", "-122.408059", "-122.437757", "-122.412604", "-122.412604", "-122.418871", "-122.40664"]
|
85
|
+
results["artist"].should =~ ["Davka", "Digitalism (DJ Set)", "Gary Clark Jr.", "Lenny Kravitz", "Little Muddy", "Michael Schenker Group", "The Asteroids Galaxy Tour", "When Indie Attacks", "When Indie Attacks", "YOB"]
|
86
|
+
end
|
87
|
+
end
|
63
88
|
end
|
data/spec/parser_spec.rb
CHANGED
@@ -42,7 +42,7 @@ describe Wombat::Parser do
|
|
42
42
|
|
43
43
|
@parser.mechanize.stub(:get).and_return fake_document
|
44
44
|
@metadata.stub(:all_properties).and_return [property]
|
45
|
-
@parser.should_receive(:
|
45
|
+
@parser.should_receive(:locate).with(property)
|
46
46
|
|
47
47
|
@parser.parse @metadata
|
48
48
|
|
@@ -65,7 +65,7 @@ describe Wombat::Parser do
|
|
65
65
|
|
66
66
|
@parser.mechanize.stub(:get).and_return fake_document
|
67
67
|
@metadata.stub(:all_properties).and_return [property]
|
68
|
-
@parser.should_receive(:
|
68
|
+
@parser.should_receive(:locate).with(property).and_return("blah")
|
69
69
|
|
70
70
|
@parser.parse @metadata
|
71
71
|
|
@@ -104,8 +104,8 @@ describe Wombat::Parser do
|
|
104
104
|
@parser.should_receive(:context=).with(c1).ordered
|
105
105
|
@parser.should_receive(:context=).with(c2).ordered
|
106
106
|
@parser.should_receive(:context=).ordered
|
107
|
-
@parser.should_receive(:
|
108
|
-
@parser.should_receive(:
|
107
|
+
@parser.should_receive(:locate).with(it['prop_1']).twice
|
108
|
+
@parser.should_receive(:locate).with(it['prop_2']).twice
|
109
109
|
@parser.stub(:locate)
|
110
110
|
|
111
111
|
@parser.parse(@metadata)
|
@@ -128,12 +128,25 @@ describe Wombat::Parser do
|
|
128
128
|
@parser.should_receive(:context=).with(c1).ordered
|
129
129
|
@parser.should_receive(:context=).with(c2).ordered
|
130
130
|
@parser.should_receive(:context=).ordered
|
131
|
-
@parser.should_receive(:
|
132
|
-
@parser.should_receive(:
|
131
|
+
@parser.should_receive(:locate).with(it['prop_1']).and_return(12)
|
132
|
+
@parser.should_receive(:locate).with(it['prop_1']).and_return(nil)
|
133
133
|
@parser.stub(:locate)
|
134
134
|
|
135
135
|
@parser.parse(@metadata)
|
136
136
|
|
137
137
|
it["prop_1"].result.should == [12]
|
138
138
|
end
|
139
|
+
|
140
|
+
it 'should correctly parse xml documents' do
|
141
|
+
fake_document = double :xml
|
142
|
+
fake_parser = double :parser
|
143
|
+
@metadata.document_format = :xml
|
144
|
+
@parser.mechanize.should_not_receive(:get)
|
145
|
+
RestClient.should_receive(:get).and_return fake_document
|
146
|
+
Nokogiri.should_receive(:XML).with(fake_document).and_return fake_parser
|
147
|
+
@parser.should_receive(:context=).with(fake_parser)
|
148
|
+
@parser.should_receive(:context=)
|
149
|
+
|
150
|
+
@parser.parse @metadata
|
151
|
+
end
|
139
152
|
end
|
data/spec/property_locator_spec.rb
CHANGED
@@ -28,7 +28,7 @@ describe Wombat::PropertyLocator do
|
|
28
28
|
|
29
29
|
@locator_instance.stub(:context).and_return context
|
30
30
|
|
31
|
-
@metadata.all_properties.each { |p| p.result = @locator_instance.
|
31
|
+
@metadata.all_properties.each { |p| p.result = @locator_instance.locate p }
|
32
32
|
|
33
33
|
@metadata["blah"].result.should == "abc"
|
34
34
|
@metadata["event"]["data1"].result.should == "Something cool"
|
@@ -47,7 +47,7 @@ describe Wombat::PropertyLocator do
|
|
47
47
|
|
48
48
|
@metadata["event"].another_info "xpath=/anotherData", :html
|
49
49
|
|
50
|
-
@metadata.all_properties.each { |p| p.result = @locator_instance.
|
50
|
+
@metadata.all_properties.each { |p| p.result = @locator_instance.locate p }
|
51
51
|
|
52
52
|
@metadata["event"]["another_info"].result.should == "some another info"
|
53
53
|
end
|
@@ -59,8 +59,17 @@ describe Wombat::PropertyLocator do
|
|
59
59
|
@locator_instance.stub(:context).and_return context
|
60
60
|
@metadata["event"].description "xpath=/event/some/description", :text, "blah"
|
61
61
|
|
62
|
-
@metadata.all_properties.each { |p| p.result = @locator_instance.
|
62
|
+
@metadata.all_properties.each { |p| p.result = @locator_instance.locate p }
|
63
63
|
|
64
64
|
@metadata["event"]["description"].result.should == "awesome event"
|
65
65
|
end
|
66
|
-
|
66
|
+
|
67
|
+
it 'should return array of matching nodes for list properties' do
|
68
|
+
context = double :context
|
69
|
+
@metadata.list_prop "css=.selector", :list
|
70
|
+
@locator_instance.stub(:context).and_return context
|
71
|
+
@locator_instance.should_receive(:select_nodes).with("css=.selector", nil).and_return %w(1 2 3 4 5)
|
72
|
+
|
73
|
+
@locator_instance.locate(@metadata["list_prop"]).should == %w(1 2 3 4 5)
|
74
|
+
end
|
75
|
+
end
|
data/wombat.gemspec
CHANGED
@@ -5,11 +5,11 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "0.2.3"
|
8
|
+
s.version = "0.2.4"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
12
|
-
s.date = "2012-02-
|
12
|
+
s.date = "2012-02-15"
|
13
13
|
s.description = "Generic Web crawler with a DSL that parses structured data from web pages"
|
14
14
|
s.email = "felipe.lima@gmail.com"
|
15
15
|
s.extra_rdoc_files = [
|
@@ -29,6 +29,7 @@ Gem::Specification.new do |s|
|
|
29
29
|
"VERSION",
|
30
30
|
"fixtures/vcr_cassettes/basic_crawler_page.yml",
|
31
31
|
"fixtures/vcr_cassettes/for_each_page.yml",
|
32
|
+
"fixtures/vcr_cassettes/xml_with_namespace.yml",
|
32
33
|
"lib/wombat.rb",
|
33
34
|
"lib/wombat/crawler.rb",
|
34
35
|
"lib/wombat/iterator.rb",
|
@@ -62,6 +63,7 @@ Gem::Specification.new do |s|
|
|
62
63
|
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
63
64
|
s.add_runtime_dependency(%q<mechanize>, [">= 0"])
|
64
65
|
s.add_runtime_dependency(%q<activesupport>, [">= 0"])
|
66
|
+
s.add_runtime_dependency(%q<rest-client>, [">= 0"])
|
65
67
|
s.add_development_dependency(%q<bundler>, [">= 0"])
|
66
68
|
s.add_development_dependency(%q<rake>, [">= 0"])
|
67
69
|
s.add_development_dependency(%q<yard>, [">= 0"])
|
@@ -72,6 +74,7 @@ Gem::Specification.new do |s|
|
|
72
74
|
else
|
73
75
|
s.add_dependency(%q<mechanize>, [">= 0"])
|
74
76
|
s.add_dependency(%q<activesupport>, [">= 0"])
|
77
|
+
s.add_dependency(%q<rest-client>, [">= 0"])
|
75
78
|
s.add_dependency(%q<bundler>, [">= 0"])
|
76
79
|
s.add_dependency(%q<rake>, [">= 0"])
|
77
80
|
s.add_dependency(%q<yard>, [">= 0"])
|
@@ -83,6 +86,7 @@ Gem::Specification.new do |s|
|
|
83
86
|
else
|
84
87
|
s.add_dependency(%q<mechanize>, [">= 0"])
|
85
88
|
s.add_dependency(%q<activesupport>, [">= 0"])
|
89
|
+
s.add_dependency(%q<rest-client>, [">= 0"])
|
86
90
|
s.add_dependency(%q<bundler>, [">= 0"])
|
87
91
|
s.add_dependency(%q<rake>, [">= 0"])
|
88
92
|
s.add_dependency(%q<yard>, [">= 0"])
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.3
|
4
|
+
version: 0.2.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-02-
|
12
|
+
date: 2012-02-15 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
16
|
-
requirement: &
|
16
|
+
requirement: &70159522946820 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70159522946820
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: activesupport
|
27
|
-
requirement: &
|
27
|
+
requirement: &70159522962680 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,21 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70159522962680
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: rest-client
|
38
|
+
requirement: &70159522962060 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :runtime
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *70159522962060
|
36
47
|
- !ruby/object:Gem::Dependency
|
37
48
|
name: bundler
|
38
|
-
requirement: &
|
49
|
+
requirement: &70159522961340 !ruby/object:Gem::Requirement
|
39
50
|
none: false
|
40
51
|
requirements:
|
41
52
|
- - ! '>='
|
@@ -43,10 +54,10 @@ dependencies:
|
|
43
54
|
version: '0'
|
44
55
|
type: :development
|
45
56
|
prerelease: false
|
46
|
-
version_requirements: *
|
57
|
+
version_requirements: *70159522961340
|
47
58
|
- !ruby/object:Gem::Dependency
|
48
59
|
name: rake
|
49
|
-
requirement: &
|
60
|
+
requirement: &70159522960620 !ruby/object:Gem::Requirement
|
50
61
|
none: false
|
51
62
|
requirements:
|
52
63
|
- - ! '>='
|
@@ -54,10 +65,10 @@ dependencies:
|
|
54
65
|
version: '0'
|
55
66
|
type: :development
|
56
67
|
prerelease: false
|
57
|
-
version_requirements: *
|
68
|
+
version_requirements: *70159522960620
|
58
69
|
- !ruby/object:Gem::Dependency
|
59
70
|
name: yard
|
60
|
-
requirement: &
|
71
|
+
requirement: &70159522960000 !ruby/object:Gem::Requirement
|
61
72
|
none: false
|
62
73
|
requirements:
|
63
74
|
- - ! '>='
|
@@ -65,10 +76,10 @@ dependencies:
|
|
65
76
|
version: '0'
|
66
77
|
type: :development
|
67
78
|
prerelease: false
|
68
|
-
version_requirements: *
|
79
|
+
version_requirements: *70159522960000
|
69
80
|
- !ruby/object:Gem::Dependency
|
70
81
|
name: jeweler
|
71
|
-
requirement: &
|
82
|
+
requirement: &70159522959520 !ruby/object:Gem::Requirement
|
72
83
|
none: false
|
73
84
|
requirements:
|
74
85
|
- - ! '>='
|
@@ -76,10 +87,10 @@ dependencies:
|
|
76
87
|
version: '0'
|
77
88
|
type: :development
|
78
89
|
prerelease: false
|
79
|
-
version_requirements: *
|
90
|
+
version_requirements: *70159522959520
|
80
91
|
- !ruby/object:Gem::Dependency
|
81
92
|
name: rspec
|
82
|
-
requirement: &
|
93
|
+
requirement: &70159522959040 !ruby/object:Gem::Requirement
|
83
94
|
none: false
|
84
95
|
requirements:
|
85
96
|
- - ! '>='
|
@@ -87,10 +98,10 @@ dependencies:
|
|
87
98
|
version: '0'
|
88
99
|
type: :development
|
89
100
|
prerelease: false
|
90
|
-
version_requirements: *
|
101
|
+
version_requirements: *70159522959040
|
91
102
|
- !ruby/object:Gem::Dependency
|
92
103
|
name: vcr
|
93
|
-
requirement: &
|
104
|
+
requirement: &70159522958540 !ruby/object:Gem::Requirement
|
94
105
|
none: false
|
95
106
|
requirements:
|
96
107
|
- - =
|
@@ -98,10 +109,10 @@ dependencies:
|
|
98
109
|
version: 2.0.0.rc1
|
99
110
|
type: :development
|
100
111
|
prerelease: false
|
101
|
-
version_requirements: *
|
112
|
+
version_requirements: *70159522958540
|
102
113
|
- !ruby/object:Gem::Dependency
|
103
114
|
name: fakeweb
|
104
|
-
requirement: &
|
115
|
+
requirement: &70159522958060 !ruby/object:Gem::Requirement
|
105
116
|
none: false
|
106
117
|
requirements:
|
107
118
|
- - ! '>='
|
@@ -109,7 +120,7 @@ dependencies:
|
|
109
120
|
version: '0'
|
110
121
|
type: :development
|
111
122
|
prerelease: false
|
112
|
-
version_requirements: *
|
123
|
+
version_requirements: *70159522958060
|
113
124
|
description: Generic Web crawler with a DSL that parses structured data from web pages
|
114
125
|
email: felipe.lima@gmail.com
|
115
126
|
executables: []
|
@@ -130,6 +141,7 @@ files:
|
|
130
141
|
- VERSION
|
131
142
|
- fixtures/vcr_cassettes/basic_crawler_page.yml
|
132
143
|
- fixtures/vcr_cassettes/for_each_page.yml
|
144
|
+
- fixtures/vcr_cassettes/xml_with_namespace.yml
|
133
145
|
- lib/wombat.rb
|
134
146
|
- lib/wombat/crawler.rb
|
135
147
|
- lib/wombat/iterator.rb
|