wombat 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +22 -17
- data/VERSION +1 -1
- data/fixtures/vcr_cassettes/for_each_page.yml +254 -0
- data/lib/wombat/parser.rb +2 -1
- data/spec/integration/integration_spec.rb +27 -0
- data/spec/parser_spec.rb +28 -2
- data/wombat.gemspec +2 -1
- metadata +20 -19
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# Wombat
|
2
2
|
|
3
|
-
[![CI Build Status](https://secure.travis-ci.org/
|
3
|
+
[![CI Build Status](https://secure.travis-ci.org/felipecsl/wombat.png?branch=master)](http://travis-ci.org/felipecsl/wombat)
|
4
4
|
|
5
5
|
Generic Web crawler with a DSL that parses structured data from web pages.
|
6
6
|
|
@@ -10,6 +10,8 @@ Generic Web crawler with a DSL that parses structured data from web pages.
|
|
10
10
|
|
11
11
|
Creating a crawler:
|
12
12
|
|
13
|
+
###### Create a class that includes ``Wombat::Crawler``:
|
14
|
+
|
13
15
|
```ruby
|
14
16
|
|
15
17
|
# => github_crawler.rb
|
@@ -40,25 +42,28 @@ class GithubCrawler
|
|
40
42
|
end
|
41
43
|
```
|
42
44
|
|
43
|
-
|
45
|
+
###### Run it by calling the instance method ``crawl``:
|
44
46
|
|
45
47
|
```ruby
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
48
|
+
my_crawler = GithubCrawler.new
|
49
|
+
my_crawler.crawl
|
50
|
+
|
51
|
+
#=> the line above outputs:
|
52
|
+
|
53
|
+
{
|
54
|
+
"headline" => "1,316,633 people hosting over 3,951,378 git repositories",
|
55
|
+
"what_is" => "GitHub is the best way to collaborate with others. Fork, send pull requests and manage all your <strong>public</strong> and <strong>private</strong> git repositories.",
|
56
|
+
"explore" => "LOVE GitHub",
|
57
|
+
"benefits" => {
|
58
|
+
"first_benefit" => "Team management",
|
59
|
+
"second_benefit" => "Code review",
|
60
|
+
"third_benefit" => "Reliable code hosting",
|
61
|
+
"fourth_benefit" => "Open source collaboration"
|
62
|
+
}
|
63
|
+
}
|
59
64
|
```
|
60
65
|
|
61
|
-
|
66
|
+
For more documentation, please see the [wiki](http://github.com/felipecsl/wombat/wiki)
|
62
67
|
|
63
68
|
|
64
69
|
## Contributing to Wombat
|
@@ -73,5 +78,5 @@ irb> GithubCrawler.new.crawl
|
|
73
78
|
|
74
79
|
## Copyright
|
75
80
|
|
76
|
-
Copyright (c)
|
81
|
+
Copyright (c) 2012 Felipe Lima. See LICENSE.txt for further details.
|
77
82
|
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.2.2
|
1
|
+
0.2.3
|
data/fixtures/vcr_cassettes/for_each_page.yml
ADDED
@@ -0,0 +1,254 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: https://www.github.com/explore
|
6
|
+
body: ''
|
7
|
+
headers:
|
8
|
+
accept:
|
9
|
+
- ! '*/*'
|
10
|
+
user-agent:
|
11
|
+
- Mechanize/2.1 Ruby/1.9.3p0 (http://github.com/tenderlove/mechanize/)
|
12
|
+
accept-encoding:
|
13
|
+
- gzip,deflate,identity
|
14
|
+
accept-charset:
|
15
|
+
- ISO-8859-1,utf-8;q=0.7,*;q=0.7
|
16
|
+
accept-language:
|
17
|
+
- en-us,en;q=0.5
|
18
|
+
host:
|
19
|
+
- www.github.com
|
20
|
+
connection:
|
21
|
+
- keep-alive
|
22
|
+
keep-alive:
|
23
|
+
- 300
|
24
|
+
response:
|
25
|
+
status:
|
26
|
+
code: 301
|
27
|
+
message: Moved Permanently
|
28
|
+
headers:
|
29
|
+
server:
|
30
|
+
- nginx/1.0.12
|
31
|
+
date:
|
32
|
+
- Tue, 14 Feb 2012 08:26:09 GMT
|
33
|
+
content-type:
|
34
|
+
- text/html
|
35
|
+
content-length:
|
36
|
+
- '185'
|
37
|
+
connection:
|
38
|
+
- keep-alive
|
39
|
+
location:
|
40
|
+
- https://github.com/explore
|
41
|
+
body: ! "<html>\r\n<head><title>301 Moved Permanently</title></head>\r\n<body
|
42
|
+
bgcolor=\"white\">\r\n<center><h1>301 Moved Permanently</h1></center>\r\n<hr><center>nginx/1.0.12</center>\r\n</body>\r\n</html>\r\n"
|
43
|
+
http_version: '1.1'
|
44
|
+
recorded_at: Tue, 14 Feb 2012 08:26:09 GMT
|
45
|
+
- request:
|
46
|
+
method: get
|
47
|
+
uri: https://github.com/explore
|
48
|
+
body: ''
|
49
|
+
headers:
|
50
|
+
accept:
|
51
|
+
- ! '*/*'
|
52
|
+
user-agent:
|
53
|
+
- Mechanize/2.1 Ruby/1.9.3p0 (http://github.com/tenderlove/mechanize/)
|
54
|
+
accept-encoding:
|
55
|
+
- gzip,deflate,identity
|
56
|
+
accept-charset:
|
57
|
+
- ISO-8859-1,utf-8;q=0.7,*;q=0.7
|
58
|
+
accept-language:
|
59
|
+
- en-us,en;q=0.5
|
60
|
+
host:
|
61
|
+
- github.com
|
62
|
+
connection:
|
63
|
+
- keep-alive
|
64
|
+
keep-alive:
|
65
|
+
- 300
|
66
|
+
response:
|
67
|
+
status:
|
68
|
+
code: 200
|
69
|
+
message: OK
|
70
|
+
headers:
|
71
|
+
server:
|
72
|
+
- nginx/1.0.12
|
73
|
+
date:
|
74
|
+
- Tue, 14 Feb 2012 08:26:10 GMT
|
75
|
+
content-type:
|
76
|
+
- text/html; charset=utf-8
|
77
|
+
transfer-encoding:
|
78
|
+
- chunked
|
79
|
+
connection:
|
80
|
+
- keep-alive
|
81
|
+
status:
|
82
|
+
- 200 OK
|
83
|
+
etag:
|
84
|
+
- ! '"0f503f5e8996278ddcd2f781d802ed60"'
|
85
|
+
x-frame-options:
|
86
|
+
- deny
|
87
|
+
x-runtime:
|
88
|
+
- '186'
|
89
|
+
set-cookie:
|
90
|
+
- _gh_sess=BAh7BzoQX2NzcmZfdG9rZW4iMUV1OXhZOXRablNvWWZVRlUzZy9PL3pwa002cjVJOUN0cEg2ZjZxS2puVFU9Og9zZXNzaW9uX2lkIiUyYzIwYTRiZTAwYzViNWRmZjUxYjk4MjRlOWFmM2IwZg%3D%3D--d7f3dfe98a221c6f0b0488b0f4e3b15a9a50da5f;
|
91
|
+
path=/; expires=Sat, 01-Jan-2022 00:00:00 GMT; secure; HttpOnly
|
92
|
+
cache-control:
|
93
|
+
- private, max-age=0, must-revalidate
|
94
|
+
strict-transport-security:
|
95
|
+
- max-age=2592000
|
96
|
+
content-encoding:
|
97
|
+
- gzip
|
98
|
+
body: !binary |-
|
99
|
+
H4sIAAAAAAAAA+1dWXfbRpZ+168o0zNW4jZIAqS4yBJ75C2WY8UaS44nyclR
|
100
|
+
QKBIwsIWAJTE9KQf53mWtzln/sb8gfkn/Uvmu1WFjQBpyqK6X9rHhwSBQi33
|
101
|
+
3rr7LR08ePHu+fkPpy/ZLPHc0c6B/GLsYMZNe7TD8O/A44nJrJkZxTw53J0n
|
102
|
+
E22wW3w0S5JQ47/OnavDxr9oH46054EXmokzdnmDWYGfcD85bBy/POT2lDfk
|
103
|
+
m6LjxElcPnp5E7pBxJnGvnGS1/PxQUvel4O7jn/JIu4eNmJuRtaswZJFyA8b
|
104
|
+
Zhi6joVRAr8VhNyXT20eW5ET0t0/3Hhug80iPjlsFFo0xW0xwmFDDthgLbWc
|
105
|
+
fLCJO3dszcHs0z5olfF+qzV1ktl83LQCryUaUZtm6E/rOhVLkB9FUKYgMefJ
|
106
|
+
DLBxLCdZXCTBJcdgvulhYlYcTbTQjExPTE6hIH3v5Xx488Mw+dE/C36YfHj1
|
107
|
+
oTNtvWv9Fl6e9KK94+HzJHzdm/R+/faTf/7hsNSjGgM9FoArQZQuzzS6gyZv
|
108
|
+
mpemZzpNnyctMwbe42Zh3XGycHk847jdGs99Gz8UWLTOwO5Yg0FP550BH/bG
|
109
|
+
6G3Ch22ja00Gk063w9vDnt0ed5tWHDeYx23HBGatiNPaJZqzzlNUJ/wmaYn2
|
110
|
+
JTxtc96GZuvtLp+M93qYomEPjZ6l98YDa9hr872xZfM9zo2JPjDuNHEFdkmj
|
111
|
+
LI6sw8Zt4P7JvDLluzncP/0659FC0yeTbr+nm217OOj1zGF30DF0bloDe9gb
|
112
|
+
2uOObenGEChpfgIY5Q4SYM27bIwOWrJztRm2OUtJPFp3wPV+p6+3+cAY8LEx
|
113
|
+
6Jg9Y9wH1Cd2GwvoG9awb/V6m89SQhTUfNCSHItuHIwDe8EsF5R72HCD6ZTb
|
114
|
+
F8E8oc3I/SstjAJ7bhGPYIob7ahuxK7A+7ZzxRwbyAET5BGYmOwqieYc1+BD
|
115
|
+
E+emyMiovWpD/M50fB7VNUTX4KSys9hJuIbJBasZTGEIMa0HmvaTM2HHL3/O
|
116
|
+
mah44HhTZrpgsilDU2MosMtRbk1ujmdOsbM9QIt2uARGizq76hPH+6PeMTqD
|
117
|
+
gdHX+9nWlADcbEraLLgi6G51YrLTdHrdjrHX6feq03vwE/dtZ/Kzpt0elFr3
|
118
|
+
BjjjznQGiHfaW17AP3VvtgLc7k0K3/uc6ibgPmiZSuCk1CG/iZ7VjvPMS87i
|
119
|
+
ObSAJGDzmLPJ3HUX7Ne56ToTh9vsw/u3MWCOBrHjW2g2c2LIyiv1Pn7gLZth
|
120
|
+
S/MoCiIWEunSzwAiNmJ2AGnmx6J1ivKDuZtt7CDU0BnLeUVxd7tO2i6MIKsh
|
121
|
+
6kfYxmXxU5COoWv6cWN05kz9echM32an8jWCw0HLdQoaUN41l1rQ+q6zRqnO
|
122
|
+
lGpM5Y4P8m4n3EwAVsxn3ZTzVq9U+7VzHQNO6zuULZ6h3XJHhbnhqeOv70c0
|
123
|
+
+WPEsQYfGtLhPxqvMhi8pWfl7g9acyixiptLIlMEQgICXDoFvfqhHkruLzhp
|
124
|
+
gZMTg17BxOv4/RKzplkUeyOCJA4qmh3M9EztTVGIW/SkQJTmOGbhJ1NKGsAt
|
125
|
+
R2ErBUImTLjLrYTb4EtgS65kTarRBZF+Ix2vCLBynxEPA6w5iByQS10/okFj
|
126
|
+
9L7QbnVv2ATTOW3C2q7yp6O3acPVfSWOx6Gaw5qoWV32cHSumq3uKDUhCt1Y
|
127
|
+
gc0v1P3RmTAx8vcVPZXwmCIEyNpJyUh+Ez9jrWaKacFoSjRgBe7c82OmEIOf
|
128
|
+
2bUkiwJdSaQJAi0On0Qkt/ypVsKWpCkjY2dpIyI4wa9EP+fqNhMYlF3Hoeln
|
129
|
+
rwF+wK/aFOeBbS7Yv6pfGfdISa91zfllY/QRnwSvNQ09aESzxuiEvj7TdAI7
|
130
|
+
kFSC0St5Qc3lPFs0UbHM1swQ30HGvSPTv+R2FSK05h2icWqe7yuCnBYnZpKu
|
131
|
+
tcCSrs3EgrzIoZCv+5PpROaneDBsnQdza7aAjtrKmqcG5cfs/V63U5h9yvSL
|
132
|
+
nDmILjcdB2bYNZpnJibAQ+/qw8oQimKJxXSYgl0qq6DsqyU0RtmlwEiL9FJl
|
133
|
+
KtesszHKlqwGBBY6QqYfhCn1FEzvjIaOIK290OVMcCTtWuhM7A3MmDNhpuP+
|
134
|
+
ODKjBQPimc0h7P0pu4aBDTUAMGYgBj8hSj1ohcTXpfRUJLk91HoWZjTlLR+8
|
135
|
+
AGQUJ85ksQ630CkrgMccC1J3BW5rB1qF3EFljLXIVX03Ruqigti6wRuj4ppv
|
136
|
+
h1x6E5uAvX95ds4mcFVwolEWh9yCzmaZpMB53PQTgd1rPmYxj64cKG9Hp8f3
|
137
|
+
j1Q/SFxyDsVgmTy5cvg1Nj0UyGAameHMsdYheFCF/Ub4/eyYq3Bt3A7XahzC
|
138
|
+
nlhkBdefm0hjtAost6OBsxkZyOw8WLA/sG+CYIq9fmKG+HFq+gFowrGY0lWj
|
139
|
+
e9/GCTgHMNwaB0GC5ZnhOhTrwyeDbrsC9o3QXB1oBV47T3qdvcoYa7ex6rsx
|
140
|
+
UhcV1FYGb4yyBd8Oea/PT94+Yc/Pzp4IK+XNGbhu4F462LBR4LFzCc57R1uc
|
141
|
+
YNDYDqJJ6xQeT9c1b9aLVmNQ3S4b4a1+pBW4qyGOtYjLO6fNlS6pgr7aOTRG
|
142
|
+
+cpvh0OTvS2IUJKsym2XzMwEPiE3uIaOacYLYRHjQwIYcraCVyFWi+aZGSeZ
|
143
|
+
JC9Zj/CCJ0EYL8gJnjvH6A6sucwXdS5u3Nm7o5RlNaJ0O+l7Q2PQ3RN+nZrt
|
144
|
+
FVAkQ5p7yuM6y3RjZeraWlk3xsqLJj1WOOHcjpv0OYbxySPh8YczAYEQf8rJ
|
145
|
+
Ak41H2oDe38+JtCPeTYhxpQ1bRc1bqW/YsCCSrpGg92mDus5bjCbY/4toWc7
|
146
|
+
fB2HNIxeBbYbbbPqKCu2mFEVs2u3WNYxlJx0JZUNVhm9MVKLvd3Oei8hxOBa
|
147
|
+
MhmAFSPWBOlGTHMPkYo4DPzYueLsYxDZp/gdF3SgsamcUT9+eP9sN2avAkRJ
|
148
|
+
RKyqmW07sdsy7ZlibLkrZJeiAooG4a2+WTSnQrYKGvxzVCLD1p87rcWP9vsf
|
149
|
+
rj54Uedlixzs3cHecNjr78JWN212PuPseUq1mM1RhJATlkLwkIp1Sa/eJsG5
|
150
|
+
gBCiOoFpt97i8r24NNZRXe8LefuKkVZQXrdC2WsJL++8McqvK6RXP4fGqLD0
|
151
|
+
29Fg/iLiqJ7HI8sxXfbh+F6IqHezMF8HSfDx/ISIqG209f5QN/72RBR5ljXn
|
152
|
+
rfcIMcM2W2t1G0a/gtmNeNbyGCvoRq9qjGsJR3YLDiSWUCGYpVHJtybXeDsy
|
153
|
+
Sd8Shtbp61PJs2ZzDxF4MKzzU/Ar0W9qbzfZcSJNc/Kuw7EeXLNFMGcOYvWR
|
154
|
+
aSXSBpeuc3KDNpv3w7eef/yh/+blTT9+QyRn9HrDfq/f3vvbk5xvQn/y42Ac
|
155
|
+
kOPJ4hoyDNZxrX5Vy9+I7GrHWUF7VZ/DWtLLu4aZmC2nQoJ1M4ACmy76doSY
|
156
|
+
+XvIngiDax4hjgM1yYEbR4sdmzOywwuCUmip5DaIQXvsNx4FGszzK+RD3A/B
|
157
|
+
RYvvJkf9K+OTHhDB7bX7g/6w0/vbE5w1hTY2DVrYfJCTF2aSROvoTd/7Qt2s
|
158
|
+
bpwV9FYl6bX0pnpujNRFhdJqhm6MCuu9Hal9JMo5EtA6gSPKZS6fJAxR/nuR
|
159
|
+
jS+++fBq+toKxoMu0U2300FUuTf8ErqRQAR3UAYKrkpBJ5uPTXjApTUmfd1C
|
160
|
+
VdzEPMlskxgcfQP7pKQZstPAtmDzyZFzMyUcPYuCOTluEZslKTFeFNzFpm16
|
161
|
+
cKZbjdERrtgZLoMrx5RBAWIDuWcZPHR6vfARi/qIT/Ydp9AsIkE2Yb4pNFGx
|
162
|
+
0gI4MmuN3ImUwhHKOWoIsi4oUCDewDu5gSdSu7IHjL0MnRgEwtrNftNgjzzb
|
163
|
+
jGdP2fcm+Axck8LZfOKQ19912Ws8czzoQKpbdCziDshU8ygWKBJKPsXwEbvQ
|
164
|
+
5q+4BqUeUSn6pBaHDehMhtbG/+G53t7vtPfbba09wGfmus9bsLQFwh18HM3J
|
165
|
+
D94ePmHUgnLgPE4ZOSrsIRYpcJKuOJRQtEyBGkSaVyxFOnMA6mzNsCg+IVpI
|
166
|
+
6ExM95KZY4LslRMlFG7/DWF0ZMk4UeB75H1/wl7wq3chvgmbHgIzElXpPArY
|
167
|
+
GgP3fgEzRUw6PmzoMtqWSbpEvcLU0dt7rW57aHQ0LrGotbW+Zmjw1hP2NFqy
|
168
|
+
5insabMUe00v7GTUT/NhFAWbU+aTzPujezntCBg/ejjs7XWfFiGea5JLe++e
|
169
|
+
JmpS4sPyTOnm0lT/8j//jf/1M12K5+XBP0XKWbRmHgl7tDFqE5GWO8uJrojB
|
170
|
+
a3hU85kU0A56mpIBXNh0xffI5KOIZU2z8mZXJJL19sgfxyEwUojb0yYo/Cz8
|
171
|
+
yC7VRfWRZC6500WxktK0i8kZVZ5b9P4I+kQYNWnp3T0YSv12vzuEFyhEnFpu
|
172
|
+
laLac/L9c7k/j1x+Aw+59dy8zshLsTDuZcghrnIrpqNr4DtGWzAdfZ8Y0DLT
|
173
|
+
US0E06EWjdELbnFvDO+90RZMR8+YDvdSvkrwXkpYyR9tDKDOoN/p6Ybe1Smy
|
174
|
+
m7pCBD+RNsaP82jM4OKwYYxsHSzt/nm7t7/X2e8OVoCl3WdpiwJY2v17BouB
|
175
|
+
/Mc2MtS6PaIbRKkkiSAJwWdn1iyAaebaAkrPIgf3/nmOHbJl8Ohau0Pg6RhC
|
176
|
+
VPWXRBWohloQeESLxug75OwJqml37hk8ekfvI00U/xqjb6Lg2pUEAy2CxcE8
|
177
|
+
QhwRJgPJtaMwhN5BKeRCBD6fRfDZIUQeL/B0y/BqAyDn2FwG7a9aeFELlrZo
|
178
|
+
jN5ZSUDg0vX7BRc8NZ09fdA1+o2R9FE+C+CvjCDqEjAk0pEj3/ktysW4hNap
|
179
|
+
CY54DIiJ3IxM78Hevhs/ag80fUiQaq+iLNGCICVaQH+cQ0QnDIG5e+VGg+6g
|
180
|
+
rxvtvtGBu+X7E7m/XpydSHB8NBc+Z2d8ir22dYC0u+f63r6uYy/VkQ4A0u6y
|
181
|
+
tEUGkHb3ngGit40OWLQOHDwnTZm8SEdwNiKGL4HyMnIuoSRD7eFurL0jciau
|
182
|
+
dGLesHdTm28bUNDwegSolXusTS0IUKJFY/RmjlQDo3e/YOrv9btGt93vgW5O
|
183
|
+
g+sn7L3pIH+r09TZEVVIsFMnFFlqCKkiRTXAVzBBaEhmuixvuzNhK/EQ9R+x
|
184
|
+
jN9vcev1NR1MfbCvd1YwKQBQB1NXLRQA9Xtm6L1+r93t7vX1LujsxXcINwt2
|
185
|
+
fT5Dbm7MXsDrg0gLyOr9ArLuWye69MxtU1ZPM3ShI3VXAYZaCB2JWhBgwA2M
|
186
|
+
e2bdPb096PWHg/ZeY3QMsyuFzKuI27TzXgcuTOzo//7X2zJH6sFUFXTS3d+r
|
187
|
+
50jUQtAJtVDgaBsbbbQlP4fUx1WKpEqEbBYTIUW2pFSFpbFbUvvLloIoKrtJ
|
188
|
+
NKofEKZcvZGQmqiUlzlBUgj4Vp7vL0wUKvKQTxoqW47GxntFA2Yehjy6UM2U
|
189
|
+
iCw+ry35SJdSLNo4mHXFiGPXtC6TANVrFw4UfZkBjEC+fwntd9YdHdRUJ+Ae
|
190
|
+
Sj8eUO1HtZt1feRQLQSc5WIukPNeMNrgRemqyYhp5NgoJQrXFMIJDwIEFn0R
|
191
|
+
jZaNBkoqWJt+vjpbnLwen3l5k9z1z3YiUGhhCc/lBXtkeuFTdgbUB9GXLAnp
|
192
|
+
R44vrO9zdbUJXBDyhcsFShv0MV6suMsQ/DJ7vGF/lOg6L1XvUWkCMtvhqcOD
|
193
|
+
Si+Zj1dsxY1I5hxZQ5Jw11AMVU7ypDmFu6qJYiIsCFc83mdHvukufoPujgRB
|
194
|
+
QG2C1MHKpGqoAP0h0xBlI5HNrUuRiQIjStyAw8q63EfVBY8BTeHmqK6zpktZ
|
195
|
+
3RljklmJJ8E9TvYhyqEXxb4DTpBs1Bem55lWLQpJvYIHYMNFesEYany5o+M4
|
196
|
+
RjRP6GnO6SzwhZHzuT2HGX0KxkuU8CYYs2eBGQkXbKmHL6CDlzfA3yaEAO90
|
197
|
+
WAuaMzyozKQGU1hLALvK5jflbsifTQYXHlS6+YIFvQisOflABQl9jiViTvAh
|
198
|
+
16/rNR5UJlS/LhtpJy4KmqPyyuCDlbcpeXbDrhQZF0qTVD3gxDWvYC7bmmdG
|
199
|
+
l3Zw7dN2FLXX7JV6BAqVjzYcS9Rb1eL0lJ5Ueilio6QaiKobKaVL94tCuFpZ
|
200
|
+
gTS3soj+rHwW3A2KQVpQmakDLp+arpDLFTFclMGp9iCaF9pmEQmqRBI8VPz7
|
201
|
+
nBSjZbfA+T24I8/piwUTmKEiX7oCvE37gxi5Mi2oRyhAo4sv6ohb8wjxWHBX
|
202
|
+
dbXcS2ljHYSjR1YQLp6KMIYMn6Re93ZTH/T7XUqUQnrphOt6MyrSTEqEyhnN
|
203
|
+
jn2ryY4QlomobCCmpCsAhNuFUJFwBkul8qHABANPbMlL4LCgUxYJQuRuBXn0
|
204
|
+
COhZ8vBfX183kQhxKeLgpTRHKn8t6kyF4tsXqGaHTof4CSGOKluzyktkKt61
|
205
|
+
ylWqbK1sVhc0E1EpemU0YErZyQzVqINSqSuhKiNCGK3IxbIpfEdes88uGcXR
|
206
|
+
2YqyTuTKxH4mY+1gHLVG67uyELezpXx+TpdZV89xSsQ8gY4kOgO5v08BDosH
|
207
|
+
NRh4ICMzEZ+W4jJAVxoEqGcdpbtF1iAZh9qRRdopWAdifqpui5T1S74Yk4S8
|
208
|
+
gNSKEmuexBeIF9HJFjIciLLSxAxNMhFQ8UleyAhJdyakNnaNBislMoF8OtTg
|
209
|
+
sGE7MYWe9n3IbEFEiFuOvlUDMEg/OQC2jYd4Z64zP8xGQ/Ax5lzDUy2dmJZN
|
210
|
+
rDH6Cg8p6/druUlVN2ncsMwUZS1agrIfTpVoiqirbdjEiYqZwKj8EQokSA46
|
211
|
+
UTpnWRGkQGtntVnZJD0c2AGUloNEyQiUZCc5jR7YNiIE1jxGdhKGkHV5aKKO
|
212
|
+
IhGYz/jqwebD/LE6DHzqKDZC3FSUEpPoRuzNFBWrdcOVaErar00BmJymakDn
|
213
|
+
OTZOx0jxv1vEvzpChWigA7PDQxgT6VkxjA0qrbo1ID9VV3gC8xggpMpQOuuA
|
214
|
+
BH392jCHzUGJyP4yxpYGmkPXqQPhrYaxyAE9CqKDFr5YUPolbKTqNN5RaIDy
|
215
|
+
JB0A8c4TgLRcXicqWchBhVp0ip07EEkWqlx8qnUiZRzumbpRcTcTUuCDtO02
|
216
|
+
oBwwl+TzdHOKMvlCct/fqYdUI6Btc3pxyJbaGHFxk/CSbvkazMZNZFoiDQKe
|
217
|
+
YaV8CK4g+X/d/i8yjQjnbGS+LMECiDdIay+ruKQFFo8bqWfjda2WGPnG21GC
|
218
|
+
dInB0E0bRb1rWQwaFfj1LQdcYjS1A5ZZzZ2Gg8VY3O9yuHMcHoMc2oyJFgnl
|
219
|
+
TqN9lkDl+IKlVUi0PHINFRb4SypIlkmGKdFUlL0bSW2aWDKCDVqFFtmMBYED
|
220
|
+
rz6pQluD2YfNxpz7Wx2VmENBQ5F4gSaLXJ3tk8WS0JGDvedeeZsJ+4kyaVeD
|
221
|
+
to4oJMaXXeDZuUXESJgQO19IE3Bp1UAKyIAm9xkapoXegvrcNSO55pjT80xl
|
222
|
+
vNM+depGegZDRWgA/jggtrGdoeZrhxIuv62N1aobSyreAlPQv9VRGKsWhw7S
|
223
|
+
41SyS3VBds12hB6igvFMWF9/F3918naL4u+vKpDug2KQTizOH/iGyvtzI6pO
|
224
|
+
WUpVtNpsRqmh301XEo6L1EEwNm0cgQKn0V/+7d9T11bRtsFkq2wTWQMBVHtK
|
225
|
+
at/apl81q/+sm1Uti1WzEv64e5/Wf9RNq1YpVNPa4m5YBar/qptTrWas5rRs
|
226
|
+
dN9JGBHMK6qIUlGRPeFInxO50qi4GbIcknCNzJC7sBSS30RxvIuKgPR6VGms
|
227
|
+
gm7t9pCvrNsk8HcJj+Y1Er23umHWz7Z228hX1m2e4my3u5HWT7d2O8lX1m2q
|
228
|
+
4nS3uMHWz7V2m8lX1m224lzXb7w60s8se2F4lz3M4qijL9RpRHd3M9nX+A9S
|
229
|
+
L8GZTIkVgWrU61zjmER4MnNX4i017FpWI8qtoM3HYrtNqBx+4vhILN2aLKjd
|
230
|
+
OG/mHry0AbyjFOlepZDecoFkO1V46Rky0nDsFM6k8q1ZKzFLvuA7se5aw+42
|
231
|
+
/sTy6Bl5VilVPZLuIyI7CmKkgV6ceuGGtwpdpFGKNB6M9GoYdIhV4FAl0Jcs
|
232
|
+
SCqRNz0Xh0NrIk3LJ3sy3T7K2MSRsvXhhlwjA+UihdvDUOeIoZSc4uHotTiX
|
233
|
+
F6JNlYwBCsifH+08RDYhnPkikZA9cpOnM/3RNHmKMqfpzsPqQ6Pw8CH+Vd7u
|
234
|
+
pQ0wEvUvnCkIcdKU5J4szEFO4bGYAWWnIUACwTSG7YsqK8d6vHMhHom7phsH
|
235
|
+
+aOLncc1r40D136Mt2peo0cXFzs7j39AOd7jx/BCP35Mnu8xdgjtTe8xzj/J
|
236
|
+
ZlwkljIfKoH7LYIPWE4h+hCOPiCgBUhz2F9LoH6MEnJkiOs76sIAcNJLs3CN
|
237
|
+
49xLoHu33J8Em04l6aI/Q10ZOx111SG4q7471Hf2o9j5hqs8cWILpX6I4AWU
|
238
|
+
/VRa7bE48Liy1Ac/pRl6iLj+/FVLnYucxl+/3pGEus8e/HTkSnr9+at55H5d
|
239
|
+
RAKRjcrwSyEpV66O+sizNHAQPg6GR1IsTul4sKPG/vmrSjvE+ARk0TEOWrUu
|
240
|
+
f50jw7FCj0cx+9b0FzinhOr/Y9Ox93d2Rvi1i0glDqygIJjg5nM6JxaVFXhI
|
241
|
+
v9E1ZU/RXoJkwcFByIuqJapcUhZoa9mTLWNcEE0vb5BYh9OkqXYjTzGRWFgS
|
242
|
+
kjlxhqOzBbJwbvLjP2nWInt4OYp9m5yX8l8BqKa+oFzQjJC0dti4QMamj9Mu
|
243
|
+
v3l1IgOrS5vhl19+yQ9S35nMfRlxm0CMLFDxFSVfoaOv2Z9Au84kv2bsH5oT
|
244
|
+
JBjAi/TVnwC9/d2HCETv/v412v2+8/sOur3lNg5H71DhQTIZqEOtLk4hJ4Wg
|
245
|
+
i1P5ME6VtF/TicrELtkpimUQJBTNuUTSDkGYSkPjKvRBRMQLbT6hzNp9yRmx
|
246
|
+
OIaT5xgKmNM7DC5gOrwXqX0UZrkVTwpHxz7JfDkpynaj4B5WVl3HMUVx8Yci
|
247
|
+
qDwZIek56qaomNL0d34hMWDadkSs/BeGaCj1II+SptA9JAnlk6TcPadmqQBi
|
248
|
+
YepWKlfFUoukauJEWE2cOq3h/NQYLKRIuSVbR/w9ibSql50FOPUH0wYp05Su
|
249
|
+
kfqdnn5Jxe3qzI4mO8WJ81hMgvpgc4q00mZO9TnV7T/NxHphQgg4e3DoIZND
|
250
|
+
XpSot6gpiFXlH3LW0Bx2RcpNdKEOPOIXVJe8SxXPpiZKnncpqwd/LkHdmyFz
|
251
|
+
43CXcnt2C9XLBziTz17QoVzy74v8Pw81z153ZAAA
|
252
|
+
http_version: '1.1'
|
253
|
+
recorded_at: Tue, 14 Feb 2012 08:26:10 GMT
|
254
|
+
recorded_with: VCR 2.0.0.rc1
|
data/lib/wombat/parser.rb
CHANGED
data/spec/integration/integration_spec.rb
CHANGED
@@ -33,4 +33,31 @@ describe 'basic crawler setup' do
|
|
33
33
|
results["social"]["twitter"].should == "Verão"
|
34
34
|
end
|
35
35
|
end
|
36
|
+
|
37
|
+
it 'should iterate elements' do
|
38
|
+
VCR.use_cassette('for_each_page') do
|
39
|
+
crawler = Class.new
|
40
|
+
crawler.send(:include, Wombat::Crawler)
|
41
|
+
|
42
|
+
crawler.base_url "https://www.github.com"
|
43
|
+
crawler.list_page "/explore"
|
44
|
+
|
45
|
+
crawler.for_each "css=ol.ranked-repositories li" do
|
46
|
+
repo 'css=h3'
|
47
|
+
description 'css=p.description'
|
48
|
+
end
|
49
|
+
|
50
|
+
crawler_instance = crawler.new
|
51
|
+
results = crawler_instance.crawl
|
52
|
+
|
53
|
+
results["repo"].should =~ ["jairajs89 / Touchy.js", "mcavage / node-restify", "notlion / streetview-stereographic", "twitter / bootstrap", "stolksdorf / Parallaxjs"]
|
54
|
+
results["description"].should =~ [
|
55
|
+
"node.js REST framework specifically meant for web service APIs",
|
56
|
+
"A simple light-weight JavaScript library for dealing with touch events",
|
57
|
+
"Shader Toy + Google Map + Panoramic Explorer",
|
58
|
+
"HTML, CSS, and JS toolkit from Twitter",
|
59
|
+
"a Library for Javascript that allows easy page parallaxing"
|
60
|
+
]
|
61
|
+
end
|
62
|
+
end
|
36
63
|
end
|
data/spec/parser_spec.rb
CHANGED
@@ -97,8 +97,8 @@ describe Wombat::Parser do
|
|
97
97
|
@metadata.should_receive(:iterators).and_return [it]
|
98
98
|
@metadata.should_receive(:flatten)
|
99
99
|
fake_document.should_receive(:parser).and_return(fake_parser)
|
100
|
-
it['prop_1'].should_receive(:result).exactly(
|
101
|
-
it['prop_2'].should_receive(:result).exactly(
|
100
|
+
it['prop_1'].should_receive(:result).exactly(2).times.and_return([])
|
101
|
+
it['prop_2'].should_receive(:result).exactly(2).times.and_return([])
|
102
102
|
@parser.mechanize.stub(:get).and_return fake_document
|
103
103
|
@parser.should_receive(:select_nodes).with("it_selector").and_return [c1, c2]
|
104
104
|
@parser.should_receive(:context=).with(c1).ordered
|
@@ -110,4 +110,30 @@ describe Wombat::Parser do
|
|
110
110
|
|
111
111
|
@parser.parse(@metadata)
|
112
112
|
end
|
113
|
+
|
114
|
+
it 'should not include null results in iterated block' do
|
115
|
+
fake_parser = double :parser
|
116
|
+
fake_document = double :document
|
117
|
+
c1 = double :context
|
118
|
+
c2 = double :context
|
119
|
+
it = Wombat::Iterator.new "it_selector"
|
120
|
+
it.prop_1 "some_selector"
|
121
|
+
|
122
|
+
@parser.should_receive(:context=).ordered
|
123
|
+
@metadata.should_receive(:iterators).and_return [it]
|
124
|
+
@metadata.should_receive(:flatten)
|
125
|
+
fake_document.should_receive(:parser).and_return(fake_parser)
|
126
|
+
@parser.mechanize.stub(:get).and_return fake_document
|
127
|
+
@parser.should_receive(:select_nodes).with("it_selector").and_return [c1, c2]
|
128
|
+
@parser.should_receive(:context=).with(c1).ordered
|
129
|
+
@parser.should_receive(:context=).with(c2).ordered
|
130
|
+
@parser.should_receive(:context=).ordered
|
131
|
+
@parser.should_receive(:locate_first).with(it['prop_1']).and_return(12)
|
132
|
+
@parser.should_receive(:locate_first).with(it['prop_1']).and_return(nil)
|
133
|
+
@parser.stub(:locate)
|
134
|
+
|
135
|
+
@parser.parse(@metadata)
|
136
|
+
|
137
|
+
it["prop_1"].result.should == [12]
|
138
|
+
end
|
113
139
|
end
|
data/wombat.gemspec
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
|
6
6
|
Gem::Specification.new do |s|
|
7
7
|
s.name = "wombat"
|
8
|
-
s.version = "0.2.2"
|
8
|
+
s.version = "0.2.3"
|
9
9
|
|
10
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
11
|
s.authors = ["Felipe Lima"]
|
@@ -28,6 +28,7 @@ Gem::Specification.new do |s|
|
|
28
28
|
"Rakefile",
|
29
29
|
"VERSION",
|
30
30
|
"fixtures/vcr_cassettes/basic_crawler_page.yml",
|
31
|
+
"fixtures/vcr_cassettes/for_each_page.yml",
|
31
32
|
"lib/wombat.rb",
|
32
33
|
"lib/wombat/crawler.rb",
|
33
34
|
"lib/wombat/iterator.rb",
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wombat
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.2.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-02-14 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
16
|
-
requirement: &
|
16
|
+
requirement: &70328357893040 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70328357893040
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: activesupport
|
27
|
-
requirement: &
|
27
|
+
requirement: &70328357892320 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70328357892320
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: bundler
|
38
|
-
requirement: &
|
38
|
+
requirement: &70328357891720 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70328357891720
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: rake
|
49
|
-
requirement: &
|
49
|
+
requirement: &70328357907480 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :development
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70328357907480
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: yard
|
60
|
-
requirement: &
|
60
|
+
requirement: &70328357906820 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :development
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70328357906820
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: jeweler
|
71
|
-
requirement: &
|
71
|
+
requirement: &70328357906240 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :development
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70328357906240
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: rspec
|
82
|
-
requirement: &
|
82
|
+
requirement: &70328357905660 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :development
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70328357905660
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: vcr
|
93
|
-
requirement: &
|
93
|
+
requirement: &70328357905040 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - =
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: 2.0.0.rc1
|
99
99
|
type: :development
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70328357905040
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: fakeweb
|
104
|
-
requirement: &
|
104
|
+
requirement: &70328357904360 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,7 +109,7 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :development
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70328357904360
|
113
113
|
description: Generic Web crawler with a DSL that parses structured data from web pages
|
114
114
|
email: felipe.lima@gmail.com
|
115
115
|
executables: []
|
@@ -129,6 +129,7 @@ files:
|
|
129
129
|
- Rakefile
|
130
130
|
- VERSION
|
131
131
|
- fixtures/vcr_cassettes/basic_crawler_page.yml
|
132
|
+
- fixtures/vcr_cassettes/for_each_page.yml
|
132
133
|
- lib/wombat.rb
|
133
134
|
- lib/wombat/crawler.rb
|
134
135
|
- lib/wombat/iterator.rb
|