rhack 1.1.6 → 1.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/README.md +5 -0
- data/ext/curb/curb_multi.c +2 -6
- data/lib/rhack/curl/global.rb +1 -0
- data/lib/rhack/page.rb +217 -42
- data/lib/rhack/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
YzliZTlhMjBjZjM2MzE4ZjNkZDc0MDQzYjg3MDM5NGZmZjdmMjUwMA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
OGFmNGYyN2Y0MTRkODYzNGRlNzc3NDY1ZTg3MTU5ZTBhZTdiN2FjZQ==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
NGQyMjRiNDZmZjRhMWMzZGUwZGE3NjhlMmE2ODYzOGM2NTBkN2JhYThjMDMw
|
10
|
+
ZmZmODg3OThhMzFlNjU1ZjQ2MzI3ZjliY2ZlN2IyNDNiMGE0OTgxZTQxMDU2
|
11
|
+
OGRhYzlhNzgxYzY5ZGEyN2Y1MmZkNjVlZTA0OWJjMThjMGMyNTQ=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
NTQ1ODZhMWY1NTIwZjcxZDM3YzVlY2ZiNTE3NDE2YmVkZDI1M2M4ODYyZjY3
|
14
|
+
OGM0OTM4NjNjZjU5NzNhMTJjYzMyODA0YTNiOWE1NjFiMmM3MWUyN2MxNTY1
|
15
|
+
NzU2OWE2YTU0M2RmM2VhM2FkMGE0YjhjMzU3ZDNkYWEyN2M5MmY=
|
data/README.md
CHANGED
data/ext/curb/curb_multi.c
CHANGED
@@ -71,12 +71,8 @@ rb_hash_clear_i(VALUE key, VALUE value, VALUE dummy) {
|
|
71
71
|
}
|
72
72
|
|
73
73
|
static void curl_multi_free(ruby_curl_multi *rbcm) {
|
74
|
-
if (rbcm) {
|
75
|
-
|
76
|
-
ruby_log_obj(rbcm->requests);
|
77
|
-
}
|
78
|
-
if (rbcm && !rbcm->requests == Qnil && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
|
79
|
-
//if (rbcm && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
|
74
|
+
//if (rbcm && !rbcm->requests == Qnil && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
|
75
|
+
if (rbcm && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
|
80
76
|
rb_hash_foreach( rbcm->requests, (int (*)())curl_multi_flush_easy, (VALUE)rbcm );
|
81
77
|
rb_hash_foreach(rbcm->requests, rb_hash_clear_i, 0); //rb_hash_clear(rbcm->requests);
|
82
78
|
rbcm->requests = Qnil;
|
data/lib/rhack/curl/global.rb
CHANGED
data/lib/rhack/page.rb
CHANGED
@@ -1,10 +1,35 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
|
2
|
+
class Object
  # Extends #is_a? to accept several classes at once: the receiver "is a"
  # match when it is an instance of ANY of the given classes/modules.
  # The built-in single-argument version is kept as #one_argument_is_a?.
  # The guard keeps the patch from being applied twice (which would alias the
  # already-patched method onto itself and recurse forever).
  unless method_defined?(:one_argument_is_a?)
    alias :one_argument_is_a? :is_a?

    # @param class1 [Module] first class/module to test against
    # @param other_classes [Array<Module>] optional further candidates
    # @return [Boolean] true as soon as one candidate matches
    def is_a?(class1, *other_classes)
      [class1, *other_classes].any? { |klass| one_argument_is_a?(klass) }
    end
  end
end
|
3
10
|
|
4
|
-
|
5
|
-
|
11
|
+
module RHACK
|
12
|
+
|
13
|
+
# A String whose content is the JSON serialization of an object, while the
# original object stays reachable through #source.
class JsonString < String
  # NOTE(review): `__init__` is a class-level macro defined outside this file;
  # presumably it registers a `JsonString(...)` constructor helper (#map_json
  # calls one) — confirm against the library that provides it.
  __init__
  # The object this string was serialized from.
  attr_reader :source

  # @param source [#to_json] object to serialize; its JSON text becomes the
  #   string content
  def initialize(source)
    @source = source
    super(source.to_json)
  end

  # Shows the wrapped source object instead of the raw JSON text.
  def inspect
    "#<RHACK::JsonString(#{@source.inspect})>"
  end
end
|
26
|
+
|
27
|
+
# Base class for errors raised while scraping a page.
class ScrapeError < ArgumentError; end

# Raised by the finders when nothing matched and the :missing option asks for
# an error.
class NodeNotFound < ScrapeError; end
|
29
|
+
|
30
|
+
# Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, ... ) ) =>
|
31
|
+
# Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, ...
|
32
|
+
class Page
|
8
33
|
# for debug, just enable L#debug, don't write tons of chaotic log-lines
|
9
34
|
__init__
|
10
35
|
attr_writer :title
|
@@ -44,6 +69,9 @@ module RHACK
|
|
44
69
|
@html.force_encoding(encoding)
|
45
70
|
end
|
46
71
|
|
72
|
+
# Address of this page as reported by its location object (@loc).
def url
  @loc.href
end
alias :href :url
|
74
|
+
|
47
75
|
# We can then alternate #process in Page subclasses
|
48
76
|
# Frame doesn't mind about value returned by #process
|
49
77
|
def process(c, opts={})
|
@@ -150,63 +178,169 @@ module RHACK
|
|
150
178
|
@loc.href
|
151
179
|
end
|
152
180
|
end
|
153
|
-
|
154
|
-
def find(xp) (@doc || to_doc).find xp end
|
155
181
|
|
156
|
-
def at(xp) (@doc || to_doc).at xp end
|
157
182
|
|
158
|
-
|
159
|
-
alias :href :url
|
183
|
+
# HELPERS #
|
160
184
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
185
|
+
# Cosmetic helper: lets a hash be written as a method call so that the
# key/value lines of a multi-line definition line up, e.g.
#   dict key1: value1, ...
#        key2: value2, ...
# A Hash is returned untouched; anything else is converted with Hash[].
def dict(hash)
  return hash if hash.is_a?(Hash)

  Hash[hash]
end
|
191
|
+
|
192
|
+
# Maps {'firstname lastname' => tuple} into
# {:firstname => tuple[0], :lastname => tuple[1]}.
# Keys without a space are simply symbolized and keep their value as is.
#
# @param hash [Hash] keys are Strings or Symbols; a value under a
#   space-separated String key must be indexable by position
# @return [Hash{Symbol => Object}]
def flatten_dict(hash)
  hash.each_with_object({}) do |(key, value), flat|
    if key.is_a?(String) && key.include?(' ')
      # one entry per word of the key, taking the value at the same position
      key.split(' ').each_with_index { |part, idx| flat[part.to_sym] = value[idx] }
    else
      flat[key.to_sym] = value
    end
  end
end
|
206
|
+
|
207
|
+
# Turns a link found on this page into an absolute one, using the page
# location (@loc) as the base.
#
# Patterns are anchored with \A so that only the very beginning of the link
# decides its kind (a `^` anchor would also match after an embedded newline).
#
# @param link [String] absolute, protocol-relative, root-relative,
#   query-only, fragment-only or document-relative link
# @return [String]
def expand_link(link)
  case link
  when %r{\A\w+://} then link                                   # already absolute
  when %r{\A//}     then @loc.protocol + ':' + link             # protocol-relative
  when %r{\A/}      then @loc.root + link                       # root-relative
  when /\A\?/       then File.join(@loc.root, @loc.path) + link # same document, new query
  when /\A#/        then File.join(@loc.root, @loc.fullpath) + link # same document, new fragment
  else File.join(@loc.root, File.dirname(@loc.path), link)      # relative to current directory
  end
end
|
169
218
|
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
219
|
+
|
220
|
+
# FINDERS #
|
221
|
+
|
222
|
+
private
|
223
|
+
|
224
|
+
# Reacts to a search that found nothing, as directed by options[:missing]:
# * Proc        - called with the selector; its result is returned
# * String      - used as a format template (`%{selector}` is substituted)
#                 for the message of a NodeNotFound error
# * other truthy value - raised as NodeNotFound with that value as message
# * nil / false - nothing happens
#
# @param selector [Object] what was searched for
# @param options [Hash] finder options; only :missing is read
# @raise [NodeNotFound] when :missing is truthy and not a Proc
def node_is_missing!(selector, options)
  missing = options[:missing]
  return missing.call(selector) if missing.is_a?(Proc)
  return unless missing

  # The template has to be applied to `missing` itself and the result raised.
  message = missing.is_a?(String) ? missing % {selector: selector} : missing
  raise NodeNotFound, message
end
|
235
|
+
|
236
|
+
# Applies an optional preprocessor to a single search result.
#
# @param preresult [Object] the found node
# @param preprocess [Proc, Symbol, nil] a Proc is called with the node; a
#   Symbol names a method of self that is sent the node; anything else leaves
#   the node untouched
# @return [Object] the preprocessed (or original) result
def preprocess_search_result(preresult, preprocess)
  case preprocess
  when Proc   then preprocess.call(preresult)
  when Symbol then __send__(preprocess, preresult)
  else preresult
  end
end
|
245
|
+
|
246
|
+
# Collection counterpart of #preprocess_search_result: applies an optional
# preprocessor to every element of a search result.
#
# @param preresult [#map] the found nodes
# @param preprocess [Proc, Symbol, nil] a Proc is mapped over the nodes; a
#   Symbol names a method of self that is sent each node; anything else
#   leaves the collection untouched
# @return [Object] mapped Array, or the original collection
def preprocess_search_results(preresult, preprocess)
  case preprocess
  when Proc   then preresult.map(&preprocess)
  when Symbol then preresult.map { |node| __send__(preprocess, node) }
  else preresult
  end
end
|
177
255
|
|
178
|
-
def
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
256
|
+
# Raw single-node lookup on the parsed document; the document is built via
# #to_doc when @doc is not set yet.
def __at(xp)
  (@doc || to_doc).at(xp)
end

# Raw multi-node lookup on the parsed document; the document is built via
# #to_doc when @doc is not set yet.
def __find(xp)
  (@doc || to_doc).find(xp)
end
|
259
|
+
|
260
|
+
public
|
261
|
+
|
262
|
+
# Finds a single node and returns it, optionally preprocessed.
#
# @param selector_or_node [String, XML::Node, nil] a selector to look up, or
#   an already found node which is used as is
# @param options [Hash]
#   :preprocess - Proc or Symbol applied to the found node
#   :missing    - what to do when nothing is found (handled by #node_is_missing!)
# @yield [result] optional block receiving the preprocessed node; its value
#   becomes the return value
# @return [Object, nil] the (preprocessed) node, the block result, or a falsy
#   value when nothing was found and no error was requested
def at(selector_or_node, options = {})
  node = if selector_or_node.is_a?(XML::Node)
           selector_or_node
         elsif selector_or_node
           __at(selector_or_node)
         end

  unless node
    node_is_missing!(selector_or_node, options)
    return node
  end

  result = preprocess_search_result(node, options[:preprocess])
  block_given? ? yield(result) : result
end
|
273
|
+
alias :first :at
|
186
274
|
|
187
|
-
def
|
188
|
-
|
189
|
-
|
190
|
-
|
275
|
+
# Finds all matching nodes and returns them, optionally preprocessed.
#
# @param selector_or_nodes [String, XML::XPath::Object, Array] a selector to
#   look up, or an already found collection which is used as is
# @param options [Hash]
#   :preprocess - Proc or Symbol applied to every found node
#   :missing    - what to do when nothing is found (handled by #node_is_missing!)
# @yield [node] optional block run for each (preprocessed) node
# @return [Object] the (preprocessed) collection; the untouched empty
#   collection when nothing was found and no error was requested
def find(selector_or_nodes, options = {}, &foreach)
  # Tested class by class so the check works with the stock one-argument #is_a?.
  ready_made = [XML::XPath::Object, Array].any? { |klass| selector_or_nodes.is_a?(klass) }
  found = ready_made ? selector_or_nodes : __find(selector_or_nodes)

  if found.size > 0
    processed = preprocess_search_results(found, options[:preprocess])
    foreach ? processed.each(&foreach) : processed
  else
    node_is_missing!(selector_or_nodes, options)
    found
  end
end
|
194
|
-
alias :
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
287
|
+
alias :all :find
|
288
|
+
|
289
|
+
|
290
|
+
# FINDERS PREPROCESSORS #
|
291
|
+
|
292
|
+
# Returns the stripped text of the node found by #at.
#
# @param selector_or_node [Object] passed through to #at
# @param options [Hash] passed through to #at
# @yield [txt] optional block receiving the stripped text; its value becomes
#   the return value
# @return [String, Object, nil] nil when no node was found
def text(selector_or_node, options = {})
  node = at(selector_or_node, options)
  return unless node

  stripped = node.text.strip
  block_given? ? yield(stripped) : stripped
end
|
298
|
+
|
299
|
+
# Runs #text for every value of a hash of selectors (or nodes), keeping the keys.
#
# NOTE(review): relies on Hash#map_values, which is not core Ruby —
# presumably supplied by a support library loaded elsewhere; confirm.
#
# @param hash [Hash{Object => Object}] name => selector_or_node
# @param options [Hash] passed to every #text call
def texts(hash, options = {})
  hash.map_values { |selector_or_node| text(selector_or_node, options) }
end
|
304
|
+
|
305
|
+
# Finds a node via #at and returns its `src` expanded to an absolute link.
#
# @param selector_or_node [Object] passed to #at (defaults to 'img')
# @param options [Hash] passed to #at; :preprocess is overridden here
# @yield [src] optional block called with the absolute link when one was
#   found; its value becomes the return value
# @return [String, Object, nil] nil when the node has no src
def get_src(selector_or_node = 'img', options = {}, &onfound)
  expand_src = lambda { |node|
    src = node.src
    expand_link(src) if src
  }
  # The block parameter must be named `src`: a bare `src` with no such local
  # would resolve to the #src alias of this very method.
  at(selector_or_node, options.merge(:preprocess => expand_src)) do |src|
    onfound && src ? onfound.call(src) : src
  end
end
|
199
312
|
alias :src :get_src
|
200
313
|
|
201
|
-
def
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
314
|
+
# Finds a node via #at and returns its `href` expanded to an absolute link.
# When the node itself has no href, an `a` inside it is tried.
#
# @param selector_or_node [Object] passed to #at (defaults to 'a')
# @param options [Hash] passed to #at; :preprocess is overridden here
# @yield [href] optional block called with the absolute link when one was
#   found; its value becomes the return value
# @return [String, Object, nil] nil when no href could be extracted
def get_link(selector_or_node = 'a', options = {}, &onfound)
  extract_href = lambda { |node|
    href = node.href
    unless href
      # NOTE(review): `find` is used for a single nested anchor; if the node
      # API returns a collection here, `at` was probably intended — confirm.
      nested = node.find('a')
      href = nested.href if nested
    end
    expand_link(href) if href
  }
  at(selector_or_node, options.merge(:preprocess => extract_href)) do |href|
    onfound && href ? onfound.call(href) : href
  end
end
|
326
|
+
alias :link :get_link
|
327
|
+
alias :get_href :get_link
|
328
|
+
|
329
|
+
# Finds nodes via #find and maps each of them through the given block.
#
# @param selector_or_nodes [Object] passed to #find
# @param options [Hash] passed to #find (:preprocess is overridden by the
#   block); additionally
#   :compact - pass false to keep nil results; by default they are dropped
# @yield [node] mapper applied to every found node
# @return [Array, Object] compacted Array, or the raw #find result when
#   :compact is false
def map(selector_or_nodes, options = {}, &mapper)
  mapped = find(selector_or_nodes, options.merge(:preprocess => mapper))
  return mapped if options[:compact] == false

  mapped.to_a.compact
end
|
209
336
|
|
337
|
+
# Same as #map, but the result is wrapped through the `JsonString(...)`
# constructor helper.
# NOTE(review): that helper method is not defined in the visible code —
# presumably generated for the JsonString class elsewhere; confirm.
def map_json(selector_or_nodes, options = {}, &mapper)
  JsonString(map(selector_or_nodes, options, &mapper))
end
|
340
|
+
|
341
|
+
|
342
|
+
# FORMS #
|
343
|
+
|
210
344
|
def form(form='form', hash={}, opts={})
|
211
345
|
form = "[action=#{@loc.path.inspect}]" if form == :self
|
212
346
|
if form.is String
|
@@ -237,6 +371,47 @@ module RHACK
|
|
237
371
|
page
|
238
372
|
end
|
239
373
|
|
374
|
+
|
375
|
+
# OLD #
|
376
|
+
|
377
|
+
# TODO: make into same form as #get_src and #map
|
378
|
+
# Collects the absolute `src` links of all nodes matching a selector.
#
# @param links [String, Array<String>] a selector (defaults to 'img'), or a
#   ready list of links to expand. A String that makes the search raise
#   XML::Error is treated as a single link itself.
# @return [Array<String>] unique absolute links
def get_srcs(links = 'img')
  begin
    links = find(links).map { |node| node.src } if links.is_a?(String)
  rescue XML::Error
    links = [links]
  end
  # Nodes without a src yield nil, which #expand_link cannot handle — skip them.
  links.map { |link| expand_link(link) if link }.compact.uniq
end
|
386
|
+
alias :srcs :get_srcs
|
387
|
+
|
388
|
+
#def get_src(link='img')
|
389
|
+
# begin
|
390
|
+
# link = at(link) && at(link).src if link.is String
|
391
|
+
# rescue XML::Error; nil
|
392
|
+
# end
|
393
|
+
# expand_link link if link
|
394
|
+
#end
|
395
|
+
|
396
|
+
# Collects the absolute `href` links of all nodes matching a selector. When
# the selector itself matches nothing, anchors nested under it
# (selector + '//a') are tried.
#
# NOTE(review): `.b` is not core Ruby for Array — presumably a "self unless
# blank, else false" helper from a support library; confirm.
#
# @param links [String, Array<String>] a selector (defaults to 'a'), or a
#   ready list of links to expand. A String that makes the search raise
#   XML::Error is treated as a single link itself.
# @return [Array<String>] unique absolute links
def get_links(links = 'a')
  begin
    if links.is_a?(String)
      links = find(links).map { |node| node.href }.b ||
              find(links + '//a').map { |node| node.href }
    end
  rescue XML::Error
    links = [links]
  end
  # Nodes without an href yield nil, which #expand_link cannot handle — skip them.
  links.map { |link| expand_link(link) if link }.compact.uniq
end
|
404
|
+
alias :get_hrefs :get_links
|
405
|
+
alias :links :get_links
|
406
|
+
|
407
|
+
#def get_link(link='a')
|
408
|
+
# begin
|
409
|
+
# link = at(link) && (at(link).href || at(link+'//a').href) if link.is String
|
410
|
+
# rescue XML::Error; nil
|
411
|
+
# end
|
412
|
+
# expand_link link if link
|
413
|
+
#end
|
414
|
+
|
240
415
|
# Fetches every external script of the page (`script[src]`) through the given
# frame and evaluates each one with #eval_string.
#
# @param frame [#get_cached, nil] source of the script bodies; when falsy
#   nothing is loaded and the value is returned as is
# @return [Object] whatever `frame.get_cached(...)` returned, after iteration
def load_scripts(frame)
  return frame unless frame

  frame.get_cached(*get_srcs("script[src]")).each { |js| eval_string(js) }
end
|
data/lib/rhack/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rhack
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sergey Baev
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-08-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|