rhack 1.1.6 → 1.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/README.md +5 -0
- data/ext/curb/curb_multi.c +2 -6
- data/lib/rhack/curl/global.rb +1 -0
- data/lib/rhack/page.rb +217 -42
- data/lib/rhack/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
YzliZTlhMjBjZjM2MzE4ZjNkZDc0MDQzYjg3MDM5NGZmZjdmMjUwMA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
OGFmNGYyN2Y0MTRkODYzNGRlNzc3NDY1ZTg3MTU5ZTBhZTdiN2FjZQ==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
NGQyMjRiNDZmZjRhMWMzZGUwZGE3NjhlMmE2ODYzOGM2NTBkN2JhYThjMDMw
|
10
|
+
ZmZmODg3OThhMzFlNjU1ZjQ2MzI3ZjliY2ZlN2IyNDNiMGE0OTgxZTQxMDU2
|
11
|
+
OGRhYzlhNzgxYzY5ZGEyN2Y1MmZkNjVlZTA0OWJjMThjMGMyNTQ=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
NTQ1ODZhMWY1NTIwZjcxZDM3YzVlY2ZiNTE3NDE2YmVkZDI1M2M4ODYyZjY3
|
14
|
+
OGM0OTM4NjNjZjU5NzNhMTJjYzMyODA0YTNiOWE1NjFiMmM3MWUyN2MxNTY1
|
15
|
+
NzU2OWE2YTU0M2RmM2VhM2FkMGE0YjhjMzU3ZDNkYWEyN2M5MmY=
|
data/README.md
CHANGED
data/ext/curb/curb_multi.c
CHANGED
@@ -71,12 +71,8 @@ rb_hash_clear_i(VALUE key, VALUE value, VALUE dummy) {
|
|
71
71
|
}
|
72
72
|
|
73
73
|
static void curl_multi_free(ruby_curl_multi *rbcm) {
|
74
|
-
if (rbcm) {
|
75
|
-
|
76
|
-
ruby_log_obj(rbcm->requests);
|
77
|
-
}
|
78
|
-
if (rbcm && !rbcm->requests == Qnil && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
|
79
|
-
//if (rbcm && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
|
74
|
+
//if (rbcm && !rbcm->requests == Qnil && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
|
75
|
+
if (rbcm && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
|
80
76
|
rb_hash_foreach( rbcm->requests, (int (*)())curl_multi_flush_easy, (VALUE)rbcm );
|
81
77
|
rb_hash_foreach(rbcm->requests, rb_hash_clear_i, 0); //rb_hash_clear(rbcm->requests);
|
82
78
|
rbcm->requests = Qnil;
|
data/lib/rhack/curl/global.rb
CHANGED
data/lib/rhack/page.rb
CHANGED
@@ -1,10 +1,35 @@
|
|
1
1
|
# encoding: utf-8
|
2
|
-
|
2
|
+
class Object
|
3
|
+
unless defined? one_argument_is_a?
|
4
|
+
alias :one_argument_is_a? :is_a?
|
5
|
+
def is_a?(class1, *other_classes)
|
6
|
+
one_argument_is_a? class1 or other_classes.any? {|cl| one_argument_is_a? cl}
|
7
|
+
end
|
8
|
+
end
|
9
|
+
end
|
3
10
|
|
4
|
-
|
5
|
-
|
11
|
+
module RHACK
|
12
|
+
|
13
|
+
class JsonString < String
|
14
|
+
__init__
|
15
|
+
attr_reader :source
|
16
|
+
|
17
|
+
def initialize(source)
|
18
|
+
@source = source
|
19
|
+
super source.to_json
|
20
|
+
end
|
6
21
|
|
7
|
-
|
22
|
+
def inspect
|
23
|
+
"#<RHACK::JsonString(#{@source.inspect})>"
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
class ScrapeError < ArgumentError; end
|
28
|
+
class NodeNotFound < ScrapeError; end
|
29
|
+
|
30
|
+
# Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, ... ) ) =>
|
31
|
+
# Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, ...
|
32
|
+
class Page
|
8
33
|
# for debug, just enable L#debug, don't write tons of chaotic log-lines
|
9
34
|
__init__
|
10
35
|
attr_writer :title
|
@@ -44,6 +69,9 @@ module RHACK
|
|
44
69
|
@html.force_encoding(encoding)
|
45
70
|
end
|
46
71
|
|
72
|
+
def url() @loc.href end
|
73
|
+
alias :href :url
|
74
|
+
|
47
75
|
# We can then alternate #process in Page subclasses
|
48
76
|
# Frame doesn't mind about value returned by #process
|
49
77
|
def process(c, opts={})
|
@@ -150,63 +178,169 @@ module RHACK
|
|
150
178
|
@loc.href
|
151
179
|
end
|
152
180
|
end
|
153
|
-
|
154
|
-
def find(xp) (@doc || to_doc).find xp end
|
155
181
|
|
156
|
-
def at(xp) (@doc || to_doc).at xp end
|
157
182
|
|
158
|
-
|
159
|
-
alias :href :url
|
183
|
+
# HELPERS #
|
160
184
|
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
185
|
+
# hook to create even-looked lines defining a hash in my Verdana 10px, e.g.
|
186
|
+
# dict key1: value1, ...
|
187
|
+
# key2: value2, ...
|
188
|
+
def dict(hash)
|
189
|
+
hash.is_a?(Hash) ? hash : Hash[hash]
|
190
|
+
end
|
191
|
+
|
192
|
+
# maps {'firstname lastname' => tuple} into {:firstname => tuple[0], :lastname => tuple[1]}
|
193
|
+
def flatten_dict(hash)
|
194
|
+
result = {}
|
195
|
+
hash.each {|k, v|
|
196
|
+
if k.is String and k[' ']
|
197
|
+
k.split(' ').each_with_index {|k_unit, k_idx|
|
198
|
+
result[k_unit.to_sym] = v[k_idx]
|
199
|
+
}
|
200
|
+
else
|
201
|
+
result[k.to_sym] = v
|
202
|
+
end
|
203
|
+
}
|
204
|
+
result
|
205
|
+
end
|
206
|
+
|
207
|
+
# makes a relative path being on this page into an absolute path
|
208
|
+
def expand_link(link)
|
209
|
+
case link
|
210
|
+
when /^\w+:\/\// then link
|
211
|
+
when /^\/\// then @loc.protocol + ':' + link
|
212
|
+
when /^\// then @loc.root + link
|
213
|
+
when /^\?/ then File.join(@loc.root, @loc.path) + link
|
214
|
+
when /^#/ then File.join(@loc.root, @loc.fullpath) + link
|
215
|
+
else File.join @loc.root, File.dirname(@loc.path), link
|
166
216
|
end
|
167
|
-
links.map {|link| expand_link link}.uniq
|
168
217
|
end
|
169
218
|
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
219
|
+
|
220
|
+
# FINDERS #
|
221
|
+
|
222
|
+
private
|
223
|
+
|
224
|
+
def node_is_missing!(selector, options)
|
225
|
+
missing = options[:missing]
|
226
|
+
if missing.is Proc
|
227
|
+
missing.call(selector)
|
228
|
+
elsif missing
|
229
|
+
if missing.is String
|
230
|
+
message %= {selector: selector}
|
231
|
+
end
|
232
|
+
raise NodeNotFound, missing
|
233
|
+
end
|
234
|
+
end
|
235
|
+
|
236
|
+
def preprocess_search_result(preresult, preprocess)
|
237
|
+
if preprocess.is_a? Proc
|
238
|
+
preprocess.call(preresult)
|
239
|
+
elsif preprocess.is_a? Symbol
|
240
|
+
__send__(preprocess, preresult)
|
241
|
+
else
|
242
|
+
preresult
|
243
|
+
end
|
244
|
+
end
|
245
|
+
|
246
|
+
def preprocess_search_results(preresult, preprocess)
|
247
|
+
if preprocess.is_a? Proc
|
248
|
+
preresult.map(&preprocess)
|
249
|
+
elsif preprocess.is_a? Symbol
|
250
|
+
preresult.map {|node| __send__(preprocess, node)}
|
251
|
+
else
|
252
|
+
preresult
|
174
253
|
end
|
175
|
-
expand_link link if link
|
176
254
|
end
|
177
255
|
|
178
|
-
def
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
256
|
+
def __at(xp) (@doc || to_doc).at xp end
|
257
|
+
|
258
|
+
def __find(xp) (@doc || to_doc).find xp end
|
259
|
+
|
260
|
+
public
|
261
|
+
|
262
|
+
def at(selector_or_node, options={})
|
263
|
+
if selector_or_node and preresult = selector_or_node.is_a?(XML::Node) ?
|
264
|
+
selector_or_node : __at(selector_or_node)
|
265
|
+
|
266
|
+
preresult = preprocess_search_result(preresult, options[:preprocess])
|
267
|
+
block_given? ? yield(preresult) : preresult
|
268
|
+
else
|
269
|
+
node_is_missing!(selector_or_node, options)
|
270
|
+
preresult
|
183
271
|
end
|
184
|
-
links.map {|link| expand_link link}.uniq
|
185
272
|
end
|
273
|
+
alias :first :at
|
186
274
|
|
187
|
-
def
|
188
|
-
|
189
|
-
|
190
|
-
|
275
|
+
def find(selector_or_nodes, options={}, &foreach)
|
276
|
+
preresult = selector_or_nodes.is_a?(XML::XPath::Object, Array) ?
|
277
|
+
selector_or_nodes : __find(selector_or_nodes)
|
278
|
+
|
279
|
+
if preresult.size > 0
|
280
|
+
preresult = preprocess_search_results(preresult, options[:preprocess])
|
281
|
+
foreach ? preresult.each(&foreach) : preresult
|
282
|
+
else
|
283
|
+
node_is_missing!(selector_or_nodes, options)
|
284
|
+
preresult
|
191
285
|
end
|
192
|
-
expand_link link if link
|
193
286
|
end
|
194
|
-
alias :
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
287
|
+
alias :all :find
|
288
|
+
|
289
|
+
|
290
|
+
# FINDERS PREPROCESSORS #
|
291
|
+
|
292
|
+
def text(selector_or_node, options={})
|
293
|
+
if node = at(selector_or_node, options)
|
294
|
+
txt = node.text.strip
|
295
|
+
block_given? ? yield(txt) : txt
|
296
|
+
end
|
297
|
+
end
|
298
|
+
|
299
|
+
def texts(hash, options={})
|
300
|
+
hash.map_values {|selector_or_node|
|
301
|
+
text(selector_or_node, options)
|
302
|
+
}
|
303
|
+
end
|
304
|
+
|
305
|
+
def get_src(selector_or_node='img', options={}, &onfound)
|
306
|
+
at(selector_or_node, options.merge(:preprocess => lambda {|node|
|
307
|
+
if src = node.src
|
308
|
+
expand_link src
|
309
|
+
end
|
310
|
+
})) {|href| onfound && src ? onfound.call(src) : src}
|
311
|
+
end
|
199
312
|
alias :src :get_src
|
200
313
|
|
201
|
-
def
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
314
|
+
def get_link(selector_or_node='a', options={}, &onfound)
|
315
|
+
at(selector_or_node, options.merge(:preprocess => lambda {|node|
|
316
|
+
unless href = node.href
|
317
|
+
if node = node.find('a')
|
318
|
+
href = node.href
|
319
|
+
end
|
320
|
+
end
|
321
|
+
if href
|
322
|
+
expand_link href
|
323
|
+
end
|
324
|
+
})) {|href| onfound && href ? onfound.call(href) : href}
|
325
|
+
end
|
326
|
+
alias :link :get_link
|
327
|
+
alias :get_href :get_link
|
328
|
+
|
329
|
+
def map(selector_or_nodes, options={}, &mapper)
|
330
|
+
mapping = find(selector_or_nodes, options.merge(:preprocess => mapper))
|
331
|
+
unless options[:compact] == false
|
332
|
+
mapping = mapping.to_a.compact
|
207
333
|
end
|
334
|
+
mapping
|
208
335
|
end
|
209
336
|
|
337
|
+
def map_json(selector_or_nodes, options={}, &mapper)
|
338
|
+
JsonString map(selector_or_nodes, options, &mapper)
|
339
|
+
end
|
340
|
+
|
341
|
+
|
342
|
+
# FORMS #
|
343
|
+
|
210
344
|
def form(form='form', hash={}, opts={})
|
211
345
|
form = "[action=#{@loc.path.inspect}]" if form == :self
|
212
346
|
if form.is String
|
@@ -237,6 +371,47 @@ module RHACK
|
|
237
371
|
page
|
238
372
|
end
|
239
373
|
|
374
|
+
|
375
|
+
# OLD #
|
376
|
+
|
377
|
+
# TODO: make into same form as #get_src and #map
|
378
|
+
def get_srcs(links='img')
|
379
|
+
begin
|
380
|
+
links = find(links).map {|e| e.src} if links.is String
|
381
|
+
rescue XML::Error
|
382
|
+
links = [links]
|
383
|
+
end
|
384
|
+
links.map {|link| expand_link link}.uniq
|
385
|
+
end
|
386
|
+
alias :srcs :get_srcs
|
387
|
+
|
388
|
+
#def get_src(link='img')
|
389
|
+
# begin
|
390
|
+
# link = at(link) && at(link).src if link.is String
|
391
|
+
# rescue XML::Error; nil
|
392
|
+
# end
|
393
|
+
# expand_link link if link
|
394
|
+
#end
|
395
|
+
|
396
|
+
def get_links(links='a')
|
397
|
+
begin
|
398
|
+
links = find(links).map {|e| e.href}.b || find(links+'//a').map {|e| e.href} if links.is String
|
399
|
+
rescue XML::Error
|
400
|
+
links = [links]
|
401
|
+
end
|
402
|
+
links.map {|link| expand_link link}.uniq
|
403
|
+
end
|
404
|
+
alias :get_hrefs :get_links
|
405
|
+
alias :links :get_links
|
406
|
+
|
407
|
+
#def get_link(link='a')
|
408
|
+
# begin
|
409
|
+
# link = at(link) && (at(link).href || at(link+'//a').href) if link.is String
|
410
|
+
# rescue XML::Error; nil
|
411
|
+
# end
|
412
|
+
# expand_link link if link
|
413
|
+
#end
|
414
|
+
|
240
415
|
def load_scripts(frame)
|
241
416
|
frame && frame.get_cached(*get_srcs("script[src]")).each {|js| eval_string js}
|
242
417
|
end
|
data/lib/rhack/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rhack
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.
|
4
|
+
version: 1.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sergey Baev
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-08-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|