rhack 1.1.6 → 1.1.7

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- NWNkY2FhMDI2NTg4YWMyOTNkNGRkMzJhYjc1NWFhYTU2Yzc3NzAzYg==
4
+ YzliZTlhMjBjZjM2MzE4ZjNkZDc0MDQzYjg3MDM5NGZmZjdmMjUwMA==
5
5
  data.tar.gz: !binary |-
6
- ODM0MDBmYmYyNjQxY2Q1M2YzYjhmYjFmNDY2N2QzNTRkZDQxNDA1Ng==
6
+ OGFmNGYyN2Y0MTRkODYzNGRlNzc3NDY1ZTg3MTU5ZTBhZTdiN2FjZQ==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- ZGEyMzYxYzY0Y2U0Y2JhZjA5NjMzNDQ1NWU3YzA5NDllNWU3NWNiODU2ZTNl
10
- OTZmNGQ1ZWIzZTdiOTUyNGM3N2U4ZTJiMDBkMzY2MDA5N2VmNDBjNGI5NDE4
11
- Mjk4NWM0NTE3MzViNTlkYzQ3YzE2ZTdiMzVmNjA1YWI0ZDhjZDY=
9
+ NGQyMjRiNDZmZjRhMWMzZGUwZGE3NjhlMmE2ODYzOGM2NTBkN2JhYThjMDMw
10
+ ZmZmODg3OThhMzFlNjU1ZjQ2MzI3ZjliY2ZlN2IyNDNiMGE0OTgxZTQxMDU2
11
+ OGRhYzlhNzgxYzY5ZGEyN2Y1MmZkNjVlZTA0OWJjMThjMGMyNTQ=
12
12
  data.tar.gz: !binary |-
13
- ODVhN2NlNDQzMzFmNzhiZGI5MDY3YWRmOTAyZWM0MDdkOTU0NjQ1MTc5ZmIw
14
- YzcxM2M0Y2U5NTkxYWRkNGYxZDU0YTZjNTEyNmJiMjA1YjI2NjI3YmQ3NjM4
15
- ODRmYWUzMTFiOGM1YmJlOWQzYTMwM2M3OTQyNjVkNjlmYTU1ZWM=
13
+ NTQ1ODZhMWY1NTIwZjcxZDM3YzVlY2ZiNTE3NDE2YmVkZDI1M2M4ODYyZjY3
14
+ OGM0OTM4NjNjZjU5NzNhMTJjYzMyODA0YTNiOWE1NjFiMmM3MWUyN2MxNTY1
15
+ NzU2OWE2YTU0M2RmM2VhM2FkMGE0YjhjMzU3ZDNkYWEyN2M5MmY=
data/README.md CHANGED
@@ -31,6 +31,11 @@ It's still randomly documented since it's just my working tool.
31
31
 
32
32
  ### CHANGES
33
33
 
34
+ ##### Version 1.1.7
35
+
36
+ * ::Page
37
+ * Fixed #expand_link for partial links
38
+
34
39
  ##### Version 1.1.6
35
40
 
36
41
  * ::Frame
@@ -71,12 +71,8 @@ rb_hash_clear_i(VALUE key, VALUE value, VALUE dummy) {
71
71
  }
72
72
 
73
73
  static void curl_multi_free(ruby_curl_multi *rbcm) {
74
- if (rbcm) {
75
- ruby_log((!rbcm->requests == Qnil) ? "true" : "false");
76
- ruby_log_obj(rbcm->requests);
77
- }
78
- if (rbcm && !rbcm->requests == Qnil && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
79
- //if (rbcm && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
74
+ //if (rbcm && !rbcm->requests == Qnil && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
75
+ if (rbcm && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
80
76
  rb_hash_foreach( rbcm->requests, (int (*)())curl_multi_flush_easy, (VALUE)rbcm );
81
77
  rb_hash_foreach(rbcm->requests, rb_hash_clear_i, 0); //rb_hash_clear(rbcm->requests);
82
78
  rbcm->requests = Qnil;
@@ -44,6 +44,7 @@ module Curl
44
44
  end
45
45
  alias :run :execute
46
46
 
47
+ # TODO: вместо статуса sleep/run использовать глобальную класс-переменную
47
48
  def wait
48
49
  if @@carier_thread and @@carier_thread.status
49
50
  unless within = Thread.current == @@carier_thread
data/lib/rhack/page.rb CHANGED
@@ -1,10 +1,35 @@
1
1
  # encoding: utf-8
2
- module RHACK
2
+ class Object
3
+ unless defined? one_argument_is_a?
4
+ alias :one_argument_is_a? :is_a?
5
+ def is_a?(class1, *other_classes)
6
+ one_argument_is_a? class1 or other_classes.any? {|cl| one_argument_is_a? cl}
7
+ end
8
+ end
9
+ end
3
10
 
4
- # Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, ... ) ) =>
5
- # Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, ...
11
+ module RHACK
12
+
13
+ class JsonString < String
14
+ __init__
15
+ attr_reader :source
16
+
17
+ def initialize(source)
18
+ @source = source
19
+ super source.to_json
20
+ end
6
21
 
7
- class Page
22
+ def inspect
23
+ "#<RHACK::JsonString(#{@source.inspect})>"
24
+ end
25
+ end
26
+
27
+ class ScrapeError < ArgumentError; end
28
+ class NodeNotFound < ScrapeError; end
29
+
30
+ # Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, ... ) ) =>
31
+ # Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, ...
32
+ class Page
8
33
  # for debug, just enable L#debug, don't write tons of chaotic log-lines
9
34
  __init__
10
35
  attr_writer :title
@@ -44,6 +69,9 @@ module RHACK
44
69
  @html.force_encoding(encoding)
45
70
  end
46
71
 
72
+ def url() @loc.href end
73
+ alias :href :url
74
+
47
75
  # We can then alternate #process in Page subclasses
48
76
  # Frame doesn't mind about value returned by #process
49
77
  def process(c, opts={})
@@ -150,63 +178,169 @@ module RHACK
150
178
  @loc.href
151
179
  end
152
180
  end
153
-
154
- def find(xp) (@doc || to_doc).find xp end
155
181
 
156
- def at(xp) (@doc || to_doc).at xp end
157
182
 
158
- def url() @loc.href end
159
- alias :href :url
183
+ # HELPERS #
160
184
 
161
- def get_srcs(links='img')
162
- begin
163
- links = find(links).map {|e| e.src} if links.is String
164
- rescue XML::Error
165
- links = [links]
185
+ # hook to create even-looked lines defining a hash in my Verdana 10px, e.g.
186
+ # dict key1: value1, ...
187
+ # key2: value2, ...
188
+ def dict(hash)
189
+ hash.is_a?(Hash) ? hash : Hash[hash]
190
+ end
191
+
192
+ # maps {'firstname lastname' => tuple} into {:firstname => tuple[0], :lastname => tuple[1]}
193
+ def flatten_dict(hash)
194
+ result = {}
195
+ hash.each {|k, v|
196
+ if k.is String and k[' ']
197
+ k.split(' ').each_with_index {|k_unit, k_idx|
198
+ result[k_unit.to_sym] = v[k_idx]
199
+ }
200
+ else
201
+ result[k.to_sym] = v
202
+ end
203
+ }
204
+ result
205
+ end
206
+
207
+ # makes a relative path being on this page into an absolute path
208
+ def expand_link(link)
209
+ case link
210
+ when /^\w+:\/\// then link
211
+ when /^\/\// then @loc.protocol + ':' + link
212
+ when /^\// then @loc.root + link
213
+ when /^\?/ then File.join(@loc.root, @loc.path) + link
214
+ when /^#/ then File.join(@loc.root, @loc.fullpath) + link
215
+ else File.join @loc.root, File.dirname(@loc.path), link
166
216
  end
167
- links.map {|link| expand_link link}.uniq
168
217
  end
169
218
 
170
- def get_src(link='img')
171
- begin
172
- link = at(link) && at(link).src if link.is String
173
- rescue XML::Error; nil
219
+
220
+ # FINDERS #
221
+
222
+ private
223
+
224
+ def node_is_missing!(selector, options)
225
+ missing = options[:missing]
226
+ if missing.is Proc
227
+ missing.call(selector)
228
+ elsif missing
229
+ if missing.is String
230
+ message %= {selector: selector}
231
+ end
232
+ raise NodeNotFound, missing
233
+ end
234
+ end
235
+
236
+ def preprocess_search_result(preresult, preprocess)
237
+ if preprocess.is_a? Proc
238
+ preprocess.call(preresult)
239
+ elsif preprocess.is_a? Symbol
240
+ __send__(preprocess, preresult)
241
+ else
242
+ preresult
243
+ end
244
+ end
245
+
246
+ def preprocess_search_results(preresult, preprocess)
247
+ if preprocess.is_a? Proc
248
+ preresult.map(&preprocess)
249
+ elsif preprocess.is_a? Symbol
250
+ preresult.map {|node| __send__(preprocess, node)}
251
+ else
252
+ preresult
174
253
  end
175
- expand_link link if link
176
254
  end
177
255
 
178
- def get_links(links='a')
179
- begin
180
- links = find(links).map {|e| e.href}.b || find(links+'//a').map {|e| e.href} if links.is String
181
- rescue XML::Error
182
- links = [links]
256
+ def __at(xp) (@doc || to_doc).at xp end
257
+
258
+ def __find(xp) (@doc || to_doc).find xp end
259
+
260
+ public
261
+
262
+ def at(selector_or_node, options={})
263
+ if selector_or_node and preresult = selector_or_node.is_a?(XML::Node) ?
264
+ selector_or_node : __at(selector_or_node)
265
+
266
+ preresult = preprocess_search_result(preresult, options[:preprocess])
267
+ block_given? ? yield(preresult) : preresult
268
+ else
269
+ node_is_missing!(selector_or_node, options)
270
+ preresult
183
271
  end
184
- links.map {|link| expand_link link}.uniq
185
272
  end
273
+ alias :first :at
186
274
 
187
- def get_link(link='a')
188
- begin
189
- link = at(link) && (at(link).href || at(link+'//a').href) if link.is String
190
- rescue XML::Error; nil
275
+ def find(selector_or_nodes, options={}, &foreach)
276
+ preresult = selector_or_nodes.is_a?(XML::XPath::Object, Array) ?
277
+ selector_or_nodes : __find(selector_or_nodes)
278
+
279
+ if preresult.size > 0
280
+ preresult = preprocess_search_results(preresult, options[:preprocess])
281
+ foreach ? preresult.each(&foreach) : preresult
282
+ else
283
+ node_is_missing!(selector_or_nodes, options)
284
+ preresult
191
285
  end
192
- expand_link link if link
193
286
  end
194
- alias :get_hrefs :get_links
195
- alias :links :get_links
196
- alias :get_href :get_link
197
- alias :link :get_link
198
- alias :srcs :get_srcs
287
+ alias :all :find
288
+
289
+
290
+ # FINDERS PREPROCESSORS #
291
+
292
+ def text(selector_or_node, options={})
293
+ if node = at(selector_or_node, options)
294
+ txt = node.text.strip
295
+ block_given? ? yield(txt) : txt
296
+ end
297
+ end
298
+
299
+ def texts(hash, options={})
300
+ hash.map_values {|selector_or_node|
301
+ text(selector_or_node, options)
302
+ }
303
+ end
304
+
305
+ def get_src(selector_or_node='img', options={}, &onfound)
306
+ at(selector_or_node, options.merge(:preprocess => lambda {|node|
307
+ if src = node.src
308
+ expand_link src
309
+ end
310
+ })) {|href| onfound && src ? onfound.call(src) : src}
311
+ end
199
312
  alias :src :get_src
200
313
 
201
- def expand_link(link)
202
- case link
203
- when /^\w+:\/\// then link
204
- when /^\/\// then @loc.protocol+link
205
- when /^\// then @loc.root+link
206
- else File.join((@loc.path.b ? File.dirname(@loc.path) : @loc.root), link)
314
+ def get_link(selector_or_node='a', options={}, &onfound)
315
+ at(selector_or_node, options.merge(:preprocess => lambda {|node|
316
+ unless href = node.href
317
+ if node = node.find('a')
318
+ href = node.href
319
+ end
320
+ end
321
+ if href
322
+ expand_link href
323
+ end
324
+ })) {|href| onfound && href ? onfound.call(href) : href}
325
+ end
326
+ alias :link :get_link
327
+ alias :get_href :get_link
328
+
329
+ def map(selector_or_nodes, options={}, &mapper)
330
+ mapping = find(selector_or_nodes, options.merge(:preprocess => mapper))
331
+ unless options[:compact] == false
332
+ mapping = mapping.to_a.compact
207
333
  end
334
+ mapping
208
335
  end
209
336
 
337
+ def map_json(selector_or_nodes, options={}, &mapper)
338
+ JsonString map(selector_or_nodes, options, &mapper)
339
+ end
340
+
341
+
342
+ # FORMS #
343
+
210
344
  def form(form='form', hash={}, opts={})
211
345
  form = "[action=#{@loc.path.inspect}]" if form == :self
212
346
  if form.is String
@@ -237,6 +371,47 @@ module RHACK
237
371
  page
238
372
  end
239
373
 
374
+
375
+ # OLD #
376
+
377
+ # TODO: make into same form as #get_src and #map
378
+ def get_srcs(links='img')
379
+ begin
380
+ links = find(links).map {|e| e.src} if links.is String
381
+ rescue XML::Error
382
+ links = [links]
383
+ end
384
+ links.map {|link| expand_link link}.uniq
385
+ end
386
+ alias :srcs :get_srcs
387
+
388
+ #def get_src(link='img')
389
+ # begin
390
+ # link = at(link) && at(link).src if link.is String
391
+ # rescue XML::Error; nil
392
+ # end
393
+ # expand_link link if link
394
+ #end
395
+
396
+ def get_links(links='a')
397
+ begin
398
+ links = find(links).map {|e| e.href}.b || find(links+'//a').map {|e| e.href} if links.is String
399
+ rescue XML::Error
400
+ links = [links]
401
+ end
402
+ links.map {|link| expand_link link}.uniq
403
+ end
404
+ alias :get_hrefs :get_links
405
+ alias :links :get_links
406
+
407
+ #def get_link(link='a')
408
+ # begin
409
+ # link = at(link) && (at(link).href || at(link+'//a').href) if link.is String
410
+ # rescue XML::Error; nil
411
+ # end
412
+ # expand_link link if link
413
+ #end
414
+
240
415
  def load_scripts(frame)
241
416
  frame && frame.get_cached(*get_srcs("script[src]")).each {|js| eval_string js}
242
417
  end
data/lib/rhack/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module RHACK
2
- VERSION = '1.1.6'
2
+ VERSION = '1.1.7'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rhack
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.6
4
+ version: 1.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sergey Baev
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-03 00:00:00.000000000 Z
11
+ date: 2013-08-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport