rhack 1.1.6 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- NWNkY2FhMDI2NTg4YWMyOTNkNGRkMzJhYjc1NWFhYTU2Yzc3NzAzYg==
4
+ YzliZTlhMjBjZjM2MzE4ZjNkZDc0MDQzYjg3MDM5NGZmZjdmMjUwMA==
5
5
  data.tar.gz: !binary |-
6
- ODM0MDBmYmYyNjQxY2Q1M2YzYjhmYjFmNDY2N2QzNTRkZDQxNDA1Ng==
6
+ OGFmNGYyN2Y0MTRkODYzNGRlNzc3NDY1ZTg3MTU5ZTBhZTdiN2FjZQ==
7
7
  !binary "U0hBNTEy":
8
8
  metadata.gz: !binary |-
9
- ZGEyMzYxYzY0Y2U0Y2JhZjA5NjMzNDQ1NWU3YzA5NDllNWU3NWNiODU2ZTNl
10
- OTZmNGQ1ZWIzZTdiOTUyNGM3N2U4ZTJiMDBkMzY2MDA5N2VmNDBjNGI5NDE4
11
- Mjk4NWM0NTE3MzViNTlkYzQ3YzE2ZTdiMzVmNjA1YWI0ZDhjZDY=
9
+ NGQyMjRiNDZmZjRhMWMzZGUwZGE3NjhlMmE2ODYzOGM2NTBkN2JhYThjMDMw
10
+ ZmZmODg3OThhMzFlNjU1ZjQ2MzI3ZjliY2ZlN2IyNDNiMGE0OTgxZTQxMDU2
11
+ OGRhYzlhNzgxYzY5ZGEyN2Y1MmZkNjVlZTA0OWJjMThjMGMyNTQ=
12
12
  data.tar.gz: !binary |-
13
- ODVhN2NlNDQzMzFmNzhiZGI5MDY3YWRmOTAyZWM0MDdkOTU0NjQ1MTc5ZmIw
14
- YzcxM2M0Y2U5NTkxYWRkNGYxZDU0YTZjNTEyNmJiMjA1YjI2NjI3YmQ3NjM4
15
- ODRmYWUzMTFiOGM1YmJlOWQzYTMwM2M3OTQyNjVkNjlmYTU1ZWM=
13
+ NTQ1ODZhMWY1NTIwZjcxZDM3YzVlY2ZiNTE3NDE2YmVkZDI1M2M4ODYyZjY3
14
+ OGM0OTM4NjNjZjU5NzNhMTJjYzMyODA0YTNiOWE1NjFiMmM3MWUyN2MxNTY1
15
+ NzU2OWE2YTU0M2RmM2VhM2FkMGE0YjhjMzU3ZDNkYWEyN2M5MmY=
data/README.md CHANGED
@@ -31,6 +31,11 @@ It's still randomly documented since it's just my working tool.
31
31
 
32
32
  ### CHANGES
33
33
 
34
+ ##### Version 1.1.7
35
+
36
+ * ::Page
37
+ * Fixed #expand_link for partial links
38
+
34
39
  ##### Version 1.1.6
35
40
 
36
41
  * ::Frame
@@ -71,12 +71,8 @@ rb_hash_clear_i(VALUE key, VALUE value, VALUE dummy) {
71
71
  }
72
72
 
73
73
  static void curl_multi_free(ruby_curl_multi *rbcm) {
74
- if (rbcm) {
75
- ruby_log((!rbcm->requests == Qnil) ? "true" : "false");
76
- ruby_log_obj(rbcm->requests);
77
- }
78
- if (rbcm && !rbcm->requests == Qnil && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
79
- //if (rbcm && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
74
+ //if (rbcm && !rbcm->requests == Qnil && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
75
+ if (rbcm && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
80
76
  rb_hash_foreach( rbcm->requests, (int (*)())curl_multi_flush_easy, (VALUE)rbcm );
81
77
  rb_hash_foreach(rbcm->requests, rb_hash_clear_i, 0); //rb_hash_clear(rbcm->requests);
82
78
  rbcm->requests = Qnil;
@@ -44,6 +44,7 @@ module Curl
44
44
  end
45
45
  alias :run :execute
46
46
 
47
+ # TODO: вместо статуса sleep/run использовать глобальную класс-переменную
47
48
  def wait
48
49
  if @@carier_thread and @@carier_thread.status
49
50
  unless within = Thread.current == @@carier_thread
data/lib/rhack/page.rb CHANGED
@@ -1,10 +1,35 @@
1
1
  # encoding: utf-8
2
- module RHACK
2
+ class Object
3
+ unless defined? one_argument_is_a?
4
+ alias :one_argument_is_a? :is_a?
5
+ def is_a?(class1, *other_classes)
6
+ one_argument_is_a? class1 or other_classes.any? {|cl| one_argument_is_a? cl}
7
+ end
8
+ end
9
+ end
3
10
 
4
- # Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, ... ) ) =>
5
- # Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, ...
11
+ module RHACK
12
+
13
+ class JsonString < String
14
+ __init__
15
+ attr_reader :source
16
+
17
+ def initialize(source)
18
+ @source = source
19
+ super source.to_json
20
+ end
6
21
 
7
- class Page
22
+ def inspect
23
+ "#<RHACK::JsonString(#{@source.inspect})>"
24
+ end
25
+ end
26
+
27
+ class ScrapeError < ArgumentError; end
28
+ class NodeNotFound < ScrapeError; end
29
+
30
+ # Frame( ScoutSquad( Curl::Multi <- Scout( Curl API ), Scout, ... ) ) =>
31
+ # Curl -> Johnson::Runtime -> XML::Document => Page( XML::Document ), Page, ...
32
+ class Page
8
33
  # for debug, just enable L#debug, don't write tons of chaotic log-lines
9
34
  __init__
10
35
  attr_writer :title
@@ -44,6 +69,9 @@ module RHACK
44
69
  @html.force_encoding(encoding)
45
70
  end
46
71
 
72
+ def url() @loc.href end
73
+ alias :href :url
74
+
47
75
  # We can then alternate #process in Page subclasses
48
76
  # Frame doesn't mind about value returned by #process
49
77
  def process(c, opts={})
@@ -150,63 +178,169 @@ module RHACK
150
178
  @loc.href
151
179
  end
152
180
  end
153
-
154
- def find(xp) (@doc || to_doc).find xp end
155
181
 
156
- def at(xp) (@doc || to_doc).at xp end
157
182
 
158
- def url() @loc.href end
159
- alias :href :url
183
+ # HELPERS #
160
184
 
161
- def get_srcs(links='img')
162
- begin
163
- links = find(links).map {|e| e.src} if links.is String
164
- rescue XML::Error
165
- links = [links]
185
+ # hook to create even-looked lines defining a hash in my Verdana 10px, e.g.
186
+ # dict key1: value1, ...
187
+ # key2: value2, ...
188
+ def dict(hash)
189
+ hash.is_a?(Hash) ? hash : Hash[hash]
190
+ end
191
+
192
+ # maps {'firstname lastname' => tuple} into {:firstname => tuple[0], :lastname => tuple[1]}
193
+ def flatten_dict(hash)
194
+ result = {}
195
+ hash.each {|k, v|
196
+ if k.is String and k[' ']
197
+ k.split(' ').each_with_index {|k_unit, k_idx|
198
+ result[k_unit.to_sym] = v[k_idx]
199
+ }
200
+ else
201
+ result[k.to_sym] = v
202
+ end
203
+ }
204
+ result
205
+ end
206
+
207
+ # makes a relative path being on this page into an absolute path
208
+ def expand_link(link)
209
+ case link
210
+ when /^\w+:\/\// then link
211
+ when /^\/\// then @loc.protocol + ':' + link
212
+ when /^\// then @loc.root + link
213
+ when /^\?/ then File.join(@loc.root, @loc.path) + link
214
+ when /^#/ then File.join(@loc.root, @loc.fullpath) + link
215
+ else File.join @loc.root, File.dirname(@loc.path), link
166
216
  end
167
- links.map {|link| expand_link link}.uniq
168
217
  end
169
218
 
170
- def get_src(link='img')
171
- begin
172
- link = at(link) && at(link).src if link.is String
173
- rescue XML::Error; nil
219
+
220
+ # FINDERS #
221
+
222
+ private
223
+
224
+ def node_is_missing!(selector, options)
225
+ missing = options[:missing]
226
+ if missing.is Proc
227
+ missing.call(selector)
228
+ elsif missing
229
+ if missing.is String
230
+ message %= {selector: selector}
231
+ end
232
+ raise NodeNotFound, missing
233
+ end
234
+ end
235
+
236
+ def preprocess_search_result(preresult, preprocess)
237
+ if preprocess.is_a? Proc
238
+ preprocess.call(preresult)
239
+ elsif preprocess.is_a? Symbol
240
+ __send__(preprocess, preresult)
241
+ else
242
+ preresult
243
+ end
244
+ end
245
+
246
+ def preprocess_search_results(preresult, preprocess)
247
+ if preprocess.is_a? Proc
248
+ preresult.map(&preprocess)
249
+ elsif preprocess.is_a? Symbol
250
+ preresult.map {|node| __send__(preprocess, node)}
251
+ else
252
+ preresult
174
253
  end
175
- expand_link link if link
176
254
  end
177
255
 
178
- def get_links(links='a')
179
- begin
180
- links = find(links).map {|e| e.href}.b || find(links+'//a').map {|e| e.href} if links.is String
181
- rescue XML::Error
182
- links = [links]
256
+ def __at(xp) (@doc || to_doc).at xp end
257
+
258
+ def __find(xp) (@doc || to_doc).find xp end
259
+
260
+ public
261
+
262
+ def at(selector_or_node, options={})
263
+ if selector_or_node and preresult = selector_or_node.is_a?(XML::Node) ?
264
+ selector_or_node : __at(selector_or_node)
265
+
266
+ preresult = preprocess_search_result(preresult, options[:preprocess])
267
+ block_given? ? yield(preresult) : preresult
268
+ else
269
+ node_is_missing!(selector_or_node, options)
270
+ preresult
183
271
  end
184
- links.map {|link| expand_link link}.uniq
185
272
  end
273
+ alias :first :at
186
274
 
187
- def get_link(link='a')
188
- begin
189
- link = at(link) && (at(link).href || at(link+'//a').href) if link.is String
190
- rescue XML::Error; nil
275
+ def find(selector_or_nodes, options={}, &foreach)
276
+ preresult = selector_or_nodes.is_a?(XML::XPath::Object, Array) ?
277
+ selector_or_nodes : __find(selector_or_nodes)
278
+
279
+ if preresult.size > 0
280
+ preresult = preprocess_search_results(preresult, options[:preprocess])
281
+ foreach ? preresult.each(&foreach) : preresult
282
+ else
283
+ node_is_missing!(selector_or_nodes, options)
284
+ preresult
191
285
  end
192
- expand_link link if link
193
286
  end
194
- alias :get_hrefs :get_links
195
- alias :links :get_links
196
- alias :get_href :get_link
197
- alias :link :get_link
198
- alias :srcs :get_srcs
287
+ alias :all :find
288
+
289
+
290
+ # FINDERS PREPROCESSORS #
291
+
292
+ def text(selector_or_node, options={})
293
+ if node = at(selector_or_node, options)
294
+ txt = node.text.strip
295
+ block_given? ? yield(txt) : txt
296
+ end
297
+ end
298
+
299
+ def texts(hash, options={})
300
+ hash.map_values {|selector_or_node|
301
+ text(selector_or_node, options)
302
+ }
303
+ end
304
+
305
+ def get_src(selector_or_node='img', options={}, &onfound)
306
+ at(selector_or_node, options.merge(:preprocess => lambda {|node|
307
+ if src = node.src
308
+ expand_link src
309
+ end
310
+ })) {|href| onfound && src ? onfound.call(src) : src}
311
+ end
199
312
  alias :src :get_src
200
313
 
201
- def expand_link(link)
202
- case link
203
- when /^\w+:\/\// then link
204
- when /^\/\// then @loc.protocol+link
205
- when /^\// then @loc.root+link
206
- else File.join((@loc.path.b ? File.dirname(@loc.path) : @loc.root), link)
314
+ def get_link(selector_or_node='a', options={}, &onfound)
315
+ at(selector_or_node, options.merge(:preprocess => lambda {|node|
316
+ unless href = node.href
317
+ if node = node.find('a')
318
+ href = node.href
319
+ end
320
+ end
321
+ if href
322
+ expand_link href
323
+ end
324
+ })) {|href| onfound && href ? onfound.call(href) : href}
325
+ end
326
+ alias :link :get_link
327
+ alias :get_href :get_link
328
+
329
+ def map(selector_or_nodes, options={}, &mapper)
330
+ mapping = find(selector_or_nodes, options.merge(:preprocess => mapper))
331
+ unless options[:compact] == false
332
+ mapping = mapping.to_a.compact
207
333
  end
334
+ mapping
208
335
  end
209
336
 
337
+ def map_json(selector_or_nodes, options={}, &mapper)
338
+ JsonString map(selector_or_nodes, options, &mapper)
339
+ end
340
+
341
+
342
+ # FORMS #
343
+
210
344
  def form(form='form', hash={}, opts={})
211
345
  form = "[action=#{@loc.path.inspect}]" if form == :self
212
346
  if form.is String
@@ -237,6 +371,47 @@ module RHACK
237
371
  page
238
372
  end
239
373
 
374
+
375
+ # OLD #
376
+
377
+ # TODO: make into same form as #get_src and #map
378
+ def get_srcs(links='img')
379
+ begin
380
+ links = find(links).map {|e| e.src} if links.is String
381
+ rescue XML::Error
382
+ links = [links]
383
+ end
384
+ links.map {|link| expand_link link}.uniq
385
+ end
386
+ alias :srcs :get_srcs
387
+
388
+ #def get_src(link='img')
389
+ # begin
390
+ # link = at(link) && at(link).src if link.is String
391
+ # rescue XML::Error; nil
392
+ # end
393
+ # expand_link link if link
394
+ #end
395
+
396
+ def get_links(links='a')
397
+ begin
398
+ links = find(links).map {|e| e.href}.b || find(links+'//a').map {|e| e.href} if links.is String
399
+ rescue XML::Error
400
+ links = [links]
401
+ end
402
+ links.map {|link| expand_link link}.uniq
403
+ end
404
+ alias :get_hrefs :get_links
405
+ alias :links :get_links
406
+
407
+ #def get_link(link='a')
408
+ # begin
409
+ # link = at(link) && (at(link).href || at(link+'//a').href) if link.is String
410
+ # rescue XML::Error; nil
411
+ # end
412
+ # expand_link link if link
413
+ #end
414
+
240
415
  def load_scripts(frame)
241
416
  frame && frame.get_cached(*get_srcs("script[src]")).each {|js| eval_string js}
242
417
  end
data/lib/rhack/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module RHACK
2
- VERSION = '1.1.6'
2
+ VERSION = '1.1.7'
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rhack
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.6
4
+ version: 1.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sergey Baev
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-07-03 00:00:00.000000000 Z
11
+ date: 2013-08-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport