rhack 1.1.0 → 1.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ MmU2Y2RiMzNmOTM0ZjUwYzkzM2VlNGEyNGQzZjcwMTI2NjdmNTk0YQ==
5
+ data.tar.gz: !binary |-
6
+ ZWYxYTBkMDI5Njg5NDhiMzgwYTU0MGRkZGY5MWJjNjA3ZjE5OGUxZQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ NTgxOTYyYTQwYjYwYTkzNTgyZGFmZWMzOGVmZTE1MjA2M2E5YzAzM2Q1ZTYy
10
+ ZjYyYTM3NjkzYTE5OGZjODQ3ZmU0Njc5NzY4MTEzYjE5MDI3MzdiYzc4N2Uy
11
+ ODFhNmIyMjQyOTE3NTFmZmIwN2VjMTE0ZjllMzY3ZDhmZDg1Mzg=
12
+ data.tar.gz: !binary |-
13
+ YmUyM2IyNjBiMWFkNTlmZWUwZWIyZjUxZmIxYmNkYjc3OTliN2Q1ZmMxYzM2
14
+ ZmIzM2Q4MWFhNGZjYjE1NmNkMjcwZGQ5YzAyNjNjNGQ4NGM5N2VmMTQ2MGY5
15
+ ODZkMGJjMGJlOGEyMjIxYjc0NjMyZGY0ZjY1MzRkMjQwNmZlNDQ=
data/README.md CHANGED
@@ -31,6 +31,25 @@ It's still randomly documented since it's just my working tool.
31
31
 
32
32
  ### CHANGES
33
33
 
34
+ ##### Version 1.1.4
35
+
36
+ * ::Frame
37
+ * Moved the `Curl.execute` call from *initialize* to the point right after a request has been added
38
+
39
+ * ::ScoutSquad
40
+ * Finally stabilized #next and #rand time management for parallel recursive execution
41
+
42
+ ##### Version 1.1.3
43
+
44
+ * ::Frame
45
+ * Added #anchor
46
+
47
+ * ::Scout
48
+ * Fixed #update
49
+ * Catch weird Curl::Err::CurlOK being thrown on some pages
50
+
51
+ * Fixed some exception messages
52
+
34
53
  ##### Version 1.1.0
35
54
 
36
55
  * ::OAuthClient < ::Client
@@ -58,6 +77,7 @@ It's still randomly documented since it's just my working tool.
58
77
 
59
78
  * ::Service
60
79
  * Is renamed to Client, which is more sensible. RHACK::Service is still usable as an alias
80
+ * `require 'rhack/clients'` and `require 'rhack/services'` are interchangeable
61
81
 
62
82
  * Structural changes
63
83
  * Updated and documented rhack.yml.template that now lies in <gemdir>/config
@@ -71,14 +71,13 @@ rb_hash_clear_i(VALUE key, VALUE value, VALUE dummy) {
71
71
  }
72
72
 
73
73
  static void curl_multi_free(ruby_curl_multi *rbcm) {
74
-
75
- if (rbcm && !rbcm->requests == Qnil && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
76
-
74
+ //if (rbcm && !rbcm->requests == Qnil && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
75
+ if (rbcm && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
77
76
  rb_hash_foreach( rbcm->requests, (int (*)())curl_multi_flush_easy, (VALUE)rbcm );
78
-
79
77
  rb_hash_foreach(rbcm->requests, rb_hash_clear_i, 0); //rb_hash_clear(rbcm->requests);
80
78
  rbcm->requests = Qnil;
81
79
  }
80
+
82
81
  curl_multi_cleanup(rbcm->handle);
83
82
  free(rbcm);
84
83
  }
@@ -179,10 +178,10 @@ static VALUE ruby_curl_multi_idle(VALUE self) {
179
178
 
180
179
  Data_Get_Struct(self, ruby_curl_multi, rbcm);
181
180
 
182
- if ( FIX2INT( rb_funcall(rbcm->requests, rb_intern("length"), 0) ) == 0 ) {
183
- return Qtrue;
184
- } else {
181
+ if (RHASH_LEN(rbcm->requests))
185
182
  return Qfalse;
183
+ } else {
184
+ return Qtrue;
186
185
  }
187
186
  }
188
187
 
@@ -627,6 +626,7 @@ static void rb_curl_multi_idle_perform(VALUE self, ruby_curl_multi *rbcm) {
627
626
  create_crt_fd(&fdexcep, &crt_fdexcep);
628
627
  #endif
629
628
 
629
+ // sleep while no requests
630
630
  do {
631
631
  #ifdef HAVE_RB_THREAD_BLOCKING_REGION
632
632
  fdset_args.maxfd = 0;
@@ -641,7 +641,7 @@ static void rb_curl_multi_idle_perform(VALUE self, ruby_curl_multi *rbcm) {
641
641
  if (rc == -1)
642
642
  rb_raise(rb_eRuntimeError, "select(): %s", strerror(errno));
643
643
 
644
- } while (!(RHASH_TBL(rbcm->requests)->num_entries));
644
+ } while (!RHASH_LEN(rbcm->requests));
645
645
 
646
646
  #ifdef _WIN32
647
647
  cleanup_crt_fd(&fdread, &crt_fdread);
@@ -0,0 +1,10 @@
1
+ require 'rhack'
2
+ require 'rhack/clients/base'
3
+ require 'rhack/clients/storage'
4
+ require 'rhack/clients/oauth'
5
+
6
+ module RHACK
7
+ for name in [:Service, :ServiceError]
8
+ autoload name, 'rhack/clients/compatibility'
9
+ end
10
+ end
File without changes
@@ -95,7 +95,7 @@ module RHACK
95
95
  response_type: 'code',
96
96
  client_id: OAUTH(:id),
97
97
  state: state
98
- }.merge(url_params).to_params
98
+ }.merge(url_params).urlencode
99
99
  end
100
100
 
101
101
  # @ url_params : {:code, :state, ...}
@@ -121,7 +121,7 @@ module RHACK
121
121
  grant_type: 'authorization_code',
122
122
  client_id: OAUTH(:id),
123
123
  client_secret: OAUTH(:secret)
124
- }.merge(url_params).to_params, raw: true, proc_result: block) {|curl|
124
+ }.merge(url_params).urlencode, raw: true, proc_result: block) {|curl|
125
125
  L.debug curl.res
126
126
  L.debug curl.res.body
127
127
  # TODO: refactor parse type selector: raw, json, hash, xml...
@@ -145,7 +145,7 @@ module RHACK
145
145
  grant_type: 'client_credentials',
146
146
  client_id: OAUTH(:id),
147
147
  client_secret: OAUTH(:secret)
148
- }.to_params, raw: true, proc_result: block) {|curl|
148
+ }.urlencode, raw: true, proc_result: block) {|curl|
149
149
  if curl.res.code == 200
150
150
  body = curl.res.body
151
151
  hash = '{['[body[0]] ? body.from_json(symbolize_keys: true) : body.to_params
@@ -194,7 +194,7 @@ module RHACK
194
194
 
195
195
  L.debug state_params
196
196
  action += '?' if !action['?']
197
- action += action_params.to_params
197
+ action += action_params.urlencode
198
198
  L.debug [action_data, action, token]
199
199
  opts = {proc_result: block, headers: {'Referer' => nil}, result: CodeIndiffirentPage}.merge(opts)
200
200
  # TODO: option to
@@ -27,10 +27,11 @@ module RHACK
27
27
  def initialize *args
28
28
  args << 10 unless args[-1].is Fixnum
29
29
  args.insert -2, {} unless args[-2].is Hash
30
- if scouts = args[-2][:scouts]
30
+ opts = args[-2]
31
+ if scouts = (opts[:scouts] || opts[:threads])
31
32
  args[-1] = scouts
32
33
  end
33
- @opts = {:eval => Johnson::Enabled, :redir => true, :cp => true, :result => Page}.merge!(args[-2])
34
+ @opts = {:eval => Johnson::Enabled, :redir => true, :cp => true, :result => Page}.merge!(opts)
34
35
  args[-2] = @opts
35
36
  if args[0].is String
36
37
  url = args[0]
@@ -41,7 +42,6 @@ module RHACK
41
42
  @static = false
42
43
  end
43
44
  @ss = ScoutSquad *args
44
- Curl.run :unless_allready
45
45
  end
46
46
 
47
47
  def update_loc url
@@ -57,6 +57,10 @@ module RHACK
57
57
  end
58
58
  alias :target= :retarget
59
59
 
60
+ def anchor
61
+ retarget @loc.href
62
+ end
63
+
60
64
  def next() @ss.next end
61
65
  def rand() @ss.rand end
62
66
  def each(&block) @ss.each &block end
@@ -205,17 +209,17 @@ module RHACK
205
209
  if @static
206
210
  if @static.is Hash
207
211
  if loc.host != @loc.host and !@static.host
208
- raise TargetError, "unable to get #{url} by static frame [#{@static.protocol}://]#{@loc.host}, you should first update it with new target"
212
+ raise TargetError, "unable to get #{url} by a static frame [#{@static.protocol}://]#{@loc.host}, you should first update it with a new target"
209
213
  end
210
214
  else
211
- raise TargetError, "unable to get #{url} by static frame #{@loc.root}, you should first update it with new target"
215
+ raise TargetError, "unable to get #{url} by a static frame #{@loc.root}, you should first update it with a new target"
212
216
  end
213
217
  end
214
218
  @loc.root, @loc.host, @loc.protocol = loc.root, loc.host, loc.protocol
215
219
  url
216
220
  elsif !loc.root
217
221
  if !@static
218
- raise TargetError, "undefined root for query #{url}, use :static option as Hash to set default protocol and host, or as True to allow using previously used root"
222
+ raise TargetError, "undefined root for query #{url}, use :static option as Hash to set a default protocol and host, or as True to allow using previously used root"
219
223
  elsif @static.is Hash
220
224
  # targeting relatively to default values (from @static hash)
221
225
  @loc.protocol = @static.protocol
@@ -223,7 +227,7 @@ module RHACK
223
227
  @loc.root = @loc.protocol+'://'+@loc.host
224
228
  end
225
229
  if !@loc.host
226
- raise TargetError, "undefined host for query #{url}, use :host parameter of :static option to set default host"
230
+ raise TargetError, "undefined host for query #{url}, use :host parameter of :static option to set a default host"
227
231
  end
228
232
  File.join @loc.root, url
229
233
  else url
@@ -238,6 +242,15 @@ module RHACK
238
242
  urls.map! {|u| validate u}
239
243
  end
240
244
 
245
+ # A feature of :proc_result is that, if you are running synchronously,
246
+ # the result of #run will be, for convenience, `page.res` instead of `page`
247
+ #
248
+ # If you only need to transfer &block through a stack of frame callbacks
249
+ # just add &block to the needed #run call
250
+ #
251
+ # If you want a method to be usable both in async mode (with &block passed)
252
+ # and in sync mode (with no &block passed),
253
+ # pass :save_result => !block to the topmost #run call
241
254
  def run_callbacks!(page, opts, &callback)
242
255
  # if no callback must have run then page.res is equal to the page
243
256
  # so we can get the page as result of a sync as well as an async request
@@ -280,12 +293,16 @@ module RHACK
280
293
  end
281
294
  if opts[:raw]
282
295
  page.res = yield curl
283
- # here +curl.res.body+ become empty
296
+ # here +curl.res.body+ becomes empty
284
297
  elsif page.process(curl, opts)
285
298
  @@cache[page.href] = page if order[0] == :loadGet and @use_cache
286
299
  run_callbacks! page, opts, &callback
287
300
  end
288
301
  }
302
+ # > Carier.requests++
303
+ unless opts[:wait] and opts[:thread_safe] or opts[:exec] == false
304
+ Curl.execute :unless_already
305
+ end
289
306
  if opts[:wait]
290
307
  opts[:thread_safe] ? Curl.carier.perform : Curl.wait
291
308
  (opts[:save_result] or :proc_result.in opts) ? page.res : page
@@ -302,11 +319,16 @@ module RHACK
302
319
  end
303
320
  pages = orders.zip(with_opts[:ranges]).send(iterator) {|order, range|
304
321
  (with_opts[:headers] ||= {}).Range = "bytes=#{range.begin}-#{range.end}"
305
- exec_one order, with_opts, &callback
322
+ exec_one order, with_opts.merge(:exec => false), &callback
306
323
  }
307
324
  else
325
+ # if ss.next runs out of scouts, it will start Curl by itself;
326
+ # admittedly, that does not fit with :thread_safe at all
308
327
  pages = orders.send(iterator) {|order| exec_one order, with_opts, &callback }
309
328
  end
329
+ unless w and with_opts[:thread_safe] or opts[:exec] == false
330
+ Curl.execute :unless_already
331
+ end
310
332
  with_opts[:thread_safe] ? Curl.carier.perform : Curl.wait if w
311
333
  with_opts[:stream] || pages
312
334
  end
@@ -54,7 +54,7 @@ module RHACK
54
54
 
55
55
  def update(uri)
56
56
  if !uri[/^\w+:\/\//]
57
- '/' >> uri if uri[0,1] != '/'
57
+ uri = '/' + uri if uri[0,1] != '/'
58
58
  @uri = uri.parse:uri
59
59
  return
60
60
  end
@@ -198,6 +198,10 @@ module RHACK
198
198
  Curl.carier.reqs.include? @http
199
199
  end
200
200
 
201
+ def available?
202
+ !loaded?
203
+ end
204
+
201
205
  def load!
202
206
  unless Curl.carier.add @http
203
207
  Curl.carier.remove @http
@@ -214,7 +218,10 @@ module RHACK
214
218
  @http.timeout = @timeout
215
219
 
216
220
  @http.on_complete {|c|
221
+ # > Carier.requests--
217
222
  @error = nil
223
+ # Until it is outdated, Curl::Response here may contain pointers to freed
224
+ # memory, thus raising an exception on #to_s and #inspect
218
225
  c.outdate!
219
226
  ProcCookies c.res if @cookieProc
220
227
  # We cannot just cancel on_complete in on_redirect block
@@ -226,10 +233,11 @@ module RHACK
226
233
  end
227
234
  }
228
235
  @http.on_failure {|c, e|
236
+ @error = e
229
237
  if e[0] == Curl::Err::CurlOK
230
- @error = e
231
- # TODO: где-то в сорцах on_failure вызывается по коду 0, видимо из-за стороннего условия, а не должен
232
- L.log << "Got Curl::Err::CurlOK, response was: #{c.res}"
238
+ # in the sources on_failure is never called with code 0, so this is some kind of glitch;
239
+ # in any case, such an outcome does not indicate an error
240
+ L.warn "Got Curl::Err::CurlOK, response was: #{c.res}"
233
241
  else
234
242
  @http.on_complete &Proc::NULL
235
243
  c.outdate!
@@ -34,47 +34,44 @@ module RHACK
34
34
  end
35
35
 
36
36
  def update uri, forced=nil
37
- each {|s| return L.warn "failed to update scout loaded? with url: #{s.http.url}" if s.loaded?} if !forced
37
+ each {|s| return L.warn "failed to update loaded scout with url: #{s.http.url}" if s.loaded?} if !forced
38
38
  each {|s| s.update uri}
39
39
  end
40
40
 
41
41
  def untargeted
42
42
  first.root == 'http://'
43
43
  end
44
+
45
+ def wait_for_available
46
+ Curl.execute :unless_already
47
+ # Carier.requests are released even before the callback runs,
48
+ # but callbacks are executed sequentially,
49
+ # so here we can put the thread to sleep,
50
+ # but only if it is not the Carier's own thread
51
+ if Curl.carier_thread == Thread.current
52
+ Curl.wait # runs Multi#perform
53
+ else
54
+ sleep 1
55
+ end
56
+ end
44
57
 
45
58
  def rand
46
59
  raise PickError if !b
47
- # to_a because reject returns object of this class
48
- if scout = to_a.rand {|_|!_.loaded?}; scout
49
- else # Curl should run here, otherwise `next'/`rand'-recursion will cause stack overflow
50
- unless Curl.status
51
- L.log "Curl must run in order to use ScoutSquad#rand; setting Carier Thread"
52
- Curl.execute
53
- #raise "Curl must run in order to use ScoutSquad#rand"
54
- end
55
- #Curl.wait
56
- loop {
57
- sleep 1
58
- break if Curl.carier.reqs.size < size
59
- }
60
+ # to_a because Array#reject returns object of this class
61
+ if scout = to_a.rand_by_available?
62
+ scout
63
+ else
64
+ wait_for_available
60
65
  self.rand
61
66
  end
62
67
  end
63
68
 
64
69
  def next
65
70
  raise PickError if !b
66
- if scout = find {|_|!_.loaded?}; scout
67
- else # Curl should run here, otherwise `next'/`rand'-recursion will cause stack overflow
68
- unless Curl.status
69
- L.log "Curl must run in order to use ScoutSquad#next; setting Carier Thread"
70
- Curl.execute :unless_allready
71
- #raise "Curl must run in order to use ScoutSquad#next"
72
- end
73
- #Curl.wait
74
- loop {
75
- sleep 1
76
- break if Curl.carier.reqs.size < size
77
- }
71
+ if scout = to_a.find_available?
72
+ scout
73
+ else
74
+ wait_for_available
78
75
  self.next
79
76
  end
80
77
  end
@@ -1,10 +1 @@
1
- require 'rhack'
2
- require 'rhack/services/base'
3
- require 'rhack/services/storage'
4
- require 'rhack/services/oauth'
5
-
6
- module RHACK
7
- for name in [:Service, :ServiceError]
8
- autoload name, 'rhack/services/compatibility'
9
- end
10
- end
1
+ require 'rhack/clients'
@@ -1,3 +1,3 @@
1
1
  module RHACK
2
- VERSION = '1.1.0'
2
+ VERSION = '1.1.4'
3
3
  end
metadata CHANGED
@@ -1,20 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rhack
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
5
- prerelease:
4
+ version: 1.1.4
6
5
  platform: ruby
7
6
  authors:
8
7
  - Sergey Baev
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-06-07 00:00:00.000000000 Z
11
+ date: 2013-07-03 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: activesupport
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
17
  - - ! '>='
20
18
  - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
24
  - - ! '>='
28
25
  - !ruby/object:Gem::Version
@@ -30,7 +27,6 @@ dependencies:
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: redis
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
31
  - - ! '>='
36
32
  - !ruby/object:Gem::Version
@@ -38,7 +34,6 @@ dependencies:
38
34
  type: :runtime
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
38
  - - ! '>='
44
39
  - !ruby/object:Gem::Version
@@ -46,7 +41,6 @@ dependencies:
46
41
  - !ruby/object:Gem::Dependency
47
42
  name: rmtools
48
43
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
44
  requirements:
51
45
  - - ! '>='
52
46
  - !ruby/object:Gem::Version
@@ -54,7 +48,6 @@ dependencies:
54
48
  type: :runtime
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
51
  requirements:
59
52
  - - ! '>='
60
53
  - !ruby/object:Gem::Version
@@ -62,7 +55,6 @@ dependencies:
62
55
  - !ruby/object:Gem::Dependency
63
56
  name: libxml-ruby
64
57
  requirement: !ruby/object:Gem::Requirement
65
- none: false
66
58
  requirements:
67
59
  - - ! '>='
68
60
  - !ruby/object:Gem::Version
@@ -70,7 +62,6 @@ dependencies:
70
62
  type: :runtime
71
63
  prerelease: false
72
64
  version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
65
  requirements:
75
66
  - - ! '>='
76
67
  - !ruby/object:Gem::Version
@@ -78,7 +69,6 @@ dependencies:
78
69
  - !ruby/object:Gem::Dependency
79
70
  name: bundler
80
71
  requirement: !ruby/object:Gem::Requirement
81
- none: false
82
72
  requirements:
83
73
  - - ~>
84
74
  - !ruby/object:Gem::Version
@@ -86,7 +76,6 @@ dependencies:
86
76
  type: :development
87
77
  prerelease: false
88
78
  version_requirements: !ruby/object:Gem::Requirement
89
- none: false
90
79
  requirements:
91
80
  - - ~>
92
81
  - !ruby/object:Gem::Version
@@ -94,7 +83,6 @@ dependencies:
94
83
  - !ruby/object:Gem::Dependency
95
84
  name: rake
96
85
  requirement: !ruby/object:Gem::Requirement
97
- none: false
98
86
  requirements:
99
87
  - - ! '>='
100
88
  - !ruby/object:Gem::Version
@@ -102,7 +90,6 @@ dependencies:
102
90
  type: :development
103
91
  prerelease: false
104
92
  version_requirements: !ruby/object:Gem::Requirement
105
- none: false
106
93
  requirements:
107
94
  - - ! '>='
108
95
  - !ruby/object:Gem::Version
@@ -156,6 +143,12 @@ files:
156
143
  - ext/curb/curb_upload.h
157
144
  - ext/curb/extconf.rb
158
145
  - lib/rhack.rb
146
+ - lib/rhack/clients.rb
147
+ - lib/rhack/clients/base.rb
148
+ - lib/rhack/clients/compatibility.rb
149
+ - lib/rhack/clients/examples.rb
150
+ - lib/rhack/clients/oauth.rb
151
+ - lib/rhack/clients/storage.rb
159
152
  - lib/rhack/cookie.rb
160
153
  - lib/rhack/curl.rb
161
154
  - lib/rhack/curl/easy.rb
@@ -179,11 +172,6 @@ files:
179
172
  - lib/rhack/scout.rb
180
173
  - lib/rhack/scout_squad.rb
181
174
  - lib/rhack/services.rb
182
- - lib/rhack/services/base.rb
183
- - lib/rhack/services/compatibility.rb
184
- - lib/rhack/services/examples.rb
185
- - lib/rhack/services/oauth.rb
186
- - lib/rhack/services/storage.rb
187
175
  - lib/rhack/storage.rb
188
176
  - lib/rhack/version.rb
189
177
  - lib/rhack_in.rb
@@ -194,26 +182,25 @@ files:
194
182
  homepage: https://github.com/tinbka/rhack
195
183
  licenses:
196
184
  - MIT
185
+ metadata: {}
197
186
  post_install_message:
198
187
  rdoc_options: []
199
188
  require_paths:
200
189
  - lib
201
190
  required_ruby_version: !ruby/object:Gem::Requirement
202
- none: false
203
191
  requirements:
204
192
  - - ! '>='
205
193
  - !ruby/object:Gem::Version
206
194
  version: '0'
207
195
  required_rubygems_version: !ruby/object:Gem::Requirement
208
- none: false
209
196
  requirements:
210
197
  - - ! '>='
211
198
  - !ruby/object:Gem::Version
212
199
  version: '0'
213
200
  requirements: []
214
201
  rubyforge_project:
215
- rubygems_version: 1.8.24
202
+ rubygems_version: 2.0.3
216
203
  signing_key:
217
- specification_version: 3
204
+ specification_version: 4
218
205
  summary: Curl-based web-client framework created for developing web-scrapers/bots
219
206
  test_files: []