rhack 1.1.0 → 1.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,15 @@
1
+ ---
2
+ !binary "U0hBMQ==":
3
+ metadata.gz: !binary |-
4
+ MmU2Y2RiMzNmOTM0ZjUwYzkzM2VlNGEyNGQzZjcwMTI2NjdmNTk0YQ==
5
+ data.tar.gz: !binary |-
6
+ ZWYxYTBkMDI5Njg5NDhiMzgwYTU0MGRkZGY5MWJjNjA3ZjE5OGUxZQ==
7
+ !binary "U0hBNTEy":
8
+ metadata.gz: !binary |-
9
+ NTgxOTYyYTQwYjYwYTkzNTgyZGFmZWMzOGVmZTE1MjA2M2E5YzAzM2Q1ZTYy
10
+ ZjYyYTM3NjkzYTE5OGZjODQ3ZmU0Njc5NzY4MTEzYjE5MDI3MzdiYzc4N2Uy
11
+ ODFhNmIyMjQyOTE3NTFmZmIwN2VjMTE0ZjllMzY3ZDhmZDg1Mzg=
12
+ data.tar.gz: !binary |-
13
+ YmUyM2IyNjBiMWFkNTlmZWUwZWIyZjUxZmIxYmNkYjc3OTliN2Q1ZmMxYzM2
14
+ ZmIzM2Q4MWFhNGZjYjE1NmNkMjcwZGQ5YzAyNjNjNGQ4NGM5N2VmMTQ2MGY5
15
+ ODZkMGJjMGJlOGEyMjIxYjc0NjMyZGY0ZjY1MzRkMjQwNmZlNDQ=
data/README.md CHANGED
@@ -31,6 +31,25 @@ It's still randomly documented since it's just my working tool.
31
31
 
32
32
  ### CHANGES
33
33
 
34
+ ##### Version 1.1.4
35
+
36
+ * ::Frame
37
+ * Moved `Curl.execute` from *initialize* to *on after request added*
38
+
39
+ * ::ScoutSquad
40
+ * Finally stabilized #next and #rand time management for parallel recursive execution
41
+
42
+ ##### Version 1.1.3
43
+
44
+ * ::Frame
45
+ * Added #anchor
46
+
47
+ * ::Scout
48
+ * Fixed #update
49
+ * Catch weird Curl::Err::CurlOK being thrown on some pages
50
+
51
+ * Fixed some exception messages
52
+
34
53
  ##### Version 1.1.0
35
54
 
36
55
  * ::OAuthClient < ::Client
@@ -58,6 +77,7 @@ It's still randomly documented since it's just my working tool.
58
77
 
59
78
  * ::Service
60
79
  * Is renamed to Client, which is more sensible. RHACK::Service is still usable as an alias
80
+ * require 'rhack/clients' <-> require 'rhack/services'
61
81
 
62
82
  * Structural changes
63
83
  * Updated and documented rhack.yml.template that now lies in <gemdir>/config
@@ -71,14 +71,13 @@ rb_hash_clear_i(VALUE key, VALUE value, VALUE dummy) {
71
71
  }
72
72
 
73
73
  static void curl_multi_free(ruby_curl_multi *rbcm) {
74
-
75
- if (rbcm && !rbcm->requests == Qnil && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
76
-
74
+ //if (rbcm && !rbcm->requests == Qnil && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
75
+ if (rbcm && rb_type(rbcm->requests) == T_HASH && RHASH_LEN(rbcm->requests) > 0) {
77
76
  rb_hash_foreach( rbcm->requests, (int (*)())curl_multi_flush_easy, (VALUE)rbcm );
78
-
79
77
  rb_hash_foreach(rbcm->requests, rb_hash_clear_i, 0); //rb_hash_clear(rbcm->requests);
80
78
  rbcm->requests = Qnil;
81
79
  }
80
+
82
81
  curl_multi_cleanup(rbcm->handle);
83
82
  free(rbcm);
84
83
  }
@@ -179,10 +178,10 @@ static VALUE ruby_curl_multi_idle(VALUE self) {
179
178
 
180
179
  Data_Get_Struct(self, ruby_curl_multi, rbcm);
181
180
 
182
- if ( FIX2INT( rb_funcall(rbcm->requests, rb_intern("length"), 0) ) == 0 ) {
183
- return Qtrue;
184
- } else {
181
+ if (RHASH_LEN(rbcm->requests))
185
182
  return Qfalse;
183
+ } else {
184
+ return Qtrue;
186
185
  }
187
186
  }
188
187
 
@@ -627,6 +626,7 @@ static void rb_curl_multi_idle_perform(VALUE self, ruby_curl_multi *rbcm) {
627
626
  create_crt_fd(&fdexcep, &crt_fdexcep);
628
627
  #endif
629
628
 
629
+ // sleep while no requests
630
630
  do {
631
631
  #ifdef HAVE_RB_THREAD_BLOCKING_REGION
632
632
  fdset_args.maxfd = 0;
@@ -641,7 +641,7 @@ static void rb_curl_multi_idle_perform(VALUE self, ruby_curl_multi *rbcm) {
641
641
  if (rc == -1)
642
642
  rb_raise(rb_eRuntimeError, "select(): %s", strerror(errno));
643
643
 
644
- } while (!(RHASH_TBL(rbcm->requests)->num_entries));
644
+ } while (!RHASH_LEN(rbcm->requests));
645
645
 
646
646
  #ifdef _WIN32
647
647
  cleanup_crt_fd(&fdread, &crt_fdread);
@@ -0,0 +1,10 @@
1
+ require 'rhack'
2
+ require 'rhack/clients/base'
3
+ require 'rhack/clients/storage'
4
+ require 'rhack/clients/oauth'
5
+
6
+ module RHACK
7
+ for name in [:Service, :ServiceError]
8
+ autoload name, 'rhack/clients/compatibility'
9
+ end
10
+ end
File without changes
@@ -95,7 +95,7 @@ module RHACK
95
95
  response_type: 'code',
96
96
  client_id: OAUTH(:id),
97
97
  state: state
98
- }.merge(url_params).to_params
98
+ }.merge(url_params).urlencode
99
99
  end
100
100
 
101
101
  # @ url_params : {:code, :state, ...}
@@ -121,7 +121,7 @@ module RHACK
121
121
  grant_type: 'authorization_code',
122
122
  client_id: OAUTH(:id),
123
123
  client_secret: OAUTH(:secret)
124
- }.merge(url_params).to_params, raw: true, proc_result: block) {|curl|
124
+ }.merge(url_params).urlencode, raw: true, proc_result: block) {|curl|
125
125
  L.debug curl.res
126
126
  L.debug curl.res.body
127
127
  # TODO: refactor parse type selector: raw, json, hash, xml...
@@ -145,7 +145,7 @@ module RHACK
145
145
  grant_type: 'client_credentials',
146
146
  client_id: OAUTH(:id),
147
147
  client_secret: OAUTH(:secret)
148
- }.to_params, raw: true, proc_result: block) {|curl|
148
+ }.urlencode, raw: true, proc_result: block) {|curl|
149
149
  if curl.res.code == 200
150
150
  body = curl.res.body
151
151
  hash = '{['[body[0]] ? body.from_json(symbolize_keys: true) : body.to_params
@@ -194,7 +194,7 @@ module RHACK
194
194
 
195
195
  L.debug state_params
196
196
  action += '?' if !action['?']
197
- action += action_params.to_params
197
+ action += action_params.urlencode
198
198
  L.debug [action_data, action, token]
199
199
  opts = {proc_result: block, headers: {'Referer' => nil}, result: CodeIndiffirentPage}.merge(opts)
200
200
  # TODO: option to
@@ -27,10 +27,11 @@ module RHACK
27
27
  def initialize *args
28
28
  args << 10 unless args[-1].is Fixnum
29
29
  args.insert -2, {} unless args[-2].is Hash
30
- if scouts = args[-2][:scouts]
30
+ opts = args[-2]
31
+ if scouts = (opts[:scouts] || opts[:threads])
31
32
  args[-1] = scouts
32
33
  end
33
- @opts = {:eval => Johnson::Enabled, :redir => true, :cp => true, :result => Page}.merge!(args[-2])
34
+ @opts = {:eval => Johnson::Enabled, :redir => true, :cp => true, :result => Page}.merge!(opts)
34
35
  args[-2] = @opts
35
36
  if args[0].is String
36
37
  url = args[0]
@@ -41,7 +42,6 @@ module RHACK
41
42
  @static = false
42
43
  end
43
44
  @ss = ScoutSquad *args
44
- Curl.run :unless_allready
45
45
  end
46
46
 
47
47
  def update_loc url
@@ -57,6 +57,10 @@ module RHACK
57
57
  end
58
58
  alias :target= :retarget
59
59
 
60
+ def anchor
61
+ retarget @loc.href
62
+ end
63
+
60
64
  def next() @ss.next end
61
65
  def rand() @ss.rand end
62
66
  def each(&block) @ss.each &block end
@@ -205,17 +209,17 @@ module RHACK
205
209
  if @static
206
210
  if @static.is Hash
207
211
  if loc.host != @loc.host and !@static.host
208
- raise TargetError, "unable to get #{url} by static frame [#{@static.protocol}://]#{@loc.host}, you should first update it with new target"
212
+ raise TargetError, "unable to get #{url} by a static frame [#{@static.protocol}://]#{@loc.host}, you should first update it with a new target"
209
213
  end
210
214
  else
211
- raise TargetError, "unable to get #{url} by static frame #{@loc.root}, you should first update it with new target"
215
+ raise TargetError, "unable to get #{url} by a static frame #{@loc.root}, you should first update it with a new target"
212
216
  end
213
217
  end
214
218
  @loc.root, @loc.host, @loc.protocol = loc.root, loc.host, loc.protocol
215
219
  url
216
220
  elsif !loc.root
217
221
  if !@static
218
- raise TargetError, "undefined root for query #{url}, use :static option as Hash to set default protocol and host, or as True to allow using previously used root"
222
+ raise TargetError, "undefined root for query #{url}, use :static option as Hash to set a default protocol and host, or as True to allow using previously used root"
219
223
  elsif @static.is Hash
220
224
  # targeting relatively to default values (from @static hash)
221
225
  @loc.protocol = @static.protocol
@@ -223,7 +227,7 @@ module RHACK
223
227
  @loc.root = @loc.protocol+'://'+@loc.host
224
228
  end
225
229
  if !@loc.host
226
- raise TargetError, "undefined host for query #{url}, use :host parameter of :static option to set default host"
230
+ raise TargetError, "undefined host for query #{url}, use :host parameter of :static option to set a default host"
227
231
  end
228
232
  File.join @loc.root, url
229
233
  else url
@@ -238,6 +242,15 @@ module RHACK
238
242
  urls.map! {|u| validate u}
239
243
  end
240
244
 
245
+ # A feature of :proc_result is that, if you are running synchronously,
246
+ # the result of #run will be, for convenience, `page.res` instead of `page`
247
+ #
248
+ # If you only need to transfer &block through a stack of frame callbacks
249
+ # just add &block to the needed #run call
250
+ #
251
+ # If you want a method to be usable both in async mode with &block passed
252
+ # and in sync mode with no &block passed,
253
+ # pass :save_result => !block to the topmost #run call
241
254
  def run_callbacks!(page, opts, &callback)
242
255
  # if no callback must have run then page.res is equal to the page
243
256
  # so we can get the page as result of a sync as well as an async request
@@ -280,12 +293,16 @@ module RHACK
280
293
  end
281
294
  if opts[:raw]
282
295
  page.res = yield curl
283
- # here +curl.res.body+ become empty
296
+ # here +curl.res.body+ becomes empty
284
297
  elsif page.process(curl, opts)
285
298
  @@cache[page.href] = page if order[0] == :loadGet and @use_cache
286
299
  run_callbacks! page, opts, &callback
287
300
  end
288
301
  }
302
+ # > Carier.requests++
303
+ unless opts[:wait] and opts[:thread_safe] or opts[:exec] == false
304
+ Curl.execute :unless_already
305
+ end
289
306
  if opts[:wait]
290
307
  opts[:thread_safe] ? Curl.carier.perform : Curl.wait
291
308
  (opts[:save_result] or :proc_result.in opts) ? page.res : page
@@ -302,11 +319,16 @@ module RHACK
302
319
  end
303
320
  pages = orders.zip(with_opts[:ranges]).send(iterator) {|order, range|
304
321
  (with_opts[:headers] ||= {}).Range = "bytes=#{range.begin}-#{range.end}"
305
- exec_one order, with_opts, &callback
322
+ exec_one order, with_opts.merge(:exec => false), &callback
306
323
  }
307
324
  else
325
+ # если ss.next будет не хватать скаутов, то он сам запустит курл
326
+ # правда, это с :thread_safe никак не вяжется
308
327
  pages = orders.send(iterator) {|order| exec_one order, with_opts, &callback }
309
328
  end
329
+ unless w and with_opts[:thread_safe] or opts[:exec] == false
330
+ Curl.execute :unless_already
331
+ end
310
332
  with_opts[:thread_safe] ? Curl.carier.perform : Curl.wait if w
311
333
  with_opts[:stream] || pages
312
334
  end
@@ -54,7 +54,7 @@ module RHACK
54
54
 
55
55
  def update(uri)
56
56
  if !uri[/^\w+:\/\//]
57
- '/' >> uri if uri[0,1] != '/'
57
+ uri = '/' + uri if uri[0,1] != '/'
58
58
  @uri = uri.parse:uri
59
59
  return
60
60
  end
@@ -198,6 +198,10 @@ module RHACK
198
198
  Curl.carier.reqs.include? @http
199
199
  end
200
200
 
201
+ def available?
202
+ !loaded?
203
+ end
204
+
201
205
  def load!
202
206
  unless Curl.carier.add @http
203
207
  Curl.carier.remove @http
@@ -214,7 +218,10 @@ module RHACK
214
218
  @http.timeout = @timeout
215
219
 
216
220
  @http.on_complete {|c|
221
+ # > Carier.requests--
217
222
  @error = nil
223
+ # While not outdated, Curl::Response here may contain pointers to freed
224
+ # memory, thus raising an exception on #to_s and #inspect
218
225
  c.outdate!
219
226
  ProcCookies c.res if @cookieProc
220
227
  # We cannot just cancel on_complete in on_redirect block
@@ -226,10 +233,11 @@ module RHACK
226
233
  end
227
234
  }
228
235
  @http.on_failure {|c, e|
236
+ @error = e
229
237
  if e[0] == Curl::Err::CurlOK
230
- @error = e
231
- # TODO: где-то в сорцах on_failure вызывается по коду 0, видимо из-за стороннего условия, а не должен
232
- L.log << "Got Curl::Err::CurlOK, response was: #{c.res}"
238
+ # в сорцах on_failure не вызывается по коду 0, это какой-то глюк
239
+ # в любом случае такой поворот не означает ошибки
240
+ L.warn "Got Curl::Err::CurlOK, response was: #{c.res}"
233
241
  else
234
242
  @http.on_complete &Proc::NULL
235
243
  c.outdate!
@@ -34,47 +34,44 @@ module RHACK
34
34
  end
35
35
 
36
36
  def update uri, forced=nil
37
- each {|s| return L.warn "failed to update scout loaded? with url: #{s.http.url}" if s.loaded?} if !forced
37
+ each {|s| return L.warn "failed to update loaded scout with url: #{s.http.url}" if s.loaded?} if !forced
38
38
  each {|s| s.update uri}
39
39
  end
40
40
 
41
41
  def untargeted
42
42
  first.root == 'http://'
43
43
  end
44
+
45
+ def wait_for_available
46
+ Curl.execute :unless_already
47
+ # Carier.requests освобождаются ещё до колбека,
48
+ # но колбеки выполняются последовательно,
49
+ # поэтому здесь мы можем усыплять тред,
50
+ # но только если это не тред самого Carier
51
+ if Curl.carier_thread == Thread.current
52
+ Curl.wait # runs Multi#perform
53
+ else
54
+ sleep 1
55
+ end
56
+ end
44
57
 
45
58
  def rand
46
59
  raise PickError if !b
47
- # to_a because reject returns object of this class
48
- if scout = to_a.rand {|_|!_.loaded?}; scout
49
- else # Curl should run here, otherwise `next'/`rand'-recursion will cause stack overflow
50
- unless Curl.status
51
- L.log "Curl must run in order to use ScoutSquad#rand; setting Carier Thread"
52
- Curl.execute
53
- #raise "Curl must run in order to use ScoutSquad#rand"
54
- end
55
- #Curl.wait
56
- loop {
57
- sleep 1
58
- break if Curl.carier.reqs.size < size
59
- }
60
+ # to_a because Array#reject returns object of this class
61
+ if scout = to_a.rand_by_available?
62
+ scout
63
+ else
64
+ wait_for_available
60
65
  self.rand
61
66
  end
62
67
  end
63
68
 
64
69
  def next
65
70
  raise PickError if !b
66
- if scout = find {|_|!_.loaded?}; scout
67
- else # Curl should run here, otherwise `next'/`rand'-recursion will cause stack overflow
68
- unless Curl.status
69
- L.log "Curl must run in order to use ScoutSquad#next; setting Carier Thread"
70
- Curl.execute :unless_allready
71
- #raise "Curl must run in order to use ScoutSquad#next"
72
- end
73
- #Curl.wait
74
- loop {
75
- sleep 1
76
- break if Curl.carier.reqs.size < size
77
- }
71
+ if scout = to_a.find_available?
72
+ scout
73
+ else
74
+ wait_for_available
78
75
  self.next
79
76
  end
80
77
  end
@@ -1,10 +1 @@
1
- require 'rhack'
2
- require 'rhack/services/base'
3
- require 'rhack/services/storage'
4
- require 'rhack/services/oauth'
5
-
6
- module RHACK
7
- for name in [:Service, :ServiceError]
8
- autoload name, 'rhack/services/compatibility'
9
- end
10
- end
1
+ require 'rhack/clients'
@@ -1,3 +1,3 @@
1
1
  module RHACK
2
- VERSION = '1.1.0'
2
+ VERSION = '1.1.4'
3
3
  end
metadata CHANGED
@@ -1,20 +1,18 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rhack
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.0
5
- prerelease:
4
+ version: 1.1.4
6
5
  platform: ruby
7
6
  authors:
8
7
  - Sergey Baev
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-06-07 00:00:00.000000000 Z
11
+ date: 2013-07-03 00:00:00.000000000 Z
13
12
  dependencies:
14
13
  - !ruby/object:Gem::Dependency
15
14
  name: activesupport
16
15
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
16
  requirements:
19
17
  - - ! '>='
20
18
  - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
22
20
  type: :runtime
23
21
  prerelease: false
24
22
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
23
  requirements:
27
24
  - - ! '>='
28
25
  - !ruby/object:Gem::Version
@@ -30,7 +27,6 @@ dependencies:
30
27
  - !ruby/object:Gem::Dependency
31
28
  name: redis
32
29
  requirement: !ruby/object:Gem::Requirement
33
- none: false
34
30
  requirements:
35
31
  - - ! '>='
36
32
  - !ruby/object:Gem::Version
@@ -38,7 +34,6 @@ dependencies:
38
34
  type: :runtime
39
35
  prerelease: false
40
36
  version_requirements: !ruby/object:Gem::Requirement
41
- none: false
42
37
  requirements:
43
38
  - - ! '>='
44
39
  - !ruby/object:Gem::Version
@@ -46,7 +41,6 @@ dependencies:
46
41
  - !ruby/object:Gem::Dependency
47
42
  name: rmtools
48
43
  requirement: !ruby/object:Gem::Requirement
49
- none: false
50
44
  requirements:
51
45
  - - ! '>='
52
46
  - !ruby/object:Gem::Version
@@ -54,7 +48,6 @@ dependencies:
54
48
  type: :runtime
55
49
  prerelease: false
56
50
  version_requirements: !ruby/object:Gem::Requirement
57
- none: false
58
51
  requirements:
59
52
  - - ! '>='
60
53
  - !ruby/object:Gem::Version
@@ -62,7 +55,6 @@ dependencies:
62
55
  - !ruby/object:Gem::Dependency
63
56
  name: libxml-ruby
64
57
  requirement: !ruby/object:Gem::Requirement
65
- none: false
66
58
  requirements:
67
59
  - - ! '>='
68
60
  - !ruby/object:Gem::Version
@@ -70,7 +62,6 @@ dependencies:
70
62
  type: :runtime
71
63
  prerelease: false
72
64
  version_requirements: !ruby/object:Gem::Requirement
73
- none: false
74
65
  requirements:
75
66
  - - ! '>='
76
67
  - !ruby/object:Gem::Version
@@ -78,7 +69,6 @@ dependencies:
78
69
  - !ruby/object:Gem::Dependency
79
70
  name: bundler
80
71
  requirement: !ruby/object:Gem::Requirement
81
- none: false
82
72
  requirements:
83
73
  - - ~>
84
74
  - !ruby/object:Gem::Version
@@ -86,7 +76,6 @@ dependencies:
86
76
  type: :development
87
77
  prerelease: false
88
78
  version_requirements: !ruby/object:Gem::Requirement
89
- none: false
90
79
  requirements:
91
80
  - - ~>
92
81
  - !ruby/object:Gem::Version
@@ -94,7 +83,6 @@ dependencies:
94
83
  - !ruby/object:Gem::Dependency
95
84
  name: rake
96
85
  requirement: !ruby/object:Gem::Requirement
97
- none: false
98
86
  requirements:
99
87
  - - ! '>='
100
88
  - !ruby/object:Gem::Version
@@ -102,7 +90,6 @@ dependencies:
102
90
  type: :development
103
91
  prerelease: false
104
92
  version_requirements: !ruby/object:Gem::Requirement
105
- none: false
106
93
  requirements:
107
94
  - - ! '>='
108
95
  - !ruby/object:Gem::Version
@@ -156,6 +143,12 @@ files:
156
143
  - ext/curb/curb_upload.h
157
144
  - ext/curb/extconf.rb
158
145
  - lib/rhack.rb
146
+ - lib/rhack/clients.rb
147
+ - lib/rhack/clients/base.rb
148
+ - lib/rhack/clients/compatibility.rb
149
+ - lib/rhack/clients/examples.rb
150
+ - lib/rhack/clients/oauth.rb
151
+ - lib/rhack/clients/storage.rb
159
152
  - lib/rhack/cookie.rb
160
153
  - lib/rhack/curl.rb
161
154
  - lib/rhack/curl/easy.rb
@@ -179,11 +172,6 @@ files:
179
172
  - lib/rhack/scout.rb
180
173
  - lib/rhack/scout_squad.rb
181
174
  - lib/rhack/services.rb
182
- - lib/rhack/services/base.rb
183
- - lib/rhack/services/compatibility.rb
184
- - lib/rhack/services/examples.rb
185
- - lib/rhack/services/oauth.rb
186
- - lib/rhack/services/storage.rb
187
175
  - lib/rhack/storage.rb
188
176
  - lib/rhack/version.rb
189
177
  - lib/rhack_in.rb
@@ -194,26 +182,25 @@ files:
194
182
  homepage: https://github.com/tinbka/rhack
195
183
  licenses:
196
184
  - MIT
185
+ metadata: {}
197
186
  post_install_message:
198
187
  rdoc_options: []
199
188
  require_paths:
200
189
  - lib
201
190
  required_ruby_version: !ruby/object:Gem::Requirement
202
- none: false
203
191
  requirements:
204
192
  - - ! '>='
205
193
  - !ruby/object:Gem::Version
206
194
  version: '0'
207
195
  required_rubygems_version: !ruby/object:Gem::Requirement
208
- none: false
209
196
  requirements:
210
197
  - - ! '>='
211
198
  - !ruby/object:Gem::Version
212
199
  version: '0'
213
200
  requirements: []
214
201
  rubyforge_project:
215
- rubygems_version: 1.8.24
202
+ rubygems_version: 2.0.3
216
203
  signing_key:
217
- specification_version: 3
204
+ specification_version: 4
218
205
  summary: Curl-based web-client framework created for developing web-scrapers/bots
219
206
  test_files: []