crawlora 1.5.0.pre.sdk.2 → 1.5.0.pre.sdk.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +6 -2
- data/lib/crawlora/client.rb +103 -68
- data/lib/crawlora/pagination.rb +2 -2
- data/lib/crawlora/version.rb +3 -3
- data/sig/crawlora.rbs +330 -330
- metadata +4 -4
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e769697392f10afe8e5b54d3eb890d8a5b88aecba2911962c66e04a901b87ee7
|
|
4
|
+
data.tar.gz: 2258a6882ff2fda76328c6da12338ae0bf7d2df03ec58fc8082fe8bb75c266ca
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 39f8c6aa73edab7665e50096673a66bec12bfd8c9f7e2d5aa741cb448f2db77f05fee6c97fe9d9756e2a1d1c10a08dbd752c6e168e4d375e9a35eb6ab6d1ed73
|
|
7
|
+
data.tar.gz: 4c5dfbcff65eaf13865077173a092d44edba165a301fa6fe982890ac5b03befc39ecc7076a692f98f588193f4c31c34e19b164df8af883f96534067f5184ae6e
|
data/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,13 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 1.5.0-sdk.3
|
|
4
|
+
|
|
5
|
+
- Richer RBS: generated `sig/crawlora.rbs` now declares typed keyword parameters
|
|
6
|
+
per operation (Steep/Sorbet users get real signatures instead of `**untyped`).
|
|
7
|
+
- Internal cleanups: split the request and pagination methods into focused
|
|
8
|
+
private helpers, enabled tuned rubocop metric budgets, and hardened multipart
|
|
9
|
+
`Content-Disposition` field/filename escaping. No public API changes.
|
|
10
|
+
|
|
3
11
|
## 1.5.0-sdk.2
|
|
4
12
|
|
|
5
13
|
- Packaging: point the gem homepage at https://crawlora.net/, expand the gem
|
data/README.md
CHANGED
|
@@ -11,13 +11,17 @@ plus retries, pagination, middleware hooks, and client-side rate limiting.
|
|
|
11
11
|
|
|
12
12
|
## Install
|
|
13
13
|
|
|
14
|
+
Published on [RubyGems](https://rubygems.org/gems/crawlora). The current release
|
|
15
|
+
is a prerelease (`1.5.0.pre.sdk.3`), so install it with `--pre` or pin the
|
|
16
|
+
version:
|
|
17
|
+
|
|
14
18
|
```ruby
|
|
15
19
|
# Gemfile
|
|
16
|
-
gem "crawlora"
|
|
20
|
+
gem "crawlora", "1.5.0.pre.sdk.3"
|
|
17
21
|
```
|
|
18
22
|
|
|
19
23
|
```sh
|
|
20
|
-
gem install crawlora
|
|
24
|
+
gem install crawlora --pre
|
|
21
25
|
```
|
|
22
26
|
|
|
23
27
|
## Quick start
|
data/lib/crawlora/client.rb
CHANGED
|
@@ -167,7 +167,7 @@ module Crawlora
|
|
|
167
167
|
@on_retry = on_retry
|
|
168
168
|
@request_id = request_id
|
|
169
169
|
@idempotency_keys = idempotency_keys
|
|
170
|
-
@rate_limiter = rate_limit || max_concurrency ? RateLimiter.new(rate_limit, max_concurrency) : nil
|
|
170
|
+
@rate_limiter = (rate_limit || max_concurrency) ? RateLimiter.new(rate_limit, max_concurrency) : nil
|
|
171
171
|
@logger = logger
|
|
172
172
|
@before_request = as_hook_list(before_request)
|
|
173
173
|
@after_response = as_hook_list(after_response)
|
|
@@ -200,7 +200,7 @@ module Crawlora
|
|
|
200
200
|
log(event: "request", operation: operation_id)
|
|
201
201
|
max_retries = retries.nil? ? @retries : [0, retries.to_i].max
|
|
202
202
|
idempotency_key =
|
|
203
|
-
@idempotency_keys && %w[POST PATCH].include?(operation["method"]) ? SecureRandom.hex(16) : nil
|
|
203
|
+
(@idempotency_keys && %w[POST PATCH].include?(operation["method"])) ? SecureRandom.hex(16) : nil
|
|
204
204
|
|
|
205
205
|
attempt = 0
|
|
206
206
|
loop do
|
|
@@ -225,8 +225,8 @@ module Crawlora
|
|
|
225
225
|
# +next_cursor+ extractor) sends the cursor parameter and stops when
|
|
226
226
|
# +next_cursor+ returns a falsy value.
|
|
227
227
|
def paginate(operation_id, params = {}, page_param: nil, cursor_param: nil, next_cursor: nil,
|
|
228
|
-
start: nil, step: 1, max_pages: nil, response_type: "auto", timeout: nil, headers: nil)
|
|
229
|
-
unless block_given?
|
|
228
|
+
start: nil, step: 1, max_pages: nil, response_type: "auto", timeout: nil, headers: nil, &block)
|
|
229
|
+
unless block
|
|
230
230
|
return enum_for(:paginate, operation_id, params, page_param: page_param, cursor_param: cursor_param,
|
|
231
231
|
next_cursor: next_cursor, start: start, step: step, max_pages: max_pages,
|
|
232
232
|
response_type: response_type, timeout: timeout, headers: headers)
|
|
@@ -236,29 +236,56 @@ module Crawlora
|
|
|
236
236
|
raise ArgumentError, "unknown Crawlora operation: #{operation_id}" if operation.nil?
|
|
237
237
|
|
|
238
238
|
base_params = stringify_keys(params)
|
|
239
|
+
opts = { response_type: response_type, timeout: timeout, headers: headers }
|
|
239
240
|
|
|
240
241
|
if cursor_param || next_cursor
|
|
241
|
-
|
|
242
|
+
paginate_cursor(operation_id, operation, base_params, cursor_param: cursor_param, next_cursor: next_cursor,
|
|
243
|
+
start: start, max_pages: max_pages, opts: opts, &block)
|
|
244
|
+
else
|
|
245
|
+
paginate_numeric(operation_id, operation, base_params, page_param: page_param, start: start, step: step,
|
|
246
|
+
max_pages: max_pages, opts: opts, &block)
|
|
247
|
+
end
|
|
248
|
+
end
|
|
242
249
|
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
250
|
+
# Yield individual items across pages. +items+ extracts the list from a page
|
|
251
|
+
# (default: the Crawlora +data+ array).
|
|
252
|
+
def paginate_items(operation_id, params = {}, items: nil, **options, &block)
|
|
253
|
+
return enum_for(:paginate_items, operation_id, params, items: items, **options) unless block_given?
|
|
247
254
|
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
255
|
+
extract = items || Pagination.method(:default_items)
|
|
256
|
+
paginate(operation_id, params, **options) do |page|
|
|
257
|
+
extract.call(page).each(&block)
|
|
258
|
+
end
|
|
259
|
+
end
|
|
260
|
+
|
|
261
|
+
private
|
|
262
|
+
|
|
263
|
+
# Yield successive pages by advancing a cursor query parameter until
|
|
264
|
+
# +next_cursor+ returns a blank value.
|
|
265
|
+
def paginate_cursor(operation_id, operation, base_params, cursor_param:, next_cursor:, start:, max_pages:, opts:)
|
|
266
|
+
raise ArgumentError, "cursor pagination requires both cursor_param and next_cursor" unless cursor_param && next_cursor
|
|
267
|
+
|
|
268
|
+
query_names = (operation["queryParams"] || []).map { |p| p["name"] }
|
|
269
|
+
unless query_names.include?(cursor_param)
|
|
270
|
+
raise ArgumentError, "cursor_param #{cursor_param.inspect} is not a query parameter of operation #{operation_id}"
|
|
271
|
+
end
|
|
272
|
+
|
|
273
|
+
cursor = start
|
|
274
|
+
fetched = 0
|
|
275
|
+
while max_pages.nil? || fetched < max_pages
|
|
276
|
+
page_params = base_params.dup
|
|
277
|
+
page_params[cursor_param] = cursor unless cursor.nil?
|
|
278
|
+
response = request(operation_id, page_params, **opts)
|
|
279
|
+
yield response
|
|
280
|
+
fetched += 1
|
|
281
|
+
cursor = next_cursor.call(response)
|
|
282
|
+
break unless cursor && !(cursor.respond_to?(:empty?) && cursor.empty?)
|
|
260
283
|
end
|
|
284
|
+
end
|
|
261
285
|
|
|
286
|
+
# Yield successive pages by advancing the page/offset query parameter until
|
|
287
|
+
# a page comes back empty.
|
|
288
|
+
def paginate_numeric(operation_id, operation, base_params, page_param:, start:, step:, max_pages:, opts:)
|
|
262
289
|
page_param ||= Pagination.detect_page_param(operation)
|
|
263
290
|
raise ArgumentError, "operation #{operation_id} has no page or offset query parameter to paginate" unless page_param
|
|
264
291
|
|
|
@@ -266,7 +293,7 @@ module Crawlora
|
|
|
266
293
|
fetched = 0
|
|
267
294
|
while max_pages.nil? || fetched < max_pages
|
|
268
295
|
page_params = base_params.merge(page_param => page_value)
|
|
269
|
-
response = request(operation_id, page_params,
|
|
296
|
+
response = request(operation_id, page_params, **opts)
|
|
270
297
|
yield response
|
|
271
298
|
fetched += 1
|
|
272
299
|
break if Pagination.page_empty?(response)
|
|
@@ -275,21 +302,30 @@ module Crawlora
|
|
|
275
302
|
end
|
|
276
303
|
end
|
|
277
304
|
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
305
|
+
def send_request(operation, params, response_type:, timeout:, headers:, idempotency_key: nil)
|
|
306
|
+
url, body, body_headers = build_request(@base_url, operation, params)
|
|
307
|
+
request_headers, req_id = prepare_request(operation, body_headers, headers, idempotency_key)
|
|
308
|
+
unless @before_request.empty?
|
|
309
|
+
ctx = { operation: operation["id"], method: operation["method"], url: url, headers: request_headers }
|
|
310
|
+
@before_request.each { |hook| hook.call(ctx) }
|
|
311
|
+
url = ctx[:url]
|
|
312
|
+
request_headers = ctx[:headers]
|
|
313
|
+
end
|
|
282
314
|
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
315
|
+
request_timeout = timeout.nil? ? @timeout : timeout
|
|
316
|
+
begin
|
|
317
|
+
response = call_transport(method: operation["method"], url: url, headers: request_headers, body: body, timeout: request_timeout)
|
|
318
|
+
rescue StandardError => e
|
|
319
|
+
message = timeout_error?(e) ? "Crawlora request timed out" : "Crawlora transport error"
|
|
320
|
+
raise NetworkError.new(message, request_id: req_id, cause: e)
|
|
286
321
|
end
|
|
287
|
-
end
|
|
288
322
|
|
|
289
|
-
|
|
323
|
+
handle_response(operation, response, response_type, req_id)
|
|
324
|
+
end
|
|
290
325
|
|
|
291
|
-
|
|
292
|
-
|
|
326
|
+
# Build the merged request headers and resolve the request id, attaching an
|
|
327
|
+
# idempotency key when one was generated.
|
|
328
|
+
def prepare_request(operation, body_headers, headers, idempotency_key)
|
|
293
329
|
request_headers = merge_headers(
|
|
294
330
|
@headers,
|
|
295
331
|
auth_headers(operation["security"] || [], @api_key, @jwt_token),
|
|
@@ -301,53 +337,47 @@ module Crawlora
|
|
|
301
337
|
if @request_id
|
|
302
338
|
ensure_request_id(request_headers)
|
|
303
339
|
else
|
|
304
|
-
|
|
305
|
-
|
|
340
|
+
existing = header_value(request_headers, "x-request-id")
|
|
341
|
+
existing.empty? ? nil : existing
|
|
306
342
|
end
|
|
307
343
|
request_headers["Idempotency-Key"] = idempotency_key if idempotency_key && header_value(request_headers, "idempotency-key").empty?
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
@before_request.each { |hook| hook.call(ctx) }
|
|
311
|
-
url = ctx[:url]
|
|
312
|
-
request_headers = ctx[:headers]
|
|
313
|
-
end
|
|
344
|
+
[request_headers, req_id]
|
|
345
|
+
end
|
|
314
346
|
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
@rate_limiter.run do
|
|
320
|
-
@transport.call(method: operation["method"], url: url, headers: request_headers, body: body, timeout: request_timeout)
|
|
321
|
-
end
|
|
322
|
-
else
|
|
323
|
-
@transport.call(method: operation["method"], url: url, headers: request_headers, body: body, timeout: request_timeout)
|
|
324
|
-
end
|
|
325
|
-
rescue StandardError => e
|
|
326
|
-
message = timeout_error?(e) ? "Crawlora request timed out" : "Crawlora transport error"
|
|
327
|
-
raise NetworkError.new(message, request_id: req_id, cause: e)
|
|
328
|
-
end
|
|
347
|
+
def call_transport(method:, url:, headers:, body:, timeout:)
|
|
348
|
+
call = -> { @transport.call(method: method, url: url, headers: headers, body: body, timeout: timeout) }
|
|
349
|
+
@rate_limiter ? @rate_limiter.run(&call) : call.call
|
|
350
|
+
end
|
|
329
351
|
|
|
352
|
+
# Parse the response, raise the typed API error on non-2xx, and run the
|
|
353
|
+
# after_response hooks on success.
|
|
354
|
+
def handle_response(operation, response, response_type, req_id)
|
|
330
355
|
raw_body = response.body.to_s
|
|
331
356
|
is_error = response.status < 200 || response.status >= 300
|
|
332
|
-
return StringIO.new(
|
|
357
|
+
return StringIO.new(raw_body) if response_type == "stream" && !is_error
|
|
333
358
|
|
|
334
|
-
parse_mode = response_type == "stream" ? "auto" : response_type
|
|
359
|
+
parse_mode = (response_type == "stream") ? "auto" : response_type
|
|
335
360
|
begin
|
|
336
|
-
parsed = parse_response(
|
|
361
|
+
parsed = parse_response(raw_body, header_value(response.headers, "content-type"), parse_mode)
|
|
337
362
|
rescue JSON::ParserError => e
|
|
338
363
|
raise Error.new("Crawlora JSON parse error", status: response.status, raw_body: raw_body,
|
|
339
364
|
headers: response.headers, request_id: req_id, cause: e)
|
|
340
365
|
end
|
|
341
366
|
|
|
342
|
-
if is_error
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
367
|
+
raise_api_error(response, parsed, raw_body, req_id) if is_error
|
|
368
|
+
run_after_response(operation, response, parsed)
|
|
369
|
+
end
|
|
370
|
+
|
|
371
|
+
def raise_api_error(response, parsed, raw_body, req_id)
|
|
372
|
+
code = parsed.is_a?(Hash) ? parsed["code"] : nil
|
|
373
|
+
message = (parsed.is_a?(Hash) && parsed["msg"] && !parsed["msg"].to_s.empty?) ? parsed["msg"] : "HTTP #{response.status}"
|
|
374
|
+
raise Crawlora.error_class_for(response.status).new(
|
|
375
|
+
message, status: response.status, code: code, body: parsed,
|
|
376
|
+
raw_body: raw_body, headers: response.headers, request_id: req_id
|
|
377
|
+
)
|
|
378
|
+
end
|
|
350
379
|
|
|
380
|
+
def run_after_response(operation, response, parsed)
|
|
351
381
|
@after_response.each do |hook|
|
|
352
382
|
result = hook.call(operation["id"], response.status, response.headers, parsed)
|
|
353
383
|
parsed = result unless result.nil?
|
|
@@ -476,18 +506,23 @@ module Crawlora
|
|
|
476
506
|
chunks << "--#{boundary}\r\n"
|
|
477
507
|
if parameter["type"] == "file"
|
|
478
508
|
filename, data = read_file_value(value)
|
|
479
|
-
chunks << %(Content-Disposition: form-data; name="#{name}"; filename="#{filename}"\r\n)
|
|
509
|
+
chunks << %(Content-Disposition: form-data; name="#{quote_escape(name)}"; filename="#{quote_escape(filename)}"\r\n)
|
|
480
510
|
chunks << "Content-Type: application/octet-stream\r\n\r\n"
|
|
481
511
|
chunks << data
|
|
482
512
|
chunks << "\r\n"
|
|
483
513
|
else
|
|
484
|
-
chunks << %(Content-Disposition: form-data; name="#{name}"\r\n\r\n#{value}\r\n)
|
|
514
|
+
chunks << %(Content-Disposition: form-data; name="#{quote_escape(name)}"\r\n\r\n#{value}\r\n)
|
|
485
515
|
end
|
|
486
516
|
end
|
|
487
517
|
chunks << "--#{boundary}--\r\n"
|
|
488
518
|
[chunks, { "content-type" => "multipart/form-data; boundary=#{boundary}" }]
|
|
489
519
|
end
|
|
490
520
|
|
|
521
|
+
# Escape characters that would break a multipart Content-Disposition header.
|
|
522
|
+
def quote_escape(value)
|
|
523
|
+
value.to_s.gsub("\\", "\\\\\\\\").gsub('"', '\\"').gsub(/[\r\n]/, " ")
|
|
524
|
+
end
|
|
525
|
+
|
|
491
526
|
def read_file_value(value)
|
|
492
527
|
return ["upload.bin", value] if value.is_a?(String) && !File.exist?(value)
|
|
493
528
|
return [File.basename(value), File.binread(value)] if value.is_a?(String)
|
data/lib/crawlora/pagination.rb
CHANGED
|
@@ -16,7 +16,7 @@ module Crawlora
|
|
|
16
16
|
# A page is empty when its `data` array (Crawlora envelope) or the page
|
|
17
17
|
# itself is empty/blank.
|
|
18
18
|
def page_empty?(response)
|
|
19
|
-
data = response.is_a?(Hash) && response.key?("data") ? response["data"] : response
|
|
19
|
+
data = (response.is_a?(Hash) && response.key?("data")) ? response["data"] : response
|
|
20
20
|
return true if data.nil?
|
|
21
21
|
return data.empty? if data.respond_to?(:empty?)
|
|
22
22
|
|
|
@@ -24,7 +24,7 @@ module Crawlora
|
|
|
24
24
|
end
|
|
25
25
|
|
|
26
26
|
def default_start(page_param)
|
|
27
|
-
page_param == "offset" ? 0 : 1
|
|
27
|
+
(page_param == "offset") ? 0 : 1
|
|
28
28
|
end
|
|
29
29
|
|
|
30
30
|
# Default item extractor: the response's `data` list (Crawlora envelope), or
|
data/lib/crawlora/version.rb
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
module Crawlora
|
|
4
4
|
# SDK release version, in the shared `MAJOR.MINOR.PATCH-sdk.N` tag form (same
|
|
5
5
|
# as the Go/Java/PHP SDKs). RubyGems treats it as a prerelease and normalizes
|
|
6
|
-
# the published gem version
|
|
7
|
-
# by the API repo's tools/sdkgen/bump_version.py.
|
|
8
|
-
VERSION = "1.5.0-sdk.2"
|
|
6
|
+
# the published gem version (the `-` becomes `.pre.`, e.g. `1.5.0.pre.sdk.N`).
|
|
7
|
+
# Bumped across all SDK repos by the API repo's tools/sdkgen/bump_version.py.
|
|
8
|
+
VERSION = "1.5.0-sdk.3"
|
|
9
9
|
end
|