crawlora 1.5.0.pre.sdk.2 → 1.5.0.pre.sdk.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d5df185ab17cbdb8ee244385b900049f2209efe980e1a672f39a7f61d18bfd35
4
- data.tar.gz: af839794521c83c882affd859e4929ec76225144e45bb3f29141795b0fbb9b69
3
+ metadata.gz: e769697392f10afe8e5b54d3eb890d8a5b88aecba2911962c66e04a901b87ee7
4
+ data.tar.gz: 2258a6882ff2fda76328c6da12338ae0bf7d2df03ec58fc8082fe8bb75c266ca
5
5
  SHA512:
6
- metadata.gz: ffd5f36fde004299e22e4f07d4c19e9904032d0ff78ee7c9a57b9d4d49964a2fba2fc0e7e14f412e9814b1c64e3d599c63259b69a6173dfbf50a71414354bd87
7
- data.tar.gz: 2d529c261a355e69021267c41f520db47c0a8a399d107108ba08d3fe839158ef6929ef88986c6205592a144980a2f977af4eceeb42fa20ee2a2eb74568232874
6
+ metadata.gz: 39f8c6aa73edab7665e50096673a66bec12bfd8c9f7e2d5aa741cb448f2db77f05fee6c97fe9d9756e2a1d1c10a08dbd752c6e168e4d375e9a35eb6ab6d1ed73
7
+ data.tar.gz: 4c5dfbcff65eaf13865077173a092d44edba165a301fa6fe982890ac5b03befc39ecc7076a692f98f588193f4c31c34e19b164df8af883f96534067f5184ae6e
data/CHANGELOG.md CHANGED
@@ -1,5 +1,13 @@
1
1
  # Changelog
2
2
 
3
+ ## 1.5.0-sdk.3
4
+
5
+ - Richer RBS: generated `sig/crawlora.rbs` now declares typed keyword parameters
6
+ per operation (Steep/Sorbet users get real signatures instead of `**untyped`).
7
+ - Internal cleanups: split the request and pagination methods into focused
8
+ private helpers, enabled tuned rubocop metric budgets, and hardened multipart
9
+ `Content-Disposition` field/filename escaping. No public API changes.
10
+
3
11
  ## 1.5.0-sdk.2
4
12
 
5
13
  - Packaging: point the gem homepage at https://crawlora.net/, expand the gem
data/README.md CHANGED
@@ -11,13 +11,17 @@ plus retries, pagination, middleware hooks, and client-side rate limiting.
11
11
 
12
12
  ## Install
13
13
 
14
+ Published on [RubyGems](https://rubygems.org/gems/crawlora). The current release
15
+ is a prerelease (`1.5.0.pre.sdk.3`), so install it with `--pre` or pin the
16
+ version:
17
+
14
18
  ```ruby
15
19
  # Gemfile
16
- gem "crawlora"
20
+ gem "crawlora", "1.5.0.pre.sdk.3"
17
21
  ```
18
22
 
19
23
  ```sh
20
- gem install crawlora
24
+ gem install crawlora --pre
21
25
  ```
22
26
 
23
27
  ## Quick start
@@ -167,7 +167,7 @@ module Crawlora
167
167
  @on_retry = on_retry
168
168
  @request_id = request_id
169
169
  @idempotency_keys = idempotency_keys
170
- @rate_limiter = rate_limit || max_concurrency ? RateLimiter.new(rate_limit, max_concurrency) : nil
170
+ @rate_limiter = (rate_limit || max_concurrency) ? RateLimiter.new(rate_limit, max_concurrency) : nil
171
171
  @logger = logger
172
172
  @before_request = as_hook_list(before_request)
173
173
  @after_response = as_hook_list(after_response)
@@ -200,7 +200,7 @@ module Crawlora
200
200
  log(event: "request", operation: operation_id)
201
201
  max_retries = retries.nil? ? @retries : [0, retries.to_i].max
202
202
  idempotency_key =
203
- @idempotency_keys && %w[POST PATCH].include?(operation["method"]) ? SecureRandom.hex(16) : nil
203
+ (@idempotency_keys && %w[POST PATCH].include?(operation["method"])) ? SecureRandom.hex(16) : nil
204
204
 
205
205
  attempt = 0
206
206
  loop do
@@ -225,8 +225,8 @@ module Crawlora
225
225
  # +next_cursor+ extractor) sends the cursor parameter and stops when
226
226
  # +next_cursor+ returns a falsy value.
227
227
  def paginate(operation_id, params = {}, page_param: nil, cursor_param: nil, next_cursor: nil,
228
- start: nil, step: 1, max_pages: nil, response_type: "auto", timeout: nil, headers: nil)
229
- unless block_given?
228
+ start: nil, step: 1, max_pages: nil, response_type: "auto", timeout: nil, headers: nil, &block)
229
+ unless block
230
230
  return enum_for(:paginate, operation_id, params, page_param: page_param, cursor_param: cursor_param,
231
231
  next_cursor: next_cursor, start: start, step: step, max_pages: max_pages,
232
232
  response_type: response_type, timeout: timeout, headers: headers)
@@ -236,29 +236,56 @@ module Crawlora
236
236
  raise ArgumentError, "unknown Crawlora operation: #{operation_id}" if operation.nil?
237
237
 
238
238
  base_params = stringify_keys(params)
239
+ opts = { response_type: response_type, timeout: timeout, headers: headers }
239
240
 
240
241
  if cursor_param || next_cursor
241
- raise ArgumentError, "cursor pagination requires both cursor_param and next_cursor" unless cursor_param && next_cursor
242
+ paginate_cursor(operation_id, operation, base_params, cursor_param: cursor_param, next_cursor: next_cursor,
243
+ start: start, max_pages: max_pages, opts: opts, &block)
244
+ else
245
+ paginate_numeric(operation_id, operation, base_params, page_param: page_param, start: start, step: step,
246
+ max_pages: max_pages, opts: opts, &block)
247
+ end
248
+ end
242
249
 
243
- query_names = (operation["queryParams"] || []).map { |p| p["name"] }
244
- unless query_names.include?(cursor_param)
245
- raise ArgumentError, "cursor_param #{cursor_param.inspect} is not a query parameter of operation #{operation_id}"
246
- end
250
+ # Yield individual items across pages. +items+ extracts the list from a page
251
+ # (default: the Crawlora +data+ array).
252
+ def paginate_items(operation_id, params = {}, items: nil, **options, &block)
253
+ return enum_for(:paginate_items, operation_id, params, items: items, **options) unless block_given?
247
254
 
248
- cursor = start
249
- fetched = 0
250
- while max_pages.nil? || fetched < max_pages
251
- page_params = base_params.dup
252
- page_params[cursor_param] = cursor unless cursor.nil?
253
- response = request(operation_id, page_params, response_type: response_type, timeout: timeout, headers: headers)
254
- yield response
255
- fetched += 1
256
- cursor = next_cursor.call(response)
257
- break unless cursor && !(cursor.respond_to?(:empty?) && cursor.empty?)
258
- end
259
- return
255
+ extract = items || Pagination.method(:default_items)
256
+ paginate(operation_id, params, **options) do |page|
257
+ extract.call(page).each(&block)
258
+ end
259
+ end
260
+
261
+ private
262
+
263
+ # Yield successive pages by advancing a cursor query parameter until
264
+ # +next_cursor+ returns a blank value.
265
+ def paginate_cursor(operation_id, operation, base_params, cursor_param:, next_cursor:, start:, max_pages:, opts:)
266
+ raise ArgumentError, "cursor pagination requires both cursor_param and next_cursor" unless cursor_param && next_cursor
267
+
268
+ query_names = (operation["queryParams"] || []).map { |p| p["name"] }
269
+ unless query_names.include?(cursor_param)
270
+ raise ArgumentError, "cursor_param #{cursor_param.inspect} is not a query parameter of operation #{operation_id}"
271
+ end
272
+
273
+ cursor = start
274
+ fetched = 0
275
+ while max_pages.nil? || fetched < max_pages
276
+ page_params = base_params.dup
277
+ page_params[cursor_param] = cursor unless cursor.nil?
278
+ response = request(operation_id, page_params, **opts)
279
+ yield response
280
+ fetched += 1
281
+ cursor = next_cursor.call(response)
282
+ break unless cursor && !(cursor.respond_to?(:empty?) && cursor.empty?)
260
283
  end
284
+ end
261
285
 
286
+ # Yield successive pages by advancing the page/offset query parameter until
287
+ # a page comes back empty.
288
+ def paginate_numeric(operation_id, operation, base_params, page_param:, start:, step:, max_pages:, opts:)
262
289
  page_param ||= Pagination.detect_page_param(operation)
263
290
  raise ArgumentError, "operation #{operation_id} has no page or offset query parameter to paginate" unless page_param
264
291
 
@@ -266,7 +293,7 @@ module Crawlora
266
293
  fetched = 0
267
294
  while max_pages.nil? || fetched < max_pages
268
295
  page_params = base_params.merge(page_param => page_value)
269
- response = request(operation_id, page_params, response_type: response_type, timeout: timeout, headers: headers)
296
+ response = request(operation_id, page_params, **opts)
270
297
  yield response
271
298
  fetched += 1
272
299
  break if Pagination.page_empty?(response)
@@ -275,21 +302,30 @@ module Crawlora
275
302
  end
276
303
  end
277
304
 
278
- # Yield individual items across pages. +items+ extracts the list from a page
279
- # (default: the Crawlora +data+ array).
280
- def paginate_items(operation_id, params = {}, items: nil, **options, &block)
281
- return enum_for(:paginate_items, operation_id, params, items: items, **options) unless block_given?
305
+ def send_request(operation, params, response_type:, timeout:, headers:, idempotency_key: nil)
306
+ url, body, body_headers = build_request(@base_url, operation, params)
307
+ request_headers, req_id = prepare_request(operation, body_headers, headers, idempotency_key)
308
+ unless @before_request.empty?
309
+ ctx = { operation: operation["id"], method: operation["method"], url: url, headers: request_headers }
310
+ @before_request.each { |hook| hook.call(ctx) }
311
+ url = ctx[:url]
312
+ request_headers = ctx[:headers]
313
+ end
282
314
 
283
- extract = items || Pagination.method(:default_items)
284
- paginate(operation_id, params, **options) do |page|
285
- extract.call(page).each(&block)
315
+ request_timeout = timeout.nil? ? @timeout : timeout
316
+ begin
317
+ response = call_transport(method: operation["method"], url: url, headers: request_headers, body: body, timeout: request_timeout)
318
+ rescue StandardError => e
319
+ message = timeout_error?(e) ? "Crawlora request timed out" : "Crawlora transport error"
320
+ raise NetworkError.new(message, request_id: req_id, cause: e)
286
321
  end
287
- end
288
322
 
289
- private
323
+ handle_response(operation, response, response_type, req_id)
324
+ end
290
325
 
291
- def send_request(operation, params, response_type:, timeout:, headers:, idempotency_key: nil)
292
- url, body, body_headers = build_request(@base_url, operation, params)
326
+ # Build the merged request headers and resolve the request id, attaching an
327
+ # idempotency key when one was generated.
328
+ def prepare_request(operation, body_headers, headers, idempotency_key)
293
329
  request_headers = merge_headers(
294
330
  @headers,
295
331
  auth_headers(operation["security"] || [], @api_key, @jwt_token),
@@ -301,53 +337,47 @@ module Crawlora
301
337
  if @request_id
302
338
  ensure_request_id(request_headers)
303
339
  else
304
- v = header_value(request_headers, "x-request-id")
305
- v.empty? ? nil : v
340
+ existing = header_value(request_headers, "x-request-id")
341
+ existing.empty? ? nil : existing
306
342
  end
307
343
  request_headers["Idempotency-Key"] = idempotency_key if idempotency_key && header_value(request_headers, "idempotency-key").empty?
308
- unless @before_request.empty?
309
- ctx = { operation: operation["id"], method: operation["method"], url: url, headers: request_headers }
310
- @before_request.each { |hook| hook.call(ctx) }
311
- url = ctx[:url]
312
- request_headers = ctx[:headers]
313
- end
344
+ [request_headers, req_id]
345
+ end
314
346
 
315
- request_timeout = timeout.nil? ? @timeout : timeout
316
- begin
317
- response =
318
- if @rate_limiter
319
- @rate_limiter.run do
320
- @transport.call(method: operation["method"], url: url, headers: request_headers, body: body, timeout: request_timeout)
321
- end
322
- else
323
- @transport.call(method: operation["method"], url: url, headers: request_headers, body: body, timeout: request_timeout)
324
- end
325
- rescue StandardError => e
326
- message = timeout_error?(e) ? "Crawlora request timed out" : "Crawlora transport error"
327
- raise NetworkError.new(message, request_id: req_id, cause: e)
328
- end
347
+ def call_transport(method:, url:, headers:, body:, timeout:)
348
+ call = -> { @transport.call(method: method, url: url, headers: headers, body: body, timeout: timeout) }
349
+ @rate_limiter ? @rate_limiter.run(&call) : call.call
350
+ end
329
351
 
352
+ # Parse the response, raise the typed API error on non-2xx, and run the
353
+ # after_response hooks on success.
354
+ def handle_response(operation, response, response_type, req_id)
330
355
  raw_body = response.body.to_s
331
356
  is_error = response.status < 200 || response.status >= 300
332
- return StringIO.new(response.body.to_s) if response_type == "stream" && !is_error
357
+ return StringIO.new(raw_body) if response_type == "stream" && !is_error
333
358
 
334
- parse_mode = response_type == "stream" ? "auto" : response_type
359
+ parse_mode = (response_type == "stream") ? "auto" : response_type
335
360
  begin
336
- parsed = parse_response(response.body.to_s, header_value(response.headers, "content-type"), parse_mode)
361
+ parsed = parse_response(raw_body, header_value(response.headers, "content-type"), parse_mode)
337
362
  rescue JSON::ParserError => e
338
363
  raise Error.new("Crawlora JSON parse error", status: response.status, raw_body: raw_body,
339
364
  headers: response.headers, request_id: req_id, cause: e)
340
365
  end
341
366
 
342
- if is_error
343
- code = parsed.is_a?(Hash) ? parsed["code"] : nil
344
- message = parsed.is_a?(Hash) && parsed["msg"] && !parsed["msg"].to_s.empty? ? parsed["msg"] : "HTTP #{response.status}"
345
- raise Crawlora.error_class_for(response.status).new(
346
- message, status: response.status, code: code, body: parsed,
347
- raw_body: raw_body, headers: response.headers, request_id: req_id
348
- )
349
- end
367
+ raise_api_error(response, parsed, raw_body, req_id) if is_error
368
+ run_after_response(operation, response, parsed)
369
+ end
370
+
371
+ def raise_api_error(response, parsed, raw_body, req_id)
372
+ code = parsed.is_a?(Hash) ? parsed["code"] : nil
373
+ message = (parsed.is_a?(Hash) && parsed["msg"] && !parsed["msg"].to_s.empty?) ? parsed["msg"] : "HTTP #{response.status}"
374
+ raise Crawlora.error_class_for(response.status).new(
375
+ message, status: response.status, code: code, body: parsed,
376
+ raw_body: raw_body, headers: response.headers, request_id: req_id
377
+ )
378
+ end
350
379
 
380
+ def run_after_response(operation, response, parsed)
351
381
  @after_response.each do |hook|
352
382
  result = hook.call(operation["id"], response.status, response.headers, parsed)
353
383
  parsed = result unless result.nil?
@@ -476,18 +506,23 @@ module Crawlora
476
506
  chunks << "--#{boundary}\r\n"
477
507
  if parameter["type"] == "file"
478
508
  filename, data = read_file_value(value)
479
- chunks << %(Content-Disposition: form-data; name="#{name}"; filename="#{filename}"\r\n)
509
+ chunks << %(Content-Disposition: form-data; name="#{quote_escape(name)}"; filename="#{quote_escape(filename)}"\r\n)
480
510
  chunks << "Content-Type: application/octet-stream\r\n\r\n"
481
511
  chunks << data
482
512
  chunks << "\r\n"
483
513
  else
484
- chunks << %(Content-Disposition: form-data; name="#{name}"\r\n\r\n#{value}\r\n)
514
+ chunks << %(Content-Disposition: form-data; name="#{quote_escape(name)}"\r\n\r\n#{value}\r\n)
485
515
  end
486
516
  end
487
517
  chunks << "--#{boundary}--\r\n"
488
518
  [chunks, { "content-type" => "multipart/form-data; boundary=#{boundary}" }]
489
519
  end
490
520
 
521
+ # Escape characters that would break a multipart Content-Disposition header.
522
+ def quote_escape(value)
523
+ value.to_s.gsub("\\", "\\\\\\\\").gsub('"', '\\"').gsub(/[\r\n]/, " ")
524
+ end
525
+
491
526
  def read_file_value(value)
492
527
  return ["upload.bin", value] if value.is_a?(String) && !File.exist?(value)
493
528
  return [File.basename(value), File.binread(value)] if value.is_a?(String)
@@ -16,7 +16,7 @@ module Crawlora
16
16
  # A page is empty when its `data` array (Crawlora envelope) or the page
17
17
  # itself is empty/blank.
18
18
  def page_empty?(response)
19
- data = response.is_a?(Hash) && response.key?("data") ? response["data"] : response
19
+ data = (response.is_a?(Hash) && response.key?("data")) ? response["data"] : response
20
20
  return true if data.nil?
21
21
  return data.empty? if data.respond_to?(:empty?)
22
22
 
@@ -24,7 +24,7 @@ module Crawlora
24
24
  end
25
25
 
26
26
  def default_start(page_param)
27
- page_param == "offset" ? 0 : 1
27
+ (page_param == "offset") ? 0 : 1
28
28
  end
29
29
 
30
30
  # Default item extractor: the response's `data` list (Crawlora envelope), or
@@ -3,7 +3,7 @@
3
3
  module Crawlora
4
4
  # SDK release version, in the shared `MAJOR.MINOR.PATCH-sdk.N` tag form (same
5
5
  # as the Go/Java/PHP SDKs). RubyGems treats it as a prerelease and normalizes
6
- # the published gem version to `1.5.0.pre.sdk.2`. Bumped across all SDK repos
7
- # by the API repo's tools/sdkgen/bump_version.py.
8
- VERSION = "1.5.0-sdk.2"
6
+ # the published gem version (the `-` becomes `.pre.`, e.g. `1.5.0.pre.sdk.N`).
7
+ # Bumped across all SDK repos by the API repo's tools/sdkgen/bump_version.py.
8
+ VERSION = "1.5.0-sdk.3"
9
9
  end