ghostcrawl 2.2.2 → 2.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ff09081da25b9c78b261437d5bc26c2a4b25f3e727a53e8393f1adb00aa6536c
4
- data.tar.gz: ad0869fe147e1c33a49fadd364bc93ce5857fd73eb03b584d4c1ce618e1092a9
3
+ metadata.gz: e499bc41b516f5cea6e37d2ec06c17e8fe770a8d928e8415e470be0874d56786
4
+ data.tar.gz: ed4d7fa29dcdc4f1316d0f870555bf32cb61063b3bc7efc07fe46e5682626a50
5
5
  SHA512:
6
- metadata.gz: '085820b153fb6b70ea26f6e866787388835cfec7e4cce86498f83551abe6b4c8800b5eb14f86203c59ca77b7e70cffbd459d0d84f7a75fc69f5337914bcf60e6'
7
- data.tar.gz: 11a00207fc7469d02bec976018c8e627ea11c0da0ce78d4dc5a6d5b76fced17a5f3a37053159e0bde94dd0c44e4d3e273e7cf57cdac1d652a410bcba4da814ff
6
+ metadata.gz: 349f33fe5785ed3458e5e749b7a45e56bf6e9aa8fdcc4b68625792a174676733ac1d0bed6f6148bed07716059498f5f5e565bf4d013357af1d8d4632f5d9ffb3
7
+ data.tar.gz: 7939615bd575ac30e78455194a2979f204d82523b548ede855da34ff06f1422f8e8f676f44fe7ad47262ad63f03b6ef68c39acdc190f9dcb981c06e39717191f
@@ -16,6 +16,7 @@
16
16
  # client = Ghostcrawl::Client.new(token: "gck_live_YOUR_KEY")
17
17
  # result = client.scrape(url: "https://example.com")
18
18
 
19
+ require "json"
19
20
  require "microsoft_kiota_abstractions"
20
21
  require "microsoft_kiota_faraday"
21
22
  require "microsoft_kiota_serialization_json"
@@ -57,6 +58,72 @@ module Ghostcrawl
57
58
  super
58
59
  end
59
60
  end
61
+
62
+ # Fix 1b: the pinned Kiota writer's write_collection_of_object_values calls
63
+ # `self.write_object_value(nil, v).writer` on each element — it relies on
64
+ # write_object_value(nil, ...) returning a *writer object* that responds to
65
+ # `.writer`. Our write_object_value override above (Fix 1) changes that
66
+ # nil-key contract to serialize-into-self and return the serialize result
67
+ # (a Hash / nil), so the upstream `.writer` call raises
68
+ # `NoMethodError: undefined method 'writer' for {}:Hash`. That aborts
69
+ # serialize for ANY response model carrying a collection of typed objects
70
+ # (WebhookListResponse#items, etc.), which ResponseHelper.serialize_parsable
71
+ # then swallows -> the facade returns {} and silently drops the whole list.
72
+ #
73
+ # Fix: serialize each element into its OWN fresh writer and collect the raw
74
+ # per-element hashes directly, never depending on the write_object_value
75
+ # return contract. Mirrors the non-nil upstream branch (line 154) without the
76
+ # broken `.writer` deref.
77
+ def write_collection_of_object_values(key, values)
78
+ return unless values
79
+ hashes = values.map do |v|
80
+ temp = MicrosoftKiotaSerializationJson::JsonSerializationWriter.new
81
+ v.serialize(temp)
82
+ temp.writer
83
+ end
84
+ if key.nil?
85
+ hashes
86
+ else
87
+ writer[key] = hashes
88
+ end
89
+ end
90
+
91
+ # Fix 1c: the pinned Kiota writer's write_any_value (used by
92
+ # write_additional_data, i.e. EVERY field AdditionalDataBody sends) has no
93
+ # case for a plain Hash and mis-handles an Array of Hashes:
94
+ # * a Hash hits the `value.is_a? Object` branch -> `return value.to_s`,
95
+ # which returns a string but NEVER assigns @writer[key] -> the whole
96
+ # nested object is SILENTLY DROPPED from the request body.
97
+ # * an Array is sent to write_collection_of_primitive_values, which mangles
98
+ # any non-primitive (Hash) element.
99
+ # This drops/mangles every nested field the facade sends: scrape(extract_schema),
100
+ # extract(schema), crawl/crawl_runs opts, datasets.append(rows: [ {..} ]),
101
+ # schedules.create(job_params: {..}) — the last of which the API REQUIRES,
102
+ # so the request 422s ("job_params required") even though the caller supplied it.
103
+ #
104
+ # Fix: intercept Hash and Array here and write a fully-recursed, JSON-native
105
+ # structure straight into @writer[key]. The writer emits @writer.to_json at the
106
+ # end, so plain nested Hash/Array values serialize correctly. Everything else
107
+ # falls through to the upstream primitive handling.
108
+ def write_any_value(key, value)
109
+ if value.is_a?(Hash) || value.is_a?(Array)
110
+ return super unless key # nil-key arrays keep upstream behavior
111
+ writer[key] = KiotaWriterFix.jsonable(value)
112
+ return
113
+ end
114
+ super
115
+ end
116
+
117
+ # Recursively converts a value into a JSON-native structure (Hash/Array of
118
+ # plain scalars), so it round-trips through @writer.to_json intact.
119
+ # @api private
120
+ def self.jsonable(value)
121
+ case value
122
+ when Hash then value.each_with_object({}) { |(k, v), h| h[k.to_s] = jsonable(v) }
123
+ when Array then value.map { |v| jsonable(v) }
124
+ else value
125
+ end
126
+ end
60
127
  end
61
128
 
62
129
  # Patch the writer class once, idempotently
@@ -190,15 +257,128 @@ module Ghostcrawl
190
257
  when NilClass
191
258
  {}
192
259
  else
193
- # Typed Parsable: try serializing to JSON then parsing back
194
- if value.respond_to?(:additional_data) && value.additional_data
195
- value.additional_data.transform_values { |v| to_hash(v) }
196
- elsif value.respond_to?(:to_h)
197
- value.to_h.transform_values { |v| to_hash(v) }
198
- else
199
- value
200
- end
260
+ # Typed Parsable (Kiota-generated model). Kiota models keep their spec
261
+ # fields in typed instance variables, NOT in +additional_data+ (which is
262
+ # only the overflow bucket for unmapped keys). Reading +additional_data+
263
+ # alone therefore returns +{}+ for a fully-typed response (e.g.
264
+ # MapResponse{@links,@success}, WebhookListResponse{@items,@total}),
265
+ # silently dropping the real payload.
266
+ #
267
+ # Recover the typed fields by round-tripping the model through its own
268
+ # +serialize+ (the Parsable contract) into a JSON writer and parsing the
269
+ # result back to a plain Hash. Then overlay any +additional_data+ (unmapped
270
+ # keys the server sent that the model didn't declare). Degrade gracefully:
271
+ # a model whose +serialize+ raises (e.g. an unresolved composed-type member)
272
+ # falls back to the prior additional_data/to_h behavior so nothing regresses.
273
+ parsable_to_hash(value)
274
+ end
275
+ end
276
+
277
+ # Converts a typed Kiota Parsable into a plain Hash by serializing it via its
278
+ # own +serialize+ and parsing the emitted JSON, then merging +additional_data+.
279
+ # Falls back to +additional_data+ / +to_h+ / the value itself on any failure.
280
+ # @api private
281
+ def self.parsable_to_hash(value)
282
+ serialized = serialize_parsable(value)
283
+ if serialized.is_a?(Hash)
284
+ extra = (value.additional_data if value.respond_to?(:additional_data)) || {}
285
+ merged = serialized.merge(extra.transform_keys(&:to_s))
286
+ return merged.transform_values { |v| to_hash(v) }
287
+ end
288
+
289
+ if value.respond_to?(:additional_data) && value.additional_data && !value.additional_data.empty?
290
+ value.additional_data.transform_values { |v| to_hash(v) }
291
+ elsif value.respond_to?(:to_h)
292
+ value.to_h.transform_values { |v| to_hash(v) }
293
+ elsif value.respond_to?(:additional_data) && value.additional_data
294
+ value.additional_data.transform_values { |v| to_hash(v) }
295
+ else
296
+ value
297
+ end
298
+ end
299
+
300
+ # Serializes a Parsable to a Ruby Hash using the pinned Kiota JSON writer.
301
+ # Returns the parsed Hash, or +nil+ when the model isn't a serializable
302
+ # Parsable or serialization/parse fails (caller then falls back).
303
+ # @api private
304
+ def self.serialize_parsable(value)
305
+ return nil unless value.respond_to?(:serialize)
306
+
307
+ writer = MicrosoftKiotaSerializationJson::JsonSerializationWriter.new
308
+ value.serialize(writer)
309
+ content = writer.get_serialized_content
310
+ json = content.is_a?(String) ? content : content.read
311
+ return nil if json.nil? || json.empty?
312
+
313
+ parsed = JSON.parse(json)
314
+ parsed.is_a?(Hash) ? parsed : nil
315
+ rescue StandardError
316
+ nil
317
+ end
318
+
319
+ # Executes a request that returns NO response body (an HTTP 204, or any DELETE
320
+ # whose spec maps the success response to +void+) and returns a plain Hash.
321
+ #
322
+ # Why this exists — two upstream defects in the pinned Kiota Ruby runtime that
323
+ # make the generated void-DELETE builders unusable:
324
+ #
325
+ # 1. The generated +delete+ for a void response calls
326
+ # +send_async(request_info, nil, ...)+, and the Faraday adapter hard-raises
327
+ # +"factory cannot be null"+ BEFORE the request is ever sent — so the DELETE
328
+ # never reaches the server and the caller gets a bare +StandardError+
329
+ # (not a {GhostcrawlError}).
330
+ # 2. For void DELETEs the adapter DOES send (those whose response maps to +Binary+), the response is an
331
+ # empty 204 body, and +get_root_parse_node+ then feeds +""+ to the JSON parser,
332
+ # raising +JSON::ParserError+ on a perfectly successful delete.
333
+ #
334
+ # This helper reuses the adapter's OWN public building blocks
335
+ # (+get_base_url+ / +authentication_provider+ apply base-url + auth, +run_request+
336
+ # sends it) but skips body deserialization entirely: a 2xx returns +{}+ (or the
337
+ # decoded JSON when the server did send a body, e.g. +{"deleted":true}+), and a
338
+ # >=400 is translated into the documented typed {GhostcrawlError} hierarchy.
339
+ #
340
+ # @param adapter [MicrosoftKiotaFaraday::FaradayRequestAdapter]
341
+ # @param request_info [MicrosoftKiotaAbstractions::RequestInformation]
342
+ # @return [Hash] +{}+ on an empty 2xx, or the decoded body when one is present
343
+ # @api private
344
+ def self.void_request!(adapter, request_info)
345
+ # Replicate the first half of the adapter's send_async by hand — apply the
346
+ # base URL, run the bearer-auth Fiber (which mutates request_info's headers),
347
+ # then build the native Faraday request. We deliberately do NOT call the
348
+ # adapter's convert_to_native_request_async helper: its body uses a bare
349
+ # +return+ inside a Fiber, which raises LocalJumpError("unexpected return")
350
+ # when resumed under Ruby 3.x.
351
+ request_info.path_parameters["baseurl"] = adapter.get_base_url
352
+ # authenticate_request returns a Fiber only when the Authorization header is
353
+ # not already present; it returns nil otherwise. Guard the resume.
354
+ auth_fiber = adapter.authentication_provider.authenticate_request(request_info)
355
+ auth_fiber.resume if auth_fiber.respond_to?(:resume)
356
+ request = adapter.get_request_from_request_info(request_info)
357
+ response = adapter.client.run_request(
358
+ request.http_method, request.path, request.body, request.headers
359
+ )
360
+
361
+ status = response.status
362
+ if status >= 400
363
+ # Reuse the typed-error translation: synthesize a message carrying the
364
+ # ":<status>" token raise_translated keys off, plus any body text.
365
+ body = response.body.to_s
366
+ err = MicrosoftKiotaAbstractions::ApiError.new(
367
+ "The server returned an unexpected status code:#{status}" \
368
+ "#{body.empty? ? '' : " #{body}"}"
369
+ )
370
+ Ghostcrawl.raise_translated(err)
201
371
  end
372
+
373
+ body = response.body.to_s
374
+ return {} if body.strip.empty?
375
+
376
+ parsed = (JSON.parse(body) rescue nil)
377
+ parsed.is_a?(Hash) ? parsed.transform_values { |v| to_hash(v) } : {}
378
+ rescue Ghostcrawl::GhostcrawlError
379
+ raise
380
+ rescue StandardError => e
381
+ Ghostcrawl.raise_translated(e)
202
382
  end
203
383
 
204
384
  # Inspects a decoded HTTP-200 response hash for a RESULT-channel failure (the
@@ -347,7 +527,7 @@ module Ghostcrawl
347
527
  # Extend a session's TTL.
348
528
  # Delegates to POST /v1/sessions/{id}/extend via the generated builder.
349
529
  def extend(session_id, duration_seconds: 300)
350
- ResponseHelper.to_hash(@v1.sessions.by_profile__id(session_id).extend.post(
530
+ ResponseHelper.to_hash(@v1.sessions.by_profile_id(session_id).extend.post(
351
531
  AdditionalDataBody.new({ "ttl_seconds" => duration_seconds })
352
532
  ))
353
533
  end
@@ -355,14 +535,15 @@ module Ghostcrawl
355
535
  # Release a session back to the pool.
356
536
  # Delegates to POST /v1/sessions/{id}/release via the generated builder.
357
537
  def release(session_id)
358
- ResponseHelper.to_hash(@v1.sessions.by_profile__id(session_id).release.post)
538
+ ResponseHelper.to_hash(@v1.sessions.by_profile_id(session_id).release.post)
359
539
  end
360
540
  end
361
541
 
362
542
  # Manage identity profiles — /v1/profiles.
363
543
  class ProfilesClient
364
- def initialize(v1)
544
+ def initialize(v1, adapter = nil)
365
545
  @v1 = v1
546
+ @adapter = adapter
366
547
  end
367
548
 
368
549
  # List all profiles.
@@ -392,15 +573,18 @@ module Ghostcrawl
392
573
 
393
574
  # Delete a profile.
394
575
  # Delegates to DELETE /v1/profiles/{name} via the generated builder.
576
+ # The endpoint answers 204 No Content; routing through {ResponseHelper.void_request!}
577
+ # avoids the Kiota JSON parser choking on the empty body.
395
578
  def delete(name)
396
- ResponseHelper.to_hash(@v1.profiles.by_name(name).delete)
579
+ ResponseHelper.void_request!(@adapter, @v1.profiles.by_name(name).to_delete_request_information(nil))
397
580
  end
398
581
  end
399
582
 
400
583
  # Manage webhooks — /v1/webhooks.
401
584
  class WebhooksClient
402
- def initialize(v1)
585
+ def initialize(v1, adapter = nil)
403
586
  @v1 = v1
587
+ @adapter = adapter
404
588
  end
405
589
 
406
590
  # List all webhooks.
@@ -426,10 +610,13 @@ module Ghostcrawl
426
610
  end
427
611
 
428
612
  # Delete a webhook.
429
- # Delegates to DELETE /v1/webhooks/{id} via the generated builder.
613
+ # Delegates to DELETE /v1/webhooks/{id}. The generated builder passes a +nil+
614
+ # response factory (204 void), which the Kiota adapter rejects with a bare
615
+ # StandardError BEFORE sending the request — so we build the request info and
616
+ # run it through {ResponseHelper.void_request!} instead, which actually fires
617
+ # the DELETE and returns +{}+ on success.
430
618
  def delete(webhook_id)
431
- @v1.webhooks.by_webhook_id(webhook_id).delete
432
- {}
619
+ ResponseHelper.void_request!(@adapter, @v1.webhooks.by_webhook_id(webhook_id).to_delete_request_information(nil))
433
620
  end
434
621
 
435
622
  # Rotate the signing secret for a webhook.
@@ -441,8 +628,9 @@ module Ghostcrawl
441
628
 
442
629
  # Manage schedules — /v1/schedules.
443
630
  class SchedulesClient
444
- def initialize(v1)
631
+ def initialize(v1, adapter = nil)
445
632
  @v1 = v1
633
+ @adapter = adapter
446
634
  end
447
635
 
448
636
  # List all schedules.
@@ -459,16 +647,45 @@ module Ghostcrawl
459
647
 
460
648
  # Create a new schedule.
461
649
  # Delegates to POST /v1/schedules via the generated SchedulesRequestBuilder.
462
- def create(cron:, task:, **opts)
463
- data = { "cron_expr" => cron, "task" => task }.merge(opts.transform_keys(&:to_s))
650
+ #
651
+ # The API's ScheduleCreateRequest requires +name+, +job_type+
652
+ # ("scrape" | "crawl" | "change_monitor"), +cron_expr+, and +job_params+
653
+ # (the full scrape/crawl request body). It rejects any other top-level field
654
+ # with a 422 — so a legacy +task+ key must be TRANSLATED, never forwarded.
655
+ #
656
+ # @param cron [String] cron expression (sent as +cron_expr+)
657
+ # @param name [String] schedule name (required by the API)
658
+ # @param job_type [String, nil] "scrape" | "crawl" | "change_monitor"
659
+ # @param job_params [Hash, nil] full job request body (e.g. { url: ... })
660
+ # @param task [Hash, nil] DEPRECATED legacy shape { "action" => ..., ...rest };
661
+ # when given, it fills whichever of job_type/job_params is absent:
662
+ # +job_type+ (from +task["action"]+, else +task["job_type"]+ / +task["type"]+) and +job_params+ (the remaining keys).
663
+ def create(cron:, name: nil, job_type: nil, job_params: nil, task: nil, **opts)
664
+ # Back-compat: derive job_type/job_params from a legacy `task` hash.
665
+ if task.is_a?(Hash)
666
+ t = task.transform_keys(&:to_s)
667
+ job_type ||= t["action"] || t["job_type"] || t["type"]
668
+ job_params ||= t.reject { |k, _| %w[action job_type type].include?(k) }
669
+ end
670
+
671
+ data = { "cron_expr" => cron }
672
+ data["name"] = name unless name.nil?
673
+ data["job_type"] = job_type unless job_type.nil?
674
+ data["job_params"] = job_params unless job_params.nil?
675
+ # opts may override/supply name/job_type/job_params/notify_webhook/monitor_mode.
676
+ # Never forward a raw `task` field — the API 422s on it.
677
+ data.merge!(opts.transform_keys(&:to_s).reject { |k, _| k == "task" })
678
+
464
679
  ResponseHelper.to_hash(@v1.schedules.post(AdditionalDataBody.new(data)))
465
680
  end
466
681
 
467
682
  # Delete a schedule.
468
- # Delegates to DELETE /v1/schedules/{id} via the generated builder.
683
+ # Delegates to DELETE /v1/schedules/{id}. The generated builder returns a Fiber
684
+ # that must be resumed to actually send the request (the old +; {}+ discarded it,
685
+ # so the DELETE never fired). {ResponseHelper.void_request!} runs it and tolerates
686
+ # the empty 204 body.
469
687
  def delete(schedule_id)
470
- @v1.schedules.by_schedule_id(schedule_id).delete
471
- {}
688
+ ResponseHelper.void_request!(@adapter, @v1.schedules.by_schedule_id(schedule_id).to_delete_request_information(nil))
472
689
  end
473
690
  end
474
691
 
@@ -519,8 +736,9 @@ module Ghostcrawl
519
736
 
520
737
  # Manage session recordings — /v1/recordings.
521
738
  class RecordingsClient
522
- def initialize(v1)
739
+ def initialize(v1, adapter = nil)
523
740
  @v1 = v1
741
+ @adapter = adapter
524
742
  end
525
743
 
526
744
  # List all recordings.
@@ -531,15 +749,17 @@ module Ghostcrawl
531
749
 
532
750
  # Get a recording by ID.
533
751
  # Delegates to GET /v1/recordings/{id} via the generated builder.
752
+ # (The generated accessor is +by_recording_id+, single underscore.)
534
753
  def get(recording_id)
535
- ResponseHelper.to_hash(@v1.recordings.by_recording__id(recording_id).get)
754
+ ResponseHelper.to_hash(@v1.recordings.by_recording_id(recording_id).get)
536
755
  end
537
756
 
538
757
  # Delete a recording.
539
- # Delegates to DELETE /v1/recordings/{id} via the generated builder.
758
+ # Delegates to DELETE /v1/recordings/{id}. Same void-DELETE defect as webhooks
759
+ # (generated builder passes a +nil+ factory → adapter raises before sending), so
760
+ # we route through {ResponseHelper.void_request!}.
540
761
  def delete(recording_id)
541
- @v1.recordings.by_recording__id(recording_id).delete
542
- {}
762
+ ResponseHelper.void_request!(@adapter, @v1.recordings.by_recording_id(recording_id).to_delete_request_information(nil))
543
763
  end
544
764
  end
545
765
 
@@ -630,11 +850,14 @@ module Ghostcrawl
630
850
 
631
851
  @crawl_runs = CrawlRunsClient.new(@v1)
632
852
  @sessions = SessionsClient.new(@v1)
633
- @profiles = ProfilesClient.new(@v1)
634
- @webhooks = WebhooksClient.new(@v1)
635
- @schedules = SchedulesClient.new(@v1)
853
+ # Sub-clients with void-DELETE endpoints (204 No Content) also need the raw
854
+ # adapter so ResponseHelper.void_request! can work around the broken generated
855
+ # delete builders. See ResponseHelper.void_request! for the two Kiota defects.
856
+ @profiles = ProfilesClient.new(@v1, @adapter)
857
+ @webhooks = WebhooksClient.new(@v1, @adapter)
858
+ @schedules = SchedulesClient.new(@v1, @adapter)
636
859
  @datasets = DatasetsClient.new(@v1)
637
- @recordings = RecordingsClient.new(@v1)
860
+ @recordings = RecordingsClient.new(@v1, @adapter)
638
861
  @kv = KVClient.new(@v1)
639
862
  end
640
863
 
@@ -92,7 +92,14 @@ module Ghostcrawl
92
92
  when 403 then FORBIDDEN
93
93
  when 404 then NOT_FOUND
94
94
  when 409 then CONFLICT
95
- when 422 then BYO_PROXY_INVALID
95
+ # 422 is the generic validation status the API returns for ANY malformed
96
+ # request (its canonical code is "bad_request": missing field, failed
97
+ # extraction validation, etc.). byo_proxy_invalid is only ONE 422 sub-case
98
+ # and, when it genuinely occurs, arrives WITH a problem+json body carrying
99
+ # code:"byo_proxy_invalid" (which raise_translated reads first). Using the
100
+ # BYO-specific code as the blanket 422 fallback mislabels every unrelated
101
+ # validation 422. Fall back to the generic bad_request instead.
102
+ when 422 then BAD_REQUEST
96
103
  when 429 then RATE_LIMITED
97
104
  when 503 then SERVICE_UNAVAILABLE
98
105
  when 504 then FLEET_TIMEOUT
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Ghostcrawl
4
- VERSION = "2.2.2"
4
+ VERSION = "2.2.3"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ghostcrawl
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.2.2
4
+ version: 2.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - GhostCrawl