ghostcrawl 2.2.1 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/ghostcrawl/client.rb +39 -1
- data/lib/ghostcrawl/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ff09081da25b9c78b261437d5bc26c2a4b25f3e727a53e8393f1adb00aa6536c
|
|
4
|
+
data.tar.gz: ad0869fe147e1c33a49fadd364bc93ce5857fd73eb03b584d4c1ce618e1092a9
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: '085820b153fb6b70ea26f6e866787388835cfec7e4cce86498f83551abe6b4c8800b5eb14f86203c59ca77b7e70cffbd459d0d84f7a75fc69f5337914bcf60e6'
|
|
7
|
+
data.tar.gz: 11a00207fc7469d02bec976018c8e627ea11c0da0ce78d4dc5a6d5b76fced17a5f3a37053159e0bde94dd0c44e4d3e273e7cf57cdac1d652a410bcba4da814ff
|
data/lib/ghostcrawl/client.rb
CHANGED
|
@@ -668,7 +668,7 @@ module Ghostcrawl
|
|
|
668
668
|
# @param extract_schema [Hash, nil] JSON Schema for structured extraction
|
|
669
669
|
# @param raise_on_result_error [Boolean] raise {Ghostcrawl::ScrapeError} on a
|
|
670
670
|
# target-side (HTTP-200) failure instead of returning the raw hash (default true)
|
|
671
|
-
# @return [Hash] response with +markdown+, +status+, and other fields
|
|
671
|
+
# @return [Hash] response with +content+, +markdown+, +status+, and other fields
|
|
672
672
|
def scrape(url:, format: "markdown", engine: "auto", javascript: true, extract_schema: nil,
|
|
673
673
|
raise_on_result_error: true, **opts)
|
|
674
674
|
# Use AdditionalDataBody to send only the fields we specify — the generated
|
|
@@ -678,6 +678,7 @@ module Ghostcrawl
|
|
|
678
678
|
"javascript_enabled" => javascript }.merge(opts.transform_keys(&:to_s))
|
|
679
679
|
data["extract_schema"] = extract_schema unless extract_schema.nil?
|
|
680
680
|
hash = ResponseHelper.to_hash(@v1.scrape.post(AdditionalDataBody.new(data)))
|
|
681
|
+
normalize_scrape_content(hash)
|
|
681
682
|
raise_on_result_error ? ResponseHelper.raise_on_result_error!(hash) : hash
|
|
682
683
|
end
|
|
683
684
|
|
|
@@ -757,5 +758,42 @@ module Ghostcrawl
|
|
|
757
758
|
request_info = @v1.me.to_get_request_information(nil)
|
|
758
759
|
ResponseHelper.to_hash(@adapter.send_async(request_info, Ghostcrawl::V1::Binary, {}))
|
|
759
760
|
end
|
|
761
|
+
|
|
762
|
+
private
|
|
763
|
+
|
|
764
|
+
# Normalizes a +"content"+ key onto a decoded scrape response.
|
|
765
|
+
#
|
|
766
|
+
# The API returns the rendered page under a FORMAT-SPECIFIC key
|
|
767
|
+
# (+"markdown"+, +"html"+, or +"text"+), but the documented quickstart reads
|
|
768
|
+
# +result["content"]+. This mirrors that value onto +"content"+ in place,
|
|
769
|
+
# KEEPING the format-specific key intact (backward compatible).
|
|
770
|
+
#
|
|
771
|
+
# No-op unless +result+ is a Hash that does not already carry +"content"+.
|
|
772
|
+
# The value chosen is: the field named by +result["format"]+ when that field
|
|
773
|
+
# is a String, else the first String among +"markdown"+, +"html"+, +"text"+.
|
|
774
|
+
#
|
|
775
|
+
# @param result [Object] the decoded response (only mutated when a Hash)
|
|
776
|
+
# @return [Object] the same +result+, unchanged reference
|
|
777
|
+
# @api private
|
|
778
|
+
def normalize_scrape_content(result)
|
|
779
|
+
return result unless result.is_a?(Hash) && !result.key?("content")
|
|
780
|
+
|
|
781
|
+
fmt = result["format"]
|
|
782
|
+
value = result[fmt] if fmt.is_a?(String)
|
|
783
|
+
value = nil unless value.is_a?(String)
|
|
784
|
+
|
|
785
|
+
unless value.is_a?(String)
|
|
786
|
+
%w[markdown html text].each do |key|
|
|
787
|
+
candidate = result[key]
|
|
788
|
+
if candidate.is_a?(String)
|
|
789
|
+
value = candidate
|
|
790
|
+
break
|
|
791
|
+
end
|
|
792
|
+
end
|
|
793
|
+
end
|
|
794
|
+
|
|
795
|
+
result["content"] = value if value.is_a?(String)
|
|
796
|
+
result
|
|
797
|
+
end
|
|
760
798
|
end
|
|
761
799
|
end
|
data/lib/ghostcrawl/version.rb
CHANGED