crawlora 1.5.0.pre.sdk.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +21 -0
- data/LICENSE +21 -0
- data/README.md +123 -0
- data/docs/operations.md +338 -0
- data/docs/recipes.md +87 -0
- data/examples/bing_search.rb +10 -0
- data/examples/paginate.rb +11 -0
- data/examples/youtube_transcript.rb +10 -0
- data/lib/crawlora/client.rb +626 -0
- data/lib/crawlora/errors.rb +38 -0
- data/lib/crawlora/operations.rb +13841 -0
- data/lib/crawlora/pagination.rb +39 -0
- data/lib/crawlora/version.rb +9 -0
- data/lib/crawlora.rb +31 -0
- data/openapi/public.json +54522 -0
- data/sig/crawlora.rbs +465 -0
- metadata +70 -0
data/docs/recipes.md
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Crawlora Ruby SDK recipes
|
|
2
|
+
|
|
3
|
+
Common patterns beyond the README. See [`operations.md`](operations.md) for the
|
|
4
|
+
full list of operations.
|
|
5
|
+
|
|
6
|
+
## Authentication
|
|
7
|
+
|
|
8
|
+
```ruby
|
|
9
|
+
# API key (most endpoints):
|
|
10
|
+
Crawlora.client(api_key: "live_…")
|
|
11
|
+
|
|
12
|
+
# JWT (dashboard/user endpoints). A raw token is sent as "Token <jwt>";
|
|
13
|
+
# pass "Bearer <jwt>" yourself to override the scheme.
|
|
14
|
+
Crawlora.client(jwt_token: "eyJ…")
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Client options fall back to environment variables when omitted: `CRAWLORA_API_KEY` and
|
|
18
|
+
`CRAWLORA_BASE_URL`.
|
|
19
|
+
|
|
20
|
+
## Retries and Retry-After
|
|
21
|
+
|
|
22
|
+
```ruby
|
|
23
|
+
Crawlora.client(
|
|
24
|
+
retries: 3,
|
|
25
|
+
retry_delay: 0.5, # exponential backoff with jitter
|
|
26
|
+
max_retry_delay: 10,
|
|
27
|
+
retry_statuses: [429, 503], # override the default retryable set
|
|
28
|
+
on_retry: ->(attempt, error, delay) { warn "retry #{attempt} after #{delay}s (#{error.status})" }
|
|
29
|
+
)
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
A custom `retry_predicate` takes precedence over `retry_statuses`:
|
|
33
|
+
|
|
34
|
+
```ruby
|
|
35
|
+
Crawlora.client(retries: 2, retry_predicate: ->(status, _error) { status == 429 })
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
`Retry-After` (seconds or HTTP-date) is always honored, capped at
|
|
39
|
+
`max_retry_delay`.
|
|
40
|
+
|
|
41
|
+
## Hooks
|
|
42
|
+
|
|
43
|
+
```ruby
|
|
44
|
+
client = Crawlora.client(
|
|
45
|
+
before_request: ->(ctx) { ctx[:headers]["X-Trace-Id"] = SecureRandom.uuid },
|
|
46
|
+
after_response: ->(operation_id, status, _headers, body) {
|
|
47
|
+
body.is_a?(Hash) ? body.merge("_op" => operation_id, "_status" => status) : body
|
|
48
|
+
}
|
|
49
|
+
)
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
`before_request` receives a mutable context (`:operation`, `:method`, `:url`,
|
|
53
|
+
`:headers`); editing `:url`/`:headers` rewrites the outgoing request.
|
|
54
|
+
`after_response` may return a replacement body (return `nil` to keep the original body).
|
|
55
|
+
|
|
56
|
+
## Rate limiting and concurrency
|
|
57
|
+
|
|
58
|
+
```ruby
|
|
59
|
+
client = Crawlora.client(rate_limit: 10, max_concurrency: 4)
|
|
60
|
+
|
|
61
|
+
threads = queries.map do |q|
|
|
62
|
+
Thread.new { client.bing.search(q: q) } # throttled to 10 rps / 4 in-flight
|
|
63
|
+
end
|
|
64
|
+
threads.each(&:join)
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Response modes
|
|
68
|
+
|
|
69
|
+
```ruby
|
|
70
|
+
client.request("youtube-transcript", { id: "abc" }, response_type: "text") # String
|
|
71
|
+
io = client.request("bing-search", { q: "x" }, response_type: "stream") # StringIO
|
|
72
|
+
io.read
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
`auto` (default) parses JSON when the response is JSON and returns text
|
|
76
|
+
otherwise.
|
|
77
|
+
|
|
78
|
+
## Custom transport (testing)
|
|
79
|
+
|
|
80
|
+
Inject any object responding to
|
|
81
|
+
`call(method:, url:, headers:, body:, timeout:)` and returning a
|
|
82
|
+
`Crawlora::Response`:
|
|
83
|
+
|
|
84
|
+
```ruby
|
|
85
|
+
fake = ->(**) { Crawlora::Response.new(200, { "content-type" => "application/json" }, '{"data":[]}') }
|
|
86
|
+
client = Crawlora.client(transport: fake)
|
|
87
|
+
```
|
|
data/examples/bing_search.rb
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Basic search call. Run with:
|
|
4
|
+
# CRAWLORA_API_KEY=... ruby examples/bing_search.rb
|
|
5
|
+
require "crawlora"
|
|
6
|
+
|
|
7
|
+
Crawlora.client do |client|
|
|
8
|
+
result = client.bing.search(q: "web scraping")
|
|
9
|
+
result["data"].each { |item| puts item["title"] || item.inspect }
|
|
10
|
+
end
|
|
data/examples/paginate.rb
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Iterate pages until the API returns an empty page (capped here by max_pages: 3). Run with:
|
|
4
|
+
# CRAWLORA_API_KEY=... ruby examples/paginate.rb
|
|
5
|
+
require "crawlora"
|
|
6
|
+
|
|
7
|
+
Crawlora.client do |client|
|
|
8
|
+
client.paginate_items("airbnb-room-reviews", { id: "YOUR_ROOM_ID" }, max_pages: 3).each do |review|
|
|
9
|
+
puts review.inspect
|
|
10
|
+
end
|
|
11
|
+
end
|
|
data/examples/youtube_transcript.rb
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
# Fetch a transcript in plain-text response mode. Run with:
|
|
4
|
+
# CRAWLORA_API_KEY=... ruby examples/youtube_transcript.rb
|
|
5
|
+
require "crawlora"
|
|
6
|
+
|
|
7
|
+
Crawlora.client do |client|
|
|
8
|
+
transcript = client.request("youtube-transcript", { id: "dQw4w9WgXcQ" }, response_type: "text")
|
|
9
|
+
puts transcript
|
|
10
|
+
end
|