scrapio 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +140 -0
- data/lib/scrapio/client.rb +28 -0
- data/lib/scrapio/errors.rb +14 -0
- data/lib/scrapio/http_client.rb +53 -0
- data/lib/scrapio/resources/amazon.rb +15 -0
- data/lib/scrapio/resources/crawl.rb +16 -0
- data/lib/scrapio/resources/fetch.rb +16 -0
- data/lib/scrapio/resources/google.rb +15 -0
- data/lib/scrapio/resources/interact.rb +15 -0
- data/lib/scrapio/resources/jobs.rb +31 -0
- data/lib/scrapio/resources/walmart.rb +15 -0
- data/lib/scrapio/resources/youtube.rb +19 -0
- data/lib/scrapio.rb +8 -0
- metadata +60 -0
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA256:
|
|
3
|
+
metadata.gz: 0b01cff647c80bdc948a24eef3b9dfe4095466678b6eda74cf01bbfc6a706c57
|
|
4
|
+
data.tar.gz: '08880b616f26a60b6ce18949253cd96c2d24aca227e702843b6371dcab60c7d6'
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 0f321767a8ef0e4775ee52836ba7fa620df758ad341c3d19abf5d52ddaf2a90ba4e9c2e46b22d76d7cbc3ee056cbed63b34d8801d5a3737dbd471c51768122e4
|
|
7
|
+
data.tar.gz: da66af15d073cce2719fab455fd940d272215d76e3abb894c332197ed817d9b79a1392302e04c85f74e896d9726bd4957886d8f833cf34a949906c932fdd8c80
|
data/README.md
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# scrapio
|
|
2
|
+
|
|
3
|
+
Official Ruby SDK for [Scrapio](https://scrapio.dev) — fetch, crawl, search, and extract structured data from any URL.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
gem install scrapio
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Or add to your Gemfile:
|
|
12
|
+
|
|
13
|
+
```ruby
|
|
14
|
+
gem "scrapio"
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Requires Ruby 2.7 or later.
|
|
18
|
+
|
|
19
|
+
## Quickstart
|
|
20
|
+
|
|
21
|
+
```ruby
|
|
22
|
+
require "scrapio"
|
|
23
|
+
|
|
24
|
+
client = Scrapio::Client.new(ENV["SCRAPIO_API_KEY"])
|
|
25
|
+
|
|
26
|
+
result = client.fetch.fetch(url: "https://example.com", output: ["markdown"])
|
|
27
|
+
puts result["outputs"]["markdown"]
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Usage
|
|
31
|
+
|
|
32
|
+
### Fetch a page
|
|
33
|
+
|
|
34
|
+
```ruby
|
|
35
|
+
result = client.fetch.fetch(
|
|
36
|
+
url: "https://news.ycombinator.com",
|
|
37
|
+
render_js: true,
|
|
38
|
+
output: ["markdown"]
|
|
39
|
+
)
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### Google Search
|
|
43
|
+
|
|
44
|
+
```ruby
|
|
45
|
+
results = client.google.search(
|
|
46
|
+
search: "best web scraping API 2025",
|
|
47
|
+
country_code: "us"
|
|
48
|
+
)
|
|
49
|
+
puts results["results"]
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### Amazon product
|
|
53
|
+
|
|
54
|
+
```ruby
|
|
55
|
+
product = client.amazon.get_product("B08N5WRWNW")
|
|
56
|
+
puts "#{product["title"]} — $#{product["price"]}"
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Walmart search
|
|
60
|
+
|
|
61
|
+
```ruby
|
|
62
|
+
items = client.walmart.search("headphones")
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### YouTube video
|
|
66
|
+
|
|
67
|
+
```ruby
|
|
68
|
+
video = client.youtube.get_video("dQw4w9WgXcQ")
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Browser automation
|
|
72
|
+
|
|
73
|
+
```ruby
|
|
74
|
+
result = client.interact.interact(
|
|
75
|
+
url: "https://example.com",
|
|
76
|
+
actions: [
|
|
77
|
+
{ type: "click", selector: "#login" },
|
|
78
|
+
{ type: "type", selector: "#email", value: "user@example.com" },
|
|
79
|
+
]
|
|
80
|
+
)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### Crawl a site
|
|
84
|
+
|
|
85
|
+
```ruby
|
|
86
|
+
result = client.crawl.crawl(
|
|
87
|
+
seeds: ["https://docs.example.com"],
|
|
88
|
+
max_pages: 50,
|
|
89
|
+
output: ["markdown"]
|
|
90
|
+
)
|
|
91
|
+
puts result["result"]["summary"]["pages_succeeded"]
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Async jobs
|
|
95
|
+
|
|
96
|
+
```ruby
|
|
97
|
+
job = client.jobs.create(
|
|
98
|
+
job_type: "fetch",
|
|
99
|
+
payload: { url: "https://example.com", output: ["markdown"] }
|
|
100
|
+
)
|
|
101
|
+
|
|
102
|
+
result = client.jobs.wait_for_completion(job["job_id"])
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Configuration
|
|
106
|
+
|
|
107
|
+
```ruby
|
|
108
|
+
client = Scrapio::Client.new(
|
|
109
|
+
ENV["SCRAPIO_API_KEY"],
|
|
110
|
+
base_url: "https://api.scrapio.dev", # optional override
|
|
111
|
+
timeout: 30 # optional, default 30s
|
|
112
|
+
)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Error handling
|
|
116
|
+
|
|
117
|
+
```ruby
|
|
118
|
+
begin
|
|
119
|
+
result = client.fetch.fetch(url: "https://example.com")
|
|
120
|
+
rescue Scrapio::AuthError
|
|
121
|
+
puts "Invalid API key"
|
|
122
|
+
rescue Scrapio::CreditsExhaustedError
|
|
123
|
+
puts "No credits remaining"
|
|
124
|
+
rescue Scrapio::RateLimitError
|
|
125
|
+
puts "Rate limited — back off and retry"
|
|
126
|
+
rescue Scrapio::ScrapioError => e
|
|
127
|
+
puts "API error #{e.status_code}: #{e.message}"
|
|
128
|
+
end
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Links
|
|
132
|
+
|
|
133
|
+
- [Documentation](https://scrapio.dev/docs)
|
|
134
|
+
- [API Reference](https://scrapio.dev/docs/api-reference/fetch)
|
|
135
|
+
- [Dashboard](https://app.scrapio.dev)
|
|
136
|
+
- [Get an API key](https://scrapio.dev#pricing)
|
|
137
|
+
|
|
138
|
+
## License
|
|
139
|
+
|
|
140
|
+
MIT
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
require_relative "http_client"
|
|
2
|
+
require_relative "resources/fetch"
|
|
3
|
+
require_relative "resources/google"
|
|
4
|
+
require_relative "resources/amazon"
|
|
5
|
+
require_relative "resources/walmart"
|
|
6
|
+
require_relative "resources/youtube"
|
|
7
|
+
require_relative "resources/jobs"
|
|
8
|
+
require_relative "resources/crawl"
|
|
9
|
+
require_relative "resources/interact"
|
|
10
|
+
|
|
11
|
+
module Scrapio
  # Public entry point of the SDK.
  #
  # Builds a single HttpClient from the credentials and connection options and
  # hands that same instance to every resource object, each of which is
  # exposed through a reader of the same name.
  class Client
    attr_reader :fetch, :google, :amazon, :walmart, :youtube, :jobs, :crawl, :interact

    # @param api_key [String] forwarded unchanged to HttpClient
    # @param base_url [String] API root; defaults to HttpClient::DEFAULT_BASE_URL
    # @param timeout [Numeric] forwarded to HttpClient; defaults to
    #   HttpClient::DEFAULT_TIMEOUT
    def initialize(api_key, base_url: HttpClient::DEFAULT_BASE_URL, timeout: HttpClient::DEFAULT_TIMEOUT)
      transport = HttpClient.new(api_key, base_url: base_url, timeout: timeout)

      # Resources are instantiated in the same order as the readers above and
      # all share the one transport.
      @fetch, @google, @amazon, @walmart, @youtube, @jobs, @crawl, @interact = [
        Resources::Fetch, Resources::Google, Resources::Amazon, Resources::Walmart,
        Resources::YouTube, Resources::Jobs, Resources::Crawl, Resources::Interact
      ].map { |resource_class| resource_class.new(transport) }
    end
  end
end
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
module Scrapio
  # Root of the SDK's exception hierarchy.
  #
  # Carries the HTTP status associated with the failure; +status_code+ is 0
  # when the error is raised without one.
  class ScrapioError < StandardError
    attr_reader :status_code

    # @param message [String] human-readable description
    # @param status_code [Integer] HTTP status tied to the failure (0 if none)
    def initialize(message, status_code: 0)
      @status_code = status_code
      super(message)
    end
  end

  # Raised by HttpClient for HTTP 401 and 403 responses.
  class AuthError < ScrapioError
  end

  # Raised by HttpClient for HTTP 429 responses.
  class RateLimitError < ScrapioError
  end

  # Raised by HttpClient for HTTP 402 responses.
  class CreditsExhaustedError < ScrapioError
  end
end
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
require "net/http"
|
|
2
|
+
require "uri"
|
|
3
|
+
require "json"
|
|
4
|
+
|
|
5
|
+
module Scrapio
  # Minimal JSON-over-HTTP transport shared by all resource classes.
  #
  # Every request carries the bearer token and an "Accept: application/json"
  # header. Successful (2xx) responses return the decoded JSON body; any other
  # status is raised as a ScrapioError (or one of its subclasses).
  class HttpClient
    DEFAULT_BASE_URL = "https://api.scrapio.dev"
    DEFAULT_TIMEOUT = 30

    # Sentinel for "the body was present but is not valid JSON".
    UNPARSABLE = Object.new.freeze
    private_constant :UNPARSABLE

    # @param api_key [String] sent as "Authorization: Bearer <api_key>"
    # @param base_url [String] API root that request paths are appended to
    # @param timeout [Numeric] seconds, applied to both open and read timeouts
    def initialize(api_key, base_url: DEFAULT_BASE_URL, timeout: DEFAULT_TIMEOUT)
      @api_key = api_key
      # Paths always start with "/", so drop one trailing slash to avoid "//".
      @base_url = base_url.to_s.chomp("/")
      @timeout = timeout
    end

    # Issues a GET request.
    #
    # @param path [String] path appended to the base URL (e.g. "/v1/jobs/1")
    # @param params [Hash] query parameters; nil values are dropped
    # @return [Object, nil] decoded JSON body (nil when the body is empty)
    # @raise [ScrapioError] on any non-2xx status or an undecodable 2xx body
    def get(path, params = {})
      uri = URI("#{@base_url}#{path}")
      filtered = params.compact
      uri.query = URI.encode_www_form(filtered) unless filtered.empty?
      request(Net::HTTP::Get.new(uri))
    end

    # Issues a POST request with a JSON body.
    #
    # @param path [String] path appended to the base URL
    # @param body [Hash] payload; top-level nil values are dropped before encoding
    # @return [Object, nil] decoded JSON body (nil when the body is empty)
    # @raise [ScrapioError] on any non-2xx status or an undecodable 2xx body
    def post(path, body = {})
      uri = URI("#{@base_url}#{path}")
      req = Net::HTTP::Post.new(uri)
      req["Content-Type"] = "application/json"
      req.body = JSON.generate(body.compact)
      request(req)
    end

    private

    # Sends +req+ and converts the response into a return value or an error.
    def request(req)
      req["Authorization"] = "Bearer #{@api_key}"
      req["Accept"] = "application/json"

      uri = req.uri
      resp = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == "https",
                                                 read_timeout: @timeout, open_timeout: @timeout) do |http|
        http.request(req)
      end

      status = resp.code.to_i
      payload = decode(resp.body)
      raise error_for(status, payload) unless (200..299).cover?(status)

      if payload.equal?(UNPARSABLE)
        raise ScrapioError.new("Invalid JSON in HTTP #{status} response", status_code: status)
      end

      payload
    end

    # Decodes a response body. Returns nil for a missing/empty body and the
    # UNPARSABLE sentinel for a body that is not JSON, so that the status code
    # (not a JSON::ParserError) decides which error the caller sees.
    def decode(raw)
      return nil if raw.nil? || raw.empty?

      JSON.parse(raw)
    rescue JSON::ParserError
      UNPARSABLE
    end

    # Maps a non-2xx status onto the matching error class, using the
    # "message" field of the payload when the payload is a JSON object.
    def error_for(status, payload)
      message = payload["message"] if payload.is_a?(Hash)
      error_class, fallback =
        case status
        when 401, 403 then [AuthError, "Unauthorized"]
        when 402 then [CreditsExhaustedError, "Credits exhausted"]
        when 429 then [RateLimitError, "Rate limited"]
        else [ScrapioError, "HTTP #{status}"]
        end
      error_class.new(message || fallback, status_code: status)
    end
  end
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
module Scrapio
  module Resources
    # Amazon endpoints: product lookup and search.
    class Amazon
      # Written as a regular method definition: the endless-method form is a
      # syntax error on Ruby 2.7, which the gem declares as supported.
      #
      # @param http [#get] transport used to issue requests
      def initialize(http)
        @http = http
      end

      # GET /v1/amazon/product.
      #
      # @param asin [String] sent as the +asin+ query parameter
      # @param country [String, nil] optional +country+ query parameter
      # @return [Object] whatever the transport returns for the request
      def get_product(asin, country: nil)
        @http.get("/v1/amazon/product", { asin: asin, country: country })
      end

      # GET /v1/amazon/search.
      #
      # @param query [String] sent as the +query+ query parameter
      # @param country [String, nil] optional +country+ query parameter
      # @param page [Integer, nil] optional +page+ query parameter
      # @return [Object] whatever the transport returns for the request
      def search(query, country: nil, page: nil)
        @http.get("/v1/amazon/search", { query: query, country: country, page: page })
      end
    end
  end
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
module Scrapio
  module Resources
    # Crawl endpoint.
    class Crawl
      # Written as a regular method definition: the endless-method form is a
      # syntax error on Ruby 2.7, which the gem declares as supported.
      #
      # @param http [#post] transport used to issue requests
      def initialize(http)
        @http = http
      end

      # POST /v1/crawl. Every keyword is forwarded under the same name in the
      # request body; options left as nil are passed through as nil for the
      # transport to handle.
      #
      # @param seeds [Array<String>] starting URLs (required)
      # @return [Object] whatever the transport returns for the request
      def crawl(seeds:, max_pages: nil, max_depth: nil, same_domain_only: nil,
                output: nil, extract: nil, timeout_ms: nil)
        @http.post("/v1/crawl", {
          seeds: seeds, max_pages: max_pages, max_depth: max_depth,
          same_domain_only: same_domain_only, output: output,
          extract: extract, timeout_ms: timeout_ms,
        })
      end
    end
  end
end
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
module Scrapio
  module Resources
    # Fetch endpoint.
    class Fetch
      # Written as a regular method definition: the endless-method form is a
      # syntax error on Ruby 2.7, which the gem declares as supported.
      #
      # @param http [#post] transport used to issue requests
      def initialize(http)
        @http = http
      end

      # POST /v1/fetch. Every keyword is forwarded under the same name in the
      # request body; options left as nil are passed through as nil for the
      # transport to handle.
      #
      # @param url [String] page to fetch (required)
      # @return [Object] whatever the transport returns for the request
      def fetch(url:, render_js: nil, device: nil, session: nil, output: nil,
                extract: nil, actions: nil, timeout: nil, proxy: nil, country: nil)
        @http.post("/v1/fetch", {
          url: url, render_js: render_js, device: device, session: session,
          output: output, extract: extract, actions: actions,
          timeout: timeout, proxy: proxy, country: country,
        })
      end
    end
  end
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
module Scrapio
  module Resources
    # Google Search endpoint.
    class Google
      # Written as a regular method definition: the endless-method form is a
      # syntax error on Ruby 2.7, which the gem declares as supported.
      #
      # @param http [#get] transport used to issue requests
      def initialize(http)
        @http = http
      end

      # GET /v1/google/search. Named keywords are sent as query parameters of
      # the same name; any additional keywords in +opts+ are merged in last,
      # so they are sent as extra query parameters.
      #
      # @param search [String] search terms (required)
      # @return [Object] whatever the transport returns for the request
      def search(search:, search_type: nil, country_code: nil, language: nil,
                 device: nil, page: nil, date_range: nil, **opts)
        @http.get("/v1/google/search", {
          search: search, search_type: search_type, country_code: country_code,
          language: language, device: device, page: page, date_range: date_range,
        }.merge(opts))
      end
    end
  end
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
module Scrapio
  module Resources
    # Browser-interaction endpoint.
    class Interact
      # Written as a regular method definition: the endless-method form is a
      # syntax error on Ruby 2.7, which the gem declares as supported.
      #
      # @param http [#post] transport used to issue requests
      def initialize(http)
        @http = http
      end

      # POST /v1/interact. Every keyword is forwarded under the same name in
      # the request body; options left as nil are passed through as nil for
      # the transport to handle.
      #
      # @param url [String] page to open (required)
      # @param actions [Array<Hash>] actions to perform (required)
      # @return [Object] whatever the transport returns for the request
      def interact(url:, actions:, device: nil, session: nil,
                   output: nil, extract: nil, timeout_ms: nil)
        @http.post("/v1/interact", {
          url: url, actions: actions, device: device, session: session,
          output: output, extract: extract, timeout_ms: timeout_ms,
        })
      end
    end
  end
end
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
module Scrapio
  module Resources
    # Async job endpoints: create a job, inspect it, fetch its result, and
    # poll until it reaches a terminal status.
    class Jobs
      # Statuses after which polling stops.
      TERMINAL = %w[completed partial failed cancelled].freeze

      # Written as a regular method definition: the endless-method form is a
      # syntax error on Ruby 2.7, which the gem declares as supported.
      #
      # @param http [#get, #post] transport used to issue requests
      def initialize(http)
        @http = http
      end

      # POST /v1/jobs.
      #
      # @param job_type [String] sent as +job_type+
      # @param payload [Hash] sent as +payload+
      # @param webhook_url [String, nil] optional +webhook_url+
      # @return [Object] whatever the transport returns for the request
      def create(job_type:, payload:, webhook_url: nil)
        @http.post("/v1/jobs", { job_type: job_type, payload: payload, webhook_url: webhook_url })
      end

      # GET /v1/jobs/<job_id>.
      def get(job_id)
        @http.get("/v1/jobs/#{job_id}")
      end

      # GET /v1/jobs/<job_id>/result.
      def get_result(job_id)
        @http.get("/v1/jobs/#{job_id}/result")
      end

      # Polls the job until its "status" is one of TERMINAL, then returns the
      # job result (for every terminal status, including failed/cancelled).
      #
      # @param job_id [String] job to wait for
      # @param poll_interval [Numeric] seconds to sleep between polls
      # @param timeout [Numeric] maximum seconds to wait overall
      # @return [Object] the value of #get_result for the job
      # @raise [ScrapioError] if the job is still not terminal at the deadline
      def wait_for_completion(job_id, poll_interval: 2.0, timeout: 300.0)
        # Monotonic clock: immune to wall-clock adjustments while waiting.
        deadline = monotonic_now + timeout
        loop do
          job = get(job_id)
          return get_result(job_id) if TERMINAL.include?(job["status"])

          remaining = deadline - monotonic_now
          raise ScrapioError, "Job #{job_id} did not complete within #{timeout}s" if remaining <= 0

          # Never sleep past the deadline; the last poll happens right at it.
          sleep([poll_interval, remaining].min)
        end
      end

      private

      # Current monotonic time in seconds.
      def monotonic_now
        Process.clock_gettime(Process::CLOCK_MONOTONIC)
      end
    end
  end
end
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
module Scrapio
  module Resources
    # Walmart endpoints: product lookup and search.
    class Walmart
      # Written as a regular method definition: the endless-method form is a
      # syntax error on Ruby 2.7, which the gem declares as supported.
      #
      # @param http [#get] transport used to issue requests
      def initialize(http)
        @http = http
      end

      # GET /v1/walmart/product.
      #
      # @param product_id [String] sent as the +product_id+ query parameter
      # @param country [String, nil] optional +country+ query parameter
      # @return [Object] whatever the transport returns for the request
      def get_product(product_id, country: nil)
        @http.get("/v1/walmart/product", { product_id: product_id, country: country })
      end

      # GET /v1/walmart/search.
      #
      # @param query [String] sent as the +query+ query parameter
      # @param country [String, nil] optional +country+ query parameter
      # @param page [Integer, nil] optional +page+ query parameter
      # @return [Object] whatever the transport returns for the request
      def search(query, country: nil, page: nil)
        @http.get("/v1/walmart/search", { query: query, country: country, page: page })
      end
    end
  end
end
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
module Scrapio
  module Resources
    # YouTube endpoints: video lookup, search and subtitles.
    class YouTube
      # Written as a regular method definition: the endless-method form is a
      # syntax error on Ruby 2.7, which the gem declares as supported.
      #
      # @param http [#get] transport used to issue requests
      def initialize(http)
        @http = http
      end

      # GET /v1/youtube/videos/<video_id>, with the id percent-encoded so it
      # stays a single path segment.
      #
      # @param video_id [String] video identifier
      # @return [Object] whatever the transport returns for the request
      def get_video(video_id)
        # URI.encode_uri_component is not available on older Rubies. Form
        # encoding differs only in writing a space as "+", and a literal "+"
        # is itself encoded as %2B, so rewriting "+" to "%20" is equivalent.
        segment = URI.encode_www_form_component(video_id).gsub("+", "%20")
        @http.get("/v1/youtube/videos/#{segment}")
      end

      # GET /v1/youtube/search.
      #
      # @param query [String] sent as the +query+ query parameter
      # @param page [Integer, nil] optional +page+ query parameter
      # @param country [String, nil] optional +country+ query parameter
      # @param language [String, nil] optional +language+ query parameter
      # @return [Object] whatever the transport returns for the request
      def search(query, page: nil, country: nil, language: nil)
        @http.get("/v1/youtube/search", { query: query, page: page, country: country, language: language })
      end

      # GET /v1/youtube/subtitles.
      #
      # @param video_id [String] sent as the +video_id+ query parameter
      # @param language [String, nil] optional +language+ query parameter
      # @return [Object] whatever the transport returns for the request
      def get_subtitles(video_id, language: nil)
        @http.get("/v1/youtube/subtitles", { video_id: video_id, language: language })
      end
    end
  end
end
|
data/lib/scrapio.rb
ADDED
metadata
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
name: scrapio
|
|
3
|
+
version: !ruby/object:Gem::Version
|
|
4
|
+
version: 1.0.0
|
|
5
|
+
platform: ruby
|
|
6
|
+
authors:
|
|
7
|
+
- Scrapio
|
|
8
|
+
autorequire:
|
|
9
|
+
bindir: bin
|
|
10
|
+
cert_chain: []
|
|
11
|
+
date: 2026-06-29 00:00:00.000000000 Z
|
|
12
|
+
dependencies: []
|
|
13
|
+
description: Fetch, crawl, search, and extract structured data from any URL. Includes
|
|
14
|
+
Google Search, YouTube transcripts, Amazon and Walmart product data, browser automation,
|
|
15
|
+
and async jobs.
|
|
16
|
+
email: support@scrapio.dev
|
|
17
|
+
executables: []
|
|
18
|
+
extensions: []
|
|
19
|
+
extra_rdoc_files: []
|
|
20
|
+
files:
|
|
21
|
+
- README.md
|
|
22
|
+
- lib/scrapio.rb
|
|
23
|
+
- lib/scrapio/client.rb
|
|
24
|
+
- lib/scrapio/errors.rb
|
|
25
|
+
- lib/scrapio/http_client.rb
|
|
26
|
+
- lib/scrapio/resources/amazon.rb
|
|
27
|
+
- lib/scrapio/resources/crawl.rb
|
|
28
|
+
- lib/scrapio/resources/fetch.rb
|
|
29
|
+
- lib/scrapio/resources/google.rb
|
|
30
|
+
- lib/scrapio/resources/interact.rb
|
|
31
|
+
- lib/scrapio/resources/jobs.rb
|
|
32
|
+
- lib/scrapio/resources/walmart.rb
|
|
33
|
+
- lib/scrapio/resources/youtube.rb
|
|
34
|
+
homepage: https://scrapio.dev
|
|
35
|
+
licenses:
|
|
36
|
+
- MIT
|
|
37
|
+
metadata:
|
|
38
|
+
homepage_uri: https://scrapio.dev
|
|
39
|
+
source_code_uri: https://github.com/xsronhou/scrapping-tool
|
|
40
|
+
documentation_uri: https://scrapio.dev/docs
|
|
41
|
+
post_install_message:
|
|
42
|
+
rdoc_options: []
|
|
43
|
+
require_paths:
|
|
44
|
+
- lib
|
|
45
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
|
46
|
+
requirements:
|
|
47
|
+
- - ">="
|
|
48
|
+
- !ruby/object:Gem::Version
|
|
49
|
+
version: '2.7'
|
|
50
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
51
|
+
requirements:
|
|
52
|
+
- - ">="
|
|
53
|
+
- !ruby/object:Gem::Version
|
|
54
|
+
version: '0'
|
|
55
|
+
requirements: []
|
|
56
|
+
rubygems_version: 3.5.22
|
|
57
|
+
signing_key:
|
|
58
|
+
specification_version: 4
|
|
59
|
+
summary: Official Ruby SDK for Scrapio
|
|
60
|
+
test_files: []
|