crawlbase 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +9 -3
- data/crawlbase.gemspec +1 -0
- data/lib/crawlbase/api.rb +15 -7
- data/lib/crawlbase/version.rb +1 -1
- metadata +20 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6a1ccd286be2a6973c23cc8c9dc84a292af4d49910995054312a68e9beca6761
|
4
|
+
data.tar.gz: 0da3a67ba36a2151d6b982f7a9dc961adae4a1a58f0aac311d089e2dfd99cce9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4d4b9ce783a5fa43cb0dab14550f01f2fcaed958b31bceeab91a9540456361a226f18fb41e11ae874f7d20862ba49d62088ab492b44c8a9b73c96c5812b8d4f3
|
7
|
+
data.tar.gz: 39ce7ce6996726bb7a3043277b1170fbc06b05f4cebf2903dc1db242bb4ae865de7a5a7a5eaa5af9f7861fcbccbfff92cce8227f5b2ed7df284c47d4c9f7cbe0
|
data/README.md
CHANGED
@@ -34,6 +34,12 @@ You can get a token for free by [creating a Crawlbase account](https://crawlbase
|
|
34
34
|
api = Crawlbase::API.new(token: 'YOUR_TOKEN')
|
35
35
|
```
|
36
36
|
|
37
|
+
By default, the timeout for API requests is set to 90 seconds. You can configure a custom timeout by passing a `timeout` option during initialization.
|
38
|
+
|
39
|
+
```ruby
|
40
|
+
api = Crawlbase::API.new(token: 'YOUR_TOKEN', timeout: 120)
|
41
|
+
```
|
42
|
+
|
37
43
|
### GET requests
|
38
44
|
|
39
45
|
Pass the url that you want to scrape plus any options from the ones available in the [API documentation](https://crawlbase.com/dashboard/docs).
|
@@ -103,7 +109,7 @@ puts response.body
|
|
103
109
|
If you need to scrape a website built with a JavaScript framework such as React, Angular, or Vue, you just need to pass your JavaScript token and use the same calls. Note that only `.get` is available for JavaScript requests, not `.post`.
|
104
110
|
|
105
111
|
```ruby
|
106
|
-
api = Crawlbase::API.new(token: 'YOUR_JAVASCRIPT_TOKEN'
|
112
|
+
api = Crawlbase::API.new(token: 'YOUR_JAVASCRIPT_TOKEN')
|
107
113
|
```
|
108
114
|
|
109
115
|
```ruby
|
@@ -357,8 +363,8 @@ The gem is available as open source under the terms of the [MIT License](http://
|
|
357
363
|
|
358
364
|
## Code of Conduct
|
359
365
|
|
360
|
-
Everyone interacting in the Crawlbase project
|
366
|
+
Everyone interacting in the Crawlbase project's codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/crawlbase-source/crawlbase-ruby/blob/master/CODE_OF_CONDUCT.md).
|
361
367
|
|
362
368
|
---
|
363
369
|
|
364
|
-
Copyright
|
370
|
+
Copyright 2025 Crawlbase
|
data/crawlbase.gemspec
CHANGED
data/lib/crawlbase/api.rb
CHANGED
@@ -6,23 +6,26 @@ require 'uri'
|
|
6
6
|
|
7
7
|
module Crawlbase
|
8
8
|
class API
|
9
|
-
attr_reader :token, :body, :status_code, :original_status, :pc_status, :url, :storage_url
|
9
|
+
attr_reader :token, :body, :status_code, :original_status, :pc_status, :url, :storage_url, :timeout
|
10
10
|
|
11
11
|
INVALID_TOKEN = 'Token is required'
|
12
12
|
INVALID_URL = 'URL is required'
|
13
|
+
DEFAULT_TIMEOUT = 90
|
13
14
|
|
14
15
|
def initialize(options = {})
|
15
16
|
raise INVALID_TOKEN if options[:token].nil?
|
16
17
|
|
17
18
|
@token = options[:token]
|
19
|
+
@timeout = options.fetch(:timeout, DEFAULT_TIMEOUT)
|
18
20
|
end
|
19
21
|
|
20
22
|
def get(url, options = {})
|
21
23
|
raise INVALID_URL if url.empty?
|
22
24
|
|
23
25
|
uri = prepare_uri(url, options)
|
24
|
-
|
25
|
-
|
26
|
+
http = build_http(uri)
|
27
|
+
request = Net::HTTP::Get.new(uri.request_uri)
|
28
|
+
response = http.request(request)
|
26
29
|
|
27
30
|
prepare_response(response, options[:format])
|
28
31
|
|
@@ -33,10 +36,7 @@ module Crawlbase
|
|
33
36
|
raise INVALID_URL if url.empty?
|
34
37
|
|
35
38
|
uri = prepare_uri(url, options)
|
36
|
-
|
37
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
38
|
-
|
39
|
-
http.use_ssl = true
|
39
|
+
http = build_http(uri)
|
40
40
|
|
41
41
|
content_type = options[:post_content_type].to_s.include?('json') ? { 'Content-Type': 'text/json' } : nil
|
42
42
|
|
@@ -57,6 +57,14 @@ module Crawlbase
|
|
57
57
|
|
58
58
|
private
|
59
59
|
|
60
|
+
def build_http(uri)
|
61
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
62
|
+
http.use_ssl = true
|
63
|
+
http.open_timeout = @timeout
|
64
|
+
http.read_timeout = @timeout
|
65
|
+
http
|
66
|
+
end
|
67
|
+
|
60
68
|
def base_url
|
61
69
|
'https://api.crawlbase.com'
|
62
70
|
end
|
data/lib/crawlbase/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: crawlbase
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.0
|
4
|
+
version: 1.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- crawlbase
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2025-06-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -66,6 +66,20 @@ dependencies:
|
|
66
66
|
- - "~>"
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: 12.3.3
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rexml
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '3.2'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '3.2'
|
69
83
|
description: Ruby based client for the Crawlbase API that helps developers crawl or
|
70
84
|
scrape thousands of web pages anonymously
|
71
85
|
email:
|
@@ -94,7 +108,7 @@ homepage: https://github.com/crawlbase-source/crawlbase-ruby
|
|
94
108
|
licenses:
|
95
109
|
- MIT
|
96
110
|
metadata: {}
|
97
|
-
post_install_message:
|
111
|
+
post_install_message:
|
98
112
|
rdoc_options: []
|
99
113
|
require_paths:
|
100
114
|
- lib
|
@@ -109,8 +123,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
109
123
|
- !ruby/object:Gem::Version
|
110
124
|
version: '0'
|
111
125
|
requirements: []
|
112
|
-
rubygems_version: 3.
|
113
|
-
signing_key:
|
126
|
+
rubygems_version: 3.6.0.dev
|
127
|
+
signing_key:
|
114
128
|
specification_version: 4
|
115
129
|
summary: Crawlbase API client for web scraping and crawling
|
116
130
|
test_files: []
|