webscraping_ai 3.2.1 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -0
- data/LICENSE +21 -0
- data/README.md +110 -85
- data/lib/webscraping_ai/client.rb +130 -0
- data/lib/webscraping_ai/configuration.rb +10 -300
- data/lib/webscraping_ai/errors.rb +44 -0
- data/lib/webscraping_ai/query_encoder.rb +74 -0
- data/lib/webscraping_ai/version.rb +1 -13
- data/lib/webscraping_ai.rb +15 -40
- data/webscraping_ai.gemspec +33 -36
- metadata +27 -74
- data/Gemfile +0 -9
- data/Rakefile +0 -10
- data/docs/AIApi.md +0 -209
- data/docs/Account.md +0 -24
- data/docs/AccountApi.md +0 -76
- data/docs/Error.md +0 -24
- data/docs/HTMLApi.md +0 -109
- data/docs/SelectedHTMLApi.md +0 -209
- data/docs/TextApi.md +0 -109
- data/git_push.sh +0 -57
- data/lib/webscraping_ai/api/account_api.rb +0 -79
- data/lib/webscraping_ai/api/ai_api.rb +0 -295
- data/lib/webscraping_ai/api/html_api.rb +0 -160
- data/lib/webscraping_ai/api/selected_html_api.rb +0 -291
- data/lib/webscraping_ai/api/text_api.rb +0 -160
- data/lib/webscraping_ai/api_client.rb +0 -397
- data/lib/webscraping_ai/api_error.rb +0 -58
- data/lib/webscraping_ai/api_model_base.rb +0 -88
- data/lib/webscraping_ai/models/account.rb +0 -178
- data/lib/webscraping_ai/models/error.rb +0 -178
- data/spec/api/account_api_spec.rb +0 -46
- data/spec/api/ai_api_spec.rb +0 -86
- data/spec/api/html_api_spec.rb +0 -61
- data/spec/api/selected_html_api_spec.rb +0 -86
- data/spec/api/text_api_spec.rb +0 -61
- data/spec/models/account_spec.rb +0 -54
- data/spec/models/error_spec.rb +0 -54
- data/spec/spec_helper.rb +0 -111
metadata
CHANGED
|
@@ -1,99 +1,59 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: webscraping_ai
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version:
|
|
4
|
+
version: 4.0.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- WebScraping.AI
|
|
8
|
+
autorequire:
|
|
8
9
|
bindir: bin
|
|
9
10
|
cert_chain: []
|
|
10
|
-
date:
|
|
11
|
+
date: 2026-05-12 00:00:00.000000000 Z
|
|
11
12
|
dependencies:
|
|
12
13
|
- !ruby/object:Gem::Dependency
|
|
13
|
-
name:
|
|
14
|
+
name: faraday
|
|
14
15
|
requirement: !ruby/object:Gem::Requirement
|
|
15
16
|
requirements:
|
|
16
17
|
- - "~>"
|
|
17
18
|
- !ruby/object:Gem::Version
|
|
18
|
-
version: '
|
|
19
|
-
- - ">="
|
|
20
|
-
- !ruby/object:Gem::Version
|
|
21
|
-
version: 1.0.1
|
|
19
|
+
version: '2.0'
|
|
22
20
|
type: :runtime
|
|
23
21
|
prerelease: false
|
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
25
23
|
requirements:
|
|
26
24
|
- - "~>"
|
|
27
25
|
- !ruby/object:Gem::Version
|
|
28
|
-
version: '
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
- !ruby/object:Gem::Dependency
|
|
33
|
-
name: rspec
|
|
34
|
-
requirement: !ruby/object:Gem::Requirement
|
|
35
|
-
requirements:
|
|
36
|
-
- - "~>"
|
|
37
|
-
- !ruby/object:Gem::Version
|
|
38
|
-
version: '3.6'
|
|
39
|
-
- - ">="
|
|
40
|
-
- !ruby/object:Gem::Version
|
|
41
|
-
version: 3.6.0
|
|
42
|
-
type: :development
|
|
43
|
-
prerelease: false
|
|
44
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
45
|
-
requirements:
|
|
46
|
-
- - "~>"
|
|
47
|
-
- !ruby/object:Gem::Version
|
|
48
|
-
version: '3.6'
|
|
49
|
-
- - ">="
|
|
50
|
-
- !ruby/object:Gem::Version
|
|
51
|
-
version: 3.6.0
|
|
52
|
-
description: WebScraping.AI scraping API provides LLM-powered tools with Chromium
|
|
53
|
-
JavaScript rendering, rotating proxies, and built-in HTML parsing.
|
|
26
|
+
version: '2.0'
|
|
27
|
+
description: WebScraping.AI provides LLM-powered web scraping with Chromium JavaScript
|
|
28
|
+
rendering, rotating proxies, and built-in HTML parsing. This gem is the official
|
|
29
|
+
Ruby client.
|
|
54
30
|
email:
|
|
55
31
|
- hello@webscraping.ai
|
|
56
32
|
executables: []
|
|
57
33
|
extensions: []
|
|
58
34
|
extra_rdoc_files: []
|
|
59
35
|
files:
|
|
60
|
-
-
|
|
36
|
+
- CHANGELOG.md
|
|
37
|
+
- LICENSE
|
|
61
38
|
- README.md
|
|
62
|
-
- Rakefile
|
|
63
|
-
- docs/AIApi.md
|
|
64
|
-
- docs/Account.md
|
|
65
|
-
- docs/AccountApi.md
|
|
66
|
-
- docs/Error.md
|
|
67
|
-
- docs/HTMLApi.md
|
|
68
|
-
- docs/SelectedHTMLApi.md
|
|
69
|
-
- docs/TextApi.md
|
|
70
|
-
- git_push.sh
|
|
71
39
|
- lib/webscraping_ai.rb
|
|
72
|
-
- lib/webscraping_ai/
|
|
73
|
-
- lib/webscraping_ai/api/ai_api.rb
|
|
74
|
-
- lib/webscraping_ai/api/html_api.rb
|
|
75
|
-
- lib/webscraping_ai/api/selected_html_api.rb
|
|
76
|
-
- lib/webscraping_ai/api/text_api.rb
|
|
77
|
-
- lib/webscraping_ai/api_client.rb
|
|
78
|
-
- lib/webscraping_ai/api_error.rb
|
|
79
|
-
- lib/webscraping_ai/api_model_base.rb
|
|
40
|
+
- lib/webscraping_ai/client.rb
|
|
80
41
|
- lib/webscraping_ai/configuration.rb
|
|
81
|
-
- lib/webscraping_ai/
|
|
82
|
-
- lib/webscraping_ai/
|
|
42
|
+
- lib/webscraping_ai/errors.rb
|
|
43
|
+
- lib/webscraping_ai/query_encoder.rb
|
|
83
44
|
- lib/webscraping_ai/version.rb
|
|
84
|
-
- spec/api/account_api_spec.rb
|
|
85
|
-
- spec/api/ai_api_spec.rb
|
|
86
|
-
- spec/api/html_api_spec.rb
|
|
87
|
-
- spec/api/selected_html_api_spec.rb
|
|
88
|
-
- spec/api/text_api_spec.rb
|
|
89
|
-
- spec/models/account_spec.rb
|
|
90
|
-
- spec/models/error_spec.rb
|
|
91
|
-
- spec/spec_helper.rb
|
|
92
45
|
- webscraping_ai.gemspec
|
|
93
46
|
homepage: https://webscraping.ai
|
|
94
47
|
licenses:
|
|
95
48
|
- MIT
|
|
96
|
-
metadata:
|
|
49
|
+
metadata:
|
|
50
|
+
homepage_uri: https://webscraping.ai
|
|
51
|
+
source_code_uri: https://github.com/webscraping-ai/webscraping-ai-ruby
|
|
52
|
+
bug_tracker_uri: https://github.com/webscraping-ai/webscraping-ai-ruby/issues
|
|
53
|
+
changelog_uri: https://github.com/webscraping-ai/webscraping-ai-ruby/blob/master/CHANGELOG.md
|
|
54
|
+
documentation_uri: https://webscraping.ai/docs/api
|
|
55
|
+
rubygems_mfa_required: 'true'
|
|
56
|
+
post_install_message:
|
|
97
57
|
rdoc_options: []
|
|
98
58
|
require_paths:
|
|
99
59
|
- lib
|
|
@@ -101,22 +61,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
101
61
|
requirements:
|
|
102
62
|
- - ">="
|
|
103
63
|
- !ruby/object:Gem::Version
|
|
104
|
-
version: '
|
|
64
|
+
version: '3.1'
|
|
105
65
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
106
66
|
requirements:
|
|
107
67
|
- - ">="
|
|
108
68
|
- !ruby/object:Gem::Version
|
|
109
69
|
version: '0'
|
|
110
70
|
requirements: []
|
|
111
|
-
rubygems_version:
|
|
71
|
+
rubygems_version: 3.5.22
|
|
72
|
+
signing_key:
|
|
112
73
|
specification_version: 4
|
|
113
|
-
summary: WebScraping.AI
|
|
114
|
-
test_files:
|
|
115
|
-
- spec/api/account_api_spec.rb
|
|
116
|
-
- spec/api/ai_api_spec.rb
|
|
117
|
-
- spec/api/html_api_spec.rb
|
|
118
|
-
- spec/api/selected_html_api_spec.rb
|
|
119
|
-
- spec/api/text_api_spec.rb
|
|
120
|
-
- spec/models/account_spec.rb
|
|
121
|
-
- spec/models/error_spec.rb
|
|
122
|
-
- spec/spec_helper.rb
|
|
74
|
+
summary: Ruby client for the WebScraping.AI API.
|
|
75
|
+
test_files: []
|
data/Gemfile
DELETED
data/Rakefile
DELETED
data/docs/AIApi.md
DELETED
|
@@ -1,209 +0,0 @@
|
|
|
1
|
-
# WebScrapingAI::AIApi
|
|
2
|
-
|
|
3
|
-
All URIs are relative to *https://api.webscraping.ai*
|
|
4
|
-
|
|
5
|
-
| Method | HTTP request | Description |
|
|
6
|
-
| ------ | ------------ | ----------- |
|
|
7
|
-
| [**get_fields**](AIApi.md#get_fields) | **GET** /ai/fields | Extract structured data fields from a web page |
|
|
8
|
-
| [**get_question**](AIApi.md#get_question) | **GET** /ai/question | Get an answer to a question about a given web page |
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
## get_fields
|
|
12
|
-
|
|
13
|
-
> Hash<String, String> get_fields(url, fields, opts)
|
|
14
|
-
|
|
15
|
-
Extract structured data fields from a web page
|
|
16
|
-
|
|
17
|
-
Returns structured data fields extracted from the webpage using an LLM model. Proxies and Chromium JavaScript rendering are used for page retrieval and processing.
|
|
18
|
-
|
|
19
|
-
### Examples
|
|
20
|
-
|
|
21
|
-
```ruby
|
|
22
|
-
require 'time'
|
|
23
|
-
require 'webscraping_ai'
|
|
24
|
-
# setup authorization
|
|
25
|
-
WebScrapingAI.configure do |config|
|
|
26
|
-
# Configure API key authorization: api_key
|
|
27
|
-
config.api_key['api_key'] = 'YOUR API KEY'
|
|
28
|
-
# Uncomment the following line to set a prefix for the API key, e.g. 'Bearer' (defaults to nil)
|
|
29
|
-
# config.api_key_prefix['api_key'] = 'Bearer'
|
|
30
|
-
end
|
|
31
|
-
|
|
32
|
-
api_instance = WebScrapingAI::AIApi.new
|
|
33
|
-
url = 'https://example.com' # String | URL of the target page.
|
|
34
|
-
fields = { key: { key: 'inner_example'}} # Hash<String, String> | Object describing fields to extract from the page and their descriptions
|
|
35
|
-
opts = {
|
|
36
|
-
headers: { key: { key: 'inner_example'}}, # Hash<String, String> | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
|
37
|
-
timeout: 10000, # Integer | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000).
|
|
38
|
-
js: true, # Boolean | Execute on-page JavaScript using a headless browser (true by default).
|
|
39
|
-
js_timeout: 2000, # Integer | Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page.
|
|
40
|
-
wait_for: 'wait_for_example', # String | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
|
41
|
-
proxy: 'datacenter', # String | Type of proxy. Use `residential` if your site restricts traffic from datacenters, or `stealth` for the most heavily protected sites with advanced anti-bot detection (`datacenter` by default). Residential and stealth proxy requests are more expensive than datacenter, see the pricing page for details.
|
|
42
|
-
country: 'us', # String | Country of the proxy to use (US by default).
|
|
43
|
-
custom_proxy: 'custom_proxy_example', # String | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
|
44
|
-
device: 'desktop', # String | Type of device emulation.
|
|
45
|
-
error_on_404: false, # Boolean | Return error on 404 HTTP status on the target page (false by default).
|
|
46
|
-
error_on_redirect: false, # Boolean | Return error on redirect on the target page (false by default).
|
|
47
|
-
js_script: 'document.querySelector('button').click();' # String | Custom JavaScript code to execute on the target page.
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
begin
|
|
51
|
-
# Extract structured data fields from a web page
|
|
52
|
-
result = api_instance.get_fields(url, fields, opts)
|
|
53
|
-
p result
|
|
54
|
-
rescue WebScrapingAI::ApiError => e
|
|
55
|
-
puts "Error when calling AIApi->get_fields: #{e}"
|
|
56
|
-
end
|
|
57
|
-
```
|
|
58
|
-
|
|
59
|
-
#### Using the get_fields_with_http_info variant
|
|
60
|
-
|
|
61
|
-
This returns an Array which contains the response data, status code and headers.
|
|
62
|
-
|
|
63
|
-
> <Array(Hash<String, String>, Integer, Hash)> get_fields_with_http_info(url, fields, opts)
|
|
64
|
-
|
|
65
|
-
```ruby
|
|
66
|
-
begin
|
|
67
|
-
# Extract structured data fields from a web page
|
|
68
|
-
data, status_code, headers = api_instance.get_fields_with_http_info(url, fields, opts)
|
|
69
|
-
p status_code # => 2xx
|
|
70
|
-
p headers # => { ... }
|
|
71
|
-
p data # => Hash<String, String>
|
|
72
|
-
rescue WebScrapingAI::ApiError => e
|
|
73
|
-
puts "Error when calling AIApi->get_fields_with_http_info: #{e}"
|
|
74
|
-
end
|
|
75
|
-
```
|
|
76
|
-
|
|
77
|
-
### Parameters
|
|
78
|
-
|
|
79
|
-
| Name | Type | Description | Notes |
|
|
80
|
-
| ---- | ---- | ----------- | ----- |
|
|
81
|
-
| **url** | **String** | URL of the target page. | |
|
|
82
|
-
| **fields** | [**Hash<String, String>**](String.md) | Object describing fields to extract from the page and their descriptions | |
|
|
83
|
-
| **headers** | [**Hash<String, String>**](String.md) | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}). | [optional] |
|
|
84
|
-
| **timeout** | **Integer** | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). | [optional][default to 10000] |
|
|
85
|
-
| **js** | **Boolean** | Execute on-page JavaScript using a headless browser (true by default). | [optional][default to true] |
|
|
86
|
-
| **js_timeout** | **Integer** | Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page. | [optional][default to 2000] |
|
|
87
|
-
| **wait_for** | **String** | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout. | [optional] |
|
|
88
|
-
| **proxy** | **String** | Type of proxy. Use `residential` if your site restricts traffic from datacenters, or `stealth` for the most heavily protected sites with advanced anti-bot detection (`datacenter` by default). Residential and stealth proxy requests are more expensive than datacenter, see the pricing page for details. | [optional][default to 'datacenter'] |
|
|
89
|
-
| **country** | **String** | Country of the proxy to use (US by default). | [optional][default to 'us'] |
|
|
90
|
-
| **custom_proxy** | **String** | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example). | [optional] |
|
|
91
|
-
| **device** | **String** | Type of device emulation. | [optional][default to 'desktop'] |
|
|
92
|
-
| **error_on_404** | **Boolean** | Return error on 404 HTTP status on the target page (false by default). | [optional][default to false] |
|
|
93
|
-
| **error_on_redirect** | **Boolean** | Return error on redirect on the target page (false by default). | [optional][default to false] |
|
|
94
|
-
| **js_script** | **String** | Custom JavaScript code to execute on the target page. | [optional] |
|
|
95
|
-
|
|
96
|
-
### Return type
|
|
97
|
-
|
|
98
|
-
**Hash<String, String>**
|
|
99
|
-
|
|
100
|
-
### Authorization
|
|
101
|
-
|
|
102
|
-
[api_key](../README.md#api_key)
|
|
103
|
-
|
|
104
|
-
### HTTP request headers
|
|
105
|
-
|
|
106
|
-
- **Content-Type**: Not defined
|
|
107
|
-
- **Accept**: application/json
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
## get_question
|
|
111
|
-
|
|
112
|
-
> String get_question(url, opts)
|
|
113
|
-
|
|
114
|
-
Get an answer to a question about a given web page
|
|
115
|
-
|
|
116
|
-
Returns the answer in plain text. Proxies and Chromium JavaScript rendering are used for page retrieval and processing, then the answer is extracted using an LLM model.
|
|
117
|
-
|
|
118
|
-
### Examples
|
|
119
|
-
|
|
120
|
-
```ruby
|
|
121
|
-
require 'time'
|
|
122
|
-
require 'webscraping_ai'
|
|
123
|
-
# setup authorization
|
|
124
|
-
WebScrapingAI.configure do |config|
|
|
125
|
-
# Configure API key authorization: api_key
|
|
126
|
-
config.api_key['api_key'] = 'YOUR API KEY'
|
|
127
|
-
# Uncomment the following line to set a prefix for the API key, e.g. 'Bearer' (defaults to nil)
|
|
128
|
-
# config.api_key_prefix['api_key'] = 'Bearer'
|
|
129
|
-
end
|
|
130
|
-
|
|
131
|
-
api_instance = WebScrapingAI::AIApi.new
|
|
132
|
-
url = 'https://example.com' # String | URL of the target page.
|
|
133
|
-
opts = {
|
|
134
|
-
question: 'What is the summary of this page content?', # String | Question or instructions to ask the LLM model about the target page.
|
|
135
|
-
headers: { key: { key: 'inner_example'}}, # Hash<String, String> | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
|
136
|
-
timeout: 10000, # Integer | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000).
|
|
137
|
-
js: true, # Boolean | Execute on-page JavaScript using a headless browser (true by default).
|
|
138
|
-
js_timeout: 2000, # Integer | Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page.
|
|
139
|
-
wait_for: 'wait_for_example', # String | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
|
140
|
-
proxy: 'datacenter', # String | Type of proxy. Use `residential` if your site restricts traffic from datacenters, or `stealth` for the most heavily protected sites with advanced anti-bot detection (`datacenter` by default). Residential and stealth proxy requests are more expensive than datacenter, see the pricing page for details.
|
|
141
|
-
country: 'us', # String | Country of the proxy to use (US by default).
|
|
142
|
-
custom_proxy: 'custom_proxy_example', # String | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
|
143
|
-
device: 'desktop', # String | Type of device emulation.
|
|
144
|
-
error_on_404: false, # Boolean | Return error on 404 HTTP status on the target page (false by default).
|
|
145
|
-
error_on_redirect: false, # Boolean | Return error on redirect on the target page (false by default).
|
|
146
|
-
js_script: 'document.querySelector('button').click();', # String | Custom JavaScript code to execute on the target page.
|
|
147
|
-
format: 'json' # String | Format of the response (text by default). \"json\" will return a JSON object with the response, \"text\" will return a plain text/HTML response.
|
|
148
|
-
}
|
|
149
|
-
|
|
150
|
-
begin
|
|
151
|
-
# Get an answer to a question about a given web page
|
|
152
|
-
result = api_instance.get_question(url, opts)
|
|
153
|
-
p result
|
|
154
|
-
rescue WebScrapingAI::ApiError => e
|
|
155
|
-
puts "Error when calling AIApi->get_question: #{e}"
|
|
156
|
-
end
|
|
157
|
-
```
|
|
158
|
-
|
|
159
|
-
#### Using the get_question_with_http_info variant
|
|
160
|
-
|
|
161
|
-
This returns an Array which contains the response data, status code and headers.
|
|
162
|
-
|
|
163
|
-
> <Array(String, Integer, Hash)> get_question_with_http_info(url, opts)
|
|
164
|
-
|
|
165
|
-
```ruby
|
|
166
|
-
begin
|
|
167
|
-
# Get an answer to a question about a given web page
|
|
168
|
-
data, status_code, headers = api_instance.get_question_with_http_info(url, opts)
|
|
169
|
-
p status_code # => 2xx
|
|
170
|
-
p headers # => { ... }
|
|
171
|
-
p data # => String
|
|
172
|
-
rescue WebScrapingAI::ApiError => e
|
|
173
|
-
puts "Error when calling AIApi->get_question_with_http_info: #{e}"
|
|
174
|
-
end
|
|
175
|
-
```
|
|
176
|
-
|
|
177
|
-
### Parameters
|
|
178
|
-
|
|
179
|
-
| Name | Type | Description | Notes |
|
|
180
|
-
| ---- | ---- | ----------- | ----- |
|
|
181
|
-
| **url** | **String** | URL of the target page. | |
|
|
182
|
-
| **question** | **String** | Question or instructions to ask the LLM model about the target page. | [optional] |
|
|
183
|
-
| **headers** | [**Hash<String, String>**](String.md) | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}). | [optional] |
|
|
184
|
-
| **timeout** | **Integer** | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). | [optional][default to 10000] |
|
|
185
|
-
| **js** | **Boolean** | Execute on-page JavaScript using a headless browser (true by default). | [optional][default to true] |
|
|
186
|
-
| **js_timeout** | **Integer** | Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page. | [optional][default to 2000] |
|
|
187
|
-
| **wait_for** | **String** | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout. | [optional] |
|
|
188
|
-
| **proxy** | **String** | Type of proxy. Use `residential` if your site restricts traffic from datacenters, or `stealth` for the most heavily protected sites with advanced anti-bot detection (`datacenter` by default). Residential and stealth proxy requests are more expensive than datacenter, see the pricing page for details. | [optional][default to 'datacenter'] |
|
|
189
|
-
| **country** | **String** | Country of the proxy to use (US by default). | [optional][default to 'us'] |
|
|
190
|
-
| **custom_proxy** | **String** | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example). | [optional] |
|
|
191
|
-
| **device** | **String** | Type of device emulation. | [optional][default to 'desktop'] |
|
|
192
|
-
| **error_on_404** | **Boolean** | Return error on 404 HTTP status on the target page (false by default). | [optional][default to false] |
|
|
193
|
-
| **error_on_redirect** | **Boolean** | Return error on redirect on the target page (false by default). | [optional][default to false] |
|
|
194
|
-
| **js_script** | **String** | Custom JavaScript code to execute on the target page. | [optional] |
|
|
195
|
-
| **format** | **String** | Format of the response (text by default). \"json\" will return a JSON object with the response, \"text\" will return a plain text/HTML response. | [optional][default to 'json'] |
|
|
196
|
-
|
|
197
|
-
### Return type
|
|
198
|
-
|
|
199
|
-
**String**
|
|
200
|
-
|
|
201
|
-
### Authorization
|
|
202
|
-
|
|
203
|
-
[api_key](../README.md#api_key)
|
|
204
|
-
|
|
205
|
-
### HTTP request headers
|
|
206
|
-
|
|
207
|
-
- **Content-Type**: Not defined
|
|
208
|
-
- **Accept**: application/json, text/html
|
|
209
|
-
|
data/docs/Account.md
DELETED
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
# WebScrapingAI::Account
|
|
2
|
-
|
|
3
|
-
## Properties
|
|
4
|
-
|
|
5
|
-
| Name | Type | Description | Notes |
|
|
6
|
-
| ---- | ---- | ----------- | ----- |
|
|
7
|
-
| **email** | **String** | Your account email | [optional] |
|
|
8
|
-
| **remaining_api_calls** | **Integer** | Remaining API credits quota | [optional] |
|
|
9
|
-
| **resets_at** | **Integer** | Next billing cycle start time (UNIX timestamp) | [optional] |
|
|
10
|
-
| **remaining_concurrency** | **Integer** | Remaining concurrent requests | [optional] |
|
|
11
|
-
|
|
12
|
-
## Example
|
|
13
|
-
|
|
14
|
-
```ruby
|
|
15
|
-
require 'webscraping_ai'
|
|
16
|
-
|
|
17
|
-
instance = WebScrapingAI::Account.new(
|
|
18
|
-
email: null,
|
|
19
|
-
remaining_api_calls: null,
|
|
20
|
-
resets_at: null,
|
|
21
|
-
remaining_concurrency: null
|
|
22
|
-
)
|
|
23
|
-
```
|
|
24
|
-
|
data/docs/AccountApi.md
DELETED
|
@@ -1,76 +0,0 @@
|
|
|
1
|
-
# WebScrapingAI::AccountApi
|
|
2
|
-
|
|
3
|
-
All URIs are relative to *https://api.webscraping.ai*
|
|
4
|
-
|
|
5
|
-
| Method | HTTP request | Description |
|
|
6
|
-
| ------ | ------------ | ----------- |
|
|
7
|
-
| [**account**](AccountApi.md#account) | **GET** /account | Information about your account calls quota |
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
## account
|
|
11
|
-
|
|
12
|
-
> <Account> account
|
|
13
|
-
|
|
14
|
-
Information about your account calls quota
|
|
15
|
-
|
|
16
|
-
Returns information about your account, including the remaining API credits quota, the next billing cycle start time, and the remaining concurrent requests. The response is in JSON format.
|
|
17
|
-
|
|
18
|
-
### Examples
|
|
19
|
-
|
|
20
|
-
```ruby
|
|
21
|
-
require 'time'
|
|
22
|
-
require 'webscraping_ai'
|
|
23
|
-
# setup authorization
|
|
24
|
-
WebScrapingAI.configure do |config|
|
|
25
|
-
# Configure API key authorization: api_key
|
|
26
|
-
config.api_key['api_key'] = 'YOUR API KEY'
|
|
27
|
-
# Uncomment the following line to set a prefix for the API key, e.g. 'Bearer' (defaults to nil)
|
|
28
|
-
# config.api_key_prefix['api_key'] = 'Bearer'
|
|
29
|
-
end
|
|
30
|
-
|
|
31
|
-
api_instance = WebScrapingAI::AccountApi.new
|
|
32
|
-
|
|
33
|
-
begin
|
|
34
|
-
# Information about your account calls quota
|
|
35
|
-
result = api_instance.account
|
|
36
|
-
p result
|
|
37
|
-
rescue WebScrapingAI::ApiError => e
|
|
38
|
-
puts "Error when calling AccountApi->account: #{e}"
|
|
39
|
-
end
|
|
40
|
-
```
|
|
41
|
-
|
|
42
|
-
#### Using the account_with_http_info variant
|
|
43
|
-
|
|
44
|
-
This returns an Array which contains the response data, status code and headers.
|
|
45
|
-
|
|
46
|
-
> <Array(<Account>, Integer, Hash)> account_with_http_info
|
|
47
|
-
|
|
48
|
-
```ruby
|
|
49
|
-
begin
|
|
50
|
-
# Information about your account calls quota
|
|
51
|
-
data, status_code, headers = api_instance.account_with_http_info
|
|
52
|
-
p status_code # => 2xx
|
|
53
|
-
p headers # => { ... }
|
|
54
|
-
p data # => <Account>
|
|
55
|
-
rescue WebScrapingAI::ApiError => e
|
|
56
|
-
puts "Error when calling AccountApi->account_with_http_info: #{e}"
|
|
57
|
-
end
|
|
58
|
-
```
|
|
59
|
-
|
|
60
|
-
### Parameters
|
|
61
|
-
|
|
62
|
-
This endpoint does not need any parameter.
|
|
63
|
-
|
|
64
|
-
### Return type
|
|
65
|
-
|
|
66
|
-
[**Account**](Account.md)
|
|
67
|
-
|
|
68
|
-
### Authorization
|
|
69
|
-
|
|
70
|
-
[api_key](../README.md#api_key)
|
|
71
|
-
|
|
72
|
-
### HTTP request headers
|
|
73
|
-
|
|
74
|
-
- **Content-Type**: Not defined
|
|
75
|
-
- **Accept**: application/json
|
|
76
|
-
|
data/docs/Error.md
DELETED
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
# WebScrapingAI::Error
|
|
2
|
-
|
|
3
|
-
## Properties
|
|
4
|
-
|
|
5
|
-
| Name | Type | Description | Notes |
|
|
6
|
-
| ---- | ---- | ----------- | ----- |
|
|
7
|
-
| **message** | **String** | Error description | [optional] |
|
|
8
|
-
| **status_code** | **Integer** | Target page response HTTP status code (403, 500, etc) | [optional] |
|
|
9
|
-
| **status_message** | **String** | Target page response HTTP status message | [optional] |
|
|
10
|
-
| **body** | **String** | Target page response body | [optional] |
|
|
11
|
-
|
|
12
|
-
## Example
|
|
13
|
-
|
|
14
|
-
```ruby
|
|
15
|
-
require 'webscraping_ai'
|
|
16
|
-
|
|
17
|
-
instance = WebScrapingAI::Error.new(
|
|
18
|
-
message: null,
|
|
19
|
-
status_code: null,
|
|
20
|
-
status_message: null,
|
|
21
|
-
body: null
|
|
22
|
-
)
|
|
23
|
-
```
|
|
24
|
-
|
data/docs/HTMLApi.md
DELETED
|
@@ -1,109 +0,0 @@
|
|
|
1
|
-
# WebScrapingAI::HTMLApi
|
|
2
|
-
|
|
3
|
-
All URIs are relative to *https://api.webscraping.ai*
|
|
4
|
-
|
|
5
|
-
| Method | HTTP request | Description |
|
|
6
|
-
| ------ | ------------ | ----------- |
|
|
7
|
-
| [**get_html**](HTMLApi.md#get_html) | **GET** /html | Page HTML by URL |
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
## get_html
|
|
11
|
-
|
|
12
|
-
> String get_html(url, opts)
|
|
13
|
-
|
|
14
|
-
Page HTML by URL
|
|
15
|
-
|
|
16
|
-
Returns the full HTML content of a webpage specified by the URL. The response is in plain text. Proxies and Chromium JavaScript rendering are used for page retrieval and processing.
|
|
17
|
-
|
|
18
|
-
### Examples
|
|
19
|
-
|
|
20
|
-
```ruby
|
|
21
|
-
require 'time'
|
|
22
|
-
require 'webscraping_ai'
|
|
23
|
-
# setup authorization
|
|
24
|
-
WebScrapingAI.configure do |config|
|
|
25
|
-
# Configure API key authorization: api_key
|
|
26
|
-
config.api_key['api_key'] = 'YOUR API KEY'
|
|
27
|
-
# Uncomment the following line to set a prefix for the API key, e.g. 'Bearer' (defaults to nil)
|
|
28
|
-
# config.api_key_prefix['api_key'] = 'Bearer'
|
|
29
|
-
end
|
|
30
|
-
|
|
31
|
-
api_instance = WebScrapingAI::HTMLApi.new
|
|
32
|
-
url = 'https://example.com' # String | URL of the target page.
|
|
33
|
-
opts = {
|
|
34
|
-
headers: { key: { key: 'inner_example'}}, # Hash<String, String> | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
|
35
|
-
timeout: 10000, # Integer | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000).
|
|
36
|
-
js: true, # Boolean | Execute on-page JavaScript using a headless browser (true by default).
|
|
37
|
-
js_timeout: 2000, # Integer | Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page.
|
|
38
|
-
wait_for: 'wait_for_example', # String | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
|
39
|
-
proxy: 'datacenter', # String | Type of proxy. Use `residential` if your site restricts traffic from datacenters, or `stealth` for the most heavily protected sites with advanced anti-bot detection (`datacenter` by default). Residential and stealth proxy requests are more expensive than datacenter, see the pricing page for details.
|
|
40
|
-
country: 'us', # String | Country of the proxy to use (US by default).
|
|
41
|
-
custom_proxy: 'custom_proxy_example', # String | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
|
42
|
-
device: 'desktop', # String | Type of device emulation.
|
|
43
|
-
error_on_404: false, # Boolean | Return error on 404 HTTP status on the target page (false by default).
|
|
44
|
-
error_on_redirect: false, # Boolean | Return error on redirect on the target page (false by default).
|
|
45
|
-
js_script: 'document.querySelector('button').click();', # String | Custom JavaScript code to execute on the target page.
|
|
46
|
-
return_script_result: false, # Boolean | Return result of the custom JavaScript code (js_script parameter) execution on the target page (false by default, page HTML will be returned).
|
|
47
|
-
format: 'json' # String | Format of the response (text by default). \"json\" will return a JSON object with the response, \"text\" will return a plain text/HTML response.
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
begin
|
|
51
|
-
# Page HTML by URL
|
|
52
|
-
result = api_instance.get_html(url, opts)
|
|
53
|
-
p result
|
|
54
|
-
rescue WebScrapingAI::ApiError => e
|
|
55
|
-
puts "Error when calling HTMLApi->get_html: #{e}"
|
|
56
|
-
end
|
|
57
|
-
```
|
|
58
|
-
|
|
59
|
-
#### Using the get_html_with_http_info variant
|
|
60
|
-
|
|
61
|
-
This returns an Array which contains the response data, status code and headers.
|
|
62
|
-
|
|
63
|
-
> <Array(String, Integer, Hash)> get_html_with_http_info(url, opts)
|
|
64
|
-
|
|
65
|
-
```ruby
|
|
66
|
-
begin
|
|
67
|
-
# Page HTML by URL
|
|
68
|
-
data, status_code, headers = api_instance.get_html_with_http_info(url, opts)
|
|
69
|
-
p status_code # => 2xx
|
|
70
|
-
p headers # => { ... }
|
|
71
|
-
p data # => String
|
|
72
|
-
rescue WebScrapingAI::ApiError => e
|
|
73
|
-
puts "Error when calling HTMLApi->get_html_with_http_info: #{e}"
|
|
74
|
-
end
|
|
75
|
-
```
|
|
76
|
-
|
|
77
|
-
### Parameters
|
|
78
|
-
|
|
79
|
-
| Name | Type | Description | Notes |
|
|
80
|
-
| ---- | ---- | ----------- | ----- |
|
|
81
|
-
| **url** | **String** | URL of the target page. | |
|
|
82
|
-
| **headers** | [**Hash<String, String>**](String.md) | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}). | [optional] |
|
|
83
|
-
| **timeout** | **Integer** | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). | [optional][default to 10000] |
|
|
84
|
-
| **js** | **Boolean** | Execute on-page JavaScript using a headless browser (true by default). | [optional][default to true] |
|
|
85
|
-
| **js_timeout** | **Integer** | Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page. | [optional][default to 2000] |
|
|
86
|
-
| **wait_for** | **String** | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout. | [optional] |
|
|
87
|
-
| **proxy** | **String** | Type of proxy. Use `residential` if your site restricts traffic from datacenters, or `stealth` for the most heavily protected sites with advanced anti-bot detection (`datacenter` by default). Residential and stealth proxy requests are more expensive than datacenter, see the pricing page for details. | [optional][default to 'datacenter'] |
|
|
88
|
-
| **country** | **String** | Country of the proxy to use (US by default). | [optional][default to 'us'] |
|
|
89
|
-
| **custom_proxy** | **String** | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example). | [optional] |
|
|
90
|
-
| **device** | **String** | Type of device emulation. | [optional][default to 'desktop'] |
|
|
91
|
-
| **error_on_404** | **Boolean** | Return error on 404 HTTP status on the target page (false by default). | [optional][default to false] |
|
|
92
|
-
| **error_on_redirect** | **Boolean** | Return error on redirect on the target page (false by default). | [optional][default to false] |
|
|
93
|
-
| **js_script** | **String** | Custom JavaScript code to execute on the target page. | [optional] |
|
|
94
|
-
| **return_script_result** | **Boolean** | Return result of the custom JavaScript code (js_script parameter) execution on the target page (false by default, page HTML will be returned). | [optional][default to false] |
|
|
95
|
-
| **format** | **String** | Format of the response (text by default). \"json\" will return a JSON object with the response, \"text\" will return a plain text/HTML response. | [optional][default to 'json'] |
|
|
96
|
-
|
|
97
|
-
### Return type
|
|
98
|
-
|
|
99
|
-
**String**
|
|
100
|
-
|
|
101
|
-
### Authorization
|
|
102
|
-
|
|
103
|
-
[api_key](../README.md#api_key)
|
|
104
|
-
|
|
105
|
-
### HTTP request headers
|
|
106
|
-
|
|
107
|
-
- **Content-Type**: Not defined
|
|
108
|
-
- **Accept**: application/json, text/html
|
|
109
|
-
|