webscraping_ai 3.1.3 → 3.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +16 -15
- data/docs/AIApi.md +110 -10
- data/docs/Account.md +2 -0
- data/docs/HTMLApi.md +10 -4
- data/docs/SelectedHTMLApi.md +17 -7
- data/docs/TextApi.md +8 -4
- data/lib/webscraping_ai/api/account_api.rb +4 -4
- data/lib/webscraping_ai/api/ai_api.rb +155 -24
- data/lib/webscraping_ai/api/html_api.rb +20 -7
- data/lib/webscraping_ai/api/selected_html_api.rb +30 -11
- data/lib/webscraping_ai/api/text_api.rb +15 -9
- data/lib/webscraping_ai/api_client.rb +5 -5
- data/lib/webscraping_ai/api_error.rb +3 -3
- data/lib/webscraping_ai/configuration.rb +13 -3
- data/lib/webscraping_ai/models/account.rb +14 -4
- data/lib/webscraping_ai/models/error.rb +3 -3
- data/lib/webscraping_ai/version.rb +4 -4
- data/lib/webscraping_ai.rb +3 -3
- data/spec/api/account_api_spec.rb +3 -3
- data/spec/api/ai_api_spec.rb +32 -7
- data/spec/api/html_api_spec.rb +7 -4
- data/spec/api/selected_html_api_spec.rb +10 -5
- data/spec/api/text_api_spec.rb +7 -5
- data/spec/models/account_spec.rb +9 -3
- data/spec/models/error_spec.rb +3 -3
- data/spec/spec_helper.rb +3 -3
- data/webscraping_ai.gemspec +4 -4
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ea239c0ee1260cacb701ce616aef022672f3aa35ca5ec44e6db5348eee1e9fa4
|
4
|
+
data.tar.gz: f2a9af9da881dc8c24396847cbfa724a91f1b12da5b2eeab38b492913383d64b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9d9d8624aa0e2159e6009e2ae3719b2cec9dce3a3d51883441159a8ac9ace76fa801c043bbf19950e001519d06226cc102fcdef746ec427648af1b380d963e84
|
7
|
+
data.tar.gz: 6afde37248d490b226cc41ffb41b972d4d6b427878e36ff0a267ea9e6959b080b2309f4e651151cf762fe2819ab52fe712ff5ae6becb2be1c7f5f1df788c5cd4
|
data/README.md
CHANGED
@@ -2,12 +2,13 @@
|
|
2
2
|
|
3
3
|
WebScrapingAI - the Ruby gem for the WebScraping.AI
|
4
4
|
|
5
|
-
WebScraping.AI scraping API provides
|
5
|
+
WebScraping.AI scraping API provides LLM-powered tools with Chromium JavaScript rendering, rotating proxies, and built-in HTML parsing.
|
6
6
|
|
7
7
|
This SDK is automatically generated by the [OpenAPI Generator](https://openapi-generator.tech) project:
|
8
8
|
|
9
|
-
- API version: 3.
|
10
|
-
- Package version: 3.
|
9
|
+
- API version: 3.2.0
|
10
|
+
- Package version: 3.2.0
|
11
|
+
- Generator version: 7.11.0
|
11
12
|
- Build package: org.openapitools.codegen.languages.RubyClientCodegen
|
12
13
|
For more information, please visit [https://webscraping.ai](https://webscraping.ai)
|
13
14
|
|
@@ -24,16 +25,16 @@ gem build webscraping_ai.gemspec
|
|
24
25
|
Then either install the gem locally:
|
25
26
|
|
26
27
|
```shell
|
27
|
-
gem install ./webscraping_ai-3.
|
28
|
+
gem install ./webscraping_ai-3.2.0.gem
|
28
29
|
```
|
29
30
|
|
30
|
-
(for development, run `gem install --dev ./webscraping_ai-3.
|
31
|
+
(for development, run `gem install --dev ./webscraping_ai-3.2.0.gem` to install the development dependencies)
|
31
32
|
|
32
33
|
or publish the gem to a gem hosting service, e.g. [RubyGems](https://rubygems.org/).
|
33
34
|
|
34
35
|
Finally add this to the Gemfile:
|
35
36
|
|
36
|
-
gem 'webscraping_ai', '~> 3.
|
37
|
+
gem 'webscraping_ai', '~> 3.2.0'
|
37
38
|
|
38
39
|
### Install from Git
|
39
40
|
|
@@ -67,17 +68,16 @@ end
|
|
67
68
|
|
68
69
|
api_instance = WebScrapingAI::AIApi.new
|
69
70
|
url = 'https://example.com' # String | URL of the target page.
|
71
|
+
fields = { key: { key: 'inner_example'}} # Hash<String, String> | Object describing fields to extract from the page and their descriptions
|
70
72
|
opts = {
|
71
|
-
|
72
|
-
context_limit: 4000, # Integer | Maximum number of tokens to use as context for the LLM model (4000 by default).
|
73
|
-
response_tokens: 100, # Integer | Maximum number of tokens to return in the LLM model response. The total context size (context_limit) includes the question, the target page content and the response, so this parameter reserves tokens for the response (see also on_context_limit).
|
74
|
-
on_context_limit: 'truncate', # String | What to do if the context_limit parameter is exceeded (truncate by default). The context is exceeded when the target page content is too long.
|
75
|
-
headers: { key: 3.56}, # Hash<String, String> | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
73
|
+
headers: { key: { key: 'inner_example'}}, # Hash<String, String> | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
76
74
|
timeout: 10000, # Integer | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000).
|
77
75
|
js: true, # Boolean | Execute on-page JavaScript using a headless browser (true by default).
|
78
76
|
js_timeout: 2000, # Integer | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page.
|
77
|
+
wait_for: 'wait_for_example', # String | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
79
78
|
proxy: 'datacenter', # String | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details.
|
80
|
-
country: 'us', # String | Country of the proxy to use (US by default).
|
79
|
+
country: 'us', # String | Country of the proxy to use (US by default).
|
80
|
+
custom_proxy: 'custom_proxy_example', # String | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
81
81
|
device: 'desktop', # String | Type of device emulation.
|
82
82
|
error_on_404: false, # Boolean | Return error on 404 HTTP status on the target page (false by default).
|
83
83
|
error_on_redirect: false, # Boolean | Return error on redirect on the target page (false by default).
|
@@ -85,11 +85,11 @@ opts = {
|
|
85
85
|
}
|
86
86
|
|
87
87
|
begin
|
88
|
-
#
|
89
|
-
result = api_instance.
|
88
|
+
# Extract structured data fields from a web page
|
89
|
+
result = api_instance.get_fields(url, fields, opts)
|
90
90
|
p result
|
91
91
|
rescue WebScrapingAI::ApiError => e
|
92
|
-
puts "Exception when calling AIApi->
|
92
|
+
puts "Exception when calling AIApi->get_fields: #{e}"
|
93
93
|
end
|
94
94
|
|
95
95
|
```
|
@@ -100,6 +100,7 @@ All URIs are relative to *https://api.webscraping.ai*
|
|
100
100
|
|
101
101
|
Class | Method | HTTP request | Description
|
102
102
|
------------ | ------------- | ------------- | -------------
|
103
|
+
*WebScrapingAI::AIApi* | [**get_fields**](docs/AIApi.md#get_fields) | **GET** /ai/fields | Extract structured data fields from a web page
|
103
104
|
*WebScrapingAI::AIApi* | [**get_question**](docs/AIApi.md#get_question) | **GET** /ai/question | Get an answer to a question about a given web page
|
104
105
|
*WebScrapingAI::AccountApi* | [**account**](docs/AccountApi.md#account) | **GET** /account | Information about your account calls quota
|
105
106
|
*WebScrapingAI::HTMLApi* | [**get_html**](docs/HTMLApi.md#get_html) | **GET** /html | Page HTML by URL
|
data/docs/AIApi.md
CHANGED
@@ -4,9 +4,109 @@ All URIs are relative to *https://api.webscraping.ai*
|
|
4
4
|
|
5
5
|
| Method | HTTP request | Description |
|
6
6
|
| ------ | ------------ | ----------- |
|
7
|
+
| [**get_fields**](AIApi.md#get_fields) | **GET** /ai/fields | Extract structured data fields from a web page |
|
7
8
|
| [**get_question**](AIApi.md#get_question) | **GET** /ai/question | Get an answer to a question about a given web page |
|
8
9
|
|
9
10
|
|
11
|
+
## get_fields
|
12
|
+
|
13
|
+
> Hash<String, String> get_fields(url, fields, opts)
|
14
|
+
|
15
|
+
Extract structured data fields from a web page
|
16
|
+
|
17
|
+
Returns structured data fields extracted from the webpage using an LLM model. Proxies and Chromium JavaScript rendering are used for page retrieval and processing.
|
18
|
+
|
19
|
+
### Examples
|
20
|
+
|
21
|
+
```ruby
|
22
|
+
require 'time'
|
23
|
+
require 'webscraping_ai'
|
24
|
+
# setup authorization
|
25
|
+
WebScrapingAI.configure do |config|
|
26
|
+
# Configure API key authorization: api_key
|
27
|
+
config.api_key['api_key'] = 'YOUR API KEY'
|
28
|
+
# Uncomment the following line to set a prefix for the API key, e.g. 'Bearer' (defaults to nil)
|
29
|
+
# config.api_key_prefix['api_key'] = 'Bearer'
|
30
|
+
end
|
31
|
+
|
32
|
+
api_instance = WebScrapingAI::AIApi.new
|
33
|
+
url = 'https://example.com' # String | URL of the target page.
|
34
|
+
fields = { key: { key: 'inner_example'}} # Hash<String, String> | Object describing fields to extract from the page and their descriptions
|
35
|
+
opts = {
|
36
|
+
headers: { key: { key: 'inner_example'}}, # Hash<String, String> | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
37
|
+
timeout: 10000, # Integer | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000).
|
38
|
+
js: true, # Boolean | Execute on-page JavaScript using a headless browser (true by default).
|
39
|
+
js_timeout: 2000, # Integer | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page.
|
40
|
+
wait_for: 'wait_for_example', # String | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
41
|
+
proxy: 'datacenter', # String | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details.
|
42
|
+
country: 'us', # String | Country of the proxy to use (US by default).
|
43
|
+
custom_proxy: 'custom_proxy_example', # String | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
44
|
+
device: 'desktop', # String | Type of device emulation.
|
45
|
+
error_on_404: false, # Boolean | Return error on 404 HTTP status on the target page (false by default).
|
46
|
+
error_on_redirect: false, # Boolean | Return error on redirect on the target page (false by default).
|
47
|
+
js_script: "document.querySelector('button').click();" # String | Custom JavaScript code to execute on the target page.
|
48
|
+
}
|
49
|
+
|
50
|
+
begin
|
51
|
+
# Extract structured data fields from a web page
|
52
|
+
result = api_instance.get_fields(url, fields, opts)
|
53
|
+
p result
|
54
|
+
rescue WebScrapingAI::ApiError => e
|
55
|
+
puts "Error when calling AIApi->get_fields: #{e}"
|
56
|
+
end
|
57
|
+
```
|
58
|
+
|
59
|
+
#### Using the get_fields_with_http_info variant
|
60
|
+
|
61
|
+
This returns an Array which contains the response data, status code and headers.
|
62
|
+
|
63
|
+
> <Array(Hash<String, String>, Integer, Hash)> get_fields_with_http_info(url, fields, opts)
|
64
|
+
|
65
|
+
```ruby
|
66
|
+
begin
|
67
|
+
# Extract structured data fields from a web page
|
68
|
+
data, status_code, headers = api_instance.get_fields_with_http_info(url, fields, opts)
|
69
|
+
p status_code # => 2xx
|
70
|
+
p headers # => { ... }
|
71
|
+
p data # => Hash<String, String>
|
72
|
+
rescue WebScrapingAI::ApiError => e
|
73
|
+
puts "Error when calling AIApi->get_fields_with_http_info: #{e}"
|
74
|
+
end
|
75
|
+
```
|
76
|
+
|
77
|
+
### Parameters
|
78
|
+
|
79
|
+
| Name | Type | Description | Notes |
|
80
|
+
| ---- | ---- | ----------- | ----- |
|
81
|
+
| **url** | **String** | URL of the target page. | |
|
82
|
+
| **fields** | [**Hash<String, String>**](String.md) | Object describing fields to extract from the page and their descriptions | |
|
83
|
+
| **headers** | [**Hash<String, String>**](String.md) | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}). | [optional] |
|
84
|
+
| **timeout** | **Integer** | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). | [optional][default to 10000] |
|
85
|
+
| **js** | **Boolean** | Execute on-page JavaScript using a headless browser (true by default). | [optional][default to true] |
|
86
|
+
| **js_timeout** | **Integer** | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page. | [optional][default to 2000] |
|
87
|
+
| **wait_for** | **String** | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout. | [optional] |
|
88
|
+
| **proxy** | **String** | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. | [optional][default to 'datacenter'] |
|
89
|
+
| **country** | **String** | Country of the proxy to use (US by default). | [optional][default to 'us'] |
|
90
|
+
| **custom_proxy** | **String** | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example). | [optional] |
|
91
|
+
| **device** | **String** | Type of device emulation. | [optional][default to 'desktop'] |
|
92
|
+
| **error_on_404** | **Boolean** | Return error on 404 HTTP status on the target page (false by default). | [optional][default to false] |
|
93
|
+
| **error_on_redirect** | **Boolean** | Return error on redirect on the target page (false by default). | [optional][default to false] |
|
94
|
+
| **js_script** | **String** | Custom JavaScript code to execute on the target page. | [optional] |
|
95
|
+
|
96
|
+
### Return type
|
97
|
+
|
98
|
+
**Hash<String, String>**
|
99
|
+
|
100
|
+
### Authorization
|
101
|
+
|
102
|
+
[api_key](../README.md#api_key)
|
103
|
+
|
104
|
+
### HTTP request headers
|
105
|
+
|
106
|
+
- **Content-Type**: Not defined
|
107
|
+
- **Accept**: application/json
|
108
|
+
|
109
|
+
|
10
110
|
## get_question
|
11
111
|
|
12
112
|
> String get_question(url, opts)
|
@@ -32,19 +132,19 @@ api_instance = WebScrapingAI::AIApi.new
|
|
32
132
|
url = 'https://example.com' # String | URL of the target page.
|
33
133
|
opts = {
|
34
134
|
question: 'What is the summary of this page content?', # String | Question or instructions to ask the LLM model about the target page.
|
35
|
-
|
36
|
-
response_tokens: 100, # Integer | Maximum number of tokens to return in the LLM model response. The total context size (context_limit) includes the question, the target page content and the response, so this parameter reserves tokens for the response (see also on_context_limit).
|
37
|
-
on_context_limit: 'truncate', # String | What to do if the context_limit parameter is exceeded (truncate by default). The context is exceeded when the target page content is too long.
|
38
|
-
headers: { key: 3.56}, # Hash<String, String> | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
135
|
+
headers: { key: { key: 'inner_example'}}, # Hash<String, String> | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
39
136
|
timeout: 10000, # Integer | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000).
|
40
137
|
js: true, # Boolean | Execute on-page JavaScript using a headless browser (true by default).
|
41
138
|
js_timeout: 2000, # Integer | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page.
|
139
|
+
wait_for: 'wait_for_example', # String | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
42
140
|
proxy: 'datacenter', # String | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details.
|
43
|
-
country: 'us', # String | Country of the proxy to use (US by default).
|
141
|
+
country: 'us', # String | Country of the proxy to use (US by default).
|
142
|
+
custom_proxy: 'custom_proxy_example', # String | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
44
143
|
device: 'desktop', # String | Type of device emulation.
|
45
144
|
error_on_404: false, # Boolean | Return error on 404 HTTP status on the target page (false by default).
|
46
145
|
error_on_redirect: false, # Boolean | Return error on redirect on the target page (false by default).
|
47
|
-
js_script: 'document.querySelector('button').click();' # String | Custom JavaScript code to execute on the target page.
|
146
|
+
js_script: "document.querySelector('button').click();", # String | Custom JavaScript code to execute on the target page.
|
147
|
+
format: 'json' # String | Format of the response (text by default). \"json\" will return a JSON object with the response, \"text\" will return a plain text/HTML response.
|
48
148
|
}
|
49
149
|
|
50
150
|
begin
|
@@ -80,19 +180,19 @@ end
|
|
80
180
|
| ---- | ---- | ----------- | ----- |
|
81
181
|
| **url** | **String** | URL of the target page. | |
|
82
182
|
| **question** | **String** | Question or instructions to ask the LLM model about the target page. | [optional] |
|
83
|
-
| **context_limit** | **Integer** | Maximum number of tokens to use as context for the LLM model (4000 by default). | [optional][default to 4000] |
|
84
|
-
| **response_tokens** | **Integer** | Maximum number of tokens to return in the LLM model response. The total context size (context_limit) includes the question, the target page content and the response, so this parameter reserves tokens for the response (see also on_context_limit). | [optional][default to 100] |
|
85
|
-
| **on_context_limit** | **String** | What to do if the context_limit parameter is exceeded (truncate by default). The context is exceeded when the target page content is too long. | [optional][default to 'error'] |
|
86
183
|
| **headers** | [**Hash<String, String>**](String.md) | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}). | [optional] |
|
87
184
|
| **timeout** | **Integer** | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). | [optional][default to 10000] |
|
88
185
|
| **js** | **Boolean** | Execute on-page JavaScript using a headless browser (true by default). | [optional][default to true] |
|
89
186
|
| **js_timeout** | **Integer** | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page. | [optional][default to 2000] |
|
187
|
+
| **wait_for** | **String** | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout. | [optional] |
|
90
188
|
| **proxy** | **String** | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. | [optional][default to 'datacenter'] |
|
91
|
-
| **country** | **String** | Country of the proxy to use (US by default).
|
189
|
+
| **country** | **String** | Country of the proxy to use (US by default). | [optional][default to 'us'] |
|
190
|
+
| **custom_proxy** | **String** | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example). | [optional] |
|
92
191
|
| **device** | **String** | Type of device emulation. | [optional][default to 'desktop'] |
|
93
192
|
| **error_on_404** | **Boolean** | Return error on 404 HTTP status on the target page (false by default). | [optional][default to false] |
|
94
193
|
| **error_on_redirect** | **Boolean** | Return error on redirect on the target page (false by default). | [optional][default to false] |
|
95
194
|
| **js_script** | **String** | Custom JavaScript code to execute on the target page. | [optional] |
|
195
|
+
| **format** | **String** | Format of the response (text by default). \"json\" will return a JSON object with the response, \"text\" will return a plain text/HTML response. | [optional][default to 'json'] |
|
96
196
|
|
97
197
|
### Return type
|
98
198
|
|
data/docs/Account.md
CHANGED
@@ -4,6 +4,7 @@
|
|
4
4
|
|
5
5
|
| Name | Type | Description | Notes |
|
6
6
|
| ---- | ---- | ----------- | ----- |
|
7
|
+
| **email** | **String** | Your account email | [optional] |
|
7
8
|
| **remaining_api_calls** | **Integer** | Remaining API credits quota | [optional] |
|
8
9
|
| **resets_at** | **Integer** | Next billing cycle start time (UNIX timestamp) | [optional] |
|
9
10
|
| **remaining_concurrency** | **Integer** | Remaining concurrent requests | [optional] |
|
@@ -14,6 +15,7 @@
|
|
14
15
|
require 'webscraping_ai'
|
15
16
|
|
16
17
|
instance = WebScrapingAI::Account.new(
|
18
|
+
email: nil,
|
17
19
|
remaining_api_calls: nil,
|
18
20
|
resets_at: nil,
|
19
21
|
remaining_concurrency: nil
|
data/docs/HTMLApi.md
CHANGED
@@ -31,17 +31,20 @@ end
|
|
31
31
|
api_instance = WebScrapingAI::HTMLApi.new
|
32
32
|
url = 'https://example.com' # String | URL of the target page.
|
33
33
|
opts = {
|
34
|
-
headers: { key:
|
34
|
+
headers: { key: { key: 'inner_example'}}, # Hash<String, String> | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
35
35
|
timeout: 10000, # Integer | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000).
|
36
36
|
js: true, # Boolean | Execute on-page JavaScript using a headless browser (true by default).
|
37
37
|
js_timeout: 2000, # Integer | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page.
|
38
|
+
wait_for: 'wait_for_example', # String | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
38
39
|
proxy: 'datacenter', # String | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details.
|
39
|
-
country: 'us', # String | Country of the proxy to use (US by default).
|
40
|
+
country: 'us', # String | Country of the proxy to use (US by default).
|
41
|
+
custom_proxy: 'custom_proxy_example', # String | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
40
42
|
device: 'desktop', # String | Type of device emulation.
|
41
43
|
error_on_404: false, # Boolean | Return error on 404 HTTP status on the target page (false by default).
|
42
44
|
error_on_redirect: false, # Boolean | Return error on redirect on the target page (false by default).
|
43
45
|
js_script: "document.querySelector('button').click();", # String | Custom JavaScript code to execute on the target page.
|
44
|
-
return_script_result: false # Boolean | Return result of the custom JavaScript code (js_script parameter) execution on the target page (false by default, page HTML will be returned).
|
46
|
+
return_script_result: false, # Boolean | Return result of the custom JavaScript code (js_script parameter) execution on the target page (false by default, page HTML will be returned).
|
47
|
+
format: 'json' # String | Format of the response (text by default). \"json\" will return a JSON object with the response, \"text\" will return a plain text/HTML response.
|
45
48
|
}
|
46
49
|
|
47
50
|
begin
|
@@ -80,13 +83,16 @@ end
|
|
80
83
|
| **timeout** | **Integer** | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). | [optional][default to 10000] |
|
81
84
|
| **js** | **Boolean** | Execute on-page JavaScript using a headless browser (true by default). | [optional][default to true] |
|
82
85
|
| **js_timeout** | **Integer** | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page. | [optional][default to 2000] |
|
86
|
+
| **wait_for** | **String** | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout. | [optional] |
|
83
87
|
| **proxy** | **String** | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. | [optional][default to 'datacenter'] |
|
84
|
-
| **country** | **String** | Country of the proxy to use (US by default).
|
88
|
+
| **country** | **String** | Country of the proxy to use (US by default). | [optional][default to 'us'] |
|
89
|
+
| **custom_proxy** | **String** | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example). | [optional] |
|
85
90
|
| **device** | **String** | Type of device emulation. | [optional][default to 'desktop'] |
|
86
91
|
| **error_on_404** | **Boolean** | Return error on 404 HTTP status on the target page (false by default). | [optional][default to false] |
|
87
92
|
| **error_on_redirect** | **Boolean** | Return error on redirect on the target page (false by default). | [optional][default to false] |
|
88
93
|
| **js_script** | **String** | Custom JavaScript code to execute on the target page. | [optional] |
|
89
94
|
| **return_script_result** | **Boolean** | Return result of the custom JavaScript code (js_script parameter) execution on the target page (false by default, page HTML will be returned). | [optional][default to false] |
|
95
|
+
| **format** | **String** | Format of the response (text by default). \"json\" will return a JSON object with the response, \"text\" will return a plain text/HTML response. | [optional][default to 'json'] |
|
90
96
|
|
91
97
|
### Return type
|
92
98
|
|
data/docs/SelectedHTMLApi.md
CHANGED
@@ -33,16 +33,19 @@ api_instance = WebScrapingAI::SelectedHTMLApi.new
|
|
33
33
|
url = 'https://example.com' # String | URL of the target page.
|
34
34
|
opts = {
|
35
35
|
selector: 'h1', # String | CSS selector (null by default, returns whole page HTML)
|
36
|
-
headers: { key:
|
36
|
+
headers: { key: { key: 'inner_example'}}, # Hash<String, String> | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
37
37
|
timeout: 10000, # Integer | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000).
|
38
38
|
js: true, # Boolean | Execute on-page JavaScript using a headless browser (true by default).
|
39
39
|
js_timeout: 2000, # Integer | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page.
|
40
|
+
wait_for: 'wait_for_example', # String | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
40
41
|
proxy: 'datacenter', # String | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details.
|
41
|
-
country: 'us', # String | Country of the proxy to use (US by default).
|
42
|
+
country: 'us', # String | Country of the proxy to use (US by default).
|
43
|
+
custom_proxy: 'custom_proxy_example', # String | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
42
44
|
device: 'desktop', # String | Type of device emulation.
|
43
45
|
error_on_404: false, # Boolean | Return error on 404 HTTP status on the target page (false by default).
|
44
46
|
error_on_redirect: false, # Boolean | Return error on redirect on the target page (false by default).
|
45
|
-
js_script: 'document.querySelector('button').click();' # String | Custom JavaScript code to execute on the target page.
|
47
|
+
js_script: "document.querySelector('button').click();", # String | Custom JavaScript code to execute on the target page.
|
48
|
+
format: 'json' # String | Format of the response (text by default). \"json\" will return a JSON object with the response, \"text\" will return a plain text/HTML response.
|
46
49
|
}
|
47
50
|
|
48
51
|
begin
|
@@ -82,12 +85,15 @@ end
|
|
82
85
|
| **timeout** | **Integer** | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). | [optional][default to 10000] |
|
83
86
|
| **js** | **Boolean** | Execute on-page JavaScript using a headless browser (true by default). | [optional][default to true] |
|
84
87
|
| **js_timeout** | **Integer** | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page. | [optional][default to 2000] |
|
88
|
+
| **wait_for** | **String** | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout. | [optional] |
|
85
89
|
| **proxy** | **String** | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. | [optional][default to 'datacenter'] |
|
86
|
-
| **country** | **String** | Country of the proxy to use (US by default).
|
90
|
+
| **country** | **String** | Country of the proxy to use (US by default). | [optional][default to 'us'] |
|
91
|
+
| **custom_proxy** | **String** | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example). | [optional] |
|
87
92
|
| **device** | **String** | Type of device emulation. | [optional][default to 'desktop'] |
|
88
93
|
| **error_on_404** | **Boolean** | Return error on 404 HTTP status on the target page (false by default). | [optional][default to false] |
|
89
94
|
| **error_on_redirect** | **Boolean** | Return error on redirect on the target page (false by default). | [optional][default to false] |
|
90
95
|
| **js_script** | **String** | Custom JavaScript code to execute on the target page. | [optional] |
|
96
|
+
| **format** | **String** | Format of the response (text by default). \"json\" will return a JSON object with the response, \"text\" will return a plain text/HTML response. | [optional][default to 'json'] |
|
91
97
|
|
92
98
|
### Return type
|
93
99
|
|
@@ -128,12 +134,14 @@ api_instance = WebScrapingAI::SelectedHTMLApi.new
|
|
128
134
|
url = 'https://example.com' # String | URL of the target page.
|
129
135
|
opts = {
|
130
136
|
selectors: ['inner_example'], # Array<String> | Multiple CSS selectors (null by default, returns whole page HTML)
|
131
|
-
headers: { key:
|
137
|
+
headers: { key: 'inner_example' }, # Hash<String, String> | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
132
138
|
timeout: 10000, # Integer | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000).
|
133
139
|
js: true, # Boolean | Execute on-page JavaScript using a headless browser (true by default).
|
134
140
|
js_timeout: 2000, # Integer | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page.
|
141
|
+
wait_for: 'wait_for_example', # String | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
135
142
|
proxy: 'datacenter', # String | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details.
|
136
|
-
country: 'us', # String | Country of the proxy to use (US by default).
|
143
|
+
country: 'us', # String | Country of the proxy to use (US by default).
|
144
|
+
custom_proxy: 'custom_proxy_example', # String | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
137
145
|
device: 'desktop', # String | Type of device emulation.
|
138
146
|
error_on_404: false, # Boolean | Return error on 404 HTTP status on the target page (false by default).
|
139
147
|
error_on_redirect: false, # Boolean | Return error on redirect on the target page (false by default).
|
@@ -177,8 +185,10 @@ end
|
|
177
185
|
| **timeout** | **Integer** | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). | [optional][default to 10000] |
|
178
186
|
| **js** | **Boolean** | Execute on-page JavaScript using a headless browser (true by default). | [optional][default to true] |
|
179
187
|
| **js_timeout** | **Integer** | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page. | [optional][default to 2000] |
|
188
|
+
| **wait_for** | **String** | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout. | [optional] |
|
180
189
|
| **proxy** | **String** | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. | [optional][default to 'datacenter'] |
|
181
|
-
| **country** | **String** | Country of the proxy to use (US by default).
|
190
|
+
| **country** | **String** | Country of the proxy to use (US by default). | [optional][default to 'us'] |
|
191
|
+
| **custom_proxy** | **String** | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example). | [optional] |
|
182
192
|
| **device** | **String** | Type of device emulation. | [optional][default to 'desktop'] |
|
183
193
|
| **error_on_404** | **Boolean** | Return error on 404 HTTP status on the target page (false by default). | [optional][default to false] |
|
184
194
|
| **error_on_redirect** | **Boolean** | Return error on redirect on the target page (false by default). | [optional][default to false] |
|
data/docs/TextApi.md
CHANGED
@@ -13,7 +13,7 @@ All URIs are relative to *https://api.webscraping.ai*
|
|
13
13
|
|
14
14
|
Page text by URL
|
15
15
|
|
16
|
-
Returns the visible text content of a webpage specified by the URL. Can be used to feed data to
|
16
|
+
Returns the visible text content of a webpage specified by the URL. Can be used to feed data to LLM models. The response can be in plain text, JSON, or XML format based on the text_format parameter. Proxies and Chromium JavaScript rendering are used for page retrieval and processing. Returns JSON on error.
|
17
17
|
|
18
18
|
### Examples
|
19
19
|
|
@@ -33,12 +33,14 @@ url = 'https://example.com' # String | URL of the target page.
|
|
33
33
|
opts = {
|
34
34
|
text_format: 'plain', # String | Format of the text response (plain by default). \"plain\" will return only the page body text. \"json\" and \"xml\" will return a json/xml with \"title\", \"description\" and \"content\" keys.
|
35
35
|
return_links: false, # Boolean | [Works only with text_format=json] Return links from the page body text (false by default). Useful for building web crawlers.
|
36
|
-
headers: { key:
|
36
|
+
headers: { key: 'inner_example' }, # Hash<String, String> | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
37
37
|
timeout: 10000, # Integer | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000).
|
38
38
|
js: true, # Boolean | Execute on-page JavaScript using a headless browser (true by default).
|
39
39
|
js_timeout: 2000, # Integer | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page.
|
40
|
+
wait_for: 'wait_for_example', # String | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
40
41
|
proxy: 'datacenter', # String | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details.
|
41
|
-
country: 'us', # String | Country of the proxy to use (US by default).
|
42
|
+
country: 'us', # String | Country of the proxy to use (US by default).
|
43
|
+
custom_proxy: 'custom_proxy_example', # String | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
42
44
|
device: 'desktop', # String | Type of device emulation.
|
43
45
|
error_on_404: false, # Boolean | Return error on 404 HTTP status on the target page (false by default).
|
44
46
|
error_on_redirect: false, # Boolean | Return error on redirect on the target page (false by default).
|
@@ -83,8 +85,10 @@ end
|
|
83
85
|
| **timeout** | **Integer** | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). | [optional][default to 10000] |
|
84
86
|
| **js** | **Boolean** | Execute on-page JavaScript using a headless browser (true by default). | [optional][default to true] |
|
85
87
|
| **js_timeout** | **Integer** | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page. | [optional][default to 2000] |
|
88
|
+
| **wait_for** | **String** | CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout. | [optional] |
|
86
89
|
| **proxy** | **String** | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. | [optional][default to 'datacenter'] |
|
87
|
-
| **country** | **String** | Country of the proxy to use (US by default).
|
90
|
+
| **country** | **String** | Country of the proxy to use (US by default). | [optional][default to 'us'] |
|
91
|
+
| **custom_proxy** | **String** | Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example). | [optional] |
|
88
92
|
| **device** | **String** | Type of device emulation. | [optional][default to 'desktop'] |
|
89
93
|
| **error_on_404** | **Boolean** | Return error on 404 HTTP status on the target page (false by default). | [optional][default to false] |
|
90
94
|
| **error_on_redirect** | **Boolean** | Return error on redirect on the target page (false by default). | [optional][default to false] |
|
@@ -1,12 +1,12 @@
|
|
1
1
|
=begin
|
2
2
|
#WebScraping.AI
|
3
3
|
|
4
|
-
#WebScraping.AI scraping API provides
|
4
|
+
#WebScraping.AI scraping API provides LLM-powered tools with Chromium JavaScript rendering, rotating proxies, and built-in HTML parsing.
|
5
5
|
|
6
|
-
The version of the OpenAPI document: 3.
|
6
|
+
The version of the OpenAPI document: 3.2.0
|
7
7
|
Contact: support@webscraping.ai
|
8
8
|
Generated by: https://openapi-generator.tech
|
9
|
-
|
9
|
+
Generator version: 7.11.0
|
10
10
|
|
11
11
|
=end
|
12
12
|
|
@@ -45,7 +45,7 @@ module WebScrapingAI
|
|
45
45
|
# header parameters
|
46
46
|
header_params = opts[:header_params] || {}
|
47
47
|
# HTTP header 'Accept' (if needed)
|
48
|
-
header_params['Accept'] = @api_client.select_header_accept(['application/json'])
|
48
|
+
header_params['Accept'] = @api_client.select_header_accept(['application/json']) unless header_params['Accept']
|
49
49
|
|
50
50
|
# form parameters
|
51
51
|
form_params = opts[:form_params] || {}
|