webscraping_ai 2.0.1 → 3.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +32 -20
- data/docs/AIApi.md +109 -0
- data/docs/Account.md +22 -0
- data/docs/AccountApi.md +76 -0
- data/docs/Error.md +14 -7
- data/docs/HTMLApi.md +49 -84
- data/docs/SelectedHTMLApi.md +96 -175
- data/docs/TextApi.md +105 -0
- data/git_push.sh +3 -4
- data/lib/webscraping_ai/api/account_api.rb +79 -0
- data/lib/webscraping_ai/api/ai_api.rb +164 -0
- data/lib/webscraping_ai/api/html_api.rb +58 -111
- data/lib/webscraping_ai/api/selected_html_api.rb +103 -221
- data/lib/webscraping_ai/api/text_api.rb +154 -0
- data/lib/webscraping_ai/api_client.rb +71 -65
- data/lib/webscraping_ai/api_error.rb +4 -3
- data/lib/webscraping_ai/configuration.rb +65 -15
- data/lib/webscraping_ai/models/{page_error.rb → account.rb} +60 -42
- data/lib/webscraping_ai/models/error.rb +66 -28
- data/lib/webscraping_ai/version.rb +4 -4
- data/lib/webscraping_ai.rb +7 -4
- data/spec/api/account_api_spec.rb +46 -0
- data/spec/api/ai_api_spec.rb +61 -0
- data/spec/api/html_api_spec.rb +18 -28
- data/spec/api/selected_html_api_spec.rb +30 -54
- data/spec/api/text_api_spec.rb +59 -0
- data/spec/models/account_spec.rb +48 -0
- data/spec/models/error_spec.rb +27 -14
- data/spec/spec_helper.rb +3 -3
- data/webscraping_ai.gemspec +7 -7
- metadata +22 -34
- data/docs/PageError.md +0 -19
- data/spec/api_client_spec.rb +0 -226
- data/spec/configuration_spec.rb +0 -42
- data/spec/models/page_error_spec.rb +0 -47
data/docs/SelectedHTMLApi.md
CHANGED
@@ -2,135 +2,96 @@
|
|
2
2
|
|
3
3
|
All URIs are relative to *https://api.webscraping.ai*
|
4
4
|
|
5
|
-
Method | HTTP request | Description
|
6
|
-
|
7
|
-
[**get_selected**](SelectedHTMLApi.md#get_selected) | **GET** /selected | HTML of a selected page area by URL and CSS selector
|
8
|
-
[**get_selected_multiple**](SelectedHTMLApi.md#get_selected_multiple) | **GET** /selected-multiple | HTML of multiple page areas by URL and CSS selectors
|
9
|
-
[**post_selected**](SelectedHTMLApi.md#post_selected) | **POST** /selected | HTML of a selected page areas by URL and CSS selector, with POST request to the target page
|
10
|
-
[**post_selected_multiple**](SelectedHTMLApi.md#post_selected_multiple) | **POST** /selected-multiple | HTML of multiple page areas by URL and CSS selectors, with POST request to the target page
|
11
|
-
|
5
|
+
| Method | HTTP request | Description |
|
6
|
+
| ------ | ------------ | ----------- |
|
7
|
+
| [**get_selected**](SelectedHTMLApi.md#get_selected) | **GET** /selected | HTML of a selected page area by URL and CSS selector |
|
8
|
+
| [**get_selected_multiple**](SelectedHTMLApi.md#get_selected_multiple) | **GET** /selected-multiple | HTML of multiple page areas by URL and CSS selectors |
|
12
9
|
|
13
10
|
|
14
11
|
## get_selected
|
15
12
|
|
16
|
-
> get_selected(url, opts)
|
13
|
+
> String get_selected(url, opts)
|
17
14
|
|
18
15
|
HTML of a selected page area by URL and CSS selector
|
19
16
|
|
20
|
-
Returns
|
17
|
+
Returns HTML of a selected page area by URL and CSS selector. Useful if you don't want to do the HTML parsing on your side.
|
21
18
|
|
22
|
-
###
|
19
|
+
### Examples
|
23
20
|
|
24
21
|
```ruby
|
25
|
-
|
22
|
+
require 'time'
|
26
23
|
require 'webscraping_ai'
|
27
24
|
# setup authorization
|
28
25
|
WebScrapingAI.configure do |config|
|
29
26
|
# Configure API key authorization: api_key
|
30
27
|
config.api_key['api_key'] = 'YOUR API KEY'
|
31
28
|
# Uncomment the following line to set a prefix for the API key, e.g. 'Bearer' (defaults to nil)
|
32
|
-
#config.api_key_prefix['api_key'] = 'Bearer'
|
29
|
+
# config.api_key_prefix['api_key'] = 'Bearer'
|
33
30
|
end
|
34
31
|
|
35
32
|
api_instance = WebScrapingAI::SelectedHTMLApi.new
|
36
|
-
url = 'https://example.com' # String | URL of the target page
|
33
|
+
url = 'https://example.com' # String | URL of the target page.
|
37
34
|
opts = {
|
38
35
|
selector: 'h1', # String | CSS selector (null by default, returns whole page HTML)
|
39
|
-
headers: {
|
40
|
-
timeout:
|
41
|
-
js: true, # Boolean | Execute on-page JavaScript using a headless browser (true by default)
|
42
|
-
|
36
|
+
headers: { 'Cookie' => 'session=some_id' }, # Hash<String, String> | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
37
|
+
timeout: 10000, # Integer | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000).
|
38
|
+
js: true, # Boolean | Execute on-page JavaScript using a headless browser (true by default).
|
39
|
+
js_timeout: 2000, # Integer | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page.
|
40
|
+
proxy: 'datacenter', # String | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details.
|
41
|
+
country: 'us', # String | Country of the proxy to use (US by default). Only available on Startup and Custom plans.
|
42
|
+
device: 'desktop', # String | Type of device emulation.
|
43
|
+
error_on_404: false, # Boolean | Return error on 404 HTTP status on the target page (false by default).
|
44
|
+
error_on_redirect: false, # Boolean | Return error on redirect on the target page (false by default).
|
45
|
+
js_script: "document.querySelector('button').click();" # String | Custom JavaScript code to execute on the target page.
|
43
46
|
}
|
44
47
|
|
45
48
|
begin
|
46
|
-
#HTML of a selected page area by URL and CSS selector
|
47
|
-
api_instance.get_selected(url, opts)
|
49
|
+
# HTML of a selected page area by URL and CSS selector
|
50
|
+
result = api_instance.get_selected(url, opts)
|
51
|
+
p result
|
48
52
|
rescue WebScrapingAI::ApiError => e
|
49
|
-
puts "
|
53
|
+
puts "Error when calling SelectedHTMLApi->get_selected: #{e}"
|
50
54
|
end
|
51
55
|
```
|
52
56
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
Name | Type | Description | Notes
|
57
|
-
------------- | ------------- | ------------- | -------------
|
58
|
-
**url** | **String**| URL of the target page |
|
59
|
-
**selector** | **String**| CSS selector (null by default, returns whole page HTML) | [optional]
|
60
|
-
**headers** | [**Hash<String, String>**](String.md)| HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}) | [optional]
|
61
|
-
**timeout** | **Integer**| Maximum processing time in ms. Increase it in case of timeout errors (5000 by default, maximum is 30000) | [optional] [default to 5000]
|
62
|
-
**js** | **Boolean**| Execute on-page JavaScript using a headless browser (true by default), costs 2 requests | [optional] [default to true]
|
63
|
-
**proxy** | **String**| Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default) | [optional] [default to 'datacenter']
|
57
|
+
#### Using the get_selected_with_http_info variant
|
64
58
|
|
65
|
-
|
59
|
+
This returns an Array which contains the response data, status code and headers.
|
66
60
|
|
67
|
-
|
68
|
-
|
69
|
-
### Authorization
|
70
|
-
|
71
|
-
[api_key](../README.md#api_key)
|
72
|
-
|
73
|
-
### HTTP request headers
|
74
|
-
|
75
|
-
- **Content-Type**: Not defined
|
76
|
-
- **Accept**: application/json, text/html
|
77
|
-
|
78
|
-
|
79
|
-
## get_selected_multiple
|
80
|
-
|
81
|
-
> Array<String> get_selected_multiple(url, opts)
|
82
|
-
|
83
|
-
HTML of multiple page areas by URL and CSS selectors
|
84
|
-
|
85
|
-
Always returns JSON
|
86
|
-
|
87
|
-
### Example
|
61
|
+
> <Array(String, Integer, Hash)> get_selected_with_http_info(url, opts)
|
88
62
|
|
89
63
|
```ruby
|
90
|
-
# load the gem
|
91
|
-
require 'webscraping_ai'
|
92
|
-
# setup authorization
|
93
|
-
WebScrapingAI.configure do |config|
|
94
|
-
# Configure API key authorization: api_key
|
95
|
-
config.api_key['api_key'] = 'YOUR API KEY'
|
96
|
-
# Uncomment the following line to set a prefix for the API key, e.g. 'Bearer' (defaults to nil)
|
97
|
-
#config.api_key_prefix['api_key'] = 'Bearer'
|
98
|
-
end
|
99
|
-
|
100
|
-
api_instance = WebScrapingAI::SelectedHTMLApi.new
|
101
|
-
url = 'https://example.com' # String | URL of the target page
|
102
|
-
opts = {
|
103
|
-
selectors: ['[\"h1\"]'], # Array<String> | Multiple CSS selectors (null by default, returns whole page HTML)
|
104
|
-
headers: {'key' => '{\"Cookie\":\"session=some_id\"}'}, # Hash<String, String> | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"})
|
105
|
-
timeout: 5000, # Integer | Maximum processing time in ms. Increase it in case of timeout errors (5000 by default, maximum is 30000)
|
106
|
-
js: true, # Boolean | Execute on-page JavaScript using a headless browser (true by default), costs 2 requests
|
107
|
-
proxy: 'datacenter' # String | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default)
|
108
|
-
}
|
109
|
-
|
110
64
|
begin
|
111
|
-
#HTML of
|
112
|
-
|
113
|
-
p
|
65
|
+
# HTML of a selected page area by URL and CSS selector
|
66
|
+
data, status_code, headers = api_instance.get_selected_with_http_info(url, opts)
|
67
|
+
p status_code # => 2xx
|
68
|
+
p headers # => { ... }
|
69
|
+
p data # => String
|
114
70
|
rescue WebScrapingAI::ApiError => e
|
115
|
-
puts "
|
71
|
+
puts "Error when calling SelectedHTMLApi->get_selected_with_http_info: #{e}"
|
116
72
|
end
|
117
73
|
```
|
118
74
|
|
119
75
|
### Parameters
|
120
76
|
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
**
|
125
|
-
**
|
126
|
-
**
|
127
|
-
**
|
128
|
-
**
|
129
|
-
**proxy** | **String
|
77
|
+
| Name | Type | Description | Notes |
|
78
|
+
| ---- | ---- | ----------- | ----- |
|
79
|
+
| **url** | **String** | URL of the target page. | |
|
80
|
+
| **selector** | **String** | CSS selector (null by default, returns whole page HTML) | [optional] |
|
81
|
+
| **headers** | [**Hash<String, String>**](String.md) | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}). | [optional] |
|
82
|
+
| **timeout** | **Integer** | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). | [optional][default to 10000] |
|
83
|
+
| **js** | **Boolean** | Execute on-page JavaScript using a headless browser (true by default). | [optional][default to true] |
|
84
|
+
| **js_timeout** | **Integer** | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page. | [optional][default to 2000] |
|
85
|
+
| **proxy** | **String** | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. | [optional][default to 'datacenter'] |
|
86
|
+
| **country** | **String** | Country of the proxy to use (US by default). Only available on Startup and Custom plans. | [optional][default to 'us'] |
|
87
|
+
| **device** | **String** | Type of device emulation. | [optional][default to 'desktop'] |
|
88
|
+
| **error_on_404** | **Boolean** | Return error on 404 HTTP status on the target page (false by default). | [optional][default to false] |
|
89
|
+
| **error_on_redirect** | **Boolean** | Return error on redirect on the target page (false by default). | [optional][default to false] |
|
90
|
+
| **js_script** | **String** | Custom JavaScript code to execute on the target page. | [optional] |
|
130
91
|
|
131
92
|
### Return type
|
132
93
|
|
133
|
-
**
|
94
|
+
**String**
|
134
95
|
|
135
96
|
### Authorization
|
136
97
|
|
@@ -139,129 +100,89 @@ Name | Type | Description | Notes
|
|
139
100
|
### HTTP request headers
|
140
101
|
|
141
102
|
- **Content-Type**: Not defined
|
142
|
-
- **Accept**: application/json
|
103
|
+
- **Accept**: application/json, text/html
|
143
104
|
|
144
105
|
|
145
|
-
##
|
106
|
+
## get_selected_multiple
|
146
107
|
|
147
|
-
>
|
108
|
+
> Array<String> get_selected_multiple(url, opts)
|
148
109
|
|
149
|
-
HTML of
|
110
|
+
HTML of multiple page areas by URL and CSS selectors
|
150
111
|
|
151
|
-
Returns
|
112
|
+
Returns HTML of multiple page areas by URL and CSS selectors. Useful if you don't want to do the HTML parsing on your side.
|
152
113
|
|
153
|
-
###
|
114
|
+
### Examples
|
154
115
|
|
155
116
|
```ruby
|
156
|
-
|
117
|
+
require 'time'
|
157
118
|
require 'webscraping_ai'
|
158
119
|
# setup authorization
|
159
120
|
WebScrapingAI.configure do |config|
|
160
121
|
# Configure API key authorization: api_key
|
161
122
|
config.api_key['api_key'] = 'YOUR API KEY'
|
162
123
|
# Uncomment the following line to set a prefix for the API key, e.g. 'Bearer' (defaults to nil)
|
163
|
-
#config.api_key_prefix['api_key'] = 'Bearer'
|
124
|
+
# config.api_key_prefix['api_key'] = 'Bearer'
|
164
125
|
end
|
165
126
|
|
166
127
|
api_instance = WebScrapingAI::SelectedHTMLApi.new
|
167
|
-
url = 'https://
|
128
|
+
url = 'https://example.com' # String | URL of the target page.
|
168
129
|
opts = {
|
169
|
-
|
170
|
-
headers: {
|
171
|
-
timeout:
|
172
|
-
js: true, # Boolean | Execute on-page JavaScript using a headless browser (true by default)
|
173
|
-
|
174
|
-
|
130
|
+
selectors: ['inner_example'], # Array<String> | Multiple CSS selectors (null by default, returns whole page HTML)
|
131
|
+
headers: { 'Cookie' => 'session=some_id' }, # Hash<String, String> | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
132
|
+
timeout: 10000, # Integer | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000).
|
133
|
+
js: true, # Boolean | Execute on-page JavaScript using a headless browser (true by default).
|
134
|
+
js_timeout: 2000, # Integer | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page.
|
135
|
+
proxy: 'datacenter', # String | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details.
|
136
|
+
country: 'us', # String | Country of the proxy to use (US by default). Only available on Startup and Custom plans.
|
137
|
+
device: 'desktop', # String | Type of device emulation.
|
138
|
+
error_on_404: false, # Boolean | Return error on 404 HTTP status on the target page (false by default).
|
139
|
+
error_on_redirect: false, # Boolean | Return error on redirect on the target page (false by default).
|
140
|
+
js_script: "document.querySelector('button').click();" # String | Custom JavaScript code to execute on the target page.
|
175
141
|
}
|
176
142
|
|
177
143
|
begin
|
178
|
-
#HTML of
|
179
|
-
api_instance.
|
144
|
+
# HTML of multiple page areas by URL and CSS selectors
|
145
|
+
result = api_instance.get_selected_multiple(url, opts)
|
146
|
+
p result
|
180
147
|
rescue WebScrapingAI::ApiError => e
|
181
|
-
puts "
|
148
|
+
puts "Error when calling SelectedHTMLApi->get_selected_multiple: #{e}"
|
182
149
|
end
|
183
150
|
```
|
184
151
|
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
Name | Type | Description | Notes
|
189
|
-
------------- | ------------- | ------------- | -------------
|
190
|
-
**url** | **String**| URL of the target page |
|
191
|
-
**selector** | **String**| CSS selector (null by default, returns whole page HTML) | [optional]
|
192
|
-
**headers** | [**Hash<String, String>**](String.md)| HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}) | [optional]
|
193
|
-
**timeout** | **Integer**| Maximum processing time in ms. Increase it in case of timeout errors (5000 by default, maximum is 30000) | [optional] [default to 5000]
|
194
|
-
**js** | **Boolean**| Execute on-page JavaScript using a headless browser (true by default), costs 2 requests | [optional] [default to true]
|
195
|
-
**proxy** | **String**| Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default) | [optional] [default to 'datacenter']
|
196
|
-
**request_body** | [**Hash<String, Object>**](Object.md)| Request body to pass to the target page | [optional]
|
197
|
-
|
198
|
-
### Return type
|
199
|
-
|
200
|
-
nil (empty response body)
|
201
|
-
|
202
|
-
### Authorization
|
203
|
-
|
204
|
-
[api_key](../README.md#api_key)
|
205
|
-
|
206
|
-
### HTTP request headers
|
207
|
-
|
208
|
-
- **Content-Type**: application/json, application/x-www-form-urlencoded, application/xml, text/plain
|
209
|
-
- **Accept**: application/json, text/html
|
210
|
-
|
211
|
-
|
212
|
-
## post_selected_multiple
|
213
|
-
|
214
|
-
> Array<String> post_selected_multiple(url, opts)
|
215
|
-
|
216
|
-
HTML of multiple page areas by URL and CSS selectors, with POST request to the target page
|
152
|
+
#### Using the get_selected_multiple_with_http_info variant
|
217
153
|
|
218
|
-
|
154
|
+
This returns an Array which contains the response data, status code and headers.
|
219
155
|
|
220
|
-
|
156
|
+
> <Array(Array<String>, Integer, Hash)> get_selected_multiple_with_http_info(url, opts)
|
221
157
|
|
222
158
|
```ruby
|
223
|
-
# load the gem
|
224
|
-
require 'webscraping_ai'
|
225
|
-
# setup authorization
|
226
|
-
WebScrapingAI.configure do |config|
|
227
|
-
# Configure API key authorization: api_key
|
228
|
-
config.api_key['api_key'] = 'YOUR API KEY'
|
229
|
-
# Uncomment the following line to set a prefix for the API key, e.g. 'Bearer' (defaults to nil)
|
230
|
-
#config.api_key_prefix['api_key'] = 'Bearer'
|
231
|
-
end
|
232
|
-
|
233
|
-
api_instance = WebScrapingAI::SelectedHTMLApi.new
|
234
|
-
url = 'https://httpbin.org/post' # String | URL of the target page
|
235
|
-
opts = {
|
236
|
-
selectors: ['[\"h1\"]'], # Array<String> | Multiple CSS selectors (null by default, returns whole page HTML)
|
237
|
-
headers: {'key' => '{\"Cookie\":\"session=some_id\"}'}, # Hash<String, String> | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"})
|
238
|
-
timeout: 5000, # Integer | Maximum processing time in ms. Increase it in case of timeout errors (5000 by default, maximum is 30000)
|
239
|
-
js: true, # Boolean | Execute on-page JavaScript using a headless browser (true by default), costs 2 requests
|
240
|
-
proxy: 'datacenter', # String | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default)
|
241
|
-
request_body: nil # Hash<String, Object> | Request body to pass to the target page
|
242
|
-
}
|
243
|
-
|
244
159
|
begin
|
245
|
-
#HTML of multiple page areas by URL and CSS selectors
|
246
|
-
|
247
|
-
p
|
160
|
+
# HTML of multiple page areas by URL and CSS selectors
|
161
|
+
data, status_code, headers = api_instance.get_selected_multiple_with_http_info(url, opts)
|
162
|
+
p status_code # => 2xx
|
163
|
+
p headers # => { ... }
|
164
|
+
p data # => Array<String>
|
248
165
|
rescue WebScrapingAI::ApiError => e
|
249
|
-
puts "
|
166
|
+
puts "Error when calling SelectedHTMLApi->get_selected_multiple_with_http_info: #{e}"
|
250
167
|
end
|
251
168
|
```
|
252
169
|
|
253
170
|
### Parameters
|
254
171
|
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
**
|
259
|
-
**
|
260
|
-
**
|
261
|
-
**
|
262
|
-
**
|
263
|
-
**proxy** | **String
|
264
|
-
**
|
172
|
+
| Name | Type | Description | Notes |
|
173
|
+
| ---- | ---- | ----------- | ----- |
|
174
|
+
| **url** | **String** | URL of the target page. | |
|
175
|
+
| **selectors** | [**Array<String>**](String.md) | Multiple CSS selectors (null by default, returns whole page HTML) | [optional] |
|
176
|
+
| **headers** | [**Hash<String, String>**](String.md) | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}). | [optional] |
|
177
|
+
| **timeout** | **Integer** | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). | [optional][default to 10000] |
|
178
|
+
| **js** | **Boolean** | Execute on-page JavaScript using a headless browser (true by default). | [optional][default to true] |
|
179
|
+
| **js_timeout** | **Integer** | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page. | [optional][default to 2000] |
|
180
|
+
| **proxy** | **String** | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. | [optional][default to 'datacenter'] |
|
181
|
+
| **country** | **String** | Country of the proxy to use (US by default). Only available on Startup and Custom plans. | [optional][default to 'us'] |
|
182
|
+
| **device** | **String** | Type of device emulation. | [optional][default to 'desktop'] |
|
183
|
+
| **error_on_404** | **Boolean** | Return error on 404 HTTP status on the target page (false by default). | [optional][default to false] |
|
184
|
+
| **error_on_redirect** | **Boolean** | Return error on redirect on the target page (false by default). | [optional][default to false] |
|
185
|
+
| **js_script** | **String** | Custom JavaScript code to execute on the target page. | [optional] |
|
265
186
|
|
266
187
|
### Return type
|
267
188
|
|
@@ -273,6 +194,6 @@ Name | Type | Description | Notes
|
|
273
194
|
|
274
195
|
### HTTP request headers
|
275
196
|
|
276
|
-
- **Content-Type**:
|
197
|
+
- **Content-Type**: Not defined
|
277
198
|
- **Accept**: application/json
|
278
199
|
|
data/docs/TextApi.md
ADDED
@@ -0,0 +1,105 @@
|
|
1
|
+
# WebScrapingAI::TextApi
|
2
|
+
|
3
|
+
All URIs are relative to *https://api.webscraping.ai*
|
4
|
+
|
5
|
+
| Method | HTTP request | Description |
|
6
|
+
| ------ | ------------ | ----------- |
|
7
|
+
| [**get_text**](TextApi.md#get_text) | **GET** /text | Page text by URL |
|
8
|
+
|
9
|
+
|
10
|
+
## get_text
|
11
|
+
|
12
|
+
> String get_text(url, opts)
|
13
|
+
|
14
|
+
Page text by URL
|
15
|
+
|
16
|
+
Returns the visible text content of a webpage specified by the URL. Can be used to feed data to GPT or other LLM models. The response can be in plain text, JSON, or XML format based on the text_format parameter. Proxies and Chromium JavaScript rendering are used for page retrieval and processing. Returns JSON on error.
|
17
|
+
|
18
|
+
### Examples
|
19
|
+
|
20
|
+
```ruby
|
21
|
+
require 'time'
|
22
|
+
require 'webscraping_ai'
|
23
|
+
# setup authorization
|
24
|
+
WebScrapingAI.configure do |config|
|
25
|
+
# Configure API key authorization: api_key
|
26
|
+
config.api_key['api_key'] = 'YOUR API KEY'
|
27
|
+
# Uncomment the following line to set a prefix for the API key, e.g. 'Bearer' (defaults to nil)
|
28
|
+
# config.api_key_prefix['api_key'] = 'Bearer'
|
29
|
+
end
|
30
|
+
|
31
|
+
api_instance = WebScrapingAI::TextApi.new
|
32
|
+
url = 'https://example.com' # String | URL of the target page.
|
33
|
+
opts = {
|
34
|
+
text_format: 'plain', # String | Format of the text response (plain by default). \"plain\" will return only the page body text. \"json\" and \"xml\" will return a json/xml with \"title\", \"description\" and \"content\" keys.
|
35
|
+
return_links: false, # Boolean | [Works only with text_format=json] Return links from the page body text (false by default). Useful for building web crawlers.
|
36
|
+
headers: { 'Cookie' => 'session=some_id' }, # Hash<String, String> | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
37
|
+
timeout: 10000, # Integer | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000).
|
38
|
+
js: true, # Boolean | Execute on-page JavaScript using a headless browser (true by default).
|
39
|
+
js_timeout: 2000, # Integer | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page.
|
40
|
+
proxy: 'datacenter', # String | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details.
|
41
|
+
country: 'us', # String | Country of the proxy to use (US by default). Only available on Startup and Custom plans.
|
42
|
+
device: 'desktop', # String | Type of device emulation.
|
43
|
+
error_on_404: false, # Boolean | Return error on 404 HTTP status on the target page (false by default).
|
44
|
+
error_on_redirect: false, # Boolean | Return error on redirect on the target page (false by default).
|
45
|
+
js_script: "document.querySelector('button').click();" # String | Custom JavaScript code to execute on the target page.
|
46
|
+
}
|
47
|
+
|
48
|
+
begin
|
49
|
+
# Page text by URL
|
50
|
+
result = api_instance.get_text(url, opts)
|
51
|
+
p result
|
52
|
+
rescue WebScrapingAI::ApiError => e
|
53
|
+
puts "Error when calling TextApi->get_text: #{e}"
|
54
|
+
end
|
55
|
+
```
|
56
|
+
|
57
|
+
#### Using the get_text_with_http_info variant
|
58
|
+
|
59
|
+
This returns an Array which contains the response data, status code and headers.
|
60
|
+
|
61
|
+
> <Array(String, Integer, Hash)> get_text_with_http_info(url, opts)
|
62
|
+
|
63
|
+
```ruby
|
64
|
+
begin
|
65
|
+
# Page text by URL
|
66
|
+
data, status_code, headers = api_instance.get_text_with_http_info(url, opts)
|
67
|
+
p status_code # => 2xx
|
68
|
+
p headers # => { ... }
|
69
|
+
p data # => String
|
70
|
+
rescue WebScrapingAI::ApiError => e
|
71
|
+
puts "Error when calling TextApi->get_text_with_http_info: #{e}"
|
72
|
+
end
|
73
|
+
```
|
74
|
+
|
75
|
+
### Parameters
|
76
|
+
|
77
|
+
| Name | Type | Description | Notes |
|
78
|
+
| ---- | ---- | ----------- | ----- |
|
79
|
+
| **url** | **String** | URL of the target page. | |
|
80
|
+
| **text_format** | **String** | Format of the text response (plain by default). \"plain\" will return only the page body text. \"json\" and \"xml\" will return a json/xml with \"title\", \"description\" and \"content\" keys. | [optional][default to 'plain'] |
|
81
|
+
| **return_links** | **Boolean** | [Works only with text_format=json] Return links from the page body text (false by default). Useful for building web crawlers. | [optional][default to false] |
|
82
|
+
| **headers** | [**Hash<String, String>**](String.md) | HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}). | [optional] |
|
83
|
+
| **timeout** | **Integer** | Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). | [optional][default to 10000] |
|
84
|
+
| **js** | **Boolean** | Execute on-page JavaScript using a headless browser (true by default). | [optional][default to true] |
|
85
|
+
| **js_timeout** | **Integer** | Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page. | [optional][default to 2000] |
|
86
|
+
| **proxy** | **String** | Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. | [optional][default to 'datacenter'] |
|
87
|
+
| **country** | **String** | Country of the proxy to use (US by default). Only available on Startup and Custom plans. | [optional][default to 'us'] |
|
88
|
+
| **device** | **String** | Type of device emulation. | [optional][default to 'desktop'] |
|
89
|
+
| **error_on_404** | **Boolean** | Return error on 404 HTTP status on the target page (false by default). | [optional][default to false] |
|
90
|
+
| **error_on_redirect** | **Boolean** | Return error on redirect on the target page (false by default). | [optional][default to false] |
|
91
|
+
| **js_script** | **String** | Custom JavaScript code to execute on the target page. | [optional] |
|
92
|
+
|
93
|
+
### Return type
|
94
|
+
|
95
|
+
**String**
|
96
|
+
|
97
|
+
### Authorization
|
98
|
+
|
99
|
+
[api_key](../README.md#api_key)
|
100
|
+
|
101
|
+
### HTTP request headers
|
102
|
+
|
103
|
+
- **Content-Type**: Not defined
|
104
|
+
- **Accept**: application/json, text/html, text/xml
|
105
|
+
|
data/git_push.sh
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/bin/sh
|
2
2
|
# ref: https://help.github.com/articles/adding-an-existing-project-to-github-using-the-command-line/
|
3
3
|
#
|
4
|
-
# Usage example: /bin/sh ./git_push.sh wing328 openapi-
|
4
|
+
# Usage example: /bin/sh ./git_push.sh wing328 openapi-petstore-perl "minor update" "gitlab.com"
|
5
5
|
|
6
6
|
git_user_id=$1
|
7
7
|
git_repo_id=$2
|
@@ -38,14 +38,14 @@ git add .
|
|
38
38
|
git commit -m "$release_note"
|
39
39
|
|
40
40
|
# Sets the new remote
|
41
|
-
git_remote
|
41
|
+
git_remote=$(git remote)
|
42
42
|
if [ "$git_remote" = "" ]; then # git remote not defined
|
43
43
|
|
44
44
|
if [ "$GIT_TOKEN" = "" ]; then
|
45
45
|
echo "[INFO] \$GIT_TOKEN (environment variable) is not set. Using the git credential in your environment."
|
46
46
|
git remote add origin https://${git_host}/${git_user_id}/${git_repo_id}.git
|
47
47
|
else
|
48
|
-
git remote add origin https://${git_user_id}
|
48
|
+
git remote add origin https://${git_user_id}:"${GIT_TOKEN}"@${git_host}/${git_user_id}/${git_repo_id}.git
|
49
49
|
fi
|
50
50
|
|
51
51
|
fi
|
@@ -55,4 +55,3 @@ git pull origin master
|
|
55
55
|
# Pushes (Forces) the changes in the local repository up to the remote repository
|
56
56
|
echo "Git pushing to https://${git_host}/${git_user_id}/${git_repo_id}.git"
|
57
57
|
git push origin master 2>&1 | grep -v 'To https'
|
58
|
-
|
@@ -0,0 +1,79 @@
|
|
1
|
+
=begin
|
2
|
+
#WebScraping.AI
|
3
|
+
|
4
|
+
#WebScraping.AI scraping API provides GPT-powered tools with Chromium JavaScript rendering, rotating proxies, and built-in HTML parsing.
|
5
|
+
|
6
|
+
The version of the OpenAPI document: 3.1.2
|
7
|
+
Contact: support@webscraping.ai
|
8
|
+
Generated by: https://openapi-generator.tech
|
9
|
+
OpenAPI Generator version: 7.2.0
|
10
|
+
|
11
|
+
=end
|
12
|
+
|
13
|
+
require 'cgi'
|
14
|
+
|
15
|
+
module WebScrapingAI
|
16
|
+
class AccountApi
|
17
|
+
attr_accessor :api_client
|
18
|
+
|
19
|
+
def initialize(api_client = ApiClient.default)
|
20
|
+
@api_client = api_client
|
21
|
+
end
|
22
|
+
# Information about your account calls quota
|
23
|
+
# Returns information about your account, including the remaining API credits quota, the next billing cycle start time, and the remaining concurrent requests. The response is in JSON format.
|
24
|
+
# @param [Hash] opts the optional parameters
|
25
|
+
# @return [Account]
|
26
|
+
def account(opts = {})
|
27
|
+
data, _status_code, _headers = account_with_http_info(opts)
|
28
|
+
data
|
29
|
+
end
|
30
|
+
|
31
|
+
# Information about your account calls quota
|
32
|
+
# Returns information about your account, including the remaining API credits quota, the next billing cycle start time, and the remaining concurrent requests. The response is in JSON format.
|
33
|
+
# @param [Hash] opts the optional parameters
|
34
|
+
# @return [Array<(Account, Integer, Hash)>] Account data, response status code and response headers
|
35
|
+
def account_with_http_info(opts = {})
|
36
|
+
if @api_client.config.debugging
|
37
|
+
@api_client.config.logger.debug 'Calling API: AccountApi.account ...'
|
38
|
+
end
|
39
|
+
# resource path
|
40
|
+
local_var_path = '/account'
|
41
|
+
|
42
|
+
# query parameters
|
43
|
+
query_params = opts[:query_params] || {}
|
44
|
+
|
45
|
+
# header parameters
|
46
|
+
header_params = opts[:header_params] || {}
|
47
|
+
# HTTP header 'Accept' (if needed)
|
48
|
+
header_params['Accept'] = @api_client.select_header_accept(['application/json'])
|
49
|
+
|
50
|
+
# form parameters
|
51
|
+
form_params = opts[:form_params] || {}
|
52
|
+
|
53
|
+
# http body (model)
|
54
|
+
post_body = opts[:debug_body]
|
55
|
+
|
56
|
+
# return_type
|
57
|
+
return_type = opts[:debug_return_type] || 'Account'
|
58
|
+
|
59
|
+
# auth_names
|
60
|
+
auth_names = opts[:debug_auth_names] || ['api_key']
|
61
|
+
|
62
|
+
new_options = opts.merge(
|
63
|
+
:operation => :"AccountApi.account",
|
64
|
+
:header_params => header_params,
|
65
|
+
:query_params => query_params,
|
66
|
+
:form_params => form_params,
|
67
|
+
:body => post_body,
|
68
|
+
:auth_names => auth_names,
|
69
|
+
:return_type => return_type
|
70
|
+
)
|
71
|
+
|
72
|
+
data, status_code, headers = @api_client.call_api(:GET, local_var_path, new_options)
|
73
|
+
if @api_client.config.debugging
|
74
|
+
@api_client.config.logger.debug "API called: AccountApi#account\nData: #{data.inspect}\nStatus code: #{status_code}\nHeaders: #{headers}"
|
75
|
+
end
|
76
|
+
return data, status_code, headers
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|