webscraping_ai 3.2.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -0
- data/LICENSE +21 -0
- data/README.md +110 -85
- data/lib/webscraping_ai/client.rb +130 -0
- data/lib/webscraping_ai/configuration.rb +10 -300
- data/lib/webscraping_ai/errors.rb +44 -0
- data/lib/webscraping_ai/query_encoder.rb +74 -0
- data/lib/webscraping_ai/version.rb +1 -13
- data/lib/webscraping_ai.rb +15 -39
- data/webscraping_ai.gemspec +33 -36
- metadata +23 -72
- data/Gemfile +0 -9
- data/Rakefile +0 -10
- data/docs/AIApi.md +0 -209
- data/docs/Account.md +0 -24
- data/docs/AccountApi.md +0 -76
- data/docs/Error.md +0 -24
- data/docs/HTMLApi.md +0 -109
- data/docs/SelectedHTMLApi.md +0 -209
- data/docs/TextApi.md +0 -109
- data/git_push.sh +0 -57
- data/lib/webscraping_ai/api/account_api.rb +0 -79
- data/lib/webscraping_ai/api/ai_api.rb +0 -295
- data/lib/webscraping_ai/api/html_api.rb +0 -160
- data/lib/webscraping_ai/api/selected_html_api.rb +0 -291
- data/lib/webscraping_ai/api/text_api.rb +0 -160
- data/lib/webscraping_ai/api_client.rb +0 -394
- data/lib/webscraping_ai/api_error.rb +0 -58
- data/lib/webscraping_ai/models/account.rb +0 -245
- data/lib/webscraping_ai/models/error.rb +0 -245
- data/spec/api/account_api_spec.rb +0 -46
- data/spec/api/ai_api_spec.rb +0 -86
- data/spec/api/html_api_spec.rb +0 -61
- data/spec/api/selected_html_api_spec.rb +0 -86
- data/spec/api/text_api_spec.rb +0 -61
- data/spec/models/account_spec.rb +0 -54
- data/spec/models/error_spec.rb +0 -54
- data/spec/spec_helper.rb +0 -111
|
@@ -1,291 +0,0 @@
|
|
|
1
|
-
=begin
|
|
2
|
-
#WebScraping.AI
|
|
3
|
-
|
|
4
|
-
#WebScraping.AI scraping API provides LLM-powered tools with Chromium JavaScript rendering, rotating proxies, and built-in HTML parsing.
|
|
5
|
-
|
|
6
|
-
The version of the OpenAPI document: 3.2.0
|
|
7
|
-
Contact: support@webscraping.ai
|
|
8
|
-
Generated by: https://openapi-generator.tech
|
|
9
|
-
Generator version: 7.11.0
|
|
10
|
-
|
|
11
|
-
=end
|
|
12
|
-
|
|
13
|
-
require 'cgi'
|
|
14
|
-
|
|
15
|
-
module WebScrapingAI
|
|
16
|
-
class SelectedHTMLApi
|
|
17
|
-
attr_accessor :api_client
|
|
18
|
-
|
|
19
|
-
def initialize(api_client = ApiClient.default)
|
|
20
|
-
@api_client = api_client
|
|
21
|
-
end
|
|
22
|
-
# HTML of a selected page area by URL and CSS selector
|
|
23
|
-
# Returns HTML of a selected page area by URL and CSS selector. Useful if you don't want to do the HTML parsing on your side.
|
|
24
|
-
# @param url [String] URL of the target page.
|
|
25
|
-
# @param [Hash] opts the optional parameters
|
|
26
|
-
# @option opts [String] :selector CSS selector (null by default, returns whole page HTML)
|
|
27
|
-
# @option opts [Hash<String, String>] :headers HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
|
28
|
-
# @option opts [Integer] :timeout Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). (default to 10000)
|
|
29
|
-
# @option opts [Boolean] :js Execute on-page JavaScript using a headless browser (true by default). (default to true)
|
|
30
|
-
# @option opts [Integer] :js_timeout Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page. (default to 2000)
|
|
31
|
-
# @option opts [String] :wait_for CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
|
32
|
-
# @option opts [String] :proxy Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. (default to 'datacenter')
|
|
33
|
-
# @option opts [String] :country Country of the proxy to use (US by default). (default to 'us')
|
|
34
|
-
# @option opts [String] :custom_proxy Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
|
35
|
-
# @option opts [String] :device Type of device emulation. (default to 'desktop')
|
|
36
|
-
# @option opts [Boolean] :error_on_404 Return error on 404 HTTP status on the target page (false by default). (default to false)
|
|
37
|
-
# @option opts [Boolean] :error_on_redirect Return error on redirect on the target page (false by default). (default to false)
|
|
38
|
-
# @option opts [String] :js_script Custom JavaScript code to execute on the target page.
|
|
39
|
-
# @option opts [String] :format Format of the response (text by default). \"json\" will return a JSON object with the response, \"text\" will return a plain text/HTML response. (default to 'json')
|
|
40
|
-
# @return [String]
|
|
41
|
-
def get_selected(url, opts = {})
|
|
42
|
-
data, _status_code, _headers = get_selected_with_http_info(url, opts)
|
|
43
|
-
data
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
# HTML of a selected page area by URL and CSS selector
|
|
47
|
-
# Returns HTML of a selected page area by URL and CSS selector. Useful if you don't want to do the HTML parsing on your side.
|
|
48
|
-
# @param url [String] URL of the target page.
|
|
49
|
-
# @param [Hash] opts the optional parameters
|
|
50
|
-
# @option opts [String] :selector CSS selector (null by default, returns whole page HTML)
|
|
51
|
-
# @option opts [Hash<String, String>] :headers HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
|
52
|
-
# @option opts [Integer] :timeout Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). (default to 10000)
|
|
53
|
-
# @option opts [Boolean] :js Execute on-page JavaScript using a headless browser (true by default). (default to true)
|
|
54
|
-
# @option opts [Integer] :js_timeout Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page. (default to 2000)
|
|
55
|
-
# @option opts [String] :wait_for CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
|
56
|
-
# @option opts [String] :proxy Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. (default to 'datacenter')
|
|
57
|
-
# @option opts [String] :country Country of the proxy to use (US by default). (default to 'us')
|
|
58
|
-
# @option opts [String] :custom_proxy Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
|
59
|
-
# @option opts [String] :device Type of device emulation. (default to 'desktop')
|
|
60
|
-
# @option opts [Boolean] :error_on_404 Return error on 404 HTTP status on the target page (false by default). (default to false)
|
|
61
|
-
# @option opts [Boolean] :error_on_redirect Return error on redirect on the target page (false by default). (default to false)
|
|
62
|
-
# @option opts [String] :js_script Custom JavaScript code to execute on the target page.
|
|
63
|
-
# @option opts [String] :format Format of the response (text by default). \"json\" will return a JSON object with the response, \"text\" will return a plain text/HTML response. (default to 'json')
|
|
64
|
-
# @return [Array<(String, Integer, Hash)>] String data, response status code and response headers
|
|
65
|
-
def get_selected_with_http_info(url, opts = {})
|
|
66
|
-
if @api_client.config.debugging
|
|
67
|
-
@api_client.config.logger.debug 'Calling API: SelectedHTMLApi.get_selected ...'
|
|
68
|
-
end
|
|
69
|
-
# verify the required parameter 'url' is set
|
|
70
|
-
if @api_client.config.client_side_validation && url.nil?
|
|
71
|
-
fail ArgumentError, "Missing the required parameter 'url' when calling SelectedHTMLApi.get_selected"
|
|
72
|
-
end
|
|
73
|
-
if @api_client.config.client_side_validation && !opts[:'timeout'].nil? && opts[:'timeout'] > 30000
|
|
74
|
-
fail ArgumentError, 'invalid value for "opts[:"timeout"]" when calling SelectedHTMLApi.get_selected, must be smaller than or equal to 30000.'
|
|
75
|
-
end
|
|
76
|
-
|
|
77
|
-
if @api_client.config.client_side_validation && !opts[:'timeout'].nil? && opts[:'timeout'] < 1
|
|
78
|
-
fail ArgumentError, 'invalid value for "opts[:"timeout"]" when calling SelectedHTMLApi.get_selected, must be greater than or equal to 1.'
|
|
79
|
-
end
|
|
80
|
-
|
|
81
|
-
if @api_client.config.client_side_validation && !opts[:'js_timeout'].nil? && opts[:'js_timeout'] > 20000
|
|
82
|
-
fail ArgumentError, 'invalid value for "opts[:"js_timeout"]" when calling SelectedHTMLApi.get_selected, must be smaller than or equal to 20000.'
|
|
83
|
-
end
|
|
84
|
-
|
|
85
|
-
if @api_client.config.client_side_validation && !opts[:'js_timeout'].nil? && opts[:'js_timeout'] < 1
|
|
86
|
-
fail ArgumentError, 'invalid value for "opts[:"js_timeout"]" when calling SelectedHTMLApi.get_selected, must be greater than or equal to 1.'
|
|
87
|
-
end
|
|
88
|
-
|
|
89
|
-
allowable_values = ["datacenter", "residential"]
|
|
90
|
-
if @api_client.config.client_side_validation && opts[:'proxy'] && !allowable_values.include?(opts[:'proxy'])
|
|
91
|
-
fail ArgumentError, "invalid value for \"proxy\", must be one of #{allowable_values}"
|
|
92
|
-
end
|
|
93
|
-
allowable_values = ["us", "gb", "de", "it", "fr", "ca", "es", "ru", "jp", "kr", "in"]
|
|
94
|
-
if @api_client.config.client_side_validation && opts[:'country'] && !allowable_values.include?(opts[:'country'])
|
|
95
|
-
fail ArgumentError, "invalid value for \"country\", must be one of #{allowable_values}"
|
|
96
|
-
end
|
|
97
|
-
allowable_values = ["desktop", "mobile", "tablet"]
|
|
98
|
-
if @api_client.config.client_side_validation && opts[:'device'] && !allowable_values.include?(opts[:'device'])
|
|
99
|
-
fail ArgumentError, "invalid value for \"device\", must be one of #{allowable_values}"
|
|
100
|
-
end
|
|
101
|
-
allowable_values = ["json", "text"]
|
|
102
|
-
if @api_client.config.client_side_validation && opts[:'format'] && !allowable_values.include?(opts[:'format'])
|
|
103
|
-
fail ArgumentError, "invalid value for \"format\", must be one of #{allowable_values}"
|
|
104
|
-
end
|
|
105
|
-
# resource path
|
|
106
|
-
local_var_path = '/selected'
|
|
107
|
-
|
|
108
|
-
# query parameters
|
|
109
|
-
query_params = opts[:query_params] || {}
|
|
110
|
-
query_params[:'url'] = url
|
|
111
|
-
query_params[:'selector'] = opts[:'selector'] if !opts[:'selector'].nil?
|
|
112
|
-
query_params[:'headers'] = opts[:'headers'] if !opts[:'headers'].nil?
|
|
113
|
-
query_params[:'timeout'] = opts[:'timeout'] if !opts[:'timeout'].nil?
|
|
114
|
-
query_params[:'js'] = opts[:'js'] if !opts[:'js'].nil?
|
|
115
|
-
query_params[:'js_timeout'] = opts[:'js_timeout'] if !opts[:'js_timeout'].nil?
|
|
116
|
-
query_params[:'wait_for'] = opts[:'wait_for'] if !opts[:'wait_for'].nil?
|
|
117
|
-
query_params[:'proxy'] = opts[:'proxy'] if !opts[:'proxy'].nil?
|
|
118
|
-
query_params[:'country'] = opts[:'country'] if !opts[:'country'].nil?
|
|
119
|
-
query_params[:'custom_proxy'] = opts[:'custom_proxy'] if !opts[:'custom_proxy'].nil?
|
|
120
|
-
query_params[:'device'] = opts[:'device'] if !opts[:'device'].nil?
|
|
121
|
-
query_params[:'error_on_404'] = opts[:'error_on_404'] if !opts[:'error_on_404'].nil?
|
|
122
|
-
query_params[:'error_on_redirect'] = opts[:'error_on_redirect'] if !opts[:'error_on_redirect'].nil?
|
|
123
|
-
query_params[:'js_script'] = opts[:'js_script'] if !opts[:'js_script'].nil?
|
|
124
|
-
query_params[:'format'] = opts[:'format'] if !opts[:'format'].nil?
|
|
125
|
-
|
|
126
|
-
# header parameters
|
|
127
|
-
header_params = opts[:header_params] || {}
|
|
128
|
-
# HTTP header 'Accept' (if needed)
|
|
129
|
-
header_params['Accept'] = @api_client.select_header_accept(['application/json', 'text/html']) unless header_params['Accept']
|
|
130
|
-
|
|
131
|
-
# form parameters
|
|
132
|
-
form_params = opts[:form_params] || {}
|
|
133
|
-
|
|
134
|
-
# http body (model)
|
|
135
|
-
post_body = opts[:debug_body]
|
|
136
|
-
|
|
137
|
-
# return_type
|
|
138
|
-
return_type = opts[:debug_return_type] || 'String'
|
|
139
|
-
|
|
140
|
-
# auth_names
|
|
141
|
-
auth_names = opts[:debug_auth_names] || ['api_key']
|
|
142
|
-
|
|
143
|
-
new_options = opts.merge(
|
|
144
|
-
:operation => :"SelectedHTMLApi.get_selected",
|
|
145
|
-
:header_params => header_params,
|
|
146
|
-
:query_params => query_params,
|
|
147
|
-
:form_params => form_params,
|
|
148
|
-
:body => post_body,
|
|
149
|
-
:auth_names => auth_names,
|
|
150
|
-
:return_type => return_type
|
|
151
|
-
)
|
|
152
|
-
|
|
153
|
-
data, status_code, headers = @api_client.call_api(:GET, local_var_path, new_options)
|
|
154
|
-
if @api_client.config.debugging
|
|
155
|
-
@api_client.config.logger.debug "API called: SelectedHTMLApi#get_selected\nData: #{data.inspect}\nStatus code: #{status_code}\nHeaders: #{headers}"
|
|
156
|
-
end
|
|
157
|
-
return data, status_code, headers
|
|
158
|
-
end
|
|
159
|
-
|
|
160
|
-
# HTML of multiple page areas by URL and CSS selectors
|
|
161
|
-
# Returns HTML of multiple page areas by URL and CSS selectors. Useful if you don't want to do the HTML parsing on your side.
|
|
162
|
-
# @param url [String] URL of the target page.
|
|
163
|
-
# @param [Hash] opts the optional parameters
|
|
164
|
-
# @option opts [Array<String>] :selectors Multiple CSS selectors (null by default, returns whole page HTML)
|
|
165
|
-
# @option opts [Hash<String, String>] :headers HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
|
166
|
-
# @option opts [Integer] :timeout Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). (default to 10000)
|
|
167
|
-
# @option opts [Boolean] :js Execute on-page JavaScript using a headless browser (true by default). (default to true)
|
|
168
|
-
# @option opts [Integer] :js_timeout Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page. (default to 2000)
|
|
169
|
-
# @option opts [String] :wait_for CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
|
170
|
-
# @option opts [String] :proxy Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. (default to 'datacenter')
|
|
171
|
-
# @option opts [String] :country Country of the proxy to use (US by default). (default to 'us')
|
|
172
|
-
# @option opts [String] :custom_proxy Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
|
173
|
-
# @option opts [String] :device Type of device emulation. (default to 'desktop')
|
|
174
|
-
# @option opts [Boolean] :error_on_404 Return error on 404 HTTP status on the target page (false by default). (default to false)
|
|
175
|
-
# @option opts [Boolean] :error_on_redirect Return error on redirect on the target page (false by default). (default to false)
|
|
176
|
-
# @option opts [String] :js_script Custom JavaScript code to execute on the target page.
|
|
177
|
-
# @return [Array<String>]
|
|
178
|
-
def get_selected_multiple(url, opts = {})
|
|
179
|
-
data, _status_code, _headers = get_selected_multiple_with_http_info(url, opts)
|
|
180
|
-
data
|
|
181
|
-
end
|
|
182
|
-
|
|
183
|
-
# HTML of multiple page areas by URL and CSS selectors
|
|
184
|
-
# Returns HTML of multiple page areas by URL and CSS selectors. Useful if you don't want to do the HTML parsing on your side.
|
|
185
|
-
# @param url [String] URL of the target page.
|
|
186
|
-
# @param [Hash] opts the optional parameters
|
|
187
|
-
# @option opts [Array<String>] :selectors Multiple CSS selectors (null by default, returns whole page HTML)
|
|
188
|
-
# @option opts [Hash<String, String>] :headers HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
|
189
|
-
# @option opts [Integer] :timeout Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). (default to 10000)
|
|
190
|
-
# @option opts [Boolean] :js Execute on-page JavaScript using a headless browser (true by default). (default to true)
|
|
191
|
-
# @option opts [Integer] :js_timeout Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page. (default to 2000)
|
|
192
|
-
# @option opts [String] :wait_for CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
|
193
|
-
# @option opts [String] :proxy Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. (default to 'datacenter')
|
|
194
|
-
# @option opts [String] :country Country of the proxy to use (US by default). (default to 'us')
|
|
195
|
-
# @option opts [String] :custom_proxy Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
|
196
|
-
# @option opts [String] :device Type of device emulation. (default to 'desktop')
|
|
197
|
-
# @option opts [Boolean] :error_on_404 Return error on 404 HTTP status on the target page (false by default). (default to false)
|
|
198
|
-
# @option opts [Boolean] :error_on_redirect Return error on redirect on the target page (false by default). (default to false)
|
|
199
|
-
# @option opts [String] :js_script Custom JavaScript code to execute on the target page.
|
|
200
|
-
# @return [Array<(Array<String>, Integer, Hash)>] Array<String> data, response status code and response headers
|
|
201
|
-
def get_selected_multiple_with_http_info(url, opts = {})
|
|
202
|
-
if @api_client.config.debugging
|
|
203
|
-
@api_client.config.logger.debug 'Calling API: SelectedHTMLApi.get_selected_multiple ...'
|
|
204
|
-
end
|
|
205
|
-
# verify the required parameter 'url' is set
|
|
206
|
-
if @api_client.config.client_side_validation && url.nil?
|
|
207
|
-
fail ArgumentError, "Missing the required parameter 'url' when calling SelectedHTMLApi.get_selected_multiple"
|
|
208
|
-
end
|
|
209
|
-
if @api_client.config.client_side_validation && !opts[:'timeout'].nil? && opts[:'timeout'] > 30000
|
|
210
|
-
fail ArgumentError, 'invalid value for "opts[:"timeout"]" when calling SelectedHTMLApi.get_selected_multiple, must be smaller than or equal to 30000.'
|
|
211
|
-
end
|
|
212
|
-
|
|
213
|
-
if @api_client.config.client_side_validation && !opts[:'timeout'].nil? && opts[:'timeout'] < 1
|
|
214
|
-
fail ArgumentError, 'invalid value for "opts[:"timeout"]" when calling SelectedHTMLApi.get_selected_multiple, must be greater than or equal to 1.'
|
|
215
|
-
end
|
|
216
|
-
|
|
217
|
-
if @api_client.config.client_side_validation && !opts[:'js_timeout'].nil? && opts[:'js_timeout'] > 20000
|
|
218
|
-
fail ArgumentError, 'invalid value for "opts[:"js_timeout"]" when calling SelectedHTMLApi.get_selected_multiple, must be smaller than or equal to 20000.'
|
|
219
|
-
end
|
|
220
|
-
|
|
221
|
-
if @api_client.config.client_side_validation && !opts[:'js_timeout'].nil? && opts[:'js_timeout'] < 1
|
|
222
|
-
fail ArgumentError, 'invalid value for "opts[:"js_timeout"]" when calling SelectedHTMLApi.get_selected_multiple, must be greater than or equal to 1.'
|
|
223
|
-
end
|
|
224
|
-
|
|
225
|
-
allowable_values = ["datacenter", "residential"]
|
|
226
|
-
if @api_client.config.client_side_validation && opts[:'proxy'] && !allowable_values.include?(opts[:'proxy'])
|
|
227
|
-
fail ArgumentError, "invalid value for \"proxy\", must be one of #{allowable_values}"
|
|
228
|
-
end
|
|
229
|
-
allowable_values = ["us", "gb", "de", "it", "fr", "ca", "es", "ru", "jp", "kr", "in"]
|
|
230
|
-
if @api_client.config.client_side_validation && opts[:'country'] && !allowable_values.include?(opts[:'country'])
|
|
231
|
-
fail ArgumentError, "invalid value for \"country\", must be one of #{allowable_values}"
|
|
232
|
-
end
|
|
233
|
-
allowable_values = ["desktop", "mobile", "tablet"]
|
|
234
|
-
if @api_client.config.client_side_validation && opts[:'device'] && !allowable_values.include?(opts[:'device'])
|
|
235
|
-
fail ArgumentError, "invalid value for \"device\", must be one of #{allowable_values}"
|
|
236
|
-
end
|
|
237
|
-
# resource path
|
|
238
|
-
local_var_path = '/selected-multiple'
|
|
239
|
-
|
|
240
|
-
# query parameters
|
|
241
|
-
query_params = opts[:query_params] || {}
|
|
242
|
-
query_params[:'url'] = url
|
|
243
|
-
query_params[:'selectors'] = @api_client.build_collection_param(opts[:'selectors'], :multi) if !opts[:'selectors'].nil?
|
|
244
|
-
query_params[:'headers'] = opts[:'headers'] if !opts[:'headers'].nil?
|
|
245
|
-
query_params[:'timeout'] = opts[:'timeout'] if !opts[:'timeout'].nil?
|
|
246
|
-
query_params[:'js'] = opts[:'js'] if !opts[:'js'].nil?
|
|
247
|
-
query_params[:'js_timeout'] = opts[:'js_timeout'] if !opts[:'js_timeout'].nil?
|
|
248
|
-
query_params[:'wait_for'] = opts[:'wait_for'] if !opts[:'wait_for'].nil?
|
|
249
|
-
query_params[:'proxy'] = opts[:'proxy'] if !opts[:'proxy'].nil?
|
|
250
|
-
query_params[:'country'] = opts[:'country'] if !opts[:'country'].nil?
|
|
251
|
-
query_params[:'custom_proxy'] = opts[:'custom_proxy'] if !opts[:'custom_proxy'].nil?
|
|
252
|
-
query_params[:'device'] = opts[:'device'] if !opts[:'device'].nil?
|
|
253
|
-
query_params[:'error_on_404'] = opts[:'error_on_404'] if !opts[:'error_on_404'].nil?
|
|
254
|
-
query_params[:'error_on_redirect'] = opts[:'error_on_redirect'] if !opts[:'error_on_redirect'].nil?
|
|
255
|
-
query_params[:'js_script'] = opts[:'js_script'] if !opts[:'js_script'].nil?
|
|
256
|
-
|
|
257
|
-
# header parameters
|
|
258
|
-
header_params = opts[:header_params] || {}
|
|
259
|
-
# HTTP header 'Accept' (if needed)
|
|
260
|
-
header_params['Accept'] = @api_client.select_header_accept(['application/json']) unless header_params['Accept']
|
|
261
|
-
|
|
262
|
-
# form parameters
|
|
263
|
-
form_params = opts[:form_params] || {}
|
|
264
|
-
|
|
265
|
-
# http body (model)
|
|
266
|
-
post_body = opts[:debug_body]
|
|
267
|
-
|
|
268
|
-
# return_type
|
|
269
|
-
return_type = opts[:debug_return_type] || 'Array<String>'
|
|
270
|
-
|
|
271
|
-
# auth_names
|
|
272
|
-
auth_names = opts[:debug_auth_names] || ['api_key']
|
|
273
|
-
|
|
274
|
-
new_options = opts.merge(
|
|
275
|
-
:operation => :"SelectedHTMLApi.get_selected_multiple",
|
|
276
|
-
:header_params => header_params,
|
|
277
|
-
:query_params => query_params,
|
|
278
|
-
:form_params => form_params,
|
|
279
|
-
:body => post_body,
|
|
280
|
-
:auth_names => auth_names,
|
|
281
|
-
:return_type => return_type
|
|
282
|
-
)
|
|
283
|
-
|
|
284
|
-
data, status_code, headers = @api_client.call_api(:GET, local_var_path, new_options)
|
|
285
|
-
if @api_client.config.debugging
|
|
286
|
-
@api_client.config.logger.debug "API called: SelectedHTMLApi#get_selected_multiple\nData: #{data.inspect}\nStatus code: #{status_code}\nHeaders: #{headers}"
|
|
287
|
-
end
|
|
288
|
-
return data, status_code, headers
|
|
289
|
-
end
|
|
290
|
-
end
|
|
291
|
-
end
|
|
@@ -1,160 +0,0 @@
|
|
|
1
|
-
=begin
|
|
2
|
-
#WebScraping.AI
|
|
3
|
-
|
|
4
|
-
#WebScraping.AI scraping API provides LLM-powered tools with Chromium JavaScript rendering, rotating proxies, and built-in HTML parsing.
|
|
5
|
-
|
|
6
|
-
The version of the OpenAPI document: 3.2.0
|
|
7
|
-
Contact: support@webscraping.ai
|
|
8
|
-
Generated by: https://openapi-generator.tech
|
|
9
|
-
Generator version: 7.11.0
|
|
10
|
-
|
|
11
|
-
=end
|
|
12
|
-
|
|
13
|
-
require 'cgi'
|
|
14
|
-
|
|
15
|
-
module WebScrapingAI
|
|
16
|
-
class TextApi
|
|
17
|
-
attr_accessor :api_client
|
|
18
|
-
|
|
19
|
-
def initialize(api_client = ApiClient.default)
|
|
20
|
-
@api_client = api_client
|
|
21
|
-
end
|
|
22
|
-
# Page text by URL
|
|
23
|
-
# Returns the visible text content of a webpage specified by the URL. Can be used to feed data to LLM models. The response can be in plain text, JSON, or XML format based on the text_format parameter. Proxies and Chromium JavaScript rendering are used for page retrieval and processing. Returns JSON on error.
|
|
24
|
-
# @param url [String] URL of the target page.
|
|
25
|
-
# @param [Hash] opts the optional parameters
|
|
26
|
-
# @option opts [String] :text_format Format of the text response (plain by default). \"plain\" will return only the page body text. \"json\" and \"xml\" will return a json/xml with \"title\", \"description\" and \"content\" keys. (default to 'plain')
|
|
27
|
-
# @option opts [Boolean] :return_links [Works only with text_format=json] Return links from the page body text (false by default). Useful for building web crawlers. (default to false)
|
|
28
|
-
# @option opts [Hash<String, String>] :headers HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
|
29
|
-
# @option opts [Integer] :timeout Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). (default to 10000)
|
|
30
|
-
# @option opts [Boolean] :js Execute on-page JavaScript using a headless browser (true by default). (default to true)
|
|
31
|
-
# @option opts [Integer] :js_timeout Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page. (default to 2000)
|
|
32
|
-
# @option opts [String] :wait_for CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
|
33
|
-
# @option opts [String] :proxy Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. (default to 'datacenter')
|
|
34
|
-
# @option opts [String] :country Country of the proxy to use (US by default). (default to 'us')
|
|
35
|
-
# @option opts [String] :custom_proxy Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
|
36
|
-
# @option opts [String] :device Type of device emulation. (default to 'desktop')
|
|
37
|
-
# @option opts [Boolean] :error_on_404 Return error on 404 HTTP status on the target page (false by default). (default to false)
|
|
38
|
-
# @option opts [Boolean] :error_on_redirect Return error on redirect on the target page (false by default). (default to false)
|
|
39
|
-
# @option opts [String] :js_script Custom JavaScript code to execute on the target page.
|
|
40
|
-
# @return [String]
|
|
41
|
-
def get_text(url, opts = {})
|
|
42
|
-
data, _status_code, _headers = get_text_with_http_info(url, opts)
|
|
43
|
-
data
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
# Page text by URL
|
|
47
|
-
# Returns the visible text content of a webpage specified by the URL. Can be used to feed data to LLM models. The response can be in plain text, JSON, or XML format based on the text_format parameter. Proxies and Chromium JavaScript rendering are used for page retrieval and processing. Returns JSON on error.
|
|
48
|
-
# @param url [String] URL of the target page.
|
|
49
|
-
# @param [Hash] opts the optional parameters
|
|
50
|
-
# @option opts [String] :text_format Format of the text response (plain by default). \"plain\" will return only the page body text. \"json\" and \"xml\" will return a json/xml with \"title\", \"description\" and \"content\" keys. (default to 'plain')
|
|
51
|
-
# @option opts [Boolean] :return_links [Works only with text_format=json] Return links from the page body text (false by default). Useful for building web crawlers. (default to false)
|
|
52
|
-
# @option opts [Hash<String, String>] :headers HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
|
53
|
-
# @option opts [Integer] :timeout Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). (default to 10000)
|
|
54
|
-
# @option opts [Boolean] :js Execute on-page JavaScript using a headless browser (true by default). (default to true)
|
|
55
|
-
# @option opts [Integer] :js_timeout Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page. (default to 2000)
|
|
56
|
-
# @option opts [String] :wait_for CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
|
57
|
-
# @option opts [String] :proxy Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. (default to 'datacenter')
|
|
58
|
-
# @option opts [String] :country Country of the proxy to use (US by default). (default to 'us')
|
|
59
|
-
# @option opts [String] :custom_proxy Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
|
60
|
-
# @option opts [String] :device Type of device emulation. (default to 'desktop')
|
|
61
|
-
# @option opts [Boolean] :error_on_404 Return error on 404 HTTP status on the target page (false by default). (default to false)
|
|
62
|
-
# @option opts [Boolean] :error_on_redirect Return error on redirect on the target page (false by default). (default to false)
|
|
63
|
-
# @option opts [String] :js_script Custom JavaScript code to execute on the target page.
|
|
64
|
-
# @return [Array<(String, Integer, Hash)>] String data, response status code and response headers
|
|
65
|
-
def get_text_with_http_info(url, opts = {})
  # Validates the options on the client side (when enabled in the config),
  # builds the query/header parameters for GET /text and delegates the HTTP
  # round trip to @api_client.call_api.
  #
  # Returns [data, status_code, headers] exactly as produced by call_api.
  # Raises ArgumentError when client-side validation is on and `url` is nil or
  # one of the constrained options is out of range / not an allowed value.
  config = @api_client.config
  config.logger.debug 'Calling API: TextApi.get_text ...' if config.debugging

  if config.client_side_validation
    # Enumerated option: only checked when a truthy value was supplied.
    check_enum = lambda do |name, allowable_values|
      value = opts[name]
      if value && !allowable_values.include?(value)
        fail ArgumentError, "invalid value for \"#{name}\", must be one of #{allowable_values}"
      end
    end

    # Bounded numeric option: upper bound is checked before the lower bound.
    check_range = lambda do |name, min, max|
      value = opts[name]
      unless value.nil?
        if value > max
          fail ArgumentError, "invalid value for \"opts[:\"#{name}\"]\" when calling TextApi.get_text, must be smaller than or equal to #{max}."
        end
        if value < min
          fail ArgumentError, "invalid value for \"opts[:\"#{name}\"]\" when calling TextApi.get_text, must be greater than or equal to #{min}."
        end
      end
    end

    # verify the required parameter 'url' is set
    if url.nil?
      fail ArgumentError, "Missing the required parameter 'url' when calling TextApi.get_text"
    end

    # The order below determines which error is reported first when several
    # options are invalid at once.
    check_enum.call(:text_format, ["plain", "xml", "json"])
    check_range.call(:timeout, 1, 30000)
    check_range.call(:js_timeout, 1, 20000)
    check_enum.call(:proxy, ["datacenter", "residential"])
    check_enum.call(:country, ["us", "gb", "de", "it", "fr", "ca", "es", "ru", "jp", "kr", "in"])
    check_enum.call(:device, ["desktop", "mobile", "tablet"])
  end

  # resource path
  local_var_path = '/text'

  # query parameters
  # Work on a copy: a caller-supplied :query_params hash must not be mutated
  # (it may be frozen or shared between calls).
  query_params = (opts[:query_params] || {}).dup
  query_params[:url] = url
  %i[
    text_format return_links headers timeout js js_timeout wait_for proxy
    country custom_proxy device error_on_404 error_on_redirect js_script
  ].each do |name|
    # `false` is a meaningful value for the boolean flags, so only nil is skipped.
    query_params[name] = opts[name] unless opts[name].nil?
  end

  # header parameters (copied for the same reason as the query parameters)
  header_params = (opts[:header_params] || {}).dup
  # HTTP header 'Accept' (if needed)
  header_params['Accept'] ||= @api_client.select_header_accept(['application/json', 'text/html', 'text/xml'])

  # form parameters
  form_params = opts[:form_params] || {}

  # http body (model)
  post_body = opts[:debug_body]

  # return_type
  return_type = opts[:debug_return_type] || 'String'

  # auth_names
  auth_names = opts[:debug_auth_names] || ['api_key']

  new_options = opts.merge(
    :operation => :"TextApi.get_text",
    :header_params => header_params,
    :query_params => query_params,
    :form_params => form_params,
    :body => post_body,
    :auth_names => auth_names,
    :return_type => return_type
  )

  data, status_code, headers = @api_client.call_api(:GET, local_var_path, new_options)
  if config.debugging
    config.logger.debug "API called: TextApi#get_text\nData: #{data.inspect}\nStatus code: #{status_code}\nHeaders: #{headers}"
  end
  [data, status_code, headers]
end
|
|
159
|
-
end
|
|
160
|
-
end
|