webscraping_ai 3.2.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +25 -0
- data/LICENSE +21 -0
- data/README.md +110 -85
- data/lib/webscraping_ai/client.rb +130 -0
- data/lib/webscraping_ai/configuration.rb +10 -300
- data/lib/webscraping_ai/errors.rb +44 -0
- data/lib/webscraping_ai/query_encoder.rb +74 -0
- data/lib/webscraping_ai/version.rb +1 -13
- data/lib/webscraping_ai.rb +15 -39
- data/webscraping_ai.gemspec +33 -36
- metadata +23 -72
- data/Gemfile +0 -9
- data/Rakefile +0 -10
- data/docs/AIApi.md +0 -209
- data/docs/Account.md +0 -24
- data/docs/AccountApi.md +0 -76
- data/docs/Error.md +0 -24
- data/docs/HTMLApi.md +0 -109
- data/docs/SelectedHTMLApi.md +0 -209
- data/docs/TextApi.md +0 -109
- data/git_push.sh +0 -57
- data/lib/webscraping_ai/api/account_api.rb +0 -79
- data/lib/webscraping_ai/api/ai_api.rb +0 -295
- data/lib/webscraping_ai/api/html_api.rb +0 -160
- data/lib/webscraping_ai/api/selected_html_api.rb +0 -291
- data/lib/webscraping_ai/api/text_api.rb +0 -160
- data/lib/webscraping_ai/api_client.rb +0 -394
- data/lib/webscraping_ai/api_error.rb +0 -58
- data/lib/webscraping_ai/models/account.rb +0 -245
- data/lib/webscraping_ai/models/error.rb +0 -245
- data/spec/api/account_api_spec.rb +0 -46
- data/spec/api/ai_api_spec.rb +0 -86
- data/spec/api/html_api_spec.rb +0 -61
- data/spec/api/selected_html_api_spec.rb +0 -86
- data/spec/api/text_api_spec.rb +0 -61
- data/spec/models/account_spec.rb +0 -54
- data/spec/models/error_spec.rb +0 -54
- data/spec/spec_helper.rb +0 -111
|
@@ -1,295 +0,0 @@
|
|
|
1
|
-
=begin
|
|
2
|
-
#WebScraping.AI
|
|
3
|
-
|
|
4
|
-
#WebScraping.AI scraping API provides LLM-powered tools with Chromium JavaScript rendering, rotating proxies, and built-in HTML parsing.
|
|
5
|
-
|
|
6
|
-
The version of the OpenAPI document: 3.2.0
|
|
7
|
-
Contact: support@webscraping.ai
|
|
8
|
-
Generated by: https://openapi-generator.tech
|
|
9
|
-
Generator version: 7.11.0
|
|
10
|
-
|
|
11
|
-
=end
|
|
12
|
-
|
|
13
|
-
require 'cgi'
|
|
14
|
-
|
|
15
|
-
module WebScrapingAI
|
|
16
|
-
class AIApi
|
|
17
|
-
attr_accessor :api_client
|
|
18
|
-
|
|
19
|
-
def initialize(api_client = ApiClient.default)
|
|
20
|
-
@api_client = api_client
|
|
21
|
-
end
|
|
22
|
-
# Extract structured data fields from a web page
|
|
23
|
-
# Returns structured data fields extracted from the webpage using an LLM model. Proxies and Chromium JavaScript rendering are used for page retrieval and processing.
|
|
24
|
-
# @param url [String] URL of the target page.
|
|
25
|
-
# @param fields [Hash<String, String>] Object describing fields to extract from the page and their descriptions
|
|
26
|
-
# @param [Hash] opts the optional parameters
|
|
27
|
-
# @option opts [Hash<String, String>] :headers HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
|
28
|
-
# @option opts [Integer] :timeout Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). (default to 10000)
|
|
29
|
-
# @option opts [Boolean] :js Execute on-page JavaScript using a headless browser (true by default). (default to true)
|
|
30
|
-
# @option opts [Integer] :js_timeout Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page. (default to 2000)
|
|
31
|
-
# @option opts [String] :wait_for CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
|
32
|
-
# @option opts [String] :proxy Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. (default to 'datacenter')
|
|
33
|
-
# @option opts [String] :country Country of the proxy to use (US by default). (default to 'us')
|
|
34
|
-
# @option opts [String] :custom_proxy Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
|
35
|
-
# @option opts [String] :device Type of device emulation. (default to 'desktop')
|
|
36
|
-
# @option opts [Boolean] :error_on_404 Return error on 404 HTTP status on the target page (false by default). (default to false)
|
|
37
|
-
# @option opts [Boolean] :error_on_redirect Return error on redirect on the target page (false by default). (default to false)
|
|
38
|
-
# @option opts [String] :js_script Custom JavaScript code to execute on the target page.
|
|
39
|
-
# @return [Hash<String, String>]
|
|
40
|
-
def get_fields(url, fields, opts = {})
|
|
41
|
-
data, _status_code, _headers = get_fields_with_http_info(url, fields, opts)
|
|
42
|
-
data
|
|
43
|
-
end
|
|
44
|
-
|
|
45
|
-
# Extract structured data fields from a web page
|
|
46
|
-
# Returns structured data fields extracted from the webpage using an LLM model. Proxies and Chromium JavaScript rendering are used for page retrieval and processing.
|
|
47
|
-
# @param url [String] URL of the target page.
|
|
48
|
-
# @param fields [Hash<String, String>] Object describing fields to extract from the page and their descriptions
|
|
49
|
-
# @param [Hash] opts the optional parameters
|
|
50
|
-
# @option opts [Hash<String, String>] :headers HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
|
51
|
-
# @option opts [Integer] :timeout Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). (default to 10000)
|
|
52
|
-
# @option opts [Boolean] :js Execute on-page JavaScript using a headless browser (true by default). (default to true)
|
|
53
|
-
# @option opts [Integer] :js_timeout Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page. (default to 2000)
|
|
54
|
-
# @option opts [String] :wait_for CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
|
55
|
-
# @option opts [String] :proxy Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. (default to 'datacenter')
|
|
56
|
-
# @option opts [String] :country Country of the proxy to use (US by default). (default to 'us')
|
|
57
|
-
# @option opts [String] :custom_proxy Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
|
58
|
-
# @option opts [String] :device Type of device emulation. (default to 'desktop')
|
|
59
|
-
# @option opts [Boolean] :error_on_404 Return error on 404 HTTP status on the target page (false by default). (default to false)
|
|
60
|
-
# @option opts [Boolean] :error_on_redirect Return error on redirect on the target page (false by default). (default to false)
|
|
61
|
-
# @option opts [String] :js_script Custom JavaScript code to execute on the target page.
|
|
62
|
-
# @return [Array<(Hash<String, String>, Integer, Hash)>] Hash<String, String> data, response status code and response headers
|
|
63
|
-
def get_fields_with_http_info(url, fields, opts = {})
|
|
64
|
-
if @api_client.config.debugging
|
|
65
|
-
@api_client.config.logger.debug 'Calling API: AIApi.get_fields ...'
|
|
66
|
-
end
|
|
67
|
-
# verify the required parameter 'url' is set
|
|
68
|
-
if @api_client.config.client_side_validation && url.nil?
|
|
69
|
-
fail ArgumentError, "Missing the required parameter 'url' when calling AIApi.get_fields"
|
|
70
|
-
end
|
|
71
|
-
# verify the required parameter 'fields' is set
|
|
72
|
-
if @api_client.config.client_side_validation && fields.nil?
|
|
73
|
-
fail ArgumentError, "Missing the required parameter 'fields' when calling AIApi.get_fields"
|
|
74
|
-
end
|
|
75
|
-
if @api_client.config.client_side_validation && !opts[:'timeout'].nil? && opts[:'timeout'] > 30000
|
|
76
|
-
fail ArgumentError, 'invalid value for "opts[:"timeout"]" when calling AIApi.get_fields, must be smaller than or equal to 30000.'
|
|
77
|
-
end
|
|
78
|
-
|
|
79
|
-
if @api_client.config.client_side_validation && !opts[:'timeout'].nil? && opts[:'timeout'] < 1
|
|
80
|
-
fail ArgumentError, 'invalid value for "opts[:"timeout"]" when calling AIApi.get_fields, must be greater than or equal to 1.'
|
|
81
|
-
end
|
|
82
|
-
|
|
83
|
-
if @api_client.config.client_side_validation && !opts[:'js_timeout'].nil? && opts[:'js_timeout'] > 20000
|
|
84
|
-
fail ArgumentError, 'invalid value for "opts[:"js_timeout"]" when calling AIApi.get_fields, must be smaller than or equal to 20000.'
|
|
85
|
-
end
|
|
86
|
-
|
|
87
|
-
if @api_client.config.client_side_validation && !opts[:'js_timeout'].nil? && opts[:'js_timeout'] < 1
|
|
88
|
-
fail ArgumentError, 'invalid value for "opts[:"js_timeout"]" when calling AIApi.get_fields, must be greater than or equal to 1.'
|
|
89
|
-
end
|
|
90
|
-
|
|
91
|
-
allowable_values = ["datacenter", "residential"]
|
|
92
|
-
if @api_client.config.client_side_validation && opts[:'proxy'] && !allowable_values.include?(opts[:'proxy'])
|
|
93
|
-
fail ArgumentError, "invalid value for \"proxy\", must be one of #{allowable_values}"
|
|
94
|
-
end
|
|
95
|
-
allowable_values = ["us", "gb", "de", "it", "fr", "ca", "es", "ru", "jp", "kr", "in"]
|
|
96
|
-
if @api_client.config.client_side_validation && opts[:'country'] && !allowable_values.include?(opts[:'country'])
|
|
97
|
-
fail ArgumentError, "invalid value for \"country\", must be one of #{allowable_values}"
|
|
98
|
-
end
|
|
99
|
-
allowable_values = ["desktop", "mobile", "tablet"]
|
|
100
|
-
if @api_client.config.client_side_validation && opts[:'device'] && !allowable_values.include?(opts[:'device'])
|
|
101
|
-
fail ArgumentError, "invalid value for \"device\", must be one of #{allowable_values}"
|
|
102
|
-
end
|
|
103
|
-
# resource path
|
|
104
|
-
local_var_path = '/ai/fields'
|
|
105
|
-
|
|
106
|
-
# query parameters
|
|
107
|
-
query_params = opts[:query_params] || {}
|
|
108
|
-
query_params[:'url'] = url
|
|
109
|
-
query_params[:'fields'] = fields
|
|
110
|
-
query_params[:'headers'] = opts[:'headers'] if !opts[:'headers'].nil?
|
|
111
|
-
query_params[:'timeout'] = opts[:'timeout'] if !opts[:'timeout'].nil?
|
|
112
|
-
query_params[:'js'] = opts[:'js'] if !opts[:'js'].nil?
|
|
113
|
-
query_params[:'js_timeout'] = opts[:'js_timeout'] if !opts[:'js_timeout'].nil?
|
|
114
|
-
query_params[:'wait_for'] = opts[:'wait_for'] if !opts[:'wait_for'].nil?
|
|
115
|
-
query_params[:'proxy'] = opts[:'proxy'] if !opts[:'proxy'].nil?
|
|
116
|
-
query_params[:'country'] = opts[:'country'] if !opts[:'country'].nil?
|
|
117
|
-
query_params[:'custom_proxy'] = opts[:'custom_proxy'] if !opts[:'custom_proxy'].nil?
|
|
118
|
-
query_params[:'device'] = opts[:'device'] if !opts[:'device'].nil?
|
|
119
|
-
query_params[:'error_on_404'] = opts[:'error_on_404'] if !opts[:'error_on_404'].nil?
|
|
120
|
-
query_params[:'error_on_redirect'] = opts[:'error_on_redirect'] if !opts[:'error_on_redirect'].nil?
|
|
121
|
-
query_params[:'js_script'] = opts[:'js_script'] if !opts[:'js_script'].nil?
|
|
122
|
-
|
|
123
|
-
# header parameters
|
|
124
|
-
header_params = opts[:header_params] || {}
|
|
125
|
-
# HTTP header 'Accept' (if needed)
|
|
126
|
-
header_params['Accept'] = @api_client.select_header_accept(['application/json']) unless header_params['Accept']
|
|
127
|
-
|
|
128
|
-
# form parameters
|
|
129
|
-
form_params = opts[:form_params] || {}
|
|
130
|
-
|
|
131
|
-
# http body (model)
|
|
132
|
-
post_body = opts[:debug_body]
|
|
133
|
-
|
|
134
|
-
# return_type
|
|
135
|
-
return_type = opts[:debug_return_type] || 'Hash<String, String>'
|
|
136
|
-
|
|
137
|
-
# auth_names
|
|
138
|
-
auth_names = opts[:debug_auth_names] || ['api_key']
|
|
139
|
-
|
|
140
|
-
new_options = opts.merge(
|
|
141
|
-
:operation => :"AIApi.get_fields",
|
|
142
|
-
:header_params => header_params,
|
|
143
|
-
:query_params => query_params,
|
|
144
|
-
:form_params => form_params,
|
|
145
|
-
:body => post_body,
|
|
146
|
-
:auth_names => auth_names,
|
|
147
|
-
:return_type => return_type
|
|
148
|
-
)
|
|
149
|
-
|
|
150
|
-
data, status_code, headers = @api_client.call_api(:GET, local_var_path, new_options)
|
|
151
|
-
if @api_client.config.debugging
|
|
152
|
-
@api_client.config.logger.debug "API called: AIApi#get_fields\nData: #{data.inspect}\nStatus code: #{status_code}\nHeaders: #{headers}"
|
|
153
|
-
end
|
|
154
|
-
return data, status_code, headers
|
|
155
|
-
end
|
|
156
|
-
|
|
157
|
-
# Get an answer to a question about a given web page
|
|
158
|
-
# Returns the answer in plain text. Proxies and Chromium JavaScript rendering are used for page retrieval and processing, then the answer is extracted using an LLM model.
|
|
159
|
-
# @param url [String] URL of the target page.
|
|
160
|
-
# @param [Hash] opts the optional parameters
|
|
161
|
-
# @option opts [String] :question Question or instructions to ask the LLM model about the target page.
|
|
162
|
-
# @option opts [Hash<String, String>] :headers HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
|
163
|
-
# @option opts [Integer] :timeout Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). (default to 10000)
|
|
164
|
-
# @option opts [Boolean] :js Execute on-page JavaScript using a headless browser (true by default). (default to true)
|
|
165
|
-
# @option opts [Integer] :js_timeout Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page. (default to 2000)
|
|
166
|
-
# @option opts [String] :wait_for CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
|
167
|
-
# @option opts [String] :proxy Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. (default to 'datacenter')
|
|
168
|
-
# @option opts [String] :country Country of the proxy to use (US by default). (default to 'us')
|
|
169
|
-
# @option opts [String] :custom_proxy Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
|
170
|
-
# @option opts [String] :device Type of device emulation. (default to 'desktop')
|
|
171
|
-
# @option opts [Boolean] :error_on_404 Return error on 404 HTTP status on the target page (false by default). (default to false)
|
|
172
|
-
# @option opts [Boolean] :error_on_redirect Return error on redirect on the target page (false by default). (default to false)
|
|
173
|
-
# @option opts [String] :js_script Custom JavaScript code to execute on the target page.
|
|
174
|
-
# @option opts [String] :format Format of the response (text by default). \"json\" will return a JSON object with the response, \"text\" will return a plain text/HTML response. (default to 'json')
|
|
175
|
-
# @return [String]
|
|
176
|
-
def get_question(url, opts = {})
|
|
177
|
-
data, _status_code, _headers = get_question_with_http_info(url, opts)
|
|
178
|
-
data
|
|
179
|
-
end
|
|
180
|
-
|
|
181
|
-
# Get an answer to a question about a given web page
|
|
182
|
-
# Returns the answer in plain text. Proxies and Chromium JavaScript rendering are used for page retrieval and processing, then the answer is extracted using an LLM model.
|
|
183
|
-
# @param url [String] URL of the target page.
|
|
184
|
-
# @param [Hash] opts the optional parameters
|
|
185
|
-
# @option opts [String] :question Question or instructions to ask the LLM model about the target page.
|
|
186
|
-
# @option opts [Hash<String, String>] :headers HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
|
187
|
-
# @option opts [Integer] :timeout Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). (default to 10000)
|
|
188
|
-
# @option opts [Boolean] :js Execute on-page JavaScript using a headless browser (true by default). (default to true)
|
|
189
|
-
# @option opts [Integer] :js_timeout Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page. (default to 2000)
|
|
190
|
-
# @option opts [String] :wait_for CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
|
191
|
-
# @option opts [String] :proxy Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. (default to 'datacenter')
|
|
192
|
-
# @option opts [String] :country Country of the proxy to use (US by default). (default to 'us')
|
|
193
|
-
# @option opts [String] :custom_proxy Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
|
194
|
-
# @option opts [String] :device Type of device emulation. (default to 'desktop')
|
|
195
|
-
# @option opts [Boolean] :error_on_404 Return error on 404 HTTP status on the target page (false by default). (default to false)
|
|
196
|
-
# @option opts [Boolean] :error_on_redirect Return error on redirect on the target page (false by default). (default to false)
|
|
197
|
-
# @option opts [String] :js_script Custom JavaScript code to execute on the target page.
|
|
198
|
-
# @option opts [String] :format Format of the response (text by default). \"json\" will return a JSON object with the response, \"text\" will return a plain text/HTML response. (default to 'json')
|
|
199
|
-
# @return [Array<(String, Integer, Hash)>] String data, response status code and response headers
|
|
200
|
-
def get_question_with_http_info(url, opts = {})
|
|
201
|
-
if @api_client.config.debugging
|
|
202
|
-
@api_client.config.logger.debug 'Calling API: AIApi.get_question ...'
|
|
203
|
-
end
|
|
204
|
-
# verify the required parameter 'url' is set
|
|
205
|
-
if @api_client.config.client_side_validation && url.nil?
|
|
206
|
-
fail ArgumentError, "Missing the required parameter 'url' when calling AIApi.get_question"
|
|
207
|
-
end
|
|
208
|
-
if @api_client.config.client_side_validation && !opts[:'timeout'].nil? && opts[:'timeout'] > 30000
|
|
209
|
-
fail ArgumentError, 'invalid value for "opts[:"timeout"]" when calling AIApi.get_question, must be smaller than or equal to 30000.'
|
|
210
|
-
end
|
|
211
|
-
|
|
212
|
-
if @api_client.config.client_side_validation && !opts[:'timeout'].nil? && opts[:'timeout'] < 1
|
|
213
|
-
fail ArgumentError, 'invalid value for "opts[:"timeout"]" when calling AIApi.get_question, must be greater than or equal to 1.'
|
|
214
|
-
end
|
|
215
|
-
|
|
216
|
-
if @api_client.config.client_side_validation && !opts[:'js_timeout'].nil? && opts[:'js_timeout'] > 20000
|
|
217
|
-
fail ArgumentError, 'invalid value for "opts[:"js_timeout"]" when calling AIApi.get_question, must be smaller than or equal to 20000.'
|
|
218
|
-
end
|
|
219
|
-
|
|
220
|
-
if @api_client.config.client_side_validation && !opts[:'js_timeout'].nil? && opts[:'js_timeout'] < 1
|
|
221
|
-
fail ArgumentError, 'invalid value for "opts[:"js_timeout"]" when calling AIApi.get_question, must be greater than or equal to 1.'
|
|
222
|
-
end
|
|
223
|
-
|
|
224
|
-
allowable_values = ["datacenter", "residential"]
|
|
225
|
-
if @api_client.config.client_side_validation && opts[:'proxy'] && !allowable_values.include?(opts[:'proxy'])
|
|
226
|
-
fail ArgumentError, "invalid value for \"proxy\", must be one of #{allowable_values}"
|
|
227
|
-
end
|
|
228
|
-
allowable_values = ["us", "gb", "de", "it", "fr", "ca", "es", "ru", "jp", "kr", "in"]
|
|
229
|
-
if @api_client.config.client_side_validation && opts[:'country'] && !allowable_values.include?(opts[:'country'])
|
|
230
|
-
fail ArgumentError, "invalid value for \"country\", must be one of #{allowable_values}"
|
|
231
|
-
end
|
|
232
|
-
allowable_values = ["desktop", "mobile", "tablet"]
|
|
233
|
-
if @api_client.config.client_side_validation && opts[:'device'] && !allowable_values.include?(opts[:'device'])
|
|
234
|
-
fail ArgumentError, "invalid value for \"device\", must be one of #{allowable_values}"
|
|
235
|
-
end
|
|
236
|
-
allowable_values = ["json", "text"]
|
|
237
|
-
if @api_client.config.client_side_validation && opts[:'format'] && !allowable_values.include?(opts[:'format'])
|
|
238
|
-
fail ArgumentError, "invalid value for \"format\", must be one of #{allowable_values}"
|
|
239
|
-
end
|
|
240
|
-
# resource path
|
|
241
|
-
local_var_path = '/ai/question'
|
|
242
|
-
|
|
243
|
-
# query parameters
|
|
244
|
-
query_params = opts[:query_params] || {}
|
|
245
|
-
query_params[:'url'] = url
|
|
246
|
-
query_params[:'question'] = opts[:'question'] if !opts[:'question'].nil?
|
|
247
|
-
query_params[:'headers'] = opts[:'headers'] if !opts[:'headers'].nil?
|
|
248
|
-
query_params[:'timeout'] = opts[:'timeout'] if !opts[:'timeout'].nil?
|
|
249
|
-
query_params[:'js'] = opts[:'js'] if !opts[:'js'].nil?
|
|
250
|
-
query_params[:'js_timeout'] = opts[:'js_timeout'] if !opts[:'js_timeout'].nil?
|
|
251
|
-
query_params[:'wait_for'] = opts[:'wait_for'] if !opts[:'wait_for'].nil?
|
|
252
|
-
query_params[:'proxy'] = opts[:'proxy'] if !opts[:'proxy'].nil?
|
|
253
|
-
query_params[:'country'] = opts[:'country'] if !opts[:'country'].nil?
|
|
254
|
-
query_params[:'custom_proxy'] = opts[:'custom_proxy'] if !opts[:'custom_proxy'].nil?
|
|
255
|
-
query_params[:'device'] = opts[:'device'] if !opts[:'device'].nil?
|
|
256
|
-
query_params[:'error_on_404'] = opts[:'error_on_404'] if !opts[:'error_on_404'].nil?
|
|
257
|
-
query_params[:'error_on_redirect'] = opts[:'error_on_redirect'] if !opts[:'error_on_redirect'].nil?
|
|
258
|
-
query_params[:'js_script'] = opts[:'js_script'] if !opts[:'js_script'].nil?
|
|
259
|
-
query_params[:'format'] = opts[:'format'] if !opts[:'format'].nil?
|
|
260
|
-
|
|
261
|
-
# header parameters
|
|
262
|
-
header_params = opts[:header_params] || {}
|
|
263
|
-
# HTTP header 'Accept' (if needed)
|
|
264
|
-
header_params['Accept'] = @api_client.select_header_accept(['application/json', 'text/html']) unless header_params['Accept']
|
|
265
|
-
|
|
266
|
-
# form parameters
|
|
267
|
-
form_params = opts[:form_params] || {}
|
|
268
|
-
|
|
269
|
-
# http body (model)
|
|
270
|
-
post_body = opts[:debug_body]
|
|
271
|
-
|
|
272
|
-
# return_type
|
|
273
|
-
return_type = opts[:debug_return_type] || 'String'
|
|
274
|
-
|
|
275
|
-
# auth_names
|
|
276
|
-
auth_names = opts[:debug_auth_names] || ['api_key']
|
|
277
|
-
|
|
278
|
-
new_options = opts.merge(
|
|
279
|
-
:operation => :"AIApi.get_question",
|
|
280
|
-
:header_params => header_params,
|
|
281
|
-
:query_params => query_params,
|
|
282
|
-
:form_params => form_params,
|
|
283
|
-
:body => post_body,
|
|
284
|
-
:auth_names => auth_names,
|
|
285
|
-
:return_type => return_type
|
|
286
|
-
)
|
|
287
|
-
|
|
288
|
-
data, status_code, headers = @api_client.call_api(:GET, local_var_path, new_options)
|
|
289
|
-
if @api_client.config.debugging
|
|
290
|
-
@api_client.config.logger.debug "API called: AIApi#get_question\nData: #{data.inspect}\nStatus code: #{status_code}\nHeaders: #{headers}"
|
|
291
|
-
end
|
|
292
|
-
return data, status_code, headers
|
|
293
|
-
end
|
|
294
|
-
end
|
|
295
|
-
end
|
|
@@ -1,160 +0,0 @@
|
|
|
1
|
-
=begin
|
|
2
|
-
#WebScraping.AI
|
|
3
|
-
|
|
4
|
-
#WebScraping.AI scraping API provides LLM-powered tools with Chromium JavaScript rendering, rotating proxies, and built-in HTML parsing.
|
|
5
|
-
|
|
6
|
-
The version of the OpenAPI document: 3.2.0
|
|
7
|
-
Contact: support@webscraping.ai
|
|
8
|
-
Generated by: https://openapi-generator.tech
|
|
9
|
-
Generator version: 7.11.0
|
|
10
|
-
|
|
11
|
-
=end
|
|
12
|
-
|
|
13
|
-
require 'cgi'
|
|
14
|
-
|
|
15
|
-
module WebScrapingAI
|
|
16
|
-
class HTMLApi
|
|
17
|
-
attr_accessor :api_client
|
|
18
|
-
|
|
19
|
-
def initialize(api_client = ApiClient.default)
|
|
20
|
-
@api_client = api_client
|
|
21
|
-
end
|
|
22
|
-
# Page HTML by URL
|
|
23
|
-
# Returns the full HTML content of a webpage specified by the URL. The response is in plain text. Proxies and Chromium JavaScript rendering are used for page retrieval and processing.
|
|
24
|
-
# @param url [String] URL of the target page.
|
|
25
|
-
# @param [Hash] opts the optional parameters
|
|
26
|
-
# @option opts [Hash<String, String>] :headers HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
|
27
|
-
# @option opts [Integer] :timeout Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). (default to 10000)
|
|
28
|
-
# @option opts [Boolean] :js Execute on-page JavaScript using a headless browser (true by default). (default to true)
|
|
29
|
-
# @option opts [Integer] :js_timeout Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page. (default to 2000)
|
|
30
|
-
# @option opts [String] :wait_for CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
|
31
|
-
# @option opts [String] :proxy Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. (default to 'datacenter')
|
|
32
|
-
# @option opts [String] :country Country of the proxy to use (US by default). (default to 'us')
|
|
33
|
-
# @option opts [String] :custom_proxy Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
|
34
|
-
# @option opts [String] :device Type of device emulation. (default to 'desktop')
|
|
35
|
-
# @option opts [Boolean] :error_on_404 Return error on 404 HTTP status on the target page (false by default). (default to false)
|
|
36
|
-
# @option opts [Boolean] :error_on_redirect Return error on redirect on the target page (false by default). (default to false)
|
|
37
|
-
# @option opts [String] :js_script Custom JavaScript code to execute on the target page.
|
|
38
|
-
# @option opts [Boolean] :return_script_result Return result of the custom JavaScript code (js_script parameter) execution on the target page (false by default, page HTML will be returned). (default to false)
|
|
39
|
-
# @option opts [String] :format Format of the response (text by default). \"json\" will return a JSON object with the response, \"text\" will return a plain text/HTML response. (default to 'json')
|
|
40
|
-
# @return [String]
|
|
41
|
-
def get_html(url, opts = {})
|
|
42
|
-
data, _status_code, _headers = get_html_with_http_info(url, opts)
|
|
43
|
-
data
|
|
44
|
-
end
|
|
45
|
-
|
|
46
|
-
# Page HTML by URL
|
|
47
|
-
# Returns the full HTML content of a webpage specified by the URL. The response is in plain text. Proxies and Chromium JavaScript rendering are used for page retrieval and processing.
|
|
48
|
-
# @param url [String] URL of the target page.
|
|
49
|
-
# @param [Hash] opts the optional parameters
|
|
50
|
-
# @option opts [Hash<String, String>] :headers HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers=[Another]=value2) or as a JSON encoded object (...&headers={\"One\": \"value1\", \"Another\": \"value2\"}).
|
|
51
|
-
# @option opts [Integer] :timeout Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). (default to 10000)
|
|
52
|
-
# @option opts [Boolean] :js Execute on-page JavaScript using a headless browser (true by default). (default to true)
|
|
53
|
-
# @option opts [Integer] :js_timeout Maximum JavaScript rendering time in ms. Increase it in case if you see a loading indicator instead of data on the target page. (default to 2000)
|
|
54
|
-
# @option opts [String] :wait_for CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
|
|
55
|
-
# @option opts [String] :proxy Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. (default to 'datacenter')
|
|
56
|
-
# @option opts [String] :country Country of the proxy to use (US by default). (default to 'us')
|
|
57
|
-
# @option opts [String] :custom_proxy Your own proxy URL to use instead of our built-in proxy pool in \"http://user:password@host:port\" format (<a target=\"_blank\" href=\"https://webscraping.ai/proxies/smartproxy\">Smartproxy</a> for example).
|
|
58
|
-
# @option opts [String] :device Type of device emulation. (default to 'desktop')
|
|
59
|
-
# @option opts [Boolean] :error_on_404 Return error on 404 HTTP status on the target page (false by default). (default to false)
|
|
60
|
-
# @option opts [Boolean] :error_on_redirect Return error on redirect on the target page (false by default). (default to false)
|
|
61
|
-
# @option opts [String] :js_script Custom JavaScript code to execute on the target page.
|
|
62
|
-
# @option opts [Boolean] :return_script_result Return result of the custom JavaScript code (js_script parameter) execution on the target page (false by default, page HTML will be returned). (default to false)
|
|
63
|
-
# @option opts [String] :format Format of the response (text by default). \"json\" will return a JSON object with the response, \"text\" will return a plain text/HTML response. (default to 'json')
|
|
64
|
-
# @return [Array<(String, Integer, Hash)>] String data, response status code and response headers
|
|
65
|
-
# Issues a GET request to the '/html' resource for +url+ and returns the
# response data together with the status code and the response headers.
#
# When the client configuration enables client-side validation, +url+ and the
# range/enum options are checked before any request is made.
#
# @raise [ArgumentError] if client-side validation is enabled and +url+ is nil,
#   a timeout is outside its allowed range, or an enum option has a value
#   that is not in its allowed list
# @note Hashes passed as opts[:query_params] and opts[:header_params] are
#   copied before being filled in, so the caller's hashes are left untouched.
def get_html_with_http_info(url, opts = {})
  config = @api_client.config
  config.logger.debug 'Calling API: HTMLApi.get_html ...' if config.debugging

  if config.client_side_validation
    # verify the required parameter 'url' is set
    if url.nil?
      fail ArgumentError, "Missing the required parameter 'url' when calling HTMLApi.get_html"
    end

    # Integer options must lie in 1..max; the upper bound is checked first.
    { :timeout => 30000, :js_timeout => 20000 }.each do |name, max|
      value = opts[name]
      next if value.nil?

      if value > max
        fail ArgumentError, "invalid value for \"opts[:\"#{name}\"]\" when calling HTMLApi.get_html, must be smaller than or equal to #{max}."
      end
      if value < 1
        fail ArgumentError, "invalid value for \"opts[:\"#{name}\"]\" when calling HTMLApi.get_html, must be greater than or equal to 1."
      end
    end

    # Enum options: a value is only checked when one was supplied.
    {
      :proxy => ["datacenter", "residential"],
      :country => ["us", "gb", "de", "it", "fr", "ca", "es", "ru", "jp", "kr", "in"],
      :device => ["desktop", "mobile", "tablet"],
      :format => ["json", "text"]
    }.each do |name, allowable_values|
      if opts[name] && !allowable_values.include?(opts[name])
        fail ArgumentError, "invalid value for \"#{name}\", must be one of #{allowable_values}"
      end
    end
  end

  # resource path
  local_var_path = '/html'

  # query parameters (copied so the caller's hash is never modified)
  query_params = (opts[:query_params] || {}).dup
  query_params[:'url'] = url
  [
    :headers, :timeout, :js, :js_timeout, :wait_for, :proxy, :country,
    :custom_proxy, :device, :error_on_404, :error_on_redirect, :js_script,
    :return_script_result, :format
  ].each do |name|
    # only nil is skipped: an explicit false must still be sent
    query_params[name] = opts[name] unless opts[name].nil?
  end

  # header parameters (copied for the same reason as the query parameters)
  header_params = (opts[:header_params] || {}).dup
  # HTTP header 'Accept' (if needed)
  header_params['Accept'] ||= @api_client.select_header_accept(['application/json', 'text/html'])

  new_options = opts.merge(
    :operation => :"HTMLApi.get_html",
    :header_params => header_params,
    :query_params => query_params,
    :form_params => opts[:form_params] || {},
    :body => opts[:debug_body],
    :auth_names => opts[:debug_auth_names] || ['api_key'],
    :return_type => opts[:debug_return_type] || 'String'
  )

  data, status_code, headers = @api_client.call_api(:GET, local_var_path, new_options)
  if config.debugging
    config.logger.debug "API called: HTMLApi#get_html\nData: #{data.inspect}\nStatus code: #{status_code}\nHeaders: #{headers}"
  end
  return data, status_code, headers
end
|
|
159
|
-
end
|
|
160
|
-
end
|