webscraping_ai 3.2.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,291 +0,0 @@
1
- =begin
2
- #WebScraping.AI
3
-
4
- #WebScraping.AI scraping API provides LLM-powered tools with Chromium JavaScript rendering, rotating proxies, and built-in HTML parsing.
5
-
6
- The version of the OpenAPI document: 3.2.0
7
- Contact: support@webscraping.ai
8
- Generated by: https://openapi-generator.tech
9
- Generator version: 7.11.0
10
-
11
- =end
12
-
13
- require 'cgi'
14
-
15
module WebScrapingAI
  # Client for the "selected HTML" endpoints (+/selected+ and
  # +/selected-multiple+), which return the HTML of page areas matched by
  # CSS selectors so the caller does not have to parse HTML itself.
  #
  # Every request is a GET issued through the injected +api_client+.
  class SelectedHTMLApi
    # Values accepted for the :proxy option.
    PROXY_TYPES = %w[datacenter residential].freeze
    # Values accepted for the :country option.
    COUNTRIES = %w[us gb de it fr ca es ru jp kr in].freeze
    # Values accepted for the :device option.
    DEVICES = %w[desktop mobile tablet].freeze
    # Values accepted for the :format option of #get_selected.
    RESPONSE_FORMATS = %w[json text].freeze

    # Enumerated options of /selected-multiple, in the order they are validated.
    PAGE_ENUMS = { proxy: PROXY_TYPES, country: COUNTRIES, device: DEVICES }.freeze
    # Enumerated options of /selected (adds :format after the shared ones).
    SELECTED_ENUMS = PAGE_ENUMS.merge(format: RESPONSE_FORMATS).freeze

    # Inclusive upper bounds of the integer options; the lower bound is always 1.
    UPPER_BOUNDS = { timeout: 30_000, js_timeout: 20_000 }.freeze

    # Page-retrieval options forwarded verbatim as query parameters.
    PAGE_OPTIONS = %i[
      headers timeout js js_timeout wait_for proxy country custom_proxy
      device error_on_404 error_on_redirect js_script
    ].freeze
    # Query parameters of each endpoint, in the order they are added after :url.
    SELECTED_QUERY_OPTIONS = [:selector, *PAGE_OPTIONS, :format].freeze
    MULTIPLE_QUERY_OPTIONS = [:selectors, *PAGE_OPTIONS].freeze

    private_constant :PROXY_TYPES, :COUNTRIES, :DEVICES, :RESPONSE_FORMATS,
                     :PAGE_ENUMS, :SELECTED_ENUMS, :UPPER_BOUNDS,
                     :PAGE_OPTIONS, :SELECTED_QUERY_OPTIONS, :MULTIPLE_QUERY_OPTIONS

    attr_accessor :api_client

    # @param api_client [ApiClient] client used to perform the HTTP calls
    def initialize(api_client = ApiClient.default)
      @api_client = api_client
    end

    # HTML of a selected page area by URL and CSS selector.
    #
    # Returns HTML of a selected page area by URL and CSS selector. Useful if you don't want to do the HTML parsing on your side.
    #
    # @param url [String] URL of the target page.
    # @param [Hash] opts the optional parameters
    # @option opts [String] :selector CSS selector (null by default, returns whole page HTML)
    # @option opts [Hash<String, String>] :headers HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers[Another]=value2) or as a JSON encoded object (...&headers={"One": "value1", "Another": "value2"}).
    # @option opts [Integer] :timeout Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). (default to 10000)
    # @option opts [Boolean] :js Execute on-page JavaScript using a headless browser (true by default). (default to true)
    # @option opts [Integer] :js_timeout Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page. (default to 2000)
    # @option opts [String] :wait_for CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
    # @option opts [String] :proxy Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. (default to 'datacenter')
    # @option opts [String] :country Country of the proxy to use (US by default). (default to 'us')
    # @option opts [String] :custom_proxy Your own proxy URL to use instead of our built-in proxy pool in "http://user:password@host:port" format (Smartproxy, https://webscraping.ai/proxies/smartproxy, for example).
    # @option opts [String] :device Type of device emulation. (default to 'desktop')
    # @option opts [Boolean] :error_on_404 Return error on 404 HTTP status on the target page (false by default). (default to false)
    # @option opts [Boolean] :error_on_redirect Return error on redirect on the target page (false by default). (default to false)
    # @option opts [String] :js_script Custom JavaScript code to execute on the target page.
    # @option opts [String] :format Format of the response: "json" returns a JSON object with the response, "text" returns a plain text/HTML response. NOTE(review): the upstream description says "text by default" while the generated default is 'json' -- confirm which one the server applies.
    # @return [String]
    # @raise [ArgumentError] when client-side validation is enabled and +url+ is nil or an option is out of range / not an allowed value
    def get_selected(url, opts = {})
      data, _status_code, _headers = get_selected_with_http_info(url, opts)
      data
    end

    # Same request as {#get_selected}, but also exposes the HTTP status and headers.
    #
    # @param url [String] URL of the target page.
    # @param [Hash] opts the optional parameters; supports every key documented on {#get_selected}, plus the low-level overrides below
    # @option opts [Hash] :query_params extra query parameters; copied, never modified in place
    # @option opts [Hash] :header_params extra request headers; copied, never modified in place. An 'Accept' entry given here is kept as is.
    # @option opts [Hash] :form_params form parameters passed through to the client
    # @option opts [Object] :debug_body request body passed through to the client
    # @option opts [String] :debug_return_type overrides the 'String' return type
    # @option opts [Array<String>] :debug_auth_names overrides the ['api_key'] auth names
    # @return [Array<(String, Integer, Hash)>] String data, response status code and response headers
    # @raise [ArgumentError] when client-side validation is enabled and +url+ is nil or an option is out of range / not an allowed value
    def get_selected_with_http_info(url, opts = {})
      operation = 'SelectedHTMLApi.get_selected'
      log_debug { "Calling API: #{operation} ..." }
      validate_request!(operation, url, opts, SELECTED_ENUMS)

      query_params = build_query_params(url, opts, SELECTED_QUERY_OPTIONS)
      perform_get(operation, '/selected', opts, query_params,
                  accepts: ['application/json', 'text/html'], return_type: 'String')
    end

    # HTML of multiple page areas by URL and CSS selectors.
    #
    # Returns HTML of multiple page areas by URL and CSS selectors. Useful if you don't want to do the HTML parsing on your side.
    #
    # @param url [String] URL of the target page.
    # @param [Hash] opts the optional parameters
    # @option opts [Array<String>] :selectors Multiple CSS selectors (null by default, returns whole page HTML)
    # @option opts [Hash<String, String>] :headers HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers[Another]=value2) or as a JSON encoded object (...&headers={"One": "value1", "Another": "value2"}).
    # @option opts [Integer] :timeout Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). (default to 10000)
    # @option opts [Boolean] :js Execute on-page JavaScript using a headless browser (true by default). (default to true)
    # @option opts [Integer] :js_timeout Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page. (default to 2000)
    # @option opts [String] :wait_for CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
    # @option opts [String] :proxy Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. (default to 'datacenter')
    # @option opts [String] :country Country of the proxy to use (US by default). (default to 'us')
    # @option opts [String] :custom_proxy Your own proxy URL to use instead of our built-in proxy pool in "http://user:password@host:port" format (Smartproxy, https://webscraping.ai/proxies/smartproxy, for example).
    # @option opts [String] :device Type of device emulation. (default to 'desktop')
    # @option opts [Boolean] :error_on_404 Return error on 404 HTTP status on the target page (false by default). (default to false)
    # @option opts [Boolean] :error_on_redirect Return error on redirect on the target page (false by default). (default to false)
    # @option opts [String] :js_script Custom JavaScript code to execute on the target page.
    # @return [Array<String>]
    # @raise [ArgumentError] when client-side validation is enabled and +url+ is nil or an option is out of range / not an allowed value
    def get_selected_multiple(url, opts = {})
      data, _status_code, _headers = get_selected_multiple_with_http_info(url, opts)
      data
    end

    # Same request as {#get_selected_multiple}, but also exposes the HTTP status and headers.
    #
    # @param url [String] URL of the target page.
    # @param [Hash] opts the optional parameters; supports every key documented on {#get_selected_multiple}, plus the low-level overrides below
    # @option opts [Hash] :query_params extra query parameters; copied, never modified in place
    # @option opts [Hash] :header_params extra request headers; copied, never modified in place. An 'Accept' entry given here is kept as is.
    # @option opts [Hash] :form_params form parameters passed through to the client
    # @option opts [Object] :debug_body request body passed through to the client
    # @option opts [String] :debug_return_type overrides the 'Array<String>' return type
    # @option opts [Array<String>] :debug_auth_names overrides the ['api_key'] auth names
    # @return [Array<(Array<String>, Integer, Hash)>] Array<String> data, response status code and response headers
    # @raise [ArgumentError] when client-side validation is enabled and +url+ is nil or an option is out of range / not an allowed value
    def get_selected_multiple_with_http_info(url, opts = {})
      operation = 'SelectedHTMLApi.get_selected_multiple'
      log_debug { "Calling API: #{operation} ..." }
      validate_request!(operation, url, opts, PAGE_ENUMS)

      query_params = build_query_params(url, opts, MULTIPLE_QUERY_OPTIONS)
      perform_get(operation, '/selected-multiple', opts, query_params,
                  accepts: ['application/json'], return_type: 'Array<String>')
    end

    private

    # Sends the block's message to the configured logger, but only when
    # debugging is enabled (the message is not even built otherwise).
    def log_debug
      config = @api_client.config
      config.logger.debug(yield) if config.debugging
    end

    # Runs the client-side checks in a fixed order: required url, integer
    # ranges, then enumerated values. Does nothing when the client has
    # client-side validation switched off.
    def validate_request!(operation, url, opts, enums)
      return unless @api_client.config.client_side_validation

      raise ArgumentError, "Missing the required parameter 'url' when calling #{operation}" if url.nil?

      UPPER_BOUNDS.each do |name, max|
        value = opts[name]
        next if value.nil?

        if value > max
          raise ArgumentError, %(invalid value for "opts[:"#{name}"]" when calling #{operation}, must be smaller than or equal to #{max}.)
        end
        if value < 1
          raise ArgumentError, %(invalid value for "opts[:"#{name}"]" when calling #{operation}, must be greater than or equal to 1.)
        end
      end

      enums.each do |name, allowed|
        value = opts[name]
        # Falsy values (nil/false) are deliberately not checked against the enum.
        next if !value || allowed.include?(value)

        raise ArgumentError, %(invalid value for "#{name}", must be one of #{allowed})
      end
    end

    # Builds the query hash: caller-supplied extras first, then :url, then
    # every non-nil option from +names+ in order. Works on a copy so the
    # caller's opts[:query_params] is never mutated.
    def build_query_params(url, opts, names)
      params = (opts[:query_params] || {}).dup
      params[:url] = url
      names.each do |name|
        value = opts[name]
        next if value.nil?

        # :selectors is a multi-valued parameter and must go through the client's collection encoder.
        params[name] = name == :selectors ? @api_client.build_collection_param(value, :multi) : value
      end
      params
    end

    # Issues the GET request and returns [data, status_code, headers].
    # Header params are copied before the default 'Accept' is filled in so
    # the caller's opts[:header_params] stays untouched.
    def perform_get(operation, path, opts, query_params, accepts:, return_type:)
      header_params = (opts[:header_params] || {}).dup
      header_params['Accept'] ||= @api_client.select_header_accept(accepts)

      new_options = opts.merge(
        operation: operation.to_sym,
        header_params: header_params,
        query_params: query_params,
        form_params: opts[:form_params] || {},
        body: opts[:debug_body],
        auth_names: opts[:debug_auth_names] || ['api_key'],
        return_type: opts[:debug_return_type] || return_type
      )

      data, status_code, headers = @api_client.call_api(:GET, path, new_options)
      log_debug do
        "API called: #{operation.tr('.', '#')}\nData: #{data.inspect}\nStatus code: #{status_code}\nHeaders: #{headers}"
      end
      [data, status_code, headers]
    end
  end
end
@@ -1,160 +0,0 @@
1
- =begin
2
- #WebScraping.AI
3
-
4
- #WebScraping.AI scraping API provides LLM-powered tools with Chromium JavaScript rendering, rotating proxies, and built-in HTML parsing.
5
-
6
- The version of the OpenAPI document: 3.2.0
7
- Contact: support@webscraping.ai
8
- Generated by: https://openapi-generator.tech
9
- Generator version: 7.11.0
10
-
11
- =end
12
-
13
- require 'cgi'
14
-
15
module WebScrapingAI
  # Client for the +/text+ endpoint, which returns the visible text content
  # of a web page. The request is a GET issued through the injected
  # +api_client+.
  class TextApi
    # Operation name used in log lines, error messages and the request metadata.
    OPERATION = 'TextApi.get_text'

    # Values accepted for the :text_format option.
    TEXT_FORMATS = %w[plain xml json].freeze
    # Enumerated page-retrieval options, in the order they are validated.
    PAGE_ENUMS = {
      proxy: %w[datacenter residential].freeze,
      country: %w[us gb de it fr ca es ru jp kr in].freeze,
      device: %w[desktop mobile tablet].freeze
    }.freeze

    # Inclusive upper bounds of the integer options; the lower bound is always 1.
    UPPER_BOUNDS = { timeout: 30_000, js_timeout: 20_000 }.freeze

    # Options forwarded verbatim as query parameters, in the order they are added after :url.
    QUERY_OPTIONS = %i[
      text_format return_links headers timeout js js_timeout wait_for proxy
      country custom_proxy device error_on_404 error_on_redirect js_script
    ].freeze

    private_constant :OPERATION, :TEXT_FORMATS, :PAGE_ENUMS, :UPPER_BOUNDS, :QUERY_OPTIONS

    attr_accessor :api_client

    # @param api_client [ApiClient] client used to perform the HTTP calls
    def initialize(api_client = ApiClient.default)
      @api_client = api_client
    end

    # Page text by URL.
    #
    # Returns the visible text content of a webpage specified by the URL. Can be used to feed data to LLM models. The response can be in plain text, JSON, or XML format based on the text_format parameter. Proxies and Chromium JavaScript rendering are used for page retrieval and processing. Returns JSON on error.
    #
    # @param url [String] URL of the target page.
    # @param [Hash] opts the optional parameters
    # @option opts [String] :text_format Format of the text response (plain by default). "plain" will return only the page body text. "json" and "xml" will return a json/xml with "title", "description" and "content" keys. (default to 'plain')
    # @option opts [Boolean] :return_links [Works only with text_format=json] Return links from the page body text (false by default). Useful for building web crawlers. (default to false)
    # @option opts [Hash<String, String>] :headers HTTP headers to pass to the target page. Can be specified either via a nested query parameter (...&headers[One]=value1&headers[Another]=value2) or as a JSON encoded object (...&headers={"One": "value1", "Another": "value2"}).
    # @option opts [Integer] :timeout Maximum web page retrieval time in ms. Increase it in case of timeout errors (10000 by default, maximum is 30000). (default to 10000)
    # @option opts [Boolean] :js Execute on-page JavaScript using a headless browser (true by default). (default to true)
    # @option opts [Integer] :js_timeout Maximum JavaScript rendering time in ms. Increase it if you see a loading indicator instead of data on the target page. (default to 2000)
    # @option opts [String] :wait_for CSS selector to wait for before returning the page content. Useful for pages with dynamic content loading. Overrides js_timeout.
    # @option opts [String] :proxy Type of proxy, use residential proxies if your site restricts traffic from datacenters (datacenter by default). Note that residential proxy requests are more expensive than datacenter, see the pricing page for details. (default to 'datacenter')
    # @option opts [String] :country Country of the proxy to use (US by default). (default to 'us')
    # @option opts [String] :custom_proxy Your own proxy URL to use instead of our built-in proxy pool in "http://user:password@host:port" format (Smartproxy, https://webscraping.ai/proxies/smartproxy, for example).
    # @option opts [String] :device Type of device emulation. (default to 'desktop')
    # @option opts [Boolean] :error_on_404 Return error on 404 HTTP status on the target page (false by default). (default to false)
    # @option opts [Boolean] :error_on_redirect Return error on redirect on the target page (false by default). (default to false)
    # @option opts [String] :js_script Custom JavaScript code to execute on the target page.
    # @return [String]
    # @raise [ArgumentError] when client-side validation is enabled and +url+ is nil or an option is out of range / not an allowed value
    def get_text(url, opts = {})
      data, _status_code, _headers = get_text_with_http_info(url, opts)
      data
    end

    # Same request as {#get_text}, but also exposes the HTTP status and headers.
    #
    # @param url [String] URL of the target page.
    # @param [Hash] opts the optional parameters; supports every key documented on {#get_text}, plus the low-level overrides below
    # @option opts [Hash] :query_params extra query parameters; copied, never modified in place
    # @option opts [Hash] :header_params extra request headers; copied, never modified in place. An 'Accept' entry given here is kept as is.
    # @option opts [Hash] :form_params form parameters passed through to the client
    # @option opts [Object] :debug_body request body passed through to the client
    # @option opts [String] :debug_return_type overrides the 'String' return type
    # @option opts [Array<String>] :debug_auth_names overrides the ['api_key'] auth names
    # @return [Array<(String, Integer, Hash)>] String data, response status code and response headers
    # @raise [ArgumentError] when client-side validation is enabled and +url+ is nil or an option is out of range / not an allowed value
    def get_text_with_http_info(url, opts = {})
      config = @api_client.config
      config.logger.debug "Calling API: #{OPERATION} ..." if config.debugging
      validate_request!(url, opts) if config.client_side_validation

      # Work on copies so hashes supplied by the caller are never mutated.
      query_params = (opts[:query_params] || {}).dup
      query_params[:url] = url
      QUERY_OPTIONS.each do |name|
        value = opts[name]
        query_params[name] = value unless value.nil?
      end

      header_params = (opts[:header_params] || {}).dup
      header_params['Accept'] ||= @api_client.select_header_accept(['application/json', 'text/html', 'text/xml'])

      new_options = opts.merge(
        operation: OPERATION.to_sym,
        header_params: header_params,
        query_params: query_params,
        form_params: opts[:form_params] || {},
        body: opts[:debug_body],
        auth_names: opts[:debug_auth_names] || ['api_key'],
        return_type: opts[:debug_return_type] || 'String'
      )

      data, status_code, headers = @api_client.call_api(:GET, '/text', new_options)
      if config.debugging
        config.logger.debug "API called: TextApi#get_text\nData: #{data.inspect}\nStatus code: #{status_code}\nHeaders: #{headers}"
      end
      [data, status_code, headers]
    end

    private

    # Client-side checks, in a fixed order: required url, text_format,
    # integer ranges, then the remaining enumerated options.
    def validate_request!(url, opts)
      raise ArgumentError, "Missing the required parameter 'url' when calling #{OPERATION}" if url.nil?

      validate_enum!(:text_format, TEXT_FORMATS, opts)
      UPPER_BOUNDS.each { |name, max| validate_range!(name, max, opts) }
      PAGE_ENUMS.each { |name, allowed| validate_enum!(name, allowed, opts) }
    end

    # Raises unless opts[name] is nil or within 1..max.
    def validate_range!(name, max, opts)
      value = opts[name]
      return if value.nil?

      if value > max
        raise ArgumentError, %(invalid value for "opts[:"#{name}"]" when calling #{OPERATION}, must be smaller than or equal to #{max}.)
      end
      return unless value < 1

      raise ArgumentError, %(invalid value for "opts[:"#{name}"]" when calling #{OPERATION}, must be greater than or equal to 1.)
    end

    # Raises unless opts[name] is falsy (nil/false are deliberately not checked) or one of +allowed+.
    def validate_enum!(name, allowed, opts)
      value = opts[name]
      return if !value || allowed.include?(value)

      raise ArgumentError, %(invalid value for "#{name}", must be one of #{allowed})
    end
  end
end