webscraping_ai 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 87c7c3987da110e442c1a4af9c64faf489e1357e19e961d9e8ccf064986b2cf0
4
+ data.tar.gz: 7705c46a0f70dccbbc8c3e51467e37c84faab243330bbeb6069733fa732a3afa
5
+ SHA512:
6
+ metadata.gz: 33d32382161eb7cdbacfb27ffb75c0d758b507e7945adc8c9d418fee2ff6bda748d1b5671ed7c2f430af3fc522537856a7a2ef85fd844e92272203d3d8f06341
7
+ data.tar.gz: 3482be7567deceb06613d2cf82a9d56d1cc7c9ef51b8a27fde3d2df283c91f3d695fceb9dca8adc96a8f48f81c1ad624c9419f1eb700f533e932285e9e512da8
data/Gemfile ADDED
@@ -0,0 +1,9 @@
1
# Dependency manifest for the webscraping_ai gem.
source "https://rubygems.org"

# Runtime dependencies are declared in webscraping_ai.gemspec.
gemspec

# Tooling needed only while developing or testing the gem itself.
group :development, :test do
  gem "rake", "~> 12.0.0"
  gem "pry-byebug"
  gem "rubocop", "~> 0.66.0"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,70 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ webscraping_ai (1.0.0)
5
+ json (~> 2.1, >= 2.1.0)
6
+ typhoeus (~> 1.0, >= 1.0.1)
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ ast (2.4.0)
12
+ byebug (11.1.1)
13
+ coderay (1.1.2)
14
+ diff-lcs (1.3)
15
+ ethon (0.12.0)
16
+ ffi (>= 1.3.0)
17
+ ffi (1.12.2)
18
+ jaro_winkler (1.5.4)
19
+ json (2.3.0)
20
+ method_source (1.0.0)
21
+ parallel (1.19.1)
22
+ parser (2.7.0.5)
23
+ ast (~> 2.4.0)
24
+ pry (0.13.0)
25
+ coderay (~> 1.1)
26
+ method_source (~> 1.0)
27
+ pry-byebug (3.9.0)
28
+ byebug (~> 11.0)
29
+ pry (~> 0.13.0)
30
+ psych (3.1.0)
31
+ rainbow (3.0.0)
32
+ rake (12.0.0)
33
+ rspec (3.9.0)
34
+ rspec-core (~> 3.9.0)
35
+ rspec-expectations (~> 3.9.0)
36
+ rspec-mocks (~> 3.9.0)
37
+ rspec-core (3.9.1)
38
+ rspec-support (~> 3.9.1)
39
+ rspec-expectations (3.9.1)
40
+ diff-lcs (>= 1.2.0, < 2.0)
41
+ rspec-support (~> 3.9.0)
42
+ rspec-mocks (3.9.1)
43
+ diff-lcs (>= 1.2.0, < 2.0)
44
+ rspec-support (~> 3.9.0)
45
+ rspec-support (3.9.2)
46
+ rubocop (0.66.0)
47
+ jaro_winkler (~> 1.5.1)
48
+ parallel (~> 1.10)
49
+ parser (>= 2.5, != 2.5.1.1)
50
+ psych (>= 3.1.0)
51
+ rainbow (>= 2.2.2, < 4.0)
52
+ ruby-progressbar (~> 1.7)
53
+ unicode-display_width (>= 1.4.0, < 1.6)
54
+ ruby-progressbar (1.10.1)
55
+ typhoeus (1.3.1)
56
+ ethon (>= 0.9.0)
57
+ unicode-display_width (1.5.0)
58
+
59
+ PLATFORMS
60
+ ruby
61
+
62
+ DEPENDENCIES
63
+ pry-byebug
64
+ rake (~> 12.0.0)
65
+ rspec (~> 3.6, >= 3.6.0)
66
+ rubocop (~> 0.66.0)
67
+ webscraping_ai!
68
+
69
+ BUNDLED WITH
70
+ 2.1.2
data/README.md ADDED
@@ -0,0 +1,78 @@
1
+ # webscraping_ai
2
+
3
+ WebScrapingAI - the Ruby gem for the WebScraping.AI
4
+
5
+ A client for https://webscraping.ai API. It provides Chrome JS rendering, rotating proxies and HTML parsing for web scraping.
6
+
7
+ This SDK is automatically generated by the [OpenAPI Generator](https://openapi-generator.tech) project:
8
+
9
+ - API version: 1.0.0
10
+ - Package version: 1.0.0
11
+ - Build package: org.openapitools.codegen.languages.RubyClientCodegen
12
+
13
+ ## Installation
14
+
15
+ ### Install from RubyGems
16
+
17
+ Add the following in the Gemfile:
18
+
19
+ gem 'webscraping_ai'
20
+
21
+ ## Getting Started
22
+
23
+ Please follow the [installation](#installation) procedure and then run the following code:
24
+
25
+ ```ruby
26
+ # Load the gem
27
+ require 'webscraping_ai'
28
+
29
+ # Setup authorization
30
+ WebScrapingAI.configure do |config|
31
+ # Configure API key authorization: api_key
32
+ config.api_key['api_key'] = 'test-api-key'
33
+ end
34
+
35
+ api_instance = WebScrapingAI::HtmlApi.new
36
+ url = 'https://example.com' # String | URL of the page to get
37
+ opts = {
38
+ selector: 'html', # String | CSS selector to get a part of the page (null by default, returns whole page HTML)
39
+ outer_html: false, # Boolean | Return outer HTML of the selected element (false by default, returns inner HTML)
40
+ proxy: 'US', # String | Proxy country code, for geotargeting (US by default)
41
+ disable_js: false, # Boolean | Disable JS execution (false by default)
42
+ inline_css: false # Boolean | Inline included CSS files to make page viewable on other domains (false by default)
43
+ }
44
+
45
+ begin
46
+ #Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
47
+ result = api_instance.get_page(url, opts)
48
+ p result
49
+ rescue WebScrapingAI::ApiError => e
50
+ puts "Exception when calling HtmlApi->get_page: #{e}"
51
+ end
52
+
53
+ ```
54
+
55
+ ## Documentation for API Endpoints
56
+
57
+ All URIs are relative to *https://webscraping.ai/api*
58
+
59
+ Class | Method | HTTP request | Description
60
+ ------------ | ------------- | ------------- | -------------
61
+ *WebScrapingAI::HtmlApi* | [**get_page**](docs/HtmlApi.md#get_page) | **GET** / | Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
62
+
63
+
64
+ ## Documentation for Models
65
+
66
+ - [WebScrapingAI::ScrappedPage](docs/ScrappedPage.md)
67
+
68
+
69
+ ## Documentation for Authorization
70
+
71
+
72
+ ### api_key
73
+
74
+
75
+ - **Type**: API key
76
+ - **API key parameter name**: api_key
77
+ - **Location**: URL query string
78
+
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ begin
4
+ require 'rspec/core/rake_task'
5
+
6
+ RSpec::Core::RakeTask.new(:spec)
7
+ task default: :spec
8
+ rescue LoadError
9
+ # no rspec available
10
+ end
data/docs/HtmlApi.md ADDED
@@ -0,0 +1,73 @@
1
+ # WebScrapingAI::HtmlApi
2
+
3
+ All URIs are relative to *https://webscraping.ai/api*
4
+
5
+ Method | HTTP request | Description
6
+ ------------- | ------------- | -------------
7
+ [**get_page**](HtmlApi.md#get_page) | **GET** / | Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
8
+
9
+
10
+
11
+ ## get_page
12
+
13
+ > ScrappedPage get_page(url, opts)
14
+
15
+ Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
16
+
17
+ ### Example
18
+
19
+ ```ruby
20
+ # load the gem
21
+ require 'webscraping_ai'
22
+ # setup authorization
23
+ WebScrapingAI.configure do |config|
24
+ # Configure API key authorization: api_key
25
+ config.api_key['api_key'] = 'YOUR API KEY'
26
+ # Uncomment the following line to set a prefix for the API key, e.g. 'Bearer' (defaults to nil)
27
+ #config.api_key_prefix['api_key'] = 'Bearer'
28
+ end
29
+
30
+ api_instance = WebScrapingAI::HtmlApi.new
31
+ url = 'https://example.com' # String | URL of the page to get
32
+ opts = {
33
+ selector: 'html', # String | CSS selector to get a part of the page (null by default, returns whole page HTML)
34
+ outer_html: false, # Boolean | Return outer HTML of the selected element (false by default, returns inner HTML)
35
+ proxy: 'US', # String | Proxy country code, for geotargeting (US by default)
36
+ disable_js: false, # Boolean | Disable JS execution (false by default)
37
+ inline_css: false # Boolean | Inline included CSS files to make page viewable on other domains (false by default)
38
+ }
39
+
40
+ begin
41
+ #Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
42
+ result = api_instance.get_page(url, opts)
43
+ p result
44
+ rescue WebScrapingAI::ApiError => e
45
+ puts "Exception when calling HtmlApi->get_page: #{e}"
46
+ end
47
+ ```
48
+
49
+ ### Parameters
50
+
51
+
52
+ Name | Type | Description | Notes
53
+ ------------- | ------------- | ------------- | -------------
54
+ **url** | **String**| URL of the page to get |
55
+ **selector** | **String**| CSS selector to get a part of the page (null by default, returns whole page HTML) | [optional]
56
+ **outer_html** | **Boolean**| Return outer HTML of the selected element (false by default, returns inner HTML) | [optional]
57
+ **proxy** | **String**| Proxy country code, for geotargeting (US by default) | [optional]
58
+ **disable_js** | **Boolean**| Disable JS execution (false by default) | [optional]
59
+ **inline_css** | **Boolean**| Inline included CSS files to make page viewable on other domains (false by default) | [optional]
60
+
61
+ ### Return type
62
+
63
+ [**ScrappedPage**](ScrappedPage.md)
64
+
65
+ ### Authorization
66
+
67
+ [api_key](../README.md#api_key)
68
+
69
+ ### HTTP request headers
70
+
71
+ - **Content-Type**: Not defined
72
+ - **Accept**: application/json
73
+
data/docs/ScrappedPage.md ADDED
@@ -0,0 +1,23 @@
1
+ # WebScrapingAI::ScrappedPage
2
+
3
+ ## Properties
4
+
5
+ Name | Type | Description | Notes
6
+ ------------ | ------------- | ------------- | -------------
7
+ **size_bytes** | **Integer** | Page HTML content size in bytes | [optional]
8
+ **html** | **String** | HTML of the full page or a selected area | [optional]
9
+ **status** | **Integer** | Response HTTP status code (200, 404, 302, etc) | [optional]
10
+ **status_message** | **String** | Response HTTP status message | [optional]
11
+
12
+ ## Code Sample
13
+
14
+ ```ruby
15
+ require 'webscraping_ai'
16
+
17
+ instance = WebScrapingAI::ScrappedPage.new(size_bytes: nil,
18
+ html: nil,
19
+ status: nil,
20
+ status_message: nil)
21
+ ```
22
+
23
+
data/git_push.sh ADDED
@@ -0,0 +1,58 @@
1
#!/bin/sh
# ref: https://help.github.com/articles/adding-an-existing-project-to-github-using-the-command-line/
#
# Usage example: /bin/sh ./git_push.sh wing328 openapi-petstore-perl "minor update" "gitlab.com"

git_user_id=$1
git_repo_id=$2
release_note=$3
git_host=$4

# Fall back to sensible defaults for any argument that was not provided.
if [ -z "$git_host" ]; then
  git_host="github.com"
  echo "[INFO] No command line input provided. Set \$git_host to $git_host"
fi

if [ -z "$git_user_id" ]; then
  git_user_id="webscraping-ai"
  echo "[INFO] No command line input provided. Set \$git_user_id to $git_user_id"
fi

if [ -z "$git_repo_id" ]; then
  git_repo_id="webscraping-ai-ruby"
  echo "[INFO] No command line input provided. Set \$git_repo_id to $git_repo_id"
fi

if [ -z "$release_note" ]; then
  release_note="Minor update"
  echo "[INFO] No command line input provided. Set \$release_note to $release_note"
fi

# Turn the working directory into a git repository and stage everything.
git init
git add .
git commit -m "$release_note"

# Register the `origin` remote only when none exists yet; embed $GIT_TOKEN
# in the URL when it is available, otherwise rely on ambient credentials.
git_remote=$(git remote)
if [ -z "$git_remote" ]; then

  if [ -z "$GIT_TOKEN" ]; then
    echo "[INFO] \$GIT_TOKEN (environment variable) is not set. Using the git credential in your environment."
    git remote add origin https://${git_host}/${git_user_id}/${git_repo_id}.git
  else
    git remote add origin https://${git_user_id}:${GIT_TOKEN}@${git_host}/${git_user_id}/${git_repo_id}.git
  fi

fi

git pull origin master

# Push the local commits up to the remote repository.
echo "Git pushing to https://${git_host}/${git_user_id}/${git_repo_id}.git"
git push origin master 2>&1 | grep -v 'To https'
data/lib/webscraping_ai/api/html_api.rb ADDED
@@ -0,0 +1,98 @@
1
=begin
#WebScraping.AI

#A client for https://webscraping.ai API. It provides Chrome JS rendering, rotating proxies and HTML parsing for web scraping.

The version of the OpenAPI document: 1.0.0

Generated by: https://openapi-generator.tech
OpenAPI Generator version: 4.2.3

=end

require 'cgi'

module WebScrapingAI
  # API wrapper for the HTML-scraping endpoint of WebScraping.AI.
  class HtmlApi
    # Low-level transport used to perform the HTTP calls.
    attr_accessor :api_client

    # @param api_client [ApiClient] transport for requests (defaults to the shared client)
    def initialize(api_client = ApiClient.default)
      @api_client = api_client
    end

    # Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
    # @param url [String] URL of the page to get
    # @param [Hash] opts the optional parameters
    # @option opts [String] :selector CSS selector to get a part of the page (null by default, returns whole page HTML)
    # @option opts [Boolean] :outer_html Return outer HTML of the selected element (false by default, returns inner HTML)
    # @option opts [String] :proxy Proxy country code, for geotargeting (US by default)
    # @option opts [Boolean] :disable_js Disable JS execution (false by default)
    # @option opts [Boolean] :inline_css Inline included CSS files to make page viewable on other domains (false by default)
    # @return [ScrappedPage]
    def get_page(url, opts = {})
      page, _status, _headers = get_page_with_http_info(url, opts)
      page
    end

    # Same as {#get_page}, but also exposes the HTTP status code and headers.
    # @param url [String] URL of the page to get
    # @param [Hash] opts the optional parameters (see {#get_page})
    # @raise [ArgumentError] when +url+ is nil and client-side validation is on
    # @return [Array<(ScrappedPage, Integer, Hash)>] ScrappedPage data, response status code and response headers
    def get_page_with_http_info(url, opts = {})
      config = @api_client.config
      config.logger.debug 'Calling API: HtmlApi.get_page ...' if config.debugging

      # 'url' is the only required parameter.
      if config.client_side_validation && url.nil?
        raise ArgumentError, "Missing the required parameter 'url' when calling HtmlApi.get_page"
      end

      # Query parameters: the required url plus any optional parameter the
      # caller supplied. Only nil is skipped — false is a meaningful value.
      query_params = opts[:query_params] || {}
      query_params[:url] = url
      %i[selector outer_html proxy disable_js inline_css].each do |name|
        query_params[name] = opts[name] unless opts[name].nil?
      end

      # Header parameters — the endpoint responds with JSON only.
      header_params = opts[:header_params] || {}
      header_params['Accept'] = @api_client.select_header_accept(['application/json'])

      new_options = opts.merge(
        :header_params => header_params,
        :query_params => query_params,
        :form_params => opts[:form_params] || {},
        :body => opts[:body],
        :auth_names => opts[:auth_names] || ['api_key'],
        :return_type => opts[:return_type] || 'ScrappedPage'
      )

      data, status_code, headers = @api_client.call_api(:GET, '/', new_options)
      if config.debugging
        config.logger.debug "API called: HtmlApi#get_page\nData: #{data.inspect}\nStatus code: #{status_code}\nHeaders: #{headers}"
      end
      [data, status_code, headers]
    end
  end
end