webscraping_ai 1.0.0

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: 87c7c3987da110e442c1a4af9c64faf489e1357e19e961d9e8ccf064986b2cf0
+   data.tar.gz: 7705c46a0f70dccbbc8c3e51467e37c84faab243330bbeb6069733fa732a3afa
+ SHA512:
+   metadata.gz: 33d32382161eb7cdbacfb27ffb75c0d758b507e7945adc8c9d418fee2ff6bda748d1b5671ed7c2f430af3fc522537856a7a2ef85fd844e92272203d3d8f06341
+   data.tar.gz: 3482be7567deceb06613d2cf82a9d56d1cc7c9ef51b8a27fde3d2df283c91f3d695fceb9dca8adc96a8f48f81c1ad624c9419f1eb700f533e932285e9e512da8
data/Gemfile ADDED
@@ -0,0 +1,9 @@
+ source 'https://rubygems.org'
+
+ gemspec
+
+ group :development, :test do
+   gem 'rake', '~> 12.0.0'
+   gem 'pry-byebug'
+   gem 'rubocop', '~> 0.66.0'
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,70 @@
+ PATH
+   remote: .
+   specs:
+     webscraping_ai (1.0.0)
+       json (~> 2.1, >= 2.1.0)
+       typhoeus (~> 1.0, >= 1.0.1)
+
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     ast (2.4.0)
+     byebug (11.1.1)
+     coderay (1.1.2)
+     diff-lcs (1.3)
+     ethon (0.12.0)
+       ffi (>= 1.3.0)
+     ffi (1.12.2)
+     jaro_winkler (1.5.4)
+     json (2.3.0)
+     method_source (1.0.0)
+     parallel (1.19.1)
+     parser (2.7.0.5)
+       ast (~> 2.4.0)
+     pry (0.13.0)
+       coderay (~> 1.1)
+       method_source (~> 1.0)
+     pry-byebug (3.9.0)
+       byebug (~> 11.0)
+       pry (~> 0.13.0)
+     psych (3.1.0)
+     rainbow (3.0.0)
+     rake (12.0.0)
+     rspec (3.9.0)
+       rspec-core (~> 3.9.0)
+       rspec-expectations (~> 3.9.0)
+       rspec-mocks (~> 3.9.0)
+     rspec-core (3.9.1)
+       rspec-support (~> 3.9.1)
+     rspec-expectations (3.9.1)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.9.0)
+     rspec-mocks (3.9.1)
+       diff-lcs (>= 1.2.0, < 2.0)
+       rspec-support (~> 3.9.0)
+     rspec-support (3.9.2)
+     rubocop (0.66.0)
+       jaro_winkler (~> 1.5.1)
+       parallel (~> 1.10)
+       parser (>= 2.5, != 2.5.1.1)
+       psych (>= 3.1.0)
+       rainbow (>= 2.2.2, < 4.0)
+       ruby-progressbar (~> 1.7)
+       unicode-display_width (>= 1.4.0, < 1.6)
+     ruby-progressbar (1.10.1)
+     typhoeus (1.3.1)
+       ethon (>= 0.9.0)
+     unicode-display_width (1.5.0)
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   pry-byebug
+   rake (~> 12.0.0)
+   rspec (~> 3.6, >= 3.6.0)
+   rubocop (~> 0.66.0)
+   webscraping_ai!
+
+ BUNDLED WITH
+    2.1.2
data/README.md ADDED
@@ -0,0 +1,78 @@
+ # webscraping_ai
+
+ WebScrapingAI - the Ruby gem for the WebScraping.AI API
+
+ A client for the https://webscraping.ai API. It provides Chrome JS rendering, rotating proxies and HTML parsing for web scraping.
+
+ This SDK is automatically generated by the [OpenAPI Generator](https://openapi-generator.tech) project:
+
+ - API version: 1.0.0
+ - Package version: 1.0.0
+ - Build package: org.openapitools.codegen.languages.RubyClientCodegen
+
+ ## Installation
+
+ ### Install from RubyGems
+
+ Add the following line to your Gemfile:
+
+     gem 'webscraping_ai'
+
+ ## Getting Started
+
+ Please follow the [installation](#installation) procedure and then run the following code:
+
+ ```ruby
+ # Load the gem
+ require 'webscraping_ai'
+
+ # Set up authorization
+ WebScrapingAI.configure do |config|
+   # Configure API key authorization: api_key
+   config.api_key['api_key'] = 'test-api-key'
+ end
+
+ api_instance = WebScrapingAI::HtmlApi.new
+ url = 'https://example.com' # String | URL of the page to get
+ opts = {
+   selector: 'html', # String | CSS selector to get a part of the page (null by default, returns whole page HTML)
+   outer_html: false, # Boolean | Return outer HTML of the selected element (false by default, returns inner HTML)
+   proxy: 'US', # String | Proxy country code, for geotargeting (US by default)
+   disable_js: false, # Boolean | Disable JS execution (false by default)
+   inline_css: false # Boolean | Inline included CSS files to make page viewable on other domains (false by default)
+ }
+
+ begin
+   # Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
+   result = api_instance.get_page(url, opts)
+   p result
+ rescue WebScrapingAI::ApiError => e
+   puts "Exception when calling HtmlApi->get_page: #{e}"
+ end
+ ```
+
+ ## Documentation for API Endpoints
+
+ All URIs are relative to *https://webscraping.ai/api*
+
+ Class | Method | HTTP request | Description
+ ------------ | ------------- | ------------- | -------------
+ *WebScrapingAI::HtmlApi* | [**get_page**](docs/HtmlApi.md#get_page) | **GET** / | Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
+
+
+ ## Documentation for Models
+
+ - [WebScrapingAI::ScrappedPage](docs/ScrappedPage.md)
+
+
+ ## Documentation for Authorization
+
+
+ ### api_key
+
+
+ - **Type**: API key
+ - **API key parameter name**: api_key
+ - **Location**: URL query string
+
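Because the `api_key` credential is sent in the URL query string rather than in a header, every request URL ends up carrying both the page `url` and your key. The sketch below illustrates this with a raw `Net::HTTP` call, assuming the documented base URI and the `api_key`/`url` parameter names; it is only an illustration of how the key is transmitted, since the generated client builds this URL for you:

```ruby
require 'net/http'
require 'uri'

# Equivalent raw request: GET https://webscraping.ai/api/?api_key=...&url=...
uri = URI('https://webscraping.ai/api/')
uri.query = URI.encode_www_form(
  api_key: 'test-api-key',     # API key, passed as a query parameter
  url: 'https://example.com',  # page to fetch
  proxy: 'US'                  # optional geotargeting
)

response = Net::HTTP.get_response(uri)
puts response.code
```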
data/Rakefile ADDED
@@ -0,0 +1,10 @@
+ require "bundler/gem_tasks"
+
+ begin
+   require 'rspec/core/rake_task'
+
+   RSpec::Core::RakeTask.new(:spec)
+   task default: :spec
+ rescue LoadError
+   # no rspec available
+ end
data/docs/HtmlApi.md ADDED
@@ -0,0 +1,73 @@
+ # WebScrapingAI::HtmlApi
+
+ All URIs are relative to *https://webscraping.ai/api*
+
+ Method | HTTP request | Description
+ ------------- | ------------- | -------------
+ [**get_page**](HtmlApi.md#get_page) | **GET** / | Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
+
+
+
+ ## get_page
+
+ > ScrappedPage get_page(url, opts)
+
+ Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
+
+ ### Example
+
+ ```ruby
+ # Load the gem
+ require 'webscraping_ai'
+
+ # Set up authorization
+ WebScrapingAI.configure do |config|
+   # Configure API key authorization: api_key
+   config.api_key['api_key'] = 'YOUR API KEY'
+   # Uncomment the following line to set a prefix for the API key, e.g. 'Bearer' (defaults to nil)
+   # config.api_key_prefix['api_key'] = 'Bearer'
+ end
+
+ api_instance = WebScrapingAI::HtmlApi.new
+ url = 'https://example.com' # String | URL of the page to get
+ opts = {
+   selector: 'html', # String | CSS selector to get a part of the page (null by default, returns whole page HTML)
+   outer_html: false, # Boolean | Return outer HTML of the selected element (false by default, returns inner HTML)
+   proxy: 'US', # String | Proxy country code, for geotargeting (US by default)
+   disable_js: false, # Boolean | Disable JS execution (false by default)
+   inline_css: false # Boolean | Inline included CSS files to make page viewable on other domains (false by default)
+ }
+
+ begin
+   # Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
+   result = api_instance.get_page(url, opts)
+   p result
+ rescue WebScrapingAI::ApiError => e
+   puts "Exception when calling HtmlApi->get_page: #{e}"
+ end
+ ```
+
+ ### Parameters
+
+
+ Name | Type | Description | Notes
+ ------------- | ------------- | ------------- | -------------
+ **url** | **String** | URL of the page to get |
+ **selector** | **String** | CSS selector to get a part of the page (null by default, returns whole page HTML) | [optional]
+ **outer_html** | **Boolean** | Return outer HTML of the selected element (false by default, returns inner HTML) | [optional]
+ **proxy** | **String** | Proxy country code, for geotargeting (US by default) | [optional]
+ **disable_js** | **Boolean** | Disable JS execution (false by default) | [optional]
+ **inline_css** | **Boolean** | Inline included CSS files to make page viewable on other domains (false by default) | [optional]
+
+ ### Return type
+
+ [**ScrappedPage**](ScrappedPage.md)
+
+ ### Authorization
+
+ [api_key](../README.md#api_key)
+
+ ### HTTP request headers
+
+ - **Content-Type**: Not defined
+ - **Accept**: application/json
+
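As a small variation on the example above (not part of the generated docs), the `selector`, `outer_html` and `disable_js` options from the parameter table can be combined to fetch a single element of a page; the selector and URL below are illustrative:

```ruby
require 'webscraping_ai'

WebScrapingAI.configure do |config|
  config.api_key['api_key'] = 'YOUR API KEY'
end

api = WebScrapingAI::HtmlApi.new

# Return only the <title> element, including the tag itself (outer_html: true),
# and skip JS rendering since the title is usually present in the static HTML.
page = api.get_page('https://example.com', selector: 'title', outer_html: true, disable_js: true)
puts page.html
```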
data/docs/ScrappedPage.md ADDED
@@ -0,0 +1,23 @@
+ # WebScrapingAI::ScrappedPage
+
+ ## Properties
+
+ Name | Type | Description | Notes
+ ------------ | ------------- | ------------- | -------------
+ **size_bytes** | **Integer** | Page HTML content size in bytes | [optional]
+ **html** | **String** | HTML of the full page or a selected area | [optional]
+ **status** | **Integer** | Response HTTP status code (200, 404, 302, etc.) | [optional]
+ **status_message** | **String** | Response HTTP status message | [optional]
+
+ ## Code Sample
+
+ ```ruby
+ require 'webscraping_ai'
+
+ instance = WebScrapingAI::ScrappedPage.new(size_bytes: nil,
+                                            html: nil,
+                                            status: nil,
+                                            status_message: nil)
+ ```
+
+
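Since `HtmlApi#get_page` returns a `ScrappedPage`, the properties listed above are read directly off the result. A minimal sketch, assuming the client has already been configured with an API key as in the README:

```ruby
page = WebScrapingAI::HtmlApi.new.get_page('https://example.com')

# Inspect the documented ScrappedPage attributes
puts "HTTP #{page.status} #{page.status_message}"  # e.g. "HTTP 200 OK"
puts "#{page.size_bytes} bytes of HTML"
puts page.html.to_s[0, 200]                        # first 200 characters of the page
```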
data/git_push.sh ADDED
@@ -0,0 +1,58 @@
+ #!/bin/sh
+ # ref: https://help.github.com/articles/adding-an-existing-project-to-github-using-the-command-line/
+ #
+ # Usage example: /bin/sh ./git_push.sh wing328 openapi-petstore-perl "minor update" "gitlab.com"
+
+ git_user_id=$1
+ git_repo_id=$2
+ release_note=$3
+ git_host=$4
+
+ if [ "$git_host" = "" ]; then
+   git_host="github.com"
+   echo "[INFO] No command line input provided. Set \$git_host to $git_host"
+ fi
+
+ if [ "$git_user_id" = "" ]; then
+   git_user_id="webscraping-ai"
+   echo "[INFO] No command line input provided. Set \$git_user_id to $git_user_id"
+ fi
+
+ if [ "$git_repo_id" = "" ]; then
+   git_repo_id="webscraping-ai-ruby"
+   echo "[INFO] No command line input provided. Set \$git_repo_id to $git_repo_id"
+ fi
+
+ if [ "$release_note" = "" ]; then
+   release_note="Minor update"
+   echo "[INFO] No command line input provided. Set \$release_note to $release_note"
+ fi
+
+ # Initialize the local directory as a Git repository
+ git init
+
+ # Add the files in the local repository and stage them for commit
+ git add .
+
+ # Commit the tracked changes and prepare them to be pushed to a remote repository
+ git commit -m "$release_note"
+
+ # Set the new remote
+ git_remote=`git remote`
+ if [ "$git_remote" = "" ]; then # git remote not defined
+
+   if [ "$GIT_TOKEN" = "" ]; then
+     echo "[INFO] \$GIT_TOKEN (environment variable) is not set. Using the git credentials in your environment."
+     git remote add origin https://${git_host}/${git_user_id}/${git_repo_id}.git
+   else
+     git remote add origin https://${git_user_id}:${GIT_TOKEN}@${git_host}/${git_user_id}/${git_repo_id}.git
+   fi
+
+ fi
+
+ git pull origin master
+
+ # Push (force) the changes in the local repository up to the remote repository
+ echo "Git pushing to https://${git_host}/${git_user_id}/${git_repo_id}.git"
+ git push origin master 2>&1 | grep -v 'To https'
+
@@ -0,0 +1,98 @@
+ =begin
+ #WebScraping.AI
+
+ #A client for https://webscraping.ai API. It provides Chrome JS rendering, rotating proxies and HTML parsing for web scraping.
+
+ The version of the OpenAPI document: 1.0.0
+
+ Generated by: https://openapi-generator.tech
+ OpenAPI Generator version: 4.2.3
+
+ =end
+
+ require 'cgi'
+
+ module WebScrapingAI
+   class HtmlApi
+     attr_accessor :api_client
+
+     def initialize(api_client = ApiClient.default)
+       @api_client = api_client
+     end
+     # Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
+     # @param url [String] URL of the page to get
+     # @param [Hash] opts the optional parameters
+     # @option opts [String] :selector CSS selector to get a part of the page (null by default, returns whole page HTML)
+     # @option opts [Boolean] :outer_html Return outer HTML of the selected element (false by default, returns inner HTML)
+     # @option opts [String] :proxy Proxy country code, for geotargeting (US by default)
+     # @option opts [Boolean] :disable_js Disable JS execution (false by default)
+     # @option opts [Boolean] :inline_css Inline included CSS files to make page viewable on other domains (false by default)
+     # @return [ScrappedPage]
+     def get_page(url, opts = {})
+       data, _status_code, _headers = get_page_with_http_info(url, opts)
+       data
+     end
+
+     # Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
+     # @param url [String] URL of the page to get
+     # @param [Hash] opts the optional parameters
+     # @option opts [String] :selector CSS selector to get a part of the page (null by default, returns whole page HTML)
+     # @option opts [Boolean] :outer_html Return outer HTML of the selected element (false by default, returns inner HTML)
+     # @option opts [String] :proxy Proxy country code, for geotargeting (US by default)
+     # @option opts [Boolean] :disable_js Disable JS execution (false by default)
+     # @option opts [Boolean] :inline_css Inline included CSS files to make page viewable on other domains (false by default)
+     # @return [Array<(ScrappedPage, Integer, Hash)>] ScrappedPage data, response status code and response headers
+     def get_page_with_http_info(url, opts = {})
+       if @api_client.config.debugging
+         @api_client.config.logger.debug 'Calling API: HtmlApi.get_page ...'
+       end
+       # verify the required parameter 'url' is set
+       if @api_client.config.client_side_validation && url.nil?
+         fail ArgumentError, "Missing the required parameter 'url' when calling HtmlApi.get_page"
+       end
+       # resource path
+       local_var_path = '/'
+
+       # query parameters
+       query_params = opts[:query_params] || {}
+       query_params[:'url'] = url
+       query_params[:'selector'] = opts[:'selector'] if !opts[:'selector'].nil?
+       query_params[:'outer_html'] = opts[:'outer_html'] if !opts[:'outer_html'].nil?
+       query_params[:'proxy'] = opts[:'proxy'] if !opts[:'proxy'].nil?
+       query_params[:'disable_js'] = opts[:'disable_js'] if !opts[:'disable_js'].nil?
+       query_params[:'inline_css'] = opts[:'inline_css'] if !opts[:'inline_css'].nil?
+
+       # header parameters
+       header_params = opts[:header_params] || {}
+       # HTTP header 'Accept' (if needed)
+       header_params['Accept'] = @api_client.select_header_accept(['application/json'])
+
+       # form parameters
+       form_params = opts[:form_params] || {}
+
+       # http body (model)
+       post_body = opts[:body]
+
+       # return_type
+       return_type = opts[:return_type] || 'ScrappedPage'
+
+       # auth_names
+       auth_names = opts[:auth_names] || ['api_key']
+
+       new_options = opts.merge(
+         :header_params => header_params,
+         :query_params => query_params,
+         :form_params => form_params,
+         :body => post_body,
+         :auth_names => auth_names,
+         :return_type => return_type
+       )
+
+       data, status_code, headers = @api_client.call_api(:GET, local_var_path, new_options)
+       if @api_client.config.debugging
+         @api_client.config.logger.debug "API called: HtmlApi#get_page\nData: #{data.inspect}\nStatus code: #{status_code}\nHeaders: #{headers}"
+       end
+       return data, status_code, headers
+     end
+   end
+ end
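
Besides `get_page`, the `get_page_with_http_info` variant defined above also returns the response status code and headers, which is handy when you want to handle non-200 responses yourself. A minimal usage sketch, assuming a configured client:

```ruby
api = WebScrapingAI::HtmlApi.new

# get_page_with_http_info returns [ScrappedPage, Integer, Hash]
page, status_code, headers = api.get_page_with_http_info('https://example.com', proxy: 'US')

if status_code == 200
  puts page.html
else
  warn "Unexpected response #{status_code}: #{headers.inspect}"
end
```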