webscraping_ai 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +70 -0
- data/README.md +78 -0
- data/Rakefile +10 -0
- data/docs/HtmlApi.md +73 -0
- data/docs/ScrappedPage.md +23 -0
- data/git_push.sh +58 -0
- data/lib/webscraping_ai/api/html_api.rb +98 -0
- data/lib/webscraping_ai/api_client.rb +386 -0
- data/lib/webscraping_ai/api_error.rb +57 -0
- data/lib/webscraping_ai/configuration.rb +248 -0
- data/lib/webscraping_ai/models/scrapped_page.rb +237 -0
- data/lib/webscraping_ai/version.rb +15 -0
- data/lib/webscraping_ai.rb +41 -0
- data/spec/api/html_api_spec.rb +46 -0
- data/spec/api_client_spec.rb +226 -0
- data/spec/configuration_spec.rb +42 -0
- data/spec/models/scrapped_page_spec.rb +59 -0
- data/spec/spec_helper.rb +111 -0
- data/webscraping_ai.gemspec +39 -0
- metadata +129 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 87c7c3987da110e442c1a4af9c64faf489e1357e19e961d9e8ccf064986b2cf0
|
4
|
+
data.tar.gz: 7705c46a0f70dccbbc8c3e51467e37c84faab243330bbeb6069733fa732a3afa
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 33d32382161eb7cdbacfb27ffb75c0d758b507e7945adc8c9d418fee2ff6bda748d1b5671ed7c2f430af3fc522537856a7a2ef85fd844e92272203d3d8f06341
|
7
|
+
data.tar.gz: 3482be7567deceb06613d2cf82a9d56d1cc7c9ef51b8a27fde3d2df283c91f3d695fceb9dca8adc96a8f48f81c1ad624c9419f1eb700f533e932285e9e512da8
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
webscraping_ai (1.0.0)
|
5
|
+
json (~> 2.1, >= 2.1.0)
|
6
|
+
typhoeus (~> 1.0, >= 1.0.1)
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: https://rubygems.org/
|
10
|
+
specs:
|
11
|
+
ast (2.4.0)
|
12
|
+
byebug (11.1.1)
|
13
|
+
coderay (1.1.2)
|
14
|
+
diff-lcs (1.3)
|
15
|
+
ethon (0.12.0)
|
16
|
+
ffi (>= 1.3.0)
|
17
|
+
ffi (1.12.2)
|
18
|
+
jaro_winkler (1.5.4)
|
19
|
+
json (2.3.0)
|
20
|
+
method_source (1.0.0)
|
21
|
+
parallel (1.19.1)
|
22
|
+
parser (2.7.0.5)
|
23
|
+
ast (~> 2.4.0)
|
24
|
+
pry (0.13.0)
|
25
|
+
coderay (~> 1.1)
|
26
|
+
method_source (~> 1.0)
|
27
|
+
pry-byebug (3.9.0)
|
28
|
+
byebug (~> 11.0)
|
29
|
+
pry (~> 0.13.0)
|
30
|
+
psych (3.1.0)
|
31
|
+
rainbow (3.0.0)
|
32
|
+
rake (12.0.0)
|
33
|
+
rspec (3.9.0)
|
34
|
+
rspec-core (~> 3.9.0)
|
35
|
+
rspec-expectations (~> 3.9.0)
|
36
|
+
rspec-mocks (~> 3.9.0)
|
37
|
+
rspec-core (3.9.1)
|
38
|
+
rspec-support (~> 3.9.1)
|
39
|
+
rspec-expectations (3.9.1)
|
40
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
41
|
+
rspec-support (~> 3.9.0)
|
42
|
+
rspec-mocks (3.9.1)
|
43
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
44
|
+
rspec-support (~> 3.9.0)
|
45
|
+
rspec-support (3.9.2)
|
46
|
+
rubocop (0.66.0)
|
47
|
+
jaro_winkler (~> 1.5.1)
|
48
|
+
parallel (~> 1.10)
|
49
|
+
parser (>= 2.5, != 2.5.1.1)
|
50
|
+
psych (>= 3.1.0)
|
51
|
+
rainbow (>= 2.2.2, < 4.0)
|
52
|
+
ruby-progressbar (~> 1.7)
|
53
|
+
unicode-display_width (>= 1.4.0, < 1.6)
|
54
|
+
ruby-progressbar (1.10.1)
|
55
|
+
typhoeus (1.3.1)
|
56
|
+
ethon (>= 0.9.0)
|
57
|
+
unicode-display_width (1.5.0)
|
58
|
+
|
59
|
+
PLATFORMS
|
60
|
+
ruby
|
61
|
+
|
62
|
+
DEPENDENCIES
|
63
|
+
pry-byebug
|
64
|
+
rake (~> 12.0.0)
|
65
|
+
rspec (~> 3.6, >= 3.6.0)
|
66
|
+
rubocop (~> 0.66.0)
|
67
|
+
webscraping_ai!
|
68
|
+
|
69
|
+
BUNDLED WITH
|
70
|
+
2.1.2
|
data/README.md
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
# webscraping_ai
|
2
|
+
|
3
|
+
WebScrapingAI - the Ruby gem for the WebScraping.AI
|
4
|
+
|
5
|
+
A client for https://webscraping.ai API. It provides Chrome JS rendering, rotating proxies and HTML parsing for web scraping.
|
6
|
+
|
7
|
+
This SDK is automatically generated by the [OpenAPI Generator](https://openapi-generator.tech) project:
|
8
|
+
|
9
|
+
- API version: 1.0.0
|
10
|
+
- Package version: 1.0.0
|
11
|
+
- Build package: org.openapitools.codegen.languages.RubyClientCodegen
|
12
|
+
|
13
|
+
## Installation
|
14
|
+
|
15
|
+
### Install from RubyGems
|
16
|
+
|
17
|
+
Add the following in the Gemfile:
|
18
|
+
|
19
|
+
gem 'webscraping_ai'
|
20
|
+
|
21
|
+
## Getting Started
|
22
|
+
|
23
|
+
Please follow the [installation](#installation) procedure and then run the following code:
|
24
|
+
|
25
|
+
```ruby
|
26
|
+
# Load the gem
|
27
|
+
require 'webscraping_ai'
|
28
|
+
|
29
|
+
# Setup authorization
|
30
|
+
WebScrapingAI.configure do |config|
|
31
|
+
# Configure API key authorization: api_key
|
32
|
+
config.api_key['api_key'] = 'test-api-key'
|
33
|
+
end
|
34
|
+
|
35
|
+
api_instance = WebScrapingAI::HtmlApi.new
|
36
|
+
url = 'https://example.com' # String | URL of the page to get
|
37
|
+
opts = {
|
38
|
+
selector: 'html', # String | CSS selector to get a part of the page (null by default, returns whole page HTML)
|
39
|
+
outer_html: false, # Boolean | Return outer HTML of the selected element (false by default, returns inner HTML)
|
40
|
+
proxy: 'US', # String | Proxy country code, for geotargeting (US by default)
|
41
|
+
disable_js: false, # Boolean | Disable JS execution (false by default)
|
42
|
+
inline_css: false # Boolean | Inline included CSS files to make page viewable on other domains (false by default)
|
43
|
+
}
|
44
|
+
|
45
|
+
begin
|
46
|
+
#Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
|
47
|
+
result = api_instance.get_page(url, opts)
|
48
|
+
p result
|
49
|
+
rescue WebScrapingAI::ApiError => e
|
50
|
+
puts "Exception when calling HtmlApi->get_page: #{e}"
|
51
|
+
end
|
52
|
+
|
53
|
+
```
|
54
|
+
|
55
|
+
## Documentation for API Endpoints
|
56
|
+
|
57
|
+
All URIs are relative to *https://webscraping.ai/api*
|
58
|
+
|
59
|
+
Class | Method | HTTP request | Description
|
60
|
+
------------ | ------------- | ------------- | -------------
|
61
|
+
*WebScrapingAI::HtmlApi* | [**get_page**](docs/HtmlApi.md#get_page) | **GET** / | Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
|
62
|
+
|
63
|
+
|
64
|
+
## Documentation for Models
|
65
|
+
|
66
|
+
- [WebScrapingAI::ScrappedPage](docs/ScrappedPage.md)
|
67
|
+
|
68
|
+
|
69
|
+
## Documentation for Authorization
|
70
|
+
|
71
|
+
|
72
|
+
### api_key
|
73
|
+
|
74
|
+
|
75
|
+
- **Type**: API key
|
76
|
+
- **API key parameter name**: api_key
|
77
|
+
- **Location**: URL query string
|
78
|
+
|
data/Rakefile
ADDED
data/docs/HtmlApi.md
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
# WebScrapingAI::HtmlApi
|
2
|
+
|
3
|
+
All URIs are relative to *https://webscraping.ai/api*
|
4
|
+
|
5
|
+
Method | HTTP request | Description
|
6
|
+
------------- | ------------- | -------------
|
7
|
+
[**get_page**](HtmlApi.md#get_page) | **GET** / | Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
|
8
|
+
|
9
|
+
|
10
|
+
|
11
|
+
## get_page
|
12
|
+
|
13
|
+
> ScrappedPage get_page(url, opts)
|
14
|
+
|
15
|
+
Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
|
16
|
+
|
17
|
+
### Example
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
# load the gem
|
21
|
+
require 'webscraping_ai'
|
22
|
+
# setup authorization
|
23
|
+
WebScrapingAI.configure do |config|
|
24
|
+
# Configure API key authorization: api_key
|
25
|
+
config.api_key['api_key'] = 'YOUR API KEY'
|
26
|
+
# Uncomment the following line to set a prefix for the API key, e.g. 'Bearer' (defaults to nil)
|
27
|
+
#config.api_key_prefix['api_key'] = 'Bearer'
|
28
|
+
end
|
29
|
+
|
30
|
+
api_instance = WebScrapingAI::HtmlApi.new
|
31
|
+
url = 'https://example.com' # String | URL of the page to get
|
32
|
+
opts = {
|
33
|
+
selector: 'html', # String | CSS selector to get a part of the page (null by default, returns whole page HTML)
|
34
|
+
outer_html: false, # Boolean | Return outer HTML of the selected element (false by default, returns inner HTML)
|
35
|
+
proxy: 'US', # String | Proxy country code, for geotargeting (US by default)
|
36
|
+
disable_js: false, # Boolean | Disable JS execution (false by default)
|
37
|
+
inline_css: false # Boolean | Inline included CSS files to make page viewable on other domains (false by default)
|
38
|
+
}
|
39
|
+
|
40
|
+
begin
|
41
|
+
#Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
|
42
|
+
result = api_instance.get_page(url, opts)
|
43
|
+
p result
|
44
|
+
rescue WebScrapingAI::ApiError => e
|
45
|
+
puts "Exception when calling HtmlApi->get_page: #{e}"
|
46
|
+
end
|
47
|
+
```
|
48
|
+
|
49
|
+
### Parameters
|
50
|
+
|
51
|
+
|
52
|
+
Name | Type | Description | Notes
|
53
|
+
------------- | ------------- | ------------- | -------------
|
54
|
+
**url** | **String**| URL of the page to get |
|
55
|
+
**selector** | **String**| CSS selector to get a part of the page (null by default, returns whole page HTML) | [optional]
|
56
|
+
**outer_html** | **Boolean**| Return outer HTML of the selected element (false by default, returns inner HTML) | [optional]
|
57
|
+
**proxy** | **String**| Proxy country code, for geotargeting (US by default) | [optional]
|
58
|
+
**disable_js** | **Boolean**| Disable JS execution (false by default) | [optional]
|
59
|
+
**inline_css** | **Boolean**| Inline included CSS files to make page viewable on other domains (false by default) | [optional]
|
60
|
+
|
61
|
+
### Return type
|
62
|
+
|
63
|
+
[**ScrappedPage**](ScrappedPage.md)
|
64
|
+
|
65
|
+
### Authorization
|
66
|
+
|
67
|
+
[api_key](../README.md#api_key)
|
68
|
+
|
69
|
+
### HTTP request headers
|
70
|
+
|
71
|
+
- **Content-Type**: Not defined
|
72
|
+
- **Accept**: application/json
|
73
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# WebScrapingAI::ScrappedPage
|
2
|
+
|
3
|
+
## Properties
|
4
|
+
|
5
|
+
Name | Type | Description | Notes
|
6
|
+
------------ | ------------- | ------------- | -------------
|
7
|
+
**size_bytes** | **Integer** | Page HTML content size in bytes | [optional]
|
8
|
+
**html** | **String** | HTML of the full page or a selected area | [optional]
|
9
|
+
**status** | **Integer** | Response HTTP status code (200, 404, 302, etc) | [optional]
|
10
|
+
**status_message** | **String** | Response HTTP status message | [optional]
|
11
|
+
|
12
|
+
## Code Sample
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
require 'webscraping_ai'
|
16
|
+
|
17
|
+
instance = WebScrapingAI::ScrappedPage.new(size_bytes: nil,
|
18
|
+
html: nil,
|
19
|
+
status: nil,
|
20
|
+
status_message: nil)
|
21
|
+
```
|
22
|
+
|
23
|
+
|
data/git_push.sh
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
#!/bin/sh
# Initialize this directory as a git repo (if needed) and push it to a remote.
# ref: https://help.github.com/articles/adding-an-existing-project-to-github-using-the-command-line/
#
# Usage example: /bin/sh ./git_push.sh wing328 openapi-petstore-perl "minor update" "gitlab.com"

git_user_id=$1
git_repo_id=$2
release_note=$3
git_host=$4

# Fill in defaults for any positional argument that was not supplied.
if [ -z "$git_host" ]; then
  git_host="github.com"
  echo "[INFO] No command line input provided. Set \$git_host to $git_host"
fi

if [ -z "$git_user_id" ]; then
  git_user_id="webscraping-ai"
  echo "[INFO] No command line input provided. Set \$git_user_id to $git_user_id"
fi

if [ -z "$git_repo_id" ]; then
  git_repo_id="webscraping-ai-ruby"
  echo "[INFO] No command line input provided. Set \$git_repo_id to $git_repo_id"
fi

if [ -z "$release_note" ]; then
  release_note="Minor update"
  echo "[INFO] No command line input provided. Set \$release_note to $release_note"
fi

# Initialize the local directory as a Git repository
git init

# Stage everything for commit
git add .

# Commit the staged changes
git commit -m "$release_note"

# Configure the remote only when none exists yet; embed $GIT_TOKEN in the URL
# when it is available so the push can authenticate non-interactively.
if [ -z "$(git remote)" ]; then
  if [ -z "$GIT_TOKEN" ]; then
    echo "[INFO] \$GIT_TOKEN (environment variable) is not set. Using the git credential in your environment."
    git remote add origin https://${git_host}/${git_user_id}/${git_repo_id}.git
  else
    git remote add origin https://${git_user_id}:${GIT_TOKEN}@${git_host}/${git_user_id}/${git_repo_id}.git
  fi
fi

git pull origin master

# Push (force of habit: filter the noisy 'To https' line from git's output)
echo "Git pushing to https://${git_host}/${git_user_id}/${git_repo_id}.git"
git push origin master 2>&1 | grep -v 'To https'
58
|
+
|
@@ -0,0 +1,98 @@
|
|
1
|
+
=begin
#WebScraping.AI

#A client for https://webscraping.ai API. It provides Chrome JS rendering, rotating proxies and HTML parsing for web scraping.

The version of the OpenAPI document: 1.0.0

Generated by: https://openapi-generator.tech
OpenAPI Generator version: 4.2.3

=end

require 'cgi'

module WebScrapingAI
  # API operations for fetching page HTML through the WebScraping.AI service.
  class HtmlApi
    attr_accessor :api_client

    # @param api_client [ApiClient] transport used to issue requests
    #   (falls back to the process-wide default client when omitted)
    def initialize(api_client = ApiClient.default)
      @api_client = api_client
    end

    # Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
    # @param url [String] URL of the page to get
    # @param [Hash] opts the optional parameters
    # @option opts [String] :selector CSS selector to get a part of the page (null by default, returns whole page HTML)
    # @option opts [Boolean] :outer_html Return outer HTML of the selected element (false by default, returns inner HTML)
    # @option opts [String] :proxy Proxy country code, for geotargeting (US by default)
    # @option opts [Boolean] :disable_js Disable JS execution (false by default)
    # @option opts [Boolean] :inline_css Inline included CSS files to make page viewable on other domains (false by default)
    # @return [ScrappedPage]
    def get_page(url, opts = {})
      page, = get_page_with_http_info(url, opts)
      page
    end

    # Same as #get_page, but also exposes the HTTP status code and headers.
    # @param url [String] URL of the page to get
    # @param [Hash] opts the optional parameters (see #get_page)
    # @return [Array<(ScrappedPage, Integer, Hash)>] ScrappedPage data, response status code and response headers
    def get_page_with_http_info(url, opts = {})
      @api_client.config.logger.debug 'Calling API: HtmlApi.get_page ...' if @api_client.config.debugging

      # 'url' is the only required parameter; reject a nil value up front.
      if @api_client.config.client_side_validation && url.nil?
        fail ArgumentError, "Missing the required parameter 'url' when calling HtmlApi.get_page"
      end

      # Every parameter of this endpoint travels in the query string of GET /.
      query_params = opts[:query_params] || {}
      query_params[:url] = url
      %i[selector outer_html proxy disable_js inline_css].each do |name|
        query_params[name] = opts[name] unless opts[name].nil?
      end

      header_params = opts[:header_params] || {}
      header_params['Accept'] = @api_client.select_header_accept(['application/json'])

      # Caller-supplied overrides win; otherwise use the endpoint defaults.
      new_options = opts.merge(
        header_params: header_params,
        query_params: query_params,
        form_params: opts[:form_params] || {},
        body: opts[:body],
        auth_names: opts[:auth_names] || ['api_key'],
        return_type: opts[:return_type] || 'ScrappedPage'
      )

      data, status_code, headers = @api_client.call_api(:GET, '/', new_options)
      if @api_client.config.debugging
        @api_client.config.logger.debug "API called: HtmlApi#get_page\nData: #{data.inspect}\nStatus code: #{status_code}\nHeaders: #{headers}"
      end
      [data, status_code, headers]
    end
  end
end
|