webscraping_ai 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +70 -0
- data/README.md +78 -0
- data/Rakefile +10 -0
- data/docs/HtmlApi.md +73 -0
- data/docs/ScrappedPage.md +23 -0
- data/git_push.sh +58 -0
- data/lib/webscraping_ai/api/html_api.rb +98 -0
- data/lib/webscraping_ai/api_client.rb +386 -0
- data/lib/webscraping_ai/api_error.rb +57 -0
- data/lib/webscraping_ai/configuration.rb +248 -0
- data/lib/webscraping_ai/models/scrapped_page.rb +237 -0
- data/lib/webscraping_ai/version.rb +15 -0
- data/lib/webscraping_ai.rb +41 -0
- data/spec/api/html_api_spec.rb +46 -0
- data/spec/api_client_spec.rb +226 -0
- data/spec/configuration_spec.rb +42 -0
- data/spec/models/scrapped_page_spec.rb +59 -0
- data/spec/spec_helper.rb +111 -0
- data/webscraping_ai.gemspec +39 -0
- metadata +129 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 87c7c3987da110e442c1a4af9c64faf489e1357e19e961d9e8ccf064986b2cf0
|
4
|
+
data.tar.gz: 7705c46a0f70dccbbc8c3e51467e37c84faab243330bbeb6069733fa732a3afa
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 33d32382161eb7cdbacfb27ffb75c0d758b507e7945adc8c9d418fee2ff6bda748d1b5671ed7c2f430af3fc522537856a7a2ef85fd844e92272203d3d8f06341
|
7
|
+
data.tar.gz: 3482be7567deceb06613d2cf82a9d56d1cc7c9ef51b8a27fde3d2df283c91f3d695fceb9dca8adc96a8f48f81c1ad624c9419f1eb700f533e932285e9e512da8
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
webscraping_ai (1.0.0)
|
5
|
+
json (~> 2.1, >= 2.1.0)
|
6
|
+
typhoeus (~> 1.0, >= 1.0.1)
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: https://rubygems.org/
|
10
|
+
specs:
|
11
|
+
ast (2.4.0)
|
12
|
+
byebug (11.1.1)
|
13
|
+
coderay (1.1.2)
|
14
|
+
diff-lcs (1.3)
|
15
|
+
ethon (0.12.0)
|
16
|
+
ffi (>= 1.3.0)
|
17
|
+
ffi (1.12.2)
|
18
|
+
jaro_winkler (1.5.4)
|
19
|
+
json (2.3.0)
|
20
|
+
method_source (1.0.0)
|
21
|
+
parallel (1.19.1)
|
22
|
+
parser (2.7.0.5)
|
23
|
+
ast (~> 2.4.0)
|
24
|
+
pry (0.13.0)
|
25
|
+
coderay (~> 1.1)
|
26
|
+
method_source (~> 1.0)
|
27
|
+
pry-byebug (3.9.0)
|
28
|
+
byebug (~> 11.0)
|
29
|
+
pry (~> 0.13.0)
|
30
|
+
psych (3.1.0)
|
31
|
+
rainbow (3.0.0)
|
32
|
+
rake (12.0.0)
|
33
|
+
rspec (3.9.0)
|
34
|
+
rspec-core (~> 3.9.0)
|
35
|
+
rspec-expectations (~> 3.9.0)
|
36
|
+
rspec-mocks (~> 3.9.0)
|
37
|
+
rspec-core (3.9.1)
|
38
|
+
rspec-support (~> 3.9.1)
|
39
|
+
rspec-expectations (3.9.1)
|
40
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
41
|
+
rspec-support (~> 3.9.0)
|
42
|
+
rspec-mocks (3.9.1)
|
43
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
44
|
+
rspec-support (~> 3.9.0)
|
45
|
+
rspec-support (3.9.2)
|
46
|
+
rubocop (0.66.0)
|
47
|
+
jaro_winkler (~> 1.5.1)
|
48
|
+
parallel (~> 1.10)
|
49
|
+
parser (>= 2.5, != 2.5.1.1)
|
50
|
+
psych (>= 3.1.0)
|
51
|
+
rainbow (>= 2.2.2, < 4.0)
|
52
|
+
ruby-progressbar (~> 1.7)
|
53
|
+
unicode-display_width (>= 1.4.0, < 1.6)
|
54
|
+
ruby-progressbar (1.10.1)
|
55
|
+
typhoeus (1.3.1)
|
56
|
+
ethon (>= 0.9.0)
|
57
|
+
unicode-display_width (1.5.0)
|
58
|
+
|
59
|
+
PLATFORMS
|
60
|
+
ruby
|
61
|
+
|
62
|
+
DEPENDENCIES
|
63
|
+
pry-byebug
|
64
|
+
rake (~> 12.0.0)
|
65
|
+
rspec (~> 3.6, >= 3.6.0)
|
66
|
+
rubocop (~> 0.66.0)
|
67
|
+
webscraping_ai!
|
68
|
+
|
69
|
+
BUNDLED WITH
|
70
|
+
2.1.2
|
data/README.md
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
# webscraping_ai
|
2
|
+
|
3
|
+
WebScrapingAI - the Ruby gem for the WebScraping.AI API
|
4
|
+
|
5
|
+
A client for the https://webscraping.ai API. It provides Chrome JS rendering, rotating proxies and HTML parsing for web scraping.
|
6
|
+
|
7
|
+
This SDK is automatically generated by the [OpenAPI Generator](https://openapi-generator.tech) project:
|
8
|
+
|
9
|
+
- API version: 1.0.0
|
10
|
+
- Package version: 1.0.0
|
11
|
+
- Build package: org.openapitools.codegen.languages.RubyClientCodegen
|
12
|
+
|
13
|
+
## Installation
|
14
|
+
|
15
|
+
### Install from RubyGems
|
16
|
+
|
17
|
+
Add the following in the Gemfile:
|
18
|
+
|
19
|
+
gem 'webscraping_ai'
|
20
|
+
|
21
|
+
## Getting Started
|
22
|
+
|
23
|
+
Please follow the [installation](#installation) procedure and then run the following code:
|
24
|
+
|
25
|
+
```ruby
|
26
|
+
# Load the gem
|
27
|
+
require 'webscraping_ai'
|
28
|
+
|
29
|
+
# Setup authorization
|
30
|
+
WebScrapingAI.configure do |config|
|
31
|
+
# Configure API key authorization: api_key
|
32
|
+
config.api_key['api_key'] = 'test-api-key'
|
33
|
+
end
|
34
|
+
|
35
|
+
api_instance = WebScrapingAI::HtmlApi.new
|
36
|
+
url = 'https://example.com' # String | URL of the page to get
|
37
|
+
opts = {
|
38
|
+
selector: 'html', # String | CSS selector to get a part of the page (null by default, returns whole page HTML)
|
39
|
+
outer_html: false, # Boolean | Return outer HTML of the selected element (false by default, returns inner HTML)
|
40
|
+
proxy: 'US', # String | Proxy country code, for geotargeting (US by default)
|
41
|
+
disable_js: false, # Boolean | Disable JS execution (false by default)
|
42
|
+
inline_css: false # Boolean | Inline included CSS files to make page viewable on other domains (false by default)
|
43
|
+
}
|
44
|
+
|
45
|
+
begin
|
46
|
+
#Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
|
47
|
+
result = api_instance.get_page(url, opts)
|
48
|
+
p result
|
49
|
+
rescue WebScrapingAI::ApiError => e
|
50
|
+
puts "Exception when calling HtmlApi->get_page: #{e}"
|
51
|
+
end
|
52
|
+
|
53
|
+
```
|
54
|
+
|
55
|
+
## Documentation for API Endpoints
|
56
|
+
|
57
|
+
All URIs are relative to *https://webscraping.ai/api*
|
58
|
+
|
59
|
+
Class | Method | HTTP request | Description
|
60
|
+
------------ | ------------- | ------------- | -------------
|
61
|
+
*WebScrapingAI::HtmlApi* | [**get_page**](docs/HtmlApi.md#get_page) | **GET** / | Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
|
62
|
+
|
63
|
+
|
64
|
+
## Documentation for Models
|
65
|
+
|
66
|
+
- [WebScrapingAI::ScrappedPage](docs/ScrappedPage.md)
|
67
|
+
|
68
|
+
|
69
|
+
## Documentation for Authorization
|
70
|
+
|
71
|
+
|
72
|
+
### api_key
|
73
|
+
|
74
|
+
|
75
|
+
- **Type**: API key
|
76
|
+
- **API key parameter name**: api_key
|
77
|
+
- **Location**: URL query string
|
78
|
+
|
data/Rakefile
ADDED
data/docs/HtmlApi.md
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
# WebScrapingAI::HtmlApi
|
2
|
+
|
3
|
+
All URIs are relative to *https://webscraping.ai/api*
|
4
|
+
|
5
|
+
Method | HTTP request | Description
|
6
|
+
------------- | ------------- | -------------
|
7
|
+
[**get_page**](HtmlApi.md#get_page) | **GET** / | Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
|
8
|
+
|
9
|
+
|
10
|
+
|
11
|
+
## get_page
|
12
|
+
|
13
|
+
> ScrappedPage get_page(url, opts)
|
14
|
+
|
15
|
+
Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
|
16
|
+
|
17
|
+
### Example
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
# load the gem
|
21
|
+
require 'webscraping_ai'
|
22
|
+
# setup authorization
|
23
|
+
WebScrapingAI.configure do |config|
|
24
|
+
# Configure API key authorization: api_key
|
25
|
+
config.api_key['api_key'] = 'YOUR API KEY'
|
26
|
+
# Uncomment the following line to set a prefix for the API key, e.g. 'Bearer' (defaults to nil)
|
27
|
+
#config.api_key_prefix['api_key'] = 'Bearer'
|
28
|
+
end
|
29
|
+
|
30
|
+
api_instance = WebScrapingAI::HtmlApi.new
|
31
|
+
url = 'https://example.com' # String | URL of the page to get
|
32
|
+
opts = {
|
33
|
+
selector: 'html', # String | CSS selector to get a part of the page (null by default, returns whole page HTML)
|
34
|
+
outer_html: false, # Boolean | Return outer HTML of the selected element (false by default, returns inner HTML)
|
35
|
+
proxy: 'US', # String | Proxy country code, for geotargeting (US by default)
|
36
|
+
disable_js: false, # Boolean | Disable JS execution (false by default)
|
37
|
+
inline_css: false # Boolean | Inline included CSS files to make page viewable on other domains (false by default)
|
38
|
+
}
|
39
|
+
|
40
|
+
begin
|
41
|
+
#Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
|
42
|
+
result = api_instance.get_page(url, opts)
|
43
|
+
p result
|
44
|
+
rescue WebScrapingAI::ApiError => e
|
45
|
+
puts "Exception when calling HtmlApi->get_page: #{e}"
|
46
|
+
end
|
47
|
+
```
|
48
|
+
|
49
|
+
### Parameters
|
50
|
+
|
51
|
+
|
52
|
+
Name | Type | Description | Notes
|
53
|
+
------------- | ------------- | ------------- | -------------
|
54
|
+
**url** | **String**| URL of the page to get |
|
55
|
+
**selector** | **String**| CSS selector to get a part of the page (null by default, returns whole page HTML) | [optional]
|
56
|
+
**outer_html** | **Boolean**| Return outer HTML of the selected element (false by default, returns inner HTML) | [optional]
|
57
|
+
**proxy** | **String**| Proxy country code, for geotargeting (US by default) | [optional]
|
58
|
+
**disable_js** | **Boolean**| Disable JS execution (false by default) | [optional]
|
59
|
+
**inline_css** | **Boolean**| Inline included CSS files to make page viewable on other domains (false by default) | [optional]
|
60
|
+
|
61
|
+
### Return type
|
62
|
+
|
63
|
+
[**ScrappedPage**](ScrappedPage.md)
|
64
|
+
|
65
|
+
### Authorization
|
66
|
+
|
67
|
+
[api_key](../README.md#api_key)
|
68
|
+
|
69
|
+
### HTTP request headers
|
70
|
+
|
71
|
+
- **Content-Type**: Not defined
|
72
|
+
- **Accept**: application/json
|
73
|
+
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# WebScrapingAI::ScrappedPage
|
2
|
+
|
3
|
+
## Properties
|
4
|
+
|
5
|
+
Name | Type | Description | Notes
|
6
|
+
------------ | ------------- | ------------- | -------------
|
7
|
+
**size_bytes** | **Integer** | Page HTML content size in bytes | [optional]
|
8
|
+
**html** | **String** | HTML of the full page or a selected area | [optional]
|
9
|
+
**status** | **Integer** | Response HTTP status code (200, 404, 302, etc) | [optional]
|
10
|
+
**status_message** | **String** | Response HTTP status message | [optional]
|
11
|
+
|
12
|
+
## Code Sample
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
require 'webscraping_ai'
|
16
|
+
|
17
|
+
instance = WebScrapingAI::ScrappedPage.new(size_bytes: nil,
|
18
|
+
html: nil,
|
19
|
+
status: nil,
|
20
|
+
status_message: nil)
|
21
|
+
```
|
22
|
+
|
23
|
+
|
data/git_push.sh
ADDED
@@ -0,0 +1,58 @@
|
|
1
|
+
#!/bin/sh
|
2
|
+
# ref: https://help.github.com/articles/adding-an-existing-project-to-github-using-the-command-line/
|
3
|
+
#
|
4
|
+
# Usage example: /bin/sh ./git_push.sh wing328 openapi-petstore-perl "minor update" "gitlab.com"
|
5
|
+
|
6
|
+
git_user_id=$1
|
7
|
+
git_repo_id=$2
|
8
|
+
release_note=$3
|
9
|
+
git_host=$4
|
10
|
+
|
11
|
+
if [ "$git_host" = "" ]; then
|
12
|
+
git_host="github.com"
|
13
|
+
echo "[INFO] No command line input provided. Set \$git_host to $git_host"
|
14
|
+
fi
|
15
|
+
|
16
|
+
if [ "$git_user_id" = "" ]; then
|
17
|
+
git_user_id="webscraping-ai"
|
18
|
+
echo "[INFO] No command line input provided. Set \$git_user_id to $git_user_id"
|
19
|
+
fi
|
20
|
+
|
21
|
+
if [ "$git_repo_id" = "" ]; then
|
22
|
+
git_repo_id="webscraping-ai-ruby"
|
23
|
+
echo "[INFO] No command line input provided. Set \$git_repo_id to $git_repo_id"
|
24
|
+
fi
|
25
|
+
|
26
|
+
if [ "$release_note" = "" ]; then
|
27
|
+
release_note="Minor update"
|
28
|
+
echo "[INFO] No command line input provided. Set \$release_note to $release_note"
|
29
|
+
fi
|
30
|
+
|
31
|
+
# Initialize the local directory as a Git repository
|
32
|
+
git init
|
33
|
+
|
34
|
+
# Adds the files in the local repository and stages them for commit.
|
35
|
+
git add .
|
36
|
+
|
37
|
+
# Commits the tracked changes and prepares them to be pushed to a remote repository.
|
38
|
+
git commit -m "$release_note"
|
39
|
+
|
40
|
+
# Sets the new remote
|
41
|
+
git_remote=`git remote`
|
42
|
+
if [ "$git_remote" = "" ]; then # git remote not defined
|
43
|
+
|
44
|
+
if [ "$GIT_TOKEN" = "" ]; then
|
45
|
+
echo "[INFO] \$GIT_TOKEN (environment variable) is not set. Using the git credential in your environment."
|
46
|
+
git remote add origin https://${git_host}/${git_user_id}/${git_repo_id}.git
|
47
|
+
else
|
48
|
+
git remote add origin https://${git_user_id}:${GIT_TOKEN}@${git_host}/${git_user_id}/${git_repo_id}.git
|
49
|
+
fi
|
50
|
+
|
51
|
+
fi
|
52
|
+
|
53
|
+
git pull origin master
|
54
|
+
|
55
|
+
# Pushes the changes in the local repository up to the remote repository (not a forced push)
|
56
|
+
echo "Git pushing to https://${git_host}/${git_user_id}/${git_repo_id}.git"
|
57
|
+
git push origin master 2>&1 | grep -v 'To https'
|
58
|
+
|
@@ -0,0 +1,98 @@
|
|
1
|
+
=begin
|
2
|
+
#WebScraping.AI
|
3
|
+
|
4
|
+
#A client for the https://webscraping.ai API. It provides Chrome JS rendering, rotating proxies and HTML parsing for web scraping.
|
5
|
+
|
6
|
+
The version of the OpenAPI document: 1.0.0
|
7
|
+
|
8
|
+
Generated by: https://openapi-generator.tech
|
9
|
+
OpenAPI Generator version: 4.2.3
|
10
|
+
|
11
|
+
=end
|
12
|
+
|
13
|
+
require 'cgi'
|
14
|
+
|
15
|
+
module WebScrapingAI
|
16
|
+
class HtmlApi
|
17
|
+
attr_accessor :api_client
|
18
|
+
|
19
|
+
def initialize(api_client = ApiClient.default)
|
20
|
+
@api_client = api_client
|
21
|
+
end
|
22
|
+
# Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
|
23
|
+
# @param url [String] URL of the page to get
|
24
|
+
# @param [Hash] opts the optional parameters
|
25
|
+
# @option opts [String] :selector CSS selector to get a part of the page (null by default, returns whole page HTML)
|
26
|
+
# @option opts [Boolean] :outer_html Return outer HTML of the selected element (false by default, returns inner HTML)
|
27
|
+
# @option opts [String] :proxy Proxy country code, for geotargeting (US by default)
|
28
|
+
# @option opts [Boolean] :disable_js Disable JS execution (false by default)
|
29
|
+
# @option opts [Boolean] :inline_css Inline included CSS files to make page viewable on other domains (false by default)
|
30
|
+
# @return [ScrappedPage]
|
31
|
+
def get_page(url, opts = {})
|
32
|
+
data, _status_code, _headers = get_page_with_http_info(url, opts)
|
33
|
+
data
|
34
|
+
end
|
35
|
+
|
36
|
+
# Get page HTML by URL (renders JS in Chrome and uses rotating proxies)
|
37
|
+
# @param url [String] URL of the page to get
|
38
|
+
# @param [Hash] opts the optional parameters
|
39
|
+
# @option opts [String] :selector CSS selector to get a part of the page (null by default, returns whole page HTML)
|
40
|
+
# @option opts [Boolean] :outer_html Return outer HTML of the selected element (false by default, returns inner HTML)
|
41
|
+
# @option opts [String] :proxy Proxy country code, for geotargeting (US by default)
|
42
|
+
# @option opts [Boolean] :disable_js Disable JS execution (false by default)
|
43
|
+
# @option opts [Boolean] :inline_css Inline included CSS files to make page viewable on other domains (false by default)
|
44
|
+
# @return [Array<(ScrappedPage, Integer, Hash)>] ScrappedPage data, response status code and response headers
|
45
|
+
def get_page_with_http_info(url, opts = {})
|
46
|
+
if @api_client.config.debugging
|
47
|
+
@api_client.config.logger.debug 'Calling API: HtmlApi.get_page ...'
|
48
|
+
end
|
49
|
+
# verify the required parameter 'url' is set
|
50
|
+
if @api_client.config.client_side_validation && url.nil?
|
51
|
+
fail ArgumentError, "Missing the required parameter 'url' when calling HtmlApi.get_page"
|
52
|
+
end
|
53
|
+
# resource path
|
54
|
+
local_var_path = '/'
|
55
|
+
|
56
|
+
# query parameters
|
57
|
+
query_params = opts[:query_params] || {}
|
58
|
+
query_params[:'url'] = url
|
59
|
+
query_params[:'selector'] = opts[:'selector'] if !opts[:'selector'].nil?
|
60
|
+
query_params[:'outer_html'] = opts[:'outer_html'] if !opts[:'outer_html'].nil?
|
61
|
+
query_params[:'proxy'] = opts[:'proxy'] if !opts[:'proxy'].nil?
|
62
|
+
query_params[:'disable_js'] = opts[:'disable_js'] if !opts[:'disable_js'].nil?
|
63
|
+
query_params[:'inline_css'] = opts[:'inline_css'] if !opts[:'inline_css'].nil?
|
64
|
+
|
65
|
+
# header parameters
|
66
|
+
header_params = opts[:header_params] || {}
|
67
|
+
# HTTP header 'Accept' (if needed)
|
68
|
+
header_params['Accept'] = @api_client.select_header_accept(['application/json'])
|
69
|
+
|
70
|
+
# form parameters
|
71
|
+
form_params = opts[:form_params] || {}
|
72
|
+
|
73
|
+
# http body (model)
|
74
|
+
post_body = opts[:body]
|
75
|
+
|
76
|
+
# return_type
|
77
|
+
return_type = opts[:return_type] || 'ScrappedPage'
|
78
|
+
|
79
|
+
# auth_names
|
80
|
+
auth_names = opts[:auth_names] || ['api_key']
|
81
|
+
|
82
|
+
new_options = opts.merge(
|
83
|
+
:header_params => header_params,
|
84
|
+
:query_params => query_params,
|
85
|
+
:form_params => form_params,
|
86
|
+
:body => post_body,
|
87
|
+
:auth_names => auth_names,
|
88
|
+
:return_type => return_type
|
89
|
+
)
|
90
|
+
|
91
|
+
data, status_code, headers = @api_client.call_api(:GET, local_var_path, new_options)
|
92
|
+
if @api_client.config.debugging
|
93
|
+
@api_client.config.logger.debug "API called: HtmlApi#get_page\nData: #{data.inspect}\nStatus code: #{status_code}\nHeaders: #{headers}"
|
94
|
+
end
|
95
|
+
return data, status_code, headers
|
96
|
+
end
|
97
|
+
end
|
98
|
+
end
|