Dhalang 0.6.5 → 0.7.0
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/build.yml +20 -0
- data/.github/workflows/publish.yml +23 -0
- data/.gitignore +1 -0
- data/Gemfile.lock +20 -17
- data/README.md +18 -12
- data/lib/Dhalang/configuration.rb +87 -0
- data/lib/Dhalang/node_script_invoker.rb +29 -0
- data/lib/Dhalang/version.rb +1 -1
- data/lib/Dhalang.rb +3 -1
- data/lib/PDF.rb +4 -3
- data/lib/Scraper.rb +26 -0
- data/lib/Screenshot.rb +4 -3
- data/lib/js/dhalang.js +1 -1
- data/lib/js/html-scraper.js +26 -0
- data/package-lock.json +886 -701
- data/package.json +2 -2
- metadata +13 -9
- data/.travis.yml +0 -21
- data/lib/Dhalang/puppeteer.rb +0 -97
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 7392a94eca6e888d2a81b779cde341fd7a9be0bbddd2d254e708e93aa18b7fe3
|
4
|
+
data.tar.gz: dc520a25fcaf30ad5584820bb45b9bcf72b35531024676e77d9ce0f54291f91f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3b75659edee50ba18a726be62a32d39336d5cb8ce6ac94625e529a75015ba7dee45fb396bc292a34604c602369ebba39842c127753d537fc882665a48aacf249
|
7
|
+
data.tar.gz: 965cbdae8bc88057c4d7a7a366940a5bb8deedf367893fab5d2c7c437bf96dc9530caf96a138ae7b19ef87ce190c9e8740d83f36cf2a411ec745c3c18b5d208b
|
@@ -0,0 +1,20 @@
|
|
1
|
+
name: Build
|
2
|
+
on:
|
3
|
+
- push
|
4
|
+
- pull_request
|
5
|
+
jobs:
|
6
|
+
test:
|
7
|
+
runs-on: ubuntu-latest
|
8
|
+
steps:
|
9
|
+
- uses: actions/checkout@v4
|
10
|
+
- uses: ruby/setup-ruby@v1
|
11
|
+
with:
|
12
|
+
ruby-version: 3.1.3
|
13
|
+
bundler-cache: true
|
14
|
+
- uses: actions/setup-node@v4
|
15
|
+
with:
|
16
|
+
node-version: 18
|
17
|
+
cache: 'npm'
|
18
|
+
cache-dependency-path: '**/package-lock.json'
|
19
|
+
- run: npm ci
|
20
|
+
- run: bundle exec rake spec
|
@@ -0,0 +1,23 @@
|
|
1
|
+
name: Publish
|
2
|
+
on:
|
3
|
+
release:
|
4
|
+
types:
|
5
|
+
- published
|
6
|
+
jobs:
|
7
|
+
publish:
|
8
|
+
runs-on: ubuntu-latest
|
9
|
+
env:
|
10
|
+
GEM_HOST_API_KEY: ${{ secrets.RUBYGEMS_API_KEY }}
|
11
|
+
steps:
|
12
|
+
- uses: actions/checkout@v4
|
13
|
+
- uses: ruby/setup-ruby@v1
|
14
|
+
with:
|
15
|
+
ruby-version: 3.1.3
|
16
|
+
bundler-cache: true
|
17
|
+
- uses: actions/setup-node@v4
|
18
|
+
with:
|
19
|
+
node-version: 18
|
20
|
+
- run: npm ci
|
21
|
+
- run: bundle exec rake spec
|
22
|
+
- run: gem build -o Dhalang.gem Dhalang.gemspec
|
23
|
+
- run: gem push Dhalang.gem
|
data/.gitignore
CHANGED
data/Gemfile.lock
CHANGED
@@ -1,15 +1,16 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
Dhalang (0.
|
4
|
+
Dhalang (0.7.0)
|
5
5
|
|
6
6
|
GEM
|
7
7
|
remote: https://rubygems.org/
|
8
8
|
specs:
|
9
9
|
Ascii85 (1.1.0)
|
10
10
|
afm (0.2.2)
|
11
|
-
|
12
|
-
|
11
|
+
bigdecimal (3.1.7)
|
12
|
+
diff-lcs (1.5.1)
|
13
|
+
fastimage (2.2.7)
|
13
14
|
hashery (2.1.2)
|
14
15
|
pdf-reader (2.9.2)
|
15
16
|
Ascii85 (~> 1.0)
|
@@ -18,24 +19,26 @@ GEM
|
|
18
19
|
ruby-rc4
|
19
20
|
ttfunk
|
20
21
|
rake (13.0.6)
|
21
|
-
rspec (3.
|
22
|
-
rspec-core (~> 3.
|
23
|
-
rspec-expectations (~> 3.
|
24
|
-
rspec-mocks (~> 3.
|
25
|
-
rspec-core (3.
|
26
|
-
rspec-support (~> 3.
|
27
|
-
rspec-expectations (3.
|
22
|
+
rspec (3.13.0)
|
23
|
+
rspec-core (~> 3.13.0)
|
24
|
+
rspec-expectations (~> 3.13.0)
|
25
|
+
rspec-mocks (~> 3.13.0)
|
26
|
+
rspec-core (3.13.0)
|
27
|
+
rspec-support (~> 3.13.0)
|
28
|
+
rspec-expectations (3.13.0)
|
28
29
|
diff-lcs (>= 1.2.0, < 2.0)
|
29
|
-
rspec-support (~> 3.
|
30
|
-
rspec-mocks (3.
|
30
|
+
rspec-support (~> 3.13.0)
|
31
|
+
rspec-mocks (3.13.0)
|
31
32
|
diff-lcs (>= 1.2.0, < 2.0)
|
32
|
-
rspec-support (~> 3.
|
33
|
-
rspec-support (3.
|
33
|
+
rspec-support (~> 3.13.0)
|
34
|
+
rspec-support (3.13.1)
|
34
35
|
ruby-rc4 (0.1.5)
|
35
|
-
ttfunk (1.
|
36
|
+
ttfunk (1.8.0)
|
37
|
+
bigdecimal (~> 3.1)
|
36
38
|
|
37
39
|
PLATFORMS
|
38
|
-
|
40
|
+
arm64-darwin-21
|
41
|
+
x86_64-linux
|
39
42
|
|
40
43
|
DEPENDENCIES
|
41
44
|
Dhalang!
|
@@ -46,4 +49,4 @@ DEPENDENCIES
|
|
46
49
|
rspec (~> 3.0)
|
47
50
|
|
48
51
|
BUNDLED WITH
|
49
|
-
2.3.
|
52
|
+
2.3.7
|
data/README.md
CHANGED
@@ -1,13 +1,14 @@
|
|
1
|
-
# Dhalang [](https://github.com/NielsSteensma/Dhalang/actions/workflows/build.yml)
|
2
2
|
|
3
3
|
> Dhalang is a Ruby wrapper for Google's Puppeteer.
|
4
4
|
|
5
5
|
|
6
6
|
|
7
7
|
## Features
|
8
|
-
* Generate PDFs from
|
9
|
-
* Generate PDFs from
|
10
|
-
* Capture
|
8
|
+
* Generate PDFs from webpages
|
9
|
+
* Generate PDFs from HTML ( external images/stylesheets supported )
|
10
|
+
* Capture screenshots from webpages
|
11
|
+
* Scrape HTML from webpages
|
11
12
|
|
12
13
|
|
13
14
|
|
@@ -24,39 +25,44 @@ Install puppeteer in your application's root directory:
|
|
24
25
|
|
25
26
|
$ npm install puppeteer
|
26
27
|
|
27
|
-
<sub>
|
28
|
+
<sub>Dhalang and Puppeteer require Node ≥ 18 and Puppeteer ≥ 22</sub>
|
28
29
|
## Usage
|
29
|
-
|
30
|
+
__PDF of a website url__
|
30
31
|
```ruby
|
31
32
|
Dhalang::PDF.get_from_url("https://www.google.com")
|
32
33
|
```
|
33
34
|
It is important to pass the complete url, leaving out https://, http:// or www. will result in an error.
|
34
35
|
|
35
|
-
|
36
|
+
__PDF of a HTML string__
|
36
37
|
```ruby
|
37
38
|
Dhalang::PDF.get_from_html("<html><head></head><body><h1>examplestring</h1></body></html>")
|
38
39
|
```
|
39
40
|
|
40
|
-
|
41
|
+
__PNG screenshot of a website__
|
41
42
|
```ruby
|
42
43
|
Dhalang::Screenshot.get_from_url("https://www.google.com", :png)
|
43
44
|
```
|
44
45
|
|
45
|
-
|
46
|
+
__JPEG screenshot of a website__
|
46
47
|
```ruby
|
47
48
|
Dhalang::Screenshot.get_from_url("https://www.google.com", :jpeg)
|
48
49
|
```
|
49
50
|
|
50
|
-
|
51
|
+
__WEBP screenshot of a website__
|
51
52
|
```ruby
|
52
53
|
Dhalang::Screenshot.get_from_url("https://www.google.com", :webp)
|
53
54
|
```
|
54
55
|
|
55
|
-
|
56
|
+
__HTML of a website__
|
57
|
+
```ruby
|
58
|
+
Dhalang::Scraper.html("https://www.google.com")
|
59
|
+
```
|
60
|
+
|
61
|
+
Above methods either return a string containing the PDF/JPEG/PNG/WEBP in binary or the scraped HTML.
|
56
62
|
|
57
63
|
|
58
64
|
|
59
|
-
## Custom
|
65
|
+
## Custom options
|
60
66
|
To override the default options that are set by Dhalang you can pass as last argument a hash with the custom options you want to set.
|
61
67
|
|
62
68
|
For example to set custom margins for PDFs:
|
@@ -0,0 +1,87 @@
|
|
1
|
+
module Dhalang
|
2
|
+
# Groups Puppeteer and Dhalang configuration.
|
3
|
+
class Configuration
|
4
|
+
NODE_MODULES_PATH = Dir.pwd + '/node_modules/'.freeze
|
5
|
+
USER_OPTIONS = {
|
6
|
+
navigationTimeout: 10000,
|
7
|
+
printToPDFTimeout: 0, # unlimited
|
8
|
+
navigationWaitUntil: 'load',
|
9
|
+
navigationWaitForSelector: '',
|
10
|
+
navigationWaitForXPath: '',
|
11
|
+
userAgent: '',
|
12
|
+
isHeadless: true,
|
13
|
+
viewPort: '',
|
14
|
+
httpAuthenticationCredentials: '',
|
15
|
+
isAutoHeight: false,
|
16
|
+
chromeOptions: []
|
17
|
+
}.freeze
|
18
|
+
DEFAULT_PDF_OPTIONS = {
|
19
|
+
scale: 1,
|
20
|
+
displayHeaderFooter: false,
|
21
|
+
headerTemplate: '',
|
22
|
+
footerTemplate: '',
|
23
|
+
headerTemplateFile: '',
|
24
|
+
footerTemplateFile: '',
|
25
|
+
printBackground: true,
|
26
|
+
landscape: false,
|
27
|
+
pageRanges: '',
|
28
|
+
format: 'A4',
|
29
|
+
width: '',
|
30
|
+
height: '',
|
31
|
+
margin: { top: 36, right: 36, bottom: 20, left: 36 },
|
32
|
+
preferCSSPageSize: true,
|
33
|
+
omitBackground: false
|
34
|
+
}.freeze
|
35
|
+
DEFAULT_SCREENSHOT_OPTIONS = {
|
36
|
+
fullPage: true,
|
37
|
+
clip: nil,
|
38
|
+
omitBackground: false
|
39
|
+
}.freeze
|
40
|
+
DEFAULT_JPEG_OPTIONS = {
|
41
|
+
quality: 100
|
42
|
+
}.freeze
|
43
|
+
|
44
|
+
private_constant :NODE_MODULES_PATH
|
45
|
+
private_constant :USER_OPTIONS
|
46
|
+
private_constant :DEFAULT_PDF_OPTIONS
|
47
|
+
private_constant :DEFAULT_SCREENSHOT_OPTIONS
|
48
|
+
private_constant :DEFAULT_JPEG_OPTIONS
|
49
|
+
|
50
|
+
private attr_accessor :page_url
|
51
|
+
private attr_accessor :temp_file_path
|
52
|
+
private attr_accessor :temp_file_extension
|
53
|
+
private attr_accessor :user_options
|
54
|
+
private attr_accessor :pdf_options
|
55
|
+
private attr_accessor :screenshot_options
|
56
|
+
private attr_accessor :jpeg_options
|
57
|
+
|
58
|
+
# @param [Hash] custom_options Changes that override default.
|
59
|
+
# @param [String] page_url Url for Puppeteer to visit.
|
60
|
+
# @param [String] temp_file_path Absolute path of temp file to use for writing script results.
|
61
|
+
# Can be nil for scripts using stdout.
|
62
|
+
# @param [String] temp_file_extension Extension of temp file. Can be nil for scripts using stdout.
|
63
|
+
def initialize(custom_options, page_url, temp_file_path = nil, temp_file_extension = nil)
|
64
|
+
self.page_url = page_url
|
65
|
+
self.temp_file_path = temp_file_path
|
66
|
+
self.temp_file_extension = temp_file_extension
|
67
|
+
self.user_options = USER_OPTIONS.map { |key, default_value| [key, custom_options.fetch(key, default_value)] }
|
68
|
+
self.pdf_options = DEFAULT_PDF_OPTIONS.map { |key, default_value| [key, custom_options.fetch(key, default_value)] }
|
69
|
+
self.screenshot_options = DEFAULT_SCREENSHOT_OPTIONS.map { |key, default_value| [key, custom_options.fetch(key, default_value)] }
|
70
|
+
self.jpeg_options = DEFAULT_JPEG_OPTIONS.map { |key, default_value| [key, custom_options.fetch(key, default_value)] }
|
71
|
+
end
|
72
|
+
|
73
|
+
# Returns configuration as JSON string.
|
74
|
+
def json
|
75
|
+
return {
|
76
|
+
webPageUrl: page_url,
|
77
|
+
tempFilePath: temp_file_path,
|
78
|
+
puppeteerPath: NODE_MODULES_PATH,
|
79
|
+
imageType: temp_file_extension,
|
80
|
+
userOptions: user_options.to_h,
|
81
|
+
pdfOptions: pdf_options.to_h,
|
82
|
+
screenshotOptions: screenshot_options.to_h,
|
83
|
+
jpegOptions: jpeg_options.to_h
|
84
|
+
}.to_json
|
85
|
+
end
|
86
|
+
end
|
87
|
+
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Dhalang
|
2
|
+
class NodeScriptInvoker
|
3
|
+
|
4
|
+
# Executes JS script under given script_path by launching a new Node process.
|
5
|
+
#
|
6
|
+
# @param [String] script_path Absolute path of JS script to execute.
|
7
|
+
# @param [Configuration] configuration Configuration to use.
|
8
|
+
def self.execute_script(script_path, configuration)
|
9
|
+
command = create_node_command(script_path, configuration)
|
10
|
+
Open3.popen2e(command) do |_stdin, stdouterr, wait|
|
11
|
+
return nil if wait.value.success?
|
12
|
+
|
13
|
+
output = stdouterr.read.strip
|
14
|
+
output = nil if output == ''
|
15
|
+
message = output || "Exited with status #{wait.value.exitstatus}"
|
16
|
+
raise DhalangError, message
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
|
21
|
+
# Returns a [String] with node command that invokes the provided script with the configuration.
|
22
|
+
#
|
23
|
+
# @param [String] script_path Absolute path of JS script to invoke.
|
24
|
+
# @param [Configuration] configuration Configuration to use.
|
25
|
+
private_class_method def self.create_node_command(script_path, configuration)
|
26
|
+
"node #{script_path} #{Shellwords.escape(configuration.json)}"
|
27
|
+
end
|
28
|
+
end
|
29
|
+
end
|
data/lib/Dhalang/version.rb
CHANGED
data/lib/Dhalang.rb
CHANGED
@@ -1,11 +1,13 @@
|
|
1
1
|
module Dhalang
|
2
2
|
require_relative 'PDF'
|
3
3
|
require_relative 'Screenshot'
|
4
|
+
require_relative 'Scraper'
|
4
5
|
require_relative 'Dhalang/version'
|
5
6
|
require_relative 'Dhalang/url_utils'
|
6
7
|
require_relative 'Dhalang/file_utils'
|
7
8
|
require_relative 'Dhalang/error'
|
8
|
-
require_relative 'Dhalang/
|
9
|
+
require_relative 'Dhalang/configuration'
|
10
|
+
require_relative 'Dhalang/node_script_invoker'
|
9
11
|
require 'uri'
|
10
12
|
require 'tempfile'
|
11
13
|
require 'shellwords'
|
data/lib/PDF.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
module Dhalang
|
2
2
|
# Allows consumers of this library to create PDFs with Puppeteer.
|
3
3
|
class PDF
|
4
|
-
|
5
|
-
private_constant :
|
4
|
+
SCRIPT_PATH = File.expand_path('../js/pdf-generator.js', __FILE__).freeze
|
5
|
+
private_constant :SCRIPT_PATH
|
6
6
|
|
7
7
|
# Captures the full webpage under the given url as PDF.
|
8
8
|
#
|
@@ -43,7 +43,8 @@ module Dhalang
|
|
43
43
|
private_class_method def self.get(url, options)
|
44
44
|
temp_file = FileUtils.create_temp_file("pdf")
|
45
45
|
begin
|
46
|
-
|
46
|
+
configuration = Configuration.new(options, url, temp_file.path, "pdf")
|
47
|
+
NodeScriptInvoker.execute_script(SCRIPT_PATH, configuration)
|
47
48
|
binary_pdf_content = FileUtils.read_binary(temp_file.path)
|
48
49
|
ensure
|
49
50
|
FileUtils.delete(temp_file)
|
data/lib/Scraper.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
module Dhalang
|
2
|
+
# Provides functionality for scraping webpages.
|
3
|
+
class Scraper
|
4
|
+
SCRIPT_PATH = File.expand_path('../js/html-scraper.js', __FILE__).freeze
|
5
|
+
private_constant :SCRIPT_PATH
|
6
|
+
|
7
|
+
# Scrapes full HTML content under given url.
|
8
|
+
#
|
9
|
+
# @param [String] url Url to scrape.
|
10
|
+
# @param [Hash] options User configurable options.
|
11
|
+
#
|
12
|
+
# @return [String] Scraped HTML content.
|
13
|
+
def self.html(url, options = {})
|
14
|
+
UrlUtils.validate(url)
|
15
|
+
temp_file = FileUtils.create_temp_file("html")
|
16
|
+
begin
|
17
|
+
configuration = Configuration.new(options, url, temp_file.path, "html")
|
18
|
+
NodeScriptInvoker.execute_script(SCRIPT_PATH, configuration)
|
19
|
+
html = IO.read(temp_file.path)
|
20
|
+
ensure
|
21
|
+
FileUtils.delete(temp_file)
|
22
|
+
end
|
23
|
+
return html
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
data/lib/Screenshot.rb
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
module Dhalang
|
2
2
|
# Allows consumers of this library to take screenshots with Puppeteer.
|
3
3
|
class Screenshot
|
4
|
-
|
4
|
+
SCRIPT_PATH = File.expand_path('../js/screenshot-generator.js', __FILE__).freeze
|
5
5
|
IMAGE_TYPES = [:jpeg, :png, :webp].freeze
|
6
|
-
private_constant :
|
6
|
+
private_constant :SCRIPT_PATH
|
7
7
|
private_constant :IMAGE_TYPES
|
8
8
|
|
9
9
|
# <b>DEPRECATED:</b> Please use `get_from_url(url, :jpeg)` instead.
|
@@ -44,7 +44,8 @@ module Dhalang
|
|
44
44
|
|
45
45
|
temp_file = FileUtils.create_temp_file(image_type)
|
46
46
|
begin
|
47
|
-
|
47
|
+
configuration = Configuration.new(options, url, temp_file.path, image_type)
|
48
|
+
NodeScriptInvoker.execute_script(SCRIPT_PATH, configuration)
|
48
49
|
binary_image_content = FileUtils.read_binary(temp_file.path)
|
49
50
|
ensure
|
50
51
|
FileUtils.delete(temp_file)
|
data/lib/js/dhalang.js
CHANGED
@@ -96,7 +96,7 @@ exports.navigate = async function (page, configuration) {
|
|
96
96
|
} else if (navigationWaitForXPath !== "") {
|
97
97
|
await page.waitForXPath(navigationWaitForXPath, this.getWaitingParameters(configuration));
|
98
98
|
} else {
|
99
|
-
await
|
99
|
+
await new Promise(r => setTimeout(r, 250));
|
100
100
|
}
|
101
101
|
}
|
102
102
|
|
@@ -0,0 +1,26 @@
|
|
1
|
+
'use strict';
|
2
|
+
const dhalang = require('./dhalang');
|
3
|
+
const fs = require('node:fs');
|
4
|
+
|
5
|
+
const scrapeHtml = async () => {
|
6
|
+
const configuration = dhalang.getConfiguration();
|
7
|
+
|
8
|
+
let browser;
|
9
|
+
try {
|
10
|
+
browser = await dhalang.launchPuppeteer(configuration);
|
11
|
+
const page = await browser.newPage();
|
12
|
+
await dhalang.configure(page, configuration.userOptions);
|
13
|
+
await dhalang.navigate(page, configuration);
|
14
|
+
const html = await page.content();
|
15
|
+
fs.writeFileSync(configuration.tempFilePath, html);
|
16
|
+
} catch (error) {
|
17
|
+
console.error(error.message);
|
18
|
+
process.exit(1);
|
19
|
+
} finally {
|
20
|
+
if (browser) {
|
21
|
+
browser.close();
|
22
|
+
}
|
23
|
+
process.exit(0);
|
24
|
+
}
|
25
|
+
};
|
26
|
+
scrapeHtml();
|