Dhalang 0.6.5 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 35f7dcd3cc7787c9e13429033b9c177d6f5c82f0b94364a2d4efbe3ffe601a16
4
- data.tar.gz: b204fe4df6c7b3e0da1e7d59407a711a9bd1d9f41c7828b05eccc478499e2b0b
3
+ metadata.gz: 7392a94eca6e888d2a81b779cde341fd7a9be0bbddd2d254e708e93aa18b7fe3
4
+ data.tar.gz: dc520a25fcaf30ad5584820bb45b9bcf72b35531024676e77d9ce0f54291f91f
5
5
  SHA512:
6
- metadata.gz: 25c582e213b4b26851e59859ab24115f136ea02b4b2889eb99502a9b3d3ddc7688f667ffe656d4f79afb81a65dda0b81a2e482a8d310890a1baf5c11fd17dbc0
7
- data.tar.gz: 3cd5d4ece1b4e8f87a59abeb781ae241e8b7a515665f6b5c57cc636ccb767a98ebc7a7be71f7746f379bf71d083aba33eb7003c524e9efa1d652d77a940b2656
6
+ metadata.gz: 3b75659edee50ba18a726be62a32d39336d5cb8ce6ac94625e529a75015ba7dee45fb396bc292a34604c602369ebba39842c127753d537fc882665a48aacf249
7
+ data.tar.gz: 965cbdae8bc88057c4d7a7a366940a5bb8deedf367893fab5d2c7c437bf96dc9530caf96a138ae7b19ef87ce190c9e8740d83f36cf2a411ec745c3c18b5d208b
@@ -0,0 +1,20 @@
1
+ name: Build
2
+ on:
3
+ - push
4
+ - pull_request
5
+ jobs:
6
+ test:
7
+ runs-on: ubuntu-latest
8
+ steps:
9
+ - uses: actions/checkout@v4
10
+ - uses: ruby/setup-ruby@v1
11
+ with:
12
+ ruby-version: 3.1.3
13
+ bundler-cache: true
14
+ - uses: actions/setup-node@v4
15
+ with:
16
+ node-version: 18
17
+ cache: 'npm'
18
+ cache-dependency-path: '**/package-lock.json'
19
+ - run: npm ci
20
+ - run: bundle exec rake spec
@@ -0,0 +1,23 @@
1
# Publishes the gem to RubyGems whenever a GitHub release is published.
name: Publish
on:
  release:
    types:
      - published
jobs:
  publish:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: ruby/setup-ruby@v1
        with:
          ruby-version: 3.1.3
          bundler-cache: true
      - uses: actions/setup-node@v4
        with:
          node-version: 18
      - run: npm ci
      # Run the specs before building so a broken release is never pushed.
      - run: bundle exec rake spec
      - run: gem build -o Dhalang.gem Dhalang.gemspec
      # The RubyGems API key is scoped to this step only, so that dependency
      # install scripts and the test run above never see the secret.
      - run: gem push Dhalang.gem
        env:
          GEM_HOST_API_KEY: ${{ secrets.RUBYGEMS_API_KEY }}
data/.gitignore CHANGED
@@ -50,3 +50,4 @@ build-iPhoneSimulator/
50
50
 
51
51
  # unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
52
52
  .rvmrc
53
+ .npmrc
data/Gemfile.lock CHANGED
@@ -1,15 +1,16 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- Dhalang (0.6.4)
4
+ Dhalang (0.7.0)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
8
8
  specs:
9
9
  Ascii85 (1.1.0)
10
10
  afm (0.2.2)
11
- diff-lcs (1.5.0)
12
- fastimage (2.2.6)
11
+ bigdecimal (3.1.7)
12
+ diff-lcs (1.5.1)
13
+ fastimage (2.2.7)
13
14
  hashery (2.1.2)
14
15
  pdf-reader (2.9.2)
15
16
  Ascii85 (~> 1.0)
@@ -18,24 +19,26 @@ GEM
18
19
  ruby-rc4
19
20
  ttfunk
20
21
  rake (13.0.6)
21
- rspec (3.11.0)
22
- rspec-core (~> 3.11.0)
23
- rspec-expectations (~> 3.11.0)
24
- rspec-mocks (~> 3.11.0)
25
- rspec-core (3.11.0)
26
- rspec-support (~> 3.11.0)
27
- rspec-expectations (3.11.0)
22
+ rspec (3.13.0)
23
+ rspec-core (~> 3.13.0)
24
+ rspec-expectations (~> 3.13.0)
25
+ rspec-mocks (~> 3.13.0)
26
+ rspec-core (3.13.0)
27
+ rspec-support (~> 3.13.0)
28
+ rspec-expectations (3.13.0)
28
29
  diff-lcs (>= 1.2.0, < 2.0)
29
- rspec-support (~> 3.11.0)
30
- rspec-mocks (3.11.1)
30
+ rspec-support (~> 3.13.0)
31
+ rspec-mocks (3.13.0)
31
32
  diff-lcs (>= 1.2.0, < 2.0)
32
- rspec-support (~> 3.11.0)
33
- rspec-support (3.11.0)
33
+ rspec-support (~> 3.13.0)
34
+ rspec-support (3.13.1)
34
35
  ruby-rc4 (0.1.5)
35
- ttfunk (1.7.0)
36
+ ttfunk (1.8.0)
37
+ bigdecimal (~> 3.1)
36
38
 
37
39
  PLATFORMS
38
- x86_64-darwin-20
40
+ arm64-darwin-21
41
+ x86_64-linux
39
42
 
40
43
  DEPENDENCIES
41
44
  Dhalang!
@@ -46,4 +49,4 @@ DEPENDENCIES
46
49
  rspec (~> 3.0)
47
50
 
48
51
  BUNDLED WITH
49
- 2.3.15
52
+ 2.3.7
data/README.md CHANGED
@@ -1,13 +1,14 @@
1
- # Dhalang [![Build Status](https://travis-ci.com/NielsSteensma/Dhalang.svg?token=XZgKAByw2KZjcrsCh8gW&branch=master)](https://travis-ci.com/NielsSteensma/Dhalang)
1
+ # Dhalang [![Build](https://github.com/NielsSteensma/Dhalang/actions/workflows/build.yml/badge.svg)](https://github.com/NielsSteensma/Dhalang/actions/workflows/build.yml)
2
2
 
3
3
  > Dhalang is a Ruby wrapper for Google's Puppeteer.
4
4
 
5
5
 
6
6
 
7
7
  ## Features
8
- * Generate PDFs from pages
9
- * Generate PDFs from html ( external images/stylesheets supported )
10
- * Capture a screenshot of a webpage
8
+ * Generate PDFs from webpages
9
+ * Generate PDFs from HTML ( external images/stylesheets supported )
10
+ * Capture screenshots from webpages
11
+ * Scrape HTML from webpages
11
12
 
12
13
 
13
14
 
@@ -24,39 +25,44 @@ Install puppeteer in your application's root directory:
24
25
 
25
26
  $ npm install puppeteer
26
27
 
27
- <sub>NodeJS v10.18.1 or greater is required</sub>
28
+ <sub>Dhalang requires Node ≥ 18 and Puppeteer 22</sub>
28
29
  ## Usage
29
- __Get a PDF of a website url__
30
+ __PDF of a website url__
30
31
  ```ruby
31
32
  Dhalang::PDF.get_from_url("https://www.google.com")
32
33
  ```
33
34
  It is important to pass the complete url, leaving out https://, http:// or www. will result in an error.
34
35
 
35
- __Get a PDF of a HTML string__
36
+ __PDF of an HTML string__
36
37
  ```ruby
37
38
  Dhalang::PDF.get_from_html("<html><head></head><body><h1>examplestring</h1></body></html>")
38
39
  ```
39
40
 
40
- __Get a PNG screenshot of a website__
41
+ __PNG screenshot of a website__
41
42
  ```ruby
42
43
  Dhalang::Screenshot.get_from_url("https://www.google.com", :png)
43
44
  ```
44
45
 
45
- __Get a JPEG screenshot of a website__
46
+ __JPEG screenshot of a website__
46
47
  ```ruby
47
48
  Dhalang::Screenshot.get_from_url("https://www.google.com", :jpeg)
48
49
  ```
49
50
 
50
- __Get a WEBP screenshot of a website__
51
+ __WEBP screenshot of a website__
51
52
  ```ruby
52
53
  Dhalang::Screenshot.get_from_url("https://www.google.com", :webp)
53
54
  ```
54
55
 
55
- All methods return a string containing the PDF or JPEG/PNG/WEBP in binary.
56
+ __HTML of a website__
57
+ ```ruby
58
+ Dhalang::Scraper.html("https://www.google.com")
59
+ ```
60
+
61
+ The methods above return either a string containing the PDF/JPEG/PNG/WEBP in binary, or the scraped HTML.
56
62
 
57
63
 
58
64
 
59
- ## Custom PDF/screenshot options
65
+ ## Custom options
60
66
  To override the default options that are set by Dhalang you can pass as last argument a hash with the custom options you want to set.
61
67
 
62
68
  For example to set custom margins for PDFs:
@@ -0,0 +1,87 @@
1
module Dhalang
  # Groups Puppeteer and Dhalang configuration.
  #
  # An instance combines the built-in defaults with caller-supplied overrides
  # and serializes the result to the JSON string that is handed to the Node
  # scripts.
  class Configuration
    # Parenthesized via interpolation so the whole path is frozen, not just
    # the '/node_modules/' literal.
    NODE_MODULES_PATH = "#{Dir.pwd}/node_modules/".freeze
    USER_OPTIONS = {
      navigationTimeout: 10000,
      printToPDFTimeout: 0, # unlimited
      navigationWaitUntil: 'load',
      navigationWaitForSelector: '',
      navigationWaitForXPath: '',
      userAgent: '',
      isHeadless: true,
      viewPort: '',
      httpAuthenticationCredentials: '',
      isAutoHeight: false,
      chromeOptions: []
    }.freeze
    DEFAULT_PDF_OPTIONS = {
      scale: 1,
      displayHeaderFooter: false,
      headerTemplate: '',
      footerTemplate: '',
      headerTemplateFile: '',
      footerTemplateFile: '',
      printBackground: true,
      landscape: false,
      pageRanges: '',
      format: 'A4',
      width: '',
      height: '',
      margin: { top: 36, right: 36, bottom: 20, left: 36 },
      preferCSSPageSize: true,
      omitBackground: false
    }.freeze
    DEFAULT_SCREENSHOT_OPTIONS = {
      fullPage: true,
      clip: nil,
      omitBackground: false
    }.freeze
    DEFAULT_JPEG_OPTIONS = {
      quality: 100
    }.freeze

    private_constant :NODE_MODULES_PATH
    private_constant :USER_OPTIONS
    private_constant :DEFAULT_PDF_OPTIONS
    private_constant :DEFAULT_SCREENSHOT_OPTIONS
    private_constant :DEFAULT_JPEG_OPTIONS

    private attr_accessor :page_url
    private attr_accessor :temp_file_path
    private attr_accessor :temp_file_extension
    private attr_accessor :user_options
    private attr_accessor :pdf_options
    private attr_accessor :screenshot_options
    private attr_accessor :jpeg_options

    # @param [Hash]   custom_options      Changes that override default. Keys are matched against the
    #                                     default option keys (symbols); nil is treated as no overrides.
    # @param [String] page_url            Url for Puppeteer to visit.
    # @param [String] temp_file_path      Absolute path of temp file to use for writing script results.
    #                                     Can be nil for scripts using stdout.
    # @param [String] temp_file_extension Extension of temp file. Can be nil for scripts using stdout.
    def initialize(custom_options, page_url, temp_file_path = nil, temp_file_extension = nil)
      overrides = custom_options || {}

      self.page_url = page_url
      self.temp_file_path = temp_file_path
      self.temp_file_extension = temp_file_extension
      self.user_options = apply_overrides(USER_OPTIONS, overrides)
      self.pdf_options = apply_overrides(DEFAULT_PDF_OPTIONS, overrides)
      self.screenshot_options = apply_overrides(DEFAULT_SCREENSHOT_OPTIONS, overrides)
      self.jpeg_options = apply_overrides(DEFAULT_JPEG_OPTIONS, overrides)
    end

    # Returns configuration as JSON string.
    #
    # @return [String] JSON object with the page url, temp file details, node_modules path
    #                  and the four option groups.
    def json
      {
        webPageUrl: page_url,
        tempFilePath: temp_file_path,
        puppeteerPath: NODE_MODULES_PATH,
        imageType: temp_file_extension,
        userOptions: user_options,
        pdfOptions: pdf_options,
        screenshotOptions: screenshot_options,
        jpegOptions: jpeg_options
      }.to_json
    end

    private

    # Builds a hash holding every key of +defaults+, taking the value from +overrides+
    # when it has that key and the default value otherwise.
    #
    # @param [Hash] defaults  Option names mapped to their default values.
    # @param [Hash] overrides Caller-supplied options.
    #
    # @return [Hash] Effective options for the given group.
    def apply_overrides(defaults, overrides)
      defaults.to_h { |key, default_value| [key, overrides.fetch(key, default_value)] }
    end
  end
end
@@ -0,0 +1,29 @@
1
module Dhalang
  # Runs Dhalang's JS scripts in a separate Node process.
  class NodeScriptInvoker

    # Executes JS script under given script_path by launching a new Node process.
    #
    # @param [String] script_path Absolute path of JS script to execute.
    # @param [Configuration] configuration Configuration to use.
    #
    # @return [nil] When the Node process exits successfully.
    # @raise [DhalangError] When the Node process exits unsuccessfully. The message is the
    #                       combined stdout/stderr output, or the exit status when there is none.
    def self.execute_script(script_path, configuration)
      command = create_node_command(script_path, configuration)
      Open3.popen2e(command) do |stdin, stdouterr, wait|
        # Nothing is ever written to the script's stdin.
        stdin.close
        # Drain the pipe BEFORE waiting for the exit status: a child that fills the
        # pipe buffer blocks on write, and waiting first would then never return.
        output = stdouterr.read.strip
        status = wait.value
        return nil if status.success?

        message = output.empty? ? "Exited with status #{status.exitstatus}" : output
        raise DhalangError, message
      end
    end


    # Returns a [String] with node command that invokes the provided script with the configuration.
    # Both the script path and the JSON argument are shell-escaped, so paths containing
    # spaces or other shell metacharacters are passed through intact.
    #
    # @param [String] script_path Absolute path of JS script to invoke.
    # @param [Configuration] configuration Configuration to use.
    private_class_method def self.create_node_command(script_path, configuration)
      "node #{Shellwords.escape(script_path)} #{Shellwords.escape(configuration.json)}"
    end
  end
end
@@ -1,3 +1,3 @@
1
1
  module Dhalang
2
- VERSION = "0.6.5"
2
+ VERSION = "0.7.0"
3
3
  end
data/lib/Dhalang.rb CHANGED
@@ -1,11 +1,13 @@
1
1
  module Dhalang
2
2
  require_relative 'PDF'
3
3
  require_relative 'Screenshot'
4
+ require_relative 'Scraper'
4
5
  require_relative 'Dhalang/version'
5
6
  require_relative 'Dhalang/url_utils'
6
7
  require_relative 'Dhalang/file_utils'
7
8
  require_relative 'Dhalang/error'
8
- require_relative 'Dhalang/puppeteer'
9
+ require_relative 'Dhalang/configuration'
10
+ require_relative 'Dhalang/node_script_invoker'
9
11
  require 'uri'
10
12
  require 'tempfile'
11
13
  require 'shellwords'
data/lib/PDF.rb CHANGED
@@ -1,8 +1,8 @@
1
1
  module Dhalang
2
2
  # Allows consumers of this library to create PDFs with Puppeteer.
3
3
  class PDF
4
- PUPPETEER_SCRIPT_PATH = File.expand_path('../js/pdf-generator.js', __FILE__).freeze
5
- private_constant :PUPPETEER_SCRIPT_PATH
4
+ SCRIPT_PATH = File.expand_path('../js/pdf-generator.js', __FILE__).freeze
5
+ private_constant :SCRIPT_PATH
6
6
 
7
7
  # Captures the full webpage under the given url as PDF.
8
8
  #
@@ -43,7 +43,8 @@ module Dhalang
43
43
  private_class_method def self.get(url, options)
44
44
  temp_file = FileUtils.create_temp_file("pdf")
45
45
  begin
46
- Puppeteer.visit(url, PUPPETEER_SCRIPT_PATH, temp_file.path, "pdf", options)
46
+ configuration = Configuration.new(options, url, temp_file.path, "pdf")
47
+ NodeScriptInvoker.execute_script(SCRIPT_PATH, configuration)
47
48
  binary_pdf_content = FileUtils.read_binary(temp_file.path)
48
49
  ensure
49
50
  FileUtils.delete(temp_file)
data/lib/Scraper.rb ADDED
@@ -0,0 +1,26 @@
1
module Dhalang
  # Lets consumers of this library scrape webpages with Puppeteer.
  class Scraper
    SCRIPT_PATH = File.expand_path('../js/html-scraper.js', __FILE__).freeze
    private_constant :SCRIPT_PATH

    # Visits the given url and returns the HTML that the scraper script wrote
    # to a temp file. The temp file is deleted afterwards, also on failure.
    #
    # @param [String] url     Url to scrape.
    # @param [Hash]   options User configurable options.
    #
    # @return [String] Scraped HTML content.
    def self.html(url, options = {})
      UrlUtils.validate(url)
      output_file = FileUtils.create_temp_file("html")
      begin
        NodeScriptInvoker.execute_script(
          SCRIPT_PATH,
          Configuration.new(options, url, output_file.path, "html")
        )
        # Value of the begin block, and therefore of the method.
        IO.read(output_file.path)
      ensure
        FileUtils.delete(output_file)
      end
    end
  end
end
data/lib/Screenshot.rb CHANGED
@@ -1,9 +1,9 @@
1
1
  module Dhalang
2
2
  # Allows consumers of this library to take screenshots with Puppeteer.
3
3
  class Screenshot
4
- PUPPETEER_SCRIPT_PATH = File.expand_path('../js/screenshot-generator.js', __FILE__).freeze
4
+ SCRIPT_PATH = File.expand_path('../js/screenshot-generator.js', __FILE__).freeze
5
5
  IMAGE_TYPES = [:jpeg, :png, :webp].freeze
6
- private_constant :PUPPETEER_SCRIPT_PATH
6
+ private_constant :SCRIPT_PATH
7
7
  private_constant :IMAGE_TYPES
8
8
 
9
9
  # <b>DEPRECATED:</b> Please use `get_from_url(url, :jpeg)` instead.
@@ -44,7 +44,8 @@ module Dhalang
44
44
 
45
45
  temp_file = FileUtils.create_temp_file(image_type)
46
46
  begin
47
- Puppeteer.visit(url, PUPPETEER_SCRIPT_PATH, temp_file.path, image_type, options)
47
+ configuration = Configuration.new(options, url, temp_file.path, image_type)
48
+ NodeScriptInvoker.execute_script(SCRIPT_PATH, configuration)
48
49
  binary_image_content = FileUtils.read_binary(temp_file.path)
49
50
  ensure
50
51
  FileUtils.delete(temp_file)
data/lib/js/dhalang.js CHANGED
@@ -96,7 +96,7 @@ exports.navigate = async function (page, configuration) {
96
96
  } else if (navigationWaitForXPath !== "") {
97
97
  await page.waitForXPath(navigationWaitForXPath, this.getWaitingParameters(configuration));
98
98
  } else {
99
- await page.waitForTimeout(250);
99
+ await new Promise(r => setTimeout(r, 250));
100
100
  }
101
101
  }
102
102
 
@@ -0,0 +1,26 @@
1
'use strict';
const dhalang = require('./dhalang');
const fs = require('node:fs');

/**
 * Visits the configured page with Puppeteer and writes its HTML to the
 * configured temp file.
 *
 * Exits the process with status 0 on success. On failure the error message is
 * written to stderr and the process exits with status 1.
 */
const scrapeHtml = async () => {
    const configuration = dhalang.getConfiguration();

    let browser;
    let exitCode = 0;
    try {
        browser = await dhalang.launchPuppeteer(configuration);
        const page = await browser.newPage();
        await dhalang.configure(page, configuration.userOptions);
        await dhalang.navigate(page, configuration);
        const html = await page.content();
        fs.writeFileSync(configuration.tempFilePath, html);
    } catch (error) {
        console.error(error.message);
        // Only record the failure here; exiting is deferred to `finally` so the
        // browser is closed on the error path as well.
        exitCode = 1;
    } finally {
        if (browser) {
            try {
                // Awaited so the close actually completes before the process exits.
                await browser.close();
            } catch (closeError) {
                // Best effort: the outcome of the scrape is already decided and
                // must not be changed by a failing shutdown.
            }
        }
        process.exit(exitCode);
    }
};
scrapeHtml();