bidi2pdf 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 766b41f0ee642cd7316d0f72d8dd707b0f45aae4a315a46c2b27fb6bb2d176a6
4
- data.tar.gz: aeec0549f82ff7bdd68d1aa658ea6ad2033e5310fd5936f40b94007b4ae6c38f
3
+ metadata.gz: d71c88a5941411b13770993de9b38f6321c263765ffce1a9bbd347fb960855ac
4
+ data.tar.gz: aa64333d4dc4de54f6e1b627287a5d11661634f3244a8e3a213428c243f155f1
5
5
  SHA512:
6
- metadata.gz: cc7f1da58549b642521808b9ea2acc4b04068bdb7c877cf52943d2ae69bb989f2ade02601b8bfd0e409440ad8644206bba1e6e16603eb56099b8963a2136e350
7
- data.tar.gz: 6258250ac5de22034cbb7816d3ff33c62680747a3eaee171fdd784d309bf1cd7880ca60e45342da8ee9195814ef1c78ca9b45b2c435f9fcbc4aaa43a8d7f95e6
6
+ metadata.gz: 1d598fe002552f46e53f803f46577adceeeb087b377a40b486d5d2ef7bf713463f429aa26b2687fb7d0b865d73aacf3262be71e17db154794ac82e1e4a245986
7
+ data.tar.gz: 3b7cb02b0e857e551c720a665ac31d3669a9a27e8c9e3e5c1cdc497517b8fbcd3e917d6b0735113e3b956b23ded042b44c72bfff637cfdbf2431642bd98aaa2b
data/CHANGELOG.md CHANGED
@@ -7,8 +7,37 @@ All notable changes to this project will be documented in this file.
7
7
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
8
8
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
9
9
 
10
+ [unreleased]: https://github.com/dieter-medium/bidi2pdf/compare/v0.1.8..HEAD
11
+
10
12
  <!-- generated by git-cliff end -->
11
13
 
14
+ ## [0.1.8] - 2025-04-22
15
+
16
+ ### 🎨 Refactored
17
+
18
+ - Modularize ChromedriverContainer implementation by @dieter-medium
19
+ - Replace method calls for clarity and consistency by @dieter-medium
20
+ - Namespace PDFTextSanitizer under Bidi2pdf::TestHelpers by @dieter-medium
21
+ - Refactor command management with concurrent queues by @dieter-medium
22
+
23
+ ### 🐛 Fixed
24
+
25
+ - Update CHANGELOG links to correct Markdown syntax by @dieter-medium
26
+
27
+ ### 📝 Docs
28
+
29
+ - Add Rails integration section to README by @dieter-medium
30
+
31
+ ### 🚀 Added
32
+
33
+ - Update Chromedriver container setup and default image by @dieter-medium
34
+ - Add workflow for pushing Chromedriver Docker image by @dieter-medium
35
+ - Return session status and add test coverage by @dieter-medium
36
+ - Integrate concurrent-ruby for thread safety improvements by @dieter-medium
37
+ - Add specific navigation error classes for better handling by @dieter-medium
38
+ - Enhance navigation error handling in BrowserTab by @dieter-medium
39
+ - Add test helpers and matchers for PDF validation by @dieter-medium
40
+
12
41
  ## [0.1.7] - 2025-04-17
13
42
 
14
43
  ### 🎨 Refactored
@@ -143,12 +172,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
143
172
 
144
173
  - Initial release
145
174
 
146
- [unreleased]: https://github.com/dieter-medium/bidi2pdf/compare/v0.1.7..HEAD
147
-
148
- [unreleased]: https://github.com/dieter-medium/bidi2pdf/compare/v0.1.6..v0.1.7
149
-
150
- [0.1.6]: https://github.com/dieter-medium/bidi2pdf/compare/v0.1.5..v0.1.6
151
-
152
- [0.1.5]: https://github.com/dieter-medium/bidi2pdf/compare/v0.1.4..v0.1.5
153
175
 
154
- [0.1.4]: https://github.com/dieter-medium/bidi2pdf/compare/v0.1.3..v0.1.4
176
+ - [unreleased](https://github.com/dieter-medium/bidi2pdf/compare/v0.1.8..HEAD)
177
+ - [0.1.8](https://github.com/dieter-medium/bidi2pdf/compare/v0.1.7..v0.1.8)
178
+ - [0.1.7](https://github.com/dieter-medium/bidi2pdf/compare/v0.1.6..v0.1.7)
179
+ - [0.1.6](https://github.com/dieter-medium/bidi2pdf/compare/v0.1.5..v0.1.6)
180
+ - [0.1.5](https://github.com/dieter-medium/bidi2pdf/compare/v0.1.4..v0.1.5)
181
+ - [0.1.4](https://github.com/dieter-medium/bidi2pdf/compare/v0.1.3..v0.1.4)
data/README.md CHANGED
@@ -257,6 +257,20 @@ docker compose -f docker/docker-compose.yml down
257
257
 
258
258
  ---
259
259
 
260
+ ## 🚂 Rails Integration
261
+
262
+ Rails integration is available as an additional gem:
263
+
264
+ ```ruby
265
+ # In your Gemfile
266
+ gem 'bidi2pdf-rails'
267
+ ```
268
+
269
+ For full documentation and usage examples,
270
+ visit: [https://github.com/dieter-medium/bidi2pdf-rails](https://github.com/dieter-medium/bidi2pdf-rails)
271
+
272
+ ---
273
+
260
274
  ## 🛠 Development
261
275
 
262
276
  ```bash
@@ -7,7 +7,7 @@ ENV DEBIAN_FRONTEND=noninteractive
7
7
  # Install dependencies
8
8
  RUN apt-get update && apt-get upgrade -y && \
9
9
  apt-get install -y --no-install-recommends\
10
- chromium \
10
+ chromium chromium-driver\
11
11
  libglib2.0-0 \
12
12
  libnss3 \
13
13
  libxss1 \
@@ -26,6 +26,13 @@ RUN groupadd -r appuser && useradd -r -g appuser -m -d /home/appuser appuser
26
26
  COPY ./docker/entrypoint.sh /usr/local/bin/entrypoint.sh
27
27
  RUN chmod +x /usr/local/bin/entrypoint.sh
28
28
 
29
+ # ARM compatibility workaround:
30
+ # On ARM architectures (such as Apple Silicon), downloading chromedriver via automated scripts may fail or cause ELF binary errors,
31
+ # such as "rosetta error: failed to open elf at /lib64/ld-linux-x86-64.so.2".
32
+ # To avoid these issues, we directly install 'chromium-driver' via the package manager and explicitly create a symlink in the expected location.
33
+
34
+ RUN mkdir -p /home/appuser/.webdrivers && ln -s /usr/bin/chromedriver /home/appuser/.webdrivers/chromedriver
35
+
29
36
  # Set working directory
30
37
  WORKDIR /app
31
38
 
@@ -4,6 +4,7 @@ require "base64"
4
4
 
5
5
  require_relative "network_events"
6
6
  require_relative "logger_events"
7
+ require_relative "navigation_failed_events"
7
8
  require_relative "auth_interceptor"
8
9
  require_relative "add_headers_interceptor"
9
10
  require_relative "js_logger_helper"
@@ -32,6 +33,11 @@ require_relative "js_logger_helper"
32
33
  # @param [String] user_context_id The ID of the user context.
33
34
  module Bidi2pdf
34
35
  module Bidi
36
+ # Represents a browser tab for managing interactions and communication
37
+ # using the Bidi2pdf library. This class provides methods for creating
38
+ # browser tabs, managing cookies, navigating to URLs, executing scripts,
39
+ # handling network events, and general tab lifecycle management.
40
+ #
35
41
  class BrowserTab
36
42
  include JsLoggerHelper
37
43
 
@@ -56,6 +62,9 @@ module Bidi2pdf
56
62
  # @return [LoggerEvents] The logger events handler.
57
63
  attr_reader :logger_events
58
64
 
65
+ # @return [NavigationFailedEvents] The navigation failed events handler.
66
+ attr_reader :navigation_failed_events
67
+
59
68
  # Initializes a new browser tab.
60
69
  #
61
70
  # @param [Object] client The WebSocket client for communication.
@@ -68,6 +77,7 @@ module Bidi2pdf
68
77
  @tabs = []
69
78
  @network_events = NetworkEvents.new browsing_context_id
70
79
  @logger_events = LoggerEvents.new browsing_context_id
80
+ @navigation_failed_events = NavigationFailedEvents.new browsing_context_id
71
81
  @open = true
72
82
  end
73
83
 
@@ -154,8 +164,21 @@ module Bidi2pdf
154
164
 
155
165
  # Navigates the browser tab to a specified URL.
156
166
  #
167
+ # This method registers necessary event listeners and sends a navigation
168
+ # command to the browser tab, instructing it to load the specified URL.
169
+ # It validates that the URL is properly formatted before attempting navigation.
170
+ #
157
171
  # @param [String] url The URL to navigate to.
172
+ # @raise [NavigationError] If the URL is invalid or improperly formatted.
173
+ # @example
174
+ # browser_tab.navigate_to("https://example.com")
158
175
  def navigate_to(url)
176
+ begin
177
+ URI.parse(url)
178
+ rescue URI::InvalidURIError => e
179
+ raise NavigationError, "Invalid URL: #{url} - #{e.message}"
180
+ end
181
+
159
182
  Bidi2pdf.notification_service.instrument("navigate_to.bidi2pdf", url: url) do
160
183
  navigate_with_listeners url
161
184
  end
@@ -389,6 +412,18 @@ module Bidi2pdf
389
412
  client.send_cmd_and_wait(cmd) do |response|
390
413
  Bidi2pdf.logger.debug "Navigated to page url: #{url} response: #{response}"
391
414
  end
415
+ rescue Bidi2pdf::CmdError => e
416
+ msg = e.response["message"]
417
+ case msg
418
+ when /^net::ERR_INVALID_AUTH_CREDENTIALS/
419
+ raise NavigationAuthError.new(url, msg)
420
+ when /^net::ERR_NAME_NOT_RESOLVED/
421
+ raise NavigationDNSError.new(url, msg)
422
+ when /^net::/
423
+ raise NavigationError, "Connection error: #{url} #{msg}"
424
+ else
425
+ raise e
426
+ end
392
427
  end
393
428
 
394
429
  def register_event_listeners
@@ -401,6 +436,8 @@ module Bidi2pdf
401
436
 
402
437
  client.on_event("log.entryAdded",
403
438
  &logger_events.method(:handle_event))
439
+
440
+ client.on_event("browsingContext.navigationFailed", &navigation_failed_events.method(:handle_event))
404
441
  end
405
442
 
406
443
  def handle_injection_exception(response, url, exception_class)
@@ -536,6 +573,9 @@ module Bidi2pdf
536
573
 
537
574
  client.remove_event_listener "network.responseStarted", "network.responseCompleted", "network.fetchError",
538
575
  &network_events.method(:handle_event)
576
+
577
+ client.remove_event_listener("log.entryAdded",
578
+ &logger_events.method(:handle_event))
539
579
  end
540
580
 
541
581
  # Closes all tabs associated with the browser tab.
@@ -5,11 +5,10 @@ module Bidi2pdf
5
5
  class CommandManager
6
6
  class << self
7
7
  def initialize_counter
8
- @id = 0
9
- @id_mutex = Mutex.new
8
+ @id = Concurrent::AtomicFixnum.new(0)
10
9
  end
11
10
 
12
- def next_id = @id_mutex.synchronize { @id += 1 }
11
+ def next_id = @id.increment
13
12
  end
14
13
 
15
14
  initialize_counter
@@ -17,19 +16,14 @@ module Bidi2pdf
17
16
  def initialize(socket)
18
17
  @socket = socket
19
18
 
20
- @pending_responses = {}
21
- @initiated_cmds = {}
19
+ @pending_responses = Concurrent::Hash.new
22
20
  end
23
21
 
24
- def send_cmd(cmd, store_response: false)
22
+ def send_cmd(cmd, result_queue: nil)
25
23
  id = next_id
26
24
 
27
25
  Bidi2pdf.notification_service.instrument("send_cmd.bidi2pdf", id: id, cmd: cmd) do |instrumentation_payload|
28
- if store_response
29
- init_queue_for id
30
- else
31
- @initiated_cmds[id] = true
32
- end
26
+ init_queue_for id, result_queue
33
27
 
34
28
  payload = cmd.as_payload(id)
35
29
 
@@ -42,17 +36,20 @@ module Bidi2pdf
42
36
  end
43
37
 
44
38
  def send_cmd_and_wait(cmd, timeout: Bidi2pdf.default_timeout, &block)
39
+ result_queue = Thread::Queue.new
40
+
45
41
  Bidi2pdf.notification_service.instrument("send_cmd_and_wait.bidi2pdf", cmd: cmd, timeout: timeout) do |instrumentation_payload|
46
- id = send_cmd(cmd, store_response: true)
42
+ id = send_cmd(cmd, result_queue: result_queue)
47
43
 
48
44
  instrumentation_payload[:id] = id
49
45
 
50
- response = pop_response id, timeout: timeout
46
+ response = result_queue.pop(timeout: timeout)
51
47
 
52
48
  instrumentation_payload[:response] = response
53
49
 
54
50
  raise CmdTimeoutError, "Timeout waiting for response to command ID #{id}" if response.nil?
55
- raise CmdError, "Error response: #{response["error"]} #{cmd.inspect}" if response["error"]
51
+
52
+ raise Bidi2pdf::CmdError.new(cmd, response) if response["error"]
56
53
 
57
54
  block ? block.call(response) : response
58
55
  ensure
@@ -60,14 +57,6 @@ module Bidi2pdf
60
57
  end
61
58
  end
62
59
 
63
- def pop_response(id, timeout:)
64
- raise CmdResponseNotStoredError, "No response stored for command ID #{id} or already popped or this command was not send" unless @pending_responses.key?(id)
65
-
66
- @pending_responses[id].pop(timeout: timeout)
67
- ensure
68
- @pending_responses.delete(id)
69
- end
70
-
71
60
  def handle_response(data)
72
61
  Bidi2pdf.notification_service.instrument("handle_response.bidi2pdf", data: data) do |instrumentation_payload|
73
62
  instrumentation_payload[:error] = data["error"] if data["error"]
@@ -78,9 +67,6 @@ module Bidi2pdf
78
67
 
79
68
  if @pending_responses.key?(id)
80
69
  @pending_responses[id]&.push(data)
81
- return true
82
- elsif @initiated_cmds.key?(id)
83
- @initiated_cmds.delete(id)
84
70
 
85
71
  return true
86
72
  end
@@ -89,12 +75,14 @@ module Bidi2pdf
89
75
  instrumentation_payload[:handled] = false
90
76
 
91
77
  false
78
+ ensure
79
+ @pending_responses.delete id
92
80
  end
93
81
  end
94
82
 
95
83
  private
96
84
 
97
- def init_queue_for(id) = @pending_responses[id] = Thread::Queue.new
85
+ def init_queue_for(id, result_queue) = @pending_responses[id] = result_queue
98
86
 
99
87
  def next_id = self.class.next_id
100
88
  end
@@ -6,7 +6,7 @@ module Bidi2pdf
6
6
  def initialize(logger:)
7
7
  @logger = logger
8
8
  @connected = false
9
- @connection_queue = Thread::Queue.new
9
+ @connection_latch = Concurrent::CountDownLatch.new(1)
10
10
  end
11
11
 
12
12
  def mark_connected
@@ -14,7 +14,7 @@ module Bidi2pdf
14
14
 
15
15
  @connected = true
16
16
  @logger.debug "WebSocket connection is open"
17
- @connection_queue.push(true)
17
+ @connection_latch.count_down
18
18
  end
19
19
 
20
20
  def wait_until_open(timeout:)
@@ -22,13 +22,7 @@ module Bidi2pdf
22
22
 
23
23
  @logger.debug "Waiting for WebSocket connection to open"
24
24
 
25
- begin
26
- Timeout.timeout(timeout) do
27
- @connection_queue.pop
28
- end
29
- rescue Timeout::Error
30
- raise Bidi2pdf::WebsocketError, "WebSocket connection did not open in time #{timeout} sec."
31
- end
25
+ raise Bidi2pdf::WebsocketError, "WebSocket connection did not open in time #{timeout} sec." unless @connection_latch.wait(timeout)
32
26
 
33
27
  true
34
28
  end
@@ -6,7 +6,7 @@ module Bidi2pdf
6
6
  attr_reader :type
7
7
 
8
8
  def initialize(type)
9
- @listeners = Hash.new { |h, k| h[k] = [] }
9
+ @listeners = Concurrent::Hash.new { |h, k| h[k] = [] }
10
10
  @type = type
11
11
  end
12
12
 
@@ -0,0 +1,41 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "browser_console_logger"
4
+
5
+ module Bidi2pdf
6
+ module Bidi
7
+ class NavigationFailedEvents
8
+ attr_reader :context_id, :browser_console_logger
9
+
10
+ def initialize(context_id)
11
+ @context_id = context_id
12
+ end
13
+
14
+ def handle_event(data)
15
+ event = data["params"]
16
+ method = data["method"]
17
+
18
+ if event["context"] == context_id
19
+ handle_response(method, event)
20
+ else
21
+ Bidi2pdf.logger.debug2 "Ignoring Log event: #{method}, context_id: #{context_id}, params: #{event}"
22
+ end
23
+ end
24
+
25
+ def handle_response(_method, event)
26
+ url = event["url"]
27
+ navigation = event["navigation"]
28
+ timestamp = event["timestamp"]
29
+
30
+ Bidi2pdf.notification_service.instrument("navigation_failed_received.bidi2pdf",
31
+ {
32
+ url: url,
33
+ timestamp: timestamp,
34
+ navigation: navigation
35
+ })
36
+
37
+ Bidi2pdf.logger.error "Navigation failed for URL: #{url}, Navigation: #{navigation}"
38
+ end
39
+ end
40
+ end
41
+ end
@@ -117,7 +117,10 @@ module Bidi2pdf
117
117
 
118
118
  # Retrieves the status of the session.
119
119
  def status
120
- send_cmd(Bidi2pdf::Bidi::Commands::SessionStatus.new) { |resp| Bidi2pdf.logger.info "Session status: #{resp.inspect}" }
120
+ send_cmd(Bidi2pdf::Bidi::Commands::SessionStatus.new) do |resp|
121
+ Bidi2pdf.logger.info "Session status: #{resp["result"].inspect}"
122
+ resp["result"]
123
+ end
121
124
  end
122
125
 
123
126
  # Checks if the session has started.
@@ -18,7 +18,7 @@ module Bidi2pdf
18
18
  module Notifications
19
19
  Thread.attr_accessor :bidi2pdf_notification_instrumenter
20
20
 
21
- @subscribers = Hash.new { |h, k| h[k] = [] }
21
+ @subscribers = Concurrent::Hash.new { |h, k| h[k] = [] }
22
22
 
23
23
  class << self
24
24
  attr_reader :subscribers
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../pdf_text_sanitizer"
4
+
5
+ # Custom RSpec matcher for checking whether a PDF document contains specific text.
6
+ #
7
+ # This matcher allows you to assert that a certain string or regular expression
8
+ # is present in the sanitized text of a PDF document.
9
+ #
10
+ # It supports chaining with `.at_page(n)` to limit the search to a specific page.
11
+ #
12
+ # ## Examples
13
+ #
14
+ # expect(pdf_data).to contains_pdf_text("Total: 123.45")
15
+ # expect(pdf_data).to contains_pdf_text(/Invoice #\d+/).at_page(2)
16
+ #
17
+ # @param expected [String, Regexp] The text or pattern to match inside the PDF.
18
+ #
19
+ # @return [Boolean] true if the expected content is found (on the given page if specified)
20
+ RSpec::Matchers.define :contains_pdf_text do |expected|
21
+ chain :at_page do |page_number|
22
+ @page_number = page_number
23
+ end
24
+
25
+ match do |actual|
26
+ Bidi2pdf::TestHelpers::PDFTextSanitizer.contains?(actual, expected, @page_number)
27
+ end
28
+
29
+ failure_message do |actual|
30
+ pages = Bidi2pdf::TestHelpers::PDFTextSanitizer.clean_pages(actual)
31
+
32
+ return "Document does not contain page #{@page_number}" if @page_number && !(@page_number && @page_number <= pages.size)
33
+
34
+ <<~MSG
35
+ PDF text did not contain expected content.
36
+
37
+ --- Expected (#{expected.inspect}) ---
38
+ On page #{@page_number || "any"}:
39
+
40
+ --- Actual ---
41
+ #{pages.each_with_index.map { |text, i| "Page #{i + 1}:\n#{text}" }.join("\n\n")}
42
+ MSG
43
+ end
44
+
45
+ description do
46
+ desc = "contain #{expected.inspect} in PDF"
47
+ desc += " on page #{@page_number}" if @page_number
48
+ desc
49
+ end
50
+ end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "pdf-reader"
4
+ require "base64"
5
+
6
+ # RSpec matcher to assert the number of pages in a PDF document.
7
+ #
8
+ # This matcher is useful for verifying the structural integrity of generated or uploaded PDFs,
9
+ # especially in tests for reporting, invoice generation, or document exports.
10
+ #
11
+ # It supports a variety of input types:
12
+ # - Raw PDF data as a `String`
13
+ # - File paths (`String`)
14
+ # - `StringIO` or `File` objects
15
+ # - Even Base64-encoded strings, if your `pdf_reader_for` method handles it
16
+ #
17
+ # ## Example
18
+ #
19
+ # expect(pdf_data).to have_pdf_page_count(5)
20
+ # expect(StringIO.new(pdf_data)).to have_pdf_page_count(3)
21
+ #
22
+ # If the PDF is malformed, the matcher will gracefully fail and show the error message.
23
+ #
24
+ # @param expected_count [Integer] The number of pages the PDF is expected to contain.
25
+ # @return [RSpec::Matchers::Matcher] The matcher object for use in specs.
26
+ #
27
+ # @note This matcher depends on `Bidi2pdf::TestHelpers::PDFReaderUtils.pdf_reader_for`
28
+ # to extract the page count. Make sure it supports all your intended input formats.
29
+ RSpec::Matchers.define :have_pdf_page_count do |expected_count|
30
+ match do |pdf_data|
31
+ reader = Bidi2pdf::TestHelpers::PDFReaderUtils.pdf_reader_for(pdf_data)
32
+ @actual_count = reader.page_count
33
+ @actual_count == expected_count
34
+ rescue PDF::Reader::MalformedPDFError => e
35
+ @error_message = e.message
36
+ false
37
+ end
38
+
39
+ failure_message do |_pdf_data|
40
+ if @error_message
41
+ "Expected a valid PDF with #{expected_count} pages, but encountered an error: #{@error_message}"
42
+ else
43
+ "Expected PDF to have #{expected_count} pages, but it has #{@actual_count} pages"
44
+ end
45
+ end
46
+
47
+ description do
48
+ "have #{expected_count} PDF pages"
49
+ end
50
+ end
@@ -0,0 +1,45 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "../pdf_text_sanitizer"
4
+
5
+ # Custom RSpec matcher to compare the **sanitized text content** of two PDF files.
6
+ #
7
+ # This matcher is useful for comparing PDF documents where formatting and metadata may differ,
8
+ # but the actual visible text content should be the same. It uses `PDFTextSanitizer` internally
9
+ # to normalize and clean the text before comparison.
10
+ #
11
+ # ## Example
12
+ #
13
+ # expect(actual_pdf).to match_pdf_text(expected_pdf)
14
+ #
15
+ # If the texts don’t match, it prints a diff-friendly message showing cleaned text content.
16
+ #
17
+ # @param expected [String, StringIO, File] The expected PDF content (can be a file path, StringIO, or raw string).
18
+ # @return [RSpec::Matchers::Matcher] An RSpec matcher to compare against an actual PDF.
19
+ #
20
+ # @note Ensure `PDFTextSanitizer.match?` and `PDFTextSanitizer.clean_pages` are implemented
21
+ # to handle your specific PDF processing logic.
22
+ RSpec::Matchers.define :match_pdf_text do |expected|
23
+ match do |actual|
24
+ Bidi2pdf::TestHelpers::PDFTextSanitizer.match?(actual, expected)
25
+ end
26
+
27
+ failure_message do |actual|
28
+ cleaned_actual = Bidi2pdf::TestHelpers::PDFTextSanitizer.clean_pages(actual)
29
+ cleaned_expected = Bidi2pdf::TestHelpers::PDFTextSanitizer.clean_pages(expected)
30
+
31
+ <<~MSG
32
+ PDF text did not match.
33
+
34
+ --- Expected ---
35
+ #{cleaned_expected.join("\n")}
36
+
37
+ --- Actual ---
38
+ #{cleaned_actual.join("\n")}
39
+ MSG
40
+ end
41
+
42
+ description do
43
+ "match sanitized PDF text content"
44
+ end
45
+ end
@@ -0,0 +1,89 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Bidi2pdf
4
+ module TestHelpers
5
+ module PDFReaderUtils
6
+ class << self
7
+ # Extracts text content from a PDF document.
8
+ #
9
+ # This method accepts various PDF input formats and attempts to extract text content
10
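+ # from all pages. If extraction fails due to malformed PDF data, it returns the original input wrapped in a one-element array; inputs that are not a String, StringIO, or File are returned unchanged.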
11
+ #
12
+ # @param pdf_data [String, StringIO, File] The PDF data in one of the following formats:
13
+ # * Base64-encoded PDF string
14
+ # * Raw PDF data beginning with "%PDF-"
15
+ # * StringIO object containing PDF data
16
+ # * Path to a PDF file as String
17
+ # * Raw PDF data as String
18
+ # @return [Array<String>] An array of strings, with each string representing the text content of a page
19
+ # @return [Object] The original input, unchanged, when it is not a String, StringIO, or File
20
+ # @example Extract text from a PDF file
21
+ # text_content = pdf_text('path/to/document.pdf')
22
+ #
23
+ # @example Extract text from Base64-encoded string
24
+ # text_content = pdf_text(base64_encoded_pdf_data)
25
+ def pdf_text(pdf_data)
26
+ return pdf_data unless pdf_data.is_a?(String) || pdf_data.is_a?(StringIO) || pdf_data.is_a?(File)
27
+
28
+ begin
29
+ reader = pdf_reader_for pdf_data
30
+ reader.pages.map(&:text)
31
+ rescue PDF::Reader::MalformedPDFError
32
+ [pdf_data]
33
+ end
34
+ end
35
+
36
+ # Converts the input PDF data into an IO object and initializes a PDF::Reader.
37
+ #
38
+ # @param pdf_data [String, StringIO, File] The PDF data to be read.
39
+ # @return [PDF::Reader] A PDF::Reader instance for the given data.
40
+ # @raise [PDF::Reader::MalformedPDFError] If the PDF data is invalid.
41
+ def pdf_reader_for(pdf_data)
42
+ io = convert_data_to_io(pdf_data)
43
+ PDF::Reader.new(io)
44
+ end
45
+
46
+ # rubocop: disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
47
+ # Converts various input formats into an IO object for PDF::Reader.
48
+ #
49
+ # @param pdf_data [String, StringIO, File] The PDF data to be converted.
50
+ # @return [IO] An IO object containing the PDF data.
51
+ def convert_data_to_io(pdf_data)
52
+ # rubocop:disable Lint/DuplicateBranch
53
+ if pdf_data.is_a?(String) && (pdf_data.start_with?("JVBERi") || pdf_data.start_with?("JVBER"))
54
+ StringIO.new(Base64.decode64(pdf_data))
55
+ elsif pdf_data.start_with?("%PDF-")
56
+ StringIO.new(pdf_data)
57
+ elsif pdf_data.is_a?(StringIO)
58
+ pdf_data
59
+ elsif pdf_data.is_a?(String) && File.exist?(pdf_data)
60
+ File.open(pdf_data, "rb")
61
+ else
62
+ StringIO.new(pdf_data)
63
+ end
64
+ # rubocop:enable Lint/DuplicateBranch
65
+ end
66
+ end
67
+
68
+ # rubocop: enable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity
69
+
70
+ module InstanceMethods
71
+ def pdf_text(pdf_data)
72
+ PDFReaderUtils.pdf_text(pdf_data)
73
+ end
74
+
75
+ def pdf_reader_for(pdf_data)
76
+ PDFReaderUtils.pdf_reader_for(pdf_data)
77
+ end
78
+
79
+ def convert_data_to_io(pdf_data)
80
+ PDFReaderUtils.convert_data_to_io(pdf_data)
81
+ end
82
+ end
83
+
84
+ def self.included(base)
85
+ base.include(InstanceMethods)
86
+ end
87
+ end
88
+ end
89
+ end
@@ -0,0 +1,232 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "unicode_utils"
4
+ require "diff/lcs"
5
+ require "diff/lcs/hunk"
6
+
7
+ module Bidi2pdf
8
+ module TestHelpers
9
+ # rubocop: disable Metrics/ModuleLength
10
+ # Provides utilities for sanitizing and comparing PDF text content.
11
+ # This module includes methods for cleaning text, comparing PDF content,
12
+ # and reporting differences between actual and expected PDF outputs.
13
+ #
14
+ # The sanitization process includes normalizing whitespace, replacing
15
+ # typographic ligatures, and handling other common text formatting issues.
16
+ #
17
+ # @example Cleaning text
18
+ # sanitized_text = Bidi2pdf::TestHelpers::PDFTextSanitizer.clean("Some text")
19
+ #
20
+ # @example Comparing PDF content
21
+ # match = Bidi2pdf::TestHelpers::PDFTextSanitizer.match?(actual_pdf, expected_pdf)
22
+ module PDFTextSanitizer
23
+ class << self
24
+ # Cleans the given text by replacing common typographic ligatures,
25
+ # normalizing whitespace, and removing unnecessary characters.
26
+ #
27
+ # @param [String] text The text to clean.
28
+ # @return [String] The cleaned text.
29
+ def clean(text)
30
+ text = UnicodeUtils.nfkd(text)
31
+
32
+ text.gsub("\uFB01", "fi")
33
+ .gsub("\uFB02", "fl")
34
+ .gsub("-\n", "")
35
+ .gsub(/["]/, '"')
36
+ .gsub(/[']/, "'")
37
+ .gsub("…", "...")
38
+ .gsub("—", "--")
39
+ .gsub("–", "-")
40
+ .gsub(/\s+/, " ") # Replace all whitespace sequences with a single space
41
+ .strip
42
+ end
43
+
44
+ # Cleans an array of PDF page texts by applying the `clean` method
45
+ # to each page's content.
46
+ #
47
+ # @param [Object] actual_pdf_thingy The PDF object to clean.
48
+ # @return [Array<String>] An array of cleaned page texts.
49
+ def clean_pages(actual_pdf_thingy)
50
+ Bidi2pdf::TestHelpers::PDFReaderUtils.pdf_text(actual_pdf_thingy).map { |text| clean(text) }
51
+ end
52
+
53
+ # Cleans the given text and removes all whitespace for comparison purposes.
54
+ #
55
+ # @param [String] text The text to clean and normalize.
56
+ # @return [String] The cleaned text without whitespace.
57
+ def normalize(text)
58
+ clean(text).gsub(/\s+/, "")
59
+ end
60
+
61
+ # Checks if the given PDF contains the expected text or pattern.
62
+ #
63
+ # @param [Object] actual_pdf_thingy The PDF object to search.
64
+ # @param [String, Regexp] expected The expected text or pattern.
65
+ # @param [Integer, nil] page_number The specific page to search (optional).
66
+ # @return [Boolean] `true` if the expected text is found, `false` otherwise.
67
+ def contains?(actual_pdf_thingy, expected, page_number = nil)
68
+ pages = Bidi2pdf::TestHelpers::PDFReaderUtils.pdf_text(actual_pdf_thingy)
69
+ cleaned_pages = clean_pages(pages)
70
+
71
+ return false if page_number && page_number > cleaned_pages.size
72
+
73
+ # Narrow to specific page if requested
74
+ if page_number
75
+ text = cleaned_pages[page_number - 1]
76
+ return match_expected?(text, expected)
77
+ end
78
+
79
+ # Search all pages
80
+ cleaned_pages.any? { |page| match_expected?(page, expected) }
81
+ end
82
+
83
+ # Matches the given text against the expected text or pattern.
84
+ #
85
+ # @param [String] text The text to match.
86
+ # @param [String, Regexp] expected The expected text or pattern.
87
+ # @return [Boolean] `true` if the text matches, `false` otherwise.
88
+ def match_expected?(text, expected)
89
+ return false unless text
90
+
91
+ expected.is_a?(Regexp) ? text.match?(expected) : text.include?(expected.to_s)
92
+ end
93
+
94
+ # Compares the content of two PDF objects for equality.
95
+ #
96
+ # @param [Object] actual_pdf_thingy The actual PDF object.
97
+ # @param [Object] expected_pdf_thingy The expected PDF object.
98
+ # @return [Boolean] `true` if the content matches, `false` otherwise.
99
+ def match?(actual_pdf_thingy, expected_pdf_thingy)
100
+ actual = Bidi2pdf::TestHelpers::PDFReaderUtils.pdf_text actual_pdf_thingy
101
+ expected = Bidi2pdf::TestHelpers::PDFReaderUtils.pdf_text expected_pdf_thingy
102
+
103
+ cleaned_actual = clean_pages(actual)
104
+ cleaned_expected = clean_pages(expected)
105
+
106
+ # Compare without whitespace for equality check
107
+ actual_for_comparison = cleaned_actual.map { |text| normalize(text) }
108
+ expected_for_comparison = cleaned_expected.map { |text| normalize(text) }
109
+
110
+ if actual_for_comparison == expected_for_comparison
111
+ true
112
+ else
113
+ report_content_mismatch(cleaned_actual, cleaned_expected)
114
+ false
115
+ end
116
+ end
117
+
118
+ # Reports differences between actual and expected PDF content.
119
+ #
120
+ # @param [Array<String>] actual The actual PDF content.
121
+ # @param [Array<String>] expected The expected PDF content.
122
+ # @return [void]
123
+ def report_content_mismatch(actual, expected)
124
+ puts "--- PDF content mismatch ---"
125
+ print_differences(actual, expected)
126
+ end
127
+
128
+ # Prints detailed differences between actual and expected PDF content.
129
+ #
130
+ # @param [Array<String>] actual The actual PDF content.
131
+ # @param [Array<String>] expected The expected PDF content.
132
+ # @return [void]
133
+ def print_differences(actual, expected)
134
+ max_pages = [actual.length, expected.length].max
135
+
136
+ (0...max_pages).each do |page_idx|
137
+ actual_page = actual[page_idx] || "(missing page)"
138
+ expected_page = expected[page_idx] || "(missing page)"
139
+
140
+ print_differences_for_page(actual_page, expected_page, page_idx)
141
+ end
142
+ end
143
+
144
+ # Prints the differences between actual and expected content for a specific page.
145
+ # This method compares the content ignoring whitespace and, if differences are found,
146
+ # outputs a formatted representation of those differences.
147
+ #
148
+ # @param [String] actual_page The actual page content.
149
+ # @param [String] expected_page The expected page content.
150
+ # @param [Integer] page_idx The zero-based index of the page being compared.
151
+ # @return [void]
152
+ def print_differences_for_page(actual_page, expected_page, page_idx)
153
+ # Compare without whitespace
154
+ actual_no_space = normalize(actual_page.to_s)
155
+ expected_no_space = normalize(expected_page.to_s)
156
+
157
+ return if actual_no_space == expected_no_space
158
+
159
+ puts "\nPage #{page_idx + 1} differences (ignoring whitespace):"
160
+
161
+ # Create diffs between the two pages
162
+ diffs = Diff::LCS.sdiff(expected_page, actual_page)
163
+
164
+ # Format and display the differences
165
+ puts format_diff_output(diffs, expected_page, actual_page)
166
+ end
167
+
168
+ # Formats the output of differences for display.
169
+ #
170
+ # @param [Array<Diff::LCS::ContextChange>] diffs The list of differences.
171
+ # @param [String] expected The expected text.
172
+ # @param [String] actual The actual text.
173
+ # @return [String] The formatted differences.
174
+ def format_diff_output(diffs, expected, actual)
175
+ output = []
176
+
177
+ changes = group_changed_diffs(diffs)
178
+
179
+ # Output each change with context
180
+ changes.each do |change|
181
+ output += format_change expected, actual, change
182
+ end
183
+
184
+ output.join("\n")
185
+ end
186
+
187
+ private
188
+
189
+ # Groups contiguous “real” diffs (added/removed/changed) into blocks,
190
+ # splitting whenever you hit an unchanged (“=”) diff.
191
+ def group_changed_diffs(diffs)
192
+ diffs
193
+ .chunk_while { |_prev, curr| curr.action != "=" }
194
+ .map { |chunk| chunk.reject { |elem| elem.action == "=" } }
195
+ .select(&:any?)
196
+ .map { |chunk| { diffs: chunk } }
197
+ end
198
+
199
+ def format_change(expected, actual, change)
200
+ pos = change[:diffs].first.old_position
201
+ snippets = extract_snippets(expected, actual, change, pos)
202
+
203
+ build_output(snippets, pos)
204
+ end
205
+
206
+ def extract_snippets(expected, actual, change, pos)
207
+ {
208
+ context_start: [0, pos - 20].max,
209
+ context: expected,
210
+ expected_snip: expected[pos, 50],
211
+ actual_snip: actual[change[:diffs].first.new_position, 50]
212
+ }
213
+ end
214
+
215
+ # Builds the final lines of output for a single change
216
+ def build_output(snip_data, pos)
217
+ start = snip_data[:context_start]
218
+ ctx = snip_data[:context]
219
+
220
+ [
221
+ " Context: ...#{ctx[start...pos]}",
222
+ " Expected: #{snip_data[:expected_snip]}...",
223
+ " Actual: #{snip_data[:actual_snip]}...",
224
+ " Expected (no spaces): #{normalize(snip_data[:expected_snip])}...",
225
+ " Actual (no spaces): #{normalize(snip_data[:actual_snip])}..."
226
+ ]
227
+ end
228
+ end
229
+ end
230
+ # rubocop:enable Metrics/ModuleLength
231
+ end
232
+ end
@@ -0,0 +1,87 @@
1
+ # frozen_string_literal: true
2
+
3
# testcontainers is only needed by the Bidi2pdf test helpers, so a failed
# require prints a hint naming the gem. The gem name is written out literally
# because no `dep` variable exists in this file.
begin
  require "testcontainers"
rescue LoadError
  warn "Missing testcontainers. Add it to your Gemfile if you're using Bidi2pdf test helpers."
end
8
+
9
+ module Bidi2pdf
10
+ module TestHelpers
11
+ module Testcontainers
12
# Docker container definition for a ChromeDriver image, built on top of
# ::Testcontainers::DockerContainer. It can run the configured image via
# #start, or build the image from a local directory first via
# #start_local_image.
class ChromedriverContainer < ::Testcontainers::DockerContainer
  DEFAULT_CHROMEDRIVER_PORT = 3000
  DEFAULT_IMAGE = "dieters877565/chromedriver"

  attr_reader :docker_file, :build_dir

  # @param [String] image the image name; also used as the tag when building locally
  # @param [Hash] options container options; :docker_file (default "Dockerfile")
  #   and :build_dir (default: options[:working_dir]) are consumed here, the
  #   rest is passed on to the superclass
  def initialize(image = DEFAULT_IMAGE, **options)
    @docker_file = options.delete(:docker_file) || "Dockerfile"
    @build_dir = options.delete(:build_dir) || options[:working_dir]

    # Bare super forwards image and the remaining options.
    super

    # Only install the log-based wait strategy when none has been set yet.
    @wait_for ||= add_wait_for(:logs, /ChromeDriver was started successfully on port/)
  end

  # Exposes the ChromeDriver port and then delegates to the superclass.
  def start
    with_exposed_ports(port)
    super
  end

  # @return [Integer] the ChromeDriver port inside the container
  def port
    DEFAULT_CHROMEDRIVER_PORT
  end

  # rubocop: disable Metrics/AbcSize

  # Builds the image from build_dir, tagging it with the configured image
  # name, and echoes the non-empty "stream" entries of the build log to
  # stdout with a timestamp.
  #
  # The Docker read timeout is raised to 10 minutes for the duration of the
  # build and restored afterwards, even when the build raises.
  #
  # @return the result of Docker::Image.build_from_dir
  def build_local_image
    old_timeout = Docker.options[:read_timeout]
    Docker.options[:read_timeout] = 60 * 10

    Docker::Image.build_from_dir(build_dir, { "t" => image, "dockerfile" => docker_file }) do |lines|
      lines.split("\n").each do |line|
        next unless (log = JSON.parse(line)) && log.key?("stream")
        next unless log["stream"] && !(trimmed_stream = log["stream"].strip).empty?

        timestamp = Time.now.strftime("[%Y-%m-%dT%H:%M:%S.%6N]")
        $stdout.write "#{timestamp} #{trimmed_stream}\n"
      end
    end
  ensure
    # Restore the previous timeout on every exit path so that a failed build
    # does not leave the global Docker options modified.
    Docker.options[:read_timeout] = old_timeout
  end

  # rubocop: enable Metrics/AbcSize

  # rubocop: disable Metrics/AbcSize

  # Builds the local image, then creates (unless one already exists) and
  # starts a container from it, records its id, name and creation time, and
  # runs the wait strategy.
  #
  # @return [ChromedriverContainer] self
  # @raise [::Testcontainers::NotFoundError] on Docker::Error::NotFoundError
  # @raise [::Testcontainers::ConnectionError] on Excon::Error::Socket
  def start_local_image
    build_local_image

    with_exposed_ports(port)

    @_container ||= Docker::Container.create(_container_create_options)
    @_container.start

    @_id = @_container.id
    json = @_container.json
    @name = json["Name"]
    @_created_at = json["Created"]

    @wait_for&.call(self)

    self
  rescue Docker::Error::NotFoundError => e
    # The leading :: is required: a bare `Testcontainers` resolves to the
    # enclosing Bidi2pdf::TestHelpers::Testcontainers module, which does not
    # define these error classes.
    raise ::Testcontainers::NotFoundError, e.message
  rescue Excon::Error::Socket => e
    raise ::Testcontainers::ConnectionError, e.message
  end

  # rubocop: enable Metrics/AbcSize

  # @param [String] protocol URL scheme, "http" by default
  # @return [String] the session endpoint URL built from the container host
  #   and the host port mapped to the ChromeDriver port
  def session_url(protocol: "http")
    "#{protocol}://#{host}:#{mapped_port(port)}/session"
  end
end
85
+ end
86
+ end
87
+ end
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+
3
+ %w[pdf-reader diff-lcs unicode_utils].each do |dep|
4
+ require dep
5
+ rescue LoadError
6
+ warn "Missing #{dep}. Add it to your Gemfile if you're using Bidi2pdf test helpers."
7
+ end
8
+
9
+ require "bidi2pdf/test_helpers/pdf_text_sanitizer"
10
+ require "bidi2pdf/test_helpers/pdf_reader_utils"
11
+ require "bidi2pdf/test_helpers/matchers/match_pdf_text"
12
+ require "bidi2pdf/test_helpers/matchers/contains_pdf_text"
13
+ require "bidi2pdf/test_helpers/matchers/have_pdf_page_count"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Bidi2pdf
4
- VERSION = "0.1.7"
4
+ VERSION = "0.1.8"
5
5
  end
data/lib/bidi2pdf.rb CHANGED
@@ -1,5 +1,8 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "concurrent-ruby"
4
+ require "logger"
5
+
3
6
  require_relative "bidi2pdf/process_tree"
4
7
  require_relative "bidi2pdf/launcher"
5
8
  require_relative "bidi2pdf/bidi/session"
@@ -8,8 +11,6 @@ require_relative "bidi2pdf/notifications"
8
11
  require_relative "bidi2pdf/notifications/logging_subscriber"
9
12
  require_relative "bidi2pdf/verbose_logger"
10
13
 
11
- require "logger"
12
-
13
14
  module Bidi2pdf
14
15
  PAPER_FORMATS_CM = {
15
16
  letter: { width: 21.59, height: 27.94 },
@@ -33,7 +34,16 @@ module Bidi2pdf
33
34
 
34
35
  class ClientError < WebsocketError; end
35
36
 
36
- class CmdError < ClientError; end
37
# Client error that keeps the command and the response it was built from.
class CmdError < ClientError
  attr_reader :cmd, :response

  # @param cmd the command; its #inspect output is appended to the message
  # @param response the response; its "error" entry is included in the message
  def initialize(cmd, response)
    @cmd, @response = cmd, response

    description = "Error response: #{response["error"]} #{cmd.inspect}"
    super(description)
  end
end
37
47
 
38
48
  class CmdResponseNotStoredError < ClientError; end
39
49
 
@@ -55,6 +65,25 @@ module Bidi2pdf
55
65
  end
56
66
  end
57
67
 
68
+ class NavigationError < Error; end
69
+
70
# Navigation error that records the URL for which authentication failed.
class NavigationAuthError < NavigationError
  attr_reader :url

  # @param url the URL that was being navigated to
  # @param message optional detail appended after the standard text
  def initialize(url, message = nil)
    @url = url

    description = "Navigation to #{url} failed due to authentication error. #{message}"
    super(description)
  end
end
78
+
79
+ class NavigationTimeoutError < NavigationError; end
80
+
81
+ class NavigationNotFoundError < NavigationError; end
82
+
83
+ class NavigationDNSError < NavigationError; end
84
+
85
+ # Global configuration for Bidi2pdf
86
+
58
87
  class << self
59
88
  attr_accessor :default_timeout, :enable_default_logging_subscriber
60
89
  attr_reader :logging_subscriber, :logger, :network_events_logger, :browser_console_logger, :notification_service
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: bidi2pdf
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.7
4
+ version: 0.1.8
5
5
  platform: ruby
6
6
  authors:
7
7
  - Dieter S.
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-04-17 00:00:00.000000000 Z
11
+ date: 2025-04-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: base64
@@ -38,6 +38,26 @@ dependencies:
38
38
  - - ">="
39
39
  - !ruby/object:Gem::Version
40
40
  version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: concurrent-ruby
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.0'
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: 1.3.1
51
+ type: :runtime
52
+ prerelease: false
53
+ version_requirements: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - "~>"
56
+ - !ruby/object:Gem::Version
57
+ version: '1.0'
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: 1.3.1
41
61
  - !ruby/object:Gem::Dependency
42
62
  name: json
43
63
  requirement: !ruby/object:Gem::Requirement
@@ -379,6 +399,7 @@ files:
379
399
  - lib/bidi2pdf/bidi/interceptor.rb
380
400
  - lib/bidi2pdf/bidi/js_logger_helper.rb
381
401
  - lib/bidi2pdf/bidi/logger_events.rb
402
+ - lib/bidi2pdf/bidi/navigation_failed_events.rb
382
403
  - lib/bidi2pdf/bidi/network_event.rb
383
404
  - lib/bidi2pdf/bidi/network_event_formatters.rb
384
405
  - lib/bidi2pdf/bidi/network_event_formatters/network_event_console_formatter.rb
@@ -398,6 +419,13 @@ files:
398
419
  - lib/bidi2pdf/notifications/logging_subscriber.rb
399
420
  - lib/bidi2pdf/process_tree.rb
400
421
  - lib/bidi2pdf/session_runner.rb
422
+ - lib/bidi2pdf/test_helpers.rb
423
+ - lib/bidi2pdf/test_helpers/matchers/contains_pdf_text.rb
424
+ - lib/bidi2pdf/test_helpers/matchers/have_pdf_page_count.rb
425
+ - lib/bidi2pdf/test_helpers/matchers/match_pdf_text.rb
426
+ - lib/bidi2pdf/test_helpers/pdf_reader_utils.rb
427
+ - lib/bidi2pdf/test_helpers/pdf_text_sanitizer.rb
428
+ - lib/bidi2pdf/test_helpers/testcontainers/chromedriver_container.rb
401
429
  - lib/bidi2pdf/verbose_logger.rb
402
430
  - lib/bidi2pdf/version.rb
403
431
  - sig/bidi2pdf.rbs