bidi2pdf 0.1.7 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +64 -8
- data/README.md +14 -0
- data/docker/Dockerfile.chromedriver +30 -5
- data/docker/entrypoint.sh +41 -0
- data/lib/bidi2pdf/bidi/browser_tab.rb +59 -8
- data/lib/bidi2pdf/bidi/client.rb +7 -5
- data/lib/bidi2pdf/bidi/command_manager.rb +14 -26
- data/lib/bidi2pdf/bidi/connection_manager.rb +3 -9
- data/lib/bidi2pdf/bidi/event_manager.rb +35 -5
- data/lib/bidi2pdf/bidi/interceptor.rb +12 -2
- data/lib/bidi2pdf/bidi/navigation_failed_events.rb +41 -0
- data/lib/bidi2pdf/bidi/session.rb +6 -1
- data/lib/bidi2pdf/bidi/web_socket_dispatcher.rb +5 -5
- data/lib/bidi2pdf/chromedriver_manager.rb +25 -11
- data/lib/bidi2pdf/notifications.rb +1 -1
- data/lib/bidi2pdf/test_helpers/matchers/contains_pdf_text.rb +50 -0
- data/lib/bidi2pdf/test_helpers/matchers/have_pdf_page_count.rb +50 -0
- data/lib/bidi2pdf/test_helpers/matchers/match_pdf_text.rb +45 -0
- data/lib/bidi2pdf/test_helpers/pdf_reader_utils.rb +89 -0
- data/lib/bidi2pdf/test_helpers/pdf_text_sanitizer.rb +232 -0
- data/lib/bidi2pdf/test_helpers/testcontainers/chromedriver_container.rb +81 -0
- data/lib/bidi2pdf/test_helpers/testcontainers/chromedriver_test_helper.rb +103 -0
- data/lib/bidi2pdf/test_helpers/testcontainers/shared_docker_network.rb +21 -0
- data/lib/bidi2pdf/test_helpers/testcontainers/testcontainers_refinement.rb +53 -0
- data/lib/bidi2pdf/test_helpers/testcontainers.rb +17 -0
- data/lib/bidi2pdf/test_helpers.rb +13 -0
- data/lib/bidi2pdf/version.rb +1 -1
- data/lib/bidi2pdf.rb +32 -3
- data/sig/bidi2pdf/bidi/event_manager.rbs +19 -13
- metadata +35 -6
@@ -117,7 +117,10 @@ module Bidi2pdf
|
|
117
117
|
|
118
118
|
# Retrieves the status of the session.
#
# Sends a SessionStatus command; the "result" entry of the response is logged
# at info level and is the value of the block handed to send_cmd.
def status
  command = Bidi2pdf::Bidi::Commands::SessionStatus.new

  send_cmd(command) do |response|
    result = response["result"]
    Bidi2pdf.logger.info "Session status: #{result.inspect}"
    result
  end
end
|
122
125
|
|
123
126
|
# Checks if the session has started.
|
@@ -162,6 +165,7 @@ module Bidi2pdf
|
|
162
165
|
Bidi2pdf.logger.info "Subscribing to events"
|
163
166
|
|
164
167
|
Bidi::Client.new(websocket_url).tap do |event_client|
|
168
|
+
@event_socket = event_client
|
165
169
|
event_client.start
|
166
170
|
event_client.wait_until_open
|
167
171
|
|
@@ -317,6 +321,7 @@ module Bidi2pdf
|
|
317
321
|
# Cleans up resources associated with the session.
#
# Closes the command client and the event socket (when present) and drops
# every reference held by the session, so that calling cleanup again does not
# try to close an already closed socket a second time.
def cleanup
  @client&.close
  @event_socket&.close
  @client = @event_socket = @websocket_url = @browser = nil
end
|
322
327
|
end
|
@@ -24,7 +24,7 @@ module Bidi2pdf
|
|
24
24
|
|
25
25
|
# Forwards the given block to socket_events.on for the :message event.
def on_message(&)
  socket_events.on(:message, &)
end
|
26
26
|
|
27
|
-
def on_event(
|
27
|
+
# Forwards the given block to session_events.on for every listed event name.
def on_event(*event_names, &)
  session_events.on(*event_names, &)
end
|
28
28
|
|
29
29
|
# Forwards the given block to socket_events.on for the :open event.
def on_open(&)
  socket_events.on(:open, &)
end
|
30
30
|
|
@@ -34,13 +34,13 @@ module Bidi2pdf
|
|
34
34
|
|
35
35
|
# Passes the listener to socket_events.off for the :message event.
def remove_message_listener(block)
  socket_events.off(:message, block)
end
|
36
36
|
|
37
|
-
def remove_event_listener(name,
|
37
|
+
# Passes the event name and listener to session_events.off.
def remove_event_listener(name, listener)
  session_events.off(name, listener)
end
|
38
38
|
|
39
|
-
def remove_open_listener(
|
39
|
+
# Passes the listener to socket_events.off for the :open event.
def remove_open_listener(listener)
  socket_events.off(:open, listener)
end
|
40
40
|
|
41
|
-
def remove_close_listener(
|
41
|
+
# Passes the listener to socket_events.off for the :close event.
def remove_close_listener(listener)
  socket_events.off(:close, listener)
end
|
42
42
|
|
43
|
-
def remove_error_listener(
|
43
|
+
# Passes the listener to socket_events.off for the :error event.
def remove_error_listener(listener)
  socket_events.off(:error, listener)
end
|
44
44
|
|
45
45
|
private
|
46
46
|
|
@@ -8,6 +8,7 @@ module Bidi2pdf
|
|
8
8
|
include Chromedriver::Binary::Platform
|
9
9
|
|
10
10
|
attr_reader :port, :pid, :started, :headless, :chrome_args, :shutdown_mutex
|
11
|
+
attr_accessor :reader_thread
|
11
12
|
|
12
13
|
def initialize(port: 0, headless: true, chrome_args: Bidi::Session::DEFAULT_CHROME_ARGS)
|
13
14
|
@port = port
|
@@ -49,10 +50,20 @@ module Bidi2pdf
|
|
49
50
|
"http://localhost:#{@port}/session"
|
50
51
|
end
|
51
52
|
|
53
|
+
# rubocop: disable Metrics/AbcSize
|
52
54
|
def stop(timeout: 5)
|
53
55
|
shutdown_mutex.synchronize do
|
54
56
|
return unless @pid
|
55
57
|
|
58
|
+
if reader_thread&.alive?
|
59
|
+
begin
|
60
|
+
reader_thread.kill
|
61
|
+
reader_thread.join
|
62
|
+
rescue StandardError => e
|
63
|
+
Bidi2pdf.logger.error "Error killing reader thread: #{e.message}"
|
64
|
+
end
|
65
|
+
end
|
66
|
+
|
56
67
|
@started = false
|
57
68
|
|
58
69
|
close_session
|
@@ -72,6 +83,8 @@ module Bidi2pdf
|
|
72
83
|
end
|
73
84
|
end
|
74
85
|
|
86
|
+
# rubocop: enable Metrics/AbcSize
|
87
|
+
|
75
88
|
private
|
76
89
|
|
77
90
|
def spawn_process(cmd)
|
@@ -184,25 +197,26 @@ module Bidi2pdf
|
|
184
197
|
|
185
198
|
# rubocop: disable Metrics/AbcSize
|
186
199
|
# Reads chromedriver's output on a background thread and blocks until the
# startup banner naming the listening port has been seen.
#
# Every line read from +io+ is logged at info level with a "[chromedriver]"
# prefix. When the banner is found, @port is taken from it unless a non-zero
# port is already set. The thread is stored in reader_thread and closes +io+
# once it stops reading.
#
# @param io [#each_line, #close, #closed?] stream carrying chromedriver output
# @param timeout [Numeric] seconds to wait for the banner
# @return [nil]
# @raise [RuntimeError] if the banner is not seen within +timeout+ seconds
def parse_port_from_output(io, timeout: 5)
  port_reported = Concurrent::Event.new

  self.reader_thread = Thread.new do
    begin
      io.each_line do |output_line|
        Bidi2pdf.logger.info "[chromedriver] #{output_line.chomp}"

        banner = /ChromeDriver was started successfully on port (\d+)/.match(output_line)
        next unless banner

        @port = banner[1].to_i if @port.nil? || @port.zero?
        port_reported.set
      end
    rescue IOError
      # reader closed
    ensure
      io.close unless io.closed?
    end
  end

  raise "Chromedriver did not report a usable port in #{timeout}s" unless port_reported.wait(timeout)
end
|
207
221
|
|
208
222
|
# rubocop: enable Metrics/AbcSize
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "../pdf_text_sanitizer"
|
4
|
+
|
5
|
+
# Custom RSpec matcher for checking whether a PDF document contains specific text.
|
6
|
+
#
|
7
|
+
# This matcher allows you to assert that a certain string or regular expression
|
8
|
+
# is present in the sanitized text of a PDF document.
|
9
|
+
#
|
10
|
+
# It supports chaining with `.at_page(n)` to limit the search to a specific page.
|
11
|
+
#
|
12
|
+
# ## Examples
|
13
|
+
#
|
14
|
+
# expect(pdf_data).to contains_pdf_text("Total: 123.45")
|
15
|
+
# expect(pdf_data).to contains_pdf_text(/Invoice #\d+/).at_page(2)
|
16
|
+
#
|
17
|
+
# @param expected [String, Regexp] The text or pattern to match inside the PDF.
|
18
|
+
#
|
19
|
+
# @return [Boolean] true if the expected content is found (on the given page if specified)
|
20
|
+
RSpec::Matchers.define :contains_pdf_text do |expected|
  # Restricts the search to a single page; page numbers are 1-based.
  chain :at_page do |page_number|
    @page_number = page_number
  end

  match do |actual|
    Bidi2pdf::TestHelpers::PDFTextSanitizer.contains?(actual, expected, @page_number)
  end

  failure_message do |actual|
    pages = Bidi2pdf::TestHelpers::PDFTextSanitizer.clean_pages(actual)

    # A requested page outside 1..pages.size (including 0 and negative
    # numbers) does not exist, so report that instead of dumping every page.
    if @page_number && !@page_number.between?(1, pages.size)
      "Document does not contain page #{@page_number}"
    else
      <<~MSG
        PDF text did not contain expected content.

        --- Expected (#{expected.inspect}) ---
        On page #{@page_number || "any"}:

        --- Actual ---
        #{pages.each_with_index.map { |text, i| "Page #{i + 1}:\n#{text}" }.join("\n\n")}
      MSG
    end
  end

  description do
    desc = "contain #{expected.inspect} in PDF"
    desc += " on page #{@page_number}" if @page_number
    desc
  end
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "pdf-reader"
|
4
|
+
require "base64"
|
5
|
+
|
6
|
+
# RSpec matcher to assert the number of pages in a PDF document.
|
7
|
+
#
|
8
|
+
# This matcher is useful for verifying the structural integrity of generated or uploaded PDFs,
|
9
|
+
# especially in tests for reporting, invoice generation, or document exports.
|
10
|
+
#
|
11
|
+
# It supports a variety of input types:
|
12
|
+
# - Raw PDF data as a `String`
|
13
|
+
# - File paths (`String`)
|
14
|
+
# - `StringIO` or `File` objects
|
15
|
+
# - Even Base64-encoded strings, if your `pdf_reader_for` method handles it
|
16
|
+
#
|
17
|
+
# ## Example
|
18
|
+
#
|
19
|
+
# expect(pdf_data).to have_pdf_page_count(5)
|
20
|
+
# expect(StringIO.new(pdf_data)).to have_pdf_page_count(3)
|
21
|
+
#
|
22
|
+
# If the PDF is malformed, the matcher will gracefully fail and show the error message.
|
23
|
+
#
|
24
|
+
# @param expected_count [Integer] The number of pages the PDF is expected to contain.
|
25
|
+
# @return [RSpec::Matchers::Matcher] The matcher object for use in specs.
|
26
|
+
#
|
27
|
+
# @note This matcher depends on `Bidi2pdf::TestHelpers::PDFReaderUtils.pdf_reader_for`
|
28
|
+
# to extract the page count. Make sure it supports all your intended input formats.
|
29
|
+
RSpec::Matchers.define :have_pdf_page_count do |expected_count|
  # Counts the pages via PDFReaderUtils.pdf_reader_for; a malformed PDF makes
  # the match fail and its error text is kept for the failure message.
  match do |pdf_data|
    begin
      @actual_count = Bidi2pdf::TestHelpers::PDFReaderUtils.pdf_reader_for(pdf_data).page_count
      @actual_count == expected_count
    rescue PDF::Reader::MalformedPDFError => e
      @error_message = e.message
      false
    end
  end

  failure_message do |_pdf_data|
    malformed = "Expected a valid PDF with #{expected_count} pages, but encountered an error: #{@error_message}"
    mismatch = "Expected PDF to have #{expected_count} pages, but it has #{@actual_count} pages"

    @error_message ? malformed : mismatch
  end

  description { "have #{expected_count} PDF pages" }
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require_relative "../pdf_text_sanitizer"
|
4
|
+
|
5
|
+
# Custom RSpec matcher to compare the **sanitized text content** of two PDF files.
|
6
|
+
#
|
7
|
+
# This matcher is useful for comparing PDF documents where formatting and metadata may differ,
|
8
|
+
# but the actual visible text content should be the same. It uses `PDFTextSanitizer` internally
|
9
|
+
# to normalize and clean the text before comparison.
|
10
|
+
#
|
11
|
+
# ## Example
|
12
|
+
#
|
13
|
+
# expect(actual_pdf).to match_pdf_text(expected_pdf)
|
14
|
+
#
|
15
|
+
# If the texts don’t match, it prints a diff-friendly message showing cleaned text content.
|
16
|
+
#
|
17
|
+
# @param expected [String, StringIO, File] The expected PDF content (can be a file path, StringIO, or raw string).
|
18
|
+
# @return [RSpec::Matchers::Matcher] An RSpec matcher to compare against an actual PDF.
|
19
|
+
#
|
20
|
+
# @note Ensure `PDFTextSanitizer.match?` and `PDFTextSanitizer.clean_pages` are implemented
|
21
|
+
# to handle your specific PDF processing logic.
|
22
|
+
RSpec::Matchers.define :match_pdf_text do |expected|
  match { |actual| Bidi2pdf::TestHelpers::PDFTextSanitizer.match?(actual, expected) }

  # Shows the cleaned page texts of both documents, one page per line.
  failure_message do |actual|
    actual_text, expected_text = [actual, expected].map do |pdf|
      Bidi2pdf::TestHelpers::PDFTextSanitizer.clean_pages(pdf).join("\n")
    end

    <<~MSG
      PDF text did not match.

      --- Expected ---
      #{expected_text}

      --- Actual ---
      #{actual_text}
    MSG
  end

  description { "match sanitized PDF text content" }
end
|
@@ -0,0 +1,89 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Bidi2pdf
  module TestHelpers
    # Helpers that turn the different ways a test can hold a PDF (raw bytes,
    # Base64 text, file path, IO object) into something PDF::Reader can read.
    module PDFReaderUtils
      class << self
        # Extracts the text of every page of a PDF document.
        #
        # @param pdf_data [String, StringIO, File] The PDF data in one of the following formats:
        #   * Base64-encoded PDF string
        #   * Raw PDF data beginning with "%PDF-"
        #   * StringIO or File object containing PDF data
        #   * Path to a PDF file as String
        #   * Raw PDF data as String
        # @return [Array<String>] one string per page; when the data is not a
        #   readable PDF, a one-element array holding the original input
        # @return [Object] the input itself, unchanged, when it is neither a
        #   String, a StringIO nor a File
        # @example Extract text from a PDF file
        #   text_content = pdf_text('path/to/document.pdf')
        #
        # @example Extract text from Base64-encoded string
        #   text_content = pdf_text(base64_encoded_pdf_data)
        def pdf_text(pdf_data)
          return pdf_data unless pdf_data.is_a?(String) || pdf_data.is_a?(StringIO) || pdf_data.is_a?(File)

          begin
            pdf_reader_for(pdf_data).pages.map(&:text)
          rescue PDF::Reader::MalformedPDFError
            [pdf_data]
          end
        end

        # Converts the input PDF data into an IO object and initializes a PDF::Reader.
        #
        # @param pdf_data [String, StringIO, File] The PDF data to be read.
        # @return [PDF::Reader] A PDF::Reader instance for the given data.
        # @raise [PDF::Reader::MalformedPDFError] If the PDF data is invalid.
        def pdf_reader_for(pdf_data)
          PDF::Reader.new(convert_data_to_io(pdf_data))
        end

        # Converts various input formats into an IO object for PDF::Reader.
        #
        # Objects that already respond to #read (StringIO, File, ...) are
        # returned as they are. Strings are inspected to decide whether they
        # hold Base64 text, raw PDF bytes or a file path.
        #
        # @param pdf_data [String, StringIO, File] The PDF data to be converted.
        # @return [IO, StringIO] An IO-like object containing the PDF data.
        # @raise [ArgumentError] If the input is neither a String nor readable.
        def convert_data_to_io(pdf_data)
          return string_to_io(pdf_data) if pdf_data.is_a?(String)
          return pdf_data if pdf_data.respond_to?(:read)

          raise ArgumentError, "Unsupported PDF input: #{pdf_data.class}"
        end

        private

        # Picks the right IO wrapper for PDF data held in a String.
        def string_to_io(data)
          if data.start_with?("JVBER")
            # "JVBER" is how the Base64 encoding of "%PDF" begins.
            # unpack1("m") is the core-Ruby equivalent of Base64.decode64 and
            # needs no extra require.
            StringIO.new(data.unpack1("m"))
          elsif data.start_with?("%PDF-")
            StringIO.new(data)
          elsif !data.include?("\0") && File.exist?(data)
            # File.exist? raises on strings with NUL bytes, which binary data
            # can contain, hence the guard. The opened file is handed to the
            # caller, who owns closing it.
            File.open(data, "rb")
          else
            StringIO.new(data)
          end
        end
      end

      # Instance-level delegators, mixed into any class that includes PDFReaderUtils.
      module InstanceMethods
        def pdf_text(pdf_data)
          PDFReaderUtils.pdf_text(pdf_data)
        end

        def pdf_reader_for(pdf_data)
          PDFReaderUtils.pdf_reader_for(pdf_data)
        end

        def convert_data_to_io(pdf_data)
          PDFReaderUtils.convert_data_to_io(pdf_data)
        end
      end

      def self.included(base)
        base.include(InstanceMethods)
      end
    end
  end
end
|
@@ -0,0 +1,232 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "unicode_utils"
|
4
|
+
require "diff/lcs"
|
5
|
+
require "diff/lcs/hunk"
|
6
|
+
|
7
|
+
module Bidi2pdf
  module TestHelpers
    # rubocop: disable Metrics/ModuleLength
    # Provides utilities for sanitizing and comparing PDF text content.
    # This module includes methods for cleaning text, comparing PDF content,
    # and reporting differences between actual and expected PDF outputs.
    #
    # The sanitization process includes normalizing whitespace, replacing
    # typographic ligatures, quotes and dashes, and joining hyphenated line breaks.
    #
    # @example Cleaning text
    #   sanitized_text = Bidi2pdf::TestHelpers::PDFTextSanitizer.clean("Some text")
    #
    # @example Comparing PDF content
    #   match = Bidi2pdf::TestHelpers::PDFTextSanitizer.match?(actual_pdf, expected_pdf)
    module PDFTextSanitizer
      class << self
        # Cleans the given text: applies NFKD normalization, replaces the
        # fi/fl ligatures, typographic quotes, ellipsis and dashes with ASCII
        # equivalents, removes hyphen + newline pairs, and collapses every
        # whitespace run into a single space.
        #
        # @param [String] text The text to clean.
        # @return [String] The cleaned text.
        def clean(text)
          UnicodeUtils.nfkd(text)
                      .gsub("\uFB01", "fi")
                      .gsub("\uFB02", "fl")
                      .gsub("-\n", "")
                      .gsub(/[\u201C\u201D]/, '"') # left/right double quotation marks
                      .gsub(/[\u2018\u2019]/, "'") # left/right single quotation marks
                      .gsub("\u2026", "...")
                      .gsub("\u2014", "--")
                      .gsub("\u2013", "-")
                      .gsub(/\s+/, " ")
                      .strip
        end

        # Extracts the page texts of the given PDF and cleans each of them.
        #
        # @param [Object] actual_pdf_thingy Anything PDFReaderUtils.pdf_text
        #   accepts; an Array of page texts is used as it is.
        # @return [Array<String>] An array of cleaned page texts.
        def clean_pages(actual_pdf_thingy)
          Bidi2pdf::TestHelpers::PDFReaderUtils.pdf_text(actual_pdf_thingy).map { |text| clean(text) }
        end

        # Cleans the given text and removes all whitespace for comparison purposes.
        #
        # @param [String] text The text to clean and normalize.
        # @return [String] The cleaned text without whitespace.
        def normalize(text)
          clean(text).gsub(/\s+/, "")
        end

        # Checks if the given PDF contains the expected text or pattern.
        #
        # @param [Object] actual_pdf_thingy The PDF object to search.
        # @param [String, Regexp] expected The expected text or pattern.
        # @param [Integer, nil] page_number The 1-based page to search (optional).
        # @return [Boolean] `true` if the expected text is found, `false`
        #   otherwise, including when +page_number+ is outside 1..page count.
        def contains?(actual_pdf_thingy, expected, page_number = nil)
          cleaned_pages = clean_pages(actual_pdf_thingy)

          return cleaned_pages.any? { |page| match_expected?(page, expected) } unless page_number

          # Reject 0 and negative numbers as well: Array#[] would otherwise
          # count from the end and silently search a different page.
          return false unless page_number.between?(1, cleaned_pages.size)

          match_expected?(cleaned_pages[page_number - 1], expected)
        end

        # Matches the given text against the expected text or pattern.
        #
        # @param [String] text The text to match.
        # @param [String, Regexp] expected The expected text or pattern.
        # @return [Boolean] `true` if the text matches, `false` otherwise.
        def match_expected?(text, expected)
          return false unless text

          expected.is_a?(Regexp) ? text.match?(expected) : text.include?(expected.to_s)
        end

        # Compares the content of two PDF objects for equality, ignoring all
        # whitespace. On a mismatch the differences are printed to stdout.
        #
        # @param [Object] actual_pdf_thingy The actual PDF object.
        # @param [Object] expected_pdf_thingy The expected PDF object.
        # @return [Boolean] `true` if the content matches, `false` otherwise.
        def match?(actual_pdf_thingy, expected_pdf_thingy)
          cleaned_actual = clean_pages(actual_pdf_thingy)
          cleaned_expected = clean_pages(expected_pdf_thingy)

          same = cleaned_actual.map { |text| normalize(text) } == cleaned_expected.map { |text| normalize(text) }
          report_content_mismatch(cleaned_actual, cleaned_expected) unless same
          same
        end

        # Reports differences between actual and expected PDF content.
        #
        # @param [Array<String>] actual The actual PDF content.
        # @param [Array<String>] expected The expected PDF content.
        # @return [void]
        def report_content_mismatch(actual, expected)
          puts "--- PDF content mismatch ---"
          print_differences(actual, expected)
        end

        # Prints detailed differences between actual and expected PDF content,
        # page by page. A page present on only one side is compared against
        # the placeholder "(missing page)".
        #
        # @param [Array<String>] actual The actual PDF content.
        # @param [Array<String>] expected The expected PDF content.
        # @return [void]
        def print_differences(actual, expected)
          max_pages = [actual.length, expected.length].max

          max_pages.times do |page_idx|
            actual_page = actual[page_idx] || "(missing page)"
            expected_page = expected[page_idx] || "(missing page)"

            print_differences_for_page(actual_page, expected_page, page_idx)
          end
        end

        # Prints the differences between actual and expected content for a specific page.
        # Pages that are equal once whitespace is ignored produce no output.
        #
        # @param [String] actual_page The actual page content.
        # @param [String] expected_page The expected page content.
        # @param [Integer] page_idx The zero-based index of the page being compared.
        # @return [void]
        def print_differences_for_page(actual_page, expected_page, page_idx)
          return if normalize(actual_page.to_s) == normalize(expected_page.to_s)

          puts "\nPage #{page_idx + 1} differences (ignoring whitespace):"

          diffs = Diff::LCS.sdiff(expected_page, actual_page)
          puts format_diff_output(diffs, expected_page, actual_page)
        end

        # Formats the output of differences for display.
        #
        # @param [Array<Diff::LCS::ContextChange>] diffs The list of differences.
        # @param [String] expected The expected text.
        # @param [String] actual The actual text.
        # @return [String] The formatted differences.
        def format_diff_output(diffs, expected, actual)
          group_changed_diffs(diffs)
            .flat_map { |change| format_change(expected, actual, change) }
            .join("\n")
        end

        private

        # Groups contiguous "real" diffs (added/removed/changed) into blocks,
        # splitting whenever an unchanged ("=") diff is hit.
        def group_changed_diffs(diffs)
          diffs
            .chunk_while { |_prev, curr| curr.action != "=" }
            .map { |chunk| chunk.reject { |elem| elem.action == "=" } }
            .select(&:any?)
            .map { |chunk| { diffs: chunk } }
        end

        # Builds the output lines for one block of changes, anchored at the
        # position of its first diff in the expected text.
        def format_change(expected, actual, change)
          pos = change[:diffs].first.old_position
          snippets = extract_snippets(expected, actual, change, pos)

          build_output(snippets, pos)
        end

        # Collects up to 50 characters of expected and actual text starting at
        # the change, plus the start of the (at most 20 characters) leading context.
        def extract_snippets(expected, actual, change, pos)
          {
            context_start: [0, pos - 20].max,
            context: expected,
            expected_snip: expected[pos, 50],
            actual_snip: actual[change[:diffs].first.new_position, 50]
          }
        end

        # Builds the final lines of output for one block of changes.
        def build_output(snip_data, pos)
          start = snip_data[:context_start]
          ctx = snip_data[:context]

          [
            "  Context: ...#{ctx[start...pos]}",
            "  Expected: #{snip_data[:expected_snip]}...",
            "  Actual:   #{snip_data[:actual_snip]}...",
            "  Expected (no spaces): #{normalize(snip_data[:expected_snip])}...",
            "  Actual (no spaces):   #{normalize(snip_data[:actual_snip])}..."
          ]
        end
      end
    end
    # rubocop:enable Metrics/ModuleLength
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Bidi2pdf
  module TestHelpers
    module Testcontainers
      # Testcontainers wrapper for a Docker container running chromedriver.
      #
      # Besides the regular #start it offers #start_local_image, which first
      # builds the image from a local Dockerfile.
      class ChromedriverContainer < ::Testcontainers::DockerContainer
        DEFAULT_CHROMEDRIVER_PORT = 3000
        DEFAULT_IMAGE = "dieters877565/chromedriver"

        attr_reader :docker_file, :build_dir

        # @param image [String] image name; also used as the tag for local builds
        # @param options [Hash] forwarded to the superclass, except for
        #   :docker_file (default "Dockerfile") and :build_dir (default: the
        #   :working_dir option), which are consumed here
        def initialize(image = DEFAULT_IMAGE, **options)
          @docker_file = options.delete(:docker_file) || "Dockerfile"
          @build_dir = options.delete(:build_dir) || options[:working_dir]

          super

          # Unless a wait strategy is already set, wait for chromedriver's startup banner in the logs.
          @wait_for ||= add_wait_for(:logs, /ChromeDriver was started successfully on port/)
        end

        # Exposes the chromedriver port, then starts the container.
        def start
          with_exposed_ports(port)
          super
        end

        # @return [Integer] the port chromedriver listens on inside the container
        def port
          DEFAULT_CHROMEDRIVER_PORT
        end

        # rubocop: disable Metrics/AbcSize
        # Builds the image from build_dir/docker_file, tagging it with +image+,
        # and echoes the non-empty "stream" entries of the build log to stdout
        # with a timestamp.
        #
        # The Docker read timeout is raised to ten minutes for the build and
        # restored afterwards, even when the build fails.
        def build_local_image
          old_timeout = Docker.options[:read_timeout]
          Docker.options[:read_timeout] = 60 * 10

          begin
            Docker::Image.build_from_dir(build_dir, { "t" => image, "dockerfile" => docker_file }) do |lines|
              lines.split("\n").each do |line|
                next unless (log = JSON.parse(line)) && log.key?("stream")
                next unless log["stream"] && !(trimmed_stream = log["stream"].strip).empty?

                timestamp = Time.now.strftime("[%Y-%m-%dT%H:%M:%S.%6N]")
                $stdout.write "#{timestamp} #{trimmed_stream}\n"
              end
            end
          ensure
            Docker.options[:read_timeout] = old_timeout
          end
        end

        # rubocop: enable Metrics/AbcSize

        # rubocop: disable Metrics/AbcSize
        # Builds the local image, then creates and starts a container from it
        # and runs the configured wait strategy.
        #
        # @return [ChromedriverContainer] self
        # @raise [::Testcontainers::NotFoundError] on Docker::Error::NotFoundError
        # @raise [::Testcontainers::ConnectionError] on Excon::Error::Socket
        def start_local_image
          build_local_image

          with_exposed_ports(port)

          @_container ||= Docker::Container.create(_container_create_options)
          @_container.start

          @_id = @_container.id
          json = @_container.json
          @name = json["Name"]
          @_created_at = json["Created"]

          @wait_for&.call(self)

          self
        rescue Docker::Error::NotFoundError => e
          # The leading :: matters: a bare `Testcontainers` would resolve to the
          # enclosing Bidi2pdf::TestHelpers::Testcontainers module.
          raise ::Testcontainers::NotFoundError, e.message
        rescue Excon::Error::Socket => e
          raise ::Testcontainers::ConnectionError, e.message
        end

        # rubocop: enable Metrics/AbcSize

        # @param protocol [String] URL scheme to use
        # @return [String] session endpoint URL built from the container host and the mapped port
        def session_url(protocol: "http")
          "#{protocol}://#{host}:#{mapped_port(port)}/session"
        end
      end
    end
  end
end
|