hexapdf 1.5.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +54 -0
  3. data/README.md +8 -7
  4. data/examples/022-outline.rb +5 -1
  5. data/examples/032-acro_form_list_and_fill.rb +47 -0
  6. data/examples/033-text_extraction.rb +34 -0
  7. data/lib/hexapdf/cli/debug_info.rb +98 -0
  8. data/lib/hexapdf/cli/images.rb +2 -2
  9. data/lib/hexapdf/cli/info.rb +2 -0
  10. data/lib/hexapdf/cli/inspect.rb +5 -1
  11. data/lib/hexapdf/cli.rb +2 -0
  12. data/lib/hexapdf/configuration.rb +8 -0
  13. data/lib/hexapdf/content/canvas.rb +1 -1
  14. data/lib/hexapdf/content/smart_text_extractor.rb +305 -0
  15. data/lib/hexapdf/content.rb +2 -0
  16. data/lib/hexapdf/digital_signature/signing/default_handler.rb +1 -15
  17. data/lib/hexapdf/digital_signature/signing/signed_data_creator.rb +21 -8
  18. data/lib/hexapdf/document.rb +7 -3
  19. data/lib/hexapdf/encryption/security_handler.rb +3 -1
  20. data/lib/hexapdf/filter/brotli_decode.rb +88 -0
  21. data/lib/hexapdf/filter.rb +1 -0
  22. data/lib/hexapdf/font/cmap.rb +10 -6
  23. data/lib/hexapdf/font/true_type/builder.rb +1 -1
  24. data/lib/hexapdf/font/true_type/font.rb +13 -0
  25. data/lib/hexapdf/font/true_type/subsetter.rb +7 -2
  26. data/lib/hexapdf/font/true_type/table/directory.rb +5 -0
  27. data/lib/hexapdf/font/true_type.rb +1 -0
  28. data/lib/hexapdf/layout/style.rb +6 -2
  29. data/lib/hexapdf/parser.rb +29 -4
  30. data/lib/hexapdf/revision.rb +6 -2
  31. data/lib/hexapdf/task/pdfa.rb +108 -1
  32. data/lib/hexapdf/type/acro_form/field.rb +4 -1
  33. data/lib/hexapdf/type/acro_form/form.rb +4 -0
  34. data/lib/hexapdf/type/acro_form/text_field.rb +4 -2
  35. data/lib/hexapdf/type/annotations/widget.rb +9 -0
  36. data/lib/hexapdf/type/document_security_store.rb +80 -0
  37. data/lib/hexapdf/type/page.rb +11 -0
  38. data/lib/hexapdf/type.rb +1 -0
  39. data/lib/hexapdf/version.rb +1 -1
  40. data/test/data/pdfa/mismatching_glyph_widths_cidfont_type2.pdf +0 -0
  41. data/test/hexapdf/content/test_smart_text_extractor.rb +129 -0
  42. data/test/hexapdf/digital_signature/common.rb +19 -5
  43. data/test/hexapdf/digital_signature/signing/test_signed_data_creator.rb +29 -4
  44. data/test/hexapdf/digital_signature/test_signatures.rb +3 -3
  45. data/test/hexapdf/encryption/test_security_handler.rb +7 -5
  46. data/test/hexapdf/filter/test_brotli_decode.rb +34 -0
  47. data/test/hexapdf/font/true_type/table/test_directory.rb +5 -3
  48. data/test/hexapdf/font/true_type/test_builder.rb +9 -0
  49. data/test/hexapdf/font/true_type/test_font.rb +17 -3
  50. data/test/hexapdf/font/true_type/test_subsetter.rb +4 -3
  51. data/test/hexapdf/task/test_pdfa.rb +72 -0
  52. data/test/hexapdf/test_document.rb +13 -0
  53. data/test/hexapdf/test_parser.rb +55 -3
  54. data/test/hexapdf/test_revision.rb +27 -6
  55. data/test/hexapdf/type/acro_form/test_field.rb +5 -0
  56. data/test/hexapdf/type/acro_form/test_form.rb +6 -0
  57. data/test/hexapdf/type/acro_form/test_text_field.rb +7 -1
  58. data/test/hexapdf/type/annotations/test_widget.rb +11 -0
  59. data/test/hexapdf/type/test_page.rb +8 -0
  60. data/test/test_helper.rb +6 -0
  61. metadata +41 -4
@@ -0,0 +1,305 @@
1
+ # -*- encoding: utf-8; frozen_string_literal: true -*-
2
+ #
3
+ #--
4
+ # This file is part of HexaPDF.
5
+ #
6
+ # HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
7
+ # Copyright (C) 2014-2025 Thomas Leitner
8
+ #
9
+ # HexaPDF is free software: you can redistribute it and/or modify it
10
+ # under the terms of the GNU Affero General Public License version 3 as
11
+ # published by the Free Software Foundation with the addition of the
12
+ # following permission added to Section 15 as permitted in Section 7(a):
13
+ # FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
14
+ # THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
15
+ # INFRINGEMENT OF THIRD PARTY RIGHTS.
16
+ #
17
+ # HexaPDF is distributed in the hope that it will be useful, but WITHOUT
18
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
20
+ # License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Affero General Public License
23
+ # along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
24
+ #
25
+ # The interactive user interfaces in modified source and object code
26
+ # versions of HexaPDF must display Appropriate Legal Notices, as required
27
+ # under Section 5 of the GNU Affero General Public License version 3.
28
+ #
29
+ # In accordance with Section 7(b) of the GNU Affero General Public
30
+ # License, a covered work must retain the producer line in every PDF that
31
+ # is created or manipulated using HexaPDF.
32
+ #
33
+ # If the GNU Affero General Public License doesn't fit your need,
34
+ # commercial licenses are available at <https://gettalong.at/hexapdf/>.
35
+ #++
36
+
37
+ module HexaPDF
38
+ module Content
39
+
40
+ # This module converts the glyphs on a page to a single text string while preserving the layout.
41
+ #
42
+ # The general algorithm is:
43
+ #
44
+ # 1. Collect all individual glyphs with their user space coordinates in
45
+ # TextRunCollector::TextRun objects.
46
+ #
47
+ # 2. Sort text runs top to bottom and then left to right.
48
+ #
49
+ # 3. Group those text runs into lines based on a "baseline" while also combining neighboring
50
+ # text runs into larger runs.
51
+ #
52
+ # 4. Render each line into a string by taking into account the page size and the median glyph
53
+ # width for a text run to column mapping.
54
+ #
55
+ # 5. Add blank lines between text lines based on the page's normal line spacing.
56
+ module SmartTextExtractor
57
+
58
+ # This module provides the functionality for collecting the necessary TextRun instances for
59
+ # layouting the text.
60
+ #
61
+ # To use this module include it in a processor class. Then invoke the #collect_text_runs
62
+ # method in the #show_text and #show_text_with_positioning methods.
63
+ #
64
+ # Example:
65
+ #
66
+ # class CustomProcessor < HexaPDF::Content::Processor
67
+ # include TextRunCollector
68
+ #
69
+ # def show_text(str)
70
+ # collect_text_runs(decode_text_with_positioning(str))
71
+ # end
72
+ # alias show_text_with_positioning show_text
73
+ #
74
+ # end
75
+ #
76
+ # Once the processor has done its job, the collected text runs are available via the
77
+ # #text_runs method. Use them as input for SmartTextExtractor.layout_text_runs.
78
module TextRunCollector

  # A run of continuous glyphs together with the bounding box (in user space) that encloses
  # all of them.
  TextRun = Struct.new(:string, :left, :bottom, :right, :top) do
    # Approximation of the baseline: simply the bottom edge of the bounding box.
    def baseline
      bottom
    end

    # Vertical extent of the bounding box.
    def height
      top - bottom
    end

    # Horizontal extent of the bounding box.
    def width
      right - left
    end
  end

  # All TextRun instances collected so far, in collection order.
  attr_reader :text_runs

  def initialize # :nodoc:
    super
    @text_runs = []
  end

  private

  # Appends one TextRun per entry of +boxes+ to #text_runs.
  #
  # Each box must respond to +points+ (eight numbers: the x/y pairs of the lower-left,
  # lower-right, upper-right and upper-left corners) and +string+.
  def collect_text_runs(boxes)
    boxes.each do |glyph_box|
      corners = glyph_box.points
      # The box may be rotated, so the axis-aligned extent is the min/max over all corners.
      left, right = corners.values_at(0, 2, 6, 4).minmax
      bottom, top = corners.values_at(1, 3, 7, 5).minmax
      @text_runs.push(TextRun.new(+glyph_box.string, left, bottom, right, top))
    end
  end
end
120
+
121
+ # This processor class is used when layouting the text through
122
+ # HexaPDF::Type::Page#extract_text.
123
class TextRunProcessor < HexaPDF::Content::Processor

  include TextRunCollector

  # Decodes the shown text and records the result as text runs.
  def show_text(str)
    decoded_boxes = decode_text_with_positioning(str)
    collect_text_runs(decoded_boxes)
  end
  alias_method :show_text_with_positioning, :show_text

end
133
+
134
+ # Converts an array of TextRun objects into a single string representation, preserving the
135
+ # visual layout.
136
+ #
137
+ # The +page_width+ and +page_height+ arguments specify the width and height of the page from
138
+ # which the text runs were extracted.
139
+ #
140
+ # The remaining keyword arguments can be used to fine-tune the algorithm for one's needs:
141
+ #
142
+ # +line_tolerance_factor+::
143
+ # The tolerance factor is applied to the median text run height to determine the range
144
+ # within which two text runs are considered to be on the same line. This ensures that
145
+ # small differences in the baseline due to, for example, subscript or superscript parts
146
+ # don't result in multiple lines.
147
+ #
148
+ # The factor should not be too large to avoid forcing separate visual lines into one line
149
+ # but also not too small to avoid subscript/superscript being on separate lines. The
150
+ # default seems to work quite well.
151
+ #
152
+ # +paragraph_distance_threshold+::
153
+ # If the number of normal line spacings between two adjacent baselines is at least this
154
+ # large (but smaller than +large_distance_threshold+), the gap is interpreted as a
155
+ # paragraph break and a single blank line is inserted.
156
+ #
157
+ # +large_distance_threshold+::
158
+ # Works like +paragraph_distance_threshold+ and indicates if a number of normal line
159
+ # spacings is too large for being a paragraph break. A proportional number of blank lines
160
+ # is inserted in this case.
161
+ #
162
+ # This is used to represent large parts with non-text content like images.
163
def self.layout_text_runs(text_runs, page_width, page_height,
                          line_tolerance_factor: 0.4, paragraph_distance_threshold: 1.35,
                          large_distance_threshold: 3.0)
  # NOTE: +page_width+ and +page_height+ belong to the public signature but are not read by
  # the current implementation.
  return '' if text_runs.empty?

  # Use the median height of all text runs as an approximation of the main font size used on
  # the page. The line tolerance uses a hard floor of 2 for small fonts.
  median_height = median(text_runs.map(&:height).sort)
  line_tolerance = [median_height * line_tolerance_factor, 2].max

  # Pre-sort top to bottom and then left to right; the latter allows consecutive text runs to
  # be combined while they are grouped into lines.
  sorted = text_runs.sort_by {|run| [-run.baseline, run.left] }
  lines = group_into_lines(sorted, line_tolerance)

  # Calculate the normal line spacing, excluding anything too small/big.
  line_distances = lines.map(&:baseline).each_cons(2).map {|upper, lower| upper - lower }.
    select {|d| d >= median_height * 0.5 && d <= median_height * 2 }.sort
  normal_line_spacing = line_distances.empty? ? median_height * 1.2 : median(line_distances)
  # Force a Float: with integer coordinates the median may be an Integer and the ratio below
  # would otherwise be computed with truncating integer division (e.g. 30 / 20 == 1).
  normal_line_spacing = normal_line_spacing.to_f

  left_margin = lines.map {|line| line.text_runs[0].left }.min

  # Every glyph contributes the average glyph width of its text run.
  glyph_widths = lines.flat_map do |line|
    line.text_runs.flat_map do |run|
      count = run.string.length
      count.zero? ? [] : [run.width.to_f / count] * count
    end
  end.sort
  median_glyph_width = glyph_widths.empty? ? 0.0 : median(glyph_widths)
  unless median_glyph_width.positive? && median_glyph_width.finite?
    # Degenerate input (e.g. zero-sized text): a zero width would make the column calculation
    # divide by zero. Fall back to the median of the usable widths, then to half of the median
    # height and finally to one unit per column.
    usable_widths = glyph_widths.select {|w| w.positive? && w.finite? }
    median_glyph_width = usable_widths.empty? ? median_height * 0.5 : median(usable_widths)
    median_glyph_width = 1.0 unless median_glyph_width.positive?
  end

  # Convert the lines into actual text strings. Blank lines are inserted between the lines
  # based on the normal line spacing.
  output_lines = []
  last_index = lines.length - 1
  lines.each_with_index do |line, index|
    output_lines << text_runs_to_string(line.text_runs, median_glyph_width, left_margin)
    next if index == last_index

    # Without a usable line spacing no blank lines can be derived, so none are added.
    ratio = if normal_line_spacing.positive?
              (line.baseline - lines[index + 1].baseline) / normal_line_spacing
            else
              0.0
            end
    if ratio >= large_distance_threshold
      # Subtract 1 because the newline after the output line already counts as one
      # newline. Also cap at a maximum of 40 to avoid huge gaps.
      [ratio.round - 1, 40].min.times { output_lines << '' }
    elsif ratio >= paragraph_distance_threshold
      output_lines << ''
    end
  end

  output_lines.join("\n")
end
210
+
211
+ # Holds an array of TextRun objects and their median baseline.
212
+ Line = Struct.new(:text_runs, :baseline)
213
+
214
+ # Groups a sorted list of TextRuns (sorted by baseline, then left) into lines.
215
+ #
216
+ # Since the text_runs are already sorted, a single run through +sorted_text_runs+ is
217
+ # sufficient. A new line is created if a text run's baseline differs by more than +tolerance+
218
+ # from the current line's (median) baseline.
219
+ #
220
+ # The result is a list of Line objects with their contents sorted left to right.
221
def self.group_into_lines(sorted_text_runs, tolerance)
  return [] if sorted_text_runs.empty?

  lines = []
  current_line = []
  current_baseline = sorted_text_runs[0].baseline
  # Starts empty: the first text run adds its own baseline in the loop below, so every line's
  # median is computed over exactly one baseline per (unmerged) text run.
  current_baselines = []
  # True if the last entry of current_line is already a private copy that may be modified.
  last_is_copy = false

  sorted_text_runs.each do |text_run|
    last = current_line[-1]
    if last && last.bottom == text_run.bottom && last.top == text_run.top &&
        text_run.left - last.right < 1
      # Combine text runs that share exactly the same height and are next to each other. This
      # avoids potentially garbled output because if two text parts are above each other but
      # end up on the same line, the text runs would be mixed up (think: centered table
      # header where some cells contain two lines).
      unless last_is_copy
        # Merge into a copy so that the caller's text runs (and their strings) stay
        # untouched and can be laid out again.
        last = current_line[-1] = last.dup
        last.string = last.string.dup
        last_is_copy = true
      end
      last.string << text_run.string
      last.right = text_run.right
    elsif (current_baseline - text_run.baseline).abs <= tolerance
      current_line << text_run
      last_is_copy = false
      # The baselines arrive in descending order, so the array stays sorted for median.
      current_baselines << text_run.baseline
      current_baseline = median(current_baselines)
    else
      lines << Line.new(current_line.sort_by!(&:left), current_baseline)
      current_line = [text_run]
      last_is_copy = false
      current_baseline = text_run.baseline
      current_baselines.clear
      current_baselines << current_baseline
    end
  end
  lines << Line.new(current_line.sort_by!(&:left), current_baseline)
end
250
+ private_class_method :group_into_lines
251
+
252
+ # Returns the median value of the given sorted array of numerics.
253
def self.median(sorted_array)
  count = sorted_array.length
  upper_middle = sorted_array[count / 2]
  # Odd number of values: the middle element is the median.
  return upper_middle if count.odd?

  # Even number of values: average the two middle elements (always a Float).
  (sorted_array[count / 2 - 1] + upper_middle) / 2.0
end
257
+ private_class_method :median
258
+
259
+ # Renders an array of TextRun objects representing one line to a single string.
260
+ #
261
+ # +median_glyph_width+:: Is used to determine the column for each text run.
262
+ # +left_margin+:: Is removed from the left side to avoid unnecessary indentation.
263
def self.text_runs_to_string(text_runs, median_glyph_width, left_margin)
  # A gap wider than half a typical glyph is treated as a word boundary.
  min_word_gap = median_glyph_width * 0.5
  # Maps a user space x-coordinate to an output column.
  to_column = ->(x) { ((x - left_margin) / median_glyph_width).round }

  line = +''
  # Column at which the previous text run ended. This may differ from line.size because
  # proportional-width text is projected onto fixed-width columns.
  column = 0
  previous = nil

  text_runs.each do |run|
    start_column = to_column.call(run.left)
    padding = start_column - column

    if padding.positive?
      line << ' ' * padding
      column = start_column
    elsif previous && run.left - previous.right > min_word_gap && line[-1] != ' '
      # The on-page gap is large enough for a space even though the column projection left
      # no room for one.
      column = start_column
      line << ' '
    end

    line << run.string

    # Advance by the number of characters, then snap forward to the run's right edge column.
    # A right edge that is exactly one column ahead is treated as a rounding error and ignored.
    column += run.string.size
    end_column = to_column.call(run.right)
    column = end_column if end_column != column + 1 && end_column > column

    previous = run
  end

  line.rstrip
end
301
+ private_class_method :text_runs_to_string
302
+
303
+ end
304
+ end
305
+ end
@@ -44,6 +44,7 @@ module HexaPDF
44
44
  #
45
45
  # * The Canvas class which provides an interface for drawing graphics and text.
46
46
  # * The Parser and Processor classes for processing an existing content stream.
47
+ # * SmartTextExtractor for extracting layouted text from a page.
47
48
  module Content
48
49
 
49
50
  autoload(:Canvas, 'hexapdf/content/canvas')
@@ -52,6 +53,7 @@ module HexaPDF
52
53
  autoload(:ColorSpace, 'hexapdf/content/color_space')
53
54
  autoload(:Operator, 'hexapdf/content/operator')
54
55
  autoload(:CanvasComposer, 'hexapdf/content/canvas_composer')
56
+ autoload(:SmartTextExtractor, 'hexapdf/content/smart_text_extractor')
55
57
 
56
58
  end
57
59
 
@@ -52,9 +52,6 @@ module HexaPDF
52
52
  # The signing handler is used by default by all methods that need a signing handler. Therefore
53
53
  # it is usually only necessary to provide the actual attribute values.
54
54
  #
55
- # *Note*: Currently only RSA is supported, DSA and ECDSA are not. See the examples below for
56
- # how to handle them using external signing.
57
- #
58
55
  #
59
56
  # == CMS and PAdES Signatures
60
57
  #
@@ -131,17 +128,6 @@ module HexaPDF
131
128
  # document.sign("output.pdf", certificate: my_cert, certificate_chain: my_chain,
132
129
  # external_signing: signing_proc)
133
130
  #
134
- # # Signing with DSA or ECDSA certificate/keys
135
- # signing_proc = lambda do |io, byte_range|
136
- # io.pos = byte_range[0]
137
- # data = io.read(byte_range[1])
138
- # io.pos = byte_range[2]
139
- # data << io.read(byte_range[3])
140
- # OpenSSL::PKCS7.sign(certificate, key, data, certificate_chain,
141
- # OpenSSL::PKCS7::DETACHED | OpenSSL::PKCS7::BINARY).to_der
142
- # end
143
- # document.sign("output.pdf", signature_size: 10_000, external_signing: signing_proc)
144
- #
145
131
  #
146
132
  # == Implementing a Signing Handler
147
133
  #
@@ -277,7 +263,7 @@ module HexaPDF
277
263
  # If a custom size is set using #signature_size=, it is used. Otherwise the size is determined
278
264
  # by using #sign to sign an empty string.
279
265
  def signature_size
280
- @signature_size || sign(StringIO.new, [0, 0, 0, 0]).size
266
+ @signature_size || sign(StringIO.new, [0, 0, 0, 0]).size + 5
281
267
  end
282
268
 
283
269
  # Finalizes the signature field as well as the signature dictionary before writing.
@@ -121,7 +121,7 @@ module HexaPDF
121
121
  private
122
122
 
123
123
  # Creates the set of signed attributes for the signer information structure.
124
- def create_signed_attrs(data, signing_time: true)
124
+ def create_signed_attrs(data, ess_cert_hash: 'sha256', signing_time: true)
125
125
  signing_time = (self.signing_time || Time.now).utc if signing_time
126
126
  set(
127
127
  attribute('content-type', oid('id-data')),
@@ -132,12 +132,13 @@ module HexaPDF
132
132
  ),
133
133
  attribute(
134
134
  'id-aa-signingCertificateV2',
135
- sequence( # SigningCertificateV2
135
+ sequence( # SigningCertificateV2, see RFC5035
136
136
  sequence( # Seq of ESSCertIDv2
137
137
  sequence( # ESSCertIDv2
138
- #TODO: Does not validate on ETSI checker if used, doesn't matter if SHA256 or 512
139
- #oid('sha512'),
140
- binary(OpenSSL::Digest.digest('sha256', @certificate.to_der)), # certHash
138
+ (sequence( # AlgorithmIdentifier RFC3280 4.1.1.2
139
+ oid(ess_cert_hash) # algorithm
140
+ ) unless ess_cert_hash == 'sha256'),
141
+ binary(OpenSSL::Digest.digest(ess_cert_hash, @certificate.to_der)), # certHash
141
142
  sequence( # issuerSerial
142
143
  sequence( # issuer
143
144
  implicit(4, sequence(@certificate.issuer)) # choice 4 directoryName
@@ -184,13 +185,19 @@ module HexaPDF
184
185
  # Creates a signer information structure containing the actual meat of the whole CMS object.
185
186
  def create_signer_info(signature, signed_attrs, unsigned_attrs = nil)
186
187
  certificate_pkey_algorithm = @certificate.public_key.oid
187
- signature_algorithm = if certificate_pkey_algorithm == 'rsaEncryption'
188
+ signature_algorithm = case certificate_pkey_algorithm
189
+ when 'rsaEncryption'
188
190
  sequence( # signatureAlgorithm
189
191
  oid('rsaEncryption'), # algorithmID
190
192
  null # params
191
193
  )
192
- else
193
- raise HexaPDF::Error, "Unsupported key type/signature algorithm"
194
+ when 'DSA'
195
+ unless @digest_algorithm == 'sha256'
196
+ raise HexaPDF::Error, "Only SHA256 supported with DSA"
197
+ end
198
+ sequence(oid('id-dsa-with-sha256'), null)
199
+ when 'id-ecPublicKey'
200
+ sequence(oid("ecdsa-with-#{@digest_algorithm.upcase}"), null)
194
201
  end
195
202
 
196
203
  sequence(
@@ -273,6 +280,12 @@ module HexaPDF
273
280
  'sha384' => '2.16.840.1.101.3.4.2.2',
274
281
  'sha512' => '2.16.840.1.101.3.4.2.3',
275
282
  'rsaEncryption' => '1.2.840.113549.1.1.1',
283
+ 'id-dsa-with-sha1' => '1.2.840.10040.4.3',
284
+ 'id-dsa-with-sha256' => '2.16.840.1.101.3.4.3.2',
285
+ 'ecdsa-with-SHA1' => '1.2.840.10045.4.1',
286
+ 'ecdsa-with-SHA256' => '1.2.840.10045.4.3.2',
287
+ 'ecdsa-with-SHA384' => '1.2.840.10045.4.3.3',
288
+ 'ecdsa-with-SHA512' => '1.2.840.10045.4.3.4',
276
289
  'id-aa-signingCertificate' => '1.2.840.113549.1.9.16.2.12',
277
290
  'id-aa-timeStampToken' => '1.2.840.113549.1.9.16.2.14',
278
291
  'id-aa-signingCertificateV2' => '1.2.840.113549.1.9.16.2.47',
@@ -394,11 +394,12 @@ module HexaPDF
394
394
  # :call-seq:
395
395
  # document.unwrap(obj) -> unwrapped_obj
396
396
  #
397
- # Recursively unwraps the object to get native Ruby objects (i.e. Hash, Array, Integer, ...
398
- # instead of HexaPDF::Reference and HexaPDF::Object).
397
+ # Recursively unwraps the object to get native Ruby objects (i.e. Hash, Array, Integer, ...)
398
+ # instead of HexaPDF::Reference and HexaPDF::Object. Only HexaPDF::Stream objects are retained
399
+ # as they are not representable by native Ruby objects.
399
400
  def unwrap(object, seen = {})
400
401
  object = deref(object)
401
- object = object.data if object.kind_of?(HexaPDF::Object)
402
+ object = object.data if object.kind_of?(HexaPDF::Object) && !object.kind_of?(HexaPDF::Stream)
402
403
  if seen.key?(object)
403
404
  raise HexaPDF::Error, "Can't unwrap a recursive structure"
404
405
  end
@@ -413,6 +414,8 @@ module HexaPDF
413
414
  when HexaPDF::PDFData
414
415
  seen[object] = true
415
416
  unwrap(object.value, seen.dup)
417
+ when HexaPDF::Stream
418
+ object
416
419
  else
417
420
  object
418
421
  end
@@ -790,6 +793,7 @@ module HexaPDF
790
793
  if @metadata
791
794
  metadata.modification_date(Time.now)
792
795
  else
796
+ trailer.delete(:Info) unless trailer.info.kind_of?(HexaPDF::Dictionary)
793
797
  trailer.info[:ModDate] = Time.now
794
798
  end
795
799
  end
@@ -363,7 +363,9 @@ module HexaPDF
363
363
  raise(HexaPDF::UnsupportedEncryptionError,
364
364
  "Invalid key length #{key_length} specified")
365
365
  end
366
- dict[:Length] = key_length if dict[:V] == 4 || dict[:V] == 2
366
+ # /Length should only be set for V=2 as per the spec. However, software like Adobe Reader
367
+ # fails if this is not set for V=5 or V=4.
368
+ dict[:Length] = key_length if dict[:V] == 5 || dict[:V] == 4 || dict[:V] == 2
367
369
 
368
370
  if ![:aes, :arc4].include?(algorithm)
369
371
  raise(HexaPDF::UnsupportedEncryptionError,
@@ -0,0 +1,88 @@
1
+ # -*- encoding: utf-8; frozen_string_literal: true -*-
2
+ #
3
+ #--
4
+ # This file is part of HexaPDF.
5
+ #
6
+ # HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
7
+ # Copyright (C) 2014-2025 Thomas Leitner
8
+ #
9
+ # HexaPDF is free software: you can redistribute it and/or modify it
10
+ # under the terms of the GNU Affero General Public License version 3 as
11
+ # published by the Free Software Foundation with the addition of the
12
+ # following permission added to Section 15 as permitted in Section 7(a):
13
+ # FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
14
+ # THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
15
+ # INFRINGEMENT OF THIRD PARTY RIGHTS.
16
+ #
17
+ # HexaPDF is distributed in the hope that it will be useful, but WITHOUT
18
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
20
+ # License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Affero General Public License
23
+ # along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
24
+ #
25
+ # The interactive user interfaces in modified source and object code
26
+ # versions of HexaPDF must display Appropriate Legal Notices, as required
27
+ # under Section 5 of the GNU Affero General Public License version 3.
28
+ #
29
+ # In accordance with Section 7(b) of the GNU Affero General Public
30
+ # License, a covered work must retain the producer line in every PDF that
31
+ # is created or manipulated using HexaPDF.
32
+ #
33
+ # If the GNU Affero General Public License doesn't fit your need,
34
+ # commercial licenses are available at <https://gettalong.at/hexapdf/>.
35
+ #++
36
+
37
+ require 'fiber'
38
+ require 'brotli'
39
+ require 'hexapdf/filter/predictor'
40
+ require 'hexapdf/configuration'
41
+
42
+ module HexaPDF
43
+ module Filter
44
+
45
+ # Implements the Brotli filter using the brotli library which must be installed manually.
46
+ #
47
+ # The BrotliDecode specification is not yet available as a standard but will be in the near
48
+ # future. Therefore it is recommended to wait before using it for encoding streams until most of the
49
+ # PDF ecosystem has support for it.
50
+ #
51
+ # See: HexaPDF::Filter
52
module BrotliDecode

  # See HexaPDF::Filter
  #
  # The whole source is read into one string and inflated with a single Brotli.inflate call
  # inside the returned fiber; an empty source is passed through unchanged. If +options+
  # contains a :Predictor entry, the fiber is wrapped by Predictor.decoder.
  def self.decoder(source, options = nil)
    inflater = Fiber.new do
      compressed = Filter.string_from_source(source)
      if compressed.empty?
        compressed
      else
        Brotli.inflate(compressed)
      end
    end
    return inflater unless options && options[:Predictor]

    Predictor.decoder(inflater, options)
  end

  # See HexaPDF::Filter
  #
  # Like ::decoder this works on the whole source at once. The compression quality is read
  # from the 'filter.brotli.compression' global configuration option. If +options+ contains a
  # :Predictor entry, the source is first wrapped by Predictor.encoder.
  def self.encoder(source, options = nil)
    source = Predictor.encoder(source, options) if options && options[:Predictor]

    Fiber.new do
      uncompressed = Filter.string_from_source(source)
      Brotli.deflate(uncompressed,
                     quality: HexaPDF::GlobalConfiguration['filter.brotli.compression'])
    end
  end

end
86
+
87
+ end
88
+ end
@@ -134,6 +134,7 @@ module HexaPDF
134
134
  autoload(:FlateDecode, 'hexapdf/filter/flate_decode')
135
135
  autoload(:LZWDecode, 'hexapdf/filter/lzw_decode')
136
136
  autoload(:RunLengthDecode, 'hexapdf/filter/run_length_decode')
137
+ autoload(:BrotliDecode, 'hexapdf/filter/brotli_decode')
137
138
 
138
139
  autoload(:Predictor, 'hexapdf/filter/predictor')
139
140
 
@@ -143,10 +143,13 @@ module HexaPDF
143
143
  # An error is raised if the string contains invalid bytes.
144
144
  def read_codes(string)
145
145
  codes = []
146
- bytes = string.each_byte
146
+ bytes = string.bytes
147
+ length = bytes.length
148
+ i = 0
147
149
 
148
- loop do
149
- byte = bytes.next
150
+ while i < length
151
+ byte = bytes[i]
152
+ i += 1
150
153
  code = 0
151
154
 
152
155
  found = @codespace_ranges.any? do |first_byte_range, rest_ranges|
@@ -154,9 +157,10 @@ module HexaPDF
154
157
 
155
158
  code = (code << 8) + byte
156
159
  valid = rest_ranges.all? do |range|
157
- begin
158
- byte = bytes.next
159
- rescue StopIteration
160
+ if i < length
161
+ byte = bytes[i]
162
+ i += 1
163
+ else
160
164
  raise HexaPDF::Error, "Missing bytes while reading codes via CMap"
161
165
  end
162
166
  code = (code << 8) + byte
@@ -48,7 +48,7 @@ module HexaPDF
48
48
  entry_selector = tables.length.bit_length - 1
49
49
  range_shift = tables.length * 16 - search_range
50
50
 
51
- font_data = "\x0\x1\x0\x0".b +
51
+ font_data = (tables.key?('glyf') ? "\x0\x1\x0\x0" : "OTTO").b +
52
52
  [tables.length, search_range, entry_selector, range_shift].pack('n4')
53
53
 
54
54
  offset = font_data.length + tables.length * 16
@@ -35,6 +35,7 @@
35
35
  #++
36
36
 
37
37
  require 'hexapdf/font/true_type/table'
38
+ require 'hexapdf/font/true_type/builder'
38
39
  require 'set'
39
40
 
40
41
  module HexaPDF
@@ -84,6 +85,18 @@ module HexaPDF
84
85
  @tables = {}
85
86
  end
86
87
 
88
+ # Uses Builder to build a font file for this font.
89
+ #
90
+ # The +table_overrides+ argument can be used to supply mappings from table names (in string
91
+ # form) to raw table data that should override the respective font's tables.
92
def build(table_overrides = {})
  # Start with the raw data of every table listed in the font's directory ...
  font_tables = directory.table_names.to_h {|name| [name, self[name.to_sym].raw_data] }
  # ... and let the supplied overrides replace or extend those entries.
  Builder.build(font_tables.merge!(table_overrides))
end
99
+
87
100
  # Returns the table instance for the given tag (a symbol), or +nil+ if no such table exists.
88
101
  def [](tag)
89
102
  return @tables[tag] if @tables.key?(tag)
@@ -176,9 +176,14 @@ module HexaPDF
176
176
  # Adds the components of compound glyphs to the subset.
177
177
  def add_glyph_components
178
178
  glyf = @font[:glyf]
179
+ process_glyph_components = lambda do |gid|
180
+ glyf[gid].components&.each do |cgid|
181
+ use_glyph(cgid)
182
+ process_glyph_components.call(cgid) if glyf[cgid].compound?
183
+ end
184
+ end
179
185
  @glyph_map.keys.each do |gid|
180
- next if gid.kind_of?(Symbol)
181
- glyf[gid].components&.each {|cgid| use_glyph(cgid) }
186
+ process_glyph_components.call(gid) unless gid.kind_of?(Symbol)
182
187
  end
183
188
  end
184
189
 
@@ -69,6 +69,11 @@ module HexaPDF
69
69
  @tables[tag]
70
70
  end
71
71
 
72
+ # Returns an array with all the table names (in string form) in the directory.
73
def table_names
  # A fresh array is returned, so callers cannot modify the directory's table mapping.
  @tables.each_key.to_a
end
76
+
72
77
  private
73
78
 
74
79
  def load_from_io #:nodoc:
@@ -49,6 +49,7 @@ module HexaPDF
49
49
  autoload(:Font, 'hexapdf/font/true_type/font')
50
50
  autoload(:Subsetter, 'hexapdf/font/true_type/subsetter')
51
51
  autoload(:Optimizer, 'hexapdf/font/true_type/optimizer')
52
+ autoload(:Builder, 'hexapdf/font/true_type/builder')
52
53
 
53
54
  end
54
55