hexapdf 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +28 -0
  3. data/examples/032-acro_form_list_and_fill.rb +47 -0
  4. data/examples/033-text_extraction.rb +34 -0
  5. data/lib/hexapdf/cli/info.rb +2 -0
  6. data/lib/hexapdf/configuration.rb +8 -0
  7. data/lib/hexapdf/content/canvas.rb +1 -1
  8. data/lib/hexapdf/content/smart_text_extractor.rb +305 -0
  9. data/lib/hexapdf/content.rb +2 -0
  10. data/lib/hexapdf/digital_signature/signing/default_handler.rb +1 -15
  11. data/lib/hexapdf/digital_signature/signing/signed_data_creator.rb +21 -8
  12. data/lib/hexapdf/document.rb +7 -3
  13. data/lib/hexapdf/filter/brotli_decode.rb +88 -0
  14. data/lib/hexapdf/filter.rb +1 -0
  15. data/lib/hexapdf/font/true_type/builder.rb +1 -1
  16. data/lib/hexapdf/font/true_type/font.rb +13 -0
  17. data/lib/hexapdf/font/true_type/subsetter.rb +7 -2
  18. data/lib/hexapdf/font/true_type/table/directory.rb +5 -0
  19. data/lib/hexapdf/font/true_type.rb +1 -0
  20. data/lib/hexapdf/layout/style.rb +6 -2
  21. data/lib/hexapdf/task/pdfa.rb +108 -1
  22. data/lib/hexapdf/type/acro_form/form.rb +4 -0
  23. data/lib/hexapdf/type/acro_form/text_field.rb +4 -2
  24. data/lib/hexapdf/type/annotations/widget.rb +9 -0
  25. data/lib/hexapdf/type/document_security_store.rb +80 -0
  26. data/lib/hexapdf/type/page.rb +11 -0
  27. data/lib/hexapdf/type.rb +1 -0
  28. data/lib/hexapdf/version.rb +1 -1
  29. data/test/data/pdfa/mismatching_glyph_widths_cidfont_type2.pdf +0 -0
  30. data/test/hexapdf/content/test_smart_text_extractor.rb +129 -0
  31. data/test/hexapdf/digital_signature/common.rb +19 -5
  32. data/test/hexapdf/digital_signature/signing/test_signed_data_creator.rb +29 -4
  33. data/test/hexapdf/digital_signature/test_signatures.rb +3 -3
  34. data/test/hexapdf/filter/test_brotli_decode.rb +34 -0
  35. data/test/hexapdf/font/true_type/table/test_directory.rb +5 -3
  36. data/test/hexapdf/font/true_type/test_builder.rb +9 -0
  37. data/test/hexapdf/font/true_type/test_font.rb +17 -3
  38. data/test/hexapdf/font/true_type/test_subsetter.rb +4 -3
  39. data/test/hexapdf/task/test_pdfa.rb +72 -0
  40. data/test/hexapdf/test_document.rb +13 -0
  41. data/test/hexapdf/type/acro_form/test_form.rb +6 -0
  42. data/test/hexapdf/type/acro_form/test_text_field.rb +7 -1
  43. data/test/hexapdf/type/annotations/test_widget.rb +11 -0
  44. data/test/hexapdf/type/test_page.rb +8 -0
  45. metadata +25 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 35bbb5d1780d07ecf6098cc40359ff2cc02cd89231a124b6ff1a0a13c760d116
4
- data.tar.gz: 8664f2ac8a6651ee83e7292d005ea10d89b7ea738de47cc62dbf219f4eae0cb4
3
+ metadata.gz: 04f2a87f1aaa95513275432d718996b7d598fc15e476f6999f6b6fe9f29cd0f8
4
+ data.tar.gz: 539d2b0e984db4ca4095bf0aad5208fbbdff5a08acc80d270a6b1c824f12c87e
5
5
  SHA512:
6
- metadata.gz: 232aefc90eb4f9f9a913d27affa95a0c9eff43a72e04eeb1adc0fbe11e865033c6fd0b7779930b15a982afdd909d6ffa98640db6db668f95ce0c26332749cfae
7
- data.tar.gz: e1b836a23d58e92ceb70f5b892d023edcf585288583f2254d35394688204bfdbf4401edea6562a96d1583a71a302d8d50e8a175262ff5077a3b4a2200ec922a4
6
+ metadata.gz: c35f8b0267ef60c6392ae99d8c001e4d6b5e18ea1f5a62132d44bbf865d52cc8a9b08436e107c35a01d3a6edaeb7be9bcede20931a255416d4ea4d07778f8fc0
7
+ data.tar.gz: bfdedefe99c534d62b11f406b447902ea6824758153448ebfba35d0e456850134ba36a6cb2c97d668983e8a5b5b96bf0ab0a03c6136f2478a7717a0e7bb0933b
data/CHANGELOG.md CHANGED
@@ -1,3 +1,31 @@
1
+ ## 1.7.0 - 2026-04-13
2
+
3
+ ### Added
4
+
5
+ * Smart text extraction for retrieving layouted text from pages
6
+ * Support for digitally signing with ECDSA keys
7
+ * Support for digitally signing with DSA keys
8
+ * Support for BrotliDecode filter
9
+ * [HexaPDF::Type::DocumentSecurityStore] and
10
+ [HexaPDF::Type::ValidationRelatedInformation]
11
+
12
+ ### Changed
13
+
14
+ * **Breaking change**: [HexaPDF::Document#unwrap] to not unwrap streams
15
+ * Automatic detection of digital signature size to account for small deviations
16
+ * [HexaPDF::Type::AcroForm::Form#fill] to ignore password fields
17
+ * [HexaPDF::Type::AcroForm::TextField] validation to convert invalid Symbol
18
+ values to String
19
+ * [HexaPDF::Type::Annotations::Widget] validation to also validate a widget as a
20
+ field if necessary
21
+ * PDF/A task to include a fix for mismatching glyph widths for Type 2 CID fonts
22
+
23
+ ### Fixed
24
+
25
+ * Writing of PDF documents with an invalid value for the /Info dictionary
26
+ * Subsetting of TrueType fonts in case compound glyphs are themselves compound
27
+
28
+
1
29
  ## 1.6.0 - 2026-02-10
2
30
 
3
31
  ### Added
@@ -0,0 +1,47 @@
1
+ # # PDF Forms - List and fill fields
2
+ #
3
+ # This example shows how to list the form fields of an interactive PDF form and
4
+ # how to fill out the form.
5
+ #
6
+ # The output file from the [PDF forms](acro_form.html) example can be used as
7
+ # input.
8
+ #
9
+ # One way to list and fill a PDF form is to use the [HexaPDF CLI with the 'form'
10
+ # command](/documentation/hexapdf.1.html#form). Here, however, we are doing it
11
+ # with the HexaPDF API.
12
+ #
13
+ # Usage:
14
+ # : `ruby acro_form_list_and_fill.rb [INPUT.PDF]`
15
+ #
16
+
17
+ require 'base64'
18
+ require 'hexapdf'
19
+
20
+ doc = HexaPDF::Document.open(ARGV[0] || 'acro_form.pdf')
21
+ exit unless doc.acro_form
22
+
23
+ puts "Listing all form fields:"
24
+ doc.acro_form.each_field do |field|
25
+ puts "#{field.full_field_name} (#{field.concrete_field_type})"
26
+ end
27
+
28
+ # We are using this to generate some values for existing text fields. In the
29
+ # real world one would be getting the values from the user.
30
+ puts "\nFilling in the text fields with random values:"
31
+ values = {}
32
+ doc.acro_form.each_field do |field|
33
+ next unless field.field_type == :Tx
34
+ value = Base64.encode64(field.full_field_name).strip
35
+ value = if field.key?(:MaxLen)
36
+ value[0, field[:MaxLen]]
37
+ else
38
+ "Value #{field.field_type} #{value}"
39
+ end
40
+ values[field.full_field_name] = value
41
+ puts "#{field.full_field_name}: #{value}"
42
+ end
43
+
44
+ # Now actually fill out the form with the values
45
+ doc.acro_form.fill(values)
46
+
47
+ doc.write('acro_form_list_and_fill.pdf', optimize: true)
@@ -0,0 +1,34 @@
1
+ # # Text Extraction
2
+ #
3
+ # This example shows how to extract layouted text from a page.
4
+ #
5
+ # It uses the provided input PDF or creates a small sample PDF as input. Then it
6
+ # extracts the text for each page and creates new pages with the extracted text
7
+ # in a fixed-width font.
8
+ #
9
+ # Usage:
10
+ # : `ruby text_extraction.rb [INPUT.PDF]`
11
+ #
12
+
13
+ require 'hexapdf'
14
+
15
+ # Use the input PDF or create a sample PDF.
16
+ if ARGV.length > 0
17
+ doc = HexaPDF::Document.open(ARGV[0])
18
+ else
19
+ composer = HexaPDF::Composer.new do |pdf|
20
+ pdf.lorem_ipsum(count: 3, padding: [0, 0, 20])
21
+ pdf.lorem_ipsum(padding: [0, 50, 20], text_indent: 40)
22
+ pdf.lorem_ipsum(count: 2)
23
+ end
24
+ doc = composer.document
25
+ end
26
+
27
+ # Extract the text of the existing pages and add new pages with the extracted text
28
+ doc.pages.count.times do |index|
29
+ text = doc.pages[index].extract_text
30
+ doc.pages.add.canvas.font('/usr/share/fonts/truetype/freefont/FreeMono.ttf', size: 6).
31
+ text(text, at: [10, 820])
32
+ end
33
+
34
+ doc.write('text_extraction.pdf', optimize: true)
@@ -137,6 +137,8 @@ module HexaPDF
137
137
  end
138
138
  elsif doc.encrypted?
139
139
  output_line("Encrypted", "yes (no or wrong password given)")
140
+ else
141
+ output_line("Encrypted", "no")
140
142
  end
141
143
 
142
144
  if doc.revisions.parser.linearized?
@@ -559,6 +559,7 @@ module HexaPDF
559
559
  JPXDecode: 'HexaPDF::Filter::PassThrough',
560
560
  Crypt: 'HexaPDF::Filter::Crypt',
561
561
  Encryption: 'HexaPDF::Filter::Encryption',
562
+ BrotliDecode: 'HexaPDF::Filter::BrotliDecode',
562
563
  },
563
564
  'font.default' => 'Times',
564
565
  'font.fallback' => ['ZapfDingbats', 'Symbol'],
@@ -636,6 +637,11 @@ module HexaPDF
636
637
  #
637
638
  # See PDF2.0 s8.6
638
639
  #
640
+ # filter.brotli.compression::
641
+ # Specifies the compression level that should be used with the BrotliDecode filter. The level
642
+ # can range from 0 (no compression), 1 (best speed) to 11 (best compression). The default
643
+ # value is 8 which is a good compromise between speed and resulting size.
644
+ #
639
645
  # filter.flate.compression::
640
646
  # Specifies the compression level that should be used with the FlateDecode filter. The level
641
647
  # can range from 0 (no compression), 1 (best speed) to 9 (best compression, default).
@@ -754,6 +760,8 @@ module HexaPDF
754
760
  MCR: 'HexaPDF::Type::MarkedContentReference',
755
761
  OBJR: 'HexaPDF::Type::ObjectReference',
756
762
  Measure: 'HexaPDF::Type::Measure',
763
+ DSS: 'HexaPDF::Type::DocumentSecurityStore',
764
+ VRI: 'HexaPDF::Type::DocumentSecurityStore::ValidationRelatedInformation',
757
765
  },
758
766
  'object.subtype_map' => {
759
767
  nil => {
@@ -895,7 +895,7 @@ module HexaPDF
895
895
  #
896
896
  # * Any other string is treated as a color name. HexaPDF supports CSS Color Module Level 3
897
897
  # color names (see https://www.w3.org/TR/css-color-3/#svg-color) as well as HexaPDF design
898
- # colors.
898
+ # colors. See ColorSpace::COLOR_NAMES for the list of supported names.
899
899
  #
900
900
  # * Four numeric arguments specify a CMYK color (see ColorSpace::DeviceCMYK::Color).
901
901
  #
@@ -0,0 +1,305 @@
1
+ # -*- encoding: utf-8; frozen_string_literal: true -*-
2
+ #
3
+ #--
4
+ # This file is part of HexaPDF.
5
+ #
6
+ # HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
7
+ # Copyright (C) 2014-2025 Thomas Leitner
8
+ #
9
+ # HexaPDF is free software: you can redistribute it and/or modify it
10
+ # under the terms of the GNU Affero General Public License version 3 as
11
+ # published by the Free Software Foundation with the addition of the
12
+ # following permission added to Section 15 as permitted in Section 7(a):
13
+ # FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
14
+ # THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
15
+ # INFRINGEMENT OF THIRD PARTY RIGHTS.
16
+ #
17
+ # HexaPDF is distributed in the hope that it will be useful, but WITHOUT
18
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
20
+ # License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Affero General Public License
23
+ # along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
24
+ #
25
+ # The interactive user interfaces in modified source and object code
26
+ # versions of HexaPDF must display Appropriate Legal Notices, as required
27
+ # under Section 5 of the GNU Affero General Public License version 3.
28
+ #
29
+ # In accordance with Section 7(b) of the GNU Affero General Public
30
+ # License, a covered work must retain the producer line in every PDF that
31
+ # is created or manipulated using HexaPDF.
32
+ #
33
+ # If the GNU Affero General Public License doesn't fit your need,
34
+ # commercial licenses are available at <https://gettalong.at/hexapdf/>.
35
+ #++
36
+
37
+ module HexaPDF
38
+ module Content
39
+
40
+ # This module converts the glyphs on a page to a single text string while preserving the layout.
41
+ #
42
+ # The general algorithm is:
43
+ #
44
+ # 1. Collect all individual glyphs with their user space coordinates in
45
+ # TextRunCollector::TextRun objects.
46
+ #
47
+ # 2. Sort text runs top to bottom and then left to right.
48
+ #
49
+ # 3. Group those text runs into lines based on a "baseline" while also combining neighboring
50
+ # text runs into larger runs.
51
+ #
52
+ # 4. Render each line into a string by taking into account the page size and the median glyph
53
+ # width for a text run to column mapping.
54
+ #
55
+ # 5. Add blank lines between text lines based on the page's normal line spacing.
56
+ module SmartTextExtractor
57
+
58
+ # This module provides the functionality for collecting the necessary TextRun instances for
59
+ # layouting the text.
60
+ #
61
+ # To use this module include it in a processor class. Then invoke the #collect_text_runs
62
+ # method in the #show_text and #show_text_with_positioning methods.
63
+ #
64
+ # Example:
65
+ #
66
+ # class CustomProcessor < HexaPDF::Content::Processor
67
+ # include TextRunCollector
68
+ #
69
+ # def show_text(str)
70
+ # collect_text_runs(decode_text_with_positioning(str))
71
+ # end
72
+ # alias show_text_with_positioning show_text
73
+ #
74
+ # end
75
+ #
76
+ # Once the processor has done its job, the collected text runs are available via the
77
+ # #text_runs method. Use them as input for SmartTextExtractor.layout_text_runs.
78
+ module TextRunCollector
79
+
80
+ # Represents a single run of continuous glyphs and their combined bounding box in user
81
+ # space.
82
+ TextRun = Struct.new(:string, :left, :bottom, :right, :top) do
83
+ # The "baseline" is approximated with the bottom of the bounding box.
84
+ #
85
+ # This works because HexaPDF uses a font's bounding box instead of the glyph's bounding
86
+ # box for each glyph. So while differently sized glyphs will have different "baseline"
87
+ # values, this is taken into account in the algorithm in the same way as subscript and
88
+ # superscript.
89
+ #
90
+ # Using this "fake" baseline works well enough and avoids additional calculations.
91
+ def baseline = bottom
92
+
93
+ # The height of the text run's bounding box.
94
+ def height = top - bottom
95
+
96
+ # The width of the text run's bounding box.
97
+ def width = right - left
98
+ end
99
+
100
+ # Array with all collected TextRun instances.
101
+ attr_reader :text_runs
102
+
103
+ def initialize # :nodoc:
104
+ super
105
+ @text_runs = []
106
+ end
107
+
108
+ private
109
+
110
+ # Collects all text runs from the glyphs in the +boxes+ array.
111
+ def collect_text_runs(boxes)
112
+ boxes.each do |box|
113
+ llx, lly, lrx, lry, urx, ury, ulx, uly = *box.points
114
+ x_min, x_max = [llx, lrx, ulx, urx].minmax
115
+ y_min, y_max = [lly, lry, uly, ury].minmax
116
+ @text_runs << TextRun.new(+box.string, x_min, y_min, x_max, y_max)
117
+ end
118
+ end
119
+ end
120
+
121
+ # This processor class is used when layouting the text through
122
+ # HexaPDF::Type::Page#extract_text.
123
+ class TextRunProcessor < HexaPDF::Content::Processor
124
+
125
+ include TextRunCollector
126
+
127
+ def show_text(str)
128
+ collect_text_runs(decode_text_with_positioning(str))
129
+ end
130
+ alias show_text_with_positioning show_text
131
+
132
+ end
133
+
134
+ # Converts an array of TextRun objects into a single string representation, preserving the
135
+ # visual layout.
136
+ #
137
+ # The +page_width+ and +page_height+ arguments specify the width and height of the page from
138
+ # which the text runs were extracted.
139
+ #
140
+ # The remaining keyword arguments can be used to fine-tune the algorithm for one's needs:
141
+ #
142
+ # +line_tolerance_factor+::
143
+ # The tolerance factor is applied to the median text run height to determine the range
144
+ # within which two text runs are considered to be on the same line. This ensures that
145
+ # small differences in the baseline due to, for example, subscript or superscript parts
146
+ # don't result in multiple lines.
147
+ #
148
+ # The factor should not be too large to avoid forcing separate visual lines into one line
149
+ # but also not too small to avoid subscript/superscript being on separate lines. The
150
+ # default seems to work quite well.
151
+ #
152
+ # +paragraph_distance_threshold+::
153
+ # If the number of normal line spacings between two adjacent baselines is at least this
154
+ # large (but smaller than +large_distance_threshold+), the gap is interpreted as a
155
+ # paragraph break and a single blank line is inserted.
156
+ #
157
+ # +large_distance_threshold+::
158
+ # Works like +paragraph_distance_threshold+ and indicates if a number of normal line
159
+ # spacings is too large for being a paragraph break. A proportional number of blank lines
160
+ # is inserted in this case.
161
+ #
162
+ # This is used to represent large parts with non-text content like images.
163
+ def self.layout_text_runs(text_runs, page_width, page_height,
164
+ line_tolerance_factor: 0.4, paragraph_distance_threshold: 1.35,
165
+ large_distance_threshold: 3.0)
166
+ return '' if text_runs.empty?
167
+
168
+ # Use the median height of all text runs as an approximation of the main font size used on
169
+ # the page. The line tolerance uses a hard floor for small fonts.
170
+ median_height = median(text_runs.map(&:height).sort)
171
+ line_tolerance = [median_height * line_tolerance_factor, 2].max
172
+
173
+ # Group the text runs into lines which are sorted top to bottom. Text runs are pre-sorted by
174
+ # baseline from top to bottom and left to right (the latter is done so that consecutive text
175
+ # runs can be combined).
176
+ sorted = text_runs.sort_by {|run| [-run.baseline, run.left] }
177
+ lines = group_into_lines(sorted, line_tolerance)
178
+
179
+ # Calculate the normal line spacing, excluding anything too small/big.
180
+ line_distances = lines.map {|l| l.baseline }.each_cons(2).map {|a, b| a - b }.
181
+ select {|d| d >= median_height * 0.5 && d <= median_height * 2 }.sort
182
+ normal_line_spacing = line_distances.empty? ? median_height * 1.2 : median(line_distances)
183
+
184
+ # Convert the lines into actual text strings. Blank lines are inserted between the lines
185
+ # based on the normal line spacing.
186
+ output_lines = []
187
+ left_margin = lines.map {|line| line.text_runs[0].left }.min
188
+ glyph_widths = lines.flat_map do |line|
189
+ line.text_runs.flat_map {|run| [run.width.to_f / run.string.length] * run.string.length }
190
+ end.sort
191
+ median_glyph_width = median(glyph_widths)
192
+
193
+ lines.each_with_index do |line, index|
194
+ output_lines << text_runs_to_string(line.text_runs, median_glyph_width, left_margin)
195
+ next if index == lines.length - 1
196
+
197
+ # Add blank lines as needed.
198
+ ratio = (line.baseline - lines[index + 1].baseline) / normal_line_spacing
199
+ if ratio >= large_distance_threshold
200
+ # Subtract 1 because the newline after the output line already counts as one
201
+ # newline. Also cap at a maximum of 40 to avoid huge gaps.
202
+ [ratio.round - 1, 40].min.times { output_lines << '' }
203
+ elsif ratio >= paragraph_distance_threshold
204
+ output_lines << ''
205
+ end
206
+ end
207
+
208
+ output_lines.join("\n")
209
+ end
210
+
211
+ # Holds an array of TextRun objects and their median baseline.
212
+ Line = Struct.new(:text_runs, :baseline)
213
+
214
+ # Groups a sorted list of TextRuns (sorted by baseline, then left) into lines.
215
+ #
216
+ # Since the text_runs are already sorted, a single run through +sorted_text_runs+ is
217
+ # sufficient. A new line is created if a text run's baseline differs by more than +tolerance+
218
+ # from the current line's (median) baseline.
219
+ #
220
+ # The result is a list of Line objects with their contents sorted left to right.
221
+ def self.group_into_lines(sorted_text_runs, tolerance)
222
+ lines = []
223
+ current_line = []
224
+ current_baseline = sorted_text_runs[0].baseline
225
+ current_baselines = [current_baseline]
226
+
227
+ sorted_text_runs.each do |text_run|
228
+ # Try to combine text_runs that share exactly the same height and are next to each
229
+ # other. This avoids potentially garbled output because if two text parts are above each
230
+ # other but end up on the same line, the text runs would be mixed up (think: centered
231
+ # table header where some cells contain two lines).
232
+ if (last = current_line[-1]) && last.bottom == text_run.bottom &&
233
+ last.top == text_run.top && text_run.left - last.right < 1
234
+ last.string << text_run.string
235
+ last.right = text_run.right
236
+ elsif (current_baseline - text_run.baseline).abs <= tolerance
237
+ current_line << text_run
238
+ current_baselines << text_run.baseline
239
+ current_baseline = median(current_baselines)
240
+ else
241
+ lines << Line.new(current_line.sort_by!(&:left), current_baseline)
242
+ current_line = [text_run]
243
+ current_baseline = text_run.baseline
244
+ current_baselines.clear
245
+ current_baselines << current_baseline
246
+ end
247
+ end
248
+ lines << Line.new(current_line.sort_by!(&:left), current_baseline)
249
+ end
250
+ private_class_method :group_into_lines
251
+
252
+ # Returns the median value of the given sorted array of numerics.
253
+ def self.median(sorted_array)
254
+ mid = sorted_array.length / 2
255
+ sorted_array.length.odd? ? sorted_array[mid] : (sorted_array[mid - 1] + sorted_array[mid]) / 2.0
256
+ end
257
+ private_class_method :median
258
+
259
+ # Renders an array of TextRun objects representing one line to a single string.
260
+ #
261
+ # +median_glyph_width+:: Is used to determine the column for each text run.
262
+ # +left_margin+:: Is removed from the left side to avoid unnecessary indentation.
263
+ def self.text_runs_to_string(text_runs, median_glyph_width, left_margin)
264
+ # Minimum gap to classify as a word boundary
265
+ space_threshold = median_glyph_width * 0.5
266
+
267
+ result = +''
268
+ # The column where the last text run ended. Can be different from result.size due to fitting
269
+ # proportional-width fonts to a fixed-column output.
270
+ cursor = 0
271
+
272
+ text_runs.each_with_index do |text_run, index|
273
+ target_col = ((text_run.left - left_margin) / median_glyph_width).round
274
+ advance = target_col - cursor
275
+
276
+ if advance > 0
277
+ result << ' ' * advance
278
+ cursor += advance
279
+ elsif index >= 1 && text_run.left - text_runs[index - 1].right > space_threshold &&
280
+ result[-1] != ' '
281
+ # Force space even if advance <= 0 when the actual spacing between text runs is large
282
+ # enough. This might happen because we are projecting proportional-width fonts to a
283
+ # fixed-column output.
284
+ cursor = target_col
285
+ result << ' '
286
+ end
287
+
288
+ result << text_run.string
289
+
290
+ # Move cursor to the text run's right edge but at least the text run's character count
291
+ # from the current position. This avoids gaps when there is too much difference between
292
+ # the on-page position and the approximated cursor. However, a one column difference is
293
+ # ignored to account for rounding errors.
294
+ cursor += text_run.string.size
295
+ text_run_right_edge_cursor = ((text_run.right - left_margin) / median_glyph_width).round
296
+ cursor = [text_run_right_edge_cursor, cursor].max if text_run_right_edge_cursor != cursor + 1
297
+ end
298
+
299
+ result.rstrip
300
+ end
301
+ private_class_method :text_runs_to_string
302
+
303
+ end
304
+ end
305
+ end
@@ -44,6 +44,7 @@ module HexaPDF
44
44
  #
45
45
  # * The Canvas class which provides an interface for drawing graphics and text.
46
46
  # * The Parser and Processor classes for processing an existing content stream.
47
+ # * SmartTextExtractor for extracting layouted text from a page.
47
48
  module Content
48
49
 
49
50
  autoload(:Canvas, 'hexapdf/content/canvas')
@@ -52,6 +53,7 @@ module HexaPDF
52
53
  autoload(:ColorSpace, 'hexapdf/content/color_space')
53
54
  autoload(:Operator, 'hexapdf/content/operator')
54
55
  autoload(:CanvasComposer, 'hexapdf/content/canvas_composer')
56
+ autoload(:SmartTextExtractor, 'hexapdf/content/smart_text_extractor')
55
57
 
56
58
  end
57
59
 
@@ -52,9 +52,6 @@ module HexaPDF
52
52
  # The signing handler is used by default by all methods that need a signing handler. Therefore
53
53
  # it is usually only necessary to provide the actual attribute values.
54
54
  #
55
- # *Note*: Currently only RSA is supported, DSA and ECDSA are not. See the examples below for
56
- # how to handle them using external signing.
57
- #
58
55
  #
59
56
  # == CMS and PAdES Signatures
60
57
  #
@@ -131,17 +128,6 @@ module HexaPDF
131
128
  # document.sign("output.pdf", certificate: my_cert, certificate_chain: my_chain,
132
129
  # external_signing: signing_proc)
133
130
  #
134
- # # Signing with DSA or ECDSA certificate/keys
135
- # signing_proc = lambda do |io, byte_range|
136
- # io.pos = byte_range[0]
137
- # data = io.read(byte_range[1])
138
- # io.pos = byte_range[2]
139
- # data << io.read(byte_range[3])
140
- # OpenSSL::PKCS7.sign(certificate, key, data, certificate_chain,
141
- # OpenSSL::PKCS7::DETACHED | OpenSSL::PKCS7::BINARY).to_der
142
- # end
143
- # document.sign("output.pdf", signature_size: 10_000, external_signing: signing_proc)
144
- #
145
131
  #
146
132
  # == Implementing a Signing Handler
147
133
  #
@@ -277,7 +263,7 @@ module HexaPDF
277
263
  # If a custom size is set using #signature_size=, it is used. Otherwise the size is determined
278
264
  # by using #sign to sign an empty string.
279
265
  def signature_size
280
- @signature_size || sign(StringIO.new, [0, 0, 0, 0]).size
266
+ @signature_size || sign(StringIO.new, [0, 0, 0, 0]).size + 5
281
267
  end
282
268
 
283
269
  # Finalizes the signature field as well as the signature dictionary before writing.
@@ -121,7 +121,7 @@ module HexaPDF
121
121
  private
122
122
 
123
123
  # Creates the set of signed attributes for the signer information structure.
124
- def create_signed_attrs(data, signing_time: true)
124
+ def create_signed_attrs(data, ess_cert_hash: 'sha256', signing_time: true)
125
125
  signing_time = (self.signing_time || Time.now).utc if signing_time
126
126
  set(
127
127
  attribute('content-type', oid('id-data')),
@@ -132,12 +132,13 @@ module HexaPDF
132
132
  ),
133
133
  attribute(
134
134
  'id-aa-signingCertificateV2',
135
- sequence( # SigningCertificateV2
135
+ sequence( # SigningCertificateV2, see RFC5035
136
136
  sequence( # Seq of ESSCertIDv2
137
137
  sequence( # ESSCertIDv2
138
- #TODO: Does not validate on ETSI checker if used, doesn't matter if SHA256 or 512
139
- #oid('sha512'),
140
- binary(OpenSSL::Digest.digest('sha256', @certificate.to_der)), # certHash
138
+ (sequence( # AlgorithmIdentifier RFC3280 4.1.1.2
139
+ oid(ess_cert_hash) # algorithm
140
+ ) unless ess_cert_hash == 'sha256'),
141
+ binary(OpenSSL::Digest.digest(ess_cert_hash, @certificate.to_der)), # certHash
141
142
  sequence( # issuerSerial
142
143
  sequence( # issuer
143
144
  implicit(4, sequence(@certificate.issuer)) # choice 4 directoryName
@@ -184,13 +185,19 @@ module HexaPDF
184
185
  # Creates a signer information structure containing the actual meat of the whole CMS object.
185
186
  def create_signer_info(signature, signed_attrs, unsigned_attrs = nil)
186
187
  certificate_pkey_algorithm = @certificate.public_key.oid
187
- signature_algorithm = if certificate_pkey_algorithm == 'rsaEncryption'
188
+ signature_algorithm = case certificate_pkey_algorithm
189
+ when 'rsaEncryption'
188
190
  sequence( # signatureAlgorithm
189
191
  oid('rsaEncryption'), # algorithmID
190
192
  null # params
191
193
  )
192
- else
193
- raise HexaPDF::Error, "Unsupported key type/signature algorithm"
194
+ when 'DSA'
195
+ unless @digest_algorithm == 'sha256'
196
+ raise HexaPDF::Error, "Only SHA256 supported with DSA"
197
+ end
198
+ sequence(oid('id-dsa-with-sha256'), null)
199
+ when 'id-ecPublicKey'
200
+ sequence(oid("ecdsa-with-#{@digest_algorithm.upcase}"), null)
194
201
  end
195
202
 
196
203
  sequence(
@@ -273,6 +280,12 @@ module HexaPDF
273
280
  'sha384' => '2.16.840.1.101.3.4.2.2',
274
281
  'sha512' => '2.16.840.1.101.3.4.2.3',
275
282
  'rsaEncryption' => '1.2.840.113549.1.1.1',
283
+ 'id-dsa-with-sha1' => '1.2.840.10040.4.3',
284
+ 'id-dsa-with-sha256' => '2.16.840.1.101.3.4.3.2',
285
+ 'ecdsa-with-SHA1' => '1.2.840.10045.4.1',
286
+ 'ecdsa-with-SHA256' => '1.2.840.10045.4.3.2',
287
+ 'ecdsa-with-SHA384' => '1.2.840.10045.4.3.3',
288
+ 'ecdsa-with-SHA512' => '1.2.840.10045.4.3.4',
276
289
  'id-aa-signingCertificate' => '1.2.840.113549.1.9.16.2.12',
277
290
  'id-aa-timeStampToken' => '1.2.840.113549.1.9.16.2.14',
278
291
  'id-aa-signingCertificateV2' => '1.2.840.113549.1.9.16.2.47',
@@ -394,11 +394,12 @@ module HexaPDF
394
394
  # :call-seq:
395
395
  # document.unwrap(obj) -> unwrapped_obj
396
396
  #
397
- # Recursively unwraps the object to get native Ruby objects (i.e. Hash, Array, Integer, ...
398
- # instead of HexaPDF::Reference and HexaPDF::Object).
397
+ # Recursively unwraps the object to get native Ruby objects (i.e. Hash, Array, Integer, ...)
398
+ # instead of HexaPDF::Reference and HexaPDF::Object. Only HexaPDF::Stream objects are retained
399
+ # as they are not representable by native Ruby objects.
399
400
  def unwrap(object, seen = {})
400
401
  object = deref(object)
401
- object = object.data if object.kind_of?(HexaPDF::Object)
402
+ object = object.data if object.kind_of?(HexaPDF::Object) && !object.kind_of?(HexaPDF::Stream)
402
403
  if seen.key?(object)
403
404
  raise HexaPDF::Error, "Can't unwrap a recursive structure"
404
405
  end
@@ -413,6 +414,8 @@ module HexaPDF
413
414
  when HexaPDF::PDFData
414
415
  seen[object] = true
415
416
  unwrap(object.value, seen.dup)
417
+ when HexaPDF::Stream
418
+ object
416
419
  else
417
420
  object
418
421
  end
@@ -790,6 +793,7 @@ module HexaPDF
790
793
  if @metadata
791
794
  metadata.modification_date(Time.now)
792
795
  else
796
+ trailer.delete(:Info) unless trailer.info.kind_of?(HexaPDF::Dictionary)
793
797
  trailer.info[:ModDate] = Time.now
794
798
  end
795
799
  end