hexapdf 1.5.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +54 -0
  3. data/README.md +8 -7
  4. data/examples/022-outline.rb +5 -1
  5. data/examples/032-acro_form_list_and_fill.rb +47 -0
  6. data/examples/033-text_extraction.rb +34 -0
  7. data/lib/hexapdf/cli/debug_info.rb +98 -0
  8. data/lib/hexapdf/cli/images.rb +2 -2
  9. data/lib/hexapdf/cli/info.rb +2 -0
  10. data/lib/hexapdf/cli/inspect.rb +5 -1
  11. data/lib/hexapdf/cli.rb +2 -0
  12. data/lib/hexapdf/configuration.rb +8 -0
  13. data/lib/hexapdf/content/canvas.rb +1 -1
  14. data/lib/hexapdf/content/smart_text_extractor.rb +305 -0
  15. data/lib/hexapdf/content.rb +2 -0
  16. data/lib/hexapdf/digital_signature/signing/default_handler.rb +1 -15
  17. data/lib/hexapdf/digital_signature/signing/signed_data_creator.rb +21 -8
  18. data/lib/hexapdf/document.rb +7 -3
  19. data/lib/hexapdf/encryption/security_handler.rb +3 -1
  20. data/lib/hexapdf/filter/brotli_decode.rb +88 -0
  21. data/lib/hexapdf/filter.rb +1 -0
  22. data/lib/hexapdf/font/cmap.rb +10 -6
  23. data/lib/hexapdf/font/true_type/builder.rb +1 -1
  24. data/lib/hexapdf/font/true_type/font.rb +13 -0
  25. data/lib/hexapdf/font/true_type/subsetter.rb +7 -2
  26. data/lib/hexapdf/font/true_type/table/directory.rb +5 -0
  27. data/lib/hexapdf/font/true_type.rb +1 -0
  28. data/lib/hexapdf/layout/style.rb +6 -2
  29. data/lib/hexapdf/parser.rb +29 -4
  30. data/lib/hexapdf/revision.rb +6 -2
  31. data/lib/hexapdf/task/pdfa.rb +108 -1
  32. data/lib/hexapdf/type/acro_form/field.rb +4 -1
  33. data/lib/hexapdf/type/acro_form/form.rb +4 -0
  34. data/lib/hexapdf/type/acro_form/text_field.rb +4 -2
  35. data/lib/hexapdf/type/annotations/widget.rb +9 -0
  36. data/lib/hexapdf/type/document_security_store.rb +80 -0
  37. data/lib/hexapdf/type/page.rb +11 -0
  38. data/lib/hexapdf/type.rb +1 -0
  39. data/lib/hexapdf/version.rb +1 -1
  40. data/test/data/pdfa/mismatching_glyph_widths_cidfont_type2.pdf +0 -0
  41. data/test/hexapdf/content/test_smart_text_extractor.rb +129 -0
  42. data/test/hexapdf/digital_signature/common.rb +19 -5
  43. data/test/hexapdf/digital_signature/signing/test_signed_data_creator.rb +29 -4
  44. data/test/hexapdf/digital_signature/test_signatures.rb +3 -3
  45. data/test/hexapdf/encryption/test_security_handler.rb +7 -5
  46. data/test/hexapdf/filter/test_brotli_decode.rb +34 -0
  47. data/test/hexapdf/font/true_type/table/test_directory.rb +5 -3
  48. data/test/hexapdf/font/true_type/test_builder.rb +9 -0
  49. data/test/hexapdf/font/true_type/test_font.rb +17 -3
  50. data/test/hexapdf/font/true_type/test_subsetter.rb +4 -3
  51. data/test/hexapdf/task/test_pdfa.rb +72 -0
  52. data/test/hexapdf/test_document.rb +13 -0
  53. data/test/hexapdf/test_parser.rb +55 -3
  54. data/test/hexapdf/test_revision.rb +27 -6
  55. data/test/hexapdf/type/acro_form/test_field.rb +5 -0
  56. data/test/hexapdf/type/acro_form/test_form.rb +6 -0
  57. data/test/hexapdf/type/acro_form/test_text_field.rb +7 -1
  58. data/test/hexapdf/type/annotations/test_widget.rb +11 -0
  59. data/test/hexapdf/type/test_page.rb +8 -0
  60. data/test/test_helper.rb +6 -0
  61. metadata +41 -4
@@ -211,6 +211,8 @@ module HexaPDF
211
211
  attr_reader :width
212
212
 
213
213
  # The colors of each edge. See Quad.
214
+ #
215
+ # See: HexaPDF::Content::ColorSpace.device_color_from_specification
214
216
  attr_reader :color
215
217
 
216
218
  # The styles of each edge. See Quad.
@@ -897,7 +899,7 @@ module HexaPDF
897
899
  #
898
900
  # The color used for filling (e.g. text), defaults to black.
899
901
  #
900
- # See: HexaPDF::Content::Canvas#fill_color
902
+ # See: HexaPDF::Content::ColorSpace.device_color_from_specification
901
903
  #
902
904
  # Examples:
903
905
  #
@@ -926,7 +928,7 @@ module HexaPDF
926
928
  #
927
929
  # The color used for stroking (e.g. text outlines), defaults to black.
928
930
  #
929
- # See: HexaPDF::Content::Canvas#stroke_color
931
+ # See: HexaPDF::Content::ColorSpace.device_color_from_specification
930
932
  #
931
933
  # Examples:
932
934
  #
@@ -1175,6 +1177,8 @@ module HexaPDF
1175
1177
  #
1176
1178
  # The color used for backgrounds, defaults to +nil+ (i.e. no background).
1177
1179
  #
1180
+ # See: HexaPDF::Content::ColorSpace.device_color_from_specification
1181
+ #
1178
1182
  # Examples:
1179
1183
  #
1180
1184
  # #>pdf-composer100
@@ -112,8 +112,18 @@ module HexaPDF
112
112
  end
113
113
 
114
114
  if xref_entry.oid != 0 && (oid != xref_entry.oid || gen != xref_entry.gen)
115
- raise_malformed("The oid,gen (#{oid},#{gen}) values of the indirect object don't match " \
116
- "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
115
+ msg = "The oid,gen (#{oid},#{gen}) values of the indirect object don't match " \
116
+ "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref"
117
+ # Some invalid PDFs contain entries where the generation number in the xref is different
118
+ # from the one found in the indirect object. If the file were reconstructed the generation
119
+ # number from the indirect object itself would be used.
120
+ # To gracefully handle such invalid PDFs they need to have a single revision.
121
+ # The other code part that handles this is in Revision#object.
122
+ if oid == xref_entry.oid && @document.revisions.count == 1
123
+ maybe_raise(msg, pos: xref_entry.pos)
124
+ else
125
+ raise_malformed(msg)
126
+ end
117
127
  end
118
128
 
119
129
  if obj.kind_of?(Reference)
@@ -209,9 +219,24 @@ module HexaPDF
209
219
  tok = @tokenizer.next_token
210
220
 
211
221
  object[:Length] = length
222
+ if object.key?(:Filter)
223
+ begin
224
+ object[:Filter] = @document.unwrap(object[:Filter])
225
+ rescue HexaPDF::Error
226
+ maybe_raise("Invalid /Filter entry for stream", pos: @tokenizer.pos)
227
+ object.delete(:Filter)
228
+ end
229
+ end
230
+ if object.key?(:DecodeParms)
231
+ begin
232
+ object[:DecodeParms] = @document.unwrap(object[:DecodeParms])
233
+ rescue HexaPDF::Error
234
+ maybe_raise("Invalid /DecodeParms entry for stream", pos: @tokenizer.pos)
235
+ object.delete(:DecodeParms)
236
+ end
237
+ end
212
238
  stream = StreamData.new(@tokenizer.io, offset: pos, length: length,
213
- filter: @document.unwrap(object[:Filter]),
214
- decode_parms: @document.unwrap(object[:DecodeParms]))
239
+ filter: object[:Filter], decode_parms: object[:DecodeParms])
215
240
  end
216
241
 
217
242
  unless tok.kind_of?(Tokenizer::Token) && tok == 'endobj'
@@ -128,6 +128,11 @@ module HexaPDF
128
128
  @objects[oid, gen]
129
129
  elsif (xref_entry = @xref_section[oid, gen])
130
130
  load_object(xref_entry)
131
+ elsif (xref_entry = @xref_section[oid]) && (obj = load_object(xref_entry))&.gen == gen
132
+ # This branch handles invalid PDFs with a single revision containing xref entries where the
133
+ # gen doesn't match the gen of the indirect object. Also see the special handling in
134
+ # Parser#load_object.
135
+ obj
131
136
  else
132
137
  nil
133
138
  end
@@ -219,8 +224,7 @@ module HexaPDF
219
224
  seen = {}
220
225
  @objects.each {|oid, _gen, data| seen[oid] = true; yield(data) }
221
226
  @xref_section.each do |oid, _gen, data|
222
- next if seen.key?(oid)
223
- yield(@objects[oid] || load_object(data))
227
+ yield(@objects[oid] || load_object(data)) unless seen.key?(oid)
224
228
  end
225
229
  @all_objects_loaded = true
226
230
  end
@@ -40,6 +40,7 @@ require 'hexapdf/content/parser'
40
40
  require 'hexapdf/content/operator'
41
41
  require 'hexapdf/type/xref_stream'
42
42
  require 'hexapdf/type/object_stream'
43
+ require 'hexapdf/font/true_type'
43
44
 
44
45
  module HexaPDF
45
46
  module Task
@@ -51,6 +52,13 @@ module HexaPDF
51
52
  # * prevents the Standard 14 PDF fonts from being used.
52
53
  # * adds an appropriate output intent if none is set.
53
54
  # * adds the necessary PDF/A metadata properties.
55
+ #
56
+ # Additionally, it applies fixes to the document so that the structures and content of
57
+ # non-conforming PDFs are corrected. See ::call for more information on the available fixes.
58
+ #
59
+ # Note that you should use a PDF/A validation tool like veraPDF (https://verapdf.org/) to ensure
60
+ # that the resulting files conform to the PDF/A specification because not all documents can be
61
+ # fixed at the moment.
54
62
  module PDFA
55
63
 
56
64
  # Performs the necessary tasks to make the document PDF/A compatible.
@@ -58,7 +66,22 @@ module HexaPDF
58
66
  # +level+::
59
67
  # Specifies the PDF/A conformance level that should be used. Can be one of the following
60
68
  # strings: 2b, 2u, 3b, 3u.
61
- def self.call(doc, level: '3u')
69
+ #
70
+ # +fixes+::
71
+ # Specifies the fixes that should be applied when converting a non-conforming PDF. If a
72
+ # document is created with HexaPDF but also includes parts of loaded documents, this
73
+ # argument hast to be set to +:all+.
74
+ #
75
+ # Can be +:default+ (which is also the default value), +:all+ or an array with one or more
76
+ # fix names.
77
+ #
78
+ # +:default+:: Applies all fixes if the document was loaded from a file. Otherwise applies
79
+ # only those fixes necessary for files created with HexaPDF.
80
+ #
81
+ # +:all+:: Applies all available fixes.
82
+ #
83
+ # +:glyph_widths+:: Corrects mismatching width information in fonts.
84
+ def self.call(doc, level: '3u', fixes: :default)
62
85
  unless level.match?(/\A[23][bu]\z/)
63
86
  raise ArgumentError, "The given PDF/A conformance level '#{level}' is not supported"
64
87
  end
@@ -68,6 +91,15 @@ module HexaPDF
68
91
  doc.metadata.property('pdfaid', 'part', part)
69
92
  doc.metadata.property('pdfaid', 'conformance', conformance.upcase)
70
93
  add_srgb_icc_output_intent(doc) unless doc.catalog.key?(:OutputIntents)
94
+
95
+ fixes = if fixes == :all || (fixes == :default && doc.revisions.parser)
96
+ ALL_FIXES
97
+ elsif fixes == :default
98
+ ALL_FIXES - FIXES_FOR_LOADED_DOCUMENTS
99
+ else
100
+ fixes
101
+ end
102
+ fixes.each {|fix| send(fix, doc) }
71
103
  end
72
104
  end
73
105
 
@@ -81,6 +113,81 @@ module HexaPDF
81
113
  ]
82
114
  end
83
115
 
116
+ ALL_FIXES = [:fix_glyph_widths] # :nodoc:
117
+
118
+ FIXES_FOR_LOADED_DOCUMENTS = [:fix_glyph_widths] # :nodoc:
119
+
120
+ # Makes the glyph widths stored in the embedded fonts the same as the ones specified in the
121
+ # PDF font data structures.
122
+ #
123
+ # Note: Currently only handles Type 2 CIDFonts.
124
+ def self.fix_glyph_widths(doc) # :nodoc:
125
+ # Step 1: Collect all CIDs together with their respective fonts
126
+ processor = CIDCollector.new
127
+ doc.pages.each do |page|
128
+ page.process_contents(processor)
129
+ page.each_annotation do |annotation|
130
+ next unless (appearance = annotation.appearance)
131
+ appearance.process_contents(processor, original_resources: page.resources)
132
+ end
133
+ end
134
+
135
+ # Step 2: Process all found fonts
136
+ processor.map.each do |font_object, all_cids|
137
+ next if all_cids.empty?
138
+ font = HexaPDF::Font::TrueType::Font.new(StringIO.new(font_object.font_file.stream))
139
+ cid_to_gid = cid_to_gid_mapping(font_object)
140
+
141
+ # Process all found CIDs by comparing their width with the ones defined in the font and
142
+ # correcting the font if necessary.
143
+ raw_hmtx = font[:hmtx].raw_data
144
+ width_conversion_factor = 1000.0 / font[:head].units_per_em
145
+ all_cids.each do |cid|
146
+ cid_width = font_object.width(cid)
147
+ gid = cid_to_gid[cid]
148
+ gid_width = font[:hmtx][gid].advance_width * width_conversion_factor
149
+ next if (cid_width - gid_width).abs.round <= 1
150
+ raw_hmtx[4 * gid, 2] = [(cid_width / width_conversion_factor).round].pack('n')
151
+ end
152
+
153
+ font_object.font_file.stream = font.build('hmtx' => raw_hmtx)
154
+ end
155
+ end
156
+
157
+ # Processes the contents of a stream and collects the CIDs for each composite font.
158
+ class CIDCollector < HexaPDF::Content::Processor
159
+
160
+ # The mapping from the composite font's descendant font to the set of used CIDs.
161
+ attr_reader :map
162
+
163
+ def initialize(*) # :nodoc:
164
+ super
165
+ @map = Hash.new {|h, k| h[k] = Set.new }
166
+ end
167
+
168
+ def show_text(data) # :nodoc:
169
+ font = graphics_state.font
170
+ return unless font[:Subtype] == :Type0 && font.descendant_font[:Subtype] == :CIDFontType2
171
+
172
+ Array(data).each do |item|
173
+ next if item.kind_of?(Numeric)
174
+ @map[font.descendant_font].merge(font.decode(item))
175
+ end
176
+ end
177
+ alias show_text_with_positioning show_text
178
+
179
+ end
180
+
181
+ # Returns an object responding to #[] that maps CIDs to GIDs for Type 2 CIDFonts.
182
+ def self.cid_to_gid_mapping(font)
183
+ if font[:CIDToGIDMap] == :Identity
184
+ proc {|cid| cid }
185
+ else
186
+ font[:CIDToGIDMap].stream.unpack('n*')
187
+ end
188
+ end
189
+ private_class_method :cid_to_gid_mapping
190
+
84
191
  end
85
192
 
86
193
  end
@@ -291,7 +291,10 @@ module HexaPDF
291
291
  if embedded_widget?
292
292
  yield(document.wrap(self))
293
293
  elsif terminal_field?
294
- self[:Kids]&.each {|kid| yield(document.wrap(kid)) }
294
+ self[:Kids]&.each do |kid|
295
+ kid = document.wrap(kid)
296
+ yield(kid) if kid.type == :Annot && kid[:Subtype] == :Widget
297
+ end
295
298
  end
296
299
 
297
300
  unless direct_only
@@ -412,6 +412,8 @@ module HexaPDF
412
412
  #
413
413
  # * For radio buttons the value needs to be a String or a Symbol representing the name of
414
414
  # the radio button widget to select.
415
+ #
416
+ # * Values for password fields are ignored as they should not be stored in the PDF.
415
417
  def fill(data)
416
418
  data.each do |field_name, value|
417
419
  field = field_by_name(field_name)
@@ -427,6 +429,8 @@ module HexaPDF
427
429
  when /\A(?:n(o)?|f(alse)?)\z/ then false
428
430
  else value
429
431
  end
432
+ when :password_field
433
+ # Ignore the value
430
434
  else
431
435
  raise HexaPDF::Error, "AcroForm field type #{field.concrete_field_type} not yet supported"
432
436
  end
@@ -344,8 +344,10 @@ module HexaPDF
344
344
  super
345
345
 
346
346
  if self[:V] && !(self[:V].kind_of?(String) || self[:V].kind_of?(HexaPDF::Stream))
347
- yield("Text field doesn't contain text but #{self[:V].class} object")
348
- return
347
+ correctable = self[:V].kind_of?(Symbol)
348
+ yield("Text field doesn't contain text but an object of type #{self[:V].class}", correctable)
349
+ return unless correctable
350
+ self[:V] = self[:V].to_s
349
351
  end
350
352
  if (max_len = self[:MaxLen]) && field_value && field_value.length > max_len
351
353
  correctable = true
@@ -250,6 +250,15 @@ module HexaPDF
250
250
  end
251
251
  end
252
252
 
253
+ private
254
+
255
+ def perform_validation(&block) #:nodoc:
256
+ super
257
+ if !key?(:Parent) && (field = form_field) == self
258
+ field.validate(&block)
259
+ end
260
+ end
261
+
253
262
  end
254
263
 
255
264
  end
@@ -0,0 +1,80 @@
1
+ # -*- encoding: utf-8; frozen_string_literal: true -*-
2
+ #
3
+ #--
4
+ # This file is part of HexaPDF.
5
+ #
6
+ # HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
7
+ # Copyright (C) 2014-2025 Thomas Leitner
8
+ #
9
+ # HexaPDF is free software: you can redistribute it and/or modify it
10
+ # under the terms of the GNU Affero General Public License version 3 as
11
+ # published by the Free Software Foundation with the addition of the
12
+ # following permission added to Section 15 as permitted in Section 7(a):
13
+ # FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
14
+ # THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
15
+ # INFRINGEMENT OF THIRD PARTY RIGHTS.
16
+ #
17
+ # HexaPDF is distributed in the hope that it will be useful, but WITHOUT
18
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
20
+ # License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Affero General Public License
23
+ # along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
24
+ #
25
+ # The interactive user interfaces in modified source and object code
26
+ # versions of HexaPDF must display Appropriate Legal Notices, as required
27
+ # under Section 5 of the GNU Affero General Public License version 3.
28
+ #
29
+ # In accordance with Section 7(b) of the GNU Affero General Public
30
+ # License, a covered work must retain the producer line in every PDF that
31
+ # is created or manipulated using HexaPDF.
32
+ #
33
+ # If the GNU Affero General Public License doesn't fit your need,
34
+ # commercial licenses are available at <https://gettalong.at/hexapdf/>.
35
+ #++
36
+
37
+ require 'hexapdf/dictionary'
38
+
39
+ module HexaPDF
40
+ module Type
41
+
42
+ # The document security store (DSS) dictionary contains data needed for verifying digital
43
+ # signatures.
44
+ #
45
+ # See: PDF2.0 s12.8.4.3
46
+ class DocumentSecurityStore < Dictionary
47
+
48
+ # The validation-related information (VRI) dictionary contains validation information for one
49
+ # signature. It signifies that the signature has been validated using this information.
50
+ #
51
+ # See: PDF2.0 s12.8.4.4
52
+ class ValidationRelatedInformation < Dictionary
53
+
54
+ define_type :VRI
55
+
56
+ define_field :Type, type: Symbol, default: type
57
+ define_field :Cert, type: PDFArray
58
+ define_field :CRL, type: PDFArray
59
+ define_field :OCSP, type: PDFArray
60
+ define_field :TU, type: PDFDate
61
+ define_field :TS, type: Stream
62
+
63
+ end
64
+
65
+ define_type :DSS
66
+
67
+ define_field :Type, type: Symbol, default: type
68
+ define_field :VRI, type: Dictionary
69
+ define_field :Certs, type: PDFArray
70
+ define_field :OCSPs, type: PDFArray
71
+ define_field :CRLs, type: PDFArray
72
+ define_field :SW, type: Symbol, default: :A, allowed_values: [:A, :B, :S, :N]
73
+ define_field :S, type: Symbol, default: :P, allowed_values: [:A, :P]
74
+ define_field :A, type: PDFArray, default: [0.5, 0.5]
75
+ define_field :FB, type: Boolean, default: false, version: '1.5'
76
+
77
+ end
78
+
79
+ end
80
+ end
@@ -395,6 +395,17 @@ module HexaPDF
395
395
  Content::Parser.parse(contents, processor)
396
396
  end
397
397
 
398
+ # Extracts the layouted text from the page.
399
+ #
400
+ # See HexaPDF::Content::SmartTextExtractor.layout_text_runs for the available +options+.
401
+ def extract_text(**options)
402
+ processor = Content::SmartTextExtractor::TextRunProcessor.new
403
+ process_contents(processor)
404
+ box = box(:media)
405
+ Content::SmartTextExtractor.layout_text_runs(processor.text_runs, box.width, box.height,
406
+ **options)
407
+ end
408
+
398
409
  # Returns the index of the page in the page tree.
399
410
  def index
400
411
  idx = 0
data/lib/hexapdf/type.rb CHANGED
@@ -89,6 +89,7 @@ module HexaPDF
89
89
  autoload(:MarkedContentReference, 'hexapdf/type/marked_content_reference')
90
90
  autoload(:ObjectReference, 'hexapdf/type/object_reference')
91
91
  autoload(:Measure, 'hexapdf/type/measure')
92
+ autoload(:DocumentSecurityStore, 'hexapdf/type/document_security_store')
92
93
 
93
94
  end
94
95
 
@@ -37,6 +37,6 @@
37
37
  module HexaPDF
38
38
 
39
39
  # The version of HexaPDF.
40
- VERSION = '1.5.0'
40
+ VERSION = '1.7.0'
41
41
 
42
42
  end
@@ -0,0 +1,129 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require 'test_helper'
4
+ require 'hexapdf/content/smart_text_extractor'
5
+ require 'hexapdf/document'
6
+
7
+ describe HexaPDF::Content::SmartTextExtractor::TextRunCollector::TextRun do
8
+ it "has various accessors" do
9
+ text_run = HexaPDF::Content::SmartTextExtractor::TextRunCollector::TextRun.new('s', 1, 2, 3, 5)
10
+ assert_equal('s', text_run.string)
11
+ assert_equal(2, text_run.width)
12
+ assert_equal(3, text_run.height)
13
+ end
14
+ end
15
+
16
+ describe HexaPDF::Content::SmartTextExtractor::TextRunProcessor do
17
+ it "turns glyphs into TextRun objects" do
18
+ processor = HexaPDF::Content::SmartTextExtractor::TextRunProcessor.new
19
+ doc = HexaPDF::Document.new
20
+ page = doc.pages.add
21
+ page.canvas.font('Helvetica', size: 10).
22
+ text('Te', at: [10, 500]).
23
+ text_matrix(0.866, -0.5, 0.5, 0.866, 0, 0).
24
+ text('Te')
25
+ page.process_contents(processor)
26
+ assert_equal([['T', 10, 497.75, 16.11, 509.31], ['e', 16.11, 497.75, 21.67, 509.31],
27
+ ["T", -1.125, -5.0035, 9.94626, 8.06246],
28
+ ["e", 4.16626, -7.7835, 14.761220000000002, 5.00746]],
29
+ processor.text_runs.map(&:to_a))
30
+ end
31
+ end
32
+
33
+ describe HexaPDF::Content::SmartTextExtractor do
34
+ def text_run(str, left, bottom, right, top)
35
+ HexaPDF::Content::SmartTextExtractor::TextRunCollector::TextRun.new(str, left, bottom, right, top)
36
+ end
37
+
38
+ def layout_runs(runs, width = 595, height = 842, **options)
39
+ runs = runs.map {|args| text_run(*args) }
40
+ HexaPDF::Content::SmartTextExtractor.layout_text_runs(runs, width, height, **options)
41
+ end
42
+
43
+ it "works for a page with no text" do
44
+ assert_equal('', layout_runs([]))
45
+ end
46
+
47
+ it "works for a single run on the left side of the page" do
48
+ assert_equal('test', layout_runs([['test', 0, 100, 20, 110]]))
49
+ end
50
+
51
+ it "works for a single run not on the left side of the page" do
52
+ assert_equal('test', layout_runs([['test', 50, 100, 70, 110]]))
53
+ end
54
+
55
+ it "preserves the relative indent" do
56
+ assert_equal("Hello\n World", layout_runs([['Hello', 50, 100, 70, 110],
57
+ ['World', 70, 80, 90, 100]]))
58
+ end
59
+
60
+ it "combines text runs if they have the same top/bottom and there is less than 1pt between them" do
61
+ x = +'Hello'
62
+ assert_equal('HelloWorld', layout_runs([[x, 50, 100, 60, 110],
63
+ ['World', 60, 100, 70, 110]]))
64
+ assert_equal('HelloWorld', x)
65
+ end
66
+
67
+ it "preserves the space between two runs" do
68
+ assert_equal('Hello World', layout_runs([['Hello', 50, 100, 70, 110],
69
+ ['World', 72, 100, 92, 110]]))
70
+ assert_equal('Hello World', layout_runs([['Hello', 50, 100, 70, 110],
71
+ ['World', 80, 100, 100, 110]]))
72
+ end
73
+
74
+ it "inserts a space after very narrow text parts if necessary" do
75
+ assert_equal('Hello World!', layout_runs([['Hello', 50, 100, 60, 110],
76
+ ['World!', 63, 100, 87, 110]]))
77
+ end
78
+
79
+ it "preserves the visual horizontal ordering of two runs" do
80
+ assert_equal('Hello World', layout_runs([['World', 72, 100, 92, 110],
81
+ ['Hello', 50, 100, 70, 110]]))
82
+ end
83
+
84
+ it "preserves the visual vertical ordering of two runs" do
85
+ assert_equal("Hello\nWorld", layout_runs([['World', 50, 80, 70, 100],
86
+ ['Hello', 50, 100, 70, 110]]))
87
+ end
88
+
89
+ it "inserts a single blank line between paragraphs" do
90
+ assert_equal("Hello\nWorld\n\nHere",
91
+ layout_runs([['Hello', 50, 100, 70, 110],
92
+ ['World', 50, 90, 70, 100],
93
+ ['Here', 50, 65, 66, 75]]))
94
+ end
95
+
96
+ it "inserts multiply lines for large gaps between paragraphs" do
97
+ assert_equal("Hello\nWorld\nHere\n\n\n\n\n\n\nFoot",
98
+ layout_runs([['Hello', 50, 100, 70, 110],
99
+ ['World', 50, 90, 70, 100],
100
+ ['Here', 50, 80, 70, 90],
101
+ ['Foot', 50, 10, 66, 20]]))
102
+ end
103
+
104
+ it "ignores outliers when calculating the normal line spacing" do
105
+ assert_equal("Hello\nWorld\n\n\n\nHere",
106
+ layout_runs([['Hello', 50, 100, 70, 110],
107
+ ['World', 50, 90, 70, 100],
108
+ ['Here', 50, 50, 70, 60]]))
109
+ end
110
+
111
+ it "can use a different line_tolerance_factor" do
112
+ assert_equal("HelloWorld",
113
+ layout_runs([['Hello', 50, 100, 70, 110],
114
+ ['World', 50, 90, 70, 100]], line_tolerance_factor: 1))
115
+ end
116
+
117
+ it "can use a different paragraph_distance_threshold" do
118
+ assert_equal("Hello\n\nWorld",
119
+ layout_runs([['Hello', 50, 100, 70, 110],
120
+ ['World', 50, 90, 70, 100]], paragraph_distance_threshold: 1))
121
+ end
122
+
123
+ it "can use a different large_distance_threshold" do
124
+ assert_equal("Hello\nWorld\n\nHere",
125
+ layout_runs([['Hello', 50, 100, 70, 110],
126
+ ['World', 50, 90, 70, 100],
127
+ ['Here', 50, 50, 66, 60]], large_distance_threshold: 8))
128
+ end
129
+ end
@@ -13,7 +13,7 @@ module HexaPDF
13
13
  @ca_certificate ||=
14
14
  begin
15
15
  cert = create_cert(name: '/C=AT/O=HexaPDF/CN=HexaPDF Test Root CA', serial: 0,
16
- public_key: ca_key.public_key)
16
+ public_key: ca_key)
17
17
  add_extensions(cert, cert, ca_key, is_ca: true, key_usage: 'cRLSign,keyCertSign')
18
18
  cert
19
19
  end
@@ -27,7 +27,7 @@ module HexaPDF
27
27
  @signer_certificate ||=
28
28
  begin
29
29
  cert = create_cert(name: '/CN=RSA signer/DC=gettalong', serial: 2,
30
- public_key: signer_key.public_key, issuer: ca_certificate)
30
+ public_key: signer_key, issuer: ca_certificate)
31
31
  add_extensions(cert, ca_certificate, ca_key, key_usage: 'digitalSignature')
32
32
  cert
33
33
  end
@@ -37,7 +37,7 @@ module HexaPDF
37
37
  @non_repudiation_signer_certificate ||=
38
38
  begin
39
39
  cert = create_cert(name: '/CN=Non repudiation signer/DC=gettalong', serial: 2,
40
- public_key: signer_key.public_key, issuer: ca_certificate)
40
+ public_key: signer_key, issuer: ca_certificate)
41
41
  add_extensions(cert, ca_certificate, ca_key, key_usage: 'nonRepudiation')
42
42
  cert
43
43
  end
@@ -51,7 +51,21 @@ module HexaPDF
51
51
  @dsa_signer_certificate ||=
52
52
  begin
53
53
  cert = create_cert(name: '/CN=DSA signer/DC=gettalong', serial: 3,
54
- public_key: dsa_signer_key.public_key, issuer: ca_certificate)
54
+ public_key: dsa_signer_key, issuer: ca_certificate)
55
+ add_extensions(cert, ca_certificate, ca_key, key_usage: 'digitalSignature')
56
+ cert
57
+ end
58
+ end
59
+
60
+ def ecdsa_signer_key
61
+ @ecdsa_signer_key ||= OpenSSL::PKey::EC.generate('sect163k1')
62
+ end
63
+
64
+ def ecdsa_signer_certificate
65
+ @ecdsa_signer_certificate ||=
66
+ begin
67
+ cert = create_cert(name: '/CN=ECDSA signer/DC=gettalong', serial: 4,
68
+ public_key: ecdsa_signer_key, issuer: ca_certificate)
55
69
  add_extensions(cert, ca_certificate, ca_key, key_usage: 'digitalSignature')
56
70
  cert
57
71
  end
@@ -61,7 +75,7 @@ module HexaPDF
61
75
  @timestamp_certificate ||=
62
76
  begin
63
77
  cert = create_cert(name: '/CN=timestamp/DC=gettalong', serial: 3,
64
- public_key: signer_key.public_key, issuer: ca_certificate)
78
+ public_key: signer_key, issuer: ca_certificate)
65
79
  add_extensions(cert, ca_certificate, ca_key, key_usage: 'digitalSignature',
66
80
  extended_key_usage: 'timeStamping')
67
81
  cert
@@ -154,10 +154,35 @@ describe HexaPDF::DigitalSignature::Signing::SignedDataCreator do
154
154
  assert_equal(CERTIFICATES.signer_key.sign('SHA256', to_sign), @structure.value[5].value)
155
155
  end
156
156
 
157
- it "fails if the signature algorithm is not supported" do
158
- @signed_data.certificate = CERTIFICATES.dsa_signer_certificate
159
- @signed_data.key = CERTIFICATES.dsa_signer_key
160
- assert_raises(HexaPDF::Error) { @signed_data.create("data") }
157
+ describe "DSA key pair" do
158
+ before do
159
+ @signed_data.certificate = CERTIFICATES.dsa_signer_certificate
160
+ @signed_data.key = CERTIFICATES.dsa_signer_key
161
+ end
162
+
163
+ it "works with a DSA key pair" do
164
+ @structure = @signed_data.create("data").value[1].value[4].value[0]
165
+ assert_equal('2.16.840.1.101.3.4.3.2', @structure.value[4].value[0].value)
166
+ assert_nil(@structure.value[4].value[1].value)
167
+ end
168
+
169
+ it "fails if the digest algorithm is not SHA256" do
170
+ @signed_data.digest_algorithm = 'sha512'
171
+ assert_raises { @signed_data.create("data") }
172
+ end
173
+ end
174
+
175
+ describe "ECDSA key pair" do
176
+ before do
177
+ @signed_data.certificate = CERTIFICATES.ecdsa_signer_certificate
178
+ @signed_data.key = CERTIFICATES.ecdsa_signer_key
179
+ end
180
+
181
+ it "works with an ECDSA key pair" do
182
+ structure = @signed_data.create("data").value[1].value[4].value[0]
183
+ assert_equal('1.2.840.10045.4.3.2', structure.value[4].value[0].value)
184
+ assert_nil(structure.value[4].value[1].value)
185
+ end
161
186
  end
162
187
 
163
188
  it "can use a different digest algorithm" do