hexapdf 1.6.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +28 -0
  3. data/examples/032-acro_form_list_and_fill.rb +47 -0
  4. data/examples/033-text_extraction.rb +34 -0
  5. data/lib/hexapdf/cli/info.rb +2 -0
  6. data/lib/hexapdf/configuration.rb +8 -0
  7. data/lib/hexapdf/content/canvas.rb +1 -1
  8. data/lib/hexapdf/content/smart_text_extractor.rb +305 -0
  9. data/lib/hexapdf/content.rb +2 -0
  10. data/lib/hexapdf/digital_signature/signing/default_handler.rb +1 -15
  11. data/lib/hexapdf/digital_signature/signing/signed_data_creator.rb +21 -8
  12. data/lib/hexapdf/document.rb +7 -3
  13. data/lib/hexapdf/filter/brotli_decode.rb +88 -0
  14. data/lib/hexapdf/filter.rb +1 -0
  15. data/lib/hexapdf/font/true_type/builder.rb +1 -1
  16. data/lib/hexapdf/font/true_type/font.rb +13 -0
  17. data/lib/hexapdf/font/true_type/subsetter.rb +7 -2
  18. data/lib/hexapdf/font/true_type/table/directory.rb +5 -0
  19. data/lib/hexapdf/font/true_type.rb +1 -0
  20. data/lib/hexapdf/layout/style.rb +6 -2
  21. data/lib/hexapdf/task/pdfa.rb +108 -1
  22. data/lib/hexapdf/type/acro_form/form.rb +4 -0
  23. data/lib/hexapdf/type/acro_form/text_field.rb +4 -2
  24. data/lib/hexapdf/type/annotations/widget.rb +9 -0
  25. data/lib/hexapdf/type/document_security_store.rb +80 -0
  26. data/lib/hexapdf/type/page.rb +11 -0
  27. data/lib/hexapdf/type.rb +1 -0
  28. data/lib/hexapdf/version.rb +1 -1
  29. data/test/data/pdfa/mismatching_glyph_widths_cidfont_type2.pdf +0 -0
  30. data/test/hexapdf/content/test_smart_text_extractor.rb +129 -0
  31. data/test/hexapdf/digital_signature/common.rb +19 -5
  32. data/test/hexapdf/digital_signature/signing/test_signed_data_creator.rb +29 -4
  33. data/test/hexapdf/digital_signature/test_signatures.rb +3 -3
  34. data/test/hexapdf/filter/test_brotli_decode.rb +34 -0
  35. data/test/hexapdf/font/true_type/table/test_directory.rb +5 -3
  36. data/test/hexapdf/font/true_type/test_builder.rb +9 -0
  37. data/test/hexapdf/font/true_type/test_font.rb +17 -3
  38. data/test/hexapdf/font/true_type/test_subsetter.rb +4 -3
  39. data/test/hexapdf/task/test_pdfa.rb +72 -0
  40. data/test/hexapdf/test_document.rb +13 -0
  41. data/test/hexapdf/type/acro_form/test_form.rb +6 -0
  42. data/test/hexapdf/type/acro_form/test_text_field.rb +7 -1
  43. data/test/hexapdf/type/annotations/test_widget.rb +11 -0
  44. data/test/hexapdf/type/test_page.rb +8 -0
  45. metadata +25 -3
@@ -0,0 +1,88 @@
1
+ # -*- encoding: utf-8; frozen_string_literal: true -*-
2
+ #
3
+ #--
4
+ # This file is part of HexaPDF.
5
+ #
6
+ # HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
7
+ # Copyright (C) 2014-2025 Thomas Leitner
8
+ #
9
+ # HexaPDF is free software: you can redistribute it and/or modify it
10
+ # under the terms of the GNU Affero General Public License version 3 as
11
+ # published by the Free Software Foundation with the addition of the
12
+ # following permission added to Section 15 as permitted in Section 7(a):
13
+ # FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
14
+ # THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
15
+ # INFRINGEMENT OF THIRD PARTY RIGHTS.
16
+ #
17
+ # HexaPDF is distributed in the hope that it will be useful, but WITHOUT
18
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
20
+ # License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Affero General Public License
23
+ # along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
24
+ #
25
+ # The interactive user interfaces in modified source and object code
26
+ # versions of HexaPDF must display Appropriate Legal Notices, as required
27
+ # under Section 5 of the GNU Affero General Public License version 3.
28
+ #
29
+ # In accordance with Section 7(b) of the GNU Affero General Public
30
+ # License, a covered work must retain the producer line in every PDF that
31
+ # is created or manipulated using HexaPDF.
32
+ #
33
+ # If the GNU Affero General Public License doesn't fit your need,
34
+ # commercial licenses are available at <https://gettalong.at/hexapdf/>.
35
+ #++
36
+
37
+ require 'fiber'
38
+ require 'brotli'
39
+ require 'hexapdf/filter/predictor'
40
+ require 'hexapdf/configuration'
41
+
42
+ module HexaPDF
43
+ module Filter
44
+
45
+ # Implements the Brotli filter using the brotli library which must be installed manually.
46
+ #
47
+ # The BrotliDecode specification is not yet available as a standard but will be in the near
48
+ # future. Therefore it is recommended to hold off on using it for encoding streams until most of the
49
+ # PDF ecosystem has support for it.
50
+ #
51
+ # See: HexaPDF::Filter
52
+ module BrotliDecode
53
+
54
+ # See HexaPDF::Filter
55
+ #
56
+ # Note that the brotli gem currently doesn't support a streaming decoder. This means that the
57
+ # whole source must be read and decoded at once.
58
+ def self.decoder(source, options = nil)
59
+ fib = Fiber.new do
60
+ data = Filter.string_from_source(source)
61
+ data.empty? ? data: Brotli.inflate(data)
62
+ end
63
+
64
+ if options && options[:Predictor]
65
+ Predictor.decoder(fib, options)
66
+ else
67
+ fib
68
+ end
69
+ end
70
+
71
+ # See HexaPDF::Filter
72
+ #
73
+ # As with ::decoder a usable streaming encoder is not available.
74
+ def self.encoder(source, options = nil)
75
+ if options && options[:Predictor]
76
+ source = Predictor.encoder(source, options)
77
+ end
78
+
79
+ Fiber.new do
80
+ Brotli.deflate(Filter.string_from_source(source),
81
+ quality: HexaPDF::GlobalConfiguration['filter.brotli.compression'])
82
+ end
83
+ end
84
+
85
+ end
86
+
87
+ end
88
+ end
@@ -134,6 +134,7 @@ module HexaPDF
134
134
  autoload(:FlateDecode, 'hexapdf/filter/flate_decode')
135
135
  autoload(:LZWDecode, 'hexapdf/filter/lzw_decode')
136
136
  autoload(:RunLengthDecode, 'hexapdf/filter/run_length_decode')
137
+ autoload(:BrotliDecode, 'hexapdf/filter/brotli_decode')
137
138
 
138
139
  autoload(:Predictor, 'hexapdf/filter/predictor')
139
140
 
@@ -48,7 +48,7 @@ module HexaPDF
48
48
  entry_selector = tables.length.bit_length - 1
49
49
  range_shift = tables.length * 16 - search_range
50
50
 
51
- font_data = "\x0\x1\x0\x0".b +
51
+ font_data = (tables.key?('glyf') ? "\x0\x1\x0\x0" : "OTTO").b +
52
52
  [tables.length, search_range, entry_selector, range_shift].pack('n4')
53
53
 
54
54
  offset = font_data.length + tables.length * 16
@@ -35,6 +35,7 @@
35
35
  #++
36
36
 
37
37
  require 'hexapdf/font/true_type/table'
38
+ require 'hexapdf/font/true_type/builder'
38
39
  require 'set'
39
40
 
40
41
  module HexaPDF
@@ -84,6 +85,18 @@ module HexaPDF
84
85
  @tables = {}
85
86
  end
86
87
 
88
+ # Uses Builder to build a font file for this font.
89
+ #
90
+ # The +table_overrides+ argument can be used to supply mappings from table names (in string
91
+ # form) to raw table data that should override the respective font's tables.
92
+ def build(table_overrides = {})
93
+ tables = directory.table_names.each_with_object({}) do |name, hash|
94
+ hash[name] = self[name.to_sym].raw_data
95
+ end
96
+ tables.merge!(table_overrides)
97
+ Builder.build(tables)
98
+ end
99
+
87
100
  # Returns the table instance for the given tag (a symbol), or +nil+ if no such table exists.
88
101
  def [](tag)
89
102
  return @tables[tag] if @tables.key?(tag)
@@ -176,9 +176,14 @@ module HexaPDF
176
176
  # Adds the components of compound glyphs to the subset.
177
177
  def add_glyph_components
178
178
  glyf = @font[:glyf]
179
+ process_glyph_components = lambda do |gid|
180
+ glyf[gid].components&.each do |cgid|
181
+ use_glyph(cgid)
182
+ process_glyph_components.call(cgid) if glyf[cgid].compound?
183
+ end
184
+ end
179
185
  @glyph_map.keys.each do |gid|
180
- next if gid.kind_of?(Symbol)
181
- glyf[gid].components&.each {|cgid| use_glyph(cgid) }
186
+ process_glyph_components.call(gid) unless gid.kind_of?(Symbol)
182
187
  end
183
188
  end
184
189
 
@@ -69,6 +69,11 @@ module HexaPDF
69
69
  @tables[tag]
70
70
  end
71
71
 
72
+ # Returns an array with all the table names (in string form) in the directory.
73
+ def table_names
74
+ @tables.keys
75
+ end
76
+
72
77
  private
73
78
 
74
79
  def load_from_io #:nodoc:
@@ -49,6 +49,7 @@ module HexaPDF
49
49
  autoload(:Font, 'hexapdf/font/true_type/font')
50
50
  autoload(:Subsetter, 'hexapdf/font/true_type/subsetter')
51
51
  autoload(:Optimizer, 'hexapdf/font/true_type/optimizer')
52
+ autoload(:Builder, 'hexapdf/font/true_type/builder')
52
53
 
53
54
  end
54
55
 
@@ -211,6 +211,8 @@ module HexaPDF
211
211
  attr_reader :width
212
212
 
213
213
  # The colors of each edge. See Quad.
214
+ #
215
+ # See: HexaPDF::Content::ColorSpace.device_color_from_specification
214
216
  attr_reader :color
215
217
 
216
218
  # The styles of each edge. See Quad.
@@ -897,7 +899,7 @@ module HexaPDF
897
899
  #
898
900
  # The color used for filling (e.g. text), defaults to black.
899
901
  #
900
- # See: HexaPDF::Content::Canvas#fill_color
902
+ # See: HexaPDF::Content::ColorSpace.device_color_from_specification
901
903
  #
902
904
  # Examples:
903
905
  #
@@ -926,7 +928,7 @@ module HexaPDF
926
928
  #
927
929
  # The color used for stroking (e.g. text outlines), defaults to black.
928
930
  #
929
- # See: HexaPDF::Content::Canvas#stroke_color
931
+ # See: HexaPDF::Content::ColorSpace.device_color_from_specification
930
932
  #
931
933
  # Examples:
932
934
  #
@@ -1175,6 +1177,8 @@ module HexaPDF
1175
1177
  #
1176
1178
  # The color used for backgrounds, defaults to +nil+ (i.e. no background).
1177
1179
  #
1180
+ # See: HexaPDF::Content::ColorSpace.device_color_from_specification
1181
+ #
1178
1182
  # Examples:
1179
1183
  #
1180
1184
  # #>pdf-composer100
@@ -40,6 +40,7 @@ require 'hexapdf/content/parser'
40
40
  require 'hexapdf/content/operator'
41
41
  require 'hexapdf/type/xref_stream'
42
42
  require 'hexapdf/type/object_stream'
43
+ require 'hexapdf/font/true_type'
43
44
 
44
45
  module HexaPDF
45
46
  module Task
@@ -51,6 +52,13 @@ module HexaPDF
51
52
  # * prevents the Standard 14 PDF fonts from being used.
52
53
  # * adds an appropriate output intent if none is set.
53
54
  # * adds the necessary PDF/A metadata properties.
55
+ #
56
+ # Additionally, it applies fixes to the document so that the structures and content of
57
+ # non-conforming PDFs are corrected. See ::call for more information on the available fixes.
58
+ #
59
+ # Note that you should use a PDF/A validation tool like veraPDF (https://verapdf.org/) to ensure
60
+ # that the resulting files conform to the PDF/A specification because not all documents can be
61
+ # fixed at the moment.
54
62
  module PDFA
55
63
 
56
64
  # Performs the necessary tasks to make the document PDF/A compatible.
@@ -58,7 +66,22 @@ module HexaPDF
58
66
  # +level+::
59
67
  # Specifies the PDF/A conformance level that should be used. Can be one of the following
60
68
  # strings: 2b, 2u, 3b, 3u.
61
- def self.call(doc, level: '3u')
69
+ #
70
+ # +fixes+::
71
+ # Specifies the fixes that should be applied when converting a non-conforming PDF. If a
72
+ # document is created with HexaPDF but also includes parts of loaded documents, this
73
+ # argument has to be set to +:all+.
74
+ #
75
+ # Can be +:default+ (which is also the default value), +:all+ or an array with one or more
76
+ # fix names.
77
+ #
78
+ # +:default+:: Applies all fixes if the document was loaded from a file. Otherwise applies
79
+ # only those fixes necessary for files created with HexaPDF.
80
+ #
81
+ # +:all+:: Applies all available fixes.
82
+ #
83
+ # +:fix_glyph_widths+:: Corrects mismatching width information in fonts.
84
+ def self.call(doc, level: '3u', fixes: :default)
62
85
  unless level.match?(/\A[23][bu]\z/)
63
86
  raise ArgumentError, "The given PDF/A conformance level '#{level}' is not supported"
64
87
  end
@@ -68,6 +91,15 @@ module HexaPDF
68
91
  doc.metadata.property('pdfaid', 'part', part)
69
92
  doc.metadata.property('pdfaid', 'conformance', conformance.upcase)
70
93
  add_srgb_icc_output_intent(doc) unless doc.catalog.key?(:OutputIntents)
94
+
95
+ fixes = if fixes == :all || (fixes == :default && doc.revisions.parser)
96
+ ALL_FIXES
97
+ elsif fixes == :default
98
+ ALL_FIXES - FIXES_FOR_LOADED_DOCUMENTS
99
+ else
100
+ fixes
101
+ end
102
+ fixes.each {|fix| send(fix, doc) }
71
103
  end
72
104
  end
73
105
 
@@ -81,6 +113,81 @@ module HexaPDF
81
113
  ]
82
114
  end
83
115
 
116
+ ALL_FIXES = [:fix_glyph_widths] # :nodoc:
117
+
118
+ FIXES_FOR_LOADED_DOCUMENTS = [:fix_glyph_widths] # :nodoc:
119
+
120
+ # Makes the glyph widths stored in the embedded fonts the same as the ones specified in the
121
+ # PDF font data structures.
122
+ #
123
+ # Note: Currently only handles Type 2 CIDFonts.
124
+ def self.fix_glyph_widths(doc) # :nodoc:
125
+ # Step 1: Collect all CIDs together with their respective fonts
126
+ processor = CIDCollector.new
127
+ doc.pages.each do |page|
128
+ page.process_contents(processor)
129
+ page.each_annotation do |annotation|
130
+ next unless (appearance = annotation.appearance)
131
+ appearance.process_contents(processor, original_resources: page.resources)
132
+ end
133
+ end
134
+
135
+ # Step 2: Process all found fonts
136
+ processor.map.each do |font_object, all_cids|
137
+ next if all_cids.empty?
138
+ font = HexaPDF::Font::TrueType::Font.new(StringIO.new(font_object.font_file.stream))
139
+ cid_to_gid = cid_to_gid_mapping(font_object)
140
+
141
+ # Process all found CIDs by comparing their width with the ones defined in the font and
142
+ # correcting the font if necessary.
143
+ raw_hmtx = font[:hmtx].raw_data
144
+ width_conversion_factor = 1000.0 / font[:head].units_per_em
145
+ all_cids.each do |cid|
146
+ cid_width = font_object.width(cid)
147
+ gid = cid_to_gid[cid]
148
+ gid_width = font[:hmtx][gid].advance_width * width_conversion_factor
149
+ next if (cid_width - gid_width).abs.round <= 1
150
+ raw_hmtx[4 * gid, 2] = [(cid_width / width_conversion_factor).round].pack('n')
151
+ end
152
+
153
+ font_object.font_file.stream = font.build('hmtx' => raw_hmtx)
154
+ end
155
+ end
156
+
157
+ # Processes the contents of a stream and collects the CIDs for each composite font.
158
+ class CIDCollector < HexaPDF::Content::Processor
159
+
160
+ # The mapping from the composite font's descendant font to the set of used CIDs.
161
+ attr_reader :map
162
+
163
+ def initialize(*) # :nodoc:
164
+ super
165
+ @map = Hash.new {|h, k| h[k] = Set.new }
166
+ end
167
+
168
+ def show_text(data) # :nodoc:
169
+ font = graphics_state.font
170
+ return unless font[:Subtype] == :Type0 && font.descendant_font[:Subtype] == :CIDFontType2
171
+
172
+ Array(data).each do |item|
173
+ next if item.kind_of?(Numeric)
174
+ @map[font.descendant_font].merge(font.decode(item))
175
+ end
176
+ end
177
+ alias show_text_with_positioning show_text
178
+
179
+ end
180
+
181
+ # Returns an object responding to #[] that maps CIDs to GIDs for Type 2 CIDFonts.
182
+ def self.cid_to_gid_mapping(font)
183
+ if font[:CIDToGIDMap] == :Identity
184
+ proc {|cid| cid }
185
+ else
186
+ font[:CIDToGIDMap].stream.unpack('n*')
187
+ end
188
+ end
189
+ private_class_method :cid_to_gid_mapping
190
+
84
191
  end
85
192
 
86
193
  end
@@ -412,6 +412,8 @@ module HexaPDF
412
412
  #
413
413
  # * For radio buttons the value needs to be a String or a Symbol representing the name of
414
414
  # the radio button widget to select.
415
+ #
416
+ # * Values for password fields are ignored as they should not be stored in the PDF.
415
417
  def fill(data)
416
418
  data.each do |field_name, value|
417
419
  field = field_by_name(field_name)
@@ -427,6 +429,8 @@ module HexaPDF
427
429
  when /\A(?:n(o)?|f(alse)?)\z/ then false
428
430
  else value
429
431
  end
432
+ when :password_field
433
+ # Ignore the value
430
434
  else
431
435
  raise HexaPDF::Error, "AcroForm field type #{field.concrete_field_type} not yet supported"
432
436
  end
@@ -344,8 +344,10 @@ module HexaPDF
344
344
  super
345
345
 
346
346
  if self[:V] && !(self[:V].kind_of?(String) || self[:V].kind_of?(HexaPDF::Stream))
347
- yield("Text field doesn't contain text but #{self[:V].class} object")
348
- return
347
+ correctable = self[:V].kind_of?(Symbol)
348
+ yield("Text field doesn't contain text but an object of type #{self[:V].class}", correctable)
349
+ return unless correctable
350
+ self[:V] = self[:V].to_s
349
351
  end
350
352
  if (max_len = self[:MaxLen]) && field_value && field_value.length > max_len
351
353
  correctable = true
@@ -250,6 +250,15 @@ module HexaPDF
250
250
  end
251
251
  end
252
252
 
253
+ private
254
+
255
+ def perform_validation(&block) #:nodoc:
256
+ super
257
+ if !key?(:Parent) && (field = form_field) == self
258
+ field.validate(&block)
259
+ end
260
+ end
261
+
253
262
  end
254
263
 
255
264
  end
@@ -0,0 +1,80 @@
1
+ # -*- encoding: utf-8; frozen_string_literal: true -*-
2
+ #
3
+ #--
4
+ # This file is part of HexaPDF.
5
+ #
6
+ # HexaPDF - A Versatile PDF Creation and Manipulation Library For Ruby
7
+ # Copyright (C) 2014-2025 Thomas Leitner
8
+ #
9
+ # HexaPDF is free software: you can redistribute it and/or modify it
10
+ # under the terms of the GNU Affero General Public License version 3 as
11
+ # published by the Free Software Foundation with the addition of the
12
+ # following permission added to Section 15 as permitted in Section 7(a):
13
+ # FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY
14
+ # THOMAS LEITNER, THOMAS LEITNER DISCLAIMS THE WARRANTY OF NON
15
+ # INFRINGEMENT OF THIRD PARTY RIGHTS.
16
+ #
17
+ # HexaPDF is distributed in the hope that it will be useful, but WITHOUT
18
+ # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
19
+ # FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
20
+ # License for more details.
21
+ #
22
+ # You should have received a copy of the GNU Affero General Public License
23
+ # along with HexaPDF. If not, see <http://www.gnu.org/licenses/>.
24
+ #
25
+ # The interactive user interfaces in modified source and object code
26
+ # versions of HexaPDF must display Appropriate Legal Notices, as required
27
+ # under Section 5 of the GNU Affero General Public License version 3.
28
+ #
29
+ # In accordance with Section 7(b) of the GNU Affero General Public
30
+ # License, a covered work must retain the producer line in every PDF that
31
+ # is created or manipulated using HexaPDF.
32
+ #
33
+ # If the GNU Affero General Public License doesn't fit your need,
34
+ # commercial licenses are available at <https://gettalong.at/hexapdf/>.
35
+ #++
36
+
37
+ require 'hexapdf/dictionary'
38
+
39
+ module HexaPDF
40
+ module Type
41
+
42
+ # The document security store (DSS) dictionary contains data needed for verifying digital
43
+ # signatures.
44
+ #
45
+ # See: PDF2.0 s12.8.4.3
46
+ class DocumentSecurityStore < Dictionary
47
+
48
+ # The validation-related information (VRI) dictionary contains validation information for one
49
+ # signature. It signifies that the signature has been validated using this information.
50
+ #
51
+ # See: PDF2.0 s12.8.4.4
52
+ class ValidationRelatedInformation < Dictionary
53
+
54
+ define_type :VRI
55
+
56
+ define_field :Type, type: Symbol, default: type
57
+ define_field :Cert, type: PDFArray
58
+ define_field :CRL, type: PDFArray
59
+ define_field :OCSP, type: PDFArray
60
+ define_field :TU, type: PDFDate
61
+ define_field :TS, type: Stream
62
+
63
+ end
64
+
65
+ define_type :DSS
66
+
67
+ define_field :Type, type: Symbol, default: type
68
+ define_field :VRI, type: Dictionary
69
+ define_field :Certs, type: PDFArray
70
+ define_field :OCSPs, type: PDFArray
71
+ define_field :CRLs, type: PDFArray
72
+ define_field :SW, type: Symbol, default: :A, allowed_values: [:A, :B, :S, :N]
73
+ define_field :S, type: Symbol, default: :P, allowed_values: [:A, :P]
74
+ define_field :A, type: PDFArray, default: [0.5, 0.5]
75
+ define_field :FB, type: Boolean, default: false, version: '1.5'
76
+
77
+ end
78
+
79
+ end
80
+ end
@@ -395,6 +395,17 @@ module HexaPDF
395
395
  Content::Parser.parse(contents, processor)
396
396
  end
397
397
 
398
+ # Extracts the laid-out text from the page.
399
+ #
400
+ # See HexaPDF::Content::SmartTextExtractor.layout_text_runs for the available +options+.
401
+ def extract_text(**options)
402
+ processor = Content::SmartTextExtractor::TextRunProcessor.new
403
+ process_contents(processor)
404
+ box = box(:media)
405
+ Content::SmartTextExtractor.layout_text_runs(processor.text_runs, box.width, box.height,
406
+ **options)
407
+ end
408
+
398
409
  # Returns the index of the page in the page tree.
399
410
  def index
400
411
  idx = 0
data/lib/hexapdf/type.rb CHANGED
@@ -89,6 +89,7 @@ module HexaPDF
89
89
  autoload(:MarkedContentReference, 'hexapdf/type/marked_content_reference')
90
90
  autoload(:ObjectReference, 'hexapdf/type/object_reference')
91
91
  autoload(:Measure, 'hexapdf/type/measure')
92
+ autoload(:DocumentSecurityStore, 'hexapdf/type/document_security_store')
92
93
 
93
94
  end
94
95
 
@@ -37,6 +37,6 @@
37
37
  module HexaPDF
38
38
 
39
39
  # The version of HexaPDF.
40
- VERSION = '1.6.0'
40
+ VERSION = '1.7.0'
41
41
 
42
42
  end
@@ -0,0 +1,129 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ require 'test_helper'
4
+ require 'hexapdf/content/smart_text_extractor'
5
+ require 'hexapdf/document'
6
+
7
+ describe HexaPDF::Content::SmartTextExtractor::TextRunCollector::TextRun do
8
+ it "has various accessors" do
9
+ text_run = HexaPDF::Content::SmartTextExtractor::TextRunCollector::TextRun.new('s', 1, 2, 3, 5)
10
+ assert_equal('s', text_run.string)
11
+ assert_equal(2, text_run.width)
12
+ assert_equal(3, text_run.height)
13
+ end
14
+ end
15
+
16
+ describe HexaPDF::Content::SmartTextExtractor::TextRunProcessor do
17
+ it "turns glyphs into TextRun objects" do
18
+ processor = HexaPDF::Content::SmartTextExtractor::TextRunProcessor.new
19
+ doc = HexaPDF::Document.new
20
+ page = doc.pages.add
21
+ page.canvas.font('Helvetica', size: 10).
22
+ text('Te', at: [10, 500]).
23
+ text_matrix(0.866, -0.5, 0.5, 0.866, 0, 0).
24
+ text('Te')
25
+ page.process_contents(processor)
26
+ assert_equal([['T', 10, 497.75, 16.11, 509.31], ['e', 16.11, 497.75, 21.67, 509.31],
27
+ ["T", -1.125, -5.0035, 9.94626, 8.06246],
28
+ ["e", 4.16626, -7.7835, 14.761220000000002, 5.00746]],
29
+ processor.text_runs.map(&:to_a))
30
+ end
31
+ end
32
+
33
+ describe HexaPDF::Content::SmartTextExtractor do
34
+ def text_run(str, left, bottom, right, top)
35
+ HexaPDF::Content::SmartTextExtractor::TextRunCollector::TextRun.new(str, left, bottom, right, top)
36
+ end
37
+
38
+ def layout_runs(runs, width = 595, height = 842, **options)
39
+ runs = runs.map {|args| text_run(*args) }
40
+ HexaPDF::Content::SmartTextExtractor.layout_text_runs(runs, width, height, **options)
41
+ end
42
+
43
+ it "works for a page with no text" do
44
+ assert_equal('', layout_runs([]))
45
+ end
46
+
47
+ it "works for a single run on the left side of the page" do
48
+ assert_equal('test', layout_runs([['test', 0, 100, 20, 110]]))
49
+ end
50
+
51
+ it "works for a single run not on the left side of the page" do
52
+ assert_equal('test', layout_runs([['test', 50, 100, 70, 110]]))
53
+ end
54
+
55
+ it "preserves the relative indent" do
56
+ assert_equal("Hello\n World", layout_runs([['Hello', 50, 100, 70, 110],
57
+ ['World', 70, 80, 90, 100]]))
58
+ end
59
+
60
+ it "combines text runs if they have the same top/bottom and there is less than 1pt between them" do
61
+ x = +'Hello'
62
+ assert_equal('HelloWorld', layout_runs([[x, 50, 100, 60, 110],
63
+ ['World', 60, 100, 70, 110]]))
64
+ assert_equal('HelloWorld', x)
65
+ end
66
+
67
+ it "preserves the space between two runs" do
68
+ assert_equal('Hello World', layout_runs([['Hello', 50, 100, 70, 110],
69
+ ['World', 72, 100, 92, 110]]))
70
+ assert_equal('Hello World', layout_runs([['Hello', 50, 100, 70, 110],
71
+ ['World', 80, 100, 100, 110]]))
72
+ end
73
+
74
+ it "inserts a space after very narrow text parts if necessary" do
75
+ assert_equal('Hello World!', layout_runs([['Hello', 50, 100, 60, 110],
76
+ ['World!', 63, 100, 87, 110]]))
77
+ end
78
+
79
+ it "preserves the visual horizontal ordering of two runs" do
80
+ assert_equal('Hello World', layout_runs([['World', 72, 100, 92, 110],
81
+ ['Hello', 50, 100, 70, 110]]))
82
+ end
83
+
84
+ it "preserves the visual vertical ordering of two runs" do
85
+ assert_equal("Hello\nWorld", layout_runs([['World', 50, 80, 70, 100],
86
+ ['Hello', 50, 100, 70, 110]]))
87
+ end
88
+
89
+ it "inserts a single blank line between paragraphs" do
90
+ assert_equal("Hello\nWorld\n\nHere",
91
+ layout_runs([['Hello', 50, 100, 70, 110],
92
+ ['World', 50, 90, 70, 100],
93
+ ['Here', 50, 65, 66, 75]]))
94
+ end
95
+
96
+ it "inserts multiply lines for large gaps between paragraphs" do
97
+ assert_equal("Hello\nWorld\nHere\n\n\n\n\n\n\nFoot",
98
+ layout_runs([['Hello', 50, 100, 70, 110],
99
+ ['World', 50, 90, 70, 100],
100
+ ['Here', 50, 80, 70, 90],
101
+ ['Foot', 50, 10, 66, 20]]))
102
+ end
103
+
104
+ it "ignores outliers when calculating the normal line spacing" do
105
+ assert_equal("Hello\nWorld\n\n\n\nHere",
106
+ layout_runs([['Hello', 50, 100, 70, 110],
107
+ ['World', 50, 90, 70, 100],
108
+ ['Here', 50, 50, 70, 60]]))
109
+ end
110
+
111
+ it "can use a different line_tolerance_factor" do
112
+ assert_equal("HelloWorld",
113
+ layout_runs([['Hello', 50, 100, 70, 110],
114
+ ['World', 50, 90, 70, 100]], line_tolerance_factor: 1))
115
+ end
116
+
117
+ it "can use a different paragraph_distance_threshold" do
118
+ assert_equal("Hello\n\nWorld",
119
+ layout_runs([['Hello', 50, 100, 70, 110],
120
+ ['World', 50, 90, 70, 100]], paragraph_distance_threshold: 1))
121
+ end
122
+
123
+ it "can use a different large_distance_threshold" do
124
+ assert_equal("Hello\nWorld\n\nHere",
125
+ layout_runs([['Hello', 50, 100, 70, 110],
126
+ ['World', 50, 90, 70, 100],
127
+ ['Here', 50, 50, 66, 60]], large_distance_threshold: 8))
128
+ end
129
+ end