hexapdf 0.22.0 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +50 -0
  3. data/lib/hexapdf/cli/form.rb +26 -3
  4. data/lib/hexapdf/cli/inspect.rb +12 -3
  5. data/lib/hexapdf/cli/modify.rb +23 -3
  6. data/lib/hexapdf/composer.rb +24 -2
  7. data/lib/hexapdf/document/destinations.rb +396 -0
  8. data/lib/hexapdf/document.rb +38 -89
  9. data/lib/hexapdf/layout/frame.rb +8 -9
  10. data/lib/hexapdf/layout/style.rb +280 -7
  11. data/lib/hexapdf/layout/text_box.rb +10 -2
  12. data/lib/hexapdf/layout/text_layouter.rb +6 -1
  13. data/lib/hexapdf/revision.rb +8 -1
  14. data/lib/hexapdf/revisions.rb +151 -50
  15. data/lib/hexapdf/task/optimize.rb +21 -11
  16. data/lib/hexapdf/type/acro_form/text_field.rb +8 -0
  17. data/lib/hexapdf/type/catalog.rb +9 -1
  18. data/lib/hexapdf/type/names.rb +13 -0
  19. data/lib/hexapdf/type/xref_stream.rb +2 -1
  20. data/lib/hexapdf/utils/sorted_tree_node.rb +3 -1
  21. data/lib/hexapdf/version.rb +1 -1
  22. data/lib/hexapdf/writer.rb +15 -2
  23. data/test/hexapdf/document/test_destinations.rb +338 -0
  24. data/test/hexapdf/encryption/test_security_handler.rb +2 -2
  25. data/test/hexapdf/layout/test_frame.rb +15 -1
  26. data/test/hexapdf/layout/test_text_box.rb +16 -0
  27. data/test/hexapdf/layout/test_text_layouter.rb +7 -0
  28. data/test/hexapdf/task/test_optimize.rb +17 -4
  29. data/test/hexapdf/test_composer.rb +24 -1
  30. data/test/hexapdf/test_document.rb +30 -133
  31. data/test/hexapdf/test_parser.rb +1 -1
  32. data/test/hexapdf/test_revision.rb +14 -0
  33. data/test/hexapdf/test_revisions.rb +137 -29
  34. data/test/hexapdf/test_writer.rb +43 -14
  35. data/test/hexapdf/type/acro_form/test_text_field.rb +17 -0
  36. data/test/hexapdf/type/test_catalog.rb +8 -0
  37. data/test/hexapdf/type/test_names.rb +20 -0
  38. data/test/hexapdf/type/test_xref_stream.rb +2 -1
  39. data/test/hexapdf/utils/test_sorted_tree_node.rb +11 -1
  40. metadata +5 -2
@@ -51,6 +51,10 @@ module HexaPDF
51
51
  # the newest revision the highest index. This is also the order in which the revisions get
52
52
  # written.
53
53
  #
54
+ # *Important*: It is possible to manipulate the individual revisions and their objects oneself but
55
+ # this should only be done if one is familiar with the inner workings of HexaPDF. Otherwise it is
56
+ # best to use the convenience methods of this class to create, access or delete indirect objects.
57
+ #
54
58
  # See: PDF1.7 s7.5.6, HexaPDF::Revision
55
59
  class Revisions
56
60
 
@@ -68,27 +72,26 @@ module HexaPDF
68
72
 
69
73
  revisions = []
70
74
  begin
71
- xref_section, trailer = parser.load_revision(parser.startxref_offset)
72
- revisions << Revision.new(document.wrap(trailer, type: :XXTrailer),
73
- xref_section: xref_section, loader: object_loader)
74
- seen_xref_offsets = {parser.startxref_offset => true}
75
+ offset = parser.startxref_offset
76
+ seen_xref_offsets = {}
75
77
 
76
- while (prev = revisions[0].trailer.value[:Prev]) &&
77
- !seen_xref_offsets.key?(prev)
78
+ while offset && !seen_xref_offsets.key?(offset)
78
79
  # PDF1.7 s7.5.5 states that :Prev needs to be indirect, Adobe's reference 3.4.4 says it
79
80
  # should be direct. Adobe's POV is followed here. Same with :XRefStm.
80
- xref_section, trailer = parser.load_revision(prev)
81
- seen_xref_offsets[prev] = true
81
+ xref_section, trailer = parser.load_revision(offset)
82
+ seen_xref_offsets[offset] = true
82
83
 
83
- stm = revisions[0].trailer.value[:XRefStm]
84
+ stm = trailer[:XRefStm]
84
85
  if stm && !seen_xref_offsets.key?(stm)
85
86
  stm_xref_section, = parser.load_revision(stm)
86
- xref_section.merge!(stm_xref_section)
87
+ stm_xref_section.merge!(xref_section)
88
+ xref_section = stm_xref_section
87
89
  seen_xref_offsets[stm] = true
88
90
  end
89
91
 
90
92
  revisions.unshift(Revision.new(document.wrap(trailer, type: :XXTrailer),
91
93
  xref_section: xref_section, loader: object_loader))
94
+ offset = trailer[:Prev]
92
95
  end
93
96
  rescue HexaPDF::MalformedPDFError
94
97
  reconstructed_revision = parser.reconstructed_revision
@@ -133,23 +136,154 @@ module HexaPDF
133
136
  end
134
137
  end
135
138
 
136
- # Returns the revision at the specified index.
137
- def revision(index)
138
- @revisions[index]
139
+ # Returns the next object identifier that should be used when adding a new object.
140
+ def next_oid
141
+ @revisions.map(&:next_free_oid).max
142
+ end
143
+
144
+ # :call-seq:
145
+ # revisions.object(ref) -> obj or nil
146
+ # revisions.object(oid) -> obj or nil
147
+ #
148
+ # Returns the current version of the indirect object for the given exact reference or for the
149
+ # given object number.
150
+ #
151
+ # For references to unknown objects, +nil+ is returned but free objects are represented by a
152
+ # PDF Null object, not by +nil+!
153
+ #
154
+ # See: PDF1.7 s7.3.9
155
+ def object(ref)
156
+ i = @revisions.size - 1
157
+ while i >= 0
158
+ if (result = @revisions[i].object(ref))
159
+ return result
160
+ end
161
+ i -= 1
162
+ end
163
+ nil
164
+ end
165
+
166
+ # :call-seq:
167
+ # revisions.object?(ref) -> true or false
168
+ # revisions.object?(oid) -> true or false
169
+ #
170
+ # Returns +true+ if one of the revisions contains an indirect object for the given exact
171
+ # reference or for the given object number.
172
+ #
173
+ # Even though this method might return +true+ for some references, #object may return +nil+
174
+ # because this method takes *all* revisions into account.
175
+ def object?(ref)
176
+ @revisions.any? {|rev| rev.object?(ref) }
177
+ end
178
+
179
+ # :call-seq:
180
+ # revisions.add_object(object) -> object
181
+ #
182
+ # Adds the given HexaPDF::Object to the current revision and returns it.
183
+ #
184
+ # If +object+ is a direct object, an object number is automatically assigned.
185
+ def add_object(obj)
186
+ if obj.indirect? && (rev_obj = current.object(obj.oid))
187
+ if rev_obj.data == obj.data
188
+ return obj
189
+ else
190
+ raise HexaPDF::Error, "Can't add object because there is already " \
191
+ "an object with object number #{obj.oid}"
192
+ end
193
+ end
194
+
195
+ obj.oid = next_oid unless obj.indirect?
196
+ current.add(obj)
197
+ end
198
+
199
+ # :call-seq:
200
+ # revisions.delete_object(ref)
201
+ # revisions.delete_object(oid)
202
+ #
203
+ # Deletes the indirect object specified by an exact reference or by an object number.
204
+ def delete_object(ref)
205
+ @revisions.reverse_each do |rev|
206
+ if rev.object?(ref)
207
+ rev.delete(ref)
208
+ break
209
+ end
210
+ end
211
+ end
212
+
213
+ # :call-seq:
214
+ # revisions.each_object(only_current: true, only_loaded: false) {|obj| block } -> revisions
215
+ # revisions.each_object(only_current: true, only_loaded: false) {|obj, rev| block } -> revisions
216
+ # revisions.each_object(only_current: true, only_loaded: false) -> Enumerator
217
+ #
218
+ # Yields every object and optionally the revision it is in.
219
+ #
220
+ # If +only_loaded+ is +true+, only the already loaded objects of the PDF document are yielded.
221
+ # This only matters when the document instance was created from an existing PDF document.
222
+ #
223
+ # By default, only the current version of each object is returned which implies that each object
224
+ # number is yielded exactly once. If the +only_current+ option is +false+, all stored objects
225
+ # from newest to oldest are returned, not only the current version of each object.
226
+ #
227
+ # The +only_current+ option can make a difference because the document can contain multiple
228
+ # revisions:
229
+ #
230
+ # * Multiple revisions may contain objects with the same object and generation numbers, e.g.
231
+ # two (different) objects with oid/gen [3,0].
232
+ #
233
+ # * Additionally, there may also be objects with the same object number but different
234
+ # generation numbers in different revisions, e.g. one object with oid/gen [3,0] and one with
235
+ # oid/gen [3,1].
236
+ def each_object(only_current: true, only_loaded: false, &block)
237
+ unless block_given?
238
+ return to_enum(__method__, only_current: only_current, only_loaded: only_loaded)
239
+ end
240
+
241
+ yield_rev = (block.arity == 2)
242
+ oids = {}
243
+ @revisions.reverse_each do |rev|
244
+ rev.each(only_loaded: only_loaded) do |obj|
245
+ next if only_current && oids.include?(obj.oid)
246
+ yield_rev ? yield(obj, rev) : yield(obj)
247
+ oids[obj.oid] = true
248
+ end
249
+ end
250
+ self
139
251
  end
140
- alias [] revision
141
252
 
142
253
  # Returns the current revision.
254
+ #
255
+ # *Note*: This method should only be used if one is familiar with the inner workings of HexaPDF
256
+ # and the PDF specification.
143
257
  def current
144
258
  @revisions.last
145
259
  end
146
260
 
147
- # Returns the number of HexaPDF::Revision objects managed by this object.
148
- def size
149
- @revisions.size
261
+ # Returns a list of all revisions.
262
+ #
263
+ # *Note*: This method should only be used if one is familiar with the inner workings of HexaPDF
264
+ # and the PDF specification.
265
+ def all
266
+ @revisions
267
+ end
268
+
269
+ # :call-seq:
270
+ # revisions.each {|rev| block } -> revisions
271
+ # revisions.each -> Enumerator
272
+ #
273
+ # Iterates over all revisions from oldest to current one.
274
+ #
275
+ # *Note*: This method should only be used if one is familiar with the inner workings of HexaPDF
276
+ # and the PDF specification.
277
+ def each(&block)
278
+ return to_enum(__method__) unless block_given?
279
+ @revisions.each(&block)
280
+ self
150
281
  end
151
282
 
152
283
  # Adds a new empty revision to the document and returns it.
284
+ #
285
+ # *Note*: This method should only be used if one is familiar with the inner workings of HexaPDF
286
+ # and the PDF specification.
153
287
  def add
154
288
  if @revisions.empty?
155
289
  trailer = {}
@@ -164,28 +298,6 @@ module HexaPDF
164
298
  rev
165
299
  end
166
300
 
167
- # :call-seq:
168
- # revisions.delete(index) -> rev or nil
169
- # revisions.delete(oid) -> rev or nil
170
- #
171
- # Deletes a revision from the document, either by index or by specifying the revision object
172
- # itself.
173
- #
174
- # Returns the deleted revision object, or +nil+ if the index was out of range or no matching
175
- # revision was found.
176
- #
177
- # Regarding the index: The oldest revision has index 0 and the current revision the highest
178
- # index!
179
- def delete(index_or_rev)
180
- if @revisions.length == 1
181
- raise HexaPDF::Error, "A document must have a least one revision, can't delete last one"
182
- elsif index_or_rev.kind_of?(Integer)
183
- @revisions.delete_at(index_or_rev)
184
- else
185
- @revisions.delete(index_or_rev)
186
- end
187
- end
188
-
189
301
  # :call-seq:
190
302
  # revisions.merge(range = 0..-1) -> revisions
191
303
  #
@@ -206,17 +318,6 @@ module HexaPDF
206
318
  self
207
319
  end
208
320
 
209
- # :call-seq:
210
- # revisions.each {|rev| block } -> revisions
211
- # revisions.each -> Enumerator
212
- #
213
- # Iterates over all revisions from oldest to current one.
214
- def each(&block)
215
- return to_enum(__method__) unless block_given?
216
- @revisions.each(&block)
217
- self
218
- end
219
-
220
321
  end
221
322
 
222
323
  end
@@ -106,7 +106,7 @@ module HexaPDF
106
106
  rev = doc.revisions.add
107
107
 
108
108
  oid = 1
109
- doc.revisions[0].each do |obj|
109
+ doc.revisions.all[0].each do |obj|
110
110
  if obj.null? || unused.include?(obj) || (obj.type == :ObjStm) ||
111
111
  (obj.type == :XRef && xref_streams != :preserve)
112
112
  obj.data.value = nil
@@ -119,7 +119,7 @@ module HexaPDF
119
119
  rev.add(obj)
120
120
  oid += 1
121
121
  end
122
- doc.revisions.delete(0)
122
+ doc.revisions.all.delete_at(0)
123
123
 
124
124
  if object_streams == :generate
125
125
  process_object_streams(doc, :generate, xref_streams)
@@ -134,7 +134,7 @@ module HexaPDF
134
134
  def self.process_object_streams(doc, method, xref_streams)
135
135
  case method
136
136
  when :delete
137
- doc.revisions.each_with_index do |rev, rev_index|
137
+ doc.revisions.each do |rev|
138
138
  xref_stream = false
139
139
  objects_to_delete = []
140
140
  rev.each do |obj|
@@ -150,11 +150,11 @@ module HexaPDF
150
150
  end
151
151
  objects_to_delete.each {|obj| rev.delete(obj) }
152
152
  if xref_streams == :generate && !xref_stream
153
- doc.add({Type: :XRef}, revision: rev_index)
153
+ rev.add(doc.wrap({Type: :XRef}, oid: doc.revisions.next_oid))
154
154
  end
155
155
  end
156
156
  when :generate
157
- doc.revisions.each_with_index do |rev, rev_index|
157
+ doc.revisions.each do |rev|
158
158
  xref_stream = false
159
159
  count = 0
160
160
  objstms = [doc.wrap({Type: :ObjStm})]
@@ -178,8 +178,11 @@ module HexaPDF
178
178
  end
179
179
  end
180
180
  old_objstms.each {|objstm| rev.delete(objstm) }
181
- objstms.each {|objstm| doc.add(objstm, revision: rev_index) }
182
- doc.add({Type: :XRef}, revision: rev_index) unless xref_stream
181
+ objstms.each do |objstm|
182
+ objstm.data.oid = doc.revisions.next_oid
183
+ rev.add(objstm)
184
+ end
185
+ rev.add(doc.wrap({Type: :XRef}, oid: doc.revisions.next_oid)) unless xref_stream
183
186
  end
184
187
  end
185
188
  end
@@ -198,13 +201,13 @@ module HexaPDF
198
201
  end
199
202
  end
200
203
  when :generate
201
- doc.revisions.each_with_index do |rev, rev_index|
204
+ doc.revisions.each do |rev|
202
205
  xref_stream = false
203
206
  rev.each do |obj|
204
207
  xref_stream = true if obj.type == :XRef
205
208
  delete_fields_with_defaults(obj)
206
209
  end
207
- doc.add({Type: :XRef}, revision: rev_index) unless xref_stream
210
+ rev.add(doc.wrap({Type: :XRef}, oid: doc.revisions.next_oid)) unless xref_stream
208
211
  end
209
212
  end
210
213
  end
@@ -229,7 +232,10 @@ module HexaPDF
229
232
  def self.compress_pages(doc)
230
233
  used_refs = {}
231
234
  doc.pages.each do |page|
232
- processor = SerializationProcessor.new
235
+ processor = SerializationProcessor.new do |error_message|
236
+ doc.config['parser.on_correctable_error'].call(doc, error_message, 0) &&
237
+ raise(HexaPDF::Error, error_message)
238
+ end
233
239
  HexaPDF::Content::Parser.parse(page.contents, processor)
234
240
  page.contents = processor.result
235
241
  page[:Contents].set_filter(:FlateDecode)
@@ -269,16 +275,20 @@ module HexaPDF
269
275
  # Contains all found references
270
276
  attr_reader :used_references
271
277
 
272
- def initialize #:nodoc:
278
+ def initialize(&error_block) #:nodoc:
273
279
  @result = ''.b
274
280
  @serializer = HexaPDF::Serializer.new
275
281
  @used_references = []
282
+ @error_block = error_block
276
283
  end
277
284
 
278
285
  def process(op, operands) #:nodoc:
279
286
  @result << HexaPDF::Content::Operator::DEFAULT_OPERATORS[op].
280
287
  serialize(@serializer, *operands)
281
288
  @used_references << operands[0] if op == :Do
289
+ rescue StandardError => e
290
+ @error_block.call("Invalid content stream operation found: " \
291
+ "#{op}#{operands.inspect} (#{e.message})")
282
292
  end
283
293
 
284
294
  end
@@ -164,8 +164,13 @@ module HexaPDF
164
164
  def field_value=(str)
165
165
  if flagged?(:password)
166
166
  raise HexaPDF::Error, "Storing a field value for a password field is not allowed"
167
+ elsif comb_text_field? && !key?(:MaxLen)
168
+ raise HexaPDF::Error, "A comb text field need a valid /MaxLen value"
167
169
  end
168
170
  str = str.gsub(/[[:space:]]/, ' ') if str && concrete_field_type == :single_line_text_field
171
+ if key?(:MaxLen) && str && str.length > self[:MaxLen]
172
+ raise HexaPDF::Error, "Value exceeds maximum allowed length of #{self[:MaxLen]}"
173
+ end
169
174
  self[:V] = str
170
175
  update_widgets
171
176
  end
@@ -243,6 +248,9 @@ module HexaPDF
243
248
  if (max_len = self[:MaxLen]) && field_value && field_value.length > max_len
244
249
  yield("Text contents of field '#{full_field_name}' is too long")
245
250
  end
251
+ if comb_text_field? && !max_len
252
+ yield("Comb text field needs a value for /MaxLen")
253
+ end
246
254
  end
247
255
 
248
256
  end
@@ -90,13 +90,21 @@ module HexaPDF
90
90
  true
91
91
  end
92
92
 
93
- # Returns the root node of the page tree.
93
+ # Returns the root node of the page tree, creating it if needed.
94
94
  #
95
95
  # See: PageTreeNode
96
96
  def pages
97
97
  self[:Pages] ||= document.add({Type: :Pages})
98
98
  end
99
99
 
100
+ # Returns the name dictionary containing all name trees of the document, creating it if
101
+ # needed.
102
+ #
103
+ # See: Names
104
+ def names
105
+ self[:Names] ||= document.add({}, type: :XXNames)
106
+ end
107
+
100
108
  # Returns the main AcroForm object.
101
109
  #
102
110
  # * If an AcroForm object exists, the +create+ argument is not used.
@@ -63,6 +63,19 @@ module HexaPDF
63
63
  define_field :AlternatePresentations, type: NameTreeNode, version: '1.4'
64
64
  define_field :Renditions, type: NameTreeNode, version: '1.5'
65
65
 
66
+ # Returns the destinations name tree containing a mapping from names to destination objects.
67
+ #
68
+ # The name tree will be created if needed.
69
+ #
70
+ # Note: It is possible to use this name tree directly, but HexaPDF::Document::Destinations
71
+ # provides a much easier to work with convenience interface for working with destination
72
+ # objects.
73
+ #
74
+ # See: PDF1.7 s12.3.2
75
+ def destinations
76
+ self[:Dests] ||= document.add({}, type: NameTreeNode)
77
+ end
78
+
66
79
  end
67
80
 
68
81
  end
@@ -93,7 +93,8 @@ module HexaPDF
93
93
  #
94
94
  # See: Type::Trailer
95
95
  def trailer
96
- Trailer.each_field.with_object({}) do |(name, _data), hash|
96
+ trailer = {Type: :XRef}
97
+ Trailer.each_field.with_object(trailer) do |(name, _data), hash|
97
98
  hash[name] = value[name] if key?(name)
98
99
  end
99
100
  end
@@ -168,11 +168,13 @@ module HexaPDF
168
168
  index = find_in_leaf_node(node[container_name], key)
169
169
  if node[container_name][index] == key
170
170
  result = node[container_name][index + 1]
171
+ else
172
+ break
171
173
  end
172
174
  elsif node.key?(:Kids)
173
175
  index = find_in_intermediate_node(node[:Kids], key)
174
176
  node = node[:Kids][index]
175
- break unless key >= node[:Limits][0] && key <= node[:Limits][1]
177
+ break unless node && key >= node[:Limits][0] && key <= node[:Limits][1]
176
178
  else
177
179
  break
178
180
  end
@@ -37,6 +37,6 @@
37
37
  module HexaPDF
38
38
 
39
39
  # The version of HexaPDF.
40
- VERSION = '0.22.0'
40
+ VERSION = '0.23.0'
41
41
 
42
42
  end
@@ -74,6 +74,7 @@ module HexaPDF
74
74
 
75
75
  # Writes the document to the IO object and returns the last XRefSection written.
76
76
  def write
77
+ move_modified_objects_into_current_revision
77
78
  write_file_header
78
79
 
79
80
  pos = xref_section = nil
@@ -109,7 +110,7 @@ module HexaPDF
109
110
  @document.revisions.each do |rev|
110
111
  rev.each_modified_object {|obj| revision.send(:add_without_check, obj) }
111
112
  end
112
- _pos, xref_section = write_revision(revision, @document.revisions.parser.startxref_offset)
113
+ _pos, xref_section = write_revision(revision, parser.startxref_offset)
113
114
 
114
115
  xref_section
115
116
  end
@@ -123,6 +124,18 @@ module HexaPDF
123
124
  @io << "%PDF-#{@document.version}\n%\xCF\xEC\xFF\xE8\xD7\xCB\xCD\n"
124
125
  end
125
126
 
127
+ # Moves all modified objects into the current revision to avoid invalid references and such.
128
+ def move_modified_objects_into_current_revision
129
+ return if @document.revisions.count == 1
130
+
131
+ revision = @document.revisions.add
132
+ @document.revisions.all[0..-2].each do |rev|
133
+ rev.each_modified_object {|obj| revision.send(:add_without_check, obj) }
134
+ rev.reset_objects
135
+ end
136
+ @document.revisions.merge(-2..-1)
137
+ end
138
+
126
139
  # Writes the given revision.
127
140
  #
128
141
  # The optional +previous_xref_pos+ argument needs to contain the byte position of the previous
@@ -190,7 +203,7 @@ module HexaPDF
190
203
  end
191
204
 
192
205
  if (!object_streams.empty? || @use_xref_streams) && xref_stream.nil?
193
- xref_stream = @document.wrap({Type: :XRef}, oid: rev.next_free_oid)
206
+ xref_stream = @document.wrap({Type: :XRef}, oid: @document.revisions.next_oid)
194
207
  rev.add(xref_stream)
195
208
  end
196
209