hexapdf 0.22.0 → 0.23.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +50 -0
  3. data/lib/hexapdf/cli/form.rb +26 -3
  4. data/lib/hexapdf/cli/inspect.rb +12 -3
  5. data/lib/hexapdf/cli/modify.rb +23 -3
  6. data/lib/hexapdf/composer.rb +24 -2
  7. data/lib/hexapdf/document/destinations.rb +396 -0
  8. data/lib/hexapdf/document.rb +38 -89
  9. data/lib/hexapdf/layout/frame.rb +8 -9
  10. data/lib/hexapdf/layout/style.rb +280 -7
  11. data/lib/hexapdf/layout/text_box.rb +10 -2
  12. data/lib/hexapdf/layout/text_layouter.rb +6 -1
  13. data/lib/hexapdf/revision.rb +8 -1
  14. data/lib/hexapdf/revisions.rb +151 -50
  15. data/lib/hexapdf/task/optimize.rb +21 -11
  16. data/lib/hexapdf/type/acro_form/text_field.rb +8 -0
  17. data/lib/hexapdf/type/catalog.rb +9 -1
  18. data/lib/hexapdf/type/names.rb +13 -0
  19. data/lib/hexapdf/type/xref_stream.rb +2 -1
  20. data/lib/hexapdf/utils/sorted_tree_node.rb +3 -1
  21. data/lib/hexapdf/version.rb +1 -1
  22. data/lib/hexapdf/writer.rb +15 -2
  23. data/test/hexapdf/document/test_destinations.rb +338 -0
  24. data/test/hexapdf/encryption/test_security_handler.rb +2 -2
  25. data/test/hexapdf/layout/test_frame.rb +15 -1
  26. data/test/hexapdf/layout/test_text_box.rb +16 -0
  27. data/test/hexapdf/layout/test_text_layouter.rb +7 -0
  28. data/test/hexapdf/task/test_optimize.rb +17 -4
  29. data/test/hexapdf/test_composer.rb +24 -1
  30. data/test/hexapdf/test_document.rb +30 -133
  31. data/test/hexapdf/test_parser.rb +1 -1
  32. data/test/hexapdf/test_revision.rb +14 -0
  33. data/test/hexapdf/test_revisions.rb +137 -29
  34. data/test/hexapdf/test_writer.rb +43 -14
  35. data/test/hexapdf/type/acro_form/test_text_field.rb +17 -0
  36. data/test/hexapdf/type/test_catalog.rb +8 -0
  37. data/test/hexapdf/type/test_names.rb +20 -0
  38. data/test/hexapdf/type/test_xref_stream.rb +2 -1
  39. data/test/hexapdf/utils/test_sorted_tree_node.rb +11 -1
  40. metadata +5 -2
@@ -51,6 +51,10 @@ module HexaPDF
51
51
  # the newest revision the highest index. This is also the order in which the revisions get
52
52
  # written.
53
53
  #
54
+ # *Important*: It is possible to manipulate the individual revisions and their objects oneself but
55
+ # this should only be done if one is familiar with the inner workings of HexaPDF. Otherwise it is
56
+ # best to use the convenience methods of this class to create, access or delete indirect objects.
57
+ #
54
58
  # See: PDF1.7 s7.5.6, HexaPDF::Revision
55
59
  class Revisions
56
60
 
@@ -68,27 +72,26 @@ module HexaPDF
68
72
 
69
73
  revisions = []
70
74
  begin
71
- xref_section, trailer = parser.load_revision(parser.startxref_offset)
72
- revisions << Revision.new(document.wrap(trailer, type: :XXTrailer),
73
- xref_section: xref_section, loader: object_loader)
74
- seen_xref_offsets = {parser.startxref_offset => true}
75
+ offset = parser.startxref_offset
76
+ seen_xref_offsets = {}
75
77
 
76
- while (prev = revisions[0].trailer.value[:Prev]) &&
77
- !seen_xref_offsets.key?(prev)
78
+ while offset && !seen_xref_offsets.key?(offset)
78
79
  # PDF1.7 s7.5.5 states that :Prev needs to be indirect, Adobe's reference 3.4.4 says it
79
80
  # should be direct. Adobe's POV is followed here. Same with :XRefStm.
80
- xref_section, trailer = parser.load_revision(prev)
81
- seen_xref_offsets[prev] = true
81
+ xref_section, trailer = parser.load_revision(offset)
82
+ seen_xref_offsets[offset] = true
82
83
 
83
- stm = revisions[0].trailer.value[:XRefStm]
84
+ stm = trailer[:XRefStm]
84
85
  if stm && !seen_xref_offsets.key?(stm)
85
86
  stm_xref_section, = parser.load_revision(stm)
86
- xref_section.merge!(stm_xref_section)
87
+ stm_xref_section.merge!(xref_section)
88
+ xref_section = stm_xref_section
87
89
  seen_xref_offsets[stm] = true
88
90
  end
89
91
 
90
92
  revisions.unshift(Revision.new(document.wrap(trailer, type: :XXTrailer),
91
93
  xref_section: xref_section, loader: object_loader))
94
+ offset = trailer[:Prev]
92
95
  end
93
96
  rescue HexaPDF::MalformedPDFError
94
97
  reconstructed_revision = parser.reconstructed_revision
@@ -133,23 +136,154 @@ module HexaPDF
133
136
  end
134
137
  end
135
138
 
136
- # Returns the revision at the specified index.
137
- def revision(index)
138
- @revisions[index]
139
+ # Returns the next object identifier that should be used when adding a new object.
140
+ def next_oid
141
+ @revisions.map(&:next_free_oid).max
142
+ end
143
+
144
+ # :call-seq:
145
+ # revisions.object(ref) -> obj or nil
146
+ # revisions.object(oid) -> obj or nil
147
+ #
148
+ # Returns the current version of the indirect object for the given exact reference or for the
149
+ # given object number.
150
+ #
151
+ # For references to unknown objects, +nil+ is returned but free objects are represented by a
152
+ # PDF Null object, not by +nil+!
153
+ #
154
+ # See: PDF1.7 s7.3.9
155
+ def object(ref)
156
+ i = @revisions.size - 1
157
+ while i >= 0
158
+ if (result = @revisions[i].object(ref))
159
+ return result
160
+ end
161
+ i -= 1
162
+ end
163
+ nil
164
+ end
165
+
166
+ # :call-seq:
167
+ # revisions.object?(ref) -> true or false
168
+ # revisions.object?(oid) -> true or false
169
+ #
170
+ # Returns +true+ if one of the revisions contains an indirect object for the given exact
171
+ # reference or for the given object number.
172
+ #
173
+ # Even though this method might return +true+ for some references, #object may return +nil+
174
+ # because this method takes *all* revisions into account.
175
+ def object?(ref)
176
+ @revisions.any? {|rev| rev.object?(ref) }
177
+ end
178
+
179
+ # :call-seq:
180
+ # revisions.add_object(object) -> object
181
+ #
182
+ # Adds the given HexaPDF::Object to the current revision and returns it.
183
+ #
184
+ # If +object+ is a direct object, an object number is automatically assigned.
185
+ def add_object(obj)
186
+ if obj.indirect? && (rev_obj = current.object(obj.oid))
187
+ if rev_obj.data == obj.data
188
+ return obj
189
+ else
190
+ raise HexaPDF::Error, "Can't add object because there is already " \
191
+ "an object with object number #{obj.oid}"
192
+ end
193
+ end
194
+
195
+ obj.oid = next_oid unless obj.indirect?
196
+ current.add(obj)
197
+ end
198
+
199
+ # :call-seq:
200
+ # revisions.delete_object(ref)
201
+ # revisions.delete_object(oid)
202
+ #
203
+ # Deletes the indirect object specified by an exact reference or by an object number.
204
+ def delete_object(ref)
205
+ @revisions.reverse_each do |rev|
206
+ if rev.object?(ref)
207
+ rev.delete(ref)
208
+ break
209
+ end
210
+ end
211
+ end
212
+
213
+ # :call-seq:
214
+ # revisions.each_object(only_current: true, only_loaded: false) {|obj| block } -> revisions
215
+ # revisions.each_object(only_current: true, only_loaded: false) {|obj, rev| block } -> revisions
216
+ # revisions.each_object(only_current: true, only_loaded: false) -> Enumerator
217
+ #
218
+ # Yields every object and optionally the revision it is in.
219
+ #
220
+ # If +only_loaded+ is +true+, only the already loaded objects of the PDF document are yielded.
221
+ # This only matters when the document instance was created from an existing PDF document.
222
+ #
223
+ # By default, only the current version of each object is returned which implies that each object
224
+ # number is yielded exactly once. If the +only_current+ option is +false+, all stored objects
225
+ # from newest to oldest are returned, not only the current version of each object.
226
+ #
227
+ # The +only_current+ option can make a difference because the document can contain multiple
228
+ # revisions:
229
+ #
230
+ # * Multiple revisions may contain objects with the same object and generation numbers, e.g.
231
+ # two (different) objects with oid/gen [3,0].
232
+ #
233
+ # * Additionally, there may also be objects with the same object number but different
234
+ # generation numbers in different revisions, e.g. one object with oid/gen [3,0] and one with
235
+ # oid/gen [3,1].
236
+ def each_object(only_current: true, only_loaded: false, &block)
237
+ unless block_given?
238
+ return to_enum(__method__, only_current: only_current, only_loaded: only_loaded)
239
+ end
240
+
241
+ yield_rev = (block.arity == 2)
242
+ oids = {}
243
+ @revisions.reverse_each do |rev|
244
+ rev.each(only_loaded: only_loaded) do |obj|
245
+ next if only_current && oids.include?(obj.oid)
246
+ yield_rev ? yield(obj, rev) : yield(obj)
247
+ oids[obj.oid] = true
248
+ end
249
+ end
250
+ self
139
251
  end
140
- alias [] revision
141
252
 
142
253
  # Returns the current revision.
254
+ #
255
+ # *Note*: This method should only be used if one is familiar with the inner workings of HexaPDF
256
+ # and the PDF specification.
143
257
  def current
144
258
  @revisions.last
145
259
  end
146
260
 
147
- # Returns the number of HexaPDF::Revision objects managed by this object.
148
- def size
149
- @revisions.size
261
+ # Returns a list of all revisions.
262
+ #
263
+ # *Note*: This method should only be used if one is familiar with the inner workings of HexaPDF
264
+ # and the PDF specification.
265
+ def all
266
+ @revisions
267
+ end
268
+
269
+ # :call-seq:
270
+ # revisions.each {|rev| block } -> revisions
271
+ # revisions.each -> Enumerator
272
+ #
273
+ # Iterates over all revisions from oldest to current one.
274
+ #
275
+ # *Note*: This method should only be used if one is familiar with the inner workings of HexaPDF
276
+ # and the PDF specification.
277
+ def each(&block)
278
+ return to_enum(__method__) unless block_given?
279
+ @revisions.each(&block)
280
+ self
150
281
  end
151
282
 
152
283
  # Adds a new empty revision to the document and returns it.
284
+ #
285
+ # *Note*: This method should only be used if one is familiar with the inner workings of HexaPDF
286
+ # and the PDF specification.
153
287
  def add
154
288
  if @revisions.empty?
155
289
  trailer = {}
@@ -164,28 +298,6 @@ module HexaPDF
164
298
  rev
165
299
  end
166
300
 
167
- # :call-seq:
168
- # revisions.delete(index) -> rev or nil
169
- # revisions.delete(oid) -> rev or nil
170
- #
171
- # Deletes a revision from the document, either by index or by specifying the revision object
172
- # itself.
173
- #
174
- # Returns the deleted revision object, or +nil+ if the index was out of range or no matching
175
- # revision was found.
176
- #
177
- # Regarding the index: The oldest revision has index 0 and the current revision the highest
178
- # index!
179
- def delete(index_or_rev)
180
- if @revisions.length == 1
181
- raise HexaPDF::Error, "A document must have a least one revision, can't delete last one"
182
- elsif index_or_rev.kind_of?(Integer)
183
- @revisions.delete_at(index_or_rev)
184
- else
185
- @revisions.delete(index_or_rev)
186
- end
187
- end
188
-
189
301
  # :call-seq:
190
302
  # revisions.merge(range = 0..-1) -> revisions
191
303
  #
@@ -206,17 +318,6 @@ module HexaPDF
206
318
  self
207
319
  end
208
320
 
209
- # :call-seq:
210
- # revisions.each {|rev| block } -> revisions
211
- # revisions.each -> Enumerator
212
- #
213
- # Iterates over all revisions from oldest to current one.
214
- def each(&block)
215
- return to_enum(__method__) unless block_given?
216
- @revisions.each(&block)
217
- self
218
- end
219
-
220
321
  end
221
322
 
222
323
  end
@@ -106,7 +106,7 @@ module HexaPDF
106
106
  rev = doc.revisions.add
107
107
 
108
108
  oid = 1
109
- doc.revisions[0].each do |obj|
109
+ doc.revisions.all[0].each do |obj|
110
110
  if obj.null? || unused.include?(obj) || (obj.type == :ObjStm) ||
111
111
  (obj.type == :XRef && xref_streams != :preserve)
112
112
  obj.data.value = nil
@@ -119,7 +119,7 @@ module HexaPDF
119
119
  rev.add(obj)
120
120
  oid += 1
121
121
  end
122
- doc.revisions.delete(0)
122
+ doc.revisions.all.delete_at(0)
123
123
 
124
124
  if object_streams == :generate
125
125
  process_object_streams(doc, :generate, xref_streams)
@@ -134,7 +134,7 @@ module HexaPDF
134
134
  def self.process_object_streams(doc, method, xref_streams)
135
135
  case method
136
136
  when :delete
137
- doc.revisions.each_with_index do |rev, rev_index|
137
+ doc.revisions.each do |rev|
138
138
  xref_stream = false
139
139
  objects_to_delete = []
140
140
  rev.each do |obj|
@@ -150,11 +150,11 @@ module HexaPDF
150
150
  end
151
151
  objects_to_delete.each {|obj| rev.delete(obj) }
152
152
  if xref_streams == :generate && !xref_stream
153
- doc.add({Type: :XRef}, revision: rev_index)
153
+ rev.add(doc.wrap({Type: :XRef}, oid: doc.revisions.next_oid))
154
154
  end
155
155
  end
156
156
  when :generate
157
- doc.revisions.each_with_index do |rev, rev_index|
157
+ doc.revisions.each do |rev|
158
158
  xref_stream = false
159
159
  count = 0
160
160
  objstms = [doc.wrap({Type: :ObjStm})]
@@ -178,8 +178,11 @@ module HexaPDF
178
178
  end
179
179
  end
180
180
  old_objstms.each {|objstm| rev.delete(objstm) }
181
- objstms.each {|objstm| doc.add(objstm, revision: rev_index) }
182
- doc.add({Type: :XRef}, revision: rev_index) unless xref_stream
181
+ objstms.each do |objstm|
182
+ objstm.data.oid = doc.revisions.next_oid
183
+ rev.add(objstm)
184
+ end
185
+ rev.add(doc.wrap({Type: :XRef}, oid: doc.revisions.next_oid)) unless xref_stream
183
186
  end
184
187
  end
185
188
  end
@@ -198,13 +201,13 @@ module HexaPDF
198
201
  end
199
202
  end
200
203
  when :generate
201
- doc.revisions.each_with_index do |rev, rev_index|
204
+ doc.revisions.each do |rev|
202
205
  xref_stream = false
203
206
  rev.each do |obj|
204
207
  xref_stream = true if obj.type == :XRef
205
208
  delete_fields_with_defaults(obj)
206
209
  end
207
- doc.add({Type: :XRef}, revision: rev_index) unless xref_stream
210
+ rev.add(doc.wrap({Type: :XRef}, oid: doc.revisions.next_oid)) unless xref_stream
208
211
  end
209
212
  end
210
213
  end
@@ -229,7 +232,10 @@ module HexaPDF
229
232
  def self.compress_pages(doc)
230
233
  used_refs = {}
231
234
  doc.pages.each do |page|
232
- processor = SerializationProcessor.new
235
+ processor = SerializationProcessor.new do |error_message|
236
+ doc.config['parser.on_correctable_error'].call(doc, error_message, 0) &&
237
+ raise(HexaPDF::Error, error_message)
238
+ end
233
239
  HexaPDF::Content::Parser.parse(page.contents, processor)
234
240
  page.contents = processor.result
235
241
  page[:Contents].set_filter(:FlateDecode)
@@ -269,16 +275,20 @@ module HexaPDF
269
275
  # Contains all found references
270
276
  attr_reader :used_references
271
277
 
272
- def initialize #:nodoc:
278
+ def initialize(&error_block) #:nodoc:
273
279
  @result = ''.b
274
280
  @serializer = HexaPDF::Serializer.new
275
281
  @used_references = []
282
+ @error_block = error_block
276
283
  end
277
284
 
278
285
  def process(op, operands) #:nodoc:
279
286
  @result << HexaPDF::Content::Operator::DEFAULT_OPERATORS[op].
280
287
  serialize(@serializer, *operands)
281
288
  @used_references << operands[0] if op == :Do
289
+ rescue StandardError => e
290
+ @error_block.call("Invalid content stream operation found: " \
291
+ "#{op}#{operands.inspect} (#{e.message})")
282
292
  end
283
293
 
284
294
  end
@@ -164,8 +164,13 @@ module HexaPDF
164
164
  def field_value=(str)
165
165
  if flagged?(:password)
166
166
  raise HexaPDF::Error, "Storing a field value for a password field is not allowed"
167
+ elsif comb_text_field? && !key?(:MaxLen)
168
+ raise HexaPDF::Error, "A comb text field need a valid /MaxLen value"
167
169
  end
168
170
  str = str.gsub(/[[:space:]]/, ' ') if str && concrete_field_type == :single_line_text_field
171
+ if key?(:MaxLen) && str && str.length > self[:MaxLen]
172
+ raise HexaPDF::Error, "Value exceeds maximum allowed length of #{self[:MaxLen]}"
173
+ end
169
174
  self[:V] = str
170
175
  update_widgets
171
176
  end
@@ -243,6 +248,9 @@ module HexaPDF
243
248
  if (max_len = self[:MaxLen]) && field_value && field_value.length > max_len
244
249
  yield("Text contents of field '#{full_field_name}' is too long")
245
250
  end
251
+ if comb_text_field? && !max_len
252
+ yield("Comb text field needs a value for /MaxLen")
253
+ end
246
254
  end
247
255
 
248
256
  end
@@ -90,13 +90,21 @@ module HexaPDF
90
90
  true
91
91
  end
92
92
 
93
- # Returns the root node of the page tree.
93
+ # Returns the root node of the page tree, creating it if needed.
94
94
  #
95
95
  # See: PageTreeNode
96
96
  def pages
97
97
  self[:Pages] ||= document.add({Type: :Pages})
98
98
  end
99
99
 
100
+ # Returns the name dictionary containing all name trees of the document, creating it if
101
+ # needed.
102
+ #
103
+ # See: Names
104
+ def names
105
+ self[:Names] ||= document.add({}, type: :XXNames)
106
+ end
107
+
100
108
  # Returns the main AcroForm object.
101
109
  #
102
110
  # * If an AcroForm object exists, the +create+ argument is not used.
@@ -63,6 +63,19 @@ module HexaPDF
63
63
  define_field :AlternatePresentations, type: NameTreeNode, version: '1.4'
64
64
  define_field :Renditions, type: NameTreeNode, version: '1.5'
65
65
 
66
+ # Returns the destinations name tree containing a mapping from names to destination objects.
67
+ #
68
+ # The name tree will be created if needed.
69
+ #
70
+ # Note: It is possible to use this name tree directly, but HexaPDF::Document::Destinations
71
+ # provides a much easier to work with convenience interface for working with destination
72
+ # objects.
73
+ #
74
+ # See: PDF1.7 s12.3.2
75
+ def destinations
76
+ self[:Dests] ||= document.add({}, type: NameTreeNode)
77
+ end
78
+
66
79
  end
67
80
 
68
81
  end
@@ -93,7 +93,8 @@ module HexaPDF
93
93
  #
94
94
  # See: Type::Trailer
95
95
  def trailer
96
- Trailer.each_field.with_object({}) do |(name, _data), hash|
96
+ trailer = {Type: :XRef}
97
+ Trailer.each_field.with_object(trailer) do |(name, _data), hash|
97
98
  hash[name] = value[name] if key?(name)
98
99
  end
99
100
  end
@@ -168,11 +168,13 @@ module HexaPDF
168
168
  index = find_in_leaf_node(node[container_name], key)
169
169
  if node[container_name][index] == key
170
170
  result = node[container_name][index + 1]
171
+ else
172
+ break
171
173
  end
172
174
  elsif node.key?(:Kids)
173
175
  index = find_in_intermediate_node(node[:Kids], key)
174
176
  node = node[:Kids][index]
175
- break unless key >= node[:Limits][0] && key <= node[:Limits][1]
177
+ break unless node && key >= node[:Limits][0] && key <= node[:Limits][1]
176
178
  else
177
179
  break
178
180
  end
@@ -37,6 +37,6 @@
37
37
  module HexaPDF
38
38
 
39
39
  # The version of HexaPDF.
40
- VERSION = '0.22.0'
40
+ VERSION = '0.23.0'
41
41
 
42
42
  end
@@ -74,6 +74,7 @@ module HexaPDF
74
74
 
75
75
  # Writes the document to the IO object and returns the last XRefSection written.
76
76
  def write
77
+ move_modified_objects_into_current_revision
77
78
  write_file_header
78
79
 
79
80
  pos = xref_section = nil
@@ -109,7 +110,7 @@ module HexaPDF
109
110
  @document.revisions.each do |rev|
110
111
  rev.each_modified_object {|obj| revision.send(:add_without_check, obj) }
111
112
  end
112
- _pos, xref_section = write_revision(revision, @document.revisions.parser.startxref_offset)
113
+ _pos, xref_section = write_revision(revision, parser.startxref_offset)
113
114
 
114
115
  xref_section
115
116
  end
@@ -123,6 +124,18 @@ module HexaPDF
123
124
  @io << "%PDF-#{@document.version}\n%\xCF\xEC\xFF\xE8\xD7\xCB\xCD\n"
124
125
  end
125
126
 
127
+ # Moves all modified objects into the current revision to avoid invalid references and such.
128
+ def move_modified_objects_into_current_revision
129
+ return if @document.revisions.count == 1
130
+
131
+ revision = @document.revisions.add
132
+ @document.revisions.all[0..-2].each do |rev|
133
+ rev.each_modified_object {|obj| revision.send(:add_without_check, obj) }
134
+ rev.reset_objects
135
+ end
136
+ @document.revisions.merge(-2..-1)
137
+ end
138
+
126
139
  # Writes the given revision.
127
140
  #
128
141
  # The optional +previous_xref_pos+ argument needs to contain the byte position of the previous
@@ -190,7 +203,7 @@ module HexaPDF
190
203
  end
191
204
 
192
205
  if (!object_streams.empty? || @use_xref_streams) && xref_stream.nil?
193
- xref_stream = @document.wrap({Type: :XRef}, oid: rev.next_free_oid)
206
+ xref_stream = @document.wrap({Type: :XRef}, oid: @document.revisions.next_oid)
194
207
  rev.add(xref_stream)
195
208
  end
196
209