pdf-core 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,16 @@
1
+ # encoding: utf-8
2
module PDF
  module Core
    # Marker type used to differentiate strings that must be encoded as
    # a *literal* string from those that can be encoded in the PDF
    # hexadecimal format.
    #
    # Some features of the PDF format appear to require that literal
    # strings be used. One such feature is the /Dest key of a link
    # annotation; if a hex encoded string is used there, the links
    # do not work (as tested in Mac OS X Preview, and Adobe Acrobat
    # Reader).
    #
    # The class adds no behaviour of its own; it is a plain String subclass.
    class LiteralString < String #:nodoc:
    end
  end
end
@@ -0,0 +1,177 @@
1
+ # encoding: utf-8
2
+
3
+ # name_tree.rb : Implements NameTree for PDF
4
+ #
5
+ # Copyright November 2008, Jamis Buck. All Rights Reserved.
6
+ #
7
+ # This is free software. Please see the LICENSE and COPYING files for details.
8
+ #
9
module PDF
  module Core
    module NameTree #:nodoc:
      # A node of a name tree. A node holds either Value entries (a leaf) or
      # other Node instances (an intermediate node), kept in ascending order.
      # When a node grows beyond +limit+ children it is split in two.
      class Node #:nodoc:
        attr_reader :children
        attr_reader :limit
        attr_reader :document
        attr_accessor :parent
        attr_accessor :ref

        # document:: object used to create references for new nodes; it must
        #            respond to +ref!+.
        # limit::    maximum number of children a node may hold before it
        #            is split.
        # parent::   the enclosing node, or nil for the root.
        def initialize(document, limit, parent=nil)
          @document = document
          @children = []
          @limit = limit
          @parent = parent
          @ref = nil
        end

        # True when the node has no children at all.
        def empty?
          children.empty?
        end

        # Number of Value entries stored in this node and all of its
        # descendants.
        def size
          return children.size if leaf?

          children.inject(0) { |sum, child| sum + child.size }
        end

        # A node is a leaf when it is empty or when it stores Value entries
        # directly.
        def leaf?
          children.empty? || children.first.is_a?(Value)
        end

        # Wraps +name+ and +value+ in a Value and inserts it into the tree.
        # Returns the new Value.
        def add(name, value)
          self << Value.new(name, value)
        end

        # Builds the hash representation of this node:
        # * :Limits - [least, greatest], only for nodes that have a parent
        # * :Names  - the Value entries, for a leaf
        # * :Kids   - the refs of the child nodes, for an intermediate node
        def to_hash
          hash = {}

          hash[:Limits] = [least, greatest] if parent
          if leaf?
            hash[:Names] = children
          else
            hash[:Kids] = children.map { |child| child.ref }
          end

          hash
        end

        # The smallest name stored below this node.
        def least
          leaf? ? children.first.name : children.first.least
        end

        # The largest name stored below this node.
        def greatest
          leaf? ? children.last.name : children.last.greatest
        end

        # Inserts +value+ in sorted position, descending into the first child
        # whose range can hold it (or the last child when none can).
        # Returns +value+.
        def <<(value)
          if children.empty?
            children << value
          elsif leaf?
            children.insert(insertion_point(value), value)
            split! if children.length > limit
          else
            fit = children.detect { |child| child >= value } || children.last
            fit << value
          end

          value
        end

        # True when this node is empty or its last child is >= +value+.
        def >=(value)
          children.empty? || children.last >= value
        end

        # Splits an over-full node. A node with a parent asks the parent to
        # do the split; the root instead pushes its children down into two
        # freshly created nodes and keeps those as its only children.
        def split!
          if parent
            parent.split(self)
          else
            left, right = new_node(self), new_node(self)
            split_children(self, left, right)
            children.replace([left, right])
          end
        end

        # Returns a deep copy of this node, without copying expensive things
        # like the ref to @document.
        #
        def deep_copy
          node = dup
          node.instance_variable_set("@children",
                                     Marshal.load(Marshal.dump(children)))
          node.instance_variable_set("@ref",
                                     node.ref ? node.ref.deep_copy : nil)
          node
        end

        protected

        # Splits the child +node+ in two, inserting the new sibling right
        # after it, and splits this node in turn if it became over-full.
        def split(node)
          new_child = new_node(self)
          split_children(node, node, new_child)
          index = children.index(node)
          children.insert(index+1, new_child)
          split! if children.length > limit
        end

        private

        # Creates a node sharing this node's document and limit, and assigns
        # it a reference obtained from the document.
        def new_node(parent=nil)
          node = Node.new(document, limit, parent)
          node.ref = document.ref!(node)
          node
        end

        # Distributes the children of +node+ between +left+ and +right+
        # (+left+ may be +node+ itself) and re-parents child nodes.
        def split_children(node, left, right)
          half = (node.limit+1)/2

          # Decide this before any children are replaced, because +left+ can
          # be the same object as +node+.
          reparent = !node.leaf?

          left_children, right_children = node.children[0...half], node.children[half..-1]

          left.children.replace(left_children)
          right.children.replace(right_children)

          if reparent
            left_children.each { |child| child.parent = left }
            right_children.each { |child| child.parent = right }
          end
        end

        # Index of the first child that is >= +value+, or the child count
        # when +value+ sorts after every child.
        def insertion_point(value)
          children.index { |child| child >= value } || children.length
        end
      end

      # A name/value pair stored in a leaf node. Values are ordered by name.
      class Value #:nodoc:
        include Comparable

        attr_reader :name
        attr_reader :value

        # The name is wrapped in a PDF::Core::LiteralString.
        def initialize(name, value)
          @name, @value = PDF::Core::LiteralString.new(name), value
        end

        # Compares by name. Returns nil when +leaf+ has no name, so that
        # Comparable#== answers false instead of raising NoMethodError.
        def <=>(leaf)
          return nil unless leaf.respond_to?(:name)

          name <=> leaf.name
        end

        def inspect
          "#<Value: #{name.inspect} : #{value.inspect}>"
        end

        def to_s
          "#{name} : #{value}"
        end
      end
    end
  end
end
@@ -0,0 +1,308 @@
1
+ # encoding: utf-8
2
+
3
+ # Implements PDF object repository
4
+ #
5
+ # Copyright August 2009, Brad Ediger. All Rights Reserved.
6
+ #
7
+ # This is free software. Please see the LICENSE and COPYING files for details.
8
+
9
+
10
+ require 'pdf/reader'
11
+
12
module PDF
  module Core
    # Repository of the indirect objects (PDF::Core::Reference instances)
    # that make up a document. Objects are kept in insertion order and can
    # be looked up by identifier.
    class ObjectStore #:nodoc:
      include Enumerable

      attr_reader :min_version

      BASE_OBJECTS = %w[info pages root].freeze

      # Options:
      # :template:: an IO-like object or filename of a PDF whose info and
      #             root object graphs are imported into this store.
      # :info::     data for the info object when none was loaded from a
      #             template.
      #
      # An info object, a catalog (root) and a page tree are created when
      # they are not already present.
      def initialize(opts = {})
        @objects = {}
        @identifiers = []

        load_file(opts[:template]) if opts[:template]

        @info ||= ref(opts[:info] || {}).identifier
        @root ||= ref(:Type => :Catalog).identifier
        if pages.nil?
          root.data[:Pages] = ref(:Type => :Pages, :Count => 0, :Kids => [])
        end
      end

      # Creates a new reference holding +data+, numbered one past the
      # current size, and adds it to the store.
      def ref(data, &block)
        push(size + 1, data, &block)
      end

      def info
        @objects[@info]
      end

      def root
        @objects[@root]
      end

      def pages
        root.data[:Pages]
      end

      def page_count
        pages.data[:Count]
      end

      # Adds the given reference to the store and returns the reference object.
      # If the object provided is not a PDF::Core::Reference, one is created from the
      # arguments provided.
      #
      def push(*args, &block)
        reference = if args.first.is_a?(PDF::Core::Reference)
                      args.first
                    else
                      PDF::Core::Reference.new(*args, &block)
                    end

        @objects[reference.identifier] = reference
        @identifiers << reference.identifier
        reference
      end

      alias_method :<<, :push

      # Yields every stored object in insertion order. Returns an Enumerator
      # when called without a block.
      def each
        return enum_for(:each) unless block_given?

        @identifiers.each do |id|
          yield @objects[id]
        end
      end

      def [](id)
        @objects[id]
      end

      def size
        @identifiers.size
      end
      alias_method :length, :size

      # Drops every object that is not reachable from the root or info
      # objects and renumbers the survivors from 1 so there are no gaps.
      def compact
        # Hold on to the objects themselves: their identifiers may change
        # during renumbering.
        root_object = root
        info_object = info

        # Clear live markers
        each { |o| o.live = false }

        # Recursively mark reachable objects live, starting from the roots
        # (the only objects referenced in the trailer)
        root_object.mark_live
        info_object.mark_live

        # Renumber live objects to eliminate gaps (shrink the xref table)
        if @objects.any? { |_, o| !o.live }
          new_id = 1
          new_objects = {}
          new_identifiers = []

          each do |obj|
            next unless obj.live

            obj.identifier = new_id
            new_objects[new_id] = obj
            new_identifiers << new_id
            new_id += 1
          end

          # Keep the stored root/info identifiers in step with the
          # renumbering; otherwise #root and #info would look up stale ids.
          @root = root_object.identifier
          @info = info_object.identifier

          @objects = new_objects
          @identifiers = new_identifiers
        end
      end

      # returns the object ID for a particular page in the document. Pages
      # are indexed starting at 1 (not 0!).
      #
      #   object_id_for_page(1)
      #   => 5
      #   object_id_for_page(10)
      #   => 87
      #   object_id_for_page(-11)
      #   => 17
      #
      def object_id_for_page(k)
        k -= 1 if k > 0
        flat_page_ids = get_page_objects(pages).flatten
        flat_page_ids[k]
      end

      # imports all objects required to render a page from another PDF. The
      # objects are added to the current object store, but NOT linked
      # anywhere.
      #
      # The object ID of the root Page object is returned, it's up to the
      # calling code to link that into the document structure somewhere. If
      # this isn't done the imported objects will just be removed when the
      # store is compacted.
      #
      # Imports nothing and returns nil if the requested page number doesn't
      # exist. page_num is 1 indexed, so 1 indicates the first page.
      #
      # Raises ArgumentError when +input+ is neither IO-like nor an existing
      # file, and PDF::Core::Errors::TemplateError when the source PDF
      # cannot be read.
      #
      def import_page(input, page_num)
        @loaded_objects = {}

        template_id = indexed_template(input, page_num)
        return template_id if template_id

        io = if input.respond_to?(:seek) && input.respond_to?(:read)
               input
             elsif File.file?(input.to_s)
               StringIO.new(File.binread(input.to_s))
             else
               raise ArgumentError, "input must be an IO-like object or a filename"
             end

        hash = indexed_hash(input, io)
        page_ref = hash.page_references[page_num - 1]

        if page_ref.nil?
          nil
        else
          index_template(input, page_num, load_object_graph(hash, page_ref).identifier)
        end

      rescue PDF::Reader::MalformedPDFError, PDF::Reader::InvalidObjectError => e
        msg = "Error reading template file. If you are sure it's a valid PDF, it may be a bug.\n#{e.message}"
        raise PDF::Core::Errors::TemplateError, msg
      rescue PDF::Reader::UnsupportedFeatureError
        msg = "Template file contains unsupported PDF features"
        raise PDF::Core::Errors::TemplateError, msg
      end

      private

      # An index for page templates so that their loaded object graph
      # can be reused without multiple loading
      def template_index
        @template_index ||= {}
      end

      # An index for the read object hash of a pdf template so that the
      # object hash does not need to be parsed multiple times when using
      # different pages of the pdf as page templates
      def hash_index
        @hash_index ||= {}
      end

      # returns the indexed object graph identifier for a template page if
      # it exists
      def indexed_template(input, page_number)
        key = indexing_key(input)
        template_index[key] && template_index[key][page_number]
      end

      # indexes the identifier for a page from a template
      def index_template(input, page_number, id)
        (template_index[indexing_key(input)] ||= {})[page_number] ||= id
      end

      # reads and indexes a new IO for a template
      # if the IO has been indexed already then the parsed object hash
      # is returned directly
      def indexed_hash(input, io)
        hash_index[indexing_key(input)] ||= PDF::Reader::ObjectHash.new(io)
      end

      # the index key for the input.
      # uses object_id so that both a string filename or an IO stream can be
      # indexed and reused provided the same object gets used in multiple page
      # template calls.
      def indexing_key(input)
        input.object_id
      end

      # returns a nested array of object IDs for all pages in this object store.
      # Objects that are neither a :Page nor a :Pages node yield nil.
      #
      def get_page_objects(obj)
        if obj.data[:Type] == :Page
          obj.identifier
        elsif obj.data[:Type] == :Pages
          obj.data[:Kids].map { |kid| get_page_objects(kid) }
        end
      end

      # takes a source PDF and uses it as a template for this document.
      #
      # Raises ArgumentError when +template+ is neither IO-like nor an
      # existing file, and PDF::Core::Errors::TemplateError when the file is
      # encrypted or cannot be read.
      #
      def load_file(template)
        unless (template.respond_to?(:seek) && template.respond_to?(:read)) ||
               File.file?(template)
          raise ArgumentError, "#{template} does not exist"
        end

        hash = PDF::Reader::ObjectHash.new(template)
        src_info = hash.trailer[:Info]
        src_root = hash.trailer[:Root]
        @min_version = hash.pdf_version.to_f

        if hash.trailer[:Encrypt]
          msg = "Template file is an encrypted PDF, it can't be used as a template"
          raise PDF::Core::Errors::TemplateError, msg
        end

        if src_info
          @info = load_object_graph(hash, src_info).identifier
        end

        if src_root
          @root = load_object_graph(hash, src_root).identifier
        end
      rescue PDF::Reader::MalformedPDFError, PDF::Reader::InvalidObjectError => e
        msg = "Error reading template file. If you are sure it's a valid PDF, it may be a bug.\n#{e.message}"
        raise PDF::Core::Errors::TemplateError, msg
      rescue PDF::Reader::UnsupportedFeatureError
        msg = "Template file contains unsupported PDF features"
        raise PDF::Core::Errors::TemplateError, msg
      end

      # recurse down an object graph from a source PDF, importing all the
      # indirect objects we find.
      #
      # hash is the PDF::Reader::ObjectHash to extract objects from, object is
      # the object to extract.
      #
      def load_object_graph(hash, object)
        @loaded_objects ||= {}
        case object
        when ::Hash
          object.each { |key, value| object[key] = load_object_graph(hash, value) }
          object
        when Array
          object.map { |item| load_object_graph(hash, item) }
        when PDF::Reader::Reference
          unless @loaded_objects.has_key?(object.id)
            # Register the placeholder before recursing so that a reference
            # back to this object finds it instead of loading it again.
            @loaded_objects[object.id] = ref(nil)
            new_obj = load_object_graph(hash, hash[object])
            if new_obj.kind_of?(PDF::Reader::Stream)
              stream_dict = load_object_graph(hash, new_obj.hash)
              @loaded_objects[object.id].data = stream_dict
              @loaded_objects[object.id] << new_obj.data
            else
              @loaded_objects[object.id].data = new_obj
            end
          end
          @loaded_objects[object.id]
        when PDF::Reader::Stream
          # Streams are matched ahead of the String branch so that they are
          # never wrapped in a ByteString.
          object
        when String
          is_utf8?(object) ? object : PDF::Core::ByteString.new(object)
        else
          object
        end
      end

      # Tags +str+ as UTF-8 in place and reports whether its bytes are valid
      # UTF-8. The in-place change is kept on purpose: load_object_graph
      # returns the same string afterwards.
      def is_utf8?(str)
        str.force_encoding(::Encoding::UTF_8)
        str.valid_encoding?
      end
    end
  end
end