pdf-core 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,16 @@
1
+ # encoding: utf-8
2
+ module PDF
3
+ module Core
4
# Marker subclass of String: wrapping a string in LiteralString tells the
# PDF serializer to write it as a *literal* string rather than in the
# PDF hexadecimal string format.
#
# Some parts of the PDF format appear to require literal strings. The
# /Dest key of a link annotation is one example: when a hex encoded
# string is used there the links do not work (as tested in Mac OS X
# Preview, and Adobe Acrobat Reader).
class LiteralString < String #:nodoc:
end
15
+ end
16
+ end
@@ -0,0 +1,177 @@
1
+ # encoding: utf-8
2
+
3
+ # name_tree.rb : Implements NameTree for PDF
4
+ #
5
+ # Copyright November 2008, Jamis Buck. All Rights Reserved.
6
+ #
7
+ # This is free software. Please see the LICENSE and COPYING files for details.
8
+ #
9
+ module PDF
10
+ module Core
11
+ module NameTree #:nodoc:
12
class Node #:nodoc:
  attr_reader :children, :limit, :document
  attr_accessor :parent, :ref

  # Creates an empty node. +limit+ is the maximum number of children the
  # node may hold before it is split; +parent+ is nil for the tree root.
  def initialize(document, limit, parent=nil)
    @document, @limit, @parent = document, limit, parent
    @children = []
    @ref = nil
  end

  # True when the node has no children at all.
  def empty?
    children.empty?
  end

  # Number of Value entries stored in this node and all nodes below it.
  def size
    if leaf?
      children.size
    else
      children.inject(0) { |total, kid| total + kid.size }
    end
  end

  # A node is a leaf when it is empty or holds Value entries directly
  # (as opposed to holding other nodes).
  def leaf?
    return true if children.empty?
    children.first.is_a?(Value)
  end

  # Wraps name/value in a Value and inserts it into the tree.
  # Returns the new Value.
  def add(name, value)
    entry = Value.new(name, value)
    self << entry
  end

  # Builds the dictionary describing this node: :Limits (only for nodes
  # that have a parent) plus either :Names for a leaf or :Kids (the refs
  # of the child nodes) for an intermediate node.
  def to_hash
    dict = {}
    dict[:Limits] = [least, greatest] if parent

    if leaf?
      dict[:Names] = children
    else
      dict[:Kids] = children.map { |kid| kid.ref }
    end

    dict
  end

  # Smallest name stored in or below this node.
  def least
    head = children.first
    leaf? ? head.name : head.least
  end

  # Largest name stored in or below this node.
  def greatest
    tail = children.last
    leaf? ? tail.name : tail.greatest
  end

  # Inserts +value+ in sorted position. Leaves that grow beyond +limit+
  # are split; intermediate nodes delegate to the first child able to
  # take the value, falling back to the last child. Returns +value+.
  def <<(value)
    if children.empty?
      children << value
    elsif leaf?
      position = insertion_point(value)
      children.insert(position, value)
      split! if children.length > limit
    else
      target = children.detect { |kid| kid >= value } || children.last
      target << value
    end

    value
  end

  # True when the node is empty or its last child compares >= +value+.
  def >=(value)
    return true if children.empty?
    children.last >= value
  end

  # Splits an over-full node. A node with a parent asks the parent to
  # perform the split; the root instead pushes its children down into
  # two freshly created child nodes.
  def split!
    return parent.split(self) if parent

    left = new_node(self)
    right = new_node(self)
    split_children(self, left, right)
    children.replace([left, right])
  end

  # Returns a deep copy of this node, without copying expensive things
  # like the ref to @document.
  #
  def deep_copy
    copy = dup
    cloned_children = Marshal.load(Marshal.dump(children))
    copy.instance_variable_set("@children", cloned_children)
    copy.instance_variable_set("@ref", copy.ref ? copy.ref.deep_copy : nil)
    copy
  end

  protected

  # Splits the child +node+ in two, placing the new sibling directly
  # after it, then splits this node as well if it has become over-full.
  def split(node)
    sibling = new_node(self)
    split_children(node, node, sibling)
    children.insert(children.index(node) + 1, sibling)
    split! if children.length > limit
  end

  private

  # Creates a node sharing this node's document and limit, and registers
  # it with the document via ref!.
  def new_node(parent=nil)
    Node.new(document, limit, parent).tap do |fresh|
      fresh.ref = document.ref!(fresh)
    end
  end

  # Distributes the children of +node+ between +left+ and +right+ (the
  # first (limit+1)/2 go left) and re-parents them when they are nodes.
  def split_children(node, left, right)
    midpoint = (node.limit + 1) / 2
    lower = node.children[0...midpoint]
    upper = node.children[midpoint..-1]

    left.children.replace(lower)
    right.children.replace(upper)

    return if node.leaf?

    lower.each { |kid| kid.parent = left }
    upper.each { |kid| kid.parent = right }
  end

  # Index of the first child that compares >= +value+, or the end of the
  # child list when there is none.
  def insertion_point(value)
    children.index { |kid| kid >= value } || children.length
  end
end
152
+
153
# A single name/value entry of a name tree. Entries are ordered by name.
class Value #:nodoc:
  include Comparable

  attr_reader :name, :value

  # The name is stored as a PDF::Core::LiteralString; the value is kept
  # as given.
  def initialize(name, value)
    @name = PDF::Core::LiteralString.new(name)
    @value = value
  end

  # Entries compare by their names only.
  def <=>(leaf)
    name <=> leaf.name
  end

  def inspect
    "#<Value: #{name.inspect} : #{value.inspect}>"
  end

  def to_s
    "#{name} : #{value}"
  end
end
175
+ end
176
+ end
177
+ end
@@ -0,0 +1,308 @@
1
+ # encoding: utf-8
2
+
3
+ # Implements PDF object repository
4
+ #
5
+ # Copyright August 2009, Brad Ediger. All Rights Reserved.
6
+ #
7
+ # This is free software. Please see the LICENSE and COPYING files for details.
8
+
9
+
10
+ require 'pdf/reader'
11
+
12
+ module PDF
13
+ module Core
14
# Repository of the indirect objects (PDF::Core::Reference instances) that
# make up a document. Objects are kept in insertion order and can be looked
# up by identifier.
class ObjectStore #:nodoc:
  include Enumerable

  # PDF version of the template this store was loaded from (as a Float).
  # Only set when a template was given.
  attr_reader :min_version

  BASE_OBJECTS = %w[info pages root].freeze

  # Builds a store, optionally seeded from a template.
  #
  # opts[:template]:: filename or IO-like object of a PDF to load first
  # opts[:info]::     data for the info object, used when the template
  #                   did not provide one
  #
  # Info, root (catalog) and pages objects are created when missing.
  def initialize(opts = {})
    @objects = {}
    @identifiers = []

    load_file(opts[:template]) if opts[:template]

    @info ||= ref(opts[:info] || {}).identifier
    @root ||= ref(:Type => :Catalog).identifier
    if pages.nil?
      root.data[:Pages] = ref(:Type => :Pages, :Count => 0, :Kids => [])
    end
  end

  # Creates a new reference holding +data+ under the next free identifier
  # (size + 1), stores it and returns it.
  def ref(data, &block)
    push(size + 1, data, &block)
  end

  def info
    @objects[@info]
  end

  def root
    @objects[@root]
  end

  def pages
    root.data[:Pages]
  end

  def page_count
    pages.data[:Count]
  end

  # Adds the given reference to the store and returns the reference object.
  # If the object provided is not a PDF::Core::Reference, one is created from the
  # arguments provided.
  #
  # Pushing a reference whose identifier is already stored replaces the
  # stored object but keeps its position, so #size and #each never see the
  # same identifier twice.
  #
  def push(*args, &block)
    reference = if args.first.is_a?(PDF::Core::Reference)
      args.first
    else
      PDF::Core::Reference.new(*args, &block)
    end

    id = reference.identifier
    @identifiers << id unless @objects.key?(id)
    @objects[id] = reference
    reference
  end

  alias_method :<<, :push

  # Yields every stored reference in insertion order.
  def each
    @identifiers.each do |id|
      yield @objects[id]
    end
  end

  def [](id)
    @objects[id]
  end

  def size
    @identifiers.size
  end
  alias_method :length, :size

  # Drops every object that is not reachable from the root or info objects
  # and renumbers the survivors from 1 so the identifiers have no gaps.
  def compact
    # Clear live markers
    each { |o| o.live = false }

    # Recursively mark reachable objects live, starting from the roots
    # (the only objects referenced in the trailer)
    root.mark_live
    info.mark_live

    # Renumber live objects to eliminate gaps (shrink the xref table)
    if @objects.any? { |_, o| !o.live }
      new_id = 1
      new_objects = {}
      new_identifiers = []

      each do |obj|
        if obj.live
          obj.identifier = new_id
          new_objects[new_id] = obj
          new_identifiers << new_id
          new_id += 1
        end
      end

      @objects = new_objects
      @identifiers = new_identifiers
    end
  end

  # returns the object ID for a particular page in the document. Pages
  # are indexed starting at 1 (not 0!).
  #
  #   object_id_for_page(1)
  #   => 5
  #   object_id_for_page(10)
  #   => 87
  #   object_id_for_page(-11)
  #   => 17
  #
  def object_id_for_page(k)
    k -= 1 if k > 0
    flat_page_ids = get_page_objects(pages).flatten
    flat_page_ids[k]
  end

  # imports all objects required to render a page from another PDF. The
  # objects are added to the current object store, but NOT linked
  # anywhere.
  #
  # The object ID of the root Page object is returned, it's up to the
  # calling code to link that into the document structure somewhere. If
  # this isn't done the imported objects will just be removed when the
  # store is compacted.
  #
  # Imports nothing and returns nil if the requested page number doesn't
  # exist. page_num is 1 indexed, so 1 indicates the first page.
  #
  # Raises ArgumentError when +input+ is neither an IO-like object nor the
  # name of an existing file, and PDF::Core::Errors::TemplateError when
  # PDF::Reader cannot process the source.
  #
  def import_page(input, page_num)
    translate_reader_errors do
      @loaded_objects = {}
      template_id = indexed_template(input, page_num)
      next template_id if template_id

      io = if io_like?(input)
        input
      elsif File.file?(input.to_s)
        StringIO.new(File.binread(input.to_s))
      else
        raise ArgumentError, "input must be an IO-like object or a filename"
      end

      hash = indexed_hash(input, io)

      # Page numbers start at 1. Anything lower must not be turned into a
      # negative array index, which would silently pick a page counted
      # from the end of the document.
      page_ref = page_num < 1 ? nil : hash.page_references[page_num - 1]
      next nil if page_ref.nil?

      index_template(input, page_num, load_object_graph(hash, page_ref).identifier)
    end
  end

  private

  # Runs the block and re-raises the PDF::Reader errors it may produce as
  # PDF::Core::Errors::TemplateError. Returns the block's value.
  def translate_reader_errors
    yield
  rescue PDF::Reader::MalformedPDFError, PDF::Reader::InvalidObjectError => e
    msg = "Error reading template file. If you are sure it's a valid PDF, it may be a bug.\n#{e.message}"
    raise PDF::Core::Errors::TemplateError, msg
  rescue PDF::Reader::UnsupportedFeatureError
    msg = "Template file contains unsupported PDF features"
    raise PDF::Core::Errors::TemplateError, msg
  end

  # True when +source+ responds to both #seek and #read.
  def io_like?(source)
    source.respond_to?(:seek) && source.respond_to?(:read)
  end

  # An index for page templates so that their loaded object graph
  # can be reused without multiple loading
  def template_index
    @template_index ||= {}
  end

  # An index for the read object hash of a pdf template so that the
  # object hash does not need to be parsed multiple times when using
  # different pages of the pdf as page templates
  def hash_index
    @hash_index ||= {}
  end

  # returns the indexed object graph identifier for a template page if
  # it exists
  def indexed_template(input, page_number)
    key = indexing_key(input)
    template_index[key] && template_index[key][page_number]
  end

  # indexes the identifier for a page from a template
  def index_template(input, page_number, id)
    (template_index[indexing_key(input)] ||= {})[page_number] ||= id
  end

  # reads and indexes a new IO for a template
  # if the IO has been indexed already then the parsed object hash
  # is returned directly
  def indexed_hash(input, io)
    hash_index[indexing_key(input)] ||= PDF::Reader::ObjectHash.new(io)
  end

  # the index key for the input.
  # uses object_id so that both a string filename or an IO stream can be
  # indexed and reused provided the same object gets used in multiple page
  # template calls.
  def indexing_key(input)
    input.object_id
  end

  # returns a nested array of object IDs for all pages in this object store.
  # Objects whose :Type is neither :Page nor :Pages yield nil.
  #
  def get_page_objects(obj)
    if obj.data[:Type] == :Page
      obj.identifier
    elsif obj.data[:Type] == :Pages
      obj.data[:Kids].map { |kid| get_page_objects(kid) }
    end
  end

  # takes a source PDF and uses it as a template for this document.
  #
  # Sets @min_version from the source, and @info / @root when the source
  # trailer provides them. Raises ArgumentError when +template+ is neither
  # IO-like nor an existing file, and PDF::Core::Errors::TemplateError for
  # encrypted or unreadable sources.
  #
  def load_file(template)
    translate_reader_errors do
      unless io_like?(template) || File.file?(template)
        raise ArgumentError, "#{template} does not exist"
      end

      hash = PDF::Reader::ObjectHash.new(template)
      src_info = hash.trailer[:Info]
      src_root = hash.trailer[:Root]
      @min_version = hash.pdf_version.to_f

      if hash.trailer[:Encrypt]
        msg = "Template file is an encrypted PDF, it can't be used as a template"
        raise PDF::Core::Errors::TemplateError, msg
      end

      @info = load_object_graph(hash, src_info).identifier if src_info
      @root = load_object_graph(hash, src_root).identifier if src_root
    end
  end

  # recurse down an object graph from a source PDF, importing all the
  # indirect objects we find.
  #
  # hash is the PDF::Reader::ObjectHash to extract objects from, object is
  # the object to extract.
  #
  def load_object_graph(hash, object)
    @loaded_objects ||= {}
    case object
    when ::Hash
      # values are replaced in place; the key set is left untouched
      object.each { |key, value| object[key] = load_object_graph(hash, value) }
      object
    when Array
      object.map { |item| load_object_graph(hash, item) }
    when PDF::Reader::Reference
      unless @loaded_objects.has_key?(object.id)
        # register the placeholder before descending so that cyclic
        # references resolve to it instead of recursing forever
        @loaded_objects[object.id] = ref(nil)
        new_obj = load_object_graph(hash, hash[object])
        if new_obj.kind_of?(PDF::Reader::Stream)
          stream_dict = load_object_graph(hash, new_obj.hash)
          @loaded_objects[object.id].data = stream_dict
          @loaded_objects[object.id] << new_obj.data
        else
          @loaded_objects[object.id].data = new_obj
        end
      end
      @loaded_objects[object.id]
    when PDF::Reader::Stream
      # Stream is a subclass of string, so this is here to prevent the stream
      # being wrapped in a LiteralString
      object
    when String
      is_utf8?(object) ? object : PDF::Core::ByteString.new(object)
    else
      object
    end
  end

  # Reports whether the bytes of +str+ form valid UTF-8.
  # NOTE: this retags +str+ itself as UTF-8 (force_encoding mutates the
  # receiver); load_object_graph returns that same string afterwards.
  def is_utf8?(str)
    str.force_encoding(::Encoding::UTF_8)
    str.valid_encoding?
  end
end
307
+ end
308
+ end