pdf-core 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/pdf/core.rb +35 -0
- data/lib/pdf/core/annotations.rb +60 -0
- data/lib/pdf/core/byte_string.rb +9 -0
- data/lib/pdf/core/destinations.rb +90 -0
- data/lib/pdf/core/document_state.rb +78 -0
- data/lib/pdf/core/filter_list.rb +51 -0
- data/lib/pdf/core/filters.rb +36 -0
- data/lib/pdf/core/graphics_state.rb +68 -0
- data/lib/pdf/core/literal_string.rb +16 -0
- data/lib/pdf/core/name_tree.rb +177 -0
- data/lib/pdf/core/object_store.rb +308 -0
- data/lib/pdf/core/outline.rb +315 -0
- data/lib/pdf/core/page.rb +212 -0
- data/lib/pdf/core/page_geometry.rb +126 -0
- data/lib/pdf/core/pdf_object.rb +99 -0
- data/lib/pdf/core/reference.rb +103 -0
- data/lib/pdf/core/stream.rb +98 -0
- data/lib/pdf/core/text.rb +275 -0
- data/pdf-core.gemspec +26 -0
- metadata +140 -0
@@ -0,0 +1,16 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
module PDF
|
3
|
+
module Core
|
4
|
+
# Marker subclass of String: a value of this class must be written to the
# PDF as a *literal* string, whereas a plain string may be written using the
# PDF hexadecimal string format.
#
# Certain PDF features seem to work only with literal strings. The /Dest key
# of a link annotation is one of them: when it holds a hex encoded string the
# links do not work (as tested in Mac OS X Preview, and Adobe Acrobat
# Reader).
#
# The class adds no behaviour of its own; only its type carries meaning.
class LiteralString < String #:nodoc:
end
|
15
|
+
end
|
16
|
+
end
|
@@ -0,0 +1,177 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# name_tree.rb : Implements NameTree for PDF
|
4
|
+
#
|
5
|
+
# Copyright November 2008, Jamis Buck. All Rights Reserved.
|
6
|
+
#
|
7
|
+
# This is free software. Please see the LICENSE and COPYING files for details.
|
8
|
+
#
|
9
|
+
module PDF
|
10
|
+
module Core
|
11
|
+
module NameTree #:nodoc:
|
12
|
+
# One node of the balanced tree used to build a PDF name tree.
#
# A node is a *leaf* when it has no children or when its children are Value
# entries (kept sorted by name); otherwise its children are other Node
# objects. A node never holds more than +limit+ children: when an insertion
# pushes it over the limit the node is split in two.
#
# +document+ only needs to respond to +ref!+, which is called with every
# newly created node and whose result is stored in that node's +ref+.
class Node #:nodoc:
  attr_reader :children
  attr_reader :limit
  attr_reader :document
  attr_accessor :parent
  attr_accessor :ref

  # document:: object used to create references for new nodes (see +ref!+)
  # limit::    maximum number of children a node may hold before splitting
  # parent::   the enclosing Node, or nil for the root
  def initialize(document, limit, parent=nil)
    @document = document
    @children = []
    @limit = limit
    @parent = parent
    @ref = nil
  end

  # True when this node has no children at all.
  def empty?
    children.empty?
  end

  # Number of Value entries stored in this node and all of its descendants.
  def size
    leaf? ? children.size : children.inject(0) { |sum, child| sum + child.size }
  end

  # A node with no children, or whose children are Value entries, is a leaf.
  def leaf?
    children.empty? || children.first.is_a?(Value)
  end

  # Wraps +name+ and +value+ in a Value and inserts it into the tree.
  # Returns the new Value.
  def add(name, value)
    self << Value.new(name, value)
  end

  # Builds the dictionary describing this node: +:Limits+ (only for nodes
  # that have a parent), plus +:Names+ for a leaf or +:Kids+ (the children's
  # refs) for an internal node.
  def to_hash
    hash = {}

    hash[:Limits] = [least, greatest] if parent
    if leaf?
      hash[:Names] = children
    else
      hash[:Kids] = children.map { |child| child.ref }
    end

    hash
  end

  # Smallest name stored beneath this node. Expects a non-empty node.
  def least
    if leaf?
      children.first.name
    else
      children.first.least
    end
  end

  # Largest name stored beneath this node. Expects a non-empty node.
  def greatest
    if leaf?
      children.last.name
    else
      children.last.greatest
    end
  end

  # Inserts +value+ at its sorted position, descending into the first child
  # whose range can hold it (or the last child when none can). Splits the
  # receiving leaf when it grows past +limit+. Returns +value+.
  def <<(value)
    if children.empty?
      children << value
    elsif leaf?
      children.insert(insertion_point(value), value)
      split! if children.length > limit
    else
      fit = children.detect { |child| child >= value } || children.last
      fit << value
    end

    value
  end

  # True when this node is empty or its last child compares >= +value+,
  # i.e. +value+ sorts at or before the end of this node.
  def >=(value)
    children.empty? || children.last >= value
  end

  # Splits this node. A node with a parent asks the parent to split it into
  # two siblings; the root instead pushes its children down into two new
  # nodes and keeps those two nodes as its only children.
  def split!
    if parent
      parent.split(self)
    else
      left, right = new_node(self), new_node(self)
      split_children(self, left, right)
      children.replace([left, right])
    end
  end

  # Returns a deep copy of this node, without copying expensive things
  # like the ref to @document: the node itself is only +dup+ed (so
  # @document is shared), the children are round-tripped through Marshal
  # and the ref, when present, is copied with its own +deep_copy+.
  #
  def deep_copy
    node = dup
    node.instance_variable_set("@children",
                               Marshal.load(Marshal.dump(children)))
    node.instance_variable_set("@ref",
                               node.ref ? node.ref.deep_copy : nil)
    node
  end

  protected

  # Splits the child +node+ in two: +node+ keeps the lower half of its
  # children and a new sibling, inserted right after it, takes the upper
  # half. Splits this node in turn if it now exceeds +limit+.
  def split(node)
    new_child = new_node(self)
    split_children(node, node, new_child)
    index = children.index(node)
    children.insert(index+1, new_child)
    split! if children.length > limit
  end

  private

  # Creates a node sharing this node's document and limit, and registers it
  # with the document to obtain its ref.
  def new_node(parent=nil)
    node = Node.new(document, limit, parent)
    node.ref = document.ref!(node)
    node
  end

  # Distributes the children of +node+ between +left+ and +right+ (+left+
  # may be +node+ itself). When the children are nodes rather than values
  # their parent pointers are updated to the new owner.
  def split_children(node, left, right)
    half = (node.limit+1)/2

    # Slices are taken before either side is replaced, so +left+ may safely
    # be the same object as +node+.
    left_children, right_children = node.children[0...half], node.children[half..-1]

    left.children.replace(left_children)
    right.children.replace(right_children)

    unless node.leaf?
      left_children.each { |child| child.parent = left }
      right_children.each { |child| child.parent = right }
    end
  end

  # Index of the first child that compares >= +value+, or the number of
  # children when +value+ sorts after all of them.
  def insertion_point(value)
    children.index { |child| child >= value } || children.length
  end
end
|
152
|
+
|
153
|
+
# A single name/value entry stored in a leaf Node. Entries are ordered by
# their name, which is stored as a PDF::Core::LiteralString.
class Value #:nodoc:
  include Comparable

  attr_reader :name
  attr_reader :value

  # name::  the key of the entry; wrapped in a PDF::Core::LiteralString
  # value:: the object associated with the name, stored as given
  def initialize(name, value)
    @name, @value = PDF::Core::LiteralString.new(name), value
  end

  # Orders entries by name. Returns nil when +leaf+ has no +name+ to compare
  # against, as Comparable expects for incomparable operands; without this
  # guard something as simple as <tt>value == nil</tt> raised NoMethodError.
  def <=>(leaf)
    return nil unless leaf.respond_to?(:name)

    name <=> leaf.name
  end

  def inspect
    "#<Value: #{name.inspect} : #{value.inspect}>"
  end

  def to_s
    "#{name} : #{value}"
  end
end
|
175
|
+
end
|
176
|
+
end
|
177
|
+
end
|
@@ -0,0 +1,308 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# Implements PDF object repository
|
4
|
+
#
|
5
|
+
# Copyright August 2009, Brad Ediger. All Rights Reserved.
|
6
|
+
#
|
7
|
+
# This is free software. Please see the LICENSE and COPYING files for details.
|
8
|
+
|
9
|
+
|
10
|
+
require 'pdf/reader'
|
11
|
+
|
12
|
+
module PDF
|
13
|
+
module Core
|
14
|
+
# Repository of the indirect objects (PDF::Core::Reference instances) that
# make up a document. References are kept in insertion order and can be
# looked up by identifier. A new store always ends up with an info
# dictionary, a catalog (root) and a page tree, either created empty or
# loaded from a template PDF.
#
# NOTE(review): StringIO is used by #import_page but this file only requires
# 'pdf/reader' -- presumably StringIO is loaded transitively; confirm.
class ObjectStore #:nodoc:
  include Enumerable

  # PDF version (as a Float) of the template the store was loaded from.
  # Only set by #load_file; nil when no template was given.
  attr_reader :min_version

  BASE_OBJECTS = %w[info pages root].freeze

  # opts[:template]:: IO-like object or filename of a PDF to load as the
  #                   starting point of the store
  # opts[:info]::     data for the info dictionary (default: empty hash);
  #                   ignored when the template supplied one
  def initialize(opts = {})
    @objects = {}
    @identifiers = []

    load_file(opts[:template]) if opts[:template]

    # load_file may already have set @info / @root from the template.
    @info ||= ref(opts[:info] || {}).identifier
    @root ||= ref(:Type => :Catalog).identifier
    if pages.nil?
      root.data[:Pages] = ref(:Type => :Pages, :Count => 0, :Kids => [])
    end
  end

  # Creates a new reference holding +data+, numbered one past the current
  # size of the store, adds it to the store and returns it.
  def ref(data, &block)
    push(size + 1, data, &block)
  end

  # The reference of the document info dictionary.
  def info
    @objects[@info]
  end

  # The reference of the document catalog.
  def root
    @objects[@root]
  end

  # Whatever the catalog stores under :Pages (the page tree reference).
  def pages
    root.data[:Pages]
  end

  # The :Count entry of the page tree.
  def page_count
    pages.data[:Count]
  end

  # Adds the given reference to the store and returns the reference object.
  # If the object provided is not a PDF::Core::Reference, one is created from the
  # arguments provided.
  #
  def push(*args, &block)
    reference = if args.first.is_a?(PDF::Core::Reference)
                  args.first
                else
                  PDF::Core::Reference.new(*args, &block)
                end

    @objects[reference.identifier] = reference
    @identifiers << reference.identifier
    reference
  end

  alias_method :<<, :push

  # Yields every stored reference in insertion order.
  def each
    @identifiers.each do |id|
      yield @objects[id]
    end
  end

  # Returns the reference stored under identifier +id+, or nil.
  def [](id)
    @objects[id]
  end

  # Number of identifiers recorded in the store.
  def size
    @identifiers.size
  end
  alias_method :length, :size

  # Drops every object that is not reachable from the root or info objects
  # and renumbers the survivors consecutively from 1. Nothing is renumbered
  # when every object is live.
  def compact
    # Clear live markers
    each { |o| o.live = false }

    # Recursively mark reachable objects live, starting from the roots
    # (the only objects referenced in the trailer)
    root.mark_live
    info.mark_live

    # Renumber live objects to eliminate gaps (shrink the xref table)
    if @objects.any?{ |_, o| !o.live }
      new_id = 1
      new_objects = {}
      new_identifiers = []

      each do |obj|
        if obj.live
          obj.identifier = new_id
          new_objects[new_id] = obj
          new_identifiers << new_id
          new_id += 1
        end
      end

      @objects = new_objects
      @identifiers = new_identifiers
    end
  end

  # returns the object ID for a particular page in the document. Pages
  # are indexed starting at 1 (not 0!). Negative numbers count back from
  # the last page.
  #
  #   object_id_for_page(1)
  #   => 5
  #   object_id_for_page(10)
  #   => 87
  #   object_id_for_page(-11)
  #   => 17
  #
  def object_id_for_page(k)
    k -= 1 if k > 0
    flat_page_ids = get_page_objects(pages).flatten
    flat_page_ids[k]
  end

  # imports all objects required to render a page from another PDF. The
  # objects are added to the current object store, but NOT linked
  # anywhere.
  #
  # The object ID of the root Page object is returned, it's up to the
  # calling code to link that into the document structure somewhere. If
  # this isn't done the imported objects will just be removed when the
  # store is compacted.
  #
  # Imports nothing and returns nil if the requested page number doesn't
  # exist. page_num is 1 indexed, so 1 indicates the first page; numbers
  # below 1 never match a page.
  #
  # Raises ArgumentError when +input+ is neither an IO-like object nor the
  # name of an existing file, and PDF::Core::Errors::TemplateError when the
  # PDF cannot be read.
  #
  def import_page(input, page_num)
    @loaded_objects = {}
    template_id = indexed_template(input, page_num)
    return template_id if template_id

    io = if input.respond_to?(:seek) && input.respond_to?(:read)
           input
         elsif File.file?(input.to_s)
           StringIO.new(File.binread(input.to_s))
         else
           raise ArgumentError, "input must be an IO-like object or a filename"
         end

    translate_reader_errors do
      hash = indexed_hash(input, io)

      # Guard the lower bound explicitly: page_num - 1 would otherwise be a
      # negative index and silently pick a page counted from the end.
      page_ref = page_num >= 1 ? hash.page_references[page_num - 1] : nil

      if page_ref.nil?
        nil
      else
        index_template(input, page_num, load_object_graph(hash, page_ref).identifier)
      end
    end
  end

  private

  # Runs the block, converting the errors PDF::Reader raises for unreadable
  # or unsupported files into PDF::Core::Errors::TemplateError. Returns the
  # block's value.
  def translate_reader_errors
    yield
  rescue PDF::Reader::MalformedPDFError, PDF::Reader::InvalidObjectError => e
    msg = "Error reading template file. If you are sure it's a valid PDF, it may be a bug.\n#{e.message}"
    raise PDF::Core::Errors::TemplateError, msg
  rescue PDF::Reader::UnsupportedFeatureError
    msg = "Template file contains unsupported PDF features"
    raise PDF::Core::Errors::TemplateError, msg
  end

  # An index for page templates so that their loaded object graph
  # can be reused without multiple loading
  def template_index
    @template_index ||= {}
  end

  # An index for the read object hash of a pdf template so that the
  # object hash does not need to be parsed multiple times when using
  # different pages of the pdf as page templates
  def hash_index
    @hash_index ||= {}
  end

  # returns the indexed object graph identifier for a template page if
  # it exists
  def indexed_template(input, page_number)
    key = indexing_key(input)
    template_index[key] && template_index[key][page_number]
  end

  # indexes the identifier for a page from a template; an identifier that
  # is already indexed for that page is kept. Returns the indexed id.
  def index_template(input, page_number, id)
    (template_index[indexing_key(input)] ||= {})[page_number] ||= id
  end

  # reads and indexes a new IO for a template
  # if the IO has been indexed already then the parsed object hash
  # is returned directly
  def indexed_hash(input, io)
    hash_index[indexing_key(input)] ||= PDF::Reader::ObjectHash.new(io)
  end

  # the index key for the input.
  # uses object_id so that both a string filename or an IO stream can be
  # indexed and reused provided the same object gets used in multiple page
  # template calls.
  def indexing_key(input)
    input.object_id
  end

  # returns a nested array of object IDs for all pages in this object store.
  # Objects that are neither a :Page nor a :Pages node yield nil.
  #
  def get_page_objects(obj)
    if obj.data[:Type] == :Page
      obj.identifier
    elsif obj.data[:Type] == :Pages
      obj.data[:Kids].map { |kid| get_page_objects(kid) }
    end
  end

  # takes a source PDF and uses it as a template for this document:
  # records its version in @min_version and imports the object graphs of
  # its Info and Root trailer entries, when present.
  #
  # Raises ArgumentError when +template+ is neither IO-like nor an existing
  # file, and PDF::Core::Errors::TemplateError when the PDF is encrypted or
  # cannot be read.
  #
  def load_file(template)
    unless (template.respond_to?(:seek) && template.respond_to?(:read)) ||
           File.file?(template)
      raise ArgumentError, "#{template} does not exist"
    end

    translate_reader_errors do
      hash = PDF::Reader::ObjectHash.new(template)
      src_info = hash.trailer[:Info]
      src_root = hash.trailer[:Root]
      @min_version = hash.pdf_version.to_f

      if hash.trailer[:Encrypt]
        msg = "Template file is an encrypted PDF, it can't be used as a template"
        raise PDF::Core::Errors::TemplateError, msg
      end

      @info = load_object_graph(hash, src_info).identifier if src_info
      @root = load_object_graph(hash, src_root).identifier if src_root
    end
  end

  # recurse down an object graph from a source PDF, importing all the
  # indirect objects we find.
  #
  # hash is the PDF::Reader::ObjectHash to extract objects from, object is
  # the object to extract.
  #
  # Returns the imported counterpart of +object+: a reference of this store
  # for a PDF::Reader::Reference, otherwise the (converted) value itself.
  #
  def load_object_graph(hash, object)
    @loaded_objects ||= {}
    case object
    when ::Hash then
      # The source hash is updated in place (only existing keys are
      # reassigned, which is safe while iterating).
      object.each { |key,value| object[key] = load_object_graph(hash, value) }
      object
    when Array then
      object.map { |item| load_object_graph(hash, item)}
    when PDF::Reader::Reference then
      unless @loaded_objects.has_key?(object.id)
        # Register the new reference before descending so that cycles in
        # the source graph resolve to it instead of recursing forever.
        @loaded_objects[object.id] = ref(nil)
        new_obj = load_object_graph(hash, hash[object])
        if new_obj.kind_of?(PDF::Reader::Stream)
          stream_dict = load_object_graph(hash, new_obj.hash)
          @loaded_objects[object.id].data = stream_dict
          @loaded_objects[object.id] << new_obj.data
        else
          @loaded_objects[object.id].data = new_obj
        end
      end
      @loaded_objects[object.id]
    when PDF::Reader::Stream
      # Stream is a subclass of string, so this is here to prevent the stream
      # being wrapped in a LiteralString
      object
    when String
      is_utf8?(object) ? object : PDF::Core::ByteString.new(object)
    else
      object
    end
  end

  # True when the bytes of +str+ form valid UTF-8.
  #
  # Note that +str+ itself is re-tagged as UTF-8 (force_encoding mutates the
  # receiver), so a string that passes the check is handed back by
  # #load_object_graph already marked as UTF-8.
  def is_utf8?(str)
    str.force_encoding(::Encoding::UTF_8)
    str.valid_encoding?
  end
end
|
307
|
+
end
|
308
|
+
end
|