fireinc-pdf-reader 0.11.0.alpha
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +168 -0
- data/MIT-LICENSE +21 -0
- data/README.rdoc +137 -0
- data/Rakefile +34 -0
- data/TODO +45 -0
- data/bin/pdf_list_callbacks +15 -0
- data/bin/pdf_object +48 -0
- data/bin/pdf_text +15 -0
- data/examples/callbacks.rb +21 -0
- data/examples/extract_bates.rb +49 -0
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +12 -0
- data/examples/metadata.rb +25 -0
- data/examples/page_counter_improved.rb +23 -0
- data/examples/page_counter_naive.rb +24 -0
- data/examples/rspec.rb +57 -0
- data/examples/text.rb +40 -0
- data/examples/version.rb +25 -0
- data/lib/pdf/hash.rb +15 -0
- data/lib/pdf/reader/abstract_strategy.rb +81 -0
- data/lib/pdf/reader/buffer.rb +346 -0
- data/lib/pdf/reader/cmap.rb +138 -0
- data/lib/pdf/reader/encoding.rb +190 -0
- data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
- data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
- data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
- data/lib/pdf/reader/encodings/standard.txt +47 -0
- data/lib/pdf/reader/encodings/symbol.txt +154 -0
- data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
- data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
- data/lib/pdf/reader/error.rb +53 -0
- data/lib/pdf/reader/filter.rb +219 -0
- data/lib/pdf/reader/font.rb +133 -0
- data/lib/pdf/reader/form_xobject.rb +83 -0
- data/lib/pdf/reader/glyphlist.txt +4322 -0
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +56 -0
- data/lib/pdf/reader/object_cache.rb +85 -0
- data/lib/pdf/reader/object_hash.rb +289 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/page.rb +185 -0
- data/lib/pdf/reader/page_text_receiver.rb +278 -0
- data/lib/pdf/reader/pages_strategy.rb +475 -0
- data/lib/pdf/reader/parser.rb +225 -0
- data/lib/pdf/reader/print_receiver.rb +18 -0
- data/lib/pdf/reader/reference.rb +66 -0
- data/lib/pdf/reader/register_receiver.rb +95 -0
- data/lib/pdf/reader/stream.rb +69 -0
- data/lib/pdf/reader/text_receiver.rb +264 -0
- data/lib/pdf/reader/token.rb +41 -0
- data/lib/pdf/reader/xref.rb +220 -0
- data/lib/pdf/reader.rb +296 -0
- data/lib/pdf-reader.rb +1 -0
- metadata +211 -0
@@ -0,0 +1,185 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module PDF
|
4
|
+
class Reader
|
5
|
+
|
6
|
+
# high level representation of a single PDF page. Ties together the various
|
7
|
+
# low level classes in PDF::Reader and provides access to the various
|
8
|
+
# components of the page (text, images, fonts, etc) in convenient formats.
|
9
|
+
#
|
10
|
+
# If you require access to the raw PDF objects for this page, you can access
|
11
|
+
# the Page dictionary via the page_object accessor. You will need to use the
|
12
|
+
# objects accessor to help walk the page dictionary in any useful way.
|
13
|
+
#
|
14
|
+
class Page

  # Keys that select_inheritable copies out of a page tree node.
  INHERITABLE_ATTRIBUTES = [:Resources, :MediaBox, :CropBox, :Rotate, :Parent].freeze

  # low level hash-like access to all objects in the underlying PDF
  attr_reader :objects

  # the raw PDF object that defines this page
  attr_reader :page_object

  # creates a new page wrapper.
  #
  # * objects - an ObjectHash instance that wraps a PDF file
  # * pagenum - an int specifying the page number to expose. 1 indexed.
  #
  # Raises ArgumentError if pagenum is less than 1, or if the object found
  # for that page number is not a Hash.
  #
  def initialize(objects, pagenum)
    @objects, @pagenum = objects, pagenum

    # Array#[] counts negative indexes from the end of the array, so without
    # this check page 0 would silently resolve to the last page.
    raise ArgumentError, "invalid page: #{pagenum}" if pagenum < 1

    @page_object = objects.deref(objects.page_references[pagenum - 1])

    unless @page_object.is_a?(::Hash)
      raise ArgumentError, "invalid page: #{pagenum}"
    end
  end

  # return the number of this page within the full document
  #
  def number
    @pagenum
  end

  # return a friendly string representation of this page
  #
  def inspect
    "<PDF::Reader::Page page: #{@pagenum}>"
  end

  # Returns the attributes that accompany this page. Includes
  # attributes inherited from parents.
  #
  def attributes
    hash = {}
    # merge from the outermost ancestor down to the page itself, so a value
    # set closer to the page overrides the one it would otherwise inherit
    page_with_ancestors.reverse_each do |obj|
      hash.merge!(@objects.deref(obj))
    end
    hash
  end

  # Returns the resources that accompany this page. Includes
  # resources inherited from parents. The result is memoised.
  #
  def resources
    @resources ||= @objects.deref(attributes[:Resources]) || {}
  end

  # Returns the XObjects that are available to this page, or an empty
  # hash if the page resources have no XObject entry.
  #
  def xobjects
    # the XObject entry may itself be an indirect reference, so it has to
    # be dereferenced before callers can index into it by label
    @objects.deref(resources[:XObject]) || {}
  end

  # return a hash of fonts used on this page.
  #
  # The keys are the font labels used within the page content stream.
  #
  # The values are PDF::Reader::Font instances that provide access
  # to most available metrics for each font.
  #
  def fonts
    raw_fonts = objects.deref(resources[:Font] || {})
    # ::Hash is spelled out so the core class is used rather than any
    # Hash constant visible from the enclosing namespace
    ::Hash[raw_fonts.map { |label, font|
      [label, PDF::Reader::Font.new(objects, objects.deref(font))]
    }]
  end

  # returns the plain text content of this page encoded as UTF-8. Any
  # characters that can't be translated will be returned as a ▯
  #
  def text
    receiver = PageTextReceiver.new
    walk(receiver)
    receiver.content
  end
  alias :to_s :text

  # processes the raw content stream for this page in sequential order and
  # passes callbacks to the receiver objects.
  #
  # This is mostly low level and you can probably ignore it unless you need
  # access to something like the raw encoded text. For an example of how
  # this can be used as a basis for higher level functionality, see the
  # text() method
  #
  # If someone was motivated enough, this method is intended to provide all
  # the data required to faithfully render the entire page. If you find
  # some required data isn't available it's a bug - let me know.
  #
  # Many operators that generate callbacks will reference resources stored
  # in the page header - think images, fonts, etc. To facilitate these
  # operators, the first available callback is page=. If your receiver
  # accepts that callback it will be passed the current
  # PDF::Reader::Page object. Use the Page#resources method to grab any
  # required resources.
  #
  def walk(*receivers)
    callback(receivers, :page=, [self])
    content_stream(receivers, raw_content)
  end

  # returns the raw content stream for this page. This is plumbing, nothing to
  # see here unless you're a PDF nerd like me.
  #
  # The Contents entry may be a single stream or an array of streams; all of
  # them are decoded and joined into one string.
  #
  def raw_content
    contents = objects.deref(@page_object[:Contents])
    [contents].flatten.compact.map { |obj|
      objects.deref(obj)
    }.map { |obj|
      obj.unfiltered_data
    }.join
  end

  private

  # the dereferenced Root entry of the file trailer, memoised after the
  # first lookup.
  #
  def root
    @root ||= objects.deref(@objects.trailer[:Root])
  end

  # tokenises instructions and fires one callback per recognised operator.
  #
  # Tokens that are not operators are collected as operands and handed to
  # the callback for the next operator that follows them.
  #
  # Raises MalformedPDFError if the stream ends unexpectedly.
  #
  def content_stream(receivers, instructions)
    buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
    parser = Parser.new(buffer, @objects)
    params = []

    while (token = parser.parse_token(PagesStrategy::OPERATORS))
      if token.kind_of?(Token) && PagesStrategy::OPERATORS.has_key?(token)
        callback(receivers, PagesStrategy::OPERATORS[token], params)
        params.clear
      else
        params << token
      end
    end
  rescue EOFError
    raise MalformedPDFError, "End Of File while processing a content stream"
  end

  # calls the name callback method on each receiver with params as the
  # arguments. Receivers that don't respond to name are skipped.
  #
  def callback(receivers, name, params=[])
    receivers.each do |receiver|
      receiver.send(name, *params) if receiver.respond_to?(name)
    end
  end

  # returns an array that starts with this page's dictionary and is followed
  # by the inheritable entries of each ancestor, nearest parent first.
  #
  # Called with no argument to start the walk; the recursive calls pass the
  # Parent entry of the node that was just visited.
  #
  def page_with_ancestors(obj = nil)
    obj = objects.deref(obj)
    if obj.nil?
      [@page_object] + page_with_ancestors(@page_object[:Parent])
    elsif obj[:Parent]
      [select_inheritable(obj)] + page_with_ancestors(obj[:Parent])
    else
      [select_inheritable(obj)]
    end
  end

  # select the elements from a Pages dictionary that can be inherited by
  # child Page dictionaries.
  #
  def select_inheritable(obj)
    ::Hash[obj.select { |key, value|
      INHERITABLE_ATTRIBUTES.include?(key)
    }]
  end

end
|
184
|
+
end
|
185
|
+
end
|
@@ -0,0 +1,278 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require 'matrix'
|
4
|
+
require 'yaml'
|
5
|
+
|
6
|
+
module PDF
|
7
|
+
class Reader
|
8
|
+
# Receiver that collects the text drawn on a page. Pass an instance to
# Page#walk and read the result back from #content.
#
# Text is grouped by the y co-ordinate at which it was drawn; each distinct
# y value becomes one line of output.
#
class PageTextReceiver

  # Initial graphics state for every page. Frozen because page= hands out
  # copies; the constant itself must never be modified.
  DEFAULT_GRAPHICS_STATE = {
    :ctm            => Matrix.identity(3),
    :char_spacing   => 0,
    :word_spacing   => 0,
    :h_scaling      => 100,
    :text_leading   => 0,
    :text_font      => nil,
    :text_font_size => nil,
    :text_mode      => 0,
    :text_rise      => 0,
    :text_knockout  => 0
  }.freeze

  # starting a new page
  def page=(page)
    @page    = page
    @objects = page.objects
    @fonts   = page.fonts
    @form_fonts = {}
    @content = ::Hash.new
    # start from a copy: the text state operators write into the hash on top
    # of the stack, and must not write into the shared constant
    @stack   = [DEFAULT_GRAPHICS_STATE.dup]
  end

  # returns the collected text as a single string, one line per distinct
  # y co-ordinate, ordered from the largest y value to the smallest.
  #
  def content
    keys = @content.keys.sort.reverse
    keys.map { |key|
      @content[key]
    }.join("\n")
  end

  #####################################################
  # Graphics State Operators
  #####################################################

  def save_graphics_state
    @stack.push clone_state
  end

  def restore_graphics_state
    @stack.pop
  end

  #####################################################
  # Matrix Operators
  #####################################################

  # update the current transformation matrix.
  #
  # If the CTM is currently undefined, just store the new values.
  #
  # If there's an existing CTM, then multiply the existing matrix
  # with the new matrix to form the updated matrix.
  #
  def concatenate_matrix(a, b, c, d, e, f)
    transform = Matrix[
      [a, b, 0],
      [c, d, 0],
      [e, f, 1]
    ]
    if state[:ctm]
      state[:ctm] = transform * state[:ctm]
    else
      state[:ctm] = transform
    end
  end

  #####################################################
  # Text Object Operators
  #####################################################

  def begin_text_object
    @text_matrix      = Matrix.identity(3)
    @text_line_matrix = Matrix.identity(3)
  end

  def end_text_object
    @text_matrix      = Matrix.identity(3)
    @text_line_matrix = Matrix.identity(3)
  end

  #####################################################
  # Text State Operators
  #####################################################

  def set_character_spacing(char_spacing)
    state[:char_spacing] = char_spacing
  end

  def set_horizontal_text_scaling(h_scaling)
    state[:h_scaling] = h_scaling
  end

  def set_text_font_and_size(label, size)
    state[:text_font]      = label
    state[:text_font_size] = size
  end

  def set_text_leading(leading)
    state[:text_leading] = leading
  end

  def set_text_rendering_mode(mode)
    state[:text_mode] = mode
  end

  def set_text_rise(rise)
    state[:text_rise] = rise
  end

  def set_word_spacing(word_spacing)
    state[:word_spacing] = word_spacing
  end

  #####################################################
  # Text Positioning Operators
  #####################################################

  def move_text_position(x, y) # Td
    temp_matrix = Matrix[
      [1, 0, 0],
      [0, 1, 0],
      [x, y, 1]
    ]
    @text_matrix = @text_line_matrix = temp_matrix * @text_line_matrix
  end

  def move_text_position_and_set_leading(x, y) # TD
    set_text_leading(-1 * y)
    move_text_position(x, y)
  end

  def set_text_matrix_and_text_line_matrix(a, b, c, d, e, f) # Tm
    @text_matrix = @text_line_matrix = Matrix[
      [a, b, 0],
      [c, d, 0],
      [e, f, 1]
    ]
  end

  def move_to_start_of_next_line # T*
    # T* is defined as "0 -Tl Td". The leading is negated here to match
    # TD above, which stores -ty as the leading.
    move_text_position(0, -state[:text_leading])
  end

  #####################################################
  # Text Showing Operators
  #####################################################

  # record text that is drawn on the page
  #
  # Raises MalformedPDFError if the current font label doesn't match any
  # font known to the page or the current form.
  #
  def show_text(string) # Tj
    font = current_font
    raise MalformedPDFError, "current font is invalid" if font.nil?

    at = transform(Point.new(0,0))
    @content[at.y] ||= ""
    @content[at.y] << font.to_utf8(string)
  end

  # record each string in params; a numeric entry greater than 1000 is
  # recorded as a single space.
  #
  def show_text_with_positioning(params) # TJ
    params.each { |arg|
      case arg
      when String
        show_text(arg)
      when Integer, Float
        show_text(" ") if arg > 1000
      end
    }
  end

  def move_to_next_line_and_show_text(str) # '
    move_to_start_of_next_line
    show_text(str)
  end

  def set_spacing_next_line_show_text(aw, ac, string) # "
    set_word_spacing(aw)
    set_character_spacing(ac)
    move_to_next_line_and_show_text(string)
  end

  #####################################################
  # XObjects
  #####################################################

  # draws the XObject registered on the page under label. Form XObjects
  # are walked with this receiver so their text is recorded too.
  #
  def invoke_xobject(label)
    save_graphics_state
    xobject = @objects.deref(@page.xobjects[label])

    matrix = xobject.hash[:Matrix]
    concatenate_matrix(*matrix) if matrix

    if xobject.hash[:Subtype] == :Form
      # remember the fonts that were active before entering this form, so
      # drawing another XObject from inside a form doesn't discard them
      outer_fonts = @form_fonts
      form = PDF::Reader::FormXObject.new(@page, xobject)
      @form_fonts = form.fonts
      form.walk(self)
      @form_fonts = outer_fonts
    end

    restore_graphics_state
  end

  private

  # transform x and y co-ordinates from the current text space to the
  # underlying device space.
  #
  def transform(point, z = 1)
    trm = text_rendering_matrix
    Point.new(
      (trm[0,0] * point.x) + (trm[1,0] * point.y) + (trm[2,0] * z),
      (trm[0,1] * point.x) + (trm[1,1] * point.y) + (trm[2,1] * z)
    )
  end

  # combines font size, horizontal scaling and text rise with the text
  # matrix and the CTM.
  #
  # NOTE(review): h_scaling is kept as the raw Tz percentage (100 by
  # default) and is multiplied in unscaled. That only influences x values,
  # which this receiver never reads - confirm before relying on x.
  #
  def text_rendering_matrix
    state_matrix = Matrix[
      [state[:text_font_size] * state[:h_scaling], 0, 0],
      [0, state[:text_font_size], 0],
      [0, state[:text_rise], 1]
    ]

    state_matrix * @text_matrix * ctm
  end

  # the graphics state currently in effect (top of the stack)
  #
  def state
    @stack.last
  end

  # when save_graphics_state is called, we need to push a new copy of the
  # current state onto the stack. That way any modifications to the state
  # will be undone once restore_graphics_state is called.
  #
  # This returns a deep clone of the current state, ensuring changes are
  # kept separate from earlier states.
  #
  # Marshal is used to round-trip the state, as it copies the Matrix and
  # Symbol values stored in the state without any class whitelisting.
  #
  def clone_state
    if @stack.empty?
      {}
    else
      Marshal.load(Marshal.dump(@stack.last))
    end
  end

  # return the current transformation matrix
  #
  def ctm
    state[:ctm]
  end

  # the font object for the current font label. Fonts belonging to the
  # form being walked take precedence over the page fonts.
  #
  def current_font
    @form_fonts[state[:text_font]] || @fonts[state[:text_font]]
  end

  # private class for representing points on a cartesian plane. Used
  # to simplify the co-ordinate maths in transform.
  #
  class Point
    attr_reader :x, :y

    def initialize(x,y)
      @x, @y = x,y
    end

    def distance(point)
      Math.hypot(point.x - x, point.y - y)
    end
  end
end
|
277
|
+
end
|
278
|
+
end
|