fireinc-pdf-reader 0.11.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +168 -0
- data/MIT-LICENSE +21 -0
- data/README.rdoc +137 -0
- data/Rakefile +34 -0
- data/TODO +45 -0
- data/bin/pdf_list_callbacks +15 -0
- data/bin/pdf_object +48 -0
- data/bin/pdf_text +15 -0
- data/examples/callbacks.rb +21 -0
- data/examples/extract_bates.rb +49 -0
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +12 -0
- data/examples/metadata.rb +25 -0
- data/examples/page_counter_improved.rb +23 -0
- data/examples/page_counter_naive.rb +24 -0
- data/examples/rspec.rb +57 -0
- data/examples/text.rb +40 -0
- data/examples/version.rb +25 -0
- data/lib/pdf/hash.rb +15 -0
- data/lib/pdf/reader/abstract_strategy.rb +81 -0
- data/lib/pdf/reader/buffer.rb +346 -0
- data/lib/pdf/reader/cmap.rb +138 -0
- data/lib/pdf/reader/encoding.rb +190 -0
- data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
- data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
- data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
- data/lib/pdf/reader/encodings/standard.txt +47 -0
- data/lib/pdf/reader/encodings/symbol.txt +154 -0
- data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
- data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
- data/lib/pdf/reader/error.rb +53 -0
- data/lib/pdf/reader/filter.rb +219 -0
- data/lib/pdf/reader/font.rb +133 -0
- data/lib/pdf/reader/form_xobject.rb +83 -0
- data/lib/pdf/reader/glyphlist.txt +4322 -0
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +56 -0
- data/lib/pdf/reader/object_cache.rb +85 -0
- data/lib/pdf/reader/object_hash.rb +289 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/page.rb +185 -0
- data/lib/pdf/reader/page_text_receiver.rb +278 -0
- data/lib/pdf/reader/pages_strategy.rb +475 -0
- data/lib/pdf/reader/parser.rb +225 -0
- data/lib/pdf/reader/print_receiver.rb +18 -0
- data/lib/pdf/reader/reference.rb +66 -0
- data/lib/pdf/reader/register_receiver.rb +95 -0
- data/lib/pdf/reader/stream.rb +69 -0
- data/lib/pdf/reader/text_receiver.rb +264 -0
- data/lib/pdf/reader/token.rb +41 -0
- data/lib/pdf/reader/xref.rb +220 -0
- data/lib/pdf/reader.rb +296 -0
- data/lib/pdf-reader.rb +1 -0
- metadata +211 -0
@@ -0,0 +1,185 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module PDF
|
4
|
+
class Reader
|
5
|
+
|
6
|
+
# high level representation of a single PDF page. Ties together the various
|
7
|
+
# low level classes in PDF::Reader and provides access to the various
|
8
|
+
# components of the page (text, images, fonts, etc) in convenient formats.
|
9
|
+
#
|
10
|
+
# If you require access to the raw PDF objects for this page, you can access
|
11
|
+
# the Page dictionary via the page_object accessor. You will need to use the
|
12
|
+
# objects accessor to help walk the page dictionary in any useful way.
|
13
|
+
#
|
14
|
+
class Page

  # Keys of a Pages dictionary that child Page dictionaries may inherit.
  # :Parent is kept in the list so the filtered dictionaries retain the
  # link to the next ancestor.
  INHERITABLE_KEYS = [:Resources, :MediaBox, :CropBox, :Rotate, :Parent].freeze

  # lowlevel hash-like access to all objects in the underlying PDF
  attr_reader :objects

  # the raw PDF object that defines this page
  attr_reader :page_object

  # creates a new page wrapper.
  #
  # * objects - an ObjectHash instance that wraps a PDF file
  # * pagenum - an int specifying the page number to expose. 1 indexed.
  #
  # Raises ArgumentError if pagenum is less than 1 or if the referenced
  # page object is not a Hash.
  #
  def initialize(objects, pagenum)
    # Array#[] treats 0 - 1 and negative indexes as "count from the end",
    # which would silently hand back the wrong page.
    if pagenum.is_a?(Numeric) && pagenum < 1
      raise ArgumentError, "invalid page: #{pagenum}"
    end

    @objects, @pagenum = objects, pagenum
    @page_object = objects.deref(objects.page_references[pagenum - 1])

    unless @page_object.is_a?(::Hash)
      raise ArgumentError, "invalid page: #{pagenum}"
    end
  end

  # return the number of this page within the full document
  #
  def number
    @pagenum
  end

  # return a friendly string representation of this page
  #
  def inspect
    "<PDF::Reader::Page page: #{@pagenum}>"
  end

  # Returns the attributes that accompany this page. Includes
  # attributes inherited from parents.
  #
  # Dictionaries are merged from the most distant ancestor down to the
  # page itself, so values set closer to the page win.
  #
  def attributes
    page_with_ancestors.reverse.inject({}) do |merged, dict|
      merged.merge!(@objects.deref(dict))
    end
  end

  # Returns the resources that accompany this page. Includes
  # resources inherited from parents. The result is memoized; an empty
  # hash is returned when there is no Resources entry.
  #
  def resources
    @resources ||= @objects.deref(attributes[:Resources]) || {}
  end

  # Returns the XObjects that are available to this page, or an empty
  # hash when there are none.
  #
  # The entry is passed through objects.deref (the same way fonts handles
  # the Font entry) so an indirectly referenced dictionary is resolved.
  #
  def xobjects
    objects.deref(resources[:XObject]) || {}
  end

  # return a hash of fonts used on this page.
  #
  # The keys are the font labels used within the page content stream.
  #
  # The values are PDF::Reader::Font instances that provide access
  # to most available metrics for each font.
  #
  def fonts
    raw_fonts = objects.deref(resources[:Font] || {})
    ::Hash[raw_fonts.map { |label, font|
      [label, PDF::Reader::Font.new(objects, objects.deref(font))]
    }]
  end

  # returns the plain text content of this page encoded as UTF-8. Any
  # characters that can't be translated will be returned as a ▯
  #
  def text
    receiver = PageTextReceiver.new
    walk(receiver)
    receiver.content
  end
  alias :to_s :text

  # processes the raw content stream for this page in sequential order and
  # passes callbacks to the receiver objects.
  #
  # This is mostly low level and you can probably ignore it unless you need
  # access to something like the raw encoded text. For an example of how
  # this can be used as a basis for higher level functionality, see the
  # text() method
  #
  # If someone was motivated enough, this method is intended to provide all
  # the data required to faithfully render the entire page. If you find
  # some required data isn't available it's a bug - let me know.
  #
  # Many operators that generate callbacks will reference resources stored
  # in the page header - think images, fonts, etc. To facilitate these
  # operators, the first available callback is page=. If your receiver
  # accepts that callback it will be passed the current
  # PDF::Reader::Page object. Use the Page#resources method to grab any
  # required resources.
  #
  def walk(*receivers)
    callback(receivers, :page=, [self])
    content_stream(receivers, raw_content)
  end

  # returns the raw content stream for this page. This is plumbing, nothing to
  # see here unless you're a PDF nerd like me.
  #
  # The Contents entry may be a single stream or an array of streams. The
  # unfiltered data of each is joined with a single space: stream
  # boundaries fall between tokens, so the separator stops the last token
  # of one stream fusing with the first token of the next. Entries that
  # dereference to nil are skipped.
  #
  def raw_content
    contents = objects.deref(@page_object[:Contents])
    streams  = [contents].flatten.compact.map { |obj| objects.deref(obj) }
    streams.compact.map { |stream| stream.unfiltered_data }.join(" ")
  end

  private

  # the dereferenced Root entry of the trailer, memoized per page
  #
  def root
    @root ||= objects.deref(@objects.trailer[:Root])
  end

  # parses instructions (a String of content stream data) and invokes the
  # matching callback on each receiver for every operator found.
  #
  # Tokens that are not operators are collected as parameters and handed to
  # the callback of the next operator. Raises MalformedPDFError if the
  # parser hits EOFError.
  #
  def content_stream(receivers, instructions)
    buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
    parser = Parser.new(buffer, @objects)
    params = []

    while (token = parser.parse_token(PagesStrategy::OPERATORS))
      if token.kind_of?(Token) && PagesStrategy::OPERATORS.has_key?(token)
        callback(receivers, PagesStrategy::OPERATORS[token], params)
        params.clear
      else
        params << token
      end
    end
  rescue EOFError
    raise MalformedPDFError, "End Of File while processing a content stream"
  end

  # calls the name callback method on each receiver with params as the
  # arguments. Receivers that don't respond to name are skipped.
  #
  def callback(receivers, name, params = [])
    receivers.each do |receiver|
      receiver.send(name, *params) if receiver.respond_to?(name)
    end
  end

  # returns an array of dictionaries, nearest first.
  #
  # With no argument the array starts with this page's own dictionary,
  # followed by the inheritable entries of each ancestor. With an argument
  # it starts from that object instead.
  #
  # The walk stops at the first missing (or nil-dereferencing) Parent, so a
  # page without a Parent entry yields just its own dictionary.
  #
  def page_with_ancestors(obj = nil)
    node  = objects.deref(obj)
    chain = []

    if node.nil?
      chain << @page_object
      node = objects.deref(@page_object[:Parent])
    end

    until node.nil?
      chain << select_inheritable(node)
      node = objects.deref(node[:Parent])
    end

    chain
  end

  # select the elements from a Pages dictionary that can be inherited by
  # child Page dictionaries.
  #
  def select_inheritable(obj)
    ::Hash[obj.select { |key, _value| INHERITABLE_KEYS.include?(key) }]
  end

end
|
184
|
+
end
|
185
|
+
end
|
@@ -0,0 +1,278 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require 'matrix'
|
4
|
+
require 'yaml'
|
5
|
+
|
6
|
+
module PDF
|
7
|
+
class Reader
|
8
|
+
# Receiver for Page#walk that collects the text shown on a page.
#
# It tracks a stack of graphics states plus the text and text line
# matrices, records each shown string against the vertical position of
# the current text origin, and returns the collected lines top to bottom
# from #content.
#
class PageTextReceiver

  # Template for the state at the bottom of the graphics state stack.
  # Frozen: every page starts from a copy so that state operators never
  # modify the template. :h_scaling is stored as a percentage.
  DEFAULT_GRAPHICS_STATE = {
    :ctm            => Matrix.identity(3),
    :char_spacing   => 0,
    :word_spacing   => 0,
    :h_scaling      => 100,
    :text_leading   => 0,
    :text_font      => nil,
    :text_font_size => nil,
    :text_mode      => 0,
    :text_rise      => 0,
    :text_knockout  => 0
  }.freeze

  # starting a new page
  #
  # Resets the collected content and the graphics state stack, and caches
  # the objects and fonts of the given page.
  #
  def page=(page)
    @page       = page
    @objects    = page.objects
    @fonts      = page.fonts
    @form_fonts = {}
    @content    = {}
    @stack      = [DEFAULT_GRAPHICS_STATE.dup]
  end

  # returns the collected text as a single string, one line per distinct
  # vertical position, ordered from the highest y value to the lowest.
  #
  def content
    @content.keys.sort.reverse.map { |y| @content[y] }.join("\n")
  end

  #####################################################
  # Graphics State Operators
  #####################################################

  # push a copy of the current state onto the stack
  def save_graphics_state
    @stack.push clone_state
  end

  # discard the current state and return to the previously saved one.
  #
  # The bottom state is never removed, so an unbalanced restore in a
  # content stream cannot leave the receiver without a current state.
  #
  def restore_graphics_state
    @stack.pop if @stack.size > 1
  end

  #####################################################
  # Matrix Operators
  #####################################################

  # update the current transformation matrix.
  #
  # If the CTM is currently undefined, just store the new values.
  #
  # If there's an existing CTM, then multiply the existing matrix
  # with the new matrix to form the updated matrix.
  #
  def concatenate_matrix(a, b, c, d, e, f)
    transform = Matrix[
      [a, b, 0],
      [c, d, 0],
      [e, f, 1]
    ]
    state[:ctm] = state[:ctm] ? transform * state[:ctm] : transform
  end

  #####################################################
  # Text Object Operators
  #####################################################

  # reset the text matrix and text line matrix to the identity matrix
  def begin_text_object
    @text_matrix      = Matrix.identity(3)
    @text_line_matrix = Matrix.identity(3)
  end

  # reset the text matrix and text line matrix to the identity matrix
  def end_text_object
    @text_matrix      = Matrix.identity(3)
    @text_line_matrix = Matrix.identity(3)
  end

  #####################################################
  # Text State Operators
  #####################################################

  def set_character_spacing(char_spacing)
    state[:char_spacing] = char_spacing
  end

  # h_scaling is stored as given (a percentage) and converted to a ratio
  # when the text rendering matrix is built.
  def set_horizontal_text_scaling(h_scaling)
    state[:h_scaling] = h_scaling
  end

  def set_text_font_and_size(label, size)
    state[:text_font]      = label
    state[:text_font_size] = size
  end

  def set_text_leading(leading)
    state[:text_leading] = leading
  end

  def set_text_rendering_mode(mode)
    state[:text_mode] = mode
  end

  def set_text_rise(rise)
    state[:text_rise] = rise
  end

  def set_word_spacing(word_spacing)
    state[:word_spacing] = word_spacing
  end

  #####################################################
  # Text Positioning Operators
  #####################################################

  # offset the text line matrix by (x, y) and make it the text matrix
  def move_text_position(x, y) # Td
    temp_matrix = Matrix[
      [1, 0, 0],
      [0, 1, 0],
      [x, y, 1]
    ]
    @text_matrix = @text_line_matrix = temp_matrix * @text_line_matrix
  end

  # as move_text_position, but also sets the leading to -y
  def move_text_position_and_set_leading(x, y) # TD
    set_text_leading(-1 * y)
    move_text_position(x, y)
  end

  # replace both the text matrix and the text line matrix
  def set_text_matrix_and_text_line_matrix(a, b, c, d, e, f) # Tm
    @text_matrix = @text_line_matrix = Matrix[
      [a, b, 0],
      [c, d, 0],
      [e, f, 1]
    ]
  end

  # move down one line. The PDF specification defines T* as "0 -Tl Td",
  # hence the negated leading (TD above stores the leading as -y, so a
  # following T* repeats the same downward step).
  def move_to_start_of_next_line # T*
    move_text_position(0, -state[:text_leading])
  end

  #####################################################
  # Text Showing Operators
  #####################################################

  # record text that is drawn on the page
  #
  # The string is converted with the current font's to_utf8 and appended to
  # whatever has already been recorded at the same vertical position.
  #
  def show_text(string) # Tj
    at = transform(Point.new(0, 0))
    @content[at.y] ||= "".dup
    @content[at.y] << current_font.to_utf8(string)
  end

  # record each string in params; numbers adjust spacing and large ones
  # are recorded as a single space.
  #
  # NOTE(review): the "> 1000" threshold is carried over unchanged from the
  # original heuristic - confirm sign and magnitude against real documents.
  #
  def show_text_with_positioning(params) # TJ
    params.each { |arg|
      case arg
      when String
        show_text(arg)
      when Numeric
        # Numeric rather than Fixnum/Float: Fixnum no longer exists in
        # current Ruby versions.
        show_text(" ") if arg > 1000
      end
    }
  end

  def move_to_next_line_and_show_text(str) # '
    move_to_start_of_next_line
    show_text(str)
  end

  def set_spacing_next_line_show_text(aw, ac, string) # "
    set_word_spacing(aw)
    set_character_spacing(ac)
    move_to_next_line_and_show_text(string)
  end

  #####################################################
  # XObjects
  #####################################################

  # draw the XObject registered on the page under label.
  #
  # The graphics state is saved for the duration of the call. If the
  # XObject has a :Matrix it is concatenated to the CTM. Form XObjects are
  # walked with this receiver, using the form's own fonts while doing so.
  #
  def invoke_xobject(label)
    save_graphics_state
    xobject = @objects.deref(@page.xobjects[label])

    matrix = xobject.hash[:Matrix]
    concatenate_matrix(*matrix) if matrix

    if xobject.hash[:Subtype] == :Form
      # remember the enclosing form's fonts so text shown after a nested
      # form (or image) can still find them
      outer_fonts = @form_fonts
      form = PDF::Reader::FormXObject.new(@page, xobject)
      @form_fonts = form.fonts
      form.walk(self)
      @form_fonts = outer_fonts
    end

    restore_graphics_state
  end

  private

  # transform x and y co-ordinates from the current text space to the
  # underlying device space.
  #
  def transform(point, z = 1)
    trm = text_rendering_matrix
    Point.new(
      (trm[0,0] * point.x) + (trm[1,0] * point.y) + (trm[2,0] * z),
      (trm[0,1] * point.x) + (trm[1,1] * point.y) + (trm[2,1] * z)
    )
  end

  # the matrix that maps text space to device space: the text state
  # (font size, horizontal scaling, rise) times the text matrix times the
  # CTM. Horizontal scaling is held as a percentage, so it is divided by
  # 100 here.
  #
  def text_rendering_matrix
    h_ratio = state[:h_scaling] / 100.0
    state_matrix = Matrix[
      [state[:text_font_size] * h_ratio, 0, 0],
      [0, state[:text_font_size], 0],
      [0, state[:text_rise], 1]
    ]

    state_matrix * @text_matrix * ctm
  end

  # the current graphics state (top of the stack)
  def state
    @stack.last
  end

  # when save_graphics_state is called, we need to push a new copy of the
  # current state onto the stack. That way any modifications to the state
  # will be undone once restore_graphics_state is called.
  #
  # A shallow copy is enough: this class only ever replaces the values held
  # in a state (numbers, a font label, a Matrix), it never changes one in
  # place. It also avoids serialising Matrix objects through YAML, which
  # the safe-by-default YAML.load of newer Psych versions refuses.
  #
  def clone_state
    @stack.empty? ? {} : @stack.last.dup
  end

  # return the current transformation matrix
  #
  def ctm
    state[:ctm]
  end

  # the font for the current text_font label; fonts of the form being
  # walked take precedence over the page's fonts.
  def current_font
    @form_fonts[state[:text_font]] || @fonts[state[:text_font]]
  end

  # private class for representing points on a cartesian plane. Used
  # to simplify the co-ordinate maths in this receiver.
  #
  class Point
    attr_reader :x, :y

    def initialize(x, y)
      @x, @y = x, y
    end

    # straight line distance between this point and another
    def distance(point)
      Math.hypot(point.x - x, point.y - y)
    end
  end
end
|
277
|
+
end
|
278
|
+
end
|