pdf-reader 0.10.1 → 0.11.0.alpha
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +1 -4
- data/README.rdoc +30 -21
- data/bin/pdf_text +5 -35
- data/examples/callbacks.rb +9 -4
- data/examples/extract_bates.rb +15 -29
- data/lib/pdf/reader.rb +150 -37
- data/lib/pdf/reader/abstract_strategy.rb +2 -0
- data/lib/pdf/reader/buffer.rb +12 -13
- data/lib/pdf/reader/font.rb +56 -0
- data/lib/pdf/reader/glyphlist.txt +40 -1
- data/lib/pdf/reader/metadata_strategy.rb +3 -0
- data/lib/pdf/reader/object_cache.rb +85 -0
- data/lib/pdf/reader/object_hash.rb +19 -5
- data/lib/pdf/reader/page.rb +172 -0
- data/lib/pdf/reader/page_text_receiver.rb +253 -0
- data/lib/pdf/reader/pages_strategy.rb +3 -11
- data/lib/pdf/reader/text_receiver.rb +3 -0
- data/lib/pdf/reader/xref.rb +3 -4
- metadata +41 -35
@@ -0,0 +1,253 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require 'matrix'
|
4
|
+
|
5
|
+
module PDF
  class Reader
    # Receives content-stream operator callbacks for a page and records the
    # text that is shown, grouped into lines by the vertical position at which
    # each string was drawn.
    #
    # The receiver tracks a stack of graphics states (CTM plus the text state
    # parameters) and the text / text-line matrices, which are the values
    # needed to work out where on the page a string lands.
    #
    class PageTextReceiver

      # Graphics state every receiver starts with. It is frozen and each
      # instance works on its own copy, so state changes made while reading
      # one page can never leak into another receiver.
      DEFAULT_GRAPHICS_STATE = {
        :ctm            => Matrix.identity(3),
        :char_spacing   => 0,
        :word_spacing   => 0,
        :h_scaling      => 100,
        :text_leading   => 0,
        :text_font      => nil,
        :text_font_size => nil,
        :text_mode      => 0,
        :text_rise      => 0,
        :text_knockout  => 0
      }.freeze

      # fonts - a hash-like object, indexed by font label, whose values
      #         respond to to_utf8(string).
      #
      def initialize(fonts)
        @fonts   = fonts
        @content = {}
        # dup so the state setters below modify a private hash instead of the
        # shared constant. A shallow copy is enough: matrices are replaced,
        # never modified in place.
        @stack   = [DEFAULT_GRAPHICS_STATE.dup]
      end

      # Returns the recorded text as a single string. Each distinct y
      # co-ordinate becomes one line, ordered from the largest y value
      # to the smallest, joined with newlines.
      #
      def content
        keys = @content.keys.sort.reverse
        keys.map { |key| @content[key] }.join("\n")
      end

      #####################################################
      # Graphics State Operators
      #####################################################

      # push a copy of the current graphics state onto the stack
      def save_graphics_state
        @stack.push clone_state
      end

      # discard the current graphics state, returning to the one below it
      def restore_graphics_state
        @stack.pop
      end

      #####################################################
      # Matrix Operators
      #####################################################

      # update the current transformation matrix.
      #
      # If the CTM is currently undefined, just store the new values.
      #
      # If there's an existing CTM, then multiply the existing matrix
      # with the new matrix to form the updated matrix.
      #
      def concatenate_matrix(a, b, c, d, e, f)
        transform = Matrix[
          [a, b, 0],
          [c, d, 0],
          [e, f, 1]
        ]
        if state[:ctm]
          state[:ctm] = transform * state[:ctm]
        else
          state[:ctm] = transform
        end
      end

      #####################################################
      # Text Object Operators
      #####################################################

      # reset the text matrix and text line matrix to the identity matrix
      def begin_text_object
        @text_matrix      = Matrix.identity(3)
        @text_line_matrix = Matrix.identity(3)
      end

      # reset the text matrix and text line matrix to the identity matrix
      def end_text_object
        @text_matrix      = Matrix.identity(3)
        @text_line_matrix = Matrix.identity(3)
      end

      #####################################################
      # Text State Operators
      #####################################################

      def set_character_spacing(char_spacing)
        state[:char_spacing] = char_spacing
      end

      # h_scaling is stored as given (a percentage, 100 by default) and
      # converted to a fraction when the text rendering matrix is built.
      def set_horizontal_text_scaling(h_scaling)
        state[:h_scaling] = h_scaling
      end

      # label is the key later used to look the font up in the fonts
      # collection passed to the constructor.
      def set_text_font_and_size(label, size)
        state[:text_font]      = label
        state[:text_font_size] = size
      end

      def set_text_leading(leading)
        state[:text_leading] = leading
      end

      def set_text_rendering_mode(mode)
        state[:text_mode] = mode
      end

      def set_text_rise(rise)
        state[:text_rise] = rise
      end

      def set_word_spacing(word_spacing)
        state[:word_spacing] = word_spacing
      end

      #####################################################
      # Text Positioning Operators
      #####################################################

      # offset the start of the current line by (x, y); the text matrix and
      # the text line matrix both become the translated text line matrix.
      def move_text_position(x, y) # Td
        temp_matrix = Matrix[
          [1, 0, 0],
          [0, 1, 0],
          [x, y, 1]
        ]
        @text_matrix = @text_line_matrix = temp_matrix * @text_line_matrix
      end

      # same as Td, but also stores -y as the text leading
      def move_text_position_and_set_leading(x, y) # TD
        set_text_leading(-1 * y)
        move_text_position(x, y)
      end

      # replace (rather than concatenate) the text and text line matrices
      def set_text_matrix_and_text_line_matrix(a, b, c, d, e, f) # Tm
        @text_matrix = @text_line_matrix = Matrix[
          [a, b, 0],
          [c, d, 0],
          [e, f, 1]
        ]
      end

      # T* is defined as "0 -Tl Td": the leading is stored as a positive
      # number, so it has to be negated to move *down* to the next line.
      def move_to_start_of_next_line # T*
        move_text_position(0, -state[:text_leading])
      end

      #####################################################
      # Text Showing Operators
      #####################################################

      # record text that is drawn on the page. The string is converted with
      # the current font and appended to the line for the current y position.
      def show_text(string) # Tj
        at = transform(Point.new(0,0))
        @content[at.y] ||= ""
        @content[at.y] << current_font.to_utf8(string)
      end

      # params is an array mixing strings and numeric position adjustments.
      # Strings are shown as-is; an adjustment larger than 1000 is recorded
      # as a single space and all other adjustments are ignored.
      def show_text_with_positioning(params) # TJ
        params.each { |arg|
          case arg
          when String
            show_text(arg)
          when Integer, Float # Fixnum no longer exists on Ruby >= 3.2
            show_text(" ") if arg > 1000
          end
        }
      end

      def move_to_next_line_and_show_text(str) # '
        move_to_start_of_next_line
        show_text(str)
      end

      def set_spacing_next_line_show_text(aw, ac, string) # "
        set_word_spacing(aw)
        set_character_spacing(ac)
        move_to_next_line_and_show_text(string)
      end

      private

      # transform x and y co-ordinates from the current text space to the
      # underlying device space.
      #
      # Returns a new Point; z is the homogeneous co-ordinate and selects
      # whether the translation row of the matrix is applied.
      #
      def transform(point, z = 1)
        trm = text_rendering_matrix
        Point.new(
          (trm[0,0] * point.x) + (trm[1,0] * point.y) + (trm[2,0] * z),
          (trm[0,1] * point.x) + (trm[1,1] * point.y) + (trm[2,1] * z)
        )
      end

      # build the text rendering matrix from the text state, the text matrix
      # and the CTM.
      #
      # Horizontal scaling is kept in the state as a percentage, so it is
      # divided by 100 here; otherwise the default of 100 would stretch the
      # x axis a hundredfold.
      #
      def text_rendering_matrix
        horizontal_scale = state[:h_scaling] / 100.0
        state_matrix = Matrix[
          [state[:text_font_size] * horizontal_scale, 0, 0],
          [0, state[:text_font_size], 0],
          [0, state[:text_rise], 1]
        ]

        state_matrix * @text_matrix * ctm
      end

      # the graphics state currently in effect (top of the stack)
      def state
        @stack.last
      end

      # when save_graphics_state is called, we need to push a new copy of the
      # current state onto the stack. That way any modifications to the state
      # will be undone once restore_graphics_state is called.
      #
      # This returns a deep clone of the current state, ensuring changes are
      # kept separate from earlier states.
      #
      # Marshal is used to round-trip the state through a string to perform
      # the deep clone. It is part of core Ruby (nothing to require) and
      # copes with the Symbol keys and Matrix values held in the state.
      #
      def clone_state
        if @stack.empty?
          {}
        else
          Marshal.load(Marshal.dump(@stack.last))
        end
      end

      # return the current transformation matrix
      #
      def ctm
        state[:ctm]
      end

      # the font object registered under the label most recently selected
      # with set_text_font_and_size
      def current_font
        @fonts[state[:text_font]]
      end

      # private class for representing points on a cartesian plane. Used
      # to carry the x/y result of #transform.
      #
      class Point
        attr_reader :x, :y

        def initialize(x,y)
          @x, @y = x,y
        end

        # straight-line distance between this point and another
        def distance(point)
          Math.hypot(point.x - x, point.y - y)
        end
      end
    end
  end
end
|
@@ -175,6 +175,8 @@ class PDF::Reader
|
|
175
175
|
# If it gets mapped to the name "IM1", then it can be placed on the page using
|
176
176
|
# invoke_xobject "IM1".
|
177
177
|
#
|
178
|
+
# DEPRECATED: this class was deprecated in version 0.11.0 and will
|
179
|
+
# eventually be removed
|
178
180
|
class PagesStrategy< AbstractStrategy # :nodoc:
|
179
181
|
OPERATORS = {
|
180
182
|
'b' => :close_fill_stroke,
|
@@ -460,17 +462,7 @@ class PDF::Reader
|
|
460
462
|
fonts = {}
|
461
463
|
resources = @ohash.object(resources[:Font]) || {}
|
462
464
|
resources.each do |label, desc|
|
463
|
-
|
464
|
-
fonts[label] = PDF::Reader::Font.new
|
465
|
-
fonts[label].label = label
|
466
|
-
fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
|
467
|
-
fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
|
468
|
-
fonts[label].encoding = PDF::Reader::Encoding.new(@ohash.object(desc[:Encoding]))
|
469
|
-
fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
|
470
|
-
if desc[:ToUnicode]
|
471
|
-
stream = @ohash.object(desc[:ToUnicode])
|
472
|
-
fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
473
|
-
end
|
465
|
+
fonts[label] = PDF::Reader::Font.new(@ohash, @ohash.object(desc))
|
474
466
|
end
|
475
467
|
fonts
|
476
468
|
end
|
@@ -31,6 +31,9 @@ class PDF::Reader
|
|
31
31
|
# Usage:
|
32
32
|
# receiver = PDF::Reader::TextReceiver.new($stdout)
|
33
33
|
# PDF::Reader.file("somefile.pdf", receiver)
|
34
|
+
#
|
35
|
+
# DEPRECATED: this class was deprecated in version 0.11.0 and will
|
36
|
+
# eventually be removed
|
34
37
|
class TextReceiver
|
35
38
|
################################################################################
|
36
39
|
# Initialize with the library user's receiver
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -149,10 +149,9 @@ class PDF::Reader
|
|
149
149
|
unless stream.hash[:Type] == :XRef
|
150
150
|
raise PDF::Reader::MalformedPDFError, "xref stream not found when expected"
|
151
151
|
end
|
152
|
-
trailer = {
|
153
|
-
|
154
|
-
|
155
|
-
trailer[:Prev] = stream.hash[:Prev] if stream.hash[:Prev]
|
152
|
+
trailer = Hash[stream.hash.select { |key, value|
|
153
|
+
[:Size, :Prev, :Root, :Encrypt, :Info, :ID].include?(key)
|
154
|
+
}]
|
156
155
|
|
157
156
|
widths = stream.hash[:W]
|
158
157
|
entry_length = widths.inject(0) { |s, w| s + w }
|
metadata
CHANGED
@@ -1,12 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
prerelease:
|
4
|
+
prerelease: true
|
5
5
|
segments:
|
6
6
|
- 0
|
7
|
-
-
|
8
|
-
-
|
9
|
-
|
7
|
+
- 11
|
8
|
+
- 0
|
9
|
+
- alpha
|
10
|
+
version: 0.11.0.alpha
|
10
11
|
platform: ruby
|
11
12
|
authors:
|
12
13
|
- James Healy
|
@@ -14,7 +15,7 @@ autorequire:
|
|
14
15
|
bindir: bin
|
15
16
|
cert_chain: []
|
16
17
|
|
17
|
-
date: 2011-
|
18
|
+
date: 2011-07-19 00:00:00 +10:00
|
18
19
|
default_executable:
|
19
20
|
dependencies:
|
20
21
|
- !ruby/object:Gem::Dependency
|
@@ -87,46 +88,49 @@ extra_rdoc_files:
|
|
87
88
|
- CHANGELOG
|
88
89
|
- MIT-LICENSE
|
89
90
|
files:
|
91
|
+
- examples/page_counter_naive.rb
|
92
|
+
- examples/rspec.rb
|
90
93
|
- examples/metadata.rb
|
91
|
-
- examples/extract_images.rb
|
92
94
|
- examples/extract_bates.rb
|
93
|
-
- examples/page_counter_improved.rb
|
94
|
-
- examples/callbacks.rb
|
95
|
-
- examples/rspec.rb
|
96
95
|
- examples/hash.rb
|
96
|
+
- examples/callbacks.rb
|
97
97
|
- examples/text.rb
|
98
|
-
- examples/page_counter_naive.rb
|
99
98
|
- examples/version.rb
|
100
|
-
-
|
101
|
-
-
|
102
|
-
- lib/pdf/reader/
|
103
|
-
- lib/pdf/reader/xref.rb
|
104
|
-
- lib/pdf/reader/buffer.rb
|
105
|
-
- lib/pdf/reader/font.rb
|
106
|
-
- lib/pdf/reader/parser.rb
|
99
|
+
- examples/page_counter_improved.rb
|
100
|
+
- examples/extract_images.rb
|
101
|
+
- lib/pdf/reader/glyphlist.txt
|
107
102
|
- lib/pdf/reader/error.rb
|
103
|
+
- lib/pdf/reader/font.rb
|
104
|
+
- lib/pdf/reader/lzw.rb
|
105
|
+
- lib/pdf/reader/print_receiver.rb
|
106
|
+
- lib/pdf/reader/reference.rb
|
108
107
|
- lib/pdf/reader/filter.rb
|
109
|
-
- lib/pdf/reader/
|
108
|
+
- lib/pdf/reader/text_receiver.rb
|
109
|
+
- lib/pdf/reader/pages_strategy.rb
|
110
|
+
- lib/pdf/reader/abstract_strategy.rb
|
111
|
+
- lib/pdf/reader/page_text_receiver.rb
|
112
|
+
- lib/pdf/reader/encoding.rb
|
110
113
|
- lib/pdf/reader/stream.rb
|
114
|
+
- lib/pdf/reader/register_receiver.rb
|
115
|
+
- lib/pdf/reader/object_hash.rb
|
116
|
+
- lib/pdf/reader/object_cache.rb
|
117
|
+
- lib/pdf/reader/token.rb
|
118
|
+
- lib/pdf/reader/page.rb
|
119
|
+
- lib/pdf/reader/xref.rb
|
111
120
|
- lib/pdf/reader/cmap.rb
|
112
121
|
- lib/pdf/reader/object_stream.rb
|
113
|
-
- lib/pdf/reader/
|
114
|
-
- lib/pdf/reader/
|
115
|
-
- lib/pdf/reader/
|
116
|
-
- lib/pdf/reader/lzw.rb
|
117
|
-
- lib/pdf/reader/register_receiver.rb
|
118
|
-
- lib/pdf/reader/abstract_strategy.rb
|
119
|
-
- lib/pdf/reader/pages_strategy.rb
|
120
|
-
- lib/pdf/reader/reference.rb
|
122
|
+
- lib/pdf/reader/metadata_strategy.rb
|
123
|
+
- lib/pdf/reader/buffer.rb
|
124
|
+
- lib/pdf/reader/encodings/zapf_dingbats.txt
|
121
125
|
- lib/pdf/reader/encodings/standard.txt
|
122
126
|
- lib/pdf/reader/encodings/mac_roman.txt
|
123
|
-
- lib/pdf/reader/encodings/
|
127
|
+
- lib/pdf/reader/encodings/mac_expert.txt
|
124
128
|
- lib/pdf/reader/encodings/win_ansi.txt
|
125
|
-
- lib/pdf/reader/encodings/
|
129
|
+
- lib/pdf/reader/encodings/symbol.txt
|
126
130
|
- lib/pdf/reader/encodings/pdf_doc.txt
|
127
|
-
- lib/pdf/reader/
|
128
|
-
- lib/pdf/
|
129
|
-
- lib/pdf/reader
|
131
|
+
- lib/pdf/reader/parser.rb
|
132
|
+
- lib/pdf/hash.rb
|
133
|
+
- lib/pdf/reader.rb
|
130
134
|
- lib/pdf-reader.rb
|
131
135
|
- Rakefile
|
132
136
|
- README.rdoc
|
@@ -140,7 +144,7 @@ has_rdoc: true
|
|
140
144
|
homepage: http://github.com/yob/pdf-reader
|
141
145
|
licenses: []
|
142
146
|
|
143
|
-
post_install_message:
|
147
|
+
post_install_message: "\n ********************************************\n\n This is an alpha release of PDF::Reader to gather feedback on the proposed\n API changes.\n\n The old API is marked as deprecated but will continue to work with no\n visible warnings for now.\n\n The new API is documented in the README and in rdoc for the PDF::Reader,\n PDF::Reader::Page and PDF::Reader::ObjectHash classes.\n\n Do not use this in production, stick to stable releases for that. If you do\n take the new API for a spin, please send any feedback my way.\n\n ********************************************\n\n"
|
144
148
|
rdoc_options:
|
145
149
|
- --title
|
146
150
|
- PDF::Reader Documentation
|
@@ -162,11 +166,13 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
162
166
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
163
167
|
none: false
|
164
168
|
requirements:
|
165
|
-
- - "
|
169
|
+
- - ">"
|
166
170
|
- !ruby/object:Gem::Version
|
167
171
|
segments:
|
168
|
-
-
|
169
|
-
|
172
|
+
- 1
|
173
|
+
- 3
|
174
|
+
- 1
|
175
|
+
version: 1.3.1
|
170
176
|
requirements: []
|
171
177
|
|
172
178
|
rubyforge_project:
|