pdf-reader 1.4.1 → 2.0.0.beta1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +8 -3
- data/{README.rdoc → README.md} +40 -23
- data/Rakefile +2 -2
- data/bin/pdf_object +4 -1
- data/lib/pdf/reader.rb +7 -112
- data/lib/pdf/reader/buffer.rb +2 -1
- data/lib/pdf/reader/cmap.rb +26 -24
- data/lib/pdf/reader/encoding.rb +4 -5
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +1 -5
- data/lib/pdf/reader/font.rb +1 -11
- data/lib/pdf/reader/glyph_hash.rb +6 -2
- data/lib/pdf/reader/lzw.rb +1 -1
- data/lib/pdf/reader/object_hash.rb +35 -16
- data/lib/pdf/reader/page_layout.rb +6 -17
- data/lib/pdf/reader/pages_strategy.rb +1 -304
- data/lib/pdf/reader/parser.rb +6 -4
- data/lib/pdf/reader/standard_security_handler.rb +18 -14
- data/lib/pdf/reader/text_run.rb +3 -9
- metadata +14 -47
- data/bin/pdf_list_callbacks +0 -17
- data/lib/pdf/reader/abstract_strategy.rb +0 -81
- data/lib/pdf/reader/metadata_strategy.rb +0 -56
- data/lib/pdf/reader/text_receiver.rb +0 -265
@@ -1,265 +0,0 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
|
3
|
-
################################################################################
|
4
|
-
#
|
5
|
-
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
6
|
-
#
|
7
|
-
# Permission is hereby granted, free of charge, to any person obtaining
|
8
|
-
# a copy of this software and associated documentation files (the
|
9
|
-
# "Software"), to deal in the Software without restriction, including
|
10
|
-
# without limitation the rights to use, copy, modify, merge, publish,
|
11
|
-
# distribute, sublicense, and/or sell copies of the Software, and to
|
12
|
-
# permit persons to whom the Software is furnished to do so, subject to
|
13
|
-
# the following conditions:
|
14
|
-
#
|
15
|
-
# The above copyright notice and this permission notice shall be
|
16
|
-
# included in all copies or substantial portions of the Software.
|
17
|
-
#
|
18
|
-
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
19
|
-
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
20
|
-
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
21
|
-
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
22
|
-
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
23
|
-
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
24
|
-
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
25
|
-
#
|
26
|
-
################################################################################
|
27
|
-
|
28
|
-
class PDF::Reader
  # An example receiver class that collects the text shown on each page of a
  # PDF file and appends it to the object given to the constructor.
  #
  # Text is grouped into output lines using only the vertical text position:
  # each distinct y location is mapped to a line index and every string shown
  # at that location is appended to that line. The horizontal offsets passed
  # to the positioning operators are ignored.
  #
  # Usage:
  #   receiver = PDF::Reader::TextReceiver.new($stdout)
  #   PDF::Reader.file("somefile.pdf", receiver)
  #
  # DEPRECATED: this class was deprecated in version 0.11.0 and will
  # eventually be removed.
  class TextReceiver
    # Initialize with the library user's receiver.
    #
    # main_receiver - object that receives each page's text via #<<
    #                 (for example an IO or a String).
    def initialize(main_receiver)
      @main_receiver = main_receiver
      @upper_corners = []
    end

    # Called when the document parsing begins. Resets the stack of page
    # corner hashes; +root+ is accepted for the callback signature but unused.
    def begin_document(root)
      @upper_corners = []
    end

    # Called when the document parsing ends.
    #
    # @state is only created by #begin_page, so a document without pages
    # reaches this callback with no state at all; guard against that instead
    # of raising NoMethodError on nil.
    def end_document
      @state.clear if @state
    end

    # Called when a page container begins. Pushes the corners derived from
    # the container's :MediaBox (or inherited from the enclosing container).
    def begin_page_container(page)
      @upper_corners.push(media_box_check(page))
    end

    # Called when a page container ends. Drops the corners pushed by
    # #begin_page_container.
    def end_page_container
      @upper_corners.pop
    end

    # Called when new page parsing begins. Resets all per-page bookkeeping.
    #
    # info - the page dictionary; only its :MediaBox entry is read here.
    def begin_page(info)
      @page = info

      # Bottom of the text-state stack; BT/ET push and pop copies of it.
      @state = [{
        :char_spacing  => 0,
        :word_spacing  => 0,
        :hori_scaling  => 100,
        :leading       => 0,
        :tj_adjustment => 0,
      }]

      @upper_corners.push(media_box_check(info))

      @output         = []    # one string per output line
      @line           = 0     # index into @output currently written to
      @location       = 0     # y location of the current line
      @displacement   = {}    # y location => line index
      @smallest_y_loc = @upper_corners.last[:ury]
      @written_to     = false # true once any text has been shown on the page
    end

    # Called when page parsing ends. Emits the page's lines, joined with
    # newlines, to the main receiver and drops the page's corners.
    def end_page
      @main_receiver << @output.join("\n")
      @upper_corners.pop
    end

    # PDF operator BT. Pushes a copy of the current text state.
    def begin_text_object
      @state.push(@state.last.dup)
    end

    # PDF operator ET. Discards the text state pushed by BT.
    def end_text_object
      @state.pop
    end

    # PDF operator Tm. Only the sixth operand (named f in the PDF spec, the
    # vertical translation) is used; it becomes the new y location.
    def set_text_matrix_and_text_line_matrix(*args)
      _a, _b, _c, _d, _e, f = *args
      calculate_line_and_location(f)
    end

    # PDF operator Tc
    def set_character_spacing(n)
      @state.last[:char_spacing] = n
    end

    # PDF operator Tw
    def set_word_spacing(n)
      @state.last[:word_spacing] = n
    end

    # PDF operator Tz. Stores the scaling as a ratio (n percent / 100).
    #
    # Float division is used so that Integer operands are not truncated
    # (150 must become 1.5, not 1).
    #
    # NOTE(review): #begin_page seeds :hori_scaling with 100 (a percentage)
    # while this setter stores a ratio. Nothing in this class reads the value;
    # confirm the intended unit before relying on it.
    def set_horizontal_text_scaling(n)
      @state.last[:hori_scaling] = n / 100.0
    end

    # PDF operator TL. The leading is stored as given; per the PDF spec it is
    # a positive distance between baselines.
    def set_text_leading(n)
      @state.last[:leading] = n
    end

    # PDF operator T*. Equivalent to "0 -leading Td": moving to the next line
    # decreases the y coordinate by the current leading.
    def move_to_start_of_next_line
      move_text_position(0, -@state.last[:leading])
    end

    # PDF operator Td. Only the vertical offset is used; +tx+ is ignored.
    def move_text_position(tx, ty)
      calculate_line_and_location(@location + ty)
    end

    # PDF operator TD. Equivalent to "-ty TL" followed by "tx ty Td", so the
    # leading stays positive when ty is the usual negative offset.
    def move_text_position_and_set_leading(tx, ty)
      set_text_leading(-ty)
      move_text_position(tx, ty)
    end

    # PDF operator Tj. Appends +string+ to the current output line.
    #
    # When the pending TJ adjustment is below -1000, spaces are inserted
    # first: one per 900 units of adjustment.
    def show_text(string)
      place = (@output[@line] ||= "")
      adjustment = @state.last[:tj_adjustment]

      place << " " * (adjustment.abs / 900) if adjustment < -1000
      place << string

      @written_to = true
    end

    # Alternative renderer that writes +string+ into a character grid.
    #
    # NOTE(review): this method depends on @tm, Matrix and the
    # TS_UNITS_PER_H_CHAR / TS_UNITS_PER_V_CHAR constants, none of which are
    # defined in this file, and nothing in this class calls it. The logic is
    # kept as it was; verify those names exist before using it.
    def super_show_text(string)
      urx = @upper_corners.last[:urx]/TS_UNITS_PER_H_CHAR
      ury = @upper_corners.last[:ury]/TS_UNITS_PER_V_CHAR

      x = (@tm[2,0]/TS_UNITS_PER_H_CHAR).to_i
      y = (ury - (@tm[2,1]/TS_UNITS_PER_V_CHAR)).to_i

      # The row is created (blank, urx characters wide) even when the string
      # turns out not to fit and nothing is written.
      place = (@output[y] ||= (" " * urx.to_i))
      return if x+string.size >= urx

      string.split(//).each do |c|
        chars = 1

        case c
        when " "
          chars += @state.last[:word_spacing].to_i
          place[x-1, chars] = (" " * chars)
        else
          chars += @state.last[:char_spacing].to_i
          chars -= (@state.last[:tj_adjustment]/1000).to_i if @state.last[:tj_adjustment]
          chars = 1 if chars < 1

          place[x-1] = c
          place[x, chars-1] = (" " * (chars-1)) if chars > 1
        end

        x += chars
      end

      @tm += Matrix.rows([[1, 0, 0], [0, 1, 0], [x*TS_UNITS_PER_H_CHAR, y*TS_UNITS_PER_V_CHAR, 1]])
    end

    # PDF operator TJ. +params+ mixes numbers and strings: a number becomes
    # the adjustment applied to the strings that follow it, a string is shown
    # via #show_text. The previous adjustment is restored afterwards.
    def show_text_with_positioning(params)
      prev_adjustment = @state.last[:tj_adjustment]

      params.each do |param|
        case param
        when Float, Integer
          @state.last[:tj_adjustment] = param
        else
          show_text(param)
        end
      end

      @state.last[:tj_adjustment] = prev_adjustment
    end

    # PDF operator '
    def move_to_next_line_and_show_text(string)
      move_to_start_of_next_line
      show_text(string)
    end

    # PDF operator "
    def set_spacing_next_line_show_text(aw, ac, string)
      set_word_spacing(aw)
      set_character_spacing(ac)
      move_to_next_line_and_show_text(string)
    end

    # Returns a { :urx, :ury } hash holding the width and height of +dict+'s
    # :MediaBox. When +dict+ has no :MediaBox, a copy of the innermost corners
    # already on the stack is returned (zeros when the stack is empty).
    def media_box_check(dict)
      corners = (@upper_corners.last || {:urx => 0, :ury => 0}).dup

      if dict.has_key?(:MediaBox)
        media_box = dict[:MediaBox]
        corners[:urx] = media_box[2] - media_box[0]
        corners[:ury] = media_box[3] - media_box[1]
      end

      corners
    end

    # Maps the y location +new_loc+ to an output line and makes that line
    # current, updating @location, @line and @smallest_y_loc.
    #
    # * Before any text has been written on the page, every location maps to
    #   line 0.
    # * A location that was seen before reuses its line.
    # * A new location below the current one, or below everything seen so
    #   far, starts the line after the current one.
    # * Any other new location snaps to the nearest previously seen location
    #   below it. If there is none, it becomes a new location on line 0
    #   (previously this case stored a nil key and raised NoMethodError).
    def calculate_line_and_location(new_loc)
      key = new_loc

      if !@written_to
        @displacement[key] = 0
      elsif !@displacement.has_key?(key)
        if key < @location || key < @smallest_y_loc
          @displacement[key] = @line + 1
        else
          nearest_below = @displacement.keys.select { |seen| seen < key }.max
          if nearest_below
            key = nearest_below
          else
            @displacement[key] = 0
          end
        end
      end

      @smallest_y_loc = key if key < @smallest_y_loc
      @location = key
      @line = @displacement[key]
    end
  end
end
|
265
|
-
################################################################################
|