pdf-reader 2.2.0 → 2.11.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +90 -0
- data/README.md +18 -3
- data/Rakefile +1 -1
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_text +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +91 -47
- data/lib/pdf/reader/cid_widths.rb +7 -4
- data/lib/pdf/reader/cmap.rb +83 -59
- data/lib/pdf/reader/encoding.rb +17 -14
- data/lib/pdf/reader/error.rb +15 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +12 -10
- data/lib/pdf/reader/filter/flate.rb +30 -16
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -1
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +11 -11
- data/lib/pdf/reader/font.rb +89 -26
- data/lib/pdf/reader/font_descriptor.rb +22 -18
- data/lib/pdf/reader/form_xobject.rb +18 -5
- data/lib/pdf/reader/glyph_hash.rb +28 -13
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/lzw.rb +28 -11
- data/lib/pdf/reader/no_text_filter.rb +14 -0
- data/lib/pdf/reader/null_security_handler.rb +1 -4
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +292 -63
- data/lib/pdf/reader/object_stream.rb +3 -2
- data/lib/pdf/reader/overlapping_runs_filter.rb +72 -0
- data/lib/pdf/reader/page.rb +143 -16
- data/lib/pdf/reader/page_layout.rb +43 -39
- data/lib/pdf/reader/page_state.rb +26 -17
- data/lib/pdf/reader/page_text_receiver.rb +74 -4
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +34 -14
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +113 -0
- data/lib/pdf/reader/reference.rb +3 -1
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +17 -9
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
- data/lib/pdf/reader/stream.rb +3 -2
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +40 -5
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +8 -7
- data/lib/pdf/reader/type_check.rb +98 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +27 -17
- data/lib/pdf/reader/width_calculator/composite.rb +6 -1
- data/lib/pdf/reader/width_calculator/true_type.rb +10 -11
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +6 -4
- data/lib/pdf/reader/width_calculator/type_zero.rb +6 -2
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +37 -11
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +49 -24
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +2048 -0
- metadata +39 -23
- data/lib/pdf/hash.rb +0 -20
- data/lib/pdf/reader/orientation_detector.rb +0 -34
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
data/lib/pdf/reader/page.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
module PDF
|
@@ -13,7 +14,7 @@ module PDF
|
|
13
14
|
# objects accessor to help walk the page dictionary in any useful way.
|
14
15
|
#
|
15
16
|
class Page
|
16
|
-
|
17
|
+
extend Forwardable
|
17
18
|
|
18
19
|
# lowlevel hash-like access to all objects in the underlying PDF
|
19
20
|
attr_reader :objects
|
@@ -26,6 +27,15 @@ module PDF
|
|
26
27
|
# operations
|
27
28
|
attr_reader :cache
|
28
29
|
|
30
|
+
def_delegators :resources, :color_spaces
|
31
|
+
def_delegators :resources, :fonts
|
32
|
+
def_delegators :resources, :graphic_states
|
33
|
+
def_delegators :resources, :patterns
|
34
|
+
def_delegators :resources, :procedure_sets
|
35
|
+
def_delegators :resources, :properties
|
36
|
+
def_delegators :resources, :shadings
|
37
|
+
def_delegators :resources, :xobjects
|
38
|
+
|
29
39
|
# creates a new page wrapper.
|
30
40
|
#
|
31
41
|
# * objects - an ObjectHash instance that wraps a PDF file
|
@@ -33,10 +43,10 @@ module PDF
|
|
33
43
|
#
|
34
44
|
def initialize(objects, pagenum, options = {})
|
35
45
|
@objects, @pagenum = objects, pagenum
|
36
|
-
@page_object = objects.
|
46
|
+
@page_object = objects.deref_hash(objects.page_references[pagenum - 1]) || {}
|
37
47
|
@cache = options[:cache] || {}
|
38
48
|
|
39
|
-
|
49
|
+
if @page_object.empty?
|
40
50
|
raise InvalidPageError, "Invalid page: #{pagenum}"
|
41
51
|
end
|
42
52
|
end
|
@@ -59,7 +69,7 @@ module PDF
|
|
59
69
|
def attributes
|
60
70
|
@attributes ||= {}.tap { |hash|
|
61
71
|
page_with_ancestors.reverse.each do |obj|
|
62
|
-
hash.merge!(@objects.
|
72
|
+
hash.merge!(@objects.deref_hash(obj) || {})
|
63
73
|
end
|
64
74
|
}
|
65
75
|
# This shouldn't be necessary, but some non-compliant PDFs leave MediaBox
|
@@ -68,22 +78,56 @@ module PDF
|
|
68
78
|
@attributes
|
69
79
|
end
|
70
80
|
|
81
|
+
def height
|
82
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
83
|
+
rect.apply_rotation(rotate) if rotate > 0
|
84
|
+
rect.height
|
85
|
+
end
|
86
|
+
|
87
|
+
def width
|
88
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
89
|
+
rect.apply_rotation(rotate) if rotate > 0
|
90
|
+
rect.width
|
91
|
+
end
|
92
|
+
|
93
|
+
def origin
|
94
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
95
|
+
rect.apply_rotation(rotate) if rotate > 0
|
96
|
+
|
97
|
+
rect.bottom_left
|
98
|
+
end
|
99
|
+
|
71
100
|
# Convenience method to identify the page's orientation.
|
72
101
|
#
|
73
102
|
def orientation
|
74
|
-
|
103
|
+
if height > width
|
104
|
+
"portrait"
|
105
|
+
else
|
106
|
+
"landscape"
|
107
|
+
end
|
75
108
|
end
|
76
109
|
|
77
110
|
# returns the plain text content of this page encoded as UTF-8. Any
|
78
111
|
# characters that can't be translated will be returned as a ▯
|
79
112
|
#
|
80
|
-
def text
|
113
|
+
def text(opts = {})
|
81
114
|
receiver = PageTextReceiver.new
|
82
115
|
walk(receiver)
|
83
|
-
receiver.
|
116
|
+
runs = receiver.runs(opts)
|
117
|
+
|
118
|
+
# rectangles[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
|
119
|
+
mediabox = rectangles[:MediaBox] || Rectangle.new(0, 0, 0, 0)
|
120
|
+
|
121
|
+
PageLayout.new(runs, mediabox).to_s
|
84
122
|
end
|
85
123
|
alias :to_s :text
|
86
124
|
|
125
|
+
def runs(opts = {})
|
126
|
+
receiver = PageTextReceiver.new
|
127
|
+
walk(receiver)
|
128
|
+
receiver.runs(opts)
|
129
|
+
end
|
130
|
+
|
87
131
|
# processes the raw content stream for this page in sequential order and
|
88
132
|
# passes callbacks to the receiver objects.
|
89
133
|
#
|
@@ -108,6 +152,9 @@ module PDF
|
|
108
152
|
# the program in the correct order and calls out to your implementation.
|
109
153
|
#
|
110
154
|
def walk(*receivers)
|
155
|
+
receivers = receivers.map { |receiver|
|
156
|
+
ValidatingReceiver.new(receiver)
|
157
|
+
}
|
111
158
|
callback(receivers, :page=, [self])
|
112
159
|
content_stream(receivers, raw_content)
|
113
160
|
end
|
@@ -116,25 +163,85 @@ module PDF
|
|
116
163
|
# see here unless you're a PDF nerd like me.
|
117
164
|
#
|
118
165
|
def raw_content
|
119
|
-
contents = objects.
|
166
|
+
contents = objects.deref_stream_or_array(@page_object[:Contents])
|
120
167
|
[contents].flatten.compact.map { |obj|
|
121
|
-
objects.
|
122
|
-
}.map { |obj|
|
168
|
+
objects.deref_stream(obj)
|
169
|
+
}.compact.map { |obj|
|
123
170
|
obj.unfiltered_data
|
124
171
|
}.join(" ")
|
125
172
|
end
|
126
173
|
|
174
|
+
# returns the angle to rotate the page clockwise. Always 0, 90, 180 or 270
|
175
|
+
#
|
176
|
+
def rotate
|
177
|
+
value = attributes[:Rotate].to_i
|
178
|
+
case value
|
179
|
+
when 0, 90, 180, 270
|
180
|
+
value
|
181
|
+
else
|
182
|
+
0
|
183
|
+
end
|
184
|
+
end
|
185
|
+
|
186
|
+
# returns the "boxes" that define the page object.
|
187
|
+
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
188
|
+
#
|
189
|
+
# DEPRECATED. Recommend using Page#rectangles instead
|
190
|
+
#
|
191
|
+
def boxes
|
192
|
+
# In ruby 2.4+ we could use Hash#transform_values
|
193
|
+
Hash[rectangles.map{ |k,rect| [k,rect.to_a] } ]
|
194
|
+
end
|
195
|
+
|
196
|
+
# returns the "boxes" that define the page object.
|
197
|
+
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
198
|
+
#
|
199
|
+
def rectangles
|
200
|
+
# attributes[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
|
201
|
+
mediabox = objects.deref_array_of_numbers(attributes[:MediaBox]) || []
|
202
|
+
cropbox = objects.deref_array_of_numbers(attributes[:CropBox]) || mediabox
|
203
|
+
bleedbox = objects.deref_array_of_numbers(attributes[:BleedBox]) || cropbox
|
204
|
+
trimbox = objects.deref_array_of_numbers(attributes[:TrimBox]) || cropbox
|
205
|
+
artbox = objects.deref_array_of_numbers(attributes[:ArtBox]) || cropbox
|
206
|
+
|
207
|
+
begin
|
208
|
+
mediarect = Rectangle.from_array(mediabox)
|
209
|
+
croprect = Rectangle.from_array(cropbox)
|
210
|
+
bleedrect = Rectangle.from_array(bleedbox)
|
211
|
+
trimrect = Rectangle.from_array(trimbox)
|
212
|
+
artrect = Rectangle.from_array(artbox)
|
213
|
+
rescue ArgumentError => e
|
214
|
+
raise MalformedPDFError, e.message
|
215
|
+
end
|
216
|
+
|
217
|
+
if rotate > 0
|
218
|
+
mediarect.apply_rotation(rotate)
|
219
|
+
croprect.apply_rotation(rotate)
|
220
|
+
bleedrect.apply_rotation(rotate)
|
221
|
+
trimrect.apply_rotation(rotate)
|
222
|
+
artrect.apply_rotation(rotate)
|
223
|
+
end
|
224
|
+
|
225
|
+
{
|
226
|
+
MediaBox: mediarect,
|
227
|
+
CropBox: croprect,
|
228
|
+
BleedBox: bleedrect,
|
229
|
+
TrimBox: trimrect,
|
230
|
+
ArtBox: artrect,
|
231
|
+
}
|
232
|
+
end
|
233
|
+
|
127
234
|
private
|
128
235
|
|
129
236
|
def root
|
130
|
-
root ||= objects.
|
237
|
+
@root ||= objects.deref_hash(@objects.trailer[:Root]) || {}
|
131
238
|
end
|
132
239
|
|
133
240
|
# Returns the resources that accompany this page. Includes
|
134
241
|
# resources inherited from parents.
|
135
242
|
#
|
136
243
|
def resources
|
137
|
-
@resources ||= @objects.
|
244
|
+
@resources ||= Resources.new(@objects, @objects.deref_hash(attributes[:Resources]) || {})
|
138
245
|
end
|
139
246
|
|
140
247
|
def content_stream(receivers, instructions)
|
@@ -143,8 +250,8 @@ module PDF
|
|
143
250
|
params = []
|
144
251
|
|
145
252
|
while (token = parser.parse_token(PagesStrategy::OPERATORS))
|
146
|
-
if token.kind_of?(Token)
|
147
|
-
callback(receivers,
|
253
|
+
if token.kind_of?(Token) && method_name = PagesStrategy::OPERATORS[token]
|
254
|
+
callback(receivers, method_name, params)
|
148
255
|
params.clear
|
149
256
|
else
|
150
257
|
params << token
|
@@ -156,9 +263,26 @@ module PDF
|
|
156
263
|
|
157
264
|
# calls the name callback method on each receiver object with params as the arguments
|
158
265
|
#
|
266
|
+
# The silly style here is because sorbet won't let me use splat arguments
|
267
|
+
#
|
159
268
|
def callback(receivers, name, params=[])
|
160
269
|
receivers.each do |receiver|
|
161
|
-
|
270
|
+
if receiver.respond_to?(name)
|
271
|
+
case params.size
|
272
|
+
when 0 then receiver.send(name)
|
273
|
+
when 1 then receiver.send(name, params[0])
|
274
|
+
when 2 then receiver.send(name, params[0], params[1])
|
275
|
+
when 3 then receiver.send(name, params[0], params[1], params[2])
|
276
|
+
when 4 then receiver.send(name, params[0], params[1], params[2], params[3])
|
277
|
+
when 5 then receiver.send(name, params[0], params[1], params[2], params[3], params[4])
|
278
|
+
when 6 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5])
|
279
|
+
when 7 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6])
|
280
|
+
when 8 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7])
|
281
|
+
when 9 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8])
|
282
|
+
else
|
283
|
+
receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8], params[9])
|
284
|
+
end
|
285
|
+
end
|
162
286
|
end
|
163
287
|
end
|
164
288
|
|
@@ -170,7 +294,10 @@ module PDF
|
|
170
294
|
if origin.nil?
|
171
295
|
[]
|
172
296
|
else
|
173
|
-
obj = objects.
|
297
|
+
obj = objects.deref_hash(origin)
|
298
|
+
if obj.nil?
|
299
|
+
raise MalformedPDFError, "parent mus not be nil"
|
300
|
+
end
|
174
301
|
[ select_inheritable(obj) ] + ancestors(obj[:Parent])
|
175
302
|
end
|
176
303
|
end
|
@@ -1,6 +1,10 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
5
|
+
require 'pdf/reader/overlapping_runs_filter'
|
6
|
+
require 'pdf/reader/zero_width_runs_filter'
|
7
|
+
|
4
8
|
class PDF::Reader
|
5
9
|
|
6
10
|
# Takes a collection of TextRun objects and renders them into a single
|
@@ -13,24 +17,28 @@ class PDF::Reader
|
|
13
17
|
DEFAULT_FONT_SIZE = 12
|
14
18
|
|
15
19
|
def initialize(runs, mediabox)
|
16
|
-
|
20
|
+
# mediabox is a 4-element array for now, but it'd be nice to switch to a
|
21
|
+
# PDF::Reader::Rectangle at some point
|
22
|
+
PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
|
17
23
|
|
18
|
-
@
|
24
|
+
@mediabox = process_mediabox(mediabox)
|
25
|
+
@runs = runs
|
19
26
|
@mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
|
20
27
|
@mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
|
21
|
-
@
|
22
|
-
@
|
23
|
-
|
24
|
-
@
|
28
|
+
@median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
|
29
|
+
@x_offset = @runs.map(&:x).sort.first || 0
|
30
|
+
lowest_y = @runs.map(&:y).sort.first || 0
|
31
|
+
@y_offset = lowest_y > 0 ? 0 : lowest_y
|
25
32
|
end
|
26
33
|
|
27
34
|
def to_s
|
28
35
|
return "" if @runs.empty?
|
36
|
+
return "" if row_count == 0
|
29
37
|
|
30
38
|
page = row_count.times.map { |i| " " * col_count }
|
31
39
|
@runs.each do |run|
|
32
40
|
x_pos = ((run.x - @x_offset) / col_multiplier).round
|
33
|
-
y_pos = row_count - (run.y / row_multiplier).round
|
41
|
+
y_pos = row_count - ((run.y - @y_offset) / row_multiplier).round
|
34
42
|
if y_pos <= row_count && y_pos >= 0 && x_pos <= col_count && x_pos >= 0
|
35
43
|
local_string_insert(page[y_pos-1], run.text, x_pos)
|
36
44
|
end
|
@@ -40,6 +48,14 @@ class PDF::Reader
|
|
40
48
|
|
41
49
|
private
|
42
50
|
|
51
|
+
def page_width
|
52
|
+
@mediabox.width
|
53
|
+
end
|
54
|
+
|
55
|
+
def page_height
|
56
|
+
@mediabox.height
|
57
|
+
end
|
58
|
+
|
43
59
|
# given an array of strings, return a new array with empty rows from the
|
44
60
|
# beginning and end removed.
|
45
61
|
#
|
@@ -58,19 +74,19 @@ class PDF::Reader
|
|
58
74
|
end
|
59
75
|
|
60
76
|
def row_count
|
61
|
-
@row_count ||= (
|
77
|
+
@row_count ||= (page_height / @mean_font_size).floor
|
62
78
|
end
|
63
79
|
|
64
80
|
def col_count
|
65
|
-
@col_count ||= ((
|
81
|
+
@col_count ||= ((page_width / @median_glyph_width) * 1.05).floor
|
66
82
|
end
|
67
83
|
|
68
84
|
def row_multiplier
|
69
|
-
@row_multiplier ||=
|
85
|
+
@row_multiplier ||= page_height.to_f / row_count.to_f
|
70
86
|
end
|
71
87
|
|
72
88
|
def col_multiplier
|
73
|
-
@col_multiplier ||=
|
89
|
+
@col_multiplier ||= page_width.to_f / col_count.to_f
|
74
90
|
end
|
75
91
|
|
76
92
|
def mean(collection)
|
@@ -81,40 +97,28 @@ class PDF::Reader
|
|
81
97
|
end
|
82
98
|
end
|
83
99
|
|
84
|
-
def
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
100
|
+
def median(collection)
|
101
|
+
if collection.size == 0
|
102
|
+
0
|
103
|
+
else
|
104
|
+
collection.sort[(collection.size * 0.5).floor]
|
105
|
+
end
|
90
106
|
end
|
91
107
|
|
92
|
-
|
93
|
-
|
94
|
-
def merge_runs(runs)
|
95
|
-
runs.group_by { |char|
|
96
|
-
char.y.to_i
|
97
|
-
}.map { |y, chars|
|
98
|
-
group_chars_into_runs(chars.sort)
|
99
|
-
}.flatten.sort
|
108
|
+
def local_string_insert(haystack, needle, index)
|
109
|
+
haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
|
100
110
|
end
|
101
111
|
|
102
|
-
def
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
runs << head
|
111
|
-
end
|
112
|
+
def process_mediabox(mediabox)
|
113
|
+
if mediabox.is_a?(Array)
|
114
|
+
msg = "Passing the mediabox to PageLayout as an Array is deprecated," +
|
115
|
+
" please use a Rectangle instead"
|
116
|
+
$stderr.puts msg
|
117
|
+
PDF::Reader::Rectangle.from_array(mediabox)
|
118
|
+
else
|
119
|
+
mediabox
|
112
120
|
end
|
113
|
-
runs
|
114
121
|
end
|
115
122
|
|
116
|
-
def local_string_insert(haystack, needle, index)
|
117
|
-
haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
|
118
|
-
end
|
119
123
|
end
|
120
124
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'pdf/reader/transformation_matrix'
|
@@ -15,7 +16,7 @@ class PDF::Reader
|
|
15
16
|
:h_scaling => 1.0,
|
16
17
|
:text_leading => 0,
|
17
18
|
:text_font => nil,
|
18
|
-
:text_font_size =>
|
19
|
+
:text_font_size => 0,
|
19
20
|
:text_mode => 0,
|
20
21
|
:text_rise => 0,
|
21
22
|
:text_knockout => 0
|
@@ -30,7 +31,13 @@ class PDF::Reader
|
|
30
31
|
@xobject_stack = [page.xobjects]
|
31
32
|
@cs_stack = [page.color_spaces]
|
32
33
|
@stack = [DEFAULT_GRAPHICS_STATE.dup]
|
33
|
-
state[:ctm]
|
34
|
+
state[:ctm] = identity_matrix
|
35
|
+
|
36
|
+
# These are only valid when inside a `BT` block and we re-initialize them on each
|
37
|
+
# `BT`. However, we need the instance variables set so PDFs with the text operators
|
38
|
+
# out of order don't trigger NoMethodError when these are nil
|
39
|
+
@text_matrix = identity_matrix
|
40
|
+
@text_line_matrix = identity_matrix
|
34
41
|
end
|
35
42
|
|
36
43
|
#####################################################
|
@@ -312,7 +319,7 @@ class PDF::Reader
|
|
312
319
|
# may need to be added
|
313
320
|
#
|
314
321
|
def process_glyph_displacement(w0, tj, word_boundary)
|
315
|
-
fs =
|
322
|
+
fs = state[:text_font_size]
|
316
323
|
tc = state[:char_spacing]
|
317
324
|
if word_boundary
|
318
325
|
tw = state[:word_spacing]
|
@@ -322,22 +329,24 @@ class PDF::Reader
|
|
322
329
|
th = state[:h_scaling]
|
323
330
|
# optimise the common path to reduce Float allocations
|
324
331
|
if th == 1 && tj == 0 && tc == 0 && tw == 0
|
325
|
-
|
326
|
-
|
332
|
+
tx = w0 * fs
|
333
|
+
elsif tj != 0
|
334
|
+
# don't apply spacing to TJ displacement
|
335
|
+
tx = (w0 - (tj/1000.0)) * fs * th
|
327
336
|
else
|
328
|
-
|
329
|
-
tx =
|
337
|
+
# apply horizontal scaling to spacing values but not font size
|
338
|
+
tx = ((w0 * fs) + tc + tw) * th
|
330
339
|
end
|
331
|
-
|
332
|
-
# TODO: I'm pretty sure that tx shouldn't need to be divided by
|
333
|
-
# ctm[0] here, but this gets my tests green and I'm out of
|
334
|
-
# ideas for now
|
335
340
|
# TODO: support ty > 0
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
+
ty = 0
|
342
|
+
temp = TransformationMatrix.new(1, 0,
|
343
|
+
0, 1,
|
344
|
+
tx, ty)
|
345
|
+
@text_matrix = temp.multiply!(
|
346
|
+
@text_matrix.a, @text_matrix.b,
|
347
|
+
@text_matrix.c, @text_matrix.d,
|
348
|
+
@text_matrix.e, @text_matrix.f
|
349
|
+
)
|
341
350
|
@font_size = @text_rendering_matrix = nil # invalidate cached value
|
342
351
|
end
|
343
352
|
|
@@ -381,7 +390,7 @@ class PDF::Reader
|
|
381
390
|
#
|
382
391
|
def build_fonts(raw_fonts)
|
383
392
|
wrapped_fonts = raw_fonts.map { |label, font|
|
384
|
-
[label, PDF::Reader::Font.new(@objects, @objects.
|
393
|
+
[label, PDF::Reader::Font.new(@objects, @objects.deref_hash(font) || {})]
|
385
394
|
}
|
386
395
|
|
387
396
|
::Hash[wrapped_fonts]
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'forwardable'
|
@@ -41,13 +42,39 @@ module PDF
|
|
41
42
|
# starting a new page
|
42
43
|
def page=(page)
|
43
44
|
@state = PageState.new(page)
|
45
|
+
@page = page
|
44
46
|
@content = []
|
45
47
|
@characters = []
|
46
|
-
@mediabox = page.objects.deref(page.attributes[:MediaBox])
|
47
48
|
end
|
48
49
|
|
50
|
+
def runs(opts = {})
|
51
|
+
runs = @characters
|
52
|
+
|
53
|
+
if rect = opts.fetch(:rect, @page.rectangles[:CropBox])
|
54
|
+
runs = BoundingRectangleRunsFilter.runs_within_rect(runs, rect)
|
55
|
+
end
|
56
|
+
|
57
|
+
if opts.fetch(:skip_zero_width, true)
|
58
|
+
runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
|
59
|
+
end
|
60
|
+
|
61
|
+
if opts.fetch(:skip_overlapping, true)
|
62
|
+
runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
|
63
|
+
end
|
64
|
+
|
65
|
+
runs = NoTextFilter.exclude_empty_strings(runs)
|
66
|
+
|
67
|
+
if opts.fetch(:merge, true)
|
68
|
+
runs = merge_runs(runs)
|
69
|
+
end
|
70
|
+
|
71
|
+
runs
|
72
|
+
end
|
73
|
+
|
74
|
+
# deprecated
|
49
75
|
def content
|
50
|
-
|
76
|
+
mediabox = @page.rectangles[:MediaBox]
|
77
|
+
PageLayout.new(runs, mediabox).to_s
|
51
78
|
end
|
52
79
|
|
53
80
|
#####################################################
|
@@ -62,8 +89,10 @@ module PDF
|
|
62
89
|
params.each do |arg|
|
63
90
|
if arg.is_a?(String)
|
64
91
|
internal_show_text(arg)
|
65
|
-
|
92
|
+
elsif arg.is_a?(Numeric)
|
66
93
|
@state.process_glyph_displacement(0, arg, false)
|
94
|
+
else
|
95
|
+
# skip it
|
67
96
|
end
|
68
97
|
end
|
69
98
|
end
|
@@ -94,6 +123,7 @@ module PDF
|
|
94
123
|
private
|
95
124
|
|
96
125
|
def internal_show_text(string)
|
126
|
+
PDF::Reader::Error.validate_type_as_malformed(string, "string", String)
|
97
127
|
if @state.current_font.nil?
|
98
128
|
raise PDF::Reader::MalformedPDFError, "current font is invalid"
|
99
129
|
end
|
@@ -101,11 +131,13 @@ module PDF
|
|
101
131
|
glyphs.each_with_index do |glyph_code, index|
|
102
132
|
# paint the current glyph
|
103
133
|
newx, newy = @state.trm_transform(0,0)
|
134
|
+
newx, newy = apply_rotation(newx, newy)
|
135
|
+
|
104
136
|
utf8_chars = @state.current_font.to_utf8(glyph_code)
|
105
137
|
|
106
138
|
# apply the glyph displacement for the current glyph so the next
|
107
139
|
# glyph will appear in the correct position
|
108
|
-
glyph_width = @state.current_font.
|
140
|
+
glyph_width = @state.current_font.glyph_width_in_text_space(glyph_code)
|
109
141
|
th = 1
|
110
142
|
scaled_glyph_width = glyph_width * @state.font_size * th
|
111
143
|
unless utf8_chars == SPACE
|
@@ -115,6 +147,44 @@ module PDF
|
|
115
147
|
end
|
116
148
|
end
|
117
149
|
|
150
|
+
def apply_rotation(x, y)
|
151
|
+
if @page.rotate == 90
|
152
|
+
tmp = x
|
153
|
+
x = y
|
154
|
+
y = tmp * -1
|
155
|
+
elsif @page.rotate == 180
|
156
|
+
y *= -1
|
157
|
+
x *= -1
|
158
|
+
elsif @page.rotate == 270
|
159
|
+
tmp = y
|
160
|
+
y = x
|
161
|
+
x = tmp * -1
|
162
|
+
end
|
163
|
+
return x, y
|
164
|
+
end
|
165
|
+
|
166
|
+
# take a collection of TextRun objects and merge any that are in close
|
167
|
+
# proximity
|
168
|
+
def merge_runs(runs)
|
169
|
+
runs.group_by { |char|
|
170
|
+
char.y.to_i
|
171
|
+
}.map { |y, chars|
|
172
|
+
group_chars_into_runs(chars.sort)
|
173
|
+
}.flatten.sort
|
174
|
+
end
|
175
|
+
|
176
|
+
def group_chars_into_runs(chars)
|
177
|
+
chars.each_with_object([]) do |char, runs|
|
178
|
+
if runs.empty?
|
179
|
+
runs << char
|
180
|
+
elsif runs.last.mergable?(char)
|
181
|
+
runs[-1] = runs.last + char
|
182
|
+
else
|
183
|
+
runs << char
|
184
|
+
end
|
185
|
+
end
|
186
|
+
end
|
187
|
+
|
118
188
|
end
|
119
189
|
end
|
120
190
|
end
|