pdf-reader 2.2.0 → 2.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +90 -0
- data/README.md +18 -3
- data/Rakefile +1 -1
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_text +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
- data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
- data/lib/pdf/reader/buffer.rb +91 -47
- data/lib/pdf/reader/cid_widths.rb +7 -4
- data/lib/pdf/reader/cmap.rb +83 -59
- data/lib/pdf/reader/encoding.rb +17 -14
- data/lib/pdf/reader/error.rb +15 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +12 -10
- data/lib/pdf/reader/filter/flate.rb +30 -16
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -1
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +11 -11
- data/lib/pdf/reader/font.rb +89 -26
- data/lib/pdf/reader/font_descriptor.rb +22 -18
- data/lib/pdf/reader/form_xobject.rb +18 -5
- data/lib/pdf/reader/glyph_hash.rb +28 -13
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/key_builder_v5.rb +138 -0
- data/lib/pdf/reader/lzw.rb +28 -11
- data/lib/pdf/reader/no_text_filter.rb +14 -0
- data/lib/pdf/reader/null_security_handler.rb +1 -4
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +292 -63
- data/lib/pdf/reader/object_stream.rb +3 -2
- data/lib/pdf/reader/overlapping_runs_filter.rb +72 -0
- data/lib/pdf/reader/page.rb +143 -16
- data/lib/pdf/reader/page_layout.rb +43 -39
- data/lib/pdf/reader/page_state.rb +26 -17
- data/lib/pdf/reader/page_text_receiver.rb +74 -4
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +34 -14
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rc4_security_handler.rb +38 -0
- data/lib/pdf/reader/rectangle.rb +113 -0
- data/lib/pdf/reader/reference.rb +3 -1
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/{resource_methods.rb → resources.rb} +17 -9
- data/lib/pdf/reader/security_handler_factory.rb +79 -0
- data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
- data/lib/pdf/reader/stream.rb +3 -2
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +40 -5
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +8 -7
- data/lib/pdf/reader/type_check.rb +98 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/validating_receiver.rb +262 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +27 -17
- data/lib/pdf/reader/width_calculator/composite.rb +6 -1
- data/lib/pdf/reader/width_calculator/true_type.rb +10 -11
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +6 -4
- data/lib/pdf/reader/width_calculator/type_zero.rb +6 -2
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +37 -11
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +49 -24
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +2048 -0
- metadata +39 -23
- data/lib/pdf/hash.rb +0 -20
- data/lib/pdf/reader/orientation_detector.rb +0 -34
- data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
data/lib/pdf/reader/page.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
module PDF
|
@@ -13,7 +14,7 @@ module PDF
|
|
13
14
|
# objects accessor to help walk the page dictionary in any useful way.
|
14
15
|
#
|
15
16
|
class Page
|
16
|
-
|
17
|
+
extend Forwardable
|
17
18
|
|
18
19
|
# lowlevel hash-like access to all objects in the underlying PDF
|
19
20
|
attr_reader :objects
|
@@ -26,6 +27,15 @@ module PDF
|
|
26
27
|
# operations
|
27
28
|
attr_reader :cache
|
28
29
|
|
30
|
+
def_delegators :resources, :color_spaces
|
31
|
+
def_delegators :resources, :fonts
|
32
|
+
def_delegators :resources, :graphic_states
|
33
|
+
def_delegators :resources, :patterns
|
34
|
+
def_delegators :resources, :procedure_sets
|
35
|
+
def_delegators :resources, :properties
|
36
|
+
def_delegators :resources, :shadings
|
37
|
+
def_delegators :resources, :xobjects
|
38
|
+
|
29
39
|
# creates a new page wrapper.
|
30
40
|
#
|
31
41
|
# * objects - an ObjectHash instance that wraps a PDF file
|
@@ -33,10 +43,10 @@ module PDF
|
|
33
43
|
#
|
34
44
|
def initialize(objects, pagenum, options = {})
|
35
45
|
@objects, @pagenum = objects, pagenum
|
36
|
-
@page_object = objects.
|
46
|
+
@page_object = objects.deref_hash(objects.page_references[pagenum - 1]) || {}
|
37
47
|
@cache = options[:cache] || {}
|
38
48
|
|
39
|
-
|
49
|
+
if @page_object.empty?
|
40
50
|
raise InvalidPageError, "Invalid page: #{pagenum}"
|
41
51
|
end
|
42
52
|
end
|
@@ -59,7 +69,7 @@ module PDF
|
|
59
69
|
def attributes
|
60
70
|
@attributes ||= {}.tap { |hash|
|
61
71
|
page_with_ancestors.reverse.each do |obj|
|
62
|
-
hash.merge!(@objects.
|
72
|
+
hash.merge!(@objects.deref_hash(obj) || {})
|
63
73
|
end
|
64
74
|
}
|
65
75
|
# This shouldn't be necesary, but some non compliant PDFs leave MediaBox
|
@@ -68,22 +78,56 @@ module PDF
|
|
68
78
|
@attributes
|
69
79
|
end
|
70
80
|
|
81
|
+
def height
|
82
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
83
|
+
rect.apply_rotation(rotate) if rotate > 0
|
84
|
+
rect.height
|
85
|
+
end
|
86
|
+
|
87
|
+
def width
|
88
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
89
|
+
rect.apply_rotation(rotate) if rotate > 0
|
90
|
+
rect.width
|
91
|
+
end
|
92
|
+
|
93
|
+
def origin
|
94
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
95
|
+
rect.apply_rotation(rotate) if rotate > 0
|
96
|
+
|
97
|
+
rect.bottom_left
|
98
|
+
end
|
99
|
+
|
71
100
|
# Convenience method to identify the page's orientation.
|
72
101
|
#
|
73
102
|
def orientation
|
74
|
-
|
103
|
+
if height > width
|
104
|
+
"portrait"
|
105
|
+
else
|
106
|
+
"landscape"
|
107
|
+
end
|
75
108
|
end
|
76
109
|
|
77
110
|
# returns the plain text content of this page encoded as UTF-8. Any
|
78
111
|
# characters that can't be translated will be returned as a ▯
|
79
112
|
#
|
80
|
-
def text
|
113
|
+
def text(opts = {})
|
81
114
|
receiver = PageTextReceiver.new
|
82
115
|
walk(receiver)
|
83
|
-
receiver.
|
116
|
+
runs = receiver.runs(opts)
|
117
|
+
|
118
|
+
# rectangles[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
|
119
|
+
mediabox = rectangles[:MediaBox] || Rectangle.new(0, 0, 0, 0)
|
120
|
+
|
121
|
+
PageLayout.new(runs, mediabox).to_s
|
84
122
|
end
|
85
123
|
alias :to_s :text
|
86
124
|
|
125
|
+
def runs(opts = {})
|
126
|
+
receiver = PageTextReceiver.new
|
127
|
+
walk(receiver)
|
128
|
+
receiver.runs(opts)
|
129
|
+
end
|
130
|
+
|
87
131
|
# processes the raw content stream for this page in sequential order and
|
88
132
|
# passes callbacks to the receiver objects.
|
89
133
|
#
|
@@ -108,6 +152,9 @@ module PDF
|
|
108
152
|
# the program in the correct order and calls out to your implementation.
|
109
153
|
#
|
110
154
|
def walk(*receivers)
|
155
|
+
receivers = receivers.map { |receiver|
|
156
|
+
ValidatingReceiver.new(receiver)
|
157
|
+
}
|
111
158
|
callback(receivers, :page=, [self])
|
112
159
|
content_stream(receivers, raw_content)
|
113
160
|
end
|
@@ -116,25 +163,85 @@ module PDF
|
|
116
163
|
# see here unless you're a PDF nerd like me.
|
117
164
|
#
|
118
165
|
def raw_content
|
119
|
-
contents = objects.
|
166
|
+
contents = objects.deref_stream_or_array(@page_object[:Contents])
|
120
167
|
[contents].flatten.compact.map { |obj|
|
121
|
-
objects.
|
122
|
-
}.map { |obj|
|
168
|
+
objects.deref_stream(obj)
|
169
|
+
}.compact.map { |obj|
|
123
170
|
obj.unfiltered_data
|
124
171
|
}.join(" ")
|
125
172
|
end
|
126
173
|
|
174
|
+
# returns the angle to rotate the page clockwise. Always 0, 90, 180 or 270
|
175
|
+
#
|
176
|
+
def rotate
|
177
|
+
value = attributes[:Rotate].to_i
|
178
|
+
case value
|
179
|
+
when 0, 90, 180, 270
|
180
|
+
value
|
181
|
+
else
|
182
|
+
0
|
183
|
+
end
|
184
|
+
end
|
185
|
+
|
186
|
+
# returns the "boxes" that define the page object.
|
187
|
+
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
188
|
+
#
|
189
|
+
# DEPRECATED. Recommend using Page#rectangles instead
|
190
|
+
#
|
191
|
+
def boxes
|
192
|
+
# In ruby 2.4+ we could use Hash#transform_values
|
193
|
+
Hash[rectangles.map{ |k,rect| [k,rect.to_a] } ]
|
194
|
+
end
|
195
|
+
|
196
|
+
# returns the "boxes" that define the page object.
|
197
|
+
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
198
|
+
#
|
199
|
+
def rectangles
|
200
|
+
# attributes[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
|
201
|
+
mediabox = objects.deref_array_of_numbers(attributes[:MediaBox]) || []
|
202
|
+
cropbox = objects.deref_array_of_numbers(attributes[:CropBox]) || mediabox
|
203
|
+
bleedbox = objects.deref_array_of_numbers(attributes[:BleedBox]) || cropbox
|
204
|
+
trimbox = objects.deref_array_of_numbers(attributes[:TrimBox]) || cropbox
|
205
|
+
artbox = objects.deref_array_of_numbers(attributes[:ArtBox]) || cropbox
|
206
|
+
|
207
|
+
begin
|
208
|
+
mediarect = Rectangle.from_array(mediabox)
|
209
|
+
croprect = Rectangle.from_array(cropbox)
|
210
|
+
bleedrect = Rectangle.from_array(bleedbox)
|
211
|
+
trimrect = Rectangle.from_array(trimbox)
|
212
|
+
artrect = Rectangle.from_array(artbox)
|
213
|
+
rescue ArgumentError => e
|
214
|
+
raise MalformedPDFError, e.message
|
215
|
+
end
|
216
|
+
|
217
|
+
if rotate > 0
|
218
|
+
mediarect.apply_rotation(rotate)
|
219
|
+
croprect.apply_rotation(rotate)
|
220
|
+
bleedrect.apply_rotation(rotate)
|
221
|
+
trimrect.apply_rotation(rotate)
|
222
|
+
artrect.apply_rotation(rotate)
|
223
|
+
end
|
224
|
+
|
225
|
+
{
|
226
|
+
MediaBox: mediarect,
|
227
|
+
CropBox: croprect,
|
228
|
+
BleedBox: bleedrect,
|
229
|
+
TrimBox: trimrect,
|
230
|
+
ArtBox: artrect,
|
231
|
+
}
|
232
|
+
end
|
233
|
+
|
127
234
|
private
|
128
235
|
|
129
236
|
def root
|
130
|
-
root ||= objects.
|
237
|
+
@root ||= objects.deref_hash(@objects.trailer[:Root]) || {}
|
131
238
|
end
|
132
239
|
|
133
240
|
# Returns the resources that accompany this page. Includes
|
134
241
|
# resources inherited from parents.
|
135
242
|
#
|
136
243
|
def resources
|
137
|
-
@resources ||= @objects.
|
244
|
+
@resources ||= Resources.new(@objects, @objects.deref_hash(attributes[:Resources]) || {})
|
138
245
|
end
|
139
246
|
|
140
247
|
def content_stream(receivers, instructions)
|
@@ -143,8 +250,8 @@ module PDF
|
|
143
250
|
params = []
|
144
251
|
|
145
252
|
while (token = parser.parse_token(PagesStrategy::OPERATORS))
|
146
|
-
if token.kind_of?(Token)
|
147
|
-
callback(receivers,
|
253
|
+
if token.kind_of?(Token) && method_name = PagesStrategy::OPERATORS[token]
|
254
|
+
callback(receivers, method_name, params)
|
148
255
|
params.clear
|
149
256
|
else
|
150
257
|
params << token
|
@@ -156,9 +263,26 @@ module PDF
|
|
156
263
|
|
157
264
|
# calls the name callback method on each receiver object with params as the arguments
|
158
265
|
#
|
266
|
+
# The silly style here is because sorbet won't let me use splat arguments
|
267
|
+
#
|
159
268
|
def callback(receivers, name, params=[])
|
160
269
|
receivers.each do |receiver|
|
161
|
-
|
270
|
+
if receiver.respond_to?(name)
|
271
|
+
case params.size
|
272
|
+
when 0 then receiver.send(name)
|
273
|
+
when 1 then receiver.send(name, params[0])
|
274
|
+
when 2 then receiver.send(name, params[0], params[1])
|
275
|
+
when 3 then receiver.send(name, params[0], params[1], params[2])
|
276
|
+
when 4 then receiver.send(name, params[0], params[1], params[2], params[3])
|
277
|
+
when 5 then receiver.send(name, params[0], params[1], params[2], params[3], params[4])
|
278
|
+
when 6 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5])
|
279
|
+
when 7 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6])
|
280
|
+
when 8 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7])
|
281
|
+
when 9 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8])
|
282
|
+
else
|
283
|
+
receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8], params[9])
|
284
|
+
end
|
285
|
+
end
|
162
286
|
end
|
163
287
|
end
|
164
288
|
|
@@ -170,7 +294,10 @@ module PDF
|
|
170
294
|
if origin.nil?
|
171
295
|
[]
|
172
296
|
else
|
173
|
-
obj = objects.
|
297
|
+
obj = objects.deref_hash(origin)
|
298
|
+
if obj.nil?
|
299
|
+
raise MalformedPDFError, "parent mus not be nil"
|
300
|
+
end
|
174
301
|
[ select_inheritable(obj) ] + ancestors(obj[:Parent])
|
175
302
|
end
|
176
303
|
end
|
@@ -1,6 +1,10 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: strict
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
5
|
+
require 'pdf/reader/overlapping_runs_filter'
|
6
|
+
require 'pdf/reader/zero_width_runs_filter'
|
7
|
+
|
4
8
|
class PDF::Reader
|
5
9
|
|
6
10
|
# Takes a collection of TextRun objects and renders them into a single
|
@@ -13,24 +17,28 @@ class PDF::Reader
|
|
13
17
|
DEFAULT_FONT_SIZE = 12
|
14
18
|
|
15
19
|
def initialize(runs, mediabox)
|
16
|
-
|
20
|
+
# mediabox is a 4-element array for now, but it'd be nice to switch to a
|
21
|
+
# PDF::Reader::Rectangle at some point
|
22
|
+
PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
|
17
23
|
|
18
|
-
@
|
24
|
+
@mediabox = process_mediabox(mediabox)
|
25
|
+
@runs = runs
|
19
26
|
@mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
|
20
27
|
@mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
|
21
|
-
@
|
22
|
-
@
|
23
|
-
|
24
|
-
@
|
28
|
+
@median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
|
29
|
+
@x_offset = @runs.map(&:x).sort.first || 0
|
30
|
+
lowest_y = @runs.map(&:y).sort.first || 0
|
31
|
+
@y_offset = lowest_y > 0 ? 0 : lowest_y
|
25
32
|
end
|
26
33
|
|
27
34
|
def to_s
|
28
35
|
return "" if @runs.empty?
|
36
|
+
return "" if row_count == 0
|
29
37
|
|
30
38
|
page = row_count.times.map { |i| " " * col_count }
|
31
39
|
@runs.each do |run|
|
32
40
|
x_pos = ((run.x - @x_offset) / col_multiplier).round
|
33
|
-
y_pos = row_count - (run.y / row_multiplier).round
|
41
|
+
y_pos = row_count - ((run.y - @y_offset) / row_multiplier).round
|
34
42
|
if y_pos <= row_count && y_pos >= 0 && x_pos <= col_count && x_pos >= 0
|
35
43
|
local_string_insert(page[y_pos-1], run.text, x_pos)
|
36
44
|
end
|
@@ -40,6 +48,14 @@ class PDF::Reader
|
|
40
48
|
|
41
49
|
private
|
42
50
|
|
51
|
+
def page_width
|
52
|
+
@mediabox.width
|
53
|
+
end
|
54
|
+
|
55
|
+
def page_height
|
56
|
+
@mediabox.height
|
57
|
+
end
|
58
|
+
|
43
59
|
# given an array of strings, return a new array with empty rows from the
|
44
60
|
# beginning and end removed.
|
45
61
|
#
|
@@ -58,19 +74,19 @@ class PDF::Reader
|
|
58
74
|
end
|
59
75
|
|
60
76
|
def row_count
|
61
|
-
@row_count ||= (
|
77
|
+
@row_count ||= (page_height / @mean_font_size).floor
|
62
78
|
end
|
63
79
|
|
64
80
|
def col_count
|
65
|
-
@col_count ||= ((
|
81
|
+
@col_count ||= ((page_width / @median_glyph_width) * 1.05).floor
|
66
82
|
end
|
67
83
|
|
68
84
|
def row_multiplier
|
69
|
-
@row_multiplier ||=
|
85
|
+
@row_multiplier ||= page_height.to_f / row_count.to_f
|
70
86
|
end
|
71
87
|
|
72
88
|
def col_multiplier
|
73
|
-
@col_multiplier ||=
|
89
|
+
@col_multiplier ||= page_width.to_f / col_count.to_f
|
74
90
|
end
|
75
91
|
|
76
92
|
def mean(collection)
|
@@ -81,40 +97,28 @@ class PDF::Reader
|
|
81
97
|
end
|
82
98
|
end
|
83
99
|
|
84
|
-
def
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
100
|
+
def median(collection)
|
101
|
+
if collection.size == 0
|
102
|
+
0
|
103
|
+
else
|
104
|
+
collection.sort[(collection.size * 0.5).floor]
|
105
|
+
end
|
90
106
|
end
|
91
107
|
|
92
|
-
|
93
|
-
|
94
|
-
def merge_runs(runs)
|
95
|
-
runs.group_by { |char|
|
96
|
-
char.y.to_i
|
97
|
-
}.map { |y, chars|
|
98
|
-
group_chars_into_runs(chars.sort)
|
99
|
-
}.flatten.sort
|
108
|
+
def local_string_insert(haystack, needle, index)
|
109
|
+
haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
|
100
110
|
end
|
101
111
|
|
102
|
-
def
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
runs << head
|
111
|
-
end
|
112
|
+
def process_mediabox(mediabox)
|
113
|
+
if mediabox.is_a?(Array)
|
114
|
+
msg = "Passing the mediabox to PageLayout as an Array is deprecated," +
|
115
|
+
" please use a Rectangle instead"
|
116
|
+
$stderr.puts msg
|
117
|
+
PDF::Reader::Rectangle.from_array(mediabox)
|
118
|
+
else
|
119
|
+
mediabox
|
112
120
|
end
|
113
|
-
runs
|
114
121
|
end
|
115
122
|
|
116
|
-
def local_string_insert(haystack, needle, index)
|
117
|
-
haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
|
118
|
-
end
|
119
123
|
end
|
120
124
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'pdf/reader/transformation_matrix'
|
@@ -15,7 +16,7 @@ class PDF::Reader
|
|
15
16
|
:h_scaling => 1.0,
|
16
17
|
:text_leading => 0,
|
17
18
|
:text_font => nil,
|
18
|
-
:text_font_size =>
|
19
|
+
:text_font_size => 0,
|
19
20
|
:text_mode => 0,
|
20
21
|
:text_rise => 0,
|
21
22
|
:text_knockout => 0
|
@@ -30,7 +31,13 @@ class PDF::Reader
|
|
30
31
|
@xobject_stack = [page.xobjects]
|
31
32
|
@cs_stack = [page.color_spaces]
|
32
33
|
@stack = [DEFAULT_GRAPHICS_STATE.dup]
|
33
|
-
state[:ctm]
|
34
|
+
state[:ctm] = identity_matrix
|
35
|
+
|
36
|
+
# These are only valid when inside a `BT` block and we re-initialize them on each
|
37
|
+
# `BT`. However, we need the instance variables set so PDFs with the text operators
|
38
|
+
# out order don't trigger NoMethodError when these are nil
|
39
|
+
@text_matrix = identity_matrix
|
40
|
+
@text_line_matrix = identity_matrix
|
34
41
|
end
|
35
42
|
|
36
43
|
#####################################################
|
@@ -312,7 +319,7 @@ class PDF::Reader
|
|
312
319
|
# may need to be added
|
313
320
|
#
|
314
321
|
def process_glyph_displacement(w0, tj, word_boundary)
|
315
|
-
fs =
|
322
|
+
fs = state[:text_font_size]
|
316
323
|
tc = state[:char_spacing]
|
317
324
|
if word_boundary
|
318
325
|
tw = state[:word_spacing]
|
@@ -322,22 +329,24 @@ class PDF::Reader
|
|
322
329
|
th = state[:h_scaling]
|
323
330
|
# optimise the common path to reduce Float allocations
|
324
331
|
if th == 1 && tj == 0 && tc == 0 && tw == 0
|
325
|
-
|
326
|
-
|
332
|
+
tx = w0 * fs
|
333
|
+
elsif tj != 0
|
334
|
+
# don't apply spacing to TJ displacement
|
335
|
+
tx = (w0 - (tj/1000.0)) * fs * th
|
327
336
|
else
|
328
|
-
|
329
|
-
tx =
|
337
|
+
# apply horizontal scaling to spacing values but not font size
|
338
|
+
tx = ((w0 * fs) + tc + tw) * th
|
330
339
|
end
|
331
|
-
|
332
|
-
# TODO: I'm pretty sure that tx shouldn't need to be divided by
|
333
|
-
# ctm[0] here, but this gets my tests green and I'm out of
|
334
|
-
# ideas for now
|
335
340
|
# TODO: support ty > 0
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
+
ty = 0
|
342
|
+
temp = TransformationMatrix.new(1, 0,
|
343
|
+
0, 1,
|
344
|
+
tx, ty)
|
345
|
+
@text_matrix = temp.multiply!(
|
346
|
+
@text_matrix.a, @text_matrix.b,
|
347
|
+
@text_matrix.c, @text_matrix.d,
|
348
|
+
@text_matrix.e, @text_matrix.f
|
349
|
+
)
|
341
350
|
@font_size = @text_rendering_matrix = nil # invalidate cached value
|
342
351
|
end
|
343
352
|
|
@@ -381,7 +390,7 @@ class PDF::Reader
|
|
381
390
|
#
|
382
391
|
def build_fonts(raw_fonts)
|
383
392
|
wrapped_fonts = raw_fonts.map { |label, font|
|
384
|
-
[label, PDF::Reader::Font.new(@objects, @objects.
|
393
|
+
[label, PDF::Reader::Font.new(@objects, @objects.deref_hash(font) || {})]
|
385
394
|
}
|
386
395
|
|
387
396
|
::Hash[wrapped_fonts]
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'forwardable'
|
@@ -41,13 +42,39 @@ module PDF
|
|
41
42
|
# starting a new page
|
42
43
|
def page=(page)
|
43
44
|
@state = PageState.new(page)
|
45
|
+
@page = page
|
44
46
|
@content = []
|
45
47
|
@characters = []
|
46
|
-
@mediabox = page.objects.deref(page.attributes[:MediaBox])
|
47
48
|
end
|
48
49
|
|
50
|
+
def runs(opts = {})
|
51
|
+
runs = @characters
|
52
|
+
|
53
|
+
if rect = opts.fetch(:rect, @page.rectangles[:CropBox])
|
54
|
+
runs = BoundingRectangleRunsFilter.runs_within_rect(runs, rect)
|
55
|
+
end
|
56
|
+
|
57
|
+
if opts.fetch(:skip_zero_width, true)
|
58
|
+
runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
|
59
|
+
end
|
60
|
+
|
61
|
+
if opts.fetch(:skip_overlapping, true)
|
62
|
+
runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
|
63
|
+
end
|
64
|
+
|
65
|
+
runs = NoTextFilter.exclude_empty_strings(runs)
|
66
|
+
|
67
|
+
if opts.fetch(:merge, true)
|
68
|
+
runs = merge_runs(runs)
|
69
|
+
end
|
70
|
+
|
71
|
+
runs
|
72
|
+
end
|
73
|
+
|
74
|
+
# deprecated
|
49
75
|
def content
|
50
|
-
|
76
|
+
mediabox = @page.rectangles[:MediaBox]
|
77
|
+
PageLayout.new(runs, mediabox).to_s
|
51
78
|
end
|
52
79
|
|
53
80
|
#####################################################
|
@@ -62,8 +89,10 @@ module PDF
|
|
62
89
|
params.each do |arg|
|
63
90
|
if arg.is_a?(String)
|
64
91
|
internal_show_text(arg)
|
65
|
-
|
92
|
+
elsif arg.is_a?(Numeric)
|
66
93
|
@state.process_glyph_displacement(0, arg, false)
|
94
|
+
else
|
95
|
+
# skip it
|
67
96
|
end
|
68
97
|
end
|
69
98
|
end
|
@@ -94,6 +123,7 @@ module PDF
|
|
94
123
|
private
|
95
124
|
|
96
125
|
def internal_show_text(string)
|
126
|
+
PDF::Reader::Error.validate_type_as_malformed(string, "string", String)
|
97
127
|
if @state.current_font.nil?
|
98
128
|
raise PDF::Reader::MalformedPDFError, "current font is invalid"
|
99
129
|
end
|
@@ -101,11 +131,13 @@ module PDF
|
|
101
131
|
glyphs.each_with_index do |glyph_code, index|
|
102
132
|
# paint the current glyph
|
103
133
|
newx, newy = @state.trm_transform(0,0)
|
134
|
+
newx, newy = apply_rotation(newx, newy)
|
135
|
+
|
104
136
|
utf8_chars = @state.current_font.to_utf8(glyph_code)
|
105
137
|
|
106
138
|
# apply to glyph displacment for the current glyph so the next
|
107
139
|
# glyph will appear in the correct position
|
108
|
-
glyph_width = @state.current_font.
|
140
|
+
glyph_width = @state.current_font.glyph_width_in_text_space(glyph_code)
|
109
141
|
th = 1
|
110
142
|
scaled_glyph_width = glyph_width * @state.font_size * th
|
111
143
|
unless utf8_chars == SPACE
|
@@ -115,6 +147,44 @@ module PDF
|
|
115
147
|
end
|
116
148
|
end
|
117
149
|
|
150
|
+
def apply_rotation(x, y)
|
151
|
+
if @page.rotate == 90
|
152
|
+
tmp = x
|
153
|
+
x = y
|
154
|
+
y = tmp * -1
|
155
|
+
elsif @page.rotate == 180
|
156
|
+
y *= -1
|
157
|
+
x *= -1
|
158
|
+
elsif @page.rotate == 270
|
159
|
+
tmp = y
|
160
|
+
y = x
|
161
|
+
x = tmp * -1
|
162
|
+
end
|
163
|
+
return x, y
|
164
|
+
end
|
165
|
+
|
166
|
+
# take a collection of TextRun objects and merge any that are in close
|
167
|
+
# proximity
|
168
|
+
def merge_runs(runs)
|
169
|
+
runs.group_by { |char|
|
170
|
+
char.y.to_i
|
171
|
+
}.map { |y, chars|
|
172
|
+
group_chars_into_runs(chars.sort)
|
173
|
+
}.flatten.sort
|
174
|
+
end
|
175
|
+
|
176
|
+
def group_chars_into_runs(chars)
|
177
|
+
chars.each_with_object([]) do |char, runs|
|
178
|
+
if runs.empty?
|
179
|
+
runs << char
|
180
|
+
elsif runs.last.mergable?(char)
|
181
|
+
runs[-1] = runs.last + char
|
182
|
+
else
|
183
|
+
runs << char
|
184
|
+
end
|
185
|
+
end
|
186
|
+
end
|
187
|
+
|
118
188
|
end
|
119
189
|
end
|
120
190
|
end
|