pdf-reader 2.4.1 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +40 -0
- data/README.md +16 -1
- data/Rakefile +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/buffer.rb +63 -21
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +5 -3
- data/lib/pdf/reader/encoding.rb +3 -2
- data/lib/pdf/reader/error.rb +11 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +10 -8
- data/lib/pdf/reader/filter/flate.rb +27 -14
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/font.rb +1 -0
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +16 -9
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +8 -3
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +60 -9
- data/lib/pdf/reader/page_layout.rb +37 -23
- data/lib/pdf/reader/page_state.rb +18 -23
- data/lib/pdf/reader/page_text_receiver.rb +28 -5
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +12 -7
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rectangle.rb +95 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +5 -0
- data/lib/pdf/reader/standard_security_handler.rb +1 -0
- data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +1 -0
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +8 -15
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +7 -1
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +14 -4
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1744 -0
- metadata +17 -13
- data/lib/pdf/reader/orientation_detector.rb +0 -34
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -103,19 +104,25 @@ class PDF::Reader
|
|
103
104
|
|
104
105
|
# returns a hash that maps glyph names to unicode codepoints. The mapping is based on
|
105
106
|
# a text file supplied by Adobe at:
|
106
|
-
#
|
107
|
+
# https://github.com/adobe-type-tools/agl-aglfn
|
107
108
|
def load_adobe_glyph_mapping
|
108
109
|
keyed_by_name = {}
|
109
110
|
keyed_by_codepoint = {}
|
110
111
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
112
|
+
paths = [
|
113
|
+
File.dirname(__FILE__) + "/glyphlist.txt",
|
114
|
+
File.dirname(__FILE__) + "/glyphlist-zapfdingbats.txt",
|
115
|
+
]
|
116
|
+
paths.each do |path|
|
117
|
+
File.open(path, "r:BINARY") do |f|
|
118
|
+
f.each do |l|
|
119
|
+
_m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
|
120
|
+
if name && code
|
121
|
+
cp = "0x#{code}".hex
|
122
|
+
keyed_by_name[name.to_sym] = cp
|
123
|
+
keyed_by_codepoint[cp] ||= []
|
124
|
+
keyed_by_codepoint[cp] << name.to_sym
|
125
|
+
end
|
119
126
|
end
|
120
127
|
end
|
121
128
|
end
|
@@ -0,0 +1,245 @@
|
|
1
|
+
# -----------------------------------------------------------
|
2
|
+
# Copyright 2002-2019 Adobe (http://www.adobe.com/).
|
3
|
+
#
|
4
|
+
# Redistribution and use in source and binary forms, with or
|
5
|
+
# without modification, are permitted provided that the
|
6
|
+
# following conditions are met:
|
7
|
+
#
|
8
|
+
# Redistributions of source code must retain the above
|
9
|
+
# copyright notice, this list of conditions and the following
|
10
|
+
# disclaimer.
|
11
|
+
#
|
12
|
+
# Redistributions in binary form must reproduce the above
|
13
|
+
# copyright notice, this list of conditions and the following
|
14
|
+
# disclaimer in the documentation and/or other materials
|
15
|
+
# provided with the distribution.
|
16
|
+
#
|
17
|
+
# Neither the name of Adobe nor the names of its contributors
|
18
|
+
# may be used to endorse or promote products derived from this
|
19
|
+
# software without specific prior written permission.
|
20
|
+
#
|
21
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
|
22
|
+
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
23
|
+
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
24
|
+
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
25
|
+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
26
|
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
27
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
28
|
+
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
29
|
+
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
30
|
+
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
31
|
+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
32
|
+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
33
|
+
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
34
|
+
# -----------------------------------------------------------
|
35
|
+
# Name: ITC Zapf Dingbats Glyph List
|
36
|
+
# Table version: 2.0
|
37
|
+
# Date: September 20, 2002
|
38
|
+
# URL: https://github.com/adobe-type-tools/agl-aglfn
|
39
|
+
#
|
40
|
+
# Format: two semicolon-delimited fields:
|
41
|
+
# (1) glyph name--upper/lowercase letters and digits
|
42
|
+
# (2) Unicode scalar value--four uppercase hexadecimal digits
|
43
|
+
#
|
44
|
+
a100;275E
|
45
|
+
a101;2761
|
46
|
+
a102;2762
|
47
|
+
a103;2763
|
48
|
+
a104;2764
|
49
|
+
a105;2710
|
50
|
+
a106;2765
|
51
|
+
a107;2766
|
52
|
+
a108;2767
|
53
|
+
a109;2660
|
54
|
+
a10;2721
|
55
|
+
a110;2665
|
56
|
+
a111;2666
|
57
|
+
a112;2663
|
58
|
+
a117;2709
|
59
|
+
a118;2708
|
60
|
+
a119;2707
|
61
|
+
a11;261B
|
62
|
+
a120;2460
|
63
|
+
a121;2461
|
64
|
+
a122;2462
|
65
|
+
a123;2463
|
66
|
+
a124;2464
|
67
|
+
a125;2465
|
68
|
+
a126;2466
|
69
|
+
a127;2467
|
70
|
+
a128;2468
|
71
|
+
a129;2469
|
72
|
+
a12;261E
|
73
|
+
a130;2776
|
74
|
+
a131;2777
|
75
|
+
a132;2778
|
76
|
+
a133;2779
|
77
|
+
a134;277A
|
78
|
+
a135;277B
|
79
|
+
a136;277C
|
80
|
+
a137;277D
|
81
|
+
a138;277E
|
82
|
+
a139;277F
|
83
|
+
a13;270C
|
84
|
+
a140;2780
|
85
|
+
a141;2781
|
86
|
+
a142;2782
|
87
|
+
a143;2783
|
88
|
+
a144;2784
|
89
|
+
a145;2785
|
90
|
+
a146;2786
|
91
|
+
a147;2787
|
92
|
+
a148;2788
|
93
|
+
a149;2789
|
94
|
+
a14;270D
|
95
|
+
a150;278A
|
96
|
+
a151;278B
|
97
|
+
a152;278C
|
98
|
+
a153;278D
|
99
|
+
a154;278E
|
100
|
+
a155;278F
|
101
|
+
a156;2790
|
102
|
+
a157;2791
|
103
|
+
a158;2792
|
104
|
+
a159;2793
|
105
|
+
a15;270E
|
106
|
+
a160;2794
|
107
|
+
a161;2192
|
108
|
+
a162;27A3
|
109
|
+
a163;2194
|
110
|
+
a164;2195
|
111
|
+
a165;2799
|
112
|
+
a166;279B
|
113
|
+
a167;279C
|
114
|
+
a168;279D
|
115
|
+
a169;279E
|
116
|
+
a16;270F
|
117
|
+
a170;279F
|
118
|
+
a171;27A0
|
119
|
+
a172;27A1
|
120
|
+
a173;27A2
|
121
|
+
a174;27A4
|
122
|
+
a175;27A5
|
123
|
+
a176;27A6
|
124
|
+
a177;27A7
|
125
|
+
a178;27A8
|
126
|
+
a179;27A9
|
127
|
+
a17;2711
|
128
|
+
a180;27AB
|
129
|
+
a181;27AD
|
130
|
+
a182;27AF
|
131
|
+
a183;27B2
|
132
|
+
a184;27B3
|
133
|
+
a185;27B5
|
134
|
+
a186;27B8
|
135
|
+
a187;27BA
|
136
|
+
a188;27BB
|
137
|
+
a189;27BC
|
138
|
+
a18;2712
|
139
|
+
a190;27BD
|
140
|
+
a191;27BE
|
141
|
+
a192;279A
|
142
|
+
a193;27AA
|
143
|
+
a194;27B6
|
144
|
+
a195;27B9
|
145
|
+
a196;2798
|
146
|
+
a197;27B4
|
147
|
+
a198;27B7
|
148
|
+
a199;27AC
|
149
|
+
a19;2713
|
150
|
+
a1;2701
|
151
|
+
a200;27AE
|
152
|
+
a201;27B1
|
153
|
+
a202;2703
|
154
|
+
a203;2750
|
155
|
+
a204;2752
|
156
|
+
a205;276E
|
157
|
+
a206;2770
|
158
|
+
a20;2714
|
159
|
+
a21;2715
|
160
|
+
a22;2716
|
161
|
+
a23;2717
|
162
|
+
a24;2718
|
163
|
+
a25;2719
|
164
|
+
a26;271A
|
165
|
+
a27;271B
|
166
|
+
a28;271C
|
167
|
+
a29;2722
|
168
|
+
a2;2702
|
169
|
+
a30;2723
|
170
|
+
a31;2724
|
171
|
+
a32;2725
|
172
|
+
a33;2726
|
173
|
+
a34;2727
|
174
|
+
a35;2605
|
175
|
+
a36;2729
|
176
|
+
a37;272A
|
177
|
+
a38;272B
|
178
|
+
a39;272C
|
179
|
+
a3;2704
|
180
|
+
a40;272D
|
181
|
+
a41;272E
|
182
|
+
a42;272F
|
183
|
+
a43;2730
|
184
|
+
a44;2731
|
185
|
+
a45;2732
|
186
|
+
a46;2733
|
187
|
+
a47;2734
|
188
|
+
a48;2735
|
189
|
+
a49;2736
|
190
|
+
a4;260E
|
191
|
+
a50;2737
|
192
|
+
a51;2738
|
193
|
+
a52;2739
|
194
|
+
a53;273A
|
195
|
+
a54;273B
|
196
|
+
a55;273C
|
197
|
+
a56;273D
|
198
|
+
a57;273E
|
199
|
+
a58;273F
|
200
|
+
a59;2740
|
201
|
+
a5;2706
|
202
|
+
a60;2741
|
203
|
+
a61;2742
|
204
|
+
a62;2743
|
205
|
+
a63;2744
|
206
|
+
a64;2745
|
207
|
+
a65;2746
|
208
|
+
a66;2747
|
209
|
+
a67;2748
|
210
|
+
a68;2749
|
211
|
+
a69;274A
|
212
|
+
a6;271D
|
213
|
+
a70;274B
|
214
|
+
a71;25CF
|
215
|
+
a72;274D
|
216
|
+
a73;25A0
|
217
|
+
a74;274F
|
218
|
+
a75;2751
|
219
|
+
a76;25B2
|
220
|
+
a77;25BC
|
221
|
+
a78;25C6
|
222
|
+
a79;2756
|
223
|
+
a7;271E
|
224
|
+
a81;25D7
|
225
|
+
a82;2758
|
226
|
+
a83;2759
|
227
|
+
a84;275A
|
228
|
+
a85;276F
|
229
|
+
a86;2771
|
230
|
+
a87;2772
|
231
|
+
a88;2773
|
232
|
+
a89;2768
|
233
|
+
a8;271F
|
234
|
+
a90;2769
|
235
|
+
a91;276C
|
236
|
+
a92;276D
|
237
|
+
a93;276A
|
238
|
+
a94;276B
|
239
|
+
a95;2774
|
240
|
+
a96;2775
|
241
|
+
a97;275B
|
242
|
+
a98;275C
|
243
|
+
a99;275D
|
244
|
+
a9;2720
|
245
|
+
# END
|
data/lib/pdf/reader/lzw.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
module PDF
|
@@ -35,9 +36,9 @@ module PDF
|
|
35
36
|
|
36
37
|
def read
|
37
38
|
bits_left_in_chunk = @bits_in_chunk
|
38
|
-
chunk =
|
39
|
+
chunk = -1
|
39
40
|
while bits_left_in_chunk > 0 and @current_pos < @data.size
|
40
|
-
chunk = 0 if chunk
|
41
|
+
chunk = 0 if chunk < 0
|
41
42
|
codepoint = @data[@current_pos, 1].unpack("C*")[0]
|
42
43
|
current_byte = codepoint & (2**@bits_left_in_byte - 1) #clear consumed bits
|
43
44
|
dif = bits_left_in_chunk - @bits_left_in_byte
|
@@ -83,6 +84,7 @@ module PDF
|
|
83
84
|
#
|
84
85
|
def self.decode(data)
|
85
86
|
stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
|
87
|
+
string_table = StringTable.new
|
86
88
|
result = "".dup
|
87
89
|
until (code = stream.read) == CODE_EOD
|
88
90
|
if code == CODE_CLEAR_TABLE
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
class PDF::Reader
|
@@ -331,11 +332,15 @@ class PDF::Reader
|
|
331
332
|
def decrypt(ref, obj)
|
332
333
|
case obj
|
333
334
|
when PDF::Reader::Stream then
|
334
|
-
|
335
|
+
# PDF 32000-1:2008 7.5.8.2: "The cross-reference stream shall not be encrypted [...]."
|
336
|
+
# Therefore we shouldn't try to decrypt it.
|
337
|
+
obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
|
335
338
|
obj
|
336
339
|
when Hash then
|
337
|
-
arr = obj.map { |key,val| [key, decrypt(ref, val)] }
|
338
|
-
|
340
|
+
arr = obj.map { |key,val| [key, decrypt(ref, val)] }
|
341
|
+
arr.each_with_object({}) { |(k,v), accum|
|
342
|
+
accum[k] = v
|
343
|
+
}
|
339
344
|
when Array then
|
340
345
|
obj.collect { |item| decrypt(ref, item) }
|
341
346
|
when String
|
@@ -1,4 +1,6 @@
|
|
1
|
+
# typed: true
|
1
2
|
# coding: utf-8
|
3
|
+
# frozen_string_literal: true
|
2
4
|
|
3
5
|
class PDF::Reader
|
4
6
|
# remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
|
@@ -38,7 +40,8 @@ class PDF::Reader
|
|
38
40
|
|
39
41
|
def self.detect_intersection(sweep_line_status, event_point)
|
40
42
|
sweep_line_status.each do |open_text_run|
|
41
|
-
if
|
43
|
+
if open_text_run.text == event_point.run.text &&
|
44
|
+
event_point.x >= open_text_run.x &&
|
42
45
|
event_point.x <= open_text_run.endx &&
|
43
46
|
open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
|
44
47
|
return true
|
@@ -51,10 +54,14 @@ class PDF::Reader
|
|
51
54
|
# Utility class used to avoid modifying the underlying TextRun objects while we're
|
52
55
|
# looking for duplicates
|
53
56
|
class EventPoint
|
54
|
-
attr_reader :x, :run
|
55
57
|
|
56
|
-
|
57
|
-
|
58
|
+
attr_reader :x
|
59
|
+
|
60
|
+
attr_reader :run
|
61
|
+
|
62
|
+
def initialize(x, run)
|
63
|
+
@x = x
|
64
|
+
@run = run
|
58
65
|
end
|
59
66
|
|
60
67
|
def start?
|
data/lib/pdf/reader/page.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
module PDF
|
@@ -68,10 +69,33 @@ module PDF
|
|
68
69
|
@attributes
|
69
70
|
end
|
70
71
|
|
72
|
+
def height
|
73
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
74
|
+
rect.apply_rotation(rotate) if rotate > 0
|
75
|
+
rect.height
|
76
|
+
end
|
77
|
+
|
78
|
+
def width
|
79
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
80
|
+
rect.apply_rotation(rotate) if rotate > 0
|
81
|
+
rect.width
|
82
|
+
end
|
83
|
+
|
84
|
+
def origin
|
85
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
86
|
+
rect.apply_rotation(rotate) if rotate > 0
|
87
|
+
|
88
|
+
rect.bottom_left
|
89
|
+
end
|
90
|
+
|
71
91
|
# Convenience method to identify the page's orientation.
|
72
92
|
#
|
73
93
|
def orientation
|
74
|
-
|
94
|
+
if height > width
|
95
|
+
"portrait"
|
96
|
+
else
|
97
|
+
"landscape"
|
98
|
+
end
|
75
99
|
end
|
76
100
|
|
77
101
|
# returns the plain text content of this page encoded as UTF-8. Any
|
@@ -139,23 +163,50 @@ module PDF
|
|
139
163
|
# returns the "boxes" that define the page object.
|
140
164
|
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
141
165
|
#
|
166
|
+
# DEPRECATED. Recommend using Page#rectangles instead
|
167
|
+
#
|
142
168
|
def boxes
|
143
|
-
|
144
|
-
|
169
|
+
# In ruby 2.4+ we could use Hash#transform_values
|
170
|
+
Hash[rectangles.map{ |k,rect| [k,rect.to_a] } ]
|
171
|
+
end
|
172
|
+
|
173
|
+
# returns the "boxes" that define the page object.
|
174
|
+
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
175
|
+
#
|
176
|
+
def rectangles
|
177
|
+
mediabox = objects.deref!(attributes[:MediaBox])
|
178
|
+
cropbox = objects.deref!(attributes[:Cropbox]) || mediabox
|
179
|
+
bleedbox = objects.deref!(attributes[:BleedBox]) || cropbox
|
180
|
+
trimbox = objects.deref!(attributes[:TrimBox]) || cropbox
|
181
|
+
artbox = objects.deref!(attributes[:ArtBox]) || cropbox
|
182
|
+
|
183
|
+
mediarect = Rectangle.new(*mediabox)
|
184
|
+
croprect = Rectangle.new(*cropbox)
|
185
|
+
bleedrect = Rectangle.new(*bleedbox)
|
186
|
+
trimrect = Rectangle.new(*trimbox)
|
187
|
+
artrect = Rectangle.new(*artbox)
|
188
|
+
|
189
|
+
if rotate > 0
|
190
|
+
mediarect.apply_rotation(rotate)
|
191
|
+
croprect.apply_rotation(rotate)
|
192
|
+
bleedrect.apply_rotation(rotate)
|
193
|
+
trimrect.apply_rotation(rotate)
|
194
|
+
artrect.apply_rotation(rotate)
|
195
|
+
end
|
145
196
|
|
146
197
|
{
|
147
|
-
MediaBox:
|
148
|
-
CropBox:
|
149
|
-
BleedBox:
|
150
|
-
TrimBox:
|
151
|
-
ArtBox:
|
198
|
+
MediaBox: mediarect,
|
199
|
+
CropBox: croprect,
|
200
|
+
BleedBox: bleedrect,
|
201
|
+
TrimBox: trimrect,
|
202
|
+
ArtBox: artrect,
|
152
203
|
}
|
153
204
|
end
|
154
205
|
|
155
206
|
private
|
156
207
|
|
157
208
|
def root
|
158
|
-
|
209
|
+
@root ||= objects.deref(@objects.trailer[:Root])
|
159
210
|
end
|
160
211
|
|
161
212
|
# Returns the resources that accompany this page. Includes
|
@@ -1,7 +1,9 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'pdf/reader/overlapping_runs_filter'
|
6
|
+
require 'pdf/reader/zero_width_runs_filter'
|
5
7
|
|
6
8
|
class PDF::Reader
|
7
9
|
|
@@ -15,14 +17,17 @@ class PDF::Reader
|
|
15
17
|
DEFAULT_FONT_SIZE = 12
|
16
18
|
|
17
19
|
def initialize(runs, mediabox)
|
18
|
-
|
19
|
-
|
20
|
-
|
20
|
+
# mediabox is a 4-element array for now, but it'd be nice to switch to a
|
21
|
+
# PDF::Reader::Rectangle at some point
|
22
|
+
PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
|
23
|
+
|
24
|
+
runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
|
25
|
+
runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
|
26
|
+
@mediabox = mediabox
|
27
|
+
@runs = merge_runs(runs)
|
21
28
|
@mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
|
22
29
|
@mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
|
23
|
-
@
|
24
|
-
@page_width = (mediabox[2] - mediabox[0]).abs
|
25
|
-
@page_height = (mediabox[3] - mediabox[1]).abs
|
30
|
+
@median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
|
26
31
|
@x_offset = @runs.map(&:x).sort.first || 0
|
27
32
|
lowest_y = @runs.map(&:y).sort.first || 0
|
28
33
|
@y_offset = lowest_y > 0 ? 0 : lowest_y
|
@@ -30,6 +35,7 @@ class PDF::Reader
|
|
30
35
|
|
31
36
|
def to_s
|
32
37
|
return "" if @runs.empty?
|
38
|
+
return "" if row_count == 0
|
33
39
|
|
34
40
|
page = row_count.times.map { |i| " " * col_count }
|
35
41
|
@runs.each do |run|
|
@@ -44,6 +50,16 @@ class PDF::Reader
|
|
44
50
|
|
45
51
|
private
|
46
52
|
|
53
|
+
def page_width
|
54
|
+
# TODO once @mediabox is a Rectangle, this can be just `@mediabox.width`
|
55
|
+
(@mediabox[2].to_f - @mediabox[0].to_f).abs
|
56
|
+
end
|
57
|
+
|
58
|
+
def page_height
|
59
|
+
# TODO once @mediabox is a Rectangle, this can be just `@mediabox.height`
|
60
|
+
(@mediabox[3].to_f - @mediabox[1].to_f).abs
|
61
|
+
end
|
62
|
+
|
47
63
|
# given an array of strings, return a new array with empty rows from the
|
48
64
|
# beginning and end removed.
|
49
65
|
#
|
@@ -62,19 +78,19 @@ class PDF::Reader
|
|
62
78
|
end
|
63
79
|
|
64
80
|
def row_count
|
65
|
-
@row_count ||= (
|
81
|
+
@row_count ||= (page_height / @mean_font_size).floor
|
66
82
|
end
|
67
83
|
|
68
84
|
def col_count
|
69
|
-
@col_count ||= ((
|
85
|
+
@col_count ||= ((page_width / @median_glyph_width) * 1.05).floor
|
70
86
|
end
|
71
87
|
|
72
88
|
def row_multiplier
|
73
|
-
@row_multiplier ||=
|
89
|
+
@row_multiplier ||= page_height.to_f / row_count.to_f
|
74
90
|
end
|
75
91
|
|
76
92
|
def col_multiplier
|
77
|
-
@col_multiplier ||=
|
93
|
+
@col_multiplier ||= page_width.to_f / col_count.to_f
|
78
94
|
end
|
79
95
|
|
80
96
|
def mean(collection)
|
@@ -85,12 +101,12 @@ class PDF::Reader
|
|
85
101
|
end
|
86
102
|
end
|
87
103
|
|
88
|
-
def
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
104
|
+
def median(collection)
|
105
|
+
if collection.size == 0
|
106
|
+
0
|
107
|
+
else
|
108
|
+
collection.sort[(collection.size * 0.5).floor]
|
109
|
+
end
|
94
110
|
end
|
95
111
|
|
96
112
|
# take a collection of TextRun objects and merge any that are in close
|
@@ -104,17 +120,15 @@ class PDF::Reader
|
|
104
120
|
end
|
105
121
|
|
106
122
|
def group_chars_into_runs(chars)
|
107
|
-
|
108
|
-
while head = chars.shift
|
123
|
+
chars.each_with_object([]) do |char, runs|
|
109
124
|
if runs.empty?
|
110
|
-
runs <<
|
111
|
-
elsif runs.last.mergable?(
|
112
|
-
runs[-1] = runs.last +
|
125
|
+
runs << char
|
126
|
+
elsif runs.last.mergable?(char)
|
127
|
+
runs[-1] = runs.last + char
|
113
128
|
else
|
114
|
-
runs <<
|
129
|
+
runs << char
|
115
130
|
end
|
116
131
|
end
|
117
|
-
runs
|
118
132
|
end
|
119
133
|
|
120
134
|
def local_string_insert(haystack, needle, index)
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'pdf/reader/transformation_matrix'
|
@@ -30,15 +31,7 @@ class PDF::Reader
|
|
30
31
|
@xobject_stack = [page.xobjects]
|
31
32
|
@cs_stack = [page.color_spaces]
|
32
33
|
@stack = [DEFAULT_GRAPHICS_STATE.dup]
|
33
|
-
|
34
|
-
state[:ctm] = identity_matrix
|
35
|
-
else
|
36
|
-
rotate_cos = Math.cos(page.rotate * (Math::PI/180.0)).round(2)
|
37
|
-
rotate_sin = Math.sin(page.rotate * (Math::PI/180.0)).round(2)
|
38
|
-
state[:ctm] = TransformationMatrix.new(rotate_cos, rotate_sin,
|
39
|
-
rotate_sin * -1, rotate_cos,
|
40
|
-
0, 0)
|
41
|
-
end
|
34
|
+
state[:ctm] = identity_matrix
|
42
35
|
end
|
43
36
|
|
44
37
|
#####################################################
|
@@ -320,7 +313,7 @@ class PDF::Reader
|
|
320
313
|
# may need to be added
|
321
314
|
#
|
322
315
|
def process_glyph_displacement(w0, tj, word_boundary)
|
323
|
-
fs =
|
316
|
+
fs = state[:text_font_size]
|
324
317
|
tc = state[:char_spacing]
|
325
318
|
if word_boundary
|
326
319
|
tw = state[:word_spacing]
|
@@ -330,22 +323,24 @@ class PDF::Reader
|
|
330
323
|
th = state[:h_scaling]
|
331
324
|
# optimise the common path to reduce Float allocations
|
332
325
|
if th == 1 && tj == 0 && tc == 0 && tw == 0
|
333
|
-
|
334
|
-
|
326
|
+
tx = w0 * fs
|
327
|
+
elsif tj != 0
|
328
|
+
# don't apply spacing to TJ displacement
|
329
|
+
tx = (w0 - (tj/1000.0)) * fs * th
|
335
330
|
else
|
336
|
-
|
337
|
-
tx =
|
331
|
+
# apply horizontal scaling to spacing values but not font size
|
332
|
+
tx = ((w0 * fs) + tc + tw) * th
|
338
333
|
end
|
339
|
-
|
340
|
-
# TODO: I'm pretty sure that tx shouldn't need to be divided by
|
341
|
-
# ctm[0] here, but this gets my tests green and I'm out of
|
342
|
-
# ideas for now
|
343
334
|
# TODO: support ty > 0
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
335
|
+
ty = 0
|
336
|
+
temp = TransformationMatrix.new(1, 0,
|
337
|
+
0, 1,
|
338
|
+
tx, ty)
|
339
|
+
@text_matrix = temp.multiply!(
|
340
|
+
@text_matrix.a, @text_matrix.b,
|
341
|
+
@text_matrix.c, @text_matrix.d,
|
342
|
+
@text_matrix.e, @text_matrix.f
|
343
|
+
)
|
349
344
|
@font_size = @text_rendering_matrix = nil # invalidate cached value
|
350
345
|
end
|
351
346
|
|