pdf-reader 2.4.1 → 2.7.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +40 -0
- data/README.md +16 -1
- data/Rakefile +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/buffer.rb +63 -21
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +5 -3
- data/lib/pdf/reader/encoding.rb +3 -2
- data/lib/pdf/reader/error.rb +11 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +10 -8
- data/lib/pdf/reader/filter/flate.rb +27 -14
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/font.rb +1 -0
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +16 -9
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +8 -3
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +60 -9
- data/lib/pdf/reader/page_layout.rb +37 -23
- data/lib/pdf/reader/page_state.rb +18 -23
- data/lib/pdf/reader/page_text_receiver.rb +28 -5
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +12 -7
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rectangle.rb +95 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +5 -0
- data/lib/pdf/reader/standard_security_handler.rb +1 -0
- data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +1 -0
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +8 -15
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +7 -1
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +14 -4
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1744 -0
- metadata +17 -13
- data/lib/pdf/reader/orientation_detector.rb +0 -34
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -103,19 +104,25 @@ class PDF::Reader
|
|
103
104
|
|
104
105
|
# returns a hash that maps glyph names to unicode codepoints. The mapping is based on
|
105
106
|
# a text file supplied by Adobe at:
|
106
|
-
#
|
107
|
+
# https://github.com/adobe-type-tools/agl-aglfn
|
107
108
|
def load_adobe_glyph_mapping
|
108
109
|
keyed_by_name = {}
|
109
110
|
keyed_by_codepoint = {}
|
110
111
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
112
|
+
paths = [
|
113
|
+
File.dirname(__FILE__) + "/glyphlist.txt",
|
114
|
+
File.dirname(__FILE__) + "/glyphlist-zapfdingbats.txt",
|
115
|
+
]
|
116
|
+
paths.each do |path|
|
117
|
+
File.open(path, "r:BINARY") do |f|
|
118
|
+
f.each do |l|
|
119
|
+
_m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
|
120
|
+
if name && code
|
121
|
+
cp = "0x#{code}".hex
|
122
|
+
keyed_by_name[name.to_sym] = cp
|
123
|
+
keyed_by_codepoint[cp] ||= []
|
124
|
+
keyed_by_codepoint[cp] << name.to_sym
|
125
|
+
end
|
119
126
|
end
|
120
127
|
end
|
121
128
|
end
|
@@ -0,0 +1,245 @@
|
|
1
|
+
# -----------------------------------------------------------
|
2
|
+
# Copyright 2002-2019 Adobe (http://www.adobe.com/).
|
3
|
+
#
|
4
|
+
# Redistribution and use in source and binary forms, with or
|
5
|
+
# without modification, are permitted provided that the
|
6
|
+
# following conditions are met:
|
7
|
+
#
|
8
|
+
# Redistributions of source code must retain the above
|
9
|
+
# copyright notice, this list of conditions and the following
|
10
|
+
# disclaimer.
|
11
|
+
#
|
12
|
+
# Redistributions in binary form must reproduce the above
|
13
|
+
# copyright notice, this list of conditions and the following
|
14
|
+
# disclaimer in the documentation and/or other materials
|
15
|
+
# provided with the distribution.
|
16
|
+
#
|
17
|
+
# Neither the name of Adobe nor the names of its contributors
|
18
|
+
# may be used to endorse or promote products derived from this
|
19
|
+
# software without specific prior written permission.
|
20
|
+
#
|
21
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
|
22
|
+
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
23
|
+
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
24
|
+
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
25
|
+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
26
|
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
27
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
28
|
+
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
29
|
+
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
30
|
+
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
31
|
+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
32
|
+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
33
|
+
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
34
|
+
# -----------------------------------------------------------
|
35
|
+
# Name: ITC Zapf Dingbats Glyph List
|
36
|
+
# Table version: 2.0
|
37
|
+
# Date: September 20, 2002
|
38
|
+
# URL: https://github.com/adobe-type-tools/agl-aglfn
|
39
|
+
#
|
40
|
+
# Format: two semicolon-delimited fields:
|
41
|
+
# (1) glyph name--upper/lowercase letters and digits
|
42
|
+
# (2) Unicode scalar value--four uppercase hexadecimal digits
|
43
|
+
#
|
44
|
+
a100;275E
|
45
|
+
a101;2761
|
46
|
+
a102;2762
|
47
|
+
a103;2763
|
48
|
+
a104;2764
|
49
|
+
a105;2710
|
50
|
+
a106;2765
|
51
|
+
a107;2766
|
52
|
+
a108;2767
|
53
|
+
a109;2660
|
54
|
+
a10;2721
|
55
|
+
a110;2665
|
56
|
+
a111;2666
|
57
|
+
a112;2663
|
58
|
+
a117;2709
|
59
|
+
a118;2708
|
60
|
+
a119;2707
|
61
|
+
a11;261B
|
62
|
+
a120;2460
|
63
|
+
a121;2461
|
64
|
+
a122;2462
|
65
|
+
a123;2463
|
66
|
+
a124;2464
|
67
|
+
a125;2465
|
68
|
+
a126;2466
|
69
|
+
a127;2467
|
70
|
+
a128;2468
|
71
|
+
a129;2469
|
72
|
+
a12;261E
|
73
|
+
a130;2776
|
74
|
+
a131;2777
|
75
|
+
a132;2778
|
76
|
+
a133;2779
|
77
|
+
a134;277A
|
78
|
+
a135;277B
|
79
|
+
a136;277C
|
80
|
+
a137;277D
|
81
|
+
a138;277E
|
82
|
+
a139;277F
|
83
|
+
a13;270C
|
84
|
+
a140;2780
|
85
|
+
a141;2781
|
86
|
+
a142;2782
|
87
|
+
a143;2783
|
88
|
+
a144;2784
|
89
|
+
a145;2785
|
90
|
+
a146;2786
|
91
|
+
a147;2787
|
92
|
+
a148;2788
|
93
|
+
a149;2789
|
94
|
+
a14;270D
|
95
|
+
a150;278A
|
96
|
+
a151;278B
|
97
|
+
a152;278C
|
98
|
+
a153;278D
|
99
|
+
a154;278E
|
100
|
+
a155;278F
|
101
|
+
a156;2790
|
102
|
+
a157;2791
|
103
|
+
a158;2792
|
104
|
+
a159;2793
|
105
|
+
a15;270E
|
106
|
+
a160;2794
|
107
|
+
a161;2192
|
108
|
+
a162;27A3
|
109
|
+
a163;2194
|
110
|
+
a164;2195
|
111
|
+
a165;2799
|
112
|
+
a166;279B
|
113
|
+
a167;279C
|
114
|
+
a168;279D
|
115
|
+
a169;279E
|
116
|
+
a16;270F
|
117
|
+
a170;279F
|
118
|
+
a171;27A0
|
119
|
+
a172;27A1
|
120
|
+
a173;27A2
|
121
|
+
a174;27A4
|
122
|
+
a175;27A5
|
123
|
+
a176;27A6
|
124
|
+
a177;27A7
|
125
|
+
a178;27A8
|
126
|
+
a179;27A9
|
127
|
+
a17;2711
|
128
|
+
a180;27AB
|
129
|
+
a181;27AD
|
130
|
+
a182;27AF
|
131
|
+
a183;27B2
|
132
|
+
a184;27B3
|
133
|
+
a185;27B5
|
134
|
+
a186;27B8
|
135
|
+
a187;27BA
|
136
|
+
a188;27BB
|
137
|
+
a189;27BC
|
138
|
+
a18;2712
|
139
|
+
a190;27BD
|
140
|
+
a191;27BE
|
141
|
+
a192;279A
|
142
|
+
a193;27AA
|
143
|
+
a194;27B6
|
144
|
+
a195;27B9
|
145
|
+
a196;2798
|
146
|
+
a197;27B4
|
147
|
+
a198;27B7
|
148
|
+
a199;27AC
|
149
|
+
a19;2713
|
150
|
+
a1;2701
|
151
|
+
a200;27AE
|
152
|
+
a201;27B1
|
153
|
+
a202;2703
|
154
|
+
a203;2750
|
155
|
+
a204;2752
|
156
|
+
a205;276E
|
157
|
+
a206;2770
|
158
|
+
a20;2714
|
159
|
+
a21;2715
|
160
|
+
a22;2716
|
161
|
+
a23;2717
|
162
|
+
a24;2718
|
163
|
+
a25;2719
|
164
|
+
a26;271A
|
165
|
+
a27;271B
|
166
|
+
a28;271C
|
167
|
+
a29;2722
|
168
|
+
a2;2702
|
169
|
+
a30;2723
|
170
|
+
a31;2724
|
171
|
+
a32;2725
|
172
|
+
a33;2726
|
173
|
+
a34;2727
|
174
|
+
a35;2605
|
175
|
+
a36;2729
|
176
|
+
a37;272A
|
177
|
+
a38;272B
|
178
|
+
a39;272C
|
179
|
+
a3;2704
|
180
|
+
a40;272D
|
181
|
+
a41;272E
|
182
|
+
a42;272F
|
183
|
+
a43;2730
|
184
|
+
a44;2731
|
185
|
+
a45;2732
|
186
|
+
a46;2733
|
187
|
+
a47;2734
|
188
|
+
a48;2735
|
189
|
+
a49;2736
|
190
|
+
a4;260E
|
191
|
+
a50;2737
|
192
|
+
a51;2738
|
193
|
+
a52;2739
|
194
|
+
a53;273A
|
195
|
+
a54;273B
|
196
|
+
a55;273C
|
197
|
+
a56;273D
|
198
|
+
a57;273E
|
199
|
+
a58;273F
|
200
|
+
a59;2740
|
201
|
+
a5;2706
|
202
|
+
a60;2741
|
203
|
+
a61;2742
|
204
|
+
a62;2743
|
205
|
+
a63;2744
|
206
|
+
a64;2745
|
207
|
+
a65;2746
|
208
|
+
a66;2747
|
209
|
+
a67;2748
|
210
|
+
a68;2749
|
211
|
+
a69;274A
|
212
|
+
a6;271D
|
213
|
+
a70;274B
|
214
|
+
a71;25CF
|
215
|
+
a72;274D
|
216
|
+
a73;25A0
|
217
|
+
a74;274F
|
218
|
+
a75;2751
|
219
|
+
a76;25B2
|
220
|
+
a77;25BC
|
221
|
+
a78;25C6
|
222
|
+
a79;2756
|
223
|
+
a7;271E
|
224
|
+
a81;25D7
|
225
|
+
a82;2758
|
226
|
+
a83;2759
|
227
|
+
a84;275A
|
228
|
+
a85;276F
|
229
|
+
a86;2771
|
230
|
+
a87;2772
|
231
|
+
a88;2773
|
232
|
+
a89;2768
|
233
|
+
a8;271F
|
234
|
+
a90;2769
|
235
|
+
a91;276C
|
236
|
+
a92;276D
|
237
|
+
a93;276A
|
238
|
+
a94;276B
|
239
|
+
a95;2774
|
240
|
+
a96;2775
|
241
|
+
a97;275B
|
242
|
+
a98;275C
|
243
|
+
a99;275D
|
244
|
+
a9;2720
|
245
|
+
# END
|
data/lib/pdf/reader/lzw.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
module PDF
|
@@ -35,9 +36,9 @@ module PDF
|
|
35
36
|
|
36
37
|
def read
|
37
38
|
bits_left_in_chunk = @bits_in_chunk
|
38
|
-
chunk =
|
39
|
+
chunk = -1
|
39
40
|
while bits_left_in_chunk > 0 and @current_pos < @data.size
|
40
|
-
chunk = 0 if chunk
|
41
|
+
chunk = 0 if chunk < 0
|
41
42
|
codepoint = @data[@current_pos, 1].unpack("C*")[0]
|
42
43
|
current_byte = codepoint & (2**@bits_left_in_byte - 1) #clear consumed bits
|
43
44
|
dif = bits_left_in_chunk - @bits_left_in_byte
|
@@ -83,6 +84,7 @@ module PDF
|
|
83
84
|
#
|
84
85
|
def self.decode(data)
|
85
86
|
stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
|
87
|
+
string_table = StringTable.new
|
86
88
|
result = "".dup
|
87
89
|
until (code = stream.read) == CODE_EOD
|
88
90
|
if code == CODE_CLEAR_TABLE
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
class PDF::Reader
|
@@ -331,11 +332,15 @@ class PDF::Reader
|
|
331
332
|
def decrypt(ref, obj)
|
332
333
|
case obj
|
333
334
|
when PDF::Reader::Stream then
|
334
|
-
|
335
|
+
# PDF 32000-1:2008 7.5.8.2: "The cross-reference stream shall not be encrypted [...]."
|
336
|
+
# Therefore we shouldn't try to decrypt it.
|
337
|
+
obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
|
335
338
|
obj
|
336
339
|
when Hash then
|
337
|
-
arr = obj.map { |key,val| [key, decrypt(ref, val)] }
|
338
|
-
|
340
|
+
arr = obj.map { |key,val| [key, decrypt(ref, val)] }
|
341
|
+
arr.each_with_object({}) { |(k,v), accum|
|
342
|
+
accum[k] = v
|
343
|
+
}
|
339
344
|
when Array then
|
340
345
|
obj.collect { |item| decrypt(ref, item) }
|
341
346
|
when String
|
@@ -1,4 +1,6 @@
|
|
1
|
+
# typed: true
|
1
2
|
# coding: utf-8
|
3
|
+
# frozen_string_literal: true
|
2
4
|
|
3
5
|
class PDF::Reader
|
4
6
|
# remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
|
@@ -38,7 +40,8 @@ class PDF::Reader
|
|
38
40
|
|
39
41
|
def self.detect_intersection(sweep_line_status, event_point)
|
40
42
|
sweep_line_status.each do |open_text_run|
|
41
|
-
if
|
43
|
+
if open_text_run.text == event_point.run.text &&
|
44
|
+
event_point.x >= open_text_run.x &&
|
42
45
|
event_point.x <= open_text_run.endx &&
|
43
46
|
open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
|
44
47
|
return true
|
@@ -51,10 +54,14 @@ class PDF::Reader
|
|
51
54
|
# Utility class used to avoid modifying the underlying TextRun objects while we're
|
52
55
|
# looking for duplicates
|
53
56
|
class EventPoint
|
54
|
-
attr_reader :x, :run
|
55
57
|
|
56
|
-
|
57
|
-
|
58
|
+
attr_reader :x
|
59
|
+
|
60
|
+
attr_reader :run
|
61
|
+
|
62
|
+
def initialize(x, run)
|
63
|
+
@x = x
|
64
|
+
@run = run
|
58
65
|
end
|
59
66
|
|
60
67
|
def start?
|
data/lib/pdf/reader/page.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
module PDF
|
@@ -68,10 +69,33 @@ module PDF
|
|
68
69
|
@attributes
|
69
70
|
end
|
70
71
|
|
72
|
+
def height
|
73
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
74
|
+
rect.apply_rotation(rotate) if rotate > 0
|
75
|
+
rect.height
|
76
|
+
end
|
77
|
+
|
78
|
+
def width
|
79
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
80
|
+
rect.apply_rotation(rotate) if rotate > 0
|
81
|
+
rect.width
|
82
|
+
end
|
83
|
+
|
84
|
+
def origin
|
85
|
+
rect = Rectangle.new(*attributes[:MediaBox])
|
86
|
+
rect.apply_rotation(rotate) if rotate > 0
|
87
|
+
|
88
|
+
rect.bottom_left
|
89
|
+
end
|
90
|
+
|
71
91
|
# Convenience method to identify the page's orientation.
|
72
92
|
#
|
73
93
|
def orientation
|
74
|
-
|
94
|
+
if height > width
|
95
|
+
"portrait"
|
96
|
+
else
|
97
|
+
"landscape"
|
98
|
+
end
|
75
99
|
end
|
76
100
|
|
77
101
|
# returns the plain text content of this page encoded as UTF-8. Any
|
@@ -139,23 +163,50 @@ module PDF
|
|
139
163
|
# returns the "boxes" that define the page object.
|
140
164
|
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
141
165
|
#
|
166
|
+
# DEPRECATED. Recommend using Page#rectangles instead
|
167
|
+
#
|
142
168
|
def boxes
|
143
|
-
|
144
|
-
|
169
|
+
# In ruby 2.4+ we could use Hash#transform_values
|
170
|
+
Hash[rectangles.map{ |k,rect| [k,rect.to_a] } ]
|
171
|
+
end
|
172
|
+
|
173
|
+
# returns the "boxes" that define the page object.
|
174
|
+
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
175
|
+
#
|
176
|
+
def rectangles
|
177
|
+
mediabox = objects.deref!(attributes[:MediaBox])
|
178
|
+
cropbox = objects.deref!(attributes[:Cropbox]) || mediabox
|
179
|
+
bleedbox = objects.deref!(attributes[:BleedBox]) || cropbox
|
180
|
+
trimbox = objects.deref!(attributes[:TrimBox]) || cropbox
|
181
|
+
artbox = objects.deref!(attributes[:ArtBox]) || cropbox
|
182
|
+
|
183
|
+
mediarect = Rectangle.new(*mediabox)
|
184
|
+
croprect = Rectangle.new(*cropbox)
|
185
|
+
bleedrect = Rectangle.new(*bleedbox)
|
186
|
+
trimrect = Rectangle.new(*trimbox)
|
187
|
+
artrect = Rectangle.new(*artbox)
|
188
|
+
|
189
|
+
if rotate > 0
|
190
|
+
mediarect.apply_rotation(rotate)
|
191
|
+
croprect.apply_rotation(rotate)
|
192
|
+
bleedrect.apply_rotation(rotate)
|
193
|
+
trimrect.apply_rotation(rotate)
|
194
|
+
artrect.apply_rotation(rotate)
|
195
|
+
end
|
145
196
|
|
146
197
|
{
|
147
|
-
MediaBox:
|
148
|
-
CropBox:
|
149
|
-
BleedBox:
|
150
|
-
TrimBox:
|
151
|
-
ArtBox:
|
198
|
+
MediaBox: mediarect,
|
199
|
+
CropBox: croprect,
|
200
|
+
BleedBox: bleedrect,
|
201
|
+
TrimBox: trimrect,
|
202
|
+
ArtBox: artrect,
|
152
203
|
}
|
153
204
|
end
|
154
205
|
|
155
206
|
private
|
156
207
|
|
157
208
|
def root
|
158
|
-
|
209
|
+
@root ||= objects.deref(@objects.trailer[:Root])
|
159
210
|
end
|
160
211
|
|
161
212
|
# Returns the resources that accompany this page. Includes
|
@@ -1,7 +1,9 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'pdf/reader/overlapping_runs_filter'
|
6
|
+
require 'pdf/reader/zero_width_runs_filter'
|
5
7
|
|
6
8
|
class PDF::Reader
|
7
9
|
|
@@ -15,14 +17,17 @@ class PDF::Reader
|
|
15
17
|
DEFAULT_FONT_SIZE = 12
|
16
18
|
|
17
19
|
def initialize(runs, mediabox)
|
18
|
-
|
19
|
-
|
20
|
-
|
20
|
+
# mediabox is a 4-element array for now, but it'd be nice to switch to a
|
21
|
+
# PDF::Reader::Rectangle at some point
|
22
|
+
PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
|
23
|
+
|
24
|
+
runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
|
25
|
+
runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
|
26
|
+
@mediabox = mediabox
|
27
|
+
@runs = merge_runs(runs)
|
21
28
|
@mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
|
22
29
|
@mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
|
23
|
-
@
|
24
|
-
@page_width = (mediabox[2] - mediabox[0]).abs
|
25
|
-
@page_height = (mediabox[3] - mediabox[1]).abs
|
30
|
+
@median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
|
26
31
|
@x_offset = @runs.map(&:x).sort.first || 0
|
27
32
|
lowest_y = @runs.map(&:y).sort.first || 0
|
28
33
|
@y_offset = lowest_y > 0 ? 0 : lowest_y
|
@@ -30,6 +35,7 @@ class PDF::Reader
|
|
30
35
|
|
31
36
|
def to_s
|
32
37
|
return "" if @runs.empty?
|
38
|
+
return "" if row_count == 0
|
33
39
|
|
34
40
|
page = row_count.times.map { |i| " " * col_count }
|
35
41
|
@runs.each do |run|
|
@@ -44,6 +50,16 @@ class PDF::Reader
|
|
44
50
|
|
45
51
|
private
|
46
52
|
|
53
|
+
def page_width
|
54
|
+
# TODO once @mediabox is a Rectangle, this can be just `@mediabox.width`
|
55
|
+
(@mediabox[2].to_f - @mediabox[0].to_f).abs
|
56
|
+
end
|
57
|
+
|
58
|
+
def page_height
|
59
|
+
# TODO once @mediabox is a Rectangle, this can be just `@mediabox.height`
|
60
|
+
(@mediabox[3].to_f - @mediabox[1].to_f).abs
|
61
|
+
end
|
62
|
+
|
47
63
|
# given an array of strings, return a new array with empty rows from the
|
48
64
|
# beginning and end removed.
|
49
65
|
#
|
@@ -62,19 +78,19 @@ class PDF::Reader
|
|
62
78
|
end
|
63
79
|
|
64
80
|
def row_count
|
65
|
-
@row_count ||= (
|
81
|
+
@row_count ||= (page_height / @mean_font_size).floor
|
66
82
|
end
|
67
83
|
|
68
84
|
def col_count
|
69
|
-
@col_count ||= ((
|
85
|
+
@col_count ||= ((page_width / @median_glyph_width) * 1.05).floor
|
70
86
|
end
|
71
87
|
|
72
88
|
def row_multiplier
|
73
|
-
@row_multiplier ||=
|
89
|
+
@row_multiplier ||= page_height.to_f / row_count.to_f
|
74
90
|
end
|
75
91
|
|
76
92
|
def col_multiplier
|
77
|
-
@col_multiplier ||=
|
93
|
+
@col_multiplier ||= page_width.to_f / col_count.to_f
|
78
94
|
end
|
79
95
|
|
80
96
|
def mean(collection)
|
@@ -85,12 +101,12 @@ class PDF::Reader
|
|
85
101
|
end
|
86
102
|
end
|
87
103
|
|
88
|
-
def
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
104
|
+
def median(collection)
|
105
|
+
if collection.size == 0
|
106
|
+
0
|
107
|
+
else
|
108
|
+
collection.sort[(collection.size * 0.5).floor]
|
109
|
+
end
|
94
110
|
end
|
95
111
|
|
96
112
|
# take a collection of TextRun objects and merge any that are in close
|
@@ -104,17 +120,15 @@ class PDF::Reader
|
|
104
120
|
end
|
105
121
|
|
106
122
|
def group_chars_into_runs(chars)
|
107
|
-
|
108
|
-
while head = chars.shift
|
123
|
+
chars.each_with_object([]) do |char, runs|
|
109
124
|
if runs.empty?
|
110
|
-
runs <<
|
111
|
-
elsif runs.last.mergable?(
|
112
|
-
runs[-1] = runs.last +
|
125
|
+
runs << char
|
126
|
+
elsif runs.last.mergable?(char)
|
127
|
+
runs[-1] = runs.last + char
|
113
128
|
else
|
114
|
-
runs <<
|
129
|
+
runs << char
|
115
130
|
end
|
116
131
|
end
|
117
|
-
runs
|
118
132
|
end
|
119
133
|
|
120
134
|
def local_string_insert(haystack, needle, index)
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'pdf/reader/transformation_matrix'
|
@@ -30,15 +31,7 @@ class PDF::Reader
|
|
30
31
|
@xobject_stack = [page.xobjects]
|
31
32
|
@cs_stack = [page.color_spaces]
|
32
33
|
@stack = [DEFAULT_GRAPHICS_STATE.dup]
|
33
|
-
|
34
|
-
state[:ctm] = identity_matrix
|
35
|
-
else
|
36
|
-
rotate_cos = Math.cos(page.rotate * (Math::PI/180.0)).round(2)
|
37
|
-
rotate_sin = Math.sin(page.rotate * (Math::PI/180.0)).round(2)
|
38
|
-
state[:ctm] = TransformationMatrix.new(rotate_cos, rotate_sin,
|
39
|
-
rotate_sin * -1, rotate_cos,
|
40
|
-
0, 0)
|
41
|
-
end
|
34
|
+
state[:ctm] = identity_matrix
|
42
35
|
end
|
43
36
|
|
44
37
|
#####################################################
|
@@ -320,7 +313,7 @@ class PDF::Reader
|
|
320
313
|
# may need to be added
|
321
314
|
#
|
322
315
|
def process_glyph_displacement(w0, tj, word_boundary)
|
323
|
-
fs =
|
316
|
+
fs = state[:text_font_size]
|
324
317
|
tc = state[:char_spacing]
|
325
318
|
if word_boundary
|
326
319
|
tw = state[:word_spacing]
|
@@ -330,22 +323,24 @@ class PDF::Reader
|
|
330
323
|
th = state[:h_scaling]
|
331
324
|
# optimise the common path to reduce Float allocations
|
332
325
|
if th == 1 && tj == 0 && tc == 0 && tw == 0
|
333
|
-
|
334
|
-
|
326
|
+
tx = w0 * fs
|
327
|
+
elsif tj != 0
|
328
|
+
# don't apply spacing to TJ displacement
|
329
|
+
tx = (w0 - (tj/1000.0)) * fs * th
|
335
330
|
else
|
336
|
-
|
337
|
-
tx =
|
331
|
+
# apply horizontal scaling to spacing values but not font size
|
332
|
+
tx = ((w0 * fs) + tc + tw) * th
|
338
333
|
end
|
339
|
-
|
340
|
-
# TODO: I'm pretty sure that tx shouldn't need to be divided by
|
341
|
-
# ctm[0] here, but this gets my tests green and I'm out of
|
342
|
-
# ideas for now
|
343
334
|
# TODO: support ty > 0
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
335
|
+
ty = 0
|
336
|
+
temp = TransformationMatrix.new(1, 0,
|
337
|
+
0, 1,
|
338
|
+
tx, ty)
|
339
|
+
@text_matrix = temp.multiply!(
|
340
|
+
@text_matrix.a, @text_matrix.b,
|
341
|
+
@text_matrix.c, @text_matrix.d,
|
342
|
+
@text_matrix.e, @text_matrix.f
|
343
|
+
)
|
349
344
|
@font_size = @text_rendering_matrix = nil # invalidate cached value
|
350
345
|
end
|
351
346
|
|