pdf-reader 2.4.0 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +31 -0
- data/README.md +17 -2
- data/Rakefile +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/buffer.rb +62 -21
- data/lib/pdf/reader/encoding.rb +1 -1
- data/lib/pdf/reader/error.rb +3 -3
- data/lib/pdf/reader/filter/ascii85.rb +5 -1
- data/lib/pdf/reader/filter/depredict.rb +3 -3
- data/lib/pdf/reader/filter/flate.rb +28 -16
- data/lib/pdf/reader/font.rb +3 -1
- data/lib/pdf/reader/glyph_hash.rb +15 -9
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/object_hash.rb +3 -1
- data/lib/pdf/reader/orientation_detector.rb +2 -2
- data/lib/pdf/reader/page.rb +28 -0
- data/lib/pdf/reader/page_layout.rb +19 -13
- data/lib/pdf/reader/page_state.rb +7 -5
- data/lib/pdf/reader/page_text_receiver.rb +22 -1
- data/lib/pdf/reader/parser.rb +8 -6
- data/lib/pdf/reader/width_calculator/built_in.rb +7 -15
- data/lib/pdf/reader/xref.rb +6 -1
- data/lib/pdf/reader/zero_width_runs_filter.rb +11 -0
- metadata +17 -14
@@ -8,6 +8,9 @@ class PDF::Reader
|
|
8
8
|
module Filter # :nodoc:
|
9
9
|
# implementation of the Flate (zlib) stream filter
|
10
10
|
class Flate
  # Window-bits values understood by Zlib::Inflate.new:
  #   47  => auto-detect a zlib (RFC1950) or gzip (RFC1952) container
  #   -15 => raw deflate (RFC1951) stream with no container at all
  ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
  ZLIB_RAW_DEFLATE = -15 # Zlib::MAX_WBITS * -1

  # options - filter parameters; stored as-is and handed to Depredict
  #           after inflation.
  def initialize(options = {})
    @options = options
  end

  ################################################################################
  # Decode the specified data with the Zlib compression algorithm
  #
  # data - the compressed bytes (String).
  #
  # Inflation is attempted on the data as given and then, if that fails,
  # on the data with its final byte dropped. The inflated bytes are then
  # passed through Depredict with the options given to the constructor.
  #
  # Raises MalformedPDFError when no attempt manages to inflate the data.
  def filter(data)
    deflated = zlib_inflate(data) || zlib_inflate(data[0, data.bytesize-1])

    if deflated.nil?
      raise MalformedPDFError,
        "Error while inflating a compressed stream (no suitable inflation algorithm found)"
    end
    Depredict.new(@options).filter(deflated)
  end

  private

  # Try to inflate data, first as a zlib/gzip wrapped stream and then as a
  # raw deflate stream.
  #
  # Returns the inflated String, or nil if both attempts fail. Callers rely
  # on the nil return to try further fallbacks, so no zlib error is allowed
  # to escape from here.
  def zlib_inflate(data)
    begin
      return Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
    rescue Zlib::Error
      # The data is expected to be RFC1951 deflated data wrapped in a
      # RFC1950 zlib (or gzip) container. If that fails for any zlib reason
      # (bad header, missing preset dictionary, ...), swallow the exception
      # and attempt to inflate the data as a raw RFC1951 stream.
    end

    begin
      return Zlib::Inflate.new(ZLIB_RAW_DEFLATE).inflate(data)
    rescue StandardError
      # swallow this one too, so we can try some other fallback options
    end

    nil
  end
end
|
39
51
|
end
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -131,7 +131,9 @@ class PDF::Reader
|
|
131
131
|
if obj[:ToUnicode]
|
132
132
|
# ToUnicode is optional for Type1 and Type3
|
133
133
|
stream = @ohash.object(obj[:ToUnicode])
|
134
|
-
|
134
|
+
if stream.is_a?(PDF::Reader::Stream)
|
135
|
+
@tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
136
|
+
end
|
135
137
|
end
|
136
138
|
end
|
137
139
|
|
@@ -103,19 +103,25 @@ class PDF::Reader
|
|
103
103
|
|
104
104
|
# returns a hash that maps glyph names to unicode codepoints. The mapping is based on
|
105
105
|
# a text file supplied by Adobe at:
|
106
|
-
#
|
106
|
+
# https://github.com/adobe-type-tools/agl-aglfn
|
107
107
|
def load_adobe_glyph_mapping
|
108
108
|
keyed_by_name = {}
|
109
109
|
keyed_by_codepoint = {}
|
110
110
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
111
|
+
paths = [
|
112
|
+
File.dirname(__FILE__) + "/glyphlist.txt",
|
113
|
+
File.dirname(__FILE__) + "/glyphlist-zapfdingbats.txt",
|
114
|
+
]
|
115
|
+
paths.each do |path|
|
116
|
+
File.open(path, "r:BINARY") do |f|
|
117
|
+
f.each do |l|
|
118
|
+
_m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
|
119
|
+
if name && code
|
120
|
+
cp = "0x#{code}".hex
|
121
|
+
keyed_by_name[name.to_sym] = cp
|
122
|
+
keyed_by_codepoint[cp] ||= []
|
123
|
+
keyed_by_codepoint[cp] << name.to_sym
|
124
|
+
end
|
119
125
|
end
|
120
126
|
end
|
121
127
|
end
|
@@ -0,0 +1,245 @@
|
|
1
|
+
# -----------------------------------------------------------
|
2
|
+
# Copyright 2002-2019 Adobe (http://www.adobe.com/).
|
3
|
+
#
|
4
|
+
# Redistribution and use in source and binary forms, with or
|
5
|
+
# without modification, are permitted provided that the
|
6
|
+
# following conditions are met:
|
7
|
+
#
|
8
|
+
# Redistributions of source code must retain the above
|
9
|
+
# copyright notice, this list of conditions and the following
|
10
|
+
# disclaimer.
|
11
|
+
#
|
12
|
+
# Redistributions in binary form must reproduce the above
|
13
|
+
# copyright notice, this list of conditions and the following
|
14
|
+
# disclaimer in the documentation and/or other materials
|
15
|
+
# provided with the distribution.
|
16
|
+
#
|
17
|
+
# Neither the name of Adobe nor the names of its contributors
|
18
|
+
# may be used to endorse or promote products derived from this
|
19
|
+
# software without specific prior written permission.
|
20
|
+
#
|
21
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
|
22
|
+
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
23
|
+
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
24
|
+
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
25
|
+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
26
|
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
27
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
28
|
+
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
29
|
+
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
30
|
+
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
31
|
+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
32
|
+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
33
|
+
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
34
|
+
# -----------------------------------------------------------
|
35
|
+
# Name: ITC Zapf Dingbats Glyph List
|
36
|
+
# Table version: 2.0
|
37
|
+
# Date: September 20, 2002
|
38
|
+
# URL: https://github.com/adobe-type-tools/agl-aglfn
|
39
|
+
#
|
40
|
+
# Format: two semicolon-delimited fields:
|
41
|
+
# (1) glyph name--upper/lowercase letters and digits
|
42
|
+
# (2) Unicode scalar value--four uppercase hexadecimal digits
|
43
|
+
#
|
44
|
+
a100;275E
|
45
|
+
a101;2761
|
46
|
+
a102;2762
|
47
|
+
a103;2763
|
48
|
+
a104;2764
|
49
|
+
a105;2710
|
50
|
+
a106;2765
|
51
|
+
a107;2766
|
52
|
+
a108;2767
|
53
|
+
a109;2660
|
54
|
+
a10;2721
|
55
|
+
a110;2665
|
56
|
+
a111;2666
|
57
|
+
a112;2663
|
58
|
+
a117;2709
|
59
|
+
a118;2708
|
60
|
+
a119;2707
|
61
|
+
a11;261B
|
62
|
+
a120;2460
|
63
|
+
a121;2461
|
64
|
+
a122;2462
|
65
|
+
a123;2463
|
66
|
+
a124;2464
|
67
|
+
a125;2465
|
68
|
+
a126;2466
|
69
|
+
a127;2467
|
70
|
+
a128;2468
|
71
|
+
a129;2469
|
72
|
+
a12;261E
|
73
|
+
a130;2776
|
74
|
+
a131;2777
|
75
|
+
a132;2778
|
76
|
+
a133;2779
|
77
|
+
a134;277A
|
78
|
+
a135;277B
|
79
|
+
a136;277C
|
80
|
+
a137;277D
|
81
|
+
a138;277E
|
82
|
+
a139;277F
|
83
|
+
a13;270C
|
84
|
+
a140;2780
|
85
|
+
a141;2781
|
86
|
+
a142;2782
|
87
|
+
a143;2783
|
88
|
+
a144;2784
|
89
|
+
a145;2785
|
90
|
+
a146;2786
|
91
|
+
a147;2787
|
92
|
+
a148;2788
|
93
|
+
a149;2789
|
94
|
+
a14;270D
|
95
|
+
a150;278A
|
96
|
+
a151;278B
|
97
|
+
a152;278C
|
98
|
+
a153;278D
|
99
|
+
a154;278E
|
100
|
+
a155;278F
|
101
|
+
a156;2790
|
102
|
+
a157;2791
|
103
|
+
a158;2792
|
104
|
+
a159;2793
|
105
|
+
a15;270E
|
106
|
+
a160;2794
|
107
|
+
a161;2192
|
108
|
+
a162;27A3
|
109
|
+
a163;2194
|
110
|
+
a164;2195
|
111
|
+
a165;2799
|
112
|
+
a166;279B
|
113
|
+
a167;279C
|
114
|
+
a168;279D
|
115
|
+
a169;279E
|
116
|
+
a16;270F
|
117
|
+
a170;279F
|
118
|
+
a171;27A0
|
119
|
+
a172;27A1
|
120
|
+
a173;27A2
|
121
|
+
a174;27A4
|
122
|
+
a175;27A5
|
123
|
+
a176;27A6
|
124
|
+
a177;27A7
|
125
|
+
a178;27A8
|
126
|
+
a179;27A9
|
127
|
+
a17;2711
|
128
|
+
a180;27AB
|
129
|
+
a181;27AD
|
130
|
+
a182;27AF
|
131
|
+
a183;27B2
|
132
|
+
a184;27B3
|
133
|
+
a185;27B5
|
134
|
+
a186;27B8
|
135
|
+
a187;27BA
|
136
|
+
a188;27BB
|
137
|
+
a189;27BC
|
138
|
+
a18;2712
|
139
|
+
a190;27BD
|
140
|
+
a191;27BE
|
141
|
+
a192;279A
|
142
|
+
a193;27AA
|
143
|
+
a194;27B6
|
144
|
+
a195;27B9
|
145
|
+
a196;2798
|
146
|
+
a197;27B4
|
147
|
+
a198;27B7
|
148
|
+
a199;27AC
|
149
|
+
a19;2713
|
150
|
+
a1;2701
|
151
|
+
a200;27AE
|
152
|
+
a201;27B1
|
153
|
+
a202;2703
|
154
|
+
a203;2750
|
155
|
+
a204;2752
|
156
|
+
a205;276E
|
157
|
+
a206;2770
|
158
|
+
a20;2714
|
159
|
+
a21;2715
|
160
|
+
a22;2716
|
161
|
+
a23;2717
|
162
|
+
a24;2718
|
163
|
+
a25;2719
|
164
|
+
a26;271A
|
165
|
+
a27;271B
|
166
|
+
a28;271C
|
167
|
+
a29;2722
|
168
|
+
a2;2702
|
169
|
+
a30;2723
|
170
|
+
a31;2724
|
171
|
+
a32;2725
|
172
|
+
a33;2726
|
173
|
+
a34;2727
|
174
|
+
a35;2605
|
175
|
+
a36;2729
|
176
|
+
a37;272A
|
177
|
+
a38;272B
|
178
|
+
a39;272C
|
179
|
+
a3;2704
|
180
|
+
a40;272D
|
181
|
+
a41;272E
|
182
|
+
a42;272F
|
183
|
+
a43;2730
|
184
|
+
a44;2731
|
185
|
+
a45;2732
|
186
|
+
a46;2733
|
187
|
+
a47;2734
|
188
|
+
a48;2735
|
189
|
+
a49;2736
|
190
|
+
a4;260E
|
191
|
+
a50;2737
|
192
|
+
a51;2738
|
193
|
+
a52;2739
|
194
|
+
a53;273A
|
195
|
+
a54;273B
|
196
|
+
a55;273C
|
197
|
+
a56;273D
|
198
|
+
a57;273E
|
199
|
+
a58;273F
|
200
|
+
a59;2740
|
201
|
+
a5;2706
|
202
|
+
a60;2741
|
203
|
+
a61;2742
|
204
|
+
a62;2743
|
205
|
+
a63;2744
|
206
|
+
a64;2745
|
207
|
+
a65;2746
|
208
|
+
a66;2747
|
209
|
+
a67;2748
|
210
|
+
a68;2749
|
211
|
+
a69;274A
|
212
|
+
a6;271D
|
213
|
+
a70;274B
|
214
|
+
a71;25CF
|
215
|
+
a72;274D
|
216
|
+
a73;25A0
|
217
|
+
a74;274F
|
218
|
+
a75;2751
|
219
|
+
a76;25B2
|
220
|
+
a77;25BC
|
221
|
+
a78;25C6
|
222
|
+
a79;2756
|
223
|
+
a7;271E
|
224
|
+
a81;25D7
|
225
|
+
a82;2758
|
226
|
+
a83;2759
|
227
|
+
a84;275A
|
228
|
+
a85;276F
|
229
|
+
a86;2771
|
230
|
+
a87;2772
|
231
|
+
a88;2773
|
232
|
+
a89;2768
|
233
|
+
a8;271F
|
234
|
+
a90;2769
|
235
|
+
a91;276C
|
236
|
+
a92;276D
|
237
|
+
a93;276A
|
238
|
+
a94;276B
|
239
|
+
a95;2774
|
240
|
+
a96;2775
|
241
|
+
a97;275B
|
242
|
+
a98;275C
|
243
|
+
a99;275D
|
244
|
+
a9;2720
|
245
|
+
# END
|
@@ -331,7 +331,9 @@ class PDF::Reader
|
|
331
331
|
def decrypt(ref, obj)
|
332
332
|
case obj
|
333
333
|
when PDF::Reader::Stream then
|
334
|
-
|
334
|
+
# PDF 32000-1:2008 7.5.8.2: "The cross-reference stream shall not be encrypted [...]."
|
335
|
+
# Therefore we shouldn't try to decrypt it.
|
336
|
+
obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
|
335
337
|
obj
|
336
338
|
when Hash then
|
337
339
|
arr = obj.map { |key,val| [key, decrypt(ref, val)] }.flatten(1)
|
@@ -22,8 +22,8 @@ class PDF::Reader
|
|
22
22
|
def detect_orientation
|
23
23
|
llx,lly,urx,ury = @attributes[:MediaBox]
|
24
24
|
rotation = @attributes[:Rotate].to_i
|
25
|
-
width = urx.to_i - llx.to_i
|
26
|
-
height = ury.to_i - lly.to_i
|
25
|
+
width = (urx.to_i - llx.to_i).abs
|
26
|
+
height = (ury.to_i - lly.to_i).abs
|
27
27
|
if width > height
|
28
28
|
(rotation % 180).zero? ? 'landscape' : 'portrait'
|
29
29
|
else
|
data/lib/pdf/reader/page.rb
CHANGED
@@ -124,6 +124,34 @@ module PDF
|
|
124
124
|
}.join(" ")
|
125
125
|
end
|
126
126
|
|
127
|
+
# returns the angle to rotate the page clockwise. Always 0, 90, 180 or 270
#
# The raw :Rotate attribute is normalised modulo 360 first, so equivalent
# angles written outside the 0...360 range (e.g. -90 or 450) map onto
# their canonical value instead of being discarded. Anything that is
# still not a multiple of 90 falls back to 0.
#
def rotate
  value = attributes[:Rotate].to_i % 360
  case value
  when 0, 90, 180, 270
    value
  else
    0
  end
end
|
138
|
+
|
139
|
+
# returns the "boxes" that define the page object.
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
#
# The result is a Hash keyed by :MediaBox, :CropBox, :BleedBox, :TrimBox
# and :ArtBox. CropBox falls back to MediaBox; the remaining three fall
# back to CropBox. Every value is passed through objects.deref!.
#
def boxes
  mediabox = attributes[:MediaBox]
  # PDF names are case-sensitive: the dictionary key is /CropBox
  cropbox = attributes[:CropBox] || mediabox

  {
    MediaBox: objects.deref!(mediabox),
    CropBox: objects.deref!(cropbox),
    BleedBox: objects.deref!(attributes[:BleedBox] || cropbox),
    TrimBox: objects.deref!(attributes[:TrimBox] || cropbox),
    ArtBox: objects.deref!(attributes[:ArtBox] || cropbox)
  }
end
|
154
|
+
|
127
155
|
private
|
128
156
|
|
129
157
|
def root
|
@@ -2,6 +2,7 @@
|
|
2
2
|
# frozen_string_literal: true
|
3
3
|
|
4
4
|
require 'pdf/reader/overlapping_runs_filter'
|
5
|
+
require 'pdf/reader/zero_width_runs_filter'
|
5
6
|
|
6
7
|
class PDF::Reader
|
7
8
|
|
@@ -17,22 +18,27 @@ class PDF::Reader
|
|
17
18
|
# Prepare a layout from the text runs found on a page.
#
# runs     - collection of text runs; passed through ZeroWidthRunsFilter
#            and OverlappingRunsFilter and then merge_runs before use.
# mediabox - four element array [llx, lly, urx, ury]; required.
#
# Raises ArgumentError if mediabox is nil.
def initialize(runs, mediabox)
  raise ArgumentError, "a mediabox must be provided" if mediabox.nil?

  filtered = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
  filtered = OverlappingRunsFilter.exclude_redundant_runs(filtered)
  @runs = merge_runs(filtered)

  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
  @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0

  # the box corners may be supplied in either order, so take magnitudes
  @page_width = (mediabox[2] - mediabox[0]).abs
  @page_height = (mediabox[3] - mediabox[1]).abs

  @x_offset = @runs.map(&:x).min || 0

  # only shift rows when some run sits below the origin
  lowest_y = @runs.map(&:y).min || 0
  if lowest_y > 0
    @y_offset = 0
  else
    @y_offset = lowest_y
  end
end
|
28
33
|
|
29
34
|
def to_s
|
30
35
|
return "" if @runs.empty?
|
36
|
+
return "" if row_count == 0
|
31
37
|
|
32
38
|
page = row_count.times.map { |i| " " * col_count }
|
33
39
|
@runs.each do |run|
|
34
40
|
x_pos = ((run.x - @x_offset) / col_multiplier).round
|
35
|
-
y_pos = row_count - (run.y / row_multiplier).round
|
41
|
+
y_pos = row_count - ((run.y - @y_offset) / row_multiplier).round
|
36
42
|
if y_pos <= row_count && y_pos >= 0 && x_pos <= col_count && x_pos >= 0
|
37
43
|
local_string_insert(page[y_pos-1], run.text, x_pos)
|
38
44
|
end
|
@@ -64,7 +70,7 @@ class PDF::Reader
|
|
64
70
|
end
|
65
71
|
|
66
72
|
# Number of character columns in the text grid: the page width measured
# in median glyph widths, padded by 5%. Memoised after the first call.
def col_count
  @col_count ||= begin
    glyphs_per_line = @page_width / @median_glyph_width
    (glyphs_per_line * 1.05).floor
  end
end
|
69
75
|
|
70
76
|
def row_multiplier
|
@@ -83,12 +89,12 @@ class PDF::Reader
|
|
83
89
|
end
|
84
90
|
end
|
85
91
|
|
86
|
-
def
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
+
# Returns the element at index size/2 of the sorted collection (the upper
# of the two middle elements when the size is even), or 0 when the
# collection is empty.
def median(collection)
  return 0 if collection.size == 0

  midpoint = (collection.size * 0.5).floor
  ordered = collection.sort
  ordered[midpoint]
end
|
93
99
|
|
94
100
|
# take a collection of TextRun objects and merge any that are in close
|
@@ -30,7 +30,7 @@ class PDF::Reader
|
|
30
30
|
@xobject_stack = [page.xobjects]
|
31
31
|
@cs_stack = [page.color_spaces]
|
32
32
|
@stack = [DEFAULT_GRAPHICS_STATE.dup]
|
33
|
-
state[:ctm]
|
33
|
+
state[:ctm] = identity_matrix
|
34
34
|
end
|
35
35
|
|
36
36
|
#####################################################
|
@@ -322,11 +322,13 @@ class PDF::Reader
|
|
322
322
|
th = state[:h_scaling]
|
323
323
|
# optimise the common path to reduce Float allocations
|
324
324
|
if th == 1 && tj == 0 && tc == 0 && tw == 0
|
325
|
-
|
326
|
-
|
325
|
+
tx = w0 * fs
|
326
|
+
elsif tj != 0
|
327
|
+
# don't apply spacing to TJ displacement
|
328
|
+
tx = (w0 - (tj/1000.0)) * fs * th
|
327
329
|
else
|
328
|
-
|
329
|
-
tx =
|
330
|
+
# apply horizontal scaling to spacing values but not font size
|
331
|
+
tx = ((w0 * fs) + tc + tw) * th
|
330
332
|
end
|
331
333
|
|
332
334
|
# TODO: I'm pretty sure that tx shouldn't need to be divided by
|
@@ -41,13 +41,17 @@ module PDF
|
|
41
41
|
# starting a new page
|
42
42
|
# Reset all per-page state for the given page and compute the MediaBox
# corners after the CTM transform and page rotation have been applied.
def page=(page)
  @state = PageState.new(page)
  @page = page
  @content = []
  @characters = []
  @mediabox = page.objects.deref(page.attributes[:MediaBox])

  bottom_left = apply_rotation(*@state.ctm_transform(@mediabox[0], @mediabox[1]))
  top_right = apply_rotation(*@state.ctm_transform(@mediabox[2], @mediabox[3]))
  @device_mediabox = [
    bottom_left.first, bottom_left.last,
    top_right.first, top_right.last
  ]
end
|
48
52
|
|
49
53
|
# Render the characters collected so far as a String, laid out within
# the transformed MediaBox.
def content
  layout = PageLayout.new(@characters, @device_mediabox)
  layout.to_s
end
|
52
56
|
|
53
57
|
#####################################################
|
@@ -101,6 +105,8 @@ module PDF
|
|
101
105
|
glyphs.each_with_index do |glyph_code, index|
|
102
106
|
# paint the current glyph
|
103
107
|
newx, newy = @state.trm_transform(0,0)
|
108
|
+
newx, newy = apply_rotation(newx, newy)
|
109
|
+
|
104
110
|
utf8_chars = @state.current_font.to_utf8(glyph_code)
|
105
111
|
|
106
112
|
# apply the glyph displacement for the current glyph so the next
|
@@ -115,6 +121,21 @@ module PDF
|
|
115
121
|
end
|
116
122
|
end
|
117
123
|
|
124
|
+
# Rotate the point (x, y) about the origin according to the page's
# rotation (@page.rotate) and return the new [x, y] pair.
#
# Each 90 degree step maps (x, y) to (y, -x); the 180 and 270 degree
# cases are that same step applied two and three times, so all three
# are true rotations of one another. Any other rotation value leaves
# the point unchanged.
def apply_rotation(x, y)
  case @page.rotate
  when 90
    x, y = y, x * -1
  when 180
    # half turn: both axes flip
    x, y = x * -1, y * -1
  when 270
    x, y = y * -1, x
  end
  return x, y
end
|
138
|
+
|
118
139
|
end
|
119
140
|
end
|
120
141
|
end
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -175,15 +175,18 @@ class PDF::Reader
|
|
175
175
|
return "".dup.force_encoding("binary") if str == ")"
|
176
176
|
Error.assert_equal(parse_token, ")")
|
177
177
|
|
178
|
-
str.gsub!(/\\([nrtbf()\\\n]
|
179
|
-
|
178
|
+
str.gsub!(/\\(\r\n|[nrtbf()\\\n\r]|([0-7]{1,3}))?|\r\n?/m) do |match|
|
179
|
+
if $2.nil? # not octal digits
|
180
|
+
MAPPING[match] || "".dup
|
181
|
+
else # must be octal digits
|
182
|
+
($2.oct & 0xff).chr # ignore high level overflow
|
183
|
+
end
|
180
184
|
end
|
181
185
|
str.force_encoding("binary")
|
182
186
|
end
|
183
187
|
|
184
188
|
MAPPING = {
|
185
189
|
"\r" => "\n",
|
186
|
-
"\n\r" => "\n",
|
187
190
|
"\r\n" => "\n",
|
188
191
|
"\\n" => "\n",
|
189
192
|
"\\r" => "\r",
|
@@ -194,10 +197,9 @@ class PDF::Reader
|
|
194
197
|
"\\)" => ")",
|
195
198
|
"\\\\" => "\\",
|
196
199
|
"\\\n" => "",
|
200
|
+
"\\\r" => "",
|
201
|
+
"\\\r\n" => "",
|
197
202
|
}
|
198
|
-
0.upto(9) { |n| MAPPING["\\00"+n.to_s] = ("00"+n.to_s).oct.chr }
|
199
|
-
0.upto(99) { |n| MAPPING["\\0"+n.to_s] = ("0"+n.to_s).oct.chr }
|
200
|
-
0.upto(377) { |n| MAPPING["\\"+n.to_s] = n.to_s.oct.chr }
|
201
203
|
|
202
204
|
################################################################################
|
203
205
|
# Decodes the contents of a PDF Stream and returns it as a Ruby String.
|
@@ -37,23 +37,15 @@ class PDF::Reader
|
|
37
37
|
# Look up the width of the glyph for a character code.
#
# code_point - Integer character code; nil and negative values yield 0.
#
# The code is mapped to glyph names via the font's encoding and the first
# name with an entry in the built-in metrics supplies the width (:wx).
# Otherwise the font's own widths list is consulted at code_point - 1,
# and 0 is returned when nothing is found.
def glyph_width(code_point)
  return 0 if code_point.nil? || code_point < 0

  names = @font.encoding.int_to_name(code_point)
  metrics = names.map { |name|
    @metrics.char_metrics[name.to_s]
  }.compact.first
  return metrics[:wx] if metrics

  index = code_point - 1
  # code point 0 would otherwise index -1, i.e. wrap to the last width
  # NOTE(review): assumes @font.widths is an Array - confirm
  return 0 if index < 0

  @font.widths[index] || 0
end
|
59
51
|
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -131,6 +131,9 @@ class PDF::Reader
|
|
131
131
|
generation = buf.token.to_i
|
132
132
|
state = buf.token
|
133
133
|
|
134
|
+
# Some PDF writers start numbering at 1 instead of 0. Fix up the number.
|
135
|
+
# TODO should this fix be logged?
|
136
|
+
objid = 0 if objid == 1 and offset == 0 and generation == 65535 and state == 'f'
|
134
137
|
store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
|
135
138
|
objid += 1
|
136
139
|
params.clear
|
@@ -146,7 +149,9 @@ class PDF::Reader
|
|
146
149
|
end
|
147
150
|
|
148
151
|
load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
|
149
|
-
|
152
|
+
# Some PDF creators seem to use '/Prev 0' in trailer if there is no previous xref
|
153
|
+
# It's not possible for an xref to appear at offset 0, so can safely skip the ref
|
154
|
+
load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev) and trailer[:Prev].to_i != 0
|
150
155
|
|
151
156
|
trailer
|
152
157
|
end
|