pdf-reader 2.4.0 → 2.6.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (40) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +31 -0
  3. data/README.md +17 -2
  4. data/Rakefile +1 -1
  5. data/examples/extract_fonts.rb +12 -7
  6. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  7. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  8. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  9. data/lib/pdf/reader/afm/Courier.afm +342 -342
  10. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  11. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  12. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  13. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  14. data/lib/pdf/reader/afm/MustRead.html +19 -0
  15. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  16. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  17. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  18. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  19. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  20. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  21. data/lib/pdf/reader/buffer.rb +62 -21
  22. data/lib/pdf/reader/encoding.rb +1 -1
  23. data/lib/pdf/reader/error.rb +3 -3
  24. data/lib/pdf/reader/filter/ascii85.rb +5 -1
  25. data/lib/pdf/reader/filter/depredict.rb +3 -3
  26. data/lib/pdf/reader/filter/flate.rb +28 -16
  27. data/lib/pdf/reader/font.rb +3 -1
  28. data/lib/pdf/reader/glyph_hash.rb +15 -9
  29. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  30. data/lib/pdf/reader/object_hash.rb +3 -1
  31. data/lib/pdf/reader/orientation_detector.rb +2 -2
  32. data/lib/pdf/reader/page.rb +28 -0
  33. data/lib/pdf/reader/page_layout.rb +19 -13
  34. data/lib/pdf/reader/page_state.rb +7 -5
  35. data/lib/pdf/reader/page_text_receiver.rb +22 -1
  36. data/lib/pdf/reader/parser.rb +8 -6
  37. data/lib/pdf/reader/width_calculator/built_in.rb +7 -15
  38. data/lib/pdf/reader/xref.rb +6 -1
  39. data/lib/pdf/reader/zero_width_runs_filter.rb +11 -0
  40. metadata +17 -14
@@ -8,6 +8,9 @@ class PDF::Reader
8
8
  module Filter # :nodoc:
9
9
  # implementation of the Flate (zlib) stream filter
10
10
  class Flate
11
+ ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
12
+ ZLIB_RAW_DEFLATE = -15 # Zlib::MAX_WBITS * -1
13
+
11
14
  def initialize(options = {})
12
15
  @options = options
13
16
  end
@@ -15,25 +18,34 @@ class PDF::Reader
15
18
  ################################################################################
16
19
  # Decode the specified data with the Zlib compression algorithm
17
20
  def filter(data)
18
- deflated = nil
21
+ deflated = zlib_inflate(data) || zlib_inflate(data[0, data.bytesize-1])
22
+
23
+ if deflated.nil?
24
+ raise MalformedPDFError,
25
+ "Error while inflating a compressed stream (no suitable inflation algorithm found)"
26
+ end
27
+ Depredict.new(@options).filter(deflated)
28
+ end
29
+
30
+ private
31
+
32
+ def zlib_inflate(data)
19
33
  begin
20
- deflated = Zlib::Inflate.new.inflate(data)
21
- rescue Zlib::DataError => e
34
+ return Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
35
+ rescue Zlib::DataError
22
36
  # by default, Ruby's Zlib assumes the data it's inflating
23
- # is RFC1951 deflated data, wrapped in a RFC1951 zlib container.
24
- # If that fails, then use an undocumented 'feature' to attempt to inflate
25
- # the data as a raw RFC1951 stream.
26
- #
27
- # See
28
- # - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
29
- # - http://www.gzip.org/zlib/zlib_faq.html#faq38
30
- deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
37
+ # is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
38
+ # fails, swallow the exception and attempt to inflate the data as a raw
39
+ # RFC1951 stream.
31
40
  end
32
- Depredict.new(@options).filter(deflated)
33
- rescue Exception => e
34
- # Oops, there was a problem inflating the stream
35
- raise MalformedPDFError,
36
- "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
41
+
42
+ begin
43
+ return Zlib::Inflate.new(ZLIB_RAW_DEFLATE).inflate(data)
44
+ rescue StandardError
45
+ # swallow this one too, so we can try some other fallback options
46
+ end
47
+
48
+ nil
37
49
  end
38
50
  end
39
51
  end
@@ -131,7 +131,9 @@ class PDF::Reader
131
131
  if obj[:ToUnicode]
132
132
  # ToUnicode is optional for Type1 and Type3
133
133
  stream = @ohash.object(obj[:ToUnicode])
134
- @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
134
+ if stream.is_a?(PDF::Reader::Stream)
135
+ @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
136
+ end
135
137
  end
136
138
  end
137
139
 
@@ -103,19 +103,25 @@ class PDF::Reader
103
103
 
104
104
  # returns a hash that maps glyph names to unicode codepoints. The mapping is based on
105
105
  # a text file supplied by Adobe at:
106
- # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
106
+ # https://github.com/adobe-type-tools/agl-aglfn
107
107
  def load_adobe_glyph_mapping
108
108
  keyed_by_name = {}
109
109
  keyed_by_codepoint = {}
110
110
 
111
- File.open(File.dirname(__FILE__) + "/glyphlist.txt", "r:BINARY") do |f|
112
- f.each do |l|
113
- _m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
114
- if name && code
115
- cp = "0x#{code}".hex
116
- keyed_by_name[name.to_sym] = cp
117
- keyed_by_codepoint[cp] ||= []
118
- keyed_by_codepoint[cp] << name.to_sym
111
+ paths = [
112
+ File.dirname(__FILE__) + "/glyphlist.txt",
113
+ File.dirname(__FILE__) + "/glyphlist-zapfdingbats.txt",
114
+ ]
115
+ paths.each do |path|
116
+ File.open(path, "r:BINARY") do |f|
117
+ f.each do |l|
118
+ _m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
119
+ if name && code
120
+ cp = "0x#{code}".hex
121
+ keyed_by_name[name.to_sym] = cp
122
+ keyed_by_codepoint[cp] ||= []
123
+ keyed_by_codepoint[cp] << name.to_sym
124
+ end
119
125
  end
120
126
  end
121
127
  end
@@ -0,0 +1,245 @@
1
+ # -----------------------------------------------------------
2
+ # Copyright 2002-2019 Adobe (http://www.adobe.com/).
3
+ #
4
+ # Redistribution and use in source and binary forms, with or
5
+ # without modification, are permitted provided that the
6
+ # following conditions are met:
7
+ #
8
+ # Redistributions of source code must retain the above
9
+ # copyright notice, this list of conditions and the following
10
+ # disclaimer.
11
+ #
12
+ # Redistributions in binary form must reproduce the above
13
+ # copyright notice, this list of conditions and the following
14
+ # disclaimer in the documentation and/or other materials
15
+ # provided with the distribution.
16
+ #
17
+ # Neither the name of Adobe nor the names of its contributors
18
+ # may be used to endorse or promote products derived from this
19
+ # software without specific prior written permission.
20
+ #
21
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
22
+ # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
23
+ # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24
+ # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
26
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
28
+ # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30
+ # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
32
+ # OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
+ # -----------------------------------------------------------
35
+ # Name: ITC Zapf Dingbats Glyph List
36
+ # Table version: 2.0
37
+ # Date: September 20, 2002
38
+ # URL: https://github.com/adobe-type-tools/agl-aglfn
39
+ #
40
+ # Format: two semicolon-delimited fields:
41
+ # (1) glyph name--upper/lowercase letters and digits
42
+ # (2) Unicode scalar value--four uppercase hexadecimal digits
43
+ #
44
+ a100;275E
45
+ a101;2761
46
+ a102;2762
47
+ a103;2763
48
+ a104;2764
49
+ a105;2710
50
+ a106;2765
51
+ a107;2766
52
+ a108;2767
53
+ a109;2660
54
+ a10;2721
55
+ a110;2665
56
+ a111;2666
57
+ a112;2663
58
+ a117;2709
59
+ a118;2708
60
+ a119;2707
61
+ a11;261B
62
+ a120;2460
63
+ a121;2461
64
+ a122;2462
65
+ a123;2463
66
+ a124;2464
67
+ a125;2465
68
+ a126;2466
69
+ a127;2467
70
+ a128;2468
71
+ a129;2469
72
+ a12;261E
73
+ a130;2776
74
+ a131;2777
75
+ a132;2778
76
+ a133;2779
77
+ a134;277A
78
+ a135;277B
79
+ a136;277C
80
+ a137;277D
81
+ a138;277E
82
+ a139;277F
83
+ a13;270C
84
+ a140;2780
85
+ a141;2781
86
+ a142;2782
87
+ a143;2783
88
+ a144;2784
89
+ a145;2785
90
+ a146;2786
91
+ a147;2787
92
+ a148;2788
93
+ a149;2789
94
+ a14;270D
95
+ a150;278A
96
+ a151;278B
97
+ a152;278C
98
+ a153;278D
99
+ a154;278E
100
+ a155;278F
101
+ a156;2790
102
+ a157;2791
103
+ a158;2792
104
+ a159;2793
105
+ a15;270E
106
+ a160;2794
107
+ a161;2192
108
+ a162;27A3
109
+ a163;2194
110
+ a164;2195
111
+ a165;2799
112
+ a166;279B
113
+ a167;279C
114
+ a168;279D
115
+ a169;279E
116
+ a16;270F
117
+ a170;279F
118
+ a171;27A0
119
+ a172;27A1
120
+ a173;27A2
121
+ a174;27A4
122
+ a175;27A5
123
+ a176;27A6
124
+ a177;27A7
125
+ a178;27A8
126
+ a179;27A9
127
+ a17;2711
128
+ a180;27AB
129
+ a181;27AD
130
+ a182;27AF
131
+ a183;27B2
132
+ a184;27B3
133
+ a185;27B5
134
+ a186;27B8
135
+ a187;27BA
136
+ a188;27BB
137
+ a189;27BC
138
+ a18;2712
139
+ a190;27BD
140
+ a191;27BE
141
+ a192;279A
142
+ a193;27AA
143
+ a194;27B6
144
+ a195;27B9
145
+ a196;2798
146
+ a197;27B4
147
+ a198;27B7
148
+ a199;27AC
149
+ a19;2713
150
+ a1;2701
151
+ a200;27AE
152
+ a201;27B1
153
+ a202;2703
154
+ a203;2750
155
+ a204;2752
156
+ a205;276E
157
+ a206;2770
158
+ a20;2714
159
+ a21;2715
160
+ a22;2716
161
+ a23;2717
162
+ a24;2718
163
+ a25;2719
164
+ a26;271A
165
+ a27;271B
166
+ a28;271C
167
+ a29;2722
168
+ a2;2702
169
+ a30;2723
170
+ a31;2724
171
+ a32;2725
172
+ a33;2726
173
+ a34;2727
174
+ a35;2605
175
+ a36;2729
176
+ a37;272A
177
+ a38;272B
178
+ a39;272C
179
+ a3;2704
180
+ a40;272D
181
+ a41;272E
182
+ a42;272F
183
+ a43;2730
184
+ a44;2731
185
+ a45;2732
186
+ a46;2733
187
+ a47;2734
188
+ a48;2735
189
+ a49;2736
190
+ a4;260E
191
+ a50;2737
192
+ a51;2738
193
+ a52;2739
194
+ a53;273A
195
+ a54;273B
196
+ a55;273C
197
+ a56;273D
198
+ a57;273E
199
+ a58;273F
200
+ a59;2740
201
+ a5;2706
202
+ a60;2741
203
+ a61;2742
204
+ a62;2743
205
+ a63;2744
206
+ a64;2745
207
+ a65;2746
208
+ a66;2747
209
+ a67;2748
210
+ a68;2749
211
+ a69;274A
212
+ a6;271D
213
+ a70;274B
214
+ a71;25CF
215
+ a72;274D
216
+ a73;25A0
217
+ a74;274F
218
+ a75;2751
219
+ a76;25B2
220
+ a77;25BC
221
+ a78;25C6
222
+ a79;2756
223
+ a7;271E
224
+ a81;25D7
225
+ a82;2758
226
+ a83;2759
227
+ a84;275A
228
+ a85;276F
229
+ a86;2771
230
+ a87;2772
231
+ a88;2773
232
+ a89;2768
233
+ a8;271F
234
+ a90;2769
235
+ a91;276C
236
+ a92;276D
237
+ a93;276A
238
+ a94;276B
239
+ a95;2774
240
+ a96;2775
241
+ a97;275B
242
+ a98;275C
243
+ a99;275D
244
+ a9;2720
245
+ # END
@@ -331,7 +331,9 @@ class PDF::Reader
331
331
  def decrypt(ref, obj)
332
332
  case obj
333
333
  when PDF::Reader::Stream then
334
- obj.data = sec_handler.decrypt(obj.data, ref)
334
+ # PDF 32000-1:2008 7.5.8.2: "The cross-reference stream shall not be encrypted [...]."
335
+ # Therefore we shouldn't try to decrypt it.
336
+ obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
335
337
  obj
336
338
  when Hash then
337
339
  arr = obj.map { |key,val| [key, decrypt(ref, val)] }.flatten(1)
@@ -22,8 +22,8 @@ class PDF::Reader
22
22
  def detect_orientation
23
23
  llx,lly,urx,ury = @attributes[:MediaBox]
24
24
  rotation = @attributes[:Rotate].to_i
25
- width = urx.to_i - llx.to_i
26
- height = ury.to_i - lly.to_i
25
+ width = (urx.to_i - llx.to_i).abs
26
+ height = (ury.to_i - lly.to_i).abs
27
27
  if width > height
28
28
  (rotation % 180).zero? ? 'landscape' : 'portrait'
29
29
  else
@@ -124,6 +124,34 @@ module PDF
124
124
  }.join(" ")
125
125
  end
126
126
 
127
+ # returns the angle to rotate the page clockwise. Always 0, 90, 180 or 270
128
+ #
129
+ def rotate
130
+ value = attributes[:Rotate].to_i
131
+ case value
132
+ when 0, 90, 180, 270
133
+ value
134
+ else
135
+ 0
136
+ end
137
+ end
138
+
139
+ # returns the "boxes" that define the page object.
140
+ # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
141
+ #
142
+ def boxes
143
+ mediabox = attributes[:MediaBox]
144
+ cropbox = attributes[:Cropbox] || mediabox
145
+
146
+ {
147
+ MediaBox: objects.deref!(mediabox),
148
+ CropBox: objects.deref!(cropbox),
149
+ BleedBox: objects.deref!(attributes[:BleedBox] || cropbox),
150
+ TrimBox: objects.deref!(attributes[:TrimBox] || cropbox),
151
+ ArtBox: objects.deref!(attributes[:ArtBox] || cropbox)
152
+ }
153
+ end
154
+
127
155
  private
128
156
 
129
157
  def root
@@ -2,6 +2,7 @@
2
2
  # frozen_string_literal: true
3
3
 
4
4
  require 'pdf/reader/overlapping_runs_filter'
5
+ require 'pdf/reader/zero_width_runs_filter'
5
6
 
6
7
  class PDF::Reader
7
8
 
@@ -17,22 +18,27 @@ class PDF::Reader
17
18
  def initialize(runs, mediabox)
18
19
  raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
19
20
 
20
- @runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
21
+ runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
22
+ runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
23
+ @runs = merge_runs(runs)
21
24
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
22
25
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
23
- @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
24
- @page_width = mediabox[2] - mediabox[0]
25
- @page_height = mediabox[3] - mediabox[1]
26
- @x_offset = @runs.map(&:x).sort.first
26
+ @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
27
+ @page_width = (mediabox[2] - mediabox[0]).abs
28
+ @page_height = (mediabox[3] - mediabox[1]).abs
29
+ @x_offset = @runs.map(&:x).sort.first || 0
30
+ lowest_y = @runs.map(&:y).sort.first || 0
31
+ @y_offset = lowest_y > 0 ? 0 : lowest_y
27
32
  end
28
33
 
29
34
  def to_s
30
35
  return "" if @runs.empty?
36
+ return "" if row_count == 0
31
37
 
32
38
  page = row_count.times.map { |i| " " * col_count }
33
39
  @runs.each do |run|
34
40
  x_pos = ((run.x - @x_offset) / col_multiplier).round
35
- y_pos = row_count - (run.y / row_multiplier).round
41
+ y_pos = row_count - ((run.y - @y_offset) / row_multiplier).round
36
42
  if y_pos <= row_count && y_pos >= 0 && x_pos <= col_count && x_pos >= 0
37
43
  local_string_insert(page[y_pos-1], run.text, x_pos)
38
44
  end
@@ -64,7 +70,7 @@ class PDF::Reader
64
70
  end
65
71
 
66
72
  def col_count
67
- @col_count ||= ((@page_width / @mean_glyph_width) * 1.05).floor
73
+ @col_count ||= ((@page_width / @median_glyph_width) * 1.05).floor
68
74
  end
69
75
 
70
76
  def row_multiplier
@@ -83,12 +89,12 @@ class PDF::Reader
83
89
  end
84
90
  end
85
91
 
86
- def each_line(&block)
87
- @runs.sort.group_by { |run|
88
- run.y.to_i
89
- }.map { |y, collection|
90
- yield y, collection
91
- }
92
+ def median(collection)
93
+ if collection.size == 0
94
+ 0
95
+ else
96
+ collection.sort[(collection.size * 0.5).floor]
97
+ end
92
98
  end
93
99
 
94
100
  # take a collection of TextRun objects and merge any that are in close
@@ -30,7 +30,7 @@ class PDF::Reader
30
30
  @xobject_stack = [page.xobjects]
31
31
  @cs_stack = [page.color_spaces]
32
32
  @stack = [DEFAULT_GRAPHICS_STATE.dup]
33
- state[:ctm] = identity_matrix
33
+ state[:ctm] = identity_matrix
34
34
  end
35
35
 
36
36
  #####################################################
@@ -322,11 +322,13 @@ class PDF::Reader
322
322
  th = state[:h_scaling]
323
323
  # optimise the common path to reduce Float allocations
324
324
  if th == 1 && tj == 0 && tc == 0 && tw == 0
325
- glyph_width = w0 * fs
326
- tx = glyph_width
325
+ tx = w0 * fs
326
+ elsif tj != 0
327
+ # don't apply spacing to TJ displacement
328
+ tx = (w0 - (tj/1000.0)) * fs * th
327
329
  else
328
- glyph_width = ((w0 - (tj/1000.0)) * fs) * th
329
- tx = glyph_width + ((tc + tw) * th)
330
+ # apply horizontal scaling to spacing values but not font size
331
+ tx = ((w0 * fs) + tc + tw) * th
330
332
  end
331
333
 
332
334
  # TODO: I'm pretty sure that tx shouldn't need to be divided by
@@ -41,13 +41,17 @@ module PDF
41
41
  # starting a new page
42
42
  def page=(page)
43
43
  @state = PageState.new(page)
44
+ @page = page
44
45
  @content = []
45
46
  @characters = []
46
47
  @mediabox = page.objects.deref(page.attributes[:MediaBox])
48
+ device_bl = apply_rotation(*@state.ctm_transform(@mediabox[0], @mediabox[1]))
49
+ device_tr = apply_rotation(*@state.ctm_transform(@mediabox[2], @mediabox[3]))
50
+ @device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
47
51
  end
48
52
 
49
53
  def content
50
- PageLayout.new(@characters, @mediabox).to_s
54
+ PageLayout.new(@characters, @device_mediabox).to_s
51
55
  end
52
56
 
53
57
  #####################################################
@@ -101,6 +105,8 @@ module PDF
101
105
  glyphs.each_with_index do |glyph_code, index|
102
106
  # paint the current glyph
103
107
  newx, newy = @state.trm_transform(0,0)
108
+ newx, newy = apply_rotation(newx, newy)
109
+
104
110
  utf8_chars = @state.current_font.to_utf8(glyph_code)
105
111
 
106
112
  # apply to glyph displacement for the current glyph so the next
@@ -115,6 +121,21 @@ module PDF
115
121
  end
116
122
  end
117
123
 
124
+ def apply_rotation(x, y)
125
+ if @page.rotate == 90
126
+ tmp = x
127
+ x = y
128
+ y = tmp * -1
129
+ elsif @page.rotate == 180
130
+ y *= -1
131
+ elsif @page.rotate == 270
132
+ tmp = x
133
+ x = y * -1
134
+ y = tmp * -1
135
+ end
136
+ return x, y
137
+ end
138
+
118
139
  end
119
140
  end
120
141
  end
@@ -175,15 +175,18 @@ class PDF::Reader
175
175
  return "".dup.force_encoding("binary") if str == ")"
176
176
  Error.assert_equal(parse_token, ")")
177
177
 
178
- str.gsub!(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
179
- MAPPING[match] || "".dup
178
+ str.gsub!(/\\(\r\n|[nrtbf()\\\n\r]|([0-7]{1,3}))?|\r\n?/m) do |match|
179
+ if $2.nil? # not octal digits
180
+ MAPPING[match] || "".dup
181
+ else # must be octal digits
182
+ ($2.oct & 0xff).chr # ignore high level overflow
183
+ end
180
184
  end
181
185
  str.force_encoding("binary")
182
186
  end
183
187
 
184
188
  MAPPING = {
185
189
  "\r" => "\n",
186
- "\n\r" => "\n",
187
190
  "\r\n" => "\n",
188
191
  "\\n" => "\n",
189
192
  "\\r" => "\r",
@@ -194,10 +197,9 @@ class PDF::Reader
194
197
  "\\)" => ")",
195
198
  "\\\\" => "\\",
196
199
  "\\\n" => "",
200
+ "\\\r" => "",
201
+ "\\\r\n" => "",
197
202
  }
198
- 0.upto(9) { |n| MAPPING["\\00"+n.to_s] = ("00"+n.to_s).oct.chr }
199
- 0.upto(99) { |n| MAPPING["\\0"+n.to_s] = ("0"+n.to_s).oct.chr }
200
- 0.upto(377) { |n| MAPPING["\\"+n.to_s] = n.to_s.oct.chr }
201
203
 
202
204
  ################################################################################
203
205
  # Decodes the contents of a PDF Stream and returns it as a Ruby String.
@@ -37,23 +37,15 @@ class PDF::Reader
37
37
  def glyph_width(code_point)
38
38
  return 0 if code_point.nil? || code_point < 0
39
39
 
40
- m = @metrics.char_metrics_by_code[code_point]
41
- if m.nil?
42
- names = @font.encoding.int_to_name(code_point)
40
+ names = @font.encoding.int_to_name(code_point)
41
+ metrics = names.map { |name|
42
+ @metrics.char_metrics[name.to_s]
43
+ }.compact.first
43
44
 
44
- m = names.map { |name|
45
- @metrics.char_metrics[name.to_s]
46
- }.compact.first
47
- end
48
-
49
- if m
50
- m[:wx]
51
- elsif @font.widths[code_point - 1]
52
- @font.widths[code_point - 1]
53
- elsif control_character?(code_point)
54
- 0
45
+ if metrics
46
+ metrics[:wx]
55
47
  else
56
- 0
48
+ @font.widths[code_point - 1] || 0
57
49
  end
58
50
  end
59
51
 
@@ -131,6 +131,9 @@ class PDF::Reader
131
131
  generation = buf.token.to_i
132
132
  state = buf.token
133
133
 
134
+ # Some PDF writers start numbering at 1 instead of 0. Fix up the number.
135
+ # TODO should this fix be logged?
136
+ objid = 0 if objid == 1 and offset == 0 and generation == 65535 and state == 'f'
134
137
  store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
135
138
  objid += 1
136
139
  params.clear
@@ -146,7 +149,9 @@ class PDF::Reader
146
149
  end
147
150
 
148
151
  load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
149
- load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
152
+ # Some PDF creators seem to use '/Prev 0' in trailer if there is no previous xref
153
+ # It's not possible for an xref to appear at offset 0, so we can safely skip the ref
154
+ load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev) and trailer[:Prev].to_i != 0
150
155
 
151
156
  trailer
152
157
  end
@@ -0,0 +1,11 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+ # There's no point rendering zero-width characters
5
+ class ZeroWidthRunsFilter
6
+
7
+ def self.exclude_zero_width_runs(runs)
8
+ runs.reject { |run| run.width == 0 }
9
+ end
10
+ end
11
+ end