pdf-reader 2.5.0 → 2.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 652d05cf6a22fad5ecb4b92de1e27ba60cafc6525c5ca524e24c7f9796fe1b83
4
- data.tar.gz: 2c7448e97890a9fcbd10ec2cd5bafb9025db2fb75dabaf71a4074c542b1065a1
3
+ metadata.gz: ccc4d14f5820ca798f6eafa1c0978207759ec1668c6f6307acb7cd43bcd0626e
4
+ data.tar.gz: 466bfe0a91f57463a56d9697ccd2529f981c6917e4ed578b4103f2bc87065522
5
5
  SHA512:
6
- metadata.gz: ac82452924cf46af98ee15f2a20642b1d06d5b9c22104fe171b5b4612665e482f341e12473805016ccb9d921fc15324ba51675170b369adeace8b278cd1279fb
7
- data.tar.gz: b1dc1c4422b0e6bf01092cf724630ba7424fdef1fdaf34f33aaa3a31397caf6ef5a73185a98e6e2828a9e082d87cbca311565397cb064cac20d86e72be27626f
6
+ metadata.gz: 45d6c16b3d9ed029e6eb5a45cc64aa95e7ada2950e052053cbe0b6f5aae632f824a86f0505a5cee660abd1cd896177a0637a2f2f5a3f3633e829e8d46fb59817
7
+ data.tar.gz: e3e566344bd5560387577597dea20b2f7da40aed2a7fa8b8d074c0742486db59d7e349f6c38c91c8dcd9b0a8cf2aa4c19a00d0ee097003449504b3f06f18ca3c
data/CHANGELOG CHANGED
@@ -1,3 +1,20 @@
1
+ v2.6.0 (12th November 2021)
2
+ - Text extraction improvements
3
+ - Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
4
+ - Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
5
+ - Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
6
+ - Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
7
+ - Performance improvements
8
+ - Reduced memory pressure when decoding TIFF images (http://github.com/yob/pdf-reader/pull/360)
9
+ - Optional dependency on ascii85_native gem for faster processing of files using the ascii85 filter (http://github.com/yob/pdf-reader/pull/359)
10
+ - Successfully parse more files
11
+ - Gracefully handle some non-spec compliant CR/LF issues (http://github.com/yob/pdf-reader/pull/364)
12
+ - Fix parsing of some escape sequences in content streams (http://github.com/yob/pdf-reader/pull/368)
13
+ - Increase the amount of junk bytes we detect and skip at the end of a file (http://github.com/yob/pdf-reader/pull/382)
14
+ - Ignore "/Prev 0" in trailers (http://github.com/yob/pdf-reader/pull/383)
15
+ - Fix parsing of some inline images (BI ID EI tokens) (http://github.com/yob/pdf-reader/pull/389)
16
+ - Gracefully handle some xref tables that incorrectly start with 1 (http://github.com/yob/pdf-reader/pull/384)
17
+
1
18
  v2.5.0 (6th June 2021)
2
19
  - bump minimum ruby version to 2.0
3
20
  - Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
data/README.md CHANGED
@@ -166,6 +166,19 @@ http://groups.google.com/group/pdf-reader
166
166
  The easiest way to explain how this works in practice is to show some examples.
167
167
  Check out the examples/ directory for a few files.
168
168
 
169
+ # Alternate Decoder
170
+
171
+ For PDF files containing Ascii85 streams, the [ascii85_native](https://github.com/AnomalousBit/ascii85_native) gem can be used for increased performance. If the ascii85_native gem is detected, pdf-reader will automatically use the gem.
172
+
173
+ First, run `gem install ascii85_native` and then require the gem alongside pdf-reader:
174
+
175
+ ```ruby
176
+ require "pdf-reader"
177
+ require "ascii85_native"
178
+ ```
179
+
180
+ Another way of enabling native Ascii85 decoding is to place `gem 'ascii85_native'` in your project's `Gemfile`.
181
+
169
182
  # Known Limitations
170
183
 
171
184
  Occasionally some text cannot be extracted properly due to the way it has been
@@ -176,7 +189,9 @@ little UTF-8 friendly box to indicate an unrecognisable character.
176
189
 
177
190
  * PDF::Reader Code Repository: http://github.com/yob/pdf-reader
178
191
 
179
- * PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
192
+ * PDF Specification: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
193
+
194
+ * Adobe PDF Developer Resources: http://www.adobe.com/devnet/pdf/pdf_reference.html
180
195
 
181
196
  * PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
182
197
 
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
14
14
  Cane::RakeTask.new(:quality) do |cane|
15
15
  cane.abc_max = 20
16
16
  cane.style_measure = 100
17
- cane.max_violations = 31
17
+ cane.max_violations = 32
18
18
 
19
19
  cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
20
20
  end
@@ -17,8 +17,8 @@ module ExtractFonts
17
17
  return count if page.fonts.nil? || page.fonts.empty?
18
18
 
19
19
  page.fonts.each do |label, font|
20
- next if complete_refs[font]
21
- complete_refs[font] = true
20
+ next if complete_refs[label]
21
+ complete_refs[label] = true
22
22
 
23
23
  process_font(page, font)
24
24
 
@@ -39,7 +39,7 @@ module ExtractFonts
39
39
  when :TrueType, :CIDFontType2 then
40
40
  ExtractFonts::TTF.new(page.objects, font).save("#{font[:BaseFont]}.ttf")
41
41
  else
42
- $stderr.puts "unsupported font type #{font[:Subtype]}"
42
+ $stderr.puts "unsupported font type #{font[:Subtype]} for #{font[:BaseFont]}"
43
43
  end
44
44
  end
45
45
 
@@ -68,10 +68,15 @@ module ExtractFonts
68
68
  end
69
69
  end
70
70
 
71
- filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-unicode.pdf"
71
+ if ARGV.size == 0 # default file name
72
+ ARGV << File.expand_path(File.join(File.dirname(__dir__), "spec", "data", "cairo-unicode.pdf"))
73
+ end
74
+
72
75
  extractor = ExtractFonts::Extractor.new
73
76
 
74
- PDF::Reader.open(filename) do |reader|
75
- page = reader.page(1)
76
- extractor.page(page)
77
+ ARGV.each do |arg|
78
+ PDF::Reader.open(arg) do |reader|
79
+ page = reader.page(1)
80
+ extractor.page(page)
81
+ end
77
82
  end
@@ -48,6 +48,15 @@ class PDF::Reader
48
48
  ID = "ID"
49
49
  FWD_SLASH = "/"
50
50
  NULL_BYTE = "\x00"
51
+ CR = "\r"
52
+ LF = "\n"
53
+ CRLF = "\r\n"
54
+ WHITE_SPACE = [LF, CR, ' ']
55
+
56
+ # Quite a few PDFs have trailing junk.
57
+ # This can be several kilobytes of NUL bytes in some cases
58
+ # Allow for this here
59
+ TRAILING_BYTECOUNT = 5000
51
60
 
52
61
  attr_reader :pos
53
62
 
@@ -86,9 +95,12 @@ class PDF::Reader
86
95
  #
87
96
  # options:
88
97
  #
89
- # :skip_eol - if true, the IO stream is advanced past a CRLF or LF that
90
- # is sitting under the io cursor.
91
- #
98
+ # :skip_eol - if true, the IO stream is advanced past a CRLF, CR or LF
99
+ # that is sitting under the io cursor.
100
+ # Note:
101
+ # Skipping a bare CR is not spec-compliant.
102
+ # This is because the data may start with LF.
103
+ # However we check for CRLF first, so the ambiguity is avoided.
92
104
  def read(bytes, opts = {})
93
105
  reset_pos
94
106
 
@@ -97,9 +109,9 @@ class PDF::Reader
97
109
  str = @io.read(2)
98
110
  if str.nil?
99
111
  return nil
100
- elsif str == "\r\n"
112
+ elsif str == CRLF # This MUST be done before checking for CR alone
101
113
  # do nothing
102
- elsif str[0,1] == "\n"
114
+ elsif str[0, 1] == LF || str[0, 1] == CR # LF or CR alone
103
115
  @io.seek(-1, IO::SEEK_CUR)
104
116
  else
105
117
  @io.seek(-2, IO::SEEK_CUR)
@@ -127,8 +139,8 @@ class PDF::Reader
127
139
  #
128
140
  def find_first_xref_offset
129
141
  check_size_is_non_zero
130
- @io.seek(-1024, IO::SEEK_END) rescue @io.seek(0)
131
- data = @io.read(1024)
142
+ @io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
143
+ data = @io.read(TRAILING_BYTECOUNT)
132
144
 
133
145
  # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
134
146
  lines = data.split(/[\n\r]+/).reverse
@@ -217,7 +229,9 @@ class PDF::Reader
217
229
  return if @tokens.size < 3
218
230
  return if @tokens[2] != "R"
219
231
 
220
- if @tokens[0].match(/\d+/) && @tokens[1].match(/\d+/)
232
+ # must match whole tokens
233
+ digits_only = %r{\A\d+\z}
234
+ if @tokens[0].match(digits_only) && @tokens[1].match(digits_only)
221
235
  @tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
222
236
  @tokens[1] = nil
223
237
  @tokens[2] = nil
@@ -225,24 +239,51 @@ class PDF::Reader
225
239
  end
226
240
  end
227
241
 
242
+ # Extract data between ID and EI
243
+ # If the EI follows white-space the space is dropped from the data
244
+ # The EI must be followed by white-space or end of buffer
245
+ # This is to reduce the chance of accidentally matching an embedded EI
228
246
  def prepare_inline_token
229
- str = "".dup
230
-
231
- buffer = []
232
-
233
- until buffer[0] =~ /\s|\0/ && buffer[1, 2] == ["E", "I"]
247
+ idstart = @io.pos
248
+ chr = prevchr = nil
249
+ eisize = 0 # how many chars in the end marker
250
+ seeking = 'E' # what are we looking for now?
251
+ loop do
234
252
  chr = @io.read(1)
235
- buffer << chr
236
-
237
- if buffer.length > 3
238
- str << buffer.shift
253
+ break if chr.nil?
254
+ case seeking
255
+ when 'E'
256
+ if chr == 'E'
257
+ seeking = 'I'
258
+ if WHITE_SPACE.include? prevchr
259
+ eisize = 3 # include whitespace in delimiter, i.e. drop from data
260
+ else # assume the EI immediately follows the data
261
+ eisize = 2 # leave prevchr in data
262
+ end
263
+ end
264
+ when 'I'
265
+ if chr == 'I'
266
+ seeking = :END
267
+ else
268
+ seeking = 'E'
269
+ end
270
+ when :END
271
+ if WHITE_SPACE.include? chr
272
+ eisize += 1 # Drop trailer
273
+ break
274
+ else
275
+ seeking = 'E'
276
+ end
239
277
  end
278
+ prevchr = chr
240
279
  end
241
-
242
- str << NULL_BYTE if buffer.first == NULL_BYTE
243
-
280
+ unless seeking == :END
281
+ raise MalformedPDFError, "EI terminator not found"
282
+ end
283
+ eiend = @io.pos
284
+ @io.seek(idstart, IO::SEEK_SET)
285
+ str = @io.read(eiend - eisize - idstart) # get the ID content
244
286
  @tokens << string_token(str)
245
- @io.seek(-3, IO::SEEK_CUR) unless chr.nil?
246
287
  end
247
288
 
248
289
  # if we're currently inside a hex string, read hex nibbles until
@@ -208,7 +208,7 @@ class PDF::Reader
208
208
  def load_mapping(file)
209
209
  File.open(file, "r:BINARY") do |f|
210
210
  f.each do |l|
211
- _m, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
211
+ _m, single_byte, unicode = *l.match(/\A([0-9A-Za-z]+);([0-9A-F]{4})/)
212
212
  @mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
213
213
  end
214
214
  end
@@ -33,17 +33,17 @@ class PDF::Reader
33
33
  def self.str_assert(lvalue, rvalue, chars=nil)
34
34
  raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
35
35
  lvalue = lvalue[0,chars] if chars
36
- raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue != rvalue
36
+ raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
37
37
  end
38
38
  ################################################################################
39
39
  def self.str_assert_not(lvalue, rvalue, chars=nil)
40
40
  raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
41
41
  lvalue = lvalue[0,chars] if chars
42
- raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue == rvalue
42
+ raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue == rvalue
43
43
  end
44
44
  ################################################################################
45
45
  def self.assert_equal(lvalue, rvalue)
46
- raise MalformedPDFError, "PDF malformed, expected #{rvalue} but found #{lvalue} instead" if lvalue != rvalue
46
+ raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
47
47
  end
48
48
  ################################################################################
49
49
  end
@@ -17,7 +17,11 @@ class PDF::Reader
17
17
  #
18
18
  def filter(data)
19
19
  data = "<~#{data}" unless data.to_s[0,2] == "<~"
20
- ::Ascii85::decode(data)
20
+ if defined?(::Ascii85Native)
21
+ ::Ascii85Native::decode(data)
22
+ else
23
+ ::Ascii85::decode(data)
24
+ end
21
25
  rescue Exception => e
22
26
  # Oops, there was a problem decoding the stream
23
27
  raise MalformedPDFError,
@@ -34,7 +34,7 @@ class PDF::Reader
34
34
  ################################################################################
35
35
  def tiff_depredict(data)
36
36
  data = data.unpack("C*")
37
- unfiltered = []
37
+ unfiltered = ''
38
38
  bpc = @options[:BitsPerComponent] || 8
39
39
  pixel_bits = bpc * @options[:Colors]
40
40
  pixel_bytes = pixel_bits / 8
@@ -51,11 +51,11 @@ class PDF::Reader
51
51
  left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
52
52
  row_data[index] = (byte + left) % 256
53
53
  end
54
- unfiltered += row_data
54
+ unfiltered += row_data.pack("C*")
55
55
  pos += line_len
56
56
  end
57
57
 
58
- unfiltered.pack("C*")
58
+ unfiltered
59
59
  end
60
60
  ################################################################################
61
61
  def png_depredict(data)
@@ -103,19 +103,25 @@ class PDF::Reader
103
103
 
104
104
  # returns a hash that maps glyph names to unicode codepoints. The mapping is based on
105
105
  # a text file supplied by Adobe at:
106
- # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
106
+ # https://github.com/adobe-type-tools/agl-aglfn
107
107
  def load_adobe_glyph_mapping
108
108
  keyed_by_name = {}
109
109
  keyed_by_codepoint = {}
110
110
 
111
- File.open(File.dirname(__FILE__) + "/glyphlist.txt", "r:BINARY") do |f|
112
- f.each do |l|
113
- _m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
114
- if name && code
115
- cp = "0x#{code}".hex
116
- keyed_by_name[name.to_sym] = cp
117
- keyed_by_codepoint[cp] ||= []
118
- keyed_by_codepoint[cp] << name.to_sym
111
+ paths = [
112
+ File.dirname(__FILE__) + "/glyphlist.txt",
113
+ File.dirname(__FILE__) + "/glyphlist-zapfdingbats.txt",
114
+ ]
115
+ paths.each do |path|
116
+ File.open(path, "r:BINARY") do |f|
117
+ f.each do |l|
118
+ _m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
119
+ if name && code
120
+ cp = "0x#{code}".hex
121
+ keyed_by_name[name.to_sym] = cp
122
+ keyed_by_codepoint[cp] ||= []
123
+ keyed_by_codepoint[cp] << name.to_sym
124
+ end
119
125
  end
120
126
  end
121
127
  end
@@ -0,0 +1,245 @@
1
+ # -----------------------------------------------------------
2
+ # Copyright 2002-2019 Adobe (http://www.adobe.com/).
3
+ #
4
+ # Redistribution and use in source and binary forms, with or
5
+ # without modification, are permitted provided that the
6
+ # following conditions are met:
7
+ #
8
+ # Redistributions of source code must retain the above
9
+ # copyright notice, this list of conditions and the following
10
+ # disclaimer.
11
+ #
12
+ # Redistributions in binary form must reproduce the above
13
+ # copyright notice, this list of conditions and the following
14
+ # disclaimer in the documentation and/or other materials
15
+ # provided with the distribution.
16
+ #
17
+ # Neither the name of Adobe nor the names of its contributors
18
+ # may be used to endorse or promote products derived from this
19
+ # software without specific prior written permission.
20
+ #
21
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
22
+ # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
23
+ # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
24
+ # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
26
+ # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27
+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
28
+ # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29
+ # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30
+ # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
31
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
32
+ # OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34
+ # -----------------------------------------------------------
35
+ # Name: ITC Zapf Dingbats Glyph List
36
+ # Table version: 2.0
37
+ # Date: September 20, 2002
38
+ # URL: https://github.com/adobe-type-tools/agl-aglfn
39
+ #
40
+ # Format: two semicolon-delimited fields:
41
+ # (1) glyph name--upper/lowercase letters and digits
42
+ # (2) Unicode scalar value--four uppercase hexadecimal digits
43
+ #
44
+ a100;275E
45
+ a101;2761
46
+ a102;2762
47
+ a103;2763
48
+ a104;2764
49
+ a105;2710
50
+ a106;2765
51
+ a107;2766
52
+ a108;2767
53
+ a109;2660
54
+ a10;2721
55
+ a110;2665
56
+ a111;2666
57
+ a112;2663
58
+ a117;2709
59
+ a118;2708
60
+ a119;2707
61
+ a11;261B
62
+ a120;2460
63
+ a121;2461
64
+ a122;2462
65
+ a123;2463
66
+ a124;2464
67
+ a125;2465
68
+ a126;2466
69
+ a127;2467
70
+ a128;2468
71
+ a129;2469
72
+ a12;261E
73
+ a130;2776
74
+ a131;2777
75
+ a132;2778
76
+ a133;2779
77
+ a134;277A
78
+ a135;277B
79
+ a136;277C
80
+ a137;277D
81
+ a138;277E
82
+ a139;277F
83
+ a13;270C
84
+ a140;2780
85
+ a141;2781
86
+ a142;2782
87
+ a143;2783
88
+ a144;2784
89
+ a145;2785
90
+ a146;2786
91
+ a147;2787
92
+ a148;2788
93
+ a149;2789
94
+ a14;270D
95
+ a150;278A
96
+ a151;278B
97
+ a152;278C
98
+ a153;278D
99
+ a154;278E
100
+ a155;278F
101
+ a156;2790
102
+ a157;2791
103
+ a158;2792
104
+ a159;2793
105
+ a15;270E
106
+ a160;2794
107
+ a161;2192
108
+ a162;27A3
109
+ a163;2194
110
+ a164;2195
111
+ a165;2799
112
+ a166;279B
113
+ a167;279C
114
+ a168;279D
115
+ a169;279E
116
+ a16;270F
117
+ a170;279F
118
+ a171;27A0
119
+ a172;27A1
120
+ a173;27A2
121
+ a174;27A4
122
+ a175;27A5
123
+ a176;27A6
124
+ a177;27A7
125
+ a178;27A8
126
+ a179;27A9
127
+ a17;2711
128
+ a180;27AB
129
+ a181;27AD
130
+ a182;27AF
131
+ a183;27B2
132
+ a184;27B3
133
+ a185;27B5
134
+ a186;27B8
135
+ a187;27BA
136
+ a188;27BB
137
+ a189;27BC
138
+ a18;2712
139
+ a190;27BD
140
+ a191;27BE
141
+ a192;279A
142
+ a193;27AA
143
+ a194;27B6
144
+ a195;27B9
145
+ a196;2798
146
+ a197;27B4
147
+ a198;27B7
148
+ a199;27AC
149
+ a19;2713
150
+ a1;2701
151
+ a200;27AE
152
+ a201;27B1
153
+ a202;2703
154
+ a203;2750
155
+ a204;2752
156
+ a205;276E
157
+ a206;2770
158
+ a20;2714
159
+ a21;2715
160
+ a22;2716
161
+ a23;2717
162
+ a24;2718
163
+ a25;2719
164
+ a26;271A
165
+ a27;271B
166
+ a28;271C
167
+ a29;2722
168
+ a2;2702
169
+ a30;2723
170
+ a31;2724
171
+ a32;2725
172
+ a33;2726
173
+ a34;2727
174
+ a35;2605
175
+ a36;2729
176
+ a37;272A
177
+ a38;272B
178
+ a39;272C
179
+ a3;2704
180
+ a40;272D
181
+ a41;272E
182
+ a42;272F
183
+ a43;2730
184
+ a44;2731
185
+ a45;2732
186
+ a46;2733
187
+ a47;2734
188
+ a48;2735
189
+ a49;2736
190
+ a4;260E
191
+ a50;2737
192
+ a51;2738
193
+ a52;2739
194
+ a53;273A
195
+ a54;273B
196
+ a55;273C
197
+ a56;273D
198
+ a57;273E
199
+ a58;273F
200
+ a59;2740
201
+ a5;2706
202
+ a60;2741
203
+ a61;2742
204
+ a62;2743
205
+ a63;2744
206
+ a64;2745
207
+ a65;2746
208
+ a66;2747
209
+ a67;2748
210
+ a68;2749
211
+ a69;274A
212
+ a6;271D
213
+ a70;274B
214
+ a71;25CF
215
+ a72;274D
216
+ a73;25A0
217
+ a74;274F
218
+ a75;2751
219
+ a76;25B2
220
+ a77;25BC
221
+ a78;25C6
222
+ a79;2756
223
+ a7;271E
224
+ a81;25D7
225
+ a82;2758
226
+ a83;2759
227
+ a84;275A
228
+ a85;276F
229
+ a86;2771
230
+ a87;2772
231
+ a88;2773
232
+ a89;2768
233
+ a8;271F
234
+ a90;2769
235
+ a91;276C
236
+ a92;276D
237
+ a93;276A
238
+ a94;276B
239
+ a95;2774
240
+ a96;2775
241
+ a97;275B
242
+ a98;275C
243
+ a99;275D
244
+ a9;2720
245
+ # END
@@ -2,6 +2,7 @@
2
2
  # frozen_string_literal: true
3
3
 
4
4
  require 'pdf/reader/overlapping_runs_filter'
5
+ require 'pdf/reader/zero_width_runs_filter'
5
6
 
6
7
  class PDF::Reader
7
8
 
@@ -17,10 +18,12 @@ class PDF::Reader
17
18
  def initialize(runs, mediabox)
18
19
  raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
19
20
 
20
- @runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
21
+ runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
22
+ runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
23
+ @runs = merge_runs(runs)
21
24
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
22
25
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
23
- @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
26
+ @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
24
27
  @page_width = (mediabox[2] - mediabox[0]).abs
25
28
  @page_height = (mediabox[3] - mediabox[1]).abs
26
29
  @x_offset = @runs.map(&:x).sort.first || 0
@@ -67,7 +70,7 @@ class PDF::Reader
67
70
  end
68
71
 
69
72
  def col_count
70
- @col_count ||= ((@page_width / @mean_glyph_width) * 1.05).floor
73
+ @col_count ||= ((@page_width / @median_glyph_width) * 1.05).floor
71
74
  end
72
75
 
73
76
  def row_multiplier
@@ -86,12 +89,12 @@ class PDF::Reader
86
89
  end
87
90
  end
88
91
 
89
- def each_line(&block)
90
- @runs.sort.group_by { |run|
91
- run.y.to_i
92
- }.map { |y, collection|
93
- yield y, collection
94
- }
92
+ def median(collection)
93
+ if collection.size == 0
94
+ 0
95
+ else
96
+ collection.sort[(collection.size * 0.5).floor]
97
+ end
95
98
  end
96
99
 
97
100
  # take a collection of TextRun objects and merge any that are in close
@@ -45,8 +45,8 @@ module PDF
45
45
  @content = []
46
46
  @characters = []
47
47
  @mediabox = page.objects.deref(page.attributes[:MediaBox])
48
- device_bl = @state.ctm_transform(@mediabox[0], @mediabox[1])
49
- device_tr = @state.ctm_transform(@mediabox[2], @mediabox[3])
48
+ device_bl = apply_rotation(*@state.ctm_transform(@mediabox[0], @mediabox[1]))
49
+ device_tr = apply_rotation(*@state.ctm_transform(@mediabox[2], @mediabox[3]))
50
50
  @device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
51
51
  end
52
52
 
@@ -175,15 +175,18 @@ class PDF::Reader
175
175
  return "".dup.force_encoding("binary") if str == ")"
176
176
  Error.assert_equal(parse_token, ")")
177
177
 
178
- str.gsub!(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
179
- MAPPING[match] || "".dup
178
+ str.gsub!(/\\(\r\n|[nrtbf()\\\n\r]|([0-7]{1,3}))?|\r\n?/m) do |match|
179
+ if $2.nil? # not octal digits
180
+ MAPPING[match] || "".dup
181
+ else # must be octal digits
182
+ ($2.oct & 0xff).chr # ignore high level overflow
183
+ end
180
184
  end
181
185
  str.force_encoding("binary")
182
186
  end
183
187
 
184
188
  MAPPING = {
185
189
  "\r" => "\n",
186
- "\n\r" => "\n",
187
190
  "\r\n" => "\n",
188
191
  "\\n" => "\n",
189
192
  "\\r" => "\r",
@@ -194,10 +197,9 @@ class PDF::Reader
194
197
  "\\)" => ")",
195
198
  "\\\\" => "\\",
196
199
  "\\\n" => "",
200
+ "\\\r" => "",
201
+ "\\\r\n" => "",
197
202
  }
198
- 0.upto(9) { |n| MAPPING["\\00"+n.to_s] = ("00"+n.to_s).oct.chr }
199
- 0.upto(99) { |n| MAPPING["\\0"+n.to_s] = ("0"+n.to_s).oct.chr }
200
- 0.upto(377) { |n| MAPPING["\\"+n.to_s] = n.to_s.oct.chr }
201
203
 
202
204
  ################################################################################
203
205
  # Decodes the contents of a PDF Stream and returns it as a Ruby String.
@@ -131,6 +131,9 @@ class PDF::Reader
131
131
  generation = buf.token.to_i
132
132
  state = buf.token
133
133
 
134
+ # Some PDF writers start numbering at 1 instead of 0. Fix up the number.
135
+ # TODO should this fix be logged?
136
+ objid = 0 if objid == 1 and offset == 0 and generation == 65535 and state == 'f'
134
137
  store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
135
138
  objid += 1
136
139
  params.clear
@@ -146,7 +149,9 @@ class PDF::Reader
146
149
  end
147
150
 
148
151
  load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
149
- load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
152
+ # Some PDF creators seem to use '/Prev 0' in trailer if there is no previous xref
153
+ # It's not possible for an xref to appear at offset 0, so can safely skip the ref
154
+ load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev) and trailer[:Prev].to_i != 0
150
155
 
151
156
  trailer
152
157
  end
@@ -0,0 +1,11 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+ # There's no point rendering zero-width characters
5
+ class ZeroWidthRunsFilter
6
+
7
+ def self.exclude_zero_width_runs(runs)
8
+ runs.reject { |run| run.width == 0 }
9
+ end
10
+ end
11
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.5.0
4
+ version: 2.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Healy
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-06-06 00:00:00.000000000 Z
11
+ date: 2021-11-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -239,6 +239,7 @@ files:
239
239
  - lib/pdf/reader/font_descriptor.rb
240
240
  - lib/pdf/reader/form_xobject.rb
241
241
  - lib/pdf/reader/glyph_hash.rb
242
+ - lib/pdf/reader/glyphlist-zapfdingbats.txt
242
243
  - lib/pdf/reader/glyphlist.txt
243
244
  - lib/pdf/reader/lzw.rb
244
245
  - lib/pdf/reader/null_security_handler.rb
@@ -272,15 +273,16 @@ files:
272
273
  - lib/pdf/reader/width_calculator/type_one_or_three.rb
273
274
  - lib/pdf/reader/width_calculator/type_zero.rb
274
275
  - lib/pdf/reader/xref.rb
276
+ - lib/pdf/reader/zero_width_runs_filter.rb
275
277
  homepage: https://github.com/yob/pdf-reader
276
278
  licenses:
277
279
  - MIT
278
280
  metadata:
279
281
  bug_tracker_uri: https://github.com/yob/pdf-reader/issues
280
- changelog_uri: https://github.com/yob/pdf-reader/blob/v2.5.0/CHANGELOG
281
- documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.5.0
282
- source_code_uri: https://github.com/yob/pdf-reader/tree/v2.5.0
283
- post_install_message:
282
+ changelog_uri: https://github.com/yob/pdf-reader/blob/v2.6.0/CHANGELOG
283
+ documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.6.0
284
+ source_code_uri: https://github.com/yob/pdf-reader/tree/v2.6.0
285
+ post_install_message:
284
286
  rdoc_options:
285
287
  - "--title"
286
288
  - PDF::Reader Documentation
@@ -300,8 +302,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
300
302
  - !ruby/object:Gem::Version
301
303
  version: '0'
302
304
  requirements: []
303
- rubygems_version: 3.2.3
304
- signing_key:
305
+ rubygems_version: 3.1.4
306
+ signing_key:
305
307
  specification_version: 4
306
308
  summary: A library for accessing the content of PDF files
307
309
  test_files: []