pdf-reader 2.5.0 → 2.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (71) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +42 -0
  3. data/README.md +16 -1
  4. data/Rakefile +1 -1
  5. data/examples/extract_fonts.rb +12 -7
  6. data/examples/rspec.rb +1 -0
  7. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  8. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  9. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  10. data/lib/pdf/reader/buffer.rb +90 -46
  11. data/lib/pdf/reader/cid_widths.rb +1 -0
  12. data/lib/pdf/reader/cmap.rb +65 -50
  13. data/lib/pdf/reader/encoding.rb +3 -2
  14. data/lib/pdf/reader/error.rb +19 -3
  15. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  16. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  17. data/lib/pdf/reader/filter/depredict.rb +11 -9
  18. data/lib/pdf/reader/filter/flate.rb +4 -2
  19. data/lib/pdf/reader/filter/lzw.rb +2 -0
  20. data/lib/pdf/reader/filter/null.rb +1 -1
  21. data/lib/pdf/reader/filter/run_length.rb +19 -13
  22. data/lib/pdf/reader/filter.rb +2 -1
  23. data/lib/pdf/reader/font.rb +72 -16
  24. data/lib/pdf/reader/font_descriptor.rb +19 -17
  25. data/lib/pdf/reader/form_xobject.rb +15 -5
  26. data/lib/pdf/reader/glyph_hash.rb +16 -9
  27. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  28. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  29. data/lib/pdf/reader/lzw.rb +4 -2
  30. data/lib/pdf/reader/null_security_handler.rb +1 -4
  31. data/lib/pdf/reader/object_cache.rb +1 -0
  32. data/lib/pdf/reader/object_hash.rb +252 -44
  33. data/lib/pdf/reader/object_stream.rb +1 -0
  34. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  35. data/lib/pdf/reader/page.rb +99 -19
  36. data/lib/pdf/reader/page_layout.rb +36 -37
  37. data/lib/pdf/reader/page_state.rb +12 -11
  38. data/lib/pdf/reader/page_text_receiver.rb +57 -10
  39. data/lib/pdf/reader/pages_strategy.rb +1 -0
  40. data/lib/pdf/reader/parser.rb +23 -12
  41. data/lib/pdf/reader/point.rb +25 -0
  42. data/lib/pdf/reader/print_receiver.rb +1 -0
  43. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  44. data/lib/pdf/reader/rectangle.rb +113 -0
  45. data/lib/pdf/reader/reference.rb +1 -0
  46. data/lib/pdf/reader/register_receiver.rb +1 -0
  47. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +16 -9
  48. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  49. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
  50. data/lib/pdf/reader/stream.rb +2 -1
  51. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  52. data/lib/pdf/reader/text_run.rb +14 -6
  53. data/lib/pdf/reader/token.rb +1 -0
  54. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  55. data/lib/pdf/reader/type_check.rb +52 -0
  56. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  57. data/lib/pdf/reader/validating_receiver.rb +262 -0
  58. data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
  59. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  60. data/lib/pdf/reader/width_calculator/true_type.rb +2 -1
  61. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  62. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  63. data/lib/pdf/reader/width_calculator.rb +1 -0
  64. data/lib/pdf/reader/xref.rb +27 -4
  65. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  66. data/lib/pdf/reader.rb +46 -15
  67. data/lib/pdf-reader.rb +1 -0
  68. data/rbi/pdf-reader.rbi +1978 -0
  69. metadata +21 -10
  70. data/lib/pdf/reader/orientation_detector.rb +0 -34
  71. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 652d05cf6a22fad5ecb4b92de1e27ba60cafc6525c5ca524e24c7f9796fe1b83
4
- data.tar.gz: 2c7448e97890a9fcbd10ec2cd5bafb9025db2fb75dabaf71a4074c542b1065a1
3
+ metadata.gz: 2b4616131d0ad73c4ef2c4992ae79d4fde420d6857aba60e8dfac9b088a0b915
4
+ data.tar.gz: f93f481d7f76af426420dbf507a88e8ecead8ec84690781f42de3b7b5ffbd1bd
5
5
  SHA512:
6
- metadata.gz: ac82452924cf46af98ee15f2a20642b1d06d5b9c22104fe171b5b4612665e482f341e12473805016ccb9d921fc15324ba51675170b369adeace8b278cd1279fb
7
- data.tar.gz: b1dc1c4422b0e6bf01092cf724630ba7424fdef1fdaf34f33aaa3a31397caf6ef5a73185a98e6e2828a9e082d87cbca311565397cb064cac20d86e72be27626f
6
+ metadata.gz: 86dbe3450a11e0deb3f5db98625375b252cc25f289d76c98b5de48342d1b4957de81c1c2b6cce53d7d09738e9576bd48213c92166d48911c1f45ad6a77f195a5
7
+ data.tar.gz: ee852ff644a095bae93eb7cc30c6d070c8c6adda4f9bfadecf938bf3ba2723fed08c75a3bf15ba30fcf8fded7ad6a5b74dad8a3f512823798686350f24b912eb
data/CHANGELOG CHANGED
@@ -1,3 +1,45 @@
1
+ v2.9.0 (24th January 2022)
2
+ - Support additional encryption standards (http://github.com/yob/pdf-reader/pull/419)
3
+ - Return CropBox correctly from Page#rectangles (https://github.com/yob/pdf-reader/pull/420)
4
+ - For sorbet users, additional type annotations are included in the gem
5
+
6
+ v2.8.0 (28th December 2021)
7
+ - Add PDF::Reader::Page#runs for extracting text from a page with positioning metadata (http://github.com/yob/pdf-reader/pull/411)
8
+ - Add options to PDF::Reader::Page#text to make some behaviour configurable (http://github.com/yob/pdf-reader/pull/411)
9
+ - including extracting the text for only part of the page
10
+ - Improve text positioning and extraction for Type3 fonts (http://github.com/yob/pdf-reader/pull/412)
11
+ - Skip extracting text that is positioned outside the page (http://github.com/yob/pdf-reader/pull/413)
12
+ - Fix occasional crash when reading some streams (http://github.com/yob/pdf-reader/pull/405)
13
+
14
+ v2.7.0 (13th December 2021)
15
+ - Include RBI type files in the gem
16
+ - Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
17
+ now be type checked by sorbet
18
+ - Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
19
+ - Improved text extraction on some rotated pages, and rotated text on normal pages
20
+ - Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
21
+ - Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
22
+ - Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
23
+ - Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
24
+ - Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
25
+
26
+ v2.6.0 (12th November 2021)
27
+ - Text extraction improvements
28
+ - Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
29
+ - Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
30
+ - Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
31
+ - Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
32
+ - Performance improvements
33
+ - Reduced memory pressure when decoding TIFF images (http://github.com/yob/pdf-reader/pull/360)
34
+ - Optional dependency on ascii85_native gem for faster processing of files using the ascii85 filter (http://github.com/yob/pdf-reader/pull/359)
35
+ - Successfully parse more files
36
+ - Gracefully handle some non-spec compliant CR/LF issues (http://github.com/yob/pdf-reader/pull/364)
37
+ - Fix parsing of some escape sequences in content streams (http://github.com/yob/pdf-reader/pull/368)
38
+ - Increase the amount of junk bytes we detect and skip at the end of a file (http://github.com/yob/pdf-reader/pull/382)
39
+ - Ignore "/Prev 0" in trailers (http://github.com/yob/pdf-reader/pull/383)
40
+ - Fix parsing of some inline images (BI ID EI tokens) (http://github.com/yob/pdf-reader/pull/389)
41
+ - Gracefully handle some xref tables that incorrectly start with 1 (http://github.com/yob/pdf-reader/pull/384)
42
+
1
43
  v2.5.0 (6th June 2021)
2
44
  - bump minimum ruby version to 2.0
3
45
  - Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
data/README.md CHANGED
@@ -166,6 +166,19 @@ http://groups.google.com/group/pdf-reader
166
166
  The easiest way to explain how this works in practice is to show some examples.
167
167
  Check out the examples/ directory for a few files.
168
168
 
169
+ # Alternate Decoder
170
+
171
+ For PDF files containing Ascii85 streams, the [ascii85_native](https://github.com/AnomalousBit/ascii85_native) gem can be used for increased performance. If the ascii85_native gem is detected, pdf-reader will automatically use the gem.
172
+
173
+ First, run `gem install ascii85_native` and then require the gem alongside pdf-reader:
174
+
175
+ ```ruby
176
+ require "pdf-reader"
177
+ require "ascii85_native"
178
+ ```
179
+
180
+ Another way of enabling native Ascii85 decoding is to place `gem 'ascii85_native'` in your project's `Gemfile`.
181
+
169
182
  # Known Limitations
170
183
 
171
184
  Occasionally some text cannot be extracted properly due to the way it has been
@@ -176,7 +189,9 @@ little UTF-8 friendly box to indicate an unrecognisable character.
176
189
 
177
190
  * PDF::Reader Code Repository: http://github.com/yob/pdf-reader
178
191
 
179
- * PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
192
+ * PDF Specification: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
193
+
194
+ * Adobe PDF Developer Resources: http://www.adobe.com/devnet/pdf/pdf_reference.html
180
195
 
181
196
  * PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
182
197
 
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
14
14
  Cane::RakeTask.new(:quality) do |cane|
15
15
  cane.abc_max = 20
16
16
  cane.style_measure = 100
17
- cane.max_violations = 31
17
+ cane.max_violations = 28
18
18
 
19
19
  cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
20
20
  end
@@ -17,8 +17,8 @@ module ExtractFonts
17
17
  return count if page.fonts.nil? || page.fonts.empty?
18
18
 
19
19
  page.fonts.each do |label, font|
20
- next if complete_refs[font]
21
- complete_refs[font] = true
20
+ next if complete_refs[label]
21
+ complete_refs[label] = true
22
22
 
23
23
  process_font(page, font)
24
24
 
@@ -39,7 +39,7 @@ module ExtractFonts
39
39
  when :TrueType, :CIDFontType2 then
40
40
  ExtractFonts::TTF.new(page.objects, font).save("#{font[:BaseFont]}.ttf")
41
41
  else
42
- $stderr.puts "unsupported font type #{font[:Subtype]}"
42
+ $stderr.puts "unsupported font type #{font[:Subtype]} for #{font[:BaseFont]}"
43
43
  end
44
44
  end
45
45
 
@@ -68,10 +68,15 @@ module ExtractFonts
68
68
  end
69
69
  end
70
70
 
71
- filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-unicode.pdf"
71
+ if ARGV.size == 0 # default file name
72
+ ARGV << File.expand_path(File.join(File.dirname(__dir__), "spec", "data", "cairo-unicode.pdf"))
73
+ end
74
+
72
75
  extractor = ExtractFonts::Extractor.new
73
76
 
74
- PDF::Reader.open(filename) do |reader|
75
- page = reader.page(1)
76
- extractor.page(page)
77
+ ARGV.each do |arg|
78
+ PDF::Reader.open(arg) do |reader|
79
+ page = reader.page(1)
80
+ extractor.page(page)
81
+ end
77
82
  end
data/examples/rspec.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  # coding: utf-8
3
+ # typed: ignore
3
4
 
4
5
  # Basic RSpec of a generated PDF
5
6
  #
@@ -0,0 +1,41 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ require 'digest/md5'
6
+
7
+ class PDF::Reader
8
+
9
+ # Decrypts data using the AESV2 algorithm defined in the PDF spec. Requires
10
+ # a decryption key, which is usually generated by PDF::Reader::StandardKeyBuilder
11
+ #
12
+ class AesV2SecurityHandler
13
+
14
+ def initialize(key)
15
+ @encrypt_key = key
16
+ end
17
+
18
+ ##7.6.2 General Encryption Algorithm
19
+ #
20
+ # Algorithm 1: Encryption of data using the AES-128-CBC algorithm
21
+ #
22
+ # version == 4 and CFM == AESV2
23
+ #
24
+ # buf - a string to decrypt
25
+ # ref - a PDF::Reader::Reference for the object to decrypt
26
+ #
27
+ def decrypt( buf, ref )
28
+ objKey = @encrypt_key.dup
29
+ (0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
30
+ (0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
31
+ objKey << 'sAlT' # Algorithm 1, b)
32
+ length = objKey.length < 16 ? objKey.length : 16
33
+ cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
34
+ cipher.decrypt
35
+ cipher.key = Digest::MD5.digest(objKey)[0,length]
36
+ cipher.iv = buf[0..15]
37
+ cipher.update(buf[16..-1]) + cipher.final
38
+ end
39
+
40
+ end
41
+ end
@@ -0,0 +1,38 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ require 'digest'
6
+ require 'openssl'
7
+
8
+ class PDF::Reader
9
+
10
+ # Decrypts data using the AESV3 algorithm defined in the PDF 1.7, Extension Level 3 spec.
11
+ # Requires a decryption key, which is usually generated by PDF::Reader::KeyBuilderV5
12
+ #
13
+ class AesV3SecurityHandler
14
+
15
+ def initialize(key)
16
+ @encrypt_key = key
17
+ @cipher = "AES-256-CBC"
18
+ end
19
+
20
+ ##7.6.2 General Encryption Algorithm
21
+ #
22
+ # Algorithm 1: Encryption of data using the RC4 or AES algorithms
23
+ #
24
+ # used to decrypt RC4/AES encrypted PDF streams (buf)
25
+ #
26
+ # buf - a string to decrypt
27
+ # ref - a PDF::Reader::Reference for the object to decrypt
28
+ #
29
+ def decrypt( buf, ref )
30
+ cipher = OpenSSL::Cipher.new(@cipher)
31
+ cipher.decrypt
32
+ cipher.key = @encrypt_key.dup
33
+ cipher.iv = buf[0..15]
34
+ cipher.update(buf[16..-1]) + cipher.final
35
+ end
36
+
37
+ end
38
+ end
@@ -0,0 +1,16 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ class PDF::Reader
6
+
7
+ # Filter out text/characters that are positioned outside a rectangle. Usually the page
8
+ # MediaBox or CropBox, but could be a user specified rectangle too
9
+ class BoundingRectangleRunsFilter
10
+
11
+ def self.runs_within_rect(runs, rect)
12
+ runs.select { |run| rect.contains?(run.origin) }
13
+ end
14
+ end
15
+ end
16
+
@@ -1,4 +1,5 @@
1
1
  # coding: ASCII-8BIT
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -48,6 +49,18 @@ class PDF::Reader
48
49
  ID = "ID"
49
50
  FWD_SLASH = "/"
50
51
  NULL_BYTE = "\x00"
52
+ CR = "\r"
53
+ LF = "\n"
54
+ CRLF = "\r\n"
55
+ WHITE_SPACE = [LF, CR, ' ']
56
+
57
+ # Quite a few PDFs have trailing junk.
58
+ # This can be several k of nuls in some cases
59
+ # Allow for this here
60
+ TRAILING_BYTECOUNT = 5000
61
+
62
+ # must match whole tokens
63
+ DIGITS_ONLY = %r{\A\d+\z}
51
64
 
52
65
  attr_reader :pos
53
66
 
@@ -86,9 +99,12 @@ class PDF::Reader
86
99
  #
87
100
  # options:
88
101
  #
89
- # :skip_eol - if true, the IO stream is advanced past a CRLF or LF that
90
- # is sitting under the io cursor.
91
- #
102
+ # :skip_eol - if true, the IO stream is advanced past a CRLF, CR or LF
103
+ # that is sitting under the io cursor.
104
+ # Note:
105
+ # Skipping a bare CR is not spec-compliant.
106
+ # This is because the data may start with LF.
107
+ # However we check for CRLF first, so the ambiguity is avoided.
92
108
  def read(bytes, opts = {})
93
109
  reset_pos
94
110
 
@@ -97,9 +113,9 @@ class PDF::Reader
97
113
  str = @io.read(2)
98
114
  if str.nil?
99
115
  return nil
100
- elsif str == "\r\n"
116
+ elsif str == CRLF # This MUST be done before checking for CR alone
101
117
  # do nothing
102
- elsif str[0,1] == "\n"
118
+ elsif str[0, 1] == LF || str[0, 1] == CR # LF or CR alone
103
119
  @io.seek(-1, IO::SEEK_CUR)
104
120
  else
105
121
  @io.seek(-2, IO::SEEK_CUR)
@@ -127,8 +143,10 @@ class PDF::Reader
127
143
  #
128
144
  def find_first_xref_offset
129
145
  check_size_is_non_zero
130
- @io.seek(-1024, IO::SEEK_END) rescue @io.seek(0)
131
- data = @io.read(1024)
146
+ @io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
147
+ data = @io.read(TRAILING_BYTECOUNT)
148
+
149
+ raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?
132
150
 
133
151
  # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
134
152
  lines = data.split(/[\n\r]+/).reverse
@@ -136,7 +154,12 @@ class PDF::Reader
136
154
 
137
155
  raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
138
156
  raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
139
- lines[eof_index+1].to_i
157
+ offset = lines[eof_index+1].to_i
158
+
159
+ # a byte offset < 0 doesn't make much sense. This is unlikely to happen, but in theory some
160
+ # corrupted PDFs might have a line that looks like a negative int preceding the `%%EOF`
161
+ raise MalformedPDFError, "invalid xref offset" if offset < 0
162
+ offset
140
163
  end
141
164
 
142
165
  private
@@ -217,45 +240,73 @@ class PDF::Reader
217
240
  return if @tokens.size < 3
218
241
  return if @tokens[2] != "R"
219
242
 
220
- if @tokens[0].match(/\d+/) && @tokens[1].match(/\d+/)
221
- @tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
222
- @tokens[1] = nil
223
- @tokens[2] = nil
224
- @tokens.compact!
243
+ token_one = @tokens[0]
244
+ token_two = @tokens[1]
245
+ if token_one.is_a?(String) && token_two.is_a?(String) && token_one.match(DIGITS_ONLY) && token_two.match(DIGITS_ONLY)
246
+ @tokens[0] = PDF::Reader::Reference.new(token_one.to_i, token_two.to_i)
247
+ @tokens.delete_at(2)
248
+ @tokens.delete_at(1)
225
249
  end
226
250
  end
227
251
 
252
+ # Extract data between ID and EI
253
+ # If the EI follows white-space the space is dropped from the data
254
+ # The EI must be followed by white-space or end of buffer
255
+ # This is to reduce the chance of accidentally matching an embedded EI
228
256
  def prepare_inline_token
229
- str = "".dup
230
-
231
- buffer = []
232
-
233
- until buffer[0] =~ /\s|\0/ && buffer[1, 2] == ["E", "I"]
257
+ idstart = @io.pos
258
+ prevchr = ''
259
+ eisize = 0 # how many chars in the end marker
260
+ seeking = 'E' # what are we looking for now?
261
+ loop do
234
262
  chr = @io.read(1)
235
- buffer << chr
236
-
237
- if buffer.length > 3
238
- str << buffer.shift
263
+ break if chr.nil?
264
+ case seeking
265
+ when 'E'
266
+ if chr == 'E'
267
+ seeking = 'I'
268
+ if WHITE_SPACE.include? prevchr
269
+ eisize = 3 # include whitespace in delimiter, i.e. drop from data
270
+ else # assume the EI immediately follows the data
271
+ eisize = 2 # leave prevchr in data
272
+ end
273
+ end
274
+ when 'I'
275
+ if chr == 'I'
276
+ seeking = ''
277
+ else
278
+ seeking = 'E'
279
+ end
280
+ when ''
281
+ if WHITE_SPACE.include? chr
282
+ eisize += 1 # Drop trailer
283
+ break
284
+ else
285
+ seeking = 'E'
286
+ end
239
287
  end
288
+ prevchr = chr.is_a?(String) ? chr : ''
240
289
  end
241
-
242
- str << NULL_BYTE if buffer.first == NULL_BYTE
243
-
244
- @tokens << string_token(str)
245
- @io.seek(-3, IO::SEEK_CUR) unless chr.nil?
290
+ unless seeking == ''
291
+ raise MalformedPDFError, "EI terminator not found"
292
+ end
293
+ eiend = @io.pos
294
+ @io.seek(idstart, IO::SEEK_SET)
295
+ str = @io.read(eiend - eisize - idstart) # get the ID content
296
+ @tokens << str.freeze if str
246
297
  end
247
298
 
248
299
  # if we're currently inside a hex string, read hex nibbles until
249
300
  # we find a closing >
250
301
  #
251
302
  def prepare_hex_token
303
+ finished = :false
252
304
  str = "".dup
253
- finished = false
254
305
 
255
- while !finished
306
+ until finished == :true
256
307
  byte = @io.getbyte
257
308
  if byte.nil?
258
- finished = true # unbalanced params
309
+ finished = :true # unbalanced params
259
310
  elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
260
311
  str << byte
261
312
  elsif byte <= 32
@@ -264,7 +315,7 @@ class PDF::Reader
264
315
  @tokens << str if str.size > 0
265
316
  @tokens << ">" if byte != 0x3E # '>'
266
317
  @tokens << byte.chr
267
- finished = true
318
+ finished = :true
268
319
  end
269
320
  end
270
321
  end
@@ -311,14 +362,17 @@ class PDF::Reader
311
362
  def prepare_regular_token
312
363
  tok = "".dup
313
364
 
314
- while byte = @io.getbyte
365
+ loop do
366
+ byte = @io.getbyte
367
+
315
368
  case byte
369
+ when nil
370
+ break
316
371
  when 0x25
317
372
  # comment, ignore everything until the next EOL char
318
- done = false
319
- while !done
320
- byte = @io.getbyte
321
- done = true if byte.nil? || byte == 0x0A || byte == 0x0D
373
+ loop do
374
+ commentbyte = @io.getbyte
375
+ break if commentbyte.nil? || commentbyte == 0x0A || commentbyte == 0x0D
322
376
  end
323
377
  when *TOKEN_WHITESPACE
324
378
  # white space, token finished
@@ -388,15 +442,5 @@ class PDF::Reader
388
442
  byte
389
443
  end
390
444
 
391
- # for a handful of tokens we want to tell the parser how to convert them
392
- # into higher level tokens. This methods adds a to_token() method
393
- # to tokens that should remain as strings.
394
- #
395
- def string_token(token)
396
- def token.to_token
397
- to_s
398
- end
399
- token
400
- end
401
445
  end
402
446
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -32,16 +33,17 @@ class PDF::Reader
32
33
  # extracting various useful information.
33
34
  #
34
35
  class CMap # :nodoc:
36
+
35
37
  CMAP_KEYWORDS = {
36
- "begincodespacerange" => 1,
37
- "endcodespacerange" => 1,
38
- "beginbfchar" => 1,
39
- "endbfchar" => 1,
40
- "beginbfrange" => 1,
41
- "endbfrange" => 1,
42
- "begin" => 1,
43
- "begincmap" => 1,
44
- "def" => 1
38
+ "begincodespacerange" => :noop,
39
+ "endcodespacerange" => :noop,
40
+ "beginbfchar" => :noop,
41
+ "endbfchar" => :noop,
42
+ "beginbfrange" => :noop,
43
+ "endbfrange" => :noop,
44
+ "begin" => :noop,
45
+ "begincmap" => :noop,
46
+ "def" => :noop
45
47
  }
46
48
 
47
49
  attr_reader :map
@@ -51,30 +53,6 @@ class PDF::Reader
51
53
  process_data(data)
52
54
  end
53
55
 
54
- def process_data(data)
55
- parser = build_parser(data)
56
- mode = nil
57
- instructions = []
58
-
59
- while token = parser.parse_token(CMAP_KEYWORDS)
60
- if token == "beginbfchar"
61
- mode = :char
62
- elsif token == "endbfchar"
63
- process_bfchar_instructions(instructions)
64
- instructions = []
65
- mode = nil
66
- elsif token == "beginbfrange"
67
- mode = :range
68
- elsif token == "endbfrange"
69
- process_bfrange_instructions(instructions)
70
- instructions = []
71
- mode = nil
72
- elsif mode == :char || mode == :range
73
- instructions << token
74
- end
75
- end
76
- end
77
-
78
56
  def size
79
57
  @map.size
80
58
  end
@@ -84,13 +62,40 @@ class PDF::Reader
84
62
  # Returns an array of Integers.
85
63
  #
86
64
  def decode(c)
87
- # TODO: implement the conversion
88
- return c unless Integer === c
89
- @map[c]
65
+ @map.fetch(c, [])
90
66
  end
91
67
 
92
68
  private
93
69
 
70
+ def process_data(data, initial_mode = :none)
71
+ parser = build_parser(data)
72
+ mode = initial_mode
73
+ instructions = []
74
+
75
+ while token = parser.parse_token(CMAP_KEYWORDS)
76
+ if token.is_a?(String) || token.is_a?(Array)
77
+ if token == "beginbfchar"
78
+ mode = :char
79
+ elsif token == "endbfchar"
80
+ process_bfchar_instructions(instructions)
81
+ instructions = []
82
+ mode = :none
83
+ elsif token == "beginbfrange"
84
+ mode = :range
85
+ elsif token == "endbfrange"
86
+ process_bfrange_instructions(instructions)
87
+ instructions = []
88
+ mode = :none
89
+ elsif mode == :char
90
+ instructions << token.to_s
91
+ elsif mode == :range
92
+ instructions << token
93
+ end
94
+ end
95
+ end
96
+ end
97
+
98
+
94
99
  def build_parser(instructions)
95
100
  buffer = Buffer.new(StringIO.new(instructions))
96
101
  Parser.new(buffer)
@@ -105,7 +110,6 @@ class PDF::Reader
105
110
  # exception when we try converting broken UTF-16 to UTF-8
106
111
  #
107
112
  def str_to_int(str)
108
- return nil if str.nil? || str.size == 0
109
113
  unpacked_string = if str.bytesize == 1 # UTF-8
110
114
  str.unpack("C*")
111
115
  else # UTF-16
@@ -113,12 +117,15 @@ class PDF::Reader
113
117
  end
114
118
  result = []
115
119
  while unpacked_string.any? do
116
- if unpacked_string.size >= 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
120
+ if unpacked_string.size >= 2 &&
121
+ unpacked_string.first.to_i > 0xD800 &&
122
+ unpacked_string.first.to_i < 0xDBFF
117
123
  # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
118
124
  # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
119
125
  # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
120
- points = [unpacked_string.shift, unpacked_string.shift]
121
- result << (points[0] - 0xD800) * 0x400 + (points[1] - 0xDC00) + 0x10000
126
+ point_one = unpacked_string.shift.to_i
127
+ point_two = unpacked_string.shift.to_i
128
+ result << (point_one - 0xD800) * 0x400 + (point_two - 0xDC00) + 0x10000
122
129
  else
123
130
  result << unpacked_string.shift
124
131
  end
@@ -128,9 +135,11 @@ class PDF::Reader
128
135
 
129
136
  def process_bfchar_instructions(instructions)
130
137
  instructions.each_slice(2) do |one, two|
131
- find = str_to_int(one)
132
- replace = str_to_int(two)
133
- @map[find.first] = replace
138
+ find = str_to_int(one.to_s)
139
+ replace = str_to_int(two.to_s)
140
+ if find.any? && replace.any?
141
+ @map[find.first.to_i] = replace
142
+ end
134
143
  end
135
144
  end
136
145
 
@@ -141,30 +150,36 @@ class PDF::Reader
141
150
  elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
142
151
  bfrange_type_two(start, finish, to)
143
152
  else
144
- raise "invalid bfrange section"
153
+ raise MalformedPDFError, "invalid bfrange section"
145
154
  end
146
155
  end
147
156
  end
148
157
 
149
158
  def bfrange_type_one(start_code, end_code, dst)
150
- start_code = str_to_int(start_code)[0]
151
- end_code = str_to_int(end_code)[0]
159
+ start_code = str_to_int(start_code).first
160
+ end_code = str_to_int(end_code).first
152
161
  dst = str_to_int(dst)
153
162
 
163
+ return if start_code.nil? || end_code.nil?
164
+
154
165
  # add all values in the range to our mapping
155
166
  (start_code..end_code).each_with_index do |val, idx|
156
- @map[val] = dst.length == 1 ? [dst[0] + idx] : [dst[0], dst[1] + 1]
167
+ @map[val] = dst.length == 1 ? [dst[0].to_i + idx] : [dst[0].to_i, dst[1].to_i + 1]
157
168
  end
158
169
  end
159
170
 
160
171
  def bfrange_type_two(start_code, end_code, dst)
161
- start_code = str_to_int(start_code)[0]
162
- end_code = str_to_int(end_code)[0]
172
+ start_code = str_to_int(start_code).first
173
+ end_code = str_to_int(end_code).first
174
+
175
+ return if start_code.nil? || end_code.nil?
176
+
163
177
  from_range = (start_code..end_code)
164
178
 
165
179
  # add all values in the range to our mapping
166
180
  from_range.each_with_index do |val, idx|
167
- @map[val] = str_to_int(dst[idx])
181
+ dst_char = dst[idx]
182
+ @map[val.to_i] = str_to_int(dst_char) if dst_char
168
183
  end
169
184
  end
170
185
  end