pdf-reader 2.5.0 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +42 -0
  3. data/README.md +16 -1
  4. data/Rakefile +1 -1
  5. data/examples/extract_fonts.rb +12 -7
  6. data/examples/rspec.rb +1 -0
  7. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  8. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  9. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  10. data/lib/pdf/reader/buffer.rb +90 -46
  11. data/lib/pdf/reader/cid_widths.rb +1 -0
  12. data/lib/pdf/reader/cmap.rb +65 -50
  13. data/lib/pdf/reader/encoding.rb +3 -2
  14. data/lib/pdf/reader/error.rb +19 -3
  15. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  16. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  17. data/lib/pdf/reader/filter/depredict.rb +11 -9
  18. data/lib/pdf/reader/filter/flate.rb +4 -2
  19. data/lib/pdf/reader/filter/lzw.rb +2 -0
  20. data/lib/pdf/reader/filter/null.rb +1 -1
  21. data/lib/pdf/reader/filter/run_length.rb +19 -13
  22. data/lib/pdf/reader/filter.rb +2 -1
  23. data/lib/pdf/reader/font.rb +72 -16
  24. data/lib/pdf/reader/font_descriptor.rb +19 -17
  25. data/lib/pdf/reader/form_xobject.rb +15 -5
  26. data/lib/pdf/reader/glyph_hash.rb +16 -9
  27. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  28. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  29. data/lib/pdf/reader/lzw.rb +4 -2
  30. data/lib/pdf/reader/null_security_handler.rb +1 -4
  31. data/lib/pdf/reader/object_cache.rb +1 -0
  32. data/lib/pdf/reader/object_hash.rb +252 -44
  33. data/lib/pdf/reader/object_stream.rb +1 -0
  34. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  35. data/lib/pdf/reader/page.rb +99 -19
  36. data/lib/pdf/reader/page_layout.rb +36 -37
  37. data/lib/pdf/reader/page_state.rb +12 -11
  38. data/lib/pdf/reader/page_text_receiver.rb +57 -10
  39. data/lib/pdf/reader/pages_strategy.rb +1 -0
  40. data/lib/pdf/reader/parser.rb +23 -12
  41. data/lib/pdf/reader/point.rb +25 -0
  42. data/lib/pdf/reader/print_receiver.rb +1 -0
  43. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  44. data/lib/pdf/reader/rectangle.rb +113 -0
  45. data/lib/pdf/reader/reference.rb +1 -0
  46. data/lib/pdf/reader/register_receiver.rb +1 -0
  47. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +16 -9
  48. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  49. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
  50. data/lib/pdf/reader/stream.rb +2 -1
  51. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  52. data/lib/pdf/reader/text_run.rb +14 -6
  53. data/lib/pdf/reader/token.rb +1 -0
  54. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  55. data/lib/pdf/reader/type_check.rb +52 -0
  56. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  57. data/lib/pdf/reader/validating_receiver.rb +262 -0
  58. data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
  59. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  60. data/lib/pdf/reader/width_calculator/true_type.rb +2 -1
  61. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  62. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  63. data/lib/pdf/reader/width_calculator.rb +1 -0
  64. data/lib/pdf/reader/xref.rb +27 -4
  65. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  66. data/lib/pdf/reader.rb +46 -15
  67. data/lib/pdf-reader.rb +1 -0
  68. data/rbi/pdf-reader.rbi +1978 -0
  69. metadata +21 -10
  70. data/lib/pdf/reader/orientation_detector.rb +0 -34
  71. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 652d05cf6a22fad5ecb4b92de1e27ba60cafc6525c5ca524e24c7f9796fe1b83
4
- data.tar.gz: 2c7448e97890a9fcbd10ec2cd5bafb9025db2fb75dabaf71a4074c542b1065a1
3
+ metadata.gz: 2b4616131d0ad73c4ef2c4992ae79d4fde420d6857aba60e8dfac9b088a0b915
4
+ data.tar.gz: f93f481d7f76af426420dbf507a88e8ecead8ec84690781f42de3b7b5ffbd1bd
5
5
  SHA512:
6
- metadata.gz: ac82452924cf46af98ee15f2a20642b1d06d5b9c22104fe171b5b4612665e482f341e12473805016ccb9d921fc15324ba51675170b369adeace8b278cd1279fb
7
- data.tar.gz: b1dc1c4422b0e6bf01092cf724630ba7424fdef1fdaf34f33aaa3a31397caf6ef5a73185a98e6e2828a9e082d87cbca311565397cb064cac20d86e72be27626f
6
+ metadata.gz: 86dbe3450a11e0deb3f5db98625375b252cc25f289d76c98b5de48342d1b4957de81c1c2b6cce53d7d09738e9576bd48213c92166d48911c1f45ad6a77f195a5
7
+ data.tar.gz: ee852ff644a095bae93eb7cc30c6d070c8c6adda4f9bfadecf938bf3ba2723fed08c75a3bf15ba30fcf8fded7ad6a5b74dad8a3f512823798686350f24b912eb
data/CHANGELOG CHANGED
@@ -1,3 +1,45 @@
1
+ v2.9.0 (24th January 2022)
2
+ - Support additional encryption standards (http://github.com/yob/pdf-reader/pull/419)
3
+ - Return CropBox correctly from Page#rectangles (https://github.com/yob/pdf-reader/pull/420)
4
+ - For sorbet users, additional type annotations are included in the gem
5
+
6
+ v2.8.0 (28th December 2021)
7
+ - Add PDF::Reader::Page#runs for extracting text from a page with positioning metadata (http://github.com/yob/pdf-reader/pull/411)
8
+ - Add options to PDF::Reader::Page#text to make some behaviour configurable (http://github.com/yob/pdf-reader/pull/411)
9
+ - including extracting the text for only part of the page
10
+ - Improve text positioning and extraction for Type3 fonts (http://github.com/yob/pdf-reader/pull/412)
11
+ - Skip extracting text that is positioned outside the page (http://github.com/yob/pdf-reader/pull/413)
12
+ - Fix occasional crash when reading some streams (http://github.com/yob/pdf-reader/pull/405)
13
+
14
+ v2.7.0 (13th December 2021)
15
+ - Include RBI type files in the gem
16
+ - Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
17
+ now be type checked by sorbet
18
+ - Fix glyph positioning in some rotation scenarios (http://github.com/yob/pdf-reader/pull/403)
19
+ - Improved text extraction on some rotated pages, and rotated text on normal pages
20
+ - Add PDF::Reader::Page#rectangles (http://github.com/yob/pdf-reader/pull/402)
21
+ - Returns page boxes (MediaBox, etc) with rotation applied, and as PORO rather than arrays of numbers
22
+ - Add PDF::Reader::Page#origin (http://github.com/yob/pdf-reader/pull/400)
23
+ - Add PDF::Reader::Page#{height,width} (http://github.com/yob/pdf-reader/pull/399)
24
+ - Overlap filter should only drop characters that overlap *and* match (http://github.com/yob/pdf-reader/pull/401)
25
+
26
+ v2.6.0 (12th November 2021)
27
+ - Text extraction improvements
28
+ - Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
29
+ - Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
30
+ - Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
31
+ - Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
32
+ - Performance improvements
33
+ - Reduced memory pressure when decoding TIFF images (http://github.com/yob/pdf-reader/pull/360)
34
+ - Optional dependency on ascii85_native gem for faster processing of files using the ascii85 filter (http://github.com/yob/pdf-reader/pull/359)
35
+ - Successfully parse more files
36
+ - Gracefully handle some non-spec compliant CR/LF issues (http://github.com/yob/pdf-reader/pull/364)
37
+ - Fix parsing of some escape sequences in content streams (http://github.com/yob/pdf-reader/pull/368)
38
+ - Increase the amount of junk bytes we detect and skip at the end of a file (http://github.com/yob/pdf-reader/pull/382)
39
+ - Ignore "/Prev 0" in trailers (http://github.com/yob/pdf-reader/pull/383)
40
+ - Fix parsing of some inline images (BI ID EI tokens) (http://github.com/yob/pdf-reader/pull/389)
41
+ - Gracefully handle some xref tables that incorrectly start with 1 (http://github.com/yob/pdf-reader/pull/384)
42
+
1
43
  v2.5.0 (6th June 2021)
2
44
  - bump minimum ruby version to 2.0
3
45
  - Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
data/README.md CHANGED
@@ -166,6 +166,19 @@ http://groups.google.com/group/pdf-reader
166
166
  The easiest way to explain how this works in practice is to show some examples.
167
167
  Check out the examples/ directory for a few files.
168
168
 
169
+ # Alternate Decoder
170
+
171
+ For PDF files containing Ascii85 streams, the [ascii85_native](https://github.com/AnomalousBit/ascii85_native) gem can be used for increased performance. If the ascii85_native gem is detected, pdf-reader will automatically use the gem.
172
+
173
+ First, run `gem install ascii85_native` and then require the gem alongside pdf-reader:
174
+
175
+ ```ruby
176
+ require "pdf-reader"
177
+ require "ascii85_native"
178
+ ```
179
+
180
+ Another way of enabling native Ascii85 decoding is to place `gem 'ascii85_native'` in your project's `Gemfile`.
181
+
169
182
  # Known Limitations
170
183
 
171
184
  Occasionally some text cannot be extracted properly due to the way it has been
@@ -176,7 +189,9 @@ little UTF-8 friendly box to indicate an unrecognisable character.
176
189
 
177
190
  * PDF::Reader Code Repository: http://github.com/yob/pdf-reader
178
191
 
179
- * PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
192
+ * PDF Specification: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
193
+
194
+ * Adobe PDF Developer Resources: http://www.adobe.com/devnet/pdf/pdf_reference.html
180
195
 
181
196
  * PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
182
197
 
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
14
14
  Cane::RakeTask.new(:quality) do |cane|
15
15
  cane.abc_max = 20
16
16
  cane.style_measure = 100
17
- cane.max_violations = 31
17
+ cane.max_violations = 28
18
18
 
19
19
  cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
20
20
  end
@@ -17,8 +17,8 @@ module ExtractFonts
17
17
  return count if page.fonts.nil? || page.fonts.empty?
18
18
 
19
19
  page.fonts.each do |label, font|
20
- next if complete_refs[font]
21
- complete_refs[font] = true
20
+ next if complete_refs[label]
21
+ complete_refs[label] = true
22
22
 
23
23
  process_font(page, font)
24
24
 
@@ -39,7 +39,7 @@ module ExtractFonts
39
39
  when :TrueType, :CIDFontType2 then
40
40
  ExtractFonts::TTF.new(page.objects, font).save("#{font[:BaseFont]}.ttf")
41
41
  else
42
- $stderr.puts "unsupported font type #{font[:Subtype]}"
42
+ $stderr.puts "unsupported font type #{font[:Subtype]} for #{font[:BaseFont]}"
43
43
  end
44
44
  end
45
45
 
@@ -68,10 +68,15 @@ module ExtractFonts
68
68
  end
69
69
  end
70
70
 
71
- filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-unicode.pdf"
71
+ if ARGV.size == 0 # default file name
72
+ ARGV << File.expand_path(File.join(File.dirname(__dir__), "spec", "data", "cairo-unicode.pdf"))
73
+ end
74
+
72
75
  extractor = ExtractFonts::Extractor.new
73
76
 
74
- PDF::Reader.open(filename) do |reader|
75
- page = reader.page(1)
76
- extractor.page(page)
77
+ ARGV.each do |arg|
78
+ PDF::Reader.open(arg) do |reader|
79
+ page = reader.page(1)
80
+ extractor.page(page)
81
+ end
77
82
  end
data/examples/rspec.rb CHANGED
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
  # coding: utf-8
3
+ # typed: ignore
3
4
 
4
5
  # Basic RSpec of a generated PDF
5
6
  #
@@ -0,0 +1,41 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ require 'digest/md5'
6
+
7
+ class PDF::Reader
8
+
9
+ # Decrypts data using the AESV2 algorithm defined in the PDF spec. Requires
10
+ # a decryption key, which is usually generated by PDF::Reader::StandardKeyBuilder
11
+ #
12
+ class AesV2SecurityHandler
13
+
14
+ def initialize(key)
15
+ @encrypt_key = key
16
+ end
17
+
18
+ ##7.6.2 General Encryption Algorithm
19
+ #
20
+ # Algorithm 1: Encryption of data using the AES-128-CBC algorithm
21
+ #
22
+ # version == 4 and CFM == AESV2
23
+ #
24
+ # buf - a string to decrypt
25
+ # ref - a PDF::Reader::Reference for the object to decrypt
26
+ #
27
+ def decrypt( buf, ref )
28
+ objKey = @encrypt_key.dup
29
+ (0..2).each { |e| objKey << (ref.id >> e*8 & 0xFF ) }
30
+ (0..1).each { |e| objKey << (ref.gen >> e*8 & 0xFF ) }
31
+ objKey << 'sAlT' # Algorithm 1, b)
32
+ length = objKey.length < 16 ? objKey.length : 16
33
+ cipher = OpenSSL::Cipher.new("AES-#{length << 3}-CBC")
34
+ cipher.decrypt
35
+ cipher.key = Digest::MD5.digest(objKey)[0,length]
36
+ cipher.iv = buf[0..15]
37
+ cipher.update(buf[16..-1]) + cipher.final
38
+ end
39
+
40
+ end
41
+ end
@@ -0,0 +1,38 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ require 'digest'
6
+ require 'openssl'
7
+
8
+ class PDF::Reader
9
+
10
+ # Decrypts data using the AESV3 algorithm defined in the PDF 1.7, Extension Level 3 spec.
11
+ # Requires a decryption key, which is usually generated by PDF::Reader::KeyBuilderV5
12
+ #
13
+ class AesV3SecurityHandler
14
+
15
+ def initialize(key)
16
+ @encrypt_key = key
17
+ @cipher = "AES-256-CBC"
18
+ end
19
+
20
+ ##7.6.2 General Encryption Algorithm
21
+ #
22
+ # Algorithm 1: Encryption of data using the RC4 or AES algorithms
23
+ #
24
+ # used to decrypt RC4/AES encrypted PDF streams (buf)
25
+ #
26
+ # buf - a string to decrypt
27
+ # ref - a PDF::Reader::Reference for the object to decrypt
28
+ #
29
+ def decrypt( buf, ref )
30
+ cipher = OpenSSL::Cipher.new(@cipher)
31
+ cipher.decrypt
32
+ cipher.key = @encrypt_key.dup
33
+ cipher.iv = buf[0..15]
34
+ cipher.update(buf[16..-1]) + cipher.final
35
+ end
36
+
37
+ end
38
+ end
@@ -0,0 +1,16 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ class PDF::Reader
6
+
7
+ # Filter out text/characters that are positioned outside a rectangle. Usually the page
8
+ # MediaBox or CropBox, but could be a user specified rectangle too
9
+ class BoundingRectangleRunsFilter
10
+
11
+ def self.runs_within_rect(runs, rect)
12
+ runs.select { |run| rect.contains?(run.origin) }
13
+ end
14
+ end
15
+ end
16
+
@@ -1,4 +1,5 @@
1
1
  # coding: ASCII-8BIT
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -48,6 +49,18 @@ class PDF::Reader
48
49
  ID = "ID"
49
50
  FWD_SLASH = "/"
50
51
  NULL_BYTE = "\x00"
52
+ CR = "\r"
53
+ LF = "\n"
54
+ CRLF = "\r\n"
55
+ WHITE_SPACE = [LF, CR, ' ']
56
+
57
+ # Quite a few PDFs have trailing junk.
58
+ # This can be several k of nuls in some cases
59
+ # Allow for this here
60
+ TRAILING_BYTECOUNT = 5000
61
+
62
+ # must match whole tokens
63
+ DIGITS_ONLY = %r{\A\d+\z}
51
64
 
52
65
  attr_reader :pos
53
66
 
@@ -86,9 +99,12 @@ class PDF::Reader
86
99
  #
87
100
  # options:
88
101
  #
89
- # :skip_eol - if true, the IO stream is advanced past a CRLF or LF that
90
- # is sitting under the io cursor.
91
- #
102
+ # :skip_eol - if true, the IO stream is advanced past a CRLF, CR or LF
103
+ # that is sitting under the io cursor.
104
+ # Note:
105
+ # Skipping a bare CR is not spec-compliant.
106
+ # This is because the data may start with LF.
107
+ # However we check for CRLF first, so the ambiguity is avoided.
92
108
  def read(bytes, opts = {})
93
109
  reset_pos
94
110
 
@@ -97,9 +113,9 @@ class PDF::Reader
97
113
  str = @io.read(2)
98
114
  if str.nil?
99
115
  return nil
100
- elsif str == "\r\n"
116
+ elsif str == CRLF # This MUST be done before checking for CR alone
101
117
  # do nothing
102
- elsif str[0,1] == "\n"
118
+ elsif str[0, 1] == LF || str[0, 1] == CR # LF or CR alone
103
119
  @io.seek(-1, IO::SEEK_CUR)
104
120
  else
105
121
  @io.seek(-2, IO::SEEK_CUR)
@@ -127,8 +143,10 @@ class PDF::Reader
127
143
  #
128
144
  def find_first_xref_offset
129
145
  check_size_is_non_zero
130
- @io.seek(-1024, IO::SEEK_END) rescue @io.seek(0)
131
- data = @io.read(1024)
146
+ @io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
147
+ data = @io.read(TRAILING_BYTECOUNT)
148
+
149
+ raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?
132
150
 
133
151
  # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
134
152
  lines = data.split(/[\n\r]+/).reverse
@@ -136,7 +154,12 @@ class PDF::Reader
136
154
 
137
155
  raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
138
156
  raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
139
- lines[eof_index+1].to_i
157
+ offset = lines[eof_index+1].to_i
158
+
159
+ # a byte offset < 0 doesn't make much sense. This is unlikely to happen, but in theory some
160
+ # corrupted PDFs might have a line that looks like a negative int preceding the `%%EOF`
161
+ raise MalformedPDFError, "invalid xref offset" if offset < 0
162
+ offset
140
163
  end
141
164
 
142
165
  private
@@ -217,45 +240,73 @@ class PDF::Reader
217
240
  return if @tokens.size < 3
218
241
  return if @tokens[2] != "R"
219
242
 
220
- if @tokens[0].match(/\d+/) && @tokens[1].match(/\d+/)
221
- @tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
222
- @tokens[1] = nil
223
- @tokens[2] = nil
224
- @tokens.compact!
243
+ token_one = @tokens[0]
244
+ token_two = @tokens[1]
245
+ if token_one.is_a?(String) && token_two.is_a?(String) && token_one.match(DIGITS_ONLY) && token_two.match(DIGITS_ONLY)
246
+ @tokens[0] = PDF::Reader::Reference.new(token_one.to_i, token_two.to_i)
247
+ @tokens.delete_at(2)
248
+ @tokens.delete_at(1)
225
249
  end
226
250
  end
227
251
 
252
+ # Extract data between ID and EI
253
+ # If the EI follows white-space the space is dropped from the data
254
+ # The EI must be followed by white-space or end of buffer
255
+ # This is to reduce the chance of accidentally matching an embedded EI
228
256
  def prepare_inline_token
229
- str = "".dup
230
-
231
- buffer = []
232
-
233
- until buffer[0] =~ /\s|\0/ && buffer[1, 2] == ["E", "I"]
257
+ idstart = @io.pos
258
+ prevchr = ''
259
+ eisize = 0 # how many chars in the end marker
260
+ seeking = 'E' # what are we looking for now?
261
+ loop do
234
262
  chr = @io.read(1)
235
- buffer << chr
236
-
237
- if buffer.length > 3
238
- str << buffer.shift
263
+ break if chr.nil?
264
+ case seeking
265
+ when 'E'
266
+ if chr == 'E'
267
+ seeking = 'I'
268
+ if WHITE_SPACE.include? prevchr
269
+ eisize = 3 # include whitespace in delimiter, i.e. drop from data
270
+ else # assume the EI immediately follows the data
271
+ eisize = 2 # leave prevchr in data
272
+ end
273
+ end
274
+ when 'I'
275
+ if chr == 'I'
276
+ seeking = ''
277
+ else
278
+ seeking = 'E'
279
+ end
280
+ when ''
281
+ if WHITE_SPACE.include? chr
282
+ eisize += 1 # Drop trailer
283
+ break
284
+ else
285
+ seeking = 'E'
286
+ end
239
287
  end
288
+ prevchr = chr.is_a?(String) ? chr : ''
240
289
  end
241
-
242
- str << NULL_BYTE if buffer.first == NULL_BYTE
243
-
244
- @tokens << string_token(str)
245
- @io.seek(-3, IO::SEEK_CUR) unless chr.nil?
290
+ unless seeking == ''
291
+ raise MalformedPDFError, "EI terminator not found"
292
+ end
293
+ eiend = @io.pos
294
+ @io.seek(idstart, IO::SEEK_SET)
295
+ str = @io.read(eiend - eisize - idstart) # get the ID content
296
+ @tokens << str.freeze if str
246
297
  end
247
298
 
248
299
  # if we're currently inside a hex string, read hex nibbles until
249
300
  # we find a closing >
250
301
  #
251
302
  def prepare_hex_token
303
+ finished = :false
252
304
  str = "".dup
253
- finished = false
254
305
 
255
- while !finished
306
+ until finished == :true
256
307
  byte = @io.getbyte
257
308
  if byte.nil?
258
- finished = true # unbalanced params
309
+ finished = :true # unbalanced params
259
310
  elsif (48..57).include?(byte) || (65..90).include?(byte) || (97..122).include?(byte)
260
311
  str << byte
261
312
  elsif byte <= 32
@@ -264,7 +315,7 @@ class PDF::Reader
264
315
  @tokens << str if str.size > 0
265
316
  @tokens << ">" if byte != 0x3E # '>'
266
317
  @tokens << byte.chr
267
- finished = true
318
+ finished = :true
268
319
  end
269
320
  end
270
321
  end
@@ -311,14 +362,17 @@ class PDF::Reader
311
362
  def prepare_regular_token
312
363
  tok = "".dup
313
364
 
314
- while byte = @io.getbyte
365
+ loop do
366
+ byte = @io.getbyte
367
+
315
368
  case byte
369
+ when nil
370
+ break
316
371
  when 0x25
317
372
  # comment, ignore everything until the next EOL char
318
- done = false
319
- while !done
320
- byte = @io.getbyte
321
- done = true if byte.nil? || byte == 0x0A || byte == 0x0D
373
+ loop do
374
+ commentbyte = @io.getbyte
375
+ break if commentbyte.nil? || commentbyte == 0x0A || commentbyte == 0x0D
322
376
  end
323
377
  when *TOKEN_WHITESPACE
324
378
  # white space, token finished
@@ -388,15 +442,5 @@ class PDF::Reader
388
442
  byte
389
443
  end
390
444
 
391
- # for a handful of tokens we want to tell the parser how to convert them
392
- # into higher level tokens. This methods adds a to_token() method
393
- # to tokens that should remain as strings.
394
- #
395
- def string_token(token)
396
- def token.to_token
397
- to_s
398
- end
399
- token
400
- end
401
445
  end
402
446
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -32,16 +33,17 @@ class PDF::Reader
32
33
  # extracting various useful information.
33
34
  #
34
35
  class CMap # :nodoc:
36
+
35
37
  CMAP_KEYWORDS = {
36
- "begincodespacerange" => 1,
37
- "endcodespacerange" => 1,
38
- "beginbfchar" => 1,
39
- "endbfchar" => 1,
40
- "beginbfrange" => 1,
41
- "endbfrange" => 1,
42
- "begin" => 1,
43
- "begincmap" => 1,
44
- "def" => 1
38
+ "begincodespacerange" => :noop,
39
+ "endcodespacerange" => :noop,
40
+ "beginbfchar" => :noop,
41
+ "endbfchar" => :noop,
42
+ "beginbfrange" => :noop,
43
+ "endbfrange" => :noop,
44
+ "begin" => :noop,
45
+ "begincmap" => :noop,
46
+ "def" => :noop
45
47
  }
46
48
 
47
49
  attr_reader :map
@@ -51,30 +53,6 @@ class PDF::Reader
51
53
  process_data(data)
52
54
  end
53
55
 
54
- def process_data(data)
55
- parser = build_parser(data)
56
- mode = nil
57
- instructions = []
58
-
59
- while token = parser.parse_token(CMAP_KEYWORDS)
60
- if token == "beginbfchar"
61
- mode = :char
62
- elsif token == "endbfchar"
63
- process_bfchar_instructions(instructions)
64
- instructions = []
65
- mode = nil
66
- elsif token == "beginbfrange"
67
- mode = :range
68
- elsif token == "endbfrange"
69
- process_bfrange_instructions(instructions)
70
- instructions = []
71
- mode = nil
72
- elsif mode == :char || mode == :range
73
- instructions << token
74
- end
75
- end
76
- end
77
-
78
56
  def size
79
57
  @map.size
80
58
  end
@@ -84,13 +62,40 @@ class PDF::Reader
84
62
  # Returns an array of Integers.
85
63
  #
86
64
  def decode(c)
87
- # TODO: implement the conversion
88
- return c unless Integer === c
89
- @map[c]
65
+ @map.fetch(c, [])
90
66
  end
91
67
 
92
68
  private
93
69
 
70
+ def process_data(data, initial_mode = :none)
71
+ parser = build_parser(data)
72
+ mode = initial_mode
73
+ instructions = []
74
+
75
+ while token = parser.parse_token(CMAP_KEYWORDS)
76
+ if token.is_a?(String) || token.is_a?(Array)
77
+ if token == "beginbfchar"
78
+ mode = :char
79
+ elsif token == "endbfchar"
80
+ process_bfchar_instructions(instructions)
81
+ instructions = []
82
+ mode = :none
83
+ elsif token == "beginbfrange"
84
+ mode = :range
85
+ elsif token == "endbfrange"
86
+ process_bfrange_instructions(instructions)
87
+ instructions = []
88
+ mode = :none
89
+ elsif mode == :char
90
+ instructions << token.to_s
91
+ elsif mode == :range
92
+ instructions << token
93
+ end
94
+ end
95
+ end
96
+ end
97
+
98
+
94
99
  def build_parser(instructions)
95
100
  buffer = Buffer.new(StringIO.new(instructions))
96
101
  Parser.new(buffer)
@@ -105,7 +110,6 @@ class PDF::Reader
105
110
  # exception when we try converting broken UTF-16 to UTF-8
106
111
  #
107
112
  def str_to_int(str)
108
- return nil if str.nil? || str.size == 0
109
113
  unpacked_string = if str.bytesize == 1 # UTF-8
110
114
  str.unpack("C*")
111
115
  else # UTF-16
@@ -113,12 +117,15 @@ class PDF::Reader
113
117
  end
114
118
  result = []
115
119
  while unpacked_string.any? do
116
- if unpacked_string.size >= 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
120
+ if unpacked_string.size >= 2 &&
121
+ unpacked_string.first.to_i > 0xD800 &&
122
+ unpacked_string.first.to_i < 0xDBFF
117
123
  # this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
118
124
  # lets convert to a UTF-32. (the high bit is between 0xD800-0xDBFF, the
119
125
  # low bit is between 0xDC00-0xDFFF) for example: U+1D44E (U+D835 U+DC4E)
120
- points = [unpacked_string.shift, unpacked_string.shift]
121
- result << (points[0] - 0xD800) * 0x400 + (points[1] - 0xDC00) + 0x10000
126
+ point_one = unpacked_string.shift.to_i
127
+ point_two = unpacked_string.shift.to_i
128
+ result << (point_one - 0xD800) * 0x400 + (point_two - 0xDC00) + 0x10000
122
129
  else
123
130
  result << unpacked_string.shift
124
131
  end
@@ -128,9 +135,11 @@ class PDF::Reader
128
135
 
129
136
  def process_bfchar_instructions(instructions)
130
137
  instructions.each_slice(2) do |one, two|
131
- find = str_to_int(one)
132
- replace = str_to_int(two)
133
- @map[find.first] = replace
138
+ find = str_to_int(one.to_s)
139
+ replace = str_to_int(two.to_s)
140
+ if find.any? && replace.any?
141
+ @map[find.first.to_i] = replace
142
+ end
134
143
  end
135
144
  end
136
145
 
@@ -141,30 +150,36 @@ class PDF::Reader
141
150
  elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
142
151
  bfrange_type_two(start, finish, to)
143
152
  else
144
- raise "invalid bfrange section"
153
+ raise MalformedPDFError, "invalid bfrange section"
145
154
  end
146
155
  end
147
156
  end
148
157
 
149
158
  def bfrange_type_one(start_code, end_code, dst)
150
- start_code = str_to_int(start_code)[0]
151
- end_code = str_to_int(end_code)[0]
159
+ start_code = str_to_int(start_code).first
160
+ end_code = str_to_int(end_code).first
152
161
  dst = str_to_int(dst)
153
162
 
163
+ return if start_code.nil? || end_code.nil?
164
+
154
165
  # add all values in the range to our mapping
155
166
  (start_code..end_code).each_with_index do |val, idx|
156
- @map[val] = dst.length == 1 ? [dst[0] + idx] : [dst[0], dst[1] + 1]
167
+ @map[val] = dst.length == 1 ? [dst[0].to_i + idx] : [dst[0].to_i, dst[1].to_i + 1]
157
168
  end
158
169
  end
159
170
 
160
171
  def bfrange_type_two(start_code, end_code, dst)
161
- start_code = str_to_int(start_code)[0]
162
- end_code = str_to_int(end_code)[0]
172
+ start_code = str_to_int(start_code).first
173
+ end_code = str_to_int(end_code).first
174
+
175
+ return if start_code.nil? || end_code.nil?
176
+
163
177
  from_range = (start_code..end_code)
164
178
 
165
179
  # add all values in the range to our mapping
166
180
  from_range.each_with_index do |val, idx|
167
- @map[val] = str_to_int(dst[idx])
181
+ dst_char = dst[idx]
182
+ @map[val.to_i] = str_to_int(dst_char) if dst_char
168
183
  end
169
184
  end
170
185
  end