pdf-reader 2.5.0 → 2.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (71) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +42 -0
  3. data/README.md +16 -1
  4. data/Rakefile +1 -1
  5. data/examples/extract_fonts.rb +12 -7
  6. data/examples/rspec.rb +1 -0
  7. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  8. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  9. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  10. data/lib/pdf/reader/buffer.rb +90 -46
  11. data/lib/pdf/reader/cid_widths.rb +1 -0
  12. data/lib/pdf/reader/cmap.rb +65 -50
  13. data/lib/pdf/reader/encoding.rb +3 -2
  14. data/lib/pdf/reader/error.rb +19 -3
  15. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  16. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  17. data/lib/pdf/reader/filter/depredict.rb +11 -9
  18. data/lib/pdf/reader/filter/flate.rb +4 -2
  19. data/lib/pdf/reader/filter/lzw.rb +2 -0
  20. data/lib/pdf/reader/filter/null.rb +1 -1
  21. data/lib/pdf/reader/filter/run_length.rb +19 -13
  22. data/lib/pdf/reader/filter.rb +2 -1
  23. data/lib/pdf/reader/font.rb +72 -16
  24. data/lib/pdf/reader/font_descriptor.rb +19 -17
  25. data/lib/pdf/reader/form_xobject.rb +15 -5
  26. data/lib/pdf/reader/glyph_hash.rb +16 -9
  27. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  28. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  29. data/lib/pdf/reader/lzw.rb +4 -2
  30. data/lib/pdf/reader/null_security_handler.rb +1 -4
  31. data/lib/pdf/reader/object_cache.rb +1 -0
  32. data/lib/pdf/reader/object_hash.rb +252 -44
  33. data/lib/pdf/reader/object_stream.rb +1 -0
  34. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  35. data/lib/pdf/reader/page.rb +99 -19
  36. data/lib/pdf/reader/page_layout.rb +36 -37
  37. data/lib/pdf/reader/page_state.rb +12 -11
  38. data/lib/pdf/reader/page_text_receiver.rb +57 -10
  39. data/lib/pdf/reader/pages_strategy.rb +1 -0
  40. data/lib/pdf/reader/parser.rb +23 -12
  41. data/lib/pdf/reader/point.rb +25 -0
  42. data/lib/pdf/reader/print_receiver.rb +1 -0
  43. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  44. data/lib/pdf/reader/rectangle.rb +113 -0
  45. data/lib/pdf/reader/reference.rb +1 -0
  46. data/lib/pdf/reader/register_receiver.rb +1 -0
  47. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +16 -9
  48. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  49. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
  50. data/lib/pdf/reader/stream.rb +2 -1
  51. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  52. data/lib/pdf/reader/text_run.rb +14 -6
  53. data/lib/pdf/reader/token.rb +1 -0
  54. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  55. data/lib/pdf/reader/type_check.rb +52 -0
  56. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  57. data/lib/pdf/reader/validating_receiver.rb +262 -0
  58. data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
  59. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  60. data/lib/pdf/reader/width_calculator/true_type.rb +2 -1
  61. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  62. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  63. data/lib/pdf/reader/width_calculator.rb +1 -0
  64. data/lib/pdf/reader/xref.rb +27 -4
  65. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  66. data/lib/pdf/reader.rb +46 -15
  67. data/lib/pdf-reader.rb +1 -0
  68. data/rbi/pdf-reader.rbi +1978 -0
  69. metadata +21 -10
  70. data/lib/pdf/reader/orientation_detector.rb +0 -34
  71. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  # PDF files may define fonts in a number of ways. Each approach means we must
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -103,13 +104,18 @@ class PDF::Reader
103
104
  buf = new_buffer(offset)
104
105
  tok_one = buf.token
105
106
 
107
+ # we have a traditional xref table
106
108
  return load_xref_table(buf) if tok_one == "xref" || tok_one == "ref"
107
109
 
108
110
  tok_two = buf.token
109
111
  tok_three = buf.token
110
112
 
113
+ # we have an XRef stream
111
114
  if tok_one.to_i >= 0 && tok_two.to_i >= 0 && tok_three == "obj"
112
115
  buf = new_buffer(offset)
116
+ # Maybe we should be passing the ObjectHash as the second argument to the Parser here,
117
+ # to handle the case where an XRef Stream has the Length specified via an
118
+ # indirect object
113
119
  stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
114
120
  return load_xref_stream(stream)
115
121
  end
@@ -125,12 +131,19 @@ class PDF::Reader
125
131
 
126
132
  while !params.include?("trailer") && !params.include?(nil)
127
133
  if params.size == 2
134
+ unless params[0].to_s.match(/\A\d+\z/)
135
+ raise MalformedPDFError, "invalid xref table, expected object ID"
136
+ end
137
+
128
138
  objid, count = params[0].to_i, params[1].to_i
129
139
  count.times do
130
140
  offset = buf.token.to_i
131
141
  generation = buf.token.to_i
132
142
  state = buf.token
133
143
 
144
+ # Some PDF writers start numbering at 1 instead of 0. Fix up the number.
145
+ # TODO should this fix be logged?
146
+ objid = 0 if objid == 1 and offset == 0 and generation == 65535 and state == 'f'
134
147
  store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
135
148
  objid += 1
136
149
  params.clear
@@ -139,14 +152,16 @@ class PDF::Reader
139
152
  params << buf.token
140
153
  end
141
154
 
142
- trailer = Parser.new(buf, self).parse_token
155
+ trailer = Parser.new(buf).parse_token
143
156
 
144
157
  unless trailer.kind_of?(Hash)
145
158
  raise MalformedPDFError, "PDF malformed, trailer should be a dictionary"
146
159
  end
147
160
 
148
161
  load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
149
- load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
162
+ # Some PDF creators seem to use '/Prev 0' in trailer if there is no previous xref
163
+ # It's not possible for an xref to appear at offset 0, so can safely skip the ref
164
+ load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev) and trailer[:Prev].to_i != 0
150
165
 
151
166
  trailer
152
167
  end
@@ -162,8 +177,16 @@ class PDF::Reader
162
177
  [:Size, :Prev, :Root, :Encrypt, :Info, :ID].include?(key)
163
178
  }]
164
179
 
165
- widths = stream.hash[:W]
166
- entry_length = widths.inject(0) { |s, w| s + w }
180
+ widths = stream.hash[:W]
181
+
182
+ PDF::Reader::Error.validate_type_as_malformed(widths, "xref stream widths", Array)
183
+
184
+ entry_length = widths.inject(0) { |s, w|
185
+ unless w.is_a?(Integer)
186
+ w = 0
187
+ end
188
+ s + w
189
+ }
167
190
  raw_data = StringIO.new(stream.unfiltered_data)
168
191
  if stream.hash[:Index]
169
192
  index = stream.hash[:Index]
@@ -0,0 +1,13 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ class PDF::Reader
6
+ # There's no point rendering zero-width characters
7
+ class ZeroWidthRunsFilter
8
+
9
+ def self.exclude_zero_width_runs(runs)
10
+ runs.reject { |run| run.width == 0 }
11
+ end
12
+ end
13
+ end
data/lib/pdf/reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -111,19 +112,27 @@ module PDF
111
112
  #
112
113
  # reader = PDF::Reader.new("somefile.pdf", :password => "apples")
113
114
  #
115
+ # Using this method directly is supported, but it's more common to use
116
+ # `PDF::Reader.open`
117
+ #
114
118
  def initialize(input, opts = {})
115
119
  @cache = PDF::Reader::ObjectCache.new
116
120
  opts.merge!(:cache => @cache)
117
121
  @objects = PDF::Reader::ObjectHash.new(input, opts)
118
122
  end
119
123
 
124
+ # Return a Hash with some basic information about the PDF file
125
+ #
120
126
  def info
121
- dict = @objects.deref(@objects.trailer[:Info])
127
+ dict = @objects.deref_hash(@objects.trailer[:Info]) || {}
122
128
  doc_strings_to_utf8(dict)
123
129
  end
124
130
 
131
+ # Return a Hash with extra metadata provided by the author of the PDF file. Not
132
+ # always present.
133
+ #
125
134
  def metadata
126
- stream = @objects.deref(root[:Metadata])
135
+ stream = @objects.deref_stream(root[:Metadata])
127
136
  if stream.nil?
128
137
  nil
129
138
  else
@@ -133,20 +142,24 @@ module PDF
133
142
  end
134
143
  end
135
144
 
145
+ # The number of pages in this PDF
146
+ #
136
147
  def page_count
137
- pages = @objects.deref(root[:Pages])
148
+ pages = @objects.deref_hash(root[:Pages])
138
149
  unless pages.kind_of?(::Hash)
139
- raise MalformedPDFError, 'Pages structure is missing'
150
+ raise MalformedPDFError, "Pages structure is missing #{pages.class}"
140
151
  end
141
- @page_count ||= @objects.deref(pages[:Count])
152
+ @page_count ||= @objects.deref_integer(pages[:Count]) || 0
142
153
  end
143
154
 
155
+ # The PDF version this file uses
156
+ #
144
157
  def pdf_version
145
158
  @objects.pdf_version
146
159
  end
147
160
 
148
- # syntactic sugar for opening a PDF file. Accepts the same arguments
149
- # as new().
161
+ # syntactic sugar for opening a PDF file and the most common approach. Accepts the
162
+ # same arguments as new().
150
163
  #
151
164
  # PDF::Reader.open("somefile.pdf") do |reader|
152
165
  # puts reader.pdf_version
@@ -177,6 +190,8 @@ module PDF
177
190
  # methods available on each page
178
191
  #
179
192
  def pages
193
+ return [] if page_count <= 0
194
+
180
195
  (1..self.page_count).map do |num|
181
196
  begin
182
197
  PDF::Reader::Page.new(@objects, num, :cache => @cache)
@@ -221,16 +236,24 @@ module PDF
221
236
  when Array then
222
237
  obj.map { |item| doc_strings_to_utf8(item) }
223
238
  when String then
224
- if obj[0,2].unpack("C*") == [254, 255]
239
+ if has_utf16_bom?(obj)
225
240
  utf16_to_utf8(obj)
226
241
  else
227
242
  pdfdoc_to_utf8(obj)
228
243
  end
229
244
  else
230
- @objects.deref(obj)
245
+ obj
231
246
  end
232
247
  end
233
248
 
249
+ def has_utf16_bom?(str)
250
+ first_bytes = str[0,2]
251
+
252
+ return false if first_bytes.nil?
253
+
254
+ first_bytes.unpack("C*") == [254, 255]
255
+ end
256
+
234
257
  # TODO find a PDF I can use to spec this behaviour
235
258
  #
236
259
  def pdfdoc_to_utf8(obj)
@@ -242,7 +265,7 @@ module PDF
242
265
  # String#encode
243
266
  #
244
267
  def utf16_to_utf8(obj)
245
- str = obj[2, obj.size]
268
+ str = obj[2, obj.size].to_s
246
269
  str = str.unpack("n*").pack("U*")
247
270
  str.force_encoding("utf-8")
248
271
  str
@@ -250,7 +273,7 @@ module PDF
250
273
 
251
274
  def root
252
275
  @root ||= begin
253
- obj = @objects.deref(@objects.trailer[:Root])
276
+ obj = @objects.deref_hash(@objects.trailer[:Root]) || {}
254
277
  unless obj.kind_of?(::Hash)
255
278
  raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
256
279
  end
@@ -262,8 +285,9 @@ module PDF
262
285
  end
263
286
  ################################################################################
264
287
 
265
- require 'pdf/reader/resource_methods'
288
+ require 'pdf/reader/resources'
266
289
  require 'pdf/reader/buffer'
290
+ require 'pdf/reader/bounding_rectangle_runs_filter'
267
291
  require 'pdf/reader/cid_widths'
268
292
  require 'pdf/reader/cmap'
269
293
  require 'pdf/reader/encoding'
@@ -286,18 +310,25 @@ require 'pdf/reader/object_hash'
286
310
  require 'pdf/reader/object_stream'
287
311
  require 'pdf/reader/pages_strategy'
288
312
  require 'pdf/reader/parser'
313
+ require 'pdf/reader/point'
289
314
  require 'pdf/reader/print_receiver'
315
+ require 'pdf/reader/rectangle'
290
316
  require 'pdf/reader/reference'
291
317
  require 'pdf/reader/register_receiver'
292
318
  require 'pdf/reader/null_security_handler'
293
- require 'pdf/reader/standard_security_handler'
294
- require 'pdf/reader/standard_security_handler_v5'
319
+ require 'pdf/reader/security_handler_factory'
320
+ require 'pdf/reader/standard_key_builder'
321
+ require 'pdf/reader/key_builder_v5'
322
+ require 'pdf/reader/aes_v2_security_handler'
323
+ require 'pdf/reader/aes_v3_security_handler'
324
+ require 'pdf/reader/rc4_security_handler'
295
325
  require 'pdf/reader/unimplemented_security_handler'
296
326
  require 'pdf/reader/stream'
297
327
  require 'pdf/reader/text_run'
328
+ require 'pdf/reader/type_check'
298
329
  require 'pdf/reader/page_state'
299
330
  require 'pdf/reader/page_text_receiver'
300
331
  require 'pdf/reader/token'
301
332
  require 'pdf/reader/xref'
302
- require 'pdf/reader/orientation_detector'
303
333
  require 'pdf/reader/page'
334
+ require 'pdf/reader/validating_receiver'
data/lib/pdf-reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require "pdf/reader"