pdf-reader 2.5.0 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +42 -0
  3. data/README.md +16 -1
  4. data/Rakefile +1 -1
  5. data/examples/extract_fonts.rb +12 -7
  6. data/examples/rspec.rb +1 -0
  7. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  8. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  9. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  10. data/lib/pdf/reader/buffer.rb +90 -46
  11. data/lib/pdf/reader/cid_widths.rb +1 -0
  12. data/lib/pdf/reader/cmap.rb +65 -50
  13. data/lib/pdf/reader/encoding.rb +3 -2
  14. data/lib/pdf/reader/error.rb +19 -3
  15. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  16. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  17. data/lib/pdf/reader/filter/depredict.rb +11 -9
  18. data/lib/pdf/reader/filter/flate.rb +4 -2
  19. data/lib/pdf/reader/filter/lzw.rb +2 -0
  20. data/lib/pdf/reader/filter/null.rb +1 -1
  21. data/lib/pdf/reader/filter/run_length.rb +19 -13
  22. data/lib/pdf/reader/filter.rb +2 -1
  23. data/lib/pdf/reader/font.rb +72 -16
  24. data/lib/pdf/reader/font_descriptor.rb +19 -17
  25. data/lib/pdf/reader/form_xobject.rb +15 -5
  26. data/lib/pdf/reader/glyph_hash.rb +16 -9
  27. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  28. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  29. data/lib/pdf/reader/lzw.rb +4 -2
  30. data/lib/pdf/reader/null_security_handler.rb +1 -4
  31. data/lib/pdf/reader/object_cache.rb +1 -0
  32. data/lib/pdf/reader/object_hash.rb +252 -44
  33. data/lib/pdf/reader/object_stream.rb +1 -0
  34. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  35. data/lib/pdf/reader/page.rb +99 -19
  36. data/lib/pdf/reader/page_layout.rb +36 -37
  37. data/lib/pdf/reader/page_state.rb +12 -11
  38. data/lib/pdf/reader/page_text_receiver.rb +57 -10
  39. data/lib/pdf/reader/pages_strategy.rb +1 -0
  40. data/lib/pdf/reader/parser.rb +23 -12
  41. data/lib/pdf/reader/point.rb +25 -0
  42. data/lib/pdf/reader/print_receiver.rb +1 -0
  43. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  44. data/lib/pdf/reader/rectangle.rb +113 -0
  45. data/lib/pdf/reader/reference.rb +1 -0
  46. data/lib/pdf/reader/register_receiver.rb +1 -0
  47. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +16 -9
  48. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  49. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
  50. data/lib/pdf/reader/stream.rb +2 -1
  51. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  52. data/lib/pdf/reader/text_run.rb +14 -6
  53. data/lib/pdf/reader/token.rb +1 -0
  54. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  55. data/lib/pdf/reader/type_check.rb +52 -0
  56. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  57. data/lib/pdf/reader/validating_receiver.rb +262 -0
  58. data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
  59. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  60. data/lib/pdf/reader/width_calculator/true_type.rb +2 -1
  61. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  62. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  63. data/lib/pdf/reader/width_calculator.rb +1 -0
  64. data/lib/pdf/reader/xref.rb +27 -4
  65. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  66. data/lib/pdf/reader.rb +46 -15
  67. data/lib/pdf-reader.rb +1 -0
  68. data/rbi/pdf-reader.rbi +1978 -0
  69. metadata +21 -10
  70. data/lib/pdf/reader/orientation_detector.rb +0 -34
  71. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  # PDF files may define fonts in a number of ways. Each approach means we must
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -103,13 +104,18 @@ class PDF::Reader
103
104
  buf = new_buffer(offset)
104
105
  tok_one = buf.token
105
106
 
107
+ # we have a traditional xref table
106
108
  return load_xref_table(buf) if tok_one == "xref" || tok_one == "ref"
107
109
 
108
110
  tok_two = buf.token
109
111
  tok_three = buf.token
110
112
 
113
+ # we have an XRef stream
111
114
  if tok_one.to_i >= 0 && tok_two.to_i >= 0 && tok_three == "obj"
112
115
  buf = new_buffer(offset)
116
+ # Maybe we should be passing the ObjectHash as the second argument to the Parser here,
117
+ # to handle the case where an XRef Stream has the Length specified via an
118
+ # indirect object
113
119
  stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
114
120
  return load_xref_stream(stream)
115
121
  end
@@ -125,12 +131,19 @@ class PDF::Reader
125
131
 
126
132
  while !params.include?("trailer") && !params.include?(nil)
127
133
  if params.size == 2
134
+ unless params[0].to_s.match(/\A\d+\z/)
135
+ raise MalformedPDFError, "invalid xref table, expected object ID"
136
+ end
137
+
128
138
  objid, count = params[0].to_i, params[1].to_i
129
139
  count.times do
130
140
  offset = buf.token.to_i
131
141
  generation = buf.token.to_i
132
142
  state = buf.token
133
143
 
144
+ # Some PDF writers start numbering at 1 instead of 0. Fix up the number.
145
+ # TODO should this fix be logged?
146
+ objid = 0 if objid == 1 and offset == 0 and generation == 65535 and state == 'f'
134
147
  store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
135
148
  objid += 1
136
149
  params.clear
@@ -139,14 +152,16 @@ class PDF::Reader
139
152
  params << buf.token
140
153
  end
141
154
 
142
- trailer = Parser.new(buf, self).parse_token
155
+ trailer = Parser.new(buf).parse_token
143
156
 
144
157
  unless trailer.kind_of?(Hash)
145
158
  raise MalformedPDFError, "PDF malformed, trailer should be a dictionary"
146
159
  end
147
160
 
148
161
  load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
149
- load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
162
+ # Some PDF creators seem to use '/Prev 0' in trailer if there is no previous xref
163
+ # It's not possible for an xref to appear at offset 0, so can safely skip the ref
164
+ load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev) and trailer[:Prev].to_i != 0
150
165
 
151
166
  trailer
152
167
  end
@@ -162,8 +177,16 @@ class PDF::Reader
162
177
  [:Size, :Prev, :Root, :Encrypt, :Info, :ID].include?(key)
163
178
  }]
164
179
 
165
- widths = stream.hash[:W]
166
- entry_length = widths.inject(0) { |s, w| s + w }
180
+ widths = stream.hash[:W]
181
+
182
+ PDF::Reader::Error.validate_type_as_malformed(widths, "xref stream widths", Array)
183
+
184
+ entry_length = widths.inject(0) { |s, w|
185
+ unless w.is_a?(Integer)
186
+ w = 0
187
+ end
188
+ s + w
189
+ }
167
190
  raw_data = StringIO.new(stream.unfiltered_data)
168
191
  if stream.hash[:Index]
169
192
  index = stream.hash[:Index]
@@ -0,0 +1,13 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ class PDF::Reader
6
+ # There's no point rendering zero-width characters
7
+ class ZeroWidthRunsFilter
8
+
9
+ def self.exclude_zero_width_runs(runs)
10
+ runs.reject { |run| run.width == 0 }
11
+ end
12
+ end
13
+ end
data/lib/pdf/reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -111,19 +112,27 @@ module PDF
111
112
  #
112
113
  # reader = PDF::Reader.new("somefile.pdf", :password => "apples")
113
114
  #
115
+ # Using this method directly is supported, but it's more common to use
116
+ # `PDF::Reader.open`
117
+ #
114
118
  def initialize(input, opts = {})
115
119
  @cache = PDF::Reader::ObjectCache.new
116
120
  opts.merge!(:cache => @cache)
117
121
  @objects = PDF::Reader::ObjectHash.new(input, opts)
118
122
  end
119
123
 
124
+ # Return a Hash with some basic information about the PDF file
125
+ #
120
126
  def info
121
- dict = @objects.deref(@objects.trailer[:Info])
127
+ dict = @objects.deref_hash(@objects.trailer[:Info]) || {}
122
128
  doc_strings_to_utf8(dict)
123
129
  end
124
130
 
131
+ # Return a Hash with extra metadata provided by the author of the PDF file. Not
132
+ # always present.
133
+ #
125
134
  def metadata
126
- stream = @objects.deref(root[:Metadata])
135
+ stream = @objects.deref_stream(root[:Metadata])
127
136
  if stream.nil?
128
137
  nil
129
138
  else
@@ -133,20 +142,24 @@ module PDF
133
142
  end
134
143
  end
135
144
 
145
+ # The number of pages in this PDF
146
+ #
136
147
  def page_count
137
- pages = @objects.deref(root[:Pages])
148
+ pages = @objects.deref_hash(root[:Pages])
138
149
  unless pages.kind_of?(::Hash)
139
- raise MalformedPDFError, 'Pages structure is missing'
150
+ raise MalformedPDFError, "Pages structure is missing #{pages.class}"
140
151
  end
141
- @page_count ||= @objects.deref(pages[:Count])
152
+ @page_count ||= @objects.deref_integer(pages[:Count]) || 0
142
153
  end
143
154
 
155
+ # The PDF version this file uses
156
+ #
144
157
  def pdf_version
145
158
  @objects.pdf_version
146
159
  end
147
160
 
148
- # syntactic sugar for opening a PDF file. Accepts the same arguments
149
- # as new().
161
+ # syntactic sugar for opening a PDF file and the most common approach. Accepts the
162
+ # same arguments as new().
150
163
  #
151
164
  # PDF::Reader.open("somefile.pdf") do |reader|
152
165
  # puts reader.pdf_version
@@ -177,6 +190,8 @@ module PDF
177
190
  # methods available on each page
178
191
  #
179
192
  def pages
193
+ return [] if page_count <= 0
194
+
180
195
  (1..self.page_count).map do |num|
181
196
  begin
182
197
  PDF::Reader::Page.new(@objects, num, :cache => @cache)
@@ -221,16 +236,24 @@ module PDF
221
236
  when Array then
222
237
  obj.map { |item| doc_strings_to_utf8(item) }
223
238
  when String then
224
- if obj[0,2].unpack("C*") == [254, 255]
239
+ if has_utf16_bom?(obj)
225
240
  utf16_to_utf8(obj)
226
241
  else
227
242
  pdfdoc_to_utf8(obj)
228
243
  end
229
244
  else
230
- @objects.deref(obj)
245
+ obj
231
246
  end
232
247
  end
233
248
 
249
+ def has_utf16_bom?(str)
250
+ first_bytes = str[0,2]
251
+
252
+ return false if first_bytes.nil?
253
+
254
+ first_bytes.unpack("C*") == [254, 255]
255
+ end
256
+
234
257
  # TODO find a PDF I can use to spec this behaviour
235
258
  #
236
259
  def pdfdoc_to_utf8(obj)
@@ -242,7 +265,7 @@ module PDF
242
265
  # String#encode
243
266
  #
244
267
  def utf16_to_utf8(obj)
245
- str = obj[2, obj.size]
268
+ str = obj[2, obj.size].to_s
246
269
  str = str.unpack("n*").pack("U*")
247
270
  str.force_encoding("utf-8")
248
271
  str
@@ -250,7 +273,7 @@ module PDF
250
273
 
251
274
  def root
252
275
  @root ||= begin
253
- obj = @objects.deref(@objects.trailer[:Root])
276
+ obj = @objects.deref_hash(@objects.trailer[:Root]) || {}
254
277
  unless obj.kind_of?(::Hash)
255
278
  raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
256
279
  end
@@ -262,8 +285,9 @@ module PDF
262
285
  end
263
286
  ################################################################################
264
287
 
265
- require 'pdf/reader/resource_methods'
288
+ require 'pdf/reader/resources'
266
289
  require 'pdf/reader/buffer'
290
+ require 'pdf/reader/bounding_rectangle_runs_filter'
267
291
  require 'pdf/reader/cid_widths'
268
292
  require 'pdf/reader/cmap'
269
293
  require 'pdf/reader/encoding'
@@ -286,18 +310,25 @@ require 'pdf/reader/object_hash'
286
310
  require 'pdf/reader/object_stream'
287
311
  require 'pdf/reader/pages_strategy'
288
312
  require 'pdf/reader/parser'
313
+ require 'pdf/reader/point'
289
314
  require 'pdf/reader/print_receiver'
315
+ require 'pdf/reader/rectangle'
290
316
  require 'pdf/reader/reference'
291
317
  require 'pdf/reader/register_receiver'
292
318
  require 'pdf/reader/null_security_handler'
293
- require 'pdf/reader/standard_security_handler'
294
- require 'pdf/reader/standard_security_handler_v5'
319
+ require 'pdf/reader/security_handler_factory'
320
+ require 'pdf/reader/standard_key_builder'
321
+ require 'pdf/reader/key_builder_v5'
322
+ require 'pdf/reader/aes_v2_security_handler'
323
+ require 'pdf/reader/aes_v3_security_handler'
324
+ require 'pdf/reader/rc4_security_handler'
295
325
  require 'pdf/reader/unimplemented_security_handler'
296
326
  require 'pdf/reader/stream'
297
327
  require 'pdf/reader/text_run'
328
+ require 'pdf/reader/type_check'
298
329
  require 'pdf/reader/page_state'
299
330
  require 'pdf/reader/page_text_receiver'
300
331
  require 'pdf/reader/token'
301
332
  require 'pdf/reader/xref'
302
- require 'pdf/reader/orientation_detector'
303
333
  require 'pdf/reader/page'
334
+ require 'pdf/reader/validating_receiver'
data/lib/pdf-reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require "pdf/reader"