pdf-reader 2.2.0 → 2.11.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (90) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +90 -0
  3. data/README.md +18 -3
  4. data/Rakefile +1 -1
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_text +1 -1
  7. data/examples/extract_fonts.rb +12 -7
  8. data/examples/rspec.rb +1 -0
  9. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  10. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  14. data/lib/pdf/reader/afm/Courier.afm +342 -342
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  26. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  27. data/lib/pdf/reader/buffer.rb +91 -47
  28. data/lib/pdf/reader/cid_widths.rb +7 -4
  29. data/lib/pdf/reader/cmap.rb +83 -59
  30. data/lib/pdf/reader/encoding.rb +17 -14
  31. data/lib/pdf/reader/error.rb +15 -3
  32. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  33. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  34. data/lib/pdf/reader/filter/depredict.rb +12 -10
  35. data/lib/pdf/reader/filter/flate.rb +30 -16
  36. data/lib/pdf/reader/filter/lzw.rb +2 -0
  37. data/lib/pdf/reader/filter/null.rb +1 -1
  38. data/lib/pdf/reader/filter/run_length.rb +19 -13
  39. data/lib/pdf/reader/filter.rb +11 -11
  40. data/lib/pdf/reader/font.rb +89 -26
  41. data/lib/pdf/reader/font_descriptor.rb +22 -18
  42. data/lib/pdf/reader/form_xobject.rb +18 -5
  43. data/lib/pdf/reader/glyph_hash.rb +28 -13
  44. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  45. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  46. data/lib/pdf/reader/lzw.rb +28 -11
  47. data/lib/pdf/reader/no_text_filter.rb +14 -0
  48. data/lib/pdf/reader/null_security_handler.rb +1 -4
  49. data/lib/pdf/reader/object_cache.rb +1 -0
  50. data/lib/pdf/reader/object_hash.rb +292 -63
  51. data/lib/pdf/reader/object_stream.rb +3 -2
  52. data/lib/pdf/reader/overlapping_runs_filter.rb +72 -0
  53. data/lib/pdf/reader/page.rb +143 -16
  54. data/lib/pdf/reader/page_layout.rb +43 -39
  55. data/lib/pdf/reader/page_state.rb +26 -17
  56. data/lib/pdf/reader/page_text_receiver.rb +74 -4
  57. data/lib/pdf/reader/pages_strategy.rb +1 -0
  58. data/lib/pdf/reader/parser.rb +34 -14
  59. data/lib/pdf/reader/point.rb +25 -0
  60. data/lib/pdf/reader/print_receiver.rb +1 -0
  61. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  62. data/lib/pdf/reader/rectangle.rb +113 -0
  63. data/lib/pdf/reader/reference.rb +3 -1
  64. data/lib/pdf/reader/register_receiver.rb +1 -0
  65. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +17 -9
  66. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  67. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
  68. data/lib/pdf/reader/stream.rb +3 -2
  69. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  70. data/lib/pdf/reader/text_run.rb +40 -5
  71. data/lib/pdf/reader/token.rb +1 -0
  72. data/lib/pdf/reader/transformation_matrix.rb +8 -7
  73. data/lib/pdf/reader/type_check.rb +98 -0
  74. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  75. data/lib/pdf/reader/validating_receiver.rb +262 -0
  76. data/lib/pdf/reader/width_calculator/built_in.rb +27 -17
  77. data/lib/pdf/reader/width_calculator/composite.rb +6 -1
  78. data/lib/pdf/reader/width_calculator/true_type.rb +10 -11
  79. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +6 -4
  80. data/lib/pdf/reader/width_calculator/type_zero.rb +6 -2
  81. data/lib/pdf/reader/width_calculator.rb +1 -0
  82. data/lib/pdf/reader/xref.rb +37 -11
  83. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  84. data/lib/pdf/reader.rb +49 -24
  85. data/lib/pdf-reader.rb +1 -0
  86. data/rbi/pdf-reader.rbi +2048 -0
  87. metadata +39 -23
  88. data/lib/pdf/hash.rb +0 -20
  89. data/lib/pdf/reader/orientation_detector.rb +0 -34
  90. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -72,7 +73,7 @@ class PDF::Reader
72
73
  #
73
74
  # ref - a PDF::Reader::Reference object containing an object ID and revision number
74
75
  def [](ref)
75
- @xref[ref.id][ref.gen]
76
+ @xref.fetch(ref.id, {}).fetch(ref.gen)
76
77
  rescue
77
78
  raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
78
79
  end
@@ -81,8 +82,8 @@ class PDF::Reader
81
82
  def each(&block)
82
83
  ids = @xref.keys.sort
83
84
  ids.each do |id|
84
- gen = @xref[id].keys.sort[-1]
85
- yield PDF::Reader::Reference.new(id, gen)
85
+ gen = @xref.fetch(id, {}).keys.sort[-1]
86
+ yield PDF::Reader::Reference.new(id, gen.to_i)
86
87
  end
87
88
  end
88
89
  ################################################################################
@@ -103,13 +104,18 @@ class PDF::Reader
103
104
  buf = new_buffer(offset)
104
105
  tok_one = buf.token
105
106
 
107
+ # we have a traditional xref table
106
108
  return load_xref_table(buf) if tok_one == "xref" || tok_one == "ref"
107
109
 
108
110
  tok_two = buf.token
109
111
  tok_three = buf.token
110
112
 
113
+ # we have an XRef stream
111
114
  if tok_one.to_i >= 0 && tok_two.to_i >= 0 && tok_three == "obj"
112
115
  buf = new_buffer(offset)
116
+ # Maybe we should be passing the ObjectHash as the second argument to the Parser here,
117
+ # to handle the case where an XRef Stream has the Length specified via an
118
+ # indirect object
113
119
  stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
114
120
  return load_xref_stream(stream)
115
121
  end
@@ -125,12 +131,19 @@ class PDF::Reader
125
131
 
126
132
  while !params.include?("trailer") && !params.include?(nil)
127
133
  if params.size == 2
134
+ unless params[0].to_s.match(/\A\d+\z/)
135
+ raise MalformedPDFError, "invalid xref table, expected object ID"
136
+ end
137
+
128
138
  objid, count = params[0].to_i, params[1].to_i
129
139
  count.times do
130
140
  offset = buf.token.to_i
131
141
  generation = buf.token.to_i
132
142
  state = buf.token
133
143
 
144
+ # Some PDF writers start numbering at 1 instead of 0. Fix up the number.
145
+ # TODO should this fix be logged?
146
+ objid = 0 if objid == 1 and offset == 0 and generation == 65535 and state == 'f'
134
147
  store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
135
148
  objid += 1
136
149
  params.clear
@@ -139,14 +152,16 @@ class PDF::Reader
139
152
  params << buf.token
140
153
  end
141
154
 
142
- trailer = Parser.new(buf, self).parse_token
155
+ trailer = Parser.new(buf).parse_token
143
156
 
144
157
  unless trailer.kind_of?(Hash)
145
158
  raise MalformedPDFError, "PDF malformed, trailer should be a dictionary"
146
159
  end
147
160
 
148
161
  load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
149
- load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
162
+ # Some PDF creators seem to use '/Prev 0' in trailer if there is no previous xref
163
+ # It's not possible for an xref to appear at offset 0, so we can safely skip the ref
164
+ load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev) and trailer[:Prev].to_i != 0
150
165
 
151
166
  trailer
152
167
  end
@@ -162,8 +177,16 @@ class PDF::Reader
162
177
  [:Size, :Prev, :Root, :Encrypt, :Info, :ID].include?(key)
163
178
  }]
164
179
 
165
- widths = stream.hash[:W]
166
- entry_length = widths.inject(0) { |s, w| s + w }
180
+ widths = stream.hash[:W]
181
+
182
+ PDF::Reader::Error.validate_type_as_malformed(widths, "xref stream widths", Array)
183
+
184
+ entry_length = widths.inject(0) { |s, w|
185
+ unless w.is_a?(Integer)
186
+ w = 0
187
+ end
188
+ s + w
189
+ }
167
190
  raw_data = StringIO.new(stream.unfiltered_data)
168
191
  if stream.hash[:Index]
169
192
  index = stream.hash[:Index]
@@ -230,18 +253,21 @@ class PDF::Reader
230
253
  # should always be 0, but all sorts of crazy junk is prefixed to PDF files
231
254
  # in the real world.
232
255
  #
233
- # Checks up to 50 chars into the file, returns nil if no PDF data detected.
256
+ # Checks up to 1024 chars into the file,
257
+ # returns nil if no PDF data detected.
258
+ # Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
259
+ # header appear somewhere within the first 1024 bytes of the file
234
260
  #
235
261
  def calc_junk_offset(io)
236
262
  io.rewind
237
263
  offset = io.pos
238
- until (c = io.readchar) == '%' || c == 37 || offset > 50
264
+ until (c = io.readchar) == '%' || c == 37 || offset > 1024
239
265
  offset += 1
240
266
  end
241
267
  io.rewind
242
- offset < 50 ? offset : nil
268
+ offset < 1024 ? offset : nil
243
269
  rescue EOFError
244
- return nil
270
+ nil
245
271
  end
246
272
  end
247
273
  ################################################################################
@@ -0,0 +1,13 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ class PDF::Reader
6
+ # There's no point rendering zero-width characters
7
+ class ZeroWidthRunsFilter
8
+
9
+ def self.exclude_zero_width_runs(runs)
10
+ runs.reject { |run| run.width == 0 }
11
+ end
12
+ end
13
+ end
data/lib/pdf/reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -111,19 +112,27 @@ module PDF
111
112
  #
112
113
  # reader = PDF::Reader.new("somefile.pdf", :password => "apples")
113
114
  #
115
+ # Using this method directly is supported, but it's more common to use
116
+ # `PDF::Reader.open`
117
+ #
114
118
  def initialize(input, opts = {})
115
119
  @cache = PDF::Reader::ObjectCache.new
116
120
  opts.merge!(:cache => @cache)
117
121
  @objects = PDF::Reader::ObjectHash.new(input, opts)
118
122
  end
119
123
 
124
+ # Return a Hash with some basic information about the PDF file
125
+ #
120
126
  def info
121
- dict = @objects.deref(@objects.trailer[:Info])
127
+ dict = @objects.deref_hash(@objects.trailer[:Info]) || {}
122
128
  doc_strings_to_utf8(dict)
123
129
  end
124
130
 
131
+ # Return a String with extra XML metadata provided by the author of the PDF file. Not
132
+ # always present.
133
+ #
125
134
  def metadata
126
- stream = @objects.deref(root[:Metadata])
135
+ stream = @objects.deref_stream(root[:Metadata])
127
136
  if stream.nil?
128
137
  nil
129
138
  else
@@ -133,20 +142,24 @@ module PDF
133
142
  end
134
143
  end
135
144
 
145
+ # The number of pages in this PDF
146
+ #
136
147
  def page_count
137
- pages = @objects.deref(root[:Pages])
148
+ pages = @objects.deref_hash(root[:Pages])
138
149
  unless pages.kind_of?(::Hash)
139
- raise MalformedPDFError, 'Pages structure is missing'
150
+ raise MalformedPDFError, "Pages structure is missing #{pages.class}"
140
151
  end
141
- @page_count ||= @objects.deref(pages[:Count])
152
+ @page_count ||= @objects.deref_integer(pages[:Count]) || 0
142
153
  end
143
154
 
155
+ # The PDF version this file uses
156
+ #
144
157
  def pdf_version
145
158
  @objects.pdf_version
146
159
  end
147
160
 
148
- # syntactic sugar for opening a PDF file. Accepts the same arguments
149
- # as new().
161
+ # syntactic sugar for opening a PDF file and the most common approach. Accepts the
162
+ # same arguments as new().
150
163
  #
151
164
  # PDF::Reader.open("somefile.pdf") do |reader|
152
165
  # puts reader.pdf_version
@@ -169,7 +182,7 @@ module PDF
169
182
  #
170
183
  # reader.pages.each do |page|
171
184
  # puts page.fonts
172
- # puts page.images
185
+ # puts page.rectangles
173
186
  # puts page.text
174
187
  # end
175
188
  #
@@ -177,10 +190,12 @@ module PDF
177
190
  # methods available on each page
178
191
  #
179
192
  def pages
193
+ return [] if page_count <= 0
194
+
180
195
  (1..self.page_count).map do |num|
181
196
  begin
182
197
  PDF::Reader::Page.new(@objects, num, :cache => @cache)
183
- rescue InvalidPageError => ex
198
+ rescue InvalidPageError
184
199
  raise MalformedPDFError, "Missing data for page: #{num}"
185
200
  end
186
201
  end
@@ -221,16 +236,24 @@ module PDF
221
236
  when Array then
222
237
  obj.map { |item| doc_strings_to_utf8(item) }
223
238
  when String then
224
- if obj[0,2].unpack("C*") == [254, 255]
239
+ if has_utf16_bom?(obj)
225
240
  utf16_to_utf8(obj)
226
241
  else
227
242
  pdfdoc_to_utf8(obj)
228
243
  end
229
244
  else
230
- @objects.deref(obj)
245
+ obj
231
246
  end
232
247
  end
233
248
 
249
+ def has_utf16_bom?(str)
250
+ first_bytes = str[0,2]
251
+
252
+ return false if first_bytes.nil?
253
+
254
+ first_bytes.unpack("C*") == [254, 255]
255
+ end
256
+
234
257
  # TODO find a PDF I can use to spec this behaviour
235
258
  #
236
259
  def pdfdoc_to_utf8(obj)
@@ -242,28 +265,23 @@ module PDF
242
265
  # String#encode
243
266
  #
244
267
  def utf16_to_utf8(obj)
245
- str = obj[2, obj.size]
268
+ str = obj[2, obj.size].to_s
246
269
  str = str.unpack("n*").pack("U*")
247
270
  str.force_encoding("utf-8")
248
271
  str
249
272
  end
250
273
 
251
274
  def root
252
- @root ||= begin
253
- obj = @objects.deref(@objects.trailer[:Root])
254
- unless obj.kind_of?(::Hash)
255
- raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
256
- end
257
- obj
258
- end
275
+ @root ||= @objects.deref_hash(@objects.trailer[:Root]) || {}
259
276
  end
260
277
 
261
278
  end
262
279
  end
263
280
  ################################################################################
264
281
 
265
- require 'pdf/reader/resource_methods'
282
+ require 'pdf/reader/resources'
266
283
  require 'pdf/reader/buffer'
284
+ require 'pdf/reader/bounding_rectangle_runs_filter'
267
285
  require 'pdf/reader/cid_widths'
268
286
  require 'pdf/reader/cmap'
269
287
  require 'pdf/reader/encoding'
@@ -286,19 +304,26 @@ require 'pdf/reader/object_hash'
286
304
  require 'pdf/reader/object_stream'
287
305
  require 'pdf/reader/pages_strategy'
288
306
  require 'pdf/reader/parser'
307
+ require 'pdf/reader/point'
289
308
  require 'pdf/reader/print_receiver'
309
+ require 'pdf/reader/rectangle'
290
310
  require 'pdf/reader/reference'
291
311
  require 'pdf/reader/register_receiver'
312
+ require 'pdf/reader/no_text_filter'
292
313
  require 'pdf/reader/null_security_handler'
293
- require 'pdf/reader/standard_security_handler'
294
- require 'pdf/reader/standard_security_handler_v5'
314
+ require 'pdf/reader/security_handler_factory'
315
+ require 'pdf/reader/standard_key_builder'
316
+ require 'pdf/reader/key_builder_v5'
317
+ require 'pdf/reader/aes_v2_security_handler'
318
+ require 'pdf/reader/aes_v3_security_handler'
319
+ require 'pdf/reader/rc4_security_handler'
295
320
  require 'pdf/reader/unimplemented_security_handler'
296
321
  require 'pdf/reader/stream'
297
322
  require 'pdf/reader/text_run'
323
+ require 'pdf/reader/type_check'
298
324
  require 'pdf/reader/page_state'
299
325
  require 'pdf/reader/page_text_receiver'
300
326
  require 'pdf/reader/token'
301
327
  require 'pdf/reader/xref'
302
- require 'pdf/reader/orientation_detector'
303
328
  require 'pdf/reader/page'
304
- require 'pdf/hash'
329
+ require 'pdf/reader/validating_receiver'
data/lib/pdf-reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require "pdf/reader"