pdf-reader 2.2.0 → 2.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +90 -0
  3. data/README.md +18 -3
  4. data/Rakefile +1 -1
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_text +1 -1
  7. data/examples/extract_fonts.rb +12 -7
  8. data/examples/rspec.rb +1 -0
  9. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  10. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  14. data/lib/pdf/reader/afm/Courier.afm +342 -342
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  26. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  27. data/lib/pdf/reader/buffer.rb +91 -47
  28. data/lib/pdf/reader/cid_widths.rb +7 -4
  29. data/lib/pdf/reader/cmap.rb +83 -59
  30. data/lib/pdf/reader/encoding.rb +17 -14
  31. data/lib/pdf/reader/error.rb +15 -3
  32. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  33. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  34. data/lib/pdf/reader/filter/depredict.rb +12 -10
  35. data/lib/pdf/reader/filter/flate.rb +30 -16
  36. data/lib/pdf/reader/filter/lzw.rb +2 -0
  37. data/lib/pdf/reader/filter/null.rb +1 -1
  38. data/lib/pdf/reader/filter/run_length.rb +19 -13
  39. data/lib/pdf/reader/filter.rb +11 -11
  40. data/lib/pdf/reader/font.rb +89 -26
  41. data/lib/pdf/reader/font_descriptor.rb +22 -18
  42. data/lib/pdf/reader/form_xobject.rb +18 -5
  43. data/lib/pdf/reader/glyph_hash.rb +28 -13
  44. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  45. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  46. data/lib/pdf/reader/lzw.rb +28 -11
  47. data/lib/pdf/reader/no_text_filter.rb +14 -0
  48. data/lib/pdf/reader/null_security_handler.rb +1 -4
  49. data/lib/pdf/reader/object_cache.rb +1 -0
  50. data/lib/pdf/reader/object_hash.rb +292 -63
  51. data/lib/pdf/reader/object_stream.rb +3 -2
  52. data/lib/pdf/reader/overlapping_runs_filter.rb +72 -0
  53. data/lib/pdf/reader/page.rb +143 -16
  54. data/lib/pdf/reader/page_layout.rb +43 -39
  55. data/lib/pdf/reader/page_state.rb +26 -17
  56. data/lib/pdf/reader/page_text_receiver.rb +74 -4
  57. data/lib/pdf/reader/pages_strategy.rb +1 -0
  58. data/lib/pdf/reader/parser.rb +34 -14
  59. data/lib/pdf/reader/point.rb +25 -0
  60. data/lib/pdf/reader/print_receiver.rb +1 -0
  61. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  62. data/lib/pdf/reader/rectangle.rb +113 -0
  63. data/lib/pdf/reader/reference.rb +3 -1
  64. data/lib/pdf/reader/register_receiver.rb +1 -0
  65. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +17 -9
  66. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  67. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
  68. data/lib/pdf/reader/stream.rb +3 -2
  69. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  70. data/lib/pdf/reader/text_run.rb +40 -5
  71. data/lib/pdf/reader/token.rb +1 -0
  72. data/lib/pdf/reader/transformation_matrix.rb +8 -7
  73. data/lib/pdf/reader/type_check.rb +98 -0
  74. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  75. data/lib/pdf/reader/validating_receiver.rb +262 -0
  76. data/lib/pdf/reader/width_calculator/built_in.rb +27 -17
  77. data/lib/pdf/reader/width_calculator/composite.rb +6 -1
  78. data/lib/pdf/reader/width_calculator/true_type.rb +10 -11
  79. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +6 -4
  80. data/lib/pdf/reader/width_calculator/type_zero.rb +6 -2
  81. data/lib/pdf/reader/width_calculator.rb +1 -0
  82. data/lib/pdf/reader/xref.rb +37 -11
  83. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  84. data/lib/pdf/reader.rb +49 -24
  85. data/lib/pdf-reader.rb +1 -0
  86. data/rbi/pdf-reader.rbi +2048 -0
  87. metadata +39 -23
  88. data/lib/pdf/hash.rb +0 -20
  89. data/lib/pdf/reader/orientation_detector.rb +0 -34
  90. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -72,7 +73,7 @@ class PDF::Reader
72
73
  #
73
74
  # ref - a PDF::Reader::Reference object containing an object ID and revision number
74
75
  def [](ref)
75
- @xref[ref.id][ref.gen]
76
+ @xref.fetch(ref.id, {}).fetch(ref.gen)
76
77
  rescue
77
78
  raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
78
79
  end
@@ -81,8 +82,8 @@ class PDF::Reader
81
82
  def each(&block)
82
83
  ids = @xref.keys.sort
83
84
  ids.each do |id|
84
- gen = @xref[id].keys.sort[-1]
85
- yield PDF::Reader::Reference.new(id, gen)
85
+ gen = @xref.fetch(id, {}).keys.sort[-1]
86
+ yield PDF::Reader::Reference.new(id, gen.to_i)
86
87
  end
87
88
  end
88
89
  ################################################################################
@@ -103,13 +104,18 @@ class PDF::Reader
103
104
  buf = new_buffer(offset)
104
105
  tok_one = buf.token
105
106
 
107
+ # we have a traditional xref table
106
108
  return load_xref_table(buf) if tok_one == "xref" || tok_one == "ref"
107
109
 
108
110
  tok_two = buf.token
109
111
  tok_three = buf.token
110
112
 
113
+ # we have an XRef stream
111
114
  if tok_one.to_i >= 0 && tok_two.to_i >= 0 && tok_three == "obj"
112
115
  buf = new_buffer(offset)
116
+ # Maybe we should be passing the ObjectHash as the second argument to the Parser here,
117
+ # to handle the case where an XRef Stream has the Length specified via an
118
+ # indirect object
113
119
  stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
114
120
  return load_xref_stream(stream)
115
121
  end
@@ -125,12 +131,19 @@ class PDF::Reader
125
131
 
126
132
  while !params.include?("trailer") && !params.include?(nil)
127
133
  if params.size == 2
134
+ unless params[0].to_s.match(/\A\d+\z/)
135
+ raise MalformedPDFError, "invalid xref table, expected object ID"
136
+ end
137
+
128
138
  objid, count = params[0].to_i, params[1].to_i
129
139
  count.times do
130
140
  offset = buf.token.to_i
131
141
  generation = buf.token.to_i
132
142
  state = buf.token
133
143
 
144
+ # Some PDF writers start numbering at 1 instead of 0. Fix up the number.
145
+ # TODO should this fix be logged?
146
+ objid = 0 if objid == 1 and offset == 0 and generation == 65535 and state == 'f'
134
147
  store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
135
148
  objid += 1
136
149
  params.clear
@@ -139,14 +152,16 @@ class PDF::Reader
139
152
  params << buf.token
140
153
  end
141
154
 
142
- trailer = Parser.new(buf, self).parse_token
155
+ trailer = Parser.new(buf).parse_token
143
156
 
144
157
  unless trailer.kind_of?(Hash)
145
158
  raise MalformedPDFError, "PDF malformed, trailer should be a dictionary"
146
159
  end
147
160
 
148
161
  load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
149
- load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
162
+ # Some PDF creators seem to use '/Prev 0' in trailer if there is no previous xref
163
+ # It's not possible for an xref to appear at offset 0, so can safely skip the ref
164
+ load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev) and trailer[:Prev].to_i != 0
150
165
 
151
166
  trailer
152
167
  end
@@ -162,8 +177,16 @@ class PDF::Reader
162
177
  [:Size, :Prev, :Root, :Encrypt, :Info, :ID].include?(key)
163
178
  }]
164
179
 
165
- widths = stream.hash[:W]
166
- entry_length = widths.inject(0) { |s, w| s + w }
180
+ widths = stream.hash[:W]
181
+
182
+ PDF::Reader::Error.validate_type_as_malformed(widths, "xref stream widths", Array)
183
+
184
+ entry_length = widths.inject(0) { |s, w|
185
+ unless w.is_a?(Integer)
186
+ w = 0
187
+ end
188
+ s + w
189
+ }
167
190
  raw_data = StringIO.new(stream.unfiltered_data)
168
191
  if stream.hash[:Index]
169
192
  index = stream.hash[:Index]
@@ -230,18 +253,21 @@ class PDF::Reader
230
253
  # should always be 0, but all sort of crazy junk is prefixed to PDF files
231
254
  # in the real world.
232
255
  #
233
- # Checks up to 50 chars into the file, returns nil if no PDF data detected.
256
+ # Checks up to 1024 chars into the file,
257
+ # returns nil if no PDF data detected.
258
+ # Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
259
+ # header appear somewhere within the first 1024 bytes of the file
234
260
  #
235
261
  def calc_junk_offset(io)
236
262
  io.rewind
237
263
  offset = io.pos
238
- until (c = io.readchar) == '%' || c == 37 || offset > 50
264
+ until (c = io.readchar) == '%' || c == 37 || offset > 1024
239
265
  offset += 1
240
266
  end
241
267
  io.rewind
242
- offset < 50 ? offset : nil
268
+ offset < 1024 ? offset : nil
243
269
  rescue EOFError
244
- return nil
270
+ nil
245
271
  end
246
272
  end
247
273
  ################################################################################
@@ -0,0 +1,13 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ class PDF::Reader
6
+ # There's no point rendering zero-width characters
7
+ class ZeroWidthRunsFilter
8
+
9
+ def self.exclude_zero_width_runs(runs)
10
+ runs.reject { |run| run.width == 0 }
11
+ end
12
+ end
13
+ end
data/lib/pdf/reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -111,19 +112,27 @@ module PDF
111
112
  #
112
113
  # reader = PDF::Reader.new("somefile.pdf", :password => "apples")
113
114
  #
115
+ # Using this method directly is supported, but it's more common to use
116
+ # `PDF::Reader.open`
117
+ #
114
118
  def initialize(input, opts = {})
115
119
  @cache = PDF::Reader::ObjectCache.new
116
120
  opts.merge!(:cache => @cache)
117
121
  @objects = PDF::Reader::ObjectHash.new(input, opts)
118
122
  end
119
123
 
124
+ # Return a Hash with some basic information about the PDF file
125
+ #
120
126
  def info
121
- dict = @objects.deref(@objects.trailer[:Info])
127
+ dict = @objects.deref_hash(@objects.trailer[:Info]) || {}
122
128
  doc_strings_to_utf8(dict)
123
129
  end
124
130
 
131
+ # Return a String with extra XML metadata provided by the author of the PDF file. Not
132
+ # always present.
133
+ #
125
134
  def metadata
126
- stream = @objects.deref(root[:Metadata])
135
+ stream = @objects.deref_stream(root[:Metadata])
127
136
  if stream.nil?
128
137
  nil
129
138
  else
@@ -133,20 +142,24 @@ module PDF
133
142
  end
134
143
  end
135
144
 
145
+ # The number of pages in this PDF
146
+ #
136
147
  def page_count
137
- pages = @objects.deref(root[:Pages])
148
+ pages = @objects.deref_hash(root[:Pages])
138
149
  unless pages.kind_of?(::Hash)
139
- raise MalformedPDFError, 'Pages structure is missing'
150
+ raise MalformedPDFError, "Pages structure is missing #{pages.class}"
140
151
  end
141
- @page_count ||= @objects.deref(pages[:Count])
152
+ @page_count ||= @objects.deref_integer(pages[:Count]) || 0
142
153
  end
143
154
 
155
+ # The PDF version this file uses
156
+ #
144
157
  def pdf_version
145
158
  @objects.pdf_version
146
159
  end
147
160
 
148
- # syntactic sugar for opening a PDF file. Accepts the same arguments
149
- # as new().
161
+ # syntactic sugar for opening a PDF file and the most common approach. Accepts the
162
+ # same arguments as new().
150
163
  #
151
164
  # PDF::Reader.open("somefile.pdf") do |reader|
152
165
  # puts reader.pdf_version
@@ -169,7 +182,7 @@ module PDF
169
182
  #
170
183
  # reader.pages.each do |page|
171
184
  # puts page.fonts
172
- # puts page.images
185
+ # puts page.rectangles
173
186
  # puts page.text
174
187
  # end
175
188
  #
@@ -177,10 +190,12 @@ module PDF
177
190
  # methods available on each page
178
191
  #
179
192
  def pages
193
+ return [] if page_count <= 0
194
+
180
195
  (1..self.page_count).map do |num|
181
196
  begin
182
197
  PDF::Reader::Page.new(@objects, num, :cache => @cache)
183
- rescue InvalidPageError => ex
198
+ rescue InvalidPageError
184
199
  raise MalformedPDFError, "Missing data for page: #{num}"
185
200
  end
186
201
  end
@@ -221,16 +236,24 @@ module PDF
221
236
  when Array then
222
237
  obj.map { |item| doc_strings_to_utf8(item) }
223
238
  when String then
224
- if obj[0,2].unpack("C*") == [254, 255]
239
+ if has_utf16_bom?(obj)
225
240
  utf16_to_utf8(obj)
226
241
  else
227
242
  pdfdoc_to_utf8(obj)
228
243
  end
229
244
  else
230
- @objects.deref(obj)
245
+ obj
231
246
  end
232
247
  end
233
248
 
249
+ def has_utf16_bom?(str)
250
+ first_bytes = str[0,2]
251
+
252
+ return false if first_bytes.nil?
253
+
254
+ first_bytes.unpack("C*") == [254, 255]
255
+ end
256
+
234
257
  # TODO find a PDF I can use to spec this behaviour
235
258
  #
236
259
  def pdfdoc_to_utf8(obj)
@@ -242,28 +265,23 @@ module PDF
242
265
  # String#encode
243
266
  #
244
267
  def utf16_to_utf8(obj)
245
- str = obj[2, obj.size]
268
+ str = obj[2, obj.size].to_s
246
269
  str = str.unpack("n*").pack("U*")
247
270
  str.force_encoding("utf-8")
248
271
  str
249
272
  end
250
273
 
251
274
  def root
252
- @root ||= begin
253
- obj = @objects.deref(@objects.trailer[:Root])
254
- unless obj.kind_of?(::Hash)
255
- raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
256
- end
257
- obj
258
- end
275
+ @root ||= @objects.deref_hash(@objects.trailer[:Root]) || {}
259
276
  end
260
277
 
261
278
  end
262
279
  end
263
280
  ################################################################################
264
281
 
265
- require 'pdf/reader/resource_methods'
282
+ require 'pdf/reader/resources'
266
283
  require 'pdf/reader/buffer'
284
+ require 'pdf/reader/bounding_rectangle_runs_filter'
267
285
  require 'pdf/reader/cid_widths'
268
286
  require 'pdf/reader/cmap'
269
287
  require 'pdf/reader/encoding'
@@ -286,19 +304,26 @@ require 'pdf/reader/object_hash'
286
304
  require 'pdf/reader/object_stream'
287
305
  require 'pdf/reader/pages_strategy'
288
306
  require 'pdf/reader/parser'
307
+ require 'pdf/reader/point'
289
308
  require 'pdf/reader/print_receiver'
309
+ require 'pdf/reader/rectangle'
290
310
  require 'pdf/reader/reference'
291
311
  require 'pdf/reader/register_receiver'
312
+ require 'pdf/reader/no_text_filter'
292
313
  require 'pdf/reader/null_security_handler'
293
- require 'pdf/reader/standard_security_handler'
294
- require 'pdf/reader/standard_security_handler_v5'
314
+ require 'pdf/reader/security_handler_factory'
315
+ require 'pdf/reader/standard_key_builder'
316
+ require 'pdf/reader/key_builder_v5'
317
+ require 'pdf/reader/aes_v2_security_handler'
318
+ require 'pdf/reader/aes_v3_security_handler'
319
+ require 'pdf/reader/rc4_security_handler'
295
320
  require 'pdf/reader/unimplemented_security_handler'
296
321
  require 'pdf/reader/stream'
297
322
  require 'pdf/reader/text_run'
323
+ require 'pdf/reader/type_check'
298
324
  require 'pdf/reader/page_state'
299
325
  require 'pdf/reader/page_text_receiver'
300
326
  require 'pdf/reader/token'
301
327
  require 'pdf/reader/xref'
302
- require 'pdf/reader/orientation_detector'
303
328
  require 'pdf/reader/page'
304
- require 'pdf/hash'
329
+ require 'pdf/reader/validating_receiver'
data/lib/pdf-reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require "pdf/reader"