pdf-reader 2.6.0 → 2.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +21 -1
  3. data/Rakefile +1 -1
  4. data/examples/rspec.rb +1 -0
  5. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  6. data/lib/pdf/reader/buffer.rb +1 -0
  7. data/lib/pdf/reader/cid_widths.rb +1 -0
  8. data/lib/pdf/reader/cmap.rb +5 -3
  9. data/lib/pdf/reader/encoding.rb +2 -1
  10. data/lib/pdf/reader/error.rb +8 -0
  11. data/lib/pdf/reader/filter/ascii85.rb +2 -0
  12. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  13. data/lib/pdf/reader/filter/depredict.rb +7 -5
  14. data/lib/pdf/reader/filter/flate.rb +2 -0
  15. data/lib/pdf/reader/filter/lzw.rb +2 -0
  16. data/lib/pdf/reader/filter/null.rb +1 -0
  17. data/lib/pdf/reader/filter/run_length.rb +19 -13
  18. data/lib/pdf/reader/filter.rb +1 -0
  19. data/lib/pdf/reader/font.rb +44 -0
  20. data/lib/pdf/reader/font_descriptor.rb +1 -0
  21. data/lib/pdf/reader/form_xobject.rb +1 -0
  22. data/lib/pdf/reader/glyph_hash.rb +1 -0
  23. data/lib/pdf/reader/lzw.rb +4 -2
  24. data/lib/pdf/reader/null_security_handler.rb +1 -0
  25. data/lib/pdf/reader/object_cache.rb +1 -0
  26. data/lib/pdf/reader/object_hash.rb +5 -2
  27. data/lib/pdf/reader/object_stream.rb +1 -0
  28. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  29. data/lib/pdf/reader/page.rb +73 -11
  30. data/lib/pdf/reader/page_layout.rb +28 -32
  31. data/lib/pdf/reader/page_state.rb +11 -10
  32. data/lib/pdf/reader/page_text_receiver.rb +53 -9
  33. data/lib/pdf/reader/pages_strategy.rb +1 -0
  34. data/lib/pdf/reader/parser.rb +7 -1
  35. data/lib/pdf/reader/point.rb +25 -0
  36. data/lib/pdf/reader/print_receiver.rb +1 -0
  37. data/lib/pdf/reader/rectangle.rb +113 -0
  38. data/lib/pdf/reader/reference.rb +1 -0
  39. data/lib/pdf/reader/register_receiver.rb +1 -0
  40. data/lib/pdf/reader/resource_methods.rb +5 -0
  41. data/lib/pdf/reader/standard_security_handler.rb +1 -0
  42. data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
  43. data/lib/pdf/reader/stream.rb +1 -0
  44. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  45. data/lib/pdf/reader/text_run.rb +14 -6
  46. data/lib/pdf/reader/token.rb +1 -0
  47. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  48. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  49. data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
  50. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  51. data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
  52. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  53. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  54. data/lib/pdf/reader/width_calculator.rb +1 -0
  55. data/lib/pdf/reader/xref.rb +1 -0
  56. data/lib/pdf/reader/zero_width_runs_filter.rb +2 -0
  57. data/lib/pdf/reader.rb +29 -6
  58. data/lib/pdf-reader.rb +1 -0
  59. data/rbi/pdf-reader.rbi +1763 -0
  60. metadata +13 -10
  61. data/lib/pdf/reader/orientation_detector.rb +0 -34
data/lib/pdf/reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -111,17 +112,25 @@ module PDF
111
112
  #
112
113
  # reader = PDF::Reader.new("somefile.pdf", :password => "apples")
113
114
  #
115
+ # Using this method directly is supported, but it's more common to use
116
+ # `PDF::Reader.open`
117
+ #
114
118
  def initialize(input, opts = {})
115
119
  @cache = PDF::Reader::ObjectCache.new
116
120
  opts.merge!(:cache => @cache)
117
121
  @objects = PDF::Reader::ObjectHash.new(input, opts)
118
122
  end
119
123
 
124
+ # Return a Hash with some basic information about the PDF file
125
+ #
120
126
  def info
121
127
  dict = @objects.deref(@objects.trailer[:Info])
122
128
  doc_strings_to_utf8(dict)
123
129
  end
124
130
 
131
+ # Return a Hash with extra metadata provided by the author of the PDF file. Not
132
+ # always present.
133
+ #
125
134
  def metadata
126
135
  stream = @objects.deref(root[:Metadata])
127
136
  if stream.nil?
@@ -133,20 +142,24 @@ module PDF
133
142
  end
134
143
  end
135
144
 
145
+ # The number of pages in this PDF
146
+ #
136
147
  def page_count
137
148
  pages = @objects.deref(root[:Pages])
138
149
  unless pages.kind_of?(::Hash)
139
- raise MalformedPDFError, 'Pages structure is missing'
150
+ raise MalformedPDFError, "Pages structure is missing #{pages.class}"
140
151
  end
141
152
  @page_count ||= @objects.deref(pages[:Count])
142
153
  end
143
154
 
155
+ # The PDF version this file uses
156
+ #
144
157
  def pdf_version
145
158
  @objects.pdf_version
146
159
  end
147
160
 
148
- # syntactic sugar for opening a PDF file. Accepts the same arguments
149
- # as new().
161
+ # syntactic sugar for opening a PDF file and the most common approach. Accepts the
162
+ # same arguments as new().
150
163
  #
151
164
  # PDF::Reader.open("somefile.pdf") do |reader|
152
165
  # puts reader.pdf_version
@@ -221,7 +234,7 @@ module PDF
221
234
  when Array then
222
235
  obj.map { |item| doc_strings_to_utf8(item) }
223
236
  when String then
224
- if obj[0,2].unpack("C*") == [254, 255]
237
+ if has_utf16_bom?(obj)
225
238
  utf16_to_utf8(obj)
226
239
  else
227
240
  pdfdoc_to_utf8(obj)
@@ -231,6 +244,14 @@ module PDF
231
244
  end
232
245
  end
233
246
 
247
+ def has_utf16_bom?(str)
248
+ first_bytes = str[0,2]
249
+
250
+ return false if first_bytes.nil?
251
+
252
+ first_bytes.unpack("C*") == [254, 255]
253
+ end
254
+
234
255
  # TODO find a PDF I can use to spec this behaviour
235
256
  #
236
257
  def pdfdoc_to_utf8(obj)
@@ -242,7 +263,7 @@ module PDF
242
263
  # String#encode
243
264
  #
244
265
  def utf16_to_utf8(obj)
245
- str = obj[2, obj.size]
266
+ str = obj[2, obj.size].to_s
246
267
  str = str.unpack("n*").pack("U*")
247
268
  str.force_encoding("utf-8")
248
269
  str
@@ -264,6 +285,7 @@ end
264
285
 
265
286
  require 'pdf/reader/resource_methods'
266
287
  require 'pdf/reader/buffer'
288
+ require 'pdf/reader/bounding_rectangle_runs_filter'
267
289
  require 'pdf/reader/cid_widths'
268
290
  require 'pdf/reader/cmap'
269
291
  require 'pdf/reader/encoding'
@@ -286,7 +308,9 @@ require 'pdf/reader/object_hash'
286
308
  require 'pdf/reader/object_stream'
287
309
  require 'pdf/reader/pages_strategy'
288
310
  require 'pdf/reader/parser'
311
+ require 'pdf/reader/point'
289
312
  require 'pdf/reader/print_receiver'
313
+ require 'pdf/reader/rectangle'
290
314
  require 'pdf/reader/reference'
291
315
  require 'pdf/reader/register_receiver'
292
316
  require 'pdf/reader/null_security_handler'
@@ -299,5 +323,4 @@ require 'pdf/reader/page_state'
299
323
  require 'pdf/reader/page_text_receiver'
300
324
  require 'pdf/reader/token'
301
325
  require 'pdf/reader/xref'
302
- require 'pdf/reader/orientation_detector'
303
326
  require 'pdf/reader/page'
data/lib/pdf-reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require "pdf/reader"