pdf-reader 2.4.2 → 2.8.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (64) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +44 -0
  3. data/README.md +16 -1
  4. data/Rakefile +1 -1
  5. data/examples/extract_fonts.rb +12 -7
  6. data/examples/rspec.rb +1 -0
  7. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  8. data/lib/pdf/reader/buffer.rb +63 -21
  9. data/lib/pdf/reader/cid_widths.rb +1 -0
  10. data/lib/pdf/reader/cmap.rb +5 -3
  11. data/lib/pdf/reader/encoding.rb +3 -2
  12. data/lib/pdf/reader/error.rb +11 -3
  13. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  14. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  15. data/lib/pdf/reader/filter/depredict.rb +10 -8
  16. data/lib/pdf/reader/filter/flate.rb +4 -2
  17. data/lib/pdf/reader/filter/lzw.rb +2 -0
  18. data/lib/pdf/reader/filter/null.rb +1 -0
  19. data/lib/pdf/reader/filter/run_length.rb +19 -13
  20. data/lib/pdf/reader/filter.rb +1 -0
  21. data/lib/pdf/reader/font.rb +44 -0
  22. data/lib/pdf/reader/font_descriptor.rb +1 -0
  23. data/lib/pdf/reader/form_xobject.rb +1 -0
  24. data/lib/pdf/reader/glyph_hash.rb +16 -9
  25. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  26. data/lib/pdf/reader/lzw.rb +4 -2
  27. data/lib/pdf/reader/null_security_handler.rb +1 -0
  28. data/lib/pdf/reader/object_cache.rb +1 -0
  29. data/lib/pdf/reader/object_hash.rb +8 -3
  30. data/lib/pdf/reader/object_stream.rb +1 -0
  31. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  32. data/lib/pdf/reader/page.rb +73 -11
  33. data/lib/pdf/reader/page_layout.rb +37 -37
  34. data/lib/pdf/reader/page_state.rb +18 -23
  35. data/lib/pdf/reader/page_text_receiver.rb +68 -6
  36. data/lib/pdf/reader/pages_strategy.rb +1 -0
  37. data/lib/pdf/reader/parser.rb +15 -7
  38. data/lib/pdf/reader/point.rb +25 -0
  39. data/lib/pdf/reader/print_receiver.rb +1 -0
  40. data/lib/pdf/reader/rectangle.rb +113 -0
  41. data/lib/pdf/reader/reference.rb +1 -0
  42. data/lib/pdf/reader/register_receiver.rb +1 -0
  43. data/lib/pdf/reader/resource_methods.rb +5 -0
  44. data/lib/pdf/reader/standard_security_handler.rb +1 -0
  45. data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
  46. data/lib/pdf/reader/stream.rb +1 -0
  47. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  48. data/lib/pdf/reader/text_run.rb +14 -6
  49. data/lib/pdf/reader/token.rb +1 -0
  50. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  51. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  52. data/lib/pdf/reader/width_calculator/built_in.rb +8 -15
  53. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  54. data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
  55. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  56. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  57. data/lib/pdf/reader/width_calculator.rb +1 -0
  58. data/lib/pdf/reader/xref.rb +7 -1
  59. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  60. data/lib/pdf/reader.rb +29 -6
  61. data/lib/pdf-reader.rb +1 -0
  62. data/rbi/pdf-reader.rbi +1763 -0
  63. metadata +12 -7
  64. data/lib/pdf/reader/orientation_detector.rb +0 -34
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'afm'
@@ -37,23 +38,15 @@ class PDF::Reader
37
38
  def glyph_width(code_point)
38
39
  return 0 if code_point.nil? || code_point < 0
39
40
 
40
- m = @metrics.char_metrics_by_code[code_point]
41
- if m.nil?
42
- names = @font.encoding.int_to_name(code_point)
41
+ names = @font.encoding.int_to_name(code_point)
42
+ metrics = names.map { |name|
43
+ @metrics.char_metrics[name.to_s]
44
+ }.compact.first
43
45
 
44
- m = names.map { |name|
45
- @metrics.char_metrics[name.to_s]
46
- }.compact.first
47
- end
48
-
49
- if m
50
- m[:wx]
51
- elsif @font.widths[code_point - 1]
52
- @font.widths[code_point - 1]
53
- elsif control_character?(code_point)
54
- 0
46
+ if metrics
47
+ metrics[:wx]
55
48
  else
56
- 0
49
+ @font.widths[code_point - 1] || 0
57
50
  end
58
51
  end
59
52
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  # PDF files may define fonts in a number of ways. Each approach means we must
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -131,6 +132,9 @@ class PDF::Reader
131
132
  generation = buf.token.to_i
132
133
  state = buf.token
133
134
 
135
+ # Some PDF writers start numbering at 1 instead of 0. Fix up the number.
136
+ # TODO should this fix be logged?
137
+ objid = 0 if objid == 1 and offset == 0 and generation == 65535 and state == 'f'
134
138
  store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
135
139
  objid += 1
136
140
  params.clear
@@ -146,7 +150,9 @@ class PDF::Reader
146
150
  end
147
151
 
148
152
  load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
149
- load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
153
+ # Some PDF creators seem to use '/Prev 0' in trailer if there is no previous xref
154
+ # It's not possible for an xref to appear at offset 0, so can safely skip the ref
155
+ load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev) and trailer[:Prev].to_i != 0
150
156
 
151
157
  trailer
152
158
  end
@@ -0,0 +1,13 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ class PDF::Reader
6
+ # There's no point rendering zero-width characters
7
+ class ZeroWidthRunsFilter
8
+
9
+ def self.exclude_zero_width_runs(runs)
10
+ runs.reject { |run| run.width == 0 }
11
+ end
12
+ end
13
+ end
data/lib/pdf/reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -111,17 +112,25 @@ module PDF
111
112
  #
112
113
  # reader = PDF::Reader.new("somefile.pdf", :password => "apples")
113
114
  #
115
+ # Using this method directly is supported, but it's more common to use
116
+ # `PDF::Reader.open`
117
+ #
114
118
  def initialize(input, opts = {})
115
119
  @cache = PDF::Reader::ObjectCache.new
116
120
  opts.merge!(:cache => @cache)
117
121
  @objects = PDF::Reader::ObjectHash.new(input, opts)
118
122
  end
119
123
 
124
+ # Return a Hash with some basic information about the PDF file
125
+ #
120
126
  def info
121
127
  dict = @objects.deref(@objects.trailer[:Info])
122
128
  doc_strings_to_utf8(dict)
123
129
  end
124
130
 
131
+ # Return a Hash with extra metadata provided by the author of the PDF file. Not
132
+ # always present.
133
+ #
125
134
  def metadata
126
135
  stream = @objects.deref(root[:Metadata])
127
136
  if stream.nil?
@@ -133,20 +142,24 @@ module PDF
133
142
  end
134
143
  end
135
144
 
145
+ # The number of pages in this PDF
146
+ #
136
147
  def page_count
137
148
  pages = @objects.deref(root[:Pages])
138
149
  unless pages.kind_of?(::Hash)
139
- raise MalformedPDFError, 'Pages structure is missing'
150
+ raise MalformedPDFError, "Pages structure is missing #{pages.class}"
140
151
  end
141
152
  @page_count ||= @objects.deref(pages[:Count])
142
153
  end
143
154
 
155
+ # The PDF version this file uses
156
+ #
144
157
  def pdf_version
145
158
  @objects.pdf_version
146
159
  end
147
160
 
148
- # syntactic sugar for opening a PDF file. Accepts the same arguments
149
- # as new().
161
+ # syntactic sugar for opening a PDF file and the most common approach. Accepts the
162
+ # same arguments as new().
150
163
  #
151
164
  # PDF::Reader.open("somefile.pdf") do |reader|
152
165
  # puts reader.pdf_version
@@ -221,7 +234,7 @@ module PDF
221
234
  when Array then
222
235
  obj.map { |item| doc_strings_to_utf8(item) }
223
236
  when String then
224
- if obj[0,2].unpack("C*") == [254, 255]
237
+ if has_utf16_bom?(obj)
225
238
  utf16_to_utf8(obj)
226
239
  else
227
240
  pdfdoc_to_utf8(obj)
@@ -231,6 +244,14 @@ module PDF
231
244
  end
232
245
  end
233
246
 
247
+ def has_utf16_bom?(str)
248
+ first_bytes = str[0,2]
249
+
250
+ return false if first_bytes.nil?
251
+
252
+ first_bytes.unpack("C*") == [254, 255]
253
+ end
254
+
234
255
  # TODO find a PDF I can use to spec this behaviour
235
256
  #
236
257
  def pdfdoc_to_utf8(obj)
@@ -242,7 +263,7 @@ module PDF
242
263
  # String#encode
243
264
  #
244
265
  def utf16_to_utf8(obj)
245
- str = obj[2, obj.size]
266
+ str = obj[2, obj.size].to_s
246
267
  str = str.unpack("n*").pack("U*")
247
268
  str.force_encoding("utf-8")
248
269
  str
@@ -264,6 +285,7 @@ end
264
285
 
265
286
  require 'pdf/reader/resource_methods'
266
287
  require 'pdf/reader/buffer'
288
+ require 'pdf/reader/bounding_rectangle_runs_filter'
267
289
  require 'pdf/reader/cid_widths'
268
290
  require 'pdf/reader/cmap'
269
291
  require 'pdf/reader/encoding'
@@ -286,7 +308,9 @@ require 'pdf/reader/object_hash'
286
308
  require 'pdf/reader/object_stream'
287
309
  require 'pdf/reader/pages_strategy'
288
310
  require 'pdf/reader/parser'
311
+ require 'pdf/reader/point'
289
312
  require 'pdf/reader/print_receiver'
313
+ require 'pdf/reader/rectangle'
290
314
  require 'pdf/reader/reference'
291
315
  require 'pdf/reader/register_receiver'
292
316
  require 'pdf/reader/null_security_handler'
@@ -299,5 +323,4 @@ require 'pdf/reader/page_state'
299
323
  require 'pdf/reader/page_text_receiver'
300
324
  require 'pdf/reader/token'
301
325
  require 'pdf/reader/xref'
302
- require 'pdf/reader/orientation_detector'
303
326
  require 'pdf/reader/page'
data/lib/pdf-reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require "pdf/reader"