pdf-reader 2.9.2 → 2.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +39 -0
  3. data/README.md +33 -33
  4. data/Rakefile +2 -2
  5. data/lib/pdf/reader/advanced_text_run_filter.rb +152 -0
  6. data/lib/pdf/reader/aes_v2_security_handler.rb +30 -0
  7. data/lib/pdf/reader/aes_v3_security_handler.rb +35 -3
  8. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +1 -0
  9. data/lib/pdf/reader/buffer.rb +39 -22
  10. data/lib/pdf/reader/cid_widths.rb +14 -6
  11. data/lib/pdf/reader/cmap.rb +16 -5
  12. data/lib/pdf/reader/encoding.rb +42 -18
  13. data/lib/pdf/reader/error.rb +6 -4
  14. data/lib/pdf/reader/filter/ascii85.rb +2 -0
  15. data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
  16. data/lib/pdf/reader/filter/depredict.rb +6 -2
  17. data/lib/pdf/reader/filter/flate.rb +5 -2
  18. data/lib/pdf/reader/filter/lzw.rb +2 -0
  19. data/lib/pdf/reader/filter/null.rb +2 -0
  20. data/lib/pdf/reader/filter/run_length.rb +2 -0
  21. data/lib/pdf/reader/filter.rb +1 -0
  22. data/lib/pdf/reader/font.rb +99 -32
  23. data/lib/pdf/reader/font_descriptor.rb +79 -24
  24. data/lib/pdf/reader/form_xobject.rb +15 -1
  25. data/lib/pdf/reader/glyph_hash.rb +41 -8
  26. data/lib/pdf/reader/key_builder_v5.rb +17 -9
  27. data/lib/pdf/reader/lzw.rb +42 -16
  28. data/lib/pdf/reader/no_text_filter.rb +15 -0
  29. data/lib/pdf/reader/null_security_handler.rb +1 -0
  30. data/lib/pdf/reader/object_cache.rb +7 -2
  31. data/lib/pdf/reader/object_hash.rb +129 -16
  32. data/lib/pdf/reader/object_stream.rb +22 -5
  33. data/lib/pdf/reader/overlapping_runs_filter.rb +8 -2
  34. data/lib/pdf/reader/page.rb +66 -13
  35. data/lib/pdf/reader/page_layout.rb +26 -9
  36. data/lib/pdf/reader/page_state.rb +12 -3
  37. data/lib/pdf/reader/page_text_receiver.rb +16 -2
  38. data/lib/pdf/reader/pages_strategy.rb +1 -1
  39. data/lib/pdf/reader/parser.rb +52 -13
  40. data/lib/pdf/reader/point.rb +9 -2
  41. data/lib/pdf/reader/print_receiver.rb +2 -6
  42. data/lib/pdf/reader/rc4_security_handler.rb +2 -0
  43. data/lib/pdf/reader/rectangle.rb +24 -1
  44. data/lib/pdf/reader/reference.rb +13 -3
  45. data/lib/pdf/reader/register_receiver.rb +15 -2
  46. data/lib/pdf/reader/resources.rb +12 -2
  47. data/lib/pdf/reader/security_handler_factory.rb +13 -0
  48. data/lib/pdf/reader/standard_key_builder.rb +37 -23
  49. data/lib/pdf/reader/stream.rb +9 -3
  50. data/lib/pdf/reader/synchronized_cache.rb +6 -3
  51. data/lib/pdf/reader/text_run.rb +33 -3
  52. data/lib/pdf/reader/token.rb +1 -0
  53. data/lib/pdf/reader/transformation_matrix.rb +41 -10
  54. data/lib/pdf/reader/type_check.rb +53 -0
  55. data/lib/pdf/reader/unimplemented_security_handler.rb +2 -0
  56. data/lib/pdf/reader/validating_receiver.rb +29 -0
  57. data/lib/pdf/reader/width_calculator/built_in.rb +13 -5
  58. data/lib/pdf/reader/width_calculator/composite.rb +11 -3
  59. data/lib/pdf/reader/width_calculator/true_type.rb +14 -12
  60. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +8 -5
  61. data/lib/pdf/reader/width_calculator/type_zero.rb +8 -3
  62. data/lib/pdf/reader/xref.rb +31 -10
  63. data/lib/pdf/reader/zero_width_runs_filter.rb +1 -0
  64. data/lib/pdf/reader.rb +24 -12
  65. data/rbi/pdf-reader.rbi +1504 -1480
  66. metadata +34 -17
@@ -1,6 +1,6 @@
1
- # typed: true
2
1
  # coding: utf-8
3
2
  # frozen_string_literal: true
3
+ # typed: strict
4
4
 
5
5
  class PDF::Reader
6
6
  # remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
@@ -9,8 +9,9 @@ class PDF::Reader
9
9
 
10
10
  # This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
11
11
  # have identical characters) then one will be discarded
12
- OVERLAPPING_THRESHOLD = 0.5
12
+ OVERLAPPING_THRESHOLD = 0.5 #: Float
13
13
 
14
+ #: (Array[PDF::Reader::TextRun]) -> Array[PDF::Reader::TextRun]
14
15
  def self.exclude_redundant_runs(runs)
15
16
  sweep_line_status = Array.new
16
17
  event_point_schedule = Array.new
@@ -38,6 +39,7 @@ class PDF::Reader
38
39
  runs - to_exclude
39
40
  end
40
41
 
42
+ #: (Array[PDF::Reader::TextRun], PDF::Reader::EventPoint) -> bool
41
43
  def self.detect_intersection(sweep_line_status, event_point)
42
44
  sweep_line_status.each do |open_text_run|
43
45
  if open_text_run.text == event_point.run.text &&
@@ -55,15 +57,19 @@ class PDF::Reader
55
57
  # looking for duplicates
56
58
  class EventPoint
57
59
 
60
+ #: Numeric
58
61
  attr_reader :x
59
62
 
63
+ #: PDF::Reader::TextRun
60
64
  attr_reader :run
61
65
 
66
+ #: (Numeric, PDF::Reader::TextRun) -> void
62
67
  def initialize(x, run)
63
68
  @x = x
64
69
  @run = run
65
70
  end
66
71
 
72
+ #: () -> bool
67
73
  def start?
68
74
  @x == @run.x
69
75
  end
@@ -1,7 +1,9 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
+ require 'set'
6
+
5
7
  module PDF
6
8
  class Reader
7
9
 
@@ -17,14 +19,17 @@ module PDF
17
19
  extend Forwardable
18
20
 
19
21
  # lowlevel hash-like access to all objects in the underlying PDF
22
+ #: PDF::Reader::ObjectHash
20
23
  attr_reader :objects
21
24
 
22
25
  # the raw PDF object that defines this page
26
+ #: Hash[Symbol, untyped]
23
27
  attr_reader :page_object
24
28
 
25
29
  # a Hash-like object for storing cached data. Generally this is scoped to
26
30
  # the current document and is used to avoid repeating expensive
27
31
  # operations
32
+ #: PDF::Reader::ObjectCache | Hash[untyped, untyped]
28
33
  attr_reader :cache
29
34
 
30
35
  def_delegators :resources, :color_spaces
@@ -41,24 +46,32 @@ module PDF
41
46
  # * objects - an ObjectHash instance that wraps a PDF file
42
47
  # * pagenum - an int specifying the page number to expose. 1 indexed.
43
48
  #
49
+ #: (PDF::Reader::ObjectHash, Integer, ?Hash[Symbol, untyped]) -> void
44
50
  def initialize(objects, pagenum, options = {})
45
- @objects, @pagenum = objects, pagenum
46
- @page_object = objects.deref_hash(objects.page_references[pagenum - 1])
47
- @cache = options[:cache] || {}
48
-
49
- unless @page_object.is_a?(::Hash)
51
+ @objects = objects
52
+ @pagenum = pagenum
53
+ @page_ref = objects.page_references[pagenum - 1] #: (Reference | Hash[Symbol, untyped])?
54
+ @page_object = objects.deref_hash(@page_ref) || {} #: Hash[Symbol, untyped]
55
+ @cache = options[:cache] || {} #: PDF::Reader::ObjectCache | Hash[untyped, untyped]
56
+ @attributes = nil #: Hash[Symbol, untyped] | nil
57
+ @root = nil #: Hash[Symbol, untyped] | nil
58
+ @resources = nil #: PDF::Reader::Resources | nil
59
+
60
+ if @page_object.empty?
50
61
  raise InvalidPageError, "Invalid page: #{pagenum}"
51
62
  end
52
63
  end
53
64
 
54
65
  # return the number of this page within the full document
55
66
  #
67
+ #: () -> Integer
56
68
  def number
57
69
  @pagenum
58
70
  end
59
71
 
60
72
  # return a friendly string representation of this page
61
73
  #
74
+ #: () -> String
62
75
  def inspect
63
76
  "<PDF::Reader::Page page: #{@pagenum}>"
64
77
  end
@@ -66,6 +79,7 @@ module PDF
66
79
  # Returns the attributes that accompany this page, including
67
80
  # attributes inherited from parents.
68
81
  #
82
+ #: () -> Hash[Symbol, untyped]
69
83
  def attributes
70
84
  @attributes ||= {}.tap { |hash|
71
85
  page_with_ancestors.reverse.each do |obj|
@@ -78,18 +92,21 @@ module PDF
78
92
  @attributes
79
93
  end
80
94
 
95
+ #: () -> Numeric
81
96
  def height
82
97
  rect = Rectangle.new(*attributes[:MediaBox])
83
98
  rect.apply_rotation(rotate) if rotate > 0
84
99
  rect.height
85
100
  end
86
101
 
102
+ #: () -> Numeric
87
103
  def width
88
104
  rect = Rectangle.new(*attributes[:MediaBox])
89
105
  rect.apply_rotation(rotate) if rotate > 0
90
106
  rect.width
91
107
  end
92
108
 
109
+ #: () -> Array[Numeric]
93
110
  def origin
94
111
  rect = Rectangle.new(*attributes[:MediaBox])
95
112
  rect.apply_rotation(rotate) if rotate > 0
@@ -99,6 +116,7 @@ module PDF
99
116
 
100
117
  # Convenience method to identify the page's orientation.
101
118
  #
119
+ #: () -> String
102
120
  def orientation
103
121
  if height > width
104
122
  "portrait"
@@ -110,6 +128,7 @@ module PDF
110
128
  # returns the plain text content of this page encoded as UTF-8. Any
111
129
  # characters that can't be translated will be returned as a ▯
112
130
  #
131
+ #: (?Hash[Symbol, untyped]) -> String
113
132
  def text(opts = {})
114
133
  receiver = PageTextReceiver.new
115
134
  walk(receiver)
@@ -122,6 +141,7 @@ module PDF
122
141
  end
123
142
  alias :to_s :text
124
143
 
144
+ #: (?Hash[Symbol, untyped]) -> Array[PDF::Reader::TextRun]
125
145
  def runs(opts = {})
126
146
  receiver = PageTextReceiver.new
127
147
  walk(receiver)
@@ -151,6 +171,7 @@ module PDF
151
171
  # a set of instructions and associated resources. Calling walk() executes
152
172
  # the program in the correct order and calls out to your implementation.
153
173
  #
174
+ #: (*untyped) -> untyped
154
175
  def walk(*receivers)
155
176
  receivers = receivers.map { |receiver|
156
177
  ValidatingReceiver.new(receiver)
@@ -162,6 +183,7 @@ module PDF
162
183
  # returns the raw content stream for this page. This is plumbing, nothing to
163
184
  # see here unless you're a PDF nerd like me.
164
185
  #
186
+ #: () -> String
165
187
  def raw_content
166
188
  contents = objects.deref_stream_or_array(@page_object[:Contents])
167
189
  [contents].flatten.compact.map { |obj|
@@ -173,6 +195,7 @@ module PDF
173
195
 
174
196
  # returns the angle to rotate the page clockwise. Always 0, 90, 180 or 270
175
197
  #
198
+ #: () -> Integer
176
199
  def rotate
177
200
  value = attributes[:Rotate].to_i
178
201
  case value
@@ -188,6 +211,7 @@ module PDF
188
211
  #
189
212
  # DEPRECATED. Recommend using Page#rectangles instead
190
213
  #
214
+ #: () -> Hash[Symbol, Array[Numeric]]
191
215
  def boxes
192
216
  # In ruby 2.4+ we could use Hash#transform_values
193
217
  Hash[rectangles.map{ |k,rect| [k,rect.to_a] } ]
@@ -196,6 +220,7 @@ module PDF
196
220
  # returns the "boxes" that define the page object.
197
221
  # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
198
222
  #
223
+ #: () -> Hash[Symbol, PDF::Reader::Rectangle]
199
224
  def rectangles
200
225
  # attributes[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
201
226
  mediabox = objects.deref_array_of_numbers(attributes[:MediaBox]) || []
@@ -233,6 +258,7 @@ module PDF
233
258
 
234
259
  private
235
260
 
261
+ #: () -> Hash[Symbol, untyped]
236
262
  def root
237
263
  @root ||= objects.deref_hash(@objects.trailer[:Root]) || {}
238
264
  end
@@ -240,18 +266,20 @@ module PDF
240
266
  # Returns the resources that accompany this page. Includes
241
267
  # resources inherited from parents.
242
268
  #
269
+ #: () -> PDF::Reader::Resources
243
270
  def resources
244
271
  @resources ||= Resources.new(@objects, @objects.deref_hash(attributes[:Resources]) || {})
245
272
  end
246
273
 
274
+ #: (Array[untyped], String) -> void
247
275
  def content_stream(receivers, instructions)
248
276
  buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
249
277
  parser = Parser.new(buffer, @objects)
250
278
  params = []
251
279
 
252
280
  while (token = parser.parse_token(PagesStrategy::OPERATORS))
253
- if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
254
- callback(receivers, PagesStrategy::OPERATORS[token], params)
281
+ if token.kind_of?(Token) && method_name = PagesStrategy::OPERATORS[token]
282
+ callback(receivers, method_name, params)
255
283
  params.clear
256
284
  else
257
285
  params << token
@@ -263,29 +291,54 @@ module PDF
263
291
 
264
292
  # calls the name callback method on each receiver object with params as the arguments
265
293
  #
294
+ # The silly style here is because sorbet won't let me use splat arguments
295
+ #
296
+ #: (Array[Object], Symbol, ?Array[untyped]) -> void
266
297
  def callback(receivers, name, params=[])
267
298
  receivers.each do |receiver|
268
- receiver.send(name, *params) if receiver.respond_to?(name)
299
+ if receiver.respond_to?(name)
300
+ case params.size
301
+ when 0 then receiver.send(name)
302
+ when 1 then receiver.send(name, params[0])
303
+ when 2 then receiver.send(name, params[0], params[1])
304
+ when 3 then receiver.send(name, params[0], params[1], params[2])
305
+ when 4 then receiver.send(name, params[0], params[1], params[2], params[3])
306
+ when 5 then receiver.send(name, params[0], params[1], params[2], params[3], params[4])
307
+ when 6 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5])
308
+ when 7 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6])
309
+ when 8 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7])
310
+ when 9 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8])
311
+ else
312
+ receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8], params[9])
313
+ end
314
+ end
269
315
  end
270
316
  end
271
317
 
318
+ #: () -> untyped
272
319
  def page_with_ancestors
273
- [ @page_object ] + ancestors
320
+ [ @page_object ] + ancestors(@page_object[:Parent], Set[@page_ref.hash])
274
321
  end
275
322
 
276
- def ancestors(origin = @page_object[:Parent])
323
+ #: (?untyped, ?Set[Integer]) -> untyped
324
+ def ancestors(origin = @page_object[:Parent], seen = Set.new)
277
325
  if origin.nil?
278
326
  []
327
+ elsif seen.include?(origin.hash)
328
+ raise PDF::Reader::MalformedPDFError.new("loop found in ancestor path")
279
329
  else
280
330
  obj = objects.deref_hash(origin)
281
- PDF::Reader::Error.validate_not_nil_as_malformed(obj, "parent")
282
- [ select_inheritable(obj) ] + ancestors(obj[:Parent])
331
+ if obj.nil?
332
+ raise MalformedPDFError, "parent must not be nil"
333
+ end
334
+ [ select_inheritable(obj) ] + ancestors(obj[:Parent], seen.add(origin.hash))
283
335
  end
284
336
  end
285
337
 
286
338
  # select the elements from a Pages dictionary that can be inherited by
287
339
  # child Page dictionaries.
288
340
  #
341
+ #: (Hash[Symbol, untyped]) -> Hash[Symbol, untyped]
289
342
  def select_inheritable(obj)
290
343
  ::Hash[obj.select { |key, value|
291
344
  [:Resources, :MediaBox, :CropBox, :Rotate, :Parent].include?(key)
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  require 'pdf/reader/overlapping_runs_filter'
@@ -14,23 +14,29 @@ class PDF::Reader
14
14
  # page to be rendered as described by the page's MediaBox attribute
15
15
  class PageLayout
16
16
 
17
- DEFAULT_FONT_SIZE = 12
17
+ DEFAULT_FONT_SIZE = 12 #: Numeric
18
18
 
19
+ #: (Array[PDF::Reader::TextRun], Array[Numeric] | PDF::Reader::Rectangle) -> void
19
20
  def initialize(runs, mediabox)
20
21
  # mediabox is a 4-element array for now, but it'd be nice to switch to a
21
22
  # PDF::Reader::Rectangle at some point
22
23
  PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
23
24
 
24
- @mediabox = process_mediabox(mediabox)
25
- @runs = runs
26
- @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
25
+ @mediabox = process_mediabox(mediabox) #: PDF::Reader::Rectangle
26
+ @runs = runs #: Array[PDF::Reader::TextRun]
27
+ @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE #: Numeric
27
28
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
28
- @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
29
- @x_offset = @runs.map(&:x).sort.first || 0
30
- lowest_y = @runs.map(&:y).sort.first || 0
31
- @y_offset = lowest_y > 0 ? 0 : lowest_y
29
+ @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0 #: Numeric
30
+ @x_offset = @runs.map(&:x).sort.first || 0 #: Numeric
31
+ lowest_y = @runs.map(&:y).sort.first || 0 #: Numeric
32
+ @y_offset = lowest_y > 0 ? 0 : lowest_y #: Numeric
33
+ @row_count = nil #: Numeric | nil
34
+ @col_count = nil #: Numeric | nil
35
+ @row_multiplier = nil #: Numeric | nil
36
+ @col_multiplier = nil #: Numeric | nil
32
37
  end
33
38
 
39
+ #: () -> String
34
40
  def to_s
35
41
  return "" if @runs.empty?
36
42
  return "" if row_count == 0
@@ -48,10 +54,12 @@ class PDF::Reader
48
54
 
49
55
  private
50
56
 
57
+ #: () -> Numeric
51
58
  def page_width
52
59
  @mediabox.width
53
60
  end
54
61
 
62
+ #: () -> Numeric
55
63
  def page_height
56
64
  @mediabox.height
57
65
  end
@@ -62,6 +70,7 @@ class PDF::Reader
62
70
  # interesting_rows([ "", "one", "two", "" ])
63
71
  # => [ "one", "two" ]
64
72
  #
73
+ #: (untyped) -> untyped
65
74
  def interesting_rows(rows)
66
75
  line_lengths = rows.map { |l| l.strip.length }
67
76
 
@@ -73,22 +82,27 @@ class PDF::Reader
73
82
  rows[first_line_with_text, interesting_line_count].map
74
83
  end
75
84
 
85
+ #: () -> untyped
76
86
  def row_count
77
87
  @row_count ||= (page_height / @mean_font_size).floor
78
88
  end
79
89
 
90
+ #: () -> untyped
80
91
  def col_count
81
92
  @col_count ||= ((page_width / @median_glyph_width) * 1.05).floor
82
93
  end
83
94
 
95
+ #: () -> untyped
84
96
  def row_multiplier
85
97
  @row_multiplier ||= page_height.to_f / row_count.to_f
86
98
  end
87
99
 
100
+ #: () -> untyped
88
101
  def col_multiplier
89
102
  @col_multiplier ||= page_width.to_f / col_count.to_f
90
103
  end
91
104
 
105
+ #: (untyped) -> untyped
92
106
  def mean(collection)
93
107
  if collection.size == 0
94
108
  0
@@ -97,6 +111,7 @@ class PDF::Reader
97
111
  end
98
112
  end
99
113
 
114
+ #: (untyped) -> untyped
100
115
  def median(collection)
101
116
  if collection.size == 0
102
117
  0
@@ -105,10 +120,12 @@ class PDF::Reader
105
120
  end
106
121
  end
107
122
 
123
+ #: (untyped, untyped, untyped) -> untyped
108
124
  def local_string_insert(haystack, needle, index)
109
125
  haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
110
126
  end
111
127
 
128
+ #: (untyped) -> untyped
112
129
  def process_mediabox(mediabox)
113
130
  if mediabox.is_a?(Array)
114
131
  msg = "Passing the mediabox to PageLayout as an Array is deprecated," +
@@ -16,13 +16,14 @@ class PDF::Reader
16
16
  :h_scaling => 1.0,
17
17
  :text_leading => 0,
18
18
  :text_font => nil,
19
- :text_font_size => nil,
19
+ :text_font_size => 0,
20
20
  :text_mode => 0,
21
21
  :text_rise => 0,
22
22
  :text_knockout => 0
23
- }
23
+ } #: Hash[Symbol, Numeric | nil]
24
24
 
25
25
  # starting a new page
26
+ #: (untyped) -> void
26
27
  def initialize(page)
27
28
  @page = page
28
29
  @cache = page.cache
@@ -32,6 +33,12 @@ class PDF::Reader
32
33
  @cs_stack = [page.color_spaces]
33
34
  @stack = [DEFAULT_GRAPHICS_STATE.dup]
34
35
  state[:ctm] = identity_matrix
36
+
37
+ # These are only valid when inside a `BT` block and we re-initialize them on each
38
+ # `BT`. However, we need the instance variables set so PDFs with the text operators
39
+ # out order don't trigger NoMethodError when these are nil
40
+ @text_matrix = identity_matrix
41
+ @text_line_matrix = identity_matrix
35
42
  end
36
43
 
37
44
  #####################################################
@@ -42,12 +49,14 @@ class PDF::Reader
42
49
  # Any changes that are subsequently made to the state can then by reversed
43
50
  # by calling restore_graphics_state.
44
51
  #
52
+ #: () -> untyped
45
53
  def save_graphics_state
46
54
  @stack.push clone_state
47
55
  end
48
56
 
49
57
  # Restore the state to the previous value on the stack.
50
58
  #
59
+ #: () -> untyped
51
60
  def restore_graphics_state
52
61
  @stack.pop
53
62
  end
@@ -112,7 +121,7 @@ class PDF::Reader
112
121
  @font_size ||= begin
113
122
  _, zero = trm_transform(0,0)
114
123
  _, one = trm_transform(1,1)
115
- (zero - one).abs
124
+ (zero - one).abs.round(10)
116
125
  end
117
126
  end
118
127
 
@@ -14,9 +14,13 @@ module PDF
14
14
  class PageTextReceiver
15
15
  extend Forwardable
16
16
 
17
- SPACE = " "
17
+ SPACE = " " #: String
18
18
 
19
- attr_reader :state, :options
19
+ #: untyped
20
+ attr_reader :state
21
+
22
+ #: untyped
23
+ attr_reader :options
20
24
 
21
25
  ########## BEGIN FORWARDERS ##########
22
26
  # Graphics State Operators
@@ -62,10 +66,20 @@ module PDF
62
66
  runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
63
67
  end
64
68
 
69
+ runs = NoTextFilter.exclude_empty_strings(runs)
70
+
65
71
  if opts.fetch(:merge, true)
66
72
  runs = merge_runs(runs)
67
73
  end
68
74
 
75
+ if (only_filter = opts.fetch(:only, nil))
76
+ runs = AdvancedTextRunFilter.only(runs, only_filter)
77
+ end
78
+
79
+ if (exclude_filter = opts.fetch(:exclude, nil))
80
+ runs = AdvancedTextRunFilter.exclude(runs, exclude_filter)
81
+ end
82
+
69
83
  runs
70
84
  end
71
85
 
@@ -180,7 +180,7 @@ class PDF::Reader
180
180
  'y' => :append_curved_segment_final_point_replicated,
181
181
  '\'' => :move_to_next_line_and_show_text,
182
182
  '"' => :set_spacing_next_line_show_text,
183
- }
183
+ } #: Hash[String, Symbol]
184
184
  end
185
185
  ################################################################################
186
186
  end
@@ -1,5 +1,5 @@
1
1
  # coding: utf-8
2
- # typed: true
2
+ # typed: strict
3
3
  # frozen_string_literal: true
4
4
 
5
5
  ################################################################################
@@ -33,7 +33,7 @@ class PDF::Reader
33
33
  # them into useable ruby objects (hash's, arrays, true, false, etc)
34
34
  class Parser
35
35
 
36
- TOKEN_STRATEGY = proc { |parser, token| Token.new(token) }
36
+ TOKEN_STRATEGY = proc { |parser, token| Token.new(token) } #: Proc
37
37
 
38
38
  STRATEGIES = {
39
39
  "/" => proc { |parser, token| parser.send(:pdf_name) },
@@ -55,13 +55,14 @@ class PDF::Reader
55
55
  "]" => TOKEN_STRATEGY,
56
56
  ">" => TOKEN_STRATEGY,
57
57
  ")" => TOKEN_STRATEGY
58
- }
58
+ } #: Hash[String?, Proc]
59
59
 
60
60
  ################################################################################
61
61
  # Create a new parser around a PDF::Reader::Buffer object
62
62
  #
63
63
  # buffer - a PDF::Reader::Buffer object that contains PDF data
64
64
  # objects - a PDF::Reader::ObjectHash object that can return objects from the PDF file
65
+ #: (PDF::Reader::Buffer, ?PDF::Reader::ObjectHash?) -> void
65
66
  def initialize(buffer, objects=nil)
66
67
  @buffer = buffer
67
68
  @objects = objects
@@ -71,11 +72,24 @@ class PDF::Reader
71
72
  # object
72
73
  #
73
74
  # operators - a hash of supported operators to read from the underlying buffer.
75
+ #: (?Hash[String | PDF::Reader::Token, Symbol]) -> (
76
+ #| PDF::Reader::Reference |
77
+ #| PDF::Reader::Token |
78
+ #| Numeric |
79
+ #| String |
80
+ #| Symbol |
81
+ #| Array[untyped] |
82
+ #| Hash[untyped, untyped] |
83
+ #| nil
84
+ #| )
74
85
  def parse_token(operators={})
75
86
  token = @buffer.token
76
87
 
77
- if STRATEGIES.has_key? token
78
- STRATEGIES[token].call(self, token)
88
+ if token.nil?
89
+ nil
90
+ elsif token.is_a?(String) && STRATEGIES.has_key?(token)
91
+ proc = STRATEGIES[token]
92
+ proc.call(self, token) if proc
79
93
  elsif token.is_a? PDF::Reader::Reference
80
94
  token
81
95
  elsif operators.has_key? token
@@ -95,6 +109,17 @@ class PDF::Reader
95
109
  #
96
110
  # id - the object ID to return
97
111
  # gen - the object revision number to return
112
+ #: (Integer, Integer) -> (
113
+ #| PDF::Reader::Reference |
114
+ #| PDF::Reader::Token |
115
+ #| PDF::Reader::Stream |
116
+ #| Numeric |
117
+ #| String |
118
+ #| Symbol |
119
+ #| Array[untyped] |
120
+ #| Hash[untyped, untyped] |
121
+ #| nil
122
+ #| )
98
123
  def object(id, gen)
99
124
  idCheck = parse_token
100
125
 
@@ -120,6 +145,7 @@ class PDF::Reader
120
145
 
121
146
  ################################################################################
122
147
  # reads a PDF dict from the buffer and converts it to a Ruby Hash.
148
+ #: () -> Hash[Symbol, untyped]
123
149
  def dictionary
124
150
  dict = {}
125
151
 
@@ -138,15 +164,25 @@ class PDF::Reader
138
164
  end
139
165
  ################################################################################
140
166
  # reads a PDF name from the buffer and converts it to a Ruby Symbol
167
+ #: () -> Symbol
141
168
  def pdf_name
142
169
  tok = @buffer.token
143
- tok = tok.dup.gsub(/#([A-Fa-f0-9]{2})/) do |match|
144
- match[1, 2].hex.chr
170
+
171
+ if tok.is_a?(String)
172
+ tok = tok.dup.gsub(/#([A-Fa-f0-9]{2})/) do |match|
173
+ res = match[1, 2]
174
+ res ? res.hex.chr : ""
175
+ end
176
+ tok.to_sym
177
+ elsif tok.is_a?(PDF::Reader::Reference)
178
+ raise MalformedPDFError, "unexpected reference"
179
+ else
180
+ raise MalformedPDFError, "unexpected nil PDF Name"
145
181
  end
146
- tok.to_sym
147
182
  end
148
183
  ################################################################################
149
184
  # reads a PDF array from the buffer and converts it to a Ruby Array.
185
+ #: () -> Array[untyped]
150
186
  def array
151
187
  a = []
152
188
 
@@ -161,6 +197,7 @@ class PDF::Reader
161
197
  end
162
198
  ################################################################################
163
199
  # Reads a PDF hex string from the buffer and converts it to a Ruby String
200
+ #: () -> String
164
201
  def hex_string
165
202
  str = "".dup
166
203
 
@@ -173,14 +210,15 @@ class PDF::Reader
173
210
 
174
211
  # add a missing digit if required, as required by the spec
175
212
  str << "0" unless str.size % 2 == 0
176
- str.chars.each_slice(2).map { |nibbles|
177
- nibbles.join("").hex.chr
178
- }.join.force_encoding("binary")
213
+ [str].pack('H*')
179
214
  end
180
215
  ################################################################################
181
216
  # Reads a PDF String from the buffer and converts it to a Ruby String
217
+ #: () -> String
182
218
  def string
183
219
  str = @buffer.token
220
+ raise MalformedPDFError, "unexpected reference" if str.is_a?(PDF::Reader::Reference)
221
+ raise MalformedPDFError, "unexpected nil PDF String" if str.nil?
184
222
  return "".dup.force_encoding("binary") if str == ")"
185
223
  Error.assert_equal(parse_token, ")")
186
224
 
@@ -208,10 +246,11 @@ class PDF::Reader
208
246
  "\\\n" => "",
209
247
  "\\\r" => "",
210
248
  "\\\r\n" => "",
211
- }
249
+ } #: Hash[String, String]
212
250
 
213
251
  ################################################################################
214
252
  # Decodes the contents of a PDF Stream and returns it as a Ruby String.
253
+ #: (Hash[Symbol, untyped]) -> PDF::Reader::Stream
215
254
  def stream(dict)
216
255
  raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
217
256
  if @objects
@@ -233,7 +272,7 @@ class PDF::Reader
233
272
  # matter if it's missing, and other readers seems to handle its absence just fine
234
273
  # Error.str_assert(parse_token, "endobj")
235
274
 
236
- PDF::Reader::Stream.new(dict, data)
275
+ PDF::Reader::Stream.new(dict, data || "")
237
276
  end
238
277
  ################################################################################
239
278
  end
@@ -10,12 +10,19 @@ module PDF
10
10
  #
11
11
  class Point
12
12
 
13
- attr_reader :x, :y
13
+ #: Numeric
14
+ attr_reader :x
14
15
 
16
+ #: Numeric
17
+ attr_reader :y
18
+
19
+ #: (Numeric, Numeric) -> void
15
20
  def initialize(x, y)
16
- @x, @y = x, y
21
+ @x = x
22
+ @y = y
17
23
  end
18
24
 
25
+ #: (PDF::Reader::Point) -> bool
19
26
  def ==(other)
20
27
  other.respond_to?(:x) && other.respond_to?(:y) && x == other.x && y == other.y
21
28
  end