pdf-reader 2.6.0 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +13 -1
  3. data/examples/rspec.rb +1 -0
  4. data/lib/pdf/reader/buffer.rb +1 -0
  5. data/lib/pdf/reader/cid_widths.rb +1 -0
  6. data/lib/pdf/reader/cmap.rb +5 -3
  7. data/lib/pdf/reader/encoding.rb +2 -1
  8. data/lib/pdf/reader/error.rb +8 -0
  9. data/lib/pdf/reader/filter/ascii85.rb +2 -0
  10. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  11. data/lib/pdf/reader/filter/depredict.rb +7 -5
  12. data/lib/pdf/reader/filter/flate.rb +2 -0
  13. data/lib/pdf/reader/filter/lzw.rb +2 -0
  14. data/lib/pdf/reader/filter/null.rb +1 -0
  15. data/lib/pdf/reader/filter/run_length.rb +19 -13
  16. data/lib/pdf/reader/filter.rb +1 -0
  17. data/lib/pdf/reader/font.rb +1 -0
  18. data/lib/pdf/reader/font_descriptor.rb +1 -0
  19. data/lib/pdf/reader/form_xobject.rb +1 -0
  20. data/lib/pdf/reader/glyph_hash.rb +1 -0
  21. data/lib/pdf/reader/lzw.rb +4 -2
  22. data/lib/pdf/reader/null_security_handler.rb +1 -0
  23. data/lib/pdf/reader/object_cache.rb +1 -0
  24. data/lib/pdf/reader/object_hash.rb +5 -2
  25. data/lib/pdf/reader/object_stream.rb +1 -0
  26. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  27. data/lib/pdf/reader/page.rb +60 -9
  28. data/lib/pdf/reader/page_layout.rb +24 -14
  29. data/lib/pdf/reader/page_state.rb +11 -10
  30. data/lib/pdf/reader/page_text_receiver.rb +13 -8
  31. data/lib/pdf/reader/pages_strategy.rb +1 -0
  32. data/lib/pdf/reader/parser.rb +4 -1
  33. data/lib/pdf/reader/point.rb +25 -0
  34. data/lib/pdf/reader/print_receiver.rb +1 -0
  35. data/lib/pdf/reader/rectangle.rb +95 -0
  36. data/lib/pdf/reader/reference.rb +1 -0
  37. data/lib/pdf/reader/register_receiver.rb +1 -0
  38. data/lib/pdf/reader/resource_methods.rb +5 -0
  39. data/lib/pdf/reader/standard_security_handler.rb +1 -0
  40. data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
  41. data/lib/pdf/reader/stream.rb +1 -0
  42. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  43. data/lib/pdf/reader/text_run.rb +1 -0
  44. data/lib/pdf/reader/token.rb +1 -0
  45. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  46. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  47. data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
  48. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  49. data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
  50. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  51. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  52. data/lib/pdf/reader/width_calculator.rb +1 -0
  53. data/lib/pdf/reader/xref.rb +1 -0
  54. data/lib/pdf/reader/zero_width_runs_filter.rb +2 -0
  55. data/lib/pdf/reader.rb +14 -4
  56. data/lib/pdf-reader.rb +1 -0
  57. data/rbi/pdf-reader.rbi +1744 -0
  58. metadata +12 -10
  59. data/lib/pdf/reader/orientation_detector.rb +0 -34
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'pdf/reader/transformation_matrix'
@@ -312,7 +313,7 @@ class PDF::Reader
312
313
  # may need to be added
313
314
  #
314
315
  def process_glyph_displacement(w0, tj, word_boundary)
315
- fs = font_size # font size
316
+ fs = state[:text_font_size]
316
317
  tc = state[:char_spacing]
317
318
  if word_boundary
318
319
  tw = state[:word_spacing]
@@ -330,16 +331,16 @@ class PDF::Reader
330
331
  # apply horizontal scaling to spacing values but not font size
331
332
  tx = ((w0 * fs) + tc + tw) * th
332
333
  end
333
-
334
- # TODO: I'm pretty sure that tx shouldn't need to be divided by
335
- # ctm[0] here, but this gets my tests green and I'm out of
336
- # ideas for now
337
334
  # TODO: support ty > 0
338
- if ctm.a == 1 || ctm.a == 0
339
- @text_matrix.horizontal_displacement_multiply!(tx)
340
- else
341
- @text_matrix.horizontal_displacement_multiply!(tx/ctm.a)
342
- end
335
+ ty = 0
336
+ temp = TransformationMatrix.new(1, 0,
337
+ 0, 1,
338
+ tx, ty)
339
+ @text_matrix = temp.multiply!(
340
+ @text_matrix.a, @text_matrix.b,
341
+ @text_matrix.c, @text_matrix.d,
342
+ @text_matrix.e, @text_matrix.f
343
+ )
343
344
  @font_size = @text_rendering_matrix = nil # invalidate cached value
344
345
  end
345
346
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'forwardable'
@@ -44,14 +45,11 @@ module PDF
44
45
  @page = page
45
46
  @content = []
46
47
  @characters = []
47
- @mediabox = page.objects.deref(page.attributes[:MediaBox])
48
- device_bl = apply_rotation(*@state.ctm_transform(@mediabox[0], @mediabox[1]))
49
- device_tr = apply_rotation(*@state.ctm_transform(@mediabox[2], @mediabox[3]))
50
- @device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
51
48
  end
52
49
 
53
50
  def content
54
- PageLayout.new(@characters, @device_mediabox).to_s
51
+ mediabox = @page.rectangles[:MediaBox].to_a
52
+ PageLayout.new(@characters, mediabox).to_s
55
53
  end
56
54
 
57
55
  #####################################################
@@ -121,6 +119,12 @@ module PDF
121
119
  end
122
120
  end
123
121
 
122
+ # TODO: revisit this. It rotates the co-ordinates to the right direction, but I don't
123
+ # think it sets the correct x,y values. We get away with it because we don't
124
+ # return the text with co-ordinates, only the full text arranged in a string.
125
+ #
126
+ # We should provide an API for extracting the text with positioning data and spec
127
+ # that. I suspect the co-ords might be wrong for rotated pages
124
128
  def apply_rotation(x, y)
125
129
  if @page.rotate == 90
126
130
  tmp = x
@@ -128,10 +132,11 @@ module PDF
128
132
  y = tmp * -1
129
133
  elsif @page.rotate == 180
130
134
  y *= -1
135
+ x *= -1
131
136
  elsif @page.rotate == 270
132
- tmp = x
133
- x = y * -1
134
- y = tmp * -1
137
+ tmp = y
138
+ y = x
139
+ x = tmp * -1
135
140
  end
136
141
  return x, y
137
142
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -166,7 +167,9 @@ class PDF::Reader
166
167
 
167
168
  # add a missing digit if required, as required by the spec
168
169
  str << "0" unless str.size % 2 == 0
169
- str.scan(/../).map {|i| i.hex.chr}.join.force_encoding("binary")
170
+ str.chars.each_slice(2).map { |nibbles|
171
+ nibbles.join("").hex.chr
172
+ }.join.force_encoding("binary")
170
173
  end
171
174
  ################################################################################
172
175
  # Reads a PDF String from the buffer and converts it to a Ruby String
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ # typed: true
3
+ # frozen_string_literal: true
4
+
5
+ module PDF
6
+ class Reader
7
+
8
+ # PDFs are all about positioning content on a page, so there's lots of need to
9
+ # work with a set of X,Y coordinates.
10
+ #
11
+ class Point
12
+
13
+ attr_reader :x, :y
14
+
15
+ def initialize(x, y)
16
+ @x, @y = x, y
17
+ end
18
+
19
+ def ==(other)
20
+ other.respond_to?(:x) && other.respond_to?(:y) && x == other.x && y == other.y
21
+ end
22
+
23
+ end
24
+ end
25
+ end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -0,0 +1,95 @@
1
+ # coding: utf-8
2
+ # typed: true
3
+ # frozen_string_literal: true
4
+
5
+ module PDF
6
+ class Reader
7
+
8
+ # PDFs represent rectangles all over the place. They're 4 element arrays, like this:
9
+ #
10
+ # [A, B, C, D]
11
+ #
12
+ # Four element arrays are yucky to work with though, so here's a class that's better.
13
+ # Initialize it with the 4 elements, and get utility functions (width, height, etc)
14
+ # for free.
15
+ #
16
+ # By convention the first two elements are x1, y1, the co-ords for the bottom left corner
17
+ # of the rectangle. The third and fourth elements are x2, y2, the co-ords for the top right
18
+ # corner of the rectangle. It's valid for the alternative corners to be used though, so
19
+ # we don't assume which is which.
20
+ #
21
+ class Rectangle
22
+
23
+ attr_reader :bottom_left, :bottom_right, :top_left, :top_right
24
+
25
+ def initialize(x1, y1, x2, y2)
26
+ set_corners(x1, y1, x2, y2)
27
+ end
28
+
29
+ def ==(other)
30
+ to_a == other.to_a
31
+ end
32
+
33
+ def height
34
+ top_right.y - bottom_right.y
35
+ end
36
+
37
+ def width
38
+ bottom_right.x - bottom_left.x
39
+ end
40
+
41
+ # A pdf-style 4-number array
42
+ def to_a
43
+ [
44
+ bottom_left.x,
45
+ bottom_left.y,
46
+ top_right.x,
47
+ top_right.y,
48
+ ]
49
+ end
50
+
51
+ def apply_rotation(degrees)
52
+ return if degrees != 90 && degrees != 180 && degrees != 270
53
+
54
+ if degrees == 90
55
+ new_x1 = bottom_left.x
56
+ new_y1 = bottom_left.y - width
57
+ new_x2 = bottom_left.x + height
58
+ new_y2 = bottom_left.y
59
+ elsif degrees == 180
60
+ new_x1 = bottom_left.x - width
61
+ new_y1 = bottom_left.y - height
62
+ new_x2 = bottom_left.x
63
+ new_y2 = bottom_left.y
64
+ elsif degrees == 270
65
+ new_x1 = bottom_left.x - height
66
+ new_y1 = bottom_left.y
67
+ new_x2 = bottom_left.x
68
+ new_y2 = bottom_left.y + width
69
+ end
70
+ set_corners(new_x1, new_y1, new_x2, new_y2)
71
+ end
72
+
73
+ private
74
+
75
+ def set_corners(x1, y1, x2, y2)
76
+ @bottom_left = PDF::Reader::Point.new(
77
+ [x1, x2].min,
78
+ [y1, y2].min,
79
+ )
80
+ @bottom_right = PDF::Reader::Point.new(
81
+ [x1, x2].max,
82
+ [y1, y2].min,
83
+ )
84
+ @top_left = PDF::Reader::Point.new(
85
+ [x1, x2].min,
86
+ [y1, y2].max,
87
+ )
88
+ @top_right = PDF::Reader::Point.new(
89
+ [x1, x2].max,
90
+ [y1, y2].max,
91
+ )
92
+ end
93
+ end
94
+ end
95
+ end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  # Copyright (C) 2010 James Healy (jimmy@deefa.com)
@@ -1,12 +1,17 @@
1
1
  # coding: utf-8
2
+ # typed: false
2
3
  # frozen_string_literal: true
3
4
 
5
+ # Setting this file to "typed: true" is difficult because it's a mixin that assumes some things
6
+ # are available from the class, like @objects and resources. Sorbet doesn't know about them.
7
+
4
8
  module PDF
5
9
  class Reader
6
10
 
7
11
  # mixin for common methods in Page and FormXobjects
8
12
  #
9
13
  module ResourceMethods
14
+
10
15
  # Returns a Hash of color spaces that are available to this page
11
16
  #
12
17
  # NOTE: this method de-serialise objects from the underlying PDF
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'digest'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # encoding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  # utilities.rb : General-purpose utility classes which don't fit anywhere else
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'afm'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  # PDF files may define fonts in a number of ways. Each approach means we must
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,6 @@
1
1
  # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
2
4
 
3
5
  class PDF::Reader
4
6
  # There's no point rendering zero-width characters
data/lib/pdf/reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -136,7 +137,7 @@ module PDF
136
137
  def page_count
137
138
  pages = @objects.deref(root[:Pages])
138
139
  unless pages.kind_of?(::Hash)
139
- raise MalformedPDFError, 'Pages structure is missing'
140
+ raise MalformedPDFError, "Pages structure is missing #{pages.class}"
140
141
  end
141
142
  @page_count ||= @objects.deref(pages[:Count])
142
143
  end
@@ -221,7 +222,7 @@ module PDF
221
222
  when Array then
222
223
  obj.map { |item| doc_strings_to_utf8(item) }
223
224
  when String then
224
- if obj[0,2].unpack("C*") == [254, 255]
225
+ if has_utf16_bom?(obj)
225
226
  utf16_to_utf8(obj)
226
227
  else
227
228
  pdfdoc_to_utf8(obj)
@@ -231,6 +232,14 @@ module PDF
231
232
  end
232
233
  end
233
234
 
235
+ def has_utf16_bom?(str)
236
+ first_bytes = str[0,2]
237
+
238
+ return false if first_bytes.nil?
239
+
240
+ first_bytes.unpack("C*") == [254, 255]
241
+ end
242
+
234
243
  # TODO find a PDF I can use to spec this behaviour
235
244
  #
236
245
  def pdfdoc_to_utf8(obj)
@@ -242,7 +251,7 @@ module PDF
242
251
  # String#encode
243
252
  #
244
253
  def utf16_to_utf8(obj)
245
- str = obj[2, obj.size]
254
+ str = obj[2, obj.size].to_s
246
255
  str = str.unpack("n*").pack("U*")
247
256
  str.force_encoding("utf-8")
248
257
  str
@@ -286,7 +295,9 @@ require 'pdf/reader/object_hash'
286
295
  require 'pdf/reader/object_stream'
287
296
  require 'pdf/reader/pages_strategy'
288
297
  require 'pdf/reader/parser'
298
+ require 'pdf/reader/point'
289
299
  require 'pdf/reader/print_receiver'
300
+ require 'pdf/reader/rectangle'
290
301
  require 'pdf/reader/reference'
291
302
  require 'pdf/reader/register_receiver'
292
303
  require 'pdf/reader/null_security_handler'
@@ -299,5 +310,4 @@ require 'pdf/reader/page_state'
299
310
  require 'pdf/reader/page_text_receiver'
300
311
  require 'pdf/reader/token'
301
312
  require 'pdf/reader/xref'
302
- require 'pdf/reader/orientation_detector'
303
313
  require 'pdf/reader/page'
data/lib/pdf-reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require "pdf/reader"