pdf-reader 2.6.0 → 2.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +13 -1
  3. data/examples/rspec.rb +1 -0
  4. data/lib/pdf/reader/buffer.rb +1 -0
  5. data/lib/pdf/reader/cid_widths.rb +1 -0
  6. data/lib/pdf/reader/cmap.rb +5 -3
  7. data/lib/pdf/reader/encoding.rb +2 -1
  8. data/lib/pdf/reader/error.rb +8 -0
  9. data/lib/pdf/reader/filter/ascii85.rb +2 -0
  10. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  11. data/lib/pdf/reader/filter/depredict.rb +7 -5
  12. data/lib/pdf/reader/filter/flate.rb +2 -0
  13. data/lib/pdf/reader/filter/lzw.rb +2 -0
  14. data/lib/pdf/reader/filter/null.rb +1 -0
  15. data/lib/pdf/reader/filter/run_length.rb +19 -13
  16. data/lib/pdf/reader/filter.rb +1 -0
  17. data/lib/pdf/reader/font.rb +1 -0
  18. data/lib/pdf/reader/font_descriptor.rb +1 -0
  19. data/lib/pdf/reader/form_xobject.rb +1 -0
  20. data/lib/pdf/reader/glyph_hash.rb +1 -0
  21. data/lib/pdf/reader/lzw.rb +4 -2
  22. data/lib/pdf/reader/null_security_handler.rb +1 -0
  23. data/lib/pdf/reader/object_cache.rb +1 -0
  24. data/lib/pdf/reader/object_hash.rb +5 -2
  25. data/lib/pdf/reader/object_stream.rb +1 -0
  26. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  27. data/lib/pdf/reader/page.rb +60 -9
  28. data/lib/pdf/reader/page_layout.rb +24 -14
  29. data/lib/pdf/reader/page_state.rb +11 -10
  30. data/lib/pdf/reader/page_text_receiver.rb +13 -8
  31. data/lib/pdf/reader/pages_strategy.rb +1 -0
  32. data/lib/pdf/reader/parser.rb +4 -1
  33. data/lib/pdf/reader/point.rb +25 -0
  34. data/lib/pdf/reader/print_receiver.rb +1 -0
  35. data/lib/pdf/reader/rectangle.rb +95 -0
  36. data/lib/pdf/reader/reference.rb +1 -0
  37. data/lib/pdf/reader/register_receiver.rb +1 -0
  38. data/lib/pdf/reader/resource_methods.rb +5 -0
  39. data/lib/pdf/reader/standard_security_handler.rb +1 -0
  40. data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
  41. data/lib/pdf/reader/stream.rb +1 -0
  42. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  43. data/lib/pdf/reader/text_run.rb +1 -0
  44. data/lib/pdf/reader/token.rb +1 -0
  45. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  46. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  47. data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
  48. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  49. data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
  50. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  51. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  52. data/lib/pdf/reader/width_calculator.rb +1 -0
  53. data/lib/pdf/reader/xref.rb +1 -0
  54. data/lib/pdf/reader/zero_width_runs_filter.rb +2 -0
  55. data/lib/pdf/reader.rb +14 -4
  56. data/lib/pdf-reader.rb +1 -0
  57. data/rbi/pdf-reader.rbi +1744 -0
  58. metadata +12 -10
  59. data/lib/pdf/reader/orientation_detector.rb +0 -34
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'pdf/reader/transformation_matrix'
@@ -312,7 +313,7 @@ class PDF::Reader
312
313
  # may need to be added
313
314
  #
314
315
  def process_glyph_displacement(w0, tj, word_boundary)
315
- fs = font_size # font size
316
+ fs = state[:text_font_size]
316
317
  tc = state[:char_spacing]
317
318
  if word_boundary
318
319
  tw = state[:word_spacing]
@@ -330,16 +331,16 @@ class PDF::Reader
330
331
  # apply horizontal scaling to spacing values but not font size
331
332
  tx = ((w0 * fs) + tc + tw) * th
332
333
  end
333
-
334
- # TODO: I'm pretty sure that tx shouldn't need to be divided by
335
- # ctm[0] here, but this gets my tests green and I'm out of
336
- # ideas for now
337
334
  # TODO: support ty > 0
338
- if ctm.a == 1 || ctm.a == 0
339
- @text_matrix.horizontal_displacement_multiply!(tx)
340
- else
341
- @text_matrix.horizontal_displacement_multiply!(tx/ctm.a)
342
- end
335
+ ty = 0
336
+ temp = TransformationMatrix.new(1, 0,
337
+ 0, 1,
338
+ tx, ty)
339
+ @text_matrix = temp.multiply!(
340
+ @text_matrix.a, @text_matrix.b,
341
+ @text_matrix.c, @text_matrix.d,
342
+ @text_matrix.e, @text_matrix.f
343
+ )
343
344
  @font_size = @text_rendering_matrix = nil # invalidate cached value
344
345
  end
345
346
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'forwardable'
@@ -44,14 +45,11 @@ module PDF
44
45
  @page = page
45
46
  @content = []
46
47
  @characters = []
47
- @mediabox = page.objects.deref(page.attributes[:MediaBox])
48
- device_bl = apply_rotation(*@state.ctm_transform(@mediabox[0], @mediabox[1]))
49
- device_tr = apply_rotation(*@state.ctm_transform(@mediabox[2], @mediabox[3]))
50
- @device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
51
48
  end
52
49
 
53
50
  def content
54
- PageLayout.new(@characters, @device_mediabox).to_s
51
+ mediabox = @page.rectangles[:MediaBox].to_a
52
+ PageLayout.new(@characters, mediabox).to_s
55
53
  end
56
54
 
57
55
  #####################################################
@@ -121,6 +119,12 @@ module PDF
121
119
  end
122
120
  end
123
121
 
122
+ # TODO: revisit this. It rotates the co-ordinates to the right direction, but I don't
123
+ # think it sets the correct x,y values. We get away with it because we don't
124
+ # return the text with co-ordinates, only the full text arranged in a string.
125
+ #
126
+ # We should provide an API for extracting the text with positioning data and spec
127
+ # that. I suspect the co-ords might be wrong for rotated pages
124
128
  def apply_rotation(x, y)
125
129
  if @page.rotate == 90
126
130
  tmp = x
@@ -128,10 +132,11 @@ module PDF
128
132
  y = tmp * -1
129
133
  elsif @page.rotate == 180
130
134
  y *= -1
135
+ x *= -1
131
136
  elsif @page.rotate == 270
132
- tmp = x
133
- x = y * -1
134
- y = tmp * -1
137
+ tmp = y
138
+ y = x
139
+ x = tmp * -1
135
140
  end
136
141
  return x, y
137
142
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -166,7 +167,9 @@ class PDF::Reader
166
167
 
167
168
  # add a missing digit if required, as required by the spec
168
169
  str << "0" unless str.size % 2 == 0
169
- str.scan(/../).map {|i| i.hex.chr}.join.force_encoding("binary")
170
+ str.chars.each_slice(2).map { |nibbles|
171
+ nibbles.join("").hex.chr
172
+ }.join.force_encoding("binary")
170
173
  end
171
174
  ################################################################################
172
175
  # Reads a PDF String from the buffer and converts it to a Ruby String
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ # typed: true
3
+ # frozen_string_literal: true
4
+
5
+ module PDF
6
+ class Reader
7
+
8
+ # PDFs are all about positioning content on a page, so there's lots of need to
9
+ # work with a set of X,Y coordinates.
10
+ #
11
+ class Point
12
+
13
+ attr_reader :x, :y
14
+
15
+ def initialize(x, y)
16
+ @x, @y = x, y
17
+ end
18
+
19
+ def ==(other)
20
+ other.respond_to?(:x) && other.respond_to?(:y) && x == other.x && y == other.y
21
+ end
22
+
23
+ end
24
+ end
25
+ end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -0,0 +1,95 @@
1
+ # coding: utf-8
2
+ # typed: true
3
+ # frozen_string_literal: true
4
+
5
+ module PDF
6
+ class Reader
7
+
8
+ # PDFs represent rectangles all over the place. They're 4 element arrays, like this:
9
+ #
10
+ # [A, B, C, D]
11
+ #
12
+ # Four element arrays are yucky to work with though, so here's a class that's better.
13
+ # Initialize it with the 4 elements, and get utility functions (width, height, etc)
14
+ # for free.
15
+ #
16
+ # By convention the first two elements are x1, y1, the co-ords for the bottom left corner
17
+ # of the rectangle. The third and fourth elements are x2, y2, the co-ords for the top right
18
+ # corner of the rectangle. It's valid for the alternative corners to be used though, so
19
+ # we don't assume which is which.
20
+ #
21
+ class Rectangle
22
+
23
+ attr_reader :bottom_left, :bottom_right, :top_left, :top_right
24
+
25
+ def initialize(x1, y1, x2, y2)
26
+ set_corners(x1, y1, x2, y2)
27
+ end
28
+
29
+ def ==(other)
30
+ to_a == other.to_a
31
+ end
32
+
33
+ def height
34
+ top_right.y - bottom_right.y
35
+ end
36
+
37
+ def width
38
+ bottom_right.x - bottom_left.x
39
+ end
40
+
41
+ # A pdf-style 4-number array
42
+ def to_a
43
+ [
44
+ bottom_left.x,
45
+ bottom_left.y,
46
+ top_right.x,
47
+ top_right.y,
48
+ ]
49
+ end
50
+
51
+ def apply_rotation(degrees)
52
+ return if degrees != 90 && degrees != 180 && degrees != 270
53
+
54
+ if degrees == 90
55
+ new_x1 = bottom_left.x
56
+ new_y1 = bottom_left.y - width
57
+ new_x2 = bottom_left.x + height
58
+ new_y2 = bottom_left.y
59
+ elsif degrees == 180
60
+ new_x1 = bottom_left.x - width
61
+ new_y1 = bottom_left.y - height
62
+ new_x2 = bottom_left.x
63
+ new_y2 = bottom_left.y
64
+ elsif degrees == 270
65
+ new_x1 = bottom_left.x - height
66
+ new_y1 = bottom_left.y
67
+ new_x2 = bottom_left.x
68
+ new_y2 = bottom_left.y + width
69
+ end
70
+ set_corners(new_x1, new_y1, new_x2, new_y2)
71
+ end
72
+
73
+ private
74
+
75
+ def set_corners(x1, y1, x2, y2)
76
+ @bottom_left = PDF::Reader::Point.new(
77
+ [x1, x2].min,
78
+ [y1, y2].min,
79
+ )
80
+ @bottom_right = PDF::Reader::Point.new(
81
+ [x1, x2].max,
82
+ [y1, y2].min,
83
+ )
84
+ @top_left = PDF::Reader::Point.new(
85
+ [x1, x2].min,
86
+ [y1, y2].max,
87
+ )
88
+ @top_right = PDF::Reader::Point.new(
89
+ [x1, x2].max,
90
+ [y1, y2].max,
91
+ )
92
+ end
93
+ end
94
+ end
95
+ end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  # Copyright (C) 2010 James Healy (jimmy@deefa.com)
@@ -1,12 +1,17 @@
1
1
  # coding: utf-8
2
+ # typed: false
2
3
  # frozen_string_literal: true
3
4
 
5
+ # Setting this file to "typed: true" is difficult because it's a mixin that assumes some things
6
+ # are available from the class, like @objects and resources. Sorbet doesn't know about them.
7
+
4
8
  module PDF
5
9
  class Reader
6
10
 
7
11
  # mixin for common methods in Page and FormXobjects
8
12
  #
9
13
  module ResourceMethods
14
+
10
15
  # Returns a Hash of color spaces that are available to this page
11
16
  #
12
17
  # NOTE: this method de-serialises objects from the underlying PDF
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'digest'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # encoding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  # utilities.rb : General-purpose utility classes which don't fit anywhere else
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'afm'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  # PDF files may define fonts in a number of ways. Each approach means we must
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,6 @@
1
1
  # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
2
4
 
3
5
  class PDF::Reader
4
6
  # There's no point rendering zero-width characters
data/lib/pdf/reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -136,7 +137,7 @@ module PDF
136
137
  def page_count
137
138
  pages = @objects.deref(root[:Pages])
138
139
  unless pages.kind_of?(::Hash)
139
- raise MalformedPDFError, 'Pages structure is missing'
140
+ raise MalformedPDFError, "Pages structure is missing #{pages.class}"
140
141
  end
141
142
  @page_count ||= @objects.deref(pages[:Count])
142
143
  end
@@ -221,7 +222,7 @@ module PDF
221
222
  when Array then
222
223
  obj.map { |item| doc_strings_to_utf8(item) }
223
224
  when String then
224
- if obj[0,2].unpack("C*") == [254, 255]
225
+ if has_utf16_bom?(obj)
225
226
  utf16_to_utf8(obj)
226
227
  else
227
228
  pdfdoc_to_utf8(obj)
@@ -231,6 +232,14 @@ module PDF
231
232
  end
232
233
  end
233
234
 
235
+ def has_utf16_bom?(str)
236
+ first_bytes = str[0,2]
237
+
238
+ return false if first_bytes.nil?
239
+
240
+ first_bytes.unpack("C*") == [254, 255]
241
+ end
242
+
234
243
  # TODO find a PDF I can use to spec this behaviour
235
244
  #
236
245
  def pdfdoc_to_utf8(obj)
@@ -242,7 +251,7 @@ module PDF
242
251
  # String#encode
243
252
  #
244
253
  def utf16_to_utf8(obj)
245
- str = obj[2, obj.size]
254
+ str = obj[2, obj.size].to_s
246
255
  str = str.unpack("n*").pack("U*")
247
256
  str.force_encoding("utf-8")
248
257
  str
@@ -286,7 +295,9 @@ require 'pdf/reader/object_hash'
286
295
  require 'pdf/reader/object_stream'
287
296
  require 'pdf/reader/pages_strategy'
288
297
  require 'pdf/reader/parser'
298
+ require 'pdf/reader/point'
289
299
  require 'pdf/reader/print_receiver'
300
+ require 'pdf/reader/rectangle'
290
301
  require 'pdf/reader/reference'
291
302
  require 'pdf/reader/register_receiver'
292
303
  require 'pdf/reader/null_security_handler'
@@ -299,5 +310,4 @@ require 'pdf/reader/page_state'
299
310
  require 'pdf/reader/page_text_receiver'
300
311
  require 'pdf/reader/token'
301
312
  require 'pdf/reader/xref'
302
- require 'pdf/reader/orientation_detector'
303
313
  require 'pdf/reader/page'
data/lib/pdf-reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require "pdf/reader"