pdf-reader 2.2.0 → 2.11.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (90) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +90 -0
  3. data/README.md +18 -3
  4. data/Rakefile +1 -1
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_text +1 -1
  7. data/examples/extract_fonts.rb +12 -7
  8. data/examples/rspec.rb +1 -0
  9. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  10. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  14. data/lib/pdf/reader/afm/Courier.afm +342 -342
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  26. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  27. data/lib/pdf/reader/buffer.rb +91 -47
  28. data/lib/pdf/reader/cid_widths.rb +7 -4
  29. data/lib/pdf/reader/cmap.rb +83 -59
  30. data/lib/pdf/reader/encoding.rb +17 -14
  31. data/lib/pdf/reader/error.rb +15 -3
  32. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  33. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  34. data/lib/pdf/reader/filter/depredict.rb +12 -10
  35. data/lib/pdf/reader/filter/flate.rb +30 -16
  36. data/lib/pdf/reader/filter/lzw.rb +2 -0
  37. data/lib/pdf/reader/filter/null.rb +1 -1
  38. data/lib/pdf/reader/filter/run_length.rb +19 -13
  39. data/lib/pdf/reader/filter.rb +11 -11
  40. data/lib/pdf/reader/font.rb +89 -26
  41. data/lib/pdf/reader/font_descriptor.rb +22 -18
  42. data/lib/pdf/reader/form_xobject.rb +18 -5
  43. data/lib/pdf/reader/glyph_hash.rb +28 -13
  44. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  45. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  46. data/lib/pdf/reader/lzw.rb +28 -11
  47. data/lib/pdf/reader/no_text_filter.rb +14 -0
  48. data/lib/pdf/reader/null_security_handler.rb +1 -4
  49. data/lib/pdf/reader/object_cache.rb +1 -0
  50. data/lib/pdf/reader/object_hash.rb +292 -63
  51. data/lib/pdf/reader/object_stream.rb +3 -2
  52. data/lib/pdf/reader/overlapping_runs_filter.rb +72 -0
  53. data/lib/pdf/reader/page.rb +143 -16
  54. data/lib/pdf/reader/page_layout.rb +43 -39
  55. data/lib/pdf/reader/page_state.rb +26 -17
  56. data/lib/pdf/reader/page_text_receiver.rb +74 -4
  57. data/lib/pdf/reader/pages_strategy.rb +1 -0
  58. data/lib/pdf/reader/parser.rb +34 -14
  59. data/lib/pdf/reader/point.rb +25 -0
  60. data/lib/pdf/reader/print_receiver.rb +1 -0
  61. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  62. data/lib/pdf/reader/rectangle.rb +113 -0
  63. data/lib/pdf/reader/reference.rb +3 -1
  64. data/lib/pdf/reader/register_receiver.rb +1 -0
  65. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +17 -9
  66. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  67. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
  68. data/lib/pdf/reader/stream.rb +3 -2
  69. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  70. data/lib/pdf/reader/text_run.rb +40 -5
  71. data/lib/pdf/reader/token.rb +1 -0
  72. data/lib/pdf/reader/transformation_matrix.rb +8 -7
  73. data/lib/pdf/reader/type_check.rb +98 -0
  74. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  75. data/lib/pdf/reader/validating_receiver.rb +262 -0
  76. data/lib/pdf/reader/width_calculator/built_in.rb +27 -17
  77. data/lib/pdf/reader/width_calculator/composite.rb +6 -1
  78. data/lib/pdf/reader/width_calculator/true_type.rb +10 -11
  79. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +6 -4
  80. data/lib/pdf/reader/width_calculator/type_zero.rb +6 -2
  81. data/lib/pdf/reader/width_calculator.rb +1 -0
  82. data/lib/pdf/reader/xref.rb +37 -11
  83. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  84. data/lib/pdf/reader.rb +49 -24
  85. data/lib/pdf-reader.rb +1 -0
  86. data/rbi/pdf-reader.rbi +2048 -0
  87. metadata +39 -23
  88. data/lib/pdf/hash.rb +0 -20
  89. data/lib/pdf/reader/orientation_detector.rb +0 -34
  90. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  module PDF
@@ -13,7 +14,7 @@ module PDF
13
14
  # objects accessor to help walk the page dictionary in any useful way.
14
15
  #
15
16
  class Page
16
- include ResourceMethods
17
+ extend Forwardable
17
18
 
18
19
  # lowlevel hash-like access to all objects in the underlying PDF
19
20
  attr_reader :objects
@@ -26,6 +27,15 @@ module PDF
26
27
  # operations
27
28
  attr_reader :cache
28
29
 
30
+ def_delegators :resources, :color_spaces
31
+ def_delegators :resources, :fonts
32
+ def_delegators :resources, :graphic_states
33
+ def_delegators :resources, :patterns
34
+ def_delegators :resources, :procedure_sets
35
+ def_delegators :resources, :properties
36
+ def_delegators :resources, :shadings
37
+ def_delegators :resources, :xobjects
38
+
29
39
  # creates a new page wrapper.
30
40
  #
31
41
  # * objects - an ObjectHash instance that wraps a PDF file
@@ -33,10 +43,10 @@ module PDF
33
43
  #
34
44
  def initialize(objects, pagenum, options = {})
35
45
  @objects, @pagenum = objects, pagenum
36
- @page_object = objects.deref(objects.page_references[pagenum - 1])
46
+ @page_object = objects.deref_hash(objects.page_references[pagenum - 1]) || {}
37
47
  @cache = options[:cache] || {}
38
48
 
39
- unless @page_object.is_a?(::Hash)
49
+ if @page_object.empty?
40
50
  raise InvalidPageError, "Invalid page: #{pagenum}"
41
51
  end
42
52
  end
@@ -59,7 +69,7 @@ module PDF
59
69
  def attributes
60
70
  @attributes ||= {}.tap { |hash|
61
71
  page_with_ancestors.reverse.each do |obj|
62
- hash.merge!(@objects.deref(obj))
72
+ hash.merge!(@objects.deref_hash(obj) || {})
63
73
  end
64
74
  }
65
75
  # This shouldn't be necessary, but some non-compliant PDFs leave MediaBox
@@ -68,22 +78,56 @@ module PDF
68
78
  @attributes
69
79
  end
70
80
 
81
+ def height
82
+ rect = Rectangle.new(*attributes[:MediaBox])
83
+ rect.apply_rotation(rotate) if rotate > 0
84
+ rect.height
85
+ end
86
+
87
+ def width
88
+ rect = Rectangle.new(*attributes[:MediaBox])
89
+ rect.apply_rotation(rotate) if rotate > 0
90
+ rect.width
91
+ end
92
+
93
+ def origin
94
+ rect = Rectangle.new(*attributes[:MediaBox])
95
+ rect.apply_rotation(rotate) if rotate > 0
96
+
97
+ rect.bottom_left
98
+ end
99
+
71
100
  # Convenience method to identify the page's orientation.
72
101
  #
73
102
  def orientation
74
- OrientationDetector.new(attributes).orientation
103
+ if height > width
104
+ "portrait"
105
+ else
106
+ "landscape"
107
+ end
75
108
  end
76
109
 
77
110
  # returns the plain text content of this page encoded as UTF-8. Any
78
111
  # characters that can't be translated will be returned as a ▯
79
112
  #
80
- def text
113
+ def text(opts = {})
81
114
  receiver = PageTextReceiver.new
82
115
  walk(receiver)
83
- receiver.content
116
+ runs = receiver.runs(opts)
117
+
118
+ # rectangles[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
119
+ mediabox = rectangles[:MediaBox] || Rectangle.new(0, 0, 0, 0)
120
+
121
+ PageLayout.new(runs, mediabox).to_s
84
122
  end
85
123
  alias :to_s :text
86
124
 
125
+ def runs(opts = {})
126
+ receiver = PageTextReceiver.new
127
+ walk(receiver)
128
+ receiver.runs(opts)
129
+ end
130
+
87
131
  # processes the raw content stream for this page in sequential order and
88
132
  # passes callbacks to the receiver objects.
89
133
  #
@@ -108,6 +152,9 @@ module PDF
108
152
  # the program in the correct order and calls out to your implementation.
109
153
  #
110
154
  def walk(*receivers)
155
+ receivers = receivers.map { |receiver|
156
+ ValidatingReceiver.new(receiver)
157
+ }
111
158
  callback(receivers, :page=, [self])
112
159
  content_stream(receivers, raw_content)
113
160
  end
@@ -116,25 +163,85 @@ module PDF
116
163
  # see here unless you're a PDF nerd like me.
117
164
  #
118
165
  def raw_content
119
- contents = objects.deref(@page_object[:Contents])
166
+ contents = objects.deref_stream_or_array(@page_object[:Contents])
120
167
  [contents].flatten.compact.map { |obj|
121
- objects.deref(obj)
122
- }.map { |obj|
168
+ objects.deref_stream(obj)
169
+ }.compact.map { |obj|
123
170
  obj.unfiltered_data
124
171
  }.join(" ")
125
172
  end
126
173
 
174
+ # returns the angle to rotate the page clockwise. Always 0, 90, 180 or 270
175
+ #
176
+ def rotate
177
+ value = attributes[:Rotate].to_i
178
+ case value
179
+ when 0, 90, 180, 270
180
+ value
181
+ else
182
+ 0
183
+ end
184
+ end
185
+
186
+ # returns the "boxes" that define the page object.
187
+ # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
188
+ #
189
+ # DEPRECATED. Recommend using Page#rectangles instead
190
+ #
191
+ def boxes
192
+ # In ruby 2.4+ we could use Hash#transform_values
193
+ Hash[rectangles.map{ |k,rect| [k,rect.to_a] } ]
194
+ end
195
+
196
+ # returns the "boxes" that define the page object.
197
+ # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
198
+ #
199
+ def rectangles
200
+ # attributes[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
201
+ mediabox = objects.deref_array_of_numbers(attributes[:MediaBox]) || []
202
+ cropbox = objects.deref_array_of_numbers(attributes[:CropBox]) || mediabox
203
+ bleedbox = objects.deref_array_of_numbers(attributes[:BleedBox]) || cropbox
204
+ trimbox = objects.deref_array_of_numbers(attributes[:TrimBox]) || cropbox
205
+ artbox = objects.deref_array_of_numbers(attributes[:ArtBox]) || cropbox
206
+
207
+ begin
208
+ mediarect = Rectangle.from_array(mediabox)
209
+ croprect = Rectangle.from_array(cropbox)
210
+ bleedrect = Rectangle.from_array(bleedbox)
211
+ trimrect = Rectangle.from_array(trimbox)
212
+ artrect = Rectangle.from_array(artbox)
213
+ rescue ArgumentError => e
214
+ raise MalformedPDFError, e.message
215
+ end
216
+
217
+ if rotate > 0
218
+ mediarect.apply_rotation(rotate)
219
+ croprect.apply_rotation(rotate)
220
+ bleedrect.apply_rotation(rotate)
221
+ trimrect.apply_rotation(rotate)
222
+ artrect.apply_rotation(rotate)
223
+ end
224
+
225
+ {
226
+ MediaBox: mediarect,
227
+ CropBox: croprect,
228
+ BleedBox: bleedrect,
229
+ TrimBox: trimrect,
230
+ ArtBox: artrect,
231
+ }
232
+ end
233
+
127
234
  private
128
235
 
129
236
  def root
130
- root ||= objects.deref(@objects.trailer[:Root])
237
+ @root ||= objects.deref_hash(@objects.trailer[:Root]) || {}
131
238
  end
132
239
 
133
240
  # Returns the resources that accompany this page. Includes
134
241
  # resources inherited from parents.
135
242
  #
136
243
  def resources
137
- @resources ||= @objects.deref(attributes[:Resources]) || {}
244
+ @resources ||= Resources.new(@objects, @objects.deref_hash(attributes[:Resources]) || {})
138
245
  end
139
246
 
140
247
  def content_stream(receivers, instructions)
@@ -143,8 +250,8 @@ module PDF
143
250
  params = []
144
251
 
145
252
  while (token = parser.parse_token(PagesStrategy::OPERATORS))
146
- if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
147
- callback(receivers, PagesStrategy::OPERATORS[token], params)
253
+ if token.kind_of?(Token) && method_name = PagesStrategy::OPERATORS[token]
254
+ callback(receivers, method_name, params)
148
255
  params.clear
149
256
  else
150
257
  params << token
@@ -156,9 +263,26 @@ module PDF
156
263
 
157
264
  # calls the name callback method on each receiver object with params as the arguments
158
265
  #
266
+ # The silly style here is because sorbet won't let me use splat arguments
267
+ #
159
268
  def callback(receivers, name, params=[])
160
269
  receivers.each do |receiver|
161
- receiver.send(name, *params) if receiver.respond_to?(name)
270
+ if receiver.respond_to?(name)
271
+ case params.size
272
+ when 0 then receiver.send(name)
273
+ when 1 then receiver.send(name, params[0])
274
+ when 2 then receiver.send(name, params[0], params[1])
275
+ when 3 then receiver.send(name, params[0], params[1], params[2])
276
+ when 4 then receiver.send(name, params[0], params[1], params[2], params[3])
277
+ when 5 then receiver.send(name, params[0], params[1], params[2], params[3], params[4])
278
+ when 6 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5])
279
+ when 7 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6])
280
+ when 8 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7])
281
+ when 9 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8])
282
+ else
283
+ receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8], params[9])
284
+ end
285
+ end
162
286
  end
163
287
  end
164
288
 
@@ -170,7 +294,10 @@ module PDF
170
294
  if origin.nil?
171
295
  []
172
296
  else
173
- obj = objects.deref(origin)
297
+ obj = objects.deref_hash(origin)
298
+ if obj.nil?
299
+ raise MalformedPDFError, "parent mus not be nil"
300
+ end
174
301
  [ select_inheritable(obj) ] + ancestors(obj[:Parent])
175
302
  end
176
303
  end
@@ -1,6 +1,10 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
5
+ require 'pdf/reader/overlapping_runs_filter'
6
+ require 'pdf/reader/zero_width_runs_filter'
7
+
4
8
  class PDF::Reader
5
9
 
6
10
  # Takes a collection of TextRun objects and renders them into a single
@@ -13,24 +17,28 @@ class PDF::Reader
13
17
  DEFAULT_FONT_SIZE = 12
14
18
 
15
19
  def initialize(runs, mediabox)
16
- raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
20
+ # mediabox is a 4-element array for now, but it'd be nice to switch to a
21
+ # PDF::Reader::Rectangle at some point
22
+ PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
17
23
 
18
- @runs = merge_runs(runs)
24
+ @mediabox = process_mediabox(mediabox)
25
+ @runs = runs
19
26
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
20
27
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
21
- @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
22
- @page_width = mediabox[2] - mediabox[0]
23
- @page_height = mediabox[3] - mediabox[1]
24
- @x_offset = @runs.map(&:x).sort.first
28
+ @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
29
+ @x_offset = @runs.map(&:x).sort.first || 0
30
+ lowest_y = @runs.map(&:y).sort.first || 0
31
+ @y_offset = lowest_y > 0 ? 0 : lowest_y
25
32
  end
26
33
 
27
34
  def to_s
28
35
  return "" if @runs.empty?
36
+ return "" if row_count == 0
29
37
 
30
38
  page = row_count.times.map { |i| " " * col_count }
31
39
  @runs.each do |run|
32
40
  x_pos = ((run.x - @x_offset) / col_multiplier).round
33
- y_pos = row_count - (run.y / row_multiplier).round
41
+ y_pos = row_count - ((run.y - @y_offset) / row_multiplier).round
34
42
  if y_pos <= row_count && y_pos >= 0 && x_pos <= col_count && x_pos >= 0
35
43
  local_string_insert(page[y_pos-1], run.text, x_pos)
36
44
  end
@@ -40,6 +48,14 @@ class PDF::Reader
40
48
 
41
49
  private
42
50
 
51
+ def page_width
52
+ @mediabox.width
53
+ end
54
+
55
+ def page_height
56
+ @mediabox.height
57
+ end
58
+
43
59
  # given an array of strings, return a new array with empty rows from the
44
60
  # beginning and end removed.
45
61
  #
@@ -58,19 +74,19 @@ class PDF::Reader
58
74
  end
59
75
 
60
76
  def row_count
61
- @row_count ||= (@page_height / @mean_font_size).floor
77
+ @row_count ||= (page_height / @mean_font_size).floor
62
78
  end
63
79
 
64
80
  def col_count
65
- @col_count ||= ((@page_width / @mean_glyph_width) * 1.05).floor
81
+ @col_count ||= ((page_width / @median_glyph_width) * 1.05).floor
66
82
  end
67
83
 
68
84
  def row_multiplier
69
- @row_multiplier ||= @page_height.to_f / row_count.to_f
85
+ @row_multiplier ||= page_height.to_f / row_count.to_f
70
86
  end
71
87
 
72
88
  def col_multiplier
73
- @col_multiplier ||= @page_width.to_f / col_count.to_f
89
+ @col_multiplier ||= page_width.to_f / col_count.to_f
74
90
  end
75
91
 
76
92
  def mean(collection)
@@ -81,40 +97,28 @@ class PDF::Reader
81
97
  end
82
98
  end
83
99
 
84
- def each_line(&block)
85
- @runs.sort.group_by { |run|
86
- run.y.to_i
87
- }.map { |y, collection|
88
- yield y, collection
89
- }
100
+ def median(collection)
101
+ if collection.size == 0
102
+ 0
103
+ else
104
+ collection.sort[(collection.size * 0.5).floor]
105
+ end
90
106
  end
91
107
 
92
- # take a collection of TextRun objects and merge any that are in close
93
- # proximity
94
- def merge_runs(runs)
95
- runs.group_by { |char|
96
- char.y.to_i
97
- }.map { |y, chars|
98
- group_chars_into_runs(chars.sort)
99
- }.flatten.sort
108
+ def local_string_insert(haystack, needle, index)
109
+ haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
100
110
  end
101
111
 
102
- def group_chars_into_runs(chars)
103
- runs = []
104
- while head = chars.shift
105
- if runs.empty?
106
- runs << head
107
- elsif runs.last.mergable?(head)
108
- runs[-1] = runs.last + head
109
- else
110
- runs << head
111
- end
112
+ def process_mediabox(mediabox)
113
+ if mediabox.is_a?(Array)
114
+ msg = "Passing the mediabox to PageLayout as an Array is deprecated," +
115
+ " please use a Rectangle instead"
116
+ $stderr.puts msg
117
+ PDF::Reader::Rectangle.from_array(mediabox)
118
+ else
119
+ mediabox
112
120
  end
113
- runs
114
121
  end
115
122
 
116
- def local_string_insert(haystack, needle, index)
117
- haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
118
- end
119
123
  end
120
124
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'pdf/reader/transformation_matrix'
@@ -15,7 +16,7 @@ class PDF::Reader
15
16
  :h_scaling => 1.0,
16
17
  :text_leading => 0,
17
18
  :text_font => nil,
18
- :text_font_size => nil,
19
+ :text_font_size => 0,
19
20
  :text_mode => 0,
20
21
  :text_rise => 0,
21
22
  :text_knockout => 0
@@ -30,7 +31,13 @@ class PDF::Reader
30
31
  @xobject_stack = [page.xobjects]
31
32
  @cs_stack = [page.color_spaces]
32
33
  @stack = [DEFAULT_GRAPHICS_STATE.dup]
33
- state[:ctm] = identity_matrix
34
+ state[:ctm] = identity_matrix
35
+
36
+ # These are only valid when inside a `BT` block and we re-initialize them on each
37
+ # `BT`. However, we need the instance variables set so PDFs with the text operators
38
+ # out of order don't trigger NoMethodError when these are nil
39
+ @text_matrix = identity_matrix
40
+ @text_line_matrix = identity_matrix
34
41
  end
35
42
 
36
43
  #####################################################
@@ -312,7 +319,7 @@ class PDF::Reader
312
319
  # may need to be added
313
320
  #
314
321
  def process_glyph_displacement(w0, tj, word_boundary)
315
- fs = font_size # font size
322
+ fs = state[:text_font_size]
316
323
  tc = state[:char_spacing]
317
324
  if word_boundary
318
325
  tw = state[:word_spacing]
@@ -322,22 +329,24 @@ class PDF::Reader
322
329
  th = state[:h_scaling]
323
330
  # optimise the common path to reduce Float allocations
324
331
  if th == 1 && tj == 0 && tc == 0 && tw == 0
325
- glyph_width = w0 * fs
326
- tx = glyph_width
332
+ tx = w0 * fs
333
+ elsif tj != 0
334
+ # don't apply spacing to TJ displacement
335
+ tx = (w0 - (tj/1000.0)) * fs * th
327
336
  else
328
- glyph_width = ((w0 - (tj/1000.0)) * fs) * th
329
- tx = glyph_width + ((tc + tw) * th)
337
+ # apply horizontal scaling to spacing values but not font size
338
+ tx = ((w0 * fs) + tc + tw) * th
330
339
  end
331
-
332
- # TODO: I'm pretty sure that tx shouldn't need to be divided by
333
- # ctm[0] here, but this gets my tests green and I'm out of
334
- # ideas for now
335
340
  # TODO: support ty > 0
336
- if ctm.a == 1 || ctm.a == 0
337
- @text_matrix.horizontal_displacement_multiply!(tx)
338
- else
339
- @text_matrix.horizontal_displacement_multiply!(tx/ctm.a)
340
- end
341
+ ty = 0
342
+ temp = TransformationMatrix.new(1, 0,
343
+ 0, 1,
344
+ tx, ty)
345
+ @text_matrix = temp.multiply!(
346
+ @text_matrix.a, @text_matrix.b,
347
+ @text_matrix.c, @text_matrix.d,
348
+ @text_matrix.e, @text_matrix.f
349
+ )
341
350
  @font_size = @text_rendering_matrix = nil # invalidate cached value
342
351
  end
343
352
 
@@ -381,7 +390,7 @@ class PDF::Reader
381
390
  #
382
391
  def build_fonts(raw_fonts)
383
392
  wrapped_fonts = raw_fonts.map { |label, font|
384
- [label, PDF::Reader::Font.new(@objects, @objects.deref(font))]
393
+ [label, PDF::Reader::Font.new(@objects, @objects.deref_hash(font) || {})]
385
394
  }
386
395
 
387
396
  ::Hash[wrapped_fonts]
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'forwardable'
@@ -41,13 +42,39 @@ module PDF
41
42
  # starting a new page
42
43
  def page=(page)
43
44
  @state = PageState.new(page)
45
+ @page = page
44
46
  @content = []
45
47
  @characters = []
46
- @mediabox = page.objects.deref(page.attributes[:MediaBox])
47
48
  end
48
49
 
50
+ def runs(opts = {})
51
+ runs = @characters
52
+
53
+ if rect = opts.fetch(:rect, @page.rectangles[:CropBox])
54
+ runs = BoundingRectangleRunsFilter.runs_within_rect(runs, rect)
55
+ end
56
+
57
+ if opts.fetch(:skip_zero_width, true)
58
+ runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
59
+ end
60
+
61
+ if opts.fetch(:skip_overlapping, true)
62
+ runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
63
+ end
64
+
65
+ runs = NoTextFilter.exclude_empty_strings(runs)
66
+
67
+ if opts.fetch(:merge, true)
68
+ runs = merge_runs(runs)
69
+ end
70
+
71
+ runs
72
+ end
73
+
74
+ # deprecated
49
75
  def content
50
- PageLayout.new(@characters, @mediabox).to_s
76
+ mediabox = @page.rectangles[:MediaBox]
77
+ PageLayout.new(runs, mediabox).to_s
51
78
  end
52
79
 
53
80
  #####################################################
@@ -62,8 +89,10 @@ module PDF
62
89
  params.each do |arg|
63
90
  if arg.is_a?(String)
64
91
  internal_show_text(arg)
65
- else
92
+ elsif arg.is_a?(Numeric)
66
93
  @state.process_glyph_displacement(0, arg, false)
94
+ else
95
+ # skip it
67
96
  end
68
97
  end
69
98
  end
@@ -94,6 +123,7 @@ module PDF
94
123
  private
95
124
 
96
125
  def internal_show_text(string)
126
+ PDF::Reader::Error.validate_type_as_malformed(string, "string", String)
97
127
  if @state.current_font.nil?
98
128
  raise PDF::Reader::MalformedPDFError, "current font is invalid"
99
129
  end
@@ -101,11 +131,13 @@ module PDF
101
131
  glyphs.each_with_index do |glyph_code, index|
102
132
  # paint the current glyph
103
133
  newx, newy = @state.trm_transform(0,0)
134
+ newx, newy = apply_rotation(newx, newy)
135
+
104
136
  utf8_chars = @state.current_font.to_utf8(glyph_code)
105
137
 
106
138
  # apply the glyph displacement for the current glyph so the next
107
139
  # glyph will appear in the correct position
108
- glyph_width = @state.current_font.glyph_width(glyph_code) / 1000.0
140
+ glyph_width = @state.current_font.glyph_width_in_text_space(glyph_code)
109
141
  th = 1
110
142
  scaled_glyph_width = glyph_width * @state.font_size * th
111
143
  unless utf8_chars == SPACE
@@ -115,6 +147,44 @@ module PDF
115
147
  end
116
148
  end
117
149
 
150
+ def apply_rotation(x, y)
151
+ if @page.rotate == 90
152
+ tmp = x
153
+ x = y
154
+ y = tmp * -1
155
+ elsif @page.rotate == 180
156
+ y *= -1
157
+ x *= -1
158
+ elsif @page.rotate == 270
159
+ tmp = y
160
+ y = x
161
+ x = tmp * -1
162
+ end
163
+ return x, y
164
+ end
165
+
166
+ # take a collection of TextRun objects and merge any that are in close
167
+ # proximity
168
+ def merge_runs(runs)
169
+ runs.group_by { |char|
170
+ char.y.to_i
171
+ }.map { |y, chars|
172
+ group_chars_into_runs(chars.sort)
173
+ }.flatten.sort
174
+ end
175
+
176
+ def group_chars_into_runs(chars)
177
+ chars.each_with_object([]) do |char, runs|
178
+ if runs.empty?
179
+ runs << char
180
+ elsif runs.last.mergable?(char)
181
+ runs[-1] = runs.last + char
182
+ else
183
+ runs << char
184
+ end
185
+ end
186
+ end
187
+
118
188
  end
119
189
  end
120
190
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################