pdf-reader 2.2.0 → 2.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +90 -0
  3. data/README.md +18 -3
  4. data/Rakefile +1 -1
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_text +1 -1
  7. data/examples/extract_fonts.rb +12 -7
  8. data/examples/rspec.rb +1 -0
  9. data/lib/pdf/reader/aes_v2_security_handler.rb +41 -0
  10. data/lib/pdf/reader/aes_v3_security_handler.rb +38 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  14. data/lib/pdf/reader/afm/Courier.afm +342 -342
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  26. data/lib/pdf/reader/bounding_rectangle_runs_filter.rb +16 -0
  27. data/lib/pdf/reader/buffer.rb +91 -47
  28. data/lib/pdf/reader/cid_widths.rb +7 -4
  29. data/lib/pdf/reader/cmap.rb +83 -59
  30. data/lib/pdf/reader/encoding.rb +17 -14
  31. data/lib/pdf/reader/error.rb +15 -3
  32. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  33. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  34. data/lib/pdf/reader/filter/depredict.rb +12 -10
  35. data/lib/pdf/reader/filter/flate.rb +30 -16
  36. data/lib/pdf/reader/filter/lzw.rb +2 -0
  37. data/lib/pdf/reader/filter/null.rb +1 -1
  38. data/lib/pdf/reader/filter/run_length.rb +19 -13
  39. data/lib/pdf/reader/filter.rb +11 -11
  40. data/lib/pdf/reader/font.rb +89 -26
  41. data/lib/pdf/reader/font_descriptor.rb +22 -18
  42. data/lib/pdf/reader/form_xobject.rb +18 -5
  43. data/lib/pdf/reader/glyph_hash.rb +28 -13
  44. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  45. data/lib/pdf/reader/key_builder_v5.rb +138 -0
  46. data/lib/pdf/reader/lzw.rb +28 -11
  47. data/lib/pdf/reader/no_text_filter.rb +14 -0
  48. data/lib/pdf/reader/null_security_handler.rb +1 -4
  49. data/lib/pdf/reader/object_cache.rb +1 -0
  50. data/lib/pdf/reader/object_hash.rb +292 -63
  51. data/lib/pdf/reader/object_stream.rb +3 -2
  52. data/lib/pdf/reader/overlapping_runs_filter.rb +72 -0
  53. data/lib/pdf/reader/page.rb +143 -16
  54. data/lib/pdf/reader/page_layout.rb +43 -39
  55. data/lib/pdf/reader/page_state.rb +26 -17
  56. data/lib/pdf/reader/page_text_receiver.rb +74 -4
  57. data/lib/pdf/reader/pages_strategy.rb +1 -0
  58. data/lib/pdf/reader/parser.rb +34 -14
  59. data/lib/pdf/reader/point.rb +25 -0
  60. data/lib/pdf/reader/print_receiver.rb +1 -0
  61. data/lib/pdf/reader/rc4_security_handler.rb +38 -0
  62. data/lib/pdf/reader/rectangle.rb +113 -0
  63. data/lib/pdf/reader/reference.rb +3 -1
  64. data/lib/pdf/reader/register_receiver.rb +1 -0
  65. data/lib/pdf/reader/{resource_methods.rb → resources.rb} +17 -9
  66. data/lib/pdf/reader/security_handler_factory.rb +79 -0
  67. data/lib/pdf/reader/{standard_security_handler.rb → standard_key_builder.rb} +23 -94
  68. data/lib/pdf/reader/stream.rb +3 -2
  69. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  70. data/lib/pdf/reader/text_run.rb +40 -5
  71. data/lib/pdf/reader/token.rb +1 -0
  72. data/lib/pdf/reader/transformation_matrix.rb +8 -7
  73. data/lib/pdf/reader/type_check.rb +98 -0
  74. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  75. data/lib/pdf/reader/validating_receiver.rb +262 -0
  76. data/lib/pdf/reader/width_calculator/built_in.rb +27 -17
  77. data/lib/pdf/reader/width_calculator/composite.rb +6 -1
  78. data/lib/pdf/reader/width_calculator/true_type.rb +10 -11
  79. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +6 -4
  80. data/lib/pdf/reader/width_calculator/type_zero.rb +6 -2
  81. data/lib/pdf/reader/width_calculator.rb +1 -0
  82. data/lib/pdf/reader/xref.rb +37 -11
  83. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  84. data/lib/pdf/reader.rb +49 -24
  85. data/lib/pdf-reader.rb +1 -0
  86. data/rbi/pdf-reader.rbi +2048 -0
  87. metadata +39 -23
  88. data/lib/pdf/hash.rb +0 -20
  89. data/lib/pdf/reader/orientation_detector.rb +0 -34
  90. data/lib/pdf/reader/standard_security_handler_v5.rb +0 -91
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  module PDF
@@ -13,7 +14,7 @@ module PDF
13
14
  # objects accessor to help walk the page dictionary in any useful way.
14
15
  #
15
16
  class Page
16
- include ResourceMethods
17
+ extend Forwardable
17
18
 
18
19
  # lowlevel hash-like access to all objects in the underlying PDF
19
20
  attr_reader :objects
@@ -26,6 +27,15 @@ module PDF
26
27
  # operations
27
28
  attr_reader :cache
28
29
 
30
+ def_delegators :resources, :color_spaces
31
+ def_delegators :resources, :fonts
32
+ def_delegators :resources, :graphic_states
33
+ def_delegators :resources, :patterns
34
+ def_delegators :resources, :procedure_sets
35
+ def_delegators :resources, :properties
36
+ def_delegators :resources, :shadings
37
+ def_delegators :resources, :xobjects
38
+
29
39
  # creates a new page wrapper.
30
40
  #
31
41
  # * objects - an ObjectHash instance that wraps a PDF file
@@ -33,10 +43,10 @@ module PDF
33
43
  #
34
44
  def initialize(objects, pagenum, options = {})
35
45
  @objects, @pagenum = objects, pagenum
36
- @page_object = objects.deref(objects.page_references[pagenum - 1])
46
+ @page_object = objects.deref_hash(objects.page_references[pagenum - 1]) || {}
37
47
  @cache = options[:cache] || {}
38
48
 
39
- unless @page_object.is_a?(::Hash)
49
+ if @page_object.empty?
40
50
  raise InvalidPageError, "Invalid page: #{pagenum}"
41
51
  end
42
52
  end
@@ -59,7 +69,7 @@ module PDF
59
69
  def attributes
60
70
  @attributes ||= {}.tap { |hash|
61
71
  page_with_ancestors.reverse.each do |obj|
62
- hash.merge!(@objects.deref(obj))
72
+ hash.merge!(@objects.deref_hash(obj) || {})
63
73
  end
64
74
  }
65
75
  # This shouldn't be necesary, but some non compliant PDFs leave MediaBox
@@ -68,22 +78,56 @@ module PDF
68
78
  @attributes
69
79
  end
70
80
 
81
+ def height
82
+ rect = Rectangle.new(*attributes[:MediaBox])
83
+ rect.apply_rotation(rotate) if rotate > 0
84
+ rect.height
85
+ end
86
+
87
+ def width
88
+ rect = Rectangle.new(*attributes[:MediaBox])
89
+ rect.apply_rotation(rotate) if rotate > 0
90
+ rect.width
91
+ end
92
+
93
+ def origin
94
+ rect = Rectangle.new(*attributes[:MediaBox])
95
+ rect.apply_rotation(rotate) if rotate > 0
96
+
97
+ rect.bottom_left
98
+ end
99
+
71
100
  # Convenience method to identify the page's orientation.
72
101
  #
73
102
  def orientation
74
- OrientationDetector.new(attributes).orientation
103
+ if height > width
104
+ "portrait"
105
+ else
106
+ "landscape"
107
+ end
75
108
  end
76
109
 
77
110
  # returns the plain text content of this page encoded as UTF-8. Any
78
111
  # characters that can't be translated will be returned as a ▯
79
112
  #
80
- def text
113
+ def text(opts = {})
81
114
  receiver = PageTextReceiver.new
82
115
  walk(receiver)
83
- receiver.content
116
+ runs = receiver.runs(opts)
117
+
118
+ # rectangles[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
119
+ mediabox = rectangles[:MediaBox] || Rectangle.new(0, 0, 0, 0)
120
+
121
+ PageLayout.new(runs, mediabox).to_s
84
122
  end
85
123
  alias :to_s :text
86
124
 
125
+ def runs(opts = {})
126
+ receiver = PageTextReceiver.new
127
+ walk(receiver)
128
+ receiver.runs(opts)
129
+ end
130
+
87
131
  # processes the raw content stream for this page in sequential order and
88
132
  # passes callbacks to the receiver objects.
89
133
  #
@@ -108,6 +152,9 @@ module PDF
108
152
  # the program in the correct order and calls out to your implementation.
109
153
  #
110
154
  def walk(*receivers)
155
+ receivers = receivers.map { |receiver|
156
+ ValidatingReceiver.new(receiver)
157
+ }
111
158
  callback(receivers, :page=, [self])
112
159
  content_stream(receivers, raw_content)
113
160
  end
@@ -116,25 +163,85 @@ module PDF
116
163
  # see here unless you're a PDF nerd like me.
117
164
  #
118
165
  def raw_content
119
- contents = objects.deref(@page_object[:Contents])
166
+ contents = objects.deref_stream_or_array(@page_object[:Contents])
120
167
  [contents].flatten.compact.map { |obj|
121
- objects.deref(obj)
122
- }.map { |obj|
168
+ objects.deref_stream(obj)
169
+ }.compact.map { |obj|
123
170
  obj.unfiltered_data
124
171
  }.join(" ")
125
172
  end
126
173
 
174
+ # returns the angle to rotate the page clockwise. Always 0, 90, 180 or 270
175
+ #
176
+ def rotate
177
+ value = attributes[:Rotate].to_i
178
+ case value
179
+ when 0, 90, 180, 270
180
+ value
181
+ else
182
+ 0
183
+ end
184
+ end
185
+
186
+ # returns the "boxes" that define the page object.
187
+ # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
188
+ #
189
+ # DEPRECATED. Recommend using Page#rectangles instead
190
+ #
191
+ def boxes
192
+ # In ruby 2.4+ we could use Hash#transform_values
193
+ Hash[rectangles.map{ |k,rect| [k,rect.to_a] } ]
194
+ end
195
+
196
+ # returns the "boxes" that define the page object.
197
+ # values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
198
+ #
199
+ def rectangles
200
+ # attributes[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
201
+ mediabox = objects.deref_array_of_numbers(attributes[:MediaBox]) || []
202
+ cropbox = objects.deref_array_of_numbers(attributes[:CropBox]) || mediabox
203
+ bleedbox = objects.deref_array_of_numbers(attributes[:BleedBox]) || cropbox
204
+ trimbox = objects.deref_array_of_numbers(attributes[:TrimBox]) || cropbox
205
+ artbox = objects.deref_array_of_numbers(attributes[:ArtBox]) || cropbox
206
+
207
+ begin
208
+ mediarect = Rectangle.from_array(mediabox)
209
+ croprect = Rectangle.from_array(cropbox)
210
+ bleedrect = Rectangle.from_array(bleedbox)
211
+ trimrect = Rectangle.from_array(trimbox)
212
+ artrect = Rectangle.from_array(artbox)
213
+ rescue ArgumentError => e
214
+ raise MalformedPDFError, e.message
215
+ end
216
+
217
+ if rotate > 0
218
+ mediarect.apply_rotation(rotate)
219
+ croprect.apply_rotation(rotate)
220
+ bleedrect.apply_rotation(rotate)
221
+ trimrect.apply_rotation(rotate)
222
+ artrect.apply_rotation(rotate)
223
+ end
224
+
225
+ {
226
+ MediaBox: mediarect,
227
+ CropBox: croprect,
228
+ BleedBox: bleedrect,
229
+ TrimBox: trimrect,
230
+ ArtBox: artrect,
231
+ }
232
+ end
233
+
127
234
  private
128
235
 
129
236
  def root
130
- root ||= objects.deref(@objects.trailer[:Root])
237
+ @root ||= objects.deref_hash(@objects.trailer[:Root]) || {}
131
238
  end
132
239
 
133
240
  # Returns the resources that accompany this page. Includes
134
241
  # resources inherited from parents.
135
242
  #
136
243
  def resources
137
- @resources ||= @objects.deref(attributes[:Resources]) || {}
244
+ @resources ||= Resources.new(@objects, @objects.deref_hash(attributes[:Resources]) || {})
138
245
  end
139
246
 
140
247
  def content_stream(receivers, instructions)
@@ -143,8 +250,8 @@ module PDF
143
250
  params = []
144
251
 
145
252
  while (token = parser.parse_token(PagesStrategy::OPERATORS))
146
- if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
147
- callback(receivers, PagesStrategy::OPERATORS[token], params)
253
+ if token.kind_of?(Token) && method_name = PagesStrategy::OPERATORS[token]
254
+ callback(receivers, method_name, params)
148
255
  params.clear
149
256
  else
150
257
  params << token
@@ -156,9 +263,26 @@ module PDF
156
263
 
157
264
  # calls the name callback method on each receiver object with params as the arguments
158
265
  #
266
+ # The silly style here is because sorbet won't let me use splat arguments
267
+ #
159
268
  def callback(receivers, name, params=[])
160
269
  receivers.each do |receiver|
161
- receiver.send(name, *params) if receiver.respond_to?(name)
270
+ if receiver.respond_to?(name)
271
+ case params.size
272
+ when 0 then receiver.send(name)
273
+ when 1 then receiver.send(name, params[0])
274
+ when 2 then receiver.send(name, params[0], params[1])
275
+ when 3 then receiver.send(name, params[0], params[1], params[2])
276
+ when 4 then receiver.send(name, params[0], params[1], params[2], params[3])
277
+ when 5 then receiver.send(name, params[0], params[1], params[2], params[3], params[4])
278
+ when 6 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5])
279
+ when 7 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6])
280
+ when 8 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7])
281
+ when 9 then receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8])
282
+ else
283
+ receiver.send(name, params[0], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8], params[9])
284
+ end
285
+ end
162
286
  end
163
287
  end
164
288
 
@@ -170,7 +294,10 @@ module PDF
170
294
  if origin.nil?
171
295
  []
172
296
  else
173
- obj = objects.deref(origin)
297
+ obj = objects.deref_hash(origin)
298
+ if obj.nil?
299
+ raise MalformedPDFError, "parent mus not be nil"
300
+ end
174
301
  [ select_inheritable(obj) ] + ancestors(obj[:Parent])
175
302
  end
176
303
  end
@@ -1,6 +1,10 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
5
+ require 'pdf/reader/overlapping_runs_filter'
6
+ require 'pdf/reader/zero_width_runs_filter'
7
+
4
8
  class PDF::Reader
5
9
 
6
10
  # Takes a collection of TextRun objects and renders them into a single
@@ -13,24 +17,28 @@ class PDF::Reader
13
17
  DEFAULT_FONT_SIZE = 12
14
18
 
15
19
  def initialize(runs, mediabox)
16
- raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
20
+ # mediabox is a 4-element array for now, but it'd be nice to switch to a
21
+ # PDF::Reader::Rectangle at some point
22
+ PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
17
23
 
18
- @runs = merge_runs(runs)
24
+ @mediabox = process_mediabox(mediabox)
25
+ @runs = runs
19
26
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
20
27
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
21
- @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
22
- @page_width = mediabox[2] - mediabox[0]
23
- @page_height = mediabox[3] - mediabox[1]
24
- @x_offset = @runs.map(&:x).sort.first
28
+ @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
29
+ @x_offset = @runs.map(&:x).sort.first || 0
30
+ lowest_y = @runs.map(&:y).sort.first || 0
31
+ @y_offset = lowest_y > 0 ? 0 : lowest_y
25
32
  end
26
33
 
27
34
  def to_s
28
35
  return "" if @runs.empty?
36
+ return "" if row_count == 0
29
37
 
30
38
  page = row_count.times.map { |i| " " * col_count }
31
39
  @runs.each do |run|
32
40
  x_pos = ((run.x - @x_offset) / col_multiplier).round
33
- y_pos = row_count - (run.y / row_multiplier).round
41
+ y_pos = row_count - ((run.y - @y_offset) / row_multiplier).round
34
42
  if y_pos <= row_count && y_pos >= 0 && x_pos <= col_count && x_pos >= 0
35
43
  local_string_insert(page[y_pos-1], run.text, x_pos)
36
44
  end
@@ -40,6 +48,14 @@ class PDF::Reader
40
48
 
41
49
  private
42
50
 
51
+ def page_width
52
+ @mediabox.width
53
+ end
54
+
55
+ def page_height
56
+ @mediabox.height
57
+ end
58
+
43
59
  # given an array of strings, return a new array with empty rows from the
44
60
  # beginning and end removed.
45
61
  #
@@ -58,19 +74,19 @@ class PDF::Reader
58
74
  end
59
75
 
60
76
  def row_count
61
- @row_count ||= (@page_height / @mean_font_size).floor
77
+ @row_count ||= (page_height / @mean_font_size).floor
62
78
  end
63
79
 
64
80
  def col_count
65
- @col_count ||= ((@page_width / @mean_glyph_width) * 1.05).floor
81
+ @col_count ||= ((page_width / @median_glyph_width) * 1.05).floor
66
82
  end
67
83
 
68
84
  def row_multiplier
69
- @row_multiplier ||= @page_height.to_f / row_count.to_f
85
+ @row_multiplier ||= page_height.to_f / row_count.to_f
70
86
  end
71
87
 
72
88
  def col_multiplier
73
- @col_multiplier ||= @page_width.to_f / col_count.to_f
89
+ @col_multiplier ||= page_width.to_f / col_count.to_f
74
90
  end
75
91
 
76
92
  def mean(collection)
@@ -81,40 +97,28 @@ class PDF::Reader
81
97
  end
82
98
  end
83
99
 
84
- def each_line(&block)
85
- @runs.sort.group_by { |run|
86
- run.y.to_i
87
- }.map { |y, collection|
88
- yield y, collection
89
- }
100
+ def median(collection)
101
+ if collection.size == 0
102
+ 0
103
+ else
104
+ collection.sort[(collection.size * 0.5).floor]
105
+ end
90
106
  end
91
107
 
92
- # take a collection of TextRun objects and merge any that are in close
93
- # proximity
94
- def merge_runs(runs)
95
- runs.group_by { |char|
96
- char.y.to_i
97
- }.map { |y, chars|
98
- group_chars_into_runs(chars.sort)
99
- }.flatten.sort
108
+ def local_string_insert(haystack, needle, index)
109
+ haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
100
110
  end
101
111
 
102
- def group_chars_into_runs(chars)
103
- runs = []
104
- while head = chars.shift
105
- if runs.empty?
106
- runs << head
107
- elsif runs.last.mergable?(head)
108
- runs[-1] = runs.last + head
109
- else
110
- runs << head
111
- end
112
+ def process_mediabox(mediabox)
113
+ if mediabox.is_a?(Array)
114
+ msg = "Passing the mediabox to PageLayout as an Array is deprecated," +
115
+ " please use a Rectangle instead"
116
+ $stderr.puts msg
117
+ PDF::Reader::Rectangle.from_array(mediabox)
118
+ else
119
+ mediabox
112
120
  end
113
- runs
114
121
  end
115
122
 
116
- def local_string_insert(haystack, needle, index)
117
- haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
118
- end
119
123
  end
120
124
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'pdf/reader/transformation_matrix'
@@ -15,7 +16,7 @@ class PDF::Reader
15
16
  :h_scaling => 1.0,
16
17
  :text_leading => 0,
17
18
  :text_font => nil,
18
- :text_font_size => nil,
19
+ :text_font_size => 0,
19
20
  :text_mode => 0,
20
21
  :text_rise => 0,
21
22
  :text_knockout => 0
@@ -30,7 +31,13 @@ class PDF::Reader
30
31
  @xobject_stack = [page.xobjects]
31
32
  @cs_stack = [page.color_spaces]
32
33
  @stack = [DEFAULT_GRAPHICS_STATE.dup]
33
- state[:ctm] = identity_matrix
34
+ state[:ctm] = identity_matrix
35
+
36
+ # These are only valid when inside a `BT` block and we re-initialize them on each
37
+ # `BT`. However, we need the instance variables set so PDFs with the text operators
38
+ # out order don't trigger NoMethodError when these are nil
39
+ @text_matrix = identity_matrix
40
+ @text_line_matrix = identity_matrix
34
41
  end
35
42
 
36
43
  #####################################################
@@ -312,7 +319,7 @@ class PDF::Reader
312
319
  # may need to be added
313
320
  #
314
321
  def process_glyph_displacement(w0, tj, word_boundary)
315
- fs = font_size # font size
322
+ fs = state[:text_font_size]
316
323
  tc = state[:char_spacing]
317
324
  if word_boundary
318
325
  tw = state[:word_spacing]
@@ -322,22 +329,24 @@ class PDF::Reader
322
329
  th = state[:h_scaling]
323
330
  # optimise the common path to reduce Float allocations
324
331
  if th == 1 && tj == 0 && tc == 0 && tw == 0
325
- glyph_width = w0 * fs
326
- tx = glyph_width
332
+ tx = w0 * fs
333
+ elsif tj != 0
334
+ # don't apply spacing to TJ displacement
335
+ tx = (w0 - (tj/1000.0)) * fs * th
327
336
  else
328
- glyph_width = ((w0 - (tj/1000.0)) * fs) * th
329
- tx = glyph_width + ((tc + tw) * th)
337
+ # apply horizontal scaling to spacing values but not font size
338
+ tx = ((w0 * fs) + tc + tw) * th
330
339
  end
331
-
332
- # TODO: I'm pretty sure that tx shouldn't need to be divided by
333
- # ctm[0] here, but this gets my tests green and I'm out of
334
- # ideas for now
335
340
  # TODO: support ty > 0
336
- if ctm.a == 1 || ctm.a == 0
337
- @text_matrix.horizontal_displacement_multiply!(tx)
338
- else
339
- @text_matrix.horizontal_displacement_multiply!(tx/ctm.a)
340
- end
341
+ ty = 0
342
+ temp = TransformationMatrix.new(1, 0,
343
+ 0, 1,
344
+ tx, ty)
345
+ @text_matrix = temp.multiply!(
346
+ @text_matrix.a, @text_matrix.b,
347
+ @text_matrix.c, @text_matrix.d,
348
+ @text_matrix.e, @text_matrix.f
349
+ )
341
350
  @font_size = @text_rendering_matrix = nil # invalidate cached value
342
351
  end
343
352
 
@@ -381,7 +390,7 @@ class PDF::Reader
381
390
  #
382
391
  def build_fonts(raw_fonts)
383
392
  wrapped_fonts = raw_fonts.map { |label, font|
384
- [label, PDF::Reader::Font.new(@objects, @objects.deref(font))]
393
+ [label, PDF::Reader::Font.new(@objects, @objects.deref_hash(font) || {})]
385
394
  }
386
395
 
387
396
  ::Hash[wrapped_fonts]
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'forwardable'
@@ -41,13 +42,39 @@ module PDF
41
42
  # starting a new page
42
43
  def page=(page)
43
44
  @state = PageState.new(page)
45
+ @page = page
44
46
  @content = []
45
47
  @characters = []
46
- @mediabox = page.objects.deref(page.attributes[:MediaBox])
47
48
  end
48
49
 
50
+ def runs(opts = {})
51
+ runs = @characters
52
+
53
+ if rect = opts.fetch(:rect, @page.rectangles[:CropBox])
54
+ runs = BoundingRectangleRunsFilter.runs_within_rect(runs, rect)
55
+ end
56
+
57
+ if opts.fetch(:skip_zero_width, true)
58
+ runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
59
+ end
60
+
61
+ if opts.fetch(:skip_overlapping, true)
62
+ runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
63
+ end
64
+
65
+ runs = NoTextFilter.exclude_empty_strings(runs)
66
+
67
+ if opts.fetch(:merge, true)
68
+ runs = merge_runs(runs)
69
+ end
70
+
71
+ runs
72
+ end
73
+
74
+ # deprecated
49
75
  def content
50
- PageLayout.new(@characters, @mediabox).to_s
76
+ mediabox = @page.rectangles[:MediaBox]
77
+ PageLayout.new(runs, mediabox).to_s
51
78
  end
52
79
 
53
80
  #####################################################
@@ -62,8 +89,10 @@ module PDF
62
89
  params.each do |arg|
63
90
  if arg.is_a?(String)
64
91
  internal_show_text(arg)
65
- else
92
+ elsif arg.is_a?(Numeric)
66
93
  @state.process_glyph_displacement(0, arg, false)
94
+ else
95
+ # skip it
67
96
  end
68
97
  end
69
98
  end
@@ -94,6 +123,7 @@ module PDF
94
123
  private
95
124
 
96
125
  def internal_show_text(string)
126
+ PDF::Reader::Error.validate_type_as_malformed(string, "string", String)
97
127
  if @state.current_font.nil?
98
128
  raise PDF::Reader::MalformedPDFError, "current font is invalid"
99
129
  end
@@ -101,11 +131,13 @@ module PDF
101
131
  glyphs.each_with_index do |glyph_code, index|
102
132
  # paint the current glyph
103
133
  newx, newy = @state.trm_transform(0,0)
134
+ newx, newy = apply_rotation(newx, newy)
135
+
104
136
  utf8_chars = @state.current_font.to_utf8(glyph_code)
105
137
 
106
138
  # apply to glyph displacment for the current glyph so the next
107
139
  # glyph will appear in the correct position
108
- glyph_width = @state.current_font.glyph_width(glyph_code) / 1000.0
140
+ glyph_width = @state.current_font.glyph_width_in_text_space(glyph_code)
109
141
  th = 1
110
142
  scaled_glyph_width = glyph_width * @state.font_size * th
111
143
  unless utf8_chars == SPACE
@@ -115,6 +147,44 @@ module PDF
115
147
  end
116
148
  end
117
149
 
150
+ def apply_rotation(x, y)
151
+ if @page.rotate == 90
152
+ tmp = x
153
+ x = y
154
+ y = tmp * -1
155
+ elsif @page.rotate == 180
156
+ y *= -1
157
+ x *= -1
158
+ elsif @page.rotate == 270
159
+ tmp = y
160
+ y = x
161
+ x = tmp * -1
162
+ end
163
+ return x, y
164
+ end
165
+
166
+ # take a collection of TextRun objects and merge any that are in close
167
+ # proximity
168
+ def merge_runs(runs)
169
+ runs.group_by { |char|
170
+ char.y.to_i
171
+ }.map { |y, chars|
172
+ group_chars_into_runs(chars.sort)
173
+ }.flatten.sort
174
+ end
175
+
176
+ def group_chars_into_runs(chars)
177
+ chars.each_with_object([]) do |char, runs|
178
+ if runs.empty?
179
+ runs << char
180
+ elsif runs.last.mergable?(char)
181
+ runs[-1] = runs.last + char
182
+ else
183
+ runs << char
184
+ end
185
+ end
186
+ end
187
+
118
188
  end
119
189
  end
120
190
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################