pdf-reader 2.4.1 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +40 -0
  3. data/README.md +16 -1
  4. data/Rakefile +1 -1
  5. data/examples/extract_fonts.rb +12 -7
  6. data/examples/rspec.rb +1 -0
  7. data/lib/pdf/reader/buffer.rb +63 -21
  8. data/lib/pdf/reader/cid_widths.rb +1 -0
  9. data/lib/pdf/reader/cmap.rb +5 -3
  10. data/lib/pdf/reader/encoding.rb +3 -2
  11. data/lib/pdf/reader/error.rb +11 -3
  12. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  13. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  14. data/lib/pdf/reader/filter/depredict.rb +10 -8
  15. data/lib/pdf/reader/filter/flate.rb +27 -14
  16. data/lib/pdf/reader/filter/lzw.rb +2 -0
  17. data/lib/pdf/reader/filter/null.rb +1 -0
  18. data/lib/pdf/reader/filter/run_length.rb +19 -13
  19. data/lib/pdf/reader/filter.rb +1 -0
  20. data/lib/pdf/reader/font.rb +1 -0
  21. data/lib/pdf/reader/font_descriptor.rb +1 -0
  22. data/lib/pdf/reader/form_xobject.rb +1 -0
  23. data/lib/pdf/reader/glyph_hash.rb +16 -9
  24. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  25. data/lib/pdf/reader/lzw.rb +4 -2
  26. data/lib/pdf/reader/null_security_handler.rb +1 -0
  27. data/lib/pdf/reader/object_cache.rb +1 -0
  28. data/lib/pdf/reader/object_hash.rb +8 -3
  29. data/lib/pdf/reader/object_stream.rb +1 -0
  30. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  31. data/lib/pdf/reader/page.rb +60 -9
  32. data/lib/pdf/reader/page_layout.rb +37 -23
  33. data/lib/pdf/reader/page_state.rb +18 -23
  34. data/lib/pdf/reader/page_text_receiver.rb +28 -5
  35. data/lib/pdf/reader/pages_strategy.rb +1 -0
  36. data/lib/pdf/reader/parser.rb +12 -7
  37. data/lib/pdf/reader/point.rb +25 -0
  38. data/lib/pdf/reader/print_receiver.rb +1 -0
  39. data/lib/pdf/reader/rectangle.rb +95 -0
  40. data/lib/pdf/reader/reference.rb +1 -0
  41. data/lib/pdf/reader/register_receiver.rb +1 -0
  42. data/lib/pdf/reader/resource_methods.rb +5 -0
  43. data/lib/pdf/reader/standard_security_handler.rb +1 -0
  44. data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
  45. data/lib/pdf/reader/stream.rb +1 -0
  46. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  47. data/lib/pdf/reader/text_run.rb +1 -0
  48. data/lib/pdf/reader/token.rb +1 -0
  49. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  50. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  51. data/lib/pdf/reader/width_calculator/built_in.rb +8 -15
  52. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  53. data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
  54. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  55. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  56. data/lib/pdf/reader/width_calculator.rb +1 -0
  57. data/lib/pdf/reader/xref.rb +7 -1
  58. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  59. data/lib/pdf/reader.rb +14 -4
  60. data/lib/pdf-reader.rb +1 -0
  61. data/rbi/pdf-reader.rbi +1744 -0
  62. metadata +17 -13
  63. data/lib/pdf/reader/orientation_detector.rb +0 -34
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'forwardable'
@@ -41,16 +42,14 @@ module PDF
41
42
  # starting a new page
42
43
  def page=(page)
43
44
  @state = PageState.new(page)
45
+ @page = page
44
46
  @content = []
45
47
  @characters = []
46
- @mediabox = page.objects.deref(page.attributes[:MediaBox])
47
- device_bl = @state.ctm_transform(@mediabox[0], @mediabox[1])
48
- device_tr = @state.ctm_transform(@mediabox[2], @mediabox[3])
49
- @device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
50
48
  end
51
49
 
52
50
  def content
53
- PageLayout.new(@characters, @device_mediabox).to_s
51
+ mediabox = @page.rectangles[:MediaBox].to_a
52
+ PageLayout.new(@characters, mediabox).to_s
54
53
  end
55
54
 
56
55
  #####################################################
@@ -104,6 +103,8 @@ module PDF
104
103
  glyphs.each_with_index do |glyph_code, index|
105
104
  # paint the current glyph
106
105
  newx, newy = @state.trm_transform(0,0)
106
+ newx, newy = apply_rotation(newx, newy)
107
+
107
108
  utf8_chars = @state.current_font.to_utf8(glyph_code)
108
109
 
109
110
  # apply to glyph displacement for the current glyph so the next
@@ -118,6 +119,28 @@ module PDF
118
119
  end
119
120
  end
120
121
 
122
+ # TODO: revisit this. It rotates the co-ordinates to the right direction, but I don't
123
+ # think it sets the correct x,y values. We get away with it because we don't
124
+ # return the text with co-ordinates, only the full text arranged in a string.
125
+ #
126
+ # We should provide an API for extracting the text with positioning data and spec
127
+ # that. I suspect the co-ords might be wrong for rotated pages
128
+ def apply_rotation(x, y)
129
+ if @page.rotate == 90
130
+ tmp = x
131
+ x = y
132
+ y = tmp * -1
133
+ elsif @page.rotate == 180
134
+ y *= -1
135
+ x *= -1
136
+ elsif @page.rotate == 270
137
+ tmp = y
138
+ y = x
139
+ x = tmp * -1
140
+ end
141
+ return x, y
142
+ end
143
+
121
144
  end
122
145
  end
123
146
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -166,7 +167,9 @@ class PDF::Reader
166
167
 
167
168
  # add a missing digit if required, as required by the spec
168
169
  str << "0" unless str.size % 2 == 0
169
- str.scan(/../).map {|i| i.hex.chr}.join.force_encoding("binary")
170
+ str.chars.each_slice(2).map { |nibbles|
171
+ nibbles.join("").hex.chr
172
+ }.join.force_encoding("binary")
170
173
  end
171
174
  ################################################################################
172
175
  # Reads a PDF String from the buffer and converts it to a Ruby String
@@ -175,15 +178,18 @@ class PDF::Reader
175
178
  return "".dup.force_encoding("binary") if str == ")"
176
179
  Error.assert_equal(parse_token, ")")
177
180
 
178
- str.gsub!(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
179
- MAPPING[match] || "".dup
181
+ str.gsub!(/\\(\r\n|[nrtbf()\\\n\r]|([0-7]{1,3}))?|\r\n?/m) do |match|
182
+ if $2.nil? # not octal digits
183
+ MAPPING[match] || "".dup
184
+ else # must be octal digits
185
+ ($2.oct & 0xff).chr # ignore high level overflow
186
+ end
180
187
  end
181
188
  str.force_encoding("binary")
182
189
  end
183
190
 
184
191
  MAPPING = {
185
192
  "\r" => "\n",
186
- "\n\r" => "\n",
187
193
  "\r\n" => "\n",
188
194
  "\\n" => "\n",
189
195
  "\\r" => "\r",
@@ -194,10 +200,9 @@ class PDF::Reader
194
200
  "\\)" => ")",
195
201
  "\\\\" => "\\",
196
202
  "\\\n" => "",
203
+ "\\\r" => "",
204
+ "\\\r\n" => "",
197
205
  }
198
- 0.upto(9) { |n| MAPPING["\\00"+n.to_s] = ("00"+n.to_s).oct.chr }
199
- 0.upto(99) { |n| MAPPING["\\0"+n.to_s] = ("0"+n.to_s).oct.chr }
200
- 0.upto(377) { |n| MAPPING["\\"+n.to_s] = n.to_s.oct.chr }
201
206
 
202
207
  ################################################################################
203
208
  # Decodes the contents of a PDF Stream and returns it as a Ruby String.
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ # typed: true
3
+ # frozen_string_literal: true
4
+
5
+ module PDF
6
+ class Reader
7
+
8
+ # PDFs are all about positioning content on a page, so there's lots of need to
9
+ # work with a set of X,Y coordinates.
10
+ #
11
+ class Point
12
+
13
+ attr_reader :x, :y
14
+
15
+ def initialize(x, y)
16
+ @x, @y = x, y
17
+ end
18
+
19
+ def ==(other)
20
+ other.respond_to?(:x) && other.respond_to?(:y) && x == other.x && y == other.y
21
+ end
22
+
23
+ end
24
+ end
25
+ end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -0,0 +1,95 @@
1
+ # coding: utf-8
2
+ # typed: true
3
+ # frozen_string_literal: true
4
+
5
+ module PDF
6
+ class Reader
7
+
8
+ # PDFs represent rectangles all over the place. They're 4 element arrays, like this:
9
+ #
10
+ # [A, B, C, D]
11
+ #
12
+ # Four element arrays are yucky to work with though, so here's a class that's better.
13
+ # Initialize it with the 4 elements, and get utility functions (width, height, etc)
14
+ # for free.
15
+ #
16
+ # By convention the first two elements are x1, y1, the co-ords for the bottom left corner
17
+ # of the rectangle. The third and fourth elements are x2, y2, the co-ords for the top right
18
+ # corner of the rectangle. It's valid for the alternative corners to be used though, so
19
+ # we don't assume which is which.
20
+ #
21
+ class Rectangle
22
+
23
+ attr_reader :bottom_left, :bottom_right, :top_left, :top_right
24
+
25
+ def initialize(x1, y1, x2, y2)
26
+ set_corners(x1, y1, x2, y2)
27
+ end
28
+
29
+ def ==(other)
30
+ to_a == other.to_a
31
+ end
32
+
33
+ def height
34
+ top_right.y - bottom_right.y
35
+ end
36
+
37
+ def width
38
+ bottom_right.x - bottom_left.x
39
+ end
40
+
41
+ # A pdf-style 4-number array
42
+ def to_a
43
+ [
44
+ bottom_left.x,
45
+ bottom_left.y,
46
+ top_right.x,
47
+ top_right.y,
48
+ ]
49
+ end
50
+
51
+ def apply_rotation(degrees)
52
+ return if degrees != 90 && degrees != 180 && degrees != 270
53
+
54
+ if degrees == 90
55
+ new_x1 = bottom_left.x
56
+ new_y1 = bottom_left.y - width
57
+ new_x2 = bottom_left.x + height
58
+ new_y2 = bottom_left.y
59
+ elsif degrees == 180
60
+ new_x1 = bottom_left.x - width
61
+ new_y1 = bottom_left.y - height
62
+ new_x2 = bottom_left.x
63
+ new_y2 = bottom_left.y
64
+ elsif degrees == 270
65
+ new_x1 = bottom_left.x - height
66
+ new_y1 = bottom_left.y
67
+ new_x2 = bottom_left.x
68
+ new_y2 = bottom_left.y + width
69
+ end
70
+ set_corners(new_x1, new_y1, new_x2, new_y2)
71
+ end
72
+
73
+ private
74
+
75
+ def set_corners(x1, y1, x2, y2)
76
+ @bottom_left = PDF::Reader::Point.new(
77
+ [x1, x2].min,
78
+ [y1, y2].min,
79
+ )
80
+ @bottom_right = PDF::Reader::Point.new(
81
+ [x1, x2].max,
82
+ [y1, y2].min,
83
+ )
84
+ @top_left = PDF::Reader::Point.new(
85
+ [x1, x2].min,
86
+ [y1, y2].max,
87
+ )
88
+ @top_right = PDF::Reader::Point.new(
89
+ [x1, x2].max,
90
+ [y1, y2].max,
91
+ )
92
+ end
93
+ end
94
+ end
95
+ end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  # Copyright (C) 2010 James Healy (jimmy@deefa.com)
@@ -1,12 +1,17 @@
1
1
  # coding: utf-8
2
+ # typed: false
2
3
  # frozen_string_literal: true
3
4
 
5
+ # Setting this file to "typed: true" is difficult because it's a mixin that assumes some things
6
+ # are available from the class, like @objects and resources. Sorbet doesn't know about them.
7
+
4
8
  module PDF
5
9
  class Reader
6
10
 
7
11
  # mixin for common methods in Page and FormXobjects
8
12
  #
9
13
  module ResourceMethods
14
+
10
15
  # Returns a Hash of color spaces that are available to this page
11
16
  #
12
17
  # NOTE: this method de-serialises objects from the underlying PDF
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'digest'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # encoding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  # utilities.rb : General-purpose utility classes which don't fit anywhere else
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'afm'
@@ -37,23 +38,15 @@ class PDF::Reader
37
38
  def glyph_width(code_point)
38
39
  return 0 if code_point.nil? || code_point < 0
39
40
 
40
- m = @metrics.char_metrics_by_code[code_point]
41
- if m.nil?
42
- names = @font.encoding.int_to_name(code_point)
41
+ names = @font.encoding.int_to_name(code_point)
42
+ metrics = names.map { |name|
43
+ @metrics.char_metrics[name.to_s]
44
+ }.compact.first
43
45
 
44
- m = names.map { |name|
45
- @metrics.char_metrics[name.to_s]
46
- }.compact.first
47
- end
48
-
49
- if m
50
- m[:wx]
51
- elsif @font.widths[code_point - 1]
52
- @font.widths[code_point - 1]
53
- elsif control_character?(code_point)
54
- 0
46
+ if metrics
47
+ metrics[:wx]
55
48
  else
56
- 0
49
+ @font.widths[code_point - 1] || 0
57
50
  end
58
51
  end
59
52
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  # PDF files may define fonts in a number of ways. Each approach means we must
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -131,6 +132,9 @@ class PDF::Reader
131
132
  generation = buf.token.to_i
132
133
  state = buf.token
133
134
 
135
+ # Some PDF writers start numbering at 1 instead of 0. Fix up the number.
136
+ # TODO should this fix be logged?
137
+ objid = 0 if objid == 1 and offset == 0 and generation == 65535 and state == 'f'
134
138
  store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
135
139
  objid += 1
136
140
  params.clear
@@ -146,7 +150,9 @@ class PDF::Reader
146
150
  end
147
151
 
148
152
  load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
149
- load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
153
+ # Some PDF creators seem to use '/Prev 0' in trailer if there is no previous xref
154
+ # It's not possible for an xref to appear at offset 0, so can safely skip the ref
155
+ load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev) and trailer[:Prev].to_i != 0
150
156
 
151
157
  trailer
152
158
  end
@@ -0,0 +1,13 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ class PDF::Reader
6
+ # There's no point rendering zero-width characters
7
+ class ZeroWidthRunsFilter
8
+
9
+ def self.exclude_zero_width_runs(runs)
10
+ runs.reject { |run| run.width == 0 }
11
+ end
12
+ end
13
+ end
data/lib/pdf/reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -136,7 +137,7 @@ module PDF
136
137
  def page_count
137
138
  pages = @objects.deref(root[:Pages])
138
139
  unless pages.kind_of?(::Hash)
139
- raise MalformedPDFError, 'Pages structure is missing'
140
+ raise MalformedPDFError, "Pages structure is missing #{pages.class}"
140
141
  end
141
142
  @page_count ||= @objects.deref(pages[:Count])
142
143
  end
@@ -221,7 +222,7 @@ module PDF
221
222
  when Array then
222
223
  obj.map { |item| doc_strings_to_utf8(item) }
223
224
  when String then
224
- if obj[0,2].unpack("C*") == [254, 255]
225
+ if has_utf16_bom?(obj)
225
226
  utf16_to_utf8(obj)
226
227
  else
227
228
  pdfdoc_to_utf8(obj)
@@ -231,6 +232,14 @@ module PDF
231
232
  end
232
233
  end
233
234
 
235
+ def has_utf16_bom?(str)
236
+ first_bytes = str[0,2]
237
+
238
+ return false if first_bytes.nil?
239
+
240
+ first_bytes.unpack("C*") == [254, 255]
241
+ end
242
+
234
243
  # TODO find a PDF I can use to spec this behaviour
235
244
  #
236
245
  def pdfdoc_to_utf8(obj)
@@ -242,7 +251,7 @@ module PDF
242
251
  # String#encode
243
252
  #
244
253
  def utf16_to_utf8(obj)
245
- str = obj[2, obj.size]
254
+ str = obj[2, obj.size].to_s
246
255
  str = str.unpack("n*").pack("U*")
247
256
  str.force_encoding("utf-8")
248
257
  str
@@ -286,7 +295,9 @@ require 'pdf/reader/object_hash'
286
295
  require 'pdf/reader/object_stream'
287
296
  require 'pdf/reader/pages_strategy'
288
297
  require 'pdf/reader/parser'
298
+ require 'pdf/reader/point'
289
299
  require 'pdf/reader/print_receiver'
300
+ require 'pdf/reader/rectangle'
290
301
  require 'pdf/reader/reference'
291
302
  require 'pdf/reader/register_receiver'
292
303
  require 'pdf/reader/null_security_handler'
@@ -299,5 +310,4 @@ require 'pdf/reader/page_state'
299
310
  require 'pdf/reader/page_text_receiver'
300
311
  require 'pdf/reader/token'
301
312
  require 'pdf/reader/xref'
302
- require 'pdf/reader/orientation_detector'
303
313
  require 'pdf/reader/page'
data/lib/pdf-reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require "pdf/reader"