pdf-reader 2.4.1 → 2.7.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +40 -0
  3. data/README.md +16 -1
  4. data/Rakefile +1 -1
  5. data/examples/extract_fonts.rb +12 -7
  6. data/examples/rspec.rb +1 -0
  7. data/lib/pdf/reader/buffer.rb +63 -21
  8. data/lib/pdf/reader/cid_widths.rb +1 -0
  9. data/lib/pdf/reader/cmap.rb +5 -3
  10. data/lib/pdf/reader/encoding.rb +3 -2
  11. data/lib/pdf/reader/error.rb +11 -3
  12. data/lib/pdf/reader/filter/ascii85.rb +7 -1
  13. data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
  14. data/lib/pdf/reader/filter/depredict.rb +10 -8
  15. data/lib/pdf/reader/filter/flate.rb +27 -14
  16. data/lib/pdf/reader/filter/lzw.rb +2 -0
  17. data/lib/pdf/reader/filter/null.rb +1 -0
  18. data/lib/pdf/reader/filter/run_length.rb +19 -13
  19. data/lib/pdf/reader/filter.rb +1 -0
  20. data/lib/pdf/reader/font.rb +1 -0
  21. data/lib/pdf/reader/font_descriptor.rb +1 -0
  22. data/lib/pdf/reader/form_xobject.rb +1 -0
  23. data/lib/pdf/reader/glyph_hash.rb +16 -9
  24. data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
  25. data/lib/pdf/reader/lzw.rb +4 -2
  26. data/lib/pdf/reader/null_security_handler.rb +1 -0
  27. data/lib/pdf/reader/object_cache.rb +1 -0
  28. data/lib/pdf/reader/object_hash.rb +8 -3
  29. data/lib/pdf/reader/object_stream.rb +1 -0
  30. data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
  31. data/lib/pdf/reader/page.rb +60 -9
  32. data/lib/pdf/reader/page_layout.rb +37 -23
  33. data/lib/pdf/reader/page_state.rb +18 -23
  34. data/lib/pdf/reader/page_text_receiver.rb +28 -5
  35. data/lib/pdf/reader/pages_strategy.rb +1 -0
  36. data/lib/pdf/reader/parser.rb +12 -7
  37. data/lib/pdf/reader/point.rb +25 -0
  38. data/lib/pdf/reader/print_receiver.rb +1 -0
  39. data/lib/pdf/reader/rectangle.rb +95 -0
  40. data/lib/pdf/reader/reference.rb +1 -0
  41. data/lib/pdf/reader/register_receiver.rb +1 -0
  42. data/lib/pdf/reader/resource_methods.rb +5 -0
  43. data/lib/pdf/reader/standard_security_handler.rb +1 -0
  44. data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
  45. data/lib/pdf/reader/stream.rb +1 -0
  46. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  47. data/lib/pdf/reader/text_run.rb +1 -0
  48. data/lib/pdf/reader/token.rb +1 -0
  49. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  50. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  51. data/lib/pdf/reader/width_calculator/built_in.rb +8 -15
  52. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  53. data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
  54. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  55. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  56. data/lib/pdf/reader/width_calculator.rb +1 -0
  57. data/lib/pdf/reader/xref.rb +7 -1
  58. data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
  59. data/lib/pdf/reader.rb +14 -4
  60. data/lib/pdf-reader.rb +1 -0
  61. data/rbi/pdf-reader.rbi +1744 -0
  62. metadata +17 -13
  63. data/lib/pdf/reader/orientation_detector.rb +0 -34
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'forwardable'
@@ -41,16 +42,14 @@ module PDF
41
42
  # starting a new page
42
43
  def page=(page)
43
44
  @state = PageState.new(page)
45
+ @page = page
44
46
  @content = []
45
47
  @characters = []
46
- @mediabox = page.objects.deref(page.attributes[:MediaBox])
47
- device_bl = @state.ctm_transform(@mediabox[0], @mediabox[1])
48
- device_tr = @state.ctm_transform(@mediabox[2], @mediabox[3])
49
- @device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
50
48
  end
51
49
 
52
50
  def content
53
- PageLayout.new(@characters, @device_mediabox).to_s
51
+ mediabox = @page.rectangles[:MediaBox].to_a
52
+ PageLayout.new(@characters, mediabox).to_s
54
53
  end
55
54
 
56
55
  #####################################################
@@ -104,6 +103,8 @@ module PDF
104
103
  glyphs.each_with_index do |glyph_code, index|
105
104
  # paint the current glyph
106
105
  newx, newy = @state.trm_transform(0,0)
106
+ newx, newy = apply_rotation(newx, newy)
107
+
107
108
  utf8_chars = @state.current_font.to_utf8(glyph_code)
108
109
 
109
110
  # apply to glyph displacement for the current glyph so the next
@@ -118,6 +119,28 @@ module PDF
118
119
  end
119
120
  end
120
121
 
122
+ # TODO: revisit this. It rotates the co-ordinates to the right direction, but I don't
123
+ # think it sets the correct x,y values. We get away with it because we don't
124
+ # return the text with co-ordinates, only the full text arranged in a string.
125
+ #
126
+ # We should provide an API for extracting the text with positioning data and spec
127
+ # that. I suspect the co-ords might be wrong for rotated pages
128
+ def apply_rotation(x, y)
129
+ if @page.rotate == 90
130
+ tmp = x
131
+ x = y
132
+ y = tmp * -1
133
+ elsif @page.rotate == 180
134
+ y *= -1
135
+ x *= -1
136
+ elsif @page.rotate == 270
137
+ tmp = y
138
+ y = x
139
+ x = tmp * -1
140
+ end
141
+ return x, y
142
+ end
143
+
121
144
  end
122
145
  end
123
146
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -166,7 +167,9 @@ class PDF::Reader
166
167
 
167
168
  # add a missing digit if required, as required by the spec
168
169
  str << "0" unless str.size % 2 == 0
169
- str.scan(/../).map {|i| i.hex.chr}.join.force_encoding("binary")
170
+ str.chars.each_slice(2).map { |nibbles|
171
+ nibbles.join("").hex.chr
172
+ }.join.force_encoding("binary")
170
173
  end
171
174
  ################################################################################
172
175
  # Reads a PDF String from the buffer and converts it to a Ruby String
@@ -175,15 +178,18 @@ class PDF::Reader
175
178
  return "".dup.force_encoding("binary") if str == ")"
176
179
  Error.assert_equal(parse_token, ")")
177
180
 
178
- str.gsub!(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
179
- MAPPING[match] || "".dup
181
+ str.gsub!(/\\(\r\n|[nrtbf()\\\n\r]|([0-7]{1,3}))?|\r\n?/m) do |match|
182
+ if $2.nil? # not octal digits
183
+ MAPPING[match] || "".dup
184
+ else # must be octal digits
185
+ ($2.oct & 0xff).chr # ignore high level overflow
186
+ end
180
187
  end
181
188
  str.force_encoding("binary")
182
189
  end
183
190
 
184
191
  MAPPING = {
185
192
  "\r" => "\n",
186
- "\n\r" => "\n",
187
193
  "\r\n" => "\n",
188
194
  "\\n" => "\n",
189
195
  "\\r" => "\r",
@@ -194,10 +200,9 @@ class PDF::Reader
194
200
  "\\)" => ")",
195
201
  "\\\\" => "\\",
196
202
  "\\\n" => "",
203
+ "\\\r" => "",
204
+ "\\\r\n" => "",
197
205
  }
198
- 0.upto(9) { |n| MAPPING["\\00"+n.to_s] = ("00"+n.to_s).oct.chr }
199
- 0.upto(99) { |n| MAPPING["\\0"+n.to_s] = ("0"+n.to_s).oct.chr }
200
- 0.upto(377) { |n| MAPPING["\\"+n.to_s] = n.to_s.oct.chr }
201
206
 
202
207
  ################################################################################
203
208
  # Decodes the contents of a PDF Stream and returns it as a Ruby String.
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ # typed: true
3
+ # frozen_string_literal: true
4
+
5
+ module PDF
6
+ class Reader
7
+
8
+ # PDFs are all about positioning content on a page, so there's lots of need to
9
+ # work with a set of X,Y coordinates.
10
+ #
11
+ class Point
12
+
13
+ attr_reader :x, :y
14
+
15
+ def initialize(x, y)
16
+ @x, @y = x, y
17
+ end
18
+
19
+ def ==(other)
20
+ other.respond_to?(:x) && other.respond_to?(:y) && x == other.x && y == other.y
21
+ end
22
+
23
+ end
24
+ end
25
+ end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -0,0 +1,95 @@
1
+ # coding: utf-8
2
+ # typed: true
3
+ # frozen_string_literal: true
4
+
5
+ module PDF
6
+ class Reader
7
+
8
+ # PDFs represent rectangles all over the place. They're 4 element arrays, like this:
9
+ #
10
+ # [A, B, C, D]
11
+ #
12
+ # Four element arrays are yucky to work with though, so here's a class that's better.
13
+ # Initialize it with the 4 elements, and get utility functions (width, height, etc)
14
+ # for free.
15
+ #
16
+ # By convention the first two elements are x1, y1, the co-ords for the bottom left corner
17
+ # of the rectangle. The third and fourth elements are x2, y2, the co-ords for the top right
18
+ # corner of the rectangle. It's valid for the alternative corners to be used though, so
19
+ # we don't assume which is which.
20
+ #
21
+ class Rectangle
22
+
23
+ attr_reader :bottom_left, :bottom_right, :top_left, :top_right
24
+
25
+ def initialize(x1, y1, x2, y2)
26
+ set_corners(x1, y1, x2, y2)
27
+ end
28
+
29
+ def ==(other)
30
+ to_a == other.to_a
31
+ end
32
+
33
+ def height
34
+ top_right.y - bottom_right.y
35
+ end
36
+
37
+ def width
38
+ bottom_right.x - bottom_left.x
39
+ end
40
+
41
+ # A pdf-style 4-number array
42
+ def to_a
43
+ [
44
+ bottom_left.x,
45
+ bottom_left.y,
46
+ top_right.x,
47
+ top_right.y,
48
+ ]
49
+ end
50
+
51
+ def apply_rotation(degrees)
52
+ return if degrees != 90 && degrees != 180 && degrees != 270
53
+
54
+ if degrees == 90
55
+ new_x1 = bottom_left.x
56
+ new_y1 = bottom_left.y - width
57
+ new_x2 = bottom_left.x + height
58
+ new_y2 = bottom_left.y
59
+ elsif degrees == 180
60
+ new_x1 = bottom_left.x - width
61
+ new_y1 = bottom_left.y - height
62
+ new_x2 = bottom_left.x
63
+ new_y2 = bottom_left.y
64
+ elsif degrees == 270
65
+ new_x1 = bottom_left.x - height
66
+ new_y1 = bottom_left.y
67
+ new_x2 = bottom_left.x
68
+ new_y2 = bottom_left.y + width
69
+ end
70
+ set_corners(new_x1, new_y1, new_x2, new_y2)
71
+ end
72
+
73
+ private
74
+
75
+ def set_corners(x1, y1, x2, y2)
76
+ @bottom_left = PDF::Reader::Point.new(
77
+ [x1, x2].min,
78
+ [y1, y2].min,
79
+ )
80
+ @bottom_right = PDF::Reader::Point.new(
81
+ [x1, x2].max,
82
+ [y1, y2].min,
83
+ )
84
+ @top_left = PDF::Reader::Point.new(
85
+ [x1, x2].min,
86
+ [y1, y2].max,
87
+ )
88
+ @top_right = PDF::Reader::Point.new(
89
+ [x1, x2].max,
90
+ [y1, y2].max,
91
+ )
92
+ end
93
+ end
94
+ end
95
+ end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  # Copyright (C) 2010 James Healy (jimmy@deefa.com)
@@ -1,12 +1,17 @@
1
1
  # coding: utf-8
2
+ # typed: false
2
3
  # frozen_string_literal: true
3
4
 
5
+ # Setting this file to "typed: true" is difficult because it's a mixin that assumes some things
6
+ # are available from the class, like @objects and resources. Sorbet doesn't know about them.
7
+
4
8
  module PDF
5
9
  class Reader
6
10
 
7
11
  # mixin for common methods in Page and FormXobjects
8
12
  #
9
13
  module ResourceMethods
14
+
10
15
  # Returns a Hash of color spaces that are available to this page
11
16
  #
12
17
  # NOTE: this method de-serialises objects from the underlying PDF
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'digest'
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # encoding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  # utilities.rb : General-purpose utility classes which don't fit anywhere else
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require 'afm'
@@ -37,23 +38,15 @@ class PDF::Reader
37
38
  def glyph_width(code_point)
38
39
  return 0 if code_point.nil? || code_point < 0
39
40
 
40
- m = @metrics.char_metrics_by_code[code_point]
41
- if m.nil?
42
- names = @font.encoding.int_to_name(code_point)
41
+ names = @font.encoding.int_to_name(code_point)
42
+ metrics = names.map { |name|
43
+ @metrics.char_metrics[name.to_s]
44
+ }.compact.first
43
45
 
44
- m = names.map { |name|
45
- @metrics.char_metrics[name.to_s]
46
- }.compact.first
47
- end
48
-
49
- if m
50
- m[:wx]
51
- elsif @font.widths[code_point - 1]
52
- @font.widths[code_point - 1]
53
- elsif control_character?(code_point)
54
- 0
46
+ if metrics
47
+ metrics[:wx]
55
48
  else
56
- 0
49
+ @font.widths[code_point - 1] || 0
57
50
  end
58
51
  end
59
52
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  class PDF::Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  # PDF files may define fonts in a number of ways. Each approach means we must
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -131,6 +132,9 @@ class PDF::Reader
131
132
  generation = buf.token.to_i
132
133
  state = buf.token
133
134
 
135
+ # Some PDF writers start numbering at 1 instead of 0. Fix up the number.
136
+ # TODO should this fix be logged?
137
+ objid = 0 if objid == 1 and offset == 0 and generation == 65535 and state == 'f'
134
138
  store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
135
139
  objid += 1
136
140
  params.clear
@@ -146,7 +150,9 @@ class PDF::Reader
146
150
  end
147
151
 
148
152
  load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
149
- load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
153
+ # Some PDF creators seem to use '/Prev 0' in trailer if there is no previous xref
154
+ # It's not possible for an xref to appear at offset 0, so can safely skip the ref
155
+ load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev) and trailer[:Prev].to_i != 0
150
156
 
151
157
  trailer
152
158
  end
@@ -0,0 +1,13 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ class PDF::Reader
6
+ # There's no point rendering zero-width characters
7
+ class ZeroWidthRunsFilter
8
+
9
+ def self.exclude_zero_width_runs(runs)
10
+ runs.reject { |run| run.width == 0 }
11
+ end
12
+ end
13
+ end
data/lib/pdf/reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: true
2
3
  # frozen_string_literal: true
3
4
 
4
5
  ################################################################################
@@ -136,7 +137,7 @@ module PDF
136
137
  def page_count
137
138
  pages = @objects.deref(root[:Pages])
138
139
  unless pages.kind_of?(::Hash)
139
- raise MalformedPDFError, 'Pages structure is missing'
140
+ raise MalformedPDFError, "Pages structure is missing #{pages.class}"
140
141
  end
141
142
  @page_count ||= @objects.deref(pages[:Count])
142
143
  end
@@ -221,7 +222,7 @@ module PDF
221
222
  when Array then
222
223
  obj.map { |item| doc_strings_to_utf8(item) }
223
224
  when String then
224
- if obj[0,2].unpack("C*") == [254, 255]
225
+ if has_utf16_bom?(obj)
225
226
  utf16_to_utf8(obj)
226
227
  else
227
228
  pdfdoc_to_utf8(obj)
@@ -231,6 +232,14 @@ module PDF
231
232
  end
232
233
  end
233
234
 
235
+ def has_utf16_bom?(str)
236
+ first_bytes = str[0,2]
237
+
238
+ return false if first_bytes.nil?
239
+
240
+ first_bytes.unpack("C*") == [254, 255]
241
+ end
242
+
234
243
  # TODO find a PDF I can use to spec this behaviour
235
244
  #
236
245
  def pdfdoc_to_utf8(obj)
@@ -242,7 +251,7 @@ module PDF
242
251
  # String#encode
243
252
  #
244
253
  def utf16_to_utf8(obj)
245
- str = obj[2, obj.size]
254
+ str = obj[2, obj.size].to_s
246
255
  str = str.unpack("n*").pack("U*")
247
256
  str.force_encoding("utf-8")
248
257
  str
@@ -286,7 +295,9 @@ require 'pdf/reader/object_hash'
286
295
  require 'pdf/reader/object_stream'
287
296
  require 'pdf/reader/pages_strategy'
288
297
  require 'pdf/reader/parser'
298
+ require 'pdf/reader/point'
289
299
  require 'pdf/reader/print_receiver'
300
+ require 'pdf/reader/rectangle'
290
301
  require 'pdf/reader/reference'
291
302
  require 'pdf/reader/register_receiver'
292
303
  require 'pdf/reader/null_security_handler'
@@ -299,5 +310,4 @@ require 'pdf/reader/page_state'
299
310
  require 'pdf/reader/page_text_receiver'
300
311
  require 'pdf/reader/token'
301
312
  require 'pdf/reader/xref'
302
- require 'pdf/reader/orientation_detector'
303
313
  require 'pdf/reader/page'
data/lib/pdf-reader.rb CHANGED
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # typed: strict
2
3
  # frozen_string_literal: true
3
4
 
4
5
  require "pdf/reader"