pdf-reader 1.1.1 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG +87 -2
  3. data/{README.rdoc → README.md} +43 -31
  4. data/Rakefile +21 -16
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_object +4 -1
  7. data/bin/pdf_text +1 -3
  8. data/examples/callbacks.rb +2 -1
  9. data/examples/extract_images.rb +11 -6
  10. data/examples/fuzzy_paragraphs.rb +24 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
  14. data/lib/pdf/reader/afm/Courier.afm +342 -0
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -0
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
  26. data/lib/pdf/reader/buffer.rb +90 -63
  27. data/lib/pdf/reader/cid_widths.rb +63 -0
  28. data/lib/pdf/reader/cmap.rb +69 -38
  29. data/lib/pdf/reader/encoding.rb +74 -48
  30. data/lib/pdf/reader/error.rb +24 -4
  31. data/lib/pdf/reader/filter/ascii85.rb +28 -0
  32. data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
  33. data/lib/pdf/reader/filter/depredict.rb +141 -0
  34. data/lib/pdf/reader/filter/flate.rb +53 -0
  35. data/lib/pdf/reader/filter/lzw.rb +21 -0
  36. data/lib/pdf/reader/filter/null.rb +18 -0
  37. data/lib/pdf/reader/filter/run_length.rb +45 -0
  38. data/lib/pdf/reader/filter.rb +15 -234
  39. data/lib/pdf/reader/font.rb +107 -43
  40. data/lib/pdf/reader/font_descriptor.rb +80 -0
  41. data/lib/pdf/reader/form_xobject.rb +26 -4
  42. data/lib/pdf/reader/glyph_hash.rb +56 -18
  43. data/lib/pdf/reader/lzw.rb +6 -4
  44. data/lib/pdf/reader/null_security_handler.rb +17 -0
  45. data/lib/pdf/reader/object_cache.rb +40 -16
  46. data/lib/pdf/reader/object_hash.rb +94 -40
  47. data/lib/pdf/reader/object_stream.rb +1 -0
  48. data/lib/pdf/reader/orientation_detector.rb +34 -0
  49. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  50. data/lib/pdf/reader/page.rb +48 -3
  51. data/lib/pdf/reader/page_layout.rb +125 -0
  52. data/lib/pdf/reader/page_state.rb +185 -70
  53. data/lib/pdf/reader/page_text_receiver.rb +70 -20
  54. data/lib/pdf/reader/pages_strategy.rb +4 -293
  55. data/lib/pdf/reader/parser.rb +37 -61
  56. data/lib/pdf/reader/print_receiver.rb +6 -0
  57. data/lib/pdf/reader/reference.rb +4 -1
  58. data/lib/pdf/reader/register_receiver.rb +17 -31
  59. data/lib/pdf/reader/resource_methods.rb +1 -0
  60. data/lib/pdf/reader/standard_security_handler.rb +82 -42
  61. data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
  62. data/lib/pdf/reader/stream.rb +5 -2
  63. data/lib/pdf/reader/synchronized_cache.rb +33 -0
  64. data/lib/pdf/reader/text_run.rb +99 -0
  65. data/lib/pdf/reader/token.rb +4 -1
  66. data/lib/pdf/reader/transformation_matrix.rb +195 -0
  67. data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
  68. data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
  69. data/lib/pdf/reader/width_calculator/composite.rb +28 -0
  70. data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
  71. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
  72. data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
  73. data/lib/pdf/reader/width_calculator.rb +12 -0
  74. data/lib/pdf/reader/xref.rb +41 -9
  75. data/lib/pdf/reader.rb +45 -104
  76. data/lib/pdf-reader.rb +4 -1
  77. metadata +220 -101
  78. data/bin/pdf_list_callbacks +0 -17
  79. data/lib/pdf/hash.rb +0 -15
  80. data/lib/pdf/reader/abstract_strategy.rb +0 -81
  81. data/lib/pdf/reader/metadata_strategy.rb +0 -56
  82. data/lib/pdf/reader/text_receiver.rb +0 -264
@@ -0,0 +1,91 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ require 'digest'
5
+ require 'openssl'
6
+
7
+ class PDF::Reader
8
+
9
+ # Provides an interface to the Encrypt dictionary for use when decrypting
10
+ class StandardSecurityHandlerV5
11
+
12
+ attr_reader :key_length, :encrypt_key
13
+
14
+ def initialize(opts = {})
15
+ @key_length = 256
16
+ @O = opts[:O] # hash(32B) + validation salt(8B) + key salt(8B)
17
+ @U = opts[:U] # hash(32B) + validation salt(8B) + key salt(8B)
18
+ @OE = opts[:OE] # decryption key, encrypted w/ owner password
19
+ @UE = opts[:UE] # decryption key, encrypted w/ user password
20
+ @encrypt_key = build_standard_key(opts[:password] || '')
21
+ end
22
+
23
+ # This handler supports AES-256 encryption defined in PDF 1.7 Extension Level 3
24
+ def self.supports?(encrypt)
25
+ return false if encrypt.nil?
26
+
27
+ filter = encrypt.fetch(:Filter, :Standard)
28
+ version = encrypt.fetch(:V, 0)
29
+ revision = encrypt.fetch(:R, 0)
30
+ algorithm = encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
31
+ (filter == :Standard) && (encrypt[:StmF] == encrypt[:StrF]) &&
32
+ ((version == 5) && (revision == 5) && (algorithm == :AESV3))
33
+ end
34
+
35
+ ##7.6.2 General Encryption Algorithm
36
+ #
37
+ # Algorithm 1: Encryption of data using the RC4 or AES algorithms
38
+ #
39
+ # used to decrypt RC4/AES encrypted PDF streams (buf)
40
+ #
41
+ # buf - a string to decrypt
42
+ # ref - a PDF::Reader::Reference for the object to decrypt
43
+ #
44
+ def decrypt( buf, ref )
45
+ cipher = OpenSSL::Cipher.new("AES-#{@key_length}-CBC")
46
+ cipher.decrypt
47
+ cipher.key = @encrypt_key.dup
48
+ cipher.iv = buf[0..15]
49
+ cipher.update(buf[16..-1]) + cipher.final
50
+ end
51
+
52
+ private
53
+ # Algorithm 3.2a - Computing an encryption key
54
+ #
55
+ # Defined in PDF 1.7 Extension Level 3
56
+ #
57
+ # if the string is a valid user/owner password, this will return the decryption key
58
+ #
59
+ def auth_owner_pass(password)
60
+ if Digest::SHA256.digest(password + @O[32..39] + @U) == @O[0..31]
61
+ cipher = OpenSSL::Cipher.new('AES-256-CBC')
62
+ cipher.decrypt
63
+ cipher.key = Digest::SHA256.digest(password + @O[40..-1] + @U)
64
+ cipher.iv = "\x00" * 16
65
+ cipher.padding = 0
66
+ cipher.update(@OE) + cipher.final
67
+ end
68
+ end
69
+
70
+ def auth_user_pass(password)
71
+ if Digest::SHA256.digest(password + @U[32..39]) == @U[0..31]
72
+ cipher = OpenSSL::Cipher.new('AES-256-CBC')
73
+ cipher.decrypt
74
+ cipher.key = Digest::SHA256.digest(password + @U[40..-1])
75
+ cipher.iv = "\x00" * 16
76
+ cipher.padding = 0
77
+ cipher.update(@UE) + cipher.final
78
+ end
79
+ end
80
+
81
+ def build_standard_key(pass)
82
+ pass = pass.byteslice(0...127) # UTF-8 encoded password. first 127 bytes
83
+
84
+ encrypt_key = auth_owner_pass(pass)
85
+ encrypt_key ||= auth_user_pass(pass)
86
+
87
+ raise PDF::Reader::EncryptedPDFError, "Invalid password (#{pass})" if encrypt_key.nil?
88
+ encrypt_key
89
+ end
90
+ end
91
+ end
@@ -1,3 +1,6 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
1
4
  ################################################################################
2
5
  #
3
6
  # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
@@ -35,7 +38,7 @@ class PDF::Reader
35
38
  ################################################################################
36
39
  # Creates a new stream with the specified dictionary and data. The dictionary
37
40
  # should be a standard ruby hash, the data should be a standard ruby string.
38
- def initialize (hash, data)
41
+ def initialize(hash, data)
39
42
  @hash = hash
40
43
  @data = data
41
44
  @udata = nil
@@ -58,7 +61,7 @@ class PDF::Reader
58
61
  end
59
62
 
60
63
  Array(hash[:Filter]).each_with_index do |filter, index|
61
- @udata = Filter.new(filter, options[index]).filter(@udata)
64
+ @udata = Filter.with(filter, options[index]).filter(@udata)
62
65
  end
63
66
  end
64
67
  @udata
@@ -0,0 +1,33 @@
1
+ # encoding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ # utilities.rb : General-purpose utility classes which don't fit anywhere else
5
+ #
6
+ # Copyright August 2012, Alex Dowad. All Rights Reserved.
7
+ #
8
+ # This is free software. Please see the LICENSE and COPYING files for details.
9
+ #
10
+ # This was originally written for the prawn gem.
11
+
12
+ require 'thread'
13
+
14
+ class PDF::Reader
15
+
16
+ # Throughout the pdf-reader codebase, repeated calculations which can benefit
17
+ # from caching are made. In some cases, caching and reusing results can not
18
+ # only save CPU cycles but also greatly reduce memory requirements. But at the
19
+ # same time, we don't want to throw away thread safety. We have two
20
+ # interchangeable thread-safe cache implementations:
21
+ class SynchronizedCache
22
+ def initialize
23
+ @cache = {}
24
+ @mutex = Mutex.new
25
+ end
26
+ def [](key)
27
+ @mutex.synchronize { @cache[key] }
28
+ end
29
+ def []=(key,value)
30
+ @mutex.synchronize { @cache[key] = value }
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,99 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+ # A value object that represents one or more consecutive characters on a page.
6
+ class TextRun
7
+ include Comparable
8
+
9
+ attr_reader :x, :y, :width, :font_size, :text
10
+
11
+ alias :to_s :text
12
+
13
+ def initialize(x, y, width, font_size, text)
14
+ @x = x
15
+ @y = y
16
+ @width = width
17
+ @font_size = font_size.floor
18
+ @text = text
19
+ end
20
+
21
+ # Allows collections of TextRun objects to be sorted. They will be sorted
22
+ # in order of their position on a cartesian plane - Top Left to Bottom Right
23
+ def <=>(other)
24
+ if x == other.x && y == other.y
25
+ 0
26
+ elsif y < other.y
27
+ 1
28
+ elsif y > other.y
29
+ -1
30
+ elsif x < other.x
31
+ -1
32
+ elsif x > other.x
33
+ 1
34
+ end
35
+ end
36
+
37
+ def endx
38
+ @endx ||= x + width
39
+ end
40
+
41
+ def endy
42
+ @endy ||= y + font_size
43
+ end
44
+
45
+ def mean_character_width
46
+ @width / character_count
47
+ end
48
+
49
+ def mergable?(other)
50
+ y.to_i == other.y.to_i && font_size == other.font_size && mergable_range.include?(other.x)
51
+ end
52
+
53
+ def +(other)
54
+ raise ArgumentError, "#{other} cannot be merged with this run" unless mergable?(other)
55
+
56
+ if (other.x - endx) <( font_size * 0.2)
57
+ TextRun.new(x, y, other.endx - x, font_size, text + other.text)
58
+ else
59
+ TextRun.new(x, y, other.endx - x, font_size, "#{text} #{other.text}")
60
+ end
61
+ end
62
+
63
+ def inspect
64
+ "#{text} w:#{width} f:#{font_size} @#{x},#{y}"
65
+ end
66
+
67
+ def intersect?(other_run)
68
+ x <= other_run.endx && endx >= other_run.x &&
69
+ endy >= other_run.y && y <= other_run.endy
70
+ end
71
+
72
+ # return what percentage of this text run is overlapped by another run
73
+ def intersection_area_percent(other_run)
74
+ return 0 unless intersect?(other_run)
75
+
76
+ dx = [endx, other_run.endx].min - [x, other_run.x].max
77
+ dy = [endy, other_run.endy].min - [y, other_run.y].max
78
+ intersection_area = dx*dy
79
+
80
+ intersection_area.to_f / area
81
+ end
82
+
83
+ private
84
+
85
+ def area
86
+ (endx - x) * (endy - y)
87
+ end
88
+
89
+ def mergable_range
90
+ @mergable_range ||= Range.new(endx - 3, endx + font_size)
91
+ end
92
+
93
+ # Assume string encoding is marked correctly and we can trust String#size to return a
94
+ # character count
95
+ def character_count
96
+ @text.size.to_f
97
+ end
98
+ end
99
+ end
@@ -1,3 +1,6 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
1
4
  ################################################################################
2
5
  #
3
6
  # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
@@ -31,7 +34,7 @@ class PDF::Reader
31
34
  class Token < String # :nodoc:
32
35
  ################################################################################
33
36
  # Creates a new token with the specified value
34
- def initialize (val)
37
+ def initialize(val)
35
38
  super
36
39
  end
37
40
  ################################################################################
@@ -0,0 +1,195 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+ # co-ordinate systems in PDF files are specified using a 3x3 matrix that looks
6
+ # something like this:
7
+ #
8
+ # [ a b 0 ]
9
+ # [ c d 0 ]
10
+ # [ e f 1 ]
11
+ #
12
+ # Because the final column never changes, we can represent each matrix using
13
+ # only 6 numbers. This is important to save CPU time, memory and GC pressure
14
+ # caused by allocating too many unnecessary objects.
15
+ class TransformationMatrix
16
+ attr_reader :a, :b, :c, :d, :e, :f
17
+
18
+ def initialize(a, b, c, d, e, f)
19
+ @a, @b, @c, @d, @e, @f = a, b, c, d, e, f
20
+ end
21
+
22
+ def inspect
23
+ "#{a}, #{b}, 0,\n#{c}, #{d}, #{0},\n#{e}, #{f}, 1"
24
+ end
25
+
26
+ def to_a
27
+ [@a,@b,0,
28
+ @c,@d,0,
29
+ @e,@f,1]
30
+ end
31
+
32
+ # multiply this matrix with another.
33
+ #
34
+ # the second matrix is represented by the 6 scalar values that are changeable
35
+ # in a PDF transformation matrix.
36
+ #
37
+ # WARNING: This mutates the current matrix to avoid allocating memory when
38
+ # we don't need to. Matrices are multiplied ALL THE FREAKING TIME
39
+ # so this is a worthwhile optimisation
40
+ #
41
+ # NOTE: When multiplying matrices, ordering matters. Double check
42
+ # the PDF spec to ensure you're multiplying things correctly.
43
+ #
44
+ # NOTE: see Section 8.3.3, PDF 32000-1:2008, pp 119
45
+ #
46
+ # NOTE: The if statements in this method are ordered to prefer optimisations
47
+ # that allocate fewer objects
48
+ #
49
+ # TODO: it might be worth adding an optimised path for vertical
50
+ # displacement to speed up processing documents that use vertical
51
+ # writing systems
52
+ #
53
+ def multiply!(a,b=nil,c=nil, d=nil,e=nil,f=nil)
54
+ if a == 1 && b == 0 && c == 0 && d == 1 && e == 0 && f == 0
55
+ # the identity matrix, no effect
56
+ self
57
+ elsif @a == 1 && @b == 0 && @c == 0 && @d == 1 && @e == 0 && @f == 0
58
+ # I'm the identity matrix, so just copy values across
59
+ @a = a
60
+ @b = b
61
+ @c = c
62
+ @d = d
63
+ @e = e
64
+ @f = f
65
+ elsif a == 1 && b == 0 && c == 0 && d == 1 && f == 0
66
+ # the other matrix is a horizontal displacement
67
+ horizontal_displacement_multiply!(e)
68
+ elsif @a == 1 && @b == 0 && @c == 0 && @d == 1 && @f == 0
69
+ # I'm a horizontal displacement
70
+ horizontal_displacement_multiply_reversed!(a,b,c,d,e,f)
71
+ elsif @a != 1 && @b == 0 && @c == 0 && @d != 1 && @e == 0 && @f == 0
72
+ # I'm a xy scale
73
+ xy_scaling_multiply_reversed!(a,b,c,d,e,f)
74
+ elsif a != 1 && b == 0 && c == 0 && d != 1 && e == 0 && f == 0
75
+ # the other matrix is an xy scale
76
+ xy_scaling_multiply!(a,b,c,d,e,f)
77
+ else
78
+ faster_multiply!(a,b,c, d,e,f)
79
+ end
80
+ self
81
+ end
82
+
83
+ # Optimised method for when the second matrix in the calculation is
84
+ # a simple horizontal displacement.
85
+ #
86
+ # Like this:
87
+ #
88
+ # [ 1 2 0 ] [ 1 0 0 ]
89
+ # [ 3 4 0 ] x [ 0 1 0 ]
90
+ # [ 5 6 1 ] [ e2 0 1 ]
91
+ #
92
+ def horizontal_displacement_multiply!(e2)
93
+ @e = @e + e2
94
+ end
95
+
96
+ private
97
+
98
+ # Optimised method for when the first matrix in the calculation is
99
+ # a simple horizontal displacement.
100
+ #
101
+ # Like this:
102
+ #
103
+ # [ 1 0 0 ] [ 1 2 0 ]
104
+ # [ 0 1 0 ] x [ 3 4 0 ]
105
+ # [ 5 0 1 ] [ 5 6 1 ]
106
+ #
107
+ def horizontal_displacement_multiply_reversed!(a2,b2,c2,d2,e2,f2)
108
+ newa = a2
109
+ newb = b2
110
+ newc = c2
111
+ newd = d2
112
+ newe = (@e * a2) + e2
113
+ newf = (@e * b2) + f2
114
+ @a, @b, @c, @d, @e, @f = newa, newb, newc, newd, newe, newf
115
+ end
116
+
117
+ # Optimised method for when the second matrix in the calculation is
118
+ # an X and Y scale
119
+ #
120
+ # Like this:
121
+ #
122
+ # [ 1 2 0 ] [ 5 0 0 ]
123
+ # [ 3 4 0 ] x [ 0 5 0 ]
124
+ # [ 5 6 1 ] [ 0 0 1 ]
125
+ #
126
+ def xy_scaling_multiply!(a2,b2,c2,d2,e2,f2)
127
+ newa = @a * a2
128
+ newb = @b * d2
129
+ newc = @c * a2
130
+ newd = @d * d2
131
+ newe = @e * a2
132
+ newf = @f * d2
133
+ @a, @b, @c, @d, @e, @f = newa, newb, newc, newd, newe, newf
134
+ end
135
+
136
+ # Optimised method for when the first matrix in the calculation is
137
+ # an X and Y scale
138
+ #
139
+ # Like this:
140
+ #
141
+ # [ 5 0 0 ] [ 1 2 0 ]
142
+ # [ 0 5 0 ] x [ 3 4 0 ]
143
+ # [ 0 0 1 ] [ 5 6 1 ]
144
+ #
145
+ def xy_scaling_multiply_reversed!(a2,b2,c2,d2,e2,f2)
146
+ newa = @a * a2
147
+ newb = @a * b2
148
+ newc = @d * c2
149
+ newd = @d * d2
150
+ newe = e2
151
+ newf = f2
152
+ @a, @b, @c, @d, @e, @f = newa, newb, newc, newd, newe, newf
153
+ end
154
+
155
+ # A general solution to multiplying two 3x3 matrices. This is correct in all cases,
156
+ # but slower due to excessive object allocations. It's not actually used in any
157
+ # active code paths, but is here for reference. Use faster_multiply! instead.
158
+ #
159
+ # Like this:
160
+ #
161
+ # [ a b 0 ] [ a b 0 ]
162
+ # [ c d 0 ] x [ c d 0 ]
163
+ # [ e f 1 ] [ e f 1 ]
164
+ #
165
+ def regular_multiply!(a2,b2,c2,d2,e2,f2)
166
+ newa = (@a * a2) + (@b * c2) + (0 * e2)
167
+ newb = (@a * b2) + (@b * d2) + (0 * f2)
168
+ newc = (@c * a2) + (@d * c2) + (0 * e2)
169
+ newd = (@c * b2) + (@d * d2) + (0 * f2)
170
+ newe = (@e * a2) + (@f * c2) + (1 * e2)
171
+ newf = (@e * b2) + (@f * d2) + (1 * f2)
172
+ @a, @b, @c, @d, @e, @f = newa, newb, newc, newd, newe, newf
173
+ end
174
+
175
+ # A general solution for multiplying two matrices when we know all values
176
+ # in the final column are fixed. This is the fallback method for when none
177
+ # of the optimised methods are applicable.
178
+ #
179
+ # Like this:
180
+ #
181
+ # [ a b 0 ] [ a b 0 ]
182
+ # [ c d 0 ] x [ c d 0 ]
183
+ # [ e f 1 ] [ e f 1 ]
184
+ #
185
+ def faster_multiply!(a2,b2,c2, d2,e2,f2)
186
+ newa = (@a * a2) + (@b * c2)
187
+ newb = (@a * b2) + (@b * d2)
188
+ newc = (@c * a2) + (@d * c2)
189
+ newd = (@c * b2) + (@d * d2)
190
+ newe = (@e * a2) + (@f * c2) + e2
191
+ newf = (@e * b2) + (@f * d2) + f2
192
+ @a, @b, @c, @d, @e, @f = newa, newb, newc, newd, newe, newf
193
+ end
194
+ end
195
+ end