pdf-reader 1.1.1 → 2.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG +87 -2
  3. data/{README.rdoc → README.md} +43 -31
  4. data/Rakefile +21 -16
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_object +4 -1
  7. data/bin/pdf_text +1 -3
  8. data/examples/callbacks.rb +2 -1
  9. data/examples/extract_images.rb +11 -6
  10. data/examples/fuzzy_paragraphs.rb +24 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
  14. data/lib/pdf/reader/afm/Courier.afm +342 -0
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -0
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
  26. data/lib/pdf/reader/buffer.rb +90 -63
  27. data/lib/pdf/reader/cid_widths.rb +63 -0
  28. data/lib/pdf/reader/cmap.rb +69 -38
  29. data/lib/pdf/reader/encoding.rb +74 -48
  30. data/lib/pdf/reader/error.rb +24 -4
  31. data/lib/pdf/reader/filter/ascii85.rb +28 -0
  32. data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
  33. data/lib/pdf/reader/filter/depredict.rb +141 -0
  34. data/lib/pdf/reader/filter/flate.rb +53 -0
  35. data/lib/pdf/reader/filter/lzw.rb +21 -0
  36. data/lib/pdf/reader/filter/null.rb +18 -0
  37. data/lib/pdf/reader/filter/run_length.rb +45 -0
  38. data/lib/pdf/reader/filter.rb +15 -234
  39. data/lib/pdf/reader/font.rb +107 -43
  40. data/lib/pdf/reader/font_descriptor.rb +80 -0
  41. data/lib/pdf/reader/form_xobject.rb +26 -4
  42. data/lib/pdf/reader/glyph_hash.rb +56 -18
  43. data/lib/pdf/reader/lzw.rb +6 -4
  44. data/lib/pdf/reader/null_security_handler.rb +17 -0
  45. data/lib/pdf/reader/object_cache.rb +40 -16
  46. data/lib/pdf/reader/object_hash.rb +94 -40
  47. data/lib/pdf/reader/object_stream.rb +1 -0
  48. data/lib/pdf/reader/orientation_detector.rb +34 -0
  49. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  50. data/lib/pdf/reader/page.rb +48 -3
  51. data/lib/pdf/reader/page_layout.rb +125 -0
  52. data/lib/pdf/reader/page_state.rb +185 -70
  53. data/lib/pdf/reader/page_text_receiver.rb +70 -20
  54. data/lib/pdf/reader/pages_strategy.rb +4 -293
  55. data/lib/pdf/reader/parser.rb +37 -61
  56. data/lib/pdf/reader/print_receiver.rb +6 -0
  57. data/lib/pdf/reader/reference.rb +4 -1
  58. data/lib/pdf/reader/register_receiver.rb +17 -31
  59. data/lib/pdf/reader/resource_methods.rb +1 -0
  60. data/lib/pdf/reader/standard_security_handler.rb +82 -42
  61. data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
  62. data/lib/pdf/reader/stream.rb +5 -2
  63. data/lib/pdf/reader/synchronized_cache.rb +33 -0
  64. data/lib/pdf/reader/text_run.rb +99 -0
  65. data/lib/pdf/reader/token.rb +4 -1
  66. data/lib/pdf/reader/transformation_matrix.rb +195 -0
  67. data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
  68. data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
  69. data/lib/pdf/reader/width_calculator/composite.rb +28 -0
  70. data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
  71. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
  72. data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
  73. data/lib/pdf/reader/width_calculator.rb +12 -0
  74. data/lib/pdf/reader/xref.rb +41 -9
  75. data/lib/pdf/reader.rb +45 -104
  76. data/lib/pdf-reader.rb +4 -1
  77. metadata +220 -101
  78. data/bin/pdf_list_callbacks +0 -17
  79. data/lib/pdf/hash.rb +0 -15
  80. data/lib/pdf/reader/abstract_strategy.rb +0 -81
  81. data/lib/pdf/reader/metadata_strategy.rb +0 -56
  82. data/lib/pdf/reader/text_receiver.rb +0 -264
@@ -0,0 +1,91 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ require 'digest'
5
+ require 'openssl'
6
+
7
+ class PDF::Reader
8
+
9
+ # class creates interface to encrypt dictionary for use in Decrypt
10
# Security handler for documents encrypted with the Standard filter using
# AES-256 (crypt filter method AESV3, V=5, R=5), as defined in PDF 1.7
# Extension Level 3.
#
# On construction the supplied password is tried first as the owner password
# and then as the user password; whichever validates is used to recover the
# file encryption key, which is then available via #encrypt_key.
class StandardSecurityHandlerV5

  attr_reader :key_length, :encrypt_key

  # opts - a Hash of values taken from the encrypt dictionary:
  #        :O        - owner entry: hash(32B) + validation salt(8B) + key salt(8B)
  #        :U        - user entry:  hash(32B) + validation salt(8B) + key salt(8B)
  #        :OE       - file encryption key, encrypted with the owner password
  #        :UE       - file encryption key, encrypted with the user password
  #        :password - the password to try (defaults to an empty string)
  #
  # Raises PDF::Reader::EncryptedPDFError when the password validates against
  # neither the owner nor the user entry.
  def initialize(opts = {})
    @key_length = 256
    @O = opts[:O]
    @U = opts[:U]
    @OE = opts[:OE]
    @UE = opts[:UE]
    @encrypt_key = build_standard_key(opts[:password] || '')
  end

  # This handler supports AES-256 encryption defined in PDF 1.7 Extension Level 3
  #
  # encrypt - the encrypt dictionary as a Hash, or nil
  #
  # Returns true only for the Standard filter with V=5, R=5, identical stream
  # and string crypt filters, and a crypt filter method of AESV3.
  def self.supports?(encrypt)
    return false if encrypt.nil?

    filter = encrypt.fetch(:Filter, :Standard)
    version = encrypt.fetch(:V, 0)
    revision = encrypt.fetch(:R, 0)
    algorithm = encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
    (filter == :Standard) && (encrypt[:StmF] == encrypt[:StrF]) &&
      ((version == 5) && (revision == 5) && (algorithm == :AESV3))
  end

  # Decrypts an AES-256-CBC encrypted string or stream (buf) using the file
  # encryption key recovered at construction time.
  #
  # buf - a string to decrypt. The first 16 bytes are used as the
  #       initialisation vector and the remainder as the ciphertext.
  # ref - a PDF::Reader::Reference for the object to decrypt. It is accepted
  #       but not used by this handler.
  #
  def decrypt( buf, ref )
    cipher = OpenSSL::Cipher.new("AES-#{@key_length}-CBC")
    cipher.decrypt
    cipher.key = @encrypt_key.dup
    cipher.iv = buf[0..15]
    cipher.update(buf[16..-1]) + cipher.final
  end

  private

  # Algorithm 3.2a - Computing an encryption key
  #
  # Defined in PDF 1.7 Extension Level 3
  #
  # if the string is a valid owner password, this will return the decryption
  # key, otherwise nil
  #
  def auth_owner_pass(password)
    return unless sha256(password, @O[32..39], @U) == @O[0..31].b

    unwrap_file_key(sha256(password, @O[40..-1], @U), @OE)
  end

  # As auth_owner_pass, but validates against the user entry. Returns the
  # decryption key for a valid user password, otherwise nil
  def auth_user_pass(password)
    return unless sha256(password, @U[32..39]) == @U[0..31].b

    unwrap_file_key(sha256(password, @U[40..-1]), @UE)
  end

  # SHA-256 digest of the given strings joined together as raw bytes.
  #
  # Every part is converted to a binary string first. A UTF-8 password that
  # contains non-ASCII characters cannot be concatenated directly with binary
  # salt/hash data: ruby raises Encoding::CompatibilityError.
  def sha256(*parts)
    Digest::SHA256.digest(parts.map(&:b).join)
  end

  # Decrypts the wrapped file encryption key (the OE or UE entry) with the
  # intermediate key, using AES-256-CBC with a zero IV and padding disabled.
  def unwrap_file_key(intermediate_key, wrapped_key)
    cipher = OpenSSL::Cipher.new('AES-256-CBC')
    cipher.decrypt
    cipher.key = intermediate_key
    cipher.iv = "\x00" * 16
    cipher.padding = 0
    cipher.update(wrapped_key) + cipher.final
  end

  # Tries the password as an owner password and then as a user password.
  #
  # Returns the file encryption key, or raises
  # PDF::Reader::EncryptedPDFError if neither check succeeds.
  def build_standard_key(pass)
    pass = pass.byteslice(0...127) # UTF-8 encoded password. first 127 bytes

    encrypt_key = auth_owner_pass(pass)
    encrypt_key ||= auth_user_pass(pass)

    # NOTE(review): the rejected password is embedded in the error message;
    # confirm that is acceptable wherever these errors end up being logged.
    raise PDF::Reader::EncryptedPDFError, "Invalid password (#{pass})" if encrypt_key.nil?
    encrypt_key
  end
end
91
+ end
@@ -1,3 +1,6 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
1
4
  ################################################################################
2
5
  #
3
6
  # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
@@ -35,7 +38,7 @@ class PDF::Reader
35
38
  ################################################################################
36
39
  # Creates a new stream with the specified dictionary and data. The dictionary
37
40
  # should be a standard ruby hash, the data should be a standard ruby string.
38
- def initialize (hash, data)
41
+ def initialize(hash, data)
39
42
  @hash = hash
40
43
  @data = data
41
44
  @udata = nil
@@ -58,7 +61,7 @@ class PDF::Reader
58
61
  end
59
62
 
60
63
  Array(hash[:Filter]).each_with_index do |filter, index|
61
- @udata = Filter.new(filter, options[index]).filter(@udata)
64
+ @udata = Filter.with(filter, options[index]).filter(@udata)
62
65
  end
63
66
  end
64
67
  @udata
@@ -0,0 +1,33 @@
1
+ # encoding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ # utilities.rb : General-purpose utility classes which don't fit anywhere else
5
+ #
6
+ # Copyright August 2012, Alex Dowad. All Rights Reserved.
7
+ #
8
+ # This is free software. Please see the LICENSE and COPYING files for details.
9
+ #
10
+ # This was originally written for the prawn gem.
11
+
12
+ require 'thread'
13
+
14
+ class PDF::Reader
15
+
16
+ # Throughout the pdf-reader codebase, repeated calculations which can benefit
17
+ # from caching are made. In some cases, caching and reusing results can not
18
+ # only save CPU cycles but also greatly reduce memory requirements. But at the
19
+ # same time, we don't want to throw away thread safety. We have two
20
+ # interchangeable thread-safe cache implementations:
21
# A small hash-backed cache. Every read and every write is performed while
# holding one Mutex owned by the cache instance.
class SynchronizedCache
  def initialize
    @mutex = Mutex.new
    @cache = {}
  end

  # Fetch the value stored under key while holding the lock. Returns nil
  # when nothing has been stored for that key.
  def [](key)
    @mutex.synchronize do
      @cache[key]
    end
  end

  # Store value under key while holding the lock.
  def []=(key, value)
    @mutex.synchronize do
      @cache[key] = value
    end
  end
end
33
+ end
@@ -0,0 +1,99 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+ # A value object that represents one or more consecutive characters on a page.
6
# A value object that represents one or more consecutive characters on a page.
#
# A run records the x/y position it starts at, its width, its font size
# (floored to a whole number on construction) and the text itself. Because
# Comparable is included, ordering and equality are both derived from <=>,
# which only looks at position.
class TextRun
  include Comparable

  attr_reader :x, :y, :width, :font_size, :text

  alias :to_s :text

  def initialize(x, y, width, font_size, text)
    @x = x
    @y = y
    @width = width
    @font_size = font_size.floor
    @text = text
  end

  # Allows collections of TextRun objects to be sorted. They will be sorted
  # in order of their position on a cartesian plane - Top Left to Bottom Right
  #
  # Runs with a larger y sort first; runs with the same y are ordered by
  # ascending x.
  def <=>(other)
    if x == other.x && y == other.y
      0
    elsif y < other.y
      1
    elsif y > other.y
      -1
    elsif x < other.x
      -1
    elsif x > other.x
      1
    end
  end

  # The x co-ordinate where this run finishes
  def endx
    @endx ||= x + width
  end

  # The y co-ordinate of this run plus its font size
  def endy
    @endy ||= y + font_size
  end

  # The width of the run divided by the number of characters in it.
  #
  # Returns 0.0 for a run with no characters, rather than the NaN or Infinity
  # that dividing by a zero character count would produce.
  def mean_character_width
    count = character_count
    return 0.0 if count.zero?

    @width / count
  end

  # true if other is on the same (integer) line, has the same font size and
  # starts close enough to the end of this run to be joined onto it
  def mergable?(other)
    y.to_i == other.y.to_i && font_size == other.font_size && mergable_range.include?(other.x)
  end

  # Returns a new TextRun that spans this run and other. A single space is
  # inserted between the two texts unless the gap between them is smaller
  # than 20% of the font size.
  #
  # Raises ArgumentError if other is not mergable with this run.
  def +(other)
    raise ArgumentError, "#{other} cannot be merged with this run" unless mergable?(other)

    if (other.x - endx) < (font_size * 0.2)
      TextRun.new(x, y, other.endx - x, font_size, text + other.text)
    else
      TextRun.new(x, y, other.endx - x, font_size, "#{text} #{other.text}")
    end
  end

  def inspect
    "#{text} w:#{width} f:#{font_size} @#{x},#{y}"
  end

  # true if the bounding box of this run touches or overlaps the bounding box
  # of other_run
  def intersect?(other_run)
    x <= other_run.endx && endx >= other_run.x &&
      endy >= other_run.y && y <= other_run.endy
  end

  # return what percentage of this text run is overlapped by another run
  #
  # The result is a fraction of this run's area (1.0 means fully covered).
  # Returns 0 when the runs don't intersect, and also when this run has no
  # area (zero width, or a font size that floored to zero) - dividing by a
  # zero area would otherwise produce NaN.
  def intersection_area_percent(other_run)
    return 0 unless intersect?(other_run)

    own_area = area
    return 0 if own_area.zero?

    dx = [endx, other_run.endx].min - [x, other_run.x].max
    dy = [endy, other_run.endy].min - [y, other_run.y].max
    intersection_area = dx*dy

    intersection_area.to_f / own_area
  end

  private

  # The area of this run's bounding box
  def area
    (endx - x) * (endy - y)
  end

  # The range of x values another run may start at and still be merged onto
  # the end of this one
  def mergable_range
    @mergable_range ||= Range.new(endx - 3, endx + font_size)
  end

  # Assume string encoding is marked correctly and we can trust String#size to return a
  # character count
  def character_count
    @text.size.to_f
  end
end
99
+ end
@@ -1,3 +1,6 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
1
4
  ################################################################################
2
5
  #
3
6
  # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
@@ -31,7 +34,7 @@ class PDF::Reader
31
34
  class Token < String # :nodoc:
32
35
  ################################################################################
33
36
  # Creates a new token with the specified value
34
- def initialize (val)
37
+ def initialize(val)
35
38
  super
36
39
  end
37
40
  ################################################################################
@@ -0,0 +1,195 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+ # co-ordinate systems in PDF files are specified using a 3x3 matrix that looks
6
+ # something like this:
7
+ #
8
+ # [ a b 0 ]
9
+ # [ c d 0 ]
10
+ # [ e f 1 ]
11
+ #
12
+ # Because the final column never changes, we can represent each matrix using
13
+ # only 6 numbers. This is important to save CPU time, memory and GC pressure
14
+ # caused by allocating too many unnecessary objects.
15
# A 3x3 PDF transformation matrix whose final column is always (0, 0, 1), so
# only the six values a, b, c, d, e and f are stored.
class TransformationMatrix
  attr_reader :a, :b, :c, :d, :e, :f

  def initialize(a, b, c, d, e, f)
    @a = a
    @b = b
    @c = c
    @d = d
    @e = e
    @f = f
  end

  # The three rows of the matrix, one per line
  def inspect
    [[a, b, 0], [c, d, 0], [e, f, 1]].map { |row| row.join(", ") }.join(",\n")
  end

  # All nine values of the matrix in row order, including the fixed column
  def to_a
    [
      @a, @b, 0,
      @c, @d, 0,
      @e, @f, 1
    ]
  end

  # multiply this matrix with another.
  #
  # the second matrix is represented by the 6 scalar values that are changeable
  # in a PDF transformation matrix.
  #
  # WARNING: This mutates the current matrix to avoid allocating memory when
  #          we don't need to. Matrices are multiplied very frequently, so
  #          this is a worthwhile optimisation
  #
  # NOTE: When multiplying matrices, ordering matters. Double check
  #       the PDF spec to ensure you're multiplying things correctly.
  #
  # NOTE: see Section 8.3.3, PDF 32000-1:2008, pp 119
  #
  # NOTE: The branches below are tried in a fixed order; the cheaper special
  #       cases come before the general fallback
  #
  # TODO: it might be worth adding an optimised path for vertical
  #       displacement to speed up processing documents that use vertical
  #       writing systems
  #
  # Returns self.
  def multiply!(a,b=nil,c=nil, d=nil,e=nil,f=nil)
    # "unit scale" means the upper 2x2 part is the identity: no scaling,
    # rotation or skew, only (possibly) a translation
    other_unit_scale = a == 1 && b == 0 && c == 0 && d == 1
    self_unit_scale = @a == 1 && @b == 0 && @c == 0 && @d == 1

    if other_unit_scale && e == 0 && f == 0
      # the other matrix is the identity matrix, nothing to do
    elsif self_unit_scale && @e == 0 && @f == 0
      # this is the identity matrix, so take the other matrix's values as-is
      @a, @b, @c, @d, @e, @f = a, b, c, d, e, f
    elsif other_unit_scale && f == 0
      # the other matrix is a horizontal displacement
      horizontal_displacement_multiply!(e)
    elsif self_unit_scale && @f == 0
      # this matrix is a horizontal displacement
      horizontal_displacement_multiply_reversed!(a, b, c, d, e, f)
    elsif @a != 1 && @b == 0 && @c == 0 && @d != 1 && @e == 0 && @f == 0
      # this matrix is an xy scale
      xy_scaling_multiply_reversed!(a, b, c, d, e, f)
    elsif a != 1 && b == 0 && c == 0 && d != 1 && e == 0 && f == 0
      # the other matrix is an xy scale
      xy_scaling_multiply!(a, b, c, d, e, f)
    else
      faster_multiply!(a, b, c, d, e, f)
    end

    self
  end

  # Optimised method for when the second matrix in the calculation is
  # a simple horizontal displacement.
  #
  # Like this:
  #
  #   [ 1 2 0 ]   [ 1  0 0 ]
  #   [ 3 4 0 ] x [ 0  1 0 ]
  #   [ 5 6 1 ]   [ e2 0 1 ]
  #
  def horizontal_displacement_multiply!(e2)
    @e += e2
  end

  private

  # Optimised method for when the first matrix in the calculation is
  # a simple horizontal displacement.
  #
  # Like this:
  #
  #   [ 1 0 0 ]   [ 1 2 0 ]
  #   [ 0 1 0 ] x [ 3 4 0 ]
  #   [ 5 0 1 ]   [ 5 6 1 ]
  #
  def horizontal_displacement_multiply_reversed!(a2,b2,c2,d2,e2,f2)
    @a, @b, @c, @d, @e, @f =
      a2,
      b2,
      c2,
      d2,
      (@e * a2) + e2,
      (@e * b2) + f2
  end

  # Optimised method for when the second matrix in the calculation is
  # an X and Y scale
  #
  # Like this:
  #
  #   [ 1 2 0 ]   [ 5 0 0 ]
  #   [ 3 4 0 ] x [ 0 5 0 ]
  #   [ 5 6 1 ]   [ 0 0 1 ]
  #
  def xy_scaling_multiply!(a2,b2,c2,d2,e2,f2)
    @a, @b, @c, @d, @e, @f =
      @a * a2,
      @b * d2,
      @c * a2,
      @d * d2,
      @e * a2,
      @f * d2
  end

  # Optimised method for when the first matrix in the calculation is
  # an X and Y scale
  #
  # Like this:
  #
  #   [ 5 0 0 ]   [ 1 2 0 ]
  #   [ 0 5 0 ] x [ 3 4 0 ]
  #   [ 0 0 1 ]   [ 5 6 1 ]
  #
  def xy_scaling_multiply_reversed!(a2,b2,c2,d2,e2,f2)
    @a, @b, @c, @d, @e, @f =
      @a * a2,
      @a * b2,
      @d * c2,
      @d * d2,
      e2,
      f2
  end

  # The textbook product of two 3x3 matrices, including the terms that come
  # from the fixed final column. multiply! never dispatches to this method;
  # it is kept for reference. Use faster_multiply! instead.
  #
  # Like this:
  #
  #   [ a b 0 ]   [ a b 0 ]
  #   [ c d 0 ] x [ c d 0 ]
  #   [ e f 1 ]   [ e f 1 ]
  #
  def regular_multiply!(a2,b2,c2,d2,e2,f2)
    @a, @b, @c, @d, @e, @f =
      (@a * a2) + (@b * c2) + (0 * e2),
      (@a * b2) + (@b * d2) + (0 * f2),
      (@c * a2) + (@d * c2) + (0 * e2),
      (@c * b2) + (@d * d2) + (0 * f2),
      (@e * a2) + (@f * c2) + (1 * e2),
      (@e * b2) + (@f * d2) + (1 * f2)
  end

  # A general solution for multiplying two matrices when we know all values
  # in the final column are fixed. This is the fallback method for when none
  # of the optimised methods are applicable.
  #
  # Like this:
  #
  #   [ a b 0 ]   [ a b 0 ]
  #   [ c d 0 ] x [ c d 0 ]
  #   [ e f 1 ]   [ e f 1 ]
  #
  def faster_multiply!(a2,b2,c2, d2,e2,f2)
    @a, @b, @c, @d, @e, @f =
      (@a * a2) + (@b * c2),
      (@a * b2) + (@b * d2),
      (@c * a2) + (@d * c2),
      (@c * b2) + (@d * d2),
      (@e * a2) + (@f * c2) + e2,
      (@e * b2) + (@f * d2) + f2
  end
end
195
+ end