pdf-reader 0.8.6 → 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -23,58 +23,28 @@
23
23
  #
24
24
  ################################################################################
25
25
 
26
- require 'enumerator'
27
-
28
26
  class PDF::Reader
29
- class Encoding
27
+ class Encoding # :nodoc:
30
28
  CONTROL_CHARS = [0,1,2,3,4,5,6,7,8,11,12,14,15,16,17,18,19,20,21,22,23,
31
29
  24,25,26,27,28,29,30,31]
32
30
  UNKNOWN_CHAR = 0x25AF # ▯
33
31
 
34
- attr_reader :differences, :unpack
32
+ attr_reader :unpack
35
33
 
36
34
  def initialize(enc)
37
- @to_unicode_required = false
38
-
39
35
  if enc.kind_of?(Hash)
40
- self.differences=enc[:Differences] if enc[:Differences]
36
+ self.differences = enc[:Differences] if enc[:Differences]
41
37
  enc = enc[:Encoding] || enc[:BaseEncoding]
42
38
  elsif enc != nil
43
39
  enc = enc.to_sym
40
+ else
41
+ enc = nil
44
42
  end
45
43
 
46
- case enc
47
- when nil then
48
- load_mapping File.dirname(__FILE__) + "/encodings/standard.txt"
49
- @unpack = "C*"
50
- when "Identity-H".to_sym then
51
- @unpack = "n*"
52
- @to_unicode_required = true
53
- when :MacRomanEncoding then
54
- load_mapping File.dirname(__FILE__) + "/encodings/mac_roman.txt"
55
- @unpack = "C*"
56
- when :MacExpertEncoding then
57
- load_mapping File.dirname(__FILE__) + "/encodings/mac_expert.txt"
58
- @unpack = "C*"
59
- when :PDFDocEncoding then
60
- load_mapping File.dirname(__FILE__) + "/encodings/pdf_doc.txt"
61
- @unpack = "C*"
62
- when :StandardEncoding then
63
- load_mapping File.dirname(__FILE__) + "/encodings/standard.txt"
64
- @unpack = "C*"
65
- when :SymbolEncoding then
66
- load_mapping File.dirname(__FILE__) + "/encodings/symbol.txt"
67
- @unpack = "C*"
68
- when :UTF16Encoding then
69
- @unpack = "n*"
70
- when :WinAnsiEncoding then
71
- load_mapping File.dirname(__FILE__) + "/encodings/win_ansi.txt"
72
- @unpack = "C*"
73
- when :ZapfDingbatsEncoding then
74
- load_mapping File.dirname(__FILE__) + "/encodings/zapf_dingbats.txt"
75
- @unpack = "C*"
76
- else raise UnsupportedFeatureError, "#{enc} is not currently a supported encoding"
77
- end
44
+ @to_unicode_required = unicode_required?(enc)
45
+ @unpack = get_unpack(enc)
46
+ @map_file = get_mapping_file(enc)
47
+ load_mapping(@map_file) if @map_file
78
48
  end
79
49
 
80
50
  def to_unicode_required?
@@ -85,9 +55,9 @@ class PDF::Reader
85
55
  #
86
56
  # [25, :A, 26, :B]
87
57
  #
88
- # The array alternates bewteen a decimal byte number and a glyph name to map to that byte
58
+ # The array alternates between a decimal byte number and a glyph name to map to that byte
89
59
  #
90
- # To save space the following array is also valid and equivilant to the previous one
60
+ # To save space the following array is also valid and equivalent to the previous one
91
61
  #
92
62
  # [25, :A, :B]
93
63
  def differences=(diff)
@@ -106,45 +76,90 @@ class PDF::Reader
106
76
  @differences
107
77
  end
108
78
 
79
+ def differences
80
+ @differences ||= {}
81
+ end
82
+
109
83
  # convert the specified string to utf8
84
+ #
85
+ # * unpack raw bytes into codepoints
86
+ # * replace any that have entries in the differences table with a glyph name
87
+ # * convert codepoints from source encoding to Unicode codepoints
88
+ # * convert any glyph names to Unicode codepoints
89
+ # * replace characters that didn't convert to Unicode nicely with something
90
+ # valid
91
+ # * pack the final array of Unicode codepoints into a utf-8 string
92
+ # * mark the string as utf-8 if we're running on an M17N-aware VM
93
+ #
110
94
  def to_utf8(str, tounicode = nil)
111
- # unpack the single bytes
112
- array_orig = str.unpack(unpack)
113
-
114
- # replace any relevant bytes with a glyph name
115
- array_orig = process_differences(array_orig)
116
-
117
- # replace any remaining bytes with a unicode codepoint
118
- array_enc = array_orig.map do |num|
119
- if tounicode && (code = tounicode.decode(num))
120
- code
121
- elsif tounicode || ( tounicode.nil? && to_unicode_required? )
122
- PDF::Reader::Encoding::UNKNOWN_CHAR
123
- elsif mapping[num]
124
- mapping[num]
125
- elsif PDF::Reader::Encoding::CONTROL_CHARS.include?(num)
95
+ ret = str.unpack(unpack).map { |c|
96
+ differences[c] || c
97
+ }.map { |num|
98
+ original_codepoint_to_unicode(num, tounicode)
99
+ }.map { |c|
100
+ glyphnames[c] || c
101
+ }.map { |c|
102
+ if c.nil? || !c.is_a?(Fixnum)
126
103
  PDF::Reader::Encoding::UNKNOWN_CHAR
127
104
  else
128
- num
105
+ c
129
106
  end
130
- end
107
+ }.pack("U*")
108
+
109
+ ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
131
110
 
132
- # convert any glyph names to unicode codepoints
133
- array_enc = process_glyphnames(array_enc)
111
+ ret
112
+ end
134
113
 
135
- # replace charcters that didn't convert to unicode nicely with something valid
136
- array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }
114
+ private
137
115
 
138
- # pack all our Unicode codepoints into a UTF-8 string
139
- ret = array_enc.pack("U*")
116
+ def original_codepoint_to_unicode(cp, tounicode = nil)
117
+ if tounicode && (code = tounicode.decode(cp))
118
+ code
119
+ elsif tounicode || ( tounicode.nil? && to_unicode_required? )
120
+ PDF::Reader::Encoding::UNKNOWN_CHAR
121
+ elsif mapping[cp]
122
+ mapping[cp]
123
+ elsif PDF::Reader::Encoding::CONTROL_CHARS.include?(cp)
124
+ PDF::Reader::Encoding::UNKNOWN_CHAR
125
+ else
126
+ cp
127
+ end
128
+ end
140
129
 
141
- # set the strings encoding correctly under ruby 1.9+
142
- ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
130
+ def get_unpack(enc)
131
+ case enc
132
+ when :"Identity-H", :UTF16Encoding
133
+ "n*"
134
+ else
135
+ "C*"
136
+ end
137
+ end
143
138
 
144
- return ret
139
+ def get_mapping_file(enc)
140
+ return File.dirname(__FILE__) + "/encodings/standard.txt" if enc.nil?
141
+ files = {
142
+ :"Identity-H" => nil,
143
+ :MacRomanEncoding => File.dirname(__FILE__) + "/encodings/mac_roman.txt",
144
+ :MacExpertEncoding => File.dirname(__FILE__) + "/encodings/mac_expert.txt",
145
+ :PDFDocEncoding => File.dirname(__FILE__) + "/encodings/pdf_doc.txt",
146
+ :StandardEncoding => File.dirname(__FILE__) + "/encodings/standard.txt",
147
+ :SymbolEncoding => File.dirname(__FILE__) + "/encodings/symbol.txt",
148
+ :UTF16Encoding => nil,
149
+ :WinAnsiEncoding => File.dirname(__FILE__) + "/encodings/win_ansi.txt",
150
+ :ZapfDingbatsEncoding => File.dirname(__FILE__) + "/encodings/zapf_dingbats.txt"
151
+ }
152
+
153
+ if files.has_key?(enc)
154
+ files[enc]
155
+ else
156
+ raise UnsupportedFeatureError, "#{enc} is not currently a supported encoding"
157
+ end
145
158
  end
146
159
 
147
- private
160
+ def unicode_required?(enc)
161
+ enc == :"Identity-H"
162
+ end
148
163
 
149
164
  def mapping
150
165
  @mapping ||= {}
@@ -154,17 +169,8 @@ class PDF::Reader
154
169
  mapping.size > 0
155
170
  end
156
171
 
157
- # accepts an array of byte numbers, and replaces any that have entries in the differences table
158
- # with a glyph name
159
- def process_differences(arr)
160
- @differences ||= {}
161
- arr.collect! { |n| @differences[n].nil? ? n : @differences[n]}
162
- end
163
-
164
- # accepts an array of unicode code points and glyphnames, and converts any glyph names to codepoints
165
- def process_glyphnames(arr)
166
- @differences ||= {}
167
- arr.collect! { |n| n.kind_of?(Numeric) ? n : PDF::Reader::Font.glyphnames[n]}
172
+ def glyphnames
173
+ @glyphnames ||= PDF::Reader::Font.glyphnames
168
174
  end
169
175
 
170
176
  def load_mapping(file)
@@ -22,12 +22,11 @@
22
22
  # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
23
  #
24
24
 
25
-
26
25
  class PDF::Reader
27
26
  ################################################################################
28
27
  # An internal PDF::Reader class that helps to verify various parts of the PDF file
29
28
  # are valid
30
- class Error
29
+ class Error # :nodoc:
31
30
  ################################################################################
32
31
  def self.str_assert (lvalue, rvalue, chars=nil)
33
32
  raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
@@ -30,9 +30,7 @@ class PDF::Reader
30
30
  # support for features like compression and encryption. This class is for decoding that
31
31
  # content.
32
32
  #
33
- # Currently only 1 filter type is supported. Hopefully support for others will be added
34
- # in the future.
35
- class Filter
33
+ class Filter # :nodoc:
36
34
  ################################################################################
37
35
  # creates a new filter for decoding content.
38
36
  #
@@ -49,6 +47,7 @@ class PDF::Reader
49
47
  when :DCTDecode then @filter = nil
50
48
  when :FlateDecode then @filter = :flate
51
49
  when :JBIG2Decode then @filter = nil
50
+ when :LZWDecode then @filter = :lzw
52
51
  else raise UnsupportedFeatureError, "Unknown filter: #{name}"
53
52
  end
54
53
  end
@@ -92,8 +91,9 @@ class PDF::Reader
92
91
  ################################################################################
93
92
  # Decode the specified data with the Zlib compression algorithm
94
93
  def flate (data)
94
+ deflated = nil
95
95
  begin
96
- Zlib::Inflate.new.inflate(data)
96
+ deflated = Zlib::Inflate.new.inflate(data)
97
97
  rescue Zlib::DataError => e
98
98
  # by default, Ruby's Zlib assumes the data it's inflating
99
99
  # is RFC1951 deflated data, wrapped in a RFC1951 zlib container.
@@ -103,14 +103,117 @@ class PDF::Reader
103
103
  # See
104
104
  # - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
105
105
  # - http://www.gzip.org/zlib/zlib_faq.html#faq38
106
- Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
106
+ deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
107
107
  end
108
+ depredict(deflated, @options)
108
109
  rescue Exception => e
109
110
  # Oops, there was a problem inflating the stream
110
111
  raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
111
112
  end
112
113
  ################################################################################
114
+ # Decode the specified data with the LZW compression algorithm
115
+ def lzw(data)
116
+ data = PDF::Reader::LZW.decode(data)
117
+ depredict(data, @options)
118
+ end
119
+ ################################################################################
120
+ def depredict(data, opts = {})
121
+ predictor = (opts || {})[:Predictor].to_i
122
+
123
+ case predictor
124
+ when 0, 1 then
125
+ data
126
+ when 2 then
127
+ tiff_depredict(data, opts)
128
+ when 10, 11, 12, 13, 14, 15 then
129
+ png_depredict(data, opts)
130
+ else
131
+ raise MalformedPDFError, "Unrecognised predictor value (#{predictor})"
132
+ end
133
+ end
134
+ ################################################################################
135
+ def tiff_depredict(data, opts = {})
136
+ raise UnsupportedFeatureError, "TIFF predictor not supported"
137
+ end
138
+ ################################################################################
139
+ def png_depredict(data, opts = {})
140
+ return data if opts.nil? || opts[:Predictor].to_i < 10
141
+
142
+ data = data.unpack("C*")
143
+
144
+ pixel_bytes = 1 #pixel_bitlength / 8
145
+ scanline_length = (pixel_bytes * opts[:Columns]) + 1
146
+ row = 0
147
+ pixels = []
148
+ paeth, pa, pb, pc = nil
149
+ until data.empty? do
150
+ row_data = data.slice! 0, scanline_length
151
+ filter = row_data.shift
152
+ case filter
153
+ when 0 # None
154
+ when 1 # Sub
155
+ row_data.each_with_index do |byte, index|
156
+ left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
157
+ row_data[index] = (byte + left) % 256
158
+ #p [byte, left, row_data[index]]
159
+ end
160
+ when 2 # Up
161
+ row_data.each_with_index do |byte, index|
162
+ col = index / pixel_bytes
163
+ upper = row == 0 ? 0 : pixels[row-1][col][index % pixel_bytes]
164
+ row_data[index] = (upper + byte) % 256
165
+ end
166
+ when 3 # Average
167
+ row_data.each_with_index do |byte, index|
168
+ col = index / pixel_bytes
169
+ upper = row == 0 ? 0 : pixels[row-1][col][index % pixel_bytes]
170
+ left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
171
+
172
+ row_data[index] = (byte + ((left + upper)/2).floor) % 256
173
+ end
174
+ when 4 # Paeth
175
+ left = upper = upper_left = nil
176
+ row_data.each_with_index do |byte, index|
177
+ col = index / pixel_bytes
178
+
179
+ left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
180
+ if row.zero?
181
+ upper = upper_left = 0
182
+ else
183
+ upper = pixels[row-1][col][index % pixel_bytes]
184
+ upper_left = col.zero? ? 0 :
185
+ pixels[row-1][col-1][index % pixel_bytes]
186
+ end
187
+
188
+ p = left + upper - upper_left
189
+ pa = (p - left).abs
190
+ pb = (p - upper).abs
191
+ pc = (p - upper_left).abs
192
+
193
+ paeth = if pa <= pb && pa <= pc
194
+ left
195
+ elsif pb <= pc
196
+ upper
197
+ else
198
+ upper_left
199
+ end
200
+
201
+ row_data[index] = (byte + paeth) % 256
202
+ end
203
+ else
204
+ raise ArgumentError, "Invalid filter algorithm #{filter}"
205
+ end
206
+
207
+ s = []
208
+ row_data.each_slice pixel_bytes do |slice|
209
+ s << slice
210
+ end
211
+ pixels << s
212
+ row += 1
213
+ end
214
+
215
+ pixels.map { |row| row.flatten.pack("C*") }.join("")
216
+ end
113
217
  end
114
- ################################################################################
115
218
  end
116
219
  ################################################################################
@@ -32,19 +32,17 @@ class PDF::Reader
32
32
  # a text file supplied by Adobe at:
33
33
  # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
34
34
  def self.glyphnames
35
- @@glyphs ||= {}
35
+ glyphs = {}
36
36
 
37
- if @@glyphs.empty?
38
- RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
39
- File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
40
- f.each do |l|
41
- m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
42
- @@glyphs[name.to_sym] = "0x#{code}".hex if name
43
- end
37
+ RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
38
+ File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
39
+ f.each do |l|
40
+ m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
41
+ glyphs[name.to_sym] = "0x#{code}".hex if name
44
42
  end
45
43
  end
46
44
 
47
- @@glyphs
45
+ glyphs
48
46
  end
49
47
 
50
48
  def basefont=(font)
@@ -52,9 +50,11 @@ class PDF::Reader
52
50
  # with encoding= if required
53
51
  case font
54
52
  when "Symbol" then
55
- self.encoding = PDF::Reader::Encoding.new("SymbolEncoding")
53
+ @encoding = PDF::Reader::Encoding.new("SymbolEncoding")
56
54
  when "ZapfDingbats" then
57
- self.encoding = PDF::Reader::Encoding.new("ZapfDingbatsEncoding")
55
+ @encoding = PDF::Reader::Encoding.new("ZapfDingbatsEncoding")
56
+ else
57
+ @encoding = nil
58
58
  end
59
59
  @basefont = font
60
60
  end
@@ -0,0 +1,123 @@
1
+ # coding: utf-8
2
+
3
+ module PDF
4
+
5
+ class Reader
6
+
7
+ # A general class for decoding LZW compressed data. LZW can be
8
+ # used in PDF files to compress streams, usually for image data sourced
9
+ # from a TIFF file.
10
+ #
11
+ # See the following links for more information:
12
+ #
13
+ # ref http://www.fileformat.info/format/tiff/corion-lzw.htm
14
+ # ref http://marknelson.us/1989/10/01/lzw-data-compression/
15
+ #
16
+ # The PDF spec also has some data on the algorithm.
17
+ #
18
+ class LZW # :nodoc:
19
+
20
+ class BitStream # :nodoc:
21
+
22
+ def initialize(data, bits_in_chunk)
23
+ @data = data
24
+ @data.force_encoding("BINARY") if @data.respond_to?(:force_encoding)
25
+ @bits_in_chunk = bits_in_chunk
26
+ @current_pos = 0
27
+ @bits_left_in_byte = 8
28
+ end
29
+
30
+ def set_bits_in_chunk(bits_in_chunk)
31
+ @bits_in_chunk = bits_in_chunk
32
+ end
33
+
34
+ def read
35
+ bits_left_in_chunk = @bits_in_chunk
36
+ chunk = nil
37
+ while bits_left_in_chunk > 0 and @current_pos < @data.size
38
+ chunk = 0 if chunk.nil?
39
+ codepoint = @data[@current_pos, 1].unpack("C*")[0]
40
+ current_byte = codepoint & (2**@bits_left_in_byte -1) #clear consumed bits
41
+ dif = bits_left_in_chunk - @bits_left_in_byte
42
+ if dif > 0 then current_byte <<= dif
43
+ elsif dif < 0 then current_byte >>= dif.abs
44
+ end
45
+ chunk |= current_byte #add bits to result
46
+ bits_left_in_chunk = if dif >= 0 then dif else 0 end
47
+ @bits_left_in_byte = if dif < 0 then dif.abs else 0 end
48
+ if @bits_left_in_byte.zero? #next byte
49
+ @current_pos += 1
50
+ @bits_left_in_byte = 8
51
+ end
52
+ end
53
+ chunk
54
+ end
55
+ end
56
+
57
+ CODE_EOD = 257 #end of data
58
+ CODE_CLEAR_TABLE = 256 #clear table
59
+
60
+ # stores the code => string pairs
61
+ class StringTable < Hash # :nodoc:
62
+ attr_reader :string_table_pos
63
+
64
+ def initialize
65
+ super
66
+ @string_table_pos = 258 #initial code
67
+ end
68
+
69
+ #if code less than 258 return fixed string
70
+ def [](key)
71
+ if key > 257 then super else key.chr end
72
+ end
73
+
74
+ def add(string)
75
+ store(@string_table_pos, string)
76
+ @string_table_pos += 1
77
+ end
78
+ end
79
+
80
+ # Decompresses a LZW compressed string.
81
+ #
82
+ def self.decode(data)
83
+ stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
84
+ result = ''
85
+ while not (code = stream.read) == CODE_EOD
86
+ if code == CODE_CLEAR_TABLE
87
+ string_table = StringTable.new
88
+ code = stream.read
89
+ break if code == CODE_EOD
90
+ result << string_table[code]
91
+ old_code = code
92
+ else
93
+ string = string_table[code]
94
+ if string
95
+ result << string
96
+ string_table.add create_new_string(string_table, old_code, code)
97
+ old_code = code
98
+ else
99
+ new_string = create_new_string(string_table, old_code, old_code)
100
+ result << new_string
101
+ string_table.add new_string
102
+ old_code = code
103
+ end
104
+ # increase the size of the codes when the limit is reached
105
+ case string_table.string_table_pos
106
+ when 511 then stream.set_bits_in_chunk(10)
107
+ when 1023 then stream.set_bits_in_chunk(11)
108
+ when 2047 then stream.set_bits_in_chunk(12)
109
+ end
110
+ end
111
+ end
112
+ result
113
+ end
114
+
115
+ private
116
+
117
+ def self.create_new_string(string_table,some_code, other_code)
118
+ string_table[some_code] + string_table[other_code][0].chr
119
+ end
120
+
121
+ end
122
+ end
123
+ end