pdf-reader 0.8.6 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,58 +23,28 @@
23
23
  #
24
24
  ################################################################################
25
25
 
26
- require 'enumerator'
27
-
28
26
  class PDF::Reader
29
- class Encoding
27
+ class Encoding # :nodoc:
30
28
  CONTROL_CHARS = [0,1,2,3,4,5,6,7,8,11,12,14,15,16,17,18,19,20,21,22,23,
31
29
  24,25,26,27,28,29,30,31]
32
30
  UNKNOWN_CHAR = 0x25AF # ▯
33
31
 
34
- attr_reader :differences, :unpack
32
+ attr_reader :unpack
35
33
 
36
34
  def initialize(enc)
37
- @to_unicode_required = false
38
-
39
35
  if enc.kind_of?(Hash)
40
- self.differences=enc[:Differences] if enc[:Differences]
36
+ self.differences = enc[:Differences] if enc[:Differences]
41
37
  enc = enc[:Encoding] || enc[:BaseEncoding]
42
38
  elsif enc != nil
43
39
  enc = enc.to_sym
40
+ else
41
+ enc = nil
44
42
  end
45
43
 
46
- case enc
47
- when nil then
48
- load_mapping File.dirname(__FILE__) + "/encodings/standard.txt"
49
- @unpack = "C*"
50
- when "Identity-H".to_sym then
51
- @unpack = "n*"
52
- @to_unicode_required = true
53
- when :MacRomanEncoding then
54
- load_mapping File.dirname(__FILE__) + "/encodings/mac_roman.txt"
55
- @unpack = "C*"
56
- when :MacExpertEncoding then
57
- load_mapping File.dirname(__FILE__) + "/encodings/mac_expert.txt"
58
- @unpack = "C*"
59
- when :PDFDocEncoding then
60
- load_mapping File.dirname(__FILE__) + "/encodings/pdf_doc.txt"
61
- @unpack = "C*"
62
- when :StandardEncoding then
63
- load_mapping File.dirname(__FILE__) + "/encodings/standard.txt"
64
- @unpack = "C*"
65
- when :SymbolEncoding then
66
- load_mapping File.dirname(__FILE__) + "/encodings/symbol.txt"
67
- @unpack = "C*"
68
- when :UTF16Encoding then
69
- @unpack = "n*"
70
- when :WinAnsiEncoding then
71
- load_mapping File.dirname(__FILE__) + "/encodings/win_ansi.txt"
72
- @unpack = "C*"
73
- when :ZapfDingbatsEncoding then
74
- load_mapping File.dirname(__FILE__) + "/encodings/zapf_dingbats.txt"
75
- @unpack = "C*"
76
- else raise UnsupportedFeatureError, "#{enc} is not currently a supported encoding"
77
- end
44
+ @to_unicode_required = unicode_required?(enc)
45
+ @unpack = get_unpack(enc)
46
+ @map_file = get_mapping_file(enc)
47
+ load_mapping(@map_file) if @map_file
78
48
  end
79
49
 
80
50
  def to_unicode_required?
@@ -85,9 +55,9 @@ class PDF::Reader
85
55
  #
86
56
  # [25, :A, 26, :B]
87
57
  #
88
- # The array alternates bewteen a decimal byte number and a glyph name to map to that byte
58
+ # The array alternates between a decimal byte number and a glyph name to map to that byte
89
59
  #
90
- # To save space the following array is also valid and equivilant to the previous one
60
+ # To save space the following array is also valid and equivalent to the previous one
91
61
  #
92
62
  # [25, :A, :B]
93
63
  def differences=(diff)
@@ -106,45 +76,90 @@ class PDF::Reader
106
76
  @differences
107
77
  end
108
78
 
79
# The differences table consulted by to_utf8 (byte value => glyph name).
# An empty Hash is created and remembered the first time this is read
# without a table having been set.
def differences
  @differences = {} unless @differences
  @differences
end
82
+
109
83
  # convert the specified string to utf8
84
+ #
85
+ # * unpack raw bytes into codepoints
86
+ # * replace any that have entries in the differences table with a glyph name
87
+ # * convert codepoints from source encoding to Unicode codepoints
88
+ # * convert any glyph names to Unicode codepoints
89
+ # * replace characters that didn't convert to Unicode nicely with something
90
+ # valid
91
+ # * pack the final array of Unicode codepoints into a utf-8 string
92
+ # * mark the string as utf-8 if we're running on a M17N aware VM
93
+ #
110
94
  def to_utf8(str, tounicode = nil)
111
- # unpack the single bytes
112
- array_orig = str.unpack(unpack)
113
-
114
- # replace any relevant bytes with a glyph name
115
- array_orig = process_differences(array_orig)
116
-
117
- # replace any remaining bytes with a unicode codepoint
118
- array_enc = array_orig.map do |num|
119
- if tounicode && (code = tounicode.decode(num))
120
- code
121
- elsif tounicode || ( tounicode.nil? && to_unicode_required? )
122
- PDF::Reader::Encoding::UNKNOWN_CHAR
123
- elsif mapping[num]
124
- mapping[num]
125
- elsif PDF::Reader::Encoding::CONTROL_CHARS.include?(num)
95
+ ret = str.unpack(unpack).map { |c|
96
+ differences[c] || c
97
+ }.map { |num|
98
+ original_codepoint_to_unicode(num, tounicode)
99
+ }.map { |c|
100
+ glyphnames[c] || c
101
+ }.map { |c|
102
+ if c.nil? || !c.is_a?(Fixnum)
126
103
  PDF::Reader::Encoding::UNKNOWN_CHAR
127
104
  else
128
- num
105
+ c
129
106
  end
130
- end
107
+ }.pack("U*")
108
+
109
+ ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
131
110
 
132
- # convert any glyph names to unicode codepoints
133
- array_enc = process_glyphnames(array_enc)
111
+ ret
112
+ end
134
113
 
135
- # replace charcters that didn't convert to unicode nicely with something valid
136
- array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }
114
+ private
137
115
 
138
- # pack all our Unicode codepoints into a UTF-8 string
139
- ret = array_enc.pack("U*")
116
# Translate one codepoint from the source encoding into a Unicode codepoint.
#
# cp        - the codepoint (or glyph name) to translate
# tounicode - optional object responding to decode(cp)
#
# Resolution order:
# * with a tounicode object: its decode result, or UNKNOWN_CHAR when it
#   has no answer
# * tounicode is nil but this encoding requires one: UNKNOWN_CHAR
# * an entry in the mapping table
# * UNKNOWN_CHAR for values listed in CONTROL_CHARS
# * otherwise cp itself, unchanged
def original_codepoint_to_unicode(cp, tounicode = nil)
  unknown = PDF::Reader::Encoding::UNKNOWN_CHAR

  return tounicode.decode(cp) || unknown if tounicode
  return unknown if tounicode.nil? && to_unicode_required?

  mapped = mapping[cp]
  return mapped if mapped

  PDF::Reader::Encoding::CONTROL_CHARS.include?(cp) ? unknown : cp
end
140
129
 
141
- # set the strings encoding correctly under ruby 1.9+
142
- ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
130
# Choose the String#unpack template used to split raw text into codepoints:
# "n*" (16-bit big-endian units) for :"Identity-H" and :UTF16Encoding,
# "C*" (single bytes) for anything else, including nil.
def get_unpack(enc)
  two_byte_encodings = [:"Identity-H", :UTF16Encoding]
  two_byte_encodings.include?(enc) ? "n*" : "C*"
end
143
138
 
144
- return ret
139
# Resolve the path of the mapping file for an encoding name.
#
# enc - nil or an encoding name Symbol
#
# Returns a path under the "encodings" directory next to this file, or nil
# for :"Identity-H" and :UTF16Encoding, which have no mapping file. A nil
# name uses the standard encoding file.
# Raises UnsupportedFeatureError for any other name.
def get_mapping_file(enc)
  basename = case enc
             when nil, :StandardEncoding then "standard.txt"
             when :MacRomanEncoding      then "mac_roman.txt"
             when :MacExpertEncoding     then "mac_expert.txt"
             when :PDFDocEncoding        then "pdf_doc.txt"
             when :SymbolEncoding        then "symbol.txt"
             when :WinAnsiEncoding       then "win_ansi.txt"
             when :ZapfDingbatsEncoding  then "zapf_dingbats.txt"
             when :"Identity-H", :UTF16Encoding
               return nil
             else
               raise UnsupportedFeatureError, "#{enc} is not currently a supported encoding"
             end

  File.dirname(__FILE__) + "/encodings/" + basename
end
146
159
 
147
- private
160
# True only when +enc+ is the :"Identity-H" encoding name. initialize
# stores the answer as the flag that to_utf8 consults when no tounicode
# object is supplied.
def unicode_required?(enc)
  enc == "Identity-H".to_sym
end
148
163
 
149
164
  def mapping
150
165
  @mapping ||= {}
@@ -154,17 +169,8 @@ class PDF::Reader
154
169
  mapping.size > 0
155
170
  end
156
171
 
157
- # accepts an array of byte numbers, and replaces any that have entries in the differences table
158
- # with a glyph name
159
- def process_differences(arr)
160
- @differences ||= {}
161
- arr.collect! { |n| @differences[n].nil? ? n : @differences[n]}
162
- end
163
-
164
- # accepts an array of unicode code points and glyphnames, and converts any glyph names to codepoints
165
- def process_glyphnames(arr)
166
- @differences ||= {}
167
- arr.collect! { |n| n.kind_of?(Numeric) ? n : PDF::Reader::Font.glyphnames[n]}
172
# The glyph name table from PDF::Reader::Font.glyphnames, fetched on first
# use and then kept on this instance so later lookups do not ask again.
def glyphnames
  @glyphnames = PDF::Reader::Font.glyphnames unless @glyphnames
  @glyphnames
end
169
175
 
170
176
  def load_mapping(file)
@@ -22,12 +22,11 @@
22
22
  # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
23
  #
24
24
 
25
-
26
25
  class PDF::Reader
27
26
  ################################################################################
28
27
  # An internal PDF::Reader class that helps to verify various parts of the PDF file
29
28
  # are valid
30
- class Error
29
+ class Error # :nodoc:
31
30
  ################################################################################
32
31
  def self.str_assert (lvalue, rvalue, chars=nil)
33
32
  raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
@@ -30,9 +30,7 @@ class PDF::Reader
30
30
  # support for features like compression and encryption. This class is for decoding that
31
31
  # content.
32
32
  #
33
- # Currently only 1 filter type is supported. Hopefully support for others will be added
34
- # in the future.
35
- class Filter
33
+ class Filter # :nodoc:
36
34
  ################################################################################
37
35
  # creates a new filter for decoding content.
38
36
  #
@@ -49,6 +47,7 @@ class PDF::Reader
49
47
  when :DCTDecode then @filter = nil
50
48
  when :FlateDecode then @filter = :flate
51
49
  when :JBIG2Decode then @filter = nil
50
+ when :LZWDecode then @filter = :lzw
52
51
  else raise UnsupportedFeatureError, "Unknown filter: #{name}"
53
52
  end
54
53
  end
@@ -92,8 +91,9 @@ class PDF::Reader
92
91
  ################################################################################
93
92
  # Decode the specified data with the Zlib compression algorithm
94
93
  def flate (data)
94
+ deflated = nil
95
95
  begin
96
- Zlib::Inflate.new.inflate(data)
96
+ deflated = Zlib::Inflate.new.inflate(data)
97
97
  rescue Zlib::DataError => e
98
98
  # by default, Ruby's Zlib assumes the data it's inflating
99
99
  # is RFC1951 deflated data, wrapped in a RFC1951 zlib container.
@@ -103,14 +103,117 @@ class PDF::Reader
103
103
  # See
104
104
  # - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
105
105
  # - http://www.gzip.org/zlib/zlib_faq.html#faq38
106
- Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
106
+ deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
107
107
  end
108
+ depredict(deflated, @options)
108
109
  rescue Exception => e
109
110
  # Oops, there was a problem inflating the stream
110
111
  raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
111
112
  end
112
113
  ################################################################################
114
+ # Decode the specified data with the LZW compression algorithm
115
# Runs +data+ through PDF::Reader::LZW.decode and then hands the result,
# together with this filter's @options, to depredict.
def lzw(data)
  depredict(PDF::Reader::LZW.decode(data), @options)
end
119
+ ################################################################################
120
# Undo the predictor named in the filter options, if any.
#
# data - the already decompressed stream data
# opts - filter options Hash (may be nil); only :Predictor is read here
#
# Predictor 0 or 1 (or none given) returns data untouched, 2 delegates to
# tiff_depredict and 10 through 15 delegate to png_depredict.
# Raises MalformedPDFError for any other predictor value.
def depredict(data, opts = {})
  predictor = (opts || {})[:Predictor].to_i

  return data if predictor == 0 || predictor == 1
  return tiff_depredict(data, opts) if predictor == 2
  return png_depredict(data, opts) if (10..15).include?(predictor)

  raise MalformedPDFError, "Unrecognised predictor value (#{predictor})"
end
134
+ ################################################################################
135
# Placeholder for the TIFF predictor, which depredict selects for
# Predictor value 2. Neither argument is used: the method always raises
# UnsupportedFeatureError.
def tiff_depredict(data, opts = {})
  raise(UnsupportedFeatureError, "TIFF predictor not supported")
end
138
+ ################################################################################
139
# Reverse PNG row prediction on decompressed stream data.
#
# data - String of scanlines, each one a filter-type byte followed by the
#        filtered bytes of that row
# opts - filter options Hash (may be nil). :Predictor must be 10 or more
#        for any work to happen; :Columns gives the pixels per row and
#        defaults to 1 when absent.
#
# Returns the unfiltered bytes as a String with the filter-type bytes
# removed. When opts is nil or the predictor is below 10, data is returned
# unchanged.
# Raises ArgumentError when a row starts with a filter type outside 0..4.
def png_depredict(data, opts = {})
  return data if opts.nil? || opts[:Predictor].to_i < 10

  # NOTE: every pixel is treated as exactly one byte; :Colors and
  # :BitsPerComponent are not consulted.
  pixel_bytes     = 1
  columns         = opts[:Columns] || 1
  scanline_length = (pixel_bytes * columns) + 1 # +1 for the filter-type byte

  rows  = []
  prior = nil # previous decoded row; nil while decoding the first row

  data.unpack("C*").each_slice(scanline_length) do |row_data|
    filter = row_data.shift
    unless (0..4).include?(filter)
      raise ArgumentError, "Invalid filter algorithm #{filter}"
    end

    unless filter == 0 # 0 = None: the row is stored as-is
      # row_data is updated in place so that "left" always reads an already
      # decoded byte of the current row.
      row_data.each_with_index do |byte, index|
        has_left   = index >= pixel_bytes
        left       = has_left ? row_data[index - pixel_bytes] : 0
        upper      = prior ? prior[index] : 0
        upper_left = (prior && has_left) ? prior[index - pixel_bytes] : 0

        predicted = case filter
                    when 1 then left                # Sub
                    when 2 then upper               # Up
                    when 3 then (left + upper) / 2  # Average (integer division floors)
                    else                            # 4 = Paeth
                      estimate = left + upper - upper_left
                      pa = (estimate - left).abs
                      pb = (estimate - upper).abs
                      pc = (estimate - upper_left).abs
                      if pa <= pb && pa <= pc
                        left
                      elsif pb <= pc
                        upper
                      else
                        upper_left
                      end
                    end

        row_data[index] = (byte + predicted) % 256
      end
    end

    rows << row_data
    prior = row_data
  end

  rows.map { |decoded| decoded.pack("C*") }.join("")
end
113
217
  end
114
- ################################################################################
115
218
  end
116
219
  ################################################################################
@@ -32,19 +32,17 @@ class PDF::Reader
32
32
  # a text file supplied by Adobe at:
33
33
  # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
34
34
  def self.glyphnames
35
- @@glyphs ||= {}
35
+ glyphs = {}
36
36
 
37
- if @@glyphs.empty?
38
- RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
39
- File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
40
- f.each do |l|
41
- m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
42
- @@glyphs[name.to_sym] = "0x#{code}".hex if name
43
- end
37
+ RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
38
+ File.open(File.dirname(__FILE__) + "/glyphlist.txt",mode) do |f|
39
+ f.each do |l|
40
+ m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
41
+ glyphs[name.to_sym] = "0x#{code}".hex if name
44
42
  end
45
43
  end
46
44
 
47
- @@glyphs
45
+ glyphs
48
46
  end
49
47
 
50
48
  def basefont=(font)
@@ -52,9 +50,11 @@ class PDF::Reader
52
50
  # with encoding= if required
53
51
  case font
54
52
  when "Symbol" then
55
- self.encoding = PDF::Reader::Encoding.new("SymbolEncoding")
53
+ @encoding = PDF::Reader::Encoding.new("SymbolEncoding")
56
54
  when "ZapfDingbats" then
57
- self.encoding = PDF::Reader::Encoding.new("ZapfDingbatsEncoding")
55
+ @encoding = PDF::Reader::Encoding.new("ZapfDingbatsEncoding")
56
+ else
57
+ @encoding = nil
58
58
  end
59
59
  @basefont = font
60
60
  end
@@ -0,0 +1,123 @@
1
+ # coding: utf-8
2
+
3
+ module PDF
4
+
5
+ class Reader
6
+
7
+ # A general class for decoding LZW compressed data. LZW can be
8
+ # used in PDF files to compresses streams, usually for image data sourced
9
+ # from a TIFF file.
10
+ #
11
+ # See the following links for more information:
12
+ #
13
+ # ref http://www.fileformat.info/format/tiff/corion-lzw.htm
14
+ # ref http://marknelson.us/1989/10/01/lzw-data-compression/
15
+ #
16
+ # The PDF spec also has some data on the algorithm.
17
+ #
18
+ class LZW # :nodoc:
19
+
20
+ class BitStream # :nodoc:
21
+
22
+ def initialize(data, bits_in_chunk)
23
+ @data = data
24
+ @data.force_encoding("BINARY") if @data.respond_to?(:force_encoding)
25
+ @bits_in_chunk = bits_in_chunk
26
+ @current_pos = 0
27
+ @bits_left_in_byte = 8
28
+ end
29
+
30
+ def set_bits_in_chunk(bits_in_chunk)
31
+ @bits_in_chunk = bits_in_chunk
32
+ end
33
+
34
+ def read
35
+ bits_left_in_chunk = @bits_in_chunk
36
+ chunk = nil
37
+ while bits_left_in_chunk > 0 and @current_pos < @data.size
38
+ chunk = 0 if chunk.nil?
39
+ codepoint = @data[@current_pos, 1].unpack("C*")[0]
40
+ current_byte = codepoint & (2**@bits_left_in_byte -1) #clear consumed bits
41
+ dif = bits_left_in_chunk - @bits_left_in_byte
42
+ if dif > 0 then current_byte <<= dif
43
+ elsif dif < 0 then current_byte >>= dif.abs
44
+ end
45
+ chunk |= current_byte #add bits to result
46
+ bits_left_in_chunk = if dif >= 0 then dif else 0 end
47
+ @bits_left_in_byte = if dif < 0 then dif.abs else 0 end
48
+ if @bits_left_in_byte.zero? #next byte
49
+ @current_pos += 1
50
+ @bits_left_in_byte = 8
51
+ end
52
+ end
53
+ chunk
54
+ end
55
+ end
56
+
57
+ CODE_EOD = 257 #end of data
58
+ CODE_CLEAR_TABLE = 256 #clear table
59
+
60
+ # stores de pairs code => string
61
+ class StringTable < Hash # :nodoc:
62
+ attr_reader :string_table_pos
63
+
64
+ def initialize
65
+ super
66
+ @string_table_pos = 258 #initial code
67
+ end
68
+
69
+ #if code less than 258 return fixed string
70
+ def [](key)
71
+ if key > 257 then super else key.chr end
72
+ end
73
+
74
+ def add(string)
75
+ store(@string_table_pos, string)
76
+ @string_table_pos += 1
77
+ end
78
+ end
79
+
80
+ # Decompresses a LZW compressed string.
81
+ #
82
+ def self.decode(data)
83
+ stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
84
+ result = ''
85
+ while not (code = stream.read) == CODE_EOD
86
+ if code == CODE_CLEAR_TABLE
87
+ string_table = StringTable.new
88
+ code = stream.read
89
+ break if code == CODE_EOD
90
+ result << string_table[code]
91
+ old_code = code
92
+ else
93
+ string = string_table[code]
94
+ if string
95
+ result << string
96
+ string_table.add create_new_string(string_table, old_code, code)
97
+ old_code = code
98
+ else
99
+ new_string = create_new_string(string_table, old_code, old_code)
100
+ result << new_string
101
+ string_table.add new_string
102
+ old_code = code
103
+ end
104
+ #increase de size of the codes when limit reached
105
+ case string_table.string_table_pos
106
+ when 511 then stream.set_bits_in_chunk(10)
107
+ when 1023 then stream.set_bits_in_chunk(11)
108
+ when 2047 then stream.set_bits_in_chunk(12)
109
+ end
110
+ end
111
+ end
112
+ result
113
+ end
114
+
115
+ private
116
+
117
+ def self.create_new_string(string_table,some_code, other_code)
118
+ string_table[some_code] + string_table[other_code][0].chr
119
+ end
120
+
121
+ end
122
+ end
123
+ end