pdf-reader 1.1.1 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+
3
+ require 'ascii85'
4
+
5
+ class PDF::Reader
6
+ module Filter # :nodoc:
7
# Stream filter that decodes ASCII85-encoded data.
class Ascii85
  # options - decode parameters for the stream. They are stored but this
  #           filter never reads them.
  def initialize(options = {})
    @options = options
  end

  ################################################################################
  # Decode the specified data using the Ascii85 algorithm. Relies on the Ascii85
  # rubygem.
  #
  # data - the encoded stream. When it does not already start with the "<~"
  #        opening delimiter, the delimiter is prepended before the data is
  #        handed to the gem.
  #
  # Returns whatever ::Ascii85.decode returns for the delimited data.
  # Raises MalformedPDFError when decoding raises any StandardError.
  #
  def filter(data)
    data = "<~#{data}" unless data.to_s[0, 2] == "<~"
    ::Ascii85::decode(data)
  rescue StandardError => e
    # Only ordinary errors are translated. Interrupt, SystemExit, NoMemoryError
    # and friends are not StandardErrors and must propagate untouched instead
    # of being reported as a broken PDF.
    raise MalformedPDFError, "Error occured while decoding an ASCII85 stream (#{e.class.to_s}: #{e.to_s})"
  end
end
24
+ end
25
+ end
@@ -0,0 +1,26 @@
1
+ # coding: utf-8
2
+ #
3
+ class PDF::Reader
4
+ module Filter # :nodoc:
5
# Stream filter that decodes ASCIIHex-encoded data (pairs of hex digits).
class AsciiHex
  # options - decode parameters for the stream. They are stored but this
  #           filter never reads them.
  def initialize(options = {})
    @options = options
  end

  ################################################################################
  # Decode the specified data using the AsciiHex algorithm.
  #
  # data - the encoded String. One trailing ">" and one leading "<" are
  #        dropped, every character that is not a hex digit is ignored, and
  #        an odd number of digits is padded with a final "0".
  #
  # The argument is never modified, so frozen strings and strings the caller
  # still needs are safe to pass in.
  #
  # Returns the decoded String.
  # Raises MalformedPDFError when decoding raises any StandardError (for
  # example when data is nil).
  #
  def filter(data)
    hex = data.chomp(">")                       # new string; strips one trailing ">"
    hex = hex[1..-1] if hex.start_with?("<")
    hex = hex.gsub(/[^A-Fa-f0-9]/, "")          # gsub returns a fresh, unfrozen copy
    hex << "0" if hex.size.odd?
    hex.scan(/.{2}/).map { |pair| pair.hex.chr }.join("")
  rescue StandardError => e
    # Interrupt, SystemExit, NoMemoryError etc. are deliberately not caught.
    raise MalformedPDFError, "Error occured while decoding an ASCIIHex stream (#{e.class.to_s}: #{e.to_s})"
  end
end
24
+ end
25
+ end
26
+
@@ -0,0 +1,138 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+ module Filter # :nodoc:
5
# Reverses the predictor preprocessing that may be applied to a stream before
# it is compressed. The behaviour is driven by these entries of the options
# hash: :Predictor, :Colors, :Columns and :BitsPerComponent.
class Depredict
  # options - Hash of decode parameters; nil is treated as an empty Hash.
  def initialize(options = {})
    @options = options || {}
  end

  ################################################################################
  # Streams can be preprocessed to improve compression. This reverses the
  # preprocessing
  #
  # data - the String produced by the decompression step.
  #
  # Returns data itself for predictor 0 or 1 (or when :Predictor is absent),
  # otherwise a new String with the prediction undone.
  # Raises MalformedPDFError for an unrecognised predictor value.
  #
  def filter(data)
    predictor = @options[:Predictor].to_i

    case predictor
    when 0, 1   then data
    when 2      then tiff_depredict(data)
    when 10..15 then png_depredict(data)
    else
      raise MalformedPDFError, "Unrecognised predictor value (#{predictor})"
    end
  end

  private

  # Number of colour components per sample; 1 when :Colors is not supplied.
  def colors
    @options[:Colors] || 1
  end

  # Number of samples per row; 1 when :Columns is not supplied.
  def columns
    @options[:Columns] || 1
  end

  ################################################################################
  # Undo TIFF predictor 2: within each row every byte was stored as the
  # difference from the byte one pixel to its left.
  #
  # Raises UnsupportedFeatureError unless BitsPerComponent is 8 and
  # MalformedPDFError when the parameters describe an empty row.
  def tiff_depredict(data)
    bpc = @options[:BitsPerComponent] || 8
    if bpc != 8
      raise UnsupportedFeatureError, "TIFF predictor onlys supports 8 Bits Per Component"
    end

    pixel_bytes = (bpc * colors) / 8
    line_len    = pixel_bytes * columns

    # A zero-length row would never consume any input.
    unless line_len > 0
      raise MalformedPDFError, "Invalid predictor parameters (Colors: #{colors}, Columns: #{columns})"
    end

    bytes = data.unpack("C*")
    bytes.each_index do |i|
      # the first pixel of each row has no left neighbour and is stored as-is
      next if i % line_len < pixel_bytes

      # bytes[i - pixel_bytes] has already been restored at this point
      bytes[i] = (bytes[i] + bytes[i - pixel_bytes]) % 256
    end

    bytes.pack("C*")
  end

  ################################################################################
  # Undo the PNG predictors. Every row starts with one byte naming the filter
  # used for that row: 0 None, 1 Sub, 2 Up, 3 Average, 4 Paeth.
  #
  # NOTE(review): the pixel width is taken from :Colors alone, i.e. 8 bits per
  # component is assumed here - confirm for other bit depths.
  #
  # Raises ArgumentError when a row names an unknown filter.
  def png_depredict(data)
    pixel_bytes     = colors
    scanline_length = (pixel_bytes * columns) + 1
    previous        = nil   # restored bytes of the row above, nil for the first row
    decoded         = []

    data.unpack("C*").each_slice(scanline_length) do |row|
      filter_type = row.shift

      case filter_type
      when 0 # None
      when 1 # Sub
        row.each_index do |i|
          left   = i < pixel_bytes ? 0 : row[i - pixel_bytes]
          row[i] = (row[i] + left) % 256
        end
      when 2 # Up
        row.each_index do |i|
          upper  = previous ? previous[i] : 0
          row[i] = (row[i] + upper) % 256
        end
      when 3 # Average
        row.each_index do |i|
          left   = i < pixel_bytes ? 0 : row[i - pixel_bytes]
          upper  = previous ? previous[i] : 0
          row[i] = (row[i] + (left + upper) / 2) % 256
        end
      when 4 # Paeth
        row.each_index do |i|
          left       = i < pixel_bytes ? 0 : row[i - pixel_bytes]
          upper      = previous ? previous[i] : 0
          upper_left = (previous && i >= pixel_bytes) ? previous[i - pixel_bytes] : 0
          row[i]     = (row[i] + paeth_predictor(left, upper, upper_left)) % 256
        end
      else
        raise ArgumentError, "Invalid filter algorithm #{filter_type}"
      end

      decoded.concat(row)
      previous = row
    end

    decoded.pack("C*")
  end

  # Pick whichever neighbour is closest to left + upper - upper_left; ties are
  # resolved in the order left, upper, upper_left.
  def paeth_predictor(left, upper, upper_left)
    estimate = left + upper - upper_left
    pa = (estimate - left).abs
    pb = (estimate - upper).abs
    pc = (estimate - upper_left).abs

    if pa <= pb && pa <= pc
      left
    elsif pb <= pc
      upper
    else
      upper_left
    end
  end
end
137
+ end
138
+ end
@@ -0,0 +1,38 @@
1
+ # coding: utf-8
2
+
3
+
4
+ require 'zlib'
5
+
6
+ class PDF::Reader
7
+ module Filter # :nodoc:
8
# Stream filter that inflates zlib/deflate compressed data and then undoes
# any predictor via Depredict.
class Flate
  # options - decode parameters for the stream; handed on to Depredict.
  def initialize(options = {})
    @options = options
  end

  ################################################################################
  # Decode the specified data with the Zlib compression algorithm
  #
  # data - the compressed String.
  #
  # Returns the result of running the inflated data through Depredict.
  # Raises MalformedPDFError when inflating or depredicting raises any
  # StandardError.
  def filter(data)
    inflated = begin
      inflate(data)
    rescue Zlib::DataError
      # by default, Ruby's Zlib assumes the data it's inflating
      # is RFC1951 deflated data, wrapped in a RFC1950 zlib container.
      # If that fails, then use an undocumented 'feature' to attempt to inflate
      # the data as a raw RFC1951 stream.
      #
      # See
      # - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
      # - http://www.gzip.org/zlib/zlib_faq.html#faq38
      inflate(data, -Zlib::MAX_WBITS)
    end
    Depredict.new(@options).filter(inflated)
  rescue StandardError => e
    # Oops, there was a problem inflating the stream. Interrupt, SystemExit,
    # NoMemoryError etc. are deliberately left to propagate.
    raise MalformedPDFError, "Error occured while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
  end

  private

  # Inflate data with a stream opened for the given window size and always
  # release the stream afterwards. A negative window_bits selects a raw
  # deflate stream without the zlib header.
  def inflate(data, window_bits = Zlib::MAX_WBITS)
    zstream = Zlib::Inflate.new(window_bits)
    zstream.inflate(data)
  ensure
    zstream.close if zstream
  end
end
36
+ end
37
+ end
38
+
@@ -0,0 +1,18 @@
1
+ # coding: utf-8
2
+ #
3
+ class PDF::Reader
4
+ module Filter # :nodoc:
5
# Stream filter for LZW compressed data: decompress first, then undo any
# predictor via Depredict.
class Lzw
  # options - decode parameters for the stream; handed on to Depredict.
  def initialize(options = {})
    @options = options
  end

  ################################################################################
  # Decode the specified data with the LZW compression algorithm
  #
  # Returns the result of running the decompressed data through Depredict.
  def filter(data)
    decompressed = PDF::Reader::LZW.decode(data)
    depredictor  = Depredict.new(@options)
    depredictor.filter(decompressed)
  end
end
17
+ end
18
+ end
@@ -0,0 +1,15 @@
1
+ # coding: utf-8
2
+ #
3
+ class PDF::Reader
4
+ module Filter # :nodoc:
5
# Pass-through stream filter: hands the data back unchanged.
class Null
  # options - decode parameters for the stream. They are stored but this
  #           filter never reads them.
  def initialize(options = {})
    @options = options
  end

  # Returns the very object it was given, without copying or altering it.
  def filter(data)
    data
  end
end
14
+ end
15
+ end
@@ -0,0 +1,46 @@
1
+ # coding: utf-8
2
+ #
3
+ class PDF::Reader
4
+ module Filter # :nodoc:
5
# Stream filter for RunLengthDecode data; the expanded bytes are then passed
# through Depredict.
class RunLength
  # options - decode parameters for the stream; handed on to Depredict.
  def initialize(options = {})
    @options = options
  end

  ################################################################################
  # Decode the specified data with the RunLengthDecode compression algorithm
  #
  # data - the encoded String. It is always interpreted byte by byte, whatever
  #        encoding the String is tagged with, so multibyte characters can no
  #        longer throw the length bytes and the payload out of step.
  #
  # Returns the result of running the expanded bytes through Depredict.
  def filter(data)
    bytes = data.unpack("C*")
    out   = []
    pos   = 0

    while pos < bytes.size
      length = bytes[pos]
      pos += 1

      # 128 marks the end of the data; anything after it is ignored
      break if length == 128

      if length < 128
        # When the length is < 128, we copy the following length+1 bytes
        # literally. A truncated stream simply yields the bytes that exist.
        out.concat(bytes[pos, length + 1])
        pos += length + 1
      else
        # When the length is > 128, we copy the next byte (257 - length)
        # times; i.e., "\xFA\x00" ([250, 0]) will expand to
        # "\x00\x00\x00\x00\x00\x00\x00". Nothing is emitted when the byte to
        # repeat is missing.
        out.concat([bytes[pos]] * (257 - length)) if pos < bytes.size
        pos += 1
      end
    end

    Depredict.new(@options).filter(out.pack("C*"))
  end
end
45
+ end
46
+ end
@@ -109,7 +109,7 @@ class PDF::Reader
109
109
  if params.class == String
110
110
  params.unpack(encoding.unpack).map { |c|
111
111
  @tounicode.decode(c) || PDF::Reader::Encoding::UNKNOWN_CHAR
112
- }.pack("U*")
112
+ }.flatten.pack("U*")
113
113
  elsif params.class == Array
114
114
  params.collect { |param| to_utf8_via_cmap(param) }
115
115
  else
@@ -1,5 +1,7 @@
1
1
  # coding: utf-8
2
2
 
3
+ require 'digest/md5'
4
+
3
5
  module PDF
4
6
  class Reader
5
7
 
@@ -15,9 +17,10 @@ module PDF
15
17
 
16
18
  attr_reader :xobject
17
19
 
18
- def initialize(page, xobject)
20
+ def initialize(page, xobject, options = {})
19
21
  @page = page
20
22
  @objects = page.objects
23
+ @cache = options[:cache] || {}
21
24
  @xobject = @objects.deref(xobject)
22
25
  end
23
26
 
@@ -65,12 +68,30 @@ module PDF
65
68
  end
66
69
  end
67
70
 
71
+ def content_stream_md5
72
+ @content_stream_md5 ||= Digest::MD5.hexdigest(raw_content)
73
+ end
74
+
75
+ def cached_tokens_key
76
+ @cached_tokens_key ||= "tokens-#{content_stream_md5}"
77
+ end
78
+
79
+ def tokens
80
+ @cache[cached_tokens_key] ||= begin
81
+ buffer = Buffer.new(StringIO.new(raw_content), :content_stream => true)
82
+ parser = Parser.new(buffer, @objects)
83
+ result = []
84
+ while (token = parser.parse_token(PagesStrategy::OPERATORS))
85
+ result << token
86
+ end
87
+ result
88
+ end
89
+ end
90
+
68
91
  def content_stream(receivers, instructions)
69
- buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
70
- parser = Parser.new(buffer, @objects)
71
92
  params = []
72
93
 
73
- while (token = parser.parse_token(PagesStrategy::OPERATORS))
94
+ tokens.each do |token|
74
95
  if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
75
96
  callback(receivers, PagesStrategy::OPERATORS[token], params)
76
97
  params.clear
@@ -26,7 +26,8 @@
26
26
  class PDF::Reader
27
27
  class GlyphHash # :nodoc:
28
28
  def initialize
29
- @adobe = load_adobe_glyph_mapping
29
+ # only parse the glyph list once, and cache the results (for performance)
30
+ @adobe = @@cache ||= load_adobe_glyph_mapping
30
31
  end
31
32
 
32
33
  # attempt to convert a PDF Name to a unicode codepoint. Returns nil
@@ -82,7 +83,7 @@ class PDF::Reader
82
83
  end
83
84
  end
84
85
 
85
- glyphs
86
+ glyphs.freeze
86
87
  end
87
88
 
88
89
  end
@@ -1,10 +1,12 @@
1
1
  # coding: utf-8
2
2
 
3
+ require 'hashery'
4
+
3
5
  class PDF::Reader
4
6
 
5
7
  # A Hash-like object for caching commonly used objects from a PDF file.
6
8
  #
7
- # This is an internal class used by PDF::Reader::ObjectHash
9
+ # This is an internal class, no promises about a stable API.
8
10
  #
9
11
  class ObjectCache # nodoc
10
12
 
@@ -13,53 +15,67 @@ class PDF::Reader
13
15
  # avoid lots of repetitive (and expensive) tokenising
14
16
  CACHEABLE_TYPES = [:Catalog, :Page, :Pages]
15
17
 
16
- def initialize
18
+ attr_reader :hits, :misses
19
+
20
+ def initialize(lru_size = 1000)
17
21
  @objects = {}
22
+ @lru_cache = Hashery::LRUHash.new(lru_size.to_i)
23
+ @hits = 0
24
+ @misses = 0
18
25
  end
19
26
 
20
27
  def [](key)
21
- @objects[key]
28
+ update_stats(key)
29
+ @objects[key] || @lru_cache[key]
22
30
  end
23
31
 
24
32
  def []=(key, value)
25
- @objects[key] = value if cacheable?(value)
33
+ if cacheable?(value)
34
+ @objects[key] = value
35
+ else
36
+ @lru_cache[key] = value
37
+ end
26
38
  end
27
39
 
28
40
  def fetch(key, local_default = nil)
29
- @objects.fetch(key, local_default)
41
+ update_stats(key)
42
+ @objects[key] || @lru_cache.fetch(key, local_default)
30
43
  end
31
44
 
32
45
  def each(&block)
33
46
  @objects.each(&block)
47
+ @lru_cache.each(&block)
34
48
  end
35
49
  alias :each_pair :each
36
50
 
37
51
  def each_key(&block)
38
52
  @objects.each_key(&block)
53
+ @lru_cache.each_key(&block)
39
54
  end
40
55
 
41
56
  def each_value(&block)
42
57
  @objects.each_value(&block)
58
+ @lru_cache.each_value(&block)
43
59
  end
44
60
 
45
61
  def size
46
- @objects.size
62
+ @objects.size + @lru_cache.size
47
63
  end
48
64
  alias :length :size
49
65
 
50
66
  def empty?
51
- @objects.empty?
67
+ @objects.empty? && @lru_cache.empty?
52
68
  end
53
69
 
54
- def has_key?(key)
55
- @objects.has_key?(key)
70
+ def include?(key)
71
+ @objects.include?(key) || @lru_cache.include?(key)
56
72
  end
57
- alias :include? :has_key?
58
- alias :key? :has_key?
59
- alias :member? :has_key?
73
+ alias :has_key? :include?
74
+ alias :key? :include?
75
+ alias :member? :include?
60
76
 
61
77
  def has_value?(value)
62
- @objects.has_value?(value)
78
+ @objects.has_value?(value) || @lru_cache.has_value?(value)
63
79
  end
64
80
 
65
81
  def to_s
@@ -67,19 +83,26 @@ class PDF::Reader
67
83
  end
68
84
 
69
85
  def keys
70
- @objects.keys
86
+ @objects.keys + @lru_cache.keys
71
87
  end
72
88
 
73
89
  def values
74
- @objects.values
90
+ @objects.values + @lru_cache.values
75
91
  end
76
92
 
77
93
  private
78
94
 
95
+ def update_stats(key)
96
+ if has_key?(key)
97
+ @hits += 1
98
+ else
99
+ @misses += 1
100
+ end
101
+ end
102
+
79
103
  def cacheable?(obj)
80
104
  obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
81
105
  end
82
106
 
83
-
84
107
  end
85
108
  end