pdf-reader 1.1.1 → 1.2.0

This diff shows the content changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
@@ -0,0 +1,25 @@
+ # coding: utf-8
+
+ require 'ascii85'
+
+ class PDF::Reader
+   module Filter # :nodoc:
+     class Ascii85
+       def initialize(options = {})
+         @options = options
+       end
+
+       ################################################################################
+       # Decode the specified data using the Ascii85 algorithm. Relies on the Ascii85
+       # rubygem.
+       #
+       def filter(data)
+         data = "<~#{data}" unless data.to_s[0,2] == "<~"
+         ::Ascii85::decode(data)
+       rescue Exception => e
+         # Oops, there was a problem decoding the stream
+         raise MalformedPDFError, "Error occurred while decoding an ASCII85 stream (#{e.class.to_s}: #{e.to_s})"
+       end
+     end
+   end
+ end
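Before handing the stream to the ascii85 rubygem, the filter prepends the "<~" delimiter when it is missing, since PDF ASCII85 streams usually omit it. A minimal usage sketch follows; it assumes that require 'pdf-reader' makes the filter class available, and it uses the ascii85 gem's own encoder only to produce sample input:

    require 'ascii85'
    require 'pdf-reader'

    # Round-trip a short string through the filter. Ascii85.encode wraps its
    # output in "<~ ... ~>"; the filter tolerates input with or without the
    # leading "<~", because PDF streams usually drop it.
    encoded = Ascii85.encode("Hello PDF")
    decoder = PDF::Reader::Filter::Ascii85.new

    decoder.filter(encoded)                    # => "Hello PDF"
    decoder.filter(encoded.sub(/\A<~/, ""))    # => "Hello PDF"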
@@ -0,0 +1,26 @@
+ # coding: utf-8
+ #
+ class PDF::Reader
+   module Filter # :nodoc:
+     class AsciiHex
+       def initialize(options = {})
+         @options = options
+       end
+
+       ################################################################################
+       # Decode the specified data using the AsciiHex algorithm.
+       #
+       def filter(data)
+         data.chop! if data[-1,1] == ">"
+         data = data[1,data.size] if data[0,1] == "<"
+         data.gsub!(/[^A-Fa-f0-9]/,"")
+         data << "0" if data.size % 2 == 1
+         data.scan(/.{2}/).map { |s| s.hex.chr }.join("")
+       rescue Exception => e
+         # Oops, there was a problem decoding the stream
+         raise MalformedPDFError, "Error occurred while decoding an ASCIIHex stream (#{e.class.to_s}: #{e.to_s})"
+       end
+     end
+   end
+ end
+
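The decoder strips the angle-bracket delimiters and anything that is not a hex digit, pads an odd number of digits with a trailing zero, and turns each hex pair into a byte. A rough sketch with made-up input, again assuming the filter class is loadable via require 'pdf-reader' (note that the filter mutates the string it is given):

    require 'pdf-reader'

    filter = PDF::Reader::Filter::AsciiHex.new

    # Delimiters and whitespace are stripped before the hex pairs are decoded.
    filter.filter("<48 65 6C 6C 6F>")   # => "Hello"

    # An odd number of digits is padded with a trailing "0", so the final
    # "7" decodes as 0x70 ("p").
    filter.filter("<48 69 7>")          # => "Hip"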
@@ -0,0 +1,138 @@
+ # coding: utf-8
+
+ class PDF::Reader
+   module Filter # :nodoc:
+     class Depredict
+       def initialize(options = {})
+         @options = options || {}
+       end
+
+       ################################################################################
+       # Streams can be preprocessed to improve compression. This reverses the
+       # preprocessing
+       #
+       def filter(data)
+         predictor = @options[:Predictor].to_i
+
+         case predictor
+         when 0, 1 then
+           data
+         when 2 then
+           tiff_depredict(data)
+         when 10, 11, 12, 13, 14, 15 then
+           png_depredict(data)
+         else
+           raise MalformedPDFError, "Unrecognised predictor value (#{predictor})"
+         end
+       end
+
+       private
+
+       ################################################################################
+       def tiff_depredict(data)
+         data = data.unpack("C*")
+         unfiltered = []
+         bpc = @options[:BitsPerComponent] || 8
+         pixel_bits = bpc * @options[:Colors]
+         pixel_bytes = pixel_bits / 8
+         line_len = (pixel_bytes * @options[:Columns])
+         pos = 0
+
+         if bpc != 8
+           raise UnsupportedFeatureError, "TIFF predictor only supports 8 Bits Per Component"
+         end
+
+         until pos > data.size
+           row_data = data[pos, line_len]
+           row_data.each_with_index do |byte, index|
+             left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
+             row_data[index] = (byte + left) % 256
+           end
+           unfiltered += row_data
+           pos += line_len
+         end
+
+         unfiltered.pack("C*")
+       end
+       ################################################################################
+       def png_depredict(data)
+         return data if @options[:Predictor].to_i < 10
+
+         data = data.unpack("C*")
+
+         pixel_bytes = @options[:Colors] || 1
+         scanline_length = (pixel_bytes * @options[:Columns]) + 1
+         row = 0
+         pixels = []
+         paeth, pa, pb, pc = nil
+         until data.empty? do
+           row_data = data.slice! 0, scanline_length
+           filter = row_data.shift
+           case filter
+           when 0 # None
+           when 1 # Sub
+             row_data.each_with_index do |byte, index|
+               left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
+               row_data[index] = (byte + left) % 256
+               #p [byte, left, row_data[index]]
+             end
+           when 2 # Up
+             row_data.each_with_index do |byte, index|
+               col = index / pixel_bytes
+               upper = row == 0 ? 0 : pixels[row-1][col][index % pixel_bytes]
+               row_data[index] = (upper + byte) % 256
+             end
+           when 3 # Average
+             row_data.each_with_index do |byte, index|
+               col = index / pixel_bytes
+               upper = row == 0 ? 0 : pixels[row-1][col][index % pixel_bytes]
+               left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
+
+               row_data[index] = (byte + ((left + upper)/2).floor) % 256
+             end
+           when 4 # Paeth
+             left = upper = upper_left = nil
+             row_data.each_with_index do |byte, index|
+               col = index / pixel_bytes
+
+               left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
+               if row.zero?
+                 upper = upper_left = 0
+               else
+                 upper = pixels[row-1][col][index % pixel_bytes]
+                 upper_left = col.zero? ? 0 :
+                   pixels[row-1][col-1][index % pixel_bytes]
+               end
+
+               p = left + upper - upper_left
+               pa = (p - left).abs
+               pb = (p - upper).abs
+               pc = (p - upper_left).abs
+
+               paeth = if pa <= pb && pa <= pc
+                 left
+               elsif pb <= pc
+                 upper
+               else
+                 upper_left
+               end
+
+               row_data[index] = (byte + paeth) % 256
+             end
+           else
+             raise ArgumentError, "Invalid filter algorithm #{filter}"
+           end
+
+           s = []
+           row_data.each_slice pixel_bytes do |slice|
+             s << slice
+           end
+           pixels << s
+           row += 1
+         end
+
+         pixels.map { |bytes| bytes.flatten.pack("C*") }.join("")
+       end
+     end
+   end
+ end
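The PNG path works one scanline at a time: the first byte of each scanline names the PNG filter type (None, Sub, Up, Average or Paeth), and every remaining byte is reconstructed from its left, upper and upper-left neighbours modulo 256. A small sketch with fabricated byte values, reversing a single Sub-filtered scanline:

    require 'pdf-reader'

    # One 4-pixel, 1-component scanline encoded with the PNG "Sub" filter:
    # a filter byte of 1 followed by left-deltas. Any Predictor value from
    # 10 to 15 selects the PNG depredict path.
    options = { :Predictor => 12, :Colors => 1, :Columns => 4 }
    encoded = [1, 10, 10, 10, 10].pack("C*")

    PDF::Reader::Filter::Depredict.new(options).filter(encoded).unpack("C*")
    # => [10, 20, 30, 40]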
@@ -0,0 +1,38 @@
+ # coding: utf-8
+
+
+ require 'zlib'
+
+ class PDF::Reader
+   module Filter # :nodoc:
+     class Flate
+       def initialize(options = {})
+         @options = options
+       end
+
+       ################################################################################
+       # Decode the specified data with the Zlib compression algorithm
+       def filter(data)
+         deflated = nil
+         begin
+           deflated = Zlib::Inflate.new.inflate(data)
+         rescue Zlib::DataError => e
+           # by default, Ruby's Zlib assumes the data it's inflating
+           # is RFC1951 deflated data, wrapped in a RFC1950 zlib container.
+           # If that fails, then use an undocumented 'feature' to attempt to inflate
+           # the data as a raw RFC1951 stream.
+           #
+           # See
+           # - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
+           # - http://www.gzip.org/zlib/zlib_faq.html#faq38
+           deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
+         end
+         Depredict.new(@options).filter(deflated)
+       rescue Exception => e
+         # Oops, there was a problem inflating the stream
+         raise MalformedPDFError, "Error occurred while inflating a compressed stream (#{e.class.to_s}: #{e.to_s})"
+       end
+     end
+   end
+ end
+
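The rescue branch relies on a long-standing zlib behaviour: constructing the inflater with negative window bits makes it expect a raw RFC 1951 deflate stream with no zlib (RFC 1950) header. A standalone sketch of that behaviour using only Ruby's zlib, independent of any PDF:

    require 'zlib'

    data = "some repetitive stream content " * 10

    # A zlib-wrapped (RFC 1950) stream inflates with the default inflater.
    wrapped = Zlib::Deflate.deflate(data)
    Zlib::Inflate.new.inflate(wrapped) == data                       # => true

    # A raw deflate (RFC 1951) stream makes the default inflater raise
    # Zlib::DataError; negative window bits, the fallback used above, accepts it.
    raw = Zlib::Deflate.new(Zlib::DEFAULT_COMPRESSION, -Zlib::MAX_WBITS)
    raw_stream = raw.deflate(data, Zlib::FINISH)
    raw.close

    Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(raw_stream) == data  # => true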
@@ -0,0 +1,18 @@
+ # coding: utf-8
+ #
+ class PDF::Reader
+   module Filter # :nodoc:
+     class Lzw
+       def initialize(options = {})
+         @options = options
+       end
+
+       ################################################################################
+       # Decode the specified data with the LZW compression algorithm
+       def filter(data)
+         data = PDF::Reader::LZW.decode(data)
+         Depredict.new(@options).filter(data)
+       end
+     end
+   end
+ end
@@ -0,0 +1,15 @@
+ # coding: utf-8
+ #
+ class PDF::Reader
+   module Filter # :nodoc:
+     class Null
+       def initialize(options = {})
+         @options = options
+       end
+
+       def filter(data)
+         data
+       end
+     end
+   end
+ end
@@ -0,0 +1,46 @@
+ # coding: utf-8
+ #
+ class PDF::Reader
+   module Filter # :nodoc:
+     class RunLength
+       def initialize(options = {})
+         @options = options
+       end
+
+       ################################################################################
+       # Decode the specified data with the RunLengthDecode compression algorithm
+       def filter(data)
+         pos = 0
+         out = ""
+
+         while pos < data.length
+           if data.respond_to?(:getbyte)
+             length = data.getbyte(pos)
+           else
+             length = data[pos]
+           end
+           pos += 1
+
+           case
+           when length == 128
+             break
+           when length < 128
+             # When the length is < 128, we copy the following length+1 bytes
+             # literally.
+             out << data[pos, length + 1]
+             pos += length
+           else
+             # When the length is > 128, we copy the next byte (257 - length)
+             # times; i.e., "\xFA\x00" ([250, 0]) will expand to
+             # "\x00\x00\x00\x00\x00\x00\x00".
+             out << data[pos, 1] * (257 - length)
+           end
+
+           pos += 1
+         end
+
+         Depredict.new(@options).filter(out)
+       end
+     end
+   end
+ end
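RunLength streams alternate a length byte with data: a length below 128 copies the next length+1 bytes verbatim, a length above 128 repeats the next byte 257-length times, and 128 ends the stream. A worked sketch with made-up bytes, assuming the filter class is loadable via require 'pdf-reader':

    require 'pdf-reader'

    # "\x02" copies the next 3 bytes literally, "\xFA" (250) repeats the
    # next byte 257 - 250 = 7 times, and "\x80" (128) marks end-of-data.
    encoded = "\x02abc\xFAz\x80"

    PDF::Reader::Filter::RunLength.new.filter(encoded)
    # => "abczzzzzzz"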
@@ -109,7 +109,7 @@ class PDF::Reader
      if params.class == String
        params.unpack(encoding.unpack).map { |c|
          @tounicode.decode(c) || PDF::Reader::Encoding::UNKNOWN_CHAR
-      }.pack("U*")
+      }.flatten.pack("U*")
      elsif params.class == Array
        params.collect { |param| to_utf8_via_cmap(param) }
      else
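The added .flatten matters because a ToUnicode CMap entry can map a single character code to several codepoints (a ligature, for example), so the decoded list may contain nested arrays, and Array#pack("U*") only accepts a flat list of integers. A tiny illustration with hypothetical decoded values:

    # A ligature maps one glyph code to three codepoints, so the decoded list
    # can contain a nested array. Packing without flattening raises TypeError;
    # flattening first yields the expected UTF-8 string.
    decoded = [[0x66, 0x66, 0x69], 0x6E]   # hypothetical: "ffi" ligature + "n"
    decoded.flatten.pack("U*")             # => "ffin"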
@@ -1,5 +1,7 @@
  # coding: utf-8

+ require 'digest/md5'
+
  module PDF
    class Reader

@@ -15,9 +17,10 @@ module PDF

      attr_reader :xobject

-     def initialize(page, xobject)
+     def initialize(page, xobject, options = {})
        @page = page
        @objects = page.objects
+       @cache = options[:cache] || {}
        @xobject = @objects.deref(xobject)
      end

@@ -65,12 +68,30 @@ module PDF
        end
      end

+     def content_stream_md5
+       @content_stream_md5 ||= Digest::MD5.hexdigest(raw_content)
+     end
+
+     def cached_tokens_key
+       @cached_tokens_key ||= "tokens-#{content_stream_md5}"
+     end
+
+     def tokens
+       @cache[cached_tokens_key] ||= begin
+         buffer = Buffer.new(StringIO.new(raw_content), :content_stream => true)
+         parser = Parser.new(buffer, @objects)
+         result = []
+         while (token = parser.parse_token(PagesStrategy::OPERATORS))
+           result << token
+         end
+         result
+       end
+     end
+
      def content_stream(receivers, instructions)
-       buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
-       parser = Parser.new(buffer, @objects)
        params = []

-       while (token = parser.parse_token(PagesStrategy::OPERATORS))
+       tokens.each do |token|
          if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
            callback(receivers, PagesStrategy::OPERATORS[token], params)
            params.clear
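These hunks tokenise the content stream once and store the result in a caller-supplied cache hash, keyed on an MD5 digest of the raw stream, so two wrappers around identical content share a single parse. The diff does not show the filename or class name, so here is a hypothetical, self-contained sketch of the same pattern; the TokenisedStream class and its toy tokeniser are illustrations, not part of pdf-reader:

    require 'digest/md5'

    # An expensive parse is memoised in a shared cache hash, keyed on a digest
    # of the raw input, mirroring the tokens/cached_tokens_key methods above.
    class TokenisedStream
      def initialize(raw_content, options = {})
        @raw_content = raw_content
        @cache       = options[:cache] || {}
      end

      def tokens
        @cache["tokens-#{Digest::MD5.hexdigest(@raw_content)}"] ||= expensive_parse
      end

      private

      def expensive_parse
        @raw_content.scan(/\S+/)   # stand-in for the real content stream tokeniser
      end
    end

    shared = {}
    TokenisedStream.new("BT /F1 12 Tf ET", :cache => shared).tokens  # parses once
    TokenisedStream.new("BT /F1 12 Tf ET", :cache => shared).tokens  # served from the cache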
@@ -26,7 +26,8 @@
  class PDF::Reader
    class GlyphHash # :nodoc:
      def initialize
-       @adobe = load_adobe_glyph_mapping
+       # only parse the glyph list once, and cache the results (for performance)
+       @adobe = @@cache ||= load_adobe_glyph_mapping
      end

      # attempt to convert a PDF Name to a unicode codepoint. Returns nil
@@ -82,7 +83,7 @@ class PDF::Reader
        end
      end

-     glyphs
+     glyphs.freeze
    end

  end
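The GlyphHash change memoises the parsed Adobe glyph list in a class variable, so only the first instance pays the cost of parsing it, and freezes the hash so the shared copy cannot be mutated. A generic sketch of the same pattern with a stand-in class and table (none of these names come from pdf-reader):

    # First instantiation builds and freezes the table; later instances reuse
    # the same frozen object via the class variable.
    class ExpensiveLookup
      attr_reader :table

      def initialize
        @table = @@cache ||= build_table.freeze
      end

      private

      def build_table
        { :Aacute => 0x00C1, :ae => 0x00E6 }   # stand-in for parsing glyphlist.txt
      end
    end

    a = ExpensiveLookup.new
    b = ExpensiveLookup.new
    a.table.equal?(b.table)   # => true, both share the one frozen hash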
@@ -1,10 +1,12 @@
  # coding: utf-8

+ require 'hashery'
+
  class PDF::Reader

    # A Hash-like object for caching commonly used objects from a PDF file.
    #
-   # This is an internal class used by PDF::Reader::ObjectHash
+   # This is an internal class, no promises about a stable API.
    #
    class ObjectCache # nodoc

@@ -13,53 +15,67 @@ class PDF::Reader
    # avoid lots of repetitive (and expensive) tokenising
    CACHEABLE_TYPES = [:Catalog, :Page, :Pages]

-   def initialize
+   attr_reader :hits, :misses
+
+   def initialize(lru_size = 1000)
      @objects = {}
+     @lru_cache = Hashery::LRUHash.new(lru_size.to_i)
+     @hits = 0
+     @misses = 0
    end

    def [](key)
-     @objects[key]
+     update_stats(key)
+     @objects[key] || @lru_cache[key]
    end

    def []=(key, value)
-     @objects[key] = value if cacheable?(value)
+     if cacheable?(value)
+       @objects[key] = value
+     else
+       @lru_cache[key] = value
+     end
    end

    def fetch(key, local_default = nil)
-     @objects.fetch(key, local_default)
+     update_stats(key)
+     @objects[key] || @lru_cache.fetch(key, local_default)
    end

    def each(&block)
      @objects.each(&block)
+     @lru_cache.each(&block)
    end
    alias :each_pair :each

    def each_key(&block)
      @objects.each_key(&block)
+     @lru_cache.each_key(&block)
    end

    def each_value(&block)
      @objects.each_value(&block)
+     @lru_cache.each_value(&block)
    end

    def size
-     @objects.size
+     @objects.size + @lru_cache.size
    end
    alias :length :size

    def empty?
-     @objects.empty?
+     @objects.empty? && @lru_cache.empty?
    end

-   def has_key?(key)
-     @objects.has_key?(key)
+   def include?(key)
+     @objects.include?(key) || @lru_cache.include?(key)
    end
-   alias :include? :has_key?
-   alias :key? :has_key?
-   alias :member? :has_key?
+   alias :has_key? :include?
+   alias :key? :include?
+   alias :member? :include?

    def has_value?(value)
-     @objects.has_value?(value)
+     @objects.has_value?(value) || @lru_cache.has_value?(value)
    end

    def to_s
@@ -67,19 +83,26 @@ class PDF::Reader
    end

    def keys
-     @objects.keys
+     @objects.keys + @lru_cache.keys
    end

    def values
-     @objects.values
+     @objects.values + @lru_cache.values
    end

    private

+   def update_stats(key)
+     if has_key?(key)
+       @hits += 1
+     else
+       @misses += 1
+     end
+   end
+
    def cacheable?(obj)
      obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
    end

-
  end
end
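ObjectCache now keeps two stores: objects whose :Type is Catalog, Page or Pages stay pinned in a plain Hash, while everything else goes into a bounded Hashery::LRUHash so memory stays flat on large files, and every read is counted as a hit or a miss. A short sketch with made-up integer keys (real keys are PDF object references):

    require 'pdf-reader'

    cache = PDF::Reader::ObjectCache.new(10)       # the LRU holds at most 10 entries

    cache[1] = { :Type => :Page, :Contents => 2 }  # cacheable type: pinned permanently
    cache[2] = "an ordinary stream object"         # anything else: bounded LRU

    cache[1]                       # present key, counted as a hit
    cache[2]                       # another hit
    cache[99]                      # absent key, counted as a miss

    [cache.hits, cache.misses]     # => [2, 1]
    cache.size                     # => 2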