pdf-reader 0.10.1 → 0.11.0.alpha

Sign up to get free protection for your applications and to get access to all the features.
@@ -2,6 +2,8 @@
2
2
 
3
3
  class PDF::Reader
4
4
 
5
+ # DEPRECATED: this class was deprecated in version 0.11.0 and will
6
+ # eventually be removed
5
7
  class AbstractStrategy # :nodoc:
6
8
 
7
9
  def initialize(ohash, receivers, options = {})
@@ -74,24 +74,23 @@ class PDF::Reader
74
74
  #
75
75
  # options:
76
76
  #
77
- # :skip_eol - if true, the IO stream is advanced past any LF or CR
78
- # bytes before it reads any data. This is to handle
79
- # content streams, which have a CRLF or LF after the stream
80
- # token.
77
+ # :skip_eol - if true, the IO stream is advanced past a CRLF or LF that
78
+ # is sitting under the io cursor.
81
79
  #
82
80
  def read(bytes, opts = {})
83
81
  reset_pos
84
82
 
85
83
  if opts[:skip_eol]
86
- done = false
87
- while !done
88
- chr = @io.read(1)
89
- if chr.nil?
90
- return nil
91
- elsif chr != "\n" && chr != "\r"
92
- @io.seek(-1, IO::SEEK_CUR)
93
- done = true
94
- end
84
+ @io.seek(-1, IO::SEEK_CUR)
85
+ str = @io.read(2)
86
+ if str.nil?
87
+ return nil
88
+ elsif str == "\r\n"
89
+ # do nothing
90
+ elsif str[0,1] == "\n"
91
+ @io.seek(-1, IO::SEEK_CUR)
92
+ else
93
+ @io.seek(-2, IO::SEEK_CUR)
95
94
  end
96
95
  end
97
96
 
@@ -26,8 +26,21 @@
26
26
  class PDF::Reader
27
27
  class Font
28
28
  attr_accessor :label, :subtype, :encoding, :descendantfonts, :tounicode
29
+ attr_reader :widths, :first_char, :ascent, :descent, :missing_width, :bbox
29
30
  attr_reader :basefont
30
31
 
32
# Builds a font wrapper from a raw PDF font dictionary.
#
# * ohash - the object hash used to resolve indirect objects; kept in @ohash
# * obj   - the font dictionary to read the font details from
#
# Calling this with either argument missing is deprecated: a warning is
# written to $stderr and the instance is left unpopulated.
#
def initialize(ohash = nil, obj = nil)
  if ohash.nil? || obj.nil?
    $stderr.puts "DEPRECATION WARNING - PDF::Reader::Font.new should be called with 2 args"
    return
  end
  @ohash = ohash

  extract_base_info(obj)
  extract_descriptor(obj)
  extract_descendants(obj)
end
43
+
31
44
  # returns a hash that maps glyph names to unicode codepoints. The mapping is based on
32
45
  # a text file supplied by Adobe at:
33
46
  # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
@@ -73,5 +86,48 @@ class PDF::Reader
73
86
  params
74
87
  end
75
88
  end
89
+
90
# Looks up the width of a glyph in the font's widths array.
#
# * c - a string; only its first codepoint is used
#
# The widths array is indexed relative to @first_char, so the lookup index
# is (codepoint - @first_char). Returns 0 when the font has no widths array,
# and @missing_width (defaulted to 0) when the codepoint falls outside it.
#
def glyph_width(c)
  @missing_width ||= 0
  return 0 if @widths.nil?

  # treat an absent first char as 0 instead of raising on Integer - nil
  index = c.codepoints.first - (@first_char || 0)

  # Array#fetch counts negative indexes from the END of the array, so a
  # codepoint below @first_char must be rejected before the lookup
  return @missing_width if index < 0

  @widths.fetch(index, @missing_width)
end
98
+
99
+ private
100
+
101
# Copies the basic entries of the font dictionary into instance variables:
# subtype, base font, encoding, widths, first char and (when present) the
# ToUnicode CMap. Every value is resolved through @ohash first.
#
# * obj - the font dictionary
#
def extract_base_info(obj)
  @subtype    = @ohash.object(obj[:Subtype])
  @basefont   = @ohash.object(obj[:BaseFont])
  @encoding   = PDF::Reader::Encoding.new(@ohash.object(obj[:Encoding]))
  @widths     = @ohash.object(obj[:Widths])
  @first_char = @ohash.object(obj[:FirstChar])
  return unless obj[:ToUnicode]

  stream = @ohash.object(obj[:ToUnicode])
  # a reference that cannot be resolved may come back as nil; leave
  # @tounicode unset in that case rather than crash on nil
  if stream.respond_to?(:unfiltered_data)
    @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
  end
end
112
+
113
# Copies the metrics stored in the font's FontDescriptor dictionary
# (ascent, descent, missing width and bounding box) into instance variables.
# Does nothing when the font has no descriptor, or when the descriptor
# reference does not resolve to a dictionary.
#
# * obj - the font dictionary
#
def extract_descriptor(obj)
  return unless obj[:FontDescriptor]

  fd = @ohash.object(obj[:FontDescriptor])
  # an unresolvable reference may come back as nil; skip it instead of
  # raising NoMethodError on nil[]
  return unless fd.is_a?(::Hash)

  @ascent        = @ohash.object(fd[:Ascent])
  @descent       = @ohash.object(fd[:Descent])
  @missing_width = @ohash.object(fd[:MissingWidth])
  @bbox          = @ohash.object(fd[:FontBBox])
end
122
+
123
# Builds a PDF::Reader::Font for each entry of the font's DescendantFonts
# list and stores the result in @descendantfonts. Does nothing when the
# font has no descendants, or when the list reference cannot be resolved.
#
# * obj - the font dictionary
#
def extract_descendants(obj)
  return unless obj[:DescendantFonts]

  descendants = @ohash.object(obj[:DescendantFonts])
  # an unresolvable reference may come back as nil; skip it instead of
  # raising NoMethodError on nil.map
  return unless descendants.respond_to?(:map)

  @descendantfonts = descendants.map { |desc|
    PDF::Reader::Font.new(@ohash, @ohash.object(desc))
  }
end
131
+
76
132
  end
77
133
  end
@@ -1,4 +1,43 @@
1
- # This file maps glyph names to unicode codepoints
1
+ # ###################################################################################
2
+ # Copyright (c) 1997,1998,2002,2007 Adobe Systems Incorporated
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining a
5
+ # copy of this documentation file to use, copy, publish, distribute,
6
+ # sublicense, and/or sell copies of the documentation, and to permit
7
+ # others to do the same, provided that:
8
+ # - No modification, editing or other alteration of this document is
9
+ # allowed; and
10
+ # - The above copyright notice and this permission notice shall be
11
+ # included in all copies of the documentation.
12
+ #
13
+ # Permission is hereby granted, free of charge, to any person obtaining a
14
+ # copy of this documentation file, to create their own derivative works
15
+ # from the content of this document to use, copy, publish, distribute,
16
+ # sublicense, and/or sell the derivative works, and to permit others to do
17
+ # the same, provided that the derived work is not represented as being a
18
+ # copy or version of this document.
19
+ #
20
+ # Adobe shall not be liable to any party for any loss of revenue or profit
21
+ # or for indirect, incidental, special, consequential, or other similar
22
+ # damages, whether based on tort (including without limitation negligence
23
+ # or strict liability), contract or other legal or equitable grounds even
24
+ # if Adobe has been advised or had reason to know of the possibility of
25
+ # such damages. The Adobe materials are provided on an "AS IS" basis.
26
+ # Adobe specifically disclaims all express, statutory, or implied
27
+ # warranties relating to the Adobe materials, including but not limited to
28
+ # those concerning merchantability or fitness for a particular purpose or
29
+ # non-infringement of any third party rights regarding the Adobe
30
+ # materials.
31
+ # ###################################################################################
32
+ # Name: Adobe Glyph List
33
+ # Table version: 2.0
34
+ # Date: September 20, 2002
35
+ #
36
+ # See http://partners.adobe.com/asn/developer/typeforum/unicodegn.html
37
+ #
38
+ # Format: Semicolon-delimited fields:
39
+ # (1) glyph name
40
+ # (2) Unicode scalar value
2
41
  A;0041
3
42
  AE;00C6
4
43
  AEacute;01FC
@@ -2,6 +2,9 @@
2
2
 
3
3
  class PDF::Reader
4
4
 
5
+ # DEPRECATED: this class was deprecated in version 0.11.0 and will
6
+ # eventually be removed
7
+ #
5
8
  class MetadataStrategy < AbstractStrategy # :nodoc:
6
9
 
7
10
  def self.to_sym
@@ -0,0 +1,85 @@
1
+ # coding: utf-8
2
+
3
module PDF
  class Reader

    # A Hash-like object for caching commonly used objects from a PDF file.
    #
    # Values are only stored when they are Hashes whose :Type entry is listed
    # in CACHEABLE_TYPES; assigning any other value is silently ignored.
    #
    # This is an internal class used by PDF::Reader::ObjectHash
    #
    class ObjectCache # :nodoc:

      # These object types use little memory and are accessed a heap of times as
      # part of random page access, so we'll cache the unmarshalled objects and
      # avoid lots of repetitive (and expensive) tokenising
      CACHEABLE_TYPES = [:Catalog, :Page, :Pages].freeze

      def initialize
        @objects = {}
      end

      # returns the cached value for key, or nil if nothing is cached for it
      def [](key)
        @objects[key]
      end

      # stores value under key, but only if the value is cacheable
      def []=(key, value)
        @objects[key] = value if cacheable?(value)
      end

      # returns the cached value for key, or local_default (nil when omitted)
      # if nothing is cached for it. Unlike Hash#fetch this never raises for
      # a missing key.
      def fetch(key, local_default = nil)
        @objects.fetch(key, local_default)
      end

      # yields each cached key/value pair
      def each(&block)
        @objects.each(&block)
      end
      alias :each_pair :each

      # yields each cached key
      def each_key(&block)
        @objects.each_key(&block)
      end

      # yields each cached value
      def each_value(&block)
        @objects.each_value(&block)
      end

      # the number of cached objects
      def size
        @objects.size
      end
      alias :length :size

      # true if nothing has been cached
      def empty?
        @objects.empty?
      end

      # true if an object is cached under key
      def has_key?(key)
        @objects.has_key?(key)
      end
      alias :include? :has_key?
      alias :key? :has_key?
      alias :member? :has_key?

      # true if value is one of the cached objects
      def has_value?(value)
        @objects.has_value?(value)
      end

      def to_s
        "<PDF::Reader::ObjectCache size: #{self.size}>"
      end

      # an array of all cached keys
      def keys
        @objects.keys
      end

      # an array of all cached values
      def values
        @objects.values
      end

      private

      # only Hashes with a :Type listed in CACHEABLE_TYPES are worth caching
      def cacheable?(obj)
        obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
      end

    end
  end
end
@@ -5,7 +5,7 @@ class PDF::Reader
5
5
  # object.
6
6
  #
7
7
  # A PDF file can be viewed as a large hash map. It is a series of objects
8
- # stored at an exact byte offsets, and a table that maps object IDs to byte
8
+ # stored at precise byte offsets, and a table that maps object IDs to byte
9
9
  # offsets. Given an object ID, looking up an object is an O(1) operation.
10
10
  #
11
11
  # Each PDF object can be mapped to a ruby object, so by passing an object
@@ -28,6 +28,8 @@ class PDF::Reader
28
28
  class ObjectHash
29
29
  include Enumerable
30
30
 
31
+ CACHEABLE_TYPES = [:Catalog, :Page, :Pages]
32
+
31
33
  attr_accessor :default
32
34
  attr_reader :trailer, :pdf_version
33
35
 
@@ -50,6 +52,11 @@ class PDF::Reader
50
52
  @pdf_version = read_version
51
53
  @xref = PDF::Reader::XRef.new(@io)
52
54
  @trailer = @xref.trailer
55
+ @cache = PDF::Reader::ObjectCache.new
56
+
57
+ if trailer[:Encrypt]
58
+ raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
59
+ end
53
60
  end
54
61
 
55
62
  # returns the type of object a ref points to
@@ -81,25 +88,32 @@ class PDF::Reader
81
88
  unless key.kind_of?(PDF::Reader::Reference)
82
89
  key = PDF::Reader::Reference.new(key.to_i, 0)
83
90
  end
84
- if xref[key].is_a?(Fixnum)
91
+ if @cache.has_key?(key)
92
+ @cache[key]
93
+ elsif xref[key].is_a?(Fixnum)
85
94
  buf = new_buffer(xref[key])
86
- Parser.new(buf, self).object(key.id, key.gen)
95
+ @cache[key] = Parser.new(buf, self).object(key.id, key.gen)
87
96
  elsif xref[key].is_a?(PDF::Reader::Reference)
88
97
  container_key = xref[key]
89
98
  object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
90
- object_streams[container_key][key.id]
99
+ @cache[key] = object_streams[container_key][key.id]
91
100
  end
92
101
  rescue InvalidObjectError
93
102
  return default
94
103
  end
95
104
  end
96
105
 
106
+ def cacheable?(obj)
107
+ obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
108
+ end
109
+
97
110
  # If key is a PDF::Reader::Reference object, lookup the corresponding
98
111
  # object in the PDF and return it. Otherwise return key untouched.
99
112
  #
100
113
  def object(key)
101
114
  key.is_a?(PDF::Reader::Reference) ? self[key] : key
102
115
  end
116
+ alias :deref :object
103
117
 
104
118
  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
105
119
  # object.
@@ -192,7 +206,7 @@ class PDF::Reader
192
206
  alias :value? :has_key?
193
207
 
194
208
  def to_s
195
- "<PDF::Reader::ObejctHash size: #{self.size}>"
209
+ "<PDF::Reader::ObjectHash size: #{self.size}>"
196
210
  end
197
211
 
198
212
  # return an array of all keys in the file
@@ -0,0 +1,172 @@
1
+ # coding: utf-8
2
+
3
module PDF
  class Reader

    # high level representation of a single PDF page. Ties together the various
    # low level classes in PDF::Reader and provides access to the various
    # components of the page (text, images, fonts, etc) in convenient formats.
    #
    # If you require access to the raw PDF objects for this page, you can access
    # the Page dictionary via the page_object accessor. You will need to use the
    # objects accessor to help walk the page dictionary in any useful way.
    #
    class Page

      # lowlevel hash-like access to all objects in the underlying PDF
      attr_reader :objects

      # the raw PDF object that defines this page
      attr_reader :page_object

      # creates a new page wrapper.
      #
      # * objects - an ObjectHash instance that wraps a PDF file
      # * pagenum - an int specifying the page number to expose. 1 indexed.
      #
      # Raises ArgumentError if pagenum does not identify a page dictionary.
      #
      def initialize(objects, pagenum)
        @objects, @pagenum = objects, pagenum

        # page numbers are 1 indexed. Anything lower would become a negative
        # array index and silently select a page counted from the end
        raise ArgumentError, "invalid page: #{pagenum}" if pagenum < 1

        @page_object = objects.deref(objects.page_references[pagenum - 1])

        unless @page_object.is_a?(::Hash)
          raise ArgumentError, "invalid page: #{pagenum}"
        end
      end

      # return the number of this page within the full document
      #
      def number
        @pagenum
      end

      # return a friendly string representation of this page
      #
      def inspect
        "<PDF::Reader::Page page: #{@pagenum}>"
      end

      # Returns the attributes that accompany this page. Includes
      # attributes inherited from parents.
      #
      def attributes
        hash = {}
        # merge from the most distant ancestor down to the page itself, so
        # that values set closer to the page win
        page_with_ancestors.reverse_each do |obj|
          hash.merge!(@objects.deref(obj))
        end
        hash
      end

      # Returns the resources that accompany this page. Includes
      # resources inherited from parents.
      #
      def resources
        @resources ||= @objects.deref(attributes[:Resources]) || {}
      end

      # return a hash of fonts used on this page.
      #
      # The keys are the font labels used within the page content stream.
      #
      # The values are PDF::Reader::Font instances that provide access
      # to most available metrics for each font.
      #
      def fonts
        # fall back to an empty hash when there is no :Font entry or when
        # the entry cannot be resolved
        raw_fonts = objects.deref(resources[:Font]) || {}
        ::Hash[raw_fonts.map { |label, font|
          [label, PDF::Reader::Font.new(objects, objects.deref(font))]
        }]
      end

      # returns the plain text content of this page encoded as UTF-8. Any
      # characters that can't be translated will be returned as a ▯
      #
      def text
        text_receiver = PageTextReceiver.new(fonts)
        walk(text_receiver)
        text_receiver.content
      end
      alias :to_s :text

      # processes the raw content stream for this page in sequential order and
      # passes callbacks to the receiver objects.
      #
      # This is mostly low level and you can probably ignore it unless you need
      # access to something like the raw encoded text. For an example of how
      # this can be used as a basis for higher level functionality, see the
      # text() method
      #
      def walk(*receivers)
        callback(receivers, :page=, [self])
        content_stream(receivers, raw_content)
      end

      # returns the raw content stream for this page. This is plumbing, nothing to
      # see here unless you're a PDF nerd like me.
      #
      def raw_content
        contents = objects.deref(@page_object[:Contents])
        # :Contents may be a single stream or an array of streams; normalise
        # to a flat list, then join the decoded data of each stream
        [contents].flatten.compact.map { |obj|
          objects.deref(obj)
        }.map { |obj|
          obj.unfiltered_data
        }.join
      end

      private

      # the object referenced by the trailer's :Root entry, looked up once
      # and then remembered
      #
      def root
        @root ||= objects.deref(@objects.trailer[:Root])
      end

      # the :XObject entry of this page's resources, or an empty hash if
      # there is none
      #
      def xobjects
        objects.deref(resources[:XObject]) || {}
      end

      # tokenises instructions and, each time an operator token is reached,
      # passes the operands collected since the previous operator to the
      # matching callback on every receiver.
      #
      def content_stream(receivers, instructions)
        buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
        parser = Parser.new(buffer, @objects)
        params = []

        while (token = parser.parse_token(PagesStrategy::OPERATORS))
          if token.kind_of?(Token) && PagesStrategy::OPERATORS.has_key?(token)
            callback(receivers, PagesStrategy::OPERATORS[token], params)
            params.clear
          else
            params << token
          end
        end
      rescue EOFError
        raise MalformedPDFError, "End Of File while processing a content stream"
      end

      # calls the name callback method on each receiver that responds to it,
      # with params as the arguments
      #
      def callback(receivers, name, params=[])
        receivers.each do |receiver|
          receiver.send(name, *params) if receiver.respond_to?(name)
        end
      end

      # returns an array that starts with a dictionary and is followed by the
      # inheritable entries of each of its ancestors, nearest first.
      #
      # With no argument the array starts with the full page dictionary. With
      # an argument it starts with the inheritable entries of that object.
      #
      def page_with_ancestors(obj = nil)
        obj = objects.deref(obj)
        if obj.nil?
          [@page_object] + inheritable_ancestors(@page_object)
        else
          [select_inheritable(obj)] + inheritable_ancestors(obj)
        end
      end

      # walks the :Parent chain above node and returns the inheritable entries
      # of each ancestor, nearest first. The walk stops at the first parent
      # that is missing or does not resolve to a dictionary, so a page with
      # no usable :Parent yields an empty list instead of recursing forever.
      #
      def inheritable_ancestors(node)
        parent = objects.deref(node[:Parent])
        return [] unless parent.is_a?(::Hash)

        [select_inheritable(parent)] + inheritable_ancestors(parent)
      end

      # select the elements from a Pages dictionary that can be inherited by
      # child Page dictionaries.
      #
      def select_inheritable(obj)
        ::Hash[obj.select { |key, value|
          [:Resources, :MediaBox, :CropBox, :Rotate, :Parent].include?(key)
        }]
      end

    end
  end
end