pdf-reader 0.10.1 → 0.11.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,8 @@
2
2
 
3
3
  class PDF::Reader
4
4
 
5
+ # DEPRECATED: this class was deprecated in version 0.11.0 and will
6
+ # eventually be removed
5
7
  class AbstractStrategy # :nodoc:
6
8
 
7
9
  def initialize(ohash, receivers, options = {})
@@ -74,24 +74,23 @@ class PDF::Reader
74
74
  #
75
75
  # options:
76
76
  #
77
- # :skip_eol - if true, the IO stream is advanced past any LF or CR
78
- # bytes before it reads any data. This is to handle
79
- # content streams, which have a CRLF or LF after the stream
80
- # token.
77
+ # :skip_eol - if true, the IO stream is advanced past a CRLF or LF that
78
+ # is sitting under the io cursor.
81
79
  #
82
80
  def read(bytes, opts = {})
83
81
  reset_pos
84
82
 
85
83
  if opts[:skip_eol]
86
- done = false
87
- while !done
88
- chr = @io.read(1)
89
- if chr.nil?
90
- return nil
91
- elsif chr != "\n" && chr != "\r"
92
- @io.seek(-1, IO::SEEK_CUR)
93
- done = true
94
- end
84
+ @io.seek(-1, IO::SEEK_CUR)
85
+ str = @io.read(2)
86
+ if str.nil?
87
+ return nil
88
+ elsif str == "\r\n"
89
+ # do nothing
90
+ elsif str[0,1] == "\n"
91
+ @io.seek(-1, IO::SEEK_CUR)
92
+ else
93
+ @io.seek(-2, IO::SEEK_CUR)
95
94
  end
96
95
  end
97
96
 
@@ -26,8 +26,21 @@
26
26
  class PDF::Reader
27
27
  class Font
28
28
  attr_accessor :label, :subtype, :encoding, :descendantfonts, :tounicode
29
+ attr_reader :widths, :first_char, :ascent, :descent, :missing_width, :bbox
29
30
  attr_reader :basefont
30
31
 
32
+ def initialize(ohash = nil, obj = nil)
33
+ if ohash.nil? || obj.nil?
34
+ $stderr.puts "DEPREACTION WARNING - PDF::Reader::Font.new should be called with 2 args"
35
+ return
36
+ end
37
+ @ohash = ohash
38
+
39
+ extract_base_info(obj)
40
+ extract_descriptor(obj)
41
+ extract_descendants(obj)
42
+ end
43
+
31
44
  # returns a hash that maps glyph names to unicode codepoints. The mapping is based on
32
45
  # a text file supplied by Adobe at:
33
46
  # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
@@ -73,5 +86,48 @@ class PDF::Reader
73
86
  params
74
87
  end
75
88
  end
89
+
90
+ def glyph_width(c)
91
+ @missing_width ||= 0
92
+ if @widths.nil?
93
+ 0
94
+ else
95
+ @widths.fetch(c.codepoints.first - @first_char, @missing_width)
96
+ end
97
+ end
98
+
99
+ private
100
+
101
+ def extract_base_info(obj)
102
+ @subtype = @ohash.object(obj[:Subtype])
103
+ @basefont = @ohash.object(obj[:BaseFont])
104
+ @encoding = PDF::Reader::Encoding.new(@ohash.object(obj[:Encoding]))
105
+ @widths = @ohash.object(obj[:Widths])
106
+ @first_char = @ohash.object(obj[:FirstChar])
107
+ if obj[:ToUnicode]
108
+ stream = @ohash.object(obj[:ToUnicode])
109
+ @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
110
+ end
111
+ end
112
+
113
+ def extract_descriptor(obj)
114
+ return unless obj[:FontDescriptor]
115
+
116
+ fd = @ohash.object(obj[:FontDescriptor])
117
+ @ascent = @ohash.object(fd[:Ascent])
118
+ @descent = @ohash.object(fd[:Descent])
119
+ @missing_width = @ohash.object(fd[:MissingWidth])
120
+ @bbox = @ohash.object(fd[:FontBBox])
121
+ end
122
+
123
+ def extract_descendants(obj)
124
+ return unless obj[:DescendantFonts]
125
+
126
+ descendants = @ohash.object(obj[:DescendantFonts])
127
+ @descendantfonts = descendants.map { |desc|
128
+ PDF::Reader::Font.new(@ohash, @ohash.object(desc))
129
+ }
130
+ end
131
+
76
132
  end
77
133
  end
@@ -1,4 +1,43 @@
1
- # This file maps glyph names to unicode codepoints
1
+ # ###################################################################################
2
+ # Copyright (c) 1997,1998,2002,2007 Adobe Systems Incorporated
3
+ #
4
+ # Permission is hereby granted, free of charge, to any person obtaining a
5
+ # copy of this documentation file to use, copy, publish, distribute,
6
+ # sublicense, and/or sell copies of the documentation, and to permit
7
+ # others to do the same, provided that:
8
+ # - No modification, editing or other alteration of this document is
9
+ # allowed; and
10
+ # - The above copyright notice and this permission notice shall be
11
+ # included in all copies of the documentation.
12
+ #
13
+ # Permission is hereby granted, free of charge, to any person obtaining a
14
+ # copy of this documentation file, to create their own derivative works
15
+ # from the content of this document to use, copy, publish, distribute,
16
+ # sublicense, and/or sell the derivative works, and to permit others to do
17
+ # the same, provided that the derived work is not represented as being a
18
+ # copy or version of this document.
19
+ #
20
+ # Adobe shall not be liable to any party for any loss of revenue or profit
21
+ # or for indirect, incidental, special, consequential, or other similar
22
+ # damages, whether based on tort (including without limitation negligence
23
+ # or strict liability), contract or other legal or equitable grounds even
24
+ # if Adobe has been advised or had reason to know of the possibility of
25
+ # such damages. The Adobe materials are provided on an "AS IS" basis.
26
+ # Adobe specifically disclaims all express, statutory, or implied
27
+ # warranties relating to the Adobe materials, including but not limited to
28
+ # those concerning merchantability or fitness for a particular purpose or
29
+ # non-infringement of any third party rights regarding the Adobe
30
+ # materials.
31
+ # ###################################################################################
32
+ # Name: Adobe Glyph List
33
+ # Table version: 2.0
34
+ # Date: September 20, 2002
35
+ #
36
+ # See http://partners.adobe.com/asn/developer/typeforum/unicodegn.html
37
+ #
38
+ # Format: Semicolon-delimited fields:
39
+ # (1) glyph name
40
+ # (2) Unicode scalar value
2
41
  A;0041
3
42
  AE;00C6
4
43
  AEacute;01FC
@@ -2,6 +2,9 @@
2
2
 
3
3
  class PDF::Reader
4
4
 
5
+ # DEPRECATED: this class was deprecated in version 0.11.0 and will
6
+ # eventually be removed
7
+ #
5
8
  class MetadataStrategy < AbstractStrategy # :nodoc:
6
9
 
7
10
  def self.to_sym
@@ -0,0 +1,85 @@
1
+ # coding: utf-8
2
+
3
+ class PDF::Reader
4
+
5
+ # A Hash-like object for caching commonly used objects from a PDF file.
6
+ #
7
+ # This is an internal class used by PDF::Reader::ObjectHash
8
+ #
9
+ class ObjectCache # nodoc
10
+
11
+ # These object types use little memory and are accessed a heap of times as
12
+ # part of random page access, so we'll cache the unmarshalled objects and
13
+ # avoid lots of repetitive (and expensive) tokenising
14
+ CACHEABLE_TYPES = [:Catalog, :Page, :Pages]
15
+
16
+ def initialize
17
+ @objects = {}
18
+ end
19
+
20
+ def [](key)
21
+ @objects[key]
22
+ end
23
+
24
+ def []=(key, value)
25
+ @objects[key] = value if cacheable?(value)
26
+ end
27
+
28
+ def fetch(key, local_default = nil)
29
+ @objects.fetch(key, local_default)
30
+ end
31
+
32
+ def each(&block)
33
+ @objects.each(&block)
34
+ end
35
+ alias :each_pair :each
36
+
37
+ def each_key(&block)
38
+ @objects.each_key(&block)
39
+ end
40
+
41
+ def each_value(&block)
42
+ @objects.each_value(&block)
43
+ end
44
+
45
+ def size
46
+ @objects.size
47
+ end
48
+ alias :length :size
49
+
50
+ def empty?
51
+ @objects.empty?
52
+ end
53
+
54
+ def has_key?(key)
55
+ @objects.has_key?(key)
56
+ end
57
+ alias :include? :has_key?
58
+ alias :key? :has_key?
59
+ alias :member? :has_key?
60
+
61
+ def has_value?(value)
62
+ @objects.has_value?(value)
63
+ end
64
+
65
+ def to_s
66
+ "<PDF::Reader::ObjectCache size: #{self.size}>"
67
+ end
68
+
69
+ def keys
70
+ @objects.keys
71
+ end
72
+
73
+ def values
74
+ @objects.values
75
+ end
76
+
77
+ private
78
+
79
+ def cacheable?(obj)
80
+ obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
81
+ end
82
+
83
+
84
+ end
85
+ end
@@ -5,7 +5,7 @@ class PDF::Reader
5
5
  # object.
6
6
  #
7
7
  # A PDF file can be viewed as a large hash map. It is a series of objects
8
- # stored at an exact byte offsets, and a table that maps object IDs to byte
8
+ # stored at precise byte offsets, and a table that maps object IDs to byte
9
9
  # offsets. Given an object ID, looking up an object is an O(1) operation.
10
10
  #
11
11
  # Each PDF object can be mapped to a ruby object, so by passing an object
@@ -28,6 +28,8 @@ class PDF::Reader
28
28
  class ObjectHash
29
29
  include Enumerable
30
30
 
31
+ CACHEABLE_TYPES = [:Catalog, :Page, :Pages]
32
+
31
33
  attr_accessor :default
32
34
  attr_reader :trailer, :pdf_version
33
35
 
@@ -50,6 +52,11 @@ class PDF::Reader
50
52
  @pdf_version = read_version
51
53
  @xref = PDF::Reader::XRef.new(@io)
52
54
  @trailer = @xref.trailer
55
+ @cache = PDF::Reader::ObjectCache.new
56
+
57
+ if trailer[:Encrypt]
58
+ raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
59
+ end
53
60
  end
54
61
 
55
62
  # returns the type of object a ref points to
@@ -81,25 +88,32 @@ class PDF::Reader
81
88
  unless key.kind_of?(PDF::Reader::Reference)
82
89
  key = PDF::Reader::Reference.new(key.to_i, 0)
83
90
  end
84
- if xref[key].is_a?(Fixnum)
91
+ if @cache.has_key?(key)
92
+ @cache[key]
93
+ elsif xref[key].is_a?(Fixnum)
85
94
  buf = new_buffer(xref[key])
86
- Parser.new(buf, self).object(key.id, key.gen)
95
+ @cache[key] = Parser.new(buf, self).object(key.id, key.gen)
87
96
  elsif xref[key].is_a?(PDF::Reader::Reference)
88
97
  container_key = xref[key]
89
98
  object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
90
- object_streams[container_key][key.id]
99
+ @cache[key] = object_streams[container_key][key.id]
91
100
  end
92
101
  rescue InvalidObjectError
93
102
  return default
94
103
  end
95
104
  end
96
105
 
106
+ def cacheable?(obj)
107
+ obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
108
+ end
109
+
97
110
  # If key is a PDF::Reader::Reference object, lookup the corresponding
98
111
  # object in the PDF and return it. Otherwise return key untouched.
99
112
  #
100
113
  def object(key)
101
114
  key.is_a?(PDF::Reader::Reference) ? self[key] : key
102
115
  end
116
+ alias :deref :object
103
117
 
104
118
  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
105
119
  # object.
@@ -192,7 +206,7 @@ class PDF::Reader
192
206
  alias :value? :has_key?
193
207
 
194
208
  def to_s
195
- "<PDF::Reader::ObejctHash size: #{self.size}>"
209
+ "<PDF::Reader::ObjectHash size: #{self.size}>"
196
210
  end
197
211
 
198
212
  # return an array of all keys in the file
@@ -0,0 +1,172 @@
1
+ # coding: utf-8
2
+
3
+ module PDF
4
+ class Reader
5
+
6
+ # high level representation of a single PDF page. Ties together the various
7
+ # low level classes in PDF::Reader and provides access to the various
8
+ # components of the page (text, images, fonts, etc) in convenient formats.
9
+ #
10
+ # If you require access to the raw PDF objects for this page, you can access
11
+ # the Page dictionary via the page_object accessor. You will need to use the
12
+ # objects accessor to help walk the page dictionary in any useful way.
13
+ #
14
+ class Page
15
+
16
+ # lowlevel hash-like access to all objects in the underlying PDF
17
+ attr_reader :objects
18
+
19
+ # the raw PDF object that defines this page
20
+ attr_reader :page_object
21
+
22
+ # creates a new page wrapper.
23
+ #
24
+ # * objects - an ObjectHash instance that wraps a PDF file
25
+ # * pagenum - an int specifying the page number to expose. 1 indexed.
26
+ #
27
+ def initialize(objects, pagenum)
28
+ @objects, @pagenum = objects, pagenum
29
+ @page_object = objects.deref(objects.page_references[pagenum - 1])
30
+
31
+ unless @page_object.is_a?(::Hash)
32
+ raise ArgumentError, "invalid page: #{pagenum}"
33
+ end
34
+ end
35
+
36
+ # return the number of this page within the full document
37
+ #
38
+ def number
39
+ @pagenum
40
+ end
41
+
42
+ # return a friendly string representation of this page
43
+ #
44
+ def inspect
45
+ "<PDF::Reader::Page page: #{@pagenum}>"
46
+ end
47
+
48
+ # Returns the attributes that accompany this page. Includes
49
+ # attributes inherited from parents.
50
+ #
51
+ def attributes
52
+ hash = {}
53
+ page_with_ancestors.reverse.each do |obj|
54
+ hash.merge!(@objects.deref(obj))
55
+ end
56
+ hash
57
+ end
58
+
59
+ # Returns the resources that accompany this page. Includes
60
+ # resources inherited from parents.
61
+ #
62
+ def resources
63
+ @resources ||= @objects.deref(attributes[:Resources]) || {}
64
+ end
65
+
66
+ # return a hash of fonts used on this page.
67
+ #
68
+ # The keys are the font labels used within the page content stream.
69
+ #
70
+ # The values are PDF::Reader::Font instances that provide access
71
+ # to most available metrics for each font.
72
+ #
73
+ def fonts
74
+ raw_fonts = objects.deref(resources[:Font] || {})
75
+ ::Hash[raw_fonts.map { |label, font|
76
+ [label, PDF::Reader::Font.new(objects, objects.deref(font))]
77
+ }]
78
+ end
79
+
80
+ # returns the plain text content of this page encoded as UTF-8. Any
81
+ # characters that can't be translated will be returned as a ▯
82
+ #
83
+ def text
84
+ text_receiver = PageTextReceiver.new(fonts)
85
+ walk(text_receiver)
86
+ text_receiver.content
87
+ end
88
+ alias :to_s :text
89
+
90
+ # processes the raw content stream for this page in sequential order and
91
+ # passes callbacks to the receiver objects.
92
+ #
93
+ # This is mostly low level and you can probably ignore it unless you need
94
+ # access to something like the raw encoded text. For an example of how
95
+ # this can be used as a basis for higher level functionality, see the
96
+ # text() method
97
+ #
98
+ def walk(*receivers)
99
+ callback(receivers, :page=, [self])
100
+ content_stream(receivers, raw_content)
101
+ end
102
+
103
+ # returns the raw content stream for this page. This is plumbing, nothing to
104
+ # see here unless you're a PDF nerd like me.
105
+ #
106
+ def raw_content
107
+ contents = objects.deref(@page_object[:Contents])
108
+ [contents].flatten.compact.map { |obj|
109
+ objects.deref(obj)
110
+ }.map { |obj|
111
+ obj.unfiltered_data
112
+ }.join
113
+ end
114
+
115
+ private
116
+
117
+ def root
118
+ root ||= objects.deref(@objects.trailer[:Root])
119
+ end
120
+
121
+ def xobjects
122
+ resources[:XObject] || {}
123
+ end
124
+
125
+ def content_stream(receivers, instructions)
126
+ buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
127
+ parser = Parser.new(buffer, @objects)
128
+ params = []
129
+
130
+ while (token = parser.parse_token(PagesStrategy::OPERATORS))
131
+ if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
132
+ callback(receivers, PagesStrategy::OPERATORS[token], params)
133
+ params.clear
134
+ else
135
+ params << token
136
+ end
137
+ end
138
+ rescue EOFError => e
139
+ raise MalformedPDFError, "End Of File while processing a content stream"
140
+ end
141
+
142
+ # calls the name callback method on the receiver class with params as the arguments
143
+ #
144
+ def callback (receivers, name, params=[])
145
+ receivers.each do |receiver|
146
+ receiver.send(name, *params) if receiver.respond_to?(name)
147
+ end
148
+ end
149
+
150
+ def page_with_ancestors(obj = nil)
151
+ obj = objects.deref(obj)
152
+ if obj.nil?
153
+ [@page_object] + page_with_ancestors(@page_object[:Parent])
154
+ elsif obj[:Parent]
155
+ [select_inheritable(obj)] + page_with_ancestors(obj[:Parent])
156
+ else
157
+ [select_inheritable(obj)]
158
+ end
159
+ end
160
+
161
+ # select the elements from a Pages dictionary that can be inherited by
162
+ # child Page dictionaries.
163
+ #
164
+ def select_inheritable(obj)
165
+ ::Hash[obj.select { |key, value|
166
+ [:Resources, :MediaBox, :CropBox, :Rotate, :Parent].include?(key)
167
+ }]
168
+ end
169
+
170
+ end
171
+ end
172
+ end