pdf-reader 0.10.1 → 0.11.0.alpha
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +1 -4
- data/README.rdoc +30 -21
- data/bin/pdf_text +5 -35
- data/examples/callbacks.rb +9 -4
- data/examples/extract_bates.rb +15 -29
- data/lib/pdf/reader.rb +150 -37
- data/lib/pdf/reader/abstract_strategy.rb +2 -0
- data/lib/pdf/reader/buffer.rb +12 -13
- data/lib/pdf/reader/font.rb +56 -0
- data/lib/pdf/reader/glyphlist.txt +40 -1
- data/lib/pdf/reader/metadata_strategy.rb +3 -0
- data/lib/pdf/reader/object_cache.rb +85 -0
- data/lib/pdf/reader/object_hash.rb +19 -5
- data/lib/pdf/reader/page.rb +172 -0
- data/lib/pdf/reader/page_text_receiver.rb +253 -0
- data/lib/pdf/reader/pages_strategy.rb +3 -11
- data/lib/pdf/reader/text_receiver.rb +3 -0
- data/lib/pdf/reader/xref.rb +3 -4
- metadata +41 -35
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -74,24 +74,23 @@ class PDF::Reader
|
|
74
74
|
#
|
75
75
|
# options:
|
76
76
|
#
|
77
|
-
# :skip_eol - if true, the IO stream is advanced past
|
78
|
-
#
|
79
|
-
# content streams, which have a CRLF or LF after the stream
|
80
|
-
# token.
|
77
|
+
# :skip_eol - if true, the IO stream is advanced past a CRLF or LF that
|
78
|
+
# is sitting under the io cursor.
|
81
79
|
#
|
82
80
|
def read(bytes, opts = {})
|
83
81
|
reset_pos
|
84
82
|
|
85
83
|
if opts[:skip_eol]
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
84
|
+
@io.seek(-1, IO::SEEK_CUR)
|
85
|
+
str = @io.read(2)
|
86
|
+
if str.nil?
|
87
|
+
return nil
|
88
|
+
elsif str == "\r\n"
|
89
|
+
# do nothing
|
90
|
+
elsif str[0,1] == "\n"
|
91
|
+
@io.seek(-1, IO::SEEK_CUR)
|
92
|
+
else
|
93
|
+
@io.seek(-2, IO::SEEK_CUR)
|
95
94
|
end
|
96
95
|
end
|
97
96
|
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -26,8 +26,21 @@
|
|
26
26
|
class PDF::Reader
|
27
27
|
class Font
|
28
28
|
attr_accessor :label, :subtype, :encoding, :descendantfonts, :tounicode
|
29
|
+
attr_reader :widths, :first_char, :ascent, :descent, :missing_width, :bbox
|
29
30
|
attr_reader :basefont
|
30
31
|
|
32
|
+
def initialize(ohash = nil, obj = nil)
|
33
|
+
if ohash.nil? || obj.nil?
|
34
|
+
$stderr.puts "DEPREACTION WARNING - PDF::Reader::Font.new should be called with 2 args"
|
35
|
+
return
|
36
|
+
end
|
37
|
+
@ohash = ohash
|
38
|
+
|
39
|
+
extract_base_info(obj)
|
40
|
+
extract_descriptor(obj)
|
41
|
+
extract_descendants(obj)
|
42
|
+
end
|
43
|
+
|
31
44
|
# returns a hash that maps glyph names to unicode codepoints. The mapping is based on
|
32
45
|
# a text file supplied by Adobe at:
|
33
46
|
# http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
|
@@ -73,5 +86,48 @@ class PDF::Reader
|
|
73
86
|
params
|
74
87
|
end
|
75
88
|
end
|
89
|
+
|
90
|
+
def glyph_width(c)
|
91
|
+
@missing_width ||= 0
|
92
|
+
if @widths.nil?
|
93
|
+
0
|
94
|
+
else
|
95
|
+
@widths.fetch(c.codepoints.first - @first_char, @missing_width)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
private
|
100
|
+
|
101
|
+
def extract_base_info(obj)
|
102
|
+
@subtype = @ohash.object(obj[:Subtype])
|
103
|
+
@basefont = @ohash.object(obj[:BaseFont])
|
104
|
+
@encoding = PDF::Reader::Encoding.new(@ohash.object(obj[:Encoding]))
|
105
|
+
@widths = @ohash.object(obj[:Widths])
|
106
|
+
@first_char = @ohash.object(obj[:FirstChar])
|
107
|
+
if obj[:ToUnicode]
|
108
|
+
stream = @ohash.object(obj[:ToUnicode])
|
109
|
+
@tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
def extract_descriptor(obj)
|
114
|
+
return unless obj[:FontDescriptor]
|
115
|
+
|
116
|
+
fd = @ohash.object(obj[:FontDescriptor])
|
117
|
+
@ascent = @ohash.object(fd[:Ascent])
|
118
|
+
@descent = @ohash.object(fd[:Descent])
|
119
|
+
@missing_width = @ohash.object(fd[:MissingWidth])
|
120
|
+
@bbox = @ohash.object(fd[:FontBBox])
|
121
|
+
end
|
122
|
+
|
123
|
+
def extract_descendants(obj)
|
124
|
+
return unless obj[:DescendantFonts]
|
125
|
+
|
126
|
+
descendants = @ohash.object(obj[:DescendantFonts])
|
127
|
+
@descendantfonts = descendants.map { |desc|
|
128
|
+
PDF::Reader::Font.new(@ohash, @ohash.object(desc))
|
129
|
+
}
|
130
|
+
end
|
131
|
+
|
76
132
|
end
|
77
133
|
end
|
@@ -1,4 +1,43 @@
|
|
1
|
-
#
|
1
|
+
# ###################################################################################
|
2
|
+
# Copyright (c) 1997,1998,2002,2007 Adobe Systems Incorporated
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining a
|
5
|
+
# copy of this documentation file to use, copy, publish, distribute,
|
6
|
+
# sublicense, and/or sell copies of the documentation, and to permit
|
7
|
+
# others to do the same, provided that:
|
8
|
+
# - No modification, editing or other alteration of this document is
|
9
|
+
# allowed; and
|
10
|
+
# - The above copyright notice and this permission notice shall be
|
11
|
+
# included in all copies of the documentation.
|
12
|
+
#
|
13
|
+
# Permission is hereby granted, free of charge, to any person obtaining a
|
14
|
+
# copy of this documentation file, to create their own derivative works
|
15
|
+
# from the content of this document to use, copy, publish, distribute,
|
16
|
+
# sublicense, and/or sell the derivative works, and to permit others to do
|
17
|
+
# the same, provided that the derived work is not represented as being a
|
18
|
+
# copy or version of this document.
|
19
|
+
#
|
20
|
+
# Adobe shall not be liable to any party for any loss of revenue or profit
|
21
|
+
# or for indirect, incidental, special, consequential, or other similar
|
22
|
+
# damages, whether based on tort (including without limitation negligence
|
23
|
+
# or strict liability), contract or other legal or equitable grounds even
|
24
|
+
# if Adobe has been advised or had reason to know of the possibility of
|
25
|
+
# such damages. The Adobe materials are provided on an "AS IS" basis.
|
26
|
+
# Adobe specifically disclaims all express, statutory, or implied
|
27
|
+
# warranties relating to the Adobe materials, including but not limited to
|
28
|
+
# those concerning merchantability or fitness for a particular purpose or
|
29
|
+
# non-infringement of any third party rights regarding the Adobe
|
30
|
+
# materials.
|
31
|
+
# ###################################################################################
|
32
|
+
# Name: Adobe Glyph List
|
33
|
+
# Table version: 2.0
|
34
|
+
# Date: September 20, 2002
|
35
|
+
#
|
36
|
+
# See http://partners.adobe.com/asn/developer/typeforum/unicodegn.html
|
37
|
+
#
|
38
|
+
# Format: Semicolon-delimited fields:
|
39
|
+
# (1) glyph name
|
40
|
+
# (2) Unicode scalar value
|
2
41
|
A;0041
|
3
42
|
AE;00C6
|
4
43
|
AEacute;01FC
|
@@ -0,0 +1,85 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
|
5
|
+
# A Hash-like object for caching commonly used objects from a PDF file.
|
6
|
+
#
|
7
|
+
# This is an internal class used by PDF::Reader::ObjectHash
|
8
|
+
#
|
9
|
+
class ObjectCache # nodoc
|
10
|
+
|
11
|
+
# These object types use little memory and are accessed a heap of times as
|
12
|
+
# part of random page access, so we'll cache the unmarshalled objects and
|
13
|
+
# avoid lots of repetitive (and expensive) tokenising
|
14
|
+
CACHEABLE_TYPES = [:Catalog, :Page, :Pages]
|
15
|
+
|
16
|
+
def initialize
|
17
|
+
@objects = {}
|
18
|
+
end
|
19
|
+
|
20
|
+
def [](key)
|
21
|
+
@objects[key]
|
22
|
+
end
|
23
|
+
|
24
|
+
def []=(key, value)
|
25
|
+
@objects[key] = value if cacheable?(value)
|
26
|
+
end
|
27
|
+
|
28
|
+
def fetch(key, local_default = nil)
|
29
|
+
@objects.fetch(key, local_default)
|
30
|
+
end
|
31
|
+
|
32
|
+
def each(&block)
|
33
|
+
@objects.each(&block)
|
34
|
+
end
|
35
|
+
alias :each_pair :each
|
36
|
+
|
37
|
+
def each_key(&block)
|
38
|
+
@objects.each_key(&block)
|
39
|
+
end
|
40
|
+
|
41
|
+
def each_value(&block)
|
42
|
+
@objects.each_value(&block)
|
43
|
+
end
|
44
|
+
|
45
|
+
def size
|
46
|
+
@objects.size
|
47
|
+
end
|
48
|
+
alias :length :size
|
49
|
+
|
50
|
+
def empty?
|
51
|
+
@objects.empty?
|
52
|
+
end
|
53
|
+
|
54
|
+
def has_key?(key)
|
55
|
+
@objects.has_key?(key)
|
56
|
+
end
|
57
|
+
alias :include? :has_key?
|
58
|
+
alias :key? :has_key?
|
59
|
+
alias :member? :has_key?
|
60
|
+
|
61
|
+
def has_value?(value)
|
62
|
+
@objects.has_value?(value)
|
63
|
+
end
|
64
|
+
|
65
|
+
def to_s
|
66
|
+
"<PDF::Reader::ObjectCache size: #{self.size}>"
|
67
|
+
end
|
68
|
+
|
69
|
+
def keys
|
70
|
+
@objects.keys
|
71
|
+
end
|
72
|
+
|
73
|
+
def values
|
74
|
+
@objects.values
|
75
|
+
end
|
76
|
+
|
77
|
+
private
|
78
|
+
|
79
|
+
def cacheable?(obj)
|
80
|
+
obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
|
81
|
+
end
|
82
|
+
|
83
|
+
|
84
|
+
end
|
85
|
+
end
|
@@ -5,7 +5,7 @@ class PDF::Reader
|
|
5
5
|
# object.
|
6
6
|
#
|
7
7
|
# A PDF file can be viewed as a large hash map. It is a series of objects
|
8
|
-
# stored at
|
8
|
+
# stored at precise byte offsets, and a table that maps object IDs to byte
|
9
9
|
# offsets. Given an object ID, looking up an object is an O(1) operation.
|
10
10
|
#
|
11
11
|
# Each PDF object can be mapped to a ruby object, so by passing an object
|
@@ -28,6 +28,8 @@ class PDF::Reader
|
|
28
28
|
class ObjectHash
|
29
29
|
include Enumerable
|
30
30
|
|
31
|
+
CACHEABLE_TYPES = [:Catalog, :Page, :Pages]
|
32
|
+
|
31
33
|
attr_accessor :default
|
32
34
|
attr_reader :trailer, :pdf_version
|
33
35
|
|
@@ -50,6 +52,11 @@ class PDF::Reader
|
|
50
52
|
@pdf_version = read_version
|
51
53
|
@xref = PDF::Reader::XRef.new(@io)
|
52
54
|
@trailer = @xref.trailer
|
55
|
+
@cache = PDF::Reader::ObjectCache.new
|
56
|
+
|
57
|
+
if trailer[:Encrypt]
|
58
|
+
raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
|
59
|
+
end
|
53
60
|
end
|
54
61
|
|
55
62
|
# returns the type of object a ref points to
|
@@ -81,25 +88,32 @@ class PDF::Reader
|
|
81
88
|
unless key.kind_of?(PDF::Reader::Reference)
|
82
89
|
key = PDF::Reader::Reference.new(key.to_i, 0)
|
83
90
|
end
|
84
|
-
if
|
91
|
+
if @cache.has_key?(key)
|
92
|
+
@cache[key]
|
93
|
+
elsif xref[key].is_a?(Fixnum)
|
85
94
|
buf = new_buffer(xref[key])
|
86
|
-
Parser.new(buf, self).object(key.id, key.gen)
|
95
|
+
@cache[key] = Parser.new(buf, self).object(key.id, key.gen)
|
87
96
|
elsif xref[key].is_a?(PDF::Reader::Reference)
|
88
97
|
container_key = xref[key]
|
89
98
|
object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
|
90
|
-
object_streams[container_key][key.id]
|
99
|
+
@cache[key] = object_streams[container_key][key.id]
|
91
100
|
end
|
92
101
|
rescue InvalidObjectError
|
93
102
|
return default
|
94
103
|
end
|
95
104
|
end
|
96
105
|
|
106
|
+
def cacheable?(obj)
|
107
|
+
obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
|
108
|
+
end
|
109
|
+
|
97
110
|
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
98
111
|
# object in the PDF and return it. Otherwise return key untouched.
|
99
112
|
#
|
100
113
|
def object(key)
|
101
114
|
key.is_a?(PDF::Reader::Reference) ? self[key] : key
|
102
115
|
end
|
116
|
+
alias :deref :object
|
103
117
|
|
104
118
|
# Access an object from the PDF. key can be an int or a PDF::Reader::Reference
|
105
119
|
# object.
|
@@ -192,7 +206,7 @@ class PDF::Reader
|
|
192
206
|
alias :value? :has_key?
|
193
207
|
|
194
208
|
def to_s
|
195
|
-
"<PDF::Reader::
|
209
|
+
"<PDF::Reader::ObjectHash size: #{self.size}>"
|
196
210
|
end
|
197
211
|
|
198
212
|
# return an array of all keys in the file
|
@@ -0,0 +1,172 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module PDF
|
4
|
+
class Reader
|
5
|
+
|
6
|
+
# high level representation of a single PDF page. Ties together the various
|
7
|
+
# low level classes in PDF::Reader and provides access to the various
|
8
|
+
# components of the page (text, images, fonts, etc) in convenient formats.
|
9
|
+
#
|
10
|
+
# If you require access to the raw PDF objects for this page, you can access
|
11
|
+
# the Page dictionary via the page_object accessor. You will need to use the
|
12
|
+
# objects accessor to help walk the page dictionary in any useful way.
|
13
|
+
#
|
14
|
+
class Page
|
15
|
+
|
16
|
+
# low-level hash-like access to all objects in the underlying PDF
|
17
|
+
attr_reader :objects
|
18
|
+
|
19
|
+
# the raw PDF object that defines this page
|
20
|
+
attr_reader :page_object
|
21
|
+
|
22
|
+
# creates a new page wrapper.
|
23
|
+
#
|
24
|
+
# * objects - an ObjectHash instance that wraps a PDF file
|
25
|
+
# * pagenum - an int specifying the page number to expose. 1 indexed.
|
26
|
+
#
|
27
|
+
def initialize(objects, pagenum)
|
28
|
+
@objects, @pagenum = objects, pagenum
|
29
|
+
@page_object = objects.deref(objects.page_references[pagenum - 1])
|
30
|
+
|
31
|
+
unless @page_object.is_a?(::Hash)
|
32
|
+
raise ArgumentError, "invalid page: #{pagenum}"
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
# return the number of this page within the full document
|
37
|
+
#
|
38
|
+
def number
|
39
|
+
@pagenum
|
40
|
+
end
|
41
|
+
|
42
|
+
# return a friendly string representation of this page
|
43
|
+
#
|
44
|
+
def inspect
|
45
|
+
"<PDF::Reader::Page page: #{@pagenum}>"
|
46
|
+
end
|
47
|
+
|
48
|
+
# Returns the attributes that accompany this page. Includes
|
49
|
+
# attributes inherited from parents.
|
50
|
+
#
|
51
|
+
def attributes
|
52
|
+
hash = {}
|
53
|
+
page_with_ancestors.reverse.each do |obj|
|
54
|
+
hash.merge!(@objects.deref(obj))
|
55
|
+
end
|
56
|
+
hash
|
57
|
+
end
|
58
|
+
|
59
|
+
# Returns the resources that accompany this page. Includes
|
60
|
+
# resources inherited from parents.
|
61
|
+
#
|
62
|
+
def resources
|
63
|
+
@resources ||= @objects.deref(attributes[:Resources]) || {}
|
64
|
+
end
|
65
|
+
|
66
|
+
# return a hash of fonts used on this page.
|
67
|
+
#
|
68
|
+
# The keys are the font labels used within the page content stream.
|
69
|
+
#
|
70
|
+
# The values are PDF::Reader::Font instances that provide access
|
71
|
+
# to most available metrics for each font.
|
72
|
+
#
|
73
|
+
def fonts
|
74
|
+
raw_fonts = objects.deref(resources[:Font] || {})
|
75
|
+
::Hash[raw_fonts.map { |label, font|
|
76
|
+
[label, PDF::Reader::Font.new(objects, objects.deref(font))]
|
77
|
+
}]
|
78
|
+
end
|
79
|
+
|
80
|
+
# returns the plain text content of this page encoded as UTF-8. Any
|
81
|
+
# characters that can't be translated will be returned as a ▯
|
82
|
+
#
|
83
|
+
def text
|
84
|
+
text_receiver = PageTextReceiver.new(fonts)
|
85
|
+
walk(text_receiver)
|
86
|
+
text_receiver.content
|
87
|
+
end
|
88
|
+
alias :to_s :text
|
89
|
+
|
90
|
+
# processes the raw content stream for this page in sequential order and
|
91
|
+
# passes callbacks to the receiver objects.
|
92
|
+
#
|
93
|
+
# This is mostly low level and you can probably ignore it unless you need
|
94
|
+
# access to something like the raw encoded text. For an example of how
|
95
|
+
# this can be used as a basis for higher level functionality, see the
|
96
|
+
# text() method
|
97
|
+
#
|
98
|
+
def walk(*receivers)
|
99
|
+
callback(receivers, :page=, [self])
|
100
|
+
content_stream(receivers, raw_content)
|
101
|
+
end
|
102
|
+
|
103
|
+
# returns the raw content stream for this page. This is plumbing, nothing to
|
104
|
+
# see here unless you're a PDF nerd like me.
|
105
|
+
#
|
106
|
+
def raw_content
|
107
|
+
contents = objects.deref(@page_object[:Contents])
|
108
|
+
[contents].flatten.compact.map { |obj|
|
109
|
+
objects.deref(obj)
|
110
|
+
}.map { |obj|
|
111
|
+
obj.unfiltered_data
|
112
|
+
}.join
|
113
|
+
end
|
114
|
+
|
115
|
+
private
|
116
|
+
|
117
|
+
def root
|
118
|
+
root ||= objects.deref(@objects.trailer[:Root])
|
119
|
+
end
|
120
|
+
|
121
|
+
def xobjects
|
122
|
+
resources[:XObject] || {}
|
123
|
+
end
|
124
|
+
|
125
|
+
def content_stream(receivers, instructions)
|
126
|
+
buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
|
127
|
+
parser = Parser.new(buffer, @objects)
|
128
|
+
params = []
|
129
|
+
|
130
|
+
while (token = parser.parse_token(PagesStrategy::OPERATORS))
|
131
|
+
if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
|
132
|
+
callback(receivers, PagesStrategy::OPERATORS[token], params)
|
133
|
+
params.clear
|
134
|
+
else
|
135
|
+
params << token
|
136
|
+
end
|
137
|
+
end
|
138
|
+
rescue EOFError => e
|
139
|
+
raise MalformedPDFError, "End Of File while processing a content stream"
|
140
|
+
end
|
141
|
+
|
142
|
+
# calls the name callback method on the receiver class with params as the arguments
|
143
|
+
#
|
144
|
+
def callback (receivers, name, params=[])
|
145
|
+
receivers.each do |receiver|
|
146
|
+
receiver.send(name, *params) if receiver.respond_to?(name)
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
150
|
+
def page_with_ancestors(obj = nil)
|
151
|
+
obj = objects.deref(obj)
|
152
|
+
if obj.nil?
|
153
|
+
[@page_object] + page_with_ancestors(@page_object[:Parent])
|
154
|
+
elsif obj[:Parent]
|
155
|
+
[select_inheritable(obj)] + page_with_ancestors(obj[:Parent])
|
156
|
+
else
|
157
|
+
[select_inheritable(obj)]
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
# select the elements from a Pages dictionary that can be inherited by
|
162
|
+
# child Page dictionaries.
|
163
|
+
#
|
164
|
+
def select_inheritable(obj)
|
165
|
+
::Hash[obj.select { |key, value|
|
166
|
+
[:Resources, :MediaBox, :CropBox, :Rotate, :Parent].include?(key)
|
167
|
+
}]
|
168
|
+
end
|
169
|
+
|
170
|
+
end
|
171
|
+
end
|
172
|
+
end
|