pdf-reader 0.10.1 → 0.11.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +1 -4
- data/README.rdoc +30 -21
- data/bin/pdf_text +5 -35
- data/examples/callbacks.rb +9 -4
- data/examples/extract_bates.rb +15 -29
- data/lib/pdf/reader.rb +150 -37
- data/lib/pdf/reader/abstract_strategy.rb +2 -0
- data/lib/pdf/reader/buffer.rb +12 -13
- data/lib/pdf/reader/font.rb +56 -0
- data/lib/pdf/reader/glyphlist.txt +40 -1
- data/lib/pdf/reader/metadata_strategy.rb +3 -0
- data/lib/pdf/reader/object_cache.rb +85 -0
- data/lib/pdf/reader/object_hash.rb +19 -5
- data/lib/pdf/reader/page.rb +172 -0
- data/lib/pdf/reader/page_text_receiver.rb +253 -0
- data/lib/pdf/reader/pages_strategy.rb +3 -11
- data/lib/pdf/reader/text_receiver.rb +3 -0
- data/lib/pdf/reader/xref.rb +3 -4
- metadata +41 -35
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -74,24 +74,23 @@ class PDF::Reader
|
|
74
74
|
#
|
75
75
|
# options:
|
76
76
|
#
|
77
|
-
# :skip_eol - if true, the IO stream is advanced past
|
78
|
-
#
|
79
|
-
# content streams, which have a CRLF or LF after the stream
|
80
|
-
# token.
|
77
|
+
# :skip_eol - if true, the IO stream is advanced past a CRLF or LF that
|
78
|
+
# is sitting under the io cursor.
|
81
79
|
#
|
82
80
|
def read(bytes, opts = {})
|
83
81
|
reset_pos
|
84
82
|
|
85
83
|
if opts[:skip_eol]
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
84
|
+
@io.seek(-1, IO::SEEK_CUR)
|
85
|
+
str = @io.read(2)
|
86
|
+
if str.nil?
|
87
|
+
return nil
|
88
|
+
elsif str == "\r\n"
|
89
|
+
# do nothing
|
90
|
+
elsif str[0,1] == "\n"
|
91
|
+
@io.seek(-1, IO::SEEK_CUR)
|
92
|
+
else
|
93
|
+
@io.seek(-2, IO::SEEK_CUR)
|
95
94
|
end
|
96
95
|
end
|
97
96
|
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -26,8 +26,21 @@
|
|
26
26
|
class PDF::Reader
|
27
27
|
class Font
|
28
28
|
attr_accessor :label, :subtype, :encoding, :descendantfonts, :tounicode
|
29
|
+
attr_reader :widths, :first_char, :ascent, :descent, :missing_width, :bbox
|
29
30
|
attr_reader :basefont
|
30
31
|
|
32
|
+
def initialize(ohash = nil, obj = nil)
|
33
|
+
if ohash.nil? || obj.nil?
|
34
|
+
$stderr.puts "DEPREACTION WARNING - PDF::Reader::Font.new should be called with 2 args"
|
35
|
+
return
|
36
|
+
end
|
37
|
+
@ohash = ohash
|
38
|
+
|
39
|
+
extract_base_info(obj)
|
40
|
+
extract_descriptor(obj)
|
41
|
+
extract_descendants(obj)
|
42
|
+
end
|
43
|
+
|
31
44
|
# returns a hash that maps glyph names to unicode codepoints. The mapping is based on
|
32
45
|
# a text file supplied by Adobe at:
|
33
46
|
# http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
|
@@ -73,5 +86,48 @@ class PDF::Reader
|
|
73
86
|
params
|
74
87
|
end
|
75
88
|
end
|
89
|
+
|
90
|
+
def glyph_width(c)
|
91
|
+
@missing_width ||= 0
|
92
|
+
if @widths.nil?
|
93
|
+
0
|
94
|
+
else
|
95
|
+
@widths.fetch(c.codepoints.first - @first_char, @missing_width)
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
private
|
100
|
+
|
101
|
+
def extract_base_info(obj)
|
102
|
+
@subtype = @ohash.object(obj[:Subtype])
|
103
|
+
@basefont = @ohash.object(obj[:BaseFont])
|
104
|
+
@encoding = PDF::Reader::Encoding.new(@ohash.object(obj[:Encoding]))
|
105
|
+
@widths = @ohash.object(obj[:Widths])
|
106
|
+
@first_char = @ohash.object(obj[:FirstChar])
|
107
|
+
if obj[:ToUnicode]
|
108
|
+
stream = @ohash.object(obj[:ToUnicode])
|
109
|
+
@tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
def extract_descriptor(obj)
|
114
|
+
return unless obj[:FontDescriptor]
|
115
|
+
|
116
|
+
fd = @ohash.object(obj[:FontDescriptor])
|
117
|
+
@ascent = @ohash.object(fd[:Ascent])
|
118
|
+
@descent = @ohash.object(fd[:Descent])
|
119
|
+
@missing_width = @ohash.object(fd[:MissingWidth])
|
120
|
+
@bbox = @ohash.object(fd[:FontBBox])
|
121
|
+
end
|
122
|
+
|
123
|
+
def extract_descendants(obj)
|
124
|
+
return unless obj[:DescendantFonts]
|
125
|
+
|
126
|
+
descendants = @ohash.object(obj[:DescendantFonts])
|
127
|
+
@descendantfonts = descendants.map { |desc|
|
128
|
+
PDF::Reader::Font.new(@ohash, @ohash.object(desc))
|
129
|
+
}
|
130
|
+
end
|
131
|
+
|
76
132
|
end
|
77
133
|
end
|
@@ -1,4 +1,43 @@
|
|
1
|
-
#
|
1
|
+
# ###################################################################################
|
2
|
+
# Copyright (c) 1997,1998,2002,2007 Adobe Systems Incorporated
|
3
|
+
#
|
4
|
+
# Permission is hereby granted, free of charge, to any person obtaining a
|
5
|
+
# copy of this documentation file to use, copy, publish, distribute,
|
6
|
+
# sublicense, and/or sell copies of the documentation, and to permit
|
7
|
+
# others to do the same, provided that:
|
8
|
+
# - No modification, editing or other alteration of this document is
|
9
|
+
# allowed; and
|
10
|
+
# - The above copyright notice and this permission notice shall be
|
11
|
+
# included in all copies of the documentation.
|
12
|
+
#
|
13
|
+
# Permission is hereby granted, free of charge, to any person obtaining a
|
14
|
+
# copy of this documentation file, to create their own derivative works
|
15
|
+
# from the content of this document to use, copy, publish, distribute,
|
16
|
+
# sublicense, and/or sell the derivative works, and to permit others to do
|
17
|
+
# the same, provided that the derived work is not represented as being a
|
18
|
+
# copy or version of this document.
|
19
|
+
#
|
20
|
+
# Adobe shall not be liable to any party for any loss of revenue or profit
|
21
|
+
# or for indirect, incidental, special, consequential, or other similar
|
22
|
+
# damages, whether based on tort (including without limitation negligence
|
23
|
+
# or strict liability), contract or other legal or equitable grounds even
|
24
|
+
# if Adobe has been advised or had reason to know of the possibility of
|
25
|
+
# such damages. The Adobe materials are provided on an "AS IS" basis.
|
26
|
+
# Adobe specifically disclaims all express, statutory, or implied
|
27
|
+
# warranties relating to the Adobe materials, including but not limited to
|
28
|
+
# those concerning merchantability or fitness for a particular purpose or
|
29
|
+
# non-infringement of any third party rights regarding the Adobe
|
30
|
+
# materials.
|
31
|
+
# ###################################################################################
|
32
|
+
# Name: Adobe Glyph List
|
33
|
+
# Table version: 2.0
|
34
|
+
# Date: September 20, 2002
|
35
|
+
#
|
36
|
+
# See http://partners.adobe.com/asn/developer/typeforum/unicodegn.html
|
37
|
+
#
|
38
|
+
# Format: Semicolon-delimited fields:
|
39
|
+
# (1) glyph name
|
40
|
+
# (2) Unicode scalar value
|
2
41
|
A;0041
|
3
42
|
AE;00C6
|
4
43
|
AEacute;01FC
|
@@ -0,0 +1,85 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
|
5
|
+
# A Hash-like object for caching commonly used objects from a PDF file.
|
6
|
+
#
|
7
|
+
# This is an internal class used by PDF::Reader::ObjectHash
|
8
|
+
#
|
9
|
+
class ObjectCache # nodoc
|
10
|
+
|
11
|
+
# These object types use little memory and are accessed a heap of times as
|
12
|
+
# part of random page access, so we'll cache the unmarshalled objects and
|
13
|
+
# avoid lots of repetitive (and expensive) tokenising
|
14
|
+
CACHEABLE_TYPES = [:Catalog, :Page, :Pages]
|
15
|
+
|
16
|
+
def initialize
|
17
|
+
@objects = {}
|
18
|
+
end
|
19
|
+
|
20
|
+
def [](key)
|
21
|
+
@objects[key]
|
22
|
+
end
|
23
|
+
|
24
|
+
def []=(key, value)
|
25
|
+
@objects[key] = value if cacheable?(value)
|
26
|
+
end
|
27
|
+
|
28
|
+
def fetch(key, local_default = nil)
|
29
|
+
@objects.fetch(key, local_default)
|
30
|
+
end
|
31
|
+
|
32
|
+
def each(&block)
|
33
|
+
@objects.each(&block)
|
34
|
+
end
|
35
|
+
alias :each_pair :each
|
36
|
+
|
37
|
+
def each_key(&block)
|
38
|
+
@objects.each_key(&block)
|
39
|
+
end
|
40
|
+
|
41
|
+
def each_value(&block)
|
42
|
+
@objects.each_value(&block)
|
43
|
+
end
|
44
|
+
|
45
|
+
def size
|
46
|
+
@objects.size
|
47
|
+
end
|
48
|
+
alias :length :size
|
49
|
+
|
50
|
+
def empty?
|
51
|
+
@objects.empty?
|
52
|
+
end
|
53
|
+
|
54
|
+
def has_key?(key)
|
55
|
+
@objects.has_key?(key)
|
56
|
+
end
|
57
|
+
alias :include? :has_key?
|
58
|
+
alias :key? :has_key?
|
59
|
+
alias :member? :has_key?
|
60
|
+
|
61
|
+
def has_value?(value)
|
62
|
+
@objects.has_value?(value)
|
63
|
+
end
|
64
|
+
|
65
|
+
def to_s
|
66
|
+
"<PDF::Reader::ObjectCache size: #{self.size}>"
|
67
|
+
end
|
68
|
+
|
69
|
+
def keys
|
70
|
+
@objects.keys
|
71
|
+
end
|
72
|
+
|
73
|
+
def values
|
74
|
+
@objects.values
|
75
|
+
end
|
76
|
+
|
77
|
+
private
|
78
|
+
|
79
|
+
def cacheable?(obj)
|
80
|
+
obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
|
81
|
+
end
|
82
|
+
|
83
|
+
|
84
|
+
end
|
85
|
+
end
|
@@ -5,7 +5,7 @@ class PDF::Reader
|
|
5
5
|
# object.
|
6
6
|
#
|
7
7
|
# A PDF file can be viewed as a large hash map. It is a series of objects
|
8
|
-
# stored at
|
8
|
+
# stored at precise byte offsets, and a table that maps object IDs to byte
|
9
9
|
# offsets. Given an object ID, looking up an object is an O(1) operation.
|
10
10
|
#
|
11
11
|
# Each PDF object can be mapped to a ruby object, so by passing an object
|
@@ -28,6 +28,8 @@ class PDF::Reader
|
|
28
28
|
class ObjectHash
|
29
29
|
include Enumerable
|
30
30
|
|
31
|
+
CACHEABLE_TYPES = [:Catalog, :Page, :Pages]
|
32
|
+
|
31
33
|
attr_accessor :default
|
32
34
|
attr_reader :trailer, :pdf_version
|
33
35
|
|
@@ -50,6 +52,11 @@ class PDF::Reader
|
|
50
52
|
@pdf_version = read_version
|
51
53
|
@xref = PDF::Reader::XRef.new(@io)
|
52
54
|
@trailer = @xref.trailer
|
55
|
+
@cache = PDF::Reader::ObjectCache.new
|
56
|
+
|
57
|
+
if trailer[:Encrypt]
|
58
|
+
raise ::PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files'
|
59
|
+
end
|
53
60
|
end
|
54
61
|
|
55
62
|
# returns the type of object a ref points to
|
@@ -81,25 +88,32 @@ class PDF::Reader
|
|
81
88
|
unless key.kind_of?(PDF::Reader::Reference)
|
82
89
|
key = PDF::Reader::Reference.new(key.to_i, 0)
|
83
90
|
end
|
84
|
-
if
|
91
|
+
if @cache.has_key?(key)
|
92
|
+
@cache[key]
|
93
|
+
elsif xref[key].is_a?(Fixnum)
|
85
94
|
buf = new_buffer(xref[key])
|
86
|
-
Parser.new(buf, self).object(key.id, key.gen)
|
95
|
+
@cache[key] = Parser.new(buf, self).object(key.id, key.gen)
|
87
96
|
elsif xref[key].is_a?(PDF::Reader::Reference)
|
88
97
|
container_key = xref[key]
|
89
98
|
object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
|
90
|
-
object_streams[container_key][key.id]
|
99
|
+
@cache[key] = object_streams[container_key][key.id]
|
91
100
|
end
|
92
101
|
rescue InvalidObjectError
|
93
102
|
return default
|
94
103
|
end
|
95
104
|
end
|
96
105
|
|
106
|
+
def cacheable?(obj)
|
107
|
+
obj.is_a?(Hash) && CACHEABLE_TYPES.include?(obj[:Type])
|
108
|
+
end
|
109
|
+
|
97
110
|
# If key is a PDF::Reader::Reference object, lookup the corresponding
|
98
111
|
# object in the PDF and return it. Otherwise return key untouched.
|
99
112
|
#
|
100
113
|
def object(key)
|
101
114
|
key.is_a?(PDF::Reader::Reference) ? self[key] : key
|
102
115
|
end
|
116
|
+
alias :deref :object
|
103
117
|
|
104
118
|
# Access an object from the PDF. key can be an int or a PDF::Reader::Reference
|
105
119
|
# object.
|
@@ -192,7 +206,7 @@ class PDF::Reader
|
|
192
206
|
alias :value? :has_key?
|
193
207
|
|
194
208
|
def to_s
|
195
|
-
"<PDF::Reader::
|
209
|
+
"<PDF::Reader::ObjectHash size: #{self.size}>"
|
196
210
|
end
|
197
211
|
|
198
212
|
# return an array of all keys in the file
|
@@ -0,0 +1,172 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module PDF
|
4
|
+
class Reader
|
5
|
+
|
6
|
+
# high level representation of a single PDF page. Ties together the various
|
7
|
+
# low level classes in PDF::Reader and provides access to the various
|
8
|
+
# components of the page (text, images, fonts, etc) in convenient formats.
|
9
|
+
#
|
10
|
+
# If you require access to the raw PDF objects for this page, you can access
|
11
|
+
# the Page dictionary via the page_object accessor. You will need to use the
|
12
|
+
# objects accessor to help walk the page dictionary in any useful way.
|
13
|
+
#
|
14
|
+
class Page
|
15
|
+
|
16
|
+
# lowlevel hash-like access to all objects in the underlying PDF
|
17
|
+
attr_reader :objects
|
18
|
+
|
19
|
+
# the raw PDF object that defines this page
|
20
|
+
attr_reader :page_object
|
21
|
+
|
22
|
+
# creates a new page wrapper.
|
23
|
+
#
|
24
|
+
# * objects - an ObjectHash instance that wraps a PDF file
|
25
|
+
# * pagenum - an int specifying the page number to expose. 1 indexed.
|
26
|
+
#
|
27
|
+
def initialize(objects, pagenum)
|
28
|
+
@objects, @pagenum = objects, pagenum
|
29
|
+
@page_object = objects.deref(objects.page_references[pagenum - 1])
|
30
|
+
|
31
|
+
unless @page_object.is_a?(::Hash)
|
32
|
+
raise ArgumentError, "invalid page: #{pagenum}"
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
# return the number of this page within the full document
|
37
|
+
#
|
38
|
+
def number
|
39
|
+
@pagenum
|
40
|
+
end
|
41
|
+
|
42
|
+
# return a friendly string representation of this page
|
43
|
+
#
|
44
|
+
def inspect
|
45
|
+
"<PDF::Reader::Page page: #{@pagenum}>"
|
46
|
+
end
|
47
|
+
|
48
|
+
# Returns the attributes that accompany this page. Includes
|
49
|
+
# attributes inherited from parents.
|
50
|
+
#
|
51
|
+
def attributes
|
52
|
+
hash = {}
|
53
|
+
page_with_ancestors.reverse.each do |obj|
|
54
|
+
hash.merge!(@objects.deref(obj))
|
55
|
+
end
|
56
|
+
hash
|
57
|
+
end
|
58
|
+
|
59
|
+
# Returns the resources that accompany this page. Includes
|
60
|
+
# resources inherited from parents.
|
61
|
+
#
|
62
|
+
def resources
|
63
|
+
@resources ||= @objects.deref(attributes[:Resources]) || {}
|
64
|
+
end
|
65
|
+
|
66
|
+
# return a hash of fonts used on this page.
|
67
|
+
#
|
68
|
+
# The keys are the font labels used within the page content stream.
|
69
|
+
#
|
70
|
+
# The values are PDF::Reader::Font instances that provide access
|
71
|
+
# to most available metrics for each font.
|
72
|
+
#
|
73
|
+
def fonts
|
74
|
+
raw_fonts = objects.deref(resources[:Font] || {})
|
75
|
+
::Hash[raw_fonts.map { |label, font|
|
76
|
+
[label, PDF::Reader::Font.new(objects, objects.deref(font))]
|
77
|
+
}]
|
78
|
+
end
|
79
|
+
|
80
|
+
# returns the plain text content of this page encoded as UTF-8. Any
|
81
|
+
# characters that can't be translated will be returned as a ▯
|
82
|
+
#
|
83
|
+
def text
|
84
|
+
text_receiver = PageTextReceiver.new(fonts)
|
85
|
+
walk(text_receiver)
|
86
|
+
text_receiver.content
|
87
|
+
end
|
88
|
+
alias :to_s :text
|
89
|
+
|
90
|
+
# processes the raw content stream for this page in sequential order and
|
91
|
+
# passes callbacks to the receiver objects.
|
92
|
+
#
|
93
|
+
# This is mostly low level and you can probably ignore it unless you need
|
94
|
+
# access to something like the raw encoded text. For an example of how
|
95
|
+
# this can be used as a basis for higher level functionality, see the
|
96
|
+
# text() method
|
97
|
+
#
|
98
|
+
def walk(*receivers)
|
99
|
+
callback(receivers, :page=, [self])
|
100
|
+
content_stream(receivers, raw_content)
|
101
|
+
end
|
102
|
+
|
103
|
+
# returns the raw content stream for this page. This is plumbing, nothing to
|
104
|
+
# see here unless you're a PDF nerd like me.
|
105
|
+
#
|
106
|
+
def raw_content
|
107
|
+
contents = objects.deref(@page_object[:Contents])
|
108
|
+
[contents].flatten.compact.map { |obj|
|
109
|
+
objects.deref(obj)
|
110
|
+
}.map { |obj|
|
111
|
+
obj.unfiltered_data
|
112
|
+
}.join
|
113
|
+
end
|
114
|
+
|
115
|
+
private
|
116
|
+
|
117
|
+
def root
|
118
|
+
root ||= objects.deref(@objects.trailer[:Root])
|
119
|
+
end
|
120
|
+
|
121
|
+
def xobjects
|
122
|
+
resources[:XObject] || {}
|
123
|
+
end
|
124
|
+
|
125
|
+
def content_stream(receivers, instructions)
|
126
|
+
buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
|
127
|
+
parser = Parser.new(buffer, @objects)
|
128
|
+
params = []
|
129
|
+
|
130
|
+
while (token = parser.parse_token(PagesStrategy::OPERATORS))
|
131
|
+
if token.kind_of?(Token) and PagesStrategy::OPERATORS.has_key?(token)
|
132
|
+
callback(receivers, PagesStrategy::OPERATORS[token], params)
|
133
|
+
params.clear
|
134
|
+
else
|
135
|
+
params << token
|
136
|
+
end
|
137
|
+
end
|
138
|
+
rescue EOFError => e
|
139
|
+
raise MalformedPDFError, "End Of File while processing a content stream"
|
140
|
+
end
|
141
|
+
|
142
|
+
# calls the name callback method on the receiver class with params as the arguments
|
143
|
+
#
|
144
|
+
def callback (receivers, name, params=[])
|
145
|
+
receivers.each do |receiver|
|
146
|
+
receiver.send(name, *params) if receiver.respond_to?(name)
|
147
|
+
end
|
148
|
+
end
|
149
|
+
|
150
|
+
def page_with_ancestors(obj = nil)
|
151
|
+
obj = objects.deref(obj)
|
152
|
+
if obj.nil?
|
153
|
+
[@page_object] + page_with_ancestors(@page_object[:Parent])
|
154
|
+
elsif obj[:Parent]
|
155
|
+
[select_inheritable(obj)] + page_with_ancestors(obj[:Parent])
|
156
|
+
else
|
157
|
+
[select_inheritable(obj)]
|
158
|
+
end
|
159
|
+
end
|
160
|
+
|
161
|
+
# select the elements from a Pages dictionary that can be inherited by
|
162
|
+
# child Page dictionaries.
|
163
|
+
#
|
164
|
+
def select_inheritable(obj)
|
165
|
+
::Hash[obj.select { |key, value|
|
166
|
+
[:Resources, :MediaBox, :CropBox, :Rotate, :Parent].include?(key)
|
167
|
+
}]
|
168
|
+
end
|
169
|
+
|
170
|
+
end
|
171
|
+
end
|
172
|
+
end
|