pdf-reader 0.10.1 → 0.11.0.alpha
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +1 -4
- data/README.rdoc +30 -21
- data/bin/pdf_text +5 -35
- data/examples/callbacks.rb +9 -4
- data/examples/extract_bates.rb +15 -29
- data/lib/pdf/reader.rb +150 -37
- data/lib/pdf/reader/abstract_strategy.rb +2 -0
- data/lib/pdf/reader/buffer.rb +12 -13
- data/lib/pdf/reader/font.rb +56 -0
- data/lib/pdf/reader/glyphlist.txt +40 -1
- data/lib/pdf/reader/metadata_strategy.rb +3 -0
- data/lib/pdf/reader/object_cache.rb +85 -0
- data/lib/pdf/reader/object_hash.rb +19 -5
- data/lib/pdf/reader/page.rb +172 -0
- data/lib/pdf/reader/page_text_receiver.rb +253 -0
- data/lib/pdf/reader/pages_strategy.rb +3 -11
- data/lib/pdf/reader/text_receiver.rb +3 -0
- data/lib/pdf/reader/xref.rb +3 -4
- metadata +41 -35
data/CHANGELOG
CHANGED
@@ -1,7 +1,4 @@
|
|
1
|
-
v0.
|
2
|
-
- simple license change to glyph data file, no code changes
|
3
|
-
|
4
|
-
v0.10.0 (6th July 2011)
|
1
|
+
v0.9.4 (XXX)
|
5
2
|
- support multiple receivers within a single pass over a source file
|
6
3
|
- massive time saving when dealing with multiple receivers
|
7
4
|
|
data/README.rdoc
CHANGED
@@ -6,7 +6,7 @@ degree of flexibility.
|
|
6
6
|
|
7
7
|
The PDF 1.7 specification is a weighty document and not all aspects are
|
8
8
|
currently supported. I welcome submission of PDF files that exhibit
|
9
|
-
unsupported aspects of the spec to assist with improving
|
9
|
+
unsupported aspects of the spec to assist with improving our support.
|
10
10
|
|
11
11
|
= Installation
|
12
12
|
|
@@ -16,22 +16,37 @@ The recommended installation method is via Rubygems.
|
|
16
16
|
|
17
17
|
= Usage
|
18
18
|
|
19
|
-
PDF::Reader
|
20
|
-
|
21
|
-
|
19
|
+
Begin by creating a PDF::Reader instance that points to a PDF file. Document
|
20
|
+
level information (metadata, page count, bookmarks, etc) is available via
|
21
|
+
this object.
|
22
22
|
|
23
|
-
|
24
|
-
images, shapes, etc) it will call methods on the receiver class. What those
|
25
|
-
methods do is entirely up to you - save the text, extract images, count pages,
|
26
|
-
read metadata, whatever.
|
23
|
+
reader = PDF::Reader.new("somefile.pdf")
|
27
24
|
|
28
|
-
|
29
|
-
|
30
|
-
|
25
|
+
puts reader.pdf_version
|
26
|
+
puts reader.info
|
27
|
+
puts reader.metadata
|
28
|
+
puts reader.page_count
|
31
29
|
|
32
|
-
|
33
|
-
|
34
|
-
|
30
|
+
PDF is a page based file format, so most visible information is available via
|
31
|
+
page-based iteration
|
32
|
+
|
33
|
+
reader = PDF::Reader.new("somefile.pdf")
|
34
|
+
|
35
|
+
reader.pages.each do |page|
|
36
|
+
puts page.fonts
|
37
|
+
puts page.images
|
38
|
+
puts page.text
|
39
|
+
end
|
40
|
+
|
41
|
+
For low level access to the objects in a PDF file, use the ObjectHash class. You can
|
42
|
+
build an ObjectHash instance directly:
|
43
|
+
|
44
|
+
puts PDF::Reader::ObjectHash.new("somefile.pdf")
|
45
|
+
|
46
|
+
or via a PDF::Reader instance:
|
47
|
+
|
48
|
+
reader = PDF::Reader.new("somefile.pdf")
|
49
|
+
puts reader.objects
|
35
50
|
|
36
51
|
= Text Encoding
|
37
52
|
|
@@ -60,7 +75,7 @@ MalformedPDFError has some subclasses if you want to detect finer grained issues
|
|
60
75
|
don't, 'rescue MalformedPDFError' will catch all the subclassed errors as well.
|
61
76
|
|
62
77
|
Any other exceptions should be considered bugs in either PDF::Reader (please
|
63
|
-
report it!)
|
78
|
+
report it!).
|
64
79
|
|
65
80
|
= Maintainers
|
66
81
|
|
@@ -86,11 +101,6 @@ Check out the examples/ directory for a few files.
|
|
86
101
|
|
87
102
|
= Known Limitations
|
88
103
|
|
89
|
-
The order of the callbacks is unpredictable, and is dependent on the internal
|
90
|
-
layout of the file, not the order objects are displayed to the user. As a
|
91
|
-
consequence of this it is highly unlikely that text will be completely in
|
92
|
-
order.
|
93
|
-
|
94
104
|
Occasionally some text cannot be extracted properly due to the way it has been
|
95
105
|
stored, or the use of invalid bytes. In these cases PDF::Reader will output a
|
96
106
|
little UTF-8 friendly box to indicate an unrecognisable character.
|
@@ -98,6 +108,5 @@ little UTF-8 friendly box to indicate an unrecognisable character.
|
|
98
108
|
= Resources
|
99
109
|
|
100
110
|
- PDF::Reader Code Repository: http://github.com/yob/pdf-reader
|
101
|
-
- PDF::Reader Rubyforge Page: http://rubyforge.org/projects/pdf-reader/
|
102
111
|
- PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
103
112
|
- PDF Tutorial Slide Presentations: http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
data/bin/pdf_text
CHANGED
@@ -5,41 +5,11 @@ $LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
|
5
5
|
|
6
6
|
require 'pdf/reader'
|
7
7
|
|
8
|
-
class PageTextReceiver
|
9
|
-
attr_accessor :content
|
10
|
-
|
11
|
-
# Called when page parsing starts
|
12
|
-
def end_page(arg = nil)
|
13
|
-
if @content
|
14
|
-
puts @content
|
15
|
-
@content = nil
|
16
|
-
puts
|
17
|
-
end
|
18
|
-
end
|
19
|
-
|
20
|
-
def show_text(*params)
|
21
|
-
@content = "" if @content.nil?
|
22
|
-
params.each do |str|
|
23
|
-
@content << str.to_s
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
# there's a few text callbacks, so make sure we process them all
|
28
|
-
alias :super_show_text :show_text
|
29
|
-
alias :move_to_next_line_and_show_text :show_text
|
30
|
-
alias :set_spacing_next_line_show_text :show_text
|
31
|
-
|
32
|
-
def show_text_with_positioning(*params)
|
33
|
-
params = params.first
|
34
|
-
params ||= []
|
35
|
-
params.each { |str| show_text(str) if str.kind_of?(String)}
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
receiver = PageTextReceiver.new
|
40
|
-
|
41
8
|
if ARGV.empty?
|
42
|
-
PDF::Reader.new
|
9
|
+
browser = PDF::Reader.new($stdin)
|
43
10
|
else
|
44
|
-
PDF::Reader.
|
11
|
+
browser = PDF::Reader.new(ARGV[0])
|
12
|
+
end
|
13
|
+
browser.pages.each do |page|
|
14
|
+
puts page.text
|
45
15
|
end
|
data/examples/callbacks.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
# coding: utf-8
|
3
3
|
|
4
|
-
# List all callbacks generated by
|
4
|
+
# List all callbacks generated by each page
|
5
5
|
#
|
6
6
|
# WARNING: this will generate a *lot* of output, so you probably want to pipe
|
7
7
|
# it through less or to a text file.
|
@@ -10,7 +10,12 @@ require 'rubygems'
|
|
10
10
|
require 'pdf/reader'
|
11
11
|
|
12
12
|
receiver = PDF::Reader::RegisterReceiver.new
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
|
14
|
+
PDF::Reader.open("somefile.pdf") do |reader|
|
15
|
+
reader.pages.each do |page|
|
16
|
+
page.walk(receiver)
|
17
|
+
receiver.callbacks.each do |cb|
|
18
|
+
puts cb
|
19
|
+
end
|
20
|
+
end
|
16
21
|
end
|
data/examples/extract_bates.rb
CHANGED
@@ -10,26 +10,20 @@
|
|
10
10
|
# the number is to look for words that match a pattern.
|
11
11
|
#
|
12
12
|
# This example attempts to extract numbers using the Acrobat 9 syntax.
|
13
|
-
# As a fall back, you can
|
14
|
-
#
|
15
|
-
# page content.
|
13
|
+
# As a fall back, you can use a regular expression to look for words
|
14
|
+
# that match the numbers you expect in the page content.
|
16
15
|
|
17
16
|
require 'rubygems'
|
18
17
|
require 'pdf/reader'
|
19
18
|
|
20
19
|
class BatesReceiver
|
21
20
|
|
22
|
-
|
23
|
-
@numbers = []
|
24
|
-
@backup = []
|
25
|
-
@regexp = regexp
|
26
|
-
end
|
21
|
+
attr_reader :numbers
|
27
22
|
|
28
|
-
def
|
29
|
-
@numbers
|
23
|
+
def initialize
|
24
|
+
@numbers = []
|
30
25
|
end
|
31
26
|
|
32
|
-
# Called when page parsing starts
|
33
27
|
def begin_marked_content(*args)
|
34
28
|
return unless args.size >= 2
|
35
29
|
return unless args.first == :Artifact
|
@@ -39,25 +33,17 @@ class BatesReceiver
|
|
39
33
|
end
|
40
34
|
alias :begin_marked_content_with_pl :begin_marked_content
|
41
35
|
|
42
|
-
|
43
|
-
def show_text(string, *params)
|
44
|
-
return if @regexp.nil?
|
45
|
-
|
46
|
-
string.scan(@regexp).each { |m| @backup << m }
|
47
|
-
end
|
36
|
+
end
|
48
37
|
|
49
|
-
# there's a few text callbacks, so make sure we process them all
|
50
|
-
alias :super_show_text :show_text
|
51
|
-
alias :move_to_next_line_and_show_text :show_text
|
52
|
-
alias :set_spacing_next_line_show_text :show_text
|
53
38
|
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
39
|
+
PDF::Reader.open("bates.pdf") do |reader|
|
40
|
+
reader.pages.each do |page|
|
41
|
+
receiver = BatesReceiver.new
|
42
|
+
page.walk(receiver)
|
43
|
+
if receiver.numbers.empty?
|
44
|
+
puts page.scan(/CC.+/)
|
45
|
+
else
|
46
|
+
puts receiver.numbers.inspect
|
47
|
+
end
|
58
48
|
end
|
59
49
|
end
|
60
|
-
|
61
|
-
receiver = BatesReceiver.new(/CC.+/)
|
62
|
-
PDF::Reader.file("bates.pdf", receiver)
|
63
|
-
puts receiver.numbers.inspect
|
data/lib/pdf/reader.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
################################################################################
|
2
2
|
#
|
3
3
|
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
4
|
+
# Copyright (C) 2011 James Healy
|
4
5
|
#
|
5
6
|
# Permission is hereby granted, free of charge, to any person obtaining
|
6
7
|
# a copy of this software and associated documentation files (the
|
@@ -30,62 +31,102 @@ require 'ascii85'
|
|
30
31
|
|
31
32
|
module PDF
|
32
33
|
################################################################################
|
33
|
-
# The Reader class serves as an entry point for parsing a PDF file.
|
34
|
-
# ways to kick off processing - which one you pick will be based on personal preference
|
35
|
-
# and the situation.
|
34
|
+
# The Reader class serves as an entry point for parsing a PDF file.
|
36
35
|
#
|
37
|
-
#
|
38
|
-
#
|
39
|
-
#
|
36
|
+
# PDF is a page based file format. There is some data associated with the
|
37
|
+
# document (metadata, bookmarks, etc) but all visible content is stored
|
38
|
+
# under a Page object.
|
40
39
|
#
|
41
|
-
#
|
40
|
+
# In most use cases for extracting and examining the contents of a PDF it
|
41
|
+
# makes sense to traverse the information using page based iteration.
|
42
42
|
#
|
43
|
-
#
|
43
|
+
# In addition to the documentation here, check out the
|
44
|
+
# PDF::Reader::Page class.
|
44
45
|
#
|
45
|
-
#
|
46
|
+
# == File Metadata
|
46
47
|
#
|
47
|
-
#
|
48
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
48
49
|
#
|
49
|
-
#
|
50
|
+
# puts reader.pdf_version
|
51
|
+
# puts reader.info
|
52
|
+
# puts reader.metadata
|
53
|
+
# puts reader.page_count
|
50
54
|
#
|
51
|
-
#
|
55
|
+
# == Iterating over page content
|
52
56
|
#
|
53
|
-
#
|
57
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
54
58
|
#
|
55
|
-
#
|
56
|
-
#
|
59
|
+
# reader.pages.each do |page|
|
60
|
+
# puts page.fonts
|
61
|
+
# puts page.images
|
62
|
+
# puts page.text
|
63
|
+
# end
|
57
64
|
#
|
58
|
-
#
|
65
|
+
# == Extracting all text
|
59
66
|
#
|
60
|
-
#
|
61
|
-
# specifies which parts of the file to process. By default, all options are
|
62
|
-
# enabled, so this can be useful to cut down processing time if you're only
|
63
|
-
# interested in say, metadata.
|
67
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
64
68
|
#
|
65
|
-
#
|
66
|
-
# pages in the file, but explicitly enables processing metadata.
|
69
|
+
# reader.pages.map(&:text)
|
67
70
|
#
|
68
|
-
#
|
71
|
+
# == Extracting content from a single page
|
69
72
|
#
|
70
|
-
#
|
73
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
71
74
|
#
|
72
|
-
#
|
73
|
-
#
|
74
|
-
#
|
75
|
+
# page = reader.page(1)
|
76
|
+
# puts page.fonts
|
77
|
+
# puts page.images
|
78
|
+
# puts page.text
|
75
79
|
#
|
76
|
-
#
|
80
|
+
# == Low level callbacks (ala current version of PDF::Reader)
|
77
81
|
#
|
78
|
-
#
|
79
|
-
# pass an array of receivers as the second argument:
|
82
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
80
83
|
#
|
81
|
-
#
|
82
|
-
#
|
83
|
-
#
|
84
|
-
# This saves a significant amount of time by limiting the work to a single pass
|
85
|
-
# over the source file.
|
84
|
+
# page = reader.page(1)
|
85
|
+
# page.walk(receiver)
|
86
86
|
#
|
87
87
|
class Reader
|
88
88
|
|
89
|
+
# lowlevel hash-like access to all objects in the underlying PDF
|
90
|
+
attr_reader :objects
|
91
|
+
|
92
|
+
attr_reader :page_count, :pdf_version, :info, :metadata
|
93
|
+
|
94
|
+
# creates a new document reader for the provided PDF.
|
95
|
+
#
|
96
|
+
# input can be an IO-ish object (StringIO, File, etc) containing a PDF
|
97
|
+
# or a filename
|
98
|
+
#
|
99
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
100
|
+
#
|
101
|
+
# File.open("somefile.pdf","rb") do |file|
|
102
|
+
# reader = PDF::Reader.new(file)
|
103
|
+
# end
|
104
|
+
#
|
105
|
+
def initialize(input = nil)
|
106
|
+
if input # support the deprecated Reader API
|
107
|
+
@objects = PDF::Reader::ObjectHash.new(input)
|
108
|
+
@page_count = get_page_count
|
109
|
+
@pdf_version = @objects.pdf_version
|
110
|
+
@info = @objects.deref(@objects.trailer[:Info])
|
111
|
+
@metadata = get_metadata
|
112
|
+
end
|
113
|
+
end
|
114
|
+
|
115
|
+
# syntactic sugar for opening a PDF file. Accepts the same arguments
|
116
|
+
# as new().
|
117
|
+
#
|
118
|
+
# PDF::Reader.open("somefile.pdf") do |reader|
|
119
|
+
# puts reader.pdf_version
|
120
|
+
# end
|
121
|
+
#
|
122
|
+
def self.open(input, &block)
|
123
|
+
yield PDF::Reader.new(input)
|
124
|
+
end
|
125
|
+
|
126
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
127
|
+
# eventually be removed
|
128
|
+
#
|
129
|
+
#
|
89
130
|
# Parse the file with the given name, sending events to the given receiver.
|
90
131
|
#
|
91
132
|
def self.file(name, receivers, opts = {})
|
@@ -94,6 +135,9 @@ module PDF
|
|
94
135
|
end
|
95
136
|
end
|
96
137
|
|
138
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
139
|
+
# eventually be removed
|
140
|
+
#
|
97
141
|
# Parse the given string, sending events to the given receiver.
|
98
142
|
#
|
99
143
|
def self.string(str, receivers, opts = {})
|
@@ -102,6 +146,9 @@ module PDF
|
|
102
146
|
end
|
103
147
|
end
|
104
148
|
|
149
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
150
|
+
# eventually be removed
|
151
|
+
#
|
105
152
|
# Parse the file with the given name, returning an unmarshalled ruby version of
|
106
153
|
# represents the requested pdf object
|
107
154
|
#
|
@@ -111,6 +158,9 @@ module PDF
|
|
111
158
|
}
|
112
159
|
end
|
113
160
|
|
161
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
162
|
+
# eventually be removed
|
163
|
+
#
|
114
164
|
# Parse the given string, returning an unmarshalled ruby version of represents
|
115
165
|
# the requested pdf object
|
116
166
|
#
|
@@ -120,6 +170,48 @@ module PDF
|
|
120
170
|
}
|
121
171
|
end
|
122
172
|
|
173
|
+
# returns an array of PDF::Reader::Page objects, one for each
|
174
|
+
# page in the source PDF.
|
175
|
+
#
|
176
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
177
|
+
#
|
178
|
+
# reader.pages.each do |page|
|
179
|
+
# puts page.fonts
|
180
|
+
# puts page.images
|
181
|
+
# puts page.text
|
182
|
+
# end
|
183
|
+
#
|
184
|
+
# See the docs for PDF::Reader::Page to read more about the
|
185
|
+
# methods available on each page
|
186
|
+
#
|
187
|
+
def pages
|
188
|
+
(1..@page_count).map { |num|
|
189
|
+
PDF::Reader::Page.new(@objects, num)
|
190
|
+
}
|
191
|
+
end
|
192
|
+
|
193
|
+
# returns a single PDF::Reader::Page for the specified page.
|
194
|
+
# Use this instead of pages method when you need to access just a single
|
195
|
+
# page
|
196
|
+
#
|
197
|
+
# reader = PDF::Reader.new("somefile.pdf")
|
198
|
+
# page = reader.page(10)
|
199
|
+
#
|
200
|
+
# puts page.text
|
201
|
+
#
|
202
|
+
# See the docs for PDF::Reader::Page to read more about the
|
203
|
+
# methods available on each page
|
204
|
+
#
|
205
|
+
def page(num)
|
206
|
+
num = num.to_i
|
207
|
+
raise ArgumentError, "valid pages are 1 .. #{@page_count}" if num < 1 || num > @page_count
|
208
|
+
PDF::Reader::Page.new(@objects, num)
|
209
|
+
end
|
210
|
+
|
211
|
+
|
212
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
213
|
+
# eventually be removed
|
214
|
+
#
|
123
215
|
# Given an IO object that contains PDF data, parse it.
|
124
216
|
#
|
125
217
|
def parse(io, receivers, opts = {})
|
@@ -139,12 +231,15 @@ module PDF
|
|
139
231
|
self
|
140
232
|
end
|
141
233
|
|
234
|
+
# DEPRECATED: this method was deprecated in version 0.11.0 and will
|
235
|
+
# eventually be removed
|
236
|
+
#
|
142
237
|
# Given an IO object that contains PDF data, return the contents of a single object
|
143
238
|
#
|
144
239
|
def object (io, id, gen)
|
145
|
-
@
|
240
|
+
@objects = ObjectHash.new(io)
|
146
241
|
|
147
|
-
@
|
242
|
+
@objects.deref(Reference.new(id, gen))
|
148
243
|
end
|
149
244
|
|
150
245
|
private
|
@@ -155,6 +250,21 @@ module PDF
|
|
155
250
|
::PDF::Reader::PagesStrategy
|
156
251
|
]
|
157
252
|
end
|
253
|
+
|
254
|
+
def root
|
255
|
+
root ||= @objects.deref(@objects.trailer[:Root])
|
256
|
+
end
|
257
|
+
|
258
|
+
def get_metadata
|
259
|
+
stream = @objects.deref(root[:Metadata])
|
260
|
+
stream ? stream.unfiltered_data : nil
|
261
|
+
end
|
262
|
+
|
263
|
+
def get_page_count
|
264
|
+
pages = @objects.deref(root[:Pages])
|
265
|
+
pages[:Count]
|
266
|
+
end
|
267
|
+
|
158
268
|
end
|
159
269
|
end
|
160
270
|
################################################################################
|
@@ -168,6 +278,7 @@ require 'pdf/reader/filter'
|
|
168
278
|
require 'pdf/reader/font'
|
169
279
|
require 'pdf/reader/lzw'
|
170
280
|
require 'pdf/reader/metadata_strategy'
|
281
|
+
require 'pdf/reader/object_cache'
|
171
282
|
require 'pdf/reader/object_hash'
|
172
283
|
require 'pdf/reader/object_stream'
|
173
284
|
require 'pdf/reader/pages_strategy'
|
@@ -177,6 +288,8 @@ require 'pdf/reader/reference'
|
|
177
288
|
require 'pdf/reader/register_receiver'
|
178
289
|
require 'pdf/reader/stream'
|
179
290
|
require 'pdf/reader/text_receiver'
|
291
|
+
require 'pdf/reader/page_text_receiver'
|
180
292
|
require 'pdf/reader/token'
|
181
293
|
require 'pdf/reader/xref'
|
294
|
+
require 'pdf/reader/page'
|
182
295
|
require 'pdf/hash'
|