pdf-reader 0.7.2 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -2
- data/{README → README.rdoc} +27 -47
- data/Rakefile +5 -4
- data/TODO +3 -1
- data/bin/pdf_list_callbacks +1 -5
- data/bin/pdf_object +43 -0
- data/bin/pdf_text +1 -0
- data/lib/pdf/reader.rb +25 -7
- data/lib/pdf/reader/buffer.rb +3 -1
- data/lib/pdf/reader/content.rb +56 -48
- data/lib/pdf/reader/encoding.rb +82 -1088
- data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
- data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
- data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
- data/lib/pdf/reader/encodings/standard.txt +47 -0
- data/lib/pdf/reader/encodings/symbol.txt +154 -0
- data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
- data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
- data/lib/pdf/reader/error.rb +1 -0
- data/lib/pdf/reader/font.rb +4 -3
- data/lib/pdf/reader/parser.rb +1 -0
- data/lib/pdf/reader/print_receiver.rb +19 -0
- data/lib/pdf/reader/xref.rb +12 -0
- metadata +26 -17
- data/lib/pdf/reader/parser.rb.rej +0 -29
data/CHANGELOG
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
v0.7.3 (UNRELEASED)
|
2
|
+
- Add a high level way to get direct access to a PDF object, including a new executable: pdf_object
|
3
|
+
- Fix a hard loop bug caused by a content stream that is missing a final operator
|
4
|
+
- Significantly simplified the internal code for encoding conversions
|
5
|
+
- Fixes YACC parsing bug that occurs on Fedora 8's ruby VM
|
6
|
+
- New callbacks
|
7
|
+
- page_count
|
8
|
+
- pdf_version
|
9
|
+
- Fix a bug that prevented a font's BaseFont from being recorded correctly
|
10
|
+
|
1
11
|
v0.7.2 (20th May 2008)
|
2
12
|
- Throw an UnsupportedFeatureError if we try to open an encrypted/secure PDF
|
3
13
|
- Correctly handle page content instruction sets with trailing whitespace
|
@@ -16,7 +26,7 @@ v0.7 (6th May 2008)
|
|
16
26
|
- Behave as expected if the Contents key in a Page Dict is a reference
|
17
27
|
- Include some basic metadata callbacks
|
18
28
|
- Don't interpret a comment token (%) inside a string as a comment
|
19
|
-
- Small fixes to improve 1.9
|
29
|
+
- Small fixes to improve 1.9 compatibility
|
20
30
|
- Improved our Zlib deflating to make it slightly more robust - still some more issues to work out though
|
21
31
|
- Throw an UnsupportedFeatureError if a pdf that uses XRef streams is opened
|
22
32
|
- Added an option to PDF::Reader#file and PDF::Reader#string to enable parsing of only parts of a PDF file(ie. only metadata, etc)
|
@@ -36,7 +46,7 @@ v0.6.1 (12th March 2008)
|
|
36
46
|
just replace each character with a little box.
|
37
47
|
- Use the same little box when invalid characters are found in other encodings instead of throwing an ugly
|
38
48
|
NoMethodError.
|
39
|
-
- Added a method to RegisterReceiver that returns all
|
49
|
+
- Added a method to RegisterReceiver that returns all occurrences of a callback
|
40
50
|
|
41
51
|
v0.6.0 (27th February 2008)
|
42
52
|
- all text is now transparently converted to UTF-8 before being passed to the callbacks.
|
data/{README → README.rdoc}
RENAMED
@@ -48,8 +48,11 @@ UnsupportedFeatureError - The PDF uses a feature that PDF::Reader doesn't curren
|
|
48
48
|
support. Again, we welcome submissions of PDF files that exhibit these features to help
|
49
49
|
us with future code improvements.
|
50
50
|
|
51
|
-
|
52
|
-
|
51
|
+
MalformedPDFError has some subclasses if you want to detect finer grained issues. If you
|
52
|
+
don't, 'rescue MalformedPDFError' will catch all the subclassed errors as well.
|
53
|
+
|
54
|
+
Any other exceptions should be considered bugs in either PDF::Reader (please
|
55
|
+
report it!) or your receiver (please don't report it!).
|
53
56
|
|
54
57
|
= Maintainers
|
55
58
|
|
@@ -66,7 +69,7 @@ http://groups.google.com/group/pdf-reader
|
|
66
69
|
|
67
70
|
The easiest way to explain how this works in practice is to show some examples.
|
68
71
|
|
69
|
-
== Page Counter
|
72
|
+
== Naïve Page Counter
|
70
73
|
|
71
74
|
A simple app to count the number of pages in a PDF File.
|
72
75
|
|
@@ -127,6 +130,26 @@ it through less or to a text file.
|
|
127
130
|
puts receiver.regular.inspect
|
128
131
|
puts receiver.xml.inspect
|
129
132
|
|
133
|
+
== Improved Page Counter
|
134
|
+
|
135
|
+
A simple app to display the number of pages in a PDF File.
|
136
|
+
|
137
|
+
require 'rubygems'
|
138
|
+
require 'pdf/reader'
|
139
|
+
|
140
|
+
class PageReceiver
|
141
|
+
attr_accessor :pages
|
142
|
+
|
143
|
+
# Called when page parsing ends
|
144
|
+
def page_count(arg)
|
145
|
+
@pages = arg
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
receiver = PageReceiver.new
|
150
|
+
pdf = PDF::Reader.file("somefile.pdf", receiver, :pages => false)
|
151
|
+
puts "#{receiver.pages} pages"
|
152
|
+
|
130
153
|
== Basic RSpec of a generated PDF
|
131
154
|
|
132
155
|
require 'rubygems'
|
@@ -182,49 +205,6 @@ it through less or to a text file.
|
|
182
205
|
end
|
183
206
|
end
|
184
207
|
|
185
|
-
== Extract ISBNs
|
186
|
-
|
187
|
-
Parse all text in the requested PDF file and print out any valid book ISBNs.
|
188
|
-
Requires the rbook-isbn gem.
|
189
|
-
|
190
|
-
require 'rubygems'
|
191
|
-
require 'pdf/reader'
|
192
|
-
require 'rbook/isbn'
|
193
|
-
|
194
|
-
class ISBNReceiver
|
195
|
-
|
196
|
-
# there's a few text callbacks, so make sure we process them all
|
197
|
-
def show_text(string, *params)
|
198
|
-
process_words(string.split(/\W+/))
|
199
|
-
end
|
200
|
-
|
201
|
-
def super_show_text(string, *params)
|
202
|
-
process_words(string.split(/\W+/))
|
203
|
-
end
|
204
|
-
|
205
|
-
def move_to_next_line_and_show_text (string)
|
206
|
-
process_words(string.split(/\W+/))
|
207
|
-
end
|
208
|
-
|
209
|
-
def set_spacing_next_line_show_text (aw, ac, string)
|
210
|
-
process_words(string.split(/\W+/))
|
211
|
-
end
|
212
|
-
|
213
|
-
private
|
214
|
-
|
215
|
-
# check if any items in the supplied array are a valid ISBN, and print any
|
216
|
-
# that are to console
|
217
|
-
def process_words(words)
|
218
|
-
words.each do |word|
|
219
|
-
word.strip!
|
220
|
-
puts "#{RBook::ISBN.convert_to_isbn13(word)}" if RBook::ISBN.valid_isbn?(word)
|
221
|
-
end
|
222
|
-
end
|
223
|
-
end
|
224
|
-
|
225
|
-
receiver = ISBNReceiver.new
|
226
|
-
PDF::Reader.file("somefile.pdf", receiver)
|
227
|
-
|
228
208
|
= Known Limitations
|
229
209
|
|
230
210
|
The order of the callbacks is unpredictable, and is dependent on the internal
|
@@ -238,7 +218,7 @@ little UTF-8 friendly box to indicate an unrecognisable character.
|
|
238
218
|
|
239
219
|
= Resources
|
240
220
|
|
241
|
-
- PDF::Reader
|
221
|
+
- PDF::Reader Code Repository: http://github.com/yob/pdf-reader
|
242
222
|
- PDF::Reader Rubyforge Page: http://rubyforge.org/projects/pdf-reader/
|
243
223
|
- PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
244
224
|
- PDF Tutorial Slide Presentations: http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
data/Rakefile
CHANGED
@@ -6,7 +6,7 @@ require 'rake/testtask'
|
|
6
6
|
require "rake/gempackagetask"
|
7
7
|
require 'spec/rake/spectask'
|
8
8
|
|
9
|
-
PKG_VERSION = "0.7.
|
9
|
+
PKG_VERSION = "0.7.3"
|
10
10
|
PKG_NAME = "pdf-reader"
|
11
11
|
PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
|
12
12
|
|
@@ -44,7 +44,7 @@ desc "Create documentation"
|
|
44
44
|
Rake::RDocTask.new("doc") do |rdoc|
|
45
45
|
rdoc.title = "pdf-reader"
|
46
46
|
rdoc.rdoc_dir = (ENV['CC_BUILD_ARTIFACTS'] || 'doc') + '/rdoc'
|
47
|
-
rdoc.rdoc_files.include('README')
|
47
|
+
rdoc.rdoc_files.include('README.rdoc')
|
48
48
|
rdoc.rdoc_files.include('TODO')
|
49
49
|
rdoc.rdoc_files.include('CHANGELOG')
|
50
50
|
#rdoc.rdoc_files.include('COPYING')
|
@@ -66,12 +66,13 @@ spec = Gem::Specification.new do |spec|
|
|
66
66
|
|
67
67
|
spec.require_path = "lib"
|
68
68
|
spec.bindir = "bin"
|
69
|
+
spec.executables << "pdf_object"
|
69
70
|
spec.executables << "pdf_text"
|
70
71
|
spec.executables << "pdf_list_callbacks"
|
71
72
|
spec.has_rdoc = true
|
72
|
-
spec.extra_rdoc_files = %w{README TODO CHANGELOG}
|
73
|
+
spec.extra_rdoc_files = %w{README.rdoc TODO CHANGELOG}
|
73
74
|
spec.rdoc_options << '--title' << 'PDF::Reader Documentation' <<
|
74
|
-
'--main' << 'README' << '-q'
|
75
|
+
'--main' << 'README.rdoc' << '-q'
|
75
76
|
spec.author = "Peter Jones"
|
76
77
|
spec.email = "pjones@pmade.com"
|
77
78
|
spec.rubyforge_project = "pdf-reader"
|
data/TODO
CHANGED
@@ -1,4 +1,7 @@
|
|
1
1
|
v0.8
|
2
|
+
- add extra callbacks
|
3
|
+
- list implemented features
|
4
|
+
- encrypted? tagged? bookmarks? annotated? optimised?
|
2
5
|
- Allow more than just page content and metadata to be parsed (see spec section 3.6.1)
|
3
6
|
- bookmarks?
|
4
7
|
- outline?
|
@@ -9,7 +12,6 @@ v0.8
|
|
9
12
|
poppler seems to do this in a quite reasonable way. Original Encoding -> Glyph Names -> Unicode. As of 0.6 we go straight
|
10
13
|
from the Original encoding to Unicode.
|
11
14
|
- detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
|
12
|
-
- Provide a way to get raw access to a particular object. Good for testing purposes
|
13
15
|
- Improve interpretation of non content stream data (ie metadata). recognise dates, etc
|
14
16
|
- Support Cross Reference Streams (spec 3.4.7)
|
15
17
|
|
data/bin/pdf_list_callbacks
CHANGED
@@ -4,14 +4,10 @@ $LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
|
4
4
|
|
5
5
|
require 'pdf/reader'
|
6
6
|
|
7
|
-
receiver = PDF::Reader::
|
7
|
+
receiver = PDF::Reader::PrintReceiver.new
|
8
8
|
|
9
9
|
if ARGV.empty?
|
10
10
|
PDF::Reader.new.parse($stdin, receiver)
|
11
11
|
else
|
12
12
|
PDF::Reader.file(ARGV[0], receiver)
|
13
13
|
end
|
14
|
-
|
15
|
-
receiver.callbacks.each do |callback|
|
16
|
-
puts "#{callback[:name]} - #{callback[:args].inspect}"
|
17
|
-
end
|
data/bin/pdf_object
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
4
|
+
|
5
|
+
USAGE = "USAGE: " + File.basename(__FILE__) + " <file> <object id> [generation]"
|
6
|
+
|
7
|
+
require 'pdf/reader'
|
8
|
+
|
9
|
+
filename, id, gen = *ARGV
|
10
|
+
|
11
|
+
if filename.nil? || id.nil?
|
12
|
+
puts USAGE
|
13
|
+
exit 1
|
14
|
+
elsif !File.file?(filename)
|
15
|
+
$stderr.puts "#{filename} does not exist"
|
16
|
+
exit 1
|
17
|
+
end
|
18
|
+
|
19
|
+
# tweak the users options
|
20
|
+
id = id.to_i
|
21
|
+
gen ||= 0
|
22
|
+
gen = gen.to_i
|
23
|
+
|
24
|
+
# make magic happen
|
25
|
+
begin
|
26
|
+
obj = PDF::Reader.object_file(filename, id, gen)
|
27
|
+
|
28
|
+
case obj
|
29
|
+
when Hash, Array
|
30
|
+
puts obj.inspect
|
31
|
+
else
|
32
|
+
puts obj
|
33
|
+
end
|
34
|
+
rescue PDF::Reader::InvalidObjectError
|
35
|
+
$stderr.puts "Error retreiving object #{id}, gen #{gen}. Does it exist?"
|
36
|
+
exit 1
|
37
|
+
rescue PDF::Reader::MalformedPDFError => e
|
38
|
+
$stderr.puts "Malformed PDF file: #{e.message}"
|
39
|
+
exit 1
|
40
|
+
rescue PDF::Reader::UnsupportedFeatureError => e
|
41
|
+
$stderr.puts "PDF file implements a feature unsupported by PDF::Reader: #{e.message}"
|
42
|
+
exit 1
|
43
|
+
end
|
data/bin/pdf_text
CHANGED
data/lib/pdf/reader.rb
CHANGED
@@ -70,19 +70,31 @@ module PDF
|
|
70
70
|
class Reader
|
71
71
|
################################################################################
|
72
72
|
# Parse the file with the given name, sending events to the given receiver.
|
73
|
-
def self.file
|
73
|
+
def self.file(name, receiver, opts = {})
|
74
74
|
File.open(name,"rb") do |f|
|
75
75
|
new.parse(f, receiver, opts)
|
76
76
|
end
|
77
77
|
end
|
78
78
|
################################################################################
|
79
79
|
# Parse the given string, sending events to the given receiver.
|
80
|
-
def self.string
|
80
|
+
def self.string(str, receiver, opts = {})
|
81
81
|
StringIO.open(str) do |s|
|
82
82
|
new.parse(s, receiver, opts)
|
83
83
|
end
|
84
84
|
end
|
85
85
|
################################################################################
|
86
|
+
def self.object_file(name, id, gen)
|
87
|
+
File.open(name,"rb") do |f|
|
88
|
+
new.object(f, id, gen)
|
89
|
+
end
|
90
|
+
end
|
91
|
+
################################################################################
|
92
|
+
def self.object_string(name, id, gen)
|
93
|
+
StringIO.open(str) do |s|
|
94
|
+
new.object(s, id, gen)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
################################################################################
|
86
98
|
end
|
87
99
|
################################################################################
|
88
100
|
end
|
@@ -96,6 +108,7 @@ require 'pdf/reader/error'
|
|
96
108
|
require 'pdf/reader/filter'
|
97
109
|
require 'pdf/reader/font'
|
98
110
|
require 'pdf/reader/parser'
|
111
|
+
require 'pdf/reader/print_receiver'
|
99
112
|
require 'pdf/reader/reference'
|
100
113
|
require 'pdf/reader/register_receiver'
|
101
114
|
require 'pdf/reader/stream'
|
@@ -104,10 +117,6 @@ require 'pdf/reader/token'
|
|
104
117
|
require 'pdf/reader/xref'
|
105
118
|
|
106
119
|
class PDF::Reader
|
107
|
-
################################################################################
|
108
|
-
# Initialize a new PDF::Reader
|
109
|
-
def initialize
|
110
|
-
end
|
111
120
|
################################################################################
|
112
121
|
# Given an IO object that contains PDF data, parse it.
|
113
122
|
def parse (io, receiver, opts = {})
|
@@ -121,10 +130,19 @@ class PDF::Reader
|
|
121
130
|
|
122
131
|
trailer = @xref.load
|
123
132
|
raise PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files' if trailer[:Encrypt]
|
124
|
-
@content.metadata(@xref.object(trailer[:Info])) if options[:metadata]
|
133
|
+
@content.metadata(@xref.object(trailer[:Root]), @xref.object(trailer[:Info])) if options[:metadata]
|
125
134
|
@content.document(@xref.object(trailer[:Root])) if options[:pages]
|
126
135
|
self
|
127
136
|
end
|
128
137
|
################################################################################
|
138
|
+
# Given an IO object that contains PDF data, return the contents of a single object
|
139
|
+
def object (io, id, gen)
|
140
|
+
@buffer = Buffer.new(io)
|
141
|
+
@xref = XRef.new(@buffer)
|
142
|
+
@xref.load
|
143
|
+
|
144
|
+
@xref.object(Reference.new(id, gen))
|
145
|
+
end
|
146
|
+
################################################################################
|
129
147
|
end
|
130
148
|
################################################################################
|
data/lib/pdf/reader/buffer.rb
CHANGED
data/lib/pdf/reader/content.rb
CHANGED
@@ -9,10 +9,10 @@
|
|
9
9
|
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
10
|
# permit persons to whom the Software is furnished to do so, subject to
|
11
11
|
# the following conditions:
|
12
|
-
#
|
12
|
+
#
|
13
13
|
# The above copyright notice and this permission notice shall be
|
14
14
|
# included in all copies or substantial portions of the Software.
|
15
|
-
#
|
15
|
+
#
|
16
16
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
17
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
18
|
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
@@ -27,20 +27,20 @@ require 'stringio'
|
|
27
27
|
class PDF::Reader
|
28
28
|
################################################################################
|
29
29
|
# Walks the PDF file and calls the appropriate callback methods when something of interest is
|
30
|
-
# found.
|
30
|
+
# found.
|
31
31
|
#
|
32
32
|
# The callback methods should exist on the receiver object passed into the constructor. Whenever
|
33
|
-
# some content is found that will trigger a callback, the receiver is checked to see if the callback
|
33
|
+
# some content is found that will trigger a callback, the receiver is checked to see if the callback
|
34
34
|
# is defined.
|
35
35
|
#
|
36
36
|
# If it is defined it will be called. If not, processing will continue.
|
37
37
|
#
|
38
38
|
# = Available Callbacks
|
39
|
-
# The following callbacks are available and should be methods defined on your receiver class. Only
|
39
|
+
# The following callbacks are available and should be methods defined on your receiver class. Only
|
40
40
|
# implement the ones you need - the rest will be ignored.
|
41
41
|
#
|
42
42
|
# Some callbacks will include parameters which will be passed in as an array. For callbacks that supply no
|
43
|
-
# paramters, or where you don't need them, the *params argument can be left off. Some example callback
|
43
|
+
# parameters, or where you don't need them, the *params argument can be left off. Some example callback
|
44
44
|
# method definitions are:
|
45
45
|
#
|
46
46
|
# def begin_document
|
@@ -49,14 +49,14 @@ class PDF::Reader
|
|
49
49
|
# def fill_stroke(*params)
|
50
50
|
#
|
51
51
|
# You should be able to infer the basic command the callback is reporting based on the name. For
|
52
|
-
# further experimentation, define the callback with just a *params parameter, then print out the
|
52
|
+
# further experimentation, define the callback with just a *params parameter, then print out the
|
53
53
|
# contents of the array using something like:
|
54
54
|
#
|
55
55
|
# puts params.inspect
|
56
56
|
#
|
57
57
|
# == Text Callbacks
|
58
58
|
#
|
59
|
-
# All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
|
59
|
+
# All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
|
60
60
|
# PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be careful
|
61
61
|
# when doing a comparison on strings returned from PDF::Reader (when doing unit tests for example). The
|
62
62
|
# string may not be byte-by-byte identical with the string that was originally written to the PDF.
|
@@ -146,6 +146,7 @@ class PDF::Reader
|
|
146
146
|
# - end_page
|
147
147
|
# - metadata
|
148
148
|
# - xml_metadata
|
149
|
+
# - page_count
|
149
150
|
#
|
150
151
|
# == Resource Callbacks
|
151
152
|
#
|
@@ -155,8 +156,8 @@ class PDF::Reader
|
|
155
156
|
# on a page:
|
156
157
|
#
|
157
158
|
# In most cases, these callbacks associate a name with each resource, allowing it
|
158
|
-
# to be referred to by name in the page content. For example, an XObject can hold an image.
|
159
|
-
# If it gets mapped to the name "IM1", then it can be placed on the page using
|
159
|
+
# to be referred to by name in the page content. For example, an XObject can hold an image.
|
160
|
+
# If it gets mapped to the name "IM1", then it can be placed on the page using
|
160
161
|
# invoke_xobject "IM1".
|
161
162
|
#
|
162
163
|
# - resource_procset
|
@@ -252,25 +253,37 @@ class PDF::Reader
|
|
252
253
|
end
|
253
254
|
################################################################################
|
254
255
|
# Begin processing the document metadata
|
255
|
-
def metadata (info)
|
256
|
+
def metadata (root, info)
|
256
257
|
info = decode_strings(info)
|
258
|
+
|
259
|
+
# may be useful to some people
|
260
|
+
callback(:pdf_version, @xref.pdf_version)
|
261
|
+
|
262
|
+
# ye olde metadata
|
257
263
|
callback(:metadata, [info]) if info
|
264
|
+
|
265
|
+
# new style xml metadata
|
266
|
+
callback(:xml_metadata,@xref.object(root[:Metadata])) if root[:Metadata]
|
267
|
+
|
268
|
+
# page count
|
269
|
+
if (pages = @xref.object(root[:Pages]))
|
270
|
+
if (count = @xref.object(pages[:Count]))
|
271
|
+
callback(:page_count, count.to_i)
|
272
|
+
end
|
273
|
+
end
|
258
274
|
end
|
259
275
|
################################################################################
|
260
276
|
# Begin processing the document
|
261
277
|
def document (root)
|
262
|
-
if root[:Metadata]
|
263
|
-
callback(:xml_metadata,@xref.object(root[:Metadata]))
|
264
|
-
end
|
265
278
|
callback(:begin_document, [root])
|
266
279
|
walk_pages(@xref.object(root[:Pages]))
|
267
280
|
callback(:end_document)
|
268
281
|
end
|
269
282
|
################################################################################
|
270
|
-
# Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
|
283
|
+
# Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
|
271
284
|
# its content
|
272
285
|
def walk_pages (page)
|
273
|
-
|
286
|
+
|
274
287
|
if page[:Resources]
|
275
288
|
res = page[:Resources]
|
276
289
|
page.delete(:Resources)
|
@@ -293,7 +306,7 @@ class PDF::Reader
|
|
293
306
|
else
|
294
307
|
contents = [page[:Contents]]
|
295
308
|
end
|
296
|
-
|
309
|
+
|
297
310
|
contents.each do |content|
|
298
311
|
obj = @xref.object(content)
|
299
312
|
content_stream(obj)
|
@@ -310,32 +323,27 @@ class PDF::Reader
|
|
310
323
|
@parser = Parser.new(@buffer, @xref)
|
311
324
|
@params = [] if @params.nil?
|
312
325
|
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
if token.kind_of?(Token) and OPERATORS.has_key?(token)
|
318
|
-
@current_font = @params.first if OPERATORS[token] == :set_text_font_and_size
|
326
|
+
while (token = @parser.parse_token(OPERATORS))
|
327
|
+
if token.kind_of?(Token) and OPERATORS.has_key?(token)
|
328
|
+
@current_font = @params.first if OPERATORS[token] == :set_text_font_and_size
|
319
329
|
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
end
|
330
|
-
@params = [map]
|
331
|
-
# read the raw image data from the buffer without tokenising
|
332
|
-
@params << @buffer.read_until("EI")
|
330
|
+
# handle special cases in response to certain operators
|
331
|
+
if OPERATORS[token].to_s.include?("show_text") && @fonts[@current_font]
|
332
|
+
# convert any text to utf-8
|
333
|
+
@params = @fonts[@current_font].to_utf8(@params)
|
334
|
+
elsif token == "ID"
|
335
|
+
# inline image data, first convert the current params into a more familiar hash
|
336
|
+
map = {}
|
337
|
+
@params.each_slice(2) do |a|
|
338
|
+
map[a.first] = a.last
|
333
339
|
end
|
334
|
-
|
335
|
-
|
336
|
-
|
340
|
+
@params = [map]
|
341
|
+
# read the raw image data from the buffer without tokenising
|
342
|
+
@params << @buffer.read_until("EI")
|
337
343
|
end
|
338
|
-
|
344
|
+
callback(OPERATORS[token], @params)
|
345
|
+
@params.clear
|
346
|
+
else
|
339
347
|
@params << token
|
340
348
|
end
|
341
349
|
end
|
@@ -345,7 +353,7 @@ class PDF::Reader
|
|
345
353
|
################################################################################
|
346
354
|
def walk_resources(resources)
|
347
355
|
resources = resolve_references(resources)
|
348
|
-
|
356
|
+
|
349
357
|
# extract any procset information
|
350
358
|
if resources[:ProcSet]
|
351
359
|
callback(:resource_procset, resources[:ProcSet])
|
@@ -387,7 +395,7 @@ class PDF::Reader
|
|
387
395
|
@fonts[label].label = label
|
388
396
|
@fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
|
389
397
|
@fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
|
390
|
-
@fonts[label].encoding = PDF::Reader::Encoding.
|
398
|
+
@fonts[label].encoding = PDF::Reader::Encoding.new(@xref.object(desc[:Encoding]))
|
391
399
|
@fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
|
392
400
|
if desc[:ToUnicode]
|
393
401
|
# this stream is a cmap
|
@@ -402,13 +410,13 @@ class PDF::Reader
|
|
402
410
|
end
|
403
411
|
end
|
404
412
|
################################################################################
|
405
|
-
# Convert any PDF::Reader::Resource objects into a real object
|
413
|
+
# Convert any PDF::Reader::Resource objects into a real object
|
406
414
|
def resolve_references(obj)
|
407
415
|
case obj
|
408
|
-
when PDF::Reader::Stream then
|
416
|
+
when PDF::Reader::Stream then
|
409
417
|
obj.hash = resolve_references(obj.hash)
|
410
418
|
obj
|
411
|
-
when PDF::Reader::Reference then
|
419
|
+
when PDF::Reader::Reference then
|
412
420
|
resolve_references(@xref.object(obj))
|
413
421
|
when Hash then obj.each { |key,val| obj[key] = resolve_references(val) }
|
414
422
|
when Array then obj.collect { |item| resolve_references(item) }
|
@@ -426,11 +434,11 @@ class PDF::Reader
|
|
426
434
|
# strings outside of page content should be in either PDFDocEncoding or UTF-16.
|
427
435
|
def decode_strings(obj)
|
428
436
|
case obj
|
429
|
-
when String then
|
437
|
+
when String then
|
430
438
|
if obj[0,2] == "\376\377"
|
431
|
-
PDF::Reader::Encoding
|
439
|
+
PDF::Reader::Encoding.new(:UTF16Encoding).to_utf8(obj[2, obj.size])
|
432
440
|
else
|
433
|
-
PDF::Reader::Encoding
|
441
|
+
PDF::Reader::Encoding.new(:PDFDocEncoding).to_utf8(obj)
|
434
442
|
end
|
435
443
|
when Hash then obj.each { |key,val| obj[key] = decode_strings(val) }
|
436
444
|
when Array then obj.collect { |item| decode_strings(item) }
|