pdf-reader 0.7.2 → 0.7.3
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +12 -2
- data/{README → README.rdoc} +27 -47
- data/Rakefile +5 -4
- data/TODO +3 -1
- data/bin/pdf_list_callbacks +1 -5
- data/bin/pdf_object +43 -0
- data/bin/pdf_text +1 -0
- data/lib/pdf/reader.rb +25 -7
- data/lib/pdf/reader/buffer.rb +3 -1
- data/lib/pdf/reader/content.rb +56 -48
- data/lib/pdf/reader/encoding.rb +82 -1088
- data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
- data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
- data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
- data/lib/pdf/reader/encodings/standard.txt +47 -0
- data/lib/pdf/reader/encodings/symbol.txt +154 -0
- data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
- data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
- data/lib/pdf/reader/error.rb +1 -0
- data/lib/pdf/reader/font.rb +4 -3
- data/lib/pdf/reader/parser.rb +1 -0
- data/lib/pdf/reader/print_receiver.rb +19 -0
- data/lib/pdf/reader/xref.rb +12 -0
- metadata +26 -17
- data/lib/pdf/reader/parser.rb.rej +0 -29
data/CHANGELOG
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
v0.7.3 (UNRELEASED)
|
2
|
+
- Add a high level way to get direct access to a PDF object, including a new executable: pdf_object
|
3
|
+
- Fix a hard loop bug caused by a content stream that is missing a final operator
|
4
|
+
- Significantly simplified the internal code for encoding conversions
|
5
|
+
- Fixes YACC parsing bug that occurs on Fedora 8's ruby VM
|
6
|
+
- New callbacks
|
7
|
+
- page_count
|
8
|
+
- pdf_version
|
9
|
+
- Fix a bug that prevented a font's BaseFont from being recorded correctly
|
10
|
+
|
1
11
|
v0.7.2 (20th May 2008)
|
2
12
|
- Throw an UnsupportedFeatureError if we try to open an encrypted/secure PDF
|
3
13
|
- Correctly handle page content instruction sets with trailing whitespace
|
@@ -16,7 +26,7 @@ v0.7 (6th May 2008)
|
|
16
26
|
- Behave as expected if the Contents key in a Page Dict is a reference
|
17
27
|
- Include some basic metadata callbacks
|
18
28
|
- Don't interpret a comment token (%) inside a string as a comment
|
19
|
-
- Small fixes to improve 1.9
|
29
|
+
- Small fixes to improve 1.9 compatibility
|
20
30
|
- Improved our Zlib deflating to make it slightly more robust - still some more issues to work out though
|
21
31
|
- Throw an UnsupportedFeatureError if a pdf that uses XRef streams is opened
|
22
32
|
- Added an option to PDF::Reader#file and PDF::Reader#string to enable parsing of only parts of a PDF file(ie. only metadata, etc)
|
@@ -36,7 +46,7 @@ v0.6.1 (12th March 2008)
|
|
36
46
|
just replace each character with a little box.
|
37
47
|
- Use the same little box when invalid characters are found in other encodings instead of throwing an ugly
|
38
48
|
NoMethodError.
|
39
|
-
- Added a method to RegisterReceiver that returns all
|
49
|
+
- Added a method to RegisterReceiver that returns all occurrences of a callback
|
40
50
|
|
41
51
|
v0.6.0 (27th February 2008)
|
42
52
|
- all text is now transparently converted to UTF-8 before being passed to the callbacks.
|
data/{README → README.rdoc}
RENAMED
@@ -48,8 +48,11 @@ UnsupportedFeatureError - The PDF uses a feature that PDF::Reader doesn't curren
|
|
48
48
|
support. Again, we welcome submissions of PDF files that exhibit these features to help
|
49
49
|
us with future code improvements.
|
50
50
|
|
51
|
-
|
52
|
-
|
51
|
+
MalformedPDFError has some subclasses if you want to detect finer grained issues. If you
|
52
|
+
don't, 'rescue MalformedPDFError' will catch all the subclassed errors as well.
|
53
|
+
|
54
|
+
Any other exceptions should be considered bugs in either PDF::Reader (please
|
55
|
+
report it!) or your receiver (please don't report it!).
|
53
56
|
|
54
57
|
= Maintainers
|
55
58
|
|
@@ -66,7 +69,7 @@ http://groups.google.com/group/pdf-reader
|
|
66
69
|
|
67
70
|
The easiest way to explain how this works in practice is to show some examples.
|
68
71
|
|
69
|
-
== Page Counter
|
72
|
+
== Naïve Page Counter
|
70
73
|
|
71
74
|
A simple app to count the number of pages in a PDF File.
|
72
75
|
|
@@ -127,6 +130,26 @@ it through less or to a text file.
|
|
127
130
|
puts receiver.regular.inspect
|
128
131
|
puts receiver.xml.inspect
|
129
132
|
|
133
|
+
== Improved Page Counter
|
134
|
+
|
135
|
+
A simple app to display the number of pages in a PDF File.
|
136
|
+
|
137
|
+
require 'rubygems'
|
138
|
+
require 'pdf/reader'
|
139
|
+
|
140
|
+
class PageReceiver
|
141
|
+
attr_accessor :pages
|
142
|
+
|
143
|
+
# Called when page parsing ends
|
144
|
+
def page_count(arg)
|
145
|
+
@pages = arg
|
146
|
+
end
|
147
|
+
end
|
148
|
+
|
149
|
+
receiver = PageReceiver.new
|
150
|
+
pdf = PDF::Reader.file("somefile.pdf", receiver, :pages => false)
|
151
|
+
puts "#{receiver.pages} pages"
|
152
|
+
|
130
153
|
== Basic RSpec of a generated PDF
|
131
154
|
|
132
155
|
require 'rubygems'
|
@@ -182,49 +205,6 @@ it through less or to a text file.
|
|
182
205
|
end
|
183
206
|
end
|
184
207
|
|
185
|
-
== Extract ISBNs
|
186
|
-
|
187
|
-
Parse all text in the requested PDF file and print out any valid book ISBNs.
|
188
|
-
Requires the rbook-isbn gem.
|
189
|
-
|
190
|
-
require 'rubygems'
|
191
|
-
require 'pdf/reader'
|
192
|
-
require 'rbook/isbn'
|
193
|
-
|
194
|
-
class ISBNReceiver
|
195
|
-
|
196
|
-
# there's a few text callbacks, so make sure we process them all
|
197
|
-
def show_text(string, *params)
|
198
|
-
process_words(string.split(/\W+/))
|
199
|
-
end
|
200
|
-
|
201
|
-
def super_show_text(string, *params)
|
202
|
-
process_words(string.split(/\W+/))
|
203
|
-
end
|
204
|
-
|
205
|
-
def move_to_next_line_and_show_text (string)
|
206
|
-
process_words(string.split(/\W+/))
|
207
|
-
end
|
208
|
-
|
209
|
-
def set_spacing_next_line_show_text (aw, ac, string)
|
210
|
-
process_words(string.split(/\W+/))
|
211
|
-
end
|
212
|
-
|
213
|
-
private
|
214
|
-
|
215
|
-
# check if any items in the supplied array are a valid ISBN, and print any
|
216
|
-
# that are to console
|
217
|
-
def process_words(words)
|
218
|
-
words.each do |word|
|
219
|
-
word.strip!
|
220
|
-
puts "#{RBook::ISBN.convert_to_isbn13(word)}" if RBook::ISBN.valid_isbn?(word)
|
221
|
-
end
|
222
|
-
end
|
223
|
-
end
|
224
|
-
|
225
|
-
receiver = ISBNReceiver.new
|
226
|
-
PDF::Reader.file("somefile.pdf", receiver)
|
227
|
-
|
228
208
|
= Known Limitations
|
229
209
|
|
230
210
|
The order of the callbacks is unpredictable, and is dependent on the internal
|
@@ -238,7 +218,7 @@ little UTF-8 friendly box to indicate an unrecognisable character.
|
|
238
218
|
|
239
219
|
= Resources
|
240
220
|
|
241
|
-
- PDF::Reader
|
221
|
+
- PDF::Reader Code Repository: http://github.com/yob/pdf-reader
|
242
222
|
- PDF::Reader Rubyforge Page: http://rubyforge.org/projects/pdf-reader/
|
243
223
|
- PDF Specification: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
244
224
|
- PDF Tutorial Slide Presentations: http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
data/Rakefile
CHANGED
@@ -6,7 +6,7 @@ require 'rake/testtask'
|
|
6
6
|
require "rake/gempackagetask"
|
7
7
|
require 'spec/rake/spectask'
|
8
8
|
|
9
|
-
PKG_VERSION = "0.7.
|
9
|
+
PKG_VERSION = "0.7.3"
|
10
10
|
PKG_NAME = "pdf-reader"
|
11
11
|
PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
|
12
12
|
|
@@ -44,7 +44,7 @@ desc "Create documentation"
|
|
44
44
|
Rake::RDocTask.new("doc") do |rdoc|
|
45
45
|
rdoc.title = "pdf-reader"
|
46
46
|
rdoc.rdoc_dir = (ENV['CC_BUILD_ARTIFACTS'] || 'doc') + '/rdoc'
|
47
|
-
rdoc.rdoc_files.include('README')
|
47
|
+
rdoc.rdoc_files.include('README.rdoc')
|
48
48
|
rdoc.rdoc_files.include('TODO')
|
49
49
|
rdoc.rdoc_files.include('CHANGELOG')
|
50
50
|
#rdoc.rdoc_files.include('COPYING')
|
@@ -66,12 +66,13 @@ spec = Gem::Specification.new do |spec|
|
|
66
66
|
|
67
67
|
spec.require_path = "lib"
|
68
68
|
spec.bindir = "bin"
|
69
|
+
spec.executables << "pdf_object"
|
69
70
|
spec.executables << "pdf_text"
|
70
71
|
spec.executables << "pdf_list_callbacks"
|
71
72
|
spec.has_rdoc = true
|
72
|
-
spec.extra_rdoc_files = %w{README TODO CHANGELOG}
|
73
|
+
spec.extra_rdoc_files = %w{README.rdoc TODO CHANGELOG}
|
73
74
|
spec.rdoc_options << '--title' << 'PDF::Reader Documentation' <<
|
74
|
-
'--main' << 'README' << '-q'
|
75
|
+
'--main' << 'README.rdoc' << '-q'
|
75
76
|
spec.author = "Peter Jones"
|
76
77
|
spec.email = "pjones@pmade.com"
|
77
78
|
spec.rubyforge_project = "pdf-reader"
|
data/TODO
CHANGED
@@ -1,4 +1,7 @@
|
|
1
1
|
v0.8
|
2
|
+
- add extra callbacks
|
3
|
+
- list implemented features
|
4
|
+
- encrypted? tagged? bookmarks? annotated? optimised?
|
2
5
|
- Allow more than just page content and metadata to be parsed (see spec section 3.6.1)
|
3
6
|
- bookmarks?
|
4
7
|
- outline?
|
@@ -9,7 +12,6 @@ v0.8
|
|
9
12
|
poppler seems to do this in a quite reasonable way. Original Encoding -> Glyph Names -> Unicode. As of 0.6 we go straight
|
10
13
|
from the Original encoding to Unicode.
|
11
14
|
- detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
|
12
|
-
- Provide a way to get raw access to a particular object. Good for testing purposes
|
13
15
|
- Improve interpretation of non content stream data (ie metadata). recognise dates, etc
|
14
16
|
- Support Cross Reference Streams (spec 3.4.7)
|
15
17
|
|
data/bin/pdf_list_callbacks
CHANGED
@@ -4,14 +4,10 @@ $LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
|
4
4
|
|
5
5
|
require 'pdf/reader'
|
6
6
|
|
7
|
-
receiver = PDF::Reader::
|
7
|
+
receiver = PDF::Reader::PrintReceiver.new
|
8
8
|
|
9
9
|
if ARGV.empty?
|
10
10
|
PDF::Reader.new.parse($stdin, receiver)
|
11
11
|
else
|
12
12
|
PDF::Reader.file(ARGV[0], receiver)
|
13
13
|
end
|
14
|
-
|
15
|
-
receiver.callbacks.each do |callback|
|
16
|
-
puts "#{callback[:name]} - #{callback[:args].inspect}"
|
17
|
-
end
|
data/bin/pdf_object
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
4
|
+
|
5
|
+
USAGE = "USAGE: " + File.basename(__FILE__) + " <file> <object id> [generation]"
|
6
|
+
|
7
|
+
require 'pdf/reader'
|
8
|
+
|
9
|
+
filename, id, gen = *ARGV
|
10
|
+
|
11
|
+
if filename.nil? || id.nil?
|
12
|
+
puts USAGE
|
13
|
+
exit 1
|
14
|
+
elsif !File.file?(filename)
|
15
|
+
$stderr.puts "#{filename} does not exist"
|
16
|
+
exit 1
|
17
|
+
end
|
18
|
+
|
19
|
+
# tweak the users options
|
20
|
+
id = id.to_i
|
21
|
+
gen ||= 0
|
22
|
+
gen = gen.to_i
|
23
|
+
|
24
|
+
# make magic happen
|
25
|
+
begin
|
26
|
+
obj = PDF::Reader.object_file(filename, id, gen)
|
27
|
+
|
28
|
+
case obj
|
29
|
+
when Hash, Array
|
30
|
+
puts obj.inspect
|
31
|
+
else
|
32
|
+
puts obj
|
33
|
+
end
|
34
|
+
rescue PDF::Reader::InvalidObjectError
|
35
|
+
$stderr.puts "Error retreiving object #{id}, gen #{gen}. Does it exist?"
|
36
|
+
exit 1
|
37
|
+
rescue PDF::Reader::MalformedPDFError => e
|
38
|
+
$stderr.puts "Malformed PDF file: #{e.message}"
|
39
|
+
exit 1
|
40
|
+
rescue PDF::Reader::UnsupportedFeatureError => e
|
41
|
+
$stderr.puts "PDF file implements a feature unsupported by PDF::Reader: #{e.message}"
|
42
|
+
exit 1
|
43
|
+
end
|
data/bin/pdf_text
CHANGED
data/lib/pdf/reader.rb
CHANGED
@@ -70,19 +70,31 @@ module PDF
|
|
70
70
|
class Reader
|
71
71
|
################################################################################
|
72
72
|
# Parse the file with the given name, sending events to the given receiver.
|
73
|
-
def self.file
|
73
|
+
def self.file(name, receiver, opts = {})
|
74
74
|
File.open(name,"rb") do |f|
|
75
75
|
new.parse(f, receiver, opts)
|
76
76
|
end
|
77
77
|
end
|
78
78
|
################################################################################
|
79
79
|
# Parse the given string, sending events to the given receiver.
|
80
|
-
def self.string
|
80
|
+
def self.string(str, receiver, opts = {})
|
81
81
|
StringIO.open(str) do |s|
|
82
82
|
new.parse(s, receiver, opts)
|
83
83
|
end
|
84
84
|
end
|
85
85
|
################################################################################
|
86
|
+
def self.object_file(name, id, gen)
|
87
|
+
File.open(name,"rb") do |f|
|
88
|
+
new.object(f, id, gen)
|
89
|
+
end
|
90
|
+
end
|
91
|
+
################################################################################
|
92
|
+
def self.object_string(name, id, gen)
|
93
|
+
StringIO.open(str) do |s|
|
94
|
+
new.object(s, id, gen)
|
95
|
+
end
|
96
|
+
end
|
97
|
+
################################################################################
|
86
98
|
end
|
87
99
|
################################################################################
|
88
100
|
end
|
@@ -96,6 +108,7 @@ require 'pdf/reader/error'
|
|
96
108
|
require 'pdf/reader/filter'
|
97
109
|
require 'pdf/reader/font'
|
98
110
|
require 'pdf/reader/parser'
|
111
|
+
require 'pdf/reader/print_receiver'
|
99
112
|
require 'pdf/reader/reference'
|
100
113
|
require 'pdf/reader/register_receiver'
|
101
114
|
require 'pdf/reader/stream'
|
@@ -104,10 +117,6 @@ require 'pdf/reader/token'
|
|
104
117
|
require 'pdf/reader/xref'
|
105
118
|
|
106
119
|
class PDF::Reader
|
107
|
-
################################################################################
|
108
|
-
# Initialize a new PDF::Reader
|
109
|
-
def initialize
|
110
|
-
end
|
111
120
|
################################################################################
|
112
121
|
# Given an IO object that contains PDF data, parse it.
|
113
122
|
def parse (io, receiver, opts = {})
|
@@ -121,10 +130,19 @@ class PDF::Reader
|
|
121
130
|
|
122
131
|
trailer = @xref.load
|
123
132
|
raise PDF::Reader::UnsupportedFeatureError, 'PDF::Reader cannot read encrypted PDF files' if trailer[:Encrypt]
|
124
|
-
@content.metadata(@xref.object(trailer[:Info])) if options[:metadata]
|
133
|
+
@content.metadata(@xref.object(trailer[:Root]), @xref.object(trailer[:Info])) if options[:metadata]
|
125
134
|
@content.document(@xref.object(trailer[:Root])) if options[:pages]
|
126
135
|
self
|
127
136
|
end
|
128
137
|
################################################################################
|
138
|
+
# Given an IO object that contains PDF data, return the contents of a single object
|
139
|
+
def object (io, id, gen)
|
140
|
+
@buffer = Buffer.new(io)
|
141
|
+
@xref = XRef.new(@buffer)
|
142
|
+
@xref.load
|
143
|
+
|
144
|
+
@xref.object(Reference.new(id, gen))
|
145
|
+
end
|
146
|
+
################################################################################
|
129
147
|
end
|
130
148
|
################################################################################
|
data/lib/pdf/reader/buffer.rb
CHANGED
data/lib/pdf/reader/content.rb
CHANGED
@@ -9,10 +9,10 @@
|
|
9
9
|
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
10
|
# permit persons to whom the Software is furnished to do so, subject to
|
11
11
|
# the following conditions:
|
12
|
-
#
|
12
|
+
#
|
13
13
|
# The above copyright notice and this permission notice shall be
|
14
14
|
# included in all copies or substantial portions of the Software.
|
15
|
-
#
|
15
|
+
#
|
16
16
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
17
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
18
|
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
@@ -27,20 +27,20 @@ require 'stringio'
|
|
27
27
|
class PDF::Reader
|
28
28
|
################################################################################
|
29
29
|
# Walks the PDF file and calls the appropriate callback methods when something of interest is
|
30
|
-
# found.
|
30
|
+
# found.
|
31
31
|
#
|
32
32
|
# The callback methods should exist on the receiver object passed into the constructor. Whenever
|
33
|
-
# some content is found that will trigger a callback, the receiver is checked to see if the callback
|
33
|
+
# some content is found that will trigger a callback, the receiver is checked to see if the callback
|
34
34
|
# is defined.
|
35
35
|
#
|
36
36
|
# If it is defined it will be called. If not, processing will continue.
|
37
37
|
#
|
38
38
|
# = Available Callbacks
|
39
|
-
# The following callbacks are available and should be methods defined on your receiver class. Only
|
39
|
+
# The following callbacks are available and should be methods defined on your receiver class. Only
|
40
40
|
# implement the ones you need - the rest will be ignored.
|
41
41
|
#
|
42
42
|
# Some callbacks will include parameters which will be passed in as an array. For callbacks that supply no
|
43
|
-
# parameters, or where you don't need them, the *params argument can be left off. Some example callback
|
43
|
+
# parameters, or where you don't need them, the *params argument can be left off. Some example callback
|
44
44
|
# method definitions are:
|
45
45
|
#
|
46
46
|
# def begin_document
|
@@ -49,14 +49,14 @@ class PDF::Reader
|
|
49
49
|
# def fill_stroke(*params)
|
50
50
|
#
|
51
51
|
# You should be able to infer the basic command the callback is reporting based on the name. For
|
52
|
-
# further experimentation, define the callback with just a *params parameter, then print out the
|
52
|
+
# further experimentation, define the callback with just a *params parameter, then print out the
|
53
53
|
# contents of the array using something like:
|
54
54
|
#
|
55
55
|
# puts params.inspect
|
56
56
|
#
|
57
57
|
# == Text Callbacks
|
58
58
|
#
|
59
|
-
# All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
|
59
|
+
# All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
|
60
60
|
# PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be careful
|
61
61
|
# when doing a comparison on strings returned from PDF::Reader (when doing unit tests for example). The
|
62
62
|
# string may not be byte-by-byte identical with the string that was originally written to the PDF.
|
@@ -146,6 +146,7 @@ class PDF::Reader
|
|
146
146
|
# - end_page
|
147
147
|
# - metadata
|
148
148
|
# - xml_metadata
|
149
|
+
# - page_count
|
149
150
|
#
|
150
151
|
# == Resource Callbacks
|
151
152
|
#
|
@@ -155,8 +156,8 @@ class PDF::Reader
|
|
155
156
|
# on a page:
|
156
157
|
#
|
157
158
|
# In most cases, these callbacks associate a name with each resource, allowing it
|
158
|
-
# to be referred to by name in the page content. For example, an XObject can hold an image.
|
159
|
-
# If it gets mapped to the name "IM1", then it can be placed on the page using
|
159
|
+
# to be referred to by name in the page content. For example, an XObject can hold an image.
|
160
|
+
# If it gets mapped to the name "IM1", then it can be placed on the page using
|
160
161
|
# invoke_xobject "IM1".
|
161
162
|
#
|
162
163
|
# - resource_procset
|
@@ -252,25 +253,37 @@ class PDF::Reader
|
|
252
253
|
end
|
253
254
|
################################################################################
|
254
255
|
# Begin processing the document metadata
|
255
|
-
def metadata (info)
|
256
|
+
def metadata (root, info)
|
256
257
|
info = decode_strings(info)
|
258
|
+
|
259
|
+
# may be useful to some people
|
260
|
+
callback(:pdf_version, @xref.pdf_version)
|
261
|
+
|
262
|
+
# ye olde metadata
|
257
263
|
callback(:metadata, [info]) if info
|
264
|
+
|
265
|
+
# new style xml metadata
|
266
|
+
callback(:xml_metadata,@xref.object(root[:Metadata])) if root[:Metadata]
|
267
|
+
|
268
|
+
# page count
|
269
|
+
if (pages = @xref.object(root[:Pages]))
|
270
|
+
if (count = @xref.object(pages[:Count]))
|
271
|
+
callback(:page_count, count.to_i)
|
272
|
+
end
|
273
|
+
end
|
258
274
|
end
|
259
275
|
################################################################################
|
260
276
|
# Begin processing the document
|
261
277
|
def document (root)
|
262
|
-
if root[:Metadata]
|
263
|
-
callback(:xml_metadata,@xref.object(root[:Metadata]))
|
264
|
-
end
|
265
278
|
callback(:begin_document, [root])
|
266
279
|
walk_pages(@xref.object(root[:Pages]))
|
267
280
|
callback(:end_document)
|
268
281
|
end
|
269
282
|
################################################################################
|
270
|
-
# Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
|
283
|
+
# Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
|
271
284
|
# its content
|
272
285
|
def walk_pages (page)
|
273
|
-
|
286
|
+
|
274
287
|
if page[:Resources]
|
275
288
|
res = page[:Resources]
|
276
289
|
page.delete(:Resources)
|
@@ -293,7 +306,7 @@ class PDF::Reader
|
|
293
306
|
else
|
294
307
|
contents = [page[:Contents]]
|
295
308
|
end
|
296
|
-
|
309
|
+
|
297
310
|
contents.each do |content|
|
298
311
|
obj = @xref.object(content)
|
299
312
|
content_stream(obj)
|
@@ -310,32 +323,27 @@ class PDF::Reader
|
|
310
323
|
@parser = Parser.new(@buffer, @xref)
|
311
324
|
@params = [] if @params.nil?
|
312
325
|
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
if token.kind_of?(Token) and OPERATORS.has_key?(token)
|
318
|
-
@current_font = @params.first if OPERATORS[token] == :set_text_font_and_size
|
326
|
+
while (token = @parser.parse_token(OPERATORS))
|
327
|
+
if token.kind_of?(Token) and OPERATORS.has_key?(token)
|
328
|
+
@current_font = @params.first if OPERATORS[token] == :set_text_font_and_size
|
319
329
|
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
end
|
330
|
-
@params = [map]
|
331
|
-
# read the raw image data from the buffer without tokenising
|
332
|
-
@params << @buffer.read_until("EI")
|
330
|
+
# handle special cases in response to certain operators
|
331
|
+
if OPERATORS[token].to_s.include?("show_text") && @fonts[@current_font]
|
332
|
+
# convert any text to utf-8
|
333
|
+
@params = @fonts[@current_font].to_utf8(@params)
|
334
|
+
elsif token == "ID"
|
335
|
+
# inline image data, first convert the current params into a more familiar hash
|
336
|
+
map = {}
|
337
|
+
@params.each_slice(2) do |a|
|
338
|
+
map[a.first] = a.last
|
333
339
|
end
|
334
|
-
|
335
|
-
|
336
|
-
|
340
|
+
@params = [map]
|
341
|
+
# read the raw image data from the buffer without tokenising
|
342
|
+
@params << @buffer.read_until("EI")
|
337
343
|
end
|
338
|
-
|
344
|
+
callback(OPERATORS[token], @params)
|
345
|
+
@params.clear
|
346
|
+
else
|
339
347
|
@params << token
|
340
348
|
end
|
341
349
|
end
|
@@ -345,7 +353,7 @@ class PDF::Reader
|
|
345
353
|
################################################################################
|
346
354
|
def walk_resources(resources)
|
347
355
|
resources = resolve_references(resources)
|
348
|
-
|
356
|
+
|
349
357
|
# extract any procset information
|
350
358
|
if resources[:ProcSet]
|
351
359
|
callback(:resource_procset, resources[:ProcSet])
|
@@ -387,7 +395,7 @@ class PDF::Reader
|
|
387
395
|
@fonts[label].label = label
|
388
396
|
@fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
|
389
397
|
@fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
|
390
|
-
@fonts[label].encoding = PDF::Reader::Encoding.
|
398
|
+
@fonts[label].encoding = PDF::Reader::Encoding.new(@xref.object(desc[:Encoding]))
|
391
399
|
@fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
|
392
400
|
if desc[:ToUnicode]
|
393
401
|
# this stream is a cmap
|
@@ -402,13 +410,13 @@ class PDF::Reader
|
|
402
410
|
end
|
403
411
|
end
|
404
412
|
################################################################################
|
405
|
-
# Convert any PDF::Reader::Resource objects into a real object
|
413
|
+
# Convert any PDF::Reader::Resource objects into a real object
|
406
414
|
def resolve_references(obj)
|
407
415
|
case obj
|
408
|
-
when PDF::Reader::Stream then
|
416
|
+
when PDF::Reader::Stream then
|
409
417
|
obj.hash = resolve_references(obj.hash)
|
410
418
|
obj
|
411
|
-
when PDF::Reader::Reference then
|
419
|
+
when PDF::Reader::Reference then
|
412
420
|
resolve_references(@xref.object(obj))
|
413
421
|
when Hash then obj.each { |key,val| obj[key] = resolve_references(val) }
|
414
422
|
when Array then obj.collect { |item| resolve_references(item) }
|
@@ -426,11 +434,11 @@ class PDF::Reader
|
|
426
434
|
# strings outside of page content should be in either PDFDocEncoding or UTF-16.
|
427
435
|
def decode_strings(obj)
|
428
436
|
case obj
|
429
|
-
when String then
|
437
|
+
when String then
|
430
438
|
if obj[0,2] == "\376\377"
|
431
|
-
PDF::Reader::Encoding
|
439
|
+
PDF::Reader::Encoding.new(:UTF16Encoding).to_utf8(obj[2, obj.size])
|
432
440
|
else
|
433
|
-
PDF::Reader::Encoding
|
441
|
+
PDF::Reader::Encoding.new(:PDFDocEncoding).to_utf8(obj)
|
434
442
|
end
|
435
443
|
when Hash then obj.each { |key,val| obj[key] = decode_strings(val) }
|
436
444
|
when Array then obj.collect { |item| decode_strings(item) }
|