pdf-reader 0.8.3 → 0.8.4
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +10 -0
- data/Rakefile +1 -1
- data/bin/pdf_text +1 -0
- data/examples/version.rb +25 -0
- data/lib/pdf-reader.rb +1 -0
- data/lib/pdf/reader/cmap.rb +80 -25
- data/lib/pdf/reader/content.rb +56 -40
- data/lib/pdf/reader/encoding.rb +26 -14
- data/lib/pdf/reader/parser.rb +13 -3
- metadata +4 -2
data/CHANGELOG
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
v0.8.4 (XXX)
|
2
|
+
- fix parsing of files that use Form XObjects
|
3
|
+
- thanks to Andrea Barisani for reporting the issue
|
4
|
+
- fix two issues that caused a small number of characters to convert to Unicode
|
5
|
+
incorrectly
|
6
|
+
- thanks to Andrea Barisani for reporting the issue
|
7
|
+
- require 'pdf-reader' now works as well as 'pdf/reader'
|
8
|
+
- good practice to have the require file match the gem name
|
9
|
+
- thanks to Chris O'Meara for highlighting this
|
10
|
+
|
1
11
|
v0.8.3 (14th February 2010)
|
2
12
|
- Fix a bug in tokenising of hex strings inside dictionaries
|
3
13
|
- Thanks to Brad Ediger for detecting the issue and proposing a solution
|
data/Rakefile
CHANGED
data/bin/pdf_text
CHANGED
data/examples/version.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# Determine the PDF version of a file
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
require 'pdf/reader'
|
8
|
+
|
9
|
+
class VersionReceiver
|
10
|
+
attr_accessor :version
|
11
|
+
|
12
|
+
def initialize
|
13
|
+
@version = nil
|
14
|
+
end
|
15
|
+
|
16
|
+
# Called when document parsing starts
|
17
|
+
def pdf_version(arg = nil)
|
18
|
+
@version = arg
|
19
|
+
end
|
20
|
+
|
21
|
+
end
|
22
|
+
|
23
|
+
receiver = VersionReceiver.new
|
24
|
+
pdf = PDF::Reader.file(ARGV.shift, receiver)
|
25
|
+
puts receiver.version
|
data/lib/pdf-reader.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "pdf/reader"
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -9,10 +9,10 @@
|
|
9
9
|
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
10
|
# permit persons to whom the Software is furnished to do so, subject to
|
11
11
|
# the following conditions:
|
12
|
-
#
|
12
|
+
#
|
13
13
|
# The above copyright notice and this permission notice shall be
|
14
14
|
# included in all copies or substantial portions of the Software.
|
15
|
-
#
|
15
|
+
#
|
16
16
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
17
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
18
|
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
@@ -30,26 +30,33 @@ class PDF::Reader
|
|
30
30
|
@map = {}
|
31
31
|
in_char_mode = false
|
32
32
|
in_range_mode = false
|
33
|
+
instructions = ""
|
33
34
|
|
34
35
|
data.each_line do |l|
|
35
36
|
if l.include?("beginbfchar")
|
36
|
-
in_char_mode = true
|
37
|
+
in_char_mode = true
|
37
38
|
elsif l.include?("endbfchar")
|
38
|
-
|
39
|
+
process_bfchar_instructions(instructions)
|
40
|
+
instructions = ""
|
41
|
+
in_char_mode = false
|
39
42
|
elsif l.include?("beginbfrange")
|
40
|
-
in_range_mode = true
|
43
|
+
in_range_mode = true
|
41
44
|
elsif l.include?("endbfrange")
|
42
|
-
|
45
|
+
process_bfrange_instructions(instructions)
|
46
|
+
instructions = ""
|
47
|
+
in_range_mode = false
|
43
48
|
end
|
44
49
|
|
45
|
-
if in_char_mode
|
46
|
-
|
47
|
-
elsif in_range_mode
|
48
|
-
process_bfrange_line(l)
|
50
|
+
if !l.include?("begin") && (in_char_mode || in_range_mode)
|
51
|
+
instructions << l
|
49
52
|
end
|
50
53
|
end
|
51
54
|
end
|
52
55
|
|
56
|
+
def size
|
57
|
+
@map.size
|
58
|
+
end
|
59
|
+
|
53
60
|
def decode(c)
|
54
61
|
# TODO: implement the conversion
|
55
62
|
return c unless c.class == Fixnum
|
@@ -58,24 +65,72 @@ class PDF::Reader
|
|
58
65
|
|
59
66
|
private
|
60
67
|
|
61
|
-
def
|
62
|
-
|
63
|
-
|
68
|
+
def build_parser(instructions)
|
69
|
+
buffer = Buffer.new(StringIO.new(instructions))
|
70
|
+
Parser.new(buffer)
|
71
|
+
end
|
72
|
+
|
73
|
+
def str_to_int(str)
|
74
|
+
return nil if str.nil? || str.size == 0 || str.size >= 3
|
75
|
+
|
76
|
+
if str.size == 1
|
77
|
+
str.unpack("C*")[0]
|
78
|
+
else
|
79
|
+
str.unpack("n*")[0]
|
80
|
+
end
|
64
81
|
end
|
65
82
|
|
66
|
-
def
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
83
|
+
def process_bfchar_instructions(instructions)
|
84
|
+
parser = build_parser(instructions)
|
85
|
+
find = str_to_int(parser.parse_token)
|
86
|
+
replace = str_to_int(parser.parse_token)
|
87
|
+
while find && replace
|
88
|
+
@map[find] = replace
|
89
|
+
find = str_to_int(parser.parse_token)
|
90
|
+
replace = str_to_int(parser.parse_token)
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
def process_bfrange_instructions(instructions)
|
95
|
+
parser = build_parser(instructions)
|
96
|
+
start = parser.parse_token
|
97
|
+
finish = parser.parse_token
|
98
|
+
to = parser.parse_token
|
99
|
+
while start && finish && to
|
100
|
+
if start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(String)
|
101
|
+
bfrange_type_one(start, finish, to)
|
102
|
+
elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
|
103
|
+
bfrange_type_two(start, finish, to)
|
104
|
+
else
|
105
|
+
raise "invalid bfrange section"
|
78
106
|
end
|
107
|
+
start = parser.parse_token
|
108
|
+
finish = parser.parse_token
|
109
|
+
to = parser.parse_token
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
def bfrange_type_one(start_code, end_code, dst)
|
114
|
+
start_code = str_to_int(start_code)
|
115
|
+
end_code = str_to_int(end_code)
|
116
|
+
dst = str_to_int(dst)
|
117
|
+
|
118
|
+
# add all values in the range to our mapping
|
119
|
+
(start_code..end_code).each_with_index do |val, idx|
|
120
|
+
@map[val] = dst + idx
|
121
|
+
# ensure a single range does not exceed 255 chars
|
122
|
+
raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if idx > 255
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
def bfrange_type_two(start_code, end_code, dst)
|
127
|
+
start_code = str_to_int(start_code)
|
128
|
+
end_code = str_to_int(end_code)
|
129
|
+
from_range = (start_code..end_code)
|
130
|
+
|
131
|
+
# add all values in the range to our mapping
|
132
|
+
from_range.each_with_index do |val, idx|
|
133
|
+
@map[val] = str_to_int(dst[idx])
|
79
134
|
end
|
80
135
|
end
|
81
136
|
end
|
data/lib/pdf/reader/content.rb
CHANGED
@@ -251,7 +251,6 @@ class PDF::Reader
|
|
251
251
|
def initialize (receiver, xref)
|
252
252
|
@receiver = receiver
|
253
253
|
@xref = xref
|
254
|
-
@fonts ||= {}
|
255
254
|
end
|
256
255
|
################################################################################
|
257
256
|
# Begin processing the document metadata
|
@@ -309,10 +308,14 @@ class PDF::Reader
|
|
309
308
|
contents = [page[:Contents]]
|
310
309
|
end
|
311
310
|
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
311
|
+
fonts = font_hash_from_resources(current_resources)
|
312
|
+
|
313
|
+
if page.has_key?(:Contents) and page[:Contents]
|
314
|
+
contents.each do |content|
|
315
|
+
obj = @xref.object(content)
|
316
|
+
content_stream(obj, fonts)
|
317
|
+
end
|
318
|
+
end
|
316
319
|
|
317
320
|
resources.pop if res
|
318
321
|
callback(:end_page)
|
@@ -330,7 +333,8 @@ class PDF::Reader
|
|
330
333
|
callback(:begin_form_xobject)
|
331
334
|
resources = @xref.object(xobject.hash[:Resources])
|
332
335
|
walk_resources(resources) if resources
|
333
|
-
|
336
|
+
fonts = font_hash_from_resources(resources)
|
337
|
+
content_stream(xobject, fonts)
|
334
338
|
callback(:end_form_xobject)
|
335
339
|
end
|
336
340
|
end
|
@@ -348,42 +352,43 @@ class PDF::Reader
|
|
348
352
|
################################################################################
|
349
353
|
# Reads a PDF content stream and calls all the appropriate callback methods for the operators
|
350
354
|
# it contains
|
351
|
-
def content_stream (instructions)
|
355
|
+
def content_stream (instructions, fonts = {})
|
352
356
|
instructions = instructions.unfiltered_data if instructions.kind_of?(PDF::Reader::Stream)
|
353
|
-
|
354
|
-
|
355
|
-
|
357
|
+
buffer = Buffer.new(StringIO.new(instructions))
|
358
|
+
parser = Parser.new(buffer, @xref)
|
359
|
+
current_font = nil
|
360
|
+
params = []
|
356
361
|
|
357
|
-
while (token =
|
362
|
+
while (token = parser.parse_token(OPERATORS))
|
358
363
|
if token.kind_of?(Token) and OPERATORS.has_key?(token)
|
359
|
-
|
364
|
+
current_font = params.first if OPERATORS[token] == :set_text_font_and_size
|
360
365
|
|
361
366
|
# handle special cases in response to certain operators
|
362
|
-
if OPERATORS[token].to_s.include?("show_text") &&
|
367
|
+
if OPERATORS[token].to_s.include?("show_text") && fonts[current_font]
|
363
368
|
# convert any text to utf-8
|
364
|
-
|
369
|
+
params = fonts[current_font].to_utf8(params)
|
365
370
|
elsif token == "ID"
|
366
371
|
# inline image data, first convert the current params into a more familiar hash
|
367
372
|
map = {}
|
368
|
-
|
373
|
+
params.each_slice(2) do |a|
|
369
374
|
map[a.first] = a.last
|
370
375
|
end
|
371
|
-
|
376
|
+
params = [map]
|
372
377
|
# read the raw image data from the buffer without tokenising
|
373
|
-
|
378
|
+
params << buffer.read_until("EI")
|
374
379
|
end
|
375
380
|
|
376
|
-
callback(OPERATORS[token],
|
381
|
+
callback(OPERATORS[token], params)
|
377
382
|
|
378
383
|
if OPERATORS[token] == :invoke_xobject
|
379
|
-
xobject_label =
|
380
|
-
|
384
|
+
xobject_label = params.first
|
385
|
+
params.clear
|
381
386
|
walk_xobject_form(xobject_label)
|
382
387
|
else
|
383
|
-
|
388
|
+
params.clear
|
384
389
|
end
|
385
390
|
else
|
386
|
-
|
391
|
+
params << token
|
387
392
|
end
|
388
393
|
end
|
389
394
|
rescue EOFError => e
|
@@ -430,24 +435,9 @@ class PDF::Reader
|
|
430
435
|
|
431
436
|
# extract any font information
|
432
437
|
if resources[:Font]
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
@fonts[label].label = label
|
437
|
-
@fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
|
438
|
-
@fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
|
439
|
-
@fonts[label].encoding = PDF::Reader::Encoding.new(@xref.object(desc[:Encoding]))
|
440
|
-
@fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
|
441
|
-
if desc[:ToUnicode]
|
442
|
-
# this stream is a cmap
|
443
|
-
begin
|
444
|
-
stream = desc[:ToUnicode]
|
445
|
-
@fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
446
|
-
rescue
|
447
|
-
# if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
|
448
|
-
end
|
449
|
-
end
|
450
|
-
callback(:resource_font, [label, @fonts[label]])
|
438
|
+
fonts = font_hash_from_resources(resources)
|
439
|
+
fonts.each do |label, font|
|
440
|
+
callback(:resource_font, [label, fonts])
|
451
441
|
end
|
452
442
|
end
|
453
443
|
end
|
@@ -473,6 +463,32 @@ class PDF::Reader
|
|
473
463
|
end
|
474
464
|
################################################################################
|
475
465
|
private
|
466
|
+
################################################################################
|
467
|
+
def font_hash_from_resources(resources)
|
468
|
+
return {} unless resources.respond_to?(:[])
|
469
|
+
|
470
|
+
fonts = {}
|
471
|
+
resources = @xref.object(resources[:Font]) || {}
|
472
|
+
resources.each do |label, desc|
|
473
|
+
desc = @xref.object(desc)
|
474
|
+
fonts[label] = PDF::Reader::Font.new
|
475
|
+
fonts[label].label = label
|
476
|
+
fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
|
477
|
+
fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
|
478
|
+
fonts[label].encoding = PDF::Reader::Encoding.new(@xref.object(desc[:Encoding]))
|
479
|
+
fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
|
480
|
+
if desc[:ToUnicode]
|
481
|
+
# this stream is a cmap
|
482
|
+
begin
|
483
|
+
stream = desc[:ToUnicode]
|
484
|
+
fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
485
|
+
rescue
|
486
|
+
# if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
|
487
|
+
end
|
488
|
+
end
|
489
|
+
end
|
490
|
+
fonts
|
491
|
+
end
|
476
492
|
# strings outside of page content should be in either PDFDocEncoding or UTF-16.
|
477
493
|
def decode_strings(obj)
|
478
494
|
case obj
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -30,9 +30,11 @@ class PDF::Reader
|
|
30
30
|
|
31
31
|
UNKNOWN_CHAR = 0x25AF # ▯
|
32
32
|
|
33
|
-
attr_reader :differences
|
33
|
+
attr_reader :differences, :unpack
|
34
34
|
|
35
35
|
def initialize(enc)
|
36
|
+
@to_unicode_required = false
|
37
|
+
|
36
38
|
if enc.kind_of?(Hash)
|
37
39
|
self.differences=enc[:Differences] if enc[:Differences]
|
38
40
|
enc = enc[:Encoding] || enc[:BaseEncoding]
|
@@ -74,6 +76,10 @@ class PDF::Reader
|
|
74
76
|
end
|
75
77
|
end
|
76
78
|
|
79
|
+
def to_unicode_required?
|
80
|
+
@to_unicode_required
|
81
|
+
end
|
82
|
+
|
77
83
|
# set the differences table for this encoding. should be an array in the following format:
|
78
84
|
#
|
79
85
|
# [25, :A, 26, :B]
|
@@ -101,25 +107,22 @@ class PDF::Reader
|
|
101
107
|
|
102
108
|
# convert the specified string to utf8
|
103
109
|
def to_utf8(str, tounicode = nil)
|
104
|
-
|
105
110
|
# unpack the single bytes
|
106
|
-
array_orig = str.unpack(
|
111
|
+
array_orig = str.unpack(unpack)
|
107
112
|
|
108
113
|
# replace any relevant bytes with a glyph name
|
109
114
|
array_orig = process_differences(array_orig)
|
110
115
|
|
111
116
|
# replace any remaining bytes with a unicode codepoint
|
112
|
-
array_enc =
|
113
|
-
array_orig.each do |num|
|
117
|
+
array_enc = array_orig.map do |num|
|
114
118
|
if tounicode && (code = tounicode.decode(num))
|
115
|
-
|
116
|
-
elsif tounicode || ( tounicode.nil? &&
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
array_enc << @mapping[num]
|
119
|
+
code
|
120
|
+
elsif tounicode || ( tounicode.nil? && to_unicode_required? )
|
121
|
+
PDF::Reader::Encoding::UNKNOWN_CHAR
|
122
|
+
elsif mapping[num]
|
123
|
+
mapping[num]
|
121
124
|
else
|
122
|
-
|
125
|
+
num
|
123
126
|
end
|
124
127
|
end
|
125
128
|
|
@@ -140,6 +143,14 @@ class PDF::Reader
|
|
140
143
|
|
141
144
|
private
|
142
145
|
|
146
|
+
def mapping
|
147
|
+
@mapping ||= {}
|
148
|
+
end
|
149
|
+
|
150
|
+
def has_mapping?
|
151
|
+
mapping.size > 0
|
152
|
+
end
|
153
|
+
|
143
154
|
# accepts an array of byte numbers, and replaces any that have entries in the differences table
|
144
155
|
# with a glyph name
|
145
156
|
def process_differences(arr)
|
@@ -154,12 +165,13 @@ class PDF::Reader
|
|
154
165
|
end
|
155
166
|
|
156
167
|
def load_mapping(file)
|
157
|
-
|
168
|
+
return if has_mapping?
|
169
|
+
|
158
170
|
RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
|
159
171
|
File.open(file, mode) do |f|
|
160
172
|
f.each do |l|
|
161
173
|
m, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
|
162
|
-
|
174
|
+
mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
|
163
175
|
end
|
164
176
|
end
|
165
177
|
end
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -33,7 +33,7 @@ class PDF::Reader
|
|
33
33
|
#
|
34
34
|
# buffer - a PDF::Reader::Buffer object that contains PDF data
|
35
35
|
# xref - a PDF::Reader::XRef object that represents the document's object offsets
|
36
|
-
def initialize (buffer, xref)
|
36
|
+
def initialize (buffer, xref=nil)
|
37
37
|
@buffer = buffer
|
38
38
|
@xref = xref
|
39
39
|
end
|
@@ -48,7 +48,7 @@ class PDF::Reader
|
|
48
48
|
case token
|
49
49
|
when PDF::Reader::Reference then return token
|
50
50
|
when nil then return nil
|
51
|
-
when "/" then return
|
51
|
+
when "/" then return pdf_name()
|
52
52
|
when "<<" then return dictionary()
|
53
53
|
when "[" then return array()
|
54
54
|
when "(" then return string()
|
@@ -107,6 +107,16 @@ class PDF::Reader
|
|
107
107
|
dict
|
108
108
|
end
|
109
109
|
################################################################################
|
110
|
+
# reads a PDF name from the buffer and converts it to a Ruby Symbol
|
111
|
+
def pdf_name
|
112
|
+
tok = @buffer.token
|
113
|
+
tok.scan(/#(\d\d)/).each do |find|
|
114
|
+
replace = find[0].hex.chr
|
115
|
+
tok.gsub!("#"+find[0], replace)
|
116
|
+
end
|
117
|
+
tok.to_sym
|
118
|
+
end
|
119
|
+
################################################################################
|
110
120
|
# reads a PDF array from the buffer and converts it to a Ruby Array.
|
111
121
|
def array
|
112
122
|
a = []
|
@@ -141,6 +151,7 @@ class PDF::Reader
|
|
141
151
|
return "" if str == ")"
|
142
152
|
Error.assert_equal(parse_token, ")")
|
143
153
|
|
154
|
+
str.gsub!(/([^\\])(\n\r|\r\n|\r)/m,'\1\n')
|
144
155
|
str.gsub!("\\n","\n")
|
145
156
|
str.gsub!("\\r","\r")
|
146
157
|
str.gsub!("\\t","\t")
|
@@ -150,7 +161,6 @@ class PDF::Reader
|
|
150
161
|
str.gsub!("\\)",")")
|
151
162
|
str.gsub!("\\\\","\\")
|
152
163
|
str.gsub!(/\\\n/m,"")
|
153
|
-
str.gsub!(/(\n\r|\r\n|\r)/m,"\n")
|
154
164
|
|
155
165
|
str.scan(/\\\d{1,3}/).each do |octal|
|
156
166
|
str.gsub!(octal, octal[1,3].oct.chr)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.3
|
4
|
+
version: 0.8.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-
|
12
|
+
date: 2010-03-30 00:00:00 +05:30
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -43,6 +43,7 @@ files:
|
|
43
43
|
- examples/hash.rb
|
44
44
|
- examples/callbacks.rb
|
45
45
|
- examples/text.rb
|
46
|
+
- examples/version.rb
|
46
47
|
- examples/page_counter_improved.rb
|
47
48
|
- lib/pdf/reader/glyphlist.txt
|
48
49
|
- lib/pdf/reader/content.rb
|
@@ -70,6 +71,7 @@ files:
|
|
70
71
|
- lib/pdf/reader/parser.rb
|
71
72
|
- lib/pdf/hash.rb
|
72
73
|
- lib/pdf/reader.rb
|
74
|
+
- lib/pdf-reader.rb
|
73
75
|
- Rakefile
|
74
76
|
- README.rdoc
|
75
77
|
- TODO
|