pdf-reader 0.8.3 → 0.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +10 -0
- data/Rakefile +1 -1
- data/bin/pdf_text +1 -0
- data/examples/version.rb +25 -0
- data/lib/pdf-reader.rb +1 -0
- data/lib/pdf/reader/cmap.rb +80 -25
- data/lib/pdf/reader/content.rb +56 -40
- data/lib/pdf/reader/encoding.rb +26 -14
- data/lib/pdf/reader/parser.rb +13 -3
- metadata +4 -2
data/CHANGELOG
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
v0.8.4 (30th March 2010)
|
2
|
+
- fix parsing of files that use Form XObjects
|
3
|
+
- thanks to Andrea Barisani for reporting the issue
|
4
|
+
- fix two issues that caused a small number of characters to convert to Unicode
|
5
|
+
incorrectly
|
6
|
+
- thanks to Andrea Barisani for reporting the issue
|
7
|
+
- require 'pdf-reader' now works as well as 'pdf/reader'
|
8
|
+
- good practice to have the require file match the gem name
|
9
|
+
- thanks to Chris O'Meara for highlighting this
|
10
|
+
|
1
11
|
v0.8.3 (14th February 2010)
|
2
12
|
- Fix a bug in tokenising of hex strings inside dictionaries
|
3
13
|
- Thanks to Brad Ediger for detecting the issue and proposing a solution
|
data/Rakefile
CHANGED
data/bin/pdf_text
CHANGED
data/examples/version.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# Determine the PDF version of a file
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
require 'pdf/reader'
|
8
|
+
|
9
|
+
class VersionReceiver
|
10
|
+
attr_accessor :version
|
11
|
+
|
12
|
+
def initialize
|
13
|
+
@version = nil
|
14
|
+
end
|
15
|
+
|
16
|
+
# Called when document parsing starts
|
17
|
+
def pdf_version(arg = nil)
|
18
|
+
@version = arg
|
19
|
+
end
|
20
|
+
|
21
|
+
end
|
22
|
+
|
23
|
+
receiver = VersionReceiver.new
|
24
|
+
pdf = PDF::Reader.file(ARGV.shift, receiver)
|
25
|
+
puts receiver.version
|
data/lib/pdf-reader.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "pdf/reader"
|
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -9,10 +9,10 @@
|
|
9
9
|
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
10
|
# permit persons to whom the Software is furnished to do so, subject to
|
11
11
|
# the following conditions:
|
12
|
-
#
|
12
|
+
#
|
13
13
|
# The above copyright notice and this permission notice shall be
|
14
14
|
# included in all copies or substantial portions of the Software.
|
15
|
-
#
|
15
|
+
#
|
16
16
|
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
17
|
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
18
|
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
@@ -30,26 +30,33 @@ class PDF::Reader
|
|
30
30
|
@map = {}
|
31
31
|
in_char_mode = false
|
32
32
|
in_range_mode = false
|
33
|
+
instructions = ""
|
33
34
|
|
34
35
|
data.each_line do |l|
|
35
36
|
if l.include?("beginbfchar")
|
36
|
-
in_char_mode = true
|
37
|
+
in_char_mode = true
|
37
38
|
elsif l.include?("endbfchar")
|
38
|
-
|
39
|
+
process_bfchar_instructions(instructions)
|
40
|
+
instructions = ""
|
41
|
+
in_char_mode = false
|
39
42
|
elsif l.include?("beginbfrange")
|
40
|
-
in_range_mode = true
|
43
|
+
in_range_mode = true
|
41
44
|
elsif l.include?("endbfrange")
|
42
|
-
|
45
|
+
process_bfrange_instructions(instructions)
|
46
|
+
instructions = ""
|
47
|
+
in_range_mode = false
|
43
48
|
end
|
44
49
|
|
45
|
-
if in_char_mode
|
46
|
-
|
47
|
-
elsif in_range_mode
|
48
|
-
process_bfrange_line(l)
|
50
|
+
if !l.include?("begin") && (in_char_mode || in_range_mode)
|
51
|
+
instructions << l
|
49
52
|
end
|
50
53
|
end
|
51
54
|
end
|
52
55
|
|
56
|
+
def size
|
57
|
+
@map.size
|
58
|
+
end
|
59
|
+
|
53
60
|
def decode(c)
|
54
61
|
# TODO: implement the conversion
|
55
62
|
return c unless c.class == Fixnum
|
@@ -58,24 +65,72 @@ class PDF::Reader
|
|
58
65
|
|
59
66
|
private
|
60
67
|
|
61
|
-
def
|
62
|
-
|
63
|
-
|
68
|
+
def build_parser(instructions)
|
69
|
+
buffer = Buffer.new(StringIO.new(instructions))
|
70
|
+
Parser.new(buffer)
|
71
|
+
end
|
72
|
+
|
73
|
+
def str_to_int(str)
|
74
|
+
return nil if str.nil? || str.size == 0 || str.size >= 3
|
75
|
+
|
76
|
+
if str.size == 1
|
77
|
+
str.unpack("C*")[0]
|
78
|
+
else
|
79
|
+
str.unpack("n*")[0]
|
80
|
+
end
|
64
81
|
end
|
65
82
|
|
66
|
-
def
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
83
|
+
def process_bfchar_instructions(instructions)
|
84
|
+
parser = build_parser(instructions)
|
85
|
+
find = str_to_int(parser.parse_token)
|
86
|
+
replace = str_to_int(parser.parse_token)
|
87
|
+
while find && replace
|
88
|
+
@map[find] = replace
|
89
|
+
find = str_to_int(parser.parse_token)
|
90
|
+
replace = str_to_int(parser.parse_token)
|
91
|
+
end
|
92
|
+
end
|
93
|
+
|
94
|
+
def process_bfrange_instructions(instructions)
|
95
|
+
parser = build_parser(instructions)
|
96
|
+
start = parser.parse_token
|
97
|
+
finish = parser.parse_token
|
98
|
+
to = parser.parse_token
|
99
|
+
while start && finish && to
|
100
|
+
if start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(String)
|
101
|
+
bfrange_type_one(start, finish, to)
|
102
|
+
elsif start.kind_of?(String) && finish.kind_of?(String) && to.kind_of?(Array)
|
103
|
+
bfrange_type_two(start, finish, to)
|
104
|
+
else
|
105
|
+
raise "invalid bfrange section"
|
78
106
|
end
|
107
|
+
start = parser.parse_token
|
108
|
+
finish = parser.parse_token
|
109
|
+
to = parser.parse_token
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
113
|
+
def bfrange_type_one(start_code, end_code, dst)
|
114
|
+
start_code = str_to_int(start_code)
|
115
|
+
end_code = str_to_int(end_code)
|
116
|
+
dst = str_to_int(dst)
|
117
|
+
|
118
|
+
# add all values in the range to our mapping
|
119
|
+
(start_code..end_code).each_with_index do |val, idx|
|
120
|
+
@map[val] = dst + idx
|
121
|
+
# ensure a single range does not exceed 255 chars
|
122
|
+
raise PDF::Reader::MalformedPDFError, "a CMap bfrange cann't exceed 255 chars" if idx > 255
|
123
|
+
end
|
124
|
+
end
|
125
|
+
|
126
|
+
def bfrange_type_two(start_code, end_code, dst)
|
127
|
+
start_code = str_to_int(start_code)
|
128
|
+
end_code = str_to_int(end_code)
|
129
|
+
from_range = (start_code..end_code)
|
130
|
+
|
131
|
+
# add all values in the range to our mapping
|
132
|
+
from_range.each_with_index do |val, idx|
|
133
|
+
@map[val] = str_to_int(dst[idx])
|
79
134
|
end
|
80
135
|
end
|
81
136
|
end
|
data/lib/pdf/reader/content.rb
CHANGED
@@ -251,7 +251,6 @@ class PDF::Reader
|
|
251
251
|
def initialize (receiver, xref)
|
252
252
|
@receiver = receiver
|
253
253
|
@xref = xref
|
254
|
-
@fonts ||= {}
|
255
254
|
end
|
256
255
|
################################################################################
|
257
256
|
# Begin processing the document metadata
|
@@ -309,10 +308,14 @@ class PDF::Reader
|
|
309
308
|
contents = [page[:Contents]]
|
310
309
|
end
|
311
310
|
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
311
|
+
fonts = font_hash_from_resources(current_resources)
|
312
|
+
|
313
|
+
if page.has_key?(:Contents) and page[:Contents]
|
314
|
+
contents.each do |content|
|
315
|
+
obj = @xref.object(content)
|
316
|
+
content_stream(obj, fonts)
|
317
|
+
end
|
318
|
+
end
|
316
319
|
|
317
320
|
resources.pop if res
|
318
321
|
callback(:end_page)
|
@@ -330,7 +333,8 @@ class PDF::Reader
|
|
330
333
|
callback(:begin_form_xobject)
|
331
334
|
resources = @xref.object(xobject.hash[:Resources])
|
332
335
|
walk_resources(resources) if resources
|
333
|
-
|
336
|
+
fonts = font_hash_from_resources(resources)
|
337
|
+
content_stream(xobject, fonts)
|
334
338
|
callback(:end_form_xobject)
|
335
339
|
end
|
336
340
|
end
|
@@ -348,42 +352,43 @@ class PDF::Reader
|
|
348
352
|
################################################################################
|
349
353
|
# Reads a PDF content stream and calls all the appropriate callback methods for the operators
|
350
354
|
# it contains
|
351
|
-
def content_stream (instructions)
|
355
|
+
def content_stream (instructions, fonts = {})
|
352
356
|
instructions = instructions.unfiltered_data if instructions.kind_of?(PDF::Reader::Stream)
|
353
|
-
|
354
|
-
|
355
|
-
|
357
|
+
buffer = Buffer.new(StringIO.new(instructions))
|
358
|
+
parser = Parser.new(buffer, @xref)
|
359
|
+
current_font = nil
|
360
|
+
params = []
|
356
361
|
|
357
|
-
while (token =
|
362
|
+
while (token = parser.parse_token(OPERATORS))
|
358
363
|
if token.kind_of?(Token) and OPERATORS.has_key?(token)
|
359
|
-
|
364
|
+
current_font = params.first if OPERATORS[token] == :set_text_font_and_size
|
360
365
|
|
361
366
|
# handle special cases in response to certain operators
|
362
|
-
if OPERATORS[token].to_s.include?("show_text") &&
|
367
|
+
if OPERATORS[token].to_s.include?("show_text") && fonts[current_font]
|
363
368
|
# convert any text to utf-8
|
364
|
-
|
369
|
+
params = fonts[current_font].to_utf8(params)
|
365
370
|
elsif token == "ID"
|
366
371
|
# inline image data, first convert the current params into a more familiar hash
|
367
372
|
map = {}
|
368
|
-
|
373
|
+
params.each_slice(2) do |a|
|
369
374
|
map[a.first] = a.last
|
370
375
|
end
|
371
|
-
|
376
|
+
params = [map]
|
372
377
|
# read the raw image data from the buffer without tokenising
|
373
|
-
|
378
|
+
params << buffer.read_until("EI")
|
374
379
|
end
|
375
380
|
|
376
|
-
callback(OPERATORS[token],
|
381
|
+
callback(OPERATORS[token], params)
|
377
382
|
|
378
383
|
if OPERATORS[token] == :invoke_xobject
|
379
|
-
xobject_label =
|
380
|
-
|
384
|
+
xobject_label = params.first
|
385
|
+
params.clear
|
381
386
|
walk_xobject_form(xobject_label)
|
382
387
|
else
|
383
|
-
|
388
|
+
params.clear
|
384
389
|
end
|
385
390
|
else
|
386
|
-
|
391
|
+
params << token
|
387
392
|
end
|
388
393
|
end
|
389
394
|
rescue EOFError => e
|
@@ -430,24 +435,9 @@ class PDF::Reader
|
|
430
435
|
|
431
436
|
# extract any font information
|
432
437
|
if resources[:Font]
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
@fonts[label].label = label
|
437
|
-
@fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
|
438
|
-
@fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
|
439
|
-
@fonts[label].encoding = PDF::Reader::Encoding.new(@xref.object(desc[:Encoding]))
|
440
|
-
@fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
|
441
|
-
if desc[:ToUnicode]
|
442
|
-
# this stream is a cmap
|
443
|
-
begin
|
444
|
-
stream = desc[:ToUnicode]
|
445
|
-
@fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
446
|
-
rescue
|
447
|
-
# if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
|
448
|
-
end
|
449
|
-
end
|
450
|
-
callback(:resource_font, [label, @fonts[label]])
|
438
|
+
fonts = font_hash_from_resources(resources)
|
439
|
+
fonts.each do |label, font|
|
440
|
+
callback(:resource_font, [label, font]) # pass this label's Font object (as the pre-refactor code did), not the whole fonts hash
|
451
441
|
end
|
452
442
|
end
|
453
443
|
end
|
@@ -473,6 +463,32 @@ class PDF::Reader
|
|
473
463
|
end
|
474
464
|
################################################################################
|
475
465
|
private
|
466
|
+
################################################################################
|
467
|
+
def font_hash_from_resources(resources)
|
468
|
+
return {} unless resources.respond_to?(:[])
|
469
|
+
|
470
|
+
fonts = {}
|
471
|
+
resources = @xref.object(resources[:Font]) || {}
|
472
|
+
resources.each do |label, desc|
|
473
|
+
desc = @xref.object(desc)
|
474
|
+
fonts[label] = PDF::Reader::Font.new
|
475
|
+
fonts[label].label = label
|
476
|
+
fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
|
477
|
+
fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
|
478
|
+
fonts[label].encoding = PDF::Reader::Encoding.new(@xref.object(desc[:Encoding]))
|
479
|
+
fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
|
480
|
+
if desc[:ToUnicode]
|
481
|
+
# this stream is a cmap
|
482
|
+
begin
|
483
|
+
stream = desc[:ToUnicode]
|
484
|
+
fonts[label].tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
485
|
+
rescue
|
486
|
+
# if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
|
487
|
+
end
|
488
|
+
end
|
489
|
+
end
|
490
|
+
fonts
|
491
|
+
end
|
476
492
|
# strings outside of page content should be in either PDFDocEncoding or UTF-16.
|
477
493
|
def decode_strings(obj)
|
478
494
|
case obj
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -30,9 +30,11 @@ class PDF::Reader
|
|
30
30
|
|
31
31
|
UNKNOWN_CHAR = 0x25AF # ▯
|
32
32
|
|
33
|
-
attr_reader :differences
|
33
|
+
attr_reader :differences, :unpack
|
34
34
|
|
35
35
|
def initialize(enc)
|
36
|
+
@to_unicode_required = false
|
37
|
+
|
36
38
|
if enc.kind_of?(Hash)
|
37
39
|
self.differences=enc[:Differences] if enc[:Differences]
|
38
40
|
enc = enc[:Encoding] || enc[:BaseEncoding]
|
@@ -74,6 +76,10 @@ class PDF::Reader
|
|
74
76
|
end
|
75
77
|
end
|
76
78
|
|
79
|
+
def to_unicode_required?
|
80
|
+
@to_unicode_required
|
81
|
+
end
|
82
|
+
|
77
83
|
# set the differences table for this encoding. should be an array in the following format:
|
78
84
|
#
|
79
85
|
# [25, :A, 26, :B]
|
@@ -101,25 +107,22 @@ class PDF::Reader
|
|
101
107
|
|
102
108
|
# convert the specified string to utf8
|
103
109
|
def to_utf8(str, tounicode = nil)
|
104
|
-
|
105
110
|
# unpack the single bytes
|
106
|
-
array_orig = str.unpack(
|
111
|
+
array_orig = str.unpack(unpack)
|
107
112
|
|
108
113
|
# replace any relevant bytes with a glyph name
|
109
114
|
array_orig = process_differences(array_orig)
|
110
115
|
|
111
116
|
# replace any remaining bytes with a unicode codepoint
|
112
|
-
array_enc =
|
113
|
-
array_orig.each do |num|
|
117
|
+
array_enc = array_orig.map do |num|
|
114
118
|
if tounicode && (code = tounicode.decode(num))
|
115
|
-
|
116
|
-
elsif tounicode || ( tounicode.nil? &&
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
array_enc << @mapping[num]
|
119
|
+
code
|
120
|
+
elsif tounicode || ( tounicode.nil? && to_unicode_required? )
|
121
|
+
PDF::Reader::Encoding::UNKNOWN_CHAR
|
122
|
+
elsif mapping[num]
|
123
|
+
mapping[num]
|
121
124
|
else
|
122
|
-
|
125
|
+
num
|
123
126
|
end
|
124
127
|
end
|
125
128
|
|
@@ -140,6 +143,14 @@ class PDF::Reader
|
|
140
143
|
|
141
144
|
private
|
142
145
|
|
146
|
+
def mapping
|
147
|
+
@mapping ||= {}
|
148
|
+
end
|
149
|
+
|
150
|
+
def has_mapping?
|
151
|
+
mapping.size > 0
|
152
|
+
end
|
153
|
+
|
143
154
|
# accepts an array of byte numbers, and replaces any that have entries in the differences table
|
144
155
|
# with a glyph name
|
145
156
|
def process_differences(arr)
|
@@ -154,12 +165,13 @@ class PDF::Reader
|
|
154
165
|
end
|
155
166
|
|
156
167
|
def load_mapping(file)
|
157
|
-
|
168
|
+
return if has_mapping?
|
169
|
+
|
158
170
|
RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
|
159
171
|
File.open(file, mode) do |f|
|
160
172
|
f.each do |l|
|
161
173
|
m, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
|
162
|
-
|
174
|
+
mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
|
163
175
|
end
|
164
176
|
end
|
165
177
|
end
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -33,7 +33,7 @@ class PDF::Reader
|
|
33
33
|
#
|
34
34
|
# buffer - a PDF::Reader::Buffer object that contains PDF data
|
35
35
|
# xref - a PDF::Reader::XRef object that represents the document's object offsets
|
36
|
-
def initialize (buffer, xref)
|
36
|
+
def initialize (buffer, xref=nil)
|
37
37
|
@buffer = buffer
|
38
38
|
@xref = xref
|
39
39
|
end
|
@@ -48,7 +48,7 @@ class PDF::Reader
|
|
48
48
|
case token
|
49
49
|
when PDF::Reader::Reference then return token
|
50
50
|
when nil then return nil
|
51
|
-
when "/" then return
|
51
|
+
when "/" then return pdf_name()
|
52
52
|
when "<<" then return dictionary()
|
53
53
|
when "[" then return array()
|
54
54
|
when "(" then return string()
|
@@ -107,6 +107,16 @@ class PDF::Reader
|
|
107
107
|
dict
|
108
108
|
end
|
109
109
|
################################################################################
|
110
|
+
# reads a PDF name from the buffer and converts it to a Ruby Symbol
|
111
|
+
def pdf_name
|
112
|
+
tok = @buffer.token
|
113
|
+
tok.scan(/#(\d\d)/).each do |find|
|
114
|
+
replace = find[0].hex.chr
|
115
|
+
tok.gsub!("#"+find[0], replace)
|
116
|
+
end
|
117
|
+
tok.to_sym
|
118
|
+
end
|
119
|
+
################################################################################
|
110
120
|
# reads a PDF array from the buffer and converts it to a Ruby Array.
|
111
121
|
def array
|
112
122
|
a = []
|
@@ -141,6 +151,7 @@ class PDF::Reader
|
|
141
151
|
return "" if str == ")"
|
142
152
|
Error.assert_equal(parse_token, ")")
|
143
153
|
|
154
|
+
str.gsub!(/([^\\])(\n\r|\r\n|\r)/m,'\1\n')
|
144
155
|
str.gsub!("\\n","\n")
|
145
156
|
str.gsub!("\\r","\r")
|
146
157
|
str.gsub!("\\t","\t")
|
@@ -150,7 +161,6 @@ class PDF::Reader
|
|
150
161
|
str.gsub!("\\)",")")
|
151
162
|
str.gsub!("\\\\","\\")
|
152
163
|
str.gsub!(/\\\n/m,"")
|
153
|
-
str.gsub!(/(\n\r|\r\n|\r)/m,"\n")
|
154
164
|
|
155
165
|
str.scan(/\\\d{1,3}/).each do |octal|
|
156
166
|
str.gsub!(octal, octal[1,3].oct.chr)
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.
|
4
|
+
version: 0.8.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2010-
|
12
|
+
date: 2010-03-30 00:00:00 +05:30
|
13
13
|
default_executable:
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -43,6 +43,7 @@ files:
|
|
43
43
|
- examples/hash.rb
|
44
44
|
- examples/callbacks.rb
|
45
45
|
- examples/text.rb
|
46
|
+
- examples/version.rb
|
46
47
|
- examples/page_counter_improved.rb
|
47
48
|
- lib/pdf/reader/glyphlist.txt
|
48
49
|
- lib/pdf/reader/content.rb
|
@@ -70,6 +71,7 @@ files:
|
|
70
71
|
- lib/pdf/reader/parser.rb
|
71
72
|
- lib/pdf/hash.rb
|
72
73
|
- lib/pdf/reader.rb
|
74
|
+
- lib/pdf-reader.rb
|
73
75
|
- Rakefile
|
74
76
|
- README.rdoc
|
75
77
|
- TODO
|