pdf-reader 0.7.2 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +12 -2
- data/{README → README.rdoc} +27 -47
- data/Rakefile +5 -4
- data/TODO +3 -1
- data/bin/pdf_list_callbacks +1 -5
- data/bin/pdf_object +43 -0
- data/bin/pdf_text +1 -0
- data/lib/pdf/reader.rb +25 -7
- data/lib/pdf/reader/buffer.rb +3 -1
- data/lib/pdf/reader/content.rb +56 -48
- data/lib/pdf/reader/encoding.rb +82 -1088
- data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
- data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
- data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
- data/lib/pdf/reader/encodings/standard.txt +47 -0
- data/lib/pdf/reader/encodings/symbol.txt +154 -0
- data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
- data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
- data/lib/pdf/reader/error.rb +1 -0
- data/lib/pdf/reader/font.rb +4 -3
- data/lib/pdf/reader/parser.rb +1 -0
- data/lib/pdf/reader/print_receiver.rb +19 -0
- data/lib/pdf/reader/xref.rb +12 -0
- metadata +26 -17
- data/lib/pdf/reader/parser.rb.rej +0 -29
@@ -0,0 +1,29 @@
|
|
1
|
+
# A mapping of WinAnsi (win-1252) characters to unicode. Anything
|
2
|
+
# not specified is left unchanged
|
3
|
+
80;20AC
|
4
|
+
82;201A
|
5
|
+
83;0192
|
6
|
+
84;201E
|
7
|
+
85;2026
|
8
|
+
86;2020
|
9
|
+
87;2021
|
10
|
+
88;02C6
|
11
|
+
89;2030
|
12
|
+
8A;0160
|
13
|
+
8B;2039
|
14
|
+
8C;0152
|
15
|
+
8E;017D
|
16
|
+
91;2018
|
17
|
+
92;2019
|
18
|
+
93;201C
|
19
|
+
94;201D
|
20
|
+
95;2022
|
21
|
+
96;2013
|
22
|
+
97;2014
|
23
|
+
98;02DC
|
24
|
+
99;2122
|
25
|
+
9A;0161
|
26
|
+
9B;203A
|
27
|
+
9C;0153
|
28
|
+
9E;017E
|
29
|
+
9F;0178
|
@@ -0,0 +1,201 @@
|
|
1
|
+
21;2701
|
2
|
+
22;2702
|
3
|
+
23;2703
|
4
|
+
24;2704
|
5
|
+
25;260E
|
6
|
+
26;2706
|
7
|
+
27;2707
|
8
|
+
28;2708
|
9
|
+
29;2709
|
10
|
+
2A;261B
|
11
|
+
2B;261E
|
12
|
+
2C;270C
|
13
|
+
2D;270D
|
14
|
+
2E;270E
|
15
|
+
2F;270F
|
16
|
+
30;2710
|
17
|
+
31;2711
|
18
|
+
32;2712
|
19
|
+
33;2713
|
20
|
+
34;2714
|
21
|
+
35;2715
|
22
|
+
36;2716
|
23
|
+
37;2717
|
24
|
+
38;2718
|
25
|
+
39;2719
|
26
|
+
3A;271A
|
27
|
+
3B;271B
|
28
|
+
3C;271C
|
29
|
+
3D;271D
|
30
|
+
3E;271E
|
31
|
+
3F;271F
|
32
|
+
40;2720
|
33
|
+
41;2721
|
34
|
+
42;2722
|
35
|
+
43;2723
|
36
|
+
44;2724
|
37
|
+
45;2725
|
38
|
+
46;2726
|
39
|
+
47;2727
|
40
|
+
48;2605
|
41
|
+
49;2729
|
42
|
+
4A;272A
|
43
|
+
4B;272B
|
44
|
+
4C;272C
|
45
|
+
4D;272D
|
46
|
+
4E;272E
|
47
|
+
4F;272F
|
48
|
+
50;2730
|
49
|
+
51;2731
|
50
|
+
52;2732
|
51
|
+
53;2733
|
52
|
+
54;2734
|
53
|
+
55;2735
|
54
|
+
56;2736
|
55
|
+
57;2737
|
56
|
+
58;2738
|
57
|
+
59;2739
|
58
|
+
5A;273A
|
59
|
+
5B;273B
|
60
|
+
5C;273C
|
61
|
+
5D;273D
|
62
|
+
5E;273E
|
63
|
+
5F;273F
|
64
|
+
60;2740
|
65
|
+
61;2741
|
66
|
+
62;2742
|
67
|
+
63;2743
|
68
|
+
64;2744
|
69
|
+
65;2745
|
70
|
+
66;2746
|
71
|
+
67;2747
|
72
|
+
68;2748
|
73
|
+
69;2749
|
74
|
+
6A;274A
|
75
|
+
6B;274B
|
76
|
+
6C;25CF
|
77
|
+
6D;274D
|
78
|
+
6E;25A0
|
79
|
+
6F;274F
|
80
|
+
70;2750
|
81
|
+
71;2751
|
82
|
+
72;2752
|
83
|
+
73;2753
|
84
|
+
74;2754
|
85
|
+
75;2755
|
86
|
+
76;2756
|
87
|
+
77;2757
|
88
|
+
78;2758
|
89
|
+
79;2759
|
90
|
+
7A;275A
|
91
|
+
7B;275B
|
92
|
+
7C;275C
|
93
|
+
7D;275D
|
94
|
+
7E;275E
|
95
|
+
80;F8D7
|
96
|
+
81;F8D8
|
97
|
+
82;F8D9
|
98
|
+
83;F8DA
|
99
|
+
84;F8DB
|
100
|
+
85;F8DC
|
101
|
+
86;F8DD
|
102
|
+
87;F8DE
|
103
|
+
88;F8DF
|
104
|
+
89;F8E0
|
105
|
+
8A;F8E1
|
106
|
+
8B;F8E2
|
107
|
+
8C;F8E3
|
108
|
+
8D;F8E4
|
109
|
+
A1;2761
|
110
|
+
A2;2762
|
111
|
+
A3;2763
|
112
|
+
A4;2764
|
113
|
+
A5;2765
|
114
|
+
A6;2766
|
115
|
+
A7;2767
|
116
|
+
A8;2663
|
117
|
+
A9;2666
|
118
|
+
AA;2665
|
119
|
+
AB;2660
|
120
|
+
AC;2460
|
121
|
+
AD;2461
|
122
|
+
AE;2462
|
123
|
+
AF;2463
|
124
|
+
B0;2464
|
125
|
+
B1;2465
|
126
|
+
B2;2466
|
127
|
+
B3;2467
|
128
|
+
B4;2468
|
129
|
+
B5;2469
|
130
|
+
B6;2776
|
131
|
+
B7;2777
|
132
|
+
B8;2778
|
133
|
+
B9;2779
|
134
|
+
BA;277A
|
135
|
+
BB;277B
|
136
|
+
BC;277C
|
137
|
+
BD;277D
|
138
|
+
BE;277E
|
139
|
+
BF;277F
|
140
|
+
C0;2780
|
141
|
+
C1;2781
|
142
|
+
C2;2782
|
143
|
+
C3;2783
|
144
|
+
C4;2784
|
145
|
+
C5;2785
|
146
|
+
C6;2786
|
147
|
+
C7;2787
|
148
|
+
C8;2788
|
149
|
+
C9;2789
|
150
|
+
CA;278A
|
151
|
+
CB;278B
|
152
|
+
CC;278C
|
153
|
+
CD;278D
|
154
|
+
CE;278E
|
155
|
+
CF;278F
|
156
|
+
D0;2790
|
157
|
+
D1;2791
|
158
|
+
D2;2792
|
159
|
+
D3;2793
|
160
|
+
D4;2794
|
161
|
+
D5;2795
|
162
|
+
D6;2796
|
163
|
+
D7;2797
|
164
|
+
D8;2798
|
165
|
+
D9;2799
|
166
|
+
DA;279A
|
167
|
+
DB;279B
|
168
|
+
DC;279C
|
169
|
+
DD;279D
|
170
|
+
DE;279E
|
171
|
+
DF;279F
|
172
|
+
E0;27A0
|
173
|
+
E1;27A1
|
174
|
+
E2;27A2
|
175
|
+
E3;27A3
|
176
|
+
E4;27A4
|
177
|
+
E5;27A5
|
178
|
+
E6;27A6
|
179
|
+
E7;27A7
|
180
|
+
E8;27A8
|
181
|
+
E9;27A9
|
182
|
+
EA;27AA
|
183
|
+
EB;27AB
|
184
|
+
EC;27AC
|
185
|
+
ED;27AD
|
186
|
+
EE;27AE
|
187
|
+
EF;27AF
|
188
|
+
F1;27B1
|
189
|
+
F2;27B2
|
190
|
+
F3;27B3
|
191
|
+
F4;27B4
|
192
|
+
F5;27B5
|
193
|
+
F6;27B6
|
194
|
+
F7;27B7
|
195
|
+
F8;27B8
|
196
|
+
F9;27B9
|
197
|
+
FA;27BA
|
198
|
+
FB;27BB
|
199
|
+
FC;27BC
|
200
|
+
FD;27BD
|
201
|
+
FE;27BE
|
data/lib/pdf/reader/error.rb
CHANGED
@@ -48,6 +48,7 @@ class PDF::Reader
|
|
48
48
|
end
|
49
49
|
################################################################################
|
50
50
|
class MalformedPDFError < RuntimeError; end
|
51
|
+
class InvalidObjectError < MalformedPDFError; end
|
51
52
|
class UnsupportedFeatureError < RuntimeError; end
|
52
53
|
end
|
53
54
|
################################################################################
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -52,10 +52,11 @@ class PDF::Reader
|
|
52
52
|
# with encoding= if required
|
53
53
|
case font
|
54
54
|
when "Symbol" then
|
55
|
-
self.encoding = PDF::Reader::Encoding.
|
55
|
+
self.encoding = PDF::Reader::Encoding.new("SymbolEncoding")
|
56
56
|
when "ZapfDingbats" then
|
57
|
-
self.encoding = PDF::Reader::Encoding.
|
57
|
+
self.encoding = PDF::Reader::Encoding.new("ZapfDingbatsEncoding")
|
58
58
|
end
|
59
|
+
@basefont = font
|
59
60
|
end
|
60
61
|
|
61
62
|
def to_utf8(params)
|
@@ -65,7 +66,7 @@ class PDF::Reader
|
|
65
66
|
# translate the bytestream into a UTF-8 string.
|
66
67
|
# If an encoding hasn't been specified, assume the text using this
|
67
68
|
# font is in Adobe Standard Encoding.
|
68
|
-
(encoding || PDF::Reader::Encoding
|
69
|
+
(encoding || PDF::Reader::Encoding.new(:StandardEncoding)).to_utf8(params, tounicode)
|
69
70
|
elsif params.class == Array
|
70
71
|
params.collect { |param| self.to_utf8(param) }
|
71
72
|
else
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -0,0 +1,19 @@
|
|
1
|
+
class PDF::Reader
|
2
|
+
class PrintReceiver
|
3
|
+
|
4
|
+
attr_accessor :callbacks
|
5
|
+
|
6
|
+
def initialize
|
7
|
+
@callbacks = []
|
8
|
+
end
|
9
|
+
|
10
|
+
def respond_to?(meth)
|
11
|
+
return false if [:begin_inline_image_data].include?(meth)
|
12
|
+
true
|
13
|
+
end
|
14
|
+
|
15
|
+
def method_missing(methodname, *args)
|
16
|
+
puts "#{methodname} => #{args.inspect}"
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -37,6 +37,16 @@ class PDF::Reader
|
|
37
37
|
@xref = {}
|
38
38
|
end
|
39
39
|
################################################################################
|
40
|
+
# returns the PDF version of the current document. Technically this isn't part of the XRef
|
41
|
+
# table, but it is one of the lowest level data items in the file, so we've lumped it in
|
42
|
+
# with the cross reference code.
|
43
|
+
def pdf_version
|
44
|
+
@buffer.seek(0)
|
45
|
+
m, version = *@buffer.read(8).match(/%PDF-(\d.\d)/)
|
46
|
+
raise MalformedPDFError, 'invalid PDF version' if version.nil?
|
47
|
+
return version.to_f
|
48
|
+
end
|
49
|
+
################################################################################
|
40
50
|
# Read the xref table from the underlying buffer. If offset is specified the table
|
41
51
|
# will be loaded from there, otherwise the default offset will be located and used.
|
42
52
|
#
|
@@ -122,6 +132,8 @@ class PDF::Reader
|
|
122
132
|
# ref - a PDF::Reader::Reference object containing an object ID and revision number
|
123
133
|
def offset_for (ref)
|
124
134
|
@xref[ref.id][ref.gen]
|
135
|
+
rescue
|
136
|
+
raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
|
125
137
|
end
|
126
138
|
################################################################################
|
127
139
|
# Stores an offset value for a particular PDF object ID and revision number
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.2
|
4
|
+
version: 0.7.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Jones
|
@@ -9,44 +9,53 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2008-
|
12
|
+
date: 2008-06-11 00:00:00 +10:00
|
13
13
|
default_executable:
|
14
14
|
dependencies: []
|
15
15
|
|
16
16
|
description: The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe
|
17
17
|
email: pjones@pmade.com
|
18
18
|
executables:
|
19
|
+
- pdf_object
|
19
20
|
- pdf_text
|
20
21
|
- pdf_list_callbacks
|
21
22
|
extensions: []
|
22
23
|
|
23
24
|
extra_rdoc_files:
|
24
|
-
- README
|
25
|
+
- README.rdoc
|
25
26
|
- TODO
|
26
27
|
- CHANGELOG
|
27
28
|
files:
|
28
29
|
- lib/pdf
|
30
|
+
- lib/pdf/reader.rb
|
29
31
|
- lib/pdf/reader
|
30
|
-
- lib/pdf/reader/explore.rb
|
31
|
-
- lib/pdf/reader/reference.rb
|
32
|
-
- lib/pdf/reader/xref.rb
|
33
|
-
- lib/pdf/reader/token.rb
|
34
|
-
- lib/pdf/reader/filter.rb
|
35
|
-
- lib/pdf/reader/text_receiver.rb
|
36
32
|
- lib/pdf/reader/buffer.rb
|
37
|
-
- lib/pdf/reader/error.rb
|
38
|
-
- lib/pdf/reader/content.rb
|
39
|
-
- lib/pdf/reader/parser.rb
|
40
33
|
- lib/pdf/reader/cmap.rb
|
34
|
+
- lib/pdf/reader/content.rb
|
41
35
|
- lib/pdf/reader/encoding.rb
|
42
|
-
- lib/pdf/reader/
|
36
|
+
- lib/pdf/reader/error.rb
|
37
|
+
- lib/pdf/reader/explore.rb
|
38
|
+
- lib/pdf/reader/filter.rb
|
43
39
|
- lib/pdf/reader/font.rb
|
44
40
|
- lib/pdf/reader/glyphlist.txt
|
41
|
+
- lib/pdf/reader/parser.rb
|
42
|
+
- lib/pdf/reader/xref.rb
|
43
|
+
- lib/pdf/reader/reference.rb
|
44
|
+
- lib/pdf/reader/register_receiver.rb
|
45
|
+
- lib/pdf/reader/text_receiver.rb
|
46
|
+
- lib/pdf/reader/token.rb
|
47
|
+
- lib/pdf/reader/encodings
|
48
|
+
- lib/pdf/reader/encodings/mac_expert.txt
|
49
|
+
- lib/pdf/reader/encodings/mac_roman.txt
|
50
|
+
- lib/pdf/reader/encodings/pdf_doc.txt
|
51
|
+
- lib/pdf/reader/encodings/standard.txt
|
52
|
+
- lib/pdf/reader/encodings/symbol.txt
|
53
|
+
- lib/pdf/reader/encodings/win_ansi.txt
|
54
|
+
- lib/pdf/reader/encodings/zapf_dingbats.txt
|
45
55
|
- lib/pdf/reader/stream.rb
|
46
|
-
- lib/pdf/reader/
|
47
|
-
- lib/pdf/reader.rb
|
56
|
+
- lib/pdf/reader/print_receiver.rb
|
48
57
|
- Rakefile
|
49
|
-
- README
|
58
|
+
- README.rdoc
|
50
59
|
- TODO
|
51
60
|
- CHANGELOG
|
52
61
|
has_rdoc: true
|
@@ -56,7 +65,7 @@ rdoc_options:
|
|
56
65
|
- --title
|
57
66
|
- PDF::Reader Documentation
|
58
67
|
- --main
|
59
|
-
- README
|
68
|
+
- README.rdoc
|
60
69
|
- -q
|
61
70
|
require_paths:
|
62
71
|
- lib
|
@@ -1,29 +0,0 @@
|
|
1
|
-
***************
|
2
|
-
*** 173,178 ****
|
3
|
-
|
4
|
-
obj = parse_token
|
5
|
-
post_obj = parse_token
|
6
|
-
case post_obj
|
7
|
-
when "endobj" then return [obj,nil]
|
8
|
-
when "stream" then return [obj, stream(obj)]
|
9
|
-
--- 173,192 ----
|
10
|
-
|
11
|
-
obj = parse_token
|
12
|
-
post_obj = parse_token
|
13
|
-
+
|
14
|
-
+ if obj.class == Array
|
15
|
-
+ newobj = Array.new
|
16
|
-
+ obj.each_index {|idx|
|
17
|
-
+ if obj[idx].class == PDF::Reader::Reference
|
18
|
-
+ xo, xs = @xref.object(obj[idx])
|
19
|
-
+ if xs
|
20
|
-
+ newobj << xs
|
21
|
-
+ end
|
22
|
-
+ end
|
23
|
-
+ }
|
24
|
-
+ return newobj.flatten
|
25
|
-
+ end
|
26
|
-
+
|
27
|
-
case post_obj
|
28
|
-
when "endobj" then return [obj,nil]
|
29
|
-
when "stream" then return [obj, stream(obj)]
|