pdf-reader 1.1.1 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +8 -0
- data/bin/pdf_text +0 -2
- data/examples/extract_images.rb +11 -6
- data/lib/pdf/reader.rb +11 -5
- data/lib/pdf/reader/buffer.rb +48 -42
- data/lib/pdf/reader/cmap.rb +26 -11
- data/lib/pdf/reader/filter.rb +11 -234
- data/lib/pdf/reader/filter/ascii85.rb +25 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +26 -0
- data/lib/pdf/reader/filter/depredict.rb +138 -0
- data/lib/pdf/reader/filter/flate.rb +38 -0
- data/lib/pdf/reader/filter/lzw.rb +18 -0
- data/lib/pdf/reader/filter/null.rb +15 -0
- data/lib/pdf/reader/filter/run_length.rb +46 -0
- data/lib/pdf/reader/font.rb +1 -1
- data/lib/pdf/reader/form_xobject.rb +25 -4
- data/lib/pdf/reader/glyph_hash.rb +3 -2
- data/lib/pdf/reader/object_cache.rb +39 -16
- data/lib/pdf/reader/object_hash.rb +1 -1
- data/lib/pdf/reader/page.rb +7 -1
- data/lib/pdf/reader/page_state.rb +2 -1
- data/lib/pdf/reader/stream.rb +1 -1
- data/lib/pdf/reader/xref.rb +23 -4
- metadata +99 -46
@@ -44,7 +44,7 @@ class PDF::Reader
|
|
44
44
|
@pdf_version = read_version
|
45
45
|
@xref = PDF::Reader::XRef.new(@io)
|
46
46
|
@trailer = @xref.trailer
|
47
|
-
@cache = PDF::Reader::ObjectCache.new
|
47
|
+
@cache = opts[:cache] || PDF::Reader::ObjectCache.new
|
48
48
|
@sec_handler = build_security_handler(opts)
|
49
49
|
end
|
50
50
|
|
data/lib/pdf/reader/page.rb
CHANGED
@@ -20,14 +20,20 @@ module PDF
|
|
20
20
|
# the raw PDF object that defines this page
|
21
21
|
attr_reader :page_object
|
22
22
|
|
23
|
+
# a Hash-like object for storing cached data. Generally this is scoped to
|
24
|
+
# the current document and is used to avoid repeating expensive
|
25
|
+
# operations
|
26
|
+
attr_reader :cache
|
27
|
+
|
23
28
|
# creates a new page wrapper.
|
24
29
|
#
|
25
30
|
# * objects - an ObjectHash instance that wraps a PDF file
|
26
31
|
# * pagenum - an int specifying the page number to expose. 1 indexed.
|
27
32
|
#
|
28
|
-
def initialize(objects, pagenum)
|
33
|
+
def initialize(objects, pagenum, options = {})
|
29
34
|
@objects, @pagenum = objects, pagenum
|
30
35
|
@page_object = objects.deref(objects.page_references[pagenum - 1])
|
36
|
+
@cache = options[:cache] || {}
|
31
37
|
|
32
38
|
unless @page_object.is_a?(::Hash)
|
33
39
|
raise ArgumentError, "invalid page: #{pagenum}"
|
@@ -22,6 +22,7 @@ module PDF
|
|
22
22
|
# starting a new page
|
23
23
|
def initialize(page)
|
24
24
|
@page = page
|
25
|
+
@cache = page.cache
|
25
26
|
@objects = page.objects
|
26
27
|
@font_stack = [build_fonts(page.fonts)]
|
27
28
|
@xobject_stack = [page.xobjects]
|
@@ -176,7 +177,7 @@ module PDF
|
|
176
177
|
concatenate_matrix(*matrix) if matrix
|
177
178
|
|
178
179
|
if xobject.hash[:Subtype] == :Form
|
179
|
-
form = PDF::Reader::FormXObject.new(@page, xobject)
|
180
|
+
form = PDF::Reader::FormXObject.new(@page, xobject, :cache => @cache)
|
180
181
|
@font_stack.unshift(form.font_objects)
|
181
182
|
@xobject_stack.unshift(form.xobjects)
|
182
183
|
yield form if block_given?
|
data/lib/pdf/reader/stream.rb
CHANGED
data/lib/pdf/reader/xref.rb
CHANGED
@@ -53,9 +53,11 @@ class PDF::Reader
|
|
53
53
|
#
|
54
54
|
def initialize (io)
|
55
55
|
@io = io
|
56
|
+
@junk_offset = calc_junk_offset(io) || 0
|
56
57
|
@xref = {}
|
57
58
|
@trailer = load_offsets
|
58
59
|
end
|
60
|
+
|
59
61
|
################################################################################
|
60
62
|
# return the number of objects in this file. Objects with multiple generations are
|
61
63
|
# only counted once.
|
@@ -93,6 +95,7 @@ class PDF::Reader
|
|
93
95
|
#
|
94
96
|
def load_offsets(offset = nil)
|
95
97
|
offset ||= new_buffer.find_first_xref_offset
|
98
|
+
offset += @junk_offset
|
96
99
|
|
97
100
|
buf = new_buffer(offset)
|
98
101
|
tok_one = buf.token
|
@@ -124,7 +127,7 @@ class PDF::Reader
|
|
124
127
|
generation = buf.token.to_i
|
125
128
|
state = buf.token
|
126
129
|
|
127
|
-
store(objid, generation, offset) if state == "n" && offset > 0
|
130
|
+
store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
|
128
131
|
objid += 1
|
129
132
|
params.clear
|
130
133
|
end
|
@@ -143,7 +146,7 @@ class PDF::Reader
|
|
143
146
|
end
|
144
147
|
|
145
148
|
################################################################################
|
146
|
-
# Read
|
149
|
+
# Read an XRef stream from the underlying buffer instead of a traditional xref table.
|
147
150
|
#
|
148
151
|
def load_xref_stream(stream)
|
149
152
|
unless stream.is_a?(PDF::Reader::Stream) && stream.hash[:Type] == :XRef
|
@@ -169,7 +172,7 @@ class PDF::Reader
|
|
169
172
|
f2 = unpack_bytes(entry[widths[0],widths[1]])
|
170
173
|
f3 = unpack_bytes(entry[widths[0]+widths[1],widths[2]])
|
171
174
|
if f1 == 1 && f2 > 0
|
172
|
-
store(objid, f3, f2)
|
175
|
+
store(objid, f3, f2 + @junk_offset)
|
173
176
|
elsif f1 == 2 && f2 > 0
|
174
177
|
store(objid, 0, PDF::Reader::Reference.new(f2, 0))
|
175
178
|
end
|
@@ -203,7 +206,7 @@ class PDF::Reader
|
|
203
206
|
# Wrap the io stream we're working with in a buffer that can tokenise it for us.
|
204
207
|
#
|
205
208
|
# We create multiple buffers so we can be tokenising multiple sections of the file
|
206
|
-
# at the same time without
|
209
|
+
# at the same time without worrying about clearing the buffer's contents.
|
207
210
|
#
|
208
211
|
def new_buffer(offset = 0)
|
209
212
|
PDF::Reader::Buffer.new(@io, :seek => offset)
|
@@ -214,6 +217,22 @@ class PDF::Reader
|
|
214
217
|
def store (id, gen, offset)
|
215
218
|
(@xref[id] ||= {})[gen] ||= offset
|
216
219
|
end
|
220
|
+
################################################################################
|
221
|
+
# Returns the offset of the PDF document in the +stream+. In theory this
|
222
|
+
# should always be 0, but all sorts of crazy junk is prefixed to PDF files
|
223
|
+
# in the real world.
|
224
|
+
#
|
225
|
+
# Checks up to 50 chars into the file, returns nil if no PDF data detected.
|
226
|
+
#
|
227
|
+
def calc_junk_offset(io)
|
228
|
+
io.rewind
|
229
|
+
offset = io.pos
|
230
|
+
until (c = io.readchar) == '%' || c == 37 || offset > 50
|
231
|
+
offset += 1
|
232
|
+
end
|
233
|
+
io.rewind
|
234
|
+
offset < 50 ? offset : nil
|
235
|
+
end
|
217
236
|
end
|
218
237
|
################################################################################
|
219
238
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.1
|
4
|
+
version: 1.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-08-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
16
|
-
requirement:
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,15 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
25
30
|
- !ruby/object:Gem::Dependency
|
26
31
|
name: roodi
|
27
|
-
requirement:
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
28
33
|
none: false
|
29
34
|
requirements:
|
30
35
|
- - ! '>='
|
@@ -32,10 +37,15 @@ dependencies:
|
|
32
37
|
version: '0'
|
33
38
|
type: :development
|
34
39
|
prerelease: false
|
35
|
-
version_requirements:
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
36
46
|
- !ruby/object:Gem::Dependency
|
37
47
|
name: rspec
|
38
|
-
requirement:
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
39
49
|
none: false
|
40
50
|
requirements:
|
41
51
|
- - ~>
|
@@ -43,10 +53,15 @@ dependencies:
|
|
43
53
|
version: '2.3'
|
44
54
|
type: :development
|
45
55
|
prerelease: false
|
46
|
-
version_requirements:
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '2.3'
|
47
62
|
- !ruby/object:Gem::Dependency
|
48
63
|
name: ZenTest
|
49
|
-
requirement:
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
50
65
|
none: false
|
51
66
|
requirements:
|
52
67
|
- - ~>
|
@@ -54,10 +69,15 @@ dependencies:
|
|
54
69
|
version: 4.4.2
|
55
70
|
type: :development
|
56
71
|
prerelease: false
|
57
|
-
version_requirements:
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 4.4.2
|
58
78
|
- !ruby/object:Gem::Dependency
|
59
79
|
name: Ascii85
|
60
|
-
requirement:
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
61
81
|
none: false
|
62
82
|
requirements:
|
63
83
|
- - ~>
|
@@ -65,18 +85,44 @@ dependencies:
|
|
65
85
|
version: 1.0.0
|
66
86
|
type: :runtime
|
67
87
|
prerelease: false
|
68
|
-
version_requirements:
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ~>
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: 1.0.0
|
69
94
|
- !ruby/object:Gem::Dependency
|
70
95
|
name: ruby-rc4
|
71
|
-
requirement:
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :runtime
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
72
105
|
none: false
|
73
106
|
requirements:
|
74
107
|
- - ! '>='
|
75
108
|
- !ruby/object:Gem::Version
|
76
109
|
version: '0'
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: hashery
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ~>
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '2.0'
|
77
118
|
type: :runtime
|
78
119
|
prerelease: false
|
79
|
-
version_requirements:
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ~>
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '2.0'
|
80
126
|
description: The PDF::Reader library implements a PDF parser conforming as much as
|
81
127
|
possible to the PDF specification from Adobe
|
82
128
|
email:
|
@@ -93,54 +139,61 @@ extra_rdoc_files:
|
|
93
139
|
- CHANGELOG
|
94
140
|
- MIT-LICENSE
|
95
141
|
files:
|
142
|
+
- examples/extract_fonts.rb
|
143
|
+
- examples/hash.rb
|
144
|
+
- examples/extract_bates.rb
|
96
145
|
- examples/metadata.rb
|
97
146
|
- examples/extract_images.rb
|
98
|
-
- examples/extract_bates.rb
|
99
|
-
- examples/callbacks.rb
|
100
147
|
- examples/rspec.rb
|
101
|
-
- examples/hash.rb
|
102
|
-
- examples/text.rb
|
103
|
-
- examples/extract_fonts.rb
|
104
148
|
- examples/page_count.rb
|
149
|
+
- examples/callbacks.rb
|
150
|
+
- examples/text.rb
|
105
151
|
- examples/version.rb
|
106
|
-
- lib/pdf/reader.rb
|
107
152
|
- lib/pdf/hash.rb
|
108
|
-
- lib/pdf/reader
|
153
|
+
- lib/pdf/reader.rb
|
109
154
|
- lib/pdf/reader/xref.rb
|
110
|
-
- lib/pdf/reader/
|
155
|
+
- lib/pdf/reader/page.rb
|
156
|
+
- lib/pdf/reader/encoding.rb
|
111
157
|
- lib/pdf/reader/font.rb
|
112
|
-
- lib/pdf/reader/
|
113
|
-
- lib/pdf/reader/
|
114
|
-
- lib/pdf/reader/
|
115
|
-
- lib/pdf/reader/object_hash.rb
|
116
|
-
- lib/pdf/reader/stream.rb
|
117
|
-
- lib/pdf/reader/page_state.rb
|
118
|
-
- lib/pdf/reader/standard_security_handler.rb
|
119
|
-
- lib/pdf/reader/cmap.rb
|
120
|
-
- lib/pdf/reader/form_xobject.rb
|
121
|
-
- lib/pdf/reader/object_cache.rb
|
158
|
+
- lib/pdf/reader/print_receiver.rb
|
159
|
+
- lib/pdf/reader/lzw.rb
|
160
|
+
- lib/pdf/reader/buffer.rb
|
122
161
|
- lib/pdf/reader/object_stream.rb
|
123
|
-
- lib/pdf/reader/
|
124
|
-
- lib/pdf/reader/page_text_receiver.rb
|
162
|
+
- lib/pdf/reader/cmap.rb
|
125
163
|
- lib/pdf/reader/text_receiver.rb
|
126
|
-
- lib/pdf/reader/glyph_hash.rb
|
127
|
-
- lib/pdf/reader/glyphlist.txt
|
128
|
-
- lib/pdf/reader/lzw.rb
|
129
164
|
- lib/pdf/reader/register_receiver.rb
|
130
|
-
- lib/pdf/reader/
|
131
|
-
- lib/pdf/reader/abstract_strategy.rb
|
132
|
-
- lib/pdf/reader/pages_strategy.rb
|
133
|
-
- lib/pdf/reader/reference.rb
|
134
|
-
- lib/pdf/reader/encodings/standard.txt
|
165
|
+
- lib/pdf/reader/page_text_receiver.rb
|
135
166
|
- lib/pdf/reader/encodings/mac_roman.txt
|
167
|
+
- lib/pdf/reader/encodings/zapf_dingbats.txt
|
136
168
|
- lib/pdf/reader/encodings/symbol.txt
|
137
169
|
- lib/pdf/reader/encodings/win_ansi.txt
|
138
|
-
- lib/pdf/reader/encodings/zapf_dingbats.txt
|
139
|
-
- lib/pdf/reader/encodings/pdf_doc.txt
|
140
170
|
- lib/pdf/reader/encodings/mac_expert.txt
|
171
|
+
- lib/pdf/reader/encodings/standard.txt
|
172
|
+
- lib/pdf/reader/encodings/pdf_doc.txt
|
173
|
+
- lib/pdf/reader/filter.rb
|
174
|
+
- lib/pdf/reader/filter/null.rb
|
175
|
+
- lib/pdf/reader/filter/flate.rb
|
176
|
+
- lib/pdf/reader/filter/lzw.rb
|
177
|
+
- lib/pdf/reader/filter/ascii85.rb
|
178
|
+
- lib/pdf/reader/filter/ascii_hex.rb
|
179
|
+
- lib/pdf/reader/filter/run_length.rb
|
180
|
+
- lib/pdf/reader/filter/depredict.rb
|
181
|
+
- lib/pdf/reader/object_hash.rb
|
182
|
+
- lib/pdf/reader/reference.rb
|
183
|
+
- lib/pdf/reader/glyphlist.txt
|
184
|
+
- lib/pdf/reader/token.rb
|
185
|
+
- lib/pdf/reader/parser.rb
|
186
|
+
- lib/pdf/reader/page_state.rb
|
187
|
+
- lib/pdf/reader/error.rb
|
188
|
+
- lib/pdf/reader/glyph_hash.rb
|
141
189
|
- lib/pdf/reader/resource_methods.rb
|
190
|
+
- lib/pdf/reader/standard_security_handler.rb
|
191
|
+
- lib/pdf/reader/form_xobject.rb
|
192
|
+
- lib/pdf/reader/stream.rb
|
193
|
+
- lib/pdf/reader/pages_strategy.rb
|
194
|
+
- lib/pdf/reader/abstract_strategy.rb
|
142
195
|
- lib/pdf/reader/metadata_strategy.rb
|
143
|
-
- lib/pdf/reader/
|
196
|
+
- lib/pdf/reader/object_cache.rb
|
144
197
|
- lib/pdf-reader.rb
|
145
198
|
- Rakefile
|
146
199
|
- README.rdoc
|
@@ -181,7 +234,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
181
234
|
version: '0'
|
182
235
|
requirements: []
|
183
236
|
rubyforge_project:
|
184
|
-
rubygems_version: 1.8.
|
237
|
+
rubygems_version: 1.8.23
|
185
238
|
signing_key:
|
186
239
|
specification_version: 3
|
187
240
|
summary: A library for accessing the content of PDF files
|