pdf-reader 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +8 -0
- data/bin/pdf_text +0 -2
- data/examples/extract_images.rb +11 -6
- data/lib/pdf/reader.rb +11 -5
- data/lib/pdf/reader/buffer.rb +48 -42
- data/lib/pdf/reader/cmap.rb +26 -11
- data/lib/pdf/reader/filter.rb +11 -234
- data/lib/pdf/reader/filter/ascii85.rb +25 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +26 -0
- data/lib/pdf/reader/filter/depredict.rb +138 -0
- data/lib/pdf/reader/filter/flate.rb +38 -0
- data/lib/pdf/reader/filter/lzw.rb +18 -0
- data/lib/pdf/reader/filter/null.rb +15 -0
- data/lib/pdf/reader/filter/run_length.rb +46 -0
- data/lib/pdf/reader/font.rb +1 -1
- data/lib/pdf/reader/form_xobject.rb +25 -4
- data/lib/pdf/reader/glyph_hash.rb +3 -2
- data/lib/pdf/reader/object_cache.rb +39 -16
- data/lib/pdf/reader/object_hash.rb +1 -1
- data/lib/pdf/reader/page.rb +7 -1
- data/lib/pdf/reader/page_state.rb +2 -1
- data/lib/pdf/reader/stream.rb +1 -1
- data/lib/pdf/reader/xref.rb +23 -4
- metadata +99 -46
@@ -44,7 +44,7 @@ class PDF::Reader
|
|
44
44
|
@pdf_version = read_version
|
45
45
|
@xref = PDF::Reader::XRef.new(@io)
|
46
46
|
@trailer = @xref.trailer
|
47
|
-
@cache = PDF::Reader::ObjectCache.new
|
47
|
+
@cache = opts[:cache] || PDF::Reader::ObjectCache.new
|
48
48
|
@sec_handler = build_security_handler(opts)
|
49
49
|
end
|
50
50
|
|
data/lib/pdf/reader/page.rb
CHANGED
@@ -20,14 +20,20 @@ module PDF
|
|
20
20
|
# the raw PDF object that defines this page
|
21
21
|
attr_reader :page_object
|
22
22
|
|
23
|
+
# a Hash-like object for storing cached data. Generally this is scoped to
|
24
|
+
# the current document and is used to avoid repeating expensive
|
25
|
+
# operations
|
26
|
+
attr_reader :cache
|
27
|
+
|
23
28
|
# creates a new page wrapper.
|
24
29
|
#
|
25
30
|
# * objects - an ObjectHash instance that wraps a PDF file
|
26
31
|
# * pagenum - an int specifying the page number to expose. 1 indexed.
|
27
32
|
#
|
28
|
-
def initialize(objects, pagenum)
|
33
|
+
def initialize(objects, pagenum, options = {})
|
29
34
|
@objects, @pagenum = objects, pagenum
|
30
35
|
@page_object = objects.deref(objects.page_references[pagenum - 1])
|
36
|
+
@cache = options[:cache] || {}
|
31
37
|
|
32
38
|
unless @page_object.is_a?(::Hash)
|
33
39
|
raise ArgumentError, "invalid page: #{pagenum}"
|
@@ -22,6 +22,7 @@ module PDF
|
|
22
22
|
# starting a new page
|
23
23
|
def initialize(page)
|
24
24
|
@page = page
|
25
|
+
@cache = page.cache
|
25
26
|
@objects = page.objects
|
26
27
|
@font_stack = [build_fonts(page.fonts)]
|
27
28
|
@xobject_stack = [page.xobjects]
|
@@ -176,7 +177,7 @@ module PDF
|
|
176
177
|
concatenate_matrix(*matrix) if matrix
|
177
178
|
|
178
179
|
if xobject.hash[:Subtype] == :Form
|
179
|
-
form = PDF::Reader::FormXObject.new(@page, xobject)
|
180
|
+
form = PDF::Reader::FormXObject.new(@page, xobject, :cache => @cache)
|
180
181
|
@font_stack.unshift(form.font_objects)
|
181
182
|
@xobject_stack.unshift(form.xobjects)
|
182
183
|
yield form if block_given?
|
data/lib/pdf/reader/stream.rb
CHANGED
data/lib/pdf/reader/xref.rb
CHANGED
@@ -53,9 +53,11 @@ class PDF::Reader
|
|
53
53
|
#
|
54
54
|
def initialize(io)
  @io = io
  # Some files have junk bytes in front of the PDF data. calc_junk_offset
  # returns nil when it detects no PDF data, so fall back to an offset of 0.
  # This must be assigned before load_offsets runs, because load_offsets
  # adds @junk_offset to the offsets it works with.
  @junk_offset = calc_junk_offset(io) || 0
  @xref = {}
  # load_offsets' return value is kept as the trailer.
  @trailer = load_offsets
end
|
60
|
+
|
59
61
|
################################################################################
|
60
62
|
# return the number of objects in this file. Objects with multiple generations are
|
61
63
|
# only counted once.
|
@@ -93,6 +95,7 @@ class PDF::Reader
|
|
93
95
|
#
|
94
96
|
def load_offsets(offset = nil)
|
95
97
|
offset ||= new_buffer.find_first_xref_offset
|
98
|
+
offset += @junk_offset
|
96
99
|
|
97
100
|
buf = new_buffer(offset)
|
98
101
|
tok_one = buf.token
|
@@ -124,7 +127,7 @@ class PDF::Reader
|
|
124
127
|
generation = buf.token.to_i
|
125
128
|
state = buf.token
|
126
129
|
|
127
|
-
store(objid, generation, offset) if state == "n" && offset > 0
|
130
|
+
store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
|
128
131
|
objid += 1
|
129
132
|
params.clear
|
130
133
|
end
|
@@ -143,7 +146,7 @@ class PDF::Reader
|
|
143
146
|
end
|
144
147
|
|
145
148
|
################################################################################
|
146
|
-
# Read
|
149
|
+
# Read an XRef stream from the underlying buffer instead of a traditional xref table.
|
147
150
|
#
|
148
151
|
def load_xref_stream(stream)
|
149
152
|
unless stream.is_a?(PDF::Reader::Stream) && stream.hash[:Type] == :XRef
|
@@ -169,7 +172,7 @@ class PDF::Reader
|
|
169
172
|
f2 = unpack_bytes(entry[widths[0],widths[1]])
|
170
173
|
f3 = unpack_bytes(entry[widths[0]+widths[1],widths[2]])
|
171
174
|
if f1 == 1 && f2 > 0
|
172
|
-
store(objid, f3, f2)
|
175
|
+
store(objid, f3, f2 + @junk_offset)
|
173
176
|
elsif f1 == 2 && f2 > 0
|
174
177
|
store(objid, 0, PDF::Reader::Reference.new(f2, 0))
|
175
178
|
end
|
@@ -203,7 +206,7 @@ class PDF::Reader
|
|
203
206
|
# Wrap the io stream we're working with in a buffer that can tokenise it for us.
|
204
207
|
#
|
205
208
|
# We create multiple buffers so we can be tokenising multiple sections of the file
|
206
|
-
# at the same time without
|
209
|
+
# at the same time without worrying about clearing the buffer's contents.
|
207
210
|
#
|
208
211
|
def new_buffer(offset = 0)
|
209
212
|
PDF::Reader::Buffer.new(@io, :seek => offset)
|
@@ -214,6 +217,22 @@ class PDF::Reader
|
|
214
217
|
# Records +offset+ for object +id+ at generation +gen+ in @xref.
#
# The first value stored wins: if a truthy entry already exists for the same
# id and generation it is left untouched. Returns the value that ends up
# stored for that id and generation.
#
def store(id, gen, offset)
  (@xref[id] ||= {})[gen] ||= offset
end
|
220
|
+
################################################################################
|
221
|
+
# Returns the offset of the PDF document in the +stream+. In theory this
|
222
|
+
# should always be 0, but all sorts of crazy junk is prefixed to PDF files
|
223
|
+
# in the real world.
|
224
|
+
#
|
225
|
+
# Checks up to 50 chars into the file, returns nil if no PDF data detected.
|
226
|
+
#
|
227
|
+
def calc_junk_offset(io)
  io.rewind
  found = nil
  # Only the first 50 positions (offsets 0..49) are considered.
  50.times do |pos|
    char = io.readchar
    # readchar yields a String on current rubies; the comparison with 37
    # (the byte value of '%') covers rubies where it yields an Integer.
    if char == '%' || char == 37
      found = pos
      break
    end
  end
  found
rescue EOFError
  # The input ended before a '%' was seen (empty or very short file): report
  # "no PDF data detected" instead of letting the exception escape.
  nil
ensure
  # Always leave the io positioned at the start, even on the EOF path.
  io.rewind
end
|
217
236
|
end
|
218
237
|
################################################################################
|
219
238
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.1
|
4
|
+
version: 1.2.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-08-30 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
16
|
-
requirement:
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,15 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements:
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
25
30
|
- !ruby/object:Gem::Dependency
|
26
31
|
name: roodi
|
27
|
-
requirement:
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
28
33
|
none: false
|
29
34
|
requirements:
|
30
35
|
- - ! '>='
|
@@ -32,10 +37,15 @@ dependencies:
|
|
32
37
|
version: '0'
|
33
38
|
type: :development
|
34
39
|
prerelease: false
|
35
|
-
version_requirements:
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
36
46
|
- !ruby/object:Gem::Dependency
|
37
47
|
name: rspec
|
38
|
-
requirement:
|
48
|
+
requirement: !ruby/object:Gem::Requirement
|
39
49
|
none: false
|
40
50
|
requirements:
|
41
51
|
- - ~>
|
@@ -43,10 +53,15 @@ dependencies:
|
|
43
53
|
version: '2.3'
|
44
54
|
type: :development
|
45
55
|
prerelease: false
|
46
|
-
version_requirements:
|
56
|
+
version_requirements: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ~>
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '2.3'
|
47
62
|
- !ruby/object:Gem::Dependency
|
48
63
|
name: ZenTest
|
49
|
-
requirement:
|
64
|
+
requirement: !ruby/object:Gem::Requirement
|
50
65
|
none: false
|
51
66
|
requirements:
|
52
67
|
- - ~>
|
@@ -54,10 +69,15 @@ dependencies:
|
|
54
69
|
version: 4.4.2
|
55
70
|
type: :development
|
56
71
|
prerelease: false
|
57
|
-
version_requirements:
|
72
|
+
version_requirements: !ruby/object:Gem::Requirement
|
73
|
+
none: false
|
74
|
+
requirements:
|
75
|
+
- - ~>
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: 4.4.2
|
58
78
|
- !ruby/object:Gem::Dependency
|
59
79
|
name: Ascii85
|
60
|
-
requirement:
|
80
|
+
requirement: !ruby/object:Gem::Requirement
|
61
81
|
none: false
|
62
82
|
requirements:
|
63
83
|
- - ~>
|
@@ -65,18 +85,44 @@ dependencies:
|
|
65
85
|
version: 1.0.0
|
66
86
|
type: :runtime
|
67
87
|
prerelease: false
|
68
|
-
version_requirements:
|
88
|
+
version_requirements: !ruby/object:Gem::Requirement
|
89
|
+
none: false
|
90
|
+
requirements:
|
91
|
+
- - ~>
|
92
|
+
- !ruby/object:Gem::Version
|
93
|
+
version: 1.0.0
|
69
94
|
- !ruby/object:Gem::Dependency
|
70
95
|
name: ruby-rc4
|
71
|
-
requirement:
|
96
|
+
requirement: !ruby/object:Gem::Requirement
|
97
|
+
none: false
|
98
|
+
requirements:
|
99
|
+
- - ! '>='
|
100
|
+
- !ruby/object:Gem::Version
|
101
|
+
version: '0'
|
102
|
+
type: :runtime
|
103
|
+
prerelease: false
|
104
|
+
version_requirements: !ruby/object:Gem::Requirement
|
72
105
|
none: false
|
73
106
|
requirements:
|
74
107
|
- - ! '>='
|
75
108
|
- !ruby/object:Gem::Version
|
76
109
|
version: '0'
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: hashery
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ~>
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '2.0'
|
77
118
|
type: :runtime
|
78
119
|
prerelease: false
|
79
|
-
version_requirements:
|
120
|
+
version_requirements: !ruby/object:Gem::Requirement
|
121
|
+
none: false
|
122
|
+
requirements:
|
123
|
+
- - ~>
|
124
|
+
- !ruby/object:Gem::Version
|
125
|
+
version: '2.0'
|
80
126
|
description: The PDF::Reader library implements a PDF parser conforming as much as
|
81
127
|
possible to the PDF specification from Adobe
|
82
128
|
email:
|
@@ -93,54 +139,61 @@ extra_rdoc_files:
|
|
93
139
|
- CHANGELOG
|
94
140
|
- MIT-LICENSE
|
95
141
|
files:
|
142
|
+
- examples/extract_fonts.rb
|
143
|
+
- examples/hash.rb
|
144
|
+
- examples/extract_bates.rb
|
96
145
|
- examples/metadata.rb
|
97
146
|
- examples/extract_images.rb
|
98
|
-
- examples/extract_bates.rb
|
99
|
-
- examples/callbacks.rb
|
100
147
|
- examples/rspec.rb
|
101
|
-
- examples/hash.rb
|
102
|
-
- examples/text.rb
|
103
|
-
- examples/extract_fonts.rb
|
104
148
|
- examples/page_count.rb
|
149
|
+
- examples/callbacks.rb
|
150
|
+
- examples/text.rb
|
105
151
|
- examples/version.rb
|
106
|
-
- lib/pdf/reader.rb
|
107
152
|
- lib/pdf/hash.rb
|
108
|
-
- lib/pdf/reader
|
153
|
+
- lib/pdf/reader.rb
|
109
154
|
- lib/pdf/reader/xref.rb
|
110
|
-
- lib/pdf/reader/
|
155
|
+
- lib/pdf/reader/page.rb
|
156
|
+
- lib/pdf/reader/encoding.rb
|
111
157
|
- lib/pdf/reader/font.rb
|
112
|
-
- lib/pdf/reader/
|
113
|
-
- lib/pdf/reader/
|
114
|
-
- lib/pdf/reader/
|
115
|
-
- lib/pdf/reader/object_hash.rb
|
116
|
-
- lib/pdf/reader/stream.rb
|
117
|
-
- lib/pdf/reader/page_state.rb
|
118
|
-
- lib/pdf/reader/standard_security_handler.rb
|
119
|
-
- lib/pdf/reader/cmap.rb
|
120
|
-
- lib/pdf/reader/form_xobject.rb
|
121
|
-
- lib/pdf/reader/object_cache.rb
|
158
|
+
- lib/pdf/reader/print_receiver.rb
|
159
|
+
- lib/pdf/reader/lzw.rb
|
160
|
+
- lib/pdf/reader/buffer.rb
|
122
161
|
- lib/pdf/reader/object_stream.rb
|
123
|
-
- lib/pdf/reader/
|
124
|
-
- lib/pdf/reader/page_text_receiver.rb
|
162
|
+
- lib/pdf/reader/cmap.rb
|
125
163
|
- lib/pdf/reader/text_receiver.rb
|
126
|
-
- lib/pdf/reader/glyph_hash.rb
|
127
|
-
- lib/pdf/reader/glyphlist.txt
|
128
|
-
- lib/pdf/reader/lzw.rb
|
129
164
|
- lib/pdf/reader/register_receiver.rb
|
130
|
-
- lib/pdf/reader/
|
131
|
-
- lib/pdf/reader/abstract_strategy.rb
|
132
|
-
- lib/pdf/reader/pages_strategy.rb
|
133
|
-
- lib/pdf/reader/reference.rb
|
134
|
-
- lib/pdf/reader/encodings/standard.txt
|
165
|
+
- lib/pdf/reader/page_text_receiver.rb
|
135
166
|
- lib/pdf/reader/encodings/mac_roman.txt
|
167
|
+
- lib/pdf/reader/encodings/zapf_dingbats.txt
|
136
168
|
- lib/pdf/reader/encodings/symbol.txt
|
137
169
|
- lib/pdf/reader/encodings/win_ansi.txt
|
138
|
-
- lib/pdf/reader/encodings/zapf_dingbats.txt
|
139
|
-
- lib/pdf/reader/encodings/pdf_doc.txt
|
140
170
|
- lib/pdf/reader/encodings/mac_expert.txt
|
171
|
+
- lib/pdf/reader/encodings/standard.txt
|
172
|
+
- lib/pdf/reader/encodings/pdf_doc.txt
|
173
|
+
- lib/pdf/reader/filter.rb
|
174
|
+
- lib/pdf/reader/filter/null.rb
|
175
|
+
- lib/pdf/reader/filter/flate.rb
|
176
|
+
- lib/pdf/reader/filter/lzw.rb
|
177
|
+
- lib/pdf/reader/filter/ascii85.rb
|
178
|
+
- lib/pdf/reader/filter/ascii_hex.rb
|
179
|
+
- lib/pdf/reader/filter/run_length.rb
|
180
|
+
- lib/pdf/reader/filter/depredict.rb
|
181
|
+
- lib/pdf/reader/object_hash.rb
|
182
|
+
- lib/pdf/reader/reference.rb
|
183
|
+
- lib/pdf/reader/glyphlist.txt
|
184
|
+
- lib/pdf/reader/token.rb
|
185
|
+
- lib/pdf/reader/parser.rb
|
186
|
+
- lib/pdf/reader/page_state.rb
|
187
|
+
- lib/pdf/reader/error.rb
|
188
|
+
- lib/pdf/reader/glyph_hash.rb
|
141
189
|
- lib/pdf/reader/resource_methods.rb
|
190
|
+
- lib/pdf/reader/standard_security_handler.rb
|
191
|
+
- lib/pdf/reader/form_xobject.rb
|
192
|
+
- lib/pdf/reader/stream.rb
|
193
|
+
- lib/pdf/reader/pages_strategy.rb
|
194
|
+
- lib/pdf/reader/abstract_strategy.rb
|
142
195
|
- lib/pdf/reader/metadata_strategy.rb
|
143
|
-
- lib/pdf/reader/
|
196
|
+
- lib/pdf/reader/object_cache.rb
|
144
197
|
- lib/pdf-reader.rb
|
145
198
|
- Rakefile
|
146
199
|
- README.rdoc
|
@@ -181,7 +234,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
181
234
|
version: '0'
|
182
235
|
requirements: []
|
183
236
|
rubyforge_project:
|
184
|
-
rubygems_version: 1.8.
|
237
|
+
rubygems_version: 1.8.23
|
185
238
|
signing_key:
|
186
239
|
specification_version: 3
|
187
240
|
summary: A library for accessing the content of PDF files
|