pdf-reader 1.1.1 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -44,7 +44,7 @@ class PDF::Reader
44
44
  @pdf_version = read_version
45
45
  @xref = PDF::Reader::XRef.new(@io)
46
46
  @trailer = @xref.trailer
47
- @cache = PDF::Reader::ObjectCache.new
47
+ @cache = opts[:cache] || PDF::Reader::ObjectCache.new
48
48
  @sec_handler = build_security_handler(opts)
49
49
  end
50
50
 
@@ -20,14 +20,20 @@ module PDF
20
20
  # the raw PDF object that defines this page
21
21
  attr_reader :page_object
22
22
 
23
+ # a Hash-like object for storing cached data. Generally this is scoped to
24
+ # the current document and is used to avoid repeating expensive
25
+ # operations
26
+ attr_reader :cache
27
+
23
28
  # creates a new page wrapper.
24
29
  #
25
30
  # * objects - an ObjectHash instance that wraps a PDF file
26
31
  # * pagenum - an int specifying the page number to expose. 1 indexed.
27
32
  #
28
- def initialize(objects, pagenum)
33
+ def initialize(objects, pagenum, options = {})
29
34
  @objects, @pagenum = objects, pagenum
30
35
  @page_object = objects.deref(objects.page_references[pagenum - 1])
36
+ @cache = options[:cache] || {}
31
37
 
32
38
  unless @page_object.is_a?(::Hash)
33
39
  raise ArgumentError, "invalid page: #{pagenum}"
@@ -22,6 +22,7 @@ module PDF
22
22
  # starting a new page
23
23
  def initialize(page)
24
24
  @page = page
25
+ @cache = page.cache
25
26
  @objects = page.objects
26
27
  @font_stack = [build_fonts(page.fonts)]
27
28
  @xobject_stack = [page.xobjects]
@@ -176,7 +177,7 @@ module PDF
176
177
  concatenate_matrix(*matrix) if matrix
177
178
 
178
179
  if xobject.hash[:Subtype] == :Form
179
- form = PDF::Reader::FormXObject.new(@page, xobject)
180
+ form = PDF::Reader::FormXObject.new(@page, xobject, :cache => @cache)
180
181
  @font_stack.unshift(form.font_objects)
181
182
  @xobject_stack.unshift(form.xobjects)
182
183
  yield form if block_given?
@@ -58,7 +58,7 @@ class PDF::Reader
58
58
  end
59
59
 
60
60
  Array(hash[:Filter]).each_with_index do |filter, index|
61
- @udata = Filter.new(filter, options[index]).filter(@udata)
61
+ @udata = Filter.with(filter, options[index]).filter(@udata)
62
62
  end
63
63
  end
64
64
  @udata
@@ -53,9 +53,11 @@ class PDF::Reader
53
53
  #
54
54
  def initialize (io)
55
55
  @io = io
56
+ @junk_offset = calc_junk_offset(io) || 0
56
57
  @xref = {}
57
58
  @trailer = load_offsets
58
59
  end
60
+
59
61
  ################################################################################
60
62
  # return the number of objects in this file. Objects with multiple generations are
61
63
  # only counted once.
@@ -93,6 +95,7 @@ class PDF::Reader
93
95
  #
94
96
  def load_offsets(offset = nil)
95
97
  offset ||= new_buffer.find_first_xref_offset
98
+ offset += @junk_offset
96
99
 
97
100
  buf = new_buffer(offset)
98
101
  tok_one = buf.token
@@ -124,7 +127,7 @@ class PDF::Reader
124
127
  generation = buf.token.to_i
125
128
  state = buf.token
126
129
 
127
- store(objid, generation, offset) if state == "n" && offset > 0
130
+ store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
128
131
  objid += 1
129
132
  params.clear
130
133
  end
@@ -143,7 +146,7 @@ class PDF::Reader
143
146
  end
144
147
 
145
148
  ################################################################################
146
- # Read a XReaf stream from the underlying buffer instead of a traditional xref table.
149
+ # Read an XRef stream from the underlying buffer instead of a traditional xref table.
147
150
  #
148
151
  def load_xref_stream(stream)
149
152
  unless stream.is_a?(PDF::Reader::Stream) && stream.hash[:Type] == :XRef
@@ -169,7 +172,7 @@ class PDF::Reader
169
172
  f2 = unpack_bytes(entry[widths[0],widths[1]])
170
173
  f3 = unpack_bytes(entry[widths[0]+widths[1],widths[2]])
171
174
  if f1 == 1 && f2 > 0
172
- store(objid, f3, f2)
175
+ store(objid, f3, f2 + @junk_offset)
173
176
  elsif f1 == 2 && f2 > 0
174
177
  store(objid, 0, PDF::Reader::Reference.new(f2, 0))
175
178
  end
@@ -203,7 +206,7 @@ class PDF::Reader
203
206
  # Wrap the io stream we're working with in a buffer that can tokenise it for us.
204
207
  #
205
208
  # We create multiple buffers so we can be tokenising multiple sections of the file
206
- # at the same time without worring about clearing the buffers contents.
209
+ # at the same time without worrying about clearing the buffer's contents.
207
210
  #
208
211
  def new_buffer(offset = 0)
209
212
  PDF::Reader::Buffer.new(@io, :seek => offset)
@@ -214,6 +217,22 @@ class PDF::Reader
214
217
  def store (id, gen, offset)
215
218
  (@xref[id] ||= {})[gen] ||= offset
216
219
  end
220
################################################################################
# Returns the byte offset of the PDF document in +io+. In theory this
# should always be 0, but all sorts of crazy junk is prefixed to PDF files
# in the real world.
#
# Scans the first 50 bytes of the stream for a '%' marker. Returns the
# offset of that marker, or nil if none is found in that window or the
# stream ends first. The stream is always rewound before returning.
#
def calc_junk_offset(io)
  io.rewind
  offset = 0
  while offset < 50
    c = io.readchar
    # readchar yields a String on 1.9+ and an Integer (37 == '%') on 1.8
    return offset if c == '%' || c == 37
    # count bytes, not characters: the result is used as a byte seek position,
    # so a multi-byte prefix (e.g. a UTF-8 BOM) must advance by its full width
    offset += c.is_a?(String) ? c.bytesize : 1
  end
  nil
rescue EOFError
  # stream shorter than the scan window and no marker seen
  nil
ensure
  io.rewind
end
217
236
  end
218
237
  ################################################################################
219
238
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-05-09 00:00:00.000000000 Z
12
+ date: 2012-08-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
16
- requirement: &35841860 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,15 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *35841860
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
25
30
  - !ruby/object:Gem::Dependency
26
31
  name: roodi
27
- requirement: &35841400 !ruby/object:Gem::Requirement
32
+ requirement: !ruby/object:Gem::Requirement
28
33
  none: false
29
34
  requirements:
30
35
  - - ! '>='
@@ -32,10 +37,15 @@ dependencies:
32
37
  version: '0'
33
38
  type: :development
34
39
  prerelease: false
35
- version_requirements: *35841400
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
36
46
  - !ruby/object:Gem::Dependency
37
47
  name: rspec
38
- requirement: &35840900 !ruby/object:Gem::Requirement
48
+ requirement: !ruby/object:Gem::Requirement
39
49
  none: false
40
50
  requirements:
41
51
  - - ~>
@@ -43,10 +53,15 @@ dependencies:
43
53
  version: '2.3'
44
54
  type: :development
45
55
  prerelease: false
46
- version_requirements: *35840900
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '2.3'
47
62
  - !ruby/object:Gem::Dependency
48
63
  name: ZenTest
49
- requirement: &35840400 !ruby/object:Gem::Requirement
64
+ requirement: !ruby/object:Gem::Requirement
50
65
  none: false
51
66
  requirements:
52
67
  - - ~>
@@ -54,10 +69,15 @@ dependencies:
54
69
  version: 4.4.2
55
70
  type: :development
56
71
  prerelease: false
57
- version_requirements: *35840400
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ version: 4.4.2
58
78
  - !ruby/object:Gem::Dependency
59
79
  name: Ascii85
60
- requirement: &35839940 !ruby/object:Gem::Requirement
80
+ requirement: !ruby/object:Gem::Requirement
61
81
  none: false
62
82
  requirements:
63
83
  - - ~>
@@ -65,18 +85,44 @@ dependencies:
65
85
  version: 1.0.0
66
86
  type: :runtime
67
87
  prerelease: false
68
- version_requirements: *35839940
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ~>
92
+ - !ruby/object:Gem::Version
93
+ version: 1.0.0
69
94
  - !ruby/object:Gem::Dependency
70
95
  name: ruby-rc4
71
- requirement: &35839520 !ruby/object:Gem::Requirement
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
72
105
  none: false
73
106
  requirements:
74
107
  - - ! '>='
75
108
  - !ruby/object:Gem::Version
76
109
  version: '0'
110
+ - !ruby/object:Gem::Dependency
111
+ name: hashery
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ~>
116
+ - !ruby/object:Gem::Version
117
+ version: '2.0'
77
118
  type: :runtime
78
119
  prerelease: false
79
- version_requirements: *35839520
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ~>
124
+ - !ruby/object:Gem::Version
125
+ version: '2.0'
80
126
  description: The PDF::Reader library implements a PDF parser conforming as much as
81
127
  possible to the PDF specification from Adobe
82
128
  email:
@@ -93,54 +139,61 @@ extra_rdoc_files:
93
139
  - CHANGELOG
94
140
  - MIT-LICENSE
95
141
  files:
142
+ - examples/extract_fonts.rb
143
+ - examples/hash.rb
144
+ - examples/extract_bates.rb
96
145
  - examples/metadata.rb
97
146
  - examples/extract_images.rb
98
- - examples/extract_bates.rb
99
- - examples/callbacks.rb
100
147
  - examples/rspec.rb
101
- - examples/hash.rb
102
- - examples/text.rb
103
- - examples/extract_fonts.rb
104
148
  - examples/page_count.rb
149
+ - examples/callbacks.rb
150
+ - examples/text.rb
105
151
  - examples/version.rb
106
- - lib/pdf/reader.rb
107
152
  - lib/pdf/hash.rb
108
- - lib/pdf/reader/print_receiver.rb
153
+ - lib/pdf/reader.rb
109
154
  - lib/pdf/reader/xref.rb
110
- - lib/pdf/reader/buffer.rb
155
+ - lib/pdf/reader/page.rb
156
+ - lib/pdf/reader/encoding.rb
111
157
  - lib/pdf/reader/font.rb
112
- - lib/pdf/reader/parser.rb
113
- - lib/pdf/reader/error.rb
114
- - lib/pdf/reader/filter.rb
115
- - lib/pdf/reader/object_hash.rb
116
- - lib/pdf/reader/stream.rb
117
- - lib/pdf/reader/page_state.rb
118
- - lib/pdf/reader/standard_security_handler.rb
119
- - lib/pdf/reader/cmap.rb
120
- - lib/pdf/reader/form_xobject.rb
121
- - lib/pdf/reader/object_cache.rb
158
+ - lib/pdf/reader/print_receiver.rb
159
+ - lib/pdf/reader/lzw.rb
160
+ - lib/pdf/reader/buffer.rb
122
161
  - lib/pdf/reader/object_stream.rb
123
- - lib/pdf/reader/encoding.rb
124
- - lib/pdf/reader/page_text_receiver.rb
162
+ - lib/pdf/reader/cmap.rb
125
163
  - lib/pdf/reader/text_receiver.rb
126
- - lib/pdf/reader/glyph_hash.rb
127
- - lib/pdf/reader/glyphlist.txt
128
- - lib/pdf/reader/lzw.rb
129
164
  - lib/pdf/reader/register_receiver.rb
130
- - lib/pdf/reader/page.rb
131
- - lib/pdf/reader/abstract_strategy.rb
132
- - lib/pdf/reader/pages_strategy.rb
133
- - lib/pdf/reader/reference.rb
134
- - lib/pdf/reader/encodings/standard.txt
165
+ - lib/pdf/reader/page_text_receiver.rb
135
166
  - lib/pdf/reader/encodings/mac_roman.txt
167
+ - lib/pdf/reader/encodings/zapf_dingbats.txt
136
168
  - lib/pdf/reader/encodings/symbol.txt
137
169
  - lib/pdf/reader/encodings/win_ansi.txt
138
- - lib/pdf/reader/encodings/zapf_dingbats.txt
139
- - lib/pdf/reader/encodings/pdf_doc.txt
140
170
  - lib/pdf/reader/encodings/mac_expert.txt
171
+ - lib/pdf/reader/encodings/standard.txt
172
+ - lib/pdf/reader/encodings/pdf_doc.txt
173
+ - lib/pdf/reader/filter.rb
174
+ - lib/pdf/reader/filter/null.rb
175
+ - lib/pdf/reader/filter/flate.rb
176
+ - lib/pdf/reader/filter/lzw.rb
177
+ - lib/pdf/reader/filter/ascii85.rb
178
+ - lib/pdf/reader/filter/ascii_hex.rb
179
+ - lib/pdf/reader/filter/run_length.rb
180
+ - lib/pdf/reader/filter/depredict.rb
181
+ - lib/pdf/reader/object_hash.rb
182
+ - lib/pdf/reader/reference.rb
183
+ - lib/pdf/reader/glyphlist.txt
184
+ - lib/pdf/reader/token.rb
185
+ - lib/pdf/reader/parser.rb
186
+ - lib/pdf/reader/page_state.rb
187
+ - lib/pdf/reader/error.rb
188
+ - lib/pdf/reader/glyph_hash.rb
141
189
  - lib/pdf/reader/resource_methods.rb
190
+ - lib/pdf/reader/standard_security_handler.rb
191
+ - lib/pdf/reader/form_xobject.rb
192
+ - lib/pdf/reader/stream.rb
193
+ - lib/pdf/reader/pages_strategy.rb
194
+ - lib/pdf/reader/abstract_strategy.rb
142
195
  - lib/pdf/reader/metadata_strategy.rb
143
- - lib/pdf/reader/token.rb
196
+ - lib/pdf/reader/object_cache.rb
144
197
  - lib/pdf-reader.rb
145
198
  - Rakefile
146
199
  - README.rdoc
@@ -181,7 +234,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
181
234
  version: '0'
182
235
  requirements: []
183
236
  rubyforge_project:
184
- rubygems_version: 1.8.11
237
+ rubygems_version: 1.8.23
185
238
  signing_key:
186
239
  specification_version: 3
187
240
  summary: A library for accessing the content of PDF files