pdf-reader 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -44,7 +44,7 @@ class PDF::Reader
44
44
  @pdf_version = read_version
45
45
  @xref = PDF::Reader::XRef.new(@io)
46
46
  @trailer = @xref.trailer
47
- @cache = PDF::Reader::ObjectCache.new
47
+ @cache = opts[:cache] || PDF::Reader::ObjectCache.new
48
48
  @sec_handler = build_security_handler(opts)
49
49
  end
50
50
 
@@ -20,14 +20,20 @@ module PDF
20
20
  # the raw PDF object that defines this page
21
21
  attr_reader :page_object
22
22
 
23
+ # a Hash-like object for storing cached data. Generally this is scoped to
24
+ # the current document and is used to avoid repeating expensive
25
+ # operations
26
+ attr_reader :cache
27
+
23
28
  # creates a new page wrapper.
24
29
  #
25
30
  # * objects - an ObjectHash instance that wraps a PDF file
26
31
  # * pagenum - an int specifying the page number to expose. 1 indexed.
27
32
  #
28
- def initialize(objects, pagenum)
33
+ def initialize(objects, pagenum, options = {})
29
34
  @objects, @pagenum = objects, pagenum
30
35
  @page_object = objects.deref(objects.page_references[pagenum - 1])
36
+ @cache = options[:cache] || {}
31
37
 
32
38
  unless @page_object.is_a?(::Hash)
33
39
  raise ArgumentError, "invalid page: #{pagenum}"
@@ -22,6 +22,7 @@ module PDF
22
22
  # starting a new page
23
23
  def initialize(page)
24
24
  @page = page
25
+ @cache = page.cache
25
26
  @objects = page.objects
26
27
  @font_stack = [build_fonts(page.fonts)]
27
28
  @xobject_stack = [page.xobjects]
@@ -176,7 +177,7 @@ module PDF
176
177
  concatenate_matrix(*matrix) if matrix
177
178
 
178
179
  if xobject.hash[:Subtype] == :Form
179
- form = PDF::Reader::FormXObject.new(@page, xobject)
180
+ form = PDF::Reader::FormXObject.new(@page, xobject, :cache => @cache)
180
181
  @font_stack.unshift(form.font_objects)
181
182
  @xobject_stack.unshift(form.xobjects)
182
183
  yield form if block_given?
@@ -58,7 +58,7 @@ class PDF::Reader
58
58
  end
59
59
 
60
60
  Array(hash[:Filter]).each_with_index do |filter, index|
61
- @udata = Filter.new(filter, options[index]).filter(@udata)
61
+ @udata = Filter.with(filter, options[index]).filter(@udata)
62
62
  end
63
63
  end
64
64
  @udata
@@ -53,9 +53,11 @@ class PDF::Reader
53
53
  #
54
54
  def initialize (io)
55
55
  @io = io
56
+ @junk_offset = calc_junk_offset(io) || 0
56
57
  @xref = {}
57
58
  @trailer = load_offsets
58
59
  end
60
+
59
61
  ################################################################################
60
62
  # return the number of objects in this file. Objects with multiple generations are
61
63
  # only counted once.
@@ -93,6 +95,7 @@ class PDF::Reader
93
95
  #
94
96
  def load_offsets(offset = nil)
95
97
  offset ||= new_buffer.find_first_xref_offset
98
+ offset += @junk_offset
96
99
 
97
100
  buf = new_buffer(offset)
98
101
  tok_one = buf.token
@@ -124,7 +127,7 @@ class PDF::Reader
124
127
  generation = buf.token.to_i
125
128
  state = buf.token
126
129
 
127
- store(objid, generation, offset) if state == "n" && offset > 0
130
+ store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
128
131
  objid += 1
129
132
  params.clear
130
133
  end
@@ -143,7 +146,7 @@ class PDF::Reader
143
146
  end
144
147
 
145
148
  ################################################################################
146
- # Read a XReaf stream from the underlying buffer instead of a traditional xref table.
149
+ # Read an XRef stream from the underlying buffer instead of a traditional xref table.
147
150
  #
148
151
  def load_xref_stream(stream)
149
152
  unless stream.is_a?(PDF::Reader::Stream) && stream.hash[:Type] == :XRef
@@ -169,7 +172,7 @@ class PDF::Reader
169
172
  f2 = unpack_bytes(entry[widths[0],widths[1]])
170
173
  f3 = unpack_bytes(entry[widths[0]+widths[1],widths[2]])
171
174
  if f1 == 1 && f2 > 0
172
- store(objid, f3, f2)
175
+ store(objid, f3, f2 + @junk_offset)
173
176
  elsif f1 == 2 && f2 > 0
174
177
  store(objid, 0, PDF::Reader::Reference.new(f2, 0))
175
178
  end
@@ -203,7 +206,7 @@ class PDF::Reader
203
206
  # Wrap the io stream we're working with in a buffer that can tokenise it for us.
204
207
  #
205
208
  # We create multiple buffers so we can be tokenising multiple sections of the file
206
- # at the same time without worring about clearing the buffers contents.
209
+ # at the same time without worrying about clearing the buffer's contents.
207
210
  #
208
211
  def new_buffer(offset = 0)
209
212
  PDF::Reader::Buffer.new(@io, :seek => offset)
@@ -214,6 +217,22 @@ class PDF::Reader
214
217
  def store (id, gen, offset)
215
218
  (@xref[id] ||= {})[gen] ||= offset
216
219
  end
220
+ ################################################################################
221
+ # Returns the offset of the PDF document in the +stream+. In theory this
222
+ # should always be 0, but all sorts of crazy junk is prefixed to PDF files
223
+ # in the real world.
224
+ #
225
+ # Checks up to 50 chars into the file, returns nil if no PDF data detected.
226
+ #
227
+ def calc_junk_offset(io)
228
+ io.rewind
229
+ offset = io.pos
230
+ until (c = io.readchar) == '%' || c == 37 || offset > 50
231
+ offset += 1
232
+ end
233
+ io.rewind
234
+ offset < 50 ? offset : nil
235
+ end
217
236
  end
218
237
  ################################################################################
219
238
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.1
4
+ version: 1.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-05-09 00:00:00.000000000 Z
12
+ date: 2012-08-30 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
16
- requirement: &35841860 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,15 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *35841860
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
25
30
  - !ruby/object:Gem::Dependency
26
31
  name: roodi
27
- requirement: &35841400 !ruby/object:Gem::Requirement
32
+ requirement: !ruby/object:Gem::Requirement
28
33
  none: false
29
34
  requirements:
30
35
  - - ! '>='
@@ -32,10 +37,15 @@ dependencies:
32
37
  version: '0'
33
38
  type: :development
34
39
  prerelease: false
35
- version_requirements: *35841400
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
36
46
  - !ruby/object:Gem::Dependency
37
47
  name: rspec
38
- requirement: &35840900 !ruby/object:Gem::Requirement
48
+ requirement: !ruby/object:Gem::Requirement
39
49
  none: false
40
50
  requirements:
41
51
  - - ~>
@@ -43,10 +53,15 @@ dependencies:
43
53
  version: '2.3'
44
54
  type: :development
45
55
  prerelease: false
46
- version_requirements: *35840900
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ~>
60
+ - !ruby/object:Gem::Version
61
+ version: '2.3'
47
62
  - !ruby/object:Gem::Dependency
48
63
  name: ZenTest
49
- requirement: &35840400 !ruby/object:Gem::Requirement
64
+ requirement: !ruby/object:Gem::Requirement
50
65
  none: false
51
66
  requirements:
52
67
  - - ~>
@@ -54,10 +69,15 @@ dependencies:
54
69
  version: 4.4.2
55
70
  type: :development
56
71
  prerelease: false
57
- version_requirements: *35840400
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ~>
76
+ - !ruby/object:Gem::Version
77
+ version: 4.4.2
58
78
  - !ruby/object:Gem::Dependency
59
79
  name: Ascii85
60
- requirement: &35839940 !ruby/object:Gem::Requirement
80
+ requirement: !ruby/object:Gem::Requirement
61
81
  none: false
62
82
  requirements:
63
83
  - - ~>
@@ -65,18 +85,44 @@ dependencies:
65
85
  version: 1.0.0
66
86
  type: :runtime
67
87
  prerelease: false
68
- version_requirements: *35839940
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ~>
92
+ - !ruby/object:Gem::Version
93
+ version: 1.0.0
69
94
  - !ruby/object:Gem::Dependency
70
95
  name: ruby-rc4
71
- requirement: &35839520 !ruby/object:Gem::Requirement
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :runtime
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
72
105
  none: false
73
106
  requirements:
74
107
  - - ! '>='
75
108
  - !ruby/object:Gem::Version
76
109
  version: '0'
110
+ - !ruby/object:Gem::Dependency
111
+ name: hashery
112
+ requirement: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ~>
116
+ - !ruby/object:Gem::Version
117
+ version: '2.0'
77
118
  type: :runtime
78
119
  prerelease: false
79
- version_requirements: *35839520
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ~>
124
+ - !ruby/object:Gem::Version
125
+ version: '2.0'
80
126
  description: The PDF::Reader library implements a PDF parser conforming as much as
81
127
  possible to the PDF specification from Adobe
82
128
  email:
@@ -93,54 +139,61 @@ extra_rdoc_files:
93
139
  - CHANGELOG
94
140
  - MIT-LICENSE
95
141
  files:
142
+ - examples/extract_fonts.rb
143
+ - examples/hash.rb
144
+ - examples/extract_bates.rb
96
145
  - examples/metadata.rb
97
146
  - examples/extract_images.rb
98
- - examples/extract_bates.rb
99
- - examples/callbacks.rb
100
147
  - examples/rspec.rb
101
- - examples/hash.rb
102
- - examples/text.rb
103
- - examples/extract_fonts.rb
104
148
  - examples/page_count.rb
149
+ - examples/callbacks.rb
150
+ - examples/text.rb
105
151
  - examples/version.rb
106
- - lib/pdf/reader.rb
107
152
  - lib/pdf/hash.rb
108
- - lib/pdf/reader/print_receiver.rb
153
+ - lib/pdf/reader.rb
109
154
  - lib/pdf/reader/xref.rb
110
- - lib/pdf/reader/buffer.rb
155
+ - lib/pdf/reader/page.rb
156
+ - lib/pdf/reader/encoding.rb
111
157
  - lib/pdf/reader/font.rb
112
- - lib/pdf/reader/parser.rb
113
- - lib/pdf/reader/error.rb
114
- - lib/pdf/reader/filter.rb
115
- - lib/pdf/reader/object_hash.rb
116
- - lib/pdf/reader/stream.rb
117
- - lib/pdf/reader/page_state.rb
118
- - lib/pdf/reader/standard_security_handler.rb
119
- - lib/pdf/reader/cmap.rb
120
- - lib/pdf/reader/form_xobject.rb
121
- - lib/pdf/reader/object_cache.rb
158
+ - lib/pdf/reader/print_receiver.rb
159
+ - lib/pdf/reader/lzw.rb
160
+ - lib/pdf/reader/buffer.rb
122
161
  - lib/pdf/reader/object_stream.rb
123
- - lib/pdf/reader/encoding.rb
124
- - lib/pdf/reader/page_text_receiver.rb
162
+ - lib/pdf/reader/cmap.rb
125
163
  - lib/pdf/reader/text_receiver.rb
126
- - lib/pdf/reader/glyph_hash.rb
127
- - lib/pdf/reader/glyphlist.txt
128
- - lib/pdf/reader/lzw.rb
129
164
  - lib/pdf/reader/register_receiver.rb
130
- - lib/pdf/reader/page.rb
131
- - lib/pdf/reader/abstract_strategy.rb
132
- - lib/pdf/reader/pages_strategy.rb
133
- - lib/pdf/reader/reference.rb
134
- - lib/pdf/reader/encodings/standard.txt
165
+ - lib/pdf/reader/page_text_receiver.rb
135
166
  - lib/pdf/reader/encodings/mac_roman.txt
167
+ - lib/pdf/reader/encodings/zapf_dingbats.txt
136
168
  - lib/pdf/reader/encodings/symbol.txt
137
169
  - lib/pdf/reader/encodings/win_ansi.txt
138
- - lib/pdf/reader/encodings/zapf_dingbats.txt
139
- - lib/pdf/reader/encodings/pdf_doc.txt
140
170
  - lib/pdf/reader/encodings/mac_expert.txt
171
+ - lib/pdf/reader/encodings/standard.txt
172
+ - lib/pdf/reader/encodings/pdf_doc.txt
173
+ - lib/pdf/reader/filter.rb
174
+ - lib/pdf/reader/filter/null.rb
175
+ - lib/pdf/reader/filter/flate.rb
176
+ - lib/pdf/reader/filter/lzw.rb
177
+ - lib/pdf/reader/filter/ascii85.rb
178
+ - lib/pdf/reader/filter/ascii_hex.rb
179
+ - lib/pdf/reader/filter/run_length.rb
180
+ - lib/pdf/reader/filter/depredict.rb
181
+ - lib/pdf/reader/object_hash.rb
182
+ - lib/pdf/reader/reference.rb
183
+ - lib/pdf/reader/glyphlist.txt
184
+ - lib/pdf/reader/token.rb
185
+ - lib/pdf/reader/parser.rb
186
+ - lib/pdf/reader/page_state.rb
187
+ - lib/pdf/reader/error.rb
188
+ - lib/pdf/reader/glyph_hash.rb
141
189
  - lib/pdf/reader/resource_methods.rb
190
+ - lib/pdf/reader/standard_security_handler.rb
191
+ - lib/pdf/reader/form_xobject.rb
192
+ - lib/pdf/reader/stream.rb
193
+ - lib/pdf/reader/pages_strategy.rb
194
+ - lib/pdf/reader/abstract_strategy.rb
142
195
  - lib/pdf/reader/metadata_strategy.rb
143
- - lib/pdf/reader/token.rb
196
+ - lib/pdf/reader/object_cache.rb
144
197
  - lib/pdf-reader.rb
145
198
  - Rakefile
146
199
  - README.rdoc
@@ -181,7 +234,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
181
234
  version: '0'
182
235
  requirements: []
183
236
  rubyforge_project:
184
- rubygems_version: 1.8.11
237
+ rubygems_version: 1.8.23
185
238
  signing_key:
186
239
  specification_version: 3
187
240
  summary: A library for accessing the content of PDF files