pdf-reader 1.1.1 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG +87 -2
  3. data/{README.rdoc → README.md} +43 -31
  4. data/Rakefile +21 -16
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_object +4 -1
  7. data/bin/pdf_text +1 -3
  8. data/examples/callbacks.rb +2 -1
  9. data/examples/extract_images.rb +11 -6
  10. data/examples/fuzzy_paragraphs.rb +24 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
  14. data/lib/pdf/reader/afm/Courier.afm +342 -0
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -0
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
  26. data/lib/pdf/reader/buffer.rb +90 -63
  27. data/lib/pdf/reader/cid_widths.rb +63 -0
  28. data/lib/pdf/reader/cmap.rb +69 -38
  29. data/lib/pdf/reader/encoding.rb +74 -48
  30. data/lib/pdf/reader/error.rb +24 -4
  31. data/lib/pdf/reader/filter/ascii85.rb +28 -0
  32. data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
  33. data/lib/pdf/reader/filter/depredict.rb +141 -0
  34. data/lib/pdf/reader/filter/flate.rb +53 -0
  35. data/lib/pdf/reader/filter/lzw.rb +21 -0
  36. data/lib/pdf/reader/filter/null.rb +18 -0
  37. data/lib/pdf/reader/filter/run_length.rb +45 -0
  38. data/lib/pdf/reader/filter.rb +15 -234
  39. data/lib/pdf/reader/font.rb +107 -43
  40. data/lib/pdf/reader/font_descriptor.rb +80 -0
  41. data/lib/pdf/reader/form_xobject.rb +26 -4
  42. data/lib/pdf/reader/glyph_hash.rb +56 -18
  43. data/lib/pdf/reader/lzw.rb +6 -4
  44. data/lib/pdf/reader/null_security_handler.rb +17 -0
  45. data/lib/pdf/reader/object_cache.rb +40 -16
  46. data/lib/pdf/reader/object_hash.rb +94 -40
  47. data/lib/pdf/reader/object_stream.rb +1 -0
  48. data/lib/pdf/reader/orientation_detector.rb +34 -0
  49. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  50. data/lib/pdf/reader/page.rb +48 -3
  51. data/lib/pdf/reader/page_layout.rb +125 -0
  52. data/lib/pdf/reader/page_state.rb +185 -70
  53. data/lib/pdf/reader/page_text_receiver.rb +70 -20
  54. data/lib/pdf/reader/pages_strategy.rb +4 -293
  55. data/lib/pdf/reader/parser.rb +37 -61
  56. data/lib/pdf/reader/print_receiver.rb +6 -0
  57. data/lib/pdf/reader/reference.rb +4 -1
  58. data/lib/pdf/reader/register_receiver.rb +17 -31
  59. data/lib/pdf/reader/resource_methods.rb +1 -0
  60. data/lib/pdf/reader/standard_security_handler.rb +82 -42
  61. data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
  62. data/lib/pdf/reader/stream.rb +5 -2
  63. data/lib/pdf/reader/synchronized_cache.rb +33 -0
  64. data/lib/pdf/reader/text_run.rb +99 -0
  65. data/lib/pdf/reader/token.rb +4 -1
  66. data/lib/pdf/reader/transformation_matrix.rb +195 -0
  67. data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
  68. data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
  69. data/lib/pdf/reader/width_calculator/composite.rb +28 -0
  70. data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
  71. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
  72. data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
  73. data/lib/pdf/reader/width_calculator.rb +12 -0
  74. data/lib/pdf/reader/xref.rb +41 -9
  75. data/lib/pdf/reader.rb +45 -104
  76. data/lib/pdf-reader.rb +4 -1
  77. metadata +220 -101
  78. data/bin/pdf_list_callbacks +0 -17
  79. data/lib/pdf/hash.rb +0 -15
  80. data/lib/pdf/reader/abstract_strategy.rb +0 -81
  81. data/lib/pdf/reader/metadata_strategy.rb +0 -56
  82. data/lib/pdf/reader/text_receiver.rb +0 -264
@@ -0,0 +1,17 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+
6
+ # Security handler for when we don't support the flavour of encryption
7
+ # used in a PDF.
8
+ class UnimplementedSecurityHandler
9
+ def self.supports?(encrypt)
10
+ true
11
+ end
12
+
13
+ def decrypt(buf, ref)
14
+ raise PDF::Reader::EncryptedPDFError, "Unsupported encryption style"
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,67 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ require 'afm'
5
+ require 'pdf/reader/synchronized_cache'
6
+
7
+ class PDF::Reader
8
+ module WidthCalculator
9
+
10
+ # Type1 fonts can be one of 14 "built in" standard fonts. In these cases,
11
+ # the reader is expected to have its own copy of the font metrics.
12
+ # see Section 9.6.2.2, PDF 32000-1:2008, pp 256
13
+ class BuiltIn
14
+
15
+ BUILTINS = [
16
+ :Courier, :"Courier-Bold", :"Courier-BoldOblique", :"Courier-Oblique",
17
+ :Helvetica, :"Helvetica-Bold", :"Helvetica-BoldOblique", :"Helvetica-Oblique",
18
+ :Symbol,
19
+ :"Times-Roman", :"Times-Bold", :"Times-BoldItalic", :"Times-Italic",
20
+ :ZapfDingbats
21
+ ]
22
+
23
+ def initialize(font)
24
+ @font = font
25
+ @@all_metrics ||= PDF::Reader::SynchronizedCache.new
26
+
27
+ basefont = extract_basefont(font.basefont)
28
+ metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{basefont}.afm")
29
+
30
+ if File.file?(metrics_path)
31
+ @metrics = @@all_metrics[metrics_path] ||= AFM::Font.new(metrics_path)
32
+ else
33
+ raise ArgumentError, "No built-in metrics for #{font.basefont}"
34
+ end
35
+ end
36
+
37
+ def glyph_width(code_point)
38
+ return 0 if code_point.nil? || code_point < 0
39
+
40
+ names = @font.encoding.int_to_name(code_point)
41
+ metrics = names.map { |name|
42
+ @metrics.char_metrics[name.to_s]
43
+ }.compact.first
44
+
45
+ if metrics
46
+ metrics[:wx]
47
+ else
48
+ @font.widths[code_point - 1] || 0
49
+ end
50
+ end
51
+
52
+ private
53
+
54
+ def control_character?(code_point)
55
+ @font.encoding.int_to_name(code_point).first.to_s[/\Acontrol..\Z/]
56
+ end
57
+
58
+ def extract_basefont(font_name)
59
+ if BUILTINS.include?(font_name)
60
+ font_name
61
+ else
62
+ "Times-Roman"
63
+ end
64
+ end
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+ module WidthCalculator
6
+ # CIDFontType0 or CIDFontType2 use DW (integer) and W (array) to determine
7
+ # codepoint widths, note that CIDFontType2 will contain a true type font
8
+ # program which could be used to calculate width, however, a conforming writer
9
+ # is supposed to convert the widths for the codepoints used into the W array
10
+ # so that it can be used.
11
+ # see Section 9.7.4.1, PDF 32000-1:2008, pp 269-270
12
+ class Composite
13
+
14
+ def initialize(font)
15
+ @font = font
16
+ @widths = PDF::Reader::CidWidths.new(@font.cid_default_width, @font.cid_widths)
17
+ end
18
+
19
+ def glyph_width(code_point)
20
+ return 0 if code_point.nil? || code_point < 0
21
+
22
+ w = @widths[code_point]
23
+ # 0 is a valid width
24
+ return w.to_f unless w.nil?
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,56 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+ module WidthCalculator
6
+ # Calculates the width of a glyph in a TrueType font
7
+ class TrueType
8
+
9
+ def initialize(font)
10
+ @font = font
11
+
12
+ if @font.font_descriptor
13
+ @missing_width = @font.font_descriptor.missing_width
14
+ else
15
+ @missing_width = 0
16
+ end
17
+ end
18
+
19
+ def glyph_width(code_point)
20
+ return 0 if code_point.nil? || code_point < 0
21
+ glyph_width_from_font(code_point) || glyph_width_from_descriptor(code_point) || 0
22
+ end
23
+
24
+ private
25
+
26
+ #TODO convert Type3 units 1000 units => 1 text space unit
27
+ def glyph_width_from_font(code_point)
28
+ return if @font.widths.nil? || @font.widths.count == 0
29
+
30
+ # in ruby a negative index is valid, and will go from the end of the array
31
+ # which is undesirable in this case.
32
+ if @font.first_char <= code_point
33
+ @font.widths.fetch(code_point - @font.first_char, @missing_width).to_f
34
+ else
35
+ @missing_width.to_f
36
+ end
37
+ end
38
+
39
+ def glyph_width_from_descriptor(code_point)
40
+ return unless @font.font_descriptor
41
+
42
+ # true type fonts will have most of their information contained
43
+ # with-in a program inside the font descriptor, however the widths
44
+ # may not be in standard PDF glyph widths (1000 units => 1 text space unit)
45
+ # so this width will need to be scaled
46
+ w = @font.font_descriptor.glyph_width(code_point)
47
+ if w
48
+ w.to_f * @font.font_descriptor.glyph_to_pdf_scale_factor
49
+ else
50
+ nil
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end
56
+
@@ -0,0 +1,33 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+ module WidthCalculator
6
+ # Calculates the width of a glyph in a Type One or Type Three font
7
+ class TypeOneOrThree
8
+
9
+ def initialize(font)
10
+ @font = font
11
+
12
+ if @font.font_descriptor
13
+ @missing_width = @font.font_descriptor.missing_width
14
+ else
15
+ @missing_width = 0
16
+ end
17
+ end
18
+
19
+ def glyph_width(code_point)
20
+ return 0 if code_point.nil? || code_point < 0
21
+ return 0 if @font.widths.nil? || @font.widths.count == 0
22
+
23
+ # in ruby a negative index is valid, and will go from the end of the array
24
+ # which is undesirable in this case.
25
+ if @font.first_char <= code_point
26
+ @font.widths.fetch(code_point - @font.first_char, @missing_width).to_f
27
+ else
28
+ @missing_width.to_f
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+ module WidthCalculator
6
+ # Type0 (or Composite) fonts are a "root font" that rely on a "descendant font"
7
+ # to do the heavy lifting. The "descendant font" is a CID-Keyed font.
8
+ # see Section 9.7.1, PDF 32000-1:2008, pp 267
9
+ # so if we are calculating a Type0 font width, we just pass off to
10
+ # the descendant font
11
+ class TypeZero
12
+
13
+ def initialize(font)
14
+ @font = font
15
+ @descendant_font = @font.descendantfonts.first
16
+ end
17
+
18
+ def glyph_width(code_point)
19
+ return 0 if code_point.nil? || code_point < 0
20
+
21
+ @descendant_font.glyph_width(code_point).to_f
22
+ end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,12 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ # PDF files may define fonts in a number of ways. Each approach means we must
5
+ # calculate glyph widths differently, so this set of classes conform to an
6
+ # interface that will perform the appropriate calculations.
7
+
8
+ require 'pdf/reader/width_calculator/built_in'
9
+ require 'pdf/reader/width_calculator/composite'
10
+ require 'pdf/reader/width_calculator/true_type'
11
+ require 'pdf/reader/width_calculator/type_zero'
12
+ require 'pdf/reader/width_calculator/type_one_or_three'
@@ -1,3 +1,6 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
1
4
  ################################################################################
2
5
  #
3
6
  # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
@@ -51,11 +54,13 @@ class PDF::Reader
51
54
  #
52
55
  # io - must be an IO object, generally either a file or a StringIO
53
56
  #
54
- def initialize (io)
57
+ def initialize(io)
55
58
  @io = io
59
+ @junk_offset = calc_junk_offset(io) || 0
56
60
  @xref = {}
57
61
  @trailer = load_offsets
58
62
  end
63
+
59
64
  ################################################################################
60
65
  # return the number of objects in this file. Objects with multiple generations are
61
66
  # only counted once.
@@ -93,6 +98,7 @@ class PDF::Reader
93
98
  #
94
99
  def load_offsets(offset = nil)
95
100
  offset ||= new_buffer.find_first_xref_offset
101
+ offset += @junk_offset
96
102
 
97
103
  buf = new_buffer(offset)
98
104
  tok_one = buf.token
@@ -108,7 +114,8 @@ class PDF::Reader
108
114
  return load_xref_stream(stream)
109
115
  end
110
116
 
111
- raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{tok_one} != xref)"
117
+ raise PDF::Reader::MalformedPDFError,
118
+ "xref table not found at offset #{offset} (#{tok_one} != xref)"
112
119
  end
113
120
  ################################################################################
114
121
  # Assumes the underlying buffer is positioned at the start of a traditional
@@ -124,7 +131,7 @@ class PDF::Reader
124
131
  generation = buf.token.to_i
125
132
  state = buf.token
126
133
 
127
- store(objid, generation, offset) if state == "n" && offset > 0
134
+ store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
128
135
  objid += 1
129
136
  params.clear
130
137
  end
@@ -134,7 +141,9 @@ class PDF::Reader
134
141
 
135
142
  trailer = Parser.new(buf, self).parse_token
136
143
 
137
- raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless trailer.kind_of?(Hash)
144
+ unless trailer.kind_of?(Hash)
145
+ raise MalformedPDFError, "PDF malformed, trailer should be a dictionary"
146
+ end
138
147
 
139
148
  load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
140
149
  load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
@@ -143,7 +152,7 @@ class PDF::Reader
143
152
  end
144
153
 
145
154
  ################################################################################
146
- # Read a XReaf stream from the underlying buffer instead of a traditional xref table.
155
+ # Read an XRef stream from the underlying buffer instead of a traditional xref table.
147
156
  #
148
157
  def load_xref_stream(stream)
149
158
  unless stream.is_a?(PDF::Reader::Stream) && stream.hash[:Type] == :XRef
@@ -169,7 +178,7 @@ class PDF::Reader
169
178
  f2 = unpack_bytes(entry[widths[0],widths[1]])
170
179
  f3 = unpack_bytes(entry[widths[0]+widths[1],widths[2]])
171
180
  if f1 == 1 && f2 > 0
172
- store(objid, f3, f2)
181
+ store(objid, f3, f2 + @junk_offset)
173
182
  elsif f1 == 2 && f2 > 0
174
183
  store(objid, 0, PDF::Reader::Reference.new(f2, 0))
175
184
  end
@@ -195,15 +204,17 @@ class PDF::Reader
195
204
  ("\x00" + bytes).unpack("N")[0]
196
205
  elsif bytes.size == 4
197
206
  bytes.unpack("N")[0]
207
+ elsif bytes.size == 8
208
+ bytes.unpack("Q>")[0]
198
209
  else
199
- raise UnsupportedFeatureError, "Unable to unpack xref stream entries with more than 4 bytes"
210
+ raise UnsupportedFeatureError, "Unable to unpack xref stream entries of #{bytes.size} bytes"
200
211
  end
201
212
  end
202
213
  ################################################################################
203
214
  # Wrap the io stream we're working with in a buffer that can tokenise it for us.
204
215
  #
205
216
  # We create multiple buffers so we can be tokenising multiple sections of the file
206
- # at the same time without worring about clearing the buffers contents.
217
+ # at the same time without worrying about clearing the buffer's contents.
207
218
  #
208
219
  def new_buffer(offset = 0)
209
220
  PDF::Reader::Buffer.new(@io, :seek => offset)
@@ -211,9 +222,30 @@ class PDF::Reader
211
222
  ################################################################################
212
223
  # Stores an offset value for a particular PDF object ID and revision number
213
224
  #
214
- def store (id, gen, offset)
225
+ def store(id, gen, offset)
215
226
  (@xref[id] ||= {})[gen] ||= offset
216
227
  end
228
+ ################################################################################
229
+ # Returns the offset of the PDF document in the +stream+. In theory this
230
+ # should always be 0, but all sort of crazy junk is prefixed to PDF files
231
+ # in the real world.
232
+ #
233
+ # Checks up to 1024 chars into the file,
234
+ # returns nil if no PDF data detected.
235
+ # Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
236
+ # header appear somewhere within the first 1024 bytes of the file
237
+ #
238
+ def calc_junk_offset(io)
239
+ io.rewind
240
+ offset = io.pos
241
+ until (c = io.readchar) == '%' || c == 37 || offset > 1024
242
+ offset += 1
243
+ end
244
+ io.rewind
245
+ offset < 1024 ? offset : nil
246
+ rescue EOFError
247
+ nil
248
+ end
217
249
  end
218
250
  ################################################################################
219
251
  end
data/lib/pdf/reader.rb CHANGED
@@ -1,3 +1,6 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
1
4
  ################################################################################
2
5
  #
3
6
  # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
@@ -25,9 +28,6 @@
25
28
  ################################################################################
26
29
 
27
30
  require 'stringio'
28
- require 'zlib'
29
-
30
- require 'ascii85'
31
31
 
32
32
  module PDF
33
33
  ################################################################################
@@ -111,10 +111,10 @@ module PDF
111
111
  #
112
112
  # reader = PDF::Reader.new("somefile.pdf", :password => "apples")
113
113
  #
114
- def initialize(input = nil, opts = {})
115
- if input # support the deprecated Reader API
116
- @objects = PDF::Reader::ObjectHash.new(input, opts)
117
- end
114
+ def initialize(input, opts = {})
115
+ @cache = PDF::Reader::ObjectCache.new
116
+ opts.merge!(:cache => @cache)
117
+ @objects = PDF::Reader::ObjectHash.new(input, opts)
118
118
  end
119
119
 
120
120
  def info
@@ -128,13 +128,16 @@ module PDF
128
128
  nil
129
129
  else
130
130
  xml = stream.unfiltered_data
131
- xml.force_encoding("utf-8") if xml.respond_to?(:force_encoding)
131
+ xml.force_encoding("utf-8")
132
132
  xml
133
133
  end
134
134
  end
135
135
 
136
136
  def page_count
137
137
  pages = @objects.deref(root[:Pages])
138
+ unless pages.kind_of?(::Hash)
139
+ raise MalformedPDFError, 'Pages structure is missing'
140
+ end
138
141
  @page_count ||= @objects.deref(pages[:Count])
139
142
  end
140
143
 
@@ -159,53 +162,6 @@ module PDF
159
162
  yield PDF::Reader.new(input, opts)
160
163
  end
161
164
 
162
- # DEPRECATED: this method was deprecated in version 1.0.0 and will
163
- # eventually be removed
164
- #
165
- #
166
- # Parse the file with the given name, sending events to the given receiver.
167
- #
168
- def self.file(name, receivers, opts = {})
169
- File.open(name,"rb") do |f|
170
- new.parse(f, receivers, opts)
171
- end
172
- end
173
-
174
- # DEPRECATED: this method was deprecated in version 1.0.0 and will
175
- # eventually be removed
176
- #
177
- # Parse the given string, sending events to the given receiver.
178
- #
179
- def self.string(str, receivers, opts = {})
180
- StringIO.open(str) do |s|
181
- new.parse(s, receivers, opts)
182
- end
183
- end
184
-
185
- # DEPRECATED: this method was deprecated in version 1.0.0 and will
186
- # eventually be removed
187
- #
188
- # Parse the file with the given name, returning an unmarshalled ruby version of
189
- # represents the requested pdf object
190
- #
191
- def self.object_file(name, id, gen = 0)
192
- File.open(name,"rb") { |f|
193
- new.object(f, id.to_i, gen.to_i)
194
- }
195
- end
196
-
197
- # DEPRECATED: this method was deprecated in version 1.0.0 and will
198
- # eventually be removed
199
- #
200
- # Parse the given string, returning an unmarshalled ruby version of represents
201
- # the requested pdf object
202
- #
203
- def self.object_string(str, id, gen = 0)
204
- StringIO.open(str) { |s|
205
- new.object(s, id.to_i, gen.to_i)
206
- }
207
- end
208
-
209
165
  # returns an array of PDF::Reader::Page objects, one for each
210
166
  # page in the source PDF.
211
167
  #
@@ -221,9 +177,13 @@ module PDF
221
177
  # methods available on each page
222
178
  #
223
179
  def pages
224
- (1..self.page_count).map { |num|
225
- PDF::Reader::Page.new(@objects, num)
226
- }
180
+ (1..self.page_count).map do |num|
181
+ begin
182
+ PDF::Reader::Page.new(@objects, num, :cache => @cache)
183
+ rescue InvalidPageError
184
+ raise MalformedPDFError, "Missing data for page: #{num}"
185
+ end
186
+ end
227
187
  end
228
188
 
229
189
  # returns a single PDF::Reader::Page for the specified page.
@@ -240,38 +200,10 @@ module PDF
240
200
  #
241
201
  def page(num)
242
202
  num = num.to_i
243
- raise ArgumentError, "valid pages are 1 .. #{self.page_count}" if num < 1 || num > self.page_count
244
- PDF::Reader::Page.new(@objects, num)
245
- end
246
-
247
-
248
- # DEPRECATED: this method was deprecated in version 1.0.0 and will
249
- # eventually be removed
250
- #
251
- # Given an IO object that contains PDF data, parse it.
252
- #
253
- def parse(io, receivers, opts = {})
254
- ohash = ObjectHash.new(io)
255
-
256
- options = {:pages => true, :raw_text => false, :metadata => true}
257
- options.merge!(opts)
258
-
259
- strategies.each do |s|
260
- s.new(ohash, receivers, options).process
203
+ if num < 1 || num > self.page_count
204
+ raise InvalidPageError, "Valid pages are 1 .. #{self.page_count}"
261
205
  end
262
-
263
- self
264
- end
265
-
266
- # DEPRECATED: this method was deprecated in version 1.0.0 and will
267
- # eventually be removed
268
- #
269
- # Given an IO object that contains PDF data, return the contents of a single object
270
- #
271
- def object (io, id, gen)
272
- @objects = ObjectHash.new(io)
273
-
274
- @objects.deref(Reference.new(id, gen))
206
+ PDF::Reader::Page.new(@objects, num, :cache => @cache)
275
207
  end
276
208
 
277
209
  private
@@ -295,14 +227,14 @@ module PDF
295
227
  pdfdoc_to_utf8(obj)
296
228
  end
297
229
  else
298
- obj
230
+ @objects.deref(obj)
299
231
  end
300
232
  end
301
233
 
302
234
  # TODO find a PDF I can use to spec this behaviour
303
235
  #
304
236
  def pdfdoc_to_utf8(obj)
305
- obj.force_encoding("utf-8") if obj.respond_to?(:force_encoding)
237
+ obj.force_encoding("utf-8")
306
238
  obj
307
239
  end
308
240
 
@@ -312,19 +244,18 @@ module PDF
312
244
  def utf16_to_utf8(obj)
313
245
  str = obj[2, obj.size]
314
246
  str = str.unpack("n*").pack("U*")
315
- str.force_encoding("utf-8") if str.respond_to?(:force_encoding)
247
+ str.force_encoding("utf-8")
316
248
  str
317
249
  end
318
250
 
319
- def strategies
320
- @strategies ||= [
321
- ::PDF::Reader::MetadataStrategy,
322
- ::PDF::Reader::PagesStrategy
323
- ]
324
- end
325
-
326
251
  def root
327
- @root ||= @objects.deref(@objects.trailer[:Root])
252
+ @root ||= begin
253
+ obj = @objects.deref(@objects.trailer[:Root])
254
+ unless obj.kind_of?(::Hash)
255
+ raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
256
+ end
257
+ obj
258
+ end
328
259
  end
329
260
 
330
261
  end
@@ -332,17 +263,24 @@ end
332
263
  ################################################################################
333
264
 
334
265
  require 'pdf/reader/resource_methods'
335
- require 'pdf/reader/abstract_strategy'
336
266
  require 'pdf/reader/buffer'
267
+ require 'pdf/reader/cid_widths'
337
268
  require 'pdf/reader/cmap'
338
269
  require 'pdf/reader/encoding'
339
270
  require 'pdf/reader/error'
340
271
  require 'pdf/reader/filter'
272
+ require 'pdf/reader/filter/ascii85'
273
+ require 'pdf/reader/filter/ascii_hex'
274
+ require 'pdf/reader/filter/depredict'
275
+ require 'pdf/reader/filter/flate'
276
+ require 'pdf/reader/filter/lzw'
277
+ require 'pdf/reader/filter/null'
278
+ require 'pdf/reader/filter/run_length'
341
279
  require 'pdf/reader/font'
280
+ require 'pdf/reader/font_descriptor'
342
281
  require 'pdf/reader/form_xobject'
343
282
  require 'pdf/reader/glyph_hash'
344
283
  require 'pdf/reader/lzw'
345
- require 'pdf/reader/metadata_strategy'
346
284
  require 'pdf/reader/object_cache'
347
285
  require 'pdf/reader/object_hash'
348
286
  require 'pdf/reader/object_stream'
@@ -351,12 +289,15 @@ require 'pdf/reader/parser'
351
289
  require 'pdf/reader/print_receiver'
352
290
  require 'pdf/reader/reference'
353
291
  require 'pdf/reader/register_receiver'
292
+ require 'pdf/reader/null_security_handler'
354
293
  require 'pdf/reader/standard_security_handler'
294
+ require 'pdf/reader/standard_security_handler_v5'
295
+ require 'pdf/reader/unimplemented_security_handler'
355
296
  require 'pdf/reader/stream'
356
- require 'pdf/reader/text_receiver'
297
+ require 'pdf/reader/text_run'
357
298
  require 'pdf/reader/page_state'
358
299
  require 'pdf/reader/page_text_receiver'
359
300
  require 'pdf/reader/token'
360
301
  require 'pdf/reader/xref'
302
+ require 'pdf/reader/orientation_detector'
361
303
  require 'pdf/reader/page'
362
- require 'pdf/hash'