pdf-reader 1.1.1 → 2.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG +87 -2
  3. data/{README.rdoc → README.md} +43 -31
  4. data/Rakefile +21 -16
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_object +4 -1
  7. data/bin/pdf_text +1 -3
  8. data/examples/callbacks.rb +2 -1
  9. data/examples/extract_images.rb +11 -6
  10. data/examples/fuzzy_paragraphs.rb +24 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
  14. data/lib/pdf/reader/afm/Courier.afm +342 -0
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -0
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
  26. data/lib/pdf/reader/buffer.rb +90 -63
  27. data/lib/pdf/reader/cid_widths.rb +63 -0
  28. data/lib/pdf/reader/cmap.rb +69 -38
  29. data/lib/pdf/reader/encoding.rb +74 -48
  30. data/lib/pdf/reader/error.rb +24 -4
  31. data/lib/pdf/reader/filter/ascii85.rb +28 -0
  32. data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
  33. data/lib/pdf/reader/filter/depredict.rb +141 -0
  34. data/lib/pdf/reader/filter/flate.rb +53 -0
  35. data/lib/pdf/reader/filter/lzw.rb +21 -0
  36. data/lib/pdf/reader/filter/null.rb +18 -0
  37. data/lib/pdf/reader/filter/run_length.rb +45 -0
  38. data/lib/pdf/reader/filter.rb +15 -234
  39. data/lib/pdf/reader/font.rb +107 -43
  40. data/lib/pdf/reader/font_descriptor.rb +80 -0
  41. data/lib/pdf/reader/form_xobject.rb +26 -4
  42. data/lib/pdf/reader/glyph_hash.rb +56 -18
  43. data/lib/pdf/reader/lzw.rb +6 -4
  44. data/lib/pdf/reader/null_security_handler.rb +17 -0
  45. data/lib/pdf/reader/object_cache.rb +40 -16
  46. data/lib/pdf/reader/object_hash.rb +94 -40
  47. data/lib/pdf/reader/object_stream.rb +1 -0
  48. data/lib/pdf/reader/orientation_detector.rb +34 -0
  49. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  50. data/lib/pdf/reader/page.rb +48 -3
  51. data/lib/pdf/reader/page_layout.rb +125 -0
  52. data/lib/pdf/reader/page_state.rb +185 -70
  53. data/lib/pdf/reader/page_text_receiver.rb +70 -20
  54. data/lib/pdf/reader/pages_strategy.rb +4 -293
  55. data/lib/pdf/reader/parser.rb +37 -61
  56. data/lib/pdf/reader/print_receiver.rb +6 -0
  57. data/lib/pdf/reader/reference.rb +4 -1
  58. data/lib/pdf/reader/register_receiver.rb +17 -31
  59. data/lib/pdf/reader/resource_methods.rb +1 -0
  60. data/lib/pdf/reader/standard_security_handler.rb +82 -42
  61. data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
  62. data/lib/pdf/reader/stream.rb +5 -2
  63. data/lib/pdf/reader/synchronized_cache.rb +33 -0
  64. data/lib/pdf/reader/text_run.rb +99 -0
  65. data/lib/pdf/reader/token.rb +4 -1
  66. data/lib/pdf/reader/transformation_matrix.rb +195 -0
  67. data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
  68. data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
  69. data/lib/pdf/reader/width_calculator/composite.rb +28 -0
  70. data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
  71. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
  72. data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
  73. data/lib/pdf/reader/width_calculator.rb +12 -0
  74. data/lib/pdf/reader/xref.rb +41 -9
  75. data/lib/pdf/reader.rb +45 -104
  76. data/lib/pdf-reader.rb +4 -1
  77. metadata +220 -101
  78. data/bin/pdf_list_callbacks +0 -17
  79. data/lib/pdf/hash.rb +0 -15
  80. data/lib/pdf/reader/abstract_strategy.rb +0 -81
  81. data/lib/pdf/reader/metadata_strategy.rb +0 -56
  82. data/lib/pdf/reader/text_receiver.rb +0 -264
@@ -0,0 +1,17 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+
6
+ # Security handler for when we don't support the flavour of encryption
7
+ # used in a PDF.
8
+ class UnimplementedSecurityHandler
9
+ def self.supports?(encrypt)
10
+ true
11
+ end
12
+
13
+ def decrypt(buf, ref)
14
+ raise PDF::Reader::EncryptedPDFError, "Unsupported encryption style"
15
+ end
16
+ end
17
+ end
@@ -0,0 +1,67 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ require 'afm'
5
+ require 'pdf/reader/synchronized_cache'
6
+
7
+ class PDF::Reader
8
+ module WidthCalculator
9
+
10
+ # Type1 fonts can be one of 14 "built in" standard fonts. In these cases,
11
+ # the reader is expected to have its own copy of the font metrics.
12
+ # see Section 9.6.2.2, PDF 32000-1:2008, pp 256
13
+ class BuiltIn
14
+
15
+ BUILTINS = [
16
+ :Courier, :"Courier-Bold", :"Courier-BoldOblique", :"Courier-Oblique",
17
+ :Helvetica, :"Helvetica-Bold", :"Helvetica-BoldOblique", :"Helvetica-Oblique",
18
+ :Symbol,
19
+ :"Times-Roman", :"Times-Bold", :"Times-BoldItalic", :"Times-Italic",
20
+ :ZapfDingbats
21
+ ]
22
+
23
+ def initialize(font)
24
+ @font = font
25
+ @@all_metrics ||= PDF::Reader::SynchronizedCache.new
26
+
27
+ basefont = extract_basefont(font.basefont)
28
+ metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{basefont}.afm")
29
+
30
+ if File.file?(metrics_path)
31
+ @metrics = @@all_metrics[metrics_path] ||= AFM::Font.new(metrics_path)
32
+ else
33
+ raise ArgumentError, "No built-in metrics for #{font.basefont}"
34
+ end
35
+ end
36
+
37
+ def glyph_width(code_point)
38
+ return 0 if code_point.nil? || code_point < 0
39
+
40
+ names = @font.encoding.int_to_name(code_point)
41
+ metrics = names.map { |name|
42
+ @metrics.char_metrics[name.to_s]
43
+ }.compact.first
44
+
45
+ if metrics
46
+ metrics[:wx]
47
+ else
48
+ @font.widths[code_point - 1] || 0
49
+ end
50
+ end
51
+
52
+ private
53
+
54
+ def control_character?(code_point)
55
+ @font.encoding.int_to_name(code_point).first.to_s[/\Acontrol..\Z/]
56
+ end
57
+
58
+ def extract_basefont(font_name)
59
+ if BUILTINS.include?(font_name)
60
+ font_name
61
+ else
62
+ "Times-Roman"
63
+ end
64
+ end
65
+ end
66
+ end
67
+ end
@@ -0,0 +1,28 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+ module WidthCalculator
6
+ # CIDFontType0 or CIDFontType2 use DW (integer) and W (array) to determine
7
+ # codepoint widths, note that CIDFontType2 will contain a true type font
8
+ # program which could be used to calculate width, however, a conforming writer
9
+ # is supposed to convert the widths for the codepoints used into the W array
10
+ # so that it can be used.
11
+ # see Section 9.7.4.1, PDF 32000-1:2008, pp 269-270
12
+ class Composite
13
+
14
+ def initialize(font)
15
+ @font = font
16
+ @widths = PDF::Reader::CidWidths.new(@font.cid_default_width, @font.cid_widths)
17
+ end
18
+
19
+ def glyph_width(code_point)
20
+ return 0 if code_point.nil? || code_point < 0
21
+
22
+ w = @widths[code_point]
23
+ # 0 is a valid width
24
+ return w.to_f unless w.nil?
25
+ end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,56 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+ module WidthCalculator
6
+ # Calculates the width of a glyph in a TrueType font
7
+ class TrueType
8
+
9
+ def initialize(font)
10
+ @font = font
11
+
12
+ if @font.font_descriptor
13
+ @missing_width = @font.font_descriptor.missing_width
14
+ else
15
+ @missing_width = 0
16
+ end
17
+ end
18
+
19
+ def glyph_width(code_point)
20
+ return 0 if code_point.nil? || code_point < 0
21
+ glyph_width_from_font(code_point) || glyph_width_from_descriptor(code_point) || 0
22
+ end
23
+
24
+ private
25
+
26
+ #TODO convert Type3 units 1000 units => 1 text space unit
27
+ def glyph_width_from_font(code_point)
28
+ return if @font.widths.nil? || @font.widths.count == 0
29
+
30
+ # in ruby a negative index is valid, and will go from the end of the array
31
+ # which is undesirable in this case.
32
+ if @font.first_char <= code_point
33
+ @font.widths.fetch(code_point - @font.first_char, @missing_width).to_f
34
+ else
35
+ @missing_width.to_f
36
+ end
37
+ end
38
+
39
+ def glyph_width_from_descriptor(code_point)
40
+ return unless @font.font_descriptor
41
+
42
+ # true type fonts will have most of their information contained
43
+ # with-in a program inside the font descriptor, however the widths
44
+ # may not be in standard PDF glyph widths (1000 units => 1 text space unit)
45
+ # so this width will need to be scaled
46
+ w = @font.font_descriptor.glyph_width(code_point)
47
+ if w
48
+ w.to_f * @font.font_descriptor.glyph_to_pdf_scale_factor
49
+ else
50
+ nil
51
+ end
52
+ end
53
+ end
54
+ end
55
+ end
56
+
@@ -0,0 +1,33 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+ module WidthCalculator
6
+ # Calculates the width of a glyph in a Type One or Type Three font
7
+ class TypeOneOrThree
8
+
9
+ def initialize(font)
10
+ @font = font
11
+
12
+ if @font.font_descriptor
13
+ @missing_width = @font.font_descriptor.missing_width
14
+ else
15
+ @missing_width = 0
16
+ end
17
+ end
18
+
19
+ def glyph_width(code_point)
20
+ return 0 if code_point.nil? || code_point < 0
21
+ return 0 if @font.widths.nil? || @font.widths.count == 0
22
+
23
+ # in ruby a negative index is valid, and will go from the end of the array
24
+ # which is undesirable in this case.
25
+ if @font.first_char <= code_point
26
+ @font.widths.fetch(code_point - @font.first_char, @missing_width).to_f
27
+ else
28
+ @missing_width.to_f
29
+ end
30
+ end
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,25 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ class PDF::Reader
5
+ module WidthCalculator
6
+ # Type0 (or Composite) fonts are a "root font" that rely on a "descendant font"
7
+ # to do the heavy lifting. The "descendant font" is a CID-Keyed font.
8
+ # see Section 9.7.1, PDF 32000-1:2008, pp 267
9
+ # so if we are calculating a Type0 font width, we just pass off to
10
+ # the descendant font
11
+ class TypeZero
12
+
13
+ def initialize(font)
14
+ @font = font
15
+ @descendant_font = @font.descendantfonts.first
16
+ end
17
+
18
+ def glyph_width(code_point)
19
+ return 0 if code_point.nil? || code_point < 0
20
+
21
+ @descendant_font.glyph_width(code_point).to_f
22
+ end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,12 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ # PDF files may define fonts in a number of ways. Each approach means we must
5
+ # calculate glyph widths differently, so this set of classes conform to an
6
+ # interface that will perform the appropriate calculations.
7
+
8
+ require 'pdf/reader/width_calculator/built_in'
9
+ require 'pdf/reader/width_calculator/composite'
10
+ require 'pdf/reader/width_calculator/true_type'
11
+ require 'pdf/reader/width_calculator/type_zero'
12
+ require 'pdf/reader/width_calculator/type_one_or_three'
@@ -1,3 +1,6 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
1
4
  ################################################################################
2
5
  #
3
6
  # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
@@ -51,11 +54,13 @@ class PDF::Reader
51
54
  #
52
55
  # io - must be an IO object, generally either a file or a StringIO
53
56
  #
54
- def initialize (io)
57
+ def initialize(io)
55
58
  @io = io
59
+ @junk_offset = calc_junk_offset(io) || 0
56
60
  @xref = {}
57
61
  @trailer = load_offsets
58
62
  end
63
+
59
64
  ################################################################################
60
65
  # return the number of objects in this file. Objects with multiple generations are
61
66
  # only counted once.
@@ -93,6 +98,7 @@ class PDF::Reader
93
98
  #
94
99
  def load_offsets(offset = nil)
95
100
  offset ||= new_buffer.find_first_xref_offset
101
+ offset += @junk_offset
96
102
 
97
103
  buf = new_buffer(offset)
98
104
  tok_one = buf.token
@@ -108,7 +114,8 @@ class PDF::Reader
108
114
  return load_xref_stream(stream)
109
115
  end
110
116
 
111
- raise PDF::Reader::MalformedPDFError, "xref table not found at offset #{offset} (#{tok_one} != xref)"
117
+ raise PDF::Reader::MalformedPDFError,
118
+ "xref table not found at offset #{offset} (#{tok_one} != xref)"
112
119
  end
113
120
  ################################################################################
114
121
  # Assumes the underlying buffer is positioned at the start of a traditional
@@ -124,7 +131,7 @@ class PDF::Reader
124
131
  generation = buf.token.to_i
125
132
  state = buf.token
126
133
 
127
- store(objid, generation, offset) if state == "n" && offset > 0
134
+ store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
128
135
  objid += 1
129
136
  params.clear
130
137
  end
@@ -134,7 +141,9 @@ class PDF::Reader
134
141
 
135
142
  trailer = Parser.new(buf, self).parse_token
136
143
 
137
- raise MalformedPDFError, "PDF malformed, trailer should be a dictionary" unless trailer.kind_of?(Hash)
144
+ unless trailer.kind_of?(Hash)
145
+ raise MalformedPDFError, "PDF malformed, trailer should be a dictionary"
146
+ end
138
147
 
139
148
  load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
140
149
  load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev)
@@ -143,7 +152,7 @@ class PDF::Reader
143
152
  end
144
153
 
145
154
  ################################################################################
146
- # Read a XReaf stream from the underlying buffer instead of a traditional xref table.
155
+ # Read an XRef stream from the underlying buffer instead of a traditional xref table.
147
156
  #
148
157
  def load_xref_stream(stream)
149
158
  unless stream.is_a?(PDF::Reader::Stream) && stream.hash[:Type] == :XRef
@@ -169,7 +178,7 @@ class PDF::Reader
169
178
  f2 = unpack_bytes(entry[widths[0],widths[1]])
170
179
  f3 = unpack_bytes(entry[widths[0]+widths[1],widths[2]])
171
180
  if f1 == 1 && f2 > 0
172
- store(objid, f3, f2)
181
+ store(objid, f3, f2 + @junk_offset)
173
182
  elsif f1 == 2 && f2 > 0
174
183
  store(objid, 0, PDF::Reader::Reference.new(f2, 0))
175
184
  end
@@ -195,15 +204,17 @@ class PDF::Reader
195
204
  ("\x00" + bytes).unpack("N")[0]
196
205
  elsif bytes.size == 4
197
206
  bytes.unpack("N")[0]
207
+ elsif bytes.size == 8
208
+ bytes.unpack("Q>")[0]
198
209
  else
199
- raise UnsupportedFeatureError, "Unable to unpack xref stream entries with more than 4 bytes"
210
+ raise UnsupportedFeatureError, "Unable to unpack xref stream entries of #{bytes.size} bytes"
200
211
  end
201
212
  end
202
213
  ################################################################################
203
214
  # Wrap the io stream we're working with in a buffer that can tokenise it for us.
204
215
  #
205
216
  # We create multiple buffers so we can be tokenising multiple sections of the file
206
- # at the same time without worring about clearing the buffers contents.
217
+ # at the same time without worrying about clearing the buffer's contents.
207
218
  #
208
219
  def new_buffer(offset = 0)
209
220
  PDF::Reader::Buffer.new(@io, :seek => offset)
@@ -211,9 +222,30 @@ class PDF::Reader
211
222
  ################################################################################
212
223
  # Stores an offset value for a particular PDF object ID and revision number
213
224
  #
214
- def store (id, gen, offset)
225
+ def store(id, gen, offset)
215
226
  (@xref[id] ||= {})[gen] ||= offset
216
227
  end
228
+ ################################################################################
229
+ # Returns the offset of the PDF document in the +stream+. In theory this
230
+ # should always be 0, but all sort of crazy junk is prefixed to PDF files
231
+ # in the real world.
232
+ #
233
+ # Checks up to 1024 chars into the file,
234
+ # returns nil if no PDF data detected.
235
+ # Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
236
+ # header appear somewhere within the first 1024 bytes of the file
237
+ #
238
+ def calc_junk_offset(io)
239
+ io.rewind
240
+ offset = io.pos
241
+ until (c = io.readchar) == '%' || c == 37 || offset > 1024
242
+ offset += 1
243
+ end
244
+ io.rewind
245
+ offset < 1024 ? offset : nil
246
+ rescue EOFError
247
+ nil
248
+ end
217
249
  end
218
250
  ################################################################################
219
251
  end
data/lib/pdf/reader.rb CHANGED
@@ -1,3 +1,6 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
1
4
  ################################################################################
2
5
  #
3
6
  # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
@@ -25,9 +28,6 @@
25
28
  ################################################################################
26
29
 
27
30
  require 'stringio'
28
- require 'zlib'
29
-
30
- require 'ascii85'
31
31
 
32
32
  module PDF
33
33
  ################################################################################
@@ -111,10 +111,10 @@ module PDF
111
111
  #
112
112
  # reader = PDF::Reader.new("somefile.pdf", :password => "apples")
113
113
  #
114
- def initialize(input = nil, opts = {})
115
- if input # support the deprecated Reader API
116
- @objects = PDF::Reader::ObjectHash.new(input, opts)
117
- end
114
+ def initialize(input, opts = {})
115
+ @cache = PDF::Reader::ObjectCache.new
116
+ opts.merge!(:cache => @cache)
117
+ @objects = PDF::Reader::ObjectHash.new(input, opts)
118
118
  end
119
119
 
120
120
  def info
@@ -128,13 +128,16 @@ module PDF
128
128
  nil
129
129
  else
130
130
  xml = stream.unfiltered_data
131
- xml.force_encoding("utf-8") if xml.respond_to?(:force_encoding)
131
+ xml.force_encoding("utf-8")
132
132
  xml
133
133
  end
134
134
  end
135
135
 
136
136
  def page_count
137
137
  pages = @objects.deref(root[:Pages])
138
+ unless pages.kind_of?(::Hash)
139
+ raise MalformedPDFError, 'Pages structure is missing'
140
+ end
138
141
  @page_count ||= @objects.deref(pages[:Count])
139
142
  end
140
143
 
@@ -159,53 +162,6 @@ module PDF
159
162
  yield PDF::Reader.new(input, opts)
160
163
  end
161
164
 
162
- # DEPRECATED: this method was deprecated in version 1.0.0 and will
163
- # eventually be removed
164
- #
165
- #
166
- # Parse the file with the given name, sending events to the given receiver.
167
- #
168
- def self.file(name, receivers, opts = {})
169
- File.open(name,"rb") do |f|
170
- new.parse(f, receivers, opts)
171
- end
172
- end
173
-
174
- # DEPRECATED: this method was deprecated in version 1.0.0 and will
175
- # eventually be removed
176
- #
177
- # Parse the given string, sending events to the given receiver.
178
- #
179
- def self.string(str, receivers, opts = {})
180
- StringIO.open(str) do |s|
181
- new.parse(s, receivers, opts)
182
- end
183
- end
184
-
185
- # DEPRECATED: this method was deprecated in version 1.0.0 and will
186
- # eventually be removed
187
- #
188
- # Parse the file with the given name, returning an unmarshalled ruby version of
189
- # represents the requested pdf object
190
- #
191
- def self.object_file(name, id, gen = 0)
192
- File.open(name,"rb") { |f|
193
- new.object(f, id.to_i, gen.to_i)
194
- }
195
- end
196
-
197
- # DEPRECATED: this method was deprecated in version 1.0.0 and will
198
- # eventually be removed
199
- #
200
- # Parse the given string, returning an unmarshalled ruby version of represents
201
- # the requested pdf object
202
- #
203
- def self.object_string(str, id, gen = 0)
204
- StringIO.open(str) { |s|
205
- new.object(s, id.to_i, gen.to_i)
206
- }
207
- end
208
-
209
165
  # returns an array of PDF::Reader::Page objects, one for each
210
166
  # page in the source PDF.
211
167
  #
@@ -221,9 +177,13 @@ module PDF
221
177
  # methods available on each page
222
178
  #
223
179
  def pages
224
- (1..self.page_count).map { |num|
225
- PDF::Reader::Page.new(@objects, num)
226
- }
180
+ (1..self.page_count).map do |num|
181
+ begin
182
+ PDF::Reader::Page.new(@objects, num, :cache => @cache)
183
+ rescue InvalidPageError
184
+ raise MalformedPDFError, "Missing data for page: #{num}"
185
+ end
186
+ end
227
187
  end
228
188
 
229
189
  # returns a single PDF::Reader::Page for the specified page.
@@ -240,38 +200,10 @@ module PDF
240
200
  #
241
201
  def page(num)
242
202
  num = num.to_i
243
- raise ArgumentError, "valid pages are 1 .. #{self.page_count}" if num < 1 || num > self.page_count
244
- PDF::Reader::Page.new(@objects, num)
245
- end
246
-
247
-
248
- # DEPRECATED: this method was deprecated in version 1.0.0 and will
249
- # eventually be removed
250
- #
251
- # Given an IO object that contains PDF data, parse it.
252
- #
253
- def parse(io, receivers, opts = {})
254
- ohash = ObjectHash.new(io)
255
-
256
- options = {:pages => true, :raw_text => false, :metadata => true}
257
- options.merge!(opts)
258
-
259
- strategies.each do |s|
260
- s.new(ohash, receivers, options).process
203
+ if num < 1 || num > self.page_count
204
+ raise InvalidPageError, "Valid pages are 1 .. #{self.page_count}"
261
205
  end
262
-
263
- self
264
- end
265
-
266
- # DEPRECATED: this method was deprecated in version 1.0.0 and will
267
- # eventually be removed
268
- #
269
- # Given an IO object that contains PDF data, return the contents of a single object
270
- #
271
- def object (io, id, gen)
272
- @objects = ObjectHash.new(io)
273
-
274
- @objects.deref(Reference.new(id, gen))
206
+ PDF::Reader::Page.new(@objects, num, :cache => @cache)
275
207
  end
276
208
 
277
209
  private
@@ -295,14 +227,14 @@ module PDF
295
227
  pdfdoc_to_utf8(obj)
296
228
  end
297
229
  else
298
- obj
230
+ @objects.deref(obj)
299
231
  end
300
232
  end
301
233
 
302
234
  # TODO find a PDF I can use to spec this behaviour
303
235
  #
304
236
  def pdfdoc_to_utf8(obj)
305
- obj.force_encoding("utf-8") if obj.respond_to?(:force_encoding)
237
+ obj.force_encoding("utf-8")
306
238
  obj
307
239
  end
308
240
 
@@ -312,19 +244,18 @@ module PDF
312
244
  def utf16_to_utf8(obj)
313
245
  str = obj[2, obj.size]
314
246
  str = str.unpack("n*").pack("U*")
315
- str.force_encoding("utf-8") if str.respond_to?(:force_encoding)
247
+ str.force_encoding("utf-8")
316
248
  str
317
249
  end
318
250
 
319
- def strategies
320
- @strategies ||= [
321
- ::PDF::Reader::MetadataStrategy,
322
- ::PDF::Reader::PagesStrategy
323
- ]
324
- end
325
-
326
251
  def root
327
- @root ||= @objects.deref(@objects.trailer[:Root])
252
+ @root ||= begin
253
+ obj = @objects.deref(@objects.trailer[:Root])
254
+ unless obj.kind_of?(::Hash)
255
+ raise MalformedPDFError, "PDF malformed, trailer Root should be a dictionary"
256
+ end
257
+ obj
258
+ end
328
259
  end
329
260
 
330
261
  end
@@ -332,17 +263,24 @@ end
332
263
  ################################################################################
333
264
 
334
265
  require 'pdf/reader/resource_methods'
335
- require 'pdf/reader/abstract_strategy'
336
266
  require 'pdf/reader/buffer'
267
+ require 'pdf/reader/cid_widths'
337
268
  require 'pdf/reader/cmap'
338
269
  require 'pdf/reader/encoding'
339
270
  require 'pdf/reader/error'
340
271
  require 'pdf/reader/filter'
272
+ require 'pdf/reader/filter/ascii85'
273
+ require 'pdf/reader/filter/ascii_hex'
274
+ require 'pdf/reader/filter/depredict'
275
+ require 'pdf/reader/filter/flate'
276
+ require 'pdf/reader/filter/lzw'
277
+ require 'pdf/reader/filter/null'
278
+ require 'pdf/reader/filter/run_length'
341
279
  require 'pdf/reader/font'
280
+ require 'pdf/reader/font_descriptor'
342
281
  require 'pdf/reader/form_xobject'
343
282
  require 'pdf/reader/glyph_hash'
344
283
  require 'pdf/reader/lzw'
345
- require 'pdf/reader/metadata_strategy'
346
284
  require 'pdf/reader/object_cache'
347
285
  require 'pdf/reader/object_hash'
348
286
  require 'pdf/reader/object_stream'
@@ -351,12 +289,15 @@ require 'pdf/reader/parser'
351
289
  require 'pdf/reader/print_receiver'
352
290
  require 'pdf/reader/reference'
353
291
  require 'pdf/reader/register_receiver'
292
+ require 'pdf/reader/null_security_handler'
354
293
  require 'pdf/reader/standard_security_handler'
294
+ require 'pdf/reader/standard_security_handler_v5'
295
+ require 'pdf/reader/unimplemented_security_handler'
355
296
  require 'pdf/reader/stream'
356
- require 'pdf/reader/text_receiver'
297
+ require 'pdf/reader/text_run'
357
298
  require 'pdf/reader/page_state'
358
299
  require 'pdf/reader/page_text_receiver'
359
300
  require 'pdf/reader/token'
360
301
  require 'pdf/reader/xref'
302
+ require 'pdf/reader/orientation_detector'
361
303
  require 'pdf/reader/page'
362
- require 'pdf/hash'