pdf-reader 1.4.1 → 2.0.0.beta1

Sign up to get free protection for your applications and to get access to all the features.
@@ -147,7 +147,7 @@ class PDF::Reader
147
147
  ret = [
148
148
  @mapping[glyph_code.to_i] || glyph_code.to_i
149
149
  ].pack("U*")
150
- ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
150
+ ret.force_encoding("UTF-8")
151
151
  ret
152
152
  end
153
153
 
@@ -158,13 +158,13 @@ class PDF::Reader
158
158
  def little_boxes(times)
159
159
  codepoints = [ PDF::Reader::Encoding::UNKNOWN_CHAR ] * times
160
160
  ret = codepoints.pack("U*")
161
- ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
161
+ ret.force_encoding("UTF-8")
162
162
  ret
163
163
  end
164
164
 
165
165
  def convert_to_utf8(str)
166
166
  ret = str.unpack(unpack).map! { |c| @mapping[c] || c }.pack("U*")
167
- ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
167
+ ret.force_encoding("UTF-8")
168
168
  ret
169
169
  end
170
170
 
@@ -207,8 +207,7 @@ class PDF::Reader
207
207
  end
208
208
 
209
209
  def load_mapping(file)
210
- RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
211
- File.open(file, mode) do |f|
210
+ File.open(file, "r:BINARY") do |f|
212
211
  f.each do |l|
213
212
  _m, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
214
213
  @mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
@@ -46,6 +46,7 @@ class PDF::Reader
46
46
  when :CCITTFaxDecode then PDF::Reader::Filter::Null.new(options)
47
47
  when :DCTDecode then PDF::Reader::Filter::Null.new(options)
48
48
  when :FlateDecode then PDF::Reader::Filter::Flate.new(options)
49
+ when :Fl then PDF::Reader::Filter::Flate.new(options)
49
50
  when :JBIG2Decode then PDF::Reader::Filter::Null.new(options)
50
51
  when :JPXDecode then PDF::Reader::Filter::Null.new(options)
51
52
  when :LZWDecode then PDF::Reader::Filter::Lzw.new(options)
@@ -15,11 +15,7 @@ class PDF::Reader # :nodoc:
15
15
  out = ""
16
16
 
17
17
  while pos < data.length
18
- if data.respond_to?(:getbyte)
19
- length = data.getbyte(pos)
20
- else
21
- length = data[pos]
22
- end
18
+ length = data.getbyte(pos)
23
19
  pos += 1
24
20
 
25
21
  case
@@ -36,11 +36,7 @@ class PDF::Reader
36
36
  attr_reader :widths, :first_char, :last_char, :basefont, :font_descriptor,
37
37
  :cid_widths, :cid_default_width
38
38
 
39
- def initialize(ohash = nil, obj = nil)
40
- if ohash.nil? || obj.nil?
41
- $stderr.puts "DEPREACTION WARNING - PDF::Reader::Font.new should be called with 2 args"
42
- return
43
- end
39
+ def initialize(ohash, obj)
44
40
  @ohash = ohash
45
41
  @tounicode = nil
46
42
 
@@ -52,12 +48,6 @@ class PDF::Reader
52
48
  @encoding ||= PDF::Reader::Encoding.new(:StandardEncoding)
53
49
  end
54
50
 
55
- def basefont=(font)
56
- $stderr.puts "Font#basefont= is deprecated and will be removed in the 2.0 release"
57
- @encoding ||= default_encoding(font)
58
- @basefont = font
59
- end
60
-
61
51
  def to_utf8(params)
62
52
  if @tounicode
63
53
  to_utf8_via_cmap(params)
@@ -48,6 +48,9 @@ class PDF::Reader
48
48
  # h.name_to_unicode(:Euro)
49
49
  # => 8364
50
50
  #
51
+ # h.name_to_unicode(:X4A)
52
+ # => 74
53
+ #
51
54
  # h.name_to_unicode(:G30)
52
55
  # => 48
53
56
  #
@@ -62,6 +65,8 @@ class PDF::Reader
62
65
 
63
66
  if @by_name.has_key?(name)
64
67
  @by_name[name]
68
+ elsif str.match(/\AX[0-9a-fA-F]{2,4}\Z/)
69
+ "0x#{str[1,4]}".hex
65
70
  elsif str.match(/\Auni[A-F\d]{4}\Z/)
66
71
  "0x#{str[3,4]}".hex
67
72
  elsif str.match(/\Au[A-F\d]{4,6}\Z/)
@@ -102,8 +107,7 @@ class PDF::Reader
102
107
  keyed_by_name = {}
103
108
  keyed_by_codepoint = {}
104
109
 
105
- RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
106
- File.open(File.dirname(__FILE__) + "/glyphlist.txt", mode) do |f|
110
+ File.open(File.dirname(__FILE__) + "/glyphlist.txt", "r:BINARY") do |f|
107
111
  f.each do |l|
108
112
  _m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
109
113
  if name && code
@@ -22,7 +22,7 @@ module PDF
22
22
 
23
23
  def initialize(data, bits_in_chunk)
24
24
  @data = data
25
- @data.force_encoding("BINARY") if @data.respond_to?(:force_encoding)
25
+ @data.force_encoding("BINARY")
26
26
  @bits_in_chunk = bits_in_chunk
27
27
  @current_pos = 0
28
28
  @bits_left_in_byte = 8
@@ -102,21 +102,7 @@ class PDF::Reader
102
102
  # a PDF::Reader::Reference, the key is returned unchanged.
103
103
  #
104
104
  def deref!(key)
105
- case object = deref(key)
106
- when Hash
107
- {}.tap { |hash|
108
- object.each do |k, value|
109
- hash[k] = deref!(value)
110
- end
111
- }
112
- when PDF::Reader::Stream
113
- object.hash = deref!(object.hash)
114
- object
115
- when Array
116
- object.map { |value| deref!(value) }
117
- else
118
- object
119
- end
105
+ deref_internal!(key, {})
120
106
  end
121
107
 
122
108
  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
@@ -266,6 +252,39 @@ class PDF::Reader
266
252
 
267
253
  private
268
254
 
255
+ # Private implementation of deref!, which exists to ensure the `seen` argument
256
+ # isn't publicly available. It's used to avoid endless loops in the recursion, and
257
+ # doesn't need to be part of the public API.
258
+ #
259
+ def deref_internal!(key, seen)
260
+ seen_key = key.is_a?(PDF::Reader::Reference) ? key : key.object_id
261
+
262
+ return seen[seen_key] if seen.key?(seen_key)
263
+
264
+ case object = deref(key)
265
+ when Hash
266
+ seen[seen_key] ||= {}
267
+ object.each do |k, value|
268
+ seen[seen_key][k] = deref_internal!(value, seen)
269
+ end
270
+ seen[seen_key]
271
+ when PDF::Reader::Stream
272
+ seen[seen_key] ||= PDF::Reader::Stream.new({}, object.data)
273
+ object.hash.each do |k,value|
274
+ seen[seen_key].hash[k] = deref_internal!(value, seen)
275
+ end
276
+ seen[seen_key]
277
+ when Array
278
+ seen[seen_key] ||= []
279
+ object.each do |value|
280
+ seen[seen_key] << deref_internal!(value, seen)
281
+ end
282
+ seen[seen_key]
283
+ else
284
+ object
285
+ end
286
+ end
287
+
269
288
  def build_security_handler(opts = {})
270
289
  return nil if trailer[:Encrypt].nil?
271
290
 
@@ -316,7 +335,7 @@ class PDF::Reader
316
335
 
317
336
  if obj[:Type] == :Page
318
337
  ref
319
- elsif obj[:Type] == :Pages
338
+ elsif obj[:Kids]
320
339
  deref(obj[:Kids]).map { |kid| get_page_objects(kid) }
321
340
  end
322
341
  end
@@ -8,17 +8,19 @@ class PDF::Reader
8
8
  # media box should be a 4 number array that describes the dimensions of the
9
9
  # page to be rendered as described by the page's MediaBox attribute
10
10
  class PageLayout
11
+
12
+ DEFAULT_FONT_SIZE = 12
13
+
11
14
  def initialize(runs, mediabox)
12
15
  raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
13
16
 
14
17
  @runs = merge_runs(runs)
15
- @mean_font_size = mean(@runs.map(&:font_size)) || 0
18
+ @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
19
+ @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
16
20
  @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
17
21
  @page_width = mediabox[2] - mediabox[0]
18
22
  @page_height = mediabox[3] - mediabox[1]
19
23
  @x_offset = @runs.map(&:x).sort.first
20
- @current_platform_is_rbx_19 = RUBY_DESCRIPTION =~ /\Arubinius 2.0.0/ &&
21
- RUBY_VERSION >= "1.9.0"
22
24
  end
23
25
 
24
26
  def to_s
@@ -110,21 +112,8 @@ class PDF::Reader
110
112
  runs
111
113
  end
112
114
 
113
- # This is a simple alternative to String#[]=. We can't use the string
114
- # method as it's buggy on rubinius 2.0rc1 (in 1.9 mode)
115
- #
116
- # See my bug report at https://github.com/rubinius/rubinius/issues/1985
117
115
  def local_string_insert(haystack, needle, index)
118
- if @current_platform_is_rbx_19
119
- char_count = needle.length
120
- haystack.replace(
121
- (haystack[0,index] || "") +
122
- needle +
123
- (haystack[index+char_count,500] || "")
124
- )
125
- else
126
- haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
127
- end
116
+ haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
128
117
  end
129
118
  end
130
119
  end
@@ -27,42 +27,8 @@
27
27
 
28
28
  class PDF::Reader
29
29
  ################################################################################
30
- # Walks the pages of the PDF file and calls the appropriate callback methods when
31
- # something of interest is found.
32
- #
33
- # The callback methods should exist on the receiver object passed into the constructor.
34
- # Whenever some content is found that will trigger a callback, the receiver is checked
35
- # to see if the callback is defined.
36
- #
37
- # If it is defined it will be called. If not, processing will continue.
38
- #
39
- # = Available Callbacks
40
- # The following callbacks are available and should be methods defined on your receiver class. Only
41
- # implement the ones you need - the rest will be ignored.
42
- #
43
- # Some callbacks will include parameters which will be passed in as an array. For callbacks
44
- # that supply no paramters, or where you don't need them, the *params argument can be left off.
45
- # Some example callback method definitions are:
46
- #
47
- # def begin_document
48
- # def end_page
49
- # def show_text(string, *params)
50
- # def fill_stroke(*params)
51
- #
52
- # You should be able to infer the basic command the callback is reporting based on the name. For
53
- # further experimentation, define the callback with just a *params parameter, then print out the
54
- # contents of the array using something like:
55
- #
56
- # puts params.inspect
57
- #
58
30
  # == Text Callbacks
59
31
  #
60
- # All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
61
- # PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be
62
- # careful when doing a comparison on strings returned from PDF::Reader (when doing unit tests for
63
- # example). The string may not be byte-by-byte identical with the string that was originally
64
- # written to the PDF.
65
- #
66
32
  # - end_text_object
67
33
  # - move_to_start_of_next_line
68
34
  # - set_character_spacing
@@ -80,14 +46,6 @@ class PDF::Reader
80
46
  # - move_to_next_line_and_show_text
81
47
  # - set_spacing_next_line_show_text
82
48
  #
83
- # If the :raw_text option was passed to the PDF::Reader class the following callbacks
84
- # may also appear:
85
- #
86
- # - show_text_raw
87
- # - show_text_with_positioning_raw
88
- # - move_to_next_line_and_show_text_raw
89
- # - set_spacing_next_line_show_text_raw
90
- #
91
49
  # == Graphics Callbacks
92
50
  # - close_fill_stroke
93
51
  # - fill_stroke
@@ -145,42 +103,7 @@ class PDF::Reader
145
103
  # - set_clipping_path_with_even_odd
146
104
  # - append_curved_segment_final_point_replicated
147
105
  #
148
- # == Misc Callbacks
149
- # - begin_compatibility_section
150
- # - end_compatibility_section,
151
- # - begin_document
152
- # - end_document
153
- # - begin_page_container
154
- # - end_page_container
155
- # - begin_page
156
- # - end_page
157
- # - metadata
158
- # - xml_metadata
159
- # - page_count
160
- # - begin_form_xobject
161
- # - end_form_xobject
162
- #
163
- # == Resource Callbacks
164
- #
165
- # Each page can contain (or inherit) a range of resources required for the page,
166
- # including things like fonts and images. The following callbacks may appear
167
- # after begin_page if the relevant resources exist on a page:
168
- #
169
- # - resource_procset
170
- # - resource_xobject
171
- # - resource_extgstate
172
- # - resource_colorspace
173
- # - resource_pattern
174
- # - resource_font
175
- #
176
- # In most cases, these callbacks associate a name with each resource, allowing it
177
- # to be referred to by name in the page content. For example, an XObject can hold an image.
178
- # If it gets mapped to the name "IM1", then it can be placed on the page using
179
- # invoke_xobject "IM1".
180
- #
181
- # DEPRECATED: this class was deprecated in version 0.11.0 and will
182
- # eventually be removed
183
- class PagesStrategy< AbstractStrategy # :nodoc:
106
+ class PagesStrategy # :nodoc:
184
107
  OPERATORS = {
185
108
  'b' => :close_fill_stroke,
186
109
  'B' => :fill_stroke,
@@ -256,232 +179,6 @@ class PDF::Reader
256
179
  '\'' => :move_to_next_line_and_show_text,
257
180
  '"' => :set_spacing_next_line_show_text,
258
181
  }
259
- def self.to_sym
260
- :pages
261
- end
262
- ################################################################################
263
- # Begin processing the document
264
- def process
265
- return false unless options[:pages]
266
-
267
- callback(:begin_document, [root])
268
- walk_pages(@ohash.object(root[:Pages]))
269
- callback(:end_document)
270
- end
271
- private
272
- ################################################################################
273
- def params_to_utf8(params, font)
274
- if params.is_a?(String)
275
- font.to_utf8(params)
276
- elsif params.is_a?(Array)
277
- params.map { |i| params_to_utf8(i, font)}
278
- else
279
- params
280
- end
281
- end
282
- ################################################################################
283
- # Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
284
- # its content
285
- def walk_pages(page)
286
-
287
- # extract page content
288
- if page[:Type] == :Pages
289
- callback(:begin_page_container, [page])
290
- res = @ohash.object(page[:Resources])
291
- resources.push res if res
292
- @ohash.object(page[:Kids]).each {|child| walk_pages(@ohash.object(child))}
293
- resources.pop if res
294
- callback(:end_page_container)
295
- elsif page[:Type] == :Page
296
- callback(:begin_page, [page])
297
- res = @ohash.object(page[:Resources])
298
- resources.push res if res
299
- walk_resources(current_resources)
300
-
301
- if @ohash.object(page[:Contents]).kind_of?(Array)
302
- contents = @ohash.object(page[:Contents])
303
- else
304
- contents = [page[:Contents]]
305
- end
306
-
307
- fonts = font_hash_from_resources(current_resources)
308
-
309
- if page.has_key?(:Contents) and page[:Contents]
310
- direct_contents = contents.map { |content| @ohash.object(content) }
311
- content_stream(direct_contents, fonts)
312
- end
313
-
314
- resources.pop if res
315
- callback(:end_page)
316
- end
317
- end
318
- ################################################################################
319
- # Retreive the XObject for the supplied label and if it's a Form, walk it
320
- # like a regular page content stream.
321
- #
322
- def walk_xobject_form(label)
323
- xobjects = @ohash.object(current_resources[:XObject]) || {}
324
- xobject = @ohash.object(xobjects[label])
325
-
326
- if xobject && xobject.hash[:Subtype] == :Form
327
- callback(:begin_form_xobject)
328
- xobj_resources = @ohash.object(xobject.hash[:Resources])
329
- if xobj_resources
330
- resources.push xobj_resources
331
- walk_resources(xobj_resources)
332
- end
333
- fonts = font_hash_from_resources(xobj_resources)
334
- content_stream(xobject, fonts)
335
- callback(:end_form_xobject)
336
- resources.pop if xobj_resources
337
- end
338
- end
339
-
340
- ################################################################################
341
- # Return a merged hash of all resources that are current. Pages, page and xobject
342
- #
343
- def current_resources
344
- hash = {}
345
- resources.each do |res|
346
- hash.merge!(res)
347
- end
348
- hash
349
- end
350
- ################################################################################
351
- # Reads a PDF content stream and calls all the appropriate callback methods for the operators
352
- # it contains
353
- #
354
- def content_stream(instructions, fonts = {})
355
- instructions = [instructions] unless instructions.kind_of?(Array)
356
- instructions = instructions.map { |ins|
357
- ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s
358
- }.join
359
- buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
360
- parser = Parser.new(buffer, @ohash)
361
- current_font = nil
362
- params = []
363
-
364
- while (token = parser.parse_token(OPERATORS))
365
- if token.kind_of?(Token) and OPERATORS.has_key?(token)
366
- if OPERATORS[token] == :set_text_font_and_size
367
- current_font = params.first
368
- if fonts[current_font].nil?
369
- raise MalformedPDFError, "Unknown font #{current_font}"
370
- end
371
- end
372
-
373
- # handle special cases in response to certain operators
374
- if OPERATORS[token].to_s.include?("show_text")
375
- # convert any text to utf-8, but output the raw string if the user wants it
376
- if options[:raw_text]
377
- callback("#{OPERATORS[token]}_raw".to_sym, params)
378
- end
379
- params = params_to_utf8(params, fonts[current_font])
380
- elsif token == "ID"
381
- # inline image data, first convert the current params into a more familiar hash
382
- map = {}
383
- params.each_slice(2) do |key, value|
384
- map[key] = value
385
- end
386
- params = [map, buffer.token]
387
- end
388
-
389
- callback(OPERATORS[token], params)
390
-
391
- if OPERATORS[token] == :invoke_xobject
392
- xobject_label = params.first
393
- params.clear
394
- walk_xobject_form(xobject_label)
395
- else
396
- params.clear
397
- end
398
- else
399
- params << token
400
- end
401
- end
402
- rescue EOFError
403
- raise MalformedPDFError, "End Of File while processing a content stream"
404
- end
405
- ################################################################################
406
- def walk_resources(resources)
407
- return unless resources.respond_to?(:[])
408
-
409
- resources = resolve_references(resources)
410
-
411
- # extract any procset information
412
- if resources[:ProcSet]
413
- callback(:resource_procset, resources[:ProcSet])
414
- end
415
-
416
- # extract any xobject information
417
- if resources[:XObject]
418
- @ohash.object(resources[:XObject]).each do |name, val|
419
- callback(:resource_xobject, [name, @ohash.object(val)])
420
- end
421
- end
422
-
423
- # extract any extgstate information
424
- if resources[:ExtGState]
425
- @ohash.object(resources[:ExtGState]).each do |name, val|
426
- callback(:resource_extgstate, [name, @ohash.object(val)])
427
- end
428
- end
429
-
430
- # extract any colorspace information
431
- if resources[:ColorSpace]
432
- @ohash.object(resources[:ColorSpace]).each do |name, val|
433
- callback(:resource_colorspace, [name, @ohash.object(val)])
434
- end
435
- end
436
-
437
- # extract any pattern information
438
- if resources[:Pattern]
439
- @ohash.object(resources[:Pattern]).each do |name, val|
440
- callback(:resource_pattern, [name, @ohash.object(val)])
441
- end
442
- end
443
-
444
- # extract any font information
445
- if resources[:Font]
446
- fonts = font_hash_from_resources(resources)
447
- fonts.each do |label, font|
448
- callback(:resource_font, [label, font])
449
- end
450
- end
451
- end
452
- ################################################################################
453
- # Convert any PDF::Reader::Resource objects into a real object
454
- def resolve_references(obj)
455
- case obj
456
- when PDF::Reader::Stream then
457
- obj.hash = resolve_references(obj.hash)
458
- obj
459
- when PDF::Reader::Reference then
460
- resolve_references(@ohash.object(obj))
461
- when Hash then
462
- arr = obj.map { |key,val| [key, resolve_references(val)] }.flatten(1)
463
- Hash[*arr]
464
- when Array then
465
- obj.collect { |item| resolve_references(item) }
466
- else
467
- obj
468
- end
469
- end
470
- ################################################################################
471
- ################################################################################
472
- def font_hash_from_resources(resources)
473
- return {} unless resources.respond_to?(:[])
474
-
475
- fonts = {}
476
- resources = @ohash.object(resources[:Font]) || {}
477
- resources.each do |label, desc|
478
- fonts[label] = PDF::Reader::Font.new(@ohash, @ohash.object(desc))
479
- end
480
- fonts
481
- end
482
- def resources
483
- @resources ||= []
484
- end
485
182
  end
486
183
  ################################################################################
487
184
  end