pdf-reader 1.4.1 → 2.0.0.beta1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -147,7 +147,7 @@ class PDF::Reader
147
147
  ret = [
148
148
  @mapping[glyph_code.to_i] || glyph_code.to_i
149
149
  ].pack("U*")
150
- ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
150
+ ret.force_encoding("UTF-8")
151
151
  ret
152
152
  end
153
153
 
@@ -158,13 +158,13 @@ class PDF::Reader
158
158
  def little_boxes(times)
159
159
  codepoints = [ PDF::Reader::Encoding::UNKNOWN_CHAR ] * times
160
160
  ret = codepoints.pack("U*")
161
- ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
161
+ ret.force_encoding("UTF-8")
162
162
  ret
163
163
  end
164
164
 
165
165
  def convert_to_utf8(str)
166
166
  ret = str.unpack(unpack).map! { |c| @mapping[c] || c }.pack("U*")
167
- ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
167
+ ret.force_encoding("UTF-8")
168
168
  ret
169
169
  end
170
170
 
@@ -207,8 +207,7 @@ class PDF::Reader
207
207
  end
208
208
 
209
209
  def load_mapping(file)
210
- RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
211
- File.open(file, mode) do |f|
210
+ File.open(file, "r:BINARY") do |f|
212
211
  f.each do |l|
213
212
  _m, single_byte, unicode = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
214
213
  @mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
@@ -46,6 +46,7 @@ class PDF::Reader
46
46
  when :CCITTFaxDecode then PDF::Reader::Filter::Null.new(options)
47
47
  when :DCTDecode then PDF::Reader::Filter::Null.new(options)
48
48
  when :FlateDecode then PDF::Reader::Filter::Flate.new(options)
49
+ when :Fl then PDF::Reader::Filter::Flate.new(options)
49
50
  when :JBIG2Decode then PDF::Reader::Filter::Null.new(options)
50
51
  when :JPXDecode then PDF::Reader::Filter::Null.new(options)
51
52
  when :LZWDecode then PDF::Reader::Filter::Lzw.new(options)
@@ -15,11 +15,7 @@ class PDF::Reader # :nodoc:
15
15
  out = ""
16
16
 
17
17
  while pos < data.length
18
- if data.respond_to?(:getbyte)
19
- length = data.getbyte(pos)
20
- else
21
- length = data[pos]
22
- end
18
+ length = data.getbyte(pos)
23
19
  pos += 1
24
20
 
25
21
  case
@@ -36,11 +36,7 @@ class PDF::Reader
36
36
  attr_reader :widths, :first_char, :last_char, :basefont, :font_descriptor,
37
37
  :cid_widths, :cid_default_width
38
38
 
39
- def initialize(ohash = nil, obj = nil)
40
- if ohash.nil? || obj.nil?
41
- $stderr.puts "DEPREACTION WARNING - PDF::Reader::Font.new should be called with 2 args"
42
- return
43
- end
39
+ def initialize(ohash, obj)
44
40
  @ohash = ohash
45
41
  @tounicode = nil
46
42
 
@@ -52,12 +48,6 @@ class PDF::Reader
52
48
  @encoding ||= PDF::Reader::Encoding.new(:StandardEncoding)
53
49
  end
54
50
 
55
- def basefont=(font)
56
- $stderr.puts "Font#basefont= is deprecated and will be removed in the 2.0 release"
57
- @encoding ||= default_encoding(font)
58
- @basefont = font
59
- end
60
-
61
51
  def to_utf8(params)
62
52
  if @tounicode
63
53
  to_utf8_via_cmap(params)
@@ -48,6 +48,9 @@ class PDF::Reader
48
48
  # h.name_to_unicode(:Euro)
49
49
  # => 8364
50
50
  #
51
+ # h.name_to_unicode(:X4A)
52
+ # => 74
53
+ #
51
54
  # h.name_to_unicode(:G30)
52
55
  # => 48
53
56
  #
@@ -62,6 +65,8 @@ class PDF::Reader
62
65
 
63
66
  if @by_name.has_key?(name)
64
67
  @by_name[name]
68
+ elsif str.match(/\AX[0-9a-fA-F]{2,4}\Z/)
69
+ "0x#{str[1,4]}".hex
65
70
  elsif str.match(/\Auni[A-F\d]{4}\Z/)
66
71
  "0x#{str[3,4]}".hex
67
72
  elsif str.match(/\Au[A-F\d]{4,6}\Z/)
@@ -102,8 +107,7 @@ class PDF::Reader
102
107
  keyed_by_name = {}
103
108
  keyed_by_codepoint = {}
104
109
 
105
- RUBY_VERSION >= "1.9" ? mode = "r:BINARY" : mode = "r"
106
- File.open(File.dirname(__FILE__) + "/glyphlist.txt", mode) do |f|
110
+ File.open(File.dirname(__FILE__) + "/glyphlist.txt", "r:BINARY") do |f|
107
111
  f.each do |l|
108
112
  _m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
109
113
  if name && code
@@ -22,7 +22,7 @@ module PDF
22
22
 
23
23
  def initialize(data, bits_in_chunk)
24
24
  @data = data
25
- @data.force_encoding("BINARY") if @data.respond_to?(:force_encoding)
25
+ @data.force_encoding("BINARY")
26
26
  @bits_in_chunk = bits_in_chunk
27
27
  @current_pos = 0
28
28
  @bits_left_in_byte = 8
@@ -102,21 +102,7 @@ class PDF::Reader
102
102
  # a PDF::Reader::Reference, the key is returned unchanged.
103
103
  #
104
104
  def deref!(key)
105
- case object = deref(key)
106
- when Hash
107
- {}.tap { |hash|
108
- object.each do |k, value|
109
- hash[k] = deref!(value)
110
- end
111
- }
112
- when PDF::Reader::Stream
113
- object.hash = deref!(object.hash)
114
- object
115
- when Array
116
- object.map { |value| deref!(value) }
117
- else
118
- object
119
- end
105
+ deref_internal!(key, {})
120
106
  end
121
107
 
122
108
  # Access an object from the PDF. key can be an int or a PDF::Reader::Reference
@@ -266,6 +252,39 @@ class PDF::Reader
266
252
 
267
253
  private
268
254
 
255
+ # Private implementation of deref!, which exists to ensure the `seen` argument
256
+ # isn't publicly available. It's used to avoid endless loops in the recursion, and
257
+ # doesn't need to be part of the public API.
258
+ #
259
+ def deref_internal!(key, seen)
260
+ seen_key = key.is_a?(PDF::Reader::Reference) ? key : key.object_id
261
+
262
+ return seen[seen_key] if seen.key?(seen_key)
263
+
264
+ case object = deref(key)
265
+ when Hash
266
+ seen[seen_key] ||= {}
267
+ object.each do |k, value|
268
+ seen[seen_key][k] = deref_internal!(value, seen)
269
+ end
270
+ seen[seen_key]
271
+ when PDF::Reader::Stream
272
+ seen[seen_key] ||= PDF::Reader::Stream.new({}, object.data)
273
+ object.hash.each do |k,value|
274
+ seen[seen_key].hash[k] = deref_internal!(value, seen)
275
+ end
276
+ seen[seen_key]
277
+ when Array
278
+ seen[seen_key] ||= []
279
+ object.each do |value|
280
+ seen[seen_key] << deref_internal!(value, seen)
281
+ end
282
+ seen[seen_key]
283
+ else
284
+ object
285
+ end
286
+ end
287
+
269
288
  def build_security_handler(opts = {})
270
289
  return nil if trailer[:Encrypt].nil?
271
290
 
@@ -316,7 +335,7 @@ class PDF::Reader
316
335
 
317
336
  if obj[:Type] == :Page
318
337
  ref
319
- elsif obj[:Type] == :Pages
338
+ elsif obj[:Kids]
320
339
  deref(obj[:Kids]).map { |kid| get_page_objects(kid) }
321
340
  end
322
341
  end
@@ -8,17 +8,19 @@ class PDF::Reader
8
8
  # media box should be a 4 number array that describes the dimensions of the
9
9
  # page to be rendered as described by the page's MediaBox attribute
10
10
  class PageLayout
11
+
12
+ DEFAULT_FONT_SIZE = 12
13
+
11
14
  def initialize(runs, mediabox)
12
15
  raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
13
16
 
14
17
  @runs = merge_runs(runs)
15
- @mean_font_size = mean(@runs.map(&:font_size)) || 0
18
+ @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
19
+ @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
16
20
  @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
17
21
  @page_width = mediabox[2] - mediabox[0]
18
22
  @page_height = mediabox[3] - mediabox[1]
19
23
  @x_offset = @runs.map(&:x).sort.first
20
- @current_platform_is_rbx_19 = RUBY_DESCRIPTION =~ /\Arubinius 2.0.0/ &&
21
- RUBY_VERSION >= "1.9.0"
22
24
  end
23
25
 
24
26
  def to_s
@@ -110,21 +112,8 @@ class PDF::Reader
110
112
  runs
111
113
  end
112
114
 
113
- # This is a simple alternative to String#[]=. We can't use the string
114
- # method as it's buggy on rubinius 2.0rc1 (in 1.9 mode)
115
- #
116
- # See my bug report at https://github.com/rubinius/rubinius/issues/1985
117
115
  def local_string_insert(haystack, needle, index)
118
- if @current_platform_is_rbx_19
119
- char_count = needle.length
120
- haystack.replace(
121
- (haystack[0,index] || "") +
122
- needle +
123
- (haystack[index+char_count,500] || "")
124
- )
125
- else
126
- haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
127
- end
116
+ haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
128
117
  end
129
118
  end
130
119
  end
@@ -27,42 +27,8 @@
27
27
 
28
28
  class PDF::Reader
29
29
  ################################################################################
30
- # Walks the pages of the PDF file and calls the appropriate callback methods when
31
- # something of interest is found.
32
- #
33
- # The callback methods should exist on the receiver object passed into the constructor.
34
- # Whenever some content is found that will trigger a callback, the receiver is checked
35
- # to see if the callback is defined.
36
- #
37
- # If it is defined it will be called. If not, processing will continue.
38
- #
39
- # = Available Callbacks
40
- # The following callbacks are available and should be methods defined on your receiver class. Only
41
- # implement the ones you need - the rest will be ignored.
42
- #
43
- # Some callbacks will include parameters which will be passed in as an array. For callbacks
44
- # that supply no paramters, or where you don't need them, the *params argument can be left off.
45
- # Some example callback method definitions are:
46
- #
47
- # def begin_document
48
- # def end_page
49
- # def show_text(string, *params)
50
- # def fill_stroke(*params)
51
- #
52
- # You should be able to infer the basic command the callback is reporting based on the name. For
53
- # further experimentation, define the callback with just a *params parameter, then print out the
54
- # contents of the array using something like:
55
- #
56
- # puts params.inspect
57
- #
58
30
  # == Text Callbacks
59
31
  #
60
- # All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
61
- # PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be
62
- # careful when doing a comparison on strings returned from PDF::Reader (when doing unit tests for
63
- # example). The string may not be byte-by-byte identical with the string that was originally
64
- # written to the PDF.
65
- #
66
32
  # - end_text_object
67
33
  # - move_to_start_of_next_line
68
34
  # - set_character_spacing
@@ -80,14 +46,6 @@ class PDF::Reader
80
46
  # - move_to_next_line_and_show_text
81
47
  # - set_spacing_next_line_show_text
82
48
  #
83
- # If the :raw_text option was passed to the PDF::Reader class the following callbacks
84
- # may also appear:
85
- #
86
- # - show_text_raw
87
- # - show_text_with_positioning_raw
88
- # - move_to_next_line_and_show_text_raw
89
- # - set_spacing_next_line_show_text_raw
90
- #
91
49
  # == Graphics Callbacks
92
50
  # - close_fill_stroke
93
51
  # - fill_stroke
@@ -145,42 +103,7 @@ class PDF::Reader
145
103
  # - set_clipping_path_with_even_odd
146
104
  # - append_curved_segment_final_point_replicated
147
105
  #
148
- # == Misc Callbacks
149
- # - begin_compatibility_section
150
- # - end_compatibility_section,
151
- # - begin_document
152
- # - end_document
153
- # - begin_page_container
154
- # - end_page_container
155
- # - begin_page
156
- # - end_page
157
- # - metadata
158
- # - xml_metadata
159
- # - page_count
160
- # - begin_form_xobject
161
- # - end_form_xobject
162
- #
163
- # == Resource Callbacks
164
- #
165
- # Each page can contain (or inherit) a range of resources required for the page,
166
- # including things like fonts and images. The following callbacks may appear
167
- # after begin_page if the relevant resources exist on a page:
168
- #
169
- # - resource_procset
170
- # - resource_xobject
171
- # - resource_extgstate
172
- # - resource_colorspace
173
- # - resource_pattern
174
- # - resource_font
175
- #
176
- # In most cases, these callbacks associate a name with each resource, allowing it
177
- # to be referred to by name in the page content. For example, an XObject can hold an image.
178
- # If it gets mapped to the name "IM1", then it can be placed on the page using
179
- # invoke_xobject "IM1".
180
- #
181
- # DEPRECATED: this class was deprecated in version 0.11.0 and will
182
- # eventually be removed
183
- class PagesStrategy< AbstractStrategy # :nodoc:
106
+ class PagesStrategy # :nodoc:
184
107
  OPERATORS = {
185
108
  'b' => :close_fill_stroke,
186
109
  'B' => :fill_stroke,
@@ -256,232 +179,6 @@ class PDF::Reader
256
179
  '\'' => :move_to_next_line_and_show_text,
257
180
  '"' => :set_spacing_next_line_show_text,
258
181
  }
259
- def self.to_sym
260
- :pages
261
- end
262
- ################################################################################
263
- # Begin processing the document
264
- def process
265
- return false unless options[:pages]
266
-
267
- callback(:begin_document, [root])
268
- walk_pages(@ohash.object(root[:Pages]))
269
- callback(:end_document)
270
- end
271
- private
272
- ################################################################################
273
- def params_to_utf8(params, font)
274
- if params.is_a?(String)
275
- font.to_utf8(params)
276
- elsif params.is_a?(Array)
277
- params.map { |i| params_to_utf8(i, font)}
278
- else
279
- params
280
- end
281
- end
282
- ################################################################################
283
- # Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
284
- # its content
285
- def walk_pages(page)
286
-
287
- # extract page content
288
- if page[:Type] == :Pages
289
- callback(:begin_page_container, [page])
290
- res = @ohash.object(page[:Resources])
291
- resources.push res if res
292
- @ohash.object(page[:Kids]).each {|child| walk_pages(@ohash.object(child))}
293
- resources.pop if res
294
- callback(:end_page_container)
295
- elsif page[:Type] == :Page
296
- callback(:begin_page, [page])
297
- res = @ohash.object(page[:Resources])
298
- resources.push res if res
299
- walk_resources(current_resources)
300
-
301
- if @ohash.object(page[:Contents]).kind_of?(Array)
302
- contents = @ohash.object(page[:Contents])
303
- else
304
- contents = [page[:Contents]]
305
- end
306
-
307
- fonts = font_hash_from_resources(current_resources)
308
-
309
- if page.has_key?(:Contents) and page[:Contents]
310
- direct_contents = contents.map { |content| @ohash.object(content) }
311
- content_stream(direct_contents, fonts)
312
- end
313
-
314
- resources.pop if res
315
- callback(:end_page)
316
- end
317
- end
318
- ################################################################################
319
- # Retreive the XObject for the supplied label and if it's a Form, walk it
320
- # like a regular page content stream.
321
- #
322
- def walk_xobject_form(label)
323
- xobjects = @ohash.object(current_resources[:XObject]) || {}
324
- xobject = @ohash.object(xobjects[label])
325
-
326
- if xobject && xobject.hash[:Subtype] == :Form
327
- callback(:begin_form_xobject)
328
- xobj_resources = @ohash.object(xobject.hash[:Resources])
329
- if xobj_resources
330
- resources.push xobj_resources
331
- walk_resources(xobj_resources)
332
- end
333
- fonts = font_hash_from_resources(xobj_resources)
334
- content_stream(xobject, fonts)
335
- callback(:end_form_xobject)
336
- resources.pop if xobj_resources
337
- end
338
- end
339
-
340
- ################################################################################
341
- # Return a merged hash of all resources that are current. Pages, page and xobject
342
- #
343
- def current_resources
344
- hash = {}
345
- resources.each do |res|
346
- hash.merge!(res)
347
- end
348
- hash
349
- end
350
- ################################################################################
351
- # Reads a PDF content stream and calls all the appropriate callback methods for the operators
352
- # it contains
353
- #
354
- def content_stream(instructions, fonts = {})
355
- instructions = [instructions] unless instructions.kind_of?(Array)
356
- instructions = instructions.map { |ins|
357
- ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s
358
- }.join
359
- buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
360
- parser = Parser.new(buffer, @ohash)
361
- current_font = nil
362
- params = []
363
-
364
- while (token = parser.parse_token(OPERATORS))
365
- if token.kind_of?(Token) and OPERATORS.has_key?(token)
366
- if OPERATORS[token] == :set_text_font_and_size
367
- current_font = params.first
368
- if fonts[current_font].nil?
369
- raise MalformedPDFError, "Unknown font #{current_font}"
370
- end
371
- end
372
-
373
- # handle special cases in response to certain operators
374
- if OPERATORS[token].to_s.include?("show_text")
375
- # convert any text to utf-8, but output the raw string if the user wants it
376
- if options[:raw_text]
377
- callback("#{OPERATORS[token]}_raw".to_sym, params)
378
- end
379
- params = params_to_utf8(params, fonts[current_font])
380
- elsif token == "ID"
381
- # inline image data, first convert the current params into a more familiar hash
382
- map = {}
383
- params.each_slice(2) do |key, value|
384
- map[key] = value
385
- end
386
- params = [map, buffer.token]
387
- end
388
-
389
- callback(OPERATORS[token], params)
390
-
391
- if OPERATORS[token] == :invoke_xobject
392
- xobject_label = params.first
393
- params.clear
394
- walk_xobject_form(xobject_label)
395
- else
396
- params.clear
397
- end
398
- else
399
- params << token
400
- end
401
- end
402
- rescue EOFError
403
- raise MalformedPDFError, "End Of File while processing a content stream"
404
- end
405
- ################################################################################
406
- def walk_resources(resources)
407
- return unless resources.respond_to?(:[])
408
-
409
- resources = resolve_references(resources)
410
-
411
- # extract any procset information
412
- if resources[:ProcSet]
413
- callback(:resource_procset, resources[:ProcSet])
414
- end
415
-
416
- # extract any xobject information
417
- if resources[:XObject]
418
- @ohash.object(resources[:XObject]).each do |name, val|
419
- callback(:resource_xobject, [name, @ohash.object(val)])
420
- end
421
- end
422
-
423
- # extract any extgstate information
424
- if resources[:ExtGState]
425
- @ohash.object(resources[:ExtGState]).each do |name, val|
426
- callback(:resource_extgstate, [name, @ohash.object(val)])
427
- end
428
- end
429
-
430
- # extract any colorspace information
431
- if resources[:ColorSpace]
432
- @ohash.object(resources[:ColorSpace]).each do |name, val|
433
- callback(:resource_colorspace, [name, @ohash.object(val)])
434
- end
435
- end
436
-
437
- # extract any pattern information
438
- if resources[:Pattern]
439
- @ohash.object(resources[:Pattern]).each do |name, val|
440
- callback(:resource_pattern, [name, @ohash.object(val)])
441
- end
442
- end
443
-
444
- # extract any font information
445
- if resources[:Font]
446
- fonts = font_hash_from_resources(resources)
447
- fonts.each do |label, font|
448
- callback(:resource_font, [label, font])
449
- end
450
- end
451
- end
452
- ################################################################################
453
- # Convert any PDF::Reader::Resource objects into a real object
454
- def resolve_references(obj)
455
- case obj
456
- when PDF::Reader::Stream then
457
- obj.hash = resolve_references(obj.hash)
458
- obj
459
- when PDF::Reader::Reference then
460
- resolve_references(@ohash.object(obj))
461
- when Hash then
462
- arr = obj.map { |key,val| [key, resolve_references(val)] }.flatten(1)
463
- Hash[*arr]
464
- when Array then
465
- obj.collect { |item| resolve_references(item) }
466
- else
467
- obj
468
- end
469
- end
470
- ################################################################################
471
- ################################################################################
472
- def font_hash_from_resources(resources)
473
- return {} unless resources.respond_to?(:[])
474
-
475
- fonts = {}
476
- resources = @ohash.object(resources[:Font]) || {}
477
- resources.each do |label, desc|
478
- fonts[label] = PDF::Reader::Font.new(@ohash, @ohash.object(desc))
479
- end
480
- fonts
481
- end
482
- def resources
483
- @resources ||= []
484
- end
485
182
  end
486
183
  ################################################################################
487
184
  end