pdf-reader 1.1.1 → 2.5.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG +87 -2
  3. data/{README.rdoc → README.md} +43 -31
  4. data/Rakefile +21 -16
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_object +4 -1
  7. data/bin/pdf_text +1 -3
  8. data/examples/callbacks.rb +2 -1
  9. data/examples/extract_images.rb +11 -6
  10. data/examples/fuzzy_paragraphs.rb +24 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
  14. data/lib/pdf/reader/afm/Courier.afm +342 -0
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -0
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
  26. data/lib/pdf/reader/buffer.rb +90 -63
  27. data/lib/pdf/reader/cid_widths.rb +63 -0
  28. data/lib/pdf/reader/cmap.rb +69 -38
  29. data/lib/pdf/reader/encoding.rb +74 -48
  30. data/lib/pdf/reader/error.rb +24 -4
  31. data/lib/pdf/reader/filter/ascii85.rb +28 -0
  32. data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
  33. data/lib/pdf/reader/filter/depredict.rb +141 -0
  34. data/lib/pdf/reader/filter/flate.rb +53 -0
  35. data/lib/pdf/reader/filter/lzw.rb +21 -0
  36. data/lib/pdf/reader/filter/null.rb +18 -0
  37. data/lib/pdf/reader/filter/run_length.rb +45 -0
  38. data/lib/pdf/reader/filter.rb +15 -234
  39. data/lib/pdf/reader/font.rb +107 -43
  40. data/lib/pdf/reader/font_descriptor.rb +80 -0
  41. data/lib/pdf/reader/form_xobject.rb +26 -4
  42. data/lib/pdf/reader/glyph_hash.rb +56 -18
  43. data/lib/pdf/reader/lzw.rb +6 -4
  44. data/lib/pdf/reader/null_security_handler.rb +17 -0
  45. data/lib/pdf/reader/object_cache.rb +40 -16
  46. data/lib/pdf/reader/object_hash.rb +94 -40
  47. data/lib/pdf/reader/object_stream.rb +1 -0
  48. data/lib/pdf/reader/orientation_detector.rb +34 -0
  49. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  50. data/lib/pdf/reader/page.rb +48 -3
  51. data/lib/pdf/reader/page_layout.rb +125 -0
  52. data/lib/pdf/reader/page_state.rb +185 -70
  53. data/lib/pdf/reader/page_text_receiver.rb +70 -20
  54. data/lib/pdf/reader/pages_strategy.rb +4 -293
  55. data/lib/pdf/reader/parser.rb +37 -61
  56. data/lib/pdf/reader/print_receiver.rb +6 -0
  57. data/lib/pdf/reader/reference.rb +4 -1
  58. data/lib/pdf/reader/register_receiver.rb +17 -31
  59. data/lib/pdf/reader/resource_methods.rb +1 -0
  60. data/lib/pdf/reader/standard_security_handler.rb +82 -42
  61. data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
  62. data/lib/pdf/reader/stream.rb +5 -2
  63. data/lib/pdf/reader/synchronized_cache.rb +33 -0
  64. data/lib/pdf/reader/text_run.rb +99 -0
  65. data/lib/pdf/reader/token.rb +4 -1
  66. data/lib/pdf/reader/transformation_matrix.rb +195 -0
  67. data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
  68. data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
  69. data/lib/pdf/reader/width_calculator/composite.rb +28 -0
  70. data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
  71. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
  72. data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
  73. data/lib/pdf/reader/width_calculator.rb +12 -0
  74. data/lib/pdf/reader/xref.rb +41 -9
  75. data/lib/pdf/reader.rb +45 -104
  76. data/lib/pdf-reader.rb +4 -1
  77. metadata +220 -101
  78. data/bin/pdf_list_callbacks +0 -17
  79. data/lib/pdf/hash.rb +0 -15
  80. data/lib/pdf/reader/abstract_strategy.rb +0 -81
  81. data/lib/pdf/reader/metadata_strategy.rb +0 -56
  82. data/lib/pdf/reader/text_receiver.rb +0 -264
@@ -1,13 +1,23 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
- require 'matrix'
4
4
  require 'forwardable'
5
+ require 'pdf/reader/page_layout'
5
6
 
6
7
  module PDF
7
8
  class Reader
9
+
10
+ # Builds a UTF-8 string of all the text on a single page by processing all
11
+ # the operators in a content stream.
12
+ #
8
13
  class PageTextReceiver
9
14
  extend Forwardable
10
15
 
16
+ SPACE = " "
17
+
18
+ attr_reader :state, :options
19
+
20
+ ########## BEGIN FORWARDERS ##########
11
21
  # Graphics State Operators
12
22
  def_delegators :@state, :save_graphics_state, :restore_graphics_state
13
23
 
@@ -26,41 +36,40 @@ module PDF
26
36
  # Text Positioning Operators
27
37
  def_delegators :@state, :move_text_position, :move_text_position_and_set_leading
28
38
  def_delegators :@state, :set_text_matrix_and_text_line_matrix, :move_to_start_of_next_line
39
+ ########## END FORWARDERS ##########
29
40
 
30
41
  # starting a new page
31
42
  def page=(page)
32
43
  @state = PageState.new(page)
33
- @content = {}
44
+ @page = page
45
+ @content = []
46
+ @characters = []
47
+ @mediabox = page.objects.deref(page.attributes[:MediaBox])
48
+ device_bl = @state.ctm_transform(@mediabox[0], @mediabox[1])
49
+ device_tr = @state.ctm_transform(@mediabox[2], @mediabox[3])
50
+ @device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
34
51
  end
35
52
 
36
53
  def content
37
- keys = @content.keys.sort.reverse
38
- keys.map { |key|
39
- @content[key]
40
- }.join("\n")
54
+ PageLayout.new(@characters, @device_mediabox).to_s
41
55
  end
42
56
 
43
57
  #####################################################
44
58
  # Text Showing Operators
45
59
  #####################################################
46
-
47
60
  # record text that is drawn on the page
48
- def show_text(string) # Tj
49
- raise PDF::Reader::MalformedPDFError, "current font is invalid" if @state.current_font.nil?
50
- newx, newy = @state.trm_transform(0,0)
51
- @content[newy] ||= ""
52
- @content[newy] << @state.current_font.to_utf8(string)
61
+ def show_text(string) # Tj (AWAY)
62
+ internal_show_text(string)
53
63
  end
54
64
 
55
- def show_text_with_positioning(params) # TJ
56
- params.each { |arg|
57
- case arg
58
- when String
59
- show_text(arg)
60
- when Fixnum, Float
61
- show_text(" ") if arg > 1000
65
+ def show_text_with_positioning(params) # TJ [(A) 120 (WA) 20 (Y)]
66
+ params.each do |arg|
67
+ if arg.is_a?(String)
68
+ internal_show_text(arg)
69
+ else
70
+ @state.process_glyph_displacement(0, arg, false)
62
71
  end
63
- }
72
+ end
64
73
  end
65
74
 
66
75
  def move_to_next_line_and_show_text(str) # '
@@ -86,6 +95,47 @@ module PDF
86
95
  end
87
96
  end
88
97
 
98
+ private
99
+
100
+ def internal_show_text(string)
101
+ if @state.current_font.nil?
102
+ raise PDF::Reader::MalformedPDFError, "current font is invalid"
103
+ end
104
+ glyphs = @state.current_font.unpack(string)
105
+ glyphs.each_with_index do |glyph_code, index|
106
+ # paint the current glyph
107
+ newx, newy = @state.trm_transform(0,0)
108
+ newx, newy = apply_rotation(newx, newy)
109
+
110
+ utf8_chars = @state.current_font.to_utf8(glyph_code)
111
+
112
+ # apply the glyph displacement for the current glyph so the next
113
+ # glyph will appear in the correct position
114
+ glyph_width = @state.current_font.glyph_width(glyph_code) / 1000.0
115
+ th = 1
116
+ scaled_glyph_width = glyph_width * @state.font_size * th
117
+ unless utf8_chars == SPACE
118
+ @characters << TextRun.new(newx, newy, scaled_glyph_width, @state.font_size, utf8_chars)
119
+ end
120
+ @state.process_glyph_displacement(glyph_width, 0, utf8_chars == SPACE)
121
+ end
122
+ end
123
+
124
+ def apply_rotation(x, y)
125
+ if @page.rotate == 90
126
+ tmp = x
127
+ x = y
128
+ y = tmp * -1
129
+ elsif @page.rotate == 180
130
+ y *= -1
131
+ elsif @page.rotate == 270
132
+ tmp = x
133
+ x = y * -1
134
+ y = tmp * -1
135
+ end
136
+ return x, y
137
+ end
138
+
89
139
  end
90
140
  end
91
141
  end
@@ -1,3 +1,6 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
1
4
  ################################################################################
2
5
  #
3
6
  # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
@@ -25,41 +28,8 @@
25
28
 
26
29
  class PDF::Reader
27
30
  ################################################################################
28
- # Walks the pages of the PDF file and calls the appropriate callback methods when
29
- # something of interest is found.
30
- #
31
- # The callback methods should exist on the receiver object passed into the constructor. Whenever
32
- # some content is found that will trigger a callback, the receiver is checked to see if the callback
33
- # is defined.
34
- #
35
- # If it is defined it will be called. If not, processing will continue.
36
- #
37
- # = Available Callbacks
38
- # The following callbacks are available and should be methods defined on your receiver class. Only
39
- # implement the ones you need - the rest will be ignored.
40
- #
41
- # Some callbacks will include parameters which will be passed in as an array. For callbacks that supply no
42
- # parameters, or where you don't need them, the *params argument can be left off. Some example callback
43
- # method definitions are:
44
- #
45
- # def begin_document
46
- # def end_page
47
- # def show_text(string, *params)
48
- # def fill_stroke(*params)
49
- #
50
- # You should be able to infer the basic command the callback is reporting based on the name. For
51
- # further experimentation, define the callback with just a *params parameter, then print out the
52
- # contents of the array using something like:
53
- #
54
- # puts params.inspect
55
- #
56
31
  # == Text Callbacks
57
32
  #
58
- # All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
59
- # PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be careful
60
- # when doing a comparison on strings returned from PDF::Reader (when doing unit tests for example). The
61
- # string may not be byte-by-byte identical with the string that was originally written to the PDF.
62
- #
63
33
  # - end_text_object
64
34
  # - move_to_start_of_next_line
65
35
  # - set_character_spacing
@@ -77,14 +47,6 @@ class PDF::Reader
77
47
  # - move_to_next_line_and_show_text
78
48
  # - set_spacing_next_line_show_text
79
49
  #
80
- # If the :raw_text option was passed to the PDF::Reader class the following callbacks
81
- # may also appear:
82
- #
83
- # - show_text_raw
84
- # - show_text_with_positioning_raw
85
- # - move_to_next_line_and_show_text_raw
86
- # - set_spacing_next_line_show_text_raw
87
- #
88
50
  # == Graphics Callbacks
89
51
  # - close_fill_stroke
90
52
  # - fill_stroke
@@ -142,42 +104,7 @@ class PDF::Reader
142
104
  # - set_clipping_path_with_even_odd
143
105
  # - append_curved_segment_final_point_replicated
144
106
  #
145
- # == Misc Callbacks
146
- # - begin_compatibility_section
147
- # - end_compatibility_section,
148
- # - begin_document
149
- # - end_document
150
- # - begin_page_container
151
- # - end_page_container
152
- # - begin_page
153
- # - end_page
154
- # - metadata
155
- # - xml_metadata
156
- # - page_count
157
- # - begin_form_xobject
158
- # - end_form_xobject
159
- #
160
- # == Resource Callbacks
161
- #
162
- # Each page can contain (or inherit) a range of resources required for the page,
163
- # including things like fonts and images. The following callbacks may appear
164
- # after begin_page if the relevant resources exist on a page:
165
- #
166
- # - resource_procset
167
- # - resource_xobject
168
- # - resource_extgstate
169
- # - resource_colorspace
170
- # - resource_pattern
171
- # - resource_font
172
- #
173
- # In most cases, these callbacks associate a name with each resource, allowing it
174
- # to be referred to by name in the page content. For example, an XObject can hold an image.
175
- # If it gets mapped to the name "IM1", then it can be placed on the page using
176
- # invoke_xobject "IM1".
177
- #
178
- # DEPRECATED: this class was deprecated in version 0.11.0 and will
179
- # eventually be removed
180
- class PagesStrategy< AbstractStrategy # :nodoc:
107
+ class PagesStrategy # :nodoc:
181
108
  OPERATORS = {
182
109
  'b' => :close_fill_stroke,
183
110
  'B' => :fill_stroke,
@@ -253,222 +180,6 @@ class PDF::Reader
253
180
  '\'' => :move_to_next_line_and_show_text,
254
181
  '"' => :set_spacing_next_line_show_text,
255
182
  }
256
- def self.to_sym
257
- :pages
258
- end
259
- ################################################################################
260
- # Begin processing the document
261
- def process
262
- return false unless options[:pages]
263
-
264
- callback(:begin_document, [root])
265
- walk_pages(@ohash.object(root[:Pages]))
266
- callback(:end_document)
267
- end
268
- private
269
- ################################################################################
270
- # Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
271
- # its content
272
- def walk_pages (page)
273
-
274
- # extract page content
275
- if page[:Type] == :Pages
276
- callback(:begin_page_container, [page])
277
- res = @ohash.object(page[:Resources])
278
- resources.push res if res
279
- @ohash.object(page[:Kids]).each {|child| walk_pages(@ohash.object(child))}
280
- resources.pop if res
281
- callback(:end_page_container)
282
- elsif page[:Type] == :Page
283
- callback(:begin_page, [page])
284
- res = @ohash.object(page[:Resources])
285
- resources.push res if res
286
- walk_resources(current_resources)
287
-
288
- if @ohash.object(page[:Contents]).kind_of?(Array)
289
- contents = @ohash.object(page[:Contents])
290
- else
291
- contents = [page[:Contents]]
292
- end
293
-
294
- fonts = font_hash_from_resources(current_resources)
295
-
296
- if page.has_key?(:Contents) and page[:Contents]
297
- direct_contents = contents.map { |content| @ohash.object(content) }
298
- content_stream(direct_contents, fonts)
299
- end
300
-
301
- resources.pop if res
302
- callback(:end_page)
303
- end
304
- end
305
- ################################################################################
306
- # Retrieve the XObject for the supplied label and if it's a Form, walk it
307
- # like a regular page content stream.
308
- #
309
- def walk_xobject_form(label)
310
- xobjects = @ohash.object(current_resources[:XObject]) || {}
311
- xobject = @ohash.object(xobjects[label])
312
-
313
- if xobject && xobject.hash[:Subtype] == :Form
314
- callback(:begin_form_xobject)
315
- xobj_resources = @ohash.object(xobject.hash[:Resources])
316
- if xobj_resources
317
- resources.push xobj_resources
318
- walk_resources(xobj_resources)
319
- end
320
- fonts = font_hash_from_resources(xobj_resources)
321
- content_stream(xobject, fonts)
322
- callback(:end_form_xobject)
323
- resources.pop if xobj_resources
324
- end
325
- end
326
-
327
- ################################################################################
328
- # Return a merged hash of all resources that are current. Pages, page and xobject
329
- #
330
- def current_resources
331
- hash = {}
332
- resources.each do |res|
333
- hash.merge!(res)
334
- end
335
- hash
336
- end
337
- ################################################################################
338
- # Reads a PDF content stream and calls all the appropriate callback methods for the operators
339
- # it contains
340
- #
341
- def content_stream (instructions, fonts = {})
342
- instructions = [instructions] unless instructions.kind_of?(Array)
343
- instructions = instructions.map { |ins|
344
- ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s
345
- }.join
346
- buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
347
- parser = Parser.new(buffer, @ohash)
348
- current_font = nil
349
- params = []
350
-
351
- while (token = parser.parse_token(OPERATORS))
352
- if token.kind_of?(Token) and OPERATORS.has_key?(token)
353
- if OPERATORS[token] == :set_text_font_and_size
354
- current_font = params.first
355
- if fonts[current_font].nil?
356
- raise MalformedPDFError, "Unknown font #{current_font}"
357
- end
358
- end
359
-
360
- # handle special cases in response to certain operators
361
- if OPERATORS[token].to_s.include?("show_text")
362
- # convert any text to utf-8, but output the raw string if the user wants it
363
- if options[:raw_text]
364
- callback("#{OPERATORS[token]}_raw".to_sym, params)
365
- end
366
- params = fonts[current_font].to_utf8(params)
367
- elsif token == "ID"
368
- # inline image data, first convert the current params into a more familiar hash
369
- map = {}
370
- params.each_slice(2) do |key, value|
371
- map[key] = value
372
- end
373
- params = [map, buffer.token]
374
- end
375
-
376
- callback(OPERATORS[token], params)
377
-
378
- if OPERATORS[token] == :invoke_xobject
379
- xobject_label = params.first
380
- params.clear
381
- walk_xobject_form(xobject_label)
382
- else
383
- params.clear
384
- end
385
- else
386
- params << token
387
- end
388
- end
389
- rescue EOFError => e
390
- raise MalformedPDFError, "End Of File while processing a content stream"
391
- end
392
- ################################################################################
393
- def walk_resources(resources)
394
- return unless resources.respond_to?(:[])
395
-
396
- resources = resolve_references(resources)
397
-
398
- # extract any procset information
399
- if resources[:ProcSet]
400
- callback(:resource_procset, resources[:ProcSet])
401
- end
402
-
403
- # extract any xobject information
404
- if resources[:XObject]
405
- @ohash.object(resources[:XObject]).each do |name, val|
406
- callback(:resource_xobject, [name, @ohash.object(val)])
407
- end
408
- end
409
-
410
- # extract any extgstate information
411
- if resources[:ExtGState]
412
- @ohash.object(resources[:ExtGState]).each do |name, val|
413
- callback(:resource_extgstate, [name, @ohash.object(val)])
414
- end
415
- end
416
-
417
- # extract any colorspace information
418
- if resources[:ColorSpace]
419
- @ohash.object(resources[:ColorSpace]).each do |name, val|
420
- callback(:resource_colorspace, [name, @ohash.object(val)])
421
- end
422
- end
423
-
424
- # extract any pattern information
425
- if resources[:Pattern]
426
- @ohash.object(resources[:Pattern]).each do |name, val|
427
- callback(:resource_pattern, [name, @ohash.object(val)])
428
- end
429
- end
430
-
431
- # extract any font information
432
- if resources[:Font]
433
- fonts = font_hash_from_resources(resources)
434
- fonts.each do |label, font|
435
- callback(:resource_font, [label, font])
436
- end
437
- end
438
- end
439
- ################################################################################
440
- # Convert any PDF::Reader::Resource objects into a real object
441
- def resolve_references(obj)
442
- case obj
443
- when PDF::Reader::Stream then
444
- obj.hash = resolve_references(obj.hash)
445
- obj
446
- when PDF::Reader::Reference then
447
- resolve_references(@ohash.object(obj))
448
- when Hash then
449
- arr = obj.map { |key,val| [key, resolve_references(val)] }.flatten(1)
450
- Hash[*arr]
451
- when Array then
452
- obj.collect { |item| resolve_references(item) }
453
- else
454
- obj
455
- end
456
- end
457
- ################################################################################
458
- ################################################################################
459
- def font_hash_from_resources(resources)
460
- return {} unless resources.respond_to?(:[])
461
-
462
- fonts = {}
463
- resources = @ohash.object(resources[:Font]) || {}
464
- resources.each do |label, desc|
465
- fonts[label] = PDF::Reader::Font.new(@ohash, @ohash.object(desc))
466
- end
467
- fonts
468
- end
469
- def resources
470
- @resources ||= []
471
- end
472
183
  end
473
184
  ################################################################################
474
185
  end