pdf-reader 1.1.1 → 2.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. checksums.yaml +7 -0
  2. data/CHANGELOG +87 -2
  3. data/{README.rdoc → README.md} +43 -31
  4. data/Rakefile +21 -16
  5. data/bin/pdf_callbacks +1 -1
  6. data/bin/pdf_object +4 -1
  7. data/bin/pdf_text +1 -3
  8. data/examples/callbacks.rb +2 -1
  9. data/examples/extract_images.rb +11 -6
  10. data/examples/fuzzy_paragraphs.rb +24 -0
  11. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -0
  12. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -0
  13. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -0
  14. data/lib/pdf/reader/afm/Courier.afm +342 -0
  15. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -0
  16. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -0
  17. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -0
  18. data/lib/pdf/reader/afm/Helvetica.afm +3051 -0
  19. data/lib/pdf/reader/afm/MustRead.html +19 -0
  20. data/lib/pdf/reader/afm/Symbol.afm +213 -0
  21. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -0
  22. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -0
  23. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -0
  24. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -0
  25. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -0
  26. data/lib/pdf/reader/buffer.rb +90 -63
  27. data/lib/pdf/reader/cid_widths.rb +63 -0
  28. data/lib/pdf/reader/cmap.rb +69 -38
  29. data/lib/pdf/reader/encoding.rb +74 -48
  30. data/lib/pdf/reader/error.rb +24 -4
  31. data/lib/pdf/reader/filter/ascii85.rb +28 -0
  32. data/lib/pdf/reader/filter/ascii_hex.rb +30 -0
  33. data/lib/pdf/reader/filter/depredict.rb +141 -0
  34. data/lib/pdf/reader/filter/flate.rb +53 -0
  35. data/lib/pdf/reader/filter/lzw.rb +21 -0
  36. data/lib/pdf/reader/filter/null.rb +18 -0
  37. data/lib/pdf/reader/filter/run_length.rb +45 -0
  38. data/lib/pdf/reader/filter.rb +15 -234
  39. data/lib/pdf/reader/font.rb +107 -43
  40. data/lib/pdf/reader/font_descriptor.rb +80 -0
  41. data/lib/pdf/reader/form_xobject.rb +26 -4
  42. data/lib/pdf/reader/glyph_hash.rb +56 -18
  43. data/lib/pdf/reader/lzw.rb +6 -4
  44. data/lib/pdf/reader/null_security_handler.rb +17 -0
  45. data/lib/pdf/reader/object_cache.rb +40 -16
  46. data/lib/pdf/reader/object_hash.rb +94 -40
  47. data/lib/pdf/reader/object_stream.rb +1 -0
  48. data/lib/pdf/reader/orientation_detector.rb +34 -0
  49. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  50. data/lib/pdf/reader/page.rb +48 -3
  51. data/lib/pdf/reader/page_layout.rb +125 -0
  52. data/lib/pdf/reader/page_state.rb +185 -70
  53. data/lib/pdf/reader/page_text_receiver.rb +70 -20
  54. data/lib/pdf/reader/pages_strategy.rb +4 -293
  55. data/lib/pdf/reader/parser.rb +37 -61
  56. data/lib/pdf/reader/print_receiver.rb +6 -0
  57. data/lib/pdf/reader/reference.rb +4 -1
  58. data/lib/pdf/reader/register_receiver.rb +17 -31
  59. data/lib/pdf/reader/resource_methods.rb +1 -0
  60. data/lib/pdf/reader/standard_security_handler.rb +82 -42
  61. data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
  62. data/lib/pdf/reader/stream.rb +5 -2
  63. data/lib/pdf/reader/synchronized_cache.rb +33 -0
  64. data/lib/pdf/reader/text_run.rb +99 -0
  65. data/lib/pdf/reader/token.rb +4 -1
  66. data/lib/pdf/reader/transformation_matrix.rb +195 -0
  67. data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
  68. data/lib/pdf/reader/width_calculator/built_in.rb +67 -0
  69. data/lib/pdf/reader/width_calculator/composite.rb +28 -0
  70. data/lib/pdf/reader/width_calculator/true_type.rb +56 -0
  71. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +33 -0
  72. data/lib/pdf/reader/width_calculator/type_zero.rb +25 -0
  73. data/lib/pdf/reader/width_calculator.rb +12 -0
  74. data/lib/pdf/reader/xref.rb +41 -9
  75. data/lib/pdf/reader.rb +45 -104
  76. data/lib/pdf-reader.rb +4 -1
  77. metadata +220 -101
  78. data/bin/pdf_list_callbacks +0 -17
  79. data/lib/pdf/hash.rb +0 -15
  80. data/lib/pdf/reader/abstract_strategy.rb +0 -81
  81. data/lib/pdf/reader/metadata_strategy.rb +0 -56
  82. data/lib/pdf/reader/text_receiver.rb +0 -264
@@ -1,13 +1,23 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
- require 'matrix'
4
4
  require 'forwardable'
5
+ require 'pdf/reader/page_layout'
5
6
 
6
7
  module PDF
7
8
  class Reader
9
+
10
+ # Builds a UTF-8 string of all the text on a single page by processing all
11
+ # the operators in a content stream.
12
+ #
8
13
  class PageTextReceiver
9
14
  extend Forwardable
10
15
 
16
+ SPACE = " "
17
+
18
+ attr_reader :state, :options
19
+
20
+ ########## BEGIN FORWARDERS ##########
11
21
  # Graphics State Operators
12
22
  def_delegators :@state, :save_graphics_state, :restore_graphics_state
13
23
 
@@ -26,41 +36,40 @@ module PDF
26
36
  # Text Positioning Operators
27
37
  def_delegators :@state, :move_text_position, :move_text_position_and_set_leading
28
38
  def_delegators :@state, :set_text_matrix_and_text_line_matrix, :move_to_start_of_next_line
39
+ ########## END FORWARDERS ##########
29
40
 
30
41
  # starting a new page
31
42
  def page=(page)
32
43
  @state = PageState.new(page)
33
- @content = {}
44
+ @page = page
45
+ @content = []
46
+ @characters = []
47
+ @mediabox = page.objects.deref(page.attributes[:MediaBox])
48
+ device_bl = @state.ctm_transform(@mediabox[0], @mediabox[1])
49
+ device_tr = @state.ctm_transform(@mediabox[2], @mediabox[3])
50
+ @device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
34
51
  end
35
52
 
36
53
  def content
37
- keys = @content.keys.sort.reverse
38
- keys.map { |key|
39
- @content[key]
40
- }.join("\n")
54
+ PageLayout.new(@characters, @device_mediabox).to_s
41
55
  end
42
56
 
43
57
  #####################################################
44
58
  # Text Showing Operators
45
59
  #####################################################
46
-
47
60
  # record text that is drawn on the page
48
- def show_text(string) # Tj
49
- raise PDF::Reader::MalformedPDFError, "current font is invalid" if @state.current_font.nil?
50
- newx, newy = @state.trm_transform(0,0)
51
- @content[newy] ||= ""
52
- @content[newy] << @state.current_font.to_utf8(string)
61
+ def show_text(string) # Tj (AWAY)
62
+ internal_show_text(string)
53
63
  end
54
64
 
55
- def show_text_with_positioning(params) # TJ
56
- params.each { |arg|
57
- case arg
58
- when String
59
- show_text(arg)
60
- when Fixnum, Float
61
- show_text(" ") if arg > 1000
65
+ def show_text_with_positioning(params) # TJ [(A) 120 (WA) 20 (Y)]
66
+ params.each do |arg|
67
+ if arg.is_a?(String)
68
+ internal_show_text(arg)
69
+ else
70
+ @state.process_glyph_displacement(0, arg, false)
62
71
  end
63
- }
72
+ end
64
73
  end
65
74
 
66
75
  def move_to_next_line_and_show_text(str) # '
@@ -86,6 +95,47 @@ module PDF
86
95
  end
87
96
  end
88
97
 
98
+ private
99
+
100
+ def internal_show_text(string)
101
+ if @state.current_font.nil?
102
+ raise PDF::Reader::MalformedPDFError, "current font is invalid"
103
+ end
104
+ glyphs = @state.current_font.unpack(string)
105
+ glyphs.each_with_index do |glyph_code, index|
106
+ # paint the current glyph
107
+ newx, newy = @state.trm_transform(0,0)
108
+ newx, newy = apply_rotation(newx, newy)
109
+
110
+ utf8_chars = @state.current_font.to_utf8(glyph_code)
111
+
112
+ # apply the glyph displacement for the current glyph so the next
113
+ # glyph will appear in the correct position
114
+ glyph_width = @state.current_font.glyph_width(glyph_code) / 1000.0
115
+ th = 1
116
+ scaled_glyph_width = glyph_width * @state.font_size * th
117
+ unless utf8_chars == SPACE
118
+ @characters << TextRun.new(newx, newy, scaled_glyph_width, @state.font_size, utf8_chars)
119
+ end
120
+ @state.process_glyph_displacement(glyph_width, 0, utf8_chars == SPACE)
121
+ end
122
+ end
123
+
124
+ def apply_rotation(x, y)
125
+ if @page.rotate == 90
126
+ tmp = x
127
+ x = y
128
+ y = tmp * -1
129
+ elsif @page.rotate == 180
130
+ y *= -1
131
+ elsif @page.rotate == 270
132
+ tmp = x
133
+ x = y * -1
134
+ y = tmp * -1
135
+ end
136
+ return x, y
137
+ end
138
+
89
139
  end
90
140
  end
91
141
  end
@@ -1,3 +1,6 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
1
4
  ################################################################################
2
5
  #
3
6
  # Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
@@ -25,41 +28,8 @@
25
28
 
26
29
  class PDF::Reader
27
30
  ################################################################################
28
- # Walks the pages of the PDF file and calls the appropriate callback methods when
29
- # something of interest is found.
30
- #
31
- # The callback methods should exist on the receiver object passed into the constructor. Whenever
32
- # some content is found that will trigger a callback, the receiver is checked to see if the callback
33
- # is defined.
34
- #
35
- # If it is defined it will be called. If not, processing will continue.
36
- #
37
- # = Available Callbacks
38
- # The following callbacks are available and should be methods defined on your receiver class. Only
39
- # implement the ones you need - the rest will be ignored.
40
- #
41
- # Some callbacks will include parameters which will be passed in as an array. For callbacks that supply no
42
- # parameters, or where you don't need them, the *params argument can be left off. Some example callback
43
- # method definitions are:
44
- #
45
- # def begin_document
46
- # def end_page
47
- # def show_text(string, *params)
48
- # def fill_stroke(*params)
49
- #
50
- # You should be able to infer the basic command the callback is reporting based on the name. For
51
- # further experimentation, define the callback with just a *params parameter, then print out the
52
- # contents of the array using something like:
53
- #
54
- # puts params.inspect
55
- #
56
31
  # == Text Callbacks
57
32
  #
58
- # All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
59
- # PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be careful
60
- # when doing a comparison on strings returned from PDF::Reader (when doing unit tests for example). The
61
- # string may not be byte-by-byte identical with the string that was originally written to the PDF.
62
- #
63
33
  # - end_text_object
64
34
  # - move_to_start_of_next_line
65
35
  # - set_character_spacing
@@ -77,14 +47,6 @@ class PDF::Reader
77
47
  # - move_to_next_line_and_show_text
78
48
  # - set_spacing_next_line_show_text
79
49
  #
80
- # If the :raw_text option was passed to the PDF::Reader class the following callbacks
81
- # may also appear:
82
- #
83
- # - show_text_raw
84
- # - show_text_with_positioning_raw
85
- # - move_to_next_line_and_show_text_raw
86
- # - set_spacing_next_line_show_text_raw
87
- #
88
50
  # == Graphics Callbacks
89
51
  # - close_fill_stroke
90
52
  # - fill_stroke
@@ -142,42 +104,7 @@ class PDF::Reader
142
104
  # - set_clipping_path_with_even_odd
143
105
  # - append_curved_segment_final_point_replicated
144
106
  #
145
- # == Misc Callbacks
146
- # - begin_compatibility_section
147
- # - end_compatibility_section,
148
- # - begin_document
149
- # - end_document
150
- # - begin_page_container
151
- # - end_page_container
152
- # - begin_page
153
- # - end_page
154
- # - metadata
155
- # - xml_metadata
156
- # - page_count
157
- # - begin_form_xobject
158
- # - end_form_xobject
159
- #
160
- # == Resource Callbacks
161
- #
162
- # Each page can contain (or inherit) a range of resources required for the page,
163
- # including things like fonts and images. The following callbacks may appear
164
- # after begin_page if the relevant resources exist on a page:
165
- #
166
- # - resource_procset
167
- # - resource_xobject
168
- # - resource_extgstate
169
- # - resource_colorspace
170
- # - resource_pattern
171
- # - resource_font
172
- #
173
- # In most cases, these callbacks associate a name with each resource, allowing it
174
- # to be referred to by name in the page content. For example, an XObject can hold an image.
175
- # If it gets mapped to the name "IM1", then it can be placed on the page using
176
- # invoke_xobject "IM1".
177
- #
178
- # DEPRECATED: this class was deprecated in version 0.11.0 and will
179
- # eventually be removed
180
- class PagesStrategy< AbstractStrategy # :nodoc:
107
+ class PagesStrategy # :nodoc:
181
108
  OPERATORS = {
182
109
  'b' => :close_fill_stroke,
183
110
  'B' => :fill_stroke,
@@ -253,222 +180,6 @@ class PDF::Reader
253
180
  '\'' => :move_to_next_line_and_show_text,
254
181
  '"' => :set_spacing_next_line_show_text,
255
182
  }
256
- def self.to_sym
257
- :pages
258
- end
259
- ################################################################################
260
- # Begin processing the document
261
- def process
262
- return false unless options[:pages]
263
-
264
- callback(:begin_document, [root])
265
- walk_pages(@ohash.object(root[:Pages]))
266
- callback(:end_document)
267
- end
268
- private
269
- ################################################################################
270
- # Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all
271
- # its content
272
- def walk_pages (page)
273
-
274
- # extract page content
275
- if page[:Type] == :Pages
276
- callback(:begin_page_container, [page])
277
- res = @ohash.object(page[:Resources])
278
- resources.push res if res
279
- @ohash.object(page[:Kids]).each {|child| walk_pages(@ohash.object(child))}
280
- resources.pop if res
281
- callback(:end_page_container)
282
- elsif page[:Type] == :Page
283
- callback(:begin_page, [page])
284
- res = @ohash.object(page[:Resources])
285
- resources.push res if res
286
- walk_resources(current_resources)
287
-
288
- if @ohash.object(page[:Contents]).kind_of?(Array)
289
- contents = @ohash.object(page[:Contents])
290
- else
291
- contents = [page[:Contents]]
292
- end
293
-
294
- fonts = font_hash_from_resources(current_resources)
295
-
296
- if page.has_key?(:Contents) and page[:Contents]
297
- direct_contents = contents.map { |content| @ohash.object(content) }
298
- content_stream(direct_contents, fonts)
299
- end
300
-
301
- resources.pop if res
302
- callback(:end_page)
303
- end
304
- end
305
- ################################################################################
306
- # Retrieve the XObject for the supplied label and if it's a Form, walk it
307
- # like a regular page content stream.
308
- #
309
- def walk_xobject_form(label)
310
- xobjects = @ohash.object(current_resources[:XObject]) || {}
311
- xobject = @ohash.object(xobjects[label])
312
-
313
- if xobject && xobject.hash[:Subtype] == :Form
314
- callback(:begin_form_xobject)
315
- xobj_resources = @ohash.object(xobject.hash[:Resources])
316
- if xobj_resources
317
- resources.push xobj_resources
318
- walk_resources(xobj_resources)
319
- end
320
- fonts = font_hash_from_resources(xobj_resources)
321
- content_stream(xobject, fonts)
322
- callback(:end_form_xobject)
323
- resources.pop if xobj_resources
324
- end
325
- end
326
-
327
- ################################################################################
328
- # Return a merged hash of all resources that are current. Pages, page and xobject
329
- #
330
- def current_resources
331
- hash = {}
332
- resources.each do |res|
333
- hash.merge!(res)
334
- end
335
- hash
336
- end
337
- ################################################################################
338
- # Reads a PDF content stream and calls all the appropriate callback methods for the operators
339
- # it contains
340
- #
341
- def content_stream (instructions, fonts = {})
342
- instructions = [instructions] unless instructions.kind_of?(Array)
343
- instructions = instructions.map { |ins|
344
- ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s
345
- }.join
346
- buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
347
- parser = Parser.new(buffer, @ohash)
348
- current_font = nil
349
- params = []
350
-
351
- while (token = parser.parse_token(OPERATORS))
352
- if token.kind_of?(Token) and OPERATORS.has_key?(token)
353
- if OPERATORS[token] == :set_text_font_and_size
354
- current_font = params.first
355
- if fonts[current_font].nil?
356
- raise MalformedPDFError, "Unknown font #{current_font}"
357
- end
358
- end
359
-
360
- # handle special cases in response to certain operators
361
- if OPERATORS[token].to_s.include?("show_text")
362
- # convert any text to utf-8, but output the raw string if the user wants it
363
- if options[:raw_text]
364
- callback("#{OPERATORS[token]}_raw".to_sym, params)
365
- end
366
- params = fonts[current_font].to_utf8(params)
367
- elsif token == "ID"
368
- # inline image data, first convert the current params into a more familiar hash
369
- map = {}
370
- params.each_slice(2) do |key, value|
371
- map[key] = value
372
- end
373
- params = [map, buffer.token]
374
- end
375
-
376
- callback(OPERATORS[token], params)
377
-
378
- if OPERATORS[token] == :invoke_xobject
379
- xobject_label = params.first
380
- params.clear
381
- walk_xobject_form(xobject_label)
382
- else
383
- params.clear
384
- end
385
- else
386
- params << token
387
- end
388
- end
389
- rescue EOFError => e
390
- raise MalformedPDFError, "End Of File while processing a content stream"
391
- end
392
- ################################################################################
393
- def walk_resources(resources)
394
- return unless resources.respond_to?(:[])
395
-
396
- resources = resolve_references(resources)
397
-
398
- # extract any procset information
399
- if resources[:ProcSet]
400
- callback(:resource_procset, resources[:ProcSet])
401
- end
402
-
403
- # extract any xobject information
404
- if resources[:XObject]
405
- @ohash.object(resources[:XObject]).each do |name, val|
406
- callback(:resource_xobject, [name, @ohash.object(val)])
407
- end
408
- end
409
-
410
- # extract any extgstate information
411
- if resources[:ExtGState]
412
- @ohash.object(resources[:ExtGState]).each do |name, val|
413
- callback(:resource_extgstate, [name, @ohash.object(val)])
414
- end
415
- end
416
-
417
- # extract any colorspace information
418
- if resources[:ColorSpace]
419
- @ohash.object(resources[:ColorSpace]).each do |name, val|
420
- callback(:resource_colorspace, [name, @ohash.object(val)])
421
- end
422
- end
423
-
424
- # extract any pattern information
425
- if resources[:Pattern]
426
- @ohash.object(resources[:Pattern]).each do |name, val|
427
- callback(:resource_pattern, [name, @ohash.object(val)])
428
- end
429
- end
430
-
431
- # extract any font information
432
- if resources[:Font]
433
- fonts = font_hash_from_resources(resources)
434
- fonts.each do |label, font|
435
- callback(:resource_font, [label, font])
436
- end
437
- end
438
- end
439
- ################################################################################
440
- # Convert any PDF::Reader::Resource objects into a real object
441
- def resolve_references(obj)
442
- case obj
443
- when PDF::Reader::Stream then
444
- obj.hash = resolve_references(obj.hash)
445
- obj
446
- when PDF::Reader::Reference then
447
- resolve_references(@ohash.object(obj))
448
- when Hash then
449
- arr = obj.map { |key,val| [key, resolve_references(val)] }.flatten(1)
450
- Hash[*arr]
451
- when Array then
452
- obj.collect { |item| resolve_references(item) }
453
- else
454
- obj
455
- end
456
- end
457
- ################################################################################
458
- ################################################################################
459
- def font_hash_from_resources(resources)
460
- return {} unless resources.respond_to?(:[])
461
-
462
- fonts = {}
463
- resources = @ohash.object(resources[:Font]) || {}
464
- resources.each do |label, desc|
465
- fonts[label] = PDF::Reader::Font.new(@ohash, @ohash.object(desc))
466
- end
467
- fonts
468
- end
469
- def resources
470
- @resources ||= []
471
- end
472
183
  end
473
184
  ################################################################################
474
185
  end