pdf-reader 2.1.0 → 2.4.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (73)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG +28 -1
  3. data/README.md +2 -2
  4. data/bin/pdf_callbacks +1 -1
  5. data/bin/pdf_text +1 -1
  6. data/lib/pdf-reader.rb +1 -0
  7. data/lib/pdf/reader.rb +2 -2
  8. data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
  9. data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
  10. data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
  11. data/lib/pdf/reader/afm/Courier.afm +342 -342
  12. data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
  13. data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
  14. data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
  15. data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
  16. data/lib/pdf/reader/afm/MustRead.html +19 -0
  17. data/lib/pdf/reader/afm/Symbol.afm +213 -213
  18. data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
  19. data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
  20. data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
  21. data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
  22. data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
  23. data/lib/pdf/reader/buffer.rb +12 -11
  24. data/lib/pdf/reader/cid_widths.rb +2 -0
  25. data/lib/pdf/reader/cmap.rb +22 -12
  26. data/lib/pdf/reader/encoding.rb +12 -9
  27. data/lib/pdf/reader/error.rb +1 -0
  28. data/lib/pdf/reader/filter.rb +1 -0
  29. data/lib/pdf/reader/filter/ascii85.rb +1 -0
  30. data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
  31. data/lib/pdf/reader/filter/depredict.rb +1 -0
  32. data/lib/pdf/reader/filter/flate.rb +6 -4
  33. data/lib/pdf/reader/filter/lzw.rb +2 -0
  34. data/lib/pdf/reader/filter/null.rb +2 -0
  35. data/lib/pdf/reader/filter/run_length.rb +3 -1
  36. data/lib/pdf/reader/font.rb +11 -2
  37. data/lib/pdf/reader/font_descriptor.rb +1 -0
  38. data/lib/pdf/reader/form_xobject.rb +1 -0
  39. data/lib/pdf/reader/glyph_hash.rb +1 -0
  40. data/lib/pdf/reader/lzw.rb +2 -1
  41. data/lib/pdf/reader/null_security_handler.rb +1 -0
  42. data/lib/pdf/reader/object_cache.rb +1 -0
  43. data/lib/pdf/reader/object_hash.rb +22 -10
  44. data/lib/pdf/reader/object_stream.rb +1 -0
  45. data/lib/pdf/reader/orientation_detector.rb +5 -4
  46. data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
  47. data/lib/pdf/reader/page.rb +29 -0
  48. data/lib/pdf/reader/page_layout.rb +10 -5
  49. data/lib/pdf/reader/page_state.rb +10 -1
  50. data/lib/pdf/reader/page_text_receiver.rb +5 -1
  51. data/lib/pdf/reader/pages_strategy.rb +1 -0
  52. data/lib/pdf/reader/parser.rb +5 -4
  53. data/lib/pdf/reader/print_receiver.rb +1 -0
  54. data/lib/pdf/reader/reference.rb +1 -0
  55. data/lib/pdf/reader/register_receiver.rb +1 -0
  56. data/lib/pdf/reader/resource_methods.rb +1 -0
  57. data/lib/pdf/reader/standard_security_handler.rb +1 -0
  58. data/lib/pdf/reader/standard_security_handler_v5.rb +2 -0
  59. data/lib/pdf/reader/stream.rb +1 -0
  60. data/lib/pdf/reader/synchronized_cache.rb +1 -0
  61. data/lib/pdf/reader/text_run.rb +25 -0
  62. data/lib/pdf/reader/token.rb +1 -0
  63. data/lib/pdf/reader/transformation_matrix.rb +1 -0
  64. data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
  65. data/lib/pdf/reader/width_calculator.rb +1 -0
  66. data/lib/pdf/reader/width_calculator/built_in.rb +18 -1
  67. data/lib/pdf/reader/width_calculator/composite.rb +1 -0
  68. data/lib/pdf/reader/width_calculator/true_type.rb +2 -2
  69. data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
  70. data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
  71. data/lib/pdf/reader/xref.rb +11 -5
  72. metadata +17 -13
  73. data/lib/pdf/hash.rb +0 -19
@@ -1,4 +1,7 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ require 'pdf/reader/overlapping_runs_filter'
2
5
 
3
6
  class PDF::Reader
4
7
 
@@ -14,13 +17,15 @@ class PDF::Reader
14
17
  def initialize(runs, mediabox)
15
18
  raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
16
19
 
17
- @runs = merge_runs(runs)
20
+ @runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
18
21
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
19
22
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
20
23
  @mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
21
- @page_width = mediabox[2] - mediabox[0]
22
- @page_height = mediabox[3] - mediabox[1]
23
- @x_offset = @runs.map(&:x).sort.first
24
+ @page_width = (mediabox[2] - mediabox[0]).abs
25
+ @page_height = (mediabox[3] - mediabox[1]).abs
26
+ @x_offset = @runs.map(&:x).sort.first || 0
27
+ lowest_y = @runs.map(&:y).sort.first || 0
28
+ @y_offset = lowest_y > 0 ? 0 : lowest_y
24
29
  end
25
30
 
26
31
  def to_s
@@ -29,7 +34,7 @@ class PDF::Reader
29
34
  page = row_count.times.map { |i| " " * col_count }
30
35
  @runs.each do |run|
31
36
  x_pos = ((run.x - @x_offset) / col_multiplier).round
32
- y_pos = row_count - (run.y / row_multiplier).round
37
+ y_pos = row_count - ((run.y - @y_offset) / row_multiplier).round
33
38
  if y_pos <= row_count && y_pos >= 0 && x_pos <= col_count && x_pos >= 0
34
39
  local_string_insert(page[y_pos-1], run.text, x_pos)
35
40
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'pdf/reader/transformation_matrix'
4
5
 
@@ -29,7 +30,15 @@ class PDF::Reader
29
30
  @xobject_stack = [page.xobjects]
30
31
  @cs_stack = [page.color_spaces]
31
32
  @stack = [DEFAULT_GRAPHICS_STATE.dup]
32
- state[:ctm] = identity_matrix
33
+ if page.rotate == 0
34
+ state[:ctm] = identity_matrix
35
+ else
36
+ rotate_cos = Math.cos(page.rotate * (Math::PI/180.0)).round(2)
37
+ rotate_sin = Math.sin(page.rotate * (Math::PI/180.0)).round(2)
38
+ state[:ctm] = TransformationMatrix.new(rotate_cos, rotate_sin,
39
+ rotate_sin * -1, rotate_cos,
40
+ 0, 0)
41
+ end
33
42
  end
34
43
 
35
44
  #####################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'forwardable'
4
5
  require 'pdf/reader/page_layout'
@@ -43,10 +44,13 @@ module PDF
43
44
  @content = []
44
45
  @characters = []
45
46
  @mediabox = page.objects.deref(page.attributes[:MediaBox])
47
+ device_bl = @state.ctm_transform(@mediabox[0], @mediabox[1])
48
+ device_tr = @state.ctm_transform(@mediabox[2], @mediabox[3])
49
+ @device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
46
50
  end
47
51
 
48
52
  def content
49
- PageLayout.new(@characters, @mediabox).to_s
53
+ PageLayout.new(@characters, @device_mediabox).to_s
50
54
  end
51
55
 
52
56
  #####################################################
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -132,7 +133,7 @@ class PDF::Reader
132
133
  # reads a PDF name from the buffer and converts it to a Ruby Symbol
133
134
  def pdf_name
134
135
  tok = @buffer.token
135
- tok.gsub!(/#([A-Fa-f0-9]{2})/) do |match|
136
+ tok = tok.dup.gsub(/#([A-Fa-f0-9]{2})/) do |match|
136
137
  match[1, 2].hex.chr
137
138
  end
138
139
  tok.to_sym
@@ -154,7 +155,7 @@ class PDF::Reader
154
155
  ################################################################################
155
156
  # Reads a PDF hex string from the buffer and converts it to a Ruby String
156
157
  def hex_string
157
- str = ""
158
+ str = "".dup
158
159
 
159
160
  loop do
160
161
  token = @buffer.token
@@ -171,11 +172,11 @@ class PDF::Reader
171
172
  # Reads a PDF String from the buffer and converts it to a Ruby String
172
173
  def string
173
174
  str = @buffer.token
174
- return "".force_encoding("binary") if str == ")"
175
+ return "".dup.force_encoding("binary") if str == ")"
175
176
  Error.assert_equal(parse_token, ")")
176
177
 
177
178
  str.gsub!(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
178
- MAPPING[match] || ""
179
+ MAPPING[match] || "".dup
179
180
  end
180
181
  str.force_encoding("binary")
181
182
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  # A simple receiver that prints all operators and parameters in the content
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  # Copyright (C) 2010 James Healy (jimmy@deefa.com)
4
5
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  module PDF
4
5
  class Reader
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -1,4 +1,6 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
3
+
2
4
  require 'digest'
3
5
  require 'openssl'
4
6
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # encoding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  # utilities.rb : General-purpose utility classes which don't fit anywhere else
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  # A value object that represents one or more consecutive characters on a page.
@@ -37,6 +38,10 @@ class PDF::Reader
37
38
  @endx ||= x + width
38
39
  end
39
40
 
41
+ def endy
42
+ @endy ||= y + font_size
43
+ end
44
+
40
45
  def mean_character_width
41
46
  @width / character_count
42
47
  end
@@ -59,8 +64,28 @@ class PDF::Reader
59
64
  "#{text} w:#{width} f:#{font_size} @#{x},#{y}"
60
65
  end
61
66
 
67
+ def intersect?(other_run)
68
+ x <= other_run.endx && endx >= other_run.x &&
69
+ endy >= other_run.y && y <= other_run.endy
70
+ end
71
+
72
+ # return what percentage of this text run is overlapped by another run
73
+ def intersection_area_percent(other_run)
74
+ return 0 unless intersect?(other_run)
75
+
76
+ dx = [endx, other_run.endx].min - [x, other_run.x].max
77
+ dy = [endy, other_run.endy].min - [y, other_run.y].max
78
+ intersection_area = dx*dy
79
+
80
+ intersection_area.to_f / area
81
+ end
82
+
62
83
  private
63
84
 
85
+ def area
86
+ (endx - x) * (endy - y)
87
+ end
88
+
64
89
  def mergable_range
65
90
  @mergable_range ||= Range.new(endx - 3, endx + font_size)
66
91
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  # co-ordinate systems in PDF files are specified using a 3x3 matrix that looks
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
 
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  # PDF files may define fonts in a number of ways. Each approach means we must
4
5
  # calculate glyph widths differently, so this set of classes conform to an
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  require 'afm'
4
5
  require 'pdf/reader/synchronized_cache'
@@ -11,11 +12,20 @@ class PDF::Reader
11
12
  # see Section 9.6.2.2, PDF 32000-1:2008, pp 256
12
13
  class BuiltIn
13
14
 
15
+ BUILTINS = [
16
+ :Courier, :"Courier-Bold", :"Courier-BoldOblique", :"Courier-Oblique",
17
+ :Helvetica, :"Helvetica-Bold", :"Helvetica-BoldOblique", :"Helvetica-Oblique",
18
+ :Symbol,
19
+ :"Times-Roman", :"Times-Bold", :"Times-BoldItalic", :"Times-Italic",
20
+ :ZapfDingbats
21
+ ]
22
+
14
23
  def initialize(font)
15
24
  @font = font
16
25
  @@all_metrics ||= PDF::Reader::SynchronizedCache.new
17
26
 
18
- metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{font.basefont}.afm")
27
+ basefont = extract_basefont(font.basefont)
28
+ metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{basefont}.afm")
19
29
 
20
30
  if File.file?(metrics_path)
21
31
  @metrics = @@all_metrics[metrics_path] ||= AFM::Font.new(metrics_path)
@@ -53,6 +63,13 @@ class PDF::Reader
53
63
  @font.encoding.int_to_name(code_point).first.to_s[/\Acontrol..\Z/]
54
64
  end
55
65
 
66
+ def extract_basefont(font_name)
67
+ if BUILTINS.include?(font_name)
68
+ font_name
69
+ else
70
+ "Times-Roman"
71
+ end
72
+ end
56
73
  end
57
74
  end
58
75
  end
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  module WidthCalculator
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  module WidthCalculator
@@ -17,8 +18,7 @@ class PDF::Reader
17
18
 
18
19
  def glyph_width(code_point)
19
20
  return 0 if code_point.nil? || code_point < 0
20
-
21
- glyph_width_from_font(code_point) || glyph_width_from_descriptor(code_point)
21
+ glyph_width_from_font(code_point) || glyph_width_from_descriptor(code_point) || 0
22
22
  end
23
23
 
24
24
  private
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  module WidthCalculator
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  class PDF::Reader
4
5
  module WidthCalculator
@@ -1,4 +1,5 @@
1
1
  # coding: utf-8
2
+ # frozen_string_literal: true
2
3
 
3
4
  ################################################################################
4
5
  #
@@ -203,8 +204,10 @@ class PDF::Reader
203
204
  ("\x00" + bytes).unpack("N")[0]
204
205
  elsif bytes.size == 4
205
206
  bytes.unpack("N")[0]
207
+ elsif bytes.size == 8
208
+ bytes.unpack("Q>")[0]
206
209
  else
207
- raise UnsupportedFeatureError, "Unable to unpack xref stream entries with more than 4 bytes"
210
+ raise UnsupportedFeatureError, "Unable to unpack xref stream entries of #{bytes.size} bytes"
208
211
  end
209
212
  end
210
213
  ################################################################################
@@ -227,18 +230,21 @@ class PDF::Reader
227
230
  # should always be 0, but all sort of crazy junk is prefixed to PDF files
228
231
  # in the real world.
229
232
  #
230
- # Checks up to 50 chars into the file, returns nil if no PDF data detected.
233
+ # Checks up to 1024 chars into the file,
234
+ # returns nil if no PDF data detected.
235
+ # Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
236
+ # header appear somewhere within the first 1024 bytes of the file
231
237
  #
232
238
  def calc_junk_offset(io)
233
239
  io.rewind
234
240
  offset = io.pos
235
- until (c = io.readchar) == '%' || c == 37 || offset > 50
241
+ until (c = io.readchar) == '%' || c == 37 || offset > 1024
236
242
  offset += 1
237
243
  end
238
244
  io.rewind
239
- offset < 50 ? offset : nil
245
+ offset < 1024 ? offset : nil
240
246
  rescue EOFError
241
- return nil
247
+ nil
242
248
  end
243
249
  end
244
250
  ################################################################################
metadata CHANGED
@@ -1,29 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.0
4
+ version: 2.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Healy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2018-02-15 00:00:00.000000000 Z
11
+ date: 2020-09-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ">="
17
+ - - "<"
18
18
  - !ruby/object:Gem::Version
19
- version: '0'
19
+ version: '13.0'
20
20
  type: :development
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ">="
24
+ - - "<"
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: '13.0'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rspec
29
29
  requirement: !ruby/object:Gem::Requirement
@@ -67,7 +67,7 @@ dependencies:
67
67
  - !ruby/object:Gem::Version
68
68
  version: '0.2'
69
69
  - !ruby/object:Gem::Dependency
70
- name: ir_b
70
+ name: pry
71
71
  requirement: !ruby/object:Gem::Requirement
72
72
  requirements:
73
73
  - - ">="
@@ -167,7 +167,7 @@ dependencies:
167
167
  description: The PDF::Reader library implements a PDF parser conforming as much as
168
168
  possible to the PDF specification from Adobe
169
169
  email:
170
- - jimmy@deefa.com
170
+ - james@yob.id.au
171
171
  executables:
172
172
  - pdf_object
173
173
  - pdf_text
@@ -199,7 +199,6 @@ files:
199
199
  - examples/text.rb
200
200
  - examples/version.rb
201
201
  - lib/pdf-reader.rb
202
- - lib/pdf/hash.rb
203
202
  - lib/pdf/reader.rb
204
203
  - lib/pdf/reader/afm/Courier-Bold.afm
205
204
  - lib/pdf/reader/afm/Courier-BoldOblique.afm
@@ -209,6 +208,7 @@ files:
209
208
  - lib/pdf/reader/afm/Helvetica-BoldOblique.afm
210
209
  - lib/pdf/reader/afm/Helvetica-Oblique.afm
211
210
  - lib/pdf/reader/afm/Helvetica.afm
211
+ - lib/pdf/reader/afm/MustRead.html
212
212
  - lib/pdf/reader/afm/Symbol.afm
213
213
  - lib/pdf/reader/afm/Times-Bold.afm
214
214
  - lib/pdf/reader/afm/Times-BoldItalic.afm
@@ -246,6 +246,7 @@ files:
246
246
  - lib/pdf/reader/object_hash.rb
247
247
  - lib/pdf/reader/object_stream.rb
248
248
  - lib/pdf/reader/orientation_detector.rb
249
+ - lib/pdf/reader/overlapping_runs_filter.rb
249
250
  - lib/pdf/reader/page.rb
250
251
  - lib/pdf/reader/page_layout.rb
251
252
  - lib/pdf/reader/page_state.rb
@@ -271,10 +272,14 @@ files:
271
272
  - lib/pdf/reader/width_calculator/type_one_or_three.rb
272
273
  - lib/pdf/reader/width_calculator/type_zero.rb
273
274
  - lib/pdf/reader/xref.rb
274
- homepage: http://github.com/yob/pdf-reader
275
+ homepage: https://github.com/yob/pdf-reader
275
276
  licenses:
276
277
  - MIT
277
- metadata: {}
278
+ metadata:
279
+ bug_tracker_uri: https://github.com/yob/pdf-reader/issues
280
+ changelog_uri: https://github.com/yob/pdf-reader/blob/v2.4.1/CHANGELOG
281
+ documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.4.1
282
+ source_code_uri: https://github.com/yob/pdf-reader/tree/v2.4.1
278
283
  post_install_message:
279
284
  rdoc_options:
280
285
  - "--title"
@@ -295,8 +300,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
295
300
  - !ruby/object:Gem::Version
296
301
  version: '0'
297
302
  requirements: []
298
- rubyforge_project:
299
- rubygems_version: 2.7.3
303
+ rubygems_version: 3.0.3
300
304
  signing_key:
301
305
  specification_version: 4
302
306
  summary: A library for accessing the content of PDF files