pdf-reader 2.7.0 → 2.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5ee0d8c3c55f6a0aebb60a0a6dce92428e8371b96a6beb6d75bfe90602bffae7
4
- data.tar.gz: '0911d108353bf577aa9fd7b49b97dda1cf9d54816bf8ff6c4225281eeda63229'
3
+ metadata.gz: 6182ffd59631afba6a2c234547a428382b1ec2d7b414d89830b1143f1a0e1704
4
+ data.tar.gz: 6c0e6a7d32cf24912edc3aa96d72b7f70497d2fdd0e0913b86f871bbf9fa104f
5
5
  SHA512:
6
- metadata.gz: 917db2b1fb977b41e7b057ff3d215b8f249577254d9fe3df72f330b32ff49630874c58f480495ddcd137d9f31d014083438623cdf7260b0d7a87bbe3a5f3685a
7
- data.tar.gz: cd9832f025264e54d586e81eff69727379e8646d741f53ae61e90a5b38945d852147853891d468bab683581bdd0beb68a9b7c7f5e54e064e9a3935262ea9d651
6
+ metadata.gz: 42dafbe0c36ce838da4c3120bf2187efde647e486971896d9a9c59c37dac3da0f2ccf3ecd98d8dd1d3acc5404bfcf26e64a327d7797648646afd6b40be02fec2
7
+ data.tar.gz: 40f0b0958024b558d6aca7eb2b3b6f042f034059c8fca52ce97fab7d55a39c313797605341331c65efd1099a1310ccbe386c354024dbd3cbc61c1d96c423842d
data/CHANGELOG CHANGED
@@ -1,3 +1,11 @@
1
+ v2.8.0 (28th December 2021)
2
+ - Add PDF::Reader::Page#runs for extracting text from a page with positioning metadata (http://github.com/yob/pdf-reader/pull/411)
3
+ - Add options to PDF::Reader::Page#text to make some behaviour configurable (http://github.com/yob/pdf-reader/pull/411)
4
+ - including extracting the text for only part of the page
5
+ - Improve text positioning and extraction for Type3 fonts (http://github.com/yob/pdf-reader/pull/412)
6
+ - Skip extracting text that is positioned outside the page (http://github.com/yob/pdf-reader/pull/413)
7
+ - Fix occasional crash when reading some streams (http://github.com/yob/pdf-reader/pull/405)
8
+
1
9
  v2.7.0 (13th December 2021)
2
10
  - Include RBI type files in the gem
3
11
  - Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
14
14
  Cane::RakeTask.new(:quality) do |cane|
15
15
  cane.abc_max = 20
16
16
  cane.style_measure = 100
17
- cane.max_violations = 32
17
+ cane.max_violations = 28
18
18
 
19
19
  cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
20
20
  end
@@ -0,0 +1,16 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ class PDF::Reader
6
+
7
+ # Filter out text/characters that are positioned outside a rectangle. Usually the page
8
+ # MediaBox or CropBox, but could be a user specified rectangle too
9
+ class BoundingRectangleRunsFilter
10
+
11
+ def self.runs_within_rect(runs, rect)
12
+ runs.select { |run| rect.contains?(run.origin) }
13
+ end
14
+ end
15
+ end
16
+
@@ -43,6 +43,7 @@ class PDF::Reader
43
43
  @tounicode = nil
44
44
 
45
45
  extract_base_info(obj)
46
+ extract_type3_info(obj)
46
47
  extract_descriptor(obj)
47
48
  extract_descendants(obj)
48
49
  @width_calc = build_width_calculator
@@ -73,8 +74,44 @@ class PDF::Reader
73
74
  @cached_widths[code_point] ||= @width_calc.glyph_width(code_point)
74
75
  end
75
76
 
77
+ # In most cases glyph width is converted into text space with a simple divide by 1000.
78
+ #
79
+ # However, Type3 fonts provide their own FontMatrix that's used for the transformation.
80
+ #
81
+ def glyph_width_in_text_space(code_point)
82
+ glyph_width_in_glyph_space = glyph_width(code_point)
83
+
84
+ if @subtype == :Type3
85
+ x1, y1 = font_matrix_transform(0,0)
86
+ x2, y2 = font_matrix_transform(glyph_width_in_glyph_space, 0)
87
+ (x2 - x1).abs.round(2)
88
+ else
89
+ glyph_width_in_glyph_space / 1000.0
90
+ end
91
+ end
92
+
76
93
  private
77
94
 
95
+ # Only valid for Type3 fonts
96
+ def font_matrix_transform(x, y)
97
+ return x, y if @font_matrix.nil?
98
+
99
+ matrix = TransformationMatrix.new(
100
+ @font_matrix[0], @font_matrix[1],
101
+ @font_matrix[2], @font_matrix[3],
102
+ @font_matrix[4], @font_matrix[5],
103
+ )
104
+
105
+ if x == 0 && y == 0
106
+ [matrix.e, matrix.f]
107
+ else
108
+ [
109
+ (matrix.a * x) + (matrix.c * y) + (matrix.e),
110
+ (matrix.b * x) + (matrix.d * y) + (matrix.f)
111
+ ]
112
+ end
113
+ end
114
+
78
115
  def default_encoding(font_name)
79
116
  case font_name.to_s
80
117
  when "Symbol" then
@@ -138,6 +175,12 @@ class PDF::Reader
138
175
  end
139
176
  end
140
177
 
178
+ def extract_type3_info(obj)
179
+ if @subtype == :Type3
180
+ @font_matrix = @ohash.object(obj[:FontMatrix]) || [ 0.001, 0, 0, 0.001, 0, 0 ]
181
+ end
182
+ end
183
+
141
184
  def extract_descriptor(obj)
142
185
  if obj[:FontDescriptor]
143
186
  # create a font descriptor object if we can, in other words, unless this is
@@ -101,13 +101,24 @@ module PDF
101
101
  # returns the plain text content of this page encoded as UTF-8. Any
102
102
  # characters that can't be translated will be returned as a ▯
103
103
  #
104
- def text
104
+ def text(opts = {})
105
105
  receiver = PageTextReceiver.new
106
106
  walk(receiver)
107
- receiver.content
107
+ runs = receiver.runs(opts)
108
+
109
+ # rectangles[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
110
+ mediabox = rectangles[:MediaBox] || Rectangle.new(0, 0, 0, 0)
111
+
112
+ PageLayout.new(runs, mediabox).to_s
108
113
  end
109
114
  alias :to_s :text
110
115
 
116
+ def runs(opts = {})
117
+ receiver = PageTextReceiver.new
118
+ walk(receiver)
119
+ receiver.runs(opts)
120
+ end
121
+
111
122
  # processes the raw content stream for this page in sequential order and
112
123
  # passes callbacks to the receiver objects.
113
124
  #
@@ -21,10 +21,8 @@ class PDF::Reader
21
21
  # PDF::Reader::Rectangle at some point
22
22
  PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
23
23
 
24
- runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
25
- runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
26
- @mediabox = mediabox
27
- @runs = merge_runs(runs)
24
+ @mediabox = process_mediabox(mediabox)
25
+ @runs = runs
28
26
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
29
27
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
30
28
  @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
@@ -51,13 +49,11 @@ class PDF::Reader
51
49
  private
52
50
 
53
51
  def page_width
54
- # TODO once @mediabox is a Rectangle, this can be just `@mediabox.width`
55
- (@mediabox[2].to_f - @mediabox[0].to_f).abs
52
+ @mediabox.width
56
53
  end
57
54
 
58
55
  def page_height
59
- # TODO once @mediabox is a Rectangle, this can be just `@mediabox.height`
60
- (@mediabox[3].to_f - @mediabox[1].to_f).abs
56
+ @mediabox.height
61
57
  end
62
58
 
63
59
  # given an array of strings, return a new array with empty rows from the
@@ -109,30 +105,20 @@ class PDF::Reader
109
105
  end
110
106
  end
111
107
 
112
- # take a collection of TextRun objects and merge any that are in close
113
- # proximity
114
- def merge_runs(runs)
115
- runs.group_by { |char|
116
- char.y.to_i
117
- }.map { |y, chars|
118
- group_chars_into_runs(chars.sort)
119
- }.flatten.sort
108
+ def local_string_insert(haystack, needle, index)
109
+ haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
120
110
  end
121
111
 
122
- def group_chars_into_runs(chars)
123
- chars.each_with_object([]) do |char, runs|
124
- if runs.empty?
125
- runs << char
126
- elsif runs.last.mergable?(char)
127
- runs[-1] = runs.last + char
128
- else
129
- runs << char
130
- end
112
+ def process_mediabox(mediabox)
113
+ if mediabox.is_a?(Array)
114
+ msg = "Passing the mediabox to PageLayout as an Array is deprecated," +
115
+ " please use a Rectangle instead"
116
+ $stderr.puts msg
117
+ PDF::Reader::Rectangle.from_array(mediabox)
118
+ else
119
+ mediabox
131
120
  end
132
121
  end
133
122
 
134
- def local_string_insert(haystack, needle, index)
135
- haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
136
- end
137
123
  end
138
124
  end
@@ -47,9 +47,32 @@ module PDF
47
47
  @characters = []
48
48
  end
49
49
 
50
+ def runs(opts = {})
51
+ runs = @characters
52
+
53
+ if rect = opts.fetch(:rect, @page.rectangles[:CropBox])
54
+ runs = BoundingRectangleRunsFilter.runs_within_rect(runs, rect)
55
+ end
56
+
57
+ if opts.fetch(:skip_zero_width, true)
58
+ runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
59
+ end
60
+
61
+ if opts.fetch(:skip_overlapping, true)
62
+ runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
63
+ end
64
+
65
+ if opts.fetch(:merge, true)
66
+ runs = merge_runs(runs)
67
+ end
68
+
69
+ runs
70
+ end
71
+
72
+ # deprecated
50
73
  def content
51
- mediabox = @page.rectangles[:MediaBox].to_a
52
- PageLayout.new(@characters, mediabox).to_s
74
+ mediabox = @page.rectangles[:MediaBox]
75
+ PageLayout.new(runs, mediabox).to_s
53
76
  end
54
77
 
55
78
  #####################################################
@@ -109,7 +132,7 @@ module PDF
109
132
 
110
133
  # apply the glyph displacement for the current glyph so the next
111
134
  # glyph will appear in the correct position
112
- glyph_width = @state.current_font.glyph_width(glyph_code) / 1000.0
135
+ glyph_width = @state.current_font.glyph_width_in_text_space(glyph_code)
113
136
  th = 1
114
137
  scaled_glyph_width = glyph_width * @state.font_size * th
115
138
  unless utf8_chars == SPACE
@@ -119,12 +142,6 @@ module PDF
119
142
  end
120
143
  end
121
144
 
122
- # TODO: revist this. It rotates the co-ordinates to the right direction, but I don't
123
- # think it sets the correct x,y values. We get away with it because we don't
124
- # return the text with co-ordinates, only the full text arranged in a string.
125
- #
126
- # We should provide an API for extracting the text with positioning data and spec
127
- # that. I suspect the co-ords might be wrong for rotated pages
128
145
  def apply_rotation(x, y)
129
146
  if @page.rotate == 90
130
147
  tmp = x
@@ -141,6 +158,28 @@ module PDF
141
158
  return x, y
142
159
  end
143
160
 
161
+ # take a collection of TextRun objects and merge any that are in close
162
+ # proximity
163
+ def merge_runs(runs)
164
+ runs.group_by { |char|
165
+ char.y.to_i
166
+ }.map { |y, chars|
167
+ group_chars_into_runs(chars.sort)
168
+ }.flatten.sort
169
+ end
170
+
171
+ def group_chars_into_runs(chars)
172
+ chars.each_with_object([]) do |char, runs|
173
+ if runs.empty?
174
+ runs << char
175
+ elsif runs.last.mergable?(char)
176
+ runs[-1] = runs.last + char
177
+ else
178
+ runs << char
179
+ end
180
+ end
181
+ end
182
+
144
183
  end
145
184
  end
146
185
  end
@@ -210,6 +210,9 @@ class PDF::Reader
210
210
  raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
211
211
  if @objects
212
212
  length = @objects.deref(dict[:Length])
213
+ if dict[:Filter]
214
+ dict[:Filter] = @objects.deref(dict[:Filter])
215
+ end
213
216
  else
214
217
  length = dict[:Length] || 0
215
218
  end
@@ -26,6 +26,19 @@ module PDF
26
26
  set_corners(x1, y1, x2, y2)
27
27
  end
28
28
 
29
+ def self.from_array(arr)
30
+ if arr.size != 4
31
+ raise ArgumentError, "Only 4-element Arrays can be converted to a Rectangle"
32
+ end
33
+
34
+ PDF::Reader::Rectangle.new(
35
+ arr[0].to_f,
36
+ arr[1].to_f,
37
+ arr[2].to_f,
38
+ arr[3].to_f,
39
+ )
40
+ end
41
+
29
42
  def ==(other)
30
43
  to_a == other.to_a
31
44
  end
@@ -38,6 +51,11 @@ module PDF
38
51
  bottom_right.x - bottom_left.x
39
52
  end
40
53
 
54
+ def contains?(point)
55
+ point.x >= bottom_left.x && point.x <= top_right.x &&
56
+ point.y >= bottom_left.y && point.y <= top_right.y
57
+ end
58
+
41
59
  # A pdf-style 4-number array
42
60
  def to_a
43
61
  [
@@ -7,15 +7,14 @@ class PDF::Reader
7
7
  class TextRun
8
8
  include Comparable
9
9
 
10
- attr_reader :x, :y, :width, :font_size, :text
10
+ attr_reader :origin, :width, :font_size, :text
11
11
 
12
12
  alias :to_s :text
13
13
 
14
14
  def initialize(x, y, width, font_size, text)
15
- @x = x
16
- @y = y
15
+ @origin = PDF::Reader::Point.new(x, y)
17
16
  @width = width
18
- @font_size = font_size.floor
17
+ @font_size = font_size
19
18
  @text = text
20
19
  end
21
20
 
@@ -35,12 +34,20 @@ class PDF::Reader
35
34
  end
36
35
  end
37
36
 
37
+ def x
38
+ @origin.x
39
+ end
40
+
41
+ def y
42
+ @origin.y
43
+ end
44
+
38
45
  def endx
39
- @endx ||= x + width
46
+ @endx ||= @origin.x + width
40
47
  end
41
48
 
42
49
  def endy
43
- @endy ||= y + font_size
50
+ @endy ||= @origin.y + font_size
44
51
  end
45
52
 
46
53
  def mean_character_width
data/lib/pdf/reader.rb CHANGED
@@ -112,17 +112,25 @@ module PDF
112
112
  #
113
113
  # reader = PDF::Reader.new("somefile.pdf", :password => "apples")
114
114
  #
115
+ # Using this method directly is supported, but it's more common to use
116
+ # `PDF::Reader.open`
117
+ #
115
118
  def initialize(input, opts = {})
116
119
  @cache = PDF::Reader::ObjectCache.new
117
120
  opts.merge!(:cache => @cache)
118
121
  @objects = PDF::Reader::ObjectHash.new(input, opts)
119
122
  end
120
123
 
124
+ # Return a Hash with some basic information about the PDF file
125
+ #
121
126
  def info
122
127
  dict = @objects.deref(@objects.trailer[:Info])
123
128
  doc_strings_to_utf8(dict)
124
129
  end
125
130
 
131
+ # Return a Hash with extra metadata provided by the author of the PDF file. Not
132
+ # always present.
133
+ #
126
134
  def metadata
127
135
  stream = @objects.deref(root[:Metadata])
128
136
  if stream.nil?
@@ -134,6 +142,8 @@ module PDF
134
142
  end
135
143
  end
136
144
 
145
+ # The number of pages in this PDF
146
+ #
137
147
  def page_count
138
148
  pages = @objects.deref(root[:Pages])
139
149
  unless pages.kind_of?(::Hash)
@@ -142,12 +152,14 @@ module PDF
142
152
  @page_count ||= @objects.deref(pages[:Count])
143
153
  end
144
154
 
155
+ # The PDF version this file uses
156
+ #
145
157
  def pdf_version
146
158
  @objects.pdf_version
147
159
  end
148
160
 
149
- # syntactic sugar for opening a PDF file. Accepts the same arguments
150
- # as new().
161
+ # syntactic sugar for opening a PDF file and the most common approach. Accepts the
162
+ # same arguments as new().
151
163
  #
152
164
  # PDF::Reader.open("somefile.pdf") do |reader|
153
165
  # puts reader.pdf_version
@@ -273,6 +285,7 @@ end
273
285
 
274
286
  require 'pdf/reader/resource_methods'
275
287
  require 'pdf/reader/buffer'
288
+ require 'pdf/reader/bounding_rectangle_runs_filter'
276
289
  require 'pdf/reader/cid_widths'
277
290
  require 'pdf/reader/cmap'
278
291
  require 'pdf/reader/encoding'
data/rbi/pdf-reader.rbi CHANGED
@@ -43,6 +43,13 @@ module PDF
43
43
  sig { returns(T::Hash[Symbol, T.untyped]) }
44
44
  def root; end
45
45
 
46
+ class BoundingRectangleRunsFilter
47
+ extend T::Sig
48
+
49
+ sig { params(runs: T::Array[PDF::Reader::TextRun], rect: PDF::Reader::Rectangle).returns(T::Array[PDF::Reader::TextRun]) }
50
+ def self.runs_within_rect(runs, rect); end
51
+ end
52
+
46
53
  class Buffer
47
54
  TOKEN_WHITESPACE = [0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20]
48
55
  TOKEN_DELIMITER = [0x25, 0x3C, 0x3E, 0x28, 0x5B, 0x7B, 0x29, 0x5D, 0x7D, 0x2F]
@@ -750,8 +757,11 @@ module PDF
750
757
  sig { returns(T::Array[Numeric]) }
751
758
  def origin; end
752
759
 
753
- sig { returns(String) }
754
- def text; end
760
+ sig { params(opts: T::Hash[Symbol, T.untyped]).returns(T::Array[PDF::Reader::TextRun]) }
761
+ def runs(opts = {}); end
762
+
763
+ sig { params(opts: T::Hash[Symbol, T.untyped]).returns(String) }
764
+ def text(opts = {}); end
755
765
 
756
766
  sig { params(receivers: T.untyped).void }
757
767
  def walk(*receivers); end
@@ -794,7 +804,7 @@ module PDF
794
804
  extend T::Sig
795
805
  DEFAULT_FONT_SIZE = 12
796
806
 
797
- sig { params(runs: T::Array[PDF::Reader::TextRun], mediabox: T::Array[Numeric]).void }
807
+ sig { params(runs: T::Array[PDF::Reader::TextRun], mediabox: T.any(T::Array[Numeric], PDF::Reader::Rectangle)).void }
798
808
  def initialize(runs, mediabox); end
799
809
 
800
810
  sig { returns(String) }
@@ -829,6 +839,9 @@ module PDF
829
839
 
830
840
  sig { params(haystack: T.untyped, needle: T.untyped, index: T.untyped).returns(T.untyped) }
831
841
  def local_string_insert(haystack, needle, index); end
842
+
843
+ sig { params(mediabox: T.untyped).returns(T.untyped) }
844
+ def process_mediabox(mediabox); end
832
845
  end
833
846
 
834
847
  class PageState
@@ -996,6 +1009,9 @@ module PDF
996
1009
  sig { params(str: T.untyped).returns(T.untyped) }
997
1010
  def move_to_next_line_and_show_text(str); end
998
1011
 
1012
+ sig { params(opts: T::Hash[Symbol, T.untyped]).returns(T::Array[PDF::Reader::TextRun]) }
1013
+ def runs(opts = {}); end
1014
+
999
1015
  sig { params(aw: T.untyped, ac: T.untyped, string: T.untyped).returns(T.untyped) }
1000
1016
  def set_spacing_next_line_show_text(aw, ac, string); end
1001
1017
 
@@ -1122,6 +1138,9 @@ module PDF
1122
1138
  end
1123
1139
 
1124
1140
  class Rectangle
1141
+ sig { params(arr: T::Array[Numeric]).returns(PDF::Reader::Rectangle) }
1142
+ def self.from_array(arr); end
1143
+
1125
1144
  sig do
1126
1145
  params(
1127
1146
  x1: Numeric,
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.7.0
4
+ version: 2.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Healy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-12-13 00:00:00.000000000 Z
11
+ date: 2021-12-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -215,6 +215,7 @@ files:
215
215
  - lib/pdf/reader/afm/Times-Italic.afm
216
216
  - lib/pdf/reader/afm/Times-Roman.afm
217
217
  - lib/pdf/reader/afm/ZapfDingbats.afm
218
+ - lib/pdf/reader/bounding_rectangle_runs_filter.rb
218
219
  - lib/pdf/reader/buffer.rb
219
220
  - lib/pdf/reader/cid_widths.rb
220
221
  - lib/pdf/reader/cmap.rb
@@ -281,9 +282,9 @@ licenses:
281
282
  - MIT
282
283
  metadata:
283
284
  bug_tracker_uri: https://github.com/yob/pdf-reader/issues
284
- changelog_uri: https://github.com/yob/pdf-reader/blob/v2.7.0/CHANGELOG
285
- documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.7.0
286
- source_code_uri: https://github.com/yob/pdf-reader/tree/v2.7.0
285
+ changelog_uri: https://github.com/yob/pdf-reader/blob/v2.8.0/CHANGELOG
286
+ documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.8.0
287
+ source_code_uri: https://github.com/yob/pdf-reader/tree/v2.8.0
287
288
  post_install_message:
288
289
  rdoc_options:
289
290
  - "--title"