pdf-reader 2.7.0 → 2.8.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 5ee0d8c3c55f6a0aebb60a0a6dce92428e8371b96a6beb6d75bfe90602bffae7
4
- data.tar.gz: '0911d108353bf577aa9fd7b49b97dda1cf9d54816bf8ff6c4225281eeda63229'
3
+ metadata.gz: 6182ffd59631afba6a2c234547a428382b1ec2d7b414d89830b1143f1a0e1704
4
+ data.tar.gz: 6c0e6a7d32cf24912edc3aa96d72b7f70497d2fdd0e0913b86f871bbf9fa104f
5
5
  SHA512:
6
- metadata.gz: 917db2b1fb977b41e7b057ff3d215b8f249577254d9fe3df72f330b32ff49630874c58f480495ddcd137d9f31d014083438623cdf7260b0d7a87bbe3a5f3685a
7
- data.tar.gz: cd9832f025264e54d586e81eff69727379e8646d741f53ae61e90a5b38945d852147853891d468bab683581bdd0beb68a9b7c7f5e54e064e9a3935262ea9d651
6
+ metadata.gz: 42dafbe0c36ce838da4c3120bf2187efde647e486971896d9a9c59c37dac3da0f2ccf3ecd98d8dd1d3acc5404bfcf26e64a327d7797648646afd6b40be02fec2
7
+ data.tar.gz: 40f0b0958024b558d6aca7eb2b3b6f042f034059c8fca52ce97fab7d55a39c313797605341331c65efd1099a1310ccbe386c354024dbd3cbc61c1d96c423842d
data/CHANGELOG CHANGED
@@ -1,3 +1,11 @@
1
+ v2.8.0 (28th December 2021)
2
+ - Add PDF::Reader::Page#runs for extracting text from a page with positioning metadata (http://github.com/yob/pdf-reader/pull/411)
3
+ - Add options to PDF::Reader::Page#text to make some behaviour configurable (http://github.com/yob/pdf-reader/pull/411)
4
+ - including extracting the text for only part of the page
5
+ - Improve text positioning and extraction for Type3 fonts (http://github.com/yob/pdf-reader/pull/412)
6
+ - Skip extracting text that is positioned outside the page (http://github.com/yob/pdf-reader/pull/413)
7
+ - Fix occasional crash when reading some streams (http://github.com/yob/pdf-reader/pull/405)
8
+
1
9
  v2.7.0 (13th December 2021)
2
10
  - Include RBI type files in the gem
3
11
  - Downstream users of pdf-reader who also use sorbet *should* find many parts of the API will
data/Rakefile CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
14
14
  Cane::RakeTask.new(:quality) do |cane|
15
15
  cane.abc_max = 20
16
16
  cane.style_measure = 100
17
- cane.max_violations = 32
17
+ cane.max_violations = 28
18
18
 
19
19
  cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
20
20
  end
@@ -0,0 +1,16 @@
1
+ # coding: utf-8
2
+ # typed: strict
3
+ # frozen_string_literal: true
4
+
5
+ class PDF::Reader
6
+
7
+ # Filter out text/characters that are positioned outside a rectangle. Usually the page
8
+ # MediaBox or CropBox, but could be a user specified rectangle too
9
+ class BoundingRectangleRunsFilter
10
+
11
+ def self.runs_within_rect(runs, rect)
12
+ runs.select { |run| rect.contains?(run.origin) }
13
+ end
14
+ end
15
+ end
16
+
@@ -43,6 +43,7 @@ class PDF::Reader
43
43
  @tounicode = nil
44
44
 
45
45
  extract_base_info(obj)
46
+ extract_type3_info(obj)
46
47
  extract_descriptor(obj)
47
48
  extract_descendants(obj)
48
49
  @width_calc = build_width_calculator
@@ -73,8 +74,44 @@ class PDF::Reader
73
74
  @cached_widths[code_point] ||= @width_calc.glyph_width(code_point)
74
75
  end
75
76
 
77
+ # In most cases glyph width is converted into text space with a simple divide by 1000.
78
+ #
79
+ # However, Type3 fonts provide their own FontMatrix that's used for the transformation.
80
+ #
81
+ def glyph_width_in_text_space(code_point)
82
+ glyph_width_in_glyph_space = glyph_width(code_point)
83
+
84
+ if @subtype == :Type3
85
+ x1, y1 = font_matrix_transform(0,0)
86
+ x2, y2 = font_matrix_transform(glyph_width_in_glyph_space, 0)
87
+ (x2 - x1).abs.round(2)
88
+ else
89
+ glyph_width_in_glyph_space / 1000.0
90
+ end
91
+ end
92
+
76
93
  private
77
94
 
95
+ # Only valid for Type3 fonts
96
+ def font_matrix_transform(x, y)
97
+ return x, y if @font_matrix.nil?
98
+
99
+ matrix = TransformationMatrix.new(
100
+ @font_matrix[0], @font_matrix[1],
101
+ @font_matrix[2], @font_matrix[3],
102
+ @font_matrix[4], @font_matrix[5],
103
+ )
104
+
105
+ if x == 0 && y == 0
106
+ [matrix.e, matrix.f]
107
+ else
108
+ [
109
+ (matrix.a * x) + (matrix.c * y) + (matrix.e),
110
+ (matrix.b * x) + (matrix.d * y) + (matrix.f)
111
+ ]
112
+ end
113
+ end
114
+
78
115
  def default_encoding(font_name)
79
116
  case font_name.to_s
80
117
  when "Symbol" then
@@ -138,6 +175,12 @@ class PDF::Reader
138
175
  end
139
176
  end
140
177
 
178
+ def extract_type3_info(obj)
179
+ if @subtype == :Type3
180
+ @font_matrix = @ohash.object(obj[:FontMatrix]) || [ 0.001, 0, 0, 0.001, 0, 0 ]
181
+ end
182
+ end
183
+
141
184
  def extract_descriptor(obj)
142
185
  if obj[:FontDescriptor]
143
186
  # create a font descriptor object if we can, in other words, unless this is
@@ -101,13 +101,24 @@ module PDF
101
101
  # returns the plain text content of this page encoded as UTF-8. Any
102
102
  # characters that can't be translated will be returned as a ▯
103
103
  #
104
- def text
104
+ def text(opts = {})
105
105
  receiver = PageTextReceiver.new
106
106
  walk(receiver)
107
- receiver.content
107
+ runs = receiver.runs(opts)
108
+
109
+ # rectangles[:MediaBox] can never be nil, but I have no easy way to tell sorbet that atm
110
+ mediabox = rectangles[:MediaBox] || Rectangle.new(0, 0, 0, 0)
111
+
112
+ PageLayout.new(runs, mediabox).to_s
108
113
  end
109
114
  alias :to_s :text
110
115
 
116
+ def runs(opts = {})
117
+ receiver = PageTextReceiver.new
118
+ walk(receiver)
119
+ receiver.runs(opts)
120
+ end
121
+
111
122
  # processes the raw content stream for this page in sequential order and
112
123
  # passes callbacks to the receiver objects.
113
124
  #
@@ -21,10 +21,8 @@ class PDF::Reader
21
21
  # PDF::Reader::Rectangle at some point
22
22
  PDF::Reader::Error.validate_not_nil(mediabox, "mediabox")
23
23
 
24
- runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
25
- runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
26
- @mediabox = mediabox
27
- @runs = merge_runs(runs)
24
+ @mediabox = process_mediabox(mediabox)
25
+ @runs = runs
28
26
  @mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
29
27
  @mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
30
28
  @median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
@@ -51,13 +49,11 @@ class PDF::Reader
51
49
  private
52
50
 
53
51
  def page_width
54
- # TODO once @mediabox is a Rectangle, this can be just `@mediabox.width`
55
- (@mediabox[2].to_f - @mediabox[0].to_f).abs
52
+ @mediabox.width
56
53
  end
57
54
 
58
55
  def page_height
59
- # TODO once @mediabox is a Rectangle, this can be just `@mediabox.height`
60
- (@mediabox[3].to_f - @mediabox[1].to_f).abs
56
+ @mediabox.height
61
57
  end
62
58
 
63
59
  # given an array of strings, return a new array with empty rows from the
@@ -109,30 +105,20 @@ class PDF::Reader
109
105
  end
110
106
  end
111
107
 
112
- # take a collection of TextRun objects and merge any that are in close
113
- # proximity
114
- def merge_runs(runs)
115
- runs.group_by { |char|
116
- char.y.to_i
117
- }.map { |y, chars|
118
- group_chars_into_runs(chars.sort)
119
- }.flatten.sort
108
+ def local_string_insert(haystack, needle, index)
109
+ haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
120
110
  end
121
111
 
122
- def group_chars_into_runs(chars)
123
- chars.each_with_object([]) do |char, runs|
124
- if runs.empty?
125
- runs << char
126
- elsif runs.last.mergable?(char)
127
- runs[-1] = runs.last + char
128
- else
129
- runs << char
130
- end
112
+ def process_mediabox(mediabox)
113
+ if mediabox.is_a?(Array)
114
+ msg = "Passing the mediabox to PageLayout as an Array is deprecated," +
115
+ " please use a Rectangle instead"
116
+ $stderr.puts msg
117
+ PDF::Reader::Rectangle.from_array(mediabox)
118
+ else
119
+ mediabox
131
120
  end
132
121
  end
133
122
 
134
- def local_string_insert(haystack, needle, index)
135
- haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
136
- end
137
123
  end
138
124
  end
@@ -47,9 +47,32 @@ module PDF
47
47
  @characters = []
48
48
  end
49
49
 
50
+ def runs(opts = {})
51
+ runs = @characters
52
+
53
+ if rect = opts.fetch(:rect, @page.rectangles[:CropBox])
54
+ runs = BoundingRectangleRunsFilter.runs_within_rect(runs, rect)
55
+ end
56
+
57
+ if opts.fetch(:skip_zero_width, true)
58
+ runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
59
+ end
60
+
61
+ if opts.fetch(:skip_overlapping, true)
62
+ runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
63
+ end
64
+
65
+ if opts.fetch(:merge, true)
66
+ runs = merge_runs(runs)
67
+ end
68
+
69
+ runs
70
+ end
71
+
72
+ # deprecated
50
73
  def content
51
- mediabox = @page.rectangles[:MediaBox].to_a
52
- PageLayout.new(@characters, mediabox).to_s
74
+ mediabox = @page.rectangles[:MediaBox]
75
+ PageLayout.new(runs, mediabox).to_s
53
76
  end
54
77
 
55
78
  #####################################################
@@ -109,7 +132,7 @@ module PDF
109
132
 
110
133
  # apply the glyph displacement for the current glyph so the next
111
134
  # glyph will appear in the correct position
112
- glyph_width = @state.current_font.glyph_width(glyph_code) / 1000.0
135
+ glyph_width = @state.current_font.glyph_width_in_text_space(glyph_code)
113
136
  th = 1
114
137
  scaled_glyph_width = glyph_width * @state.font_size * th
115
138
  unless utf8_chars == SPACE
@@ -119,12 +142,6 @@ module PDF
119
142
  end
120
143
  end
121
144
 
122
- # TODO: revist this. It rotates the co-ordinates to the right direction, but I don't
123
- # think it sets the correct x,y values. We get away with it because we don't
124
- # return the text with co-ordinates, only the full text arranged in a string.
125
- #
126
- # We should provide an API for extracting the text with positioning data and spec
127
- # that. I suspect the co-ords might be wrong for rotated pages
128
145
  def apply_rotation(x, y)
129
146
  if @page.rotate == 90
130
147
  tmp = x
@@ -141,6 +158,28 @@ module PDF
141
158
  return x, y
142
159
  end
143
160
 
161
+ # take a collection of TextRun objects and merge any that are in close
162
+ # proximity
163
+ def merge_runs(runs)
164
+ runs.group_by { |char|
165
+ char.y.to_i
166
+ }.map { |y, chars|
167
+ group_chars_into_runs(chars.sort)
168
+ }.flatten.sort
169
+ end
170
+
171
+ def group_chars_into_runs(chars)
172
+ chars.each_with_object([]) do |char, runs|
173
+ if runs.empty?
174
+ runs << char
175
+ elsif runs.last.mergable?(char)
176
+ runs[-1] = runs.last + char
177
+ else
178
+ runs << char
179
+ end
180
+ end
181
+ end
182
+
144
183
  end
145
184
  end
146
185
  end
@@ -210,6 +210,9 @@ class PDF::Reader
210
210
  raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
211
211
  if @objects
212
212
  length = @objects.deref(dict[:Length])
213
+ if dict[:Filter]
214
+ dict[:Filter] = @objects.deref(dict[:Filter])
215
+ end
213
216
  else
214
217
  length = dict[:Length] || 0
215
218
  end
@@ -26,6 +26,19 @@ module PDF
26
26
  set_corners(x1, y1, x2, y2)
27
27
  end
28
28
 
29
+ def self.from_array(arr)
30
+ if arr.size != 4
31
+ raise ArgumentError, "Only 4-element Arrays can be converted to a Rectangle"
32
+ end
33
+
34
+ PDF::Reader::Rectangle.new(
35
+ arr[0].to_f,
36
+ arr[1].to_f,
37
+ arr[2].to_f,
38
+ arr[3].to_f,
39
+ )
40
+ end
41
+
29
42
  def ==(other)
30
43
  to_a == other.to_a
31
44
  end
@@ -38,6 +51,11 @@ module PDF
38
51
  bottom_right.x - bottom_left.x
39
52
  end
40
53
 
54
+ def contains?(point)
55
+ point.x >= bottom_left.x && point.x <= top_right.x &&
56
+ point.y >= bottom_left.y && point.y <= top_right.y
57
+ end
58
+
41
59
  # A pdf-style 4-number array
42
60
  def to_a
43
61
  [
@@ -7,15 +7,14 @@ class PDF::Reader
7
7
  class TextRun
8
8
  include Comparable
9
9
 
10
- attr_reader :x, :y, :width, :font_size, :text
10
+ attr_reader :origin, :width, :font_size, :text
11
11
 
12
12
  alias :to_s :text
13
13
 
14
14
  def initialize(x, y, width, font_size, text)
15
- @x = x
16
- @y = y
15
+ @origin = PDF::Reader::Point.new(x, y)
17
16
  @width = width
18
- @font_size = font_size.floor
17
+ @font_size = font_size
19
18
  @text = text
20
19
  end
21
20
 
@@ -35,12 +34,20 @@ class PDF::Reader
35
34
  end
36
35
  end
37
36
 
37
+ def x
38
+ @origin.x
39
+ end
40
+
41
+ def y
42
+ @origin.y
43
+ end
44
+
38
45
  def endx
39
- @endx ||= x + width
46
+ @endx ||= @origin.x + width
40
47
  end
41
48
 
42
49
  def endy
43
- @endy ||= y + font_size
50
+ @endy ||= @origin.y + font_size
44
51
  end
45
52
 
46
53
  def mean_character_width
data/lib/pdf/reader.rb CHANGED
@@ -112,17 +112,25 @@ module PDF
112
112
  #
113
113
  # reader = PDF::Reader.new("somefile.pdf", :password => "apples")
114
114
  #
115
+ # Using this method directly is supported, but it's more common to use
116
+ # `PDF::Reader.open`
117
+ #
115
118
  def initialize(input, opts = {})
116
119
  @cache = PDF::Reader::ObjectCache.new
117
120
  opts.merge!(:cache => @cache)
118
121
  @objects = PDF::Reader::ObjectHash.new(input, opts)
119
122
  end
120
123
 
124
+ # Return a Hash with some basic information about the PDF file
125
+ #
121
126
  def info
122
127
  dict = @objects.deref(@objects.trailer[:Info])
123
128
  doc_strings_to_utf8(dict)
124
129
  end
125
130
 
131
+ # Return a Hash with extra metadata provided by the author of the PDF file. Not
132
+ # always present.
133
+ #
126
134
  def metadata
127
135
  stream = @objects.deref(root[:Metadata])
128
136
  if stream.nil?
@@ -134,6 +142,8 @@ module PDF
134
142
  end
135
143
  end
136
144
 
145
+ # The number of pages in this PDF
146
+ #
137
147
  def page_count
138
148
  pages = @objects.deref(root[:Pages])
139
149
  unless pages.kind_of?(::Hash)
@@ -142,12 +152,14 @@ module PDF
142
152
  @page_count ||= @objects.deref(pages[:Count])
143
153
  end
144
154
 
155
+ # The PDF version this file uses
156
+ #
145
157
  def pdf_version
146
158
  @objects.pdf_version
147
159
  end
148
160
 
149
- # syntactic sugar for opening a PDF file. Accepts the same arguments
150
- # as new().
161
+ # syntactic sugar for opening a PDF file and the most common approach. Accepts the
162
+ # same arguments as new().
151
163
  #
152
164
  # PDF::Reader.open("somefile.pdf") do |reader|
153
165
  # puts reader.pdf_version
@@ -273,6 +285,7 @@ end
273
285
 
274
286
  require 'pdf/reader/resource_methods'
275
287
  require 'pdf/reader/buffer'
288
+ require 'pdf/reader/bounding_rectangle_runs_filter'
276
289
  require 'pdf/reader/cid_widths'
277
290
  require 'pdf/reader/cmap'
278
291
  require 'pdf/reader/encoding'
data/rbi/pdf-reader.rbi CHANGED
@@ -43,6 +43,13 @@ module PDF
43
43
  sig { returns(T::Hash[Symbol, T.untyped]) }
44
44
  def root; end
45
45
 
46
+ class BoundingRectangleRunsFilter
47
+ extend T::Sig
48
+
49
+ sig { params(runs: T::Array[PDF::Reader::TextRun], rect: PDF::Reader::Rectangle).returns(T::Array[PDF::Reader::TextRun]) }
50
+ def self.runs_within_rect(runs, rect); end
51
+ end
52
+
46
53
  class Buffer
47
54
  TOKEN_WHITESPACE = [0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20]
48
55
  TOKEN_DELIMITER = [0x25, 0x3C, 0x3E, 0x28, 0x5B, 0x7B, 0x29, 0x5D, 0x7D, 0x2F]
@@ -750,8 +757,11 @@ module PDF
750
757
  sig { returns(T::Array[Numeric]) }
751
758
  def origin; end
752
759
 
753
- sig { returns(String) }
754
- def text; end
760
+ sig { params(opts: T::Hash[Symbol, T.untyped]).returns(T::Array[PDF::Reader::TextRun]) }
761
+ def runs(opts = {}); end
762
+
763
+ sig { params(opts: T::Hash[Symbol, T.untyped]).returns(String) }
764
+ def text(opts = {}); end
755
765
 
756
766
  sig { params(receivers: T.untyped).void }
757
767
  def walk(*receivers); end
@@ -794,7 +804,7 @@ module PDF
794
804
  extend T::Sig
795
805
  DEFAULT_FONT_SIZE = 12
796
806
 
797
- sig { params(runs: T::Array[PDF::Reader::TextRun], mediabox: T::Array[Numeric]).void }
807
+ sig { params(runs: T::Array[PDF::Reader::TextRun], mediabox: T.any(T::Array[Numeric], PDF::Reader::Rectangle)).void }
798
808
  def initialize(runs, mediabox); end
799
809
 
800
810
  sig { returns(String) }
@@ -829,6 +839,9 @@ module PDF
829
839
 
830
840
  sig { params(haystack: T.untyped, needle: T.untyped, index: T.untyped).returns(T.untyped) }
831
841
  def local_string_insert(haystack, needle, index); end
842
+
843
+ sig { params(mediabox: T.untyped).returns(T.untyped) }
844
+ def process_mediabox(mediabox); end
832
845
  end
833
846
 
834
847
  class PageState
@@ -996,6 +1009,9 @@ module PDF
996
1009
  sig { params(str: T.untyped).returns(T.untyped) }
997
1010
  def move_to_next_line_and_show_text(str); end
998
1011
 
1012
+ sig { params(opts: T::Hash[Symbol, T.untyped]).returns(T::Array[PDF::Reader::TextRun]) }
1013
+ def runs(opts = {}); end
1014
+
999
1015
  sig { params(aw: T.untyped, ac: T.untyped, string: T.untyped).returns(T.untyped) }
1000
1016
  def set_spacing_next_line_show_text(aw, ac, string); end
1001
1017
 
@@ -1122,6 +1138,9 @@ module PDF
1122
1138
  end
1123
1139
 
1124
1140
  class Rectangle
1141
+ sig { params(arr: T::Array[Numeric]).returns(PDF::Reader::Rectangle) }
1142
+ def self.from_array(arr); end
1143
+
1125
1144
  sig do
1126
1145
  params(
1127
1146
  x1: Numeric,
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pdf-reader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.7.0
4
+ version: 2.8.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - James Healy
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-12-13 00:00:00.000000000 Z
11
+ date: 2021-12-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -215,6 +215,7 @@ files:
215
215
  - lib/pdf/reader/afm/Times-Italic.afm
216
216
  - lib/pdf/reader/afm/Times-Roman.afm
217
217
  - lib/pdf/reader/afm/ZapfDingbats.afm
218
+ - lib/pdf/reader/bounding_rectangle_runs_filter.rb
218
219
  - lib/pdf/reader/buffer.rb
219
220
  - lib/pdf/reader/cid_widths.rb
220
221
  - lib/pdf/reader/cmap.rb
@@ -281,9 +282,9 @@ licenses:
281
282
  - MIT
282
283
  metadata:
283
284
  bug_tracker_uri: https://github.com/yob/pdf-reader/issues
284
- changelog_uri: https://github.com/yob/pdf-reader/blob/v2.7.0/CHANGELOG
285
- documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.7.0
286
- source_code_uri: https://github.com/yob/pdf-reader/tree/v2.7.0
285
+ changelog_uri: https://github.com/yob/pdf-reader/blob/v2.8.0/CHANGELOG
286
+ documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.8.0
287
+ source_code_uri: https://github.com/yob/pdf-reader/tree/v2.8.0
287
288
  post_install_message:
288
289
  rdoc_options:
289
290
  - "--title"