pdf-reader 2.4.1 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +40 -0
- data/README.md +16 -1
- data/Rakefile +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/buffer.rb +63 -21
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +5 -3
- data/lib/pdf/reader/encoding.rb +3 -2
- data/lib/pdf/reader/error.rb +11 -3
- data/lib/pdf/reader/filter/ascii85.rb +7 -1
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +10 -8
- data/lib/pdf/reader/filter/flate.rb +27 -14
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/font.rb +1 -0
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +16 -9
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +8 -3
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +60 -9
- data/lib/pdf/reader/page_layout.rb +37 -23
- data/lib/pdf/reader/page_state.rb +18 -23
- data/lib/pdf/reader/page_text_receiver.rb +28 -5
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +12 -7
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rectangle.rb +95 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +5 -0
- data/lib/pdf/reader/standard_security_handler.rb +1 -0
- data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +1 -0
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +8 -15
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +7 -1
- data/lib/pdf/reader/zero_width_runs_filter.rb +13 -0
- data/lib/pdf/reader.rb +14 -4
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1744 -0
- metadata +17 -13
- data/lib/pdf/reader/orientation_detector.rb +0 -34
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'forwardable'
|
@@ -41,16 +42,14 @@ module PDF
|
|
41
42
|
# starting a new page
|
42
43
|
def page=(page)
|
43
44
|
@state = PageState.new(page)
|
45
|
+
@page = page
|
44
46
|
@content = []
|
45
47
|
@characters = []
|
46
|
-
@mediabox = page.objects.deref(page.attributes[:MediaBox])
|
47
|
-
device_bl = @state.ctm_transform(@mediabox[0], @mediabox[1])
|
48
|
-
device_tr = @state.ctm_transform(@mediabox[2], @mediabox[3])
|
49
|
-
@device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
|
50
48
|
end
|
51
49
|
|
52
50
|
def content
|
53
|
-
|
51
|
+
mediabox = @page.rectangles[:MediaBox].to_a
|
52
|
+
PageLayout.new(@characters, mediabox).to_s
|
54
53
|
end
|
55
54
|
|
56
55
|
#####################################################
|
@@ -104,6 +103,8 @@ module PDF
|
|
104
103
|
glyphs.each_with_index do |glyph_code, index|
|
105
104
|
# paint the current glyph
|
106
105
|
newx, newy = @state.trm_transform(0,0)
|
106
|
+
newx, newy = apply_rotation(newx, newy)
|
107
|
+
|
107
108
|
utf8_chars = @state.current_font.to_utf8(glyph_code)
|
108
109
|
|
109
110
|
# apply to glyph displacement for the current glyph so the next
|
@@ -118,6 +119,28 @@ module PDF
|
|
118
119
|
end
|
119
120
|
end
|
120
121
|
|
122
|
+
# TODO: revisit this. It rotates the co-ordinates to the right direction, but I don't
|
123
|
+
# think it sets the correct x,y values. We get away with it because we don't
|
124
|
+
# return the text with co-ordinates, only the full text arranged in a string.
|
125
|
+
#
|
126
|
+
# We should provide an API for extracting the text with positioning data and spec
|
127
|
+
# that. I suspect the co-ords might be wrong for rotated pages
|
128
|
+
def apply_rotation(x, y)
|
129
|
+
if @page.rotate == 90
|
130
|
+
tmp = x
|
131
|
+
x = y
|
132
|
+
y = tmp * -1
|
133
|
+
elsif @page.rotate == 180
|
134
|
+
y *= -1
|
135
|
+
x *= -1
|
136
|
+
elsif @page.rotate == 270
|
137
|
+
tmp = y
|
138
|
+
y = x
|
139
|
+
x = tmp * -1
|
140
|
+
end
|
141
|
+
return x, y
|
142
|
+
end
|
143
|
+
|
121
144
|
end
|
122
145
|
end
|
123
146
|
end
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -166,7 +167,9 @@ class PDF::Reader
|
|
166
167
|
|
167
168
|
# add a missing digit if required, as required by the spec
|
168
169
|
str << "0" unless str.size % 2 == 0
|
169
|
-
str.
|
170
|
+
str.chars.each_slice(2).map { |nibbles|
|
171
|
+
nibbles.join("").hex.chr
|
172
|
+
}.join.force_encoding("binary")
|
170
173
|
end
|
171
174
|
################################################################################
|
172
175
|
# Reads a PDF String from the buffer and converts it to a Ruby String
|
@@ -175,15 +178,18 @@ class PDF::Reader
|
|
175
178
|
return "".dup.force_encoding("binary") if str == ")"
|
176
179
|
Error.assert_equal(parse_token, ")")
|
177
180
|
|
178
|
-
str.gsub!(/\\([nrtbf()\\\n]
|
179
|
-
|
181
|
+
str.gsub!(/\\(\r\n|[nrtbf()\\\n\r]|([0-7]{1,3}))?|\r\n?/m) do |match|
|
182
|
+
if $2.nil? # not octal digits
|
183
|
+
MAPPING[match] || "".dup
|
184
|
+
else # must be octal digits
|
185
|
+
($2.oct & 0xff).chr # ignore high level overflow
|
186
|
+
end
|
180
187
|
end
|
181
188
|
str.force_encoding("binary")
|
182
189
|
end
|
183
190
|
|
184
191
|
MAPPING = {
|
185
192
|
"\r" => "\n",
|
186
|
-
"\n\r" => "\n",
|
187
193
|
"\r\n" => "\n",
|
188
194
|
"\\n" => "\n",
|
189
195
|
"\\r" => "\r",
|
@@ -194,10 +200,9 @@ class PDF::Reader
|
|
194
200
|
"\\)" => ")",
|
195
201
|
"\\\\" => "\\",
|
196
202
|
"\\\n" => "",
|
203
|
+
"\\\r" => "",
|
204
|
+
"\\\r\n" => "",
|
197
205
|
}
|
198
|
-
0.upto(9) { |n| MAPPING["\\00"+n.to_s] = ("00"+n.to_s).oct.chr }
|
199
|
-
0.upto(99) { |n| MAPPING["\\0"+n.to_s] = ("0"+n.to_s).oct.chr }
|
200
|
-
0.upto(377) { |n| MAPPING["\\"+n.to_s] = n.to_s.oct.chr }
|
201
206
|
|
202
207
|
################################################################################
|
203
208
|
# Decodes the contents of a PDF Stream and returns it as a Ruby String.
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: true
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
module PDF
|
6
|
+
class Reader
|
7
|
+
|
8
|
+
# PDFs are all about positioning content on a page, so there's lots of need to
|
9
|
+
# work with a set of X,Y coordinates.
|
10
|
+
#
|
11
|
+
class Point
|
12
|
+
|
13
|
+
attr_reader :x, :y
|
14
|
+
|
15
|
+
def initialize(x, y)
|
16
|
+
@x, @y = x, y
|
17
|
+
end
|
18
|
+
|
19
|
+
def ==(other)
|
20
|
+
other.respond_to?(:x) && other.respond_to?(:y) && x == other.x && y == other.y
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: true
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
module PDF
|
6
|
+
class Reader
|
7
|
+
|
8
|
+
# PDFs represent rectangles all over the place. They're 4 element arrays, like this:
|
9
|
+
#
|
10
|
+
# [A, B, C, D]
|
11
|
+
#
|
12
|
+
# Four element arrays are yucky to work with though, so here's a class that's better.
|
13
|
+
# Initialize it with the 4 elements, and get utility functions (width, height, etc)
|
14
|
+
# for free.
|
15
|
+
#
|
16
|
+
# By convention the first two elements are x1, y1, the co-ords for the bottom left corner
|
17
|
+
# of the rectangle. The third and fourth elements are x2, y2, the co-ords for the top right
|
18
|
+
# corner of the rectangle. It's valid for the alternative corners to be used though, so
|
19
|
+
# we don't assume which is which.
|
20
|
+
#
|
21
|
+
class Rectangle
|
22
|
+
|
23
|
+
attr_reader :bottom_left, :bottom_right, :top_left, :top_right
|
24
|
+
|
25
|
+
def initialize(x1, y1, x2, y2)
|
26
|
+
set_corners(x1, y1, x2, y2)
|
27
|
+
end
|
28
|
+
|
29
|
+
def ==(other)
|
30
|
+
to_a == other.to_a
|
31
|
+
end
|
32
|
+
|
33
|
+
def height
|
34
|
+
top_right.y - bottom_right.y
|
35
|
+
end
|
36
|
+
|
37
|
+
def width
|
38
|
+
bottom_right.x - bottom_left.x
|
39
|
+
end
|
40
|
+
|
41
|
+
# A pdf-style 4-number array
|
42
|
+
def to_a
|
43
|
+
[
|
44
|
+
bottom_left.x,
|
45
|
+
bottom_left.y,
|
46
|
+
top_right.x,
|
47
|
+
top_right.y,
|
48
|
+
]
|
49
|
+
end
|
50
|
+
|
51
|
+
def apply_rotation(degrees)
|
52
|
+
return if degrees != 90 && degrees != 180 && degrees != 270
|
53
|
+
|
54
|
+
if degrees == 90
|
55
|
+
new_x1 = bottom_left.x
|
56
|
+
new_y1 = bottom_left.y - width
|
57
|
+
new_x2 = bottom_left.x + height
|
58
|
+
new_y2 = bottom_left.y
|
59
|
+
elsif degrees == 180
|
60
|
+
new_x1 = bottom_left.x - width
|
61
|
+
new_y1 = bottom_left.y - height
|
62
|
+
new_x2 = bottom_left.x
|
63
|
+
new_y2 = bottom_left.y
|
64
|
+
elsif degrees == 270
|
65
|
+
new_x1 = bottom_left.x - height
|
66
|
+
new_y1 = bottom_left.y
|
67
|
+
new_x2 = bottom_left.x
|
68
|
+
new_y2 = bottom_left.y + width
|
69
|
+
end
|
70
|
+
set_corners(new_x1, new_y1, new_x2, new_y2)
|
71
|
+
end
|
72
|
+
|
73
|
+
private
|
74
|
+
|
75
|
+
def set_corners(x1, y1, x2, y2)
|
76
|
+
@bottom_left = PDF::Reader::Point.new(
|
77
|
+
[x1, x2].min,
|
78
|
+
[y1, y2].min,
|
79
|
+
)
|
80
|
+
@bottom_right = PDF::Reader::Point.new(
|
81
|
+
[x1, x2].max,
|
82
|
+
[y1, y2].min,
|
83
|
+
)
|
84
|
+
@top_left = PDF::Reader::Point.new(
|
85
|
+
[x1, x2].min,
|
86
|
+
[y1, y2].max,
|
87
|
+
)
|
88
|
+
@top_right = PDF::Reader::Point.new(
|
89
|
+
[x1, x2].max,
|
90
|
+
[y1, y2].max,
|
91
|
+
)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
data/lib/pdf/reader/reference.rb
CHANGED
@@ -1,12 +1,17 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: false
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
5
|
+
# Setting this file to "typed: true" is difficult because it's a mixin that assumes some things
|
6
|
+
# are available from the class, like @objects and resources. Sorbet doesn't know about them.
|
7
|
+
|
4
8
|
module PDF
|
5
9
|
class Reader
|
6
10
|
|
7
11
|
# mixin for common methods in Page and FormXobjects
|
8
12
|
#
|
9
13
|
module ResourceMethods
|
14
|
+
|
10
15
|
# Returns a Hash of color spaces that are available to this page
|
11
16
|
#
|
12
17
|
# NOTE: this method de-serialises objects from the underlying PDF
|
data/lib/pdf/reader/stream.rb
CHANGED
data/lib/pdf/reader/text_run.rb
CHANGED
data/lib/pdf/reader/token.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'afm'
|
@@ -37,23 +38,15 @@ class PDF::Reader
|
|
37
38
|
def glyph_width(code_point)
|
38
39
|
return 0 if code_point.nil? || code_point < 0
|
39
40
|
|
40
|
-
|
41
|
-
|
42
|
-
|
41
|
+
names = @font.encoding.int_to_name(code_point)
|
42
|
+
metrics = names.map { |name|
|
43
|
+
@metrics.char_metrics[name.to_s]
|
44
|
+
}.compact.first
|
43
45
|
|
44
|
-
|
45
|
-
|
46
|
-
}.compact.first
|
47
|
-
end
|
48
|
-
|
49
|
-
if m
|
50
|
-
m[:wx]
|
51
|
-
elsif @font.widths[code_point - 1]
|
52
|
-
@font.widths[code_point - 1]
|
53
|
-
elsif control_character?(code_point)
|
54
|
-
0
|
46
|
+
if metrics
|
47
|
+
metrics[:wx]
|
55
48
|
else
|
56
|
-
0
|
49
|
+
@font.widths[code_point - 1] || 0
|
57
50
|
end
|
58
51
|
end
|
59
52
|
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -131,6 +132,9 @@ class PDF::Reader
|
|
131
132
|
generation = buf.token.to_i
|
132
133
|
state = buf.token
|
133
134
|
|
135
|
+
# Some PDF writers start numbering at 1 instead of 0. Fix up the number.
|
136
|
+
# TODO should this fix be logged?
|
137
|
+
objid = 0 if objid == 1 and offset == 0 and generation == 65535 and state == 'f'
|
134
138
|
store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
|
135
139
|
objid += 1
|
136
140
|
params.clear
|
@@ -146,7 +150,9 @@ class PDF::Reader
|
|
146
150
|
end
|
147
151
|
|
148
152
|
load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
|
149
|
-
|
153
|
+
# Some PDF creators seem to use '/Prev 0' in trailer if there is no previous xref
|
154
|
+
# It's not possible for an xref to appear at offset 0, so can safely skip the ref
|
155
|
+
load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev) and trailer[:Prev].to_i != 0
|
150
156
|
|
151
157
|
trailer
|
152
158
|
end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: strict
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
class PDF::Reader
|
6
|
+
# There's no point rendering zero-width characters
|
7
|
+
class ZeroWidthRunsFilter
|
8
|
+
|
9
|
+
def self.exclude_zero_width_runs(runs)
|
10
|
+
runs.reject { |run| run.width == 0 }
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
data/lib/pdf/reader.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -136,7 +137,7 @@ module PDF
|
|
136
137
|
def page_count
|
137
138
|
pages = @objects.deref(root[:Pages])
|
138
139
|
unless pages.kind_of?(::Hash)
|
139
|
-
raise MalformedPDFError,
|
140
|
+
raise MalformedPDFError, "Pages structure is missing #{pages.class}"
|
140
141
|
end
|
141
142
|
@page_count ||= @objects.deref(pages[:Count])
|
142
143
|
end
|
@@ -221,7 +222,7 @@ module PDF
|
|
221
222
|
when Array then
|
222
223
|
obj.map { |item| doc_strings_to_utf8(item) }
|
223
224
|
when String then
|
224
|
-
if obj
|
225
|
+
if has_utf16_bom?(obj)
|
225
226
|
utf16_to_utf8(obj)
|
226
227
|
else
|
227
228
|
pdfdoc_to_utf8(obj)
|
@@ -231,6 +232,14 @@ module PDF
|
|
231
232
|
end
|
232
233
|
end
|
233
234
|
|
235
|
+
def has_utf16_bom?(str)
|
236
|
+
first_bytes = str[0,2]
|
237
|
+
|
238
|
+
return false if first_bytes.nil?
|
239
|
+
|
240
|
+
first_bytes.unpack("C*") == [254, 255]
|
241
|
+
end
|
242
|
+
|
234
243
|
# TODO find a PDF I can use to spec this behaviour
|
235
244
|
#
|
236
245
|
def pdfdoc_to_utf8(obj)
|
@@ -242,7 +251,7 @@ module PDF
|
|
242
251
|
# String#encode
|
243
252
|
#
|
244
253
|
def utf16_to_utf8(obj)
|
245
|
-
str = obj[2, obj.size]
|
254
|
+
str = obj[2, obj.size].to_s
|
246
255
|
str = str.unpack("n*").pack("U*")
|
247
256
|
str.force_encoding("utf-8")
|
248
257
|
str
|
@@ -286,7 +295,9 @@ require 'pdf/reader/object_hash'
|
|
286
295
|
require 'pdf/reader/object_stream'
|
287
296
|
require 'pdf/reader/pages_strategy'
|
288
297
|
require 'pdf/reader/parser'
|
298
|
+
require 'pdf/reader/point'
|
289
299
|
require 'pdf/reader/print_receiver'
|
300
|
+
require 'pdf/reader/rectangle'
|
290
301
|
require 'pdf/reader/reference'
|
291
302
|
require 'pdf/reader/register_receiver'
|
292
303
|
require 'pdf/reader/null_security_handler'
|
@@ -299,5 +310,4 @@ require 'pdf/reader/page_state'
|
|
299
310
|
require 'pdf/reader/page_text_receiver'
|
300
311
|
require 'pdf/reader/token'
|
301
312
|
require 'pdf/reader/xref'
|
302
|
-
require 'pdf/reader/orientation_detector'
|
303
313
|
require 'pdf/reader/page'
|