pdf-reader 2.6.0 → 2.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +13 -1
- data/examples/rspec.rb +1 -0
- data/lib/pdf/reader/buffer.rb +1 -0
- data/lib/pdf/reader/cid_widths.rb +1 -0
- data/lib/pdf/reader/cmap.rb +5 -3
- data/lib/pdf/reader/encoding.rb +2 -1
- data/lib/pdf/reader/error.rb +8 -0
- data/lib/pdf/reader/filter/ascii85.rb +2 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +6 -1
- data/lib/pdf/reader/filter/depredict.rb +7 -5
- data/lib/pdf/reader/filter/flate.rb +2 -0
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +1 -0
- data/lib/pdf/reader/filter/run_length.rb +19 -13
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/font.rb +1 -0
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +1 -0
- data/lib/pdf/reader/lzw.rb +4 -2
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +5 -2
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/overlapping_runs_filter.rb +11 -4
- data/lib/pdf/reader/page.rb +60 -9
- data/lib/pdf/reader/page_layout.rb +24 -14
- data/lib/pdf/reader/page_state.rb +11 -10
- data/lib/pdf/reader/page_text_receiver.rb +13 -8
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +4 -1
- data/lib/pdf/reader/point.rb +25 -0
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/rectangle.rb +95 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +5 -0
- data/lib/pdf/reader/standard_security_handler.rb +1 -0
- data/lib/pdf/reader/standard_security_handler_v5.rb +1 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +1 -0
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +1 -0
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +1 -0
- data/lib/pdf/reader/zero_width_runs_filter.rb +2 -0
- data/lib/pdf/reader.rb +14 -4
- data/lib/pdf-reader.rb +1 -0
- data/rbi/pdf-reader.rbi +1744 -0
- metadata +12 -10
- data/lib/pdf/reader/orientation_detector.rb +0 -34
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'pdf/reader/transformation_matrix'
|
@@ -312,7 +313,7 @@ class PDF::Reader
|
|
312
313
|
# may need to be added
|
313
314
|
#
|
314
315
|
def process_glyph_displacement(w0, tj, word_boundary)
|
315
|
-
fs =
|
316
|
+
fs = state[:text_font_size]
|
316
317
|
tc = state[:char_spacing]
|
317
318
|
if word_boundary
|
318
319
|
tw = state[:word_spacing]
|
@@ -330,16 +331,16 @@ class PDF::Reader
|
|
330
331
|
# apply horizontal scaling to spacing values but not font size
|
331
332
|
tx = ((w0 * fs) + tc + tw) * th
|
332
333
|
end
|
333
|
-
|
334
|
-
# TODO: I'm pretty sure that tx shouldn't need to be divided by
|
335
|
-
# ctm[0] here, but this gets my tests green and I'm out of
|
336
|
-
# ideas for now
|
337
334
|
# TODO: support ty > 0
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
335
|
+
ty = 0
|
336
|
+
temp = TransformationMatrix.new(1, 0,
|
337
|
+
0, 1,
|
338
|
+
tx, ty)
|
339
|
+
@text_matrix = temp.multiply!(
|
340
|
+
@text_matrix.a, @text_matrix.b,
|
341
|
+
@text_matrix.c, @text_matrix.d,
|
342
|
+
@text_matrix.e, @text_matrix.f
|
343
|
+
)
|
343
344
|
@font_size = @text_rendering_matrix = nil # invalidate cached value
|
344
345
|
end
|
345
346
|
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
require 'forwardable'
|
@@ -44,14 +45,11 @@ module PDF
|
|
44
45
|
@page = page
|
45
46
|
@content = []
|
46
47
|
@characters = []
|
47
|
-
@mediabox = page.objects.deref(page.attributes[:MediaBox])
|
48
|
-
device_bl = apply_rotation(*@state.ctm_transform(@mediabox[0], @mediabox[1]))
|
49
|
-
device_tr = apply_rotation(*@state.ctm_transform(@mediabox[2], @mediabox[3]))
|
50
|
-
@device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
|
51
48
|
end
|
52
49
|
|
53
50
|
def content
|
54
|
-
|
51
|
+
mediabox = @page.rectangles[:MediaBox].to_a
|
52
|
+
PageLayout.new(@characters, mediabox).to_s
|
55
53
|
end
|
56
54
|
|
57
55
|
#####################################################
|
@@ -121,6 +119,12 @@ module PDF
|
|
121
119
|
end
|
122
120
|
end
|
123
121
|
|
122
|
+
# TODO: revisit this. It rotates the co-ordinates to the right direction, but I don't
|
123
|
+
# think it sets the correct x,y values. We get away with it because we don't
|
124
|
+
# return the text with co-ordinates, only the full text arranged in a string.
|
125
|
+
#
|
126
|
+
# We should provide an API for extracting the text with positioning data and spec
|
127
|
+
# that. I suspect the co-ords might be wrong for rotated pages
|
124
128
|
def apply_rotation(x, y)
|
125
129
|
if @page.rotate == 90
|
126
130
|
tmp = x
|
@@ -128,10 +132,11 @@ module PDF
|
|
128
132
|
y = tmp * -1
|
129
133
|
elsif @page.rotate == 180
|
130
134
|
y *= -1
|
135
|
+
x *= -1
|
131
136
|
elsif @page.rotate == 270
|
132
|
-
tmp =
|
133
|
-
|
134
|
-
|
137
|
+
tmp = y
|
138
|
+
y = x
|
139
|
+
x = tmp * -1
|
135
140
|
end
|
136
141
|
return x, y
|
137
142
|
end
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -166,7 +167,9 @@ class PDF::Reader
|
|
166
167
|
|
167
168
|
# add a missing digit if required, as required by the spec
|
168
169
|
str << "0" unless str.size % 2 == 0
|
169
|
-
str.
|
170
|
+
str.chars.each_slice(2).map { |nibbles|
|
171
|
+
nibbles.join("").hex.chr
|
172
|
+
}.join.force_encoding("binary")
|
170
173
|
end
|
171
174
|
################################################################################
|
172
175
|
# Reads a PDF String from the buffer and converts it to a Ruby String
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: true
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
module PDF
|
6
|
+
class Reader
|
7
|
+
|
8
|
+
# PDFs are all about positioning content on a page, so there's lots of need to
|
9
|
+
# work with a set of X,Y coordinates.
|
10
|
+
#
|
11
|
+
class Point
|
12
|
+
|
13
|
+
attr_reader :x, :y
|
14
|
+
|
15
|
+
def initialize(x, y)
|
16
|
+
@x, @y = x, y
|
17
|
+
end
|
18
|
+
|
19
|
+
def ==(other)
|
20
|
+
other.respond_to?(:x) && other.respond_to?(:y) && x == other.x && y == other.y
|
21
|
+
end
|
22
|
+
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# typed: true
|
3
|
+
# frozen_string_literal: true
|
4
|
+
|
5
|
+
module PDF
|
6
|
+
class Reader
|
7
|
+
|
8
|
+
# PDFs represent rectangles all over the place. They're 4 element arrays, like this:
|
9
|
+
#
|
10
|
+
# [A, B, C, D]
|
11
|
+
#
|
12
|
+
# Four element arrays are yucky to work with though, so here's a class that's better.
|
13
|
+
# Initialize it with the 4 elements, and get utility functions (width, height, etc)
|
14
|
+
# for free.
|
15
|
+
#
|
16
|
+
# By convention the first two elements are x1, y1, the co-ords for the bottom left corner
|
17
|
+
# of the rectangle. The third and fourth elements are x2, y2, the co-ords for the top right
|
18
|
+
# corner of the rectangle. It's valid for the alternative corners to be used though, so
|
19
|
+
# we don't assume which is which.
|
20
|
+
#
|
21
|
+
class Rectangle
|
22
|
+
|
23
|
+
attr_reader :bottom_left, :bottom_right, :top_left, :top_right
|
24
|
+
|
25
|
+
def initialize(x1, y1, x2, y2)
|
26
|
+
set_corners(x1, y1, x2, y2)
|
27
|
+
end
|
28
|
+
|
29
|
+
def ==(other)
|
30
|
+
to_a == other.to_a
|
31
|
+
end
|
32
|
+
|
33
|
+
def height
|
34
|
+
top_right.y - bottom_right.y
|
35
|
+
end
|
36
|
+
|
37
|
+
def width
|
38
|
+
bottom_right.x - bottom_left.x
|
39
|
+
end
|
40
|
+
|
41
|
+
# A pdf-style 4-number array
|
42
|
+
def to_a
|
43
|
+
[
|
44
|
+
bottom_left.x,
|
45
|
+
bottom_left.y,
|
46
|
+
top_right.x,
|
47
|
+
top_right.y,
|
48
|
+
]
|
49
|
+
end
|
50
|
+
|
51
|
+
def apply_rotation(degrees)
|
52
|
+
return if degrees != 90 && degrees != 180 && degrees != 270
|
53
|
+
|
54
|
+
if degrees == 90
|
55
|
+
new_x1 = bottom_left.x
|
56
|
+
new_y1 = bottom_left.y - width
|
57
|
+
new_x2 = bottom_left.x + height
|
58
|
+
new_y2 = bottom_left.y
|
59
|
+
elsif degrees == 180
|
60
|
+
new_x1 = bottom_left.x - width
|
61
|
+
new_y1 = bottom_left.y - height
|
62
|
+
new_x2 = bottom_left.x
|
63
|
+
new_y2 = bottom_left.y
|
64
|
+
elsif degrees == 270
|
65
|
+
new_x1 = bottom_left.x - height
|
66
|
+
new_y1 = bottom_left.y
|
67
|
+
new_x2 = bottom_left.x
|
68
|
+
new_y2 = bottom_left.y + width
|
69
|
+
end
|
70
|
+
set_corners(new_x1, new_y1, new_x2, new_y2)
|
71
|
+
end
|
72
|
+
|
73
|
+
private
|
74
|
+
|
75
|
+
def set_corners(x1, y1, x2, y2)
|
76
|
+
@bottom_left = PDF::Reader::Point.new(
|
77
|
+
[x1, x2].min,
|
78
|
+
[y1, y2].min,
|
79
|
+
)
|
80
|
+
@bottom_right = PDF::Reader::Point.new(
|
81
|
+
[x1, x2].max,
|
82
|
+
[y1, y2].min,
|
83
|
+
)
|
84
|
+
@top_left = PDF::Reader::Point.new(
|
85
|
+
[x1, x2].min,
|
86
|
+
[y1, y2].max,
|
87
|
+
)
|
88
|
+
@top_right = PDF::Reader::Point.new(
|
89
|
+
[x1, x2].max,
|
90
|
+
[y1, y2].max,
|
91
|
+
)
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
end
|
data/lib/pdf/reader/resource_methods.rb
CHANGED
@@ -1,12 +1,17 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: false
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
5
|
+
# Setting this file to "typed: true" is difficult because it's a mixin that assumes some things
|
6
|
+
# are available from the class, like @objects and resources. Sorbet doesn't know about them.
|
7
|
+
|
4
8
|
module PDF
|
5
9
|
class Reader
|
6
10
|
|
7
11
|
# mixin for common methods in Page and FormXobjects
|
8
12
|
#
|
9
13
|
module ResourceMethods
|
14
|
+
|
10
15
|
# Returns a Hash of color spaces that are available to this page
|
11
16
|
#
|
12
17
|
# NOTE: this method de-serialises objects from the underlying PDF
|
data/lib/pdf/reader/stream.rb
CHANGED
data/lib/pdf/reader/text_run.rb
CHANGED
data/lib/pdf/reader/token.rb
CHANGED
data/lib/pdf/reader/xref.rb
CHANGED
data/lib/pdf/reader.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# typed: true
|
2
3
|
# frozen_string_literal: true
|
3
4
|
|
4
5
|
################################################################################
|
@@ -136,7 +137,7 @@ module PDF
|
|
136
137
|
def page_count
|
137
138
|
pages = @objects.deref(root[:Pages])
|
138
139
|
unless pages.kind_of?(::Hash)
|
139
|
-
raise MalformedPDFError,
|
140
|
+
raise MalformedPDFError, "Pages structure is missing #{pages.class}"
|
140
141
|
end
|
141
142
|
@page_count ||= @objects.deref(pages[:Count])
|
142
143
|
end
|
@@ -221,7 +222,7 @@ module PDF
|
|
221
222
|
when Array then
|
222
223
|
obj.map { |item| doc_strings_to_utf8(item) }
|
223
224
|
when String then
|
224
|
-
if obj
|
225
|
+
if has_utf16_bom?(obj)
|
225
226
|
utf16_to_utf8(obj)
|
226
227
|
else
|
227
228
|
pdfdoc_to_utf8(obj)
|
@@ -231,6 +232,14 @@ module PDF
|
|
231
232
|
end
|
232
233
|
end
|
233
234
|
|
235
|
+
def has_utf16_bom?(str)
|
236
|
+
first_bytes = str[0,2]
|
237
|
+
|
238
|
+
return false if first_bytes.nil?
|
239
|
+
|
240
|
+
first_bytes.unpack("C*") == [254, 255]
|
241
|
+
end
|
242
|
+
|
234
243
|
# TODO find a PDF I can use to spec this behaviour
|
235
244
|
#
|
236
245
|
def pdfdoc_to_utf8(obj)
|
@@ -242,7 +251,7 @@ module PDF
|
|
242
251
|
# String#encode
|
243
252
|
#
|
244
253
|
def utf16_to_utf8(obj)
|
245
|
-
str = obj[2, obj.size]
|
254
|
+
str = obj[2, obj.size].to_s
|
246
255
|
str = str.unpack("n*").pack("U*")
|
247
256
|
str.force_encoding("utf-8")
|
248
257
|
str
|
@@ -286,7 +295,9 @@ require 'pdf/reader/object_hash'
|
|
286
295
|
require 'pdf/reader/object_stream'
|
287
296
|
require 'pdf/reader/pages_strategy'
|
288
297
|
require 'pdf/reader/parser'
|
298
|
+
require 'pdf/reader/point'
|
289
299
|
require 'pdf/reader/print_receiver'
|
300
|
+
require 'pdf/reader/rectangle'
|
290
301
|
require 'pdf/reader/reference'
|
291
302
|
require 'pdf/reader/register_receiver'
|
292
303
|
require 'pdf/reader/null_security_handler'
|
@@ -299,5 +310,4 @@ require 'pdf/reader/page_state'
|
|
299
310
|
require 'pdf/reader/page_text_receiver'
|
300
311
|
require 'pdf/reader/token'
|
301
312
|
require 'pdf/reader/xref'
|
302
|
-
require 'pdf/reader/orientation_detector'
|
303
313
|
require 'pdf/reader/page'
|