pdf-reader 1.4.1 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/CHANGELOG +53 -3
- data/{README.rdoc → README.md} +40 -23
- data/Rakefile +2 -2
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_object +4 -1
- data/bin/pdf_text +1 -1
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/buffer.rb +14 -12
- data/lib/pdf/reader/cid_widths.rb +2 -0
- data/lib/pdf/reader/cmap.rb +48 -36
- data/lib/pdf/reader/encoding.rb +16 -18
- data/lib/pdf/reader/error.rb +5 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
- data/lib/pdf/reader/filter/depredict.rb +1 -0
- data/lib/pdf/reader/filter/flate.rb +29 -16
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +2 -0
- data/lib/pdf/reader/filter/run_length.rb +4 -6
- data/lib/pdf/reader/filter.rb +2 -0
- data/lib/pdf/reader/font.rb +12 -13
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +7 -2
- data/lib/pdf/reader/lzw.rb +4 -4
- data/lib/pdf/reader/null_security_handler.rb +17 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +91 -37
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +5 -4
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +30 -1
- data/lib/pdf/reader/page_layout.rb +19 -24
- data/lib/pdf/reader/page_state.rb +8 -5
- data/lib/pdf/reader/page_text_receiver.rb +23 -1
- data/lib/pdf/reader/pages_strategy.rb +2 -304
- data/lib/pdf/reader/parser.rb +10 -7
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +80 -42
- data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +28 -9
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +25 -16
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -2
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +11 -5
- data/lib/pdf/reader.rb +30 -119
- data/lib/pdf-reader.rb +1 -0
- metadata +35 -61
- data/bin/pdf_list_callbacks +0 -17
- data/lib/pdf/hash.rb +0 -19
- data/lib/pdf/reader/abstract_strategy.rb +0 -81
- data/lib/pdf/reader/metadata_strategy.rb +0 -56
- data/lib/pdf/reader/text_receiver.rb +0 -265
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
class PDF::Reader
|
4
5
|
# Provides low level access to the objects in a PDF file via a hash-like
|
@@ -45,6 +46,7 @@ class PDF::Reader
|
|
45
46
|
@pdf_version = read_version
|
46
47
|
@trailer = @xref.trailer
|
47
48
|
@cache = opts[:cache] || PDF::Reader::ObjectCache.new
|
49
|
+
@sec_handler = NullSecurityHandler.new
|
48
50
|
@sec_handler = build_security_handler(opts)
|
49
51
|
end
|
50
52
|
|
@@ -76,16 +78,7 @@ class PDF::Reader
|
|
76
78
|
key = PDF::Reader::Reference.new(key.to_i, 0)
|
77
79
|
end
|
78
80
|
|
79
|
-
|
80
|
-
@cache[key]
|
81
|
-
elsif xref[key].is_a?(Integer)
|
82
|
-
buf = new_buffer(xref[key])
|
83
|
-
@cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
|
84
|
-
elsif xref[key].is_a?(PDF::Reader::Reference)
|
85
|
-
container_key = xref[key]
|
86
|
-
object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
|
87
|
-
@cache[key] = object_streams[container_key][key.id]
|
88
|
-
end
|
81
|
+
@cache[key] ||= fetch_object(key) || fetch_object_stream(key)
|
89
82
|
rescue InvalidObjectError
|
90
83
|
return default
|
91
84
|
end
|
@@ -102,21 +95,7 @@ class PDF::Reader
|
|
102
95
|
# a PDF::Reader::Reference, the key is returned unchanged.
|
103
96
|
#
|
104
97
|
def deref!(key)
|
105
|
-
|
106
|
-
when Hash
|
107
|
-
{}.tap { |hash|
|
108
|
-
object.each do |k, value|
|
109
|
-
hash[k] = deref!(value)
|
110
|
-
end
|
111
|
-
}
|
112
|
-
when PDF::Reader::Stream
|
113
|
-
object.hash = deref!(object.hash)
|
114
|
-
object
|
115
|
-
when Array
|
116
|
-
object.map { |value| deref!(value) }
|
117
|
-
else
|
118
|
-
object
|
119
|
-
end
|
98
|
+
deref_internal!(key, {})
|
120
99
|
end
|
121
100
|
|
122
101
|
# Access an object from the PDF. key can be an int or a PDF::Reader::Reference
|
@@ -266,24 +245,95 @@ class PDF::Reader
|
|
266
245
|
|
267
246
|
private
|
268
247
|
|
269
|
-
|
270
|
-
|
248
|
+
# parse a traditional object from the PDF, starting from the byte offset indicated
|
249
|
+
# in the xref table
|
250
|
+
#
|
251
|
+
def fetch_object(key)
  # An Integer xref entry is a byte offset; any other kind of entry is not
  # handled here and yields nil.
  offset = xref[key]
  return unless offset.is_a?(Integer)

  buf = new_buffer(offset)
  decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
end
|
257
|
+
|
258
|
+
# parse an object that's embedded in an object stream in the PDF
|
259
|
+
#
|
260
|
+
def fetch_object_stream(key)
  # A Reference xref entry names the object stream that contains the object;
  # any other kind of entry yields nil.
  container_key = xref[key]
  return unless container_key.is_a?(PDF::Reader::Reference)

  # Build the containing ObjectStream once and reuse it for later lookups.
  stream = (object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key)))
  stream[key.id]
end
|
267
|
+
|
268
|
+
# Private implementation of deref!, which exists to ensure the `seen` argument
|
269
|
+
# isn't publicly available. It's used to avoid endless loops in the recursion, and
|
270
|
+
# doesn't need to be part of the public API.
|
271
|
+
#
|
272
|
+
def deref_internal!(key, seen)
  # References are tracked by themselves, everything else by object identity.
  seen_key = key.is_a?(PDF::Reader::Reference) ? key : key.object_id
  return seen[seen_key] if seen.key?(seen_key)

  object = deref(key)
  case object
  when Hash
    # Register the (still empty) copy before recursing so that a cycle back
    # to this object resolves to the copy instead of looping forever.
    resolved = seen[seen_key] = {}
    object.each { |name, value| resolved[name] = deref_internal!(value, seen) }
    resolved
  when PDF::Reader::Stream
    resolved = seen[seen_key] = PDF::Reader::Stream.new({}, object.data)
    object.hash.each { |name, value| resolved.hash[name] = deref_internal!(value, seen) }
    resolved
  when Array
    resolved = seen[seen_key] = []
    object.each { |value| resolved << deref_internal!(value, seen) }
    resolved
  else
    object
  end
end
|
280
300
|
|
281
|
-
def
|
282
|
-
|
301
|
+
def build_security_handler(opts = {})
  # Choose a security handler from the trailer's Encrypt dictionary. Handlers
  # are tried in a fixed order; the first whose supports? check accepts the
  # dictionary is built and returned.
  encrypt = deref(trailer[:Encrypt])

  return NullSecurityHandler.new if NullSecurityHandler.supports?(encrypt)

  if StandardSecurityHandler.supports?(encrypt)
    # Metadata counts as encrypted unless the dictionary has an
    # EncryptMetadata entry whose string form is not "true".
    metadata_encrypted = !encrypt.has_key?(:EncryptMetadata) || encrypt[:EncryptMetadata].to_s == "true"
    return StandardSecurityHandler.new(
      key_length: (encrypt[:Length] || 40).to_i,
      revision: encrypt[:R],
      owner_key: encrypt[:O],
      user_key: encrypt[:U],
      permissions: encrypt[:P].to_i,
      encrypted_metadata: metadata_encrypted,
      file_id: (deref(trailer[:ID]) || []).first,
      password: opts[:password],
      cfm: encrypt.fetch(:CF, {}).fetch(encrypt[:StmF], {}).fetch(:CFM, nil)
    )
  end

  if StandardSecurityHandlerV5.supports?(encrypt)
    return StandardSecurityHandlerV5.new(
      O: encrypt[:O],
      U: encrypt[:U],
      OE: encrypt[:OE],
      UE: encrypt[:UE],
      password: opts[:password]
    )
  end

  # No handler accepted the Encrypt dictionary.
  UnimplementedSecurityHandler.new
end
|
283
330
|
|
331
|
+
def decrypt(ref, obj)
|
284
332
|
case obj
|
285
333
|
when PDF::Reader::Stream then
|
286
|
-
|
334
|
+
# PDF 32000-1:2008 7.5.8.2: "The cross-reference stream shall not be encrypted [...]."
|
335
|
+
# Therefore we shouldn't try to decrypt it.
|
336
|
+
obj.data = sec_handler.decrypt(obj.data, ref) unless obj.hash[:Type] == :XRef
|
287
337
|
obj
|
288
338
|
when Hash then
|
289
339
|
arr = obj.map { |key,val| [key, decrypt(ref, val)] }.flatten(1)
|
@@ -312,11 +362,15 @@ class PDF::Reader
|
|
312
362
|
# returns a nested array of object references for all pages in this object store.
|
313
363
|
#
|
314
364
|
def get_page_objects(ref)
|
315
|
-
obj =
|
365
|
+
obj = deref(ref)
|
366
|
+
|
367
|
+
unless obj.kind_of?(::Hash)
|
368
|
+
raise MalformedPDFError, "Dereferenced page object must be a dict"
|
369
|
+
end
|
316
370
|
|
317
371
|
if obj[:Type] == :Page
|
318
372
|
ref
|
319
|
-
elsif obj[:
|
373
|
+
elsif obj[:Kids]
|
320
374
|
deref(obj[:Kids]).map { |kid| get_page_objects(kid) }
|
321
375
|
end
|
322
376
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
class PDF::Reader
|
4
5
|
# Small util class for detecting the orientation of a single PDF page. Accounts
|
@@ -21,12 +22,12 @@ class PDF::Reader
|
|
21
22
|
def detect_orientation
|
22
23
|
llx,lly,urx,ury = @attributes[:MediaBox]
|
23
24
|
rotation = @attributes[:Rotate].to_i
|
24
|
-
width = urx.to_i - llx.to_i
|
25
|
-
height = ury.to_i - lly.to_i
|
25
|
+
width = (urx.to_i - llx.to_i).abs
|
26
|
+
height = (ury.to_i - lly.to_i).abs
|
26
27
|
if width > height
|
27
|
-
|
28
|
+
(rotation % 180).zero? ? 'landscape' : 'portrait'
|
28
29
|
else
|
29
|
-
|
30
|
+
(rotation % 180).zero? ? 'portrait' : 'landscape'
|
30
31
|
end
|
31
32
|
end
|
32
33
|
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
# remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
|
5
|
+
# uses slightly offset overlapping characters to achieve a fake 'bold' effect.
|
6
|
+
class OverlappingRunsFilter

  # Overlap ratio (expected to be between 0 and 1) at or above which a newly
  # opened run is treated as a duplicate of a run that is still open.
  OVERLAPPING_THRESHOLD = 0.5

  # Returns +runs+ minus every run judged redundant.
  #
  # Two event points (x and endx) are scheduled per run and visited in
  # ascending x order. A run that opens while sufficiently overlapped by an
  # already-open run is marked for exclusion.
  def self.exclude_redundant_runs(runs)
    open_runs = []
    redundant = []

    schedule = runs.flat_map do |run|
      [EventPoint.new(run.x, run), EventPoint.new(run.endx, run)]
    end
    schedule.sort! { |a, b| a.x <=> b.x }

    schedule.each do |event_point|
      run = event_point.run

      unless event_point.start?
        open_runs.delete(run)
        next
      end

      redundant << run if detect_intersection(open_runs, event_point)
      open_runs.push(run)
    end

    runs - redundant
  end

  # True when any run in +sweep_line_status+ spans the event point's x and
  # reports an intersection_area_percent with the event point's run that
  # reaches OVERLAPPING_THRESHOLD.
  def self.detect_intersection(sweep_line_status, event_point)
    sweep_line_status.any? do |open_text_run|
      event_point.x >= open_text_run.x &&
        event_point.x <= open_text_run.endx &&
        open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
    end
  end
end
|
50
|
+
|
51
|
+
# Utility class used to avoid modifying the underlying TextRun objects while we're
|
52
|
+
# looking for duplicates
|
53
|
+
class EventPoint
  attr_reader :x, :run

  # x   - the coordinate of this event
  # run - the run this event belongs to
  def initialize(x, run)
    @x = x
    @run = run
  end

  # True when this event sits at the x of its run, i.e. it marks where the
  # run opens.
  def start?
    x == run.x
  end
end
|
64
|
+
|
65
|
+
end
|
data/lib/pdf/reader/page.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
module PDF
|
4
5
|
class Reader
|
@@ -36,7 +37,7 @@ module PDF
|
|
36
37
|
@cache = options[:cache] || {}
|
37
38
|
|
38
39
|
unless @page_object.is_a?(::Hash)
|
39
|
-
raise
|
40
|
+
raise InvalidPageError, "Invalid page: #{pagenum}"
|
40
41
|
end
|
41
42
|
end
|
42
43
|
|
@@ -123,6 +124,34 @@ module PDF
|
|
123
124
|
}.join(" ")
|
124
125
|
end
|
125
126
|
|
127
|
+
# returns the angle to rotate the page clockwise. Always 0, 90, 180 or 270
|
128
|
+
#
|
129
|
+
def rotate
  # Reads attributes[:Rotate] and always returns 0, 90, 180 or 270.
  #
  # Any multiple of 90 is accepted and normalised into that range, so -90
  # becomes 270 and 450 becomes 90. A missing value, or one that is not a
  # multiple of 90, falls back to 0.
  angle = attributes[:Rotate].to_i
  (angle % 90).zero? ? angle % 360 : 0
end
|
138
|
+
|
139
|
+
# returns the "boxes" that define the page object.
|
140
|
+
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
141
|
+
#
|
142
|
+
def boxes
  # Returns a hash with the MediaBox, CropBox, BleedBox, TrimBox and ArtBox
  # of the page, each passed through objects.deref!.
  #
  # CropBox falls back to MediaBox; the bleed, trim and art boxes fall back
  # to the crop box.
  mediabox = attributes[:MediaBox]
  # The dictionary key is :CropBox (capital B), matching the key returned below.
  cropbox = attributes[:CropBox] || mediabox

  {
    MediaBox: objects.deref!(mediabox),
    CropBox: objects.deref!(cropbox),
    BleedBox: objects.deref!(attributes[:BleedBox] || cropbox),
    TrimBox: objects.deref!(attributes[:TrimBox] || cropbox),
    ArtBox: objects.deref!(attributes[:ArtBox] || cropbox)
  }
end
|
154
|
+
|
126
155
|
private
|
127
156
|
|
128
157
|
def root
|
@@ -1,4 +1,7 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'pdf/reader/overlapping_runs_filter'
|
2
5
|
|
3
6
|
class PDF::Reader
|
4
7
|
|
@@ -8,28 +11,33 @@ class PDF::Reader
|
|
8
11
|
# media box should be a 4 number array that describes the dimensions of the
|
9
12
|
# page to be rendered as described by the page's MediaBox attribute
|
10
13
|
class PageLayout
|
14
|
+
|
15
|
+
DEFAULT_FONT_SIZE = 12
|
16
|
+
|
11
17
|
def initialize(runs, mediabox)
|
12
18
|
raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
|
13
19
|
|
14
|
-
@runs = merge_runs(runs)
|
15
|
-
@mean_font_size = mean(@runs.map(&:font_size)) ||
|
20
|
+
@runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
|
21
|
+
@mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
|
22
|
+
@mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
|
16
23
|
@mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
|
17
|
-
@page_width = mediabox[2] - mediabox[0]
|
18
|
-
@page_height = mediabox[3] - mediabox[1]
|
19
|
-
@x_offset = @runs.map(&:x).sort.first
|
20
|
-
|
21
|
-
|
24
|
+
@page_width = (mediabox[2] - mediabox[0]).abs
|
25
|
+
@page_height = (mediabox[3] - mediabox[1]).abs
|
26
|
+
@x_offset = @runs.map(&:x).sort.first || 0
|
27
|
+
lowest_y = @runs.map(&:y).sort.first || 0
|
28
|
+
@y_offset = lowest_y > 0 ? 0 : lowest_y
|
22
29
|
end
|
23
30
|
|
24
31
|
def to_s
|
25
32
|
return "" if @runs.empty?
|
33
|
+
return "" if row_count == 0
|
26
34
|
|
27
35
|
page = row_count.times.map { |i| " " * col_count }
|
28
36
|
@runs.each do |run|
|
29
37
|
x_pos = ((run.x - @x_offset) / col_multiplier).round
|
30
|
-
y_pos = row_count - (run.y / row_multiplier).round
|
31
|
-
if y_pos
|
32
|
-
local_string_insert(page[y_pos], run.text, x_pos)
|
38
|
+
y_pos = row_count - ((run.y - @y_offset) / row_multiplier).round
|
39
|
+
if y_pos <= row_count && y_pos >= 0 && x_pos <= col_count && x_pos >= 0
|
40
|
+
local_string_insert(page[y_pos-1], run.text, x_pos)
|
33
41
|
end
|
34
42
|
end
|
35
43
|
interesting_rows(page).map(&:rstrip).join("\n")
|
@@ -110,21 +118,8 @@ class PDF::Reader
|
|
110
118
|
runs
|
111
119
|
end
|
112
120
|
|
113
|
-
# This is a simple alternative to String#[]=. We can't use the string
|
114
|
-
# method as it's buggy on rubinius 2.0rc1 (in 1.9 mode)
|
115
|
-
#
|
116
|
-
# See my bug report at https://github.com/rubinius/rubinius/issues/1985
|
117
121
|
def local_string_insert(haystack, needle, index)
|
118
|
-
|
119
|
-
char_count = needle.length
|
120
|
-
haystack.replace(
|
121
|
-
(haystack[0,index] || "") +
|
122
|
-
needle +
|
123
|
-
(haystack[index+char_count,500] || "")
|
124
|
-
)
|
125
|
-
else
|
126
|
-
haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
|
127
|
-
end
|
122
|
+
haystack[Range.new(index, index + needle.length - 1)] = String.new(needle)
|
128
123
|
end
|
129
124
|
end
|
130
125
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'pdf/reader/transformation_matrix'
|
4
5
|
|
@@ -29,7 +30,7 @@ class PDF::Reader
|
|
29
30
|
@xobject_stack = [page.xobjects]
|
30
31
|
@cs_stack = [page.color_spaces]
|
31
32
|
@stack = [DEFAULT_GRAPHICS_STATE.dup]
|
32
|
-
state[:ctm]
|
33
|
+
state[:ctm] = identity_matrix
|
33
34
|
end
|
34
35
|
|
35
36
|
#####################################################
|
@@ -321,11 +322,13 @@ class PDF::Reader
|
|
321
322
|
th = state[:h_scaling]
|
322
323
|
# optimise the common path to reduce Float allocations
|
323
324
|
if th == 1 && tj == 0 && tc == 0 && tw == 0
|
324
|
-
|
325
|
-
|
325
|
+
tx = w0 * fs
|
326
|
+
elsif tj != 0
|
327
|
+
# don't apply spacing to TJ displacement
|
328
|
+
tx = (w0 - (tj/1000.0)) * fs * th
|
326
329
|
else
|
327
|
-
|
328
|
-
tx =
|
330
|
+
# apply horizontal scaling to spacing values but not font size
|
331
|
+
tx = ((w0 * fs) + tc + tw) * th
|
329
332
|
end
|
330
333
|
|
331
334
|
# TODO: I'm pretty sure that tx shouldn't need to be divided by
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'forwardable'
|
4
5
|
require 'pdf/reader/page_layout'
|
@@ -40,13 +41,17 @@ module PDF
|
|
40
41
|
# starting a new page
|
41
42
|
def page=(page)
  # Reset all per-page state for the new page.
  @state = PageState.new(page)
  @page = page
  @content = []
  @characters = []
  @mediabox = page.objects.deref(page.attributes[:MediaBox])

  # Pass the two MediaBox corners ([0],[1] and [2],[3]) through
  # ctm_transform and keep the transformed box as a flat 4-element array.
  lower_left = @state.ctm_transform(@mediabox[0], @mediabox[1])
  upper_right = @state.ctm_transform(@mediabox[2], @mediabox[3])
  @device_mediabox = [lower_left.first, lower_left.last, upper_right.first, upper_right.last]
end
|
47
52
|
|
48
53
|
def content
|
49
|
-
PageLayout.new(@characters, @
|
54
|
+
PageLayout.new(@characters, @device_mediabox).to_s
|
50
55
|
end
|
51
56
|
|
52
57
|
#####################################################
|
@@ -100,6 +105,8 @@ module PDF
|
|
100
105
|
glyphs.each_with_index do |glyph_code, index|
|
101
106
|
# paint the current glyph
|
102
107
|
newx, newy = @state.trm_transform(0,0)
|
108
|
+
newx, newy = apply_rotation(newx, newy)
|
109
|
+
|
103
110
|
utf8_chars = @state.current_font.to_utf8(glyph_code)
|
104
111
|
|
105
112
|
# apply the glyph displacement for the current glyph so the next
|
@@ -114,6 +121,21 @@ module PDF
|
|
114
121
|
end
|
115
122
|
end
|
116
123
|
|
124
|
+
def apply_rotation(x, y)
  # Rotates the point (x, y) about the origin according to @page.rotate and
  # returns the new coordinates as [x, y].
  #
  # Every branch is a true rotation, consistent with the 90 degree case:
  #   90  => ( y, -x)
  #   180 => (-x, -y)
  #   270 => (-y,  x)
  # Any other value leaves the point unchanged.
  case @page.rotate
  when 90  then [y, -x]
  when 180 then [-x, -y]
  when 270 then [-y, x]
  else [x, y]
  end
end
|
138
|
+
|
117
139
|
end
|
118
140
|
end
|
119
141
|
end
|