pdf-reader 2.1.0 → 2.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +28 -1
- data/README.md +2 -2
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_text +1 -1
- data/lib/pdf-reader.rb +1 -0
- data/lib/pdf/reader.rb +2 -2
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/buffer.rb +12 -11
- data/lib/pdf/reader/cid_widths.rb +2 -0
- data/lib/pdf/reader/cmap.rb +22 -12
- data/lib/pdf/reader/encoding.rb +12 -9
- data/lib/pdf/reader/error.rb +1 -0
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
- data/lib/pdf/reader/filter/depredict.rb +1 -0
- data/lib/pdf/reader/filter/flate.rb +6 -4
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +2 -0
- data/lib/pdf/reader/filter/run_length.rb +3 -1
- data/lib/pdf/reader/font.rb +11 -2
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +1 -0
- data/lib/pdf/reader/lzw.rb +2 -1
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +22 -10
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +5 -4
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +29 -0
- data/lib/pdf/reader/page_layout.rb +10 -5
- data/lib/pdf/reader/page_state.rb +10 -1
- data/lib/pdf/reader/page_text_receiver.rb +5 -1
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +5 -4
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +1 -0
- data/lib/pdf/reader/standard_security_handler_v5.rb +2 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +25 -0
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +18 -1
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -2
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/xref.rb +11 -5
- metadata +17 -13
- data/lib/pdf/hash.rb +0 -19
data/lib/pdf/reader/cmap.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -95,25 +96,34 @@ class PDF::Reader
|
|
95
96
|
Parser.new(buffer)
|
96
97
|
end
|
97
98
|
|
99
|
+
# The following includes some manual decoding of UTF-16BE strings into unicode codepoints. In
|
100
|
+
# theory we could replace all the UTF-16 code with something based on Ruby's encoding support:
|
101
|
+
#
|
102
|
+
# str.dup.force_encoding("utf-16be").encode!("utf-8").unpack("U*")
|
103
|
+
#
|
104
|
+
# However, some cmaps contain broken surrogate pairs and the ruby encoding support raises an
|
105
|
+
# exception when we try converting broken UTF-16 to UTF-8
|
106
|
+
#
|
98
107
|
def str_to_int(str)
|
99
108
|
return nil if str.nil? || str.size == 0
|
100
|
-
unpacked_string = if str.
|
109
|
+
unpacked_string = if str.bytesize == 1 # UTF-8
|
101
110
|
str.unpack("C*")
|
102
111
|
else # UTF-16
|
103
112
|
str.unpack("n*")
|
104
113
|
end
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
114
|
+
result = []
|
115
|
+
while unpacked_string.any? do
|
116
|
+
if unpacked_string.size >= 2 && unpacked_string[0] > 0xD800 && unpacked_string[0] < 0xDBFF
|
117
|
+
# this is a Unicode UTF-16 "Surrogate Pair" see Unicode Spec. Chapter 3.7
|
118
|
+
# let's convert it to a UTF-32 codepoint (the high surrogate is between 0xD800-0xDBFF, the
|
119
|
+
# low surrogate is between 0xDC00-0xDFFF), for example: U+1D44E (U+D835 U+DC4E)
|
120
|
+
points = [unpacked_string.shift, unpacked_string.shift]
|
121
|
+
result << (points[0] - 0xD800) * 0x400 + (points[1] - 0xDC00) + 0x10000
|
122
|
+
else
|
123
|
+
result << unpacked_string.shift
|
124
|
+
end
|
116
125
|
end
|
126
|
+
result
|
117
127
|
end
|
118
128
|
|
119
129
|
def process_bfchar_instructions(instructions)
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -39,20 +40,22 @@ class PDF::Reader
|
|
39
40
|
@mapping = default_mapping # maps from character codes to Unicode codepoints
|
40
41
|
@string_cache = {} # maps from character codes to UTF-8 strings.
|
41
42
|
|
42
|
-
if enc.kind_of?(Hash)
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
enc = enc.to_sym
|
43
|
+
@enc_name = if enc.kind_of?(Hash)
|
44
|
+
enc[:Encoding] || enc[:BaseEncoding]
|
45
|
+
elsif enc && enc.respond_to?(:to_sym)
|
46
|
+
enc.to_sym
|
47
47
|
else
|
48
|
-
|
48
|
+
:StandardEncoding
|
49
49
|
end
|
50
50
|
|
51
|
-
@
|
52
|
-
@
|
53
|
-
@map_file = get_mapping_file(enc)
|
51
|
+
@unpack = get_unpack(@enc_name)
|
52
|
+
@map_file = get_mapping_file(@enc_name)
|
54
53
|
|
55
54
|
load_mapping(@map_file) if @map_file
|
55
|
+
|
56
|
+
if enc.is_a?(Hash) && enc[:Differences]
|
57
|
+
self.differences = enc[:Differences]
|
58
|
+
end
|
56
59
|
end
|
57
60
|
|
58
61
|
# set the differences table for this encoding. should be an array in the following format:
|
data/lib/pdf/reader/error.rb
CHANGED
data/lib/pdf/reader/filter.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
|
4
5
|
require 'zlib'
|
@@ -7,6 +8,8 @@ class PDF::Reader
|
|
7
8
|
module Filter # :nodoc:
|
8
9
|
# implementation of the Flate (zlib) stream filter
|
9
10
|
class Flate
|
11
|
+
ZLIB_AUTO_DETECT_ZLIB_OR_GZIP = 47 # Zlib::MAX_WBITS + 32
|
12
|
+
|
10
13
|
def initialize(options = {})
|
11
14
|
@options = options
|
12
15
|
end
|
@@ -16,16 +19,15 @@ class PDF::Reader
|
|
16
19
|
def filter(data)
|
17
20
|
deflated = nil
|
18
21
|
begin
|
19
|
-
deflated = Zlib::Inflate.new.inflate(data)
|
22
|
+
deflated = Zlib::Inflate.new(ZLIB_AUTO_DETECT_ZLIB_OR_GZIP).inflate(data)
|
20
23
|
rescue Zlib::DataError => e
|
21
24
|
# by default, Ruby's Zlib assumes the data it's inflating
|
22
|
-
# is RFC1951 deflated data, wrapped in a
|
23
|
-
#
|
25
|
+
# is RFC1951 deflated data, wrapped in a RFC1950 zlib container. If that
|
26
|
+
# fails, then use a lightly-documented 'feature' to attempt to inflate
|
24
27
|
# the data as a raw RFC1951 stream.
|
25
28
|
#
|
26
29
|
# See
|
27
30
|
# - http://blade.nagaokaut.ac.jp/cgi-bin/scat.rb/ruby/ruby-talk/243545
|
28
|
-
# - http://www.gzip.org/zlib/zlib_faq.html#faq38
|
29
31
|
deflated = Zlib::Inflate.new(-Zlib::MAX_WBITS).inflate(data)
|
30
32
|
end
|
31
33
|
Depredict.new(@options).filter(deflated)
|
@@ -1,4 +1,6 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
2
4
|
#
|
3
5
|
class PDF::Reader # :nodoc:
|
4
6
|
module Filter # :nodoc:
|
@@ -12,7 +14,7 @@ class PDF::Reader # :nodoc:
|
|
12
14
|
# Decode the specified data with the RunLengthDecode compression algorithm
|
13
15
|
def filter(data)
|
14
16
|
pos = 0
|
15
|
-
out = ""
|
17
|
+
out = "".dup
|
16
18
|
|
17
19
|
while pos < data.length
|
18
20
|
length = data.getbyte(pos)
|
data/lib/pdf/reader/font.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -96,7 +97,13 @@ class PDF::Reader
|
|
96
97
|
elsif @subtype == :Type3
|
97
98
|
PDF::Reader::WidthCalculator::TypeOneOrThree.new(self)
|
98
99
|
elsif @subtype == :TrueType
|
99
|
-
|
100
|
+
if @font_descriptor
|
101
|
+
PDF::Reader::WidthCalculator::TrueType.new(self)
|
102
|
+
else
|
103
|
+
# A TrueType font that isn't embedded. Most readers look for a version on the
|
104
|
+
# local system and fallback to a substitute. For now, we go straight to a substitute
|
105
|
+
PDF::Reader::WidthCalculator::BuiltIn.new(self)
|
106
|
+
end
|
100
107
|
elsif @subtype == :CIDFontType0 || @subtype == :CIDFontType2
|
101
108
|
PDF::Reader::WidthCalculator::Composite.new(self)
|
102
109
|
else
|
@@ -124,7 +131,9 @@ class PDF::Reader
|
|
124
131
|
if obj[:ToUnicode]
|
125
132
|
# ToUnicode is optional for Type1 and Type3
|
126
133
|
stream = @ohash.object(obj[:ToUnicode])
|
127
|
-
|
134
|
+
if stream.is_a?(PDF::Reader::Stream)
|
135
|
+
@tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
|
136
|
+
end
|
128
137
|
end
|
129
138
|
end
|
130
139
|
|
data/lib/pdf/reader/lzw.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
module PDF
|
4
5
|
|
@@ -82,7 +83,7 @@ module PDF
|
|
82
83
|
#
|
83
84
|
def self.decode(data)
|
84
85
|
stream = BitStream.new data.to_s, 9 # size of codes between 9 and 12 bits
|
85
|
-
result =
|
86
|
+
result = "".dup
|
86
87
|
until (code = stream.read) == CODE_EOD
|
87
88
|
if code == CODE_CLEAR_TABLE
|
88
89
|
stream.set_bits_in_chunk(9)
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
class PDF::Reader
|
4
5
|
# Provides low level access to the objects in a PDF file via a hash-like
|
@@ -77,16 +78,7 @@ class PDF::Reader
|
|
77
78
|
key = PDF::Reader::Reference.new(key.to_i, 0)
|
78
79
|
end
|
79
80
|
|
80
|
-
|
81
|
-
@cache[key]
|
82
|
-
elsif xref[key].is_a?(Integer)
|
83
|
-
buf = new_buffer(xref[key])
|
84
|
-
@cache[key] = decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
|
85
|
-
elsif xref[key].is_a?(PDF::Reader::Reference)
|
86
|
-
container_key = xref[key]
|
87
|
-
object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
|
88
|
-
@cache[key] = object_streams[container_key][key.id]
|
89
|
-
end
|
81
|
+
@cache[key] ||= fetch_object(key) || fetch_object_stream(key)
|
90
82
|
rescue InvalidObjectError
|
91
83
|
return default
|
92
84
|
end
|
@@ -253,6 +245,26 @@ class PDF::Reader
|
|
253
245
|
|
254
246
|
private
|
255
247
|
|
248
|
+
# parse a traditional object from the PDF, starting from the byte offset indicated
|
249
|
+
# in the xref table
|
250
|
+
#
|
251
|
+
def fetch_object(key)
|
252
|
+
if xref[key].is_a?(Integer)
|
253
|
+
buf = new_buffer(xref[key])
|
254
|
+
decrypt(key, Parser.new(buf, self).object(key.id, key.gen))
|
255
|
+
end
|
256
|
+
end
|
257
|
+
|
258
|
+
# parse an object that's embedded in an object stream in the PDF
|
259
|
+
#
|
260
|
+
def fetch_object_stream(key)
|
261
|
+
if xref[key].is_a?(PDF::Reader::Reference)
|
262
|
+
container_key = xref[key]
|
263
|
+
object_streams[container_key] ||= PDF::Reader::ObjectStream.new(object(container_key))
|
264
|
+
object_streams[container_key][key.id]
|
265
|
+
end
|
266
|
+
end
|
267
|
+
|
256
268
|
# Private implementation of deref!, which exists to ensure the `seen` argument
|
257
269
|
# isn't publicly available. It's used to avoid endless loops in the recursion, and
|
258
270
|
# doesn't need to be part of the public API.
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
class PDF::Reader
|
4
5
|
# Small util class for detecting the orientation of a single PDF page. Accounts
|
@@ -21,12 +22,12 @@ class PDF::Reader
|
|
21
22
|
def detect_orientation
|
22
23
|
llx,lly,urx,ury = @attributes[:MediaBox]
|
23
24
|
rotation = @attributes[:Rotate].to_i
|
24
|
-
width = urx.to_i - llx.to_i
|
25
|
-
height = ury.to_i - lly.to_i
|
25
|
+
width = (urx.to_i - llx.to_i).abs
|
26
|
+
height = (ury.to_i - lly.to_i).abs
|
26
27
|
if width > height
|
27
|
-
|
28
|
+
(rotation % 180).zero? ? 'landscape' : 'portrait'
|
28
29
|
else
|
29
|
-
|
30
|
+
(rotation % 180).zero? ? 'portrait' : 'landscape'
|
30
31
|
end
|
31
32
|
end
|
32
33
|
end
|
@@ -0,0 +1,65 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
# remove duplicates from a collection of TextRun objects. This can be helpful when a PDF
|
5
|
+
# uses slightly offset overlapping characters to achieve a fake 'bold' effect.
|
6
|
+
class OverlappingRunsFilter
|
7
|
+
|
8
|
+
# This should be between 0 and 1. If TextRun B obscures this much of TextRun A (and they
|
9
|
+
# have identical characters) then one will be discarded
|
10
|
+
OVERLAPPING_THRESHOLD = 0.5
|
11
|
+
|
12
|
+
def self.exclude_redundant_runs(runs)
|
13
|
+
sweep_line_status = Array.new
|
14
|
+
event_point_schedule = Array.new
|
15
|
+
to_exclude = []
|
16
|
+
|
17
|
+
runs.each do |run|
|
18
|
+
event_point_schedule << EventPoint.new(run.x, run)
|
19
|
+
event_point_schedule << EventPoint.new(run.endx, run)
|
20
|
+
end
|
21
|
+
|
22
|
+
event_point_schedule.sort! { |a,b| a.x <=> b.x }
|
23
|
+
|
24
|
+
event_point_schedule.each do |event_point|
|
25
|
+
run = event_point.run
|
26
|
+
|
27
|
+
if event_point.start?
|
28
|
+
if detect_intersection(sweep_line_status, event_point)
|
29
|
+
to_exclude << run
|
30
|
+
end
|
31
|
+
sweep_line_status.push(run)
|
32
|
+
else
|
33
|
+
sweep_line_status.delete(run)
|
34
|
+
end
|
35
|
+
end
|
36
|
+
runs - to_exclude
|
37
|
+
end
|
38
|
+
|
39
|
+
def self.detect_intersection(sweep_line_status, event_point)
|
40
|
+
sweep_line_status.each do |open_text_run|
|
41
|
+
if event_point.x >= open_text_run.x &&
|
42
|
+
event_point.x <= open_text_run.endx &&
|
43
|
+
open_text_run.intersection_area_percent(event_point.run) >= OVERLAPPING_THRESHOLD
|
44
|
+
return true
|
45
|
+
end
|
46
|
+
end
|
47
|
+
return false
|
48
|
+
end
|
49
|
+
end
|
50
|
+
|
51
|
+
# Utility class used to avoid modifying the underlying TextRun objects while we're
|
52
|
+
# looking for duplicates
|
53
|
+
class EventPoint
|
54
|
+
attr_reader :x, :run
|
55
|
+
|
56
|
+
def initialize x, run
|
57
|
+
@x, @run = x, run
|
58
|
+
end
|
59
|
+
|
60
|
+
def start?
|
61
|
+
@x == @run.x
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
end
|
data/lib/pdf/reader/page.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
module PDF
|
4
5
|
class Reader
|
@@ -123,6 +124,34 @@ module PDF
|
|
123
124
|
}.join(" ")
|
124
125
|
end
|
125
126
|
|
127
|
+
# returns the angle to rotate the page clockwise. Always 0, 90, 180 or 270
|
128
|
+
#
|
129
|
+
def rotate
|
130
|
+
value = attributes[:Rotate].to_i
|
131
|
+
case value
|
132
|
+
when 0, 90, 180, 270
|
133
|
+
value
|
134
|
+
else
|
135
|
+
0
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
# returns the "boxes" that define the page object.
|
140
|
+
# values are defaulted according to section 7.7.3.3 of the PDF Spec 1.7
|
141
|
+
#
|
142
|
+
def boxes
|
143
|
+
mediabox = attributes[:MediaBox]
|
144
|
+
cropbox = attributes[:Cropbox] || mediabox
|
145
|
+
|
146
|
+
{
|
147
|
+
MediaBox: objects.deref!(mediabox),
|
148
|
+
CropBox: objects.deref!(cropbox),
|
149
|
+
BleedBox: objects.deref!(attributes[:BleedBox] || cropbox),
|
150
|
+
TrimBox: objects.deref!(attributes[:TrimBox] || cropbox),
|
151
|
+
ArtBox: objects.deref!(attributes[:ArtBox] || cropbox)
|
152
|
+
}
|
153
|
+
end
|
154
|
+
|
126
155
|
private
|
127
156
|
|
128
157
|
def root
|