pdf-reader 2.5.0 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG +17 -0
- data/README.md +16 -1
- data/Rakefile +1 -1
- data/examples/extract_fonts.rb +12 -7
- data/lib/pdf/reader/buffer.rb +62 -21
- data/lib/pdf/reader/encoding.rb +1 -1
- data/lib/pdf/reader/error.rb +3 -3
- data/lib/pdf/reader/filter/ascii85.rb +5 -1
- data/lib/pdf/reader/filter/depredict.rb +3 -3
- data/lib/pdf/reader/glyph_hash.rb +15 -9
- data/lib/pdf/reader/glyphlist-zapfdingbats.txt +245 -0
- data/lib/pdf/reader/page_layout.rb +12 -9
- data/lib/pdf/reader/page_text_receiver.rb +2 -2
- data/lib/pdf/reader/parser.rb +8 -6
- data/lib/pdf/reader/xref.rb +6 -1
- data/lib/pdf/reader/zero_width_runs_filter.rb +11 -0
- metadata +11 -9
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ccc4d14f5820ca798f6eafa1c0978207759ec1668c6f6307acb7cd43bcd0626e
|
4
|
+
data.tar.gz: 466bfe0a91f57463a56d9697ccd2529f981c6917e4ed578b4103f2bc87065522
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 45d6c16b3d9ed029e6eb5a45cc64aa95e7ada2950e052053cbe0b6f5aae632f824a86f0505a5cee660abd1cd896177a0637a2f2f5a3f3633e829e8d46fb59817
|
7
|
+
data.tar.gz: e3e566344bd5560387577597dea20b2f7da40aed2a7fa8b8d074c0742486db59d7e349f6c38c91c8dcd9b0a8cf2aa4c19a00d0ee097003449504b3f06f18ca3c
|
data/CHANGELOG
CHANGED
@@ -1,3 +1,20 @@
|
|
1
|
+
v2.6.0 (12th November 2021)
|
2
|
+
- Text extraction improvements
|
3
|
+
- Improved text layout on pages with a variety of font sizes (http://github.com/yob/pdf-reader/pull/355)
|
4
|
+
- Fixed text positioning for some rotated pages (http://github.com/yob/pdf-reader/pull/356)
|
5
|
+
- Improved character width calculation for PDFs using built-in (non-embedded) ZapfDingbats (http://github.com/yob/pdf-reader/pull/373)
|
6
|
+
- Skip zero-width characters (http://github.com/yob/pdf-reader/pull/372)
|
7
|
+
- Performance improvements
|
8
|
+
- Reduced memory pressure when decoding TIFF images (http://github.com/yob/pdf-reader/pull/360)
|
9
|
+
- Optional dependency on ascii85_native gem for faster processing of files using the ascii85 filter (http://github.com/yob/pdf-reader/pull/359)
|
10
|
+
- Successfully parse more files
|
11
|
+
- Gracefully handle some non-spec compliant CR/LF issues (http://github.com/yob/pdf-reader/pull/364)
|
12
|
+
- Fix parsing of some escape sequences in content streams (http://github.com/yob/pdf-reader/pull/368)
|
13
|
+
- Increase the amount of junk bytes we detect and skip at the end of a file (382)
|
14
|
+
- Ignore "/Prev 0" in trailers (http://github.com/yob/pdf-reader/pull/383)
|
15
|
+
- Fix parsing of some inline images (BI ID EI tokens) (http://github.com/yob/pdf-reader/pull/389)
|
16
|
+
- Gracefully handle some xref tables that incorrectly start with 1 (http://github.com/yob/pdf-reader/pull/384)
|
17
|
+
|
1
18
|
v2.5.0 (6th June 2021)
|
2
19
|
- bump minimum ruby version to 2.0
|
3
20
|
- Correctly handle transcoding to UTF-8 from some fonts that use a difference table [#344](https://github.com/yob/pdf-reader/pull/344/)
|
data/README.md
CHANGED
@@ -166,6 +166,19 @@ http://groups.google.com/group/pdf-reader
|
|
166
166
|
The easiest way to explain how this works in practice is to show some examples.
|
167
167
|
Check out the examples/ directory for a few files.
|
168
168
|
|
169
|
+
# Alternate Decoder
|
170
|
+
|
171
|
+
For PDF files containing Ascii85 streams, the [ascii85_native](https://github.com/AnomalousBit/ascii85_native) gem can be used for increased performance. If the ascii85_native gem is detected, pdf-reader will automatically use the gem.
|
172
|
+
|
173
|
+
First, run `gem install ascii85_native` and then require the gem alongside pdf-reader:
|
174
|
+
|
175
|
+
```ruby
|
176
|
+
require "pdf-reader"
|
177
|
+
require "ascii85_native"
|
178
|
+
```
|
179
|
+
|
180
|
+
Another way of enabling native Ascii85 decoding is to place `gem 'ascii85_native'` in your project's `Gemfile`.
|
181
|
+
|
169
182
|
# Known Limitations
|
170
183
|
|
171
184
|
Occasionally some text cannot be extracted properly due to the way it has been
|
@@ -176,7 +189,9 @@ little UTF-8 friendly box to indicate an unrecognisable character.
|
|
176
189
|
|
177
190
|
* PDF::Reader Code Repository: http://github.com/yob/pdf-reader
|
178
191
|
|
179
|
-
* PDF Specification:
|
192
|
+
* PDF Specification: https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
|
193
|
+
|
194
|
+
* Adobe PDF Developer Resources: http://www.adobe.com/devnet/pdf/pdf_reference.html
|
180
195
|
|
181
196
|
* PDF Tutorial Slide Presentations: https://web.archive.org/web/20150110042057/http://home.comcast.net/~jk05/presentations/PDFTutorials.html
|
182
197
|
|
data/Rakefile
CHANGED
@@ -14,7 +14,7 @@ desc "Run cane to check quality metrics"
|
|
14
14
|
Cane::RakeTask.new(:quality) do |cane|
|
15
15
|
cane.abc_max = 20
|
16
16
|
cane.style_measure = 100
|
17
|
-
cane.max_violations =
|
17
|
+
cane.max_violations = 32
|
18
18
|
|
19
19
|
cane.use Morecane::EncodingCheck, :encoding_glob => "{app,lib,spec}/**/*.rb"
|
20
20
|
end
|
data/examples/extract_fonts.rb
CHANGED
@@ -17,8 +17,8 @@ module ExtractFonts
|
|
17
17
|
return count if page.fonts.nil? || page.fonts.empty?
|
18
18
|
|
19
19
|
page.fonts.each do |label, font|
|
20
|
-
next if complete_refs[
|
21
|
-
complete_refs[
|
20
|
+
next if complete_refs[label]
|
21
|
+
complete_refs[label] = true
|
22
22
|
|
23
23
|
process_font(page, font)
|
24
24
|
|
@@ -39,7 +39,7 @@ module ExtractFonts
|
|
39
39
|
when :TrueType, :CIDFontType2 then
|
40
40
|
ExtractFonts::TTF.new(page.objects, font).save("#{font[:BaseFont]}.ttf")
|
41
41
|
else
|
42
|
-
$stderr.puts "unsupported font type #{font[:Subtype]}"
|
42
|
+
$stderr.puts "unsupported font type #{font[:Subtype]} for #{font[:BaseFont]}"
|
43
43
|
end
|
44
44
|
end
|
45
45
|
|
@@ -68,10 +68,15 @@ module ExtractFonts
|
|
68
68
|
end
|
69
69
|
end
|
70
70
|
|
71
|
-
|
71
|
+
if ARGV.size == 0 # default file name
|
72
|
+
ARGV << File.expand_path(File.join(File.dirname(__dir__), "spec", "data", "cairo-unicode.pdf"))
|
73
|
+
end
|
74
|
+
|
72
75
|
extractor = ExtractFonts::Extractor.new
|
73
76
|
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
+
ARGV.each do |arg|
|
78
|
+
PDF::Reader.open(arg) do |reader|
|
79
|
+
page = reader.page(1)
|
80
|
+
extractor.page(page)
|
81
|
+
end
|
77
82
|
end
|
data/lib/pdf/reader/buffer.rb
CHANGED
@@ -48,6 +48,15 @@ class PDF::Reader
|
|
48
48
|
ID = "ID"
|
49
49
|
FWD_SLASH = "/"
|
50
50
|
NULL_BYTE = "\x00"
|
51
|
+
CR = "\r"
|
52
|
+
LF = "\n"
|
53
|
+
CRLF = "\r\n"
|
54
|
+
WHITE_SPACE = [LF, CR, ' ']
|
55
|
+
|
56
|
+
# Quite a few PDFs have trailing junk.
|
57
|
+
# This can be several k of nuls in some cases
|
58
|
+
# Allow for this here
|
59
|
+
TRAILING_BYTECOUNT = 5000
|
51
60
|
|
52
61
|
attr_reader :pos
|
53
62
|
|
@@ -86,9 +95,12 @@ class PDF::Reader
|
|
86
95
|
#
|
87
96
|
# options:
|
88
97
|
#
|
89
|
-
# :skip_eol - if true, the IO stream is advanced past a CRLF or LF
|
90
|
-
# is sitting under the io cursor.
|
91
|
-
#
|
98
|
+
# :skip_eol - if true, the IO stream is advanced past a CRLF, CR or LF
|
99
|
+
# that is sitting under the io cursor.
|
100
|
+
# Note:
|
101
|
+
# Skipping a bare CR is not spec-compliant.
|
102
|
+
# This is because the data may start with LF.
|
103
|
+
# However we check for CRLF first, so the ambiguity is avoided.
|
92
104
|
def read(bytes, opts = {})
|
93
105
|
reset_pos
|
94
106
|
|
@@ -97,9 +109,9 @@ class PDF::Reader
|
|
97
109
|
str = @io.read(2)
|
98
110
|
if str.nil?
|
99
111
|
return nil
|
100
|
-
elsif str ==
|
112
|
+
elsif str == CRLF # This MUST be done before checking for CR alone
|
101
113
|
# do nothing
|
102
|
-
elsif str[0,1] ==
|
114
|
+
elsif str[0, 1] == LF || str[0, 1] == CR # LF or CR alone
|
103
115
|
@io.seek(-1, IO::SEEK_CUR)
|
104
116
|
else
|
105
117
|
@io.seek(-2, IO::SEEK_CUR)
|
@@ -127,8 +139,8 @@ class PDF::Reader
|
|
127
139
|
#
|
128
140
|
def find_first_xref_offset
|
129
141
|
check_size_is_non_zero
|
130
|
-
@io.seek(-
|
131
|
-
data = @io.read(
|
142
|
+
@io.seek(-TRAILING_BYTECOUNT, IO::SEEK_END) rescue @io.seek(0)
|
143
|
+
data = @io.read(TRAILING_BYTECOUNT)
|
132
144
|
|
133
145
|
# the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
|
134
146
|
lines = data.split(/[\n\r]+/).reverse
|
@@ -217,7 +229,9 @@ class PDF::Reader
|
|
217
229
|
return if @tokens.size < 3
|
218
230
|
return if @tokens[2] != "R"
|
219
231
|
|
220
|
-
|
232
|
+
# must match whole tokens
|
233
|
+
digits_only = %r{\A\d+\z}
|
234
|
+
if @tokens[0].match(digits_only) && @tokens[1].match(digits_only)
|
221
235
|
@tokens[0] = PDF::Reader::Reference.new(@tokens[0].to_i, @tokens[1].to_i)
|
222
236
|
@tokens[1] = nil
|
223
237
|
@tokens[2] = nil
|
@@ -225,24 +239,51 @@ class PDF::Reader
|
|
225
239
|
end
|
226
240
|
end
|
227
241
|
|
242
|
+
# Extract data between ID and EI
|
243
|
+
# If the EI follows white-space the space is dropped from the data
|
244
|
+
# The EI must be followed by white-space or end of buffer
|
245
|
+
# This is to reduce the chance of accidentally matching an embedded EI
|
228
246
|
def prepare_inline_token
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
247
|
+
idstart = @io.pos
|
248
|
+
chr = prevchr = nil
|
249
|
+
eisize = 0 # how many chars in the end marker
|
250
|
+
seeking = 'E' # what are we looking for now?
|
251
|
+
loop do
|
234
252
|
chr = @io.read(1)
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
253
|
+
break if chr.nil?
|
254
|
+
case seeking
|
255
|
+
when 'E'
|
256
|
+
if chr == 'E'
|
257
|
+
seeking = 'I'
|
258
|
+
if WHITE_SPACE.include? prevchr
|
259
|
+
eisize = 3 # include whitespace in delimiter, i.e. drop from data
|
260
|
+
else # assume the EI immediately follows the data
|
261
|
+
eisize = 2 # leave prevchr in data
|
262
|
+
end
|
263
|
+
end
|
264
|
+
when 'I'
|
265
|
+
if chr == 'I'
|
266
|
+
seeking = :END
|
267
|
+
else
|
268
|
+
seeking = 'E'
|
269
|
+
end
|
270
|
+
when :END
|
271
|
+
if WHITE_SPACE.include? chr
|
272
|
+
eisize += 1 # Drop trailer
|
273
|
+
break
|
274
|
+
else
|
275
|
+
seeking = 'E'
|
276
|
+
end
|
239
277
|
end
|
278
|
+
prevchr = chr
|
240
279
|
end
|
241
|
-
|
242
|
-
|
243
|
-
|
280
|
+
unless seeking == :END
|
281
|
+
raise MalformedPDFError, "EI terminator not found"
|
282
|
+
end
|
283
|
+
eiend = @io.pos
|
284
|
+
@io.seek(idstart, IO::SEEK_SET)
|
285
|
+
str = @io.read(eiend - eisize - idstart) # get the ID content
|
244
286
|
@tokens << string_token(str)
|
245
|
-
@io.seek(-3, IO::SEEK_CUR) unless chr.nil?
|
246
287
|
end
|
247
288
|
|
248
289
|
# if we're currently inside a hex string, read hex nibbles until
|
data/lib/pdf/reader/encoding.rb
CHANGED
@@ -208,7 +208,7 @@ class PDF::Reader
|
|
208
208
|
def load_mapping(file)
|
209
209
|
File.open(file, "r:BINARY") do |f|
|
210
210
|
f.each do |l|
|
211
|
-
_m, single_byte, unicode = *l.match(
|
211
|
+
_m, single_byte, unicode = *l.match(/\A([0-9A-Za-z]+);([0-9A-F]{4})/)
|
212
212
|
@mapping["0x#{single_byte}".hex] = "0x#{unicode}".hex if single_byte
|
213
213
|
end
|
214
214
|
end
|
data/lib/pdf/reader/error.rb
CHANGED
@@ -33,17 +33,17 @@ class PDF::Reader
|
|
33
33
|
def self.str_assert(lvalue, rvalue, chars=nil)
|
34
34
|
raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
|
35
35
|
lvalue = lvalue[0,chars] if chars
|
36
|
-
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue != rvalue
|
36
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
|
37
37
|
end
|
38
38
|
################################################################################
|
39
39
|
def self.str_assert_not(lvalue, rvalue, chars=nil)
|
40
40
|
raise MalformedPDFError, "PDF malformed, expected string but found #{lvalue.class} instead" if chars and !lvalue.kind_of?(String)
|
41
41
|
lvalue = lvalue[0,chars] if chars
|
42
|
-
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found #{lvalue} instead" if lvalue == rvalue
|
42
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue == rvalue
|
43
43
|
end
|
44
44
|
################################################################################
|
45
45
|
def self.assert_equal(lvalue, rvalue)
|
46
|
-
raise MalformedPDFError, "PDF malformed, expected #{rvalue} but found #{lvalue} instead" if lvalue != rvalue
|
46
|
+
raise MalformedPDFError, "PDF malformed, expected '#{rvalue}' but found '#{lvalue}' instead" if lvalue != rvalue
|
47
47
|
end
|
48
48
|
################################################################################
|
49
49
|
end
|
@@ -17,7 +17,11 @@ class PDF::Reader
|
|
17
17
|
#
|
18
18
|
def filter(data)
|
19
19
|
data = "<~#{data}" unless data.to_s[0,2] == "<~"
|
20
|
-
::
|
20
|
+
if defined?(::Ascii85Native)
|
21
|
+
::Ascii85Native::decode(data)
|
22
|
+
else
|
23
|
+
::Ascii85::decode(data)
|
24
|
+
end
|
21
25
|
rescue Exception => e
|
22
26
|
# Oops, there was a problem decoding the stream
|
23
27
|
raise MalformedPDFError,
|
@@ -34,7 +34,7 @@ class PDF::Reader
|
|
34
34
|
################################################################################
|
35
35
|
def tiff_depredict(data)
|
36
36
|
data = data.unpack("C*")
|
37
|
-
unfiltered =
|
37
|
+
unfiltered = ''
|
38
38
|
bpc = @options[:BitsPerComponent] || 8
|
39
39
|
pixel_bits = bpc * @options[:Colors]
|
40
40
|
pixel_bytes = pixel_bits / 8
|
@@ -51,11 +51,11 @@ class PDF::Reader
|
|
51
51
|
left = index < pixel_bytes ? 0 : row_data[index - pixel_bytes]
|
52
52
|
row_data[index] = (byte + left) % 256
|
53
53
|
end
|
54
|
-
unfiltered += row_data
|
54
|
+
unfiltered += row_data.pack("C*")
|
55
55
|
pos += line_len
|
56
56
|
end
|
57
57
|
|
58
|
-
unfiltered
|
58
|
+
unfiltered
|
59
59
|
end
|
60
60
|
################################################################################
|
61
61
|
def png_depredict(data)
|
@@ -103,19 +103,25 @@ class PDF::Reader
|
|
103
103
|
|
104
104
|
# returns a hash that maps glyph names to unicode codepoints. The mapping is based on
|
105
105
|
# a text file supplied by Adobe at:
|
106
|
-
#
|
106
|
+
# https://github.com/adobe-type-tools/agl-aglfn
|
107
107
|
def load_adobe_glyph_mapping
|
108
108
|
keyed_by_name = {}
|
109
109
|
keyed_by_codepoint = {}
|
110
110
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
111
|
+
paths = [
|
112
|
+
File.dirname(__FILE__) + "/glyphlist.txt",
|
113
|
+
File.dirname(__FILE__) + "/glyphlist-zapfdingbats.txt",
|
114
|
+
]
|
115
|
+
paths.each do |path|
|
116
|
+
File.open(path, "r:BINARY") do |f|
|
117
|
+
f.each do |l|
|
118
|
+
_m, name, code = *l.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
|
119
|
+
if name && code
|
120
|
+
cp = "0x#{code}".hex
|
121
|
+
keyed_by_name[name.to_sym] = cp
|
122
|
+
keyed_by_codepoint[cp] ||= []
|
123
|
+
keyed_by_codepoint[cp] << name.to_sym
|
124
|
+
end
|
119
125
|
end
|
120
126
|
end
|
121
127
|
end
|
@@ -0,0 +1,245 @@
|
|
1
|
+
# -----------------------------------------------------------
|
2
|
+
# Copyright 2002-2019 Adobe (http://www.adobe.com/).
|
3
|
+
#
|
4
|
+
# Redistribution and use in source and binary forms, with or
|
5
|
+
# without modification, are permitted provided that the
|
6
|
+
# following conditions are met:
|
7
|
+
#
|
8
|
+
# Redistributions of source code must retain the above
|
9
|
+
# copyright notice, this list of conditions and the following
|
10
|
+
# disclaimer.
|
11
|
+
#
|
12
|
+
# Redistributions in binary form must reproduce the above
|
13
|
+
# copyright notice, this list of conditions and the following
|
14
|
+
# disclaimer in the documentation and/or other materials
|
15
|
+
# provided with the distribution.
|
16
|
+
#
|
17
|
+
# Neither the name of Adobe nor the names of its contributors
|
18
|
+
# may be used to endorse or promote products derived from this
|
19
|
+
# software without specific prior written permission.
|
20
|
+
#
|
21
|
+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
|
22
|
+
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
23
|
+
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
24
|
+
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
25
|
+
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
26
|
+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
27
|
+
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
28
|
+
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
29
|
+
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
30
|
+
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
31
|
+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
32
|
+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
33
|
+
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
34
|
+
# -----------------------------------------------------------
|
35
|
+
# Name: ITC Zapf Dingbats Glyph List
|
36
|
+
# Table version: 2.0
|
37
|
+
# Date: September 20, 2002
|
38
|
+
# URL: https://github.com/adobe-type-tools/agl-aglfn
|
39
|
+
#
|
40
|
+
# Format: two semicolon-delimited fields:
|
41
|
+
# (1) glyph name--upper/lowercase letters and digits
|
42
|
+
# (2) Unicode scalar value--four uppercase hexadecimal digits
|
43
|
+
#
|
44
|
+
a100;275E
|
45
|
+
a101;2761
|
46
|
+
a102;2762
|
47
|
+
a103;2763
|
48
|
+
a104;2764
|
49
|
+
a105;2710
|
50
|
+
a106;2765
|
51
|
+
a107;2766
|
52
|
+
a108;2767
|
53
|
+
a109;2660
|
54
|
+
a10;2721
|
55
|
+
a110;2665
|
56
|
+
a111;2666
|
57
|
+
a112;2663
|
58
|
+
a117;2709
|
59
|
+
a118;2708
|
60
|
+
a119;2707
|
61
|
+
a11;261B
|
62
|
+
a120;2460
|
63
|
+
a121;2461
|
64
|
+
a122;2462
|
65
|
+
a123;2463
|
66
|
+
a124;2464
|
67
|
+
a125;2465
|
68
|
+
a126;2466
|
69
|
+
a127;2467
|
70
|
+
a128;2468
|
71
|
+
a129;2469
|
72
|
+
a12;261E
|
73
|
+
a130;2776
|
74
|
+
a131;2777
|
75
|
+
a132;2778
|
76
|
+
a133;2779
|
77
|
+
a134;277A
|
78
|
+
a135;277B
|
79
|
+
a136;277C
|
80
|
+
a137;277D
|
81
|
+
a138;277E
|
82
|
+
a139;277F
|
83
|
+
a13;270C
|
84
|
+
a140;2780
|
85
|
+
a141;2781
|
86
|
+
a142;2782
|
87
|
+
a143;2783
|
88
|
+
a144;2784
|
89
|
+
a145;2785
|
90
|
+
a146;2786
|
91
|
+
a147;2787
|
92
|
+
a148;2788
|
93
|
+
a149;2789
|
94
|
+
a14;270D
|
95
|
+
a150;278A
|
96
|
+
a151;278B
|
97
|
+
a152;278C
|
98
|
+
a153;278D
|
99
|
+
a154;278E
|
100
|
+
a155;278F
|
101
|
+
a156;2790
|
102
|
+
a157;2791
|
103
|
+
a158;2792
|
104
|
+
a159;2793
|
105
|
+
a15;270E
|
106
|
+
a160;2794
|
107
|
+
a161;2192
|
108
|
+
a162;27A3
|
109
|
+
a163;2194
|
110
|
+
a164;2195
|
111
|
+
a165;2799
|
112
|
+
a166;279B
|
113
|
+
a167;279C
|
114
|
+
a168;279D
|
115
|
+
a169;279E
|
116
|
+
a16;270F
|
117
|
+
a170;279F
|
118
|
+
a171;27A0
|
119
|
+
a172;27A1
|
120
|
+
a173;27A2
|
121
|
+
a174;27A4
|
122
|
+
a175;27A5
|
123
|
+
a176;27A6
|
124
|
+
a177;27A7
|
125
|
+
a178;27A8
|
126
|
+
a179;27A9
|
127
|
+
a17;2711
|
128
|
+
a180;27AB
|
129
|
+
a181;27AD
|
130
|
+
a182;27AF
|
131
|
+
a183;27B2
|
132
|
+
a184;27B3
|
133
|
+
a185;27B5
|
134
|
+
a186;27B8
|
135
|
+
a187;27BA
|
136
|
+
a188;27BB
|
137
|
+
a189;27BC
|
138
|
+
a18;2712
|
139
|
+
a190;27BD
|
140
|
+
a191;27BE
|
141
|
+
a192;279A
|
142
|
+
a193;27AA
|
143
|
+
a194;27B6
|
144
|
+
a195;27B9
|
145
|
+
a196;2798
|
146
|
+
a197;27B4
|
147
|
+
a198;27B7
|
148
|
+
a199;27AC
|
149
|
+
a19;2713
|
150
|
+
a1;2701
|
151
|
+
a200;27AE
|
152
|
+
a201;27B1
|
153
|
+
a202;2703
|
154
|
+
a203;2750
|
155
|
+
a204;2752
|
156
|
+
a205;276E
|
157
|
+
a206;2770
|
158
|
+
a20;2714
|
159
|
+
a21;2715
|
160
|
+
a22;2716
|
161
|
+
a23;2717
|
162
|
+
a24;2718
|
163
|
+
a25;2719
|
164
|
+
a26;271A
|
165
|
+
a27;271B
|
166
|
+
a28;271C
|
167
|
+
a29;2722
|
168
|
+
a2;2702
|
169
|
+
a30;2723
|
170
|
+
a31;2724
|
171
|
+
a32;2725
|
172
|
+
a33;2726
|
173
|
+
a34;2727
|
174
|
+
a35;2605
|
175
|
+
a36;2729
|
176
|
+
a37;272A
|
177
|
+
a38;272B
|
178
|
+
a39;272C
|
179
|
+
a3;2704
|
180
|
+
a40;272D
|
181
|
+
a41;272E
|
182
|
+
a42;272F
|
183
|
+
a43;2730
|
184
|
+
a44;2731
|
185
|
+
a45;2732
|
186
|
+
a46;2733
|
187
|
+
a47;2734
|
188
|
+
a48;2735
|
189
|
+
a49;2736
|
190
|
+
a4;260E
|
191
|
+
a50;2737
|
192
|
+
a51;2738
|
193
|
+
a52;2739
|
194
|
+
a53;273A
|
195
|
+
a54;273B
|
196
|
+
a55;273C
|
197
|
+
a56;273D
|
198
|
+
a57;273E
|
199
|
+
a58;273F
|
200
|
+
a59;2740
|
201
|
+
a5;2706
|
202
|
+
a60;2741
|
203
|
+
a61;2742
|
204
|
+
a62;2743
|
205
|
+
a63;2744
|
206
|
+
a64;2745
|
207
|
+
a65;2746
|
208
|
+
a66;2747
|
209
|
+
a67;2748
|
210
|
+
a68;2749
|
211
|
+
a69;274A
|
212
|
+
a6;271D
|
213
|
+
a70;274B
|
214
|
+
a71;25CF
|
215
|
+
a72;274D
|
216
|
+
a73;25A0
|
217
|
+
a74;274F
|
218
|
+
a75;2751
|
219
|
+
a76;25B2
|
220
|
+
a77;25BC
|
221
|
+
a78;25C6
|
222
|
+
a79;2756
|
223
|
+
a7;271E
|
224
|
+
a81;25D7
|
225
|
+
a82;2758
|
226
|
+
a83;2759
|
227
|
+
a84;275A
|
228
|
+
a85;276F
|
229
|
+
a86;2771
|
230
|
+
a87;2772
|
231
|
+
a88;2773
|
232
|
+
a89;2768
|
233
|
+
a8;271F
|
234
|
+
a90;2769
|
235
|
+
a91;276C
|
236
|
+
a92;276D
|
237
|
+
a93;276A
|
238
|
+
a94;276B
|
239
|
+
a95;2774
|
240
|
+
a96;2775
|
241
|
+
a97;275B
|
242
|
+
a98;275C
|
243
|
+
a99;275D
|
244
|
+
a9;2720
|
245
|
+
# END
|
@@ -2,6 +2,7 @@
|
|
2
2
|
# frozen_string_literal: true
|
3
3
|
|
4
4
|
require 'pdf/reader/overlapping_runs_filter'
|
5
|
+
require 'pdf/reader/zero_width_runs_filter'
|
5
6
|
|
6
7
|
class PDF::Reader
|
7
8
|
|
@@ -17,10 +18,12 @@ class PDF::Reader
|
|
17
18
|
def initialize(runs, mediabox)
|
18
19
|
raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
|
19
20
|
|
20
|
-
|
21
|
+
runs = ZeroWidthRunsFilter.exclude_zero_width_runs(runs)
|
22
|
+
runs = OverlappingRunsFilter.exclude_redundant_runs(runs)
|
23
|
+
@runs = merge_runs(runs)
|
21
24
|
@mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
|
22
25
|
@mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
|
23
|
-
@
|
26
|
+
@median_glyph_width = median(@runs.map(&:mean_character_width)) || 0
|
24
27
|
@page_width = (mediabox[2] - mediabox[0]).abs
|
25
28
|
@page_height = (mediabox[3] - mediabox[1]).abs
|
26
29
|
@x_offset = @runs.map(&:x).sort.first || 0
|
@@ -67,7 +70,7 @@ class PDF::Reader
|
|
67
70
|
end
|
68
71
|
|
69
72
|
def col_count
|
70
|
-
@col_count ||= ((@page_width / @
|
73
|
+
@col_count ||= ((@page_width / @median_glyph_width) * 1.05).floor
|
71
74
|
end
|
72
75
|
|
73
76
|
def row_multiplier
|
@@ -86,12 +89,12 @@ class PDF::Reader
|
|
86
89
|
end
|
87
90
|
end
|
88
91
|
|
89
|
-
def
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
92
|
+
def median(collection)
|
93
|
+
if collection.size == 0
|
94
|
+
0
|
95
|
+
else
|
96
|
+
collection.sort[(collection.size * 0.5).floor]
|
97
|
+
end
|
95
98
|
end
|
96
99
|
|
97
100
|
# take a collection of TextRun objects and merge any that are in close
|
@@ -45,8 +45,8 @@ module PDF
|
|
45
45
|
@content = []
|
46
46
|
@characters = []
|
47
47
|
@mediabox = page.objects.deref(page.attributes[:MediaBox])
|
48
|
-
device_bl =
|
49
|
-
device_tr =
|
48
|
+
device_bl = apply_rotation(*@state.ctm_transform(@mediabox[0], @mediabox[1]))
|
49
|
+
device_tr = apply_rotation(*@state.ctm_transform(@mediabox[2], @mediabox[3]))
|
50
50
|
@device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
|
51
51
|
end
|
52
52
|
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -175,15 +175,18 @@ class PDF::Reader
|
|
175
175
|
return "".dup.force_encoding("binary") if str == ")"
|
176
176
|
Error.assert_equal(parse_token, ")")
|
177
177
|
|
178
|
-
str.gsub!(/\\([nrtbf()\\\n]
|
179
|
-
|
178
|
+
str.gsub!(/\\(\r\n|[nrtbf()\\\n\r]|([0-7]{1,3}))?|\r\n?/m) do |match|
|
179
|
+
if $2.nil? # not octal digits
|
180
|
+
MAPPING[match] || "".dup
|
181
|
+
else # must be octal digits
|
182
|
+
($2.oct & 0xff).chr # ignore high level overflow
|
183
|
+
end
|
180
184
|
end
|
181
185
|
str.force_encoding("binary")
|
182
186
|
end
|
183
187
|
|
184
188
|
MAPPING = {
|
185
189
|
"\r" => "\n",
|
186
|
-
"\n\r" => "\n",
|
187
190
|
"\r\n" => "\n",
|
188
191
|
"\\n" => "\n",
|
189
192
|
"\\r" => "\r",
|
@@ -194,10 +197,9 @@ class PDF::Reader
|
|
194
197
|
"\\)" => ")",
|
195
198
|
"\\\\" => "\\",
|
196
199
|
"\\\n" => "",
|
200
|
+
"\\\r" => "",
|
201
|
+
"\\\r\n" => "",
|
197
202
|
}
|
198
|
-
0.upto(9) { |n| MAPPING["\\00"+n.to_s] = ("00"+n.to_s).oct.chr }
|
199
|
-
0.upto(99) { |n| MAPPING["\\0"+n.to_s] = ("0"+n.to_s).oct.chr }
|
200
|
-
0.upto(377) { |n| MAPPING["\\"+n.to_s] = n.to_s.oct.chr }
|
201
203
|
|
202
204
|
################################################################################
|
203
205
|
# Decodes the contents of a PDF Stream and returns it as a Ruby String.
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -131,6 +131,9 @@ class PDF::Reader
|
|
131
131
|
generation = buf.token.to_i
|
132
132
|
state = buf.token
|
133
133
|
|
134
|
+
# Some PDF writers start numbering at 1 instead of 0. Fix up the number.
|
135
|
+
# TODO should this fix be logged?
|
136
|
+
objid = 0 if objid == 1 and offset == 0 and generation == 65535 and state == 'f'
|
134
137
|
store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
|
135
138
|
objid += 1
|
136
139
|
params.clear
|
@@ -146,7 +149,9 @@ class PDF::Reader
|
|
146
149
|
end
|
147
150
|
|
148
151
|
load_offsets(trailer[:XRefStm]) if trailer.has_key?(:XRefStm)
|
149
|
-
|
152
|
+
# Some PDF creators seem to use '/Prev 0' in trailer if there is no previous xref
|
153
|
+
# It's not possible for an xref to appear at offset 0, so can safely skip the ref
|
154
|
+
load_offsets(trailer[:Prev].to_i) if trailer.has_key?(:Prev) and trailer[:Prev].to_i != 0
|
150
155
|
|
151
156
|
trailer
|
152
157
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.
|
4
|
+
version: 2.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-
|
11
|
+
date: 2021-11-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -239,6 +239,7 @@ files:
|
|
239
239
|
- lib/pdf/reader/font_descriptor.rb
|
240
240
|
- lib/pdf/reader/form_xobject.rb
|
241
241
|
- lib/pdf/reader/glyph_hash.rb
|
242
|
+
- lib/pdf/reader/glyphlist-zapfdingbats.txt
|
242
243
|
- lib/pdf/reader/glyphlist.txt
|
243
244
|
- lib/pdf/reader/lzw.rb
|
244
245
|
- lib/pdf/reader/null_security_handler.rb
|
@@ -272,15 +273,16 @@ files:
|
|
272
273
|
- lib/pdf/reader/width_calculator/type_one_or_three.rb
|
273
274
|
- lib/pdf/reader/width_calculator/type_zero.rb
|
274
275
|
- lib/pdf/reader/xref.rb
|
276
|
+
- lib/pdf/reader/zero_width_runs_filter.rb
|
275
277
|
homepage: https://github.com/yob/pdf-reader
|
276
278
|
licenses:
|
277
279
|
- MIT
|
278
280
|
metadata:
|
279
281
|
bug_tracker_uri: https://github.com/yob/pdf-reader/issues
|
280
|
-
changelog_uri: https://github.com/yob/pdf-reader/blob/v2.
|
281
|
-
documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.
|
282
|
-
source_code_uri: https://github.com/yob/pdf-reader/tree/v2.
|
283
|
-
post_install_message:
|
282
|
+
changelog_uri: https://github.com/yob/pdf-reader/blob/v2.6.0/CHANGELOG
|
283
|
+
documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.6.0
|
284
|
+
source_code_uri: https://github.com/yob/pdf-reader/tree/v2.6.0
|
285
|
+
post_install_message:
|
284
286
|
rdoc_options:
|
285
287
|
- "--title"
|
286
288
|
- PDF::Reader Documentation
|
@@ -300,8 +302,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
300
302
|
- !ruby/object:Gem::Version
|
301
303
|
version: '0'
|
302
304
|
requirements: []
|
303
|
-
rubygems_version: 3.
|
304
|
-
signing_key:
|
305
|
+
rubygems_version: 3.1.4
|
306
|
+
signing_key:
|
305
307
|
specification_version: 4
|
306
308
|
summary: A library for accessing the content of PDF files
|
307
309
|
test_files: []
|