pdf-reader 2.1.0 → 2.4.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG +28 -1
- data/README.md +2 -2
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_text +1 -1
- data/lib/pdf-reader.rb +1 -0
- data/lib/pdf/reader.rb +2 -2
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/buffer.rb +12 -11
- data/lib/pdf/reader/cid_widths.rb +2 -0
- data/lib/pdf/reader/cmap.rb +22 -12
- data/lib/pdf/reader/encoding.rb +12 -9
- data/lib/pdf/reader/error.rb +1 -0
- data/lib/pdf/reader/filter.rb +1 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
- data/lib/pdf/reader/filter/depredict.rb +1 -0
- data/lib/pdf/reader/filter/flate.rb +6 -4
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +2 -0
- data/lib/pdf/reader/filter/run_length.rb +3 -1
- data/lib/pdf/reader/font.rb +11 -2
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +1 -0
- data/lib/pdf/reader/lzw.rb +2 -1
- data/lib/pdf/reader/null_security_handler.rb +1 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +22 -10
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +5 -4
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +29 -0
- data/lib/pdf/reader/page_layout.rb +10 -5
- data/lib/pdf/reader/page_state.rb +10 -1
- data/lib/pdf/reader/page_text_receiver.rb +5 -1
- data/lib/pdf/reader/pages_strategy.rb +1 -0
- data/lib/pdf/reader/parser.rb +5 -4
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +1 -0
- data/lib/pdf/reader/standard_security_handler_v5.rb +2 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +25 -0
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +18 -1
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -2
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/xref.rb +11 -5
- metadata +17 -13
- data/lib/pdf/hash.rb +0 -19
@@ -1,4 +1,7 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'pdf/reader/overlapping_runs_filter'
|
2
5
|
|
3
6
|
class PDF::Reader
|
4
7
|
|
@@ -14,13 +17,15 @@ class PDF::Reader
|
|
14
17
|
def initialize(runs, mediabox)
|
15
18
|
raise ArgumentError, "a mediabox must be provided" if mediabox.nil?
|
16
19
|
|
17
|
-
@runs = merge_runs(runs)
|
20
|
+
@runs = merge_runs(OverlappingRunsFilter.exclude_redundant_runs(runs))
|
18
21
|
@mean_font_size = mean(@runs.map(&:font_size)) || DEFAULT_FONT_SIZE
|
19
22
|
@mean_font_size = DEFAULT_FONT_SIZE if @mean_font_size == 0
|
20
23
|
@mean_glyph_width = mean(@runs.map(&:mean_character_width)) || 0
|
21
|
-
@page_width = mediabox[2] - mediabox[0]
|
22
|
-
@page_height = mediabox[3] - mediabox[1]
|
23
|
-
@x_offset = @runs.map(&:x).sort.first
|
24
|
+
@page_width = (mediabox[2] - mediabox[0]).abs
|
25
|
+
@page_height = (mediabox[3] - mediabox[1]).abs
|
26
|
+
@x_offset = @runs.map(&:x).sort.first || 0
|
27
|
+
lowest_y = @runs.map(&:y).sort.first || 0
|
28
|
+
@y_offset = lowest_y > 0 ? 0 : lowest_y
|
24
29
|
end
|
25
30
|
|
26
31
|
def to_s
|
@@ -29,7 +34,7 @@ class PDF::Reader
|
|
29
34
|
page = row_count.times.map { |i| " " * col_count }
|
30
35
|
@runs.each do |run|
|
31
36
|
x_pos = ((run.x - @x_offset) / col_multiplier).round
|
32
|
-
y_pos = row_count - (run.y / row_multiplier).round
|
37
|
+
y_pos = row_count - ((run.y - @y_offset) / row_multiplier).round
|
33
38
|
if y_pos <= row_count && y_pos >= 0 && x_pos <= col_count && x_pos >= 0
|
34
39
|
local_string_insert(page[y_pos-1], run.text, x_pos)
|
35
40
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'pdf/reader/transformation_matrix'
|
4
5
|
|
@@ -29,7 +30,15 @@ class PDF::Reader
|
|
29
30
|
@xobject_stack = [page.xobjects]
|
30
31
|
@cs_stack = [page.color_spaces]
|
31
32
|
@stack = [DEFAULT_GRAPHICS_STATE.dup]
|
32
|
-
|
33
|
+
if page.rotate == 0
|
34
|
+
state[:ctm] = identity_matrix
|
35
|
+
else
|
36
|
+
rotate_cos = Math.cos(page.rotate * (Math::PI/180.0)).round(2)
|
37
|
+
rotate_sin = Math.sin(page.rotate * (Math::PI/180.0)).round(2)
|
38
|
+
state[:ctm] = TransformationMatrix.new(rotate_cos, rotate_sin,
|
39
|
+
rotate_sin * -1, rotate_cos,
|
40
|
+
0, 0)
|
41
|
+
end
|
33
42
|
end
|
34
43
|
|
35
44
|
#####################################################
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'forwardable'
|
4
5
|
require 'pdf/reader/page_layout'
|
@@ -43,10 +44,13 @@ module PDF
|
|
43
44
|
@content = []
|
44
45
|
@characters = []
|
45
46
|
@mediabox = page.objects.deref(page.attributes[:MediaBox])
|
47
|
+
device_bl = @state.ctm_transform(@mediabox[0], @mediabox[1])
|
48
|
+
device_tr = @state.ctm_transform(@mediabox[2], @mediabox[3])
|
49
|
+
@device_mediabox = [ device_bl.first, device_bl.last, device_tr.first, device_tr.last]
|
46
50
|
end
|
47
51
|
|
48
52
|
def content
|
49
|
-
PageLayout.new(@characters, @mediabox).to_s
|
53
|
+
PageLayout.new(@characters, @device_mediabox).to_s
|
50
54
|
end
|
51
55
|
|
52
56
|
#####################################################
|
data/lib/pdf/reader/parser.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -132,7 +133,7 @@ class PDF::Reader
|
|
132
133
|
# reads a PDF name from the buffer and converts it to a Ruby Symbol
|
133
134
|
def pdf_name
|
134
135
|
tok = @buffer.token
|
135
|
-
tok.gsub!(/#([A-Fa-f0-9]{2})/) do |match|
|
136
|
+
tok = tok.dup.gsub(/#([A-Fa-f0-9]{2})/) do |match|
|
136
137
|
match[1, 2].hex.chr
|
137
138
|
end
|
138
139
|
tok.to_sym
|
@@ -154,7 +155,7 @@ class PDF::Reader
|
|
154
155
|
################################################################################
|
155
156
|
# Reads a PDF hex string from the buffer and converts it to a Ruby String
|
156
157
|
def hex_string
|
157
|
-
str = ""
|
158
|
+
str = "".dup
|
158
159
|
|
159
160
|
loop do
|
160
161
|
token = @buffer.token
|
@@ -171,11 +172,11 @@ class PDF::Reader
|
|
171
172
|
# Reads a PDF String from the buffer and converts it to a Ruby String
|
172
173
|
def string
|
173
174
|
str = @buffer.token
|
174
|
-
return "".force_encoding("binary") if str == ")"
|
175
|
+
return "".dup.force_encoding("binary") if str == ")"
|
175
176
|
Error.assert_equal(parse_token, ")")
|
176
177
|
|
177
178
|
str.gsub!(/\\([nrtbf()\\\n]|\d{1,3})?|\r\n?|\n\r/m) do |match|
|
178
|
-
MAPPING[match] || ""
|
179
|
+
MAPPING[match] || "".dup
|
179
180
|
end
|
180
181
|
str.force_encoding("binary")
|
181
182
|
end
|
data/lib/pdf/reader/reference.rb
CHANGED
data/lib/pdf/reader/stream.rb
CHANGED
data/lib/pdf/reader/text_run.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
class PDF::Reader
|
4
5
|
# A value object that represents one or more consecutive characters on a page.
|
@@ -37,6 +38,10 @@ class PDF::Reader
|
|
37
38
|
@endx ||= x + width
|
38
39
|
end
|
39
40
|
|
41
|
+
def endy
|
42
|
+
@endy ||= y + font_size
|
43
|
+
end
|
44
|
+
|
40
45
|
def mean_character_width
|
41
46
|
@width / character_count
|
42
47
|
end
|
@@ -59,8 +64,28 @@ class PDF::Reader
|
|
59
64
|
"#{text} w:#{width} f:#{font_size} @#{x},#{y}"
|
60
65
|
end
|
61
66
|
|
67
|
+
def intersect?(other_run)
|
68
|
+
x <= other_run.endx && endx >= other_run.x &&
|
69
|
+
endy >= other_run.y && y <= other_run.endy
|
70
|
+
end
|
71
|
+
|
72
|
+
# return what percentage of this text run is overlapped by another run
|
73
|
+
def intersection_area_percent(other_run)
|
74
|
+
return 0 unless intersect?(other_run)
|
75
|
+
|
76
|
+
dx = [endx, other_run.endx].min - [x, other_run.x].max
|
77
|
+
dy = [endy, other_run.endy].min - [y, other_run.y].max
|
78
|
+
intersection_area = dx*dy
|
79
|
+
|
80
|
+
intersection_area.to_f / area
|
81
|
+
end
|
82
|
+
|
62
83
|
private
|
63
84
|
|
85
|
+
def area
|
86
|
+
(endx - x) * (endy - y)
|
87
|
+
end
|
88
|
+
|
64
89
|
def mergable_range
|
65
90
|
@mergable_range ||= Range.new(endx - 3, endx + font_size)
|
66
91
|
end
|
data/lib/pdf/reader/token.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
require 'afm'
|
4
5
|
require 'pdf/reader/synchronized_cache'
|
@@ -11,11 +12,20 @@ class PDF::Reader
|
|
11
12
|
# see Section 9.6.2.2, PDF 32000-1:2008, pp 256
|
12
13
|
class BuiltIn
|
13
14
|
|
15
|
+
BUILTINS = [
|
16
|
+
:Courier, :"Courier-Bold", :"Courier-BoldOblique", :"Courier-Oblique",
|
17
|
+
:Helvetica, :"Helvetica-Bold", :"Helvetica-BoldOblique", :"Helvetica-Oblique",
|
18
|
+
:Symbol,
|
19
|
+
:"Times-Roman", :"Times-Bold", :"Times-BoldItalic", :"Times-Italic",
|
20
|
+
:ZapfDingbats
|
21
|
+
]
|
22
|
+
|
14
23
|
def initialize(font)
|
15
24
|
@font = font
|
16
25
|
@@all_metrics ||= PDF::Reader::SynchronizedCache.new
|
17
26
|
|
18
|
-
|
27
|
+
basefont = extract_basefont(font.basefont)
|
28
|
+
metrics_path = File.join(File.dirname(__FILE__), "..","afm","#{basefont}.afm")
|
19
29
|
|
20
30
|
if File.file?(metrics_path)
|
21
31
|
@metrics = @@all_metrics[metrics_path] ||= AFM::Font.new(metrics_path)
|
@@ -53,6 +63,13 @@ class PDF::Reader
|
|
53
63
|
@font.encoding.int_to_name(code_point).first.to_s[/\Acontrol..\Z/]
|
54
64
|
end
|
55
65
|
|
66
|
+
def extract_basefont(font_name)
|
67
|
+
if BUILTINS.include?(font_name)
|
68
|
+
font_name
|
69
|
+
else
|
70
|
+
"Times-Roman"
|
71
|
+
end
|
72
|
+
end
|
56
73
|
end
|
57
74
|
end
|
58
75
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
class PDF::Reader
|
4
5
|
module WidthCalculator
|
@@ -17,8 +18,7 @@ class PDF::Reader
|
|
17
18
|
|
18
19
|
def glyph_width(code_point)
|
19
20
|
return 0 if code_point.nil? || code_point < 0
|
20
|
-
|
21
|
-
glyph_width_from_font(code_point) || glyph_width_from_descriptor(code_point)
|
21
|
+
glyph_width_from_font(code_point) || glyph_width_from_descriptor(code_point) || 0
|
22
22
|
end
|
23
23
|
|
24
24
|
private
|
data/lib/pdf/reader/xref.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
2
3
|
|
3
4
|
################################################################################
|
4
5
|
#
|
@@ -203,8 +204,10 @@ class PDF::Reader
|
|
203
204
|
("\x00" + bytes).unpack("N")[0]
|
204
205
|
elsif bytes.size == 4
|
205
206
|
bytes.unpack("N")[0]
|
207
|
+
elsif bytes.size == 8
|
208
|
+
bytes.unpack("Q>")[0]
|
206
209
|
else
|
207
|
-
raise UnsupportedFeatureError, "Unable to unpack xref stream entries with more than 4 bytes"
|
210
|
+
raise UnsupportedFeatureError, "Unable to unpack xref stream entries of #{bytes.size} bytes"
|
208
211
|
end
|
209
212
|
end
|
210
213
|
################################################################################
|
@@ -227,18 +230,21 @@ class PDF::Reader
|
|
227
230
|
# should always be 0, but all sort of crazy junk is prefixed to PDF files
|
228
231
|
# in the real world.
|
229
232
|
#
|
230
|
-
# Checks up to 50 chars into the file, returns nil if no PDF data detected.
|
233
|
+
# Checks up to 1024 chars into the file,
|
234
|
+
# returns nil if no PDF data detected.
|
235
|
+
# Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
|
236
|
+
# header appear somewhere within the first 1024 bytes of the file
|
231
237
|
#
|
232
238
|
def calc_junk_offset(io)
|
233
239
|
io.rewind
|
234
240
|
offset = io.pos
|
235
|
-
until (c = io.readchar) == '%' || c == 37 || offset > 50
|
241
|
+
until (c = io.readchar) == '%' || c == 37 || offset > 1024
|
236
242
|
offset += 1
|
237
243
|
end
|
238
244
|
io.rewind
|
239
|
-
offset < 50 ? offset : nil
|
245
|
+
offset < 1024 ? offset : nil
|
240
246
|
rescue EOFError
|
241
|
-
|
247
|
+
nil
|
242
248
|
end
|
243
249
|
end
|
244
250
|
################################################################################
|
metadata
CHANGED
@@ -1,29 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.0
|
4
|
+
version: 2.4.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-09-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - ">="
|
17
|
+
- - "<"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
19
|
+
version: '13.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - ">="
|
24
|
+
- - "<"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
26
|
+
version: '13.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rspec
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
@@ -67,7 +67,7 @@ dependencies:
|
|
67
67
|
- !ruby/object:Gem::Version
|
68
68
|
version: '0.2'
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
|
-
name:
|
70
|
+
name: pry
|
71
71
|
requirement: !ruby/object:Gem::Requirement
|
72
72
|
requirements:
|
73
73
|
- - ">="
|
@@ -167,7 +167,7 @@ dependencies:
|
|
167
167
|
description: The PDF::Reader library implements a PDF parser conforming as much as
|
168
168
|
possible to the PDF specification from Adobe
|
169
169
|
email:
|
170
|
-
-
|
170
|
+
- james@yob.id.au
|
171
171
|
executables:
|
172
172
|
- pdf_object
|
173
173
|
- pdf_text
|
@@ -199,7 +199,6 @@ files:
|
|
199
199
|
- examples/text.rb
|
200
200
|
- examples/version.rb
|
201
201
|
- lib/pdf-reader.rb
|
202
|
-
- lib/pdf/hash.rb
|
203
202
|
- lib/pdf/reader.rb
|
204
203
|
- lib/pdf/reader/afm/Courier-Bold.afm
|
205
204
|
- lib/pdf/reader/afm/Courier-BoldOblique.afm
|
@@ -209,6 +208,7 @@ files:
|
|
209
208
|
- lib/pdf/reader/afm/Helvetica-BoldOblique.afm
|
210
209
|
- lib/pdf/reader/afm/Helvetica-Oblique.afm
|
211
210
|
- lib/pdf/reader/afm/Helvetica.afm
|
211
|
+
- lib/pdf/reader/afm/MustRead.html
|
212
212
|
- lib/pdf/reader/afm/Symbol.afm
|
213
213
|
- lib/pdf/reader/afm/Times-Bold.afm
|
214
214
|
- lib/pdf/reader/afm/Times-BoldItalic.afm
|
@@ -246,6 +246,7 @@ files:
|
|
246
246
|
- lib/pdf/reader/object_hash.rb
|
247
247
|
- lib/pdf/reader/object_stream.rb
|
248
248
|
- lib/pdf/reader/orientation_detector.rb
|
249
|
+
- lib/pdf/reader/overlapping_runs_filter.rb
|
249
250
|
- lib/pdf/reader/page.rb
|
250
251
|
- lib/pdf/reader/page_layout.rb
|
251
252
|
- lib/pdf/reader/page_state.rb
|
@@ -271,10 +272,14 @@ files:
|
|
271
272
|
- lib/pdf/reader/width_calculator/type_one_or_three.rb
|
272
273
|
- lib/pdf/reader/width_calculator/type_zero.rb
|
273
274
|
- lib/pdf/reader/xref.rb
|
274
|
-
homepage:
|
275
|
+
homepage: https://github.com/yob/pdf-reader
|
275
276
|
licenses:
|
276
277
|
- MIT
|
277
|
-
metadata:
|
278
|
+
metadata:
|
279
|
+
bug_tracker_uri: https://github.com/yob/pdf-reader/issues
|
280
|
+
changelog_uri: https://github.com/yob/pdf-reader/blob/v2.4.1/CHANGELOG
|
281
|
+
documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.4.1
|
282
|
+
source_code_uri: https://github.com/yob/pdf-reader/tree/v2.4.1
|
278
283
|
post_install_message:
|
279
284
|
rdoc_options:
|
280
285
|
- "--title"
|
@@ -295,8 +300,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
295
300
|
- !ruby/object:Gem::Version
|
296
301
|
version: '0'
|
297
302
|
requirements: []
|
298
|
-
|
299
|
-
rubygems_version: 2.7.3
|
303
|
+
rubygems_version: 3.0.3
|
300
304
|
signing_key:
|
301
305
|
specification_version: 4
|
302
306
|
summary: A library for accessing the content of PDF files
|