pdf-reader 1.4.1 → 2.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/CHANGELOG +53 -3
- data/{README.rdoc → README.md} +40 -23
- data/Rakefile +2 -2
- data/bin/pdf_callbacks +1 -1
- data/bin/pdf_object +4 -1
- data/bin/pdf_text +1 -1
- data/lib/pdf/reader/afm/Courier-Bold.afm +342 -342
- data/lib/pdf/reader/afm/Courier-BoldOblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier-Oblique.afm +342 -342
- data/lib/pdf/reader/afm/Courier.afm +342 -342
- data/lib/pdf/reader/afm/Helvetica-Bold.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-BoldOblique.afm +2827 -2827
- data/lib/pdf/reader/afm/Helvetica-Oblique.afm +3051 -3051
- data/lib/pdf/reader/afm/Helvetica.afm +3051 -3051
- data/lib/pdf/reader/afm/MustRead.html +19 -0
- data/lib/pdf/reader/afm/Symbol.afm +213 -213
- data/lib/pdf/reader/afm/Times-Bold.afm +2588 -2588
- data/lib/pdf/reader/afm/Times-BoldItalic.afm +2384 -2384
- data/lib/pdf/reader/afm/Times-Italic.afm +2667 -2667
- data/lib/pdf/reader/afm/Times-Roman.afm +2419 -2419
- data/lib/pdf/reader/afm/ZapfDingbats.afm +225 -225
- data/lib/pdf/reader/buffer.rb +14 -12
- data/lib/pdf/reader/cid_widths.rb +2 -0
- data/lib/pdf/reader/cmap.rb +48 -36
- data/lib/pdf/reader/encoding.rb +16 -18
- data/lib/pdf/reader/error.rb +5 -0
- data/lib/pdf/reader/filter/ascii85.rb +1 -0
- data/lib/pdf/reader/filter/ascii_hex.rb +2 -0
- data/lib/pdf/reader/filter/depredict.rb +1 -0
- data/lib/pdf/reader/filter/flate.rb +29 -16
- data/lib/pdf/reader/filter/lzw.rb +2 -0
- data/lib/pdf/reader/filter/null.rb +2 -0
- data/lib/pdf/reader/filter/run_length.rb +4 -6
- data/lib/pdf/reader/filter.rb +2 -0
- data/lib/pdf/reader/font.rb +12 -13
- data/lib/pdf/reader/font_descriptor.rb +1 -0
- data/lib/pdf/reader/form_xobject.rb +1 -0
- data/lib/pdf/reader/glyph_hash.rb +7 -2
- data/lib/pdf/reader/lzw.rb +4 -4
- data/lib/pdf/reader/null_security_handler.rb +17 -0
- data/lib/pdf/reader/object_cache.rb +1 -0
- data/lib/pdf/reader/object_hash.rb +91 -37
- data/lib/pdf/reader/object_stream.rb +1 -0
- data/lib/pdf/reader/orientation_detector.rb +5 -4
- data/lib/pdf/reader/overlapping_runs_filter.rb +65 -0
- data/lib/pdf/reader/page.rb +30 -1
- data/lib/pdf/reader/page_layout.rb +19 -24
- data/lib/pdf/reader/page_state.rb +8 -5
- data/lib/pdf/reader/page_text_receiver.rb +23 -1
- data/lib/pdf/reader/pages_strategy.rb +2 -304
- data/lib/pdf/reader/parser.rb +10 -7
- data/lib/pdf/reader/print_receiver.rb +1 -0
- data/lib/pdf/reader/reference.rb +1 -0
- data/lib/pdf/reader/register_receiver.rb +1 -0
- data/lib/pdf/reader/resource_methods.rb +1 -0
- data/lib/pdf/reader/standard_security_handler.rb +80 -42
- data/lib/pdf/reader/standard_security_handler_v5.rb +91 -0
- data/lib/pdf/reader/stream.rb +1 -0
- data/lib/pdf/reader/synchronized_cache.rb +1 -0
- data/lib/pdf/reader/text_run.rb +28 -9
- data/lib/pdf/reader/token.rb +1 -0
- data/lib/pdf/reader/transformation_matrix.rb +1 -0
- data/lib/pdf/reader/unimplemented_security_handler.rb +17 -0
- data/lib/pdf/reader/width_calculator/built_in.rb +25 -16
- data/lib/pdf/reader/width_calculator/composite.rb +1 -0
- data/lib/pdf/reader/width_calculator/true_type.rb +2 -2
- data/lib/pdf/reader/width_calculator/type_one_or_three.rb +1 -0
- data/lib/pdf/reader/width_calculator/type_zero.rb +1 -0
- data/lib/pdf/reader/width_calculator.rb +1 -0
- data/lib/pdf/reader/xref.rb +11 -5
- data/lib/pdf/reader.rb +30 -119
- data/lib/pdf-reader.rb +1 -0
- metadata +35 -61
- data/bin/pdf_list_callbacks +0 -17
- data/lib/pdf/hash.rb +0 -19
- data/lib/pdf/reader/abstract_strategy.rb +0 -81
- data/lib/pdf/reader/metadata_strategy.rb +0 -56
- data/lib/pdf/reader/text_receiver.rb +0 -265
metadata
CHANGED
@@ -1,57 +1,43 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pdf-reader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.4.1
|
4
|
+
version: 2.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- James Healy
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-06-06 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - "<"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '0'
|
19
|
+
version: '13.0'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - "<"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '0'
|
26
|
+
version: '13.0'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rspec
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
31
|
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '3.
|
33
|
+
version: '3.5'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
38
|
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '3.
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: ZenTest
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - "~>"
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: 4.4.2
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - "~>"
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: 4.4.2
|
40
|
+
version: '3.5'
|
55
41
|
- !ruby/object:Gem::Dependency
|
56
42
|
name: cane
|
57
43
|
requirement: !ruby/object:Gem::Requirement
|
@@ -70,18 +56,18 @@ dependencies:
|
|
70
56
|
name: morecane
|
71
57
|
requirement: !ruby/object:Gem::Requirement
|
72
58
|
requirements:
|
73
|
-
- - "
|
59
|
+
- - "~>"
|
74
60
|
- !ruby/object:Gem::Version
|
75
|
-
version: '0'
|
61
|
+
version: '0.2'
|
76
62
|
type: :development
|
77
63
|
prerelease: false
|
78
64
|
version_requirements: !ruby/object:Gem::Requirement
|
79
65
|
requirements:
|
80
|
-
- - "
|
66
|
+
- - "~>"
|
81
67
|
- !ruby/object:Gem::Version
|
82
|
-
version: '0'
|
68
|
+
version: '0.2'
|
83
69
|
- !ruby/object:Gem::Dependency
|
84
|
-
name:
|
70
|
+
name: pry
|
85
71
|
requirement: !ruby/object:Gem::Requirement
|
86
72
|
requirements:
|
87
73
|
- - ">="
|
@@ -114,14 +100,14 @@ dependencies:
|
|
114
100
|
requirements:
|
115
101
|
- - "~>"
|
116
102
|
- !ruby/object:Gem::Version
|
117
|
-
version: 1.0
|
103
|
+
version: '1.0'
|
118
104
|
type: :runtime
|
119
105
|
prerelease: false
|
120
106
|
version_requirements: !ruby/object:Gem::Requirement
|
121
107
|
requirements:
|
122
108
|
- - "~>"
|
123
109
|
- !ruby/object:Gem::Version
|
124
|
-
version: 1.0
|
110
|
+
version: '1.0'
|
125
111
|
- !ruby/object:Gem::Dependency
|
126
112
|
name: ruby-rc4
|
127
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -181,26 +167,24 @@ dependencies:
|
|
181
167
|
description: The PDF::Reader library implements a PDF parser conforming as much as
|
182
168
|
possible to the PDF specification from Adobe
|
183
169
|
email:
|
184
|
-
-
|
170
|
+
- james@yob.id.au
|
185
171
|
executables:
|
186
172
|
- pdf_object
|
187
173
|
- pdf_text
|
188
|
-
- pdf_list_callbacks
|
189
174
|
- pdf_callbacks
|
190
175
|
extensions: []
|
191
176
|
extra_rdoc_files:
|
192
|
-
- README.rdoc
|
177
|
+
- README.md
|
193
178
|
- TODO
|
194
179
|
- CHANGELOG
|
195
180
|
- MIT-LICENSE
|
196
181
|
files:
|
197
182
|
- CHANGELOG
|
198
183
|
- MIT-LICENSE
|
199
|
-
- README.rdoc
|
184
|
+
- README.md
|
200
185
|
- Rakefile
|
201
186
|
- TODO
|
202
187
|
- bin/pdf_callbacks
|
203
|
-
- bin/pdf_list_callbacks
|
204
188
|
- bin/pdf_object
|
205
189
|
- bin/pdf_text
|
206
190
|
- examples/callbacks.rb
|
@@ -215,9 +199,7 @@ files:
|
|
215
199
|
- examples/text.rb
|
216
200
|
- examples/version.rb
|
217
201
|
- lib/pdf-reader.rb
|
218
|
-
- lib/pdf/hash.rb
|
219
202
|
- lib/pdf/reader.rb
|
220
|
-
- lib/pdf/reader/abstract_strategy.rb
|
221
203
|
- lib/pdf/reader/afm/Courier-Bold.afm
|
222
204
|
- lib/pdf/reader/afm/Courier-BoldOblique.afm
|
223
205
|
- lib/pdf/reader/afm/Courier-Oblique.afm
|
@@ -226,6 +208,7 @@ files:
|
|
226
208
|
- lib/pdf/reader/afm/Helvetica-BoldOblique.afm
|
227
209
|
- lib/pdf/reader/afm/Helvetica-Oblique.afm
|
228
210
|
- lib/pdf/reader/afm/Helvetica.afm
|
211
|
+
- lib/pdf/reader/afm/MustRead.html
|
229
212
|
- lib/pdf/reader/afm/Symbol.afm
|
230
213
|
- lib/pdf/reader/afm/Times-Bold.afm
|
231
214
|
- lib/pdf/reader/afm/Times-BoldItalic.afm
|
@@ -258,11 +241,12 @@ files:
|
|
258
241
|
- lib/pdf/reader/glyph_hash.rb
|
259
242
|
- lib/pdf/reader/glyphlist.txt
|
260
243
|
- lib/pdf/reader/lzw.rb
|
261
|
-
- lib/pdf/reader/metadata_strategy.rb
|
244
|
+
- lib/pdf/reader/null_security_handler.rb
|
262
245
|
- lib/pdf/reader/object_cache.rb
|
263
246
|
- lib/pdf/reader/object_hash.rb
|
264
247
|
- lib/pdf/reader/object_stream.rb
|
265
248
|
- lib/pdf/reader/orientation_detector.rb
|
249
|
+
- lib/pdf/reader/overlapping_runs_filter.rb
|
266
250
|
- lib/pdf/reader/page.rb
|
267
251
|
- lib/pdf/reader/page_layout.rb
|
268
252
|
- lib/pdf/reader/page_state.rb
|
@@ -274,12 +258,13 @@ files:
|
|
274
258
|
- lib/pdf/reader/register_receiver.rb
|
275
259
|
- lib/pdf/reader/resource_methods.rb
|
276
260
|
- lib/pdf/reader/standard_security_handler.rb
|
261
|
+
- lib/pdf/reader/standard_security_handler_v5.rb
|
277
262
|
- lib/pdf/reader/stream.rb
|
278
263
|
- lib/pdf/reader/synchronized_cache.rb
|
279
|
-
- lib/pdf/reader/text_receiver.rb
|
280
264
|
- lib/pdf/reader/text_run.rb
|
281
265
|
- lib/pdf/reader/token.rb
|
282
266
|
- lib/pdf/reader/transformation_matrix.rb
|
267
|
+
- lib/pdf/reader/unimplemented_security_handler.rb
|
283
268
|
- lib/pdf/reader/width_calculator.rb
|
284
269
|
- lib/pdf/reader/width_calculator/built_in.rb
|
285
270
|
- lib/pdf/reader/width_calculator/composite.rb
|
@@ -287,30 +272,20 @@ files:
|
|
287
272
|
- lib/pdf/reader/width_calculator/type_one_or_three.rb
|
288
273
|
- lib/pdf/reader/width_calculator/type_zero.rb
|
289
274
|
- lib/pdf/reader/xref.rb
|
290
|
-
homepage:
|
275
|
+
homepage: https://github.com/yob/pdf-reader
|
291
276
|
licenses:
|
292
277
|
- MIT
|
293
|
-
metadata:
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
examples showing how to use it in the README and examples directory.
|
300
|
-
|
301
|
-
For detailed documentation, check the rdocs for the PDF::Reader,
|
302
|
-
PDF::Reader::Page and PDF::Reader::ObjectHash classes.
|
303
|
-
|
304
|
-
The old API is marked as deprecated but will continue to work with no
|
305
|
-
visible warnings for now.
|
306
|
-
|
307
|
-
********************************************
|
308
|
-
|
278
|
+
metadata:
|
279
|
+
bug_tracker_uri: https://github.com/yob/pdf-reader/issues
|
280
|
+
changelog_uri: https://github.com/yob/pdf-reader/blob/v2.5.0/CHANGELOG
|
281
|
+
documentation_uri: https://www.rubydoc.info/gems/pdf-reader/2.5.0
|
282
|
+
source_code_uri: https://github.com/yob/pdf-reader/tree/v2.5.0
|
283
|
+
post_install_message:
|
309
284
|
rdoc_options:
|
310
285
|
- "--title"
|
311
286
|
- PDF::Reader Documentation
|
312
287
|
- "--main"
|
313
|
-
- README.rdoc
|
288
|
+
- README.md
|
314
289
|
- "-q"
|
315
290
|
require_paths:
|
316
291
|
- lib
|
@@ -318,16 +293,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
318
293
|
requirements:
|
319
294
|
- - ">="
|
320
295
|
- !ruby/object:Gem::Version
|
321
|
-
version:
|
296
|
+
version: '2.0'
|
322
297
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
323
298
|
requirements:
|
324
299
|
- - ">="
|
325
300
|
- !ruby/object:Gem::Version
|
326
301
|
version: '0'
|
327
302
|
requirements: []
|
328
|
-
|
329
|
-
|
330
|
-
signing_key:
|
303
|
+
rubygems_version: 3.2.3
|
304
|
+
signing_key:
|
331
305
|
specification_version: 4
|
332
306
|
summary: A library for accessing the content of PDF files
|
333
307
|
test_files: []
|
data/bin/pdf_list_callbacks
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
|
3
|
-
# this executable is deprecated, use pdf_callbacks instead
|
4
|
-
|
5
|
-
require 'rubygems'
|
6
|
-
|
7
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__) + "/../lib")
|
8
|
-
|
9
|
-
require 'pdf/reader'
|
10
|
-
|
11
|
-
receiver = PDF::Reader::PrintReceiver.new
|
12
|
-
|
13
|
-
if ARGV.empty?
|
14
|
-
PDF::Reader.new.parse($stdin, receiver)
|
15
|
-
else
|
16
|
-
PDF::Reader.file(ARGV[0], receiver)
|
17
|
-
end
|
data/lib/pdf/hash.rb
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
|
3
|
-
module PDF
|
4
|
-
# This class is deprecated, please stop using it.
|
5
|
-
class Hash < ::PDF::Reader::ObjectHash # :nodoc:
|
6
|
-
def initialize(input)
|
7
|
-
warn "DEPRECATION NOTICE: PDF::Hash has been deprecated, use PDF::Reader::ObjectHash instead"
|
8
|
-
super
|
9
|
-
end
|
10
|
-
|
11
|
-
def version
|
12
|
-
warn <<-EOS
|
13
|
-
DEPRECATION NOTICE: PDF::Hash#version has been deprecated,
|
14
|
-
use PDF::Reader::ObjectHash#pdf_version instead
|
15
|
-
EOS
|
16
|
-
pdf_version
|
17
|
-
end
|
18
|
-
end
|
19
|
-
end
|
data/lib/pdf/reader/abstract_strategy.rb
DELETED
@@ -1,81 +0,0 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
|
3
|
-
class PDF::Reader
|
4
|
-
|
5
|
-
# DEPRECATED: this class was deprecated in version 0.11.0 and will
|
6
|
-
# eventually be removed
|
7
|
-
class AbstractStrategy # :nodoc:
|
8
|
-
|
9
|
-
def initialize(ohash, receivers, options = {})
|
10
|
-
@ohash, @options = ohash, options
|
11
|
-
if receivers.is_a?(Array)
|
12
|
-
@receivers = receivers
|
13
|
-
else
|
14
|
-
@receivers = [receivers]
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
private
|
19
|
-
|
20
|
-
def options
|
21
|
-
@options || {}
|
22
|
-
end
|
23
|
-
|
24
|
-
# calls the name callback method on the receiver class with params as the arguments
|
25
|
-
#
|
26
|
-
def callback(name, params=[])
|
27
|
-
@receivers.each do |receiver|
|
28
|
-
receiver.send(name, *params) if receiver.respond_to?(name)
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
# strings outside of page content should be in either PDFDocEncoding or UTF-16.
|
33
|
-
def decode_strings(obj)
|
34
|
-
case obj
|
35
|
-
when String then
|
36
|
-
if obj[0,2].unpack("C*").slice(0,2) == [254,255]
|
37
|
-
PDF::Reader::Encoding.new(:UTF16Encoding).to_utf8(obj[2, obj.size])
|
38
|
-
else
|
39
|
-
PDF::Reader::Encoding.new(:PDFDocEncoding).to_utf8(obj)
|
40
|
-
end
|
41
|
-
when Hash then obj.each { |key,val| obj[key] = decode_strings(val) }
|
42
|
-
when Array then obj.collect { |item| decode_strings(item) }
|
43
|
-
else
|
44
|
-
obj
|
45
|
-
end
|
46
|
-
end
|
47
|
-
|
48
|
-
def info
|
49
|
-
ohash.object(trailer[:Info])
|
50
|
-
end
|
51
|
-
|
52
|
-
def info?
|
53
|
-
info ? true : false
|
54
|
-
end
|
55
|
-
|
56
|
-
def ohash
|
57
|
-
@ohash
|
58
|
-
end
|
59
|
-
|
60
|
-
def pages
|
61
|
-
ohash.object(root[:Pages])
|
62
|
-
end
|
63
|
-
|
64
|
-
def pages?
|
65
|
-
pages ? true : false
|
66
|
-
end
|
67
|
-
|
68
|
-
def root
|
69
|
-
ohash.object(trailer[:Root])
|
70
|
-
end
|
71
|
-
|
72
|
-
def root?
|
73
|
-
root ? true : false
|
74
|
-
end
|
75
|
-
|
76
|
-
def trailer
|
77
|
-
ohash.trailer
|
78
|
-
end
|
79
|
-
|
80
|
-
end
|
81
|
-
end
|
data/lib/pdf/reader/metadata_strategy.rb
DELETED
@@ -1,56 +0,0 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
|
3
|
-
class PDF::Reader
|
4
|
-
|
5
|
-
# DEPRECATED: this class was deprecated in version 0.11.0 and will
|
6
|
-
# eventually be removed
|
7
|
-
#
|
8
|
-
class MetadataStrategy < AbstractStrategy # :nodoc:
|
9
|
-
|
10
|
-
def self.to_sym
|
11
|
-
:metadata
|
12
|
-
end
|
13
|
-
|
14
|
-
def process
|
15
|
-
return false unless options[:metadata]
|
16
|
-
|
17
|
-
# may be useful to some people
|
18
|
-
callback(:pdf_version, ohash.pdf_version)
|
19
|
-
|
20
|
-
# ye olde metadata
|
21
|
-
callback(:metadata, [decoded_info]) if info?
|
22
|
-
|
23
|
-
# new style xml metadata
|
24
|
-
callback(:xml_metadata, [xml_metadata]) if xml_metadata?
|
25
|
-
|
26
|
-
# page count
|
27
|
-
if pages?
|
28
|
-
count = ohash.object(pages[:Count])
|
29
|
-
callback(:page_count, count.to_i)
|
30
|
-
end
|
31
|
-
end
|
32
|
-
|
33
|
-
private
|
34
|
-
|
35
|
-
def xml_metadata
|
36
|
-
return @xml_metadata if defined?(@xml_metadata)
|
37
|
-
|
38
|
-
if root[:Metadata].nil?
|
39
|
-
@xml_metadata = nil
|
40
|
-
else
|
41
|
-
string = ohash.object(root[:Metadata]).unfiltered_data
|
42
|
-
string.force_encoding("utf-8") if string.respond_to?(:force_encoding)
|
43
|
-
@xml_metadata = string
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
def xml_metadata?
|
48
|
-
xml_metadata ? true : false
|
49
|
-
end
|
50
|
-
|
51
|
-
def decoded_info
|
52
|
-
@decoded_info ||= decode_strings(info)
|
53
|
-
end
|
54
|
-
|
55
|
-
end
|
56
|
-
end
|
data/lib/pdf/reader/text_receiver.rb
DELETED
@@ -1,265 +0,0 @@
|
|
1
|
-
# coding: utf-8
|
2
|
-
|
3
|
-
################################################################################
|
4
|
-
#
|
5
|
-
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
|
6
|
-
#
|
7
|
-
# Permission is hereby granted, free of charge, to any person obtaining
|
8
|
-
# a copy of this software and associated documentation files (the
|
9
|
-
# "Software"), to deal in the Software without restriction, including
|
10
|
-
# without limitation the rights to use, copy, modify, merge, publish,
|
11
|
-
# distribute, sublicense, and/or sell copies of the Software, and to
|
12
|
-
# permit persons to whom the Software is furnished to do so, subject to
|
13
|
-
# the following conditions:
|
14
|
-
#
|
15
|
-
# The above copyright notice and this permission notice shall be
|
16
|
-
# included in all copies or substantial portions of the Software.
|
17
|
-
#
|
18
|
-
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
19
|
-
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
20
|
-
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
21
|
-
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
22
|
-
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
23
|
-
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
24
|
-
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
25
|
-
#
|
26
|
-
################################################################################
|
27
|
-
|
28
|
-
class PDF::Reader
|
29
|
-
################################################################################
|
30
|
-
# An example receiver class that processes all text found in a PDF file. All text that
|
31
|
-
# is found will be printed to the IO object specified in the constructor.
|
32
|
-
#
|
33
|
-
# Usage:
|
34
|
-
# receiver = PDF::Reader::TextReceiver.new($stdout)
|
35
|
-
# PDF::Reader.file("somefile.pdf", receiver)
|
36
|
-
#
|
37
|
-
# DEPRECATED: this class was deprecated in version 0.11.0 and will
|
38
|
-
# eventually be removed
|
39
|
-
class TextReceiver
|
40
|
-
################################################################################
|
41
|
-
# Initialize with the library user's receiver
|
42
|
-
def initialize(main_receiver)
|
43
|
-
@main_receiver = main_receiver
|
44
|
-
@upper_corners = []
|
45
|
-
end
|
46
|
-
################################################################################
|
47
|
-
# Called when the document parsing begins
|
48
|
-
def begin_document(root)
|
49
|
-
@upper_corners = []
|
50
|
-
end
|
51
|
-
################################################################################
|
52
|
-
# Called when the document parsing ends
|
53
|
-
def end_document
|
54
|
-
@state.clear
|
55
|
-
end
|
56
|
-
################################################################################
|
57
|
-
def begin_page_container(page)
|
58
|
-
@upper_corners.push(media_box_check(page))
|
59
|
-
end
|
60
|
-
################################################################################
|
61
|
-
def end_page_container
|
62
|
-
@upper_corners.pop
|
63
|
-
end
|
64
|
-
################################################################################
|
65
|
-
# Called when new page parsing begins
|
66
|
-
def begin_page(info)
|
67
|
-
@page = info
|
68
|
-
|
69
|
-
@state = [{
|
70
|
-
:char_spacing => 0,
|
71
|
-
:word_spacing => 0,
|
72
|
-
:hori_scaling => 100,
|
73
|
-
:leading => 0,
|
74
|
-
:tj_adjustment => 0,
|
75
|
-
}]
|
76
|
-
|
77
|
-
@upper_corners.push(media_box_check(info))
|
78
|
-
|
79
|
-
@output = []
|
80
|
-
@line = 0
|
81
|
-
@location = 0
|
82
|
-
@displacement = {}
|
83
|
-
@smallest_y_loc = @upper_corners.last[:ury]
|
84
|
-
@written_to = false
|
85
|
-
end
|
86
|
-
################################################################################
|
87
|
-
# Called when page parsing ends
|
88
|
-
def end_page
|
89
|
-
@main_receiver << @output.join("\n")
|
90
|
-
@upper_corners.pop
|
91
|
-
end
|
92
|
-
################################################################################
|
93
|
-
# PDF operator BT
|
94
|
-
def begin_text_object
|
95
|
-
@state.push(@state.last.dup)
|
96
|
-
end
|
97
|
-
################################################################################
|
98
|
-
# PDF operator ET
|
99
|
-
def end_text_object
|
100
|
-
@state.pop
|
101
|
-
end
|
102
|
-
################################################################################
|
103
|
-
# PDF operator Tm
|
104
|
-
def set_text_matrix_and_text_line_matrix(*args)
|
105
|
-
# these variable names look bad, but they're from the PDF spec
|
106
|
-
_a, _b, _c, _d, _e, f = *args
|
107
|
-
calculate_line_and_location(f)
|
108
|
-
end
|
109
|
-
################################################################################
|
110
|
-
# PDF operator Tc
|
111
|
-
def set_character_spacing(n)
|
112
|
-
@state.last[:char_spacing] = n
|
113
|
-
end
|
114
|
-
################################################################################
|
115
|
-
# PDF operator Tw
|
116
|
-
def set_word_spacing(n)
|
117
|
-
@state.last[:word_spacing] = n
|
118
|
-
end
|
119
|
-
################################################################################
|
120
|
-
# PDF operator Tz
|
121
|
-
def set_horizontal_text_scaling(n)
|
122
|
-
@state.last[:hori_scaling] = n/100
|
123
|
-
end
|
124
|
-
################################################################################
|
125
|
-
# PDF operator TL
|
126
|
-
def set_text_leading(n)
|
127
|
-
@state.last[:leading] = n
|
128
|
-
end
|
129
|
-
################################################################################
|
130
|
-
# PDF operator T*
|
131
|
-
def move_to_start_of_next_line
|
132
|
-
move_text_position(0, @state.last[:leading])
|
133
|
-
end
|
134
|
-
################################################################################
|
135
|
-
# PDF operator Td
|
136
|
-
def move_text_position(tx, ty)
|
137
|
-
#puts "#{tx} #{ty} Td"
|
138
|
-
calculate_line_and_location(@location + ty)
|
139
|
-
end
|
140
|
-
################################################################################
|
141
|
-
# PDF operator TD
|
142
|
-
def move_text_position_and_set_leading(tx, ty)
|
143
|
-
set_text_leading(ty)# * -1)
|
144
|
-
move_text_position(tx, ty)
|
145
|
-
end
|
146
|
-
################################################################################
|
147
|
-
# PDF operator Tj
|
148
|
-
def show_text(string)
|
149
|
-
#puts "getting line #@line"
|
150
|
-
|
151
|
-
place = (@output[@line] ||= "")
|
152
|
-
#place << " " unless place.empty?
|
153
|
-
|
154
|
-
place << " " * (@state.last[:tj_adjustment].abs/900) if @state.last[:tj_adjustment] < -1000
|
155
|
-
place << string
|
156
|
-
|
157
|
-
#puts "place is now: #{place}"
|
158
|
-
@written_to = true
|
159
|
-
end
|
160
|
-
def super_show_text(string)
|
161
|
-
urx = @upper_corners.last[:urx]/TS_UNITS_PER_H_CHAR
|
162
|
-
ury = @upper_corners.last[:ury]/TS_UNITS_PER_V_CHAR
|
163
|
-
|
164
|
-
x = (@tm[2,0]/TS_UNITS_PER_H_CHAR).to_i
|
165
|
-
y = (ury - (@tm[2,1]/TS_UNITS_PER_V_CHAR)).to_i
|
166
|
-
|
167
|
-
#puts "rendering '#{string}' to #{x}x#{y}"
|
168
|
-
|
169
|
-
place = (@output[y] ||= (" " * urx.to_i))
|
170
|
-
#puts "#{urx} #{place.size} #{string.size} #{x}"
|
171
|
-
return if x+string.size >= urx
|
172
|
-
|
173
|
-
string.split(//).each do |c|
|
174
|
-
chars = 1
|
175
|
-
|
176
|
-
case c
|
177
|
-
when " "
|
178
|
-
chars += @state.last[:word_spacing].to_i
|
179
|
-
place[x-1, chars] = (" " * chars)
|
180
|
-
else
|
181
|
-
chars += @state.last[:char_spacing].to_i
|
182
|
-
chars -= (@state.last[:tj_adjustment]/1000).to_i if @state.last[:tj_adjustment]
|
183
|
-
chars = 1 if chars < 1
|
184
|
-
|
185
|
-
place[x-1] = c
|
186
|
-
place[x, chars-1] = (" " * (chars-1)) if chars > 1
|
187
|
-
end
|
188
|
-
|
189
|
-
x += chars
|
190
|
-
end
|
191
|
-
|
192
|
-
@tm += Matrix.rows([[1, 0, 0], [0, 1, 0], [x*TS_UNITS_PER_H_CHAR, y*TS_UNITS_PER_V_CHAR, 1]])
|
193
|
-
end
|
194
|
-
################################################################################
|
195
|
-
# PDF operator TJ
|
196
|
-
def show_text_with_positioning(params)
|
197
|
-
prev_adjustment = @state.last[:tj_adjustment]
|
198
|
-
|
199
|
-
params.each do |p|
|
200
|
-
case p
|
201
|
-
when Float, Integer
|
202
|
-
@state.last[:tj_adjustment] = p
|
203
|
-
else
|
204
|
-
show_text(p)
|
205
|
-
end
|
206
|
-
end
|
207
|
-
|
208
|
-
@state.last[:tj_adjustment] = prev_adjustment
|
209
|
-
end
|
210
|
-
################################################################################
|
211
|
-
# PDF operator '
|
212
|
-
def move_to_next_line_and_show_text(string)
|
213
|
-
move_to_start_of_next_line
|
214
|
-
show_text(string)
|
215
|
-
end
|
216
|
-
################################################################################
|
217
|
-
# PDF operator "
|
218
|
-
def set_spacing_next_line_show_text(aw, ac, string)
|
219
|
-
set_word_spacing(aw)
|
220
|
-
set_character_spacing(ac)
|
221
|
-
move_to_next_line_and_show_text(string)
|
222
|
-
end
|
223
|
-
################################################################################
|
224
|
-
def media_box_check(dict)
|
225
|
-
corners = (@upper_corners.last || {:urx => 0, :ury => 0}).dup
|
226
|
-
|
227
|
-
if dict.has_key?(:MediaBox)
|
228
|
-
media_box = dict[:MediaBox]
|
229
|
-
corners[:urx] = media_box[2] - media_box[0]
|
230
|
-
corners[:ury] = media_box[3] - media_box[1]
|
231
|
-
end
|
232
|
-
|
233
|
-
corners
|
234
|
-
end
|
235
|
-
################################################################################
|
236
|
-
def calculate_line_and_location(new_loc)
|
237
|
-
##puts "calculate_line_and_location(#{new_loc})"
|
238
|
-
key = new_loc; key.freeze
|
239
|
-
|
240
|
-
#key = new_loc.to_s # because hashes with string keys are magic (auto-freeze)
|
241
|
-
|
242
|
-
if @written_to
|
243
|
-
unless @displacement.has_key?(key)
|
244
|
-
if key < @location
|
245
|
-
@displacement[key] = @line + 1
|
246
|
-
elsif key < @smallest_y_loc
|
247
|
-
@displacement[key] = @line + 1
|
248
|
-
else
|
249
|
-
key = @displacement.keys.find_all {|i| key > i}.sort.last
|
250
|
-
@displacement[key] = 0 unless @displacement.has_key?(key)
|
251
|
-
end
|
252
|
-
end
|
253
|
-
else
|
254
|
-
@displacement[key] = 0
|
255
|
-
end
|
256
|
-
|
257
|
-
@smallest_y_loc = key if key < @smallest_y_loc
|
258
|
-
@location = key
|
259
|
-
@line = @displacement[key]
|
260
|
-
end
|
261
|
-
################################################################################
|
262
|
-
end
|
263
|
-
################################################################################
|
264
|
-
end
|
265
|
-
################################################################################
|