pdf-reader 0.8.6 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +17 -0
- data/README.rdoc +7 -15
- data/Rakefile +10 -63
- data/TODO +6 -8
- data/bin/pdf_object +3 -0
- data/bin/pdf_text +4 -2
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +1 -1
- data/examples/text.rb +3 -0
- data/lib/pdf/hash.rb +8 -225
- data/lib/pdf/reader.rb +79 -55
- data/lib/pdf/reader/abstract_strategy.rb +77 -0
- data/lib/pdf/reader/buffer.rb +61 -40
- data/lib/pdf/reader/cmap.rb +11 -10
- data/lib/pdf/reader/encoding.rb +85 -79
- data/lib/pdf/reader/error.rb +1 -2
- data/lib/pdf/reader/filter.rb +109 -6
- data/lib/pdf/reader/font.rb +11 -11
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +53 -0
- data/lib/pdf/reader/object_hash.rb +275 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/{content.rb → pages_strategy.rb} +63 -100
- data/lib/pdf/reader/parser.rb +74 -37
- data/lib/pdf/reader/print_receiver.rb +0 -1
- data/lib/pdf/reader/register_receiver.rb +21 -0
- data/lib/pdf/reader/stream.rb +5 -1
- data/lib/pdf/reader/text_receiver.rb +3 -1
- data/lib/pdf/reader/token.rb +1 -1
- data/lib/pdf/reader/xref.rb +126 -64
- metadata +61 -13
- data/lib/pdf/reader/explore.rb +0 -116
data/CHANGELOG
CHANGED
@@ -1,3 +1,20 @@
|
|
1
|
+
v0.9.0 (19th November 2010)
|
2
|
+
- support for pdf 1.5+ files that use object and xref streams
|
3
|
+
- support streams that use a flate filter with the predictor option
|
4
|
+
- ensure all content instructions are parsed when split over multiple streams
|
5
|
+
- thanks to Jack Rusher for reporting
|
6
|
+
- Various string parsing bugs
|
7
|
+
- some character conversions to utf-8 were failing (thanks Andrea Barisani)
|
8
|
+
- hashes with nested hex strings were tokenising wrongly (thanks Evan Arnold)
|
9
|
+
- escaping bug in tokenising of literal strings (thanks David Westerink)
|
10
|
+
- Fix a bug that prevented PDFs with white space after the EOF marker from loading
|
11
|
+
- thanks to Solomon White for reporting the issue
|
12
|
+
- Add support for de-filtering some LZW compressed streams
|
13
|
+
- thanks to Jose Ignacio Rubio Iradi for the patch
|
14
|
+
- some small speed improvements
|
15
|
+
- API CHANGE: PDF::Hash renamed to PDF::Reader::ObjectHash
|
16
|
+
- having a class named Hash was confusing for users
|
17
|
+
|
1
18
|
v0.8.6 (27th August 2010)
|
2
19
|
- new method: hash#page_references
|
3
20
|
- returns references to all page objects, gives rapid access to objects
|
data/README.rdoc
CHANGED
@@ -8,17 +8,6 @@ The PDF 1.7 specification is a weighty document and not all aspects are
|
|
8
8
|
currently supported. I welcome submission of PDF files that exhibit
|
9
9
|
unsupported aspects of the spec to assist with improving our support.
|
10
10
|
|
11
|
-
= Development Status
|
12
|
-
|
13
|
-
I adopted this library in 2007 when I was learning the fundamentals of the PDF
|
14
|
-
spec. I do not currently use it in my day to day work and I just don't have the
|
15
|
-
spare time to dedicate to adding new features.
|
16
|
-
|
17
|
-
The code as it is works fairly well, and I offer it "as is". All patches, bug
|
18
|
-
reports and sample PDFs are welcome - I will work on them when I can. If anyone
|
19
|
-
is interested in adding features to PDF::Reader in their own effort to learn
|
20
|
-
the PDF file format, I'll happy offer help and support.
|
21
|
-
|
22
11
|
= Installation
|
23
12
|
|
24
13
|
The recommended installation method is via Rubygems.
|
@@ -37,12 +26,12 @@ methods do is entirely up to you - save the text, extract images, count pages,
|
|
37
26
|
read metadata, whatever.
|
38
27
|
|
39
28
|
For a full list of the supported callback methods and a description of when they
|
40
|
-
will be called, refer to PDF::Reader::
|
29
|
+
will be called, refer to PDF::Reader::PagesStrategy. See the examples directory for a
|
41
30
|
way to print a list of all the callbacks generated by a file to STDOUT.
|
42
31
|
|
43
|
-
There is also a class called PDF::
|
44
|
-
in a PDF file using a ruby hash-like API. Checkout the
|
45
|
-
for further information.
|
32
|
+
There is also a class called PDF::Reader::ObjectHash. This provides direct
|
33
|
+
access to the objects in a PDF file using a ruby hash-like API. Checkout the
|
34
|
+
documentation for the class for further information.
|
46
35
|
|
47
36
|
= Text Encoding
|
48
37
|
|
@@ -50,6 +39,9 @@ Internally, text can be stored inside a PDF in various encodings, including
|
|
50
39
|
zingbats, win-1252, mac roman and a form of Unicode. To avoid confusion, all
|
51
40
|
text will be converted to UTF-8 before it is passed back from PDF::Reader.
|
52
41
|
|
42
|
+
Strings that contain binary data (like font blobs) will be marked as such on
|
43
|
+
M17N aware VMs.
|
44
|
+
|
53
45
|
= Exceptions
|
54
46
|
|
55
47
|
There are two key exceptions that you will need to watch out for when processing a
|
data/Rakefile
CHANGED
@@ -1,43 +1,21 @@
|
|
1
1
|
require "rubygems"
|
2
|
+
require "bundler"
|
3
|
+
Bundler.setup
|
4
|
+
|
2
5
|
require 'rake'
|
3
|
-
require 'rake/clean'
|
4
6
|
require 'rake/rdoctask'
|
5
|
-
require '
|
6
|
-
require
|
7
|
-
require '
|
8
|
-
|
9
|
-
PKG_VERSION = "0.8.6"
|
10
|
-
PKG_NAME = "pdf-reader"
|
11
|
-
PKG_FILE_NAME = "#{PKG_NAME}-#{PKG_VERSION}"
|
7
|
+
require 'rspec/core/rake_task'
|
8
|
+
require 'roodi'
|
9
|
+
require 'roodi_task'
|
12
10
|
|
13
11
|
desc "Default Task"
|
14
12
|
task :default => [ :spec ]
|
15
13
|
|
16
14
|
# run all rspecs
|
17
15
|
desc "Run all rspec files"
|
18
|
-
|
19
|
-
t.
|
20
|
-
t.
|
21
|
-
t.rcov_dir = (ENV['CC_BUILD_ARTIFACTS'] || 'doc') + "/rcov"
|
22
|
-
t.ruby_opts << "-w"
|
23
|
-
# t.rcov_opts = ["--exclude","spec.*\.rb"]
|
24
|
-
end
|
25
|
-
|
26
|
-
# generate specdocs
|
27
|
-
desc "Generate Specdocs"
|
28
|
-
Spec::Rake::SpecTask.new("specdocs") do |t|
|
29
|
-
t.spec_files = FileList['specs/**/*.rb']
|
30
|
-
t.spec_opts = ["--format", "rdoc"]
|
31
|
-
t.out = (ENV['CC_BUILD_ARTIFACTS'] || 'doc') + '/specdoc.rd'
|
32
|
-
end
|
33
|
-
|
34
|
-
# generate failing spec report
|
35
|
-
desc "Generate failing spec report"
|
36
|
-
Spec::Rake::SpecTask.new("spec_report") do |t|
|
37
|
-
t.spec_files = FileList['specs/**/*.rb']
|
38
|
-
t.spec_opts = ["--format", "html", "--diff"]
|
39
|
-
t.out = (ENV['CC_BUILD_ARTIFACTS'] || 'doc') + '/spec_report.html'
|
40
|
-
t.fail_on_error = false
|
16
|
+
RSpec::Core::RakeTask.new("spec") do |t|
|
17
|
+
t.rspec_opts = ["--color", "--format progress"]
|
18
|
+
t.ruby_opts = "-w"
|
41
19
|
end
|
42
20
|
|
43
21
|
# Generate the RDoc documentation
|
@@ -53,35 +31,4 @@ Rake::RDocTask.new("doc") do |rdoc|
|
|
53
31
|
rdoc.options << "--inline-source"
|
54
32
|
end
|
55
33
|
|
56
|
-
|
57
|
-
# RSpec files aren't included, as they depend on the PDF files,
|
58
|
-
# which will make the gem filesize irritatingly large
|
59
|
-
spec = Gem::Specification.new do |spec|
|
60
|
-
spec.name = PKG_NAME
|
61
|
-
spec.version = PKG_VERSION
|
62
|
-
spec.platform = Gem::Platform::RUBY
|
63
|
-
spec.summary = "A library for accessing the content of PDF files"
|
64
|
-
spec.files = Dir.glob("{examples,lib}/**/**/*") + ["Rakefile"]
|
65
|
-
spec.require_path = "lib"
|
66
|
-
spec.bindir = "bin"
|
67
|
-
spec.executables << "pdf_object"
|
68
|
-
spec.executables << "pdf_text"
|
69
|
-
spec.executables << "pdf_list_callbacks"
|
70
|
-
spec.has_rdoc = true
|
71
|
-
spec.extra_rdoc_files = %w{README.rdoc TODO CHANGELOG MIT-LICENSE }
|
72
|
-
spec.rdoc_options << '--title' << 'PDF::Reader Documentation' <<
|
73
|
-
'--main' << 'README.rdoc' << '-q'
|
74
|
-
spec.author = "James Healy"
|
75
|
-
spec.email = "jimmy@deefa.com"
|
76
|
-
spec.rubyforge_project = "pdf-reader"
|
77
|
-
spec.homepage = "http://github.com/yob/pdf-reader"
|
78
|
-
spec.description = "The PDF::Reader library implements a PDF parser conforming as much as possible to the PDF specification from Adobe"
|
79
|
-
spec.add_dependency('Ascii85', '>=0.9')
|
80
|
-
end
|
81
|
-
|
82
|
-
# package the library into a gem
|
83
|
-
desc "Generate a gem for pdf-reader"
|
84
|
-
Rake::GemPackageTask.new(spec) do |pkg|
|
85
|
-
pkg.need_zip = true
|
86
|
-
pkg.need_tar = true
|
87
|
-
end
|
34
|
+
RoodiTask.new 'roodi', ['lib/**/*.rb']
|
data/TODO
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
v0.8
|
2
|
-
- optimise PDF::Reader::Reference#from_buffer
|
3
|
-
- ruby-prof shows the match() call in this function is a real killer
|
4
2
|
- add extra callbacks
|
5
|
-
- list implemented features
|
3
|
+
- list implemented features
|
6
4
|
- encrypted? tagged? bookmarks? annotated? optimised?
|
7
5
|
- Allow more than just page content and metadata to be parsed (see spec section 3.6.1)
|
8
6
|
- bookmarks?
|
@@ -15,7 +13,6 @@ v0.8
|
|
15
13
|
from the Original encoding to Unicode.
|
16
14
|
- detect when a font's encoding is a CMap (generally used for pre-Unicode, multibyte asian encodings), and display a user friendly error
|
17
15
|
- Improve interpretation of non content stream data (ie metadata). recognise dates, etc
|
18
|
-
- Support Cross Reference Streams (spec 3.4.7)
|
19
16
|
- Fix inheritance of page attributes. Resources has been done, but plenty of other attributes
|
20
17
|
are inheritable. See table 3.2.7 in the spec
|
21
18
|
|
@@ -33,15 +30,16 @@ Sometime
|
|
33
30
|
- Ship some extra receivers in the standard package, particularly ones that are useful for running
|
34
31
|
rspec over generated PDF files
|
35
32
|
|
36
|
-
- When we encounter Identity-H encoded text with no ToUnicode CMap, render the glyphs and treat them as images, as there's no
|
33
|
+
- When we encounter Identity-H encoded text with no ToUnicode CMap, render the glyphs and treat them as images, as there's no
|
37
34
|
sensible way to convert them to unicode
|
38
35
|
|
39
36
|
- Add support for additional filters: ASCIIHexDecode, ASCII85Decode, LZWDecode, RunLengthDecode, CCITTFaxDecode, JBIG2Decode, DCTDecode, JPXDecode, Crypt?
|
40
37
|
|
41
|
-
- Add support for additional encodings:
|
42
|
-
- PDFDocEncoding
|
38
|
+
- Add support for additional encodings:
|
43
39
|
- Identity-V(I *think* this relates to vertical text. Not sure how we'd support it sensibly)
|
44
40
|
|
45
41
|
- Investigate how R->L text is handled
|
46
42
|
|
47
|
-
-
|
43
|
+
- fix all callbacks to only ever return basic ruby objects (strings, ints,
|
44
|
+
arrays, symbols, hashes, etc). No PDF::Reader::Reference or
|
45
|
+
PDF::Reader::Font, etc.
|
data/bin/pdf_object
CHANGED
data/bin/pdf_text
CHANGED
@@ -17,9 +17,11 @@ class PageTextReceiver
|
|
17
17
|
end
|
18
18
|
end
|
19
19
|
|
20
|
-
def show_text(
|
20
|
+
def show_text(*params)
|
21
21
|
@content = "" if @content.nil?
|
22
|
-
|
22
|
+
params.each do |str|
|
23
|
+
@content << str.to_s
|
24
|
+
end
|
23
25
|
end
|
24
26
|
|
25
27
|
# there's a few text callbacks, so make sure we process them all
|
@@ -0,0 +1,108 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
# This demonstrates a way to extract some images (those based on the JPG or
|
4
|
+
# TIFF formats) from a PDF. There are other ways to store images, so
|
5
|
+
# it may need to be expanded for real world usage, but it should serve
|
6
|
+
# as a good guide.
|
7
|
+
#
|
8
|
+
# Thanks to Jack Rusher for the initial version of this example.
|
9
|
+
#
|
10
|
+
# USAGE:
|
11
|
+
#
|
12
|
+
# ruby extract_images.rb somefile.pdf
|
13
|
+
|
14
|
+
require 'pdf/reader'
|
15
|
+
|
16
|
+
module ExtractImages
|
17
|
+
|
18
|
+
class Receiver
|
19
|
+
attr_reader :count
|
20
|
+
|
21
|
+
def initialize
|
22
|
+
@count = 0
|
23
|
+
end
|
24
|
+
|
25
|
+
def resource_xobject(name, stream)
|
26
|
+
return unless stream.hash[:Subtype] == :Image
|
27
|
+
increment_count
|
28
|
+
|
29
|
+
case stream.hash[:Filter]
|
30
|
+
when :CCITTFaxDecode
|
31
|
+
ExtractImages::Tiff.new(stream).save("#{count}-#{name}.tif")
|
32
|
+
when :DCTDecode
|
33
|
+
ExtractImages::Jpg.new(stream).save("#{count}-#{name}.jpg")
|
34
|
+
else
|
35
|
+
$stderr.puts "unrecognized image filter '#{stream.hash[:Filter]}'!"
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
def increment_count
|
40
|
+
@count += 1
|
41
|
+
end
|
42
|
+
private :increment_count
|
43
|
+
|
44
|
+
end
|
45
|
+
|
46
|
+
class Jpg
|
47
|
+
attr_reader :stream
|
48
|
+
|
49
|
+
def initialize(stream)
|
50
|
+
@stream = stream
|
51
|
+
end
|
52
|
+
|
53
|
+
def save(filename)
|
54
|
+
w = stream.hash[:Width]
|
55
|
+
h = stream.hash[:Height]
|
56
|
+
puts "#{filename}: h=#{h}, w=#{w}"
|
57
|
+
File.open(filename, "wb") { |file| file.write stream.data }
|
58
|
+
end
|
59
|
+
end
|
60
|
+
|
61
|
+
class Tiff
|
62
|
+
attr_reader :stream
|
63
|
+
|
64
|
+
def initialize(stream)
|
65
|
+
@stream = stream
|
66
|
+
end
|
67
|
+
|
68
|
+
def save(filename)
|
69
|
+
if stream.hash[:DecodeParms][:K] <= 0
|
70
|
+
save_group_four(filename)
|
71
|
+
else
|
72
|
+
$stderr.puts "#{filename}: CCITT non-group 4/2D image."
|
73
|
+
end
|
74
|
+
end
|
75
|
+
|
76
|
+
private
|
77
|
+
|
78
|
+
# Group 4, 2D
|
79
|
+
def save_group_four(filename)
|
80
|
+
k = stream.hash[:DecodeParms][:K]
|
81
|
+
h = stream.hash[:Height]
|
82
|
+
w = stream.hash[:Width]
|
83
|
+
bpc = stream.hash[:BitsPerComponent]
|
84
|
+
mask = stream.hash[:ImageMask]
|
85
|
+
len = stream.hash[:Length]
|
86
|
+
cols = stream.hash[:DecodeParms][:Columns]
|
87
|
+
puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, mask=#{mask}, len=#{len}, cols=#{cols}, k=#{k}"
|
88
|
+
|
89
|
+
# Synthesize a TIFF header
|
90
|
+
long_tag = lambda {|tag, value| [ tag, 4, 1, value ].pack( "ssII" ) }
|
91
|
+
short_tag = lambda {|tag, value| [ tag, 3, 1, value ].pack( "ssII" ) }
|
92
|
+
# header = byte order, version magic, offset of directory, directory count,
|
93
|
+
# followed by a series of tags containing metadata: 259 is a magic number for
|
94
|
+
# the compression type; 273 is the offset of the image data.
|
95
|
+
tiff = [ 73, 73, 42, 8, 5 ].pack("ccsIs") \
|
96
|
+
+ short_tag.call( 256, cols ) \
|
97
|
+
+ short_tag.call( 257, h ) \
|
98
|
+
+ short_tag.call( 259, 4 ) \
|
99
|
+
+ long_tag.call( 273, (10 + (5*12)) ) \
|
100
|
+
+ long_tag.call( 279, len) \
|
101
|
+
+ stream.data
|
102
|
+
File.open(filename, "wb") { |file| file.write tiff }
|
103
|
+
end
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
receiver = ExtractImages::Receiver.new
|
108
|
+
PDF::Reader.file(ARGV[0], receiver)
|
data/examples/hash.rb
CHANGED
data/examples/text.rb
CHANGED
data/lib/pdf/hash.rb
CHANGED
@@ -1,232 +1,15 @@
|
|
1
|
-
|
2
|
-
# Provides low level access to the objects in a PDF file via a hash-like
|
3
|
-
# object.
|
4
|
-
#
|
5
|
-
# A PDF file can be viewed as a large hash map. It is a series of objects
|
6
|
-
# stored at an exact byte offsets, and a table that maps object IDs to byte
|
7
|
-
# offsets. Given an object ID, looking up an object is an O(1) operation.
|
8
|
-
#
|
9
|
-
# Each PDF object can be mapped to a ruby object, so by passing an object
|
10
|
-
# ID to the [] method, a ruby representation of that object will be
|
11
|
-
# retrieved.
|
12
|
-
#
|
13
|
-
# The class behaves much like a standard Ruby hash, including the use of
|
14
|
-
# the Enumerable mixin. The key difference is no []= method - the hash
|
15
|
-
# is read only.
|
16
|
-
#
|
17
|
-
# == Basic Usage
|
18
|
-
#
|
19
|
-
# h = PDF::Hash.new("somefile.pdf")
|
20
|
-
# h[1]
|
21
|
-
# => 3469
|
22
|
-
#
|
23
|
-
# h[PDF::Reader::Reference.new(1,0)]
|
24
|
-
# => 3469
|
25
|
-
#
|
26
|
-
class Hash
|
27
|
-
include Enumerable
|
28
|
-
|
29
|
-
attr_accessor :default
|
30
|
-
attr_reader :trailer, :version
|
1
|
+
# coding: utf-8
|
31
2
|
|
32
|
-
|
33
|
-
|
34
|
-
#
|
3
|
+
module PDF
|
4
|
+
class Hash < PDF::Reader::ObjectHash # :nodoc:
|
35
5
|
def initialize(input)
|
36
|
-
|
37
|
-
|
38
|
-
elsif File.file?(input.to_s)
|
39
|
-
if File.respond_to?(:binread)
|
40
|
-
input = File.binread(input.to_s)
|
41
|
-
else
|
42
|
-
input = File.read(input.to_s)
|
43
|
-
end
|
44
|
-
io = StringIO.new(input)
|
45
|
-
else
|
46
|
-
raise ArgumentError, "input must be an IO-like object or a filename"
|
47
|
-
end
|
48
|
-
@version = read_version(io)
|
49
|
-
@xref = PDF::Reader::XRef.new(io)
|
50
|
-
@trailer = @xref.load
|
51
|
-
end
|
52
|
-
|
53
|
-
# Access an object from the PDF. key can be an int or a PDF::Reader::Reference
|
54
|
-
# object.
|
55
|
-
#
|
56
|
-
# If an int is used, the object with that ID and a generation number of 0 will
|
57
|
-
# be returned.
|
58
|
-
#
|
59
|
-
# If a PDF::Reader::Reference object is used the exact ID and generation number
|
60
|
-
# can be specified.
|
61
|
-
#
|
62
|
-
def [](key)
|
63
|
-
return default if key.to_i <= 0
|
64
|
-
|
65
|
-
begin
|
66
|
-
unless key.kind_of?(PDF::Reader::Reference)
|
67
|
-
key = PDF::Reader::Reference.new(key.to_i, 0)
|
68
|
-
end
|
69
|
-
@xref.object(key)
|
70
|
-
rescue
|
71
|
-
return default
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
# Access an object from the PDF. key can be an int or a PDF::Reader::Reference
|
76
|
-
# object.
|
77
|
-
#
|
78
|
-
# If an int is used, the object with that ID and a generation number of 0 will
|
79
|
-
# be returned.
|
80
|
-
#
|
81
|
-
# If a PDF::Reader::Reference object is used the exact ID and generation number
|
82
|
-
# can be specified.
|
83
|
-
#
|
84
|
-
# local_deault is the object that will be returned if the requested key doesn't
|
85
|
-
# exist.
|
86
|
-
#
|
87
|
-
def fetch(key, local_default = nil)
|
88
|
-
obj = self[key]
|
89
|
-
if obj
|
90
|
-
return obj
|
91
|
-
elsif local_default
|
92
|
-
return local_default
|
93
|
-
else
|
94
|
-
raise IndexError, "#{key} is invalid" if key.to_i <= 0
|
95
|
-
end
|
96
|
-
end
|
97
|
-
|
98
|
-
# iterate over each key, value. Just like a ruby hash.
|
99
|
-
#
|
100
|
-
def each(&block)
|
101
|
-
@xref.each do |ref, obj|
|
102
|
-
yield ref, obj
|
103
|
-
end
|
104
|
-
end
|
105
|
-
alias :each_pair :each
|
106
|
-
|
107
|
-
# iterate over each key. Just like a ruby hash.
|
108
|
-
#
|
109
|
-
def each_key(&block)
|
110
|
-
each do |id, obj|
|
111
|
-
yield id
|
112
|
-
end
|
113
|
-
end
|
114
|
-
|
115
|
-
# iterate over each value. Just like a ruby hash.
|
116
|
-
#
|
117
|
-
def each_value(&block)
|
118
|
-
each do |id, obj|
|
119
|
-
yield obj
|
120
|
-
end
|
121
|
-
end
|
122
|
-
|
123
|
-
# return the number of objects in the file. An object with multiple generations
|
124
|
-
# is counted once.
|
125
|
-
def size
|
126
|
-
@xref.size
|
127
|
-
end
|
128
|
-
alias :length :size
|
129
|
-
|
130
|
-
# return true if there are no objects in this file
|
131
|
-
#
|
132
|
-
def empty?
|
133
|
-
size == 0 ? true : false
|
134
|
-
end
|
135
|
-
|
136
|
-
# return true if the specified key exists in the file. key
|
137
|
-
# can be an int or a PDF::Reader::Reference
|
138
|
-
#
|
139
|
-
def has_key?(check_key)
|
140
|
-
# TODO update from O(n) to O(1)
|
141
|
-
each_key do |key|
|
142
|
-
if check_key.kind_of?(PDF::Reader::Reference)
|
143
|
-
return true if check_key == key
|
144
|
-
else
|
145
|
-
return true if check_key.to_i == key.id
|
146
|
-
end
|
147
|
-
end
|
148
|
-
return false
|
6
|
+
warn "DEPRECATION NOTICE: PDF::Hash has been deprecated, use PDF::Reader::ObjectHash instead"
|
7
|
+
super
|
149
8
|
end
|
150
|
-
alias :include? :has_key?
|
151
|
-
alias :key? :has_key?
|
152
|
-
alias :member? :has_key?
|
153
9
|
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
# TODO update from O(n) to O(1)
|
158
|
-
each_value do |obj|
|
159
|
-
return true if obj == value
|
160
|
-
end
|
161
|
-
return false
|
10
|
+
def version
|
11
|
+
warn "DEPRECATION NOTICE: PDF::Hash#version has been deprecated, use PDF::Reader::ObjectHash#pdf_version instead"
|
12
|
+
pdf_version
|
162
13
|
end
|
163
|
-
alias :value? :has_key?
|
164
|
-
|
165
|
-
def to_s
|
166
|
-
"<PDF::Hash size: #{self.size}>"
|
167
|
-
end
|
168
|
-
|
169
|
-
# return an array of all keys in the file
|
170
|
-
#
|
171
|
-
def keys
|
172
|
-
ret = []
|
173
|
-
each_key { |k| ret << k }
|
174
|
-
ret
|
175
|
-
end
|
176
|
-
|
177
|
-
# return an array of all values in the file
|
178
|
-
#
|
179
|
-
def values
|
180
|
-
ret = []
|
181
|
-
each_value { |v| ret << v }
|
182
|
-
ret
|
183
|
-
end
|
184
|
-
|
185
|
-
# return an array of all values from the specified keys
|
186
|
-
#
|
187
|
-
def values_at(*ids)
|
188
|
-
ids.map { |id| self[id] }
|
189
|
-
end
|
190
|
-
|
191
|
-
# return an array of arrays. Each sub array contains a key/value pair.
|
192
|
-
#
|
193
|
-
def to_a
|
194
|
-
ret = []
|
195
|
-
each do |id, obj|
|
196
|
-
ret << [id, obj]
|
197
|
-
end
|
198
|
-
ret
|
199
|
-
end
|
200
|
-
|
201
|
-
# returns an array of PDF::Reader::References. Each reference in the
|
202
|
-
# array points a Page object, one for each page in the PDF. The first
|
203
|
-
# reference is page 1, second reference is page 2, etc.
|
204
|
-
#
|
205
|
-
def page_references
|
206
|
-
root = fetch(trailer[:Root])
|
207
|
-
@page_references ||= get_page_objects(root[:Pages]).flatten
|
208
|
-
end
|
209
|
-
|
210
|
-
private
|
211
|
-
|
212
|
-
# returns a nested array of object references for all pages in this object store.
|
213
|
-
#
|
214
|
-
def get_page_objects(ref)
|
215
|
-
obj = fetch(ref)
|
216
|
-
|
217
|
-
if obj[:Type] == :Page
|
218
|
-
ref
|
219
|
-
elsif obj[:Type] == :Pages
|
220
|
-
obj[:Kids].map { |kid| get_page_objects(kid) }
|
221
|
-
end
|
222
|
-
end
|
223
|
-
|
224
|
-
def read_version(io)
|
225
|
-
io.seek(0)
|
226
|
-
m, version = *io.read(10).match(/PDF-(\d.\d)/)
|
227
|
-
io.seek(0)
|
228
|
-
version
|
229
|
-
end
|
230
|
-
|
231
14
|
end
|
232
15
|
end
|