pdf-reader 0.11.0.alpha → 0.12.0.alpha
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +17 -1
- data/README.rdoc +31 -1
- data/bin/pdf_list_callbacks +2 -0
- data/examples/callbacks.rb +2 -1
- data/examples/extract_bates.rb +3 -2
- data/examples/extract_images.rb +146 -23
- data/examples/hash.rb +5 -5
- data/examples/metadata.rb +5 -16
- data/examples/page_count.rb +13 -0
- data/examples/rspec.rb +17 -41
- data/examples/text.rb +4 -29
- data/examples/version.rb +3 -15
- data/lib/pdf/reader.rb +45 -27
- data/lib/pdf/reader/encoding.rb +3 -3
- data/lib/pdf/reader/error.rb +1 -0
- data/lib/pdf/reader/filter.rb +64 -9
- data/lib/pdf/reader/font.rb +0 -17
- data/lib/pdf/reader/form_xobject.rb +83 -0
- data/lib/pdf/reader/glyph_hash.rb +88 -0
- data/lib/pdf/reader/glyphlist.txt +1 -1
- data/lib/pdf/reader/object_hash.rb +42 -12
- data/lib/pdf/reader/page.rb +63 -17
- data/lib/pdf/reader/page_text_receiver.rb +38 -4
- data/lib/pdf/reader/standard_security_handler.rb +186 -0
- data/lib/pdf/reader/stream.rb +2 -2
- metadata +39 -9
- data/examples/page_counter_improved.rb +0 -23
- data/examples/page_counter_naive.rb +0 -24
data/CHANGELOG
CHANGED
@@ -1,4 +1,20 @@
|
|
1
|
-
v0.
|
1
|
+
v0.12.0.alpha (28th August 2011)
|
2
|
+
- small breaking changes to the page-based API - it's alpha for a reason
|
3
|
+
- resource related methods on Page object return raw PDF objects
|
4
|
+
- if the caller wants the resources wrapped in a more convenient
|
5
|
+
Ruby object (like PDF::Reader::Font or PDF::Reader::FormXObject) they will
|
6
|
+
need to do so themselves
|
7
|
+
- add support for RunLengthDecode filters (thanks Bernerd Schaefer)
|
8
|
+
- add support for standard PDF encryption (thanks Evan Brunner)
|
9
|
+
- add support for decoding streams with TIFF prediction
|
10
|
+
- new PDF::Reader::FormXObject class to simplify working with form XObjects
|
11
|
+
|
12
|
+
v0.11.0.alpha (19th July 2011)
|
13
|
+
- introduce experimental new page-based API
|
14
|
+
- old API is deprecated but will continue to work with no warnings
|
15
|
+
- add transparent caching of common objects to ObjectHash
|
16
|
+
|
17
|
+
v0.10.0 (6th July 2011)
|
2
18
|
- support multiple receivers within a single pass over a source file
|
3
19
|
- massive time saving when dealing with multiple receivers
|
4
20
|
|
data/README.rdoc
CHANGED
@@ -8,6 +8,11 @@ The PDF 1.7 specification is a weighty document and not all aspects are
|
|
8
8
|
currently supported. I welcome submission of PDF files that exhibit
|
9
9
|
unsupported aspects of the spec to assist with improving our support.
|
10
10
|
|
11
|
+
This is primarily a low-level library that should be used as the foundation for
|
12
|
+
higher level functionality - it's not going to render a PDF for you. There are
|
13
|
+
a few exceptions to support very common use cases like extracting text from a
|
14
|
+
page.
|
15
|
+
|
11
16
|
= Installation
|
12
17
|
|
13
18
|
The recommended installation method is via Rubygems.
|
@@ -27,6 +32,15 @@ this object.
|
|
27
32
|
puts reader.metadata
|
28
33
|
puts reader.page_count
|
29
34
|
|
35
|
+
PDF::Reader.new can accept an IO stream or a filename. Here's an example with
|
36
|
+
an IO stream:
|
37
|
+
|
38
|
+
require 'open-uri'
|
39
|
+
|
40
|
+
io = open('http://example.com/somefile.pdf')
|
41
|
+
reader = PDF::Reader.new(io)
|
42
|
+
puts reader.info
|
43
|
+
|
30
44
|
PDF is a page based file format, so most visible information is available via
|
31
45
|
page-based iteration
|
32
46
|
|
@@ -34,10 +48,24 @@ page-based iteration
|
|
34
48
|
|
35
49
|
reader.pages.each do |page|
|
36
50
|
puts page.fonts
|
37
|
-
puts page.images
|
38
51
|
puts page.text
|
52
|
+
puts page.raw_content
|
39
53
|
end
|
40
54
|
|
55
|
+
If you need to access the full program for rendering a page, use the walk() method
|
56
|
+
of PDF::Reader::Page.
|
57
|
+
|
58
|
+
class RedGreenBlue
|
59
|
+
def set_rgb_color_for_nonstroking(r, g, b)
|
60
|
+
puts "R: #{r}, G: #{g}, B: #{b}"
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
reader = PDF::Reader.new("somefile.pdf")
|
65
|
+
page = reader.page(1)
|
66
|
+
receiver = RedGreenBlue.new
|
67
|
+
page.walk(receiver)
|
68
|
+
|
41
69
|
For low level access to the objects in a PDF file, use the ObjectHash class. You can
|
42
70
|
build an ObjectHash instance directly:
|
43
71
|
|
@@ -48,6 +76,8 @@ or via a PDF::Reader instance:
|
|
48
76
|
reader = PDF::Reader.new("somefile.pdf")
|
49
77
|
puts reader.objects
|
50
78
|
|
79
|
+
The second method is preferred to increase the effectiveness of internal caching.
|
80
|
+
|
51
81
|
= Text Encoding
|
52
82
|
|
53
83
|
Internally, text can be stored inside a PDF in various encodings, including
|
data/bin/pdf_list_callbacks
CHANGED
data/examples/callbacks.rb
CHANGED
@@ -10,8 +10,9 @@ require 'rubygems'
|
|
10
10
|
require 'pdf/reader'
|
11
11
|
|
12
12
|
receiver = PDF::Reader::RegisterReceiver.new
|
13
|
+
filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-basic.pdf"
|
13
14
|
|
14
|
-
PDF::Reader.open(
|
15
|
+
PDF::Reader.open(filename) do |reader|
|
15
16
|
reader.pages.each do |page|
|
16
17
|
page.walk(receiver)
|
17
18
|
receiver.callbacks.each do |cb|
|
data/examples/extract_bates.rb
CHANGED
@@ -35,13 +35,14 @@ class BatesReceiver
|
|
35
35
|
|
36
36
|
end
|
37
37
|
|
38
|
+
filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-basic.pdf"
|
38
39
|
|
39
|
-
PDF::Reader.open(
|
40
|
+
PDF::Reader.open(filename) do |reader|
|
40
41
|
reader.pages.each do |page|
|
41
42
|
receiver = BatesReceiver.new
|
42
43
|
page.walk(receiver)
|
43
44
|
if receiver.numbers.empty?
|
44
|
-
puts page.scan(/CC.+/)
|
45
|
+
puts page.text.scan(/CC.+/)
|
45
46
|
else
|
46
47
|
puts receiver.numbers.inspect
|
47
48
|
end
|
data/examples/extract_images.rb
CHANGED
@@ -1,46 +1,164 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
|
3
3
|
# This demonstrates a way to extract some images (those based on the JPG or
|
4
|
-
# TIFF formats) from a PDF. There are other ways to store images, so
|
4
|
+
# TIFF formats) from a PDF. There are other ways to store images, so
|
5
5
|
# it may need to be expanded for real world usage, but it should serve
|
6
6
|
# as a good guide.
|
7
7
|
#
|
8
8
|
# Thanks to Jack Rusher for the initial version of this example.
|
9
|
-
#
|
10
|
-
# USAGE:
|
11
|
-
#
|
12
|
-
# ruby extract_images.rb somefile.pdf
|
13
9
|
|
14
10
|
require 'pdf/reader'
|
15
11
|
|
16
12
|
module ExtractImages
|
17
13
|
|
18
|
-
class
|
19
|
-
|
14
|
+
class Extractor
|
15
|
+
|
16
|
+
def page(page)
|
17
|
+
count = 0
|
18
|
+
|
19
|
+
process_resources(page, page.resources, count)
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
20
23
|
|
21
|
-
def
|
22
|
-
@
|
24
|
+
def complete_refs
|
25
|
+
@complete_refs ||= {}
|
23
26
|
end
|
24
27
|
|
25
|
-
def
|
26
|
-
|
27
|
-
|
28
|
+
def process_resources(page, resources, count)
|
29
|
+
xobjects = resources[:XObject]
|
30
|
+
return count if xobjects.nil?
|
31
|
+
|
32
|
+
xobjects.each do |name, stream|
|
33
|
+
next if complete_refs[stream]
|
34
|
+
complete_refs[stream] = true
|
35
|
+
|
36
|
+
stream = page.objects.deref(stream)
|
37
|
+
|
38
|
+
if stream.hash[:Subtype] == :Image
|
39
|
+
count += 1
|
40
|
+
|
41
|
+
case stream.hash[:Filter]
|
42
|
+
when :CCITTFaxDecode then
|
43
|
+
ExtractImages::Tiff.new(stream).save("#{page.number}-#{count}-#{name}.tif")
|
44
|
+
when :DCTDecode then
|
45
|
+
ExtractImages::Jpg.new(stream).save("#{page.number}-#{count}-#{name}.jpg")
|
46
|
+
else
|
47
|
+
ExtractImages::Raw.new(stream).save("#{page.number}-#{count}-#{name}.tif")
|
48
|
+
end
|
49
|
+
elsif stream.hash[:Subtype] == :Form
|
50
|
+
count = process_resources(page, PDF::Reader::FormXObject.new(page, stream).resources, count)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
count
|
54
|
+
end
|
55
|
+
|
56
|
+
end
|
57
|
+
|
58
|
+
class Raw
|
59
|
+
attr_reader :stream
|
28
60
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
61
|
+
def initialize(stream)
|
62
|
+
@stream = stream
|
63
|
+
end
|
64
|
+
|
65
|
+
def save(filename)
|
66
|
+
case @stream.hash[:ColorSpace]
|
67
|
+
when :DeviceCMYK then save_cmyk(filename)
|
68
|
+
when :DeviceGray then save_gray(filename)
|
69
|
+
when :DeviceRGB then save_rgb(filename)
|
34
70
|
else
|
35
|
-
$stderr.puts "
|
71
|
+
$stderr.puts "unsupport color depth #{@stream.hash[:ColorSpace]} #{filename}"
|
36
72
|
end
|
37
73
|
end
|
38
74
|
|
39
|
-
|
40
|
-
|
75
|
+
private
|
76
|
+
|
77
|
+
def save_cmyk(filename)
|
78
|
+
h = stream.hash[:Height]
|
79
|
+
w = stream.hash[:Width]
|
80
|
+
bpc = stream.hash[:BitsPerComponent]
|
81
|
+
len = stream.hash[:Length]
|
82
|
+
puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, len=#{len}"
|
83
|
+
|
84
|
+
# Synthesize a TIFF header
|
85
|
+
long_tag = lambda {|tag, count, value| [ tag, 4, count, value ].pack( "ssII" ) }
|
86
|
+
short_tag = lambda {|tag, count, value| [ tag, 3, count, value ].pack( "ssII" ) }
|
87
|
+
# header = byte order, version magic, offset of directory, directory count,
|
88
|
+
# followed by a series of tags containing metadata.
|
89
|
+
tag_count = 10
|
90
|
+
header = [ 73, 73, 42, 8, tag_count ].pack("ccsIs")
|
91
|
+
tiff = header.dup
|
92
|
+
tiff << short_tag.call( 256, 1, w ) # image width
|
93
|
+
tiff << short_tag.call( 257, 1, h ) # image height
|
94
|
+
tiff << long_tag.call( 258, 4, (header.size + (tag_count*12))) # bits per pixel
|
95
|
+
tiff << short_tag.call( 259, 1, 1 ) # compression
|
96
|
+
tiff << short_tag.call( 262, 1, 5 ) # colorspace - separation
|
97
|
+
tiff << long_tag.call( 273, 1, (10 + (tag_count*12) + 16) ) # data offset
|
98
|
+
tiff << short_tag.call( 277, 1, 4 ) # samples per pixel
|
99
|
+
tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
|
100
|
+
tiff << short_tag.call( 284, 1, 1 ) # planer config
|
101
|
+
tiff << long_tag.call( 332, 1, 1) # inkset - CMYK
|
102
|
+
tiff << [bpc, bpc, bpc, bpc].pack("IIII")
|
103
|
+
tiff << stream.unfiltered_data
|
104
|
+
File.open(filename, "wb") { |file| file.write tiff }
|
105
|
+
end
|
106
|
+
|
107
|
+
def save_gray(filename)
|
108
|
+
h = stream.hash[:Height]
|
109
|
+
w = stream.hash[:Width]
|
110
|
+
bpc = stream.hash[:BitsPerComponent]
|
111
|
+
len = stream.hash[:Length]
|
112
|
+
puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, len=#{len}"
|
113
|
+
|
114
|
+
# Synthesize a TIFF header
|
115
|
+
long_tag = lambda {|tag, count, value| [ tag, 4, count, value ].pack( "ssII" ) }
|
116
|
+
short_tag = lambda {|tag, count, value| [ tag, 3, count, value ].pack( "ssII" ) }
|
117
|
+
# header = byte order, version magic, offset of directory, directory count,
|
118
|
+
# followed by a series of tags containing metadata.
|
119
|
+
tag_count = 9
|
120
|
+
header = [ 73, 73, 42, 8, tag_count ].pack("ccsIs")
|
121
|
+
tiff = header.dup
|
122
|
+
tiff << short_tag.call( 256, 1, w ) # image width
|
123
|
+
tiff << short_tag.call( 257, 1, h ) # image height
|
124
|
+
tiff << short_tag.call( 258, 1, 8 ) # bits per pixel
|
125
|
+
tiff << short_tag.call( 259, 1, 1 ) # compression
|
126
|
+
tiff << short_tag.call( 262, 1, 1 ) # colorspace - grayscale
|
127
|
+
tiff << long_tag.call( 273, 1, (10 + (tag_count*12)) ) # data offset
|
128
|
+
tiff << short_tag.call( 277, 1, 1 ) # samples per pixel
|
129
|
+
tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
|
130
|
+
tiff << short_tag.call( 284, 1, 1 ) # planer config
|
131
|
+
tiff << stream.unfiltered_data
|
132
|
+
File.open(filename, "wb") { |file| file.write tiff }
|
41
133
|
end
|
42
|
-
private :increment_count
|
43
134
|
|
135
|
+
def save_rgb(filename)
|
136
|
+
h = stream.hash[:Height]
|
137
|
+
w = stream.hash[:Width]
|
138
|
+
bpc = stream.hash[:BitsPerComponent]
|
139
|
+
len = stream.hash[:Length]
|
140
|
+
puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, len=#{len}"
|
141
|
+
|
142
|
+
# Synthesize a TIFF header
|
143
|
+
long_tag = lambda {|tag, count, value| [ tag, 4, count, value ].pack( "ssII" ) }
|
144
|
+
short_tag = lambda {|tag, count, value| [ tag, 3, count, value ].pack( "ssII" ) }
|
145
|
+
# header = byte order, version magic, offset of directory, directory count,
|
146
|
+
# followed by a series of tags containing metadata.
|
147
|
+
tag_count = 8
|
148
|
+
header = [ 73, 73, 42, 8, tag_count ].pack("ccsIs")
|
149
|
+
tiff = header.dup
|
150
|
+
tiff << short_tag.call( 256, 1, w ) # image width
|
151
|
+
tiff << short_tag.call( 257, 1, h ) # image height
|
152
|
+
tiff << long_tag.call( 258, 3, (header.size + (tag_count*12))) # bits per pixel
|
153
|
+
tiff << short_tag.call( 259, 1, 1 ) # compression
|
154
|
+
tiff << short_tag.call( 262, 1, 2 ) # colorspace - RGB
|
155
|
+
tiff << long_tag.call( 273, 1, (header.size + (tag_count*12) + 12) ) # data offset
|
156
|
+
tiff << short_tag.call( 277, 1, 3 ) # samples per pixel
|
157
|
+
tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
|
158
|
+
tiff << [bpc, bpc, bpc].pack("III")
|
159
|
+
tiff << stream.unfiltered_data
|
160
|
+
File.open(filename, "wb") { |file| file.write tiff }
|
161
|
+
end
|
44
162
|
end
|
45
163
|
|
46
164
|
class Jpg
|
@@ -104,5 +222,10 @@ module ExtractImages
|
|
104
222
|
end
|
105
223
|
end
|
106
224
|
|
107
|
-
|
108
|
-
|
225
|
+
filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/adobe_sample.pdf"
|
226
|
+
extractor = ExtractImages::Extractor.new
|
227
|
+
|
228
|
+
PDF::Reader.open(filename) do |reader|
|
229
|
+
page = reader.page(1)
|
230
|
+
extractor.page(page)
|
231
|
+
end
|
data/examples/hash.rb
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
# coding: utf-8
|
3
3
|
|
4
4
|
# get direct access to PDF objects
|
5
|
-
#
|
6
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../lib')
|
7
5
|
|
8
6
|
require 'pdf/reader'
|
9
7
|
|
10
|
-
filename = File.dirname(__FILE__) + "/../
|
11
|
-
|
12
|
-
|
8
|
+
filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-unicode.pdf"
|
9
|
+
|
10
|
+
reader = PDF::Reader.new(filename)
|
11
|
+
puts reader.objects[3]
|
12
|
+
puts reader.objects[4]
|
data/examples/metadata.rb
CHANGED
@@ -1,25 +1,14 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
3
2
|
# coding: utf-8
|
3
|
+
|
4
4
|
# Extract metadata only
|
5
5
|
|
6
6
|
require 'rubygems'
|
7
7
|
require 'pdf/reader'
|
8
8
|
|
9
|
-
|
10
|
-
attr_accessor :regular
|
11
|
-
attr_accessor :xml
|
9
|
+
filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cross_ref_stream.pdf"
|
12
10
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
def metadata_xml(data)
|
18
|
-
@xml = data
|
19
|
-
end
|
11
|
+
PDF::Reader.open(filename) do |reader|
|
12
|
+
puts reader.info.inspect
|
13
|
+
puts reader.metadata.inspect
|
20
14
|
end
|
21
|
-
|
22
|
-
receiver = MetaDataReceiver.new
|
23
|
-
pdf = PDF::Reader.file(ARGV.shift, receiver, :pages => false, :metadata => true)
|
24
|
-
puts receiver.regular.inspect
|
25
|
-
puts receiver.xml.inspect
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# A simple app to count the number of pages in a PDF File.
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
require 'pdf/reader'
|
8
|
+
|
9
|
+
filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cross_ref_stream.pdf"
|
10
|
+
|
11
|
+
PDF::Reader.open(filename) do |reader|
|
12
|
+
puts "#{reader.page_count} page(s)"
|
13
|
+
end
|
data/examples/rspec.rb
CHANGED
@@ -2,56 +2,32 @@
|
|
2
2
|
# coding: utf-8
|
3
3
|
|
4
4
|
# Basic RSpec of a generated PDF
|
5
|
+
#
|
6
|
+
# USAGE: rspec -c examples/rspec.rb
|
5
7
|
|
6
8
|
require 'rubygems'
|
7
9
|
require 'pdf/reader'
|
8
|
-
require '
|
9
|
-
require '
|
10
|
+
require 'rspec'
|
11
|
+
require 'prawn'
|
12
|
+
require 'stringio'
|
10
13
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
def initialize
|
15
|
-
@content = []
|
16
|
-
end
|
17
|
-
|
18
|
-
# Called when page parsing starts
|
19
|
-
def begin_page(arg = nil)
|
20
|
-
@content << ""
|
21
|
-
end
|
22
|
-
|
23
|
-
def show_text(string, *params)
|
24
|
-
@content.last << string.strip
|
25
|
-
end
|
26
|
-
|
27
|
-
# there's a few text callbacks, so make sure we process them all
|
28
|
-
alias :super_show_text :show_text
|
29
|
-
alias :move_to_next_line_and_show_text :show_text
|
30
|
-
alias :set_spacing_next_line_show_text :show_text
|
31
|
-
|
32
|
-
def show_text_with_positioning(*params)
|
33
|
-
params = params.first
|
34
|
-
params.each { |str| show_text(str) if str.kind_of?(String)}
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
context "My generated PDF" do
|
39
|
-
specify "should have the correct text on 2 pages" do
|
14
|
+
describe "My generated PDF" do
|
15
|
+
it "should have the correct text on 2 pages" do
|
40
16
|
|
41
17
|
# generate our PDF
|
42
|
-
pdf =
|
43
|
-
pdf.text "Chunky"
|
18
|
+
pdf = Prawn::Document.new
|
19
|
+
pdf.text "Chunky"
|
44
20
|
pdf.start_new_page
|
45
|
-
pdf.text "Bacon"
|
46
|
-
|
21
|
+
pdf.text "Bacon"
|
22
|
+
io = StringIO.new(pdf.render)
|
47
23
|
|
48
24
|
# process the PDF
|
49
|
-
|
50
|
-
|
25
|
+
PDF::Reader.open(io) do |reader|
|
26
|
+
reader.page_count.should eql(2) # correct page count
|
27
|
+
|
28
|
+
reader.page(1).text.should eql("Chunky") # correct content
|
29
|
+
reader.page(2).text.should eql("Bacon") # correct content
|
30
|
+
end
|
51
31
|
|
52
|
-
# confirm the text appears on the correct pages
|
53
|
-
receiver.content.size.should eql(2)
|
54
|
-
receiver.content[0].should eql("Chunky")
|
55
|
-
receiver.content[1].should eql("Bacon")
|
56
32
|
end
|
57
33
|
end
|