pdf-reader 0.11.0.alpha → 0.12.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +17 -1
- data/README.rdoc +31 -1
- data/bin/pdf_list_callbacks +2 -0
- data/examples/callbacks.rb +2 -1
- data/examples/extract_bates.rb +3 -2
- data/examples/extract_images.rb +146 -23
- data/examples/hash.rb +5 -5
- data/examples/metadata.rb +5 -16
- data/examples/page_count.rb +13 -0
- data/examples/rspec.rb +17 -41
- data/examples/text.rb +4 -29
- data/examples/version.rb +3 -15
- data/lib/pdf/reader.rb +45 -27
- data/lib/pdf/reader/encoding.rb +3 -3
- data/lib/pdf/reader/error.rb +1 -0
- data/lib/pdf/reader/filter.rb +64 -9
- data/lib/pdf/reader/font.rb +0 -17
- data/lib/pdf/reader/form_xobject.rb +83 -0
- data/lib/pdf/reader/glyph_hash.rb +88 -0
- data/lib/pdf/reader/glyphlist.txt +1 -1
- data/lib/pdf/reader/object_hash.rb +42 -12
- data/lib/pdf/reader/page.rb +63 -17
- data/lib/pdf/reader/page_text_receiver.rb +38 -4
- data/lib/pdf/reader/standard_security_handler.rb +186 -0
- data/lib/pdf/reader/stream.rb +2 -2
- metadata +39 -9
- data/examples/page_counter_improved.rb +0 -23
- data/examples/page_counter_naive.rb +0 -24
data/CHANGELOG
CHANGED
@@ -1,4 +1,20 @@
|
|
1
|
-
v0.
|
1
|
+
v0.12.0.alpha (28th August 2011)
|
2
|
+
- small breaking changes to the page-based API - it's alpha for a reason
|
3
|
+
- resource related methods on Page object return raw PDF objects
|
4
|
+
- if the caller wants the resources wrapped in a more convenient
|
5
|
+
Ruby object (like PDF::Reader::Font or PDF::Reader::FormXObject) they will
|
6
|
+
need to do so themselves
|
7
|
+
- add support for RunLengthDecode filters (thanks Bernerd Schaefer)
|
8
|
+
- add support for standard PDF encryption (thanks Evan Brunner)
|
9
|
+
- add support for decoding streams with TIFF prediction
|
10
|
+
- new PDF::Reader::FormXObject class to simplify working with form XObjects
|
11
|
+
|
12
|
+
v0.11.0.alpha (19th July 2011)
|
13
|
+
- introduce experimental new page-based API
|
14
|
+
- old API is deprecated but will continue to work with no warnings
|
15
|
+
- add transparent caching of common objects to ObjectHash
|
16
|
+
|
17
|
+
v0.10.0 (6th July 2011)
|
2
18
|
- support multiple receivers within a single pass over a source file
|
3
19
|
- massive time saving when dealing with multiple receivers
|
4
20
|
|
data/README.rdoc
CHANGED
@@ -8,6 +8,11 @@ The PDF 1.7 specification is a weighty document and not all aspects are
|
|
8
8
|
currently supported. I welcome submission of PDF files that exhibit
|
9
9
|
unsupported aspects of the spec to assist with improving our support.
|
10
10
|
|
11
|
+
This is primarily a low-level library that should be used as the foundation for
|
12
|
+
higher level functionality - it's not going to render a PDF for you. There are
|
13
|
+
a few exceptions to support very common use cases like extracting text from a
|
14
|
+
page.
|
15
|
+
|
11
16
|
= Installation
|
12
17
|
|
13
18
|
The recommended installation method is via Rubygems.
|
@@ -27,6 +32,15 @@ this object.
|
|
27
32
|
puts reader.metadata
|
28
33
|
puts reader.page_count
|
29
34
|
|
35
|
+
PDF::Reader.new can accept an IO stream or a filename. Here's an example with
|
36
|
+
an IO stream:
|
37
|
+
|
38
|
+
require 'open-uri'
|
39
|
+
|
40
|
+
io = open('http://example.com/somefile.pdf')
|
41
|
+
reader = PDF::Reader.new(io)
|
42
|
+
puts reader.info
|
43
|
+
|
30
44
|
PDF is a page based file format, so most visible information is available via
|
31
45
|
page-based iteration
|
32
46
|
|
@@ -34,10 +48,24 @@ page-based iteration
|
|
34
48
|
|
35
49
|
reader.pages.each do |page|
|
36
50
|
puts page.fonts
|
37
|
-
puts page.images
|
38
51
|
puts page.text
|
52
|
+
puts page.raw_content
|
39
53
|
end
|
40
54
|
|
55
|
+
If you need to access the full program for rendering a page, use the walk() method
|
56
|
+
of PDF::Reader::Page.
|
57
|
+
|
58
|
+
class RedGreenBlue
|
59
|
+
def set_rgb_color_for_nonstroking(r, g, b)
|
60
|
+
puts "R: #{r}, G: #{g}, B: #{b}"
|
61
|
+
end
|
62
|
+
end
|
63
|
+
|
64
|
+
reader = PDF::Reader.new("somefile.pdf")
|
65
|
+
page = reader.page(1)
|
66
|
+
receiver = RedGreenBlue.new
|
67
|
+
page.walk(receiver)
|
68
|
+
|
41
69
|
For low level access to the objects in a PDF file, use the ObjectHash class. You can
|
42
70
|
build an ObjectHash instance directly:
|
43
71
|
|
@@ -48,6 +76,8 @@ or via a PDF::Reader instance:
|
|
48
76
|
reader = PDF::Reader.new("somefile.pdf")
|
49
77
|
puts reader.objects
|
50
78
|
|
79
|
+
The second method is preferred to increase the effectiveness of internal caching.
|
80
|
+
|
51
81
|
= Text Encoding
|
52
82
|
|
53
83
|
Internally, text can be stored inside a PDF in various encodings, including
|
data/bin/pdf_list_callbacks
CHANGED
data/examples/callbacks.rb
CHANGED
@@ -10,8 +10,9 @@ require 'rubygems'
|
|
10
10
|
require 'pdf/reader'
|
11
11
|
|
12
12
|
receiver = PDF::Reader::RegisterReceiver.new
|
13
|
+
filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-basic.pdf"
|
13
14
|
|
14
|
-
PDF::Reader.open(
|
15
|
+
PDF::Reader.open(filename) do |reader|
|
15
16
|
reader.pages.each do |page|
|
16
17
|
page.walk(receiver)
|
17
18
|
receiver.callbacks.each do |cb|
|
data/examples/extract_bates.rb
CHANGED
@@ -35,13 +35,14 @@ class BatesReceiver
|
|
35
35
|
|
36
36
|
end
|
37
37
|
|
38
|
+
filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-basic.pdf"
|
38
39
|
|
39
|
-
PDF::Reader.open(
|
40
|
+
PDF::Reader.open(filename) do |reader|
|
40
41
|
reader.pages.each do |page|
|
41
42
|
receiver = BatesReceiver.new
|
42
43
|
page.walk(receiver)
|
43
44
|
if receiver.numbers.empty?
|
44
|
-
puts page.scan(/CC.+/)
|
45
|
+
puts page.text.scan(/CC.+/)
|
45
46
|
else
|
46
47
|
puts receiver.numbers.inspect
|
47
48
|
end
|
data/examples/extract_images.rb
CHANGED
@@ -1,46 +1,164 @@
|
|
1
1
|
# coding: utf-8
|
2
2
|
|
3
3
|
# This demonstrates a way to extract some images (those based on the JPG or
|
4
|
-
# TIFF formats) from a PDF. There are other ways to store images, so
|
4
|
+
# TIFF formats) from a PDF. There are other ways to store images, so
|
5
5
|
# it may need to be expanded for real world usage, but it should serve
|
6
6
|
# as a good guide.
|
7
7
|
#
|
8
8
|
# Thanks to Jack Rusher for the initial version of this example.
|
9
|
-
#
|
10
|
-
# USAGE:
|
11
|
-
#
|
12
|
-
# ruby extract_images.rb somefile.pdf
|
13
9
|
|
14
10
|
require 'pdf/reader'
|
15
11
|
|
16
12
|
module ExtractImages
|
17
13
|
|
18
|
-
class
|
19
|
-
|
14
|
+
class Extractor
|
15
|
+
|
16
|
+
def page(page)
|
17
|
+
count = 0
|
18
|
+
|
19
|
+
process_resources(page, page.resources, count)
|
20
|
+
end
|
21
|
+
|
22
|
+
private
|
20
23
|
|
21
|
-
def
|
22
|
-
@
|
24
|
+
def complete_refs
|
25
|
+
@complete_refs ||= {}
|
23
26
|
end
|
24
27
|
|
25
|
-
def
|
26
|
-
|
27
|
-
|
28
|
+
def process_resources(page, resources, count)
|
29
|
+
xobjects = resources[:XObject]
|
30
|
+
return count if xobjects.nil?
|
31
|
+
|
32
|
+
xobjects.each do |name, stream|
|
33
|
+
next if complete_refs[stream]
|
34
|
+
complete_refs[stream] = true
|
35
|
+
|
36
|
+
stream = page.objects.deref(stream)
|
37
|
+
|
38
|
+
if stream.hash[:Subtype] == :Image
|
39
|
+
count += 1
|
40
|
+
|
41
|
+
case stream.hash[:Filter]
|
42
|
+
when :CCITTFaxDecode then
|
43
|
+
ExtractImages::Tiff.new(stream).save("#{page.number}-#{count}-#{name}.tif")
|
44
|
+
when :DCTDecode then
|
45
|
+
ExtractImages::Jpg.new(stream).save("#{page.number}-#{count}-#{name}.jpg")
|
46
|
+
else
|
47
|
+
ExtractImages::Raw.new(stream).save("#{page.number}-#{count}-#{name}.tif")
|
48
|
+
end
|
49
|
+
elsif stream.hash[:Subtype] == :Form
|
50
|
+
count = process_resources(page, PDF::Reader::FormXObject.new(page, stream).resources, count)
|
51
|
+
end
|
52
|
+
end
|
53
|
+
count
|
54
|
+
end
|
55
|
+
|
56
|
+
end
|
57
|
+
|
58
|
+
class Raw
|
59
|
+
attr_reader :stream
|
28
60
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
61
|
+
def initialize(stream)
|
62
|
+
@stream = stream
|
63
|
+
end
|
64
|
+
|
65
|
+
def save(filename)
|
66
|
+
case @stream.hash[:ColorSpace]
|
67
|
+
when :DeviceCMYK then save_cmyk(filename)
|
68
|
+
when :DeviceGray then save_gray(filename)
|
69
|
+
when :DeviceRGB then save_rgb(filename)
|
34
70
|
else
|
35
|
-
$stderr.puts "
|
71
|
+
$stderr.puts "unsupport color depth #{@stream.hash[:ColorSpace]} #{filename}"
|
36
72
|
end
|
37
73
|
end
|
38
74
|
|
39
|
-
|
40
|
-
|
75
|
+
private
|
76
|
+
|
77
|
+
def save_cmyk(filename)
|
78
|
+
h = stream.hash[:Height]
|
79
|
+
w = stream.hash[:Width]
|
80
|
+
bpc = stream.hash[:BitsPerComponent]
|
81
|
+
len = stream.hash[:Length]
|
82
|
+
puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, len=#{len}"
|
83
|
+
|
84
|
+
# Synthesize a TIFF header
|
85
|
+
long_tag = lambda {|tag, count, value| [ tag, 4, count, value ].pack( "ssII" ) }
|
86
|
+
short_tag = lambda {|tag, count, value| [ tag, 3, count, value ].pack( "ssII" ) }
|
87
|
+
# header = byte order, version magic, offset of directory, directory count,
|
88
|
+
# followed by a series of tags containing metadata.
|
89
|
+
tag_count = 10
|
90
|
+
header = [ 73, 73, 42, 8, tag_count ].pack("ccsIs")
|
91
|
+
tiff = header.dup
|
92
|
+
tiff << short_tag.call( 256, 1, w ) # image width
|
93
|
+
tiff << short_tag.call( 257, 1, h ) # image height
|
94
|
+
tiff << long_tag.call( 258, 4, (header.size + (tag_count*12))) # bits per pixel
|
95
|
+
tiff << short_tag.call( 259, 1, 1 ) # compression
|
96
|
+
tiff << short_tag.call( 262, 1, 5 ) # colorspace - separation
|
97
|
+
tiff << long_tag.call( 273, 1, (10 + (tag_count*12) + 16) ) # data offset
|
98
|
+
tiff << short_tag.call( 277, 1, 4 ) # samples per pixel
|
99
|
+
tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
|
100
|
+
tiff << short_tag.call( 284, 1, 1 ) # planer config
|
101
|
+
tiff << long_tag.call( 332, 1, 1) # inkset - CMYK
|
102
|
+
tiff << [bpc, bpc, bpc, bpc].pack("IIII")
|
103
|
+
tiff << stream.unfiltered_data
|
104
|
+
File.open(filename, "wb") { |file| file.write tiff }
|
105
|
+
end
|
106
|
+
|
107
|
+
def save_gray(filename)
|
108
|
+
h = stream.hash[:Height]
|
109
|
+
w = stream.hash[:Width]
|
110
|
+
bpc = stream.hash[:BitsPerComponent]
|
111
|
+
len = stream.hash[:Length]
|
112
|
+
puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, len=#{len}"
|
113
|
+
|
114
|
+
# Synthesize a TIFF header
|
115
|
+
long_tag = lambda {|tag, count, value| [ tag, 4, count, value ].pack( "ssII" ) }
|
116
|
+
short_tag = lambda {|tag, count, value| [ tag, 3, count, value ].pack( "ssII" ) }
|
117
|
+
# header = byte order, version magic, offset of directory, directory count,
|
118
|
+
# followed by a series of tags containing metadata.
|
119
|
+
tag_count = 9
|
120
|
+
header = [ 73, 73, 42, 8, tag_count ].pack("ccsIs")
|
121
|
+
tiff = header.dup
|
122
|
+
tiff << short_tag.call( 256, 1, w ) # image width
|
123
|
+
tiff << short_tag.call( 257, 1, h ) # image height
|
124
|
+
tiff << short_tag.call( 258, 1, 8 ) # bits per pixel
|
125
|
+
tiff << short_tag.call( 259, 1, 1 ) # compression
|
126
|
+
tiff << short_tag.call( 262, 1, 1 ) # colorspace - grayscale
|
127
|
+
tiff << long_tag.call( 273, 1, (10 + (tag_count*12)) ) # data offset
|
128
|
+
tiff << short_tag.call( 277, 1, 1 ) # samples per pixel
|
129
|
+
tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
|
130
|
+
tiff << short_tag.call( 284, 1, 1 ) # planer config
|
131
|
+
tiff << stream.unfiltered_data
|
132
|
+
File.open(filename, "wb") { |file| file.write tiff }
|
41
133
|
end
|
42
|
-
private :increment_count
|
43
134
|
|
135
|
+
def save_rgb(filename)
|
136
|
+
h = stream.hash[:Height]
|
137
|
+
w = stream.hash[:Width]
|
138
|
+
bpc = stream.hash[:BitsPerComponent]
|
139
|
+
len = stream.hash[:Length]
|
140
|
+
puts "#{filename}: h=#{h}, w=#{w}, bpc=#{bpc}, len=#{len}"
|
141
|
+
|
142
|
+
# Synthesize a TIFF header
|
143
|
+
long_tag = lambda {|tag, count, value| [ tag, 4, count, value ].pack( "ssII" ) }
|
144
|
+
short_tag = lambda {|tag, count, value| [ tag, 3, count, value ].pack( "ssII" ) }
|
145
|
+
# header = byte order, version magic, offset of directory, directory count,
|
146
|
+
# followed by a series of tags containing metadata.
|
147
|
+
tag_count = 8
|
148
|
+
header = [ 73, 73, 42, 8, tag_count ].pack("ccsIs")
|
149
|
+
tiff = header.dup
|
150
|
+
tiff << short_tag.call( 256, 1, w ) # image width
|
151
|
+
tiff << short_tag.call( 257, 1, h ) # image height
|
152
|
+
tiff << long_tag.call( 258, 3, (header.size + (tag_count*12))) # bits per pixel
|
153
|
+
tiff << short_tag.call( 259, 1, 1 ) # compression
|
154
|
+
tiff << short_tag.call( 262, 1, 2 ) # colorspace - RGB
|
155
|
+
tiff << long_tag.call( 273, 1, (header.size + (tag_count*12) + 12) ) # data offset
|
156
|
+
tiff << short_tag.call( 277, 1, 3 ) # samples per pixel
|
157
|
+
tiff << long_tag.call( 279, 1, stream.unfiltered_data.size) # data byte size
|
158
|
+
tiff << [bpc, bpc, bpc].pack("III")
|
159
|
+
tiff << stream.unfiltered_data
|
160
|
+
File.open(filename, "wb") { |file| file.write tiff }
|
161
|
+
end
|
44
162
|
end
|
45
163
|
|
46
164
|
class Jpg
|
@@ -104,5 +222,10 @@ module ExtractImages
|
|
104
222
|
end
|
105
223
|
end
|
106
224
|
|
107
|
-
|
108
|
-
|
225
|
+
filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/adobe_sample.pdf"
|
226
|
+
extractor = ExtractImages::Extractor.new
|
227
|
+
|
228
|
+
PDF::Reader.open(filename) do |reader|
|
229
|
+
page = reader.page(1)
|
230
|
+
extractor.page(page)
|
231
|
+
end
|
data/examples/hash.rb
CHANGED
@@ -2,11 +2,11 @@
|
|
2
2
|
# coding: utf-8
|
3
3
|
|
4
4
|
# get direct access to PDF objects
|
5
|
-
#
|
6
|
-
$LOAD_PATH.unshift(File.dirname(__FILE__) + '/../lib')
|
7
5
|
|
8
6
|
require 'pdf/reader'
|
9
7
|
|
10
|
-
filename = File.dirname(__FILE__) + "/../
|
11
|
-
|
12
|
-
|
8
|
+
filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cairo-unicode.pdf"
|
9
|
+
|
10
|
+
reader = PDF::Reader.new(filename)
|
11
|
+
puts reader.objects[3]
|
12
|
+
puts reader.objects[4]
|
data/examples/metadata.rb
CHANGED
@@ -1,25 +1,14 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
|
3
2
|
# coding: utf-8
|
3
|
+
|
4
4
|
# Extract metadata only
|
5
5
|
|
6
6
|
require 'rubygems'
|
7
7
|
require 'pdf/reader'
|
8
8
|
|
9
|
-
|
10
|
-
attr_accessor :regular
|
11
|
-
attr_accessor :xml
|
9
|
+
filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cross_ref_stream.pdf"
|
12
10
|
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
def metadata_xml(data)
|
18
|
-
@xml = data
|
19
|
-
end
|
11
|
+
PDF::Reader.open(filename) do |reader|
|
12
|
+
puts reader.info.inspect
|
13
|
+
puts reader.metadata.inspect
|
20
14
|
end
|
21
|
-
|
22
|
-
receiver = MetaDataReceiver.new
|
23
|
-
pdf = PDF::Reader.file(ARGV.shift, receiver, :pages => false, :metadata => true)
|
24
|
-
puts receiver.regular.inspect
|
25
|
-
puts receiver.xml.inspect
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# A simple app to count the number of pages in a PDF File.
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
require 'pdf/reader'
|
8
|
+
|
9
|
+
filename = File.expand_path(File.dirname(__FILE__)) + "/../spec/data/cross_ref_stream.pdf"
|
10
|
+
|
11
|
+
PDF::Reader.open(filename) do |reader|
|
12
|
+
puts "#{reader.page_count} page(s)"
|
13
|
+
end
|
data/examples/rspec.rb
CHANGED
@@ -2,56 +2,32 @@
|
|
2
2
|
# coding: utf-8
|
3
3
|
|
4
4
|
# Basic RSpec of a generated PDF
|
5
|
+
#
|
6
|
+
# USAGE: rspec -c examples/rspec.rb
|
5
7
|
|
6
8
|
require 'rubygems'
|
7
9
|
require 'pdf/reader'
|
8
|
-
require '
|
9
|
-
require '
|
10
|
+
require 'rspec'
|
11
|
+
require 'prawn'
|
12
|
+
require 'stringio'
|
10
13
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
def initialize
|
15
|
-
@content = []
|
16
|
-
end
|
17
|
-
|
18
|
-
# Called when page parsing starts
|
19
|
-
def begin_page(arg = nil)
|
20
|
-
@content << ""
|
21
|
-
end
|
22
|
-
|
23
|
-
def show_text(string, *params)
|
24
|
-
@content.last << string.strip
|
25
|
-
end
|
26
|
-
|
27
|
-
# there's a few text callbacks, so make sure we process them all
|
28
|
-
alias :super_show_text :show_text
|
29
|
-
alias :move_to_next_line_and_show_text :show_text
|
30
|
-
alias :set_spacing_next_line_show_text :show_text
|
31
|
-
|
32
|
-
def show_text_with_positioning(*params)
|
33
|
-
params = params.first
|
34
|
-
params.each { |str| show_text(str) if str.kind_of?(String)}
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
context "My generated PDF" do
|
39
|
-
specify "should have the correct text on 2 pages" do
|
14
|
+
describe "My generated PDF" do
|
15
|
+
it "should have the correct text on 2 pages" do
|
40
16
|
|
41
17
|
# generate our PDF
|
42
|
-
pdf =
|
43
|
-
pdf.text "Chunky"
|
18
|
+
pdf = Prawn::Document.new
|
19
|
+
pdf.text "Chunky"
|
44
20
|
pdf.start_new_page
|
45
|
-
pdf.text "Bacon"
|
46
|
-
|
21
|
+
pdf.text "Bacon"
|
22
|
+
io = StringIO.new(pdf.render)
|
47
23
|
|
48
24
|
# process the PDF
|
49
|
-
|
50
|
-
|
25
|
+
PDF::Reader.open(io) do |reader|
|
26
|
+
reader.page_count.should eql(2) # correct page count
|
27
|
+
|
28
|
+
reader.page(1).text.should eql("Chunky") # correct content
|
29
|
+
reader.page(2).text.should eql("Bacon") # correct content
|
30
|
+
end
|
51
31
|
|
52
|
-
# confirm the text appears on the correct pages
|
53
|
-
receiver.content.size.should eql(2)
|
54
|
-
receiver.content[0].should eql("Chunky")
|
55
|
-
receiver.content[1].should eql("Bacon")
|
56
32
|
end
|
57
33
|
end
|