fireinc-pdf-reader 0.11.0.alpha
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +168 -0
- data/MIT-LICENSE +21 -0
- data/README.rdoc +137 -0
- data/Rakefile +34 -0
- data/TODO +45 -0
- data/bin/pdf_list_callbacks +15 -0
- data/bin/pdf_object +48 -0
- data/bin/pdf_text +15 -0
- data/examples/callbacks.rb +21 -0
- data/examples/extract_bates.rb +49 -0
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +12 -0
- data/examples/metadata.rb +25 -0
- data/examples/page_counter_improved.rb +23 -0
- data/examples/page_counter_naive.rb +24 -0
- data/examples/rspec.rb +57 -0
- data/examples/text.rb +40 -0
- data/examples/version.rb +25 -0
- data/lib/pdf/hash.rb +15 -0
- data/lib/pdf/reader/abstract_strategy.rb +81 -0
- data/lib/pdf/reader/buffer.rb +346 -0
- data/lib/pdf/reader/cmap.rb +138 -0
- data/lib/pdf/reader/encoding.rb +190 -0
- data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
- data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
- data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
- data/lib/pdf/reader/encodings/standard.txt +47 -0
- data/lib/pdf/reader/encodings/symbol.txt +154 -0
- data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
- data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
- data/lib/pdf/reader/error.rb +53 -0
- data/lib/pdf/reader/filter.rb +219 -0
- data/lib/pdf/reader/font.rb +133 -0
- data/lib/pdf/reader/form_xobject.rb +83 -0
- data/lib/pdf/reader/glyphlist.txt +4322 -0
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +56 -0
- data/lib/pdf/reader/object_cache.rb +85 -0
- data/lib/pdf/reader/object_hash.rb +289 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/page.rb +185 -0
- data/lib/pdf/reader/page_text_receiver.rb +278 -0
- data/lib/pdf/reader/pages_strategy.rb +475 -0
- data/lib/pdf/reader/parser.rb +225 -0
- data/lib/pdf/reader/print_receiver.rb +18 -0
- data/lib/pdf/reader/reference.rb +66 -0
- data/lib/pdf/reader/register_receiver.rb +95 -0
- data/lib/pdf/reader/stream.rb +69 -0
- data/lib/pdf/reader/text_receiver.rb +264 -0
- data/lib/pdf/reader/token.rb +41 -0
- data/lib/pdf/reader/xref.rb +220 -0
- data/lib/pdf/reader.rb +296 -0
- data/lib/pdf-reader.rb +1 -0
- metadata +211 -0
@@ -0,0 +1,133 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2008 James Healy (jimmy@deefa.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
|
26
|
+
module PDF
  class Reader
    # Represents a single font referenced by a PDF document and exposes the
    # attributes read from its font dictionary (subtype, base font, encoding,
    # widths, descriptor metrics and any descendant fonts).
    #
    # The font also knows how to convert strings that were drawn with it into
    # UTF-8 (see #to_utf8) and how to look up the width of a glyph (see
    # #glyph_width).
    class Font
      attr_accessor :label, :subtype, :encoding, :descendantfonts, :tounicode
      attr_reader :widths, :first_char, :ascent, :descent, :missing_width, :bbox
      attr_reader :basefont

      # Builds a font from its dictionary.
      #
      # ohash - object used to resolve references; it only needs to respond
      #         to #object (presumably a PDF::Reader::ObjectHash - confirm
      #         against callers).
      # obj   - the font dictionary, indexed with symbols such as :Subtype.
      #
      # Calling with fewer than two arguments is deprecated: a warning is
      # written to $stderr and the font is left with every attribute unset.
      def initialize(ohash = nil, obj = nil)
        if ohash.nil? || obj.nil?
          $stderr.puts "DEPRECATION WARNING - PDF::Reader::Font.new should be called with 2 args"
          return
        end
        @ohash = ohash

        extract_base_info(obj)
        extract_descriptor(obj)
        extract_descendants(obj)
      end

      # returns a hash that maps glyph names to unicode codepoints. The mapping is based on
      # a text file supplied by Adobe at:
      # http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
      #
      # Keys are symbols, values are integers. The glyphlist.txt file that sits
      # next to this source file is re-read on every call. Lines that do not
      # look like "name;XXXX" are skipped.
      def self.glyphnames
        glyphs = {}

        # Ruby 1.9+ needs an explicit binary mode so no transcoding is attempted
        mode = RUBY_VERSION >= "1.9" ? "r:BINARY" : "r"
        path = File.join(File.dirname(__FILE__), "glyphlist.txt")

        File.open(path, mode) do |f|
          f.each do |line|
            match = line.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
            glyphs[match[1].to_sym] = match[2].hex if match
          end
        end

        glyphs
      end

      # Sets the base font name and, as a side effect, resets the encoding.
      #
      # "Symbol" and "ZapfDingbats" get their matching built-in encoding; any
      # other font gets no encoding (nil). The encoding can always be
      # overridden afterwards with encoding= if required.
      def basefont=(font)
        @encoding = case font
                    when "Symbol"       then PDF::Reader::Encoding.new("SymbolEncoding")
                    when "ZapfDingbats" then PDF::Reader::Encoding.new("ZapfDingbatsEncoding")
                    end
        @basefont = font
      end

      # Converts text drawn with this font into UTF-8.
      #
      # params - a String is translated, an Array is translated element by
      #          element (recursively), anything else is returned untouched.
      #
      # Raises UnsupportedFeatureError when the font's encoding is still a
      # plain String rather than an encoding object.
      def to_utf8(params)
        if encoding.kind_of?(String)
          raise UnsupportedFeatureError, "font encoding '#{encoding}' currently unsupported"
        end

        # case/when matches subclasses of String and Array as well, so they
        # are translated rather than silently passed through.
        case params
        when String
          # translate the bytestream into a UTF-8 string.
          # If an encoding hasn't been specified, assume the text using this
          # font is in Adobe Standard Encoding.
          (encoding || PDF::Reader::Encoding.new(:StandardEncoding)).to_utf8(params, tounicode)
        when Array
          params.map { |param| to_utf8(param) }
        else
          params
        end
      end

      # Returns the width recorded for the first character of +c+.
      #
      # The widths table starts at first_char, so the character code is
      # shifted by that amount before the lookup. Returns 0 when the font has
      # no widths table or +c+ is empty, and the missing width (0 when the
      # font did not supply one) when the code falls outside the table.
      def glyph_width(c)
        @missing_width ||= 0
        return 0 if @widths.nil?

        code = c.codepoints.first
        return 0 if code.nil? # empty string, there is no glyph to measure

        # NOTE(review): a widths table without a first_char is treated as
        # starting at code 0 - confirm this matches the intended handling of
        # malformed fonts.
        index = code - (@first_char || 0)

        # Array#fetch counts negative indexes from the end of the array, so a
        # code below first_char must be rejected before the lookup.
        return @missing_width if index < 0

        @widths.fetch(index, @missing_width)
      end

      private

      # Reads the top-level entries of the font dictionary, resolving each
      # value through the object hash. A :ToUnicode stream, when present, is
      # parsed into a CMap.
      def extract_base_info(obj)
        @subtype    = @ohash.object(obj[:Subtype])
        @basefont   = @ohash.object(obj[:BaseFont])
        @encoding   = PDF::Reader::Encoding.new(@ohash.object(obj[:Encoding]))
        @widths     = @ohash.object(obj[:Widths])
        @first_char = @ohash.object(obj[:FirstChar])
        if obj[:ToUnicode]
          stream = @ohash.object(obj[:ToUnicode])
          @tounicode = PDF::Reader::CMap.new(stream.unfiltered_data)
        end
      end

      # Reads the metrics stored in the font descriptor, if the font has one.
      def extract_descriptor(obj)
        return unless obj[:FontDescriptor]

        fd = @ohash.object(obj[:FontDescriptor])
        @ascent        = @ohash.object(fd[:Ascent])
        @descent       = @ohash.object(fd[:Descent])
        @missing_width = @ohash.object(fd[:MissingWidth])
        @bbox          = @ohash.object(fd[:FontBBox])
      end

      # Builds a Font for every entry of :DescendantFonts, if present.
      def extract_descendants(obj)
        return unless obj[:DescendantFonts]

        descendants = @ohash.object(obj[:DescendantFonts])
        @descendantfonts = descendants.map { |desc|
          PDF::Reader::Font.new(@ohash, @ohash.object(desc))
        }
      end

    end
  end
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module PDF
  class Reader

    # High level representation of a single PDF form xobject. Form xobjects
    # are contained pieces of content that can be inserted onto multiple
    # pages. They're generally used as a space efficient way to store
    # repetitive content (like logos, header, footers, etc).
    #
    # This behaves and looks much like a limited PDF::Reader::Page class.
    #
    class FormXObject

      # page    - the page the form belongs to; only page.objects is read here.
      # xobject - the form xobject, or a reference to it (it is dereferenced
      #           through page.objects).
      def initialize(page, xobject)
        @page    = page
        @objects = page.objects
        @xobject = @objects.deref(xobject)
      end

      # Returns the resources that accompany this form, or an empty hash
      # when the form has no :Resources entry. The result is memoized.
      #
      def resources
        @resources ||= @objects.deref(@xobject.hash[:Resources]) || {}
      end

      # return a hash of fonts used on this form.
      #
      # The keys are the font labels used within the form content stream.
      #
      # The values are PDF::Reader::Font instances that provide access
      # to most available metrics for each font.
      #
      # A new hash (and new Font instances) is built on every call.
      #
      def fonts
        raw_fonts = @objects.deref(resources[:Font] || {})
        # NOTE(review): the leading :: presumably avoids picking up the gem's
        # own PDF::Hash instead of the core Hash class - confirm.
        ::Hash[raw_fonts.map { |label, font|
          [label, PDF::Reader::Font.new(@objects, @objects.deref(font))]
        }]
      end

      # processes the raw content stream for this form in sequential order and
      # passes callbacks to the receiver objects.
      #
      # See the comments on PDF::Reader::Page#walk for more detail.
      #
      def walk(*receivers)
        content_stream(receivers, raw_content)
      end

      # returns the raw content stream for this form. This is plumbing, nothing to
      # see here unless you're a PDF nerd like me.
      #
      def raw_content
        @xobject.unfiltered_data
      end

      private

      # Invokes +name+ with +params+ on every receiver that responds to it;
      # receivers that do not implement the callback are skipped.
      def callback(receivers, name, params=[])
        receivers.each do |receiver|
          receiver.send(name, *params) if receiver.respond_to?(name)
        end
      end

      # Tokenises +instructions+ and fires one callback per operator.
      #
      # Operands are collected until an operator token arrives; the operator
      # is then translated to a callback name via PagesStrategy::OPERATORS and
      # dispatched with the collected operands, which are cleared afterwards.
      #
      # Raises MalformedPDFError if the parser hits the end of the stream
      # unexpectedly.
      def content_stream(receivers, instructions)
        buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
        parser = Parser.new(buffer, @objects)
        params = []

        while (token = parser.parse_token(PagesStrategy::OPERATORS))
          if token.kind_of?(Token) && PagesStrategy::OPERATORS.has_key?(token)
            callback(receivers, PagesStrategy::OPERATORS[token], params)
            params.clear
          else
            params << token
          end
        end
      rescue EOFError
        raise MalformedPDFError, "End Of File while processing a content stream"
      end
    end
  end
end
|