fireinc-pdf-reader 0.11.0.alpha
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +168 -0
- data/MIT-LICENSE +21 -0
- data/README.rdoc +137 -0
- data/Rakefile +34 -0
- data/TODO +45 -0
- data/bin/pdf_list_callbacks +15 -0
- data/bin/pdf_object +48 -0
- data/bin/pdf_text +15 -0
- data/examples/callbacks.rb +21 -0
- data/examples/extract_bates.rb +49 -0
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +12 -0
- data/examples/metadata.rb +25 -0
- data/examples/page_counter_improved.rb +23 -0
- data/examples/page_counter_naive.rb +24 -0
- data/examples/rspec.rb +57 -0
- data/examples/text.rb +40 -0
- data/examples/version.rb +25 -0
- data/lib/pdf/hash.rb +15 -0
- data/lib/pdf/reader/abstract_strategy.rb +81 -0
- data/lib/pdf/reader/buffer.rb +346 -0
- data/lib/pdf/reader/cmap.rb +138 -0
- data/lib/pdf/reader/encoding.rb +190 -0
- data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
- data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
- data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
- data/lib/pdf/reader/encodings/standard.txt +47 -0
- data/lib/pdf/reader/encodings/symbol.txt +154 -0
- data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
- data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
- data/lib/pdf/reader/error.rb +53 -0
- data/lib/pdf/reader/filter.rb +219 -0
- data/lib/pdf/reader/font.rb +133 -0
- data/lib/pdf/reader/form_xobject.rb +83 -0
- data/lib/pdf/reader/glyphlist.txt +4322 -0
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +56 -0
- data/lib/pdf/reader/object_cache.rb +85 -0
- data/lib/pdf/reader/object_hash.rb +289 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/page.rb +185 -0
- data/lib/pdf/reader/page_text_receiver.rb +278 -0
- data/lib/pdf/reader/pages_strategy.rb +475 -0
- data/lib/pdf/reader/parser.rb +225 -0
- data/lib/pdf/reader/print_receiver.rb +18 -0
- data/lib/pdf/reader/reference.rb +66 -0
- data/lib/pdf/reader/register_receiver.rb +95 -0
- data/lib/pdf/reader/stream.rb +69 -0
- data/lib/pdf/reader/text_receiver.rb +264 -0
- data/lib/pdf/reader/token.rb +41 -0
- data/lib/pdf/reader/xref.rb +220 -0
- data/lib/pdf/reader.rb +296 -0
- data/lib/pdf-reader.rb +1 -0
- metadata +211 -0
@@ -0,0 +1,133 @@
|
|
1
|
+
################################################################################
|
2
|
+
#
|
3
|
+
# Copyright (C) 2008 James Healy (jimmy@deefa.com)
|
4
|
+
#
|
5
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
# a copy of this software and associated documentation files (the
|
7
|
+
# "Software"), to deal in the Software without restriction, including
|
8
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
# the following conditions:
|
12
|
+
#
|
13
|
+
# The above copyright notice and this permission notice shall be
|
14
|
+
# included in all copies or substantial portions of the Software.
|
15
|
+
#
|
16
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
23
|
+
#
|
24
|
+
################################################################################
|
25
|
+
|
26
|
+
class PDF::Reader
|
27
|
+
class Font
|
28
|
+
attr_accessor :label, :subtype, :encoding, :descendantfonts, :tounicode
|
29
|
+
attr_reader :widths, :first_char, :ascent, :descent, :missing_width, :bbox
|
30
|
+
attr_reader :basefont
|
31
|
+
|
32
|
+
# Builds a font from a raw PDF font dictionary.
#
# ohash - the object used to resolve values; it must respond to #object
# obj   - the font dictionary to read from
#
# Calling this with either argument missing is deprecated: a warning is
# written to $stderr and the font is left unpopulated.
def initialize(ohash = nil, obj = nil)
  if ohash.nil? || obj.nil?
    $stderr.puts "DEPRECATION WARNING - PDF::Reader::Font.new should be called with 2 args"
    return
  end

  @ohash = ohash

  extract_base_info(obj)
  extract_descriptor(obj)
  extract_descendants(obj)
end
|
43
|
+
|
44
|
+
# returns a hash that maps glyph names to unicode codepoints. The mapping is based on
|
45
|
+
# a text file supplied by Adobe at:
|
46
|
+
# http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
|
47
|
+
def self.glyphnames
  # The result maps each glyph name (a Symbol) to its codepoint (an Integer).
  # glyphlist.txt is read from the directory of this source file on every call.
  #
  # On Ruby 1.9+ the file is opened with a binary external encoding.
  mode = RUBY_VERSION >= "1.9" ? "r:BINARY" : "r"
  glyphs = {}

  File.open(File.join(File.dirname(__FILE__), "glyphlist.txt"), mode) do |file|
    file.each do |line|
      entry = line.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
      # skip any line that doesn't look like a "name;XXXX" entry
      next unless entry

      glyphs[entry[1].to_sym] = entry[2].hex
    end
  end

  glyphs
end
|
60
|
+
|
61
|
+
# Records the base font name and resets the encoding to that font's default.
def basefont=(font)
  # "Symbol" and "ZapfDingbats" each get a matching default encoding; every
  # other font gets none. The choice can still be replaced afterwards with
  # encoding= if required.
  @encoding =
    if font == "Symbol"
      PDF::Reader::Encoding.new("SymbolEncoding")
    elsif font == "ZapfDingbats"
      PDF::Reader::Encoding.new("ZapfDingbatsEncoding")
    end

  @basefont = font
end
|
74
|
+
|
75
|
+
# Converts text drawn with this font into UTF-8.
#
# params - a String of raw bytes, an Array (converted element by element,
#          recursively), or any other object (returned untouched).
#
# Raises UnsupportedFeatureError when the font's encoding is a plain String
# rather than an encoding object.
def to_utf8(params)
  raise UnsupportedFeatureError, "font encoding '#{encoding}' currently unsupported" if encoding.kind_of?(String)

  # case/when matches subclasses too, so String and Array subclasses are
  # converted instead of being passed through unchanged.
  case params
  when String
    # translate the bytestream into a UTF-8 string.
    # If an encoding hasn't been specified, assume the text using this
    # font is in Adobe Standard Encoding.
    (encoding || PDF::Reader::Encoding.new(:StandardEncoding)).to_utf8(params, tounicode)
  when Array
    params.map { |param| to_utf8(param) }
  else
    params
  end
end
|
89
|
+
|
90
|
+
# Returns the width recorded for the first character of +c+.
#
# c - a String; only its first codepoint is looked at.
#
# Returns 0 when the font has no widths table or +c+ is empty, and the
# font's missing width (0 if none was given) when the character falls
# outside the widths table.
def glyph_width(c)
  @missing_width ||= 0
  return 0 if @widths.nil?

  codepoint = c.codepoints.first
  return 0 if codepoint.nil?

  index = codepoint - @first_char
  # Array#fetch counts a negative index from the end of the array, so a
  # character below first_char has to be rejected here; otherwise it would
  # silently pick up the width of an unrelated glyph.
  return @missing_width if index < 0

  @widths.fetch(index, @missing_width)
end
|
98
|
+
|
99
|
+
private
|
100
|
+
|
101
|
+
# Copies the basic attributes out of the font dictionary +obj+, resolving
# each value through @ohash. A CMap is only built when the dictionary has a
# :ToUnicode entry.
def extract_base_info(obj)
  resolve = lambda { |key| @ohash.object(obj[key]) }

  @subtype    = resolve.call(:Subtype)
  @basefont   = resolve.call(:BaseFont)
  @encoding   = PDF::Reader::Encoding.new(resolve.call(:Encoding))
  @widths     = resolve.call(:Widths)
  @first_char = resolve.call(:FirstChar)

  return unless obj[:ToUnicode]

  @tounicode = PDF::Reader::CMap.new(resolve.call(:ToUnicode).unfiltered_data)
end
|
112
|
+
|
113
|
+
# Reads the ascent, descent, missing width and bounding box from the
# :FontDescriptor entry of +obj+. Nothing is touched when the entry is absent.
def extract_descriptor(obj)
  if obj[:FontDescriptor]
    descriptor = @ohash.object(obj[:FontDescriptor])
    lookup = lambda { |key| @ohash.object(descriptor[key]) }

    @ascent        = lookup.call(:Ascent)
    @descent       = lookup.call(:Descent)
    @missing_width = lookup.call(:MissingWidth)
    @bbox          = lookup.call(:FontBBox)
  end
end
|
122
|
+
|
123
|
+
# Builds a PDF::Reader::Font for every entry listed under :DescendantFonts
# in +obj+. Nothing is touched when the entry is absent.
def extract_descendants(obj)
  if obj[:DescendantFonts]
    entries = @ohash.object(obj[:DescendantFonts])

    @descendantfonts = entries.map do |entry|
      PDF::Reader::Font.new(@ohash, @ohash.object(entry))
    end
  end
end
|
131
|
+
|
132
|
+
end
|
133
|
+
end
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module PDF
|
4
|
+
class Reader
|
5
|
+
|
6
|
+
# High level representation of a single PDF form xobject. Form xobjects
|
7
|
+
# are contained pieces of content that can be inserted onto multiple
|
8
|
+
# pages. They're generally used as a space efficient way to store
|
9
|
+
# repetitive content (like logos, headers, footers, etc).
|
10
|
+
#
|
11
|
+
# This behaves and looks much like a limited PDF::Reader::Page class.
|
12
|
+
#
|
13
|
+
class FormXObject
|
14
|
+
|
15
|
+
def initialize(page, xobject)
  @page = page

  # everything is resolved through the object store of the owning page
  object_store = page.objects
  @objects = object_store
  @xobject = object_store.deref(xobject)
end
|
20
|
+
|
21
|
+
# Returns the resources that accompany this form.
|
22
|
+
#
|
23
|
+
def resources
  # memoised: a truthy result from an earlier call is reused as is
  return @resources if @resources

  resolved = @objects.deref(@xobject.hash[:Resources])
  # fall back to an empty hash when nothing could be resolved
  @resources = resolved || {}
end
|
26
|
+
|
27
|
+
# return a hash of fonts used on this form.
|
28
|
+
#
|
29
|
+
# The keys are the font labels used within the form content stream.
|
30
|
+
#
|
31
|
+
# The values are PDF::Reader::Font instances that provide access
|
32
|
+
# to most available metrics for each font.
|
33
|
+
#
|
34
|
+
def fonts
  # a form without a :Font resource entry simply yields an empty result
  font_table = @objects.deref(resources[:Font] || {})

  by_label = {}
  font_table.each do |label, font|
    by_label[label] = PDF::Reader::Font.new(@objects, @objects.deref(font))
  end
  by_label
end
|
40
|
+
|
41
|
+
# processes the raw content stream for this form in sequential order and
|
42
|
+
# passes callbacks to the receiver objects.
|
43
|
+
#
|
44
|
+
# See the comments on PDF::Reader::Page#walk for more detail.
|
45
|
+
#
|
46
|
+
def walk(*receivers)
  # fetch the form's raw instructions, then replay them to every receiver
  instructions = raw_content
  content_stream(receivers, instructions)
end
|
49
|
+
|
50
|
+
# returns the raw content stream for this form. This is plumbing, nothing to
|
51
|
+
# see here unless you're a PDF nerd like me.
|
52
|
+
#
|
53
|
+
def raw_content
  # the data comes straight from the underlying xobject stream
  stream = @xobject
  stream.unfiltered_data
end
|
56
|
+
|
57
|
+
private
|
58
|
+
|
59
|
+
# Invokes +name+ with +params+ on each receiver that responds to it;
# receivers that do not respond are skipped.
def callback(receivers, name, params=[])
  receivers.each do |target|
    next unless target.respond_to?(name)

    target.send(name, *params)
  end
end
|
64
|
+
|
65
|
+
# Parses +instructions+ token by token and replays them to +receivers+.
#
# Tokens are collected as parameters until an operator listed in
# PagesStrategy::OPERATORS shows up; the matching callback is then fired with
# the collected parameters and collection starts over.
#
# Raises MalformedPDFError if an EOFError surfaces while processing.
def content_stream(receivers, instructions)
  buffer = Buffer.new(StringIO.new(instructions), :content_stream => true)
  parser = Parser.new(buffer, @objects)
  params = []

  while (token = parser.parse_token(PagesStrategy::OPERATORS))
    if token.kind_of?(Token) && PagesStrategy::OPERATORS.has_key?(token)
      callback(receivers, PagesStrategy::OPERATORS[token], params)
      params.clear
    else
      params << token
    end
  end
rescue EOFError
  raise MalformedPDFError, "End Of File while processing a content stream"
end
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|