fireinc-pdf-reader 0.11.0.alpha

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. data/CHANGELOG +168 -0
  2. data/MIT-LICENSE +21 -0
  3. data/README.rdoc +137 -0
  4. data/Rakefile +34 -0
  5. data/TODO +45 -0
  6. data/bin/pdf_list_callbacks +15 -0
  7. data/bin/pdf_object +48 -0
  8. data/bin/pdf_text +15 -0
  9. data/examples/callbacks.rb +21 -0
  10. data/examples/extract_bates.rb +49 -0
  11. data/examples/extract_images.rb +108 -0
  12. data/examples/hash.rb +12 -0
  13. data/examples/metadata.rb +25 -0
  14. data/examples/page_counter_improved.rb +23 -0
  15. data/examples/page_counter_naive.rb +24 -0
  16. data/examples/rspec.rb +57 -0
  17. data/examples/text.rb +40 -0
  18. data/examples/version.rb +25 -0
  19. data/lib/pdf/hash.rb +15 -0
  20. data/lib/pdf/reader/abstract_strategy.rb +81 -0
  21. data/lib/pdf/reader/buffer.rb +346 -0
  22. data/lib/pdf/reader/cmap.rb +138 -0
  23. data/lib/pdf/reader/encoding.rb +190 -0
  24. data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
  25. data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
  26. data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
  27. data/lib/pdf/reader/encodings/standard.txt +47 -0
  28. data/lib/pdf/reader/encodings/symbol.txt +154 -0
  29. data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
  30. data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
  31. data/lib/pdf/reader/error.rb +53 -0
  32. data/lib/pdf/reader/filter.rb +219 -0
  33. data/lib/pdf/reader/font.rb +133 -0
  34. data/lib/pdf/reader/form_xobject.rb +83 -0
  35. data/lib/pdf/reader/glyphlist.txt +4322 -0
  36. data/lib/pdf/reader/lzw.rb +123 -0
  37. data/lib/pdf/reader/metadata_strategy.rb +56 -0
  38. data/lib/pdf/reader/object_cache.rb +85 -0
  39. data/lib/pdf/reader/object_hash.rb +289 -0
  40. data/lib/pdf/reader/object_stream.rb +51 -0
  41. data/lib/pdf/reader/page.rb +185 -0
  42. data/lib/pdf/reader/page_text_receiver.rb +278 -0
  43. data/lib/pdf/reader/pages_strategy.rb +475 -0
  44. data/lib/pdf/reader/parser.rb +225 -0
  45. data/lib/pdf/reader/print_receiver.rb +18 -0
  46. data/lib/pdf/reader/reference.rb +66 -0
  47. data/lib/pdf/reader/register_receiver.rb +95 -0
  48. data/lib/pdf/reader/stream.rb +69 -0
  49. data/lib/pdf/reader/text_receiver.rb +264 -0
  50. data/lib/pdf/reader/token.rb +41 -0
  51. data/lib/pdf/reader/xref.rb +220 -0
  52. data/lib/pdf/reader.rb +296 -0
  53. data/lib/pdf-reader.rb +1 -0
  54. metadata +211 -0
@@ -0,0 +1,133 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2008 James Healy (jimmy@deefa.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+
26
+ class PDF::Reader
27
+ class Font
28
+ attr_accessor :label, :subtype, :encoding, :descendantfonts, :tounicode
29
+ attr_reader :widths, :first_char, :ascent, :descent, :missing_width, :bbox
30
+ attr_reader :basefont
31
+
32
# Builds a font from its PDF font dictionary.
#
# ohash - the object store; it is kept in @ohash and its #object method is
#         used by the extract_* helpers to resolve dictionary values.
# obj   - the font dictionary to read from.
#
# Both arguments default to nil so the legacy zero-argument form still works:
# in that case a deprecation warning is written to $stderr and the instance
# is left unpopulated.
def initialize(ohash = nil, obj = nil)
  if ohash.nil? || obj.nil?
    $stderr.puts "DEPRECATION WARNING - PDF::Reader::Font.new should be called with 2 args"
    return
  end

  @ohash = ohash

  extract_base_info(obj)
  extract_descriptor(obj)
  extract_descendants(obj)
end
43
+
44
# Builds a Hash that maps glyph names (as Symbols) to unicode codepoints (as
# Integers). The data comes from the glyphlist.txt file that sits in the same
# directory as this source file, which is based on a text file supplied by
# Adobe at:
#   http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
#
# Lines that contain no "name;XXXX" pair (XXXX being four uppercase hex
# digits) are skipped. The file is read again on every call.
def self.glyphnames
  # on ruby 1.9+ ask for the file to be opened as raw bytes
  mode = RUBY_VERSION >= "1.9" ? "r:BINARY" : "r"
  path = File.dirname(__FILE__) + "/glyphlist.txt"
  mapping = {}

  File.open(path, mode) do |io|
    io.each do |line|
      entry = line.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
      next if entry.nil?

      mapping[entry[1].to_sym] = "0x#{entry[2]}".hex
    end
  end

  mapping
end
60
+
61
# Stores the base font name and resets @encoding to the default for that
# font: a dedicated encoding for "Symbol" and "ZapfDingbats", nil for
# anything else. The encoding can still be replaced afterwards with
# encoding= if required.
def basefont=(font)
  @encoding = case font
              when "Symbol"       then PDF::Reader::Encoding.new("SymbolEncoding")
              when "ZapfDingbats" then PDF::Reader::Encoding.new("ZapfDingbatsEncoding")
              end
  @basefont = font
end
74
+
75
# Converts text drawn with this font into UTF-8.
#
# params - a String is translated via the font's encoding (together with the
#          tounicode map); an Array is converted element by element,
#          recursively; any other object is handed back untouched.
#
# Raises UnsupportedFeatureError when the font's encoding is still a plain
# String.
def to_utf8(params)
  if encoding.kind_of?(String)
    raise UnsupportedFeatureError, "font encoding '#{encoding}' currently unsupported"
  end

  kind = params.class
  return params.collect { |item| to_utf8(item) } if kind == Array
  return params unless kind == String

  # translate the bytestream into a UTF-8 string. When no encoding has been
  # specified, assume the text using this font is in Adobe Standard Encoding.
  translator = encoding || PDF::Reader::Encoding.new(:StandardEncoding)
  translator.to_utf8(params, tounicode)
end
89
+
90
# Looks up the width recorded for the first character of a string.
#
# c - a String; only its first codepoint is considered.
#
# Returns 0 when the font has no widths array. Otherwise returns the entry of
# @widths at (codepoint - @first_char), or @missing_width (which is defaulted
# to 0 here if unset) when that position is outside the array or c is empty.
def glyph_width(c)
  @missing_width ||= 0
  return 0 if @widths.nil?

  codepoint = c.codepoints.first
  return @missing_width if codepoint.nil?

  index = codepoint - @first_char
  # Array#fetch counts a negative index back from the end of the array, so a
  # character below @first_char would otherwise pick up the width of some
  # unrelated glyph instead of the missing width.
  return @missing_width if index < 0

  @widths.fetch(index, @missing_width)
end
98
+
99
+ private
100
+
101
# Copies the basic entries of the font dictionary into instance variables.
#
# Each value is resolved through @ohash.object first. :Encoding is wrapped in
# a PDF::Reader::Encoding; when a :ToUnicode entry is present, the resolved
# stream's unfiltered data is wrapped in a PDF::Reader::CMap.
def extract_base_info(obj)
  resolve = lambda { |key| @ohash.object(obj[key]) }

  @subtype    = resolve.call(:Subtype)
  @basefont   = resolve.call(:BaseFont)
  @encoding   = PDF::Reader::Encoding.new(resolve.call(:Encoding))
  @widths     = resolve.call(:Widths)
  @first_char = resolve.call(:FirstChar)

  return unless obj[:ToUnicode]

  cmap_stream = resolve.call(:ToUnicode)
  @tounicode = PDF::Reader::CMap.new(cmap_stream.unfiltered_data)
end
112
+
113
# Reads ascent, descent, missing width and bounding box from the font
# descriptor, resolving the descriptor and each of its values through
# @ohash.object. Does nothing when the dictionary has no :FontDescriptor.
def extract_descriptor(obj)
  if obj[:FontDescriptor]
    descriptor = @ohash.object(obj[:FontDescriptor])
    metric = lambda { |key| @ohash.object(descriptor[key]) }

    @ascent        = metric.call(:Ascent)
    @descent       = metric.call(:Descent)
    @missing_width = metric.call(:MissingWidth)
    @bbox          = metric.call(:FontBBox)
  end
end
122
+
123
# Builds a PDF::Reader::Font for every entry of :DescendantFonts and stores
# the list in @descendantfonts. The list itself and each entry are resolved
# through @ohash.object. Does nothing when the dictionary has no
# :DescendantFonts.
def extract_descendants(obj)
  if obj[:DescendantFonts]
    children = @ohash.object(obj[:DescendantFonts])
    @descendantfonts = children.map do |child|
      PDF::Reader::Font.new(@ohash, @ohash.object(child))
    end
  end
end
131
+
132
+ end
133
+ end
@@ -0,0 +1,83 @@
1
+ # coding: utf-8
2
+
3
+ module PDF
4
+ class Reader
5
+
6
+ # High level representation of a single PDF form xobject. Form xobjects
7
+ # are contained pieces of content that can be inserted onto multiple
8
+ # pages. They're generally used as a space efficient way to store
9
+ # repetitive content (like logos, headers, footers, etc).
10
+ #
11
+ # This behaves and looks much like a limited PDF::Reader::Page class.
12
+ #
13
+ class FormXObject
14
+
15
# Wraps a form xobject that is used on the given page.
#
# page    - the owning page; its #objects collection is kept in @objects and
#           used for every dereference made by this form.
# xobject - the form xobject; it is passed through @objects.deref before
#           being stored.
def initialize(page, xobject)
  @page, @objects = page, page.objects
  @xobject = @objects.deref(xobject)
end
20
+
21
# Returns the resources that accompany this form, read from the :Resources
# entry of the xobject's hash and dereferenced. Falls back to an empty Hash
# when there is none. The result is memoised in @resources.
#
def resources
  return @resources if @resources

  found = @objects.deref(@xobject.hash[:Resources])
  @resources = found || {}
end
26
+
27
# Returns a Hash of the fonts used on this form.
#
# The keys are the font labels found under :Font in the form's resources.
#
# The values are PDF::Reader::Font instances, each built from the
# dereferenced font object. A new Hash is built on every call.
#
def fonts
  font_table = @objects.deref(resources[:Font] || {})
  font_table.inject({}) do |by_label, (label, font)|
    by_label[label] = PDF::Reader::Font.new(@objects, @objects.deref(font))
    by_label
  end
end
40
+
41
# Runs through the raw content stream of this form in sequential order,
# passing callbacks to each of the given receiver objects.
#
# See the comments on PDF::Reader::Page#walk for more detail.
#
def walk(*receivers)
  instructions = raw_content
  content_stream(receivers, instructions)
end
49
+
50
# Returns the raw content stream for this form, i.e. the unfiltered data of
# the underlying xobject. This is plumbing; most callers will want #walk.
#
def raw_content
  form_stream = @xobject
  form_stream.unfiltered_data
end
56
+
57
+ private
58
+
59
# Sends the message `name` with `params` as its arguments to every receiver
# that responds to it. Receivers that do not respond are silently skipped.
def callback(receivers, name, params=[])
  receivers.each do |target|
    next unless target.respond_to?(name)

    target.send(name, *params)
  end
end
64
+
65
# Parses a content stream and turns each operator into a callback.
#
# receivers    - the objects to notify through #callback.
# instructions - the content stream as a String.
#
# Tokens are collected as operands until a Token that is a key of
# PagesStrategy::OPERATORS turns up; the mapped callback name is then fired
# with the collected operands and the operand list is emptied.
#
# Raises MalformedPDFError if an EOFError is raised while processing.
def content_stream(receivers, instructions)
  buffer    = Buffer.new(StringIO.new(instructions), :content_stream => true)
  parser    = Parser.new(buffer, @objects)
  operators = PagesStrategy::OPERATORS
  operands  = []

  while (token = parser.parse_token(operators))
    unless token.kind_of?(Token) && operators.has_key?(token)
      operands << token
      next
    end

    callback(receivers, operators[token], operands)
    operands.clear
  end
rescue EOFError
  raise MalformedPDFError, "End Of File while processing a content stream"
end
81
+ end
82
+ end
83
+ end