fireinc-pdf-reader 0.11.0.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. data/CHANGELOG +168 -0
  2. data/MIT-LICENSE +21 -0
  3. data/README.rdoc +137 -0
  4. data/Rakefile +34 -0
  5. data/TODO +45 -0
  6. data/bin/pdf_list_callbacks +15 -0
  7. data/bin/pdf_object +48 -0
  8. data/bin/pdf_text +15 -0
  9. data/examples/callbacks.rb +21 -0
  10. data/examples/extract_bates.rb +49 -0
  11. data/examples/extract_images.rb +108 -0
  12. data/examples/hash.rb +12 -0
  13. data/examples/metadata.rb +25 -0
  14. data/examples/page_counter_improved.rb +23 -0
  15. data/examples/page_counter_naive.rb +24 -0
  16. data/examples/rspec.rb +57 -0
  17. data/examples/text.rb +40 -0
  18. data/examples/version.rb +25 -0
  19. data/lib/pdf/hash.rb +15 -0
  20. data/lib/pdf/reader/abstract_strategy.rb +81 -0
  21. data/lib/pdf/reader/buffer.rb +346 -0
  22. data/lib/pdf/reader/cmap.rb +138 -0
  23. data/lib/pdf/reader/encoding.rb +190 -0
  24. data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
  25. data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
  26. data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
  27. data/lib/pdf/reader/encodings/standard.txt +47 -0
  28. data/lib/pdf/reader/encodings/symbol.txt +154 -0
  29. data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
  30. data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
  31. data/lib/pdf/reader/error.rb +53 -0
  32. data/lib/pdf/reader/filter.rb +219 -0
  33. data/lib/pdf/reader/font.rb +133 -0
  34. data/lib/pdf/reader/form_xobject.rb +83 -0
  35. data/lib/pdf/reader/glyphlist.txt +4322 -0
  36. data/lib/pdf/reader/lzw.rb +123 -0
  37. data/lib/pdf/reader/metadata_strategy.rb +56 -0
  38. data/lib/pdf/reader/object_cache.rb +85 -0
  39. data/lib/pdf/reader/object_hash.rb +289 -0
  40. data/lib/pdf/reader/object_stream.rb +51 -0
  41. data/lib/pdf/reader/page.rb +185 -0
  42. data/lib/pdf/reader/page_text_receiver.rb +278 -0
  43. data/lib/pdf/reader/pages_strategy.rb +475 -0
  44. data/lib/pdf/reader/parser.rb +225 -0
  45. data/lib/pdf/reader/print_receiver.rb +18 -0
  46. data/lib/pdf/reader/reference.rb +66 -0
  47. data/lib/pdf/reader/register_receiver.rb +95 -0
  48. data/lib/pdf/reader/stream.rb +69 -0
  49. data/lib/pdf/reader/text_receiver.rb +264 -0
  50. data/lib/pdf/reader/token.rb +41 -0
  51. data/lib/pdf/reader/xref.rb +220 -0
  52. data/lib/pdf/reader.rb +296 -0
  53. data/lib/pdf-reader.rb +1 -0
  54. metadata +211 -0
@@ -0,0 +1,133 @@
1
+ ################################################################################
2
+ #
3
+ # Copyright (C) 2008 James Healy (jimmy@deefa.com)
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining
6
+ # a copy of this software and associated documentation files (the
7
+ # "Software"), to deal in the Software without restriction, including
8
+ # without limitation the rights to use, copy, modify, merge, publish,
9
+ # distribute, sublicense, and/or sell copies of the Software, and to
10
+ # permit persons to whom the Software is furnished to do so, subject to
11
+ # the following conditions:
12
+ #
13
+ # The above copyright notice and this permission notice shall be
14
+ # included in all copies or substantial portions of the Software.
15
+ #
16
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
+ #
24
+ ################################################################################
25
+
26
+ class PDF::Reader
27
+ class Font
28
+ attr_accessor :label, :subtype, :encoding, :descendantfonts, :tounicode
29
+ attr_reader :widths, :first_char, :ascent, :descent, :missing_width, :bbox
30
+ attr_reader :basefont
31
+
32
# Builds a font from a PDF font dictionary.
#
# ohash - object used to resolve values from the dictionary; it is stored in
#         @ohash and the extract_* helpers call #object on it.
# obj   - the font dictionary, read with symbol keys (:Subtype, :BaseFont,
#         :FontDescriptor, ...).
#
# Calling with either argument missing (nil) is deprecated: a warning is
# written to $stderr and no font data is extracted.
def initialize(ohash = nil, obj = nil)
  if ohash.nil? || obj.nil?
    # Spelling fixed: this message used to read "DEPREACTION WARNING".
    $stderr.puts "DEPRECATION WARNING - PDF::Reader::Font.new should be called with 2 args"
    return
  end

  @ohash = ohash

  extract_base_info(obj)
  extract_descriptor(obj)
  extract_descendants(obj)
end
43
+
44
# Builds a Hash that maps glyph names (Symbols) to unicode codepoints
# (Integers). The data is parsed from the glyphlist.txt file that sits in the
# same directory as this source file; the list is supplied by Adobe at:
# http://www.adobe.com/devnet/opentype/archives/glyphlist.txt
#
# Lines without a "name;XXXX" pair are ignored. The file is read again on
# every call.
def self.glyphnames
  mode = if RUBY_VERSION >= "1.9"
           "r:BINARY"
         else
           "r"
         end
  mapping = {}

  File.open("#{File.dirname(__FILE__)}/glyphlist.txt", mode) do |file|
    file.each do |line|
      found = line.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
      next if found.nil?

      mapping[found[1].to_sym] = "0x#{found[2]}".hex
    end
  end

  mapping
end
60
+
61
# Sets the base font name and resets @encoding to the default for that font:
# "Symbol" and "ZapfDingbats" get their dedicated encodings, every other
# font gets nil. The default can always be replaced afterwards through
# encoding=.
def basefont=(font)
  encoding_name = case font
                  when "Symbol" then "SymbolEncoding"
                  when "ZapfDingbats" then "ZapfDingbatsEncoding"
                  end

  @encoding = encoding_name && PDF::Reader::Encoding.new(encoding_name)
  @basefont = font
end
74
+
75
# Converts text drawn with this font into UTF-8.
#
# params - a String is translated by the font's encoding (tounicode is handed
#          along); an Array is translated element by element, recursively;
#          any other value is returned untouched.
#
# Raises UnsupportedFeatureError when the font's encoding is a plain String.
def to_utf8(params)
  if encoding.kind_of?(String)
    raise UnsupportedFeatureError, "font encoding '#{encoding}' currently unsupported"
  end

  return params.map { |item| to_utf8(item) } if params.instance_of?(Array)
  return params unless params.instance_of?(String)

  # Translate the bytestream into a UTF-8 string. When no encoding has been
  # specified, assume the text is in Adobe Standard Encoding.
  converter = encoding || PDF::Reader::Encoding.new(:StandardEncoding)
  converter.to_utf8(params, tounicode)
end
89
+
90
# Returns the width recorded for the first character of +c+.
#
# c - a String; only its first codepoint is looked at.
#
# @widths is indexed relative to @first_char. The result is 0 when the font
# has no widths array, and @missing_width (which defaults to 0) when the
# character is not covered by the array or +c+ is empty.
def glyph_width(c)
  @missing_width ||= 0
  return 0 if @widths.nil?

  code = c.codepoints.first
  return @missing_width if code.nil? # empty string, nothing to measure

  # NOTE(review): an unset @first_char is treated as 0 here rather than
  # raising - confirm this suits fonts that omit FirstChar.
  index = code - (@first_char || 0)

  # Array#fetch counts negative indexes from the end of the array, so a code
  # below @first_char would silently pick up some other glyph's width.
  return @missing_width if index < 0

  @widths.fetch(index, @missing_width)
end
98
+
99
+ private
100
+
101
# Copies the basic attributes out of the font dictionary +obj+. Every value
# is resolved through @ohash.object before it is stored. The :Encoding entry
# is wrapped in a PDF::Reader::Encoding, and when a :ToUnicode entry is
# present its unfiltered stream data is wrapped in a PDF::Reader::CMap.
def extract_base_info(obj)
  resolve = lambda { |key| @ohash.object(obj[key]) }

  @subtype    = resolve.call(:Subtype)
  @basefont   = resolve.call(:BaseFont)
  @encoding   = PDF::Reader::Encoding.new(resolve.call(:Encoding))
  @widths     = resolve.call(:Widths)
  @first_char = resolve.call(:FirstChar)

  return unless obj[:ToUnicode]

  @tounicode = PDF::Reader::CMap.new(resolve.call(:ToUnicode).unfiltered_data)
end
112
+
113
# Reads the font metrics from the :FontDescriptor entry of the font
# dictionary +obj+. The descriptor and each value taken from it (:Ascent,
# :Descent, :MissingWidth, :FontBBox) are resolved through @ohash.object.
# Nothing happens when the dictionary has no descriptor.
def extract_descriptor(obj)
  if (descriptor_ref = obj[:FontDescriptor])
    descriptor = @ohash.object(descriptor_ref)

    @ascent        = @ohash.object(descriptor[:Ascent])
    @descent       = @ohash.object(descriptor[:Descent])
    @missing_width = @ohash.object(descriptor[:MissingWidth])
    @bbox          = @ohash.object(descriptor[:FontBBox])
  end
end
122
+
123
# Builds a PDF::Reader::Font for every entry in the :DescendantFonts list of
# the font dictionary +obj+ and stores the result in @descendantfonts. The
# list and each of its entries are resolved through @ohash.object. Nothing
# happens when the dictionary has no such entry.
def extract_descendants(obj)
  if (list_ref = obj[:DescendantFonts])
    @descendantfonts = @ohash.object(list_ref).map do |child|
      PDF::Reader::Font.new(@ohash, @ohash.object(child))
    end
  end
end
131
+
132
+ end
133
+ end
@@ -0,0 +1,83 @@
1
+ # coding: utf-8
2
+
3
+ module PDF
4
+ class Reader
5
+
6
+ # High level representation of a single PDF form xobject. Form xobjects
7
+ # are contained pieces of content that can be inserted onto multiple
8
+ # pages. They're generally used as a space efficient way to store
9
+ # repetitive content (like logos, headers, footers, etc).
10
+ #
11
+ # This behaves and looks much like a limited PDF::Reader::Page class.
12
+ #
13
+ class FormXObject
14
+
15
# Keeps hold of the owning page and of that page's object store, then
# resolves +xobject+ through the store's #deref.
#
# page    - must respond to #objects.
# xobject - the form xobject, or something the store can dereference to it.
def initialize(page, xobject)
  @page = page
  store = page.objects
  @objects = store
  @xobject = store.deref(xobject)
end
20
+
21
# Returns the resources that accompany this form: the dereferenced
# :Resources entry of the xobject's hash, or an empty Hash when that
# resolves to nothing. The result is remembered in @resources.
#
def resources
  return @resources if @resources

  found = @objects.deref(@xobject.hash[:Resources])
  @resources = found || {}
end
26
+
27
# Returns a Hash of the fonts used on this form.
#
# The keys are the font labels found under the :Font entry of the form's
# resources.
#
# The values are PDF::Reader::Font instances, each built from the object
# store and the dereferenced font entry.
#
def fonts
  font_dict = @objects.deref(resources[:Font] || {})
  by_label = {}

  font_dict.each do |label, font|
    by_label[label] = PDF::Reader::Font.new(@objects, @objects.deref(font))
  end

  by_label
end
40
+
41
# Runs through the raw content stream of this form in sequential order and
# hands the resulting callbacks to each of the given receiver objects.
#
# See the comments on PDF::Reader::Page#walk for more detail.
#
def walk(*receivers)
  instructions = raw_content
  content_stream(receivers, instructions)
end
49
+
50
# Returns the raw content stream for this form, i.e. the unfiltered data of
# the underlying xobject. This is plumbing, nothing to see here unless
# you're a PDF nerd.
#
def raw_content
  stream = @xobject
  stream.unfiltered_data
end
56
+
57
+ private
58
+
59
# Sends the message +name+ with +params+ as its arguments to every receiver
# that responds to it; receivers that don't respond are skipped.
def callback(receivers, name, params=[])
  receivers.each { |handler|
    next unless handler.respond_to?(name)

    handler.send(name, *params)
  }
end
64
+
65
# Parses +instructions+ (a content stream held in a String) token by token.
#
# Tokens that are Token instances and appear as keys in
# PagesStrategy::OPERATORS trigger a callback on the receivers, named by the
# table entry and carrying the parameters gathered so far; the parameter
# list is then emptied. Every other token is gathered as a parameter.
#
# Raises MalformedPDFError when an EOFError is raised while processing.
def content_stream(receivers, instructions)
  io = StringIO.new(instructions)
  parser = Parser.new(Buffer.new(io, :content_stream => true), @objects)
  operators = PagesStrategy::OPERATORS
  operands = []

  while (token = parser.parse_token(operators))
    unless token.kind_of?(Token) && operators.has_key?(token)
      operands << token
      next
    end

    callback(receivers, operators[token], operands)
    operands.clear
  end
rescue EOFError
  raise MalformedPDFError, "End Of File while processing a content stream"
end
81
+ end
82
+ end
83
+ end