fireinc-pdf-reader 0.11.0.alpha
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +168 -0
- data/MIT-LICENSE +21 -0
- data/README.rdoc +137 -0
- data/Rakefile +34 -0
- data/TODO +45 -0
- data/bin/pdf_list_callbacks +15 -0
- data/bin/pdf_object +48 -0
- data/bin/pdf_text +15 -0
- data/examples/callbacks.rb +21 -0
- data/examples/extract_bates.rb +49 -0
- data/examples/extract_images.rb +108 -0
- data/examples/hash.rb +12 -0
- data/examples/metadata.rb +25 -0
- data/examples/page_counter_improved.rb +23 -0
- data/examples/page_counter_naive.rb +24 -0
- data/examples/rspec.rb +57 -0
- data/examples/text.rb +40 -0
- data/examples/version.rb +25 -0
- data/lib/pdf/hash.rb +15 -0
- data/lib/pdf/reader/abstract_strategy.rb +81 -0
- data/lib/pdf/reader/buffer.rb +346 -0
- data/lib/pdf/reader/cmap.rb +138 -0
- data/lib/pdf/reader/encoding.rb +190 -0
- data/lib/pdf/reader/encodings/mac_expert.txt +159 -0
- data/lib/pdf/reader/encodings/mac_roman.txt +128 -0
- data/lib/pdf/reader/encodings/pdf_doc.txt +40 -0
- data/lib/pdf/reader/encodings/standard.txt +47 -0
- data/lib/pdf/reader/encodings/symbol.txt +154 -0
- data/lib/pdf/reader/encodings/win_ansi.txt +29 -0
- data/lib/pdf/reader/encodings/zapf_dingbats.txt +201 -0
- data/lib/pdf/reader/error.rb +53 -0
- data/lib/pdf/reader/filter.rb +219 -0
- data/lib/pdf/reader/font.rb +133 -0
- data/lib/pdf/reader/form_xobject.rb +83 -0
- data/lib/pdf/reader/glyphlist.txt +4322 -0
- data/lib/pdf/reader/lzw.rb +123 -0
- data/lib/pdf/reader/metadata_strategy.rb +56 -0
- data/lib/pdf/reader/object_cache.rb +85 -0
- data/lib/pdf/reader/object_hash.rb +289 -0
- data/lib/pdf/reader/object_stream.rb +51 -0
- data/lib/pdf/reader/page.rb +185 -0
- data/lib/pdf/reader/page_text_receiver.rb +278 -0
- data/lib/pdf/reader/pages_strategy.rb +475 -0
- data/lib/pdf/reader/parser.rb +225 -0
- data/lib/pdf/reader/print_receiver.rb +18 -0
- data/lib/pdf/reader/reference.rb +66 -0
- data/lib/pdf/reader/register_receiver.rb +95 -0
- data/lib/pdf/reader/stream.rb +69 -0
- data/lib/pdf/reader/text_receiver.rb +264 -0
- data/lib/pdf/reader/token.rb +41 -0
- data/lib/pdf/reader/xref.rb +220 -0
- data/lib/pdf/reader.rb +296 -0
- data/lib/pdf-reader.rb +1 -0
- metadata +211 -0
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
# coding: utf-8
|
4
|
+
# Extract metadata only
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
require 'pdf/reader'
|
8
|
+
|
9
|
+
# Receiver that keeps hold of whatever the reader hands to its two
# metadata callbacks so the caller can inspect the values afterwards.
class MetaDataReceiver
  attr_accessor :regular, :xml

  # Callback: remember +data+ as the "regular" metadata. Returns +data+.
  def metadata(data)
    self.regular = data
  end

  # Callback: remember +data+ as the XML metadata. Returns +data+.
  def metadata_xml(data)
    self.xml = data
  end
end
|
21
|
+
|
22
|
+
# Read the file named by the first command line argument with page
# callbacks switched off and metadata callbacks switched on, then print
# what each metadata callback captured.
receiver = MetaDataReceiver.new
PDF::Reader.file(ARGV.shift, receiver, :pages => false, :metadata => true)
[receiver.regular, receiver.xml].each do |captured|
  puts captured.inspect
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# Improved Page Counter
|
5
|
+
#
|
6
|
+
# A simple app to display the number of pages in a PDF File.
|
7
|
+
#
|
8
|
+
|
9
|
+
require 'rubygems'
|
10
|
+
require 'pdf/reader'
|
11
|
+
|
12
|
+
# Receiver that stores the value delivered to its page_count callback.
class PageReceiver
  attr_accessor :pages

  # Callback: remember +arg+ as the page count. Returns +arg+.
  def page_count(arg)
    self.pages = arg
  end
end
|
20
|
+
|
21
|
+
# Process "somefile.pdf" with page callbacks switched off and report the
# count the receiver was given.
receiver = PageReceiver.new
PDF::Reader.file("somefile.pdf", receiver, :pages => false)
page_total = receiver.pages
puts "#{page_total} pages"
|
@@ -0,0 +1,24 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# A simple app to count the number of pages in a PDF File.
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
require 'pdf/reader'
|
8
|
+
|
9
|
+
# Receiver that counts how many times its end_page callback fires.
class PageReceiver
  attr_accessor :counter

  # Start counting from zero.
  def initialize
    self.counter = 0
  end

  # Callback: bump the counter by one. Returns the new count.
  def end_page
    self.counter += 1
  end
end
|
21
|
+
|
22
|
+
# Process "somefile.pdf" with the counting receiver and report how many
# end_page callbacks it saw.
receiver = PageReceiver.new
PDF::Reader.file("somefile.pdf", receiver)
pages_seen = receiver.counter
puts "#{pages_seen} pages"
|
data/examples/rspec.rb
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# Basic RSpec of a generated PDF
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
require 'pdf/reader'
|
8
|
+
require 'pdf/writer'
|
9
|
+
require 'spec'
|
10
|
+
|
11
|
+
# Receiver that gathers the text shown on each page. +content+ holds one
# string per begin_page callback, in the order the callbacks arrived.
class PageTextReceiver
  attr_accessor :content

  def initialize
    @content = []
  end

  # Callback: open a fresh, empty text bucket. +arg+ is accepted but unused.
  def begin_page(arg = nil)
    @content.push("")
  end

  # Callback: append +string+ (leading/trailing whitespace removed) to the
  # most recent bucket. Any extra arguments are ignored.
  def show_text(string, *params)
    @content[-1].concat(string.strip)
  end

  # the remaining plain text callbacks are handled exactly like show_text
  alias_method :super_show_text, :show_text
  alias_method :move_to_next_line_and_show_text, :show_text
  alias_method :set_spacing_next_line_show_text, :show_text

  # Callback: the first argument is a list; every String in it is passed
  # to show_text and every other element is skipped.
  def show_text_with_positioning(*params)
    params.first.each do |item|
      show_text(item) if String === item
    end
  end
end
|
37
|
+
|
38
|
+
context "My generated PDF" do
  specify "should have the correct text on 2 pages" do
    output_file = "chunkybacon.pdf"

    # build a two page document with one word on each page
    writer = PDF::Writer.new
    writer.text "Chunky", :font_size => 32, :justification => :center
    writer.start_new_page
    writer.text "Bacon", :font_size => 32, :justification => :center
    writer.save_as(output_file)

    # read it back, collecting the text per page
    receiver = PageTextReceiver.new
    PDF::Reader.file(output_file, receiver)
    pages = receiver.content

    # each word must be on the page it was written to
    pages.size.should eql(2)
    pages[0].should eql("Chunky")
    pages[1].should eql("Bacon")
  end
end
|
data/examples/text.rb
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# Extract all text from a single PDF
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
require 'pdf/reader'
|
8
|
+
|
9
|
+
# Receiver that accumulates the text drawn on each page. +content+ ends up
# with one string per begin_page callback.
class PageTextReceiver
  attr_accessor :content

  def initialize
    @content = []
  end

  # Callback: add an empty string that the following text callbacks fill.
  # +arg+ is accepted but unused.
  def begin_page(arg = nil)
    @content.push("")
  end

  # Callback: strip +string+ and append it to the newest entry in
  # +content+. Extra arguments are ignored.
  def show_text(string, *params)
    current_page = @content.last
    current_page << string.strip
  end

  # these text callbacks share show_text's behaviour
  alias_method :super_show_text, :show_text
  alias_method :move_to_next_line_and_show_text, :show_text
  alias_method :set_spacing_next_line_show_text, :show_text

  # Callback: receives a list as its first argument; only the String
  # elements are recorded, everything else in the list is skipped.
  def show_text_with_positioning(*params)
    fragments = params.first
    fragments.each do |fragment|
      next unless fragment.kind_of?(String)
      show_text(fragment)
    end
  end
end
|
37
|
+
|
38
|
+
# Run "somefile.pdf" through the text receiver and print the collected
# per-page strings.
receiver = PageTextReceiver.new
PDF::Reader.file("somefile.pdf", receiver)
page_texts = receiver.content
puts page_texts.inspect
|
data/examples/version.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# coding: utf-8
|
3
|
+
|
4
|
+
# Determine the PDF version of a file
|
5
|
+
|
6
|
+
require 'rubygems'
|
7
|
+
require 'pdf/reader'
|
8
|
+
|
9
|
+
# Receiver that remembers the value passed to its pdf_version callback.
class VersionReceiver
  attr_accessor :version

  # No version is known until the callback fires.
  def initialize
    self.version = nil
  end

  # Callback: store +arg+ (nil when omitted) as the version. Returns +arg+.
  def pdf_version(arg = nil)
    self.version = arg
  end
end
|
22
|
+
|
23
|
+
# Process the file named by the first command line argument and print the
# version the receiver captured.
receiver = VersionReceiver.new
PDF::Reader.file(ARGV.shift, receiver)
detected_version = receiver.version
puts detected_version
|
data/lib/pdf/hash.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
module PDF
  # Deprecated subclass of PDF::Reader::ObjectHash. It behaves like its
  # parent but prints a deprecation notice when constructed and when the
  # old #version accessor is used.
  class Hash < ::PDF::Reader::ObjectHash # :nodoc:
    # Emits the deprecation notice, then hands +input+ to the parent
    # constructor.
    def initialize(input)
      warn "DEPRECATION NOTICE: PDF::Hash has been deprecated, use PDF::Reader::ObjectHash instead"
      super(input)
    end

    # Old name for #pdf_version; warns and returns whatever #pdf_version
    # returns.
    def version
      warn "DEPRECATION NOTICE: PDF::Hash#version has been deprecated, use PDF::Reader::ObjectHash#pdf_version instead"
      pdf_version
    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
class PDF::Reader
|
4
|
+
|
5
|
+
# DEPRECATED: this class was deprecated in version 0.11.0 and will
|
6
|
+
# eventually be removed
|
7
|
+
class AbstractStrategy # :nodoc:

  # ohash     - the object hash that #trailer and the object lookups use
  # receivers - a single receiver or an Array of receivers
  # options   - stored and exposed through the private #options helper
  def initialize(ohash, receivers, options = {})
    @ohash     = ohash
    @options   = options
    @receivers = receivers.is_a?(Array) ? receivers : [receivers]
  end

  private

  # the options given at construction, or an empty hash if they were nil/false
  def options
    @options || {}
  end

  # invoke the callback called name on every receiver that responds to it,
  # splatting params as the arguments
  #
  def callback(name, params=[])
    @receivers.each do |receiver|
      next unless receiver.respond_to?(name)
      receiver.send(name, *params)
    end
  end

  # Convert strings to UTF-8. A string whose first two bytes are 254, 255 is
  # decoded with :UTF16Encoding (minus those two bytes), any other string
  # with :PDFDocEncoding. Hash values are replaced in place and the same hash
  # is returned, arrays are mapped into a new array, and every other object
  # is returned untouched.
  def decode_strings(obj)
    case obj
    when String
      if obj.unpack("C2") == [254, 255]
        PDF::Reader::Encoding.new(:UTF16Encoding).to_utf8(obj[2, obj.size])
      else
        PDF::Reader::Encoding.new(:PDFDocEncoding).to_utf8(obj)
      end
    when Hash
      obj.each_pair { |key, val| obj[key] = decode_strings(val) }
    when Array
      obj.map { |item| decode_strings(item) }
    else
      obj
    end
  end

  # the object hash given at construction
  def ohash
    @ohash
  end

  # the trailer as reported by the object hash
  def trailer
    ohash.trailer
  end

  # the object the trailer's :Info entry resolves to
  def info
    ohash.object(trailer[:Info])
  end

  def info?
    !!info
  end

  # the object the trailer's :Root entry resolves to
  def root
    ohash.object(trailer[:Root])
  end

  def root?
    !!root
  end

  # the object the root's :Pages entry resolves to
  def pages
    ohash.object(root[:Pages])
  end

  def pages?
    !!pages
  end

end
|
81
|
+
end
|
@@ -0,0 +1,346 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
################################################################################
|
4
|
+
#
|
5
|
+
# Copyright (C) 2010 James Healy (jimmy@deefa.com)
|
6
|
+
#
|
7
|
+
# Permission is hereby granted, free of charge, to any person obtaining
|
8
|
+
# a copy of this software and associated documentation files (the
|
9
|
+
# "Software"), to deal in the Software without restriction, including
|
10
|
+
# without limitation the rights to use, copy, modify, merge, publish,
|
11
|
+
# distribute, sublicense, and/or sell copies of the Software, and to
|
12
|
+
# permit persons to whom the Software is furnished to do so, subject to
|
13
|
+
# the following conditions:
|
14
|
+
#
|
15
|
+
# The above copyright notice and this permission notice shall be
|
16
|
+
# included in all copies or substantial portions of the Software.
|
17
|
+
#
|
18
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
19
|
+
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
20
|
+
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
21
|
+
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
22
|
+
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
23
|
+
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
24
|
+
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
25
|
+
#
|
26
|
+
################################################################################
|
27
|
+
|
28
|
+
class PDF::Reader
|
29
|
+
|
30
|
+
# A string tokeniser that recognises PDF grammar. When passed an IO stream or a
|
31
|
+
# string, repeated calls to token() will return the next token from the source.
|
32
|
+
#
|
33
|
+
# This is very low level, and getting the raw tokens is not very useful in itself.
|
34
|
+
#
|
35
|
+
# This will usually be used in conjunction with PDF::Reader::Parser, which converts
|
36
|
+
# the raw tokens into objects we can work with (strings, ints, arrays, etc)
|
37
|
+
#
|
38
|
+
class Buffer

  # the byte offset in the source that this buffer last recorded. It is
  # restored before reading, so several buffers can share one IO.
  attr_reader :pos

  # Creates a new buffer.
  #
  # Params:
  #
  #   io - the source to tokenise. seek, pos and read are called on it, so
  #        wrap plain strings in a StringIO.
  #
  # options:
  #
  #   :seek - a byte offset to seek to before starting to tokenise
  #   :content_stream - set to true if buffer will be tokenising a
  #                     content stream. Defaults to false
  #
  def initialize(io, opts = {})
    @io = io
    @tokens = []
    @in_content_stream = opts[:content_stream]

    @io.seek(opts[:seek]) if opts[:seek]
    @pos = @io.pos
  end

  # return true if there are no more tokens left
  #
  def empty?
    prepare_tokens if @tokens.size < 3

    @tokens.empty?
  end

  # return raw bytes from the underlying IO stream, or nil if the stream has
  # none left.
  #
  #   bytes - the number of bytes to read
  #
  # options:
  #
  #   :skip_eol - if true, the byte before the cursor and the byte under it
  #               are inspected first. For a CRLF pair the cursor is left
  #               after the pair, when the first byte is a LF the cursor is
  #               left just after that LF, otherwise the cursor is moved
  #               back two bytes from where the inspection ended. nil is
  #               returned if nothing could be inspected.
  #
  def read(bytes, opts = {})
    reset_pos

    if opts[:skip_eol]
      @io.seek(-1, IO::SEEK_CUR)
      str = @io.read(2)
      if str.nil?
        return nil
      elsif str == "\r\n"
        # the cursor is already past the CRLF
      elsif str[0,1] == "\n"
        @io.seek(-1, IO::SEEK_CUR)
      else
        @io.seek(-2, IO::SEEK_CUR)
      end
    end

    data = @io.read(bytes)
    save_pos
    data
  end

  # return the next token from the source. Returns a string (or a
  # PDF::Reader::Reference when three tokens formed an indirect reference)
  # if a token is found, nil if there are no tokens left.
  #
  def token
    reset_pos
    prepare_tokens if @tokens.size < 3
    merge_indirect_reference
    prepare_tokens if @tokens.size < 3

    @tokens.shift
  end

  # return the byte offset where the first XRef table in the source can be found.
  # Only the last 1024 bytes (or the whole source when it is shorter) are
  # searched for the %%EOF marker.
  #
  # Raises MalformedPDFError if the source is empty, has no %%EOF line in
  # the searched region, or has no line before the %%EOF line.
  #
  def find_first_xref_offset
    begin
      @io.seek(-1024, IO::SEEK_END)
    rescue StandardError
      # could not step back that far, search from the very start instead
      @io.seek(0)
    end
    data = @io.read(1024)

    # nothing could be read at all, so there is certainly no marker
    raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?

    # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
    lines = data.split(/[\n\r]+/).reverse
    eof_index = lines.index { |l| l.strip == "%%EOF" }

    raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
    raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
    lines[eof_index+1].to_i
  end

  private

  # Returns true if this buffer is parsing a content stream
  #
  def in_content_stream?
    @in_content_stream ? true : false
  end

  # Put the IO cursor back where this buffer left it, in case something
  # else moved it in the meantime.
  #
  def reset_pos
    @io.seek(@pos) if @io.pos != @pos
  end

  # remember the current position of the source IO stream so reset_pos can
  # restore it later.
  #
  def save_pos
    @pos = @io.pos
  end

  # attempt to prime the buffer with the next few tokens. Makes ten
  # attempts, each dispatched on the current state; the :stream state has no
  # handler, so nothing more is read once "stream" is the newest token.
  #
  def prepare_tokens
    10.times do
      case state
      when :literal_string then prepare_literal_token
      when :hex_string     then prepare_hex_token
      when :regular        then prepare_regular_token
      when :inline         then prepare_inline_token
      end
    end

    save_pos
  end

  # tokenising behaves slightly differently based on the current context.
  # Determine the current context/state by examining the last token we found
  #
  def state
    last = @tokens[-1]

    if last == "("
      :literal_string
    elsif last == "<"
      :hex_string
    elsif last == "stream"
      :stream
    elsif in_content_stream? && last == "ID"
      :inline
    else
      :regular
    end
  end

  # detect a series of 3 tokens that make up an indirect reference
  # (<digits> <digits> R). If we find them, replace the three tokens with a
  # single PDF::Reader::Reference instance.
  #
  # The cheap checks come first because the next three tokens are rarely a
  # reference. Both numbers must consist of digits only, so tokens that
  # merely contain a digit (like F1) are left alone.
  #
  def merge_indirect_reference
    return if @tokens.size < 3
    return if @tokens[2] != "R"

    obj_num, gen_num = @tokens[0], @tokens[1]
    return unless digits_only?(obj_num) && digits_only?(gen_num)

    @tokens[0, 3] = [PDF::Reader::Reference.new(obj_num.to_i, gen_num.to_i)]
  end

  # true if tok is a String made up of nothing but decimal digits
  #
  def digits_only?(tok)
    tok.is_a?(String) && !(tok =~ /\A\d+\z/).nil?
  end

  # read inline data byte by byte until the two bytes "EI" are seen, queue
  # the stripped data as one token and step back so "EI" is tokenised next.
  #
  # If the source ends before "EI" turns up, whatever was read is queued
  # (nothing is queued when no bytes were read at all).
  #
  def prepare_inline_token
    str = ""
    chr = nil

    while str[-2,2] != "EI"
      chr = @io.read(1)
      break if chr.nil?
      str << chr
    end

    if chr.nil?
      # ran out of data, there is no EI marker to cut off
      @tokens << str.strip if str.size > 0
    else
      @tokens << str[0, str.size-2].strip
      @io.seek(-2, IO::SEEK_CUR)
    end
  end

  # if we're currently inside a hex string, collect digits and ASCII letters
  # until some other byte shows up. Bytes with a value of 32 or less are
  # skipped. The collected text is queued, followed by a ">" token; if the
  # terminating byte was not a ">" it is queued as well.
  #
  def prepare_hex_token
    str = ""

    loop do
      chr = @io.read(1)
      break if chr.nil? # unbalanced params

      case chr.unpack("C*").first
      when 48..57, 65..90, 97..122
        str << chr
      when 0..32
        # ignore it
      else
        @tokens << str if str.size > 0
        @tokens << ">" if chr != ">"
        @tokens << chr
        break
      end
    end
  end

  # if we're currently inside a literal string we more or less just read bytes until
  # we find the closing ) delimiter, keeping track of nested parentheses. A
  # backslash and the byte following it are copied through untouched.
  #
  # The entire literal string is queued as a single token (unless it is
  # empty), followed by a ")" token. Unescaping is left to the caller.
  #
  def prepare_literal_token
    str = ""
    depth = 1

    while depth > 0
      chr = @io.read(1)
      case chr
      when nil
        depth = 0 # unbalanced params
      when "\x5c"
        str << chr << @io.read(1).to_s
      when "("
        str << "("
        depth += 1
      when ")"
        depth -= 1
        str << ")" unless depth == 0
      else
        str << chr
      end
    end

    @tokens << str if str.size > 0
    @tokens << ")"
  end

  # Extract the next regular token and stock it in our buffer, ready to be returned.
  #
  # What each byte means is complex, check out section "3.1.1 Character Set" of the 1.7 spec
  # to read up on it.
  #
  def prepare_regular_token
    tok = ""

    while chr = @io.read(1)
      case chr
      when "\x25"
        # '%' starts a comment, ignore everything until the next EOL char
        loop do
          chr = @io.read(1)
          break if chr.nil? || chr == "\x0A" || chr == "\x0D"
        end
      when "\x00", "\x09", "\x0A", "\x0C", "\x0D", "\x20"
        # white space, token finished
        break
      when "\x3C", "\x3E"
        # '<' or '>' delimiter, check if it is actually '<<' or '>>'
        @tokens << tok if tok.size > 0
        tok = ""
        chr << @io.read(1) if peek_char == chr
        @tokens << chr
        break
      when "\x28", "\x5B", "\x7B", "\x2F", "\x29", "\x5D", "\x7D"
        # any other delimiter ends the pending token and is a token itself
        @tokens << tok if tok.size > 0
        tok = ""
        @tokens << chr
        break
      else
        tok << chr
      end
    end

    @tokens << tok if tok.size > 0
  end

  # peek at the next character in the io stream, leaving the stream position
  # untouched
  #
  def peek_char
    chr = @io.read(1)
    @io.seek(-1, IO::SEEK_CUR) unless chr.nil?
    chr
  end
end
|
346
|
+
end
|