syntax 0.5.0
- data/lib/syntax.rb +31 -0
- data/lib/syntax/common.rb +118 -0
- data/lib/syntax/convertors/html.rb +50 -0
- data/lib/syntax/ruby.rb +239 -0
- data/lib/syntax/version.rb +9 -0
- data/lib/syntax/xml.rb +108 -0
- data/lib/syntax/yaml.rb +105 -0
- data/test/ALL-TESTS.rb +5 -0
- data/test/syntax/tc_ruby.rb +518 -0
- data/test/syntax/tc_xml.rb +202 -0
- data/test/syntax/tc_yaml.rb +228 -0
- metadata +51 -0
data/lib/syntax.rb
ADDED
@@ -0,0 +1,31 @@
require 'syntax/common'

module Syntax

  # A default tokenizer for handling syntaxes that are not explicitly handled
  # elsewhere. It simply yields the given text as a single token.
  class Default

    # Yield the given text as a single token.
    def tokenize( text )
      yield Token.new( text, :normal )
    end

  end

  # A hash for registering syntax implementations.
  SYNTAX = Hash.new( Default )

  # Load the implementation of the requested syntax. If the syntax cannot be
  # found, or if it cannot be loaded for whatever reason, the Default syntax
  # handler will be returned.
  def load( syntax )
    begin
      require "syntax/#{syntax}"
    rescue LoadError
    end
    SYNTAX[ syntax ].new
  end
  module_function :load

end
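A quick usage sketch (illustrative only, not one of the gem's files): asking for a syntax that has no registered implementation rescues the LoadError and falls back to the Default tokenizer, which yields the whole text as a single :normal token.

  require 'syntax'

  # "no-such-syntax" is a hypothetical name with no syntax/<name>.rb file,
  # so the SYNTAX hash's default value (Syntax::Default) is instantiated.
  tokenizer = Syntax.load( "no-such-syntax" )
  tokenizer.tokenize( "plain text" ) do |token|
    puts "#{token.group}: #{token}"   # => normal: plain text
  end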
data/lib/syntax/common.rb
ADDED
@@ -0,0 +1,118 @@
require 'strscan'

module Syntax

  # A single token extracted by a tokenizer. It is simply the lexeme
  # itself, decorated with a 'group' attribute to identify the type of the
  # lexeme.
  class Token < String

    # the type of the lexeme that was extracted.
    attr_reader :group

    # Create a new Token representing the given text, and belonging to the
    # given group.
    def initialize( text, group )
      super text
      @group = group
    end

  end

  # The base class of all tokenizers. It sets up the scanner and manages the
  # looping until all tokens have been extracted. It also provides convenience
  # methods to make sure adjacent tokens of identical groups are returned as
  # a single token.
  class Tokenizer

    # Start tokenizing. This sets up the state in preparation for tokenization,
    # such as creating a new scanner for the text and saving the callback block.
    # The block will be invoked for each token extracted.
    def start( text, &block )
      @chunk = ""
      @group = :normal
      @callback = block
      @text = StringScanner.new( text )
      setup
    end

    # Subclasses may override this method to provide implementation-specific
    # setup logic.
    def setup
    end

    # Finish tokenizing. This flushes the buffer, yielding any remaining text
    # to the client.
    def finish
      start_group nil
      teardown
    end

    # Subclasses may override this method to provide implementation-specific
    # teardown logic.
    def teardown
    end

    # Subclasses must implement this method, which is called for each iteration
    # of the tokenization process. This method may extract multiple tokens.
    def step
      raise NotImplementedError, "subclasses must implement #step"
    end

    # Begins tokenizing the given text, calling #step until the text has been
    # exhausted.
    def tokenize( text, &block )
      start text, &block
      step until @text.eos?
      finish
    end

    private

    # A convenience for delegating method calls to the scanner.
    def self.delegate( sym )
      define_method( sym ) { |*a| @text.__send__( sym, *a ) }
    end

    delegate :bol?
    delegate :eos?
    delegate :scan
    delegate :scan_until
    delegate :check
    delegate :check_until
    delegate :getch
    delegate :matched
    delegate :pre_match
    delegate :peek
    delegate :pos

    # Access the n-th subgroup from the most recent match.
    def subgroup(n)
      @text[n]
    end

    # Append the given data to the currently active chunk.
    def append( data )
      @chunk << data
    end

    # Request that a new group be started. If the current group is the same
    # as the group being requested, a new group will not be created. If a new
    # group is created and the current chunk is not empty, the chunk's
    # contents will be yielded to the client as a token, and then cleared.
    #
    # After the new group is started, if +data+ is non-nil it will be appended
    # to the chunk.
    def start_group( gr, data=nil )
      if gr != @group && !@chunk.empty?
        @callback.call( Token.new( @chunk, @group ) )
        @chunk = ""
      end

      @group = gr
      @chunk << data if data
    end

  end

end
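To show how a concrete syntax plugs into the Tokenizer base class above, here is a minimal, hypothetical subclass (not one of the gem's files). Its #step scans with the delegated StringScanner methods and reports lexemes through start_group; adjacent lexemes of the same group are merged into a single Token.

  require 'syntax'

  # Hypothetical tokenizer that highlights runs of digits as :number tokens.
  class Digits < Syntax::Tokenizer
    def step
      if ( digits = scan( /\d+/ ) )
        start_group :number, digits
      else
        start_group :normal, getch
      end
    end
  end

  Digits.new.tokenize( "abc 123" ) { |tok| p [ tok.group, tok ] }
  # consecutive characters of the same group arrive as one token:
  #   [:normal, "abc "]
  #   [:number, "123"]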
data/lib/syntax/convertors/html.rb
ADDED
@@ -0,0 +1,50 @@
require 'syntax'

module Syntax
  module Convertors

    # A simple class for converting a text into HTML.
    class HTML

      # A convenience method for instantiating a new HTML convertor for a
      # specific syntax.
      def self.for_syntax( syntax )
        new( Syntax.load( syntax ) )
      end

      # Creates a new HTML convertor that uses the given tokenizer.
      def initialize( tokenizer )
        @tokenizer = tokenizer
      end

      # Converts the given text to HTML, using spans to represent token groups
      # of any type but <tt>:normal</tt> (which is always unhighlighted). If
      # +pre+ is +true+, the html is automatically wrapped in pre tags.
      def convert( text, pre=true )
        html = ""
        html << "<pre>" if pre
        @tokenizer.tokenize( text ) do |tok|
          if tok.group == :normal
            html << html_escape( tok )
          else
            html << "<span class=\"#{tok.group}\">#{html_escape(tok)}</span>"
          end
        end
        html << "</pre>" if pre
        html
      end

      private

      # Replaces some characters with their corresponding HTML entities.
      def html_escape( string )
        string.gsub( /&/, "&amp;" ).
               gsub( /</, "&lt;" ).
               gsub( />/, "&gt;" ).
               gsub( /"/, "&quot;" )
      end

    end

  end
end
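A short usage sketch of the convertor (illustrative; the snippet and output below are not taken from the gem's documentation): for_syntax loads a tokenizer by name, and convert wraps every non-:normal token in a span classed after its group.

  require 'syntax/convertors/html'

  convertor = Syntax::Convertors::HTML.for_syntax( "ruby" )
  puts convertor.convert( 'puts "hello"' )
  # roughly:
  #   <pre><span class="ident">puts</span> <span class="punct">&quot;</span><span
  #   class="string">hello</span><span class="punct">&quot;</span></pre>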
data/lib/syntax/ruby.rb
ADDED
@@ -0,0 +1,239 @@
require 'syntax'

module Syntax

  # A tokenizer for the Ruby language. It recognizes all common syntax
  # (and some less common syntax) but because it is not a true lexer, it
  # will make mistakes on some ambiguous cases.
  class Ruby < Tokenizer

    # The list of all identifiers recognized as keywords.
    KEYWORDS =
      %w{if then elsif else end begin do rescue ensure while for
         class module def yield raise until unless and or not when
         case super undef break next redo retry in return alias
         defined?}

    # Perform ruby-specific setup
    def setup
      @selector = false
    end

    # Step through a single iteration of the tokenization process.
    def step
      case
      when bol? && check( /=begin/ )
        start_group( :comment, scan_until( /^=end$/ ) )
      when bol? && check( /__END__$/ )
        start_group( :comment, scan_until( /\Z/ ) )
      else
        case
        when check( /def\s+/ )
          start_group :keyword, scan( /def\s+/ )
          start_group :method, scan_until( /$|(?=[;(\s])/ )
        when check( /class\s+/ )
          start_group :keyword, scan( /class\s+/ )
          start_group :class, scan_until( /$|(?=[;\s<])/ )
        when check( /module\s+/ )
          start_group :keyword, scan( /module\s+/ )
          start_group :module, scan_until( /$|(?=[;\s])/ )
        when check( /::/ )
          start_group :punct, scan(/::/)
        when check( /:"/ )
          start_group :symbol, scan(/:/)
          scan_delimited_region :symbol, :symbol, "", true
        when check( /:'/ )
          start_group :symbol, scan(/:/)
          scan_delimited_region :symbol, :symbol, "", false
        when check( /:\w/ )
          start_group :symbol, scan(/:\w+[!?]?/)
        when check( /\?\\?./ )
          start_group :char, scan(/\?\\?./)
        when check( /(__FILE__|__LINE__|true|false|nil|self)[?!]?/ )
          if @selector || matched[-1] == ?? || matched[-1] == ?!
            start_group :ident,
              scan(/(__FILE__|__LINE__|true|false|nil|self)[?!]?/)
          else
            start_group :constant,
              scan(/(__FILE__|__LINE__|true|false|nil|self)/)
          end
          @selector = false
        else
          case peek(2)
          when "%r"
            scan_delimited_region :punct, :regex, scan( /../ ), true
          when "%w", "%q"
            scan_delimited_region :punct, :string, scan( /../ ), false
          when "%s"
            scan_delimited_region :punct, :symbol, scan( /../ ), false
          when "%W", "%Q", "%x"
            scan_delimited_region :punct, :string, scan( /../ ), true
          when /%[^\sa-zA-Z0-9]/
            scan_delimited_region :punct, :string, scan( /./ ), true
          when "<<"
            start_group :punct, scan( /<</ )
            float_right = scan( /-/ )
            append "-" if float_right
            if ( type = scan( /['"]/ ) )
              append type
              delim = scan_until( /(?=#{type})/ )
              if delim.nil?
                append scan_until( /\Z/ )
                return
              end
            else
              delim = scan( /\w+/ ) or return
            end
            start_group :constant, delim
            start_group :punct, scan( /#{type}/ ) if type
            scan_delimited_region :constant, :string, "", ( type != "'" ),
              delim, true, float_right
          else
            case peek(1)
            when /\s/
              start_group :normal, scan( /\s+/ )
            when "#"
              start_group :comment, scan( /#.*$/ )
            when /[A-Z]/
              start_group :constant, scan( /\w+/ )
            when /[a-z_]/
              word = scan( /\w+[?!]?/ )
              if !@selector && KEYWORDS.include?( word )
                start_group :keyword, word
              else
                start_group :ident, word
              end
              @selector = false
            when /\d/
              start_group :number,
                scan( /[\d_]+(\.[\d_]+)?([eE][\d_]+)?/ )
            when '"'
              scan_delimited_region :punct, :string, "", true
            when '/'
              scan_delimited_region :punct, :regex, "", true
            when "'"
              scan_delimited_region :punct, :string, "", false
            when "."
              dots = scan( /\.{1,3}/ )
              start_group :punct, dots
              @selector = ( dots.length == 1 )
            when /[@]/
              start_group :attribute, scan( /@{1,2}\w*/ )
            when /[$]/
              start_group :global, scan(/\$/)
              start_group :global, scan( /\w+|./ ) if check(/./)
            when /[-!?*\/+=<>()\[\]\{}:;,&|%]/
              start_group :punct,
                scan(/[-!?*\/+=<>()\[\]\{}:;,&|%]/)
            else
              # all else just falls through this, to prevent
              # infinite loops...
              append getch
            end
          end
        end
      end
    end

    private

    # Scan a delimited region of text. This handles the simple cases (strings
    # delimited with quotes) as well as the more complex cases of %-strings
    # and here-documents.
    def scan_delimited_region( delim_group, inner_group, starter, exprs,
                               delim=nil, delim_alone=false, float_right=false )
      # begin
      if !delim
        start_group delim_group, starter
        delim = scan( /./ )
        append delim

        delim = case delim
                when '{' then '}'
                when '(' then ')'
                when '[' then ']'
                else delim
                end
      end

      start_group inner_group

      items = "\\\\|"

      if delim_alone
        items << "(^"
        items << '\s*' if float_right
        items << "#{delim}$)"
      else
        items << "#{delim}"
      end

      items << "|#(\\$|@|\\{)" if exprs
      items = Regexp.new( items )

      loop do
        p = pos
        match = scan_until( items )
        if match.nil?
          start_group inner_group, scan_until( /\Z/ )
          break
        else
          text = pre_match[p..-1]
          start_group inner_group, text if text.length > 0
          case matched.strip
          when "\\"
            unless exprs
              case peek(1)
              when "'"
                scan(/./)
                start_group :expr, "\\'"
              when "\\"
                scan(/./)
                start_group :expr, "\\\\"
              else
                start_group inner_group, "\\"
              end
            else
              start_group :expr, "\\"
              c = getch
              append c
              case c
              when 'x'
                append scan( /[a-fA-F0-9]{1,2}/ )
              when /[0-7]/
                append scan( /[0-7]{0,2}/ )
              end
            end
          when delim
            start_group delim_group, matched
            break
          when /^#/
            start_group :expr, matched
            case matched[1]
            when ?{
              depth = 1
              while depth > 0
                p = pos
                c = scan_until( /[\{}]/ )
                if c.nil?
                  append scan_until( /\Z/ )
                  break
                else
                  depth += ( matched == "{" ? 1 : -1 )
                  append pre_match[p..-1]
                  append matched
                end
              end
            when ?$, ?@
              append scan( /\w+/ )
            end
          else raise "unexpected match on #{matched}"
          end
        end
      end
    end

  end

  SYNTAX["ruby"] = Ruby

end
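The Ruby tokenizer can also be driven directly, without the HTML convertor; a small illustrative snippet (not part of the gem's files or tests):

  require 'syntax/ruby'

  groups = []
  Syntax::Ruby.new.tokenize( "def add(a, b) a + b end" ) do |tok|
    groups << tok.group unless tok.group == :normal
  end
  p groups.uniq   # e.g. [:keyword, :method, :punct, :ident]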