syntax 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/syntax.rb +31 -0
- data/lib/syntax/common.rb +118 -0
- data/lib/syntax/convertors/html.rb +50 -0
- data/lib/syntax/ruby.rb +239 -0
- data/lib/syntax/version.rb +9 -0
- data/lib/syntax/xml.rb +108 -0
- data/lib/syntax/yaml.rb +105 -0
- data/test/ALL-TESTS.rb +5 -0
- data/test/syntax/tc_ruby.rb +518 -0
- data/test/syntax/tc_xml.rb +202 -0
- data/test/syntax/tc_yaml.rb +228 -0
- metadata +51 -0
data/lib/syntax.rb
ADDED
@@ -0,0 +1,31 @@
+require 'syntax/common'
+
+module Syntax
+
+  # A default tokenizer for handling syntaxes that are not explicitly handled
+  # elsewhere. It simply yields the given text as a single token.
+  class Default
+
+    # Yield the given text as a single token.
+    def tokenize( text )
+      yield Token.new( text, :normal )
+    end
+
+  end
+
+  # A hash for registering syntax implementations.
+  SYNTAX = Hash.new( Default )
+
+  # Load the implementation of the requested syntax. If the syntax cannot be
+  # found, or if it cannot be loaded for whatever reason, the Default syntax
+  # handler will be returned.
+  def load( syntax )
+    begin
+      require "syntax/#{syntax}"
+    rescue LoadError
+    end
+    SYNTAX[ syntax ].new
+  end
+  module_function :load
+
+end
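For reference, a minimal usage sketch (illustrative, not part of the package contents): Syntax.load requires "syntax/<name>", then instantiates whatever tokenizer registered itself in SYNTAX, falling back to Default when the require fails.

    require 'syntax'

    # A registered syntax (e.g. "ruby") returns its tokenizer instance.
    tokenizer = Syntax.load( "ruby" )

    # An unknown syntax falls back to Default, which yields the whole text
    # as a single :normal token.
    Syntax.load( "no-such-syntax" ).tokenize( "plain text" ) do |token|
      puts "#{token.group}: #{token}"   # => normal: plain text
    end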
data/lib/syntax/common.rb
ADDED
@@ -0,0 +1,118 @@
+require 'strscan'
+
+module Syntax
+
+  # A single token extracted by a tokenizer. It is simply the lexeme
+  # itself, decorated with a 'group' attribute to identify the type of the
+  # lexeme.
+  class Token < String
+
+    # the type of the lexeme that was extracted.
+    attr_reader :group
+
+    # Create a new Token representing the given text, and belonging to the
+    # given group.
+    def initialize( text, group )
+      super text
+      @group = group
+    end
+
+  end
+
+  # The base class of all tokenizers. It sets up the scanner and manages the
+  # looping until all tokens have been extracted. It also provides convenience
+  # methods to make sure adjacent tokens of identical groups are returned as
+  # a single token.
+  class Tokenizer
+
+    # Start tokenizing. This sets up the state in preparation for tokenization,
+    # such as creating a new scanner for the text and saving the callback block.
+    # The block will be invoked for each token extracted.
+    def start( text, &block )
+      @chunk = ""
+      @group = :normal
+      @callback = block
+      @text = StringScanner.new( text )
+      setup
+    end
+
+    # Subclasses may override this method to provide implementation-specific
+    # setup logic.
+    def setup
+    end
+
+    # Finish tokenizing. This flushes the buffer, yielding any remaining text
+    # to the client.
+    def finish
+      start_group nil
+      teardown
+    end
+
+    # Subclasses may override this method to provide implementation-specific
+    # teardown logic.
+    def teardown
+    end
+
+    # Subclasses must implement this method, which is called for each iteration
+    # of the tokenization process. This method may extract multiple tokens.
+    def step
+      raise NotImplementedError, "subclasses must implement #step"
+    end
+
+    # Begins tokenizing the given text, calling #step until the text has been
+    # exhausted.
+    def tokenize( text, &block )
+      start text, &block
+      step until @text.eos?
+      finish
+    end
+
+    private
+
+    # A convenience for delegating method calls to the scanner.
+    def self.delegate( sym )
+      define_method( sym ) { |*a| @text.__send__( sym, *a ) }
+    end
+
+    delegate :bol?
+    delegate :eos?
+    delegate :scan
+    delegate :scan_until
+    delegate :check
+    delegate :check_until
+    delegate :getch
+    delegate :matched
+    delegate :pre_match
+    delegate :peek
+    delegate :pos
+
+    # Access the n-th subgroup from the most recent match.
+    def subgroup(n)
+      @text[n]
+    end
+
+    # Append the given data to the currently active chunk.
+    def append( data )
+      @chunk << data
+    end
+
+    # Request that a new group be started. If the current group is the same
+    # as the group being requested, a new group will not be created. If a new
+    # group is created and the current chunk is not empty, the chunk's
+    # contents will be yielded to the client as a token, and then cleared.
+    #
+    # After the new group is started, if +data+ is non-nil it will be appended
+    # to the chunk.
+    def start_group( gr, data=nil )
+      if gr != @group && !@chunk.empty?
+        @callback.call( Token.new( @chunk, @group ) )
+        @chunk = ""
+      end
+
+      @group = gr
+      @chunk << data if data
+    end
+
+  end
+
+end
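To show how this base class is meant to be used, here is a hypothetical subclass (not part of the gem) that implements #step with the private start_group/scan helpers; adjacent chunks of the same group are coalesced into a single Token before the callback fires.

    require 'syntax'

    # Hypothetical example: highlight runs of digits, pass everything else through.
    class Digits < Syntax::Tokenizer
      def step
        if digits = scan( /\d+/ )
          start_group :number, digits          # flushes the previous chunk when the group changes
        else
          start_group :normal, scan( /\D+/ )   # non-digits accumulate in the :normal chunk
        end
      end
    end

    Syntax::SYNTAX["digits"] = Digits          # registered tokenizers are found by Syntax.load

    Digits.new.tokenize( "abc 123" ) { |tok| p [ tok.group, tok.to_s ] }
    # => [:normal, "abc "] then [:number, "123"]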
data/lib/syntax/convertors/html.rb
ADDED
@@ -0,0 +1,50 @@
+require 'syntax'
+
+module Syntax
+  module Convertors
+
+    # A simple class for converting a text into HTML.
+    class HTML
+
+      # A convenience method for instantiating a new HTML convertor for a
+      # specific syntax.
+      def self.for_syntax( syntax )
+        new( Syntax.load( syntax ) )
+      end
+
+      # Creates a new HTML convertor that uses the given tokenizer.
+      def initialize( tokenizer )
+        @tokenizer = tokenizer
+      end
+
+      # Converts the given text to HTML, using spans to represent token groups
+      # of any type but <tt>:normal</tt> (which is always unhighlighted). If
+      # +pre+ is +true+, the html is automatically wrapped in pre tags.
+      def convert( text, pre=true )
+        html = ""
+        html << "<pre>" if pre
+        @tokenizer.tokenize( text ) do |tok|
+          if tok.group == :normal
+            html << html_escape( tok )
+          else
+            html << "<span class=\"#{tok.group}\">#{html_escape(tok)}</span>"
+          end
+        end
+        html << "</pre>" if pre
+        html
+      end
+
+      private
+
+      # Replaces some characters with their corresponding HTML entities.
+      def html_escape( string )
+        string.gsub( /&/, "&amp;" ).
+               gsub( /</, "&lt;" ).
+               gsub( />/, "&gt;" ).
+               gsub( /"/, "&quot;" )
+      end
+
+    end
+
+  end
+end
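An end-to-end sketch of the convertor (illustrative, not part of the diff): for_syntax loads the tokenizer by name, and convert wraps every non-:normal token in a span keyed by its group.

    require 'syntax/convertors/html'

    convertor = Syntax::Convertors::HTML.for_syntax( "ruby" )
    puts convertor.convert( 'puts "hi" # greet' )
    # => <pre>...</pre> with markup roughly like
    #    <span class="ident">puts</span> <span class="punct">&quot;</span>...<span class="comment"># greet</span>
    puts convertor.convert( 'x = 1', false )   # pass false to omit the <pre> wrapper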
data/lib/syntax/ruby.rb
ADDED
@@ -0,0 +1,239 @@
+require 'syntax'
+
+module Syntax
+
+  # A tokenizer for the Ruby language. It recognizes all common syntax
+  # (and some less common syntax) but because it is not a true lexer, it
+  # will make mistakes on some ambiguous cases.
+  class Ruby < Tokenizer
+
+    # The list of all identifiers recognized as keywords.
+    KEYWORDS =
+      %w{if then elsif else end begin do rescue ensure while for
+         class module def yield raise until unless and or not when
+         case super undef break next redo retry in return alias
+         defined?}
+
+    # Perform ruby-specific setup
+    def setup
+      @selector = false
+    end
+
+    # Step through a single iteration of the tokenization process.
+    def step
+      case
+      when bol? && check( /=begin/ )
+        start_group( :comment, scan_until( /^=end$/ ) )
+      when bol? && check( /__END__$/ )
+        start_group( :comment, scan_until( /\Z/ ) )
+      else
+        case
+        when check( /def\s+/ )
+          start_group :keyword, scan( /def\s+/ )
+          start_group :method, scan_until( /$|(?=[;(\s])/ )
+        when check( /class\s+/ )
+          start_group :keyword, scan( /class\s+/ )
+          start_group :class, scan_until( /$|(?=[;\s<])/ )
+        when check( /module\s+/ )
+          start_group :keyword, scan( /module\s+/ )
+          start_group :module, scan_until( /$|(?=[;\s])/ )
+        when check( /::/ )
+          start_group :punct, scan(/::/)
+        when check( /:"/ )
+          start_group :symbol, scan(/:/)
+          scan_delimited_region :symbol, :symbol, "", true
+        when check( /:'/ )
+          start_group :symbol, scan(/:/)
+          scan_delimited_region :symbol, :symbol, "", false
+        when check( /:\w/ )
+          start_group :symbol, scan(/:\w+[!?]?/)
+        when check( /\?\\?./ )
+          start_group :char, scan(/\?\\?./)
+        when check( /(__FILE__|__LINE__|true|false|nil|self)[?!]?/ )
+          if @selector || matched[-1] == ?? || matched[-1] == ?!
+            start_group :ident,
+              scan(/(__FILE__|__LINE__|true|false|nil|self)[?!]?/)
+          else
+            start_group :constant,
+              scan(/(__FILE__|__LINE__|true|false|nil|self)/)
+          end
+          @selector = false
+        else
+          case peek(2)
+          when "%r"
+            scan_delimited_region :punct, :regex, scan( /../ ), true
+          when "%w", "%q"
+            scan_delimited_region :punct, :string, scan( /../ ), false
+          when "%s"
+            scan_delimited_region :punct, :symbol, scan( /../ ), false
+          when "%W", "%Q", "%x"
+            scan_delimited_region :punct, :string, scan( /../ ), true
+          when /%[^\sa-zA-Z0-9]/
+            scan_delimited_region :punct, :string, scan( /./ ), true
+          when "<<"
+            start_group :punct, scan( /<</ )
+            float_right = scan( /-/ )
+            append "-" if float_right
+            if ( type = scan( /['"]/ ) )
+              append type
+              delim = scan_until( /(?=#{type})/ )
+              if delim.nil?
+                append scan_until( /\Z/ )
+                return
+              end
+            else
+              delim = scan( /\w+/ ) or return
+            end
+            start_group :constant, delim
+            start_group :punct, scan( /#{type}/ ) if type
+            scan_delimited_region :constant, :string, "", ( type != "'" ),
+              delim, true, float_right
+          else
+            case peek(1)
+            when /\s/
+              start_group :normal, scan( /\s+/ )
+            when "#"
+              start_group :comment, scan( /#.*$/ )
+            when /[A-Z]/
+              start_group :constant, scan( /\w+/ )
+            when /[a-z_]/
+              word = scan( /\w+[?!]?/ )
+              if !@selector && KEYWORDS.include?( word )
+                start_group :keyword, word
+              else
+                start_group :ident, word
+              end
+              @selector = false
+            when /\d/
+              start_group :number,
+                scan( /[\d_]+(\.[\d_]+)?([eE][\d_]+)?/ )
+            when '"'
+              scan_delimited_region :punct, :string, "", true
+            when '/'
+              scan_delimited_region :punct, :regex, "", true
+            when "'"
+              scan_delimited_region :punct, :string, "", false
+            when "."
+              dots = scan( /\.{1,3}/ )
+              start_group :punct, dots
+              @selector = ( dots.length == 1 )
+            when /[@]/
+              start_group :attribute, scan( /@{1,2}\w*/ )
+            when /[$]/
+              start_group :global, scan(/\$/)
+              start_group :global, scan( /\w+|./ ) if check(/./)
+            when /[-!?*\/+=<>()\[\]\{}:;,&|%]/
+              start_group :punct,
+                scan(/[-!?*\/+=<>()\[\]\{}:;,&|%]/)
+            else
+              # all else just falls through this, to prevent
+              # infinite loops...
+              append getch
+            end
+          end
+        end
+      end
+    end
+
+    private
+
+    # Scan a delimited region of text. This handles the simple cases (strings
+    # delimited with quotes) as well as the more complex cases of %-strings
+    # and here-documents.
+    def scan_delimited_region( delim_group, inner_group, starter, exprs,
+      delim=nil, delim_alone=false, float_right=false )
+      # begin
+      if !delim
+        start_group delim_group, starter
+        delim = scan( /./ )
+        append delim
+
+        delim = case delim
+          when '{' then '}'
+          when '(' then ')'
+          when '[' then ']'
+          else delim
+        end
+      end
+
+      start_group inner_group
+
+      items = "\\\\|"
+
+      if delim_alone
+        items << "(^"
+        items << '\s*' if float_right
+        items << "#{delim}$)"
+      else
+        items << "#{delim}"
+      end
+
+      items << "|#(\\$|@|\\{)" if exprs
+      items = Regexp.new( items )
+
+      loop do
+        p = pos
+        match = scan_until( items )
+        if match.nil?
+          start_group inner_group, scan_until( /\Z/ )
+          break
+        else
+          text = pre_match[p..-1]
+          start_group inner_group, text if text.length > 0
+          case matched.strip
+          when "\\"
+            unless exprs
+              case peek(1)
+              when "'"
+                scan(/./)
+                start_group :expr, "\\'"
+              when "\\"
+                scan(/./)
+                start_group :expr, "\\\\"
+              else
+                start_group inner_group, "\\"
+              end
+            else
+              start_group :expr, "\\"
+              c = getch
+              append c
+              case c
+              when 'x'
+                append scan( /[a-fA-F0-9]{1,2}/ )
+              when /[0-7]/
+                append scan( /[0-7]{0,2}/ )
+              end
+            end
+          when delim
+            start_group delim_group, matched
+            break
+          when /^#/
+            start_group :expr, matched
+            case matched[1]
+            when ?{
+              depth = 1
+              while depth > 0
+                p = pos
+                c = scan_until( /[\{}]/ )
+                if c.nil?
+                  append scan_until( /\Z/ )
+                  break
+                else
+                  depth += ( matched == "{" ? 1 : -1 )
+                  append pre_match[p..-1]
+                  append matched
+                end
+              end
+            when ?$, ?@
+              append scan( /\w+/ )
+            end
+          else raise "unexpected match on #{matched}"
+          end
+        end
+      end
+    end
+  end
+
+  SYNTAX["ruby"] = Ruby
+
+end
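A quick look at the token stream this tokenizer emits (illustrative; exact chunk boundaries depend on the rules above, since adjacent tokens of the same group are merged):

    require 'syntax/ruby'

    Syntax::Ruby.new.tokenize( "def add(a, b)\n  a + b\nend\n" ) do |tok|
      print "#{tok.group} " unless tok.group == :normal
    end
    # prints roughly: keyword method punct ident punct ident punct ident punct ident keyword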
|