syntax 0.5.0

@@ -0,0 +1,31 @@
+ require 'syntax/common'
+
+ module Syntax
+
+   # A default tokenizer for handling syntaxes that are not explicitly handled
+   # elsewhere. It simply yields the given text as a single token.
+   class Default
+
+     # Yield the given text as a single token.
+     def tokenize( text )
+       yield Token.new( text, :normal )
+     end
+
+   end
+
+   # A hash for registering syntax implementations.
+   SYNTAX = Hash.new( Default )
+
+   # Load the implementation of the requested syntax. If the syntax cannot be
+   # found, or if it cannot be loaded for whatever reason, the Default syntax
+   # handler will be returned.
+   def load( syntax )
+     begin
+       require "syntax/#{syntax}"
+     rescue LoadError
+     end
+     SYNTAX[ syntax ].new
+   end
+   module_function :load
+
+ end
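
A minimal usage sketch of the loader above (the syntax names and sample text are illustrative): Syntax.load requires "syntax/<name>", then instantiates whatever tokenizer that file registered in SYNTAX, falling back to Default when the require fails or nothing was registered.

    require 'syntax'

    # A registered syntax returns its own tokenizer ("ruby" is registered further below).
    ruby_tokenizer = Syntax.load( "ruby" )

    # An unknown syntax silently falls back to Syntax::Default, which yields
    # the whole text as a single :normal token.
    fallback = Syntax.load( "no-such-syntax" )
    fallback.tokenize( "plain text" ) { |tok| puts "#{tok.group}: #{tok}" }
    # prints: normal: plain text
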
@@ -0,0 +1,118 @@
+ require 'strscan'
+
+ module Syntax
+
+   # A single token extracted by a tokenizer. It is simply the lexeme
+   # itself, decorated with a 'group' attribute to identify the type of the
+   # lexeme.
+   class Token < String
+
+     # the type of the lexeme that was extracted.
+     attr_reader :group
+
+     # Create a new Token representing the given text, and belonging to the
+     # given group.
+     def initialize( text, group )
+       super text
+       @group = group
+     end
+
+   end
+
+   # The base class of all tokenizers. It sets up the scanner and manages the
+   # looping until all tokens have been extracted. It also provides convenience
+   # methods to make sure adjacent tokens of identical groups are returned as
+   # a single token.
+   class Tokenizer
+
+     # Start tokenizing. This sets up the state in preparation for tokenization,
+     # such as creating a new scanner for the text and saving the callback block.
+     # The block will be invoked for each token extracted.
+     def start( text, &block )
+       @chunk = ""
+       @group = :normal
+       @callback = block
+       @text = StringScanner.new( text )
+       setup
+     end
+
+     # Subclasses may override this method to provide implementation-specific
+     # setup logic.
+     def setup
+     end
+
+     # Finish tokenizing. This flushes the buffer, yielding any remaining text
+     # to the client.
+     def finish
+       start_group nil
+       teardown
+     end
+
+     # Subclasses may override this method to provide implementation-specific
+     # teardown logic.
+     def teardown
+     end
+
+     # Subclasses must implement this method, which is called for each iteration
+     # of the tokenization process. This method may extract multiple tokens.
+     def step
+       raise NotImplementedError, "subclasses must implement #step"
+     end
+
+     # Begins tokenizing the given text, calling #step until the text has been
+     # exhausted.
+     def tokenize( text, &block )
+       start text, &block
+       step until @text.eos?
+       finish
+     end
+
+     private
+
+     # A convenience for delegating method calls to the scanner.
+     def self.delegate( sym )
+       define_method( sym ) { |*a| @text.__send__( sym, *a ) }
+     end
+
+     delegate :bol?
+     delegate :eos?
+     delegate :scan
+     delegate :scan_until
+     delegate :check
+     delegate :check_until
+     delegate :getch
+     delegate :matched
+     delegate :pre_match
+     delegate :peek
+     delegate :pos
+
+     # Access the n-th subgroup from the most recent match.
+     def subgroup(n)
+       @text[n]
+     end
+
+     # Append the given data to the currently active chunk.
+     def append( data )
+       @chunk << data
+     end
+
+     # Request that a new group be started. If the current group is the same
+     # as the group being requested, a new group will not be created. If a new
+     # group is created and the current chunk is not empty, the chunk's
+     # contents will be yielded to the client as a token, and then cleared.
+     #
+     # After the new group is started, if +data+ is non-nil it will be appended
+     # to the chunk.
+     def start_group( gr, data=nil )
+       if gr != @group && !@chunk.empty?
+         @callback.call( Token.new( @chunk, @group ) )
+         @chunk = ""
+       end
+
+       @group = gr
+       @chunk << data if data
+     end
+
+   end
+
+ end
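
The Tokenizer above is meant to be subclassed: #step consumes a little input on each call through the delegated StringScanner methods, and start_group/append coalesce adjacent lexemes of the same group into a single Token. A small sketch of a custom subclass (the class UpcaseWords and its grouping rules are invented for illustration):

    require 'syntax/common'

    class UpcaseWords < Syntax::Tokenizer
      # Called repeatedly until the scanner reaches end of input.
      def step
        if word = scan( /[A-Z]+/ )
          start_group :constant, word   # runs of capitals become one token
        else
          start_group :normal, getch    # everything else stays :normal
        end
      end
    end

    UpcaseWords.new.tokenize( "ABC def GHI" ) do |tok|
      puts "#{tok.group.inspect} #{tok.inspect}"
    end
    # prints:
    #   :constant "ABC"
    #   :normal " def "
    #   :constant "GHI"
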
@@ -0,0 +1,50 @@
+ require 'syntax'
+
+ module Syntax
+   module Convertors
+
+     # A simple class for converting a text into HTML.
+     class HTML
+
+       # A convenience method for instantiating a new HTML convertor for a
+       # specific syntax.
+       def self.for_syntax( syntax )
+         new( Syntax.load( syntax ) )
+       end
+
+       # Creates a new HTML convertor that uses the given tokenizer.
+       def initialize( tokenizer )
+         @tokenizer = tokenizer
+       end
+
+       # Converts the given text to HTML, using spans to represent token groups
+       # of any type but <tt>:normal</tt> (which is always unhighlighted). If
+       # +pre+ is +true+, the html is automatically wrapped in pre tags.
+       def convert( text, pre=true )
+         html = ""
+         html << "<pre>" if pre
+         @tokenizer.tokenize( text ) do |tok|
+           if tok.group == :normal
+             html << html_escape( tok )
+           else
+             html << "<span class=\"#{tok.group}\">#{html_escape(tok)}</span>"
+           end
+         end
+         html << "</pre>" if pre
+         html
+       end
+
+       private
+
+       # Replaces some characters with their corresponding HTML entities.
+       def html_escape( string )
+         string.gsub( /&/, "&amp;" ).
+                gsub( /</, "&lt;" ).
+                gsub( />/, "&gt;" ).
+                gsub( /"/, "&quot;" )
+       end
+
+     end
+
+   end
+ end
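
A usage sketch for the convertor above (the require path is assumed from the module nesting, and the highlighted snippet is arbitrary): for_syntax loads the named tokenizer through Syntax.load, and convert wraps every non-:normal token in a span whose CSS class is the token's group.

    require 'syntax/convertors/html'   # assumed install path for this file

    convertor = Syntax::Convertors::HTML.for_syntax( "ruby" )
    puts convertor.convert( 'puts "hello"' )
    # => <pre><span class="ident">puts</span> <span class="punct">&quot;</span><span class="string">hello</span><span class="punct">&quot;</span></pre>
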
@@ -0,0 +1,239 @@
+ require 'syntax'
+
+ module Syntax
+
+   # A tokenizer for the Ruby language. It recognizes all common syntax
+   # (and some less common syntax) but because it is not a true lexer, it
+   # will make mistakes on some ambiguous cases.
+   class Ruby < Tokenizer
+
+     # The list of all identifiers recognized as keywords.
+     KEYWORDS =
+       %w{if then elsif else end begin do rescue ensure while for
+          class module def yield raise until unless and or not when
+          case super undef break next redo retry in return alias
+          defined?}
+
+     # Perform ruby-specific setup
+     def setup
+       @selector = false
+     end
+
+     # Step through a single iteration of the tokenization process.
+     def step
+       case
+       when bol? && check( /=begin/ )
+         start_group( :comment, scan_until( /^=end$/ ) )
+       when bol? && check( /__END__$/ )
+         start_group( :comment, scan_until( /\Z/ ) )
+       else
+         case
+         when check( /def\s+/ )
+           start_group :keyword, scan( /def\s+/ )
+           start_group :method, scan_until( /$|(?=[;(\s])/ )
+         when check( /class\s+/ )
+           start_group :keyword, scan( /class\s+/ )
+           start_group :class, scan_until( /$|(?=[;\s<])/ )
+         when check( /module\s+/ )
+           start_group :keyword, scan( /module\s+/ )
+           start_group :module, scan_until( /$|(?=[;\s])/ )
+         when check( /::/ )
+           start_group :punct, scan(/::/)
+         when check( /:"/ )
+           start_group :symbol, scan(/:/)
+           scan_delimited_region :symbol, :symbol, "", true
+         when check( /:'/ )
+           start_group :symbol, scan(/:/)
+           scan_delimited_region :symbol, :symbol, "", false
+         when check( /:\w/ )
+           start_group :symbol, scan(/:\w+[!?]?/)
+         when check( /\?\\?./ )
+           start_group :char, scan(/\?\\?./)
+         when check( /(__FILE__|__LINE__|true|false|nil|self)[?!]?/ )
+           if @selector || matched[-1] == ?? || matched[-1] == ?!
+             start_group :ident,
+               scan(/(__FILE__|__LINE__|true|false|nil|self)[?!]?/)
+           else
+             start_group :constant,
+               scan(/(__FILE__|__LINE__|true|false|nil|self)/)
+           end
+           @selector = false
+         else
+           case peek(2)
+           when "%r"
+             scan_delimited_region :punct, :regex, scan( /../ ), true
+           when "%w", "%q"
+             scan_delimited_region :punct, :string, scan( /../ ), false
+           when "%s"
+             scan_delimited_region :punct, :symbol, scan( /../ ), false
+           when "%W", "%Q", "%x"
+             scan_delimited_region :punct, :string, scan( /../ ), true
+           when /%[^\sa-zA-Z0-9]/
+             scan_delimited_region :punct, :string, scan( /./ ), true
+           when "<<"
+             start_group :punct, scan( /<</ )
+             float_right = scan( /-/ )
+             append "-" if float_right
+             if ( type = scan( /['"]/ ) )
+               append type
+               delim = scan_until( /(?=#{type})/ )
+               if delim.nil?
+                 append scan_until( /\Z/ )
+                 return
+               end
+             else
+               delim = scan( /\w+/ ) or return
+             end
+             start_group :constant, delim
+             start_group :punct, scan( /#{type}/ ) if type
+             scan_delimited_region :constant, :string, "", ( type != "'" ),
+               delim, true, float_right
+           else
+             case peek(1)
+             when /\s/
+               start_group :normal, scan( /\s+/ )
+             when "#"
+               start_group :comment, scan( /#.*$/ )
+             when /[A-Z]/
+               start_group :constant, scan( /\w+/ )
+             when /[a-z_]/
+               word = scan( /\w+[?!]?/ )
+               if !@selector && KEYWORDS.include?( word )
+                 start_group :keyword, word
+               else
+                 start_group :ident, word
+               end
+               @selector = false
+             when /\d/
+               start_group :number,
+                 scan( /[\d_]+(\.[\d_]+)?([eE][\d_]+)?/ )
+             when '"'
+               scan_delimited_region :punct, :string, "", true
+             when '/'
+               scan_delimited_region :punct, :regex, "", true
+             when "'"
+               scan_delimited_region :punct, :string, "", false
+             when "."
+               dots = scan( /\.{1,3}/ )
+               start_group :punct, dots
+               @selector = ( dots.length == 1 )
+             when /[@]/
+               start_group :attribute, scan( /@{1,2}\w*/ )
+             when /[$]/
+               start_group :global, scan(/\$/)
+               start_group :global, scan( /\w+|./ ) if check(/./)
+             when /[-!?*\/+=<>()\[\]\{}:;,&|%]/
+               start_group :punct,
+                 scan(/[-!?*\/+=<>()\[\]\{}:;,&|%]/)
+             else
+               # all else just falls through this, to prevent
+               # infinite loops...
+               append getch
+             end
+           end
+         end
+       end
+     end
+
+     private
+
+     # Scan a delimited region of text. This handles the simple cases (strings
+     # delimited with quotes) as well as the more complex cases of %-strings
+     # and here-documents.
+     def scan_delimited_region( delim_group, inner_group, starter, exprs,
+           delim=nil, delim_alone=false, float_right=false )
+       # begin
+       if !delim
+         start_group delim_group, starter
+         delim = scan( /./ )
+         append delim
+
+         delim = case delim
+           when '{' then '}'
+           when '(' then ')'
+           when '[' then ']'
+           else delim
+         end
+       end
+
+       start_group inner_group
+
+       items = "\\\\|"
+
+       if delim_alone
+         items << "(^"
+         items << '\s*' if float_right
+         items << "#{delim}$)"
+       else
+         items << "#{delim}"
+       end
+
+       items << "|#(\\$|@|\\{)" if exprs
+       items = Regexp.new( items )
+
+       loop do
+         p = pos
+         match = scan_until( items )
+         if match.nil?
+           start_group inner_group, scan_until( /\Z/ )
+           break
+         else
+           text = pre_match[p..-1]
+           start_group inner_group, text if text.length > 0
+           case matched.strip
+           when "\\"
+             unless exprs
+               case peek(1)
+               when "'"
+                 scan(/./)
+                 start_group :expr, "\\'"
+               when "\\"
+                 scan(/./)
+                 start_group :expr, "\\\\"
+               else
+                 start_group inner_group, "\\"
+               end
+             else
+               start_group :expr, "\\"
+               c = getch
+               append c
+               case c
+               when 'x'
+                 append scan( /[a-fA-F0-9]{1,2}/ )
+               when /[0-7]/
+                 append scan( /[0-7]{0,2}/ )
+               end
+             end
+           when delim
+             start_group delim_group, matched
+             break
+           when /^#/
+             start_group :expr, matched
+             case matched[1]
+             when ?{
+               depth = 1
+               while depth > 0
+                 p = pos
+                 c = scan_until( /[\{}]/ )
+                 if c.nil?
+                   append scan_until( /\Z/ )
+                   break
+                 else
+                   depth += ( matched == "{" ? 1 : -1 )
+                   append pre_match[p..-1]
+                   append matched
+                 end
+               end
+             when ?$, ?@
+               append scan( /\w+/ )
+             end
+           else raise "unexpected match on #{matched}"
+           end
+         end
+       end
+     end
+   end
+
+   SYNTAX["ruby"] = Ruby
+
+ end
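
For reference, a small sketch of the token stream the tokenizer above produces (the sample line is arbitrary, and the example assumes this file is installed as syntax/ruby so that Syntax.load can require it; the SYNTAX["ruby"] registration above is what load picks up):

    require 'syntax'

    Syntax.load( "ruby" ).tokenize( 'x = 42 # answer' ) do |tok|
      puts "#{tok.group.to_s.ljust(8)} #{tok.inspect}"
    end
    # prints:
    #   ident    "x"
    #   normal   " "
    #   punct    "="
    #   normal   " "
    #   number   "42"
    #   normal   " "
    #   comment  "# answer"
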
@@ -0,0 +1,9 @@
+ module Syntax
+   module Version
+     MAJOR = 0
+     MINOR = 5
+     TINY  = 0
+
+     STRING = [MAJOR, MINOR, TINY].join('.')
+   end
+ end
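
A quick check of the constant defined above (the require path is assumed):

    require 'syntax/version'
    Syntax::Version::STRING   # => "0.5.0"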