syntax 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,31 @@
1
+ require 'syntax/common'
2
+
3
module Syntax

  # Fallback tokenizer used for syntaxes that have no dedicated
  # implementation registered. It emits the entire text as one token.
  class Default

    # Yield the given text, untouched, as a single :normal token.
    def tokenize( text )
      yield Token.new( text, :normal )
    end

  end

  # Registry mapping syntax names to tokenizer classes. Lookups for
  # unregistered names fall back to Default.
  SYNTAX = Hash.new( Default )

  # Load the implementation of the requested syntax and return a new
  # tokenizer instance for it. If the syntax cannot be found, or cannot
  # be loaded for any reason, an instance of Default is returned.
  def load( syntax )
    begin
      require "syntax/#{syntax}"
    rescue LoadError
      # deliberately ignored -- SYNTAX falls back to Default below
    end
    SYNTAX[ syntax ].new
  end
  module_function :load

end
@@ -0,0 +1,118 @@
1
+ require 'strscan'
2
+
3
module Syntax

  # A single lexeme extracted by a tokenizer. The token *is* the lexeme
  # text (a String subclass), decorated with a +group+ attribute naming
  # the kind of lexeme it represents (:normal, :keyword, :string, ...).
  class Token < String

    # The group (type) of the extracted lexeme.
    attr_reader :group

    # Build a token carrying +text+ and tagged with +group+.
    def initialize( text, group )
      super text
      @group = group
    end

  end

end
21
+
22
+ # The base class of all tokenizers. It sets up the scanner and manages the
23
+ # looping until all tokens have been extracted. It also provides convenience
24
+ # methods to make sure adjacent tokens of identical groups are returned as
25
+ # a single token.
26
+ class Tokenizer
27
+
28
+ # Start tokenizing. This sets up the state in preparation for tokenization,
29
+ # such as creating a new scanner for the text and saving the callback block.
30
+ # The block will be invoked for each token extracted.
31
+ def start( text, &block )
32
+ @chunk = ""
33
+ @group = :normal
34
+ @callback = block
35
+ @text = StringScanner.new( text )
36
+ setup
37
+ end
38
+
39
+ # Subclasses may override this method to provide implementation-specific
40
+ # setup logic.
41
+ def setup
42
+ end
43
+
44
+ # Finish tokenizing. This flushes the buffer, yielding any remaining text
45
+ # to the client.
46
+ def finish
47
+ start_group nil
48
+ teardown
49
+ end
50
+
51
+ # Subclasses may override this method to provide implementation-specific
52
+ # teardown logic.
53
+ def teardown
54
+ end
55
+
56
+ # Subclasses must implement this method, which is called for each iteration
57
+ # of the tokenization process. This method may extract multiple tokens.
58
+ def step
59
+ raise NotImplementedError, "subclasses must implement #step"
60
+ end
61
+
62
+ # Begins tokenizing the given text, calling #step until the text has been
63
+ # exhausted.
64
+ def tokenize( text, &block )
65
+ start text, &block
66
+ step until @text.eos?
67
+ finish
68
+ end
69
+
70
+ private
71
+
72
+ # A convenience for delegating method calls to the scanner.
73
+ def self.delegate( sym )
74
+ define_method( sym ) { |*a| @text.__send__( sym, *a ) }
75
+ end
76
+
77
+ delegate :bol?
78
+ delegate :eos?
79
+ delegate :scan
80
+ delegate :scan_until
81
+ delegate :check
82
+ delegate :check_until
83
+ delegate :getch
84
+ delegate :matched
85
+ delegate :pre_match
86
+ delegate :peek
87
+ delegate :pos
88
+
89
+ # Access the n-th subgroup from the most recent match.
90
+ def subgroup(n)
91
+ @text[n]
92
+ end
93
+
94
+ # Append the given data to the currently active chunk.
95
+ def append( data )
96
+ @chunk << data
97
+ end
98
+
99
+ # Request that a new group be started. If the current group is the same
100
+ # as the group being requested, a new group will not be created. If a new
101
+ # group is created and the current chunk is not empty, the chunk's
102
+ # contents will be yielded to the client as a token, and then cleared.
103
+ #
104
+ # After the new group is started, if +data+ is non-nil it will be appended
105
+ # to the chunk.
106
+ def start_group( gr, data=nil )
107
+ if gr != @group && !@chunk.empty?
108
+ @callback.call( Token.new( @chunk, @group ) )
109
+ @chunk = ""
110
+ end
111
+
112
+ @group = gr
113
+ @chunk << data if data
114
+ end
115
+
116
+ end
117
+
118
+ end
@@ -0,0 +1,50 @@
1
+ require 'syntax'
2
+
3
module Syntax
  module Convertors

    # Converts tokenized text to HTML. Every token group except :normal
    # is wrapped in a <span> whose CSS class names the group.
    class HTML

      # Convenience constructor: build a convertor for the named syntax,
      # using whatever tokenizer Syntax.load returns for it.
      def self.for_syntax( syntax )
        new( Syntax.load( syntax ) )
      end

      # Create a convertor that splits text with the given tokenizer.
      def initialize( tokenizer )
        @tokenizer = tokenizer
      end

      # Convert +text+ to HTML. Tokens in the :normal group are emitted
      # escaped but unhighlighted; all other groups become
      # <span class="group">...</span>. When +pre+ is true the whole
      # result is wrapped in <pre> tags.
      def convert( text, pre=true )
        html = ""
        html << "<pre>" if pre
        @tokenizer.tokenize( text ) do |tok|
          escaped = html_escape( tok )
          if tok.group == :normal
            html << escaped
          else
            html << "<span class=\"#{tok.group}\">#{escaped}</span>"
          end
        end
        html << "</pre>" if pre
        html
      end

      private

      # Replace the HTML-significant characters in +string+ with their
      # corresponding entities ("&" must be handled first).
      def html_escape( string )
        string.gsub( /&/, "&amp;" ).
               gsub( /</, "&lt;" ).
               gsub( />/, "&gt;" ).
               gsub( /"/, "&quot;" )
      end

    end

  end
end
@@ -0,0 +1,239 @@
1
+ require 'syntax'
2
+
3
module Syntax

  # A tokenizer for the Ruby language. It recognizes all common syntax
  # (and some less common syntax) but because it is not a true lexer, it
  # will make mistakes on some ambiguous cases.
  class Ruby < Tokenizer

    # The list of all identifiers recognized as keywords.
    KEYWORDS =
      %w{if then elsif else end begin do rescue ensure while for
         class module def yield raise until unless and or not when
         case super undef break next redo retry in return alias
         defined?}

    # Perform ruby-specific setup. @selector is set after a method-call
    # dot so that keyword-looking words (e.g. "obj.class") are treated
    # as plain identifiers.
    def setup
      @selector = false
    end

    # Step through a single iteration of the tokenization process.
    # This method may extract several tokens per call.
    def step
      case
      when bol? && check( /=begin/ )
        start_group( :comment, scan_until( /^=end$/ ) )
      when bol? && check( /__END__$/ )
        # everything after __END__ is data, not code
        start_group( :comment, scan_until( /\Z/ ) )
      else
        case
        when check( /def\s+/ )
          start_group :keyword, scan( /def\s+/ )
          start_group :method, scan_until( /$|(?=[;(\s])/ )
        when check( /class\s+/ )
          start_group :keyword, scan( /class\s+/ )
          start_group :class, scan_until( /$|(?=[;\s<])/ )
        when check( /module\s+/ )
          start_group :keyword, scan( /module\s+/ )
          start_group :module, scan_until( /$|(?=[;\s])/ )
        when check( /::/ )
          start_group :punct, scan(/::/)
        when check( /:"/ )
          # symbol with a double-quoted (interpolating) name
          start_group :symbol, scan(/:/)
          scan_delimited_region :symbol, :symbol, "", true
        when check( /:'/ )
          # symbol with a single-quoted name
          start_group :symbol, scan(/:/)
          scan_delimited_region :symbol, :symbol, "", false
        when check( /:\w/ )
          start_group :symbol, scan(/:\w+[!?]?/)
        when check( /\?\\?./ )
          # character literal, e.g. ?a or ?\n
          start_group :char, scan(/\?\\?./)
        when check( /(__FILE__|__LINE__|true|false|nil|self)[?!]?/ )
          # a trailing ? or !, or a preceding selector dot, means this is
          # really a method name, not the pseudo-constant itself
          if @selector || matched[-1] == ?? || matched[-1] == ?!
            start_group :ident,
              scan(/(__FILE__|__LINE__|true|false|nil|self)[?!]?/)
          else
            start_group :constant,
              scan(/(__FILE__|__LINE__|true|false|nil|self)/)
          end
          @selector = false
        else
          case peek(2)
          when "%r"
            scan_delimited_region :punct, :regex, scan( /../ ), true
          when "%w", "%q"
            scan_delimited_region :punct, :string, scan( /../ ), false
          when "%s"
            scan_delimited_region :punct, :symbol, scan( /../ ), false
          when "%W", "%Q", "%x"
            scan_delimited_region :punct, :string, scan( /../ ), true
          when /%[^\sa-zA-Z0-9]/
            # bare %-delimited string, e.g. %(...)
            scan_delimited_region :punct, :string, scan( /./ ), true
          when "<<"
            # possibly a heredoc; scan the delimiter word (quoted or bare)
            start_group :punct, scan( /<</ )
            float_right = scan( /-/ )
            append "-" if float_right
            if ( type = scan( /['"]/ ) )
              append type
              delim = scan_until( /(?=#{type})/ )
              if delim.nil?
                append scan_until( /\Z/ )
                return
              end
            else
              delim = scan( /\w+/ ) or return
            end
            start_group :constant, delim
            start_group :punct, scan( /#{type}/ ) if type
            scan_delimited_region :constant, :string, "", ( type != "'" ),
              delim, true, float_right
          else
            case peek(1)
            when /\s/
              start_group :normal, scan( /\s+/ )
            when "#"
              start_group :comment, scan( /#.*$/ )
            when /[A-Z]/
              start_group :constant, scan( /\w+/ )
            when /[a-z_]/
              word = scan( /\w+[?!]?/ )
              # BUGFIX: this was `elsif` with no condition, which only
              # worked because the "condition" expression performed the
              # side effect. It must be a plain `else`.
              if !@selector && KEYWORDS.include?( word )
                start_group :keyword, word
              else
                start_group :ident, word
              end
              @selector = false
            when /\d/
              start_group :number,
                scan( /[\d_]+(\.[\d_]+)?([eE][\d_]+)?/ )
            when '"'
              scan_delimited_region :punct, :string, "", true
            when '/'
              scan_delimited_region :punct, :regex, "", true
            when "'"
              scan_delimited_region :punct, :string, "", false
            when "."
              dots = scan( /\.{1,3}/ )
              start_group :punct, dots
              # a single dot is a method selector; ".." and "..." are ranges
              @selector = ( dots.length == 1 )
            when /[@]/
              start_group :attribute, scan( /@{1,2}\w*/ )
            when /[$]/
              start_group :global, scan(/\$/)
              start_group :global, scan( /\w+|./ ) if check(/./)
            when /[-!?*\/+=<>()\[\]\{}:;,&|%]/
              start_group :punct,
                scan(/[-!?*\/+=<>()\[\]\{}:;,&|%]/)
            else
              # all else just falls through this, to prevent
              # infinite loops...
              append getch
            end
          end
        end
      end
    end

    private

    # Scan a delimited region of text. This handles the simple cases
    # (strings delimited with quotes) as well as the more complex cases
    # of %-strings and here-documents.
    #
    # * +delim_group+ is the group to use for the delimiters themselves
    # * +inner_group+ is the group for the region's contents
    # * +starter+ is text (already consumed) that opened the region
    # * +exprs+ is true when #{...}, #$x and #@x interpolation is live
    # * +delim+ is the closing delimiter (when nil, the next character is
    #   read and bracket delimiters are mapped to their closing partner)
    # * +delim_alone+ is true when the delimiter must sit on its own line
    #   (here-documents)
    # * +float_right+ allows leading whitespace before a delim_alone
    #   delimiter (<<- here-documents)
    def scan_delimited_region( delim_group, inner_group, starter, exprs,
        delim=nil, delim_alone=false, float_right=false )
      if !delim
        start_group delim_group, starter
        delim = scan( /./ )
        append delim

        delim = case delim
          when '{' then '}'
          when '(' then ')'
          when '[' then ']'
          else delim
        end
      end

      start_group inner_group

      items = "\\\\|"

      # BUGFIX: the delimiter must be Regexp-escaped before being
      # interpolated into the pattern; an unescaped ")" (from %(...) or
      # %w(...)) previously raised RegexpError. Escaping is a no-op for
      # all delimiters that worked before.
      if delim_alone
        items << "(^"
        items << '\s*' if float_right
        items << "#{Regexp.escape(delim)}$)"
      else
        items << "#{Regexp.escape(delim)}"
      end

      items << "|#(\\$|@|\\{)" if exprs
      items = Regexp.new( items )

      loop do
        p = pos
        match = scan_until( items )
        if match.nil?
          # unterminated region: emit the rest of the text and stop
          start_group inner_group, scan_until( /\Z/ )
          break
        else
          text = pre_match[p..-1]
          start_group inner_group, text if text.length > 0
          case matched.strip
          when "\\"
            # escape sequence
            unless exprs
              # in non-interpolating regions only \' and \\ are special
              case peek(1)
              when "'"
                scan(/./)
                start_group :expr, "\\'"
              when "\\"
                scan(/./)
                start_group :expr, "\\\\"
              else
                start_group inner_group, "\\"
              end
            else
              start_group :expr, "\\"
              c = getch
              append c
              case c
              when 'x'
                append scan( /[a-fA-F0-9]{1,2}/ )
              when /[0-7]/
                append scan( /[0-7]{0,2}/ )
              end
            end
          when delim
            start_group delim_group, matched
            break
          when /^#/
            # interpolation: #{...}, #$global or #@ivar
            start_group :expr, matched
            case matched[1]
            when ?{
              # consume the balanced { ... } body
              depth = 1
              while depth > 0
                p = pos
                c = scan_until( /[\{}]/ )
                if c.nil?
                  append scan_until( /\Z/ )
                  break
                else
                  depth += ( matched == "{" ? 1 : -1 )
                  append pre_match[p..-1]
                  append matched
                end
              end
            when ?$, ?@
              append scan( /\w+/ )
            end
          else raise "unexpected match on #{matched}"
          end
        end
      end
    end
  end

  SYNTAX["ruby"] = Ruby

end
@@ -0,0 +1,9 @@
1
module Syntax

  # Version information for the Syntax library.
  module Version
    MAJOR = 0
    MINOR = 5
    TINY  = 0

    # The canonical "MAJOR.MINOR.TINY" version string.
    STRING = "#{MAJOR}.#{MINOR}.#{TINY}"
  end

end