coffee-script 0.3.2 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +2 -2
- data/README.md +15 -0
- data/lib/coffee-script.rb +1 -21
- data/lib/coffee_script.rb +31 -0
- metadata +30 -46
- data/README +0 -41
- data/bin/coffee +0 -5
- data/coffee-script.gemspec +0 -27
- data/examples/blocks.coffee +0 -57
- data/examples/code.coffee +0 -173
- data/examples/poignant.coffee +0 -186
- data/examples/potion.coffee +0 -205
- data/examples/underscore.coffee +0 -603
- data/extras/CoffeeScript.tmbundle/Preferences/CoffeeScript.tmPreferences +0 -24
- data/extras/CoffeeScript.tmbundle/Syntaxes/CoffeeScript.tmLanguage +0 -361
- data/extras/CoffeeScript.tmbundle/info.plist +0 -10
- data/extras/EXTRAS +0 -20
- data/extras/coffee.vim +0 -111
- data/lib/coffee_script/coffee-script.js +0 -50
- data/lib/coffee_script/command_line.rb +0 -235
- data/lib/coffee_script/grammar.y +0 -481
- data/lib/coffee_script/lexer.js +0 -363
- data/lib/coffee_script/lexer.rb +0 -272
- data/lib/coffee_script/narwhal/coffee-script.js +0 -96
- data/lib/coffee_script/nodes.js +0 -443
- data/lib/coffee_script/nodes.rb +0 -1050
- data/lib/coffee_script/parse_error.rb +0 -29
- data/lib/coffee_script/parser.js +0 -477
- data/lib/coffee_script/parser.rb +0 -2611
- data/lib/coffee_script/repl.js +0 -33
- data/lib/coffee_script/rewriter.js +0 -377
- data/lib/coffee_script/rewriter.rb +0 -289
- data/lib/coffee_script/runner.js +0 -11
- data/lib/coffee_script/scope.js +0 -73
- data/lib/coffee_script/scope.rb +0 -91
- data/lib/coffee_script/value.rb +0 -64
- data/package.json +0 -8
data/lib/coffee_script/lexer.js
DELETED
@@ -1,363 +0,0 @@
|
|
1
|
-
(function(){
|
2
|
-
var ASSIGNMENT, CALLABLE, CODE, COMMENT, COMMENT_CLEANER, HEREDOC, HEREDOC_INDENT, IDENTIFIER, JS, JS_CLEANER, KEYWORDS, LAST_DENT, LAST_DENTS, MULTILINER, MULTI_DENT, NOT_REGEX, NO_NEWLINE, NUMBER, OPERATOR, REGEX, Rewriter, STRING, STRING_NEWLINES, WHITESPACE, lex;
|
3
|
-
Rewriter = require('./rewriter').Rewriter;
|
4
|
-
// The lexer reads a stream of CoffeeScript and divvys it up into tagged
|
5
|
-
// tokens. A minor bit of the ambiguity in the grammar has been avoided by
|
6
|
-
// pushing some extra smarts into the Lexer.
|
7
|
-
exports.Lexer = (lex = function lex() { });
|
8
|
-
// Constants ============================================================

// The list of keywords passed verbatim to the parser.
KEYWORDS = ["if", "else", "then", "unless", "true", "false", "yes", "no", "on", "off", "and", "or", "is", "isnt", "not", "new", "return", "arguments", "try", "catch", "finally", "throw", "break", "continue", "for", "in", "of", "by", "where", "while", "delete", "instanceof", "typeof", "switch", "when", "super", "extends"];

// Token matching regexes. Each one is anchored with `^` because it is always
// applied to the remaining, not-yet-lexed chunk of the source.
IDENTIFIER = /^([a-zA-Z$_](\w|\$)*)/;
NUMBER = /^(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i;
STRING = /^(""|''|"([\s\S]*?)([^\\]|\\\\)"|'([\s\S]*?)([^\\]|\\\\)')/;
HEREDOC = /^("{6}|'{6}|"{3}\n?([\s\S]*?)\n?([ \t]*)"{3}|'{3}\n?([\s\S]*?)\n?([ \t]*)'{3})/;
JS = /^(``|`([\s\S]*?)([^\\]|\\\\)`)/;
OPERATOR = /^([+\*&|\/\-%=<>:!?]+)/;
WHITESPACE = /^([ \t]+)/;
// No trailing `$` here: without the `m` flag `$` only matches at the very end
// of the input, which made every comment that is followed by more code fail to
// lex. `.*` already stops at the end of the line, so the anchor is redundant
// (and adding `m` instead would let `^` match at a later line of the chunk).
COMMENT = /^(((\n?[ \t]*)?#.*)+)/;
CODE = /^((-|=)>)/;
REGEX = /^(\/(.*?)([^\\]|\\\\)\/[imgy]{0,4})/;
MULTI_DENT = /^((\n([ \t]*))+)(\.)?/;
LAST_DENTS = /\n([ \t]*)/g;
LAST_DENT = /\n([ \t]*)/;
ASSIGNMENT = /^(:|=)$/;

// Token cleaning regexes.
JS_CLEANER = /(^`|`$)/g;
MULTILINER = /\n/g;
STRING_NEWLINES = /\n[ \t]*/g;
COMMENT_CLEANER = /(^[ \t]*#|\n[ \t]*$)/mg;
NO_NEWLINE = /^([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)$/;
// The `m` flag is required so that the leading whitespace of *every* line of a
// heredoc is found, not just that of its first line.
HEREDOC_INDENT = /^[ \t]+/mg;

// Tokens which a regular expression will never immediately follow, but which
// a division operator might.
// See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions
NOT_REGEX = ['IDENTIFIER', 'NUMBER', 'REGEX', 'STRING', ')', '++', '--', ']', '}', 'FALSE', 'NULL', 'TRUE'];

// Tokens which could legitimately be invoked or indexed.
CALLABLE = ['IDENTIFIER', 'SUPER', ')', ']', '}', 'STRING'];
|
39
|
-
// Entry point of the lexer: resets all scanning state, consumes `code` one
// token at a time, closes any indentation that is still open and hands the
// resulting token list to the Rewriter. Returns whatever the Rewriter returns.
lex.prototype.tokenize = function tokenize(code) {
  var rewriter;

  // Source text being scanned. TODO: chomp extra trailing line breaks.
  this.code = code;
  this.i = 0;          // Character position currently being parsed.
  this.line = 1;       // Current line number.
  this.indent = 0;     // Current indentation level.
  this.indents = [];   // Stack of all indent levels we are currently within.
  this.tokens = [];    // Parsed tokens, each of the form [TOKEN_TYPE, value].
  this.spaced = null;  // The last token value that had a space following it.

  // Slow and steady: every tokenizer advances `this.i` past what it consumed,
  // so the remaining chunk is re-sliced on each pass.
  while (this.i < this.code.length) {
    this.chunk = this.code.slice(this.i);
    this.extract_next_token();
  }

  this.close_indentation();
  rewriter = new Rewriter();
  return rewriter.rewrite(this.tokens);
};
|
62
|
-
// Tries each specialised tokenizer in priority order at the current position
// and stops at the first one that reports success. Returns null when one of
// them matched; otherwise falls back to `literal_token` and returns its result.
lex.prototype.extract_next_token = function extract_next_token() {
  var consumed =
    this.identifier_token() ||
    this.number_token() ||
    this.heredoc_token() ||
    this.string_token() ||
    this.js_token() ||
    this.regex_token() ||
    this.indent_token() ||
    this.comment_token() ||
    this.whitespace_token();

  if (consumed) {
    return null;
  }
  return this.literal_token();
};
|
94
|
-
// Tokenizers ==========================================================
|
95
|
-
// Matches identifying literals: variables, keywords, method names, etc.
// Returns false when the chunk does not start with an identifier; otherwise
// records the token, advances `this.i` and returns true.
lex.prototype.identifier_token = function identifier_token() {
  var id, tag;
  id = this.match(IDENTIFIER, 1);
  if (!id) {
    return false;
  }
  // Keywords are special identifiers tagged with their own name,
  // 'if' will result in an ['IF', "if"] token.
  tag = KEYWORDS.indexOf(id) >= 0 ? id.toUpperCase() : 'IDENTIFIER';
  if (tag === 'WHEN' && (this.tag() === 'OUTDENT' || this.tag() === 'INDENT')) {
    tag = 'LEADING_WHEN';
  }
  // NOTE: `tag` and `value` count offsets back from the end of the token list
  // (1 = last token, 2 = the one before it). The previous code passed -1/-2,
  // which indexed past the end of the list, so none of the retagging below
  // ever took effect.
  if (tag === 'IDENTIFIER' && this.value() === '::') {
    this.tag(1, 'PROTOTYPE_ACCESS');
  }
  // A lone '.' before an identifier is a property access; '..' is left alone.
  if (tag === 'IDENTIFIER' && this.value() === '.' && this.value(2) !== '.') {
    if (this.tag(2) === '?') {
      // Fold the '?' and '.' pair into a single SOAK_ACCESS token.
      this.tag(1, 'SOAK_ACCESS');
      this.tokens.splice(-2, 1);
    } else {
      this.tag(1, 'PROPERTY_ACCESS');
    }
  }
  this.token(tag, id);
  this.i += id.length;
  return true;
};
|
122
|
-
// Lexes a numeric literal (decimal, hexadecimal or exponential notation) at
// the start of the chunk. Returns true after recording a NUMBER token and
// advancing past it, or false when the chunk does not begin with a number.
lex.prototype.number_token = function number_token() {
  var literal = this.match(NUMBER, 1);
  if (literal) {
    this.token('NUMBER', literal);
    this.i += literal.length;
    return true;
  }
  return false;
};
|
132
|
-
// Lexes a single- or double-quoted string literal, which may span several
// lines. Each newline (plus the indentation that follows it) is rewritten as
// " \<newline>" in the emitted STRING token. Returns false when the chunk does
// not start with a string.
lex.prototype.string_token = function string_token() {
  var raw = this.match(STRING, 1);
  if (!raw) {
    return false;
  }
  this.token('STRING', raw.replace(STRING_NEWLINES, " \\\n"));
  // Line and position bookkeeping is based on the raw, unescaped source text.
  this.line += this.count(raw, "\n");
  this.i += raw.length;
  return true;
};
|
144
|
-
// Matches heredocs (triple-quoted strings), adjusting indentation to the
// correct level, and emits them as an ordinary double-quoted STRING token.
// Returns false when the chunk does not start with a heredoc.
lex.prototype.heredoc_token = function heredoc_token() {
  var doc, indent, indents, match;
  match = this.chunk.match(HEREDOC);
  if (!match) {
    return false;
  }
  // An empty heredoc (six consecutive quotes) captures no body at all.
  doc = match[2] || match[4] || '';
  // Collect the leading whitespace of every indented line. The flags are
  // forced to 'gm' so that this works on each line of the body; a body with no
  // indented line yields null rather than an array.
  indents = doc.match(new RegExp(HEREDOC_INDENT.source, 'gm'));
  // Whitespace-only strings sort shortest-prefix first, so [0] is the smallest
  // common indentation.
  indent = indents ? indents.sort()[0] : '';
  if (indent) {
    doc = doc.replace(new RegExp("^" + indent, 'gm'), '');
  }
  // Escape every newline and every double quote (a string pattern passed to
  // `replace` would only touch the first occurrence).
  doc = doc.replace(MULTILINER, "\\n").replace(/"/g, '\\"');
  this.token('STRING', '"' + doc + '"');
  this.line += this.count(match[1], "\n");
  this.i += match[1].length;
  return true;
};
|
158
|
-
// Lexes a backtick-delimited span of embedded JavaScript. The surrounding
// backticks are stripped from the emitted JS token. Returns false when the
// chunk does not start with such a span.
lex.prototype.js_token = function js_token() {
  var source = this.match(JS, 1);
  if (source) {
    this.token('JS', source.replace(JS_CLEANER, ''));
    this.i += source.length;
    return true;
  }
  return false;
};
|
168
|
-
// Lexes a regular expression literal. A slash that follows a token listed in
// NOT_REGEX is treated as division instead, so nothing is consumed in that
// case. Returns true only when a REGEX token was recorded.
lex.prototype.regex_token = function regex_token() {
  var literal = this.match(REGEX, 1);
  var follows_operand;
  if (!literal) {
    return false;
  }
  follows_operand = NOT_REGEX.indexOf(this.tag()) >= 0;
  if (follows_operand) {
    return false;
  }
  this.token('REGEX', literal);
  this.i += literal.length;
  return true;
};
|
181
|
-
// Matches and conumes comments.
|
182
|
-
lex.prototype.comment_token = function comment_token() {
|
183
|
-
var comment;
|
184
|
-
if (!((comment = this.match(COMMENT, 1)))) {
|
185
|
-
return false;
|
186
|
-
}
|
187
|
-
this.line += comment.match(MULTILINER).length;
|
188
|
-
this.token('COMMENT', comment.replace(COMMENT_CLEANER, '').split(MULTILINER));
|
189
|
-
this.token('TERMINATOR', "\n");
|
190
|
-
this.i += comment.length;
|
191
|
-
return true;
|
192
|
-
};
|
193
|
-
// Record tokens for indentation differing from the previous line.
// Consumes a run of newlines (with their leading whitespace) and then either
// suppresses it (line continuation), emits a TERMINATOR (same indentation),
// an INDENT (deeper) or OUTDENT tokens (shallower). Returns false when the
// chunk does not start with a newline.
lex.prototype.indent_token = function indent_token() {
  var before_prev, diff, indent, next_character, no_newlines, prev, size;
  indent = this.match(MULTI_DENT, 1);
  if (!indent) {
    return false;
  }
  this.line += indent.match(MULTILINER).length;
  this.i += indent.length;
  // Group 4 of MULTI_DENT captures a '.' that directly follows the newlines,
  // i.e. a method chain continued on the next line.
  next_character = this.chunk.match(MULTI_DENT)[4];
  // The previous value may be missing (no tokens yet) or a non-string (INDENT
  // and OUTDENT carry numbers, COMMENT an array); coerce it the way the Ruby
  // lexer does with `to_s` instead of calling `.match` on it directly.
  prev = this.value();
  prev = (prev === null || typeof prev === 'undefined') ? '' : String(prev);
  // There may be fewer than two tokens so far; a missing token is not a '.'.
  before_prev = this.tokens[this.tokens.length - 2];
  no_newlines = next_character === '.' ||
    (NO_NEWLINE.test(prev) && !(before_prev && before_prev[0] === '.') && !CODE.test(prev));
  if (no_newlines) {
    return this.suppress_newlines(indent);
  }
  // Only the indentation of the last line in the run is significant.
  size = indent.match(LAST_DENTS).reverse()[0].match(LAST_DENT)[1].length;
  if (size === this.indent) {
    return this.newline_token(indent);
  }
  if (size > this.indent) {
    diff = size - this.indent;
    this.token('INDENT', diff);
    this.indents.push(diff);
  } else {
    this.outdent_token(this.indent - size);
  }
  this.indent = size;
  return true;
};
|
220
|
-
// Records one OUTDENT token per recorded indent level that is being closed.
// Levels are popped off the indent stack until `move_out` columns have been
// given back or the stack is empty, and a TERMINATOR is always appended
// afterwards. Always returns true.
lex.prototype.outdent_token = function outdent_token(move_out) {
  var remaining = move_out;
  var width;
  for (; remaining > 0 && this.indents.length; remaining -= width) {
    width = this.indents.pop();
    this.token('OUTDENT', width);
  }
  this.token('TERMINATOR', "\n");
  return true;
};
|
232
|
-
// Skips over spaces and tabs that carry no meaning. No token is emitted, but
// the value of the preceding token is remembered in `this.spaced` so that
// later code can tell it was followed by whitespace. Returns false when the
// chunk does not start with whitespace.
lex.prototype.whitespace_token = function whitespace_token() {
  var blanks = this.match(WHITESPACE, 1);
  if (blanks) {
    this.spaced = this.value();
    this.i += blanks.length;
    return true;
  }
  return false;
};
|
242
|
-
// Emits a TERMINATOR for a line break at unchanged indentation, unless the
// previous token's value already is a newline, so that runs of blank lines
// collapse into a single terminator. The `newlines` argument is not used.
// Always returns true.
lex.prototype.newline_token = function newline_token(newlines) {
  var already_terminated = this.value() === "\n";
  if (!already_terminated) {
    this.token('TERMINATOR', "\n");
  }
  return true;
};
|
250
|
-
// Called when a line break must not terminate the current expression. If the
// break was escaped with an explicit trailing backslash, that backslash token
// has done its job and is dropped from the stream. The `newlines` argument is
// not used. Always returns true.
lex.prototype.suppress_newlines = function suppress_newlines(newlines) {
  var escaped = this.value() === "\\";
  if (escaped) {
    this.tokens.pop();
  }
  return true;
};
|
257
|
-
// Fallback tokenizer: everything not claimed by another tokenizer becomes a
// literal token. A run of operator characters is kept together as one token
// (so the parser can assign the proper order of operations); anything else is
// taken one character at a time, e.g. ( ) , . !
// Always consumes something and returns true.
lex.prototype.literal_token = function literal_token() {
  var found = this.chunk.match(OPERATOR);
  var text = found && found[1];
  var kind, touches_callable;

  // A function arrow means the parenthesised list just lexed was a parameter
  // list, so retag it before going on.
  if (text && CODE.test(text)) {
    this.tag_parameters();
  }
  if (!text) {
    text = this.chunk.charAt(0);
  }

  kind = ASSIGNMENT.test(text) ? 'ASSIGN' : text;
  if (text === ';') {
    kind = 'TERMINATOR';
  }

  // '(' or '[' directly after something callable, with no whitespace recorded
  // in between, opens a call or an index rather than a group or array.
  touches_callable = this.value() !== this.spaced && CALLABLE.indexOf(this.tag()) >= 0;
  if (touches_callable) {
    if (text === '(') {
      kind = 'CALL_START';
    }
    if (text === '[') {
      kind = 'INDEX_START';
    }
  }

  this.token(kind, text);
  this.i += text.length;
  return true;
};
|
284
|
-
// Helpers =============================================================
|
285
|
-
// Appends a [tag, value] pair to the token stream and returns the new length
// of the stream (the result of Array#push). Unlike the Ruby lexer, the value
// is stored as-is: the current line number is not attached to it.
lex.prototype.token = function token(tag, value) {
  var entry = [tag, value];
  return this.tokens.push(entry);
};
|
290
|
-
// Look at, or overwrite, a tag in the current token stream.
//   index - how far back from the end to look: 1 (the default) is the last
//           token, 2 the one before it. Negative offsets are accepted as
//           well (-1 is the last token, -2 the one before it) because callers
//           in this file use that form; previously they indexed past the end
//           of the stream and always produced null.
//   tag   - optional replacement tag; when given it is stored and returned.
// Returns null when there is no token at that position.
lex.prototype.tag = function tag(index, tag) {
  var offset, tok;
  offset = Math.abs(index || 1);
  tok = this.tokens[this.tokens.length - offset];
  if (!tok) {
    return null;
  }
  if (typeof tag !== "undefined" && tag !== null) {
    return (tok[0] = tag);
  }
  return tok[0];
};
|
301
|
-
// Look at, or overwrite, a value in the current token stream.
//   index - how far back from the end to look: 1 (the default) is the last
//           token, 2 the one before it. Negative offsets are accepted as
//           well (-1 is the last token, -2 the one before it) because callers
//           in this file use that form; previously they indexed past the end
//           of the stream and always produced null.
//   val   - optional replacement value; when given it is stored and returned.
// Returns null when there is no token at that position.
lex.prototype.value = function value(index, val) {
  var offset, tok;
  offset = Math.abs(index || 1);
  tok = this.tokens[this.tokens.length - offset];
  if (!tok) {
    return null;
  }
  if (typeof val !== "undefined" && val !== null) {
    return (tok[1] = val);
  }
  return tok[1];
};
|
312
|
-
// Count the occurrences of `letter` in `string` and return that number.
// (The previous version incremented and returned `count`, i.e. the function
// itself, instead of the local counter, so callers never got a number back.)
lex.prototype.count = function count(string, letter) {
  var num, pos;
  num = 0;
  pos = string.indexOf(letter);
  while (pos !== -1) {
    num += 1;
    pos = string.indexOf(letter, pos + 1);
  }
  return num;
};
|
323
|
-
// Applies `regex` to the current chunk and returns capture group `index` of
// the result, or false when the regex does not match at all. (A group that
// exists but did not take part in the match comes back as undefined.)
lex.prototype.match = function match(regex, index) {
  var found = this.chunk.match(regex);
  if (found) {
    return found[index];
  }
  return false;
};
|
332
|
-
// Parameter lists in function definitions look just like argument lists in
// function calls, which made the grammar ambiguous. When a function arrow
// follows a ')', walk backwards through the tokens and retag them:
// IDENTIFIER becomes PARAM, ')' becomes PARAM_END, and the first '(' reached
// becomes PARAM_START, which ends the walk. Other tokens (e.g. splats and
// commas) are left untouched.
// Returns 'PARAM_START' when the opening parenthesis was found, and null when
// the last token is not ')' or the start of the stream is reached first.
lex.prototype.tag_parameters = function tag_parameters() {
  var pos, tok;
  if (this.tag() !== ')') {
    return null;
  }
  for (pos = this.tokens.length - 1; pos >= 0; pos -= 1) {
    tok = this.tokens[pos];
    if (!tok) {
      return null;
    }
    switch (tok[0]) {
      case 'IDENTIFIER':
        tok[0] = 'PARAM';
        break;
      case ')':
        tok[0] = 'PARAM_END';
        break;
      case '(':
        tok[0] = 'PARAM_START';
        return 'PARAM_START';
    }
  }
  return null;
};
|
358
|
-
// Called once the whole input has been consumed: closes every block that is
// still open by outdenting by the full current indentation. Returns the
// result of `outdent_token`.
lex.prototype.close_indentation = function close_indentation() {
  var depth = this.indent;
  return this.outdent_token(depth);
};
|
363
|
-
})();
|
data/lib/coffee_script/lexer.rb
DELETED
@@ -1,272 +0,0 @@
|
|
1
|
-
module CoffeeScript
|
2
|
-
|
3
|
-
# The lexer reads a stream of CoffeeScript and divvies it up into tagged
|
4
|
-
# tokens. A minor bit of the ambiguity in the grammar has been avoided by
|
5
|
-
# pushing some extra smarts into the Lexer.
|
6
|
-
class Lexer

  # The list of keywords passed verbatim to the parser.
  KEYWORDS = ["if", "else", "then", "unless",
              "true", "false", "yes", "no", "on", "off",
              "and", "or", "is", "isnt", "not",
              "new", "return",
              "try", "catch", "finally", "throw",
              "break", "continue",
              "for", "in", "of", "by", "where", "while",
              "delete", "instanceof", "typeof",
              "switch", "when",
              "super", "extends"]

  # Token matching regexes. Each is anchored with \A because it is always
  # applied to the remaining, not-yet-lexed chunk of the source.
  IDENTIFIER = /\A([a-zA-Z$_](\w|\$)*)/
  NUMBER     = /\A(\b((0(x|X)[0-9a-fA-F]+)|([0-9]+(\.[0-9]+)?(e[+\-]?[0-9]+)?)))\b/i
  STRING     = /\A(""|''|"(.*?)([^\\]|\\\\)"|'(.*?)([^\\]|\\\\)')/m
  HEREDOC    = /\A("{6}|'{6}|"{3}\n?(.*?)\n?([ \t]*)"{3}|'{3}\n?(.*?)\n?([ \t]*)'{3})/m
  JS         = /\A(``|`(.*?)([^\\]|\\\\)`)/m
  OPERATOR   = /\A([+\*&|\/\-%=<>:!?]+)/
  WHITESPACE = /\A([ \t]+)/
  COMMENT    = /\A(((\n?[ \t]*)?#.*$)+)/
  CODE       = /\A((-|=)>)/
  REGEX      = /\A(\/(.*?)([^\\]|\\\\)\/[imgy]{0,4})/
  MULTI_DENT = /\A((\n([ \t]*))+)(\.)?/
  LAST_DENT  = /\n([ \t]*)/
  ASSIGNMENT = /\A(:|=)\Z/

  # Token cleaning regexes.
  JS_CLEANER      = /(\A`|`\Z)/
  MULTILINER      = /\n/
  STRING_NEWLINES = /\n[ \t]*/
  COMMENT_CLEANER = /(^[ \t]*#|\n[ \t]*$)/
  NO_NEWLINE      = /\A([+\*&|\/\-%=<>:!.\\][<>=&|]*|and|or|is|isnt|not|delete|typeof|instanceof)\Z/
  HEREDOC_INDENT  = /^[ \t]+/

  # Tokens which a regular expression will never immediately follow, but which
  # a division operator might.
  # See: http://www.mozilla.org/js/language/js20-2002-04/rationale/syntax.html#regular-expressions
  NOT_REGEX = [
    :IDENTIFIER, :NUMBER, :REGEX, :STRING,
    ')', '++', '--', ']', '}',
    :FALSE, :NULL, :TRUE
  ]

  # Tokens which could legitimately be invoked or indexed.
  CALLABLE = [:IDENTIFIER, :SUPER, ')', ']', '}', :STRING]

  # Scan by attempting to match tokens one character at a time. Slow and steady.
  # Resets all lexer state, consumes +code+, closes any open indentation and
  # returns the token stream after it has been passed through the Rewriter.
  def tokenize(code)
    @code    = code.chomp # Cleanup code by removing extra line breaks
    @i       = 0          # Current character position we're parsing
    @line    = 1          # The current line.
    @indent  = 0          # The current indent level.
    @indents = []         # The stack of all indent levels we are currently within.
    @tokens  = []         # Collection of all parsed tokens in the form [:TOKEN_TYPE, value]
    @spaced  = nil        # The last value that has a space following it.
    while @i < @code.length
      @chunk = @code[@i..-1]
      extract_next_token
    end
    puts "original stream: #{@tokens.inspect}" if ENV['VERBOSE']
    close_indentation
    Rewriter.new.rewrite(@tokens)
  end

  # At every position, run through this list of attempted matches,
  # short-circuiting if any of them succeed.
  def extract_next_token
    return if identifier_token
    return if number_token
    return if heredoc_token
    return if string_token
    return if js_token
    return if regex_token
    return if indent_token
    return if comment_token
    return if whitespace_token
    return    literal_token
  end

  # Tokenizers ==========================================================

  # Matches identifying literals: variables, keywords, method names, etc.
  def identifier_token
    return false unless identifier = @chunk[IDENTIFIER, 1]
    # Keywords are special identifiers tagged with their own name,
    # 'if' will result in an [:IF, "if"] token.
    tag = KEYWORDS.include?(identifier) ? identifier.upcase.to_sym : :IDENTIFIER
    tag = :LEADING_WHEN if tag == :WHEN && [:OUTDENT, :INDENT, "\n"].include?(last_tag)
    @tokens[-1][0] = :PROTOTYPE_ACCESS if tag == :IDENTIFIER && last_value == '::'
    if tag == :IDENTIFIER && last_value == '.' && !(@tokens[-2] && @tokens[-2][1] == '.')
      # The '.' may be the very first token, in which case there is no
      # @tokens[-2] to inspect; guard against that instead of raising.
      if @tokens[-2] && @tokens[-2][0] == "?"
        # Fold the '?' and '.' pair into a single SOAK_ACCESS token.
        @tokens[-1][0] = :SOAK_ACCESS
        @tokens.delete_at(-2)
      else
        @tokens[-1][0] = :PROPERTY_ACCESS
      end
    end
    token(tag, identifier)
    @i += identifier.length
  end

  # Matches numbers, including decimals, hex, and exponential notation.
  def number_token
    return false unless number = @chunk[NUMBER, 1]
    token(:NUMBER, number)
    @i += number.length
  end

  # Matches strings, including multi-line strings.
  def string_token
    return false unless string = @chunk[STRING, 1]
    escaped = string.gsub(STRING_NEWLINES, " \\\n")
    token(:STRING, escaped)
    @line += string.count("\n")
    @i += string.length
  end

  # Matches heredocs, adjusting indentation to the correct level.
  def heredoc_token
    return false unless match = @chunk.match(HEREDOC)
    # An empty heredoc (six consecutive quotes) captures no body at all, so
    # fall back to an empty string rather than calling methods on nil.
    doc = (match[2] || match[4] || '').dup
    indent = doc.scan(HEREDOC_INDENT).min
    doc.gsub!(/^#{indent}/, "")
    doc.gsub!("\n", "\\n")
    doc.gsub!('"', '\\"')
    token(:STRING, "\"#{doc}\"")
    @line += match[1].count("\n")
    @i += match[1].length
  end

  # Matches interpolated JavaScript.
  def js_token
    return false unless script = @chunk[JS, 1]
    token(:JS, script.gsub(JS_CLEANER, ''))
    @i += script.length
  end

  # Matches regular expression literals.
  def regex_token
    return false unless regex = @chunk[REGEX, 1]
    return false if NOT_REGEX.include?(last_tag)
    token(:REGEX, regex)
    @i += regex.length
  end

  # Matches and consumes comments.
  def comment_token
    return false unless comment = @chunk[COMMENT, 1]
    @line += comment.scan(MULTILINER).length
    token(:COMMENT, comment.gsub(COMMENT_CLEANER, '').split(MULTILINER))
    token("\n", "\n")
    @i += comment.length
  end

  # Record tokens for indentation differing from the previous line.
  def indent_token
    return false unless indent = @chunk[MULTI_DENT, 1]
    @line += indent.scan(MULTILINER).size
    @i += indent.size
    next_character = @chunk[MULTI_DENT, 4]
    # There may be fewer than two tokens so far; a missing token is not a '.'.
    after_dot = @tokens[-2] && @tokens[-2][0] == '.'
    no_newlines = next_character == '.' || (last_value.to_s.match(NO_NEWLINE) && !after_dot && !last_value.match(CODE))
    return suppress_newlines(indent) if no_newlines
    size = indent.scan(LAST_DENT).last.last.length
    return newline_token(indent) if size == @indent
    if size > @indent
      token(:INDENT, size - @indent)
      @indents << (size - @indent)
    else
      outdent_token(@indent - size)
    end
    @indent = size
  end

  # Record an outdent token or tokens, if we're moving back inwards past
  # multiple recorded indents.
  def outdent_token(move_out)
    while move_out > 0 && !@indents.empty?
      last_indent = @indents.pop
      token(:OUTDENT, last_indent)
      move_out -= last_indent
    end
    token("\n", "\n")
  end

  # Matches and consumes non-meaningful whitespace.
  def whitespace_token
    return false unless whitespace = @chunk[WHITESPACE, 1]
    @spaced = last_value
    @i += whitespace.length
  end

  # Multiple newlines get merged together.
  # Use a trailing \ to escape newlines.
  def newline_token(newlines)
    token("\n", "\n") unless last_value == "\n"
    true
  end

  # Tokens to explicitly escape newlines are removed once their job is done.
  def suppress_newlines(newlines)
    @tokens.pop if last_value == "\\"
    true
  end

  # We treat all other single characters as a token. Eg.: ( ) , . !
  # Multi-character operators are also literal tokens, so that Racc can assign
  # the proper order of operations.
  def literal_token
    value = @chunk[OPERATOR, 1]
    tag_parameters if value && value.match(CODE)
    value ||= @chunk[0,1]
    tag = value.match(ASSIGNMENT) ? :ASSIGN : value
    # Identity comparison: @spaced holds the very value object that was
    # followed by whitespace, not merely an equal one.
    if !@spaced.equal?(last_value) && CALLABLE.include?(last_tag)
      tag = :CALL_START  if value == '('
      tag = :INDEX_START if value == '['
    end
    token(tag, value)
    @i += value.length
  end

  # Helpers ==========================================================

  # Add a token to the results, taking note of the line number.
  def token(tag, value)
    @tokens << [tag, Value.new(value, @line)]
  end

  # Peek at the previous token's value.
  def last_value
    @tokens.last && @tokens.last[1]
  end

  # Peek at the previous token's tag.
  def last_tag
    @tokens.last && @tokens.last[0]
  end

  # A source of ambiguity in our grammar was parameter lists in function
  # definitions (as opposed to argument lists in function calls). Tag
  # parameter identifiers in order to avoid this. Also, parameter lists can
  # make use of splats.
  def tag_parameters
    return if last_tag != ')'
    i = 0
    loop do
      i -= 1
      tok = @tokens[i]
      return if !tok
      case tok[0]
      when :IDENTIFIER  then tok[0] = :PARAM
      when ')'          then tok[0] = :PARAM_END
      when '('          then return tok[0] = :PARAM_START
      end
    end
  end

  # Close up all remaining open blocks. IF the first token is an indent,
  # axe it.
  def close_indentation
    outdent_token(@indent)
  end

end
|
272
|
-
end
|