tree-sitter-ts 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +246 -0
- package/dist/index.cjs +5529 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +735 -0
- package/dist/index.d.ts +735 -0
- package/dist/index.js +5464 -0
- package/dist/index.js.map +1 -0
- package/package.json +57 -0
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,735 @@
|
|
|
1
|
+
/** Character position in source code */
|
|
2
|
+
interface Position {
|
|
3
|
+
/** 1-based line number */
|
|
4
|
+
line: number;
|
|
5
|
+
/** 0-based column number */
|
|
6
|
+
column: number;
|
|
7
|
+
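/** Offset from start of source, as an index into the source string (equals the byte offset only for single-byte text; TODO confirm units) */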
|
|
8
|
+
offset: number;
|
|
9
|
+
}
|
|
10
|
+
/** Range spanning two positions in source code */
|
|
11
|
+
interface Range {
|
|
12
|
+
start: Position;
|
|
13
|
+
end: Position;
|
|
14
|
+
}
|
|
15
|
+
/** Predefined character classes covering common patterns */
|
|
16
|
+
type PredefinedCharClass = "letter" | "upper" | "lower" | "digit" | "hexDigit" | "alphanumeric" | "whitespace" | "newline" | "any";
|
|
17
|
+
/**
|
|
18
|
+
* Character class definition.
|
|
19
|
+
* The building block for lexer patterns, replacing regex character classes
|
|
20
|
+
* with declarative, composable definitions.
|
|
21
|
+
*/
|
|
22
|
+
type CharClass = {
|
|
23
|
+
predefined: PredefinedCharClass;
|
|
24
|
+
} | {
|
|
25
|
+
chars: string;
|
|
26
|
+
} | {
|
|
27
|
+
range: [string, string];
|
|
28
|
+
} | {
|
|
29
|
+
union: CharClass[];
|
|
30
|
+
} | {
|
|
31
|
+
negate: CharClass;
|
|
32
|
+
} | {
|
|
33
|
+
ref: string;
|
|
34
|
+
};
|
|
35
|
+
/**
|
|
36
|
+
* Standard token categories for syntax highlighting.
|
|
37
|
+
* Maps to VS Code / TextMate scope categories for theme compatibility.
|
|
38
|
+
*/
|
|
39
|
+
type TokenCategory = "keyword" | "identifier" | "string" | "number" | "comment" | "operator" | "punctuation" | "type" | "decorator" | "tag" | "attribute" | "meta" | "regexp" | "escape" | "variable" | "constant" | "whitespace" | "newline" | "error" | "plain";
|
|
40
|
+
/**
|
|
41
|
+
* Symbol kinds for code structure classification.
|
|
42
|
+
* Compatible with ragts CodeSymbol.kind and VS Code SymbolKind.
|
|
43
|
+
*/
|
|
44
|
+
type SymbolKind = "function" | "class" | "method" | "interface" | "type" | "enum" | "module" | "variable" | "import" | "export" | "namespace" | "property" | "constant" | "other";
|
|
45
|
+
|
|
46
|
+
/** Complete lexer configuration for a language profile */
|
|
47
|
+
interface LexerConfig {
|
|
48
|
+
/** Reusable named character classes */
|
|
49
|
+
charClasses?: Record<string, CharClass>;
|
|
50
|
+
/** Token type definitions with highlighting category */
|
|
51
|
+
tokenTypes: Record<string, TokenTypeDef>;
|
|
52
|
+
/**
|
|
53
|
+
* Lexer states (modes) for context-dependent tokenization.
|
|
54
|
+
* Each state has its own ordered set of rules.
|
|
55
|
+
*/
|
|
56
|
+
states: Record<string, LexerState>;
|
|
57
|
+
/** Which state to start in (must exist in states) */
|
|
58
|
+
initialState: string;
|
|
59
|
+
/** Token types to skip in parser (still emitted in token stream) */
|
|
60
|
+
skipTokens?: string[];
|
|
61
|
+
/** Indentation tracking for Python/YAML-like languages */
|
|
62
|
+
indentation?: IndentationConfig;
|
|
63
|
+
}
|
|
64
|
+
/** Token type definition with highlighting metadata */
|
|
65
|
+
interface TokenTypeDef {
|
|
66
|
+
/** Primary highlighting category */
|
|
67
|
+
category: TokenCategory;
|
|
68
|
+
/** Optional sub-category for finer-grained highlighting */
|
|
69
|
+
subcategory?: string;
|
|
70
|
+
}
|
|
71
|
+
/** A lexer state (mode) containing ordered matching rules */
|
|
72
|
+
interface LexerState {
|
|
73
|
+
/** Rules applied in priority order - first match wins */
|
|
74
|
+
rules: LexerRule[];
|
|
75
|
+
}
|
|
76
|
+
/**
|
|
77
|
+
* A single lexer rule: match a pattern, emit a token, optionally change state.
|
|
78
|
+
*
|
|
79
|
+
* State transitions enable context-dependent tokenization:
|
|
80
|
+
* - push: enter a new state (e.g., entering a template string)
|
|
81
|
+
* - pop: return to the previous state (e.g., closing a template expression)
|
|
82
|
+
* - switchTo: replace current state (e.g., switching from tag to attribute mode)
|
|
83
|
+
*/
|
|
84
|
+
interface LexerRule {
|
|
85
|
+
/** Matcher that detects this token */
|
|
86
|
+
match: Matcher;
|
|
87
|
+
/** Token type to emit (must exist in tokenTypes) */
|
|
88
|
+
token: string;
|
|
89
|
+
/** Push a new state onto the state stack */
|
|
90
|
+
push?: string;
|
|
91
|
+
/** Pop the current state from the stack */
|
|
92
|
+
pop?: boolean;
|
|
93
|
+
/** Replace current state (switch without push/pop) */
|
|
94
|
+
switchTo?: string;
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* Matcher types - the heart of the lexer.
|
|
98
|
+
*
|
|
99
|
+
* Each type describes a common tokenization pattern declaratively.
|
|
100
|
+
* The engine compiles these to efficient character-level scanners.
|
|
101
|
+
*
|
|
102
|
+
* Key advantages over regex:
|
|
103
|
+
* - Readable and self-documenting
|
|
104
|
+
* - State machine handles nesting (template literals, JSX)
|
|
105
|
+
* - No regex pitfalls (catastrophic backtracking, etc.)
|
|
106
|
+
* - Can be statically analyzed and optimized
|
|
107
|
+
*/
|
|
108
|
+
type Matcher = StringMatcher | KeywordsMatcher | DelimitedMatcher | LineMatcher | CharSequenceMatcher | NumberMatcher | SequenceMatcher | PatternMatcher;
|
|
109
|
+
/**
|
|
110
|
+
* Exact string match. For operators, punctuation, specific tokens.
|
|
111
|
+
* When value is an array, each string is tried in order, so list longer strings before their shorter prefixes (longest match first recommended).
|
|
112
|
+
*
|
|
113
|
+
* @example { kind: 'string', value: '=>' }
|
|
114
|
+
* @example { kind: 'string', value: ['{', '}', '(', ')'] }
|
|
115
|
+
*/
|
|
116
|
+
interface StringMatcher {
|
|
117
|
+
kind: "string";
|
|
118
|
+
value: string | string[];
|
|
119
|
+
}
|
|
120
|
+
/**
|
|
121
|
+
* Keyword match with word boundary checking.
|
|
122
|
+
* Ensures "if" doesn't match inside "iframe".
|
|
123
|
+
*
|
|
124
|
+
* @example { kind: 'keywords', words: ['if', 'else', 'for', 'while'] }
|
|
125
|
+
*/
|
|
126
|
+
interface KeywordsMatcher {
|
|
127
|
+
kind: "keywords";
|
|
128
|
+
words: string[];
|
|
129
|
+
}
|
|
130
|
+
/**
|
|
131
|
+
* Delimited content match. Handles strings, block comments, raw strings.
|
|
132
|
+
* Supports escape characters, multiline content, and nested delimiters.
|
|
133
|
+
*
|
|
134
|
+
* @example { kind: 'delimited', open: '"', close: '"', escape: '\\' }
|
|
135
|
+
* @example { kind: 'delimited', open: '/*', close: '*\/', multiline: true }
|
|
136
|
+
* @example { kind: 'delimited', open: '"""', close: '"""', multiline: true }
|
|
137
|
+
*/
|
|
138
|
+
interface DelimitedMatcher {
|
|
139
|
+
kind: "delimited";
|
|
140
|
+
/** Opening delimiter string */
|
|
141
|
+
open: string;
|
|
142
|
+
/** Closing delimiter string */
|
|
143
|
+
close: string;
|
|
144
|
+
/** Escape character (e.g., '\\' for backslash escaping) */
|
|
145
|
+
escape?: string;
|
|
146
|
+
/** Whether content can span multiple lines (default: false) */
|
|
147
|
+
multiline?: boolean;
|
|
148
|
+
/** Whether delimiters can nest (e.g., Rust nested block comments) */
|
|
149
|
+
nested?: boolean;
|
|
150
|
+
}
|
|
151
|
+
/**
|
|
152
|
+
* Line content match. From marker to end of line.
|
|
153
|
+
* For line comments, preprocessor directives, etc.
|
|
154
|
+
*
|
|
155
|
+
* @example { kind: 'line', start: '//' }
|
|
156
|
+
* @example { kind: 'line', start: '#' }
|
|
157
|
+
*/
|
|
158
|
+
interface LineMatcher {
|
|
159
|
+
kind: "line";
|
|
160
|
+
/** Starting marker string */
|
|
161
|
+
start: string;
|
|
162
|
+
}
|
|
163
|
+
/**
|
|
164
|
+
* Character sequence match. Matches sequences of characters from a class.
|
|
165
|
+
* For identifiers, hex numbers, and other character-class-based tokens.
|
|
166
|
+
*
|
|
167
|
+
* @example // Identifier: starts with letter/_, continues with alphanumeric/_
|
|
168
|
+
* { kind: 'charSequence',
|
|
169
|
+
* first: { union: [{ predefined: 'letter' }, { chars: '_$' }] },
|
|
170
|
+
* rest: { union: [{ predefined: 'alphanumeric' }, { chars: '_$' }] } }
|
|
171
|
+
*/
|
|
172
|
+
interface CharSequenceMatcher {
|
|
173
|
+
kind: "charSequence";
|
|
174
|
+
/** Character class for the first character */
|
|
175
|
+
first: CharClass;
|
|
176
|
+
/** Character class for subsequent characters (if omitted, matches single char) */
|
|
177
|
+
rest?: CharClass;
|
|
178
|
+
}
|
|
179
|
+
/**
|
|
180
|
+
* Number literal match. Handles the common numeric literal formats
|
|
181
|
+
* found across programming languages. Compiles to efficient scanning
|
|
182
|
+
* without regex.
|
|
183
|
+
*
|
|
184
|
+
* @example { kind: 'number', integer: true, float: true, hex: true, separator: '_' }
|
|
185
|
+
* @example { kind: 'number', integer: true, float: true, suffix: ['px', 'em', 'rem', '%'] }
|
|
186
|
+
*/
|
|
187
|
+
interface NumberMatcher {
|
|
188
|
+
kind: "number";
|
|
189
|
+
/** Match decimal integers: 123 */
|
|
190
|
+
integer?: boolean;
|
|
191
|
+
/** Match floating point: 1.5 */
|
|
192
|
+
float?: boolean;
|
|
193
|
+
/** Match hexadecimal: 0xFF */
|
|
194
|
+
hex?: boolean;
|
|
195
|
+
/** Match octal: 0o77 */
|
|
196
|
+
octal?: boolean;
|
|
197
|
+
/** Match binary: 0b1010 */
|
|
198
|
+
binary?: boolean;
|
|
199
|
+
/** Match scientific notation: 1e10, 1.5e-3 */
|
|
200
|
+
scientific?: boolean;
|
|
201
|
+
/** Digit separator character: '_' for 1_000_000 */
|
|
202
|
+
separator?: string;
|
|
203
|
+
/** Unit suffixes: ['px', 'em', 'rem', '%'] for CSS */
|
|
204
|
+
suffix?: string[];
|
|
205
|
+
}
|
|
206
|
+
/**
|
|
207
|
+
* Sequence match. All elements must match in order.
|
|
208
|
+
* For composite patterns like decorators (@identifier).
|
|
209
|
+
*
|
|
210
|
+
* @example { kind: 'sequence', elements: [
|
|
211
|
+
* { kind: 'string', value: '@' },
|
|
212
|
+
* { kind: 'charSequence', first: { predefined: 'letter' }, rest: { predefined: 'alphanumeric' } }
|
|
213
|
+
* ]}
|
|
214
|
+
*/
|
|
215
|
+
interface SequenceMatcher {
|
|
216
|
+
kind: "sequence";
|
|
217
|
+
elements: Matcher[];
|
|
218
|
+
}
|
|
219
|
+
/**
|
|
220
|
+
* Regex pattern escape hatch. DISCOURAGED - use other matchers when possible.
|
|
221
|
+
* For truly complex edge cases that can't be expressed with other matchers.
|
|
222
|
+
*
|
|
223
|
+
* @example { kind: 'pattern', regex: '\\b0[xX][0-9a-fA-F]+\\b' }
|
|
224
|
+
*/
|
|
225
|
+
interface PatternMatcher {
|
|
226
|
+
kind: "pattern";
|
|
227
|
+
/** Regex pattern string (without flags) */
|
|
228
|
+
regex: string;
|
|
229
|
+
}
|
|
230
|
+
/** Configuration for indentation-based block detection (Python, YAML, etc.) */
|
|
231
|
+
interface IndentationConfig {
|
|
232
|
+
/** Token type to emit when indentation increases */
|
|
233
|
+
indentToken: string;
|
|
234
|
+
/** Token type to emit when indentation decreases */
|
|
235
|
+
dedentToken: string;
|
|
236
|
+
/** How to detect indent units */
|
|
237
|
+
unit: "spaces" | "tab" | "detect";
|
|
238
|
+
/** Number of spaces per indent level (when unit is 'spaces') */
|
|
239
|
+
size?: number;
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
/** Structure detection configuration */
|
|
243
|
+
interface StructureConfig {
|
|
244
|
+
/** Block delimiter pairs for nesting detection */
|
|
245
|
+
blocks: BlockRule[];
|
|
246
|
+
/** Symbol detection rules - identify functions, classes, etc. */
|
|
247
|
+
symbols: SymbolRule[];
|
|
248
|
+
/** Folding region rules for editors */
|
|
249
|
+
folding?: FoldingRule[];
|
|
250
|
+
}
|
|
251
|
+
/** Block delimiter pair for bracket/brace matching */
|
|
252
|
+
interface BlockRule {
|
|
253
|
+
/** Descriptive name (e.g., 'braces', 'parens', 'brackets') */
|
|
254
|
+
name: string;
|
|
255
|
+
/** Opening delimiter token value */
|
|
256
|
+
open: string;
|
|
257
|
+
/** Closing delimiter token value */
|
|
258
|
+
close: string;
|
|
259
|
+
}
|
|
260
|
+
/**
|
|
261
|
+
* Symbol detection rule.
|
|
262
|
+
* Matches a sequence of token patterns to identify a language construct
|
|
263
|
+
* (function, class, method, etc.) from the token stream.
|
|
264
|
+
*
|
|
265
|
+
* @example // JavaScript function declaration
|
|
266
|
+
* {
|
|
267
|
+
* name: 'function_declaration',
|
|
268
|
+
* kind: 'function',
|
|
269
|
+
* pattern: [
|
|
270
|
+
* { token: 'keyword', value: 'function' },
|
|
271
|
+
* { token: 'identifier', capture: 'name' }
|
|
272
|
+
* ],
|
|
273
|
+
* hasBody: true,
|
|
274
|
+
* bodyStyle: 'braces'
|
|
275
|
+
* }
|
|
276
|
+
*
|
|
277
|
+
* @example // Python class with indentation body
|
|
278
|
+
* {
|
|
279
|
+
* name: 'class_definition',
|
|
280
|
+
* kind: 'class',
|
|
281
|
+
* pattern: [
|
|
282
|
+
* { token: 'keyword', value: 'class' },
|
|
283
|
+
* { token: 'identifier', capture: 'name' }
|
|
284
|
+
* ],
|
|
285
|
+
* hasBody: true,
|
|
286
|
+
* bodyStyle: 'indentation'
|
|
287
|
+
* }
|
|
288
|
+
*/
|
|
289
|
+
interface SymbolRule {
|
|
290
|
+
/** Rule name (becomes the node type, compatible with tree-sitter names) */
|
|
291
|
+
name: string;
|
|
292
|
+
/** Symbol kind for classification */
|
|
293
|
+
kind: SymbolKind;
|
|
294
|
+
/** Token pattern to detect this symbol */
|
|
295
|
+
pattern: TokenPatternStep[];
|
|
296
|
+
/** Whether this symbol has a body (block) */
|
|
297
|
+
hasBody?: boolean;
|
|
298
|
+
/** How the body is delimited */
|
|
299
|
+
bodyStyle?: "braces" | "indentation" | "end-keyword";
|
|
300
|
+
/** End keyword for bodyStyle 'end-keyword' (e.g., Ruby: 'end') */
|
|
301
|
+
endKeyword?: string;
|
|
302
|
+
/** Can this symbol appear inside another symbol? */
|
|
303
|
+
nested?: boolean;
|
|
304
|
+
}
|
|
305
|
+
/**
|
|
306
|
+
* A step in a token pattern match.
|
|
307
|
+
* Steps are matched sequentially against the token stream.
|
|
308
|
+
*/
|
|
309
|
+
type TokenPatternStep = TokenMatchStep | TokenSkipStep | TokenOptionalStep | TokenAnyOfStep;
|
|
310
|
+
/** Match a specific token type and optionally a value */
|
|
311
|
+
interface TokenMatchStep {
|
|
312
|
+
/** Token type to match (e.g., 'keyword', 'identifier') */
|
|
313
|
+
token: string;
|
|
314
|
+
/** Optional: specific token value to match */
|
|
315
|
+
value?: string;
|
|
316
|
+
/** Optional: capture this token's value under a name (e.g., 'name') */
|
|
317
|
+
capture?: string;
|
|
318
|
+
}
|
|
319
|
+
/** Skip any tokens until the next step matches */
|
|
320
|
+
interface TokenSkipStep {
|
|
321
|
+
skip: true;
|
|
322
|
+
/** Maximum tokens to skip before giving up (prevents runaway matching) */
|
|
323
|
+
maxTokens?: number;
|
|
324
|
+
}
|
|
325
|
+
/** Optional step - matches if possible, skips if not */
|
|
326
|
+
interface TokenOptionalStep {
|
|
327
|
+
optional: TokenPatternStep;
|
|
328
|
+
}
|
|
329
|
+
/** Choice - match any one of the alternatives */
|
|
330
|
+
interface TokenAnyOfStep {
|
|
331
|
+
anyOf: TokenPatternStep[];
|
|
332
|
+
}
|
|
333
|
+
/** Folding region definition for editors */
|
|
334
|
+
interface FoldingRule {
|
|
335
|
+
/** What opens a foldable region */
|
|
336
|
+
open: {
|
|
337
|
+
token: string;
|
|
338
|
+
value?: string;
|
|
339
|
+
};
|
|
340
|
+
/** What closes it */
|
|
341
|
+
close: {
|
|
342
|
+
token: string;
|
|
343
|
+
value?: string;
|
|
344
|
+
};
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
/** Full grammar configuration for AST construction */
|
|
348
|
+
interface GrammarConfig {
|
|
349
|
+
/** Entry rule name (top-level production) */
|
|
350
|
+
entry: string;
|
|
351
|
+
/** Grammar production rules */
|
|
352
|
+
rules: Record<string, GrammarRule>;
|
|
353
|
+
/** Operator precedence levels (lowest to highest) */
|
|
354
|
+
precedence?: PrecedenceLevel[];
|
|
355
|
+
/** Error recovery strategies */
|
|
356
|
+
recovery?: RecoveryStrategy[];
|
|
357
|
+
}
|
|
358
|
+
/** A grammar rule with one or more alternative productions */
|
|
359
|
+
interface GrammarRule {
|
|
360
|
+
/** Alternative productions (tried in order) */
|
|
361
|
+
alternatives: Production[];
|
|
362
|
+
/** If true, this rule's node is inlined into parent (no separate AST node) */
|
|
363
|
+
inline?: boolean;
|
|
364
|
+
}
|
|
365
|
+
/** A production is a sequence of elements that form a complete match */
|
|
366
|
+
type Production = ProductionElement[];
|
|
367
|
+
/**
|
|
368
|
+
* An element within a production.
|
|
369
|
+
* Elements compose to describe the full syntax of a language construct.
|
|
370
|
+
*/
|
|
371
|
+
type ProductionElement = TokenElement | RuleElement | OptionalElement | RepeatElement | ChoiceElement | PrecElement;
|
|
372
|
+
/** Match a specific token type (terminal) */
|
|
373
|
+
interface TokenElement {
|
|
374
|
+
/** Token type to match */
|
|
375
|
+
token: string;
|
|
376
|
+
/** Optional: specific token value */
|
|
377
|
+
value?: string;
|
|
378
|
+
/** Optional: field name for the AST node */
|
|
379
|
+
field?: string;
|
|
380
|
+
}
|
|
381
|
+
/** Match another grammar rule (non-terminal) */
|
|
382
|
+
interface RuleElement {
|
|
383
|
+
/** Rule name to match */
|
|
384
|
+
rule: string;
|
|
385
|
+
/** Optional: field name for the AST node */
|
|
386
|
+
field?: string;
|
|
387
|
+
}
|
|
388
|
+
/** Optional element - matches zero or one times */
|
|
389
|
+
interface OptionalElement {
|
|
390
|
+
optional: ProductionElement | Production;
|
|
391
|
+
}
|
|
392
|
+
/** Repeated element - matches zero or more (min=0) or one or more (min=1) */
|
|
393
|
+
interface RepeatElement {
|
|
394
|
+
repeat: ProductionElement | Production;
|
|
395
|
+
/** Minimum occurrences (0 = *, 1 = +). Default: 0 */
|
|
396
|
+
min?: number;
|
|
397
|
+
/** Optional separator between repetitions (e.g., comma in argument lists) */
|
|
398
|
+
separator?: {
|
|
399
|
+
token: string;
|
|
400
|
+
value?: string;
|
|
401
|
+
};
|
|
402
|
+
}
|
|
403
|
+
/** Choice between alternative productions */
|
|
404
|
+
interface ChoiceElement {
|
|
405
|
+
choice: Production[];
|
|
406
|
+
}
|
|
407
|
+
/** Precedence annotation for operator expressions */
|
|
408
|
+
interface PrecElement {
|
|
409
|
+
/** Precedence level (higher binds tighter) */
|
|
410
|
+
prec: number;
|
|
411
|
+
/** Associativity */
|
|
412
|
+
assoc?: "left" | "right" | "none";
|
|
413
|
+
/** The element this precedence applies to */
|
|
414
|
+
element: ProductionElement;
|
|
415
|
+
}
|
|
416
|
+
/** Operator precedence level */
|
|
417
|
+
interface PrecedenceLevel {
|
|
418
|
+
/** Numeric level (higher = binds tighter) */
|
|
419
|
+
level: number;
|
|
420
|
+
/** How operators at this level associate */
|
|
421
|
+
associativity: "left" | "right" | "none";
|
|
422
|
+
/** Operator token values at this level */
|
|
423
|
+
operators: string[];
|
|
424
|
+
}
|
|
425
|
+
/** Error recovery strategy for a grammar context */
|
|
426
|
+
interface RecoveryStrategy {
|
|
427
|
+
/** Grammar rule context where this recovery applies */
|
|
428
|
+
context: string;
|
|
429
|
+
/** Tokens to synchronize on when recovering from errors */
|
|
430
|
+
syncTokens: string[];
|
|
431
|
+
}
|
|
432
|
+
|
|
433
|
+
/**
|
|
434
|
+
* A complete language profile for tree-sitter-ts.
|
|
435
|
+
*
|
|
436
|
+
* Three-level architecture:
|
|
437
|
+
* - Level 1 (lexer): REQUIRED - tokenization, syntax highlighting
|
|
438
|
+
* - Level 2 (structure): Optional - RAG chunking, code folding, symbol outline
|
|
439
|
+
* - Level 3 (grammar): Optional - full AST, syntax validation
|
|
440
|
+
*/
|
|
441
|
+
interface LanguageProfile {
|
|
442
|
+
/** Unique language identifier (e.g., 'typescript', 'python') */
|
|
443
|
+
name: string;
|
|
444
|
+
/** Human-readable display name (e.g., 'TypeScript') */
|
|
445
|
+
displayName: string;
|
|
446
|
+
/** Profile schema version */
|
|
447
|
+
version: string;
|
|
448
|
+
/** File extensions including dot (e.g., ['.ts', '.tsx']) */
|
|
449
|
+
fileExtensions: string[];
|
|
450
|
+
/** MIME types (e.g., ['text/typescript']) */
|
|
451
|
+
mimeTypes?: string[];
|
|
452
|
+
/**
|
|
453
|
+
* Extend another profile by name.
|
|
454
|
+
* The child profile's definitions are merged on top of the parent's.
|
|
455
|
+
* Token types, states, and rules from the child override the parent.
|
|
456
|
+
*/
|
|
457
|
+
extends?: string;
|
|
458
|
+
/** Level 1: Lexer configuration (REQUIRED) */
|
|
459
|
+
lexer: LexerConfig;
|
|
460
|
+
/** Level 2: Structure detection (optional, enables RAG/symbols) */
|
|
461
|
+
structure?: StructureConfig;
|
|
462
|
+
/** Level 3: Full grammar (optional, enables validation/full AST) */
|
|
463
|
+
grammar?: GrammarConfig;
|
|
464
|
+
/** Embedded language regions (e.g., CSS/JS in HTML) */
|
|
465
|
+
embeddedLanguages?: EmbeddedLanguageRule[];
|
|
466
|
+
}
|
|
467
|
+
/** Rule for detecting embedded language regions within a host language */
|
|
468
|
+
interface EmbeddedLanguageRule {
|
|
469
|
+
/** Language to switch to (must be a registered profile name) */
|
|
470
|
+
language: string;
|
|
471
|
+
/** Token pattern that marks the start of the embedded region */
|
|
472
|
+
start: {
|
|
473
|
+
token: string;
|
|
474
|
+
value?: string;
|
|
475
|
+
};
|
|
476
|
+
/** Token pattern that marks the end of the embedded region */
|
|
477
|
+
end: {
|
|
478
|
+
token: string;
|
|
479
|
+
value?: string;
|
|
480
|
+
};
|
|
481
|
+
/** How to determine which language to use */
|
|
482
|
+
languageDetection?: "fixed" | "attribute";
|
|
483
|
+
/** Token whose value contains the language name (when detection is 'attribute') */
|
|
484
|
+
attributeToken?: string;
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
/** A token emitted by the lexer */
|
|
488
|
+
interface Token {
|
|
489
|
+
/** Token type name (from profile's tokenTypes) */
|
|
490
|
+
type: string;
|
|
491
|
+
/** Token text content */
|
|
492
|
+
value: string;
|
|
493
|
+
/** Highlighting category */
|
|
494
|
+
category: TokenCategory;
|
|
495
|
+
/** Position in source */
|
|
496
|
+
range: Range;
|
|
497
|
+
}
|
|
498
|
+
|
|
499
|
+
/** A node in the syntax tree */
|
|
500
|
+
interface SyntaxNode {
|
|
501
|
+
/** Node type (from structure rules or grammar rules) */
|
|
502
|
+
type: string;
|
|
503
|
+
/** Named fields (e.g., 'name', 'body', 'condition') */
|
|
504
|
+
fields: Record<string, SyntaxNode | SyntaxNode[]>;
|
|
505
|
+
/** Child nodes */
|
|
506
|
+
children: SyntaxNode[];
|
|
507
|
+
/** Source text */
|
|
508
|
+
text: string;
|
|
509
|
+
/** Position in source */
|
|
510
|
+
range: Range;
|
|
511
|
+
/** Parent node (null for root) */
|
|
512
|
+
parent: SyntaxNode | null;
|
|
513
|
+
}
|
|
514
|
+
/**
|
|
515
|
+
* Code symbol extracted from structure analysis.
|
|
516
|
+
* Compatible with ragts CodeSymbol interface.
|
|
517
|
+
*/
|
|
518
|
+
interface CodeSymbol {
|
|
519
|
+
/** Symbol name (e.g., function/class name) */
|
|
520
|
+
name: string;
|
|
521
|
+
/** Symbol kind classification */
|
|
522
|
+
kind: SymbolKind;
|
|
523
|
+
/** 1-based start line */
|
|
524
|
+
startLine: number;
|
|
525
|
+
/** 1-based end line */
|
|
526
|
+
endLine: number;
|
|
527
|
+
/** Nesting path for context (e.g., ['ClassName', 'methodName']) */
|
|
528
|
+
path?: string[];
|
|
529
|
+
}
|
|
530
|
+
|
|
531
|
+
/** Compiled lexer ready to tokenize */
|
|
532
|
+
declare class CompiledLexer {
|
|
533
|
+
private readonly states;
|
|
534
|
+
private readonly config;
|
|
535
|
+
constructor(config: LexerConfig);
|
|
536
|
+
/** Tokenize source code into a token stream */
|
|
537
|
+
tokenize(source: string): Token[];
|
|
538
|
+
}
|
|
539
|
+
/** Get or create a compiled lexer for the given config */
|
|
540
|
+
declare function getCompiledLexer(config: LexerConfig): CompiledLexer;
|
|
541
|
+
|
|
542
|
+
/** Reads source code character by character, tracking line/column/offset */
|
|
543
|
+
declare class CharReader {
|
|
544
|
+
private readonly src;
|
|
545
|
+
private readonly len;
|
|
546
|
+
private pos;
|
|
547
|
+
private line;
|
|
548
|
+
private col;
|
|
549
|
+
constructor(source: string);
|
|
550
|
+
/** Current offset into the source string (presumably a character index rather than a byte count; TODO confirm) */
|
|
551
|
+
get offset(): number;
|
|
552
|
+
/** Whether we've reached end of source */
|
|
553
|
+
get eof(): boolean;
|
|
554
|
+
/** Number of characters remaining from the current position */
|
|
555
|
+
get remaining(): number;
|
|
556
|
+
/** Current position as Position object */
|
|
557
|
+
get position(): Position;
|
|
558
|
+
/** Peek at the current character without advancing */
|
|
559
|
+
peek(): string;
|
|
560
|
+
/** Peek at character at offset from current position */
|
|
561
|
+
peekAt(offset: number): string;
|
|
562
|
+
/** Peek at a substring from current position */
|
|
563
|
+
peekString(length: number): string;
|
|
564
|
+
/** Get the char code at current position */
|
|
565
|
+
peekCode(): number;
|
|
566
|
+
/** Advance one character and return it */
|
|
567
|
+
advance(): string;
|
|
568
|
+
/** Advance N characters and return the consumed substring */
|
|
569
|
+
advanceN(n: number): string;
|
|
570
|
+
/** Check if source starts with the given string at current position */
|
|
571
|
+
startsWith(str: string): boolean;
|
|
572
|
+
/** Get a slice of the source from start offset to current position */
|
|
573
|
+
sliceFrom(startOffset: number): string;
|
|
574
|
+
/** Get the full source string */
|
|
575
|
+
get source(): string;
|
|
576
|
+
/** Save current state for backtracking */
|
|
577
|
+
save(): ReaderState;
|
|
578
|
+
/** Restore a previously saved state */
|
|
579
|
+
restore(state: ReaderState): void;
|
|
580
|
+
}
|
|
581
|
+
/** Saved reader state for backtracking */
|
|
582
|
+
interface ReaderState {
|
|
583
|
+
pos: number;
|
|
584
|
+
line: number;
|
|
585
|
+
col: number;
|
|
586
|
+
}
|
|
587
|
+
|
|
588
|
+
/** A compiled scanner: returns chars consumed (0 = no match) */
|
|
589
|
+
type ScanFn = (reader: CharReader) => number;
|
|
590
|
+
/**
|
|
591
|
+
* Compile a Matcher definition into an executable ScanFn.
|
|
592
|
+
* The charClasses map provides named character class resolution.
|
|
593
|
+
*/
|
|
594
|
+
declare function compileMatcher(matcher: Matcher, charClasses?: Record<string, CharClass>): ScanFn;
|
|
595
|
+
|
|
596
|
+
/**
|
|
597
|
+
* Compile a CharClass definition into a fast character-test function.
|
|
598
|
+
* Named references are resolved via the provided charClasses map.
|
|
599
|
+
*/
|
|
600
|
+
declare function compileCharClass(def: CharClass, charClasses?: Record<string, CharClass>): (ch: string) => boolean;
|
|
601
|
+
|
|
602
|
+
/** A matched block with open/close token indices */
|
|
603
|
+
interface BlockSpan {
|
|
604
|
+
/** Block rule name */
|
|
605
|
+
name: string;
|
|
606
|
+
/** Index of opening token in the token array */
|
|
607
|
+
openIndex: number;
|
|
608
|
+
/** Index of closing token in the token array */
|
|
609
|
+
closeIndex: number;
|
|
610
|
+
/** Nesting depth (0 = top level) */
|
|
611
|
+
depth: number;
|
|
612
|
+
}
|
|
613
|
+
/**
|
|
614
|
+
* Find all block spans in a token array using the given block rules.
|
|
615
|
+
* Returns spans sorted by openIndex.
|
|
616
|
+
*/
|
|
617
|
+
declare function findBlockSpans(tokens: Token[], blockRules: BlockRule[]): BlockSpan[];
|
|
618
|
+
|
|
619
|
+
/**
|
|
620
|
+
* Extract code symbols from a pre-tokenized token stream.
|
|
621
|
+
* Useful when you already have tokens and want to avoid re-tokenizing.
|
|
622
|
+
*/
|
|
623
|
+
declare function extractSymbolsFromTokens(tokens: Token[], profile: LanguageProfile): CodeSymbol[];
|
|
624
|
+
|
|
625
|
+
/** JSON language profile - Level 1 (Lexer) + Level 2 (Structure) + Level 3 (Grammar) */
|
|
626
|
+
declare const json: LanguageProfile;
|
|
627
|
+
|
|
628
|
+
/** CSS language profile - Level 1 (Lexer) + Level 2 (Structure) */
|
|
629
|
+
declare const css: LanguageProfile;
|
|
630
|
+
|
|
631
|
+
/** SCSS language profile - extends CSS with nesting, variables, mixins */
|
|
632
|
+
declare const scss: LanguageProfile;
|
|
633
|
+
|
|
634
|
+
/** Python language profile - Level 1 (Lexer) + Level 2 (Structure) */
|
|
635
|
+
declare const python: LanguageProfile;
|
|
636
|
+
|
|
637
|
+
/** Go language profile - Level 1 (Lexer) + Level 2 (Structure) */
|
|
638
|
+
declare const go: LanguageProfile;
|
|
639
|
+
|
|
640
|
+
/** JavaScript language profile - Level 1 (Lexer) + Level 2 (Structure) */
|
|
641
|
+
declare const javascript: LanguageProfile;
|
|
642
|
+
|
|
643
|
+
/**
|
|
644
|
+
* TypeScript language profile - extends JavaScript.
|
|
645
|
+
* Adds type annotations, generics, interfaces, enums, decorators, etc.
|
|
646
|
+
* Level 1 (Lexer) + Level 2 (Structure)
|
|
647
|
+
*/
|
|
648
|
+
declare const typescript: LanguageProfile;
|
|
649
|
+
|
|
650
|
+
/** C++ language profile - Level 1 (Lexer) + Level 2 (Structure) */
|
|
651
|
+
declare const cpp: LanguageProfile;
|
|
652
|
+
|
|
653
|
+
/** HTML language profile - Level 1 (Lexer) + Level 2 (Structure) + Embedded Languages */
|
|
654
|
+
declare const html: LanguageProfile;
|
|
655
|
+
|
|
656
|
+
/** Markdown language profile - Level 1 (Lexer) + Level 2 (Structure) */
|
|
657
|
+
declare const markdown: LanguageProfile;
|
|
658
|
+
|
|
659
|
+
/** YAML language profile - implemented levels are not documented in this declaration */
declare const yaml: LanguageProfile;
|
|
660
|
+
|
|
661
|
+
/** XML language profile - implemented levels are not documented in this declaration */
declare const xml: LanguageProfile;
|
|
662
|
+
|
|
663
|
+
/** Java language profile - implemented levels are not documented in this declaration */
declare const java: LanguageProfile;
|
|
664
|
+
|
|
665
|
+
/** C# language profile - implemented levels are not documented in this declaration */
declare const csharp: LanguageProfile;
|
|
666
|
+
|
|
667
|
+
/** Rust language profile - implemented levels are not documented in this declaration */
declare const rust: LanguageProfile;
|
|
668
|
+
|
|
669
|
+
/** Ruby language profile - implemented levels are not documented in this declaration */
declare const ruby: LanguageProfile;
|
|
670
|
+
|
|
671
|
+
/** PHP language profile - implemented levels are not documented in this declaration */
declare const php: LanguageProfile;
|
|
672
|
+
|
|
673
|
+
/** Kotlin language profile - implemented levels are not documented in this declaration */
declare const kotlin: LanguageProfile;
|
|
674
|
+
|
|
675
|
+
/** Swift language profile - implemented levels are not documented in this declaration */
declare const swift: LanguageProfile;
|
|
676
|
+
|
|
677
|
+
/** Shell language profile - implemented levels are not documented in this declaration */
declare const shell: LanguageProfile;
|
|
678
|
+
|
|
679
|
+
/** SQL language profile - implemented levels are not documented in this declaration */
declare const sql: LanguageProfile;
|
|
680
|
+
|
|
681
|
+
/** TOML language profile - implemented levels are not documented in this declaration */
declare const toml: LanguageProfile;
|
|
682
|
+
|
|
683
|
+
/**
|
|
684
|
+
* Resolve a profile's inheritance chain.
|
|
685
|
+
* If the profile has `extends`, looks up the parent in the registry
|
|
686
|
+
* and merges them (child overrides parent).
|
|
687
|
+
*/
|
|
688
|
+
declare function resolveProfile(profile: LanguageProfile, registry: Map<string, LanguageProfile>): LanguageProfile;
|
|
689
|
+
|
|
690
|
+
/** All built-in language profiles */
|
|
691
|
+
declare const builtinProfiles: LanguageProfile[];
|
|
692
|
+
/** Register a language profile */
|
|
693
|
+
declare function registerProfile(profile: LanguageProfile): void;
|
|
694
|
+
/** Get a profile by name (e.g., 'typescript') or file extension (e.g., '.ts') */
|
|
695
|
+
declare function getProfile(nameOrExt: string): LanguageProfile | undefined;
|
|
696
|
+
/** Get all registered profile names */
|
|
697
|
+
declare function getRegisteredLanguages(): string[];
|
|
698
|
+
/** Get the file extensions supported by all registered profiles */
|
|
699
|
+
declare function getSupportedExtensions(): string[];
|
|
700
|
+
|
|
701
|
+
/**
|
|
702
|
+
* Tokenize source code into a token stream.
|
|
703
|
+
*
|
|
704
|
+
* @param source - The source code to tokenize
|
|
705
|
+
* @param language - Language name (e.g., 'typescript') or file extension (e.g., '.ts')
|
|
706
|
+
* @returns Array of tokens with type, value, category, and position
|
|
707
|
+
*/
|
|
708
|
+
declare function tokenize(source: string, language: string): Token[];
|
|
709
|
+
/**
|
|
710
|
+
* Extract code symbols (functions, classes, etc.) from source code.
|
|
711
|
+
* Requires the language profile to have structure rules (Level 2).
|
|
712
|
+
*
|
|
713
|
+
* @param source - The source code to analyze
|
|
714
|
+
* @param language - Language name or file extension
|
|
715
|
+
* @returns Array of symbols with name, kind, startLine, endLine
|
|
716
|
+
*/
|
|
717
|
+
declare function extractSymbols(source: string, language: string): CodeSymbol[];
|
|
718
|
+
/**
|
|
719
|
+
* Tokenize source code using a specific language profile.
|
|
720
|
+
*
|
|
721
|
+
* @param source - The source code to tokenize
|
|
722
|
+
* @param profile - The language profile to use
|
|
723
|
+
* @returns Array of tokens
|
|
724
|
+
*/
|
|
725
|
+
declare function tokenizeWithProfile(source: string, profile: LanguageProfile): Token[];
|
|
726
|
+
/**
|
|
727
|
+
* Extract code symbols using a specific language profile.
|
|
728
|
+
*
|
|
729
|
+
* @param source - The source code to analyze
|
|
730
|
+
* @param profile - The language profile to use
|
|
731
|
+
* @returns Array of symbols
|
|
732
|
+
*/
|
|
733
|
+
declare function extractSymbolsWithProfile(source: string, profile: LanguageProfile): CodeSymbol[];
|
|
734
|
+
|
|
735
|
+
export { type BlockRule, type BlockSpan, type CharClass, CharReader, type CharSequenceMatcher, type ChoiceElement, type CodeSymbol, CompiledLexer, type DelimitedMatcher, type EmbeddedLanguageRule, type FoldingRule, type GrammarConfig, type GrammarRule, type IndentationConfig, type KeywordsMatcher, type LanguageProfile, type LexerConfig, type LexerRule, type LexerState, type LineMatcher, type Matcher, type NumberMatcher, type OptionalElement, type PatternMatcher, type Position, type PrecElement, type PrecedenceLevel, type PredefinedCharClass, type Production, type ProductionElement, type Range, type RecoveryStrategy, type RepeatElement, type RuleElement, type SequenceMatcher, type StringMatcher, type StructureConfig, type SymbolKind, type SymbolRule, type SyntaxNode, type Token, type TokenAnyOfStep, type TokenCategory, type TokenElement, type TokenMatchStep, type TokenOptionalStep, type TokenPatternStep, type TokenSkipStep, type TokenTypeDef, builtinProfiles, compileCharClass, compileMatcher, cpp, csharp, css, extractSymbols, extractSymbolsFromTokens, extractSymbolsWithProfile, findBlockSpans, getCompiledLexer, getProfile, getRegisteredLanguages, getSupportedExtensions, go, html, java, javascript, json, kotlin, markdown, php, python, registerProfile, resolveProfile, ruby, rust, scss, shell, sql, swift, tokenize, tokenizeWithProfile, toml, typescript, xml, yaml };
|