@jacobknightley/fabric-format 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/README.md +196 -0
  2. package/dist/cell-formatter.d.ts +75 -0
  3. package/dist/cell-formatter.js +144 -0
  4. package/dist/cli.d.ts +2 -0
  5. package/dist/cli.js +435 -0
  6. package/dist/formatters/index.d.ts +19 -0
  7. package/dist/formatters/index.js +76 -0
  8. package/dist/formatters/python/config.d.ts +33 -0
  9. package/dist/formatters/python/config.js +29 -0
  10. package/dist/formatters/python/index.d.ts +7 -0
  11. package/dist/formatters/python/index.js +13 -0
  12. package/dist/formatters/python/python-formatter.d.ts +51 -0
  13. package/dist/formatters/python/python-formatter.js +180 -0
  14. package/dist/formatters/sparksql/constants.d.ts +16 -0
  15. package/dist/formatters/sparksql/constants.js +16 -0
  16. package/dist/formatters/sparksql/fmt-detector.d.ts +65 -0
  17. package/dist/formatters/sparksql/fmt-detector.js +84 -0
  18. package/dist/formatters/sparksql/formatter.d.ts +24 -0
  19. package/dist/formatters/sparksql/formatter.js +1276 -0
  20. package/dist/formatters/sparksql/formatting-context.d.ts +154 -0
  21. package/dist/formatters/sparksql/formatting-context.js +363 -0
  22. package/dist/formatters/sparksql/generated/SqlBaseLexer.d.ts +529 -0
  23. package/dist/formatters/sparksql/generated/SqlBaseLexer.js +2609 -0
  24. package/dist/formatters/sparksql/generated/SqlBaseParser.d.ts +8195 -0
  25. package/dist/formatters/sparksql/generated/SqlBaseParser.js +48793 -0
  26. package/dist/formatters/sparksql/generated/SqlBaseParserListener.d.ts +910 -0
  27. package/dist/formatters/sparksql/generated/SqlBaseParserListener.js +2730 -0
  28. package/dist/formatters/sparksql/generated/SqlBaseParserVisitor.d.ts +456 -0
  29. package/dist/formatters/sparksql/generated/SqlBaseParserVisitor.js +1822 -0
  30. package/dist/formatters/sparksql/generated/builtinFunctions.d.ts +8 -0
  31. package/dist/formatters/sparksql/generated/builtinFunctions.js +510 -0
  32. package/dist/formatters/sparksql/index.d.ts +11 -0
  33. package/dist/formatters/sparksql/index.js +22 -0
  34. package/dist/formatters/sparksql/output-builder.d.ts +89 -0
  35. package/dist/formatters/sparksql/output-builder.js +191 -0
  36. package/dist/formatters/sparksql/parse-tree-analyzer.d.ts +264 -0
  37. package/dist/formatters/sparksql/parse-tree-analyzer.js +1956 -0
  38. package/dist/formatters/sparksql/sql-formatter.d.ts +25 -0
  39. package/dist/formatters/sparksql/sql-formatter.js +56 -0
  40. package/dist/formatters/sparksql/token-utils.d.ts +68 -0
  41. package/dist/formatters/sparksql/token-utils.js +155 -0
  42. package/dist/formatters/sparksql/types.d.ts +264 -0
  43. package/dist/formatters/sparksql/types.js +7 -0
  44. package/dist/formatters/types.d.ts +57 -0
  45. package/dist/formatters/types.js +7 -0
  46. package/dist/index.d.ts +18 -0
  47. package/dist/index.js +41 -0
  48. package/dist/notebook-formatter.d.ts +107 -0
  49. package/dist/notebook-formatter.js +424 -0
  50. package/package.json +63 -0
@@ -0,0 +1,1276 @@
1
+ /**
2
+ * Spark SQL Formatter - Main Entry Point
3
+ *
4
+ * This is the 100% grammar-driven SQL formatter for Apache Spark SQL.
5
+ * NO HARDCODED KEYWORD, FUNCTION, OR CLAUSE LISTS.
6
+ * Everything derived from ANTLR lexer symbolicNames and parse tree context.
7
+ *
8
+ * Architecture:
9
+ * - types.ts: TypeScript interfaces
10
+ * - token-utils.ts: Grammar-derived token detection
11
+ * - parse-tree-analyzer.ts: AST visitor that collects formatting context
12
+ * - formatting-context.ts: State management during formatting
13
+ * - output-builder.ts: Output construction with column tracking
14
+ * - formatter.ts (this file): Main orchestration
15
+ */
16
+ import antlr4 from 'antlr4';
17
+ // Internal modules
18
+ import { SqlBaseLexer, getSymbolicName, isKeywordToken, isFunctionLikeKeyword } from './token-utils.js';
19
+ // @ts-ignore - Generated ANTLR code
20
+ import SqlBaseParser from './generated/SqlBaseParser.js';
21
+ import { ParseTreeAnalyzer } from './parse-tree-analyzer.js';
22
+ import { createInitialState, ExpandedFunctionStack, CommentManager, indentCalc, isUnaryOperator, shouldExpandFunction, shouldExpandWindow, shouldExpandPivot } from './formatting-context.js';
23
+ import { OutputBuilder, outputComments, shouldSkipSpace, shouldAddCommaSpace, formatHintContent } from './output-builder.js';
24
+ import { SPARK_BUILTIN_FUNCTIONS } from './generated/builtinFunctions.js';
25
+ import { MAX_LINE_WIDTH } from './constants.js';
26
+ import { hasFormatOff, detectCollapseDirectives, isFmtInlineComment } from './fmt-detector.js';
27
+ // ============================================================================
28
+ // PUBLIC API
29
+ // ============================================================================
/**
 * Format SQL - public entry point.
 *
 * Processing steps:
 *  1. Locate the first `%%sql` magic command (if any). Everything before it is
 *     kept verbatim; only the text after it is formatted.
 *  2. Split the remaining text into statements on semicolons.
 *  3. Pass each non-empty statement through the formatter, except statements
 *     that `hasFormatOff` flags, which are emitted trimmed but otherwise untouched.
 *  4. Join the statements with ";\n\n", re-append a trailing semicolon when the
 *     input ended with one, and put the prefix and magic command back in front.
 *
 * Any exception raised along the way results in the input being returned
 * unchanged (best-effort formatting).
 *
 * @param {string} sql - Raw cell or statement text.
 * @returns {string} Formatted text, or the original input on failure.
 */
export function formatSql(sql) {
    try {
        // %%sql only - %sql is not valid in Fabric
        const magicMatch = sql.match(/(%%sql)[ \t]*\n?/);
        const hasMagic = magicMatch !== null && magicMatch.index !== undefined;
        const prefix = hasMagic ? sql.substring(0, magicMatch.index) : '';
        const magicCommand = hasMagic ? magicMatch[1] : '';
        const sqlToFormat = hasMagic
            ? sql.substring(magicMatch.index + magicMatch[0].length)
            : sql;
        const formattedStatements = splitOnSemicolons(sqlToFormat)
            .map((stmt) => stmt.trim())
            .filter((stmt) => stmt.length > 0)
            // Statement-level fmt:off bypasses formatting entirely
            .map((stmt) => (hasFormatOff(stmt) ? stmt : formatSingleStatement(stmt)));
        let result = formattedStatements.join(';\n\n');
        // Keep the terminating semicolon when the source had one
        if (sqlToFormat.trimEnd().endsWith(';')) {
            result += ';';
        }
        if (!magicCommand) {
            return result;
        }
        // Re-attach the untouched prefix and the magic command on its own line
        return prefix + magicCommand + '\n' + result;
    }
    catch {
        return sql;
    }
}
/**
 * Report whether formatting would change the given SQL.
 *
 * @param {string} sql - Text to check.
 * @returns {boolean} True when `formatSql(sql)` differs from `sql`.
 */
export function needsFormatting(sql) {
    const formatted = formatSql(sql);
    return formatted !== sql;
}
83
+ // ============================================================================
84
+ // STATEMENT SPLITTING
85
+ // ============================================================================
/**
 * Split SQL into statements on semicolons.
 *
 * A semicolon only terminates a statement when it appears in plain code.
 * Semicolons (and quote characters) are ignored inside:
 *  - single- or double-quoted strings (backslash escapes the next character),
 *  - backtick-quoted identifiers,
 *  - `--` line comments (up to the next newline),
 *  - bracketed comments; nesting depth is tracked because Spark SQL accepts
 *    nested bracketed comments.
 *
 * Without the comment/backtick tracking, an apostrophe in a comment such as
 * `-- don't` would flip the quote state and suppress every later split.
 *
 * Statements are returned untrimmed and without their terminating semicolon;
 * whitespace-only statements are dropped.
 *
 * @param {string} sql - Text that may contain several statements.
 * @returns {string[]} The non-blank statements in source order.
 */
function splitOnSemicolons(sql) {
    const statements = [];
    let current = '';
    let quote = null; // active quote character: ', " or `
    let inLineComment = false;
    let blockCommentDepth = 0;
    const flush = () => {
        if (current.trim().length > 0) {
            statements.push(current);
        }
        current = '';
    };
    for (let i = 0; i < sql.length; i++) {
        const ch = sql[i];
        const hasNext = i + 1 < sql.length;
        const pair = hasNext ? ch + sql[i + 1] : ch;
        if (inLineComment) {
            current += ch;
            if (ch === '\n') {
                inLineComment = false;
            }
            continue;
        }
        if (blockCommentDepth > 0) {
            if (pair === '/*' || pair === '*/') {
                blockCommentDepth += pair === '/*' ? 1 : -1;
                current += pair;
                i++;
            }
            else {
                current += ch;
            }
            continue;
        }
        if (quote !== null) {
            current += ch;
            if (ch === '\\' && quote !== '`' && hasNext) {
                // Escaped character inside a string: copy it without interpreting it
                current += sql[i + 1];
                i++;
            }
            else if (ch === quote) {
                quote = null;
            }
            continue;
        }
        // Plain code from here on
        if (ch === '\\') {
            // Kept from the previous implementation: a backslash outside of
            // quotes also shields the following character.
            current += ch;
            if (hasNext) {
                current += sql[i + 1];
                i++;
            }
            continue;
        }
        if (pair === '--') {
            inLineComment = true;
            current += pair;
            i++;
            continue;
        }
        if (pair === '/*') {
            blockCommentDepth = 1;
            current += pair;
            i++;
            continue;
        }
        if (ch === "'" || ch === '"' || ch === '`') {
            quote = ch;
            current += ch;
            continue;
        }
        if (ch === ';') {
            flush();
            continue;
        }
        current += ch;
    }
    // Unterminated trailing statement (or unterminated quote/comment)
    flush();
    return statements;
}
130
+ // ============================================================================
131
+ // VARIABLE SUBSTITUTION HANDLING
132
+ // ============================================================================
/**
 * Spark SQL variable substitution pattern: ${variable_name}
 * These must be preserved exactly during formatting.
 */
const VARIABLE_PATTERN = /\$\{([^}]+)\}/g;
/**
 * Replace ${variable} patterns with safe placeholders before formatting.
 *
 * Placeholders have the form `_SPARKVAR_N_`, which looks like an identifier
 * and therefore will not get spaces added by the formatter. If the input
 * already contains the text `_SPARKVAR_`, the stem is lengthened
 * (`_SPARKVARX_`, `_SPARKVARXX_`, ...) until it no longer occurs in the
 * input, so restoring a placeholder can never touch the user's own text.
 *
 * @param {string} sql - Statement text that may contain ${...} references.
 * @returns {{ sql: string, substitutions: Array<{ placeholder: string, original: string }> }}
 *   The rewritten SQL plus the placeholder/original pairs needed to restore it.
 */
function extractVariables(sql) {
    // Choose a placeholder stem that does not occur anywhere in the input
    let padding = '';
    while (sql.includes(`_SPARKVAR${padding}_`)) {
        padding += 'X';
    }
    const stem = `_SPARKVAR${padding}_`;
    const substitutions = [];
    const modifiedSql = sql.replace(VARIABLE_PATTERN, (match) => {
        const placeholder = `${stem}${substitutions.length}_`;
        substitutions.push({ placeholder, original: match });
        return placeholder;
    });
    return { sql: modifiedSql, substitutions };
}
/**
 * Restore original ${variable} patterns after formatting.
 *
 * Each placeholder is replaced (first occurrence only) by its original text.
 * A replacer function is used instead of a replacement string so that `$`
 * sequences inside the original text (`$&`, `$$`, `$'`, ...) are inserted
 * literally rather than being interpreted as replacement patterns.
 *
 * @param {string} sql - Formatted SQL containing placeholders.
 * @param {Array<{ placeholder: string, original: string }>} substitutions -
 *   Pairs produced by `extractVariables`.
 * @returns {string} SQL with every placeholder swapped back.
 */
function restoreVariables(sql, substitutions) {
    let result = sql;
    for (const { placeholder, original } of substitutions) {
        result = result.replace(placeholder, () => original);
    }
    return result;
}
165
+ // ============================================================================
166
+ // SINGLE STATEMENT FORMATTING
167
+ // ============================================================================
/**
 * Pre-normalize SQL to fix tokenization mismatches.
 * Some SQL constructs tokenize differently based on case:
 * - Scientific notation: 1.23e10 (lowercase 'e') vs 1.23E10 (uppercase 'E')
 *
 * The exponent marker of numeric literals is rewritten to uppercase so the
 * original-case and uppercased token streams align. Only numbers in plain
 * code are touched: string literals, quoted identifiers, comments and
 * identifiers (e.g. `'1e5'`, `col1e5`) are left byte-for-byte intact, because
 * the formatted output is built from the normalized text and must not alter
 * user data.
 *
 * @param {string} sql - Statement text.
 * @returns {string} Text with numeric exponents written as `E`.
 */
function normalizeForTokenization(sql) {
    // Spans that are consumed whole and returned unchanged
    const protectedSpans = [
        /r'[^']*'|r"[^"]*"/, // raw strings: backslash is not an escape
        /'(?:\\[\s\S]|[^'\\])*'/, // single-quoted string
        /"(?:\\[\s\S]|[^"\\])*"/, // double-quoted string
        /`[^`]*`/, // backtick-quoted identifier
        /--[^\n]*/, // line comment
        /\/\*[\s\S]*?\*\//, // bracketed comment
        /[a-z_][a-z0-9_]*/, // identifier or keyword (may embed digits + 'e')
    ];
    // Pattern matches: integer part (optional decimal), 'e', optional +/-, exponent
    // Examples: 1e10, 1.23e10, .5e-3, 1.e+5
    const numberWithExponent = /(\d+(?:\.\d*)?|\.\d+)e([+-]?\d+)/;
    const scanner = new RegExp(
        [...protectedSpans, numberWithExponent].map((re) => re.source).join('|'),
        'gi'
    );
    return sql.replace(scanner, (match, mantissa, exponent) => {
        // The capture groups are only set when the number alternative matched
        if (mantissa === undefined) {
            return match;
        }
        return `${mantissa}E${exponent}`;
    });
}
/**
 * Format a single SQL statement.
 *
 * Pipeline: swap ${variable} references for placeholders, normalize numeric
 * exponents, lex + parse an uppercased copy (the grammar matches uppercase
 * keywords), analyze the parse tree, re-lex the normalized original-case text
 * to recover the token texts, format, and finally restore the variables.
 *
 * @param {string} sql - One statement, without its terminating semicolon.
 * @returns {string} The formatted statement, or `sql` unchanged when parsing
 *   throws or any other step fails (the failure is logged via console.error).
 */
function formatSingleStatement(sql) {
    // Lex `source` completely and hand back the filled token stream
    const tokenize = (source) => {
        const stream = new antlr4.CommonTokenStream(new SqlBaseLexer(new antlr4.InputStream(source)));
        stream.fill();
        return stream;
    };
    try {
        const extracted = extractVariables(sql);
        const normalizedSql = normalizeForTokenization(extracted.sql);
        // Uppercased stream drives parsing
        const tokens = tokenize(normalizedSql.toUpperCase());
        const parser = new SqlBaseParser(tokens);
        // @ts-ignore
        parser.removeErrorListeners?.();
        let tree;
        try {
            tree = parser.singleStatement();
        }
        catch {
            return sql;
        }
        const analyzer = new ParseTreeAnalyzer();
        analyzer.visit(tree);
        const analysis = analyzer.getResult();
        // Original-case stream supplies the token texts (aligned with the uppercase stream)
        const origTokens = tokenize(normalizedSql);
        const formatDirectives = detectCollapseDirectives(normalizedSql);
        const formatted = formatTokens(tokens.tokens, origTokens.tokens, analysis, formatDirectives);
        return restoreVariables(formatted, extracted.substitutions);
    }
    catch (e) {
        console.error('Formatter error:', e.message, e.stack);
        return sql;
    }
}
/**
 * Format tokens using the analysis result.
 *
 * Walks the uppercased token stream and the original-case token stream in
 * lockstep (same index in both lists), emitting each visible token's original
 * text through an OutputBuilder while the analysis result decides newlines,
 * indentation, and function/window/PIVOT/IN-list expansion.
 *
 * @param {Array} tokenList - Tokens lexed from the uppercased SQL; supplies
 *   token types and token indices.
 * @param {Array} allOrigTokens - Tokens lexed from the original-case SQL;
 *   supplies the text that is written out.
 * @param {object} analysis - Result of ParseTreeAnalyzer#getResult().
 * @param {object} formatDirectives - Collapse directives; its
 *   `forceInlineRanges` property is assigned by this function.
 * @returns {string} The formatted statement text.
 */
function formatTokens(tokenList, allOrigTokens, analysis, formatDirectives) {
    const builder = new OutputBuilder();
    const state = createInitialState();
    const expandedFuncs = new ExpandedFunctionStack();
    const comments = new CommentManager();
    let currentExpandedWindow = null;
    let currentExpandedPivot = null;
    let lastProcessedIndex = -1;
    // Populate force-inline ranges from fmt:inline comments (grammar-driven approach)
    const forceInlineRanges = findForceInlineRanges(allOrigTokens, analysis);
    formatDirectives.forceInlineRanges = forceInlineRanges;
    let activeInList = null;
    // Names treated as partition transform functions when followed by '('.
    // Built once here rather than once per token inside the main loop.
    const partitionTransformFunctions = new Set([
        'BUCKET', 'TRUNCATE',
        'YEAR', 'YEARS', 'MONTH', 'MONTHS',
        'DAY', 'DAYS', 'HOUR', 'HOURS',
    ]);
    // Track which simple queries are actually compact (fit within line width)
    const compactQueries = new Set();
    for (const [selectToken, info] of analysis.simpleQueries) {
        // For subqueries (depth > 0), apply tighter width constraint to account for
        // surrounding context (CTE prefix, parentheses, outer query continuation).
        // Typical overhead is 20-40 chars for "WITH name AS (" + ") SELECT ..."
        const effectiveMaxWidth = info.depth > 0 ? MAX_LINE_WIDTH - 40 : MAX_LINE_WIDTH;
        if (info.spanLength <= effectiveMaxWidth) {
            compactQueries.add(selectToken);
        }
    }
    // Rough single-line length of the whole statement: every non-WS, non-EOF
    // token's text plus one separating space.
    const estimateStatementLength = () => {
        let estimatedQueryLength = 0;
        for (const tok of tokenList) {
            if (tok.type !== SqlBaseLexer.WS && tok.type !== antlr4.Token.EOF) {
                estimatedQueryLength += (tok.text?.length || 0) + 1; // +1 for space
            }
        }
        return estimatedQueryLength;
    };
    // Check if set operations should stay inline
    // Only inline if: 1) there are set operation parens, 2) total length is short,
    // 3) no multi-item clauses exist
    let isShortSetOperation = false;
    if (analysis.setOperandParens.size > 0) {
        const estimatedQueryLength = estimateStatementLength();
        const hasMultiItemClause = analysis.multiItemClauses.size > 0;
        isShortSetOperation = estimatedQueryLength <= MAX_LINE_WIDTH && !hasMultiItemClause;
    }
    // Check if VALUES statement should stay inline (simple values list)
    // Only inline if: 1) has values commas, 2) total length is short, 3) NOT tuples (row format)
    // VALUES 1, 2, 3 -> stays inline if short
    // VALUES (1, 'a'), (2, 'b') -> always expands (has tuples)
    let isShortValues = false;
    if (analysis.valuesCommas.size > 0 && !analysis.valuesHasTuples) {
        isShortValues = estimateStatementLength() <= MAX_LINE_WIDTH;
    }
    // Helper to find next token that is not whitespace, EOF or a comment (-1 if none)
    const findNextNonWsTokenIndex = (startIdx) => {
        for (let j = startIdx; j < tokenList.length; j++) {
            const tok = tokenList[j];
            if (tok.type !== SqlBaseLexer.WS &&
                tok.type !== antlr4.Token.EOF &&
                tok.type !== SqlBaseLexer.SIMPLE_COMMENT &&
                tok.type !== SqlBaseLexer.BRACKETED_COMMENT) {
                return j;
            }
        }
        return -1;
    };
    // Helper to queue the hidden-channel comment tokens in [startIdx, endIdx)
    const collectComments = (startIdx, endIdx) => {
        for (let j = startIdx; j < endIdx; j++) {
            const hiddenToken = allOrigTokens[j];
            if (hiddenToken && hiddenToken.channel === 1) {
                if (hiddenToken.type === SqlBaseLexer.SIMPLE_COMMENT ||
                    hiddenToken.type === SqlBaseLexer.BRACKETED_COMMENT) {
                    const wasOnOwnLine = CommentManager.checkWasOnOwnLine(j, hiddenToken, allOrigTokens);
                    const hadBlankLineBefore = CommentManager.checkHadBlankLineBefore(j, allOrigTokens);
                    comments.add({ text: hiddenToken.text, type: hiddenToken.type, wasOnOwnLine, hadBlankLineBefore });
                }
            }
        }
    };
    for (let i = 0; i < tokenList.length && i < allOrigTokens.length; i++) {
        const token = tokenList[i];
        const origToken = allOrigTokens[i];
        if (token.type === antlr4.Token.EOF)
            continue;
        // Collect hidden tokens (comments) unless a comma look-ahead already did
        const wasAlreadyProcessed = lastProcessedIndex >= i;
        if (!wasAlreadyProcessed) {
            collectComments(lastProcessedIndex + 1, i);
        }
        lastProcessedIndex = Math.max(lastProcessedIndex, i);
        // Skip WS tokens
        if (token.type === SqlBaseLexer.WS)
            continue;
        // Handle comment tokens directly
        if (token.type === SqlBaseLexer.SIMPLE_COMMENT ||
            token.type === SqlBaseLexer.BRACKETED_COMMENT) {
            if (!wasAlreadyProcessed) {
                const wasOnOwnLine = CommentManager.checkWasOnOwnLine(i, origToken, allOrigTokens);
                const hadBlankLineBefore = CommentManager.checkHadBlankLineBefore(i, allOrigTokens);
                comments.add({ text: origToken.text, type: token.type, wasOnOwnLine, hadBlankLineBefore });
            }
            continue;
        }
        const text = origToken.text;
        const tokenType = token.type;
        const tokenIndex = token.tokenIndex;
        const symbolicName = getSymbolicName(tokenType);
        // Handle hints: buffer everything between the hint start and end tokens
        if (tokenType === SqlBaseLexer.HENT_START) {
            builder.addSpaceIfNeeded();
            state.insideHint = true;
            state.hintContent = [];
            builder.push('/*+');
            continue;
        }
        if (state.insideHint) {
            if (tokenType === SqlBaseLexer.HENT_END) {
                const formatted = formatHintContent(state.hintContent.join(''));
                builder.push(' ' + formatted + ' ');
                builder.push('*/');
                state.insideHint = false;
                state.hintContent = [];
                state.prevWasFunctionName = false;
                continue;
            }
            else {
                if (state.hintContent.length > 0) {
                    const lastElement = state.hintContent[state.hintContent.length - 1];
                    const needsSpace = lastElement !== '(' && lastElement !== ' ' &&
                        text !== ')' && text !== ',';
                    if (needsSpace)
                        state.hintContent.push(' ');
                }
                state.hintContent.push(text);
                continue;
            }
        }
        // Skip AS tokens in table alias context (style: table aliases have no AS)
        if (analysis.tableAliasAsTokens.has(tokenIndex)) {
            continue;
        }
        // Get context from analysis
        const ctx = getTokenContext(tokenIndex, analysis);
        // Compact query tracking: each subquery level is evaluated independently
        // When we hit a SELECT, check if THAT query is compact and push to stack
        if (symbolicName === 'SELECT' && ctx.isClauseStart) {
            const isThisQueryCompact = compactQueries.has(tokenIndex);
            // Push compact state for this query level
            state.compactQueryStack.push({
                isCompact: isThisQueryCompact,
                depth: state.subqueryDepth
            });
        }
        // Pop compact query state when we exit a subquery (depth decreases)
        while (state.compactQueryStack.length > 0 &&
            state.compactQueryStack[state.compactQueryStack.length - 1].depth > state.subqueryDepth) {
            state.compactQueryStack.pop();
        }
        // Also pop on semicolon (statement end at depth 0)
        if (text === ';' && state.subqueryDepth === 0 && state.compactQueryStack.length > 0) {
            state.compactQueryStack.pop();
        }
        // Current query is compact if the top of the stack says so
        const inCompactQuery = state.compactQueryStack.length > 0 &&
            state.compactQueryStack[state.compactQueryStack.length - 1].isCompact;
        // Get multi-arg function / window / pivot info keyed by this token
        const multiArgFuncInfo = analysis.multiArgFunctionInfo.get(tokenIndex);
        const windowDefInfo = analysis.windowDefInfo.get(tokenIndex);
        const pivotInfoLookup = analysis.pivotInfo.get(tokenIndex);
        // Check expanded function state
        const isExpandedFunctionComma = expandedFuncs.isComma(tokenIndex);
        const isExpandedFunctionCloseParen = expandedFuncs.isCloseParen(tokenIndex);
        // Check expanded window state
        const isExpandedWindowOrderBy = currentExpandedWindow?.orderByTokenIndex === tokenIndex;
        const isExpandedWindowFrame = currentExpandedWindow?.windowFrameTokenIndex === tokenIndex;
        const isExpandedWindowCloseParen = currentExpandedWindow?.closeParenIndex === tokenIndex;
        // Check expanded pivot state
        const isExpandedPivotAggregateComma = currentExpandedPivot?.aggregateCommaIndices.has(tokenIndex) ?? false;
        const isExpandedPivotForKeyword = currentExpandedPivot?.forKeywordIndex === tokenIndex;
        const isExpandedPivotInKeyword = currentExpandedPivot?.inKeywordIndex === tokenIndex;
        // Don't use comma-first expansion for PIVOT IN lists - let IN list wrapping handle it
        const isExpandedPivotInListComma = false; // Disabled - use IN list wrapping instead
        const isExpandedPivotCloseParen = currentExpandedPivot?.closeParenIndex === tokenIndex;
        // Detect unary operator
        const currentTokenIsUnaryOperator = isUnaryOperator(text, state.prevTokenText, state.prevTokenType);
        // Get next token type for lookahead (skip WS and comment tokens)
        let nextTokenType = null;
        for (let j = i + 1; j < tokenList.length; j++) {
            const nextToken = tokenList[j];
            if (nextToken.type !== SqlBaseLexer.WS &&
                nextToken.type !== SqlBaseLexer.SIMPLE_COMMENT &&
                nextToken.type !== SqlBaseLexer.BRACKETED_COMMENT) {
                nextTokenType = nextToken.type;
                break;
            }
        }
        // Determine output text
        const outputText = determineOutputText(tokenIndex, tokenType, text, symbolicName, ctx, analysis, nextTokenType);
        // Check for function-like keyword
        const isBuiltInFunctionKeyword = isFunctionLikeKeyword(tokenType, text);
        // Track function argument depth
        if (text === '(' && (state.prevWasFunctionName || state.prevWasBuiltInFunctionKeyword)) {
            state.insideFunctionArgs++;
        }
        else if (text === ')' && state.insideFunctionArgs > 0) {
            state.insideFunctionArgs--;
        }
        // Track paren depth
        if (text === '(')
            state.insideParens++;
        else if (text === ')' && state.insideParens > 0)
            state.insideParens--;
        // Track complex type depth for ARRAY<>, MAP<>, STRUCT<>
        // These use < and > instead of () for type parameters
        // Note: We increment depth BEFORE processing (for opening <) but decrement AFTER (for closing >)
        const prevSymName = state.prevTokenType >= 0 ? getSymbolicName(state.prevTokenType) : null;
        const prevWasComplexTypeKeyword = prevSymName === 'ARRAY' || prevSymName === 'MAP' || prevSymName === 'STRUCT';
        if (text === '<' && (prevWasComplexTypeKeyword || state.complexTypeDepth > 0)) {
            state.complexTypeDepth++;
        }
        // Store if we should decrement after output (for closing >)
        const shouldDecrementComplexTypeAfter = text === '>' && state.complexTypeDepth > 0;
        // Track IN list wrapping - check if we're entering an IN list
        const inListInfo = analysis.inListInfo.get(tokenIndex);
        // Check if we're exiting an IN list
        if (activeInList && tokenIndex === activeInList.closeParenIndex) {
            // Exiting the IN list
            activeInList = null;
        }
        // Handle AS keyword insertion
        if (analysis.aliasInsertPositions.has(tokenIndex)) {
            builder.addSpaceIfNeeded();
            builder.push('AS');
        }
        // Determine newlines and indent
        const { needsNewline, indent } = determineNewlineAndIndent(tokenIndex, text, symbolicName, ctx, analysis, state, expandedFuncs, currentExpandedWindow, currentExpandedPivot, isExpandedFunctionComma, isExpandedFunctionCloseParen, isExpandedWindowOrderBy, isExpandedWindowFrame, isExpandedWindowCloseParen, isExpandedPivotAggregateComma, isExpandedPivotForKeyword, isExpandedPivotInKeyword, isExpandedPivotInListComma, isExpandedPivotCloseParen, inCompactQuery, isShortSetOperation, isShortValues);
        // Handle list commas - look ahead for comments
        if (ctx.isListComma && state.insideFunctionArgs === 0) {
            const nextIdx = findNextNonWsTokenIndex(i + 1);
            if (nextIdx > 0) {
                collectComments(i + 1, nextIdx);
                lastProcessedIndex = nextIdx - 1;
            }
        }
        // Similar look-ahead for other comma types
        if (ctx.isCteComma || ctx.isDdlComma || ctx.isValuesComma || ctx.isSetComma || isExpandedFunctionComma) {
            const nextIdx = findNextNonWsTokenIndex(i + 1);
            if (nextIdx > 0) {
                collectComments(i + 1, nextIdx);
                lastProcessedIndex = nextIdx - 1;
            }
        }
        // Apply spacing/newlines
        if (needsNewline) {
            outputWithNewline(builder, comments, indent, state);
        }
        else {
            outputWithoutNewline(builder, comments, text, symbolicName, state, currentTokenIsUnaryOperator, ctx.isLateralViewComma);
        }
        builder.push(outputText);
        // Handle IN list wrapping: after outputting a comma in an IN list,
        // check if the next item would exceed line width
        if (activeInList && activeInList.commaIndices.has(tokenIndex) && text === ',') {
            // Look ahead to estimate the length of the next item
            const nextItemLength = estimateNextInListItemLength(tokenList, i, findNextNonWsTokenIndex, activeInList.closeParenIndex);
            const currentCol = builder.getColumn();
            // Add 1 for the space after comma
            if (currentCol + 1 + nextItemLength > MAX_LINE_WIDTH) {
                // Wrap to new line with indent
                builder.push('\n' + ' '.repeat(activeInList.wrapIndent));
                state.justOutputInListWrapNewline = true;
            }
        }
        // Activate IN list tracking AFTER we push the opening paren
        if (inListInfo && text === '(') {
            let wrapIndent = builder.getColumn(); // Column right after the (
            // If wrap indent exceeds 60% of line width, fall back to current indent + 4
            const maxWrapIndent = Math.floor(MAX_LINE_WIDTH * 0.6);
            if (wrapIndent > maxWrapIndent) {
                // Find current line's base indent (position of first non-space on this line)
                // Since we just pushed '(', go back to find the line start indent
                const currentOutput = builder.toString();
                const lastNewline = currentOutput.lastIndexOf('\n');
                const lineStart = lastNewline >= 0 ? currentOutput.slice(lastNewline + 1) : currentOutput;
                const baseIndentMatch = lineStart.match(/^(\s*)/);
                const baseIndent = baseIndentMatch ? baseIndentMatch[1].length : 0;
                wrapIndent = baseIndent + 4; // Fall back to base indent + 1 indent level
            }
            activeInList = {
                wrapIndent,
                closeParenIndex: inListInfo.closeParenIndex,
                commaIndices: new Set(inListInfo.commaIndices),
            };
        }
        // Handle multi-WHEN CASE newline after CASE or after value expression
        // For searchedCase (CASE WHEN ...), newline goes after CASE
        // For simpleCase (CASE x WHEN ...), newline goes after value expression
        if (analysis.multiWhenCaseTokens.has(tokenIndex)) {
            // Check if this CASE has a value expression (simpleCase)
            // If so, we'll add the newline after the value, not here
            const isSimpleCase = analysis.simpleCaseTokens?.has(tokenIndex);
            if (!isSimpleCase) {
                // searchedCase - newline right after CASE
                builder.push('\n');
            }
            state.caseDepth++;
        }
        // For simpleCase, add newline after the value expression
        if (analysis.simpleCaseValueEndTokens?.has(tokenIndex)) {
            builder.push('\n');
        }
        // Track subquery depth changes
        if (ctx.isSubqueryOpenParen)
            state.subqueryDepth++;
        else if (ctx.isSubqueryCloseParen && state.subqueryDepth > 0)
            state.subqueryDepth--;
        // Track DDL depth
        if (ctx.isDdlOpenParen && ctx.isDdlMultiColumn) {
            builder.push('\n' + ' '.repeat(state.subqueryDepth + 1));
            state.ddlDepth++;
        }
        else if (ctx.isDdlCloseParen && state.ddlDepth > 0) {
            state.ddlDepth--;
        }
        // Handle multi-arg function expansion
        // Check if this token is force-inline (either line-based legacy or grammar-driven)
        const tokenLine = allOrigTokens[i]?.line || 0;
        const lineBasedForceCollapse = formatDirectives.collapsedLines.has(tokenLine);
        const grammarBasedForceCollapse = isForceInlineOpen(tokenIndex, forceInlineRanges);
        const forceCollapse = lineBasedForceCollapse || grammarBasedForceCollapse;
        if (multiArgFuncInfo && !forceCollapse && shouldExpandFunction(builder.getColumn(), multiArgFuncInfo)) {
            handleFunctionExpansion(builder, expandedFuncs, multiArgFuncInfo, tokenList, i, findNextNonWsTokenIndex, analysis, state);
        }
        // Handle window expansion (pass multiArgFunctionInfo to check nested function expansion)
        if (windowDefInfo && !forceCollapse && shouldExpandWindow(builder.getColumn(), windowDefInfo, analysis.multiArgFunctionInfo)) {
            currentExpandedWindow = {
                closeParenIndex: windowDefInfo.closeParenIndex,
                orderByTokenIndex: windowDefInfo.orderByTokenIndex,
                windowFrameTokenIndex: windowDefInfo.windowFrameTokenIndex,
                baseDepth: state.subqueryDepth
            };
            const newIndent = '\n' + ' '.repeat(indentCalc.getWindowContentIndent(state.subqueryDepth));
            builder.push(newIndent);
            state.justOutputWindowNewline = true;
        }
        // Handle PIVOT/UNPIVOT expansion
        if (pivotInfoLookup && !forceCollapse && shouldExpandPivot(builder.getColumn(), pivotInfoLookup)) {
            currentExpandedPivot = {
                closeParenIndex: pivotInfoLookup.closeParenIndex,
                aggregateCommaIndices: new Set(pivotInfoLookup.aggregateCommaIndices),
                forKeywordIndex: pivotInfoLookup.forKeywordIndex,
                inKeywordIndex: pivotInfoLookup.inKeywordIndex,
                inListCommaIndices: new Set(pivotInfoLookup.inListCommaIndices),
                depth: state.subqueryDepth,
                openingColumn: builder.getColumn() - 1
            };
            // Output newline after opening paren
            const pivotIndent = '\n' + ' '.repeat(indentCalc.getPivotContentIndent(state.subqueryDepth));
            builder.push(pivotIndent);
            state.justOutputPivotNewline = true;
        }
        // Pop expanded function on close paren
        if (isExpandedFunctionCloseParen && !expandedFuncs.isEmpty()) {
            expandedFuncs.pop();
        }
        // Clear expanded window on close paren
        if (isExpandedWindowCloseParen && currentExpandedWindow) {
            currentExpandedWindow = null;
        }
        // Clear expanded pivot on close paren
        if (isExpandedPivotCloseParen && currentExpandedPivot) {
            currentExpandedPivot = null;
        }
        // Reset the "just output a newline" flags once a regular token has been emitted
        if (state.justOutputMultiArgFunctionNewline && text !== ',' && text !== '(') {
            state.justOutputMultiArgFunctionNewline = false;
        }
        if (state.justOutputWindowNewline && text !== '(' && text !== ',') {
            state.justOutputWindowNewline = false;
        }
        if (state.justOutputPivotNewline && text !== '(' && text !== ',') {
            state.justOutputPivotNewline = false;
        }
        if (state.justOutputInListWrapNewline && text !== ',') {
            state.justOutputInListWrapNewline = false;
        }
        if (state.justOutputCommaFirstStyle && text !== ',') {
            state.justOutputCommaFirstStyle = false;
        }
        // Decrease CASE depth after END
        if (analysis.caseEndTokens.has(tokenIndex) && state.caseDepth > 0) {
            state.caseDepth--;
        }
        // Decrement complex type depth after outputting closing >
        if (shouldDecrementComplexTypeAfter) {
            state.complexTypeDepth--;
        }
        // Reset clause flags
        updateClauseFlags(symbolicName, ctx, state);
        // Check if this token is a partition transform function (followed by paren)
        const isPartitionTransformFunc = partitionTransformFunctions.has(text.toUpperCase()) &&
            nextTokenType !== null && getSymbolicName(nextTokenType) === 'LEFT_PAREN';
        // Update previous token tracking
        state.prevWasFunctionName = ctx.isFunctionCall || isPartitionTransformFunc;
        state.prevWasBuiltInFunctionKeyword = isBuiltInFunctionKeyword;
        state.isFirstNonWsToken = false;
        state.prevTokenWasUnaryOperator = currentTokenIsUnaryOperator;
        state.prevTokenText = text;
        state.prevTokenType = tokenType;
    }
    // Output remaining comments
    if (comments.hasPending()) {
        outputComments(builder, comments.getPending());
    }
    return builder.toString();
}
656
+ // ============================================================================
657
+ // HELPER FUNCTIONS
658
+ // ============================================================================
659
/**
 * Report whether a token index is one of the commas recorded for any IN list.
 *
 * The formatter consults this so that IN-list commas are kept out of the
 * regular list-comma handling.
 *
 * @param {number} tokenIndex - Index of the token being examined.
 * @param {object} analysis - Analysis result; `inListInfo` is iterated as
 *     `[key, info]` pairs and each `info.commaIndices` is searched.
 * @returns {boolean} True when some IN list's `commaIndices` contains `tokenIndex`.
 */
function isInListComma(tokenIndex, analysis) {
    return Array.from(analysis.inListInfo).some(([, listInfo]) => listInfo.commaIndices.includes(tokenIndex));
}
671
/**
 * Collect the token ranges of constructs that carry an fmt:inline comment.
 *
 * For every comment token whose text satisfies `isFmtInlineComment`, the
 * nearest earlier token that is neither whitespace nor a comment is used as an
 * anchor. Each multi-arg function, window definition and PIVOT/UNPIVOT entry
 * in `analysis` whose open..close span contains that anchor contributes one
 * range. A given open/close pair is reported at most once.
 *
 * @param {Array} allOrigTokens - Original token stream (falsy entries are skipped).
 * @param {object} analysis - Analysis result providing `multiArgFunctionInfo`,
 *     `windowDefInfo` and `pivotInfo`, each iterated as `[openIdx, info]` pairs
 *     where `info.closeParenIndex` marks the end of the construct.
 * @returns {Array<{openTokenIndex: number, closeTokenIndex: number}>} Ranges
 *     in discovery order.
 */
function findForceInlineRanges(allOrigTokens, analysis) {
    const collected = [];
    const seenKeys = new Set(); // "open-close" keys already reported
    const record = (openIdx, closeIdx) => {
        const key = `${openIdx}-${closeIdx}`;
        if (seenKeys.has(key)) {
            return;
        }
        seenKeys.add(key);
        collected.push({ openTokenIndex: openIdx, closeTokenIndex: closeIdx });
    };
    const isCommentType = (type) => type === SqlBaseLexer.SIMPLE_COMMENT ||
        type === SqlBaseLexer.BRACKETED_COMMENT;
    const isSignificant = (tok) => Boolean(tok) &&
        tok.type !== SqlBaseLexer.WS &&
        !isCommentType(tok.type);
    // Walk backwards from the comment to the closest real token; yields -1 if none exists.
    const anchorBefore = (commentIdx) => {
        let cursor = commentIdx - 1;
        while (cursor >= 0 && !isSignificant(allOrigTokens[cursor])) {
            cursor -= 1;
        }
        return cursor;
    };
    // Report every construct in `constructs` whose span covers the anchor
    // (including the case where the anchor is the construct's close paren).
    const recordEnclosing = (constructs, anchorIdx) => {
        for (const [openIdx, info] of constructs) {
            const closeIdx = info.closeParenIndex;
            const withinSpan = anchorIdx >= openIdx && anchorIdx <= closeIdx;
            if (withinSpan || anchorIdx === closeIdx) {
                record(openIdx, closeIdx);
            }
        }
    };
    for (let i = 0; i < allOrigTokens.length; i++) {
        const candidate = allOrigTokens[i];
        if (!candidate) {
            continue;
        }
        if (!isCommentType(candidate.type)) {
            continue;
        }
        if (!isFmtInlineComment(candidate.text || '')) {
            continue;
        }
        const anchorIdx = anchorBefore(i);
        recordEnclosing(analysis.multiArgFunctionInfo, anchorIdx);
        recordEnclosing(analysis.windowDefInfo, anchorIdx);
        recordEnclosing(analysis.pivotInfo, anchorIdx);
    }
    return collected;
}
748
/**
 * Tell whether `tokenIndex` opens one of the force-inline ranges.
 *
 * @param {number} tokenIndex - Token index to look up.
 * @param {Array<{openTokenIndex: number}>} ranges - Force-inline ranges.
 * @returns {boolean} True if any range has `openTokenIndex === tokenIndex`.
 */
function isForceInlineOpen(tokenIndex, ranges) {
    for (const range of ranges) {
        if (range.openTokenIndex === tokenIndex) {
            return true;
        }
    }
    return false;
}
754
/**
 * Roughly measure how wide the next IN-list item is.
 *
 * Scanning starts at the first non-whitespace token after `currentIndex` and
 * stops at the IN list's closing paren, at an unbalanced right paren, or at a
 * comma that is not nested inside parentheses.
 *
 * @param {Array} tokenList - Token list being formatted.
 * @param {number} currentIndex - Position in `tokenList` of the current comma.
 * @param {function(number): number} findNextNonWsTokenIndex - Returns the
 *     position of the next non-whitespace token at or after the given position.
 * @param {number} closeParenIndex - `tokenIndex` of the IN list's closing paren.
 * @returns {number} Estimated character count. Parentheses count only their
 *     own text; every other token counts its text plus one separating space.
 */
function estimateNextInListItemLength(tokenList, currentIndex, findNextNonWsTokenIndex, closeParenIndex) {
    let estimate = 0;
    let nesting = 0;
    for (let pos = findNextNonWsTokenIndex(currentIndex + 1); pos >= 0 && pos < tokenList.length; pos = findNextNonWsTokenIndex(pos + 1)) {
        const tok = tokenList[pos];
        const tokIndex = tok.tokenIndex;
        const tokText = tok.text || '';
        const kind = SqlBaseLexer.symbolicNames[tok.type];
        // The IN list itself ends here.
        if (tokIndex === closeParenIndex) {
            break;
        }
        // At nesting level zero a comma or a right paren terminates the item.
        if (nesting === 0 && (kind === 'COMMA' || kind === 'RIGHT_PAREN')) {
            break;
        }
        if (kind === 'LEFT_PAREN') {
            nesting += 1;
            estimate += tokText.length;
        }
        else if (kind === 'RIGHT_PAREN') {
            nesting -= 1;
            estimate += tokText.length;
        }
        else {
            // Token text plus a rough one-character allowance for spacing.
            estimate += tokText.length + 1;
        }
    }
    return estimate;
}
798
/**
 * Build the per-token context flags from the analysis result.
 *
 * Each flag is a membership test of `tokenIndex` in the corresponding
 * analysis collection, except `isSetKeyword`, which compares against the
 * single `analysis.setKeywordToken` value.
 *
 * @param {number} tokenIndex - Index of the token being formatted.
 * @param {object} analysis - Analysis result holding the token-index collections.
 * @returns {object} Plain object of boolean context flags.
 */
function getTokenContext(tokenIndex, analysis) {
    const isMember = (tokenSet) => tokenSet.has(tokenIndex);
    return {
        isInIdentifierContext: isMember(analysis.identifierTokens),
        isInQualifiedName: isMember(analysis.qualifiedNameTokens),
        isFunctionCall: isMember(analysis.functionCallTokens),
        isClauseStart: isMember(analysis.clauseStartTokens),
        isListComma: isMember(analysis.listItemCommas),
        isConditionOperator: isMember(analysis.conditionOperators),
        isBetweenAnd: isMember(analysis.betweenAndTokens),
        isJoinOn: isMember(analysis.joinOnTokens),
        isSubqueryOpenParen: isMember(analysis.subqueryOpenParens),
        isSubqueryCloseParen: isMember(analysis.subqueryCloseParens),
        isSetOperandParen: isMember(analysis.setOperandParens),
        isCteComma: isMember(analysis.cteCommas),
        isCteMainSelect: isMember(analysis.cteMainSelectTokens),
        isDdlComma: isMember(analysis.ddlColumnCommas),
        isDdlOpenParen: isMember(analysis.ddlOpenParens),
        isDdlCloseParen: isMember(analysis.ddlCloseParens),
        isDdlMultiColumn: isMember(analysis.ddlMultiColumn),
        isValuesComma: isMember(analysis.valuesCommas),
        isSetComma: isMember(analysis.setClauseCommas),
        isSetKeyword: analysis.setKeywordToken === tokenIndex,
        isLateralViewComma: isMember(analysis.lateralViewCommas),
        isMergeUsing: isMember(analysis.mergeUsingTokens),
        isMergeOn: isMember(analysis.mergeOnTokens),
        isMergeWhen: isMember(analysis.mergeWhenTokens),
    };
}
829
/**
 * Apply the casing rules to a token and return the text to emit.
 *
 * Rules are tried in this order; the first that applies decides the result:
 *   1. SET config tokens keep their original casing.
 *   2. GROUP BY ALL tokens are uppercased.
 *   3. Function-call tokens are uppercased when they are built-in functions or
 *      keyword tokens, otherwise left untouched.
 *   4. Structural keywords (AS, ON, AND, OR, IN, FOR, USING) are uppercased.
 *   5. Extension keywords (e.g. VACUUM, OPTIMIZE) are uppercased.
 *   6. Partition transform names are uppercased only when followed by '('.
 *   7. Tokens in identifier context keep their casing.
 *   8. Remaining keyword tokens are uppercased; everything else is unchanged.
 *
 * @param {number} tokenIndex - Index of the token in the original stream.
 * @param {number} tokenType - Lexer token type.
 * @param {string} text - Original token text.
 * @param {string|null} symbolicName - Symbolic lexer name of the token.
 * @param {object} ctx - Token context flags (`isFunctionCall`, `isInIdentifierContext`).
 * @param {object} analysis - Analysis result (`setConfigTokens`, `groupByAllTokens`).
 * @param {number|null} nextTokenType - Type of the following token, or null.
 * @returns {string} Text to output for this token.
 */
function determineOutputText(tokenIndex, tokenType, text, symbolicName, ctx, analysis, nextTokenType) {
    // SET config tokens: casing is significant, leave untouched.
    if (analysis.setConfigTokens.has(tokenIndex)) {
        return text;
    }
    // GROUP BY ALL: always uppercase.
    if (analysis.groupByAllTokens.has(tokenIndex)) {
        return text.toUpperCase();
    }
    // Function names: uppercase built-ins and keywords, keep user-defined names.
    if (ctx.isFunctionCall) {
        const builtIn = SPARK_BUILTIN_FUNCTIONS.has(text.toLowerCase()) || isKeywordToken(tokenType, text);
        if (builtIn) {
            return text.toUpperCase();
        }
        return text;
    }
    // Syntactic markers that stay uppercase even where the grammar accepts an
    // identifier, e.g. the AS in "LATERAL VIEW EXPLODE(arr) AS item".
    const alwaysUpperSymbols = ['AS', 'ON', 'AND', 'OR', 'IN', 'FOR', 'USING'];
    if (symbolicName && alwaysUpperSymbols.includes(symbolicName)) {
        return text.toUpperCase();
    }
    const upper = text.toUpperCase();
    // Keywords that are not part of the Spark grammar and are matched by text.
    const extensionWords = [
        // Spark SQL extensions not in grammar
        'SYSTEM', // SHOW SYSTEM FUNCTIONS
        'NOSCAN', // ANALYZE TABLE ... NOSCAN
        // Delta Lake keywords
        'VACUUM', 'RETAIN',
        'RESTORE',
        'CLONE', 'SHALLOW', 'DEEP',
        'OPTIMIZE', 'ZORDER',
    ];
    if (extensionWords.includes(upper)) {
        return upper;
    }
    // Partition transforms are uppercased only in call position:
    //   "PARTITIONED BY (bucket(3, col))" -> BUCKET
    //   "SELECT year FROM t"              -> year (column name, casing kept)
    const transformNames = [
        'BUCKET', 'TRUNCATE',
        'YEAR', 'YEARS', 'MONTH', 'MONTHS',
        'DAY', 'DAYS', 'HOUR', 'HOURS',
    ];
    if (transformNames.includes(upper) &&
        nextTokenType !== null &&
        getSymbolicName(nextTokenType) === 'LEFT_PAREN') {
        return upper;
    }
    // The parse tree marked this token as an identifier: keep the author's casing.
    if (ctx.isInIdentifierContext) {
        return text;
    }
    // Plain keywords are uppercased; anything else passes through.
    return isKeywordToken(tokenType, text) ? text.toUpperCase() : text;
}
903
+ /**
904
+ * Determine if a newline and indent are needed before this token.
+ *
+ * The body is a fixed sequence of mostly independent `if` statements. When more
+ * than one rule matches the same token, each matching rule assigns `indent`
+ * again, so the last matching rule in source order decides the final indent.
+ *
+ * Side effects: besides computing the result, this function mutates `state`.
+ * It sets the after-keyword flags (afterSelectKeyword, afterGroupByKeyword,
+ * afterOrderByKeyword, afterWhereKeyword, afterHavingKeyword, afterSetKeyword,
+ * afterValuesKeyword), isFirstListItem, currentClauseIsMultiItem and
+ * justOutputCommaFirstStyle.
+ *
+ * @param tokenIndex Index of the current token; used for all analysis lookups.
+ * @param text Token text. NOTE(review): not referenced in this function body.
+ * @param symbolicName Symbolic lexer name of the token (e.g. 'SELECT', 'ON').
+ * @param ctx Per-token context flags (isClauseStart, isListComma, isJoinOn, ...).
+ * @param analysis Analysis result (multiItemClauses, multilineConditionClauses,
+ *     multiWhenCaseTokens, caseWhenTokens, caseElseTokens, caseEndTokens,
+ *     exceptClauseTokens, and inListInfo via isInListComma).
+ * @param state Mutable formatting state; read and updated as described above.
+ * @param expandedFuncs Stack of expanded functions; only current() is read here.
+ * @param currentExpandedWindow Active expanded window (its baseDepth is read), or a falsy value.
+ * @param currentExpandedPivot Active expanded pivot (its depth is read), or a falsy value.
+ * @param isExpandedFunctionComma True if the token is a comma of the current expanded function.
+ * @param isExpandedFunctionCloseParen True if the token closes the current expanded function.
+ * @param isExpandedWindowOrderBy True if the token starts ORDER BY inside an expanded window.
+ * @param isExpandedWindowFrame True if the token starts the frame clause of an expanded window.
+ * @param isExpandedWindowCloseParen True if the token closes an expanded window.
+ * @param isExpandedPivotAggregateComma True if the token is an aggregate comma in an expanded pivot.
+ * @param isExpandedPivotForKeyword True if the token is the FOR keyword of an expanded pivot.
+ * @param isExpandedPivotInKeyword NOTE(review): not referenced in this function body.
+ * @param isExpandedPivotInListComma True if the token is an IN-list comma of an expanded pivot.
+ * @param isExpandedPivotCloseParen True if the token closes an expanded pivot.
+ * @param inCompactQuery Suppresses clause-start and subquery-close newlines when true.
+ * @param isShortSetOperation Suppresses clause-start and set-operand newlines when true.
+ * @param isShortValues Suppresses VALUES tuple newlines when true.
+ * @returns Object `{ needsNewline, indent }`; `indent` is the string to emit after the newline.
905
+ */
906
+ function determineNewlineAndIndent(tokenIndex, text, symbolicName, ctx, analysis, state, expandedFuncs, currentExpandedWindow, currentExpandedPivot, isExpandedFunctionComma, isExpandedFunctionCloseParen, isExpandedWindowOrderBy, isExpandedWindowFrame, isExpandedWindowCloseParen, isExpandedPivotAggregateComma, isExpandedPivotForKeyword, isExpandedPivotInKeyword, isExpandedPivotInListComma, isExpandedPivotCloseParen, inCompactQuery, isShortSetOperation, isShortValues) {
907
+ let needsNewline = false;
908
+ let indent = '';
909
+ const baseIndent = indentCalc.getBaseIndent(state.subqueryDepth, state.ddlDepth);
910
+ // Clause state updates: remember which clause keyword was just seen so the "first item" rules near the end of this function can place what follows it
911
+ if (symbolicName === 'SELECT' && ctx.isClauseStart) {
912
+ state.afterSelectKeyword = true;
913
+ state.isFirstListItem = true;
914
+ state.currentClauseIsMultiItem = analysis.multiItemClauses.has(tokenIndex);
915
+ }
916
+ else if (symbolicName === 'GROUP' && ctx.isClauseStart) {
917
+ state.afterGroupByKeyword = true;
918
+ state.isFirstListItem = true;
919
+ state.currentClauseIsMultiItem = analysis.multiItemClauses.has(tokenIndex);
920
+ }
921
+ else if (symbolicName === 'ORDER' && ctx.isClauseStart) {
922
+ state.afterOrderByKeyword = true;
923
+ state.isFirstListItem = true;
924
+ state.currentClauseIsMultiItem = analysis.multiItemClauses.has(tokenIndex);
925
+ }
926
+ else if (symbolicName === 'WHERE' && ctx.isClauseStart) {
927
+ if (analysis.multilineConditionClauses.has(tokenIndex)) {
928
+ state.afterWhereKeyword = true;
929
+ }
930
+ }
931
+ else if (symbolicName === 'HAVING' && ctx.isClauseStart) {
932
+ if (analysis.multilineConditionClauses.has(tokenIndex)) {
933
+ state.afterHavingKeyword = true;
934
+ }
935
+ }
936
+ else if (symbolicName === 'ON' && ctx.isJoinOn && !state.isFirstNonWsToken) {
937
+ needsNewline = true;
938
+ indent = indentCalc.getOnClauseIndent(state.subqueryDepth, state.ddlDepth);
939
+ }
940
+ else if (symbolicName === 'SET' && ctx.isSetKeyword) {
941
+ state.afterSetKeyword = true;
942
+ state.isFirstListItem = true;
943
+ state.currentClauseIsMultiItem = analysis.multiItemClauses.has(tokenIndex);
944
+ }
945
+ else if (symbolicName === 'VALUES') {
946
+ state.afterValuesKeyword = true;
947
+ state.isFirstListItem = true;
948
+ }
949
+ // CASE expression handling
950
+ // Nested multi-WHEN CASE after THEN should go to new line with extra indent
951
+ if (symbolicName === 'CASE' && analysis.multiWhenCaseTokens.has(tokenIndex) && state.prevTokenText === 'THEN') {
952
+ needsNewline = true;
953
+ // Nested CASE is indented 4 more than the current WHEN level
954
+ // caseDepth represents how many multi-WHEN CASEs we're inside (after their CASE keyword)
955
+ // So nested CASE indent = WHEN indent + 4 = base + 8 + (caseDepth-1)*4 + 4 = base + 8 + caseDepth*4
956
+ const nestingOffset = state.caseDepth * 4;
957
+ indent = indentCalc.getCaseWhenIndent(state.subqueryDepth, state.ddlDepth) + ' '.repeat(nestingOffset);
958
+ }
959
+ if (analysis.caseWhenTokens.has(tokenIndex)) {
960
+ needsNewline = true;
961
+ // WHEN/ELSE indent = base + 8 + (caseDepth-1)*4 for caseDepth >= 1
962
+ const nestingOffset = state.caseDepth > 0 ? (state.caseDepth - 1) * 4 : 0;
963
+ indent = indentCalc.getCaseWhenIndent(state.subqueryDepth, state.ddlDepth) + ' '.repeat(nestingOffset);
964
+ }
965
+ else if (analysis.caseElseTokens.has(tokenIndex)) {
966
+ needsNewline = true;
967
+ const nestingOffset = state.caseDepth > 0 ? (state.caseDepth - 1) * 4 : 0;
968
+ indent = indentCalc.getCaseWhenIndent(state.subqueryDepth, state.ddlDepth) + ' '.repeat(nestingOffset);
969
+ }
970
+ else if (analysis.caseEndTokens.has(tokenIndex)) {
971
+ needsNewline = true;
972
+ // END aligns with its CASE, which is 3 less than WHEN (getCaseEndIndent vs getCaseWhenIndent)
973
+ const nestingOffset = state.caseDepth > 0 ? (state.caseDepth - 1) * 4 : 0;
974
+ indent = indentCalc.getCaseEndIndent(state.subqueryDepth, state.ddlDepth) + ' '.repeat(nestingOffset);
975
+ }
976
+ // MERGE clause handling (USING / ON / WHEN each start a new line at base indent, unless first token)
977
+ if ((ctx.isMergeUsing || ctx.isMergeOn || ctx.isMergeWhen) && !state.isFirstNonWsToken) {
978
+ needsNewline = true;
979
+ indent = baseIndent;
980
+ }
981
+ // CTE main SELECT - always add newline after CTE block (per STYLE_GUIDE)
982
+ // This takes precedence over compact query logic because the CTE body may have expanded
983
+ if (ctx.isCteMainSelect && !state.isFirstNonWsToken) {
984
+ needsNewline = true;
985
+ indent = baseIndent;
986
+ }
987
+ // Clause start newline - SKIP if inside a compact query OR short set operation
988
+ if (!state.isFirstNonWsToken && ctx.isClauseStart && !ctx.isInIdentifierContext && !inCompactQuery && !isShortSetOperation) {
989
+ needsNewline = true;
990
+ indent = baseIndent;
991
+ }
992
+ // Set operation operand parens - SKIP if short set operation
993
+ if (ctx.isSetOperandParen && !state.isFirstNonWsToken && !isShortSetOperation) {
994
+ needsNewline = true;
995
+ indent = baseIndent;
996
+ }
997
+ // Subquery close paren - only add newline if NOT in a compact query
998
+ if (ctx.isSubqueryCloseParen && !inCompactQuery) {
999
+ needsNewline = true;
1000
+ indent = indentCalc.getBaseIndent(state.subqueryDepth - 1); // base indent of one subquery level less than the current depth
1001
+ }
1002
+ // DDL close paren
1003
+ if (ctx.isDdlCloseParen && state.ddlDepth > 0) {
1004
+ needsNewline = true;
1005
+ indent = ' '.repeat(state.subqueryDepth + state.ddlDepth - 1); // NOTE(review): one space per depth level, computed without indentCalc unlike the other rules - confirm this is intended
1006
+ }
1007
+ // Expanded function close paren
1008
+ if (isExpandedFunctionCloseParen && expandedFuncs.current()) {
1009
+ needsNewline = true;
1010
+ indent = ' '.repeat(indentCalc.getExpandedFunctionCloseIndent(expandedFuncs.current().depth));
1011
+ }
1012
+ // Expanded window handling
1013
+ // Skip if we just output a window expansion newline (don't double-newline)
1014
+ if (isExpandedWindowOrderBy && currentExpandedWindow && !state.justOutputWindowNewline) {
1015
+ needsNewline = true;
1016
+ indent = ' '.repeat(indentCalc.getWindowContentIndent(currentExpandedWindow.baseDepth));
1017
+ }
1018
+ if (isExpandedWindowFrame && currentExpandedWindow && !state.justOutputWindowNewline) {
1019
+ needsNewline = true;
1020
+ indent = ' '.repeat(indentCalc.getWindowContentIndent(currentExpandedWindow.baseDepth));
1021
+ }
1022
+ if (isExpandedWindowCloseParen && currentExpandedWindow) {
1023
+ needsNewline = true;
1024
+ indent = ' '.repeat(indentCalc.getWindowCloseIndent(currentExpandedWindow.baseDepth));
1025
+ }
1026
+ // Expanded PIVOT/UNPIVOT handling
1027
+ if (isExpandedPivotAggregateComma && currentExpandedPivot) {
1028
+ needsNewline = true;
1029
+ indent = ' '.repeat(indentCalc.getPivotCommaIndent(currentExpandedPivot.depth));
1030
+ state.justOutputCommaFirstStyle = true;
1031
+ }
1032
+ if (isExpandedPivotForKeyword && currentExpandedPivot) {
1033
+ needsNewline = true;
1034
+ indent = ' '.repeat(indentCalc.getPivotContentIndent(currentExpandedPivot.depth));
1035
+ }
1036
+ if (isExpandedPivotInListComma && currentExpandedPivot) {
1037
+ needsNewline = true;
1038
+ indent = ' '.repeat(indentCalc.getPivotCommaIndent(currentExpandedPivot.depth) + 4); // Extra indent for IN list
1039
+ state.justOutputCommaFirstStyle = true;
1040
+ }
1041
+ if (isExpandedPivotCloseParen && currentExpandedPivot) {
1042
+ needsNewline = true;
1043
+ indent = ' '.repeat(indentCalc.getPivotCloseIndent(currentExpandedPivot.depth));
1044
+ }
1045
+ // List comma handling - but NOT for IN list commas (those use wrap logic instead)
1046
+ // Also skip for commas inside complex types like MAP<STRING, INT>
1047
+ // Also skip for commas inside EXCEPT clause (column exclusion)
1048
+ const isExceptClauseToken = analysis.exceptClauseTokens.has(tokenIndex);
1049
+ if (ctx.isListComma && state.insideFunctionArgs === 0 && !isInListComma(tokenIndex, analysis) && state.complexTypeDepth === 0 && !isExceptClauseToken) {
1050
+ needsNewline = true;
1051
+ indent = indentCalc.getCommaIndent(state.subqueryDepth, state.ddlDepth);
1052
+ state.isFirstListItem = false;
1053
+ state.justOutputCommaFirstStyle = true;
1054
+ }
1055
+ // CTE comma (placed at the start of a new line with no indent)
1056
+ if (ctx.isCteComma) {
1057
+ needsNewline = true;
1058
+ indent = '';
1059
+ state.justOutputCommaFirstStyle = true;
1060
+ }
1061
+ // DDL comma
1062
+ if (ctx.isDdlComma) {
1063
+ needsNewline = true;
1064
+ indent = indentCalc.getCommaIndent(state.subqueryDepth);
1065
+ state.justOutputCommaFirstStyle = true;
1066
+ }
1067
+ // VALUES comma - expand only if the VALUES statement is long
1068
+ if (ctx.isValuesComma && !isShortValues) {
1069
+ needsNewline = true;
1070
+ indent = baseIndent;
1071
+ state.justOutputCommaFirstStyle = true;
1072
+ }
1073
+ // SET comma
1074
+ if (ctx.isSetComma) {
1075
+ needsNewline = true;
1076
+ indent = indentCalc.getCommaIndent(state.subqueryDepth, state.ddlDepth);
1077
+ state.justOutputCommaFirstStyle = true;
1078
+ }
1079
+ // Expanded function comma
1080
+ if (isExpandedFunctionComma && expandedFuncs.current()) {
1081
+ needsNewline = true;
1082
+ indent = ' '.repeat(indentCalc.getExpandedFunctionCommaIndent(expandedFuncs.current().depth));
1083
+ state.justOutputCommaFirstStyle = true;
1084
+ }
1085
+ // Condition operator (AND/OR) - but not BETWEEN's AND
1086
+ if (ctx.isConditionOperator && !ctx.isBetweenAnd) {
1087
+ needsNewline = true;
1088
+ indent = indentCalc.getCommaIndent(state.subqueryDepth, state.ddlDepth);
1089
+ }
1090
+ // First list item after SELECT/GROUP BY/ORDER BY
1091
+ if (!ctx.isListComma && (state.afterSelectKeyword || state.afterGroupByKeyword || state.afterOrderByKeyword)) {
1092
+ if (symbolicName !== 'SELECT' && symbolicName !== 'GROUP' && symbolicName !== 'ORDER') {
1093
+ if ((state.afterGroupByKeyword && symbolicName === 'BY') ||
1094
+ (state.afterOrderByKeyword && symbolicName === 'BY') ||
1095
+ symbolicName === 'DISTINCT') {
1096
+ // Skip BY or DISTINCT
1097
+ }
1098
+ else if (state.isFirstListItem && state.currentClauseIsMultiItem) {
1099
+ needsNewline = true;
1100
+ indent = indentCalc.getFirstItemIndent(state.subqueryDepth, state.ddlDepth);
1101
+ state.isFirstListItem = false;
1102
+ }
1103
+ else if (state.isFirstListItem) {
1104
+ state.isFirstListItem = false;
1105
+ }
1106
+ }
1107
+ }
1108
+ // First assignment after SET
1109
+ if (!ctx.isSetComma && state.afterSetKeyword && symbolicName !== 'SET' && state.isFirstListItem) {
1110
+ if (state.currentClauseIsMultiItem) {
1111
+ needsNewline = true;
1112
+ indent = indentCalc.getFirstItemIndent(state.subqueryDepth, state.ddlDepth);
1113
+ }
1114
+ state.isFirstListItem = false;
1115
+ state.afterSetKeyword = false;
1116
+ }
1117
+ // First tuple after VALUES - expand only if the VALUES statement is long
1118
+ if (!ctx.isValuesComma && state.afterValuesKeyword && symbolicName !== 'VALUES' && state.isFirstListItem) {
1119
+ if (!isShortValues) {
1120
+ needsNewline = true;
1121
+ indent = baseIndent;
1122
+ }
1123
+ state.isFirstListItem = false;
1124
+ state.afterValuesKeyword = false;
1125
+ }
1126
+ // First condition after WHERE/HAVING (the flags are only set above for clauses listed in multilineConditionClauses)
1127
+ if (!ctx.isConditionOperator && (state.afterWhereKeyword || state.afterHavingKeyword)) {
1128
+ if (symbolicName !== 'WHERE' && symbolicName !== 'HAVING') {
1129
+ needsNewline = true;
1130
+ indent = indentCalc.getCommaIndent(state.subqueryDepth, state.ddlDepth);
1131
+ state.afterWhereKeyword = false;
1132
+ state.afterHavingKeyword = false;
1133
+ }
1134
+ }
1135
+ return { needsNewline, indent };
1136
+ }
1137
/**
 * Flush pending comments and start a new, indented line for the next token.
 *
 * Order of emission:
 *   1. inline comments (still on the current line),
 *   2. a newline via `builder.ensureNewline()`,
 *   3. each own-line comment, indented, preceded by an extra blank line when
 *      the original had one and the builder is not empty,
 *   4. the indent for the token itself.
 * The pending comment queue is cleared afterwards.
 *
 * @param {object} builder - Output builder (`push`, `ensureNewline`, `isEmpty`).
 * @param {object} comments - Pending comment manager.
 * @param {string} indent - Indent string to emit; an empty string emits nothing.
 * @param {object} state - Formatting state; not read by this function.
 */
function outputWithNewline(builder, comments, indent, state) {
    const trailing = comments.getInlineComments();
    const standalone = comments.getOwnLineComments();
    const emitIndent = () => {
        if (indent) {
            builder.push(indent);
        }
    };
    // Inline comments belong to the line that is about to end.
    if (trailing.length > 0) {
        outputComments(builder, trailing);
    }
    builder.ensureNewline();
    standalone.forEach((comment) => {
        // Keep a blank line that preceded the comment in the original text.
        if (comment.hadBlankLineBefore && !builder.isEmpty()) {
            builder.push('\n');
        }
        emitIndent();
        builder.push(comment.text);
        // Bracketed comments carry no trailing newline of their own; add one if missing.
        const isBracketed = comment.type === SqlBaseLexer.BRACKETED_COMMENT;
        if (isBracketed && !comment.text.endsWith('\n')) {
            builder.push('\n');
        }
    });
    emitIndent();
    comments.clear();
}
1167
/**
 * Prepare the builder for a token that stays on the current line.
 *
 * Pending comments are written first and the queue is cleared. Then, unless
 * the builder is still empty, a single space is pushed when either
 * `shouldSkipSpace` does not ask to omit it or `shouldAddCommaSpace` asks for it.
 *
 * @param {object} builder - Output builder.
 * @param {object} comments - Pending comment manager.
 * @param {string} text - Text of the token about to be written.
 * @param {string|null} symbolicName - Symbolic lexer name of that token.
 * @param {object} state - Formatting state (previous-token and spacing flags).
 * @param {boolean} currentTokenIsUnaryOperator - Whether the token is a unary operator.
 * @param {boolean} [isLateralViewComma=false] - Whether the token is a LATERAL VIEW comma.
 */
function outputWithoutNewline(builder, comments, text, symbolicName, state, currentTokenIsUnaryOperator, isLateralViewComma = false) {
    if (comments.hasPending()) {
        const pending = comments.getPending();
        const hasPriorOutput = !builder.isEmpty();
        outputComments(builder, pending, hasPriorOutput);
        comments.clear();
    }
    // Nothing written yet: no separator is needed.
    if (builder.isEmpty()) {
        return;
    }
    const trailingChar = builder.getLastChar();
    const prevIsDoubleColon = trailingChar === ':' && text !== ':';
    // Look at the previous token's type rather than the last character so a
    // decimal like "1." is not mistaken for member access.
    const prevKind = state.prevTokenType >= 0 ? getSymbolicName(state.prevTokenType) : null;
    const prevWasDotToken = prevKind === 'DOT';
    const signChars = ['-', '+', '~'];
    const spacingHints = {
        prevWasFunctionName: state.prevWasFunctionName,
        prevWasBuiltInFunctionKeyword: state.prevWasBuiltInFunctionKeyword,
        insideParens: state.insideParens,
        justOutputCommaFirstStyle: state.justOutputCommaFirstStyle,
        justOutputMultiArgFunctionNewline: state.justOutputMultiArgFunctionNewline,
        justOutputWindowNewline: state.justOutputWindowNewline,
        justOutputInListWrapNewline: state.justOutputInListWrapNewline,
        afterWhereKeyword: state.afterWhereKeyword,
        afterHavingKeyword: state.afterHavingKeyword,
        prevTokenWasUnaryOperator: state.prevTokenWasUnaryOperator &&
            signChars.includes(state.prevTokenText),
        currentTokenIsUnaryOperator,
        isLateralViewComma,
        prevIsDoubleColon,
        prevTokenText: state.prevTokenText,
        currentTokenIsStringLiteral: symbolicName === 'STRING_LITERAL',
        prevWasDotToken,
        complexTypeDepth: state.complexTypeDepth,
    };
    const omitSpace = shouldSkipSpace(builder, text, spacingHints);
    const forceSpace = shouldAddCommaSpace(builder, state.insideParens, state.justOutputCommaFirstStyle);
    if (forceSpace || !omitSpace) {
        builder.push(' ');
    }
}
1207
/**
 * Begin expanding a multi-argument function call onto multiple lines.
 *
 * Pushes a descriptor for the call onto `expandedFuncs`. Unless the first
 * argument is itself a multi-arg function call detected as "chained", a
 * newline plus content indent is written and
 * `state.justOutputMultiArgFunctionNewline` is set.
 *
 * @param {object} builder - Output builder (`getColumn`, `push`).
 * @param {object} expandedFuncs - Stack of expanded functions (`depth`, `push`).
 * @param {object} funcInfo - Info for this call (`closeParenIndex`,
 *     `commaIndices`, `functionName`).
 * @param {Array} tokenList - Token list being formatted.
 * @param {number} currentIndex - Position in `tokenList` of the opening paren.
 * @param {function(number): number} findNextNonWsTokenIndex - Next
 *     non-whitespace token position lookup.
 * @param {object} analysis - Analysis result (`functionCallTokens`,
 *     `multiArgFunctionInfo`).
 * @param {object} state - Formatting state; may be flagged as described above.
 */
function handleFunctionExpansion(builder, expandedFuncs, funcInfo, tokenList, currentIndex, findNextNonWsTokenIndex, analysis, state) {
    const depth = expandedFuncs.depth;
    const isUsablePosition = (pos) => pos > 0 && pos < tokenList.length;
    // Chaining is only considered at odd depths: the first argument must be a
    // function-call token whose following paren opens a multi-arg function.
    const detectChainedFirstArg = () => {
        if (depth % 2 !== 1) {
            return false;
        }
        const namePos = findNextNonWsTokenIndex(currentIndex + 1);
        if (!isUsablePosition(namePos)) {
            return false;
        }
        if (!analysis.functionCallTokens.has(tokenList[namePos].tokenIndex)) {
            return false;
        }
        const parenPos = findNextNonWsTokenIndex(namePos + 1);
        if (!isUsablePosition(parenPos)) {
            return false;
        }
        return Boolean(analysis.multiArgFunctionInfo.get(tokenList[parenPos].tokenIndex));
    };
    const firstArgIsChainedFunc = detectChainedFirstArg();
    // STACK(count, alias1, col1, alias2, col2, ...): keep each (alias, col)
    // pair on one line. The commas at odd 0-based positions (2nd, 4th, ...)
    // sit inside a pair and therefore must not trigger a newline.
    const isPairedStack = funcInfo.functionName === 'STACK' && funcInfo.commaIndices.length >= 2;
    const skipNewlineCommas = isPairedStack
        ? new Set(funcInfo.commaIndices.filter((_, position) => position % 2 === 1))
        : undefined;
    const descriptor = {
        closeParenIndex: funcInfo.closeParenIndex,
        commaIndices: new Set(funcInfo.commaIndices),
        depth,
        openingColumn: builder.getColumn() - 1,
        firstArgIsChainedFunc,
        functionName: funcInfo.functionName,
        skipNewlineCommas,
    };
    expandedFuncs.push(descriptor);
    if (firstArgIsChainedFunc) {
        return;
    }
    const contentIndent = indentCalc.getExpandedFunctionContentIndent(depth);
    builder.push('\n' + ' '.repeat(contentIndent));
    state.justOutputMultiArgFunctionNewline = true;
}
1260
/**
 * Clear the "just saw SELECT / GROUP BY / ORDER BY" flags once a token other
 * than the clause keywords themselves has been processed.
 *
 * A flag is cleared only if it is currently set, the token is not one of the
 * keywords that keep that clause header open, and the token is not a list comma.
 *
 * @param {string|null} symbolicName - Symbolic lexer name of the processed token.
 * @param {object} ctx - Token context flags (`isListComma`).
 * @param {object} state - Formatting state whose flags are updated in place.
 */
function updateClauseFlags(symbolicName, ctx, state) {
    const clearUnlessHeader = (flagName, headerKeywords) => {
        if (!headerKeywords.includes(symbolicName) && state[flagName] && !ctx.isListComma) {
            state[flagName] = false;
        }
    };
    clearUnlessHeader('afterSelectKeyword', ['SELECT', 'DISTINCT']);
    clearUnlessHeader('afterGroupByKeyword', ['GROUP', 'BY']);
    clearUnlessHeader('afterOrderByKeyword', ['ORDER', 'BY']);
}