@jacobknightley/fabric-format 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +196 -0
- package/dist/cell-formatter.d.ts +75 -0
- package/dist/cell-formatter.js +144 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +435 -0
- package/dist/formatters/index.d.ts +19 -0
- package/dist/formatters/index.js +76 -0
- package/dist/formatters/python/config.d.ts +33 -0
- package/dist/formatters/python/config.js +29 -0
- package/dist/formatters/python/index.d.ts +7 -0
- package/dist/formatters/python/index.js +13 -0
- package/dist/formatters/python/python-formatter.d.ts +51 -0
- package/dist/formatters/python/python-formatter.js +180 -0
- package/dist/formatters/sparksql/constants.d.ts +16 -0
- package/dist/formatters/sparksql/constants.js +16 -0
- package/dist/formatters/sparksql/fmt-detector.d.ts +65 -0
- package/dist/formatters/sparksql/fmt-detector.js +84 -0
- package/dist/formatters/sparksql/formatter.d.ts +24 -0
- package/dist/formatters/sparksql/formatter.js +1276 -0
- package/dist/formatters/sparksql/formatting-context.d.ts +154 -0
- package/dist/formatters/sparksql/formatting-context.js +363 -0
- package/dist/formatters/sparksql/generated/SqlBaseLexer.d.ts +529 -0
- package/dist/formatters/sparksql/generated/SqlBaseLexer.js +2609 -0
- package/dist/formatters/sparksql/generated/SqlBaseParser.d.ts +8195 -0
- package/dist/formatters/sparksql/generated/SqlBaseParser.js +48793 -0
- package/dist/formatters/sparksql/generated/SqlBaseParserListener.d.ts +910 -0
- package/dist/formatters/sparksql/generated/SqlBaseParserListener.js +2730 -0
- package/dist/formatters/sparksql/generated/SqlBaseParserVisitor.d.ts +456 -0
- package/dist/formatters/sparksql/generated/SqlBaseParserVisitor.js +1822 -0
- package/dist/formatters/sparksql/generated/builtinFunctions.d.ts +8 -0
- package/dist/formatters/sparksql/generated/builtinFunctions.js +510 -0
- package/dist/formatters/sparksql/index.d.ts +11 -0
- package/dist/formatters/sparksql/index.js +22 -0
- package/dist/formatters/sparksql/output-builder.d.ts +89 -0
- package/dist/formatters/sparksql/output-builder.js +191 -0
- package/dist/formatters/sparksql/parse-tree-analyzer.d.ts +264 -0
- package/dist/formatters/sparksql/parse-tree-analyzer.js +1956 -0
- package/dist/formatters/sparksql/sql-formatter.d.ts +25 -0
- package/dist/formatters/sparksql/sql-formatter.js +56 -0
- package/dist/formatters/sparksql/token-utils.d.ts +68 -0
- package/dist/formatters/sparksql/token-utils.js +155 -0
- package/dist/formatters/sparksql/types.d.ts +264 -0
- package/dist/formatters/sparksql/types.js +7 -0
- package/dist/formatters/types.d.ts +57 -0
- package/dist/formatters/types.js +7 -0
- package/dist/index.d.ts +18 -0
- package/dist/index.js +41 -0
- package/dist/notebook-formatter.d.ts +107 -0
- package/dist/notebook-formatter.js +424 -0
- package/package.json +63 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
 * SQL Formatter Wrapper
 *
 * Wraps the existing sparkfmt SQL formatter to implement the LanguageFormatter interface.
 */
import type { LanguageFormatter, FormatterOptions, FormatResult } from '../types.js';
/**
 * SQL formatter for Spark SQL.
 */
export declare class SqlFormatter implements LanguageFormatter {
    readonly language = "sql";
    readonly displayName = "Spark SQL";
    /** Always true: the SQL formatter needs no async initialization. */
    isReady(): boolean;
    /** No-op; present only to satisfy the LanguageFormatter contract. */
    initialize(): Promise<void>;
    /**
     * Format SQL source text.
     * Never throws: failures are reported via FormatResult.error and the
     * original code is returned unchanged.
     */
    format(code: string, options?: FormatterOptions): FormatResult;
    /** Check whether formatting would change the code, without modifying it. */
    needsFormatting(code: string, options?: FormatterOptions): boolean;
}
/**
 * Detect if content is SQL.
 * Matches the 'sql' and 'sparksql' cell-type identifiers.
 */
export declare function isSqlCode(cellType: string): boolean;
/**
 * Get the SQL formatter instance (creates on first call).
 */
export declare function getSqlFormatter(): SqlFormatter;
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* SQL Formatter Wrapper
|
|
3
|
+
*
|
|
4
|
+
* Wraps the existing sparkfmt SQL formatter to implement the LanguageFormatter interface.
|
|
5
|
+
*/
|
|
6
|
+
import { formatSql, needsFormatting } from './index.js';
|
|
7
|
+
/**
 * SQL formatter for Spark SQL.
 *
 * Implements the LanguageFormatter contract. Formatting failures are
 * captured and surfaced through FormatResult.error instead of being thrown.
 */
export class SqlFormatter {
    language = 'sql';
    displayName = 'Spark SQL';
    /** Always ready: there is no async setup (no WASM, no lazy loading). */
    isReady() {
        return true;
    }
    /** No-op: nothing to initialize for the SQL formatter. */
    async initialize() {
    }
    /**
     * Format SQL source text.
     *
     * On failure, the input is returned unchanged together with the error
     * message, so callers never need a try/catch around this method.
     */
    format(code, options) {
        // Note: formatSql currently only takes one argument
        // TODO: Add options support to formatSql in the future
        try {
            const result = formatSql(code);
            return { formatted: result, changed: result !== code };
        }
        catch (err) {
            const message = err instanceof Error ? err.message : String(err);
            return { formatted: code, changed: false, error: message };
        }
    }
    /** Delegate the read-only "would formatting change this?" check. */
    needsFormatting(code, options) {
        return needsFormatting(code);
    }
}
|
|
40
|
+
/**
 * Detect if content is SQL.
 * Accepts both the 'sql' and 'sparksql' cell-type identifiers (exact,
 * case-sensitive match).
 */
export function isSqlCode(cellType) {
    const sqlCellTypes = ['sql', 'sparksql'];
    return sqlCellTypes.includes(cellType);
}
|
|
46
|
+
/** Singleton instance (created lazily by getSqlFormatter). */
let sqlFormatterInstance = null;
/**
 * Get the SQL formatter instance (creates on first call).
 */
export function getSqlFormatter() {
    sqlFormatterInstance ??= new SqlFormatter();
    return sqlFormatterInstance;
}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
/**
 * Token Utilities - Grammar-Derived Token Detection
 *
 * This module provides utilities for detecting token types based on the ANTLR grammar.
 * All detection is grammar-driven - NO hardcoded keyword lists.
 *
 * Key Functions:
 * - getTokenType(): Get token type number from symbolic name
 * - isKeywordToken(): Check if a token is a keyword (grammar-derived)
 */
import SqlBaseLexer from './generated/SqlBaseLexer.js';
/**
 * Get token type number by symbolic name.
 * Returns -1 if the name is not found.
 *
 * @param name - The symbolic name (e.g., 'SELECT', 'COMMA')
 * @returns Token type number or -1
 */
export declare function getTokenType(name: string): number;
/**
 * Get symbolic name for a token type.
 * Returns null if no symbolic name exists.
 *
 * @param tokenType - The token type number
 * @returns Symbolic name or null
 */
export declare function getSymbolicName(tokenType: number): string | null;
/**
 * Check if a token is a keyword based on grammar rules.
 *
 * Keywords in ANTLR are defined like: SELECT: 'SELECT';
 * So symbolicNames[tokenType] === tokenText for keywords.
 *
 * IMPORTANT: Due to dual-lexing (uppercase for token types, original for text),
 * we must be careful not to uppercase non-keywords. The tokenType comes from
 * the uppercase stream, which may misclassify single letters (e.g., X -> BINARY_HEX).
 *
 * We ONLY return true if the symbolic name MATCHES the text (case-insensitive).
 * This ensures we don't uppercase identifiers like 'x' just because X is BINARY_HEX.
 *
 * @param tokenType - The token type number
 * @param tokenText - The original token text
 * @returns true if the token is a keyword
 */
export declare function isKeywordToken(tokenType: number, tokenText: string): boolean;
/**
 * Check if a token type represents a comment
 * (the lexer's SIMPLE_COMMENT or BRACKETED_COMMENT token).
 *
 * @param tokenType - The token type number
 * @returns true if the token is a comment
 */
export declare function isCommentToken(tokenType: number): boolean;
/**
 * Check if a token type represents whitespace (the lexer's WS token).
 *
 * @param tokenType - The token type number
 * @returns true if the token is whitespace
 */
export declare function isWhitespaceToken(tokenType: number): boolean;
/**
 * Check if a keyword should be treated like a function (no space before paren).
 * Returns false when the token is not a keyword at all.
 *
 * @param tokenType - The token type number
 * @param tokenText - The original token text
 * @returns true if this is a function-like keyword
 */
export declare function isFunctionLikeKeyword(tokenType: number, tokenText: string): boolean;
// Re-export the lexer for use in other modules
export { SqlBaseLexer };
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Token Utilities - Grammar-Derived Token Detection
|
|
3
|
+
*
|
|
4
|
+
* This module provides utilities for detecting token types based on the ANTLR grammar.
|
|
5
|
+
* All detection is grammar-driven - NO hardcoded keyword lists.
|
|
6
|
+
*
|
|
7
|
+
* Key Functions:
|
|
8
|
+
* - getTokenType(): Get token type number from symbolic name
|
|
9
|
+
* - isKeywordToken(): Check if a token is a keyword (grammar-derived)
|
|
10
|
+
*/
|
|
11
|
+
// @ts-ignore - Generated ANTLR code
|
|
12
|
+
import SqlBaseLexer from './generated/SqlBaseLexer.js';
|
|
13
|
+
// ============================================================================
// SYMBOLIC NAME MAPPING (Built from grammar at runtime)
// ============================================================================
/**
 * Map from symbolic name to token type number.
 * Built once at module load time from the ANTLR lexer.
 */
const SYMBOLIC_NAME_TO_TYPE = new Map();
// Walk the lexer's symbolic-name table once; entries without a symbolic
// name (literal-only tokens) are skipped.
SqlBaseLexer.symbolicNames.forEach((symbolicName, tokenType) => {
    if (symbolicName) {
        SYMBOLIC_NAME_TO_TYPE.set(symbolicName, tokenType);
    }
});
|
|
28
|
+
// ============================================================================
|
|
29
|
+
// PUBLIC API
|
|
30
|
+
// ============================================================================
|
|
31
|
+
/**
 * Get token type number by symbolic name.
 * Returns -1 if the name is not found.
 *
 * @param name - The symbolic name (e.g., 'SELECT', 'COMMA')
 * @returns Token type number or -1
 */
export function getTokenType(name) {
    const tokenType = SYMBOLIC_NAME_TO_TYPE.get(name);
    return tokenType === undefined ? -1 : tokenType;
}
|
|
41
|
+
/**
 * Get symbolic name for a token type.
 * Returns null if no symbolic name exists.
 *
 * @param tokenType - The token type number
 * @returns Symbolic name or null
 */
export function getSymbolicName(tokenType) {
    const name = SqlBaseLexer.symbolicNames[tokenType];
    return name === undefined || name === null ? null : name;
}
|
|
51
|
+
/**
 * Check if a token is a keyword based on grammar rules.
 *
 * Keywords in ANTLR are defined like: SELECT: 'SELECT';
 * So symbolicNames[tokenType] === tokenText for keywords.
 *
 * IMPORTANT: Due to dual-lexing (uppercase for token types, original for text),
 * we must be careful not to uppercase non-keywords. The tokenType comes from
 * the uppercase stream, which may misclassify single letters (e.g., X -> BINARY_HEX).
 *
 * We ONLY return true if the symbolic name MATCHES the text (case-insensitive).
 * This ensures we don't uppercase identifiers like 'x' just because X is BINARY_HEX.
 *
 * @param tokenType - The token type number
 * @param tokenText - The original token text
 * @returns true if the token is a keyword
 */
export function isKeywordToken(tokenType, tokenText) {
    const symbolicName = SqlBaseLexer.symbolicNames[tokenType];
    if (!symbolicName)
        return false;
    const textUpper = tokenText.toUpperCase();
    // Direct match: symbolic name equals uppercase text (e.g., SELECT)
    // This is the ONLY reliable way to detect keywords with dual-lexing
    if (symbolicName === textUpper)
        return true;
    // Special cases, matched with direct comparisons so this hot-path helper
    // allocates no lookup objects per call:
    //  - Token name mismatches, where the symbolic name differs from the
    //    keyword text (grammar: PERCENTLIT: 'PERCENT';)
    //  - Keyword aliases, where the grammar defines multiple literals for one
    //    token (grammar: TEMPORARY: 'TEMPORARY' | 'TEMP';)
    // Note: DEC, INT, CHAR are separate tokens in the grammar (not aliases),
    // so they're not listed here.
    switch (textUpper) {
        case 'PERCENT':
            return symbolicName === 'PERCENTLIT'; // PERCENTLIT: 'PERCENT';
        case 'MINUS':
            return symbolicName === 'SETMINUS'; // SETMINUS: 'MINUS';
        case 'IDENTIFIER':
            return symbolicName === 'IDENTIFIER_KW'; // IDENTIFIER_KW: 'IDENTIFIER';
        case 'TEMP':
            return symbolicName === 'TEMPORARY'; // TEMPORARY: 'TEMPORARY' | 'TEMP';
        case 'REGEXP':
            return symbolicName === 'RLIKE'; // RLIKE: 'RLIKE' | 'REGEXP';
        default:
            return false;
    }
}
|
|
100
|
+
/**
 * Check if a token type represents a comment
 * (SIMPLE_COMMENT or BRACKETED_COMMENT).
 *
 * @param tokenType - The token type number
 * @returns true if the token is a comment
 */
export function isCommentToken(tokenType) {
    switch (tokenType) {
        case SqlBaseLexer.SIMPLE_COMMENT:
        case SqlBaseLexer.BRACKETED_COMMENT:
            return true;
        default:
            return false;
    }
}
|
|
110
|
+
/**
 * Check if a token type represents whitespace (the lexer's WS token).
 *
 * @param tokenType - The token type number
 * @returns true if the token is whitespace
 */
export function isWhitespaceToken(tokenType) {
    const whitespaceType = SqlBaseLexer.WS;
    return tokenType === whitespaceType;
}
|
|
119
|
+
// ============================================================================
|
|
120
|
+
// FUNCTION-LIKE KEYWORDS
|
|
121
|
+
// ============================================================================
|
|
122
|
+
/**
|
|
123
|
+
* Keywords that are used like functions: KEYWORD(args)
|
|
124
|
+
* These need special handling for spacing (no space before opening paren).
|
|
125
|
+
*
|
|
126
|
+
* Note: This is a style choice for layout, not grammar-derived.
|
|
127
|
+
* These keywords take arguments in parentheses like functions do.
|
|
128
|
+
*/
|
|
129
|
+
const FUNCTION_LIKE_KEYWORDS = new Set([
|
|
130
|
+
'cast', 'try_cast', 'extract', 'position', 'substring', 'trim',
|
|
131
|
+
'overlay', 'percentile_cont', 'percentile_disc', 'any_value',
|
|
132
|
+
'first_value', 'last_value', 'nth_value', 'lead', 'lag',
|
|
133
|
+
'decimal', 'array', 'map', 'struct',
|
|
134
|
+
// Type constructors
|
|
135
|
+
'varchar', 'char',
|
|
136
|
+
// Constraints
|
|
137
|
+
'unique', 'primary', 'foreign', 'check',
|
|
138
|
+
// Hive streaming (SELECT TRANSFORM(...) USING ...)
|
|
139
|
+
'transform',
|
|
140
|
+
// DDL (CREATE FUNCTION name(...))
|
|
141
|
+
'function'
|
|
142
|
+
]);
|
|
143
|
+
/**
|
|
144
|
+
* Check if a keyword should be treated like a function (no space before paren).
|
|
145
|
+
*
|
|
146
|
+
* @param tokenType - The token type number
|
|
147
|
+
* @param tokenText - The original token text
|
|
148
|
+
* @returns true if this is a function-like keyword
|
|
149
|
+
*/
|
|
150
|
+
export function isFunctionLikeKeyword(tokenType, tokenText) {
|
|
151
|
+
return isKeywordToken(tokenType, tokenText) &&
|
|
152
|
+
FUNCTION_LIKE_KEYWORDS.has(tokenText.toLowerCase());
|
|
153
|
+
}
|
|
154
|
+
// Re-export the lexer for use in other modules
|
|
155
|
+
export { SqlBaseLexer };
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
/**
 * Type Definitions for Spark SQL Formatter
 *
 * Central location for all TypeScript interfaces used across the formatter.
 * This improves code readability and enables better IDE support.
 */
/**
 * Information about a multi-argument function that may need line-width expansion.
 */
export interface MultiArgFunctionInfo {
    /** Closing RIGHT_PAREN token index */
    closeParenIndex: number;
    /** Token indices of the commas separating the arguments */
    commaIndices: number[];
    /** Total span length for line width calculation */
    spanLength: number;
    /** Function name, when it could be determined */
    functionName?: string;
    /** Character position where the function span starts */
    charStart: number;
}
/**
 * Nested function position info for expansion checking.
 */
export interface NestedFunctionInfo {
    /** Index of the nested function — NOTE(review): confirm against the analyzer whether this is a token index or an index into its function list */
    funcIdx: number;
    /** Offset of the nested function relative to the enclosing span */
    relativeOffset: number;
}
/**
 * Information about a window definition that may need line-width expansion.
 */
export interface WindowDefInfo {
    /** Closing RIGHT_PAREN token index */
    closeParenIndex: number;
    /** ORDER BY token index, or null when absent */
    orderByTokenIndex: number | null;
    /** Window frame token index, or null when absent */
    windowFrameTokenIndex: number | null;
    /** Total span length for line width calculation */
    spanLength: number;
    /** Functions nested inside this window definition */
    nestedFunctions: NestedFunctionInfo[];
}
/**
 * Information about a PIVOT/UNPIVOT clause that may need expansion.
 */
export interface PivotInfo {
    /** Opening LEFT_PAREN token index */
    openParenIndex: number;
    /** Closing RIGHT_PAREN token index */
    closeParenIndex: number;
    /** Comma indices in the aggregates list (before FOR) */
    aggregateCommaIndices: number[];
    /** FOR keyword token index */
    forKeywordIndex: number | null;
    /** IN keyword token index */
    inKeywordIndex: number | null;
    /** Comma indices in the IN list */
    inListCommaIndices: number[];
    /** Total span length for line width calculation */
    spanLength: number;
    /** Whether this is UNPIVOT (vs PIVOT) */
    isUnpivot: boolean;
}
/**
 * Information about an IN list (WHERE IN or PIVOT IN) for wrapping.
 */
export interface InListInfo {
    /** Opening LEFT_PAREN token index (after IN keyword) */
    openParenIndex: number;
    /** Closing RIGHT_PAREN token index */
    closeParenIndex: number;
    /** Comma indices in the IN list */
    commaIndices: number[];
    /** Whether this is inside a PIVOT clause */
    isInPivot: boolean;
}
/**
 * Information about a simple query that can stay compact (on one line).
 * A query is simple if it has single-item clauses and fits within line width.
 */
export interface SimpleQueryInfo {
    /** Token index of the SELECT keyword */
    selectTokenIndex: number;
    /** Total span length of the entire query */
    spanLength: number;
    /** Subquery depth (0 = main query) */
    depth: number;
}
|
|
80
|
+
/**
 * Complete result from ParseTreeAnalyzer.
 * Contains all token positions that need special handling during formatting.
 *
 * NOTE(review): field names suggest the Set<number>/Map<number, ...> values
 * are indices into the token stream — confirm against ParseTreeAnalyzer.
 */
export interface AnalyzerResult {
    // --- General token classification ---
    identifierTokens: Set<number>;
    functionCallTokens: Set<number>;
    clauseStartTokens: Set<number>;
    qualifiedNameTokens: Set<number>;
    // --- List handling (commas, first items, multi-item clauses) ---
    listItemCommas: Set<number>;
    listFirstItems: Set<number>;
    multiItemClauses: Set<number>;
    // --- Condition handling (WHERE/HAVING/ON operators, BETWEEN..AND) ---
    conditionOperators: Set<number>;
    multilineConditionClauses: Set<number>;
    betweenAndTokens: Set<number>;
    // --- Nesting: depth per token, subquery and set-operation parens ---
    tokenDepthMap: Map<number, number>;
    subqueryOpenParens: Set<number>;
    subqueryCloseParens: Set<number>;
    setOperandParens: Set<number>;
    // --- Aliases and joins ---
    aliasInsertPositions: Set<number>;
    tableAliasAsTokens: Set<number>;
    joinOnTokens: Set<number>;
    // --- CTEs ---
    cteCommas: Set<number>;
    cteMainSelectTokens: Set<number>;
    // --- DDL column lists ---
    ddlColumnCommas: Set<number>;
    ddlOpenParens: Set<number>;
    ddlCloseParens: Set<number>;
    ddlFirstColumn: Set<number>;
    ddlMultiColumn: Set<number>;
    // --- VALUES and SET clauses ---
    valuesCommas: Set<number>;
    valuesHasTuples: boolean;
    setClauseCommas: Set<number>;
    setKeywordToken: number;
    // --- CASE expressions ---
    multiWhenCaseTokens: Set<number>;
    caseWhenTokens: Set<number>;
    caseElseTokens: Set<number>;
    caseEndTokens: Set<number>;
    simpleCaseTokens: Set<number>;
    simpleCaseValueEndTokens: Set<number>;
    // --- Miscellaneous clauses ---
    groupingAnalyticsParens: Set<number>;
    exceptClauseTokens: Set<number>;
    setConfigTokens: Set<number>;
    // --- MERGE statements ---
    mergeUsingTokens: Set<number>;
    mergeOnTokens: Set<number>;
    mergeWhenTokens: Set<number>;
    // --- LATERAL VIEW / GROUP BY ALL ---
    lateralViewCommas: Set<number>;
    groupByAllTokens: Set<number>;
    // --- Per-construct expansion info, keyed by token index ---
    multiArgFunctionInfo: Map<number, MultiArgFunctionInfo>;
    windowDefInfo: Map<number, WindowDefInfo>;
    pivotInfo: Map<number, PivotInfo>;
    inListInfo: Map<number, InListInfo>;
    simpleQueries: Map<number, SimpleQueryInfo>;
}
|
|
133
|
+
/**
 * Tracks the current state during token formatting.
 * Used to determine spacing, newlines, and indentation.
 */
export interface FormattingState {
    // --- Nesting depth counters ---
    subqueryDepth: number;
    ddlDepth: number;
    caseDepth: number;
    insideParens: number;
    insideFunctionArgs: number;
    complexTypeDepth: number;
    // --- Output position ---
    currentColumn: number;
    isFirstNonWsToken: boolean;
    // --- "Just saw clause keyword" flags ---
    afterSelectKeyword: boolean;
    afterGroupByKeyword: boolean;
    afterOrderByKeyword: boolean;
    afterWhereKeyword: boolean;
    afterHavingKeyword: boolean;
    afterSetKeyword: boolean;
    afterValuesKeyword: boolean;
    // --- List state within the current clause ---
    currentClauseIsMultiItem: boolean;
    isFirstListItem: boolean;
    justOutputCommaFirstStyle: boolean;
    // --- Previous-token memory (drives spacing decisions) ---
    prevWasFunctionName: boolean;
    prevWasBuiltInFunctionKeyword: boolean;
    prevTokenText: string;
    prevTokenType: number;
    prevTokenWasUnaryOperator: boolean;
    // --- Hint capture (/*+ ... */) — NOTE(review): confirm hint syntax handled ---
    insideHint: boolean;
    hintContent: string[];
    // --- One-shot flags: a newline was just emitted for these constructs ---
    justOutputMultiArgFunctionNewline: boolean;
    justOutputWindowNewline: boolean;
    justOutputPivotNewline: boolean;
    /** Current column where IN list content started (after open paren) */
    inListContentStartColumn: number | null;
    /** Whether we're currently inside an IN list */
    insideInList: boolean;
    /** Just output an IN list wrap newline - skip space before next token */
    justOutputInListWrapNewline: boolean;
    /** Stack of compact query state per subquery level */
    compactQueryStack: Array<{
        isCompact: boolean;
        depth: number;
    }>;
}
|
|
178
|
+
/**
 * Represents an expanded multi-arg function on the stack.
 */
export interface ExpandedFunction {
    /** Closing RIGHT_PAREN token index */
    closeParenIndex: number;
    /** Token indices of argument-separating commas */
    commaIndices: Set<number>;
    /** Depth for indentation */
    depth: number;
    /** Column where the function call started */
    openingColumn: number;
    /** Whether the first argument is itself a chained function call */
    firstArgIsChainedFunc: boolean;
    /** Function name, when known */
    functionName?: string;
    /** For STACK: indices of commas that should NOT get newlines (every other comma) */
    skipNewlineCommas?: Set<number>;
}
/**
 * Represents an expanded window definition.
 */
export interface ExpandedWindow {
    /** Closing RIGHT_PAREN token index */
    closeParenIndex: number;
    /** ORDER BY token index, or null when absent */
    orderByTokenIndex: number | null;
    /** Window frame token index, or null when absent */
    windowFrameTokenIndex: number | null;
    /** Base depth for indentation of this window */
    baseDepth: number;
}
/**
 * Represents an expanded PIVOT/UNPIVOT clause.
 */
export interface ExpandedPivot {
    closeParenIndex: number;
    /** Commas in aggregate list */
    aggregateCommaIndices: Set<number>;
    /** FOR keyword index */
    forKeywordIndex: number | null;
    /** IN keyword index */
    inKeywordIndex: number | null;
    /** Commas in IN list */
    inListCommaIndices: Set<number>;
    /** Depth for indentation */
    depth: number;
    /** Column where PIVOT started */
    openingColumn: number;
}
/**
 * Represents a pending comment to be output.
 */
export interface PendingComment {
    /** Original comment text */
    text: string;
    /** Lexer token type of the comment */
    type: number;
    /** Whether the comment occupied its own line in the input */
    wasOnOwnLine: boolean;
    /** True if there was a blank line before this comment (for preserving paragraph breaks) */
    hadBlankLineBefore: boolean;
}
|
|
228
|
+
/**
 * Context information for a single token during formatting.
 *
 * The boolean flags are named in parallel with the AnalyzerResult sets;
 * presumably each flag records membership of this token in the
 * corresponding set — confirm against the formatter.
 */
export interface TokenContext {
    // --- Token identity ---
    tokenIndex: number;
    tokenType: number;
    text: string;
    symbolicName: string | null;
    // --- Classification ---
    isInIdentifierContext: boolean;
    isFunctionCall: boolean;
    isClauseStart: boolean;
    isListComma: boolean;
    // --- Conditions ---
    isConditionOperator: boolean;
    isBetweenAnd: boolean;
    isJoinOn: boolean;
    // --- Subqueries and set operations ---
    isSubqueryOpenParen: boolean;
    isSubqueryCloseParen: boolean;
    isSetOperandParen: boolean;
    // --- CTEs and DDL ---
    isCteComma: boolean;
    isDdlComma: boolean;
    isDdlOpenParen: boolean;
    isDdlCloseParen: boolean;
    isDdlMultiColumn: boolean;
    // --- VALUES / SET / LATERAL VIEW ---
    isValuesComma: boolean;
    isSetComma: boolean;
    isSetKeyword: boolean;
    isLateralViewComma: boolean;
    // --- CASE expressions ---
    isMultiWhenCase: boolean;
    isCaseWhen: boolean;
    isCaseElse: boolean;
    isCaseEnd: boolean;
    // --- MERGE statements ---
    isMergeUsing: boolean;
    isMergeOn: boolean;
    isMergeWhen: boolean;
    // --- Expansion info attached to this token, if any ---
    multiArgFuncInfo: MultiArgFunctionInfo | undefined;
    windowDefInfo: WindowDefInfo | undefined;
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
 * Formatter Types
 *
 * Common interfaces for language formatters.
 * Designed to be extensible for future language support.
 */
/** Options that can be passed to any formatter */
export interface FormatterOptions {
    /** Maximum line width (default: 140) */
    lineWidth?: number;
    /** Additional offset to subtract from line width (e.g., for MAGIC prefixes) */
    lineWidthOffset?: number;
    /** Suppress multiline expansion (noqa:expansion) */
    suppressExpansion?: boolean;
}
/** Result of a format operation */
export interface FormatResult {
    /** The formatted code */
    formatted: string;
    /** Whether the code was changed */
    changed: boolean;
    /** Error message if formatting failed; absent on success */
    error?: string;
}
/** Common interface for all language formatters */
export interface LanguageFormatter {
    /** Language identifier (e.g., 'sql', 'python', 'scala') */
    readonly language: string;
    /** Human-readable name */
    readonly displayName: string;
    /** Whether the formatter is initialized and ready */
    isReady(): boolean;
    /** Initialize the formatter (load WASM, etc.) */
    initialize(): Promise<void>;
    /** Format code with optional options */
    format(code: string, options?: FormatterOptions): FormatResult;
    /** Check if code needs formatting without modifying it */
    needsFormatting(code: string, options?: FormatterOptions): boolean;
}
/** Configuration for a language formatter */
export interface FormatterConfig {
    /** Whether this formatter is enabled */
    enabled: boolean;
    /** Formatter-specific options */
    options: Record<string, unknown>;
}
/** Registry of all available formatters */
export interface FormatterRegistry {
    /** Get a formatter by language identifier */
    get(language: string): LanguageFormatter | undefined;
    /** Register a new formatter */
    register(formatter: LanguageFormatter): void;
    /** List all registered language identifiers */
    languages(): string[];
    /** Initialize all formatters */
    initializeAll(): Promise<void>;
}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* sparkfmt - Spark SQL & Python Formatter
|
|
3
|
+
*
|
|
4
|
+
* A unified formatter for Spark SQL and Python code, designed for
|
|
5
|
+
* Microsoft Fabric notebooks and CI/CD pipelines.
|
|
6
|
+
*
|
|
7
|
+
* Architecture:
|
|
8
|
+
* - formatters/sql/: Core SQL formatting (ANTLR grammar-driven)
|
|
9
|
+
* - formatters/python/: Python formatting via Ruff WASM
|
|
10
|
+
* - notebook-formatter.ts: Fabric notebook handling
|
|
11
|
+
*/
|
|
12
|
+
export { formatSql, needsFormatting } from './formatters/sparksql/index.js';
|
|
13
|
+
export { getFormatterRegistry, detectLanguage, SqlFormatter, getSqlFormatter, isSqlCode, PythonFormatter, getPythonFormatter, isPythonCode, type LanguageFormatter, type FormatterOptions, type FormatResult, type FormatterConfig, type FormatterRegistry, } from './formatters/index.js';
|
|
14
|
+
export { formatCell, formatCellSync, initializePythonFormatter, isPythonFormatterReady, type FormatCellResult, type CellType, } from './cell-formatter.js';
|
|
15
|
+
export { parseNotebook, formatNotebook, type NotebookCell, type FabricNotebook, type FormatStats, } from './notebook-formatter.js';
|
|
16
|
+
export { DEFAULT_RUFF_CONFIG, RUFF_WASM_CONFIG, type RuffConfig, type RuffFormatConfig, type WasmInitOptions, } from './formatters/python/index.js';
|
|
17
|
+
export { hasFormatOff, detectCollapseDirectives, hasCollapseDirective, type FormatDirectiveInfo } from './formatters/sparksql/index.js';
|
|
18
|
+
export type { AnalyzerResult, FormattingState, MultiArgFunctionInfo, WindowDefInfo, TokenContext, PendingComment, ExpandedFunction, ExpandedWindow } from './formatters/sparksql/types.js';
|