sqlspec 0.16.1__cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of sqlspec might be problematic.

Files changed (148)
  1. 51ff5a9eadfdefd49f98__mypyc.cpython-39-aarch64-linux-gnu.so +0 -0
  2. sqlspec/__init__.py +92 -0
  3. sqlspec/__main__.py +12 -0
  4. sqlspec/__metadata__.py +14 -0
  5. sqlspec/_serialization.py +77 -0
  6. sqlspec/_sql.py +1780 -0
  7. sqlspec/_typing.py +680 -0
  8. sqlspec/adapters/__init__.py +0 -0
  9. sqlspec/adapters/adbc/__init__.py +5 -0
  10. sqlspec/adapters/adbc/_types.py +12 -0
  11. sqlspec/adapters/adbc/config.py +361 -0
  12. sqlspec/adapters/adbc/driver.py +512 -0
  13. sqlspec/adapters/aiosqlite/__init__.py +19 -0
  14. sqlspec/adapters/aiosqlite/_types.py +13 -0
  15. sqlspec/adapters/aiosqlite/config.py +253 -0
  16. sqlspec/adapters/aiosqlite/driver.py +248 -0
  17. sqlspec/adapters/asyncmy/__init__.py +19 -0
  18. sqlspec/adapters/asyncmy/_types.py +12 -0
  19. sqlspec/adapters/asyncmy/config.py +180 -0
  20. sqlspec/adapters/asyncmy/driver.py +274 -0
  21. sqlspec/adapters/asyncpg/__init__.py +21 -0
  22. sqlspec/adapters/asyncpg/_types.py +17 -0
  23. sqlspec/adapters/asyncpg/config.py +229 -0
  24. sqlspec/adapters/asyncpg/driver.py +344 -0
  25. sqlspec/adapters/bigquery/__init__.py +18 -0
  26. sqlspec/adapters/bigquery/_types.py +12 -0
  27. sqlspec/adapters/bigquery/config.py +298 -0
  28. sqlspec/adapters/bigquery/driver.py +558 -0
  29. sqlspec/adapters/duckdb/__init__.py +22 -0
  30. sqlspec/adapters/duckdb/_types.py +12 -0
  31. sqlspec/adapters/duckdb/config.py +504 -0
  32. sqlspec/adapters/duckdb/driver.py +368 -0
  33. sqlspec/adapters/oracledb/__init__.py +32 -0
  34. sqlspec/adapters/oracledb/_types.py +14 -0
  35. sqlspec/adapters/oracledb/config.py +317 -0
  36. sqlspec/adapters/oracledb/driver.py +538 -0
  37. sqlspec/adapters/psqlpy/__init__.py +16 -0
  38. sqlspec/adapters/psqlpy/_types.py +11 -0
  39. sqlspec/adapters/psqlpy/config.py +214 -0
  40. sqlspec/adapters/psqlpy/driver.py +530 -0
  41. sqlspec/adapters/psycopg/__init__.py +32 -0
  42. sqlspec/adapters/psycopg/_types.py +17 -0
  43. sqlspec/adapters/psycopg/config.py +426 -0
  44. sqlspec/adapters/psycopg/driver.py +796 -0
  45. sqlspec/adapters/sqlite/__init__.py +15 -0
  46. sqlspec/adapters/sqlite/_types.py +11 -0
  47. sqlspec/adapters/sqlite/config.py +240 -0
  48. sqlspec/adapters/sqlite/driver.py +294 -0
  49. sqlspec/base.py +571 -0
  50. sqlspec/builder/__init__.py +62 -0
  51. sqlspec/builder/_base.py +473 -0
  52. sqlspec/builder/_column.py +320 -0
  53. sqlspec/builder/_ddl.py +1346 -0
  54. sqlspec/builder/_ddl_utils.py +103 -0
  55. sqlspec/builder/_delete.py +76 -0
  56. sqlspec/builder/_insert.py +256 -0
  57. sqlspec/builder/_merge.py +71 -0
  58. sqlspec/builder/_parsing_utils.py +140 -0
  59. sqlspec/builder/_select.py +170 -0
  60. sqlspec/builder/_update.py +188 -0
  61. sqlspec/builder/mixins/__init__.py +55 -0
  62. sqlspec/builder/mixins/_cte_and_set_ops.py +222 -0
  63. sqlspec/builder/mixins/_delete_operations.py +41 -0
  64. sqlspec/builder/mixins/_insert_operations.py +244 -0
  65. sqlspec/builder/mixins/_join_operations.py +122 -0
  66. sqlspec/builder/mixins/_merge_operations.py +476 -0
  67. sqlspec/builder/mixins/_order_limit_operations.py +135 -0
  68. sqlspec/builder/mixins/_pivot_operations.py +153 -0
  69. sqlspec/builder/mixins/_select_operations.py +603 -0
  70. sqlspec/builder/mixins/_update_operations.py +187 -0
  71. sqlspec/builder/mixins/_where_clause.py +621 -0
  72. sqlspec/cli.py +247 -0
  73. sqlspec/config.py +395 -0
  74. sqlspec/core/__init__.py +63 -0
  75. sqlspec/core/cache.cpython-39-aarch64-linux-gnu.so +0 -0
  76. sqlspec/core/cache.py +871 -0
  77. sqlspec/core/compiler.cpython-39-aarch64-linux-gnu.so +0 -0
  78. sqlspec/core/compiler.py +417 -0
  79. sqlspec/core/filters.cpython-39-aarch64-linux-gnu.so +0 -0
  80. sqlspec/core/filters.py +830 -0
  81. sqlspec/core/hashing.cpython-39-aarch64-linux-gnu.so +0 -0
  82. sqlspec/core/hashing.py +310 -0
  83. sqlspec/core/parameters.cpython-39-aarch64-linux-gnu.so +0 -0
  84. sqlspec/core/parameters.py +1237 -0
  85. sqlspec/core/result.cpython-39-aarch64-linux-gnu.so +0 -0
  86. sqlspec/core/result.py +677 -0
  87. sqlspec/core/splitter.cpython-39-aarch64-linux-gnu.so +0 -0
  88. sqlspec/core/splitter.py +819 -0
  89. sqlspec/core/statement.cpython-39-aarch64-linux-gnu.so +0 -0
  90. sqlspec/core/statement.py +676 -0
  91. sqlspec/driver/__init__.py +19 -0
  92. sqlspec/driver/_async.py +502 -0
  93. sqlspec/driver/_common.py +631 -0
  94. sqlspec/driver/_sync.py +503 -0
  95. sqlspec/driver/mixins/__init__.py +6 -0
  96. sqlspec/driver/mixins/_result_tools.py +193 -0
  97. sqlspec/driver/mixins/_sql_translator.py +86 -0
  98. sqlspec/exceptions.py +193 -0
  99. sqlspec/extensions/__init__.py +0 -0
  100. sqlspec/extensions/aiosql/__init__.py +10 -0
  101. sqlspec/extensions/aiosql/adapter.py +461 -0
  102. sqlspec/extensions/litestar/__init__.py +6 -0
  103. sqlspec/extensions/litestar/_utils.py +52 -0
  104. sqlspec/extensions/litestar/cli.py +48 -0
  105. sqlspec/extensions/litestar/config.py +92 -0
  106. sqlspec/extensions/litestar/handlers.py +260 -0
  107. sqlspec/extensions/litestar/plugin.py +145 -0
  108. sqlspec/extensions/litestar/providers.py +454 -0
  109. sqlspec/loader.cpython-39-aarch64-linux-gnu.so +0 -0
  110. sqlspec/loader.py +760 -0
  111. sqlspec/migrations/__init__.py +35 -0
  112. sqlspec/migrations/base.py +414 -0
  113. sqlspec/migrations/commands.py +443 -0
  114. sqlspec/migrations/loaders.py +402 -0
  115. sqlspec/migrations/runner.py +213 -0
  116. sqlspec/migrations/tracker.py +140 -0
  117. sqlspec/migrations/utils.py +129 -0
  118. sqlspec/protocols.py +407 -0
  119. sqlspec/py.typed +0 -0
  120. sqlspec/storage/__init__.py +23 -0
  121. sqlspec/storage/backends/__init__.py +0 -0
  122. sqlspec/storage/backends/base.py +163 -0
  123. sqlspec/storage/backends/fsspec.py +386 -0
  124. sqlspec/storage/backends/obstore.py +459 -0
  125. sqlspec/storage/capabilities.py +102 -0
  126. sqlspec/storage/registry.py +239 -0
  127. sqlspec/typing.py +299 -0
  128. sqlspec/utils/__init__.py +3 -0
  129. sqlspec/utils/correlation.py +150 -0
  130. sqlspec/utils/deprecation.py +106 -0
  131. sqlspec/utils/fixtures.cpython-39-aarch64-linux-gnu.so +0 -0
  132. sqlspec/utils/fixtures.py +58 -0
  133. sqlspec/utils/logging.py +127 -0
  134. sqlspec/utils/module_loader.py +89 -0
  135. sqlspec/utils/serializers.py +4 -0
  136. sqlspec/utils/singleton.py +32 -0
  137. sqlspec/utils/sync_tools.cpython-39-aarch64-linux-gnu.so +0 -0
  138. sqlspec/utils/sync_tools.py +237 -0
  139. sqlspec/utils/text.cpython-39-aarch64-linux-gnu.so +0 -0
  140. sqlspec/utils/text.py +96 -0
  141. sqlspec/utils/type_guards.cpython-39-aarch64-linux-gnu.so +0 -0
  142. sqlspec/utils/type_guards.py +1139 -0
  143. sqlspec-0.16.1.dist-info/METADATA +365 -0
  144. sqlspec-0.16.1.dist-info/RECORD +148 -0
  145. sqlspec-0.16.1.dist-info/WHEEL +7 -0
  146. sqlspec-0.16.1.dist-info/entry_points.txt +2 -0
  147. sqlspec-0.16.1.dist-info/licenses/LICENSE +21 -0
  148. sqlspec-0.16.1.dist-info/licenses/NOTICE +29 -0
@@ -0,0 +1,819 @@
+ """SQL statement splitter with caching and dialect support.
+
+ This module provides a SQL script statement splitter with caching and
+ multiple dialect support.
+
+ Components:
+ - StatementSplitter: SQL splitter with caching
+ - DialectConfig: Dialect configuration system
+ - Token/TokenType: Tokenization system
+ - Caching: LRU caching for split results
+ - Pattern compilation caching
+
+ Features:
+ - Support for multiple SQL dialects (Oracle, T-SQL, PostgreSQL, MySQL, SQLite, DuckDB, BigQuery)
+ - Cached pattern compilation
+ - LRU caching for split results
+ - Optimized tokenization
+ - Complete preservation of split_sql_script function
+ """
+
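For orientation, a minimal usage sketch of the module's public entry point, split_sql_script (defined near the end of this file); the SQL text here is illustrative, not from the package:

    from sqlspec.core.splitter import split_sql_script

    # Two plain statements split on ";" via the SQLite dialect config.
    parts = split_sql_script("CREATE TABLE t (id INT); INSERT INTO t VALUES (1);", dialect="sqlite")
    # parts == ["CREATE TABLE t (id INT);", "INSERT INTO t VALUES (1);"]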
+ import re
+ import threading
+ from abc import ABC, abstractmethod
+ from collections.abc import Generator
+ from enum import Enum
+ from re import Pattern
+ from typing import Any, Callable, Optional, Union
+
+ from mypy_extensions import mypyc_attr
+ from typing_extensions import TypeAlias
+
+ from sqlspec.core.cache import CacheKey, UnifiedCache
+ from sqlspec.utils.logging import get_logger
+
+ __all__ = (
+     "DialectConfig",
+     "OracleDialectConfig",
+     "PostgreSQLDialectConfig",
+     "StatementSplitter",
+     "TSQLDialectConfig",
+     "Token",
+     "TokenType",
+     "split_sql_script",
+ )
+
+ logger = get_logger("sqlspec.core.splitter")
+
+ DEFAULT_PATTERN_CACHE_SIZE = 1000  # Compiled regex patterns
+ DEFAULT_RESULT_CACHE_SIZE = 5000  # Split results
+ DEFAULT_CACHE_TTL = 3600  # 1 hour TTL
+
+ DIALECT_CONFIG_SLOTS = (
+     "_block_starters",
+     "_block_enders",
+     "_statement_terminators",
+     "_batch_separators",
+     "_special_terminators",
+     "_max_nesting_depth",
+     "_name",
+ )
+
+ TOKEN_SLOTS = ("type", "value", "line", "column", "position")
+
+ SPLITTER_SLOTS = (
+     "_dialect",
+     "_strip_trailing_semicolon",
+     "_token_patterns",
+     "_compiled_patterns",
+     "_pattern_cache_key",
+     "_result_cache",
+     "_pattern_cache",
+ )
+
+
+ class TokenType(Enum):
+     """Types of tokens recognized by the SQL lexer."""
+
+     COMMENT_LINE = "COMMENT_LINE"
+     COMMENT_BLOCK = "COMMENT_BLOCK"
+     STRING_LITERAL = "STRING_LITERAL"
+     QUOTED_IDENTIFIER = "QUOTED_IDENTIFIER"
+     KEYWORD = "KEYWORD"
+     TERMINATOR = "TERMINATOR"
+     BATCH_SEPARATOR = "BATCH_SEPARATOR"
+     WHITESPACE = "WHITESPACE"
+     OTHER = "OTHER"
+
+
+ @mypyc_attr(allow_interpreted_subclasses=True)
+ class Token:
+     """SQL token with metadata."""
+
+     __slots__ = TOKEN_SLOTS
+
+     def __init__(self, type: TokenType, value: str, line: int, column: int, position: int) -> None:
+         self.type = type
+         self.value = value
+         self.line = line
+         self.column = column
+         self.position = position
+
+     def __repr__(self) -> str:
+         return f"Token({self.type.value}, {self.value!r}, {self.line}:{self.column})"
+
+
+ TokenHandler: TypeAlias = Callable[[str, int, int, int], Optional[Token]]
+ TokenPattern: TypeAlias = Union[str, TokenHandler]
+ CompiledTokenPattern: TypeAlias = Union[Pattern[str], TokenHandler]
+
+
+ @mypyc_attr(allow_interpreted_subclasses=True)
+ class DialectConfig(ABC):
+     """Abstract base class for SQL dialect configurations."""
+
+     __slots__ = DIALECT_CONFIG_SLOTS
+
+     def __init__(self) -> None:
+         """Initialize dialect configuration."""
+         self._name: Optional[str] = None
+         self._block_starters: Optional[set[str]] = None
+         self._block_enders: Optional[set[str]] = None
+         self._statement_terminators: Optional[set[str]] = None
+         self._batch_separators: Optional[set[str]] = None
+         self._special_terminators: Optional[dict[str, Callable[[list[Token], int], bool]]] = None
+         self._max_nesting_depth: Optional[int] = None
+
+     @property
+     @abstractmethod
+     def name(self) -> str:
+         """Name of the dialect (e.g., 'oracle', 'tsql')."""
+
+     @property
+     @abstractmethod
+     def block_starters(self) -> set[str]:
+         """Keywords that start a block (e.g., BEGIN, DECLARE)."""
+
+     @property
+     @abstractmethod
+     def block_enders(self) -> set[str]:
+         """Keywords that end a block (e.g., END)."""
+
+     @property
+     @abstractmethod
+     def statement_terminators(self) -> set[str]:
+         """Characters that terminate statements (e.g., ;)."""
+
+     @property
+     def batch_separators(self) -> set[str]:
+         """Keywords that separate batches (e.g., GO for T-SQL)."""
+         if self._batch_separators is None:
+             self._batch_separators = set()
+         return self._batch_separators
+
+     @property
+     def special_terminators(self) -> dict[str, Callable[[list[Token], int], bool]]:
+         """Special terminators that need custom handling."""
+         if self._special_terminators is None:
+             self._special_terminators = {}
+         return self._special_terminators
+
+     @property
+     def max_nesting_depth(self) -> int:
+         """Maximum allowed nesting depth for blocks."""
+         if self._max_nesting_depth is None:
+             self._max_nesting_depth = 256
+         return self._max_nesting_depth
+
+     def get_all_token_patterns(self) -> list[tuple[TokenType, TokenPattern]]:
+         """Assembles the complete, ordered list of token regex patterns."""
+         patterns: list[tuple[TokenType, TokenPattern]] = [
+             (TokenType.COMMENT_LINE, r"--[^\n]*"),
+             (TokenType.COMMENT_BLOCK, r"/\*[\s\S]*?\*/"),
+             (TokenType.STRING_LITERAL, r"'(?:[^']|'')*'"),
+             (TokenType.QUOTED_IDENTIFIER, r'"[^"]*"|\[[^\]]*\]'),
+         ]
+
+         patterns.extend(self._get_dialect_specific_patterns())
+
+         all_keywords = self.block_starters | self.block_enders | self.batch_separators
+         if all_keywords:
+             sorted_keywords = sorted(all_keywords, key=len, reverse=True)
+             patterns.append((TokenType.KEYWORD, r"\b(" + "|".join(re.escape(kw) for kw in sorted_keywords) + r")\b"))
+
+         all_terminators = self.statement_terminators | set(self.special_terminators.keys())
+         if all_terminators:
+             patterns.append((TokenType.TERMINATOR, "|".join(re.escape(t) for t in all_terminators)))
+
+         patterns.extend([(TokenType.WHITESPACE, r"\s+"), (TokenType.OTHER, r".")])
+
+         return patterns
+
+     def _get_dialect_specific_patterns(self) -> list[tuple[TokenType, TokenPattern]]:
+         """Override to add dialect-specific token patterns."""
+         return []
+
+     @staticmethod
+     def is_real_block_ender(tokens: list[Token], current_pos: int) -> bool:  # noqa: ARG004
+         """Check if this END keyword is actually a block ender."""
+         return True
+
+     def should_delay_semicolon_termination(self, tokens: list[Token], current_pos: int) -> bool:
+         """Check if semicolon termination should be delayed."""
+         return False
+
+
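Concrete dialects below only fill in the four abstract properties; batch_separators, special_terminators, max_nesting_depth, is_real_block_ender, and should_delay_semicolon_termination all have usable defaults. A hypothetical minimal subclass, shown only to illustrate the contract (FirebirdDialectConfig is not part of this release):

    from sqlspec.core.splitter import DialectConfig

    class FirebirdDialectConfig(DialectConfig):  # hypothetical example
        @property
        def name(self) -> str:
            return "firebird"

        @property
        def block_starters(self) -> set[str]:
            return {"BEGIN"}

        @property
        def block_enders(self) -> set[str]:
            return {"END"}

        @property
        def statement_terminators(self) -> set[str]:
            return {";"}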
+ class OracleDialectConfig(DialectConfig):
+     """Configuration for Oracle PL/SQL dialect."""
+
+     @property
+     def name(self) -> str:
+         if self._name is None:
+             self._name = "oracle"
+         return self._name
+
+     @property
+     def block_starters(self) -> set[str]:
+         if self._block_starters is None:
+             self._block_starters = {"BEGIN", "DECLARE", "CASE"}
+         return self._block_starters
+
+     @property
+     def block_enders(self) -> set[str]:
+         if self._block_enders is None:
+             self._block_enders = {"END"}
+         return self._block_enders
+
+     @property
+     def statement_terminators(self) -> set[str]:
+         if self._statement_terminators is None:
+             self._statement_terminators = {";"}
+         return self._statement_terminators
+
+     @property
+     def special_terminators(self) -> dict[str, Callable[[list[Token], int], bool]]:
+         if self._special_terminators is None:
+             self._special_terminators = {"/": self._handle_slash_terminator}
+         return self._special_terminators
+
+     def should_delay_semicolon_termination(self, tokens: list[Token], current_pos: int) -> bool:
+         """Check if we should delay semicolon termination to look for a slash."""
+         pos = current_pos - 1
+         while pos >= 0:
+             token = tokens[pos]
+             if token.type == TokenType.WHITESPACE:
+                 pos -= 1
+                 continue
+             if token.type == TokenType.KEYWORD and token.value.upper() == "END":
+                 return self._has_upcoming_slash(tokens, current_pos)
+             break
+
+         return False
+
+     def _has_upcoming_slash(self, tokens: list[Token], current_pos: int) -> bool:
+         """Check if there's a / terminator coming up on its own line."""
+         pos = current_pos + 1
+         found_newline = False
+
+         while pos < len(tokens):
+             token = tokens[pos]
+             if token.type == TokenType.WHITESPACE:
+                 if "\n" in token.value:
+                     found_newline = True
+                 pos += 1
+                 continue
+             if token.type == TokenType.TERMINATOR and token.value == "/":
+                 return found_newline and self._handle_slash_terminator(tokens, pos)
+             if token.type in {TokenType.COMMENT_LINE, TokenType.COMMENT_BLOCK}:
+                 pos += 1
+                 continue
+             break
+
+         return False
+
+     @staticmethod
+     def is_real_block_ender(tokens: list[Token], current_pos: int) -> bool:
+         """Check if this END keyword is actually a block ender for Oracle PL/SQL."""
+         pos = current_pos + 1
+         while pos < len(tokens):
+             next_token = tokens[pos]
+
+             if next_token.type == TokenType.WHITESPACE:
+                 pos += 1
+                 continue
+             if next_token.type == TokenType.OTHER:
+                 word_chars = []
+                 word_pos = pos
+                 while word_pos < len(tokens) and tokens[word_pos].type == TokenType.OTHER:
+                     word_chars.append(tokens[word_pos].value)
+                     word_pos += 1
+
+                 word = "".join(word_chars).upper()
+                 if word in {"IF", "LOOP", "CASE", "WHILE"}:
+                     return False
+             break
+         return True
+
+     @staticmethod
+     def _handle_slash_terminator(tokens: list[Token], current_pos: int) -> bool:
+         """Oracle / must be on its own line after whitespace only."""
+         if current_pos == 0:
+             return True
+
+         pos = current_pos - 1
+         while pos >= 0:
+             token = tokens[pos]
+             if "\n" in token.value:
+                 break
+             if token.type not in {TokenType.WHITESPACE, TokenType.COMMENT_LINE}:
+                 return False
+             pos -= 1
+
+         return True
+
+
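A quick illustration of the Oracle rules above (illustrative SQL, not from the package): the semicolon after END is delayed because a slash follows on its own line, so the PL/SQL block stays intact:

    from sqlspec.core.splitter import split_sql_script

    plsql = """
    BEGIN
        INSERT INTO logs VALUES (1);
    END;
    /
    SELECT * FROM logs;
    """
    parts = split_sql_script(plsql, dialect="oracle")
    # parts[0] is the whole BEGIN ... END; block terminated by the "/";
    # parts[1] == "SELECT * FROM logs;"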
+ class TSQLDialectConfig(DialectConfig):
+     """Configuration for T-SQL (SQL Server) dialect."""
+
+     @property
+     def name(self) -> str:
+         if self._name is None:
+             self._name = "tsql"
+         return self._name
+
+     @property
+     def block_starters(self) -> set[str]:
+         if self._block_starters is None:
+             self._block_starters = {"BEGIN", "TRY"}
+         return self._block_starters
+
+     @property
+     def block_enders(self) -> set[str]:
+         if self._block_enders is None:
+             self._block_enders = {"END", "CATCH"}
+         return self._block_enders
+
+     @property
+     def statement_terminators(self) -> set[str]:
+         if self._statement_terminators is None:
+             self._statement_terminators = {";"}
+         return self._statement_terminators
+
+     @property
+     def batch_separators(self) -> set[str]:
+         if self._batch_separators is None:
+             self._batch_separators = {"GO"}
+         return self._batch_separators
+
+
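For T-SQL, defining batch_separators changes the splitting twice over: GO ends a batch, and plain semicolons are always delayed (see the batch_separators check in _do_split below), so they never split on their own. A sketch with illustrative input:

    from sqlspec.core.splitter import split_sql_script

    tsql = """
    CREATE TABLE t (id INT);
    GO
    INSERT INTO t VALUES (1);
    GO
    """
    parts = split_sql_script(tsql, dialect="tsql")
    # len(parts) == 2; each batch keeps its trailing GO keyword, since only
    # TERMINATOR tokens (not KEYWORD separators) are ever stripped.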
+ class PostgreSQLDialectConfig(DialectConfig):
+     """Configuration for PostgreSQL dialect with dollar-quoted strings."""
+
+     @property
+     def name(self) -> str:
+         if self._name is None:
+             self._name = "postgresql"
+         return self._name
+
+     @property
+     def block_starters(self) -> set[str]:
+         if self._block_starters is None:
+             self._block_starters = {"BEGIN", "DECLARE", "CASE", "DO"}
+         return self._block_starters
+
+     @property
+     def block_enders(self) -> set[str]:
+         if self._block_enders is None:
+             self._block_enders = {"END"}
+         return self._block_enders
+
+     @property
+     def statement_terminators(self) -> set[str]:
+         if self._statement_terminators is None:
+             self._statement_terminators = {";"}
+         return self._statement_terminators
+
+     def _get_dialect_specific_patterns(self) -> list[tuple[TokenType, TokenPattern]]:
+         """Add PostgreSQL-specific patterns like dollar-quoted strings."""
+         return [(TokenType.STRING_LITERAL, self._handle_dollar_quoted_string)]
+
+     @staticmethod
+     def _handle_dollar_quoted_string(text: str, position: int, line: int, column: int) -> Optional[Token]:
+         """Handle PostgreSQL dollar-quoted strings like $tag$...$tag$."""
+         start_match = re.match(r"\$([a-zA-Z_][a-zA-Z0-9_]*)?\$", text[position:])
+         if not start_match:
+             return None
+
+         tag = start_match.group(0)
+         content_start = position + len(tag)
+
+         try:
+             content_end = text.index(tag, content_start)
+             full_value = text[position : content_end + len(tag)]
+
+             return Token(type=TokenType.STRING_LITERAL, value=full_value, line=line, column=column, position=position)
+         except ValueError:
+             return None
+
+
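The dollar-quote handler consumes $tag$ ... $tag$ as a single STRING_LITERAL token, so semicolons and BEGIN/END inside a function body cannot confuse the splitter. A sketch with illustrative input:

    from sqlspec.core.splitter import split_sql_script

    pg = """
    CREATE FUNCTION bump() RETURNS trigger AS $fn$
    BEGIN
        NEW.updated := now();
        RETURN NEW;
    END;
    $fn$ LANGUAGE plpgsql;
    SELECT 1;
    """
    parts = split_sql_script(pg, dialect="postgresql")
    # len(parts) == 2: the whole CREATE FUNCTION ... plpgsql; statement,
    # then "SELECT 1;".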
+ class GenericDialectConfig(DialectConfig):
+     """Generic SQL dialect configuration for standard SQL."""
+
+     @property
+     def name(self) -> str:
+         if self._name is None:
+             self._name = "generic"
+         return self._name
+
+     @property
+     def block_starters(self) -> set[str]:
+         if self._block_starters is None:
+             self._block_starters = {"BEGIN", "DECLARE", "CASE"}
+         return self._block_starters
+
+     @property
+     def block_enders(self) -> set[str]:
+         if self._block_enders is None:
+             self._block_enders = {"END"}
+         return self._block_enders
+
+     @property
+     def statement_terminators(self) -> set[str]:
+         if self._statement_terminators is None:
+             self._statement_terminators = {";"}
+         return self._statement_terminators
+
+
+ class MySQLDialectConfig(DialectConfig):
+     """Configuration for MySQL dialect."""
+
+     @property
+     def name(self) -> str:
+         if self._name is None:
+             self._name = "mysql"
+         return self._name
+
+     @property
+     def block_starters(self) -> set[str]:
+         if self._block_starters is None:
+             self._block_starters = {"BEGIN", "DECLARE", "CASE"}
+         return self._block_starters
+
+     @property
+     def block_enders(self) -> set[str]:
+         if self._block_enders is None:
+             self._block_enders = {"END"}
+         return self._block_enders
+
+     @property
+     def statement_terminators(self) -> set[str]:
+         if self._statement_terminators is None:
+             self._statement_terminators = {";"}
+         return self._statement_terminators
+
+     @property
+     def special_terminators(self) -> dict[str, Callable[[list[Token], int], bool]]:
+         if self._special_terminators is None:
+             self._special_terminators = {"\\g": lambda _tokens, _pos: True, "\\G": lambda _tokens, _pos: True}
+         return self._special_terminators
+
+
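A small illustration of the MySQL client-style terminators (illustrative input):

    from sqlspec.core.splitter import split_sql_script

    parts = split_sql_script("SELECT 1\\g SELECT 2;", dialect="mysql")
    # "\g" and "\G" are unconditional special terminators alongside ";",
    # so this yields ["SELECT 1\\g", "SELECT 2;"].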
+ class SQLiteDialectConfig(DialectConfig):
+     """Configuration for SQLite dialect."""
+
+     @property
+     def name(self) -> str:
+         if self._name is None:
+             self._name = "sqlite"
+         return self._name
+
+     @property
+     def block_starters(self) -> set[str]:
+         if self._block_starters is None:
+             self._block_starters = {"BEGIN", "CASE"}
+         return self._block_starters
+
+     @property
+     def block_enders(self) -> set[str]:
+         if self._block_enders is None:
+             self._block_enders = {"END"}
+         return self._block_enders
+
+     @property
+     def statement_terminators(self) -> set[str]:
+         if self._statement_terminators is None:
+             self._statement_terminators = {";"}
+         return self._statement_terminators
+
+
+ class DuckDBDialectConfig(DialectConfig):
+     """Configuration for DuckDB dialect."""
+
+     @property
+     def name(self) -> str:
+         if self._name is None:
+             self._name = "duckdb"
+         return self._name
+
+     @property
+     def block_starters(self) -> set[str]:
+         if self._block_starters is None:
+             self._block_starters = {"BEGIN", "CASE"}
+         return self._block_starters
+
+     @property
+     def block_enders(self) -> set[str]:
+         if self._block_enders is None:
+             self._block_enders = {"END"}
+         return self._block_enders
+
+     @property
+     def statement_terminators(self) -> set[str]:
+         if self._statement_terminators is None:
+             self._statement_terminators = {";"}
+         return self._statement_terminators
+
+
+ class BigQueryDialectConfig(DialectConfig):
+     """Configuration for BigQuery dialect."""
+
+     @property
+     def name(self) -> str:
+         if self._name is None:
+             self._name = "bigquery"
+         return self._name
+
+     @property
+     def block_starters(self) -> set[str]:
+         if self._block_starters is None:
+             self._block_starters = {"BEGIN", "CASE"}
+         return self._block_starters
+
+     @property
+     def block_enders(self) -> set[str]:
+         if self._block_enders is None:
+             self._block_enders = {"END"}
+         return self._block_enders
+
+     @property
+     def statement_terminators(self) -> set[str]:
+         if self._statement_terminators is None:
+             self._statement_terminators = {";"}
+         return self._statement_terminators
+
+
+ _pattern_cache: Optional[UnifiedCache[list[tuple[TokenType, CompiledTokenPattern]]]] = None
+ _result_cache: Optional[UnifiedCache[list[str]]] = None
+ _cache_lock = threading.Lock()
+
+
+ def _get_pattern_cache() -> UnifiedCache[list[tuple[TokenType, CompiledTokenPattern]]]:
+     """Get or create the pattern compilation cache."""
+     global _pattern_cache
+     if _pattern_cache is None:
+         with _cache_lock:
+             if _pattern_cache is None:
+                 _pattern_cache = UnifiedCache[list[tuple[TokenType, CompiledTokenPattern]]](
+                     max_size=DEFAULT_PATTERN_CACHE_SIZE, ttl_seconds=DEFAULT_CACHE_TTL
+                 )
+     return _pattern_cache
+
+
+ def _get_result_cache() -> UnifiedCache[list[str]]:
+     """Get or create the result cache."""
+     global _result_cache
+     if _result_cache is None:
+         with _cache_lock:
+             if _result_cache is None:
+                 _result_cache = UnifiedCache[list[str]](
+                     max_size=DEFAULT_RESULT_CACHE_SIZE, ttl_seconds=DEFAULT_CACHE_TTL
+                 )
+     return _result_cache
+
+
+ @mypyc_attr(allow_interpreted_subclasses=False)
+ class StatementSplitter:
+     """SQL script splitter with caching and dialect support."""
+
+     __slots__ = SPLITTER_SLOTS
+
+     def __init__(self, dialect: DialectConfig, strip_trailing_semicolon: bool = False) -> None:
+         """Initialize the splitter with caching and dialect support."""
+         self._dialect = dialect
+         self._strip_trailing_semicolon = strip_trailing_semicolon
+         self._token_patterns = dialect.get_all_token_patterns()
+
+         self._pattern_cache_key = f"{dialect.name}:{hash(tuple(str(p) for _, p in self._token_patterns))}"
+
+         self._pattern_cache = _get_pattern_cache()
+         self._result_cache = _get_result_cache()
+
+         self._compiled_patterns = self._get_or_compile_patterns()
+
+     def _get_or_compile_patterns(self) -> list[tuple[TokenType, CompiledTokenPattern]]:
+         """Get compiled patterns from cache or compile and cache them."""
+         cache_key = CacheKey(("pattern", self._pattern_cache_key))
+
+         cached_patterns = self._pattern_cache.get(cache_key)
+         if cached_patterns is not None:
+             return cached_patterns
+
+         compiled: list[tuple[TokenType, CompiledTokenPattern]] = []
+         for token_type, pattern in self._token_patterns:
+             if isinstance(pattern, str):
+                 compiled.append((token_type, re.compile(pattern, re.IGNORECASE | re.DOTALL)))
+             else:
+                 compiled.append((token_type, pattern))
+
+         self._pattern_cache.put(cache_key, compiled)
+         return compiled
+
+     def _tokenize(self, sql: str) -> Generator[Token, None, None]:
+         """Tokenize SQL string."""
+         pos = 0
+         line = 1
+         line_start = 0
+
+         while pos < len(sql):
+             matched = False
+
+             for token_type, pattern in self._compiled_patterns:
+                 if callable(pattern):
+                     column = pos - line_start + 1
+                     token = pattern(sql, pos, line, column)
+                     if token:
+                         newlines = token.value.count("\n")
+                         if newlines > 0:
+                             line += newlines
+                             last_newline = token.value.rfind("\n")
+                             line_start = pos + last_newline + 1
+
+                         yield token
+                         pos += len(token.value)
+                         matched = True
+                         break
+                 else:
+                     match = pattern.match(sql, pos)
+                     if match:
+                         value = match.group(0)
+                         column = pos - line_start + 1
+
+                         newlines = value.count("\n")
+                         if newlines > 0:
+                             line += newlines
+                             last_newline = value.rfind("\n")
+                             line_start = pos + last_newline + 1
+
+                         yield Token(type=token_type, value=value, line=line, column=column, position=pos)
+                         pos = match.end()
+                         matched = True
+                         break
+
+             if not matched:
+                 logger.error("Failed to tokenize at position %d: %s", pos, sql[pos : pos + 20])
+                 pos += 1
+
+     def split(self, sql: str) -> list[str]:
+         """Split SQL script with result caching."""
+         script_hash = hash(sql)
+         cache_key = CacheKey(("split", self._dialect.name, script_hash, self._strip_trailing_semicolon))
+
+         cached_result = self._result_cache.get(cache_key)
+         if cached_result is not None:
+             return cached_result
+
+         statements = self._do_split(sql)
+
+         self._result_cache.put(cache_key, statements)
+         return statements
+
+     def _do_split(self, sql: str) -> list[str]:
+         """Perform SQL script splitting."""
+         statements = []
+         current_statement_tokens = []
+         current_statement_chars = []
+         block_stack = []
+
+         all_tokens = list(self._tokenize(sql))
+
+         for token_idx, token in enumerate(all_tokens):
+             current_statement_chars.append(token.value)
+
+             if token.type in {TokenType.WHITESPACE, TokenType.COMMENT_LINE, TokenType.COMMENT_BLOCK}:
+                 current_statement_tokens.append(token)
+                 continue
+
+             current_statement_tokens.append(token)
+             token_upper = token.value.upper()
+
+             if token.type == TokenType.KEYWORD:
+                 if token_upper in self._dialect.block_starters:
+                     block_stack.append(token_upper)
+                     if len(block_stack) > self._dialect.max_nesting_depth:
+                         msg = f"Maximum nesting depth ({self._dialect.max_nesting_depth}) exceeded"
+                         raise ValueError(msg)
+                 elif token_upper in self._dialect.block_enders:
+                     if block_stack and self._dialect.is_real_block_ender(all_tokens, token_idx):
+                         block_stack.pop()
+
+             is_terminator = False
+             if not block_stack:
+                 if token.type == TokenType.TERMINATOR:
+                     if token.value in self._dialect.statement_terminators:
+                         should_delay = self._dialect.should_delay_semicolon_termination(all_tokens, token_idx)
+
+                         if not should_delay and token.value == ";" and self._dialect.batch_separators:
+                             should_delay = True
+
+                         if not should_delay:
+                             is_terminator = True
+                     elif token.value in self._dialect.special_terminators:
+                         handler = self._dialect.special_terminators[token.value]
+                         if handler(all_tokens, token_idx):
+                             is_terminator = True
+
+                 elif token.type == TokenType.KEYWORD and token_upper in self._dialect.batch_separators:
+                     is_terminator = True
+
+             if is_terminator:
+                 statement = "".join(current_statement_chars).strip()
+
+                 is_plsql_block = self._is_plsql_block(current_statement_tokens)
+
+                 if (
+                     self._strip_trailing_semicolon
+                     and token.type == TokenType.TERMINATOR
+                     and statement.endswith(token.value)
+                     and not is_plsql_block
+                 ):
+                     statement = statement[: -len(token.value)].rstrip()
+
+                 if statement and self._contains_executable_content(statement):
+                     statements.append(statement)
+                 current_statement_tokens = []
+                 current_statement_chars = []
+
+         if current_statement_chars:
+             statement = "".join(current_statement_chars).strip()
+             if statement and self._contains_executable_content(statement):
+                 statements.append(statement)
+
+         return statements
+
+     @staticmethod
+     def _is_plsql_block(tokens: list[Token]) -> bool:
+         """Check if the token list represents a PL/SQL block."""
+         for token in tokens:
+             if token.type == TokenType.KEYWORD:
+                 return token.value.upper() in {"BEGIN", "DECLARE"}
+         return False
+
+     def _contains_executable_content(self, statement: str) -> bool:
+         """Check if a statement contains actual executable content."""
+         tokens = list(self._tokenize(statement))
+
+         for token in tokens:
+             if token.type not in {TokenType.WHITESPACE, TokenType.COMMENT_LINE, TokenType.COMMENT_BLOCK}:
+                 return True
+
+         return False
+
+
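A hypothetical direct use of the class above, bypassing the split_sql_script wrapper defined next:

    from sqlspec.core.splitter import GenericDialectConfig, StatementSplitter

    splitter = StatementSplitter(GenericDialectConfig(), strip_trailing_semicolon=True)
    statements = splitter.split("SELECT 1; SELECT 2;")
    # statements == ["SELECT 1", "SELECT 2"]; the result is memoized in the
    # shared result cache under ("split", "generic", hash(sql), True).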
+ def split_sql_script(script: str, dialect: Optional[str] = None, strip_trailing_terminator: bool = False) -> list[str]:
+     """Split SQL script into individual statements.
+
+     Args:
+         script: The SQL script to split
+         dialect: The SQL dialect name
+         strip_trailing_terminator: If True, remove trailing terminators from statements
+
+     Returns:
+         List of individual SQL statements
+     """
+     if dialect is None:
+         dialect = "generic"
+
+     dialect_configs = {
+         "generic": GenericDialectConfig(),
+         "oracle": OracleDialectConfig(),
+         "tsql": TSQLDialectConfig(),
+         "mssql": TSQLDialectConfig(),
+         "sqlserver": TSQLDialectConfig(),
+         "postgresql": PostgreSQLDialectConfig(),
+         "postgres": PostgreSQLDialectConfig(),
+         "mysql": MySQLDialectConfig(),
+         "sqlite": SQLiteDialectConfig(),
+         "duckdb": DuckDBDialectConfig(),
+         "bigquery": BigQueryDialectConfig(),
+     }
+
+     config = dialect_configs.get(dialect.lower())
+     if not config:
+         logger.warning("Unknown dialect '%s', using generic SQL splitter", dialect)
+         config = GenericDialectConfig()
+
+     splitter = StatementSplitter(config, strip_trailing_semicolon=strip_trailing_terminator)
+     return splitter.split(script)
+
+
+ def clear_splitter_caches() -> None:
+     """Clear all splitter caches for memory management."""
+     pattern_cache = _get_pattern_cache()
+     result_cache = _get_result_cache()
+     pattern_cache.clear()
+     result_cache.clear()
+
+
+ def get_splitter_cache_stats() -> dict[str, Any]:
+     """Get statistics from splitter caches.
+
+     Returns:
+         Dictionary containing cache statistics
+     """
+     pattern_cache = _get_pattern_cache()
+     result_cache = _get_result_cache()
+
+     return {
+         "pattern_cache": {"size": pattern_cache.size(), "stats": pattern_cache.get_stats()},
+         "result_cache": {"size": result_cache.size(), "stats": result_cache.get_stats()},
+     }
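Finally, a sketch of the cache-maintenance helpers above; the exact shape of each "stats" entry depends on UnifiedCache.get_stats() in sqlspec.core.cache:

    from sqlspec.core.splitter import clear_splitter_caches, get_splitter_cache_stats

    stats = get_splitter_cache_stats()
    print(stats["pattern_cache"]["size"], stats["result_cache"]["size"])
    clear_splitter_caches()  # empties both the pattern and result caches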