syncraft 0.2.2__py3-none-any.whl → 0.2.4__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
syncraft/parser.py CHANGED
@@ -21,6 +21,15 @@ from syncraft.constraint import Bindable
 T = TypeVar('T', bound=TokenProtocol)
 @dataclass(frozen=True)
 class ParserState(Bindable, Generic[T]):
+    """Immutable state for the SQL token stream during parsing.
+
+    Keeps a tuple of tokens and the current index. The state is passed through
+    parser combinators and can be copied or advanced safely.
+
+    Attributes:
+        input: The full, immutable sequence of tokens.
+        index: Current position within ``input``.
+    """
     input: Tuple[T, ...] = field(default_factory=tuple)
     index: int = 0

@@ -31,23 +40,49 @@ class ParserState(Bindable, Generic[T]):
         return encode_tokens(*self.input[self.index:self.index + 2])

     def before(self, length: Optional[int] = 5)->str:
+        """Return a string with up to ``length`` tokens before the cursor.
+
+        Args:
+            length: Maximum number of tokens to include.
+
+        Returns:
+            str: Space-separated token texts before the current index.
+        """
         length = min(self.index, length) if length is not None else self.index
         return " ".join(token.text for token in self.input[self.index - length:self.index])

     def after(self, length: Optional[int] = 5)->str:
+        """Return a string with up to ``length`` tokens from the cursor on.
+
+        Args:
+            length: Maximum number of tokens to include.
+
+        Returns:
+            str: Space-separated token texts starting at the current index.
+        """
         length = min(length, len(self.input) - self.index) if length is not None else len(self.input) - self.index
         return " ".join(token.text for token in self.input[self.index:self.index + length])


     def current(self)->T:
+        """Get the current token at ``index``.
+
+        Returns:
+            T: The token at the current index.
+
+        Raises:
+            IndexError: If attempting to read past the end of the stream.
+        """
         if self.ended():
             raise IndexError("Attempted to access token beyond end of stream")
         return self.input[self.index]

     def ended(self) -> bool:
+        """Whether the cursor is at or past the end of the token stream."""
         return self.index >= len(self.input)

     def advance(self) -> ParserState[T]:
+        """Return a new state advanced by one token (bounded at end)."""
         return replace(self, index=min(self.index + 1, len(self.input)))

     def delta(self, new_state: ParserState[T]) -> Tuple[T, ...]:
@@ -70,6 +105,18 @@ class ParserState(Bindable, Generic[T]):
 class Parser(Algebra[T, ParserState[T]]):
     @classmethod
     def state(cls, sql: str, dialect: str) -> ParserState[T]:
+        """Tokenize SQL text into an initial ``ParserState``.
+
+        Uses ``sqlglot.tokenize`` for the given dialect and wraps tokens into
+        the project's ``Token`` type.
+
+        Args:
+            sql: The SQL text to tokenize.
+            dialect: The sqlglot dialect name (e.g. "sqlite", "duckdb").
+
+        Returns:
+            ParserState[T]: Initial parser state at index 0.
+        """
         tokens = tuple([Token(token_type=token.token_type, text=token.text) for token in tokenize(sql, dialect=dialect)])
         return ParserState.from_tokens(tokens) # type: ignore

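The docstrings added above describe ParserState as an immutable cursor over a tuple of tokens, with Parser.state building the initial state from SQL text. A minimal usage sketch based only on the signatures shown in this diff (import path and the "sqlite" dialect name are taken from the file; behavior is not verified against the installed package):

    from syncraft.parser import Parser

    # Tokenize with the sqlite dialect; the cursor starts at index 0.
    state = Parser.state("SELECT a, b FROM t", dialect="sqlite")

    while not state.ended():
        tok = state.current()   # token at the cursor; raises IndexError past the end
        print(tok.text, "| before:", state.before(2), "| after:", state.after(2))
        state = state.advance() # new state one token ahead; the original is unchanged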
@@ -80,6 +127,21 @@ class Parser(Algebra[T, ParserState[T]]):
               case_sensitive: bool = False,
               regex: Optional[re.Pattern[str]] = None
               )-> Algebra[T, ParserState[T]]:
+        """Match a single token according to a specification.
+
+        Succeeds when the current token satisfies the provided
+        ``TokenSpec`` (by type, exact text, or regex). On failure,
+        an informative ``Error`` is produced with location context.
+
+        Args:
+            token_type: Expected enum type of the token.
+            text: Exact token text to match.
+            case_sensitive: Whether text matching is case sensitive.
+            regex: Regular expression pattern to match token text.
+
+        Returns:
+            Algebra[T, ParserState[T]]: An algebra yielding the matched token.
+        """
         spec = TokenSpec(token_type=token_type, text=text, case_sensitive=case_sensitive, regex=regex)
         def token_run(state: ParserState[T], use_cache:bool) -> Either[Any, Tuple[T, ParserState[T]]]:
             if state.ended():
@@ -105,6 +167,25 @@ class Parser(Algebra[T, ParserState[T]]):
               terminator: Optional[Algebra[Any, ParserState[T]]] = None,
               inclusive: bool = True,
               strict: bool = True) -> Algebra[Any, ParserState[T]]:
+        """Consume tokens until a terminator while respecting nested pairs.
+
+        Tracks nesting of one or more opener/closer parser pairs. When not
+        nested, an optional ``terminator`` may end the scan. If ``inclusive``
+        is true, boundary tokens (openers/closers/terminator) are included in
+        the returned tuple. If ``strict`` is true, the next token must match an
+        opener before scanning continues; otherwise content may start
+        immediately.
+
+        Args:
+            open_close: One or more pairs of (open, close) parsers.
+            terminator: Optional parser that ends scanning at top level.
+            inclusive: Include matched structural tokens in the result.
+            strict: Require the very next token to be an opener when provided.
+
+        Returns:
+            Algebra[Any, ParserState[T]]: An algebra yielding a tuple of
+            collected tokens upon success.
+        """
         def until_run(state: ParserState[T], use_cache:bool) -> Either[Any, Tuple[Any, ParserState[T]]]:
             # Use a stack to enforce proper nesting across multiple open/close pairs.
             tokens: List[Any] = []
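A hedged illustration of these nesting rules at the Syntax layer, using the module-level until and literal wrappers that appear further down in this diff (the exact result shape is an assumption based on the docstring, not verified against the package):

    from syncraft.parser import until, literal

    # Collect everything inside one (...) group, keeping the parentheses
    # (inclusive=True) and requiring the very next token to be "(" (strict=True).
    group = until((literal("("), literal(")")), inclusive=True, strict=True)

    # For a token stream like "( a , ( b , c ) , d )" the inner "(" pushes the
    # nesting stack, so the scan stops only at the ")" that balances the first "(";
    # a top-level terminator such as terminator=literal(",") would never fire
    # while the stack is non-empty.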
@@ -184,11 +265,37 @@ class Parser(Algebra[T, ParserState[T]]):

 def sqlglot(parser: Syntax[Any, Any],
             dialect: str) -> Syntax[List[exp.Expression], ParserState[Any]]:
+    """Map token tuples into sqlglot expressions for a given dialect.
+
+    Wraps a ``Syntax`` so its result is parsed by ``sqlglot.Parser``
+    using ``raw_tokens`` and returns only non-``None`` expressions.
+
+    Args:
+        parser: A syntax that produces a sequence of tokens.
+        dialect: sqlglot dialect name used to parse tokens.
+
+    Returns:
+        Syntax[List[exp.Expression], ParserState[Any]]: Syntax yielding a list
+        of parsed expressions.
+    """
     gp = GlotParser(dialect=dialect)
     return parser.map(lambda tokens: [e for e in gp.parse(raw_tokens=tokens) if e is not None])


 def parse(syntax: Syntax[Any, Any], sql: str, dialect: str) -> Tuple[AST, FrozenDict[str, Tuple[AST, ...]]] | Tuple[Any, None]:
+    """Parse SQL text with a ``Syntax`` using the ``Parser`` backend.
+
+    Tokenizes the SQL with the specified dialect and executes ``syntax``.
+
+    Args:
+        syntax: The high-level syntax to run.
+        sql: SQL text to tokenize and parse.
+        dialect: sqlglot dialect name used for tokenization.
+
+    Returns:
+        Tuple[AST, FrozenDict[str, Tuple[AST, ...]]] | Tuple[Any, None]:
+        The produced AST and collected marks, or a tuple signaling failure.
+    """
     from syncraft.syntax import run
     return run(syntax, Parser, True, sql=sql, dialect=dialect)

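A short end-to-end sketch of the documented entry point. It assumes bare names tokenize as VAR in sqlglot (hence variable rather than identifier, matching the helpers below) and that failure is signaled by a None second element, as the return annotation above indicates:

    from syncraft.parser import parse, variable

    result, marks = parse(variable("users"), "users", dialect="sqlite")
    if marks is None:
        print("parse failed:", result)
    else:
        print("AST:", result, "marks:", marks)

The sqlglot wrapper above composes with this: wrapping a token-collecting syntax in sqlglot(..., dialect=...) turns the raw tokens it yields into sqlglot exp.Expression objects before parse returns them.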
@@ -200,6 +307,20 @@ def token(token_type: Optional[Enum] = None,
           case_sensitive: bool = False,
           regex: Optional[re.Pattern[str]] = None
           ) -> Syntax[Any, Any]:
+    """Build a ``Syntax`` that matches a single token.
+
+    Convenience wrapper around ``Parser.token``. You can match by
+    type, exact text, or regex.
+
+    Args:
+        token_type: Expected token enum type.
+        text: Exact token text to match.
+        case_sensitive: Whether text matching respects case.
+        regex: Pattern to match token text.
+
+    Returns:
+        Syntax[Any, Any]: A syntax that matches one token.
+    """
     token_type_txt = token_type.name if token_type is not None else None
     token_value_txt = text if text is not None else None
     msg = 'token(' + ','.join([x for x in [token_type_txt, token_value_txt, str(regex)] if x is not None]) + ')'
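The three documented ways to specify a match, sketched with sqlglot's TokenType enum (assumed to be the enum this module uses, as the identifier/variable helpers below suggest; the sample token types and texts are illustrative only):

    import re
    from sqlglot.tokens import TokenType
    from syncraft.parser import token

    by_type = token(TokenType.SELECT)                   # match on token type only
    by_text = token(text="OVER", case_sensitive=False)  # match on exact, case-insensitive text
    by_re   = token(regex=re.compile(r"ROW_?NUMBER"))   # match token text against a regex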
@@ -209,24 +330,49 @@ def token(token_type: Optional[Enum] = None,


 def identifier(value: str | None = None) -> Syntax[Any, Any]:
+    """Match an identifier token, optionally with exact text.
+
+    Args:
+        value: Exact identifier text to match, or ``None`` for any identifier.
+
+    Returns:
+        Syntax[Any, Any]: A syntax matching one identifier token.
+    """
     if value is None:
         return token(TokenType.IDENTIFIER)
     else:
         return token(TokenType.IDENTIFIER, text=value)

 def variable(value: str | None = None) -> Syntax[Any, Any]:
+    """Match a variable token, optionally with exact text.
+
+    Args:
+        value: Exact variable text to match, or ``None`` for any variable.
+
+    Returns:
+        Syntax[Any, Any]: A syntax matching one variable token.
+    """
     if value is None:
         return token(TokenType.VAR)
     else:
         return token(TokenType.VAR, text=value)

 def literal(lit: str) -> Syntax[Any, Any]:
+    """Match an exact literal string (case-sensitive)."""
     return token(token_type=None, text=lit, case_sensitive=True)

 def regex(regex: re.Pattern[str]) -> Syntax[Any, Any]:
+    """Match a token whose text satisfies the given regular expression."""
     return token(token_type=None, regex=regex, case_sensitive=True)

 def lift(value: Any)-> Syntax[Any, Any]:
+    """Lift a Python value into the nearest matching token syntax.
+
+    - ``str`` -> ``literal``
+    - ``re.Pattern`` -> ``token`` with regex
+    - ``Enum`` -> ``token`` with type
+    - otherwise -> succeed with the value
+    """
     if isinstance(value, str):
         return literal(value)
     elif isinstance(value, re.Pattern):
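A small sketch of the documented lift dispatch (the sample values and enum member are chosen for illustration only):

    import re
    from sqlglot.tokens import TokenType
    from syncraft.parser import lift

    lift("SELECT")            # str           -> literal("SELECT"), case-sensitive text match
    lift(re.compile(r"\d+"))  # re.Pattern    -> token matching the regex
    lift(TokenType.NUMBER)    # Enum          -> token matching the token type
    lift(42)                  # anything else -> a syntax that simply succeeds with 42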
@@ -237,10 +383,12 @@ def lift(value: Any)-> Syntax[Any, Any]:
         return Syntax(lambda cls: cls.success(value))

 def number() -> Syntax[Any, Any]:
+    """Match a number token."""
     return token(TokenType.NUMBER)


 def string() -> Syntax[Any, Any]:
+    """Match a string literal token."""
     return token(TokenType.STRING)


@@ -249,6 +397,20 @@ def until(*open_close: Tuple[Syntax[Tuple[T, ...] | T, ParserState[T]], Syntax[T
           terminator: Optional[Syntax[Tuple[T, ...] | T, ParserState[T]]] = None,
           inclusive: bool = True,
           strict: bool = True) -> Syntax[Any, Any]:
+    """Syntax wrapper to scan until a terminator while handling nesting.
+
+    Equivalent to ``Parser.until`` but at the ``Syntax`` layer, converting the
+    provided syntaxes into parser algebras under the hood.
+
+    Args:
+        open_close: One or more pairs of (open, close) syntaxes.
+        terminator: Optional syntax that ends scanning at top level.
+        inclusive: Include matched boundary tokens in the result.
+        strict: Require the very next token to be an opener when provided.
+
+    Returns:
+        Syntax[Any, Any]: A syntax yielding a tuple of collected tokens.
+    """
     return Syntax(
         lambda cls: cls.factory('until',
             *[(left.alg(cls), right.alg(cls)) for left, right in open_close],
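Putting the pieces together, a hedged sketch that scans a parenthesized region and hands the collected tokens to sqlglot. Whether sqlglot accepts this module's Token objects as raw_tokens is the package's concern; the shapes here follow only the docstrings shown in this diff:

    from syncraft.parser import until, literal, sqlglot, parse

    # Collect the tokens between "(" and ")" (exclusive), then parse them into
    # sqlglot expressions for the sqlite dialect.
    inner = until((literal("("), literal(")")), inclusive=False, strict=True)
    exprs = sqlglot(inner, dialect="sqlite")

    result, marks = parse(exprs, "(1 + 2)", dialect="sqlite")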