syncraft 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of syncraft might be problematic.

syncraft/parser.py CHANGED
@@ -3,22 +3,25 @@ import re
  from sqlglot import tokenize, TokenType, Parser as GlotParser, exp
  from typing import (
      Optional, List, Any, Tuple, TypeVar,
-     Generic
+     Generic, Generator
  )
+ from syncraft.cache import Cache
  from syncraft.constraint import FrozenDict
  from syncraft.algebra import (
-     Either, Left, Right, Error, Algebra
+     Either, Left, Right, Error, Algebra, Incomplete
  )
  from dataclasses import dataclass, field, replace
  from enum import Enum
- from functools import reduce
- from syncraft.syntax import Syntax

- from syncraft.ast import Token, TokenSpec, AST, TokenProtocol
+ from syncraft.syntax import Syntax, token
+
+ from syncraft.ast import Token, TokenSpec, AST, TokenProtocol, SyncraftError
  from syncraft.constraint import Bindable


  T = TypeVar('T', bound=TokenProtocol)
+
+
  @dataclass(frozen=True)
  class ParserState(Bindable, Generic[T]):
      """Immutable state for the SQL token stream during parsing.
@@ -32,14 +35,30 @@ class ParserState(Bindable, Generic[T]):
      """
      input: Tuple[T, ...] = field(default_factory=tuple)
      index: int = 0
+     final: bool = False # Whether this is a final state (for error reporting)
+
+     def __repr__(self) -> str:
+         return (f"ParserState("
+                 f"input=[{self.before() + (' ' if len(self.before())>0 else '')}\u25cf{(' ' if len(self.after()) > 0 else '') + self.after()}], "
+                 f"ended={self.ended()}, "
+                 f"pending={self.pending()})")

+     def __str__(self) -> str:
+         return self.__repr__()

+     def __add__(self, other: 'ParserState[T]') -> 'ParserState[T]':
+         if not isinstance(other, ParserState):
+             raise SyncraftError("Can only concatenate ParserState with another ParserState", offending=self, expect="ParserState")
+         if self.final:
+             raise SyncraftError("Cannot concatenate to a final ParserState", offending=self, expect="not final")
+         return replace(self, input=self.input + other.input, final=other.final)
+
      def token_sample_string(self)-> str:
          def encode_tokens(*tokens:T) -> str:
              return ",".join(f"{token.token_type.name}({token.text})" for token in tokens)
          return encode_tokens(*self.input[self.index:self.index + 2])

-     def before(self, length: Optional[int] = 5)->str:
+     def before(self, length: Optional[int] = 3)->str:
          """Return a string with up to ``length`` tokens before the cursor.

          Args:
@@ -51,7 +70,7 @@ class ParserState(Bindable, Generic[T]):
          length = min(self.index, length) if length is not None else self.index
          return " ".join(token.text for token in self.input[self.index - length:self.index])

-     def after(self, length: Optional[int] = 5)->str:
+     def after(self, length: Optional[int] = 3)->str:
          """Return a string with up to ``length`` tokens from the cursor on.

          Args:
@@ -61,7 +80,8 @@ class ParserState(Bindable, Generic[T]):
              str: Space-separated token texts starting at the current index.
          """
          length = min(length, len(self.input) - self.index) if length is not None else len(self.input) - self.index
-         return " ".join(token.text for token in self.input[self.index:self.index + length])
+         ret = " ".join(token.text for token in self.input[self.index:self.index + length])
+         return ret


      def current(self)->T:
@@ -73,29 +93,26 @@ class ParserState(Bindable, Generic[T]):
          Raises:
              IndexError: If attempting to read past the end of the stream.
          """
-         if self.ended():
-             raise IndexError("Attempted to access token beyond end of stream")
+         if self.index >= len(self.input):
+             raise SyncraftError("Attempted to access token beyond end of stream", offending=self, expect="index < len(input)")
          return self.input[self.index]

+
+     def pending(self) -> bool:
+         return self.index >= len(self.input) and not self.final
+
      def ended(self) -> bool:
          """Whether the cursor is at or past the end of the token stream."""
-         return self.index >= len(self.input)
+         return self.index >= len(self.input) and self.final

      def advance(self) -> ParserState[T]:
          """Return a new state advanced by one token (bounded at end)."""
          return replace(self, index=min(self.index + 1, len(self.input)))

-     def delta(self, new_state: ParserState[T]) -> Tuple[T, ...]:
-         assert self.input is new_state.input, "Cannot calculate differences between different input streams"
-         assert 0 <= self.index <= new_state.index <= len(self.input), "Segment indices out of bounds"
-         return self.input[self.index:new_state.index]

-     def copy(self) -> ParserState[T]:
-         return self.__class__(input=self.input, index=self.index)
-
      @classmethod
      def from_tokens(cls, tokens: Tuple[T, ...]) -> ParserState[T]:
-         return cls(input=tokens, index=0)
+         return cls(input=tokens, index=0, final=True)



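Note: the new `final` flag, `pending()`, and `__add__` turn `ParserState` into a stream that can be fed incrementally. A non-final state that runs out of tokens is "pending" rather than "ended", and more input can be appended later. A minimal sketch of how this appears to behave, constructing states directly in the same way `from_tokens` does (the token values are illustrative):

    from sqlglot import TokenType
    from syncraft.ast import Token
    from syncraft.parser import ParserState

    # A non-final chunk: the stream may still grow.
    chunk = ParserState(input=(Token(token_type=TokenType.SELECT, text="SELECT"),), index=0, final=False)
    at_end = chunk.advance()      # cursor past the last available token
    assert at_end.pending()       # out of tokens, but more may arrive
    assert not at_end.ended()     # only a final state can truly end

    # Appending a final chunk completes the stream.
    rest = ParserState(input=(Token(token_type=TokenType.NUMBER, text="1"),), index=0, final=True)
    full = at_end + rest          # keeps the cursor, takes `final` from the right-hand side
    assert not full.pending() and not full.ended()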
@@ -104,7 +121,7 @@ class ParserState(Bindable, Generic[T]):
  @dataclass(frozen=True)
  class Parser(Algebra[T, ParserState[T]]):
      @classmethod
-     def state(cls, sql: str, dialect: str) -> ParserState[T]:
+     def state(cls, sql: str, dialect: str) -> ParserState[T]: # type: ignore
          """Tokenize SQL text into an initial ``ParserState``.

          Uses ``sqlglot.tokenize`` for the given dialect and wraps tokens into
@@ -122,6 +139,8 @@ class Parser(Algebra[T, ParserState[T]]):

      @classmethod
      def token(cls,
+               *,
+               cache: Cache,
                token_type: Optional[Enum] = None,
                text: Optional[str] = None,
                case_sensitive: bool = False,
@@ -143,14 +162,19 @@ class Parser(Algebra[T, ParserState[T]]):
              Algebra[T, ParserState[T]]: An algebra yielding the matched token.
          """
          spec = TokenSpec(token_type=token_type, text=text, case_sensitive=case_sensitive, regex=regex)
-         def token_run(state: ParserState[T], use_cache:bool) -> Either[Any, Tuple[T, ParserState[T]]]:
-             if state.ended():
-                 return Left(state)
-             token = state.current()
-             if token is None or not spec.is_valid(token):
-                 return Left(state)
-             return Right((Token(token_type = token.token_type, text=token.text), state.advance())) # type: ignore
-         captured: Algebra[T, ParserState[T]] = cls(token_run, name=cls.__name__ + f'.token({token_type}, {text})')
+         def token_run(state: ParserState[T], use_cache:bool) -> Generator[Incomplete[ParserState[T]],ParserState[T], Either[Any, Tuple[T, ParserState[T]]]]:
+             while True:
+                 if state.ended():
+                     return Left(state)
+                 elif state.pending():
+                     state = yield Incomplete(state)
+                 else:
+                     token = state.current()
+                     if token is None or not spec.is_valid(token):
+                         return Left(state)
+                     else:
+                         return Right((Token(token_type = token.token_type, text=token.text), state.advance())) # type: ignore
+         captured: Algebra[T, ParserState[T]] = cls(token_run, name=cls.__name__ + f'.token({token_type}, {text})', cache=cache)
          def error_fn(err: Any) -> Error:
              if isinstance(err, ParserState):
                  return Error(message=f"Cannot match token at {err}", this=captured, state=err)
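Note: the step function is now a generator rather than a plain function. On a pending state it yields `Incomplete` and expects to be resumed, via `send`, with a longer state once more tokens arrive. A rough sketch of that suspend/resume protocol driven by hand; the driver below is hypothetical, and the real scheduling lives in syncraft's `Algebra`/`Cache` machinery:

    def drive(step, state, more_input):
        """Hypothetical driver for a generator step in the style of token_run."""
        gen = step(state, False)            # use_cache=False, purely for illustration
        try:
            gen.send(None)                  # start; yields Incomplete if `state` is pending
            gen.send(state + more_input)    # resume with the extended (now final) state
        except StopIteration as done:
            return done.value               # Left(state) on mismatch, Right((token, new_state)) on success
        raise RuntimeError("still pending after the final chunk")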
@@ -161,107 +185,7 @@ class Parser(Algebra[T, ParserState[T]]):
          return captured


-     @classmethod
-     def until(cls,
-               *open_close: Tuple[Algebra[Any, ParserState[T]], Algebra[Any, ParserState[T]]],
-               terminator: Optional[Algebra[Any, ParserState[T]]] = None,
-               inclusive: bool = True,
-               strict: bool = True) -> Algebra[Any, ParserState[T]]:
-         """Consume tokens until a terminator while respecting nested pairs.
-
-         Tracks nesting of one or more opener/closer parser pairs. When not
-         nested, an optional ``terminator`` may end the scan. If ``inclusive``
-         is true, boundary tokens (openers/closers/terminator) are included in
-         the returned tuple. If ``strict`` is true, the next token must match an
-         opener before scanning continues; otherwise content may start
-         immediately.
-
-         Args:
-             open_close: One or more pairs of (open, close) parsers.
-             terminator: Optional parser that ends scanning at top level.
-             inclusive: Include matched structural tokens in the result.
-             strict: Require the very next token to be an opener when provided.
-
-         Returns:
-             Algebra[Any, ParserState[T]]: An algebra yielding a tuple of
-             collected tokens upon success.
-         """
-         def until_run(state: ParserState[T], use_cache:bool) -> Either[Any, Tuple[Any, ParserState[T]]]:
-             # Use a stack to enforce proper nesting across multiple open/close pairs.
-             tokens: List[Any] = []
-             if not terminator and len(open_close) == 0:
-                 return Left(Error(this=until_run, message="No terminator and no open/close parsers, nothing to parse", state=state))
-
-             # Helper to try matching any of the parsers once, returning early on first match
-             def try_match(s: ParserState[T], *parsers: Algebra[Any, ParserState[T]]) -> Tuple[bool, Optional[int], Optional[Any], ParserState[T]]:
-                 for i, p in enumerate(parsers):
-                     res = p.run(s, use_cache)
-                     if isinstance(res, Right):
-                         val, ns = res.value
-                         return True, i, val, ns
-                 return False, None, None, s
-
-             opens, closes = zip(*open_close) if len(open_close) > 0 else ((), ())
-             tmp_state: ParserState[T] = state.copy()
-             stack: List[int] = [] # indices into open_close indicating expected closer
-
-             # If strict, require the very next token to be an opener of any kind
-             if strict and len(opens) > 0:
-                 c = reduce(lambda a, b: a.or_else(b), opens).run(tmp_state, use_cache)
-                 if c.is_left():
-                     return Left(Error(this=until_run, message="No opening parser matched", state=tmp_state))
-
-             while not tmp_state.ended():
-                 # Try to open
-                 o_matched, o_idx, o_tok, o_state = try_match(tmp_state, *opens)
-                 if o_matched and o_idx is not None:
-                     stack.append(o_idx)
-                     if inclusive:
-                         tokens.append(o_tok)
-                     tmp_state = o_state
-                     continue
-
-                 # Try to close
-                 c_matched, c_idx, c_tok, c_state = try_match(tmp_state, *closes)
-                 if c_matched and c_idx is not None:
-                     if not stack or stack[-1] != c_idx:
-                         return Left(Error(this=until_run, message="Mismatched closing parser", state=tmp_state))
-                     stack.pop()
-                     if inclusive:
-                         tokens.append(c_tok)
-                     tmp_state = c_state
-                     # After closing, if stack empty, we may terminate on a terminator
-                     if len(stack) == 0:
-                         if terminator:
-                             term = terminator.run(tmp_state, use_cache)
-                             if isinstance(term, Right):
-                                 if inclusive:
-                                     tokens.append(term.value[0])
-                                 return Right((tuple(tokens), term.value[1]))
-                         else:
-                             return Right((tuple(tokens), tmp_state))
-                     continue
-
-                 # If nothing structural matched, check termination when not nested
-                 if len(stack) == 0:
-                     if terminator:
-                         term2 = terminator.run(tmp_state, use_cache)
-                         if isinstance(term2, Right):
-                             if inclusive:
-                                 tokens.append(term2.value[0])
-                             return Right((tuple(tokens), term2.value[1]))
-                     else:
-                         return Right((tuple(tokens), tmp_state))
-
-                 # Otherwise, consume one token as payload and continue
-                 tokens.append(tmp_state.current())
-                 tmp_state = tmp_state.advance()

-             # Reached end of input
-             if len(stack) != 0:
-                 return Left(Error(this=until_run, message="Unterminated group", state=tmp_state))
-             return Right((tuple(tokens), tmp_state))
-         return cls(until_run, name=cls.__name__ + '.until')

  def sqlglot(parser: Syntax[Any, Any],
              dialect: str) -> Syntax[List[exp.Expression], ParserState[Any]]:
@@ -282,52 +206,6 @@ def sqlglot(parser: Syntax[Any, Any],
      return parser.map(lambda tokens: [e for e in gp.parse(raw_tokens=tokens) if e is not None])


- def parse(syntax: Syntax[Any, Any], sql: str, dialect: str) -> Tuple[AST, FrozenDict[str, Tuple[AST, ...]]] | Tuple[Any, None]:
-     """Parse SQL text with a ``Syntax`` using the ``Parser`` backend.
-
-     Tokenizes the SQL with the specified dialect and executes ``syntax``.
-
-     Args:
-         syntax: The high-level syntax to run.
-         sql: SQL text to tokenize and parse.
-         dialect: sqlglot dialect name used for tokenization.
-
-     Returns:
-         Tuple[AST, FrozenDict[str, Tuple[AST, ...]]] | Tuple[Any, None]:
-             The produced AST and collected marks, or a tuple signaling failure.
-     """
-     from syncraft.syntax import run
-     return run(syntax, Parser, True, sql=sql, dialect=dialect)
-
-
-
-
- def token(token_type: Optional[Enum] = None,
-           text: Optional[str] = None,
-           case_sensitive: bool = False,
-           regex: Optional[re.Pattern[str]] = None
-           ) -> Syntax[Any, Any]:
-     """Build a ``Syntax`` that matches a single token.
-
-     Convenience wrapper around ``Parser.token``. You can match by
-     type, exact text, or regex.
-
-     Args:
-         token_type: Expected token enum type.
-         text: Exact token text to match.
-         case_sensitive: Whether text matching respects case.
-         regex: Pattern to match token text.
-
-     Returns:
-         Syntax[Any, Any]: A syntax that matches one token.
-     """
-     token_type_txt = token_type.name if token_type is not None else None
-     token_value_txt = text if text is not None else None
-     msg = 'token(' + ','.join([x for x in [token_type_txt, token_value_txt, str(regex)] if x is not None]) + ')'
-     return Syntax(
-         lambda cls: cls.factory('token', token_type=token_type, text=text, case_sensitive=case_sensitive, regex=regex)
-     ).describe(name=msg, fixity='prefix')
-


  def identifier(value: str | None = None) -> Syntax[Any, Any]:
@@ -339,9 +217,9 @@ def identifier(value: str | None = None) -> Syntax[Any, Any]:
          Syntax[Any, Any]: A syntax matching one identifier token.
      """
      if value is None:
-         return token(TokenType.IDENTIFIER)
+         return token(token_type=TokenType.IDENTIFIER)
      else:
-         return token(TokenType.IDENTIFIER, text=value)
+         return token(token_type=TokenType.IDENTIFIER, text=value)

  def variable(value: str | None = None) -> Syntax[Any, Any]:
      """Match a variable token, optionally with exact text.
@@ -353,69 +231,42 @@ def variable(value: str | None = None) -> Syntax[Any, Any]:
          Syntax[Any, Any]: A syntax matching one variable token.
      """
      if value is None:
-         return token(TokenType.VAR)
+         return token(token_type=TokenType.VAR)
      else:
-         return token(TokenType.VAR, text=value)
-
- def literal(lit: str) -> Syntax[Any, Any]:
-     """Match an exact literal string (case-sensitive)."""
-     return token(token_type=None, text=lit, case_sensitive=True)
-
- def regex(regex: re.Pattern[str]) -> Syntax[Any, Any]:
-     """Match a token whose text satisfies the given regular expression."""
-     return token(token_type=None, regex=regex, case_sensitive=True)
-
- def lift(value: Any)-> Syntax[Any, Any]:
-     """Lift a Python value into the nearest matching token syntax.
+         return token(token_type=TokenType.VAR, text=value)

-     - ``str`` -> ``literal``
-     - ``re.Pattern`` -> ``token`` with regex
-     - ``Enum`` -> ``token`` with type
-     - otherwise -> succeed with the value
-     """
-     if isinstance(value, str):
-         return literal(value)
-     elif isinstance(value, re.Pattern):
-         return token(regex=value)
-     elif isinstance(value, Enum):
-         return token(value)
-     else:
-         return Syntax(lambda cls: cls.success(value))

  def number() -> Syntax[Any, Any]:
      """Match a number token."""
-     return token(TokenType.NUMBER)
+     return token(token_type=TokenType.NUMBER)


  def string() -> Syntax[Any, Any]:
      """Match a string literal token."""
-     return token(TokenType.STRING)
+     return token(token_type=TokenType.STRING)
+
+



- def until(*open_close: Tuple[Syntax[Tuple[T, ...] | T, ParserState[T]], Syntax[Tuple[T, ...] | T, ParserState[T]]],
-           terminator: Optional[Syntax[Tuple[T, ...] | T, ParserState[T]]] = None,
-           inclusive: bool = True,
-           strict: bool = True) -> Syntax[Any, Any]:
-     """Syntax wrapper to scan until a terminator while handling nesting.

-     Equivalent to ``Parser.until`` but at the ``Syntax`` layer, converting the
-     provided syntaxes into parser algebras under the hood.
+ def parse(syntax: Syntax[Any, Any], sql: str, dialect: str) -> Tuple[Any, None | FrozenDict[str, Tuple[AST, ...]]]:
+     """Parse SQL text with a ``Syntax`` using the ``Parser`` backend.
+
+     Tokenizes the SQL with the specified dialect and executes ``syntax``.

      Args:
-         open_close: One or more pairs of (open, close) syntaxes.
-         terminator: Optional syntax that ends scanning at top level.
-         inclusive: Include matched boundary tokens in the result.
-         strict: Require the very next token to be an opener when provided.
+         syntax: The high-level syntax to run.
+         sql: SQL text to tokenize and parse.
+         dialect: sqlglot dialect name used for tokenization.

      Returns:
-         Syntax[Any, Any]: A syntax yielding a tuple of collected tokens.
+         Tuple[AST, FrozenDict[str, Tuple[AST, ...]]] | Tuple[Any, None]:
+             The produced AST and collected marks, or a tuple signaling failure.
      """
-     return Syntax(
-         lambda cls: cls.factory('until',
-                                 *[(left.alg(cls), right.alg(cls)) for left, right in open_close],
-                                 terminator=terminator.alg(cls) if terminator else None,
-                                 inclusive=inclusive,
-                                 strict=strict)
-     ).describe(name="until", fixity='prefix')
-
+     from syncraft.syntax import run
+     v, s = run(syntax=syntax, alg=Parser, use_cache=True, sql=sql, dialect=dialect)
+     if s is not None:
+         return v, s.binding.bound()
+     else:
+         return v, None
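Note: the relocated `parse` now returns the parsed value together with the bound marks, or `None` in the second position when no final state is produced. A small usage sketch (the SQL snippet and dialect name are illustrative):

    from syncraft.parser import parse, number

    value, marks = parse(number(), sql="42", dialect="sqlite")
    if marks is None:
        print("parse failed:", value)
    else:
        print("parsed:", value, "marks:", marks)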