Typhon-Language 0.1.2-py3-none-any.whl → 0.1.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. Typhon/Driver/configs.py +14 -0
  2. Typhon/Driver/debugging.py +148 -5
  3. Typhon/Driver/diagnostic.py +4 -3
  4. Typhon/Driver/language_server.py +25 -0
  5. Typhon/Driver/run.py +1 -1
  6. Typhon/Driver/translate.py +16 -11
  7. Typhon/Driver/utils.py +39 -1
  8. Typhon/Grammar/_typhon_parser.py +2920 -2718
  9. Typhon/Grammar/parser.py +80 -53
  10. Typhon/Grammar/parser_helper.py +68 -87
  11. Typhon/Grammar/syntax_errors.py +41 -20
  12. Typhon/Grammar/token_factory_custom.py +541 -485
  13. Typhon/Grammar/tokenizer_custom.py +52 -0
  14. Typhon/Grammar/typhon_ast.py +754 -76
  15. Typhon/Grammar/typhon_ast_error.py +438 -0
  16. Typhon/Grammar/unparse_custom.py +25 -0
  17. Typhon/LanguageServer/__init__.py +3 -0
  18. Typhon/LanguageServer/client/__init__.py +42 -0
  19. Typhon/LanguageServer/client/pyrefly.py +115 -0
  20. Typhon/LanguageServer/client/pyright.py +173 -0
  21. Typhon/LanguageServer/semantic_tokens.py +446 -0
  22. Typhon/LanguageServer/server.py +376 -0
  23. Typhon/LanguageServer/utils.py +65 -0
  24. Typhon/SourceMap/ast_match_based_map.py +199 -152
  25. Typhon/SourceMap/ast_matching.py +102 -87
  26. Typhon/SourceMap/datatype.py +275 -264
  27. Typhon/SourceMap/defined_name_retrieve.py +145 -0
  28. Typhon/Transform/comprehension_to_function.py +2 -5
  29. Typhon/Transform/const_member_to_final.py +12 -7
  30. Typhon/Transform/extended_patterns.py +139 -0
  31. Typhon/Transform/forbidden_statements.py +25 -0
  32. Typhon/Transform/if_while_let.py +122 -11
  33. Typhon/Transform/inline_statement_block_capture.py +22 -15
  34. Typhon/Transform/optional_operators_to_checked.py +14 -6
  35. Typhon/Transform/placeholder_to_function.py +0 -1
  36. Typhon/Transform/record_to_dataclass.py +22 -238
  37. Typhon/Transform/scope_check_rename.py +109 -29
  38. Typhon/Transform/transform.py +16 -12
  39. Typhon/Transform/type_abbrev_desugar.py +11 -15
  40. Typhon/Transform/type_annotation_check_expand.py +2 -2
  41. Typhon/Transform/utils/__init__.py +0 -0
  42. Typhon/Transform/utils/imports.py +83 -0
  43. Typhon/Transform/{utils.py → utils/jump_away.py} +2 -38
  44. Typhon/Transform/utils/make_class.py +135 -0
  45. Typhon/Transform/visitor.py +25 -0
  46. Typhon/Typing/pyrefly.py +145 -0
  47. Typhon/Typing/pyright.py +141 -144
  48. Typhon/Typing/result_diagnostic.py +1 -1
  49. Typhon/__main__.py +15 -1
  50. {typhon_language-0.1.2.dist-info → typhon_language-0.1.4.dist-info}/METADATA +13 -6
  51. typhon_language-0.1.4.dist-info/RECORD +65 -0
  52. {typhon_language-0.1.2.dist-info → typhon_language-0.1.4.dist-info}/WHEEL +1 -1
  53. typhon_language-0.1.4.dist-info/licenses/LICENSE +201 -0
  54. typhon_language-0.1.2.dist-info/RECORD +0 -48
  55. typhon_language-0.1.2.dist-info/licenses/LICENSE +0 -21
  56. {typhon_language-0.1.2.dist-info → typhon_language-0.1.4.dist-info}/entry_points.txt +0 -0
  57. {typhon_language-0.1.2.dist-info → typhon_language-0.1.4.dist-info}/top_level.txt +0 -0
Typhon/Grammar/token_factory_custom.py
@@ -1,485 +1,541 @@
- from typing import Callable, Iterator, Literal
- from tokenize import TokenInfo, generate_tokens
- import tokenize
- import re
- from dataclasses import dataclass
- from ..Driver.debugging import debug_print, debug_verbose_print
- from enum import Enum, auto
-
-
- def _regularize_token_type(token_type: int) -> int:
-     """Convert token type to a regularized form for Typhon.
-
-     NL -> NEWLINE
-     """
-     if token_type == tokenize.NL:
-         return tokenize.NEWLINE
-     return token_type
-
-
- @dataclass
- class _BlockComment:
-     start_line: int
-     start_col: int
-     end_line: int
-     end_col: int
-     comment: str
-     lines: str
-
-     def __hash__(self) -> int:
-         return hash(
-             (
-                 self.start_line,
-                 self.start_col,
-             )
-         )
-
-
- class _StrKind(Enum):
-     SINGLE_QUOTE = auto()
-     DOUBLE_QUOTE = auto()
-     SINGLE_QUOTE_DOCSTRING = auto()
-     DOUBLE_QUOTE_DOCSTRING = auto()
-     FSTRING_START = auto()
-
-
- @dataclass
- class _StrPrefix:
-     is_raw: bool
-     is_fstring: bool
-
-
- @dataclass
- class _Str:
-     prefix: _StrPrefix
-     kind: _StrKind
-
-     def is_raw(self) -> bool:
-         return self.prefix.is_raw
-
-     def is_fstring(self) -> bool:
-         return self.prefix.is_fstring
-
-
- # Line parser that handles block comments and strings.
- # This is ONLY for implementing block comments that can span multiple lines.
- class _LineParser:
-     def __init__(self, readline: Callable[[], str]) -> None:
-         self.readline = readline
-         self.line = ""
-         self.result_line = ""
-         self.line_num = 0
-         self._column = 0
-         # Is inside string. Note this is false in f-string expression parts unless not in the string in the expression.
-         self.in_string = False
-         self.in_comment = False
-
-         self.interpolation_stack: list[Literal["{"]] = []
-         self.str_context: list[_Str] = []
-         self.bracket_stack_in_interpolation: list[str] = []
-         self.block_comment_begin_stack: list[_BlockComment] = []
-         self.outermost_block_comments: list[_BlockComment] = []
-
-     def _next_char(self) -> str | None:
-         if self._column >= len(self.line):
-             return None
-         ch = self.line[self._column]
-         self._column += 1
-         return ch
-
-     # Current column of character taken last time.
-     def _get_char_column(self) -> int:
-         return self._column - 1
-
-     def _peek_char(self, offset: int = 0) -> str | None:
-         if self._column + offset >= len(self.line):
-             return None
-         return self.line[self._column + offset]
-
-     def _passed(self) -> str:
-         return self.line[: self._column]
-
-     def _pop_index(self, bracket: str) -> int | None:
-         for idx in range(len(self.bracket_stack_in_interpolation) - 1, -1, -1):
-             if self.bracket_stack_in_interpolation[idx] == bracket:
-                 return idx
-         return None
-
-     def _commit(self, ch: str | None) -> None:
-         if ch is not None:
-             if self.block_comment_begin_stack:
-                 # Inside block comment, do not commit to result line
-                 self.block_comment_begin_stack[0].comment += ch
-             else:
-                 # Normal code
-                 self.result_line += ch
-
-     def _handle_bracket(self, ch: str) -> None:
-         if self.interpolation_stack:
-             if ch == "{":
-                 self.bracket_stack_in_interpolation.append("{")
-             elif ch == "[":
-                 self.bracket_stack_in_interpolation.append("[")
-             elif ch == "(":
-                 self.bracket_stack_in_interpolation.append("(")
-             # Unclosed brackets to be ignored.
-             elif ch == "}":
-                 if (pop_idx := self._pop_index("{")) is not None:
-                     self.bracket_stack_in_interpolation = (
-                         self.bracket_stack_in_interpolation[:pop_idx]
-                     )
-                 if not self.bracket_stack_in_interpolation:
-                     # All brackets closed, end of interpolation
-                     self.interpolation_stack.pop()
-                     self.in_string = True
-             elif ch == "]":
-                 if (pop_idx := self._pop_index("[")) is not None:
-                     self.bracket_stack_in_interpolation = (
-                         self.bracket_stack_in_interpolation[:pop_idx]
-                     )
-             elif ch == ")":
-                 if (pop_idx := self._pop_index("(")) is not None:
-                     self.bracket_stack_in_interpolation = (
-                         self.bracket_stack_in_interpolation[:pop_idx]
-                     )
-         elif self.str_context and self.str_context[-1].is_fstring() and ch == "{":
-             self.interpolation_stack.append("{")
-             self.bracket_stack_in_interpolation.append("{")
-             self.in_string = False
-
-     def _get_str_prefix(self) -> _StrPrefix:
-         is_raw = False
-         is_fstring = False
-         for back_ch in reversed(self._passed()[:-1]):
-             if back_ch in {"r", "R"}:
-                 is_raw = True
-             elif back_ch in {"f", "F", "t", "T"}:
-                 is_fstring = True
-             elif back_ch in {"b", "B"}:
-                 continue
-             else:
-                 break
-         return _StrPrefix(is_raw=is_raw, is_fstring=is_fstring)
-
-     def _handle_string_delim(self, ch: str) -> None:
-         if self.in_string:
-             # Possible string end
-             assert self.str_context, "String context stack should not be empty"
-             prefix = self.str_context[-1].prefix
-             kind = self.str_context[-1].kind
-             debug_verbose_print(
-                 f"Handling string may end delim: {ch!r} kind={kind} prefix={prefix} column={self._get_char_column()}"
-             )
-             if kind == _StrKind.SINGLE_QUOTE and ch == "'":
-                 self.str_context.pop()
-                 self.in_string = False
-                 return
-             elif kind == _StrKind.DOUBLE_QUOTE and ch == '"':
-                 self.str_context.pop()
-                 self.in_string = False
-                 return
-             elif kind == _StrKind.SINGLE_QUOTE_DOCSTRING and ch == "'":
-                 next_ch = self._peek_char()
-                 third_ch = self._peek_char(1)
-                 if next_ch == "'" and third_ch == "'":
-                     self._commit(self._next_char())  # consume
-                     self._commit(self._next_char())  # consume
-                     self.str_context.pop()
-                     self.in_string = False
-                     return
-             elif kind == _StrKind.DOUBLE_QUOTE_DOCSTRING and ch == '"':
-                 next_ch = self._peek_char()
-                 third_ch = self._peek_char(1)
-                 if next_ch == '"' and third_ch == '"':
-                     self._commit(self._next_char())  # consume
-                     self._commit(self._next_char())  # consume
-                     self.str_context.pop()
-                     self.in_string = False
-                     return
-         else:
-             # String start
-             prefix = self._get_str_prefix()
-             next_ch = self._peek_char()
-             debug_verbose_print(
-                 f"Handling string start delim: {ch!r} next_ch={next_ch!r} prefix={prefix} passed={self._passed()}"
-             )
-             self.in_string = True
-             if next_ch == ch:
-                 # Maybe triple quote
-                 third_ch = self._peek_char(1)
-                 if third_ch == ch:
-                     self._commit(self._next_char())  # consume
-                     self._commit(self._next_char())  # consume
-                     # Docstring
-                     if ch == "'":
-                         self.str_context.append(
-                             _Str(prefix, _StrKind.SINGLE_QUOTE_DOCSTRING)
-                         )
-                     else:
-                         self.str_context.append(
-                             _Str(prefix, _StrKind.DOUBLE_QUOTE_DOCSTRING)
-                         )
-                     return
-             if ch == "'":
-                 self.str_context.append(_Str(prefix, _StrKind.SINGLE_QUOTE))
-             else:
-                 self.str_context.append(_Str(prefix, _StrKind.DOUBLE_QUOTE))
-             return
-
-     def _handle_comment(self) -> None:
-         first_sharp_column = self._get_char_column()
-         debug_verbose_print(
-             f"Handling comment at line {self.line_num} col {first_sharp_column} in line: {self.line!r}"
-         )
-         # Block comment begin in front
-         while self._peek_char() == "#":
-             self._next_char()
-         if self._peek_char() == "(":
-             # Block comment begin
-             # Consume the '('
-             self._next_char()
-             # All # and (
-             comment_starter = self.line[
-                 first_sharp_column : self._get_char_column() + 1
-             ]
-             debug_verbose_print(
-                 f"Block comment begin detected at col {first_sharp_column} in line comment_starter={comment_starter}: {self.line!r}"
-             )
-             self.block_comment_begin_stack.append(
-                 _BlockComment(
-                     start_line=self.line_num,
-                     start_col=first_sharp_column,
-                     end_line=0,
-                     end_col=0,
-                     comment="",
-                     lines=self.line,
-                 )
-             )
-             # Accumulate the begin part to the outermost block comment
-             self.block_comment_begin_stack[0].comment += comment_starter
-         elif not self.block_comment_begin_stack:
-             # Normal comment line, skip to end
-             self.result_line += self.line[first_sharp_column:]
-             self._column = len(self.line)
-         else:
-             # Inside block comment, just commit the '#'
-             self.block_comment_begin_stack[0].comment += self.line[
-                 first_sharp_column : self._get_char_column()
-             ]
-
-     def _handle_block_comment_end(self) -> None:
-         if self.block_comment_begin_stack:
-             while self._peek_char() == "#":
-                 self._commit(self._next_char())
-             debug_verbose_print(
-                 f"Block comment end detected at col {self._column} in line: {self.line!r} "
-             )
-             if len(self.block_comment_begin_stack) == 1:
-                 block_comment = self.block_comment_begin_stack[-1]
-                 block_comment.end_line = self.line_num
-                 block_comment.end_col = self._column  # after the last '#'
-                 self.outermost_block_comments.append(block_comment)
-                 self.in_comment = False
-                 debug_verbose_print(
-                     f"block comment from line {block_comment.start_line} col {block_comment.start_col} "
-                     f"to line {block_comment.end_line} col {block_comment.end_col}"
-                 )
-                 self.result_line += " "  # Replace block comment with space
-             # Pop the block comment begin
-             self.block_comment_begin_stack.pop()
-
-     def _next_line(self) -> None:
-         self.line = self.readline()
-         self._column = 0
-         self.line_num += 1
-
-     # Parse the line and return true start/end of block comment.
-     # block comment begin/end is ignored in string/docstring.
-     # They are valid in f-string expressions.
-     def parse_next_line(self) -> str:
-         self._next_line()
-         ch = ""
-         while True:
-             ch = self._next_char()
-             if ch is None:
-                 # End of line. Continue if block comment continues.
-                 if self.block_comment_begin_stack:
-                     self._next_line()
-                     continue
-                 # True end of line
-                 break
-             if self.block_comment_begin_stack:
-                 # Inside block comment
-                 if ch == "#":
-                     self._handle_comment()
-                 if ch == ")" and self._peek_char() == "#":
-                     self._commit(ch)
-                     self._handle_block_comment_end()
-                 else:
-                     self._commit(ch)
-             elif self.in_string:  # Inside string
-                 self._commit(ch)
-                 if ch in {"'", '"'}:
-                     self._handle_string_delim(ch)
-                 elif ch == "\\" and not self.str_context[-1].is_raw():
-                     self._commit(self._next_char())  # consume escape character
-                 elif (
-                     ch == "{" and self.str_context and self.str_context[-1].is_fstring()
-                 ):
-                     # Possible interpolation start
-                     self._handle_bracket(ch)
-             else:  # Normal code
-                 if ch == "#":
-                     self._handle_comment()
-                 else:
-                     self._commit(ch)
-                     if ch in {"'", '"'}:
-                         self._handle_string_delim(ch)
-                     elif ch in {"{", "}", "(", ")", "[", "]"}:
-                         self._handle_bracket(ch)
-         result = self.result_line
-         self.result_line = ""
-         debug_verbose_print(f"Parsed line {self.line_num} result: {result!r}")
-         return result
-
-
- def generate_and_postprocess_tokens(
-     readline: Callable[[], str],
-     unconsumed_block_comment: list[_BlockComment],
- ) -> Iterator[TokenInfo]:
-     """Generate tokens from readline, handling block comments."""
-     line_offset_already_consumed = 0
-     block_comment_already_output: set[_BlockComment] = set()
-     # Adjust token positions from generated tokens, and mix in block comment tokens.
-     for tok in generate_tokens(readline):
-         debug_verbose_print(
-             f"Generated token: {tok.string!r} type={tok.type} start={tok.start} end={tok.end}"
-         )
-         tok_start_line, tok_start_col = tok.start
-         while (
-             unconsumed_block_comment
-             and (block_comment := unconsumed_block_comment[0])
-             and (
-                 block_comment.end_line < tok_start_line
-                 or (
-                     block_comment.end_line == tok_start_line
-                     and block_comment.end_col <= tok_start_col
-                 )
-             )
-         ):
-             debug_verbose_print(
-                 f"pop block comment token: {block_comment.comment!r} start=({block_comment.start_line}, {block_comment.start_col}) end=({block_comment.end_line}, {block_comment.end_col})"
-             )
-             # Pop away comments that will never affect to remaining tokens.
-             # Remove already passed block comments.
-             line_offset_already_consumed += (
-                 block_comment.end_line - block_comment.start_line
-             )
-             unconsumed_block_comment.pop(0)
-         # Adjust the token position if there are block comments before this token.
-         adjusted_start_line, adjusted_start_col = tok.start
-         adjusted_end_line, adjusted_end_col = tok.end
-         adjusted_start_line += line_offset_already_consumed
-         adjusted_end_line += line_offset_already_consumed
-         for block_comment in unconsumed_block_comment:
-             if (block_comment.start_line, block_comment.start_col) >= (
-                 adjusted_start_line,
-                 adjusted_start_col,
-             ):
-                 break
-             # This block comment is before the token, yield here first.
-             if block_comment not in block_comment_already_output:
-                 block_comment_already_output.add(block_comment)
-                 debug_verbose_print(
-                     f"Yielding block comment at start=({block_comment.start_line}, {block_comment.start_col}) "
-                     f"end=({block_comment.end_line}, {block_comment.end_col})"
-                 )
-                 yield TokenInfo(
-                     type=tokenize.COMMENT,
-                     string=block_comment.comment,
-                     start=(block_comment.start_line, block_comment.start_col),
-                     end=(block_comment.end_line, block_comment.end_col),
-                     line=block_comment.lines,
-                 )
-             # The length of the last line of block comment.
-             block_comment_last_line_len = (
-                 block_comment.end_col - block_comment.start_col
-                 if block_comment.end_line == block_comment.start_line
-                 else block_comment.end_col
-             )
-             # Adjust start position
-             debug_verbose_print(
-                 f"Adjusting token start {tok.string!r} adjusted_start: {(adjusted_start_line, adjusted_start_col)} adjusted_end:{(adjusted_end_col, adjusted_end_col)} block_comment.start_col: {block_comment.start_col} block_comment.end_col:{block_comment.end_col} block_comment_last_line_len: {block_comment_last_line_len} block_comment.start_line: {block_comment.start_line} block_comment.end_line: {block_comment.end_line}"
-             )
-             # Line start adjustment: shift down by number of lines in block comment.
-             adjusted_start_line = adjusted_start_line + (
-                 block_comment.end_line - block_comment.start_line
-             )
-             # Column start adjustment:
-             adjusted_start_col = (
-                 # If last of the comment is on the same line, add the length of block comment end part.
-                 (
-                     adjusted_start_col
-                     - block_comment.start_col
-                     + block_comment.end_col
-                     - 1  # Account for space
-                 )
-                 if adjusted_start_line == block_comment.end_line
-                 # If on different lines, the start of the token itself is.
-                 else adjusted_start_col
-             )
-             # Adjust end position
-             # Line end adjustment: same as start
-             adjusted_end_line = adjusted_end_line + (
-                 block_comment.end_line - block_comment.start_line
-             )
-             # Column end adjustment:
-             adjusted_end_col = (
-                 # If last of the comment is on the same line, add the length of block comment end part.
-                 (
-                     adjusted_end_col
-                     - block_comment.start_col
-                     + block_comment.end_col
-                     - 1  # Account for space
-                 )
-                 if adjusted_end_line == block_comment.end_line
-                 # If on different lines, the start of the token itself is.
-                 else adjusted_end_col
-             )
-             debug_verbose_print(
-                 f"Block Comment Adjusting token {tok.string!r} to start=({adjusted_start_line}, {adjusted_start_col}) "
-                 f"end=({adjusted_end_line}, {adjusted_end_col}) due to block comment"
-             )
-         debug_verbose_print(
-             f"Yielding token {tok.string!r} at adjusted start=({adjusted_start_line}, {adjusted_start_col}) "
-             f"end=({adjusted_end_line}, {adjusted_end_col})"
-         )
-         yield TokenInfo(
-             type=_regularize_token_type(tok.type),
-             string=tok.string,
-             start=(adjusted_start_line, adjusted_start_col),
-             end=(adjusted_end_line, adjusted_end_col),
-             line=tok.line,
-         )
-     for block_comment in unconsumed_block_comment:
-         # Yield remaining unconsumed block comments at the end.
-         if block_comment not in block_comment_already_output:
-             debug_verbose_print(
-                 f"Yielding remaining block comment at end: start=({block_comment.start_line}, {block_comment.start_col}) "
-                 f"end=({block_comment.end_line}, {block_comment.end_col})"
-             )
-             yield TokenInfo(
-                 type=tokenize.COMMENT,
-                 string=block_comment.comment,
-                 start=(block_comment.start_line, block_comment.start_col),
-                 end=(block_comment.end_line, block_comment.end_col),
-                 line=block_comment.lines,
-             )
-
-
- def token_stream_factory(readline: Callable[[], str]) -> Iterator[TokenInfo]:
-     line_parser = _LineParser(readline)
-
-     yield from generate_and_postprocess_tokens(
-         line_parser.parse_next_line, line_parser.outermost_block_comments
-     )
+ import sys
+ import token
+ from typing import Callable, Iterator, Literal
+ from tokenize import TokenInfo, generate_tokens
+ import tokenize
+ import re
+ from dataclasses import dataclass
+ from ..Driver.debugging import debug_print, debug_verbose_print
+ from enum import Enum, auto
+
+
+ def generate_tokens_ignore_error(readline: Callable[[], str]) -> Iterator[TokenInfo]:
+     # yield from _generate_tokens_parso(readline)
+     try:
+         for tok in generate_tokens(readline):
+             yield tok
+     except tokenize.TokenError as e:
+         # Ignore the error on EOF in multiline.
+         message: str
+         lineno: int
+         offset: int
+         message, (lineno, offset) = e.args
+         pos = (lineno, offset)
+         print(f"Tokenization error ignored at {pos}: {e}")
+         yield TokenInfo(token.ENDMARKER, "", pos, pos, "")
+
+
+ def _regularize_token_type(token_type: int) -> int:
+     """Convert token type to a regularized form for Typhon.
+
+     NL -> NEWLINE
+     """
+     if token_type == tokenize.NL:
+         return tokenize.NEWLINE
+     return token_type
+
+
+ @dataclass
+ class _BlockComment:
+     start_line: int
+     start_col: int
+     end_line: int
+     end_col: int
+     comment: str
+     lines: str
+
+     def __hash__(self) -> int:
+         return hash(
+             (
+                 self.start_line,
+                 self.start_col,
+             )
+         )
+
+
+ class _StrKind(Enum):
+     SINGLE_QUOTE = auto()
+     DOUBLE_QUOTE = auto()
+     SINGLE_QUOTE_DOCSTRING = auto()
+     DOUBLE_QUOTE_DOCSTRING = auto()
+     FSTRING_START = auto()
+
+
+ @dataclass
+ class _StrPrefix:
+     is_raw: bool
+     is_fstring: bool
+
+
+ @dataclass
+ class _Str:
+     prefix: _StrPrefix
+     kind: _StrKind
+
+     def is_raw(self) -> bool:
+         return self.prefix.is_raw
+
+     def is_fstring(self) -> bool:
+         return self.prefix.is_fstring
+
+
+ # Line parser that handles block comments and strings.
+ # This is ONLY for implementing block comments that can span multiple lines.
+ class _LineParser:
+     def __init__(self, readline: Callable[[], str]) -> None:
+         self.readline = readline
+         self.line = ""
+         self.result_line = ""
+         self.line_num = 0
+         self._column = 0
+         # Is inside string. Note this is false in f-string expression parts unless not in the string in the expression.
+         self.in_string = False
+         self.in_comment = False
+
+         # For f-string interpolation handling.
+         self.interpolation_stack: list[Literal["{"]] = []
+         # String context stack for nested strings (only in f-string expressions).
+         self.str_context: list[_Str] = []
+         # To count the brackets in f-string interpolation.
+         self.bracket_stack_in_interpolation: list[str] = []
+         self.block_comment_begin_stack: list[_BlockComment] = []
+         self.outermost_block_comments: list[_BlockComment] = []
+         self.line_head_spaces: list[str] = []
+
+     def _next_char(self) -> str | None:
+         if self._column >= len(self.line):
+             return None
+         ch = self.line[self._column]
+         self._column += 1
+         return ch
+
+     # Current column of character taken last time.
+     def _get_char_column(self) -> int:
+         return self._column - 1
+
+     def _peek_char(self, offset: int = 0) -> str | None:
+         if self._column + offset >= len(self.line):
+             return None
+         return self.line[self._column + offset]
+
+     def _passed(self) -> str:
+         return self.line[: self._column]
+
+     def _pop_index(self, bracket: str) -> int | None:
+         for idx in range(len(self.bracket_stack_in_interpolation) - 1, -1, -1):
+             if self.bracket_stack_in_interpolation[idx] == bracket:
+                 return idx
+         return None
+
+     def _commit(self, ch: str | None) -> None:
+         if ch is not None:
+             if self.block_comment_begin_stack:
+                 # Inside block comment, do not commit to result line
+                 self.block_comment_begin_stack[0].comment += ch
+             else:
+                 # Normal code
+                 self.result_line += ch
+
+     def _handle_bracket(self, ch: str) -> None:
+         if self.interpolation_stack:
+             if ch == "{":
+                 self.bracket_stack_in_interpolation.append("{")
+             elif ch == "[":
+                 self.bracket_stack_in_interpolation.append("[")
+             elif ch == "(":
+                 self.bracket_stack_in_interpolation.append("(")
+             # Unclosed brackets to be ignored.
+             elif ch == "}":
+                 if (pop_idx := self._pop_index("{")) is not None:
+                     self.bracket_stack_in_interpolation = (
+                         self.bracket_stack_in_interpolation[:pop_idx]
+                     )
+                 if not self.bracket_stack_in_interpolation:
+                     # All brackets closed, end of interpolation
+                     self.interpolation_stack.pop()
+                     self.in_string = True
+             elif ch == "]":
+                 if (pop_idx := self._pop_index("[")) is not None:
+                     self.bracket_stack_in_interpolation = (
+                         self.bracket_stack_in_interpolation[:pop_idx]
+                     )
+             elif ch == ")":
+                 if (pop_idx := self._pop_index("(")) is not None:
+                     self.bracket_stack_in_interpolation = (
+                         self.bracket_stack_in_interpolation[:pop_idx]
+                     )
+         elif self.str_context and self.str_context[-1].is_fstring() and ch == "{":
+             # Start of f-string interpolation
+             debug_verbose_print(
+                 f"Starting f-string interpolation at column={self._get_char_column()}"
+             )
+             self.interpolation_stack.append("{")
+             self.bracket_stack_in_interpolation.append("{")
+             self.in_string = False
+
+     def _get_str_prefix(self) -> _StrPrefix:
+         is_raw = False
+         is_fstring = False
+         for back_ch in reversed(self._passed()[:-1]):
+             if back_ch in {"r", "R"}:
+                 is_raw = True
+             elif back_ch in {"f", "F", "t", "T"}:
+                 is_fstring = True
+             elif back_ch in {"b", "B"}:
+                 continue
+             else:
+                 break
+         debug_verbose_print(
+             f"Determined string prefix {list(reversed(self._passed()[:-1]))[0:2]} is_raw={is_raw} is_fstring={is_fstring} at column={self._get_char_column()}"
+         )
+         return _StrPrefix(is_raw=is_raw, is_fstring=is_fstring)
+
+     def _handle_string_delim(self, ch: str) -> None:
+         if self.in_string:
+             # Possible string end
+             assert self.str_context, "String context stack should not be empty"
+             prefix = self.str_context[-1].prefix
+             kind = self.str_context[-1].kind
+             debug_verbose_print(
+                 f"Handling string may end delim: {ch!r} kind={kind} prefix={prefix} column={self._get_char_column()}"
+             )
+             if kind == _StrKind.SINGLE_QUOTE and ch == "'":
+                 self.str_context.pop()
+                 self.in_string = False
+                 return
+             elif kind == _StrKind.DOUBLE_QUOTE and ch == '"':
+                 self.str_context.pop()
+                 self.in_string = False
+                 return
+             elif kind == _StrKind.SINGLE_QUOTE_DOCSTRING and ch == "'":
+                 next_ch = self._peek_char()
+                 third_ch = self._peek_char(1)
+                 if next_ch == "'" and third_ch == "'":
+                     self._commit(self._next_char())  # consume
+                     self._commit(self._next_char())  # consume
+                     self.str_context.pop()
+                     self.in_string = False
+                     return
+             elif kind == _StrKind.DOUBLE_QUOTE_DOCSTRING and ch == '"':
+                 next_ch = self._peek_char()
+                 third_ch = self._peek_char(1)
+                 if next_ch == '"' and third_ch == '"':
+                     self._commit(self._next_char())  # consume
+                     self._commit(self._next_char())  # consume
+                     self.str_context.pop()
+                     self.in_string = False
+                     return
+         else:
+             # String start
+             prefix = self._get_str_prefix()
+             next_ch = self._peek_char()
+             debug_verbose_print(
+                 f"Handling string start delim: {ch!r} next_ch={next_ch!r} prefix={prefix} passed={self._passed()} column={self._get_char_column()}"
+             )
+             self.in_string = True
+             if next_ch == ch:
+                 # Maybe triple quote
+                 third_ch = self._peek_char(1)
+                 if third_ch == ch:
+                     self._commit(self._next_char())  # consume
+                     self._commit(self._next_char())  # consume
+                     # Docstring
+                     if ch == "'":
+                         self.str_context.append(
+                             _Str(prefix, _StrKind.SINGLE_QUOTE_DOCSTRING)
+                         )
+                     else:
+                         self.str_context.append(
+                             _Str(prefix, _StrKind.DOUBLE_QUOTE_DOCSTRING)
+                         )
+                     return
+             if ch == "'":
+                 self.str_context.append(_Str(prefix, _StrKind.SINGLE_QUOTE))
+             else:
+                 self.str_context.append(_Str(prefix, _StrKind.DOUBLE_QUOTE))
+             return
+
+     def _handle_comment(self) -> None:
+         first_sharp_column = self._get_char_column()
+         debug_verbose_print(
+             f"Handling comment at line {self.line_num} col {first_sharp_column} in line: {self.line!r}"
+         )
+         # Block comment begin in front
+         while self._peek_char() == "#":
+             self._next_char()
+         if self._peek_char() == "(":
+             # Block comment begin
+             # Consume the '('
+             self._next_char()
+             # All # and (
+             comment_starter = self.line[
+                 first_sharp_column : self._get_char_column() + 1
+             ]
+             debug_verbose_print(
+                 f"Block comment begin detected at col {first_sharp_column} in line comment_starter={comment_starter}: {self.line!r}"
+             )
+             self.block_comment_begin_stack.append(
+                 _BlockComment(
+                     start_line=self.line_num,
+                     start_col=first_sharp_column,
+                     end_line=0,
+                     end_col=0,
+                     comment="",
+                     lines=self.line,
+                 )
+             )
+             # Accumulate the begin part to the outermost block comment
+             self.block_comment_begin_stack[0].comment += comment_starter
+         elif not self.block_comment_begin_stack:
+             # Normal comment line, skip to end
+             self.result_line += self.line[first_sharp_column:]
+             self._column = len(self.line)
+         else:
+             # Inside block comment, just commit the '#'
+             self.block_comment_begin_stack[0].comment += self.line[
+                 first_sharp_column : self._get_char_column()
+             ]
+
+     def _handle_block_comment_end(self) -> None:
+         if self.block_comment_begin_stack:
+             while self._peek_char() == "#":
+                 self._commit(self._next_char())
+             debug_verbose_print(
+                 f"Block comment end detected at col {self._column} in line: {self.line!r} "
+             )
+             if len(self.block_comment_begin_stack) == 1:
+                 block_comment = self.block_comment_begin_stack[-1]
+                 block_comment.end_line = self.line_num
+                 block_comment.end_col = self._column  # after the last '#'
+                 self.outermost_block_comments.append(block_comment)
+                 self.in_comment = False
+                 debug_verbose_print(
+                     f"block comment from line {block_comment.start_line} col {block_comment.start_col} "
+                     f"to line {block_comment.end_line} col {block_comment.end_col}"
+                 )
+                 self.result_line += " "  # Replace block comment with space
+             # Pop the block comment begin
+             self.block_comment_begin_stack.pop()
+
+     def _cut_line_head_spaces(
+         self, line: str, line_head_in_string_or_comment: bool
+     ) -> str:
+         if not line_head_in_string_or_comment:
+             match = re.match(r"[ \t]*", line)
+             if match:
+                 self.line_head_spaces.append(match.group(0))
+                 return line[match.end() :]
+         self.line_head_spaces.append("")
+         return line
+
+     def _next_line(self) -> None:
+         self.line = self.readline()
+         self._column = 0
+         self.line_num += 1
+
+     # Parse the line and return true start/end of block comment.
+     # block comment begin/end is ignored in string/docstring.
+     # They are valid in f-string expressions.
+     def parse_next_line(self) -> str:
+         self._next_line()
+         ch = ""
+         line_head_in_string_or_comment = self.in_string or self.in_comment
+         while True:
+             ch = self._next_char()
+             if ch is None:
+                 # End of line. Continue if block comment continues.
+                 if self.block_comment_begin_stack:
+                     self._next_line()
+                     continue
+                 # True end of line
+                 break
+             if self.block_comment_begin_stack:
+                 # Inside block comment
+                 if ch == "#":
+                     self._handle_comment()
+                 if ch == ")" and self._peek_char() == "#":
+                     self._commit(ch)
+                     self._handle_block_comment_end()
+                 else:
+                     self._commit(ch)
+             elif self.in_string:  # Inside string
+                 self._commit(ch)
+                 if ch in {"'", '"'}:
+                     self._handle_string_delim(ch)
+                 elif ch == "\\" and not self.str_context[-1].is_raw():
+                     self._commit(self._next_char())  # consume escape character
+                 elif (
+                     ch == "{" and self.str_context and self.str_context[-1].is_fstring()
+                 ):
+                     # Possible interpolation start
+                     self._handle_bracket(ch)
+             else:  # Normal code
+                 if ch == "#":
+                     self._handle_comment()
+                 else:
+                     self._commit(ch)
+                     if ch in {"'", '"'}:
+                         self._handle_string_delim(ch)
+                     elif ch in {"{", "}", "(", ")", "[", "]"}:
+                         self._handle_bracket(ch)
+         result = self._cut_line_head_spaces(
+             self.result_line, line_head_in_string_or_comment
+         )
+         self.result_line = ""
+         debug_verbose_print(f"Parsed line {self.line_num} result: {result!r}")
+         return result
+
+
+ def _generate_and_postprocess_tokens(
+     readline: Callable[[], str],  # After block comment is processed.
+     unconsumed_block_comment: list[_BlockComment],
+     head_space_lines: list[str],
+ ) -> Iterator[TokenInfo]:
+     """Generate tokens from readline, handling head space and block comments."""
+     line_offset_already_consumed = 0
+     block_comment_already_output: set[_BlockComment] = set()
+     # Adjust token positions from generated tokens, and mix in block comment tokens.
+     for tok in generate_tokens_ignore_error(readline):
+         debug_verbose_print(
+             f"Generated token: {tok.string!r} type={tok.type} start={tok.start} end={tok.end}"
+         )
+         # Retrieve the line head spaces for this line.
+         start = (
+             tok.start[0],
+             tok.start[1] + len(head_space_lines[tok.start[0] - 1]),
+         )
+         end = (
+             tok.end[0],
+             tok.end[1] + len(head_space_lines[tok.end[0] - 1]),
+         )
+         # Gather unconsumed block comments before this token.
+         tok_start_line, tok_start_col = start
+         while (
+             unconsumed_block_comment
+             and (block_comment := unconsumed_block_comment[0])
+             and (
+                 block_comment.end_line < tok_start_line
+                 or (
+                     block_comment.end_line == tok_start_line
+                     and block_comment.end_col <= tok_start_col
+                 )
+             )
+         ):
+             debug_verbose_print(
+                 f"pop block comment token: {block_comment.comment!r} start=({block_comment.start_line}, {block_comment.start_col}) end=({block_comment.end_line}, {block_comment.end_col})"
+             )
+             # Pop away comments that will never affect to remaining tokens.
+             # Remove already passed block comments.
+             line_offset_already_consumed += (
+                 block_comment.end_line - block_comment.start_line
+             )
+             unconsumed_block_comment.pop(0)
+         # Adjust the token position if there are block comments before this token.
+         adjusted_start_line, adjusted_start_col = start
+         adjusted_end_line, adjusted_end_col = end
+         adjusted_start_line += line_offset_already_consumed
+         adjusted_end_line += line_offset_already_consumed
+         for block_comment in unconsumed_block_comment:
+             if (block_comment.start_line, block_comment.start_col) >= (
+                 adjusted_start_line,
+                 adjusted_start_col,
+             ):
+                 break
+             # This block comment is before the token, yield here first.
+             if block_comment not in block_comment_already_output:
+                 block_comment_already_output.add(block_comment)
+                 debug_verbose_print(
+                     f"Yielding block comment at start=({block_comment.start_line}, {block_comment.start_col}) "
+                     f"end=({block_comment.end_line}, {block_comment.end_col})"
+                 )
+                 yield TokenInfo(
+                     type=tokenize.COMMENT,
+                     string=block_comment.comment,
+                     start=(block_comment.start_line, block_comment.start_col),
+                     end=(block_comment.end_line, block_comment.end_col),
+                     line=block_comment.lines,
+                 )
+             # The length of the last line of block comment.
+             block_comment_last_line_len = (
+                 block_comment.end_col - block_comment.start_col
+                 if block_comment.end_line == block_comment.start_line
+                 else block_comment.end_col
+             )
+             # Adjust start position
+             debug_verbose_print(
+                 f"Adjusting token start {tok.string!r} adjusted_start: {(adjusted_start_line, adjusted_start_col)} adjusted_end:{(adjusted_end_col, adjusted_end_col)} block_comment.start_col: {block_comment.start_col} block_comment.end_col:{block_comment.end_col} block_comment_last_line_len: {block_comment_last_line_len} block_comment.start_line: {block_comment.start_line} block_comment.end_line: {block_comment.end_line}"
+             )
+             # Line start adjustment: shift down by number of lines in block comment.
+             adjusted_start_line = adjusted_start_line + (
+                 block_comment.end_line - block_comment.start_line
+             )
+             # Column start adjustment:
+             adjusted_start_col = (
+                 # If last of the comment is on the same line, add the length of block comment end part.
+                 (
+                     adjusted_start_col
+                     - block_comment.start_col
+                     + block_comment.end_col
+                     - 1  # Account for space
+                 )
+                 if adjusted_start_line == block_comment.end_line
+                 # If on different lines, the start of the token itself is.
+                 else adjusted_start_col
+             )
+             # Adjust end position
+             # Line end adjustment: same as start
+             adjusted_end_line = adjusted_end_line + (
+                 block_comment.end_line - block_comment.start_line
+             )
+             # Column end adjustment:
+             adjusted_end_col = (
+                 # If last of the comment is on the same line, add the length of block comment end part.
+                 (
+                     adjusted_end_col
+                     - block_comment.start_col
+                     + block_comment.end_col
+                     - 1  # Account for space
+                 )
+                 if adjusted_end_line == block_comment.end_line
+                 # If on different lines, the start of the token itself is.
+                 else adjusted_end_col
+             )
+             debug_verbose_print(
+                 f"Block Comment Adjusting token {tok.string!r} to start=({adjusted_start_line}, {adjusted_start_col}) "
+                 f"end=({adjusted_end_line}, {adjusted_end_col}) due to block comment"
+             )
+         debug_verbose_print(
+             f"Yielding token {tok.string!r} at adjusted start=({adjusted_start_line}, {adjusted_start_col}) "
+             f"end=({adjusted_end_line}, {adjusted_end_col})"
+         )
+         yield TokenInfo(
+             type=_regularize_token_type(tok.type),
+             string=tok.string,
+             start=(adjusted_start_line, adjusted_start_col),
+             end=(adjusted_end_line, adjusted_end_col),
+             line=tok.line,
+         )
+     for block_comment in unconsumed_block_comment:
+         # Yield remaining unconsumed block comments at the end.
+         if block_comment not in block_comment_already_output:
+             debug_verbose_print(
+                 f"Yielding remaining block comment at end: start=({block_comment.start_line}, {block_comment.start_col}) "
+                 f"end=({block_comment.end_line}, {block_comment.end_col})"
+             )
+             yield TokenInfo(
+                 type=tokenize.COMMENT,
+                 string=block_comment.comment,
+                 start=(block_comment.start_line, block_comment.start_col),
+                 end=(block_comment.end_line, block_comment.end_col),
+                 line=block_comment.lines,
+             )
+
+
+ def token_stream_factory(readline: Callable[[], str]) -> Iterator[TokenInfo]:
+     line_parser = _LineParser(readline)
+
+     yield from _generate_and_postprocess_tokens(
+         line_parser.parse_next_line,
+         line_parser.outermost_block_comments,
+         line_parser.line_head_spaces,
+     )
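
For readers evaluating the tokenizer rewrite, the sketch below shows how the public entry point might be exercised. It is illustrative only: it assumes the 0.1.4 wheel is installed, that token_stream_factory is importable from Typhon.Grammar.token_factory_custom (an import path inferred from file 12 in the list above, not confirmed by the diff), and that #( ... )# is the block-comment syntax this module implements; the sample source is hypothetical.

import io
import tokenize as tk

# Hypothetical import path, inferred from the "Files changed" list above.
from Typhon.Grammar.token_factory_custom import token_stream_factory

# A block comment spanning two physical lines. Per the diff, the line parser
# replaces the whole comment with a single space in the code handed to
# tokenize, then re-emits it as one COMMENT token at its original coordinates.
source = "x = 1 #( spans\ntwo lines )# + 2\ny = 3\n"

for tok in token_stream_factory(io.StringIO(source).readline):
    print(tk.tok_name[tok.type], repr(tok.string), tok.start, tok.end)

Compared with 0.1.2, the 0.1.4 stream should also survive an unterminated multiline construct (generate_tokens_ignore_error swallows the trailing tokenize.TokenError and yields a synthetic ENDMARKER) and adds the stripped line-head whitespace back onto token positions via the new line_head_spaces bookkeeping, so downstream consumers see coordinates in the untransformed source.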