just-bash 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- just_bash/__init__.py +55 -0
- just_bash/ast/__init__.py +213 -0
- just_bash/ast/factory.py +320 -0
- just_bash/ast/types.py +953 -0
- just_bash/bash.py +220 -0
- just_bash/commands/__init__.py +23 -0
- just_bash/commands/argv/__init__.py +5 -0
- just_bash/commands/argv/argv.py +21 -0
- just_bash/commands/awk/__init__.py +5 -0
- just_bash/commands/awk/awk.py +1168 -0
- just_bash/commands/base64/__init__.py +5 -0
- just_bash/commands/base64/base64.py +138 -0
- just_bash/commands/basename/__init__.py +5 -0
- just_bash/commands/basename/basename.py +72 -0
- just_bash/commands/bash/__init__.py +5 -0
- just_bash/commands/bash/bash.py +188 -0
- just_bash/commands/cat/__init__.py +5 -0
- just_bash/commands/cat/cat.py +173 -0
- just_bash/commands/checksum/__init__.py +5 -0
- just_bash/commands/checksum/checksum.py +179 -0
- just_bash/commands/chmod/__init__.py +5 -0
- just_bash/commands/chmod/chmod.py +216 -0
- just_bash/commands/column/__init__.py +5 -0
- just_bash/commands/column/column.py +180 -0
- just_bash/commands/comm/__init__.py +5 -0
- just_bash/commands/comm/comm.py +150 -0
- just_bash/commands/compression/__init__.py +5 -0
- just_bash/commands/compression/compression.py +298 -0
- just_bash/commands/cp/__init__.py +5 -0
- just_bash/commands/cp/cp.py +149 -0
- just_bash/commands/curl/__init__.py +5 -0
- just_bash/commands/curl/curl.py +801 -0
- just_bash/commands/cut/__init__.py +5 -0
- just_bash/commands/cut/cut.py +327 -0
- just_bash/commands/date/__init__.py +5 -0
- just_bash/commands/date/date.py +258 -0
- just_bash/commands/diff/__init__.py +5 -0
- just_bash/commands/diff/diff.py +118 -0
- just_bash/commands/dirname/__init__.py +5 -0
- just_bash/commands/dirname/dirname.py +56 -0
- just_bash/commands/du/__init__.py +5 -0
- just_bash/commands/du/du.py +150 -0
- just_bash/commands/echo/__init__.py +5 -0
- just_bash/commands/echo/echo.py +125 -0
- just_bash/commands/env/__init__.py +5 -0
- just_bash/commands/env/env.py +163 -0
- just_bash/commands/expand/__init__.py +5 -0
- just_bash/commands/expand/expand.py +299 -0
- just_bash/commands/expr/__init__.py +5 -0
- just_bash/commands/expr/expr.py +273 -0
- just_bash/commands/file/__init__.py +5 -0
- just_bash/commands/file/file.py +274 -0
- just_bash/commands/find/__init__.py +5 -0
- just_bash/commands/find/find.py +623 -0
- just_bash/commands/fold/__init__.py +5 -0
- just_bash/commands/fold/fold.py +160 -0
- just_bash/commands/grep/__init__.py +5 -0
- just_bash/commands/grep/grep.py +418 -0
- just_bash/commands/head/__init__.py +5 -0
- just_bash/commands/head/head.py +167 -0
- just_bash/commands/help/__init__.py +5 -0
- just_bash/commands/help/help.py +67 -0
- just_bash/commands/hostname/__init__.py +5 -0
- just_bash/commands/hostname/hostname.py +21 -0
- just_bash/commands/html_to_markdown/__init__.py +5 -0
- just_bash/commands/html_to_markdown/html_to_markdown.py +191 -0
- just_bash/commands/join/__init__.py +5 -0
- just_bash/commands/join/join.py +252 -0
- just_bash/commands/jq/__init__.py +5 -0
- just_bash/commands/jq/jq.py +280 -0
- just_bash/commands/ln/__init__.py +5 -0
- just_bash/commands/ln/ln.py +127 -0
- just_bash/commands/ls/__init__.py +5 -0
- just_bash/commands/ls/ls.py +280 -0
- just_bash/commands/mkdir/__init__.py +5 -0
- just_bash/commands/mkdir/mkdir.py +92 -0
- just_bash/commands/mv/__init__.py +5 -0
- just_bash/commands/mv/mv.py +142 -0
- just_bash/commands/nl/__init__.py +5 -0
- just_bash/commands/nl/nl.py +180 -0
- just_bash/commands/od/__init__.py +5 -0
- just_bash/commands/od/od.py +157 -0
- just_bash/commands/paste/__init__.py +5 -0
- just_bash/commands/paste/paste.py +100 -0
- just_bash/commands/printf/__init__.py +5 -0
- just_bash/commands/printf/printf.py +157 -0
- just_bash/commands/pwd/__init__.py +5 -0
- just_bash/commands/pwd/pwd.py +23 -0
- just_bash/commands/read/__init__.py +5 -0
- just_bash/commands/read/read.py +185 -0
- just_bash/commands/readlink/__init__.py +5 -0
- just_bash/commands/readlink/readlink.py +86 -0
- just_bash/commands/registry.py +844 -0
- just_bash/commands/rev/__init__.py +5 -0
- just_bash/commands/rev/rev.py +74 -0
- just_bash/commands/rg/__init__.py +5 -0
- just_bash/commands/rg/rg.py +1048 -0
- just_bash/commands/rm/__init__.py +5 -0
- just_bash/commands/rm/rm.py +106 -0
- just_bash/commands/search_engine/__init__.py +13 -0
- just_bash/commands/search_engine/matcher.py +170 -0
- just_bash/commands/search_engine/regex.py +159 -0
- just_bash/commands/sed/__init__.py +5 -0
- just_bash/commands/sed/sed.py +863 -0
- just_bash/commands/seq/__init__.py +5 -0
- just_bash/commands/seq/seq.py +190 -0
- just_bash/commands/shell/__init__.py +5 -0
- just_bash/commands/shell/shell.py +206 -0
- just_bash/commands/sleep/__init__.py +5 -0
- just_bash/commands/sleep/sleep.py +62 -0
- just_bash/commands/sort/__init__.py +5 -0
- just_bash/commands/sort/sort.py +411 -0
- just_bash/commands/split/__init__.py +5 -0
- just_bash/commands/split/split.py +237 -0
- just_bash/commands/sqlite3/__init__.py +5 -0
- just_bash/commands/sqlite3/sqlite3_cmd.py +505 -0
- just_bash/commands/stat/__init__.py +5 -0
- just_bash/commands/stat/stat.py +150 -0
- just_bash/commands/strings/__init__.py +5 -0
- just_bash/commands/strings/strings.py +150 -0
- just_bash/commands/tac/__init__.py +5 -0
- just_bash/commands/tac/tac.py +158 -0
- just_bash/commands/tail/__init__.py +5 -0
- just_bash/commands/tail/tail.py +180 -0
- just_bash/commands/tar/__init__.py +5 -0
- just_bash/commands/tar/tar.py +1067 -0
- just_bash/commands/tee/__init__.py +5 -0
- just_bash/commands/tee/tee.py +63 -0
- just_bash/commands/timeout/__init__.py +5 -0
- just_bash/commands/timeout/timeout.py +188 -0
- just_bash/commands/touch/__init__.py +5 -0
- just_bash/commands/touch/touch.py +91 -0
- just_bash/commands/tr/__init__.py +5 -0
- just_bash/commands/tr/tr.py +297 -0
- just_bash/commands/tree/__init__.py +5 -0
- just_bash/commands/tree/tree.py +139 -0
- just_bash/commands/true/__init__.py +5 -0
- just_bash/commands/true/true.py +32 -0
- just_bash/commands/uniq/__init__.py +5 -0
- just_bash/commands/uniq/uniq.py +323 -0
- just_bash/commands/wc/__init__.py +5 -0
- just_bash/commands/wc/wc.py +169 -0
- just_bash/commands/which/__init__.py +5 -0
- just_bash/commands/which/which.py +52 -0
- just_bash/commands/xan/__init__.py +5 -0
- just_bash/commands/xan/xan.py +1663 -0
- just_bash/commands/xargs/__init__.py +5 -0
- just_bash/commands/xargs/xargs.py +136 -0
- just_bash/commands/yq/__init__.py +5 -0
- just_bash/commands/yq/yq.py +848 -0
- just_bash/fs/__init__.py +29 -0
- just_bash/fs/in_memory_fs.py +621 -0
- just_bash/fs/mountable_fs.py +504 -0
- just_bash/fs/overlay_fs.py +894 -0
- just_bash/fs/read_write_fs.py +455 -0
- just_bash/interpreter/__init__.py +37 -0
- just_bash/interpreter/builtins/__init__.py +92 -0
- just_bash/interpreter/builtins/alias.py +154 -0
- just_bash/interpreter/builtins/cd.py +76 -0
- just_bash/interpreter/builtins/control.py +127 -0
- just_bash/interpreter/builtins/declare.py +336 -0
- just_bash/interpreter/builtins/export.py +56 -0
- just_bash/interpreter/builtins/let.py +44 -0
- just_bash/interpreter/builtins/local.py +57 -0
- just_bash/interpreter/builtins/mapfile.py +152 -0
- just_bash/interpreter/builtins/misc.py +378 -0
- just_bash/interpreter/builtins/readonly.py +80 -0
- just_bash/interpreter/builtins/set.py +234 -0
- just_bash/interpreter/builtins/shopt.py +201 -0
- just_bash/interpreter/builtins/source.py +136 -0
- just_bash/interpreter/builtins/test.py +290 -0
- just_bash/interpreter/builtins/unset.py +53 -0
- just_bash/interpreter/conditionals.py +387 -0
- just_bash/interpreter/control_flow.py +381 -0
- just_bash/interpreter/errors.py +116 -0
- just_bash/interpreter/expansion.py +1156 -0
- just_bash/interpreter/interpreter.py +813 -0
- just_bash/interpreter/types.py +134 -0
- just_bash/network/__init__.py +1 -0
- just_bash/parser/__init__.py +39 -0
- just_bash/parser/lexer.py +948 -0
- just_bash/parser/parser.py +2162 -0
- just_bash/py.typed +0 -0
- just_bash/query_engine/__init__.py +83 -0
- just_bash/query_engine/builtins/__init__.py +1283 -0
- just_bash/query_engine/evaluator.py +578 -0
- just_bash/query_engine/parser.py +525 -0
- just_bash/query_engine/tokenizer.py +329 -0
- just_bash/query_engine/types.py +373 -0
- just_bash/types.py +180 -0
- just_bash-0.1.5.dist-info/METADATA +410 -0
- just_bash-0.1.5.dist-info/RECORD +193 -0
- just_bash-0.1.5.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,948 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Lexer for Bash Scripts
|
|
3
|
+
|
|
4
|
+
The lexer tokenizes input into a stream of tokens that the parser consumes.
|
|
5
|
+
It handles:
|
|
6
|
+
- Operators and delimiters
|
|
7
|
+
- Words (with quoting rules)
|
|
8
|
+
- Comments
|
|
9
|
+
- Here-documents
|
|
10
|
+
- Escape sequences
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import re
|
|
16
|
+
from dataclasses import dataclass, field
|
|
17
|
+
from enum import Enum, auto
|
|
18
|
+
from typing import Optional
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class TokenType(Enum):
    """Token types for bash lexer.

    Values are assigned with auto(); only identity/equality of members is
    used, never the numeric value, but member order is kept stable anyway.
    """

    # End of input (always the final token emitted by Lexer.tokenize)
    EOF = auto()

    # Newlines and separators
    NEWLINE = auto()
    SEMICOLON = auto()
    AMP = auto()  # & (background / async list)

    # Operators
    PIPE = auto()  # |
    PIPE_AMP = auto()  # |&  (pipe stdout+stderr)
    AND_AND = auto()  # &&
    OR_OR = auto()  # ||
    BANG = auto()  # !  (pipeline negation)

    # Redirections
    LESS = auto()  # <
    GREAT = auto()  # >
    DLESS = auto()  # <<  (here-document)
    DGREAT = auto()  # >>
    LESSAND = auto()  # <&
    GREATAND = auto()  # >&
    LESSGREAT = auto()  # <>
    DLESSDASH = auto()  # <<- (here-document, leading tabs stripped)
    CLOBBER = auto()  # >|
    TLESS = auto()  # <<< (here-string)
    AND_GREAT = auto()  # &>
    AND_DGREAT = auto()  # &>>

    # Grouping
    LPAREN = auto()  # (
    RPAREN = auto()  # )
    LBRACE = auto()  # {
    RBRACE = auto()  # }

    # Special (case-statement clause terminators)
    DSEMI = auto()  # ;;
    SEMI_AND = auto()  # ;&
    SEMI_SEMI_AND = auto()  # ;;&

    # Compound commands
    DBRACK_START = auto()  # [[
    DBRACK_END = auto()  # ]]
    DPAREN_START = auto()  # ((
    DPAREN_END = auto()  # ))

    # Reserved words (see RESERVED_WORDS for the keyword -> type mapping)
    IF = auto()
    THEN = auto()
    ELSE = auto()
    ELIF = auto()
    FI = auto()
    FOR = auto()
    WHILE = auto()
    UNTIL = auto()
    DO = auto()
    DONE = auto()
    CASE = auto()
    ESAC = auto()
    IN = auto()
    FUNCTION = auto()
    SELECT = auto()
    TIME = auto()
    COPROC = auto()

    # Words and identifiers
    WORD = auto()
    NAME = auto()  # Valid variable name
    NUMBER = auto()  # For redirections like 2>&1
    ASSIGNMENT_WORD = auto()  # VAR=value

    # Comments
    COMMENT = auto()

    # Here-document content
    HEREDOC_CONTENT = auto()
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# Reserved words mapping: each bash keyword maps to the TokenType member
# whose name is its uppercase form (e.g. "while" -> TokenType.WHILE).
RESERVED_WORDS: dict[str, TokenType] = {
    keyword: TokenType[keyword.upper()]
    for keyword in (
        "if",
        "then",
        "else",
        "elif",
        "fi",
        "for",
        "while",
        "until",
        "do",
        "done",
        "case",
        "esac",
        "in",
        "function",
        "select",
        "time",
        "coproc",
    )
}
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@dataclass
class Token:
    """A lexical token produced by Lexer."""

    type: TokenType  # token classification (operator, word, keyword, ...)
    value: str  # token text as recorded by the lexer
    start: int  # absolute start offset into the source string
    end: int  # absolute end offset (exclusive)
    line: int  # 1-based line number of the first character
    column: int  # 1-based column number of the first character
    quoted: bool = False  # True when any part of the word was quoted
    single_quoted: bool = False  # True when the word began with a '...' quote
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@dataclass
class HeredocInfo:
    """Information about a pending here-document.

    Created when the lexer sees << or <<-; consumed after the next
    NEWLINE token when the heredoc body is read.
    """

    delimiter: str  # terminator word, with any surrounding quotes removed
    strip_tabs: bool = False  # True for <<- (leading tabs stripped from body)
    quoted: bool = False  # True if the delimiter was quoted (or backslash-escaped)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# Regular expressions for validation
# Full valid shell identifier (letters, digits, underscore; no leading digit).
NAME_PATTERN = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$")
# Pure decimal digits — used for fd numbers in redirections like 2>&1.
NUMBER_PATTERN = re.compile(r"^[0-9]+$")
# Identifier prefix only (no end anchor): used to split "name" from a
# trailing subscript/operator in assignment LHS validation.
ASSIGNMENT_LHS_PATTERN = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*")
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def is_valid_name(s: str) -> bool:
    """Check if string is a valid variable name.

    Uses fullmatch rather than match: with match(), the "$" anchor in
    NAME_PATTERN also matches just before a single trailing newline, so
    e.g. "foo\n" would incorrectly be accepted as a valid name.
    """
    return NAME_PATTERN.fullmatch(s) is not None
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def is_valid_assignment_lhs(s: str) -> bool:
    """Check if a string is a valid assignment LHS with optional nested
    array subscript.

    Handles: VAR, a[0], a[x], a[a[0]], a[x+1], etc.  A trailing "+" is
    accepted in all forms (the remainder of a "+=" operator).
    """
    m = ASSIGNMENT_LHS_PATTERN.match(s)
    if m is None:
        return False

    rest = s[m.end():]

    # Bare variable name, optionally followed by "+" (from "+=").
    if rest in ("", "+"):
        return True

    # Anything other than a subscript after the name is invalid.
    if not rest.startswith("["):
        return False

    # Locate the "]" that balances the opening "[" (subscripts may nest).
    depth = 0
    close = -1
    for idx, ch in enumerate(rest):
        if ch == "[":
            depth += 1
        elif ch == "]":
            depth -= 1
            if depth == 0:
                close = idx
                break

    # No balancing close bracket found.
    if close == -1:
        return False

    # Only an optional "+" may follow the closing bracket.
    return rest[close + 1:] in ("", "+")
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
# Three-character operators, matched before two- and one-character ones.
# (<<- is handled separately in _next_token because it registers a heredoc.)
THREE_CHAR_OPS: list[tuple[str, TokenType]] = [
    (";;&", TokenType.SEMI_SEMI_AND),
    ("<<<", TokenType.TLESS),
    ("&>>", TokenType.AND_DGREAT),
]

# Two-character operators, matched after three-character ones.
# (<< is handled separately in _next_token because it registers a heredoc.)
TWO_CHAR_OPS: list[tuple[str, TokenType]] = [
    ("[[", TokenType.DBRACK_START),
    ("]]", TokenType.DBRACK_END),
    ("((", TokenType.DPAREN_START),
    ("))", TokenType.DPAREN_END),
    ("&&", TokenType.AND_AND),
    ("||", TokenType.OR_OR),
    (";;", TokenType.DSEMI),
    (";&", TokenType.SEMI_AND),
    ("|&", TokenType.PIPE_AMP),
    (">>", TokenType.DGREAT),
    ("<&", TokenType.LESSAND),
    (">&", TokenType.GREATAND),
    ("<>", TokenType.LESSGREAT),
    (">|", TokenType.CLOBBER),
    ("&>", TokenType.AND_GREAT),
]

# Single-character operators. Note: "{", "}", and "!" are NOT listed here;
# they need context-sensitive handling in _next_token.
SINGLE_CHAR_OPS: dict[str, TokenType] = {
    "|": TokenType.PIPE,
    "&": TokenType.AMP,
    ";": TokenType.SEMICOLON,
    "(": TokenType.LPAREN,
    ")": TokenType.RPAREN,
    "<": TokenType.LESS,
    ">": TokenType.GREAT,
}

# Word boundary characters: any of these ends an unquoted word.
WORD_BREAK_CHARS = frozenset(" \t\n;|&()<>")
# Characters that force _read_word onto its slow path (quoting, escapes,
# expansions, globs, brace/tilde expansion).
SPECIAL_CHARS = frozenset("'\"\\$`{}~*?[")
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
class Lexer:
|
|
237
|
+
"""Lexer for bash scripts."""
|
|
238
|
+
|
|
239
|
+
    def __init__(self, input_text: str) -> None:
        """Initialize the lexer over *input_text*."""
        self.input = input_text  # full source text being tokenized
        self.pos = 0  # current absolute offset into self.input
        self.line = 1  # 1-based line number of self.pos
        self.column = 1  # 1-based column number of self.pos
        self.tokens: list[Token] = []  # tokens emitted so far
        self.pending_heredocs: list[HeredocInfo] = []  # heredocs whose bodies follow the next newline
|
|
246
|
+
|
|
247
|
+
def tokenize(self) -> list[Token]:
|
|
248
|
+
"""Tokenize the entire input."""
|
|
249
|
+
input_text = self.input
|
|
250
|
+
input_len = len(input_text)
|
|
251
|
+
|
|
252
|
+
while self.pos < input_len:
|
|
253
|
+
self._skip_whitespace()
|
|
254
|
+
|
|
255
|
+
if self.pos >= input_len:
|
|
256
|
+
break
|
|
257
|
+
|
|
258
|
+
# Check for pending here-documents after newline
|
|
259
|
+
if (
|
|
260
|
+
self.pending_heredocs
|
|
261
|
+
and self.tokens
|
|
262
|
+
and self.tokens[-1].type == TokenType.NEWLINE
|
|
263
|
+
):
|
|
264
|
+
self._read_heredoc_content()
|
|
265
|
+
continue
|
|
266
|
+
|
|
267
|
+
token = self._next_token()
|
|
268
|
+
if token:
|
|
269
|
+
self.tokens.append(token)
|
|
270
|
+
|
|
271
|
+
# Add EOF token
|
|
272
|
+
self.tokens.append(
|
|
273
|
+
Token(
|
|
274
|
+
type=TokenType.EOF,
|
|
275
|
+
value="",
|
|
276
|
+
start=self.pos,
|
|
277
|
+
end=self.pos,
|
|
278
|
+
line=self.line,
|
|
279
|
+
column=self.column,
|
|
280
|
+
)
|
|
281
|
+
)
|
|
282
|
+
|
|
283
|
+
return self.tokens
|
|
284
|
+
|
|
285
|
+
def _skip_whitespace(self) -> None:
|
|
286
|
+
"""Skip whitespace and line continuations."""
|
|
287
|
+
input_text = self.input
|
|
288
|
+
input_len = len(input_text)
|
|
289
|
+
|
|
290
|
+
while self.pos < input_len:
|
|
291
|
+
char = input_text[self.pos]
|
|
292
|
+
if char == " " or char == "\t":
|
|
293
|
+
self.pos += 1
|
|
294
|
+
self.column += 1
|
|
295
|
+
elif (
|
|
296
|
+
char == "\\"
|
|
297
|
+
and self.pos + 1 < input_len
|
|
298
|
+
and input_text[self.pos + 1] == "\n"
|
|
299
|
+
):
|
|
300
|
+
# Line continuation
|
|
301
|
+
self.pos += 2
|
|
302
|
+
self.line += 1
|
|
303
|
+
self.column = 1
|
|
304
|
+
else:
|
|
305
|
+
break
|
|
306
|
+
|
|
307
|
+
    def _next_token(self) -> Optional[Token]:
        """Get the next token.

        Dispatch order matters and must not be reordered:
        comments, newline, <<- (heredoc), other 3-char operators,
        << (heredoc), other 2-char operators, 1-char operators,
        then context-sensitive "{", "}", "!", and finally words.
        Returns None only at end of input.
        """
        input_text = self.input
        pos = self.pos
        start_line = self.line
        start_column = self.column

        if pos >= len(input_text):
            return None

        # One- to three-character lookahead ("" past end of input).
        c0 = input_text[pos]
        c1 = input_text[pos + 1] if pos + 1 < len(input_text) else ""
        c2 = input_text[pos + 2] if pos + 2 < len(input_text) else ""

        # Comments
        if c0 == "#":
            return self._read_comment(pos, start_line, start_column)

        # Newline
        if c0 == "\n":
            self.pos = pos + 1
            self.line += 1
            self.column = 1
            return Token(
                type=TokenType.NEWLINE,
                value="\n",
                start=pos,
                end=pos + 1,
                line=start_line,
                column=start_column,
            )

        # Three-character operators
        # Special case: <<- (heredoc with tab stripping); must be checked
        # before the generic tables so "<<" doesn't match first, and it
        # additionally registers the pending heredoc via lookahead.
        if c0 == "<" and c1 == "<" and c2 == "-":
            self.pos = pos + 3
            self.column = start_column + 3
            self._register_heredoc_from_lookahead(strip_tabs=True)
            return self._make_token(
                TokenType.DLESSDASH, "<<-", pos, start_line, start_column
            )

        # Check other three-char operators
        three_chars = c0 + c1 + c2
        for op, token_type in THREE_CHAR_OPS:
            if three_chars == op:
                self.pos = pos + 3
                self.column = start_column + 3
                return self._make_token(token_type, op, pos, start_line, start_column)

        # Two-character operators
        # Special case: << (heredoc) — also registers the pending heredoc.
        if c0 == "<" and c1 == "<":
            self.pos = pos + 2
            self.column = start_column + 2
            self._register_heredoc_from_lookahead(strip_tabs=False)
            return self._make_token(TokenType.DLESS, "<<", pos, start_line, start_column)

        # Check other two-char operators
        two_chars = c0 + c1
        for op, token_type in TWO_CHAR_OPS:
            if two_chars == op:
                self.pos = pos + 2
                self.column = start_column + 2
                return self._make_token(token_type, op, pos, start_line, start_column)

        # Single-character operators
        if c0 in SINGLE_CHAR_OPS:
            self.pos = pos + 1
            self.column = start_column + 1
            return self._make_token(
                SINGLE_CHAR_OPS[c0], c0, pos, start_line, start_column
            )

        # Special handling for { and }
        if c0 == "{":
            # Check for {} as a word (used in find -exec)
            if c1 == "}":
                self.pos = pos + 2
                self.column = start_column + 2
                return Token(
                    type=TokenType.WORD,
                    value="{}",
                    start=pos,
                    end=pos + 2,
                    line=start_line,
                    column=start_column,
                    quoted=False,
                    single_quoted=False,
                )
            # In bash, { must be followed by whitespace to be a group start;
            # otherwise it begins an ordinary word (e.g. brace expansion).
            if c1 and c1 not in " \t\n":
                return self._read_word(pos, start_line, start_column)
            self.pos = pos + 1
            self.column = start_column + 1
            return self._make_token(TokenType.LBRACE, "{", pos, start_line, start_column)

        if c0 == "}":
            self.pos = pos + 1
            self.column = start_column + 1
            return self._make_token(TokenType.RBRACE, "}", pos, start_line, start_column)

        if c0 == "!":
            # Check for != operator (used in [[ ]] tests); emitted as a WORD
            # so the conditional parser sees it as a binary operator.
            if c1 == "=":
                self.pos = pos + 2
                self.column = start_column + 2
                return self._make_token(TokenType.WORD, "!=", pos, start_line, start_column)
            self.pos = pos + 1
            self.column = start_column + 1
            return self._make_token(TokenType.BANG, "!", pos, start_line, start_column)

        # Words
        return self._read_word(pos, start_line, start_column)
|
|
421
|
+
|
|
422
|
+
def _make_token(
|
|
423
|
+
self, type_: TokenType, value: str, start: int, line: int, column: int
|
|
424
|
+
) -> Token:
|
|
425
|
+
"""Create a token."""
|
|
426
|
+
return Token(
|
|
427
|
+
type=type_,
|
|
428
|
+
value=value,
|
|
429
|
+
start=start,
|
|
430
|
+
end=self.pos,
|
|
431
|
+
line=line,
|
|
432
|
+
column=column,
|
|
433
|
+
)
|
|
434
|
+
|
|
435
|
+
def _read_comment(self, start: int, line: int, column: int) -> Token:
|
|
436
|
+
"""Read a comment token."""
|
|
437
|
+
input_text = self.input
|
|
438
|
+
input_len = len(input_text)
|
|
439
|
+
pos = self.pos
|
|
440
|
+
|
|
441
|
+
# Find end of comment (newline or EOF)
|
|
442
|
+
while pos < input_len and input_text[pos] != "\n":
|
|
443
|
+
pos += 1
|
|
444
|
+
|
|
445
|
+
value = input_text[start:pos]
|
|
446
|
+
self.pos = pos
|
|
447
|
+
self.column = column + (pos - start)
|
|
448
|
+
|
|
449
|
+
return Token(
|
|
450
|
+
type=TokenType.COMMENT,
|
|
451
|
+
value=value,
|
|
452
|
+
start=start,
|
|
453
|
+
end=pos,
|
|
454
|
+
line=line,
|
|
455
|
+
column=column,
|
|
456
|
+
)
|
|
457
|
+
|
|
458
|
+
    def _read_word(self, start: int, line: int, column: int) -> Optional[Token]:
        """Read a word token (with possible quotes, escapes, expansions).

        Two-phase scan: a fast path handles plain words made only of
        non-special characters; everything else (quotes, backslashes,
        $-expansions, backticks, globs) falls through to the slow-path
        state machine.  Quote characters that *start* the word are
        stripped from the value and recorded via the quoted/single_quoted
        flags; interior quote characters are kept verbatim for the parser.
        Returns None for an empty, unquoted result.
        """
        input_text = self.input
        input_len = len(input_text)
        pos = self.pos

        # Fast path: scan for simple word (no quotes, escapes, or expansions)
        fast_start = pos
        while pos < input_len:
            c = input_text[pos]
            if c in WORD_BREAK_CHARS or c in SPECIAL_CHARS:
                break
            pos += 1

        # If we consumed characters and hit a simple delimiter
        # (a special char here means the word continues on the slow path).
        if pos > fast_start:
            c = input_text[pos] if pos < input_len else ""
            if c == "" or c in WORD_BREAK_CHARS:
                value = input_text[fast_start:pos]
                self.pos = pos
                self.column = column + (pos - fast_start)

                # Check for reserved words
                if value in RESERVED_WORDS:
                    return Token(
                        type=RESERVED_WORDS[value],
                        value=value,
                        start=start,
                        end=pos,
                        line=line,
                        column=column,
                    )

                # Check for assignment (NAME=... or NAME[subscript]=...)
                eq_idx = value.find("=")
                if eq_idx > 0 and is_valid_assignment_lhs(value[:eq_idx]):
                    return Token(
                        type=TokenType.ASSIGNMENT_WORD,
                        value=value,
                        start=start,
                        end=pos,
                        line=line,
                        column=column,
                    )

                # Check for number (candidate fd for redirections like 2>&1)
                if NUMBER_PATTERN.match(value):
                    return Token(
                        type=TokenType.NUMBER,
                        value=value,
                        start=start,
                        end=pos,
                        line=line,
                        column=column,
                    )

                # Check for valid name
                if NAME_PATTERN.match(value):
                    return Token(
                        type=TokenType.NAME,
                        value=value,
                        start=start,
                        end=pos,
                        line=line,
                        column=column,
                        quoted=False,
                        single_quoted=False,
                    )

                return Token(
                    type=TokenType.WORD,
                    value=value,
                    start=start,
                    end=pos,
                    line=line,
                    column=column,
                    quoted=False,
                    single_quoted=False,
                )

        # Slow path: handle complex words with quotes, escapes, expansions
        pos = self.pos  # Reset position
        col = self.column
        ln = self.line

        value = ""
        quoted = False
        single_quoted = False
        in_single_quote = False
        in_double_quote = False
        # Words that open with a quote get the quote chars stripped below.
        starts_with_quote = input_text[pos] in "\"'" if pos < input_len else False

        while pos < input_len:
            char = input_text[pos]

            # Check for word boundaries (only outside any quotes)
            if not in_single_quote and not in_double_quote:
                if char in WORD_BREAK_CHARS:
                    break

            # Handle $'' ANSI-C quoting: kept verbatim (including the $'
            # wrapper) so expansion can decode the escapes later.
            if (
                char == "$"
                and pos + 1 < input_len
                and input_text[pos + 1] == "'"
                and not in_single_quote
                and not in_double_quote
            ):
                value += "$'"
                pos += 2
                col += 2
                # Read until closing quote, handling escape sequences
                while pos < input_len and input_text[pos] != "'":
                    if input_text[pos] == "\\" and pos + 1 < input_len:
                        value += input_text[pos : pos + 2]
                        pos += 2
                        col += 2
                    else:
                        value += input_text[pos]
                        pos += 1
                        col += 1
                if pos < input_len:
                    value += "'"
                    pos += 1
                    col += 1
                continue

            # Handle $"..." locale quoting: the $" wrapper is dropped and
            # the rest is treated like an ordinary double-quoted region.
            if (
                char == "$"
                and pos + 1 < input_len
                and input_text[pos + 1] == '"'
                and not in_single_quote
                and not in_double_quote
            ):
                pos += 1
                col += 1
                in_double_quote = True
                quoted = True
                if value == "":
                    starts_with_quote = True
                pos += 1
                col += 1
                continue

            # Handle quotes: opening/closing quotes of a word that started
            # quoted are stripped; interior quotes are kept in the value.
            if char == "'" and not in_double_quote:
                if in_single_quote:
                    in_single_quote = False
                    if not starts_with_quote:
                        value += char
                else:
                    in_single_quote = True
                    if starts_with_quote:
                        single_quoted = True
                        quoted = True
                    else:
                        value += char
                pos += 1
                col += 1
                continue

            if char == '"' and not in_single_quote:
                if in_double_quote:
                    in_double_quote = False
                    if not starts_with_quote:
                        value += char
                else:
                    in_double_quote = True
                    if starts_with_quote:
                        quoted = True
                    else:
                        value += char
                pos += 1
                col += 1
                continue

            # Handle escapes (backslash is literal inside single quotes)
            if char == "\\" and not in_single_quote and pos + 1 < input_len:
                next_char = input_text[pos + 1]
                if next_char == "\n":
                    # Line continuation
                    pos += 2
                    ln += 1
                    col = 1
                    continue
                if in_double_quote:
                    # In double quotes, only certain escapes are special;
                    # \$ and \` keep the backslash so expansion later knows
                    # they were escaped.  Any other \X falls through and is
                    # emitted literally by the regular-character case.
                    if next_char in "\"\\$`\n":
                        if next_char in "$`":
                            value += char + next_char
                        else:
                            value += next_char
                        pos += 2
                        col += 2
                        continue
                else:
                    # Outside quotes, backslash escapes next character;
                    # escaped quote chars keep the backslash marker.
                    if next_char in "\"'":
                        value += char + next_char
                    else:
                        value += next_char
                    pos += 2
                    col += 2
                    continue

            # Handle $(...) command substitution: copied verbatim with
            # paren-depth tracking that is aware of quotes and escapes
            # inside the substitution.
            if char == "$" and pos + 1 < input_len and input_text[pos + 1] == "(":
                value += char
                pos += 1
                col += 1
                value += input_text[pos]  # Add the (
                pos += 1
                col += 1

                # Track parenthesis depth
                depth = 1
                cmd_in_single_quote = False
                cmd_in_double_quote = False

                while depth > 0 and pos < input_len:
                    c = input_text[pos]
                    value += c

                    if cmd_in_single_quote:
                        if c == "'":
                            cmd_in_single_quote = False
                    elif cmd_in_double_quote:
                        if c == "\\" and pos + 1 < input_len:
                            value += input_text[pos + 1]
                            pos += 1
                            col += 1
                        elif c == '"':
                            cmd_in_double_quote = False
                    else:
                        if c == "'":
                            cmd_in_single_quote = True
                        elif c == '"':
                            cmd_in_double_quote = True
                        elif c == "\\" and pos + 1 < input_len:
                            value += input_text[pos + 1]
                            pos += 1
                            col += 1
                        elif c == "(":
                            depth += 1
                        elif c == ")":
                            depth -= 1

                    pos += 1
                    col += 1
                continue

            # Handle ${...} parameter expansion (copied verbatim, with
            # brace-depth tracking for nested expansions)
            if char == "$" and pos + 1 < input_len and input_text[pos + 1] == "{":
                value += char
                pos += 1
                col += 1
                value += input_text[pos]  # Add the {
                pos += 1
                col += 1

                # Track brace depth
                depth = 1
                while depth > 0 and pos < input_len:
                    c = input_text[pos]
                    value += c

                    if c == "{":
                        depth += 1
                    elif c == "}":
                        depth -= 1

                    pos += 1
                    col += 1
                continue

            # Handle simple $VAR expansion (copied verbatim)
            if char == "$":
                value += char
                pos += 1
                col += 1
                # Read variable name
                while pos < input_len and (
                    input_text[pos].isalnum() or input_text[pos] == "_"
                ):
                    value += input_text[pos]
                    pos += 1
                    col += 1
                continue

            # Handle backtick command substitution (copied verbatim,
            # backslash-escapes preserved)
            if char == "`":
                value += char
                pos += 1
                col += 1
                # Read until closing backtick
                while pos < input_len and input_text[pos] != "`":
                    if input_text[pos] == "\\" and pos + 1 < input_len:
                        value += input_text[pos : pos + 2]
                        pos += 2
                        col += 2
                    else:
                        value += input_text[pos]
                        pos += 1
                        col += 1
                if pos < input_len:
                    value += "`"
                    pos += 1
                    col += 1
                continue

            # Handle brace expansion and glob patterns (kept verbatim for
            # later expansion stages)
            if char in "{}*?[~":
                value += char
                pos += 1
                col += 1
                continue

            # Regular character
            value += char
            pos += 1
            col += 1

        self.pos = pos
        self.column = col
        self.line = ln

        # Determine token type
        # Note: An empty value is valid if it was quoted (e.g. "" or '')
        if not value and not quoted:
            return None

        # Check for reserved words (only if unquoted)
        if not quoted and value in RESERVED_WORDS:
            return Token(
                type=RESERVED_WORDS[value],
                value=value,
                start=start,
                end=pos,
                line=line,
                column=column,
            )

        # Check for assignment (only if unquoted)
        if not quoted:
            eq_idx = value.find("=")
            if eq_idx > 0 and is_valid_assignment_lhs(value[:eq_idx]):
                return Token(
                    type=TokenType.ASSIGNMENT_WORD,
                    value=value,
                    start=start,
                    end=pos,
                    line=line,
                    column=column,
                )

        return Token(
            type=TokenType.WORD,
            value=value,
            start=start,
            end=pos,
            line=line,
            column=column,
            quoted=quoted,
            single_quoted=single_quoted,
        )
|
|
824
|
+
|
|
825
|
+
def _register_heredoc_from_lookahead(self, strip_tabs: bool) -> None:
    """Look ahead from the current position to capture a heredoc delimiter.

    The delimiter word may be bare, single-quoted, double-quoted, or contain
    backslash escapes; any form of quoting marks the heredoc as quoted (which
    suppresses expansion of its body). On success a HeredocInfo record is
    appended to self.pending_heredocs; self.pos itself is not advanced.
    """
    text = self.input
    length = len(text)
    cursor = self.pos

    # Skip the blanks between the << operator and the delimiter word.
    while cursor < length and text[cursor] in " \t":
        cursor += 1
    if cursor >= length:
        return

    delimiter = ""
    is_quoted = False
    single = text[cursor] == "'"
    double = (not single) and text[cursor] == '"'
    if single or double:
        # Opening quote: the whole delimiter counts as quoted.
        is_quoted = True
        cursor += 1

    while cursor < length:
        ch = text[cursor]
        if single:
            if ch == "'":
                cursor += 1
                break
            delimiter += ch
        elif double:
            if ch == '"':
                cursor += 1
                break
            delimiter += ch
        else:
            # Bare word ends at whitespace or a shell metacharacter.
            if ch in " \t\n;|&<>()":
                break
            if ch == "\\" and cursor + 1 < length:
                # A backslash escape also marks the delimiter as quoted.
                delimiter += text[cursor + 1]
                cursor += 2
                is_quoted = True
                continue
            delimiter += ch
        cursor += 1

    if delimiter:
        self.pending_heredocs.append(
            HeredocInfo(delimiter=delimiter, strip_tabs=strip_tabs, quoted=is_quoted)
        )
|
|
884
|
+
|
|
885
|
+
def _read_heredoc_content(self) -> None:
    """Consume the bodies of all pending here-documents.

    For each registered heredoc (in registration order) lines are collected
    until one matches the delimiter — after leading-tab stripping when the
    heredoc was introduced with <<-. Each body is emitted as a single
    HEREDOC_CONTENT token and the pending list is cleared afterwards.
    """
    if not self.pending_heredocs:
        return

    text = self.input
    length = len(text)

    for info in self.pending_heredocs:
        token_start = self.pos
        token_line = self.line
        token_column = self.column

        body_parts: list[str] = []
        while self.pos < length:
            # Consume one physical line, excluding its trailing newline.
            raw_line = ""
            while self.pos < length and text[self.pos] != "\n":
                raw_line += text[self.pos]
                self.pos += 1
            if self.pos < length:
                # Step over the newline and update position bookkeeping.
                self.pos += 1
                self.line += 1
                self.column = 1

            # <<- heredocs strip leading tabs before the delimiter test
            # and in the stored content.
            candidate = raw_line.lstrip("\t") if info.strip_tabs else raw_line
            if candidate == info.delimiter:
                break
            body_parts.append(candidate)

        self.tokens.append(
            Token(
                type=TokenType.HEREDOC_CONTENT,
                value="".join(part + "\n" for part in body_parts),
                start=token_start,
                end=self.pos,
                line=token_line,
                column=token_column,
            )
        )

    self.pending_heredocs.clear()
|
|
943
|
+
|
|
944
|
+
|
|
945
|
+
def tokenize(input_text: str) -> list[Token]:
    """Tokenize *input_text* and return the full token list.

    Thin convenience wrapper around constructing a Lexer and running it.
    """
    return Lexer(input_text).tokenize()
|