just-bash 0.1.5 (just_bash-0.1.5-py3-none-any.whl)

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. just_bash/__init__.py +55 -0
  2. just_bash/ast/__init__.py +213 -0
  3. just_bash/ast/factory.py +320 -0
  4. just_bash/ast/types.py +953 -0
  5. just_bash/bash.py +220 -0
  6. just_bash/commands/__init__.py +23 -0
  7. just_bash/commands/argv/__init__.py +5 -0
  8. just_bash/commands/argv/argv.py +21 -0
  9. just_bash/commands/awk/__init__.py +5 -0
  10. just_bash/commands/awk/awk.py +1168 -0
  11. just_bash/commands/base64/__init__.py +5 -0
  12. just_bash/commands/base64/base64.py +138 -0
  13. just_bash/commands/basename/__init__.py +5 -0
  14. just_bash/commands/basename/basename.py +72 -0
  15. just_bash/commands/bash/__init__.py +5 -0
  16. just_bash/commands/bash/bash.py +188 -0
  17. just_bash/commands/cat/__init__.py +5 -0
  18. just_bash/commands/cat/cat.py +173 -0
  19. just_bash/commands/checksum/__init__.py +5 -0
  20. just_bash/commands/checksum/checksum.py +179 -0
  21. just_bash/commands/chmod/__init__.py +5 -0
  22. just_bash/commands/chmod/chmod.py +216 -0
  23. just_bash/commands/column/__init__.py +5 -0
  24. just_bash/commands/column/column.py +180 -0
  25. just_bash/commands/comm/__init__.py +5 -0
  26. just_bash/commands/comm/comm.py +150 -0
  27. just_bash/commands/compression/__init__.py +5 -0
  28. just_bash/commands/compression/compression.py +298 -0
  29. just_bash/commands/cp/__init__.py +5 -0
  30. just_bash/commands/cp/cp.py +149 -0
  31. just_bash/commands/curl/__init__.py +5 -0
  32. just_bash/commands/curl/curl.py +801 -0
  33. just_bash/commands/cut/__init__.py +5 -0
  34. just_bash/commands/cut/cut.py +327 -0
  35. just_bash/commands/date/__init__.py +5 -0
  36. just_bash/commands/date/date.py +258 -0
  37. just_bash/commands/diff/__init__.py +5 -0
  38. just_bash/commands/diff/diff.py +118 -0
  39. just_bash/commands/dirname/__init__.py +5 -0
  40. just_bash/commands/dirname/dirname.py +56 -0
  41. just_bash/commands/du/__init__.py +5 -0
  42. just_bash/commands/du/du.py +150 -0
  43. just_bash/commands/echo/__init__.py +5 -0
  44. just_bash/commands/echo/echo.py +125 -0
  45. just_bash/commands/env/__init__.py +5 -0
  46. just_bash/commands/env/env.py +163 -0
  47. just_bash/commands/expand/__init__.py +5 -0
  48. just_bash/commands/expand/expand.py +299 -0
  49. just_bash/commands/expr/__init__.py +5 -0
  50. just_bash/commands/expr/expr.py +273 -0
  51. just_bash/commands/file/__init__.py +5 -0
  52. just_bash/commands/file/file.py +274 -0
  53. just_bash/commands/find/__init__.py +5 -0
  54. just_bash/commands/find/find.py +623 -0
  55. just_bash/commands/fold/__init__.py +5 -0
  56. just_bash/commands/fold/fold.py +160 -0
  57. just_bash/commands/grep/__init__.py +5 -0
  58. just_bash/commands/grep/grep.py +418 -0
  59. just_bash/commands/head/__init__.py +5 -0
  60. just_bash/commands/head/head.py +167 -0
  61. just_bash/commands/help/__init__.py +5 -0
  62. just_bash/commands/help/help.py +67 -0
  63. just_bash/commands/hostname/__init__.py +5 -0
  64. just_bash/commands/hostname/hostname.py +21 -0
  65. just_bash/commands/html_to_markdown/__init__.py +5 -0
  66. just_bash/commands/html_to_markdown/html_to_markdown.py +191 -0
  67. just_bash/commands/join/__init__.py +5 -0
  68. just_bash/commands/join/join.py +252 -0
  69. just_bash/commands/jq/__init__.py +5 -0
  70. just_bash/commands/jq/jq.py +280 -0
  71. just_bash/commands/ln/__init__.py +5 -0
  72. just_bash/commands/ln/ln.py +127 -0
  73. just_bash/commands/ls/__init__.py +5 -0
  74. just_bash/commands/ls/ls.py +280 -0
  75. just_bash/commands/mkdir/__init__.py +5 -0
  76. just_bash/commands/mkdir/mkdir.py +92 -0
  77. just_bash/commands/mv/__init__.py +5 -0
  78. just_bash/commands/mv/mv.py +142 -0
  79. just_bash/commands/nl/__init__.py +5 -0
  80. just_bash/commands/nl/nl.py +180 -0
  81. just_bash/commands/od/__init__.py +5 -0
  82. just_bash/commands/od/od.py +157 -0
  83. just_bash/commands/paste/__init__.py +5 -0
  84. just_bash/commands/paste/paste.py +100 -0
  85. just_bash/commands/printf/__init__.py +5 -0
  86. just_bash/commands/printf/printf.py +157 -0
  87. just_bash/commands/pwd/__init__.py +5 -0
  88. just_bash/commands/pwd/pwd.py +23 -0
  89. just_bash/commands/read/__init__.py +5 -0
  90. just_bash/commands/read/read.py +185 -0
  91. just_bash/commands/readlink/__init__.py +5 -0
  92. just_bash/commands/readlink/readlink.py +86 -0
  93. just_bash/commands/registry.py +844 -0
  94. just_bash/commands/rev/__init__.py +5 -0
  95. just_bash/commands/rev/rev.py +74 -0
  96. just_bash/commands/rg/__init__.py +5 -0
  97. just_bash/commands/rg/rg.py +1048 -0
  98. just_bash/commands/rm/__init__.py +5 -0
  99. just_bash/commands/rm/rm.py +106 -0
  100. just_bash/commands/search_engine/__init__.py +13 -0
  101. just_bash/commands/search_engine/matcher.py +170 -0
  102. just_bash/commands/search_engine/regex.py +159 -0
  103. just_bash/commands/sed/__init__.py +5 -0
  104. just_bash/commands/sed/sed.py +863 -0
  105. just_bash/commands/seq/__init__.py +5 -0
  106. just_bash/commands/seq/seq.py +190 -0
  107. just_bash/commands/shell/__init__.py +5 -0
  108. just_bash/commands/shell/shell.py +206 -0
  109. just_bash/commands/sleep/__init__.py +5 -0
  110. just_bash/commands/sleep/sleep.py +62 -0
  111. just_bash/commands/sort/__init__.py +5 -0
  112. just_bash/commands/sort/sort.py +411 -0
  113. just_bash/commands/split/__init__.py +5 -0
  114. just_bash/commands/split/split.py +237 -0
  115. just_bash/commands/sqlite3/__init__.py +5 -0
  116. just_bash/commands/sqlite3/sqlite3_cmd.py +505 -0
  117. just_bash/commands/stat/__init__.py +5 -0
  118. just_bash/commands/stat/stat.py +150 -0
  119. just_bash/commands/strings/__init__.py +5 -0
  120. just_bash/commands/strings/strings.py +150 -0
  121. just_bash/commands/tac/__init__.py +5 -0
  122. just_bash/commands/tac/tac.py +158 -0
  123. just_bash/commands/tail/__init__.py +5 -0
  124. just_bash/commands/tail/tail.py +180 -0
  125. just_bash/commands/tar/__init__.py +5 -0
  126. just_bash/commands/tar/tar.py +1067 -0
  127. just_bash/commands/tee/__init__.py +5 -0
  128. just_bash/commands/tee/tee.py +63 -0
  129. just_bash/commands/timeout/__init__.py +5 -0
  130. just_bash/commands/timeout/timeout.py +188 -0
  131. just_bash/commands/touch/__init__.py +5 -0
  132. just_bash/commands/touch/touch.py +91 -0
  133. just_bash/commands/tr/__init__.py +5 -0
  134. just_bash/commands/tr/tr.py +297 -0
  135. just_bash/commands/tree/__init__.py +5 -0
  136. just_bash/commands/tree/tree.py +139 -0
  137. just_bash/commands/true/__init__.py +5 -0
  138. just_bash/commands/true/true.py +32 -0
  139. just_bash/commands/uniq/__init__.py +5 -0
  140. just_bash/commands/uniq/uniq.py +323 -0
  141. just_bash/commands/wc/__init__.py +5 -0
  142. just_bash/commands/wc/wc.py +169 -0
  143. just_bash/commands/which/__init__.py +5 -0
  144. just_bash/commands/which/which.py +52 -0
  145. just_bash/commands/xan/__init__.py +5 -0
  146. just_bash/commands/xan/xan.py +1663 -0
  147. just_bash/commands/xargs/__init__.py +5 -0
  148. just_bash/commands/xargs/xargs.py +136 -0
  149. just_bash/commands/yq/__init__.py +5 -0
  150. just_bash/commands/yq/yq.py +848 -0
  151. just_bash/fs/__init__.py +29 -0
  152. just_bash/fs/in_memory_fs.py +621 -0
  153. just_bash/fs/mountable_fs.py +504 -0
  154. just_bash/fs/overlay_fs.py +894 -0
  155. just_bash/fs/read_write_fs.py +455 -0
  156. just_bash/interpreter/__init__.py +37 -0
  157. just_bash/interpreter/builtins/__init__.py +92 -0
  158. just_bash/interpreter/builtins/alias.py +154 -0
  159. just_bash/interpreter/builtins/cd.py +76 -0
  160. just_bash/interpreter/builtins/control.py +127 -0
  161. just_bash/interpreter/builtins/declare.py +336 -0
  162. just_bash/interpreter/builtins/export.py +56 -0
  163. just_bash/interpreter/builtins/let.py +44 -0
  164. just_bash/interpreter/builtins/local.py +57 -0
  165. just_bash/interpreter/builtins/mapfile.py +152 -0
  166. just_bash/interpreter/builtins/misc.py +378 -0
  167. just_bash/interpreter/builtins/readonly.py +80 -0
  168. just_bash/interpreter/builtins/set.py +234 -0
  169. just_bash/interpreter/builtins/shopt.py +201 -0
  170. just_bash/interpreter/builtins/source.py +136 -0
  171. just_bash/interpreter/builtins/test.py +290 -0
  172. just_bash/interpreter/builtins/unset.py +53 -0
  173. just_bash/interpreter/conditionals.py +387 -0
  174. just_bash/interpreter/control_flow.py +381 -0
  175. just_bash/interpreter/errors.py +116 -0
  176. just_bash/interpreter/expansion.py +1156 -0
  177. just_bash/interpreter/interpreter.py +813 -0
  178. just_bash/interpreter/types.py +134 -0
  179. just_bash/network/__init__.py +1 -0
  180. just_bash/parser/__init__.py +39 -0
  181. just_bash/parser/lexer.py +948 -0
  182. just_bash/parser/parser.py +2162 -0
  183. just_bash/py.typed +0 -0
  184. just_bash/query_engine/__init__.py +83 -0
  185. just_bash/query_engine/builtins/__init__.py +1283 -0
  186. just_bash/query_engine/evaluator.py +578 -0
  187. just_bash/query_engine/parser.py +525 -0
  188. just_bash/query_engine/tokenizer.py +329 -0
  189. just_bash/query_engine/types.py +373 -0
  190. just_bash/types.py +180 -0
  191. just_bash-0.1.5.dist-info/METADATA +410 -0
  192. just_bash-0.1.5.dist-info/RECORD +193 -0
  193. just_bash-0.1.5.dist-info/WHEEL +4 -0
just_bash/parser/lexer.py
@@ -0,0 +1,948 @@
+"""
+Lexer for Bash Scripts
+
+The lexer tokenizes input into a stream of tokens that the parser consumes.
+It handles:
+- Operators and delimiters
+- Words (with quoting rules)
+- Comments
+- Here-documents
+- Escape sequences
+"""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+from enum import Enum, auto
+from typing import Optional
+
+
+class TokenType(Enum):
+    """Token types for bash lexer."""
+
+    # End of input
+    EOF = auto()
+
+    # Newlines and separators
+    NEWLINE = auto()
+    SEMICOLON = auto()
+    AMP = auto()  # &
+
+    # Operators
+    PIPE = auto()  # |
+    PIPE_AMP = auto()  # |&
+    AND_AND = auto()  # &&
+    OR_OR = auto()  # ||
+    BANG = auto()  # !
+
+    # Redirections
+    LESS = auto()  # <
+    GREAT = auto()  # >
+    DLESS = auto()  # <<
+    DGREAT = auto()  # >>
+    LESSAND = auto()  # <&
+    GREATAND = auto()  # >&
+    LESSGREAT = auto()  # <>
+    DLESSDASH = auto()  # <<-
+    CLOBBER = auto()  # >|
+    TLESS = auto()  # <<<
+    AND_GREAT = auto()  # &>
+    AND_DGREAT = auto()  # &>>
+
+    # Grouping
+    LPAREN = auto()  # (
+    RPAREN = auto()  # )
+    LBRACE = auto()  # {
+    RBRACE = auto()  # }
+
+    # Special
+    DSEMI = auto()  # ;;
+    SEMI_AND = auto()  # ;&
+    SEMI_SEMI_AND = auto()  # ;;&
+
+    # Compound commands
+    DBRACK_START = auto()  # [[
+    DBRACK_END = auto()  # ]]
+    DPAREN_START = auto()  # ((
+    DPAREN_END = auto()  # ))
+
+    # Reserved words
+    IF = auto()
+    THEN = auto()
+    ELSE = auto()
+    ELIF = auto()
+    FI = auto()
+    FOR = auto()
+    WHILE = auto()
+    UNTIL = auto()
+    DO = auto()
+    DONE = auto()
+    CASE = auto()
+    ESAC = auto()
+    IN = auto()
+    FUNCTION = auto()
+    SELECT = auto()
+    TIME = auto()
+    COPROC = auto()
+
+    # Words and identifiers
+    WORD = auto()
+    NAME = auto()  # Valid variable name
+    NUMBER = auto()  # For redirections like 2>&1
+    ASSIGNMENT_WORD = auto()  # VAR=value
+
+    # Comments
+    COMMENT = auto()
+
+    # Here-document content
+    HEREDOC_CONTENT = auto()
+
+
+# Reserved words mapping
+RESERVED_WORDS: dict[str, TokenType] = {
+    "if": TokenType.IF,
+    "then": TokenType.THEN,
+    "else": TokenType.ELSE,
+    "elif": TokenType.ELIF,
+    "fi": TokenType.FI,
+    "for": TokenType.FOR,
+    "while": TokenType.WHILE,
+    "until": TokenType.UNTIL,
+    "do": TokenType.DO,
+    "done": TokenType.DONE,
+    "case": TokenType.CASE,
+    "esac": TokenType.ESAC,
+    "in": TokenType.IN,
+    "function": TokenType.FUNCTION,
+    "select": TokenType.SELECT,
+    "time": TokenType.TIME,
+    "coproc": TokenType.COPROC,
+}
+
+
+@dataclass
+class Token:
+    """A lexical token."""
+
+    type: TokenType
+    value: str
+    start: int
+    end: int
+    line: int
+    column: int
+    quoted: bool = False
+    single_quoted: bool = False
+
+
+@dataclass
+class HeredocInfo:
+    """Information about a pending here-document."""
+
+    delimiter: str
+    strip_tabs: bool = False
+    quoted: bool = False
+
+
+# Regular expressions for validation
+NAME_PATTERN = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*$")
+NUMBER_PATTERN = re.compile(r"^[0-9]+$")
+ASSIGNMENT_LHS_PATTERN = re.compile(r"^[a-zA-Z_][a-zA-Z0-9_]*")
+
+
+def is_valid_name(s: str) -> bool:
+    """Check if string is a valid variable name."""
+    return bool(NAME_PATTERN.match(s))
+
+
+def is_valid_assignment_lhs(s: str) -> bool:
+    """
+    Check if a string is a valid assignment LHS with optional nested array subscript.
+    Handles: VAR, a[0], a[x], a[a[0]], a[x+1], etc.
+    """
+    match = ASSIGNMENT_LHS_PATTERN.match(s)
+    if not match:
+        return False
+
+    after_name = s[match.end() :]
+
+    # If nothing after name, it's valid (simple variable)
+    if after_name == "" or after_name == "+":
+        return True
+
+    # If it's an array subscript, check for balanced brackets
+    if after_name and after_name[0] == "[":
+        depth = 0
+        i = 0
+        for i, c in enumerate(after_name):
+            if c == "[":
+                depth += 1
+            elif c == "]":
+                depth -= 1
+                if depth == 0:
+                    break
+        # Must have found closing bracket
+        if depth != 0 or i >= len(after_name):
+            return False
+        # After closing bracket, only + is allowed (for +=)
+        after_bracket = after_name[i + 1 :]
+        return after_bracket == "" or after_bracket == "+"
+
+    return False
+
+
+# Three-character operators
+THREE_CHAR_OPS: list[tuple[str, TokenType]] = [
+    (";;&", TokenType.SEMI_SEMI_AND),
+    ("<<<", TokenType.TLESS),
+    ("&>>", TokenType.AND_DGREAT),
+]
+
+# Two-character operators
+TWO_CHAR_OPS: list[tuple[str, TokenType]] = [
+    ("[[", TokenType.DBRACK_START),
+    ("]]", TokenType.DBRACK_END),
+    ("((", TokenType.DPAREN_START),
+    ("))", TokenType.DPAREN_END),
+    ("&&", TokenType.AND_AND),
+    ("||", TokenType.OR_OR),
+    (";;", TokenType.DSEMI),
+    (";&", TokenType.SEMI_AND),
+    ("|&", TokenType.PIPE_AMP),
+    (">>", TokenType.DGREAT),
+    ("<&", TokenType.LESSAND),
+    (">&", TokenType.GREATAND),
+    ("<>", TokenType.LESSGREAT),
+    (">|", TokenType.CLOBBER),
+    ("&>", TokenType.AND_GREAT),
+]
+
+# Single-character operators
+SINGLE_CHAR_OPS: dict[str, TokenType] = {
+    "|": TokenType.PIPE,
+    "&": TokenType.AMP,
+    ";": TokenType.SEMICOLON,
+    "(": TokenType.LPAREN,
+    ")": TokenType.RPAREN,
+    "<": TokenType.LESS,
+    ">": TokenType.GREAT,
+}
+
+# Word boundary characters
+WORD_BREAK_CHARS = frozenset(" \t\n;|&()<>")
+SPECIAL_CHARS = frozenset("'\"\\$`{}~*?[")
+
+
+class Lexer:
+    """Lexer for bash scripts."""
+
+    def __init__(self, input_text: str) -> None:
+        self.input = input_text
+        self.pos = 0
+        self.line = 1
+        self.column = 1
+        self.tokens: list[Token] = []
+        self.pending_heredocs: list[HeredocInfo] = []
+
+    def tokenize(self) -> list[Token]:
+        """Tokenize the entire input."""
+        input_text = self.input
+        input_len = len(input_text)
+
+        while self.pos < input_len:
+            self._skip_whitespace()
+
+            if self.pos >= input_len:
+                break
+
+            # Check for pending here-documents after newline
+            if (
+                self.pending_heredocs
+                and self.tokens
+                and self.tokens[-1].type == TokenType.NEWLINE
+            ):
+                self._read_heredoc_content()
+                continue
+
+            token = self._next_token()
+            if token:
+                self.tokens.append(token)
+
+        # Add EOF token
+        self.tokens.append(
+            Token(
+                type=TokenType.EOF,
+                value="",
+                start=self.pos,
+                end=self.pos,
+                line=self.line,
+                column=self.column,
+            )
+        )
+
+        return self.tokens
+
+    def _skip_whitespace(self) -> None:
+        """Skip whitespace and line continuations."""
+        input_text = self.input
+        input_len = len(input_text)
+
+        while self.pos < input_len:
+            char = input_text[self.pos]
+            if char == " " or char == "\t":
+                self.pos += 1
+                self.column += 1
+            elif (
+                char == "\\"
+                and self.pos + 1 < input_len
+                and input_text[self.pos + 1] == "\n"
+            ):
+                # Line continuation
+                self.pos += 2
+                self.line += 1
+                self.column = 1
+            else:
+                break
+
+    def _next_token(self) -> Optional[Token]:
+        """Get the next token."""
+        input_text = self.input
+        pos = self.pos
+        start_line = self.line
+        start_column = self.column
+
+        if pos >= len(input_text):
+            return None
+
+        c0 = input_text[pos]
+        c1 = input_text[pos + 1] if pos + 1 < len(input_text) else ""
+        c2 = input_text[pos + 2] if pos + 2 < len(input_text) else ""
+
+        # Comments
+        if c0 == "#":
+            return self._read_comment(pos, start_line, start_column)
+
+        # Newline
+        if c0 == "\n":
+            self.pos = pos + 1
+            self.line += 1
+            self.column = 1
+            return Token(
+                type=TokenType.NEWLINE,
+                value="\n",
+                start=pos,
+                end=pos + 1,
+                line=start_line,
+                column=start_column,
+            )
+
+        # Three-character operators
+        # Special case: <<- (heredoc with tab stripping)
+        if c0 == "<" and c1 == "<" and c2 == "-":
+            self.pos = pos + 3
+            self.column = start_column + 3
+            self._register_heredoc_from_lookahead(strip_tabs=True)
+            return self._make_token(
+                TokenType.DLESSDASH, "<<-", pos, start_line, start_column
+            )
+
+        # Check other three-char operators
+        three_chars = c0 + c1 + c2
+        for op, token_type in THREE_CHAR_OPS:
+            if three_chars == op:
+                self.pos = pos + 3
+                self.column = start_column + 3
+                return self._make_token(token_type, op, pos, start_line, start_column)
+
+        # Two-character operators
+        # Special case: << (heredoc)
+        if c0 == "<" and c1 == "<":
+            self.pos = pos + 2
+            self.column = start_column + 2
+            self._register_heredoc_from_lookahead(strip_tabs=False)
+            return self._make_token(TokenType.DLESS, "<<", pos, start_line, start_column)
+
+        # Check other two-char operators
+        two_chars = c0 + c1
+        for op, token_type in TWO_CHAR_OPS:
+            if two_chars == op:
+                self.pos = pos + 2
+                self.column = start_column + 2
+                return self._make_token(token_type, op, pos, start_line, start_column)
+
+        # Single-character operators
+        if c0 in SINGLE_CHAR_OPS:
+            self.pos = pos + 1
+            self.column = start_column + 1
+            return self._make_token(
+                SINGLE_CHAR_OPS[c0], c0, pos, start_line, start_column
+            )
+
+        # Special handling for { and }
+        if c0 == "{":
+            # Check for {} as a word (used in find -exec)
+            if c1 == "}":
+                self.pos = pos + 2
+                self.column = start_column + 2
+                return Token(
+                    type=TokenType.WORD,
+                    value="{}",
+                    start=pos,
+                    end=pos + 2,
+                    line=start_line,
+                    column=start_column,
+                    quoted=False,
+                    single_quoted=False,
+                )
+            # In bash, { must be followed by whitespace to be a group start
+            if c1 and c1 not in " \t\n":
+                return self._read_word(pos, start_line, start_column)
+            self.pos = pos + 1
+            self.column = start_column + 1
+            return self._make_token(TokenType.LBRACE, "{", pos, start_line, start_column)
+
+        if c0 == "}":
+            self.pos = pos + 1
+            self.column = start_column + 1
+            return self._make_token(TokenType.RBRACE, "}", pos, start_line, start_column)
+
+        if c0 == "!":
+            # Check for != operator (used in [[ ]] tests)
+            if c1 == "=":
+                self.pos = pos + 2
+                self.column = start_column + 2
+                return self._make_token(TokenType.WORD, "!=", pos, start_line, start_column)
+            self.pos = pos + 1
+            self.column = start_column + 1
+            return self._make_token(TokenType.BANG, "!", pos, start_line, start_column)
+
+        # Words
+        return self._read_word(pos, start_line, start_column)
+
+    def _make_token(
+        self, type_: TokenType, value: str, start: int, line: int, column: int
+    ) -> Token:
+        """Create a token."""
+        return Token(
+            type=type_,
+            value=value,
+            start=start,
+            end=self.pos,
+            line=line,
+            column=column,
+        )
+
+    def _read_comment(self, start: int, line: int, column: int) -> Token:
+        """Read a comment token."""
+        input_text = self.input
+        input_len = len(input_text)
+        pos = self.pos
+
+        # Find end of comment (newline or EOF)
+        while pos < input_len and input_text[pos] != "\n":
+            pos += 1
+
+        value = input_text[start:pos]
+        self.pos = pos
+        self.column = column + (pos - start)
+
+        return Token(
+            type=TokenType.COMMENT,
+            value=value,
+            start=start,
+            end=pos,
+            line=line,
+            column=column,
+        )
+
+    def _read_word(self, start: int, line: int, column: int) -> Token:
+        """Read a word token (with possible quotes, escapes, expansions)."""
+        input_text = self.input
+        input_len = len(input_text)
+        pos = self.pos
+
+        # Fast path: scan for simple word (no quotes, escapes, or expansions)
+        fast_start = pos
+        while pos < input_len:
+            c = input_text[pos]
+            if c in WORD_BREAK_CHARS or c in SPECIAL_CHARS:
+                break
+            pos += 1
+
+        # If we consumed characters and hit a simple delimiter
+        if pos > fast_start:
+            c = input_text[pos] if pos < input_len else ""
+            if c == "" or c in WORD_BREAK_CHARS:
+                value = input_text[fast_start:pos]
+                self.pos = pos
+                self.column = column + (pos - fast_start)
+
+                # Check for reserved words
+                if value in RESERVED_WORDS:
+                    return Token(
+                        type=RESERVED_WORDS[value],
+                        value=value,
+                        start=start,
+                        end=pos,
+                        line=line,
+                        column=column,
+                    )
+
+                # Check for assignment
+                eq_idx = value.find("=")
+                if eq_idx > 0 and is_valid_assignment_lhs(value[:eq_idx]):
+                    return Token(
+                        type=TokenType.ASSIGNMENT_WORD,
+                        value=value,
+                        start=start,
+                        end=pos,
+                        line=line,
+                        column=column,
+                    )
+
+                # Check for number
+                if NUMBER_PATTERN.match(value):
+                    return Token(
+                        type=TokenType.NUMBER,
+                        value=value,
+                        start=start,
+                        end=pos,
+                        line=line,
+                        column=column,
+                    )
+
+                # Check for valid name
+                if NAME_PATTERN.match(value):
+                    return Token(
+                        type=TokenType.NAME,
+                        value=value,
+                        start=start,
+                        end=pos,
+                        line=line,
+                        column=column,
+                        quoted=False,
+                        single_quoted=False,
+                    )
+
+                return Token(
+                    type=TokenType.WORD,
+                    value=value,
+                    start=start,
+                    end=pos,
+                    line=line,
+                    column=column,
+                    quoted=False,
+                    single_quoted=False,
+                )
+
+        # Slow path: handle complex words with quotes, escapes, expansions
+        pos = self.pos  # Reset position
+        col = self.column
+        ln = self.line
+
+        value = ""
+        quoted = False
+        single_quoted = False
+        in_single_quote = False
+        in_double_quote = False
+        starts_with_quote = input_text[pos] in "\"'" if pos < input_len else False
+
+        while pos < input_len:
+            char = input_text[pos]
+
+            # Check for word boundaries
+            if not in_single_quote and not in_double_quote:
+                if char in WORD_BREAK_CHARS:
+                    break
+
+            # Handle $'' ANSI-C quoting
+            if (
+                char == "$"
+                and pos + 1 < input_len
+                and input_text[pos + 1] == "'"
+                and not in_single_quote
+                and not in_double_quote
+            ):
+                value += "$'"
+                pos += 2
+                col += 2
+                # Read until closing quote, handling escape sequences
+                while pos < input_len and input_text[pos] != "'":
+                    if input_text[pos] == "\\" and pos + 1 < input_len:
+                        value += input_text[pos : pos + 2]
+                        pos += 2
+                        col += 2
+                    else:
+                        value += input_text[pos]
+                        pos += 1
+                        col += 1
+                if pos < input_len:
+                    value += "'"
+                    pos += 1
+                    col += 1
+                continue
+
+            # Handle $"..." locale quoting
+            if (
+                char == "$"
+                and pos + 1 < input_len
+                and input_text[pos + 1] == '"'
+                and not in_single_quote
+                and not in_double_quote
+            ):
+                pos += 1
+                col += 1
+                in_double_quote = True
+                quoted = True
+                if value == "":
+                    starts_with_quote = True
+                pos += 1
+                col += 1
+                continue
+
+            # Handle quotes
+            if char == "'" and not in_double_quote:
+                if in_single_quote:
+                    in_single_quote = False
+                    if not starts_with_quote:
+                        value += char
+                else:
+                    in_single_quote = True
+                    if starts_with_quote:
+                        single_quoted = True
+                        quoted = True
+                    else:
+                        value += char
+                pos += 1
+                col += 1
+                continue
+
+            if char == '"' and not in_single_quote:
+                if in_double_quote:
+                    in_double_quote = False
+                    if not starts_with_quote:
+                        value += char
+                else:
+                    in_double_quote = True
+                    if starts_with_quote:
+                        quoted = True
+                    else:
+                        value += char
+                pos += 1
+                col += 1
+                continue
+
+            # Handle escapes
+            if char == "\\" and not in_single_quote and pos + 1 < input_len:
+                next_char = input_text[pos + 1]
+                if next_char == "\n":
+                    # Line continuation
+                    pos += 2
+                    ln += 1
+                    col = 1
+                    continue
+                if in_double_quote:
+                    # In double quotes, only certain escapes are special
+                    if next_char in "\"\\$`\n":
+                        if next_char in "$`":
+                            value += char + next_char
+                        else:
+                            value += next_char
+                        pos += 2
+                        col += 2
+                        continue
+                else:
+                    # Outside quotes, backslash escapes next character
+                    if next_char in "\"'":
+                        value += char + next_char
+                    else:
+                        value += next_char
+                    pos += 2
+                    col += 2
+                    continue
+
+            # Handle $(...) command substitution
+            if char == "$" and pos + 1 < input_len and input_text[pos + 1] == "(":
+                value += char
+                pos += 1
+                col += 1
+                value += input_text[pos]  # Add the (
+                pos += 1
+                col += 1
+
+                # Track parenthesis depth
+                depth = 1
+                cmd_in_single_quote = False
+                cmd_in_double_quote = False
+
+                while depth > 0 and pos < input_len:
+                    c = input_text[pos]
+                    value += c
+
+                    if cmd_in_single_quote:
+                        if c == "'":
+                            cmd_in_single_quote = False
+                    elif cmd_in_double_quote:
+                        if c == "\\" and pos + 1 < input_len:
+                            value += input_text[pos + 1]
+                            pos += 1
+                            col += 1
+                        elif c == '"':
+                            cmd_in_double_quote = False
+                    else:
+                        if c == "'":
+                            cmd_in_single_quote = True
+                        elif c == '"':
+                            cmd_in_double_quote = True
+                        elif c == "\\" and pos + 1 < input_len:
+                            value += input_text[pos + 1]
+                            pos += 1
+                            col += 1
+                        elif c == "(":
+                            depth += 1
+                        elif c == ")":
+                            depth -= 1
+
+                    pos += 1
+                    col += 1
+                continue
+
+            # Handle ${...} parameter expansion
+            if char == "$" and pos + 1 < input_len and input_text[pos + 1] == "{":
+                value += char
+                pos += 1
+                col += 1
+                value += input_text[pos]  # Add the {
+                pos += 1
+                col += 1
+
+                # Track brace depth
+                depth = 1
+                while depth > 0 and pos < input_len:
+                    c = input_text[pos]
+                    value += c
+
+                    if c == "{":
+                        depth += 1
+                    elif c == "}":
+                        depth -= 1
+
+                    pos += 1
+                    col += 1
+                continue
+
+            # Handle simple $VAR expansion
+            if char == "$":
+                value += char
+                pos += 1
+                col += 1
+                # Read variable name
+                while pos < input_len and (
+                    input_text[pos].isalnum() or input_text[pos] == "_"
+                ):
+                    value += input_text[pos]
+                    pos += 1
+                    col += 1
+                continue
+
+            # Handle backtick command substitution
+            if char == "`":
+                value += char
+                pos += 1
+                col += 1
+                # Read until closing backtick
+                while pos < input_len and input_text[pos] != "`":
+                    if input_text[pos] == "\\" and pos + 1 < input_len:
+                        value += input_text[pos : pos + 2]
+                        pos += 2
+                        col += 2
+                    else:
+                        value += input_text[pos]
+                        pos += 1
+                        col += 1
+                if pos < input_len:
+                    value += "`"
+                    pos += 1
+                    col += 1
+                continue
+
+            # Handle brace expansion and glob patterns
+            if char in "{}*?[~":
+                value += char
+                pos += 1
+                col += 1
+                continue
+
+            # Regular character
+            value += char
+            pos += 1
+            col += 1
+
+        self.pos = pos
+        self.column = col
+        self.line = ln
+
+        # Determine token type
+        # Note: An empty value is valid if it was quoted (e.g., "" or '')
+        if not value and not quoted:
+            return None
+
+        # Check for reserved words (only if unquoted)
+        if not quoted and value in RESERVED_WORDS:
+            return Token(
+                type=RESERVED_WORDS[value],
+                value=value,
+                start=start,
+                end=pos,
+                line=line,
+                column=column,
+            )
+
+        # Check for assignment (only if unquoted)
+        if not quoted:
+            eq_idx = value.find("=")
+            if eq_idx > 0 and is_valid_assignment_lhs(value[:eq_idx]):
+                return Token(
+                    type=TokenType.ASSIGNMENT_WORD,
+                    value=value,
+                    start=start,
+                    end=pos,
+                    line=line,
+                    column=column,
+                )
+
+        return Token(
+            type=TokenType.WORD,
+            value=value,
+            start=start,
+            end=pos,
+            line=line,
+            column=column,
+            quoted=quoted,
+            single_quoted=single_quoted,
+        )
+
+    def _register_heredoc_from_lookahead(self, strip_tabs: bool) -> None:
+        """Register a here-document by looking ahead for the delimiter."""
+        input_text = self.input
+        input_len = len(input_text)
+        pos = self.pos
+
+        # Skip whitespace
+        while pos < input_len and input_text[pos] in " \t":
+            pos += 1
+
+        if pos >= input_len:
+            return
+
+        # Read delimiter
+        delimiter = ""
+        quoted = False
+        in_single_quote = False
+        in_double_quote = False
+
+        # Check for quoted delimiter
+        if input_text[pos] == "'":
+            quoted = True
+            in_single_quote = True
+            pos += 1
+        elif input_text[pos] == '"':
+            quoted = True
+            in_double_quote = True
+            pos += 1
+
+        while pos < input_len:
+            c = input_text[pos]
+
+            if in_single_quote:
+                if c == "'":
+                    pos += 1
+                    break
+                delimiter += c
+            elif in_double_quote:
+                if c == '"':
+                    pos += 1
+                    break
+                delimiter += c
+            else:
+                if c in " \t\n;|&<>()":
+                    break
+                # Handle backslash escapes in unquoted delimiter
+                if c == "\\" and pos + 1 < input_len:
+                    delimiter += input_text[pos + 1]
+                    pos += 2
+                    quoted = True  # Backslash makes it quoted
+                    continue
+                delimiter += c
+
+            pos += 1
+
+        if delimiter:
+            self.pending_heredocs.append(
+                HeredocInfo(delimiter=delimiter, strip_tabs=strip_tabs, quoted=quoted)
+            )
+
+    def _read_heredoc_content(self) -> None:
+        """Read here-document content."""
+        if not self.pending_heredocs:
+            return
+
+        input_text = self.input
+        input_len = len(input_text)
+
+        for heredoc in self.pending_heredocs:
+            delimiter = heredoc.delimiter
+            strip_tabs = heredoc.strip_tabs
+            start = self.pos
+            start_line = self.line
+            start_column = self.column
+
+            content = ""
+            while self.pos < input_len:
+                # Read a line
+                line_start = self.pos
+                line_content = ""
+
+                while self.pos < input_len and input_text[self.pos] != "\n":
+                    line_content += input_text[self.pos]
+                    self.pos += 1
+
+                # Include newline if present
+                if self.pos < input_len:
+                    self.pos += 1
+                    self.line += 1
+                    self.column = 1
+
+                # Check if this line is the delimiter
+                check_line = line_content
+                if strip_tabs:
+                    check_line = line_content.lstrip("\t")
+
+                if check_line == delimiter:
+                    break
+
+                # Add line to content (with tab stripping if applicable)
+                if strip_tabs:
+                    content += line_content.lstrip("\t") + "\n"
+                else:
+                    content += line_content + "\n"
+
+            # Create token for heredoc content
+            self.tokens.append(
+                Token(
+                    type=TokenType.HEREDOC_CONTENT,
+                    value=content,
+                    start=start,
+                    end=self.pos,
+                    line=start_line,
+                    column=start_column,
+                )
+            )
+
+        self.pending_heredocs.clear()
+
+
+def tokenize(input_text: str) -> list[Token]:
+    """Convenience function to tokenize input."""
+    lexer = Lexer(input_text)
+    return lexer.tokenize()
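Usage note: the hunk above ends with a module-level tokenize() convenience function. As a rough illustration of the token stream this lexer produces, here is a minimal sketch; it assumes the wheel is installed so the module is importable as just_bash.parser.lexer (the path listed in the file table above), and the script literal is invented for the example.

from just_bash.parser.lexer import tokenize

# A tiny script exercising an assignment, redirections, and a here-document.
script = 'LOG=out.txt\ncat <<EOF >"$LOG" 2>&1\nhello\nEOF\n'

for tok in tokenize(script):
    # Each Token carries its type, raw value, offsets, and line/column info.
    print(tok.type.name, repr(tok.value))

# With the lexer as shown, this yields roughly the following stream:
#   ASSIGNMENT_WORD 'LOG=out.txt', NEWLINE, NAME 'cat', DLESS '<<', NAME 'EOF',
#   GREAT '>', WORD '$LOG' (quoted), NUMBER '2', GREATAND '>&', NUMBER '1',
#   NEWLINE, then HEREDOC_CONTENT 'hello\n', and finally EOF.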