just-bash 0.1.5__py3-none-any.whl → 0.1.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. just_bash/ast/factory.py +3 -1
  2. just_bash/bash.py +28 -6
  3. just_bash/commands/awk/awk.py +362 -17
  4. just_bash/commands/cat/cat.py +5 -1
  5. just_bash/commands/echo/echo.py +33 -1
  6. just_bash/commands/grep/grep.py +141 -3
  7. just_bash/commands/od/od.py +144 -30
  8. just_bash/commands/printf/printf.py +289 -87
  9. just_bash/commands/pwd/pwd.py +32 -2
  10. just_bash/commands/read/read.py +243 -64
  11. just_bash/commands/readlink/readlink.py +3 -9
  12. just_bash/commands/registry.py +32 -0
  13. just_bash/commands/rmdir/__init__.py +5 -0
  14. just_bash/commands/rmdir/rmdir.py +160 -0
  15. just_bash/commands/sed/sed.py +142 -31
  16. just_bash/commands/shuf/__init__.py +5 -0
  17. just_bash/commands/shuf/shuf.py +242 -0
  18. just_bash/commands/stat/stat.py +9 -0
  19. just_bash/commands/time/__init__.py +5 -0
  20. just_bash/commands/time/time.py +74 -0
  21. just_bash/commands/touch/touch.py +118 -8
  22. just_bash/commands/whoami/__init__.py +5 -0
  23. just_bash/commands/whoami/whoami.py +18 -0
  24. just_bash/fs/in_memory_fs.py +22 -0
  25. just_bash/fs/overlay_fs.py +22 -1
  26. just_bash/interpreter/__init__.py +1 -1
  27. just_bash/interpreter/builtins/__init__.py +2 -0
  28. just_bash/interpreter/builtins/control.py +4 -8
  29. just_bash/interpreter/builtins/declare.py +321 -24
  30. just_bash/interpreter/builtins/getopts.py +163 -0
  31. just_bash/interpreter/builtins/let.py +2 -2
  32. just_bash/interpreter/builtins/local.py +71 -5
  33. just_bash/interpreter/builtins/misc.py +22 -6
  34. just_bash/interpreter/builtins/readonly.py +38 -10
  35. just_bash/interpreter/builtins/set.py +58 -8
  36. just_bash/interpreter/builtins/test.py +136 -19
  37. just_bash/interpreter/builtins/unset.py +62 -10
  38. just_bash/interpreter/conditionals.py +29 -4
  39. just_bash/interpreter/control_flow.py +61 -17
  40. just_bash/interpreter/expansion.py +1647 -104
  41. just_bash/interpreter/interpreter.py +436 -69
  42. just_bash/interpreter/types.py +263 -2
  43. just_bash/parser/__init__.py +2 -0
  44. just_bash/parser/lexer.py +295 -26
  45. just_bash/parser/parser.py +523 -64
  46. just_bash/types.py +11 -0
  47. {just_bash-0.1.5.dist-info → just_bash-0.1.10.dist-info}/METADATA +40 -1
  48. {just_bash-0.1.5.dist-info → just_bash-0.1.10.dist-info}/RECORD +49 -40
  49. {just_bash-0.1.5.dist-info → just_bash-0.1.10.dist-info}/WHEEL +0 -0
just_bash/parser/lexer.py CHANGED
@@ -116,7 +116,7 @@ RESERVED_WORDS: dict[str, TokenType] = {
116
116
  "in": TokenType.IN,
117
117
  "function": TokenType.FUNCTION,
118
118
  "select": TokenType.SELECT,
119
- "time": TokenType.TIME,
119
+ # "time" is handled as a regular command, not a reserved keyword
120
120
  "coproc": TokenType.COPROC,
121
121
  }
122
122
 
@@ -133,6 +133,7 @@ class Token:
133
133
  column: int
134
134
  quoted: bool = False
135
135
  single_quoted: bool = False
136
+ segments: list | None = None # list of (text, mode) tuples for mixed quoting
136
137
 
137
138
 
138
139
  @dataclass
@@ -256,11 +257,7 @@ class Lexer:
256
257
  break
257
258
 
258
259
  # Check for pending here-documents after newline
259
- if (
260
- self.pending_heredocs
261
- and self.tokens
262
- and self.tokens[-1].type == TokenType.NEWLINE
263
- ):
260
+ if self.pending_heredocs and self.tokens and self.tokens[-1].type == TokenType.NEWLINE:
264
261
  self._read_heredoc_content()
265
262
  continue
266
263
 
@@ -292,11 +289,7 @@ class Lexer:
292
289
  if char == " " or char == "\t":
293
290
  self.pos += 1
294
291
  self.column += 1
295
- elif (
296
- char == "\\"
297
- and self.pos + 1 < input_len
298
- and input_text[self.pos + 1] == "\n"
299
- ):
292
+ elif char == "\\" and self.pos + 1 < input_len and input_text[self.pos + 1] == "\n":
300
293
  # Line continuation
301
294
  self.pos += 2
302
295
  self.line += 1
@@ -342,9 +335,7 @@ class Lexer:
342
335
  self.pos = pos + 3
343
336
  self.column = start_column + 3
344
337
  self._register_heredoc_from_lookahead(strip_tabs=True)
345
- return self._make_token(
346
- TokenType.DLESSDASH, "<<-", pos, start_line, start_column
347
- )
338
+ return self._make_token(TokenType.DLESSDASH, "<<-", pos, start_line, start_column)
348
339
 
349
340
  # Check other three-char operators
350
341
  three_chars = c0 + c1 + c2
@@ -374,9 +365,7 @@ class Lexer:
374
365
  if c0 in SINGLE_CHAR_OPS:
375
366
  self.pos = pos + 1
376
367
  self.column = start_column + 1
377
- return self._make_token(
378
- SINGLE_CHAR_OPS[c0], c0, pos, start_line, start_column
379
- )
368
+ return self._make_token(SINGLE_CHAR_OPS[c0], c0, pos, start_line, start_column)
380
369
 
381
370
  # Special handling for { and }
382
371
  if c0 == "{":
@@ -470,9 +459,17 @@ class Lexer:
470
459
  pos += 1
471
460
 
472
461
  # If we consumed characters and hit a simple delimiter
462
+ _use_fast_path = False
473
463
  if pos > fast_start:
474
464
  c = input_text[pos] if pos < input_len else ""
475
465
  if c == "" or c in WORD_BREAK_CHARS:
466
+ # Don't use fast path if we're at an extglob pattern: @( ?( *( +( !(
467
+ if c == "(" and pos > fast_start and input_text[pos - 1] in "@?*+!":
468
+ _use_fast_path = False # Fall through to slow path
469
+ else:
470
+ _use_fast_path = True
471
+
472
+ if _use_fast_path:
476
473
  value = input_text[fast_start:pos]
477
474
  self.pos = pos
478
475
  self.column = column + (pos - fast_start)
@@ -547,12 +544,34 @@ class Lexer:
547
544
  in_double_quote = False
548
545
  starts_with_quote = input_text[pos] in "\"'" if pos < input_len else False
549
546
 
547
+ # Segment boundary tracking for mixed quoting (e.g., "pre"{a,b}"suf")
548
+ # Records (value_offset, mode) at each quoting transition
549
+ seg_boundaries: list[tuple[int, str]] = []
550
+ seg_mode = "unquoted"
551
+
550
552
  while pos < input_len:
551
553
  char = input_text[pos]
552
554
 
553
555
  # Check for word boundaries
554
556
  if not in_single_quote and not in_double_quote:
555
557
  if char in WORD_BREAK_CHARS:
558
+ # Handle extglob patterns: @( ?( *( +( !(
559
+ if char == "(" and value and value[-1] in "@?*+!":
560
+ # Read balanced paren group as part of word
561
+ value += char
562
+ pos += 1
563
+ col += 1
564
+ depth = 1
565
+ while pos < input_len and depth > 0:
566
+ ec = input_text[pos]
567
+ if ec == "(":
568
+ depth += 1
569
+ elif ec == ")":
570
+ depth -= 1
571
+ value += ec
572
+ pos += 1
573
+ col += 1
574
+ continue
556
575
  break
557
576
 
558
577
  # Handle $'' ANSI-C quoting
@@ -604,13 +623,20 @@ class Lexer:
604
623
  if char == "'" and not in_double_quote:
605
624
  if in_single_quote:
606
625
  in_single_quote = False
607
- if not starts_with_quote:
626
+ if starts_with_quote:
627
+ # Record transition: single → unquoted
628
+ seg_boundaries.append((len(value), seg_mode))
629
+ seg_mode = "unquoted"
630
+ else:
608
631
  value += char
609
632
  else:
610
633
  in_single_quote = True
611
634
  if starts_with_quote:
612
635
  single_quoted = True
613
636
  quoted = True
637
+ # Record transition: current → single
638
+ seg_boundaries.append((len(value), seg_mode))
639
+ seg_mode = "single"
614
640
  else:
615
641
  value += char
616
642
  pos += 1
@@ -620,12 +646,19 @@ class Lexer:
620
646
  if char == '"' and not in_single_quote:
621
647
  if in_double_quote:
622
648
  in_double_quote = False
623
- if not starts_with_quote:
649
+ if starts_with_quote:
650
+ # Record transition: double → unquoted
651
+ seg_boundaries.append((len(value), seg_mode))
652
+ seg_mode = "unquoted"
653
+ else:
624
654
  value += char
625
655
  else:
626
656
  in_double_quote = True
627
657
  if starts_with_quote:
628
658
  quoted = True
659
+ # Record transition: current → double
660
+ seg_boundaries.append((len(value), seg_mode))
661
+ seg_mode = "double"
629
662
  else:
630
663
  value += char
631
664
  pos += 1
@@ -643,7 +676,7 @@ class Lexer:
643
676
  continue
644
677
  if in_double_quote:
645
678
  # In double quotes, only certain escapes are special
646
- if next_char in "\"\\$`\n":
679
+ if next_char in '"\\$`\n':
647
680
  if next_char in "$`":
648
681
  value += char + next_char
649
682
  else:
@@ -653,7 +686,9 @@ class Lexer:
653
686
  continue
654
687
  else:
655
688
  # Outside quotes, backslash escapes next character
656
- if next_char in "\"'":
689
+ if next_char in "\"'{}":
690
+ # Preserve backslash for quotes and braces so parser
691
+ # can create EscapedPart (prevents brace expansion)
657
692
  value += char + next_char
658
693
  else:
659
694
  value += next_char
@@ -707,6 +742,30 @@ class Lexer:
707
742
  col += 1
708
743
  continue
709
744
 
745
+ # Handle $[...] legacy arithmetic expansion
746
+ if char == "$" and pos + 1 < input_len and input_text[pos + 1] == "[":
747
+ value += char
748
+ pos += 1
749
+ col += 1
750
+ value += input_text[pos] # Add the [
751
+ pos += 1
752
+ col += 1
753
+
754
+ # Track bracket depth
755
+ depth = 1
756
+ while depth > 0 and pos < input_len:
757
+ c = input_text[pos]
758
+ value += c
759
+
760
+ if c == "[":
761
+ depth += 1
762
+ elif c == "]":
763
+ depth -= 1
764
+
765
+ pos += 1
766
+ col += 1
767
+ continue
768
+
710
769
  # Handle ${...} parameter expansion
711
770
  if char == "$" and pos + 1 < input_len and input_text[pos + 1] == "{":
712
771
  value += char
@@ -737,9 +796,7 @@ class Lexer:
737
796
  pos += 1
738
797
  col += 1
739
798
  # Read variable name
740
- while pos < input_len and (
741
- input_text[pos].isalnum() or input_text[pos] == "_"
742
- ):
799
+ while pos < input_len and (input_text[pos].isalnum() or input_text[pos] == "_"):
743
800
  value += input_text[pos]
744
801
  pos += 1
745
802
  col += 1
@@ -811,6 +868,24 @@ class Lexer:
811
868
  column=column,
812
869
  )
813
870
 
871
+ # Build segments from boundaries if we had quoting transitions
872
+ final_segments = None
873
+ if seg_boundaries:
874
+ final_segments = []
875
+ prev_offset = 0
876
+ for offset, mode in seg_boundaries:
877
+ text = value[prev_offset:offset]
878
+ if text:
879
+ final_segments.append((text, mode))
880
+ prev_offset = offset
881
+ # Add final segment
882
+ final_text = value[prev_offset:]
883
+ if final_text:
884
+ final_segments.append((final_text, seg_mode))
885
+ # Only use segments if there are multiple (mixed quoting)
886
+ if len(final_segments) <= 1:
887
+ final_segments = None
888
+
814
889
  return Token(
815
890
  type=TokenType.WORD,
816
891
  value=value,
@@ -820,6 +895,7 @@ class Lexer:
820
895
  column=column,
821
896
  quoted=quoted,
822
897
  single_quoted=single_quoted,
898
+ segments=final_segments,
823
899
  )
824
900
 
825
901
  def _register_heredoc_from_lookahead(self, strip_tabs: bool) -> None:
@@ -856,13 +932,15 @@ class Lexer:
856
932
 
857
933
  if in_single_quote:
858
934
  if c == "'":
935
+ in_single_quote = False
859
936
  pos += 1
860
- break
937
+ continue
861
938
  delimiter += c
862
939
  elif in_double_quote:
863
940
  if c == '"':
941
+ in_double_quote = False
864
942
  pos += 1
865
- break
943
+ continue
866
944
  delimiter += c
867
945
  else:
868
946
  if c in " \t\n;|&<>()":
@@ -873,6 +951,17 @@ class Lexer:
873
951
  pos += 2
874
952
  quoted = True # Backslash makes it quoted
875
953
  continue
954
+ # Handle embedded quotes (e.g., E'O'F)
955
+ if c == "'":
956
+ in_single_quote = True
957
+ quoted = True
958
+ pos += 1
959
+ continue
960
+ if c == '"':
961
+ in_double_quote = True
962
+ quoted = True
963
+ pos += 1
964
+ continue
876
965
  delimiter += c
877
966
 
878
967
  pos += 1
@@ -946,3 +1035,183 @@ def tokenize(input_text: str) -> list[Token]:
946
1035
  """Convenience function to tokenize input."""
947
1036
  lexer = Lexer(input_text)
948
1037
  return lexer.tokenize()
1038
+
1039
+
1040
# HTML entity mappings used when sanitizing LLM-generated commands.
HTML_ENTITIES: dict[str, str] = {
    "&lt;": "<",
    "&gt;": ">",
    "&amp;": "&",
    "&quot;": '"',
    "&apos;": "'",
}


def unescape_html_entities(input_text: str) -> str:
    """Unescape HTML entities in operator positions (outside quotes and heredocs).

    This handles LLM-generated bash commands that contain HTML-escaped
    operators like &lt; instead of <.

    Only unescapes entities outside of:
    - Single quotes
    - Double quotes
    - Heredoc content

    Args:
        input_text: The bash script that may contain HTML entities.

    Returns:
        The script with HTML entities unescaped in operator positions.
    """
    result: list[str] = []
    i = 0
    n = len(input_text)
    in_single_quote = False
    in_double_quote = False
    heredoc_delimiter: str | None = None  # None means not inside heredoc content
    heredoc_strip_tabs = False  # True for <<- heredocs (tab-indented terminator OK)

    while i < n:
        char = input_text[i]

        # Inside a heredoc: copy whole lines untouched until the terminator line.
        if heredoc_delimiter is not None:
            line_end = input_text.find("\n", i)
            if line_end == -1:
                line_end = n
            line = input_text[i:line_end]
            # Only <<- heredocs allow the terminator to be preceded by tabs;
            # for plain << a tab-indented delimiter line is still content.
            terminator = line.lstrip("\t") if heredoc_strip_tabs else line
            if terminator == heredoc_delimiter:
                heredoc_delimiter = None
            if line_end < n:
                result.append(input_text[i : line_end + 1])
                i = line_end + 1
            else:
                result.append(line)
                i = line_end
            continue

        # Quote-state tracking.
        if char == "'" and not in_double_quote:
            in_single_quote = not in_single_quote
            result.append(char)
            i += 1
            continue
        if char == '"' and not in_single_quote:
            in_double_quote = not in_double_quote
            result.append(char)
            i += 1
            continue

        # Backslash escapes (inactive inside single quotes): keep both
        # characters as-is so an escaped quote does not toggle quote state
        # and an escaped '&' never starts an entity.
        if char == "\\" and not in_single_quote and i + 1 < n:
            result.append(input_text[i : i + 2])
            i += 2
            continue

        # Heredoc start (<< or <<-), only outside quotes.  A third '<' means
        # a here-string (<<<), which has no body and must NOT enter heredoc
        # mode — treating it as a heredoc with an empty delimiter would
        # swallow the rest of the script as "content".
        if (
            char == "<"
            and not in_single_quote
            and not in_double_quote
            and i + 1 < n
            and input_text[i + 1] == "<"
        ):
            if i + 2 < n and input_text[i + 2] == "<":
                # Here-string: emit the operator and resume normal scanning.
                result.append("<<<")
                i += 3
                continue

            strip_tabs = i + 2 < n and input_text[i + 2] == "-"
            op_len = 3 if strip_tabs else 2
            result.append(input_text[i : i + op_len])
            i += op_len

            # Skip whitespace between the operator and the delimiter.
            while i < n and input_text[i] in " \t":
                result.append(input_text[i])
                i += 1
            if i >= n:
                continue

            # Parse the delimiter: quoted ('EOF' / "EOF") or bare.
            delimiter = ""
            if input_text[i] in "'\"":
                quote = input_text[i]
                result.append(quote)
                i += 1
                while i < n and input_text[i] != quote:
                    delimiter += input_text[i]
                    result.append(input_text[i])
                    i += 1
                if i < n:
                    result.append(quote)
                    i += 1
            else:
                while i < n and input_text[i] not in " \t\n;|&<>()":
                    if input_text[i] == "\\" and i + 1 < n:
                        # Backslash-escaped character in the delimiter.
                        delimiter += input_text[i + 1]
                        result.append(input_text[i : i + 2])
                        i += 2
                    else:
                        delimiter += input_text[i]
                        result.append(input_text[i])
                        i += 1

            # Copy the remainder of the line as-is; heredoc content starts
            # on the next line.
            while i < n and input_text[i] != "\n":
                result.append(input_text[i])
                i += 1
            if i < n:
                result.append("\n")
                i += 1
            heredoc_delimiter = delimiter
            heredoc_strip_tabs = strip_tabs
            continue

        # Entity replacement only applies outside quotes.
        if char == "&" and not in_single_quote and not in_double_quote:
            rest = input_text[i:]
            replaced = False
            for entity, plain in HTML_ENTITIES.items():
                if rest.startswith(entity):
                    result.append(plain)
                    i += len(entity)
                    replaced = True
                    break
            if replaced:
                continue

        # Regular character.
        result.append(char)
        i += 1

    return "".join(result)