just-bash 0.1.5__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- just_bash/ast/factory.py +3 -1
- just_bash/bash.py +28 -6
- just_bash/commands/awk/awk.py +362 -17
- just_bash/commands/cat/cat.py +5 -1
- just_bash/commands/echo/echo.py +33 -1
- just_bash/commands/grep/grep.py +141 -3
- just_bash/commands/od/od.py +144 -30
- just_bash/commands/printf/printf.py +289 -87
- just_bash/commands/pwd/pwd.py +32 -2
- just_bash/commands/read/read.py +243 -64
- just_bash/commands/readlink/readlink.py +3 -9
- just_bash/commands/registry.py +32 -0
- just_bash/commands/rmdir/__init__.py +5 -0
- just_bash/commands/rmdir/rmdir.py +160 -0
- just_bash/commands/sed/sed.py +142 -31
- just_bash/commands/shuf/__init__.py +5 -0
- just_bash/commands/shuf/shuf.py +242 -0
- just_bash/commands/stat/stat.py +9 -0
- just_bash/commands/time/__init__.py +5 -0
- just_bash/commands/time/time.py +74 -0
- just_bash/commands/touch/touch.py +118 -8
- just_bash/commands/whoami/__init__.py +5 -0
- just_bash/commands/whoami/whoami.py +18 -0
- just_bash/fs/in_memory_fs.py +22 -0
- just_bash/fs/overlay_fs.py +22 -1
- just_bash/interpreter/__init__.py +1 -1
- just_bash/interpreter/builtins/__init__.py +2 -0
- just_bash/interpreter/builtins/control.py +4 -8
- just_bash/interpreter/builtins/declare.py +321 -24
- just_bash/interpreter/builtins/getopts.py +163 -0
- just_bash/interpreter/builtins/let.py +2 -2
- just_bash/interpreter/builtins/local.py +71 -5
- just_bash/interpreter/builtins/misc.py +22 -6
- just_bash/interpreter/builtins/readonly.py +38 -10
- just_bash/interpreter/builtins/set.py +58 -8
- just_bash/interpreter/builtins/test.py +136 -19
- just_bash/interpreter/builtins/unset.py +62 -10
- just_bash/interpreter/conditionals.py +29 -4
- just_bash/interpreter/control_flow.py +61 -17
- just_bash/interpreter/expansion.py +1647 -104
- just_bash/interpreter/interpreter.py +436 -69
- just_bash/interpreter/types.py +263 -2
- just_bash/parser/__init__.py +2 -0
- just_bash/parser/lexer.py +295 -26
- just_bash/parser/parser.py +523 -64
- just_bash/types.py +11 -0
- {just_bash-0.1.5.dist-info → just_bash-0.1.10.dist-info}/METADATA +40 -1
- {just_bash-0.1.5.dist-info → just_bash-0.1.10.dist-info}/RECORD +49 -40
- {just_bash-0.1.5.dist-info → just_bash-0.1.10.dist-info}/WHEEL +0 -0
just_bash/parser/lexer.py
CHANGED
|
@@ -116,7 +116,7 @@ RESERVED_WORDS: dict[str, TokenType] = {
|
|
|
116
116
|
"in": TokenType.IN,
|
|
117
117
|
"function": TokenType.FUNCTION,
|
|
118
118
|
"select": TokenType.SELECT,
|
|
119
|
-
"time"
|
|
119
|
+
# "time" is handled as a regular command, not a reserved keyword
|
|
120
120
|
"coproc": TokenType.COPROC,
|
|
121
121
|
}
|
|
122
122
|
|
|
@@ -133,6 +133,7 @@ class Token:
|
|
|
133
133
|
column: int
|
|
134
134
|
quoted: bool = False
|
|
135
135
|
single_quoted: bool = False
|
|
136
|
+
segments: list | None = None # list of (text, mode) tuples for mixed quoting
|
|
136
137
|
|
|
137
138
|
|
|
138
139
|
@dataclass
|
|
@@ -256,11 +257,7 @@ class Lexer:
|
|
|
256
257
|
break
|
|
257
258
|
|
|
258
259
|
# Check for pending here-documents after newline
|
|
259
|
-
if (
|
|
260
|
-
self.pending_heredocs
|
|
261
|
-
and self.tokens
|
|
262
|
-
and self.tokens[-1].type == TokenType.NEWLINE
|
|
263
|
-
):
|
|
260
|
+
if self.pending_heredocs and self.tokens and self.tokens[-1].type == TokenType.NEWLINE:
|
|
264
261
|
self._read_heredoc_content()
|
|
265
262
|
continue
|
|
266
263
|
|
|
@@ -292,11 +289,7 @@ class Lexer:
|
|
|
292
289
|
if char == " " or char == "\t":
|
|
293
290
|
self.pos += 1
|
|
294
291
|
self.column += 1
|
|
295
|
-
elif (
|
|
296
|
-
char == "\\"
|
|
297
|
-
and self.pos + 1 < input_len
|
|
298
|
-
and input_text[self.pos + 1] == "\n"
|
|
299
|
-
):
|
|
292
|
+
elif char == "\\" and self.pos + 1 < input_len and input_text[self.pos + 1] == "\n":
|
|
300
293
|
# Line continuation
|
|
301
294
|
self.pos += 2
|
|
302
295
|
self.line += 1
|
|
@@ -342,9 +335,7 @@ class Lexer:
|
|
|
342
335
|
self.pos = pos + 3
|
|
343
336
|
self.column = start_column + 3
|
|
344
337
|
self._register_heredoc_from_lookahead(strip_tabs=True)
|
|
345
|
-
return self._make_token(
|
|
346
|
-
TokenType.DLESSDASH, "<<-", pos, start_line, start_column
|
|
347
|
-
)
|
|
338
|
+
return self._make_token(TokenType.DLESSDASH, "<<-", pos, start_line, start_column)
|
|
348
339
|
|
|
349
340
|
# Check other three-char operators
|
|
350
341
|
three_chars = c0 + c1 + c2
|
|
@@ -374,9 +365,7 @@ class Lexer:
|
|
|
374
365
|
if c0 in SINGLE_CHAR_OPS:
|
|
375
366
|
self.pos = pos + 1
|
|
376
367
|
self.column = start_column + 1
|
|
377
|
-
return self._make_token(
|
|
378
|
-
SINGLE_CHAR_OPS[c0], c0, pos, start_line, start_column
|
|
379
|
-
)
|
|
368
|
+
return self._make_token(SINGLE_CHAR_OPS[c0], c0, pos, start_line, start_column)
|
|
380
369
|
|
|
381
370
|
# Special handling for { and }
|
|
382
371
|
if c0 == "{":
|
|
@@ -470,9 +459,17 @@ class Lexer:
|
|
|
470
459
|
pos += 1
|
|
471
460
|
|
|
472
461
|
# If we consumed characters and hit a simple delimiter
|
|
462
|
+
_use_fast_path = False
|
|
473
463
|
if pos > fast_start:
|
|
474
464
|
c = input_text[pos] if pos < input_len else ""
|
|
475
465
|
if c == "" or c in WORD_BREAK_CHARS:
|
|
466
|
+
# Don't use fast path if we're at an extglob pattern: @( ?( *( +( !(
|
|
467
|
+
if c == "(" and pos > fast_start and input_text[pos - 1] in "@?*+!":
|
|
468
|
+
_use_fast_path = False # Fall through to slow path
|
|
469
|
+
else:
|
|
470
|
+
_use_fast_path = True
|
|
471
|
+
|
|
472
|
+
if _use_fast_path:
|
|
476
473
|
value = input_text[fast_start:pos]
|
|
477
474
|
self.pos = pos
|
|
478
475
|
self.column = column + (pos - fast_start)
|
|
@@ -547,12 +544,34 @@ class Lexer:
|
|
|
547
544
|
in_double_quote = False
|
|
548
545
|
starts_with_quote = input_text[pos] in "\"'" if pos < input_len else False
|
|
549
546
|
|
|
547
|
+
# Segment boundary tracking for mixed quoting (e.g., "pre"{a,b}"suf")
|
|
548
|
+
# Records (value_offset, mode) at each quoting transition
|
|
549
|
+
seg_boundaries: list[tuple[int, str]] = []
|
|
550
|
+
seg_mode = "unquoted"
|
|
551
|
+
|
|
550
552
|
while pos < input_len:
|
|
551
553
|
char = input_text[pos]
|
|
552
554
|
|
|
553
555
|
# Check for word boundaries
|
|
554
556
|
if not in_single_quote and not in_double_quote:
|
|
555
557
|
if char in WORD_BREAK_CHARS:
|
|
558
|
+
# Handle extglob patterns: @( ?( *( +( !(
|
|
559
|
+
if char == "(" and value and value[-1] in "@?*+!":
|
|
560
|
+
# Read balanced paren group as part of word
|
|
561
|
+
value += char
|
|
562
|
+
pos += 1
|
|
563
|
+
col += 1
|
|
564
|
+
depth = 1
|
|
565
|
+
while pos < input_len and depth > 0:
|
|
566
|
+
ec = input_text[pos]
|
|
567
|
+
if ec == "(":
|
|
568
|
+
depth += 1
|
|
569
|
+
elif ec == ")":
|
|
570
|
+
depth -= 1
|
|
571
|
+
value += ec
|
|
572
|
+
pos += 1
|
|
573
|
+
col += 1
|
|
574
|
+
continue
|
|
556
575
|
break
|
|
557
576
|
|
|
558
577
|
# Handle $'' ANSI-C quoting
|
|
@@ -604,13 +623,20 @@ class Lexer:
|
|
|
604
623
|
if char == "'" and not in_double_quote:
|
|
605
624
|
if in_single_quote:
|
|
606
625
|
in_single_quote = False
|
|
607
|
-
if not starts_with_quote:
|
|
626
|
+
if starts_with_quote:
|
|
627
|
+
# Record transition: single → unquoted
|
|
628
|
+
seg_boundaries.append((len(value), seg_mode))
|
|
629
|
+
seg_mode = "unquoted"
|
|
630
|
+
else:
|
|
608
631
|
value += char
|
|
609
632
|
else:
|
|
610
633
|
in_single_quote = True
|
|
611
634
|
if starts_with_quote:
|
|
612
635
|
single_quoted = True
|
|
613
636
|
quoted = True
|
|
637
|
+
# Record transition: current → single
|
|
638
|
+
seg_boundaries.append((len(value), seg_mode))
|
|
639
|
+
seg_mode = "single"
|
|
614
640
|
else:
|
|
615
641
|
value += char
|
|
616
642
|
pos += 1
|
|
@@ -620,12 +646,19 @@ class Lexer:
|
|
|
620
646
|
if char == '"' and not in_single_quote:
|
|
621
647
|
if in_double_quote:
|
|
622
648
|
in_double_quote = False
|
|
623
|
-
if not starts_with_quote:
|
|
649
|
+
if starts_with_quote:
|
|
650
|
+
# Record transition: double → unquoted
|
|
651
|
+
seg_boundaries.append((len(value), seg_mode))
|
|
652
|
+
seg_mode = "unquoted"
|
|
653
|
+
else:
|
|
624
654
|
value += char
|
|
625
655
|
else:
|
|
626
656
|
in_double_quote = True
|
|
627
657
|
if starts_with_quote:
|
|
628
658
|
quoted = True
|
|
659
|
+
# Record transition: current → double
|
|
660
|
+
seg_boundaries.append((len(value), seg_mode))
|
|
661
|
+
seg_mode = "double"
|
|
629
662
|
else:
|
|
630
663
|
value += char
|
|
631
664
|
pos += 1
|
|
@@ -643,7 +676,7 @@ class Lexer:
|
|
|
643
676
|
continue
|
|
644
677
|
if in_double_quote:
|
|
645
678
|
# In double quotes, only certain escapes are special
|
|
646
|
-
if next_char in "
|
|
679
|
+
if next_char in '"\\$`\n':
|
|
647
680
|
if next_char in "$`":
|
|
648
681
|
value += char + next_char
|
|
649
682
|
else:
|
|
@@ -653,7 +686,9 @@ class Lexer:
|
|
|
653
686
|
continue
|
|
654
687
|
else:
|
|
655
688
|
# Outside quotes, backslash escapes next character
|
|
656
|
-
if next_char in "\"'":
|
|
689
|
+
if next_char in "\"'{}":
|
|
690
|
+
# Preserve backslash for quotes and braces so parser
|
|
691
|
+
# can create EscapedPart (prevents brace expansion)
|
|
657
692
|
value += char + next_char
|
|
658
693
|
else:
|
|
659
694
|
value += next_char
|
|
@@ -707,6 +742,30 @@ class Lexer:
|
|
|
707
742
|
col += 1
|
|
708
743
|
continue
|
|
709
744
|
|
|
745
|
+
# Handle $[...] legacy arithmetic expansion
|
|
746
|
+
if char == "$" and pos + 1 < input_len and input_text[pos + 1] == "[":
|
|
747
|
+
value += char
|
|
748
|
+
pos += 1
|
|
749
|
+
col += 1
|
|
750
|
+
value += input_text[pos] # Add the [
|
|
751
|
+
pos += 1
|
|
752
|
+
col += 1
|
|
753
|
+
|
|
754
|
+
# Track bracket depth
|
|
755
|
+
depth = 1
|
|
756
|
+
while depth > 0 and pos < input_len:
|
|
757
|
+
c = input_text[pos]
|
|
758
|
+
value += c
|
|
759
|
+
|
|
760
|
+
if c == "[":
|
|
761
|
+
depth += 1
|
|
762
|
+
elif c == "]":
|
|
763
|
+
depth -= 1
|
|
764
|
+
|
|
765
|
+
pos += 1
|
|
766
|
+
col += 1
|
|
767
|
+
continue
|
|
768
|
+
|
|
710
769
|
# Handle ${...} parameter expansion
|
|
711
770
|
if char == "$" and pos + 1 < input_len and input_text[pos + 1] == "{":
|
|
712
771
|
value += char
|
|
@@ -737,9 +796,7 @@ class Lexer:
|
|
|
737
796
|
pos += 1
|
|
738
797
|
col += 1
|
|
739
798
|
# Read variable name
|
|
740
|
-
while pos < input_len and (
|
|
741
|
-
input_text[pos].isalnum() or input_text[pos] == "_"
|
|
742
|
-
):
|
|
799
|
+
while pos < input_len and (input_text[pos].isalnum() or input_text[pos] == "_"):
|
|
743
800
|
value += input_text[pos]
|
|
744
801
|
pos += 1
|
|
745
802
|
col += 1
|
|
@@ -811,6 +868,24 @@ class Lexer:
|
|
|
811
868
|
column=column,
|
|
812
869
|
)
|
|
813
870
|
|
|
871
|
+
# Build segments from boundaries if we had quoting transitions
|
|
872
|
+
final_segments = None
|
|
873
|
+
if seg_boundaries:
|
|
874
|
+
final_segments = []
|
|
875
|
+
prev_offset = 0
|
|
876
|
+
for offset, mode in seg_boundaries:
|
|
877
|
+
text = value[prev_offset:offset]
|
|
878
|
+
if text:
|
|
879
|
+
final_segments.append((text, mode))
|
|
880
|
+
prev_offset = offset
|
|
881
|
+
# Add final segment
|
|
882
|
+
final_text = value[prev_offset:]
|
|
883
|
+
if final_text:
|
|
884
|
+
final_segments.append((final_text, seg_mode))
|
|
885
|
+
# Only use segments if there are multiple (mixed quoting)
|
|
886
|
+
if len(final_segments) <= 1:
|
|
887
|
+
final_segments = None
|
|
888
|
+
|
|
814
889
|
return Token(
|
|
815
890
|
type=TokenType.WORD,
|
|
816
891
|
value=value,
|
|
@@ -820,6 +895,7 @@ class Lexer:
|
|
|
820
895
|
column=column,
|
|
821
896
|
quoted=quoted,
|
|
822
897
|
single_quoted=single_quoted,
|
|
898
|
+
segments=final_segments,
|
|
823
899
|
)
|
|
824
900
|
|
|
825
901
|
def _register_heredoc_from_lookahead(self, strip_tabs: bool) -> None:
|
|
@@ -856,13 +932,15 @@ class Lexer:
|
|
|
856
932
|
|
|
857
933
|
if in_single_quote:
|
|
858
934
|
if c == "'":
|
|
935
|
+
in_single_quote = False
|
|
859
936
|
pos += 1
|
|
860
|
-
|
|
937
|
+
continue
|
|
861
938
|
delimiter += c
|
|
862
939
|
elif in_double_quote:
|
|
863
940
|
if c == '"':
|
|
941
|
+
in_double_quote = False
|
|
864
942
|
pos += 1
|
|
865
|
-
|
|
943
|
+
continue
|
|
866
944
|
delimiter += c
|
|
867
945
|
else:
|
|
868
946
|
if c in " \t\n;|&<>()":
|
|
@@ -873,6 +951,17 @@ class Lexer:
|
|
|
873
951
|
pos += 2
|
|
874
952
|
quoted = True # Backslash makes it quoted
|
|
875
953
|
continue
|
|
954
|
+
# Handle embedded quotes (e.g., E'O'F)
|
|
955
|
+
if c == "'":
|
|
956
|
+
in_single_quote = True
|
|
957
|
+
quoted = True
|
|
958
|
+
pos += 1
|
|
959
|
+
continue
|
|
960
|
+
if c == '"':
|
|
961
|
+
in_double_quote = True
|
|
962
|
+
quoted = True
|
|
963
|
+
pos += 1
|
|
964
|
+
continue
|
|
876
965
|
delimiter += c
|
|
877
966
|
|
|
878
967
|
pos += 1
|
|
@@ -946,3 +1035,183 @@ def tokenize(input_text: str) -> list[Token]:
|
|
|
946
1035
|
"""Convenience function to tokenize input."""
|
|
947
1036
|
lexer = Lexer(input_text)
|
|
948
1037
|
return lexer.tokenize()
|
|
1038
|
+
|
|
1039
|
+
|
|
1040
|
+
# HTML entity mappings
|
|
1041
|
+
HTML_ENTITIES: dict[str, str] = {
|
|
1042
|
+
"&lt;": "<",
|
|
1043
|
+
"&gt;": ">",
|
|
1044
|
+
"&amp;": "&",
|
|
1045
|
+
"&quot;": '"',
|
|
1046
|
+
"&apos;": "'",
|
|
1047
|
+
}
|
|
1048
|
+
|
|
1049
|
+
|
|
1050
|
+
def unescape_html_entities(input_text: str) -> str:
|
|
1051
|
+
"""Unescape HTML entities in operator positions (outside quotes and heredocs).
|
|
1052
|
+
|
|
1053
|
+
This handles LLM-generated bash commands that contain HTML-escaped
|
|
1054
|
+
operators like &lt; instead of <.
|
|
1055
|
+
|
|
1056
|
+
Only unescapes entities outside of:
|
|
1057
|
+
- Single quotes
|
|
1058
|
+
- Double quotes
|
|
1059
|
+
- Heredoc content
|
|
1060
|
+
|
|
1061
|
+
Args:
|
|
1062
|
+
input_text: The bash script that may contain HTML entities.
|
|
1063
|
+
|
|
1064
|
+
Returns:
|
|
1065
|
+
The script with HTML entities unescaped in operator positions.
|
|
1066
|
+
"""
|
|
1067
|
+
result: list[str] = []
|
|
1068
|
+
i = 0
|
|
1069
|
+
n = len(input_text)
|
|
1070
|
+
in_single_quote = False
|
|
1071
|
+
in_double_quote = False
|
|
1072
|
+
heredoc_delimiter: str | None = None # None means not in heredoc
|
|
1073
|
+
|
|
1074
|
+
while i < n:
|
|
1075
|
+
char = input_text[i]
|
|
1076
|
+
|
|
1077
|
+
# If we're in a heredoc, look for the end delimiter
|
|
1078
|
+
if heredoc_delimiter is not None:
|
|
1079
|
+
# Check if this line matches the heredoc delimiter
|
|
1080
|
+
line_start = i
|
|
1081
|
+
line_end = input_text.find("\n", i)
|
|
1082
|
+
if line_end == -1:
|
|
1083
|
+
line_end = n
|
|
1084
|
+
|
|
1085
|
+
line = input_text[line_start:line_end]
|
|
1086
|
+
# For <<- heredocs, delimiter may be preceded by tabs
|
|
1087
|
+
stripped_line = line.lstrip("\t")
|
|
1088
|
+
|
|
1089
|
+
if stripped_line == heredoc_delimiter or line == heredoc_delimiter:
|
|
1090
|
+
# End of heredoc - output the line and exit heredoc mode
|
|
1091
|
+
if line_end < n:
|
|
1092
|
+
result.append(input_text[i : line_end + 1])
|
|
1093
|
+
i = line_end + 1
|
|
1094
|
+
else:
|
|
1095
|
+
result.append(input_text[i:line_end])
|
|
1096
|
+
i = line_end
|
|
1097
|
+
heredoc_delimiter = None
|
|
1098
|
+
continue
|
|
1099
|
+
|
|
1100
|
+
# Still in heredoc - output the entire line as-is
|
|
1101
|
+
if line_end < n:
|
|
1102
|
+
result.append(input_text[i : line_end + 1])
|
|
1103
|
+
i = line_end + 1
|
|
1104
|
+
else:
|
|
1105
|
+
result.append(input_text[i:line_end])
|
|
1106
|
+
i = line_end
|
|
1107
|
+
continue
|
|
1108
|
+
|
|
1109
|
+
# Handle quote state tracking
|
|
1110
|
+
if char == "'" and not in_double_quote:
|
|
1111
|
+
in_single_quote = not in_single_quote
|
|
1112
|
+
result.append(char)
|
|
1113
|
+
i += 1
|
|
1114
|
+
continue
|
|
1115
|
+
|
|
1116
|
+
if char == '"' and not in_single_quote:
|
|
1117
|
+
in_double_quote = not in_double_quote
|
|
1118
|
+
result.append(char)
|
|
1119
|
+
i += 1
|
|
1120
|
+
continue
|
|
1121
|
+
|
|
1122
|
+
# Handle backslash escapes (only outside single quotes)
|
|
1123
|
+
if char == "\\" and not in_single_quote and i + 1 < n:
|
|
1124
|
+
# Keep the backslash and next character as-is
|
|
1125
|
+
result.append(char)
|
|
1126
|
+
result.append(input_text[i + 1])
|
|
1127
|
+
i += 2
|
|
1128
|
+
continue
|
|
1129
|
+
|
|
1130
|
+
# Detect heredoc start (only outside quotes)
|
|
1131
|
+
if not in_single_quote and not in_double_quote and char == "<":
|
|
1132
|
+
# Check for << or <<-
|
|
1133
|
+
if i + 1 < n and input_text[i + 1] == "<":
|
|
1134
|
+
strip_tabs = i + 2 < n and input_text[i + 2] == "-"
|
|
1135
|
+
heredoc_op_len = 3 if strip_tabs else 2
|
|
1136
|
+
|
|
1137
|
+
# Output the << or <<-
|
|
1138
|
+
result.append(input_text[i : i + heredoc_op_len])
|
|
1139
|
+
i += heredoc_op_len
|
|
1140
|
+
|
|
1141
|
+
# Skip whitespace after operator
|
|
1142
|
+
while i < n and input_text[i] in " \t":
|
|
1143
|
+
result.append(input_text[i])
|
|
1144
|
+
i += 1
|
|
1145
|
+
|
|
1146
|
+
if i >= n:
|
|
1147
|
+
continue
|
|
1148
|
+
|
|
1149
|
+
# Parse the delimiter
|
|
1150
|
+
delimiter = ""
|
|
1151
|
+
|
|
1152
|
+
if input_text[i] == "'":
|
|
1153
|
+
# Single-quoted delimiter
|
|
1154
|
+
result.append("'")
|
|
1155
|
+
i += 1
|
|
1156
|
+
while i < n and input_text[i] != "'":
|
|
1157
|
+
delimiter += input_text[i]
|
|
1158
|
+
result.append(input_text[i])
|
|
1159
|
+
i += 1
|
|
1160
|
+
if i < n:
|
|
1161
|
+
result.append("'")
|
|
1162
|
+
i += 1
|
|
1163
|
+
elif input_text[i] == '"':
|
|
1164
|
+
# Double-quoted delimiter
|
|
1165
|
+
result.append('"')
|
|
1166
|
+
i += 1
|
|
1167
|
+
while i < n and input_text[i] != '"':
|
|
1168
|
+
delimiter += input_text[i]
|
|
1169
|
+
result.append(input_text[i])
|
|
1170
|
+
i += 1
|
|
1171
|
+
if i < n:
|
|
1172
|
+
result.append('"')
|
|
1173
|
+
i += 1
|
|
1174
|
+
else:
|
|
1175
|
+
# Unquoted delimiter
|
|
1176
|
+
while i < n and input_text[i] not in " \t\n;|&<>()":
|
|
1177
|
+
if input_text[i] == "\\" and i + 1 < n:
|
|
1178
|
+
# Backslash-escaped character in delimiter
|
|
1179
|
+
delimiter += input_text[i + 1]
|
|
1180
|
+
result.append(input_text[i : i + 2])
|
|
1181
|
+
i += 2
|
|
1182
|
+
else:
|
|
1183
|
+
delimiter += input_text[i]
|
|
1184
|
+
result.append(input_text[i])
|
|
1185
|
+
i += 1
|
|
1186
|
+
|
|
1187
|
+
# Find the end of this line (heredoc content starts on next line)
|
|
1188
|
+
while i < n and input_text[i] != "\n":
|
|
1189
|
+
result.append(input_text[i])
|
|
1190
|
+
i += 1
|
|
1191
|
+
|
|
1192
|
+
if i < n:
|
|
1193
|
+
result.append("\n")
|
|
1194
|
+
i += 1
|
|
1195
|
+
# Now in heredoc mode
|
|
1196
|
+
heredoc_delimiter = delimiter
|
|
1197
|
+
|
|
1198
|
+
continue
|
|
1199
|
+
|
|
1200
|
+
# Only attempt HTML entity replacement outside quotes
|
|
1201
|
+
if not in_single_quote and not in_double_quote and char == "&":
|
|
1202
|
+
# Check for HTML entities
|
|
1203
|
+
matched = False
|
|
1204
|
+
for entity, replacement in HTML_ENTITIES.items():
|
|
1205
|
+
if input_text[i:].startswith(entity):
|
|
1206
|
+
result.append(replacement)
|
|
1207
|
+
i += len(entity)
|
|
1208
|
+
matched = True
|
|
1209
|
+
break
|
|
1210
|
+
if matched:
|
|
1211
|
+
continue
|
|
1212
|
+
|
|
1213
|
+
# Regular character
|
|
1214
|
+
result.append(char)
|
|
1215
|
+
i += 1
|
|
1216
|
+
|
|
1217
|
+
return "".join(result)
|