rubish-gem 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/.dockerignore +23 -0
  3. data/Dockerfile +54 -0
  4. data/LICENSE.txt +21 -0
  5. data/README.md +39 -0
  6. data/Rakefile +12 -0
  7. data/lib/rubish/arithmetic.rb +140 -0
  8. data/lib/rubish/ast.rb +168 -0
  9. data/lib/rubish/builtins/arithmetic.rb +129 -0
  10. data/lib/rubish/builtins/bind_readline.rb +834 -0
  11. data/lib/rubish/builtins/directory_stack.rb +182 -0
  12. data/lib/rubish/builtins/echo_printf.rb +510 -0
  13. data/lib/rubish/builtins/hash_directories.rb +260 -0
  14. data/lib/rubish/builtins/read.rb +299 -0
  15. data/lib/rubish/builtins/trap.rb +324 -0
  16. data/lib/rubish/codegen.rb +1273 -0
  17. data/lib/rubish/completion.rb +840 -0
  18. data/lib/rubish/completions/bash_helpers.rb +530 -0
  19. data/lib/rubish/completions/git.rb +431 -0
  20. data/lib/rubish/completions/help_parser.rb +453 -0
  21. data/lib/rubish/completions/ssh.rb +114 -0
  22. data/lib/rubish/config.rb +267 -0
  23. data/lib/rubish/data/builtin_help.rb +716 -0
  24. data/lib/rubish/data/completion_data.rb +53 -0
  25. data/lib/rubish/data/readline_config.rb +47 -0
  26. data/lib/rubish/data/shell_options.rb +251 -0
  27. data/lib/rubish/data_define.rb +65 -0
  28. data/lib/rubish/execution_context.rb +1124 -0
  29. data/lib/rubish/expansion.rb +988 -0
  30. data/lib/rubish/history.rb +663 -0
  31. data/lib/rubish/lazy_loader.rb +127 -0
  32. data/lib/rubish/lexer.rb +1194 -0
  33. data/lib/rubish/parser.rb +1167 -0
  34. data/lib/rubish/prompt.rb +766 -0
  35. data/lib/rubish/repl.rb +2267 -0
  36. data/lib/rubish/runtime/builtins.rb +7222 -0
  37. data/lib/rubish/runtime/command.rb +1153 -0
  38. data/lib/rubish/runtime/job.rb +153 -0
  39. data/lib/rubish/runtime.rb +1169 -0
  40. data/lib/rubish/shell_state.rb +241 -0
  41. data/lib/rubish/startup_profiler.rb +67 -0
  42. data/lib/rubish/version.rb +5 -0
  43. data/lib/rubish.rb +60 -0
  44. data/sig/rubish.rbs +4 -0
  45. metadata +85 -0
@@ -0,0 +1,1194 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Rubish
4
+ class Lexer
5
# A lexer token: a symbolic type plus the payload value attached to it.
Token = Data.define(:type, :value)

# Operator spellings mapped to the token type emitted for them.
OPERATORS = {
  '|' => :PIPE,
  '|&' => :PIPE_BOTH,        # cmd1 |& cmd2 pipes both stdout and stderr (cmd1 2>&1 | cmd2)
  ';' => :SEMICOLON,
  ';;' => :DOUBLE_SEMI,      # terminates a case pattern body
  ';&' => :CASE_FALL,        # case fall-through: run the next pattern body
  ';;&' => :CASE_CONT,       # case continue: keep testing later patterns
  '&' => :AMPERSAND,
  '>' => :REDIRECT_OUT,
  '>|' => :REDIRECT_CLOBBER, # overwrite even when noclobber is set
  '>>' => :REDIRECT_APPEND,
  '<' => :REDIRECT_IN,
  '<<' => :HEREDOC,          # here document
  '<<-' => :HEREDOC_INDENT,  # here document with indented delimiter
  '<<<' => :HERESTRING,      # here string
  '2>' => :REDIRECT_ERR,
  '>&' => :DUP_OUT,          # duplicate an output file descriptor
  '<&' => :DUP_IN,           # duplicate an input file descriptor
  '&&' => :AND,
  '||' => :OR,
  '(' => :LPAREN,
  ')' => :RPAREN,
  '()' => :PARENS,           # function definition form: name() { }
  '{' => :LBRACE,
  '}' => :RBRACE
}.freeze

# Reserved words mapped to their dedicated token types.
# 'do', 'done', 'in' and 'end' are deliberately absent: they are lexed as
# WORD tokens and recognised by the parser, so they stay usable as ordinary
# command arguments (e.g. "echo done").
KEYWORDS = {
  'if' => :IF,
  'unless' => :UNLESS,
  'then' => :THEN,
  'else' => :ELSE,
  'elif' => :ELIF,
  'elsif' => :ELSIF,
  'fi' => :FI,
  'while' => :WHILE,
  'until' => :UNTIL,
  'for' => :FOR,
  'select' => :SELECT,
  'function' => :FUNCTION,
  'def' => :DEF,
  'case' => :CASE,
  'when' => :WHEN,
  'esac' => :ESAC,
  'coproc' => :COPROC,
  'time' => :TIME,
  'lazy_load' => :LAZY_LOAD
}.freeze
57
+
58
# Set up a lexer over +input+: the cursor starts at offset 0 and no
# previous token type or word value has been recorded yet.
def initialize(input)
  @input = input
  @pos = 0
  @last_token_type = @last_word_value = nil
end
64
+
65
# Run the lexer from the current cursor to the end of the input.
# Returns the tokens in source order. While scanning it remembers the type
# of the last token and the value of the last WORD/SELECT token; read_token
# uses both to decide how a following `{` or `do` should be lexed.
def tokenize
  collected = []
  until @pos >= @input.length
    skip_whitespace
    break if @pos >= @input.length

    tok = read_token
    next unless tok # read_token yields nil when it only consumed separators

    collected << tok
    @last_token_type = tok.type
    @last_word_value = tok.value if %i[WORD SELECT].include?(tok.type)
  end
  collected
end
81
+
82
+ private
83
+
84
# Advance the cursor over spaces and tabs. Newlines are left in place
# because they act as command separators, like semicolons.
def skip_whitespace
  until @pos >= @input.length
    ch = @input[@pos]
    break unless ch == ' ' || ch == "\t"

    @pos += 1
  end
end
89
+
90
# Advance the cursor over a run of consecutive newline characters
# (called once a newline has been recognised as a separator).
def skip_newlines
  until @pos >= @input.length
    break unless @input[@pos] == "\n"

    @pos += 1
  end
end
94
+
95
# Lex one token starting at the cursor.
#
# Checks run from the most specific spelling to the least: newline
# separators, three-character here-string/here-doc operators, {varname}
# redirections, two-character constructs, the remaining operators, and
# finally Ruby-style literals and plain words. Returns a Token, or nil when
# only separators were consumed (or whatever the delegated reader returns).
# Readers such as read_herestring, read_heredoc_delimiter,
# read_varname_redirect, read_arithmetic_command and
# read_process_substitution are defined outside this method.
def read_token
  # A run of newlines becomes a single SEMICOLON separator, suppressed at
  # end of input or directly after another separator.
  if @input[@pos] == "\n"
    skip_newlines
    skip_whitespace
    return nil if @pos >= @input.length || @last_token_type == :SEMICOLON

    return Token.new(:SEMICOLON, "\n")
  end

  # Three-character operators must win over their two-character prefixes.
  case @input[@pos, 3]
  when '<<<'
    @pos += 3
    return read_herestring
  when '<<-'
    @pos += 3
    return read_heredoc_delimiter(:HEREDOC_INDENT)
  end

  # {varname} redirection: {fd}>file, {fd}<file, ...
  return read_varname_redirect if @input[@pos] == '{' && looks_like_varname_redirect?

  pair = @input[@pos, 2]
  case pair
  when '<<'
    @pos += 2
    return read_heredoc_delimiter(:HEREDOC)
  when '(('
    # (( expr )) arithmetic command versus ((cmd)) nested subshell: peek at
    # what follows the parens (after blanks) and apply heuristics.
    probe = @pos + 2
    probe += 1 while probe < @input.length && /[ \t]/.match?(@input[probe])
    head = @input[probe, 30] || ''
    arithmetic_starts = [
      /\A-?\d/,                                                        # number, possibly negative
      /\A\$/,                                                          # variable reference
      /\A[!~]/,                                                        # unary operators
      /\A(\+\+|--)[a-zA-Z_]/,                                          # pre-increment/decrement
      /\A[a-zA-Z_][a-zA-Z0-9_]*\s*(\+\+|--|[=+\-*\/%<>&|^]=?|\[)/,     # identifier followed by an operator
      /\A\(\s*[\d$!~(+-]/,                                             # grouped arithmetic expression
      /\A;/                                                            # empty init in for ((; cond; update))
    ]
    if arithmetic_starts.any? { |pattern| pattern.match?(head) }
      @pos += 2
      return read_arithmetic_command
    end
    # Not arithmetic: continue below so each ( is lexed as a subshell paren.
  when '[['
    # [[ opens an extended test only when followed by whitespace, '-', '!'
    # or end of input; otherwise it is a nested array such as [[1, 2], [3, 4]].
    follower = @input[@pos + 2]
    if follower.nil? || /[\s\-!]/.match?(follower)
      @pos += 2
      return Token.new(:DOUBLE_LBRACKET, '[[')
    end
  when ']]'
    @pos += 2
    return Token.new(:DOUBLE_RBRACKET, ']]')
  when '<('
    return read_process_substitution(:PROC_SUB_IN)
  when '>('
    return read_process_substitution(:PROC_SUB_OUT)
  end

  # ;;& has to be tried before the two-character ;; and ;& operators.
  if @input[@pos, 3] == ';;&'
    @pos += 3
    return Token.new(:CASE_CONT, ';;&')
  end

  if %w[>> >| 2> >& <& && || () ;; ;& |&].include?(pair)
    @pos += 2
    return Token.new(OPERATORS[pair], pair)
  end

  # Single-character operators. "()" was consumed above for function
  # definitions, so a lone "(" here opens a subshell.
  char = @input[@pos]
  if %w[| ; & > ) (].include?(char)
    @pos += 1
    return Token.new(OPERATORS[char], char)
  end

  # A lone < is input redirection (here-docs were handled earlier).
  if char == '<'
    @pos += 1
    return Token.new(:REDIRECT_IN, char)
  end

  # Words passed to each/map/select/detect take an implicit-variable block.
  implicit_block_words = %w[each .each map .map select .select detect .detect]

  case char
  when '['
    if /[\s]/.match?(@input[@pos + 1])
      # "[" followed by whitespace is the test command.
      @pos += 1
      return Token.new(:WORD, '[')
    end

    # Glob bracket such as [abc]file versus an array literal such as [1, 2, 3].
    looks_like_glob_bracket? ? read_word : read_array
  when '/'
    read_regexp_or_word
  when '{'
    if looks_like_brace_expansion?
      # {a,b,c} or {1..5} belongs to a word.
      read_word
    else
      # Look at the first non-whitespace character after the brace.
      peek = @pos + 1
      peek += 1 while peek < @input.length && /\s/.match?(@input[peek])
      if @input[peek] == '|' || implicit_block_words.include?(@last_word_value)
        # Ruby block: explicit { |x| ... } or an implicit block after
        # each/map/select/detect.
        read_block
      elsif %i[IF WHILE UNTIL ELIF ELSIF UNLESS CASE].include?(@last_token_type)
        # Ruby expression used as a condition: if { expr } / case { expr }
        read_ruby_condition
      else
        # Shell function body or standalone brace.
        @pos += 1
        Token.new(:LBRACE, '{')
      end
    end
  when '}'
    @pos += 1
    Token.new(:RBRACE, '}')
  when '.'
    # Method chain (.identifier(...) or .identifier { ... }) versus a hidden
    # file or relative path.
    if looks_like_method_chain_start?
      @pos += 1
      Token.new(:DOT, '.')
    else
      read_word
    end
  when 'd'
    # "do" followed by whitespace or | may start a Ruby block; a shell "do"
    # and words such as "done" are read as plain words.
    if @input[@pos, 2] == 'do' && /[\s|]/.match?(@input[@pos + 2])
      peek = @pos + 2
      peek += 1 while peek < @input.length && /\s/.match?(@input[peek])
      if @input[peek] == '|' || implicit_block_words.include?(@last_word_value)
        read_do_block
      else
        read_word
      end
    else
      read_word
    end
  else
    read_word
  end
end
294
+
295
# Decide whether the `[` at the cursor opens a glob bracket ([abc]file,
# [a-z]) rather than an array literal ([1, 2, 3]).
# Returns true for a glob pattern, false for an array.
def looks_like_glob_bracket?
  seen_comma = false
  ((@pos + 1)...@input.length).each do |idx|
    case @input[idx]
    when ']'
      # Word characters right after the closing bracket mean a glob.
      return true if /[a-zA-Z0-9_.\-]/.match?(@input[idx + 1])

      # Otherwise commas inside the brackets mark an array literal.
      return !seen_comma
    when ','
      seen_comma = true
    when /[\s]/
      # Whitespace inside the brackets suggests an array; globs are compact.
      return false
    end
  end
  false # no closing bracket found: treat as an array
end
320
+
321
# Decide whether the `{` at the cursor starts a brace expansion such as
# {a,b,c} or {1..5}. The braces must balance, contain no whitespace, and
# hold a comma or `..` at the outermost level. A function body like
# { cmd; } fails the whitespace test.
def looks_like_brace_expansion?
  idx = @pos + 1
  nesting = 1
  comma_seen = false
  dotdot_seen = false

  until idx >= @input.length || nesting.zero?
    ch = @input[idx]
    if ch == '{'
      nesting += 1
    elsif ch == '}'
      nesting -= 1
    elsif ch == ','
      comma_seen ||= (nesting == 1)
    elsif ch == '.' && @input[idx + 1] == '.'
      dotdot_seen ||= (nesting == 1)
      idx += 1 # step over the second dot as well
    elsif [' ', "\t", "\n"].include?(ch)
      # Whitespace inside the braces suggests a function body.
      return false
    end
    idx += 1
  end

  # Needs a matching closing brace plus a comma or range at the top level.
  nesting.zero? && (comma_seen || dotdot_seen)
end
354
+
355
# Consume an array literal starting at the `[` under the cursor, honouring
# nested brackets and skipping quoted strings, and return it as an ARRAY
# token holding the raw source text. An unterminated literal runs to the
# end of the input.
def read_array
  origin = @pos
  nesting = 0
  while @pos < @input.length
    case @input[@pos]
    when '"'
      read_double_quoted_string
      next
    when "'"
      read_single_quoted_string
      next
    when '['
      nesting += 1
    when ']'
      nesting -= 1
      if nesting.zero?
        @pos += 1 # include the closing bracket
        break
      end
    end
    @pos += 1
  end
  Token.new(:ARRAY, @input[origin...@pos])
end
379
+
380
# Called with the cursor on `/`: decide between a regexp literal and a path.
# A later `/` (after optional i/m/x/o flags) followed by a blank, an
# operator other than `{`, or end of input is a candidate regexp
# terminator. If everything between the slashes is made of path characters
# only, the text is read as a word instead. Otherwise the scan continues,
# as for paths like /tmp/file.
def read_regexp_or_word
  scan = @pos + 1
  while scan < @input.length
    ch = @input[scan]
    break if ch == ' ' || ch == "\t"

    if ch == '/' && scan > @pos + 1
      # Step over optional regexp flags after the candidate closing slash.
      tail = scan + 1
      tail += 1 while tail < @input.length && /[imxo]/.match?(@input[tail])
      follower = @input[tail]
      # `{` is excluded: it may be a brace expansion in a path like /tmp/{a,b}.
      terminated = follower.nil? || follower == ' ' || follower == "\t" ||
                   (follower != '{' && OPERATORS.key?(follower))
      if terminated
        # Content such as bin or opt/homebrew is a path, not a regexp.
        break if /\A[a-zA-Z0-9_.\-\/]+\z/.match?(@input[(@pos + 1)...scan])

        return read_regexp
      end
    end

    # A backslash escapes the character after it.
    scan += (ch == '\\' ? 2 : 1)
  end
  # No regexp found: lex as an ordinary word.
  read_word
end
419
+
420
# Consume a /regexp/ literal starting at the opening slash, including any
# trailing i/m/x/o flags, and return it as a REGEXP token holding the raw
# source text. Backslash escapes are skipped so that \/ does not close the
# literal.
def read_regexp
  origin = @pos
  @pos += 1 # opening slash
  until @pos >= @input.length
    ch = @input[@pos]
    if ch == '\\'
      @pos += 2 # the backslash and the escaped character
      next
    end

    @pos += 1
    next unless ch == '/'

    # Closing slash consumed; take the optional flags as well.
    @pos += 1 while @pos < @input.length && /[imxo]/.match?(@input[@pos])
    break
  end
  Token.new(:REGEXP, @input[origin...@pos])
end
439
+
440
# Consume a brace-delimited Ruby block starting at the `{` under the cursor,
# honouring nested braces and skipping quoted strings, and return it as a
# BLOCK token holding the raw source text with both braces included.
def read_block
  origin = @pos
  nesting = 0
  while @pos < @input.length
    case @input[@pos]
    when '"'
      read_double_quoted_string
      next
    when "'"
      read_single_quoted_string
      next
    when '{'
      nesting += 1
    when '}'
      nesting -= 1
      if nesting.zero?
        @pos += 1 # include the closing brace
        break
      end
    end
    @pos += 1
  end
  Token.new(:BLOCK, @input[origin...@pos])
end
464
+
465
# Consume a Ruby condition block `{ expression }` starting at the opening
# brace. Nested braces are balanced and quoted strings skipped. The result
# is a RUBY_CONDITION token whose value is the expression text with the
# outer braces removed and surrounding whitespace stripped.
def read_ruby_condition
  @pos += 1 # opening brace
  body_start = @pos
  nesting = 1

  while @pos < @input.length && nesting > 0
    case @input[@pos]
    when '{'
      nesting += 1
    when '}'
      nesting -= 1
      break if nesting.zero? # stay on the closing brace so it is excluded
    when '"'
      read_double_quoted_string
      next
    when "'"
      read_single_quoted_string
      next
    end
    @pos += 1
  end

  expression = @input[body_start...@pos].strip
  @pos += 1 # closing brace
  Token.new(:RUBY_CONDITION, expression)
end
493
+
494
# Consume a Ruby `do ... end` block starting at the `do` under the cursor
# and return it as a BLOCK token holding the raw source text. Nested
# `do`/`end` pairs are balanced. A keyword only counts when preceded by
# whitespace and followed by whitespace, `|`, end of input, or (for `end`)
# `;`. Quoted strings are skipped.
def read_do_block
  origin = @pos
  nesting = 1
  @pos += 2 # the leading 'do'
  while @pos < @input.length
    word_start = @pos == 0 || /\s/.match?(@input[@pos - 1])

    if word_start && @input[@pos, 2] == 'do'
      after_do = @input[@pos + 2]
      if after_do.nil? || /[\s|]/.match?(after_do)
        nesting += 1
        @pos += 2
        next
      end
    end

    if word_start && @input[@pos, 3] == 'end'
      after_end = @input[@pos + 3]
      if after_end.nil? || /[\s|;]/.match?(after_end)
        nesting -= 1
        if nesting.zero?
          @pos += 3 # include the closing 'end'
          break
        end
      end
    end

    case @input[@pos]
    when '"'
      read_double_quoted_string
      next
    when "'"
      read_single_quoted_string
      next
    end
    @pos += 1
  end
  Token.new(:BLOCK, @input[origin...@pos])
end
526
+
527
# Lex a word starting at the cursor. Quoted strings, backslash escapes,
# command/variable substitutions and brace expansions are swallowed as part
# of the word. The finished text is then classified as an array assignment
# (VAR=(...) / VAR+=(...)), a function-call form cmd(arg, ...), a keyword,
# or a plain WORD. Returns nil when no characters were consumed.
def read_word
  origin = @pos
  while @pos < @input.length
    char = @input[@pos]

    # `{` is decided before the generic operator test: directly after `$`
    # it belongs to ${VAR}; otherwise it is either a brace expansion (part
    # of the word) or an operator that ends the word.
    if char == '{'
      follows_dollar = @pos > origin && @input[@pos - 1] == '$'
      unless follows_dollar
        break unless looks_like_brace_expansion?

        read_brace_expansion
        next
      end
    end

    # Whitespace and operators (other than `{`, handled above) end the word.
    break if /[ \t\n]/.match?(char) || (char != '{' && OPERATORS.key?(char))
    break if ['>>', '2>', ';;'].include?(@input[@pos, 2])
    # `[` only ends the word at its very start, and only when it opens an
    # array; mid-word it is a glob such as file[12].txt.
    break if char == '[' && @pos == origin && !looks_like_glob_bracket?
    # `.` ends the word when it begins a method chain like ls.grep(/foo/),
    # but not in file.txt or ./script.
    break if char == '.' && looks_like_method_chain_start?

    case char
    when '\\'
      @pos += 2 # backslash plus the escaped character
    when '"'
      read_double_quoted_string
    when "'"
      read_single_quoted_string
    when '`'
      read_backtick_substitution
    when '$'
      case @input[@pos + 1]
      when "'" then read_ansi_c_quoted_string # $'...' ANSI-C quoting
      when '(' then read_command_substitution # $(...)
      when '{' then read_braced_variable      # ${VAR}
      else @pos += 1
      end
    else
      @pos += 1
    end
  end

  text = @input[origin...@pos]
  return nil if text.empty?

  # VAR=(...) or VAR+=(...)
  return read_array_assignment(text) if text.end_with?('=', '+=') && @input[@pos] == '('

  # cmd(arg1, arg2) call syntax, excluding: cmd() definitions, extglob
  # patterns, names following def/function, names that do not look like
  # commands, and parenthesised content that looks like Ruby code.
  call_syntax = @input[@pos] == '(' && @input[@pos + 1] != ')' &&
                !extglob_prefix?(text) && !%i[DEF FUNCTION].include?(@last_token_type) &&
                valid_func_call_name?(text) && !looks_like_ruby_call?
  return read_func_call(text) if call_syntax

  # Keywords get their own token type; everything else is a WORD.
  Token.new(KEYWORDS.fetch(text, :WORD), text)
end
609
+
610
# Read the parenthesised element list of an array assignment,
# VAR=(elem1 elem2 ...) or VAR+=(...), with the cursor on the opening paren.
#
# var_part - the already-lexed text before the paren (e.g. "arr=" or "arr+=").
#
# Returns an ARRAY_ASSIGN token whose value is a hash with :var (var_part)
# and :elements (the raw element strings in order).
#
# Elements may be separated by spaces, tabs or newlines, so multi-line array
# definitions are accepted. Newlines must be skipped here: read_array_element
# stops at a newline without consuming it, so leaving one under the cursor
# would make this loop spin forever.
def read_array_assignment(var_part)
  @pos += 1 # skip opening (
  elements = []

  while @pos < @input.length
    # Skip blanks and line breaks between elements.
    @pos += 1 while @pos < @input.length && /[ \t\n]/.match?(@input[@pos])
    break if @pos >= @input.length || @input[@pos] == ')'

    elem = read_array_element
    elements << elem if elem && !elem.empty?
  end

  @pos += 1 if @input[@pos] == ')' # skip closing )

  Token.new(:ARRAY_ASSIGN, {var: var_part, elements: elements})
end
627
+
628
# Read one element of an array assignment, stopping at whitespace or the
# closing paren. Quoted strings, $'...' strings, $(...) and ${...} are
# consumed whole so separators inside them do not end the element.
# Returns the raw element text (empty when nothing was consumed).
def read_array_element
  origin = @pos

  until @pos >= @input.length
    ch = @input[@pos]
    break if [' ', "\t", "\n", ')'].include?(ch)

    case ch
    when '"'
      read_double_quoted_string
    when "'"
      read_single_quoted_string
    when '$'
      case @input[@pos + 1]
      when "'" then read_ansi_c_quoted_string
      when '(' then read_command_substitution
      when '{' then read_braced_variable
      else @pos += 1
      end
    else
      @pos += 1
    end
  end

  @input[origin...@pos]
end
654
+
655
# Read call syntax cmd(arg1, arg2, ...) with the cursor on the opening paren.
#
# name - the command/function name lexed just before the paren.
#
# Returns a FUNC_CALL token whose value is a hash with :name and :args (the
# raw argument strings). Arguments are comma-separated; blanks around them
# are ignored. Parsing stops at the closing paren or at any character that
# is neither a comma nor a paren after an argument.
def read_func_call(name)
  @pos += 1 # opening paren
  args = []
  skip_blanks = -> { @pos += 1 while @pos < @input.length && /[ \t]/.match?(@input[@pos]) }

  while @pos < @input.length
    skip_blanks.call
    break if @input[@pos] == ')'

    arg = read_func_call_arg
    args << arg if arg && !arg.empty?

    skip_blanks.call

    # Only a comma continues the argument list; a closing paren or anything
    # unexpected ends it.
    break unless @input[@pos] == ','

    @pos += 1
  end

  @pos += 1 if @input[@pos] == ')' # closing paren

  Token.new(:FUNC_CALL, {name: name, args: args})
end
687
+
688
# Read a single argument inside cmd(...) and return its raw source text.
# Arguments that begin with a quote, $', `/` or `[` are handed to the
# matching specialised reader. Anything else is read as a word that ends at
# a blank, a comma or the closing paren.
def read_func_call_arg
  origin = @pos

  case @input[@pos]
  when '"'
    read_double_quoted_string
    return @input[origin...@pos]
  when "'"
    read_single_quoted_string
    return @input[origin...@pos]
  when '$'
    if @input[@pos + 1] == "'"
      read_ansi_c_quoted_string
      return @input[origin...@pos]
    end
  when '/'
    # Regexp or path.
    return read_func_call_slash_arg
  when '['
    # Array literal.
    read_array
    return @input[origin...@pos]
  end

  # Plain word argument.
  until @pos >= @input.length
    ch = @input[@pos]
    break if [' ', "\t", ',', ')'].include?(ch)

    case ch
    when '\\'
      @pos += 2
    when '"'
      read_double_quoted_string
    when "'"
      read_single_quoted_string
    when '$'
      case @input[@pos + 1]
      when '(' then read_command_substitution
      when '{' then read_braced_variable
      else @pos += 1
      end
    else
      @pos += 1
    end
  end

  @input[origin...@pos]
end
741
+
742
# Read a function-call argument that starts with `/`, which is either a
# path or a regexp literal, and return its raw source text.
#
# Path:   only alphanumerics, _, ., - and / between slashes. Reading
#         continues across slashes until a blank, comma or closing paren.
# Regexp: a regexp metacharacter (* + ? ^ $ [ ] ( ) | \) has been seen, or
#         the content so far contains a non-path character. The next
#         unescaped `/` closes the literal and optional i/m/x/o flags are
#         consumed.
#
# `.` is a path character, not a regexp marker, so /etc/conf.d/app.conf is
# read as one path. A backslash always escapes the following character, so
# an escaped slash at the very start of a regexp (/\/tmp/) does not close it.
def read_func_call_slash_arg
  start = @pos
  @pos += 1 # skip opening /

  has_metachar = false

  while @pos < @input.length
    char = @input[@pos]

    # A blank, comma or closing paren ends the argument.
    break if /[ \t]/.match?(char) || char == ',' || char == ')'

    if char == '/'
      content = @input[(start + 1)...@pos]
      @pos += 1
      # Path-like so far: this slash is only a separator, keep reading.
      next unless has_metachar || content !~ /\A[a-zA-Z0-9_.\-\/]*\z/

      # Regexp: the slash just consumed closes it; take the flags too.
      @pos += 1 while @pos < @input.length && /[imxo]/.match?(@input[@pos])
      break
    elsif char == '\\'
      # Escape: skip the escaped character and remember this is a regexp.
      has_metachar = true
      @pos += 2
    else
      has_metachar = true if /[*+?^$\[\]()|]/.match?(char)
      @pos += 1
    end
  end

  @input[start...@pos]
end
788
+
789
# Tell whether +word+ (the text lexed just before a `(`) is an extglob
# prefix, so that the paren belongs to a pattern such as foo?(bar), *(pat),
# @(a|b) or !(neg) rather than to a function call.
#
# Returns true when word is empty (a standalone @( or !( etc.) or ends with
# one of ? * + @ !, and false otherwise. Words made up entirely of glob
# characters end with one of them, so they need no separate check.
def extglob_prefix?(word)
  word.empty? || /[?*+@!]\z/.match?(word)
end
798
+
799
# Tell whether the `.` at the cursor begins a method chain:
#   .identifier(args)   - call with arguments
#   .identifier { ... } - call with a block, either { |x| ... } or, for
#                         each/map/select/detect, a block without params
# Hidden files (.hidden), relative paths (./x) and decimals (.5) fail
# because no identifier plus paren/block follows the dot.
def looks_like_method_chain_start?
  # The dot must be followed by an identifier that starts with a letter or _.
  head = /\G\.([a-zA-Z_][a-zA-Z0-9_]*)/.match(@input, @pos)
  return false unless head

  method_name = head[1]
  after_name = head.end(0)

  # Block form: optional blanks, then `{`.
  if /\G[ \t]*\{/.match?(@input, after_name)
    # Explicit block parameters: {|...| or { |...|
    return true if /\G[ \t]*\{\s*\|/.match?(@input, after_name)
    # Iteration methods may take a block without parameters.
    return true if %w[each map select detect].include?(method_name)
  end

  # Otherwise a `(` must follow the identifier directly.
  return false unless @input[after_name] == '('

  # Reject parens whose content looks like Ruby keyword arguments.
  !looks_like_ruby_method_chain?(after_name)
end
835
+
836
# Scan the parenthesised text whose opening paren is at +paren_pos+ and
# report whether it looks like Ruby code: true when, outside string
# literals, a `:` appears directly after a word character and is followed
# by whitespace, a word character or end of input (keyword-argument style
# such as "foo: bar"). Scanning stops at the matching closing paren.
def looks_like_ruby_method_chain?(paren_pos)
  idx = paren_pos + 1
  open_parens = 1
  quote = nil # the active quote character, nil outside string literals

  while idx < @input.length && open_parens > 0
    ch = @input[idx]

    if quote
      # Inside a string: only an unescaped matching quote ends it.
      quote = nil if ch == quote && @input[idx - 1] != '\\'
    elsif ch == '"' || ch == "'"
      quote = ch
    elsif ch == '('
      open_parens += 1
    elsif ch == ')'
      open_parens -= 1
    elsif ch == ':'
      before = idx > 0 ? @input[idx - 1] : nil
      after = @input[idx + 1]
      return true if /[a-zA-Z0-9_]/.match?(before) && (after.nil? || /[\s\w]/.match?(after))
    end

    idx += 1
  end

  false
end
872
+
873
# Tell whether +name+ can be the callee of cmd(...) call syntax.
#
# A valid name starts with a letter, digit, underscore, dot or slash (the
# last for paths such as /bin/ls). This rejects the empty string as well as
# words that begin with regexp metacharacters like ^ or $, so no separate
# checks for those cases are needed.
#
# Returns true or false.
def valid_func_call_name?(name)
  /\A[a-zA-Z_0-9.\/]/.match?(name)
end
883
+
884
# With the cursor on `(`, report whether the parenthesised text looks like
# Ruby code rather than shell arguments. Returns true when, outside string
# literals, a `:` directly follows a word character and precedes
# whitespace, a word character or end of input (e.g. "foo: bar" or
# "foo:bar", but not ":/path" or "$:"). Scanning stops at the matching
# closing paren.
def looks_like_ruby_call?
  idx = @pos + 1
  open_parens = 1
  quote = nil # the active quote character, nil outside string literals

  while idx < @input.length && open_parens > 0
    ch = @input[idx]

    if quote
      # Inside a string: only an unescaped matching quote ends it.
      quote = nil if ch == quote && @input[idx - 1] != '\\'
    elsif ch == '"' || ch == "'"
      quote = ch
    elsif ch == '('
      open_parens += 1
    elsif ch == ')'
      open_parens -= 1
    elsif ch == ':'
      before = idx > 0 ? @input[idx - 1] : nil
      after = @input[idx + 1]
      # Word character before, whitespace/word character after: likely Ruby.
      return true if /[a-zA-Z0-9_]/.match?(before) && (after.nil? || /[\s\w]/.match?(after))
    end

    idx += 1
  end

  false
end
924
+
925
def read_double_quoted_string
  # Advance @pos past a double-quoted string. @pos must be on the opening ".
  # A backslash consumes the following character as well, so \" does not end
  # the string.
  #
  # If the string is unterminated (or ends in a lone backslash) @pos stops at
  # the end of the input instead of running past it, matching the other
  # read_* helpers that guard their final increment.
  limit = @input.length
  @pos += 1 # skip opening "
  while @pos < limit && @input[@pos] != '"'
    @pos += @input[@pos] == '\\' ? 2 : 1
  end
  # Skip closing " when present; never leave @pos beyond the input.
  @pos = [@pos + 1, limit].min
end
936
+
937
def read_single_quoted_string
  # Advance @pos past a single-quoted string. @pos must be on the opening '.
  # Nothing is escaped inside single quotes: the next ' ends the string.
  #
  # If there is no closing quote, @pos stops at the end of the input rather
  # than one past it, matching the other read_* helpers that guard their final
  # increment.
  closing = @input.index("'", @pos + 1)
  @pos = closing ? closing + 1 : @input.length
end
942
+
943
def read_ansi_c_quoted_string
  # ANSI-C quoting: $'...' where a backslash escapes the next character
  # (so \' does not terminate the string). @pos must be on the '$'.
  @pos += 2 # step over $'
  until @pos >= @input.length
    case @input[@pos]
    when '\\'
      @pos += 2 # backslash plus the character it escapes
    when "'"
      @pos += 1 # closing quote ends the string
      return
    else
      @pos += 1
    end
  end
end
959
+
960
def read_command_substitution
  # Advance @pos past a $(...) command substitution. @pos must be on the '$'.
  # Nested parentheses are balanced; quoted strings are skipped as a unit so
  # parentheses inside them are not counted.
  @pos += 2 # step over $(
  open_parens = 1
  until open_parens.zero? || @pos >= @input.length
    case @input[@pos]
    when '"'
      read_double_quoted_string # leaves @pos after the string
    when "'"
      read_single_quoted_string # leaves @pos after the string
    when '('
      open_parens += 1
      @pos += 1
    when ')'
      open_parens -= 1
      @pos += 1
    else
      @pos += 1
    end
  end
end
980
+
981
def read_backtick_substitution
  # Advance @pos past a `...` substitution. @pos must be on the opening
  # backtick. A backslash consumes the next character too, so an escaped
  # backtick does not close the substitution.
  @pos += 1 # step over opening `
  while @pos < @input.length
    current = @input[@pos]
    @pos += current == '\\' ? 2 : 1
    break if current == '`' # that was the closing backtick
  end
end
997
+
998
def read_braced_variable
  # Advance @pos past a ${...} parameter expansion. @pos must be on the '$'.
  #
  # Nested parameter expansions such as ${a:-${b}} are balanced: each inner
  # "${" must be closed by its own "}" before the outer one ends. A bare "{"
  # does not open a new level. If the expansion is unterminated, @pos stops at
  # the end of the input.
  @pos += 2 # skip ${
  depth = 1
  while @pos < @input.length
    if @input[@pos] == '}'
      depth -= 1
      @pos += 1
      break if depth.zero?
    elsif @input[@pos, 2] == '${'
      depth += 1
      @pos += 2
    else
      @pos += 1
    end
  end
end
1004
+
1005
def read_brace_expansion
  # Consume a brace expansion pattern like {a,b,c} or {1..5}, including any
  # nested braces. Scanning stops right after the '}' that brings the nesting
  # level back to zero, or at the end of the input.
  nesting = 0
  until @pos >= @input.length
    current = @input[@pos]
    @pos += 1
    nesting += 1 if current == '{'
    next unless current == '}'

    nesting -= 1
    return if nesting.zero?
  end
end
1022
+
1023
def read_process_substitution(type)
  # Read <(...) or >(...) and return a Token of the given type whose value is
  # the command text between the parentheses. @pos must be on the '<' or '>'.
  # Quoted strings are skipped as a unit; nested parentheses are balanced.
  @pos += 2 # step over <( or >(
  body_start = @pos
  nesting = 1
  while @pos < @input.length
    case @input[@pos]
    when '"'
      read_double_quoted_string
      next
    when "'"
      read_single_quoted_string
      next
    when '('
      nesting += 1
    when ')'
      nesting -= 1
      break if nesting.zero? # leave @pos on the closing paren
    end
    @pos += 1
  end
  body = @input[body_start...@pos]
  @pos += 1 unless @pos >= @input.length # step over closing )
  Token.new(type, body)
end
1048
+
1049
def read_heredoc_delimiter(type)
  # Read the delimiter word that follows a heredoc operator and return it as a
  # Token of the given type.
  #
  # A delimiter wrapped in ' or " is reported as "<delimiter>:quoted" (quoted
  # means no variable expansion); an unquoted delimiter is the run of
  # letters, digits and underscores at the current position.
  skip_whitespace

  opener = @input[@pos]
  if ["'", '"'].include?(opener)
    @pos += 1 # step over opening quote
    from = @pos
    @pos += 1 until @pos >= @input.length || @input[@pos] == opener
    word = @input[from...@pos]
    @pos += 1 if @pos < @input.length # step over closing quote
    return Token.new(type, "#{word}:quoted")
  end

  from = @pos
  @pos += 1 while @pos < @input.length && @input[@pos] =~ /[a-zA-Z0-9_]/
  Token.new(type, @input[from...@pos])
end
1074
+
1075
def read_herestring
  # Read the word that follows a here-string operator and return it as a
  # :HERESTRING token. Quoted words keep their surrounding quotes in the token
  # value; an unquoted word ends at a space, a tab, or any character that is a
  # key of OPERATORS.
  skip_whitespace

  from = @pos
  case @input[@pos]
  when '"'
    read_double_quoted_string
  when "'"
    read_single_quoted_string
  else
    @pos += 1 until @pos >= @input.length || @input[@pos] =~ /[ \t]/ || OPERATORS.key?(@input[@pos])
  end

  Token.new(:HERESTRING, @input[from...@pos])
end
1100
+
1101
def read_arithmetic_command
  # Read the body of an arithmetic command up to its closing "))" and return
  # it as an :ARITH_CMD token with surrounding whitespace stripped. The
  # opening "((" has already been consumed, so @pos is on the first character
  # of the expression; on success @pos ends just after the closing "))".
  #
  # Grouping parentheses inside the expression are balanced one at a time, so
  # "(( x = (a + (b)) ))" yields "x = (a + (b))": a "))" only closes the
  # command when no inner parenthesis is still open.
  #
  # Raises RuntimeError when the input ends before the closing "))".
  expression = +''
  nesting = 0 # inner grouping parentheses currently open
  closed = false

  while @pos < @input.length
    char = @input[@pos]

    if char == '('
      nesting += 1
    elsif char == ')'
      if nesting > 0
        nesting -= 1
      elsif @input[@pos + 1] == ')'
        @pos += 2 # consume the closing ))
        closed = true
        break
      end
      # A lone ')' with nothing open is kept in the expression as-is.
    end

    expression << char
    @pos += 1
  end

  raise 'Expected "))" to close arithmetic command' unless closed

  Token.new(:ARITH_CMD, expression.strip)
end
1140
+
1141
# Check if current position is a {varname} redirection pattern
# Pattern: {identifier} followed by >, >>, >|, <, >&, <&
# Pure lookahead: @pos is not moved.
def looks_like_varname_redirect?
  return false unless @input[@pos] == '{'

  # Identifier must start with letter or underscore
  cursor = @pos + 1
  return false unless cursor < @input.length && @input[cursor].match?(/[a-zA-Z_]/)

  # Skip the rest of the identifier
  cursor += 1
  cursor += 1 while cursor < @input.length && @input[cursor].match?(/[a-zA-Z0-9_]/)

  # Must be followed by } (nil past the end of input, which also fails here)
  return false unless @input[cursor] == '}'

  # Must be followed immediately by a redirection operator
  after_brace = cursor + 1
  return false unless after_brace < @input.length

  %w[>> >| >& <& < >].any? { |op| @input[after_brace, op.length] == op }
end
1168
+
1169
# Consume a {varname} redirection such as {fd}>file or {fd}<file and return a
# :VARNAME_REDIRECT token whose value is a hash with :varname and :operator.
# @pos must be on the opening brace; it ends just after the operator.
def read_varname_redirect
  # The name is the run of word characters right after the opening brace.
  name_start = @pos + 1
  name_end = name_start
  name_end += 1 while name_end < @input.length && @input[name_end] =~ /[a-zA-Z0-9_]/
  name = @input[name_start...name_end]

  @pos = name_end + 1 # step over the closing }

  # Prefer a two-character operator; otherwise take a single character (> or <).
  pair = @input[@pos, 2]
  two_wide = %w[>> >| >& <&].include?(pair)
  operator = two_wide ? pair : @input[@pos]
  @pos += two_wide ? 2 : 1

  Token.new(:VARNAME_REDIRECT, {varname: name, operator: operator})
end
1193
+ end
1194
+ end