@contextfort-ai/openclaw-secure 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,183 @@
1
+ import re
2
+ import regex
3
+ from ..patterns import (
4
+ LOOKALIKE_TLDS,
5
+ INVALID_HOST_CHARS,
6
+ UNICODE_DOTS,
7
+ )
8
+ from ..data import get_known_domains, skeleton, is_known_domain
9
+ from .utils import levenshtein, extract_host_from_url
10
+
11
+
12
def check_non_ascii_hostname(command):
    """Report the first http(s) URL whose hostname holds a non-ASCII character.

    Returns:
        ("non_ascii_hostname", message) for the first offending URL, or
        None when no extracted hostname contains a code point above 127.
    """
    for found in re.finditer(r'https?://[^\s]+', command):
        host = extract_host_from_url(found.group(0))
        # Empty / missing hosts carry nothing to inspect.
        if not host:
            continue
        if not all(ord(ch) <= 127 for ch in host):
            return ("non_ascii_hostname", f"Non-ASCII characters in hostname: {host}")

    return None
21
+
22
+
23
def check_mixed_script_in_label(command):
    """Report the first hostname label that mixes two or more tracked scripts.

    Each dot-separated label of every http(s) URL host is examined on its
    own.  Hyphens and digits are ignored; every other character is
    classified against a fixed list of Unicode scripts, and characters in
    none of them are ignored too.

    Returns:
        ("mixed_script_in_label", message) for the first label containing
        more than one tracked script, otherwise None.
    """
    # (script name, pattern) pairs, tried in this order for each character.
    script_table = (
        ('Latin', r'\p{Script=Latin}'),
        ('Cyrillic', r'\p{Script=Cyrillic}'),
        ('Greek', r'\p{Script=Greek}'),
        ('Han', r'\p{Script=Han}'),
        ('Hiragana', r'\p{Script=Hiragana}'),
        ('Katakana', r'\p{Script=Katakana}'),
        ('Arabic', r'\p{Script=Arabic}'),
        ('Hebrew', r'\p{Script=Hebrew}'),
    )

    for url in re.findall(r'https?://[^\s]+', command):
        host = extract_host_from_url(url)
        if not host:
            continue

        for label in host.split("."):
            if not label:
                continue

            scripts = set()
            for char in label:
                if char == "-" or char.isdigit():
                    continue
                for script_name, script_pattern in script_table:
                    if regex.match(script_pattern, char):
                        scripts.add(script_name)
                        break

            if len(scripts) > 1:
                return ("mixed_script_in_label", f"Mixed scripts in hostname label '{label}': {scripts}")

    return None
61
+
62
+
63
def check_userinfo_trick(command):
    """Detect URLs whose userinfo part looks like a domain name.

    A URL such as ``https://github.com@evil.com/`` really points at
    ``evil.com``; the text before ``@`` is only userinfo.  Userinfo that
    contains a dot is treated as an attempt to impersonate a domain.

    The scheme is matched case-insensitively because URL schemes are
    case-insensitive, so ``HTTPS://...`` must not slip past the check.

    Returns:
        ("userinfo_trick", message) for the first suspicious URL, or None.
    """
    userinfo_parts = re.findall(
        r'https?://([^@/\s]+)@[^\s]+', command, flags=re.IGNORECASE
    )

    for userinfo in userinfo_parts:
        if "." in userinfo:
            return ("userinfo_trick", f"Domain-like userinfo in URL: {userinfo}@...")

    return None
71
+
72
+
73
def check_confusable_domain(command):
    """Detect hosts that imitate a known domain.

    For every http(s) URL host that is not itself in the known-domain
    collection, two comparisons are made against each known domain:

    * the host's confusable ``skeleton`` equals the known domain, or
    * the known domain is at least 8 characters long and the host is
      exactly one edit away from it.

    Returns:
        ("confusable_domain", message) for the first hit, or None when
        nothing matches or no known domains are available.
    """
    known_domains = get_known_domains()
    if not known_domains:
        return None

    for url in re.findall(r'https?://[^\s]+', command):
        host = extract_host_from_url(url)
        if not host:
            continue

        host_lower = host.lower()
        if host_lower in known_domains:
            continue

        host_skeleton = skeleton(host_lower)
        host_len = len(host_lower)

        for known in known_domains:
            if host_skeleton == known and host_lower != known:
                return ("confusable_domain", f"Domain '{host}' is visually similar to known domain '{known}'")

            # Short names are skipped: a single edit there is too common.
            if len(known) < 8:
                continue
            # An edit distance of 1 implies the lengths differ by at most 1,
            # so anything further apart is rejected before paying for the
            # quadratic distance computation.
            if abs(host_len - len(known)) > 1:
                continue
            if levenshtein(host_lower, known) == 1:
                return ("confusable_domain", f"Domain '{host}' is 1 edit from known domain '{known}'")

    return None
102
+
103
+
104
def check_invalid_host_chars(command):
    """Report the first hostname character that is not allowed in a host.

    Characters are tested, in order, against INVALID_HOST_CHARS, then
    UNICODE_DOTS, then for being a control character (below 0x20) or
    whitespace.

    Returns:
        ("invalid_host_chars", message) for the first offending character
        of the first offending URL, otherwise None.
    """
    candidate_hosts = (
        extract_host_from_url(url)
        for url in re.findall(r'https?://[^\s]+', command)
    )

    for host in candidate_hosts:
        if not host:
            continue

        for char in host:
            if char in INVALID_HOST_CHARS:
                return ("invalid_host_chars", f"Invalid character '{char}' in hostname")
            if char in UNICODE_DOTS:
                return ("invalid_host_chars", f"Unicode dot character in hostname: U+{ord(char):04X}")
            if char.isspace() or ord(char) < 0x20:
                return ("invalid_host_chars", "Control character or whitespace in hostname")

    return None
121
+
122
+
123
def check_trailing_dot_whitespace(command):
    """Report a hostname that ends in a dot or in a whitespace character.

    Returns:
        ("trailing_dot_whitespace", message) for the first http(s) URL
        whose host ends with "." or whitespace, otherwise None.
    """
    for found in re.finditer(r'https?://[^\s]+', command):
        host = extract_host_from_url(found.group(0))
        if not host:
            continue

        # host is non-empty here, so indexing the last character is safe.
        last_char = host[-1]
        if last_char == ".":
            return ("trailing_dot_whitespace", "Trailing dot in hostname")
        if last_char.isspace():
            return ("trailing_dot_whitespace", "Trailing whitespace in hostname")

    return None
137
+
138
+
139
def check_non_standard_port(command):
    """Report a known domain that is addressed on an unusual port.

    Ports 80, 443, 22 and 9418 are accepted silently.  Any other explicit
    port is reported only when ``is_known_domain`` accepts the lowercased
    host.

    Returns:
        ("non_standard_port", message) for the first hit, otherwise None.
    """
    accepted_ports = frozenset((80, 443, 22, 9418))

    for found in re.finditer(r'https?://([^:/\s]+):(\d+)', command):
        hostname, digits = found.groups()
        try:
            port_number = int(digits)
        except ValueError:
            # int() can still refuse a digit run (e.g. an absurdly long one).
            continue

        if port_number not in accepted_ports and is_known_domain(hostname.lower()):
            return ("non_standard_port", f"Non-standard port {port_number} on known domain '{hostname}'")

    return None
158
+
159
+
160
def check_lookalike_tld(command):
    """Report a URL whose top-level domain is listed in LOOKALIKE_TLDS.

    The host is obtained with ``extract_host_from_url`` (as the sibling
    checks do) rather than by splitting on ":" by hand, so credentials in
    the URL (``http://user:pw@evil.zip/``) no longer hide the real host.
    A trailing root dot (``evil.zip.``) is stripped before the TLD is read.

    Returns:
        ("lookalike_tld", message) for the first hit, otherwise None.
    """
    for url in re.findall(r'https?://[^\s]+', command):
        host = extract_host_from_url(url)
        if not host:
            continue

        host = host.lower().rstrip(".")
        _, dot, tld = host.rpartition(".")
        # A single-label host (no dot) has no TLD to compare.
        if dot and tld in LOOKALIKE_TLDS:
            return ("lookalike_tld", f"Lookalike TLD detected: .{tld}")

    return None
169
+
170
+
171
def check_punycode_domain(command):
    """Report any occurrence of the punycode prefix ``xn--`` in the command.

    The search is case-insensitive and covers the whole command text, not
    only URL hosts.

    Returns:
        ("punycode_domain", message) when the prefix is present, else None.
    """
    prefix_found = command.lower().find("xn--") != -1
    if not prefix_found:
        return None
    return ("punycode_domain", "Punycode domain detected (potential homograph)")
176
+
177
+
178
def check_raw_ip_url(command):
    """Report an http(s) URL that addresses a raw IPv4 address.

    Compared with a plain dotted-quad search this pattern:

    * matches the scheme case-insensitively (``HTTP://1.2.3.4``),
    * skips an optional ``userinfo@`` prefix (``http://u@1.2.3.4``), and
    * refuses hosts where the four numbers are followed by more name
      characters, e.g. ``http://1.2.3.4.example.com`` is a domain name,
      not an IP address.

    Returns:
        ("raw_ip_url", message) when such a URL is present, else None.
    """
    ipv4_pattern = (
        r'https?://'
        r'(?:[^@/\s]*@)?'                 # optional userinfo
        r'\d{1,3}(?:\.\d{1,3}){3}'        # dotted quad
        r'(?!\.?[\w-])'                   # must not continue as a hostname
    )
    if re.search(ipv4_pattern, command, flags=re.IGNORECASE):
        return ("raw_ip_url", "URL uses raw IP address instead of domain")

    return None
@@ -0,0 +1,57 @@
1
+ import re
2
+ from ..patterns import KNOWN_SENSITIVE_PATHS
3
+ from .utils import levenshtein
4
+
5
+
6
def check_non_ascii_path(command):
    """Report an http(s) URL whose path part contains non-ASCII characters.

    The path is everything after the first "/" that follows the scheme
    separator; URLs without such a slash are skipped.

    Returns:
        ("non_ascii_path", message) for the first hit, otherwise None.
    """
    for found in re.finditer(r'https?://[^\s]+', command):
        after_scheme = found.group(0).split("://")[1]
        _authority, slash, tail = after_scheme.partition("/")
        if not slash:
            # No path component at all.
            continue

        if not tail.isascii():
            return ("non_ascii_path", "Non-ASCII characters in URL path")

    return None
22
+
23
+
24
def check_homoglyph_in_path(command):
    """Report a path segment that mixes ASCII letters with non-ASCII characters
    and is within two edits of an entry in KNOWN_SENSITIVE_PATHS.

    Returns:
        ("homoglyph_in_path", message) for the first such segment,
        otherwise None.
    """
    for url in re.findall(r'https?://[^\s]+', command):
        pieces = url.split("://")[1].split("/", 1)
        if len(pieces) < 2:
            # URL has no path component.
            continue

        for segment in pieces[1].split("/"):
            if not segment:
                continue

            ascii_letter_present = any(c.isascii() and c.isalpha() for c in segment)
            non_ascii_present = any(ord(c) > 127 for c in segment)
            if not (ascii_letter_present and non_ascii_present):
                continue

            lowered = segment.lower()
            for known in KNOWN_SENSITIVE_PATHS:
                if levenshtein(lowered, known) <= 2:
                    return ("homoglyph_in_path", f"Potential homoglyph in path: '{segment}' looks like '{known}'")

    return None
50
+
51
+
52
def check_double_encoding(command):
    """Report a percent-encoded percent sign followed by two hex digits.

    ``%25XX`` decodes to ``%XX``, which decodes again on a second pass.

    Returns:
        ("double_encoding", message) when the sequence is present,
        otherwise None.
    """
    # The pattern itself starts with "%25", so no separate substring
    # pre-check is required.
    if re.search(r'%25[0-9a-fA-F]{2}', command) is None:
        return None
    return ("double_encoding", "Double-encoded URL path detected (%25XX)")
@@ -0,0 +1,44 @@
1
+ from ..patterns import (
2
+ BIDI_CONTROL_CHARS,
3
+ ZERO_WIDTH_CHARS,
4
+ HIDDEN_COMMAND_INDICATORS,
5
+ )
6
+
7
+
8
def check_terminal_injection(command):
    """Detect characters that can manipulate or deceive a terminal.

    Checks run in this order, and the first hit is returned:

    1. ANSI escape introducers ``ESC [``, ``ESC ]``, ``ESC _``, ``ESC P``.
    2. A scan for control characters.  Newline and tab are permitted.
       An ESC that was not part of one of the introducers above (for
       example ``ESC c``, which resets the terminal) is also reported as
       an escape sequence instead of being silently skipped.
    3. Bidirectional control characters (BIDI_CONTROL_CHARS).
    4. Zero-width characters (ZERO_WIDTH_CHARS).

    Returns:
        A ``(rule_id, message)`` tuple for the first finding, or None.
    """
    if any(intro in command for intro in ('\x1b[', '\x1b]', '\x1b_', '\x1bP')):
        return ("ansi_escapes", "ANSI escape sequences detected")

    for char in command:
        code = ord(char)
        if char == '\x1b':
            # Any remaining ESC still starts a terminal escape sequence.
            return ("ansi_escapes", "ANSI escape sequences detected")
        if code < 0x20 and char not in ('\n', '\t'):
            return ("control_chars", f"Control character detected: 0x{code:02x}")
        if code == 0x7f:
            return ("control_chars", "DEL control character detected")

    for char in command:
        if char in BIDI_CONTROL_CHARS:
            return ("bidi_controls", f"Bidirectional control character: U+{ord(char):04X}")

    for char in command:
        if char in ZERO_WIDTH_CHARS:
            return ("zero_width_chars", f"Zero-width character: U+{ord(char):04X}")

    return None
28
+
29
+
30
def check_hidden_multiline(command):
    """Report a continuation line that contains a hidden-command indicator.

    Only lines after the first are inspected; blank lines are skipped.
    The reported line number is 1-based and the quoted line is truncated
    to 60 characters.

    Returns:
        ("hidden_multiline", message) for the first hit, otherwise None.
    """
    _first_line, *later_lines = command.split('\n')
    if not later_lines:
        return None

    # The first continuation line is line 2 of the command.
    for line_number, raw_line in enumerate(later_lines, start=2):
        text = raw_line.strip()
        if not text:
            continue

        if any(indicator in text for indicator in HIDDEN_COMMAND_INDICATORS):
            return ("hidden_multiline", f"Hidden command on line {line_number}: {text[:60]}")

    return None
@@ -0,0 +1,155 @@
1
+ import re
2
+ from ..patterns import (
3
+ INTERPRETERS,
4
+ INSECURE_TLS_FLAGS,
5
+ URL_SHORTENERS,
6
+ CURL_UPLOAD_FLAGS,
7
+ WGET_UPLOAD_FLAGS,
8
+ )
9
+ from ..data import is_known_domain
10
+ from .utils import split_commands, extract_host_from_url
11
+
12
+
13
def check_insecure_tls_flags(command):
    """Report the first whitespace-separated word that is an insecure TLS flag.

    Surrounding single and double quotes are stripped from each word
    before it is compared with INSECURE_TLS_FLAGS.

    Returns:
        ("insecure_tls_flags", message) for the first hit, otherwise None.
    """
    unquoted_words = (word.strip("'\"") for word in command.split())
    flagged = next(
        (word for word in unquoted_words if word in INSECURE_TLS_FLAGS),
        None,
    )
    if flagged is None:
        return None
    return ("insecure_tls_flags", f"Insecure TLS flag: {flagged}")
21
+
22
+
23
def check_shortened_url(command):
    """Report the first URL_SHORTENERS entry found in the lowercased command.

    Matching is a plain substring test over the whole command text.

    Returns:
        ("shortened_url", message) for the first entry found, else None.
    """
    haystack = command.lower()
    found = next(
        (shortener for shortener in URL_SHORTENERS if shortener in haystack),
        None,
    )
    if found is None:
        return None
    return ("shortened_url", f"Shortened URL detected: {found}")
30
+
31
+
32
+ def _check_plain_http_to_sink_single(command):
33
+ """Check a single command for plain HTTP piped to sink."""
34
+ if "http://" not in command.lower():
35
+ return None
36
+
37
+ if "|" not in command:
38
+ return None
39
+
40
+ parts = command.split("|")
41
+ for part in parts[1:]:
42
+ words = part.strip().split()
43
+ if words:
44
+ cmd = words[0].lower().rsplit("/", 1)[-1]
45
+ if cmd in INTERPRETERS or cmd in ("sudo", "env"):
46
+ return ("plain_http_to_sink", "Plain HTTP URL piped to interpreter")
47
+
48
+ return None
49
+
50
+
51
def check_plain_http_to_sink(command):
    """Run the plain-HTTP-to-interpreter check on every sub-command.

    The command is divided with ``split_commands`` and the first truthy
    result from the per-command check is returned; None when there is none.
    """
    findings = map(_check_plain_http_to_sink_single, split_commands(command))
    return next((finding for finding in findings if finding), None)
58
+
59
+
60
+ def _check_schemeless_to_sink_single(command):
61
+ """Check a single command for schemeless URL piped to sink."""
62
+ if "|" not in command:
63
+ return None
64
+
65
+ schemeless_pattern = r'(?<!\w://)(?<!\w://www\.)([a-zA-Z0-9][-a-zA-Z0-9]*\.[a-zA-Z]{2,})/\S+'
66
+
67
+ parts = command.split("|")
68
+ first_part = parts[0]
69
+
70
+ if re.search(schemeless_pattern, first_part):
71
+ for part in parts[1:]:
72
+ words = part.strip().split()
73
+ if words:
74
+ cmd = words[0].lower().rsplit("/", 1)[-1]
75
+ if cmd in INTERPRETERS or cmd in ("sudo", "env"):
76
+ return ("schemeless_to_sink", "Schemeless URL piped to interpreter")
77
+
78
+ return None
79
+
80
+
81
def check_schemeless_to_sink(command):
    """Run the scheme-less-URL-to-interpreter check on every sub-command.

    The command is divided with ``split_commands`` and the first truthy
    result from the per-command check is returned; None when there is none.
    """
    findings = map(_check_schemeless_to_sink_single, split_commands(command))
    return next((finding for finding in findings if finding), None)
88
+
89
+
90
+ _LOCALHOST_HOSTS = {"localhost", "127.0.0.1", "::1"}
91
+
92
+
93
+ def _check_curl_upload_single(command):
94
+ """Check a single command for curl/wget uploading data to unknown hosts."""
95
+ words = command.split()
96
+ if not words:
97
+ return None
98
+
99
+ cmd = words[0].rsplit("/", 1)[-1].lower()
100
+ if cmd not in ("curl", "wget"):
101
+ return None
102
+
103
+ upload_flags = CURL_UPLOAD_FLAGS if cmd == "curl" else WGET_UPLOAD_FLAGS
104
+
105
+ has_upload_flag = False
106
+ for word in words[1:]:
107
+ clean = word.strip("'\"")
108
+ # Exact match: -d, --data, etc.
109
+ if clean in upload_flags:
110
+ has_upload_flag = True
111
+ break
112
+ # Prefix match for flag=value: --data=foo, --post-file=secrets.txt
113
+ for flag in upload_flags:
114
+ if clean.startswith(flag + "="):
115
+ has_upload_flag = True
116
+ break
117
+ if has_upload_flag:
118
+ break
119
+
120
+ if not has_upload_flag:
121
+ return None
122
+
123
+ # Extract URL and check host
124
+ for word in words[1:]:
125
+ clean = word.strip("'\"")
126
+ if re.match(r'https?://', clean, re.IGNORECASE):
127
+ host = extract_host_from_url(clean)
128
+ if not host:
129
+ continue
130
+ host_lower = host.lower()
131
+ if host_lower in _LOCALHOST_HOSTS:
132
+ return None
133
+ # Check full host first (e.g. api.github.com), then parent domain
134
+ if is_known_domain(host_lower):
135
+ return None
136
+ parts = host_lower.split(".")
137
+ if len(parts) > 2:
138
+ parent = ".".join(parts[-2:])
139
+ if is_known_domain(parent):
140
+ return None
141
+ return (
142
+ "curl_upload_to_unknown",
143
+ f"Data upload via {cmd} to unknown host: {host}",
144
+ )
145
+
146
+ return None
147
+
148
+
149
def check_curl_upload(command):
    """Run the curl/wget upload check on every sub-command.

    The command is divided with ``split_commands`` and the first truthy
    result from the per-command check is returned; None when there is none.
    """
    findings = map(_check_curl_upload_single, split_commands(command))
    return next((finding for finding in findings if finding), None)
@@ -0,0 +1,93 @@
1
def split_commands(command):
    """
    Break a shell command line into its sub-commands.

    Separators are ``&&``, ``||`` and ``;``; they only count outside of
    single and double quotes.  A backslash outside single quotes keeps the
    following character literal.  Each piece is whitespace-stripped and
    empty pieces are dropped.  If nothing remains, the original command is
    returned as the sole element.

    Note: Does not handle $() or () subshells.
    """
    pieces = []
    buffer = []
    single_open = False
    double_open = False
    total = len(command)
    pos = 0

    def flush():
        # Close the piece collected so far, if any.
        if buffer:
            pieces.append(''.join(buffer).strip())
            buffer.clear()

    while pos < total:
        ch = command[pos]

        # Backslash escape (literal inside single quotes, so skipped there).
        if ch == '\\' and not single_open and pos + 1 < total:
            buffer.extend(command[pos:pos + 2])
            pos += 2
            continue

        outside_quotes = not (single_open or double_open)
        if ch == '"' and not single_open:
            double_open = not double_open
        elif ch == "'" and not double_open:
            single_open = not single_open
        elif outside_quotes:
            if command[pos:pos + 2] in ('&&', '||'):
                flush()
                pos += 2
                continue
            if ch == ';':
                flush()
                pos += 1
                continue

        buffer.append(ch)
        pos += 1

    flush()

    return [piece for piece in pieces if piece] or [command]
49
+
50
+
51
def levenshtein(a, b):
    """Return the edit distance between sequences ``a`` and ``b``.

    Insertions, deletions and substitutions each cost 1.  Only the previous
    and current rows of the distance table are kept in memory.
    """
    len_a, len_b = len(a), len(b)
    if len_a == 0:
        return len_b
    if len_b == 0:
        return len_a

    # Row 0: turning an empty prefix of ``a`` into each prefix of ``b``.
    previous_row = list(range(len_b + 1))

    for row_index, item_a in enumerate(a, start=1):
        current_row = [row_index]
        for col_index, item_b in enumerate(b, start=1):
            substitution = previous_row[col_index - 1] + (0 if item_a == item_b else 1)
            deletion = previous_row[col_index] + 1
            insertion = current_row[col_index - 1] + 1
            current_row.append(min(deletion, insertion, substitution))
        previous_row = current_row

    return previous_row[len_b]
74
+
75
+
76
def extract_host_from_url(url):
    """Return the host portion of ``url`` without userinfo or port.

    Works on text with or without a ``scheme://`` prefix.  The authority
    is everything up to the first "/"; userinfo is removed by splitting
    *the authority only* at its last "@".  Splitting the whole remainder
    instead would let an "@" in the path replace the real host
    (``http://u@host/p@x`` would yield ``x``).

    A bracketed IPv6 literal is returned with its brackets (``[::1]``),
    whether or not a port follows, so its inner colons are never mistaken
    for the port separator.

    Returns:
        The host string (possibly empty), or None if parsing raised
        IndexError or ValueError.
    """
    try:
        rest = url.split("://")[1] if "://" in url else url

        authority = rest.split("/")[0]
        # Everything after the last "@" of the authority is host[:port].
        host_part = authority.rpartition("@")[2]

        if host_part.startswith("["):
            closing = host_part.find("]")
            if closing != -1:
                return host_part[:closing + 1]

        if ":" in host_part:
            host_part = host_part.rsplit(":", 1)[0]

        return host_part
    except (IndexError, ValueError):
        return None
@@ -0,0 +1,97 @@
1
+ # Subset of Unicode confusable characters relevant to URL security
2
+ # Format: confusable_codepoint target_codepoint # comment
3
+ # Based on Unicode TR39 confusables.txt
4
+ # Cyrillic -> Latin
5
+ 0430 0061 # а -> a
6
+ 0435 0065 # е -> e
7
+ 043E 006F # о -> o
8
+ 0440 0070 # р -> p
9
+ 0441 0063 # с -> c
10
+ 0443 0079 # у -> y
11
+ 0445 0078 # х -> x
12
+ 0456 0069 # і -> i
13
+ 0458 006A # ј -> j
14
+ 04BB 0068 # һ -> h
15
+ 0410 0041 # А -> A
16
+ 0412 0042 # В -> B
17
+ 0415 0045 # Е -> E
18
+ 041A 004B # К -> K
19
+ 041C 004D # М -> M
20
+ 041D 0048 # Н -> H
21
+ 041E 004F # О -> O
22
+ 0420 0050 # Р -> P
23
+ 0421 0043 # С -> C
24
+ 0422 0054 # Т -> T
25
+ 0425 0058 # Х -> X
26
+ # Greek -> Latin
27
+ 03B1 0061 # α -> a
28
+ 03B5 0065 # ε -> e
29
+ 03B9 0069 # ι -> i
30
+ 03BF 006F # ο -> o
31
+ 03C1 0070 # ρ -> p
32
+ 03C5 0075 # υ -> u
33
+ 03C7 0078 # χ -> x
34
+ # Fullwidth Latin
35
+ FF21 0041 # A -> A
36
+ FF22 0042 # B -> B
37
+ FF23 0043 # C -> C
38
+ FF24 0044 # D -> D
39
+ FF25 0045 # E -> E
40
+ FF26 0046 # F -> F
41
+ FF27 0047 # G -> G
42
+ FF28 0048 # H -> H
43
+ FF29 0049 # I -> I
44
+ FF2A 004A # J -> J
45
+ FF2B 004B # K -> K
46
+ FF2C 004C # L -> L
47
+ FF2D 004D # M -> M
48
+ FF2E 004E # N -> N
49
+ FF2F 004F # O -> O
50
+ FF30 0050 # P -> P
51
+ FF31 0051 # Q -> Q
52
+ FF32 0052 # R -> R
53
+ FF33 0053 # S -> S
54
+ FF34 0054 # T -> T
55
+ FF35 0055 # U -> U
56
+ FF36 0056 # V -> V
57
+ FF37 0057 # W -> W
58
+ FF38 0058 # X -> X
59
+ FF39 0059 # Y -> Y
60
+ FF3A 005A # Z -> Z
61
+ FF41 0061 # a -> a
62
+ FF42 0062 # b -> b
63
+ FF43 0063 # c -> c
64
+ FF44 0064 # d -> d
65
+ FF45 0065 # e -> e
66
+ FF46 0066 # f -> f
67
+ FF47 0067 # g -> g
68
+ FF48 0068 # h -> h
69
+ FF49 0069 # i -> i
70
+ FF4A 006A # j -> j
71
+ FF4B 006B # k -> k
72
+ FF4C 006C # l -> l
73
+ FF4D 006D # m -> m
74
+ FF4E 006E # n -> n
75
+ FF4F 006F # o -> o
76
+ FF50 0070 # p -> p
77
+ FF51 0071 # q -> q
78
+ FF52 0072 # r -> r
79
+ FF53 0073 # s -> s
80
+ FF54 0074 # t -> t
81
+ FF55 0075 # u -> u
82
+ FF56 0076 # v -> v
83
+ FF57 0077 # w -> w
84
+ FF58 0078 # x -> x
85
+ FF59 0079 # y -> y
86
+ FF5A 007A # z -> z
87
+ # Common lookalikes
88
+ 0131 006C # ı -> l (dotless i looks like l)
89
+ 0269 0069 # ɩ -> i
90
+ 026A 0069 # ɪ -> i
91
+ 1D00 0041 # ᴀ -> A (small cap)
92
+ 029F 004C # ʟ -> L
93
+ 0280 0052 # ʀ -> R
94
+ # Dot variants
95
+ FF0E 002E # . -> . (fullwidth dot)
96
+ 3002 002E # 。 -> . (ideographic period)
97
+ FF61 002E # 。 -> . (halfwidth ideographic period)