tweek 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tweek/__init__.py +2 -2
- tweek/_keygen.py +53 -0
- tweek/audit.py +288 -0
- tweek/cli.py +5303 -2396
- tweek/cli_model.py +380 -0
- tweek/config/families.yaml +609 -0
- tweek/config/manager.py +42 -5
- tweek/config/patterns.yaml +1510 -8
- tweek/config/tiers.yaml +161 -11
- tweek/diagnostics.py +71 -2
- tweek/hooks/break_glass.py +163 -0
- tweek/hooks/feedback.py +223 -0
- tweek/hooks/overrides.py +531 -0
- tweek/hooks/post_tool_use.py +472 -0
- tweek/hooks/pre_tool_use.py +1024 -62
- tweek/integrations/openclaw.py +443 -0
- tweek/integrations/openclaw_server.py +385 -0
- tweek/licensing.py +14 -54
- tweek/logging/bundle.py +2 -2
- tweek/logging/security_log.py +56 -13
- tweek/mcp/approval.py +57 -16
- tweek/mcp/proxy.py +18 -0
- tweek/mcp/screening.py +5 -5
- tweek/mcp/server.py +4 -1
- tweek/memory/__init__.py +24 -0
- tweek/memory/queries.py +223 -0
- tweek/memory/safety.py +140 -0
- tweek/memory/schemas.py +80 -0
- tweek/memory/store.py +989 -0
- tweek/platform/__init__.py +4 -4
- tweek/plugins/__init__.py +40 -24
- tweek/plugins/base.py +1 -1
- tweek/plugins/detectors/__init__.py +3 -3
- tweek/plugins/detectors/{moltbot.py → openclaw.py} +30 -27
- tweek/plugins/git_discovery.py +16 -4
- tweek/plugins/git_registry.py +8 -2
- tweek/plugins/git_security.py +21 -9
- tweek/plugins/screening/__init__.py +10 -1
- tweek/plugins/screening/heuristic_scorer.py +477 -0
- tweek/plugins/screening/llm_reviewer.py +14 -6
- tweek/plugins/screening/local_model_reviewer.py +161 -0
- tweek/proxy/__init__.py +38 -37
- tweek/proxy/addon.py +22 -3
- tweek/proxy/interceptor.py +1 -0
- tweek/proxy/server.py +4 -2
- tweek/sandbox/__init__.py +11 -0
- tweek/sandbox/docker_bridge.py +143 -0
- tweek/sandbox/executor.py +9 -6
- tweek/sandbox/layers.py +97 -0
- tweek/sandbox/linux.py +1 -0
- tweek/sandbox/project.py +548 -0
- tweek/sandbox/registry.py +149 -0
- tweek/security/__init__.py +9 -0
- tweek/security/language.py +250 -0
- tweek/security/llm_reviewer.py +1146 -60
- tweek/security/local_model.py +331 -0
- tweek/security/local_reviewer.py +146 -0
- tweek/security/model_registry.py +371 -0
- tweek/security/rate_limiter.py +11 -6
- tweek/security/secret_scanner.py +70 -4
- tweek/security/session_analyzer.py +26 -2
- tweek/skill_template/SKILL.md +200 -0
- tweek/skill_template/__init__.py +0 -0
- tweek/skill_template/cli-reference.md +331 -0
- tweek/skill_template/overrides-reference.md +184 -0
- tweek/skill_template/scripts/__init__.py +0 -0
- tweek/skill_template/scripts/check_installed.py +170 -0
- tweek/skills/__init__.py +38 -0
- tweek/skills/config.py +150 -0
- tweek/skills/fingerprints.py +198 -0
- tweek/skills/guard.py +293 -0
- tweek/skills/isolation.py +469 -0
- tweek/skills/scanner.py +715 -0
- tweek/vault/__init__.py +0 -1
- tweek/vault/cross_platform.py +12 -1
- tweek/vault/keychain.py +87 -29
- tweek-0.2.0.dist-info/METADATA +281 -0
- tweek-0.2.0.dist-info/RECORD +121 -0
- {tweek-0.1.0.dist-info → tweek-0.2.0.dist-info}/entry_points.txt +8 -1
- {tweek-0.1.0.dist-info → tweek-0.2.0.dist-info}/licenses/LICENSE +80 -0
- tweek/integrations/moltbot.py +0 -243
- tweek-0.1.0.dist-info/METADATA +0 -335
- tweek-0.1.0.dist-info/RECORD +0 -85
- {tweek-0.1.0.dist-info → tweek-0.2.0.dist-info}/WHEEL +0 -0
- {tweek-0.1.0.dist-info → tweek-0.2.0.dist-info}/top_level.txt +0 -0
tweek/security/__init__.py
CHANGED
|
@@ -5,11 +5,17 @@ Advanced security components for detecting and preventing attacks:
|
|
|
5
5
|
- Rate limiting for resource theft protection
|
|
6
6
|
- Session analysis for cross-turn anomaly detection
|
|
7
7
|
- LLM-based secondary review for risky operations
|
|
8
|
+
- Language detection for non-English prompt injection
|
|
8
9
|
"""
|
|
9
10
|
|
|
10
11
|
from tweek.security.rate_limiter import RateLimiter, RateLimitResult, RateLimitConfig
|
|
11
12
|
from tweek.security.session_analyzer import SessionAnalyzer, SessionAnalysis
|
|
12
13
|
from tweek.security.llm_reviewer import LLMReviewer, LLMReviewResult
|
|
14
|
+
from tweek.security.language import (
|
|
15
|
+
detect_non_english,
|
|
16
|
+
LanguageDetectionResult,
|
|
17
|
+
NonEnglishHandling,
|
|
18
|
+
)
|
|
13
19
|
|
|
14
20
|
__all__ = [
|
|
15
21
|
"RateLimiter",
|
|
@@ -19,4 +25,7 @@ __all__ = [
|
|
|
19
25
|
"SessionAnalysis",
|
|
20
26
|
"LLMReviewer",
|
|
21
27
|
"LLMReviewResult",
|
|
28
|
+
"detect_non_english",
|
|
29
|
+
"LanguageDetectionResult",
|
|
30
|
+
"NonEnglishHandling",
|
|
22
31
|
]
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Tweek Language Detection
|
|
3
|
+
|
|
4
|
+
Lightweight, zero-dependency detection of non-English natural language content.
|
|
5
|
+
Uses Unicode script analysis to identify when tool call content contains
|
|
6
|
+
non-English text that would bypass English-only regex patterns (prompt injection,
|
|
7
|
+
social engineering, etc.).
|
|
8
|
+
|
|
9
|
+
Technical shell commands (file paths, tool names, flags) are language-independent
|
|
10
|
+
and do not trigger detection.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import re
|
|
14
|
+
import unicodedata
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from enum import Enum
|
|
17
|
+
from typing import Optional
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class NonEnglishHandling(Enum):
|
|
21
|
+
"""How to handle non-English content in the screening pipeline."""
|
|
22
|
+
ESCALATE = "escalate" # Auto-escalate tier to force LLM review (default)
|
|
23
|
+
TRANSLATE = "translate" # Translate to English before pattern matching
|
|
24
|
+
BOTH = "both" # Escalate AND translate
|
|
25
|
+
NONE = "none" # No special handling
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
|
|
29
|
+
class LanguageDetectionResult:
|
|
30
|
+
"""Result of language detection on tool call content."""
|
|
31
|
+
has_non_english: bool
|
|
32
|
+
confidence: float # 0.0 - 1.0
|
|
33
|
+
detected_scripts: list # e.g. ["CJK", "CYRILLIC", "ARABIC"]
|
|
34
|
+
non_english_ratio: float # ratio of non-English characters to total
|
|
35
|
+
sample: Optional[str] = None # short sample of detected non-English text
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Unicode block ranges for non-Latin scripts commonly used in prompt injection
|
|
39
|
+
# We detect these because English-only regex patterns cannot match them
|
|
40
|
+
_SCRIPT_RANGES = {
|
|
41
|
+
"CJK": [
|
|
42
|
+
(0x4E00, 0x9FFF), # CJK Unified Ideographs
|
|
43
|
+
(0x3400, 0x4DBF), # CJK Extension A
|
|
44
|
+
(0x3000, 0x303F), # CJK Symbols and Punctuation
|
|
45
|
+
(0x3040, 0x309F), # Hiragana
|
|
46
|
+
(0x30A0, 0x30FF), # Katakana
|
|
47
|
+
(0xAC00, 0xD7AF), # Hangul Syllables
|
|
48
|
+
],
|
|
49
|
+
"CYRILLIC": [
|
|
50
|
+
(0x0400, 0x04FF), # Cyrillic
|
|
51
|
+
(0x0500, 0x052F), # Cyrillic Supplement
|
|
52
|
+
],
|
|
53
|
+
"ARABIC": [
|
|
54
|
+
(0x0600, 0x06FF), # Arabic
|
|
55
|
+
(0x0750, 0x077F), # Arabic Supplement
|
|
56
|
+
(0xFB50, 0xFDFF), # Arabic Presentation Forms-A
|
|
57
|
+
],
|
|
58
|
+
"DEVANAGARI": [
|
|
59
|
+
(0x0900, 0x097F), # Devanagari
|
|
60
|
+
],
|
|
61
|
+
"THAI": [
|
|
62
|
+
(0x0E00, 0x0E7F), # Thai
|
|
63
|
+
],
|
|
64
|
+
"HEBREW": [
|
|
65
|
+
(0x0590, 0x05FF), # Hebrew
|
|
66
|
+
],
|
|
67
|
+
"GREEK": [
|
|
68
|
+
(0x0370, 0x03FF), # Greek and Coptic
|
|
69
|
+
],
|
|
70
|
+
"GEORGIAN": [
|
|
71
|
+
(0x10A0, 0x10FF), # Georgian
|
|
72
|
+
],
|
|
73
|
+
"ARMENIAN": [
|
|
74
|
+
(0x0530, 0x058F), # Armenian
|
|
75
|
+
],
|
|
76
|
+
"BENGALI": [
|
|
77
|
+
(0x0980, 0x09FF), # Bengali
|
|
78
|
+
],
|
|
79
|
+
"TAMIL": [
|
|
80
|
+
(0x0B80, 0x0BFF), # Tamil
|
|
81
|
+
],
|
|
82
|
+
"KOREAN": [
|
|
83
|
+
(0x1100, 0x11FF), # Hangul Jamo
|
|
84
|
+
],
|
|
85
|
+
"ETHIOPIC": [
|
|
86
|
+
(0x1200, 0x137F), # Ethiopic
|
|
87
|
+
],
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
# Extended Latin characters used in European languages (French, German, Spanish, etc.)
|
|
91
|
+
# These are harder to detect as "non-English" since they use Latin script,
|
|
92
|
+
# so we use word-level heuristics instead.
|
|
93
|
+
_EXTENDED_LATIN_RANGE = (0x00C0, 0x024F) # Latin Extended-A and B
|
|
94
|
+
|
|
95
|
+
# Common non-English European words that signal prompt injection in other languages.
|
|
96
|
+
# These are the translations of key injection phrases.
|
|
97
|
+
_NON_ENGLISH_INJECTION_KEYWORDS = [
|
|
98
|
+
# French
|
|
99
|
+
r"\b(ignorez|oubliez|annulez|remplacez)\s+(les\s+)?(instructions|directives|règles)",
|
|
100
|
+
r"\b(tu\s+es\s+maintenant|agis\s+comme|fais\s+semblant)",
|
|
101
|
+
r"\b(en\s+tant\s+qu[e'])\s*(admin|root|propriétaire)",
|
|
102
|
+
# German
|
|
103
|
+
r"\b(ignoriere|vergiss|überschreibe)\s+(die\s+)?(vorherigen|bisherigen)\s+(Anweisungen|Regeln|Instruktionen)",
|
|
104
|
+
r"\b(du\s+bist\s+jetzt|handle\s+als|tu\s+so\s+als)",
|
|
105
|
+
# Spanish
|
|
106
|
+
r"\b(ignora|olvida|anula)\s+(las\s+)?(instrucciones|directivas|reglas)\s+(anteriores|previas)",
|
|
107
|
+
r"\b(ahora\s+eres|actúa\s+como|finge\s+ser)",
|
|
108
|
+
# Portuguese
|
|
109
|
+
r"\b(ignore|esqueça|anule)\s+(as\s+)?(instruções|diretivas|regras)\s+(anteriores|prévias)",
|
|
110
|
+
r"\b(agora\s+você\s+é|atue\s+como|finja\s+ser)",
|
|
111
|
+
# Italian
|
|
112
|
+
r"\b(ignora|dimentica|sovrascrivi)\s+(le\s+)?(istruzioni|direttive|regole)\s+(precedenti|anteriori)",
|
|
113
|
+
r"\b(ora\s+sei|agisci\s+come|fingi\s+di\s+essere)",
|
|
114
|
+
# Russian (transliterated)
|
|
115
|
+
r"\b(ignoriruj|zabudj|otmeni)\s+(predydushchie|sistemnye)\s+(instrukcii|pravila)",
|
|
116
|
+
# Dutch
|
|
117
|
+
r"\b(negeer|vergeet)\s+(de\s+)?(vorige|eerdere)\s+(instructies|regels)",
|
|
118
|
+
# Chinese (pinyin patterns that might appear in mixed content)
|
|
119
|
+
r"\b(hūlüè|wúshì)\s+(zhǐlìng|guīzé)",
|
|
120
|
+
# Japanese (romaji)
|
|
121
|
+
r"\b(mushi\s+shite|aratana\s+yakuwari)",
|
|
122
|
+
]
|
|
123
|
+
|
|
124
|
+
# Compile the keyword patterns
|
|
125
|
+
_COMPILED_KEYWORDS = [re.compile(p, re.IGNORECASE | re.UNICODE) for p in _NON_ENGLISH_INJECTION_KEYWORDS]
|
|
126
|
+
|
|
127
|
+
# Characters to exclude from analysis (technical syntax, punctuation, digits)
|
|
128
|
+
_TECHNICAL_CHARS = re.compile(r'[a-zA-Z0-9\s\-_./\\:;,!?@#$%^&*()+=\[\]{}<>|~`"\']+')
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _get_script(char: str) -> Optional[str]:
|
|
132
|
+
"""Determine which non-Latin script a character belongs to."""
|
|
133
|
+
cp = ord(char)
|
|
134
|
+
for script_name, ranges in _SCRIPT_RANGES.items():
|
|
135
|
+
for start, end in ranges:
|
|
136
|
+
if start <= cp <= end:
|
|
137
|
+
return script_name
|
|
138
|
+
return None
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _has_extended_latin(text: str) -> bool:
|
|
142
|
+
"""Check if text contains extended Latin characters (accented European)."""
|
|
143
|
+
for char in text:
|
|
144
|
+
cp = ord(char)
|
|
145
|
+
if _EXTENDED_LATIN_RANGE[0] <= cp <= _EXTENDED_LATIN_RANGE[1]:
|
|
146
|
+
return True
|
|
147
|
+
return False
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def detect_non_english(content: str, min_confidence: float = 0.3) -> LanguageDetectionResult:
|
|
151
|
+
"""
|
|
152
|
+
Detect non-English natural language in tool call content.
|
|
153
|
+
|
|
154
|
+
Uses Unicode script analysis for non-Latin scripts (CJK, Cyrillic, Arabic, etc.)
|
|
155
|
+
and keyword matching for Latin-script European languages.
|
|
156
|
+
|
|
157
|
+
Technical content (shell commands, file paths, flags) is ignored since it's
|
|
158
|
+
language-independent.
|
|
159
|
+
|
|
160
|
+
Args:
|
|
161
|
+
content: The tool call content to analyze
|
|
162
|
+
min_confidence: Minimum confidence threshold to report detection
|
|
163
|
+
|
|
164
|
+
Returns:
|
|
165
|
+
LanguageDetectionResult with detection details
|
|
166
|
+
"""
|
|
167
|
+
if not content or len(content.strip()) < 3:
|
|
168
|
+
return LanguageDetectionResult(
|
|
169
|
+
has_non_english=False,
|
|
170
|
+
confidence=0.0,
|
|
171
|
+
detected_scripts=[],
|
|
172
|
+
non_english_ratio=0.0,
|
|
173
|
+
)
|
|
174
|
+
|
|
175
|
+
detected_scripts = set()
|
|
176
|
+
non_latin_count = 0
|
|
177
|
+
total_alpha = 0
|
|
178
|
+
sample_chars = []
|
|
179
|
+
|
|
180
|
+
# Pass 1: Unicode script detection (non-Latin scripts)
|
|
181
|
+
for char in content:
|
|
182
|
+
if char.isalpha():
|
|
183
|
+
total_alpha += 1
|
|
184
|
+
script = _get_script(char)
|
|
185
|
+
if script:
|
|
186
|
+
non_latin_count += 1
|
|
187
|
+
detected_scripts.add(script)
|
|
188
|
+
if len(sample_chars) < 30:
|
|
189
|
+
sample_chars.append(char)
|
|
190
|
+
|
|
191
|
+
# Calculate non-English ratio from non-Latin characters
|
|
192
|
+
non_english_ratio = non_latin_count / max(total_alpha, 1)
|
|
193
|
+
|
|
194
|
+
# If we found non-Latin scripts, that's a strong signal
|
|
195
|
+
if non_latin_count >= 5 and non_english_ratio >= 0.05:
|
|
196
|
+
confidence = min(1.0, non_english_ratio * 2 + 0.3)
|
|
197
|
+
sample = "".join(sample_chars) if sample_chars else None
|
|
198
|
+
|
|
199
|
+
if confidence >= min_confidence:
|
|
200
|
+
return LanguageDetectionResult(
|
|
201
|
+
has_non_english=True,
|
|
202
|
+
confidence=confidence,
|
|
203
|
+
detected_scripts=sorted(detected_scripts),
|
|
204
|
+
non_english_ratio=non_english_ratio,
|
|
205
|
+
sample=sample,
|
|
206
|
+
)
|
|
207
|
+
|
|
208
|
+
# Pass 2: Latin-script European language detection via keyword matching
|
|
209
|
+
# This catches French, German, Spanish, etc. prompt injection phrases
|
|
210
|
+
for pattern in _COMPILED_KEYWORDS:
|
|
211
|
+
match = pattern.search(content)
|
|
212
|
+
if match:
|
|
213
|
+
detected_scripts.add("LATIN_EUROPEAN")
|
|
214
|
+
sample = match.group(0)[:50]
|
|
215
|
+
return LanguageDetectionResult(
|
|
216
|
+
has_non_english=True,
|
|
217
|
+
confidence=0.8,
|
|
218
|
+
detected_scripts=sorted(detected_scripts),
|
|
219
|
+
non_english_ratio=0.0, # Can't easily compute for Latin scripts
|
|
220
|
+
sample=sample,
|
|
221
|
+
)
|
|
222
|
+
|
|
223
|
+
# Pass 3: Extended Latin character density check
|
|
224
|
+
# High density of accented characters suggests European language
|
|
225
|
+
if _has_extended_latin(content) and total_alpha > 10:
|
|
226
|
+
extended_count = sum(
|
|
227
|
+
1 for c in content
|
|
228
|
+
if _EXTENDED_LATIN_RANGE[0] <= ord(c) <= _EXTENDED_LATIN_RANGE[1]
|
|
229
|
+
)
|
|
230
|
+
extended_ratio = extended_count / max(total_alpha, 1)
|
|
231
|
+
|
|
232
|
+
if extended_ratio >= 0.08: # 8%+ accented characters suggests non-English
|
|
233
|
+
detected_scripts.add("LATIN_EXTENDED")
|
|
234
|
+
confidence = min(1.0, extended_ratio * 5)
|
|
235
|
+
|
|
236
|
+
if confidence >= min_confidence:
|
|
237
|
+
return LanguageDetectionResult(
|
|
238
|
+
has_non_english=True,
|
|
239
|
+
confidence=confidence,
|
|
240
|
+
detected_scripts=sorted(detected_scripts),
|
|
241
|
+
non_english_ratio=extended_ratio,
|
|
242
|
+
)
|
|
243
|
+
|
|
244
|
+
# No significant non-English content detected
|
|
245
|
+
return LanguageDetectionResult(
|
|
246
|
+
has_non_english=False,
|
|
247
|
+
confidence=0.0,
|
|
248
|
+
detected_scripts=[],
|
|
249
|
+
non_english_ratio=0.0,
|
|
250
|
+
)
|