@voodocs/cli 2.4.0 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +47 -0
- package/README.md +26 -0
- package/lib/cli/analyze.py +277 -0
- package/lib/darkarts/priority_analyzer/__init__.py +0 -0
- package/lib/darkarts/priority_analyzer/analyzer.py +301 -0
- package/lib/darkarts/priority_analyzer/complexity.py +271 -0
- package/lib/darkarts/priority_analyzer/dependencies.py +275 -0
- package/lib/darkarts/priority_analyzer/security.py +200 -0
- package/lib/darkarts/voodocs_lite_dict.py +216 -0
- package/lib/darkarts/voodocs_lite_dict_v2.py +198 -0
- package/lib/darkarts/voodocs_lite_parser.py +343 -0
- package/package.json +5 -1
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Security Analyzer for VooDocs Priority System
|
|
3
|
+
|
|
4
|
+
Detects security-sensitive code based on keyword analysis.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import re
|
|
8
|
+
from typing import Dict, List, Set
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SecurityAnalyzer:
    """Analyzes code for security-sensitive keywords and patterns.

    Keywords are grouped into three tiers. Every distinct keyword found in a
    file contributes its tier weight once (10 / 5 / 2 points) and the sum is
    capped at 100.
    """

    # Critical security keywords (10 points each)
    CRITICAL_KEYWORDS = {
        'password', 'secret', 'token', 'key', 'private',
        'auth', 'authentication', 'authorization',
        'admin', 'root', 'sudo', 'privilege',
        'encrypt', 'decrypt', 'hash', 'crypto',
        'payment', 'credit', 'card', 'billing',
        'sql', 'query', 'execute', 'eval',
        'unsafe', 'dangerous', 'vulnerable',
    }

    # High priority security keywords (5 points each)
    HIGH_KEYWORDS = {
        'user', 'session', 'cookie', 'jwt',
        'login', 'logout', 'signin', 'signup',
        'access', 'permission', 'role', 'grant',
        'sanitize', 'validate', 'escape', 'filter',
        'upload', 'download', 'file', 'path',
        'api', 'endpoint', 'route', 'handler',
    }

    # Medium priority security keywords (2 points each)
    MEDIUM_KEYWORDS = {
        'input', 'output', 'request', 'response',
        'data', 'database', 'db', 'storage',
        'config', 'settings', 'env', 'environment',
        'error', 'exception', 'fail', 'crash',
    }

    # Maximum number of keywords spelled out in a reason string.
    _MAX_LISTED_KEYWORDS = 5

    # (trigger keywords, suggestions) pairs, evaluated in order. A rule fires
    # when any trigger was found in the file, whatever tier it belongs to.
    _SUGGESTION_RULES = (
        # Authentication/Authorization
        (
            {'auth', 'authentication', 'authorization', 'login', 'logout'},
            (
                "Document authentication/authorization flow",
                "Specify security invariants and assumptions",
            ),
        ),
        # Cryptography
        (
            {'encrypt', 'decrypt', 'hash', 'crypto', 'key', 'secret'},
            (
                "Document encryption algorithms and key management",
                "Specify cryptographic assumptions",
            ),
        ),
        # Payment/Financial
        (
            {'payment', 'credit', 'card', 'billing'},
            (
                "Document payment processing flow",
                "Add PCI-DSS compliance notes",
            ),
        ),
        # SQL/Database
        (
            {'sql', 'query', 'execute'},
            (
                "Document SQL injection prevention measures",
                "Specify query sanitization rules",
            ),
        ),
        # Input Validation
        (
            {'sanitize', 'validate', 'escape', 'filter', 'input'},
            (
                "Document input validation rules",
                "Specify allowed input formats",
            ),
        ),
        # File Operations
        (
            {'upload', 'download', 'file', 'path'},
            (
                "Document file validation and size limits",
                "Specify path traversal prevention",
            ),
        ),
        # Session Management
        (
            {'session', 'cookie', 'jwt', 'token'},
            (
                "Document session/token lifecycle",
                "Specify expiration and refresh logic",
            ),
        ),
    )

    def __init__(self):
        """Initialize security analyzer (the analyzer keeps no state)."""

    def analyze_file(self, filepath: str) -> Dict:
        """
        Analyze a file for security-sensitive keywords.

        Args:
            filepath: Path to file to analyze

        Returns:
            Dictionary with the keys 'critical_keywords', 'high_keywords',
            'medium_keywords' (sorted lists), 'total_score' (0-100) and
            'keyword_count'. When the file cannot be read, the lists are
            empty, both numbers are 0 and an extra 'error' key carries the
            exception message.
        """
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            # Deliberately best-effort: an unreadable or undecodable file is
            # reported through the 'error' key instead of aborting the caller.
            return {
                'critical_keywords': [],
                'high_keywords': [],
                'medium_keywords': [],
                'total_score': 0,
                # Same key set as the success path so callers can index it.
                'keyword_count': 0,
                'error': str(e)
            }

        # Scan for keywords
        critical_found = self._scan_keywords(content, self.CRITICAL_KEYWORDS)
        high_found = self._scan_keywords(content, self.HIGH_KEYWORDS)
        medium_found = self._scan_keywords(content, self.MEDIUM_KEYWORDS)

        # Calculate score: each distinct keyword counts once with its tier weight
        score = (
            len(critical_found) * 10 +
            len(high_found) * 5 +
            len(medium_found) * 2
        )

        return {
            'critical_keywords': sorted(critical_found),
            'high_keywords': sorted(high_found),
            'medium_keywords': sorted(medium_found),
            # Cap at 100
            'total_score': min(100, score),
            'keyword_count': len(critical_found) + len(high_found) + len(medium_found)
        }

    def _scan_keywords(self, content: str, keywords: Set[str]) -> Set[str]:
        """
        Scan content for keywords (case-insensitive).

        Args:
            content: File content
            keywords: Set of keywords to search for (expected in lowercase,
                because they are matched against the lowercased content)

        Returns:
            Set of found keywords
        """
        content_lower = content.lower()

        # Use word boundaries to avoid partial matches.
        # NOTE: `\b` treats '_' and digits as word characters, so a keyword
        # embedded in an identifier such as `user_password` is not reported.
        return {
            keyword
            for keyword in keywords
            if re.search(r'\b' + re.escape(keyword) + r'\b', content_lower)
        }

    def _format_keyword_list(self, keywords: List[str]) -> str:
        """Join the first few keywords and append a "(+N more)" marker if truncated."""
        limit = self._MAX_LISTED_KEYWORDS
        text = ', '.join(keywords[:limit])
        if len(keywords) > limit:
            text += f" (+{len(keywords) - limit} more)"
        return text

    def get_security_reasons(self, analysis: Dict) -> List[str]:
        """
        Generate human-readable reasons for security score.

        Args:
            analysis: Security analysis results as returned by analyze_file

        Returns:
            List of reason strings
        """
        reasons = []

        if analysis['critical_keywords']:
            keywords_str = self._format_keyword_list(analysis['critical_keywords'])
            reasons.append(f"Critical security keywords: {keywords_str}")

        if analysis['high_keywords']:
            keywords_str = self._format_keyword_list(analysis['high_keywords'])
            reasons.append(f"High-priority keywords: {keywords_str}")

        if analysis['total_score'] >= 80:
            reasons.append("Highly security-sensitive code")
        elif analysis['total_score'] >= 50:
            reasons.append("Security-sensitive code")

        return reasons

    def get_security_suggestions(self, analysis: Dict) -> List[str]:
        """
        Generate security-specific suggestions.

        Args:
            analysis: Security analysis results as returned by analyze_file

        Returns:
            List of suggestion strings
        """
        # Match against every tier: the trigger sets mix tiers ('login' is a
        # high keyword, 'token' a critical one, 'input' a medium one), so
        # intersecting with a single tier would make some triggers unreachable.
        found = (
            set(analysis['critical_keywords'])
            | set(analysis['high_keywords'])
            | set(analysis.get('medium_keywords', ()))
        )

        suggestions = []
        for triggers, advice in self._SUGGESTION_RULES:
            if found & triggers:
                suggestions.extend(advice)

        # Generic security suggestions
        if analysis['total_score'] >= 60 and not suggestions:
            suggestions.append("Add security assumptions and invariants")
            suggestions.append("Document security-critical logic")

        return suggestions
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
"""
|
|
2
|
+
VooDocs Lite - Abbreviation Dictionary
|
|
3
|
+
|
|
4
|
+
Provides bidirectional mapping between full words and abbreviations
|
|
5
|
+
for ultra-compact symbolic notation.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# Abbreviation dictionary: abbr -> full
|
|
9
|
+
ABBREVIATIONS = {
    # Storage & Data
    'db': 'database',
    'cfg': 'configuration',
    'var': 'variable',
    'const': 'constant',
    'param': 'parameter',
    'arg': 'argument',
    'id': 'identifier',

    # Common verbs (identity entries: these words are kept as they are)
    'must': 'must',
    'be': 'be',
    'is': 'is',
    'are': 'are',
    'has': 'has',
    'have': 'have',
    'does': 'does',
    'do': 'do',
    'will': 'will',
    'can': 'can',
    'should': 'should',
    'may': 'may',
    'contains': 'contains',
    'returns': 'returns',
    'expire': 'expire',
    'expires': 'expires',
    'stored': 'stored',
    'signed': 'signed',
    'hashed': 'hashed',
    'valid': 'valid',
    'logged': 'logged',

    # Actions
    # NOTE: there is no entry for 'validate'. 'val' is used for 'value' in
    # the "Common words" section below, and a dict literal keeps only the
    # last value of a repeated key, so a second 'val' entry here would be
    # silently discarded.
    'init': 'initialize',
    'ver': 'verify',
    'gen': 'generate',
    'cr': 'create',
    'upd': 'update',
    'del': 'delete',
    'mod': 'modify',
    'get': 'retrieve',
    'qry': 'query',
    'chk': 'check',

    # Security
    'auth': 'authentication',
    'authz': 'authorization',
    'pwd': 'password',
    'tok': 'token',

    # Entities
    'usr': 'user',
    'usrs': 'users',

    # Time
    'ts': 'timestamp',
    'exp': 'expiration',

    # Communication
    'resp': 'response',
    'req': 'request',
    'msg': 'message',

    # Status & Errors
    'err': 'error',
    'exc': 'exception',
    'ok': 'success',
    'fail': 'failure',

    # Boolean & Values
    'T': 'true',
    'F': 'false',
    'N': 'null',
    'U': 'undefined',
    'E': 'empty',

    # Types
    'str': 'string',
    'num': 'number',
    'int': 'integer',
    'bool': 'boolean',
    'arr': 'array',
    'obj': 'object',
    'fn': 'function',

    # Blockchain
    'addr': 'address',
    'ctr': 'contract',
    'tx': 'transaction',
    'blk': 'block',
    'bal': 'balance',
    'amt': 'amount',

    # Domain-specific
    'sub': 'subdomain',
    'subs': 'subdomains',
    'reg': 'registry',
    'own': 'owner',
    'mgmt': 'management',

    # Common words
    'w/': 'with',
    'wo/': 'without',
    'svc': 'service',
    'sys': 'system',
    'ops': 'operations',
    'ret': 'returns',
    'res': 'result',
    'val': 'value',
    'vals': 'values',
    'len': 'length',
    'cnt': 'count',
    'max': 'maximum',
    'min': 'minimum',
    'avg': 'average',
    'sum': 'summary',
    'desc': 'description',
    'info': 'information',
    'spec': 'specification',
    'impl': 'implementation',
    'ref': 'reference',
    'def': 'definition',
    'decl': 'declaration',
}

# Reverse mapping: full -> abbr
# When two abbreviations share a full form ('returns' and 'ret' both map to
# 'returns'), the entry listed later in ABBREVIATIONS wins.
EXPANSIONS = {v: k for k, v in ABBREVIATIONS.items()}
|
|
138
|
+
|
|
139
|
+
# Symbol mappings
|
|
140
|
+
# Lite (plain ASCII) marker -> standard VooDocs symbol, in declaration order.
LITE_TO_STANDARD = dict([
    ('>', '⊢'),   # Purpose/Postcondition (context-dependent)
    ('@', '∂'),   # Dependencies
    ('!', '⚠'),   # Assumptions
    ('<', '⊳'),   # Preconditions
    ('=', '⊨'),   # Invariants
    ('~', '⚡'),   # Complexity
    ('#', '🔒'),  # Security
])

# Inverse table: standard VooDocs symbol -> Lite marker.
STANDARD_TO_LITE = dict(zip(LITE_TO_STANDARD.values(), LITE_TO_STANDARD.keys()))
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def expand_abbreviation(abbr: str) -> str:
    """Return the full form of ``abbr``, or ``abbr`` unchanged when it is unknown.

    The lookup in ``ABBREVIATIONS`` is exact, so it is case-sensitive.
    """
    if abbr in ABBREVIATIONS:
        return ABBREVIATIONS[abbr]
    return abbr
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def compress_word(word: str) -> str:
    """Return the abbreviation for ``word``, or ``word`` unchanged when none exists.

    The lookup in ``EXPANSIONS`` uses the lowercased word; a word without an
    abbreviation is returned with its original casing.
    """
    lookup_key = word.lower()
    return EXPANSIONS[lookup_key] if lookup_key in EXPANSIONS else word
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def expand_text(text: str) -> str:
    """
    Expand every abbreviated word of ``text`` to its full form.

    The text is split on whitespace and re-joined with single spaces. One
    trailing punctuation character (one of ``.,;:!?``) is set aside before
    the lookup and re-attached afterwards.

    Example:
        "usr auth svc w/ JWT gen" -> "user authentication service with JWT generate"
    """
    trailing_marks = '.,;:!?'
    pieces = []

    for token in text.split():
        last_char = token[-1]
        if last_char not in trailing_marks:
            pieces.append(expand_abbreviation(token))
            continue
        # Look up the word without its punctuation, then put the mark back.
        pieces.append(expand_abbreviation(token[:-1]) + last_char)

    return ' '.join(pieces)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def compress_text(text: str) -> str:
    """
    Replace every word of ``text`` that has an abbreviation with that abbreviation.

    The text is split on whitespace and re-joined with single spaces. One
    trailing punctuation character (one of ``.,;:!?``) is set aside before
    the lookup and re-attached afterwards.

    Example:
        "user authentication service with JWT" -> "usr auth svc w/ JWT"
    """
    trailing_marks = '.,;:!?'
    pieces = []

    for token in text.split():
        last_char = token[-1]
        if last_char not in trailing_marks:
            pieces.append(compress_word(token))
            continue
        # Look up the word without its punctuation, then put the mark back.
        pieces.append(compress_word(token[:-1]) + last_char)

    return ' '.join(pieces)
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def get_lite_symbol(standard_symbol: str) -> str:
    """Map a standard VooDocs symbol to its Lite marker; unknown symbols pass through."""
    if standard_symbol in STANDARD_TO_LITE:
        return STANDARD_TO_LITE[standard_symbol]
    return standard_symbol
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def get_standard_symbol(lite_symbol: str) -> str:
    """Map a Lite marker to its standard VooDocs symbol; unknown markers pass through."""
    if lite_symbol in LITE_TO_STANDARD:
        return LITE_TO_STANDARD[lite_symbol]
    return lite_symbol
|
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
"""
|
|
2
|
+
VooDocs Lite - Ultra-Aggressive Abbreviation Dictionary v2
|
|
3
|
+
|
|
4
|
+
Provides maximum compression by:
|
|
5
|
+
1. Removing articles (a, an, the)
|
|
6
|
+
2. Removing unnecessary words
|
|
7
|
+
3. Using aggressive abbreviations
|
|
8
|
+
4. Using symbols instead of words
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
# Core abbreviations
|
|
12
|
+
CORE_ABBR = {
    # Entities
    'user': 'u',
    'users': 'us',
    'database': 'db',
    'token': 'tok',
    'password': 'pw',
    'authentication': 'auth',
    'authorization': 'authz',
    'service': 'svc',
    'function': 'fn',
    'identifier': 'id',
    'configuration': 'cfg',
    'parameter': 'p',
    'argument': 'a',
    'variable': 'v',
    'constant': 'c',
    'timestamp': 'ts',
    'expiration': 'exp',
    'response': 'r',
    'request': 'q',
    'error': 'e',
    'exception': 'x',

    # Actions (ultra-short)
    'initialize': 'init',
    'validate': 'val',
    'verify': 'ver',
    'generate': 'gen',
    'create': 'cr',
    'update': 'upd',
    'delete': 'del',
    'modify': 'mod',
    'retrieve': 'get',
    'query': 'qry',
    'check': 'chk',
    'returns': 'ret',
    'contains': 'has',

    # Blockchain
    'address': 'addr',
    'contract': 'ctr',
    'transaction': 'tx',
    'block': 'blk',
    'balance': 'bal',
    'amount': 'amt',
    'subdomain': 'sub',
    'registry': 'reg',
    'owner': 'own',

    # Common words
    'with': 'w/',
    'without': 'wo/',
    'management': 'mgmt',
    'system': 'sys',
    'operations': 'ops',
    'result': 'res',
    'value': 'val',
    'length': 'len',
    'count': 'cnt',
    'maximum': 'max',
    'minimum': 'min',
    'average': 'avg',
    'information': 'info',
    'specification': 'spec',
    'implementation': 'impl',
    'reference': 'ref',
    'definition': 'def',
    'description': 'desc',

    # Types
    'string': 'str',
    'number': 'num',
    'integer': 'int',
    'boolean': 'bool',
    'array': 'arr',
    'object': 'obj',

    # Boolean/Values
    'true': 'T',
    'false': 'F',
    'null': 'N',
    'undefined': 'U',
    'empty': 'E',
}

# Words to remove entirely
REMOVE_WORDS = {
    'a', 'an', 'the',
    'is', 'are', 'be', 'been', 'being', 'was', 'were',
    'will', 'would', 'should', 'could', 'may', 'might', 'can', 'must',
}

# Symbol replacements
SYMBOL_REPLACEMENTS = {
    ' and ': '&',
    ' or ': '|',
    ' not ': '!',
    'greater than or equal': '>=',
    'less than or equal': '<=',
    'greater than': '>',
    'less than': '<',
    'equal to': '=',
    'not equal': '!=',
}

# Standalone symbols that are written without surrounding spaces in the
# compressed output ('>', '<' and '=' keep their spaces).
_TIGHT_SYMBOLS = frozenset({'&', '|', '!', '>=', '<=', '!='})


def ultra_compress(text: str) -> str:
    """
    Ultra-aggressive compression.

    Steps:
    1. Replace phrases with symbols (longest phrase first)
    2. Remove articles and unnecessary words
    3. Apply abbreviations
    4. Join the words, writing connective symbols without surrounding spaces

    Args:
        text: Text to compress

    Returns:
        The compressed text; all whitespace runs are collapsed.
    """
    # Step 1: Symbol replacements (longest first, so 'greater than or equal'
    # is handled before 'greater than' and ' or ').
    for phrase, symbol in sorted(SYMBOL_REPLACEMENTS.items(), key=lambda item: -len(item[0])):
        # The connectives (' and ', ' or ', ' not ') include their surrounding
        # spaces. Keep the symbol space-separated for now: replacing with the
        # bare symbol would fuse the neighbouring words into a single token
        # ("user&password") that steps 2 and 3 could no longer process.
        is_connective = phrase.startswith(' ') and phrase.endswith(' ')
        text = text.replace(phrase, f' {symbol} ' if is_connective else symbol)

    # Steps 2 and 3: process each whitespace-separated word
    compressed_words = []
    for word in text.split():
        # Set one trailing punctuation character aside for the lookup
        punct = ''
        if word[-1] in '.,;:!?':
            punct = word[-1]
            word = word[:-1]

        # Matching is case-insensitive
        word_lower = word.lower()

        # Skip if in remove list (its punctuation is dropped with it)
        if word_lower in REMOVE_WORDS:
            continue

        # Apply abbreviation if one exists, otherwise keep the word as written
        compressed_words.append(CORE_ABBR.get(word_lower, word) + punct)

    # Step 4: join with single spaces, except that a standalone connective
    # symbol is attached directly to both of its neighbours.
    pieces = []
    attach_next = False
    for token in compressed_words:
        is_tight = token in _TIGHT_SYMBOLS
        if pieces and (attach_next or is_tight):
            pieces[-1] += token
        else:
            pieces.append(token)
        attach_next = is_tight

    return ' '.join(pieces)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def ultra_expand(text: str) -> str:
    """
    Expand ultra-compressed text back to full form.

    The symbols '&', '|', '>=', '<=' and '!=' are turned back into words, then
    every whitespace-separated token that is a known abbreviation is replaced
    by its full word. The abbreviation lookup is case-sensitive. When several
    words in ``CORE_ABBR`` share one abbreviation, the word listed last is
    the one restored.
    """
    # Abbreviation -> full word; later CORE_ABBR entries overwrite earlier ones.
    full_by_abbr = {}
    for full_word, short_form in CORE_ABBR.items():
        full_by_abbr[short_form] = full_word

    # Turn symbols back into space-padded words, in this fixed order.
    symbol_words = (
        ('&', ' and '),
        ('|', ' or '),
        ('>=', ' greater than or equal '),
        ('<=', ' less than or equal '),
        ('!=', ' not equal '),
    )
    for symbol, words in symbol_words:
        text = text.replace(symbol, words)

    restored = []
    for token in text.split():
        # Set one trailing punctuation character aside for the lookup.
        tail = ''
        if token and token[-1] in '.,;:!?':
            tail = token[-1]
            token = token[:-1]

        restored.append(full_by_abbr.get(token, token) + tail)

    return ' '.join(restored)
|