moltlang-0.1.0-py3-none-any.whl
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- mcp_server/__init__.py +13 -0
- mcp_server/endpoints.py +177 -0
- mcp_server/server.py +303 -0
- moltlang/__init__.py +64 -0
- moltlang/cli.py +247 -0
- moltlang/config.py +86 -0
- moltlang/openclaw/__init__.py +11 -0
- moltlang/openclaw/skill.py +77 -0
- moltlang/tokens.py +311 -0
- moltlang/training/__init__.py +12 -0
- moltlang/training/data_gen.py +118 -0
- moltlang/training/distill.py +86 -0
- moltlang/translator.py +965 -0
- moltlang/validator.py +378 -0
- moltlang-0.1.0.dist-info/METADATA +187 -0
- moltlang-0.1.0.dist-info/RECORD +20 -0
- moltlang-0.1.0.dist-info/WHEEL +5 -0
- moltlang-0.1.0.dist-info/entry_points.txt +2 -0
- moltlang-0.1.0.dist-info/licenses/LICENSE +23 -0
- moltlang-0.1.0.dist-info/top_level.txt +2 -0
moltlang/translator.py
ADDED
@@ -0,0 +1,965 @@
"""
MoltLang translation module.

This module provides bidirectional translation between English (and other human languages)
and MoltLang, the AI-optimized language.
"""

from dataclasses import dataclass
from typing import Any

from moltlang.config import MoltConfig, get_config
from moltlang.tokens import Token, TokenSequence, TokenType


@dataclass
class DetectedToken:
    """Token with position information for semantic grouping."""
    token: Token
    position: int  # Character position in original text
    keyword: str  # The keyword that triggered detection


@dataclass
class TranslationResult:
    """
    Result of a translation operation.

    Attributes:
        text: The translated text
        tokens: Token sequence (for MoltLang output)
        token_count: Number of tokens used
        confidence: Translation confidence score (0.0-1.0)
        original_token_count: Original token count (for efficiency calculation)
    """

    text: str
    tokens: TokenSequence | None = None
    token_count: int = 0
    confidence: float = 0.0
    original_token_count: int = 0

    @property
    def token_efficiency(self) -> float:
        """Calculate token efficiency (reduction percentage)."""
        if self.original_token_count == 0:
            return 0.0
        return 1.0 - (self.token_count / self.original_token_count)

    def __str__(self) -> str:
        """Return the translated text."""
        return self.text


class MoltTranslator:
    """
    Translator for MoltLang.

    Handles bidirectional translation between human languages and MoltLang.
    """

    def __init__(self, config: MoltConfig | None = None):
        """
        Initialize the translator.

        Args:
            config: Optional configuration. Uses default if not provided.
        """
        self.config = config or get_config()
        self._translation_cache: dict[str, TranslationResult] = {}

    def translate_to_molt(
        self, text: str, config: MoltConfig | None = None
    ) -> TranslationResult:
        """
        Translate human language text to MoltLang.

        Args:
            text: Human language text to translate
            config: Optional configuration override

        Returns:
            TranslationResult containing the MoltLang translation

        Examples:
            >>> translator = MoltTranslator()
            >>> result = translator.translate_to_molt("Fetch data from API")
            >>> print(result.text)
            [OP:FETCH][SRC:API]
        """
        cfg = config or self.config

        # Check cache
        if cfg.enable_cache and text in self._translation_cache:
            return self._translation_cache[text]

        # Tokenize input
        original_tokens = self._count_word_tokens(text)

        # Analyze and translate
        tokens = self._analyze_and_translate(text)

        # Build result
        result = TranslationResult(
            text=str(tokens),
            tokens=tokens,
            token_count=len(tokens),
            confidence=self._calculate_confidence(text, tokens),
            original_token_count=original_tokens,
        )

        # Cache result
        if cfg.enable_cache:
            self._translation_cache[text] = result

        return result

    def translate_from_molt(
        self, molt_text: str, config: MoltConfig | None = None
    ) -> TranslationResult:
        """
        Translate MoltLang to human language text.

        Args:
            molt_text: MoltLang text to translate
            config: Optional configuration override

        Returns:
            TranslationResult containing the human language translation

        Examples:
            >>> translator = MoltTranslator()
            >>> result = translator.translate_from_molt("[OP:FETCH][SRC:API]")
            >>> print(result.text)
            Fetch data from API
        """
        cfg = config or self.config

        # Parse MoltLang tokens
        tokens = self._parse_molt_tokens(molt_text)

        # Generate human language translation
        translation = self._generate_human_translation(tokens, cfg.human_language)

        return TranslationResult(
            text=translation,
            tokens=tokens,
            token_count=len(tokens),
            confidence=self._calculate_confidence(translation, tokens),
        )

    def _count_word_tokens(self, text: str) -> int:
        """Count word tokens in text."""
        return len(text.split())

    def _analyze_and_translate(self, text: str) -> TokenSequence:
        """
        Analyze human text and generate MoltLang tokens with semantic grouping.

        LLM-Friendly: Supports multiple operations, modifiers, and parameters.
        Uses position-based semantic grouping to associate sources/returns with operations.
        """
        # Step 1: Detect all tokens with positions
        detected = self._detect_with_positions(text)

        # Step 2: Build semantic groups
        groups = self._build_semantic_groups(text, detected)

        # Step 3: Flatten to final sequence
        tokens = self._flatten_groups(detected, groups)

        # Step 4: Apply fallback rules
        return self._apply_fallback_rules(text, tokens)

    def _detect_with_positions(self, text: str) -> dict[str, list[DetectedToken]]:
        """Detect all tokens with their positions in text."""
        import re
        text_lower = text.lower()
        detected = {
            "modifiers": [],
            "control": [],
            "errors": [],
            "operations": [],
            "sources": [],
            "returns": [],
            "params": [],
            "types": [],
        }

        # MODIFIER detection
        # Check for "multiple X" where X is a source - don't add MOD:batch
        multiple_source_pattern = re.search(r'multiple\s+(?:apis?|sources?|endpoints?)', text_lower)

        mod_keywords = {
            TokenType.MOD_ASYNC: ["async", "asynchronous", "asyncronously"],
            TokenType.MOD_PARALLEL: ["parallel", "concurrent", "simultaneous"],
            TokenType.MOD_BATCH: ["batch", "bulk"],  # "multiple" excluded when followed by sources
            TokenType.MOD_CACHED: ["cache", "cached", "caching"],
        }
        for token_type, keywords in mod_keywords.items():
            # Skip MOD_BATCH for "multiple APIs" pattern
            if token_type == TokenType.MOD_BATCH and multiple_source_pattern:
                # Check if keywords would match "multiple" - skip if so
                if "multiple" in keywords:
                    continue
            for keyword in keywords:
                pos = text_lower.find(keyword)
                if pos != -1:
                    detected["modifiers"].append(DetectedToken(
                        token=Token(type=token_type),
                        position=pos,
                        keyword=keyword
                    ))
                    break

        # CONTROL FLOW detection
        has_error_context = any(err in text_lower for err in ["error", "fail", "exception"])
        # Also check for "otherwise" pattern - only CTL_ELSE if NOT in "otherwise log" pattern
        otherwise_in_log_pattern = "otherwise" in text_lower and any(word in text_lower for word in ["log", "record", "print"])

        ctl_keywords = {
            TokenType.CTL_TRY: ["try", "attempt", "attempting", "trying to", "give it a shot"],
            TokenType.CTL_CATCH: ["catch", "handle error", "on error", "except", "when error", "on failure", "error handler"],
            TokenType.CTL_FINALLY: ["finally", "cleanup", "afterwards", "always do"],
            TokenType.CTL_IF: ["if", "conditional", "when", "whenever", "in case", "depending on"],
            TokenType.CTL_ELSE: ["else", "alternative", "or else", "fallback"],  # "otherwise" excluded if followed by log
            TokenType.CTL_LOOP: ["loop", "iterate", "repeat", "cycle", "for each", "while"],
        }
        for token_type, keywords in ctl_keywords.items():
            # Skip CTL_IF in error context
            if token_type == TokenType.CTL_IF and has_error_context:
                continue
            # Skip CTL_ELSE if in "otherwise log" pattern
            if token_type == TokenType.CTL_ELSE and otherwise_in_log_pattern:
                continue
            for keyword in keywords:
                pos = text_lower.find(keyword)
                if pos != -1:
                    detected["control"].append(DetectedToken(
                        token=Token(type=token_type),
                        position=pos,
                        keyword=keyword
                    ))
                    break

        # ERROR HANDLING detection
        # Check for "on failure" pattern - don't add ERR:FAIL for this
        on_failure_pattern = "on failure" in text_lower or "on error" in text_lower

        err_keywords = {
            TokenType.ERR_RETRY: ["retry", "try again", "reattempt", "attempt again", "keep trying"],
            TokenType.ERR_LOG: ["log", "logging", "record", "write log", "log entry", "log error"],
            TokenType.ERR_FAIL: ["fail", "throw error", "raise error", "abort on error"],  # "failure" excluded if in "on failure"
            TokenType.ERR_IGNORE: ["ignore", "skip error", "continue on error", "suppress error"],
        }
        for token_type, keywords in err_keywords.items():
            for keyword in keywords:
                # Skip ERR:FAIL detection for "failure" in "failed records" or "on failure" context
                if token_type == TokenType.ERR_FAIL:
                    if "failed" in text_lower or "on failure" in text_lower:
                        # Only add if explicit "fail" keyword (not "failed" or "failure")
                        if keyword == "fail" and re.search(r'\bfail\b', text_lower):
                            pos = text_lower.find(keyword)
                            if pos != -1:
                                detected["errors"].append(DetectedToken(
                                    token=Token(type=token_type),
                                    position=pos,
                                    keyword=keyword
                                ))
                        continue
                pos = text_lower.find(keyword)
                if pos != -1:
                    detected["errors"].append(DetectedToken(
                        token=Token(type=token_type),
                        position=pos,
                        keyword=keyword
                    ))
                    break

        # OPERATION detection - support MULTIPLE operations
        # Check for "ensure X returns" pattern - this should NOT be OP:validate
        ensure_returns_pattern = re.search(r'ensure\s+(?:it\s+)?returns?', text_lower)

        op_keywords = {
            TokenType.OP_FETCH: ["fetch", "get", "retrieve", "download"],
            TokenType.OP_PARSE: ["parse", "analyze", "extract"],
            TokenType.OP_TRANSFORM: ["transform", "convert", "change"],
            TokenType.OP_SEARCH: ["search", "find", "lookup"],
            TokenType.OP_VALIDATE: ["validate", "verify"],  # "check" and "ensure" excluded in some contexts
            TokenType.OP_FILTER: ["filter", "sift", "screen"],
            TokenType.OP_AGGREGATE: ["aggregate", "combine", "merge", "summarize"],
            TokenType.OP_PROCESS: ["process", "handle"],
        }
        for token_type, keywords in op_keywords.items():
            # Skip OP_VALIDATE if in "ensure returns" context
            if token_type == TokenType.OP_VALIDATE and ensure_returns_pattern:
                # Still check for validate/verify but not check/ensure
                keywords = [k for k in keywords if k not in ["check", "ensure"]]
            for keyword in keywords:
                pos = text_lower.find(keyword)
                if pos != -1:
                    detected["operations"].append(DetectedToken(
                        token=Token(type=token_type),
                        position=pos,
                        keyword=keyword
                    ))
                    break  # Only first match per operation type

        # Default to COMPUTE if no operation found (unless question)
        if not detected["operations"]:
            if text.strip().endswith("?"):
                detected["operations"].append(DetectedToken(
                    token=Token(type=TokenType.OP_SEARCH),
                    position=0,
                    keyword="?"
                ))
            else:
                detected["operations"].append(DetectedToken(
                    token=Token(type=TokenType.OP_COMPUTE),
                    position=0,
                    keyword="compute"
                ))

        # SOURCE detection
        src_keywords = {
            TokenType.SRC_API: ["api", "endpoint", "rest", "graphql"],
            TokenType.SRC_DB: ["database", "db", "sql", "nosql"],
            TokenType.SRC_FILE: ["file", "csv", "json file", "data file"],
            TokenType.SRC_MEM: ["memory", "cache", "ram"],
        }
        for token_type, keywords in src_keywords.items():
            for keyword in keywords:
                pos = text_lower.find(keyword)
                if pos != -1:
                    detected["sources"].append(DetectedToken(
                        token=Token(type=token_type),
                        position=pos,
                        keyword=keyword
                    ))
                    break

        # PARAMETER detection - extract values with regex
        # Check for ID/key patterns first (more specific)
        param_patterns = [
            # ID detection (most specific - "ID 12345", "user ID 12345")
            (TokenType.PARAM_KEY, r'(?:user\s+)?(?:id|identifier)\s*(?:of|:|=)?\s*(\d+[\w-]*)', 'id'),
            # API key detection
            (TokenType.PARAM_KEY, r'(?:api\s+)?key\s*(?:of|:|=)?\s*["\']?([\w-]+)["\']?', 'key'),
            # Times/retry count - handle "retry X times" or "retry ... X times"
            (TokenType.PARAM_TIMES, r'(?:retry|repeat)\s+(?:.*?)?(\d+)\s+times?', 'times'),
            (TokenType.PARAM_TIMEOUT, r'timeout\s*(?:of|:|=)?\s*(\d+)\s*(?:seconds?|secs?|s)?', 'timeout'),
            (TokenType.PARAM_TIMEOUT, r'timeout\s*(?:of|:|=)?\s*(\d+)\s*(?:seconds?|secs?|s)?', 'timeout'),
            # Limit - handle "X records" or "limit X"
            (TokenType.PARAM_LIMIT, r'(?:process|batch|handle)\s+(\d+)\s+(?:records?|items?|entries?)', 'limit'),
            (TokenType.PARAM_LIMIT, r'(?:limit|max|maximum)\s*(?:of|:|=)?\s*(\d+)|(?:at\s+most)\s*(\d+)', 'limit2'),
            (TokenType.PARAM_OFFSET, r'(?:offset|skip)\s*(?:of|:|=)?\s*(\d+)', 'offset'),
            # Token/auth
            (TokenType.PARAM_TOKEN, r'(?:auth|bearer|access)?\s*token\s*(?:of|:|=)?\s*["\']?([\w.-]+)["\']?', 'token'),
            # Query (least specific - check last) - capture only up to next delimiter
            # Exclude when "search" is followed by database/api/file (sources)
            (TokenType.PARAM_QUERY, r'(?:query|find)\s+(?:for|:|=)?\s*["\']?([^"\']{1,30}?)(?:["\']|,|\.|and)\s', 'query'),
        ]
        for token_type, pattern, name in param_patterns:
            match = re.search(pattern, text_lower)
            if match:
                # Handle PARAM_LIMIT which has two groups
                if token_type == TokenType.PARAM_LIMIT:
                    value = match.group(1) if match.group(1) else match.group(2)
                    # For limit2, use the position of the matched group
                    if name == 'limit2' and match.group(2):
                        param_pos = text_lower.find(match.group(2), match.start())
                    else:
                        param_pos = match.start()
                elif name == 'id':
                    value = match.group(1)
                    param_pos = match.start()
                elif name == 'times':
                    value = match.group(1)
                    # For PARAM:TIMES, find the position of the number (not the start of "retry")
                    param_pos = text_lower.find(match.group(1), match.start())
                else:
                    value = match.group(1) if match.lastindex and match.group(1) else None
                    param_pos = match.start()
                detected["params"].append(DetectedToken(
                    token=Token(type=token_type, value=value),
                    position=param_pos,
                    keyword=match.group(0)
                ))

        # RETURN type detection - IMPORTANT: detect ALL occurrences
        ret_keywords = {
            TokenType.RET_JSON: ["json", "object"],
            TokenType.RET_TEXT: ["csv", "text", "plain"],
            TokenType.RET_LIST: ["list", "array"],
            TokenType.RET_DICT: ["dictionary", "dict", "map"],
            TokenType.RET_BOOL: ["boolean", "bool", "true", "false"],
            TokenType.RET_NUM: ["number", "numeric"],
        }
        for token_type, keywords in ret_keywords.items():
            for keyword in keywords:
                # Find ALL occurrences, not just first
                start = 0
                found = False
                while True:
                    pos = text_lower.find(keyword, start)
                    if pos == -1:
                        break
                    detected["returns"].append(DetectedToken(
                        token=Token(type=token_type),
                        position=pos,
                        keyword=keyword
                    ))
                    start = pos + 1
                    found = True
                if found:
                    break  # Only use first matching keyword set per type

        # TYPE constraint detection - only explicit "type X" patterns, not return keywords
        # Filter out TYPE tokens that overlap with RETURN tokens (avoid duplicates)
        ret_positions = {ret.position for ret in detected["returns"]}

        # Special pattern: "list of strings" or "list of <type>" should add TYPE constraint
        list_of_match = re.search(r'list\s+(?:of\s+)?(?:strings?|ints?|floats?|strs?|texts?|booleans?)', text_lower)
        if list_of_match and "list" not in {rt.position for rt in detected["returns"]}:
            # Extract the type from the match
            matched_text = list_of_match.group(0).lower()
            if "string" in matched_text or "str" in matched_text or "text" in matched_text:
                # Add TYPE:str at the position of the type word
                type_pos = text_lower.find("string", list_of_match.start())
                if type_pos == -1:
                    type_pos = text_lower.find("str", list_of_match.start())
                if type_pos != -1:
                    detected["types"].append(DetectedToken(
                        token=Token(type=TokenType.TYPE_STR),
                        position=type_pos,
                        keyword="string"
                    ))

        type_keywords = {
            TokenType.TYPE_STR: ["type str", "type string", "string type", "typed list of strings", r'\btype\s*[:=]\s*str(?:ing)?\b'],
            TokenType.TYPE_INT: ["type int", "integer type", "as integer", "to integer", r'\btype\s*[:=]\s*int(?:eger)?\b'],
            TokenType.TYPE_FLOAT: ["type float", "float type", "decimal type", r'\btype\s*[:=]\s*float\b'],
            TokenType.TYPE_BOOL: ["type bool", "boolean type", "as boolean", r'\btype\s*[:=]\s*bool(?:ean)?\b'],
            TokenType.TYPE_LIST: ["type list", "list type", "array type", "as list", "to list", r'\btype\s*[:=]\s*list\b'],
            TokenType.TYPE_DICT: ["type dict", "dict type", "map type", "as dict", "to dict", r'\btype\s*[:=]\s*dict\b'],
            TokenType.TYPE_ANY: ["type any", "any type", r'\btype\s*[:=]\s*any\b'],
        }
        for token_type, keywords in type_keywords.items():
            for keyword in keywords:
                if keyword.startswith(r'\b'):  # Regex pattern
                    match = re.search(keyword, text_lower)
                    if match and match.start() not in ret_positions:
                        detected["types"].append(DetectedToken(
                            token=Token(type=token_type),
                            position=match.start(),
                            keyword=keyword
                        ))
                        break
                else:
                    pos = text_lower.find(keyword)
                    if pos != -1 and pos not in ret_positions:
                        detected["types"].append(DetectedToken(
                            token=Token(type=token_type),
                            position=pos,
                            keyword=keyword
                        ))
                        break

        return detected

    def _build_semantic_groups(self, text: str, detected: dict) -> list[dict]:
        """Build operation groups based on text position."""
        import re
        text_lower = text.lower()

        # Sort operations by position
        operations = sorted(detected["operations"], key=lambda x: x.position)
        sources = sorted(detected["sources"], key=lambda x: x.position)
        # Deduplicate returns by token type (keep first occurrence of each type)
        unique_returns = {}
        for ret in detected["returns"]:
            if ret.token.type not in unique_returns:
                unique_returns[ret.token.type] = ret
        returns = sorted(unique_returns.values(), key=lambda x: x.position)
        params = sorted(detected["params"], key=lambda x: x.position)

        groups = []
        used_sources = set()
        used_returns = set()
        used_params = set()

        # Detect "return as <type>" or "return <type>" pattern for final return
        final_return = None
        final_return_pos = len(text_lower)
        return_match = re.search(r'return\s+(?:as\s+)?(?:a\s+)?(\w+)', text_lower)
        if return_match:
            return_type = return_match.group(1)
            # Map return type to token
            return_map = {
                "json": TokenType.RET_JSON, "object": TokenType.RET_JSON,
                "csv": TokenType.RET_TEXT, "text": TokenType.RET_TEXT, "plain": TokenType.RET_TEXT,
                "list": TokenType.RET_LIST, "array": TokenType.RET_LIST,
                "dict": TokenType.RET_DICT, "dictionary": TokenType.RET_DICT, "map": TokenType.RET_DICT,
                "bool": TokenType.RET_BOOL, "boolean": TokenType.RET_BOOL,
                "number": TokenType.RET_NUM, "numeric": TokenType.RET_NUM,
            }
            if return_type in return_map:
                final_return = Token(type=return_map[return_type])
                final_return_pos = return_match.start()

        for i, op in enumerate(operations):
            group = {
                "operation": op.token,
                "source": None,
                "returns": [],
                "params": []
            }

            # Find nearest source BEFORE this operation (or first op gets first source)
            if i == 0 and sources:
                # First operation gets the source
                group["source"] = sources[0].token
                used_sources.add(0)
            else:
                # Check for source between previous op and this one
                prev_pos = operations[i-1].position if i > 0 else 0
                for j, src in enumerate(sources):
                    if j not in used_sources and prev_pos < src.position <= op.position:
                        group["source"] = src.token
                        used_sources.add(j)
                        break

            # Find returns that are "near" this operation
            next_op_pos = operations[i+1].position if i+1 < len(operations) else len(text_lower)

            for j, ret in enumerate(returns):
                if j not in used_returns:
                    # Skip if this is the final return (belongs to last operation)
                    if abs(ret.position - final_return_pos) < 5:
                        continue
                    # Check for explicit "to <type>" pattern with this operation
                    if f"{op.keyword} to" in text_lower or f"{op.keyword}s to" in text_lower:
                        to_pos = text_lower.find(" to ", op.position)
                        if to_pos != -1 and to_pos < next_op_pos:
                            if to_pos < ret.position < to_pos + 20:
                                group["returns"].append(ret.token)
                                used_returns.add(j)
                    # Otherwise, only assign return to first operation if it's very close
                    elif i == 0 and ret.position < op.position + 30:
                        # First operation gets returns that appear early
                        group["returns"].append(ret.token)
                        used_returns.add(j)

            # Find params that are "near" this operation
            # Exception: PARAM:times that comes after ERR:retry should NOT be added to operation group
            err_retry_positions = [e.position for e in detected["errors"] if e.token.type == TokenType.ERR_RETRY]

            for j, param in enumerate(params):
                if j not in used_params:
                    # Skip PARAM:times if it comes after ERR:retry (belongs with error handling)
                    if param.token.type == TokenType.PARAM_TIMES:
                        # Check if there's an ERR:retry before this param
                        if any(err_pos < param.position for err_pos in err_retry_positions):
                            # Mark as used so it doesn't get added by "Handle remaining params"
                            used_params.add(j)
                            continue  # Don't add to operation group
                    if op.position <= param.position < next_op_pos:
                        group["params"].append(param.token)
                        used_params.add(j)

            groups.append(group)

        # Assign final return to last operation (but only if not already added)
        if groups and final_return:
            # Check if this return type is already in the last group
            last_return_types = {r.type for r in groups[-1]["returns"]}
            if final_return.type not in last_return_types:
                groups[-1]["returns"].append(final_return)
            # Mark returns near final_return_pos as used
            for j, ret in enumerate(returns):
                if abs(ret.position - final_return_pos) < 5:
                    used_returns.add(j)

        # Handle remaining returns - assign to last operation
        if groups:
            for j, ret in enumerate(returns):
                if j not in used_returns:
                    # Check if this return type is already in the last group
                    last_return_types = {r.type for r in groups[-1]["returns"]}
                    if ret.token.type not in last_return_types:
                        groups[-1]["returns"].append(ret.token)

        # Handle remaining params - assign to last operation
        if groups:
            for j, param in enumerate(params):
                if j not in used_params:
                    groups[-1]["params"].append(param.token)

        return groups

    def _flatten_groups(self, detected: dict, groups: list[dict]) -> TokenSequence:
        """Flatten semantic groups into final token sequence.

        Token ordering follows semantic flow:
        1. CTL:try (if present) - comes first
        2. MOD:async, MOD:batch, MOD:parallel (operation modifiers, sorted by type priority)
        3. Operation groups (OP + SRC + PARAM + RET)
        4. MOD:cached (can come after operation)
        5. CTL:catch, CTL:finally (error handling blocks)
        6. ERR:retry, ERR:log (error handling actions)
        7. TYPE constraints
        """
        tokens = TokenSequence()

        # Separate control flow tokens
        try_tokens = [dt for dt in detected["control"] if dt.token.type == TokenType.CTL_TRY]
        catch_tokens = [dt for dt in detected["control"] if dt.token.type == TokenType.CTL_CATCH]
        finally_tokens = [dt for dt in detected["control"] if dt.token.type == TokenType.CTL_FINALLY]
        other_ctl_tokens = [dt for dt in detected["control"] if dt.token.type not in [TokenType.CTL_TRY, TokenType.CTL_CATCH, TokenType.CTL_FINALLY]]

        # Separate modifiers - cached can come after operation, sort others by text position
        cached_mods = [dt for dt in detected["modifiers"] if dt.token.type == TokenType.MOD_CACHED]
        other_mods = [dt for dt in detected["modifiers"] if dt.token.type != TokenType.MOD_CACHED]
        # Sort modifiers by text position (maintains order as they appear in text)
        other_mods.sort(key=lambda dt: dt.position)

        # Separate error tokens - fail should only be explicit
        # Also extract any PARAM:times that should come with ERR:retry
        err_tokens = []
        times_params = []
        for dt in detected["errors"]:
            if dt.token.type == TokenType.ERR_RETRY:
                err_tokens.append(dt)
                # Find associated PARAM:times
                for pt in detected["params"]:
                    if pt.token.type == TokenType.PARAM_TIMES and pt.position > dt.position:
                        times_params.append(pt)
                        break
            else:
                err_tokens.append(dt)
        # Remove times params from regular params list (they'll be added with error tokens)
        detected_params_for_groups = [pt for pt in detected["params"] if pt not in times_params]

        # Collect all return types used in groups to filter out duplicate TYPE tokens
        used_return_types = set()
        for group in groups:
            for ret in group["returns"]:
                ret_to_type_map = {
                    TokenType.RET_JSON: TokenType.TYPE_STR,
                    TokenType.RET_TEXT: TokenType.TYPE_STR,
                    TokenType.RET_LIST: TokenType.TYPE_LIST,
                    TokenType.RET_DICT: TokenType.TYPE_DICT,
                    TokenType.RET_BOOL: TokenType.TYPE_BOOL,
                    TokenType.RET_NUM: TokenType.TYPE_INT,
                }
                if ret.type in ret_to_type_map:
                    used_return_types.add(ret_to_type_map[ret.type])

        # 1. CTL:try first (if present)
        for dt in try_tokens:
            tokens.add(dt.token)

        # 2. Other modifiers (batch, parallel, async) - sorted by priority
        for dt in other_mods:
            tokens.add(dt.token)

        # 3. Other control flow (if, else, loop)
        for dt in other_ctl_tokens:
            tokens.add(dt.token)

        # 4. Error handling (retry, log, etc.) - but save some for after catch
        # Split: retry/log can go before or after catch depending on context
        main_err_tokens = []
        post_catch_err_tokens = []
        for dt in err_tokens:
            # ERR:retry and ERR:log typically come after CTL:catch
            if dt.token.type in [TokenType.ERR_RETRY, TokenType.ERR_LOG]:
                post_catch_err_tokens.append(dt)
            else:
                main_err_tokens.append(dt)

        # 5. Operation groups in order
        for group in groups:
            tokens.add(group["operation"])
            if group["source"]:
                tokens.add(group["source"])
            for param in group["params"]:
                tokens.add(param)
            for ret in group["returns"]:
                tokens.add(ret)

        # 6. MOD:cached (can come after operation)
        for dt in cached_mods:
            tokens.add(dt.token)

        # 7. CTL:catch - add implicitly if we have try + error handling but no explicit catch
        has_try = len(try_tokens) > 0
        has_explicit_catch = len(catch_tokens) > 0
        has_error_handling = len(post_catch_err_tokens) > 0

        if has_try and has_error_handling and not has_explicit_catch:
            # Add implicit CTL:catch
            tokens.add(Token(type=TokenType.CTL_CATCH))

        for dt in catch_tokens:
            tokens.add(dt.token)
        for dt in finally_tokens:
            tokens.add(dt.token)

        # 8. Post-catch error tokens (retry, log) and their associated params
        for dt in post_catch_err_tokens:
            tokens.add(dt.token)
            # Add PARAM:times if it's associated with this ERR:retry
            if dt.token.type == TokenType.ERR_RETRY:
                for pt in detected["params"]:
                    if pt.token.type == TokenType.PARAM_TIMES and pt.position > dt.position:
                        tokens.add(pt.token)
                        break

        # 9. Other error tokens
        for dt in main_err_tokens:
            tokens.add(dt.token)

        # 10. Type constraints at end - filter out if return type already used, also deduplicate
        seen_types = set()
        for dt in detected["types"]:
            if dt.token.type not in used_return_types and dt.token.type not in seen_types:
                tokens.add(dt.token)
                seen_types.add(dt.token.type)

        return tokens

    def _parse_molt_tokens(self, molt_text: str) -> TokenSequence:
        """
        Parse MoltLang text into tokens.

        LLM-friendly: Case-insensitive parsing for flexibility.
        Accepts both [RET:JSON] and [RET:json] - normalizes to enum values.

        Args:
            molt_text: MoltLang string representation

        Returns:
            TokenSequence containing parsed tokens
        """
        tokens = TokenSequence()
        import re

        # Find all token patterns like [TYPE:VALUE] - case-insensitive
        pattern = r"\[([a-zA-Z]+):([a-zA-Z_0-9]+)(?:=([^\]]+))?\]"
        matches = re.findall(pattern, molt_text)

        for category, value, param in matches:
            # Normalize to uppercase for enum lookup
            token_type_str = f"{category.upper()}_{value.upper()}"
            try:
                token_type = TokenType[token_type_str]
                token = Token(type=token_type, value=param if param else None)
                tokens.add(token)
            except KeyError:
                # Unknown token type, skip or handle as custom
                pass

        return tokens

    def _generate_human_translation(
        self, tokens: TokenSequence, target_language: str = "en"
    ) -> str:
        """
        Generate human language translation from MoltLang tokens.

        Args:
            tokens: TokenSequence to translate
            target_language: Target human language (default: English)

        Returns:
            Human language translation
        """
        parts: list[str] = []

        for token in tokens.tokens:
            # Operation translations
            if token.type == TokenType.OP_FETCH:
                parts.append("Fetch")
            elif token.type == TokenType.OP_PARSE:
                parts.append("Parse")
            elif token.type == TokenType.OP_TRANSFORM:
                parts.append("Transform")
            elif token.type == TokenType.OP_SEARCH:
                parts.append("Search")
            elif token.type == TokenType.OP_VALIDATE:
                parts.append("Validate")
            elif token.type == TokenType.OP_FILTER:
                parts.append("Filter")
            elif token.type == TokenType.OP_COMPUTE:
                parts.append("Compute")

            # Source translations
            elif token.type == TokenType.SRC_API:
                parts.append("data from API")
            elif token.type == TokenType.SRC_DB:
                parts.append("data from database")
            elif token.type == TokenType.SRC_FILE:
                parts.append("data from file")
            elif token.type == TokenType.SRC_MEM:
                parts.append("data from memory")

            # Return type translations
            elif token.type == TokenType.RET_JSON:
                parts.append("return JSON")
            elif token.type == TokenType.RET_TEXT:
                parts.append("return text")
            elif token.type == TokenType.RET_BOOL:
                parts.append("return boolean")
            elif token.type == TokenType.RET_NUM:
                parts.append("return number")
            elif token.type == TokenType.RET_LIST:
                parts.append("return list")
            elif token.type == TokenType.RET_DICT:
                parts.append("return dictionary")

        return " ".join(parts) if parts else "Empty operation"

    def _calculate_confidence(self, original: str, tokens: TokenSequence) -> float:
        """
        Calculate translation confidence score.

        Enhanced to consider semantic completeness, not just token count.

        Args:
            original: Original text
            tokens: Translated token sequence

        Returns:
            Confidence score (0.0-1.0)
        """
        if len(tokens) == 0:
            return 0.0

        # Base score from token count (capped at 0.7 for 3+ tokens)
        base_score = min(0.7, 0.5 + (len(tokens) * 0.1))

        # Semantic completeness bonus (only if complete)
        has_operation = any("OP:" in t.type.value for t in tokens.tokens)
        has_source = any("SRC:" in t.type.value for t in tokens.tokens)
        has_return = any("RET:" in t.type.value for t in tokens.tokens)

        completeness = 0.0
        # Only give bonus for complete operations (operation + source/return)
        if has_operation and (has_source or has_return):
            completeness += 0.25
        # Extra bonus for having both source AND return
        if has_operation and has_source and has_return:
            completeness += 0.15
        # Small bonus for having control flow or error handling
        has_ctl = any("CTL:" in t.type.value for t in tokens.tokens)
        has_err = any("ERR:" in t.type.value for t in tokens.tokens)
        if has_ctl or has_err:
            completeness += 0.05

        return round(min(1.0, base_score + completeness), 2)

    def _apply_fallback_rules(self, text: str, tokens: TokenSequence) -> TokenSequence:
        """
        Apply fallback rules when direct matching fails.

        This uses heuristics to infer likely tokens from context.

        Args:
            text: Original human language text
            tokens: Current token sequence

        Returns:
            Potentially modified token sequence
        """
        import re
        text_lower = text.lower()

        # Fallback 1: "safe" or "careful" implies error handling
        if any(word in text_lower for word in ["safe", "careful", "graceful", "handle"]):
            if not any(t.type.value.startswith("CTL:") for t in tokens.tokens):
                tokens.add(Token(type=TokenType.CTL_TRY))
                tokens.add(Token(type=TokenType.CTL_CATCH))

        # Fallback 2: "ensure" or "guarantee" implies validation
        # Exception: "ensure [it] returns" describes return type, not validation
        ensure_returns_pattern = re.search(r'ensure\s+(?:it\s+)?returns?', text_lower)
        if any(word in text_lower for word in ["ensure", "guarantee", "verify"]):
            # Skip if "ensure returns" pattern (describes return type, not validation)
            if not ensure_returns_pattern:
                if not any(t.type == TokenType.OP_VALIDATE for t in tokens.tokens):
                    tokens.add(Token(type=TokenType.OP_VALIDATE))

        # Fallback 3: Questions default to search
        if text.strip().endswith("?"):
            if not any(t.type.value.startswith("OP:") for t in tokens.tokens):
                tokens.add(Token(type=TokenType.OP_SEARCH))
        # Fallback 3b: Questions with source but no operation
        if text.strip().endswith("?"):
            has_source = any(t.type.value.startswith("SRC:") for t in tokens.tokens)
            has_operation = any(t.type.value.startswith("OP:") for t in tokens.tokens)
            if has_source and not has_operation:
                tokens.add(Token(type=TokenType.OP_SEARCH))

        return tokens


# Convenience functions for direct usage

_translator_instance: MoltTranslator | None = None


def _get_translator() -> MoltTranslator:
    """Get or create the shared translator instance."""
    global _translator_instance
    if _translator_instance is None:
        _translator_instance = MoltTranslator()
    return _translator_instance


def translate_to_molt(text: str, config: MoltConfig | None = None) -> str:
    """
    Translate human language text to MoltLang.

    This is a convenience function that uses a shared translator instance.

    Args:
        text: Human language text to translate
        config: Optional configuration override

    Returns:
        MoltLang string representation

    Examples:
        >>> from moltlang import translate_to_molt
        >>> molt = translate_to_molt("Fetch data from API and return JSON")
        >>> print(molt)
        [OP:FETCH][SRC:API][RET:JSON]
    """
    translator = _get_translator()
    result = translator.translate_to_molt(text, config)
    return result.text


def translate_from_molt(molt_text: str, config: MoltConfig | None = None) -> str:
    """
    Translate MoltLang to human language text.

    This is a convenience function that uses a shared translator instance.

    Args:
        molt_text: MoltLang text to translate
        config: Optional configuration override

    Returns:
        Human language translation

    Examples:
        >>> from moltlang import translate_from_molt
        >>> english = translate_from_molt("[OP:FETCH][SRC:API][RET:JSON]")
        >>> print(english)
        Fetch data from API return JSON
    """
    translator = _get_translator()
    result = translator.translate_from_molt(molt_text, config)
    return result.text
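A minimal usage sketch of the module shown above, assuming the wheel is installed as moltlang; the phrases mirror the docstring examples, and the exact token output depends on the keyword heuristics in _detect_with_positions.

# Illustrative only: output shown as "e.g." because the heuristics drive the result.
from moltlang.translator import MoltTranslator, translate_from_molt, translate_to_molt

translator = MoltTranslator()
result = translator.translate_to_molt("Fetch data from API and return JSON")
print(result.text)              # e.g. [OP:FETCH][SRC:API][RET:JSON]
print(result.confidence)        # heuristic confidence score, 0.0-1.0
print(result.token_efficiency)  # reduction relative to the original word count

# The module-level convenience wrappers return plain strings.
molt = translate_to_molt("Fetch data from API and return JSON")
print(translate_from_molt(molt))  # e.g. "Fetch data from API return JSON"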