moltlang 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
moltlang/translator.py ADDED
@@ -0,0 +1,965 @@
1
+ """
2
+ MoltLang translation module.
3
+
4
+ This module provides bidirectional translation between English (and other human languages)
5
+ and MoltLang, the AI-optimized language.
6
+ """
7
+
8
+ from dataclasses import dataclass
9
+ from typing import Any
10
+
11
+ from moltlang.config import MoltConfig, get_config
12
+ from moltlang.tokens import Token, TokenSequence, TokenType
13
+
14
+
15
+ @dataclass
16
+ class DetectedToken:
17
+ """Token with position information for semantic grouping."""
18
+ token: Token
19
+ position: int # Character position in original text
20
+ keyword: str # The keyword that triggered detection
21
+
22
+
23
+ @dataclass
24
+ class TranslationResult:
25
+ """
26
+ Result of a translation operation.
27
+
28
+ Attributes:
29
+ text: The translated text
30
+ tokens: Token sequence (for MoltLang output)
31
+ token_count: Number of tokens used
32
+ confidence: Translation confidence score (0.0-1.0)
33
+ original_token_count: Original token count (for efficiency calculation)
34
+ """
35
+
36
+ text: str
37
+ tokens: TokenSequence | None = None
38
+ token_count: int = 0
39
+ confidence: float = 0.0
40
+ original_token_count: int = 0
41
+
42
+ @property
43
+ def token_efficiency(self) -> float:
44
+ """Calculate token efficiency (reduction percentage)."""
45
+ if self.original_token_count == 0:
46
+ return 0.0
47
+ return 1.0 - (self.token_count / self.original_token_count)
48
+
49
+ def __str__(self) -> str:
50
+ """Return the translated text."""
51
+ return self.text
52
+
53
+
54
+ class MoltTranslator:
55
+ """
56
+ Translator for MoltLang.
57
+
58
+ Handles bidirectional translation between human languages and MoltLang.
59
+ """
60
+
61
+ def __init__(self, config: MoltConfig | None = None):
62
+ """
63
+ Initialize the translator.
64
+
65
+ Args:
66
+ config: Optional configuration. Uses default if not provided.
67
+ """
68
+ self.config = config or get_config()
69
+ self._translation_cache: dict[str, TranslationResult] = {}
70
+
71
+ def translate_to_molt(
72
+ self, text: str, config: MoltConfig | None = None
73
+ ) -> TranslationResult:
74
+ """
75
+ Translate human language text to MoltLang.
76
+
77
+ Args:
78
+ text: Human language text to translate
79
+ config: Optional configuration override
80
+
81
+ Returns:
82
+ TranslationResult containing the MoltLang translation
83
+
84
+ Examples:
85
+ >>> translator = MoltTranslator()
86
+ >>> result = translator.translate_to_molt("Fetch data from API")
87
+ >>> print(result.text)
88
+ [OP:FETCH][SRC:API]
89
+ """
90
+ cfg = config or self.config
91
+
92
+ # Check cache
93
+ if cfg.enable_cache and text in self._translation_cache:
94
+ return self._translation_cache[text]
95
+
96
+ # Tokenize input
97
+ original_tokens = self._count_word_tokens(text)
98
+
99
+ # Analyze and translate
100
+ tokens = self._analyze_and_translate(text)
101
+
102
+ # Build result
103
+ result = TranslationResult(
104
+ text=str(tokens),
105
+ tokens=tokens,
106
+ token_count=len(tokens),
107
+ confidence=self._calculate_confidence(text, tokens),
108
+ original_token_count=original_tokens,
109
+ )
110
+
111
+ # Cache result
112
+ if cfg.enable_cache:
113
+ self._translation_cache[text] = result
114
+
115
+ return result
116
+
117
+ def translate_from_molt(
118
+ self, molt_text: str, config: MoltConfig | None = None
119
+ ) -> TranslationResult:
120
+ """
121
+ Translate MoltLang to human language text.
122
+
123
+ Args:
124
+ molt_text: MoltLang text to translate
125
+ config: Optional configuration override
126
+
127
+ Returns:
128
+ TranslationResult containing the human language translation
129
+
130
+ Examples:
131
+ >>> translator = MoltTranslator()
132
+ >>> result = translator.translate_from_molt("[OP:FETCH][SRC:API]")
133
+ >>> print(result.text)
134
+ Fetch data from API
135
+ """
136
+ cfg = config or self.config
137
+
138
+ # Parse MoltLang tokens
139
+ tokens = self._parse_molt_tokens(molt_text)
140
+
141
+ # Generate human language translation
142
+ translation = self._generate_human_translation(tokens, cfg.human_language)
143
+
144
+ return TranslationResult(
145
+ text=translation,
146
+ tokens=tokens,
147
+ token_count=len(tokens),
148
+ confidence=self._calculate_confidence(translation, tokens),
149
+ )
150
+
151
+ def _count_word_tokens(self, text: str) -> int:
152
+ """Count word tokens in text."""
153
+ return len(text.split())
154
+
155
+ def _analyze_and_translate(self, text: str) -> TokenSequence:
156
+ """
157
+ Analyze human text and generate MoltLang tokens with semantic grouping.
158
+
159
+ LLM-Friendly: Supports multiple operations, modifiers, and parameters.
160
+ Uses position-based semantic grouping to associate sources/returns with operations.
161
+ """
162
+ # Step 1: Detect all tokens with positions
163
+ detected = self._detect_with_positions(text)
164
+
165
+ # Step 2: Build semantic groups
166
+ groups = self._build_semantic_groups(text, detected)
167
+
168
+ # Step 3: Flatten to final sequence
169
+ tokens = self._flatten_groups(detected, groups)
170
+
171
+ # Step 4: Apply fallback rules
172
+ return self._apply_fallback_rules(text, tokens)
173
+
174
+ def _detect_with_positions(self, text: str) -> dict[str, list[DetectedToken]]:
175
+ """Detect all tokens with their positions in text."""
176
+ import re
177
+ text_lower = text.lower()
178
+ detected = {
179
+ "modifiers": [],
180
+ "control": [],
181
+ "errors": [],
182
+ "operations": [],
183
+ "sources": [],
184
+ "returns": [],
185
+ "params": [],
186
+ "types": [],
187
+ }
188
+
189
+ # MODIFIER detection
190
+ # Check for "multiple X" where X is a source - don't add MOD:batch
191
+ multiple_source_pattern = re.search(r'multiple\s+(?:apis?|sources?|endpoints?)', text_lower)
192
+
193
+ mod_keywords = {
194
+ TokenType.MOD_ASYNC: ["async", "asynchronous", "asyncronously"],
195
+ TokenType.MOD_PARALLEL: ["parallel", "concurrent", "simultaneous"],
196
+ TokenType.MOD_BATCH: ["batch", "bulk"], # "multiple" excluded when followed by sources
197
+ TokenType.MOD_CACHED: ["cache", "cached", "caching"],
198
+ }
199
+ for token_type, keywords in mod_keywords.items():
200
+ # Skip MOD_BATCH for "multiple APIs" pattern
201
+ if token_type == TokenType.MOD_BATCH and multiple_source_pattern:
202
+ # Check if keywords would match "multiple" - skip if so
203
+ if "multiple" in keywords:
204
+ continue
205
+ for keyword in keywords:
206
+ pos = text_lower.find(keyword)
207
+ if pos != -1:
208
+ detected["modifiers"].append(DetectedToken(
209
+ token=Token(type=token_type),
210
+ position=pos,
211
+ keyword=keyword
212
+ ))
213
+ break
214
+
215
+ # CONTROL FLOW detection
216
+ has_error_context = any(err in text_lower for err in ["error", "fail", "exception"])
217
+ # Also check for "otherwise" pattern - only CTL_ELSE if NOT in "otherwise log" pattern
218
+ otherwise_in_log_pattern = "otherwise" in text_lower and any(word in text_lower for word in ["log", "record", "print"])
219
+
220
+ ctl_keywords = {
221
+ TokenType.CTL_TRY: ["try", "attempt", "attempting", "trying to", "give it a shot"],
222
+ TokenType.CTL_CATCH: ["catch", "handle error", "on error", "except", "when error", "on failure", "error handler"],
223
+ TokenType.CTL_FINALLY: ["finally", "cleanup", "afterwards", "always do"],
224
+ TokenType.CTL_IF: ["if", "conditional", "when", "whenever", "in case", "depending on"],
225
+ TokenType.CTL_ELSE: ["else", "alternative", "or else", "fallback"], # "otherwise" excluded if followed by log
226
+ TokenType.CTL_LOOP: ["loop", "iterate", "repeat", "cycle", "for each", "while"],
227
+ }
228
+ for token_type, keywords in ctl_keywords.items():
229
+ # Skip CTL_IF in error context
230
+ if token_type == TokenType.CTL_IF and has_error_context:
231
+ continue
232
+ # Skip CTL_ELSE if in "otherwise log" pattern
233
+ if token_type == TokenType.CTL_ELSE and otherwise_in_log_pattern:
234
+ continue
235
+ for keyword in keywords:
236
+ pos = text_lower.find(keyword)
237
+ if pos != -1:
238
+ detected["control"].append(DetectedToken(
239
+ token=Token(type=token_type),
240
+ position=pos,
241
+ keyword=keyword
242
+ ))
243
+ break
244
+
245
+ # ERROR HANDLING detection
246
+ # Check for "on failure" pattern - don't add ERR:FAIL for this
247
+ on_failure_pattern = "on failure" in text_lower or "on error" in text_lower
248
+
249
+ err_keywords = {
250
+ TokenType.ERR_RETRY: ["retry", "try again", "reattempt", "attempt again", "keep trying"],
251
+ TokenType.ERR_LOG: ["log", "logging", "record", "write log", "log entry", "log error"],
252
+ TokenType.ERR_FAIL: ["fail", "throw error", "raise error", "abort on error"], # "failure" excluded if in "on failure"
253
+ TokenType.ERR_IGNORE: ["ignore", "skip error", "continue on error", "suppress error"],
254
+ }
255
+ for token_type, keywords in err_keywords.items():
256
+ for keyword in keywords:
257
+ # Skip ERR:FAIL detection for "failure" in "failed records" or "on failure" context
258
+ if token_type == TokenType.ERR_FAIL:
259
+ if "failed" in text_lower or "on failure" in text_lower:
260
+ # Only add if explicit "fail" keyword (not "failed" or "failure")
261
+ if keyword == "fail" and re.search(r'\bfail\b', text_lower):
262
+ pos = text_lower.find(keyword)
263
+ if pos != -1:
264
+ detected["errors"].append(DetectedToken(
265
+ token=Token(type=token_type),
266
+ position=pos,
267
+ keyword=keyword
268
+ ))
269
+ continue
270
+ pos = text_lower.find(keyword)
271
+ if pos != -1:
272
+ detected["errors"].append(DetectedToken(
273
+ token=Token(type=token_type),
274
+ position=pos,
275
+ keyword=keyword
276
+ ))
277
+ break
278
+
279
+ # OPERATION detection - support MULTIPLE operations
280
+ # Check for "ensure X returns" pattern - this should NOT be OP:validate
281
+ ensure_returns_pattern = re.search(r'ensure\s+(?:it\s+)?returns?', text_lower)
282
+
283
+ op_keywords = {
284
+ TokenType.OP_FETCH: ["fetch", "get", "retrieve", "download"],
285
+ TokenType.OP_PARSE: ["parse", "analyze", "extract"],
286
+ TokenType.OP_TRANSFORM: ["transform", "convert", "change"],
287
+ TokenType.OP_SEARCH: ["search", "find", "lookup"],
288
+ TokenType.OP_VALIDATE: ["validate", "verify"], # "check" and "ensure" excluded in some contexts
289
+ TokenType.OP_FILTER: ["filter", "sift", "screen"],
290
+ TokenType.OP_AGGREGATE: ["aggregate", "combine", "merge", "summarize"],
291
+ TokenType.OP_PROCESS: ["process", "handle"],
292
+ }
293
+ for token_type, keywords in op_keywords.items():
294
+ # Skip OP_VALIDATE if in "ensure returns" context
295
+ if token_type == TokenType.OP_VALIDATE and ensure_returns_pattern:
296
+ # Still check for validate/verify but not check/ensure
297
+ keywords = [k for k in keywords if k not in ["check", "ensure"]]
298
+ for keyword in keywords:
299
+ pos = text_lower.find(keyword)
300
+ if pos != -1:
301
+ detected["operations"].append(DetectedToken(
302
+ token=Token(type=token_type),
303
+ position=pos,
304
+ keyword=keyword
305
+ ))
306
+ break # Only first match per operation type
307
+
308
+ # Default to COMPUTE if no operation found (unless question)
309
+ if not detected["operations"]:
310
+ if text.strip().endswith("?"):
311
+ detected["operations"].append(DetectedToken(
312
+ token=Token(type=TokenType.OP_SEARCH),
313
+ position=0,
314
+ keyword="?"
315
+ ))
316
+ else:
317
+ detected["operations"].append(DetectedToken(
318
+ token=Token(type=TokenType.OP_COMPUTE),
319
+ position=0,
320
+ keyword="compute"
321
+ ))
322
+
323
+ # SOURCE detection
324
+ src_keywords = {
325
+ TokenType.SRC_API: ["api", "endpoint", "rest", "graphql"],
326
+ TokenType.SRC_DB: ["database", "db", "sql", "nosql"],
327
+ TokenType.SRC_FILE: ["file", "csv", "json file", "data file"],
328
+ TokenType.SRC_MEM: ["memory", "cache", "ram"],
329
+ }
330
+ for token_type, keywords in src_keywords.items():
331
+ for keyword in keywords:
332
+ pos = text_lower.find(keyword)
333
+ if pos != -1:
334
+ detected["sources"].append(DetectedToken(
335
+ token=Token(type=token_type),
336
+ position=pos,
337
+ keyword=keyword
338
+ ))
339
+ break
340
+
341
+ # PARAMETER detection - extract values with regex
342
+ # Check for ID/key patterns first (more specific)
343
+ param_patterns = [
344
+ # ID detection (most specific - "ID 12345", "user ID 12345")
345
+ (TokenType.PARAM_KEY, r'(?:user\s+)?(?:id|identifier)\s*(?:of|:|=)?\s*(\d+[\w-]*)', 'id'),
346
+ # API key detection
347
+ (TokenType.PARAM_KEY, r'(?:api\s+)?key\s*(?:of|:|=)?\s*["\']?([\w-]+)["\']?', 'key'),
348
+ # Times/retry count - handle "retry X times" or "retry ... X times"
349
+ (TokenType.PARAM_TIMES, r'(?:retry|repeat)\s+(?:.*?)?(\d+)\s+times?', 'times'),
350
+ (TokenType.PARAM_TIMEOUT, r'timeout\s*(?:of|:|=)?\s*(\d+)\s*(?:seconds?|secs?|s)?', 'timeout'),
352
+ # Limit - handle "X records" or "limit X"
353
+ (TokenType.PARAM_LIMIT, r'(?:process|batch|handle)\s+(\d+)\s+(?:records?|items?|entries?)', 'limit'),
354
+ (TokenType.PARAM_LIMIT, r'(?:limit|max|maximum)\s*(?:of|:|=)?\s*(\d+)|(?:at\s+most)\s*(\d+)', 'limit2'),
355
+ (TokenType.PARAM_OFFSET, r'(?:offset|skip)\s*(?:of|:|=)?\s*(\d+)', 'offset'),
356
+ # Token/auth
357
+ (TokenType.PARAM_TOKEN, r'(?:auth|bearer|access)?\s*token\s*(?:of|:|=)?\s*["\']?([\w.-]+)["\']?', 'token'),
358
+ # Query (least specific - check last) - capture only up to next delimiter
359
+ # Exclude when "search" is followed by database/api/file (sources)
360
+ (TokenType.PARAM_QUERY, r'(?:query|find)\s+(?:for|:|=)?\s*["\']?([^"\']{1,30}?)(?:["\']|,|\.|and)\s', 'query'),
361
+ ]
362
+ for token_type, pattern, name in param_patterns:
363
+ match = re.search(pattern, text_lower)
364
+ if match:
365
+ # Handle PARAM_LIMIT which has two groups
366
+ if token_type == TokenType.PARAM_LIMIT:
367
+ value = match.group(1) if match.group(1) else match.group(2)
368
+ # For limit2, use the position of the matched group
369
+ if name == 'limit2' and match.group(2):
370
+ param_pos = text_lower.find(match.group(2), match.start())
371
+ else:
372
+ param_pos = match.start()
373
+ elif name == 'id':
374
+ value = match.group(1)
375
+ param_pos = match.start()
376
+ elif name == 'times':
377
+ value = match.group(1)
378
+ # For PARAM:TIMES, find the position of the number (not the start of "retry")
379
+ param_pos = text_lower.find(match.group(1), match.start())
380
+ else:
381
+ value = match.group(1) if match.lastindex and match.group(1) else None
382
+ param_pos = match.start()
383
+ detected["params"].append(DetectedToken(
384
+ token=Token(type=token_type, value=value),
385
+ position=param_pos,
386
+ keyword=match.group(0)
387
+ ))
388
+
389
+ # RETURN type detection - IMPORTANT: detect ALL occurrences
390
+ ret_keywords = {
391
+ TokenType.RET_JSON: ["json", "object"],
392
+ TokenType.RET_TEXT: ["csv", "text", "plain"],
393
+ TokenType.RET_LIST: ["list", "array"],
394
+ TokenType.RET_DICT: ["dictionary", "dict", "map"],
395
+ TokenType.RET_BOOL: ["boolean", "bool", "true", "false"],
396
+ TokenType.RET_NUM: ["number", "numeric"],
397
+ }
398
+ for token_type, keywords in ret_keywords.items():
399
+ for keyword in keywords:
400
+ # Find ALL occurrences, not just first
401
+ start = 0
402
+ found = False
403
+ while True:
404
+ pos = text_lower.find(keyword, start)
405
+ if pos == -1:
406
+ break
407
+ detected["returns"].append(DetectedToken(
408
+ token=Token(type=token_type),
409
+ position=pos,
410
+ keyword=keyword
411
+ ))
412
+ start = pos + 1
413
+ found = True
414
+ if found:
415
+ break # Only use first matching keyword set per type
416
+
417
+ # TYPE constraint detection - only explicit "type X" patterns, not return keywords
418
+ # Filter out TYPE tokens that overlap with RETURN tokens (avoid duplicates)
419
+ ret_positions = {ret.position for ret in detected["returns"]}
420
+
421
+ # Special pattern: "list of strings" or "list of <type>" should add TYPE constraint
422
+ list_of_match = re.search(r'list\s+(?:of\s+)?(?:strings?|ints?|floats?|strs?|texts?|booleans?)', text_lower)
423
+ if list_of_match:
424
+ # Extract the type from the match
425
+ matched_text = list_of_match.group(0).lower()
426
+ if "string" in matched_text or "str" in matched_text or "text" in matched_text:
427
+ # Add TYPE:str at the position of the type word
428
+ type_pos = text_lower.find("string", list_of_match.start())
429
+ if type_pos == -1:
430
+ type_pos = text_lower.find("str", list_of_match.start())
431
+ if type_pos != -1:
432
+ detected["types"].append(DetectedToken(
433
+ token=Token(type=TokenType.TYPE_STR),
434
+ position=type_pos,
435
+ keyword="string"
436
+ ))
437
+
438
+ type_keywords = {
439
+ TokenType.TYPE_STR: ["type str", "type string", "string type", "typed list of strings", r'\btype\s*[:=]\s*str(?:ing)?\b'],
440
+ TokenType.TYPE_INT: ["type int", "integer type", "as integer", "to integer", r'\btype\s*[:=]\s*int(?:eger)?\b'],
441
+ TokenType.TYPE_FLOAT: ["type float", "float type", "decimal type", r'\btype\s*[:=]\s*float\b'],
442
+ TokenType.TYPE_BOOL: ["type bool", "boolean type", "as boolean", r'\btype\s*[:=]\s*bool(?:ean)?\b'],
443
+ TokenType.TYPE_LIST: ["type list", "list type", "array type", "as list", "to list", r'\btype\s*[:=]\s*list\b'],
444
+ TokenType.TYPE_DICT: ["type dict", "dict type", "map type", "as dict", "to dict", r'\btype\s*[:=]\s*dict\b'],
445
+ TokenType.TYPE_ANY: ["type any", "any type", r'\btype\s*[:=]\s*any\b'],
446
+ }
447
+ for token_type, keywords in type_keywords.items():
448
+ for keyword in keywords:
449
+ if keyword.startswith(r'\b'): # Regex pattern
450
+ match = re.search(keyword, text_lower)
451
+ if match and match.start() not in ret_positions:
452
+ detected["types"].append(DetectedToken(
453
+ token=Token(type=token_type),
454
+ position=match.start(),
455
+ keyword=keyword
456
+ ))
457
+ break
458
+ else:
459
+ pos = text_lower.find(keyword)
460
+ if pos != -1 and pos not in ret_positions:
461
+ detected["types"].append(DetectedToken(
462
+ token=Token(type=token_type),
463
+ position=pos,
464
+ keyword=keyword
465
+ ))
466
+ break
467
+
468
+ return detected
469
+
470
+ def _build_semantic_groups(self, text: str, detected: dict) -> list[dict]:
471
+ """Build operation groups based on text position."""
472
+ import re
473
+ text_lower = text.lower()
474
+
475
+ # Sort operations by position
476
+ operations = sorted(detected["operations"], key=lambda x: x.position)
477
+ sources = sorted(detected["sources"], key=lambda x: x.position)
478
+ # Deduplicate returns by token type (keep first occurrence of each type)
479
+ unique_returns = {}
480
+ for ret in detected["returns"]:
481
+ if ret.token.type not in unique_returns:
482
+ unique_returns[ret.token.type] = ret
483
+ returns = sorted(unique_returns.values(), key=lambda x: x.position)
484
+ params = sorted(detected["params"], key=lambda x: x.position)
485
+
486
+ groups = []
487
+ used_sources = set()
488
+ used_returns = set()
489
+ used_params = set()
490
+
491
+ # Detect "return as <type>" or "return <type>" pattern for final return
492
+ final_return = None
493
+ final_return_pos = len(text_lower)
494
+ return_match = re.search(r'return\s+(?:as\s+)?(?:a\s+)?(\w+)', text_lower)
495
+ if return_match:
496
+ return_type = return_match.group(1)
497
+ # Map return type to token
498
+ return_map = {
499
+ "json": TokenType.RET_JSON, "object": TokenType.RET_JSON,
500
+ "csv": TokenType.RET_TEXT, "text": TokenType.RET_TEXT, "plain": TokenType.RET_TEXT,
501
+ "list": TokenType.RET_LIST, "array": TokenType.RET_LIST,
502
+ "dict": TokenType.RET_DICT, "dictionary": TokenType.RET_DICT, "map": TokenType.RET_DICT,
503
+ "bool": TokenType.RET_BOOL, "boolean": TokenType.RET_BOOL,
504
+ "number": TokenType.RET_NUM, "numeric": TokenType.RET_NUM,
505
+ }
506
+ if return_type in return_map:
507
+ final_return = Token(type=return_map[return_type])
508
+ final_return_pos = return_match.start()
509
+
510
+ for i, op in enumerate(operations):
511
+ group = {
512
+ "operation": op.token,
513
+ "source": None,
514
+ "returns": [],
515
+ "params": []
516
+ }
517
+
518
+ # Find nearest source BEFORE this operation (or first op gets first source)
519
+ if i == 0 and sources:
520
+ # First operation gets the source
521
+ group["source"] = sources[0].token
522
+ used_sources.add(0)
523
+ else:
524
+ # Check for source between previous op and this one
525
+ prev_pos = operations[i-1].position if i > 0 else 0
526
+ for j, src in enumerate(sources):
527
+ if j not in used_sources and prev_pos < src.position <= op.position:
528
+ group["source"] = src.token
529
+ used_sources.add(j)
530
+ break
531
+
532
+ # Find returns that are "near" this operation
533
+ next_op_pos = operations[i+1].position if i+1 < len(operations) else len(text_lower)
534
+
535
+ for j, ret in enumerate(returns):
536
+ if j not in used_returns:
537
+ # Skip if this is the final return (belongs to last operation)
538
+ if abs(ret.position - final_return_pos) < 5:
539
+ continue
540
+ # Check for explicit "to <type>" pattern with this operation
541
+ if f"{op.keyword} to" in text_lower or f"{op.keyword}s to" in text_lower:
542
+ to_pos = text_lower.find(" to ", op.position)
543
+ if to_pos != -1 and to_pos < next_op_pos:
544
+ if to_pos < ret.position < to_pos + 20:
545
+ group["returns"].append(ret.token)
546
+ used_returns.add(j)
547
+ # Otherwise, only assign return to first operation if it's very close
548
+ elif i == 0 and ret.position < op.position + 30:
549
+ # First operation gets returns that appear early
550
+ group["returns"].append(ret.token)
551
+ used_returns.add(j)
552
+
553
+ # Find params that are "near" this operation
554
+ # Exception: PARAM:times that comes after ERR:retry should NOT be added to operation group
555
+ err_retry_positions = [e.position for e in detected["errors"] if e.token.type == TokenType.ERR_RETRY]
556
+
557
+ for j, param in enumerate(params):
558
+ if j not in used_params:
559
+ # Skip PARAM:times if it comes after ERR:retry (belongs with error handling)
560
+ if param.token.type == TokenType.PARAM_TIMES:
561
+ # Check if there's an ERR:retry before this param
562
+ if any(err_pos < param.position for err_pos in err_retry_positions):
563
+ # Mark as used so it doesn't get added by "Handle remaining params"
564
+ used_params.add(j)
565
+ continue # Don't add to operation group
566
+ if op.position <= param.position < next_op_pos:
567
+ group["params"].append(param.token)
568
+ used_params.add(j)
569
+
570
+ groups.append(group)
571
+
572
+ # Assign final return to last operation (but only if not already added)
573
+ if groups and final_return:
574
+ # Check if this return type is already in the last group
575
+ last_return_types = {r.type for r in groups[-1]["returns"]}
576
+ if final_return.type not in last_return_types:
577
+ groups[-1]["returns"].append(final_return)
578
+ # Mark returns near final_return_pos as used
579
+ for j, ret in enumerate(returns):
580
+ if abs(ret.position - final_return_pos) < 5:
581
+ used_returns.add(j)
582
+
583
+ # Handle remaining returns - assign to last operation
584
+ if groups:
585
+ for j, ret in enumerate(returns):
586
+ if j not in used_returns:
587
+ # Check if this return type is already in the last group
588
+ last_return_types = {r.type for r in groups[-1]["returns"]}
589
+ if ret.token.type not in last_return_types:
590
+ groups[-1]["returns"].append(ret.token)
591
+
592
+ # Handle remaining params - assign to last operation
593
+ if groups:
594
+ for j, param in enumerate(params):
595
+ if j not in used_params:
596
+ groups[-1]["params"].append(param.token)
597
+
598
+ return groups
599
+
600
+ def _flatten_groups(self, detected: dict, groups: list[dict]) -> TokenSequence:
601
+ """Flatten semantic groups into final token sequence.
602
+
603
+ Token ordering follows semantic flow:
604
+ 1. CTL:try (if present) - comes first
605
+ 2. MOD:async, MOD:batch, MOD:parallel (operation modifiers, in the order they appear in the text)
606
+ 3. Operation groups (OP + SRC + PARAM + RET)
607
+ 4. MOD:cached (can come after operation)
608
+ 5. CTL:catch, CTL:finally (error handling blocks)
609
+ 6. ERR:retry, ERR:log (error handling actions)
610
+ 7. TYPE constraints
611
+ """
612
+ tokens = TokenSequence()
613
+
614
+ # Separate control flow tokens
615
+ try_tokens = [dt for dt in detected["control"] if dt.token.type == TokenType.CTL_TRY]
616
+ catch_tokens = [dt for dt in detected["control"] if dt.token.type == TokenType.CTL_CATCH]
617
+ finally_tokens = [dt for dt in detected["control"] if dt.token.type == TokenType.CTL_FINALLY]
618
+ other_ctl_tokens = [dt for dt in detected["control"] if dt.token.type not in [TokenType.CTL_TRY, TokenType.CTL_CATCH, TokenType.CTL_FINALLY]]
619
+
620
+ # Separate modifiers - cached can come after operation, sort others by text position
621
+ cached_mods = [dt for dt in detected["modifiers"] if dt.token.type == TokenType.MOD_CACHED]
622
+ other_mods = [dt for dt in detected["modifiers"] if dt.token.type != TokenType.MOD_CACHED]
623
+ # Sort modifiers by text position (maintains order as they appear in text)
624
+ other_mods.sort(key=lambda dt: dt.position)
625
+
626
+ # Separate error tokens - fail should only be explicit
627
+ # Also extract any PARAM:times that should come with ERR:retry
628
+ err_tokens = []
629
+ times_params = []
630
+ for dt in detected["errors"]:
631
+ if dt.token.type == TokenType.ERR_RETRY:
632
+ err_tokens.append(dt)
633
+ # Find associated PARAM:times
634
+ for pt in detected["params"]:
635
+ if pt.token.type == TokenType.PARAM_TIMES and pt.position > dt.position:
636
+ times_params.append(pt)
637
+ break
638
+ else:
639
+ err_tokens.append(dt)
640
+ # Remove times params from regular params list (they'll be added with error tokens)
641
+ detected_params_for_groups = [pt for pt in detected["params"] if pt not in times_params]
642
+
643
+ # Collect all return types used in groups to filter out duplicate TYPE tokens
644
+ used_return_types = set()
645
+ for group in groups:
646
+ for ret in group["returns"]:
647
+ ret_to_type_map = {
648
+ TokenType.RET_JSON: TokenType.TYPE_STR,
649
+ TokenType.RET_TEXT: TokenType.TYPE_STR,
650
+ TokenType.RET_LIST: TokenType.TYPE_LIST,
651
+ TokenType.RET_DICT: TokenType.TYPE_DICT,
652
+ TokenType.RET_BOOL: TokenType.TYPE_BOOL,
653
+ TokenType.RET_NUM: TokenType.TYPE_INT,
654
+ }
655
+ if ret.type in ret_to_type_map:
656
+ used_return_types.add(ret_to_type_map[ret.type])
657
+
658
+ # 1. CTL:try first (if present)
659
+ for dt in try_tokens:
660
+ tokens.add(dt.token)
661
+
662
+ # 2. Other modifiers (batch, parallel, async) - sorted by priority
663
+ for dt in other_mods:
664
+ tokens.add(dt.token)
665
+
666
+ # 3. Other control flow (if, else, loop)
667
+ for dt in other_ctl_tokens:
668
+ tokens.add(dt.token)
669
+
670
+ # 4. Error handling (retry, log, etc.) - but save some for after catch
671
+ # Split: retry/log can go before or after catch depending on context
672
+ main_err_tokens = []
673
+ post_catch_err_tokens = []
674
+ for dt in err_tokens:
675
+ # ERR:retry and ERR:log typically come after CTL:catch
676
+ if dt.token.type in [TokenType.ERR_RETRY, TokenType.ERR_LOG]:
677
+ post_catch_err_tokens.append(dt)
678
+ else:
679
+ main_err_tokens.append(dt)
680
+
681
+ # 5. Operation groups in order
682
+ for group in groups:
683
+ tokens.add(group["operation"])
684
+ if group["source"]:
685
+ tokens.add(group["source"])
686
+ for param in group["params"]:
687
+ tokens.add(param)
688
+ for ret in group["returns"]:
689
+ tokens.add(ret)
690
+
691
+ # 6. MOD:cached (can come after operation)
692
+ for dt in cached_mods:
693
+ tokens.add(dt.token)
694
+
695
+ # 7. CTL:catch - add implicitly if we have try + error handling but no explicit catch
696
+ has_try = len(try_tokens) > 0
697
+ has_explicit_catch = len(catch_tokens) > 0
698
+ has_error_handling = len(post_catch_err_tokens) > 0
699
+
700
+ if has_try and has_error_handling and not has_explicit_catch:
701
+ # Add implicit CTL:catch
702
+ tokens.add(Token(type=TokenType.CTL_CATCH))
703
+
704
+ for dt in catch_tokens:
705
+ tokens.add(dt.token)
706
+ for dt in finally_tokens:
707
+ tokens.add(dt.token)
708
+
709
+ # 8. Post-catch error tokens (retry, log) and their associated params
710
+ for dt in post_catch_err_tokens:
711
+ tokens.add(dt.token)
712
+ # Add PARAM:times if it's associated with this ERR:retry
713
+ if dt.token.type == TokenType.ERR_RETRY:
714
+ for pt in detected["params"]:
715
+ if pt.token.type == TokenType.PARAM_TIMES and pt.position > dt.position:
716
+ tokens.add(pt.token)
717
+ break
718
+
719
+ # 9. Other error tokens
720
+ for dt in main_err_tokens:
721
+ tokens.add(dt.token)
722
+
723
+ # 10. Type constraints at end - filter out if return type already used, also deduplicate
724
+ seen_types = set()
725
+ for dt in detected["types"]:
726
+ if dt.token.type not in used_return_types and dt.token.type not in seen_types:
727
+ tokens.add(dt.token)
728
+ seen_types.add(dt.token.type)
729
+
730
+ return tokens
731
+
732
+ def _parse_molt_tokens(self, molt_text: str) -> TokenSequence:
733
+ """
734
+ Parse MoltLang text into tokens.
735
+
736
+ LLM-friendly: Case-insensitive parsing for flexibility.
737
+ Accepts both [RET:JSON] and [RET:json] - normalizes to enum values.
738
+
739
+ Args:
740
+ molt_text: MoltLang string representation
741
+
742
+ Returns:
743
+ TokenSequence containing parsed tokens
744
+ """
745
+ tokens = TokenSequence()
746
+ import re
747
+
748
+ # Find all token patterns like [TYPE:VALUE] - case-insensitive
749
+ pattern = r"\[([a-zA-Z]+):([a-zA-Z_0-9]+)(?:=([^\]]+))?\]"
750
+ matches = re.findall(pattern, molt_text)
751
+
752
+ for category, value, param in matches:
753
+ # Normalize to uppercase for enum lookup
754
+ token_type_str = f"{category.upper()}_{value.upper()}"
755
+ try:
756
+ token_type = TokenType[token_type_str]
757
+ token = Token(type=token_type, value=param if param else None)
758
+ tokens.add(token)
759
+ except KeyError:
760
+ # Unknown token type, skip or handle as custom
761
+ pass
762
+
763
+ return tokens
764
+
765
+ def _generate_human_translation(
766
+ self, tokens: TokenSequence, target_language: str = "en"
767
+ ) -> str:
768
+ """
769
+ Generate human language translation from MoltLang tokens.
770
+
771
+ Args:
772
+ tokens: TokenSequence to translate
773
+ target_language: Target human language (default: English)
774
+
775
+ Returns:
776
+ Human language translation
777
+ """
778
+ parts: list[str] = []
779
+
780
+ for token in tokens.tokens:
781
+ # Operation translations
782
+ if token.type == TokenType.OP_FETCH:
783
+ parts.append("Fetch")
784
+ elif token.type == TokenType.OP_PARSE:
785
+ parts.append("Parse")
786
+ elif token.type == TokenType.OP_TRANSFORM:
787
+ parts.append("Transform")
788
+ elif token.type == TokenType.OP_SEARCH:
789
+ parts.append("Search")
790
+ elif token.type == TokenType.OP_VALIDATE:
791
+ parts.append("Validate")
792
+ elif token.type == TokenType.OP_FILTER:
793
+ parts.append("Filter")
794
+ elif token.type == TokenType.OP_COMPUTE:
795
+ parts.append("Compute")
796
+
797
+ # Source translations
798
+ elif token.type == TokenType.SRC_API:
799
+ parts.append("data from API")
800
+ elif token.type == TokenType.SRC_DB:
801
+ parts.append("data from database")
802
+ elif token.type == TokenType.SRC_FILE:
803
+ parts.append("data from file")
804
+ elif token.type == TokenType.SRC_MEM:
805
+ parts.append("data from memory")
806
+
807
+ # Return type translations
808
+ elif token.type == TokenType.RET_JSON:
809
+ parts.append("return JSON")
810
+ elif token.type == TokenType.RET_TEXT:
811
+ parts.append("return text")
812
+ elif token.type == TokenType.RET_BOOL:
813
+ parts.append("return boolean")
814
+ elif token.type == TokenType.RET_NUM:
815
+ parts.append("return number")
816
+ elif token.type == TokenType.RET_LIST:
817
+ parts.append("return list")
818
+ elif token.type == TokenType.RET_DICT:
819
+ parts.append("return dictionary")
820
+
821
+ return " ".join(parts) if parts else "Empty operation"
822
+
823
+ def _calculate_confidence(self, original: str, tokens: TokenSequence) -> float:
824
+ """
825
+ Calculate translation confidence score.
826
+
827
+ Enhanced to consider semantic completeness, not just token count.
828
+
829
+ Args:
830
+ original: Original text
831
+ tokens: Translated token sequence
832
+
833
+ Returns:
834
+ Confidence score (0.0-1.0)
835
+ """
836
+ if len(tokens) == 0:
837
+ return 0.0
838
+
839
+ # Base score from token count (capped at 0.7 for 3+ tokens)
840
+ base_score = min(0.7, 0.5 + (len(tokens) * 0.1))
841
+
842
+ # Semantic completeness bonus (only if complete)
843
+ has_operation = any("OP:" in t.type.value for t in tokens.tokens)
844
+ has_source = any("SRC:" in t.type.value for t in tokens.tokens)
845
+ has_return = any("RET:" in t.type.value for t in tokens.tokens)
846
+
847
+ completeness = 0.0
848
+ # Only give bonus for complete operations (operation + source/return)
849
+ if has_operation and (has_source or has_return):
850
+ completeness += 0.25
851
+ # Extra bonus for having both source AND return
852
+ if has_operation and has_source and has_return:
853
+ completeness += 0.15
854
+ # Small bonus for having control flow or error handling
855
+ has_ctl = any("CTL:" in t.type.value for t in tokens.tokens)
856
+ has_err = any("ERR:" in t.type.value for t in tokens.tokens)
857
+ if has_ctl or has_err:
858
+ completeness += 0.05
859
+
860
+ return round(min(1.0, base_score + completeness), 2)
861
+
862
+ def _apply_fallback_rules(self, text: str, tokens: TokenSequence) -> TokenSequence:
863
+ """
864
+ Apply fallback rules when direct matching fails.
865
+
866
+ This uses heuristics to infer likely tokens from context.
867
+
868
+ Args:
869
+ text: Original human language text
870
+ tokens: Current token sequence
871
+
872
+ Returns:
873
+ Potentially modified token sequence
874
+ """
875
+ import re
876
+ text_lower = text.lower()
877
+
878
+ # Fallback 1: "safe" or "careful" implies error handling
879
+ if any(word in text_lower for word in ["safe", "careful", "graceful", "handle"]):
880
+ if not any(t.type.value.startswith("CTL:") for t in tokens.tokens):
881
+ tokens.add(Token(type=TokenType.CTL_TRY))
882
+ tokens.add(Token(type=TokenType.CTL_CATCH))
883
+
884
+ # Fallback 2: "ensure" or "guarantee" implies validation
885
+ # Exception: "ensure [it] returns" describes return type, not validation
886
+ ensure_returns_pattern = re.search(r'ensure\s+(?:it\s+)?returns?', text_lower)
887
+ if any(word in text_lower for word in ["ensure", "guarantee", "verify"]):
888
+ # Skip if "ensure returns" pattern (describes return type, not validation)
889
+ if not ensure_returns_pattern:
890
+ if not any(t.type == TokenType.OP_VALIDATE for t in tokens.tokens):
891
+ tokens.add(Token(type=TokenType.OP_VALIDATE))
892
+
893
+ # Fallback 3: Questions default to search
894
+ if text.strip().endswith("?"):
895
+ if not any(t.type.value.startswith("OP:") for t in tokens.tokens):
896
+ tokens.add(Token(type=TokenType.OP_SEARCH))
897
+ # Fallback 3b: Questions with source but no operation
898
+ if text.strip().endswith("?"):
899
+ has_source = any(t.type.value.startswith("SRC:") for t in tokens.tokens)
900
+ has_operation = any(t.type.value.startswith("OP:") for t in tokens.tokens)
901
+ if has_source and not has_operation:
902
+ tokens.add(Token(type=TokenType.OP_SEARCH))
903
+
904
+ return tokens
905
+
906
+
907
+ # Convenience functions for direct usage
908
+
909
+ _translator_instance: MoltTranslator | None = None
910
+
911
+
912
+ def _get_translator() -> MoltTranslator:
913
+ """Get or create the shared translator instance."""
914
+ global _translator_instance
915
+ if _translator_instance is None:
916
+ _translator_instance = MoltTranslator()
917
+ return _translator_instance
918
+
919
+
920
+ def translate_to_molt(text: str, config: MoltConfig | None = None) -> str:
921
+ """
922
+ Translate human language text to MoltLang.
923
+
924
+ This is a convenience function that uses a shared translator instance.
925
+
926
+ Args:
927
+ text: Human language text to translate
928
+ config: Optional configuration override
929
+
930
+ Returns:
931
+ MoltLang string representation
932
+
933
+ Examples:
934
+ >>> from moltlang import translate_to_molt
935
+ >>> molt = translate_to_molt("Fetch data from API and return JSON")
936
+ >>> print(molt)
937
+ [OP:FETCH][SRC:API][RET:JSON]
938
+ """
939
+ translator = _get_translator()
940
+ result = translator.translate_to_molt(text, config)
941
+ return result.text
942
+
943
+
944
+ def translate_from_molt(molt_text: str, config: MoltConfig | None = None) -> str:
945
+ """
946
+ Translate MoltLang to human language text.
947
+
948
+ This is a convenience function that uses a shared translator instance.
949
+
950
+ Args:
951
+ molt_text: MoltLang text to translate
952
+ config: Optional configuration override
953
+
954
+ Returns:
955
+ Human language translation
956
+
957
+ Examples:
958
+ >>> from moltlang import translate_from_molt
959
+ >>> english = translate_from_molt("[OP:FETCH][SRC:API][RET:JSON]")
960
+ >>> print(english)
961
+ Fetch data from API return JSON
962
+ """
963
+ translator = _get_translator()
964
+ result = translator.translate_from_molt(molt_text, config)
965
+ return result.text