moltlang-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
moltlang/tokens.py ADDED
@@ -0,0 +1,311 @@
+ """
+ MoltLang token system.
+ 
+ This module defines the token types, token registry, and token operations
+ for the MoltLang language system.
+ 
+ LLM-friendly design: all token values use lowercase for natural generation.
+ The parser is case-insensitive and accepts both [RET:json] and [RET:JSON].
+ """
+ 
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from typing import Any
+ 
+ 
+ class TokenType(Enum):
+     """
+     Types of tokens in the MoltLang language.
+ 
+     Low-hanging-fruit categories for the MVP:
+     - Operations: common AI actions (fetch, parse, transform, etc.)
+     - Sources: data sources (API, database, file, etc.)
+     - Parameters: common parameter types
+     - Returns: return value types
+     - Control: control flow structures
+ 
+     Note: all values are lowercase for LLM-friendliness.
+     The parser accepts both cases (case-insensitive).
+     """
+ 
+     # Operation tokens - core AI actions
+     OP_FETCH = "OP:fetch"  # Retrieve data
+     OP_PARSE = "OP:parse"  # Parse structured data
+     OP_TRANSFORM = "OP:transform"  # Transform data
+     OP_VALIDATE = "OP:validate"  # Validate input
+     OP_COMPUTE = "OP:compute"  # Perform computation
+     OP_SEARCH = "OP:search"  # Search for data
+     OP_FILTER = "OP:filter"  # Filter data
+     OP_MAP = "OP:map"  # Map over data
+     OP_REDUCE = "OP:reduce"  # Reduce data
+     OP_AGGREGATE = "OP:aggregate"  # Aggregate results
+     OP_PROCESS = "OP:process"  # Process data
+ 
+     # Source tokens - data sources
+     SRC_API = "SRC:api"  # REST/GraphQL API
+     SRC_DB = "SRC:db"  # Database
+     SRC_FILE = "SRC:file"  # File system
+     SRC_MEM = "SRC:mem"  # In-memory data
+     SRC_STREAM = "SRC:stream"  # Data stream
+     SRC_QUEUE = "SRC:queue"  # Message queue
+     SRC_CACHE = "SRC:cache"  # Cache layer
+ 
+     # Parameter tokens - common parameter types
+     PARAM_TOKEN = "PARAM:token"  # Authentication token
+     PARAM_KEY = "PARAM:key"  # API key or identifier
+     PARAM_QUERY = "PARAM:query"  # Query string
+     PARAM_BODY = "PARAM:body"  # Request body
+     PARAM_HEADER = "PARAM:header"  # HTTP header
+     PARAM_TIMEOUT = "PARAM:timeout"  # Timeout value
+     PARAM_LIMIT = "PARAM:limit"  # Result limit
+     PARAM_OFFSET = "PARAM:offset"  # Pagination offset
+     PARAM_TIMES = "PARAM:times"  # Retry count / repetition count
+ 
+     # Return type tokens
+     RET_JSON = "RET:json"  # JSON format
+     RET_TEXT = "RET:text"  # Plain text
+     RET_BIN = "RET:bin"  # Binary data
+     RET_STREAM = "RET:stream"  # Streaming response
+     RET_BOOL = "RET:bool"  # Boolean result
+     RET_NUM = "RET:num"  # Numeric result
+     RET_LIST = "RET:list"  # List result
+     RET_DICT = "RET:dict"  # Dictionary result
+     RET_NULL = "RET:null"  # Null/void result
+ 
+     # Control flow tokens
+     CTL_IF = "CTL:if"  # Conditional
+     CTL_ELSE = "CTL:else"  # Alternative
+     CTL_LOOP = "CTL:loop"  # Loop/iterate
+     CTL_BREAK = "CTL:break"  # Exit loop
+     CTL_CONTINUE = "CTL:continue"  # Next iteration
+     CTL_TRY = "CTL:try"  # Error handling start
+     CTL_CATCH = "CTL:catch"  # Error handler
+     CTL_FINALLY = "CTL:finally"  # Cleanup block
+ 
+     # Data type tokens
+     TYPE_STR = "TYPE:str"  # String type
+     TYPE_INT = "TYPE:int"  # Integer type
+     TYPE_FLOAT = "TYPE:float"  # Float type
+     TYPE_BOOL = "TYPE:bool"  # Boolean type
+     TYPE_LIST = "TYPE:list"  # List type
+     TYPE_DICT = "TYPE:dict"  # Dictionary type
+     TYPE_ANY = "TYPE:any"  # Any type
+ 
+     # Error handling tokens
+     ERR_RETRY = "ERR:retry"  # Retry the operation
+     ERR_FAIL = "ERR:fail"  # Fail the operation
+     ERR_LOG = "ERR:log"  # Log the error
+     ERR_IGNORE = "ERR:ignore"  # Ignore the error
+ 
+     # Modifiers
+     MOD_ASYNC = "MOD:async"  # Async operation
+     MOD_BATCH = "MOD:batch"  # Batch operation
+     MOD_PARALLEL = "MOD:parallel"  # Parallel execution
+     MOD_CACHED = "MOD:cached"  # Use cached value
+ 
+ 
+ @dataclass
+ class Token:
+     """
+     A single MoltLang token.
+ 
+     Attributes:
+         type: The token type
+         value: Optional value associated with the token
+         position: Position in the token sequence
+     """
+ 
+     type: TokenType
+     value: str | None = None
+     # Excluded from equality so a token compares equal to its registry
+     # entry regardless of where it sits in a sequence (required by
+     # TokenRegistry.validate_sequence below).
+     position: int = field(default=0, compare=False)
+ 
+     def __str__(self) -> str:
+         """Return the string representation of the token."""
+         if self.value:
+             return f"[{self.type.value}={self.value}]"
+         return f"[{self.type.value}]"
+ 
+     def __len__(self) -> int:
+         """Return the token length (always 1 for MoltLang tokens)."""
+         return 1
+ 
+ 
+ @dataclass
+ class TokenSequence:
+     """
+     A sequence of MoltLang tokens.
+ 
+     Attributes:
+         tokens: List of tokens in the sequence
+         metadata: Optional metadata about the sequence
+     """
+ 
+     tokens: list[Token] = field(default_factory=list)
+     metadata: dict[str, Any] = field(default_factory=dict)
+ 
+     def __str__(self) -> str:
+         """Return the string representation of the token sequence."""
+         return "".join(str(token) for token in self.tokens)
+ 
+     def __len__(self) -> int:
+         """Return the number of tokens in the sequence."""
+         return len(self.tokens)
+ 
+     def add(self, token: Token) -> "TokenSequence":
+         """Add a token to the sequence and return self for chaining."""
+         token.position = len(self.tokens)
+         self.tokens.append(token)
+         return self
+ 
+     def token_count(self) -> int:
+         """Return the total token count."""
+         return len(self.tokens)
+ 
+     def compare_token_efficiency(self, english_word_count: int) -> float:
+         """
+         Compare token efficiency against an English word count.
+ 
+         Args:
+             english_word_count: Number of words in the English equivalent
+ 
+         Returns:
+             Token reduction as a fraction (e.g. 0.75 means 75% fewer
+             tokens than words); negative if the sequence is longer.
+         """
+         if english_word_count == 0:
+             return 0.0
+         return 1.0 - (self.token_count() / english_word_count)
+ 
+ 
+ class TokenRegistry:
+     """
+     Registry for managing MoltLang tokens.
+ 
+     Provides methods for looking up tokens, managing custom tokens,
+     and validating token sequences.
+     """
+ 
+     _instance: "TokenRegistry | None" = None
+     # Populated by _initialize(); declared here for type checkers.
+     _tokens: dict[str, Token]
+     _custom_tokens: dict[str, Token]
+ 
+     def __new__(cls) -> "TokenRegistry":
+         """Singleton pattern for the token registry."""
+         if cls._instance is None:
+             cls._instance = super().__new__(cls)
+             cls._instance._initialize()
+         return cls._instance
+ 
+     def _initialize(self) -> None:
+         """Initialize the registry with one token per TokenType."""
+         self._tokens = {}
+         self._custom_tokens = {}
+         for token_type in TokenType:
+             self._tokens[token_type.value] = Token(type=token_type)
+ 
+     def get(self, token_str: str) -> Token | None:
+         """
+         Get a token by its string representation (case-insensitive).
+ 
+         Args:
+             token_str: String representation of the token
+                 (e.g., "[OP:FETCH]" or "[OP:fetch]")
+ 
+         Returns:
+             Token if found, None otherwise
+         """
+         # Strip brackets and any "=value" suffix
+         clean = token_str.strip("[]")
+         if "=" in clean:
+             clean = clean.split("=")[0]
+ 
+         # Try an exact match first
+         if clean in self._tokens:
+             return self._tokens[clean]
+ 
+         # Fall back to a case-insensitive match for LLM-friendliness
+         clean_lower = clean.lower()
+         for key, token in self._tokens.items():
+             if key.lower() == clean_lower:
+                 return token
+ 
+         # Check custom tokens, also case-insensitively for consistency
+         for key, token in self._custom_tokens.items():
+             if key.lower() == clean_lower:
+                 return token
+ 
+         return None
+ 
+     def register_custom(self, name: str, token_type: TokenType) -> Token:
+         """
+         Register a custom token.
+ 
+         Args:
+             name: Name of the custom token
+             token_type: Type of the token
+ 
+         Returns:
+             The registered token
+         """
+         token = Token(type=token_type, value=name)
+         self._custom_tokens[f"{token_type.value}:{name}"] = token
+         return token
+ 
+     def list_tokens(self, token_type: TokenType | None = None) -> list[Token]:
+         """
+         List all tokens, optionally filtered by type.
+ 
+         Args:
+             token_type: Optional token type filter
+ 
+         Returns:
+             List of tokens
+         """
+         all_tokens = {**self._tokens, **self._custom_tokens}
+         if token_type:
+             return [t for t in all_tokens.values() if t.type == token_type]
+         return list(all_tokens.values())
+ 
+     def validate_sequence(self, sequence: TokenSequence) -> bool:
+         """
+         Validate that every token in a sequence is registered.
+ 
+         Relies on Token equality ignoring position (see the Token
+         dataclass above).
+ 
+         Args:
+             sequence: Token sequence to validate
+ 
+         Returns:
+             True if valid, False otherwise
+         """
+         registered = list(self._tokens.values()) + list(self._custom_tokens.values())
+         return all(token in registered for token in sequence.tokens)
+ 
+ 
+ # Convenience functions for common operations
+ 
+ 
+ def op(operation: str) -> Token:
+     """Create an operation token (raises KeyError for unknown names)."""
+     return Token(type=TokenType[f"OP_{operation.upper()}"])
+ 
+ 
+ def src(source: str) -> Token:
+     """Create a source token."""
+     return Token(type=TokenType[f"SRC_{source.upper()}"])
+ 
+ 
+ def param(param_type: str) -> Token:
+     """Create a parameter token."""
+     return Token(type=TokenType[f"PARAM_{param_type.upper()}"])
+ 
+ 
+ def ret(return_type: str) -> Token:
+     """Create a return type token."""
+     return Token(type=TokenType[f"RET_{return_type.upper()}"])
+ 
+ 
+ def sequence(*tokens: Token) -> TokenSequence:
+     """Create a token sequence from tokens."""
+     seq = TokenSequence()
+     for token in tokens:
+         seq.add(token)
+     return seq
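
A minimal usage sketch of the token system above (not part of the package diff; assumes the wheel is installed so moltlang.tokens is importable):

    from moltlang.tokens import TokenRegistry, op, ret, sequence, src

    # Build a sequence: fetch from an API and return JSON.
    seq = sequence(op("fetch"), src("api"), ret("json"))
    print(seq)       # [OP:fetch][SRC:api][RET:json]
    print(len(seq))  # 3

    # Case-insensitive lookup through the singleton registry.
    registry = TokenRegistry()
    assert registry.get("[OP:FETCH]") is registry.get("op:fetch")

    # 3 tokens vs. a 12-word English description: 1 - 3/12 = 0.75 reduction.
    print(seq.compare_token_efficiency(12))  # 0.75
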
moltlang/training/__init__.py ADDED
@@ -0,0 +1,12 @@
+ """
+ MoltLang Training Module.
+ 
+ This module provides functionality for training models on MoltLang.
+ """
+ 
+ __version__ = "0.1.0"
+ 
+ from moltlang.training.data_gen import SyntheticDataGenerator
+ from moltlang.training.distill import KnowledgeDistillation
+ 
+ __all__ = ["SyntheticDataGenerator", "KnowledgeDistillation"]
moltlang/training/data_gen.py ADDED
@@ -0,0 +1,118 @@
+ """
+ Synthetic data generation for MoltLang training.
+ 
+ This module generates synthetic training data for fine-tuning models
+ on MoltLang.
+ """
+ 
+ import json
+ import random
+ from typing import Any
+ 
+ from moltlang import translate_to_molt
+ 
+ 
+ class SyntheticDataGenerator:
+     """
+     Generate synthetic training data for MoltLang.
+ 
+     Creates pairs of human-language text and MoltLang translations
+     for training and fine-tuning language models.
+     """
+ 
+     def __init__(self):
+         """Initialize the data generator."""
+         self.templates = self._load_templates()
+ 
+     def _load_templates(self) -> list[dict[str, Any]]:
+         """
+         Load templates for data generation.
+ 
+         Each template holds a text pattern, the token shapes the pattern
+         should translate to, and candidate fill-in values for each
+         placeholder.
+         """
+         return [
+             {
+                 "pattern": "Fetch {data} from {source}",
+                 "tokens": ["OP:FETCH", "SRC:{source}"],
+                 "values": {
+                     "data": ["data", "user", "records", "information"],
+                     "source": ["API", "database", "file", "cache"],
+                 },
+             },
+             {
+                 "pattern": "Parse {data_type} from {source}",
+                 "tokens": ["OP:PARSE", "SRC:{source}"],
+                 "values": {
+                     "data_type": ["JSON", "XML", "CSV", "text"],
+                     "source": ["file", "API", "stream", "buffer"],
+                 },
+             },
+             {
+                 "pattern": "Search for {query} in {source}",
+                 "tokens": ["OP:SEARCH", "SRC:{source}"],
+                 "values": {
+                     "query": ["user", "record", "data", "item"],
+                     "source": ["database", "file", "memory", "cache"],
+                 },
+             },
+             {
+                 "pattern": "Transform {data} and return {format}",
+                 "tokens": ["OP:TRANSFORM", "RET:{format}"],
+                 "values": {
+                     "data": ["data", "input", "content"],
+                     "format": ["JSON", "text", "list", "dict"],
+                 },
+             },
+             {
+                 "pattern": "Validate {input} from {source}",
+                 "tokens": ["OP:VALIDATE", "SRC:{source}"],
+                 "values": {
+                     "input": ["input", "data", "parameters", "request"],
+                     "source": ["API", "form", "file", "stream"],
+                 },
+             },
+         ]
+ 
+     def generate(self, count: int = 100) -> list[dict[str, str]]:
+         """
+         Generate synthetic training examples.
+ 
+         Args:
+             count: Number of examples to generate
+ 
+         Returns:
+             List of training examples with 'input' and 'output' keys
+         """
+         examples = []
+         for _ in range(count):
+             template = random.choice(self.templates)
+             examples.append(self._generate_from_template(template))
+         return examples
+ 
+     def _generate_from_template(self, template: dict[str, Any]) -> dict[str, str]:
+         """Generate a single example from a template."""
+         pattern = template["pattern"]
+         values = template["values"]
+ 
+         # Fill each {placeholder} with a randomly chosen value
+         text = pattern
+         for key, options in values.items():
+             text = text.replace(f"{{{key}}}", random.choice(options))
+ 
+         # Generate the MoltLang translation
+         molt = translate_to_molt(text)
+ 
+         return {"input": text, "output": molt.text}
+ 
+     def save_to_file(self, examples: list[dict[str, str]], filepath: str) -> None:
+         """
+         Save examples to a JSONL file.
+ 
+         Args:
+             examples: List of training examples
+             filepath: Path to save the file
+         """
+         with open(filepath, "w") as f:
+             for example in examples:
+                 f.write(json.dumps(example) + "\n")
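
A short usage sketch for the generator (illustrative only; the exact MoltLang output depends on translate_to_molt, which is not shown in this diff):

    from moltlang.training import SyntheticDataGenerator

    gen = SyntheticDataGenerator()
    examples = gen.generate(count=3)
    for ex in examples:
        print(ex["input"], "->", ex["output"])
    # e.g. Fetch user from API -> [OP:fetch][SRC:api]   (hypothetical output)

    # Persist as JSONL: one {"input": ..., "output": ...} object per line.
    gen.save_to_file(examples, "train.jsonl")
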
moltlang/training/distill.py ADDED
@@ -0,0 +1,86 @@
+ """
+ Knowledge distillation for MoltLang models.
+ 
+ This module provides functionality for distilling knowledge from larger
+ teacher models into smaller student models trained on MoltLang.
+ """
+ 
+ from dataclasses import dataclass
+ from typing import Any
+ 
+ 
+ @dataclass
+ class DistillationConfig:
+     """Configuration for knowledge distillation."""
+ 
+     teacher_model: str = "gpt-4"
+     student_model: str = "gpt-3.5-turbo"
+     temperature: float = 0.7
+     alpha: float = 0.5  # Balance between distillation loss and task loss
+     epochs: int = 3
+     batch_size: int = 8
+ 
+ 
+ class KnowledgeDistillation:
+     """
+     Knowledge distillation for training efficient MoltLang models.
+ 
+     Trains smaller student models to mimic larger teacher models
+     while using MoltLang as an efficient representation.
+     """
+ 
+     def __init__(self, config: DistillationConfig | None = None):
+         """
+         Initialize the distillation trainer.
+ 
+         Args:
+             config: Optional distillation configuration
+         """
+         self.config = config or DistillationConfig()
+ 
+     def prepare_data(self, data: list[dict[str, Any]]) -> list[dict[str, Any]]:
+         """
+         Prepare data for distillation.
+ 
+         Args:
+             data: Raw training data
+ 
+         Returns:
+             Prepared data with teacher outputs
+         """
+         # Placeholder for data preparation.
+         # In production, this would:
+         # 1. Translate inputs to MoltLang
+         # 2. Get teacher model outputs
+         # 3. Format for distillation training
+         return data
+ 
+     def train(self, train_data: list[dict[str, Any]]) -> dict[str, Any]:
+         """
+         Train the student model using distillation.
+ 
+         Args:
+             train_data: Training data with teacher outputs
+ 
+         Returns:
+             Training metrics
+         """
+         # Placeholder for training logic.
+         # In production, this would:
+         # 1. Load the teacher and student models
+         # 2. Train the student to mimic the teacher
+         # 3. Use MoltLang as the efficient intermediate representation
+         return {"loss": 0.0, "accuracy": 0.0}
+ 
+     def evaluate(self, test_data: list[dict[str, Any]]) -> dict[str, Any]:
+         """
+         Evaluate student model performance.
+ 
+         Args:
+             test_data: Test data
+ 
+         Returns:
+             Evaluation metrics
+         """
+         # Placeholder for evaluation logic
+         return {"accuracy": 0.0, "f1": 0.0}
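
A minimal end-to-end sketch of the intended pipeline (the prepare_data/train/evaluate bodies above are placeholders, so the metrics printed here are the stub values):

    from moltlang.training import KnowledgeDistillation, SyntheticDataGenerator
    from moltlang.training.distill import DistillationConfig

    config = DistillationConfig(teacher_model="gpt-4", epochs=3)
    distiller = KnowledgeDistillation(config)

    data = SyntheticDataGenerator().generate(count=100)
    prepared = distiller.prepare_data(data)  # placeholder: returns data unchanged
    print(distiller.train(prepared))         # {"loss": 0.0, "accuracy": 0.0}
    print(distiller.evaluate(prepared))      # {"accuracy": 0.0, "f1": 0.0}
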