multilingualprogramming 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- multilingualprogramming/__init__.py +74 -0
- multilingualprogramming/__main__.py +194 -0
- multilingualprogramming/codegen/__init__.py +12 -0
- multilingualprogramming/codegen/executor.py +215 -0
- multilingualprogramming/codegen/python_generator.py +592 -0
- multilingualprogramming/codegen/repl.py +489 -0
- multilingualprogramming/codegen/runtime_builtins.py +308 -0
- multilingualprogramming/core/__init__.py +12 -0
- multilingualprogramming/core/ir.py +29 -0
- multilingualprogramming/core/lowering.py +24 -0
- multilingualprogramming/datetime/__init__.py +11 -0
- multilingualprogramming/datetime/date_parser.py +190 -0
- multilingualprogramming/datetime/mp_date.py +210 -0
- multilingualprogramming/datetime/mp_datetime.py +153 -0
- multilingualprogramming/datetime/mp_time.py +147 -0
- multilingualprogramming/datetime/resource_loader.py +18 -0
- multilingualprogramming/exceptions.py +158 -0
- multilingualprogramming/imports.py +150 -0
- multilingualprogramming/keyword/__init__.py +13 -0
- multilingualprogramming/keyword/keyword_registry.py +249 -0
- multilingualprogramming/keyword/keyword_validator.py +59 -0
- multilingualprogramming/keyword/language_pack_validator.py +110 -0
- multilingualprogramming/lexer/__init__.py +11 -0
- multilingualprogramming/lexer/lexer.py +570 -0
- multilingualprogramming/lexer/source_reader.py +91 -0
- multilingualprogramming/lexer/token.py +54 -0
- multilingualprogramming/lexer/token_types.py +38 -0
- multilingualprogramming/numeral/__init__.py +11 -0
- multilingualprogramming/numeral/abstract_numeral.py +232 -0
- multilingualprogramming/numeral/complex_numeral.py +190 -0
- multilingualprogramming/numeral/fraction_numeral.py +165 -0
- multilingualprogramming/numeral/mp_numeral.py +243 -0
- multilingualprogramming/numeral/numeral_converter.py +151 -0
- multilingualprogramming/numeral/roman_numeral.py +301 -0
- multilingualprogramming/numeral/unicode_numeral.py +292 -0
- multilingualprogramming/parser/__init__.py +28 -0
- multilingualprogramming/parser/ast_nodes.py +459 -0
- multilingualprogramming/parser/ast_printer.py +677 -0
- multilingualprogramming/parser/error_messages.py +75 -0
- multilingualprogramming/parser/parser.py +1796 -0
- multilingualprogramming/parser/semantic_analyzer.py +689 -0
- multilingualprogramming/parser/surface_normalizer.py +282 -0
- multilingualprogramming/resources/datetime/eras.json +23 -0
- multilingualprogramming/resources/datetime/formats.json +32 -0
- multilingualprogramming/resources/datetime/months.json +150 -0
- multilingualprogramming/resources/datetime/weekdays.json +90 -0
- multilingualprogramming/resources/parser/error_messages.json +310 -0
- multilingualprogramming/resources/repl/commands.json +636 -0
- multilingualprogramming/resources/usm/builtins_aliases.json +731 -0
- multilingualprogramming/resources/usm/keywords.json +1063 -0
- multilingualprogramming/resources/usm/operators.json +532 -0
- multilingualprogramming/resources/usm/schema.json +34 -0
- multilingualprogramming/resources/usm/surface_patterns.json +1523 -0
- multilingualprogramming/unicode_string.py +140 -0
- multilingualprogramming/version.py +9 -0
- multilingualprogramming-0.2.0.dist-info/METADATA +350 -0
- multilingualprogramming-0.2.0.dist-info/RECORD +61 -0
- multilingualprogramming-0.2.0.dist-info/WHEEL +5 -0
- multilingualprogramming-0.2.0.dist-info/entry_points.txt +3 -0
- multilingualprogramming-0.2.0.dist-info/licenses/LICENSE +674 -0
- multilingualprogramming-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
#
|
|
2
|
+
# SPDX-FileCopyrightText: 2024 John Samuel <johnsamuelwrites@gmail.com>
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""
|
|
8
|
+
Runtime built-in functions for the multilingual programming language.
|
|
9
|
+
|
|
10
|
+
Provides a namespace dict of built-in functions that are injected into
|
|
11
|
+
the execution environment so that multilingual identifiers (e.g., the
|
|
12
|
+
Hindi word for "print") resolve to Python built-ins.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
from multilingualprogramming.keyword.keyword_registry import KeywordRegistry
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class RuntimeBuiltins:
|
|
22
|
+
"""
|
|
23
|
+
Builds a dict of built-in names that should be available at runtime.
|
|
24
|
+
|
|
25
|
+
For a given source language, the keyword used for PRINT, INPUT, and
|
|
26
|
+
type keywords are mapped to the corresponding Python built-ins.
|
|
27
|
+
|
|
28
|
+
Usage:
|
|
29
|
+
builtins = RuntimeBuiltins("fr").namespace()
|
|
30
|
+
# {'afficher': <built-in function print>,
|
|
31
|
+
# 'saisir': <built-in function input>, ...}
|
|
32
|
+
|
|
33
|
+
The returned dict is intended to be merged into the exec() globals
|
|
34
|
+
so that transpiled code can call built-in functions by their
|
|
35
|
+
multilingual names.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
# Mapping from USM concept ID to the Python built-in object
|
|
39
|
+
_CONCEPT_TO_BUILTIN = {
|
|
40
|
+
"PRINT": print,
|
|
41
|
+
"INPUT": input,
|
|
42
|
+
"TYPE_INT": int,
|
|
43
|
+
"TYPE_FLOAT": float,
|
|
44
|
+
"TYPE_STR": str,
|
|
45
|
+
"TYPE_BOOL": bool,
|
|
46
|
+
"TYPE_LIST": list,
|
|
47
|
+
"TYPE_DICT": dict,
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
# Additional Python built-ins available in every language
|
|
51
|
+
_UNIVERSAL_BUILTINS = {
|
|
52
|
+
"len": len,
|
|
53
|
+
"range": range,
|
|
54
|
+
"abs": abs,
|
|
55
|
+
"min": min,
|
|
56
|
+
"max": max,
|
|
57
|
+
"sum": sum,
|
|
58
|
+
"sorted": sorted,
|
|
59
|
+
"reversed": reversed,
|
|
60
|
+
"enumerate": enumerate,
|
|
61
|
+
"zip": zip,
|
|
62
|
+
"map": map,
|
|
63
|
+
"filter": filter,
|
|
64
|
+
"isinstance": isinstance,
|
|
65
|
+
"type": type,
|
|
66
|
+
"hasattr": hasattr,
|
|
67
|
+
"getattr": getattr,
|
|
68
|
+
"setattr": setattr,
|
|
69
|
+
"repr": repr,
|
|
70
|
+
"round": round,
|
|
71
|
+
"open": open,
|
|
72
|
+
"iter": iter,
|
|
73
|
+
"next": next,
|
|
74
|
+
"any": any,
|
|
75
|
+
"all": all,
|
|
76
|
+
"chr": chr,
|
|
77
|
+
"ord": ord,
|
|
78
|
+
"hex": hex,
|
|
79
|
+
"oct": oct,
|
|
80
|
+
"bin": bin,
|
|
81
|
+
"id": id,
|
|
82
|
+
"hash": hash,
|
|
83
|
+
"callable": callable,
|
|
84
|
+
"dir": dir,
|
|
85
|
+
"vars": vars,
|
|
86
|
+
"super": super,
|
|
87
|
+
"property": property,
|
|
88
|
+
"staticmethod": staticmethod,
|
|
89
|
+
"classmethod": classmethod,
|
|
90
|
+
"print": print,
|
|
91
|
+
"input": input,
|
|
92
|
+
"int": int,
|
|
93
|
+
"float": float,
|
|
94
|
+
"str": str,
|
|
95
|
+
"bool": bool,
|
|
96
|
+
"list": list,
|
|
97
|
+
"dict": dict,
|
|
98
|
+
"set": set,
|
|
99
|
+
"tuple": tuple,
|
|
100
|
+
"frozenset": frozenset,
|
|
101
|
+
"bytes": bytes,
|
|
102
|
+
"bytearray": bytearray,
|
|
103
|
+
"memoryview": memoryview,
|
|
104
|
+
"object": object,
|
|
105
|
+
"Exception": Exception,
|
|
106
|
+
"ValueError": ValueError,
|
|
107
|
+
"TypeError": TypeError,
|
|
108
|
+
"KeyError": KeyError,
|
|
109
|
+
"IndexError": IndexError,
|
|
110
|
+
"AttributeError": AttributeError,
|
|
111
|
+
"RuntimeError": RuntimeError,
|
|
112
|
+
"StopIteration": StopIteration,
|
|
113
|
+
"ZeroDivisionError": ZeroDivisionError,
|
|
114
|
+
"FileNotFoundError": FileNotFoundError,
|
|
115
|
+
"IOError": IOError,
|
|
116
|
+
"OSError": OSError,
|
|
117
|
+
"ImportError": ImportError,
|
|
118
|
+
"NotImplementedError": NotImplementedError,
|
|
119
|
+
"True": True,
|
|
120
|
+
"False": False,
|
|
121
|
+
"None": None,
|
|
122
|
+
# Additional built-in functions
|
|
123
|
+
"pow": pow,
|
|
124
|
+
"divmod": divmod,
|
|
125
|
+
"complex": complex,
|
|
126
|
+
"format": format,
|
|
127
|
+
"ascii": ascii,
|
|
128
|
+
"breakpoint": breakpoint,
|
|
129
|
+
"compile": compile,
|
|
130
|
+
"eval": eval,
|
|
131
|
+
"exec": exec,
|
|
132
|
+
"globals": globals,
|
|
133
|
+
"locals": locals,
|
|
134
|
+
"issubclass": issubclass,
|
|
135
|
+
"delattr": delattr,
|
|
136
|
+
"slice": slice,
|
|
137
|
+
# Additional exception types
|
|
138
|
+
"ArithmeticError": ArithmeticError,
|
|
139
|
+
"AssertionError": AssertionError,
|
|
140
|
+
"BufferError": BufferError,
|
|
141
|
+
"EOFError": EOFError,
|
|
142
|
+
"FloatingPointError": FloatingPointError,
|
|
143
|
+
"GeneratorExit": GeneratorExit,
|
|
144
|
+
"LookupError": LookupError,
|
|
145
|
+
"NameError": NameError,
|
|
146
|
+
"OverflowError": OverflowError,
|
|
147
|
+
"PermissionError": PermissionError,
|
|
148
|
+
"RecursionError": RecursionError,
|
|
149
|
+
"ReferenceError": ReferenceError,
|
|
150
|
+
"SyntaxError": SyntaxError,
|
|
151
|
+
"SystemError": SystemError,
|
|
152
|
+
"SystemExit": SystemExit,
|
|
153
|
+
"TimeoutError": TimeoutError,
|
|
154
|
+
"UnicodeError": UnicodeError,
|
|
155
|
+
"UnicodeDecodeError": UnicodeDecodeError,
|
|
156
|
+
"UnicodeEncodeError": UnicodeEncodeError,
|
|
157
|
+
"Warning": Warning,
|
|
158
|
+
"DeprecationWarning": DeprecationWarning,
|
|
159
|
+
"UserWarning": UserWarning,
|
|
160
|
+
"FutureWarning": FutureWarning,
|
|
161
|
+
"ResourceWarning": ResourceWarning,
|
|
162
|
+
"ConnectionError": ConnectionError,
|
|
163
|
+
"BrokenPipeError": BrokenPipeError,
|
|
164
|
+
"BlockingIOError": BlockingIOError,
|
|
165
|
+
"ChildProcessError": ChildProcessError,
|
|
166
|
+
"ConnectionAbortedError": ConnectionAbortedError,
|
|
167
|
+
"ConnectionRefusedError": ConnectionRefusedError,
|
|
168
|
+
"ConnectionResetError": ConnectionResetError,
|
|
169
|
+
"FileExistsError": FileExistsError,
|
|
170
|
+
"InterruptedError": InterruptedError,
|
|
171
|
+
"IsADirectoryError": IsADirectoryError,
|
|
172
|
+
"NotADirectoryError": NotADirectoryError,
|
|
173
|
+
"ProcessLookupError": ProcessLookupError,
|
|
174
|
+
"StopAsyncIteration": StopAsyncIteration,
|
|
175
|
+
"UnboundLocalError": UnboundLocalError,
|
|
176
|
+
# Base exception classes
|
|
177
|
+
"BaseException": BaseException,
|
|
178
|
+
"KeyboardInterrupt": KeyboardInterrupt,
|
|
179
|
+
# Additional exception types (Python 3.12)
|
|
180
|
+
"ModuleNotFoundError": ModuleNotFoundError,
|
|
181
|
+
"IndentationError": IndentationError,
|
|
182
|
+
"TabError": TabError,
|
|
183
|
+
"UnicodeTranslateError": UnicodeTranslateError,
|
|
184
|
+
"ExceptionGroup": ExceptionGroup,
|
|
185
|
+
"BaseExceptionGroup": BaseExceptionGroup,
|
|
186
|
+
# Additional warning types
|
|
187
|
+
"BytesWarning": BytesWarning,
|
|
188
|
+
"EncodingWarning": EncodingWarning,
|
|
189
|
+
"ImportWarning": ImportWarning,
|
|
190
|
+
"PendingDeprecationWarning": PendingDeprecationWarning,
|
|
191
|
+
"RuntimeWarning": RuntimeWarning,
|
|
192
|
+
"SyntaxWarning": SyntaxWarning,
|
|
193
|
+
"UnicodeWarning": UnicodeWarning,
|
|
194
|
+
# Async built-in functions (Python 3.10+)
|
|
195
|
+
"aiter": aiter,
|
|
196
|
+
"anext": anext,
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
# Non-callable special values available in exec() namespace
|
|
200
|
+
_SPECIAL_VALUES = {
|
|
201
|
+
"Ellipsis": Ellipsis,
|
|
202
|
+
"NotImplemented": NotImplemented,
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
_BUILTIN_ALIAS_CATALOG = None
|
|
206
|
+
|
|
207
|
+
def __init__(self, source_language="en"):
|
|
208
|
+
self._language = source_language
|
|
209
|
+
self._registry = KeywordRegistry()
|
|
210
|
+
|
|
211
|
+
@classmethod
|
|
212
|
+
def _load_builtin_alias_catalog(cls):
|
|
213
|
+
"""Load localized built-in aliases from resources."""
|
|
214
|
+
if cls._BUILTIN_ALIAS_CATALOG is not None:
|
|
215
|
+
return cls._BUILTIN_ALIAS_CATALOG
|
|
216
|
+
|
|
217
|
+
path = (
|
|
218
|
+
Path(__file__).resolve().parent.parent
|
|
219
|
+
/ "resources" / "usm" / "builtins_aliases.json"
|
|
220
|
+
)
|
|
221
|
+
with open(path, "r", encoding="utf-8-sig") as handle:
|
|
222
|
+
cls._BUILTIN_ALIAS_CATALOG = json.load(handle)
|
|
223
|
+
return cls._BUILTIN_ALIAS_CATALOG
|
|
224
|
+
|
|
225
|
+
@classmethod
|
|
226
|
+
def _localized_builtin_aliases(cls, language):
|
|
227
|
+
"""Return alias->builtin map for a given language."""
|
|
228
|
+
catalog = cls._load_builtin_alias_catalog()
|
|
229
|
+
aliases = {}
|
|
230
|
+
for canonical, by_language in catalog.get("aliases", {}).items():
|
|
231
|
+
builtin_obj = cls._UNIVERSAL_BUILTINS.get(canonical)
|
|
232
|
+
if builtin_obj is None or not isinstance(by_language, dict):
|
|
233
|
+
continue
|
|
234
|
+
for alias in by_language.get(language, []):
|
|
235
|
+
aliases[alias] = builtin_obj
|
|
236
|
+
return aliases
|
|
237
|
+
|
|
238
|
+
def namespace(self):
|
|
239
|
+
"""
|
|
240
|
+
Return a dict mapping multilingual names to Python built-ins.
|
|
241
|
+
|
|
242
|
+
Includes:
|
|
243
|
+
1. Language-specific keyword mappings (PRINT -> afficher, etc.)
|
|
244
|
+
2. Universal Python built-ins (len, range, abs, etc.)
|
|
245
|
+
"""
|
|
246
|
+
ns = dict(self._UNIVERSAL_BUILTINS)
|
|
247
|
+
ns.update(self._SPECIAL_VALUES)
|
|
248
|
+
|
|
249
|
+
# Add language-specific mappings (all variants, not only canonical).
|
|
250
|
+
concept_map = self._registry.get_concept_map()
|
|
251
|
+
for concept, builtin_obj in self._CONCEPT_TO_BUILTIN.items():
|
|
252
|
+
try:
|
|
253
|
+
translations = concept_map[concept]
|
|
254
|
+
keyword_value = translations.get(self._language)
|
|
255
|
+
if keyword_value is None:
|
|
256
|
+
continue
|
|
257
|
+
keywords = (
|
|
258
|
+
keyword_value if isinstance(keyword_value, list)
|
|
259
|
+
else [keyword_value]
|
|
260
|
+
)
|
|
261
|
+
for keyword in keywords:
|
|
262
|
+
if keyword not in ns:
|
|
263
|
+
ns[keyword] = builtin_obj
|
|
264
|
+
except Exception:
|
|
265
|
+
pass # Skip if concept not found for this language
|
|
266
|
+
for alias, builtin_obj in self._localized_builtin_aliases(
|
|
267
|
+
self._language
|
|
268
|
+
).items():
|
|
269
|
+
# Keep canonical names stable if an alias collides.
|
|
270
|
+
if alias not in ns:
|
|
271
|
+
ns[alias] = builtin_obj
|
|
272
|
+
|
|
273
|
+
return ns
|
|
274
|
+
|
|
275
|
+
@classmethod
|
|
276
|
+
def all_languages_namespace(cls):
|
|
277
|
+
"""
|
|
278
|
+
Return a namespace containing built-in mappings for ALL supported
|
|
279
|
+
languages simultaneously. Useful for multi-language environments.
|
|
280
|
+
"""
|
|
281
|
+
ns = dict(cls._UNIVERSAL_BUILTINS)
|
|
282
|
+
registry = KeywordRegistry()
|
|
283
|
+
|
|
284
|
+
concept_map = registry.get_concept_map()
|
|
285
|
+
for lang in registry.get_supported_languages():
|
|
286
|
+
for concept, builtin_obj in cls._CONCEPT_TO_BUILTIN.items():
|
|
287
|
+
try:
|
|
288
|
+
translations = concept_map[concept]
|
|
289
|
+
keyword_value = translations.get(lang)
|
|
290
|
+
if keyword_value is None:
|
|
291
|
+
continue
|
|
292
|
+
keywords = (
|
|
293
|
+
keyword_value if isinstance(keyword_value, list)
|
|
294
|
+
else [keyword_value]
|
|
295
|
+
)
|
|
296
|
+
for keyword in keywords:
|
|
297
|
+
if keyword not in ns:
|
|
298
|
+
ns[keyword] = builtin_obj
|
|
299
|
+
except Exception:
|
|
300
|
+
pass
|
|
301
|
+
for alias, builtin_obj in cls._localized_builtin_aliases(
|
|
302
|
+
lang
|
|
303
|
+
).items():
|
|
304
|
+
# Keep canonical names stable if an alias collides.
|
|
305
|
+
if alias not in ns:
|
|
306
|
+
ns[alias] = builtin_obj
|
|
307
|
+
|
|
308
|
+
return ns
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
#
|
|
2
|
+
# SPDX-FileCopyrightText: 2026 John Samuel <johnsamuelwrites@gmail.com>
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""Core IR package for language-agnostic internal representations."""
|
|
8
|
+
|
|
9
|
+
from multilingualprogramming.core.ir import CoreIRProgram
|
|
10
|
+
from multilingualprogramming.core.lowering import lower_to_core_ir
|
|
11
|
+
|
|
12
|
+
__all__ = ["CoreIRProgram", "lower_to_core_ir"]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#
|
|
2
|
+
# SPDX-FileCopyrightText: 2026 John Samuel <johnsamuelwrites@gmail.com>
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""Typed core IR containers shared by all language frontends."""
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from multilingualprogramming.parser.ast_nodes import Program
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class CoreIRProgram:
    """Language-agnostic, forward-only lowered program representation."""

    # Root AST node of the lowered program.
    ast: Program
    # Language the program was written in.
    source_language: str
    # Version tag of the core IR format.
    core_version: str = "0.1"
    # Free-form data recorded by the frontend that produced the AST.
    frontend_metadata: dict[str, Any] = field(default_factory=dict)

    def validate(self):
        """
        Check the minimal structural constraints of this object.

        Raises:
            TypeError: If ``ast`` is not a Program node
            ValueError: If ``source_language`` is not a non-empty string
        """
        root = self.ast
        if not isinstance(root, Program):
            raise TypeError("CoreIRProgram.ast must be a Program AST node")

        language = self.source_language
        if not (isinstance(language, str) and language):
            raise ValueError("CoreIRProgram.source_language must be a non-empty string")
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
#
|
|
2
|
+
# SPDX-FileCopyrightText: 2026 John Samuel <johnsamuelwrites@gmail.com>
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""Lowering boundary between language frontends and the shared core IR."""
|
|
8
|
+
|
|
9
|
+
from multilingualprogramming.core.ir import CoreIRProgram
|
|
10
|
+
from multilingualprogramming.parser.ast_nodes import Program
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def lower_to_core_ir(program_ast, source_language, frontend_name="default"):
    """
    Wrap a frontend AST in a validated CoreIRProgram.

    Parameters:
        program_ast (Program): Root node produced by a frontend parser
        source_language (str): Source language; a falsy value means "en"
        frontend_name (str): Name recorded in the frontend metadata

    Returns:
        CoreIRProgram: The validated core representation

    Raises:
        TypeError: If program_ast is not a Program node
    """
    if not isinstance(program_ast, Program):
        raise TypeError("lower_to_core_ir expects a Program AST root")

    language = source_language if source_language else "en"
    metadata = {"frontend_name": frontend_name}
    lowered = CoreIRProgram(
        ast=program_ast,
        source_language=language,
        frontend_metadata=metadata,
    )
    lowered.validate()
    return lowered
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
#
|
|
2
|
+
# SPDX-FileCopyrightText: 2024 John Samuel <johnsamuelwrites@gmail.com>
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""Datetime subpackage for multilingual programming."""
|
|
8
|
+
|
|
9
|
+
from multilingualprogramming.datetime.mp_date import MPDate
|
|
10
|
+
from multilingualprogramming.datetime.mp_time import MPTime
|
|
11
|
+
from multilingualprogramming.datetime.mp_datetime import MPDatetime
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
#
|
|
2
|
+
# SPDX-FileCopyrightText: 2024 John Samuel <johnsamuelwrites@gmail.com>
|
|
3
|
+
#
|
|
4
|
+
# SPDX-License-Identifier: GPL-3.0-or-later
|
|
5
|
+
#
|
|
6
|
+
|
|
7
|
+
"""Parser for multilingual date strings."""
|
|
8
|
+
|
|
9
|
+
import unicodedata
|
|
10
|
+
from typing import Any, cast
|
|
11
|
+
from multilingualprogramming.datetime.resource_loader import load_datetime_resource
|
|
12
|
+
from multilingualprogramming.exceptions import InvalidDateError
|
|
13
|
+
from multilingualprogramming.unicode_string import get_language_from_character
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# pylint: disable=too-few-public-methods
|
|
17
|
+
class DateParser:
|
|
18
|
+
"""
|
|
19
|
+
Parses multilingual date strings into (year, month, day) tuples.
|
|
20
|
+
|
|
21
|
+
Supports date strings with:
|
|
22
|
+
- Month names in any of the 10 pilot languages
|
|
23
|
+
- Numerals in any Unicode script
|
|
24
|
+
- Various separator characters (-, /, .)
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
_months_data: dict[str, Any] | None = None
|
|
28
|
+
_month_lookup: dict[str, dict[str, int]] | None = None
|
|
29
|
+
|
|
30
|
+
@classmethod
|
|
31
|
+
def _load(cls):
|
|
32
|
+
"""Load month names from JSON."""
|
|
33
|
+
if cls._months_data is not None and cls._month_lookup is not None:
|
|
34
|
+
return
|
|
35
|
+
|
|
36
|
+
months_data = load_datetime_resource("months.json")
|
|
37
|
+
if not isinstance(months_data, dict):
|
|
38
|
+
raise InvalidDateError("Invalid months data format")
|
|
39
|
+
|
|
40
|
+
# Build lookup: {lang: {month_name_lower: month_number}}
|
|
41
|
+
month_lookup: dict[str, dict[str, int]] = {}
|
|
42
|
+
month_names = list(months_data["months"].keys())
|
|
43
|
+
for idx, month_key in enumerate(month_names, 1):
|
|
44
|
+
month_data = months_data["months"][month_key]
|
|
45
|
+
for lang, names in month_data.items():
|
|
46
|
+
if lang not in month_lookup:
|
|
47
|
+
month_lookup[lang] = {}
|
|
48
|
+
month_lookup[lang][names["full"].lower()] = idx
|
|
49
|
+
month_lookup[lang][names["abbr"].lower()] = idx
|
|
50
|
+
|
|
51
|
+
cls._months_data = months_data
|
|
52
|
+
cls._month_lookup = month_lookup
|
|
53
|
+
|
|
54
|
+
@classmethod
|
|
55
|
+
def _extract_number(cls, token):
|
|
56
|
+
"""
|
|
57
|
+
Extract a numeric value from a token that may contain Unicode digits.
|
|
58
|
+
|
|
59
|
+
Returns:
|
|
60
|
+
int | None: The numeric value, or None if not a number
|
|
61
|
+
"""
|
|
62
|
+
if not token:
|
|
63
|
+
return None
|
|
64
|
+
try:
|
|
65
|
+
return int(token)
|
|
66
|
+
except ValueError:
|
|
67
|
+
pass
|
|
68
|
+
|
|
69
|
+
result = 0
|
|
70
|
+
for char in token:
|
|
71
|
+
if unicodedata.category(char) == "Nd":
|
|
72
|
+
result = result * 10 + unicodedata.decimal(char)
|
|
73
|
+
else:
|
|
74
|
+
return None
|
|
75
|
+
return result
|
|
76
|
+
|
|
77
|
+
@classmethod
|
|
78
|
+
def _find_month(cls, token):
|
|
79
|
+
"""
|
|
80
|
+
Find a month number from a token string.
|
|
81
|
+
|
|
82
|
+
Returns:
|
|
83
|
+
tuple(int, str) | None: (month_number, language) or None
|
|
84
|
+
"""
|
|
85
|
+
if cls._month_lookup is None:
|
|
86
|
+
return None
|
|
87
|
+
|
|
88
|
+
token_lower = token.lower()
|
|
89
|
+
for lang, months in cls._month_lookup.items():
|
|
90
|
+
if token_lower in months:
|
|
91
|
+
return (months[token_lower], lang)
|
|
92
|
+
return None
|
|
93
|
+
|
|
94
|
+
@classmethod
|
|
95
|
+
# pylint: disable=too-many-locals,too-many-return-statements,too-many-branches,too-many-statements
|
|
96
|
+
def parse(cls, datestr, language=None):
|
|
97
|
+
"""
|
|
98
|
+
Parse a multilingual date string.
|
|
99
|
+
|
|
100
|
+
Parameters:
|
|
101
|
+
datestr (str): Date string (e.g., "15-January-2024")
|
|
102
|
+
language (str): Optional language hint
|
|
103
|
+
|
|
104
|
+
Returns:
|
|
105
|
+
tuple(int, int, int, str): (year, month, day, detected_language)
|
|
106
|
+
|
|
107
|
+
Raises:
|
|
108
|
+
InvalidDateError: If the date cannot be parsed
|
|
109
|
+
"""
|
|
110
|
+
cls._load()
|
|
111
|
+
datestr = datestr.strip()
|
|
112
|
+
|
|
113
|
+
separators = ["-", "/", ".", " ", "年", "月", "日"]
|
|
114
|
+
tokens = []
|
|
115
|
+
current = ""
|
|
116
|
+
for char in datestr:
|
|
117
|
+
if char in separators:
|
|
118
|
+
if current:
|
|
119
|
+
tokens.append(current)
|
|
120
|
+
current = ""
|
|
121
|
+
else:
|
|
122
|
+
current += char
|
|
123
|
+
if current:
|
|
124
|
+
tokens.append(current)
|
|
125
|
+
|
|
126
|
+
if len(tokens) < 2:
|
|
127
|
+
raise InvalidDateError(f"Cannot parse date: {datestr}")
|
|
128
|
+
|
|
129
|
+
numbers = []
|
|
130
|
+
month_info = None
|
|
131
|
+
detected_lang = language
|
|
132
|
+
|
|
133
|
+
for token in tokens:
|
|
134
|
+
num = cls._extract_number(token)
|
|
135
|
+
if num is not None:
|
|
136
|
+
numbers.append(num)
|
|
137
|
+
if detected_lang is None:
|
|
138
|
+
for char in token:
|
|
139
|
+
lang_name = get_language_from_character(char)
|
|
140
|
+
if lang_name and lang_name != "DIGIT":
|
|
141
|
+
detected_lang = lang_name
|
|
142
|
+
break
|
|
143
|
+
continue
|
|
144
|
+
|
|
145
|
+
result = cls._find_month(token)
|
|
146
|
+
if result is None:
|
|
147
|
+
raise InvalidDateError(f"Cannot identify token: {token}")
|
|
148
|
+
|
|
149
|
+
month_num = result[0]
|
|
150
|
+
lang = result[1]
|
|
151
|
+
month_info = month_num
|
|
152
|
+
if detected_lang is None:
|
|
153
|
+
detected_lang = lang
|
|
154
|
+
|
|
155
|
+
if detected_lang is None:
|
|
156
|
+
detected_lang = "en"
|
|
157
|
+
|
|
158
|
+
if month_info is not None:
|
|
159
|
+
if len(numbers) == 2:
|
|
160
|
+
first = numbers[0]
|
|
161
|
+
second = numbers[1]
|
|
162
|
+
if first > 31:
|
|
163
|
+
year, day = first, second
|
|
164
|
+
elif second > 31:
|
|
165
|
+
day, year = first, second
|
|
166
|
+
elif first > 12:
|
|
167
|
+
day, year = first, second
|
|
168
|
+
else:
|
|
169
|
+
day, year = first, second
|
|
170
|
+
return (year, month_info, day, detected_lang)
|
|
171
|
+
|
|
172
|
+
if len(numbers) == 1:
|
|
173
|
+
value = cast(int, numbers[0])
|
|
174
|
+
if value > 31:
|
|
175
|
+
return (value, month_info, 1, detected_lang)
|
|
176
|
+
return (2024, month_info, value, detected_lang)
|
|
177
|
+
|
|
178
|
+
if len(numbers) == 3:
|
|
179
|
+
a = numbers[0]
|
|
180
|
+
b = numbers[1]
|
|
181
|
+
c = numbers[2]
|
|
182
|
+
if a > 31:
|
|
183
|
+
return (a, b, c, detected_lang)
|
|
184
|
+
if c > 31:
|
|
185
|
+
if a > 12:
|
|
186
|
+
return (c, b, a, detected_lang)
|
|
187
|
+
return (c, a, b, detected_lang)
|
|
188
|
+
return (c, b, a, detected_lang)
|
|
189
|
+
|
|
190
|
+
raise InvalidDateError(f"Cannot determine date components: {datestr}")
|