lokit-python 0.1.3__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lokit_python-0.1.4/MANIFEST.in +1 -0
- lokit_python-0.1.4/PKG-INFO +278 -0
- lokit_python-0.1.4/README.md +245 -0
- lokit_python-0.1.4/pyproject.toml +95 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/setup.py +0 -9
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/__init__.py +8 -0
- lokit_python-0.1.4/src/lokit/compat.py +7 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/data/lang_codes.py +1 -1
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/data/structure.py +1 -1
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/data/tag_types.py +2 -1
- lokit_python-0.1.4/src/lokit/exporters/__init__.py +125 -0
- lokit_python-0.1.4/src/lokit/exporters/xlsx.py +141 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/format_detection.py +1 -1
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/io/stream_json.py +1 -1
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/logic.py +6 -4
- lokit_python-0.1.4/src/lokit/parsers/__init__.py +315 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/models.py +2 -1
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/tags.py +1 -1
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/xlsx/extraction.py +56 -59
- lokit_python-0.1.4/src/lokit_python.egg-info/PKG-INFO +278 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit_python.egg-info/SOURCES.txt +5 -0
- lokit_python-0.1.4/src/lokit_python.egg-info/requires.txt +4 -0
- lokit_python-0.1.4/src/lokit_python.egg-info/top_level.txt +2 -0
- lokit_python-0.1.4/stubs/python_calamine.pyi +26 -0
- lokit_python-0.1.4/stubs/rustpy_xlsxwriter.pyi +25 -0
- lokit_python-0.1.4/tests/test_public_api.py +67 -0
- lokit_python-0.1.3/PKG-INFO +0 -149
- lokit_python-0.1.3/README.md +0 -139
- lokit_python-0.1.3/pyproject.toml +0 -39
- lokit_python-0.1.3/src/lokit/exporters/__init__.py +0 -34
- lokit_python-0.1.3/src/lokit/exporters/xlsx.py +0 -69
- lokit_python-0.1.3/src/lokit/parsers/xlsx/__init__.py +0 -1
- lokit_python-0.1.3/src/lokit_python.egg-info/PKG-INFO +0 -149
- lokit_python-0.1.3/src/lokit_python.egg-info/requires.txt +0 -3
- lokit_python-0.1.3/src/lokit_python.egg-info/top_level.txt +0 -2
- {lokit_python-0.1.3 → lokit_python-0.1.4}/setup.cfg +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/core/__init__.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/core/logger.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/data/__init__.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/exporters/csv.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/exporters/html.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/exporters/idml.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/exporters/json_i18n.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/exporters/po.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/exporters/tmx.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/exporters/xliff.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/importers.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/io/__init__.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/io/atomic.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/io/json.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/async_bridge.py +0 -0
- {lokit_python-0.1.3/src/lokit/parsers → lokit_python-0.1.4/src/lokit/parsers/csv}/__init__.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/csv/extraction.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/html/__init__.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/html/extraction.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/idml/__init__.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/idml/extraction.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/json_i18n/__init__.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/json_i18n/extraction.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/po/__init__.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/po/extraction.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/__init__.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/base.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/extraction.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/header.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/helpers.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/parallel.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/props.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/xml_utils.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/xliff/__init__.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/xliff/extraction.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/xliff/tags.py +0 -0
- {lokit_python-0.1.3/src/lokit/parsers/csv → lokit_python-0.1.4/src/lokit/parsers/xlsx}/__init__.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/py.typed +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit_python.egg-info/dependency_links.txt +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/tests/test_csv.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/tests/test_html.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/tests/test_idml.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/tests/test_json_i18n.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/tests/test_performance_safety.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/tests/test_po.py +0 -0
- {lokit_python-0.1.3 → lokit_python-0.1.4}/tests/test_xlsx.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
include stubs/*.pyi
|
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lokit-python
|
|
3
|
+
Version: 0.1.4
|
|
4
|
+
Summary: A type-safe localization toolkit for parsing, converting, and matching TMX, XLIFF, PO, JSON, HTML, CSV, XLSX, and IDML files.
|
|
5
|
+
Project-URL: Homepage, https://github.com/ciarandarby/lokit
|
|
6
|
+
Project-URL: Repository, https://github.com/ciarandarby/lokit
|
|
7
|
+
Project-URL: Issues, https://github.com/ciarandarby/lokit/issues
|
|
8
|
+
Project-URL: Documentation, https://github.com/ciarandarby/lokit#readme
|
|
9
|
+
Keywords: localization,l10n,i18n,translation,translation-memory,tmx,xliff,gettext,po,idml,xlsx,csv,json,html,type-safe,async,streaming,mypy,mypyc,backend-localization,parsing,parsers,localization-parsers,performative,internationalization,streaming,backend,memory-safe
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: Information Technology
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
21
|
+
Classifier: Topic :: Software Development :: Internationalization
|
|
22
|
+
Classifier: Topic :: Software Development :: Localization
|
|
23
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
|
+
Classifier: Topic :: Text Processing :: Markup :: XML
|
|
25
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
26
|
+
Classifier: Typing :: Typed
|
|
27
|
+
Requires-Python: >=3.10
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
Requires-Dist: lxml>=6.1.1
|
|
30
|
+
Requires-Dist: python-calamine>=0.6.2
|
|
31
|
+
Requires-Dist: polib>=1.2.0
|
|
32
|
+
Requires-Dist: rustpy-xlsxwriter<0.5,>=0.4.4
|
|
33
|
+
|
|
34
|
+
# lokit
|
|
35
|
+
|
|
36
|
+
> [!WARNING]
|
|
37
|
+
> **Beta Release:** lokit is currently in Beta. The API is volatile and subject to rapid, breaking changes prior to the official V1 release.
|
|
38
|
+
|
|
39
|
+
<br>
|
|
40
|
+
|
|
41
|
+
lokit is a high-performance, strictly type-safe, and highly memory-efficient localization toolkit for Python.
|
|
42
|
+
|
|
43
|
+
<br>
|
|
44
|
+
|
|
45
|
+
Supports Python 3.10+.
|
|
46
|
+
|
|
47
|
+
<br>
|
|
48
|
+
<hr>
|
|
49
|
+
<br>
|
|
50
|
+
|
|
51
|
+
Unlike legacy tools that wrap around XML DOM element trees in-memory, lokit represents a shift away from XML-based localization interchange formats towards native language parsing. It ingests localization formats (TMX, XLIFF, PO, XLSX, CSV, JSON, HTML, IDML) and compiles them into a strict, unified structural data model. This enables not just parsing, but robust data manipulation, semantic extraction, and advanced translation memory features out-of-the-box. Lokit focuses on streaming and asynchronous processing rather than synchronous events using in-memory files.
|
|
52
|
+
|
|
53
|
+
<br>
|
|
54
|
+
|
|
55
|
+
This format type can be easily converted to JSON for interchange with other systems. I've made parsing and data transfers as native as possible by capturing all elements of traditional interchange formats in a common format structure. This allows for much better compatibility, especially in terms of segment matching and leveraging as it uses flattened strings as standard. Tags are preserved but as a common format, meaning the structure parsed from XLIFF will be the same as the structure parsed from HTML.
|
|
56
|
+
|
|
57
|
+
<br>
|
|
58
|
+
|
|
59
|
+
These legacy file formats have supported vendor lock-in for many years, making it difficult for any client to move to another system. Seeing that this is a major issue across the domain, something new is needed where vendors do not use hidden, legacy technology to lock in their clients. Localization deserves innovation.
|
|
60
|
+
|
|
61
|
+
<br>
|
|
62
|
+
<hr>
|
|
63
|
+
|
|
64
|
+
> The main premise here is a common, structured and type-safe dataclass model structure that is intentionally compatible with any file format, not just localization interchange formats, although these are optimized for performance and memory efficiency due to the verbose nature of XML based formats.
|
|
65
|
+
|
|
66
|
+
<br>
|
|
67
|
+
|
|
68
|
+
Note: This project was originally written in Rust and is still unreleased. Adding Rust extensions did not show a major performance improvement over the current C-Extension modules due to bridging overheads, this will be re-addressed in future releases. SDKs in other languages including the Rust prototype are coming soon.
|
|
69
|
+
|
|
70
|
+
<br>
|
|
71
|
+
|
|
72
|
+
## Core Features
|
|
73
|
+
|
|
74
|
+
<br>
|
|
75
|
+
|
|
76
|
+
lokit provides a comprehensive suite of tools for managing localization data:
|
|
77
|
+
|
|
78
|
+
* **Native Structural Modeling:** Converts interchange formats into strict, unified Python dataclasses, ensuring complete type safety.
|
|
79
|
+
* **Advanced Matching Engine:** Provides Exact Matching, Fuzzy Matching (via SequenceMatcher), and In-Context Exact (ICE) Matching leveraging previous and next segment context, as well as with inline tags.
|
|
80
|
+
* **Sub-segment Extraction:** Automatically parses and isolates inline tags, properties, and formatting markers, allowing for safe manipulation of text without corrupting code.
|
|
81
|
+
* **Semantic Querying:** Easily filter translation units using any attribute, exact ID lookups, or deep nested JSON path querying (`where()`).
|
|
82
|
+
* **Plural Support:** Native extraction and structuring of pluralized translation units, compatible with UI frameworks.
|
|
83
|
+
* **Universal Format Conversion:** Instantly import and export between any supported format (e.g., TMX to JSON, HTML to XLIFF) with zero data loss.
|
|
84
|
+
* **Synchronous and Asynchronous Streaming:** Process massive enterprise files natively using Python async generators to keep memory overhead to an absolute minimum.
|
|
85
|
+
|
|
86
|
+
<br>
|
|
87
|
+
|
|
88
|
+
### Type Safety and C-Extensions
|
|
89
|
+
|
|
90
|
+
<br>
|
|
91
|
+
|
|
92
|
+
The entire library is strictly typed and mypy compliant, strict enough that it compiles to C extensions via mypyc, which are distributed prebuilt via wheels. Additionally, any XML processing uses C-based packages. Compiling to these extensions has shown a 23% overall performance increase over pure-Python modules, with additional benefits such as lower memory usage. C extensions are standard for macOS (ARM+Intel), Windows, and Linux.
|
|
93
|
+
|
|
94
|
+
<br>
|
|
95
|
+
|
|
96
|
+
## Parsing Performance
|
|
97
|
+
|
|
98
|
+
<br>
|
|
99
|
+
|
|
100
|
+
When dealing with enterprise-scale localization environments, parsing performance and memory efficiency are paramount. lokit is designed to be significantly leaner and faster than the industry standard.
|
|
101
|
+
|
|
102
|
+
<br>
|
|
103
|
+
|
|
104
|
+
We benchmarked lokit's modules against their equivalents in another package, `translate-toolkit`, which we chose as the reference because it is the de facto, feature-rich standard for localization file format parsing and conversion in Python.
|
|
105
|
+
|
|
106
|
+
<br>
|
|
107
|
+
|
|
108
|
+
In a stress-test benchmark on a +600 MB `.TMX` file containing **557,058 segments**, converting to JSON with `Lokit.to_json_async()` over 3 iterations yielded the following comparative averages:
|
|
109
|
+
|
|
110
|
+
<br>
|
|
111
|
+
|
|
112
|
+
| Library | Avg Duration | Peak Memory | Memory Efficiency |
|
|
113
|
+
|---------|------------------|------------------|-------------------|
|
|
114
|
+
| **lokit** | 13.57s | 135.9 MB | 15x Less Memory |
|
|
115
|
+
| **translate-toolkit** | 20.30s | 2,034.5 MB | ~2.0 GB |
|
|
116
|
+
|
|
117
|
+
<br>
|
|
118
|
+
|
|
119
|
+
Both tests covered TMX-to-JSON conversion with inline tag sanitization, each using the respective package's own tooling.
|
|
120
|
+
|
|
121
|
+
<br>
|
|
122
|
+
|
|
123
|
+
The major focus on memory safety allows for parallel processing of events, making it suitable for large-scale localization workflows and backend systems.
|
|
124
|
+
|
|
125
|
+
<br>
|
|
126
|
+
|
|
127
|
+
**Note:** this package is not a replacement or substitute for the already amazing translate-toolkit. The functionality is quite different across the two libraries, and each has its own use cases.
|
|
128
|
+
|
|
129
|
+
<br>
|
|
130
|
+
<hr>
|
|
131
|
+
|
|
132
|
+
## SDK Usage Reference
|
|
133
|
+
|
|
134
|
+
<br>
|
|
135
|
+
|
|
136
|
+
Lokit operates around a central `BaseStructure` dataclass model, which standardizes localization units and segments. This instructs better standardization and branching in a more language native way compared to XML based file formats. Parsing SDKs are added for both extraction and export tasks for localization interchange formats along with common file types.
|
|
137
|
+
|
|
138
|
+
<br>
|
|
139
|
+
|
|
140
|
+
### Installation
|
|
141
|
+
|
|
142
|
+
<br>
|
|
143
|
+
|
|
144
|
+
Install lokit via pip:
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
pip install lokit-python
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
<br>
|
|
151
|
+
|
|
152
|
+
### Basic Parsing and Conversion
|
|
153
|
+
|
|
154
|
+
<br>
|
|
155
|
+
|
|
156
|
+
Converting files synchronously is straightforward through the structured `lokit` API. Import the package once, then use the format paths under `lokit.parsers` and `lokit.exporters`.
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
import lokit
|
|
160
|
+
|
|
161
|
+
document = lokit.parsers.read.tmx("path/to/source.tmx")
|
|
162
|
+
|
|
163
|
+
lokit.exporters.write.xliff(document, "path/to/target.xliff")
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
<br>
|
|
167
|
+
|
|
168
|
+
### Asynchronous Streaming for Large Interchange Files
|
|
169
|
+
|
|
170
|
+
<br>
|
|
171
|
+
|
|
172
|
+
For files spanning hundreds of megabytes, parsing the entire DOM structure into memory is inefficient. Lokit supports stream-parsing natively.
|
|
173
|
+
|
|
174
|
+
<br>
|
|
175
|
+
|
|
176
|
+
Here's some simple scripting code to show how easy it is. This simple program has no boilerplate and can be reduced to a few lines of code, but for the purpose of showcasing, we added some wrapper functions. The stream APIs take the static attributes, such as language codes, and keep them in an immutable state; they then quickly stream the mutables. All other parsing modules also use streaming to parse to and from the common typed format.
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
import asyncio
|
|
180
|
+
import os
|
|
181
|
+
|
|
182
|
+
import lokit
|
|
183
|
+
|
|
184
|
+
input_dir = "data/language_tmx"
|
|
185
|
+
output_dir = "data/out"
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
async def convert_to_json(filepath: str):
|
|
189
|
+
print(f"Starting: {filepath}")
|
|
190
|
+
output = f"{output_dir}/{os.path.splitext(os.path.basename(filepath))[0]}.json"
|
|
191
|
+
await lokit.parsers.stream.json(
|
|
192
|
+
filepath=filepath,
|
|
193
|
+
output=output,
|
|
194
|
+
)
|
|
195
|
+
print(f"Completed: {output}")
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
async def process():
|
|
199
|
+
if not os.path.exists(output_dir):
|
|
200
|
+
os.makedirs(output_dir, exist_ok=True)
|
|
201
|
+
|
|
202
|
+
files = [os.path.join(input_dir, i) for i in os.listdir(input_dir)]
|
|
203
|
+
tasks = [convert_to_json(filepath=file) for file in files]
|
|
204
|
+
await asyncio.gather(*tasks)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
if __name__ == "__main__":
|
|
208
|
+
asyncio.run(process())
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
<br>
|
|
212
|
+
|
|
213
|
+
### Advanced Querying and Matching
|
|
214
|
+
|
|
215
|
+
<br>
|
|
216
|
+
|
|
217
|
+
The `Lokit` logic wrapper provides access to the powerful matching engine and data manipulation features. This does not substitute for enterprise database semantic search but can be used as an after-step for evaluating matching results after retrieving translation units from a semantic/vector database.
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
import lokit
|
|
221
|
+
|
|
222
|
+
engine = lokit.Lokit.parse("path/to/source.xliff")
|
|
223
|
+
|
|
224
|
+
button_units = engine.where("extensions.component", "checkout_button")
|
|
225
|
+
|
|
226
|
+
results = engine.fuzzy_find("Complete your purchase", limit=5, threshold=0.75)
|
|
227
|
+
for match in results:
|
|
228
|
+
print(f"Match found: {match.unit_id} (Score: {match.score})")
|
|
229
|
+
|
|
230
|
+
ice_match = engine.match(
|
|
231
|
+
source="Submit",
|
|
232
|
+
target_unit_id="submit_btn_1",
|
|
233
|
+
previous_source="Enter your email",
|
|
234
|
+
require_context=True
|
|
235
|
+
)
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
### Structured API Paths
|
|
239
|
+
|
|
240
|
+
The preferred public API is available from a single package import:
|
|
241
|
+
|
|
242
|
+
```python
|
|
243
|
+
import lokit
|
|
244
|
+
|
|
245
|
+
document = lokit.parsers.read.file("path/to/source.tmx")
|
|
246
|
+
document = lokit.parsers.read.csv("path/to/source.csv", source_locale="en-US")
|
|
247
|
+
streamed_tmx = lokit.parsers.stream.tmx("path/to/source.tmx")
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
async def stream_to_json() -> None:
|
|
251
|
+
await lokit.parsers.stream.json("path/to/source.tmx", "path/to/out")
|
|
252
|
+
|
|
253
|
+
lokit.exporters.write.csv(document, "path/to/target.csv")
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
async def export_xlsx() -> None:
|
|
257
|
+
await lokit.exporters.async_.xlsx(document, "path/to/target.xlsx")
|
|
258
|
+
|
|
259
|
+
CsvExtractor = lokit.parsers.extractors.csv
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
Existing direct imports from `lokit.importers`, `lokit.exporters`, and format modules remain supported for compatibility.
|
|
263
|
+
|
|
264
|
+
<br>
|
|
265
|
+
<hr>
|
|
266
|
+
|
|
267
|
+
## Supported Formats
|
|
268
|
+
|
|
269
|
+
<br>
|
|
270
|
+
|
|
271
|
+
* TMX
|
|
272
|
+
* XLIFF
|
|
273
|
+
* PO/POT
|
|
274
|
+
* XLSX
|
|
275
|
+
* CSV
|
|
276
|
+
* JSON
|
|
277
|
+
* HTML
|
|
278
|
+
* IDML
|
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
# lokit
|
|
2
|
+
|
|
3
|
+
> [!WARNING]
|
|
4
|
+
> **Beta Release:** lokit is currently in Beta. The API is volatile and subject to rapid, breaking changes prior to the official V1 release.
|
|
5
|
+
|
|
6
|
+
<br>
|
|
7
|
+
|
|
8
|
+
lokit is a high-performance, strictly type-safe, and highly memory-efficient localization toolkit for Python.
|
|
9
|
+
|
|
10
|
+
<br>
|
|
11
|
+
|
|
12
|
+
Supports Python 3.10+.
|
|
13
|
+
|
|
14
|
+
<br>
|
|
15
|
+
<hr>
|
|
16
|
+
<br>
|
|
17
|
+
|
|
18
|
+
Unlike legacy tools that wrap around XML DOM element trees in-memory, lokit represents a shift away from XML-based localization interchange formats towards native language parsing. It ingests localization formats (TMX, XLIFF, PO, XLSX, CSV, JSON, HTML, IDML) and compiles them into a strict, unified structural data model. This enables not just parsing, but robust data manipulation, semantic extraction, and advanced translation memory features out-of-the-box. Lokit focuses on streaming and asynchronous processing rather than synchronous events using in-memory files.
|
|
19
|
+
|
|
20
|
+
<br>
|
|
21
|
+
|
|
22
|
+
This format type can be easily converted to JSON for interchange with other systems. I've made parsing and data transfers as native as possible by capturing all elements of traditional interchange formats in a common format structure. This allows for much better compatibility, especially in terms of segment matching and leveraging as it uses flattened strings as standard. Tags are preserved but as a common format, meaning the structure parsed from XLIFF will be the same as the structure parsed from HTML.
|
|
23
|
+
|
|
24
|
+
<br>
|
|
25
|
+
|
|
26
|
+
These legacy file formats have supported vendor lock-in for many years, making it difficult for any client to move to another system. Seeing that this is a major issue across the domain, something new is needed where vendors do not use hidden, legacy technology to lock in their clients. Localization deserves innovation.
|
|
27
|
+
|
|
28
|
+
<br>
|
|
29
|
+
<hr>
|
|
30
|
+
|
|
31
|
+
> The main premise here is a common, structured and type-safe dataclass model structure that is intentionally compatible with any file format, not just localization interchange formats, although these are optimized for performance and memory efficiency due to the verbose nature of XML based formats.
|
|
32
|
+
|
|
33
|
+
<br>
|
|
34
|
+
|
|
35
|
+
Note: This project was originally written in Rust and is still unreleased. Adding Rust extensions did not show a major performance improvement over the current C-Extension modules due to bridging overheads, this will be re-addressed in future releases. SDKs in other languages including the Rust prototype are coming soon.
|
|
36
|
+
|
|
37
|
+
<br>
|
|
38
|
+
|
|
39
|
+
## Core Features
|
|
40
|
+
|
|
41
|
+
<br>
|
|
42
|
+
|
|
43
|
+
lokit provides a comprehensive suite of tools for managing localization data:
|
|
44
|
+
|
|
45
|
+
* **Native Structural Modeling:** Converts interchange formats into strict, unified Python dataclasses, ensuring complete type safety.
|
|
46
|
+
* **Advanced Matching Engine:** Provides Exact Matching, Fuzzy Matching (via SequenceMatcher), and In-Context Exact (ICE) Matching leveraging previous and next segment context, as well as with inline tags.
|
|
47
|
+
* **Sub-segment Extraction:** Automatically parses and isolates inline tags, properties, and formatting markers, allowing for safe manipulation of text without corrupting code.
|
|
48
|
+
* **Semantic Querying:** Easily filter translation units using any attribute, exact ID lookups, or deep nested JSON path querying (`where()`).
|
|
49
|
+
* **Plural Support:** Native extraction and structuring of pluralized translation units, compatible with UI frameworks.
|
|
50
|
+
* **Universal Format Conversion:** Instantly import and export between any supported format (e.g., TMX to JSON, HTML to XLIFF) with zero data loss.
|
|
51
|
+
* **Synchronous and Asynchronous Streaming:** Process massive enterprise files natively using Python async generators to keep memory overhead to an absolute minimum.
|
|
52
|
+
|
|
53
|
+
<br>
|
|
54
|
+
|
|
55
|
+
### Type Safety and C-Extensions
|
|
56
|
+
|
|
57
|
+
<br>
|
|
58
|
+
|
|
59
|
+
The entire library is strictly typed and mypy compliant, strict enough that it compiles to C extensions via mypyc, which are distributed prebuilt via wheels. Additionally, any XML processing uses C-based packages. Compiling to these extensions has shown a 23% overall performance increase over pure-Python modules, with additional benefits such as lower memory usage. C extensions are standard for macOS (ARM+Intel), Windows, and Linux.
|
|
60
|
+
|
|
61
|
+
<br>
|
|
62
|
+
|
|
63
|
+
## Parsing Performance
|
|
64
|
+
|
|
65
|
+
<br>
|
|
66
|
+
|
|
67
|
+
When dealing with enterprise-scale localization environments, parsing performance and memory efficiency are paramount. lokit is designed to be significantly leaner and faster than the industry standard.
|
|
68
|
+
|
|
69
|
+
<br>
|
|
70
|
+
|
|
71
|
+
We benchmarked lokit's modules against their equivalents in another package, `translate-toolkit`, which we chose as the reference because it is the de facto, feature-rich standard for localization file format parsing and conversion in Python.
|
|
72
|
+
|
|
73
|
+
<br>
|
|
74
|
+
|
|
75
|
+
In a stress-test benchmark on a +600 MB `.TMX` file containing **557,058 segments**, converting to JSON with `Lokit.to_json_async()` over 3 iterations yielded the following comparative averages:
|
|
76
|
+
|
|
77
|
+
<br>
|
|
78
|
+
|
|
79
|
+
| Library | Avg Duration | Peak Memory | Memory Efficiency |
|
|
80
|
+
|---------|------------------|------------------|-------------------|
|
|
81
|
+
| **lokit** | 13.57s | 135.9 MB | 15x Less Memory |
|
|
82
|
+
| **translate-toolkit** | 20.30s | 2,034.5 MB | ~2.0 GB |
|
|
83
|
+
|
|
84
|
+
<br>
|
|
85
|
+
|
|
86
|
+
Both tests covered TMX-to-JSON conversion with inline tag sanitization, each using the respective package's own tooling.
|
|
87
|
+
|
|
88
|
+
<br>
|
|
89
|
+
|
|
90
|
+
The major focus on memory safety allows for parallel processing of events, making it suitable for large-scale localization workflows and backend systems.
|
|
91
|
+
|
|
92
|
+
<br>
|
|
93
|
+
|
|
94
|
+
**Note:** this package is not a replacement or substitute for the already amazing translate-toolkit. The functionality is quite different across the two libraries, and each has its own use cases.
|
|
95
|
+
|
|
96
|
+
<br>
|
|
97
|
+
<hr>
|
|
98
|
+
|
|
99
|
+
## SDK Usage Reference
|
|
100
|
+
|
|
101
|
+
<br>
|
|
102
|
+
|
|
103
|
+
Lokit operates around a central `BaseStructure` dataclass model, which standardizes localization units and segments. This instructs better standardization and branching in a more language native way compared to XML based file formats. Parsing SDKs are added for both extraction and export tasks for localization interchange formats along with common file types.
|
|
104
|
+
|
|
105
|
+
<br>
|
|
106
|
+
|
|
107
|
+
### Installation
|
|
108
|
+
|
|
109
|
+
<br>
|
|
110
|
+
|
|
111
|
+
Install lokit via pip:
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
pip install lokit-python
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
<br>
|
|
118
|
+
|
|
119
|
+
### Basic Parsing and Conversion
|
|
120
|
+
|
|
121
|
+
<br>
|
|
122
|
+
|
|
123
|
+
Converting files synchronously is straightforward through the structured `lokit` API. Import the package once, then use the format paths under `lokit.parsers` and `lokit.exporters`.
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
import lokit
|
|
127
|
+
|
|
128
|
+
document = lokit.parsers.read.tmx("path/to/source.tmx")
|
|
129
|
+
|
|
130
|
+
lokit.exporters.write.xliff(document, "path/to/target.xliff")
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
<br>
|
|
134
|
+
|
|
135
|
+
### Asynchronous Streaming for Large Interchange Files
|
|
136
|
+
|
|
137
|
+
<br>
|
|
138
|
+
|
|
139
|
+
For files spanning hundreds of megabytes, parsing the entire DOM structure into memory is inefficient. Lokit supports stream-parsing natively.
|
|
140
|
+
|
|
141
|
+
<br>
|
|
142
|
+
|
|
143
|
+
Here's some simple scripting code to show how easy it is. This simple program has no boilerplate and can be reduced to a few lines of code, but for the purpose of showcasing, we added some wrapper functions. The stream APIs take the static attributes, such as language codes, and keep them in an immutable state; they then quickly stream the mutables. All other parsing modules also use streaming to parse to and from the common typed format.
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
import asyncio
|
|
147
|
+
import os
|
|
148
|
+
|
|
149
|
+
import lokit
|
|
150
|
+
|
|
151
|
+
input_dir = "data/language_tmx"
|
|
152
|
+
output_dir = "data/out"
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
async def convert_to_json(filepath: str):
|
|
156
|
+
print(f"Starting: {filepath}")
|
|
157
|
+
output = f"{output_dir}/{os.path.splitext(os.path.basename(filepath))[0]}.json"
|
|
158
|
+
await lokit.parsers.stream.json(
|
|
159
|
+
filepath=filepath,
|
|
160
|
+
output=output,
|
|
161
|
+
)
|
|
162
|
+
print(f"Completed: {output}")
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
async def process():
|
|
166
|
+
if not os.path.exists(output_dir):
|
|
167
|
+
os.makedirs(output_dir, exist_ok=True)
|
|
168
|
+
|
|
169
|
+
files = [os.path.join(input_dir, i) for i in os.listdir(input_dir)]
|
|
170
|
+
tasks = [convert_to_json(filepath=file) for file in files]
|
|
171
|
+
await asyncio.gather(*tasks)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
if __name__ == "__main__":
|
|
175
|
+
asyncio.run(process())
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
<br>
|
|
179
|
+
|
|
180
|
+
### Advanced Querying and Matching
|
|
181
|
+
|
|
182
|
+
<br>
|
|
183
|
+
|
|
184
|
+
The `Lokit` logic wrapper provides access to the powerful matching engine and data manipulation features. This does not substitute for enterprise database semantic search but can be used as an after-step for evaluating matching results after retrieving translation units from a semantic/vector database.
|
|
185
|
+
|
|
186
|
+
```python
|
|
187
|
+
import lokit
|
|
188
|
+
|
|
189
|
+
engine = lokit.Lokit.parse("path/to/source.xliff")
|
|
190
|
+
|
|
191
|
+
button_units = engine.where("extensions.component", "checkout_button")
|
|
192
|
+
|
|
193
|
+
results = engine.fuzzy_find("Complete your purchase", limit=5, threshold=0.75)
|
|
194
|
+
for match in results:
|
|
195
|
+
print(f"Match found: {match.unit_id} (Score: {match.score})")
|
|
196
|
+
|
|
197
|
+
ice_match = engine.match(
|
|
198
|
+
source="Submit",
|
|
199
|
+
target_unit_id="submit_btn_1",
|
|
200
|
+
previous_source="Enter your email",
|
|
201
|
+
require_context=True
|
|
202
|
+
)
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
### Structured API Paths
|
|
206
|
+
|
|
207
|
+
The preferred public API is available from a single package import:
|
|
208
|
+
|
|
209
|
+
```python
|
|
210
|
+
import lokit
|
|
211
|
+
|
|
212
|
+
document = lokit.parsers.read.file("path/to/source.tmx")
|
|
213
|
+
document = lokit.parsers.read.csv("path/to/source.csv", source_locale="en-US")
|
|
214
|
+
streamed_tmx = lokit.parsers.stream.tmx("path/to/source.tmx")
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
async def stream_to_json() -> None:
|
|
218
|
+
await lokit.parsers.stream.json("path/to/source.tmx", "path/to/out")
|
|
219
|
+
|
|
220
|
+
lokit.exporters.write.csv(document, "path/to/target.csv")
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
async def export_xlsx() -> None:
|
|
224
|
+
await lokit.exporters.async_.xlsx(document, "path/to/target.xlsx")
|
|
225
|
+
|
|
226
|
+
CsvExtractor = lokit.parsers.extractors.csv
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
Existing direct imports from `lokit.importers`, `lokit.exporters`, and format modules remain supported for compatibility.
|
|
230
|
+
|
|
231
|
+
<br>
|
|
232
|
+
<hr>
|
|
233
|
+
|
|
234
|
+
## Supported Formats
|
|
235
|
+
|
|
236
|
+
<br>
|
|
237
|
+
|
|
238
|
+
* TMX
|
|
239
|
+
* XLIFF
|
|
240
|
+
* PO/POT
|
|
241
|
+
* XLSX
|
|
242
|
+
* CSV
|
|
243
|
+
* JSON
|
|
244
|
+
* HTML
|
|
245
|
+
* IDML
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "lokit-python"
|
|
3
|
+
version = "0.1.4"
|
|
4
|
+
description = "A type-safe localization toolkit for parsing, converting, and matching TMX, XLIFF, PO, JSON, HTML, CSV, XLSX, and IDML files."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
keywords = [
|
|
8
|
+
"localization",
|
|
9
|
+
"l10n",
|
|
10
|
+
"i18n",
|
|
11
|
+
"translation",
|
|
12
|
+
"translation-memory",
|
|
13
|
+
"tmx",
|
|
14
|
+
"xliff",
|
|
15
|
+
"gettext",
|
|
16
|
+
"po",
|
|
17
|
+
"idml",
|
|
18
|
+
"xlsx",
|
|
19
|
+
"csv",
|
|
20
|
+
"json",
|
|
21
|
+
"html",
|
|
22
|
+
"type-safe",
|
|
23
|
+
"async",
|
|
24
|
+
"streaming",
|
|
25
|
+
"mypy",
|
|
26
|
+
"mypyc",
|
|
27
|
+
"backend-localization",
|
|
28
|
+
"parsing",
|
|
29
|
+
"parsers",
|
|
30
|
+
"localization-parsers",
|
|
31
|
+
"performant",
|
|
32
|
+
"internationalization",
|
|
33
|
+
"streaming",
|
|
34
|
+
"backend",
|
|
35
|
+
"memory-safe",
|
|
36
|
+
|
|
37
|
+
]
|
|
38
|
+
classifiers = [
|
|
39
|
+
"Development Status :: 4 - Beta",
|
|
40
|
+
"Intended Audience :: Developers",
|
|
41
|
+
"Intended Audience :: Information Technology",
|
|
42
|
+
"Operating System :: OS Independent",
|
|
43
|
+
"Programming Language :: Python :: 3",
|
|
44
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
45
|
+
"Programming Language :: Python :: 3.10",
|
|
46
|
+
"Programming Language :: Python :: 3.11",
|
|
47
|
+
"Programming Language :: Python :: 3.12",
|
|
48
|
+
"Programming Language :: Python :: 3.13",
|
|
49
|
+
"Programming Language :: Python :: 3.14",
|
|
50
|
+
"Topic :: Software Development :: Internationalization",
|
|
51
|
+
"Topic :: Software Development :: Localization",
|
|
52
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
53
|
+
"Topic :: Text Processing :: Markup :: XML",
|
|
54
|
+
"Topic :: Text Processing :: Linguistic",
|
|
55
|
+
"Typing :: Typed",
|
|
56
|
+
]
|
|
57
|
+
dependencies = [
|
|
58
|
+
"lxml>=6.1.1",
|
|
59
|
+
"python-calamine>=0.6.2",
|
|
60
|
+
"polib>=1.2.0",
|
|
61
|
+
"rustpy-xlsxwriter>=0.4.4,<0.5",
|
|
62
|
+
]
|
|
63
|
+
|
|
64
|
+
[project.urls]
|
|
65
|
+
Homepage = "https://github.com/ciarandarby/lokit"
|
|
66
|
+
Repository = "https://github.com/ciarandarby/lokit"
|
|
67
|
+
Issues = "https://github.com/ciarandarby/lokit/issues"
|
|
68
|
+
Documentation = "https://github.com/ciarandarby/lokit#readme"
|
|
69
|
+
|
|
70
|
+
[build-system]
|
|
71
|
+
requires = ["setuptools>=82.0.1", "mypy>=1.10.0", "lxml-stubs>=0.5.1", "types-polib"]
|
|
72
|
+
build-backend = "setuptools.build_meta"
|
|
73
|
+
|
|
74
|
+
[dependency-groups]
|
|
75
|
+
dev = [
|
|
76
|
+
"lxml-stubs>=0.5.1",
|
|
77
|
+
"types-polib",
|
|
78
|
+
"mypy>=2.1.0",
|
|
79
|
+
"pytest>=8.0",
|
|
80
|
+
"pytest-asyncio>=0.24",
|
|
81
|
+
"ruff>=0.15.15",
|
|
82
|
+
"setuptools>=82.0.1",
|
|
83
|
+
]
|
|
84
|
+
|
|
85
|
+
[tool.setuptools.packages.find]
|
|
86
|
+
where = ["src"]
|
|
87
|
+
|
|
88
|
+
[tool.setuptools.package-data]
|
|
89
|
+
lokit = ["py.typed"]
|
|
90
|
+
|
|
91
|
+
[tool.mypy]
|
|
92
|
+
mypy_path = ["src", "stubs"]
|
|
93
|
+
strict = true
|
|
94
|
+
warn_unreachable = true
|
|
95
|
+
show_error_codes = true
|
|
@@ -6,16 +6,7 @@ from setuptools.command.build_ext import build_ext
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
def _build_path_replacements(src_files):
|
|
9
|
-
"""Build a mapping of Windows-form paths to POSIX-form paths.
|
|
10
9
|
|
|
11
|
-
For each source file, compute its Windows backslash representation
|
|
12
|
-
and its POSIX forward-slash representation. Only include entries
|
|
13
|
-
where the two differ (i.e. the path contains directory separators).
|
|
14
|
-
|
|
15
|
-
This is used to fix mypyc-generated C files on Windows, where
|
|
16
|
-
embedded Python source paths use backslashes that MSVC interprets
|
|
17
|
-
as C escape sequences (e.g. \\x in \\xliff causes error C2153).
|
|
18
|
-
"""
|
|
19
10
|
replacements = {}
|
|
20
11
|
for src_file in src_files:
|
|
21
12
|
posix_form = PurePosixPath(src_file).as_posix()
|
|
@@ -77,6 +77,10 @@ from lokit.parsers.tmx.extraction import TmxExtractor
|
|
|
77
77
|
from lokit.parsers.tmx.models import TmxParseMode
|
|
78
78
|
from lokit.parsers.tmx.parallel import TmxParallelOptions
|
|
79
79
|
from lokit.parsers.xliff.extraction import XliffExtractor
|
|
80
|
+
from lokit import data as data
|
|
81
|
+
from lokit import exporters as exporters
|
|
82
|
+
from lokit import io as io
|
|
83
|
+
from lokit import parsers as parsers
|
|
80
84
|
|
|
81
85
|
__all__ = [
|
|
82
86
|
"AdjacentContext",
|
|
@@ -109,6 +113,8 @@ __all__ = [
|
|
|
109
113
|
"PoExtractor",
|
|
110
114
|
"JsonI18nExtractor",
|
|
111
115
|
"IdmlExtractor",
|
|
116
|
+
"data",
|
|
117
|
+
"exporters",
|
|
112
118
|
"export_csv",
|
|
113
119
|
"export_csv_async",
|
|
114
120
|
"export_idml",
|
|
@@ -153,6 +159,8 @@ __all__ = [
|
|
|
153
159
|
"import_xliff_async",
|
|
154
160
|
"import_xlsx",
|
|
155
161
|
"import_xlsx_async",
|
|
162
|
+
"io",
|
|
156
163
|
"load_lokit_json",
|
|
157
164
|
"load_lokit_json_bytes",
|
|
165
|
+
"parsers",
|
|
158
166
|
]
|