lokit-python 0.1.3__tar.gz → 0.1.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. lokit_python-0.1.4/MANIFEST.in +1 -0
  2. lokit_python-0.1.4/PKG-INFO +278 -0
  3. lokit_python-0.1.4/README.md +245 -0
  4. lokit_python-0.1.4/pyproject.toml +95 -0
  5. {lokit_python-0.1.3 → lokit_python-0.1.4}/setup.py +0 -9
  6. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/__init__.py +8 -0
  7. lokit_python-0.1.4/src/lokit/compat.py +7 -0
  8. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/data/lang_codes.py +1 -1
  9. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/data/structure.py +1 -1
  10. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/data/tag_types.py +2 -1
  11. lokit_python-0.1.4/src/lokit/exporters/__init__.py +125 -0
  12. lokit_python-0.1.4/src/lokit/exporters/xlsx.py +141 -0
  13. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/format_detection.py +1 -1
  14. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/io/stream_json.py +1 -1
  15. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/logic.py +6 -4
  16. lokit_python-0.1.4/src/lokit/parsers/__init__.py +315 -0
  17. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/models.py +2 -1
  18. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/tags.py +1 -1
  19. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/xlsx/extraction.py +56 -59
  20. lokit_python-0.1.4/src/lokit_python.egg-info/PKG-INFO +278 -0
  21. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit_python.egg-info/SOURCES.txt +5 -0
  22. lokit_python-0.1.4/src/lokit_python.egg-info/requires.txt +4 -0
  23. lokit_python-0.1.4/src/lokit_python.egg-info/top_level.txt +2 -0
  24. lokit_python-0.1.4/stubs/python_calamine.pyi +26 -0
  25. lokit_python-0.1.4/stubs/rustpy_xlsxwriter.pyi +25 -0
  26. lokit_python-0.1.4/tests/test_public_api.py +67 -0
  27. lokit_python-0.1.3/PKG-INFO +0 -149
  28. lokit_python-0.1.3/README.md +0 -139
  29. lokit_python-0.1.3/pyproject.toml +0 -39
  30. lokit_python-0.1.3/src/lokit/exporters/__init__.py +0 -34
  31. lokit_python-0.1.3/src/lokit/exporters/xlsx.py +0 -69
  32. lokit_python-0.1.3/src/lokit/parsers/xlsx/__init__.py +0 -1
  33. lokit_python-0.1.3/src/lokit_python.egg-info/PKG-INFO +0 -149
  34. lokit_python-0.1.3/src/lokit_python.egg-info/requires.txt +0 -3
  35. lokit_python-0.1.3/src/lokit_python.egg-info/top_level.txt +0 -2
  36. {lokit_python-0.1.3 → lokit_python-0.1.4}/setup.cfg +0 -0
  37. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/core/__init__.py +0 -0
  38. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/core/logger.py +0 -0
  39. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/data/__init__.py +0 -0
  40. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/exporters/csv.py +0 -0
  41. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/exporters/html.py +0 -0
  42. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/exporters/idml.py +0 -0
  43. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/exporters/json_i18n.py +0 -0
  44. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/exporters/po.py +0 -0
  45. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/exporters/tmx.py +0 -0
  46. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/exporters/xliff.py +0 -0
  47. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/importers.py +0 -0
  48. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/io/__init__.py +0 -0
  49. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/io/atomic.py +0 -0
  50. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/io/json.py +0 -0
  51. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/async_bridge.py +0 -0
  52. {lokit_python-0.1.3/src/lokit/parsers → lokit_python-0.1.4/src/lokit/parsers/csv}/__init__.py +0 -0
  53. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/csv/extraction.py +0 -0
  54. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/html/__init__.py +0 -0
  55. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/html/extraction.py +0 -0
  56. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/idml/__init__.py +0 -0
  57. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/idml/extraction.py +0 -0
  58. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/json_i18n/__init__.py +0 -0
  59. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/json_i18n/extraction.py +0 -0
  60. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/po/__init__.py +0 -0
  61. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/po/extraction.py +0 -0
  62. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/__init__.py +0 -0
  63. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/base.py +0 -0
  64. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/extraction.py +0 -0
  65. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/header.py +0 -0
  66. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/helpers.py +0 -0
  67. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/parallel.py +0 -0
  68. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/props.py +0 -0
  69. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/tmx/xml_utils.py +0 -0
  70. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/xliff/__init__.py +0 -0
  71. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/xliff/extraction.py +0 -0
  72. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/parsers/xliff/tags.py +0 -0
  73. {lokit_python-0.1.3/src/lokit/parsers/csv → lokit_python-0.1.4/src/lokit/parsers/xlsx}/__init__.py +0 -0
  74. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit/py.typed +0 -0
  75. {lokit_python-0.1.3 → lokit_python-0.1.4}/src/lokit_python.egg-info/dependency_links.txt +0 -0
  76. {lokit_python-0.1.3 → lokit_python-0.1.4}/tests/test_csv.py +0 -0
  77. {lokit_python-0.1.3 → lokit_python-0.1.4}/tests/test_html.py +0 -0
  78. {lokit_python-0.1.3 → lokit_python-0.1.4}/tests/test_idml.py +0 -0
  79. {lokit_python-0.1.3 → lokit_python-0.1.4}/tests/test_json_i18n.py +0 -0
  80. {lokit_python-0.1.3 → lokit_python-0.1.4}/tests/test_performance_safety.py +0 -0
  81. {lokit_python-0.1.3 → lokit_python-0.1.4}/tests/test_po.py +0 -0
  82. {lokit_python-0.1.3 → lokit_python-0.1.4}/tests/test_xlsx.py +0 -0
@@ -0,0 +1 @@
1
+ include stubs/*.pyi
@@ -0,0 +1,278 @@
1
+ Metadata-Version: 2.4
2
+ Name: lokit-python
3
+ Version: 0.1.4
4
+ Summary: A type-safe localization toolkit for parsing, converting, and matching TMX, XLIFF, PO, JSON, HTML, CSV, XLSX, and IDML files.
5
+ Project-URL: Homepage, https://github.com/ciarandarby/lokit
6
+ Project-URL: Repository, https://github.com/ciarandarby/lokit
7
+ Project-URL: Issues, https://github.com/ciarandarby/lokit/issues
8
+ Project-URL: Documentation, https://github.com/ciarandarby/lokit#readme
9
+ Keywords: localization,l10n,i18n,translation,translation-memory,tmx,xliff,gettext,po,idml,xlsx,csv,json,html,type-safe,async,streaming,mypy,mypyc,backend-localization,parsing,parsers,localization-parsers,performative,internationalization,streaming,backend,memory-safe
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Information Technology
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3 :: Only
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Programming Language :: Python :: 3.14
21
+ Classifier: Topic :: Software Development :: Internationalization
22
+ Classifier: Topic :: Software Development :: Localization
23
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
24
+ Classifier: Topic :: Text Processing :: Markup :: XML
25
+ Classifier: Topic :: Text Processing :: Linguistic
26
+ Classifier: Typing :: Typed
27
+ Requires-Python: >=3.10
28
+ Description-Content-Type: text/markdown
29
+ Requires-Dist: lxml>=6.1.1
30
+ Requires-Dist: python-calamine>=0.6.2
31
+ Requires-Dist: polib>=1.2.0
32
+ Requires-Dist: rustpy-xlsxwriter<0.5,>=0.4.4
33
+
34
+ # lokit
35
+
36
+ > [!WARNING]
37
+ > **Beta Release:** lokit is currently in Beta. The API is volatile and subject to rapid, breaking changes prior to the official V1 release.
38
+
39
+ <br>
40
+
41
+ lokit is a high-performance, strictly type-safe, and highly memory-efficient localization toolkit for Python.
42
+
43
+ <br>
44
+
45
+ Supports Python 3.10+.
46
+
47
+ <br>
48
+ <hr>
49
+ <br>
50
+
51
+ Unlike legacy tools that wrap around XML DOM element trees in-memory, lokit represents a shift away from XML-based localization interchange formats towards native language parsing. It ingests localization formats (TMX, XLIFF, PO, XLSX, CSV, JSON, HTML, IDML) and compiles them into a strict, unified structural data model. This enables not just parsing, but robust data manipulation, semantic extraction, and advanced translation memory features out-of-the-box. Lokit focuses on streaming and asynchronous processing rather than synchronous events using in-memory files.
52
+
53
+ <br>
54
+
55
+ This format type can be easily converted to JSON for interchange with other systems. I've made parsing and data transfers as native as possible by capturing all elements of traditional interchange formats in a common format structure. This allows for much better compatibility, especially in terms of segment matching and leveraging as it uses flattened strings as standard. Tags are preserved but as a common format, meaning the structure parsed from XLIFF will be the same as the structure parsed from HTML.
56
+
57
+ <br>
58
+
59
+ These legacy file formats have supported vendor lock-in for many years, making it difficult for any client to move to another system. Seeing that this is a major issue across the domain, something new is needed where vendors do not use hidden, legacy technology to lock in their clients. Localization deserves innovation.
60
+
61
+ <br>
62
+ <hr>
63
+
64
+ > The main premise here is a common, structured and type-safe dataclass model structure that is intentionally compatible with any file format, not just localization interchange formats, although these are optimized for performance and memory efficiency due to the verbose nature of XML based formats.
65
+
66
+ <br>
67
+
68
+ Note: This project was originally written in Rust and is still unreleased. Adding Rust extensions did not show a major performance improvement over the current C-extension modules due to bridging overheads; this will be re-addressed in future releases. SDKs in other languages, including the Rust prototype, are coming soon.
69
+
70
+ <br>
71
+
72
+ ## Core Features
73
+
74
+ <br>
75
+
76
+ lokit provides a comprehensive suite of tools for managing localization data:
77
+
78
+ * **Native Structural Modeling:** Converts interchange formats into strict, unified Python dataclasses, ensuring complete type safety.
79
+ * **Advanced Matching Engine:** Provides Exact Matching, Fuzzy Matching (via SequenceMatcher), and In-Context Exact (ICE) Matching leveraging previous and next segment context, as well as with inline tags.
80
+ * **Sub-segment Extraction:** Automatically parses and isolates inline tags, properties, and formatting markers, allowing for safe manipulation of text without corrupting code.
81
+ * **Semantic Querying:** Easily filter translation units using any attribute, exact ID lookups, or deep nested JSON path querying (`where()`).
82
+ * **Plural Support:** Native extraction and structuring of pluralized translation units, compatible with UI frameworks.
83
+ * **Universal Format Conversion:** Instantly import and export between any supported format (e.g., TMX to JSON, HTML to XLIFF) with zero data loss.
84
+ * **Synchronous and Asynchronous Streaming:** Process massive enterprise files natively using Python async generators to keep memory overhead to an absolute minimum.
85
+
86
+ <br>
87
+
88
+ ### Type Safety and C-Extensions
89
+
90
+ <br>
91
+
92
+ The entire library is strictly typed and mypy compliant, strict enough that it compiles to C extensions via mypyc, which are shipped pre-built in the wheels. Additionally, any XML processing uses C-based packages. Compiling to these extensions has shown a 23% overall performance increase over pure-Python modules, with additional benefits such as lower memory usage. C extensions are standard for macOS (ARM + Intel), Windows, and Linux.
93
+
94
+ <br>
95
+
96
+ ## Parsing Performance
97
+
98
+ <br>
99
+
100
+ When dealing with enterprise-scale localization environments, parsing performance and memory efficiency are paramount. lokit is designed to be significantly leaner and faster than the industry standard.
101
+
102
+ <br>
103
+
104
+ We benchmarked lokit's modules against their equivalents in another package, `translate-toolkit`, which we use as the reference because it is the de facto, feature-rich standard for localization file format parsing and conversion in Python.
105
+
106
+ <br>
107
+
108
+ In a stress-test benchmark on a +600 MB `.TMX` file containing **557,058 segments**, converting to JSON with `Lokit.to_json_async()` over 3 iterations yielded the following comparative averages:
109
+
110
+ <br>
111
+
112
+ | Library | Avg Duration | Peak Memory | Memory Efficiency |
113
+ |---------|------------------|------------------|-------------------|
114
+ | **lokit** | 13.57s | 135.9 MB | 15x Less Memory |
115
+ | **translate-toolkit** | 20.30s | 2,034.5 MB | ~2.0 GB |
116
+
117
+ <br>
118
+
119
+ The tests for both libraries covered conversion from TMX to JSON with inline tag sanitization, each using the respective package's own tooling.
120
+
121
+ <br>
122
+
123
+ The major focus on memory safety allows for parallel processing of events, making it suitable for large-scale localization workflows and backend systems.
124
+
125
+ <br>
126
+
127
+ **Note:** this package is not a replacement or substitute for the already amazing translate-toolkit. The functionality is quite different across the two libraries, and each has its own use cases.
128
+
129
+ <br>
130
+ <hr>
131
+
132
+ ## SDK Usage Reference
133
+
134
+ <br>
135
+
136
+ Lokit operates around a central `BaseStructure` dataclass model, which standardizes localization units and segments. This enables better standardization and branching in a more language-native way compared to XML-based file formats. Parsing SDKs are provided for both extraction and export tasks for localization interchange formats along with common file types.
137
+
138
+ <br>
139
+
140
+ ### Installation
141
+
142
+ <br>
143
+
144
+ Install lokit via pip:
145
+
146
+ ```bash
147
+ pip install lokit-python
148
+ ```
149
+
150
+ <br>
151
+
152
+ ### Basic Parsing and Conversion
153
+
154
+ <br>
155
+
156
+ Converting files synchronously is straightforward through the structured `lokit` API. Import the package once, then use the format paths under `lokit.parsers` and `lokit.exporters`.
157
+
158
+ ```python
159
+ import lokit
160
+
161
+ document = lokit.parsers.read.tmx("path/to/source.tmx")
162
+
163
+ lokit.exporters.write.xliff(document, "path/to/target.xliff")
164
+ ```
165
+
166
+ <br>
167
+
168
+ ### Asynchronous Streaming for Large Interchange Files
169
+
170
+ <br>
171
+
172
+ For files spanning hundreds of megabytes, parsing the entire DOM structure into memory is inefficient. Lokit supports stream-parsing natively.
173
+
174
+ <br>
175
+
176
+ Here's some simple scripting code to show how easy it is. This simple program has no boilerplate and can be reduced to a few lines of code, but for the purpose of showcasing, we added some wrapper functions. The stream APIs take the static attributes, such as language codes, and keep them in an immutable state, then quickly stream the mutable data. All other parsing modules also use streaming to parse to and from the common typed format.
177
+
178
+ ```python
179
+ import asyncio
180
+ import os
181
+
182
+ import lokit
183
+
184
+ input_dir = "data/language_tmx"
185
+ output_dir = "data/out"
186
+
187
+
188
+ async def convert_to_json(filepath: str):
189
+ print(f"Starting: {filepath}")
190
+ output = f"{output_dir}/{os.path.splitext(os.path.basename(filepath))[0]}.json"
191
+ await lokit.parsers.stream.json(
192
+ filepath=filepath,
193
+ output=output,
194
+ )
195
+ print(f"Completed: {output}")
196
+
197
+
198
+ async def process():
199
+ if not os.path.exists(output_dir):
200
+ os.makedirs(output_dir, exist_ok=True)
201
+
202
+ files = [os.path.join(input_dir, i) for i in os.listdir(input_dir)]
203
+ tasks = [convert_to_json(filepath=file) for file in files]
204
+ await asyncio.gather(*tasks)
205
+
206
+
207
+ if __name__ == "__main__":
208
+ asyncio.run(process())
209
+ ```
210
+
211
+ <br>
212
+
213
+ ### Advanced Querying and Matching
214
+
215
+ <br>
216
+
217
+ The `Lokit` logic wrapper provides access to the powerful matching engine and data manipulation features. This does not substitute for enterprise database semantic search but can be used as an after-step for evaluating matching results after retrieving translation units from a semantic/vector database.
218
+
219
+ ```python
220
+ import lokit
221
+
222
+ engine = lokit.Lokit.parse("path/to/source.xliff")
223
+
224
+ button_units = engine.where("extensions.component", "checkout_button")
225
+
226
+ results = engine.fuzzy_find("Complete your purchase", limit=5, threshold=0.75)
227
+ for match in results:
228
+ print(f"Match found: {match.unit_id} (Score: {match.score})")
229
+
230
+ ice_match = engine.match(
231
+ source="Submit",
232
+ target_unit_id="submit_btn_1",
233
+ previous_source="Enter your email",
234
+ require_context=True
235
+ )
236
+ ```
237
+
238
+ ### Structured API Paths
239
+
240
+ The preferred public API is available from a single package import:
241
+
242
+ ```python
243
+ import lokit
244
+
245
+ document = lokit.parsers.read.file("path/to/source.tmx")
246
+ document = lokit.parsers.read.csv("path/to/source.csv", source_locale="en-US")
247
+ streamed_tmx = lokit.parsers.stream.tmx("path/to/source.tmx")
248
+
249
+
250
+ async def stream_to_json() -> None:
251
+ await lokit.parsers.stream.json("path/to/source.tmx", "path/to/out")
252
+
253
+ lokit.exporters.write.csv(document, "path/to/target.csv")
254
+
255
+
256
+ async def export_xlsx() -> None:
257
+ await lokit.exporters.async_.xlsx(document, "path/to/target.xlsx")
258
+
259
+ CsvExtractor = lokit.parsers.extractors.csv
260
+ ```
261
+
262
+ Existing direct imports from `lokit.importers`, `lokit.exporters`, and format modules remain supported for compatibility.
263
+
264
+ <br>
265
+ <hr>
266
+
267
+ ## Supported Formats
268
+
269
+ <br>
270
+
271
+ * TMX
272
+ * XLIFF
273
+ * PO/POT
274
+ * XLSX
275
+ * CSV
276
+ * JSON
277
+ * HTML
278
+ * IDML
@@ -0,0 +1,245 @@
1
+ # lokit
2
+
3
+ > [!WARNING]
4
+ > **Beta Release:** lokit is currently in Beta. The API is volatile and subject to rapid, breaking changes prior to the official V1 release.
5
+
6
+ <br>
7
+
8
+ lokit is a high-performance, strictly type-safe, and highly memory-efficient localization toolkit for Python.
9
+
10
+ <br>
11
+
12
+ Supports Python 3.10+.
13
+
14
+ <br>
15
+ <hr>
16
+ <br>
17
+
18
+ Unlike legacy tools that wrap around XML DOM element trees in-memory, lokit represents a shift away from XML-based localization interchange formats towards native language parsing. It ingests localization formats (TMX, XLIFF, PO, XLSX, CSV, JSON, HTML, IDML) and compiles them into a strict, unified structural data model. This enables not just parsing, but robust data manipulation, semantic extraction, and advanced translation memory features out-of-the-box. Lokit focuses on streaming and asynchronous processing rather than synchronous events using in-memory files.
19
+
20
+ <br>
21
+
22
+ This format type can be easily converted to JSON for interchange with other systems. I've made parsing and data transfers as native as possible by capturing all elements of traditional interchange formats in a common format structure. This allows for much better compatibility, especially in terms of segment matching and leveraging as it uses flattened strings as standard. Tags are preserved but as a common format, meaning the structure parsed from XLIFF will be the same as the structure parsed from HTML.
23
+
24
+ <br>
25
+
26
+ These legacy file formats have supported vendor lock-in for many years, making it difficult for any client to move to another system. Seeing that this is a major issue across the domain, something new is needed where vendors do not use hidden, legacy technology to lock in their clients. Localization deserves innovation.
27
+
28
+ <br>
29
+ <hr>
30
+
31
+ > The main premise here is a common, structured and type-safe dataclass model structure that is intentionally compatible with any file format, not just localization interchange formats, although these are optimized for performance and memory efficiency due to the verbose nature of XML based formats.
32
+
33
+ <br>
34
+
35
+ Note: This project was originally written in Rust and is still unreleased. Adding Rust extensions did not show a major performance improvement over the current C-extension modules due to bridging overheads; this will be re-addressed in future releases. SDKs in other languages, including the Rust prototype, are coming soon.
36
+
37
+ <br>
38
+
39
+ ## Core Features
40
+
41
+ <br>
42
+
43
+ lokit provides a comprehensive suite of tools for managing localization data:
44
+
45
+ * **Native Structural Modeling:** Converts interchange formats into strict, unified Python dataclasses, ensuring complete type safety.
46
+ * **Advanced Matching Engine:** Provides Exact Matching, Fuzzy Matching (via SequenceMatcher), and In-Context Exact (ICE) Matching leveraging previous and next segment context, as well as with inline tags.
47
+ * **Sub-segment Extraction:** Automatically parses and isolates inline tags, properties, and formatting markers, allowing for safe manipulation of text without corrupting code.
48
+ * **Semantic Querying:** Easily filter translation units using any attribute, exact ID lookups, or deep nested JSON path querying (`where()`).
49
+ * **Plural Support:** Native extraction and structuring of pluralized translation units, compatible with UI frameworks.
50
+ * **Universal Format Conversion:** Instantly import and export between any supported format (e.g., TMX to JSON, HTML to XLIFF) with zero data loss.
51
+ * **Synchronous and Asynchronous Streaming:** Process massive enterprise files natively using Python async generators to keep memory overhead to an absolute minimum.
52
+
53
+ <br>
54
+
55
+ ### Type Safety and C-Extensions
56
+
57
+ <br>
58
+
59
+ The entire library is strictly typed and mypy compliant, strict enough that it compiles to C extensions via mypyc, which are shipped pre-built in the wheels. Additionally, any XML processing uses C-based packages. Compiling to these extensions has shown a 23% overall performance increase over pure-Python modules, with additional benefits such as lower memory usage. C extensions are standard for macOS (ARM + Intel), Windows, and Linux.
60
+
61
+ <br>
62
+
63
+ ## Parsing Performance
64
+
65
+ <br>
66
+
67
+ When dealing with enterprise-scale localization environments, parsing performance and memory efficiency are paramount. lokit is designed to be significantly leaner and faster than the industry standard.
68
+
69
+ <br>
70
+
71
+ We benchmarked lokit's modules against their equivalents in another package, `translate-toolkit`, which we use as the reference because it is the de facto, feature-rich standard for localization file format parsing and conversion in Python.
72
+
73
+ <br>
74
+
75
+ In a stress-test benchmark on a +600 MB `.TMX` file containing **557,058 segments**, converting to JSON with `Lokit.to_json_async()` over 3 iterations yielded the following comparative averages:
76
+
77
+ <br>
78
+
79
+ | Library | Avg Duration | Peak Memory | Memory Efficiency |
80
+ |---------|------------------|------------------|-------------------|
81
+ | **lokit** | 13.57s | 135.9 MB | 15x Less Memory |
82
+ | **translate-toolkit** | 20.30s | 2,034.5 MB | ~2.0 GB |
83
+
84
+ <br>
85
+
86
+ The tests for both libraries covered conversion from TMX to JSON with inline tag sanitization, each using the respective package's own tooling.
87
+
88
+ <br>
89
+
90
+ The major focus on memory safety allows for parallel processing of events, making it suitable for large-scale localization workflows and backend systems.
91
+
92
+ <br>
93
+
94
+ **Note:** this package is not a replacement or substitute for the already amazing translate-toolkit. The functionality is quite different across the two libraries, and each has its own use cases.
95
+
96
+ <br>
97
+ <hr>
98
+
99
+ ## SDK Usage Reference
100
+
101
+ <br>
102
+
103
+ Lokit operates around a central `BaseStructure` dataclass model, which standardizes localization units and segments. This enables better standardization and branching in a more language-native way compared to XML-based file formats. Parsing SDKs are provided for both extraction and export tasks for localization interchange formats along with common file types.
104
+
105
+ <br>
106
+
107
+ ### Installation
108
+
109
+ <br>
110
+
111
+ Install lokit via pip:
112
+
113
+ ```bash
114
+ pip install lokit-python
115
+ ```
116
+
117
+ <br>
118
+
119
+ ### Basic Parsing and Conversion
120
+
121
+ <br>
122
+
123
+ Converting files synchronously is straightforward through the structured `lokit` API. Import the package once, then use the format paths under `lokit.parsers` and `lokit.exporters`.
124
+
125
+ ```python
126
+ import lokit
127
+
128
+ document = lokit.parsers.read.tmx("path/to/source.tmx")
129
+
130
+ lokit.exporters.write.xliff(document, "path/to/target.xliff")
131
+ ```
132
+
133
+ <br>
134
+
135
+ ### Asynchronous Streaming for Large Interchange Files
136
+
137
+ <br>
138
+
139
+ For files spanning hundreds of megabytes, parsing the entire DOM structure into memory is inefficient. Lokit supports stream-parsing natively.
140
+
141
+ <br>
142
+
143
+ Here's some simple scripting code to show how easy it is. This simple program has no boilerplate and can be reduced to a few lines of code, but for the purpose of showcasing, we added some wrapper functions. The stream APIs take the static attributes, such as language codes, and keep them in an immutable state, then quickly stream the mutable data. All other parsing modules also use streaming to parse to and from the common typed format.
144
+
145
+ ```python
146
+ import asyncio
147
+ import os
148
+
149
+ import lokit
150
+
151
+ input_dir = "data/language_tmx"
152
+ output_dir = "data/out"
153
+
154
+
155
+ async def convert_to_json(filepath: str):
156
+ print(f"Starting: {filepath}")
157
+ output = f"{output_dir}/{os.path.splitext(os.path.basename(filepath))[0]}.json"
158
+ await lokit.parsers.stream.json(
159
+ filepath=filepath,
160
+ output=output,
161
+ )
162
+ print(f"Completed: {output}")
163
+
164
+
165
+ async def process():
166
+ if not os.path.exists(output_dir):
167
+ os.makedirs(output_dir, exist_ok=True)
168
+
169
+ files = [os.path.join(input_dir, i) for i in os.listdir(input_dir)]
170
+ tasks = [convert_to_json(filepath=file) for file in files]
171
+ await asyncio.gather(*tasks)
172
+
173
+
174
+ if __name__ == "__main__":
175
+ asyncio.run(process())
176
+ ```
177
+
178
+ <br>
179
+
180
+ ### Advanced Querying and Matching
181
+
182
+ <br>
183
+
184
+ The `Lokit` logic wrapper provides access to the powerful matching engine and data manipulation features. This does not substitute for enterprise database semantic search but can be used as an after-step for evaluating matching results after retrieving translation units from a semantic/vector database.
185
+
186
+ ```python
187
+ import lokit
188
+
189
+ engine = lokit.Lokit.parse("path/to/source.xliff")
190
+
191
+ button_units = engine.where("extensions.component", "checkout_button")
192
+
193
+ results = engine.fuzzy_find("Complete your purchase", limit=5, threshold=0.75)
194
+ for match in results:
195
+ print(f"Match found: {match.unit_id} (Score: {match.score})")
196
+
197
+ ice_match = engine.match(
198
+ source="Submit",
199
+ target_unit_id="submit_btn_1",
200
+ previous_source="Enter your email",
201
+ require_context=True
202
+ )
203
+ ```
204
+
205
+ ### Structured API Paths
206
+
207
+ The preferred public API is available from a single package import:
208
+
209
+ ```python
210
+ import lokit
211
+
212
+ document = lokit.parsers.read.file("path/to/source.tmx")
213
+ document = lokit.parsers.read.csv("path/to/source.csv", source_locale="en-US")
214
+ streamed_tmx = lokit.parsers.stream.tmx("path/to/source.tmx")
215
+
216
+
217
+ async def stream_to_json() -> None:
218
+ await lokit.parsers.stream.json("path/to/source.tmx", "path/to/out")
219
+
220
+ lokit.exporters.write.csv(document, "path/to/target.csv")
221
+
222
+
223
+ async def export_xlsx() -> None:
224
+ await lokit.exporters.async_.xlsx(document, "path/to/target.xlsx")
225
+
226
+ CsvExtractor = lokit.parsers.extractors.csv
227
+ ```
228
+
229
+ Existing direct imports from `lokit.importers`, `lokit.exporters`, and format modules remain supported for compatibility.
230
+
231
+ <br>
232
+ <hr>
233
+
234
+ ## Supported Formats
235
+
236
+ <br>
237
+
238
+ * TMX
239
+ * XLIFF
240
+ * PO/POT
241
+ * XLSX
242
+ * CSV
243
+ * JSON
244
+ * HTML
245
+ * IDML
@@ -0,0 +1,95 @@
1
+ [project]
2
+ name = "lokit-python"
3
+ version = "0.1.4"
4
+ description = "A type-safe localization toolkit for parsing, converting, and matching TMX, XLIFF, PO, JSON, HTML, CSV, XLSX, and IDML files."
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ keywords = [
8
+ "localization",
9
+ "l10n",
10
+ "i18n",
11
+ "translation",
12
+ "translation-memory",
13
+ "tmx",
14
+ "xliff",
15
+ "gettext",
16
+ "po",
17
+ "idml",
18
+ "xlsx",
19
+ "csv",
20
+ "json",
21
+ "html",
22
+ "type-safe",
23
+ "async",
24
+ "streaming",
25
+ "mypy",
26
+ "mypyc",
27
+ "backend-localization",
28
+ "parsing",
29
+ "parsers",
30
+ "localization-parsers",
31
+ "performative",
32
+ "internationalization",
33
+ "streaming",
34
+ "backend",
35
+ "memory-safe",
36
+
37
+ ]
38
+ classifiers = [
39
+ "Development Status :: 4 - Beta",
40
+ "Intended Audience :: Developers",
41
+ "Intended Audience :: Information Technology",
42
+ "Operating System :: OS Independent",
43
+ "Programming Language :: Python :: 3",
44
+ "Programming Language :: Python :: 3 :: Only",
45
+ "Programming Language :: Python :: 3.10",
46
+ "Programming Language :: Python :: 3.11",
47
+ "Programming Language :: Python :: 3.12",
48
+ "Programming Language :: Python :: 3.13",
49
+ "Programming Language :: Python :: 3.14",
50
+ "Topic :: Software Development :: Internationalization",
51
+ "Topic :: Software Development :: Localization",
52
+ "Topic :: Software Development :: Libraries :: Python Modules",
53
+ "Topic :: Text Processing :: Markup :: XML",
54
+ "Topic :: Text Processing :: Linguistic",
55
+ "Typing :: Typed",
56
+ ]
57
+ dependencies = [
58
+ "lxml>=6.1.1",
59
+ "python-calamine>=0.6.2",
60
+ "polib>=1.2.0",
61
+ "rustpy-xlsxwriter>=0.4.4,<0.5",
62
+ ]
63
+
64
+ [project.urls]
65
+ Homepage = "https://github.com/ciarandarby/lokit"
66
+ Repository = "https://github.com/ciarandarby/lokit"
67
+ Issues = "https://github.com/ciarandarby/lokit/issues"
68
+ Documentation = "https://github.com/ciarandarby/lokit#readme"
69
+
70
+ [build-system]
71
+ requires = ["setuptools>=82.0.1", "mypy>=1.10.0", "lxml-stubs>=0.5.1", "types-polib"]
72
+ build-backend = "setuptools.build_meta"
73
+
74
+ [dependency-groups]
75
+ dev = [
76
+ "lxml-stubs>=0.5.1",
77
+ "types-polib",
78
+ "mypy>=2.1.0",
79
+ "pytest>=8.0",
80
+ "pytest-asyncio>=0.24",
81
+ "ruff>=0.15.15",
82
+ "setuptools>=82.0.1",
83
+ ]
84
+
85
+ [tool.setuptools.packages.find]
86
+ where = ["src"]
87
+
88
+ [tool.setuptools.package-data]
89
+ lokit = ["py.typed"]
90
+
91
+ [tool.mypy]
92
+ mypy_path = ["src", "stubs"]
93
+ strict = true
94
+ warn_unreachable = true
95
+ show_error_codes = true
@@ -6,16 +6,7 @@ from setuptools.command.build_ext import build_ext
6
6
 
7
7
 
8
8
  def _build_path_replacements(src_files):
9
- """Build a mapping of Windows-form paths to POSIX-form paths.
10
9
 
11
- For each source file, compute its Windows backslash representation
12
- and its POSIX forward-slash representation. Only include entries
13
- where the two differ (i.e. the path contains directory separators).
14
-
15
- This is used to fix mypyc-generated C files on Windows, where
16
- embedded Python source paths use backslashes that MSVC interprets
17
- as C escape sequences (e.g. \\x in \\xliff causes error C2153).
18
- """
19
10
  replacements = {}
20
11
  for src_file in src_files:
21
12
  posix_form = PurePosixPath(src_file).as_posix()
@@ -77,6 +77,10 @@ from lokit.parsers.tmx.extraction import TmxExtractor
77
77
  from lokit.parsers.tmx.models import TmxParseMode
78
78
  from lokit.parsers.tmx.parallel import TmxParallelOptions
79
79
  from lokit.parsers.xliff.extraction import XliffExtractor
80
+ from lokit import data as data
81
+ from lokit import exporters as exporters
82
+ from lokit import io as io
83
+ from lokit import parsers as parsers
80
84
 
81
85
  __all__ = [
82
86
  "AdjacentContext",
@@ -109,6 +113,8 @@ __all__ = [
109
113
  "PoExtractor",
110
114
  "JsonI18nExtractor",
111
115
  "IdmlExtractor",
116
+ "data",
117
+ "exporters",
112
118
  "export_csv",
113
119
  "export_csv_async",
114
120
  "export_idml",
@@ -153,6 +159,8 @@ __all__ = [
153
159
  "import_xliff_async",
154
160
  "import_xlsx",
155
161
  "import_xlsx_async",
162
+ "io",
156
163
  "load_lokit_json",
157
164
  "load_lokit_json_bytes",
165
+ "parsers",
158
166
  ]
@@ -0,0 +1,7 @@
1
+ from enum import Enum
2
+ from typing import cast
3
+
4
+
5
+ class StrEnum(str, Enum):
6
+ def __str__(self) -> str:
7
+ return cast("str", self.value)
@@ -1,4 +1,4 @@
1
- from enum import StrEnum
1
+ from lokit.compat import StrEnum
2
2
 
3
3
 
4
4
  class Language(StrEnum):
@@ -1,8 +1,8 @@
1
1
  from dataclasses import dataclass, field
2
- from enum import StrEnum
3
2
  from collections.abc import Iterable
4
3
  from typing import Optional
5
4
 
5
+ from lokit.compat import StrEnum
6
6
  from lokit.data.tag_types import TieData
7
7
 
8
8