exonware-xwsystem 0.0.1.410__py3-none-any.whl → 0.1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (259) hide show
  1. exonware/__init__.py +1 -1
  2. exonware/conf.py +1 -1
  3. exonware/xwsystem/__init__.py +2 -2
  4. exonware/xwsystem/caching/__init__.py +1 -1
  5. exonware/xwsystem/caching/base.py +2 -2
  6. exonware/xwsystem/caching/bloom_cache.py +2 -2
  7. exonware/xwsystem/caching/cache_manager.py +1 -1
  8. exonware/xwsystem/caching/conditional.py +2 -2
  9. exonware/xwsystem/caching/contracts.py +1 -1
  10. exonware/xwsystem/caching/decorators.py +2 -2
  11. exonware/xwsystem/caching/defs.py +1 -1
  12. exonware/xwsystem/caching/disk_cache.py +1 -1
  13. exonware/xwsystem/caching/distributed.py +1 -1
  14. exonware/xwsystem/caching/errors.py +1 -1
  15. exonware/xwsystem/caching/events.py +2 -2
  16. exonware/xwsystem/caching/eviction_strategies.py +1 -1
  17. exonware/xwsystem/caching/fluent.py +1 -1
  18. exonware/xwsystem/caching/integrity.py +1 -1
  19. exonware/xwsystem/caching/lfu_cache.py +2 -2
  20. exonware/xwsystem/caching/lfu_optimized.py +3 -3
  21. exonware/xwsystem/caching/lru_cache.py +2 -2
  22. exonware/xwsystem/caching/memory_bounded.py +2 -2
  23. exonware/xwsystem/caching/metrics_exporter.py +2 -2
  24. exonware/xwsystem/caching/observable_cache.py +1 -1
  25. exonware/xwsystem/caching/pluggable_cache.py +2 -2
  26. exonware/xwsystem/caching/rate_limiter.py +1 -1
  27. exonware/xwsystem/caching/read_through.py +2 -2
  28. exonware/xwsystem/caching/secure_cache.py +1 -1
  29. exonware/xwsystem/caching/serializable.py +2 -2
  30. exonware/xwsystem/caching/stats.py +1 -1
  31. exonware/xwsystem/caching/tagging.py +2 -2
  32. exonware/xwsystem/caching/ttl_cache.py +1 -1
  33. exonware/xwsystem/caching/two_tier_cache.py +1 -1
  34. exonware/xwsystem/caching/utils.py +1 -1
  35. exonware/xwsystem/caching/validation.py +1 -1
  36. exonware/xwsystem/caching/warming.py +2 -2
  37. exonware/xwsystem/caching/write_behind.py +2 -2
  38. exonware/xwsystem/cli/__init__.py +1 -1
  39. exonware/xwsystem/cli/args.py +1 -1
  40. exonware/xwsystem/cli/base.py +1 -1
  41. exonware/xwsystem/cli/colors.py +1 -1
  42. exonware/xwsystem/cli/console.py +1 -1
  43. exonware/xwsystem/cli/contracts.py +1 -1
  44. exonware/xwsystem/cli/defs.py +1 -1
  45. exonware/xwsystem/cli/errors.py +1 -1
  46. exonware/xwsystem/cli/progress.py +1 -1
  47. exonware/xwsystem/cli/prompts.py +1 -1
  48. exonware/xwsystem/cli/tables.py +1 -1
  49. exonware/xwsystem/config/__init__.py +1 -1
  50. exonware/xwsystem/config/base.py +2 -2
  51. exonware/xwsystem/config/contracts.py +1 -1
  52. exonware/xwsystem/config/defaults.py +1 -1
  53. exonware/xwsystem/config/defs.py +1 -1
  54. exonware/xwsystem/config/errors.py +2 -2
  55. exonware/xwsystem/config/logging.py +1 -1
  56. exonware/xwsystem/config/logging_setup.py +2 -2
  57. exonware/xwsystem/config/performance.py +115 -388
  58. exonware/xwsystem/http_client/__init__.py +1 -1
  59. exonware/xwsystem/http_client/advanced_client.py +2 -2
  60. exonware/xwsystem/http_client/base.py +2 -2
  61. exonware/xwsystem/http_client/client.py +2 -2
  62. exonware/xwsystem/http_client/contracts.py +1 -1
  63. exonware/xwsystem/http_client/defs.py +1 -1
  64. exonware/xwsystem/http_client/errors.py +2 -2
  65. exonware/xwsystem/io/__init__.py +1 -1
  66. exonware/xwsystem/io/archive/__init__.py +1 -1
  67. exonware/xwsystem/io/archive/archive.py +1 -1
  68. exonware/xwsystem/io/archive/archive_files.py +1 -1
  69. exonware/xwsystem/io/archive/archivers.py +2 -2
  70. exonware/xwsystem/io/archive/base.py +6 -6
  71. exonware/xwsystem/io/archive/codec_integration.py +1 -1
  72. exonware/xwsystem/io/archive/compression.py +1 -1
  73. exonware/xwsystem/io/archive/formats/__init__.py +1 -1
  74. exonware/xwsystem/io/archive/formats/brotli_format.py +6 -3
  75. exonware/xwsystem/io/archive/formats/lz4_format.py +6 -3
  76. exonware/xwsystem/io/archive/formats/rar.py +6 -3
  77. exonware/xwsystem/io/archive/formats/sevenzip.py +6 -3
  78. exonware/xwsystem/io/archive/formats/squashfs_format.py +1 -1
  79. exonware/xwsystem/io/archive/formats/tar.py +1 -1
  80. exonware/xwsystem/io/archive/formats/wim_format.py +6 -3
  81. exonware/xwsystem/io/archive/formats/zip.py +1 -1
  82. exonware/xwsystem/io/archive/formats/zpaq_format.py +1 -1
  83. exonware/xwsystem/io/archive/formats/zstandard.py +6 -3
  84. exonware/xwsystem/io/base.py +1 -1
  85. exonware/xwsystem/io/codec/__init__.py +1 -1
  86. exonware/xwsystem/io/codec/base.py +6 -6
  87. exonware/xwsystem/io/codec/contracts.py +1 -1
  88. exonware/xwsystem/io/codec/registry.py +5 -5
  89. exonware/xwsystem/io/common/__init__.py +1 -1
  90. exonware/xwsystem/io/common/base.py +1 -1
  91. exonware/xwsystem/io/common/lock.py +1 -1
  92. exonware/xwsystem/io/common/watcher.py +1 -1
  93. exonware/xwsystem/io/contracts.py +1 -1
  94. exonware/xwsystem/io/data_operations.py +746 -0
  95. exonware/xwsystem/io/defs.py +1 -1
  96. exonware/xwsystem/io/errors.py +1 -1
  97. exonware/xwsystem/io/facade.py +2 -2
  98. exonware/xwsystem/io/file/__init__.py +1 -1
  99. exonware/xwsystem/io/file/base.py +1 -1
  100. exonware/xwsystem/io/file/conversion.py +1 -1
  101. exonware/xwsystem/io/file/file.py +8 -6
  102. exonware/xwsystem/io/file/paged_source.py +8 -1
  103. exonware/xwsystem/io/file/paging/__init__.py +1 -1
  104. exonware/xwsystem/io/file/paging/byte_paging.py +1 -1
  105. exonware/xwsystem/io/file/paging/line_paging.py +1 -1
  106. exonware/xwsystem/io/file/paging/record_paging.py +1 -1
  107. exonware/xwsystem/io/file/paging/registry.py +4 -4
  108. exonware/xwsystem/io/file/source.py +20 -9
  109. exonware/xwsystem/io/filesystem/__init__.py +1 -1
  110. exonware/xwsystem/io/filesystem/base.py +1 -1
  111. exonware/xwsystem/io/filesystem/local.py +9 -1
  112. exonware/xwsystem/io/folder/__init__.py +1 -1
  113. exonware/xwsystem/io/folder/base.py +1 -1
  114. exonware/xwsystem/io/folder/folder.py +2 -2
  115. exonware/xwsystem/io/serialization/__init__.py +1 -1
  116. exonware/xwsystem/io/serialization/auto_serializer.py +52 -39
  117. exonware/xwsystem/io/serialization/base.py +165 -1
  118. exonware/xwsystem/io/serialization/contracts.py +88 -1
  119. exonware/xwsystem/io/serialization/defs.py +1 -1
  120. exonware/xwsystem/io/serialization/errors.py +1 -1
  121. exonware/xwsystem/io/serialization/flyweight.py +10 -10
  122. exonware/xwsystem/io/serialization/format_detector.py +8 -5
  123. exonware/xwsystem/io/serialization/formats/__init__.py +1 -1
  124. exonware/xwsystem/io/serialization/formats/binary/bson.py +1 -1
  125. exonware/xwsystem/io/serialization/formats/binary/cbor.py +1 -1
  126. exonware/xwsystem/io/serialization/formats/binary/marshal.py +1 -1
  127. exonware/xwsystem/io/serialization/formats/binary/msgpack.py +1 -1
  128. exonware/xwsystem/io/serialization/formats/binary/pickle.py +1 -1
  129. exonware/xwsystem/io/serialization/formats/binary/plistlib.py +1 -1
  130. exonware/xwsystem/io/serialization/formats/database/dbm.py +53 -1
  131. exonware/xwsystem/io/serialization/formats/database/shelve.py +48 -1
  132. exonware/xwsystem/io/serialization/formats/database/sqlite3.py +85 -1
  133. exonware/xwsystem/io/serialization/formats/text/append_only_log.py +201 -0
  134. exonware/xwsystem/io/serialization/formats/text/configparser.py +1 -1
  135. exonware/xwsystem/io/serialization/formats/text/csv.py +1 -1
  136. exonware/xwsystem/io/serialization/formats/text/formdata.py +1 -1
  137. exonware/xwsystem/io/serialization/formats/text/json.py +43 -20
  138. exonware/xwsystem/io/serialization/formats/text/json5.py +7 -5
  139. exonware/xwsystem/io/serialization/formats/text/jsonlines.py +316 -22
  140. exonware/xwsystem/io/serialization/formats/text/multipart.py +1 -1
  141. exonware/xwsystem/io/serialization/formats/text/toml.py +19 -3
  142. exonware/xwsystem/io/serialization/formats/text/xml.py +8 -1
  143. exonware/xwsystem/io/serialization/formats/text/yaml.py +52 -2
  144. exonware/xwsystem/io/serialization/parsers/__init__.py +15 -0
  145. exonware/xwsystem/io/serialization/parsers/base.py +59 -0
  146. exonware/xwsystem/io/serialization/parsers/hybrid_parser.py +61 -0
  147. exonware/xwsystem/io/serialization/parsers/msgspec_parser.py +45 -0
  148. exonware/xwsystem/io/serialization/parsers/orjson_direct_parser.py +53 -0
  149. exonware/xwsystem/io/serialization/parsers/orjson_parser.py +59 -0
  150. exonware/xwsystem/io/serialization/parsers/pysimdjson_parser.py +51 -0
  151. exonware/xwsystem/io/serialization/parsers/rapidjson_parser.py +50 -0
  152. exonware/xwsystem/io/serialization/parsers/registry.py +90 -0
  153. exonware/xwsystem/io/serialization/parsers/standard.py +43 -0
  154. exonware/xwsystem/io/serialization/parsers/ujson_parser.py +50 -0
  155. exonware/xwsystem/io/serialization/registry.py +1 -1
  156. exonware/xwsystem/io/serialization/serializer.py +175 -3
  157. exonware/xwsystem/io/serialization/utils/__init__.py +1 -1
  158. exonware/xwsystem/io/serialization/utils/path_ops.py +1 -1
  159. exonware/xwsystem/io/stream/__init__.py +1 -1
  160. exonware/xwsystem/io/stream/async_operations.py +1 -1
  161. exonware/xwsystem/io/stream/base.py +1 -1
  162. exonware/xwsystem/io/stream/codec_io.py +1 -1
  163. exonware/xwsystem/ipc/async_fabric.py +1 -2
  164. exonware/xwsystem/ipc/base.py +2 -2
  165. exonware/xwsystem/ipc/contracts.py +2 -2
  166. exonware/xwsystem/ipc/defs.py +1 -1
  167. exonware/xwsystem/ipc/errors.py +2 -2
  168. exonware/xwsystem/ipc/pipes.py +2 -2
  169. exonware/xwsystem/ipc/shared_memory.py +2 -2
  170. exonware/xwsystem/monitoring/base.py +2 -2
  171. exonware/xwsystem/monitoring/contracts.py +1 -1
  172. exonware/xwsystem/monitoring/defs.py +1 -1
  173. exonware/xwsystem/monitoring/error_recovery.py +2 -2
  174. exonware/xwsystem/monitoring/errors.py +2 -2
  175. exonware/xwsystem/monitoring/memory_monitor.py +1 -1
  176. exonware/xwsystem/monitoring/performance_manager_generic.py +2 -2
  177. exonware/xwsystem/monitoring/performance_validator.py +1 -1
  178. exonware/xwsystem/monitoring/system_monitor.py +2 -2
  179. exonware/xwsystem/monitoring/tracing.py +2 -2
  180. exonware/xwsystem/monitoring/tracker.py +1 -1
  181. exonware/xwsystem/operations/__init__.py +1 -1
  182. exonware/xwsystem/operations/base.py +1 -1
  183. exonware/xwsystem/operations/defs.py +1 -1
  184. exonware/xwsystem/operations/diff.py +1 -1
  185. exonware/xwsystem/operations/merge.py +1 -1
  186. exonware/xwsystem/operations/patch.py +1 -1
  187. exonware/xwsystem/patterns/base.py +2 -2
  188. exonware/xwsystem/patterns/context_manager.py +2 -2
  189. exonware/xwsystem/patterns/contracts.py +9 -9
  190. exonware/xwsystem/patterns/defs.py +1 -1
  191. exonware/xwsystem/patterns/dynamic_facade.py +8 -8
  192. exonware/xwsystem/patterns/errors.py +5 -5
  193. exonware/xwsystem/patterns/handler_factory.py +6 -6
  194. exonware/xwsystem/patterns/object_pool.py +7 -7
  195. exonware/xwsystem/patterns/registry.py +3 -3
  196. exonware/xwsystem/plugins/__init__.py +1 -1
  197. exonware/xwsystem/plugins/base.py +5 -5
  198. exonware/xwsystem/plugins/contracts.py +5 -5
  199. exonware/xwsystem/plugins/defs.py +1 -1
  200. exonware/xwsystem/plugins/errors.py +4 -4
  201. exonware/xwsystem/runtime/__init__.py +1 -1
  202. exonware/xwsystem/runtime/base.py +6 -6
  203. exonware/xwsystem/runtime/contracts.py +6 -6
  204. exonware/xwsystem/runtime/defs.py +1 -1
  205. exonware/xwsystem/runtime/env.py +2 -2
  206. exonware/xwsystem/runtime/errors.py +1 -1
  207. exonware/xwsystem/runtime/reflection.py +8 -8
  208. exonware/xwsystem/security/auth.py +1 -1
  209. exonware/xwsystem/security/base.py +2 -2
  210. exonware/xwsystem/security/contracts.py +1 -1
  211. exonware/xwsystem/security/crypto.py +2 -2
  212. exonware/xwsystem/security/defs.py +1 -1
  213. exonware/xwsystem/security/errors.py +2 -2
  214. exonware/xwsystem/security/hazmat.py +2 -2
  215. exonware/xwsystem/shared/__init__.py +1 -1
  216. exonware/xwsystem/shared/base.py +1 -1
  217. exonware/xwsystem/shared/contracts.py +1 -1
  218. exonware/xwsystem/shared/defs.py +1 -1
  219. exonware/xwsystem/shared/errors.py +1 -1
  220. exonware/xwsystem/structures/__init__.py +1 -1
  221. exonware/xwsystem/structures/base.py +2 -2
  222. exonware/xwsystem/structures/contracts.py +1 -1
  223. exonware/xwsystem/structures/defs.py +1 -1
  224. exonware/xwsystem/structures/errors.py +2 -2
  225. exonware/xwsystem/threading/async_primitives.py +2 -2
  226. exonware/xwsystem/threading/base.py +2 -2
  227. exonware/xwsystem/threading/contracts.py +1 -1
  228. exonware/xwsystem/threading/defs.py +1 -1
  229. exonware/xwsystem/threading/errors.py +2 -2
  230. exonware/xwsystem/threading/safe_factory.py +6 -6
  231. exonware/xwsystem/utils/base.py +2 -2
  232. exonware/xwsystem/utils/contracts.py +1 -1
  233. exonware/xwsystem/utils/dt/__init__.py +1 -1
  234. exonware/xwsystem/utils/dt/base.py +2 -2
  235. exonware/xwsystem/utils/dt/contracts.py +1 -1
  236. exonware/xwsystem/utils/dt/defs.py +1 -1
  237. exonware/xwsystem/utils/dt/errors.py +2 -2
  238. exonware/xwsystem/utils/dt/formatting.py +1 -1
  239. exonware/xwsystem/utils/dt/humanize.py +2 -2
  240. exonware/xwsystem/utils/dt/parsing.py +1 -1
  241. exonware/xwsystem/utils/dt/timezone_utils.py +1 -1
  242. exonware/xwsystem/utils/errors.py +2 -2
  243. exonware/xwsystem/utils/utils_contracts.py +1 -1
  244. exonware/xwsystem/validation/__init__.py +1 -1
  245. exonware/xwsystem/validation/base.py +15 -15
  246. exonware/xwsystem/validation/contracts.py +1 -1
  247. exonware/xwsystem/validation/data_validator.py +10 -0
  248. exonware/xwsystem/validation/declarative.py +9 -9
  249. exonware/xwsystem/validation/defs.py +1 -1
  250. exonware/xwsystem/validation/errors.py +2 -2
  251. exonware/xwsystem/validation/fluent_validator.py +4 -4
  252. exonware/xwsystem/version.py +4 -4
  253. {exonware_xwsystem-0.0.1.410.dist-info → exonware_xwsystem-0.1.0.1.dist-info}/METADATA +3 -3
  254. exonware_xwsystem-0.1.0.1.dist-info/RECORD +284 -0
  255. exonware/xwsystem/caching/USAGE_GUIDE.md +0 -779
  256. exonware/xwsystem/utils/test_runner.py +0 -526
  257. exonware_xwsystem-0.0.1.410.dist-info/RECORD +0 -273
  258. {exonware_xwsystem-0.0.1.410.dist-info → exonware_xwsystem-0.1.0.1.dist-info}/WHEEL +0 -0
  259. {exonware_xwsystem-0.0.1.410.dist-info → exonware_xwsystem-0.1.0.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,746 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ #exonware/xwsystem/src/exonware/xwsystem/io/data_operations.py
4
+
5
+ Generic data-operations layer for large, file-backed datasets.
6
+
7
+ This module provides:
8
+ - A small indexing model for line-oriented files (e.g. NDJSON / JSONL)
9
+ - Streaming read / update helpers with atomic guarantees
10
+ - Paging helpers built on top of line offsets
11
+
12
+ The goal is to expose these capabilities in a format-agnostic way so that
13
+ higher-level libraries (xwdata, xwnode, xwentity, etc.) can build powerful
14
+ lazy, paged, and atomic access features without re-implementing I/O logic.
15
+
16
+ Company: eXonware.com
17
+ Author: Eng. Muhammad AlShehri
18
+ Email: connect@exonware.com
19
+ Version: 0.1.0.1
20
+ Generation Date: 15-Dec-2025
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ from dataclasses import dataclass
26
+ from pathlib import Path
27
+ from typing import Any, Callable, Optional
28
+ from abc import ABC, abstractmethod
29
+ import json
30
+ import os
31
+ import tempfile
32
+ import multiprocessing as mp
33
+ from concurrent.futures import ProcessPoolExecutor, as_completed
34
+
35
+ from .serialization.auto_serializer import AutoSerializer
36
+ from exonware.xwsystem.config.logging_setup import get_logger
37
+ from exonware.xwsystem.config.performance import get_performance_config
38
+
39
+
40
+ logger = get_logger(__name__)
41
+
42
+
43
+ JsonMatchFn = Callable[[Any], bool]
44
+
45
+
46
def _process_chunk_worker(args: tuple[int, int, int, str, str, str | None, int | None, bool]) -> tuple[list[int] | None, dict[str, int], int]:
    """
    Index one byte range of a line-oriented file (runs in a worker process).

    This is a module-level function so it can be pickled for multiprocessing.

    A line belongs to the chunk that contains its first byte. A chunk
    therefore reads its last line to completion even when that line extends
    past ``end_offset``, and a chunk whose ``start_offset`` lands in the middle
    of a line skips forward to the next line start, so no line is indexed
    twice or indexed from a fragment.

    Args:
        args: ``(chunk_id, start_offset, end_offset, file_path, encoding,
            id_field, max_id_index, build_line_offsets)``. ``chunk_id`` and
            ``encoding`` are accepted for call compatibility but are not used
            here; the file is read as bytes.

    Returns:
        ``(line_offsets, id_index, lines_processed)``:

        - ``line_offsets``: absolute byte offsets of the non-blank lines owned
          by this chunk, or ``None`` when ``build_line_offsets`` is false.
        - ``id_index``: ``str(record[id_field]) -> chunk-local line index``,
          limited to ``max_id_index`` entries when a cap is given.
        - ``lines_processed``: number of non-blank lines owned by this chunk.

    Raises:
        OSError: if the file cannot be opened or read. The error propagates
            through the future so the caller can fall back, instead of
            receiving a silently truncated result.

    NOTE(review): blank lines are skipped here, whereas the single-threaded
    path in ``build_index`` records an offset for every physical line.
    """
    (
        _chunk_id,
        start_offset,
        end_offset,
        file_path_str,
        _encoding,
        id_field_arg,
        max_id_index_arg,
        build_line_offsets_arg,
    ) = args
    chunk_line_offsets: list[int] | None = [] if build_line_offsets_arg else None
    chunk_id_index: dict[str, int] = {}
    lines_processed = 0

    # Import parser in worker process (can't pickle serializer)
    try:
        from exonware.xwsystem.io.serialization.parsers.registry import get_best_available_parser
        parser = get_best_available_parser()
    except ImportError:
        import json as parser

    with open(file_path_str, "rb") as f:
        # Align to a line boundary: step back one byte and consume through the
        # next newline. If the previous byte is already a newline this reads
        # just that byte and leaves us exactly at start_offset.
        f.seek(max(start_offset - 1, 0))
        if start_offset > 0:
            f.readline()
        current_offset = f.tell()

        while current_offset < end_offset:
            line_start = current_offset
            line = f.readline()
            if not line:
                break  # end of file
            current_offset = line_start + len(line)

            # Blank lines carry no record.
            raw = line.strip()
            if not raw:
                continue

            if build_line_offsets_arg:
                chunk_line_offsets.append(line_start)
                line_idx = len(chunk_line_offsets) - 1
            else:
                line_idx = lines_processed

            # Only parse while the id index still has room.
            if id_field_arg and (max_id_index_arg is None or len(chunk_id_index) < max_id_index_arg):
                try:
                    # Parser accepts bytes directly (hybrid parser handles it)
                    obj = parser.loads(raw)
                    if isinstance(obj, dict) and id_field_arg in obj:
                        chunk_id_index[str(obj[id_field_arg])] = line_idx
                except Exception:
                    # Skip invalid lines (best-effort indexing)
                    pass

            lines_processed += 1

    return (chunk_line_offsets, chunk_id_index, lines_processed)
111
+ JsonUpdateFn = Callable[[Any], Any]
112
+
113
+
114
@dataclass
class JsonIndexMeta:
    """
    Identity of the file a JSONL/NDJSON index was built from.

    Kept deliberately small: it covers what the x5 examples rely on without
    reusing any of the example code.
    """

    # Path of the indexed file, stored as a string.
    path: str
    # File size in bytes when the index was built.
    size: int
    # File modification time when the index was built.
    mtime: float
    # Index layout version.
    version: int = 1
127
+
128
+
129
@dataclass
class JsonIndex:
    """
    Simple index for line-oriented JSON files.

    - meta: identity of the indexed file (path, size, mtime, version)
    - line_offsets: byte offset of each JSON line, or None when the index was
      built with ``build_line_offsets=False``
    - id_index: optional mapping id_value -> line_number
    """

    meta: JsonIndexMeta
    # None is a legitimate value: build_index() passes None for id-only builds.
    line_offsets: list[int] | None
    id_index: Optional[dict[str, int]] = None
141
+
142
+
143
class ADataOperations(ABC):
    """
    Format-agnostic contract for operations on large, file-backed datasets.

    Implementations are free to target any physical layout (NDJSON/JSONL,
    multi-document YAML, binary record stores, ...), provided they honour
    the following semantics:

    - Reads are streamed record by record and filtered with a predicate.
    - Updates are streamed and applied atomically (temp file + replace).
    - Indexing for random access and paging is optional.
    """

    @abstractmethod
    def stream_read(
        self,
        file_path: str | Path,
        match: JsonMatchFn,
        path: Optional[list[object]] = None,
        encoding: str = "utf-8",
    ) -> Any:
        """
        Find the first record accepted by ``match``.

        Returns the record itself, or the part of it selected by ``path``
        when a path is given.
        """
        raise NotImplementedError

    @abstractmethod
    def stream_update(
        self,
        file_path: str | Path,
        match: JsonMatchFn,
        updater: JsonUpdateFn,
        *,
        encoding: str = "utf-8",
        newline: str = "\n",
        atomic: bool = True,
    ) -> int:
        """
        Copy the backing store as a stream, rewriting matching records.

        Every record accepted by ``match`` is passed through ``updater``.
        When ``atomic=True`` the implementation MUST replace the store
        atomically.

        Returns:
            The number of records that were updated.
        """
        raise NotImplementedError

    @abstractmethod
    def build_index(
        self,
        file_path: str | Path,
        *,
        encoding: str = "utf-8",
        id_field: str | None = None,
        max_id_index: int | None = None,
    ) -> JsonIndex:
        """Scan the store and produce an index usable for random access and paging."""
        raise NotImplementedError

    @abstractmethod
    def indexed_get_by_line(
        self,
        file_path: str | Path,
        line_number: int,
        *,
        encoding: str = "utf-8",
        index: Optional[JsonIndex] = None,
    ) -> Any:
        """Fetch the logical record at position ``line_number`` via random access."""
        raise NotImplementedError

    @abstractmethod
    def indexed_get_by_id(
        self,
        file_path: str | Path,
        id_value: Any,
        *,
        encoding: str = "utf-8",
        id_field: str = "id",
        index: Optional[JsonIndex] = None,
    ) -> Any:
        """Fetch the record whose ``id_field`` equals ``id_value``; an index is optional."""
        raise NotImplementedError

    @abstractmethod
    def get_page(
        self,
        file_path: str | Path,
        page_number: int,
        page_size: int,
        *,
        encoding: str = "utf-8",
        index: Optional[JsonIndex] = None,
    ) -> list[Any]:
        """Fetch one page of logical records, using an index for efficiency."""
        raise NotImplementedError
235
+
236
+
237
+ class NDJSONDataOperations(ADataOperations):
238
+ """
239
+ Generic data-operations helper for NDJSON / JSONL style files.
240
+
241
+ This class is deliberately low-level and works directly with paths and
242
+ native Python data. XWData and other libraries can wrap it to provide
243
+ higher-level, type-agnostic facades.
244
+ """
245
+
246
def __init__(self, serializer: Optional[AutoSerializer] = None):
    """Remember the serializer used for parsing; default to a JSON AutoSerializer."""
    if serializer:
        # A caller-supplied serializer is used as-is.
        self._serializer = serializer
    else:
        # Lean on xwsystem's AutoSerializer rather than re-implementing parsing.
        self._serializer = AutoSerializer(default_format="JSON")
249
+
250
+ # ------------------------------------------------------------------
251
+ # Streaming read
252
+ # ------------------------------------------------------------------
253
+
254
def stream_read(
    self,
    file_path: str | Path,
    match: JsonMatchFn,
    path: Optional[list[object]] = None,
    encoding: str = "utf-8",
) -> Any:
    """
    Scan an NDJSON file line by line and return the first matching record.

    Each non-blank line is parsed with the configured serializer (JSON hint)
    and offered to ``match``. The first accepted record is returned after
    optional sub-path extraction via ``path``.

    Raises:
        FileNotFoundError: if ``file_path`` does not exist.
        KeyError: if no record satisfies ``match``.
    """
    source = Path(file_path)
    if not source.exists():
        raise FileNotFoundError(str(source))

    with source.open("r", encoding=encoding) as handle:
        # Strip lazily so only one line is held at a time.
        for text in map(str.strip, handle):
            if text:
                record = self._serializer.detect_and_deserialize(
                    text, file_path=source, format_hint="JSON"
                )
                if match(record):
                    return self._extract_path(record, path)

    raise KeyError("No matching record found")
286
+
287
+ # ------------------------------------------------------------------
288
+ # Streaming update with atomic replace
289
+ # ------------------------------------------------------------------
290
+
291
def stream_update(
    self,
    file_path: str | Path,
    match: JsonMatchFn,
    updater: JsonUpdateFn,
    *,
    encoding: str = "utf-8",
    newline: str = "\n",
    atomic: bool = True,
) -> int:
    """
    Stream-copy a huge NDJSON file, applying `updater` to records
    where `match(obj)` is True.

    Records are parsed one line at a time; only matching records are
    re-serialized, all other lines are copied through. Output goes to a
    temporary file in the same directory, which replaces the original on
    success and is removed on failure.

    Args:
        file_path: Path to the NDJSON file to rewrite.
        match: Predicate deciding which records are updated.
        updater: Called with a matching record; its return value is
            written with ``json.dumps(..., ensure_ascii=False)``.
        encoding: Text encoding for both reading and writing.
        newline: Passed as the ``newline`` argument when opening the output
            file; the text layer translates the ``"\\n"`` terminators
            written here accordingly.
        atomic: Kept for interface compatibility. The file is replaced via
            ``os.replace`` in either case (the original non-atomic branch
            used ``Path.replace``, which performs the same operation).

    Returns:
        The number of updated records.

    Raises:
        FileNotFoundError: if ``file_path`` does not exist.
    """
    target = Path(file_path)
    if not target.exists():
        raise FileNotFoundError(str(target))

    updated = 0

    # Write to a temp file in the same directory for atomic replace.
    fd, tmp_path_str = tempfile.mkstemp(
        prefix=f".{target.name}.tmp.", dir=str(target.parent)
    )
    tmp_path = Path(tmp_path_str)

    try:
        with os.fdopen(fd, "w", encoding=encoding, newline=newline) as out_f, target.open(
            "r", encoding=encoding
        ) as in_f:
            for line in in_f:
                # Blank and whitespace-only lines hold no record; copy them
                # through untouched (stream_read skips them the same way).
                if not line.strip():
                    out_f.write(line)
                    continue

                obj = self._serializer.detect_and_deserialize(
                    line.rstrip("\n"), file_path=target, format_hint="JSON"
                )
                if match(obj):
                    updated_line = json.dumps(updater(obj), ensure_ascii=False)
                    # Always terminate with "\n": the output file was opened
                    # with `newline`, so the text layer already converts it.
                    # Appending `newline` itself would double-translate
                    # "\r\n" and drop the terminator for "".
                    out_f.write(updated_line + "\n")
                    updated += 1
                else:
                    out_f.write(line)

        # mkstemp creates the file with owner-only permissions; carry over
        # the original mode so the replace does not silently tighten it.
        try:
            os.chmod(tmp_path, target.stat().st_mode & 0o7777)
        except OSError:
            # Best-effort: a failed chmod must not abort a finished update.
            pass

        os.replace(tmp_path, target)
        return updated
    finally:
        # Ensure temp file is removed on error
        if tmp_path.exists():
            try:
                tmp_path.unlink()
            except OSError:
                # Best-effort cleanup; do not mask original error.
                logger.debug("Failed to cleanup temp file %s", tmp_path)
358
+
359
+ # ------------------------------------------------------------------
360
+ # Indexing and paging
361
+ # ------------------------------------------------------------------
362
+
363
def build_index(
    self,
    file_path: str | Path,
    *,
    encoding: str = "utf-8",
    id_field: str | None = None,
    max_id_index: int | None = None,
    use_parallel: bool | None = None,
    num_workers: int | None = None,
    chunk_size_mb: int = 100,
    build_line_offsets: bool = True,
) -> JsonIndex:
    """
    One-time full scan to build an index:
    - line_offsets: byte offset of each line
    - optional id_index: obj[id_field] -> line_number

    Args:
        file_path: Path to JSONL file
        encoding: File encoding (default: utf-8); forwarded to the parallel
            builder, the single-threaded scan reads bytes
        id_field: Optional field name to build id_index
        max_id_index: Maximum entries in id_index (None = unlimited)
        use_parallel: Enable parallel processing (None = decide from the
            performance config and the file size)
        num_workers: Number of worker processes (None = the configured
            ``parallel_index_workers`` value)
        chunk_size_mb: Chunk size in MB for parallel processing. A value of
            exactly 100 is treated as "not set" and replaced by the
            configured ``parallel_index_chunk_size_mb``
        build_line_offsets: If True, build line_offsets list (default: True,
            set False for faster id_index-only builds)

    Returns:
        JsonIndex whose ``line_offsets`` is a list when
        ``build_line_offsets=True`` and None otherwise, and whose
        ``id_index`` is a dict when ``id_field`` is given and None otherwise.

    Raises:
        FileNotFoundError: if ``file_path`` does not exist.
    """
    target = Path(file_path)
    if not target.exists():
        raise FileNotFoundError(str(target))

    # Single stat call so size and mtime describe the same file state.
    stat_result = target.stat()

    # Auto-detect parallel based on config
    perf_config = get_performance_config()
    if use_parallel is None:
        file_size_mb = stat_result.st_size / 1_048_576  # 1024 * 1024
        use_parallel = bool(
            perf_config.enable_parallel_index
            and file_size_mb > perf_config.parallel_index_threshold_mb
        )

    # Use config defaults for workers and chunk size
    if num_workers is None:
        num_workers = perf_config.parallel_index_workers
    if chunk_size_mb == 100:  # 100 doubles as the "not explicitly set" marker
        chunk_size_mb = perf_config.parallel_index_chunk_size_mb

    if use_parallel:
        try:
            return self._build_index_parallel(
                target,
                encoding=encoding,
                id_field=id_field,
                max_id_index=max_id_index,
                num_workers=num_workers,
                chunk_size_mb=chunk_size_mb,
                build_line_offsets=build_line_offsets,
            )
        except Exception as e:
            logger.warning(
                "Parallel index building failed, falling back to single-threaded: %s", e
            )
            # Fall through to single-threaded

    # Single-threaded implementation
    line_offsets: list[int] | None = [] if build_line_offsets else None
    id_index: dict[str, int] | None = {} if id_field else None

    # Resolve the parser once, outside the scan loop.
    try:
        from exonware.xwsystem.io.serialization.parsers.registry import get_best_available_parser
        parser = get_best_available_parser()
    except ImportError:
        import json as parser

    offset = 0
    with target.open("rb") as f:
        for line_no, line in enumerate(f):
            if build_line_offsets:
                line_offsets.append(offset)
            offset += len(line)

            if id_index is None:
                continue
            # Once the cap is reached nothing more can be stored, so skip
            # the parse entirely instead of discarding its result.
            if max_id_index is not None and len(id_index) >= max_id_index:
                continue

            raw = line.strip()
            if not raw:
                continue
            try:
                # Parser accepts bytes directly (hybrid parser handles it)
                obj = parser.loads(raw)
                if isinstance(obj, dict) and id_field in obj:
                    id_index[str(obj[id_field])] = line_no
            except Exception:
                # Index should be best-effort and robust to bad lines.
                pass

    meta = JsonIndexMeta(
        path=str(target),
        size=stat_result.st_size,
        mtime=stat_result.st_mtime,
        version=1,
    )
    return JsonIndex(meta=meta, line_offsets=line_offsets, id_index=id_index)
473
+
474
def _build_index_parallel(
    self,
    file_path: Path,
    *,
    encoding: str = "utf-8",
    id_field: str | None = None,
    max_id_index: int | None = None,
    num_workers: int | None = None,
    chunk_size_mb: int = 100,
    build_line_offsets: bool = True,
) -> JsonIndex:
    """
    Parallel index building using multiple CPU cores.

    This is an internal method called by build_index() when use_parallel=True.

    The file is cut into byte ranges, each range is handed to
    ``_process_chunk_worker`` in a process pool, and the per-chunk results
    are merged in chunk order into one ``JsonIndex``.

    Args:
        file_path: File to index.
        encoding: Text encoding forwarded to the workers.
        id_field: When set, an id -> line-number index is built as well.
        max_id_index: Optional cap on the number of entries in the id index.
        num_workers: Worker process count; derived from the file size and
            the CPU count when None.
        chunk_size_mb: Target size of one byte range, in MiB.
        build_line_offsets: Whether to collect per-line byte offsets.

    Returns:
        A JsonIndex whose meta holds the path, size and mtime sampled
        before processing started.

    Raises:
        ValueError: If ``chunk_size_mb`` or ``num_workers`` is not positive,
            or if the file is smaller than two chunks (presumably the
            caller uses this to fall back to the single-process path).
    """
    import sys

    # A non-positive chunk size would make the splitting loop spin forever,
    # and a zero worker count would divide by zero while merging chunks.
    if chunk_size_mb <= 0:
        raise ValueError("chunk_size_mb must be positive")
    if num_workers is not None and num_workers <= 0:
        raise ValueError("num_workers must be positive")

    # One stat() snapshot taken *before* processing: if the file changes
    # while it is being indexed, the recorded size/mtime will not match
    # the new file, instead of silently describing content never indexed.
    stat_result = file_path.stat()
    file_size = stat_result.st_size
    chunk_size_bytes = chunk_size_mb * 1_048_576  # 1024 * 1024

    # If file is too small, let the caller use the single-process path.
    if file_size < chunk_size_bytes * 2:
        raise ValueError("File too small for parallel processing")

    if num_workers is None:
        # Simple formula: 1 worker per 10MB, but never fewer than the
        # number of CPUs.
        calculated_workers = int(file_size / 1_048_576 / 10)
        num_workers = max(mp.cpu_count(), min(61, calculated_workers))
        if sys.platform == "win32":
            # ProcessPoolExecutor rejects max_workers > 61 on Windows.
            num_workers = min(num_workers, 61)

    # Split file into fixed-size byte ranges: (chunk_id, start, end).
    chunks = [
        (chunk_id, start, min(start + chunk_size_bytes, file_size))
        for chunk_id, start in enumerate(range(0, file_size, chunk_size_bytes))
    ]

    # Limit number of chunks by merging neighbouring ranges; a merged range
    # keeps the id of its first member, so ordering by id is preserved.
    if len(chunks) > num_workers * 2:
        group_size = max(1, len(chunks) // num_workers)
        chunks = [
            (group[0][0], group[0][1], group[-1][2])
            for group in (
                chunks[i:i + group_size]
                for i in range(0, len(chunks), group_size)
            )
        ]

    logger.debug(f"Processing {len(chunks)} chunks with {num_workers} workers")

    # Prepare arguments for worker processes
    chunk_args = [
        (chunk[0], chunk[1], chunk[2], str(file_path), encoding, id_field, max_id_index, build_line_offsets)
        for chunk in chunks
    ]

    # Execute parallel processing; results arrive in completion order and
    # are keyed by chunk id so they can be merged in file order afterwards.
    chunk_results: dict[int, tuple[list[int] | None, dict[str, int] | None]] = {}
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = {
            executor.submit(_process_chunk_worker, args): args[0]
            for args in chunk_args
        }
        for future in as_completed(futures):
            try:
                # The third element of the worker result is not used here.
                chunk_offsets, chunk_ids, _ = future.result()
            except Exception as e:
                logger.warning(f"Chunk processing failed: {e}")
                raise
            chunk_results[futures[future]] = (chunk_offsets, chunk_ids)

    line_offsets: list[int] | None = [] if build_line_offsets else None
    id_index: dict[str, int] | None = {} if id_field else None
    # Running line count, used only when offsets are not being collected.
    lines_seen = 0

    for chunk_id in sorted(chunk_results):
        chunk_offsets, chunk_ids = chunk_results[chunk_id]

        if line_offsets is not None:
            # The number of offsets merged so far is the exact index of
            # this chunk's first line; a chunk without offsets adds none.
            base_line = len(line_offsets)
            if chunk_offsets:
                # Offsets are concatenated unchanged -- presumably they
                # are absolute file positions; confirm against the worker.
                line_offsets.extend(chunk_offsets)
        else:
            base_line = lines_seen
            if chunk_offsets:
                lines_seen += len(chunk_offsets)
            else:
                # NOTE(review): without offsets the line count per chunk
                # is a rough guess (300), so id_index line numbers are
                # only approximate in this mode -- verify what the worker
                # returns when build_line_offsets is False.
                lines_seen += 300

        if id_index is not None and chunk_ids:
            # Worker line numbers are relative to the chunk; shift them
            # by the number of lines that precede the chunk.
            if max_id_index is None:
                id_index.update(
                    (id_val, base_line + rel_line)
                    for id_val, rel_line in chunk_ids.items()
                )
            else:
                for id_val, rel_line in chunk_ids.items():
                    if len(id_index) >= max_id_index:
                        # Cap reached: nothing further is stored.
                        break
                    id_index[id_val] = base_line + rel_line

    meta = JsonIndexMeta(
        path=str(file_path),
        size=file_size,
        mtime=stat_result.st_mtime,
        version=1,
    )
    return JsonIndex(meta=meta, line_offsets=line_offsets, id_index=id_index)
611
+
612
def indexed_get_by_line(
    self,
    file_path: str | Path,
    line_number: int,
    *,
    encoding: str = "utf-8",
    index: Optional[JsonIndex] = None,
) -> Any:
    """
    Random-access a specific record by line_number (0-based) using index.

    Args:
        file_path: File to read from.
        line_number: 0-based position in ``index.line_offsets``.
        encoding: Encoding used to decode the line.
        index: Pre-built index; built via ``self.build_index`` when None.

    Returns:
        The record deserialized from the addressed line.

    Raises:
        ValueError: If the index carries no line offsets, or the addressed
            line is blank.
        IndexError: If ``line_number`` is outside the indexed range.
    """
    target = Path(file_path)
    if index is None:
        index = self.build_index(target, encoding=encoding)

    offsets = index.line_offsets
    if offsets is None:
        # An index can be built without line offsets; fail with a clear
        # message instead of a TypeError from len(None).
        raise ValueError(
            "Index has no line offsets; rebuild it with line offsets enabled"
        )

    if not 0 <= line_number < len(offsets):
        raise IndexError("line_number out of range")

    # Binary mode: the stored offsets are byte positions.
    with target.open("rb") as f:
        f.seek(offsets[line_number])
        raw_line = f.readline()

    text = raw_line.decode(encoding).strip()
    if not text:
        raise ValueError("Empty line at indexed position")
    return self._serializer.detect_and_deserialize(
        text, file_path=target, format_hint="JSON"
    )
640
+
641
def indexed_get_by_id(
    self,
    file_path: str | Path,
    id_value: Any,
    *,
    encoding: str = "utf-8",
    id_field: str = "id",
    index: Optional[JsonIndex] = None,
) -> Any:
    """
    Fetch the record whose ``id_field`` equals ``id_value``.

    The id index is consulted first (keys are compared as ``str(id_value)``);
    on a miss, or when the index has no id mapping, the file is scanned
    through ``self.stream_read`` instead.
    """
    source = Path(file_path)
    resolved = (
        index
        if index is not None
        else self.build_index(source, encoding=encoding, id_field=id_field)
    )

    lookup = resolved.id_index
    if lookup is not None:
        wanted = str(id_value)
        if wanted in lookup:
            return self.indexed_get_by_line(
                source, lookup[wanted], encoding=encoding, index=resolved
            )

    # Fallback: linear scan using stream_read semantics.
    # NOTE(review): the scan compares with strict equality while the index
    # lookup is string-normalised -- confirm this asymmetry is intended.
    return self.stream_read(
        source,
        lambda record: isinstance(record, dict) and record.get(id_field) == id_value,
        path=None,
        encoding=encoding,
    )
671
+
672
def get_page(
    self,
    file_path: str | Path,
    page_number: int,
    page_size: int,
    *,
    encoding: str = "utf-8",
    index: Optional[JsonIndex] = None,
) -> list[Any]:
    """
    Paging helper using index:
    - page_number: 1-based
    - page_size: number of records per page

    Args:
        file_path: File to read from.
        page_number: 1-based page to return.
        page_size: Number of indexed lines per page.
        encoding: Encoding used to decode each line.
        index: Pre-built index; built via ``self.build_index`` when None.

    Returns:
        The deserialized records of the page. Blank lines are skipped, so
        a page may hold fewer than ``page_size`` records; a page past the
        end of the index is an empty list.

    Raises:
        ValueError: If ``page_number`` < 1 or ``page_size`` <= 0, or if the
            index carries no line offsets.
    """
    # Validate before touching the file so bad arguments never trigger an
    # index build.
    if page_number < 1 or page_size <= 0:
        raise ValueError("Invalid page_number or page_size")

    target = Path(file_path)
    if index is None:
        index = self.build_index(target, encoding=encoding)

    offsets = index.line_offsets
    if offsets is None:
        # An index can be built without line offsets; fail with a clear
        # message instead of a TypeError from len(None).
        raise ValueError(
            "Index has no line offsets; rebuild it with line offsets enabled"
        )

    start = (page_number - 1) * page_size
    # Slicing clamps at the end of the list, covering both the short last
    # page and a page that lies entirely past the end.
    page_offsets = offsets[start:start + page_size]
    if not page_offsets:
        return []

    results: list[Any] = []
    # Binary mode: the stored offsets are byte positions.
    with target.open("rb") as f:
        for offset in page_offsets:
            f.seek(offset)
            text = f.readline().decode(encoding).strip()
            if not text:
                continue
            results.append(
                self._serializer.detect_and_deserialize(
                    text, file_path=target, format_hint="JSON"
                )
            )

    return results
716
+
717
+ # ------------------------------------------------------------------
718
+ # Helpers
719
+ # ------------------------------------------------------------------
720
+
721
+ def _extract_path(self, obj: Any, path: Optional[list[object]]) -> Any:
722
+ """Extract a nested path like ['user', 'email'] or ['tags', 0]."""
723
+ if not path:
724
+ return obj
725
+
726
+ current = obj
727
+ for part in path:
728
+ if isinstance(current, dict) and isinstance(part, str):
729
+ if part not in current:
730
+ raise KeyError(part)
731
+ current = current[part]
732
+ elif isinstance(current, list) and isinstance(part, int):
733
+ current = current[part]
734
+ else:
735
+ raise KeyError(part)
736
+ return current
737
+
738
+
739
# Names exported by a star-import of this module.
__all__ = [
    "JsonIndexMeta",
    "JsonIndex",
    "ADataOperations",
    "NDJSONDataOperations",
]
745
+
746
+