exonware-xwsystem 0.0.1.411__py3-none-any.whl → 0.1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- exonware/__init__.py +1 -1
- exonware/conf.py +1 -1
- exonware/xwsystem/__init__.py +1 -1
- exonware/xwsystem/caching/__init__.py +1 -1
- exonware/xwsystem/caching/base.py +1 -1
- exonware/xwsystem/caching/bloom_cache.py +1 -1
- exonware/xwsystem/caching/cache_manager.py +1 -1
- exonware/xwsystem/caching/conditional.py +1 -1
- exonware/xwsystem/caching/contracts.py +1 -1
- exonware/xwsystem/caching/decorators.py +1 -1
- exonware/xwsystem/caching/defs.py +1 -1
- exonware/xwsystem/caching/disk_cache.py +1 -1
- exonware/xwsystem/caching/distributed.py +1 -1
- exonware/xwsystem/caching/errors.py +1 -1
- exonware/xwsystem/caching/events.py +1 -1
- exonware/xwsystem/caching/eviction_strategies.py +1 -1
- exonware/xwsystem/caching/fluent.py +1 -1
- exonware/xwsystem/caching/integrity.py +1 -1
- exonware/xwsystem/caching/lfu_cache.py +1 -1
- exonware/xwsystem/caching/lfu_optimized.py +1 -1
- exonware/xwsystem/caching/lru_cache.py +1 -1
- exonware/xwsystem/caching/memory_bounded.py +1 -1
- exonware/xwsystem/caching/metrics_exporter.py +1 -1
- exonware/xwsystem/caching/observable_cache.py +1 -1
- exonware/xwsystem/caching/pluggable_cache.py +1 -1
- exonware/xwsystem/caching/rate_limiter.py +1 -1
- exonware/xwsystem/caching/read_through.py +1 -1
- exonware/xwsystem/caching/secure_cache.py +1 -1
- exonware/xwsystem/caching/serializable.py +1 -1
- exonware/xwsystem/caching/stats.py +1 -1
- exonware/xwsystem/caching/tagging.py +1 -1
- exonware/xwsystem/caching/ttl_cache.py +1 -1
- exonware/xwsystem/caching/two_tier_cache.py +1 -1
- exonware/xwsystem/caching/utils.py +1 -1
- exonware/xwsystem/caching/validation.py +1 -1
- exonware/xwsystem/caching/warming.py +1 -1
- exonware/xwsystem/caching/write_behind.py +1 -1
- exonware/xwsystem/cli/__init__.py +1 -1
- exonware/xwsystem/cli/args.py +1 -1
- exonware/xwsystem/cli/base.py +1 -1
- exonware/xwsystem/cli/colors.py +1 -1
- exonware/xwsystem/cli/console.py +1 -1
- exonware/xwsystem/cli/contracts.py +1 -1
- exonware/xwsystem/cli/defs.py +1 -1
- exonware/xwsystem/cli/errors.py +1 -1
- exonware/xwsystem/cli/progress.py +1 -1
- exonware/xwsystem/cli/prompts.py +1 -1
- exonware/xwsystem/cli/tables.py +1 -1
- exonware/xwsystem/config/__init__.py +1 -1
- exonware/xwsystem/config/base.py +1 -1
- exonware/xwsystem/config/contracts.py +1 -1
- exonware/xwsystem/config/defaults.py +1 -1
- exonware/xwsystem/config/defs.py +1 -1
- exonware/xwsystem/config/errors.py +1 -1
- exonware/xwsystem/config/logging.py +1 -1
- exonware/xwsystem/config/logging_setup.py +1 -1
- exonware/xwsystem/config/performance.py +115 -388
- exonware/xwsystem/http_client/__init__.py +1 -1
- exonware/xwsystem/http_client/advanced_client.py +1 -1
- exonware/xwsystem/http_client/base.py +1 -1
- exonware/xwsystem/http_client/client.py +1 -1
- exonware/xwsystem/http_client/contracts.py +1 -1
- exonware/xwsystem/http_client/defs.py +1 -1
- exonware/xwsystem/http_client/errors.py +1 -1
- exonware/xwsystem/io/__init__.py +1 -1
- exonware/xwsystem/io/archive/__init__.py +1 -1
- exonware/xwsystem/io/archive/archive.py +1 -1
- exonware/xwsystem/io/archive/archive_files.py +1 -1
- exonware/xwsystem/io/archive/archivers.py +1 -1
- exonware/xwsystem/io/archive/base.py +1 -1
- exonware/xwsystem/io/archive/codec_integration.py +1 -1
- exonware/xwsystem/io/archive/compression.py +1 -1
- exonware/xwsystem/io/archive/formats/__init__.py +1 -1
- exonware/xwsystem/io/archive/formats/brotli_format.py +1 -1
- exonware/xwsystem/io/archive/formats/lz4_format.py +1 -1
- exonware/xwsystem/io/archive/formats/rar.py +1 -1
- exonware/xwsystem/io/archive/formats/sevenzip.py +1 -1
- exonware/xwsystem/io/archive/formats/squashfs_format.py +1 -1
- exonware/xwsystem/io/archive/formats/tar.py +1 -1
- exonware/xwsystem/io/archive/formats/wim_format.py +1 -1
- exonware/xwsystem/io/archive/formats/zip.py +1 -1
- exonware/xwsystem/io/archive/formats/zpaq_format.py +1 -1
- exonware/xwsystem/io/archive/formats/zstandard.py +1 -1
- exonware/xwsystem/io/base.py +1 -1
- exonware/xwsystem/io/codec/__init__.py +1 -1
- exonware/xwsystem/io/codec/base.py +1 -1
- exonware/xwsystem/io/codec/contracts.py +1 -1
- exonware/xwsystem/io/codec/registry.py +1 -1
- exonware/xwsystem/io/common/__init__.py +1 -1
- exonware/xwsystem/io/common/base.py +1 -1
- exonware/xwsystem/io/common/lock.py +1 -1
- exonware/xwsystem/io/common/watcher.py +1 -1
- exonware/xwsystem/io/contracts.py +1 -1
- exonware/xwsystem/io/data_operations.py +276 -10
- exonware/xwsystem/io/defs.py +1 -1
- exonware/xwsystem/io/errors.py +1 -1
- exonware/xwsystem/io/facade.py +1 -1
- exonware/xwsystem/io/file/__init__.py +1 -1
- exonware/xwsystem/io/file/base.py +1 -1
- exonware/xwsystem/io/file/conversion.py +1 -1
- exonware/xwsystem/io/file/file.py +1 -1
- exonware/xwsystem/io/file/paged_source.py +1 -1
- exonware/xwsystem/io/file/paging/__init__.py +1 -1
- exonware/xwsystem/io/file/paging/byte_paging.py +1 -1
- exonware/xwsystem/io/file/paging/line_paging.py +1 -1
- exonware/xwsystem/io/file/paging/record_paging.py +1 -1
- exonware/xwsystem/io/file/paging/registry.py +1 -1
- exonware/xwsystem/io/file/source.py +1 -1
- exonware/xwsystem/io/filesystem/__init__.py +1 -1
- exonware/xwsystem/io/filesystem/base.py +1 -1
- exonware/xwsystem/io/filesystem/local.py +1 -1
- exonware/xwsystem/io/folder/__init__.py +1 -1
- exonware/xwsystem/io/folder/base.py +1 -1
- exonware/xwsystem/io/folder/folder.py +1 -1
- exonware/xwsystem/io/serialization/__init__.py +1 -1
- exonware/xwsystem/io/serialization/auto_serializer.py +1 -1
- exonware/xwsystem/io/serialization/base.py +1 -1
- exonware/xwsystem/io/serialization/contracts.py +1 -1
- exonware/xwsystem/io/serialization/defs.py +1 -1
- exonware/xwsystem/io/serialization/errors.py +1 -1
- exonware/xwsystem/io/serialization/flyweight.py +1 -1
- exonware/xwsystem/io/serialization/format_detector.py +1 -1
- exonware/xwsystem/io/serialization/formats/__init__.py +1 -1
- exonware/xwsystem/io/serialization/formats/binary/bson.py +1 -1
- exonware/xwsystem/io/serialization/formats/binary/cbor.py +1 -1
- exonware/xwsystem/io/serialization/formats/binary/marshal.py +1 -1
- exonware/xwsystem/io/serialization/formats/binary/msgpack.py +1 -1
- exonware/xwsystem/io/serialization/formats/binary/pickle.py +1 -1
- exonware/xwsystem/io/serialization/formats/binary/plistlib.py +1 -1
- exonware/xwsystem/io/serialization/formats/database/dbm.py +1 -1
- exonware/xwsystem/io/serialization/formats/database/shelve.py +1 -1
- exonware/xwsystem/io/serialization/formats/database/sqlite3.py +1 -1
- exonware/xwsystem/io/serialization/formats/text/append_only_log.py +201 -0
- exonware/xwsystem/io/serialization/formats/text/configparser.py +1 -1
- exonware/xwsystem/io/serialization/formats/text/csv.py +1 -1
- exonware/xwsystem/io/serialization/formats/text/formdata.py +1 -1
- exonware/xwsystem/io/serialization/formats/text/json.py +43 -20
- exonware/xwsystem/io/serialization/formats/text/json5.py +1 -1
- exonware/xwsystem/io/serialization/formats/text/jsonlines.py +99 -15
- exonware/xwsystem/io/serialization/formats/text/multipart.py +1 -1
- exonware/xwsystem/io/serialization/formats/text/toml.py +1 -1
- exonware/xwsystem/io/serialization/formats/text/xml.py +1 -1
- exonware/xwsystem/io/serialization/formats/text/yaml.py +1 -1
- exonware/xwsystem/io/serialization/parsers/__init__.py +15 -0
- exonware/xwsystem/io/serialization/parsers/base.py +59 -0
- exonware/xwsystem/io/serialization/parsers/hybrid_parser.py +61 -0
- exonware/xwsystem/io/serialization/parsers/msgspec_parser.py +45 -0
- exonware/xwsystem/io/serialization/parsers/orjson_direct_parser.py +53 -0
- exonware/xwsystem/io/serialization/parsers/orjson_parser.py +59 -0
- exonware/xwsystem/io/serialization/parsers/pysimdjson_parser.py +51 -0
- exonware/xwsystem/io/serialization/parsers/rapidjson_parser.py +50 -0
- exonware/xwsystem/io/serialization/parsers/registry.py +90 -0
- exonware/xwsystem/io/serialization/parsers/standard.py +43 -0
- exonware/xwsystem/io/serialization/parsers/ujson_parser.py +50 -0
- exonware/xwsystem/io/serialization/registry.py +1 -1
- exonware/xwsystem/io/serialization/serializer.py +1 -1
- exonware/xwsystem/io/serialization/utils/__init__.py +1 -1
- exonware/xwsystem/io/serialization/utils/path_ops.py +1 -1
- exonware/xwsystem/io/stream/__init__.py +1 -1
- exonware/xwsystem/io/stream/async_operations.py +1 -1
- exonware/xwsystem/io/stream/base.py +1 -1
- exonware/xwsystem/io/stream/codec_io.py +1 -1
- exonware/xwsystem/ipc/async_fabric.py +1 -1
- exonware/xwsystem/ipc/base.py +1 -1
- exonware/xwsystem/ipc/contracts.py +1 -1
- exonware/xwsystem/ipc/defs.py +1 -1
- exonware/xwsystem/ipc/errors.py +1 -1
- exonware/xwsystem/monitoring/base.py +1 -1
- exonware/xwsystem/monitoring/contracts.py +1 -1
- exonware/xwsystem/monitoring/defs.py +1 -1
- exonware/xwsystem/monitoring/errors.py +1 -1
- exonware/xwsystem/monitoring/performance_manager_generic.py +1 -1
- exonware/xwsystem/monitoring/system_monitor.py +1 -1
- exonware/xwsystem/monitoring/tracing.py +1 -1
- exonware/xwsystem/monitoring/tracker.py +1 -1
- exonware/xwsystem/operations/__init__.py +1 -1
- exonware/xwsystem/operations/base.py +1 -1
- exonware/xwsystem/operations/defs.py +1 -1
- exonware/xwsystem/operations/diff.py +1 -1
- exonware/xwsystem/operations/merge.py +1 -1
- exonware/xwsystem/operations/patch.py +1 -1
- exonware/xwsystem/patterns/base.py +1 -1
- exonware/xwsystem/patterns/contracts.py +1 -1
- exonware/xwsystem/patterns/defs.py +1 -1
- exonware/xwsystem/patterns/errors.py +1 -1
- exonware/xwsystem/patterns/registry.py +1 -1
- exonware/xwsystem/plugins/__init__.py +1 -1
- exonware/xwsystem/plugins/base.py +1 -1
- exonware/xwsystem/plugins/contracts.py +1 -1
- exonware/xwsystem/plugins/defs.py +1 -1
- exonware/xwsystem/plugins/errors.py +1 -1
- exonware/xwsystem/runtime/__init__.py +1 -1
- exonware/xwsystem/runtime/base.py +1 -1
- exonware/xwsystem/runtime/contracts.py +1 -1
- exonware/xwsystem/runtime/defs.py +1 -1
- exonware/xwsystem/runtime/env.py +1 -1
- exonware/xwsystem/runtime/errors.py +1 -1
- exonware/xwsystem/runtime/reflection.py +1 -1
- exonware/xwsystem/security/auth.py +1 -1
- exonware/xwsystem/security/base.py +1 -1
- exonware/xwsystem/security/contracts.py +1 -1
- exonware/xwsystem/security/crypto.py +1 -1
- exonware/xwsystem/security/defs.py +1 -1
- exonware/xwsystem/security/errors.py +1 -1
- exonware/xwsystem/security/hazmat.py +1 -1
- exonware/xwsystem/shared/__init__.py +1 -1
- exonware/xwsystem/shared/base.py +1 -1
- exonware/xwsystem/shared/contracts.py +1 -1
- exonware/xwsystem/shared/defs.py +1 -1
- exonware/xwsystem/shared/errors.py +1 -1
- exonware/xwsystem/structures/base.py +1 -1
- exonware/xwsystem/structures/contracts.py +1 -1
- exonware/xwsystem/structures/defs.py +1 -1
- exonware/xwsystem/structures/errors.py +1 -1
- exonware/xwsystem/threading/async_primitives.py +1 -1
- exonware/xwsystem/threading/base.py +1 -1
- exonware/xwsystem/threading/contracts.py +1 -1
- exonware/xwsystem/threading/defs.py +1 -1
- exonware/xwsystem/threading/errors.py +1 -1
- exonware/xwsystem/utils/base.py +1 -1
- exonware/xwsystem/utils/contracts.py +1 -1
- exonware/xwsystem/utils/dt/__init__.py +1 -1
- exonware/xwsystem/utils/dt/base.py +1 -1
- exonware/xwsystem/utils/dt/contracts.py +1 -1
- exonware/xwsystem/utils/dt/defs.py +1 -1
- exonware/xwsystem/utils/dt/errors.py +1 -1
- exonware/xwsystem/utils/dt/formatting.py +1 -1
- exonware/xwsystem/utils/dt/humanize.py +1 -1
- exonware/xwsystem/utils/dt/parsing.py +1 -1
- exonware/xwsystem/utils/dt/timezone_utils.py +1 -1
- exonware/xwsystem/utils/errors.py +1 -1
- exonware/xwsystem/utils/utils_contracts.py +1 -1
- exonware/xwsystem/validation/__init__.py +1 -1
- exonware/xwsystem/validation/base.py +1 -1
- exonware/xwsystem/validation/contracts.py +1 -1
- exonware/xwsystem/validation/declarative.py +1 -1
- exonware/xwsystem/validation/defs.py +1 -1
- exonware/xwsystem/validation/errors.py +1 -1
- exonware/xwsystem/validation/fluent_validator.py +1 -1
- exonware/xwsystem/version.py +4 -4
- {exonware_xwsystem-0.0.1.411.dist-info → exonware_xwsystem-0.1.0.1.dist-info}/METADATA +3 -3
- exonware_xwsystem-0.1.0.1.dist-info/RECORD +284 -0
- exonware/xwsystem/caching/USAGE_GUIDE.md +0 -779
- exonware/xwsystem/utils/test_runner.py +0 -526
- exonware_xwsystem-0.0.1.411.dist-info/RECORD +0 -274
- {exonware_xwsystem-0.0.1.411.dist-info → exonware_xwsystem-0.1.0.1.dist-info}/WHEEL +0 -0
- {exonware_xwsystem-0.0.1.411.dist-info → exonware_xwsystem-0.1.0.1.dist-info}/licenses/LICENSE +0 -0
exonware/xwsystem/io/base.py
CHANGED
|
@@ -16,7 +16,7 @@ lazy, paged, and atomic access features without re-implementing I/O logic.
|
|
|
16
16
|
Company: eXonware.com
|
|
17
17
|
Author: Eng. Muhammad AlShehri
|
|
18
18
|
Email: connect@exonware.com
|
|
19
|
-
Version: 0.0.1
|
|
19
|
+
Version: 0.1.0.1
|
|
20
20
|
Generation Date: 15-Dec-2025
|
|
21
21
|
"""
|
|
22
22
|
|
|
@@ -29,15 +29,85 @@ from abc import ABC, abstractmethod
|
|
|
29
29
|
import json
|
|
30
30
|
import os
|
|
31
31
|
import tempfile
|
|
32
|
+
import multiprocessing as mp
|
|
33
|
+
from concurrent.futures import ProcessPoolExecutor, as_completed
|
|
32
34
|
|
|
33
35
|
from .serialization.auto_serializer import AutoSerializer
|
|
34
|
-
from
|
|
36
|
+
from exonware.xwsystem.config.logging_setup import get_logger
|
|
37
|
+
from exonware.xwsystem.config.performance import get_performance_config
|
|
35
38
|
|
|
36
39
|
|
|
37
40
|
logger = get_logger(__name__)
|
|
38
41
|
|
|
39
42
|
|
|
40
43
|
JsonMatchFn = Callable[[Any], bool]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _process_chunk_worker(args: tuple[int, int, int, str, str, str | None, int | None, bool]) -> tuple[list[int] | None, dict[str, int], int]:
|
|
47
|
+
"""
|
|
48
|
+
Process a single chunk (runs in worker process).
|
|
49
|
+
|
|
50
|
+
This is a module-level function to make it picklable for multiprocessing.
|
|
51
|
+
"""
|
|
52
|
+
chunk_id, start_offset, end_offset, file_path_str, encoding, id_field_arg, max_id_index_arg, build_line_offsets_arg = args
|
|
53
|
+
chunk_line_offsets: list[int] | None = [] if build_line_offsets_arg else None
|
|
54
|
+
chunk_id_index: dict[str, int] = {}
|
|
55
|
+
lines_processed = 0
|
|
56
|
+
|
|
57
|
+
# Import parser in worker process (can't pickle serializer)
|
|
58
|
+
try:
|
|
59
|
+
from exonware.xwsystem.io.serialization.parsers.registry import get_best_available_parser
|
|
60
|
+
parser = get_best_available_parser()
|
|
61
|
+
except ImportError:
|
|
62
|
+
import json as parser
|
|
63
|
+
|
|
64
|
+
try:
|
|
65
|
+
with open(file_path_str, "rb") as f:
|
|
66
|
+
f.seek(start_offset)
|
|
67
|
+
current_offset = start_offset
|
|
68
|
+
|
|
69
|
+
while current_offset < end_offset:
|
|
70
|
+
line_start = current_offset
|
|
71
|
+
line = f.readline()
|
|
72
|
+
|
|
73
|
+
if not line:
|
|
74
|
+
break
|
|
75
|
+
|
|
76
|
+
current_offset = f.tell()
|
|
77
|
+
|
|
78
|
+
# Skip if we've gone past the end
|
|
79
|
+
if line_start >= end_offset:
|
|
80
|
+
break
|
|
81
|
+
|
|
82
|
+
# Optimize: Check empty lines early (match example code pattern)
|
|
83
|
+
raw = line.strip()
|
|
84
|
+
if not raw:
|
|
85
|
+
continue
|
|
86
|
+
|
|
87
|
+
# Track line offset if requested, calculate line_idx once
|
|
88
|
+
if build_line_offsets_arg:
|
|
89
|
+
chunk_line_offsets.append(line_start)
|
|
90
|
+
line_idx = len(chunk_line_offsets) - 1
|
|
91
|
+
else:
|
|
92
|
+
line_idx = lines_processed
|
|
93
|
+
|
|
94
|
+
if id_field_arg and (max_id_index_arg is None or len(chunk_id_index) < max_id_index_arg):
|
|
95
|
+
try:
|
|
96
|
+
# Parser accepts bytes directly (hybrid parser handles it)
|
|
97
|
+
obj = parser.loads(raw)
|
|
98
|
+
if isinstance(obj, dict) and id_field_arg in obj:
|
|
99
|
+
id_val = str(obj[id_field_arg])
|
|
100
|
+
chunk_id_index[id_val] = line_idx
|
|
101
|
+
except Exception:
|
|
102
|
+
# Skip invalid lines (best-effort indexing)
|
|
103
|
+
pass
|
|
104
|
+
|
|
105
|
+
lines_processed += 1
|
|
106
|
+
except Exception as e:
|
|
107
|
+
# Can't use logger in worker process, just pass
|
|
108
|
+
pass
|
|
109
|
+
|
|
110
|
+
return (chunk_line_offsets, chunk_id_index, lines_processed)
|
|
41
111
|
JsonUpdateFn = Callable[[Any], Any]
|
|
42
112
|
|
|
43
113
|
|
|
@@ -297,22 +367,78 @@ class NDJSONDataOperations(ADataOperations):
|
|
|
297
367
|
encoding: str = "utf-8",
|
|
298
368
|
id_field: str | None = None,
|
|
299
369
|
max_id_index: int | None = None,
|
|
370
|
+
use_parallel: bool | None = None,
|
|
371
|
+
num_workers: int | None = None,
|
|
372
|
+
chunk_size_mb: int = 100,
|
|
373
|
+
build_line_offsets: bool = True,
|
|
300
374
|
) -> JsonIndex:
|
|
301
375
|
"""
|
|
302
376
|
One-time full scan to build an index:
|
|
303
377
|
- line_offsets: byte offset of each JSON line
|
|
304
378
|
- optional id_index: obj[id_field] -> line_number
|
|
379
|
+
|
|
380
|
+
Args:
|
|
381
|
+
file_path: Path to JSONL file
|
|
382
|
+
encoding: File encoding (default: utf-8)
|
|
383
|
+
id_field: Optional field name to build id_index
|
|
384
|
+
max_id_index: Maximum entries in id_index (None = unlimited)
|
|
385
|
+
use_parallel: Enable parallel processing (None = auto-detect based on file size)
|
|
386
|
+
num_workers: Number of worker processes (None = CPU count)
|
|
387
|
+
chunk_size_mb: Chunk size in MB for parallel processing (default: 100MB)
|
|
388
|
+
build_line_offsets: If True, build line_offsets list (default: True, set False for faster id_index-only builds)
|
|
389
|
+
|
|
390
|
+
Returns:
|
|
391
|
+
JsonIndex with line_offsets (if build_line_offsets=True) and optional id_index
|
|
305
392
|
"""
|
|
306
393
|
target = Path(file_path)
|
|
307
394
|
if not target.exists():
|
|
308
395
|
raise FileNotFoundError(str(target))
|
|
309
396
|
|
|
310
|
-
|
|
397
|
+
# Auto-detect parallel based on config
|
|
398
|
+
perf_config = get_performance_config()
|
|
399
|
+
if use_parallel is None:
|
|
400
|
+
if not perf_config.enable_parallel_index:
|
|
401
|
+
use_parallel = False
|
|
402
|
+
else:
|
|
403
|
+
file_size_mb = target.stat().st_size / 1_048_576 # 1024 * 1024
|
|
404
|
+
use_parallel = file_size_mb > perf_config.parallel_index_threshold_mb
|
|
405
|
+
|
|
406
|
+
# Use config defaults for workers and chunk size
|
|
407
|
+
if num_workers is None:
|
|
408
|
+
num_workers = perf_config.parallel_index_workers
|
|
409
|
+
if chunk_size_mb == 100: # Only use default if not explicitly set
|
|
410
|
+
chunk_size_mb = perf_config.parallel_index_chunk_size_mb
|
|
411
|
+
|
|
412
|
+
# Use parallel processing if enabled and file is large enough
|
|
413
|
+
if use_parallel:
|
|
414
|
+
try:
|
|
415
|
+
return self._build_index_parallel(
|
|
416
|
+
target,
|
|
417
|
+
encoding=encoding,
|
|
418
|
+
id_field=id_field,
|
|
419
|
+
max_id_index=max_id_index,
|
|
420
|
+
num_workers=num_workers,
|
|
421
|
+
chunk_size_mb=chunk_size_mb,
|
|
422
|
+
build_line_offsets=build_line_offsets,
|
|
423
|
+
)
|
|
424
|
+
except Exception as e:
|
|
425
|
+
logger.warning(f"Parallel index building failed, falling back to single-threaded: {e}")
|
|
426
|
+
# Fall through to single-threaded
|
|
427
|
+
|
|
428
|
+
# Single-threaded implementation (optimized - matches example code exactly)
|
|
429
|
+
line_offsets: list[int] | None = [] if build_line_offsets else None
|
|
311
430
|
id_index: dict[str, int] | None = {} if id_field else None
|
|
312
431
|
|
|
313
432
|
size = target.stat().st_size
|
|
314
433
|
mtime = target.stat().st_mtime
|
|
315
434
|
|
|
435
|
+
# Cache parser instance (matches example code pattern)
|
|
436
|
+
try:
|
|
437
|
+
from exonware.xwsystem.io.serialization.parsers.registry import get_best_available_parser
|
|
438
|
+
parser = get_best_available_parser()
|
|
439
|
+
except ImportError:
|
|
440
|
+
import json as parser
|
|
441
|
+
|
|
316
442
|
offset = 0
|
|
317
443
|
with target.open("rb") as f:
|
|
318
444
|
line_no = 0
|
|
@@ -320,28 +446,168 @@ class NDJSONDataOperations(ADataOperations):
|
|
|
320
446
|
line = f.readline()
|
|
321
447
|
if not line:
|
|
322
448
|
break
|
|
323
|
-
|
|
449
|
+
if build_line_offsets:
|
|
450
|
+
line_offsets.append(offset)
|
|
324
451
|
|
|
325
452
|
if id_index is not None:
|
|
326
453
|
try:
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
)
|
|
454
|
+
# Match example code exactly: strip bytes, parse directly
|
|
455
|
+
raw = line.strip()
|
|
456
|
+
if raw:
|
|
457
|
+
# Parser accepts bytes directly (hybrid parser handles it)
|
|
458
|
+
obj = parser.loads(raw)
|
|
332
459
|
if isinstance(obj, dict) and id_field in obj:
|
|
333
460
|
id_val = str(obj[id_field])
|
|
334
461
|
if max_id_index is None or len(id_index) < max_id_index:
|
|
335
462
|
id_index[id_val] = line_no
|
|
336
463
|
except Exception:
|
|
337
464
|
# Index should be best-effort and robust to bad lines.
|
|
338
|
-
|
|
465
|
+
# Skip invalid lines silently for performance
|
|
466
|
+
pass
|
|
339
467
|
|
|
340
468
|
offset += len(line)
|
|
341
469
|
line_no += 1
|
|
342
470
|
|
|
343
471
|
meta = JsonIndexMeta(path=str(target), size=size, mtime=mtime, version=1)
|
|
344
472
|
return JsonIndex(meta=meta, line_offsets=line_offsets, id_index=id_index)
|
|
473
|
+
|
|
474
|
+
def _build_index_parallel(
    self,
    file_path: Path,
    *,
    encoding: str = "utf-8",
    id_field: str | None = None,
    max_id_index: int | None = None,
    num_workers: int | None = None,
    chunk_size_mb: int = 100,
    build_line_offsets: bool = True,
) -> JsonIndex:
    """
    Parallel index building using multiple CPU cores.

    This is an internal method called by build_index() when use_parallel=True.
    The file is split into byte ranges, each range is indexed by
    ``_process_chunk_worker`` in a separate process, and the per-chunk
    results are merged in file order.

    Args:
        file_path: Path to the JSONL file.
        encoding: Forwarded to the workers.
        id_field: Optional field name used to build ``id_index``.
        max_id_index: Maximum entries in ``id_index`` (None = unlimited).
        num_workers: Number of worker processes. None = the larger of the CPU
            count and one worker per 10MB of file, capped at 61.
        chunk_size_mb: Size of each byte range in MB.
        build_line_offsets: If True, collect the byte offset of every line.

    Returns:
        JsonIndex with ``line_offsets`` (or None) and ``id_index`` (or None).

    Raises:
        ValueError: If ``chunk_size_mb`` or ``num_workers`` is not positive, or
            the file is smaller than two chunks. build_index() catches this
            and falls back to the single-threaded path.
        Exception: Any error raised by a worker is logged and re-raised.
    """
    # Snapshot size/mtime *before* reading, so a file modified while it is
    # being indexed does not end up with metadata newer than the index.
    stat = file_path.stat()
    file_size = stat.st_size

    if chunk_size_mb <= 0:
        # A zero-sized chunk would make the splitting below loop forever.
        raise ValueError("chunk_size_mb must be positive")
    chunk_size_bytes = chunk_size_mb * 1_048_576  # 1024 * 1024

    if num_workers is None:
        # 1 worker per 10MB, but at least one per CPU. ProcessPoolExecutor
        # rejects max_workers > 61 on Windows, so the cap is applied last.
        calculated_workers = int(file_size / 1_048_576 / 10)
        num_workers = min(61, max(mp.cpu_count(), calculated_workers))
    if num_workers < 1:
        raise ValueError("num_workers must be at least 1")

    # If file is too small, signal the caller to fall back to single-threaded.
    if file_size < chunk_size_bytes * 2:
        raise ValueError("File too small for parallel processing")

    # Split file into (chunk_id, start, end) byte ranges.
    chunks = [
        (chunk_id, start, min(start + chunk_size_bytes, file_size))
        for chunk_id, start in enumerate(range(0, file_size, chunk_size_bytes))
    ]

    # Limit number of chunks by merging runs of adjacent ranges.
    if len(chunks) > num_workers * 2:
        step = max(1, len(chunks) // num_workers)
        merged_chunks = []
        for i in range(0, len(chunks), step):
            chunk_group = chunks[i:i + step]
            merged_chunks.append((chunk_group[0][0], chunk_group[0][1], chunk_group[-1][2]))
        chunks = merged_chunks

    # No point in starting more processes than there are chunks to hand out.
    pool_size = min(num_workers, len(chunks))
    logger.debug(f"Processing {len(chunks)} chunks with {pool_size} workers")

    # Prepare arguments for worker processes (plain picklable tuples).
    chunk_args = [
        (chunk[0], chunk[1], chunk[2], str(file_path), encoding, id_field, max_id_index, build_line_offsets)
        for chunk in chunks
    ]

    # chunk_id -> (line_offsets | None, id_index, lines_processed)
    chunk_results: dict[int, tuple[list[int] | None, dict[str, int], int]] = {}
    with ProcessPoolExecutor(max_workers=pool_size) as executor:
        futures = {
            executor.submit(_process_chunk_worker, args): args[0]
            for args in chunk_args
        }
        for future in as_completed(futures):
            try:
                chunk_results[futures[future]] = future.result()
            except Exception as e:
                logger.warning(f"Chunk processing failed: {e}")
                raise

    # Merge results in file order (chunk ids increase with byte offset).
    line_offsets: list[int] | None = [] if build_line_offsets else None
    id_index: dict[str, int] | None = {} if id_field else None
    base_line = 0  # global line number of the current chunk's first line

    for chunk_id in sorted(chunk_results):
        chunk_offsets, chunk_ids, chunk_lines = chunk_results[chunk_id]

        if line_offsets is not None and chunk_offsets:
            line_offsets.extend(chunk_offsets)

        if id_index is not None and chunk_ids:
            if max_id_index is None:
                # Fast path: no limit, rebase the chunk-relative line numbers.
                id_index.update({id_val: base_line + rel_line for id_val, rel_line in chunk_ids.items()})
            else:
                for id_val, rel_line in chunk_ids.items():
                    if len(id_index) >= max_id_index:
                        break  # limit reached; nothing further is added
                    id_index[id_val] = base_line + rel_line

        # Advance by the worker's exact line count. This is valid whether or
        # not line offsets were collected, so no per-chunk estimate is needed.
        base_line += chunk_lines

    meta = JsonIndexMeta(path=str(file_path), size=file_size, mtime=stat.st_mtime, version=1)
    return JsonIndex(meta=meta, line_offsets=line_offsets, id_index=id_index)
|
|
345
611
|
|
|
346
612
|
def indexed_get_by_line(
|
|
347
613
|
self,
|
exonware/xwsystem/io/defs.py
CHANGED
exonware/xwsystem/io/errors.py
CHANGED
exonware/xwsystem/io/facade.py
CHANGED