exonware-xwsystem 0.0.1.410__py3-none-any.whl → 0.1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- exonware/__init__.py +1 -1
- exonware/conf.py +1 -1
- exonware/xwsystem/__init__.py +2 -2
- exonware/xwsystem/caching/__init__.py +1 -1
- exonware/xwsystem/caching/base.py +2 -2
- exonware/xwsystem/caching/bloom_cache.py +2 -2
- exonware/xwsystem/caching/cache_manager.py +1 -1
- exonware/xwsystem/caching/conditional.py +2 -2
- exonware/xwsystem/caching/contracts.py +1 -1
- exonware/xwsystem/caching/decorators.py +2 -2
- exonware/xwsystem/caching/defs.py +1 -1
- exonware/xwsystem/caching/disk_cache.py +1 -1
- exonware/xwsystem/caching/distributed.py +1 -1
- exonware/xwsystem/caching/errors.py +1 -1
- exonware/xwsystem/caching/events.py +2 -2
- exonware/xwsystem/caching/eviction_strategies.py +1 -1
- exonware/xwsystem/caching/fluent.py +1 -1
- exonware/xwsystem/caching/integrity.py +1 -1
- exonware/xwsystem/caching/lfu_cache.py +2 -2
- exonware/xwsystem/caching/lfu_optimized.py +3 -3
- exonware/xwsystem/caching/lru_cache.py +2 -2
- exonware/xwsystem/caching/memory_bounded.py +2 -2
- exonware/xwsystem/caching/metrics_exporter.py +2 -2
- exonware/xwsystem/caching/observable_cache.py +1 -1
- exonware/xwsystem/caching/pluggable_cache.py +2 -2
- exonware/xwsystem/caching/rate_limiter.py +1 -1
- exonware/xwsystem/caching/read_through.py +2 -2
- exonware/xwsystem/caching/secure_cache.py +1 -1
- exonware/xwsystem/caching/serializable.py +2 -2
- exonware/xwsystem/caching/stats.py +1 -1
- exonware/xwsystem/caching/tagging.py +2 -2
- exonware/xwsystem/caching/ttl_cache.py +1 -1
- exonware/xwsystem/caching/two_tier_cache.py +1 -1
- exonware/xwsystem/caching/utils.py +1 -1
- exonware/xwsystem/caching/validation.py +1 -1
- exonware/xwsystem/caching/warming.py +2 -2
- exonware/xwsystem/caching/write_behind.py +2 -2
- exonware/xwsystem/cli/__init__.py +1 -1
- exonware/xwsystem/cli/args.py +1 -1
- exonware/xwsystem/cli/base.py +1 -1
- exonware/xwsystem/cli/colors.py +1 -1
- exonware/xwsystem/cli/console.py +1 -1
- exonware/xwsystem/cli/contracts.py +1 -1
- exonware/xwsystem/cli/defs.py +1 -1
- exonware/xwsystem/cli/errors.py +1 -1
- exonware/xwsystem/cli/progress.py +1 -1
- exonware/xwsystem/cli/prompts.py +1 -1
- exonware/xwsystem/cli/tables.py +1 -1
- exonware/xwsystem/config/__init__.py +1 -1
- exonware/xwsystem/config/base.py +2 -2
- exonware/xwsystem/config/contracts.py +1 -1
- exonware/xwsystem/config/defaults.py +1 -1
- exonware/xwsystem/config/defs.py +1 -1
- exonware/xwsystem/config/errors.py +2 -2
- exonware/xwsystem/config/logging.py +1 -1
- exonware/xwsystem/config/logging_setup.py +2 -2
- exonware/xwsystem/config/performance.py +115 -388
- exonware/xwsystem/http_client/__init__.py +1 -1
- exonware/xwsystem/http_client/advanced_client.py +2 -2
- exonware/xwsystem/http_client/base.py +2 -2
- exonware/xwsystem/http_client/client.py +2 -2
- exonware/xwsystem/http_client/contracts.py +1 -1
- exonware/xwsystem/http_client/defs.py +1 -1
- exonware/xwsystem/http_client/errors.py +2 -2
- exonware/xwsystem/io/__init__.py +1 -1
- exonware/xwsystem/io/archive/__init__.py +1 -1
- exonware/xwsystem/io/archive/archive.py +1 -1
- exonware/xwsystem/io/archive/archive_files.py +1 -1
- exonware/xwsystem/io/archive/archivers.py +2 -2
- exonware/xwsystem/io/archive/base.py +6 -6
- exonware/xwsystem/io/archive/codec_integration.py +1 -1
- exonware/xwsystem/io/archive/compression.py +1 -1
- exonware/xwsystem/io/archive/formats/__init__.py +1 -1
- exonware/xwsystem/io/archive/formats/brotli_format.py +6 -3
- exonware/xwsystem/io/archive/formats/lz4_format.py +6 -3
- exonware/xwsystem/io/archive/formats/rar.py +6 -3
- exonware/xwsystem/io/archive/formats/sevenzip.py +6 -3
- exonware/xwsystem/io/archive/formats/squashfs_format.py +1 -1
- exonware/xwsystem/io/archive/formats/tar.py +1 -1
- exonware/xwsystem/io/archive/formats/wim_format.py +6 -3
- exonware/xwsystem/io/archive/formats/zip.py +1 -1
- exonware/xwsystem/io/archive/formats/zpaq_format.py +1 -1
- exonware/xwsystem/io/archive/formats/zstandard.py +6 -3
- exonware/xwsystem/io/base.py +1 -1
- exonware/xwsystem/io/codec/__init__.py +1 -1
- exonware/xwsystem/io/codec/base.py +6 -6
- exonware/xwsystem/io/codec/contracts.py +1 -1
- exonware/xwsystem/io/codec/registry.py +5 -5
- exonware/xwsystem/io/common/__init__.py +1 -1
- exonware/xwsystem/io/common/base.py +1 -1
- exonware/xwsystem/io/common/lock.py +1 -1
- exonware/xwsystem/io/common/watcher.py +1 -1
- exonware/xwsystem/io/contracts.py +1 -1
- exonware/xwsystem/io/data_operations.py +746 -0
- exonware/xwsystem/io/defs.py +1 -1
- exonware/xwsystem/io/errors.py +1 -1
- exonware/xwsystem/io/facade.py +2 -2
- exonware/xwsystem/io/file/__init__.py +1 -1
- exonware/xwsystem/io/file/base.py +1 -1
- exonware/xwsystem/io/file/conversion.py +1 -1
- exonware/xwsystem/io/file/file.py +8 -6
- exonware/xwsystem/io/file/paged_source.py +8 -1
- exonware/xwsystem/io/file/paging/__init__.py +1 -1
- exonware/xwsystem/io/file/paging/byte_paging.py +1 -1
- exonware/xwsystem/io/file/paging/line_paging.py +1 -1
- exonware/xwsystem/io/file/paging/record_paging.py +1 -1
- exonware/xwsystem/io/file/paging/registry.py +4 -4
- exonware/xwsystem/io/file/source.py +20 -9
- exonware/xwsystem/io/filesystem/__init__.py +1 -1
- exonware/xwsystem/io/filesystem/base.py +1 -1
- exonware/xwsystem/io/filesystem/local.py +9 -1
- exonware/xwsystem/io/folder/__init__.py +1 -1
- exonware/xwsystem/io/folder/base.py +1 -1
- exonware/xwsystem/io/folder/folder.py +2 -2
- exonware/xwsystem/io/serialization/__init__.py +1 -1
- exonware/xwsystem/io/serialization/auto_serializer.py +52 -39
- exonware/xwsystem/io/serialization/base.py +165 -1
- exonware/xwsystem/io/serialization/contracts.py +88 -1
- exonware/xwsystem/io/serialization/defs.py +1 -1
- exonware/xwsystem/io/serialization/errors.py +1 -1
- exonware/xwsystem/io/serialization/flyweight.py +10 -10
- exonware/xwsystem/io/serialization/format_detector.py +8 -5
- exonware/xwsystem/io/serialization/formats/__init__.py +1 -1
- exonware/xwsystem/io/serialization/formats/binary/bson.py +1 -1
- exonware/xwsystem/io/serialization/formats/binary/cbor.py +1 -1
- exonware/xwsystem/io/serialization/formats/binary/marshal.py +1 -1
- exonware/xwsystem/io/serialization/formats/binary/msgpack.py +1 -1
- exonware/xwsystem/io/serialization/formats/binary/pickle.py +1 -1
- exonware/xwsystem/io/serialization/formats/binary/plistlib.py +1 -1
- exonware/xwsystem/io/serialization/formats/database/dbm.py +53 -1
- exonware/xwsystem/io/serialization/formats/database/shelve.py +48 -1
- exonware/xwsystem/io/serialization/formats/database/sqlite3.py +85 -1
- exonware/xwsystem/io/serialization/formats/text/append_only_log.py +201 -0
- exonware/xwsystem/io/serialization/formats/text/configparser.py +1 -1
- exonware/xwsystem/io/serialization/formats/text/csv.py +1 -1
- exonware/xwsystem/io/serialization/formats/text/formdata.py +1 -1
- exonware/xwsystem/io/serialization/formats/text/json.py +43 -20
- exonware/xwsystem/io/serialization/formats/text/json5.py +7 -5
- exonware/xwsystem/io/serialization/formats/text/jsonlines.py +316 -22
- exonware/xwsystem/io/serialization/formats/text/multipart.py +1 -1
- exonware/xwsystem/io/serialization/formats/text/toml.py +19 -3
- exonware/xwsystem/io/serialization/formats/text/xml.py +8 -1
- exonware/xwsystem/io/serialization/formats/text/yaml.py +52 -2
- exonware/xwsystem/io/serialization/parsers/__init__.py +15 -0
- exonware/xwsystem/io/serialization/parsers/base.py +59 -0
- exonware/xwsystem/io/serialization/parsers/hybrid_parser.py +61 -0
- exonware/xwsystem/io/serialization/parsers/msgspec_parser.py +45 -0
- exonware/xwsystem/io/serialization/parsers/orjson_direct_parser.py +53 -0
- exonware/xwsystem/io/serialization/parsers/orjson_parser.py +59 -0
- exonware/xwsystem/io/serialization/parsers/pysimdjson_parser.py +51 -0
- exonware/xwsystem/io/serialization/parsers/rapidjson_parser.py +50 -0
- exonware/xwsystem/io/serialization/parsers/registry.py +90 -0
- exonware/xwsystem/io/serialization/parsers/standard.py +43 -0
- exonware/xwsystem/io/serialization/parsers/ujson_parser.py +50 -0
- exonware/xwsystem/io/serialization/registry.py +1 -1
- exonware/xwsystem/io/serialization/serializer.py +175 -3
- exonware/xwsystem/io/serialization/utils/__init__.py +1 -1
- exonware/xwsystem/io/serialization/utils/path_ops.py +1 -1
- exonware/xwsystem/io/stream/__init__.py +1 -1
- exonware/xwsystem/io/stream/async_operations.py +1 -1
- exonware/xwsystem/io/stream/base.py +1 -1
- exonware/xwsystem/io/stream/codec_io.py +1 -1
- exonware/xwsystem/ipc/async_fabric.py +1 -2
- exonware/xwsystem/ipc/base.py +2 -2
- exonware/xwsystem/ipc/contracts.py +2 -2
- exonware/xwsystem/ipc/defs.py +1 -1
- exonware/xwsystem/ipc/errors.py +2 -2
- exonware/xwsystem/ipc/pipes.py +2 -2
- exonware/xwsystem/ipc/shared_memory.py +2 -2
- exonware/xwsystem/monitoring/base.py +2 -2
- exonware/xwsystem/monitoring/contracts.py +1 -1
- exonware/xwsystem/monitoring/defs.py +1 -1
- exonware/xwsystem/monitoring/error_recovery.py +2 -2
- exonware/xwsystem/monitoring/errors.py +2 -2
- exonware/xwsystem/monitoring/memory_monitor.py +1 -1
- exonware/xwsystem/monitoring/performance_manager_generic.py +2 -2
- exonware/xwsystem/monitoring/performance_validator.py +1 -1
- exonware/xwsystem/monitoring/system_monitor.py +2 -2
- exonware/xwsystem/monitoring/tracing.py +2 -2
- exonware/xwsystem/monitoring/tracker.py +1 -1
- exonware/xwsystem/operations/__init__.py +1 -1
- exonware/xwsystem/operations/base.py +1 -1
- exonware/xwsystem/operations/defs.py +1 -1
- exonware/xwsystem/operations/diff.py +1 -1
- exonware/xwsystem/operations/merge.py +1 -1
- exonware/xwsystem/operations/patch.py +1 -1
- exonware/xwsystem/patterns/base.py +2 -2
- exonware/xwsystem/patterns/context_manager.py +2 -2
- exonware/xwsystem/patterns/contracts.py +9 -9
- exonware/xwsystem/patterns/defs.py +1 -1
- exonware/xwsystem/patterns/dynamic_facade.py +8 -8
- exonware/xwsystem/patterns/errors.py +5 -5
- exonware/xwsystem/patterns/handler_factory.py +6 -6
- exonware/xwsystem/patterns/object_pool.py +7 -7
- exonware/xwsystem/patterns/registry.py +3 -3
- exonware/xwsystem/plugins/__init__.py +1 -1
- exonware/xwsystem/plugins/base.py +5 -5
- exonware/xwsystem/plugins/contracts.py +5 -5
- exonware/xwsystem/plugins/defs.py +1 -1
- exonware/xwsystem/plugins/errors.py +4 -4
- exonware/xwsystem/runtime/__init__.py +1 -1
- exonware/xwsystem/runtime/base.py +6 -6
- exonware/xwsystem/runtime/contracts.py +6 -6
- exonware/xwsystem/runtime/defs.py +1 -1
- exonware/xwsystem/runtime/env.py +2 -2
- exonware/xwsystem/runtime/errors.py +1 -1
- exonware/xwsystem/runtime/reflection.py +8 -8
- exonware/xwsystem/security/auth.py +1 -1
- exonware/xwsystem/security/base.py +2 -2
- exonware/xwsystem/security/contracts.py +1 -1
- exonware/xwsystem/security/crypto.py +2 -2
- exonware/xwsystem/security/defs.py +1 -1
- exonware/xwsystem/security/errors.py +2 -2
- exonware/xwsystem/security/hazmat.py +2 -2
- exonware/xwsystem/shared/__init__.py +1 -1
- exonware/xwsystem/shared/base.py +1 -1
- exonware/xwsystem/shared/contracts.py +1 -1
- exonware/xwsystem/shared/defs.py +1 -1
- exonware/xwsystem/shared/errors.py +1 -1
- exonware/xwsystem/structures/__init__.py +1 -1
- exonware/xwsystem/structures/base.py +2 -2
- exonware/xwsystem/structures/contracts.py +1 -1
- exonware/xwsystem/structures/defs.py +1 -1
- exonware/xwsystem/structures/errors.py +2 -2
- exonware/xwsystem/threading/async_primitives.py +2 -2
- exonware/xwsystem/threading/base.py +2 -2
- exonware/xwsystem/threading/contracts.py +1 -1
- exonware/xwsystem/threading/defs.py +1 -1
- exonware/xwsystem/threading/errors.py +2 -2
- exonware/xwsystem/threading/safe_factory.py +6 -6
- exonware/xwsystem/utils/base.py +2 -2
- exonware/xwsystem/utils/contracts.py +1 -1
- exonware/xwsystem/utils/dt/__init__.py +1 -1
- exonware/xwsystem/utils/dt/base.py +2 -2
- exonware/xwsystem/utils/dt/contracts.py +1 -1
- exonware/xwsystem/utils/dt/defs.py +1 -1
- exonware/xwsystem/utils/dt/errors.py +2 -2
- exonware/xwsystem/utils/dt/formatting.py +1 -1
- exonware/xwsystem/utils/dt/humanize.py +2 -2
- exonware/xwsystem/utils/dt/parsing.py +1 -1
- exonware/xwsystem/utils/dt/timezone_utils.py +1 -1
- exonware/xwsystem/utils/errors.py +2 -2
- exonware/xwsystem/utils/utils_contracts.py +1 -1
- exonware/xwsystem/validation/__init__.py +1 -1
- exonware/xwsystem/validation/base.py +15 -15
- exonware/xwsystem/validation/contracts.py +1 -1
- exonware/xwsystem/validation/data_validator.py +10 -0
- exonware/xwsystem/validation/declarative.py +9 -9
- exonware/xwsystem/validation/defs.py +1 -1
- exonware/xwsystem/validation/errors.py +2 -2
- exonware/xwsystem/validation/fluent_validator.py +4 -4
- exonware/xwsystem/version.py +4 -4
- {exonware_xwsystem-0.0.1.410.dist-info → exonware_xwsystem-0.1.0.1.dist-info}/METADATA +3 -3
- exonware_xwsystem-0.1.0.1.dist-info/RECORD +284 -0
- exonware/xwsystem/caching/USAGE_GUIDE.md +0 -779
- exonware/xwsystem/utils/test_runner.py +0 -526
- exonware_xwsystem-0.0.1.410.dist-info/RECORD +0 -273
- {exonware_xwsystem-0.0.1.410.dist-info → exonware_xwsystem-0.1.0.1.dist-info}/WHEEL +0 -0
- {exonware_xwsystem-0.0.1.410.dist-info → exonware_xwsystem-0.1.0.1.dist-info}/licenses/LICENSE +0 -0
exonware/xwsystem/io/data_operations.py (new file)

@@ -0,0 +1,746 @@

```python
#!/usr/bin/env python3
"""
#exonware/xwsystem/src/exonware/xwsystem/io/data_operations.py

Generic data-operations layer for large, file-backed datasets.

This module provides:
- A small indexing model for line-oriented files (e.g. NDJSON / JSONL)
- Streaming read / update helpers with atomic guarantees
- Paging helpers built on top of line offsets

The goal is to expose these capabilities in a format-agnostic way so that
higher-level libraries (xwdata, xwnode, xwentity, etc.) can build powerful
lazy, paged, and atomic access features without re-implementing I/O logic.

Company: eXonware.com
Author: Eng. Muhammad AlShehri
Email: connect@exonware.com
Version: 0.1.0.1
Generation Date: 15-Dec-2025
"""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, Optional
from abc import ABC, abstractmethod
import json
import os
import tempfile
import multiprocessing as mp
from concurrent.futures import ProcessPoolExecutor, as_completed

from .serialization.auto_serializer import AutoSerializer
from exonware.xwsystem.config.logging_setup import get_logger
from exonware.xwsystem.config.performance import get_performance_config


logger = get_logger(__name__)


JsonMatchFn = Callable[[Any], bool]


def _process_chunk_worker(
    args: tuple[int, int, int, str, str, str | None, int | None, bool],
) -> tuple[list[int] | None, dict[str, int], int]:
    """
    Process a single chunk (runs in worker process).

    This is a module-level function to make it picklable for multiprocessing.
    """
    chunk_id, start_offset, end_offset, file_path_str, encoding, id_field_arg, max_id_index_arg, build_line_offsets_arg = args
    chunk_line_offsets: list[int] | None = [] if build_line_offsets_arg else None
    chunk_id_index: dict[str, int] = {}
    lines_processed = 0

    # Import parser in worker process (can't pickle serializer)
    try:
        from exonware.xwsystem.io.serialization.parsers.registry import get_best_available_parser
        parser = get_best_available_parser()
    except ImportError:
        import json as parser

    try:
        with open(file_path_str, "rb") as f:
            f.seek(start_offset)
            current_offset = start_offset

            while current_offset < end_offset:
                line_start = current_offset
                line = f.readline()

                if not line:
                    break

                current_offset = f.tell()

                # Skip if we've gone past the end
                if line_start >= end_offset:
                    break

                # Optimize: Check empty lines early (match example code pattern)
                raw = line.strip()
                if not raw:
                    continue

                # Track line offset if requested, calculate line_idx once
                if build_line_offsets_arg:
                    chunk_line_offsets.append(line_start)
                    line_idx = len(chunk_line_offsets) - 1
                else:
                    line_idx = lines_processed

                if id_field_arg and (max_id_index_arg is None or len(chunk_id_index) < max_id_index_arg):
                    try:
                        # Parser accepts bytes directly (hybrid parser handles it)
                        obj = parser.loads(raw)
                        if isinstance(obj, dict) and id_field_arg in obj:
                            id_val = str(obj[id_field_arg])
                            chunk_id_index[id_val] = line_idx
                    except Exception:
                        # Skip invalid lines (best-effort indexing)
                        pass

                lines_processed += 1
    except Exception:
        # Can't use logger in worker process, just pass
        pass

    return (chunk_line_offsets, chunk_id_index, lines_processed)


JsonUpdateFn = Callable[[Any], Any]


@dataclass
class JsonIndexMeta:
    """
    Minimal metadata for a JSONL/NDJSON index.

    This intentionally mirrors the capabilities used in the x5 examples
    without pulling in any of the example code directly.
    """

    path: str
    size: int
    mtime: float
    version: int = 1


@dataclass
class JsonIndex:
    """
    Simple index for line-oriented JSON files.

    - line_offsets: byte offset of each JSON line
    - id_index: optional mapping id_value -> line_number
    """

    meta: JsonIndexMeta
    line_offsets: list[int]
    id_index: Optional[dict[str, int]] = None
```
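The `JsonIndexMeta` snapshot (size and mtime) is what lets a caller decide whether a previously built index can still be trusted before relying on its byte offsets. A minimal sketch of such a freshness check, using only the dataclasses above (the helper name `is_index_fresh` is ours, not part of the module):

```python
import os

from exonware.xwsystem.io.data_operations import JsonIndex


def is_index_fresh(index: JsonIndex) -> bool:
    """Hypothetical helper: True if the indexed file is unchanged on disk."""
    try:
        stat = os.stat(index.meta.path)
    except OSError:
        return False
    # The stored offsets are only valid if neither size nor mtime has moved.
    return stat.st_size == index.meta.size and stat.st_mtime == index.meta.mtime
```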
```python
class ADataOperations(ABC):
    """
    Abstract, format-agnostic interface for large, file-backed data operations.

    Concrete implementations may target specific physical layouts
    (NDJSON/JSONL, multi-document YAML, binary record stores, etc.), but MUST
    conform to these semantics:

    - Streaming, record-by-record read with a match predicate.
    - Streaming, atomic update using a temp file + replace pattern.
    - Optional indexing for random access and paging.
    """

    @abstractmethod
    def stream_read(
        self,
        file_path: str | Path,
        match: JsonMatchFn,
        path: Optional[list[object]] = None,
        encoding: str = "utf-8",
    ) -> Any:
        """Return the first record (or sub-path) that matches the predicate."""
        raise NotImplementedError

    @abstractmethod
    def stream_update(
        self,
        file_path: str | Path,
        match: JsonMatchFn,
        updater: JsonUpdateFn,
        *,
        encoding: str = "utf-8",
        newline: str = "\n",
        atomic: bool = True,
    ) -> int:
        """
        Stream-copy the backing store, applying `updater` to matching records.

        MUST use atomic replace semantics when `atomic=True`.
        Returns number of updated records.
        """
        raise NotImplementedError

    @abstractmethod
    def build_index(
        self,
        file_path: str | Path,
        *,
        encoding: str = "utf-8",
        id_field: str | None = None,
        max_id_index: int | None = None,
    ) -> JsonIndex:
        """Build an index structure suitable for random access and paging."""
        raise NotImplementedError

    @abstractmethod
    def indexed_get_by_line(
        self,
        file_path: str | Path,
        line_number: int,
        *,
        encoding: str = "utf-8",
        index: Optional[JsonIndex] = None,
    ) -> Any:
        """Random-access a specific logical record by its index position."""
        raise NotImplementedError

    @abstractmethod
    def indexed_get_by_id(
        self,
        file_path: str | Path,
        id_value: Any,
        *,
        encoding: str = "utf-8",
        id_field: str = "id",
        index: Optional[JsonIndex] = None,
    ) -> Any:
        """Random-access a record by logical identifier, with optional index."""
        raise NotImplementedError

    @abstractmethod
    def get_page(
        self,
        file_path: str | Path,
        page_number: int,
        page_size: int,
        *,
        encoding: str = "utf-8",
        index: Optional[JsonIndex] = None,
    ) -> list[Any]:
        """Return a page of logical records using an index for efficiency."""
        raise NotImplementedError
```
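Any other physical layout plugs in by subclassing `ADataOperations` under the same contract. A skeleton under that assumption (the CSV-flavored class name is purely illustrative; only the NDJSON implementation below actually ships):

```python
from exonware.xwsystem.io.data_operations import ADataOperations


class CsvDataOperations(ADataOperations):
    """Hypothetical row-oriented backend; every method must keep the
    streaming, atomic-replace, and best-effort-index semantics above."""

    def stream_read(self, file_path, match, path=None, encoding="utf-8"):
        raise NotImplementedError  # scan row by row, return the first match

    def stream_update(self, file_path, match, updater, *,
                      encoding="utf-8", newline="\n", atomic=True):
        raise NotImplementedError  # temp file + os.replace for atomicity

    def build_index(self, file_path, *, encoding="utf-8",
                    id_field=None, max_id_index=None):
        raise NotImplementedError  # record a byte offset per row

    def indexed_get_by_line(self, file_path, line_number, *,
                            encoding="utf-8", index=None):
        raise NotImplementedError  # seek to the stored offset, parse one row

    def indexed_get_by_id(self, file_path, id_value, *,
                          encoding="utf-8", id_field="id", index=None):
        raise NotImplementedError  # id_index hit, else linear scan

    def get_page(self, file_path, page_number, page_size, *,
                 encoding="utf-8", index=None):
        raise NotImplementedError  # slice of offsets -> parsed rows
```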
```python
class NDJSONDataOperations(ADataOperations):
    """
    Generic data-operations helper for NDJSON / JSONL style files.

    This class is deliberately low-level and works directly with paths and
    native Python data. XWData and other libraries can wrap it to provide
    higher-level, type-agnostic facades.
    """

    def __init__(self, serializer: Optional[AutoSerializer] = None):
        # Reuse xwsystem's AutoSerializer so we do not re-implement parsing.
        self._serializer = serializer or AutoSerializer(default_format="JSON")

    # ------------------------------------------------------------------
    # Streaming read
    # ------------------------------------------------------------------

    def stream_read(
        self,
        file_path: str | Path,
        match: JsonMatchFn,
        path: Optional[list[object]] = None,
        encoding: str = "utf-8",
    ) -> Any:
        """
        Stream a huge NDJSON file and return the first record (or sub-path)
        matching `match`.

        This is intentionally simple and focused:
        - Reads one line at a time
        - Uses AutoSerializer(JSON) for parsing
        - Optional path extraction
        """
        target = Path(file_path)
        if not target.exists():
            raise FileNotFoundError(str(target))

        with target.open("r", encoding=encoding) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                obj = self._serializer.detect_and_deserialize(
                    line, file_path=target, format_hint="JSON"
                )
                if match(obj):
                    return self._extract_path(obj, path)

        raise KeyError("No matching record found")
```
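A usage sketch for the streaming read; the file name and record shape are ours, and any file with one JSON object per line works:

```python
from exonware.xwsystem.io.data_operations import NDJSONDataOperations

ops = NDJSONDataOperations()

# First record whose "status" is "failed"; raises KeyError if nothing matches.
record = ops.stream_read(
    "events.jsonl",  # hypothetical input file
    match=lambda obj: isinstance(obj, dict) and obj.get("status") == "failed",
)

# Same scan, but return only the nested ["user", "email"] of the match.
email = ops.stream_read(
    "events.jsonl",
    match=lambda obj: isinstance(obj, dict) and obj.get("status") == "failed",
    path=["user", "email"],
)
```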
```python
    # ------------------------------------------------------------------
    # Streaming update with atomic replace
    # ------------------------------------------------------------------

    def stream_update(
        self,
        file_path: str | Path,
        match: JsonMatchFn,
        updater: JsonUpdateFn,
        *,
        encoding: str = "utf-8",
        newline: str = "\n",
        atomic: bool = True,
    ) -> int:
        """
        Stream-copy a huge NDJSON file, applying `updater` to records
        where `match(obj)` is True.

        Only matching records are fully materialized. All writes go to a
        temporary file, which is atomically replaced on success.

        Returns the number of updated records.
        """
        target = Path(file_path)
        if not target.exists():
            raise FileNotFoundError(str(target))

        updated = 0
        dir_path = target.parent

        # Write to a temp file in the same directory for atomic replace.
        fd, tmp_path_str = tempfile.mkstemp(
            prefix=f".{target.name}.tmp.", dir=str(dir_path)
        )
        tmp_path = Path(tmp_path_str)

        try:
            with os.fdopen(fd, "w", encoding=encoding, newline=newline) as out_f, target.open(
                "r", encoding=encoding
            ) as in_f:
                for line in in_f:
                    raw = line.rstrip("\n")
                    if not raw:
                        out_f.write(line)
                        continue

                    obj = self._serializer.detect_and_deserialize(
                        raw, file_path=target, format_hint="JSON"
                    )
                    if match(obj):
                        updated_obj = updater(obj)
                        updated_line = json.dumps(updated_obj, ensure_ascii=False)
                        out_f.write(updated_line + newline)
                        updated += 1
                    else:
                        out_f.write(line)

            if atomic:
                os.replace(tmp_path, target)
            else:
                tmp_path.replace(target)

            return updated
        finally:
            # Ensure temp file is removed on error
            if tmp_path.exists():
                try:
                    tmp_path.unlink()
                except OSError:
                    # Best-effort cleanup; do not mask original error.
                    logger.debug("Failed to cleanup temp file %s", tmp_path)
```
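The update path in the same spirit; only matching lines are parsed into full objects and re-serialized, while non-matching lines are copied through verbatim, so their original formatting survives the rewrite (file and field names illustrative):

```python
from exonware.xwsystem.io.data_operations import NDJSONDataOperations

ops = NDJSONDataOperations()

# Archive every record belonging to user 42; the rewrite happens on a temp
# file next to the original, atomically swapped in on success.
count = ops.stream_update(
    "events.jsonl",
    match=lambda obj: isinstance(obj, dict) and obj.get("user_id") == 42,
    updater=lambda obj: {**obj, "archived": True},
)
print(f"{count} records updated")
```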
```python
    # ------------------------------------------------------------------
    # Indexing and paging
    # ------------------------------------------------------------------

    def build_index(
        self,
        file_path: str | Path,
        *,
        encoding: str = "utf-8",
        id_field: str | None = None,
        max_id_index: int | None = None,
        use_parallel: bool | None = None,
        num_workers: int | None = None,
        chunk_size_mb: int = 100,
        build_line_offsets: bool = True,
    ) -> JsonIndex:
        """
        One-time full scan to build an index:
        - line_offsets: byte offset of each JSON line
        - optional id_index: obj[id_field] -> line_number

        Args:
            file_path: Path to JSONL file
            encoding: File encoding (default: utf-8)
            id_field: Optional field name to build id_index
            max_id_index: Maximum entries in id_index (None = unlimited)
            use_parallel: Enable parallel processing (None = auto-detect based on file size)
            num_workers: Number of worker processes (None = CPU count)
            chunk_size_mb: Chunk size in MB for parallel processing (default: 100MB)
            build_line_offsets: If True, build line_offsets list (default: True;
                set False for faster id_index-only builds)

        Returns:
            JsonIndex with line_offsets (if build_line_offsets=True) and optional id_index
        """
        target = Path(file_path)
        if not target.exists():
            raise FileNotFoundError(str(target))

        # Auto-detect parallel based on config
        perf_config = get_performance_config()
        if use_parallel is None:
            if not perf_config.enable_parallel_index:
                use_parallel = False
            else:
                file_size_mb = target.stat().st_size / 1_048_576  # 1024 * 1024
                use_parallel = file_size_mb > perf_config.parallel_index_threshold_mb

        # Use config defaults for workers and chunk size
        if num_workers is None:
            num_workers = perf_config.parallel_index_workers
        if chunk_size_mb == 100:  # Only use default if not explicitly set
            chunk_size_mb = perf_config.parallel_index_chunk_size_mb

        # Use parallel processing if enabled and file is large enough
        if use_parallel:
            try:
                return self._build_index_parallel(
                    target,
                    encoding=encoding,
                    id_field=id_field,
                    max_id_index=max_id_index,
                    num_workers=num_workers,
                    chunk_size_mb=chunk_size_mb,
                    build_line_offsets=build_line_offsets,
                )
            except Exception as e:
                logger.warning(f"Parallel index building failed, falling back to single-threaded: {e}")
                # Fall through to single-threaded

        # Single-threaded implementation (optimized - matches example code exactly)
        line_offsets: list[int] | None = [] if build_line_offsets else None
        id_index: dict[str, int] | None = {} if id_field else None

        size = target.stat().st_size
        mtime = target.stat().st_mtime

        # Cache parser instance (matches example code pattern)
        try:
            from exonware.xwsystem.io.serialization.parsers.registry import get_best_available_parser
            parser = get_best_available_parser()
        except ImportError:
            import json as parser

        offset = 0
        with target.open("rb") as f:
            line_no = 0
            while True:
                line = f.readline()
                if not line:
                    break
                if build_line_offsets:
                    line_offsets.append(offset)

                if id_index is not None:
                    try:
                        # Match example code exactly: strip bytes, parse directly
                        raw = line.strip()
                        if raw:
                            # Parser accepts bytes directly (hybrid parser handles it)
                            obj = parser.loads(raw)
                            if isinstance(obj, dict) and id_field in obj:
                                id_val = str(obj[id_field])
                                if max_id_index is None or len(id_index) < max_id_index:
                                    id_index[id_val] = line_no
                    except Exception:
                        # Index should be best-effort and robust to bad lines;
                        # skip invalid lines silently for performance.
                        pass

                offset += len(line)
                line_no += 1

        meta = JsonIndexMeta(path=str(target), size=size, mtime=mtime, version=1)
        return JsonIndex(meta=meta, line_offsets=line_offsets, id_index=id_index)
```
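Building the index once is the price of admission for the random-access helpers that follow; with `use_parallel` left at `None`, the method defers to the performance config and the file size. A sketch, with the file name and the `id` field ours:

```python
from exonware.xwsystem.io.data_operations import NDJSONDataOperations

ops = NDJSONDataOperations()

index = ops.build_index("events.jsonl", id_field="id")

print(len(index.line_offsets))          # number of lines scanned
print(index.id_index.get("abc-123"))    # line number for that id, if present
```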
```python
    def _build_index_parallel(
        self,
        file_path: Path,
        *,
        encoding: str = "utf-8",
        id_field: str | None = None,
        max_id_index: int | None = None,
        num_workers: int | None = None,
        chunk_size_mb: int = 100,
        build_line_offsets: bool = True,
    ) -> JsonIndex:
        """
        Parallel index building using multiple CPU cores.

        This is an internal method called by build_index() when use_parallel=True.
        """
        if num_workers is None:
            # Optimize: Simple formula - 1 worker per 10MB (capped at the
            # ProcessPoolExecutor limit; max_workers is 61 on Windows).
            file_size_mb = file_path.stat().st_size / 1_048_576  # 1024 * 1024
            calculated_workers = int(file_size_mb / 10)  # 1 worker per 10MB
            # Cap at 61 (ProcessPoolExecutor limit) or CPU count, whichever is higher.
            cpu_count = mp.cpu_count()
            num_workers = max(cpu_count, min(61, calculated_workers))

        file_size = file_path.stat().st_size
        chunk_size_bytes = chunk_size_mb * 1_048_576  # 1024 * 1024

        # If file is too small, fall back to single-threaded
        if file_size < chunk_size_bytes * 2:
            raise ValueError("File too small for parallel processing")

        # Split file into chunks
        chunks = []
        current_offset = 0
        chunk_id = 0

        while current_offset < file_size:
            chunk_end = min(current_offset + chunk_size_bytes, file_size)
            chunks.append((chunk_id, current_offset, chunk_end))
            current_offset = chunk_end
            chunk_id += 1

        # Limit number of chunks
        if len(chunks) > num_workers * 2:
            merged_chunks = []
            for i in range(0, len(chunks), max(1, len(chunks) // num_workers)):
                chunk_group = chunks[i:i + max(1, len(chunks) // num_workers)]
                if chunk_group:
                    merged_chunks.append((
                        chunk_group[0][0],
                        chunk_group[0][1],
                        chunk_group[-1][2]
                    ))
            chunks = merged_chunks

        logger.debug(f"Processing {len(chunks)} chunks with {num_workers} workers")

        # Process chunks in parallel
        line_offsets: list[int] | None = [] if build_line_offsets else None
        id_index: dict[str, int] | None = {} if id_field else None

        # Prepare arguments for worker processes
        chunk_args = [
            (chunk[0], chunk[1], chunk[2], str(file_path), encoding, id_field, max_id_index, build_line_offsets)
            for chunk in chunks
        ]

        # Execute parallel processing
        with ProcessPoolExecutor(max_workers=num_workers) as executor:
            futures = {
                executor.submit(_process_chunk_worker, args): args[0]
                for args in chunk_args
            }

            # Optimize: Use dict for O(1) lookup instead of sorting
            chunk_results_dict: dict[int, tuple[list[int], dict[str, int]]] = {}
            for future in as_completed(futures):
                try:
                    chunk_offsets, chunk_ids, _ = future.result()
                    chunk_id = futures[future]
                    chunk_results_dict[chunk_id] = (chunk_offsets, chunk_ids)
                except Exception as e:
                    logger.warning(f"Chunk processing failed: {e}")
                    raise

        # Merge results (process in order by chunk_id)
        if build_line_offsets:
            # Optimize: Pre-calculate total size for better memory allocation
            total_offsets = sum(len(offsets) if offsets else 0 for offsets, _ in chunk_results_dict.values())
            if total_offsets > 0:
                # Pre-allocate list for better performance
                line_offsets = [0] * total_offsets
                current_idx = 0
            else:
                line_offsets = []
                current_idx = 0
        else:
            current_idx = 0

        for chunk_id in sorted(chunk_results_dict.keys()):
            chunk_offsets, chunk_ids = chunk_results_dict[chunk_id]

            # Merge line_offsets if building them
            if build_line_offsets and chunk_offsets:
                # Optimize: Use slice assignment for faster extend
                if total_offsets > 0:
                    line_offsets[current_idx:current_idx + len(chunk_offsets)] = chunk_offsets
                    base_line = current_idx
                    current_idx += len(chunk_offsets)
                else:
                    base_line = len(line_offsets)
                    line_offsets.extend(chunk_offsets)
            else:
                # Calculate base_line for id_index even without line_offsets
                base_line = current_idx
                if chunk_offsets:
                    current_idx += len(chunk_offsets)
                else:
                    # Estimate: assume average line size if we don't have offsets
                    current_idx += 300  # Rough estimate

            if id_index is not None and chunk_ids:
                # Optimize: Batch update with dict.update() if no limit
                if max_id_index is None:
                    # Fast path: no limit, use dict comprehension + update
                    id_index.update({id_val: base_line + rel_line for id_val, rel_line in chunk_ids.items()})
                else:
                    # Slower path: check limit per item
                    for id_val, rel_line in chunk_ids.items():
                        if len(id_index) < max_id_index:
                            id_index[id_val] = base_line + rel_line

        size = file_path.stat().st_size
        mtime = file_path.stat().st_mtime
        meta = JsonIndexMeta(path=str(file_path), size=size, mtime=mtime, version=1)
        return JsonIndex(meta=meta, line_offsets=line_offsets, id_index=id_index)
```
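To make the sizing concrete: a hypothetical 1 GiB file gives `calculated_workers = int(1024 / 10) = 102`, the `min(61, ...)` cap brings that to 61, and `max(cpu_count, ...)` only raises it further on machines with more than 61 CPUs. The same arithmetic as a standalone check:

```python
import multiprocessing as mp

file_size_mb = 1024                          # hypothetical 1 GiB input
calculated = int(file_size_mb / 10)          # 102: one worker per 10 MB
workers = max(mp.cpu_count(), min(61, calculated))

chunk_size_mb = 100
chunks = -(-file_size_mb // chunk_size_mb)   # ceil(1024 / 100) = 11 chunks
print(workers, chunks)
```

Note that with 11 chunks and a pool sized at 61, at most 11 workers ever receive a task; the chunk-merging branch above only fires when the chunk count exceeds twice the worker count.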
```python
    def indexed_get_by_line(
        self,
        file_path: str | Path,
        line_number: int,
        *,
        encoding: str = "utf-8",
        index: Optional[JsonIndex] = None,
    ) -> Any:
        """
        Random-access a specific record by line_number (0-based) using the index.
        """
        target = Path(file_path)
        if index is None:
            index = self.build_index(target, encoding=encoding)

        if line_number < 0 or line_number >= len(index.line_offsets):
            raise IndexError("line_number out of range")

        offset = index.line_offsets[line_number]
        with target.open("rb") as f:
            f.seek(offset)
            line = f.readline()
        text = line.decode(encoding).strip()
        if not text:
            raise ValueError("Empty line at indexed position")
        return self._serializer.detect_and_deserialize(
            text, file_path=target, format_hint="JSON"
        )

    def indexed_get_by_id(
        self,
        file_path: str | Path,
        id_value: Any,
        *,
        encoding: str = "utf-8",
        id_field: str = "id",
        index: Optional[JsonIndex] = None,
    ) -> Any:
        """
        Random-access a record by logical id using id_index if available.
        Falls back to a linear scan if id_index is missing or incomplete.
        """
        target = Path(file_path)
        if index is None:
            index = self.build_index(target, encoding=encoding, id_field=id_field)

        id_index = index.id_index
        if id_index is not None:
            key = str(id_value)
            if key in id_index:
                return self.indexed_get_by_line(
                    target, id_index[key], encoding=encoding, index=index
                )

        # Fallback: linear scan using stream_read semantics
        def _match(obj: Any) -> bool:
            return isinstance(obj, dict) and obj.get(id_field) == id_value

        return self.stream_read(target, _match, path=None, encoding=encoding)
```
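Passing a prebuilt index explicitly is the intended pattern here; without one, each call pays for a fresh full scan. A sketch with the same hypothetical file:

```python
from exonware.xwsystem.io.data_operations import NDJSONDataOperations

ops = NDJSONDataOperations()
index = ops.build_index("events.jsonl", id_field="id")

tenth = ops.indexed_get_by_line("events.jsonl", 9, index=index)  # 0-based
rec = ops.indexed_get_by_id("events.jsonl", "abc-123", index=index)
```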
```python
    def get_page(
        self,
        file_path: str | Path,
        page_number: int,
        page_size: int,
        *,
        encoding: str = "utf-8",
        index: Optional[JsonIndex] = None,
    ) -> list[Any]:
        """
        Paging helper using the index:
        - page_number: 1-based
        - page_size: number of records per page
        """
        target = Path(file_path)
        if index is None:
            index = self.build_index(target, encoding=encoding)

        if page_number < 1 or page_size <= 0:
            raise ValueError("Invalid page_number or page_size")

        start = (page_number - 1) * page_size
        end = start + page_size

        if start >= len(index.line_offsets):
            return []

        end = min(end, len(index.line_offsets))

        results: list[Any] = []
        with target.open("rb") as f:
            for line_no in range(start, end):
                offset = index.line_offsets[line_no]
                f.seek(offset)
                line = f.readline()
                text = line.decode(encoding).strip()
                if not text:
                    continue
                obj = self._serializer.detect_and_deserialize(
                    text, file_path=target, format_hint="JSON"
                )
                results.append(obj)

        return results
```
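Paging composes the same seek-per-offset trick; page numbers are 1-based, and a page past the end comes back as `[]` rather than raising:

```python
from exonware.xwsystem.io.data_operations import NDJSONDataOperations

ops = NDJSONDataOperations()
index = ops.build_index("events.jsonl")  # hypothetical file, as above

# Page 2 of 100 records each: 0-based lines 100..199.
page = ops.get_page("events.jsonl", page_number=2, page_size=100, index=index)
```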
```python
    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _extract_path(self, obj: Any, path: Optional[list[object]]) -> Any:
        """Extract a nested path like ['user', 'email'] or ['tags', 0]."""
        if not path:
            return obj

        current = obj
        for part in path:
            if isinstance(current, dict) and isinstance(part, str):
                if part not in current:
                    raise KeyError(part)
                current = current[part]
            elif isinstance(current, list) and isinstance(part, int):
                current = current[part]
            else:
                raise KeyError(part)
        return current


__all__ = [
    "JsonIndexMeta",
    "JsonIndex",
    "ADataOperations",
    "NDJSONDataOperations",
]
```
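For reference, the path convention accepted by `stream_read(..., path=...)` is exactly the one `_extract_path` implements: string parts index dicts, integer parts index lists, and anything else raises `KeyError`. A self-contained illustration of that traversal rule (the `extract` function is our inline restatement, not part of the module):

```python
obj = {"user": {"email": "a@b.c"}, "tags": ["x", "y"]}


def extract(current, path):
    """Same traversal rule as _extract_path, inlined for illustration."""
    for part in path:
        if isinstance(current, dict) and isinstance(part, str):
            current = current[part]      # KeyError if the key is missing
        elif isinstance(current, list) and isinstance(part, int):
            current = current[part]      # IndexError if out of range
        else:
            raise KeyError(part)
    return current


assert extract(obj, ["user", "email"]) == "a@b.c"
assert extract(obj, ["tags", 0]) == "x"
```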