chunksmith-cli 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunksmith_cli/__init__.py +3 -0
- chunksmith_cli/__main__.py +41 -0
- chunksmith_cli/agent/__init__.py +1 -0
- chunksmith_cli/agent/agent_display.py +160 -0
- chunksmith_cli/agent/agent_session.py +294 -0
- chunksmith_cli/agent/agent_stream.py +152 -0
- chunksmith_cli/agent/agent_wizard.py +5 -0
- chunksmith_cli/agent_display.py +6 -0
- chunksmith_cli/agent_session.py +6 -0
- chunksmith_cli/agent_stream.py +6 -0
- chunksmith_cli/agent_wizard.py +6 -0
- chunksmith_cli/assets/chunksmith_logo.png +0 -0
- chunksmith_cli/branding.py +6 -0
- chunksmith_cli/config.py +6 -0
- chunksmith_cli/core/__init__.py +21 -0
- chunksmith_cli/core/artifact_layout.py +60 -0
- chunksmith_cli/core/branding.py +137 -0
- chunksmith_cli/core/config.py +32 -0
- chunksmith_cli/core/media_preview.py +72 -0
- chunksmith_cli/core/menu.py +110 -0
- chunksmith_cli/core/menus.py +55 -0
- chunksmith_cli/core/panels.py +24 -0
- chunksmith_cli/core/paths.py +869 -0
- chunksmith_cli/core/prefs_mapper.py +97 -0
- chunksmith_cli/core/saved_catalog.py +180 -0
- chunksmith_cli/core/theme.py +38 -0
- chunksmith_cli/elements_json_prompt.py +6 -0
- chunksmith_cli/json_view.py +6 -0
- chunksmith_cli/media_preview.py +6 -0
- chunksmith_cli/menu.py +6 -0
- chunksmith_cli/menus.py +55 -0
- chunksmith_cli/multi_indexing_wizard.py +3 -0
- chunksmith_cli/outline_browser.py +6 -0
- chunksmith_cli/panels.py +6 -0
- chunksmith_cli/partition_prefs.py +74 -0
- chunksmith_cli/paths.py +6 -0
- chunksmith_cli/pdf_prompt.py +6 -0
- chunksmith_cli/pipelines/__init__.py +1 -0
- chunksmith_cli/pipelines/mapping_validation.py +31 -0
- chunksmith_cli/pipelines/multi_indexing_config.py +35 -0
- chunksmith_cli/pipelines/multi_indexing_prompts.py +375 -0
- chunksmith_cli/pipelines/multi_indexing_runtime.py +38 -0
- chunksmith_cli/pipelines/multi_indexing_storage.py +157 -0
- chunksmith_cli/pipelines/multi_indexing_wizard.py +218 -0
- chunksmith_cli/pipelines/pageindex_wizard.py +140 -0
- chunksmith_cli/pipelines/run_multi.py +21 -0
- chunksmith_cli/prompts/__init__.py +1 -0
- chunksmith_cli/prompts/elements_json_prompt.py +49 -0
- chunksmith_cli/prompts/pdf_prompt.py +37 -0
- chunksmith_cli/saved_catalog.py +6 -0
- chunksmith_cli/theme.py +6 -0
- chunksmith_cli/tree_view.py +6 -0
- chunksmith_cli/view_session.py +6 -0
- chunksmith_cli/views/__init__.py +1 -0
- chunksmith_cli/views/json_view.py +11 -0
- chunksmith_cli/views/outline_browser.py +247 -0
- chunksmith_cli/views/tree_view.py +59 -0
- chunksmith_cli/views/view_session.py +32 -0
- chunksmith_cli/wizard.py +357 -0
- chunksmith_cli-0.4.0.dist-info/METADATA +61 -0
- chunksmith_cli-0.4.0.dist-info/RECORD +65 -0
- chunksmith_cli-0.4.0.dist-info/WHEEL +5 -0
- chunksmith_cli-0.4.0.dist-info/entry_points.txt +2 -0
- chunksmith_cli-0.4.0.dist-info/licenses/LICENSE.vectify +21 -0
- chunksmith_cli-0.4.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,869 @@
|
|
|
1
|
+
"""Path helpers for the CLI (normalization, ``cli/data`` archives)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from datetime import datetime, timezone
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from urllib.parse import unquote, urlparse
|
|
11
|
+
|
|
12
|
+
from chunksmith_cli.core.artifact_layout import ArtifactLayout
|
|
13
|
+
|
|
14
|
+
# Sub-folder names used inside an artifact root (``json/``, ``image/``, ``text/``).
ARTIFACT_SUBDIR_JSON = "json"
ARTIFACT_SUBDIR_IMAGE = "image"
ARTIFACT_SUBDIR_TEXT = "text"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def normalize_path_string(raw: str) -> Path:
    """
    Turn raw user input into a ``Path``.

    Strips surrounding whitespace and one pair of matching outer quotes, maps
    ``file:`` URLs to their percent-decoded path component (dropping the leading
    slash of ``/D:/x`` forms on Windows), and expands a leading ``~``.

    Args:
        raw: Text as typed or pasted by the user; a falsy value is treated as ``""``.

    Returns:
        The normalized path. It is neither resolved nor checked for existence.
    """
    s = (raw or "").strip()
    if s[:1] in ('"', "'") and s[:1] == s[-1:]:
        s = s[1:-1].strip()
    if s.lower().startswith("file:"):
        path_part = unquote(urlparse(s).path or "")
        # Windows: file:///D:/x -> /D:/x
        if os.name == "nt" and len(path_part) >= 3 and path_part[0] == "/" and path_part[2] == ":":
            path_part = path_part.lstrip("/")
        s = path_part or s
    path = Path(s)
    try:
        return path.expanduser()
    except RuntimeError:
        # pathlib raises when "~name" has no resolvable home directory, e.g. a
        # file literally named "~draft.pdf". Keep the text as a literal path
        # instead of crashing on user input.
        return path
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def resolve_pdf_candidate(p: Path, *, cwd: Path | None = None) -> Path:
|
|
38
|
+
"""
|
|
39
|
+
If ``p`` is not an existing file, try ``cwd / p`` when ``p`` is relative.
|
|
40
|
+
Does not guarantee the result exists.
|
|
41
|
+
"""
|
|
42
|
+
if p.is_file():
|
|
43
|
+
return p
|
|
44
|
+
base = cwd or Path.cwd()
|
|
45
|
+
if not p.is_absolute():
|
|
46
|
+
alt = (base / p).resolve()
|
|
47
|
+
if alt.is_file():
|
|
48
|
+
return alt
|
|
49
|
+
return p.resolve() if p.exists() else p
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _cli_package_dir() -> Path:
    """Directory two levels above this module file (the package root holding ``core/``)."""
    module_file = Path(__file__).resolve()
    return module_file.parents[1]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def cli_data_dir() -> Path:
    """Runtime data root: the ``data`` folder directly under the CLI package directory."""
    return _cli_package_dir().joinpath("data")
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def cli_runs_dir() -> Path:
    """Parent folder of the per-run artifact roots: ``runs`` under the CLI data directory."""
    return cli_data_dir().joinpath("runs")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def cli_json_storage_dir() -> Path:
    """Legacy flat JSON archive: ``legacy`` under the CLI data directory (no ``json/`` subfolder)."""
    return cli_data_dir().joinpath("legacy")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def ensure_cli_storage() -> Path:
    """Make sure the data, legacy and runs folders exist; return the legacy flat archive folder."""
    legacy_dir = cli_json_storage_dir()
    for folder in (cli_data_dir(), legacy_dir, cli_runs_dir()):
        folder.mkdir(parents=True, exist_ok=True)
    return legacy_dir
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _storage_search_dirs(storage: Path) -> list[Path]:
    """
    List the folders to scan for saved indexes, in priority order.

    ``storage`` always comes first. When it is the CLI data root or the legacy
    archive, the runs folder is added if present; when it is the legacy archive,
    the data root is added as well if present. Duplicates are dropped while the
    first occurrence keeps its position.
    """
    target = storage.resolve()
    data_root = cli_data_dir().resolve()
    legacy_root = cli_json_storage_dir().resolve()
    runs_root = cli_runs_dir().resolve()

    found: list[Path] = [target]
    if target == legacy_root or target == data_root:
        if runs_root.is_dir():
            found.append(runs_root)
        if target == legacy_root and data_root.is_dir():
            found.append(data_root)

    unique: list[Path] = []
    for folder in found:
        if folder not in unique:
            unique.append(folder)
    return unique
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@dataclass(frozen=True)
|
|
95
|
+
class ResolvedAgentIndex:
|
|
96
|
+
"""Paths passed to ``build_document_index_from_saved``."""
|
|
97
|
+
|
|
98
|
+
pageindex_path: Path | None
|
|
99
|
+
canonical_bundle_path: Path | None
|
|
100
|
+
label: str
|
|
101
|
+
artifact_root: Path | None = None
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def default_project_logs_dir() -> Path | None:
    """Resolved ``logs`` folder next to the CLI package directory, or ``None`` if it is not a directory."""
    candidate = _cli_package_dir().parent / "logs"
    if not candidate.is_dir():
        return None
    return candidate.resolve()
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _artifact_json_dir(root: Path) -> Path | None:
    """
    Locate the folder under ``root`` that holds ``*_canonical_bundle.json`` files.

    ``root`` may itself be the ``json`` folder, may contain one, or may hold the
    bundles directly (flat layout); they are tried in that order. Returns
    ``None`` when ``root`` is not a directory or no bundle is found.
    """
    if not root.is_dir():
        return None

    def _holds_bundle(folder: Path) -> bool:
        return next(folder.glob("*_canonical_bundle.json"), None) is not None

    here = root.resolve()
    if here.name == ARTIFACT_SUBDIR_JSON:
        return here if _holds_bundle(here) else None
    child = here / ARTIFACT_SUBDIR_JSON
    if child.is_dir() and _holds_bundle(child):
        return child
    return here if _holds_bundle(here) else None
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _artifact_root_from_json_dir(json_dir: Path) -> Path:
    """Resolved artifact root: the parent when ``json_dir`` is named ``json``, else ``json_dir`` itself."""
    owner = json_dir.parent if json_dir.name == ARTIFACT_SUBDIR_JSON else json_dir
    return owner.resolve()
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _discover_canonical_bundles(json_dir: Path) -> list[Path]:
    """
    Return the ``*_canonical_bundle.json`` files in ``json_dir``, newest first.

    Files are ordered by modification time, most recent first. Entries that
    cannot be stat'ed (a dangling symlink, or a file removed between listing and
    inspection) are skipped rather than aborting the whole lookup.
    """
    dated: list[tuple[float, Path]] = []
    for bundle in json_dir.glob("*_canonical_bundle.json"):
        try:
            dated.append((bundle.stat().st_mtime, bundle))
        except OSError:
            # Nothing loadable behind this name; ignore it.
            continue
    # Sort on the timestamp only so ties keep their listing order (stable sort).
    dated.sort(key=lambda item: item[0], reverse=True)
    return [bundle for _, bundle in dated]
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _resolved_from_canonical_bundle(
    cb: Path,
    *,
    artifact_root: Path | None = None,
) -> ResolvedAgentIndex:
    """
    Build a ``ResolvedAgentIndex`` for the canonical bundle file ``cb``.

    The label is the file name without ``_canonical_bundle.json``. A sibling
    ``{label}_pageindex.json`` is preferred as the outline; otherwise
    ``{label}.json`` is used when it has the outline shape; otherwise the result
    carries no outline path. ``artifact_root`` defaults to the root derived from
    the bundle's folder.
    """
    tail = "_canonical_bundle.json"
    folder = cb.parent
    label = cb.name[: -len(tail)]
    root = artifact_root or _artifact_root_from_json_dir(folder)

    pageindex_file = folder / f"{label}_pageindex.json"
    if pageindex_file.is_file():
        return ResolvedAgentIndex(pageindex_file.resolve(), cb.resolve(), label, root)

    outline_file = folder / f"{label}.json"
    if outline_file.is_file() and _json_has_outline_shape(outline_file):
        return ResolvedAgentIndex(outline_file.resolve(), cb.resolve(), label, root)

    return ResolvedAgentIndex(None, cb.resolve(), label, root)
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def resolve_artifact_root_input(root: Path) -> ResolvedAgentIndex | None:
    """
    Resolve an artifact folder (``logs``, ``logs/json``, etc.) to its newest bundle.

    Picks the most recently modified ``*_canonical_bundle.json`` together with an
    optional sibling pageindex. Returns ``None`` when the folder holds no bundle.
    """
    json_dir = _artifact_json_dir(root)
    bundles = _discover_canonical_bundles(json_dir) if json_dir is not None else []
    if not bundles:
        return None
    newest = bundles[0]
    return _resolved_from_canonical_bundle(
        newest.resolve(),
        artifact_root=_artifact_root_from_json_dir(json_dir),
    )
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _canonical_bundle_sibling(pageindex_path: Path) -> Path | None:
|
|
174
|
+
name = pageindex_path.name
|
|
175
|
+
if name.endswith("_pageindex.json"):
|
|
176
|
+
stem = name[: -len("_pageindex.json")]
|
|
177
|
+
sibling = pageindex_path.parent / f"{stem}_canonical_bundle.json"
|
|
178
|
+
return sibling if sibling.is_file() else None
|
|
179
|
+
if name.endswith(".json"):
|
|
180
|
+
stem = Path(name).stem
|
|
181
|
+
sibling = pageindex_path.parent / f"{stem}_canonical_bundle.json"
|
|
182
|
+
return sibling if sibling.is_file() else None
|
|
183
|
+
return None
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _json_has_outline_shape(path: Path) -> bool:
    """
    Tell whether ``path`` is a JSON object whose ``structure`` entry is a list.

    Any file that cannot be read, is not valid UTF-8, or is not valid JSON
    simply does not qualify and yields ``False``.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError
        # (a candidate file that is not UTF-8 text must not abort the lookup).
        return False
    return isinstance(data, dict) and isinstance(data.get("structure"), list)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def resolve_agent_index_input(
    raw: str,
    *,
    storage: Path,
    cwd: Path | None = None,
) -> ResolvedAgentIndex | None:
    """
    Resolve user input to index JSON paths.

    Accepts a full/relative path, ``file://…``, a file under ``storage``,
    a multimodal artifact folder (e.g. ``logs`` with ``json/`` + ``image/``),
    a ``*_pageindex.json`` path, outline JSON with ``structure``, a
    ``*_canonical_bundle.json`` (with or without a sibling pageindex), or a stem.

    Args:
        raw: Text entered by the user.
        storage: Archive folder searched for saved indexes.
        cwd: Base for relative input; the process working directory when omitted.

    Returns:
        The first candidate that is an existing canonical bundle or outline-shaped
        JSON file, or ``None`` when the input is empty or nothing matches.
    """
    import glob

    s = (raw or "").strip()
    if not s:
        return None

    base = cwd or Path.cwd()
    candidates: list[Path] = []

    # A directory input is treated as an artifact folder before anything else.
    norm = normalize_path_string(s)
    dir_candidates: list[Path] = []
    if not norm.is_absolute():
        dir_candidates.append((base / norm).resolve())
    try:
        dir_candidates.append(norm.resolve())
    except OSError:
        pass
    for resolved_dir in dir_candidates:
        if resolved_dir.is_dir():
            from_dir = resolve_artifact_root_input(resolved_dir)
            if from_dir is not None:
                return from_dir

    p = resolve_pdf_candidate(norm, cwd=base)
    candidates.append(p)
    candidates.append((storage / p.name).resolve())
    if not Path(s).is_absolute():
        candidates.append((storage / s).resolve())

    stem = s.strip().strip('"').strip("'")
    stem_is_json_name = stem.lower().endswith(".json")
    # The stem is user text: match it literally inside glob patterns so names
    # containing ``[``, ``*`` or ``?`` are still found by prefix.
    stem_literal = glob.escape(stem)
    # ``Path.glob`` rejects anchored (absolute / drive-qualified) patterns.
    stem_is_unanchored = not Path(stem).anchor

    logs = default_project_logs_dir()
    if logs is not None and stem.lower() != "logs":
        candidates.append(logs / s)
        json_dir = logs / ARTIFACT_SUBDIR_JSON
        if json_dir.is_dir() and not stem_is_json_name:
            candidates.append(json_dir / f"{stem}_canonical_bundle.json")
            if stem_is_unanchored:
                candidates.extend(
                    sorted(json_dir.glob(f"{stem_literal}*_canonical_bundle.json"), reverse=True)
                )
    elif logs is not None and stem.lower() == "logs":
        # Prefer cwd-relative ``logs/``; fall back to legacy ``python/logs`` when missing.
        rel_logs = (base / "logs").resolve()
        if not rel_logs.is_dir():
            candidates.append(logs)
            json_dir = logs / ARTIFACT_SUBDIR_JSON
            if json_dir.is_dir():
                candidates.extend(sorted(json_dir.glob("*_canonical_bundle.json"), reverse=True))

    # Bare stem (no .json): try storage artifacts (legacy flat + runs/**/json/)
    if not stem_is_json_name:
        for root in _storage_search_dirs(storage):
            for pattern in (f"{stem}_pageindex.json", f"{stem}.json"):
                candidates.append(root / pattern)
            candidates.extend(sorted(root.glob(f"**/{stem_literal}*_pageindex.json"), reverse=True))

    # First existing, usable file wins; each resolved path is inspected once.
    seen: set[Path] = set()
    for cand in candidates:
        try:
            key = cand.resolve()
        except OSError:
            continue
        if key in seen:
            continue
        seen.add(key)
        if not key.is_file():
            continue
        if key.name.endswith("_canonical_bundle.json"):
            return _resolved_from_canonical_bundle(key.resolve())
        if not _json_has_outline_shape(key):
            continue
        cb = _canonical_bundle_sibling(key)
        label = key.stem.replace("_pageindex", "")
        ar = _artifact_root_from_json_dir(key.parent) if key.parent.name == ARTIFACT_SUBDIR_JSON else None
        return ResolvedAgentIndex(
            key.resolve(),
            cb.resolve() if cb else None,
            label,
            ar,
        )

    return None
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def cli_archive_stamp() -> str:
    """Current UTC time as ``YYYYMMDDTHHMMSSZ``, the suffix shared by JSON artifacts of one CLI run."""
    now_utc = datetime.now(tz=timezone.utc)
    return f"{now_utc:%Y%m%dT%H%M%SZ}"
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def archive_cli_json(
|
|
296
|
+
storage_root: Path,
|
|
297
|
+
pdf_stem: str,
|
|
298
|
+
json_text: str,
|
|
299
|
+
*,
|
|
300
|
+
stamp: str | None = None,
|
|
301
|
+
suffix: str = "",
|
|
302
|
+
) -> Path:
|
|
303
|
+
"""
|
|
304
|
+
Write ``{stem}_{stamp}{suffix}.json`` under ``storage_root`` and return the file path.
|
|
305
|
+
|
|
306
|
+
``suffix`` is inserted before ``.json`` (e.g. ``_canonical_bundle``).
|
|
307
|
+
"""
|
|
308
|
+
stamp = stamp or cli_archive_stamp()
|
|
309
|
+
path = storage_root / f"{pdf_stem}_{stamp}{suffix}.json"
|
|
310
|
+
path.write_text(json_text, encoding="utf-8")
|
|
311
|
+
return path
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def archive_cli_json_object(
    storage_root: Path,
    pdf_stem: str,
    data: object,
    *,
    stamp: str | None = None,
    suffix: str = "",
) -> Path:
    """Serialize ``data`` as indented, non-ASCII-preserving JSON and store it via :func:`archive_cli_json`."""
    rendered = json.dumps(data, indent=2, ensure_ascii=False)
    return archive_cli_json(storage_root, pdf_stem, rendered, suffix=suffix, stamp=stamp)
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def _bundle_json_disk_shape(bundle: dict[str, object]) -> dict[str, object]:
    """
    Copy of ``bundle`` slimmed down for the ``json/`` folder.

    Leaves out ``coded_formate`` (stored as text, see ``text/*.txt``) and
    ``unstructured_elements`` (stored as a sibling JSON). The input is not modified.
    """
    slim = dict(bundle)
    for bulky_key in ("coded_formate", "unstructured_elements"):
        slim.pop(bulky_key, None)
    return slim
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _stamp_from_canonical_bundle_path(path_str: str | None) -> str | None:
|
|
339
|
+
"""Parse ``{stem}_{stamp}_canonical_bundle.json`` → stamp."""
|
|
340
|
+
if not path_str:
|
|
341
|
+
return None
|
|
342
|
+
name = Path(path_str).name
|
|
343
|
+
marker = "_canonical_bundle.json"
|
|
344
|
+
if not name.endswith(marker):
|
|
345
|
+
return None
|
|
346
|
+
stem_part = name[: -len(marker)]
|
|
347
|
+
if "_" not in stem_part:
|
|
348
|
+
return None
|
|
349
|
+
return stem_part.rsplit("_", 1)[-1]
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
# Bulky linear strings are kept as files under ``text/`` (referenced through
# ``path_coded_text`` / ``path_compressed_tree_text``) instead of inside the JSON.
_MULTIMODAL_TEXT_ONLY_KEYS = frozenset(("coded_formate", "compressed_tree_string"))
# The pageindex JSON additionally leaves out the nested canonical bundle.
_PAGEINDEX_JSON_OMIT_KEYS = _MULTIMODAL_TEXT_ONLY_KEYS | {"canonical_bundle"}
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def _multimodal_pageindex_disk_shape(out: dict[str, object]) -> dict[str, object]:
    """
    Build the outline-only JSON written as the pageindex file.

    Drops the keys listed in ``_PAGEINDEX_JSON_OMIT_KEYS`` (nested bundle and
    coded strings). A list-valued ``structure`` is passed through the text and
    page-field strippers, and a dict-valued ``outline_bundle`` through the
    export slimmer. ``out`` itself is left untouched.
    """
    from chunksmith_v3.outline_tree import (
        slim_outline_bundle_for_export,
        strip_redundant_page_fields,
        strip_text_from_structure,
    )

    slim: dict[str, object] = {}
    for key, value in out.items():
        if key in _PAGEINDEX_JSON_OMIT_KEYS:
            continue
        slim[key] = value

    structure = slim.get("structure")
    if isinstance(structure, list):
        slim["structure"] = strip_redundant_page_fields(strip_text_from_structure(structure))

    outline_bundle = slim.get("outline_bundle")
    if isinstance(outline_bundle, dict):
        slim["outline_bundle"] = slim_outline_bundle_for_export(outline_bundle)

    return slim
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def _write_multimodal_text_artifacts_flat(
|
|
376
|
+
storage_root: Path,
|
|
377
|
+
pdf_stem: str,
|
|
378
|
+
*,
|
|
379
|
+
stamp: str,
|
|
380
|
+
bundle: dict[str, object] | None,
|
|
381
|
+
out: dict[str, object],
|
|
382
|
+
pageindex: dict[str, object],
|
|
383
|
+
) -> None:
|
|
384
|
+
"""Legacy flat ``cli/data/legacy`` text files (not under artifact ``text/``)."""
|
|
385
|
+
if isinstance(bundle, dict):
|
|
386
|
+
coded = bundle.get("coded_formate")
|
|
387
|
+
if isinstance(coded, str) and coded.strip():
|
|
388
|
+
p = storage_root / f"{pdf_stem}_{stamp}_coded.txt"
|
|
389
|
+
p.write_text(coded, encoding="utf-8")
|
|
390
|
+
bundle["path_coded_text"] = str(p.resolve())
|
|
391
|
+
cts = out.get("compressed_tree_string")
|
|
392
|
+
if isinstance(cts, str) and cts.strip():
|
|
393
|
+
tp = storage_root / f"{pdf_stem}_{stamp}_compressed_tree.txt"
|
|
394
|
+
tp.write_text(cts, encoding="utf-8")
|
|
395
|
+
pageindex["path_compressed_tree_text"] = str(tp.resolve())
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def _write_multimodal_text_artifacts(
    layout: ArtifactLayout,
    *,
    stamp: str,
    bundle: dict[str, object],
    out: dict[str, object],
    pageindex: dict[str, object],
) -> None:
    """
    Store the bulky text fields under the layout's text paths and record where they are.

    Coded text: when ``bundle["path_coded_text"]`` already names an existing file,
    only the reference is normalized to a resolved path; otherwise a non-blank
    ``bundle["coded_formate"]`` is written to ``layout.coded_text_path(...)``.
    Tree text: a non-blank ``out["compressed_tree_string"]`` is written to
    ``layout.compressed_tree_text_path(...)`` unless that file already exists, and
    its resolved path is stored in ``pageindex["path_compressed_tree_text"]``.
    """
    existing_ref = bundle.get("path_coded_text")
    coded_text = bundle.get("coded_formate")
    if isinstance(existing_ref, str) and Path(existing_ref).is_file():
        bundle["path_coded_text"] = str(Path(existing_ref).resolve())
    elif isinstance(coded_text, str) and coded_text.strip():
        coded_file = layout.coded_text_path(stamp=stamp)
        coded_file.write_text(coded_text, encoding="utf-8")
        bundle["path_coded_text"] = str(coded_file.resolve())

    tree_text = out.get("compressed_tree_string")
    if not (isinstance(tree_text, str) and tree_text.strip()):
        return
    tree_file = layout.compressed_tree_text_path(stamp=stamp)
    if not tree_file.is_file():
        tree_file.write_text(tree_text, encoding="utf-8")
    pageindex["path_compressed_tree_text"] = str(tree_file.resolve())
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
def _multimodal_out_json_disk_shape(out: dict[str, object]) -> dict[str, object]:
    """Copy of ``out`` without the text-only keys, with a dict-valued ``canonical_bundle`` slimmed too."""
    slim: dict[str, object] = {}
    for key, value in out.items():
        if key not in _MULTIMODAL_TEXT_ONLY_KEYS:
            slim[key] = value
    nested_bundle = slim.get("canonical_bundle")
    if isinstance(nested_bundle, dict):
        slim["canonical_bundle"] = _bundle_json_disk_shape(nested_bundle)  # type: ignore[assignment]
    return slim
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
def archive_multimodal_cli_outputs(
    storage_root: Path,
    pdf_stem: str,
    out: dict[str, object],
    *,
    stamp: str | None = None,
    artifact_root: Path | None = None,
) -> dict[str, Path]:
    """
    Write JSON files for one multimodal run (single stamp, one file per role).

    With ``artifact_root`` set, files go into the artifact layout: canonical
    bundle, unstructured elements (when the bundle has a non-empty list) and
    pageindex; the returned mapping also carries ``artifact_root``.

    Without it, flat files are written under ``storage_root``: the full output
    (``full``), then the same per-role files.

    When ``stamp`` is omitted it is taken from the bundle's
    ``path_canonical_bundle_json`` name if possible, else freshly generated.

    Returns:
        Mapping from role name to the path written for that role.
    """
    import json

    from chunksmith_cli.core.artifact_layout import ensure_artifact_layout

    def _dump(obj: object) -> str:
        return json.dumps(obj, ensure_ascii=False, indent=2)

    bundle = out.get("canonical_bundle")
    has_bundle = isinstance(bundle, dict)
    if stamp is None and has_bundle:
        stamp = _stamp_from_canonical_bundle_path(str(bundle.get("path_canonical_bundle_json") or ""))
    stamp = stamp or cli_archive_stamp()

    pageindex = _multimodal_pageindex_disk_shape(out)
    out_disk = _multimodal_out_json_disk_shape(out)  # type: ignore[arg-type]
    paths: dict[str, Path] = {}
    # Work on a copy so path references never leak into the caller's bundle.
    bundle_copy: dict[str, object] = dict(bundle) if has_bundle else {}

    if artifact_root is not None:
        layout = ensure_artifact_layout(artifact_root, pdf_stem)
        cb_path = layout.canonical_bundle_json_path(stamp=stamp)
        uj_path = layout.unstructured_json_path(stamp=stamp)
        pi_path = layout.pageindex_json_path(stamp=stamp)

        _write_multimodal_text_artifacts(
            layout,
            stamp=stamp,
            bundle=bundle_copy,
            out=out,
            pageindex=pageindex,
        )

        if has_bundle:
            cb_path.write_text(_dump(_bundle_json_disk_shape(bundle_copy)), encoding="utf-8")
            paths["canonical_bundle"] = cb_path.resolve()
            elements = bundle.get("unstructured_elements")
            if isinstance(elements, list) and elements:
                uj_path.write_text(_dump(elements), encoding="utf-8")
                paths["unstructured_elements"] = uj_path.resolve()
        pi_path.write_text(_dump(pageindex), encoding="utf-8")
        paths["pageindex"] = pi_path.resolve()
        paths["artifact_root"] = layout.root.resolve()
        return paths

    _write_multimodal_text_artifacts_flat(
        storage_root,
        pdf_stem,
        stamp=stamp,
        bundle=bundle_copy,
        out=out,
        pageindex=pageindex,
    )

    paths["full"] = archive_cli_json_object(storage_root, pdf_stem, out_disk, stamp=stamp)
    if has_bundle:
        paths["canonical_bundle"] = archive_cli_json_object(
            storage_root,
            pdf_stem,
            _bundle_json_disk_shape(bundle_copy),  # type: ignore[arg-type]
            stamp=stamp,
            suffix="_canonical_bundle",
        )
        elements = bundle.get("unstructured_elements")
        if isinstance(elements, list) and elements:
            paths["unstructured_elements"] = archive_cli_json_object(
                storage_root,
                pdf_stem,
                elements,
                stamp=stamp,
                suffix="_unstructured_elements",
            )
    paths["pageindex"] = archive_cli_json_object(
        storage_root,
        pdf_stem,
        pageindex,
        stamp=stamp,
        suffix="_pageindex",
    )
    return paths
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
def _v3_newest_unstructured_elements(json_dir: Path) -> list[object] | None:
    """Parsed list from the newest ``*_unstructured_elements.json`` in ``json_dir``, or ``None`` if unusable."""
    dated: list[tuple[float, Path]] = []
    for candidate in json_dir.glob("*_unstructured_elements.json"):
        try:
            dated.append((candidate.stat().st_mtime, candidate))
        except OSError:
            continue
    if not dated:
        return None
    newest = max(dated, key=lambda item: item[0])[1]
    try:
        raw = json.loads(newest.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # Unreadable, non-UTF-8 or malformed sibling file: let the caller fall back.
        return None
    return raw if isinstance(raw, list) else None


def _v3_index_disk_shape(data: dict[str, object]) -> dict[str, object]:
    """
    Omit bulky/duplicate fields from saved V3 JSON (``text``, extra page keys, ``image_base64``).

    Works on a shallow copy of ``data``:

    * a list-valued ``structure`` is stripped of text and redundant page fields;
    * a dict-valued ``outline_bundle`` is slimmed for export;
    * when no ``media_by_node`` entry carries ``elements``, elements are
      redistributed onto the nodes, taken from the newest
      ``*_unstructured_elements.json`` under ``{artifact_root}/json`` or, failing
      that, from the top-level ``elements`` list;
    * ``image_base64`` is removed from every image entry in ``media_by_node``;
    * the result is passed through ``omit_top_level_elements_for_export``.

    Reading the sibling elements file is best effort: a missing, unreadable or
    malformed file only triggers the fallback to the top-level ``elements``.
    """
    from chunksmith_v3.outline_tree import (
        omit_top_level_elements_for_export,
        redistribute_top_level_elements_to_media,
        slim_outline_bundle_for_export,
        strip_redundant_page_fields,
        strip_text_from_structure,
    )

    out = dict(data)
    st = out.get("structure")
    if isinstance(st, list):
        out["structure"] = strip_redundant_page_fields(strip_text_from_structure(st))
    ob = out.get("outline_bundle")
    if isinstance(ob, dict):
        out["outline_bundle"] = slim_outline_bundle_for_export(ob)

    media = out.get("media_by_node")
    top_els = out.get("elements")
    if isinstance(media, dict) and isinstance(st, list):
        needs_media_els = not any(isinstance(nm, dict) and nm.get("elements") for nm in media.values())
        if needs_media_els:
            from chunksmith_v3.parsed_document import build_parsed_document_from_elements

            doc_name = str(out.get("doc_name") or "document.pdf")
            parsed_doc = None
            artifact_root = out.get("artifact_root")
            if isinstance(artifact_root, str):
                json_dir = Path(artifact_root) / "json"
                if json_dir.is_dir():
                    raw = _v3_newest_unstructured_elements(json_dir)
                    if raw is not None:
                        parsed_doc = build_parsed_document_from_elements(
                            doc_name,
                            [e for e in raw if isinstance(e, dict)],
                        )
            if parsed_doc is None and isinstance(top_els, list) and top_els:
                parsed_doc = build_parsed_document_from_elements(
                    doc_name,
                    [e for e in top_els if isinstance(e, dict)],
                )
            if parsed_doc is not None:
                # The original (unstripped) structure is what gets redistributed against.
                out["media_by_node"] = redistribute_top_level_elements_to_media(
                    structure=st,
                    media_by_node=media,
                    parsed=parsed_doc,
                    doc_name=doc_name,
                )

    if isinstance(media, dict) or isinstance(out.get("media_by_node"), dict):
        slim_media: dict[str, object] = {}
        for nid, nm in (out.get("media_by_node") or media).items():  # type: ignore[union-attr]
            if not isinstance(nm, dict):
                continue
            row = dict(nm)
            imgs = row.get("images")
            if isinstance(imgs, list):
                row["images"] = [
                    {k: v for k, v in im.items() if k != "image_base64"} if isinstance(im, dict) else im
                    for im in imgs
                ]
            slim_media[str(nid)] = row
        out["media_by_node"] = slim_media
    return omit_top_level_elements_for_export(out)
|
|
596
|
+
|
|
597
|
+
|
|
598
|
+
def archive_v3_cli_outputs(
    storage_root: Path,
    pdf_stem: str,
    index_dict: dict[str, object],
    *,
    stamp: str | None = None,
    artifact_root: Path | None = None,
) -> dict[str, Path]:
    """
    Write JSON for one V3 run (same timestamp for every file).

    Two files are produced: the slimmed V3 index and a pageindex holding only
    ``doc_name``, ``structure``, ``doc_description`` and ``indexing_mode``.

    With ``artifact_root`` set, both go under the layout's ``json_dir`` and the
    returned mapping also contains ``artifact_root`` and ``artifact_v3_index``.
    Otherwise flat files are written under ``storage_root``.
    """
    from chunksmith_cli.core.artifact_layout import ensure_artifact_layout

    stamp = stamp or cli_archive_stamp()
    slim = _v3_index_disk_shape(index_dict)
    pageindex = {
        field: slim.get(field)
        for field in ("doc_name", "structure", "doc_description", "indexing_mode")
    }

    if artifact_root is None:
        return {
            "v3_index": archive_cli_json_object(
                storage_root,
                pdf_stem,
                slim,
                stamp=stamp,
                suffix="_v3_index",
            ),
            "pageindex": archive_cli_json_object(
                storage_root,
                pdf_stem,
                pageindex,
                stamp=stamp,
                suffix="_pageindex",
            ),
        }

    layout = ensure_artifact_layout(artifact_root, pdf_stem)
    v3_file = layout.json_dir / f"{pdf_stem}_{stamp}_v3_index.json"
    pageindex_file = layout.json_dir / f"{pdf_stem}_{stamp}_pageindex.json"
    v3_file.write_text(json.dumps(slim, ensure_ascii=False, indent=2), encoding="utf-8")
    pageindex_file.write_text(json.dumps(pageindex, ensure_ascii=False, indent=2), encoding="utf-8")
    return {
        "v3_index": v3_file.resolve(),
        "pageindex": pageindex_file.resolve(),
        "artifact_root": layout.root.resolve(),
        "artifact_v3_index": v3_file.resolve(),
    }
|
|
655
|
+
|
|
656
|
+
|
|
657
|
+
def _title_index_disk_shape(data: dict[str, object]) -> dict[str, object]:
    """
    Copy of the title index without the ``title_coded_formate`` payload.

    When that payload is truthy, its length as a string is recorded under
    ``title_coded_formate_chars`` instead. ``data`` is not modified.
    """
    slim = {key: value for key, value in data.items() if key != "title_coded_formate"}
    coded = data.get("title_coded_formate")
    if coded:
        slim["title_coded_formate_chars"] = len(str(coded))
    return slim
|
|
664
|
+
|
|
665
|
+
|
|
666
|
+
def archive_title_cli_outputs(
    storage_root: Path,
    pdf_stem: str,
    index_dict: dict[str, object],
    *,
    stamp: str | None = None,
    artifact_root: Path | None = None,
) -> dict[str, Path]:
    """Write the files produced by one title-indexer run and report their paths.

    Always written: the slimmed title index and a pageindex summary built from
    it. Written only when present in ``index_dict``: the coded text (non-empty
    ``str``), ``chunk_nodes`` (any list) and ``elements`` (non-empty list).

    With ``artifact_root`` set, files go into the layout returned by
    ``ensure_artifact_layout`` and the result also carries ``artifact_root``.
    Otherwise JSON payloads go through ``archive_cli_json_object`` and the
    coded text is written directly under ``storage_root``.

    Args:
        storage_root: Destination used when ``artifact_root`` is ``None``.
        pdf_stem: Prefix for every generated file name.
        index_dict: Title-indexer result to persist.
        stamp: File-name stamp; a falsy value is replaced by ``cli_archive_stamp()``.
        artifact_root: Root of the structured artifact layout, if any.

    Returns:
        Mapping from artifact name to the path of the written file.
    """
    import json

    from chunksmith_cli.core.artifact_layout import ensure_artifact_layout

    def as_json(payload: object) -> str:
        return json.dumps(payload, ensure_ascii=False, indent=2)

    if not stamp:
        stamp = cli_archive_stamp()
    disk_index = _title_index_disk_shape(index_dict)
    summary: dict[str, object] = {
        field: disk_index.get(field)
        for field in ("doc_name", "structure", "doc_description", "indexing_mode")
    }
    bundle = index_dict.get("outline_bundle")
    bundled = bundle.get("pageindex") if isinstance(bundle, dict) else None
    if isinstance(bundled, dict):
        # Carry the outline diagnostics over when the bundle has them.
        summary.update(
            (extra, bundled[extra])
            for extra in ("extraction_warnings", "outline_thinking", "outline_verification")
            if extra in bundled
        )

    written: dict[str, Path] = {}
    coded_text = index_dict.get("title_coded_formate")
    nodes = index_dict.get("chunk_nodes")
    raw_elements = index_dict.get("elements")

    if artifact_root is None:
        # Flat archive: no structured layout was requested.
        written["title_index"] = archive_cli_json_object(
            storage_root, pdf_stem, disk_index, stamp=stamp, suffix="_title_index"
        )
        written["pageindex"] = archive_cli_json_object(
            storage_root, pdf_stem, summary, stamp=stamp, suffix="_pageindex"
        )
        if isinstance(coded_text, str) and coded_text:
            flat_text = storage_root / f"{pdf_stem}_{stamp}_title_coded_formate.txt"
            flat_text.write_text(coded_text, encoding="utf-8")
            written["title_coded_formate"] = flat_text.resolve()
        if isinstance(nodes, list):
            written["chunk_nodes"] = archive_cli_json_object(
                storage_root, pdf_stem, nodes, stamp=stamp, suffix="_chunk_nodes"
            )
        if isinstance(raw_elements, list) and raw_elements:
            written["unstructured_elements"] = archive_cli_json_object(
                storage_root, pdf_stem, raw_elements, stamp=stamp, suffix="_unstructured_elements"
            )
        return written

    layout = ensure_artifact_layout(artifact_root, pdf_stem)
    index_file = layout.json_dir / f"{pdf_stem}_{stamp}_title_index.json"
    summary_file = layout.pageindex_json_path(stamp=stamp)
    index_file.parent.mkdir(parents=True, exist_ok=True)
    index_file.write_text(as_json(disk_index), encoding="utf-8")
    written["title_index"] = index_file.resolve()
    summary_file.write_text(as_json(summary), encoding="utf-8")
    written["pageindex"] = summary_file.resolve()
    if isinstance(coded_text, str) and coded_text:
        coded_file = layout.text_dir / f"{pdf_stem}_{stamp}_title_coded_formate.txt"
        coded_file.parent.mkdir(parents=True, exist_ok=True)
        coded_file.write_text(coded_text, encoding="utf-8")
        written["title_coded_formate"] = coded_file.resolve()
    if isinstance(nodes, list):
        nodes_file = layout.json_dir / f"{pdf_stem}_{stamp}_chunk_nodes.json"
        nodes_file.write_text(as_json(nodes), encoding="utf-8")
        written["chunk_nodes"] = nodes_file.resolve()
    if isinstance(raw_elements, list) and raw_elements:
        elements_file = layout.unstructured_json_path(stamp=stamp)
        elements_file.write_text(as_json(raw_elements), encoding="utf-8")
        written["unstructured_elements"] = elements_file.resolve()
    written["artifact_root"] = layout.root.resolve()
    return written
|
|
739
|
+
|
|
740
|
+
|
|
741
|
+
def _multi_index_disk_shape(data: dict[str, object]) -> dict[str, object]:
|
|
742
|
+
"""Slim multi-index JSON (coded markup stored as separate text file)."""
|
|
743
|
+
out = dict(data)
|
|
744
|
+
if "title_coded_formate_chars" in out:
|
|
745
|
+
pass
|
|
746
|
+
for key in ("flat_rows_toon", "structure_toon"):
|
|
747
|
+
if key in out and len(str(out.get(key) or "")) > 50000:
|
|
748
|
+
out[key] = f"(omitted on disk — {len(str(data.get(key) or ''))} chars; see run artifacts)"
|
|
749
|
+
return out
|
|
750
|
+
|
|
751
|
+
|
|
752
|
+
def archive_multi_indexing_cli_outputs(
    storage_root: Path,
    pdf_stem: str,
    index_dict: dict[str, object],
    *,
    elements: list[dict[str, object]] | None = None,
    title_coded_formate: str | None = None,
    title_outline: dict[str, object] | None = None,
    mapper_output: dict[str, object] | None = None,
    stamp: str | None = None,
    artifact_root: Path | None = None,
) -> dict[str, Path]:
    """Write the files produced by one multi-indexing run and report their paths.

    Always written: the slimmed multi-index JSON and a pageindex summary built
    from it. ``mapper_output`` is written only when it is a non-empty dict.

    With ``artifact_root`` set, ``elements``, ``title_coded_formate`` and
    ``title_outline`` are handed to ``persist_multi_indexing_artifacts`` and
    the JSON files go under ``artifact_root / ARTIFACT_SUBDIR_JSON``. Otherwise
    JSON payloads go through ``archive_cli_json_object`` and the coded text is
    written directly under ``storage_root``. In both modes the multi-index JSON
    gains a ``path_coded_text`` entry when a coded-text file was produced.

    Args:
        storage_root: Destination used when ``artifact_root`` is ``None``.
        pdf_stem: Prefix for every generated file name.
        index_dict: Multi-indexing result to persist.
        elements: Optional element list to persist alongside the index.
        title_coded_formate: Optional coded text to persist as a text file.
        title_outline: Optional outline; forwarded only when it is a dict.
        mapper_output: Optional mapper result.
        stamp: File-name stamp; a falsy value is replaced by ``cli_archive_stamp()``.
        artifact_root: Root of the structured artifact layout, if any.

    Returns:
        Mapping from artifact name to the path of the written file.
    """
    import json

    from chunksmith_cli.pipelines.multi_indexing_storage import persist_multi_indexing_artifacts

    def as_json(payload: object) -> str:
        return json.dumps(payload, ensure_ascii=False, indent=2)

    if not stamp:
        stamp = cli_archive_stamp()
    disk_index = _multi_index_disk_shape(index_dict)
    summary: dict[str, object] = {
        field: disk_index.get(field)
        for field in (
            "doc_name",
            "structure",
            "doc_description",
            "llm_context_format",
            "indexer_options",
            "outline_thinking",
        )
    }
    # Only a non-empty dict counts as mapper output worth persisting.
    mapper_payload = mapper_output if isinstance(mapper_output, dict) and mapper_output else None
    written: dict[str, Path] = {}

    if artifact_root is not None:
        stored = persist_multi_indexing_artifacts(
            artifact_root=artifact_root,
            pdf_stem=pdf_stem,
            stamp=stamp,
            elements=elements,
            title_coded_formate=title_coded_formate,
            title_outline=title_outline if isinstance(title_outline, dict) else None,
        )
        written.update(stored)
        json_dir = artifact_root.resolve() / ARTIFACT_SUBDIR_JSON
        json_dir.mkdir(parents=True, exist_ok=True)
        index_file = json_dir / f"{pdf_stem}_{stamp}_multi_index.json"
        summary_file = json_dir / f"{pdf_stem}_{stamp}_pageindex.json"
        if "title_coded_formate" in stored:
            disk_index = {**disk_index, "path_coded_text": str(stored["title_coded_formate"])}
        index_file.write_text(as_json(disk_index), encoding="utf-8")
        summary_file.write_text(as_json(summary), encoding="utf-8")
        written["multi_index"] = index_file.resolve()
        written["pageindex"] = summary_file.resolve()
        if mapper_payload is not None:
            mapper_file = json_dir / f"{pdf_stem}_{stamp}_mapper.json"
            mapper_file.write_text(as_json(mapper_payload), encoding="utf-8")
            written["mapper"] = mapper_file.resolve()
        return written

    written["multi_index"] = archive_cli_json_object(
        storage_root, pdf_stem, disk_index, stamp=stamp, suffix="_multi_index"
    )
    written["pageindex"] = archive_cli_json_object(
        storage_root, pdf_stem, summary, stamp=stamp, suffix="_pageindex"
    )
    if isinstance(elements, list) and elements:
        written["unstructured_elements"] = archive_cli_json_object(
            storage_root, pdf_stem, elements, stamp=stamp, suffix="_unstructured_elements"
        )
    if isinstance(title_coded_formate, str) and title_coded_formate.strip():
        coded_file = storage_root / f"{pdf_stem}_{stamp}_title_coded_formate.txt"
        coded_file.write_text(title_coded_formate, encoding="utf-8")
        written["title_coded_formate"] = coded_file.resolve()
        # Rewrite the index archived above so it records where the coded text went.
        disk_index = {**disk_index, "path_coded_text": str(written["title_coded_formate"])}
        written["multi_index"].write_text(as_json(disk_index), encoding="utf-8")
    if mapper_payload is not None:
        written["mapper"] = archive_cli_json_object(
            storage_root, pdf_stem, mapper_payload, stamp=stamp, suffix="_mapper"
        )
    return written
|
|
827
|
+
|
|
828
|
+
|
|
829
|
+
def archive_pageindex_cli_outputs(
    storage_root: Path,
    pdf_stem: str,
    outline: dict[str, object],
    *,
    stamp: str | None = None,
    artifact_root: Path | None = None,
    tagged_pages: str | None = None,
) -> dict[str, Path]:
    """Persist a standalone PageIndexer CLI run (pageindex JSON + optional tagged pages text).

    The pageindex JSON holds ``doc_name``, ``structure`` and ``doc_description``
    taken from ``outline`` plus a fixed ``indexing_mode`` of
    ``"chunksmith_pageindex"``. ``tagged_pages`` is written as a text file only
    when it is a string with non-whitespace content.

    With ``artifact_root`` set, files go into the layout returned by
    ``ensure_artifact_layout`` and the result also carries ``artifact_root``.
    Otherwise the JSON goes through ``archive_cli_json_object`` and the tagged
    pages text is written directly under ``storage_root``.

    Args:
        storage_root: Destination used when ``artifact_root`` is ``None``.
        pdf_stem: Prefix for every generated file name.
        outline: PageIndexer result to summarise.
        stamp: File-name stamp; a falsy value is replaced by ``cli_archive_stamp()``.
        artifact_root: Root of the structured artifact layout, if any.
        tagged_pages: Optional tagged page text to persist.

    Returns:
        Mapping from artifact name to the path of the written file.
    """
    import json

    from chunksmith_cli.core.artifact_layout import ensure_artifact_layout

    stamp = stamp or cli_archive_stamp()
    pageindex: dict[str, object] = {
        "doc_name": outline.get("doc_name"),
        "structure": outline.get("structure"),
        "doc_description": outline.get("doc_description"),
        "indexing_mode": "chunksmith_pageindex",
    }
    paths: dict[str, Path] = {}

    if artifact_root is not None:
        layout = ensure_artifact_layout(artifact_root, pdf_stem)
        pi_path = layout.pageindex_json_path(stamp=stamp)
        # Make sure the target directory exists before writing, as
        # archive_title_cli_outputs does for the same layout; a no-op when the
        # layout has already created it.
        pi_path.parent.mkdir(parents=True, exist_ok=True)
        pi_path.write_text(json.dumps(pageindex, ensure_ascii=False, indent=2), encoding="utf-8")
        paths["pageindex"] = pi_path.resolve()
        paths["artifact_root"] = layout.root.resolve()
        if isinstance(tagged_pages, str) and tagged_pages.strip():
            tagged_path = layout.text_dir / f"{pdf_stem}_{stamp}_tagged_pages.txt"
            tagged_path.parent.mkdir(parents=True, exist_ok=True)
            tagged_path.write_text(tagged_pages, encoding="utf-8")
            paths["tagged_pages"] = tagged_path.resolve()
        return paths

    paths["pageindex"] = archive_cli_json_object(
        storage_root, pdf_stem, pageindex, stamp=stamp, suffix="_pageindex"
    )
    if isinstance(tagged_pages, str) and tagged_pages.strip():
        text_path = storage_root / f"{pdf_stem}_{stamp}_tagged_pages.txt"
        text_path.write_text(tagged_pages, encoding="utf-8")
        paths["tagged_pages"] = text_path.resolve()
    return paths
|