chunksmith-cli 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. chunksmith_cli/__init__.py +3 -0
  2. chunksmith_cli/__main__.py +41 -0
  3. chunksmith_cli/agent/__init__.py +1 -0
  4. chunksmith_cli/agent/agent_display.py +160 -0
  5. chunksmith_cli/agent/agent_session.py +294 -0
  6. chunksmith_cli/agent/agent_stream.py +152 -0
  7. chunksmith_cli/agent/agent_wizard.py +5 -0
  8. chunksmith_cli/agent_display.py +6 -0
  9. chunksmith_cli/agent_session.py +6 -0
  10. chunksmith_cli/agent_stream.py +6 -0
  11. chunksmith_cli/agent_wizard.py +6 -0
  12. chunksmith_cli/assets/chunksmith_logo.png +0 -0
  13. chunksmith_cli/branding.py +6 -0
  14. chunksmith_cli/config.py +6 -0
  15. chunksmith_cli/core/__init__.py +21 -0
  16. chunksmith_cli/core/artifact_layout.py +60 -0
  17. chunksmith_cli/core/branding.py +137 -0
  18. chunksmith_cli/core/config.py +32 -0
  19. chunksmith_cli/core/media_preview.py +72 -0
  20. chunksmith_cli/core/menu.py +110 -0
  21. chunksmith_cli/core/menus.py +55 -0
  22. chunksmith_cli/core/panels.py +24 -0
  23. chunksmith_cli/core/paths.py +869 -0
  24. chunksmith_cli/core/prefs_mapper.py +97 -0
  25. chunksmith_cli/core/saved_catalog.py +180 -0
  26. chunksmith_cli/core/theme.py +38 -0
  27. chunksmith_cli/elements_json_prompt.py +6 -0
  28. chunksmith_cli/json_view.py +6 -0
  29. chunksmith_cli/media_preview.py +6 -0
  30. chunksmith_cli/menu.py +6 -0
  31. chunksmith_cli/menus.py +55 -0
  32. chunksmith_cli/multi_indexing_wizard.py +3 -0
  33. chunksmith_cli/outline_browser.py +6 -0
  34. chunksmith_cli/panels.py +6 -0
  35. chunksmith_cli/partition_prefs.py +74 -0
  36. chunksmith_cli/paths.py +6 -0
  37. chunksmith_cli/pdf_prompt.py +6 -0
  38. chunksmith_cli/pipelines/__init__.py +1 -0
  39. chunksmith_cli/pipelines/mapping_validation.py +31 -0
  40. chunksmith_cli/pipelines/multi_indexing_config.py +35 -0
  41. chunksmith_cli/pipelines/multi_indexing_prompts.py +375 -0
  42. chunksmith_cli/pipelines/multi_indexing_runtime.py +38 -0
  43. chunksmith_cli/pipelines/multi_indexing_storage.py +157 -0
  44. chunksmith_cli/pipelines/multi_indexing_wizard.py +218 -0
  45. chunksmith_cli/pipelines/pageindex_wizard.py +140 -0
  46. chunksmith_cli/pipelines/run_multi.py +21 -0
  47. chunksmith_cli/prompts/__init__.py +1 -0
  48. chunksmith_cli/prompts/elements_json_prompt.py +49 -0
  49. chunksmith_cli/prompts/pdf_prompt.py +37 -0
  50. chunksmith_cli/saved_catalog.py +6 -0
  51. chunksmith_cli/theme.py +6 -0
  52. chunksmith_cli/tree_view.py +6 -0
  53. chunksmith_cli/view_session.py +6 -0
  54. chunksmith_cli/views/__init__.py +1 -0
  55. chunksmith_cli/views/json_view.py +11 -0
  56. chunksmith_cli/views/outline_browser.py +247 -0
  57. chunksmith_cli/views/tree_view.py +59 -0
  58. chunksmith_cli/views/view_session.py +32 -0
  59. chunksmith_cli/wizard.py +357 -0
  60. chunksmith_cli-0.4.0.dist-info/METADATA +61 -0
  61. chunksmith_cli-0.4.0.dist-info/RECORD +65 -0
  62. chunksmith_cli-0.4.0.dist-info/WHEEL +5 -0
  63. chunksmith_cli-0.4.0.dist-info/entry_points.txt +2 -0
  64. chunksmith_cli-0.4.0.dist-info/licenses/LICENSE.vectify +21 -0
  65. chunksmith_cli-0.4.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,869 @@
1
+ """Path helpers for the CLI (normalization, ``cli/data`` archives)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ from dataclasses import dataclass
8
+ from datetime import datetime, timezone
9
+ from pathlib import Path
10
+ from urllib.parse import unquote, urlparse
11
+
12
+ from chunksmith_cli.core.artifact_layout import ArtifactLayout
13
+
14
# Folder names used below an artifact root (``json/``, ``image/``, ``text/``).
ARTIFACT_SUBDIR_JSON = "json"
ARTIFACT_SUBDIR_IMAGE = "image"
ARTIFACT_SUBDIR_TEXT = "text"
17
+
18
+
19
def normalize_path_string(raw: str) -> Path:
    """
    Convert raw user input into a ``Path``.

    Surrounding whitespace is removed, one pair of matching outer quotes
    (``"`` or ``'``) is dropped, ``file:`` URLs are reduced to their
    percent-decoded path component, and ``~`` is expanded.
    """
    text = (raw or "").strip()
    for quote in ('"', "'"):
        if text.startswith(quote) and text.endswith(quote):
            # Only a single layer of quotes is removed.
            text = text[1:-1].strip()
            break

    if text.lower().startswith("file:"):
        url_path = unquote(urlparse(text).path or "")
        # Windows: ``file:///D:/x`` parses to ``/D:/x``; drop the leading slash.
        looks_like_drive = (
            len(url_path) >= 3 and url_path[0] == "/" and url_path[2] == ":"
        )
        if os.name == "nt" and looks_like_drive:
            url_path = url_path.lstrip("/")
        # Keep the original text when the URL carried no path at all.
        if url_path:
            text = url_path

    return Path(text).expanduser()
35
+
36
+
37
def resolve_pdf_candidate(p: Path, *, cwd: Path | None = None) -> Path:
    """
    Best-effort lookup of ``p`` as a file.

    An existing file is returned unchanged. A relative ``p`` is then tried
    under ``cwd`` (default: the process working directory). If neither is a
    file, ``p`` is returned resolved when it exists and unchanged otherwise,
    so the result is not guaranteed to exist.
    """
    if p.is_file():
        return p

    search_root = cwd or Path.cwd()
    if not p.is_absolute():
        anchored = (search_root / p).resolve()
        if anchored.is_file():
            return anchored

    if p.exists():
        return p.resolve()
    return p
50
+
51
+
52
def _cli_package_dir() -> Path:
    """Return the package root: the directory two levels above this module file."""
    module_file = Path(__file__).resolve()
    return module_file.parents[1]
55
+
56
+
57
def cli_data_dir() -> Path:
    """Return the runtime data root: the ``data`` folder under the CLI package directory."""
    package_root = _cli_package_dir()
    return package_root / "data"
60
+
61
+
62
def cli_runs_dir() -> Path:
    """Return the per-run artifact parent: the ``runs`` folder under :func:`cli_data_dir`."""
    data_root = cli_data_dir()
    return data_root / "runs"
65
+
66
+
67
def cli_json_storage_dir() -> Path:
    """Return the legacy flat JSON archive: the ``legacy`` folder under :func:`cli_data_dir`."""
    data_root = cli_data_dir()
    return data_root / "legacy"
70
+
71
+
72
def ensure_cli_storage() -> Path:
    """Create the data, legacy and runs folders if missing; return the legacy archive folder."""
    legacy_dir = cli_json_storage_dir()
    for folder in (cli_data_dir(), legacy_dir, cli_runs_dir()):
        folder.mkdir(parents=True, exist_ok=True)
    return legacy_dir
77
+
78
+
79
def _storage_search_dirs(storage: Path) -> list[Path]:
    """
    List the directories to search for saved indexes, without duplicates.

    ``storage`` always comes first. When it is the legacy archive or the data
    root, the runs folder is added if it exists; when it is the legacy archive,
    the data root is added as well if it exists.
    """
    target = storage.resolve()
    data_root = cli_data_dir().resolve()
    legacy_root = cli_json_storage_dir().resolve()
    runs_root = cli_runs_dir().resolve()

    wanted: list[Path] = [target]
    if target in (legacy_root, data_root) and runs_root.is_dir():
        wanted.append(runs_root)
    if target == legacy_root and data_root.is_dir():
        wanted.append(data_root)

    # Order-preserving de-duplication.
    unique: list[Path] = []
    for folder in wanted:
        if folder not in unique:
            unique.append(folder)
    return unique
92
+
93
+
94
@dataclass(frozen=True)
class ResolvedAgentIndex:
    """
    Immutable result of resolving user input to saved index files.

    These are the paths handed to ``build_document_index_from_saved``.
    """

    # Outline / pageindex JSON, or ``None`` when only a bundle was found.
    pageindex_path: Path | None
    # ``*_canonical_bundle.json`` file, or ``None`` when there is no bundle.
    canonical_bundle_path: Path | None
    # Short name for the resolved index (derived from the file stem).
    label: str
    # Artifact folder that owns the JSON files, when known.
    artifact_root: Path | None = None
102
+
103
+
104
def default_project_logs_dir() -> Path | None:
    """Return the resolved ``logs`` folder next to the CLI package, or ``None`` if it is absent."""
    candidate = _cli_package_dir().parent / "logs"
    if candidate.is_dir():
        return candidate.resolve()
    return None
108
+
109
+
110
def _artifact_json_dir(root: Path) -> Path | None:
    """
    Find the directory that holds ``*_canonical_bundle.json`` for ``root``.

    ``root`` may itself be the ``json`` subfolder, an artifact folder with a
    nested ``json`` subfolder, or a flat folder containing bundles directly.
    Returns ``None`` when ``root`` is not a directory or no bundle is found.
    """
    if not root.is_dir():
        return None

    def holds_bundle(folder: Path) -> bool:
        return next(folder.glob("*_canonical_bundle.json"), None) is not None

    here = root.resolve()
    if here.name == ARTIFACT_SUBDIR_JSON:
        # Already inside ``json/``: do not look for a nested one.
        return here if holds_bundle(here) else None

    child = here / ARTIFACT_SUBDIR_JSON
    if child.is_dir() and holds_bundle(child):
        return child
    return here if holds_bundle(here) else None
123
+
124
+
125
def _artifact_root_from_json_dir(json_dir: Path) -> Path:
    """Return the resolved artifact root: the parent of a ``json`` folder, else the folder itself."""
    owner = json_dir.parent if json_dir.name == ARTIFACT_SUBDIR_JSON else json_dir
    return owner.resolve()
129
+
130
+
131
def _discover_canonical_bundles(json_dir: Path) -> list[Path]:
    """Return ``*_canonical_bundle.json`` files directly in ``json_dir``, newest mtime first."""
    found = list(json_dir.glob("*_canonical_bundle.json"))
    found.sort(key=lambda item: item.stat().st_mtime, reverse=True)
    return found
137
+
138
+
139
def _resolved_from_canonical_bundle(
    cb: Path,
    *,
    artifact_root: Path | None = None,
) -> ResolvedAgentIndex:
    """
    Build a ``ResolvedAgentIndex`` for the canonical bundle file ``cb``.

    The index path is the sibling ``{stem}_pageindex.json`` when it exists,
    otherwise a sibling ``{stem}.json`` that has the outline shape, otherwise
    ``None``. ``artifact_root`` defaults to the root derived from ``cb``'s folder.
    """
    bundle_suffix = "_canonical_bundle.json"
    label = cb.name[: -len(bundle_suffix)]
    folder = cb.parent
    root = artifact_root or _artifact_root_from_json_dir(folder)

    index_path: Path | None = None
    pageindex_file = folder / f"{label}_pageindex.json"
    if pageindex_file.is_file():
        index_path = pageindex_file.resolve()
    else:
        outline_file = folder / f"{label}.json"
        if outline_file.is_file() and _json_has_outline_shape(outline_file):
            index_path = outline_file.resolve()

    return ResolvedAgentIndex(index_path, cb.resolve(), label, root)
154
+
155
+
156
def resolve_artifact_root_input(root: Path) -> ResolvedAgentIndex | None:
    """
    Resolve an artifact folder (e.g. ``logs`` or ``logs/json``) to its newest bundle.

    Picks the most recently modified ``*_canonical_bundle.json`` and pairs it
    with an optional sibling pageindex. Returns ``None`` when the folder holds
    no canonical bundle.
    """
    bundle_dir = _artifact_json_dir(root)
    if bundle_dir is None:
        return None

    newest_first = _discover_canonical_bundles(bundle_dir)
    if not newest_first:
        return None

    newest = newest_first[0].resolve()
    owner = _artifact_root_from_json_dir(bundle_dir)
    return _resolved_from_canonical_bundle(newest, artifact_root=owner)
171
+
172
+
173
def _canonical_bundle_sibling(pageindex_path: Path) -> Path | None:
    """
    Return the ``{stem}_canonical_bundle.json`` file next to ``pageindex_path``, if it exists.

    The stem is the file name minus ``_pageindex.json`` when present, otherwise
    minus the ``.json`` extension. Non-JSON names yield ``None``.
    """
    filename = pageindex_path.name
    pageindex_suffix = "_pageindex.json"
    if filename.endswith(pageindex_suffix):
        base = filename[: -len(pageindex_suffix)]
    elif filename.endswith(".json"):
        base = Path(filename).stem
    else:
        return None

    candidate = pageindex_path.parent / f"{base}_canonical_bundle.json"
    return candidate if candidate.is_file() else None
184
+
185
+
186
def _json_has_outline_shape(path: Path) -> bool:
    """
    Report whether ``path`` is a JSON object whose ``structure`` value is a list.

    Any file that cannot be read or parsed counts as "not an outline" and
    yields ``False`` rather than raising.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and the UnicodeDecodeError
        # raised by read_text() for files that are not valid UTF-8.
        return False
    return isinstance(data, dict) and isinstance(data.get("structure"), list)
192
+
193
+
194
def resolve_agent_index_input(
    raw: str,
    *,
    storage: Path,
    cwd: Path | None = None,
) -> ResolvedAgentIndex | None:
    """
    Resolve user input to index JSON paths.

    Accepts a full/relative path, ``file://…``, a file under ``cli/data``,
    a multimodal artifact folder (e.g. ``logs`` with ``json/`` + ``image/``),
    a ``*_pageindex.json`` path, outline JSON with ``structure``, a
    ``*_canonical_bundle.json`` (with or without a sibling pageindex), or a stem.

    Resolution order:

    1. If the input names a directory that holds a canonical bundle, the newest
       bundle in it wins.
    2. Otherwise candidate files are collected (the path itself, the same name
       under ``storage``, the project ``logs`` folder, stem-based matches) and
       the first existing candidate that is a canonical bundle or an
       outline-shaped JSON is returned.

    Returns ``None`` for blank input or when nothing matches.
    """
    s = (raw or "").strip()
    if not s:
        return None

    def _glob_desc(root: Path, pattern: str) -> list[Path]:
        # The pattern is built from raw user input. ``Path.glob`` raises
        # NotImplementedError for non-relative patterns (an absolute path typed
        # where a stem was expected) and ValueError for malformed ones; both
        # simply mean "no stem matches here", not a crash.
        try:
            return sorted(root.glob(pattern), reverse=True)
        except (NotImplementedError, ValueError):
            return []

    base = cwd or Path.cwd()
    candidates: list[Path] = []

    # Step 1: directory input (artifact folder).
    norm = normalize_path_string(s)
    dir_candidates: list[Path] = []
    if not norm.is_absolute():
        dir_candidates.append((base / norm).resolve())
    try:
        dir_candidates.append(norm.resolve())
    except OSError:
        pass
    for resolved_dir in dir_candidates:
        if resolved_dir.is_dir():
            from_dir = resolve_artifact_root_input(resolved_dir)
            if from_dir is not None:
                return from_dir

    # Step 2: file candidates, in priority order.
    p = resolve_pdf_candidate(norm, cwd=base)
    candidates.append(p)
    candidates.append((storage / p.name).resolve())
    if not Path(s).is_absolute():
        candidates.append((storage / s).resolve())

    stem = s.strip().strip('"').strip("'")
    stem_is_json_name = stem.lower().endswith(".json")
    logs = default_project_logs_dir()
    if logs is not None and stem.lower() != "logs":
        candidates.append(logs / s)
        json_dir = logs / ARTIFACT_SUBDIR_JSON
        if json_dir.is_dir() and not stem_is_json_name:
            candidates.append(json_dir / f"{stem}_canonical_bundle.json")
            candidates.extend(_glob_desc(json_dir, f"{stem}*_canonical_bundle.json"))
    elif logs is not None and stem.lower() == "logs":
        # Prefer cwd-relative ``logs/``; fall back to legacy ``python/logs`` when missing.
        rel_logs = (base / "logs").resolve()
        if not rel_logs.is_dir():
            candidates.append(logs)
            json_dir = logs / ARTIFACT_SUBDIR_JSON
            if json_dir.is_dir():
                candidates.extend(_glob_desc(json_dir, "*_canonical_bundle.json"))

    # Bare stem (no .json): try storage artifacts (legacy flat + runs/**/json/)
    if not stem_is_json_name:
        for root in _storage_search_dirs(storage):
            for pattern in (f"{stem}_pageindex.json", f"{stem}.json"):
                candidates.append(root / pattern)
            candidates.extend(_glob_desc(root, f"**/{stem}*_pageindex.json"))

    seen: set[Path] = set()
    for cand in candidates:
        try:
            key = cand.resolve()
        except OSError:
            continue
        if key in seen:
            continue
        seen.add(key)
        if not key.is_file():
            continue
        if key.name.endswith("_canonical_bundle.json"):
            return _resolved_from_canonical_bundle(key)
        if not _json_has_outline_shape(key):
            continue
        cb = _canonical_bundle_sibling(key)
        label = key.stem.replace("_pageindex", "")
        # An artifact root is only known when the file sits in a ``json/`` folder.
        ar = _artifact_root_from_json_dir(key.parent) if key.parent.name == ARTIFACT_SUBDIR_JSON else None
        return ResolvedAgentIndex(
            key,
            cb.resolve() if cb else None,
            label,
            ar,
        )

    return None
288
+
289
+
290
def cli_archive_stamp() -> str:
    """Return the current UTC time as ``YYYYMMDDTHHMMSSZ``, the suffix shared by one run's files."""
    now_utc = datetime.now(tz=timezone.utc)
    return f"{now_utc:%Y%m%dT%H%M%SZ}"
293
+
294
+
295
def archive_cli_json(
    storage_root: Path,
    pdf_stem: str,
    json_text: str,
    *,
    stamp: str | None = None,
    suffix: str = "",
) -> Path:
    """
    Write ``{stem}_{stamp}{suffix}.json`` under ``storage_root`` and return the file path.

    ``suffix`` is inserted before ``.json`` (e.g. ``_canonical_bundle``).
    ``stamp`` defaults to :func:`cli_archive_stamp`. ``storage_root`` is
    created (with parents) when it does not exist yet, so a first run on a
    fresh install does not fail with ``FileNotFoundError``.
    """
    stamp = stamp or cli_archive_stamp()
    storage_root.mkdir(parents=True, exist_ok=True)
    path = storage_root / f"{pdf_stem}_{stamp}{suffix}.json"
    path.write_text(json_text, encoding="utf-8")
    return path
312
+
313
+
314
def archive_cli_json_object(
    storage_root: Path,
    pdf_stem: str,
    data: object,
    *,
    stamp: str | None = None,
    suffix: str = "",
) -> Path:
    """Serialize ``data`` as indented, non-ASCII-escaped JSON and store it via :func:`archive_cli_json`."""
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    return archive_cli_json(
        storage_root,
        pdf_stem,
        serialized,
        stamp=stamp,
        suffix=suffix,
    )
327
+
328
+
329
def _bundle_json_disk_shape(bundle: dict[str, object]) -> dict[str, object]:
    """
    Return a shallow copy of ``bundle`` without its two bulky fields.

    ``coded_formate`` and ``unstructured_elements`` are dropped because they
    are stored separately (text file and sibling JSON). The input is not mutated.
    """
    slim = dict(bundle)
    for bulky_key in ("coded_formate", "unstructured_elements"):
        slim.pop(bulky_key, None)
    return slim
336
+
337
+
338
def _stamp_from_canonical_bundle_path(path_str: str | None) -> str | None:
    """
    Extract the stamp from a ``{stem}_{stamp}_canonical_bundle.json`` path.

    The stamp is whatever follows the last ``_`` before the bundle suffix.
    Returns ``None`` for empty input, names without the suffix, or names with
    no ``_`` in front of the suffix.
    """
    if not path_str:
        return None
    filename = Path(path_str).name
    head, marker, tail = filename.rpartition("_canonical_bundle.json")
    if not marker or tail:
        # Suffix absent, or present but not at the very end of the name.
        return None
    _, separator, stamp = head.rpartition("_")
    return stamp if separator else None
350
+
351
+
352
# Bulky linear strings are stored as files under ``text/`` and referenced by path
# (see ``path_coded_text``, ``path_compressed_tree_text``), so they are kept out of JSON.
_MULTIMODAL_TEXT_ONLY_KEYS = frozenset(("coded_formate", "compressed_tree_string"))
# The pageindex JSON additionally leaves out the nested canonical bundle.
_PAGEINDEX_JSON_OMIT_KEYS = _MULTIMODAL_TEXT_ONLY_KEYS | {"canonical_bundle"}
355
+
356
+
357
def _multimodal_pageindex_disk_shape(out: dict[str, object]) -> dict[str, object]:
    """
    Build the pageindex JSON payload from a multimodal result dict.

    Drops the keys in ``_PAGEINDEX_JSON_OMIT_KEYS`` (nested bundle and coded
    strings), then slims ``structure`` and ``outline_bundle`` with the
    ``chunksmith_v3.outline_tree`` helpers. ``out`` itself is not modified.
    """
    from chunksmith_v3.outline_tree import (
        slim_outline_bundle_for_export,
        strip_redundant_page_fields,
        strip_text_from_structure,
    )

    pageindex = dict(out)
    for omitted in _PAGEINDEX_JSON_OMIT_KEYS:
        pageindex.pop(omitted, None)

    structure = pageindex.get("structure")
    if isinstance(structure, list):
        without_text = strip_text_from_structure(structure)
        pageindex["structure"] = strip_redundant_page_fields(without_text)

    outline_bundle = pageindex.get("outline_bundle")
    if isinstance(outline_bundle, dict):
        pageindex["outline_bundle"] = slim_outline_bundle_for_export(outline_bundle)

    return pageindex
373
+
374
+
375
def _write_multimodal_text_artifacts_flat(
    storage_root: Path,
    pdf_stem: str,
    *,
    stamp: str,
    bundle: dict[str, object] | None,
    out: dict[str, object],
    pageindex: dict[str, object],
) -> None:
    """
    Write the coded text and compressed tree as flat ``.txt`` files in ``storage_root``.

    Only non-blank strings are written. On success the resolved file path is
    recorded in ``bundle["path_coded_text"]`` and
    ``pageindex["path_compressed_tree_text"]`` respectively (both dicts are
    mutated in place).
    """

    def _nonblank(value: object) -> bool:
        return isinstance(value, str) and bool(value.strip())

    coded_text = bundle.get("coded_formate") if isinstance(bundle, dict) else None
    if _nonblank(coded_text):
        coded_file = storage_root / f"{pdf_stem}_{stamp}_coded.txt"
        coded_file.write_text(coded_text, encoding="utf-8")
        bundle["path_coded_text"] = str(coded_file.resolve())

    tree_text = out.get("compressed_tree_string")
    if _nonblank(tree_text):
        tree_file = storage_root / f"{pdf_stem}_{stamp}_compressed_tree.txt"
        tree_file.write_text(tree_text, encoding="utf-8")
        pageindex["path_compressed_tree_text"] = str(tree_file.resolve())
396
+
397
+
398
def _write_multimodal_text_artifacts(
    layout: ArtifactLayout,
    *,
    stamp: str,
    bundle: dict[str, object],
    out: dict[str, object],
    pageindex: dict[str, object],
) -> None:
    """
    Store the coded text and compressed tree under the artifact ``text/`` folder.

    Coded text: an existing file referenced by ``bundle["path_coded_text"]`` is
    reused (the reference is normalized to a resolved path); otherwise a
    non-blank ``coded_formate`` is written to ``layout.coded_text_path``.
    Compressed tree: written to ``layout.compressed_tree_text_path`` only when
    that file does not exist yet. Path references are set on ``bundle`` and
    ``pageindex`` in place.
    """
    existing_ref = bundle.get("path_coded_text")
    coded_text = bundle.get("coded_formate")
    if isinstance(existing_ref, str) and Path(existing_ref).is_file():
        bundle["path_coded_text"] = str(Path(existing_ref).resolve())
    elif isinstance(coded_text, str) and coded_text.strip():
        target = layout.coded_text_path(stamp=stamp)
        target.write_text(coded_text, encoding="utf-8")
        bundle["path_coded_text"] = str(target.resolve())

    tree_text = out.get("compressed_tree_string")
    if not (isinstance(tree_text, str) and tree_text.strip()):
        return
    tree_file = layout.compressed_tree_text_path(stamp=stamp)
    if not tree_file.is_file():
        tree_file.write_text(tree_text, encoding="utf-8")
    pageindex["path_compressed_tree_text"] = str(tree_file.resolve())
421
+
422
+
423
def _multimodal_out_json_disk_shape(out: dict[str, object]) -> dict[str, object]:
    """
    Return the "full" result dict as written to disk.

    Keys in ``_MULTIMODAL_TEXT_ONLY_KEYS`` are removed, and a nested
    ``canonical_bundle`` dict is replaced by its slim disk shape.
    """
    slim: dict[str, object] = {}
    for key, value in out.items():
        if key in _MULTIMODAL_TEXT_ONLY_KEYS:
            continue
        slim[key] = value

    nested_bundle = slim.get("canonical_bundle")
    if isinstance(nested_bundle, dict):
        slim["canonical_bundle"] = _bundle_json_disk_shape(nested_bundle)  # type: ignore[assignment]
    return slim
429
+
430
+
431
def archive_multimodal_cli_outputs(
    storage_root: Path,
    pdf_stem: str,
    out: dict[str, object],
    *,
    stamp: str | None = None,
    artifact_root: Path | None = None,
) -> dict[str, Path]:
    """
    Persist the JSON (and text) files of one multimodal run under a single stamp.

    With ``artifact_root`` set, files go into the artifact layout: text
    artifacts first, then the canonical bundle, the unstructured elements
    (when non-empty) and the pageindex. The returned mapping also carries
    ``artifact_root``.

    Without it, flat files are written under ``storage_root``: the full result,
    the canonical bundle, the unstructured elements and the pageindex.

    When ``stamp`` is not given it is taken from the bundle's
    ``path_canonical_bundle_json`` if possible, else freshly generated.

    Returns a mapping from role name to the written path.
    """
    from chunksmith_cli.core.artifact_layout import ensure_artifact_layout

    bundle = out.get("canonical_bundle")
    has_bundle = isinstance(bundle, dict)
    if stamp is None and has_bundle:
        stamp = _stamp_from_canonical_bundle_path(str(bundle.get("path_canonical_bundle_json") or ""))
    if not stamp:
        stamp = cli_archive_stamp()

    pageindex = _multimodal_pageindex_disk_shape(out)
    out_disk = _multimodal_out_json_disk_shape(out)  # type: ignore[arg-type]
    written: dict[str, Path] = {}

    def _dump(payload: object) -> str:
        return json.dumps(payload, ensure_ascii=False, indent=2)

    if artifact_root is not None:
        layout = ensure_artifact_layout(artifact_root, pdf_stem)
        bundle_file = layout.canonical_bundle_json_path(stamp=stamp)
        elements_file = layout.unstructured_json_path(stamp=stamp)
        pageindex_file = layout.pageindex_json_path(stamp=stamp)

        # Work on a copy so the path reference is added without touching the caller's bundle.
        bundle_copy = dict(bundle) if has_bundle else {}
        _write_multimodal_text_artifacts(
            layout,
            stamp=stamp,
            bundle=bundle_copy,
            out=out,
            pageindex=pageindex,
        )

        if has_bundle:
            bundle_file.write_text(_dump(_bundle_json_disk_shape(bundle_copy)), encoding="utf-8")
            written["canonical_bundle"] = bundle_file.resolve()
            raw_elements = bundle.get("unstructured_elements")
            if isinstance(raw_elements, list) and raw_elements:
                elements_file.write_text(_dump(raw_elements), encoding="utf-8")
                written["unstructured_elements"] = elements_file.resolve()
        pageindex_file.write_text(_dump(pageindex), encoding="utf-8")
        written["pageindex"] = pageindex_file.resolve()
        written["artifact_root"] = layout.root.resolve()
        return written

    # Legacy flat layout.
    flat_bundle = dict(bundle) if has_bundle else {}
    _write_multimodal_text_artifacts_flat(
        storage_root,
        pdf_stem,
        stamp=stamp,
        bundle=flat_bundle,
        out=out,
        pageindex=pageindex,
    )

    written["full"] = archive_cli_json_object(storage_root, pdf_stem, out_disk, stamp=stamp)
    if has_bundle:
        written["canonical_bundle"] = archive_cli_json_object(
            storage_root,
            pdf_stem,
            _bundle_json_disk_shape(flat_bundle),  # type: ignore[arg-type]
            stamp=stamp,
            suffix="_canonical_bundle",
        )
        raw_elements = bundle.get("unstructured_elements")
        if isinstance(raw_elements, list) and raw_elements:
            written["unstructured_elements"] = archive_cli_json_object(
                storage_root,
                pdf_stem,
                raw_elements,
                stamp=stamp,
                suffix="_unstructured_elements",
            )
    written["pageindex"] = archive_cli_json_object(
        storage_root,
        pdf_stem,
        pageindex,
        stamp=stamp,
        suffix="_pageindex",
    )
    return written
527
+
528
+
529
def _v3_index_disk_shape(data: dict[str, object]) -> dict[str, object]:
    """
    Produce the slim on-disk form of a V3 index dict.

    Works on a shallow copy of ``data``:

    * ``structure`` and ``outline_bundle`` are slimmed with the
      ``chunksmith_v3.outline_tree`` helpers;
    * when no ``media_by_node`` entry has ``elements``, elements are
      redistributed into it from the newest ``*_unstructured_elements.json``
      under ``{artifact_root}/json`` or, failing that, from the top-level
      ``elements`` list;
    * ``image_base64`` is removed from every image entry in ``media_by_node``;
    * the result is passed through ``omit_top_level_elements_for_export``.
    """
    from chunksmith_v3.outline_tree import (
        omit_top_level_elements_for_export,
        redistribute_top_level_elements_to_media,
        slim_outline_bundle_for_export,
        strip_redundant_page_fields,
        strip_text_from_structure,
    )

    slimmed = dict(data)
    structure = slimmed.get("structure")
    if isinstance(structure, list):
        slimmed["structure"] = strip_redundant_page_fields(strip_text_from_structure(structure))
    outline_bundle = slimmed.get("outline_bundle")
    if isinstance(outline_bundle, dict):
        slimmed["outline_bundle"] = slim_outline_bundle_for_export(outline_bundle)

    media = slimmed.get("media_by_node")
    top_level_elements = slimmed.get("elements")
    if isinstance(media, dict) and isinstance(structure, list):
        media_has_elements = any(
            isinstance(node_media, dict) and node_media.get("elements") for node_media in media.values()
        )
        if not media_has_elements:
            from chunksmith_v3.parsed_document import build_parsed_document_from_elements

            doc_name = str(slimmed.get("doc_name") or "document.pdf")
            parsed = None

            # First choice: newest unstructured-elements sidecar in the artifact folder.
            root_ref = slimmed.get("artifact_root")
            if isinstance(root_ref, str):
                sidecar_dir = Path(root_ref) / "json"
                if sidecar_dir.is_dir():
                    sidecars = list(sidecar_dir.glob("*_unstructured_elements.json"))
                    sidecars.sort(key=lambda item: item.stat().st_mtime, reverse=True)
                    if sidecars:
                        loaded = json.loads(sidecars[0].read_text(encoding="utf-8"))
                        if isinstance(loaded, list):
                            parsed = build_parsed_document_from_elements(
                                doc_name,
                                [entry for entry in loaded if isinstance(entry, dict)],
                            )

            # Fallback: the top-level ``elements`` list of the index itself.
            if parsed is None and isinstance(top_level_elements, list) and top_level_elements:
                parsed = build_parsed_document_from_elements(
                    doc_name,
                    [entry for entry in top_level_elements if isinstance(entry, dict)],
                )

            if parsed is not None:
                # The original (unslimmed) structure list is passed on purpose-preserving the source call.
                slimmed["media_by_node"] = redistribute_top_level_elements_to_media(
                    structure=structure,
                    media_by_node=media,
                    parsed=parsed,
                    doc_name=doc_name,
                )

    current_media = slimmed.get("media_by_node")
    if isinstance(media, dict) or isinstance(current_media, dict):
        slim_media: dict[str, object] = {}
        for node_id, node_media in (current_media or media).items():  # type: ignore[union-attr]
            if not isinstance(node_media, dict):
                continue
            row = dict(node_media)
            images = row.get("images")
            if isinstance(images, list):
                stripped = []
                for image in images:
                    if isinstance(image, dict):
                        stripped.append({k: v for k, v in image.items() if k != "image_base64"})
                    else:
                        stripped.append(image)
                row["images"] = stripped
            slim_media[str(node_id)] = row
        slimmed["media_by_node"] = slim_media

    return omit_top_level_elements_for_export(slimmed)
596
+
597
+
598
def archive_v3_cli_outputs(
    storage_root: Path,
    pdf_stem: str,
    index_dict: dict[str, object],
    *,
    stamp: str | None = None,
    artifact_root: Path | None = None,
) -> dict[str, Path]:
    """
    Write the slim V3 index and its pageindex for one run (shared stamp).

    With ``artifact_root`` set, both files are written into the layout's JSON
    folder and the returned mapping also contains ``artifact_root`` and
    ``artifact_v3_index``. Otherwise flat files are written under
    ``storage_root``.

    Returns a mapping from role name to the written path.
    """
    from chunksmith_cli.core.artifact_layout import ensure_artifact_layout

    stamp = stamp or cli_archive_stamp()
    slim = _v3_index_disk_shape(index_dict)
    pageindex = {
        field: slim.get(field)
        for field in ("doc_name", "structure", "doc_description", "indexing_mode")
    }

    if artifact_root is None:
        # Legacy flat layout.
        flat: dict[str, Path] = {}
        flat["v3_index"] = archive_cli_json_object(
            storage_root,
            pdf_stem,
            slim,
            stamp=stamp,
            suffix="_v3_index",
        )
        flat["pageindex"] = archive_cli_json_object(
            storage_root,
            pdf_stem,
            pageindex,
            stamp=stamp,
            suffix="_pageindex",
        )
        return flat

    layout = ensure_artifact_layout(artifact_root, pdf_stem)
    index_file = layout.json_dir / f"{pdf_stem}_{stamp}_v3_index.json"
    pageindex_file = layout.json_dir / f"{pdf_stem}_{stamp}_pageindex.json"
    index_file.write_text(json.dumps(slim, ensure_ascii=False, indent=2), encoding="utf-8")
    pageindex_file.write_text(json.dumps(pageindex, ensure_ascii=False, indent=2), encoding="utf-8")
    return {
        "v3_index": index_file.resolve(),
        "pageindex": pageindex_file.resolve(),
        "artifact_root": layout.root.resolve(),
        "artifact_v3_index": index_file.resolve(),
    }
655
+
656
+
657
def _title_index_disk_shape(data: dict[str, object]) -> dict[str, object]:
    """
    Return a copy of ``data`` without ``title_coded_formate``.

    When the removed value is truthy, its string length is recorded under
    ``title_coded_formate_chars`` instead. The input dict is not mutated.
    """
    slim = {key: value for key, value in data.items() if key != "title_coded_formate"}
    coded = data.get("title_coded_formate")
    if coded:
        slim["title_coded_formate_chars"] = len(str(coded))
    return slim
664
+
665
+
666
def archive_title_cli_outputs(
    storage_root: Path,
    pdf_stem: str,
    index_dict: dict[str, object],
    *,
    stamp: str | None = None,
    artifact_root: Path | None = None,
) -> dict[str, Path]:
    """
    Persist the artifacts of one title-indexer run under a single stamp.

    Written roles: ``title_index`` (slim index), ``pageindex``,
    ``title_coded_formate`` (text file, when a non-empty string),
    ``chunk_nodes`` (when a list) and ``unstructured_elements`` (when a
    non-empty list). With ``artifact_root`` set the files go into the artifact
    layout and ``artifact_root`` is added to the result; otherwise flat files
    are written under ``storage_root``.

    Returns a mapping from role name to the written path.
    """
    from chunksmith_cli.core.artifact_layout import ensure_artifact_layout

    stamp = stamp or cli_archive_stamp()
    slim = _title_index_disk_shape(index_dict)
    pageindex: dict[str, object] = {
        field: slim.get(field)
        for field in ("doc_name", "structure", "doc_description", "indexing_mode")
    }

    # Carry selected diagnostics over from the nested outline pageindex, when present.
    outline_bundle = index_dict.get("outline_bundle")
    nested_pageindex = outline_bundle.get("pageindex") if isinstance(outline_bundle, dict) else None
    if isinstance(nested_pageindex, dict):
        for extra in ("extraction_warnings", "outline_thinking", "outline_verification"):
            if extra in nested_pageindex:
                pageindex[extra] = nested_pageindex[extra]

    coded = index_dict.get("title_coded_formate")
    chunk_nodes = index_dict.get("chunk_nodes")
    elements = index_dict.get("elements")
    has_coded = isinstance(coded, str) and bool(coded)
    has_chunk_nodes = isinstance(chunk_nodes, list)
    has_elements = isinstance(elements, list) and bool(elements)

    written: dict[str, Path] = {}

    if artifact_root is None:
        # Legacy flat layout.
        written["title_index"] = archive_cli_json_object(
            storage_root, pdf_stem, slim, stamp=stamp, suffix="_title_index"
        )
        written["pageindex"] = archive_cli_json_object(
            storage_root, pdf_stem, pageindex, stamp=stamp, suffix="_pageindex"
        )
        if has_coded:
            text_file = storage_root / f"{pdf_stem}_{stamp}_title_coded_formate.txt"
            text_file.write_text(coded, encoding="utf-8")
            written["title_coded_formate"] = text_file.resolve()
        if has_chunk_nodes:
            written["chunk_nodes"] = archive_cli_json_object(
                storage_root, pdf_stem, chunk_nodes, stamp=stamp, suffix="_chunk_nodes"
            )
        if has_elements:
            written["unstructured_elements"] = archive_cli_json_object(
                storage_root, pdf_stem, elements, stamp=stamp, suffix="_unstructured_elements"
            )
        return written

    def _dump(payload: object) -> str:
        return json.dumps(payload, ensure_ascii=False, indent=2)

    layout = ensure_artifact_layout(artifact_root, pdf_stem)
    index_file = layout.json_dir / f"{pdf_stem}_{stamp}_title_index.json"
    pageindex_file = layout.pageindex_json_path(stamp=stamp)
    index_file.parent.mkdir(parents=True, exist_ok=True)
    index_file.write_text(_dump(slim), encoding="utf-8")
    written["title_index"] = index_file.resolve()
    pageindex_file.write_text(_dump(pageindex), encoding="utf-8")
    written["pageindex"] = pageindex_file.resolve()
    if has_coded:
        coded_file = layout.text_dir / f"{pdf_stem}_{stamp}_title_coded_formate.txt"
        coded_file.parent.mkdir(parents=True, exist_ok=True)
        coded_file.write_text(coded, encoding="utf-8")
        written["title_coded_formate"] = coded_file.resolve()
    if has_chunk_nodes:
        nodes_file = layout.json_dir / f"{pdf_stem}_{stamp}_chunk_nodes.json"
        nodes_file.write_text(_dump(chunk_nodes), encoding="utf-8")
        written["chunk_nodes"] = nodes_file.resolve()
    if has_elements:
        elements_file = layout.unstructured_json_path(stamp=stamp)
        elements_file.write_text(_dump(elements), encoding="utf-8")
        written["unstructured_elements"] = elements_file.resolve()
    written["artifact_root"] = layout.root.resolve()
    return written
739
+
740
+
741
def _multi_index_disk_shape(data: dict[str, object], *, max_chars: int = 50000) -> dict[str, object]:
    """
    Return a shallow copy of ``data`` with oversized TOON strings replaced by a placeholder.

    For ``flat_rows_toon`` and ``structure_toon``, a value whose string form is
    longer than ``max_chars`` characters is replaced by a short note stating
    the omitted length. All other keys are copied unchanged and the input dict
    is not mutated.

    Args:
        data: Multi-index result dict.
        max_chars: Largest string length that is still kept inline
            (default 50000, the previously hard-coded limit).
    """
    out = dict(data)
    for key in ("flat_rows_toon", "structure_toon"):
        if key not in out:
            continue
        # Falsy values (None, "") count as length 0; measure once and reuse.
        size = len(str(out[key] or ""))
        if size > max_chars:
            out[key] = f"(omitted on disk — {size} chars; see run artifacts)"
    return out
750
+
751
+
752
def archive_multi_indexing_cli_outputs(
    storage_root: Path,
    pdf_stem: str,
    index_dict: dict[str, object],
    *,
    elements: list[dict[str, object]] | None = None,
    title_coded_formate: str | None = None,
    title_outline: dict[str, object] | None = None,
    mapper_output: dict[str, object] | None = None,
    stamp: str | None = None,
    artifact_root: Path | None = None,
) -> dict[str, Path]:
    """Write the outputs of one multi-indexing CLI run and report their paths.

    Two storage modes are supported:

    * ``artifact_root`` given: ``elements``, ``title_coded_formate`` and
      ``title_outline`` are handed to ``persist_multi_indexing_artifacts``;
      the multi-index, pageindex and (if non-empty) mapper JSON files are
      written under ``artifact_root / ARTIFACT_SUBDIR_JSON``.
      ``storage_root`` is not used in this mode.
    * ``artifact_root`` is ``None``: everything is written flat under
      ``storage_root`` via ``archive_cli_json_object``, plus a
      ``*_title_coded_formate.txt`` file when coded text is supplied.

    In both modes the multi-index JSON gains a ``path_coded_text`` entry
    when a coded-text file was produced.

    Args:
        storage_root: Directory for the flat (no ``artifact_root``) mode.
        pdf_stem: File-name prefix shared by all outputs.
        index_dict: Full multi-index payload; slimmed before being written.
        elements: Optional element list to archive.
        title_coded_formate: Optional coded markup text.
        title_outline: Optional outline, forwarded only in artifact mode
            (and only when it is a dict).
        mapper_output: Optional mapper result; written only when a
            non-empty dict.
        stamp: File-name stamp; ``cli_archive_stamp()`` is used when falsy.
        artifact_root: Root of the structured artifact layout, if any.

    Returns:
        Mapping from output label (e.g. ``"multi_index"``, ``"pageindex"``,
        ``"mapper"``) to the resolved path that was written.
    """
    import json

    from chunksmith_cli.pipelines.multi_indexing_storage import persist_multi_indexing_artifacts

    def as_json(payload: object) -> str:
        # Single place for the on-disk JSON formatting used by this function.
        return json.dumps(payload, ensure_ascii=False, indent=2)

    if not stamp:
        stamp = cli_archive_stamp()
    disk_index = _multi_index_disk_shape(index_dict)
    outline_doc: dict[str, object] = {
        field: disk_index.get(field)
        for field in (
            "doc_name",
            "structure",
            "doc_description",
            "llm_context_format",
            "indexer_options",
            "outline_thinking",
        )
    }
    written: dict[str, Path] = {}

    if artifact_root is None:
        # Flat mode: every output sits directly under storage_root.
        written["multi_index"] = archive_cli_json_object(
            storage_root, pdf_stem, disk_index, stamp=stamp, suffix="_multi_index"
        )
        written["pageindex"] = archive_cli_json_object(
            storage_root, pdf_stem, outline_doc, stamp=stamp, suffix="_pageindex"
        )
        if isinstance(elements, list) and elements:
            written["unstructured_elements"] = archive_cli_json_object(
                storage_root, pdf_stem, elements, stamp=stamp, suffix="_unstructured_elements"
            )
        if isinstance(title_coded_formate, str) and title_coded_formate.strip():
            coded_file = storage_root / f"{pdf_stem}_{stamp}_title_coded_formate.txt"
            coded_file.write_text(title_coded_formate, encoding="utf-8")
            coded_resolved = coded_file.resolve()
            written["title_coded_formate"] = coded_resolved
            # The multi-index JSON was already written above; rewrite it so
            # it also points at the coded-text file.
            disk_index = {**disk_index, "path_coded_text": str(coded_resolved)}
            written["multi_index"].write_text(as_json(disk_index), encoding="utf-8")
        if isinstance(mapper_output, dict) and mapper_output:
            written["mapper"] = archive_cli_json_object(
                storage_root, pdf_stem, mapper_output, stamp=stamp, suffix="_mapper"
            )
        return written

    # Artifact-layout mode: delegate elements / coded text / outline first.
    stored = persist_multi_indexing_artifacts(
        artifact_root=artifact_root,
        pdf_stem=pdf_stem,
        stamp=stamp,
        elements=elements,
        title_coded_formate=title_coded_formate,
        title_outline=title_outline if isinstance(title_outline, dict) else None,
    )
    written.update(stored)

    json_dir = artifact_root.resolve() / ARTIFACT_SUBDIR_JSON
    json_dir.mkdir(parents=True, exist_ok=True)
    index_file = json_dir / f"{pdf_stem}_{stamp}_multi_index.json"
    outline_file = json_dir / f"{pdf_stem}_{stamp}_pageindex.json"

    if "title_coded_formate" in stored:
        disk_index = {**disk_index, "path_coded_text": str(stored["title_coded_formate"])}

    index_file.write_text(as_json(disk_index), encoding="utf-8")
    outline_file.write_text(as_json(outline_doc), encoding="utf-8")
    written["multi_index"] = index_file.resolve()
    written["pageindex"] = outline_file.resolve()

    if isinstance(mapper_output, dict) and mapper_output:
        mapper_file = json_dir / f"{pdf_stem}_{stamp}_mapper.json"
        mapper_file.write_text(as_json(mapper_output), encoding="utf-8")
        written["mapper"] = mapper_file.resolve()
    return written
827
+
828
+
829
def archive_pageindex_cli_outputs(
    storage_root: Path,
    pdf_stem: str,
    outline: dict[str, object],
    *,
    stamp: str | None = None,
    artifact_root: Path | None = None,
    tagged_pages: str | None = None,
) -> dict[str, Path]:
    """Write the outputs of a standalone PageIndexer CLI run.

    A pageindex JSON document is built from the ``doc_name``, ``structure``
    and ``doc_description`` entries of *outline* plus a fixed
    ``indexing_mode`` marker. When ``artifact_root`` is given, the files go
    into the layout returned by ``ensure_artifact_layout``; otherwise they
    are written flat under ``storage_root``.

    Args:
        storage_root: Directory for the flat (no ``artifact_root``) mode.
        pdf_stem: File-name prefix shared by all outputs.
        outline: Source of the pageindex fields.
        stamp: File-name stamp; ``cli_archive_stamp()`` is used when falsy.
        artifact_root: Root of the structured artifact layout, if any.
        tagged_pages: Optional tagged-pages text; written only when it is a
            string that is non-empty after stripping whitespace.

    Returns:
        Mapping with ``"pageindex"`` and, where applicable,
        ``"artifact_root"`` and ``"tagged_pages"``, each a resolved path.
    """
    import json

    from chunksmith_cli.core.artifact_layout import ensure_artifact_layout

    if not stamp:
        stamp = cli_archive_stamp()

    record: dict[str, object] = {
        field: outline.get(field) for field in ("doc_name", "structure", "doc_description")
    }
    record["indexing_mode"] = "chunksmith_pageindex"

    keep_tagged = isinstance(tagged_pages, str) and bool(tagged_pages.strip())
    tagged_name = f"{pdf_stem}_{stamp}_tagged_pages.txt"
    written: dict[str, Path] = {}

    if artifact_root is None:
        # Flat mode: everything sits directly under storage_root.
        written["pageindex"] = archive_cli_json_object(
            storage_root, pdf_stem, record, stamp=stamp, suffix="_pageindex"
        )
        if keep_tagged:
            flat_file = storage_root / tagged_name
            flat_file.write_text(tagged_pages, encoding="utf-8")
            written["tagged_pages"] = flat_file.resolve()
        return written

    # Artifact-layout mode.
    layout = ensure_artifact_layout(artifact_root, pdf_stem)
    json_file = layout.pageindex_json_path(stamp=stamp)
    json_file.write_text(json.dumps(record, ensure_ascii=False, indent=2), encoding="utf-8")
    written["pageindex"] = json_file.resolve()
    written["artifact_root"] = layout.root.resolve()
    if keep_tagged:
        layout_file = layout.text_dir / tagged_name
        layout_file.write_text(tagged_pages, encoding="utf-8")
        written["tagged_pages"] = layout_file.resolve()
    return written