chunksmith-cli 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. chunksmith_cli/__init__.py +3 -0
  2. chunksmith_cli/__main__.py +41 -0
  3. chunksmith_cli/agent/__init__.py +1 -0
  4. chunksmith_cli/agent/agent_display.py +160 -0
  5. chunksmith_cli/agent/agent_session.py +294 -0
  6. chunksmith_cli/agent/agent_stream.py +152 -0
  7. chunksmith_cli/agent/agent_wizard.py +5 -0
  8. chunksmith_cli/agent_display.py +6 -0
  9. chunksmith_cli/agent_session.py +6 -0
  10. chunksmith_cli/agent_stream.py +6 -0
  11. chunksmith_cli/agent_wizard.py +6 -0
  12. chunksmith_cli/assets/chunksmith_logo.png +0 -0
  13. chunksmith_cli/branding.py +6 -0
  14. chunksmith_cli/config.py +6 -0
  15. chunksmith_cli/core/__init__.py +21 -0
  16. chunksmith_cli/core/artifact_layout.py +60 -0
  17. chunksmith_cli/core/branding.py +137 -0
  18. chunksmith_cli/core/config.py +32 -0
  19. chunksmith_cli/core/media_preview.py +72 -0
  20. chunksmith_cli/core/menu.py +110 -0
  21. chunksmith_cli/core/menus.py +55 -0
  22. chunksmith_cli/core/panels.py +24 -0
  23. chunksmith_cli/core/paths.py +869 -0
  24. chunksmith_cli/core/prefs_mapper.py +97 -0
  25. chunksmith_cli/core/saved_catalog.py +180 -0
  26. chunksmith_cli/core/theme.py +38 -0
  27. chunksmith_cli/elements_json_prompt.py +6 -0
  28. chunksmith_cli/json_view.py +6 -0
  29. chunksmith_cli/media_preview.py +6 -0
  30. chunksmith_cli/menu.py +6 -0
  31. chunksmith_cli/menus.py +55 -0
  32. chunksmith_cli/multi_indexing_wizard.py +3 -0
  33. chunksmith_cli/outline_browser.py +6 -0
  34. chunksmith_cli/panels.py +6 -0
  35. chunksmith_cli/partition_prefs.py +74 -0
  36. chunksmith_cli/paths.py +6 -0
  37. chunksmith_cli/pdf_prompt.py +6 -0
  38. chunksmith_cli/pipelines/__init__.py +1 -0
  39. chunksmith_cli/pipelines/mapping_validation.py +31 -0
  40. chunksmith_cli/pipelines/multi_indexing_config.py +35 -0
  41. chunksmith_cli/pipelines/multi_indexing_prompts.py +375 -0
  42. chunksmith_cli/pipelines/multi_indexing_runtime.py +38 -0
  43. chunksmith_cli/pipelines/multi_indexing_storage.py +157 -0
  44. chunksmith_cli/pipelines/multi_indexing_wizard.py +218 -0
  45. chunksmith_cli/pipelines/pageindex_wizard.py +140 -0
  46. chunksmith_cli/pipelines/run_multi.py +21 -0
  47. chunksmith_cli/prompts/__init__.py +1 -0
  48. chunksmith_cli/prompts/elements_json_prompt.py +49 -0
  49. chunksmith_cli/prompts/pdf_prompt.py +37 -0
  50. chunksmith_cli/saved_catalog.py +6 -0
  51. chunksmith_cli/theme.py +6 -0
  52. chunksmith_cli/tree_view.py +6 -0
  53. chunksmith_cli/view_session.py +6 -0
  54. chunksmith_cli/views/__init__.py +1 -0
  55. chunksmith_cli/views/json_view.py +11 -0
  56. chunksmith_cli/views/outline_browser.py +247 -0
  57. chunksmith_cli/views/tree_view.py +59 -0
  58. chunksmith_cli/views/view_session.py +32 -0
  59. chunksmith_cli/wizard.py +357 -0
  60. chunksmith_cli-0.4.0.dist-info/METADATA +61 -0
  61. chunksmith_cli-0.4.0.dist-info/RECORD +65 -0
  62. chunksmith_cli-0.4.0.dist-info/WHEEL +5 -0
  63. chunksmith_cli-0.4.0.dist-info/entry_points.txt +2 -0
  64. chunksmith_cli-0.4.0.dist-info/licenses/LICENSE.vectify +21 -0
  65. chunksmith_cli-0.4.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,375 @@
1
+ """Prompts for the unified multi-indexing CLI wizard (JSON + TOON, MVL/notebook parity)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from rich.console import Console
8
+ from rich.prompt import Confirm, Prompt
9
+
10
+ from chunksmith_cli.core.paths import cli_runs_dir, normalize_path_string, resolve_pdf_candidate
11
+ from chunksmith_cli.partition_prefs import PdfPartitionPreferences
12
+
13
+
14
def prompt_element_source(console: Console) -> tuple[str, bool]:
    """
    Ask where the Unstructured elements come from.

    Prints the two-item "Step 2" menu and reads a choice (default ``2``).
    Returns ``("json", False)`` for choice ``1`` (load an existing elements
    JSON) and ``("partition", True)`` otherwise (partition the PDF through the
    Unstructured API; the menu text notes no local partition is offered).
    """
    menu_text = "".join(
        (
            "\n[bold cyan]Step 2 — Elements[/bold cyan]\n",
            " [cyan]1[/cyan] — Load existing [bold]Unstructured JSON[/bold] (array of elements)\n",
            " [cyan]2[/cyan] — [bold]Partition PDF via Unstructured API[/bold] ",
            "[dim](``UNSTRUCTURED_API_KEY`` in ``.env``)[/dim]\n",
        )
    )
    console.print(menu_text)
    picked = Prompt.ask("Element source", choices=["1", "2"], default="2", show_choices=True)
    load_existing_json = picked == "1"
    return ("json", False) if load_existing_json else ("partition", True)
30
+
31
+
32
def prompt_unstructured_ocr_language(console: Console, *, default: str = "eng") -> str:
    """
    Ask for the Unstructured ``languages`` value used for OCR / partitioning.

    The answer is stripped of surrounding whitespace; a blank answer yields
    ``default``. The text is returned as typed (comma-separated codes are not
    split or validated here).
    """
    heading = "\n[bold]OCR / partition language[/bold]\n"
    hint = " [dim]Unstructured ``languages`` (e.g. eng, deu, spa). Comma-separated for multiple.[/dim]\n"
    console.print(heading + hint)
    answer = Prompt.ask("[bold]Language code(s)[/bold]", default=default)
    cleaned = answer.strip()
    if cleaned:
        return cleaned
    return default
39
+
40
+
41
def preflight_unstructured_api(console: Console, preflight_api_partition) -> bool:
    """
    Run the supplied preflight callable and report its message on the console.

    ``preflight_api_partition`` is called with no arguments and must return an
    ``(ok, message)`` pair. The message is printed in green on success and in
    bold red on failure; the return value is ``True`` / ``False`` accordingly.
    """
    ok, msg = preflight_api_partition()
    passed = True if ok else False
    colour = "green" if passed else "bold red"
    console.print(f"[{colour}]{msg}[/{colour}]")
    return passed
49
+
50
+
51
def prompt_partition_chunking(
    console: Console, *, default_pages: int = 10, default_concurrent: int = 2
) -> tuple[int, int]:
    """
    Ask for ``pages_per_chunk`` and ``max_concurrent`` for API partitioning.

    Each answer is parsed as an integer; anything unparsable silently falls
    back to the corresponding default. The result is clamped so that
    ``pages_per_chunk >= 0`` and ``max_concurrent >= 1``.
    """
    console.print(
        "\n[bold]Partition chunking[/bold]\n"
        + " [dim]``pages_per_chunk``: pages per API request (default 10). Use [bold]0[/bold] for one full-PDF call.[/dim]\n"
        + " [dim]``max_concurrent``: parallel chunk requests (default 2).[/dim]\n"
    )
    questions = (
        ("Pages per chunk", default_pages),
        ("Max concurrent chunks", default_concurrent),
    )
    parsed: list[int] = []
    for label, fallback in questions:
        typed = Prompt.ask(label, default=str(fallback)).strip()
        try:
            number = int(typed)
        except ValueError:
            number = fallback
        parsed.append(number)
    pages_per_chunk, max_concurrent = parsed
    # Clamp: 0 means "one full-PDF call" per the menu text; at least one worker.
    if pages_per_chunk < 0:
        pages_per_chunk = 0
    if max_concurrent < 1:
        max_concurrent = 1
    return pages_per_chunk, max_concurrent
70
+
71
+
72
def prompt_mapping_method(
    console: Console,
    *,
    mapping_methods: frozenset[str],
    use_group_by_title: bool,
    add_anchor: bool,
) -> str:
    """
    Let the user pick one mapper method from ``mapping_methods``.

    Methods are listed alphabetically and numbered from 1. ``group_indexing``
    and ``anchor_indexing`` are annotated in red when their prerequisite
    (``use_group_by_title`` / ``add_anchor``) is off; they remain selectable.

    The pre-selected default is ``group_indexing`` when group-by-title is on
    and that method exists; otherwise it is the first listed method that is
    *not* flagged as missing a prerequisite (falling back to the first entry
    if every method is flagged).

    Raises:
        ValueError: if ``mapping_methods`` is empty.
    """
    options = sorted(mapping_methods)
    if not options:
        raise ValueError("mapping_methods must contain at least one method")

    # Notes for methods whose prerequisite switch is currently off.
    unmet_notes: dict[str, str] = {}
    if not use_group_by_title:
        unmet_notes["group_indexing"] = " [red](requires group-by-title)[/red]"
    if not add_anchor:
        unmet_notes["anchor_indexing"] = " [red](requires add_anchor)[/red]"

    console.print("\n[bold]Mapper method[/bold] [dim](combined index layout)[/dim]\n")
    for i, name in enumerate(options, start=1):
        note = unmet_notes.get(name, "")
        console.print(f" [cyan]{i}[/cyan] — [bold]{name}[/bold]{note}")

    if use_group_by_title and "group_indexing" in options:
        default = "group_indexing"
    else:
        # Avoid defaulting to an entry we just flagged as unavailable.
        default = next((name for name in options if name not in unmet_notes), options[0])
    default_idx = str(options.index(default) + 1)

    choice = Prompt.ask(
        "Mapping method",
        choices=[str(i) for i in range(1, len(options) + 1)],
        default=default_idx,
    )
    return options[int(choice) - 1]
92
+
93
+
94
def prompt_group_by_title_prefs(
    console: Console,
    GroupByTitlePrefs: type,
    default_factory,
    prefs: PdfPartitionPreferences,
) -> object:
    """
    Chunking limits for ``group_elements_by_title`` (also updates ``prefs``).

    Asks for ``max_characters``, ``new_after_n_chars``,
    ``combine_text_under_n_chars`` and ``multipage_sections``, using the
    object returned by ``default_factory()`` for the defaults. The answers are
    copied onto ``prefs`` and returned as a new ``GroupByTitlePrefs`` instance.

    A blank or non-numeric answer to an integer question falls back to that
    question's default (with a short notice) instead of raising, matching the
    forgiving behaviour of ``prompt_partition_chunking``.
    """
    d = default_factory()
    console.print(
        "\n[bold cyan]Group-by-title chunking[/bold cyan]\n"
        "[dim]Controls how Unstructured elements are merged into ``TitleChunk`` sections.[/dim]\n"
    )

    def _ask_int(label: str, fallback: int) -> int:
        # One integer prompt; never lets a typo abort the wizard.
        raw = Prompt.ask(label, default=str(fallback)).strip()
        if not raw:
            return int(fallback)
        try:
            return int(raw)
        except ValueError:
            console.print(f"[yellow]Not a whole number; using default {fallback}.[/yellow]")
            return int(fallback)

    max_characters = _ask_int("[bold]max_characters[/bold] per chunk", d.max_characters)
    new_after = _ask_int("[bold]new_after_n_chars[/bold]", d.new_after_n_chars)
    combine_under = _ask_int(
        "[bold]combine_text_under_n_chars[/bold]",
        d.combine_text_under_n_chars,
    )
    multipage = Confirm.ask("[bold]multipage_sections[/bold]", default=bool(d.multipage_sections))

    # Mirror the answers onto the shared preferences object.
    prefs.max_characters = max_characters
    prefs.new_after_n_chars = new_after
    prefs.combine_text_under_n_chars = combine_under
    prefs.chunksmith_group_multipage_sections = multipage

    return GroupByTitlePrefs(
        max_characters=max_characters,
        new_after_n_chars=new_after,
        combine_text_under_n_chars=combine_under,
        multipage_sections=multipage,
    )
132
+
133
+
134
# Named presets for the "Element XML wrappers" menu: preset name -> set of
# Unstructured element type names that get a per-element XML tag.
_ELEMENT_XML_PRESETS: dict[str, frozenset[str]] = {
    "all": frozenset(
        {
            "FigureCaption",
            "Footer",
            "Formula",
            "Header",
            "Image",
            "ListItem",
            "NarrativeText",
            "Table",
            "Text",
            "Title",
        }
    ),
    "core": frozenset(
        {
            "Title",
            "NarrativeText",
            "Text",
            "ListItem",
            "Table",
            "Image",
            "Header",
            "Footer",
        }
    ),
    "minimal": frozenset({"Title", "NarrativeText", "Table", "Image"}),
    "none": frozenset(),
}
153
+
154
+
155
def _element_xml_from_preset(
    element_xml_types: tuple[str, ...],
    preset: str,
) -> dict[str, bool]:
    """
    Expand a preset name into an on/off flag for every element type.

    ``"all"`` turns every entry of ``element_xml_types`` on. Any other name is
    looked up in ``_ELEMENT_XML_PRESETS``; an unknown name behaves like an
    empty preset (everything off).
    """
    if preset == "all":
        return {type_name: True for type_name in element_xml_types}
    try:
        wanted = _ELEMENT_XML_PRESETS[preset]
    except KeyError:
        wanted = frozenset()
    flags: dict[str, bool] = {}
    for type_name in element_xml_types:
        flags[type_name] = type_name in wanted
    return flags
163
+
164
+
165
def _prompt_element_xml_per_type(
    console: Console,
    element_xml_types: tuple[str, ...],
    defaults: dict[str, bool],
) -> dict[str, bool]:
    """
    Ask one yes/no question per element type and return the answers.

    Each question defaults to the truthiness of ``defaults[name]`` (off when
    the name is absent). A one-line summary of the enabled types is printed
    before returning.
    """
    intro = (
        "\n[bold]Per-type XML wrappers[/bold]\n"
        "[dim]``y`` = emit ``<Type>…</Type>`` around that element; "
        "``n`` = include text without a type tag.[/dim]\n"
    )
    console.print(intro)

    answers: dict[str, bool] = {}
    position = 0
    for type_name in element_xml_types:
        position += 1
        preset_on = True if defaults.get(type_name, False) else False
        question = f" [cyan]{position:2d}[/cyan] [bold]{type_name}[/bold]"
        answers[type_name] = Confirm.ask(question, default=preset_on)

    switched_on = [type_name for type_name, flag in answers.items() if flag]
    if switched_on:
        listing = ", ".join(switched_on)
    else:
        listing = "(none)"
    console.print(f"\n[dim]Enabled ({len(switched_on)}/{len(element_xml_types)}):[/dim] " + listing)
    return answers
189
+
190
+
191
def prompt_coded_formate_options(
    console: Console,
    CodedFormateOptions: type,
    default_factory,
    element_xml_types: tuple[str, ...],
    *,
    use_group_by_title: bool,
) -> tuple[object, str, dict[str, bool] | None]:
    """
    Collect the XML wrapper toggles for the coded-format markup.

    (Per the original note these feed ``title_chunks_to_coded_formate`` /
    ``elements_to_coded_formate``.)

    Asks about page wrappers, optionally about grouped-title wrappers (only
    when ``use_group_by_title`` is true; otherwise the default is kept), and
    then which element types get per-element tags via a six-item menu.

    Returns ``(coded, xml_mode, xml_custom)``:
      * ``coded`` — a ``CodedFormateOptions`` built from the answers;
      * ``xml_mode`` — ``"all"``, ``"core"``, ``"minimal"``, ``"none"`` or
        ``"custom"`` (menu items 5 and 6 both report ``"custom"``);
      * ``xml_custom`` — a copy of the per-type map for ``"custom"``, else ``None``.
    """
    defaults = default_factory()
    default_map = dict(defaults.element_xml)

    console.print(
        "\n[bold cyan]Step — Coded format (markup)[/bold cyan]\n"
        + "[dim]Builds the string sent to the page-index LLM. "
        + "Saved after the run as[/dim] "
        + "[cyan]text/{stem}_{stamp}_title_coded_formate.txt[/cyan]\n"
    )
    add_page_xml = Confirm.ask(
        "[bold]Page wrappers[/bold] [dim](``<Page_P_Start>`` … ``<Page_P_End>``)[/dim]",
        default=bool(defaults.add_page_xml),
    )

    # Grouped-title wrappers are only asked about when grouping is in use.
    add_group_xml = bool(defaults.add_group_by_title_xml)
    if use_group_by_title:
        add_group_xml = Confirm.ask(
            "[bold]Grouped-title wrappers[/bold] [dim](``<Grouped_Title_k>``)[/dim]",
            default=bool(defaults.add_group_by_title_xml),
        )

    menu_lines = (
        "\n[bold]Element XML wrappers[/bold] [dim](which Unstructured types get tags)[/dim]\n",
        " [cyan]1[/cyan] — All types on\n",
        " [cyan]2[/cyan] — Core (Title, NarrativeText, Text, ListItem, Table, Image, Header, Footer)\n",
        " [cyan]3[/cyan] — Minimal (Title, NarrativeText, Table, Image)\n",
        " [cyan]4[/cyan] — None (no per-element tags)\n",
        " [cyan]5[/cyan] — Customize each type (y/n per line)\n",
        " [cyan]6[/cyan] — Keep package / ``.env`` defaults\n",
    )
    console.print("".join(menu_lines))

    # Pre-select "all on" only when the defaults enable every type; otherwise
    # pre-select "keep defaults".
    every_type_on = all(default_map.get(t, False) for t in element_xml_types)
    default_choice = "1" if every_type_on else "6"
    mode = Prompt.ask(
        "Element XML mode",
        choices=["1", "2", "3", "4", "5", "6"],
        default=default_choice,
        show_choices=True,
    )

    preset_for_choice = {"1": "all", "2": "core", "3": "minimal", "4": "none"}
    xml_custom: dict[str, bool] | None = None
    if mode in preset_for_choice:
        xml_mode = preset_for_choice[mode]
        element_xml = _element_xml_from_preset(element_xml_types, xml_mode)
    else:
        if mode == "5":
            element_xml = _prompt_element_xml_per_type(console, element_xml_types, default_map)
        else:
            element_xml = {t: bool(default_map.get(t, False)) for t in element_xml_types}
        xml_mode = "custom"
        xml_custom = dict(element_xml)

    chosen = [t for t in element_xml_types if element_xml.get(t)]
    console.print(f"[dim]Selected tags:[/dim] {', '.join(chosen) if chosen else '(none)'}")

    coded = CodedFormateOptions(
        add_page_xml=add_page_xml,
        add_group_by_title_xml=add_group_xml,
        element_xml=element_xml,
    )
    return coded, xml_mode, xml_custom
269
+
270
+
271
def prompt_partition_prefs_base(console: Console) -> PdfPartitionPreferences:
    """
    Return a fresh ``PdfPartitionPreferences`` carrying its built-in defaults.

    Despite the ``prompt_`` prefix nothing is asked here; ``console`` is
    accepted but not used.
    """
    base_prefs = PdfPartitionPreferences()
    return base_prefs
274
+
275
+
276
def prompt_partition_api_settings(console: Console, prefs: PdfPartitionPreferences) -> None:
    """
    Interactively fill the API partition settings on ``prefs`` (in place).

    Asks, in order: OCR language(s), image extraction, table extraction, then
    pages-per-chunk and max-concurrent. Each question is pre-filled with the
    value currently stored on ``prefs``.
    """
    language_answer = prompt_unstructured_ocr_language(console, default=prefs.languages)
    prefs.languages = language_answer

    want_images = Confirm.ask("Extract images from PDF?", default=prefs.extract_images)
    prefs.extract_images = want_images

    want_tables = Confirm.ask("Extract tables from PDF?", default=prefs.extract_tables)
    prefs.extract_tables = want_tables

    prefs.pages_per_chunk, prefs.max_concurrent = prompt_partition_chunking(
        console,
        default_pages=prefs.pages_per_chunk,
        default_concurrent=prefs.max_concurrent,
    )
287
+
288
+
289
def apply_coded_prefs(
    prefs: PdfPartitionPreferences,
    coded: object,
    xml_mode: str,
    xml_custom: dict[str, bool] | None,
) -> None:
    """
    Copy the coded-format choices onto ``prefs`` (in place).

    ``add_page_xml`` and ``add_group_by_title_xml`` are read from ``coded`` and
    coerced to ``bool``; either one defaults to ``True`` when ``coded`` lacks
    the attribute. ``xml_mode`` and ``xml_custom`` are stored unchanged.
    """
    for option_name in ("add_page_xml", "add_group_by_title_xml"):
        flag = bool(getattr(coded, option_name, True))
        setattr(prefs, "chunksmith_coded_" + option_name, flag)
    prefs.chunksmith_coded_element_xml_mode = xml_mode
    prefs.chunksmith_coded_element_xml_custom = xml_custom
299
+
300
+
301
def apply_indexer_prefs(
    prefs: PdfPartitionPreferences,
    indexer_options: object,
    *,
    summarize_outline_nodes: bool | None = None,
) -> None:
    """
    Copy page-index options from ``indexer_options`` onto ``prefs`` (in place).

    * ``add_anchor``, ``use_embedded_pdf_toc``, ``off_llm_parsing`` and
      ``add_summary`` are always copied (missing attributes default to
      ``False``, ``True``, ``False`` and ``True`` respectively).
    * The two probe flags are only filled in when the value already on
      ``prefs`` is ``None``; an existing ``True``/``False`` is left alone.
    * ``summarize_outline_nodes`` is written only when the argument is not ``None``.
    * ``generate_doc_summary`` is written only when the option is present and
      not ``None``.
    """
    always_copied = (
        ("chunksmith_pageindex_add_anchor", "add_anchor", False),
        ("chunksmith_pageindex_use_embedded_pdf_toc", "use_embedded_pdf_toc", True),
        ("chunksmith_pageindex_off_llm_parsing", "off_llm_parsing", False),
    )
    for pref_name, option_name, missing_value in always_copied:
        setattr(prefs, pref_name, bool(getattr(indexer_options, option_name, missing_value)))

    # Probe flags: respect a value that was already decided on prefs.
    for probe_name in ("probe_first_excerpt", "probe_merged_reflection"):
        pref_name = "chunksmith_" + probe_name
        if getattr(prefs, pref_name) is None:
            setattr(prefs, pref_name, bool(getattr(indexer_options, probe_name, False)))

    if summarize_outline_nodes is not None:
        prefs.summarize_outline_nodes = summarize_outline_nodes

    prefs.chunksmith_pageindex_add_outline_summaries = bool(
        getattr(indexer_options, "add_summary", True)
    )

    doc_summary_choice = getattr(indexer_options, "generate_doc_summary", None)
    if doc_summary_choice is not None:
        prefs.chunksmith_pageindex_generate_doc_summary = bool(doc_summary_choice)
320
+
321
+
322
def prompt_indexer_options(console: Console, IndexerOptions: type, default_factory) -> object:
    """
    Ask a yes/no question for each ``IndexerOptions`` flag.

    Every question defaults to the truthiness of the matching attribute on
    ``default_factory()``. The answers are passed as keyword arguments to
    ``IndexerOptions`` and the resulting object is returned.
    """
    defaults = default_factory()
    console.print(
        "\n[bold cyan]Step — Page index (LLM) options[/bold cyan]\n"
        + "[dim]Model and token limits come from ``.env`` (``PAGEINDEX_MODEL``, Azure vars, etc.).[/dim]\n"
    )

    # (keyword / attribute name, question text) in the order they are asked.
    questions = (
        ("add_summary", "Add per-section summaries"),
        ("add_anchor", "Add anchor tags in outline"),
        ("generate_doc_summary", "Generate document description"),
        ("use_embedded_pdf_toc", "Use embedded PDF table of contents"),
        ("off_llm_parsing", "TOC-only (skip LLM outline parsing)"),
        ("probe_first_excerpt", "Probe first excerpt (extra LLM call)"),
        ("probe_merged_reflection", "Probe merged-outline reflection"),
    )
    answers: dict[str, bool] = {}
    for attr, label in questions:
        current = getattr(defaults, attr)
        answers[attr] = Confirm.ask(label, default=bool(current))
    return IndexerOptions(**answers)
343
+
344
+
345
def prompt_artifact_root(console: Console, pdf_path: Path) -> Path | None:
    """
    Ask for an artifact folder; a blank answer means "no folder" (``None``).

    A non-blank answer is normalised and resolved through
    ``resolve_pdf_candidate``. An existing directory is returned resolved; a
    relative path is also tried against the current working directory; failing
    both, the directory is created (with parents) and returned resolved.
    ``pdf_path`` is accepted but not used here.
    """
    console.print(
        "\n[bold]Artifact folder[/bold]\n"
        + " [dim]Optional run folder under ``cli/data/runs/{pdf-stem}/`` "
        + "(json/, text/, etc.). Enter = save flat under ``cli/data/legacy`` only.[/dim]\n"
    )
    typed = Prompt.ask(
        "Artifact root [dim](Enter for legacy flat save)[/dim]",
        default="",
        show_default=False,
    )
    typed = typed.strip()
    if typed == "":
        return None

    candidate = resolve_pdf_candidate(normalize_path_string(typed))
    if not candidate.is_dir():
        if not candidate.is_absolute():
            from_cwd = (Path.cwd() / candidate).resolve()
            if from_cwd.is_dir():
                return from_cwd
        # Nothing there yet: create it.
        candidate.mkdir(parents=True, exist_ok=True)
    return candidate.resolve()
367
+
368
+
369
def prompt_runs_artifact_default(console: Console, pdf_path: Path) -> Path:
    """
    Offer ``cli_runs_dir() / <pdf stem>`` as the artifact folder.

    The default folder is created before the question is asked. If the user
    declines it, ``prompt_artifact_root`` is consulted; a blank answer there
    falls back to the default folder. Always returns a resolved path.
    """
    default_root = cli_runs_dir() / pdf_path.stem
    default_root.mkdir(parents=True, exist_ok=True)
    console.print(f"[dim]Default artifact root:[/dim] [cyan]{default_root}[/cyan]")

    accepted = Confirm.ask("Use this folder for this run?", default=True)
    if not accepted:
        custom_root = prompt_artifact_root(console, pdf_path)
        if custom_root:
            return custom_root
    return default_root.resolve()
@@ -0,0 +1,38 @@
1
+ """PyPI library bindings for the multi-indexing CLI wizard."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Any, Literal
7
+
8
+ from chunksmith_multimodal import MAPPING_METHODS, IndexerOptions, run_multi_indexing_with_preferences
9
+ from chunksmith_multimodal.coded_formate import CodedFormateOptions, ELEMENT_XML_TYPES
10
+ from chunksmith_multimodal.config.config import (
11
+ default_coded_formate_options,
12
+ default_group_by_title_prefs,
13
+ default_indexer_options,
14
+ )
15
+ from chunksmith_multimodal.helpers import GroupByTitlePrefs
16
+ from chunksmith_multimodal.unstructured import preflight_api_partition
17
+
18
+ Encoding = Literal["json", "toon"]
19
+
20
+
21
@dataclass(frozen=True)
class MultiIndexingModules:
    """
    Immutable bundle of the library objects the multi-indexing wizard uses.

    Only ``encoding`` must be supplied; every other field defaults to the
    object of the same name imported from ``chunksmith_multimodal`` at the top
    of this module. Inside the class body the right-hand side of each default
    is evaluated before the attribute is bound, so it resolves to that
    module-level import.
    """

    # Requested output encoding ("json" or "toon").
    encoding: Encoding
    # Name of the backing library package.
    package: str = "chunksmith_multimodal"
    # Options class for the page indexer, and the factories for default option objects.
    IndexerOptions: type = IndexerOptions
    default_indexer_options: Any = default_indexer_options
    default_coded_formate_options: Any = default_coded_formate_options
    default_group_by_title_prefs: Any = default_group_by_title_prefs
    # Option / preference classes instantiated by the prompts.
    CodedFormateOptions: type = CodedFormateOptions
    GroupByTitlePrefs: type = GroupByTitlePrefs
    # Element type names offered for per-element XML wrappers.
    ELEMENT_XML_TYPES: tuple[str, ...] = ELEMENT_XML_TYPES
    # Callable used for the API preflight check.
    preflight_api_partition: Any = preflight_api_partition
    # Mapper method names, frozen into a set at class-definition time.
    MAPPING_METHODS: frozenset[str] = frozenset(MAPPING_METHODS)
    # Entry point that runs the multi-indexing pipeline.
    run_multi_indexing_with_preferences: Any = run_multi_indexing_with_preferences
35
+
36
+
37
def load_multi_indexing(encoding: Encoding) -> MultiIndexingModules:
    """Return a ``MultiIndexingModules`` bundle for ``encoding`` with all other fields at their defaults."""
    bundle = MultiIndexingModules(encoding=encoding)
    return bundle
@@ -0,0 +1,157 @@
1
+ """Persist pickle, PNG, TOON, and slim JSON for multi-indexing CLI runs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import base64
6
+ import copy
7
+ import json
8
+ import pickle
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from chunksmith_cli.core.artifact_layout import ArtifactLayout, ensure_artifact_layout
13
+ from chunksmith_cli.core.media_preview import export_media_images_to_folder
14
+
15
+
16
def _element_image_base64(el: dict[str, Any]) -> str | None:
    """
    Return the element's ``image_base64`` string, or ``None`` if there is none.

    ``el["metadata"]["image_base64"]`` takes precedence over a top-level
    ``el["image_base64"]``; values that are not ``str`` are ignored.
    """
    candidates = []
    metadata = el.get("metadata")
    if isinstance(metadata, dict):
        candidates.append(metadata.get("image_base64"))
    candidates.append(el.get("image_base64"))
    for value in candidates:
        if isinstance(value, str):
            return value
    return None
22
+
23
+
24
def export_element_images_to_folder(
    elements: list[dict[str, Any]],
    dest_dir: Path,
    *,
    max_files: int = 10_000,
) -> list[Path]:
    """
    Write ``elem_{i}__p{page}__img0.png`` from Unstructured ``image_base64`` fields.

    ``i`` is the element's position in ``elements`` (zero-padded to four
    digits) and ``page`` is ``metadata["page_number"]``, falling back to a
    top-level ``page_number`` and then to ``0`` when missing or not an integer.

    Export is best-effort: non-dict entries, elements without image data,
    undecodable payloads, payloads that decode to zero bytes, and files that
    cannot be written are skipped. At most ``max_files`` files are written.

    Returns the paths written, in element order.
    """
    dest_dir.mkdir(parents=True, exist_ok=True)
    written: list[Path] = []
    for i, el in enumerate(elements):
        if len(written) >= max_files:
            break
        if not isinstance(el, dict):
            continue
        b64 = _element_image_base64(el)
        if not b64 or not b64.strip():
            continue
        try:
            raw = base64.b64decode(b64, validate=False)
        except ValueError:
            # binascii.Error (bad padding) subclasses ValueError; non-ASCII
            # text raises ValueError directly.
            continue
        if not raw:
            # Lenient decoding drops non-alphabet characters, so garbage can
            # decode to nothing; do not leave a zero-byte "image" behind.
            continue
        md = el.get("metadata") if isinstance(el.get("metadata"), dict) else {}
        page = md.get("page_number", el.get("page_number", 0))
        try:
            page_num = int(page) if page is not None else 0
        except (TypeError, ValueError):
            page_num = 0
        out = dest_dir / f"elem_{i:04d}__p{page_num}__img0.png"
        try:
            out.write_bytes(raw)
        except OSError:
            continue
        written.append(out)
    return written
58
+
59
+
60
def unstructured_json_without_base64(elements: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """
    Return deep copies of the dict elements with ``image_base64`` removed.

    The key is dropped both at the top level and inside a dict-valued
    ``metadata``. Non-dict entries are omitted from the result; the input
    list and its elements are left untouched.
    """

    def _without_image(element: dict[str, Any]) -> dict[str, Any]:
        clone = copy.deepcopy(element)
        clone.pop("image_base64", None)
        meta = clone.get("metadata")
        if isinstance(meta, dict):
            clone["metadata"] = {
                key: value for key, value in meta.items() if key != "image_base64"
            }
        return clone

    return [_without_image(el) for el in elements if isinstance(el, dict)]
75
+
76
+
77
def save_raw_elements_pickle(elements: list[dict[str, Any]], path: Path) -> Path:
    """
    Pickle ``elements`` to ``path`` using the highest pickle protocol.

    Missing parent directories are created first. Returns the resolved path.
    """
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    with open(path, "wb") as sink:
        pickler = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(elements)
    return path.resolve()
82
+
83
+
84
def save_toon_artifacts(
    layout: ArtifactLayout,
    *,
    stamp: str,
    title_outline: dict[str, Any],
) -> dict[str, Path]:
    """
    Write the TOON text artifacts found on ``title_outline`` into ``layout.text_dir``.

    * ``flat_rows_toon`` / ``structure_toon`` — written as
      ``{pdf_stem}_{stamp}_flat_rows.toon`` / ``..._structure.toon`` when the
      value is a non-blank string.
    * ``outline_thinking["llm_toon_trace"]`` — written as indented JSON to
      ``{pdf_stem}_{stamp}_llm_toon_trace.json`` when it is a non-empty list.

    Returns a mapping from artifact key to the resolved path of each file written.
    """
    written: dict[str, Path] = {}
    prefix = f"{layout.pdf_stem}_{stamp}"

    toon_outputs = (
        ("flat_rows_toon", "flat_rows.toon"),
        ("structure_toon", "structure.toon"),
    )
    for key, tail in toon_outputs:
        text = title_outline.get(key)
        if not isinstance(text, str) or not text.strip():
            continue
        target = layout.text_dir / f"{prefix}_{tail}"
        target.write_text(text, encoding="utf-8")
        written[key] = target.resolve()

    thinking = title_outline.get("outline_thinking")
    llm_trace = thinking.get("llm_toon_trace") if isinstance(thinking, dict) else None
    if isinstance(llm_trace, list) and llm_trace:
        target = layout.text_dir / f"{prefix}_llm_toon_trace.json"
        target.write_text(json.dumps(llm_trace, ensure_ascii=False, indent=2), encoding="utf-8")
        written["llm_toon_trace"] = target.resolve()

    return written
110
+
111
+
112
def persist_multi_indexing_artifacts(
    *,
    artifact_root: Path,
    pdf_stem: str,
    stamp: str,
    elements: list[dict[str, Any]] | None,
    title_coded_formate: str | None,
    title_outline: dict[str, Any] | None,
    media_by_node: dict[str, Any] | None = None,
) -> dict[str, Path]:
    """
    Write pickle, PNG, TOON, and coded text under ``artifact_root``.

    When ``elements`` is non-empty this also writes a slim elements JSON
    (``image_base64`` stripped) in which each element that had an image
    exported gets an ``image_path`` pointing at *its own* PNG.

    The original note says the remaining JSON files are written by
    :func:`cli.core.paths.archive_multi_indexing_cli_outputs`.

    Returns a mapping of artifact name to path; ``"artifact_root"`` is always
    present, other keys only when the corresponding artifact was written.
    """
    layout = ensure_artifact_layout(artifact_root, pdf_stem)
    paths: dict[str, Path] = {"artifact_root": layout.root}

    if elements:
        pkl = save_raw_elements_pickle(elements, layout.pickle_path(stamp=stamp))
        paths["pickle"] = pkl
        pngs = export_element_images_to_folder(elements, layout.image_dir)
        if pngs:
            paths["image_dir"] = layout.image_dir

        # Exported files are named ``elem_{index}__p{page}__img0.png``; recover
        # the source element index so each path lands on the right row. Only
        # elements that carried an image produce a PNG, so list position in
        # ``pngs`` says nothing about which element it belongs to.
        png_by_index: dict[int, Path] = {}
        for png in pngs:
            index_text = png.name.split("__", 1)[0][len("elem_"):]
            if index_text.isdigit():
                png_by_index[int(index_text)] = png

        slim = unstructured_json_without_base64(elements)
        # ``slim`` omits non-dict entries, so pair each row with the index its
        # source element had in ``elements``.
        dict_indices = [i for i, el in enumerate(elements) if isinstance(el, dict)]
        for source_index, row in zip(dict_indices, slim):
            png = png_by_index.get(source_index)
            if png is not None:
                row["image_path"] = str(png.resolve())

        uj = layout.unstructured_json_path(stamp=stamp)
        uj.write_text(json.dumps(slim, ensure_ascii=False, indent=2), encoding="utf-8")
        paths["unstructured_elements"] = uj.resolve()

    if isinstance(title_coded_formate, str) and title_coded_formate.strip():
        coded_path = layout.text_dir / f"{layout.pdf_stem}_{stamp}_title_coded_formate.txt"
        coded_path.write_text(title_coded_formate, encoding="utf-8")
        paths["title_coded_formate"] = coded_path.resolve()

    if isinstance(title_outline, dict):
        paths.update(save_toon_artifacts(layout, stamp=stamp, title_outline=title_outline))

    if isinstance(media_by_node, dict) and media_by_node:
        extra = export_media_images_to_folder(media_by_node, layout.image_dir, max_files=10_000)
        if extra:
            paths["image_dir"] = layout.image_dir

    return paths