chunksmith-cli 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunksmith_cli/__init__.py +3 -0
- chunksmith_cli/__main__.py +41 -0
- chunksmith_cli/agent/__init__.py +1 -0
- chunksmith_cli/agent/agent_display.py +160 -0
- chunksmith_cli/agent/agent_session.py +294 -0
- chunksmith_cli/agent/agent_stream.py +152 -0
- chunksmith_cli/agent/agent_wizard.py +5 -0
- chunksmith_cli/agent_display.py +6 -0
- chunksmith_cli/agent_session.py +6 -0
- chunksmith_cli/agent_stream.py +6 -0
- chunksmith_cli/agent_wizard.py +6 -0
- chunksmith_cli/assets/chunksmith_logo.png +0 -0
- chunksmith_cli/branding.py +6 -0
- chunksmith_cli/config.py +6 -0
- chunksmith_cli/core/__init__.py +21 -0
- chunksmith_cli/core/artifact_layout.py +60 -0
- chunksmith_cli/core/branding.py +137 -0
- chunksmith_cli/core/config.py +32 -0
- chunksmith_cli/core/media_preview.py +72 -0
- chunksmith_cli/core/menu.py +110 -0
- chunksmith_cli/core/menus.py +55 -0
- chunksmith_cli/core/panels.py +24 -0
- chunksmith_cli/core/paths.py +869 -0
- chunksmith_cli/core/prefs_mapper.py +97 -0
- chunksmith_cli/core/saved_catalog.py +180 -0
- chunksmith_cli/core/theme.py +38 -0
- chunksmith_cli/elements_json_prompt.py +6 -0
- chunksmith_cli/json_view.py +6 -0
- chunksmith_cli/media_preview.py +6 -0
- chunksmith_cli/menu.py +6 -0
- chunksmith_cli/menus.py +55 -0
- chunksmith_cli/multi_indexing_wizard.py +3 -0
- chunksmith_cli/outline_browser.py +6 -0
- chunksmith_cli/panels.py +6 -0
- chunksmith_cli/partition_prefs.py +74 -0
- chunksmith_cli/paths.py +6 -0
- chunksmith_cli/pdf_prompt.py +6 -0
- chunksmith_cli/pipelines/__init__.py +1 -0
- chunksmith_cli/pipelines/mapping_validation.py +31 -0
- chunksmith_cli/pipelines/multi_indexing_config.py +35 -0
- chunksmith_cli/pipelines/multi_indexing_prompts.py +375 -0
- chunksmith_cli/pipelines/multi_indexing_runtime.py +38 -0
- chunksmith_cli/pipelines/multi_indexing_storage.py +157 -0
- chunksmith_cli/pipelines/multi_indexing_wizard.py +218 -0
- chunksmith_cli/pipelines/pageindex_wizard.py +140 -0
- chunksmith_cli/pipelines/run_multi.py +21 -0
- chunksmith_cli/prompts/__init__.py +1 -0
- chunksmith_cli/prompts/elements_json_prompt.py +49 -0
- chunksmith_cli/prompts/pdf_prompt.py +37 -0
- chunksmith_cli/saved_catalog.py +6 -0
- chunksmith_cli/theme.py +6 -0
- chunksmith_cli/tree_view.py +6 -0
- chunksmith_cli/view_session.py +6 -0
- chunksmith_cli/views/__init__.py +1 -0
- chunksmith_cli/views/json_view.py +11 -0
- chunksmith_cli/views/outline_browser.py +247 -0
- chunksmith_cli/views/tree_view.py +59 -0
- chunksmith_cli/views/view_session.py +32 -0
- chunksmith_cli/wizard.py +357 -0
- chunksmith_cli-0.4.0.dist-info/METADATA +61 -0
- chunksmith_cli-0.4.0.dist-info/RECORD +65 -0
- chunksmith_cli-0.4.0.dist-info/WHEEL +5 -0
- chunksmith_cli-0.4.0.dist-info/entry_points.txt +2 -0
- chunksmith_cli-0.4.0.dist-info/licenses/LICENSE.vectify +21 -0
- chunksmith_cli-0.4.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
"""Prompts for the unified multi-indexing CLI wizard (JSON + TOON, MVL/notebook parity)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
from rich.prompt import Confirm, Prompt
|
|
9
|
+
|
|
10
|
+
from chunksmith_cli.core.paths import cli_runs_dir, normalize_path_string, resolve_pdf_candidate
|
|
11
|
+
from chunksmith_cli.partition_prefs import PdfPartitionPreferences
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def prompt_element_source(console: Console) -> tuple[str, bool]:
    """
    Ask where the wizard should obtain the Unstructured elements.

    Prints the "Step 2" menu and reads a choice of ``1`` or ``2`` (default ``2``).

    Returns ``(mode, partition)``: ``("json", False)`` when the user picks ``1``
    (load an existing elements JSON), ``("partition", True)`` otherwise
    (partition the PDF through the Unstructured API).
    """
    menu = (
        "\n[bold cyan]Step 2 — Elements[/bold cyan]\n"
        " [cyan]1[/cyan] — Load existing [bold]Unstructured JSON[/bold] (array of elements)\n"
        " [cyan]2[/cyan] — [bold]Partition PDF via Unstructured API[/bold] "
        "[dim](``UNSTRUCTURED_API_KEY`` in ``.env``)[/dim]\n"
    )
    console.print(menu)
    answer = Prompt.ask("Element source", choices=["1", "2"], default="2", show_choices=True)
    # Anything other than an explicit "1" means "partition via the API".
    wants_partition = answer != "1"
    mode = "partition" if wants_partition else "json"
    return mode, wants_partition
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def prompt_unstructured_ocr_language(console: Console, *, default: str = "eng") -> str:
    """
    Ask for the Unstructured ``languages`` value.

    Prints a short explanation, then prompts with ``default`` pre-filled.
    Returns the stripped answer, or ``default`` when the answer is blank.
    """
    console.print(
        "\n[bold]OCR / partition language[/bold]\n"
        " [dim]Unstructured ``languages`` (e.g. eng, deu, spa). Comma-separated for multiple.[/dim]\n"
    )
    answer = Prompt.ask("[bold]Language code(s)[/bold]", default=default)
    cleaned = answer.strip()
    if cleaned:
        return cleaned
    return default
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def preflight_unstructured_api(console: Console, preflight_api_partition) -> bool:
    """
    Run the supplied preflight callable and report its outcome on ``console``.

    ``preflight_api_partition`` is called with no arguments and must return an
    ``(ok, message)`` pair. The message is printed in green when ``ok`` is
    truthy and in bold red otherwise.

    Returns ``True`` when the preflight succeeded, ``False`` when it did not.
    """
    ok, msg = preflight_api_partition()
    colour = "green" if ok else "bold red"
    console.print(f"[{colour}]{msg}[/{colour}]")
    return bool(ok)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def prompt_partition_chunking(
    console: Console, *, default_pages: int = 10, default_concurrent: int = 2
) -> tuple[int, int]:
    """
    Ask for the partition chunking settings.

    Both questions are asked before either answer is parsed. An answer that is
    not an integer silently falls back to the corresponding default.

    Returns ``(pages_per_chunk, max_concurrent)`` with ``pages_per_chunk``
    clamped to at least 0 and ``max_concurrent`` clamped to at least 1.
    """
    console.print(
        "\n[bold]Partition chunking[/bold]\n"
        " [dim]``pages_per_chunk``: pages per API request (default 10). Use [bold]0[/bold] for one full-PDF call.[/dim]\n"
        " [dim]``max_concurrent``: parallel chunk requests (default 2).[/dim]\n"
    )
    answers = (
        Prompt.ask("Pages per chunk", default=str(default_pages)).strip(),
        Prompt.ask("Max concurrent chunks", default=str(default_concurrent)).strip(),
    )
    fallbacks = (default_pages, default_concurrent)

    parsed: list[int] = []
    for text, fallback in zip(answers, fallbacks):
        try:
            parsed.append(int(text))
        except ValueError:
            parsed.append(fallback)

    pages_per_chunk, max_concurrent = parsed
    return max(0, pages_per_chunk), max(1, max_concurrent)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def prompt_mapping_method(
    console: Console,
    *,
    mapping_methods: frozenset[str],
    use_group_by_title: bool,
    add_anchor: bool,
) -> str:
    """
    Let the user pick one mapper method from ``mapping_methods``.

    Methods are listed alphabetically and numbered from 1. ``group_indexing``
    and ``anchor_indexing`` carry a red note when their prerequisite flag
    (``use_group_by_title`` / ``add_anchor``) is off; the note is informational
    only and does not block the choice.

    The pre-selected entry is ``group_indexing`` when group-by-title is on and
    that method is available, otherwise the first listed method.

    Returns the chosen method name.
    """
    ordered = sorted(mapping_methods)
    console.print("\n[bold]Mapper method[/bold] [dim](combined index layout)[/dim]\n")

    # Notes for methods whose prerequisite is currently not satisfied.
    unmet: dict[str, str] = {}
    if not use_group_by_title:
        unmet["group_indexing"] = " [red](requires group-by-title)[/red]"
    if not add_anchor:
        unmet["anchor_indexing"] = " [red](requires add_anchor)[/red]"

    for number, method in enumerate(ordered, start=1):
        note = unmet.get(method, "")
        console.print(f" [cyan]{number}[/cyan] — [bold]{method}[/bold]{note}")

    if use_group_by_title and "group_indexing" in mapping_methods:
        preferred = "group_indexing"
    else:
        preferred = ordered[0]
    default_idx = str(ordered.index(preferred) + 1)

    valid = [str(number) for number in range(1, len(ordered) + 1)]
    picked = Prompt.ask("Mapping method", choices=valid, default=default_idx)
    return ordered[int(picked) - 1]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _ask_int_or_default(console: Console, label: str, default: int) -> int:
    """Prompt for an integer; fall back to ``default`` on blank or non-numeric input."""
    raw = Prompt.ask(label, default=str(default)).strip() or str(default)
    try:
        return int(raw)
    except ValueError:
        # Mirror prompt_partition_chunking: a typo must not abort the wizard.
        console.print(f"[yellow]Not a whole number; using default {default}.[/yellow]")
        return int(default)


def prompt_group_by_title_prefs(
    console: Console,
    GroupByTitlePrefs: type,
    default_factory,
    prefs: PdfPartitionPreferences,
) -> object:
    """
    Ask for the group-by-title chunking limits and record them.

    Defaults for every question come from ``default_factory()``. A blank or
    non-numeric answer to one of the three integer questions keeps that
    question's default instead of raising ``ValueError``.

    Side effect: the answers are also written onto ``prefs``
    (``max_characters``, ``new_after_n_chars``, ``combine_text_under_n_chars``
    and ``chunksmith_group_multipage_sections``).

    Returns a new ``GroupByTitlePrefs`` built from the four answers.
    """
    d = default_factory()
    console.print(
        "\n[bold cyan]Group-by-title chunking[/bold cyan]\n"
        "[dim]Controls how Unstructured elements are merged into ``TitleChunk`` sections.[/dim]\n"
    )
    max_characters = _ask_int_or_default(
        console, "[bold]max_characters[/bold] per chunk", d.max_characters
    )
    new_after = _ask_int_or_default(
        console, "[bold]new_after_n_chars[/bold]", d.new_after_n_chars
    )
    combine_under = _ask_int_or_default(
        console, "[bold]combine_text_under_n_chars[/bold]", d.combine_text_under_n_chars
    )
    multipage = Confirm.ask("[bold]multipage_sections[/bold]", default=bool(d.multipage_sections))

    # Keep the persisted preferences in step with the options object returned below.
    prefs.max_characters = max_characters
    prefs.new_after_n_chars = new_after
    prefs.combine_text_under_n_chars = combine_under
    prefs.chunksmith_group_multipage_sections = multipage

    return GroupByTitlePrefs(
        max_characters=max_characters,
        new_after_n_chars=new_after,
        combine_text_under_n_chars=combine_under,
        multipage_sections=multipage,
    )
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# Named presets: preset name -> element types that receive an XML wrapper tag.
_ELEMENT_XML_PRESETS: dict[str, frozenset[str]] = {
    "all": frozenset(
        {
            "FigureCaption",
            "Footer",
            "Formula",
            "Header",
            "Image",
            "ListItem",
            "NarrativeText",
            "Table",
            "Text",
            "Title",
        }
    ),
    "core": frozenset(
        {
            "Title",
            "NarrativeText",
            "Text",
            "ListItem",
            "Table",
            "Image",
            "Header",
            "Footer",
        }
    ),
    "minimal": frozenset({"Title", "NarrativeText", "Table", "Image"}),
    "none": frozenset(),
}
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _element_xml_from_preset(
    element_xml_types: tuple[str, ...],
    preset: str,
) -> dict[str, bool]:
    """
    Expand a preset name into an on/off flag for each entry of ``element_xml_types``.

    ``"all"`` switches every listed type on without consulting the preset
    table. Any other name enables only the types found in
    ``_ELEMENT_XML_PRESETS[preset]``; an unknown preset name therefore switches
    everything off.
    """
    if preset != "all":
        wanted = _ELEMENT_XML_PRESETS.get(preset, frozenset())
        return {kind: kind in wanted for kind in element_xml_types}
    return {kind: True for kind in element_xml_types}
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _prompt_element_xml_per_type(
    console: Console,
    element_xml_types: tuple[str, ...],
    defaults: dict[str, bool],
) -> dict[str, bool]:
    """
    Ask one yes/no question per element type, in the order given.

    Each question defaults to the value found in ``defaults`` for that type
    (``False`` when the type is missing). After the last answer a summary of
    the enabled types is printed.

    Returns a mapping of type name to the user's answer.
    """
    console.print(
        "\n[bold]Per-type XML wrappers[/bold]\n"
        "[dim]``y`` = emit ``<Type>…</Type>`` around that element; "
        "``n`` = include text without a type tag.[/dim]\n"
    )
    answers: dict[str, bool] = {
        kind: Confirm.ask(
            f" [cyan]{position:2d}[/cyan] [bold]{kind}[/bold]",
            default=bool(defaults.get(kind, False)),
        )
        for position, kind in enumerate(element_xml_types, start=1)
    }
    switched_on = [kind for kind, on in answers.items() if on]
    summary = ", ".join(switched_on) if switched_on else "(none)"
    console.print(
        f"\n[dim]Enabled ({len(switched_on)}/{len(element_xml_types)}):[/dim] " + summary
    )
    return answers
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def prompt_coded_formate_options(
    console: Console,
    CodedFormateOptions: type,
    default_factory,
    element_xml_types: tuple[str, ...],
    *,
    use_group_by_title: bool,
) -> tuple[object, str, dict[str, bool] | None]:
    """
    Ask which XML wrappers the coded-format markup should contain.

    Question order: page wrappers, grouped-title wrappers (asked only when
    ``use_group_by_title`` is true, otherwise the default is kept), then the
    element-wrapper mode (menu choices 1-6).

    Returns ``(coded, xml_mode, xml_custom)``:

    * ``coded`` — a ``CodedFormateOptions`` built from the answers;
    * ``xml_mode`` — ``"all"``, ``"core"``, ``"minimal"`` or ``"none"`` for
      choices 1-4, and ``"custom"`` for choices 5 and 6;
    * ``xml_custom`` — a copy of the per-type flags when ``xml_mode`` is
      ``"custom"``, otherwise ``None``.
    """
    defaults = default_factory()
    default_map = dict(defaults.element_xml)
    console.print(
        "\n[bold cyan]Step — Coded format (markup)[/bold cyan]\n"
        "[dim]Builds the string sent to the page-index LLM. "
        "Saved after the run as[/dim] "
        "[cyan]text/{stem}_{stamp}_title_coded_formate.txt[/cyan]\n"
    )
    add_page_xml = Confirm.ask(
        "[bold]Page wrappers[/bold] [dim](``<Page_P_Start>`` … ``<Page_P_End>``)[/dim]",
        default=bool(defaults.add_page_xml),
    )
    if use_group_by_title:
        add_group_xml = Confirm.ask(
            "[bold]Grouped-title wrappers[/bold] [dim](``<Grouped_Title_k>``)[/dim]",
            default=bool(defaults.add_group_by_title_xml),
        )
    else:
        # Not applicable without group-by-title: keep the configured default.
        add_group_xml = bool(defaults.add_group_by_title_xml)

    console.print(
        "\n[bold]Element XML wrappers[/bold] [dim](which Unstructured types get tags)[/dim]\n"
        " [cyan]1[/cyan] — All types on\n"
        " [cyan]2[/cyan] — Core (Title, NarrativeText, Text, ListItem, Table, Image, Header, Footer)\n"
        " [cyan]3[/cyan] — Minimal (Title, NarrativeText, Table, Image)\n"
        " [cyan]4[/cyan] — None (no per-element tags)\n"
        " [cyan]5[/cyan] — Customize each type (y/n per line)\n"
        " [cyan]6[/cyan] — Keep package / ``.env`` defaults\n"
    )
    # Pre-select "all on" only when every listed type is on by default;
    # any other default configuration pre-selects "keep defaults".
    every_type_on = all(default_map.get(kind, False) for kind in element_xml_types)
    default_choice = "1" if every_type_on else "6"
    mode = Prompt.ask(
        "Element XML mode",
        choices=["1", "2", "3", "4", "5", "6"],
        default=default_choice,
        show_choices=True,
    )

    preset_by_choice = {"1": "all", "2": "core", "3": "minimal", "4": "none"}
    xml_custom: dict[str, bool] | None = None
    if mode in preset_by_choice:
        xml_mode = preset_by_choice[mode]
        element_xml = _element_xml_from_preset(element_xml_types, xml_mode)
    else:
        if mode == "5":
            element_xml = _prompt_element_xml_per_type(console, element_xml_types, default_map)
        else:
            element_xml = {kind: bool(default_map.get(kind, False)) for kind in element_xml_types}
        # Choices 5 and 6 are both reported as an explicit custom map.
        xml_mode = "custom"
        xml_custom = dict(element_xml)

    selected = [kind for kind in element_xml_types if element_xml.get(kind)]
    shown = ", ".join(selected) if selected else "(none)"
    console.print(f"[dim]Selected tags:[/dim] {shown}")

    coded = CodedFormateOptions(
        add_page_xml=add_page_xml,
        add_group_by_title_xml=add_group_xml,
        element_xml=element_xml,
    )
    return coded, xml_mode, xml_custom
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
def prompt_partition_prefs_base(console: Console) -> PdfPartitionPreferences:
    """
    Return a fresh ``PdfPartitionPreferences`` built from its own defaults.

    ``console`` is accepted but not used: nothing is printed or asked here.
    """
    base_prefs = PdfPartitionPreferences()
    return base_prefs
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def prompt_partition_api_settings(console: Console, prefs: PdfPartitionPreferences) -> None:
    """
    Ask for the API partition settings and store the answers on ``prefs``.

    Question order: OCR language, image extraction, table extraction, then the
    chunking pair (pages per chunk, max concurrent). Every question defaults to
    the value currently held by ``prefs``; ``prefs`` is updated in place.
    """
    prefs.languages = prompt_unstructured_ocr_language(console, default=prefs.languages)

    yes_no_questions = (
        ("extract_images", "Extract images from PDF?"),
        ("extract_tables", "Extract tables from PDF?"),
    )
    for attr, question in yes_no_questions:
        setattr(prefs, attr, Confirm.ask(question, default=getattr(prefs, attr)))

    prefs.pages_per_chunk, prefs.max_concurrent = prompt_partition_chunking(
        console,
        default_pages=prefs.pages_per_chunk,
        default_concurrent=prefs.max_concurrent,
    )
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def apply_coded_prefs(
    prefs: PdfPartitionPreferences,
    coded: object,
    xml_mode: str,
    xml_custom: dict[str, bool] | None,
) -> None:
    """
    Copy the coded-format choices onto ``prefs`` (mutated in place).

    The two wrapper flags are read from ``coded`` and coerced to ``bool``; a
    flag missing from ``coded`` counts as ``True``. ``xml_mode`` and
    ``xml_custom`` are stored unchanged.
    """
    page_flag = getattr(coded, "add_page_xml", True)
    group_flag = getattr(coded, "add_group_by_title_xml", True)

    prefs.chunksmith_coded_add_page_xml = bool(page_flag)
    prefs.chunksmith_coded_add_group_by_title_xml = bool(group_flag)
    prefs.chunksmith_coded_element_xml_mode = xml_mode
    prefs.chunksmith_coded_element_xml_custom = xml_custom
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def apply_indexer_prefs(
    prefs: PdfPartitionPreferences,
    indexer_options: object,
    *,
    summarize_outline_nodes: bool | None = None,
) -> None:
    """
    Copy the indexer options onto ``prefs`` (mutated in place).

    * ``add_anchor``, ``use_embedded_pdf_toc``, ``off_llm_parsing`` and
      ``add_summary`` are always copied, coerced to ``bool``; when the option
      is missing the fallbacks are ``False``, ``True``, ``False`` and ``True``.
    * The two probe flags are only filled in when ``prefs`` currently holds
      ``None`` for them, so a value already set on ``prefs`` is kept.
    * ``summarize_outline_nodes`` is stored only when it is not ``None``.
    * ``generate_doc_summary`` is stored only when the option exists and is
      not ``None``.
    """

    def _flag(option: str, fallback: bool) -> bool:
        return bool(getattr(indexer_options, option, fallback))

    prefs.chunksmith_pageindex_add_anchor = _flag("add_anchor", False)
    prefs.chunksmith_pageindex_use_embedded_pdf_toc = _flag("use_embedded_pdf_toc", True)
    prefs.chunksmith_pageindex_off_llm_parsing = _flag("off_llm_parsing", False)

    # Probe settings already chosen on prefs take precedence over the options.
    for probe in ("probe_first_excerpt", "probe_merged_reflection"):
        pref_name = f"chunksmith_{probe}"
        if getattr(prefs, pref_name) is None:
            setattr(prefs, pref_name, _flag(probe, False))

    if summarize_outline_nodes is not None:
        prefs.summarize_outline_nodes = summarize_outline_nodes

    prefs.chunksmith_pageindex_add_outline_summaries = _flag("add_summary", True)

    doc_summary = getattr(indexer_options, "generate_doc_summary", None)
    if doc_summary is not None:
        prefs.chunksmith_pageindex_generate_doc_summary = bool(doc_summary)
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def prompt_indexer_options(console: Console, IndexerOptions: type, default_factory) -> object:
    """
    Ask a yes/no question for each boolean indexer option.

    Each question defaults to the matching attribute of ``default_factory()``.
    Questions are asked in the order listed below.

    Returns ``IndexerOptions`` constructed from the seven answers.
    """
    defaults = default_factory()
    console.print(
        "\n[bold cyan]Step — Page index (LLM) options[/bold cyan]\n"
        "[dim]Model and token limits come from ``.env`` (``PAGEINDEX_MODEL``, Azure vars, etc.).[/dim]\n"
    )

    # (option name, question text) — listed in the order the user sees them.
    questions = (
        ("add_summary", "Add per-section summaries"),
        ("add_anchor", "Add anchor tags in outline"),
        ("generate_doc_summary", "Generate document description"),
        ("use_embedded_pdf_toc", "Use embedded PDF table of contents"),
        ("off_llm_parsing", "TOC-only (skip LLM outline parsing)"),
        ("probe_first_excerpt", "Probe first excerpt (extra LLM call)"),
        ("probe_merged_reflection", "Probe merged-outline reflection"),
    )
    answers = {
        option: Confirm.ask(f"{question}", default=bool(getattr(defaults, option)))
        for option, question in questions
    }
    return IndexerOptions(**answers)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def prompt_artifact_root(console: Console, pdf_path: Path) -> Path | None:
    """
    Ask for an optional artifact folder.

    Returns ``None`` when the answer is blank. Otherwise the answer is passed
    through ``normalize_path_string`` and ``resolve_pdf_candidate``; an
    existing directory is returned resolved, and a path that is not yet a
    directory is created (including parents) and then returned resolved.

    ``pdf_path`` is accepted but not used by this function.
    """
    console.print(
        "\n[bold]Artifact folder[/bold]\n"
        " [dim]Optional run folder under ``cli/data/runs/{pdf-stem}/`` "
        "(json/, text/, etc.). Enter = save flat under ``cli/data/legacy`` only.[/dim]\n"
    )
    answer = Prompt.ask(
        "Artifact root [dim](Enter for legacy flat save)[/dim]",
        default="",
        show_default=False,
    ).strip()
    if not answer:
        return None

    candidate = resolve_pdf_candidate(normalize_path_string(answer))
    if not candidate.is_dir():
        if not candidate.is_absolute():
            # Second chance: interpret a relative answer against the working directory.
            from_cwd = (Path.cwd() / candidate).resolve()
            if from_cwd.is_dir():
                return from_cwd
        candidate.mkdir(parents=True, exist_ok=True)
    return candidate.resolve()
|
|
367
|
+
|
|
368
|
+
|
|
369
|
+
def prompt_runs_artifact_default(console: Console, pdf_path: Path) -> Path:
    """
    Offer ``cli_runs_dir() / <pdf stem>`` as the artifact root.

    The default folder is created before the question is asked. If the user
    declines it, ``prompt_artifact_root`` is consulted; a blank answer there
    falls back to the default folder.

    Returns the resolved folder to use for this run.
    """
    root = cli_runs_dir() / pdf_path.stem
    root.mkdir(parents=True, exist_ok=True)
    console.print(f"[dim]Default artifact root:[/dim] [cyan]{root}[/cyan]")

    if not Confirm.ask("Use this folder for this run?", default=True):
        override = prompt_artifact_root(console, pdf_path)
        if override:
            return override
    return root.resolve()
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""PyPI library bindings for the multi-indexing CLI wizard."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any, Literal
|
|
7
|
+
|
|
8
|
+
from chunksmith_multimodal import MAPPING_METHODS, IndexerOptions, run_multi_indexing_with_preferences
|
|
9
|
+
from chunksmith_multimodal.coded_formate import CodedFormateOptions, ELEMENT_XML_TYPES
|
|
10
|
+
from chunksmith_multimodal.config.config import (
|
|
11
|
+
default_coded_formate_options,
|
|
12
|
+
default_group_by_title_prefs,
|
|
13
|
+
default_indexer_options,
|
|
14
|
+
)
|
|
15
|
+
from chunksmith_multimodal.helpers import GroupByTitlePrefs
|
|
16
|
+
from chunksmith_multimodal.unstructured import preflight_api_partition
|
|
17
|
+
|
|
18
|
+
# Encoding names accepted by ``load_multi_indexing`` and stored in
# ``MultiIndexingModules.encoding``.
Encoding = Literal["json", "toon"]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(frozen=True)
class MultiIndexingModules:
    """
    Immutable bundle of the ``chunksmith_multimodal`` names used by the wizard.

    Only ``encoding`` must be supplied; every other field defaults to the
    object of the same name imported at the top of this module, so callers can
    reach the library through one object instead of importing each name.
    """

    # Requested encoding ("json" or "toon"); the only field without a default.
    encoding: Encoding
    # Name of the backing library package.
    package: str = "chunksmith_multimodal"
    # Options class for the page indexer.
    IndexerOptions: type = IndexerOptions
    # Zero-argument factories that produce the library's default option objects.
    default_indexer_options: Any = default_indexer_options
    default_coded_formate_options: Any = default_coded_formate_options
    default_group_by_title_prefs: Any = default_group_by_title_prefs
    # Options classes for coded-format markup and group-by-title chunking.
    CodedFormateOptions: type = CodedFormateOptions
    GroupByTitlePrefs: type = GroupByTitlePrefs
    # Element type names that can receive an XML wrapper.
    ELEMENT_XML_TYPES: tuple[str, ...] = ELEMENT_XML_TYPES
    # Callable returning ``(ok, message)`` — as consumed by the wizard's preflight step.
    preflight_api_partition: Any = preflight_api_partition
    # Library mapping-method names, coerced to a frozenset.
    MAPPING_METHODS: frozenset[str] = frozenset(MAPPING_METHODS)
    # Library entry point that runs multi-indexing from stored preferences.
    run_multi_indexing_with_preferences: Any = run_multi_indexing_with_preferences
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def load_multi_indexing(encoding: Encoding) -> MultiIndexingModules:
    """Return the library bindings for ``encoding``; all other fields keep their defaults."""
    bindings = MultiIndexingModules(encoding=encoding)
    return bindings
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Persist pickle, PNG, TOON, and slim JSON for multi-indexing CLI runs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import base64
|
|
6
|
+
import copy
|
|
7
|
+
import json
|
|
8
|
+
import pickle
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from chunksmith_cli.core.artifact_layout import ArtifactLayout, ensure_artifact_layout
|
|
13
|
+
from chunksmith_cli.core.media_preview import export_media_images_to_folder
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _element_image_base64(el: dict[str, Any]) -> str | None:
    """
    Return the element's ``image_base64`` string, or ``None`` when it has none.

    ``el["metadata"]["image_base64"]`` is preferred over the top-level
    ``el["image_base64"]``. Only ``str`` values count; an empty string found in
    the metadata is still returned as-is.
    """
    candidates = []
    metadata = el.get("metadata")
    if isinstance(metadata, dict):
        candidates.append(metadata.get("image_base64"))
    candidates.append(el.get("image_base64"))
    return next((value for value in candidates if isinstance(value, str)), None)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def export_element_images_to_folder(
    elements: list[dict[str, Any]],
    dest_dir: Path,
    *,
    max_files: int = 10_000,
) -> list[Path]:
    """
    Write ``elem_{i}__p{page}__img0.png`` from Unstructured ``image_base64`` fields.

    ``dest_dir`` is created if needed. ``i`` is the element's position in
    ``elements`` (zero-padded to four digits) and ``page`` is its
    ``page_number`` (metadata first, then top level; ``0`` when missing or not
    convertible to ``int``).

    Export is best-effort: non-dict entries, elements without image data,
    undecodable payloads, payloads that decode to zero bytes, and files that
    cannot be written are skipped silently. At most ``max_files`` files are
    written.

    Returns the paths written, in element order.
    """
    dest_dir.mkdir(parents=True, exist_ok=True)
    written: list[Path] = []
    for i, el in enumerate(elements):
        if len(written) >= max_files:
            break
        if not isinstance(el, dict):
            continue
        b64 = _element_image_base64(el)
        if not b64 or not b64.strip():
            continue
        try:
            raw = base64.b64decode(b64, validate=False)
        except ValueError:
            # Covers binascii.Error (bad padding; a ValueError subclass) and
            # the ValueError raised for non-ASCII text.
            continue
        if not raw:
            # Non-strict decoding drops invalid characters, so junk input can
            # decode to nothing; do not emit an empty image file for it.
            continue

        md = el.get("metadata")
        if not isinstance(md, dict):
            md = {}
        page = md.get("page_number", el.get("page_number", 0))
        try:
            page_num = int(page) if page is not None else 0
        except (TypeError, ValueError, OverflowError):
            page_num = 0

        out = dest_dir / f"elem_{i:04d}__p{page_num}__img0.png"
        try:
            out.write_bytes(raw)
        except OSError:
            # Best-effort export: one unwritable file must not abort the run.
            continue
        written.append(out)
    return written
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def unstructured_json_without_base64(elements: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """
    Return deep copies of the dict elements with ``image_base64`` removed.

    The key is dropped both at the top level and inside a dict-valued
    ``metadata``. Entries that are not dicts are omitted. The input list and
    its elements are left untouched.
    """

    def _without_payload(element: dict[str, Any]) -> dict[str, Any]:
        clone = copy.deepcopy(element)
        clone.pop("image_base64", None)
        metadata = clone.get("metadata")
        if isinstance(metadata, dict):
            clone["metadata"] = {
                key: value for key, value in metadata.items() if key != "image_base64"
            }
        return clone

    return [_without_payload(el) for el in elements if isinstance(el, dict)]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def save_raw_elements_pickle(elements: list[dict[str, Any]], path: Path) -> Path:
    """
    Pickle ``elements`` to ``path`` using the highest pickle protocol.

    Missing parent directories are created first. Returns the resolved path.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("wb") as handle:
        pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL).dump(elements)
    return path.resolve()
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def save_toon_artifacts(
    layout: ArtifactLayout,
    *,
    stamp: str,
    title_outline: dict[str, Any],
) -> dict[str, Path]:
    """
    Write the TOON-related parts of ``title_outline`` into ``layout.text_dir``.

    * ``flat_rows_toon`` / ``structure_toon`` — written as
      ``{pdf_stem}_{stamp}_flat_rows.toon`` / ``..._structure.toon`` when the
      value is a non-blank string.
    * ``outline_thinking["llm_toon_trace"]`` — written as
      ``{pdf_stem}_{stamp}_llm_toon_trace.json`` when it is a non-empty list.

    Returns a mapping from the artifact key (``flat_rows_toon``,
    ``structure_toon``, ``llm_toon_trace``) to the resolved path of each file
    actually written; keys for skipped artifacts are absent.
    """

    def _target(tail: str) -> Path:
        return layout.text_dir / f"{layout.pdf_stem}_{stamp}_{tail}"

    saved: dict[str, Path] = {}

    for key, tail in (("flat_rows_toon", "flat_rows.toon"), ("structure_toon", "structure.toon")):
        body = title_outline.get(key)
        if not isinstance(body, str) or not body.strip():
            continue
        destination = _target(tail)
        destination.write_text(body, encoding="utf-8")
        saved[key] = destination.resolve()

    thinking = title_outline.get("outline_thinking")
    llm_trace = thinking.get("llm_toon_trace") if isinstance(thinking, dict) else None
    if isinstance(llm_trace, list) and llm_trace:
        destination = _target("llm_toon_trace.json")
        destination.write_text(
            json.dumps(llm_trace, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        saved["llm_toon_trace"] = destination.resolve()

    return saved
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _element_index_from_png_name(name: str) -> int | None:
    """Recover ``i`` from an ``elem_{i}__p{page}__img0.png`` file name, or ``None``."""
    head, sep, _rest = name.partition("__")
    if not sep or not head.startswith("elem_"):
        return None
    try:
        return int(head[len("elem_"):])
    except ValueError:
        return None


def persist_multi_indexing_artifacts(
    *,
    artifact_root: Path,
    pdf_stem: str,
    stamp: str,
    elements: list[dict[str, Any]] | None,
    title_coded_formate: str | None,
    title_outline: dict[str, Any] | None,
    media_by_node: dict[str, Any] | None = None,
) -> dict[str, Path]:
    """
    Write pickle, PNG, TOON, and coded text under ``artifact_root``.

    Steps, each skipped when its input is empty or missing:

    * ``elements`` — pickled as-is; embedded images exported as PNG files; a
      slim JSON copy (``image_base64`` stripped) written, where each row whose
      element produced a PNG gains an ``image_path`` pointing at that file.
    * ``title_coded_formate`` — written to
      ``text/{pdf_stem}_{stamp}_title_coded_formate.txt`` when non-blank.
    * ``title_outline`` — TOON artifacts written via ``save_toon_artifacts``.
    * ``media_by_node`` — images exported into the same image directory.

    Per the original note, the index JSON files are written by
    ``cli.core.paths.archive_multi_indexing_cli_outputs`` (not visible from
    this module).

    Returns a mapping of artifact key to path; ``artifact_root`` is always present.
    """
    layout = ensure_artifact_layout(artifact_root, pdf_stem)
    paths: dict[str, Path] = {"artifact_root": layout.root}

    if elements:
        pkl = save_raw_elements_pickle(elements, layout.pickle_path(stamp=stamp))
        paths["pickle"] = pkl
        pngs = export_element_images_to_folder(elements, layout.image_dir)
        if pngs:
            paths["image_dir"] = layout.image_dir

        # ``pngs`` only contains entries for elements that carried an image, so
        # pairing pngs[i] with row i would label the wrong rows. Match each PNG
        # to its element through the index encoded in its file name instead.
        png_by_element: dict[int, Path] = {}
        for png in pngs:
            element_index = _element_index_from_png_name(png.name)
            if element_index is not None:
                png_by_element[element_index] = png

        # The slim list keeps dict elements only, in their original order, so
        # these indices line up one-to-one with its rows.
        dict_indices = [i for i, el in enumerate(elements) if isinstance(el, dict)]
        slim = unstructured_json_without_base64(elements)
        for element_index, row in zip(dict_indices, slim):
            png = png_by_element.get(element_index)
            if png is not None:
                row["image_path"] = str(png.resolve())

        uj = layout.unstructured_json_path(stamp=stamp)
        uj.write_text(json.dumps(slim, ensure_ascii=False, indent=2), encoding="utf-8")
        paths["unstructured_elements"] = uj.resolve()

    if isinstance(title_coded_formate, str) and title_coded_formate.strip():
        coded_path = layout.text_dir / f"{layout.pdf_stem}_{stamp}_title_coded_formate.txt"
        coded_path.write_text(title_coded_formate, encoding="utf-8")
        paths["title_coded_formate"] = coded_path.resolve()

    if isinstance(title_outline, dict):
        paths.update(save_toon_artifacts(layout, stamp=stamp, title_outline=title_outline))

    if isinstance(media_by_node, dict) and media_by_node:
        extra = export_media_images_to_folder(media_by_node, layout.image_dir, max_files=10_000)
        if extra:
            paths["image_dir"] = layout.image_dir

    return paths
|