chunksmith-cli 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunksmith_cli/__init__.py +3 -0
- chunksmith_cli/__main__.py +41 -0
- chunksmith_cli/agent/__init__.py +1 -0
- chunksmith_cli/agent/agent_display.py +160 -0
- chunksmith_cli/agent/agent_session.py +294 -0
- chunksmith_cli/agent/agent_stream.py +152 -0
- chunksmith_cli/agent/agent_wizard.py +5 -0
- chunksmith_cli/agent_display.py +6 -0
- chunksmith_cli/agent_session.py +6 -0
- chunksmith_cli/agent_stream.py +6 -0
- chunksmith_cli/agent_wizard.py +6 -0
- chunksmith_cli/assets/chunksmith_logo.png +0 -0
- chunksmith_cli/branding.py +6 -0
- chunksmith_cli/config.py +6 -0
- chunksmith_cli/core/__init__.py +21 -0
- chunksmith_cli/core/artifact_layout.py +60 -0
- chunksmith_cli/core/branding.py +137 -0
- chunksmith_cli/core/config.py +32 -0
- chunksmith_cli/core/media_preview.py +72 -0
- chunksmith_cli/core/menu.py +110 -0
- chunksmith_cli/core/menus.py +55 -0
- chunksmith_cli/core/panels.py +24 -0
- chunksmith_cli/core/paths.py +869 -0
- chunksmith_cli/core/prefs_mapper.py +97 -0
- chunksmith_cli/core/saved_catalog.py +180 -0
- chunksmith_cli/core/theme.py +38 -0
- chunksmith_cli/elements_json_prompt.py +6 -0
- chunksmith_cli/json_view.py +6 -0
- chunksmith_cli/media_preview.py +6 -0
- chunksmith_cli/menu.py +6 -0
- chunksmith_cli/menus.py +55 -0
- chunksmith_cli/multi_indexing_wizard.py +3 -0
- chunksmith_cli/outline_browser.py +6 -0
- chunksmith_cli/panels.py +6 -0
- chunksmith_cli/partition_prefs.py +74 -0
- chunksmith_cli/paths.py +6 -0
- chunksmith_cli/pdf_prompt.py +6 -0
- chunksmith_cli/pipelines/__init__.py +1 -0
- chunksmith_cli/pipelines/mapping_validation.py +31 -0
- chunksmith_cli/pipelines/multi_indexing_config.py +35 -0
- chunksmith_cli/pipelines/multi_indexing_prompts.py +375 -0
- chunksmith_cli/pipelines/multi_indexing_runtime.py +38 -0
- chunksmith_cli/pipelines/multi_indexing_storage.py +157 -0
- chunksmith_cli/pipelines/multi_indexing_wizard.py +218 -0
- chunksmith_cli/pipelines/pageindex_wizard.py +140 -0
- chunksmith_cli/pipelines/run_multi.py +21 -0
- chunksmith_cli/prompts/__init__.py +1 -0
- chunksmith_cli/prompts/elements_json_prompt.py +49 -0
- chunksmith_cli/prompts/pdf_prompt.py +37 -0
- chunksmith_cli/saved_catalog.py +6 -0
- chunksmith_cli/theme.py +6 -0
- chunksmith_cli/tree_view.py +6 -0
- chunksmith_cli/view_session.py +6 -0
- chunksmith_cli/views/__init__.py +1 -0
- chunksmith_cli/views/json_view.py +11 -0
- chunksmith_cli/views/outline_browser.py +247 -0
- chunksmith_cli/views/tree_view.py +59 -0
- chunksmith_cli/views/view_session.py +32 -0
- chunksmith_cli/wizard.py +357 -0
- chunksmith_cli-0.4.0.dist-info/METADATA +61 -0
- chunksmith_cli-0.4.0.dist-info/RECORD +65 -0
- chunksmith_cli-0.4.0.dist-info/WHEEL +5 -0
- chunksmith_cli-0.4.0.dist-info/entry_points.txt +2 -0
- chunksmith_cli-0.4.0.dist-info/licenses/LICENSE.vectify +21 -0
- chunksmith_cli-0.4.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
"""Unified wizard for multi-indexing (JSON or TOON via ``llm_context_format``)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Literal
|
|
9
|
+
|
|
10
|
+
from chunksmith_core.preferences import MultiIndexingPreferences
|
|
11
|
+
from rich.console import Console
|
|
12
|
+
from rich.prompt import Confirm, Prompt
|
|
13
|
+
from rich.table import Table
|
|
14
|
+
|
|
15
|
+
from chunksmith_cli.partition_prefs import PdfPartitionPreferences
|
|
16
|
+
from chunksmith_cli.pipelines.multi_indexing_config import build_multi_indexing_preferences
|
|
17
|
+
from chunksmith_cli.pipelines.multi_indexing_prompts import (
|
|
18
|
+
apply_coded_prefs,
|
|
19
|
+
apply_indexer_prefs,
|
|
20
|
+
preflight_unstructured_api,
|
|
21
|
+
prompt_artifact_root,
|
|
22
|
+
prompt_coded_formate_options,
|
|
23
|
+
prompt_element_source,
|
|
24
|
+
prompt_group_by_title_prefs,
|
|
25
|
+
prompt_indexer_options,
|
|
26
|
+
prompt_mapping_method,
|
|
27
|
+
prompt_partition_api_settings,
|
|
28
|
+
prompt_partition_prefs_base,
|
|
29
|
+
prompt_runs_artifact_default,
|
|
30
|
+
)
|
|
31
|
+
from chunksmith_cli.pipelines.multi_indexing_runtime import Encoding, load_multi_indexing
|
|
32
|
+
from chunksmith_cli.prompts.elements_json_prompt import prompt_for_elements_json
|
|
33
|
+
from chunksmith_cli.prompts.pdf_prompt import prompt_for_existing_pdf
|
|
34
|
+
from chunksmith_cli.core.paths import ensure_cli_storage
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class MultiIndexingCliInputs:
    """Everything the multi-indexing wizard collected, bundled for the run step."""

    # LLM context format for the run; the wizard prompt yields "json" or "toon".
    encoding: Encoding
    # PDF chosen at the "Step 1 — PDF" prompt.
    pdf_path: Path
    # Value returned by ``ensure_cli_storage()``.
    storage_root: Path
    # Artifact directory; a falsy value is summarised as "(legacy flat save)".
    artifact_root: Path | None
    # Preferences object produced by ``build_multi_indexing_preferences``.
    mi_prefs: MultiIndexingPreferences
    # Partition/grouping/indexer preferences edited by the wizard prompts.
    prefs: PdfPartitionPreferences
    # Object returned by ``load_multi_indexing(encoding)``.
    mods: Any
    # Answer to "Show pipeline progress events?".
    stream_events: bool
    # Answer to "Stream LLM tokens to this terminal?".
    stream_llm_tokens: bool
    # True only for TOON runs where the user asked for an outline snippet afterwards.
    show_toon_after: bool
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _prompt_llm_context_format(console: Console) -> Literal["json", "toon"]:
    """Print the format menu and return the LLM context format the user picks.

    Choice ``1`` (the default) maps to ``"json"``; choice ``2`` maps to ``"toon"``.
    """
    menu = (
        "\n[bold]LLM context format[/bold]\n"
        " [cyan]1[/cyan] — [bold]JSON[/bold] (default)\n"
        " [cyan]2[/cyan] — [bold]TOON[/bold] (compact LLM context)\n"
    )
    console.print(menu)

    selected = Prompt.ask("Format", choices=["1", "2"], default="1", show_choices=True)
    if selected == "2":
        return "toon"
    return "json"
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _review_table(rows: list[tuple[str, str]]) -> Table:
    """Build the two-column "Run summary" table shown before a run starts.

    Args:
        rows: ``(option, value)`` pairs, added to the table in the given order.

    Returns:
        A rounded-box Rich ``Table`` with "Option" and "Value" columns.
    """
    from rich import box
    from rich.markup import escape

    t = Table(title="Run summary", box=box.ROUNDED, header_style="bold magenta")
    t.add_column("Option", style="cyan", no_wrap=True)
    t.add_column("Value", style="white")
    for k, v in rows:
        # Values are user data (paths, env vars), not markup. Escape strings so
        # a bracketed fragment such as "[draft]" is displayed literally instead
        # of being parsed as a Rich style tag. Non-string cells pass through.
        t.add_row(k, escape(v) if isinstance(v, str) else v)
    return t
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def gather_multi_indexing_inputs(
    console: Console,
    *,
    encoding: Encoding | None = None,
) -> tuple[int, MultiIndexingCliInputs | None]:
    """Walk the user through the multi-indexing wizard and collect run inputs.

    Args:
        console: Rich console used for all headings, messages and the summary.
        encoding: LLM context format; when ``None`` the user is asked to choose.

    Returns:
        ``(status, inputs)`` where ``inputs`` is ``None`` unless the user
        confirmed the run:

        * ``(0, None)`` — the user cancelled (no PDF, no elements JSON, or
          declined "Start run").
        * ``(1, None)`` — the Unstructured API preflight failed, or building the
          multi-indexing preferences raised ``ValueError``.
        * ``(0, MultiIndexingCliInputs)`` — everything was collected and confirmed.
    """
    from rich.markup import escape

    if encoding is None:
        encoding = _prompt_llm_context_format(console)

    mods = load_multi_indexing(encoding)
    label = "TOON" if encoding == "toon" else "JSON"
    console.print(
        f"\n[bold]Multi-indexing ({label} LLM context)[/bold]\n"
        f"[dim]Library:[/dim] [cyan]{mods.package}[/cyan]\n"
        "[dim]partition → group → coded → page index → mapper[/dim]\n"
    )

    prefs = prompt_partition_prefs_base(console)

    console.print("[bold cyan]Step 1 — PDF[/bold cyan]\n")
    pdf_path = prompt_for_existing_pdf(console)
    if pdf_path is None:
        console.print("[dim]Cancelled.[/dim]")
        return 0, None

    # NOTE(review): no "Step 2" heading is printed here; presumably
    # prompt_element_source prints it — confirm.
    mode, partition = prompt_element_source(console)
    elements_json_path: Path | None = None
    element_label: str

    if mode == "json":
        # Pre-partitioned elements: ask for the JSON file instead of calling the API.
        elements_json_path = prompt_for_elements_json(console, show_banner=False)
        if elements_json_path is None:
            console.print("[dim]Cancelled.[/dim]")
            return 0, None
        element_label = f"JSON: {elements_json_path}"
        artifact_root = prompt_artifact_root(console, pdf_path)
    else:
        # Any other mode partitions via the Unstructured API, so check it first.
        if not preflight_unstructured_api(console, mods.preflight_api_partition):
            return 1, None
        prompt_partition_api_settings(console, prefs)
        element_label = "Unstructured API partition"
        if Confirm.ask("Save under chunksmith_cli/data/runs/{stem}/?", default=True):
            artifact_root = prompt_runs_artifact_default(console, pdf_path)
        else:
            artifact_root = prompt_artifact_root(console, pdf_path)

    console.print("\n[bold cyan]Step 3 — Grouping & coded markup[/bold cyan]\n")
    prefs.chunksmith_use_group_by_title = Confirm.ask(
        "Group elements by title before coding?", default=prefs.chunksmith_use_group_by_title
    )
    # The chunk-limit prompt is only offered when grouping is enabled.
    if prefs.chunksmith_use_group_by_title and Confirm.ask(
        "[bold]Customize group-by-title chunk limits?[/bold]", default=False
    ):
        prompt_group_by_title_prefs(console, mods.GroupByTitlePrefs, mods.default_group_by_title_prefs, prefs)

    coded, xml_mode, xml_custom = prompt_coded_formate_options(
        console,
        mods.CodedFormateOptions,
        mods.default_coded_formate_options,
        mods.ELEMENT_XML_TYPES,
        use_group_by_title=prefs.chunksmith_use_group_by_title,
    )
    apply_coded_prefs(prefs, coded, xml_mode, xml_custom)

    console.print("\n[bold cyan]Step 4 — Page index & mapper[/bold cyan]\n")
    indexer_options = prompt_indexer_options(console, mods.IndexerOptions, mods.default_indexer_options)
    prefs.summarize_outline_nodes = Confirm.ask(
        "[bold]Summarize outline nodes[/bold] [dim](MVL post-step; CLI saves outline only)[/dim]",
        default=prefs.summarize_outline_nodes,
    )
    apply_indexer_prefs(prefs, indexer_options, summarize_outline_nodes=prefs.summarize_outline_nodes)

    mapping_method = prompt_mapping_method(
        console,
        mapping_methods=mods.MAPPING_METHODS,
        use_group_by_title=prefs.chunksmith_use_group_by_title,
        add_anchor=prefs.chunksmith_pageindex_add_anchor,
    )
    prefs.chunksmith_mapping_method = mapping_method
    prefs.chunksmith_include_empty_mapper_fields = Confirm.ask(
        "Include empty mapper fields?", default=prefs.chunksmith_include_empty_mapper_fields
    )

    storage_root = ensure_cli_storage()
    stream_events = Confirm.ask("[bold]Show pipeline progress events?[/bold]", default=True)
    stream_llm_tokens = Confirm.ask("[bold]Stream LLM tokens[/bold] to this terminal?", default=True)
    # Short-circuit: the TOON snippet question is only asked for TOON runs.
    show_toon_after = encoding == "toon" and Confirm.ask(
        "[bold]After run, print TOON outline snippet?[/bold]", default=False
    )

    try:
        mi_prefs = build_multi_indexing_preferences(
            mods,
            prefs,
            llm_context_format=encoding,
            pdf_path=pdf_path,
            elements_json_path=elements_json_path,
            partition_pdf=partition,
            stream_llm_tokens=stream_llm_tokens,
        )
    except ValueError as e:
        # The exception text is data, not markup: escape it so bracketed
        # fragments in the message are printed instead of parsed as style tags.
        console.print(f"[bold red]Config error:[/bold red] {escape(str(e))}")
        return 1, None

    summary_rows = [
        ("LLM context", label),
        ("Library", mods.package),
        ("PDF", str(pdf_path)),
        ("Elements", element_label),
        ("Group by title", str(prefs.chunksmith_use_group_by_title)),
        ("Mapping", mapping_method),
        ("element_xml mode", prefs.chunksmith_coded_element_xml_mode),
        ("add_page_xml", str(prefs.chunksmith_coded_add_page_xml)),
        ("add_group_by_title_xml", str(prefs.chunksmith_coded_add_group_by_title_xml)),
        # Partition settings only apply when the PDF is partitioned in this run.
        ("Partition pages/chunk", str(prefs.pages_per_chunk) if partition else "n/a"),
        ("Max concurrent", str(prefs.max_concurrent) if partition else "n/a"),
        ("Extract images", str(prefs.extract_images) if partition else "n/a"),
        ("Extract tables", str(prefs.extract_tables) if partition else "n/a"),
        # NOTE(review): unlike its neighbours this value is not wrapped in str();
        # confirm prefs.languages is already a string.
        ("Languages", prefs.languages if partition else "n/a"),
        ("add_summary", str(prefs.effective_pageindex_add_outline_summaries())),
        ("add_anchor", str(prefs.chunksmith_pageindex_add_anchor)),
        ("probe_first_excerpt", str(prefs.chunksmith_probe_first_excerpt)),
        ("probe_merged_reflection", str(prefs.chunksmith_probe_merged_reflection)),
        ("Artifact root", str(artifact_root) if artifact_root else "(legacy flat save)"),
        ("Storage", str(storage_root.resolve())),
        ("PAGEINDEX_MODEL", os.getenv("PAGEINDEX_MODEL", "(default)")),
        ("Stream LLM tokens", str(stream_llm_tokens)),
        ("Stream pipeline events", str(stream_events)),
    ]
    console.print()
    console.print(_review_table(summary_rows))

    if not Confirm.ask("\n[bold]Start run[/bold]?", default=True):
        console.print("[dim]Cancelled.[/dim]")
        return 0, None

    return 0, MultiIndexingCliInputs(
        encoding=encoding,
        pdf_path=pdf_path,
        storage_root=storage_root,
        artifact_root=artifact_root,
        mi_prefs=mi_prefs,
        prefs=prefs,
        mods=mods,
        stream_events=stream_events,
        stream_llm_tokens=stream_llm_tokens,
        show_toon_after=show_toon_after,
    )
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""Interactive wizard for PageIndexer (PDF parser + LLM outline)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from chunksmith_pageindex import load_settings
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
from rich.prompt import Confirm, Prompt
|
|
12
|
+
from rich.table import Table
|
|
13
|
+
|
|
14
|
+
from chunksmith_cli.core.paths import ensure_cli_storage
|
|
15
|
+
from chunksmith_cli.partition_prefs import PdfPartitionPreferences
|
|
16
|
+
from chunksmith_cli.pipelines.multi_indexing_prompts import prompt_artifact_root, prompt_runs_artifact_default
|
|
17
|
+
from chunksmith_cli.prompts.pdf_prompt import prompt_for_existing_pdf
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class PageIndexCliInputs:
    """Everything the PageIndexer wizard collected, bundled for the run step."""

    # PDF chosen at the "Step 1 — PDF" prompt.
    pdf_path: Path
    # Value returned by ``ensure_cli_storage()``.
    storage_root: Path
    # Artifact directory; a falsy value is summarised as "(legacy flat save)".
    artifact_root: Path | None
    # Object returned by ``chunksmith_pageindex.load_settings``.
    settings: object
    # Answer to "Per-section summaries".
    add_summary: bool
    # Answer to "Split document anchors".
    add_word_range: bool
    # Answer to "Attach section text to nodes".
    add_text: bool
    # Answer to "Assign node IDs".
    assign_node_ids: bool
    # ``True`` when the user opted in; ``None`` leaves the setting to ``settings``.
    generate_doc_summary: bool | None
    # Parser picked at the "PDF parser" prompt ("PyPDF2" or "PyMuPDF").
    pdf_parser: str | None
    # Answer to "Show progress messages?".
    stream_progress: bool
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _review_table(rows: list[tuple[str, str]]) -> Table:
    """Build the two-column "PageIndexer run summary" table shown before a run.

    Args:
        rows: ``(option, value)`` pairs, added to the table in the given order.

    Returns:
        A rounded-box Rich ``Table`` with "Option" and "Value" columns.
    """
    from rich import box
    from rich.markup import escape

    t = Table(title="PageIndexer run summary", box=box.ROUNDED, header_style="bold magenta")
    t.add_column("Option", style="cyan", no_wrap=True)
    t.add_column("Value", style="white")
    for k, v in rows:
        # Values are user data (paths, env vars), not markup. Escape strings so
        # a bracketed fragment such as "[draft]" is displayed literally instead
        # of being parsed as a Rich style tag. Non-string cells pass through.
        t.add_row(k, escape(v) if isinstance(v, str) else v)
    return t
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def gather_pageindex_inputs(console: Console) -> tuple[int, PageIndexCliInputs | None]:
    """Walk the user through the PageIndexer wizard and collect run inputs.

    Args:
        console: Rich console used for all headings, messages and the summary.

    Returns:
        ``(status, inputs)`` where ``inputs`` is ``None`` unless the user
        confirmed the run:

        * ``(0, None)`` — the user cancelled (no PDF, or declined "Start run").
        * ``(1, None)`` — ``load_settings`` raised ``ValueError``.
        * ``(0, PageIndexCliInputs)`` — everything was collected and confirmed.
    """
    from rich.markup import escape

    console.print(
        "\n[bold]PageIndexer[/bold]\n"
        "[dim]Library:[/dim] [cyan]chunksmith-pageindex[/cyan]\n"
        "[dim]PDF parser → tagged pages → LLM outline (no Unstructured partition)[/dim]\n"
    )

    console.print("[bold cyan]Step 1 — PDF[/bold cyan]\n")
    pdf_path = prompt_for_existing_pdf(console)
    if pdf_path is None:
        console.print("[dim]Cancelled.[/dim]")
        return 0, None

    console.print("\n[bold cyan]Step 2 — Outline options[/bold cyan]\n")
    # A fresh preferences object is used only as the source of prompt defaults.
    prefs = PdfPartitionPreferences()
    add_summary = Confirm.ask(
        "[bold]Per-section summaries[/bold] [dim](add_summary)[/dim]",
        default=prefs.effective_pageindex_add_outline_summaries(),
    )
    add_word_range = Confirm.ask(
        "[bold]Split document anchors[/bold] [dim](add_word_range / anchors)[/dim]",
        default=prefs.chunksmith_pageindex_add_anchor,
    )
    add_text = Confirm.ask(
        "[bold]Attach section text to nodes[/bold] [dim](add_text)[/dim]",
        default=prefs.chunksmith_pageindex_add_text,
    )
    assign_node_ids = Confirm.ask(
        "[bold]Assign node IDs[/bold] [dim](for retriever / agent)[/dim]",
        default=prefs.chunksmith_pageindex_assign_node_ids,
    )
    # None means "no override": only an explicit yes is forwarded to load_settings.
    gen_doc: bool | None = None
    if Confirm.ask("[bold]Generate document description?[/bold] [dim](extra LLM call)[/dim]", default=False):
        gen_doc = True

    # An unrecognised CHUNKSMITH_PDF_PARSER value falls back to "PyPDF2" as the default.
    parser_default = os.getenv("CHUNKSMITH_PDF_PARSER", "PyPDF2")
    pdf_parser = Prompt.ask(
        "PDF parser",
        choices=["PyPDF2", "PyMuPDF"],
        default=parser_default if parser_default in ("PyPDF2", "PyMuPDF") else "PyPDF2",
    )

    console.print("\n[bold cyan]Step 3 — Save location[/bold cyan]\n")
    if Confirm.ask("Save under chunksmith_cli/data/runs/{stem}/?", default=True):
        artifact_root = prompt_runs_artifact_default(console, pdf_path)
    else:
        artifact_root = prompt_artifact_root(console, pdf_path)

    storage_root = ensure_cli_storage()
    stream_progress = Confirm.ask("[bold]Show progress messages?[/bold]", default=True)

    overrides: dict[str, object] = {"pdf_parser": pdf_parser}
    if gen_doc is not None:
        overrides["generate_doc_summary"] = gen_doc
    try:
        settings = load_settings(defaults=overrides)
    except ValueError as e:
        # The exception text is data, not markup: escape it so bracketed
        # fragments in the message are printed instead of parsed as style tags.
        console.print(f"[bold red]Settings error:[/bold red] {escape(str(e))}")
        return 1, None

    summary_rows = [
        ("Pipeline", "PageIndexer"),
        ("PDF", str(pdf_path)),
        ("PDF parser", pdf_parser),
        ("add_summary", str(add_summary)),
        ("add_word_range", str(add_word_range)),
        ("add_text", str(add_text)),
        ("assign_node_ids", str(assign_node_ids)),
        # Show the loaded setting when the user did not override it.
        ("generate_doc_summary", str(gen_doc if gen_doc is not None else settings.generate_doc_summary)),
        ("Artifact root", str(artifact_root) if artifact_root else "(legacy flat save)"),
        ("Storage", str(storage_root.resolve())),
        ("PAGEINDEX_MODEL", os.getenv("PAGEINDEX_MODEL", settings.pageindex_model)),
        ("Max tokens/chunk", str(settings.max_tokens_per_chunk)),
        ("Overlap pages", str(settings.overlap_pages)),
    ]
    console.print()
    console.print(_review_table(summary_rows))

    if not Confirm.ask("\n[bold]Start run[/bold]?", default=True):
        console.print("[dim]Cancelled.[/dim]")
        return 0, None

    return 0, PageIndexCliInputs(
        pdf_path=pdf_path,
        storage_root=storage_root,
        artifact_root=artifact_root,
        settings=settings,
        add_summary=add_summary,
        add_word_range=add_word_range,
        add_text=add_text,
        assign_node_ids=assign_node_ids,
        generate_doc_summary=gen_doc,
        pdf_parser=pdf_parser,
        stream_progress=stream_progress,
    )
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Run multi-indexing via published ``chunksmith-multimodal`` APIs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Any, Callable
|
|
7
|
+
|
|
8
|
+
from chunksmith_core.preferences import MultiIndexingPreferences
|
|
9
|
+
from chunksmith_multimodal import run_multi_indexing_with_preferences
|
|
10
|
+
|
|
11
|
+
# Type of the optional ``progress`` hook accepted by ``run_multi_indexing``:
# a callable taking a ``str`` and a ``dict[str, Any]`` and returning ``None``.
# NOTE(review): presumably (event name, event payload) — confirm against chunksmith_multimodal.
ProgressCallback = Callable[[str, dict[str, Any]], None]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def run_multi_indexing(
    pdf_path: Path,
    prefs: MultiIndexingPreferences,
    *,
    progress: ProgressCallback | None = None,
) -> Any:
    """Run the library pipeline for one PDF, forwarding an optional progress callback.

    Thin pass-through to ``run_multi_indexing_with_preferences``; whatever it
    returns is handed back unchanged.
    """
    outcome = run_multi_indexing_with_preferences(pdf_path, prefs, progress=progress)
    return outcome
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Shared Rich prompts for pipeline wizards."""
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Prompt for an Unstructured partition JSON file (array of element dicts)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
from rich.prompt import Confirm, Prompt
|
|
9
|
+
|
|
10
|
+
from chunksmith_cli.paths import normalize_path_string, resolve_pdf_candidate
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def prompt_for_elements_json(console: Console, *, show_banner: bool = True) -> Path | None:
    """
    Ask for the path to a JSON file whose root is a list of Unstructured-like element dicts.

    Re-prompts until an existing file is accepted. A ``.pdf`` file is always
    rejected; any other non-``.json`` extension needs explicit confirmation.
    The file's contents are not read or validated here.

    Args:
        console: Rich console used for the banner and feedback messages.
        show_banner: Print the explanatory banner before the first prompt.

    Returns:
        The accepted path (as returned by ``resolve_pdf_candidate``), or
        ``None`` when the user cancels with an empty line.
    """
    from rich.markup import escape

    if show_banner:
        console.print(
            "\n[bold green]Unstructured JSON[/bold green] — file with a top-level array of elements "
            "(``type``, ``text``, ``metadata.page_number``, …).\n"
            "[dim]Same path rules as the PDF prompt.[/dim]\n"
        )
    while True:
        raw = Prompt.ask("[bold]Elements JSON path[/bold]", default="", show_default=False)
        if not (raw or "").strip():
            return None
        p = resolve_pdf_candidate(normalize_path_string(raw))
        if p.is_file():
            if p.suffix.lower() == ".pdf":
                console.print(
                    "[bold red]That file is a PDF.[/bold red] This step needs the [bold]Unstructured partition JSON[/bold] "
                    "(a ``.json`` file whose root is an array of elements), not the PDF.\n"
                    "[dim]Partition the same PDF with Unstructured, save the element list to JSON, then enter that path.[/dim]\n"
                )
                continue
            if p.suffix.lower() != ".json":
                # The suffix comes from user input: escape it so brackets are
                # shown literally rather than parsed as Rich markup.
                if not Confirm.ask(
                    f"[yellow]Extension is {escape(repr(p.suffix))}, not .json.[/yellow] Use this file anyway? "
                    "[dim](It must still be UTF-8 JSON with a top-level array.)[/dim]",
                    default=False,
                ):
                    continue
            return p
        # Escape the path: a component like "[draft]" would otherwise be
        # treated as a style tag and vanish from the message.
        console.print(
            f"[yellow]Could not open as a file:[/yellow] {escape(str(p))}\n"
            "[dim]Try again, or Enter on an empty line to cancel.[/dim]\n"
        )
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Prompt for an existing PDF path (ChunkSmith PDF CLI flow)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
from rich.prompt import Confirm, Prompt
|
|
9
|
+
|
|
10
|
+
from chunksmith_cli.paths import normalize_path_string, resolve_pdf_candidate
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def prompt_for_existing_pdf(console: Console) -> Path | None:
    """
    Accept flexible path input; re-prompt until an existing file is chosen or the user cancels.

    A file whose extension is not ``.pdf`` is only accepted after explicit
    confirmation. The file's contents are not opened or validated here.

    Args:
        console: Rich console used for the hint and feedback messages.

    Returns:
        The accepted path (as returned by ``resolve_pdf_candidate``), or
        ``None`` when the user cancels with an empty line.
    """
    from rich.markup import escape

    console.print(
        '[dim]Paths: plain, in "quotes", ~ home, relative to this folder, or file:///… URLs. '
        "Press Enter on an empty line to cancel.[/dim]\n"
    )
    while True:
        raw = Prompt.ask("[bold]PDF path[/bold]", default="", show_default=False)
        if not (raw or "").strip():
            return None
        p = resolve_pdf_candidate(normalize_path_string(raw))
        if p.is_file():
            if p.suffix.lower() != ".pdf":
                # The suffix comes from user input: escape it so brackets are
                # shown literally rather than parsed as Rich markup.
                if not Confirm.ask(
                    f"[yellow]Extension is {escape(repr(p.suffix))}, not .pdf.[/yellow] Use this file anyway?",
                    default=False,
                ):
                    continue
            return p
        # Escape the path: a component like "[draft]" would otherwise be
        # treated as a style tag and vanish from the message.
        console.print(
            f"[yellow]Could not open as a file:[/yellow] {escape(str(p))}\n"
            "[dim]Check the path and try again, or Enter on an empty line to cancel.[/dim]\n"
        )
|
chunksmith_cli/theme.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Read-only viewers: outline tree, element lookup, explorer."""
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""View saved outline JSON — delegates to :func:`outline_browser.run_outline_browser_session`."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from rich.console import Console
|
|
6
|
+
|
|
7
|
+
from chunksmith_cli.outline_browser import run_outline_browser_session
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def run_saved_json_view(console: Console) -> int:
    """Open the saved-outline viewer and return the outline browser session's result."""
    result = run_outline_browser_session(console)
    return result
|