chunksmith-cli 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunksmith_cli/__init__.py +3 -0
- chunksmith_cli/__main__.py +41 -0
- chunksmith_cli/agent/__init__.py +1 -0
- chunksmith_cli/agent/agent_display.py +160 -0
- chunksmith_cli/agent/agent_session.py +294 -0
- chunksmith_cli/agent/agent_stream.py +152 -0
- chunksmith_cli/agent/agent_wizard.py +5 -0
- chunksmith_cli/agent_display.py +6 -0
- chunksmith_cli/agent_session.py +6 -0
- chunksmith_cli/agent_stream.py +6 -0
- chunksmith_cli/agent_wizard.py +6 -0
- chunksmith_cli/assets/chunksmith_logo.png +0 -0
- chunksmith_cli/branding.py +6 -0
- chunksmith_cli/config.py +6 -0
- chunksmith_cli/core/__init__.py +21 -0
- chunksmith_cli/core/artifact_layout.py +60 -0
- chunksmith_cli/core/branding.py +137 -0
- chunksmith_cli/core/config.py +32 -0
- chunksmith_cli/core/media_preview.py +72 -0
- chunksmith_cli/core/menu.py +110 -0
- chunksmith_cli/core/menus.py +55 -0
- chunksmith_cli/core/panels.py +24 -0
- chunksmith_cli/core/paths.py +869 -0
- chunksmith_cli/core/prefs_mapper.py +97 -0
- chunksmith_cli/core/saved_catalog.py +180 -0
- chunksmith_cli/core/theme.py +38 -0
- chunksmith_cli/elements_json_prompt.py +6 -0
- chunksmith_cli/json_view.py +6 -0
- chunksmith_cli/media_preview.py +6 -0
- chunksmith_cli/menu.py +6 -0
- chunksmith_cli/menus.py +55 -0
- chunksmith_cli/multi_indexing_wizard.py +3 -0
- chunksmith_cli/outline_browser.py +6 -0
- chunksmith_cli/panels.py +6 -0
- chunksmith_cli/partition_prefs.py +74 -0
- chunksmith_cli/paths.py +6 -0
- chunksmith_cli/pdf_prompt.py +6 -0
- chunksmith_cli/pipelines/__init__.py +1 -0
- chunksmith_cli/pipelines/mapping_validation.py +31 -0
- chunksmith_cli/pipelines/multi_indexing_config.py +35 -0
- chunksmith_cli/pipelines/multi_indexing_prompts.py +375 -0
- chunksmith_cli/pipelines/multi_indexing_runtime.py +38 -0
- chunksmith_cli/pipelines/multi_indexing_storage.py +157 -0
- chunksmith_cli/pipelines/multi_indexing_wizard.py +218 -0
- chunksmith_cli/pipelines/pageindex_wizard.py +140 -0
- chunksmith_cli/pipelines/run_multi.py +21 -0
- chunksmith_cli/prompts/__init__.py +1 -0
- chunksmith_cli/prompts/elements_json_prompt.py +49 -0
- chunksmith_cli/prompts/pdf_prompt.py +37 -0
- chunksmith_cli/saved_catalog.py +6 -0
- chunksmith_cli/theme.py +6 -0
- chunksmith_cli/tree_view.py +6 -0
- chunksmith_cli/view_session.py +6 -0
- chunksmith_cli/views/__init__.py +1 -0
- chunksmith_cli/views/json_view.py +11 -0
- chunksmith_cli/views/outline_browser.py +247 -0
- chunksmith_cli/views/tree_view.py +59 -0
- chunksmith_cli/views/view_session.py +32 -0
- chunksmith_cli/wizard.py +357 -0
- chunksmith_cli-0.4.0.dist-info/METADATA +61 -0
- chunksmith_cli-0.4.0.dist-info/RECORD +65 -0
- chunksmith_cli-0.4.0.dist-info/WHEEL +5 -0
- chunksmith_cli-0.4.0.dist-info/entry_points.txt +2 -0
- chunksmith_cli-0.4.0.dist-info/licenses/LICENSE.vectify +21 -0
- chunksmith_cli-0.4.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
"""Session-based outline browser (like Document Q&A load + sub-menu)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Callable
|
|
9
|
+
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
from rich.markup import escape
|
|
12
|
+
from rich.panel import Panel
|
|
13
|
+
from rich.prompt import Prompt
|
|
14
|
+
|
|
15
|
+
from chunksmith_cli.menu import MenuOption, prompt_menu
|
|
16
|
+
from chunksmith_cli.panels import print_extraction_warnings_panel
|
|
17
|
+
from chunksmith_cli.paths import ensure_cli_storage, normalize_path_string, resolve_pdf_candidate
|
|
18
|
+
from chunksmith_cli.saved_catalog import list_saved_outlines
|
|
19
|
+
from chunksmith_cli.tree_view import print_structure_tree
|
|
20
|
+
|
|
21
|
+
# Entries shown by the outline browser sub-menu. The first field of each
# option ("1".."5") is the same string used as a key in ``_BROWSER_ACTIONS``.
OUTLINE_BROWSER_MENU: tuple[MenuOption, ...] = (
    MenuOption("1", "Open saved outline", "Pick from cli/data or enter a path"),
    MenuOption("2", "Show section tree", "Nested outline (structure)"),
    MenuOption("3", "Document summary", "doc_description and extraction warnings"),
    MenuOption("4", "View raw JSON", "Full file contents in terminal"),
    MenuOption(
        "5",
        "Outline reasoning",
        "Chain-of-thought from outline_thinking (if saved)",
    ),
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class OutlineBrowserState:
    """Mutable state shared by the outline browser's menu actions."""

    # Parsed outline JSON; ``None`` until something has been loaded.
    data: dict[str, Any] | None = None
    # File the outline was read from, when known.
    path: Path | None = None
    # Display name used in the status line and panel titles.
    label: str | None = None

    @property
    def is_ready(self) -> bool:
        """Whether ``data`` is set and holds a list under ``"structure"``."""
        if self.data is None:
            return False
        return isinstance(self.data.get("structure"), list)

    def status_line(self) -> str:
        """Return a Rich-markup one-liner describing the loaded outline.

        When nothing usable is loaded the line is a hint to use option 1;
        otherwise it shows the label, the file kind guessed from the file
        name, and the total number of entries in the nested structure.
        """
        if not self.is_ready:
            return (
                "[dim]No outline loaded — use option 1. "
                "Works with ``*_multi_index.json``, ``*_pageindex.json``, or legacy ``*_v3_index.json``.[/dim]"
            )

        def _tally(entries: list[Any]) -> int:
            # Every entry counts once; dict entries also contribute their
            # nested "nodes" list recursively.
            count = len(entries)
            for entry in entries:
                children = entry.get("nodes") if isinstance(entry, dict) else None
                if isinstance(children, list):
                    count += _tally(children)
            return count

        structure = self.data.get("structure") if self.data else None
        node_total = _tally(structure) if isinstance(structure, list) else 0

        kind = "outline"
        if self.path:
            filename = self.path.name
            # First matching marker wins, in this order.
            for marker, kind_name in (
                ("_multi_index", "multi-index"),
                ("_pageindex", "pageindex"),
                ("_v3_index", "v3 (legacy)"),
            ):
                if marker in filename:
                    kind = kind_name
                    break

        shown_label = escape(self.label or "outline")
        return (
            f"[green]Loaded:[/green] [bold]{shown_label}[/bold] "
            f"[dim]({kind} · {node_total} nodes)[/dim]"
        )
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _load_outline_file(path: Path) -> dict[str, Any]:
|
|
77
|
+
data = json.loads(path.read_text(encoding="utf-8"))
|
|
78
|
+
if not isinstance(data, dict):
|
|
79
|
+
raise ValueError("root must be a JSON object")
|
|
80
|
+
if not isinstance(data.get("structure"), list):
|
|
81
|
+
raise ValueError("missing top-level 'structure' (not an outline file)")
|
|
82
|
+
return data
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _prompt_outline_path(console: Console, storage: Path) -> Path | None:
    """Ask the user which outline file to open.

    Prints the entries returned by ``list_saved_outlines()`` and then loops
    until the user picks a list number, enters something that resolves to an
    existing file (also tried by file name under ``storage``), or cancels.

    Args:
        console: Rich console used for all output.
        storage: Directory searched by file name when the typed path is not a file.

    Returns:
        The chosen file path, or ``None`` when the answer is empty and no
        saved outlines are listed.
    """
    rows = list_saved_outlines()
    console.print(
        f"\n[bold]Open saved outline[/bold]\n"
        f"[dim]``*_pageindex.json`` or ``*_v3_index.json`` under[/dim] [cyan]{storage}[/cyan]\n"
        f"[dim]Path, stem, or number from list. Empty line = cancel.[/dim]\n"
    )
    if rows:
        console.print("[dim]Recent outlines:[/dim]")
        for i, r in enumerate(rows, start=1):
            tag = f" [dim]({r.kind} · {r.source})[/dim]"
            console.print(f" [cyan]{i:>2}[/] {escape(r.stem)}{tag}")

    while True:
        raw = Prompt.ask("[bold]Outline path or number[/bold]", default="", show_default=False).strip()
        if not raw:
            if not rows:
                return None
            pick = Prompt.ask("[bold]Or pick number[/bold]", default="1")
            try:
                picked = int(pick)
            except ValueError:
                picked = 0
            # Range-check explicitly: a bare ``rows[picked - 1]`` would accept
            # 0 and negative numbers and return an entry from the end.
            if 1 <= picked <= len(rows):
                return rows[picked - 1].path
            console.print("[red]Invalid number.[/red]")
            continue

        if raw.isdigit() and rows:
            idx = int(raw)
            if 1 <= idx <= len(rows):
                return rows[idx - 1].path
            # Out-of-range numbers fall through and are tried as a path.

        path = resolve_pdf_candidate(normalize_path_string(raw))
        if not path.is_file():
            # Fall back to a file of the same name inside the storage directory.
            alt = (storage / path.name).resolve()
            if alt.is_file():
                path = alt
        if path.is_file():
            return path
        console.print("[yellow]Not a file — try again or leave empty to cancel.[/yellow]\n")
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _open_outline(console: Console, state: OutlineBrowserState, storage: Path) -> None:
    """Prompt for an outline file, load it into ``state`` and print its tree.

    On cancel or load failure ``state`` is left untouched and a message is
    printed instead.
    """
    path = _prompt_outline_path(console, storage)
    if path is None:
        console.print("[dim]Cancelled.[/dim]")
        return
    try:
        data = _load_outline_file(path)
    except (OSError, json.JSONDecodeError, ValueError) as e:
        # Escape the message: it can contain file paths or JSON fragments
        # with square brackets that Rich would otherwise read as markup.
        console.print(f"[bold red]Could not load:[/bold red] {escape(str(e))}")
        return
    state.data = data
    state.path = path.resolve()
    state.label = str(data.get("doc_name") or path.stem)
    console.print(f"\n[bold green]Loaded.[/bold green] {state.status_line()}\n")
    # The tree title uses the same name that was just stored as the label.
    print_structure_tree(console, data.get("structure"), title=f"Outline — {state.label}")
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _show_tree(console: Console, state: OutlineBrowserState) -> None:
    """Print the loaded outline's ``structure`` as a tree, or a hint if none is loaded."""
    if state.is_ready:
        name = state.data.get("doc_name") or state.label or "document"
        heading = f"Outline — {str(name)}"
        print_structure_tree(console, state.data.get("structure"), title=heading)
        return
    console.print("[yellow]Load an outline first (option 1).[/yellow]")
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _show_summary(console: Console, state: OutlineBrowserState) -> None:
    """Show extraction warnings and the document description of the loaded outline.

    Prints a dim notice when the file has neither a non-blank
    ``doc_description`` nor any ``extraction_warnings``.
    """
    if not state.is_ready:
        console.print("[yellow]Load an outline first (option 1).[/yellow]")
        return

    warnings = state.data.get("extraction_warnings")
    print_extraction_warnings_panel(console, warnings)

    description = state.data.get("doc_description")
    body = description.strip() if isinstance(description, str) else ""
    if body:
        panel = Panel(
            escape(body),
            title="[bold]Document description[/bold]",
            border_style="blue",
        )
        console.print(panel)
        return
    if not warnings:
        console.print("[dim]No doc_description or warnings in this file.[/dim]")
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def _show_raw_json(console: Console, state: OutlineBrowserState) -> None:
    """Print the whole loaded outline as indented JSON inside a panel."""
    if not state.is_ready:
        console.print("[yellow]Load an outline first (option 1).[/yellow]")
        return
    # A plain string given to Panel is rendered as console markup, so the dump
    # is escaped; otherwise bracketed text inside the document (for example
    # "[note]" in a title) would be consumed as a style tag.
    dumped = escape(json.dumps(state.data, ensure_ascii=False, indent=2))
    console.print(
        Panel(
            dumped,
            title=f"[bold]JSON — {escape(state.label or '')}[/bold]",
            border_style="green",
            expand=False,
        )
    )
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _show_outline_thinking(console: Console, state: OutlineBrowserState) -> None:
    """Print the ``outline_thinking`` list of the loaded outline, if it has one."""
    if not state.is_ready:
        console.print("[yellow]Load an outline first (option 1).[/yellow]")
        return
    ot = state.data.get("outline_thinking")
    if not isinstance(ot, list) or not ot:
        console.print("[dim]No ``outline_thinking`` in this file.[/dim]")
        return
    # Escape the dump so bracketed text in the saved reasoning is shown
    # literally instead of being parsed as Rich markup.
    dumped = escape(json.dumps(ot, ensure_ascii=False, indent=2))
    console.print(
        Panel(
            dumped,
            title="[bold]outline_thinking[/bold]",
            border_style="magenta",
            expand=False,
        )
    )
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def _without_storage(
    view: Callable[[Console, OutlineBrowserState], None],
) -> Callable[[Console, OutlineBrowserState, Path], None]:
    """Adapt a two-argument view action to the three-argument dispatch signature."""

    def _action(console: Console, state: OutlineBrowserState, storage: Path) -> None:
        # The view actions do not need the storage directory; drop it.
        view(console, state)

    return _action


# Menu key -> handler. The session loop calls every handler as
# ``action(console, state, storage)``; only ``_open_outline`` takes the
# storage directory, so the read-only views are wrapped to accept and ignore it
# (calling them directly with three arguments raises TypeError).
_BROWSER_ACTIONS: dict[str, Callable[[Console, OutlineBrowserState, Path], None]] = {
    "1": _open_outline,
    "2": _without_storage(_show_tree),
    "3": _without_storage(_show_summary),
    "4": _without_storage(_show_raw_json),
    "5": _without_storage(_show_outline_thinking),
}
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def run_outline_browser_session(
    console: Console,
    *,
    preload_path: Path | None = None,
    preload_data: dict[str, Any] | None = None,
) -> int:
    """Run the outline browser menu loop until the user exits or goes back.

    Args:
        console: Rich console used for output and prompts.
        preload_path: Optional source path recorded for preloaded data.
        preload_data: Optional outline dict to start with instead of an empty state.

    Returns:
        Always ``0`` (returned when the menu yields ``"exit"`` or ``"back"``).
    """
    state = OutlineBrowserState()
    storage = ensure_cli_storage()

    if preload_data is not None:
        # Label preference: the data's doc_name, then the path stem, then "outline".
        fallback_label = preload_path.stem if preload_path else "outline"
        state.data = preload_data
        state.path = preload_path
        state.label = str(preload_data.get("doc_name") or fallback_label)

    intro = (
        "\n[bold cyan]View outlines[/bold cyan]\n"
        "[dim]Read-only — inspect section trees from saved runs (no element search here).[/dim]\n"
        "[dim]For page/section text chunks use [bold]View → View elements[/bold].[/dim]\n"
    )
    console.print(intro)

    while True:
        console.print()
        console.print(state.status_line())
        # Default to the tree view once something is loaded, else to "open".
        suggested = "2" if state.is_ready else "1"
        choice = prompt_menu(
            console,
            title="View outlines",
            options=OUTLINE_BROWSER_MENU,
            prompt="What next",
            default_key=suggested,
            allow_back=True,
            hint="Option 2+ need a loaded outline — use option 1 first.",
        )
        if choice in ("exit", "back"):
            return 0
        handler = _BROWSER_ACTIONS.get(choice)
        if not handler:
            # Unknown key: just show the menu again.
            continue
        handler(console, state, storage)
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Rich tree rendering for outline ``structure`` JSON."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
from rich.markup import escape
|
|
9
|
+
from rich.tree import Tree
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _node_caption(node: dict[str, Any], *, summary_max: int = 72) -> str:
    """Build the Rich-markup label for one outline node.

    The label is the bold title, followed (when present) by a dim
    parenthesised group holding the page range and node id, and a dim
    summary truncated to ``summary_max`` characters.

    Args:
        node: Outline node; reads ``title``, ``start_index``, ``end_index``,
            ``node_id`` and ``summary``.
        summary_max: Maximum summary length including the trailing ellipsis.
    """
    title = str(node.get("title") or "").strip() or "(untitled)"
    parts: list[str] = []
    si, ei = node.get("start_index"), node.get("end_index")
    if si is not None and ei is not None:
        parts.append(f"pp.{si}–{ei}")
    elif si is not None:
        parts.append(f"pp.{si}+")
    nid = node.get("node_id")
    if nid:
        parts.append(str(nid))
    # These values come from the outline file just like title and summary, so
    # they are escaped too rather than being interpreted as markup.
    meta = f" [dim]({escape(', '.join(parts))})[/dim]" if parts else ""
    summary = str(node.get("summary") or "").strip().replace("\n", " ")
    if summary:
        if len(summary) > summary_max:
            summary = summary[: summary_max - 1] + "…"
        summary = f" [dim]— {escape(summary)}[/dim]"
    return f"[bold white]{escape(title)}[/bold white]{meta}{summary}"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _add_node(branch: Tree, node: dict[str, Any]) -> None:
    """Append ``node`` to ``branch`` and recurse into its ``"nodes"`` list.

    Children that are not dicts are added as escaped plain-text leaves.
    """
    twig = branch.add(_node_caption(node))
    children = node.get("nodes")
    if not isinstance(children, list):
        return
    for child in children:
        if isinstance(child, dict):
            _add_node(twig, child)
            continue
        twig.add(escape(str(child)))
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def print_structure_tree(console: Console, structure: Any, *, title: str = "Processed outline") -> None:
    """Render ``structure`` as a Rich ``Tree`` under ``title`` and print it.

    A list contributes one branch per item, a dict is treated as a single
    root node, and any other value is shown as one escaped text leaf.
    """
    root = Tree(f"[bold cyan]{escape(title)}[/bold cyan]")
    if isinstance(structure, dict):
        _add_node(root, structure)
    elif isinstance(structure, list):
        for entry in structure:
            if not isinstance(entry, dict):
                root.add(escape(str(entry)))
            else:
                _add_node(root, entry)
    else:
        root.add(escape(str(structure)))
    console.print(root)
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""View mode: browse saved outlines (multi-indexing and legacy pageindex JSON)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from rich.console import Console
|
|
6
|
+
|
|
7
|
+
from chunksmith_cli.menu import prompt_menu
|
|
8
|
+
from chunksmith_cli.menus import VIEW_MENU
|
|
9
|
+
from chunksmith_cli.outline_browser import run_outline_browser_session
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def run_view_session(console: Console) -> None:
    """Show the View sub-menu in a loop; option 1 opens the outline browser.

    Returns when the menu yields ``"exit"`` or ``"back"``.
    """
    banner = (
        "\n[bold cyan]View[/bold cyan]\n"
        "[dim]Inspect saved runs — ``*_multi_index.json``, ``*_pageindex.json``, or legacy outlines.[/dim]\n"
    )
    console.print(banner)

    leave = ("exit", "back")
    while True:
        selected = prompt_menu(
            console,
            title="View",
            options=VIEW_MENU,
            prompt="What to view",
            default_key="1",
            allow_back=True,
            hint="Read-only section tree and JSON from prior chunking runs.",
        )
        if selected in leave:
            return
        if selected != "1":
            # Any other key is ignored and the menu is shown again.
            continue
        run_outline_browser_session(console)
|
chunksmith_cli/wizard.py
ADDED
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
"""Interactive ChunkSmith CLI — multi-indexing and PageIndexer."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any, Callable
|
|
9
|
+
|
|
10
|
+
from dotenv import load_dotenv
|
|
11
|
+
|
|
12
|
+
from chunksmith_cli.agent_session import run_agent_session
|
|
13
|
+
from chunksmith_cli.branding import print_chunksmith_banner, print_session_hint
|
|
14
|
+
from chunksmith_cli.config import show_verbose_pipeline_events
|
|
15
|
+
from chunksmith_cli.menu import confirm_continue_session, prompt_menu
|
|
16
|
+
from chunksmith_cli.menus import CHUNKING_MENU, ROOT_MENU
|
|
17
|
+
from chunksmith_cli.panels import print_extraction_warnings_panel
|
|
18
|
+
from chunksmith_cli.paths import archive_multi_indexing_cli_outputs, archive_pageindex_cli_outputs, cli_archive_stamp
|
|
19
|
+
from chunksmith_cli.pipelines.multi_indexing_wizard import gather_multi_indexing_inputs
|
|
20
|
+
from chunksmith_cli.pipelines.pageindex_wizard import gather_pageindex_inputs
|
|
21
|
+
from chunksmith_cli.pipelines.run_multi import run_multi_indexing
|
|
22
|
+
from chunksmith_cli.tree_view import print_structure_tree
|
|
23
|
+
from chunksmith_cli.view_session import run_view_session
|
|
24
|
+
|
|
25
|
+
try:
|
|
26
|
+
from rich.console import Console
|
|
27
|
+
from rich.markup import escape
|
|
28
|
+
from rich.panel import Panel
|
|
29
|
+
from rich.prompt import Confirm
|
|
30
|
+
|
|
31
|
+
from chunksmith_cli.theme import make_console
|
|
32
|
+
except ImportError:
|
|
33
|
+
print(
|
|
34
|
+
"Missing dependency: install Rich for the CLI.\n pip install rich\n pip install chunksmith-cli",
|
|
35
|
+
file=sys.stderr,
|
|
36
|
+
)
|
|
37
|
+
raise SystemExit(1) from None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _show_optional_outline_extras(console: Console, out: dict[str, Any]) -> None:
    """Offer optional extras from a pipeline result dict.

    In order: asks whether to show a non-empty ``outline_thinking`` list,
    always shows a non-blank ``doc_description``, and asks whether to show a
    truncated ``flat_rows_toon`` snippet when that value is truthy.

    All file-derived text is escaped before being placed in a panel so that
    square brackets in it are not interpreted as Rich markup.
    """
    ot = out.get("outline_thinking")
    if (
        isinstance(ot, list)
        and ot
        and Confirm.ask(
            "\n[bold]Show outline thinking?[/bold] [dim](optional JSON)[/dim]",
            default=False,
        )
    ):
        console.print(
            Panel(
                escape(json.dumps(ot, ensure_ascii=False, indent=2)),
                title="[bold]outline_thinking[/bold]",
                border_style="magenta",
                expand=False,
            )
        )

    desc = out.get("doc_description")
    if isinstance(desc, str) and desc.strip():
        console.print(
            Panel(
                escape(desc.strip()),
                title="[bold]Document description[/bold]",
                border_style="blue",
            )
        )

    if encoding_fmt := out.get("flat_rows_toon"):
        if Confirm.ask("\n[bold]Show flat_rows TOON snippet?[/bold]", default=False):
            text = str(encoding_fmt)
            # Show at most 12000 characters, marking truncation with an ellipsis.
            snippet = text[:12000] + ("…" if len(text) > 12000 else "")
            console.print(
                Panel(
                    escape(snippet),
                    title="[bold]flat_rows_toon[/bold]",
                    border_style="cyan",
                    expand=False,
                )
            )
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _pipeline_result_to_dict(result: Any, *, encoding: str, pdf_path: Path) -> dict[str, Any]:
|
|
83
|
+
outline = result.title_outline if isinstance(result.title_outline, dict) else {}
|
|
84
|
+
out: dict[str, Any] = {
|
|
85
|
+
"doc_name": outline.get("doc_name") or pdf_path.name,
|
|
86
|
+
"llm_context_format": encoding,
|
|
87
|
+
"structure": outline.get("structure"),
|
|
88
|
+
"flat_rows": outline.get("flat_rows"),
|
|
89
|
+
"doc_description": outline.get("doc_description"),
|
|
90
|
+
"outline_thinking": outline.get("outline_thinking"),
|
|
91
|
+
"indexer_options": outline.get("indexer_options"),
|
|
92
|
+
"mapping_method": result.config.mapping_method,
|
|
93
|
+
"use_group_by_title": result.config.use_group_by_title,
|
|
94
|
+
"title_coded_formate_chars": len(result.title_coded_formate or ""),
|
|
95
|
+
"n_elements": len(result.elements or []),
|
|
96
|
+
"n_chunks": len(result.chunks_local or []) if result.chunks_local else 0,
|
|
97
|
+
}
|
|
98
|
+
if encoding == "toon":
|
|
99
|
+
out["flat_rows_toon"] = outline.get("flat_rows_toon")
|
|
100
|
+
out["structure_toon"] = outline.get("structure_toon")
|
|
101
|
+
return out
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _run_multi_indexing_pipeline(console: Console) -> int:
    """Gather inputs, run the multi-indexing pipeline, display and archive results.

    Exception messages, the TOON snippet and the JSON dump are escaped before
    printing so that square brackets in them are not parsed as Rich markup.

    Returns:
        The code from ``gather_multi_indexing_inputs`` when it is non-zero or
        no inputs were produced, ``1`` when the pipeline raised, else ``0``.
    """
    code, inputs = gather_multi_indexing_inputs(console)
    if code != 0 or inputs is None:
        return code

    encoding = inputs.encoding
    console.print(
        f"\n[bold cyan]Running multi-indexing ({encoding.upper()} context)…[/bold cyan]\n"
        "[dim]partition → group → coded format → page index → mapper[/dim]\n"
    )

    def _progress(event: str, payload: dict[str, Any]) -> None:
        # Progress callback handed to the pipeline; silent unless streaming is on.
        if not inputs.stream_events:
            return
        if event == "pipeline:done":
            console.print("[bold green]Pipeline finished[/bold green]")
        elif event.startswith("partition:") or event.startswith("pipeline:"):
            console.print(f"[cyan]{escape(event)}[/cyan] [dim]{escape(str(payload))}[/dim]", highlight=False)
        elif show_verbose_pipeline_events():
            console.print(f"[dim]{escape(event)}[/dim] {escape(str(payload))}", highlight=False)

    try:
        result = run_multi_indexing(
            inputs.pdf_path,
            inputs.mi_prefs,
            progress=_progress,
        )
    except ImportError as e:
        if encoding == "toon" and "toon" in str(e).lower():
            console.print(
                f"[bold red]Missing TOON dependency:[/bold red] {escape(str(e))}\n"
                '[cyan]pip install "chunksmith-multimodal[toon]"[/cyan]'
            )
        else:
            console.print(f"[bold red]Import error:[/bold red] {escape(str(e))}")
        return 1
    except ConnectionError as e:
        console.print(f"[bold red]Network error:[/bold red] {escape(str(e))}")
        return 1
    except OSError as e:
        # These winerror/errno values are treated as "cannot reach the API"
        # (presumably name-resolution failures — confirm against the platforms used).
        if getattr(e, "winerror", None) == 11001 or getattr(e, "errno", None) in (11001, -2, -3):
            console.print(
                "[bold red]Cannot reach Unstructured API.[/bold red] "
                "Check `.env` (``UNSTRUCTURED_API_KEY``, ``UNSTRUCTURED_API_URL``).\n"
                f"[dim]{escape(str(e))}[/dim]"
            )
            return 1
        console.print(f"[bold red]Failed:[/bold red] {escape(str(e))}")
        return 1
    except Exception as e:
        # Top-level CLI boundary: report any other failure and return an error code.
        console.print(f"[bold red]Failed:[/bold red] {escape(str(e))}")
        return 1

    out = _pipeline_result_to_dict(result, encoding=encoding, pdf_path=inputs.pdf_path)
    doc_title = str(out.get("doc_name") or inputs.pdf_path.stem)
    print_structure_tree(console, out.get("structure"), title=f"Outline tree — {doc_title}")
    print_extraction_warnings_panel(console, out.get("extraction_warnings"))
    _show_optional_outline_extras(console, out)

    if inputs.show_toon_after and encoding == "toon":
        flat_toon = out.get("flat_rows_toon") or out.get("structure_toon")
        if flat_toon:
            console.print("\n[bold cyan]TOON outline snippet[/bold cyan]\n")
            text = str(flat_toon)
            # Show at most 8000 characters, marking truncation with an ellipsis.
            snippet = text[:8000] + ("…" if len(text) > 8000 else "")
            console.print(escape(snippet), highlight=False)

    stamp = cli_archive_stamp()
    archived = archive_multi_indexing_cli_outputs(
        inputs.storage_root,
        inputs.pdf_path.stem,
        out,
        elements=result.elements,
        title_coded_formate=result.title_coded_formate,
        title_outline=result.title_outline if isinstance(result.title_outline, dict) else None,
        mapper_output=result.mapper_output,
        stamp=stamp,
        artifact_root=inputs.artifact_root,
    )

    if inputs.artifact_root is not None:
        console.print(
            f"\n[bold green]Saved under[/bold green] [cyan]{archived.get('artifact_root', inputs.artifact_root)}[/cyan]"
        )
        if "title_coded_formate" in archived:
            console.print(f"[dim]Coded markup (LLM input string):[/dim] [cyan]{archived['title_coded_formate']}[/cyan]")
    else:
        console.print("\n[bold green]Saved to legacy flat storage[/bold green]")

    # List every archived artifact that was actually written.
    for label, key in (
        ("Multi-index", "multi_index"),
        ("Page index", "pageindex"),
        ("Unstructured JSON", "unstructured_elements"),
        ("Pickle (raw elements)", "pickle"),
        ("Images folder", "image_dir"),
        ("Coded markup", "title_coded_formate"),
        ("TOON flat rows", "flat_rows_toon"),
        ("TOON structure", "structure_toon"),
        ("Mapper", "mapper"),
    ):
        if key in archived:
            console.print(f" [cyan]{label}[/cyan] {archived[key].resolve()}")

    if Confirm.ask("\n[bold]Print full JSON here?[/bold]", default=False):
        console.print(
            Panel(
                escape(json.dumps(out, ensure_ascii=False, indent=2)),
                title="[bold]JSON[/bold]",
                border_style="green",
                expand=False,
            )
        )

    console.print("\n[bold green]Done.[/bold green]")
    return 0
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _run_pageindex_pipeline(console: Console) -> int:
    """Gather inputs, run PageIndexer on a PDF, display and archive the outline.

    The exception message and the JSON dump are escaped before printing so
    that square brackets in them are not parsed as Rich markup.

    Returns:
        The code from ``gather_pageindex_inputs`` when it is non-zero or no
        inputs were produced, ``1`` when the build raised, else ``0``.
    """
    code, inputs = gather_pageindex_inputs(console)
    if code != 0 or inputs is None:
        return code

    # Imported here rather than at module top, so the package is only
    # required when this pipeline is actually run.
    from chunksmith_pageindex import build_outline_from_pdf

    console.print("\n[bold cyan]Running PageIndexer…[/bold cyan]\n[dim]PDF → tagged pages → LLM outline[/dim]\n")

    if inputs.stream_progress:
        console.print("[cyan]pipeline:load_pdf[/cyan] [dim]loading pages…[/dim]")

    try:
        result = build_outline_from_pdf(
            str(inputs.pdf_path),
            settings=inputs.settings,
            add_text=inputs.add_text,
            assign_node_ids=inputs.assign_node_ids,
            add_summary=inputs.add_summary,
            add_word_range=inputs.add_word_range,
            generate_doc_summary=inputs.generate_doc_summary,
        )
    except Exception as e:
        # Top-level CLI boundary: report the failure and return an error code.
        console.print(f"[bold red]PageIndexer failed:[/bold red] {escape(str(e))}")
        return 1

    if inputs.stream_progress:
        console.print("[cyan]pipeline:done[/cyan] [dim]outline built[/dim]")

    structure = result.get("structure")
    if isinstance(structure, list) and structure:
        console.print("\n[bold cyan]Outline tree[/bold cyan]\n")
        print_structure_tree(console, structure, title=f"Outline — {inputs.pdf_path.stem}")

    stamp = cli_archive_stamp()
    archived = archive_pageindex_cli_outputs(
        inputs.storage_root,
        inputs.pdf_path.stem,
        result,
        stamp=stamp,
        artifact_root=inputs.artifact_root,
    )

    if inputs.artifact_root is not None:
        console.print(
            f"\n[bold green]Saved under[/bold green] [cyan]{archived.get('artifact_root', inputs.artifact_root)}[/cyan]"
        )
    else:
        console.print("\n[bold green]Saved to legacy flat storage[/bold green]")

    if "pageindex" in archived:
        console.print(f" [cyan]Page index[/cyan] {archived['pageindex'].resolve()}")
    if "tagged_pages" in archived:
        console.print(f" [cyan]Tagged pages[/cyan] {archived['tagged_pages'].resolve()}")

    if Confirm.ask("\n[bold]Print full JSON here?[/bold]", default=False):
        console.print(
            Panel(
                escape(json.dumps(result, ensure_ascii=False, indent=2)),
                title="[bold]pageindex JSON[/bold]",
                border_style="green",
                expand=False,
            )
        )

    console.print("\n[bold green]Done.[/bold green]")
    return 0
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
# Chunking menu choice -> pipeline runner. Several keys map to the same
# runner: "1"/"multi"/"indexing" run multi-indexing and "2"/"pageindex"/"pi"
# run PageIndexer. ``run_interactive`` looks the menu result up here.
_CHUNKING_ACTIONS: dict[str, Callable[[Console], int]] = {
    "1": _run_multi_indexing_pipeline,
    "multi": _run_multi_indexing_pipeline,
    "indexing": _run_multi_indexing_pipeline,
    "2": _run_pageindex_pipeline,
    "pageindex": _run_pageindex_pipeline,
    "pi": _run_pageindex_pipeline,
}
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def run_interactive(console: Console | None = None) -> int:
    """Run the top-level interactive CLI loop.

    Loads ``.env``, prints the banner, then repeatedly shows the root menu.
    ``"agent"`` and ``"view"`` open their sessions; any other non-exit
    choice leads to the chunking menu, whose selection is dispatched through
    ``_CHUNKING_ACTIONS``. After each session or pipeline run the user is
    asked whether to continue.

    Args:
        console: Console to use; one is created with ``make_console()`` when omitted.

    Returns:
        Always ``0``; the return code of a pipeline run is not propagated.
    """
    console = console or make_console()
    load_dotenv()
    print_chunksmith_banner(console)
    print_session_hint(console)

    while True:
        root = prompt_menu(
            console,
            title="Start here",
            options=ROOT_MENU,
            prompt="Mode",
            default_key="chunking",
            hint="Chunking = build indexes · View = inspect saved JSON · Agent = Q&A (saved indexes).",
        )
        if root == "exit":
            console.print("[dim]Goodbye.[/dim]")
            return 0

        if root == "agent":
            try:
                run_agent_session(console)
            except ImportError as e:
                # Escape the message so brackets in it (for example in a
                # path) are not parsed as Rich markup.
                console.print(
                    "[bold yellow]Agent mode unavailable.[/bold yellow]\n"
                    f"[dim]{escape(str(e))}[/dim]\n"
                    'Install separately: [cyan]pip install "chunksmith-agent[langchain]"[/cyan]'
                )
            if not confirm_continue_session(console):
                return 0
            continue

        if root == "view":
            run_view_session(console)
            if not confirm_continue_session(console):
                return 0
            continue

        choice = prompt_menu(
            console,
            title="Chunking — pick pipeline",
            options=CHUNKING_MENU,
            prompt="What to run",
            default_key="1",
            allow_back=True,
            hint="1 = Multi-indexing (JSON/TOON) · 2 = PageIndexer (no Unstructured API).",
        )
        if choice == "exit":
            console.print("[dim]Goodbye.[/dim]")
            return 0
        if choice == "back":
            continue

        action = _CHUNKING_ACTIONS.get(choice)
        if action:
            action(console)

        if not confirm_continue_session(console):
            return 0
|