chunksmith-cli 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. chunksmith_cli/__init__.py +3 -0
  2. chunksmith_cli/__main__.py +41 -0
  3. chunksmith_cli/agent/__init__.py +1 -0
  4. chunksmith_cli/agent/agent_display.py +160 -0
  5. chunksmith_cli/agent/agent_session.py +294 -0
  6. chunksmith_cli/agent/agent_stream.py +152 -0
  7. chunksmith_cli/agent/agent_wizard.py +5 -0
  8. chunksmith_cli/agent_display.py +6 -0
  9. chunksmith_cli/agent_session.py +6 -0
  10. chunksmith_cli/agent_stream.py +6 -0
  11. chunksmith_cli/agent_wizard.py +6 -0
  12. chunksmith_cli/assets/chunksmith_logo.png +0 -0
  13. chunksmith_cli/branding.py +6 -0
  14. chunksmith_cli/config.py +6 -0
  15. chunksmith_cli/core/__init__.py +21 -0
  16. chunksmith_cli/core/artifact_layout.py +60 -0
  17. chunksmith_cli/core/branding.py +137 -0
  18. chunksmith_cli/core/config.py +32 -0
  19. chunksmith_cli/core/media_preview.py +72 -0
  20. chunksmith_cli/core/menu.py +110 -0
  21. chunksmith_cli/core/menus.py +55 -0
  22. chunksmith_cli/core/panels.py +24 -0
  23. chunksmith_cli/core/paths.py +869 -0
  24. chunksmith_cli/core/prefs_mapper.py +97 -0
  25. chunksmith_cli/core/saved_catalog.py +180 -0
  26. chunksmith_cli/core/theme.py +38 -0
  27. chunksmith_cli/elements_json_prompt.py +6 -0
  28. chunksmith_cli/json_view.py +6 -0
  29. chunksmith_cli/media_preview.py +6 -0
  30. chunksmith_cli/menu.py +6 -0
  31. chunksmith_cli/menus.py +55 -0
  32. chunksmith_cli/multi_indexing_wizard.py +3 -0
  33. chunksmith_cli/outline_browser.py +6 -0
  34. chunksmith_cli/panels.py +6 -0
  35. chunksmith_cli/partition_prefs.py +74 -0
  36. chunksmith_cli/paths.py +6 -0
  37. chunksmith_cli/pdf_prompt.py +6 -0
  38. chunksmith_cli/pipelines/__init__.py +1 -0
  39. chunksmith_cli/pipelines/mapping_validation.py +31 -0
  40. chunksmith_cli/pipelines/multi_indexing_config.py +35 -0
  41. chunksmith_cli/pipelines/multi_indexing_prompts.py +375 -0
  42. chunksmith_cli/pipelines/multi_indexing_runtime.py +38 -0
  43. chunksmith_cli/pipelines/multi_indexing_storage.py +157 -0
  44. chunksmith_cli/pipelines/multi_indexing_wizard.py +218 -0
  45. chunksmith_cli/pipelines/pageindex_wizard.py +140 -0
  46. chunksmith_cli/pipelines/run_multi.py +21 -0
  47. chunksmith_cli/prompts/__init__.py +1 -0
  48. chunksmith_cli/prompts/elements_json_prompt.py +49 -0
  49. chunksmith_cli/prompts/pdf_prompt.py +37 -0
  50. chunksmith_cli/saved_catalog.py +6 -0
  51. chunksmith_cli/theme.py +6 -0
  52. chunksmith_cli/tree_view.py +6 -0
  53. chunksmith_cli/view_session.py +6 -0
  54. chunksmith_cli/views/__init__.py +1 -0
  55. chunksmith_cli/views/json_view.py +11 -0
  56. chunksmith_cli/views/outline_browser.py +247 -0
  57. chunksmith_cli/views/tree_view.py +59 -0
  58. chunksmith_cli/views/view_session.py +32 -0
  59. chunksmith_cli/wizard.py +357 -0
  60. chunksmith_cli-0.4.0.dist-info/METADATA +61 -0
  61. chunksmith_cli-0.4.0.dist-info/RECORD +65 -0
  62. chunksmith_cli-0.4.0.dist-info/WHEEL +5 -0
  63. chunksmith_cli-0.4.0.dist-info/entry_points.txt +2 -0
  64. chunksmith_cli-0.4.0.dist-info/licenses/LICENSE.vectify +21 -0
  65. chunksmith_cli-0.4.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,247 @@
1
+ """Session-based outline browser (like Document Q&A load + sub-menu)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Any, Callable
9
+
10
+ from rich.console import Console
11
+ from rich.markup import escape
12
+ from rich.panel import Panel
13
+ from rich.prompt import Prompt
14
+
15
+ from chunksmith_cli.menu import MenuOption, prompt_menu
16
+ from chunksmith_cli.panels import print_extraction_warnings_panel
17
+ from chunksmith_cli.paths import ensure_cli_storage, normalize_path_string, resolve_pdf_candidate
18
+ from chunksmith_cli.saved_catalog import list_saved_outlines
19
+ from chunksmith_cli.tree_view import print_structure_tree
20
+
21
# Sub-menu entries shown by the outline browser. Each entry is built as
# MenuOption(key, label, description); the keys "1"-"5" are the same keys
# used to look up handlers in ``_BROWSER_ACTIONS``.
OUTLINE_BROWSER_MENU: tuple[MenuOption, ...] = (
    MenuOption("1", "Open saved outline", "Pick from cli/data or enter a path"),
    MenuOption("2", "Show section tree", "Nested outline (structure)"),
    MenuOption("3", "Document summary", "doc_description and extraction warnings"),
    MenuOption("4", "View raw JSON", "Full file contents in terminal"),
    MenuOption(
        "5",
        "Outline reasoning",
        "Chain-of-thought from outline_thinking (if saved)",
    ),
)
32
+
33
+
34
@dataclass
class OutlineBrowserState:
    """Mutable state of one outline-browser session.

    ``data`` holds the parsed outline JSON, ``path`` the file it came from
    (if any) and ``label`` the display name used in status lines and titles.
    """

    data: dict[str, Any] | None = None
    path: Path | None = None
    label: str | None = None

    @property
    def is_ready(self) -> bool:
        """True once ``data`` is loaded and its ``structure`` entry is a list."""
        if self.data is None:
            return False
        return isinstance(self.data.get("structure"), list)

    def status_line(self) -> str:
        """Return a Rich-markup one-liner describing what is currently loaded."""
        if not self.is_ready:
            return (
                "[dim]No outline loaded — use option 1. "
                "Works with ``*_multi_index.json``, ``*_pageindex.json``, or legacy ``*_v3_index.json``.[/dim]"
            )

        # Count every entry at every nesting level, walking the tree with an
        # explicit stack of child lists.
        node_total = 0
        roots = self.data.get("structure") if self.data else None
        if isinstance(roots, list):
            pending: list[list[Any]] = [roots]
            while pending:
                level = pending.pop()
                node_total += len(level)
                for entry in level:
                    if isinstance(entry, dict) and isinstance(entry.get("nodes"), list):
                        pending.append(entry["nodes"])

        # The file-name marker decides the kind; the first matching marker wins.
        flavour = "outline"
        if self.path:
            file_name = self.path.name
            markers = (
                ("_multi_index", "multi-index"),
                ("_pageindex", "pageindex"),
                ("_v3_index", "v3 (legacy)"),
            )
            flavour = next(
                (name for marker, name in markers if marker in file_name),
                "outline",
            )

        shown = escape(self.label or "outline")
        return (
            f"[green]Loaded:[/green] [bold]{shown}[/bold] "
            f"[dim]({flavour} · {node_total} nodes)[/dim]"
        )
74
+
75
+
76
+ def _load_outline_file(path: Path) -> dict[str, Any]:
77
+ data = json.loads(path.read_text(encoding="utf-8"))
78
+ if not isinstance(data, dict):
79
+ raise ValueError("root must be a JSON object")
80
+ if not isinstance(data.get("structure"), list):
81
+ raise ValueError("missing top-level 'structure' (not an outline file)")
82
+ return data
83
+
84
+
85
def _prompt_outline_path(console: Console, storage: Path) -> Path | None:
    """Ask which outline file to open and return its path.

    Lists the entries from ``list_saved_outlines()`` and then loops until the
    user gives a list number or something that resolves to an existing file
    (tried as given, then by file name under *storage*).

    Returns:
        The chosen path, or ``None`` when the input is empty and there are no
        saved outlines to pick from.
    """
    rows = list_saved_outlines()
    console.print(
        f"\n[bold]Open saved outline[/bold]\n"
        # Escaped so a storage directory containing ``[...]`` is not parsed as markup.
        f"[dim]``*_pageindex.json`` or ``*_v3_index.json`` under[/dim] [cyan]{escape(str(storage))}[/cyan]\n"
        f"[dim]Path, stem, or number from list. Empty line = cancel.[/dim]\n"
    )
    if rows:
        console.print("[dim]Recent outlines:[/dim]")
        for i, r in enumerate(rows, start=1):
            tag = f" [dim]({r.kind} · {r.source})[/dim]"
            console.print(f" [cyan]{i:>2}[/] {escape(r.stem)}{tag}")

    while True:
        raw = Prompt.ask("[bold]Outline path or number[/bold]", default="", show_default=False).strip()
        if not raw:
            if not rows:
                return None
            # NOTE(review): with saved outlines present an empty line leads to
            # this second prompt (default "1") rather than cancelling, despite
            # the "Empty line = cancel" hint above — confirm intended UX.
            pick = Prompt.ask("[bold]Or pick number[/bold]", default="1")
            try:
                picked = int(pick)
            except ValueError:
                picked = 0
            # Explicit range check: ``rows[picked - 1]`` alone would accept 0
            # and negative numbers and return a row counted from the end.
            if 1 <= picked <= len(rows):
                return rows[picked - 1].path
            console.print("[red]Invalid number.[/red]")
            continue

        if raw.isdigit() and rows:
            idx = int(raw)
            if 1 <= idx <= len(rows):
                return rows[idx - 1].path
            # Out-of-range numbers fall through and are tried as a path.

        path = resolve_pdf_candidate(normalize_path_string(raw))
        if not path.is_file():
            # Fall back to a file of the same name inside the storage folder.
            alt = (storage / path.name).resolve()
            if alt.is_file():
                path = alt
        if path.is_file():
            return path
        console.print("[yellow]Not a file — try again or leave empty to cancel.[/yellow]\n")
123
+
124
+
125
def _open_outline(console: Console, state: OutlineBrowserState, storage: Path) -> None:
    """Prompt for an outline file, load it into *state* and print its tree.

    On cancel or load failure a message is printed and *state* is left
    unchanged.
    """
    path = _prompt_outline_path(console, storage)
    if path is None:
        console.print("[dim]Cancelled.[/dim]")
        return
    try:
        data = _load_outline_file(path)
    except (OSError, json.JSONDecodeError, ValueError) as e:
        # Escape the message: it can embed a file path, and bracketed text
        # would otherwise be parsed as Rich markup.
        console.print(f"[bold red]Could not load:[/bold red] {escape(str(e))}")
        return
    doc_title = str(data.get("doc_name") or path.stem)
    state.data = data
    state.path = path.resolve()
    state.label = doc_title
    console.print(f"\n[bold green]Loaded.[/bold green] {state.status_line()}\n")
    print_structure_tree(console, data.get("structure"), title=f"Outline — {doc_title}")
141
+
142
+
143
def _show_tree(console: Console, state: OutlineBrowserState) -> None:
    """Render the loaded outline's ``structure`` as a tree, or hint to load one."""
    if state.is_ready:
        heading = str(state.data.get("doc_name") or state.label or "document")
        print_structure_tree(
            console,
            state.data.get("structure"),
            title=f"Outline — {heading}",
        )
        return
    console.print("[yellow]Load an outline first (option 1).[/yellow]")
149
+
150
+
151
def _show_summary(console: Console, state: OutlineBrowserState) -> None:
    """Show extraction warnings and the document description of the loaded outline."""
    if not state.is_ready:
        console.print("[yellow]Load an outline first (option 1).[/yellow]")
        return

    outline = state.data
    print_extraction_warnings_panel(console, outline.get("extraction_warnings"))

    description = outline.get("doc_description")
    has_description = isinstance(description, str) and bool(description.strip())
    if has_description:
        summary_panel = Panel(
            escape(description.strip()),
            title="[bold]Document description[/bold]",
            border_style="blue",
        )
        console.print(summary_panel)
        return

    # Nothing was shown above either: say so explicitly.
    if not outline.get("extraction_warnings"):
        console.print("[dim]No doc_description or warnings in this file.[/dim]")
167
+
168
+
169
def _show_raw_json(console: Console, state: OutlineBrowserState) -> None:
    """Print the loaded outline as indented JSON inside a panel.

    Prints a hint instead when no outline is loaded.
    """
    if not state.is_ready:
        console.print("[yellow]Load an outline first (option 1).[/yellow]")
        return
    # Escape the dump before handing it to Rich: a string renderable is parsed
    # as console markup, so tag-like text inside the document (``[note]``,
    # ``[/b]``) would be dropped from the output or raise MarkupError.
    body = escape(json.dumps(state.data, ensure_ascii=False, indent=2))
    console.print(
        Panel(
            body,
            title=f"[bold]JSON — {escape(state.label or '')}[/bold]",
            border_style="green",
            expand=False,
        )
    )
181
+
182
+
183
def _show_outline_thinking(console: Console, state: OutlineBrowserState) -> None:
    """Print the ``outline_thinking`` list of the loaded outline as JSON.

    Prints a hint when no outline is loaded, and a short notice when the key
    is missing, empty, or not a list.
    """
    if not state.is_ready:
        console.print("[yellow]Load an outline first (option 1).[/yellow]")
        return
    ot = state.data.get("outline_thinking")
    if not isinstance(ot, list) or not ot:
        console.print("[dim]No ``outline_thinking`` in this file.[/dim]")
        return
    # Escaped so bracketed text in the saved reasoning is shown literally
    # instead of being interpreted as Rich markup.
    body = escape(json.dumps(ot, ensure_ascii=False, indent=2))
    console.print(
        Panel(
            body,
            title="[bold]outline_thinking[/bold]",
            border_style="magenta",
            expand=False,
        )
    )
199
+
200
+
201
def _ignore_storage(
    view: Callable[[Console, OutlineBrowserState], None],
) -> Callable[[Console, OutlineBrowserState, Path], None]:
    """Adapt a ``(console, state)`` view to the ``(console, state, storage)`` dispatch signature."""

    def _adapter(console: Console, state: OutlineBrowserState, _storage: Path) -> None:
        view(console, state)

    return _adapter


# Menu key -> handler. Every handler is invoked as
# ``action(console, state, storage)``; only "open" needs the storage folder,
# so the read-only views are wrapped — calling them directly with three
# arguments raises TypeError.
_BROWSER_ACTIONS: dict[str, Callable[[Console, OutlineBrowserState, Path], None]] = {
    "1": _open_outline,
    "2": _ignore_storage(_show_tree),
    "3": _ignore_storage(_show_summary),
    "4": _ignore_storage(_show_raw_json),
    "5": _ignore_storage(_show_outline_thinking),
}
208
+
209
+
210
def run_outline_browser_session(
    console: Console,
    *,
    preload_path: Path | None = None,
    preload_data: dict[str, Any] | None = None,
) -> int:
    """Run the outline-browser sub-menu until the user exits or goes back.

    When *preload_data* is given it becomes the loaded outline, labelled with
    its ``doc_name``, else the stem of *preload_path*, else ``"outline"``.
    Always returns ``0``.
    """
    state = OutlineBrowserState()
    storage = ensure_cli_storage()

    if preload_data is not None:
        state.data = preload_data
        state.path = preload_path
        display_name = preload_data.get("doc_name")
        if not display_name:
            display_name = preload_path.stem if preload_path else "outline"
        state.label = str(display_name)

    banner = (
        "\n[bold cyan]View outlines[/bold cyan]\n"
        "[dim]Read-only — inspect section trees from saved runs (no element search here).[/dim]\n"
        "[dim]For page/section text chunks use [bold]View → View elements[/bold].[/dim]\n"
    )
    console.print(banner)

    while True:
        console.print()
        console.print(state.status_line())
        # Default to the tree view once something is loaded, else to "open".
        suggested = "2" if state.is_ready else "1"
        selection = prompt_menu(
            console,
            title="View outlines",
            options=OUTLINE_BROWSER_MENU,
            prompt="What next",
            default_key=suggested,
            allow_back=True,
            hint="Option 2+ need a loaded outline — use option 1 first.",
        )
        if selection in ("exit", "back"):
            return 0
        handler = _BROWSER_ACTIONS.get(selection)
        if handler:
            handler(console, state, storage)
@@ -0,0 +1,59 @@
1
+ """Rich tree rendering for outline ``structure`` JSON."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from rich.console import Console
8
+ from rich.markup import escape
9
+ from rich.tree import Tree
10
+
11
+
12
def _node_caption(node: dict[str, Any], *, summary_max: int = 72) -> str:
    """Build the Rich-markup label for one outline node.

    The label is the bold title, then an optional dim ``(pp.a–b, node_id)``
    group, then an optional dim summary clipped to *summary_max* characters.
    """
    heading = str(node.get("title") or "").strip() or "(untitled)"

    details: list[str] = []
    first_page, last_page = node.get("start_index"), node.get("end_index")
    if first_page is not None:
        # Open-ended range when only the start index is known.
        page_span = f"pp.{first_page}+" if last_page is None else f"pp.{first_page}–{last_page}"
        details.append(page_span)
    node_id = node.get("node_id")
    if node_id:
        details.append(str(node_id))
    meta = f" [dim]({', '.join(details)})[/dim]" if details else ""

    blurb = str(node.get("summary") or "").strip().replace("\n", " ")
    if blurb:
        clipped = blurb if len(blurb) <= summary_max else blurb[: summary_max - 1] + "…"
        blurb = f" [dim]— {escape(clipped)}[/dim]"

    return f"[bold white]{escape(heading)}[/bold white]{meta}{blurb}"
30
+
31
+
32
def _add_node(branch: Tree, node: dict[str, Any]) -> None:
    """Attach *node* to *branch*, recursing into its ``nodes`` children."""
    twig = branch.add(_node_caption(node))
    children = node.get("nodes")
    if not isinstance(children, list) or not children:
        return
    for child in children:
        if not isinstance(child, dict):
            # Non-dict children are shown as plain escaped text leaves.
            twig.add(escape(str(child)))
            continue
        _add_node(twig, child)
44
+
45
+
46
def print_structure_tree(console: Console, structure: Any, *, title: str = "Processed outline") -> None:
    """Render an outline ``structure`` value as a Rich ``Tree`` and print it.

    A list contributes one branch per item, a dict a single branch, and any
    other value one escaped text leaf.
    """
    root = Tree(f"[bold cyan]{escape(title)}[/bold cyan]")
    if isinstance(structure, dict):
        _add_node(root, structure)
    elif isinstance(structure, list):
        for entry in structure:
            if not isinstance(entry, dict):
                root.add(escape(str(entry)))
                continue
            _add_node(root, entry)
    else:
        root.add(escape(str(structure)))
    console.print(root)
@@ -0,0 +1,32 @@
1
+ """View mode: browse saved outlines (multi-indexing and legacy pageindex JSON)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from rich.console import Console
6
+
7
+ from chunksmith_cli.menu import prompt_menu
8
+ from chunksmith_cli.menus import VIEW_MENU
9
+ from chunksmith_cli.outline_browser import run_outline_browser_session
10
+
11
+
12
def run_view_session(console: Console) -> None:
    """Run the "View" sub-menu until the user chooses exit or back."""
    intro = (
        "\n[bold cyan]View[/bold cyan]\n"
        "[dim]Inspect saved runs — ``*_multi_index.json``, ``*_pageindex.json``, or legacy outlines.[/dim]\n"
    )
    console.print(intro)

    while True:
        selection = prompt_menu(
            console,
            title="View",
            options=VIEW_MENU,
            prompt="What to view",
            default_key="1",
            allow_back=True,
            hint="Read-only section tree and JSON from prior chunking runs.",
        )
        if selection in ("exit", "back"):
            break
        if selection == "1":
            run_outline_browser_session(console)
@@ -0,0 +1,357 @@
1
+ """Interactive ChunkSmith CLI — multi-indexing and PageIndexer."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import sys
7
+ from pathlib import Path
8
+ from typing import Any, Callable
9
+
10
+ from dotenv import load_dotenv
11
+
12
+ from chunksmith_cli.agent_session import run_agent_session
13
+ from chunksmith_cli.branding import print_chunksmith_banner, print_session_hint
14
+ from chunksmith_cli.config import show_verbose_pipeline_events
15
+ from chunksmith_cli.menu import confirm_continue_session, prompt_menu
16
+ from chunksmith_cli.menus import CHUNKING_MENU, ROOT_MENU
17
+ from chunksmith_cli.panels import print_extraction_warnings_panel
18
+ from chunksmith_cli.paths import archive_multi_indexing_cli_outputs, archive_pageindex_cli_outputs, cli_archive_stamp
19
+ from chunksmith_cli.pipelines.multi_indexing_wizard import gather_multi_indexing_inputs
20
+ from chunksmith_cli.pipelines.pageindex_wizard import gather_pageindex_inputs
21
+ from chunksmith_cli.pipelines.run_multi import run_multi_indexing
22
+ from chunksmith_cli.tree_view import print_structure_tree
23
+ from chunksmith_cli.view_session import run_view_session
24
+
25
+ try:
26
+ from rich.console import Console
27
+ from rich.markup import escape
28
+ from rich.panel import Panel
29
+ from rich.prompt import Confirm
30
+
31
+ from chunksmith_cli.theme import make_console
32
+ except ImportError:
33
+ print(
34
+ "Missing dependency: install Rich for the CLI.\n pip install rich\n pip install chunksmith-cli",
35
+ file=sys.stderr,
36
+ )
37
+ raise SystemExit(1) from None
38
+
39
+
40
def _show_optional_outline_extras(console: Console, out: dict[str, Any]) -> None:
    """Offer the optional extras of a pipeline result.

    Shows, in order: ``outline_thinking`` as JSON (after confirmation),
    ``doc_description`` (always, when non-blank) and the first 12000
    characters of ``flat_rows_toon`` (after confirmation).
    """
    ot = out.get("outline_thinking")
    if (
        isinstance(ot, list)
        and ot
        and Confirm.ask(
            "\n[bold]Show outline thinking?[/bold] [dim](optional JSON)[/dim]",
            default=False,
        )
    ):
        console.print(
            Panel(
                # Escaped: model-generated text may contain ``[tag]``-like
                # sequences that Rich would parse as markup.
                escape(json.dumps(ot, ensure_ascii=False, indent=2)),
                title="[bold]outline_thinking[/bold]",
                border_style="magenta",
                expand=False,
            )
        )

    desc = out.get("doc_description")
    if isinstance(desc, str) and desc.strip():
        console.print(
            Panel(
                escape(desc.strip()),
                title="[bold]Document description[/bold]",
                border_style="blue",
            )
        )

    if encoding_fmt := out.get("flat_rows_toon"):
        if Confirm.ask("\n[bold]Show flat_rows TOON snippet?[/bold]", default=False):
            text = str(encoding_fmt)
            console.print(
                Panel(
                    # Truncate first, then escape, so the cut never splits an
                    # escape sequence.
                    escape(text[:12000]) + ("…" if len(text) > 12000 else ""),
                    title="[bold]flat_rows_toon[/bold]",
                    border_style="cyan",
                    expand=False,
                )
            )
80
+
81
+
82
+ def _pipeline_result_to_dict(result: Any, *, encoding: str, pdf_path: Path) -> dict[str, Any]:
83
+ outline = result.title_outline if isinstance(result.title_outline, dict) else {}
84
+ out: dict[str, Any] = {
85
+ "doc_name": outline.get("doc_name") or pdf_path.name,
86
+ "llm_context_format": encoding,
87
+ "structure": outline.get("structure"),
88
+ "flat_rows": outline.get("flat_rows"),
89
+ "doc_description": outline.get("doc_description"),
90
+ "outline_thinking": outline.get("outline_thinking"),
91
+ "indexer_options": outline.get("indexer_options"),
92
+ "mapping_method": result.config.mapping_method,
93
+ "use_group_by_title": result.config.use_group_by_title,
94
+ "title_coded_formate_chars": len(result.title_coded_formate or ""),
95
+ "n_elements": len(result.elements or []),
96
+ "n_chunks": len(result.chunks_local or []) if result.chunks_local else 0,
97
+ }
98
+ if encoding == "toon":
99
+ out["flat_rows_toon"] = outline.get("flat_rows_toon")
100
+ out["structure_toon"] = outline.get("structure_toon")
101
+ return out
102
+
103
+
104
def _run_multi_indexing_pipeline(console: Console) -> int:
    """Gather inputs, run the multi-indexing pipeline, show and archive results.

    Returns:
        The wizard's code when input gathering yields a non-zero code or no
        inputs, ``1`` when the pipeline raises, ``0`` after a completed run.
    """
    code, inputs = gather_multi_indexing_inputs(console)
    if code != 0 or inputs is None:
        return code

    encoding = inputs.encoding
    console.print(
        f"\n[bold cyan]Running multi-indexing ({encoding.upper()} context)…[/bold cyan]\n"
        "[dim]partition → group → coded format → page index → mapper[/dim]\n"
    )

    def _progress(event: str, payload: dict[str, Any]) -> None:
        # Progress callback handed to the pipeline; silent unless streaming is on.
        if not inputs.stream_events:
            return
        if event == "pipeline:done":
            console.print("[bold green]Pipeline finished[/bold green]")
        elif event.startswith("partition:") or event.startswith("pipeline:"):
            console.print(f"[cyan]{escape(event)}[/cyan] [dim]{escape(str(payload))}[/dim]", highlight=False)
        elif show_verbose_pipeline_events():
            console.print(f"[dim]{escape(event)}[/dim] {escape(str(payload))}", highlight=False)

    # Exception text is escaped throughout: it may embed paths or package
    # specs containing ``[...]``, which Rich would parse as markup.
    try:
        result = run_multi_indexing(
            inputs.pdf_path,
            inputs.mi_prefs,
            progress=_progress,
        )
    except ImportError as e:
        if encoding == "toon" and "toon" in str(e).lower():
            console.print(
                f"[bold red]Missing TOON dependency:[/bold red] {escape(str(e))}\n"
                # ``\[`` keeps the extras spec visible; a bare ``[toon]`` is
                # consumed as a markup tag and vanishes from the hint.
                '[cyan]pip install "chunksmith-multimodal\\[toon]"[/cyan]'
            )
        else:
            console.print(f"[bold red]Import error:[/bold red] {escape(str(e))}")
        return 1
    except ConnectionError as e:
        # Must precede the OSError handler: ConnectionError is an OSError subclass.
        console.print(f"[bold red]Network error:[/bold red] {escape(str(e))}")
        return 1
    except OSError as e:
        # NOTE(review): these codes look like host-lookup failures
        # (11001 is WSAHOST_NOT_FOUND on Windows) — confirm -2 / -3.
        if getattr(e, "winerror", None) == 11001 or getattr(e, "errno", None) in (11001, -2, -3):
            console.print(
                "[bold red]Cannot reach Unstructured API.[/bold red] "
                "Check `.env` (``UNSTRUCTURED_API_KEY``, ``UNSTRUCTURED_API_URL``).\n"
                f"[dim]{escape(str(e))}[/dim]"
            )
            return 1
        console.print(f"[bold red]Failed:[/bold red] {escape(str(e))}")
        return 1
    except Exception as e:
        # Top-level boundary of the interactive run: report and return a code
        # instead of crashing the session.
        console.print(f"[bold red]Failed:[/bold red] {escape(str(e))}")
        return 1

    out = _pipeline_result_to_dict(result, encoding=encoding, pdf_path=inputs.pdf_path)
    doc_title = str(out.get("doc_name") or inputs.pdf_path.stem)
    print_structure_tree(console, out.get("structure"), title=f"Outline tree — {doc_title}")
    print_extraction_warnings_panel(console, out.get("extraction_warnings"))
    _show_optional_outline_extras(console, out)

    if inputs.show_toon_after and encoding == "toon":
        flat_toon = out.get("flat_rows_toon") or out.get("structure_toon")
        if flat_toon:
            console.print("\n[bold cyan]TOON outline snippet[/bold cyan]\n")
            text = str(flat_toon)
            # markup=False: print the TOON text literally.
            console.print(
                text[:8000] + ("…" if len(text) > 8000 else ""),
                markup=False,
                highlight=False,
            )

    stamp = cli_archive_stamp()
    archived = archive_multi_indexing_cli_outputs(
        inputs.storage_root,
        inputs.pdf_path.stem,
        out,
        elements=result.elements,
        title_coded_formate=result.title_coded_formate,
        title_outline=result.title_outline if isinstance(result.title_outline, dict) else None,
        mapper_output=result.mapper_output,
        stamp=stamp,
        artifact_root=inputs.artifact_root,
    )

    if inputs.artifact_root is not None:
        saved_root = escape(str(archived.get("artifact_root", inputs.artifact_root)))
        console.print(f"\n[bold green]Saved under[/bold green] [cyan]{saved_root}[/cyan]")
        if "title_coded_formate" in archived:
            coded_path = escape(str(archived["title_coded_formate"]))
            console.print(f"[dim]Coded markup (LLM input string):[/dim] [cyan]{coded_path}[/cyan]")
    else:
        console.print("\n[bold green]Saved to legacy flat storage[/bold green]")

    # List every artifact the archiver reported, in a fixed display order.
    for label, key in (
        ("Multi-index", "multi_index"),
        ("Page index", "pageindex"),
        ("Unstructured JSON", "unstructured_elements"),
        ("Pickle (raw elements)", "pickle"),
        ("Images folder", "image_dir"),
        ("Coded markup", "title_coded_formate"),
        ("TOON flat rows", "flat_rows_toon"),
        ("TOON structure", "structure_toon"),
        ("Mapper", "mapper"),
    ):
        if key in archived:
            console.print(f" [cyan]{label}[/cyan] {escape(str(archived[key].resolve()))}")

    if Confirm.ask("\n[bold]Print full JSON here?[/bold]", default=False):
        console.print(
            Panel(
                # Escaped so document text inside the JSON is shown literally.
                escape(json.dumps(out, ensure_ascii=False, indent=2)),
                title="[bold]JSON[/bold]",
                border_style="green",
                expand=False,
            )
        )

    console.print("\n[bold green]Done.[/bold green]")
    return 0
218
+
219
+
220
def _run_pageindex_pipeline(console: Console) -> int:
    """Gather inputs, run PageIndexer on a PDF, show and archive the outline.

    Returns:
        The wizard's code when input gathering yields a non-zero code or no
        inputs, ``1`` when the PageIndexer package cannot be imported or the
        run raises, ``0`` after a completed run.
    """
    code, inputs = gather_pageindex_inputs(console)
    if code != 0 or inputs is None:
        return code

    # Imported lazily; report a missing package like any other pipeline
    # failure instead of letting ImportError escape the interactive loop.
    try:
        from chunksmith_pageindex import build_outline_from_pdf
    except ImportError as e:
        console.print(f"[bold red]PageIndexer unavailable:[/bold red] {escape(str(e))}")
        return 1

    console.print("\n[bold cyan]Running PageIndexer…[/bold cyan]\n[dim]PDF → tagged pages → LLM outline[/dim]\n")

    if inputs.stream_progress:
        console.print("[cyan]pipeline:load_pdf[/cyan] [dim]loading pages…[/dim]")

    try:
        result = build_outline_from_pdf(
            str(inputs.pdf_path),
            settings=inputs.settings,
            add_text=inputs.add_text,
            assign_node_ids=inputs.assign_node_ids,
            add_summary=inputs.add_summary,
            add_word_range=inputs.add_word_range,
            generate_doc_summary=inputs.generate_doc_summary,
        )
    except Exception as e:
        # Top-level boundary of the interactive run; the message is escaped so
        # bracketed text is not parsed as Rich markup.
        console.print(f"[bold red]PageIndexer failed:[/bold red] {escape(str(e))}")
        return 1

    if inputs.stream_progress:
        console.print("[cyan]pipeline:done[/cyan] [dim]outline built[/dim]")

    structure = result.get("structure")
    if isinstance(structure, list) and structure:
        console.print("\n[bold cyan]Outline tree[/bold cyan]\n")
        print_structure_tree(console, structure, title=f"Outline — {inputs.pdf_path.stem}")

    stamp = cli_archive_stamp()
    archived = archive_pageindex_cli_outputs(
        inputs.storage_root,
        inputs.pdf_path.stem,
        result,
        stamp=stamp,
        artifact_root=inputs.artifact_root,
    )

    if inputs.artifact_root is not None:
        saved_root = escape(str(archived.get("artifact_root", inputs.artifact_root)))
        console.print(f"\n[bold green]Saved under[/bold green] [cyan]{saved_root}[/cyan]")
    else:
        console.print("\n[bold green]Saved to legacy flat storage[/bold green]")

    if "pageindex" in archived:
        console.print(f" [cyan]Page index[/cyan] {escape(str(archived['pageindex'].resolve()))}")
    if "tagged_pages" in archived:
        console.print(f" [cyan]Tagged pages[/cyan] {escape(str(archived['tagged_pages'].resolve()))}")

    if Confirm.ask("\n[bold]Print full JSON here?[/bold]", default=False):
        console.print(
            Panel(
                # Escaped so document text inside the JSON is shown literally.
                escape(json.dumps(result, ensure_ascii=False, indent=2)),
                title="[bold]pageindex JSON[/bold]",
                border_style="green",
                expand=False,
            )
        )

    console.print("\n[bold green]Done.[/bold green]")
    return 0
287
+
288
+
289
# Chunking-menu choice -> pipeline runner. Each pipeline is reachable by its
# number and by two word aliases.
_CHUNKING_ACTIONS: dict[str, Callable[[Console], int]] = {
    **dict.fromkeys(("1", "multi", "indexing"), _run_multi_indexing_pipeline),
    **dict.fromkeys(("2", "pageindex", "pi"), _run_pageindex_pipeline),
}
297
+
298
+
299
def run_interactive(console: Console | None = None) -> int:
    """Run the top-level interactive menu loop.

    Loads ``.env``, prints the banner, then repeatedly offers the root menu
    (agent / view / chunking) until the user exits or declines to continue.

    Args:
        console: Console to print to; one is created with ``make_console()``
            when omitted.

    Returns:
        Always ``0``.
    """
    console = console or make_console()
    load_dotenv()
    print_chunksmith_banner(console)
    print_session_hint(console)

    while True:
        root = prompt_menu(
            console,
            title="Start here",
            options=ROOT_MENU,
            prompt="Mode",
            default_key="chunking",
            hint="Chunking = build indexes · View = inspect saved JSON · Agent = Q&A (saved indexes).",
        )
        if root == "exit":
            console.print("[dim]Goodbye.[/dim]")
            return 0

        if root == "agent":
            try:
                run_agent_session(console)
            except ImportError as e:
                console.print(
                    "[bold yellow]Agent mode unavailable.[/bold yellow]\n"
                    f"[dim]{escape(str(e))}[/dim]\n"
                    # ``\[`` keeps the extras spec visible; a bare
                    # ``[langchain]`` is consumed as a markup tag and the hint
                    # would print without it.
                    'Install separately: [cyan]pip install "chunksmith-agent\\[langchain]"[/cyan]'
                )
            if not confirm_continue_session(console):
                return 0
            continue

        if root == "view":
            run_view_session(console)
            if not confirm_continue_session(console):
                return 0
            continue

        # Any other root choice falls through to the chunking sub-menu.
        choice = prompt_menu(
            console,
            title="Chunking — pick pipeline",
            options=CHUNKING_MENU,
            prompt="What to run",
            default_key="1",
            allow_back=True,
            hint="1 = Multi-indexing (JSON/TOON) · 2 = PageIndexer (no Unstructured API).",
        )
        if choice == "exit":
            console.print("[dim]Goodbye.[/dim]")
            return 0
        if choice == "back":
            continue

        action = _CHUNKING_ACTIONS.get(choice)
        if action:
            # The pipeline's return code is not used; the session carries on
            # regardless of success or failure.
            action(console)

        if not confirm_continue_session(console):
            return 0