chunksmith-cli 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. chunksmith_cli/__init__.py +3 -0
  2. chunksmith_cli/__main__.py +41 -0
  3. chunksmith_cli/agent/__init__.py +1 -0
  4. chunksmith_cli/agent/agent_display.py +160 -0
  5. chunksmith_cli/agent/agent_session.py +294 -0
  6. chunksmith_cli/agent/agent_stream.py +152 -0
  7. chunksmith_cli/agent/agent_wizard.py +5 -0
  8. chunksmith_cli/agent_display.py +6 -0
  9. chunksmith_cli/agent_session.py +6 -0
  10. chunksmith_cli/agent_stream.py +6 -0
  11. chunksmith_cli/agent_wizard.py +6 -0
  12. chunksmith_cli/assets/chunksmith_logo.png +0 -0
  13. chunksmith_cli/branding.py +6 -0
  14. chunksmith_cli/config.py +6 -0
  15. chunksmith_cli/core/__init__.py +21 -0
  16. chunksmith_cli/core/artifact_layout.py +60 -0
  17. chunksmith_cli/core/branding.py +137 -0
  18. chunksmith_cli/core/config.py +32 -0
  19. chunksmith_cli/core/media_preview.py +72 -0
  20. chunksmith_cli/core/menu.py +110 -0
  21. chunksmith_cli/core/menus.py +55 -0
  22. chunksmith_cli/core/panels.py +24 -0
  23. chunksmith_cli/core/paths.py +869 -0
  24. chunksmith_cli/core/prefs_mapper.py +97 -0
  25. chunksmith_cli/core/saved_catalog.py +180 -0
  26. chunksmith_cli/core/theme.py +38 -0
  27. chunksmith_cli/elements_json_prompt.py +6 -0
  28. chunksmith_cli/json_view.py +6 -0
  29. chunksmith_cli/media_preview.py +6 -0
  30. chunksmith_cli/menu.py +6 -0
  31. chunksmith_cli/menus.py +55 -0
  32. chunksmith_cli/multi_indexing_wizard.py +3 -0
  33. chunksmith_cli/outline_browser.py +6 -0
  34. chunksmith_cli/panels.py +6 -0
  35. chunksmith_cli/partition_prefs.py +74 -0
  36. chunksmith_cli/paths.py +6 -0
  37. chunksmith_cli/pdf_prompt.py +6 -0
  38. chunksmith_cli/pipelines/__init__.py +1 -0
  39. chunksmith_cli/pipelines/mapping_validation.py +31 -0
  40. chunksmith_cli/pipelines/multi_indexing_config.py +35 -0
  41. chunksmith_cli/pipelines/multi_indexing_prompts.py +375 -0
  42. chunksmith_cli/pipelines/multi_indexing_runtime.py +38 -0
  43. chunksmith_cli/pipelines/multi_indexing_storage.py +157 -0
  44. chunksmith_cli/pipelines/multi_indexing_wizard.py +218 -0
  45. chunksmith_cli/pipelines/pageindex_wizard.py +140 -0
  46. chunksmith_cli/pipelines/run_multi.py +21 -0
  47. chunksmith_cli/prompts/__init__.py +1 -0
  48. chunksmith_cli/prompts/elements_json_prompt.py +49 -0
  49. chunksmith_cli/prompts/pdf_prompt.py +37 -0
  50. chunksmith_cli/saved_catalog.py +6 -0
  51. chunksmith_cli/theme.py +6 -0
  52. chunksmith_cli/tree_view.py +6 -0
  53. chunksmith_cli/view_session.py +6 -0
  54. chunksmith_cli/views/__init__.py +1 -0
  55. chunksmith_cli/views/json_view.py +11 -0
  56. chunksmith_cli/views/outline_browser.py +247 -0
  57. chunksmith_cli/views/tree_view.py +59 -0
  58. chunksmith_cli/views/view_session.py +32 -0
  59. chunksmith_cli/wizard.py +357 -0
  60. chunksmith_cli-0.4.0.dist-info/METADATA +61 -0
  61. chunksmith_cli-0.4.0.dist-info/RECORD +65 -0
  62. chunksmith_cli-0.4.0.dist-info/WHEEL +5 -0
  63. chunksmith_cli-0.4.0.dist-info/entry_points.txt +2 -0
  64. chunksmith_cli-0.4.0.dist-info/licenses/LICENSE.vectify +21 -0
  65. chunksmith_cli-0.4.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,97 @@
1
+ """Map CLI ``PdfPartitionPreferences`` → ``MultiIndexingPreferences`` (PyPI library)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from typing import Literal
7
+
8
+ from chunksmith_core.preferences import MultiIndexingPreferences
9
+ from chunksmith_multimodal.config.config import default_anchor_mapping_options
10
+
11
+ from chunksmith_cli.partition_prefs import PdfPartitionPreferences
12
+ from chunksmith_cli.pipelines.mapping_validation import resolve_multi_indexing_mapping
13
+ from chunksmith_cli.pipelines.multi_indexing_runtime import MultiIndexingModules
14
+
15
+
16
def _env_int(name: str, default: int) -> int:
    """Read an integer setting from the environment.

    An unset or empty variable yields ``default``. Text after the first
    ``#`` is discarded before parsing, and a value that still does not parse
    as an integer also yields ``default``.
    """
    value = os.getenv(name)
    if value:
        # Keep only what precedes the first "#" (inline-comment style values).
        number_text, _, _ = str(value).partition("#")
        try:
            return int(number_text.strip())
        except ValueError:
            pass
    return default
24
+
25
+
26
def _element_xml_from_mode(
    mode: str,
    custom: dict[str, bool] | None,
    *,
    element_xml_types: tuple[str, ...],
) -> dict[str, bool]:
    """Expand an element-XML mode name into a per-type on/off mapping.

    The mode is compared case-insensitively after trimming whitespace; a
    falsy mode is treated as ``"all"``.

    * ``"none"`` - every type is disabled.
    * ``"minimal"`` / ``"core"`` - only the types in the matching preset are
      enabled.
    * ``"custom"`` - each type takes the truthiness of its entry in
      ``custom`` (missing entries, or a missing ``custom``, mean disabled).
    * anything else - every type is enabled.

    The result has exactly one key per entry of ``element_xml_types``.
    """
    selected = (mode or "all").strip().lower()

    if selected == "custom":
        overrides = custom or {}
        return {name: bool(overrides.get(name, False)) for name in element_xml_types}

    presets: dict[str, set[str]] = {
        "none": set(),
        "minimal": {"Title", "NarrativeText", "Table", "Image"},
        "core": {"Title", "NarrativeText", "Text", "ListItem", "Table", "Image", "Header", "Footer"},
    }
    enabled = presets.get(selected)
    if enabled is None:
        # Unrecognised modes (including "all") switch every type on.
        return {name: True for name in element_xml_types}
    return {name: name in enabled for name in element_xml_types}
45
+
46
+
47
def multi_indexing_preferences_from_partition(
    prefs: PdfPartitionPreferences,
    mods: MultiIndexingModules,
    *,
    llm_context_format: Literal["json", "toon"],
    partition_pdf: bool,
    elements_json_path: str | None = None,
    stream_llm_tokens: bool = False,
) -> MultiIndexingPreferences:
    """Build library preferences from wizard partition prefs.

    Args:
        prefs: Wizard answers; most fields are copied across after being
            coerced to ``bool`` / ``int``.
        mods: Supplies ``MAPPING_METHODS`` (used to validate the mapping
            method) and ``ELEMENT_XML_TYPES`` (keys of ``coded_element_xml``).
        llm_context_format: Passed through unchanged.
        partition_pdf: Passed through unchanged.
        elements_json_path: Passed through unchanged.
        stream_llm_tokens: Coerced to ``bool`` and passed through.

    Returns:
        A populated ``MultiIndexingPreferences``.

    Raises:
        ValueError: Propagated from ``resolve_multi_indexing_mapping`` when
            the configured mapping method is not acceptable.

    Environment:
        ``PAGEINDEX_MODEL``, ``LLM_MODEL`` (falling back to
        ``CHATGPT_MODEL``), ``CHUNKSMITH_MAX_TOKENS_PER_CHUNK`` (default
        3000) and ``CHUNKSMITH_OVERLAP_PAGES`` (default 1). An empty value is
        treated the same as an unset variable for all of them.
    """
    anchor = default_anchor_mapping_options()
    # Validate the mapping method up front so nothing is built on bad input.
    mapping = resolve_multi_indexing_mapping(prefs, mapping_methods=mods.MAPPING_METHODS)
    gen_doc = prefs.chunksmith_pageindex_generate_doc_summary

    # Normalise empty strings to None so a blank variable does not get
    # forwarded as a literal "" model name.
    pageindex_model = os.getenv("PAGEINDEX_MODEL") or None
    llm_model = os.getenv("LLM_MODEL") or os.getenv("CHATGPT_MODEL") or None

    return MultiIndexingPreferences(
        languages=prefs.languages,
        extract_images=bool(prefs.extract_images),
        pages_per_chunk=max(0, int(prefs.pages_per_chunk)),  # never negative
        max_concurrent=max(1, int(prefs.max_concurrent)),  # at least one worker
        partition_pdf=partition_pdf,
        use_group_by_title=bool(prefs.chunksmith_use_group_by_title),
        max_characters=int(prefs.max_characters),
        new_after_n_chars=int(prefs.new_after_n_chars),
        combine_text_under_n_chars=int(prefs.combine_text_under_n_chars),
        group_multipage_sections=bool(prefs.chunksmith_group_multipage_sections),
        mapping_method=mapping,  # type: ignore[arg-type]
        llm_context_format=llm_context_format,
        include_empty_mapper_fields=bool(prefs.chunksmith_include_empty_mapper_fields),
        # Anchor-mapping switches come from the library defaults, not from prefs.
        use_start_anchor_mapping=bool(anchor.use_start_anchor_mapping),
        use_end_anchor_mapping=bool(anchor.use_end_anchor_mapping),
        coded_add_page_xml=bool(prefs.chunksmith_coded_add_page_xml),
        coded_add_group_by_title_xml=bool(prefs.chunksmith_coded_add_group_by_title_xml),
        coded_element_xml=_element_xml_from_mode(
            prefs.chunksmith_coded_element_xml_mode,
            prefs.chunksmith_coded_element_xml_custom,
            element_xml_types=mods.ELEMENT_XML_TYPES,
        ),
        add_outline_summaries=bool(prefs.effective_pageindex_add_outline_summaries()),
        add_anchor=bool(prefs.chunksmith_pageindex_add_anchor),
        # An unset (None) preference means "generate the document summary".
        generate_doc_summary=bool(gen_doc) if gen_doc is not None else True,
        use_embedded_pdf_toc=bool(prefs.chunksmith_pageindex_use_embedded_pdf_toc),
        off_llm_parsing=bool(prefs.chunksmith_pageindex_off_llm_parsing),
        probe_first_excerpt=bool(prefs.chunksmith_probe_first_excerpt or False),
        probe_merged_reflection=bool(prefs.chunksmith_probe_merged_reflection or False),
        stream_llm_tokens=bool(stream_llm_tokens),
        pageindex_model=pageindex_model,
        llm_model=llm_model,
        max_tokens_per_chunk=_env_int("CHUNKSMITH_MAX_TOKENS_PER_CHUNK", 3000),
        overlap_pages=_env_int("CHUNKSMITH_OVERLAP_PAGES", 1),
        elements_json_path=elements_json_path,
    )
@@ -0,0 +1,180 @@
1
+ """Discover recent saved outlines and element indexes under cli/data (and logs)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+
9
+ from chunksmith_cli.core.paths import cli_data_dir, default_project_logs_dir, ensure_cli_storage
10
+
11
+
12
@dataclass(frozen=True)
class SavedOutlineRow:
    """One saved outline file found while scanning a storage directory."""

    stem: str  # file name with the recognised index suffix stripped
    path: Path  # resolved location of the outline JSON file
    kind: str  # pageindex | multi_index | v3 (the values _collect_outline_files assigns)
    source: str  # label of the scanned location, e.g. "logs/json"
18
+
19
+
20
@dataclass(frozen=True)
class SavedElementIndexRow:
    """One saved element index found while scanning a storage directory."""

    stem: str  # file name with the recognised index suffix stripped
    path: Path  # resolved location of the index JSON file
    kind: str  # multimodal | v3
    pageindex_path: Path | None  # resolved sibling "<stem>_pageindex.json", or None when absent
    source: str  # label of the scanned location, e.g. "logs/json"
27
+
28
+
29
def _index_has_queryable_elements(data: dict[str, object], *, kind: str) -> bool:
    """Report whether a parsed index document advertises usable elements.

    For ``kind == "multi_index"`` the document must carry an integer
    ``n_elements`` greater than zero. For every other kind it must carry an
    ``elements`` entry that is a list (an empty list qualifies).
    """
    if kind != "multi_index":
        return isinstance(data.get("elements"), list)
    count = data.get("n_elements")
    return isinstance(count, int) and count > 0
33
+
34
+
35
def _has_structure(path: Path) -> bool:
    """Return True when ``path`` holds a JSON object with a list ``structure``.

    Any file that cannot be read, decoded as UTF-8, or parsed as JSON is
    reported as ``False`` rather than raising, so one bad file cannot abort
    a directory scan.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both json.JSONDecodeError and the
        # UnicodeDecodeError raised for files that are not valid UTF-8.
        return False
    return isinstance(data, dict) and isinstance(data.get("structure"), list)
41
+
42
+
43
def _glob_by_mtime(directory: Path, pattern: str) -> list[Path]:
    """Return files matching ``pattern``, newest modification time first.

    ``directory`` itself is searched, and - unless it is already named
    ``json`` - so is every ``json`` folder beneath it. Paths that resolve to
    the same file are reported once. Entries whose metadata cannot be read
    (a dangling symlink, or a file removed while scanning) are skipped
    instead of aborting the whole listing.
    """
    patterns = [pattern]
    if directory.name != "json":
        patterns.append(f"**/json/{pattern}")

    seen: set[Path] = set()
    dated: list[tuple[float, Path]] = []
    for pat in patterns:
        for candidate in directory.glob(pat):
            try:
                key = candidate.resolve()
                if key in seen:
                    continue
                seen.add(key)
                # stat() once here so the sort below never touches the disk.
                dated.append((candidate.stat().st_mtime, candidate))
            except (OSError, RuntimeError):
                # RuntimeError: symlink loop reported by resolve() on older Pythons.
                continue

    # Sort on mtime only; the sort is stable, so ties keep discovery order.
    dated.sort(key=lambda item: item[0], reverse=True)
    return [path for _, path in dated]
59
+
60
+
61
def _collect_outline_files(directory: Path, *, source: str, limit: int) -> list[SavedOutlineRow]:
    """Collect up to ``limit`` saved outlines under ``directory``, newest first.

    Files ending in ``_pageindex.json``, ``_multi_index.json`` or
    ``_v3_index.json`` are candidates. A candidate is kept only if it passes
    ``_has_structure`` and no newer candidate with the same stem was already
    kept. Candidates whose metadata cannot be read are skipped.

    Args:
        directory: Folder handed to ``_glob_by_mtime``.
        source: Label stored on every returned row.
        limit: Maximum number of rows to return.
    """
    suffix_kinds = (
        ("_pageindex.json", "pageindex"),
        ("_multi_index.json", "multi_index"),
        ("_v3_index.json", "v3"),
    )

    # Merge the per-pattern results, recording each mtime exactly once so the
    # sort cannot fail on a file that disappears mid-scan.
    dated: list[tuple[float, Path]] = []
    seen_paths: set[Path] = set()
    for suffix, _kind in suffix_kinds:
        for p in _glob_by_mtime(directory, f"*{suffix}"):
            if p in seen_paths:
                continue
            seen_paths.add(p)
            try:
                dated.append((p.stat().st_mtime, p))
            except OSError:
                continue
    dated.sort(key=lambda item: item[0], reverse=True)

    rows: list[SavedOutlineRow] = []
    seen_labels: set[str] = set()
    for _mtime, p in dated:
        if len(rows) >= limit:
            break
        name = p.name
        for suffix, kind in suffix_kinds:
            if name.endswith(suffix):
                label = name[: -len(suffix)]
                break
        else:
            # Name does not end with a known suffix (e.g. a case-insensitive match).
            continue
        # Newest file wins for a given stem; only parse files not yet represented.
        if label in seen_labels or not _has_structure(p):
            continue
        seen_labels.add(label)
        rows.append(SavedOutlineRow(stem=label, path=p.resolve(), kind=kind, source=source))
    return rows
94
+
95
+
96
def list_saved_outlines(*, limit: int = 10) -> list[SavedOutlineRow]:
    """List up to ``limit`` saved outlines, CLI data first, then project logs.

    Rows from the CLI data directory take precedence; rows from
    ``<logs>/json`` are appended only while there is room and only when
    their stem is not already listed.
    """
    ensure_cli_storage()
    found = _collect_outline_files(cli_data_dir(), source="cli/data (+ artifact folders)", limit=limit)

    logs_root = default_project_logs_dir()
    logs_json = None if logs_root is None else logs_root / "json"
    if logs_json is not None and logs_json.is_dir():
        known_stems = {row.stem for row in found}
        for candidate in _collect_outline_files(logs_json, source="logs/json", limit=limit):
            if len(found) >= limit:
                break
            if candidate.stem in known_stems:
                continue
            known_stems.add(candidate.stem)
            found.append(candidate)
    return found[:limit]
109
+
110
+
111
def _pageindex_sibling(index_path: Path) -> Path | None:
    """Locate the ``<stem>_pageindex.json`` file next to an index file.

    Only names ending in ``_canonical_bundle.json`` or ``_v3_index.json``
    are considered. Returns the sibling path when it exists as a regular
    file, otherwise ``None``.
    """
    name = index_path.name
    for suffix in ("_canonical_bundle.json", "_v3_index.json"):
        if name.endswith(suffix):
            # Swap only the trailing suffix; str.replace would also rewrite
            # the same text appearing earlier in the stem.
            sibling = index_path.parent / (name[: -len(suffix)] + "_pageindex.json")
            return sibling if sibling.is_file() else None
    return None
121
+
122
+
123
def _collect_element_indexes(directory: Path, *, source: str, limit: int) -> list[SavedElementIndexRow]:
    """Collect up to ``limit`` saved element indexes under ``directory``, newest first.

    Files ending in ``_canonical_bundle.json`` (kind ``multimodal``) or
    ``_v3_index.json`` (kind ``v3``) are candidates. A candidate is kept only
    if it parses to a JSON object accepted by
    ``_index_has_queryable_elements`` and no newer candidate with the same
    stem was already kept. Unreadable, non-UTF-8 or malformed files, and
    files whose metadata cannot be read, are skipped.

    Args:
        directory: Folder handed to ``_glob_by_mtime``.
        source: Label stored on every returned row.
        limit: Maximum number of rows to return.
    """
    suffix_kinds = (
        ("_canonical_bundle.json", "multimodal"),
        ("_v3_index.json", "v3"),
    )

    # Merge the per-pattern results, recording each mtime exactly once so the
    # sort cannot fail on a file that disappears mid-scan.
    dated: list[tuple[float, Path]] = []
    seen_paths: set[Path] = set()
    for suffix, _kind in suffix_kinds:
        for p in _glob_by_mtime(directory, f"*{suffix}"):
            if p in seen_paths:
                continue
            seen_paths.add(p)
            try:
                dated.append((p.stat().st_mtime, p))
            except OSError:
                continue
    dated.sort(key=lambda item: item[0], reverse=True)

    rows: list[SavedElementIndexRow] = []
    seen_stems: set[str] = set()
    for _mtime, p in dated:
        if len(rows) >= limit:
            break
        name = p.name
        for suffix, kind in suffix_kinds:
            if name.endswith(suffix):
                stem = name[: -len(suffix)]
                break
        else:
            # Name does not end with a known suffix (e.g. a case-insensitive match).
            continue
        if stem in seen_stems:
            continue
        try:
            data = json.loads(p.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            continue
        if not isinstance(data, dict) or not _index_has_queryable_elements(data, kind=kind):
            continue
        seen_stems.add(stem)
        pi = _pageindex_sibling(p)
        rows.append(
            SavedElementIndexRow(
                stem=stem,
                path=p.resolve(),
                kind=kind,
                pageindex_path=pi.resolve() if pi else None,
                source=source,
            )
        )
    return rows
166
+
167
+
168
def list_saved_element_indexes(*, limit: int = 10) -> list[SavedElementIndexRow]:
    """List up to ``limit`` saved element indexes, CLI data first, then project logs.

    Rows from the CLI data directory take precedence; rows from
    ``<logs>/json`` are appended only while there is room and only when
    their stem is not already listed.
    """
    ensure_cli_storage()
    found = _collect_element_indexes(cli_data_dir(), source="cli/data (+ artifact folders)", limit=limit)

    logs_root = default_project_logs_dir()
    logs_json = None if logs_root is None else logs_root / "json"
    if logs_json is not None and logs_json.is_dir():
        known_stems = {row.stem for row in found}
        for candidate in _collect_element_indexes(logs_json, source="logs/json", limit=limit):
            if len(found) >= limit:
                break
            if candidate.stem in known_stems:
                continue
            known_stems.add(candidate.stem)
            found.append(candidate)
    return found[:limit]
@@ -0,0 +1,38 @@
1
+ """Shared terminal theme and console factory."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from importlib.metadata import PackageNotFoundError, version
6
+
7
+ from rich.console import Console
8
+ from rich.theme import Theme
9
+
10
# Report the installed distribution's version when its metadata is available;
# otherwise fall back to the literal bundled with this source.
try:
    VERSION = version("chunksmith-cli")
except PackageNotFoundError:
    VERSION = "0.4.0"
15
+
16
# Brand palette, as hex colour strings.
NAVY = "#1e3a5f"
NAVY_LIGHT = "#2d5a87"
TEAL = "#00b4a6"
TEAL_BRIGHT = "#2dd4bf"
TEAL_DIM = "#5ec4bc"
MUTED = "#94a3b8"

# Style names registered with the console theme: three brand colours plus
# the info / success / warning / error status styles.
CHUNKSMITH_THEME = Theme(
    {
        "brand.navy": NAVY,
        "brand.teal": TEAL,
        "brand.muted": MUTED,
        "info": TEAL_BRIGHT,
        "success": "green",
        "warning": "yellow",
        "error": "bold red",
    }
)
34
+
35
+
36
def make_console(**kwargs) -> Console:
    """Application console with ChunkSmith colors.

    ``theme`` defaults to ``CHUNKSMITH_THEME`` and ``highlight`` to ``True``.
    Both may be overridden by the caller; previously passing either one
    raised ``TypeError`` for a duplicated keyword argument. All other
    keyword arguments are forwarded to ``Console`` unchanged.
    """
    # kwargs is a fresh dict per call, so filling in defaults here is safe.
    kwargs.setdefault("theme", CHUNKSMITH_THEME)
    kwargs.setdefault("highlight", True)
    return Console(**kwargs)
@@ -0,0 +1,6 @@
1
"""Backward-compatible import path. Prefer `chunksmith_cli.prompts.elements_json_prompt`."""

import importlib

_mod = importlib.import_module("chunksmith_cli.prompts.elements_json_prompt")
# Copy every name that does not start with a double underscore (single-underscore
# names included) into this module, so imports through the old path keep working.
globals().update({k: v for k, v in _mod.__dict__.items() if not k.startswith("__")})
@@ -0,0 +1,6 @@
1
"""Backward-compatible import path. Prefer `chunksmith_cli.views.json_view`."""

import importlib

_mod = importlib.import_module("chunksmith_cli.views.json_view")
# Copy every name that does not start with a double underscore (single-underscore
# names included) into this module, so imports through the old path keep working.
globals().update({k: v for k, v in _mod.__dict__.items() if not k.startswith("__")})
@@ -0,0 +1,6 @@
1
"""Backward-compatible import path. Prefer `chunksmith_cli.core.media_preview`."""

import importlib

_mod = importlib.import_module("chunksmith_cli.core.media_preview")
# Copy every name that does not start with a double underscore (single-underscore
# names included) into this module, so imports through the old path keep working.
globals().update({k: v for k, v in _mod.__dict__.items() if not k.startswith("__")})
chunksmith_cli/menu.py ADDED
@@ -0,0 +1,6 @@
1
"""Backward-compatible import path. Prefer `chunksmith_cli.core.menu`."""

import importlib

_mod = importlib.import_module("chunksmith_cli.core.menu")
# Copy every name that does not start with a double underscore (single-underscore
# names included) into this module, so imports through the old path keep working.
globals().update({k: v for k, v in _mod.__dict__.items() if not k.startswith("__")})
@@ -0,0 +1,55 @@
1
+ """Menu definitions for the interactive CLI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from chunksmith_cli.menu import MenuOption
6
+
7
# Each MenuOption below is built from three positional strings - presumably
# (key, title, description); confirm against the MenuOption definition - plus
# optional alternative inputs in ``aliases``.

# Top-level menu: choose between indexing, browsing saved runs, and Q&A.
ROOT_MENU: tuple[MenuOption, ...] = (
    MenuOption(
        "chunking",
        "Multi-indexing",
        "Unstructured → group-by-title → coded markup → LLM outline → mapper",
        aliases=("c", "index"),
    ),
    MenuOption(
        "view",
        "View saved runs",
        "Inspect outline JSON from saved runs",
        aliases=("v", "browse"),
    ),
    MenuOption(
        "agent",
        "Document Q&A",
        "Load a saved index and chat (legacy multimodal bundles)",
        aliases=("a", "chat"),
    ),
)

# Indexing sub-menu: the two available ingest pipelines.
CHUNKING_MENU: tuple[MenuOption, ...] = (
    MenuOption(
        "1",
        "Multi-indexing",
        "Partition → group → coded markup → LLM outline → mapper (JSON or TOON)",
        aliases=("multi", "indexing"),
    ),
    MenuOption(
        "2",
        "PageIndexer",
        "PDF parser + LLM outline (no Unstructured partition)",
        aliases=("pageindex", "pi"),
    ),
)

# Viewing sub-menu: a single read-only outline browser entry.
VIEW_MENU: tuple[MenuOption, ...] = (
    MenuOption(
        "1",
        "View outlines",
        "Section tree, summary, raw JSON (read-only)",
        aliases=("outline", "json", "tree"),
    ),
)

# Q&A sub-menu: load an index, then chat about it. No aliases are defined here.
AGENT_MENU: tuple[MenuOption, ...] = (
    MenuOption("1", "Open saved index", "Enter a path or stem from saved runs"),
    MenuOption("2", "Chat", "Ask questions about the loaded document"),
)
@@ -0,0 +1,3 @@
1
"""Shim — canonical implementation in ``chunksmith_cli.pipelines.multi_indexing_wizard``."""

# Star-import re-exports the canonical module's public names under this older module path.
from chunksmith_cli.pipelines.multi_indexing_wizard import *  # noqa: F403
@@ -0,0 +1,6 @@
1
"""Backward-compatible import path. Prefer `chunksmith_cli.views.outline_browser`."""

import importlib

_mod = importlib.import_module("chunksmith_cli.views.outline_browser")
# Copy every name that does not start with a double underscore (single-underscore
# names included) into this module, so imports through the old path keep working.
globals().update({k: v for k, v in _mod.__dict__.items() if not k.startswith("__")})
@@ -0,0 +1,6 @@
1
"""Backward-compatible import path. Prefer `chunksmith_cli.core.panels`."""

import importlib

_mod = importlib.import_module("chunksmith_cli.core.panels")
# Copy every name that does not start with a double underscore (single-underscore
# names included) into this module, so imports through the old path keep working.
globals().update({k: v for k, v in _mod.__dict__.items() if not k.startswith("__")})
@@ -0,0 +1,74 @@
1
+ """CLI process settings — mirrors MVL ``PdfPartitionPreferences`` / ``ChunkSmithProcessOptions``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from typing import Any, Dict
7
+
8
+
9
@dataclass
class PdfPartitionPreferences:
    """Mirror of MVL-AgneticAI ``app.services.indexing.partition_prefs`` (same fields)."""

    languages: str = "eng"
    extract_images: bool = True
    extract_tables: bool = True
    max_characters: int = 3000
    new_after_n_chars: int = 3800
    combine_text_under_n_chars: int = 200
    pages_per_chunk: int = 10
    max_concurrent: int = 2
    summarize_outline_nodes: bool = True

    chunksmith_pageindex_add_text: bool = False
    chunksmith_pageindex_node_text_source: str = "pdf_parser"
    chunksmith_pageindex_add_anchor: bool = False
    chunksmith_pageindex_assign_node_ids: bool = True
    chunksmith_pageindex_add_outline_summaries: bool | None = None
    chunksmith_pageindex_generate_doc_summary: bool | None = None

    chunksmith_partition_backend: str = "api"
    chunksmith_probe_first_excerpt: bool | None = None
    chunksmith_probe_merged_reflection: bool | None = None

    chunksmith_use_group_by_title: bool = True
    chunksmith_mapping_method: str | None = None
    chunksmith_include_empty_mapper_fields: bool = False
    chunksmith_group_multipage_sections: bool = True
    chunksmith_coded_add_page_xml: bool = True
    chunksmith_coded_add_group_by_title_xml: bool = True
    chunksmith_coded_element_xml_mode: str = "all"
    chunksmith_coded_element_xml_custom: Dict[str, bool] | None = None
    chunksmith_pageindex_use_embedded_pdf_toc: bool = True
    chunksmith_pageindex_off_llm_parsing: bool = False

    def effective_pageindex_add_outline_summaries(self) -> bool:
        """Resolve the outline-summary switch.

        The explicit ``chunksmith_pageindex_add_outline_summaries`` value
        wins when it is set; ``None`` defers to ``summarize_outline_nodes``.
        """
        override = self.chunksmith_pageindex_add_outline_summaries
        chosen = self.summarize_outline_nodes if override is None else override
        return bool(chosen)

    def to_stored_dict(self) -> Dict[str, Any]:
        """Return the persisted subset of the preferences as a plain dict.

        Keys appear in the order listed below; fields not listed are not
        included in the result.
        """
        stored_fields = (
            "languages",
            "extract_images",
            "extract_tables",
            "max_characters",
            "new_after_n_chars",
            "combine_text_under_n_chars",
            "pages_per_chunk",
            "max_concurrent",
            "summarize_outline_nodes",
            "chunksmith_use_group_by_title",
            "chunksmith_mapping_method",
            "chunksmith_include_empty_mapper_fields",
            "chunksmith_group_multipage_sections",
            "chunksmith_coded_add_page_xml",
            "chunksmith_coded_add_group_by_title_xml",
            "chunksmith_coded_element_xml_mode",
            "chunksmith_coded_element_xml_custom",
            "chunksmith_pageindex_add_anchor",
            "chunksmith_probe_first_excerpt",
            "chunksmith_probe_merged_reflection",
            "chunksmith_pageindex_use_embedded_pdf_toc",
            "chunksmith_pageindex_off_llm_parsing",
        )
        return {field_name: getattr(self, field_name) for field_name in stored_fields}
@@ -0,0 +1,6 @@
1
"""Backward-compatible import path. Prefer `chunksmith_cli.core.paths`."""

import importlib

_mod = importlib.import_module("chunksmith_cli.core.paths")
# Copy every name that does not start with a double underscore (single-underscore
# names included) into this module, so imports through the old path keep working.
globals().update({k: v for k, v in _mod.__dict__.items() if not k.startswith("__")})
@@ -0,0 +1,6 @@
1
"""Backward-compatible import path. Prefer `chunksmith_cli.prompts.pdf_prompt`."""

import importlib

_mod = importlib.import_module("chunksmith_cli.prompts.pdf_prompt")
# Copy every name that does not start with a double underscore (single-underscore
# names included) into this module, so imports through the old path keep working.
globals().update({k: v for k, v in _mod.__dict__.items() if not k.startswith("__")})
@@ -0,0 +1 @@
1
+ """Multi-indexing ingest wizards and artifact persistence."""
@@ -0,0 +1,31 @@
1
+ """Validate MultiIndexing ``mapping_method`` (aligned with MVL app)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ if TYPE_CHECKING:
8
+ from chunksmith_cli.partition_prefs import PdfPartitionPreferences # noqa: TC001
9
+
10
+
11
def require_mapping_method(value: str | None, *, mapping_methods: frozenset[str]) -> str:
    """Normalise ``value`` and insist that it names a known mapping method.

    The value is trimmed and lower-cased before the membership test.

    Raises:
        ValueError: When the value is missing, empty, or not in
            ``mapping_methods``; the message lists the accepted names.
    """
    normalized = value.strip().lower() if value else ""
    if normalized in mapping_methods:
        return normalized
    allowed = ", ".join(sorted(mapping_methods))
    raise ValueError(f"chunksmith_mapping_method is required; use one of: {allowed}")
16
+
17
+
18
def resolve_multi_indexing_mapping(
    prefs: PdfPartitionPreferences,
    *,
    mapping_methods: frozenset[str],
) -> str:
    """Validate the configured mapping method against the related switches.

    Returns the normalised method name.

    Raises:
        ValueError: When the method is missing or unknown, when
            ``group_indexing`` / ``title_indexing`` is chosen without
            ``chunksmith_use_group_by_title``, or when ``anchor_indexing``
            is chosen without ``chunksmith_pageindex_add_anchor``.
    """
    method = require_mapping_method(prefs.chunksmith_mapping_method, mapping_methods=mapping_methods)
    grouping_enabled = bool(prefs.chunksmith_use_group_by_title)

    needs_grouping = method in ("group_indexing", "title_indexing")
    if needs_grouping and not grouping_enabled:
        raise ValueError(
            f"{method} requires chunksmith_use_group_by_title=true (or choose page_indexing / anchor_indexing)."
        )

    anchors_missing = not prefs.chunksmith_pageindex_add_anchor
    if method == "anchor_indexing" and anchors_missing:
        raise ValueError("anchor_indexing requires chunksmith_pageindex_add_anchor=true.")

    return method
@@ -0,0 +1,35 @@
1
+ """Build ``MultiIndexingPreferences`` from CLI prefs (MVL app + notebook parity)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Literal
7
+
8
+ from chunksmith_core.preferences import MultiIndexingPreferences
9
+
10
+ from chunksmith_cli.core.prefs_mapper import multi_indexing_preferences_from_partition
11
+ from chunksmith_cli.partition_prefs import PdfPartitionPreferences
12
+ from chunksmith_cli.pipelines.multi_indexing_runtime import MultiIndexingModules
13
+
14
+
15
def build_multi_indexing_preferences(
    mods: MultiIndexingModules,
    prefs: PdfPartitionPreferences,
    *,
    llm_context_format: Literal["json", "toon"],
    pdf_path: Path,
    elements_json_path: Path | None,
    partition_pdf: bool,
    stream_llm_tokens: bool,
) -> MultiIndexingPreferences:
    """Translate wizard answers into ``MultiIndexingPreferences``.

    ``pdf_path`` is accepted for signature compatibility but not used here:
    the runner receives the PDF path separately. ``elements_json_path`` is
    converted to ``str`` (or kept as ``None``) before delegating to
    ``multi_indexing_preferences_from_partition``.
    """
    del pdf_path  # path is passed separately to the runner
    elements_path = None if elements_json_path is None else str(elements_json_path)
    return multi_indexing_preferences_from_partition(
        prefs,
        mods,
        llm_context_format=llm_context_format,
        partition_pdf=partition_pdf,
        elements_json_path=elements_path,
        stream_llm_tokens=stream_llm_tokens,
    )