chunksmith-cli 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunksmith_cli/__init__.py +3 -0
- chunksmith_cli/__main__.py +41 -0
- chunksmith_cli/agent/__init__.py +1 -0
- chunksmith_cli/agent/agent_display.py +160 -0
- chunksmith_cli/agent/agent_session.py +294 -0
- chunksmith_cli/agent/agent_stream.py +152 -0
- chunksmith_cli/agent/agent_wizard.py +5 -0
- chunksmith_cli/agent_display.py +6 -0
- chunksmith_cli/agent_session.py +6 -0
- chunksmith_cli/agent_stream.py +6 -0
- chunksmith_cli/agent_wizard.py +6 -0
- chunksmith_cli/assets/chunksmith_logo.png +0 -0
- chunksmith_cli/branding.py +6 -0
- chunksmith_cli/config.py +6 -0
- chunksmith_cli/core/__init__.py +21 -0
- chunksmith_cli/core/artifact_layout.py +60 -0
- chunksmith_cli/core/branding.py +137 -0
- chunksmith_cli/core/config.py +32 -0
- chunksmith_cli/core/media_preview.py +72 -0
- chunksmith_cli/core/menu.py +110 -0
- chunksmith_cli/core/menus.py +55 -0
- chunksmith_cli/core/panels.py +24 -0
- chunksmith_cli/core/paths.py +869 -0
- chunksmith_cli/core/prefs_mapper.py +97 -0
- chunksmith_cli/core/saved_catalog.py +180 -0
- chunksmith_cli/core/theme.py +38 -0
- chunksmith_cli/elements_json_prompt.py +6 -0
- chunksmith_cli/json_view.py +6 -0
- chunksmith_cli/media_preview.py +6 -0
- chunksmith_cli/menu.py +6 -0
- chunksmith_cli/menus.py +55 -0
- chunksmith_cli/multi_indexing_wizard.py +3 -0
- chunksmith_cli/outline_browser.py +6 -0
- chunksmith_cli/panels.py +6 -0
- chunksmith_cli/partition_prefs.py +74 -0
- chunksmith_cli/paths.py +6 -0
- chunksmith_cli/pdf_prompt.py +6 -0
- chunksmith_cli/pipelines/__init__.py +1 -0
- chunksmith_cli/pipelines/mapping_validation.py +31 -0
- chunksmith_cli/pipelines/multi_indexing_config.py +35 -0
- chunksmith_cli/pipelines/multi_indexing_prompts.py +375 -0
- chunksmith_cli/pipelines/multi_indexing_runtime.py +38 -0
- chunksmith_cli/pipelines/multi_indexing_storage.py +157 -0
- chunksmith_cli/pipelines/multi_indexing_wizard.py +218 -0
- chunksmith_cli/pipelines/pageindex_wizard.py +140 -0
- chunksmith_cli/pipelines/run_multi.py +21 -0
- chunksmith_cli/prompts/__init__.py +1 -0
- chunksmith_cli/prompts/elements_json_prompt.py +49 -0
- chunksmith_cli/prompts/pdf_prompt.py +37 -0
- chunksmith_cli/saved_catalog.py +6 -0
- chunksmith_cli/theme.py +6 -0
- chunksmith_cli/tree_view.py +6 -0
- chunksmith_cli/view_session.py +6 -0
- chunksmith_cli/views/__init__.py +1 -0
- chunksmith_cli/views/json_view.py +11 -0
- chunksmith_cli/views/outline_browser.py +247 -0
- chunksmith_cli/views/tree_view.py +59 -0
- chunksmith_cli/views/view_session.py +32 -0
- chunksmith_cli/wizard.py +357 -0
- chunksmith_cli-0.4.0.dist-info/METADATA +61 -0
- chunksmith_cli-0.4.0.dist-info/RECORD +65 -0
- chunksmith_cli-0.4.0.dist-info/WHEEL +5 -0
- chunksmith_cli-0.4.0.dist-info/entry_points.txt +2 -0
- chunksmith_cli-0.4.0.dist-info/licenses/LICENSE.vectify +21 -0
- chunksmith_cli-0.4.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Map CLI ``PdfPartitionPreferences`` → ``MultiIndexingPreferences`` (PyPI library)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
from chunksmith_core.preferences import MultiIndexingPreferences
|
|
9
|
+
from chunksmith_multimodal.config.config import default_anchor_mapping_options
|
|
10
|
+
|
|
11
|
+
from chunksmith_cli.partition_prefs import PdfPartitionPreferences
|
|
12
|
+
from chunksmith_cli.pipelines.mapping_validation import resolve_multi_indexing_mapping
|
|
13
|
+
from chunksmith_cli.pipelines.multi_indexing_runtime import MultiIndexingModules
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _env_int(name: str, default: int) -> int:
|
|
17
|
+
raw = os.getenv(name)
|
|
18
|
+
if not raw:
|
|
19
|
+
return default
|
|
20
|
+
try:
|
|
21
|
+
return int(str(raw).split("#", 1)[0].strip())
|
|
22
|
+
except ValueError:
|
|
23
|
+
return default
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _element_xml_from_mode(
|
|
27
|
+
mode: str,
|
|
28
|
+
custom: dict[str, bool] | None,
|
|
29
|
+
*,
|
|
30
|
+
element_xml_types: tuple[str, ...],
|
|
31
|
+
) -> dict[str, bool]:
|
|
32
|
+
m = (mode or "all").strip().lower()
|
|
33
|
+
if m == "none":
|
|
34
|
+
return dict.fromkeys(element_xml_types, False)
|
|
35
|
+
if m == "minimal":
|
|
36
|
+
keep = {"Title", "NarrativeText", "Table", "Image"}
|
|
37
|
+
return {t: (t in keep) for t in element_xml_types}
|
|
38
|
+
if m == "core":
|
|
39
|
+
keep = {"Title", "NarrativeText", "Text", "ListItem", "Table", "Image", "Header", "Footer"}
|
|
40
|
+
return {t: (t in keep) for t in element_xml_types}
|
|
41
|
+
if m == "custom":
|
|
42
|
+
custom_map = custom or {}
|
|
43
|
+
return {t: bool(custom_map.get(t, False)) for t in element_xml_types}
|
|
44
|
+
return dict.fromkeys(element_xml_types, True)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def multi_indexing_preferences_from_partition(
    prefs: PdfPartitionPreferences,
    mods: MultiIndexingModules,
    *,
    llm_context_format: Literal["json", "toon"],
    partition_pdf: bool,
    elements_json_path: str | None = None,
    stream_llm_tokens: bool = False,
) -> MultiIndexingPreferences:
    """Build library preferences from wizard partition prefs.

    Args:
        prefs: CLI-side partition preferences collected by the wizard.
        mods: Runtime module bundle; ``MAPPING_METHODS`` and
            ``ELEMENT_XML_TYPES`` are read from it.
        llm_context_format: Passed through unchanged.
        partition_pdf: Passed through unchanged.
        elements_json_path: Passed through unchanged.
        stream_llm_tokens: Coerced to ``bool`` and passed through.

    Returns:
        A populated ``MultiIndexingPreferences``.

    Raises:
        ValueError: Propagated from ``resolve_multi_indexing_mapping`` when
            the mapping method is missing or inconsistent with the other
            preferences, or from ``int()`` when a numeric preference cannot
            be converted.
    """
    # Anchor-mapping switches come from the library defaults, not from prefs.
    anchor = default_anchor_mapping_options()
    # Validate the mapping method up front so bad prefs fail before construction.
    mapping = resolve_multi_indexing_mapping(prefs, mapping_methods=mods.MAPPING_METHODS)
    gen_doc = prefs.chunksmith_pageindex_generate_doc_summary

    return MultiIndexingPreferences(
        languages=prefs.languages,
        extract_images=bool(prefs.extract_images),
        # Clamp: never negative pages per chunk, always at least one worker.
        pages_per_chunk=max(0, int(prefs.pages_per_chunk)),
        max_concurrent=max(1, int(prefs.max_concurrent)),
        partition_pdf=partition_pdf,
        use_group_by_title=bool(prefs.chunksmith_use_group_by_title),
        max_characters=int(prefs.max_characters),
        new_after_n_chars=int(prefs.new_after_n_chars),
        combine_text_under_n_chars=int(prefs.combine_text_under_n_chars),
        group_multipage_sections=bool(prefs.chunksmith_group_multipage_sections),
        mapping_method=mapping,  # type: ignore[arg-type]
        llm_context_format=llm_context_format,
        include_empty_mapper_fields=bool(prefs.chunksmith_include_empty_mapper_fields),
        use_start_anchor_mapping=bool(anchor.use_start_anchor_mapping),
        use_end_anchor_mapping=bool(anchor.use_end_anchor_mapping),
        coded_add_page_xml=bool(prefs.chunksmith_coded_add_page_xml),
        coded_add_group_by_title_xml=bool(prefs.chunksmith_coded_add_group_by_title_xml),
        # Expand the mode name (none/minimal/core/custom/all) into per-type flags.
        coded_element_xml=_element_xml_from_mode(
            prefs.chunksmith_coded_element_xml_mode,
            prefs.chunksmith_coded_element_xml_custom,
            element_xml_types=mods.ELEMENT_XML_TYPES,
        ),
        add_outline_summaries=bool(prefs.effective_pageindex_add_outline_summaries()),
        add_anchor=bool(prefs.chunksmith_pageindex_add_anchor),
        # An unset (None) preference means "generate the document summary".
        generate_doc_summary=bool(gen_doc) if gen_doc is not None else True,
        use_embedded_pdf_toc=bool(prefs.chunksmith_pageindex_use_embedded_pdf_toc),
        off_llm_parsing=bool(prefs.chunksmith_pageindex_off_llm_parsing),
        # Unset (None) probe flags are treated as disabled.
        probe_first_excerpt=bool(prefs.chunksmith_probe_first_excerpt or False),
        probe_merged_reflection=bool(prefs.chunksmith_probe_merged_reflection or False),
        stream_llm_tokens=bool(stream_llm_tokens),
        # Model names and token/overlap limits are taken from the environment.
        pageindex_model=os.getenv("PAGEINDEX_MODEL"),
        # LLM_MODEL wins; CHATGPT_MODEL is used when LLM_MODEL is unset or empty.
        llm_model=os.getenv("LLM_MODEL") or os.getenv("CHATGPT_MODEL"),
        max_tokens_per_chunk=_env_int("CHUNKSMITH_MAX_TOKENS_PER_CHUNK", 3000),
        overlap_pages=_env_int("CHUNKSMITH_OVERLAP_PAGES", 1),
        elements_json_path=elements_json_path,
    )
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
"""Discover recent saved outlines and element indexes under cli/data (and logs)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from chunksmith_cli.core.paths import cli_data_dir, default_project_logs_dir, ensure_cli_storage
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True)
class SavedOutlineRow:
    """One saved outline JSON file found on disk (immutable record)."""

    # File name with the recognised outline suffix removed.
    stem: str
    # Resolved path of the outline file.
    path: Path
    # Derived from the file-name suffix by _collect_outline_files.
    kind: str  # pageindex | multi_index | v3
    # Human-readable label for where the file was found (e.g. "logs/json").
    source: str
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass(frozen=True)
class SavedElementIndexRow:
    """One saved element-index JSON file found on disk (immutable record)."""

    # File name with the recognised index suffix removed.
    stem: str
    # Resolved path of the index file.
    path: Path
    # Derived from the file-name suffix by _collect_element_indexes.
    kind: str  # multimodal | v3
    # Matching "<stem>_pageindex.json" in the same folder, when that file exists.
    pageindex_path: Path | None
    # Human-readable label for where the file was found (e.g. "logs/json").
    source: str
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _index_has_queryable_elements(data: dict[str, object], *, kind: str) -> bool:
|
|
30
|
+
if kind == "multi_index":
|
|
31
|
+
return isinstance(data.get("n_elements"), int) and data["n_elements"] > 0
|
|
32
|
+
return isinstance(data.get("elements"), list)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _has_structure(path: Path) -> bool:
|
|
36
|
+
try:
|
|
37
|
+
data = json.loads(path.read_text(encoding="utf-8"))
|
|
38
|
+
except (OSError, json.JSONDecodeError):
|
|
39
|
+
return False
|
|
40
|
+
return isinstance(data, dict) and isinstance(data.get("structure"), list)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _glob_by_mtime(directory: Path, pattern: str) -> list[Path]:
|
|
44
|
+
hits: list[Path] = []
|
|
45
|
+
seen: set[Path] = set()
|
|
46
|
+
for p in directory.glob(pattern):
|
|
47
|
+
key = p.resolve()
|
|
48
|
+
if key not in seen:
|
|
49
|
+
seen.add(key)
|
|
50
|
+
hits.append(p)
|
|
51
|
+
if directory.name != "json":
|
|
52
|
+
for p in directory.glob(f"**/json/{pattern}"):
|
|
53
|
+
key = p.resolve()
|
|
54
|
+
if key not in seen:
|
|
55
|
+
seen.add(key)
|
|
56
|
+
hits.append(p)
|
|
57
|
+
hits.sort(key=lambda p: p.stat().st_mtime, reverse=True)
|
|
58
|
+
return hits
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _collect_outline_files(directory: Path, *, source: str, limit: int) -> list[SavedOutlineRow]:
    """Collect up to *limit* saved outlines from *directory*, newest first.

    Files are recognised by suffix (``_pageindex.json``, ``_multi_index.json``,
    ``_v3_index.json``). Only the newest file per stem is kept, and files
    without a ``structure`` list are ignored.

    Args:
        directory: Folder to scan.
        source: Label stored on every returned row.
        limit: Maximum number of rows to return.

    Returns:
        Rows ordered by file modification time, most recent first.
    """
    # Suffix -> kind, in the order the suffixes are probed.
    suffix_kinds = (
        ("_pageindex.json", "pageindex"),
        ("_multi_index.json", "multi_index"),
        ("_v3_index.json", "v3"),
    )

    candidates: list[Path] = []
    for suffix, _kind in suffix_kinds:
        for found in _glob_by_mtime(directory, f"*{suffix}"):
            if found not in candidates:
                candidates.append(found)
    # Merge the per-pattern lists into one newest-first ordering.
    candidates.sort(key=lambda item: item.stat().st_mtime, reverse=True)

    rows: list[SavedOutlineRow] = []
    taken: set[str] = set()
    for found in candidates:
        if len(rows) >= limit:
            break
        file_name = found.name
        match = next(
            ((file_name[: -len(sfx)], knd) for sfx, knd in suffix_kinds if file_name.endswith(sfx)),
            None,
        )
        if match is None:
            continue
        label, kind = match
        # A newer file already claimed this stem, or the file is not an outline.
        if label in taken or not _has_structure(found):
            continue
        taken.add(label)
        rows.append(SavedOutlineRow(stem=label, path=found.resolve(), kind=kind, source=source))
    return rows
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def list_saved_outlines(*, limit: int = 10) -> list[SavedOutlineRow]:
    """List recent saved outlines from the CLI data folder and project logs.

    Rows from the CLI data folder come first; rows from ``<logs>/json`` are
    appended only for stems not already present, until *limit* is reached.

    Args:
        limit: Maximum number of rows to return.

    Returns:
        At most *limit* outline rows.
    """
    ensure_cli_storage()
    collected = _collect_outline_files(cli_data_dir(), source="cli/data (+ artifact folders)", limit=limit)

    logs_root = default_project_logs_dir()
    logs_json = None if logs_root is None else logs_root / "json"
    if logs_json is not None and logs_json.is_dir():
        known_stems = {row.stem for row in collected}
        for candidate in _collect_outline_files(logs_json, source="logs/json", limit=limit):
            if len(collected) >= limit:
                break
            if candidate.stem in known_stems:
                continue
            known_stems.add(candidate.stem)
            collected.append(candidate)
    return collected[:limit]
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _pageindex_sibling(index_path: Path) -> Path | None:
|
|
112
|
+
name = index_path.name
|
|
113
|
+
parent = index_path.parent
|
|
114
|
+
if name.endswith("_canonical_bundle.json"):
|
|
115
|
+
alt = parent / name.replace("_canonical_bundle.json", "_pageindex.json")
|
|
116
|
+
return alt if alt.is_file() else None
|
|
117
|
+
if name.endswith("_v3_index.json"):
|
|
118
|
+
alt = parent / name.replace("_v3_index.json", "_pageindex.json")
|
|
119
|
+
return alt if alt.is_file() else None
|
|
120
|
+
return None
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _collect_element_indexes(directory: Path, *, source: str, limit: int) -> list[SavedElementIndexRow]:
    """Collect up to *limit* saved element indexes from *directory*, newest first.

    Files are recognised by suffix (``_canonical_bundle.json`` -> kind
    ``"multimodal"``, ``_v3_index.json`` -> kind ``"v3"``). Only the newest
    usable file per stem is kept. Files that cannot be read, are not valid
    UTF-8, are not valid JSON, or carry no queryable elements are skipped.

    Args:
        directory: Folder to scan.
        source: Label stored on every returned row.
        limit: Maximum number of rows to return.

    Returns:
        Rows ordered by file modification time, most recent first.
    """
    rows: list[SavedElementIndexRow] = []
    seen: set[str] = set()

    hits: list[Path] = []
    for pat in ("*_canonical_bundle.json", "*_v3_index.json"):
        for p in _glob_by_mtime(directory, pat):
            if p not in hits:
                hits.append(p)
    # Merge the per-pattern lists into one newest-first ordering.
    hits.sort(key=lambda p: p.stat().st_mtime, reverse=True)

    for p in hits:
        if len(rows) >= limit:
            break
        name = p.name
        if name.endswith("_canonical_bundle.json"):
            stem = name[: -len("_canonical_bundle.json")]
            kind = "multimodal"
        elif name.endswith("_v3_index.json"):
            stem = name[: -len("_v3_index.json")]
            kind = "v3"
        else:
            continue
        if stem in seen:
            continue
        try:
            data = json.loads(p.read_text(encoding="utf-8"))
        except (OSError, UnicodeDecodeError, json.JSONDecodeError):
            # UnicodeDecodeError is not a JSONDecodeError; without it a single
            # non-UTF-8 file would abort the whole listing.
            continue
        if not isinstance(data, dict) or not _index_has_queryable_elements(data, kind=kind):
            continue
        seen.add(stem)
        pi = _pageindex_sibling(p)
        rows.append(
            SavedElementIndexRow(
                stem=stem,
                path=p.resolve(),
                kind=kind,
                pageindex_path=pi.resolve() if pi else None,
                source=source,
            )
        )
    return rows
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def list_saved_element_indexes(*, limit: int = 10) -> list[SavedElementIndexRow]:
    """List recent saved element indexes from the CLI data folder and project logs.

    Rows from the CLI data folder come first; rows from ``<logs>/json`` are
    appended only for stems not already present, until *limit* is reached.

    Args:
        limit: Maximum number of rows to return.

    Returns:
        At most *limit* element-index rows.
    """
    ensure_cli_storage()
    collected = _collect_element_indexes(cli_data_dir(), source="cli/data (+ artifact folders)", limit=limit)

    logs_root = default_project_logs_dir()
    logs_json = None if logs_root is None else logs_root / "json"
    if logs_json is not None and logs_json.is_dir():
        known_stems = {row.stem for row in collected}
        for candidate in _collect_element_indexes(logs_json, source="logs/json", limit=limit):
            if len(collected) >= limit:
                break
            if candidate.stem in known_stems:
                continue
            known_stems.add(candidate.stem)
            collected.append(candidate)
    return collected[:limit]
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Shared terminal theme and console factory."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from importlib.metadata import PackageNotFoundError, version
|
|
6
|
+
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
from rich.theme import Theme
|
|
9
|
+
|
|
10
|
+
# Fallback version, kept when the installed-distribution lookup below fails.
VERSION = "0.4.0"
try:
    # Prefer the version recorded in the installed "chunksmith-cli" metadata.
    VERSION = version("chunksmith-cli")
except PackageNotFoundError:
    # No installed distribution metadata found: keep the fallback above.
    pass

# Brand palette as hex colour strings. NAVY_LIGHT and TEAL_DIM are not
# referenced by CHUNKSMITH_THEME in this module.
NAVY = "#1e3a5f"
NAVY_LIGHT = "#2d5a87"
TEAL = "#00b4a6"
TEAL_BRIGHT = "#2dd4bf"
TEAL_DIM = "#5ec4bc"
MUTED = "#94a3b8"

# Named styles handed to every console created by make_console().
CHUNKSMITH_THEME = Theme(
    {
        "brand.navy": NAVY,
        "brand.teal": TEAL,
        "brand.muted": MUTED,
        "info": TEAL_BRIGHT,
        "success": "green",
        "warning": "yellow",
        "error": "bold red",
    }
)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def make_console(**kwargs) -> Console:
    """Create the application console with ChunkSmith colors.

    ``theme`` defaults to ``CHUNKSMITH_THEME`` and ``highlight`` to ``True``.
    Both may be overridden through *kwargs*; before this change, passing
    either raised ``TypeError`` for a duplicated keyword argument.

    Args:
        **kwargs: Extra keyword arguments forwarded to ``rich.console.Console``.

    Returns:
        A configured ``Console`` instance.
    """
    # Caller-supplied values win over the defaults.
    options = {"theme": CHUNKSMITH_THEME, "highlight": True, **kwargs}
    return Console(**options)
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
"""Backward-compatible import path. Prefer `cli.prompts.elements_json_prompt`."""
|
|
2
|
+
|
|
3
|
+
import importlib
|
|
4
|
+
|
|
5
|
+
# Load the real implementation; its importable path is
# ``chunksmith_cli.prompts.elements_json_prompt``.
_mod = importlib.import_module("chunksmith_cli.prompts.elements_json_prompt")
# Re-export every attribute of that module from this one. Names starting with
# "__" are skipped, so this module's own dunder attributes are not overwritten;
# single-underscore names are copied as well.
globals().update({k: v for k, v in _mod.__dict__.items() if not k.startswith("__")})
|
chunksmith_cli/menu.py
ADDED
chunksmith_cli/menus.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Menu definitions for the interactive CLI."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from chunksmith_cli.menu import MenuOption
|
|
6
|
+
|
|
7
|
+
# Menu tables for the interactive CLI. Each MenuOption is built from three
# positional strings plus optional ``aliases``.
# NOTE(review): the positional fields look like (key, label, description) --
# confirm against the MenuOption definition.

# Top-level menu.
ROOT_MENU: tuple[MenuOption, ...] = (
    MenuOption(
        "chunking",
        "Multi-indexing",
        "Unstructured → group-by-title → coded markup → LLM outline → mapper",
        aliases=("c", "index"),
    ),
    MenuOption(
        "view",
        "View saved runs",
        "Inspect outline JSON from saved runs",
        aliases=("v", "browse"),
    ),
    MenuOption(
        "agent",
        "Document Q&A",
        "Load a saved index and chat (legacy multimodal bundles)",
        aliases=("a", "chat"),
    ),
)

# Indexing pipelines offered under the "chunking" entry.
CHUNKING_MENU: tuple[MenuOption, ...] = (
    MenuOption(
        "1",
        "Multi-indexing",
        "Partition → group → coded markup → LLM outline → mapper (JSON or TOON)",
        aliases=("multi", "indexing"),
    ),
    MenuOption(
        "2",
        "PageIndexer",
        "PDF parser + LLM outline (no Unstructured partition)",
        aliases=("pageindex", "pi"),
    ),
)

# Read-only viewers for saved runs.
VIEW_MENU: tuple[MenuOption, ...] = (
    MenuOption(
        "1",
        "View outlines",
        "Section tree, summary, raw JSON (read-only)",
        aliases=("outline", "json", "tree"),
    ),
)

# Document Q&A actions; these entries define no aliases.
AGENT_MENU: tuple[MenuOption, ...] = (
    MenuOption("1", "Open saved index", "Enter a path or stem from saved runs"),
    MenuOption("2", "Chat", "Ask questions about the loaded document"),
)
|
chunksmith_cli/panels.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""CLI process settings — mirrors MVL ``PdfPartitionPreferences`` / ``ChunkSmithProcessOptions``."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any, Dict
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class PdfPartitionPreferences:
    """Partition and indexing options collected by the CLI wizard.

    Per the original author's note, the fields mirror MVL-AgneticAI
    ``app.services.indexing.partition_prefs``.
    """

    # General partitioning options.
    languages: str = "eng"
    extract_images: bool = True
    extract_tables: bool = True
    max_characters: int = 3000
    new_after_n_chars: int = 3800
    combine_text_under_n_chars: int = 200
    pages_per_chunk: int = 10
    max_concurrent: int = 2
    summarize_outline_nodes: bool = True

    # PageIndex outline options; ``None`` means "not explicitly chosen".
    chunksmith_pageindex_add_text: bool = False
    chunksmith_pageindex_node_text_source: str = "pdf_parser"
    chunksmith_pageindex_add_anchor: bool = False
    chunksmith_pageindex_assign_node_ids: bool = True
    chunksmith_pageindex_add_outline_summaries: bool | None = None
    chunksmith_pageindex_generate_doc_summary: bool | None = None

    # Partition backend and probe switches.
    chunksmith_partition_backend: str = "api"
    chunksmith_probe_first_excerpt: bool | None = None
    chunksmith_probe_merged_reflection: bool | None = None

    # Grouping, mapping and coded-markup options.
    chunksmith_use_group_by_title: bool = True
    chunksmith_mapping_method: str | None = None
    chunksmith_include_empty_mapper_fields: bool = False
    chunksmith_group_multipage_sections: bool = True
    chunksmith_coded_add_page_xml: bool = True
    chunksmith_coded_add_group_by_title_xml: bool = True
    chunksmith_coded_element_xml_mode: str = "all"
    chunksmith_coded_element_xml_custom: Dict[str, bool] | None = None
    chunksmith_pageindex_use_embedded_pdf_toc: bool = True
    chunksmith_pageindex_off_llm_parsing: bool = False

    def effective_pageindex_add_outline_summaries(self) -> bool:
        """Return the outline-summary switch actually in effect.

        The explicit ``chunksmith_pageindex_add_outline_summaries`` value wins
        when it is set; otherwise ``summarize_outline_nodes`` is used.
        """
        override = self.chunksmith_pageindex_add_outline_summaries
        chosen = self.summarize_outline_nodes if override is None else override
        return bool(chosen)

    def to_stored_dict(self) -> Dict[str, Any]:
        """Return the subset of preferences that is persisted, keyed by field name."""
        # Listed explicitly: not every field is stored, and key order is kept.
        stored_fields = (
            "languages",
            "extract_images",
            "extract_tables",
            "max_characters",
            "new_after_n_chars",
            "combine_text_under_n_chars",
            "pages_per_chunk",
            "max_concurrent",
            "summarize_outline_nodes",
            "chunksmith_use_group_by_title",
            "chunksmith_mapping_method",
            "chunksmith_include_empty_mapper_fields",
            "chunksmith_group_multipage_sections",
            "chunksmith_coded_add_page_xml",
            "chunksmith_coded_add_group_by_title_xml",
            "chunksmith_coded_element_xml_mode",
            "chunksmith_coded_element_xml_custom",
            "chunksmith_pageindex_add_anchor",
            "chunksmith_probe_first_excerpt",
            "chunksmith_probe_merged_reflection",
            "chunksmith_pageindex_use_embedded_pdf_toc",
            "chunksmith_pageindex_off_llm_parsing",
        )
        return {key: getattr(self, key) for key in stored_fields}
|
chunksmith_cli/paths.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Multi-indexing ingest wizards and artifact persistence."""
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Validate MultiIndexing ``mapping_method`` (aligned with MVL app)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from chunksmith_cli.partition_prefs import PdfPartitionPreferences # noqa: TC001
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def require_mapping_method(value: str | None, *, mapping_methods: frozenset[str]) -> str:
    """Normalise and validate a mapping-method name.

    Args:
        value: Raw method name; may be ``None`` or padded / mixed case.
        mapping_methods: The accepted (lower-case) method names.

    Returns:
        The stripped, lower-cased method name.

    Raises:
        ValueError: If the normalised value is not in *mapping_methods*
            (including when *value* is missing or empty).
    """
    normalized = value.strip().lower() if value else ""
    if normalized in mapping_methods:
        return normalized
    choices = ", ".join(sorted(mapping_methods))
    raise ValueError(f"chunksmith_mapping_method is required; use one of: {choices}")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def resolve_multi_indexing_mapping(
    prefs: PdfPartitionPreferences,
    *,
    mapping_methods: frozenset[str],
) -> str:
    """Validate the mapping method in *prefs* against the other preferences.

    Args:
        prefs: Preferences holding ``chunksmith_mapping_method`` and the
            switches it depends on.
        mapping_methods: The accepted method names.

    Returns:
        The normalised mapping-method name.

    Raises:
        ValueError: If the method is missing or unknown, if
            ``group_indexing`` / ``title_indexing`` is chosen while
            group-by-title is disabled, or if ``anchor_indexing`` is chosen
            while anchors are disabled.
    """
    method = require_mapping_method(prefs.chunksmith_mapping_method, mapping_methods=mapping_methods)
    grouping_enabled = bool(prefs.chunksmith_use_group_by_title)

    needs_grouping = method in ("group_indexing", "title_indexing")
    if needs_grouping and not grouping_enabled:
        raise ValueError(
            f"{method} requires chunksmith_use_group_by_title=true (or choose page_indexing / anchor_indexing)."
        )

    if method == "anchor_indexing" and not prefs.chunksmith_pageindex_add_anchor:
        raise ValueError("anchor_indexing requires chunksmith_pageindex_add_anchor=true.")

    return method
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Build ``MultiIndexingPreferences`` from CLI prefs (MVL app + notebook parity)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
from chunksmith_core.preferences import MultiIndexingPreferences
|
|
9
|
+
|
|
10
|
+
from chunksmith_cli.core.prefs_mapper import multi_indexing_preferences_from_partition
|
|
11
|
+
from chunksmith_cli.partition_prefs import PdfPartitionPreferences
|
|
12
|
+
from chunksmith_cli.pipelines.multi_indexing_runtime import MultiIndexingModules
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def build_multi_indexing_preferences(
    mods: MultiIndexingModules,
    prefs: PdfPartitionPreferences,
    *,
    llm_context_format: Literal["json", "toon"],
    pdf_path: Path,
    elements_json_path: Path | None,
    partition_pdf: bool,
    stream_llm_tokens: bool,
) -> MultiIndexingPreferences:
    """Turn wizard answers into ``MultiIndexingPreferences``.

    Thin adapter over ``multi_indexing_preferences_from_partition``: it
    converts *elements_json_path* to a string (or keeps ``None``) and
    forwards everything else.

    Args:
        mods: Runtime module bundle forwarded to the mapper.
        prefs: CLI partition preferences.
        llm_context_format: Forwarded unchanged.
        pdf_path: Accepted for signature compatibility and not used here.
        elements_json_path: Optional path to an elements JSON file.
        partition_pdf: Forwarded unchanged.
        stream_llm_tokens: Forwarded unchanged.

    Returns:
        The preferences object built by the mapper.
    """
    del pdf_path  # path is passed separately to the runner

    if elements_json_path is None:
        elements_arg = None
    else:
        elements_arg = str(elements_json_path)

    return multi_indexing_preferences_from_partition(
        prefs,
        mods,
        llm_context_format=llm_context_format,
        partition_pdf=partition_pdf,
        elements_json_path=elements_arg,
        stream_llm_tokens=stream_llm_tokens,
    )
|