chunksmith-core 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.4
2
+ Name: chunksmith-core
3
+ Version: 0.3.0
4
+ Summary: ChunkSmith core ports, preferences, and in-memory element sources.
5
+ Author-email: AnshulParate2004 <anshulnparate@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/AnshulParate2004/chunksmith-lib
8
+ Project-URL: Repository, https://github.com/AnshulParate2004/chunksmith-lib
9
+ Project-URL: Changelog, https://github.com/AnshulParate2004/chunksmith-lib/blob/main/CHANGELOG.md
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: pydantic>=2.10.0
12
+ Requires-Dist: typing_extensions>=4.0.0
13
+ Requires-Dist: python-dotenv>=1.0.0
@@ -0,0 +1,25 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "chunksmith-core"
7
+ version = "0.3.0"
8
+ description = "ChunkSmith core ports, preferences, and in-memory element sources."
9
+ requires-python = ">=3.10"
10
+ license = "MIT"
11
+ authors = [{ name = "AnshulParate2004", email = "anshulnparate@gmail.com" }]
12
+ dependencies = [
13
+ "pydantic>=2.10.0",
14
+ "typing_extensions>=4.0.0",
15
+ "python-dotenv>=1.0.0",
16
+ ]
17
+
18
+ [project.urls]
19
+ Homepage = "https://github.com/AnshulParate2004/chunksmith-lib"
20
+ Repository = "https://github.com/AnshulParate2004/chunksmith-lib"
21
+ Changelog = "https://github.com/AnshulParate2004/chunksmith-lib/blob/main/CHANGELOG.md"
22
+
23
+ [tool.setuptools.packages.find]
24
+ where = ["src"]
25
+ include = ["chunksmith_core", "chunksmith_core.*"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,28 @@
1
+ """Shared ports, types, and adapters for ChunkSmith library packages."""
2
+
3
+ from chunksmith_core.element_sources import InMemoryElementSource
4
+ from chunksmith_core.output import summarize_multi_indexing_result
5
+ from chunksmith_core.preferences import MultiIndexingPreferences
6
+ from chunksmith_core.ports import (
7
+ ArtifactSink,
8
+ ElementSource,
9
+ LlmClient,
10
+ PartitionOptions,
11
+ PipelineStorage,
12
+ ProgressObserver,
13
+ )
14
+ from chunksmith_core.persist import persist_multi_indexing_artifacts, persist_pageindex_outline
15
+
16
# Public API of the package; every name listed here is imported above.
__all__ = [
    # Ports, adapters and preference types.
    "ArtifactSink",
    "ElementSource",
    "InMemoryElementSource",
    "LlmClient",
    "MultiIndexingPreferences",
    "PartitionOptions",
    "PipelineStorage",
    "ProgressObserver",
    # Helper functions.
    "persist_multi_indexing_artifacts",
    "persist_pageindex_outline",
    "summarize_multi_indexing_result",
]
@@ -0,0 +1,26 @@
1
+ """ElementSource adapters for in-memory / JSON elements."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from chunksmith_core.ports import PartitionOptions
10
+
11
+
12
class InMemoryElementSource:
    """Serve caller-supplied elements without running any partitioning.

    The constructor stores a shallow copy of the supplied list and
    ``partition_pdf`` hands out a fresh shallow copy on every call, so adding
    or removing entries on either side never changes the stored sequence.
    The element dicts themselves are shared, not deep-copied.
    """

    def __init__(self, elements: list[dict[str, Any]]) -> None:
        self._elements = list(elements)

    def partition_pdf(self, pdf_path: Path | str, opts: PartitionOptions) -> tuple[list[dict[str, Any]], Any]:
        """Return ``(elements, raw_partition_output)`` built from the stored elements.

        ``pdf_path`` and ``opts`` are accepted for interface compatibility and
        are not used. Both tuple items are the same list object, which is a
        copy of the internal list so callers cannot alter this source's state
        by mutating the returned list.
        """
        snapshot = list(self._elements)
        return snapshot, snapshot

    def load_json(self, path: Path | str) -> list[dict[str, Any]]:
        """Load elements from the UTF-8 JSON file at ``path``.

        Raises:
            ValueError: if the top-level JSON value is not a list, or if any
                item of that list is not a JSON object (dict).
        """
        raw = Path(path).read_text(encoding="utf-8")
        data = json.loads(raw)
        # Check the items as well as the container: the declared return type
        # and the error message both promise a list of dicts.
        if not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
            raise ValueError("elements JSON must be a list of dicts")
        return data
@@ -0,0 +1,48 @@
1
+ """Default LiteLLM-backed LlmClient adapter."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from chunksmith_core.ports import LlmClient
8
+
9
+
10
class LiteLLMClient:
    """LlmClient adapter backed by langchain-litellm's ``ChatLiteLLM``.

    Keyword arguments given at construction are stored and merged into every
    ``complete`` call; per-call keyword arguments win on key collisions.
    """

    def __init__(self, **litellm_kwargs: Any) -> None:
        self._litellm_kwargs = dict(litellm_kwargs)

    def complete(
        self,
        messages: list[dict[str, Any]],
        *,
        model: str,
        response_format: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> str:
        """Send ``messages`` to ``model`` and return the reply content as a string.

        Each message is a dict read through its ``"role"`` and ``"content"``
        keys. A missing or falsy role is treated as ``"user"``; roles other
        than ``"system"`` and ``"assistant"`` (case-insensitive) are sent as
        human messages. Missing or falsy content becomes ``""``.
        """
        # The langchain imports are deferred until a completion is requested.
        from langchain_litellm import ChatLiteLLM
        from langchain_core.messages import AIMessage, HumanMessage, SystemMessage

        call_kwargs = dict(self._litellm_kwargs)
        call_kwargs.update(kwargs)
        if response_format is not None:
            call_kwargs["response_format"] = response_format
        chat = ChatLiteLLM(model=model, **call_kwargs)

        # Roles with a dedicated message class; anything else is a human message.
        message_types = {"system": SystemMessage, "assistant": AIMessage}
        converted = []
        for entry in messages:
            role_name = (entry.get("role") or "user").lower()
            message_cls = message_types.get(role_name, HumanMessage)
            converted.append(message_cls(content=entry.get("content") or ""))

        reply = chat.invoke(converted)
        # Fall back to the reply object itself when it has no ``content``.
        text = getattr(reply, "content", reply)
        return str(text or "")
45
+
46
+
47
def default_llm_client(**litellm_kwargs: Any) -> LlmClient:
    """Build the default ``LlmClient``: a ``LiteLLMClient`` given ``litellm_kwargs``."""
    client = LiteLLMClient(**litellm_kwargs)
    return client
@@ -0,0 +1,44 @@
1
+ """Describe pipeline output artifacts for callers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+
8
def summarize_multi_indexing_result(result: Any) -> dict[str, Any]:
    """
    Build a compact overview of a ``MultiIndexingPipelineResult``.

    Only names, counts and sizes are reported, never the payloads themselves,
    so the output shape can be inspected before persisting it anywhere.
    Attributes that are missing from ``result`` or falsy are treated as empty.
    """

    def _subtree_size(node: dict) -> int:
        # A node counts itself plus everything under a list-valued "nodes" key.
        children = node.get("nodes")
        if not isinstance(children, list):
            return 1
        return 1 + _count_nodes(children)

    def _count_nodes(nodes: list) -> int:
        return sum(_subtree_size(node) for node in nodes)

    outline = dict(getattr(result, "title_outline", None) or {})
    mapper = dict(getattr(result, "mapper_output", None) or {})
    structure = outline.get("structure") or []

    element_count = len(getattr(result, "elements", None) or [])
    # Zero chunks is reported as None, same as a missing ``chunks_local``.
    chunk_count = len(getattr(result, "chunks_local", None) or []) or None
    if isinstance(structure, list):
        outline_node_count = _count_nodes(structure)
    else:
        outline_node_count = 0
    coded_chars = len(getattr(result, "title_coded_formate", "") or "")

    # Static description of each artifact carried by the result.
    artifact_notes = {
        "elements": "list[dict] — Unstructured element rows",
        "title_coded_formate": "str — XML coded document for LLM",
        "title_outline": "dict — doc_name, doc_description, structure (nested tree)",
        "mapper_output": "dict — combined index + mapping report",
        "chunks_local": "list[TitleChunk] | None — title groups when enabled",
        "raw_partition_output": "Any — raw Unstructured API payload",
    }

    return {
        "doc_name": outline.get("doc_name"),
        "doc_description": outline.get("doc_description"),
        "mapping_method": mapper.get("mapping_method") or outline.get("mapping_method"),
        "element_count": element_count,
        "chunk_count": chunk_count,
        "outline_node_count": outline_node_count,
        "coded_formate_chars": coded_chars,
        "mapper_keys": sorted(mapper) if mapper else [],
        "artifacts": artifact_notes,
    }
@@ -0,0 +1,34 @@
1
+ """Opt-in artifact persistence helpers (library pipelines stay in-memory)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ from chunksmith_core.ports import ArtifactSink, RunArtifactBundle
8
+
9
+
10
def persist_multi_indexing_artifacts(result: Any, sink: ArtifactSink) -> None:
    """Persist a MultiIndexingPipelineResult via an ArtifactSink.

    Each artifact is read from ``result`` with ``getattr(..., None)`` and is
    saved only when present and truthy, so a partial result (including one
    that has no ``elements`` attribute) is persisted without raising.
    The sink receives copies: ``list(...)`` for elements, ``dict(...)`` for
    the outline and mapper output, and ``str(...)`` of the coded text.
    """
    elements = getattr(result, "elements", None)
    if elements:
        sink.save_elements(list(elements))

    coded_formate = getattr(result, "title_coded_formate", None)
    if coded_formate:
        sink.save_coded_formate(str(coded_formate))

    outline = getattr(result, "title_outline", None)
    if outline:
        sink.save_outline(dict(outline))

    mapper = getattr(result, "mapper_output", None)
    if mapper:
        sink.save_mapper(dict(mapper))
20
+
21
+
22
def persist_pageindex_outline(outline: dict[str, Any], sink: ArtifactSink) -> None:
    """Save a shallow copy of ``outline`` through ``sink.save_outline``."""
    outline_copy = dict(outline)
    sink.save_outline(outline_copy)
24
+
25
+
26
def persist_bundle(bundle: RunArtifactBundle, sink: ArtifactSink) -> None:
    """Persist every non-empty artifact of ``bundle`` through ``sink``.

    Empty artifacts (empty list, empty string, empty dict) are skipped.
    The elements list and the outline/mapper dicts are handed to the sink as
    shallow copies, matching the other persist helpers in this module, so a
    sink that mutates its argument does not alter the bundle's containers.
    """
    if bundle.elements:
        sink.save_elements(list(bundle.elements))
    if bundle.coded_formate:
        sink.save_coded_formate(bundle.coded_formate)
    if bundle.outline:
        sink.save_outline(dict(bundle.outline))
    if bundle.mapper:
        sink.save_mapper(dict(bundle.mapper))
@@ -0,0 +1,86 @@
1
+ """Hexagonal ports — core library never imports Supabase, Mongo, S3, or FastAPI."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from typing import Any, Protocol, runtime_checkable
8
+
9
+
10
@dataclass
class PartitionOptions:
    """Settings handed to an ``ElementSource`` when it partitions a PDF."""

    # A single language code or a list of codes.
    languages: str | list[str] = "eng"
    # Whether images are extracted.
    extract_images: bool = True
    # Page batch size (presumably pages sent per partition request; confirm with the adapter).
    pages_per_chunk: int = 10
    # Upper bound on concurrent work (presumably parallel partition requests; confirm).
    max_concurrent: int = 2
18
+
19
+
20
@runtime_checkable
class ElementSource(Protocol):
    """Port for providers of Unstructured-style element dicts (from a PDF or cached JSON)."""

    def partition_pdf(self, pdf_path: Path | str, opts: PartitionOptions) -> tuple[list[dict[str, Any]], Any]:
        """Partition ``pdf_path`` using ``opts``; return ``(elements, raw_partition_output)``."""
        ...

    def load_json(self, path: Path | str) -> list[dict[str, Any]]:
        """Read already-partitioned elements from the JSON file at ``path``."""
        ...
29
+
30
+
31
@runtime_checkable
class LlmClient(Protocol):
    """Port for LLM completions (the LiteLLM adapter lives in ``chunksmith_core.llm``)."""

    def complete(
        self,
        messages: list[dict[str, Any]],
        *,
        model: str,
        response_format: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> str:
        """Run a completion over ``messages`` with ``model`` and return the reply as a string."""
        ...
43
+
44
+
45
@runtime_checkable
class ProgressObserver(Protocol):
    """Port for receiving pipeline stage events (UI, API or logging consumers)."""

    def on_event(self, event: str, payload: dict[str, Any]) -> None:
        """Handle the stage event named ``event`` together with its ``payload``."""
        ...
50
+
51
+
52
@runtime_checkable
class ArtifactSink(Protocol):
    """Port for optional persistence; default pipeline runners never invoke it."""

    def save_elements(self, elements: list[dict[str, Any]]) -> None:
        """Store the element dicts."""
        ...

    def save_coded_formate(self, text: str) -> None:
        """Store the coded document text."""
        ...

    def save_outline(self, outline: dict[str, Any]) -> None:
        """Store the outline dict."""
        ...

    def save_mapper(self, mapper: dict[str, Any]) -> None:
        """Store the mapper output dict."""
        ...
63
+
64
+
65
@runtime_checkable
class PipelineStorage(ArtifactSink, Protocol):
    """
    Storage port used progressively while a pipeline runs.

    The core pipeline calls ``on_progress`` at each stage and the inherited
    ``save_*`` methods once artifacts are ready. Implementations belong in
    ``chunksmith_adapters`` (S3, Mongo, filesystem).
    """

    def on_progress(self, event: str, payload: dict[str, Any]) -> None:
        """Receive the stage event named ``event`` together with its ``payload``."""
        ...

    def finalize(self) -> None:
        """Finish the storage session (presumably called once at the end of a run; confirm with the pipeline)."""
        ...
77
+
78
+
79
@dataclass
class RunArtifactBundle:
    """Normalized set of run artifacts consumed by the persist helpers."""

    # Element dicts; each instance gets its own empty list by default.
    elements: list[dict[str, Any]] = field(default_factory=list)
    # Coded document text; empty string means "nothing to save".
    coded_formate: str = ""
    # Outline dict; each instance gets its own empty dict by default.
    outline: dict[str, Any] = field(default_factory=dict)
    # Mapper output dict; each instance gets its own empty dict by default.
    mapper: dict[str, Any] = field(default_factory=dict)
@@ -0,0 +1,82 @@
1
+ """User-facing multi-indexing preferences."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import asdict, dataclass, field
6
+ from typing import Any, Literal
7
+
8
# Values accepted by ``MultiIndexingPreferences.mapping_method``.
MappingMethod = Literal[
    "page_indexing",
    "group_indexing",
    "anchor_indexing",
    "chunk_assignment_indexing",
    "title_indexing",
]

# Values accepted by ``MultiIndexingPreferences.llm_context_format``.
LlmContextFormat = Literal[
    "json",
    "toon",
]
17
+
18
+
19
@dataclass
class MultiIndexingPreferences:
    """
    Single place to configure the whole multi-indexing pipeline.

    All fields are plain attributes with defaults. Hand an instance to
    ``build_pipeline_config`` or ``run_multi_indexing_with_preferences``
    (both in ``chunksmith_multimodal``).
    """

    # --- Unstructured partition ---
    languages: str | list[str] = "eng"
    extract_images: bool = True
    pages_per_chunk: int = 10
    max_concurrent: int = 2
    partition_pdf: bool = True

    # --- Group by title ---
    use_group_by_title: bool = True
    max_characters: int = 3000
    # NOTE(review): this default is larger than ``max_characters``; confirm
    # that the consumer of these two values expects that ordering.
    new_after_n_chars: int = 3800
    combine_text_under_n_chars: int = 200
    group_multipage_sections: bool = True

    # --- Mapper ---
    mapping_method: MappingMethod = "page_indexing"
    llm_context_format: LlmContextFormat = "json"
    include_empty_mapper_fields: bool = False
    use_start_anchor_mapping: bool = True
    use_end_anchor_mapping: bool = True

    # --- Coded markup ---
    coded_add_page_xml: bool = True
    coded_add_group_by_title_xml: bool = True
    # Keyed by element type: FigureCaption, Footer, Formula, Header, Image,
    # ListItem, NarrativeText, Table, Text, Title. A key that is left out
    # keeps the package default (True).
    coded_element_xml: dict[str, bool] = field(default_factory=dict)

    # --- LLM page index (indexer) ---
    add_outline_summaries: bool = True
    add_anchor: bool = False
    generate_doc_summary: bool = True
    use_embedded_pdf_toc: bool = True
    off_llm_parsing: bool = False
    probe_first_excerpt: bool = False
    probe_merged_reflection: bool = False
    stream_llm_tokens: bool = False

    # --- LLM models (optional; env defaults when None) ---
    pageindex_model: str | None = None
    llm_model: str | None = None
    max_tokens_per_chunk: int | None = None
    overlap_pages: int | None = None
    litellm_kwargs: dict[str, Any] = field(default_factory=dict)

    # --- Resume / skip stages (advanced) ---
    elements: list[dict[str, Any]] | None = None
    elements_json_path: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return every field as a plain dict, as produced by ``dataclasses.asdict``."""
        as_mapping = asdict(self)
        return as_mapping

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> MultiIndexingPreferences:
        """Build preferences from ``data``, ignoring keys that are not fields of this class."""
        field_names = cls.__dataclass_fields__.keys()  # type: ignore[attr-defined]
        accepted: dict[str, Any] = {}
        for key, value in data.items():
            if key in field_names:
                accepted[key] = value
        return cls(**accepted)
@@ -0,0 +1,13 @@
1
+ Metadata-Version: 2.4
2
+ Name: chunksmith-core
3
+ Version: 0.3.0
4
+ Summary: ChunkSmith core ports, preferences, and in-memory element sources.
5
+ Author-email: AnshulParate2004 <anshulnparate@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/AnshulParate2004/chunksmith-lib
8
+ Project-URL: Repository, https://github.com/AnshulParate2004/chunksmith-lib
9
+ Project-URL: Changelog, https://github.com/AnshulParate2004/chunksmith-lib/blob/main/CHANGELOG.md
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: pydantic>=2.10.0
12
+ Requires-Dist: typing_extensions>=4.0.0
13
+ Requires-Dist: python-dotenv>=1.0.0
@@ -0,0 +1,13 @@
1
+ pyproject.toml
2
+ src/chunksmith_core/__init__.py
3
+ src/chunksmith_core/element_sources.py
4
+ src/chunksmith_core/llm.py
5
+ src/chunksmith_core/output.py
6
+ src/chunksmith_core/persist.py
7
+ src/chunksmith_core/ports.py
8
+ src/chunksmith_core/preferences.py
9
+ src/chunksmith_core.egg-info/PKG-INFO
10
+ src/chunksmith_core.egg-info/SOURCES.txt
11
+ src/chunksmith_core.egg-info/dependency_links.txt
12
+ src/chunksmith_core.egg-info/requires.txt
13
+ src/chunksmith_core.egg-info/top_level.txt
@@ -0,0 +1,3 @@
1
+ pydantic>=2.10.0
2
+ typing_extensions>=4.0.0
3
+ python-dotenv>=1.0.0
@@ -0,0 +1 @@
1
+ chunksmith_core