PyPI - chunksmith-core - Versions diffs - 0.3.0__tar.gz - Mend

chunksmith-core 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

chunksmith_core-0.3.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,13 @@
+Metadata-Version: 2.4
+Name: chunksmith-core
+Version: 0.3.0
+Summary: ChunkSmith core ports, preferences, and in-memory element sources.
+Author-email: AnshulParate2004 <anshulnparate@gmail.com>
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/AnshulParate2004/chunksmith-lib
+Project-URL: Repository, https://github.com/AnshulParate2004/chunksmith-lib
+Project-URL: Changelog, https://github.com/AnshulParate2004/chunksmith-lib/blob/main/CHANGELOG.md
+Requires-Python: >=3.10
+Requires-Dist: pydantic>=2.10.0
+Requires-Dist: typing_extensions>=4.0.0
+Requires-Dist: python-dotenv>=1.0.0

chunksmith_core-0.3.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,25 @@
+[build-system]
+# `license = "MIT"` in [project] is a PEP 639 SPDX expression string. setuptools
+# only accepts that form from 77.0.0 onwards, so the floor must match.
+requires = ["setuptools>=77.0.0", "wheel"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "chunksmith-core"
+version = "0.3.0"
+description = "ChunkSmith core ports, preferences, and in-memory element sources."
+requires-python = ">=3.10"
+license = "MIT"
+authors = [{ name = "AnshulParate2004", email = "anshulnparate@gmail.com" }]
+dependencies = [
+    "pydantic>=2.10.0",
+    "typing_extensions>=4.0.0",
+    "python-dotenv>=1.0.0",
+]
+[project.urls]
+Homepage = "https://github.com/AnshulParate2004/chunksmith-lib"
+Repository = "https://github.com/AnshulParate2004/chunksmith-lib"
+Changelog = "https://github.com/AnshulParate2004/chunksmith-lib/blob/main/CHANGELOG.md"
+[tool.setuptools.packages.find]
+# src layout: only the chunksmith_core package and its subpackages are shipped.
+where = ["src"]
+include = ["chunksmith_core", "chunksmith_core.*"]

chunksmith_core-0.3.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

chunksmith_core-0.3.0/src/chunksmith_core/__init__.py ADDED Viewed

@@ -0,0 +1,28 @@
+"""Shared ports, types, and adapters for ChunkSmith library packages."""
+from chunksmith_core.element_sources import InMemoryElementSource
+from chunksmith_core.output import summarize_multi_indexing_result
+from chunksmith_core.preferences import MultiIndexingPreferences
+from chunksmith_core.ports import (
+    ArtifactSink,
+    ElementSource,
+    LlmClient,
+    PartitionOptions,
+    PipelineStorage,
+    ProgressObserver,
+)
+from chunksmith_core.persist import persist_multi_indexing_artifacts, persist_pageindex_outline
+# Public API of the package: every name listed here is imported above from a
+# chunksmith_core submodule.
+__all__ = [
+    "ArtifactSink",
+    "ElementSource",
+    "InMemoryElementSource",
+    "LlmClient",
+    "MultiIndexingPreferences",
+    "PartitionOptions",
+    "PipelineStorage",
+    "ProgressObserver",
+    "persist_multi_indexing_artifacts",
+    "persist_pageindex_outline",
+    "summarize_multi_indexing_result",
+]

chunksmith_core-0.3.0/src/chunksmith_core/element_sources.py ADDED Viewed

@@ -0,0 +1,26 @@
+"""ElementSource adapters for in-memory / JSON elements."""
+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Any
+from chunksmith_core.ports import PartitionOptions
+class InMemoryElementSource:
+    """Element source backed by a caller-supplied list; no PDF is ever partitioned.
+    The constructor keeps a shallow copy of ``elements`` and ``partition_pdf`` hands out
+    fresh list copies, so neither side can change the other's list by accident. The
+    element dicts themselves are shared, not deep-copied.
+    """
+    def __init__(self, elements: list[dict[str, Any]]) -> None:
+        # Shallow copy: later changes to the caller's list must not leak into this source.
+        self._elements = list(elements)
+    def partition_pdf(self, pdf_path: Path | str, opts: PartitionOptions) -> tuple[list[dict[str, Any]], Any]:
+        """Return ``(elements, raw_partition_output)`` built from the stored elements.
+        ``pdf_path`` and ``opts`` are accepted for interface compatibility and ignored.
+        Both tuple members are independent list copies of the stored elements.
+        """
+        return list(self._elements), list(self._elements)
+    def load_json(self, path: Path | str) -> list[dict[str, Any]]:
+        """Read a UTF-8 JSON file and return its content as a list of element dicts.
+        Raises:
+            ValueError: if the top-level JSON value is not a list, or any item is not a dict.
+        """
+        raw = Path(path).read_text(encoding="utf-8")
+        data = json.loads(raw)
+        # Check the items too, so the error message matches what is actually enforced.
+        if not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
+            raise ValueError("elements JSON must be a list of dicts")
+        return data

chunksmith_core-0.3.0/src/chunksmith_core/llm.py ADDED Viewed

@@ -0,0 +1,48 @@
+"""Default LiteLLM-backed LlmClient adapter."""
+from __future__ import annotations
+from typing import Any
+from chunksmith_core.ports import LlmClient
+class LiteLLMClient:
+    """LlmClient adapter that sends chat messages through langchain-litellm's ``ChatLiteLLM``."""
+    def __init__(self, **litellm_kwargs: Any) -> None:
+        # Copied so per-call overrides can be layered on top without touching these defaults.
+        self._litellm_kwargs = dict(litellm_kwargs)
+    def complete(
+        self,
+        messages: list[dict[str, Any]],
+        *,
+        model: str,
+        response_format: dict[str, Any] | None = None,
+        **kwargs: Any,
+    ) -> str:
+        """Run one chat completion and return the reply text.
+        ``messages`` are role/content dicts. "system" and "assistant" roles map to the
+        matching LangChain message classes; any other or missing role is sent as a human
+        message, and missing content becomes an empty string. Keyword precedence, lowest
+        to highest: constructor kwargs, per-call ``kwargs``, then the explicit ``model``
+        and ``response_format`` arguments.
+        """
+        # Imported lazily: these packages are not among this distribution's declared
+        # dependencies, so they are only required when a completion is actually run.
+        from langchain_litellm import ChatLiteLLM
+        from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
+        llm_kwargs = {**self._litellm_kwargs, **kwargs}
+        if response_format is not None:
+            llm_kwargs["response_format"] = response_format
+        # Put ``model`` into the dict rather than passing it next to ``**llm_kwargs``:
+        # a "model" key among the constructor kwargs would otherwise raise TypeError
+        # (multiple values for keyword argument 'model').
+        llm_kwargs["model"] = model
+        llm = ChatLiteLLM(**llm_kwargs)
+        lc_messages = []
+        for msg in messages:
+            role = (msg.get("role") or "user").lower()
+            content = msg.get("content") or ""
+            if role == "system":
+                lc_messages.append(SystemMessage(content=content))
+            elif role == "assistant":
+                lc_messages.append(AIMessage(content=content))
+            else:
+                lc_messages.append(HumanMessage(content=content))
+        response = llm.invoke(lc_messages)
+        # Fall back to the response object itself when it has no ``content`` attribute.
+        return str(getattr(response, "content", response) or "")
+def default_llm_client(**litellm_kwargs: Any) -> LlmClient:
+    """Build the default ``LlmClient``: a ``LiteLLMClient`` holding the given keyword arguments."""
+    client = LiteLLMClient(**litellm_kwargs)
+    return client

chunksmith_core-0.3.0/src/chunksmith_core/output.py ADDED Viewed

@@ -0,0 +1,44 @@
+"""Describe pipeline output artifacts for callers."""
+from __future__ import annotations
+from typing import Any
+def summarize_multi_indexing_result(result: Any) -> dict[str, Any]:
+    """
+    Human-readable summary of ``MultiIndexingPipelineResult`` without dumping full payloads.
+    Use this to inspect output shape before persisting to Supabase, Mongo, S3, etc.
+    Every attribute is read with ``getattr`` and a fallback, so a partially populated
+    result (or any object at all) yields a summary instead of raising.
+    Returns a dict with counts (``element_count``, ``chunk_count``, ``outline_node_count``,
+    ``coded_formate_chars``), outline/mapper metadata, and an ``artifacts`` legend describing
+    each result attribute. ``chunk_count`` is ``None`` when there are no chunks.
+    """
+    outline = dict(getattr(result, "title_outline", None) or {})
+    mapper = dict(getattr(result, "mapper_output", None) or {})
+    structure = outline.get("structure") or []
+    def _count_nodes(nodes: list) -> int:
+        # Iterative walk with an explicit stack, so deep outlines cannot hit the recursion limit.
+        total = 0
+        pending = list(nodes)
+        while pending:
+            node = pending.pop()
+            total += 1
+            # Only dict nodes can carry children under "nodes"; anything else is counted
+            # as a single entry rather than crashing on ``.get``.
+            child = node.get("nodes") if isinstance(node, dict) else None
+            if isinstance(child, list):
+                pending.extend(child)
+        return total
+    return {
+        "doc_name": outline.get("doc_name"),
+        "doc_description": outline.get("doc_description"),
+        "mapping_method": mapper.get("mapping_method") or outline.get("mapping_method"),
+        "element_count": len(getattr(result, "elements", None) or []),
+        "chunk_count": len(getattr(result, "chunks_local", None) or []) or None,
+        "outline_node_count": _count_nodes(structure) if isinstance(structure, list) else 0,
+        "coded_formate_chars": len(getattr(result, "title_coded_formate", "") or ""),
+        "mapper_keys": sorted(mapper.keys()) if mapper else [],
+        "artifacts": {
+            "elements": "list[dict] — Unstructured element rows",
+            "title_coded_formate": "str — XML coded document for LLM",
+            "title_outline": "dict — doc_name, doc_description, structure (nested tree)",
+            "mapper_output": "dict — combined index + mapping report",
+            "chunks_local": "list[TitleChunk] | None — title groups when enabled",
+            "raw_partition_output": "Any — raw Unstructured API payload",
+        },
+    }

chunksmith_core-0.3.0/src/chunksmith_core/persist.py ADDED Viewed

@@ -0,0 +1,34 @@
+"""Opt-in artifact persistence helpers (library pipelines stay in-memory)."""
+from __future__ import annotations
+from typing import Any
+from chunksmith_core.ports import ArtifactSink, RunArtifactBundle
+def persist_multi_indexing_artifacts(result: Any, sink: ArtifactSink) -> None:
+    """Persist a MultiIndexingPipelineResult via an ArtifactSink.
+    Each artifact (``elements``, ``title_coded_formate``, ``title_outline``,
+    ``mapper_output``) is saved only when the attribute exists on ``result`` and is
+    truthy. Missing or empty artifacts are skipped. Containers are copied before being
+    handed to the sink.
+    """
+    # Read ``elements`` with getattr like the other artifacts, so a result object
+    # without that attribute is skipped instead of raising AttributeError.
+    elements = getattr(result, "elements", None)
+    if elements:
+        sink.save_elements(list(elements))
+    coded_formate = getattr(result, "title_coded_formate", None)
+    if coded_formate:
+        sink.save_coded_formate(str(coded_formate))
+    outline = getattr(result, "title_outline", None)
+    if outline:
+        sink.save_outline(dict(outline))
+    mapper = getattr(result, "mapper_output", None)
+    if mapper:
+        sink.save_mapper(dict(mapper))
+def persist_pageindex_outline(outline: dict[str, Any], sink: ArtifactSink) -> None:
+    """Hand a shallow copy of ``outline`` to ``sink.save_outline``."""
+    outline_copy = dict(outline)
+    sink.save_outline(outline_copy)
+def persist_bundle(bundle: RunArtifactBundle, sink: ArtifactSink) -> None:
+    """Save every non-empty artifact of ``bundle`` through the matching ``sink.save_*`` method.
+    Order is elements, coded_formate, outline, mapper; empty (falsy) artifacts are skipped.
+    """
+    # (bundle attribute, sink method) pairs. Both are looked up lazily, so the sink is
+    # only touched for artifacts that are actually present.
+    routes = (
+        ("elements", "save_elements"),
+        ("coded_formate", "save_coded_formate"),
+        ("outline", "save_outline"),
+        ("mapper", "save_mapper"),
+    )
+    for attr_name, saver_name in routes:
+        payload = getattr(bundle, attr_name)
+        if payload:
+            getattr(sink, saver_name)(payload)

chunksmith_core-0.3.0/src/chunksmith_core/ports.py ADDED Viewed

@@ -0,0 +1,86 @@
+"""Hexagonal ports — core library never imports Supabase, Mongo, S3, or FastAPI."""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Any, Protocol, runtime_checkable
+@dataclass
+class PartitionOptions:
+    """Options passed to an ElementSource when partitioning a PDF."""
+    # A single language code or a list of codes; defaults to "eng".
+    # NOTE(review): presumably OCR language codes forwarded to the partitioner — confirm.
+    languages: str | list[str] = "eng"
+    # NOTE(review): looks like a switch for image extraction during partitioning — confirm.
+    extract_images: bool = True
+    # NOTE(review): presumably the number of PDF pages sent per partition request — confirm.
+    pages_per_chunk: int = 10
+    # NOTE(review): presumably the cap on partition requests running in parallel — confirm.
+    max_concurrent: int = 2
+# runtime_checkable makes isinstance() work against this protocol; such checks only
+# verify that the methods exist, not their signatures.
+@runtime_checkable
+class ElementSource(Protocol):
+    """Supply Unstructured-style element dicts from a PDF or cached JSON."""
+    # Both members are declarations only: the docstring is the whole body, and
+    # implementations supply the behaviour.
+    def partition_pdf(self, pdf_path: Path | str, opts: PartitionOptions) -> tuple[list[dict[str, Any]], Any]:
+        """Return (elements, raw_partition_output)."""
+    def load_json(self, path: Path | str) -> list[dict[str, Any]]:
+        """Load pre-partitioned elements from a JSON file."""
+@runtime_checkable
+class LlmClient(Protocol):
+    """LLM completion port (LiteLLM adapter lives in chunksmith_core.llm)."""
+    # Takes a list of message dicts and returns the completion as a plain string.
+    # ``model`` and ``response_format`` are keyword-only; any further keyword
+    # arguments are collected in ``kwargs`` for the implementation to interpret.
+    def complete(
+        self,
+        messages: list[dict[str, Any]],
+        *,
+        model: str,
+        response_format: dict[str, Any] | None = None,
+        **kwargs: Any,
+    ) -> str: ...
+@runtime_checkable
+class ProgressObserver(Protocol):
+    """Observe pipeline stage events (UI/API/logging)."""
+    # Receives an event name plus a payload dict and returns nothing.
+    # NOTE(review): event names and payload keys are not defined in this module — confirm against the pipeline emitting them.
+    def on_event(self, event: str, payload: dict[str, Any]) -> None: ...
+@runtime_checkable
+class ArtifactSink(Protocol):
+    """Optional persistence — never invoked by default pipeline runners."""
+    # One save method per artifact kind; each takes the artifact and returns nothing.
+    # The helpers in chunksmith_core.persist call these methods.
+    def save_elements(self, elements: list[dict[str, Any]]) -> None: ...
+    def save_coded_formate(self, text: str) -> None: ...
+    def save_outline(self, outline: dict[str, Any]) -> None: ...
+    def save_mapper(self, mapper: dict[str, Any]) -> None: ...
+@runtime_checkable
+class PipelineStorage(ArtifactSink, Protocol):
+    """
+    Progressive storage during pipeline runs.
+    Core pipeline calls ``on_progress`` at each stage and ``save_*`` when artifacts
+    are ready. Implement in ``chunksmith_adapters`` (S3, Mongo, filesystem).
+    """
+    # Extends ArtifactSink, so implementations also provide the four save_* methods.
+    def on_progress(self, event: str, payload: dict[str, Any]) -> None: ...
+    # NOTE(review): presumably called once after the run to flush or close the storage — confirm with the pipeline runner.
+    def finalize(self) -> None: ...
+@dataclass
+class RunArtifactBundle:
+    """Normalized artifact set for persist helpers."""
+    # Every field defaults to an empty value; mutable defaults use default_factory so
+    # instances never share a list or dict. persist_bundle skips fields left empty.
+    elements: list[dict[str, Any]] = field(default_factory=list)
+    coded_formate: str = ""
+    outline: dict[str, Any] = field(default_factory=dict)
+    mapper: dict[str, Any] = field(default_factory=dict)

chunksmith_core-0.3.0/src/chunksmith_core/preferences.py ADDED Viewed

@@ -0,0 +1,82 @@
+"""User-facing multi-indexing preferences."""
+from __future__ import annotations
+from dataclasses import asdict, dataclass, field
+from typing import Any, Literal
+# Closed set of mapper strategy names accepted by ``mapping_method``.
+MappingMethod = Literal[
+    "page_indexing",
+    "group_indexing",
+    "anchor_indexing",
+    "chunk_assignment_indexing",
+    "title_indexing",
+]
+# Closed set of names accepted by ``llm_context_format``.
+LlmContextFormat = Literal["json", "toon"]
+@dataclass
+class MultiIndexingPreferences:
+    """
+    Configure the full multi-indexing pipeline in one place.
+    Every field is set directly. Pass to ``build_pipeline_config`` or
+    ``run_multi_indexing_with_preferences`` (``chunksmith_multimodal``).
+    """
+    # --- Unstructured partition ---
+    languages: str | list[str] = "eng"
+    extract_images: bool = True
+    pages_per_chunk: int = 10
+    max_concurrent: int = 2
+    partition_pdf: bool = True
+    # --- Group by title ---
+    use_group_by_title: bool = True
+    max_characters: int = 3000
+    # NOTE(review): 3800 exceeds max_characters (3000). If these map onto Unstructured's
+    # chunking parameters of the same names, this soft limit can never trigger before
+    # the hard limit — confirm the default is intended.
+    new_after_n_chars: int = 3800
+    combine_text_under_n_chars: int = 200
+    group_multipage_sections: bool = True
+    # --- Mapper ---
+    mapping_method: MappingMethod = "page_indexing"
+    llm_context_format: LlmContextFormat = "json"
+    include_empty_mapper_fields: bool = False
+    use_start_anchor_mapping: bool = True
+    use_end_anchor_mapping: bool = True
+    # --- Coded markup ---
+    coded_add_page_xml: bool = True
+    coded_add_group_by_title_xml: bool = True
+    # Per element type: FigureCaption, Footer, Formula, Header, Image, ListItem,
+    # NarrativeText, Table, Text, Title. Omit keys to keep package default (True).
+    coded_element_xml: dict[str, bool] = field(default_factory=dict)
+    # --- LLM page index (indexer) ---
+    add_outline_summaries: bool = True
+    add_anchor: bool = False
+    generate_doc_summary: bool = True
+    use_embedded_pdf_toc: bool = True
+    off_llm_parsing: bool = False
+    probe_first_excerpt: bool = False
+    probe_merged_reflection: bool = False
+    stream_llm_tokens: bool = False
+    # --- LLM models (optional; env defaults when None) ---
+    pageindex_model: str | None = None
+    llm_model: str | None = None
+    max_tokens_per_chunk: int | None = None
+    overlap_pages: int | None = None
+    litellm_kwargs: dict[str, Any] = field(default_factory=dict)
+    # --- Resume / skip stages (advanced) ---
+    elements: list[dict[str, Any]] | None = None
+    elements_json_path: str | None = None
+    def to_dict(self) -> dict[str, Any]:
+        """Return all fields as a plain dict (``dataclasses.asdict`` copies nested containers)."""
+        snapshot = asdict(self)
+        return snapshot
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> MultiIndexingPreferences:
+        """Build preferences from ``data``, silently dropping keys that are not dataclass fields."""
+        # ``__dataclass_fields__`` is keyed by field name, so membership is a name check.
+        declared = cls.__dataclass_fields__  # type: ignore[attr-defined]
+        accepted: dict[str, Any] = {}
+        for key, value in data.items():
+            if key in declared:
+                accepted[key] = value
+        return cls(**accepted)

chunksmith_core-0.3.0/src/chunksmith_core.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,13 @@
+Metadata-Version: 2.4
+Name: chunksmith-core
+Version: 0.3.0
+Summary: ChunkSmith core ports, preferences, and in-memory element sources.
+Author-email: AnshulParate2004 <anshulnparate@gmail.com>
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/AnshulParate2004/chunksmith-lib
+Project-URL: Repository, https://github.com/AnshulParate2004/chunksmith-lib
+Project-URL: Changelog, https://github.com/AnshulParate2004/chunksmith-lib/blob/main/CHANGELOG.md
+Requires-Python: >=3.10
+Requires-Dist: pydantic>=2.10.0
+Requires-Dist: typing_extensions>=4.0.0
+Requires-Dist: python-dotenv>=1.0.0

chunksmith_core-0.3.0/src/chunksmith_core.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,13 @@
+pyproject.toml
+src/chunksmith_core/__init__.py
+src/chunksmith_core/element_sources.py
+src/chunksmith_core/llm.py
+src/chunksmith_core/output.py
+src/chunksmith_core/persist.py
+src/chunksmith_core/ports.py
+src/chunksmith_core/preferences.py
+src/chunksmith_core.egg-info/PKG-INFO
+src/chunksmith_core.egg-info/SOURCES.txt
+src/chunksmith_core.egg-info/dependency_links.txt
+src/chunksmith_core.egg-info/requires.txt
+src/chunksmith_core.egg-info/top_level.txt

chunksmith_core-0.3.0/src/chunksmith_core.egg-info/dependency_links.txt ADDED Viewed

@@ -0,0 +1 @@
+

chunksmith_core-0.3.0/src/chunksmith_core.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,3 @@
+pydantic>=2.10.0
+typing_extensions>=4.0.0
+python-dotenv>=1.0.0

chunksmith_core-0.3.0/src/chunksmith_core.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+chunksmith_core