PyPI - fancychunk - Versions diffs - 0.1.0__py3-none-any.whl - Mend

fancychunk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

fancychunk/__init__.py +57 -0
fancychunk/_constants.py +30 -0
fancychunk/_markdown.py +169 -0
fancychunk/_segmenter.py +117 -0
fancychunk/_telemetry.py +52 -0
fancychunk/_typing.py +13 -0
fancychunk/chunklets.py +310 -0
fancychunk/chunks.py +256 -0
fancychunk/errors.py +59 -0
fancychunk/headings.py +90 -0
fancychunk/late_chunking.py +343 -0
fancychunk/py.typed +0 -0
fancychunk/sentences.py +263 -0
fancychunk-0.1.0.dist-info/METADATA +300 -0
fancychunk-0.1.0.dist-info/RECORD +17 -0
fancychunk-0.1.0.dist-info/WHEEL +4 -0
fancychunk-0.1.0.dist-info/licenses/LICENSE +21 -0

fancychunk/__init__.py ADDED Viewed

@@ -0,0 +1,57 @@
+"""fancychunk — text chunking for retrieval-augmented generation.
+Behavioral specs live in ``docs/specs/``; this package implements the
+three required pipeline stages and the two optional helpers documented
+in ``docs/specs/contracts/public-api.md``.
+"""
+from __future__ import annotations
+from . import _constants as constants
+from .chunklets import split_chunklets
+from .chunks import split_chunks
+from .errors import (
+    FancyChunkError,
+    OptimizationFailedError,
+    OversizedChunkletError,
+    OversizedSentenceError,
+    SegmenterError,
+    SentenceExceedsContextError,
+    UnsplittableDocumentError,
+    ValidationError,
+    ZeroNormEmbeddingError,
+)
+from .headings import heading_paths
+from .late_chunking import SegmentEmbedder, embed_with_late_chunking
+from ._segmenter import SaTSegmenter, SentenceSegmenter, punctuation_segmenter
+from .sentences import split_sentences
+__all__ = [
+    "split_sentences",
+    "split_chunklets",
+    "split_chunks",
+    "embed_with_late_chunking",
+    "heading_paths",
+    "SaTSegmenter",
+    "SentenceSegmenter",
+    "punctuation_segmenter",
+    "SegmentEmbedder",
+    "FancyChunkError",
+    "ValidationError",
+    "UnsplittableDocumentError",
+    "OversizedSentenceError",
+    "OversizedChunkletError",
+    "ZeroNormEmbeddingError",
+    "SentenceExceedsContextError",
+    "OptimizationFailedError",
+    "SegmenterError",
+    "constants",
+]
+# Resolve the installed distribution's version at import time. The
+# lookup is deliberately best-effort: any failure (including the import
+# itself) falls back to a placeholder string, so importing the package
+# never fails on this step.
+try:
+    from importlib.metadata import version as _pkg_version
+    __version__ = _pkg_version("fancychunk")
+except Exception:
+    # Source checkout or build-time call before metadata exists.
+    __version__ = "0.0.0+unknown"

fancychunk/_constants.py ADDED Viewed

@@ -0,0 +1,30 @@
+"""Named constants from the specs."""
+from __future__ import annotations
+# NOTE(review): the grouping comments below are inferred from the
+# constant names and values only; the authoritative definitions live in
+# the specs and should be confirmed there.
+# Default size limit, measured in characters (per the name).
+DEFAULT_MAX_SIZE_CHARS = 2048
+# Boundary-score threshold and statement-count tuning parameters.
+BOUNDARY_SCORE_THRESHOLD = 0.25
+TARGET_STATEMENTS_PER_CHUNKLET = 3
+STATEMENT_COST_FLOOR = 1e-6
+STATEMENT_COST_SCALE = 0.5
+MIN_Q25_WORDS = 1.0
+STATEMENTS_AT_Q25 = 0.75
+QUARTILE_GAP_STATEMENTS = 0.50
+# Boundary strength per Markdown block type, ordered from the largest
+# value (heading) down to the smallest (list).
+BOUNDARY_STRENGTH_HEADING = 1.00
+BOUNDARY_STRENGTH_BLOCKQUOTE = 0.75
+BOUNDARY_STRENGTH_PARAGRAPH = 0.50
+BOUNDARY_STRENGTH_LIST = 0.25
+# Lower/upper quantiles used to describe a "typical" chunklet.
+TYPICAL_CHUNKLET_LOWER_QUANTILE = 0.15
+TYPICAL_CHUNKLET_UPPER_QUANTILE = 0.85
+# Heading-related split parameters and the default preamble fraction.
+HEADING_SPLIT_BEFORE_DIVISOR = 4.0
+HEADING_SPLIT_AFTER_FORBID = 1.0
+DEFAULT_PREAMBLE_FRACTION = 0.382
+# Number of heading levels and the separator (a newline) used for
+# heading paths.
+MAX_HEADING_LEVELS = 6
+HEADING_PATH_SEPARATOR = "\n"

fancychunk/_markdown.py ADDED Viewed

@@ -0,0 +1,169 @@
+"""Helpers that interpret a Markdown document via markdown-it-py.
+Two queries the splitter stages need against the parsed token stream:
+* ``heading_spans`` — character spans of ATX/Setext heading content, used
+  by stage 1 to override boundary probabilities.
+* ``openers_by_line`` — for each source line, which block-level
+  token types open on that line, used by stage 2 to assign per-sentence
+  boundary probabilities.
+Both helpers operate on the original document string; line numbers from
+markdown-it tokens are 0-indexed into a list of lines split on ``\n``.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from markdown_it import MarkdownIt
+@dataclass(frozen=True)
+class HeadingSpan:
+    """Character span of a heading's marker through its last
+    non-whitespace text character.
+    ``first`` is the position of the heading marker's first character
+    (e.g. the ``#``); ``last`` is the index of the last non-whitespace
+    character on the heading's text line(s). Both are inclusive.
+    Instances are immutable (``frozen=True``); both fields are offsets
+    into the original document string.
+    """
+    # Inclusive character offset at which the span begins.
+    first: int
+    # Inclusive character offset at which the span ends.
+    last: int
+def _line_starts(document: str) -> list[int]:
+    """Return the character offset of each line's first character.
+    Index ``i`` is the offset of line ``i`` (0-indexed). An empty
+    document yields a single offset 0.
+    """
+    starts: list[int] = [0]
+    for idx, ch in enumerate(document):
+        if ch == "\n":
+            starts.append(idx + 1)
+    return starts
+# Module-level CommonMark parser. ``markdown-it-py`` parsing is
+# reentrant, so a single shared instance is safe and avoids the
+# per-call construction overhead.
+_PARSER = MarkdownIt("commonmark")
+def _parser() -> MarkdownIt:
+    """Return the shared module-level ``MarkdownIt`` instance."""
+    return _PARSER
+def heading_spans(document: str) -> list[HeadingSpan]:
+    """Return character spans for every ATX/Setext heading in ``document``.
+    One span is produced per ``heading_open`` token that carries a line
+    map, in token order. A span starts at the first character that is
+    not a space or tab on the line where the heading token opens, and
+    ends at the last non-whitespace character of the token's final
+    mapped line. Trailing whitespace and following blank lines are not
+    part of the span. An empty ``document`` yields an empty list.
+    NOTE(review): for Setext headings the token's final mapped line is
+    the underline line (see the comment in the body), so ``last`` lands
+    on the underline's last character rather than on the heading text.
+    Confirm which of the two the spec intends.
+    NOTE(review): the original notes state that headings with empty
+    text bodies (e.g. ``# \\n``) are returned with ``first == last + 1``
+    so callers can apply the SPEC-CHUNK-108 edge case. Because the
+    marker character itself is non-whitespace, the backward scan appears
+    to stop on the marker (giving ``first == last``) for such input.
+    Verify against the callers before relying on either form.
+    NOTE(review): ``first`` is derived from the start of the source
+    line, so for a heading nested in a container (e.g. ``> # Title``)
+    it presumably points at the container marker, not the ``#``.
+    """
+    if not document:
+        return []
+    md = _parser()
+    tokens = md.parse(document)
+    line_starts = _line_starts(document)
+    n = len(document)
+    spans: list[HeadingSpan] = []
+    for tok in tokens:
+        # Only heading openers carry the line map we need; tokens
+        # without a map cannot be located in the source.
+        if tok.type != "heading_open":
+            continue
+        if tok.map is None:
+            continue
+        start_line, end_line = tok.map  # half-open
+        # Character window: from the marker's start through end of the
+        # heading content. For ATX, both start_line == end_line-1
+        # typically; for Setext, end_line - 1 is the underline line.
+        first = line_starts[start_line]
+        # Trim leading whitespace on the start line to find the marker
+        while first < n and document[first] in (" ", "\t"):
+            first += 1
+        # Span end: scan back from end_line-1's terminating newline
+        # over whitespace to find last non-whitespace character.
+        end_line_idx = end_line - 1
+        if end_line_idx + 1 < len(line_starts):
+            end_pos = line_starts[end_line_idx + 1] - 1  # newline index
+        else:
+            # Final line of the document: there is no following line
+            # start, so begin the scan at the last character.
+            end_pos = n - 1
+        # If line ends without newline, end_pos may equal n-1 already.
+        # Walk back from end_pos to the last non-whitespace char.
+        last = end_pos
+        # The ``last >= first`` guard keeps the scan from running past
+        # the start of the span.
+        while last >= first and document[last] in (" ", "\t", "\n", "\r"):
+            last -= 1
+        spans.append(HeadingSpan(first=first, last=last))
+    return spans
+@dataclass(frozen=True)
+class LineOpeners:
+    """The set of block-level token types that open on a given source
+    line (0-indexed).
+    Immutable record (``frozen=True``).
+    NOTE(review): nothing in the portion of this module shown here
+    constructs or returns this class (``openers_by_line`` returns a
+    plain ``dict``); confirm whether it is used elsewhere.
+    """
+    # 0-indexed source line number.
+    line: int
+    # Token type names opening on that line.
+    types: tuple[str, ...]
+_RELEVANT_OPEN_TYPES = frozenset(
+    {
+        "heading_open",
+        "blockquote_open",
+        "paragraph_open",
+        "bullet_list_open",
+        "ordered_list_open",
+    }
+)
+def openers_by_line(document: str) -> dict[int, set[str]]:
+    """For each source line, which relevant block-level openers begin there.
+    Only the token types listed in SPEC-CHUNK-240 are tracked. Lines
+    with no relevant opener are simply absent from the result.
+    """
+    out: dict[int, set[str]] = {}
+    if not document:
+        return out
+    md = _parser()
+    tokens = md.parse(document)
+    for tok in tokens:
+        if tok.type not in _RELEVANT_OPEN_TYPES:
+            continue
+        if tok.map is None:
+            continue
+        line = tok.map[0]
+        out.setdefault(line, set()).add(tok.type)
+    return out
+def line_of_offset(line_starts: list[int], offset: int) -> int:
+    """Return the 0-indexed line containing ``offset``.
+    ``line_starts`` must be sorted ascending (e.g. produced by
+    :func:`compute_line_starts`).
+    """
+    lo, hi = 0, len(line_starts) - 1
+    while lo < hi:
+        mid = (lo + hi + 1) // 2
+        if line_starts[mid] <= offset:
+            lo = mid
+        else:
+            hi = mid - 1
+    return lo
+def compute_line_starts(document: str) -> list[int]:
+    """Public wrapper around the internal ``_line_starts`` helper.
+    Returns the character offset at which each line of ``document``
+    begins, exactly as ``_line_starts`` computes it.
+    """
+    return _line_starts(document)

fancychunk/_segmenter.py ADDED Viewed

@@ -0,0 +1,117 @@
+"""Sentence-segmentation model interface and default implementations.
+Two segmenters are bundled:
+* :class:`SaTSegmenter` (the default) wraps a Segment Any Text (SaT)
+  model from `wtpsplit-lite` and returns per-character boundary
+  probabilities exactly as SPEC-CHUNK-106 prescribes. The 408 MB
+  ``sat-3l-sm`` weights download lazily on first call so importing
+  ``fancychunk`` stays cheap.
+* :func:`punctuation_segmenter` is a no-dependencies fallback that
+  marks ``.``/``!``/``?`` followed by whitespace or end-of-document.
+Either is a valid SentenceSegmenter; callers may pass their own
+through the keyword-only ``segmenter`` parameter on
+``split_sentences``.
+"""
+from __future__ import annotations
+from typing import TYPE_CHECKING, Callable, Protocol
+import numpy as np
+from ._typing import Vector
+from .errors import SegmenterError
+if TYPE_CHECKING:
+    from wtpsplit_lite import SaT
+class SentenceSegmenter(Protocol):
+    """Callable mapping a document to a per-character boundary probability.
+    The returned array has length ``len(document)`` and dtype
+    convertible to ``float64``.
+    This is a structural (``Protocol``) type: any callable with a
+    matching ``__call__`` signature satisfies it, without subclassing.
+    """
+    # Takes the document text; returns one probability per character.
+    def __call__(self, document: str) -> Vector: ...
+_TERMINATORS = frozenset({".", "!", "?"})
+_DEFAULT_BOUNDARY_PROB = 0.9
+def punctuation_segmenter(document: str) -> Vector:
+    """Rule-based fallback segmenter.
+    Assigns ``_DEFAULT_BOUNDARY_PROB`` at every character that is a
+    sentence-final terminator (``.!?``) followed by whitespace or end
+    of document; ``0.0`` elsewhere. Crude but adequate for tests that
+    don't need real model output.
+    """
+    n = len(document)
+    probs: Vector = np.zeros(n, dtype=np.float64)
+    for i, ch in enumerate(document):
+        if ch in _TERMINATORS:
+            if i == n - 1 or document[i + 1].isspace():
+                probs[i] = _DEFAULT_BOUNDARY_PROB
+    return probs
+_DEFAULT_SAT_MODEL = "sat-3l-sm"
+class SaTSegmenter:
+    """SPEC-CHUNK-106 segmenter backed by wtpsplit-lite's SaT model.
+    The 408 MB ``sat-3l-sm`` weights are downloaded by Hugging Face on
+    first use (subsequent calls use the cache); the import itself is
+    cheap because the model is only loaded on the first
+    ``__call__``. Instances are reusable and thread-safe for read
+    (wtpsplit-lite's ONNX backend is itself reentrant).
+    """
+    def __init__(self, model_name: str = _DEFAULT_SAT_MODEL) -> None:
+        self.model_name: str = model_name
+        self._sat: SaT | None = None
+    def _ensure_loaded(self) -> SaT:
+        if self._sat is None:
+            # Local import keeps ``import fancychunk`` lightweight even
+            # when the SaT weights aren't yet cached.
+            from wtpsplit_lite import SaT as _SaT
+            self._sat = _SaT(self.model_name)
+        return self._sat
+    def __call__(self, document: str) -> Vector:
+        sat = self._ensure_loaded()
+        raw = sat.predict_proba(document)
+        arr = np.asarray(raw, dtype=np.float64)
+        if arr.ndim != 1 or arr.shape[0] != len(document):
+            raise SegmenterError(
+                f"SaT returned shape {arr.shape}; expected ({len(document)},)"
+            )
+        return arr
+# Module-level default singleton (lazy weight load on first call).
+_default_segmenter: SaTSegmenter | None = None
+def get_default_segmenter() -> SaTSegmenter:
+    """Return the process-wide default segmenter (the SaT singleton)."""
+    global _default_segmenter
+    if _default_segmenter is None:
+        _default_segmenter = SaTSegmenter()
+    return _default_segmenter
+def make_segmenter(
+    segmenter: SentenceSegmenter | None,
+) -> Callable[[str], Vector]:
+    """Resolve ``segmenter`` to a callable, defaulting to SaT."""
+    if segmenter is None:
+        return get_default_segmenter()
+    return segmenter

fancychunk/_telemetry.py ADDED Viewed

@@ -0,0 +1,52 @@
+"""Module-shared tracer and logger.
+The library uses ``opentelemetry-api`` for tracing. With no SDK
+configured the spans created here are zero-cost no-ops; once the
+caller's application installs an SDK and exporter the same spans
+appear in their trace backend.
+Naming conventions for span attributes follow OpenTelemetry's
+recommendation: lowercase dotted strings, scoped under
+``fancychunk.<stage>.<attribute>``. Counts and lengths are integers;
+durations are not set explicitly (the SDK measures them).
+"""
+from __future__ import annotations
+import logging
+from opentelemetry import trace
+from opentelemetry.trace import Tracer
+_INSTRUMENTATION_NAME = "fancychunk"
+def _instrumentation_version() -> str:
+    """Best-effort package version, used as the tracer's library version."""
+    try:
+        from importlib.metadata import version
+        return version("fancychunk")
+    except Exception:
+        return "0.0.0+unknown"
+def get_tracer() -> Tracer:
+    """Return the module-shared tracer.
+    Re-resolved each call so that a caller installing an SDK *after*
+    importing fancychunk still sees their spans (OpenTelemetry's API
+    is designed for this — the underlying ``ProxyTracer`` delegates
+    dynamically).
+    The tracer is requested under the instrumentation name
+    ``"fancychunk"`` with the best-effort package version; note that
+    the version lookup is also repeated on every call.
+    """
+    return trace.get_tracer(_INSTRUMENTATION_NAME, _instrumentation_version())
+def get_logger() -> logging.Logger:
+    """Return the library logger (the logger named ``"fancychunk"``).
+    This function attaches no handler and sets no level. In an
+    application that has not configured logging, Python's last-resort
+    handler still writes records at ``WARNING`` and above to stderr,
+    while ``INFO``/``DEBUG`` records are dropped. To see lower-level
+    messages the application must configure a handler as well as the
+    level, e.g. ``logging.basicConfig(level=logging.INFO)``; calling
+    ``logging.getLogger('fancychunk').setLevel(logging.INFO)`` alone is
+    not sufficient when no handler is configured.
+    NOTE(review): this assumes no handler (such as a ``NullHandler``)
+    is attached to this logger elsewhere in the package — confirm.
+    """
+    return logging.getLogger("fancychunk")

fancychunk/_typing.py ADDED Viewed

@@ -0,0 +1,13 @@
+"""Shared numpy type aliases used across stages.
+Centralizing these prevents subtle drift (e.g., ``NDArray[np.float64]``
+vs. ``NDArray[np.floating]``) across the per-stage modules.
+"""
+from __future__ import annotations
+import numpy as np
+from numpy.typing import NDArray
+# Both aliases resolve to the same type, ``NDArray[np.float64]``; the
+# alias does not encode the number of dimensions, so the two names
+# document intent (1-D vs. 2-D, per the names) rather than enforce it.
+Vector = NDArray[np.float64]
+Matrix = NDArray[np.float64]