fancychunk 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
fancychunk/__init__.py ADDED
@@ -0,0 +1,57 @@
1
+ """fancychunk — text chunking for retrieval-augmented generation.
2
+
3
+ Behavioral specs live in ``docs/specs/``; this package implements the
4
+ three required pipeline stages and the two optional helpers documented
5
+ in ``docs/specs/contracts/public-api.md``.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from . import _constants as constants
11
+ from .chunklets import split_chunklets
12
+ from .chunks import split_chunks
13
+ from .errors import (
14
+ FancyChunkError,
15
+ OptimizationFailedError,
16
+ OversizedChunkletError,
17
+ OversizedSentenceError,
18
+ SegmenterError,
19
+ SentenceExceedsContextError,
20
+ UnsplittableDocumentError,
21
+ ValidationError,
22
+ ZeroNormEmbeddingError,
23
+ )
24
+ from .headings import heading_paths
25
+ from .late_chunking import SegmentEmbedder, embed_with_late_chunking
26
+ from ._segmenter import SaTSegmenter, SentenceSegmenter, punctuation_segmenter
27
+ from .sentences import split_sentences
28
+
29
# Names re-exported at package level; every entry is imported above.
__all__ = [
    # Splitting entry points (NOTE(review): presumably the pipeline stages
    # mentioned in the module docstring — confirm against the public-api spec).
    "split_sentences",
    "split_chunklets",
    "split_chunks",
    # Helpers imported from ``late_chunking`` and ``headings``.
    "embed_with_late_chunking",
    "heading_paths",
    # Segmenter interface and bundled implementations (from ``_segmenter``).
    "SaTSegmenter",
    "SentenceSegmenter",
    "punctuation_segmenter",
    # Embedder interface (from ``late_chunking``).
    "SegmentEmbedder",
    # Exception types (all imported from ``errors``).
    "FancyChunkError",
    "ValidationError",
    "UnsplittableDocumentError",
    "OversizedSentenceError",
    "OversizedChunkletError",
    "ZeroNormEmbeddingError",
    "SentenceExceedsContextError",
    "OptimizationFailedError",
    "SegmenterError",
    # The ``_constants`` module, exposed under the public alias ``constants``.
    "constants",
]

# Best-effort version lookup: any failure (including a missing
# ``importlib.metadata``) falls back to a placeholder instead of breaking
# ``import fancychunk``.
try:
    from importlib.metadata import version as _pkg_version

    __version__ = _pkg_version("fancychunk")
except Exception:
    # Source checkout or build-time call before metadata exists.
    __version__ = "0.0.0+unknown"
@@ -0,0 +1,30 @@
1
+ """Named constants from the specs."""
2
+
3
+ from __future__ import annotations
4
+
5
# NOTE(review): the meaning of each constant is defined by the specs, which
# are not visible from this module; comments below state only what the names
# and values themselves show, and hedge the rest.

# Default size limit, expressed in characters (per the name).
DEFAULT_MAX_SIZE_CHARS = 2048

# NOTE(review): presumably the cut-off applied to boundary scores — confirm.
BOUNDARY_SCORE_THRESHOLD = 0.25

# Statement-count target and statement-cost parameters.
# NOTE(review): how the floor and scale enter the cost formula is not shown here.
TARGET_STATEMENTS_PER_CHUNKLET = 3
STATEMENT_COST_FLOOR = 1e-6
STATEMENT_COST_SCALE = 0.5

# Quartile-based parameters (Q25 = 25th percentile, per the names).
# NOTE(review): presumably calibrate statements from word counts — confirm.
MIN_Q25_WORDS = 1.0
STATEMENTS_AT_Q25 = 0.75
QUARTILE_GAP_STATEMENTS = 0.50

# Boundary strength per block type; the values decrease in the order
# heading > blockquote > paragraph > list.
BOUNDARY_STRENGTH_HEADING = 1.00
BOUNDARY_STRENGTH_BLOCKQUOTE = 0.75
BOUNDARY_STRENGTH_PARAGRAPH = 0.50
BOUNDARY_STRENGTH_LIST = 0.25

# Symmetric lower/upper quantiles (15% / 85%) for "typical" chunklets, plus
# heading-related split parameters.
# NOTE(review): the exact use of the divisor and the "forbid" value is
# spec-defined — confirm before changing.
TYPICAL_CHUNKLET_LOWER_QUANTILE = 0.15
TYPICAL_CHUNKLET_UPPER_QUANTILE = 0.85
HEADING_SPLIT_BEFORE_DIVISOR = 4.0
HEADING_SPLIT_AFTER_FORBID = 1.0

# NOTE(review): 0.382 is approximately 1 - 1/phi (golden-ratio complement);
# presumably deliberate — confirm against the spec.
DEFAULT_PREAMBLE_FRACTION = 0.382

# Six heading levels; the separator between heading-path entries is a single
# newline character.
MAX_HEADING_LEVELS = 6
HEADING_PATH_SEPARATOR = "\n"
@@ -0,0 +1,169 @@
1
+ """Helpers that interpret a Markdown document via markdown-it-py.
2
+
3
+ Two queries the splitter stages need against the parsed token stream:
4
+
5
+ * ``heading_spans`` — character spans of ATX/Setext heading content, used
6
+ by stage 1 to override boundary probabilities.
7
+ * ``openers_by_line`` — for each source line, which block-level
8
+ token types open on that line, used by stage 2 to assign per-sentence
9
+ boundary probabilities.
10
+
11
+ Both helpers operate on the original document string; line numbers from
12
+ markdown-it tokens are 0-indexed into a list of lines split on ``\n``.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from dataclasses import dataclass
18
+
19
+ from markdown_it import MarkdownIt
20
+
21
+
22
@dataclass(frozen=True)
class HeadingSpan:
    """Inclusive character range occupied by one heading.

    Instances are immutable. Both endpoints are offsets into the original
    document string and both are inclusive, so a one-character heading has
    ``first == last``.
    """

    # Offset of the first character of the heading marker (e.g. the ``#``).
    first: int
    # Offset of the last non-whitespace character on the heading's line(s).
    last: int
34
+
35
+
36
def _line_starts(document: str) -> list[int]:
    """Compute the offset at which every line of ``document`` begins.

    Lines are delimited by ``"\\n"`` only. Entry ``i`` of the result is the
    offset of the first character of 0-indexed line ``i``; the first entry
    is always 0, so an empty document produces ``[0]``. A trailing newline
    contributes a final entry equal to ``len(document)``.
    """
    offsets = [0]
    # Each newline starts a new line at the position immediately after it.
    offsets.extend(pos + 1 for pos, char in enumerate(document) if char == "\n")
    return offsets
47
+
48
+
49
# Module-level CommonMark parser. ``markdown-it-py`` parsing is
# reentrant, so a single shared instance is safe and avoids the
# per-call construction overhead.
# NOTE(review): the reentrancy claim cannot be verified from this file —
# confirm against the markdown-it-py documentation before sharing the
# instance across threads.
_PARSER = MarkdownIt("commonmark")


def _parser() -> MarkdownIt:
    """Return the shared module-level ``MarkdownIt`` instance (``_PARSER``)."""
    return _PARSER
57
+
58
+
59
def heading_spans(document: str) -> list[HeadingSpan]:
    """Locate every heading in ``document`` and return its character span.

    One :class:`HeadingSpan` is produced per ``heading_open`` token that
    carries a line map, in token order. An empty document yields ``[]``.

    * ``first`` is the start of the token's first mapped line, advanced past
      leading spaces and tabs.
    * ``last`` is found by scanning backwards from the end of the token's
      last mapped line over spaces, tabs, ``\\n`` and ``\\r``; the scan never
      moves below ``first - 1``.

    Because the heading marker is itself non-whitespace, a top-level ATX
    heading with no text (e.g. ``"# \\n"``) yields ``first == last`` (the
    marker alone).
    NOTE(review): callers applying the SPEC-CHUNK-108 empty-heading edge
    case should not rely on ``last < first`` for such headings — confirm.
    NOTE(review): for Setext headings the last mapped line is reportedly the
    underline, so ``last`` would fall on the underline rather than on the
    heading text — confirm this is intended.
    """
    if not document:
        return []

    tokens = _parser().parse(document)
    line_offsets = _line_starts(document)
    doc_len = len(document)
    line_count = len(line_offsets)

    found: list[HeadingSpan] = []
    for token in tokens:
        if token.type != "heading_open" or token.map is None:
            continue
        # ``map`` is a half-open [first_line, stop_line) range of source lines.
        first_line, stop_line = token.map

        # Skip indentation on the opening line to land on the marker.
        head = line_offsets[first_line]
        while head < doc_len and document[head] in " \t":
            head += 1

        # Start from the newline that terminates the last mapped line, or
        # from the final character when the document ends without one.
        if stop_line < line_count:
            tail = line_offsets[stop_line] - 1
        else:
            tail = doc_len - 1
        # Back up over trailing whitespace and line terminators.
        while tail >= head and document[tail] in " \t\n\r":
            tail -= 1

        found.append(HeadingSpan(first=head, last=tail))
    return found
107
+
108
+
109
@dataclass(frozen=True)
class LineOpeners:
    """Immutable record pairing a source line with the block-level token
    types that open on it.
    """

    # 0-indexed source line number.
    line: int
    # Token type names opening on that line.
    types: tuple[str, ...]
117
+
118
+
119
# Block-level opener token types that ``openers_by_line`` reports.
_RELEVANT_OPEN_TYPES = frozenset(
    (
        "heading_open",
        "blockquote_open",
        "paragraph_open",
        "bullet_list_open",
        "ordered_list_open",
    )
)


def openers_by_line(document: str) -> dict[int, set[str]]:
    """Map each source line to the relevant block openers that start on it.

    Keys are 0-indexed line numbers taken from the first element of a
    token's line map; values are the set of opener token types (restricted
    to ``_RELEVANT_OPEN_TYPES``, the types listed in SPEC-CHUNK-240) that
    begin on that line. Lines without a relevant opener have no key, and an
    empty document yields an empty dict.
    """
    openers: dict[int, set[str]] = {}
    if not document:
        return openers

    for token in _parser().parse(document):
        # Ignore irrelevant token types and tokens without position info.
        if token.type not in _RELEVANT_OPEN_TYPES or token.map is None:
            continue
        start_line = token.map[0]
        openers.setdefault(start_line, set()).add(token.type)
    return openers
149
+
150
+
151
def line_of_offset(line_starts: list[int], offset: int) -> int:
    """Return the 0-indexed line containing ``offset``.

    Args:
        line_starts: Offsets of each line's first character, sorted
            ascending (e.g. produced by :func:`compute_line_starts`).
        offset: Character offset to locate.

    Returns:
        The largest index ``i`` with ``line_starts[i] <= offset``. When no
        such index exists (``offset`` precedes the first start, or
        ``line_starts`` is empty) the result is ``0``.
    """
    # Function-scope import keeps this block self-contained.
    from bisect import bisect_right

    # ``bisect_right`` returns the count of starts <= offset; the containing
    # line is the one before that insertion point. Clamp so that offsets in
    # front of every start (and an empty list) still map to line 0.
    return max(bisect_right(line_starts, offset) - 1, 0)
165
+
166
+
167
def compute_line_starts(document: str) -> list[int]:
    """Public wrapper around the internal ``_line_starts`` helper.

    Returns whatever ``_line_starts(document)`` returns: the character
    offset of each line's first character, indexed by 0-based line number.
    """
    return _line_starts(document)
@@ -0,0 +1,117 @@
1
+ """Sentence-segmentation model interface and default implementations.
2
+
3
+ Two segmenters are bundled:
4
+
5
+ * :class:`SaTSegmenter` (the default) wraps a Segment Any Text (SaT)
6
+ model from `wtpsplit-lite` and returns per-character boundary
7
+ probabilities exactly as SPEC-CHUNK-106 prescribes. The 408 MB
8
+ ``sat-3l-sm`` weights download lazily on first call so importing
9
+ ``fancychunk`` stays cheap.
10
+ * :func:`punctuation_segmenter` is a no-dependencies fallback that
11
+ marks ``.``/``!``/``?`` followed by whitespace or end-of-document.
12
+
13
+ Either is a valid SentenceSegmenter; callers may pass their own
14
+ through the keyword-only ``segmenter`` parameter on
15
+ ``split_sentences``.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from typing import TYPE_CHECKING, Callable, Protocol
21
+
22
+ import numpy as np
23
+
24
+ from ._typing import Vector
25
+ from .errors import SegmenterError
26
+
27
+ if TYPE_CHECKING:
28
+ from wtpsplit_lite import SaT
29
+
30
+
31
class SentenceSegmenter(Protocol):
    """Structural type for sentence segmenters.

    Any callable that accepts a document string and returns one boundary
    probability per character satisfies this protocol. The result is
    expected to have length ``len(document)`` and a dtype convertible to
    ``float64``.
    """

    def __call__(self, document: str) -> Vector:
        """Return the per-character boundary probabilities for ``document``."""
        ...
39
+
40
+
41
# Characters treated as sentence-final punctuation.
_TERMINATORS = frozenset(".!?")
# Probability assigned to every detected boundary.
_DEFAULT_BOUNDARY_PROB = 0.9


def punctuation_segmenter(document: str) -> Vector:
    """Score sentence boundaries using punctuation alone.

    A character receives ``_DEFAULT_BOUNDARY_PROB`` when it is one of
    ``.``, ``!`` or ``?`` and is either the last character of the document
    or immediately followed by whitespace. Every other position is ``0.0``.
    The result is a ``float64`` array of length ``len(document)``.
    """
    # Pair each character with its successor; the appended space makes
    # end-of-document count as "followed by whitespace".
    successors = document[1:] + " "
    boundary_positions = [
        pos
        for pos, (char, following) in enumerate(zip(document, successors))
        if char in _TERMINATORS and following.isspace()
    ]
    scores: Vector = np.zeros(len(document), dtype=np.float64)
    # Explicit integer dtype keeps the index valid when no boundary was found.
    scores[np.asarray(boundary_positions, dtype=np.intp)] = _DEFAULT_BOUNDARY_PROB
    return scores
60
+
61
+
62
# Model name used when ``SaTSegmenter`` is constructed without arguments.
_DEFAULT_SAT_MODEL = "sat-3l-sm"


class SaTSegmenter:
    """Sentence segmenter backed by wtpsplit-lite's SaT model (SPEC-CHUNK-106).

    Construction only records the model name. ``wtpsplit_lite`` is imported
    and the model instantiated on the first call, then cached on the
    instance for later calls.

    NOTE(review): the lazy load is not guarded by a lock, so concurrent
    first calls may each construct a model — confirm whether that matters
    for callers.
    """

    def __init__(self, model_name: str = _DEFAULT_SAT_MODEL) -> None:
        """Remember ``model_name``; no model is loaded yet."""
        self.model_name: str = model_name
        self._sat: SaT | None = None

    def _ensure_loaded(self) -> SaT:
        """Return the cached model, creating it on first use."""
        model = self._sat
        if model is not None:
            return model

        # Deferred import so that ``import fancychunk`` does not pull in
        # wtpsplit-lite until a segmentation is actually requested.
        from wtpsplit_lite import SaT as _SaT

        model = self._sat = _SaT(self.model_name)
        return model

    def __call__(self, document: str) -> Vector:
        """Return the model's boundary probabilities as a 1-D float64 array.

        Raises:
            SegmenterError: if the model output is not one-dimensional with
                exactly ``len(document)`` entries.
        """
        probabilities = np.asarray(
            self._ensure_loaded().predict_proba(document), dtype=np.float64
        )
        expected_len = len(document)
        if probabilities.ndim == 1 and probabilities.shape[0] == expected_len:
            return probabilities
        raise SegmenterError(
            f"SaT returned shape {probabilities.shape}; expected ({expected_len},)"
        )
97
+
98
+
99
# Process-wide default segmenter; created on the first call to
# ``get_default_segmenter``.
_default_segmenter: SaTSegmenter | None = None


def get_default_segmenter() -> SaTSegmenter:
    """Return the shared default ``SaTSegmenter``, creating it on first use.

    The instance is stored in the module-level ``_default_segmenter`` so
    later calls return the same object.
    """
    global _default_segmenter
    shared = _default_segmenter
    if shared is None:
        shared = _default_segmenter = SaTSegmenter()
    return shared
109
+
110
+
111
def make_segmenter(
    segmenter: SentenceSegmenter | None,
) -> Callable[[str], Vector]:
    """Pick the segmenter to use.

    A caller-supplied ``segmenter`` is returned unchanged; ``None`` selects
    the process-wide default from :func:`get_default_segmenter`.
    """
    return get_default_segmenter() if segmenter is None else segmenter
@@ -0,0 +1,52 @@
1
+ """Module-shared tracer and logger.
2
+
3
+ The library uses ``opentelemetry-api`` for tracing. With no SDK
4
+ configured the spans created here are zero-cost no-ops; once the
5
+ caller's application installs an SDK and exporter the same spans
6
+ appear in their trace backend.
7
+
8
+ Naming conventions for span attributes follow OpenTelemetry's
9
+ recommendation: lowercase dotted strings, scoped under
10
+ ``fancychunk.<stage>.<attribute>``. Counts and lengths are integers;
11
+ durations are not set explicitly (the SDK measures them).
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import logging
17
+
18
+ from opentelemetry import trace
19
+ from opentelemetry.trace import Tracer
20
+
21
# Instrumentation scope name passed to OpenTelemetry.
_INSTRUMENTATION_NAME = "fancychunk"


def _instrumentation_version() -> str:
    """Look up the installed ``fancychunk`` version, best effort.

    Any failure, including an unavailable ``importlib.metadata`` or missing
    package metadata, yields the placeholder ``"0.0.0+unknown"`` instead of
    raising.
    """
    try:
        from importlib import metadata

        return metadata.version("fancychunk")
    except Exception:
        return "0.0.0+unknown"
32
+
33
+
34
def get_tracer() -> Tracer:
    """Return the module-shared tracer.

    Re-resolved each call so that a caller installing an SDK *after*
    importing fancychunk still sees their spans (OpenTelemetry's API
    is designed for this — the underlying ``ProxyTracer`` delegates
    dynamically).

    Returns:
        The result of ``trace.get_tracer`` for ``_INSTRUMENTATION_NAME``
        and the best-effort package version.
    """
    # The version lookup also runs on every call, not just the tracer lookup.
    return trace.get_tracer(_INSTRUMENTATION_NAME, _instrumentation_version())
43
+
44
+
45
def get_logger() -> logging.Logger:
    """Return the logger named ``"fancychunk"``.

    No handlers or levels are configured here. Callers who want the
    library's messages opt in through the standard ``logging`` API, e.g.
    ``logging.getLogger('fancychunk').setLevel(logging.INFO)``.
    """
    library_logger = logging.getLogger("fancychunk")
    return library_logger
fancychunk/_typing.py ADDED
@@ -0,0 +1,13 @@
1
+ """Shared numpy type aliases used across stages.
2
+
3
+ Centralizing these prevents subtle drift (e.g., ``NDArray[np.float64]``
4
+ vs. ``NDArray[np.floating]``) across the per-stage modules.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import numpy as np
10
+ from numpy.typing import NDArray
11
+
12
# Both aliases resolve to the same type: ``NDArray`` fixes the dtype
# (float64) but not the rank, so the two names document intent only.
# NOTE(review): presumably ``Vector`` is 1-D and ``Matrix`` is 2-D — confirm
# against callers.
Vector = NDArray[np.float64]
Matrix = NDArray[np.float64]