chunky-files 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,47 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Distribution / packaging
7
+ build/
8
+ dist/
9
+ *.egg-info/
10
+ .eggs/
11
+
12
+ # Installer logs
13
+ pip-log.txt
14
+ pip-delete-this-directory.txt
15
+
16
+ # Unit test / coverage reports
17
+ htmlcov/
18
+ .tox/
19
+ .coverage
20
+ .coverage.*
21
+ .cache
22
+ .pytest_cache/
24
+ coverage.xml
25
+
26
+ # Sphinx build artifacts
27
+ docs/_build/
28
+
29
+ # IDEs and editors
30
+ .vscode/
31
+ .idea/
32
+ *.swp
33
+
34
+ # macOS
35
+ .DS_Store
36
+
37
+ # Hatch environments
38
+ .hatch/
39
+
40
+ # Environment file
41
+ .env
42
+ .venv/
43
+ env/
44
+ venv/
45
+ ENV/
46
+ env.bak/
47
+ venv.bak/
@@ -0,0 +1,30 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.2.0] - TBD
11
+ ### Added
12
+ - Changelog (`CHANGELOG.md`; this file).
13
+ - Release process section added to the existing `README.md`.
14
+ - `PYPI_TOKEN`, `TEST_PYPI_TOKEN`, and `CODECOV_TOKEN` added to GitHub secrets.
15
+ - `.env` and other common environment file names added to `.gitignore` for token security.
16
+ ### Changed
17
+ - Release workflow updated to use the matching secret names.
18
+ ### Fixed
19
+ - Updated dependencies and improved type hints across the codebase (Ruff compliance).
20
+ - Updated build tooling installation in the release workflow.
21
+ - Included `pyproject.toml` in the sdist build targets.
22
+
23
+ ## [0.1.0] - 2025-09-30
24
+ ### Added
25
+ - Initial project scaffolding with Hatchling build system and CI/release workflows.
26
+ - Core chunking data models (`Document`, `Chunk`, `ChunkerConfig`).
27
+ - Sliding-window fallback chunker with metadata-rich outputs.
28
+ - `ChunkPipeline` orchestration, registry, and filesystem loader.
29
+ - Sphinx documentation skeleton and Read the Docs configuration.
30
+ - Pytest and Ruff tooling with baseline tests for the sliding-window chunker.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Nancy Brain Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,145 @@
1
+ Metadata-Version: 2.4
2
+ Name: chunky-files
3
+ Version: 0.2.0
4
+ Summary: Semantic chunking utilities for scientific code and documentation corpora.
5
+ Project-URL: Home, https://github.com/AmberLee2427/chunky
6
+ Project-URL: Documentation, https://chunky.readthedocs.io/
7
+ Project-URL: Issues, https://github.com/AmberLee2427/chunky/issues
8
+ Author: Nancy Brain Contributors
9
+ License: MIT License
10
+
11
+ Copyright (c) 2024 Nancy Brain Contributors
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy
14
+ of this software and associated documentation files (the "Software"), to deal
15
+ in the Software without restriction, including without limitation the rights
16
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17
+ copies of the Software, and to permit persons to whom the Software is
18
+ furnished to do so, subject to the following conditions:
19
+
20
+ The above copyright notice and this permission notice shall be included in all
21
+ copies or substantial portions of the Software.
22
+
23
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
26
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
27
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
28
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
29
+ SOFTWARE.
30
+ License-File: LICENSE
31
+ Classifier: Development Status :: 3 - Alpha
32
+ Classifier: Intended Audience :: Developers
33
+ Classifier: Intended Audience :: Science/Research
34
+ Classifier: License :: OSI Approved :: MIT License
35
+ Classifier: Programming Language :: Python
36
+ Classifier: Programming Language :: Python :: 3
37
+ Classifier: Programming Language :: Python :: 3.8
38
+ Classifier: Programming Language :: Python :: 3.9
39
+ Classifier: Programming Language :: Python :: 3.10
40
+ Classifier: Programming Language :: Python :: 3.11
41
+ Classifier: Programming Language :: Python :: 3.12
42
+ Classifier: Topic :: Scientific/Engineering
43
+ Classifier: Topic :: Text Processing :: Linguistic
44
+ Requires-Python: >=3.8
45
+ Provides-Extra: dev
46
+ Requires-Dist: build; extra == 'dev'
47
+ Requires-Dist: bump-my-version>=0.6; extra == 'dev'
48
+ Requires-Dist: coverage[toml]>=7; extra == 'dev'
49
+ Requires-Dist: pytest-cov>=4; extra == 'dev'
50
+ Requires-Dist: pytest>=7; extra == 'dev'
51
+ Requires-Dist: ruff>=0.6; extra == 'dev'
52
+ Provides-Extra: docs
53
+ Requires-Dist: furo>=2024.0.0; extra == 'docs'
54
+ Requires-Dist: myst-parser>=2; extra == 'docs'
55
+ Requires-Dist: sphinx>=7; extra == 'docs'
56
+ Description-Content-Type: text/markdown
57
+
58
+ # Chunky
59
+
60
+ Chunky is a Python package for intelligently chunking scientific and technical repositories.
61
+ It provides a modular pipeline that powers the Nancy Brain knowledge base and MCP services,
62
+ while remaining useful as a standalone library for retrieval systems that need deterministic,
63
+ metadata-rich chunks.
64
+
65
+ Documentation lives on Read the Docs: <https://chunky.readthedocs.io>
66
+
67
+ ## Installation
68
+
69
+ Install from source using the `pyproject.toml` metadata:
70
+
71
+ ```bash
72
+ # clone the repo (if you haven't already)
73
+ git clone https://github.com/AmberLee2427/chunky.git
74
+ cd chunky
75
+
76
+ # install the library
77
+ pip install .
78
+ ```
79
+
80
+ For development and documentation builds, install the optional extras:
81
+
82
+ ```bash
83
+ pip install -e ".[dev,docs]"
84
+ ```
85
+
86
+ > `-e` performs an editable install so local changes take effect immediately.
87
+ > `.[dev,docs]` installs the tooling declared under the `dev` and `docs` extras in
88
+ > `pyproject.toml`.
89
+
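+ A minimal usage sketch (mirroring the example on the docs overview page; `ChunkPipeline` and `ChunkerConfig` are the package's public exports):
+
+ ```python
+ from pathlib import Path
+
+ from chunky import ChunkPipeline, ChunkerConfig
+
+ pipeline = ChunkPipeline()  # uses the default registry with a sliding-window fallback
+ config = ChunkerConfig(lines_per_chunk=80, line_overlap=10)
+ chunks = pipeline.chunk_file(Path("path/to/file.py"), config=config)
+
+ for chunk in chunks:
+     print(chunk.chunk_id, chunk.metadata["line_start"], chunk.metadata["line_end"])
+ ```
+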
90
+ ## Tooling
91
+
92
+ * **Code style:** Ruff (`ruff check src tests` or `ruff check src tests --fix`)
93
+ * **Tests:** Pytest (`pytest --cov=chunky`)
94
+ * **Docs:** Sphinx + MyST + Furo (`sphinx-build -b html docs docs/_build/html`)
95
+ * **Packaging:** Hatchling build backend
96
+ * **Versioning:** bump-my-version (driven by tags and the release workflow)
97
+
98
+ ## Workflows
99
+
100
+ * CI tests run on Linux, macOS, and Windows for Python 3.8 through 3.12.
101
+ * Pushing a tag that matches the form `vX.Y.Z` triggers the release workflow. It validates that the
102
+ tag matches the version in `pyproject.toml`, builds the distribution, and publishes to PyPI using
103
+ the `PYPI_API_TOKEN` secret.
104
+ * Read the Docs builds the documentation automatically for pushes to the default branch. Local
105
+ builds use `sphinx-build -b html docs docs/_build/html`.
106
+
107
+ Release checklist:
108
+
109
+ 1. Review and update `CHANGELOG.md`, keeping the `[Unreleased]` section accurate.
110
+ 2. Run `bump-my-version bump <part>` to update version metadata and append a dated entry in the
111
+ changelog.
112
+ 3. Commit the changes and push to `main`.
113
+ 4. Tag the commit (`git tag vX.Y.Z && git push origin vX.Y.Z`) to trigger the Release workflow.
114
+ 5. Verify the PyPI publish job and Read the Docs build succeed.
115
+
116
+ ## Contributing
117
+
118
+ * Know your audience: most contributors will be scientific coders. Write docs assuming limited
119
+ familiarity with packaging internals.
120
+ * Use Ruff for style checks and keep numpy-style docstrings on all non-test functions.
121
+ * Target test coverage above 70% and ensure existing CI jobs pass before opening a PR.
122
+ * In pull requests, summarise code changes, testing/validation, doc updates, and provide a brief
123
+ TL;DR when the description runs long.
124
+
125
+ ## License
126
+
127
+ Chunky is released under the [MIT License](LICENSE).
128
+
129
+ ## Glossary
130
+
131
+ | Term | Meaning |
132
+ | ---- | ------- |
133
+ | PR | GitHub pull request – a request to merge changes from one branch or fork into another |
134
+ | Release | Publishing a tagged version of the project to PyPI |
135
+ | ChangeLog | A document describing changes between releases |
136
+ | PyPI | Python Package Index – where published distributions live |
137
+ | Ruff | A fast Python linter/formatter used for style enforcement |
138
+ | origin | The upstream GitHub repository |
139
+ | fork | A downstream copy of the origin repo used for contributing |
140
+ | master/main | The default branch |
141
+ | CI | Continuous Integration – automated checks that run on every push/PR |
142
+ | GitHub Workflows | GitHub’s automation runner configured via YAML files |
143
+ | `pyproject.toml` | Core metadata and build configuration for the package |
144
+ | bump-my-version | CLI used to bump version numbers consistently |
145
+ | Read the Docs | Hosted documentation service that builds from the repo |
@@ -0,0 +1,88 @@
1
+ # Chunky
2
+
3
+ Chunky is a Python package for intelligently chunking scientific and technical repositories.
4
+ It provides a modular pipeline that powers the Nancy Brain knowledge base and MCP services,
5
+ while remaining useful as a standalone library for retrieval systems that need deterministic,
6
+ metadata-rich chunks.
7
+
8
+ Documentation lives on Read the Docs: <https://chunky.readthedocs.io>
9
+
10
+ ## Installation
11
+
12
+ Install from source using the `pyproject.toml` metadata:
13
+
14
+ ```bash
15
+ # clone the repo (if you haven't already)
16
+ git clone https://github.com/AmberLee2427/chunky.git
17
+ cd chunky
18
+
19
+ # install the library
20
+ pip install .
21
+ ```
22
+
23
+ For development and documentation builds, install the optional extras:
24
+
25
+ ```bash
26
+ pip install -e ".[dev,docs]"
27
+ ```
28
+
29
+ > `-e` performs an editable install so local changes take effect immediately.
30
+ > `.[dev,docs]` installs the tooling declared under the `dev` and `docs` extras in
31
+ > `pyproject.toml`.
32
+
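+ A minimal usage sketch (mirroring the example on the docs overview page; `ChunkPipeline` and `ChunkerConfig` are the package's public exports):
+
+ ```python
+ from pathlib import Path
+
+ from chunky import ChunkPipeline, ChunkerConfig
+
+ pipeline = ChunkPipeline()  # uses the default registry with a sliding-window fallback
+ config = ChunkerConfig(lines_per_chunk=80, line_overlap=10)
+ chunks = pipeline.chunk_file(Path("path/to/file.py"), config=config)
+
+ for chunk in chunks:
+     print(chunk.chunk_id, chunk.metadata["line_start"], chunk.metadata["line_end"])
+ ```
+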
33
+ ## Tooling
34
+
35
+ * **Code style:** Ruff (`ruff check src tests` or `ruff check src tests --fix`)
36
+ * **Tests:** Pytest (`pytest --cov=chunky`)
37
+ * **Docs:** Sphinx + MyST + Furo (`sphinx-build -b html docs docs/_build/html`)
38
+ * **Packaging:** Hatchling build backend
39
+ * **Versioning:** bump-my-version (driven by tags and the release workflow)
40
+
41
+ ## Workflows
42
+
43
+ * CI tests run on Linux, macOS, and Windows for Python 3.8 through 3.12.
44
+ * Pushing a tag that matches the form `vX.Y.Z` triggers the release workflow. It validates that the
45
+ tag matches the version in `pyproject.toml`, builds the distribution, and publishes to PyPI using
46
+ the `PYPI_API_TOKEN` secret.
47
+ * Read the Docs builds the documentation automatically for pushes to the default branch. Local
48
+ builds use `sphinx-build -b html docs docs/_build/html`.
49
+
50
+ Release checklist:
51
+
52
+ 1. Review and update `CHANGELOG.md`, keeping the `[Unreleased]` section accurate.
53
+ 2. Run `bump-my-version bump <part>` to update version metadata and append a dated entry in the
54
+ changelog.
55
+ 3. Commit the changes and push to `main`.
56
+ 4. Tag the commit (`git tag vX.Y.Z && git push origin vX.Y.Z`) to trigger the Release workflow.
57
+ 5. Verify the PyPI publish job and Read the Docs build succeed.
58
+
59
+ ## Contributing
60
+
61
+ * Know your audience: most contributors will be scientific coders. Write docs assuming limited
62
+ familiarity with packaging internals.
63
+ * Use Ruff for style checks and keep numpy-style docstrings on all non-test functions.
64
+ * Target test coverage above 70% and ensure existing CI jobs pass before opening a PR.
65
+ * In pull requests, summarise code changes, testing/validation, doc updates, and provide a brief
66
+ TL;DR when the description runs long.
67
+
68
+ ## License
69
+
70
+ Chunky is released under the [MIT License](LICENSE).
71
+
72
+ ## Glossary
73
+
74
+ | Term | Meaning |
75
+ | ---- | ------- |
76
+ | PR | GitHub pull request – a request to merge changes from one branch or fork into another |
77
+ | Release | Publishing a tagged version of the project to PyPI |
78
+ | ChangeLog | A document describing changes between releases |
79
+ | PyPI | Python Package Index – where published distributions live |
80
+ | Ruff | A fast Python linter/formatter used for style enforcement |
81
+ | origin | The upstream GitHub repository |
82
+ | fork | A downstream copy of the origin repo used for contributing |
83
+ | master/main | The default branch |
84
+ | CI | Continuous Integration – automated checks that run on every push/PR |
85
+ | GitHub Workflows | GitHub’s automation runner configured via YAML files |
86
+ | `pyproject.toml` | Core metadata and build configuration for the package |
87
+ | bump-my-version | CLI used to bump version numbers consistently |
88
+ | Read the Docs | Hosted documentation service that builds from the repo |
File without changes
File without changes
@@ -0,0 +1,9 @@
1
+ API Reference
2
+ =============
3
+
4
+ .. autosummary::
5
+ :toctree: _autosummary
6
+ :caption: Public API
7
+ :recursive:
8
+
9
+ chunky
@@ -0,0 +1,50 @@
1
+ """Sphinx configuration for chunky documentation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import importlib.metadata
6
+ import os
7
+ import sys
8
+ from datetime import datetime
9
+
10
+ PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
11
+ SRC_DIR = os.path.join(PROJECT_ROOT, "src")
12
+ if SRC_DIR not in sys.path:
13
+ sys.path.insert(0, SRC_DIR)
14
+
15
+ project = "chunky"
16
+ copyright = f"{datetime.now():%Y}, Nancy Brain Contributors"
17
+
18
+ try:
19
+ release = importlib.metadata.version("chunky")
20
+ except importlib.metadata.PackageNotFoundError:
21
+ from chunky.__about__ import __version__ as release # type: ignore[assignment]
22
+
23
+ extensions = [
24
+ "sphinx.ext.autodoc",
25
+ "sphinx.ext.napoleon",
26
+ "sphinx.ext.autosummary",
27
+ "sphinx.ext.intersphinx",
28
+ "myst_parser",
29
+ ]
30
+
31
+ autosummary_generate = True
32
+ napoleon_google_docstring = False
33
+ napoleon_use_param = True
34
+ napoleon_use_rtype = True
35
+
36
+ html_theme = os.environ.get("SPHINX_HTML_THEME", "furo")
37
+ html_static_path = ["_static"]
38
+
39
+ intersphinx_mapping = {
40
+ "python": ("https://docs.python.org/3", {}),
41
+ }
42
+
43
+ myst_enable_extensions = [
44
+ "colon_fence",
45
+ "deflist",
46
+ ]
47
+
48
+ templates_path = ["_templates"]
49
+
50
+ exclude_patterns: list[str] = ["_build", "Thumbs.db", ".DS_Store"]
@@ -0,0 +1,116 @@
1
+ ## Semantic code chunking
2
+
3
+ Semantic code file chunking in Python involves splitting a code file into meaningful, self-contained units based on its structure and semantics, rather than just arbitrary character counts. This approach aims to create chunks that represent logical components like functions, classes, or distinct blocks of code, improving the effectiveness of operations like embedding for RAG pipelines or code analysis.
4
+
5
+ Here's how you can achieve this in Python:
6
+
7
+ 1. **Using Language-Specific Text Splitters:**
8
+ Libraries like LangChain offer specialized text splitters for different programming languages.
9
+ ```python
10
+ from langchain_text_splitters import PythonCodeTextSplitter
11
+
12
+ # Initialize the splitter
13
+ python_splitter = PythonCodeTextSplitter()
14
+
15
+ # Split the code
16
+ code_chunks = python_splitter.split_text(your_python_code_string)
17
+ ```
18
+ This `PythonCodeTextSplitter` is designed to understand Python syntax and split based on elements like function definitions, class definitions, and other structural components. Similar splitters exist for other languages.
19
+
20
+ 2. **Utilizing Tree-Sitter for AST-based Chunking:**
21
+ Tree-sitter is a parsing library that can generate Abstract Syntax Trees (ASTs) for various programming languages. You can leverage this to identify semantic boundaries more precisely.
22
+
23
+ ```python
24
+ # Example using a conceptual tree-sitter based approach
25
+ # (Requires a tree-sitter parser for Python)
26
+ from tree_sitter import Language, Parser
27
+
28
+ # Load the Python language parser (you'd need to compile it first)
29
+ Language.build_library('build/my-languages.so', ['path/to/tree-sitter-python'])
30
+ PYTHON_LANGUAGE = Language('build/my-languages.so', 'python')
31
+
32
+ parser = Parser()
33
+ parser.set_language(PYTHON_LANGUAGE)
34
+
35
+ tree = parser.parse(bytes(your_python_code_string, 'utf8'))
36
+
37
+ # Traverse the AST to identify meaningful nodes (e.g., function definitions, class definitions)
38
+ # and extract their corresponding code snippets as chunks.
39
+ ```
40
+ This method offers fine-grained control over chunking based on the exact structure of the code, but requires more manual implementation to define how AST nodes translate into chunks.
41
+ 3. **Combining Semantic and Heuristic Approaches:**
42
+ You can also combine semantic understanding with more traditional heuristic rules, such as splitting by multiple newlines or specific keywords, to create robust chunking strategies.
43
+
44
+ Considerations for Semantic Code Chunking:
45
+ • **Granularity:** Decide on the appropriate level of granularity for your chunks (e.g., entire functions, individual statements, or logical blocks within functions).
46
+ • **Context:** Ensure that each chunk retains enough context to be meaningful on its own, especially for tasks like embedding and retrieval.
47
+ • **Language Specificity:** The best chunking strategy often depends on the specific programming language and its conventions.
48
+
50
+
51
+ ---
52
+
53
+ ## Semantic file chunking
54
+
55
+ Semantic file chunking in Python involves splitting a document into meaningful segments based on the semantic relatedness of its content, rather than fixed-size or character-based methods. This approach aims to keep semantically coherent information together within a single chunk, which can be beneficial for tasks like Retrieval Augmented Generation (RAG) in Large Language Models (LLMs).
56
+
57
+ Here's how you can perform semantic chunking in Python:
58
+
59
+ 1. **Using LlamaIndex's Semantic Splitter:**
60
+ LlamaIndex provides a SemanticSplitterNodeParser designed for semantic chunking.
61
+ ```python
62
+ # Imports follow the pre-0.10 LlamaIndex layout; newer releases expose these under llama_index.core
+ from llama_index.node_parser import SemanticSplitterNodeParser
63
+ from llama_index.embeddings import OpenAIEmbedding
64
+
65
+ # Initialize the embedding model (e.g., OpenAIEmbeddings)
66
+ embed_model = OpenAIEmbedding()
67
+
68
+ # Initialize the semantic splitter
69
+ # `buffer_size` determines how many sentences to consider for similarity comparison
70
+ # `breakpoint_percentile_threshold` controls the sensitivity of splitting
71
+ splitter = SemanticSplitterNodeParser(
72
+ buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
73
+ )
74
+
75
+ # Load your document (e.g., from a file)
76
+ # Example: text = "Your long document text here..."
77
+ # Or use LlamaIndex's SimpleDirectoryReader to load documents from a directory
78
+
79
+ # Parse the document into nodes (chunks)
80
+ nodes = splitter.get_nodes_from_documents([document]) # Replace 'document' with your LlamaIndex Document object
81
+
82
+ # Access the content of the semantic chunks
83
+ for node in nodes:
84
+ print(node.text)
85
+ ```
86
+ 2. **Using LangChain's Semantic Chunking (Experimental):**
87
+ LangChain also offers an experimental SemanticChunker within langchain_experimental.
88
+ ```python
89
+ from langchain_experimental.text_splitter import SemanticChunker
90
+ from langchain_openai.embeddings import OpenAIEmbeddings
91
+
92
+ # Initialize the embedding model
93
+ embeddings = OpenAIEmbeddings()
94
+
95
+ # Initialize the semantic chunker
96
+ semantic_chunker = SemanticChunker(embeddings, breakpoint_threshold_type="percentile")
97
+
98
+ # Split your text into semantic chunks
99
+ text = "Your long document text here..."
100
+ chunks = semantic_chunker.split_text(text)
101
+
102
+ for chunk in chunks:
103
+ print(chunk)
104
+ ```
105
+
106
+ Key Concepts in Semantic Chunking:
107
+ • **Embeddings:** Text is converted into numerical vector representations (embeddings) that capture its semantic meaning.
108
+ • **Similarity Measurement:** The similarity between embeddings of adjacent sentences or segments is calculated (e.g., using cosine similarity).
109
+ • **Breakpoint Threshold:** A threshold is used to identify points where the semantic similarity drops significantly, indicating a natural break point for a new chunk. This can be based on percentiles, standard deviation, or interquartile range of similarity scores.
110
+ • **Adaptive Chunk Sizes:** Unlike fixed-size chunking, semantic chunking results in chunks of varying lengths, as the splits are determined by semantic coherence.
111
+
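+ A compact sketch of the percentile-based breakpoint idea described above (NumPy assumed; `embed` is a placeholder for any sentence-embedding function):
+
+ ```python
+ import numpy as np
+
+ def breakpoints(sentences, embed, pct=95):
+     vecs = np.array([embed(s) for s in sentences], dtype=float)
+     vecs /= np.linalg.norm(vecs, axis=1, keepdims=True)  # unit-normalize
+     dists = 1.0 - (vecs[:-1] * vecs[1:]).sum(axis=1)     # cosine distance of neighbors
+     threshold = np.percentile(dists, pct)                # e.g. the 95th percentile
+     return [i + 1 for i, d in enumerate(dists) if d > threshold]
+ ```
+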
112
+ By using these methods, you can create more semantically meaningful chunks, which can lead to improved performance in downstream applications like RAG by ensuring that relevant contextual information remains together.
113
+
115
+
116
+ ---
@@ -0,0 +1,165 @@
1
+ # Semantic Chunking Library Design
2
+
3
+ ## 1. Background & Motivation
4
+
5
+ Our existing `SmartChunker` combines sliding windows with heuristic boundary searches. When processing medium-sized code files, the forward/backward scans can explode in CPU and memory, causing build failures. We also have no semantic awareness for other file types, leading to arbitrary splits. To make the knowledge-base pipeline reliable and extensible, we want a modular chunking library that plugs cleanly into the Nancy Brain build as well as the new MCP-based RAG service powering the Slack bot. The same library will ship as a standalone package (working name `chunky`) so other indexing services can reuse it. We want a library that:
6
+
7
+ - Handles our common file types (Python, Markdown, JSON/YAML, plain text) with sensible defaults.
8
+ - Lets us plug in stronger semantic strategies (AST, embeddings) as optional enhancements.
9
+ - Keeps configuration centralized and easy to override via environment variables or config files.
10
+ - Produces consistent `Chunk` objects that slot directly into the indexing pipeline.
11
+
12
+ ## 2. Goals & Non-Goals
13
+
14
+ ### Goals
15
+ - Deterministic chunking for code and docs without pathological loops.
16
+ - Environment-driven configuration (e.g., tweak window sizes per build).
17
+ - Pipeline orchestration that picks the right chunker based on file metadata.
18
+ - Clear surface for future semantic/AST-based chunkers.
19
+
20
+ ### Non-Goals
21
+ - Building a full AST parser for every language on day one.
22
+ - Re-implementing vector stores or summarization; we only prepare text for indexing/summarizing.
23
+ - Handling binary formats such as PDFs (they stay outside this module).
24
+
25
+ ## 3. High-Level Architecture
26
+
27
+ ```
28
+ chunky/
29
+ ├── types.py # Chunk, Document, ChunkerConfig definitions
30
+ ├── core.py # Chunker protocol, ChunkingError
31
+ ├── chunkers/
32
+ │ ├── python.py # PythonSemanticChunker (AST-aware)
33
+ │ ├── markdown.py # MarkdownHeadingChunker
34
+ │ ├── yaml_json.py # JSONYamlChunker
35
+ │ ├── text.py # PlainTextChunker
36
+ │ └── fallback.py # SlidingWindowChunker
37
+ ├── registry.py # ChunkerRegistry + DEFAULT_REGISTRY
38
+ ├── loaders.py # DocumentLoader hierarchy
39
+ ├── pipeline.py # ChunkPipeline orchestrator
40
+ └── utils.py # Shared helpers (token counting, environment hooks)
41
+ ```
42
+
43
+ The code will live in the dedicated `chunky` package and be imported by Nancy Brain (and future MCP clients) like any other dependency. Keeping the chunker in its own package keeps agent scopes narrow and makes reuse easier.
44
+
45
+ ## 4. Core Concepts
46
+
47
+ - **Document**: normalized representation of a file with (path, content, metadata, language).
48
+ - **Chunk**: dataclass with `chunk_id`, `text`, `metadata` (JSON-serializable), `source_document`.
49
+ - **Chunker**: object exposing `chunk(document, config) -> List[Chunk]`.
50
+ - **ChunkerRegistry**: resolves the appropriate chunker for a document (by extension, language, or explicit override).
51
+ - **ChunkPipeline**: orchestrates loading, chunking, and optional summarization hooks.
52
+
53
+ ## 5. Chunker Implementations
54
+
55
+ Minimum viable set:
56
+
57
+ | Chunker | Description | Notes |
58
+ |---------|-------------|-------|
59
+ | `SlidingWindowChunker` | Simple fixed-line window with overlap | Always available; zero dependencies |
60
+ | `PythonSemanticChunker` | AST-based splitting on top-level functions/classes; falls back to window | Requires `ast` (built-in). Optionally `tree_sitter` later |
61
+ | `MarkdownHeadingChunker` | Breaks on heading hierarchy; merges small sections | No heavy deps |
62
+ | `JSONYamlChunker` | Treats top-level keys/arrays as chunks; flatten nested objects | Uses `json` / `yaml` |
63
+ | `PlainTextChunker` | Sentence/paragraph segmentation using regex or spaCy optional | Configurable sentence splitter |
64
+ | `SemanticEmbeddingChunker` (optional) | Embedding-based breakpoints (cosine drift) | Depends on configured embedding model; opt-in |
65
+ | `NotebookChunker` (via `nb4llm`) | Works with notebook-derived fenced text | Delegates heavy lifting to `nb4llm`; enforces Markdown/Python fence boundaries |
66
+
67
+ Each chunker adheres to the `Chunker` protocol and accepts a `ChunkerConfig`. The fallback chunker is always used last to guarantee progress.
68
+
69
+ ## 6. Configuration Strategy
70
+
71
+ - `ChunkerConfig` stores generic knobs (`max_chars`, `max_tokens`, `code_window_lines`, `code_overlap_lines`, `semantic_model`, etc.).
72
+ - Defaults come from environment variables (`SMART_CHUNK_CODE_LINES`, `SMART_CHUNK_CODE_OVERLAP`, `SMART_CHUNK_TEXT_CHARS`, `SEMANTIC_MODEL`) or a YAML file (`semantic_chunker.yaml`). For MCP deployments we also respect `MCP_CHUNKER_CONFIG`, pointing to a remote-friendly YAML/JSON config path.
73
+ - The pipeline allows per-call overrides, e.g., `pipeline.chunk_file(path, config=ChunkerConfig(code_window_lines=60))`.
74
+ - All chunkers attach useful metadata (`line_start`, `line_end`, `language`, optional `semantic_score`) so MCP clients and Nancy's Slack responses can surface precise citations.
75
+
76
+ ## 7. API Sketch
77
+
78
+ ```python
79
+ from chunky.pipeline import ChunkPipeline
80
+ from chunky.types import ChunkerConfig
81
+
82
+ pipeline = ChunkPipeline() # uses DEFAULT_REGISTRY
83
+
84
+ chunks = pipeline.chunk_file(
85
+ path="knowledge_base/raw/general_tools/Dazzle/dazzle/dazzle.py",
86
+ config=ChunkerConfig(
87
+ code_window_lines=80,
88
+ code_overlap_lines=10,
89
+ semantic_model=None, # disable embedding-based splits
90
+ ),
91
+ )
92
+
93
+ for chunk in chunks:
94
+ print(chunk.chunk_id, chunk.metadata["line_start"], chunk.metadata["line_end"])
95
+ ```
96
+
97
+ Pipeline steps internally:
98
+ 1. Load `Document` via registered loader.
99
+ 2. Resolve chunker from registry (falls back to sliding window).
100
+ 3. Invoke chunker with provided config.
101
+ 4. Optionally post-process (e.g., summarization hook, metadata enrichment).
102
+ 5. Return list of `Chunk` objects ready for indexing. Downstream consumers (Nancy Brain builders, MCP adapters, Slack bot citation tooling) rely on `chunk_id`, `source_document`, and line metadata to link answers back to source material.
103
+
104
+ ## 8. Integration with KB Build
105
+
106
+ - Replace direct `SmartChunker` usage in `scripts/build_knowledge_base.py` with `ChunkPipeline`.
107
+ - All metadata stays JSON-serializable; pipeline returns chunks with ready metadata.
108
+ - Existing environment flags (`SKIP_PDF_PROCESSING`, `NB_PER_FILE_LOG`) remain untouched.
109
+
110
+ ## 9. Extensibility Hooks
111
+
112
+ - `ChunkerRegistry.register(ext, chunker_cls)` to add new chunkers (e.g., notebook support).
113
+ - `ChunkPipeline` accepts custom registry or pre/post hooks (e.g., run summarizer on each chunk).
114
+ - Optional plugin entry points for projects to register chunkers via setuptools entry points.
115
+
116
+ ## 10. Risks & Mitigations
117
+
118
+ | Risk | Mitigation |
119
+ |------|------------|
120
+ | AST parser errors on malformed code | Catch exceptions, fall back to sliding window |
121
+ | Semantic chunker slows builds | Make embedding-driven chunker opt-in; default to cheapest strategy |
122
+ | Dependency bloat | Accepted for now: the heavy dependencies already live in Nancy Brain, and the environment can be recreated after a build. Plan assumes we can pull in whatever libraries make the chunkers accurate and fast; if anything starts to feel gratuitous later, we can revisit. |
123
+ | Inconsistent metadata | Centralize metadata construction utilities; reuse JSON serialization helpers |
124
+
125
+ ## 11. Implementation Plan
126
+
127
+ 1. **Phase 1 – Infrastructure**
128
+ - Define `Chunk`, `Document`, `ChunkerConfig`.
129
+ - Implement `SlidingWindowChunker` and registry/pipeline scaffold.
130
+ - Swap KB build to use pipeline with sliding window (parity with current behavior).
131
+
132
+ 2. **Phase 2 – Language-specific chunkers**
133
+ - Implement `PythonSemanticChunker`, `MarkdownHeadingChunker`, `JSONYamlChunker`, `PlainTextChunker`.
134
+ - Add tests covering line ranges, metadata, and fallback behavior.
135
+
136
+ 3. **Phase 3 – Semantic chunking (optional)**
137
+ - Prototype embedding-based chunker using existing sentence-transformer models.
138
+ - Benchmark build impact; keep behind feature flag.
139
+
140
+ 4. **Phase 4 – Documentation & Adoption**
141
+ - Document env vars/config file usage.
142
+ - Update KB pipeline guide in README/docs.
143
+ - Gather feedback, iterate on defaults.
144
+
145
+ ## 12. Testing Strategy
146
+
147
+ - Unit tests per chunker verifying chunk counts, metadata integrity, and fallback logic (see the sketch after this list).
148
+ - Golden-file tests comparing chunk outputs for representative code/docs.
149
+ - Integration test running pipeline on sample repo (like Dazzle) ensuring no timeouts or memory blowups.
150
+ - Benchmark harness to track runtime vs. file size.
151
+
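+ The per-chunker unit tests above might look like this (a sketch against the current `SlidingWindowChunker`):
+
+ ```python
+ from pathlib import Path
+
+ from chunky.chunkers import SlidingWindowChunker
+ from chunky.types import ChunkerConfig, Document
+
+ def test_sliding_window_line_ranges():
+     doc = Document(path=Path("sample.py"), content="\n".join(f"line {i}" for i in range(10)))
+     chunks = SlidingWindowChunker().chunk(doc, ChunkerConfig(lines_per_chunk=4, line_overlap=1))
+     assert chunks[0].metadata["line_start"] == 1
+     assert chunks[0].metadata["line_end"] == 4
+     assert chunks[1].metadata["line_start"] == 4  # one-line overlap with the previous chunk
+ ```
+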
152
+ ## 13. Open Questions
153
+
154
+ - Do we need per-language registries (e.g., `.py` vs `.pyi`)?
155
+ - Should summarization integrate directly into pipeline or remain in KB build script?
156
+ - How aggressively should we cache chunk results (hash by content) to avoid recomputation when files don’t change?
157
+ - How do we expose chunk metadata in downstream tools (UI, Slack bot) for debugging?
158
+
159
+ ---
160
+
161
+ **Next Steps:**
162
+ - Finalize `ChunkerConfig` shape and default values.
163
+ - Implement Phase 1 (sliding window + pipeline scaffolding).
164
+ - Write tests ensuring no regression against current KB build.
165
+ - Incrementally add higher-level chunkers in Phase 2.
@@ -0,0 +1,22 @@
1
+ .. chunky documentation master file
2
+
3
+ Welcome to Chunky's documentation!
4
+ ==================================
5
+
6
+ Chunky provides modular chunking primitives tailored for heterogeneous scientific repositories.
7
+ It is designed to serve both the Nancy Brain knowledge-base pipeline and any external RAG
8
+ pipelines that need deterministic, metadata-rich chunks.
9
+
10
+ .. toctree::
11
+ :maxdepth: 2
12
+ :caption: Contents
13
+
14
+ overview
15
+ api
16
+
17
+ Indices and tables
18
+ ==================
19
+
20
+ * :ref:`genindex`
21
+ * :ref:`modindex`
22
+ * :ref:`search`
@@ -0,0 +1,50 @@
1
+ Overview
2
+ ========
3
+
4
+ Chunky exposes a modular pipeline for converting heterogeneous project artefacts into
5
+ well-behaved text chunks. The pipeline is language-aware, pluggable, and ready for
6
+ Nancy Brain's MCP-backed retrieval workflows.
7
+
8
+ .. note::
9
+ The implementation is in active development. See ``SEMANTIC_CHUNKER.md`` for the full
10
+ design document and roadmap.
11
+
12
+ Getting Started
13
+ ---------------
14
+
15
+ Install the package from source:
16
+
17
+ .. code-block:: bash
18
+
19
+ git clone https://github.com/AmberLee2427/chunky.git
20
+ cd chunky
21
+ pip install .
22
+
23
+ For development work and documentation builds:
24
+
25
+ .. code-block:: bash
26
+
27
+ pip install -e ".[dev,docs]"
28
+
29
+ First chunks via the pipeline:
30
+
31
+ .. code-block:: python
32
+
33
+ from pathlib import Path
34
+
35
+ from chunky import ChunkPipeline, ChunkerConfig
36
+
37
+ pipeline = ChunkPipeline()
38
+ config = ChunkerConfig(lines_per_chunk=80, line_overlap=10)
39
+ chunks = pipeline.chunk_file(Path("/path/to/file.py"), config=config)
40
+
41
+ for chunk in chunks:
42
+ print(chunk.chunk_id, chunk.metadata["line_start"], chunk.metadata["line_end"])
43
+
44
+ Roadmap
45
+ -------
46
+
47
+ * Phase 1: infrastructure scaffolding and sliding-window baseline.
48
+ * Phase 2: language-specific chunkers (Python, Markdown, JSON/YAML, notebooks).
49
+ * Phase 3: semantic/embedding-driven chunking.
50
+ * Phase 4: documentation, benchmarks, and Nancy Brain integration.
@@ -0,0 +1,111 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.25"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "chunky-files"
7
+ version = "0.2.0"
8
+ description = "Semantic chunking utilities for scientific code and documentation corpora."
9
+ readme = "README.md"
10
+ authors = [
11
+ { name = "Nancy Brain Contributors" }
12
+ ]
13
+ license = { file = "LICENSE" }
14
+ requires-python = ">=3.8"
15
+ classifiers = [
16
+ "Development Status :: 3 - Alpha",
17
+ "Intended Audience :: Developers",
18
+ "Intended Audience :: Science/Research",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.8",
23
+ "Programming Language :: Python :: 3.9",
24
+ "Programming Language :: Python :: 3.10",
25
+ "Programming Language :: Python :: 3.11",
26
+ "Programming Language :: Python :: 3.12",
27
+ "Topic :: Scientific/Engineering",
28
+ "Topic :: Text Processing :: Linguistic",
29
+ ]
30
+ dependencies = []
31
+
32
+ [project.optional-dependencies]
33
+ dev = [
34
+ "pytest>=7",
35
+ "pytest-cov>=4",
36
+ "coverage[toml]>=7",
37
+ "ruff>=0.6",
38
+ "bump-my-version>=0.6",
39
+ "build"
40
+ ]
41
+ docs = [
42
+ "sphinx>=7",
43
+ "myst-parser>=2",
44
+ "furo>=2024.0.0",
45
+ ]
46
+
47
+ [project.urls]
48
+ Home = "https://github.com/AmberLee2427/chunky"
49
+ Documentation = "https://chunky.readthedocs.io/"
50
+ Issues = "https://github.com/AmberLee2427/chunky/issues"
51
+
52
+ [tool.hatch.version]
53
+ path = "src/chunky/__about__.py"
54
+
55
+ [tool.hatch.build.targets.sdist]
56
+ include = [
57
+ "src/**",
58
+ "docs/**",
59
+ "README.md",
60
+ "LICENSE",
61
+ "CHANGELOG.md",
62
+ "pyproject.toml",
63
+ ]
64
+
65
+ [tool.hatch.build.targets.wheel]
66
+ packages = ["src/chunky"]
67
+
68
+ [tool.hatch.envs.default]
69
+ dependencies = [
70
+ "pytest",
71
+ ]
72
+
73
+ [tool.pytest.ini_options]
74
+ minversion = "7.0"
75
+ addopts = "-ra --showlocals --strict-markers --strict-config"
76
+ testpaths = [
77
+ "tests",
78
+ ]
79
+
80
+
81
+ [tool.ruff]
82
+ line-length = 100
83
+ src = ["src", "tests"]
84
+
85
+ [tool.ruff.lint]
86
+ select = ["E", "F", "I", "B"]
87
+ ignore = ["E203"]
88
+
89
+ [tool.ruff.format]
90
+ quote-style = "double"
91
+ indent-style = "space"
92
+
93
+ [tool.bumpversion]
94
+ current_version = "0.2.0"
95
+ commit = true
96
+ message = "chore: bump version to {new_version}"
97
+
98
+ [[tool.bumpversion.files]]
99
+ filename = "pyproject.toml"
100
+ search = "version = \"{current_version}\""
101
+ replace = "version = \"{new_version}\""
102
+
103
+ [[tool.bumpversion.files]]
104
+ filename = "src/chunky/__about__.py"
105
+ search = "__version__ = \"{current_version}\""
106
+ replace = "__version__ = \"{new_version}\""
107
+
108
+ [[tool.bumpversion.files]]
109
+ filename = "CHANGELOG.md"
110
+ search = "## [Unreleased]"
111
+ replace = "## [Unreleased]\n\n## [{new_version}] - TBD"
@@ -0,0 +1,5 @@
1
+ """Project metadata."""
2
+
3
+ __all__ = ["__version__"]
4
+
5
+ __version__ = "0.2.0"
@@ -0,0 +1,13 @@
1
+ """Chunky: semantic chunking utilities for heterogeneous repositories."""
2
+
3
+ from .__about__ import __version__
4
+ from .pipeline import ChunkPipeline
5
+ from .types import Chunk, ChunkerConfig, Document
6
+
7
+ __all__ = [
8
+ "__version__",
9
+ "ChunkPipeline",
10
+ "Chunk",
11
+ "ChunkerConfig",
12
+ "Document",
13
+ ]
@@ -0,0 +1,5 @@
1
+ """Built-in chunker implementations."""
2
+
3
+ from .fallback import SlidingWindowChunker
4
+
5
+ __all__ = ["SlidingWindowChunker"]
@@ -0,0 +1,112 @@
1
+ """Sliding window fallback chunker."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ from ..core import Chunker
8
+ from ..types import Chunk, ChunkerConfig, Document
9
+
10
+
11
+ class SlidingWindowChunker(Chunker):
12
+ """Chunker that produces fixed-size line windows with optional overlap."""
13
+
14
+ def chunk(self, document: Document, config: ChunkerConfig) -> list[Chunk]:
15
+ lines = document.content.splitlines()
16
+ window = config.clamp_lines(config.lines_per_chunk)
17
+ overlap = config.clamp_overlap(config.line_overlap, window)
18
+
19
+ if not lines:
20
+ chunk_id = self._build_chunk_id(document.path, 0)
21
+ return [
22
+ Chunk(
23
+ chunk_id=chunk_id,
24
+ text="",
25
+ source_document=document.path,
26
+ metadata=self._chunk_metadata(
27
+ chunk_index=0,
28
+ line_start=0,
29
+ line_end=0,
30
+ span_start=0,
31
+ span_end=0,
32
+ config=config,
33
+ ),
34
+ )
35
+ ]
36
+
37
+ chunks: list[Chunk] = []
38
+ line_count = len(lines)
39
+ # Pre-compute character offsets once to avoid quadratic scans.
40
+ line_starts: list[int] = []
41
+ line_ends: list[int] = []
42
+ cursor = 0
43
+ for idx, line in enumerate(lines):
44
+ if idx > 0:
45
+ cursor += 1 # newline preceding this line
46
+ line_starts.append(cursor)
47
+ cursor += len(line)
48
+ line_ends.append(cursor)
49
+
50
+ start_line = 0
51
+ chunk_index = 0
52
+
53
+ while start_line < line_count:
54
+ previous_start = start_line
55
+ end_line = min(start_line + window, line_count)
56
+ text = "\n".join(lines[start_line:end_line])
57
+ chunk_id = self._build_chunk_id(document.path, chunk_index)
58
+ metadata = self._chunk_metadata(
59
+ chunk_index=chunk_index,
60
+ line_start=start_line + 1,
61
+ line_end=end_line,
62
+ span_start=line_starts[start_line],
63
+ span_end=line_ends[end_line - 1],
64
+ config=config,
65
+ )
66
+
67
+ chunks.append(
68
+ Chunk(
69
+ chunk_id=chunk_id,
70
+ text=text,
71
+ source_document=document.path,
72
+ metadata=metadata,
73
+ )
74
+ )
75
+
76
+ chunk_index += 1
77
+ if config.max_chunks and chunk_index >= config.max_chunks:
78
+ break
79
+
80
+ if end_line >= line_count:
81
+ break
82
+
83
+ next_start = end_line - overlap
84
+ if next_start <= previous_start:
85
+ next_start = end_line
86
+ start_line = next_start
87
+
88
+ return chunks
89
+
90
+ @staticmethod
91
+ def _build_chunk_id(path: Path, index: int) -> str:
92
+ return f"{path}::chunk-{index}"
93
+
94
+ @staticmethod
95
+ def _chunk_metadata(
96
+ chunk_index: int,
97
+ line_start: int,
98
+ line_end: int,
99
+ span_start: int,
100
+ span_end: int,
101
+ config: ChunkerConfig,
102
+ ) -> dict[str, int | str]:
103
+ metadata: dict[str, int | str] = {
104
+ "chunk_index": chunk_index,
105
+ "line_start": line_start,
106
+ "line_end": line_end,
107
+ "span_start": span_start,
108
+ "span_end": span_end,
109
+ }
110
+ if config.metadata:
111
+ metadata.update(config.metadata)
112
+ return metadata
@@ -0,0 +1,22 @@
1
+ """Core interfaces and exceptions for chunkers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Protocol, runtime_checkable
6
+
7
+ from .types import Chunk, ChunkerConfig, Document
8
+
9
+
10
+ class ChunkingError(RuntimeError):
11
+ """Raised when a chunker cannot process the provided document."""
12
+
13
+
14
+ @runtime_checkable
15
+ class Chunker(Protocol):
16
+ """Protocol implemented by all chunkers."""
17
+
18
+ def chunk(self, document: Document, config: ChunkerConfig) -> list[Chunk]:
19
+ """Return a list of chunks for the given document."""
20
+
21
+
22
+ __all__ = ["ChunkingError", "Chunker"]
@@ -0,0 +1,29 @@
1
+ """Document loaders."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Protocol
7
+
8
+ from .types import Document
9
+
10
+
11
+ class DocumentLoader(Protocol):
12
+ """Protocol for converting files into :class:`Document` instances."""
13
+
14
+ def load(self, path: Path) -> Document:
15
+ ...
16
+
17
+
18
+ class FileSystemLoader:
19
+ """Loader that reads text files from disk."""
20
+
21
+ def load(self, path: Path) -> Document:
22
+ content = path.read_text(encoding="utf-8")
23
+ return Document(path=path, content=content)
24
+
25
+
26
+ DEFAULT_LOADER = FileSystemLoader()
27
+
28
+
29
+ __all__ = ["DocumentLoader", "FileSystemLoader", "DEFAULT_LOADER"]
@@ -0,0 +1,59 @@
1
+ """High-level orchestration for chunking documents."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ from .chunkers import SlidingWindowChunker
9
+ from .loaders import DEFAULT_LOADER, DocumentLoader
10
+ from .registry import DEFAULT_REGISTRY, ChunkerRegistry
11
+ from .types import Chunk, ChunkerConfig, Document
12
+
13
+
14
+ class ChunkPipeline:
15
+ """Pipeline that orchestrates document loading and chunking."""
16
+
17
+ def __init__(
18
+ self,
19
+ *,
20
+ registry: Optional[ChunkerRegistry] = None,
21
+ loader: Optional[DocumentLoader] = None,
22
+ ) -> None:
23
+ self.registry = registry or DEFAULT_REGISTRY
24
+ self.loader = loader or DEFAULT_LOADER
25
+ self._ensure_fallback()
26
+
27
+ def chunk_file(
28
+ self,
29
+ path: Path | str,
30
+ *,
31
+ config: Optional[ChunkerConfig] = None,
32
+ ) -> list[Chunk]:
33
+ """Chunk a file from disk."""
34
+
35
+ config = config or ChunkerConfig()
36
+ document = self.loader.load(Path(path))
37
+ chunker = self.registry.get(document.path)
38
+ return chunker.chunk(document, config)
39
+
40
+ def chunk_documents(
41
+ self,
42
+ documents: list[Document],
43
+ *,
44
+ config: Optional[ChunkerConfig] = None,
45
+ ) -> list[Chunk]:
46
+ """Chunk pre-loaded documents."""
47
+
48
+ config = config or ChunkerConfig()
49
+ chunks: list[Chunk] = []
50
+ for document in documents:
51
+ chunker = self.registry.get(document.path)
52
+ chunks.extend(chunker.chunk(document, config))
53
+ return chunks
54
+
55
+ def _ensure_fallback(self) -> None:
56
+ try:
57
+ self.registry.get(Path("__dummy__"))
58
+ except KeyError:
59
+ self.registry.set_fallback(SlidingWindowChunker())
@@ -0,0 +1,60 @@
1
+ """Chunker registry responsible for resolving the appropriate implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+ from typing import Iterable, MutableMapping
7
+
8
+ from .core import Chunker
9
+
10
+
11
+ class ChunkerRegistry:
12
+ """Runtime registry mapping file extensions to chunkers."""
13
+
14
+ def __init__(self) -> None:
15
+ self._registry: MutableMapping[str, Chunker] = {}
16
+ self._fallback: Chunker | None = None
17
+
18
+ def register(
19
+ self,
20
+ extensions: Iterable[str] | str,
21
+ chunker: Chunker,
22
+ *,
23
+ overwrite: bool = False,
24
+ ) -> None:
25
+ """Register a chunker for one or more extensions."""
26
+
27
+ if isinstance(extensions, str):
28
+ extensions = [extensions]
29
+
30
+ for ext in extensions:
31
+ key = self._normalize(ext)
32
+ if not overwrite and key in self._registry:
33
+ raise ValueError(f"Chunker already registered for extension '{ext}'")
34
+ self._registry[key] = chunker
35
+
36
+ def set_fallback(self, chunker: Chunker) -> None:
37
+ """Set the fallback chunker used for unknown extensions."""
38
+
39
+ self._fallback = chunker
40
+
41
+ def get(self, path: Path) -> Chunker:
42
+ """Return the chunker associated with the file path or the fallback."""
43
+
44
+ suffix = self._normalize(path.suffix or "")
45
+ chunker = self._registry.get(suffix)
46
+ if chunker is not None:
47
+ return chunker
48
+ if self._fallback is None:
49
+ raise KeyError(f"No chunker registered for {suffix!r} and no fallback configured")
50
+ return self._fallback
51
+
52
+ @staticmethod
53
+ def _normalize(extension: str) -> str:
54
+ return extension.lower().lstrip(".")
55
+
56
+
57
+ DEFAULT_REGISTRY = ChunkerRegistry()
58
+
59
+
60
+ __all__ = ["ChunkerRegistry", "DEFAULT_REGISTRY"]
@@ -0,0 +1,49 @@
1
+ """Core data structures for the semantic chunking pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from pathlib import Path
7
+ from typing import Any, Dict, Optional
8
+
9
+
10
+ @dataclass
11
+ class Document:
12
+ """Normalized representation of a file to be chunked."""
13
+
14
+ path: Path
15
+ content: str
16
+ language: Optional[str] = None
17
+ metadata: Dict[str, Any] = field(default_factory=dict)
18
+
19
+
20
+ @dataclass
21
+ class Chunk:
22
+ """A chunk of text ready for downstream indexing."""
23
+
24
+ chunk_id: str
25
+ text: str
26
+ source_document: Path
27
+ metadata: Dict[str, Any] = field(default_factory=dict)
28
+
29
+
30
+ @dataclass
31
+ class ChunkerConfig:
32
+ """Configuration shared across chunkers."""
33
+
34
+ max_chars: int = 2000
35
+ lines_per_chunk: int = 120
36
+ line_overlap: int = 20
37
+ max_chunks: Optional[int] = None
38
+ metadata: Dict[str, Any] = field(default_factory=dict)
39
+
40
+ def clamp_lines(self, lines: int) -> int:
41
+ """Clamp the requested line count to a sensible positive value."""
42
+
43
+ return max(1, lines)
44
+
45
+ def clamp_overlap(self, overlap: int, window: int) -> int:
46
+ """Clamp overlap so it cannot exceed the window size."""
47
+
48
+ overlap = max(0, overlap)
49
+ return min(overlap, max(0, window - 1))