chunky-files 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chunky_files-0.2.0/.gitignore +47 -0
- chunky_files-0.2.0/CHANGELOG.md +30 -0
- chunky_files-0.2.0/LICENSE +21 -0
- chunky_files-0.2.0/PKG-INFO +145 -0
- chunky_files-0.2.0/README.md +88 -0
- chunky_files-0.2.0/docs/_static/.gitkeep +0 -0
- chunky_files-0.2.0/docs/_templates/.gitkeep +0 -0
- chunky_files-0.2.0/docs/api.rst +9 -0
- chunky_files-0.2.0/docs/conf.py +50 -0
- chunky_files-0.2.0/docs/design/GOOGLE_RESULTS.md +116 -0
- chunky_files-0.2.0/docs/design/SEMANTIC_CHUNKER.md +165 -0
- chunky_files-0.2.0/docs/index.rst +22 -0
- chunky_files-0.2.0/docs/overview.rst +50 -0
- chunky_files-0.2.0/pyproject.toml +111 -0
- chunky_files-0.2.0/src/chunky/__about__.py +5 -0
- chunky_files-0.2.0/src/chunky/__init__.py +13 -0
- chunky_files-0.2.0/src/chunky/chunkers/__init__.py +5 -0
- chunky_files-0.2.0/src/chunky/chunkers/fallback.py +112 -0
- chunky_files-0.2.0/src/chunky/core.py +22 -0
- chunky_files-0.2.0/src/chunky/loaders.py +29 -0
- chunky_files-0.2.0/src/chunky/pipeline.py +59 -0
- chunky_files-0.2.0/src/chunky/registry.py +60 -0
- chunky_files-0.2.0/src/chunky/types.py +49 -0
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# Distribution / packaging
|
|
7
|
+
build/
|
|
8
|
+
dist/
|
|
9
|
+
*.egg-info/
|
|
10
|
+
.eggs/
|
|
11
|
+
|
|
12
|
+
# Installer logs
|
|
13
|
+
pip-log.txt
|
|
14
|
+
pip-delete-this-directory.txt
|
|
15
|
+
|
|
16
|
+
# Unit test / coverage reports
|
|
17
|
+
htmlcov/
|
|
18
|
+
.tox/
|
|
19
|
+
.coverage
|
|
20
|
+
.coverage.*
|
|
21
|
+
.cache
|
|
22
|
+
.pytest_cache/
|
|
23
|
+
.coverage
|
|
24
|
+
coverage.xml
|
|
25
|
+
|
|
26
|
+
# Sphinx build artifacts
|
|
27
|
+
docs/_build/
|
|
28
|
+
|
|
29
|
+
# IDEs and editors
|
|
30
|
+
.vscode/
|
|
31
|
+
.idea/
|
|
32
|
+
*.swp
|
|
33
|
+
|
|
34
|
+
# macOS
|
|
35
|
+
.DS_Store
|
|
36
|
+
|
|
37
|
+
# Hatch environments
|
|
38
|
+
.hatch/
|
|
39
|
+
|
|
40
|
+
# Environment file
|
|
41
|
+
.env
|
|
42
|
+
.venv/
|
|
43
|
+
env/
|
|
44
|
+
venv/
|
|
45
|
+
ENV/
|
|
46
|
+
env.bak/
|
|
47
|
+
venv.bak/
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.2.0] - TBD
|
|
11
|
+
### Added
|
|
12
|
+
- Changelog (`CHANGELOG.md`; this file).
|
|
13
|
+
- Release process section added to the existing `README.md`
|
|
14
|
+
- `PYPI_TOKEN`, `TEST_PYPI_TOKEN`, and `CODECOV_TOKEN` added to github secrets
|
|
15
|
+
- `.env` and other common environment file names added to the `.gitignore` for token security.
|
|
16
|
+
### Changed
|
|
17
|
+
- Release workflow updated to have matching secret names.
|
|
18
|
+
### Fixed
|
|
19
|
+
- Updated dependencies and improved type hints in the codebase (ruff compliance).
|
|
20
|
+
- Updated build tooling installation in the release workflow.
|
|
21
|
+
- Included pyproject.toml in sdist build targets.
|
|
22
|
+
|
|
23
|
+
## [0.1.0] - 2025-09-30
|
|
24
|
+
### Added
|
|
25
|
+
- Initial project scaffolding with Hatchling build system and CI/release workflows.
|
|
26
|
+
- Core chunking data models (`Document`, `Chunk`, `ChunkerConfig`).
|
|
27
|
+
- Sliding-window fallback chunker with metadata-rich outputs.
|
|
28
|
+
- `ChunkPipeline` orchestration, registry, and filesystem loader.
|
|
29
|
+
- Sphinx documentation skeleton and Read the Docs configuration.
|
|
30
|
+
- Pytest and Ruff tooling with baseline tests for the sliding-window chunker.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Nancy Brain Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chunky-files
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Semantic chunking utilities for scientific code and documentation corpora.
|
|
5
|
+
Project-URL: Home, https://github.com/AmberLee2427/chunky
|
|
6
|
+
Project-URL: Documentation, https://chunky.readthedocs.io/
|
|
7
|
+
Project-URL: Issues, https://github.com/AmberLee2427/chunky/issues
|
|
8
|
+
Author: Nancy Brain Contributors
|
|
9
|
+
License: MIT License
|
|
10
|
+
|
|
11
|
+
Copyright (c) 2024 Nancy Brain Contributors
|
|
12
|
+
|
|
13
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
14
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
15
|
+
in the Software without restriction, including without limitation the rights
|
|
16
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
17
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
18
|
+
furnished to do so, subject to the following conditions:
|
|
19
|
+
|
|
20
|
+
The above copyright notice and this permission notice shall be included in all
|
|
21
|
+
copies or substantial portions of the Software.
|
|
22
|
+
|
|
23
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
24
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
25
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
26
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
27
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
28
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
29
|
+
SOFTWARE.
|
|
30
|
+
License-File: LICENSE
|
|
31
|
+
Classifier: Development Status :: 3 - Alpha
|
|
32
|
+
Classifier: Intended Audience :: Developers
|
|
33
|
+
Classifier: Intended Audience :: Science/Research
|
|
34
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
35
|
+
Classifier: Programming Language :: Python
|
|
36
|
+
Classifier: Programming Language :: Python :: 3
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
40
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
41
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
42
|
+
Classifier: Topic :: Scientific/Engineering
|
|
43
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
44
|
+
Requires-Python: >=3.8
|
|
45
|
+
Provides-Extra: dev
|
|
46
|
+
Requires-Dist: build; extra == 'dev'
|
|
47
|
+
Requires-Dist: bump-my-version>=0.6; extra == 'dev'
|
|
48
|
+
Requires-Dist: coverage[toml]>=7; extra == 'dev'
|
|
49
|
+
Requires-Dist: pytest-cov>=4; extra == 'dev'
|
|
50
|
+
Requires-Dist: pytest>=7; extra == 'dev'
|
|
51
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
52
|
+
Provides-Extra: docs
|
|
53
|
+
Requires-Dist: furo>=2024.0.0; extra == 'docs'
|
|
54
|
+
Requires-Dist: myst-parser>=2; extra == 'docs'
|
|
55
|
+
Requires-Dist: sphinx>=7; extra == 'docs'
|
|
56
|
+
Description-Content-Type: text/markdown
|
|
57
|
+
|
|
58
|
+
# Chunky
|
|
59
|
+
|
|
60
|
+
Chunky is a python package for intelligently chunking scientific and technical repositories.
|
|
61
|
+
It provides a modular pipeline that powers the Nancy Brain knowledge base and MCP services,
|
|
62
|
+
while remaining useful as a standalone library for retrieval systems that need deterministic,
|
|
63
|
+
metadata-rich chunks.
|
|
64
|
+
|
|
65
|
+
Documentation lives on Read the Docs: <https://chunky.readthedocs.io>
|
|
66
|
+
|
|
67
|
+
## Installation
|
|
68
|
+
|
|
69
|
+
Install from source using the `pyproject.toml` metadata:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
# clone the repo (if you haven't already)
|
|
73
|
+
git clone https://github.com/AmberLee2427/chunky.git
|
|
74
|
+
cd chunky
|
|
75
|
+
|
|
76
|
+
# install the library
|
|
77
|
+
pip install .
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
For development and documentation builds, install the optional extras:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
pip install -e ".[dev,docs]"
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
> `-e` performs an editable install so local changes reflect immediately.
|
|
87
|
+
> `.[dev,docs]` installs the tooling declared under the `dev` and `docs` extras in
|
|
88
|
+
> `pyproject.toml`.
|
|
89
|
+
|
|
90
|
+
## Tooling
|
|
91
|
+
|
|
92
|
+
* **Code style:** Ruff (`ruff check src tests` or `ruff check src tests --fix`)
|
|
93
|
+
* **Tests:** Pytest (`pytest --cov=chunky`)
|
|
94
|
+
* **Docs:** Sphinx + MyST + Furo (`sphinx-build -b html docs docs/_build/html`)
|
|
95
|
+
* **Packaging:** Hatchling build backend
|
|
96
|
+
* **Versioning:** bump-my-version (driven by tags and the release workflow)
|
|
97
|
+
|
|
98
|
+
## Workflows
|
|
99
|
+
|
|
100
|
+
* CI tests run on Linux, macOS, and Windows for Python 3.8 through 3.12.
|
|
101
|
+
* Pushing a tag that matches the form `vX.Y.Z` triggers the release workflow. It validates that the
|
|
102
|
+
tag matches the version in `pyproject.toml`, builds the distribution, and publishes to PyPI using
|
|
103
|
+
the `PYPI_API_TOKEN` secret.
|
|
104
|
+
* Read the Docs builds the documentation automatically for pushes to the default branch. Local
|
|
105
|
+
builds use `sphinx-build -b html docs docs/_build/html`.
|
|
106
|
+
|
|
107
|
+
Release checklist:
|
|
108
|
+
|
|
109
|
+
1. Review and update `CHANGELOG.md`, keeping the `[Unreleased]` section accurate.
|
|
110
|
+
2. Run `bump-my-version bump <part>` to update version metadata and append a dated entry in the
|
|
111
|
+
changelog.
|
|
112
|
+
3. Commit the changes and push to `main`.
|
|
113
|
+
4. Tag the commit (`git tag vX.Y.Z && git push origin vX.Y.Z`) to trigger the Release workflow.
|
|
114
|
+
5. Verify the PyPI publish job and Read the Docs build succeed.
|
|
115
|
+
|
|
116
|
+
## Contributing
|
|
117
|
+
|
|
118
|
+
* Know your audience: most contributors will be scientific coders. Write docs assuming limited
|
|
119
|
+
familiarity with packaging internals.
|
|
120
|
+
* Use Ruff for style checks and keep numpy-style docstrings on all non-test functions.
|
|
121
|
+
* Target test coverage above 70% and ensure existing CI jobs pass before opening a PR.
|
|
122
|
+
* In pull requests, summarise code changes, testing/validation, doc updates, and provide a brief
|
|
123
|
+
TL;DR when the description runs long.
|
|
124
|
+
|
|
125
|
+
## License
|
|
126
|
+
|
|
127
|
+
Chunky is released under the [MIT License](LICENSE).
|
|
128
|
+
|
|
129
|
+
## Glossary
|
|
130
|
+
|
|
131
|
+
| Term | Meaning |
|
|
132
|
+
| ---- | ------- |
|
|
133
|
+
| PR | GitHub pull request – a request to merge one branch or fork with another |
|
|
134
|
+
| Release | Publishing a tagged version of the project to PyPI |
|
|
135
|
+
| ChangeLog | A document describing changes between releases |
|
|
136
|
+
| PyPI | Python Package Index – where published distributions live |
|
|
137
|
+
| Ruff | A fast Python linter/formatter used for style enforcement |
|
|
138
|
+
| origin | The upstream GitHub repository |
|
|
139
|
+
| fork | A downstream copy of the origin repo used for contributing |
|
|
140
|
+
| master/main | The default branch |
|
|
141
|
+
| CI | Continuous Integration – automated checks that run on every push/PR |
|
|
142
|
+
| GitHub Workflows | GitHub’s automation runner configured via YAML files |
|
|
143
|
+
| `pyproject.toml` | Core metadata and build configuration for the package |
|
|
144
|
+
| bump-my-version | CLI used to bump version numbers consistently |
|
|
145
|
+
| Read the Docs | Hosted documentation service that builds from the repo |
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# Chunky
|
|
2
|
+
|
|
3
|
+
Chunky is a python package for intelligently chunking scientific and technical repositories.
|
|
4
|
+
It provides a modular pipeline that powers the Nancy Brain knowledge base and MCP services,
|
|
5
|
+
while remaining useful as a standalone library for retrieval systems that need deterministic,
|
|
6
|
+
metadata-rich chunks.
|
|
7
|
+
|
|
8
|
+
Documentation lives on Read the Docs: <https://chunky.readthedocs.io>
|
|
9
|
+
|
|
10
|
+
## Installation
|
|
11
|
+
|
|
12
|
+
Install from source using the `pyproject.toml` metadata:
|
|
13
|
+
|
|
14
|
+
```bash
|
|
15
|
+
# clone the repo (if you haven't already)
|
|
16
|
+
git clone https://github.com/AmberLee2427/chunky.git
|
|
17
|
+
cd chunky
|
|
18
|
+
|
|
19
|
+
# install the library
|
|
20
|
+
pip install .
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
For development and documentation builds, install the optional extras:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
pip install -e ".[dev,docs]"
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
> `-e` performs an editable install so local changes reflect immediately.
|
|
30
|
+
> `.[dev,docs]` installs the tooling declared under the `dev` and `docs` extras in
|
|
31
|
+
> `pyproject.toml`.
|
|
32
|
+
|
|
33
|
+
## Tooling
|
|
34
|
+
|
|
35
|
+
* **Code style:** Ruff (`ruff check src tests` or `ruff check src tests --fix`)
|
|
36
|
+
* **Tests:** Pytest (`pytest --cov=chunky`)
|
|
37
|
+
* **Docs:** Sphinx + MyST + Furo (`sphinx-build -b html docs docs/_build/html`)
|
|
38
|
+
* **Packaging:** Hatchling build backend
|
|
39
|
+
* **Versioning:** bump-my-version (driven by tags and the release workflow)
|
|
40
|
+
|
|
41
|
+
## Workflows
|
|
42
|
+
|
|
43
|
+
* CI tests run on Linux, macOS, and Windows for Python 3.8 through 3.12.
|
|
44
|
+
* Pushing a tag that matches the form `vX.Y.Z` triggers the release workflow. It validates that the
|
|
45
|
+
tag matches the version in `pyproject.toml`, builds the distribution, and publishes to PyPI using
|
|
46
|
+
the `PYPI_API_TOKEN` secret.
|
|
47
|
+
* Read the Docs builds the documentation automatically for pushes to the default branch. Local
|
|
48
|
+
builds use `sphinx-build -b html docs docs/_build/html`.
|
|
49
|
+
|
|
50
|
+
Release checklist:
|
|
51
|
+
|
|
52
|
+
1. Review and update `CHANGELOG.md`, keeping the `[Unreleased]` section accurate.
|
|
53
|
+
2. Run `bump-my-version bump <part>` to update version metadata and append a dated entry in the
|
|
54
|
+
changelog.
|
|
55
|
+
3. Commit the changes and push to `main`.
|
|
56
|
+
4. Tag the commit (`git tag vX.Y.Z && git push origin vX.Y.Z`) to trigger the Release workflow.
|
|
57
|
+
5. Verify the PyPI publish job and Read the Docs build succeed.
|
|
58
|
+
|
|
59
|
+
## Contributing
|
|
60
|
+
|
|
61
|
+
* Know your audience: most contributors will be scientific coders. Write docs assuming limited
|
|
62
|
+
familiarity with packaging internals.
|
|
63
|
+
* Use Ruff for style checks and keep numpy-style docstrings on all non-test functions.
|
|
64
|
+
* Target test coverage above 70% and ensure existing CI jobs pass before opening a PR.
|
|
65
|
+
* In pull requests, summarise code changes, testing/validation, doc updates, and provide a brief
|
|
66
|
+
TL;DR when the description runs long.
|
|
67
|
+
|
|
68
|
+
## License
|
|
69
|
+
|
|
70
|
+
Chunky is released under the [MIT License](LICENSE).
|
|
71
|
+
|
|
72
|
+
## Glossary
|
|
73
|
+
|
|
74
|
+
| Term | Meaning |
|
|
75
|
+
| ---- | ------- |
|
|
76
|
+
| PR | GitHub pull request – a request to merge one branch or fork with another |
|
|
77
|
+
| Release | Publishing a tagged version of the project to PyPI |
|
|
78
|
+
| ChangeLog | A document describing changes between releases |
|
|
79
|
+
| PyPI | Python Package Index – where published distributions live |
|
|
80
|
+
| Ruff | A fast Python linter/formatter used for style enforcement |
|
|
81
|
+
| origin | The upstream GitHub repository |
|
|
82
|
+
| fork | A downstream copy of the origin repo used for contributing |
|
|
83
|
+
| master/main | The default branch |
|
|
84
|
+
| CI | Continuous Integration – automated checks that run on every push/PR |
|
|
85
|
+
| GitHub Workflows | GitHub’s automation runner configured via YAML files |
|
|
86
|
+
| `pyproject.toml` | Core metadata and build configuration for the package |
|
|
87
|
+
| bump-my-version | CLI used to bump version numbers consistently |
|
|
88
|
+
| Read the Docs | Hosted documentation service that builds from the repo |
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Sphinx configuration for chunky documentation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import importlib.metadata
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
|
|
10
|
+
PROJECT_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
|
11
|
+
SRC_DIR = os.path.join(PROJECT_ROOT, "src")
|
|
12
|
+
if SRC_DIR not in sys.path:
|
|
13
|
+
sys.path.insert(0, SRC_DIR)
|
|
14
|
+
|
|
15
|
+
project = "chunky"
|
|
16
|
+
copyright = f"{datetime.now():%Y}, Nancy Brain Contributors"
|
|
17
|
+
|
|
18
|
+
try:
|
|
19
|
+
release = importlib.metadata.version("chunky")
|
|
20
|
+
except importlib.metadata.PackageNotFoundError:
|
|
21
|
+
from chunky.__about__ import __version__ as release # type: ignore[assignment]
|
|
22
|
+
|
|
23
|
+
extensions = [
|
|
24
|
+
"sphinx.ext.autodoc",
|
|
25
|
+
"sphinx.ext.napoleon",
|
|
26
|
+
"sphinx.ext.autosummary",
|
|
27
|
+
"sphinx.ext.intersphinx",
|
|
28
|
+
"myst_parser",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
autosummary_generate = True
|
|
32
|
+
napoleon_google_docstring = False
|
|
33
|
+
napoleon_use_param = True
|
|
34
|
+
napoleon_use_rtype = True
|
|
35
|
+
|
|
36
|
+
html_theme = os.environ.get("SPHINX_HTML_THEME", "furo")
|
|
37
|
+
html_static_path = ["_static"]
|
|
38
|
+
|
|
39
|
+
intersphinx_mapping = {
|
|
40
|
+
"python": ("https://docs.python.org/3", {}),
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
myst_enable_extensions = [
|
|
44
|
+
"colon_fence",
|
|
45
|
+
"deflist",
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
templates_path = ["_templates"]
|
|
49
|
+
|
|
50
|
+
exclude_patterns: list[str] = ["_build", "Thumbs.db", ".DS_Store"]
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
## Semantic code chunking
|
|
2
|
+
|
|
3
|
+
Semantic code file chunking in Python involves splitting a code file into meaningful, self-contained units based on its structure and semantics, rather than just arbitrary character counts. This approach aims to create chunks that represent logical components like functions, classes, or distinct blocks of code, improving the effectiveness of operations like embedding for RAG pipelines or code analysis.
|
|
4
|
+
|
|
5
|
+
Here's how you can achieve this in Python:
|
|
6
|
+
|
|
7
|
+
1. **Using Language-Specific Text Splitters:**
|
|
8
|
+
Libraries like LangChain offer specialized text splitters for different programming languages.
|
|
9
|
+
```python
|
|
10
|
+
from langchain_experimental.text_splitter import PythonCodeTextSplitter
|
|
11
|
+
|
|
12
|
+
# Initialize the splitter
|
|
13
|
+
python_splitter = PythonCodeTextSplitter()
|
|
14
|
+
|
|
15
|
+
# Split the code
|
|
16
|
+
code_chunks = python_splitter.split_text(your_python_code_string)
|
|
17
|
+
```
|
|
18
|
+
This `PythonCodeTextSplitter` is designed to understand Python syntax and split based on elements like function definitions, class definitions, and other structural components. Similar splitters exist for other languages.
|
|
19
|
+
|
|
20
|
+
2. **Utilizing Tree-Sitter for AST-based Chunking:**
|
|
21
|
+
Tree-sitter is a parsing library that can generate Abstract Syntax Trees (ASTs) for various programming languages. You can leverage this to identify semantic boundaries more precisely.
|
|
22
|
+
|
|
23
|
+
```python
|
|
24
|
+
# Example using a conceptual tree-sitter based approach
|
|
25
|
+
# (Requires a tree-sitter parser for Python)
|
|
26
|
+
from tree_sitter import Language, Parser
|
|
27
|
+
|
|
28
|
+
# Load the Python language parser (you'd need to compile it first)
|
|
29
|
+
Language.build_library('build/my-languages.so', ['path/to/tree-sitter-python'])
|
|
30
|
+
PYTHON_LANGUAGE = Language('build/my-languages.so', 'python')
|
|
31
|
+
|
|
32
|
+
parser = Parser()
|
|
33
|
+
parser.set_language(PYTHON_LANGUAGE)
|
|
34
|
+
|
|
35
|
+
tree = parser.parse(bytes(your_python_code_string, 'utf8'))
|
|
36
|
+
|
|
37
|
+
# Traverse the AST to identify meaningful nodes (e.g., function definitions, class definitions)
|
|
38
|
+
# and extract their corresponding code snippets as chunks.
|
|
39
|
+
```
|
|
40
|
+
This method offers fine-grained control over chunking based on the exact structure of the code, but requires more manual implementation to define how AST nodes translate into chunks.
|
|
41
|
+
3. **Combining Semantic and Heuristic Approaches:**
|
|
42
|
+
You can also combine semantic understanding with more traditional heuristic rules, such as splitting by multiple newlines or specific keywords, to create robust chunking strategies.
|
|
43
|
+
|
|
44
|
+
Considerations for Semantic Code Chunking:
|
|
45
|
+
• **Granularity:** Decide on the appropriate level of granularity for your chunks (e.g., entire functions, individual statements, or logical blocks within functions).
|
|
46
|
+
• **Context:** Ensure that each chunk retains enough context to be meaningful on its own, especially for tasks like embedding and retrieval.
|
|
47
|
+
• **Language Specificity:** The best chunking strategy often depends on the specific programming language and its conventions.
|
|
48
|
+
|
|
49
|
+
AI responses may include mistakes.
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Semantic file chunking
|
|
54
|
+
|
|
55
|
+
Semantic file chunking in Python involves splitting a document into meaningful segments based on the semantic relatedness of its content, rather than fixed-size or character-based methods. This approach aims to keep semantically coherent information together within a single chunk, which can be beneficial for tasks like Retrieval Augmented Generation (RAG) in Large Language Models (LLMs).
|
|
56
|
+
|
|
57
|
+
Here's how you can perform semantic chunking in Python:
|
|
58
|
+
|
|
59
|
+
1. **Using LlamaIndex's Semantic Splitter:**
|
|
60
|
+
LlamaIndex provides a SemanticSplitterNodeParser designed for semantic chunking.
|
|
61
|
+
```python
|
|
62
|
+
from llama_index.node_parser import SemanticSplitterNodeParser
|
|
63
|
+
from llama_index.embeddings import OpenAIEmbedding
|
|
64
|
+
|
|
65
|
+
# Initialize the embedding model (e.g., OpenAIEmbeddings)
|
|
66
|
+
embed_model = OpenAIEmbedding()
|
|
67
|
+
|
|
68
|
+
# Initialize the semantic splitter
|
|
69
|
+
# `buffer_size` determines how many sentences to consider for similarity comparison
|
|
70
|
+
# `breakpoint_percentile_threshold` controls the sensitivity of splitting
|
|
71
|
+
splitter = SemanticSplitterNodeParser(
|
|
72
|
+
buffer_size=1, breakpoint_percentile_threshold=95, embed_model=embed_model
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
# Load your document (e.g., from a file)
|
|
76
|
+
# Example: text = "Your long document text here..."
|
|
77
|
+
# Or use LlamaIndex's SimpleDirectoryReader to load documents from a directory
|
|
78
|
+
|
|
79
|
+
# Parse the document into nodes (chunks)
|
|
80
|
+
nodes = splitter.get_nodes_from_documents([document]) # Replace 'document' with your LlamaIndex Document object
|
|
81
|
+
|
|
82
|
+
# Access the content of the semantic chunks
|
|
83
|
+
for node in nodes:
|
|
84
|
+
print(node.text)
|
|
85
|
+
```
|
|
86
|
+
2. **Using LangChain's Semantic Chunking (Experimental):**
|
|
87
|
+
LangChain also offers an experimental SemanticChunker within langchain_experimental.
|
|
88
|
+
```python
|
|
89
|
+
from langchain_experimental.text_splitter import SemanticChunker
|
|
90
|
+
from langchain_openai.embeddings import OpenAIEmbeddings
|
|
91
|
+
|
|
92
|
+
# Initialize the embedding model
|
|
93
|
+
embeddings = OpenAIEmbeddings()
|
|
94
|
+
|
|
95
|
+
# Initialize the semantic chunker
|
|
96
|
+
semantic_chunker = SemanticChunker(embeddings, breakpoint_threshold_type="percentile")
|
|
97
|
+
|
|
98
|
+
# Split your text into semantic chunks
|
|
99
|
+
text = "Your long document text here..."
|
|
100
|
+
chunks = semantic_chunker.split_text(text)
|
|
101
|
+
|
|
102
|
+
for chunk in chunks:
|
|
103
|
+
print(chunk)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Key Concepts in Semantic Chunking:
|
|
107
|
+
• **Embeddings:** Text is converted into numerical vector representations (embeddings) that capture its semantic meaning.
|
|
108
|
+
• **Similarity Measurement:** The similarity between embeddings of adjacent sentences or segments is calculated (e.g., using cosine similarity).
|
|
109
|
+
• **Breakpoint Threshold:** A threshold is used to identify points where the semantic similarity drops significantly, indicating a natural break point for a new chunk. This can be based on percentiles, standard deviation, or interquartile range of similarity scores.
|
|
110
|
+
• **Adaptive Chunk Sizes:** Unlike fixed-size chunking, semantic chunking results in chunks of varying lengths, as the splits are determined by semantic coherence.
|
|
111
|
+
|
|
112
|
+
By using these methods, you can create more semantically meaningful chunks, which can lead to improved performance in downstream applications like RAG by ensuring that relevant contextual information remains together.
|
|
113
|
+
|
|
114
|
+
AI responses may include mistakes.
|
|
115
|
+
|
|
116
|
+
---
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
# Semantic Chunking Library Design
|
|
2
|
+
|
|
3
|
+
## 1. Background & Motivation
|
|
4
|
+
|
|
5
|
+
Our existing `SmartChunker` performs a hybrid of sliding windows and heuristic boundary searches. When processing medium-sized code files the forward/backward scans can explode in CPU and memory, causing build failures. We also have no semantic awareness for other file types, leading to arbitrary splits. To make the knowledge-base pipeline reliable and extensible we want a modular chunking library that plugs cleanly into the Nancy Brain build as well as the new MCP-based RAG service powering the Slack bot. The same library will ship as a standalone package (working name `chunky`) so other indexing services can reuse it. We want that library to:
|
|
6
|
+
|
|
7
|
+
- Handles our common file types (Python, Markdown, JSON/YAML, plain text) with sensible defaults.
|
|
8
|
+
- Lets us plug in stronger semantic strategies (AST, embeddings) as optional enhancements.
|
|
9
|
+
- Keeps configuration centralized and easy to override via environment variables or config files.
|
|
10
|
+
- Produces consistent `Chunk` objects that slot directly into the indexing pipeline.
|
|
11
|
+
|
|
12
|
+
## 2. Goals & Non-Goals
|
|
13
|
+
|
|
14
|
+
### Goals
|
|
15
|
+
- Deterministic chunking for code and docs without pathological loops.
|
|
16
|
+
- Environment-driven configuration (e.g., tweak window sizes per build).
|
|
17
|
+
- Pipeline orchestration that picks the right chunker based on file metadata.
|
|
18
|
+
- Clear surface for future semantic/AST-based chunkers.
|
|
19
|
+
|
|
20
|
+
### Non-Goals
|
|
21
|
+
- Building a full AST parser for every language on day one.
|
|
22
|
+
- Re-implementing vector stores or summarization; we only prepare text for indexing/summarizing.
|
|
23
|
+
- Handling binary formats such as PDFs (they stay outside this module).
|
|
24
|
+
|
|
25
|
+
## 3. High-Level Architecture
|
|
26
|
+
|
|
27
|
+
```
|
|
28
|
+
chunky/
|
|
29
|
+
├── types.py # Chunk, Document, ChunkerConfig definitions
|
|
30
|
+
├── core.py # Chunker protocol, ChunkingError
|
|
31
|
+
├── chunkers/
|
|
32
|
+
│ ├── python.py # PythonSemanticChunker (AST-aware)
|
|
33
|
+
│ ├── markdown.py # MarkdownHeadingChunker
|
|
34
|
+
│ ├── yaml_json.py # JSONYamlChunker
|
|
35
|
+
│ ├── text.py # PlainTextChunker
|
|
36
|
+
│ └── fallback.py # SlidingWindowChunker
|
|
37
|
+
├── registry.py # ChunkerRegistry + DEFAULT_REGISTRY
|
|
38
|
+
├── loaders.py # DocumentLoader hierarchy
|
|
39
|
+
├── pipeline.py # ChunkPipeline orchestrator
|
|
40
|
+
└── utils.py # Shared helpers (token counting, environment hooks)
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
The code will live in the dedicated `chunky` package and be imported by Nancy Brain (and future MCP clients) like any other dependency. Keeping the chunker in its own package keeps agent scopes narrow and makes reuse easier.
|
|
44
|
+
|
|
45
|
+
## 4. Core Concepts
|
|
46
|
+
|
|
47
|
+
- **Document**: normalized representation of a file with (path, content, metadata, language).
|
|
48
|
+
- **Chunk**: dataclass with `chunk_id`, `text`, `metadata` (JSON-serializable), `source_document`.
|
|
49
|
+
- **Chunker**: object exposing `chunk(document, config) -> List[Chunk]`.
|
|
50
|
+
- **ChunkerRegistry**: resolves the appropriate chunker for a document (by extension, language, or explicit override).
|
|
51
|
+
- **ChunkPipeline**: orchestrates loading, chunking, and optional summarization hooks.
|
|
52
|
+
|
|
53
|
+
## 5. Chunker Implementations
|
|
54
|
+
|
|
55
|
+
Minimum viable set:
|
|
56
|
+
|
|
57
|
+
| Chunker | Description | Notes |
|
|
58
|
+
|---------|-------------|-------|
|
|
59
|
+
| `SlidingWindowChunker` | Simple fixed-line window with overlap | Always available; zero dependencies |
|
|
60
|
+
| `PythonSemanticChunker` | AST-based splitting on top-level functions/classes; falls back to window | Requires `ast` (built-in). Optionally `tree_sitter` later |
|
|
61
|
+
| `MarkdownHeadingChunker` | Breaks on heading hierarchy; merges small sections | No heavy deps |
|
|
62
|
+
| `JSONYamlChunker` | Treats top-level keys/arrays as chunks; flatten nested objects | Uses `json` / `yaml` |
|
|
63
|
+
| `PlainTextChunker` | Sentence/paragraph segmentation using regex or spaCy optional | Configurable sentence splitter |
|
|
64
|
+
| `SemanticEmbeddingChunker` (optional) | Embedding-based breakpoints (cosine drift) | Depends on configured embedding model; opt-in |
|
|
65
|
+
| `NotebookChunker` (via `nb4llm`) | Works with notebook-derived fenced text | Delegates heavy lifting to `nb4llm`; enforces Markdown/Python fence boundaries |
|
|
66
|
+
|
|
67
|
+
Each chunker adheres to the `Chunker` protocol and accepts a `ChunkerConfig`. The fallback chunker is always used last to guarantee progress.
|
|
68
|
+
|
|
69
|
+
## 6. Configuration Strategy
|
|
70
|
+
|
|
71
|
+
- `ChunkerConfig` stores generic knobs (`max_chars`, `max_tokens`, `code_window_lines`, `code_overlap_lines`, `semantic_model`, etc.).
|
|
72
|
+
- Defaults come from environment variables (`SMART_CHUNK_CODE_LINES`, `SMART_CHUNK_CODE_OVERLAP`, `SMART_CHUNK_TEXT_CHARS`, `SEMANTIC_MODEL`) or a YAML file (`semantic_chunker.yaml`). For MCP deployments we also respect `MCP_CHUNKER_CONFIG`, pointing to a remote-friendly YAML/JSON config path.
|
|
73
|
+
- The pipeline allows per-call overrides, e.g., `pipeline.chunk_file(path, config=ChunkerConfig(code_window_lines=60))`.
|
|
74
|
+
- All chunkers attach useful metadata (`line_start`, `line_end`, `language`, optional `semantic_score`) so MCP clients and Nancy's Slack responses can surface precise citations.
|
|
75
|
+
|
|
76
|
+
## 7. API Sketch
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from chunky.pipeline import ChunkPipeline
|
|
80
|
+
from chunky.types import ChunkerConfig
|
|
81
|
+
|
|
82
|
+
pipeline = ChunkPipeline() # uses DEFAULT_REGISTRY
|
|
83
|
+
|
|
84
|
+
chunks = pipeline.chunk_file(
|
|
85
|
+
path="knowledge_base/raw/general_tools/Dazzle/dazzle/dazzle.py",
|
|
86
|
+
config=ChunkerConfig(
|
|
87
|
+
code_window_lines=80,
|
|
88
|
+
code_overlap_lines=10,
|
|
89
|
+
semantic_model=None, # disable embedding-based splits
|
|
90
|
+
),
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
for chunk in chunks:
|
|
94
|
+
print(chunk.chunk_id, chunk.metadata["line_start"], chunk.metadata["line_end"])
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
Pipeline steps internally:
|
|
98
|
+
1. Load `Document` via registered loader.
|
|
99
|
+
2. Resolve chunker from registry (falls back to sliding window).
|
|
100
|
+
3. Invoke chunker with provided config.
|
|
101
|
+
4. Optionally post-process (e.g., summarization hook, metadata enrichment).
|
|
102
|
+
5. Return list of `Chunk` objects ready for indexing. Downstream consumers (Nancy Brain builders, MCP adapters, Slack bot citation tooling) rely on `chunk_id`, `source_document`, and line metadata to link answers back to source material.
|
|
103
|
+
|
|
104
|
+
## 8. Integration with KB Build
|
|
105
|
+
|
|
106
|
+
- Replace direct `SmartChunker` usage in `scripts/build_knowledge_base.py` with `ChunkPipeline`.
|
|
107
|
+
- All metadata stays JSON-serializable; pipeline returns chunks with ready metadata.
|
|
108
|
+
- Existing environment flags (`SKIP_PDF_PROCESSING`, `NB_PER_FILE_LOG`) remain untouched.
|
|
109
|
+
|
|
110
|
+
## 9. Extensibility Hooks
|
|
111
|
+
|
|
112
|
+
- `ChunkerRegistry.register(ext, chunker_cls)` to add new chunkers (e.g., notebook support).
|
|
113
|
+
- `ChunkPipeline` accepts custom registry or pre/post hooks (e.g., run summarizer on each chunk).
|
|
114
|
+
- Optional plugin entry points for projects to register chunkers via setuptools entry points.
|
|
115
|
+
|
|
116
|
+
## 10. Risks & Mitigations
|
|
117
|
+
|
|
118
|
+
| Risk | Mitigation |
|
|
119
|
+
|------|------------|
|
|
120
|
+
| AST parser errors on malformed code | Catch exceptions, fall back to sliding window |
|
|
121
|
+
| Semantic chunker slows builds | Make embedding-driven chunker opt-in; default to cheapest strategy |
|
|
122
|
+
| Dependency bloat | Accept heavier dependencies for now: the large libraries already ship with Nancy Brain, and build environments can be discarded after a build. The plan assumes we can pull in whatever libraries make the chunkers accurate and fast; if any dependency starts to feel gratuitous later, we can revisit. |
|
|
123
|
+
| Inconsistent metadata | Centralize metadata construction utilities; reuse JSON serialization helpers |
|
|
124
|
+
|
|
125
|
+
## 11. Implementation Plan
|
|
126
|
+
|
|
127
|
+
1. **Phase 1 – Infrastructure**
|
|
128
|
+
- Define `Chunk`, `Document`, `ChunkerConfig`.
|
|
129
|
+
- Implement `SlidingWindowChunker` and registry/pipeline scaffold.
|
|
130
|
+
- Swap KB build to use pipeline with sliding window (parity with current behavior).
|
|
131
|
+
|
|
132
|
+
2. **Phase 2 – Language-specific chunkers**
|
|
133
|
+
- Implement `PythonSemanticChunker`, `MarkdownHeadingChunker`, `JSONYamlChunker`, `PlainTextChunker`.
|
|
134
|
+
- Add tests covering line ranges, metadata, and fallback behavior.
|
|
135
|
+
|
|
136
|
+
3. **Phase 3 – Semantic chunking (optional)**
|
|
137
|
+
- Prototype embedding-based chunker using existing sentence-transformer models.
|
|
138
|
+
- Benchmark build impact; keep behind feature flag.
|
|
139
|
+
|
|
140
|
+
4. **Phase 4 – Documentation & Adoption**
|
|
141
|
+
- Document env vars/config file usage.
|
|
142
|
+
- Update KB pipeline guide in README/docs.
|
|
143
|
+
- Gather feedback, iterate on defaults.
|
|
144
|
+
|
|
145
|
+
## 12. Testing Strategy
|
|
146
|
+
|
|
147
|
+
- Unit tests per chunker verifying chunk counts, metadata integrity, and fallback logic.
|
|
148
|
+
- Golden-file tests comparing chunk outputs for representative code/docs.
|
|
149
|
+
- Integration test running pipeline on sample repo (like Dazzle) ensuring no timeouts or memory blowups.
|
|
150
|
+
- Benchmark harness to track runtime vs. file size.
|
|
151
|
+
|
|
152
|
+
## 13. Open Questions
|
|
153
|
+
|
|
154
|
+
- Do we need per-language registries (e.g., `.py` vs `.pyi`)?
|
|
155
|
+
- Should summarization integrate directly into pipeline or remain in KB build script?
|
|
156
|
+
- How aggressively should we cache chunk results (hash by content) to avoid recomputation when files don’t change?
|
|
157
|
+
- How do we expose chunk metadata in downstream tools (UI, Slack bot) for debugging?
|
|
158
|
+
|
|
159
|
+
---
|
|
160
|
+
|
|
161
|
+
**Next Steps:**
|
|
162
|
+
- Finalize `ChunkerConfig` shape and default values.
|
|
163
|
+
- Implement Phase 1 (sliding window + pipeline scaffolding).
|
|
164
|
+
- Write tests ensuring no regression against current KB build.
|
|
165
|
+
- Incrementally add higher-level chunkers in Phase 2.
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
.. chunky documentation master file
|
|
2
|
+
|
|
3
|
+
Welcome to Chunky's documentation!
|
|
4
|
+
==================================
|
|
5
|
+
|
|
6
|
+
Chunky provides modular chunking primitives tailored for heterogeneous scientific repositories.
|
|
7
|
+
It is designed to serve both the Nancy Brain knowledge-base pipeline and any external RAG
|
|
8
|
+
pipelines that need deterministic, metadata-rich chunks.
|
|
9
|
+
|
|
10
|
+
.. toctree::
|
|
11
|
+
:maxdepth: 2
|
|
12
|
+
:caption: Contents
|
|
13
|
+
|
|
14
|
+
overview
|
|
15
|
+
api
|
|
16
|
+
|
|
17
|
+
Indices and tables
|
|
18
|
+
==================
|
|
19
|
+
|
|
20
|
+
* :ref:`genindex`
|
|
21
|
+
* :ref:`modindex`
|
|
22
|
+
* :ref:`search`
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
Overview
|
|
2
|
+
========
|
|
3
|
+
|
|
4
|
+
Chunky exposes a modular pipeline for converting heterogeneous project artefacts into
|
|
5
|
+
well-behaved text chunks. The pipeline is language-aware, pluggable, and ready for
|
|
6
|
+
Nancy Brain's MCP-backed retrieval workflows.
|
|
7
|
+
|
|
8
|
+
.. note::
|
|
9
|
+
The implementation is in active development. See ``SEMANTIC_CHUNKER.md`` for the full
|
|
10
|
+
design document and roadmap.
|
|
11
|
+
|
|
12
|
+
Getting Started
|
|
13
|
+
---------------
|
|
14
|
+
|
|
15
|
+
Install the package from source:
|
|
16
|
+
|
|
17
|
+
.. code-block:: bash
|
|
18
|
+
|
|
19
|
+
git clone https://github.com/AmberLee2427/chunky.git
|
|
20
|
+
cd chunky
|
|
21
|
+
pip install .
|
|
22
|
+
|
|
23
|
+
For development work and documentation builds:
|
|
24
|
+
|
|
25
|
+
.. code-block:: bash
|
|
26
|
+
|
|
27
|
+
pip install -e ".[dev,docs]"
|
|
28
|
+
|
|
29
|
+
First chunks via the pipeline:
|
|
30
|
+
|
|
31
|
+
.. code-block:: python
|
|
32
|
+
|
|
33
|
+
from pathlib import Path
|
|
34
|
+
|
|
35
|
+
from chunky import ChunkPipeline, ChunkerConfig
|
|
36
|
+
|
|
37
|
+
pipeline = ChunkPipeline()
|
|
38
|
+
config = ChunkerConfig(lines_per_chunk=80, line_overlap=10)
|
|
39
|
+
chunks = pipeline.chunk_file(Path("/path/to/file.py"), config=config)
|
|
40
|
+
|
|
41
|
+
for chunk in chunks:
|
|
42
|
+
print(chunk.chunk_id, chunk.metadata["line_start"], chunk.metadata["line_end"])
|
|
43
|
+
|
|
44
|
+
Roadmap
|
|
45
|
+
-------
|
|
46
|
+
|
|
47
|
+
* Phase 1: infrastructure scaffolding and sliding-window baseline.
|
|
48
|
+
* Phase 2: language-specific chunkers (Python, Markdown, JSON/YAML, notebooks).
|
|
49
|
+
* Phase 3: semantic/embedding-driven chunking.
|
|
50
|
+
* Phase 4: documentation, benchmarks, and Nancy Brain integration.
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.25"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "chunky-files"
|
|
7
|
+
version = "0.2.0"
|
|
8
|
+
description = "Semantic chunking utilities for scientific code and documentation corpora."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
authors = [
|
|
11
|
+
{ name = "Nancy Brain Contributors" }
|
|
12
|
+
]
|
|
13
|
+
license = { file = "LICENSE" }
|
|
14
|
+
requires-python = ">=3.8"
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 3 - Alpha",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"Intended Audience :: Science/Research",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.8",
|
|
23
|
+
"Programming Language :: Python :: 3.9",
|
|
24
|
+
"Programming Language :: Python :: 3.10",
|
|
25
|
+
"Programming Language :: Python :: 3.11",
|
|
26
|
+
"Programming Language :: Python :: 3.12",
|
|
27
|
+
"Topic :: Scientific/Engineering",
|
|
28
|
+
"Topic :: Text Processing :: Linguistic",
|
|
29
|
+
]
|
|
30
|
+
dependencies = []
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
dev = [
|
|
34
|
+
"pytest>=7",
|
|
35
|
+
"pytest-cov>=4",
|
|
36
|
+
"coverage[toml]>=7",
|
|
37
|
+
"ruff>=0.6",
|
|
38
|
+
"bump-my-version>=0.6",
|
|
39
|
+
"build"
|
|
40
|
+
]
|
|
41
|
+
docs = [
|
|
42
|
+
"sphinx>=7",
|
|
43
|
+
"myst-parser>=2",
|
|
44
|
+
"furo>=2024.0.0",
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
[project.urls]
|
|
48
|
+
Home = "https://github.com/AmberLee2427/chunky"
|
|
49
|
+
Documentation = "https://chunky.readthedocs.io/"
|
|
50
|
+
Issues = "https://github.com/AmberLee2427/chunky/issues"
|
|
51
|
+
|
|
52
|
+
[tool.hatch.version]
|
|
53
|
+
path = "src/chunky/__about__.py"
|
|
54
|
+
|
|
55
|
+
[tool.hatch.build.targets.sdist]
|
|
56
|
+
include = [
|
|
57
|
+
"src/**",
|
|
58
|
+
"docs/**",
|
|
59
|
+
"README.md",
|
|
60
|
+
"LICENSE",
|
|
61
|
+
"CHANGELOG.md",
|
|
62
|
+
"pyproject.toml",
|
|
63
|
+
]
|
|
64
|
+
|
|
65
|
+
[tool.hatch.build.targets.wheel]
|
|
66
|
+
packages = ["src/chunky"]
|
|
67
|
+
|
|
68
|
+
[tool.hatch.envs.default]
|
|
69
|
+
dependencies = [
|
|
70
|
+
"pytest",
|
|
71
|
+
]
|
|
72
|
+
|
|
73
|
+
[tool.pytest.ini_options]
|
|
74
|
+
minversion = "7.0"
|
|
75
|
+
addopts = "-ra --showlocals --strict-markers --strict-config"
|
|
76
|
+
testpaths = [
|
|
77
|
+
"tests",
|
|
78
|
+
]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
[tool.ruff]
|
|
82
|
+
line-length = 100
|
|
83
|
+
src = ["src", "tests"]
|
|
84
|
+
|
|
85
|
+
[tool.ruff.lint]
|
|
86
|
+
select = ["E", "F", "I", "B"]
|
|
87
|
+
ignore = ["E203"]
|
|
88
|
+
|
|
89
|
+
[tool.ruff.format]
|
|
90
|
+
quote-style = "double"
|
|
91
|
+
indent-style = "space"
|
|
92
|
+
|
|
93
|
+
[tool.bumpversion]
|
|
94
|
+
current_version = "0.2.0"
|
|
95
|
+
commit = true
|
|
96
|
+
message = "chore: bump version to {new_version}"
|
|
97
|
+
|
|
98
|
+
[[tool.bumpversion.files]]
|
|
99
|
+
filename = "pyproject.toml"
|
|
100
|
+
search = "version = \"{current_version}\""
|
|
101
|
+
replace = "version = \"{new_version}\""
|
|
102
|
+
|
|
103
|
+
[[tool.bumpversion.files]]
|
|
104
|
+
filename = "src/chunky/__about__.py"
|
|
105
|
+
search = "__version__ = \"{current_version}\""
|
|
106
|
+
replace = "__version__ = \"{new_version}\""
|
|
107
|
+
|
|
108
|
+
[[tool.bumpversion.files]]
|
|
109
|
+
filename = "CHANGELOG.md"
|
|
110
|
+
search = "## [Unreleased]"
|
|
111
|
+
replace = "## [Unreleased]\n\n## [{new_version}] - TBD"
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Chunky: semantic chunking utilities for heterogeneous repositories."""
|
|
2
|
+
|
|
3
|
+
from .__about__ import __version__
|
|
4
|
+
from .pipeline import ChunkPipeline
|
|
5
|
+
from .types import Chunk, ChunkerConfig, Document
|
|
6
|
+
|
|
7
|
+
__all__ = [
|
|
8
|
+
"__version__",
|
|
9
|
+
"ChunkPipeline",
|
|
10
|
+
"Chunk",
|
|
11
|
+
"ChunkerConfig",
|
|
12
|
+
"Document",
|
|
13
|
+
]
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
"""Sliding window fallback chunker."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
from ..core import Chunker
|
|
8
|
+
from ..types import Chunk, ChunkerConfig, Document
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SlidingWindowChunker(Chunker):
    """Chunker that produces fixed-size line windows with optional overlap.

    Universal fallback strategy: requires no parser and works on any text.
    Each chunk covers ``config.lines_per_chunk`` lines and consecutive
    chunks share ``config.line_overlap`` lines (both clamped to sane
    values).  Metadata line numbers are 1-indexed and inclusive; span
    offsets are 0-indexed character positions into ``document.content``.
    """

    def chunk(self, document: Document, config: ChunkerConfig) -> list[Chunk]:
        """Split ``document`` into overlapping line-window chunks.

        Always returns at least one chunk (an empty one for empty content)
        so downstream indexing has something to reference.
        """
        lines = document.content.splitlines()
        window = config.clamp_lines(config.lines_per_chunk)
        overlap = config.clamp_overlap(config.line_overlap, window)

        # Empty document: emit a single empty chunk with zeroed line/span
        # metadata rather than returning no chunks at all.
        if not lines:
            chunk_id = self._build_chunk_id(document.path, 0)
            return [
                Chunk(
                    chunk_id=chunk_id,
                    text="",
                    source_document=document.path,
                    metadata=self._chunk_metadata(
                        chunk_index=0,
                        line_start=0,
                        line_end=0,
                        span_start=0,
                        span_end=0,
                        config=config,
                    ),
                )
            ]

        chunks: list[Chunk] = []
        line_count = len(lines)
        # Pre-compute character offsets once to avoid quadratic scans.
        # NOTE(review): offsets assume a single "\n" between lines, but
        # splitlines() also splits on "\r\n" and unicode line breaks, so
        # spans can drift on non-"\n" input -- confirm inputs are normalized.
        line_starts: list[int] = []
        line_ends: list[int] = []
        cursor = 0
        for idx, line in enumerate(lines):
            if idx > 0:
                cursor += 1  # newline preceding this line
            line_starts.append(cursor)
            cursor += len(line)
            line_ends.append(cursor)

        start_line = 0
        chunk_index = 0

        while start_line < line_count:
            previous_start = start_line
            end_line = min(start_line + window, line_count)  # exclusive 0-index
            text = "\n".join(lines[start_line:end_line])
            chunk_id = self._build_chunk_id(document.path, chunk_index)
            metadata = self._chunk_metadata(
                chunk_index=chunk_index,
                line_start=start_line + 1,  # 1-indexed for citations
                line_end=end_line,  # exclusive 0-index == inclusive 1-index
                span_start=line_starts[start_line],
                span_end=line_ends[end_line - 1],
                config=config,
            )

            chunks.append(
                Chunk(
                    chunk_id=chunk_id,
                    text=text,
                    source_document=document.path,
                    metadata=metadata,
                )
            )

            chunk_index += 1
            # max_chunks of None (or 0) means unlimited.
            if config.max_chunks and chunk_index >= config.max_chunks:
                break

            if end_line >= line_count:
                break

            # Step back by the overlap, but force forward progress when the
            # overlap would otherwise keep the window in place.
            next_start = end_line - overlap
            if next_start <= previous_start:
                next_start = end_line
            start_line = next_start

        return chunks

    @staticmethod
    def _build_chunk_id(path: Path, index: int) -> str:
        """Return a stable chunk identifier of the form ``<path>::chunk-<n>``."""
        return f"{path}::chunk-{index}"

    @staticmethod
    def _chunk_metadata(
        chunk_index: int,
        line_start: int,
        line_end: int,
        span_start: int,
        span_end: int,
        config: ChunkerConfig,
    ) -> dict[str, int | str]:
        """Build the per-chunk metadata dict.

        Entries from ``config.metadata`` are merged in last, so they win on
        key clashes with the computed positional fields.
        """
        metadata: dict[str, int | str] = {
            "chunk_index": chunk_index,
            "line_start": line_start,
            "line_end": line_end,
            "span_start": span_start,
            "span_end": span_end,
        }
        if config.metadata:
            metadata.update(config.metadata)
        return metadata
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Core interfaces and exceptions for chunkers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Protocol, runtime_checkable
|
|
6
|
+
|
|
7
|
+
from .types import Chunk, ChunkerConfig, Document
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ChunkingError(RuntimeError):
    """Raised when a chunker cannot process the provided document.

    Lets callers distinguish chunking failures from other runtime errors
    and decide how to recover (e.g. fall back to a simpler chunker).
    """
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@runtime_checkable
class Chunker(Protocol):
    """Protocol implemented by all chunkers.

    ``runtime_checkable`` allows ``isinstance(obj, Chunker)`` checks; note
    such checks only verify a ``chunk`` attribute exists, not its signature.
    """

    def chunk(self, document: Document, config: ChunkerConfig) -> list[Chunk]:
        """Return a list of chunks for the given document."""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
__all__ = ["ChunkingError", "Chunker"]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Document loaders."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Protocol
|
|
7
|
+
|
|
8
|
+
from .types import Document
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DocumentLoader(Protocol):
    """Protocol for converting files into :class:`Document` instances."""

    def load(self, path: Path) -> Document:
        """Read the file at *path* and return a normalized :class:`Document`."""
        ...
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class FileSystemLoader:
    """Default loader: reads a file from disk as UTF-8 text.

    Raises ``UnicodeDecodeError`` for content that is not valid UTF-8 and
    the usual ``OSError`` family for missing/unreadable files.
    """

    def load(self, path: Path) -> Document:
        """Return a :class:`Document` wrapping the UTF-8 text at *path*."""
        return Document(path=path, content=path.read_text(encoding="utf-8"))
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Shared loader instance used by ChunkPipeline when no loader is injected.
DEFAULT_LOADER = FileSystemLoader()


__all__ = ["DocumentLoader", "FileSystemLoader", "DEFAULT_LOADER"]
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""High-level orchestration for chunking documents."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
from .chunkers import SlidingWindowChunker
|
|
9
|
+
from .loaders import DEFAULT_LOADER, DocumentLoader
|
|
10
|
+
from .registry import DEFAULT_REGISTRY, ChunkerRegistry
|
|
11
|
+
from .types import Chunk, ChunkerConfig, Document
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ChunkPipeline:
    """Pipeline that loads documents and routes them to registered chunkers.

    On construction, a sliding-window fallback is installed whenever the
    registry cannot already resolve an unknown extension, so chunking is
    guaranteed to make progress on any file type.
    """

    def __init__(
        self,
        *,
        registry: Optional[ChunkerRegistry] = None,
        loader: Optional[DocumentLoader] = None,
    ) -> None:
        # Fall back to the shared module-level defaults when not injected.
        self.registry = registry if registry is not None else DEFAULT_REGISTRY
        self.loader = loader if loader is not None else DEFAULT_LOADER
        self._ensure_fallback()

    def chunk_file(
        self,
        path: Path | str,
        *,
        config: Optional[ChunkerConfig] = None,
    ) -> list[Chunk]:
        """Load the file at *path* from disk and return its chunks."""
        effective = config if config is not None else ChunkerConfig()
        document = self.loader.load(Path(path))
        return self.registry.get(document.path).chunk(document, effective)

    def chunk_documents(
        self,
        documents: list[Document],
        *,
        config: Optional[ChunkerConfig] = None,
    ) -> list[Chunk]:
        """Chunk pre-loaded documents, concatenating results in input order."""
        effective = config if config is not None else ChunkerConfig()
        results: list[Chunk] = []
        for document in documents:
            results += self.registry.get(document.path).chunk(document, effective)
        return results

    def _ensure_fallback(self) -> None:
        """Install a sliding-window fallback if the registry has none."""
        # Probe with an extension-less path: a KeyError means neither a
        # matching chunker nor a fallback is currently configured.
        try:
            self.registry.get(Path("__dummy__"))
        except KeyError:
            self.registry.set_fallback(SlidingWindowChunker())
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Chunker registry responsible for resolving the appropriate implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Iterable, MutableMapping
|
|
7
|
+
|
|
8
|
+
from .core import Chunker
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ChunkerRegistry:
    """Runtime registry mapping file extensions to chunkers.

    Extensions are matched case-insensitively and without leading dots, so
    registering ``".PY"`` and looking up ``Path("mod.py")`` both resolve to
    the key ``"py"``.  An optional fallback chunker serves every extension
    with no explicit registration.
    """

    def __init__(self) -> None:
        self._registry: MutableMapping[str, Chunker] = {}
        self._fallback: Chunker | None = None

    def register(
        self,
        extensions: Iterable[str] | str,
        chunker: Chunker,
        *,
        overwrite: bool = False,
    ) -> None:
        """Register *chunker* for one or more extensions.

        Raises ``ValueError`` on a duplicate registration unless
        ``overwrite=True`` is passed.
        """
        exts = (extensions,) if isinstance(extensions, str) else tuple(extensions)
        for raw in exts:
            key = self._normalize(raw)
            if key in self._registry and not overwrite:
                raise ValueError(f"Chunker already registered for extension '{raw}'")
            self._registry[key] = chunker

    def set_fallback(self, chunker: Chunker) -> None:
        """Install the chunker used when no extension matches."""
        self._fallback = chunker

    def get(self, path: Path) -> Chunker:
        """Resolve the chunker for *path*'s extension, or the fallback.

        Raises ``KeyError`` when nothing matches and no fallback is set.
        """
        suffix = self._normalize(path.suffix or "")
        try:
            return self._registry[suffix]
        except KeyError:
            if self._fallback is None:
                raise KeyError(
                    f"No chunker registered for {suffix!r} and no fallback configured"
                ) from None
            return self._fallback

    @staticmethod
    def _normalize(extension: str) -> str:
        """Lower-case and strip leading dots so ``.PY`` and ``py`` match."""
        return extension.lower().lstrip(".")
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Module-level registry shared by every ChunkPipeline that is not given an
# explicit registry.  Note that registrations on it are process-global.
DEFAULT_REGISTRY = ChunkerRegistry()


__all__ = ["ChunkerRegistry", "DEFAULT_REGISTRY"]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Core data structures for the semantic chunking pipeline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Dict, Optional
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class Document:
    """Normalized representation of a file to be chunked."""

    path: Path  # origin of the content; its suffix drives chunker resolution
    content: str  # full text of the file
    language: Optional[str] = None  # optional language tag; None when unknown
    metadata: Dict[str, Any] = field(default_factory=dict)  # free-form extras
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class Chunk:
    """A chunk of text ready for downstream indexing."""

    chunk_id: str  # stable identifier, e.g. "<path>::chunk-<index>"
    text: str  # chunk contents
    source_document: Path  # path of the document this chunk came from
    metadata: Dict[str, Any] = field(default_factory=dict)  # e.g. chunk_index, line_start/line_end, span offsets
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class ChunkerConfig:
    """Configuration knobs shared across chunkers."""

    max_chars: int = 2000  # per-chunk character budget (not consumed by the sliding-window fallback)
    lines_per_chunk: int = 120  # window height for line-based chunkers
    line_overlap: int = 20  # lines shared between consecutive windows
    max_chunks: Optional[int] = None  # cap on emitted chunks; None/0 means unlimited
    metadata: Dict[str, Any] = field(default_factory=dict)  # extra entries merged into every chunk's metadata

    def clamp_lines(self, lines: int) -> int:
        """Clamp the requested line count to at least one line."""
        return lines if lines > 1 else 1

    def clamp_overlap(self, overlap: int, window: int) -> int:
        """Clamp *overlap* into ``[0, window - 1]`` so windows always advance."""
        ceiling = window - 1 if window > 1 else 0
        non_negative = overlap if overlap > 0 else 0
        return non_negative if non_negative < ceiling else ceiling
|