cenote-core 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cenote_core-0.1.0/.claude/settings.json +69 -0
- cenote_core-0.1.0/.env.example +3 -0
- cenote_core-0.1.0/.github/dependabot.yml +18 -0
- cenote_core-0.1.0/.github/workflows/ci.yml +129 -0
- cenote_core-0.1.0/.github/workflows/docs.yml +39 -0
- cenote_core-0.1.0/.github/workflows/release.yml +35 -0
- cenote_core-0.1.0/.gitignore +62 -0
- cenote_core-0.1.0/.markdownlint.json +7 -0
- cenote_core-0.1.0/.pre-commit-config.yaml +26 -0
- cenote_core-0.1.0/CHANGELOG.md +132 -0
- cenote_core-0.1.0/CLAUDE.md +145 -0
- cenote_core-0.1.0/CONTRIBUTING.md +84 -0
- cenote_core-0.1.0/LICENSE +202 -0
- cenote_core-0.1.0/PKG-INFO +170 -0
- cenote_core-0.1.0/README.md +144 -0
- cenote_core-0.1.0/SECURITY.md +20 -0
- cenote_core-0.1.0/demos/__init__.py +0 -0
- cenote_core-0.1.0/demos/data/wikipedia_snippets.json +102 -0
- cenote_core-0.1.0/demos/quickstart.py +91 -0
- cenote_core-0.1.0/docker-compose.test.yml +14 -0
- cenote_core-0.1.0/docs/00-first-milestone.md +410 -0
- cenote_core-0.1.0/docs/01-claude-code-prompts.md +266 -0
- cenote_core-0.1.0/docs/diagrams/01-ecosystem.drawio +88 -0
- cenote_core-0.1.0/docs/diagrams/02-architecture.drawio +178 -0
- cenote_core-0.1.0/docs/diagrams/03-runtime-flow.drawio +235 -0
- cenote_core-0.1.0/docs/diagrams/README.md +19 -0
- cenote_core-0.1.0/docs/site/api/chunkers.md +5 -0
- cenote_core-0.1.0/docs/site/api/embedders.md +15 -0
- cenote_core-0.1.0/docs/site/api/eval.md +6 -0
- cenote_core-0.1.0/docs/site/api/models.md +6 -0
- cenote_core-0.1.0/docs/site/api/retrievers.md +5 -0
- cenote_core-0.1.0/docs/site/api/stores.md +7 -0
- cenote_core-0.1.0/docs/site/architecture.md +36 -0
- cenote_core-0.1.0/docs/site/changelog.md +3 -0
- cenote_core-0.1.0/docs/site/extending/custom-chunker.md +38 -0
- cenote_core-0.1.0/docs/site/extending/custom-embedder.md +60 -0
- cenote_core-0.1.0/docs/site/index.md +28 -0
- cenote_core-0.1.0/docs/site/quickstart.md +84 -0
- cenote_core-0.1.0/examples/README.md +35 -0
- cenote_core-0.1.0/examples/custom_embedder.py +108 -0
- cenote_core-0.1.0/examples/pgvector_setup.py +103 -0
- cenote_core-0.1.0/mkdocs.yml +64 -0
- cenote_core-0.1.0/pyproject.toml +97 -0
- cenote_core-0.1.0/src/cenote/__init__.py +35 -0
- cenote_core-0.1.0/src/cenote/chunkers/__init__.py +7 -0
- cenote_core-0.1.0/src/cenote/chunkers/base.py +29 -0
- cenote_core-0.1.0/src/cenote/chunkers/recursive.py +103 -0
- cenote_core-0.1.0/src/cenote/embedders/__init__.py +18 -0
- cenote_core-0.1.0/src/cenote/embedders/_http.py +101 -0
- cenote_core-0.1.0/src/cenote/embedders/base.py +25 -0
- cenote_core-0.1.0/src/cenote/embedders/cache.py +96 -0
- cenote_core-0.1.0/src/cenote/embedders/cohere.py +137 -0
- cenote_core-0.1.0/src/cenote/embedders/mock.py +57 -0
- cenote_core-0.1.0/src/cenote/embedders/voyage.py +136 -0
- cenote_core-0.1.0/src/cenote/errors.py +32 -0
- cenote_core-0.1.0/src/cenote/eval/__init__.py +6 -0
- cenote_core-0.1.0/src/cenote/eval/metrics.py +48 -0
- cenote_core-0.1.0/src/cenote/models.py +56 -0
- cenote_core-0.1.0/src/cenote/observability/__init__.py +6 -0
- cenote_core-0.1.0/src/cenote/observability/base.py +31 -0
- cenote_core-0.1.0/src/cenote/py.typed +0 -0
- cenote_core-0.1.0/src/cenote/rerankers/__init__.py +6 -0
- cenote_core-0.1.0/src/cenote/rerankers/base.py +26 -0
- cenote_core-0.1.0/src/cenote/retrievers/__init__.py +7 -0
- cenote_core-0.1.0/src/cenote/retrievers/base.py +20 -0
- cenote_core-0.1.0/src/cenote/retrievers/vector.py +36 -0
- cenote_core-0.1.0/src/cenote/stores/__init__.py +8 -0
- cenote_core-0.1.0/src/cenote/stores/base.py +31 -0
- cenote_core-0.1.0/src/cenote/stores/memory.py +91 -0
- cenote_core-0.1.0/src/cenote/stores/pgvector.py +246 -0
- cenote_core-0.1.0/src/cenote/stores/pgvector_migrations/001_init.sql +36 -0
- cenote_core-0.1.0/src/cenote/stores/pgvector_migrations/002_namespace_id_pk.sql +10 -0
- cenote_core-0.1.0/src/cenote/stores/pgvector_migrations/__init__.py +0 -0
- cenote_core-0.1.0/src/cenote/types.py +7 -0
- cenote_core-0.1.0/tests/__init__.py +0 -0
- cenote_core-0.1.0/tests/chunkers/__init__.py +0 -0
- cenote_core-0.1.0/tests/chunkers/test_recursive.py +99 -0
- cenote_core-0.1.0/tests/conftest.py +1 -0
- cenote_core-0.1.0/tests/demos/__init__.py +0 -0
- cenote_core-0.1.0/tests/demos/test_quickstart_smoke.py +12 -0
- cenote_core-0.1.0/tests/embedders/__init__.py +0 -0
- cenote_core-0.1.0/tests/embedders/test_cache.py +121 -0
- cenote_core-0.1.0/tests/embedders/test_cohere.py +112 -0
- cenote_core-0.1.0/tests/embedders/test_http.py +50 -0
- cenote_core-0.1.0/tests/embedders/test_mock.py +92 -0
- cenote_core-0.1.0/tests/embedders/test_voyage.py +199 -0
- cenote_core-0.1.0/tests/eval/__init__.py +0 -0
- cenote_core-0.1.0/tests/eval/test_metrics.py +84 -0
- cenote_core-0.1.0/tests/integration/__init__.py +0 -0
- cenote_core-0.1.0/tests/integration/test_pgvector.py +141 -0
- cenote_core-0.1.0/tests/observability/__init__.py +0 -0
- cenote_core-0.1.0/tests/observability/test_base.py +14 -0
- cenote_core-0.1.0/tests/rerankers/__init__.py +0 -0
- cenote_core-0.1.0/tests/rerankers/test_base.py +9 -0
- cenote_core-0.1.0/tests/retrievers/__init__.py +0 -0
- cenote_core-0.1.0/tests/retrievers/test_vector.py +80 -0
- cenote_core-0.1.0/tests/stores/__init__.py +0 -0
- cenote_core-0.1.0/tests/stores/test_memory.py +139 -0
- cenote_core-0.1.0/tests/stores/test_pgvector_helpers.py +87 -0
- cenote_core-0.1.0/tests/test_errors.py +38 -0
- cenote_core-0.1.0/tests/test_logging_smoke.py +48 -0
- cenote_core-0.1.0/tests/test_models.py +152 -0
- cenote_core-0.1.0/tests/test_types.py +26 -0
- cenote_core-0.1.0/uv.lock +1309 -0
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
{
|
|
2
|
+
"permissions": {
|
|
3
|
+
"defaultMode": "default",
|
|
4
|
+
"allow": [
|
|
5
|
+
"Read",
|
|
6
|
+
"Bash(uv sync*)",
|
|
7
|
+
"Bash(uv run pytest*)",
|
|
8
|
+
"Bash(uv run ruff*)",
|
|
9
|
+
"Bash(uv run mypy*)",
|
|
10
|
+
"Bash(uv run pre-commit*)",
|
|
11
|
+
"Bash(uv run python*)",
|
|
12
|
+
"Bash(uv build*)",
|
|
13
|
+
"Bash(uv lock*)",
|
|
14
|
+
"Bash(uv tree*)",
|
|
15
|
+
"Bash(uv pip list*)",
|
|
16
|
+
"Bash(ls *)",
|
|
17
|
+
"Bash(cat *)",
|
|
18
|
+
"Bash(head *)",
|
|
19
|
+
"Bash(tail *)",
|
|
20
|
+
"Bash(grep *)",
|
|
21
|
+
"Bash(rg *)",
|
|
22
|
+
"Bash(find *)",
|
|
23
|
+
"Bash(tree*)",
|
|
24
|
+
"Bash(wc *)",
|
|
25
|
+
"Bash(pwd)",
|
|
26
|
+
"Bash(mkdir -p *)",
|
|
27
|
+
"Bash(git status*)",
|
|
28
|
+
"Bash(git diff*)",
|
|
29
|
+
"Bash(git log*)",
|
|
30
|
+
"Bash(git branch*)",
|
|
31
|
+
"Bash(git show*)",
|
|
32
|
+
"Bash(git add *)",
|
|
33
|
+
"Bash(git restore *)",
|
|
34
|
+
"Bash(git stash*)",
|
|
35
|
+
"Bash(docker compose -f docker-compose.test.yml ps*)",
|
|
36
|
+
"Bash(docker compose -f docker-compose.test.yml logs*)"
|
|
37
|
+
],
|
|
38
|
+
"ask": [
|
|
39
|
+
"Write",
|
|
40
|
+
"Edit",
|
|
41
|
+
"Bash(uv add*)",
|
|
42
|
+
"Bash(uv remove*)",
|
|
43
|
+
"Bash(git checkout *)",
|
|
44
|
+
"Bash(git commit *)",
|
|
45
|
+
"Bash(git merge *)",
|
|
46
|
+
"Bash(git rebase *)",
|
|
47
|
+
"Bash(git reset *)",
|
|
48
|
+
"Bash(docker compose -f docker-compose.test.yml up*)",
|
|
49
|
+
"Bash(docker compose -f docker-compose.test.yml down*)",
|
|
50
|
+
"Bash(docker compose -f docker-compose.test.yml run*)"
|
|
51
|
+
],
|
|
52
|
+
"deny": [
|
|
53
|
+
"Bash(rm -rf *)",
|
|
54
|
+
"Bash(rm -fr *)",
|
|
55
|
+
"Bash(git push*)",
|
|
56
|
+
"Bash(git push --force*)",
|
|
57
|
+
"Bash(git push -f*)",
|
|
58
|
+
"Bash(git reset --hard*)",
|
|
59
|
+
"Bash(git clean -fd*)",
|
|
60
|
+
"Bash(gh pr merge*)",
|
|
61
|
+
"Bash(gh repo delete*)",
|
|
62
|
+
"Bash(uv publish*)",
|
|
63
|
+
"Bash(npm publish*)",
|
|
64
|
+
"Bash(curl *)",
|
|
65
|
+
"Bash(wget *)",
|
|
66
|
+
"WebFetch"
|
|
67
|
+
]
|
|
68
|
+
}
|
|
69
|
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
version: 2
|
|
2
|
+
updates:
|
|
3
|
+
- package-ecosystem: "uv"
|
|
4
|
+
directory: "/"
|
|
5
|
+
schedule:
|
|
6
|
+
interval: "monthly"
|
|
7
|
+
open-pull-requests-limit: 3
|
|
8
|
+
groups:
|
|
9
|
+
python-dependencies:
|
|
10
|
+
patterns: ["*"]
|
|
11
|
+
- package-ecosystem: "github-actions"
|
|
12
|
+
directory: "/"
|
|
13
|
+
schedule:
|
|
14
|
+
interval: "monthly"
|
|
15
|
+
open-pull-requests-limit: 3
|
|
16
|
+
groups:
|
|
17
|
+
github-actions:
|
|
18
|
+
patterns: ["*"]
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
branches: [main]
|
|
6
|
+
push:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
concurrency:
|
|
10
|
+
group: ${{ github.workflow }}-${{ github.ref }}
|
|
11
|
+
cancel-in-progress: true
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
lint-and-type:
|
|
15
|
+
runs-on: ubuntu-latest
|
|
16
|
+
strategy:
|
|
17
|
+
fail-fast: false
|
|
18
|
+
matrix:
|
|
19
|
+
python-version: ["3.12", "3.13"]
|
|
20
|
+
steps:
|
|
21
|
+
- uses: actions/checkout@v6
|
|
22
|
+
- name: Install uv
|
|
23
|
+
uses: astral-sh/setup-uv@v7
|
|
24
|
+
with:
|
|
25
|
+
enable-cache: true
|
|
26
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
27
|
+
run: uv python install ${{ matrix.python-version }}
|
|
28
|
+
- name: Install deps
|
|
29
|
+
run: uv sync --all-extras
|
|
30
|
+
- name: Ruff check
|
|
31
|
+
run: uv run ruff check .
|
|
32
|
+
- name: Ruff format check
|
|
33
|
+
run: uv run ruff format --check .
|
|
34
|
+
- name: Mypy
|
|
35
|
+
run: uv run mypy src/
|
|
36
|
+
|
|
37
|
+
security-audit:
|
|
38
|
+
runs-on: ubuntu-latest
|
|
39
|
+
steps:
|
|
40
|
+
- uses: actions/checkout@v6
|
|
41
|
+
- name: Install uv
|
|
42
|
+
uses: astral-sh/setup-uv@v7
|
|
43
|
+
with:
|
|
44
|
+
enable-cache: true
|
|
45
|
+
- name: Set up Python
|
|
46
|
+
run: uv python install 3.12
|
|
47
|
+
- name: Install deps (locked)
|
|
48
|
+
run: uv sync --all-extras --locked
|
|
49
|
+
- name: Export locked deps for pip-audit (excluding cenote-core itself)
|
|
50
|
+
run: |
|
|
51
|
+
uv export --no-hashes --no-emit-project --format requirements-txt \
|
|
52
|
+
--output-file requirements.audit.txt
|
|
53
|
+
- name: Audit dependencies for known CVEs
|
|
54
|
+
run: uv tool run pip-audit --strict --requirement requirements.audit.txt
|
|
55
|
+
|
|
56
|
+
unit-tests:
|
|
57
|
+
runs-on: ubuntu-latest
|
|
58
|
+
strategy:
|
|
59
|
+
fail-fast: false
|
|
60
|
+
matrix:
|
|
61
|
+
python-version: ["3.12", "3.13"]
|
|
62
|
+
steps:
|
|
63
|
+
- uses: actions/checkout@v6
|
|
64
|
+
- name: Install uv
|
|
65
|
+
uses: astral-sh/setup-uv@v7
|
|
66
|
+
with:
|
|
67
|
+
enable-cache: true
|
|
68
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
69
|
+
run: uv python install ${{ matrix.python-version }}
|
|
70
|
+
- name: Install deps
|
|
71
|
+
run: uv sync --all-extras
|
|
72
|
+
- name: Run unit tests
|
|
73
|
+
run: uv run pytest -m "not integration" --cov=cenote --cov-report=xml
|
|
74
|
+
- name: Upload coverage artifact
|
|
75
|
+
if: always() && matrix.python-version == '3.12'
|
|
76
|
+
uses: actions/upload-artifact@v7
|
|
77
|
+
with:
|
|
78
|
+
name: coverage
|
|
79
|
+
path: coverage.xml
|
|
80
|
+
- name: Upload coverage to Codecov
|
|
81
|
+
if: matrix.python-version == '3.12'
|
|
82
|
+
uses: codecov/codecov-action@v4
|
|
83
|
+
with:
|
|
84
|
+
files: ./coverage.xml
|
|
85
|
+
flags: unit
|
|
86
|
+
fail_ci_if_error: false
|
|
87
|
+
token: ${{ secrets.CODECOV_TOKEN }}
|
|
88
|
+
|
|
89
|
+
integration-tests:
|
|
90
|
+
runs-on: ubuntu-latest
|
|
91
|
+
services:
|
|
92
|
+
postgres:
|
|
93
|
+
image: pgvector/pgvector:pg16
|
|
94
|
+
env:
|
|
95
|
+
POSTGRES_USER: cenote
|
|
96
|
+
POSTGRES_PASSWORD: cenote
|
|
97
|
+
POSTGRES_DB: cenote_test
|
|
98
|
+
ports:
|
|
99
|
+
- 5433:5432
|
|
100
|
+
options: >-
|
|
101
|
+
--health-cmd "pg_isready -U cenote -d cenote_test"
|
|
102
|
+
--health-interval 2s
|
|
103
|
+
--health-timeout 5s
|
|
104
|
+
--health-retries 30
|
|
105
|
+
env:
|
|
106
|
+
TEST_DATABASE_URL: postgresql://cenote:cenote@localhost:5433/cenote_test
|
|
107
|
+
steps:
|
|
108
|
+
- uses: actions/checkout@v6
|
|
109
|
+
- uses: astral-sh/setup-uv@v7
|
|
110
|
+
with:
|
|
111
|
+
enable-cache: true
|
|
112
|
+
- run: uv python install 3.12
|
|
113
|
+
- run: uv sync --all-extras
|
|
114
|
+
- name: Wait for Postgres
|
|
115
|
+
run: |
|
|
116
|
+
for i in {1..30}; do
|
|
117
|
+
if pg_isready -h localhost -p 5433 -U cenote; then exit 0; fi
|
|
118
|
+
sleep 1
|
|
119
|
+
done
|
|
120
|
+
exit 1
|
|
121
|
+
- name: Run integration tests with coverage
|
|
122
|
+
run: uv run pytest -m integration --cov=cenote --cov-report=xml
|
|
123
|
+
- name: Upload integration coverage to Codecov
|
|
124
|
+
uses: codecov/codecov-action@v4
|
|
125
|
+
with:
|
|
126
|
+
files: ./coverage.xml
|
|
127
|
+
flags: integration
|
|
128
|
+
fail_ci_if_error: false
|
|
129
|
+
token: ${{ secrets.CODECOV_TOKEN }}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
name: Deploy docs
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
workflow_dispatch:
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: read
|
|
10
|
+
pages: write
|
|
11
|
+
id-token: write
|
|
12
|
+
|
|
13
|
+
concurrency:
|
|
14
|
+
group: pages
|
|
15
|
+
cancel-in-progress: false
|
|
16
|
+
|
|
17
|
+
jobs:
|
|
18
|
+
deploy:
|
|
19
|
+
runs-on: ubuntu-latest
|
|
20
|
+
environment:
|
|
21
|
+
name: github-pages
|
|
22
|
+
url: ${{ steps.deployment.outputs.page_url }}
|
|
23
|
+
steps:
|
|
24
|
+
- uses: actions/checkout@v6
|
|
25
|
+
- name: Setup Pages (auto-enable if not configured)
|
|
26
|
+
uses: actions/configure-pages@v5
|
|
27
|
+
with:
|
|
28
|
+
enablement: true
|
|
29
|
+
- uses: astral-sh/setup-uv@v7
|
|
30
|
+
with:
|
|
31
|
+
enable-cache: true
|
|
32
|
+
- run: uv python install 3.12
|
|
33
|
+
- run: uv sync --all-extras
|
|
34
|
+
- run: uv run mkdocs build --strict
|
|
35
|
+
- uses: actions/upload-pages-artifact@v3
|
|
36
|
+
with:
|
|
37
|
+
path: ./site
|
|
38
|
+
- id: deployment
|
|
39
|
+
uses: actions/deploy-pages@v4
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags: ['v*']
|
|
6
|
+
workflow_dispatch:
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: read
|
|
10
|
+
id-token: write
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
build-and-publish:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
environment:
|
|
16
|
+
name: pypi
|
|
17
|
+
url: https://pypi.org/p/cenote-core
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v6
|
|
20
|
+
- uses: astral-sh/setup-uv@v7
|
|
21
|
+
with:
|
|
22
|
+
enable-cache: true
|
|
23
|
+
- run: uv python install 3.12
|
|
24
|
+
- run: uv sync --all-extras
|
|
25
|
+
- name: Build wheel + sdist
|
|
26
|
+
run: uv build
|
|
27
|
+
- name: Verify build in isolated venv
|
|
28
|
+
run: |
|
|
29
|
+
uv venv /tmp/verify
|
|
30
|
+
uv pip install --python /tmp/verify/bin/python dist/cenote_core-*.whl
|
|
31
|
+
/tmp/verify/bin/python -c "import cenote; print(cenote.__version__)"
|
|
32
|
+
- name: Publish to PyPI via Trusted Publishing
|
|
33
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
34
|
+
with:
|
|
35
|
+
packages-dir: dist/
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
|
|
8
|
+
# Virtual environments
|
|
9
|
+
.venv/
|
|
10
|
+
venv/
|
|
11
|
+
env/
|
|
12
|
+
ENV/
|
|
13
|
+
|
|
14
|
+
# Build artifacts
|
|
15
|
+
build/
|
|
16
|
+
dist/
|
|
17
|
+
*.egg-info/
|
|
18
|
+
*.egg
|
|
19
|
+
.eggs/
|
|
20
|
+
|
|
21
|
+
# Testing
|
|
22
|
+
.pytest_cache/
|
|
23
|
+
.coverage
|
|
24
|
+
.coverage.*
|
|
25
|
+
htmlcov/
|
|
26
|
+
.tox/
|
|
27
|
+
.nox/
|
|
28
|
+
|
|
29
|
+
# Type checking
|
|
30
|
+
.mypy_cache/
|
|
31
|
+
.pyright/
|
|
32
|
+
|
|
33
|
+
# Ruff
|
|
34
|
+
.ruff_cache/
|
|
35
|
+
|
|
36
|
+
# Environment
|
|
37
|
+
.env
|
|
38
|
+
.env.*
|
|
39
|
+
!.env.example
|
|
40
|
+
|
|
41
|
+
# IDEs
|
|
42
|
+
.vscode/
|
|
43
|
+
.idea/
|
|
44
|
+
*.swp
|
|
45
|
+
*.swo
|
|
46
|
+
|
|
47
|
+
# OS
|
|
48
|
+
.DS_Store
|
|
49
|
+
Thumbs.db
|
|
50
|
+
|
|
51
|
+
# Claude Code local overrides (not the shared settings.json)
|
|
52
|
+
.claude/settings.local.json
|
|
53
|
+
.claude/cache/
|
|
54
|
+
|
|
55
|
+
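# Internal brainstorming specs (ephemeral artifacts). NOTE: the pattern below ignores the whole directory, so new files under docs/superpowers/plans/ are NOT tracked unless force-added with `git add -f` (files already committed stay tracked)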
|
|
56
|
+
docs/superpowers/
|
|
57
|
+
|
|
58
|
+
# Docker volumes (test pgvector data)
|
|
59
|
+
.docker-volumes/
|
|
60
|
+
|
|
61
|
+
# mkdocs build output
|
|
62
|
+
/site/
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
3
|
+
rev: v5.0.0
|
|
4
|
+
hooks:
|
|
5
|
+
- id: trailing-whitespace
|
|
6
|
+
- id: end-of-file-fixer
|
|
7
|
+
- id: check-yaml
|
|
8
|
+
- id: check-toml
|
|
9
|
+
- id: check-merge-conflict
|
|
10
|
+
- id: check-added-large-files
|
|
11
|
+
args: ["--maxkb=500"]
|
|
12
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
13
|
+
rev: v0.15.14
|
|
14
|
+
hooks:
|
|
15
|
+
- id: ruff
|
|
16
|
+
args: ["--fix"]
|
|
17
|
+
- id: ruff-format
|
|
18
|
+
- repo: local
|
|
19
|
+
hooks:
|
|
20
|
+
- id: mypy
|
|
21
|
+
name: mypy
|
|
22
|
+
language: system
|
|
23
|
+
entry: uv run mypy
|
|
24
|
+
files: ^src/
|
|
25
|
+
pass_filenames: false
|
|
26
|
+
args: ["src/"]
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented here.
|
|
4
|
+
|
|
5
|
+
Format: [Keep a Changelog 1.1.0](https://keepachangelog.com/en/1.1.0/).
|
|
6
|
+
Versioning: [SemVer 2.0.0](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
> **Pre-1.0 disclaimer.** APIs may break in any minor release until `1.0.0` ships.
|
|
9
|
+
> Patch releases (`0.1.0` → `0.1.1`) are bug fixes only.
|
|
10
|
+
|
|
11
|
+
## [Unreleased]
|
|
12
|
+
|
|
13
|
+
### Added
|
|
14
|
+
|
|
15
|
+
- (none yet)
|
|
16
|
+
|
|
17
|
+
## [0.1.0] - 2026-05-25
|
|
18
|
+
|
|
19
|
+
### Added
|
|
20
|
+
|
|
21
|
+
- PyPI package name: `cenote-core` (aligns with `langchain-core`,
|
|
22
|
+
`llama-index-core`, `pydantic-core` ecosystem patterns). Import remains
|
|
23
|
+
`import cenote` (Pillow/scikit-learn precedent — distribution name and
|
|
24
|
+
import name need not match).
|
|
25
|
+
- GitHub repo renamed `pycenote` → `cenote` (brand umbrella; GitHub
|
|
26
|
+
auto-redirects old URLs).
|
|
27
|
+
- `CONTRIBUTING.md` with dev setup, test commands, code style, commit
|
|
28
|
+
conventions, and the release process via PyPI Trusted Publishing.
|
|
29
|
+
- Full README rewrite with new positioning ("not a LangChain alternative —
|
|
30
|
+
production minimalist for teams that hit framework complexity ceilings"),
|
|
31
|
+
module status table split into M1.0 / M1.1+ columns, expanded quickstart,
|
|
32
|
+
extension example with structural-typing pattern, architecture section
|
|
33
|
+
with diagram links, and roadmap with realistic M1.0/M1.1/M1.2+ scope.
|
|
34
|
+
- GitHub Actions release workflow with PyPI OIDC trusted publishing.
|
|
35
|
+
Triggers on `v*` tag push and publishes to <https://pypi.org/project/cenote-core/>.
|
|
36
|
+
Requires one-time setup on PyPI to register the pending publisher.
|
|
37
|
+
- Cookbook: `examples/custom_embedder.py` (structural-typing demo — implement
|
|
38
|
+
the `Embedder` protocol without inheritance) and `examples/pgvector_setup.py`
|
|
39
|
+
(production PgVectorStore — connect with retry, apply migrations,
|
|
40
|
+
multi-tenant indexing, namespace isolation verification).
|
|
41
|
+
- `examples` added to ruff src list.
|
|
42
|
+
- Documentation site powered by `mkdocs-material` and `mkdocstrings-python`.
|
|
43
|
+
Deployed to GitHub Pages at <https://jovandyaz.github.io/cenote/> via
|
|
44
|
+
GitHub Actions on every push to main. Includes auto-generated API
|
|
45
|
+
reference, quickstart, architecture page linking to drawio diagrams, and
|
|
46
|
+
extension tutorials for custom embedders and chunkers.
|
|
47
|
+
- `mkdocs-material`, `mkdocstrings-python` added as dev dependencies.
|
|
48
|
+
- Docs badge added to README. Diagrams link directly to GitHub's native
|
|
49
|
+
drawio renderer (no PNG exports needed).
|
|
50
|
+
- PgVectorStore unit-level test coverage raised from 21% to ≥80% via
|
|
51
|
+
`tests/stores/test_pgvector_helpers.py` (pure helpers, no Postgres
|
|
52
|
+
dependency). Integration tests continue to cover the database layer.
|
|
53
|
+
- Codecov integration: CI uploads `coverage.xml` to Codecov on every push
|
|
54
|
+
to main; coverage badge added to README. Requires `CODECOV_TOKEN` secret.
|
|
55
|
+
- Structured logging via `logging.getLogger(__name__)` in every non-trivial
|
|
56
|
+
module under `src/cenote/`. Key events emit at DEBUG; transient failures
|
|
57
|
+
(retries, rate-limit waits) emit at WARNING. No `print()` calls remain.
|
|
58
|
+
- `cenote.types` module with public type aliases (`Vector`, `Namespace`,
|
|
59
|
+
`ModelId`, `ContentHash`). Adopted in public signatures of `Embedder`,
|
|
60
|
+
`EmbeddingCache`, `VectorStore`.
|
|
61
|
+
- `cenote.errors` exception hierarchy: `CenoteError`, `ConfigurationError`,
|
|
62
|
+
`EmbeddingError`, `RateLimitError`, `VectorStoreError`, `DimensionMismatchError`,
|
|
63
|
+
`MigrationError`. Replaces bare `ValueError` raises throughout `src/cenote/`.
|
|
64
|
+
- `cenote.models`: `Document`, `Chunk`, `EmbeddedChunk`, `RetrievalResult`
|
|
65
|
+
Pydantic v2 models with `extra="forbid"`. `Chunk.make_id(doc_id, pos)`
|
|
66
|
+
produces deterministic chunk IDs.
|
|
67
|
+
- `cenote.chunkers`: `Chunker` Protocol (with `chunk.content` contract docstring)
|
|
68
|
+
and `RecursiveCharacterChunker` (priority-list separators, configurable
|
|
69
|
+
`chunk_size=512` and `chunk_overlap=50`, deep-copied metadata, unicode-safe).
|
|
70
|
+
- `cenote.embedders`: `Embedder` Protocol (`model_id`, `dimensions`, async
|
|
71
|
+
`embed`/`embed_query`) and `MockEmbedder` (deterministic unit-norm vectors
|
|
72
|
+
derived from content hash; matches real-embedder distribution to surface
|
|
73
|
+
ranking bugs that raw Gaussian vectors would hide).
|
|
74
|
+
- `cenote.embedders.cache`: `EmbeddingCache` Protocol, `InMemoryCache`
|
|
75
|
+
(dict-backed, copies on `set` to avoid poisoning), and `CachedEmbedder`
|
|
76
|
+
wrapper (slot-array preserves input order; only cache misses hit the
|
|
77
|
+
inner embedder; cache key is `(model_id, content_hash)` so different
|
|
78
|
+
models do not collide).
|
|
79
|
+
- `cenote.embedders.voyage.VoyageEmbedder` and
|
|
80
|
+
`cenote.embedders.cohere.CohereEmbedder`: production-grade multilingual
|
|
81
|
+
embedders over Voyage AI and Cohere v2 REST APIs. Both ship with input
|
|
82
|
+
batching (Voyage ≤128/req, Cohere ≤96/req), concurrency caps via
|
|
83
|
+
`asyncio.Semaphore`, exponential-backoff retries on 429/5xx, and an
|
|
84
|
+
optional sliding-window RPM rate limiter.
|
|
85
|
+
- `cenote.embedders._http`: shared `RateLimiter` (sliding window, lock-coordinated
|
|
86
|
+
across tasks) and `retrying(...)` helper.
|
|
87
|
+
- Runtime dep: `httpx>=0.27`. Dev dep: `respx>=0.21` (HTTP mocking — no real
|
|
88
|
+
API calls in CI).
|
|
89
|
+
- `.env.example`: template for `VOYAGE_API_KEY`, `COHERE_API_KEY`.
|
|
90
|
+
- `cenote.stores`: `VectorStore` Protocol (multi-tenant, `namespace` mandatory
|
|
91
|
+
on every method) and `InMemoryVectorStore` (numpy-backed cosine similarity,
|
|
92
|
+
per-namespace dicts, optional metadata-filter via exact JSONB-style match).
|
|
93
|
+
The production-grade backend (`PgVectorStore`) is covered by its own entry in this release.
|
|
94
|
+
- Runtime dep: `numpy>=2.0`.
|
|
95
|
+
- `cenote.retrievers`: `Retriever` Protocol and `VectorRetriever` (composes
|
|
96
|
+
any `Embedder` with any `VectorStore`; embeds the query, searches the store,
|
|
97
|
+
normalizes `retriever="vector"` on every `RetrievalResult`).
|
|
98
|
+
- `cenote.stores.PgVectorStore`: production-grade `VectorStore` backed by
|
|
99
|
+
Postgres + pgvector. Hardenings: transactional `upsert`/`delete`,
|
|
100
|
+
idempotent migrations tracking (`cenote_schema_migrations` table),
|
|
101
|
+
exponential-backoff `connect()` retry on container-not-ready races,
|
|
102
|
+
pre-flight dimension validation, configurable HNSW `m` /
|
|
103
|
+
`ef_construction` (migration template) and runtime `ef_search`. Initial
|
|
104
|
+
schema in `001_init.sql` ships a GIN index on `metadata` so `@>` filters
|
|
105
|
+
are O(log n) instead of seq-scan.
|
|
106
|
+
- `docker-compose.test.yml`: `pgvector/pgvector:pg16` container on port 5433
|
|
107
|
+
for local integration tests.
|
|
108
|
+
- CI: new `integration-tests` job spins up a Postgres service container,
|
|
109
|
+
exports `TEST_DATABASE_URL`, and runs `pytest -m integration`.
|
|
110
|
+
- Runtime dep: `asyncpg>=0.30`. Dev dep: `asyncpg-stubs>=0.31.2` (types).
|
|
111
|
+
- `demos/quickstart.py`: end-to-end demo CLI — indexes a small EN corpus
|
|
112
|
+
(`demos/data/wikipedia_snippets.json`, 20 entries) through chunker +
|
|
113
|
+
embedder + in-memory store, then runs sample queries against
|
|
114
|
+
`VectorRetriever`. `--provider {mock,voyage,cohere}` toggles between the
|
|
115
|
+
no-API mock and the real multilingual embedders. Smoke test
|
|
116
|
+
(`tests/demos/test_quickstart_smoke.py`) runs the mock path on every CI.
|
|
117
|
+
- README: added `## Quickstart` section with the three provider invocations
|
|
118
|
+
and softened the "LATAM-rooted" claim to match what M1.0 actually ships
|
|
119
|
+
(multilingual embedders today; Spanish-specific BM25 + ES eval datasets
|
|
120
|
+
on the M1.1+ roadmap).
|
|
121
|
+
- `cenote.rerankers.Reranker` Protocol (no impl yet — concrete
|
|
122
|
+
`VoyageReranker` / `CohereReranker` ship in M1.1).
|
|
123
|
+
- `cenote.observability`: `Tracer` Protocol + `NoopTracer` default. OTel and
|
|
124
|
+
Langfuse adapters land in M1.1 without breaking the API.
|
|
125
|
+
- `cenote.eval.metrics`: BEIR-style retrieval quality helpers —
|
|
126
|
+
`precision_at_k`, `recall_at_k`, `mean_reciprocal_rank`. DeepEval
|
|
127
|
+
integration arrives in M1.1.
|
|
128
|
+
- Initial project scaffolding: `uv`, `ruff`, `mypy --strict`, `pytest`, `pre-commit`,
|
|
129
|
+
GitHub Actions CI (lint + type + unit tests, Python 3.12 & 3.13, `pip-audit`).
|
|
130
|
+
- `LICENSE` (Apache 2.0), `CHANGELOG.md`, `SECURITY.md`.
|
|
131
|
+
- `py.typed` marker — package ships type information to consumers (PEP 561).
|
|
132
|
+
- `__version__` exposed via `importlib.metadata` (single source of truth).
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
# CLAUDE.md
|
|
2
|
+
|
|
3
|
+
This file is the persistent context for Claude Code sessions working on `cenote`. Read it at the start of every session.
|
|
4
|
+
|
|
5
|
+
## What this project is
|
|
6
|
+
|
|
7
|
+
`cenote` is a production-grade Python framework for building agentic RAG applications, with first-class support for Spanish-language content and Latin American use cases. It is the **shared core** for two downstream products that live in separate repos:
|
|
8
|
+
|
|
9
|
+
- **knowtis-ai** — RAG and research agent over the Knowtis notes platform
|
|
10
|
+
- **cfdi-agent** — accounting reconciliation + CFDI 4.0 compliance agent for Mexican PYMEs
|
|
11
|
+
|
|
12
|
+
The two downstream products validate the core from opposite ends: knowtis-ai needs creative synthesis, while cfdi-agent needs deterministic correctness with audit trails. If the core serves both, it serves most production RAG verticals.
|
|
13
|
+
|
|
14
|
+
This repo (`cenote`) contains the core library only. PyPI publication name is `cenote-core` — aligned with `langchain-core` / `llama-index-core` / `pydantic-core` patterns. Import remains `import cenote` (Pillow/scikit-learn precedent: distribution name and import name need not match). The bare `cenote` PyPI slot is held by an abandoned 2019 project; we do not pursue PEP 541 reclaim.
|
|
15
|
+
|
|
16
|
+
## Tech stack
|
|
17
|
+
|
|
18
|
+
- **Python**: 3.12+
|
|
19
|
+
- **Package manager**: `uv` (use `uv sync`, `uv add`, `uv run ...`)
|
|
20
|
+
- **Linting & formatting**: `ruff` (replaces black, isort, flake8)
|
|
21
|
+
- **Type checking**: `mypy --strict`
|
|
22
|
+
- **Testing**: `pytest` + `pytest-asyncio` + `pytest-cov`
|
|
23
|
+
- **Data validation**: `pydantic` v2
|
|
24
|
+
- **Async**: default; sync only when wrapping inherently sync libraries
|
|
25
|
+
- **LLM provider**: Anthropic Claude (Sonnet 4.5 default, Opus for high-stakes reasoning)
|
|
26
|
+
- **Agent framework**: LangGraph (for state machines with conditional edges)
|
|
27
|
+
- **Vector store**: pgvector (Postgres extension)
|
|
28
|
+
- **Embeddings**: provider-agnostic `Embedder` protocol, with concrete Voyage AI and Cohere multilingual implementations in `cenote.embedders` (plus `MockEmbedder` for tests)
|
|
29
|
+
- **Observability**: Langfuse (self-hostable)
|
|
30
|
+
- **Evaluation**: DeepEval + custom metrics
|
|
31
|
+
|
|
32
|
+
## Project structure
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
cenote/ (repo root; checkouts that predate the `pycenote` → `cenote` rename may still be named `pycenote/` on disk)
|
|
36
|
+
├── src/cenote/
|
|
37
|
+
│ ├── chunkers/ # Text/markdown splitting
|
|
38
|
+
│ ├── embedders/ # Embedding providers (protocol + impls)
|
|
39
|
+
│ ├── stores/ # Vector stores (protocol + impls)
|
|
40
|
+
│ ├── retrievers/ # Retrieval strategies (BM25, vector, hybrid)
|
|
41
|
+
│ ├── rerankers/ # (future) Reranking strategies
|
|
42
|
+
│ ├── llm/ # (future) LLM client abstractions
|
|
43
|
+
│ ├── agents/ # (future) Agent primitives over LangGraph
|
|
44
|
+
│ ├── eval/ # (future) Eval harness
|
|
45
|
+
│ ├── observability/ # (future) Tracing helpers
|
|
46
|
+
│ ├── models.py # Pydantic models (Document, Chunk, etc.)
|
|
47
|
+
│ └── types.py # Shared type aliases
|
|
48
|
+
├── tests/ # Mirror src/cenote/ structure
|
|
49
|
+
├── docs/ # Milestone briefs and design docs
|
|
50
|
+
└── pyproject.toml
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Conventions
|
|
54
|
+
|
|
55
|
+
### Code style
|
|
56
|
+
|
|
57
|
+
- Type hints on everything public. `mypy --strict` must pass.
|
|
58
|
+
- Pydantic models for any data crossing module boundaries.
|
|
59
|
+
- Prefer `Protocol` over `ABC` for interfaces — duck typing + better composition.
|
|
60
|
+
- One class per concept per file. Avoid mega-modules.
|
|
61
|
+
- Async by default. Sync versions only where retrieval libraries force it.
|
|
62
|
+
|
|
63
|
+
### License headers (SPDX)
|
|
64
|
+
|
|
65
|
+
Every `.py` file in `src/` starts with:
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
It must be the **first non-empty line** (before the module docstring). Test files are exempt. Lint enforcement via `reuse-tool` is M1.1+; for now it's convention.
|
|
72
|
+
|
|
73
|
+
### Naming
|
|
74
|
+
|
|
75
|
+
- Files and modules: `snake_case`
|
|
76
|
+
- Classes: `PascalCase`
|
|
77
|
+
- Protocols: just use the bare noun (`Chunker`, `Embedder`). Suffix with `Protocol` only when there's ambiguity with a concrete impl in the same module.
|
|
78
|
+
- Spanish identifiers are OK in domain-specific downstream code (`cfdi`, `rfc`, `iva`), never in this core repo.
|
|
79
|
+
|
|
80
|
+
### Tests
|
|
81
|
+
|
|
82
|
+
- One test file per source file. Mirror the path: `src/cenote/chunkers/markdown.py` ↔ `tests/chunkers/test_markdown.py`.
|
|
83
|
+
- Shared setup goes in `tests/conftest.py` or per-directory `conftest.py`.
|
|
84
|
+
- Integration tests (those requiring Postgres) go in `tests/integration/` and are marked `@pytest.mark.integration`.
|
|
85
|
+
- Aim for >80% coverage on `src/cenote/`.
|
|
86
|
+
|
|
87
|
+
### Commits
|
|
88
|
+
|
|
89
|
+
- Conventional commits: `feat:`, `fix:`, `chore:`, `docs:`, `test:`, `refactor:`
|
|
90
|
+
- Keep PRs focused. One sub-deliverable per PR (see `docs/00-first-milestone.md` for suggested breakdown).
|
|
91
|
+
- Branch naming: `feat/<short-desc>`, `fix/<short-desc>`, etc.
|
|
92
|
+
|
|
93
|
+
### Documentation
|
|
94
|
+
|
|
95
|
+
- Public functions/classes need docstrings (Google style).
|
|
96
|
+
- Module-level docstring explains the module's purpose in 1–2 lines.
|
|
97
|
+
- Examples in docstrings should be runnable.
|
|
98
|
+
|
|
99
|
+
## Commands
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
# Setup
|
|
103
|
+
uv sync # install all deps (incl. dev)
|
|
104
|
+
|
|
105
|
+
# Development
|
|
106
|
+
uv run pytest # all tests
|
|
107
|
+
uv run pytest tests/chunkers/ # subset
|
|
108
|
+
uv run pytest -m "not integration" # skip integration tests
|
|
109
|
+
uv run pytest --cov=cenote # with coverage
|
|
110
|
+
uv run ruff check . # lint
|
|
111
|
+
uv run ruff format . # format
|
|
112
|
+
uv run mypy src/ # type check
|
|
113
|
+
uv run pre-commit run --all-files # all checks
|
|
114
|
+
|
|
115
|
+
# Adding deps
|
|
116
|
+
uv add <package> # runtime dep
|
|
117
|
+
uv add --dev <package> # dev dep
|
|
118
|
+
|
|
119
|
+
# Build
|
|
120
|
+
uv build # build sdist + wheel
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## What to NOT do
|
|
124
|
+
|
|
125
|
+
- **Do not pull in LangChain** as a dependency. We use LangGraph (a focused subset) only. The LangChain mega-package is not wanted here.
|
|
126
|
+
- **Do not add concrete embedder implementations yet**. The `Embedder` protocol must stabilize first; the Voyage vs Cohere decision is pending.
|
|
127
|
+
- **Do not bake assumptions about Spanish-only**. The framework is LATAM-aware but multilingual. Default tokenizer/chunker choices must not be English-only.
|
|
128
|
+
- **Do not add framework-specific code** (FastAPI, Django, Flask, etc.) to `cenote` core. Those belong in downstream services that depend on `cenote`.
|
|
129
|
+
- **Do not commit secrets**. Use `.env` (gitignored) and `pydantic-settings`.
|
|
130
|
+
- **Do not skip type hints or tests** for "I'll add them later". Add them in the same PR or don't add the code.
|
|
131
|
+
- **Do not bypass the namespace parameter** on `VectorStore` / `Retriever` interfaces. Multi-tenancy is enforced at the protocol level.
|
|
132
|
+
- **Do not edit migrations** once committed. Add a new migration instead.
|
|
133
|
+
|
|
134
|
+
## Current focus
|
|
135
|
+
|
|
136
|
+
See `docs/00-first-milestone.md` for the active milestone scope. As of project start, we are building **M1.0 — Core Primitives**: chunkers, embedders, stores, retrievers.
|
|
137
|
+
|
|
138
|
+
Do NOT start work on agents, eval, or observability yet. Those depend on the primitives being stable.
|
|
139
|
+
|
|
140
|
+
## When in doubt
|
|
141
|
+
|
|
142
|
+
- Check `docs/` for design rationales
|
|
143
|
+
- Check existing implementations in the same module for patterns
|
|
144
|
+
- If introducing a new top-level dependency, propose it first (open a discussion or ask the maintainer)
|
|
145
|
+
- The maintainer is Jovan Díaz ([@jovandyaz](https://github.com/jovandyaz))
|