cenote-core 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104) hide show
  1. cenote_core-0.1.0/.claude/settings.json +69 -0
  2. cenote_core-0.1.0/.env.example +3 -0
  3. cenote_core-0.1.0/.github/dependabot.yml +18 -0
  4. cenote_core-0.1.0/.github/workflows/ci.yml +129 -0
  5. cenote_core-0.1.0/.github/workflows/docs.yml +39 -0
  6. cenote_core-0.1.0/.github/workflows/release.yml +35 -0
  7. cenote_core-0.1.0/.gitignore +62 -0
  8. cenote_core-0.1.0/.markdownlint.json +7 -0
  9. cenote_core-0.1.0/.pre-commit-config.yaml +26 -0
  10. cenote_core-0.1.0/CHANGELOG.md +132 -0
  11. cenote_core-0.1.0/CLAUDE.md +145 -0
  12. cenote_core-0.1.0/CONTRIBUTING.md +84 -0
  13. cenote_core-0.1.0/LICENSE +202 -0
  14. cenote_core-0.1.0/PKG-INFO +170 -0
  15. cenote_core-0.1.0/README.md +144 -0
  16. cenote_core-0.1.0/SECURITY.md +20 -0
  17. cenote_core-0.1.0/demos/__init__.py +0 -0
  18. cenote_core-0.1.0/demos/data/wikipedia_snippets.json +102 -0
  19. cenote_core-0.1.0/demos/quickstart.py +91 -0
  20. cenote_core-0.1.0/docker-compose.test.yml +14 -0
  21. cenote_core-0.1.0/docs/00-first-milestone.md +410 -0
  22. cenote_core-0.1.0/docs/01-claude-code-prompts.md +266 -0
  23. cenote_core-0.1.0/docs/diagrams/01-ecosystem.drawio +88 -0
  24. cenote_core-0.1.0/docs/diagrams/02-architecture.drawio +178 -0
  25. cenote_core-0.1.0/docs/diagrams/03-runtime-flow.drawio +235 -0
  26. cenote_core-0.1.0/docs/diagrams/README.md +19 -0
  27. cenote_core-0.1.0/docs/site/api/chunkers.md +5 -0
  28. cenote_core-0.1.0/docs/site/api/embedders.md +15 -0
  29. cenote_core-0.1.0/docs/site/api/eval.md +6 -0
  30. cenote_core-0.1.0/docs/site/api/models.md +6 -0
  31. cenote_core-0.1.0/docs/site/api/retrievers.md +5 -0
  32. cenote_core-0.1.0/docs/site/api/stores.md +7 -0
  33. cenote_core-0.1.0/docs/site/architecture.md +36 -0
  34. cenote_core-0.1.0/docs/site/changelog.md +3 -0
  35. cenote_core-0.1.0/docs/site/extending/custom-chunker.md +38 -0
  36. cenote_core-0.1.0/docs/site/extending/custom-embedder.md +60 -0
  37. cenote_core-0.1.0/docs/site/index.md +28 -0
  38. cenote_core-0.1.0/docs/site/quickstart.md +84 -0
  39. cenote_core-0.1.0/examples/README.md +35 -0
  40. cenote_core-0.1.0/examples/custom_embedder.py +108 -0
  41. cenote_core-0.1.0/examples/pgvector_setup.py +103 -0
  42. cenote_core-0.1.0/mkdocs.yml +64 -0
  43. cenote_core-0.1.0/pyproject.toml +97 -0
  44. cenote_core-0.1.0/src/cenote/__init__.py +35 -0
  45. cenote_core-0.1.0/src/cenote/chunkers/__init__.py +7 -0
  46. cenote_core-0.1.0/src/cenote/chunkers/base.py +29 -0
  47. cenote_core-0.1.0/src/cenote/chunkers/recursive.py +103 -0
  48. cenote_core-0.1.0/src/cenote/embedders/__init__.py +18 -0
  49. cenote_core-0.1.0/src/cenote/embedders/_http.py +101 -0
  50. cenote_core-0.1.0/src/cenote/embedders/base.py +25 -0
  51. cenote_core-0.1.0/src/cenote/embedders/cache.py +96 -0
  52. cenote_core-0.1.0/src/cenote/embedders/cohere.py +137 -0
  53. cenote_core-0.1.0/src/cenote/embedders/mock.py +57 -0
  54. cenote_core-0.1.0/src/cenote/embedders/voyage.py +136 -0
  55. cenote_core-0.1.0/src/cenote/errors.py +32 -0
  56. cenote_core-0.1.0/src/cenote/eval/__init__.py +6 -0
  57. cenote_core-0.1.0/src/cenote/eval/metrics.py +48 -0
  58. cenote_core-0.1.0/src/cenote/models.py +56 -0
  59. cenote_core-0.1.0/src/cenote/observability/__init__.py +6 -0
  60. cenote_core-0.1.0/src/cenote/observability/base.py +31 -0
  61. cenote_core-0.1.0/src/cenote/py.typed +0 -0
  62. cenote_core-0.1.0/src/cenote/rerankers/__init__.py +6 -0
  63. cenote_core-0.1.0/src/cenote/rerankers/base.py +26 -0
  64. cenote_core-0.1.0/src/cenote/retrievers/__init__.py +7 -0
  65. cenote_core-0.1.0/src/cenote/retrievers/base.py +20 -0
  66. cenote_core-0.1.0/src/cenote/retrievers/vector.py +36 -0
  67. cenote_core-0.1.0/src/cenote/stores/__init__.py +8 -0
  68. cenote_core-0.1.0/src/cenote/stores/base.py +31 -0
  69. cenote_core-0.1.0/src/cenote/stores/memory.py +91 -0
  70. cenote_core-0.1.0/src/cenote/stores/pgvector.py +246 -0
  71. cenote_core-0.1.0/src/cenote/stores/pgvector_migrations/001_init.sql +36 -0
  72. cenote_core-0.1.0/src/cenote/stores/pgvector_migrations/002_namespace_id_pk.sql +10 -0
  73. cenote_core-0.1.0/src/cenote/stores/pgvector_migrations/__init__.py +0 -0
  74. cenote_core-0.1.0/src/cenote/types.py +7 -0
  75. cenote_core-0.1.0/tests/__init__.py +0 -0
  76. cenote_core-0.1.0/tests/chunkers/__init__.py +0 -0
  77. cenote_core-0.1.0/tests/chunkers/test_recursive.py +99 -0
  78. cenote_core-0.1.0/tests/conftest.py +1 -0
  79. cenote_core-0.1.0/tests/demos/__init__.py +0 -0
  80. cenote_core-0.1.0/tests/demos/test_quickstart_smoke.py +12 -0
  81. cenote_core-0.1.0/tests/embedders/__init__.py +0 -0
  82. cenote_core-0.1.0/tests/embedders/test_cache.py +121 -0
  83. cenote_core-0.1.0/tests/embedders/test_cohere.py +112 -0
  84. cenote_core-0.1.0/tests/embedders/test_http.py +50 -0
  85. cenote_core-0.1.0/tests/embedders/test_mock.py +92 -0
  86. cenote_core-0.1.0/tests/embedders/test_voyage.py +199 -0
  87. cenote_core-0.1.0/tests/eval/__init__.py +0 -0
  88. cenote_core-0.1.0/tests/eval/test_metrics.py +84 -0
  89. cenote_core-0.1.0/tests/integration/__init__.py +0 -0
  90. cenote_core-0.1.0/tests/integration/test_pgvector.py +141 -0
  91. cenote_core-0.1.0/tests/observability/__init__.py +0 -0
  92. cenote_core-0.1.0/tests/observability/test_base.py +14 -0
  93. cenote_core-0.1.0/tests/rerankers/__init__.py +0 -0
  94. cenote_core-0.1.0/tests/rerankers/test_base.py +9 -0
  95. cenote_core-0.1.0/tests/retrievers/__init__.py +0 -0
  96. cenote_core-0.1.0/tests/retrievers/test_vector.py +80 -0
  97. cenote_core-0.1.0/tests/stores/__init__.py +0 -0
  98. cenote_core-0.1.0/tests/stores/test_memory.py +139 -0
  99. cenote_core-0.1.0/tests/stores/test_pgvector_helpers.py +87 -0
  100. cenote_core-0.1.0/tests/test_errors.py +38 -0
  101. cenote_core-0.1.0/tests/test_logging_smoke.py +48 -0
  102. cenote_core-0.1.0/tests/test_models.py +152 -0
  103. cenote_core-0.1.0/tests/test_types.py +26 -0
  104. cenote_core-0.1.0/uv.lock +1309 -0
@@ -0,0 +1,69 @@
1
+ {
2
+ "permissions": {
3
+ "defaultMode": "default",
4
+ "allow": [
5
+ "Read",
6
+ "Bash(uv sync*)",
7
+ "Bash(uv run pytest*)",
8
+ "Bash(uv run ruff*)",
9
+ "Bash(uv run mypy*)",
10
+ "Bash(uv run pre-commit*)",
11
+ "Bash(uv run python*)",
12
+ "Bash(uv build*)",
13
+ "Bash(uv lock*)",
14
+ "Bash(uv tree*)",
15
+ "Bash(uv pip list*)",
16
+ "Bash(ls *)",
17
+ "Bash(cat *)",
18
+ "Bash(head *)",
19
+ "Bash(tail *)",
20
+ "Bash(grep *)",
21
+ "Bash(rg *)",
22
+ "Bash(find *)",
23
+ "Bash(tree*)",
24
+ "Bash(wc *)",
25
+ "Bash(pwd)",
26
+ "Bash(mkdir -p *)",
27
+ "Bash(git status*)",
28
+ "Bash(git diff*)",
29
+ "Bash(git log*)",
30
+ "Bash(git branch*)",
31
+ "Bash(git show*)",
32
+ "Bash(git add *)",
33
+ "Bash(git restore *)",
34
+ "Bash(git stash*)",
35
+ "Bash(docker compose -f docker-compose.test.yml ps*)",
36
+ "Bash(docker compose -f docker-compose.test.yml logs*)"
37
+ ],
38
+ "ask": [
39
+ "Write",
40
+ "Edit",
41
+ "Bash(uv add*)",
42
+ "Bash(uv remove*)",
43
+ "Bash(git checkout *)",
44
+ "Bash(git commit *)",
45
+ "Bash(git merge *)",
46
+ "Bash(git rebase *)",
47
+ "Bash(git reset *)",
48
+ "Bash(docker compose -f docker-compose.test.yml up*)",
49
+ "Bash(docker compose -f docker-compose.test.yml down*)",
50
+ "Bash(docker compose -f docker-compose.test.yml run*)"
51
+ ],
52
+ "deny": [
53
+ "Bash(rm -rf *)",
54
+ "Bash(rm -fr *)",
55
+ "Bash(git push*)",
56
+ "Bash(git push --force*)",
57
+ "Bash(git push -f*)",
58
+ "Bash(git reset --hard*)",
59
+ "Bash(git clean -fd*)",
60
+ "Bash(gh pr merge*)",
61
+ "Bash(gh repo delete*)",
62
+ "Bash(uv publish*)",
63
+ "Bash(npm publish*)",
64
+ "Bash(curl *)",
65
+ "Bash(wget *)",
66
+ "WebFetch"
67
+ ]
68
+ }
69
+ }
@@ -0,0 +1,3 @@
1
+ # Copy to .env and fill in real values (never commit .env)
2
+ VOYAGE_API_KEY=
3
+ COHERE_API_KEY=
@@ -0,0 +1,18 @@
1
+ version: 2
2
+ updates:
3
+ - package-ecosystem: "uv"
4
+ directory: "/"
5
+ schedule:
6
+ interval: "monthly"
7
+ open-pull-requests-limit: 3
8
+ groups:
9
+ python-dependencies:
10
+ patterns: ["*"]
11
+ - package-ecosystem: "github-actions"
12
+ directory: "/"
13
+ schedule:
14
+ interval: "monthly"
15
+ open-pull-requests-limit: 3
16
+ groups:
17
+ github-actions:
18
+ patterns: ["*"]
@@ -0,0 +1,129 @@
1
+ name: CI
2
+
3
+ on:
4
+ pull_request:
5
+ branches: [main]
6
+ push:
7
+ branches: [main]
8
+
9
+ concurrency:
10
+ group: ${{ github.workflow }}-${{ github.ref }}
11
+ cancel-in-progress: true
12
+
13
+ jobs:
14
+ lint-and-type:
15
+ runs-on: ubuntu-latest
16
+ strategy:
17
+ fail-fast: false
18
+ matrix:
19
+ python-version: ["3.12", "3.13"]
20
+ steps:
21
+ - uses: actions/checkout@v6
22
+ - name: Install uv
23
+ uses: astral-sh/setup-uv@v7
24
+ with:
25
+ enable-cache: true
26
+ - name: Set up Python ${{ matrix.python-version }}
27
+ run: uv python install ${{ matrix.python-version }}
28
+ - name: Install deps
29
+ run: uv sync --all-extras
30
+ - name: Ruff check
31
+ run: uv run ruff check .
32
+ - name: Ruff format check
33
+ run: uv run ruff format --check .
34
+ - name: Mypy
35
+ run: uv run mypy src/
36
+
37
+ security-audit:
38
+ runs-on: ubuntu-latest
39
+ steps:
40
+ - uses: actions/checkout@v6
41
+ - name: Install uv
42
+ uses: astral-sh/setup-uv@v7
43
+ with:
44
+ enable-cache: true
45
+ - name: Set up Python
46
+ run: uv python install 3.12
47
+ - name: Install deps (locked)
48
+ run: uv sync --all-extras --locked
49
+ - name: Export locked deps for pip-audit (excluding cenote-core itself)
50
+ run: |
51
+ uv export --no-hashes --no-emit-project --format requirements-txt \
52
+ --output-file requirements.audit.txt
53
+ - name: Audit dependencies for known CVEs
54
+ run: uv tool run pip-audit --strict --requirement requirements.audit.txt
55
+
56
+ unit-tests:
57
+ runs-on: ubuntu-latest
58
+ strategy:
59
+ fail-fast: false
60
+ matrix:
61
+ python-version: ["3.12", "3.13"]
62
+ steps:
63
+ - uses: actions/checkout@v6
64
+ - name: Install uv
65
+ uses: astral-sh/setup-uv@v7
66
+ with:
67
+ enable-cache: true
68
+ - name: Set up Python ${{ matrix.python-version }}
69
+ run: uv python install ${{ matrix.python-version }}
70
+ - name: Install deps
71
+ run: uv sync --all-extras
72
+ - name: Run unit tests
73
+ run: uv run pytest -m "not integration" --cov=cenote --cov-report=xml
74
+ - name: Upload coverage artifact
75
+ if: always() && matrix.python-version == '3.12'
76
+ uses: actions/upload-artifact@v7
77
+ with:
78
+ name: coverage
79
+ path: coverage.xml
80
+ - name: Upload coverage to Codecov
81
+ if: matrix.python-version == '3.12'
82
+ uses: codecov/codecov-action@v4
83
+ with:
84
+ files: ./coverage.xml
85
+ flags: unit
86
+ fail_ci_if_error: false
87
+ token: ${{ secrets.CODECOV_TOKEN }}
88
+
89
+ integration-tests:
90
+ runs-on: ubuntu-latest
91
+ services:
92
+ postgres:
93
+ image: pgvector/pgvector:pg16
94
+ env:
95
+ POSTGRES_USER: cenote
96
+ POSTGRES_PASSWORD: cenote
97
+ POSTGRES_DB: cenote_test
98
+ ports:
99
+ - 5433:5432
100
+ options: >-
101
+ --health-cmd "pg_isready -U cenote -d cenote_test"
102
+ --health-interval 2s
103
+ --health-timeout 5s
104
+ --health-retries 30
105
+ env:
106
+ TEST_DATABASE_URL: postgresql://cenote:cenote@localhost:5433/cenote_test
107
+ steps:
108
+ - uses: actions/checkout@v6
109
+ - uses: astral-sh/setup-uv@v7
110
+ with:
111
+ enable-cache: true
112
+ - run: uv python install 3.12
113
+ - run: uv sync --all-extras
114
+ - name: Wait for Postgres
115
+ run: |
116
+ for i in {1..30}; do
117
+ if pg_isready -h localhost -p 5433 -U cenote; then exit 0; fi
118
+ sleep 1
119
+ done
120
+ exit 1
121
+ - name: Run integration tests with coverage
122
+ run: uv run pytest -m integration --cov=cenote --cov-report=xml
123
+ - name: Upload integration coverage to Codecov
124
+ uses: codecov/codecov-action@v4
125
+ with:
126
+ files: ./coverage.xml
127
+ flags: integration
128
+ fail_ci_if_error: false
129
+ token: ${{ secrets.CODECOV_TOKEN }}
@@ -0,0 +1,39 @@
1
+ name: Deploy docs
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ workflow_dispatch:
7
+
8
+ permissions:
9
+ contents: read
10
+ pages: write
11
+ id-token: write
12
+
13
+ concurrency:
14
+ group: pages
15
+ cancel-in-progress: false
16
+
17
+ jobs:
18
+ deploy:
19
+ runs-on: ubuntu-latest
20
+ environment:
21
+ name: github-pages
22
+ url: ${{ steps.deployment.outputs.page_url }}
23
+ steps:
24
+ - uses: actions/checkout@v6
25
+ - name: Setup Pages (auto-enable if not configured)
26
+ uses: actions/configure-pages@v5
27
+ with:
28
+ enablement: true
29
+ - uses: astral-sh/setup-uv@v7
30
+ with:
31
+ enable-cache: true
32
+ - run: uv python install 3.12
33
+ - run: uv sync --all-extras
34
+ - run: uv run mkdocs build --strict
35
+ - uses: actions/upload-pages-artifact@v3
36
+ with:
37
+ path: ./site
38
+ - id: deployment
39
+ uses: actions/deploy-pages@v4
@@ -0,0 +1,35 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags: ['v*']
6
+ workflow_dispatch:
7
+
8
+ permissions:
9
+ contents: read
10
+ id-token: write
11
+
12
+ jobs:
13
+ build-and-publish:
14
+ runs-on: ubuntu-latest
15
+ environment:
16
+ name: pypi
17
+ url: https://pypi.org/p/cenote-core
18
+ steps:
19
+ - uses: actions/checkout@v6
20
+ - uses: astral-sh/setup-uv@v7
21
+ with:
22
+ enable-cache: true
23
+ - run: uv python install 3.12
24
+ - run: uv sync --all-extras
25
+ - name: Build wheel + sdist
26
+ run: uv build
27
+ - name: Verify build in isolated venv
28
+ run: |
29
+ uv venv /tmp/verify
30
+ uv pip install --python /tmp/verify/bin/python dist/cenote_core-*.whl
31
+ /tmp/verify/bin/python -c "import cenote; print(cenote.__version__)"
32
+ - name: Publish to PyPI via Trusted Publishing
33
+ uses: pypa/gh-action-pypi-publish@release/v1
34
+ with:
35
+ packages-dir: dist/
@@ -0,0 +1,62 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+
8
+ # Virtual environments
9
+ .venv/
10
+ venv/
11
+ env/
12
+ ENV/
13
+
14
+ # Build artifacts
15
+ build/
16
+ dist/
17
+ *.egg-info/
18
+ *.egg
19
+ .eggs/
20
+
21
+ # Testing
22
+ .pytest_cache/
23
+ .coverage
24
+ .coverage.*
25
+ htmlcov/
26
+ .tox/
27
+ .nox/
28
+
29
+ # Type checking
30
+ .mypy_cache/
31
+ .pyright/
32
+
33
+ # Ruff
34
+ .ruff_cache/
35
+
36
+ # Environment
37
+ .env
38
+ .env.*
39
+ !.env.example
40
+
41
+ # IDEs
42
+ .vscode/
43
+ .idea/
44
+ *.swp
45
+ *.swo
46
+
47
+ # OS
48
+ .DS_Store
49
+ Thumbs.db
50
+
51
+ # Claude Code local overrides (not the shared settings.json)
52
+ .claude/settings.local.json
53
+ .claude/cache/
54
+
55
+ # Internal brainstorming specs (ephemeral artifacts; plans under docs/superpowers/plans/ remain tracked)
56
+ docs/superpowers/
57
+
58
+ # Docker volumes (test pgvector data)
59
+ .docker-volumes/
60
+
61
+ # mkdocs build output
62
+ /site/
@@ -0,0 +1,7 @@
1
+ {
2
+ "default": true,
3
+ "MD013": false,
4
+ "MD024": { "siblings_only": true },
5
+ "MD033": false,
6
+ "MD041": false
7
+ }
@@ -0,0 +1,26 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v5.0.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ - id: end-of-file-fixer
7
+ - id: check-yaml
8
+ - id: check-toml
9
+ - id: check-merge-conflict
10
+ - id: check-added-large-files
11
+ args: ["--maxkb=500"]
12
+ - repo: https://github.com/astral-sh/ruff-pre-commit
13
+ rev: v0.15.14
14
+ hooks:
15
+ - id: ruff
16
+ args: ["--fix"]
17
+ - id: ruff-format
18
+ - repo: local
19
+ hooks:
20
+ - id: mypy
21
+ name: mypy
22
+ language: system
23
+ entry: uv run mypy
24
+ files: ^src/
25
+ pass_filenames: false
26
+ args: ["src/"]
@@ -0,0 +1,132 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented here.
4
+
5
+ Format: [Keep a Changelog 1.1.0](https://keepachangelog.com/en/1.1.0/).
6
+ Versioning: [SemVer 2.0.0](https://semver.org/spec/v2.0.0.html).
7
+
8
+ > **Pre-1.0 disclaimer.** APIs may break in any minor release until `1.0.0` ships.
9
+ > Patch releases (`0.1.0` → `0.1.1`) are bug fixes only.
10
+
11
+ ## [Unreleased]
12
+
13
+ ### Added
14
+
15
+ - (none yet)
16
+
17
+ ## [0.1.0] - 2026-05-25
18
+
19
+ ### Added
20
+
21
+ - PyPI package name: `cenote-core` (aligns with `langchain-core`,
22
+ `llama-index-core`, `pydantic-core` ecosystem patterns). Import remains
23
+ `import cenote` (Pillow/scikit-learn precedent — distribution name and
24
+ import name need not match).
25
+ - GitHub repo renamed `pycenote` → `cenote` (brand umbrella; GitHub
26
+ auto-redirects old URLs).
27
+ - `CONTRIBUTING.md` with dev setup, test commands, code style, commit
28
+ conventions, and the release process via PyPI Trusted Publishing.
29
+ - Full README rewrite with new positioning ("not a LangChain alternative —
30
+ production minimalist for teams that hit framework complexity ceilings"),
31
+ module status table split into M1.0 / M1.1+ columns, expanded quickstart,
32
+ extension example with structural-typing pattern, architecture section
33
+ with diagram links, and roadmap with realistic M1.0/M1.1/M1.2+ scope.
34
+ - GitHub Actions release workflow with PyPI OIDC trusted publishing.
35
+ Triggers on `v*` tag push and publishes to <https://pypi.org/project/cenote-core/>.
36
+ Requires one-time setup on PyPI to register the pending publisher.
37
+ - Cookbook: `examples/custom_embedder.py` (structural-typing demo — implement
38
+ the `Embedder` protocol without inheritance) and `examples/pgvector_setup.py`
39
+ (production PgVectorStore — connect with retry, apply migrations,
40
+ multi-tenant indexing, namespace isolation verification).
41
+ - `examples` added to ruff src list.
42
+ - Documentation site powered by `mkdocs-material` and `mkdocstrings-python`.
43
+ Deployed to GitHub Pages at <https://jovandyaz.github.io/cenote/> via
44
+ GitHub Actions on every push to main. Includes auto-generated API
45
+ reference, quickstart, architecture page linking to drawio diagrams, and
46
+ extension tutorials for custom embedders and chunkers.
47
+ - `mkdocs-material`, `mkdocstrings-python` added as dev dependencies.
48
+ - Docs badge added to README. Diagrams link directly to GitHub's native
49
+ drawio renderer (no PNG exports needed).
50
+ - PgVectorStore unit-level test coverage raised from 21% to ≥80% via
51
+ `tests/stores/test_pgvector_helpers.py` (pure helpers, no Postgres
52
+ dependency). Integration tests continue to cover the database layer.
53
+ - Codecov integration: CI uploads `coverage.xml` to Codecov on every CI run
54
+ (pull requests and pushes to main); coverage badge added to README. Requires `CODECOV_TOKEN` secret.
55
+ - Structured logging via `logging.getLogger(__name__)` in every non-trivial
56
+ module under `src/cenote/`. Key events emit at DEBUG; transient failures
57
+ (retries, rate-limit waits) emit at WARNING. No `print()` calls remain.
58
+ - `cenote.types` module with public type aliases (`Vector`, `Namespace`,
59
+ `ModelId`, `ContentHash`). Adopted in public signatures of `Embedder`,
60
+ `EmbeddingCache`, `VectorStore`.
61
+ - `cenote.errors` exception hierarchy: `CenoteError`, `ConfigurationError`,
62
+ `EmbeddingError`, `RateLimitError`, `VectorStoreError`, `DimensionMismatchError`,
63
+ `MigrationError`. Replaces bare `ValueError` raises throughout `src/cenote/`.
64
+ - `cenote.models`: `Document`, `Chunk`, `EmbeddedChunk`, `RetrievalResult`
65
+ Pydantic v2 models with `extra="forbid"`. `Chunk.make_id(doc_id, pos)`
66
+ produces deterministic chunk IDs.
67
+ - `cenote.chunkers`: `Chunker` Protocol (with `chunk.content` contract docstring)
68
+ and `RecursiveCharacterChunker` (priority-list separators, configurable
69
+ `chunk_size=512` and `chunk_overlap=50`, deep-copied metadata, unicode-safe).
70
+ - `cenote.embedders`: `Embedder` Protocol (`model_id`, `dimensions`, async
71
+ `embed`/`embed_query`) and `MockEmbedder` (deterministic unit-norm vectors
72
+ derived from content hash; matches real-embedder distribution to surface
73
+ ranking bugs that raw Gaussian vectors would hide).
74
+ - `cenote.embedders.cache`: `EmbeddingCache` Protocol, `InMemoryCache`
75
+ (dict-backed, copies on `set` to avoid poisoning), and `CachedEmbedder`
76
+ wrapper (slot-array preserves input order; only cache misses hit the
77
+ inner embedder; cache key is `(model_id, content_hash)` so different
78
+ models do not collide).
79
+ - `cenote.embedders.voyage.VoyageEmbedder` and
80
+ `cenote.embedders.cohere.CohereEmbedder`: production-grade multilingual
81
+ embedders over Voyage AI and Cohere v2 REST APIs. Both ship with input
82
+ batching (Voyage ≤128/req, Cohere ≤96/req), concurrency caps via
83
+ `asyncio.Semaphore`, exponential-backoff retries on 429/5xx, and an
84
+ optional sliding-window RPM rate limiter.
85
+ - `cenote.embedders._http`: shared `RateLimiter` (sliding window, lock-coordinated
86
+ across tasks) and `retrying(...)` helper.
87
+ - Runtime dep: `httpx>=0.27`. Dev dep: `respx>=0.21` (HTTP mocking — no real
88
+ API calls in CI).
89
+ - `.env.example`: template for `VOYAGE_API_KEY`, `COHERE_API_KEY`.
90
+ - `cenote.stores`: `VectorStore` Protocol (multi-tenant, `namespace` mandatory
91
+ on every method) and `InMemoryVectorStore` (numpy-backed cosine similarity,
92
+ per-namespace dicts, optional metadata-filter via exact JSONB-style match).
93
+ The production-grade backend (`PgVectorStore`) also ships in this release; see its entry below.
94
+ - Runtime dep: `numpy>=2.0`.
95
+ - `cenote.retrievers`: `Retriever` Protocol and `VectorRetriever` (composes
96
+ any `Embedder` with any `VectorStore`; embeds the query, searches the store,
97
+ normalizes `retriever="vector"` on every `RetrievalResult`).
98
+ - `cenote.stores.PgVectorStore`: production-grade `VectorStore` backed by
99
+ Postgres + pgvector. Hardenings: transactional `upsert`/`delete`,
100
+ idempotent migrations tracking (`cenote_schema_migrations` table),
101
+ exponential-backoff `connect()` retry on container-not-ready races,
102
+ pre-flight dimension validation, configurable HNSW `m` /
103
+ `ef_construction` (migration template) and runtime `ef_search`. Initial
104
+ schema in `001_init.sql` ships a GIN index on `metadata` so `@>` filters
105
+ can use an index scan instead of a sequential scan.
106
+ - `docker-compose.test.yml`: `pgvector/pgvector:pg16` container on port 5433
107
+ for local integration tests.
108
+ - CI: new `integration-tests` job spins up a Postgres service container,
109
+ exports `TEST_DATABASE_URL`, and runs `pytest -m integration`.
110
+ - Runtime dep: `asyncpg>=0.30`. Dev dep: `asyncpg-stubs>=0.31.2` (types).
111
+ - `demos/quickstart.py`: end-to-end demo CLI — indexes a small EN corpus
112
+ (`demos/data/wikipedia_snippets.json`, 20 entries) through chunker +
113
+ embedder + in-memory store, then runs sample queries against
114
+ `VectorRetriever`. `--provider {mock,voyage,cohere}` toggles between the
115
+ no-API mock and the real multilingual embedders. Smoke test
116
+ (`tests/demos/test_quickstart_smoke.py`) runs the mock path on every CI.
117
+ - README: added `## Quickstart` section with the three provider invocations
118
+ and softened the "LATAM-rooted" claim to match what M1.0 actually ships
119
+ (multilingual embedders today; Spanish-specific BM25 + ES eval datasets
120
+ on the M1.1+ roadmap).
121
+ - `cenote.rerankers.Reranker` Protocol (no impl yet — concrete
122
+ `VoyageReranker` / `CohereReranker` ship in M1.1).
123
+ - `cenote.observability`: `Tracer` Protocol + `NoopTracer` default. OTel and
124
+ Langfuse adapters land in M1.1 without breaking the API.
125
+ - `cenote.eval.metrics`: BEIR-style retrieval quality helpers —
126
+ `precision_at_k`, `recall_at_k`, `mean_reciprocal_rank`. DeepEval
127
+ integration arrives in M1.1.
128
+ - Initial project scaffolding: `uv`, `ruff`, `mypy --strict`, `pytest`, `pre-commit`,
129
+ GitHub Actions CI (lint + type + unit tests, Python 3.12 & 3.13, `pip-audit`).
130
+ - `LICENSE` (Apache 2.0), `CHANGELOG.md`, `SECURITY.md`.
131
+ - `py.typed` marker — package ships type information to consumers (PEP 561).
132
+ - `__version__` exposed via `importlib.metadata` (single source of truth).
@@ -0,0 +1,145 @@
1
+ # CLAUDE.md
2
+
3
+ This file is the persistent context for Claude Code sessions working on `cenote`. Read it at the start of every session.
4
+
5
+ ## What this project is
6
+
7
+ `cenote` is a production-grade Python framework for building agentic RAG applications, with first-class support for Spanish-language content and Latin American use cases. It is the **shared core** for two downstream products that live in separate repos:
8
+
9
+ - **knowtis-ai** — RAG and research agent over the Knowtis notes platform
10
+ - **cfdi-agent** — accounting reconciliation + CFDI 4.0 compliance agent for Mexican PYMEs
11
+
12
+ Each downstream product validates the core from opposite ends: knowtis-ai needs creative synthesis, cfdi-agent needs deterministic correctness with audit trails. If the core serves both, it serves most production RAG verticals.
13
+
14
+ This repo (`cenote`) contains the core library only. PyPI publication name is `cenote-core` — aligned with `langchain-core` / `llama-index-core` / `pydantic-core` patterns. Import remains `import cenote` (Pillow/scikit-learn precedent: distribution name and import name need not match). The bare `cenote` PyPI slot is held by an abandoned 2019 project; we do not pursue PEP 541 reclaim.
15
+
16
+ ## Tech stack
17
+
18
+ - **Python**: 3.12+
19
+ - **Package manager**: `uv` (use `uv sync`, `uv add`, `uv run ...`)
20
+ - **Linting & formatting**: `ruff` (replaces black, isort, flake8)
21
+ - **Type checking**: `mypy --strict`
22
+ - **Testing**: `pytest` + `pytest-asyncio` + `pytest-cov`
23
+ - **Data validation**: `pydantic` v2
24
+ - **Async**: default; sync only when wrapping inherently sync libraries
25
+ - **LLM provider**: Anthropic Claude (Sonnet 4.5 default, Opus for high-stakes reasoning)
26
+ - **Agent framework**: LangGraph (for state machines with conditional edges)
27
+ - **Vector store**: pgvector (Postgres extension)
28
+ - **Embeddings**: provider-agnostic `Embedder` protocol; concrete multilingual impls for Voyage AI and Cohere, plus `MockEmbedder` for tests
29
+ - **Observability**: Langfuse (self-hostable)
30
+ - **Evaluation**: DeepEval + custom metrics
31
+
32
+ ## Project structure
33
+
34
+ ```
35
+ cenote/ (repo root; clones made before the `pycenote` → `cenote` rename may still be named `pycenote/` on disk)
36
+ ├── src/cenote/
37
+ │ ├── chunkers/ # Text/markdown splitting
38
+ │ ├── embedders/ # Embedding providers (protocol + impls)
39
+ │ ├── stores/ # Vector stores (protocol + impls)
40
+ │ ├── retrievers/ # Retrieval strategies (vector today; BM25 and hybrid are future)
41
+ │ ├── rerankers/ # Reranker protocol (concrete impls are future)
42
+ │ ├── llm/ # (future) LLM client abstractions
43
+ │ ├── agents/ # (future) Agent primitives over LangGraph
44
+ │ ├── eval/ # Retrieval metrics (full eval harness is future)
45
+ │ ├── observability/ # Tracer protocol + NoopTracer (adapters are future)
46
+ │ ├── models.py # Pydantic models (Document, Chunk, etc.)
47
+ │ └── types.py # Shared type aliases
48
+ ├── tests/ # Mirror src/cenote/ structure
49
+ ├── docs/ # Milestone briefs and design docs
50
+ └── pyproject.toml
51
+ ```
52
+
53
+ ## Conventions
54
+
55
+ ### Code style
56
+
57
+ - Type hints on everything public. `mypy --strict` must pass.
58
+ - Pydantic models for any data crossing module boundaries.
59
+ - Prefer `Protocol` over `ABC` for interfaces — duck typing + better composition.
60
+ - One class per concept per file. Avoid mega-modules.
61
+ - Async by default. Sync versions only where retrieval libraries force it.
62
+
63
+ ### License headers (SPDX)
64
+
65
+ Every `.py` file in `src/` starts with:
66
+
67
+ ```python
68
+ # SPDX-License-Identifier: Apache-2.0
69
+ ```
70
+
71
+ It must be the **first non-empty line** (before the module docstring). Test files are exempt. Lint enforcement via `reuse-tool` is M1.1+; for now it's convention.
72
+
73
+ ### Naming
74
+
75
+ - Files and modules: `snake_case`
76
+ - Classes: `PascalCase`
77
+ - Protocols: just use the bare noun (`Chunker`, `Embedder`). Suffix with `Protocol` only when there's ambiguity with a concrete impl in the same module.
78
+ - Spanish identifiers are OK in domain-specific downstream code (`cfdi`, `rfc`, `iva`), never in this core repo.
79
+
80
+ ### Tests
81
+
82
+ - One test file per source file. Mirror the path: `src/cenote/chunkers/recursive.py` ↔ `tests/chunkers/test_recursive.py`.
83
+ - Shared setup goes in `tests/conftest.py` or per-directory `conftest.py`.
84
+ - Integration tests (those requiring Postgres) go in `tests/integration/` and are marked `@pytest.mark.integration`.
85
+ - Aim for >80% coverage on `src/cenote/`.
86
+
87
+ ### Commits
88
+
89
+ - Conventional commits: `feat:`, `fix:`, `chore:`, `docs:`, `test:`, `refactor:`
90
+ - Keep PRs focused. One sub-deliverable per PR (see `docs/00-first-milestone.md` for suggested breakdown).
91
+ - Branch naming: `feat/<short-desc>`, `fix/<short-desc>`, etc.
92
+
93
+ ### Documentation
94
+
95
+ - Public functions/classes need docstrings (Google style).
96
+ - Module-level docstring explains the module's purpose in 1–2 lines.
97
+ - Examples in docstrings should be runnable.
98
+
99
+ ## Commands
100
+
101
+ ```bash
102
+ # Setup
103
+ uv sync # install all deps (incl. dev)
104
+
105
+ # Development
106
+ uv run pytest # all tests
107
+ uv run pytest tests/chunkers/ # subset
108
+ uv run pytest -m "not integration" # skip integration tests
109
+ uv run pytest --cov=cenote # with coverage
110
+ uv run ruff check . # lint
111
+ uv run ruff format . # format
112
+ uv run mypy src/ # type check
113
+ uv run pre-commit run --all-files # all checks
114
+
115
+ # Adding deps
116
+ uv add <package> # runtime dep
117
+ uv add --dev <package> # dev dep
118
+
119
+ # Build
120
+ uv build # build wheel + sdist into dist/
121
+ ```
122
+
123
+ ## What to NOT do
124
+
125
+ - **Do not pull in LangChain** as a dependency. We use LangGraph (a focused subset) only. The LangChain mega-package is not wanted here.
126
+ - **Do not add further concrete embedder implementations without proposing them first**. `VoyageEmbedder` and `CohereEmbedder` already ship; the `Embedder` protocol must stay stable for them and for downstream implementations.
127
+ - **Do not bake assumptions about Spanish-only**. The framework is LATAM-aware but multilingual. Default tokenizer/chunker choices must not be English-only.
128
+ - **Do not add framework-specific code** (FastAPI, Django, Flask, etc.) to `cenote` core. Those belong in downstream services that depend on `cenote`.
129
+ - **Do not commit secrets**. Use `.env` (gitignored) and `pydantic-settings`.
130
+ - **Do not skip type hints or tests** for "I'll add them later". Add them in the same PR or don't add the code.
131
+ - **Do not bypass the namespace parameter** on `VectorStore` / `Retriever` interfaces. Multi-tenancy is enforced at the protocol level.
132
+ - **Do not edit migrations** once committed. Add a new migration instead.
133
+
134
+ ## Current focus
135
+
136
+ See `docs/00-first-milestone.md` for the active milestone scope. As of project start, we are building **M1.0 — Core Primitives**: chunkers, embedders, stores, retrievers.
137
+
138
+ Do NOT start work on agents, or extend eval and observability beyond the existing metric helpers and `Tracer` protocol, yet. Those depend on the primitives being stable.
139
+
140
+ ## When in doubt
141
+
142
+ - Check `docs/` for design rationales
143
+ - Check existing implementations in the same module for patterns
144
+ - If introducing a new top-level dependency, propose it first (open a discussion or ask the maintainer)
145
+ - The maintainer is Jovan Díaz ([@jovandyaz](https://github.com/jovandyaz))