our-embeddings 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- our_embeddings-0.1.0/.github/workflows/ci.yml +24 -0
- our_embeddings-0.1.0/.github/workflows/release.yml +13 -0
- our_embeddings-0.1.0/.gitignore +38 -0
- our_embeddings-0.1.0/.pre-commit-config.yaml +33 -0
- our_embeddings-0.1.0/CHANGELOG.md +11 -0
- our_embeddings-0.1.0/LICENSE +21 -0
- our_embeddings-0.1.0/Makefile +38 -0
- our_embeddings-0.1.0/PKG-INFO +211 -0
- our_embeddings-0.1.0/README.md +180 -0
- our_embeddings-0.1.0/pyproject.toml +95 -0
- our_embeddings-0.1.0/src/our_embeddings/__init__.py +54 -0
- our_embeddings-0.1.0/src/our_embeddings/config.py +47 -0
- our_embeddings-0.1.0/src/our_embeddings/exceptions.py +18 -0
- our_embeddings-0.1.0/src/our_embeddings/federation.py +355 -0
- our_embeddings-0.1.0/src/our_embeddings/providers/__init__.py +36 -0
- our_embeddings-0.1.0/src/our_embeddings/providers/local.py +198 -0
- our_embeddings-0.1.0/src/our_embeddings/py.typed +0 -0
- our_embeddings-0.1.0/src/our_embeddings/registry.py +133 -0
- our_embeddings-0.1.0/src/our_embeddings/service.py +374 -0
- our_embeddings-0.1.0/tests/conftest.py +55 -0
- our_embeddings-0.1.0/tests/test_federation.py +382 -0
- our_embeddings-0.1.0/tests/test_local_provider.py +333 -0
- our_embeddings-0.1.0/tests/test_provider.py +152 -0
- our_embeddings-0.1.0/tests/test_registry.py +275 -0
- our_embeddings-0.1.0/tests/test_service.py +422 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
pull_request:
|
|
5
|
+
push:
|
|
6
|
+
branches: [main]
|
|
7
|
+
paths-ignore:
|
|
8
|
+
- '*.md'
|
|
9
|
+
- 'docs/**'
|
|
10
|
+
|
|
11
|
+
concurrency:
|
|
12
|
+
group: ci-${{ github.ref }}
|
|
13
|
+
cancel-in-progress: true
|
|
14
|
+
|
|
15
|
+
jobs:
|
|
16
|
+
lint:
|
|
17
|
+
uses: ourochronos/our-infra/.github/workflows/lint.yml@main
|
|
18
|
+
with:
|
|
19
|
+
extra-install: "git+https://github.com/ourochronos/our-db.git"
|
|
20
|
+
test:
|
|
21
|
+
needs: lint
|
|
22
|
+
uses: ourochronos/our-infra/.github/workflows/test.yml@main
|
|
23
|
+
with:
|
|
24
|
+
extra-install: "git+https://github.com/ourochronos/our-db.git"
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
*.egg-info/
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
*.egg
|
|
10
|
+
|
|
11
|
+
# Virtual environments
|
|
12
|
+
.venv/
|
|
13
|
+
venv/
|
|
14
|
+
env/
|
|
15
|
+
|
|
16
|
+
# Testing
|
|
17
|
+
.pytest_cache/
|
|
18
|
+
htmlcov/
|
|
19
|
+
.coverage
|
|
20
|
+
.coverage.*
|
|
21
|
+
coverage.xml
|
|
22
|
+
|
|
23
|
+
# Type checking
|
|
24
|
+
.mypy_cache/
|
|
25
|
+
|
|
26
|
+
# Linting
|
|
27
|
+
.ruff_cache/
|
|
28
|
+
|
|
29
|
+
# IDE
|
|
30
|
+
.idea/
|
|
31
|
+
.vscode/
|
|
32
|
+
*.swp
|
|
33
|
+
*.swo
|
|
34
|
+
*~
|
|
35
|
+
|
|
36
|
+
# OS
|
|
37
|
+
.DS_Store
|
|
38
|
+
Thumbs.db
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
3
|
+
rev: v4.6.0
|
|
4
|
+
hooks:
|
|
5
|
+
- id: trailing-whitespace
|
|
6
|
+
- id: end-of-file-fixer
|
|
7
|
+
- id: check-yaml
|
|
8
|
+
- id: check-added-large-files
|
|
9
|
+
args: ['--maxkb=500']
|
|
10
|
+
- id: check-merge-conflict
|
|
11
|
+
|
|
12
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
13
|
+
rev: v0.4.4
|
|
14
|
+
hooks:
|
|
15
|
+
- id: ruff
|
|
16
|
+
args: [--fix]
|
|
17
|
+
- id: ruff-format
|
|
18
|
+
|
|
19
|
+
- repo: https://github.com/pre-commit/mirrors-mypy
|
|
20
|
+
rev: v1.10.0
|
|
21
|
+
hooks:
|
|
22
|
+
- id: mypy
|
|
23
|
+
additional_dependencies: []
|
|
24
|
+
args: [--config-file=pyproject.toml]
|
|
25
|
+
stages: [commit]
|
|
26
|
+
language: system
|
|
27
|
+
|
|
28
|
+
- repo: https://github.com/PyCQA/bandit
|
|
29
|
+
rev: 1.7.8
|
|
30
|
+
hooks:
|
|
31
|
+
- id: bandit
|
|
32
|
+
args: ['-q', '--severity-level', 'medium']
|
|
33
|
+
exclude: tests/
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
- Initial project structure
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Chris Jacobs
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
.PHONY: help install dev lint format test test-unit test-int test-cov clean
|
|
2
|
+
|
|
3
|
+
help: ## Show this help
|
|
4
|
+
@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m %s\n", $$1, $$2}'
|
|
5
|
+
|
|
6
|
+
install: ## Install package
|
|
7
|
+
pip install -e .
|
|
8
|
+
|
|
9
|
+
dev: ## Install package with dev dependencies
|
|
10
|
+
pip install -e ".[dev]"
|
|
11
|
+
pre-commit install
|
|
12
|
+
|
|
13
|
+
lint: ## Run linters (ruff + mypy)
|
|
14
|
+
ruff check src/ tests/
|
|
15
|
+
ruff format --check src/ tests/
|
|
16
|
+
mypy src/
|
|
17
|
+
|
|
18
|
+
format: ## Auto-format code
|
|
19
|
+
ruff check --fix src/ tests/
|
|
20
|
+
ruff format src/ tests/
|
|
21
|
+
|
|
22
|
+
test: test-unit ## Run tests (unit only by default)
|
|
23
|
+
|
|
24
|
+
test-unit: ## Run unit tests
|
|
25
|
+
pytest tests/ -m "not integration and not slow" -v
|
|
26
|
+
|
|
27
|
+
test-int: ## Run integration tests
|
|
28
|
+
pytest tests/ -m integration -v --timeout=60
|
|
29
|
+
|
|
30
|
+
test-all: ## Run all tests
|
|
31
|
+
pytest tests/ -v --timeout=120
|
|
32
|
+
|
|
33
|
+
test-cov: ## Run tests with coverage report
|
|
34
|
+
pytest tests/ -m "not integration and not slow" --cov --cov-report=term-missing --cov-report=html
|
|
35
|
+
|
|
36
|
+
clean: ## Remove build artifacts and caches
|
|
37
|
+
rm -rf build/ dist/ *.egg-info .pytest_cache .mypy_cache .ruff_cache htmlcov .coverage
|
|
38
|
+
find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: our-embeddings
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Vector embedding generation and similarity search
|
|
5
|
+
Project-URL: Homepage, https://github.com/ourochronos/our-embeddings
|
|
6
|
+
Project-URL: Repository, https://github.com/ourochronos/our-embeddings
|
|
7
|
+
Author: Chris Jacobs
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Requires-Python: >=3.11
|
|
16
|
+
Requires-Dist: openai>=1.0
|
|
17
|
+
Requires-Dist: our-db>=0.1.0
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
20
|
+
Requires-Dist: numpy>=1.24; extra == 'dev'
|
|
21
|
+
Requires-Dist: pre-commit>=3.7; extra == 'dev'
|
|
22
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
23
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
24
|
+
Requires-Dist: pytest-mock>=3.12; extra == 'dev'
|
|
25
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
26
|
+
Requires-Dist: ruff>=0.4; extra == 'dev'
|
|
27
|
+
Provides-Extra: local
|
|
28
|
+
Requires-Dist: numpy>=1.24; extra == 'local'
|
|
29
|
+
Requires-Dist: sentence-transformers>=2.2.0; extra == 'local'
|
|
30
|
+
Description-Content-Type: text/markdown
|
|
31
|
+
|
|
32
|
+
# our-embeddings
|
|
33
|
+
|
|
34
|
+
Vector embedding generation and similarity search for the ourochronos ecosystem.
|
|
35
|
+
|
|
36
|
+
## Overview
|
|
37
|
+
|
|
38
|
+
our-embeddings provides a unified interface for generating and searching vector embeddings. It supports both local (sentence-transformers) and OpenAI providers, with a federation standard for cross-node embedding compatibility.
|
|
39
|
+
|
|
40
|
+
Default model: **BAAI/bge-small-en-v1.5** (384 dimensions, L2-normalized).
|
|
41
|
+
|
|
42
|
+
## Install
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install our-embeddings
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
For local embeddings (default, no API key needed):
|
|
49
|
+
```bash
|
|
50
|
+
pip install "our-embeddings[local]" # includes sentence-transformers
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Usage
|
|
54
|
+
|
|
55
|
+
### Generate Embeddings
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from our_embeddings.service import generate_embedding, vector_to_pgvector
|
|
59
|
+
|
|
60
|
+
# Generate a 384-dim embedding vector
|
|
61
|
+
vector = generate_embedding("PostgreSQL is excellent for JSONB queries")
|
|
62
|
+
|
|
63
|
+
# Convert to pgvector format for storage
|
|
64
|
+
pg_str = vector_to_pgvector(vector)
|
|
65
|
+
# → "[0.0231,0.0891,...]"
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Search Similar Content
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
from our_embeddings import search_similar
|
|
72
|
+
|
|
73
|
+
results = search_similar(
|
|
74
|
+
query="database performance",
|
|
75
|
+
content_type="belief",
|
|
76
|
+
limit=10,
|
|
77
|
+
min_similarity=0.5,
|
|
78
|
+
)
|
|
79
|
+
# Returns list of dicts with id, content, similarity score
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Embed and Store
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from our_embeddings import embed_content
|
|
86
|
+
|
|
87
|
+
result = embed_content(
|
|
88
|
+
content_type="belief",
|
|
89
|
+
content_id="uuid-here",
|
|
90
|
+
text="Valence uses dimensional confidence",
|
|
91
|
+
)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Batch Operations
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
from our_embeddings.providers.local import generate_embeddings_batch
|
|
98
|
+
|
|
99
|
+
vectors = generate_embeddings_batch(
|
|
100
|
+
["text one", "text two", "text three"],
|
|
101
|
+
batch_size=32,
|
|
102
|
+
)
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Backfill Missing Embeddings
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from our_embeddings import backfill_embeddings
|
|
109
|
+
|
|
110
|
+
count = backfill_embeddings(content_type="belief", batch_size=100)
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Configuration
|
|
114
|
+
|
|
115
|
+
### EmbeddingConfig
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from our_embeddings.config import EmbeddingConfig
|
|
119
|
+
|
|
120
|
+
config = EmbeddingConfig.from_env()
|
|
121
|
+
# Fields:
|
|
122
|
+
# embedding_provider: str = "local"
|
|
123
|
+
# embedding_model_path: str = "BAAI/bge-small-en-v1.5"
|
|
124
|
+
# embedding_device: str = "cpu"
|
|
125
|
+
# openai_api_key: str = ""
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Environment Variables
|
|
129
|
+
|
|
130
|
+
| Variable | Default | Description |
|
|
131
|
+
|----------|---------|-------------|
|
|
132
|
+
| `VALENCE_EMBEDDING_PROVIDER` | `local` | `"local"` or `"openai"` |
|
|
133
|
+
| `VALENCE_EMBEDDING_MODEL_PATH` | `BAAI/bge-small-en-v1.5` | Model name or path |
|
|
134
|
+
| `VALENCE_EMBEDDING_DEVICE` | `cpu` | `"cpu"` or `"cuda"` |
|
|
135
|
+
| `OPENAI_API_KEY` | — | Required if provider is `openai` |
|
|
136
|
+
|
|
137
|
+
## Providers
|
|
138
|
+
|
|
139
|
+
### Local (default)
|
|
140
|
+
|
|
141
|
+
Uses sentence-transformers with BAAI/bge-small-en-v1.5:
|
|
142
|
+
- 384 dimensions, L2-normalized
|
|
143
|
+
- No API key required
|
|
144
|
+
- Model loaded lazily and cached as singleton
|
|
145
|
+
- Thread-safe initialization
|
|
146
|
+
|
|
147
|
+
### OpenAI
|
|
148
|
+
|
|
149
|
+
Uses OpenAI text-embedding-3-small:
|
|
150
|
+
- 1536 dimensions
|
|
151
|
+
- Requires `OPENAI_API_KEY`
|
|
152
|
+
- Text truncated to 8000 chars
|
|
153
|
+
|
|
154
|
+
## Embedding Type Registry
|
|
155
|
+
|
|
156
|
+
Register and manage multiple embedding types:
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
from our_embeddings import register_embedding_type, list_embedding_types
|
|
160
|
+
|
|
161
|
+
register_embedding_type(
|
|
162
|
+
type_id="local_bge_small",
|
|
163
|
+
provider="local",
|
|
164
|
+
model="BAAI/bge-small-en-v1.5",
|
|
165
|
+
dimensions=384,
|
|
166
|
+
is_default=True,
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
types = list_embedding_types(status="active")
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
## Federation Standard
|
|
173
|
+
|
|
174
|
+
Cross-node embedding compatibility for federated knowledge sharing:
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
from our_embeddings import get_federation_standard, validate_federation_embedding
|
|
178
|
+
|
|
179
|
+
standard = get_federation_standard()
|
|
180
|
+
# → {"model": "BAAI/bge-small-en-v1.5", "dimensions": 384,
|
|
181
|
+
# "type": "bge_small_en_v15", "normalization": "L2", "version": "1.0"}
|
|
182
|
+
|
|
183
|
+
valid, error = validate_federation_embedding([0.1, 0.2, ...])
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
Federation functions for belief exchange:
|
|
187
|
+
- `prepare_belief_for_federation(belief_id)` — Package belief with embedding
|
|
188
|
+
- `validate_incoming_belief_embedding(data)` — Validate received embeddings
|
|
189
|
+
- `regenerate_embedding_if_needed(data)` — Re-embed if format differs
|
|
190
|
+
|
|
191
|
+
## State Ownership
|
|
192
|
+
|
|
193
|
+
Owns the `embedding_types` and `embedding_coverage` tables in the valence schema. Reads/writes the `embedding` column on `beliefs`, `vkb_exchanges`, and `vkb_patterns` tables.
|
|
194
|
+
|
|
195
|
+
## Development
|
|
196
|
+
|
|
197
|
+
```bash
|
|
198
|
+
make dev # Install with dev dependencies
|
|
199
|
+
make lint # Run linters
|
|
200
|
+
make test # Run tests
|
|
201
|
+
make test-cov # Tests with coverage
|
|
202
|
+
make format # Auto-format
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
## Part of Valence
|
|
206
|
+
|
|
207
|
+
This brick is part of the [Valence](https://github.com/ourochronos/valence) knowledge substrate. See [our-infra](https://github.com/ourochronos/our-infra) for ourochronos conventions.
|
|
208
|
+
|
|
209
|
+
## License
|
|
210
|
+
|
|
211
|
+
MIT
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
# our-embeddings
|
|
2
|
+
|
|
3
|
+
Vector embedding generation and similarity search for the ourochronos ecosystem.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
our-embeddings provides a unified interface for generating and searching vector embeddings. It supports both local (sentence-transformers) and OpenAI providers, with a federation standard for cross-node embedding compatibility.
|
|
8
|
+
|
|
9
|
+
Default model: **BAAI/bge-small-en-v1.5** (384 dimensions, L2-normalized).
|
|
10
|
+
|
|
11
|
+
## Install
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install our-embeddings
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
For local embeddings (default, no API key needed):
|
|
18
|
+
```bash
|
|
19
|
+
pip install "our-embeddings[local]" # includes sentence-transformers
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## Usage
|
|
23
|
+
|
|
24
|
+
### Generate Embeddings
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from our_embeddings.service import generate_embedding, vector_to_pgvector
|
|
28
|
+
|
|
29
|
+
# Generate a 384-dim embedding vector
|
|
30
|
+
vector = generate_embedding("PostgreSQL is excellent for JSONB queries")
|
|
31
|
+
|
|
32
|
+
# Convert to pgvector format for storage
|
|
33
|
+
pg_str = vector_to_pgvector(vector)
|
|
34
|
+
# → "[0.0231,0.0891,...]"
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### Search Similar Content
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from our_embeddings import search_similar
|
|
41
|
+
|
|
42
|
+
results = search_similar(
|
|
43
|
+
query="database performance",
|
|
44
|
+
content_type="belief",
|
|
45
|
+
limit=10,
|
|
46
|
+
min_similarity=0.5,
|
|
47
|
+
)
|
|
48
|
+
# Returns list of dicts with id, content, similarity score
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Embed and Store
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from our_embeddings import embed_content
|
|
55
|
+
|
|
56
|
+
result = embed_content(
|
|
57
|
+
content_type="belief",
|
|
58
|
+
content_id="uuid-here",
|
|
59
|
+
text="Valence uses dimensional confidence",
|
|
60
|
+
)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
### Batch Operations
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from our_embeddings.providers.local import generate_embeddings_batch
|
|
67
|
+
|
|
68
|
+
vectors = generate_embeddings_batch(
|
|
69
|
+
["text one", "text two", "text three"],
|
|
70
|
+
batch_size=32,
|
|
71
|
+
)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Backfill Missing Embeddings
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from our_embeddings import backfill_embeddings
|
|
78
|
+
|
|
79
|
+
count = backfill_embeddings(content_type="belief", batch_size=100)
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
## Configuration
|
|
83
|
+
|
|
84
|
+
### EmbeddingConfig
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
from our_embeddings.config import EmbeddingConfig
|
|
88
|
+
|
|
89
|
+
config = EmbeddingConfig.from_env()
|
|
90
|
+
# Fields:
|
|
91
|
+
# embedding_provider: str = "local"
|
|
92
|
+
# embedding_model_path: str = "BAAI/bge-small-en-v1.5"
|
|
93
|
+
# embedding_device: str = "cpu"
|
|
94
|
+
# openai_api_key: str = ""
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Environment Variables
|
|
98
|
+
|
|
99
|
+
| Variable | Default | Description |
|
|
100
|
+
|----------|---------|-------------|
|
|
101
|
+
| `VALENCE_EMBEDDING_PROVIDER` | `local` | `"local"` or `"openai"` |
|
|
102
|
+
| `VALENCE_EMBEDDING_MODEL_PATH` | `BAAI/bge-small-en-v1.5` | Model name or path |
|
|
103
|
+
| `VALENCE_EMBEDDING_DEVICE` | `cpu` | `"cpu"` or `"cuda"` |
|
|
104
|
+
| `OPENAI_API_KEY` | — | Required if provider is `openai` |
|
|
105
|
+
|
|
106
|
+
## Providers
|
|
107
|
+
|
|
108
|
+
### Local (default)
|
|
109
|
+
|
|
110
|
+
Uses sentence-transformers with BAAI/bge-small-en-v1.5:
|
|
111
|
+
- 384 dimensions, L2-normalized
|
|
112
|
+
- No API key required
|
|
113
|
+
- Model loaded lazily and cached as singleton
|
|
114
|
+
- Thread-safe initialization
|
|
115
|
+
|
|
116
|
+
### OpenAI
|
|
117
|
+
|
|
118
|
+
Uses OpenAI text-embedding-3-small:
|
|
119
|
+
- 1536 dimensions
|
|
120
|
+
- Requires `OPENAI_API_KEY`
|
|
121
|
+
- Text truncated to 8000 chars
|
|
122
|
+
|
|
123
|
+
## Embedding Type Registry
|
|
124
|
+
|
|
125
|
+
Register and manage multiple embedding types:
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from our_embeddings import register_embedding_type, list_embedding_types
|
|
129
|
+
|
|
130
|
+
register_embedding_type(
|
|
131
|
+
type_id="local_bge_small",
|
|
132
|
+
provider="local",
|
|
133
|
+
model="BAAI/bge-small-en-v1.5",
|
|
134
|
+
dimensions=384,
|
|
135
|
+
is_default=True,
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
types = list_embedding_types(status="active")
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## Federation Standard
|
|
142
|
+
|
|
143
|
+
Cross-node embedding compatibility for federated knowledge sharing:
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
from our_embeddings import get_federation_standard, validate_federation_embedding
|
|
147
|
+
|
|
148
|
+
standard = get_federation_standard()
|
|
149
|
+
# → {"model": "BAAI/bge-small-en-v1.5", "dimensions": 384,
|
|
150
|
+
# "type": "bge_small_en_v15", "normalization": "L2", "version": "1.0"}
|
|
151
|
+
|
|
152
|
+
valid, error = validate_federation_embedding([0.1, 0.2, ...])
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
Federation functions for belief exchange:
|
|
156
|
+
- `prepare_belief_for_federation(belief_id)` — Package belief with embedding
|
|
157
|
+
- `validate_incoming_belief_embedding(data)` — Validate received embeddings
|
|
158
|
+
- `regenerate_embedding_if_needed(data)` — Re-embed if format differs
|
|
159
|
+
|
|
160
|
+
## State Ownership
|
|
161
|
+
|
|
162
|
+
Owns the `embedding_types` and `embedding_coverage` tables in the valence schema. Reads/writes the `embedding` column on `beliefs`, `vkb_exchanges`, and `vkb_patterns` tables.
|
|
163
|
+
|
|
164
|
+
## Development
|
|
165
|
+
|
|
166
|
+
```bash
|
|
167
|
+
make dev # Install with dev dependencies
|
|
168
|
+
make lint # Run linters
|
|
169
|
+
make test # Run tests
|
|
170
|
+
make test-cov # Tests with coverage
|
|
171
|
+
make format # Auto-format
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
## Part of Valence
|
|
175
|
+
|
|
176
|
+
This brick is part of the [Valence](https://github.com/ourochronos/valence) knowledge substrate. See [our-infra](https://github.com/ourochronos/our-infra) for ourochronos conventions.
|
|
177
|
+
|
|
178
|
+
## License
|
|
179
|
+
|
|
180
|
+
MIT
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "our-embeddings"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Vector embedding generation and similarity search"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.11"
|
|
12
|
+
authors = [
|
|
13
|
+
{ name = "Chris Jacobs" },
|
|
14
|
+
]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 3 - Alpha",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
"License :: OSI Approved :: MIT License",
|
|
21
|
+
]
|
|
22
|
+
dependencies = [
|
|
23
|
+
"our-db>=0.1.0",
|
|
24
|
+
"openai>=1.0",
|
|
25
|
+
]
|
|
26
|
+
|
|
27
|
+
[project.urls]
|
|
28
|
+
Homepage = "https://github.com/ourochronos/our-embeddings"
|
|
29
|
+
Repository = "https://github.com/ourochronos/our-embeddings"
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
local = [
|
|
33
|
+
"sentence-transformers>=2.2.0",
|
|
34
|
+
"numpy>=1.24",
|
|
35
|
+
]
|
|
36
|
+
dev = [
|
|
37
|
+
"pytest>=8.0",
|
|
38
|
+
"pytest-asyncio>=0.23",
|
|
39
|
+
"pytest-cov>=4.0",
|
|
40
|
+
"pytest-mock>=3.12",
|
|
41
|
+
"ruff>=0.4",
|
|
42
|
+
"mypy>=1.10",
|
|
43
|
+
"pre-commit>=3.7",
|
|
44
|
+
"numpy>=1.24",
|
|
45
|
+
]
|
|
46
|
+
|
|
47
|
+
[tool.hatch.build.targets.wheel]
|
|
48
|
+
packages = ["src/our_embeddings"]
|
|
49
|
+
|
|
50
|
+
[tool.ruff]
|
|
51
|
+
target-version = "py311"
|
|
52
|
+
line-length = 120
|
|
53
|
+
src = ["src", "tests"]
|
|
54
|
+
|
|
55
|
+
[tool.ruff.lint]
|
|
56
|
+
select = ["E", "F", "I", "N", "W", "UP", "B", "C4"]
|
|
57
|
+
|
|
58
|
+
[tool.ruff.lint.isort]
|
|
59
|
+
known-first-party = ["our_embeddings"]
|
|
60
|
+
|
|
61
|
+
[tool.mypy]
|
|
62
|
+
python_version = "3.11"
|
|
63
|
+
disallow_untyped_defs = true
|
|
64
|
+
strict_optional = true
|
|
65
|
+
warn_redundant_casts = true
|
|
66
|
+
warn_unused_ignores = true
|
|
67
|
+
|
|
68
|
+
[[tool.mypy.overrides]]
|
|
69
|
+
module = [
|
|
70
|
+
"our_db",
|
|
71
|
+
"our_db.*",
|
|
72
|
+
"openai",
|
|
73
|
+
"openai.*",
|
|
74
|
+
"sentence_transformers",
|
|
75
|
+
"sentence_transformers.*",
|
|
76
|
+
"numpy",
|
|
77
|
+
"numpy.*",
|
|
78
|
+
]
|
|
79
|
+
ignore_missing_imports = true
|
|
80
|
+
|
|
81
|
+
[tool.pytest.ini_options]
|
|
82
|
+
testpaths = ["tests"]
|
|
83
|
+
asyncio_mode = "auto"
|
|
84
|
+
markers = [
|
|
85
|
+
"unit: Unit tests (no external dependencies)",
|
|
86
|
+
"integration: Integration tests (require external services)",
|
|
87
|
+
"slow: Slow tests (>5s)",
|
|
88
|
+
]
|
|
89
|
+
|
|
90
|
+
[tool.coverage.run]
|
|
91
|
+
branch = true
|
|
92
|
+
source = ["src/our_embeddings"]
|
|
93
|
+
|
|
94
|
+
[tool.coverage.report]
|
|
95
|
+
show_missing = true
|