our-embeddings 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,24 @@
1
+ name: CI
2
+
3
+ on:
4
+ pull_request:
5
+ push:
6
+ branches: [main]
7
+ paths-ignore:
8
+ - '*.md'
9
+ - 'docs/**'
10
+
11
+ concurrency:
12
+ group: ci-${{ github.ref }}
13
+ cancel-in-progress: true
14
+
15
+ jobs:
16
+ lint:
17
+ uses: ourochronos/our-infra/.github/workflows/lint.yml@main
18
+ with:
19
+ extra-install: "git+https://github.com/ourochronos/our-db.git"
20
+ test:
21
+ needs: lint
22
+ uses: ourochronos/our-infra/.github/workflows/test.yml@main
23
+ with:
24
+ extra-install: "git+https://github.com/ourochronos/our-db.git"
@@ -0,0 +1,13 @@
1
+ name: Release
2
+
3
+ on:
4
+ push:
5
+ tags: ['v*']
6
+
7
+ permissions:
8
+ contents: write
9
+
10
+ jobs:
11
+ release:
12
+ uses: ourochronos/our-infra/.github/workflows/release.yml@main
13
+ secrets: inherit
@@ -0,0 +1,38 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ *.egg-info/
7
+ dist/
8
+ build/
9
+ *.egg
10
+
11
+ # Virtual environments
12
+ .venv/
13
+ venv/
14
+ env/
15
+
16
+ # Testing
17
+ .pytest_cache/
18
+ htmlcov/
19
+ .coverage
20
+ .coverage.*
21
+ coverage.xml
22
+
23
+ # Type checking
24
+ .mypy_cache/
25
+
26
+ # Linting
27
+ .ruff_cache/
28
+
29
+ # IDE
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ *.swo
34
+ *~
35
+
36
+ # OS
37
+ .DS_Store
38
+ Thumbs.db
@@ -0,0 +1,33 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v4.6.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ - id: end-of-file-fixer
7
+ - id: check-yaml
8
+ - id: check-added-large-files
9
+ args: ['--maxkb=500']
10
+ - id: check-merge-conflict
11
+
12
+ - repo: https://github.com/astral-sh/ruff-pre-commit
13
+ rev: v0.4.4
14
+ hooks:
15
+ - id: ruff
16
+ args: [--fix]
17
+ - id: ruff-format
18
+
19
+ - repo: https://github.com/pre-commit/mirrors-mypy
20
+ rev: v1.10.0
21
+ hooks:
22
+ - id: mypy
23
+ additional_dependencies: []
24
+ args: [--config-file=pyproject.toml]
25
+ stages: [pre-commit]
26
+ language: system
27
+
28
+ - repo: https://github.com/PyCQA/bandit
29
+ rev: 1.7.8
30
+ hooks:
31
+ - id: bandit
32
+ args: ['-q', '--severity-level', 'medium']
33
+ exclude: tests/
@@ -0,0 +1,11 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ### Added
11
+ - Initial project structure
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Chris Jacobs
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,38 @@
1
+ .PHONY: help install dev lint format test test-unit test-int test-all test-cov clean
2
+
3
+ help: ## Show this help
4
+ @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-15s\033[0m %s\n", $$1, $$2}'
5
+
6
+ install: ## Install package
7
+ pip install -e .
8
+
9
+ dev: ## Install package with dev dependencies
10
+ pip install -e ".[dev]"
11
+ pre-commit install
12
+
13
+ lint: ## Run linters (ruff + mypy)
14
+ ruff check src/ tests/
15
+ ruff format --check src/ tests/
16
+ mypy src/
17
+
18
+ format: ## Auto-format code
19
+ ruff check --fix src/ tests/
20
+ ruff format src/ tests/
21
+
22
+ test: test-unit ## Run tests (unit only by default)
23
+
24
+ test-unit: ## Run unit tests
25
+ pytest tests/ -m "not integration and not slow" -v
26
+
27
+ test-int: ## Run integration tests
28
+ pytest tests/ -m integration -v --timeout=60
29
+
30
+ test-all: ## Run all tests
31
+ pytest tests/ -v --timeout=120
32
+
33
+ test-cov: ## Run tests with coverage report
34
+ pytest tests/ -m "not integration and not slow" --cov --cov-report=term-missing --cov-report=html
35
+
36
+ clean: ## Remove build artifacts and caches
37
+ rm -rf build/ dist/ *.egg-info .pytest_cache .mypy_cache .ruff_cache htmlcov .coverage
38
+ find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true
@@ -0,0 +1,211 @@
1
+ Metadata-Version: 2.4
2
+ Name: our-embeddings
3
+ Version: 0.1.0
4
+ Summary: Vector embedding generation and similarity search
5
+ Project-URL: Homepage, https://github.com/ourochronos/our-embeddings
6
+ Project-URL: Repository, https://github.com/ourochronos/our-embeddings
7
+ Author: Chris Jacobs
8
+ License-Expression: MIT
9
+ License-File: LICENSE
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Requires-Python: >=3.11
16
+ Requires-Dist: openai>=1.0
17
+ Requires-Dist: our-db>=0.1.0
18
+ Provides-Extra: dev
19
+ Requires-Dist: mypy>=1.10; extra == 'dev'
20
+ Requires-Dist: numpy>=1.24; extra == 'dev'
21
+ Requires-Dist: pre-commit>=3.7; extra == 'dev'
22
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
23
+ Requires-Dist: pytest-cov>=4.0; extra == 'dev'
24
+ Requires-Dist: pytest-mock>=3.12; extra == 'dev'
25
+ Requires-Dist: pytest>=8.0; extra == 'dev'
26
+ Requires-Dist: ruff>=0.4; extra == 'dev'
27
+ Provides-Extra: local
28
+ Requires-Dist: numpy>=1.24; extra == 'local'
29
+ Requires-Dist: sentence-transformers>=2.2.0; extra == 'local'
30
+ Description-Content-Type: text/markdown
31
+
32
+ # our-embeddings
33
+
34
+ Vector embedding generation and similarity search for the ourochronos ecosystem.
35
+
36
+ ## Overview
37
+
38
+ our-embeddings provides a unified interface for generating and searching vector embeddings. It supports both local (sentence-transformers) and OpenAI providers, with a federation standard for cross-node embedding compatibility.
39
+
40
+ Default model: **BAAI/bge-small-en-v1.5** (384 dimensions, L2-normalized).
41
+
42
+ ## Install
43
+
44
+ ```bash
45
+ pip install our-embeddings
46
+ ```
47
+
48
+ For local embeddings (default, no API key needed):
49
+ ```bash
50
+ pip install "our-embeddings[local]" # includes sentence-transformers
51
+ ```
52
+
53
+ ## Usage
54
+
55
+ ### Generate Embeddings
56
+
57
+ ```python
58
+ from our_embeddings.service import generate_embedding, vector_to_pgvector
59
+
60
+ # Generate a 384-dim embedding vector
61
+ vector = generate_embedding("PostgreSQL is excellent for JSONB queries")
62
+
63
+ # Convert to pgvector format for storage
64
+ pg_str = vector_to_pgvector(vector)
65
+ # → "[0.0231,0.0891,...]"
66
+ ```
67
+
68
+ ### Search Similar Content
69
+
70
+ ```python
71
+ from our_embeddings import search_similar
72
+
73
+ results = search_similar(
74
+ query="database performance",
75
+ content_type="belief",
76
+ limit=10,
77
+ min_similarity=0.5,
78
+ )
79
+ # Returns list of dicts with id, content, similarity score
80
+ ```
81
+
82
+ ### Embed and Store
83
+
84
+ ```python
85
+ from our_embeddings import embed_content
86
+
87
+ result = embed_content(
88
+ content_type="belief",
89
+ content_id="uuid-here",
90
+ text="Valence uses dimensional confidence",
91
+ )
92
+ ```
93
+
94
+ ### Batch Operations
95
+
96
+ ```python
97
+ from our_embeddings.local import generate_embeddings_batch
98
+
99
+ vectors = generate_embeddings_batch(
100
+ ["text one", "text two", "text three"],
101
+ batch_size=32,
102
+ )
103
+ ```
104
+
105
+ ### Backfill Missing Embeddings
106
+
107
+ ```python
108
+ from our_embeddings import backfill_embeddings
109
+
110
+ count = backfill_embeddings(content_type="belief", batch_size=100)
111
+ ```
112
+
113
+ ## Configuration
114
+
115
+ ### EmbeddingConfig
116
+
117
+ ```python
118
+ from our_embeddings.config import EmbeddingConfig
119
+
120
+ config = EmbeddingConfig.from_env()
121
+ # Fields:
122
+ # embedding_provider: str = "local"
123
+ # embedding_model_path: str = "BAAI/bge-small-en-v1.5"
124
+ # embedding_device: str = "cpu"
125
+ # openai_api_key: str = ""
126
+ ```
127
+
128
+ ### Environment Variables
129
+
130
+ | Variable | Default | Description |
131
+ |----------|---------|-------------|
132
+ | `VALENCE_EMBEDDING_PROVIDER` | `local` | `"local"` or `"openai"` |
133
+ | `VALENCE_EMBEDDING_MODEL_PATH` | `BAAI/bge-small-en-v1.5` | Model name or path |
134
+ | `VALENCE_EMBEDDING_DEVICE` | `cpu` | `"cpu"` or `"cuda"` |
135
+ | `OPENAI_API_KEY` | — | Required if provider is `openai` |
136
+
137
+ ## Providers
138
+
139
+ ### Local (default)
140
+
141
+ Uses sentence-transformers with BAAI/bge-small-en-v1.5:
142
+ - 384 dimensions, L2-normalized
143
+ - No API key required
144
+ - Model loaded lazily and cached as singleton
145
+ - Thread-safe initialization
146
+
147
+ ### OpenAI
148
+
149
+ Uses OpenAI text-embedding-3-small:
150
+ - 1536 dimensions
151
+ - Requires `OPENAI_API_KEY`
152
+ - Text truncated to 8000 chars
153
+
154
+ ## Embedding Type Registry
155
+
156
+ Register and manage multiple embedding types:
157
+
158
+ ```python
159
+ from our_embeddings import register_embedding_type, list_embedding_types
160
+
161
+ register_embedding_type(
162
+ type_id="local_bge_small",
163
+ provider="local",
164
+ model="BAAI/bge-small-en-v1.5",
165
+ dimensions=384,
166
+ is_default=True,
167
+ )
168
+
169
+ types = list_embedding_types(status="active")
170
+ ```
171
+
172
+ ## Federation Standard
173
+
174
+ Cross-node embedding compatibility for federated knowledge sharing:
175
+
176
+ ```python
177
+ from our_embeddings import get_federation_standard, validate_federation_embedding
178
+
179
+ standard = get_federation_standard()
180
+ # → {"model": "BAAI/bge-small-en-v1.5", "dimensions": 384,
181
+ # "type": "bge_small_en_v15", "normalization": "L2", "version": "1.0"}
182
+
183
+ valid, error = validate_federation_embedding([0.1, 0.2, ...])
184
+ ```
185
+
186
+ Federation functions for belief exchange:
187
+ - `prepare_belief_for_federation(belief_id)` — Package belief with embedding
188
+ - `validate_incoming_belief_embedding(data)` — Validate received embeddings
189
+ - `regenerate_embedding_if_needed(data)` — Re-embed if format differs
190
+
191
+ ## State Ownership
192
+
193
+ Owns the `embedding_types` and `embedding_coverage` tables in the valence schema. Reads/writes the `embedding` column on `beliefs`, `vkb_exchanges`, and `vkb_patterns` tables.
194
+
195
+ ## Development
196
+
197
+ ```bash
198
+ make dev # Install with dev dependencies
199
+ make lint # Run linters
200
+ make test # Run tests
201
+ make test-cov # Tests with coverage
202
+ make format # Auto-format
203
+ ```
204
+
205
+ ## Part of Valence
206
+
207
+ This brick is part of the [Valence](https://github.com/ourochronos/valence) knowledge substrate. See [our-infra](https://github.com/ourochronos/our-infra) for ourochronos conventions.
208
+
209
+ ## License
210
+
211
+ MIT
@@ -0,0 +1,180 @@
1
+ # our-embeddings
2
+
3
+ Vector embedding generation and similarity search for the ourochronos ecosystem.
4
+
5
+ ## Overview
6
+
7
+ our-embeddings provides a unified interface for generating and searching vector embeddings. It supports both local (sentence-transformers) and OpenAI providers, with a federation standard for cross-node embedding compatibility.
8
+
9
+ Default model: **BAAI/bge-small-en-v1.5** (384 dimensions, L2-normalized).
10
+
11
+ ## Install
12
+
13
+ ```bash
14
+ pip install our-embeddings
15
+ ```
16
+
17
+ For local embeddings (default, no API key needed):
18
+ ```bash
19
+ pip install "our-embeddings[local]" # includes sentence-transformers
20
+ ```
21
+
22
+ ## Usage
23
+
24
+ ### Generate Embeddings
25
+
26
+ ```python
27
+ from our_embeddings.service import generate_embedding, vector_to_pgvector
28
+
29
+ # Generate a 384-dim embedding vector
30
+ vector = generate_embedding("PostgreSQL is excellent for JSONB queries")
31
+
32
+ # Convert to pgvector format for storage
33
+ pg_str = vector_to_pgvector(vector)
34
+ # → "[0.0231,0.0891,...]"
35
+ ```
36
+
37
+ ### Search Similar Content
38
+
39
+ ```python
40
+ from our_embeddings import search_similar
41
+
42
+ results = search_similar(
43
+ query="database performance",
44
+ content_type="belief",
45
+ limit=10,
46
+ min_similarity=0.5,
47
+ )
48
+ # Returns list of dicts with id, content, similarity score
49
+ ```
50
+
51
+ ### Embed and Store
52
+
53
+ ```python
54
+ from our_embeddings import embed_content
55
+
56
+ result = embed_content(
57
+ content_type="belief",
58
+ content_id="uuid-here",
59
+ text="Valence uses dimensional confidence",
60
+ )
61
+ ```
62
+
63
+ ### Batch Operations
64
+
65
+ ```python
66
+ from our_embeddings.local import generate_embeddings_batch
67
+
68
+ vectors = generate_embeddings_batch(
69
+ ["text one", "text two", "text three"],
70
+ batch_size=32,
71
+ )
72
+ ```
73
+
74
+ ### Backfill Missing Embeddings
75
+
76
+ ```python
77
+ from our_embeddings import backfill_embeddings
78
+
79
+ count = backfill_embeddings(content_type="belief", batch_size=100)
80
+ ```
81
+
82
+ ## Configuration
83
+
84
+ ### EmbeddingConfig
85
+
86
+ ```python
87
+ from our_embeddings.config import EmbeddingConfig
88
+
89
+ config = EmbeddingConfig.from_env()
90
+ # Fields:
91
+ # embedding_provider: str = "local"
92
+ # embedding_model_path: str = "BAAI/bge-small-en-v1.5"
93
+ # embedding_device: str = "cpu"
94
+ # openai_api_key: str = ""
95
+ ```
96
+
97
+ ### Environment Variables
98
+
99
+ | Variable | Default | Description |
100
+ |----------|---------|-------------|
101
+ | `VALENCE_EMBEDDING_PROVIDER` | `local` | `"local"` or `"openai"` |
102
+ | `VALENCE_EMBEDDING_MODEL_PATH` | `BAAI/bge-small-en-v1.5` | Model name or path |
103
+ | `VALENCE_EMBEDDING_DEVICE` | `cpu` | `"cpu"` or `"cuda"` |
104
+ | `OPENAI_API_KEY` | — | Required if provider is `openai` |
105
+
106
+ ## Providers
107
+
108
+ ### Local (default)
109
+
110
+ Uses sentence-transformers with BAAI/bge-small-en-v1.5:
111
+ - 384 dimensions, L2-normalized
112
+ - No API key required
113
+ - Model loaded lazily and cached as singleton
114
+ - Thread-safe initialization
115
+
116
+ ### OpenAI
117
+
118
+ Uses OpenAI text-embedding-3-small:
119
+ - 1536 dimensions
120
+ - Requires `OPENAI_API_KEY`
121
+ - Text truncated to 8000 chars
122
+
123
+ ## Embedding Type Registry
124
+
125
+ Register and manage multiple embedding types:
126
+
127
+ ```python
128
+ from our_embeddings import register_embedding_type, list_embedding_types
129
+
130
+ register_embedding_type(
131
+ type_id="local_bge_small",
132
+ provider="local",
133
+ model="BAAI/bge-small-en-v1.5",
134
+ dimensions=384,
135
+ is_default=True,
136
+ )
137
+
138
+ types = list_embedding_types(status="active")
139
+ ```
140
+
141
+ ## Federation Standard
142
+
143
+ Cross-node embedding compatibility for federated knowledge sharing:
144
+
145
+ ```python
146
+ from our_embeddings import get_federation_standard, validate_federation_embedding
147
+
148
+ standard = get_federation_standard()
149
+ # → {"model": "BAAI/bge-small-en-v1.5", "dimensions": 384,
150
+ # "type": "bge_small_en_v15", "normalization": "L2", "version": "1.0"}
151
+
152
+ valid, error = validate_federation_embedding([0.1, 0.2, ...])
153
+ ```
154
+
155
+ Federation functions for belief exchange:
156
+ - `prepare_belief_for_federation(belief_id)` — Package belief with embedding
157
+ - `validate_incoming_belief_embedding(data)` — Validate received embeddings
158
+ - `regenerate_embedding_if_needed(data)` — Re-embed if format differs
159
+
160
+ ## State Ownership
161
+
162
+ Owns the `embedding_types` and `embedding_coverage` tables in the valence schema. Reads/writes the `embedding` column on `beliefs`, `vkb_exchanges`, and `vkb_patterns` tables.
163
+
164
+ ## Development
165
+
166
+ ```bash
167
+ make dev # Install with dev dependencies
168
+ make lint # Run linters
169
+ make test # Run tests
170
+ make test-cov # Tests with coverage
171
+ make format # Auto-format
172
+ ```
173
+
174
+ ## Part of Valence
175
+
176
+ This brick is part of the [Valence](https://github.com/ourochronos/valence) knowledge substrate. See [our-infra](https://github.com/ourochronos/our-infra) for ourochronos conventions.
177
+
178
+ ## License
179
+
180
+ MIT
@@ -0,0 +1,95 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "our-embeddings"
7
+ version = "0.1.0"
8
+ description = "Vector embedding generation and similarity search"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.11"
12
+ authors = [
13
+ { name = "Chris Jacobs" },
14
+ ]
15
+ classifiers = [
16
+ "Development Status :: 3 - Alpha",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "License :: OSI Approved :: MIT License",
21
+ ]
22
+ dependencies = [
23
+ "our-db>=0.1.0",
24
+ "openai>=1.0",
25
+ ]
26
+
27
+ [project.urls]
28
+ Homepage = "https://github.com/ourochronos/our-embeddings"
29
+ Repository = "https://github.com/ourochronos/our-embeddings"
30
+
31
+ [project.optional-dependencies]
32
+ local = [
33
+ "sentence-transformers>=2.2.0",
34
+ "numpy>=1.24",
35
+ ]
36
+ dev = [
37
+ "pytest>=8.0",
38
+ "pytest-asyncio>=0.23",
39
+ "pytest-cov>=4.0",
40
+ "pytest-mock>=3.12", "pytest-timeout>=2.1",  # pytest-timeout provides the --timeout flag used by the Makefile's test-int/test-all targets
41
+ "ruff>=0.4",
42
+ "mypy>=1.10",
43
+ "pre-commit>=3.7",
44
+ "numpy>=1.24",
45
+ ]
46
+
47
+ [tool.hatch.build.targets.wheel]
48
+ packages = ["src/our_embeddings"]
49
+
50
+ [tool.ruff]
51
+ target-version = "py311"
52
+ line-length = 120
53
+ src = ["src", "tests"]
54
+
55
+ [tool.ruff.lint]
56
+ select = ["E", "F", "I", "N", "W", "UP", "B", "C4"]
57
+
58
+ [tool.ruff.lint.isort]
59
+ known-first-party = ["our_embeddings"]
60
+
61
+ [tool.mypy]
62
+ python_version = "3.11"
63
+ disallow_untyped_defs = true
64
+ strict_optional = true
65
+ warn_redundant_casts = true
66
+ warn_unused_ignores = true
67
+
68
+ [[tool.mypy.overrides]]
69
+ module = [
70
+ "our_db",
71
+ "our_db.*",
72
+ "openai",
73
+ "openai.*",
74
+ "sentence_transformers",
75
+ "sentence_transformers.*",
76
+ "numpy",
77
+ "numpy.*",
78
+ ]
79
+ ignore_missing_imports = true
80
+
81
+ [tool.pytest.ini_options]
82
+ testpaths = ["tests"]
83
+ asyncio_mode = "auto"
84
+ markers = [
85
+ "unit: Unit tests (no external dependencies)",
86
+ "integration: Integration tests (require external services)",
87
+ "slow: Slow tests (>5s)",
88
+ ]
89
+
90
+ [tool.coverage.run]
91
+ branch = true
92
+ source = ["src/our_embeddings"]
93
+
94
+ [tool.coverage.report]
95
+ show_missing = true