remdb 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rem/__init__.py +2 -0
- rem/agentic/README.md +650 -0
- rem/agentic/__init__.py +39 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +8 -0
- rem/agentic/context.py +148 -0
- rem/agentic/context_builder.py +329 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +107 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +151 -0
- rem/agentic/providers/phoenix.py +674 -0
- rem/agentic/providers/pydantic_ai.py +572 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +396 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +231 -0
- rem/api/README.md +420 -0
- rem/api/main.py +324 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +536 -0
- rem/api/mcp_router/server.py +213 -0
- rem/api/mcp_router/tools.py +584 -0
- rem/api/routers/auth.py +229 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/completions.py +281 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +124 -0
- rem/api/routers/chat/streaming.py +185 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +26 -0
- rem/auth/middleware.py +100 -0
- rem/auth/providers/__init__.py +13 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +455 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +126 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +566 -0
- rem/cli/commands/configure.py +497 -0
- rem/cli/commands/db.py +493 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1302 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +245 -0
- rem/cli/commands/schema.py +183 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +96 -0
- rem/config.py +237 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +64 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +628 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +243 -0
- rem/models/entities/__init__.py +43 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +35 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +191 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/user.py +85 -0
- rem/py.typed +0 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +128 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +16 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +801 -0
- rem/services/content/service.py +676 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +336 -0
- rem/services/dreaming/moment_service.py +264 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +120 -0
- rem/services/embeddings/worker.py +421 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +686 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +575 -0
- rem/services/postgres/__init__.py +23 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
- rem/services/postgres/register_type.py +352 -0
- rem/services/postgres/repository.py +337 -0
- rem/services/postgres/schema_generator.py +379 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +354 -0
- rem/services/rem/README.md +304 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +145 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +527 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +6 -0
- rem/services/session/compression.py +360 -0
- rem/services/session/reload.py +77 -0
- rem/settings.py +1235 -0
- rem/sql/002_install_models.sql +1068 -0
- rem/sql/background_indexes.sql +42 -0
- rem/sql/install_models.sql +1038 -0
- rem/sql/migrations/001_install.sql +503 -0
- rem/sql/migrations/002_install_models.sql +1202 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +583 -0
- rem/utils/__init__.py +43 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +423 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/markdown.py +16 -0
- rem/utils/model_helpers.py +236 -0
- rem/utils/schema_loader.py +336 -0
- rem/utils/sql_types.py +348 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +330 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +5 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- remdb-0.3.7.dist-info/METADATA +1473 -0
- remdb-0.3.7.dist-info/RECORD +187 -0
- remdb-0.3.7.dist-info/WHEEL +4 -0
- remdb-0.3.7.dist-info/entry_points.txt +2 -0
rem/utils/README.md
ADDED
|
@@ -0,0 +1,583 @@
|
|
|
1
|
+
# REM Utilities
|
|
2
|
+
|
|
3
|
+
## Table of Contents
|
|
4
|
+
|
|
5
|
+
1. [SQL Types](#sql-types-sql_typespy) - Pydantic to PostgreSQL type mapping
|
|
6
|
+
2. [Embeddings](#embeddings-embeddingspy) - Vector embeddings generation
|
|
7
|
+
|
|
8
|
+
## SQL Types (`sql_types.py`)
|
|
9
|
+
|
|
10
|
+
Intelligent Pydantic to PostgreSQL type mapping utility for generating database schemas and UPSERT statements.
|
|
11
|
+
|
|
12
|
+
### Features
|
|
13
|
+
|
|
14
|
+
- **Smart String Handling**: VARCHAR(256) by default, TEXT for content/description fields
|
|
15
|
+
- **Union Type Preferences**: Prioritizes UUID and JSONB in Union types
|
|
16
|
+
- **Array Support**: PostgreSQL arrays for `list[str]`, JSONB for complex lists
|
|
17
|
+
- **JSONB for Structured Data**: Automatic JSONB for dicts and nested structures
|
|
18
|
+
- **Custom Type Overrides**: Support for `sql_type` in `json_schema_extra`
|
|
19
|
+
- **Embedding Field Detection**: Auto-detects embedding fields via `embedding_provider`
|
|
20
|
+
- **Schema Generation**: CREATE TABLE with appropriate indexes
|
|
21
|
+
- **UPSERT Templates**: `INSERT ... ON CONFLICT ... DO UPDATE` statements
|
|
22
|
+
|
|
23
|
+
### Type Mapping Rules
|
|
24
|
+
|
|
25
|
+
| Pydantic Type | PostgreSQL Type | Notes |
|
|
26
|
+
|---------------|-----------------|-------|
|
|
27
|
+
| `str` | `VARCHAR(256)` | Default for strings |
|
|
28
|
+
| `str` (field name: content, description, summary, etc.) | `TEXT` | Long-form text fields |
|
|
29
|
+
| `str` (with `embedding_provider`) | `TEXT` | For vector search preprocessing |
|
|
30
|
+
| `int` | `INTEGER` | Standard integer |
|
|
31
|
+
| `float` | `DOUBLE PRECISION` | Floating point |
|
|
32
|
+
| `bool` | `BOOLEAN` | Boolean values |
|
|
33
|
+
| `UUID` | `UUID` | PostgreSQL UUID type |
|
|
34
|
+
| `datetime` | `TIMESTAMP` | Timestamp without timezone |
|
|
35
|
+
| `date` | `DATE` | Date only |
|
|
36
|
+
| `dict` | `JSONB` | Structured JSON data |
|
|
37
|
+
| `list[str]` | `TEXT[]` | PostgreSQL array |
|
|
38
|
+
| `list[dict]` | `JSONB` | Complex nested data |
|
|
39
|
+
| `UUID \| str` | `UUID` | Prefers UUID in unions |
|
|
40
|
+
| `dict \| None` | `JSONB` | Prefers JSONB in unions |
|
|
41
|
+
|
|
42
|
+
### Long-Form Text Field Names
|
|
43
|
+
|
|
44
|
+
Fields with these names automatically use TEXT:
|
|
45
|
+
- `content`, `description`, `summary`
|
|
46
|
+
- `instructions`, `prompt`, `message`
|
|
47
|
+
- `body`, `text`, `note`, `comment`
|
|
48
|
+
- Fields ending with `_content`, `_description`, `_summary`, `_text`, `_message`
|
|
49
|
+
|
|
50
|
+
### Usage Examples
|
|
51
|
+
|
|
52
|
+
#### Basic Type Mapping
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from pydantic import Field
|
|
56
|
+
from rem.utils.sql_types import get_sql_type
|
|
57
|
+
|
|
58
|
+
# String field
|
|
59
|
+
field = Field(default="")
|
|
60
|
+
get_sql_type(field, "name") # "VARCHAR(256)"
|
|
61
|
+
|
|
62
|
+
# Content field (detected by name)
|
|
63
|
+
field = Field(default="")
|
|
64
|
+
get_sql_type(field, "content") # "TEXT"
|
|
65
|
+
|
|
66
|
+
# Dict field
|
|
67
|
+
field = Field(default_factory=dict)
|
|
68
|
+
get_sql_type(field, "metadata") # "JSONB"
|
|
69
|
+
|
|
70
|
+
# List of strings
|
|
71
|
+
field = Field(default_factory=list)
|
|
72
|
+
get_sql_type(field, "tags") # "TEXT[]"
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
#### Custom Type Override
|
|
76
|
+
|
|
77
|
+
```python
|
|
78
|
+
from pydantic import BaseModel, Field
|
|
79
|
+
|
|
80
|
+
class Document(BaseModel):
|
|
81
|
+
# Custom pgvector type
|
|
82
|
+
embedding: list[float] = Field(
|
|
83
|
+
default_factory=list,
|
|
84
|
+
json_schema_extra={"sql_type": "vector(1536)"}
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
# Embedding provider detection (format: provider:model_name)
|
|
88
|
+
content: str = Field(
|
|
89
|
+
default="",
|
|
90
|
+
json_schema_extra={"embedding_provider": "openai:text-embedding-3-small"}
|
|
91
|
+
) # Will use TEXT
|
|
92
|
+
|
|
93
|
+
# Alternative embedding providers
|
|
94
|
+
description: str = Field(
|
|
95
|
+
default="",
|
|
96
|
+
json_schema_extra={"embedding_provider": "anthropic:voyage-2"}
|
|
97
|
+
) # Will use TEXT
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
#### Generate CREATE TABLE
|
|
101
|
+
|
|
102
|
+
```python
|
|
103
|
+
from rem.models.entities import Resource
|
|
104
|
+
from rem.utils.sql_types import model_to_create_table
|
|
105
|
+
|
|
106
|
+
sql = model_to_create_table(Resource, "resources")
|
|
107
|
+
print(sql)
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Output:
|
|
111
|
+
```sql
|
|
112
|
+
CREATE TABLE IF NOT EXISTS resources (
|
|
113
|
+
id UUID PRIMARY KEY,
|
|
114
|
+
name VARCHAR(256) NOT NULL,
|
|
115
|
+
uri VARCHAR(256),
|
|
116
|
+
content TEXT NOT NULL DEFAULT '',
|
|
117
|
+
description TEXT,
|
|
118
|
+
category VARCHAR(256),
|
|
119
|
+
related_entities JSONB NOT NULL DEFAULT '{}'::jsonb
|
|
120
|
+
);
|
|
121
|
+
|
|
122
|
+
-- Indexes
|
|
123
|
+
CREATE INDEX IF NOT EXISTS idx_resources_tenant_id ON resources(tenant_id);
|
|
124
|
+
CREATE INDEX IF NOT EXISTS idx_resources_related_entities ON resources USING GIN(related_entities);
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
#### Generate UPSERT Statement
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from rem.models.entities import Resource
|
|
131
|
+
from rem.utils.sql_types import model_to_upsert
|
|
132
|
+
|
|
133
|
+
sql = model_to_upsert(Resource, "resources")
|
|
134
|
+
print(sql)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Output:
|
|
138
|
+
```sql
|
|
139
|
+
INSERT INTO resources (id, name, uri, content, description, category, related_entities)
|
|
140
|
+
VALUES ($1, $2, $3, $4, $5, $6, $7)
|
|
141
|
+
ON CONFLICT (id)
|
|
142
|
+
DO UPDATE SET name = EXCLUDED.name, uri = EXCLUDED.uri, content = EXCLUDED.content,
|
|
143
|
+
description = EXCLUDED.description, category = EXCLUDED.category,
|
|
144
|
+
related_entities = EXCLUDED.related_entities;
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
#### Complete Column Definition
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
from pydantic import Field
|
|
151
|
+
from rem.utils.sql_types import get_column_definition
|
|
152
|
+
|
|
153
|
+
# Primary key
|
|
154
|
+
field = Field(..., description="User ID")
|
|
155
|
+
get_column_definition(field, "id", nullable=False, primary_key=True)
|
|
156
|
+
# "id UUID PRIMARY KEY"
|
|
157
|
+
|
|
158
|
+
# Optional field
|
|
159
|
+
field = Field(default=None)
|
|
160
|
+
get_column_definition(field, "email", nullable=True)
|
|
161
|
+
# "email VARCHAR(256)"
|
|
162
|
+
|
|
163
|
+
# JSONB with default
|
|
164
|
+
field = Field(default_factory=dict)
|
|
165
|
+
get_column_definition(field, "metadata", nullable=False)
|
|
166
|
+
# "metadata JSONB NOT NULL DEFAULT '{}'::jsonb"
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
### Index Generation
|
|
170
|
+
|
|
171
|
+
The utility automatically creates indexes for:
|
|
172
|
+
|
|
173
|
+
1. **Foreign Keys**: Fields ending with `_id` (e.g., `user_id`, `tenant_id`)
|
|
174
|
+
2. **JSONB Fields**: GIN indexes for efficient querying
|
|
175
|
+
3. **Array Fields**: GIN indexes for array containment queries
|
|
176
|
+
4. **Primary Keys**: Automatically indexed
|
|
177
|
+
|
|
178
|
+
### Integration with REM Models
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
# Generate schema for all REM entities
|
|
182
|
+
from rem.models.entities import Resource, Message, User, File, Moment, Schema
|
|
183
|
+
from rem.utils.sql_types import model_to_create_table
|
|
184
|
+
|
|
185
|
+
for model, table_name in [
|
|
186
|
+
(Resource, "resources"),
|
|
187
|
+
(Message, "messages"),
|
|
188
|
+
(User, "users"),
|
|
189
|
+
(File, "files"),
|
|
190
|
+
(Moment, "moments"),
|
|
191
|
+
(Schema, "schemas"),
|
|
192
|
+
]:
|
|
193
|
+
sql = model_to_create_table(model, table_name)
|
|
194
|
+
with open(f"migrations/{table_name}.sql", "w") as f:
|
|
195
|
+
f.write(sql)
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
### Best Practices (from Research)
|
|
199
|
+
|
|
200
|
+
Based on PostgreSQL documentation and community best practices:
|
|
201
|
+
|
|
202
|
+
1. **VARCHAR(256) for Most Strings**: Good balance between validation and flexibility
|
|
203
|
+
2. **TEXT for Long Content**: No performance penalty, better for variable-length text
|
|
204
|
+
3. **JSONB over JSON**: Better querying capabilities, GIN indexing support
|
|
205
|
+
4. **Arrays for Simple Lists**: More efficient than JSONB for simple string/int lists
|
|
206
|
+
5. **Consistent Typing**: Use one approach throughout your schema for maintainability
|
|
207
|
+
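6. **Index Size Limits**: A PostgreSQL B-tree index entry cannot exceed roughly one-third of a page (about 2712 bytes with the default 8 kB page size), so TEXT columns should be length-constrained, or indexed through a hash or prefix expression, if they are B-tree indexed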
|
|
208
|
+
|
|
209
|
+
### Running the Example
|
|
210
|
+
|
|
211
|
+
```bash
|
|
212
|
+
cd src/rem/utils/examples
|
|
213
|
+
python sql_types_example.py
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
This will demonstrate:
|
|
217
|
+
- Field type mapping
|
|
218
|
+
- Column definitions
|
|
219
|
+
- CREATE TABLE generation
|
|
220
|
+
- UPSERT statement generation
|
|
221
|
+
|
|
222
|
+
### See Also
|
|
223
|
+
|
|
224
|
+
- `examples/sql_types_example.py` - Complete working examples
|
|
225
|
+
- `../../models/entities/` - REM entity models
|
|
226
|
+
- `../../models/core/core_model.py` - CoreModel base class
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
## Embeddings (`embeddings.py`)
|
|
231
|
+
|
|
232
|
+
Vector embeddings generation utility using HTTP requests (no provider SDKs required). Supports batch processing for efficient API usage and automatic retry with exponential backoff using `tenacity`.
|
|
233
|
+
|
|
234
|
+
### Features
|
|
235
|
+
|
|
236
|
+
- **No SDK Dependencies**: Uses `requests` library for HTTP calls
|
|
237
|
+
- **Batch Processing**: Generate embeddings for multiple texts in a single API call
|
|
238
|
+
- **Multiple Providers**: OpenAI (text-embedding-3-small, text-embedding-3-large, text-embedding-ada-002), Voyage AI (voyage-2, voyage-large-2)
|
|
239
|
+
- **Automatic Retries**: Uses `tenacity` library for exponential backoff on rate limits
|
|
240
|
+
- **Provider Format**: Uses `provider:model_name` format (e.g., `openai:text-embedding-3-small`)
|
|
241
|
+
- **Environment Variables**: API keys from `LLM__OPENAI_API_KEY` or `OPENAI_API_KEY`
|
|
242
|
+
- **Error Handling**: Custom exceptions for embedding errors and rate limits
|
|
243
|
+
|
|
244
|
+
### Supported Models
|
|
245
|
+
|
|
246
|
+
| Provider | Model | Dimensions | Cost (per 1M tokens) |
|
|
247
|
+
|----------|-------|------------|---------------------|
|
|
248
|
+
| OpenAI | text-embedding-3-small | 1536 | $0.02 |
|
|
249
|
+
| OpenAI | text-embedding-3-large | 3072 | $0.13 |
|
|
250
|
+
| OpenAI | text-embedding-ada-002 | 1536 | $0.10 |
|
|
251
|
+
| Voyage AI | voyage-2 | 1024 | Varies |
|
|
252
|
+
| Voyage AI | voyage-large-2 | 1536 | Varies |
|
|
253
|
+
|
|
254
|
+
### Usage Examples
|
|
255
|
+
|
|
256
|
+
#### Single Text Embedding
|
|
257
|
+
|
|
258
|
+
```python
|
|
259
|
+
from rem.utils.embeddings import generate_embeddings
|
|
260
|
+
|
|
261
|
+
# Generate embedding for single text
|
|
262
|
+
embedding = generate_embeddings(
|
|
263
|
+
"openai:text-embedding-3-small",
|
|
264
|
+
"What is the meaning of life?"
|
|
265
|
+
)
|
|
266
|
+
|
|
267
|
+
# Result: list[float] with 1536 dimensions
|
|
268
|
+
print(f"Embedding dimension: {len(embedding)}") # 1536
|
|
269
|
+
print(f"First 5 values: {embedding[:5]}")
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
#### Batch Processing (Recommended)
|
|
273
|
+
|
|
274
|
+
```python
|
|
275
|
+
from rem.utils.embeddings import generate_embeddings
|
|
276
|
+
|
|
277
|
+
# Generate embeddings for multiple texts (more efficient)
|
|
278
|
+
texts = [
|
|
279
|
+
"What is machine learning?",
|
|
280
|
+
"How does neural network work?",
|
|
281
|
+
"Explain deep learning",
|
|
282
|
+
]
|
|
283
|
+
|
|
284
|
+
embeddings = generate_embeddings(
|
|
285
|
+
"openai:text-embedding-3-small",
|
|
286
|
+
texts
|
|
287
|
+
)
|
|
288
|
+
|
|
289
|
+
# Result: list[list[float]] - one embedding per text
|
|
290
|
+
print(f"Generated {len(embeddings)} embeddings")
|
|
291
|
+
for i, embedding in enumerate(embeddings):
|
|
292
|
+
print(f"Text {i+1}: {len(embedding)} dimensions")
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
#### Get Embedding Dimension
|
|
296
|
+
|
|
297
|
+
```python
|
|
298
|
+
from rem.utils.embeddings import get_embedding_dimension
|
|
299
|
+
|
|
300
|
+
# Get dimension for a model (useful for creating vector columns)
|
|
301
|
+
dimension = get_embedding_dimension("openai:text-embedding-3-small")
|
|
302
|
+
print(f"Dimension: {dimension}") # 1536
|
|
303
|
+
|
|
304
|
+
# Create PostgreSQL vector column with correct dimension
|
|
305
|
+
# CREATE TABLE documents (
|
|
306
|
+
# id UUID PRIMARY KEY,
|
|
307
|
+
# content TEXT,
|
|
308
|
+
# embedding vector(1536) -- Use dimension from get_embedding_dimension
|
|
309
|
+
# );
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
#### Error Handling
|
|
313
|
+
|
|
314
|
+
```python
|
|
315
|
+
from rem.utils.embeddings import (
|
|
316
|
+
generate_embeddings,
|
|
317
|
+
EmbeddingError,
|
|
318
|
+
RateLimitError,
|
|
319
|
+
)
|
|
320
|
+
|
|
321
|
+
try:
|
|
322
|
+
embeddings = generate_embeddings(
|
|
323
|
+
"openai:text-embedding-3-small",
|
|
324
|
+
texts,
|
|
325
|
+
max_retries=2, # Optional: increase retries if needed
|
|
326
|
+
)
|
|
327
|
+
except RateLimitError as e:
|
|
328
|
+
print(f"Rate limit exceeded after retries: {e}")
|
|
329
|
+
# All retries exhausted, implement queue or wait longer
|
|
330
|
+
except EmbeddingError as e:
|
|
331
|
+
print(f"Embedding generation failed: {e}")
|
|
332
|
+
```
|
|
333
|
+
|
|
334
|
+
#### Custom API Key
|
|
335
|
+
|
|
336
|
+
```python
|
|
337
|
+
from rem.utils.embeddings import generate_embeddings
|
|
338
|
+
|
|
339
|
+
# Provide API key explicitly (instead of environment variable)
|
|
340
|
+
embedding = generate_embeddings(
|
|
341
|
+
"openai:text-embedding-3-small",
|
|
342
|
+
"Hello world",
|
|
343
|
+
api_key="sk-..."
|
|
344
|
+
)
|
|
345
|
+
```
|
|
346
|
+
|
|
347
|
+
### PostgreSQL Integration
|
|
348
|
+
|
|
349
|
+
#### Add Embedding Column
|
|
350
|
+
|
|
351
|
+
```python
|
|
352
|
+
from rem.utils.embeddings import get_embedding_dimension
|
|
353
|
+
|
|
354
|
+
# Get dimension for the model you'll use
|
|
355
|
+
dimension = get_embedding_dimension("openai:text-embedding-3-small")
|
|
356
|
+
|
|
357
|
+
# Create vector column (requires pgvector extension)
|
|
358
|
+
await postgres.execute(f"""
|
|
359
|
+
ALTER TABLE documents
|
|
360
|
+
ADD COLUMN IF NOT EXISTS embedding vector({dimension})
|
|
361
|
+
""")
|
|
362
|
+
```
|
|
363
|
+
|
|
364
|
+
#### Generate and Store Embeddings
|
|
365
|
+
|
|
366
|
+
```python
|
|
367
|
+
from rem.utils.embeddings import generate_embeddings
|
|
368
|
+
|
|
369
|
+
# Single record
|
|
370
|
+
async def add_document_with_embedding(content: str):
|
|
371
|
+
# Generate embedding
|
|
372
|
+
embedding = generate_embeddings(
|
|
373
|
+
"openai:text-embedding-3-small",
|
|
374
|
+
content
|
|
375
|
+
)
|
|
376
|
+
|
|
377
|
+
# Store in database
|
|
378
|
+
await postgres.execute(
|
|
379
|
+
"""
|
|
380
|
+
INSERT INTO documents (id, content, embedding)
|
|
381
|
+
VALUES ($1, $2, $3::vector)
|
|
382
|
+
""",
|
|
383
|
+
doc_id,
|
|
384
|
+
content,
|
|
385
|
+
embedding,
|
|
386
|
+
)
|
|
387
|
+
|
|
388
|
+
# Batch processing (efficient)
|
|
389
|
+
async def batch_generate_embeddings(batch_size: int = 100):
|
|
390
|
+
# Get records without embeddings
|
|
391
|
+
records = await postgres.fetch_all("""
|
|
392
|
+
SELECT id, content
|
|
393
|
+
FROM documents
|
|
394
|
+
WHERE embedding IS NULL
|
|
395
|
+
LIMIT $1
|
|
396
|
+
""", batch_size)
|
|
397
|
+
|
|
398
|
+
# Extract texts
|
|
399
|
+
texts = [r["content"] for r in records]
|
|
400
|
+
|
|
401
|
+
# Generate all embeddings in one API call
|
|
402
|
+
embeddings = generate_embeddings(
|
|
403
|
+
"openai:text-embedding-3-small",
|
|
404
|
+
texts
|
|
405
|
+
)
|
|
406
|
+
|
|
407
|
+
# Store embeddings
|
|
408
|
+
for record, embedding in zip(records, embeddings):
|
|
409
|
+
await postgres.execute(
|
|
410
|
+
"""
|
|
411
|
+
UPDATE documents
|
|
412
|
+
SET embedding = $1::vector
|
|
413
|
+
WHERE id = $2
|
|
414
|
+
""",
|
|
415
|
+
embedding,
|
|
416
|
+
record["id"],
|
|
417
|
+
)
|
|
418
|
+
```
|
|
419
|
+
|
|
420
|
+
#### Similarity Search
|
|
421
|
+
|
|
422
|
+
```python
|
|
423
|
+
# Vector similarity search using pgvector
|
|
424
|
+
async def search_similar_documents(query: str, limit: int = 10):
|
|
425
|
+
# Generate query embedding
|
|
426
|
+
query_embedding = generate_embeddings(
|
|
427
|
+
"openai:text-embedding-3-small",
|
|
428
|
+
query
|
|
429
|
+
)
|
|
430
|
+
|
|
431
|
+
# Search by cosine distance (pgvector <=> operator; smaller distance = more similar)
|
|
432
|
+
results = await postgres.fetch_all(
|
|
433
|
+
"""
|
|
434
|
+
SELECT id, content,
|
|
435
|
+
embedding <=> $1::vector as distance
|
|
436
|
+
FROM documents
|
|
437
|
+
WHERE embedding IS NOT NULL
|
|
438
|
+
ORDER BY embedding <=> $1::vector
|
|
439
|
+
LIMIT $2
|
|
440
|
+
""",
|
|
441
|
+
query_embedding,
|
|
442
|
+
limit,
|
|
443
|
+
)
|
|
444
|
+
|
|
445
|
+
return results
|
|
446
|
+
```
|
|
447
|
+
|
|
448
|
+
#### Create Vector Index
|
|
449
|
+
|
|
450
|
+
```python
|
|
451
|
+
# Create ivfflat index for faster similarity search
|
|
452
|
+
# Note: Requires at least 1000 rows for effective indexing
|
|
453
|
+
await postgres.execute("""
|
|
454
|
+
CREATE INDEX IF NOT EXISTS idx_documents_embedding
|
|
455
|
+
ON documents
|
|
456
|
+
USING ivfflat (embedding vector_cosine_ops)
|
|
457
|
+
WITH (lists = 100);
|
|
458
|
+
""")
|
|
459
|
+
```
|
|
460
|
+
|
|
461
|
+
### Best Practices
|
|
462
|
+
|
|
463
|
+
1. **Batch Processing**
|
|
464
|
+
- Always batch multiple texts in a single API call when possible
|
|
465
|
+
- OpenAI supports up to 2048 inputs per request
|
|
466
|
+
- Reduces API overhead and stays within RPM (requests per minute) limits
|
|
467
|
+
|
|
468
|
+
2. **Rate Limit Management**
|
|
469
|
+
- Uses `tenacity` library for automatic exponential backoff (default: 1 retry with 1s wait)
|
|
470
|
+
- Adjust `max_retries` parameter if needed (default: 1)
|
|
471
|
+
- Monitor your usage and adjust batch size accordingly
|
|
472
|
+
- For large-scale processing, implement a queue system
|
|
473
|
+
|
|
474
|
+
3. **Cost Optimization**
|
|
475
|
+
- Use `text-embedding-3-small` ($0.02/1M tokens) for most use cases
|
|
476
|
+
- Only use `text-embedding-3-large` ($0.13/1M tokens) when higher accuracy is critical
|
|
477
|
+
- Batch requests to minimize API calls
|
|
478
|
+
|
|
479
|
+
4. **Error Handling**
|
|
480
|
+
- Catch `RateLimitError` separately for specific rate limit handling
|
|
481
|
+
- Catch `EmbeddingError` for general API errors
|
|
482
|
+
- Validate `embedding_provider` format early in your code
|
|
483
|
+
|
|
484
|
+
5. **PostgreSQL Performance**
|
|
485
|
+
- Create `ivfflat` indexes after populating data (the index needs existing rows to be effective; aim for 1000+ rows)
|
|
486
|
+
- Use `ivfflat` indexes for approximate nearest neighbor search
|
|
487
|
+
- Consider HNSW indexes for better accuracy (pgvector 0.5.0+)
|
|
488
|
+
- Use `vector_cosine_ops` for cosine distance, matching the `<=>` operator (most common)
|
|
489
|
+
|
|
490
|
+
6. **Environment Variables**
|
|
491
|
+
- Set `LLM__OPENAI_API_KEY` in `.env` for consistency with REM settings
|
|
492
|
+
- Falls back to `OPENAI_API_KEY` for compatibility
|
|
493
|
+
- Never commit API keys to version control
|
|
494
|
+
|
|
495
|
+
### API Reference
|
|
496
|
+
|
|
497
|
+
#### `generate_embeddings()`
|
|
498
|
+
|
|
499
|
+
```python
|
|
500
|
+
def generate_embeddings(
|
|
501
|
+
embedding_provider: str,
|
|
502
|
+
texts: str | list[str],
|
|
503
|
+
api_key: str | None = None,
|
|
504
|
+
max_retries: int = 1,
|
|
505
|
+
) -> list[float] | list[list[float]]:
|
|
506
|
+
"""
|
|
507
|
+
Generate embeddings for text(s) using specified provider.
|
|
508
|
+
|
|
509
|
+
Uses tenacity for automatic retry with exponential backoff on rate limits.
|
|
510
|
+
|
|
511
|
+
Args:
|
|
512
|
+
embedding_provider: Provider and model (e.g., "openai:text-embedding-3-small")
|
|
513
|
+
texts: Single text or list of texts
|
|
514
|
+
api_key: API key (if None, reads from environment)
|
|
515
|
+
max_retries: Maximum retry attempts for rate limits (default: 1)
|
|
516
|
+
|
|
517
|
+
Returns:
|
|
518
|
+
Single embedding (list[float]) or list of embeddings (list[list[float]])
|
|
519
|
+
|
|
520
|
+
Raises:
|
|
521
|
+
EmbeddingError: If generation fails
|
|
522
|
+
RateLimitError: If rate limit exceeded after retries
|
|
523
|
+
ValueError: If provider format is invalid
|
|
524
|
+
"""
|
|
525
|
+
```
|
|
526
|
+
|
|
527
|
+
#### `get_embedding_dimension()`
|
|
528
|
+
|
|
529
|
+
```python
|
|
530
|
+
def get_embedding_dimension(embedding_provider: str) -> int:
|
|
531
|
+
"""
|
|
532
|
+
Get embedding dimension for a provider and model.
|
|
533
|
+
|
|
534
|
+
Args:
|
|
535
|
+
embedding_provider: Provider and model (e.g., "openai:text-embedding-3-small")
|
|
536
|
+
|
|
537
|
+
Returns:
|
|
538
|
+
Embedding dimension (e.g., 1536)
|
|
539
|
+
|
|
540
|
+
Raises:
|
|
541
|
+
ValueError: If provider/model is unknown
|
|
542
|
+
"""
|
|
543
|
+
```
|
|
544
|
+
|
|
545
|
+
### Environment Variables
|
|
546
|
+
|
|
547
|
+
Set in `.env` or environment:
|
|
548
|
+
|
|
549
|
+
```bash
|
|
550
|
+
# OpenAI (preferred format for REM)
|
|
551
|
+
LLM__OPENAI_API_KEY=sk-...
|
|
552
|
+
|
|
553
|
+
# Or standard OpenAI format (fallback)
|
|
554
|
+
OPENAI_API_KEY=sk-...
|
|
555
|
+
|
|
556
|
+
# Anthropic/Voyage AI
|
|
557
|
+
LLM__ANTHROPIC_API_KEY=sk-ant-...
|
|
558
|
+
```
|
|
559
|
+
|
|
560
|
+
### Running the Example
|
|
561
|
+
|
|
562
|
+
```bash
|
|
563
|
+
# Set API key
|
|
564
|
+
export LLM__OPENAI_API_KEY='sk-...'
|
|
565
|
+
|
|
566
|
+
# Run examples
|
|
567
|
+
cd src/rem/utils/examples
|
|
568
|
+
python embeddings_example.py
|
|
569
|
+
```
|
|
570
|
+
|
|
571
|
+
This will demonstrate:
|
|
572
|
+
- Single text embedding
|
|
573
|
+
- Batch processing
|
|
574
|
+
- Multiple providers
|
|
575
|
+
- Error handling
|
|
576
|
+
- PostgreSQL integration patterns
|
|
577
|
+
|
|
578
|
+
### See Also
|
|
579
|
+
|
|
580
|
+
- `examples/embeddings_example.py` - Complete working examples
|
|
581
|
+
- `sql_types.py` - Use `embedding_provider` in `json_schema_extra` for TEXT fields
|
|
582
|
+
- OpenAI Embeddings API: https://platform.openai.com/docs/api-reference/embeddings
|
|
583
|
+
- pgvector Documentation: https://github.com/pgvector/pgvector
|
rem/utils/__init__.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""
|
|
2
|
+
REM Utilities
|
|
3
|
+
|
|
4
|
+
Utility functions and helpers for the REM system:
|
|
5
|
+
- sql_types: Pydantic to PostgreSQL type mapping
|
|
6
|
+
- embeddings: Vector embeddings generation using requests library
|
|
7
|
+
- user_id: Deterministic UUID generation from email addresses
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from .embeddings import (
|
|
11
|
+
EmbeddingError,
|
|
12
|
+
RateLimitError,
|
|
13
|
+
generate_embeddings,
|
|
14
|
+
get_embedding_dimension,
|
|
15
|
+
)
|
|
16
|
+
from .sql_types import (
|
|
17
|
+
get_column_definition,
|
|
18
|
+
get_sql_type,
|
|
19
|
+
model_to_create_table,
|
|
20
|
+
model_to_upsert,
|
|
21
|
+
)
|
|
22
|
+
from .user_id import (
|
|
23
|
+
email_to_user_id,
|
|
24
|
+
is_valid_uuid,
|
|
25
|
+
user_id_to_uuid,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
__all__ = [
|
|
29
|
+
# SQL Types
|
|
30
|
+
"get_sql_type",
|
|
31
|
+
"get_column_definition",
|
|
32
|
+
"model_to_create_table",
|
|
33
|
+
"model_to_upsert",
|
|
34
|
+
# Embeddings
|
|
35
|
+
"generate_embeddings",
|
|
36
|
+
"get_embedding_dimension",
|
|
37
|
+
"EmbeddingError",
|
|
38
|
+
"RateLimitError",
|
|
39
|
+
# User ID
|
|
40
|
+
"email_to_user_id",
|
|
41
|
+
"user_id_to_uuid",
|
|
42
|
+
"is_valid_uuid",
|
|
43
|
+
]
|