remdb 0.3.242__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic. Click here for more details.

Files changed (235) hide show
  1. rem/__init__.py +129 -0
  2. rem/agentic/README.md +760 -0
  3. rem/agentic/__init__.py +54 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +38 -0
  6. rem/agentic/agents/agent_manager.py +311 -0
  7. rem/agentic/agents/sse_simulator.py +502 -0
  8. rem/agentic/context.py +425 -0
  9. rem/agentic/context_builder.py +360 -0
  10. rem/agentic/llm_provider_models.py +301 -0
  11. rem/agentic/mcp/__init__.py +0 -0
  12. rem/agentic/mcp/tool_wrapper.py +273 -0
  13. rem/agentic/otel/__init__.py +5 -0
  14. rem/agentic/otel/setup.py +240 -0
  15. rem/agentic/providers/phoenix.py +926 -0
  16. rem/agentic/providers/pydantic_ai.py +854 -0
  17. rem/agentic/query.py +117 -0
  18. rem/agentic/query_helper.py +89 -0
  19. rem/agentic/schema.py +737 -0
  20. rem/agentic/serialization.py +245 -0
  21. rem/agentic/tools/__init__.py +5 -0
  22. rem/agentic/tools/rem_tools.py +242 -0
  23. rem/api/README.md +657 -0
  24. rem/api/deps.py +253 -0
  25. rem/api/main.py +460 -0
  26. rem/api/mcp_router/prompts.py +182 -0
  27. rem/api/mcp_router/resources.py +820 -0
  28. rem/api/mcp_router/server.py +243 -0
  29. rem/api/mcp_router/tools.py +1605 -0
  30. rem/api/middleware/tracking.py +172 -0
  31. rem/api/routers/admin.py +520 -0
  32. rem/api/routers/auth.py +898 -0
  33. rem/api/routers/chat/__init__.py +5 -0
  34. rem/api/routers/chat/child_streaming.py +394 -0
  35. rem/api/routers/chat/completions.py +702 -0
  36. rem/api/routers/chat/json_utils.py +76 -0
  37. rem/api/routers/chat/models.py +202 -0
  38. rem/api/routers/chat/otel_utils.py +33 -0
  39. rem/api/routers/chat/sse_events.py +546 -0
  40. rem/api/routers/chat/streaming.py +950 -0
  41. rem/api/routers/chat/streaming_utils.py +327 -0
  42. rem/api/routers/common.py +18 -0
  43. rem/api/routers/dev.py +87 -0
  44. rem/api/routers/feedback.py +276 -0
  45. rem/api/routers/messages.py +620 -0
  46. rem/api/routers/models.py +86 -0
  47. rem/api/routers/query.py +362 -0
  48. rem/api/routers/shared_sessions.py +422 -0
  49. rem/auth/README.md +258 -0
  50. rem/auth/__init__.py +36 -0
  51. rem/auth/jwt.py +367 -0
  52. rem/auth/middleware.py +318 -0
  53. rem/auth/providers/__init__.py +16 -0
  54. rem/auth/providers/base.py +376 -0
  55. rem/auth/providers/email.py +215 -0
  56. rem/auth/providers/google.py +163 -0
  57. rem/auth/providers/microsoft.py +237 -0
  58. rem/cli/README.md +517 -0
  59. rem/cli/__init__.py +8 -0
  60. rem/cli/commands/README.md +299 -0
  61. rem/cli/commands/__init__.py +3 -0
  62. rem/cli/commands/ask.py +549 -0
  63. rem/cli/commands/cluster.py +1808 -0
  64. rem/cli/commands/configure.py +495 -0
  65. rem/cli/commands/db.py +828 -0
  66. rem/cli/commands/dreaming.py +324 -0
  67. rem/cli/commands/experiments.py +1698 -0
  68. rem/cli/commands/mcp.py +66 -0
  69. rem/cli/commands/process.py +388 -0
  70. rem/cli/commands/query.py +109 -0
  71. rem/cli/commands/scaffold.py +47 -0
  72. rem/cli/commands/schema.py +230 -0
  73. rem/cli/commands/serve.py +106 -0
  74. rem/cli/commands/session.py +453 -0
  75. rem/cli/dreaming.py +363 -0
  76. rem/cli/main.py +123 -0
  77. rem/config.py +244 -0
  78. rem/mcp_server.py +41 -0
  79. rem/models/core/__init__.py +49 -0
  80. rem/models/core/core_model.py +70 -0
  81. rem/models/core/engram.py +333 -0
  82. rem/models/core/experiment.py +672 -0
  83. rem/models/core/inline_edge.py +132 -0
  84. rem/models/core/rem_query.py +246 -0
  85. rem/models/entities/__init__.py +68 -0
  86. rem/models/entities/domain_resource.py +38 -0
  87. rem/models/entities/feedback.py +123 -0
  88. rem/models/entities/file.py +57 -0
  89. rem/models/entities/image_resource.py +88 -0
  90. rem/models/entities/message.py +64 -0
  91. rem/models/entities/moment.py +123 -0
  92. rem/models/entities/ontology.py +181 -0
  93. rem/models/entities/ontology_config.py +131 -0
  94. rem/models/entities/resource.py +95 -0
  95. rem/models/entities/schema.py +87 -0
  96. rem/models/entities/session.py +84 -0
  97. rem/models/entities/shared_session.py +180 -0
  98. rem/models/entities/subscriber.py +175 -0
  99. rem/models/entities/user.py +93 -0
  100. rem/py.typed +0 -0
  101. rem/registry.py +373 -0
  102. rem/schemas/README.md +507 -0
  103. rem/schemas/__init__.py +6 -0
  104. rem/schemas/agents/README.md +92 -0
  105. rem/schemas/agents/core/agent-builder.yaml +235 -0
  106. rem/schemas/agents/core/moment-builder.yaml +178 -0
  107. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  108. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  109. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  110. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  111. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  112. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  113. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  114. rem/schemas/agents/examples/hello-world.yaml +37 -0
  115. rem/schemas/agents/examples/query.yaml +54 -0
  116. rem/schemas/agents/examples/simple.yaml +21 -0
  117. rem/schemas/agents/examples/test.yaml +29 -0
  118. rem/schemas/agents/rem.yaml +132 -0
  119. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  120. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  121. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  122. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  123. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  124. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  125. rem/services/__init__.py +18 -0
  126. rem/services/audio/INTEGRATION.md +308 -0
  127. rem/services/audio/README.md +376 -0
  128. rem/services/audio/__init__.py +15 -0
  129. rem/services/audio/chunker.py +354 -0
  130. rem/services/audio/transcriber.py +259 -0
  131. rem/services/content/README.md +1269 -0
  132. rem/services/content/__init__.py +5 -0
  133. rem/services/content/providers.py +760 -0
  134. rem/services/content/service.py +762 -0
  135. rem/services/dreaming/README.md +230 -0
  136. rem/services/dreaming/__init__.py +53 -0
  137. rem/services/dreaming/affinity_service.py +322 -0
  138. rem/services/dreaming/moment_service.py +251 -0
  139. rem/services/dreaming/ontology_service.py +54 -0
  140. rem/services/dreaming/user_model_service.py +297 -0
  141. rem/services/dreaming/utils.py +39 -0
  142. rem/services/email/__init__.py +10 -0
  143. rem/services/email/service.py +522 -0
  144. rem/services/email/templates.py +360 -0
  145. rem/services/embeddings/__init__.py +11 -0
  146. rem/services/embeddings/api.py +127 -0
  147. rem/services/embeddings/worker.py +435 -0
  148. rem/services/fs/README.md +662 -0
  149. rem/services/fs/__init__.py +62 -0
  150. rem/services/fs/examples.py +206 -0
  151. rem/services/fs/examples_paths.py +204 -0
  152. rem/services/fs/git_provider.py +935 -0
  153. rem/services/fs/local_provider.py +760 -0
  154. rem/services/fs/parsing-hooks-examples.md +172 -0
  155. rem/services/fs/paths.py +276 -0
  156. rem/services/fs/provider.py +460 -0
  157. rem/services/fs/s3_provider.py +1042 -0
  158. rem/services/fs/service.py +186 -0
  159. rem/services/git/README.md +1075 -0
  160. rem/services/git/__init__.py +17 -0
  161. rem/services/git/service.py +469 -0
  162. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  163. rem/services/phoenix/README.md +453 -0
  164. rem/services/phoenix/__init__.py +46 -0
  165. rem/services/phoenix/client.py +960 -0
  166. rem/services/phoenix/config.py +88 -0
  167. rem/services/phoenix/prompt_labels.py +477 -0
  168. rem/services/postgres/README.md +757 -0
  169. rem/services/postgres/__init__.py +49 -0
  170. rem/services/postgres/diff_service.py +599 -0
  171. rem/services/postgres/migration_service.py +427 -0
  172. rem/services/postgres/programmable_diff_service.py +635 -0
  173. rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
  174. rem/services/postgres/register_type.py +353 -0
  175. rem/services/postgres/repository.py +481 -0
  176. rem/services/postgres/schema_generator.py +661 -0
  177. rem/services/postgres/service.py +802 -0
  178. rem/services/postgres/sql_builder.py +355 -0
  179. rem/services/rate_limit.py +113 -0
  180. rem/services/rem/README.md +318 -0
  181. rem/services/rem/__init__.py +23 -0
  182. rem/services/rem/exceptions.py +71 -0
  183. rem/services/rem/executor.py +293 -0
  184. rem/services/rem/parser.py +180 -0
  185. rem/services/rem/queries.py +196 -0
  186. rem/services/rem/query.py +371 -0
  187. rem/services/rem/service.py +608 -0
  188. rem/services/session/README.md +374 -0
  189. rem/services/session/__init__.py +13 -0
  190. rem/services/session/compression.py +488 -0
  191. rem/services/session/pydantic_messages.py +310 -0
  192. rem/services/session/reload.py +85 -0
  193. rem/services/user_service.py +130 -0
  194. rem/settings.py +1877 -0
  195. rem/sql/background_indexes.sql +52 -0
  196. rem/sql/migrations/001_install.sql +983 -0
  197. rem/sql/migrations/002_install_models.sql +3157 -0
  198. rem/sql/migrations/003_optional_extensions.sql +326 -0
  199. rem/sql/migrations/004_cache_system.sql +282 -0
  200. rem/sql/migrations/005_schema_update.sql +145 -0
  201. rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
  202. rem/utils/AGENTIC_CHUNKING.md +597 -0
  203. rem/utils/README.md +628 -0
  204. rem/utils/__init__.py +61 -0
  205. rem/utils/agentic_chunking.py +622 -0
  206. rem/utils/batch_ops.py +343 -0
  207. rem/utils/chunking.py +108 -0
  208. rem/utils/clip_embeddings.py +276 -0
  209. rem/utils/constants.py +97 -0
  210. rem/utils/date_utils.py +228 -0
  211. rem/utils/dict_utils.py +98 -0
  212. rem/utils/embeddings.py +436 -0
  213. rem/utils/examples/embeddings_example.py +305 -0
  214. rem/utils/examples/sql_types_example.py +202 -0
  215. rem/utils/files.py +323 -0
  216. rem/utils/markdown.py +16 -0
  217. rem/utils/mime_types.py +158 -0
  218. rem/utils/model_helpers.py +492 -0
  219. rem/utils/schema_loader.py +649 -0
  220. rem/utils/sql_paths.py +146 -0
  221. rem/utils/sql_types.py +350 -0
  222. rem/utils/user_id.py +81 -0
  223. rem/utils/vision.py +325 -0
  224. rem/workers/README.md +506 -0
  225. rem/workers/__init__.py +7 -0
  226. rem/workers/db_listener.py +579 -0
  227. rem/workers/db_maintainer.py +74 -0
  228. rem/workers/dreaming.py +502 -0
  229. rem/workers/engram_processor.py +312 -0
  230. rem/workers/sqs_file_processor.py +193 -0
  231. rem/workers/unlogged_maintainer.py +463 -0
  232. remdb-0.3.242.dist-info/METADATA +1632 -0
  233. remdb-0.3.242.dist-info/RECORD +235 -0
  234. remdb-0.3.242.dist-info/WHEEL +4 -0
  235. remdb-0.3.242.dist-info/entry_points.txt +2 -0
rem/utils/README.md ADDED
@@ -0,0 +1,628 @@
1
+ # REM Utilities
2
+
3
+ ## Table of Contents
4
+
5
+ 1. [SQL Types](#sql-types-sql_typespy) - Pydantic to PostgreSQL type mapping
6
+ 2. [Embeddings](#embeddings-embeddingspy) - Vector embeddings generation
7
+ 3. [Files](#files-filespy) - File utilities and DataFrame I/O
8
+
9
+ ## SQL Types (`sql_types.py`)
10
+
11
+ Intelligent Pydantic to PostgreSQL type mapping utility for generating database schemas and UPSERT statements.
12
+
13
+ ### Features
14
+
15
+ - **Smart String Handling**: VARCHAR(256) by default, TEXT for content/description fields
16
+ - **Union Type Preferences**: Prioritizes UUID and JSONB in Union types
17
+ - **Array Support**: PostgreSQL arrays for `list[str]`, JSONB for complex lists
18
+ - **JSONB for Structured Data**: Automatic JSONB for dicts and nested structures
19
+ - **Custom Type Overrides**: Support for `sql_type` in `json_schema_extra`
20
+ - **Embedding Field Detection**: Auto-detects embedding fields via `embedding_provider`
21
+ - **Schema Generation**: CREATE TABLE with appropriate indexes
22
+ - **UPSERT Templates**: INSERT ... ON CONFLICT UPDATE statements
23
+
24
+ ### Type Mapping Rules
25
+
26
+ | Pydantic Type | PostgreSQL Type | Notes |
27
+ |---------------|-----------------|-------|
28
+ | `str` | `VARCHAR(256)` | Default for strings |
29
+ | `str` (field name: content, description, summary, etc.) | `TEXT` | Long-form text fields |
30
+ | `str` (with `embedding_provider`) | `TEXT` | For vector search preprocessing |
31
+ | `int` | `INTEGER` | Standard integer |
32
+ | `float` | `DOUBLE PRECISION` | Floating point |
33
+ | `bool` | `BOOLEAN` | Boolean values |
34
+ | `UUID` | `UUID` | PostgreSQL UUID type |
35
+ | `datetime` | `TIMESTAMP` | Timestamp without timezone |
36
+ | `date` | `DATE` | Date only |
37
+ | `dict` | `JSONB` | Structured JSON data |
38
+ | `list[str]` | `TEXT[]` | PostgreSQL array |
39
+ | `list[dict]` | `JSONB` | Complex nested data |
40
+ | `UUID \| str` | `UUID` | Prefers UUID in unions |
41
+ | `dict \| None` | `JSONB` | Prefers JSONB in unions |
42
+
43
+ ### Long-Form Text Field Names
44
+
45
+ Fields with these names automatically use TEXT:
46
+ - `content`, `description`, `summary`
47
+ - `instructions`, `prompt`, `message`
48
+ - `body`, `text`, `note`, `comment`
49
+ - Fields ending with `_content`, `_description`, `_summary`, `_text`, `_message`
50
+
51
+ ### Usage Examples
52
+
53
+ #### Basic Type Mapping
54
+
55
+ ```python
56
+ from pydantic import Field
57
+ from rem.utils.sql_types import get_sql_type
58
+
59
+ # String field
60
+ field = Field(default="")
61
+ get_sql_type(field, "name") # "VARCHAR(256)"
62
+
63
+ # Content field (detected by name)
64
+ field = Field(default="")
65
+ get_sql_type(field, "content") # "TEXT"
66
+
67
+ # Dict field
68
+ field = Field(default_factory=dict)
69
+ get_sql_type(field, "metadata") # "JSONB"
70
+
71
+ # List of strings
72
+ field = Field(default_factory=list)
73
+ get_sql_type(field, "tags") # "TEXT[]"
74
+ ```
75
+
76
+ #### Custom Type Override
77
+
78
+ ```python
79
+ from pydantic import BaseModel, Field
80
+
81
+ class Document(BaseModel):
82
+ # Custom pgvector type
83
+ embedding: list[float] = Field(
84
+ default_factory=list,
85
+ json_schema_extra={"sql_type": "vector(1536)"}
86
+ )
87
+
88
+ # Embedding provider detection (format: provider:model_name)
89
+ content: str = Field(
90
+ default="",
91
+ json_schema_extra={"embedding_provider": "openai:text-embedding-3-small"}
92
+ ) # Will use TEXT
93
+
94
+ # Alternative embedding providers
95
+ description: str = Field(
96
+ default="",
97
+ json_schema_extra={"embedding_provider": "anthropic:voyage-2"}
98
+ ) # Will use TEXT
99
+ ```
100
+
101
+ #### Generate CREATE TABLE
102
+
103
+ ```python
104
+ from rem.models.entities import Resource
105
+ from rem.utils.sql_types import model_to_create_table
106
+
107
+ sql = model_to_create_table(Resource, "resources")
108
+ print(sql)
109
+ ```
110
+
111
+ Output:
112
+ ```sql
113
+ CREATE TABLE IF NOT EXISTS resources (
114
+ id UUID PRIMARY KEY,
115
+ name VARCHAR(256) NOT NULL,
116
+ uri VARCHAR(256),
117
+ content TEXT NOT NULL DEFAULT '',
118
+ description TEXT,
119
+ category VARCHAR(256),
120
+ related_entities JSONB NOT NULL DEFAULT '{}'::jsonb
121
+ );
122
+
123
+ -- Indexes
124
+ CREATE INDEX IF NOT EXISTS idx_resources_tenant_id ON resources(tenant_id);
125
+ CREATE INDEX IF NOT EXISTS idx_resources_related_entities ON resources USING GIN(related_entities);
126
+ ```
127
+
128
+ #### Generate UPSERT Statement
129
+
130
+ ```python
131
+ from rem.models.entities import Resource
132
+ from rem.utils.sql_types import model_to_upsert
133
+
134
+ sql = model_to_upsert(Resource, "resources")
135
+ print(sql)
136
+ ```
137
+
138
+ Output:
139
+ ```sql
140
+ INSERT INTO resources (id, name, uri, content, description, category, related_entities)
141
+ VALUES ($1, $2, $3, $4, $5, $6, $7)
142
+ ON CONFLICT (id)
143
+ DO UPDATE SET name = EXCLUDED.name, uri = EXCLUDED.uri, content = EXCLUDED.content,
144
+ description = EXCLUDED.description, category = EXCLUDED.category,
145
+ related_entities = EXCLUDED.related_entities;
146
+ ```
147
+
148
+ #### Complete Column Definition
149
+
150
+ ```python
151
+ from pydantic import Field
152
+ from rem.utils.sql_types import get_column_definition
153
+
154
+ # Primary key
155
+ field = Field(..., description="User ID")
156
+ get_column_definition(field, "id", nullable=False, primary_key=True)
157
+ # "id UUID PRIMARY KEY"
158
+
159
+ # Optional field
160
+ field = Field(default=None)
161
+ get_column_definition(field, "email", nullable=True)
162
+ # "email VARCHAR(256)"
163
+
164
+ # JSONB with default
165
+ field = Field(default_factory=dict)
166
+ get_column_definition(field, "metadata", nullable=False)
167
+ # "metadata JSONB NOT NULL DEFAULT '{}'::jsonb"
168
+ ```
169
+
170
+ ### Index Generation
171
+
172
+ The utility automatically creates indexes for:
173
+
174
+ 1. **Foreign Keys**: Fields ending with `_id` (e.g., `user_id`, `tenant_id`)
175
+ 2. **JSONB Fields**: GIN indexes for efficient querying
176
+ 3. **Array Fields**: GIN indexes for array containment queries
177
+ 4. **Primary Keys**: Automatically indexed
178
+
179
+ ### Integration with REM Models
180
+
181
+ ```python
182
+ # Generate schema for all REM entities
183
+ from rem.models.entities import Resource, Message, User, File, Moment, Schema
184
+ from rem.utils.sql_types import model_to_create_table
185
+
186
+ for model, table_name in [
187
+ (Resource, "resources"),
188
+ (Message, "messages"),
189
+ (User, "users"),
190
+ (File, "files"),
191
+ (Moment, "moments"),
192
+ (Schema, "schemas"),
193
+ ]:
194
+ sql = model_to_create_table(model, table_name)
195
+ with open(f"migrations/{table_name}.sql", "w") as f:
196
+ f.write(sql)
197
+ ```
198
+
199
+ ### Best Practices (from Research)
200
+
201
+ Based on PostgreSQL documentation and community best practices:
202
+
203
+ 1. **VARCHAR(256) for Most Strings**: Good balance between validation and flexibility
204
+ 2. **TEXT for Long Content**: No performance penalty, better for variable-length text
205
+ 3. **JSONB over JSON**: Better querying capabilities, GIN indexing support
206
+ 4. **Arrays for Simple Lists**: More efficient than JSONB for simple string/int lists
207
+ 5. **Consistent Typing**: Use one approach throughout your schema for maintainability
208
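+ 6. **Index Size Limits**: PostgreSQL B-tree index entries are limited to roughly one third of a page (about 2.7 kB with the default 8 kB page size); constrain the length of any TEXT field that is B-tree indexed, or index a hash or prefix of the value instead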
209
+
210
+ ### Running the Example
211
+
212
+ ```bash
213
+ cd src/rem/utils/examples
214
+ python sql_types_example.py
215
+ ```
216
+
217
+ This will demonstrate:
218
+ - Field type mapping
219
+ - Column definitions
220
+ - CREATE TABLE generation
221
+ - UPSERT statement generation
222
+
223
+ ### See Also
224
+
225
+ - `examples/sql_types_example.py` - Complete working examples
226
+ - `../../models/entities/` - REM entity models
227
+ - `../../models/core/core_model.py` - CoreModel base class
228
+
229
+ ---
230
+
231
+ ## Embeddings (`embeddings.py`)
232
+
233
+ Vector embeddings generation utility using HTTP requests (no provider SDKs required). Supports batch processing for efficient API usage and automatic retry with exponential backoff using `tenacity`.
234
+
235
+ ### Features
236
+
237
+ - **No SDK Dependencies**: Uses `requests` library for HTTP calls
238
+ - **Batch Processing**: Generate embeddings for multiple texts in a single API call
239
+ - **Multiple Providers**: OpenAI (text-embedding-3-small, text-embedding-3-large, text-embedding-ada-002), Voyage AI (voyage-2, voyage-large-2)
240
+ - **Automatic Retries**: Uses `tenacity` library for exponential backoff on rate limits
241
+ - **Provider Format**: Uses `provider:model_name` format (e.g., `openai:text-embedding-3-small`)
242
+ - **Environment Variables**: API keys from `LLM__OPENAI_API_KEY` or `OPENAI_API_KEY`
243
+ - **Error Handling**: Custom exceptions for embedding errors and rate limits
244
+
245
+ ### Supported Models
246
+
247
+ | Provider | Model | Dimensions | Cost (per 1M tokens) |
248
+ |----------|-------|------------|---------------------|
249
+ | OpenAI | text-embedding-3-small | 1536 | $0.02 |
250
+ | OpenAI | text-embedding-3-large | 3072 | $0.13 |
251
+ | OpenAI | text-embedding-ada-002 | 1536 | $0.10 |
252
+ | Voyage AI | voyage-2 | 1024 | Varies |
253
+ | Voyage AI | voyage-large-2 | 1536 | Varies |
254
+
255
+ ### Usage Examples
256
+
257
+ #### Single Text Embedding
258
+
259
+ ```python
260
+ from rem.utils.embeddings import generate_embeddings
261
+
262
+ # Generate embedding for single text
263
+ embedding = generate_embeddings(
264
+ "openai:text-embedding-3-small",
265
+ "What is the meaning of life?"
266
+ )
267
+
268
+ # Result: list[float] with 1536 dimensions
269
+ print(f"Embedding dimension: {len(embedding)}") # 1536
270
+ print(f"First 5 values: {embedding[:5]}")
271
+ ```
272
+
273
+ #### Batch Processing (Recommended)
274
+
275
+ ```python
276
+ from rem.utils.embeddings import generate_embeddings
277
+
278
+ # Generate embeddings for multiple texts (more efficient)
279
+ texts = [
280
+ "What is machine learning?",
281
+ "How does neural network work?",
282
+ "Explain deep learning",
283
+ ]
284
+
285
+ embeddings = generate_embeddings(
286
+ "openai:text-embedding-3-small",
287
+ texts
288
+ )
289
+
290
+ # Result: list[list[float]] - one embedding per text
291
+ print(f"Generated {len(embeddings)} embeddings")
292
+ for i, embedding in enumerate(embeddings):
293
+ print(f"Text {i+1}: {len(embedding)} dimensions")
294
+ ```
295
+
296
+ #### Get Embedding Dimension
297
+
298
+ ```python
299
+ from rem.utils.embeddings import get_embedding_dimension
300
+
301
+ # Get dimension for a model (useful for creating vector columns)
302
+ dimension = get_embedding_dimension("openai:text-embedding-3-small")
303
+ print(f"Dimension: {dimension}") # 1536
304
+
305
+ # Create PostgreSQL vector column with correct dimension
306
+ # CREATE TABLE documents (
307
+ # id UUID PRIMARY KEY,
308
+ # content TEXT,
309
+ # embedding vector(1536) -- Use dimension from get_embedding_dimension
310
+ # );
311
+ ```
312
+
313
+ #### Error Handling
314
+
315
+ ```python
316
+ from rem.utils.embeddings import (
317
+ generate_embeddings,
318
+ EmbeddingError,
319
+ RateLimitError,
320
+ )
321
+
322
+ try:
323
+ embeddings = generate_embeddings(
324
+ "openai:text-embedding-3-small",
325
+ texts,
326
+ max_retries=2, # Optional: increase retries if needed
327
+ )
328
+ except RateLimitError as e:
329
+ print(f"Rate limit exceeded after retries: {e}")
330
+ # All retries exhausted, implement queue or wait longer
331
+ except EmbeddingError as e:
332
+ print(f"Embedding generation failed: {e}")
333
+ ```
334
+
335
+ #### Custom API Key
336
+
337
+ ```python
338
+ from rem.utils.embeddings import generate_embeddings
339
+
340
+ # Provide API key explicitly (instead of environment variable)
341
+ embedding = generate_embeddings(
342
+ "openai:text-embedding-3-small",
343
+ "Hello world",
344
+ api_key="sk-..."
345
+ )
346
+ ```
347
+
348
+ ### PostgreSQL Integration
349
+
350
+ #### Add Embedding Column
351
+
352
+ ```python
353
+ from rem.utils.embeddings import get_embedding_dimension
354
+
355
+ # Get dimension for the model you'll use
356
+ dimension = get_embedding_dimension("openai:text-embedding-3-small")
357
+
358
+ # Create vector column (requires pgvector extension)
359
+ await postgres.execute(f"""
360
+ ALTER TABLE documents
361
+ ADD COLUMN IF NOT EXISTS embedding vector({dimension})
362
+ """)
363
+ ```
364
+
365
+ #### Generate and Store Embeddings
366
+
367
+ ```python
368
+ from rem.utils.embeddings import generate_embeddings
369
+
370
+ # Single record
371
+ async def add_document_with_embedding(content: str):
372
+ # Generate embedding
373
+ embedding = generate_embeddings(
374
+ "openai:text-embedding-3-small",
375
+ content
376
+ )
377
+
378
+ # Store in database
379
+ await postgres.execute(
380
+ """
381
+ INSERT INTO documents (id, content, embedding)
382
+ VALUES ($1, $2, $3::vector)
383
+ """,
384
+ doc_id,
385
+ content,
386
+ embedding,
387
+ )
388
+
389
+ # Batch processing (efficient)
390
+ async def batch_generate_embeddings(batch_size: int = 100):
391
+ # Get records without embeddings
392
+ records = await postgres.fetch_all("""
393
+ SELECT id, content
394
+ FROM documents
395
+ WHERE embedding IS NULL
396
+ LIMIT $1
397
+ """, batch_size)
398
+
399
+ # Extract texts
400
+ texts = [r["content"] for r in records]
401
+
402
+ # Generate all embeddings in one API call
403
+ embeddings = generate_embeddings(
404
+ "openai:text-embedding-3-small",
405
+ texts
406
+ )
407
+
408
+ # Store embeddings
409
+ for record, embedding in zip(records, embeddings):
410
+ await postgres.execute(
411
+ """
412
+ UPDATE documents
413
+ SET embedding = $1::vector
414
+ WHERE id = $2
415
+ """,
416
+ embedding,
417
+ record["id"],
418
+ )
419
+ ```
420
+
421
+ #### Similarity Search
422
+
423
+ ```python
424
+ # Vector similarity search using pgvector
425
+ async def search_similar_documents(query: str, limit: int = 10):
426
+ # Generate query embedding
427
+ query_embedding = generate_embeddings(
428
+ "openai:text-embedding-3-small",
429
+ query
430
+ )
431
+
432
+ # Search by cosine distance (pgvector <=> operator); smaller distance means more similar
433
+ results = await postgres.fetch_all(
434
+ """
435
+ SELECT id, content,
436
+ embedding <=> $1::vector as distance
437
+ FROM documents
438
+ WHERE embedding IS NOT NULL
439
+ ORDER BY embedding <=> $1::vector
440
+ LIMIT $2
441
+ """,
442
+ query_embedding,
443
+ limit,
444
+ )
445
+
446
+ return results
447
+ ```
448
+
449
+ #### Create Vector Index
450
+
451
+ ```python
452
+ # Create ivfflat index for faster similarity search
453
+ # Note: Requires at least 1000 rows for effective indexing
454
+ await postgres.execute("""
455
+ CREATE INDEX IF NOT EXISTS idx_documents_embedding
456
+ ON documents
457
+ USING ivfflat (embedding vector_cosine_ops)
458
+ WITH (lists = 100);
459
+ """)
460
+ ```
461
+
462
+ ### Best Practices
463
+
464
+ 1. **Batch Processing**
465
+ - Always batch multiple texts in a single API call when possible
466
+ - OpenAI supports up to 2048 inputs per request
467
+ - Reduces API overhead and stays within RPM (requests per minute) limits
468
+
469
+ 2. **Rate Limit Management**
470
+ - Uses `tenacity` library for automatic exponential backoff (default: 1 retry with 1s wait)
471
+ - Adjust `max_retries` parameter if needed (default: 1)
472
+ - Monitor your usage and adjust batch size accordingly
473
+ - For large-scale processing, implement a queue system
474
+
475
+ 3. **Cost Optimization**
476
+ - Use `text-embedding-3-small` ($0.02/1M tokens) for most use cases
477
+ - Only use `text-embedding-3-large` ($0.13/1M tokens) when higher accuracy is critical
478
+ - Batch requests to minimize API calls
479
+
480
+ 4. **Error Handling**
481
+ - Catch `RateLimitError` separately for specific rate limit handling
482
+ - Catch `EmbeddingError` for general API errors
483
+ - Validate `embedding_provider` format early in your code
484
+
485
+ 5. **PostgreSQL Performance**
486
+ - Create `ivfflat` indexes after populating data; the index is built from existing rows, so wait until the table has 1000+ rows
487
+ - Use `ivfflat` indexes for approximate nearest neighbor search
488
+ - Consider HNSW indexes for a better speed-recall tradeoff, at the cost of slower builds and more memory (pgvector 0.5.0+)
489
+ - Use `vector_cosine_ops` for cosine similarity (most common)
490
+
491
+ 6. **Environment Variables**
492
+ - Set `LLM__OPENAI_API_KEY` in `.env` for consistency with REM settings
493
+ - Falls back to `OPENAI_API_KEY` for compatibility
494
+ - Never commit API keys to version control
495
+
496
+ ### API Reference
497
+
498
+ #### `generate_embeddings()`
499
+
500
+ ```python
501
+ def generate_embeddings(
502
+ embedding_provider: str,
503
+ texts: str | list[str],
504
+ api_key: str | None = None,
505
+ max_retries: int = 1,
506
+ ) -> list[float] | list[list[float]]:
507
+ """
508
+ Generate embeddings for text(s) using specified provider.
509
+
510
+ Uses tenacity for automatic retry with exponential backoff on rate limits.
511
+
512
+ Args:
513
+ embedding_provider: Provider and model (e.g., "openai:text-embedding-3-small")
514
+ texts: Single text or list of texts
515
+ api_key: API key (if None, reads from environment)
516
+ max_retries: Maximum retry attempts for rate limits (default: 1)
517
+
518
+ Returns:
519
+ Single embedding (list[float]) or list of embeddings (list[list[float]])
520
+
521
+ Raises:
522
+ EmbeddingError: If generation fails
523
+ RateLimitError: If rate limit exceeded after retries
524
+ ValueError: If provider format is invalid
525
+ """
526
+ ```
527
+
528
+ #### `get_embedding_dimension()`
529
+
530
+ ```python
531
+ def get_embedding_dimension(embedding_provider: str) -> int:
532
+ """
533
+ Get embedding dimension for a provider and model.
534
+
535
+ Args:
536
+ embedding_provider: Provider and model (e.g., "openai:text-embedding-3-small")
537
+
538
+ Returns:
539
+ Embedding dimension (e.g., 1536)
540
+
541
+ Raises:
542
+ ValueError: If provider/model is unknown
543
+ """
544
+ ```
545
+
546
+ ### Environment Variables
547
+
548
+ Set in `.env` or environment:
549
+
550
+ ```bash
551
+ # OpenAI (preferred format for REM)
552
+ LLM__OPENAI_API_KEY=sk-...
553
+
554
+ # Or standard OpenAI format (fallback)
555
+ OPENAI_API_KEY=sk-...
556
+
557
+ # Anthropic/Voyage AI
558
+ LLM__ANTHROPIC_API_KEY=sk-ant-...
559
+ ```
560
+
561
+ ### Running the Example
562
+
563
+ ```bash
564
+ # Set API key
565
+ export LLM__OPENAI_API_KEY='sk-...'
566
+
567
+ # Run examples
568
+ cd src/rem/utils/examples
569
+ python embeddings_example.py
570
+ ```
571
+
572
+ This will demonstrate:
573
+ - Single text embedding
574
+ - Batch processing
575
+ - Multiple providers
576
+ - Error handling
577
+ - PostgreSQL integration patterns
578
+
579
+ ### See Also
580
+
581
+ - `examples/embeddings_example.py` - Complete working examples
582
+ - `sql_types.py` - Use `embedding_provider` in json_schema_extra for TEXT fields
583
+ - OpenAI Embeddings API: https://platform.openai.com/docs/api-reference/embeddings
584
+ - pgvector Documentation: https://github.com/pgvector/pgvector
585
+
586
+ ---
587
+
588
+ ## Files (`files.py`)
589
+
590
+ File utilities including temporary file handling and DataFrame I/O with automatic format detection.
591
+
592
+ ### DataFrame I/O
593
+
594
+ Read and write DataFrames with format auto-detected from file extension:
595
+
596
+ ```python
597
+ from rem.utils.files import read_dataframe, write_dataframe
598
+
599
+ # Read - format inferred from extension
600
+ df = read_dataframe("data.csv")
601
+ df = read_dataframe("data.parquet")
602
+ df = read_dataframe("data.xlsx")
603
+
604
+ # Read from bytes (e.g., from S3)
605
+ df = read_dataframe(content_bytes, filename="data.csv")
606
+
607
+ # Write - format inferred from extension
608
+ write_dataframe(df, "output.parquet")
609
+ ```
610
+
611
+ **Supported formats**: `.csv`, `.tsv`, `.parquet`, `.json`, `.jsonl`, `.avro`, `.xlsx`, `.xls`, `.ods`, `.ipc`, `.arrow`, `.feather`
612
+
613
+ Note: Some formats require optional dependencies (e.g., `fastexcel` for Excel).
614
+
615
+ ### Temporary File Utilities
616
+
617
+ ```python
618
+ from rem.utils.files import temp_file_from_bytes, temp_directory
619
+
620
+ # Create temp file from bytes, auto-cleanup
621
+ with temp_file_from_bytes(pdf_bytes, suffix=".pdf") as tmp_path:
622
+ result = process_pdf(tmp_path)
623
+
624
+ # Create temp directory, auto-cleanup
625
+ with temp_directory() as tmp_dir:
626
+ # Work with files in tmp_dir
627
+ pass
628
+ ```
rem/utils/__init__.py ADDED
@@ -0,0 +1,61 @@
1
+ """
2
+ REM Utilities
3
+
4
+ Utility functions and helpers for the REM system:
5
+ - sql_types: Pydantic to PostgreSQL type mapping
6
+ - embeddings: Vector embeddings generation using requests library
7
+ - user_id: Deterministic UUID generation from email addresses
8
+ - sql_paths: SQL file path resolution for packages and user migrations
9
+ """
10
+
11
+ from .embeddings import (
12
+ EmbeddingError,
13
+ RateLimitError,
14
+ generate_embeddings,
15
+ get_embedding_dimension,
16
+ )
17
+ from .sql_types import (
18
+ get_column_definition,
19
+ get_sql_type,
20
+ model_to_create_table,
21
+ model_to_upsert,
22
+ )
23
+ from .user_id import (
24
+ email_to_user_id,
25
+ is_valid_uuid,
26
+ user_id_to_uuid,
27
+ )
28
+ from .sql_paths import (
29
+ USER_SQL_DIR_CONVENTION,
30
+ get_package_sql_dir,
31
+ get_package_migrations_dir,
32
+ get_user_sql_dir,
33
+ list_package_migrations,
34
+ list_user_migrations,
35
+ list_all_migrations,
36
+ )
37
+
38
+ __all__ = [
39
+ # SQL Types
40
+ "get_sql_type",
41
+ "get_column_definition",
42
+ "model_to_create_table",
43
+ "model_to_upsert",
44
+ # Embeddings
45
+ "generate_embeddings",
46
+ "get_embedding_dimension",
47
+ "EmbeddingError",
48
+ "RateLimitError",
49
+ # User ID
50
+ "email_to_user_id",
51
+ "user_id_to_uuid",
52
+ "is_valid_uuid",
53
+ # SQL Paths
54
+ "USER_SQL_DIR_CONVENTION",
55
+ "get_package_sql_dir",
56
+ "get_package_migrations_dir",
57
+ "get_user_sql_dir",
58
+ "list_package_migrations",
59
+ "list_user_migrations",
60
+ "list_all_migrations",
61
+ ]