remdb 0.2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of remdb might be problematic. Click here for more details.
- rem/__init__.py +2 -0
- rem/agentic/README.md +650 -0
- rem/agentic/__init__.py +39 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +8 -0
- rem/agentic/context.py +148 -0
- rem/agentic/context_builder.py +329 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +107 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +151 -0
- rem/agentic/providers/phoenix.py +674 -0
- rem/agentic/providers/pydantic_ai.py +572 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +396 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +231 -0
- rem/api/README.md +420 -0
- rem/api/main.py +324 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +536 -0
- rem/api/mcp_router/server.py +213 -0
- rem/api/mcp_router/tools.py +584 -0
- rem/api/routers/auth.py +229 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/completions.py +281 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +124 -0
- rem/api/routers/chat/streaming.py +185 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +26 -0
- rem/auth/middleware.py +100 -0
- rem/auth/providers/__init__.py +13 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +455 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +126 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +565 -0
- rem/cli/commands/configure.py +423 -0
- rem/cli/commands/db.py +493 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1124 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +245 -0
- rem/cli/commands/schema.py +183 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +88 -0
- rem/config.py +237 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +64 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +628 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +243 -0
- rem/models/entities/__init__.py +43 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +35 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +191 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/user.py +85 -0
- rem/py.typed +0 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +128 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +16 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +806 -0
- rem/services/content/service.py +657 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +336 -0
- rem/services/dreaming/moment_service.py +264 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +120 -0
- rem/services/embeddings/worker.py +421 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +686 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +575 -0
- rem/services/postgres/__init__.py +23 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
- rem/services/postgres/register_type.py +352 -0
- rem/services/postgres/repository.py +337 -0
- rem/services/postgres/schema_generator.py +379 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +354 -0
- rem/services/rem/README.md +304 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +145 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +527 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +6 -0
- rem/services/session/compression.py +360 -0
- rem/services/session/reload.py +77 -0
- rem/settings.py +1235 -0
- rem/sql/002_install_models.sql +1068 -0
- rem/sql/background_indexes.sql +42 -0
- rem/sql/install_models.sql +1038 -0
- rem/sql/migrations/001_install.sql +503 -0
- rem/sql/migrations/002_install_models.sql +1202 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +583 -0
- rem/utils/__init__.py +43 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +423 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/markdown.py +16 -0
- rem/utils/model_helpers.py +236 -0
- rem/utils/schema_loader.py +229 -0
- rem/utils/sql_types.py +348 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +330 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +5 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- remdb-0.2.6.dist-info/METADATA +1191 -0
- remdb-0.2.6.dist-info/RECORD +187 -0
- remdb-0.2.6.dist-info/WHEEL +4 -0
- remdb-0.2.6.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Example usage of embeddings utility for generating vector embeddings.
|
|
3
|
+
|
|
4
|
+
This demonstrates batch processing, error handling with tenacity automatic retries,
|
|
5
|
+
and integration patterns for the PostgresService.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from rem.utils.embeddings import (
|
|
9
|
+
EmbeddingError,
|
|
10
|
+
RateLimitError,
|
|
11
|
+
generate_embeddings,
|
|
12
|
+
get_embedding_dimension,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def example_single_embedding():
    """Embed one sentence and print a short report about the result.

    The report contains the input text, the provider string, the dimension
    returned by ``get_embedding_dimension``, the length of the value returned
    by ``generate_embeddings`` and its first five entries. An
    ``EmbeddingError`` raised along the way is printed instead of propagating.
    """
    rule = "=" * 80
    print(rule)
    print("SINGLE EMBEDDING EXAMPLE")
    print(rule)

    text = "What is the meaning of life?"
    embedding_provider = "openai:text-embedding-3-small"

    try:
        # One call for the vector, one for the dimension the provider reports.
        vector = generate_embeddings(embedding_provider, text)
        expected_size = get_embedding_dimension(embedding_provider)

        print(f"\nText: {text}")
        print(f"Provider: {embedding_provider}")
        print(f"Embedding dimension: {expected_size}")
        print(f"Actual length: {len(vector)}")
        print(f"First 5 values: {vector[:5]}")
    except EmbeddingError as err:
        print(f"Error: {err}")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def example_batch_embeddings():
    """Embed five sample texts with one ``generate_embeddings`` call.

    For each text the function prints its 1-based position, the first 50
    characters of the text, the length of its embedding and the first three
    values. ``RateLimitError`` and ``EmbeddingError`` are caught and reported
    on stdout.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("BATCH EMBEDDING EXAMPLE")
    print(rule)

    texts = [
        "What is the meaning of life?",
        "How do I bake a chocolate cake?",
        "Explain quantum physics in simple terms",
        "Write a haiku about programming",
        "What is the capital of France?",
    ]
    embedding_provider = "openai:text-embedding-3-small"

    try:
        # The whole list goes to the provider in a single request.
        vectors = generate_embeddings(embedding_provider, texts)

        print(f"\nGenerated {len(vectors)} embeddings")
        print(f"Provider: {embedding_provider}\n")

        for position, (sample, vector) in enumerate(zip(texts, vectors), start=1):
            print(f"{position}. {sample[:50]}...")
            print(f" Dimension: {len(vector)}")
            print(f" First 3 values: {vector[:3]}")
    except RateLimitError as err:
        # Handled before the EmbeddingError clause so rate limits get their own hint.
        print(f"Rate limit exceeded: {err}")
        print("Tenacity automatic retry failed. Consider reducing batch size or waiting.")
    except EmbeddingError as err:
        print(f"Error: {err}")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def example_multiple_providers():
    """Embed the same sentence with three OpenAI models and print each result.

    For every provider string the function prints the reported dimension and
    the first three embedding values, or the ``EmbeddingError`` message when
    the provider call fails. A failure for one provider does not stop the
    remaining ones.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("MULTIPLE PROVIDERS EXAMPLE")
    print(rule)

    text = "Machine learning is transforming software development"
    candidates = (
        "openai:text-embedding-3-small",
        "openai:text-embedding-3-large",
        "openai:text-embedding-ada-002",
    )

    print(f"\nText: {text}\n")

    for provider in candidates:
        try:
            vector = generate_embeddings(provider, text)
            size = get_embedding_dimension(provider)
        except EmbeddingError as err:
            print(f"Provider: {provider}")
            print(f" Error: {err}\n")
        else:
            print(f"Provider: {provider}")
            print(f" Dimension: {size}")
            print(f" First 3 values: {vector[:3]}\n")
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def example_error_handling():
    """Trigger three invalid calls and print the ``ValueError`` each raises.

    The calls are: a provider string without the expected format, an empty
    list of texts, and a dimension lookup for an unknown model. Only
    ``ValueError`` is caught; any other exception propagates to the caller.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("ERROR HANDLING EXAMPLE")
    print(rule)

    # (label printed on failure, zero-argument callable that performs the bad call)
    bad_calls = (
        (
            "Invalid format error (expected)",
            lambda: generate_embeddings("invalid_format", "test"),
        ),
        (
            "Empty input error (expected)",
            lambda: generate_embeddings("openai:text-embedding-3-small", []),
        ),
        (
            "Unknown model error (expected)",
            lambda: get_embedding_dimension("openai:unknown-model"),
        ),
    )

    for label, trigger in bad_calls:
        try:
            trigger()
        except ValueError as err:
            print(f"\n{label}: {err}")
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def example_postgres_integration():
    """Print a reference pattern for using the embeddings utility in PostgresService.

    Nothing is executed against a database: the function only writes a banner
    and a block of example source code to stdout. The example shows two
    methods, one that embeds and stores a single record and one that embeds
    every record that has no embedding yet, one batch at a time.

    Returns:
        None
    """
    print("\n" + "=" * 80)
    print("POSTGRES INTEGRATION PATTERN")
    print("=" * 80)

    # The snippet is a plain (non-f) string, so the braces inside it are
    # printed literally and only become placeholders in the reader's own code.
    print(
        """
# In PostgresService class:

async def generate_and_store_embedding(
    self,
    table_name: str,
    record_id: str,
    text_content: str,
    embedding_provider: str = "openai:text-embedding-3-small"
) -> None:
    '''
    Generate embedding for text content and store in database.

    Args:
        table_name: Table containing the record (trusted identifier, it is
            interpolated into the SQL text)
        record_id: ID of the record to update
        text_content: Text to embed
        embedding_provider: Provider and model for embeddings
    '''
    from rem.utils.embeddings import generate_embeddings, get_embedding_dimension

    # Generate embedding
    embedding = generate_embeddings(embedding_provider, text_content)

    # Get dimension for vector column
    dimension = get_embedding_dimension(embedding_provider)

    # Ensure vector column exists
    await self.execute(f'''
        ALTER TABLE {table_name}
        ADD COLUMN IF NOT EXISTS embedding vector({dimension})
    ''')

    # Store embedding
    await self.execute(
        f'''
        UPDATE {table_name}
        SET embedding = $1::vector
        WHERE id = $2
        ''',
        embedding,
        record_id
    )


async def batch_generate_embeddings(
    self,
    table_name: str,
    text_column: str = "content",
    embedding_provider: str = "openai:text-embedding-3-small",
    batch_size: int = 100
) -> None:
    '''
    Generate embeddings for all records in a table (batch processing).

    Args:
        table_name: Table to process (trusted identifier, it is
            interpolated into the SQL text)
        text_column: Column containing text to embed (trusted identifier)
        embedding_provider: Provider and model for embeddings
        batch_size: Number of records to process per batch
    '''
    from rem.utils.embeddings import generate_embeddings, get_embedding_dimension

    # Get dimension
    dimension = get_embedding_dimension(embedding_provider)

    # Ensure vector column exists
    await self.execute(f'''
        ALTER TABLE {table_name}
        ADD COLUMN IF NOT EXISTS embedding vector({dimension})
    ''')

    # Keep fetching batches until no record is left without an embedding
    while True:
        records = await self.fetch_all(f'''
            SELECT id, {text_column}
            FROM {table_name}
            WHERE embedding IS NULL
            LIMIT {batch_size}
        ''')

        if not records:
            return

        # Extract texts and IDs
        texts = [record[text_column] for record in records]
        ids = [record['id'] for record in records]

        # Generate embeddings in batch
        embeddings = generate_embeddings(embedding_provider, texts)

        # Store embeddings
        for record_id, embedding in zip(ids, embeddings):
            await self.execute(
                f'''
                UPDATE {table_name}
                SET embedding = $1::vector
                WHERE id = $2
                ''',
                embedding,
                record_id
            )
"""
    )
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
if __name__ == "__main__":
    # Script entry point: run every example in order, then print a summary of
    # recommended practices.
    # NOTE: Requires OPENAI_API_KEY or LLM__OPENAI_API_KEY environment variable

    import os

    # Without an API key, print setup instructions and exit with status 1.
    if not (os.getenv("OPENAI_API_KEY") or os.getenv("LLM__OPENAI_API_KEY")):
        print("=" * 80)
        print("SETUP REQUIRED")
        print("=" * 80)
        print("\nTo run these examples, set your OpenAI API key:")
        print(" export OPENAI_API_KEY='sk-...'")
        print(" # OR")
        print(" export LLM__OPENAI_API_KEY='sk-...'")
        print("\nThen run:")
        print(" python embeddings_example.py")
        # SystemExit is used directly because the bare ``exit`` helper is
        # installed by the ``site`` module and is not guaranteed to exist.
        raise SystemExit(1)

    # Run examples (comment out if you don't want to make API calls)
    example_single_embedding()
    example_batch_embeddings()
    example_multiple_providers()
    example_error_handling()
    example_postgres_integration()

    print("\n" + "=" * 80)
    print("BEST PRACTICES")
    print("=" * 80)
    # Plain string, not an f-string: "{dimension}" below is printed literally.
    print(
        """
1. Batch Processing:
- Process multiple texts in a single API call (up to 2048 for OpenAI)
- Reduces API overhead and stays within rate limits (RPM)
- Example: generate_embeddings(provider, [text1, text2, ...])

2. Rate Limit Handling:
- Uses tenacity library for automatic exponential backoff (default: 1 retry)
- Adjust max_retries parameter if needed (default: 1)
- Monitor your usage and adjust batch_size accordingly
- Consider implementing a queue for large-scale processing

3. Error Handling:
- Catch EmbeddingError for general API errors
- Catch RateLimitError for rate limit specific handling
- Validate embedding_provider format before batch processing

4. Cost Optimization:
- OpenAI text-embedding-3-small: $0.02 / 1M tokens
- OpenAI text-embedding-3-large: $0.13 / 1M tokens
- Use smaller models unless you need higher accuracy

5. PostgreSQL Integration:
- Use vector({dimension}) column type with pgvector extension
- Create indexes: CREATE INDEX ON table USING ivfflat (embedding vector_cosine_ops)
- For similarity search: ORDER BY embedding <=> query_vector LIMIT 10
"""
    )
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Example usage of sql_types utility for generating PostgreSQL schema from Pydantic models.
|
|
3
|
+
|
|
4
|
+
This demonstrates how REM entity models are mapped to PostgreSQL types.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from datetime import datetime
|
|
8
|
+
from uuid import UUID
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, Field
|
|
11
|
+
|
|
12
|
+
from rem.utils.sql_types import (
|
|
13
|
+
get_column_definition,
|
|
14
|
+
get_sql_type,
|
|
15
|
+
model_to_create_table,
|
|
16
|
+
model_to_upsert,
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Example 1: CoreModel with various field types
|
|
21
|
+
class CoreModel(BaseModel):
    """Base model demonstrating all common field types.

    Used by the demonstration functions in this file as the input to
    ``get_sql_type`` and ``get_column_definition``.
    """

    # ID - Union type, should prefer UUID
    id: UUID | str = Field(..., description="Unique identifier")

    # Timestamps, filled in at instantiation time when not supplied.
    # NOTE(review): datetime.utcnow returns a naive datetime and is deprecated
    # since Python 3.12 - confirm whether a timezone-aware default is wanted.
    created_at: datetime = Field(default_factory=datetime.utcnow)
    updated_at: datetime = Field(default_factory=datetime.utcnow)

    # Optional tenant/user fields (default to None)
    tenant_id: str | None = Field(default=None, description="Tenant identifier")
    user_id: str | None = Field(default=None, description="User identifier")

    # JSONB fields
    graph_edges: list[dict] = Field(default_factory=list, description="Graph edges")
    metadata: dict = Field(default_factory=dict, description="Flexible metadata")

    # Array fields
    tags: list[str] = Field(default_factory=list, description="Tags")

    # Database schema metadata
    column: dict = Field(default_factory=dict, description="Column metadata")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# Example 2: Resource with content fields
|
|
47
|
+
class Resource(BaseModel):
    """Resource entity with long-form text fields.

    The trailing comments give the SQL type each field is expected to map to
    in this example; the mapping itself is done by ``rem.utils.sql_types``.
    """

    id: str
    name: str  # VARCHAR(256)
    uri: str | None = None  # VARCHAR(256), nullable
    content: str = ""  # TEXT (long-form field name)
    description: str | None = None  # TEXT (long-form field name)
    category: str | None = None  # VARCHAR(256)
    related_entities: list[dict] = Field(default_factory=list)  # JSONB
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# Example 3: Schema with embedding provider
|
|
60
|
+
class Schema(BaseModel):
    """Schema with embedding field.

    ``content`` carries an ``embedding_provider`` entry in its
    ``json_schema_extra``; ``spec`` is the only required dict field.
    """

    id: str
    name: str
    # Defaults to an empty string; the extra metadata names the embedding model.
    content: str = Field(
        default="",
        json_schema_extra={
            "embedding_provider": "openai:text-embedding-3-small"  # Forces TEXT
        },
    )
    spec: dict = Field(..., description="JSON schema specification")  # JSONB
    category: str | None = None
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# Example 4: Custom SQL type override
|
|
76
|
+
class CustomModel(BaseModel):
    """Model with custom SQL type specification.

    ``vector_data`` declares its SQL type explicitly through the ``sql_type``
    key of ``json_schema_extra``.
    """

    id: str
    # Defaults to an empty list; the SQL type is given explicitly.
    vector_data: list[float] = Field(
        default_factory=list,
        json_schema_extra={"sql_type": "vector(1536)"},  # Custom pgvector type
    )
    json_data: dict = Field(default_factory=dict)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def demonstrate_field_mapping():
    """Print the SQL type ``get_sql_type`` returns for a set of sample fields.

    Each sample pairs a field taken from one of the example models with a
    human-readable description of the mapping it illustrates. The description
    is printed first, followed by the SQL type that was actually returned.
    """
    rule = "=" * 80
    print(rule)
    print("FIELD TYPE MAPPING EXAMPLES")
    print(rule)

    core = CoreModel.model_fields
    resource = Resource.model_fields

    # (field name, field info, description of the illustrated mapping)
    samples = (
        ("id", core["id"], "Union[UUID, str] -> UUID (prefers UUID in unions)"),
        ("created_at", core["created_at"], "datetime -> TIMESTAMP"),
        ("tenant_id", core["tenant_id"], "str | None -> VARCHAR(256)"),
        ("graph_edges", core["graph_edges"], "list[dict] -> JSONB"),
        ("metadata", core["metadata"], "dict -> JSONB"),
        ("tags", core["tags"], "list[str] -> TEXT[]"),
        ("content", resource["content"], "str (field name 'content') -> TEXT"),
        ("name", resource["name"], "str -> VARCHAR(256)"),
        (
            "content",
            Schema.model_fields["content"],
            "str with embedding_provider (openai:text-embedding-3-small) -> TEXT",
        ),
        (
            "vector_data",
            CustomModel.model_fields["vector_data"],
            "list[float] with sql_type -> vector(1536)",
        ),
    )

    for column, info, label in samples:
        resolved = get_sql_type(info, column)
        print(f"\n{label}")
        print(f" SQL Type: {resolved}")
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def demonstrate_column_definitions():
    """Print the column definition ``get_column_definition`` builds for sample fields.

    Every sample is a ``CoreModel`` field together with the nullable flag and
    primary-key flag that are passed to ``get_column_definition``.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("COLUMN DEFINITION EXAMPLES")
    print(rule)

    fields = CoreModel.model_fields

    # (field name, nullable, is primary key, label)
    samples = (
        ("id", False, True, "Primary key"),
        ("created_at", False, False, "Required timestamp"),
        ("tenant_id", True, False, "Optional tenant"),
        ("metadata", False, False, "JSONB with default"),
        ("tags", False, False, "Array with default"),
    )
    # Look every field up before printing anything, as the original list literal did.
    resolved = [(fields[column], column, nullable, is_pk, label)
                for column, nullable, is_pk, label in samples]

    for info, column, nullable, is_pk, label in resolved:
        definition = get_column_definition(info, column, nullable, is_pk)
        print(f"\n{label}:")
        print(f" {definition}")
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def demonstrate_create_table():
    """Print the output of ``model_to_create_table`` for Resource and Schema."""
    rule = "=" * 80
    print("\n" + rule)
    print("CREATE TABLE EXAMPLES")
    print(rule)

    # (heading, model, table name); the headings differ in leading blank lines.
    targets = (
        ("\n-- Resource Table", Resource, "resources"),
        ("\n\n-- Schema Table", Schema, "schemas"),
    )
    for heading, model, table in targets:
        print(heading)
        print(model_to_create_table(model, table))
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def demonstrate_upsert():
    """Print the output of ``model_to_upsert`` for Resource and Schema."""
    rule = "=" * 80
    print("\n" + rule)
    print("UPSERT EXAMPLES")
    print(rule)

    # (heading, model, table name)
    targets = (
        ("\n-- Resource Upsert", Resource, "resources"),
        ("\n-- Schema Upsert", Schema, "schemas"),
    )
    for heading, model, table in targets:
        print(heading)
        print(model_to_upsert(model, table))
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
if __name__ == "__main__":
    # Script entry point: run the four demonstrations, then print a usage
    # snippet showing how the same helpers apply to the real REM entities.
    demonstrate_field_mapping()
    demonstrate_column_definitions()
    demonstrate_create_table()
    demonstrate_upsert()

    print("\n" + "=" * 80)
    print("USAGE IN CODE")
    print("=" * 80)
    # The snippet is only printed, never executed; its loop body is indented
    # so that the emitted example is valid Python when copied.
    print(
        """
# Generate schema for all REM entities
from rem.models.entities import Resource, Message, User, File, Moment, Schema
from rem.utils.sql_types import model_to_create_table

for model, table_name in [
    (Resource, "resources"),
    (Message, "messages"),
    (User, "users"),
    (File, "files"),
    (Moment, "moments"),
    (Schema, "schemas"),
]:
    sql = model_to_create_table(model, table_name)
    print(sql)
    print()

# Generate upsert for inserting/updating entities
from rem.utils.sql_types import model_to_upsert

upsert_sql = model_to_upsert(Resource, "resources")
# Use with psycopg:
# cursor.execute(upsert_sql, (id, name, uri, content, ...))
"""
    )
|
rem/utils/markdown.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Markdown conversion utilities for document processing."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def to_markdown(content: str, filename: str) -> str:
    """
    Wrap extracted text in a minimal markdown document.

    The result is a level-one heading made from the filename, one blank line,
    and then the content unchanged.

    Args:
        content: Extracted text content
        filename: Source filename, used as the heading text

    Returns:
        Markdown string of the form "# <filename>", blank line, content
    """
    heading = f"# {filename}\n"
    # Joining with a newline after a heading that already ends in one yields the blank line.
    return "\n".join((heading, content))
|