remdb 0.3.242 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of remdb might be problematic.
- rem/__init__.py +129 -0
- rem/agentic/README.md +760 -0
- rem/agentic/__init__.py +54 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +38 -0
- rem/agentic/agents/agent_manager.py +311 -0
- rem/agentic/agents/sse_simulator.py +502 -0
- rem/agentic/context.py +425 -0
- rem/agentic/context_builder.py +360 -0
- rem/agentic/llm_provider_models.py +301 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +273 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +240 -0
- rem/agentic/providers/phoenix.py +926 -0
- rem/agentic/providers/pydantic_ai.py +854 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +737 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +242 -0
- rem/api/README.md +657 -0
- rem/api/deps.py +253 -0
- rem/api/main.py +460 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +820 -0
- rem/api/mcp_router/server.py +243 -0
- rem/api/mcp_router/tools.py +1605 -0
- rem/api/middleware/tracking.py +172 -0
- rem/api/routers/admin.py +520 -0
- rem/api/routers/auth.py +898 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/child_streaming.py +394 -0
- rem/api/routers/chat/completions.py +702 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +202 -0
- rem/api/routers/chat/otel_utils.py +33 -0
- rem/api/routers/chat/sse_events.py +546 -0
- rem/api/routers/chat/streaming.py +950 -0
- rem/api/routers/chat/streaming_utils.py +327 -0
- rem/api/routers/common.py +18 -0
- rem/api/routers/dev.py +87 -0
- rem/api/routers/feedback.py +276 -0
- rem/api/routers/messages.py +620 -0
- rem/api/routers/models.py +86 -0
- rem/api/routers/query.py +362 -0
- rem/api/routers/shared_sessions.py +422 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +36 -0
- rem/auth/jwt.py +367 -0
- rem/auth/middleware.py +318 -0
- rem/auth/providers/__init__.py +16 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/email.py +215 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +517 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +299 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +549 -0
- rem/cli/commands/cluster.py +1808 -0
- rem/cli/commands/configure.py +495 -0
- rem/cli/commands/db.py +828 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1698 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +388 -0
- rem/cli/commands/query.py +109 -0
- rem/cli/commands/scaffold.py +47 -0
- rem/cli/commands/schema.py +230 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/commands/session.py +453 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +123 -0
- rem/config.py +244 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +70 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +672 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +246 -0
- rem/models/entities/__init__.py +68 -0
- rem/models/entities/domain_resource.py +38 -0
- rem/models/entities/feedback.py +123 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +64 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +181 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/session.py +84 -0
- rem/models/entities/shared_session.py +180 -0
- rem/models/entities/subscriber.py +175 -0
- rem/models/entities/user.py +93 -0
- rem/py.typed +0 -0
- rem/registry.py +373 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/agent-builder.yaml +235 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +132 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +18 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +760 -0
- rem/services/content/service.py +762 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +322 -0
- rem/services/dreaming/moment_service.py +251 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/email/__init__.py +10 -0
- rem/services/email/service.py +522 -0
- rem/services/email/templates.py +360 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +127 -0
- rem/services/embeddings/worker.py +435 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +960 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +757 -0
- rem/services/postgres/__init__.py +49 -0
- rem/services/postgres/diff_service.py +599 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/programmable_diff_service.py +635 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
- rem/services/postgres/register_type.py +353 -0
- rem/services/postgres/repository.py +481 -0
- rem/services/postgres/schema_generator.py +661 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +355 -0
- rem/services/rate_limit.py +113 -0
- rem/services/rem/README.md +318 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +180 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +608 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +13 -0
- rem/services/session/compression.py +488 -0
- rem/services/session/pydantic_messages.py +310 -0
- rem/services/session/reload.py +85 -0
- rem/services/user_service.py +130 -0
- rem/settings.py +1877 -0
- rem/sql/background_indexes.sql +52 -0
- rem/sql/migrations/001_install.sql +983 -0
- rem/sql/migrations/002_install_models.sql +3157 -0
- rem/sql/migrations/003_optional_extensions.sql +326 -0
- rem/sql/migrations/004_cache_system.sql +282 -0
- rem/sql/migrations/005_schema_update.sql +145 -0
- rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +628 -0
- rem/utils/__init__.py +61 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/constants.py +97 -0
- rem/utils/date_utils.py +228 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +436 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/files.py +323 -0
- rem/utils/markdown.py +16 -0
- rem/utils/mime_types.py +158 -0
- rem/utils/model_helpers.py +492 -0
- rem/utils/schema_loader.py +649 -0
- rem/utils/sql_paths.py +146 -0
- rem/utils/sql_types.py +350 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +325 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +7 -0
- rem/workers/db_listener.py +579 -0
- rem/workers/db_maintainer.py +74 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- rem/workers/unlogged_maintainer.py +463 -0
- remdb-0.3.242.dist-info/METADATA +1632 -0
- remdb-0.3.242.dist-info/RECORD +235 -0
- remdb-0.3.242.dist-info/WHEEL +4 -0
- remdb-0.3.242.dist-info/entry_points.txt +2 -0
rem/utils/batch_ops.py
ADDED
@@ -0,0 +1,343 @@
```python
"""
Batch Operations Utilities.

Provides utilities for batch upserting records with:
- Automatic KV store population (via triggers)
- Embedding generation (stubbed - to be implemented)
- Efficient batch processing

Design:
- Uses Pydantic models for type safety
- Delegates SQL generation to utils.sql_types
- Keeps PostgresService clean
- Handles batching automatically
"""

from typing import Any, Type
from uuid import UUID, uuid4, uuid5, NAMESPACE_OID
from datetime import datetime
import hashlib
import json

from loguru import logger
from pydantic import BaseModel


def generate_deterministic_id(
    user_id: str | None, key_values: list[str] | str
) -> UUID:
    """
    Generate deterministic UUID from user_id and business key(s).

    Convention: If a business key field exists (name, uri, etc.), the ID should be
    deterministic based on user_id + key_value(s). This allows upserts to work
    based on the business key rather than requiring explicit ID management.

    Composite Keys:
    - For composite keys (e.g., uri + ordinal), pass a list of values
    - Values are concatenated with ":" separator

    Args:
        user_id: User identifier (tenant scoped)
        key_values: Business key value(s) - single string or list for composite keys

    Returns:
        Deterministic UUID v5 based on user_id + key(s)

    Examples:
        >>> id1 = generate_deterministic_id("user-123", "my-document")
        >>> id2 = generate_deterministic_id("user-123", "my-document")
        >>> id1 == id2
        True
        >>> # Composite key
        >>> id3 = generate_deterministic_id("user-123", ["docs://file.pdf", "0"])
        >>> id4 = generate_deterministic_id("user-123", ["docs://file.pdf", "0"])
        >>> id3 == id4
        True
    """
    # Create namespace from user_id (fall back to the "system" namespace if no user)
    namespace_str = user_id or "system"
    namespace = uuid5(NAMESPACE_OID, namespace_str)

    # Handle composite keys
    if isinstance(key_values, list):
        composite_key = ":".join(str(v) for v in key_values)
    else:
        composite_key = str(key_values)

    # Generate deterministic UUID from business key
    return uuid5(namespace, composite_key)


def prepare_record_for_upsert(
    record: BaseModel,
    model: Type[BaseModel],
    entity_key_field: str | None = None,
) -> dict[str, Any]:
    """
    Prepare a Pydantic record for database upsert.

    ID Generation Convention:
    - If entity_key_field is provided: Generate deterministic ID from user_id + key
    - Otherwise: Generate random UUID v4

    This allows business key-based upserts where the same user + key always gets
    the same ID, enabling natural upsert behavior.

    Args:
        record: Pydantic model instance
        model: Pydantic model class
        entity_key_field: Optional business key field name (name, uri, etc.)

    Returns:
        Dict with field values ready for SQL insertion

    Example:
        >>> from rem.models.entities import Resource
        >>> resource = Resource(name="test", content="data", tenant_id="acme", user_id="sarah")
        >>> data = prepare_record_for_upsert(resource, Resource, entity_key_field="name")
        >>> "id" in data  # ID generated from user_id + name
        True
    """
    # Convert to dict
    data = record.model_dump()

    # Generate ID based on convention
    if "id" not in data or data["id"] is None:
        user_id = data.get("user_id")

        # Check for composite keys (fields with composite_key=True in json_schema_extra)
        composite_key_fields = []
        for field_name, field_info in model.model_fields.items():
            json_extra = getattr(field_info, "json_schema_extra", None)
            if json_extra and isinstance(json_extra, dict):
                if json_extra.get("composite_key") is True:
                    composite_key_fields.append(field_name)

        # Check if we have a business key field
        if entity_key_field and entity_key_field in data:
            key_value = data[entity_key_field]

            if key_value:
                # Build composite key if additional fields exist
                if composite_key_fields:
                    key_values = [str(key_value)]
                    for comp_field in composite_key_fields:
                        if comp_field in data:
                            key_values.append(str(data[comp_field]))

                    # Generate deterministic ID from composite key
                    data["id"] = generate_deterministic_id(user_id, key_values)
                    logger.debug(
                        f"Generated deterministic ID from composite key: "
                        f"{entity_key_field}={key_value} + {composite_key_fields}"
                    )
                else:
                    # Single business key
                    data["id"] = generate_deterministic_id(user_id, str(key_value))
                    logger.debug(
                        f"Generated deterministic ID from {entity_key_field}={key_value}"
                    )
            else:
                # Key field is None, use random UUID
                data["id"] = uuid4()
        else:
            # No business key, use random UUID
            data["id"] = uuid4()

    # Handle UUID serialization
    if "id" in data and isinstance(data["id"], UUID):
        data["id"] = str(data["id"])

    # JSONB fields: asyncpg handles dict/list serialization automatically.
    # DO NOT convert to JSON strings - asyncpg expects native Python types;
    # PostgreSQL JSONB columns work with Python dicts/lists directly.

    # Normalize datetime fields to be timezone-naive (PostgreSQL TIMESTAMP WITHOUT TIME ZONE)
    for field_name, field_value in data.items():
        if isinstance(field_value, datetime) and field_value.tzinfo is not None:
            # Convert timezone-aware datetime to naive UTC
            data[field_name] = field_value.replace(tzinfo=None)

    # Remove None values for optional fields (let DB handle defaults).
    # Keep None for required fields to trigger NOT NULL constraints.
    # Also filter out fields that don't exist in the DB schema (tags, column).
    SKIP_FIELDS = {"tags", "column"}  # CoreModel fields not in DB schema

    cleaned_data = {}
    for field_name, field_value in data.items():
        # Skip fields that aren't in the DB schema
        if field_name in SKIP_FIELDS:
            continue

        field_info = model.model_fields.get(field_name)
        if field_info is not None and field_info.is_required():
            # Keep required fields even if None (will error if truly NULL)
            cleaned_data[field_name] = field_value
        elif field_value is not None:
            # Only include optional fields if they have values
            cleaned_data[field_name] = field_value

    return cleaned_data


def batch_iterator(items: list, batch_size: int = 100):
    """
    Iterate over items in batches.

    Args:
        items: List of items to batch
        batch_size: Size of each batch

    Yields:
        Batches of items

    Example:
        >>> items = list(range(250))
        >>> batches = list(batch_iterator(items, 100))
        >>> len(batches)
        3
        >>> len(batches[0])
        100
        >>> len(batches[2])
        50
    """
    for i in range(0, len(items), batch_size):
        yield items[i : i + batch_size]


async def generate_embeddings_stub(
    records: list[BaseModel],
    table_name: str,
    embeddable_fields: list[str],
    provider: str = "openai",
    model: str = "text-embedding-3-small",
) -> list[dict]:
    """
    Generate embeddings for record fields (STUBBED).

    This is a placeholder for the actual embedding generation logic.
    Will be implemented to:
    1. Extract text from embeddable fields
    2. Call OpenAI/Anthropic API in batch
    3. Return embedding records for upsert

    Args:
        records: List of Pydantic records
        table_name: Name of the entity table
        embeddable_fields: List of field names to embed
        provider: Embedding provider (openai, cohere, etc.)
        model: Embedding model name

    Returns:
        List of embedding records (id, entity_id, field_name, provider, model, embedding)

    TODO:
    - Implement OpenAI batch embedding API call
    - Handle rate limiting and retries
    - Support multiple providers
    - Cache embeddings to avoid regeneration
    """
    logger.warning(
        f"Embedding generation is stubbed for {table_name} "
        f"with {len(records)} records and fields {embeddable_fields}"
    )

    # STUB: Return empty list for now.
    # When implemented, this will return records like:
    # [
    #     {
    #         "entity_id": record.id,
    #         "field_name": "content",
    #         "provider": "openai",
    #         "model": "text-embedding-3-small",
    #         "embedding": [0.1, 0.2, ...],  # 1536 dimensions
    #     }
    # ]
    return []


def validate_record_for_kv_store(
    record: BaseModel,
    entity_key_field: str,
    tenant_id: str | None = None,
) -> tuple[bool, str]:
    """
    Validate that a record has required fields for KV store population.

    Args:
        record: Pydantic model instance
        entity_key_field: Field name to use as entity_key
        tenant_id: Optional tenant_id override

    Returns:
        Tuple of (is_valid, error_message)

    Example:
        >>> from rem.models.entities import Resource
        >>> resource = Resource(name="test", content="data", tenant_id="acme")
        >>> valid, msg = validate_record_for_kv_store(resource, "name", "acme")
        >>> valid
        True
    """
    # Check entity_key field exists and has a value
    if not hasattr(record, entity_key_field):
        return False, f"Record missing entity_key field: {entity_key_field}"

    entity_key_value = getattr(record, entity_key_field)
    if not entity_key_value:
        return False, f"Entity key field '{entity_key_field}' is empty"

    # Check tenant_id (either on record or provided)
    record_tenant_id = getattr(record, "tenant_id", None)
    effective_tenant_id = tenant_id or record_tenant_id

    if not effective_tenant_id:
        return False, "Record must have tenant_id for KV store"

    return True, ""


def build_upsert_statement(
    table_name: str,
    field_names: list[str],
    conflict_column: str = "id",
) -> str:
    """
    Build PostgreSQL UPSERT statement with proper identifier quoting.

    PostgreSQL reserved keywords (like "column", "user", "order") must be quoted.
    We quote all identifiers to avoid SQL injection and reserved keyword issues.

    Args:
        table_name: Name of the table
        field_names: List of field names to insert
        conflict_column: Column to use for conflict detection

    Returns:
        SQL UPSERT statement with placeholders

    Example:
        >>> sql = build_upsert_statement("resources", ["id", "name", "content"])
        >>> "ON CONFLICT" in sql
        True
        >>> "DO UPDATE SET" in sql
        True
    """
    # Quote all identifiers to handle reserved keywords like "column"
    quoted_fields = [f'"{field}"' for field in field_names]
    placeholders = [f"${i+1}" for i in range(len(field_names))]

    # Exclude conflict column from UPDATE
    update_fields = [f for f in field_names if f != conflict_column]
    update_set = ", ".join(f'"{field}" = EXCLUDED."{field}"' for field in update_fields)

    sql = f"""
    INSERT INTO {table_name} ({", ".join(quoted_fields)})
    VALUES ({", ".join(placeholders)})
    ON CONFLICT ("{conflict_column}")
    DO UPDATE SET {update_set}
    RETURNING *;
    """

    return sql.strip()
```
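To see how these helpers fit together, here is a minimal usage sketch that upserts `Resource` records in batches over asyncpg. The `resources` table name, the `Resource` constructor fields, and the connection DSN are assumptions for illustration; only `batch_iterator`, `prepare_record_for_upsert`, and `build_upsert_statement` come from the module above.

```python
# Hypothetical usage sketch: batch-upserting Resource records with asyncpg.
# The "resources" table and the DSN below are assumed, not part of the module.
import asyncio

import asyncpg

from rem.models.entities import Resource
from rem.utils.batch_ops import (
    batch_iterator,
    build_upsert_statement,
    prepare_record_for_upsert,
)


async def upsert_resources(records: list[Resource]) -> None:
    conn = await asyncpg.connect("postgresql://localhost/rem")  # assumed DSN
    try:
        for batch in batch_iterator(records, batch_size=100):
            for record in batch:
                # Deterministic ID from user_id + name makes this idempotent:
                # re-running the upsert updates rather than duplicates.
                data = prepare_record_for_upsert(record, Resource, entity_key_field="name")
                sql = build_upsert_statement("resources", list(data.keys()))
                await conn.execute(sql, *data.values())
    finally:
        await conn.close()


if __name__ == "__main__":
    demo = [Resource(name="doc-1", content="hello", tenant_id="acme", user_id="sarah")]
    asyncio.run(upsert_resources(demo))
```

Executing one statement per record is deliberate in this sketch: `prepare_record_for_upsert` drops unset optional fields, so the column list can vary from record to record and a single prepared `executemany` would not be safe across a mixed batch.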
rem/utils/chunking.py
ADDED
@@ -0,0 +1,108 @@
```python
"""
Text chunking utilities for document storage.

**ARCHITECTURE NOTE - Chunking vs Token Counting**:

This module handles SEMANTIC CHUNKING for document storage:
- Character-based limits (not tokens!)
- Respects document structure (paragraphs, sections)
- Creates 2-3 paragraph chunks for searchable resources
- Stored in database with embeddings

TikToken (token counting) is used ELSEWHERE for LLM context management:
- Agent flows preparing prompts (agentic/*)
- Context window limits (128K tokens, 200K tokens)
- Agentic chunking for large inputs (utils/agentic_chunking.py)

DO NOT use tiktoken here - document chunks are storage units, not LLM inputs!
"""

import re

from rem.settings import settings


def chunk_text(text: str) -> list[str]:
    """
    Chunk text using semantic character-based chunking.

    **IMPORTANT**: Uses CHARACTER limits, NOT tokens. This creates storage chunks
    for database/embeddings. Token counting happens later in agent flows when
    preparing LLM prompts.

    Chunking strategy:
    1. Split on double newlines (paragraph boundaries) - PRIMARY
    2. Split on single newlines if a paragraph is too large
    3. Split on sentence endings (. ! ?) if still too large
    4. Hard split at max_chunk_size if necessary

    This creates natural 2-3 paragraph chunks suitable for semantic search.

    Args:
        text: Text to chunk

    Returns:
        List of text chunks (typically 10-50 chunks per document)

    Example:
        >>> text = "\\n\\n".join([f"Paragraph {i}. " + "Sentence. " * 20 for i in range(10)])
        >>> chunks = chunk_text(text)  # ~10 paragraphs → ~5-10 chunks
        >>> len(chunks)  # Should be reasonable, not 100+
    """
    if not text or not text.strip():
        return []

    chunks = []
    current_chunk: list[str] = []
    current_size = 0

    # Split by paragraphs (double newline) first
    paragraphs = re.split(r'\n\n+', text)

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue

        para_len = len(para)

        # If adding this paragraph would exceed the target size, flush the current chunk
        if current_size > 0 and current_size + para_len + 2 > settings.chunking.chunk_size:
            joined = '\n\n'.join(current_chunk)
            if len(joined) >= settings.chunking.min_chunk_size:
                chunks.append(joined)
            current_chunk = []
            current_size = 0

        # If the paragraph itself is too large, split it
        if para_len > settings.chunking.max_chunk_size:
            # Try splitting on sentences
            sentences = re.split(r'([.!?]+\s+)', para)
            sentence_chunk = ""

            for i in range(0, len(sentences), 2):
                sentence = sentences[i]
                delimiter = sentences[i + 1] if i + 1 < len(sentences) else ""

                if len(sentence_chunk) + len(sentence) + len(delimiter) > settings.chunking.max_chunk_size:
                    if sentence_chunk:
                        chunks.append(sentence_chunk.strip())
                    sentence_chunk = sentence + delimiter
                else:
                    sentence_chunk += sentence + delimiter

            if sentence_chunk.strip():
                if len(sentence_chunk) >= settings.chunking.min_chunk_size:
                    chunks.append(sentence_chunk.strip())
        else:
            # Add the paragraph to the current chunk
            current_chunk.append(para)
            current_size += para_len + 2  # +2 for the \n\n we'll add when joining

    # Flush the remaining chunk
    if current_chunk:
        joined = '\n\n'.join(current_chunk)
        if len(joined) >= settings.chunking.min_chunk_size:
            chunks.append(joined)

    return chunks
```
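A short usage sketch for `chunk_text`, assuming `settings.chunking` provides the `chunk_size`, `min_chunk_size`, and `max_chunk_size` values the function reads; the sample document is invented:

```python
# Hypothetical usage sketch for chunk_text; the document text is made up.
from rem.utils.chunking import chunk_text

document = "\n\n".join(
    f"Paragraph {i}. " + "Some sentence here. " * 20 for i in range(10)
)

chunks = chunk_text(document)
print(f"{len(chunks)} chunks")
for chunk in chunks[:3]:
    # Each chunk is a few joined paragraphs, bounded by chunk_size characters
    print(len(chunk), repr(chunk[:60]))
```

Because splits happen at paragraph and sentence boundaries first, the resulting chunks tend to stay readable when surfaced as search snippets.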