remdb-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of remdb might be problematic.

Files changed (187)
  1. rem/__init__.py +2 -0
  2. rem/agentic/README.md +650 -0
  3. rem/agentic/__init__.py +39 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +8 -0
  6. rem/agentic/context.py +148 -0
  7. rem/agentic/context_builder.py +329 -0
  8. rem/agentic/mcp/__init__.py +0 -0
  9. rem/agentic/mcp/tool_wrapper.py +107 -0
  10. rem/agentic/otel/__init__.py +5 -0
  11. rem/agentic/otel/setup.py +151 -0
  12. rem/agentic/providers/phoenix.py +674 -0
  13. rem/agentic/providers/pydantic_ai.py +572 -0
  14. rem/agentic/query.py +117 -0
  15. rem/agentic/query_helper.py +89 -0
  16. rem/agentic/schema.py +396 -0
  17. rem/agentic/serialization.py +245 -0
  18. rem/agentic/tools/__init__.py +5 -0
  19. rem/agentic/tools/rem_tools.py +231 -0
  20. rem/api/README.md +420 -0
  21. rem/api/main.py +324 -0
  22. rem/api/mcp_router/prompts.py +182 -0
  23. rem/api/mcp_router/resources.py +536 -0
  24. rem/api/mcp_router/server.py +213 -0
  25. rem/api/mcp_router/tools.py +584 -0
  26. rem/api/routers/auth.py +229 -0
  27. rem/api/routers/chat/__init__.py +5 -0
  28. rem/api/routers/chat/completions.py +281 -0
  29. rem/api/routers/chat/json_utils.py +76 -0
  30. rem/api/routers/chat/models.py +124 -0
  31. rem/api/routers/chat/streaming.py +185 -0
  32. rem/auth/README.md +258 -0
  33. rem/auth/__init__.py +26 -0
  34. rem/auth/middleware.py +100 -0
  35. rem/auth/providers/__init__.py +13 -0
  36. rem/auth/providers/base.py +376 -0
  37. rem/auth/providers/google.py +163 -0
  38. rem/auth/providers/microsoft.py +237 -0
  39. rem/cli/README.md +455 -0
  40. rem/cli/__init__.py +8 -0
  41. rem/cli/commands/README.md +126 -0
  42. rem/cli/commands/__init__.py +3 -0
  43. rem/cli/commands/ask.py +566 -0
  44. rem/cli/commands/configure.py +497 -0
  45. rem/cli/commands/db.py +493 -0
  46. rem/cli/commands/dreaming.py +324 -0
  47. rem/cli/commands/experiments.py +1302 -0
  48. rem/cli/commands/mcp.py +66 -0
  49. rem/cli/commands/process.py +245 -0
  50. rem/cli/commands/schema.py +183 -0
  51. rem/cli/commands/serve.py +106 -0
  52. rem/cli/dreaming.py +363 -0
  53. rem/cli/main.py +96 -0
  54. rem/config.py +237 -0
  55. rem/mcp_server.py +41 -0
  56. rem/models/core/__init__.py +49 -0
  57. rem/models/core/core_model.py +64 -0
  58. rem/models/core/engram.py +333 -0
  59. rem/models/core/experiment.py +628 -0
  60. rem/models/core/inline_edge.py +132 -0
  61. rem/models/core/rem_query.py +243 -0
  62. rem/models/entities/__init__.py +43 -0
  63. rem/models/entities/file.py +57 -0
  64. rem/models/entities/image_resource.py +88 -0
  65. rem/models/entities/message.py +35 -0
  66. rem/models/entities/moment.py +123 -0
  67. rem/models/entities/ontology.py +191 -0
  68. rem/models/entities/ontology_config.py +131 -0
  69. rem/models/entities/resource.py +95 -0
  70. rem/models/entities/schema.py +87 -0
  71. rem/models/entities/user.py +85 -0
  72. rem/py.typed +0 -0
  73. rem/schemas/README.md +507 -0
  74. rem/schemas/__init__.py +6 -0
  75. rem/schemas/agents/README.md +92 -0
  76. rem/schemas/agents/core/moment-builder.yaml +178 -0
  77. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  78. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  79. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  80. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  81. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  82. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  83. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  84. rem/schemas/agents/examples/hello-world.yaml +37 -0
  85. rem/schemas/agents/examples/query.yaml +54 -0
  86. rem/schemas/agents/examples/simple.yaml +21 -0
  87. rem/schemas/agents/examples/test.yaml +29 -0
  88. rem/schemas/agents/rem.yaml +128 -0
  89. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  90. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  91. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  92. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  93. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  94. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  95. rem/services/__init__.py +16 -0
  96. rem/services/audio/INTEGRATION.md +308 -0
  97. rem/services/audio/README.md +376 -0
  98. rem/services/audio/__init__.py +15 -0
  99. rem/services/audio/chunker.py +354 -0
  100. rem/services/audio/transcriber.py +259 -0
  101. rem/services/content/README.md +1269 -0
  102. rem/services/content/__init__.py +5 -0
  103. rem/services/content/providers.py +806 -0
  104. rem/services/content/service.py +676 -0
  105. rem/services/dreaming/README.md +230 -0
  106. rem/services/dreaming/__init__.py +53 -0
  107. rem/services/dreaming/affinity_service.py +336 -0
  108. rem/services/dreaming/moment_service.py +264 -0
  109. rem/services/dreaming/ontology_service.py +54 -0
  110. rem/services/dreaming/user_model_service.py +297 -0
  111. rem/services/dreaming/utils.py +39 -0
  112. rem/services/embeddings/__init__.py +11 -0
  113. rem/services/embeddings/api.py +120 -0
  114. rem/services/embeddings/worker.py +421 -0
  115. rem/services/fs/README.md +662 -0
  116. rem/services/fs/__init__.py +62 -0
  117. rem/services/fs/examples.py +206 -0
  118. rem/services/fs/examples_paths.py +204 -0
  119. rem/services/fs/git_provider.py +935 -0
  120. rem/services/fs/local_provider.py +760 -0
  121. rem/services/fs/parsing-hooks-examples.md +172 -0
  122. rem/services/fs/paths.py +276 -0
  123. rem/services/fs/provider.py +460 -0
  124. rem/services/fs/s3_provider.py +1042 -0
  125. rem/services/fs/service.py +186 -0
  126. rem/services/git/README.md +1075 -0
  127. rem/services/git/__init__.py +17 -0
  128. rem/services/git/service.py +469 -0
  129. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  130. rem/services/phoenix/README.md +453 -0
  131. rem/services/phoenix/__init__.py +46 -0
  132. rem/services/phoenix/client.py +686 -0
  133. rem/services/phoenix/config.py +88 -0
  134. rem/services/phoenix/prompt_labels.py +477 -0
  135. rem/services/postgres/README.md +575 -0
  136. rem/services/postgres/__init__.py +23 -0
  137. rem/services/postgres/migration_service.py +427 -0
  138. rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
  139. rem/services/postgres/register_type.py +352 -0
  140. rem/services/postgres/repository.py +337 -0
  141. rem/services/postgres/schema_generator.py +379 -0
  142. rem/services/postgres/service.py +802 -0
  143. rem/services/postgres/sql_builder.py +354 -0
  144. rem/services/rem/README.md +304 -0
  145. rem/services/rem/__init__.py +23 -0
  146. rem/services/rem/exceptions.py +71 -0
  147. rem/services/rem/executor.py +293 -0
  148. rem/services/rem/parser.py +145 -0
  149. rem/services/rem/queries.py +196 -0
  150. rem/services/rem/query.py +371 -0
  151. rem/services/rem/service.py +527 -0
  152. rem/services/session/README.md +374 -0
  153. rem/services/session/__init__.py +6 -0
  154. rem/services/session/compression.py +360 -0
  155. rem/services/session/reload.py +77 -0
  156. rem/settings.py +1235 -0
  157. rem/sql/002_install_models.sql +1068 -0
  158. rem/sql/background_indexes.sql +42 -0
  159. rem/sql/install_models.sql +1038 -0
  160. rem/sql/migrations/001_install.sql +503 -0
  161. rem/sql/migrations/002_install_models.sql +1202 -0
  162. rem/utils/AGENTIC_CHUNKING.md +597 -0
  163. rem/utils/README.md +583 -0
  164. rem/utils/__init__.py +43 -0
  165. rem/utils/agentic_chunking.py +622 -0
  166. rem/utils/batch_ops.py +343 -0
  167. rem/utils/chunking.py +108 -0
  168. rem/utils/clip_embeddings.py +276 -0
  169. rem/utils/dict_utils.py +98 -0
  170. rem/utils/embeddings.py +423 -0
  171. rem/utils/examples/embeddings_example.py +305 -0
  172. rem/utils/examples/sql_types_example.py +202 -0
  173. rem/utils/markdown.py +16 -0
  174. rem/utils/model_helpers.py +236 -0
  175. rem/utils/schema_loader.py +336 -0
  176. rem/utils/sql_types.py +348 -0
  177. rem/utils/user_id.py +81 -0
  178. rem/utils/vision.py +330 -0
  179. rem/workers/README.md +506 -0
  180. rem/workers/__init__.py +5 -0
  181. rem/workers/dreaming.py +502 -0
  182. rem/workers/engram_processor.py +312 -0
  183. rem/workers/sqs_file_processor.py +193 -0
  184. remdb-0.3.0.dist-info/METADATA +1455 -0
  185. remdb-0.3.0.dist-info/RECORD +187 -0
  186. remdb-0.3.0.dist-info/WHEEL +4 -0
  187. remdb-0.3.0.dist-info/entry_points.txt +2 -0
rem/utils/batch_ops.py ADDED
@@ -0,0 +1,343 @@
+"""
+Batch Operations Utilities.
+
+Provides utilities for batch upserting records with:
+- Automatic KV store population (via triggers)
+- Embedding generation (stubbed - to be implemented)
+- Efficient batch processing
+
+Design:
+- Uses Pydantic models for type safety
+- Delegates SQL generation to utils.sql_types
+- Keeps PostgresService clean
+- Handles batching automatically
+"""
+
+from datetime import datetime, timezone
+from typing import Any, Type
+from uuid import NAMESPACE_OID, UUID, uuid4, uuid5
+
+from loguru import logger
+from pydantic import BaseModel
+
+
+def generate_deterministic_id(
+    user_id: str | None, key_values: list[str] | str
+) -> UUID:
+    """
+    Generate a deterministic UUID from user_id and business key(s).
+
+    Convention: If a business key field exists (name, uri, etc.), the ID should be
+    deterministic based on user_id + key_value(s). This allows upserts to work
+    from the business key rather than requiring explicit ID management.
+
+    Composite Keys:
+    - For composite keys (e.g., uri + ordinal), pass a list of values
+    - Values are concatenated with a ":" separator
+
+    Args:
+        user_id: User identifier (tenant scoped)
+        key_values: Business key value(s) - a single string, or a list for composite keys
+
+    Returns:
+        Deterministic UUID v5 based on user_id + key(s)
+
+    Examples:
+        >>> id1 = generate_deterministic_id("user-123", "my-document")
+        >>> id2 = generate_deterministic_id("user-123", "my-document")
+        >>> id1 == id2
+        True
+        >>> # Composite key
+        >>> id3 = generate_deterministic_id("user-123", ["docs://file.pdf", "0"])
+        >>> id4 = generate_deterministic_id("user-123", ["docs://file.pdf", "0"])
+        >>> id3 == id4
+        True
+    """
+    # Derive a per-user namespace from user_id (fall back to "system" if no user)
+    namespace_str = user_id or "system"
+    namespace = uuid5(NAMESPACE_OID, namespace_str)
+
+    # Handle composite keys
+    if isinstance(key_values, list):
+        composite_key = ":".join(str(v) for v in key_values)
+    else:
+        composite_key = str(key_values)
+
+    # Generate a deterministic UUID from the business key
+    return uuid5(namespace, composite_key)
+
+
+def prepare_record_for_upsert(
+    record: BaseModel,
+    model: Type[BaseModel],
+    entity_key_field: str | None = None,
+) -> dict[str, Any]:
+    """
+    Prepare a Pydantic record for database upsert.
+
+    ID Generation Convention:
+    - If entity_key_field is provided: generate a deterministic ID from user_id + key
+    - Otherwise: generate a random UUID v4
+
+    This allows business key-based upserts where the same user + key always gets
+    the same ID, enabling natural upsert behavior.
+
+    Args:
+        record: Pydantic model instance
+        model: Pydantic model class
+        entity_key_field: Optional business key field name (name, uri, etc.)
+
+    Returns:
+        Dict with field values ready for SQL insertion
+
+    Example:
+        >>> from rem.models.entities import Resource
+        >>> resource = Resource(name="test", content="data", tenant_id="acme", user_id="sarah")
+        >>> data = prepare_record_for_upsert(resource, Resource, entity_key_field="name")
+        >>> "id" in data  # ID generated from user_id + name
+        True
+    """
+    # Convert to dict
+    data = record.model_dump()
+
+    # Generate ID based on convention
+    if "id" not in data or data["id"] is None:
+        user_id = data.get("user_id")
+
+        # Collect composite key fields (fields with composite_key=True in json_schema_extra)
+        composite_key_fields = []
+        for field_name, field_info in model.model_fields.items():
+            json_extra = getattr(field_info, "json_schema_extra", None)
+            if json_extra and isinstance(json_extra, dict):
+                if json_extra.get("composite_key") is True:
+                    composite_key_fields.append(field_name)
+
+        # Check if we have a business key field
+        if entity_key_field and entity_key_field in data:
+            key_value = data[entity_key_field]
+
+            if key_value:
+                # Build a composite key if additional key fields exist
+                if composite_key_fields:
+                    key_values = [str(key_value)]
+                    for comp_field in composite_key_fields:
+                        if comp_field in data:
+                            key_values.append(str(data[comp_field]))
+
+                    # Generate a deterministic ID from the composite key
+                    data["id"] = generate_deterministic_id(user_id, key_values)
+                    logger.debug(
+                        f"Generated deterministic ID from composite key: "
+                        f"{entity_key_field}={key_value} + {composite_key_fields}"
+                    )
+                else:
+                    # Single business key
+                    data["id"] = generate_deterministic_id(user_id, str(key_value))
+                    logger.debug(
+                        f"Generated deterministic ID from {entity_key_field}={key_value}"
+                    )
+            else:
+                # Key field is None, use a random UUID
+                data["id"] = uuid4()
+        else:
+            # No business key, use a random UUID
+            data["id"] = uuid4()
+
+    # Serialize UUID ids as strings
+    if "id" in data and isinstance(data["id"], UUID):
+        data["id"] = str(data["id"])
+
+    # JSONB fields: asyncpg handles dict/list serialization automatically.
+    # DO NOT convert to JSON strings - asyncpg expects native Python types;
+    # PostgreSQL JSONB columns work with Python dicts/lists directly.
+
+    # Normalize datetime fields to be timezone-naive (PostgreSQL TIMESTAMP WITHOUT TIME ZONE)
+    for field_name, field_value in data.items():
+        if isinstance(field_value, datetime) and field_value.tzinfo is not None:
+            # Convert timezone-aware datetimes to UTC before dropping tzinfo
+            data[field_name] = field_value.astimezone(timezone.utc).replace(tzinfo=None)
+
+    # Remove None values for optional fields (let the DB apply defaults).
+    # Keep None for required fields to trigger NOT NULL constraints.
+    # Also filter out fields that don't exist in the DB schema (tags, column).
+    SKIP_FIELDS = {"tags", "column"}  # CoreModel fields not in DB schema
+
+    cleaned_data = {}
+    for field_name, field_value in data.items():
+        # Skip fields that aren't in the DB schema
+        if field_name in SKIP_FIELDS:
+            continue
+
+        field_info = model.model_fields.get(field_name)
+        if field_info is not None and field_info.is_required():
+            # Keep required fields even if None (will error if truly NULL)
+            cleaned_data[field_name] = field_value
+        elif field_value is not None:
+            # Only include optional fields if they have values
+            cleaned_data[field_name] = field_value
+
+    return cleaned_data
+
+
+def batch_iterator(items: list, batch_size: int = 100):
+    """
+    Iterate over items in batches.
+
+    Args:
+        items: List of items to batch
+        batch_size: Size of each batch
+
+    Yields:
+        Batches of items
+
+    Example:
+        >>> items = list(range(250))
+        >>> batches = list(batch_iterator(items, 100))
+        >>> len(batches)
+        3
+        >>> len(batches[0])
+        100
+        >>> len(batches[2])
+        50
+    """
+    for i in range(0, len(items), batch_size):
+        yield items[i : i + batch_size]
+
+
+async def generate_embeddings_stub(
+    records: list[BaseModel],
+    table_name: str,
+    embeddable_fields: list[str],
+    provider: str = "openai",
+    model: str = "text-embedding-3-small",
+) -> list[dict]:
+    """
+    Generate embeddings for record fields (STUBBED).
+
+    This is a placeholder for the actual embedding generation logic.
+    Will be implemented to:
+    1. Extract text from embeddable fields
+    2. Call the OpenAI/Anthropic API in batch
+    3. Return embedding records for upsert
+
+    Args:
+        records: List of Pydantic records
+        table_name: Name of the entity table
+        embeddable_fields: List of field names to embed
+        provider: Embedding provider (openai, cohere, etc.)
+        model: Embedding model name
+
+    Returns:
+        List of embedding records (id, entity_id, field_name, provider, model, embedding)
+
+    TODO:
+    - Implement OpenAI batch embedding API call
+    - Handle rate limiting and retries
+    - Support multiple providers
+    - Cache embeddings to avoid regeneration
+    """
+    logger.warning(
+        f"Embedding generation is stubbed for {table_name} "
+        f"with {len(records)} records and fields {embeddable_fields}"
+    )
+
+    # STUB: Return an empty list for now.
+    # When implemented, this will return records like:
+    # [
+    #     {
+    #         "entity_id": record.id,
+    #         "field_name": "content",
+    #         "provider": "openai",
+    #         "model": "text-embedding-3-small",
+    #         "embedding": [0.1, 0.2, ...],  # 1536 dimensions
+    #     }
+    # ]
+    return []
+
+
+def validate_record_for_kv_store(
+    record: BaseModel,
+    entity_key_field: str,
+    tenant_id: str | None = None,
+) -> tuple[bool, str]:
+    """
+    Validate that a record has the required fields for KV store population.
+
+    Args:
+        record: Pydantic model instance
+        entity_key_field: Field name to use as entity_key
+        tenant_id: Optional tenant_id override
+
+    Returns:
+        Tuple of (is_valid, error_message)
+
+    Example:
+        >>> from rem.models.entities import Resource
+        >>> resource = Resource(name="test", content="data", tenant_id="acme")
+        >>> valid, msg = validate_record_for_kv_store(resource, "name", "acme")
+        >>> valid
+        True
+    """
+    # Check that the entity_key field exists and has a value
+    if not hasattr(record, entity_key_field):
+        return False, f"Record missing entity_key field: {entity_key_field}"
+
+    entity_key_value = getattr(record, entity_key_field)
+    if not entity_key_value:
+        return False, f"Entity key field '{entity_key_field}' is empty"
+
+    # Check tenant_id (either on the record or provided)
+    record_tenant_id = getattr(record, "tenant_id", None)
+    effective_tenant_id = tenant_id or record_tenant_id
+
+    if not effective_tenant_id:
+        return False, "Record must have tenant_id for KV store"
+
+    return True, ""
+
+
+def build_upsert_statement(
+    table_name: str,
+    field_names: list[str],
+    conflict_column: str = "id",
+) -> str:
+    """
+    Build a PostgreSQL UPSERT statement with quoted column identifiers.
+
+    PostgreSQL reserved keywords (like "column", "user", "order") must be quoted,
+    so all column identifiers are quoted. Note that table_name is interpolated
+    unquoted and must come from trusted code, never from user input.
+
+    Args:
+        table_name: Name of the table
+        field_names: List of field names to insert
+        conflict_column: Column to use for conflict detection
+
+    Returns:
+        SQL UPSERT statement with placeholders
+
+    Example:
+        >>> sql = build_upsert_statement("resources", ["id", "name", "content"])
+        >>> "ON CONFLICT" in sql
+        True
+        >>> "DO UPDATE SET" in sql
+        True
+    """
+    # Quote all column identifiers to handle reserved keywords like "column"
+    quoted_fields = [f'"{field}"' for field in field_names]
+    placeholders = [f"${i + 1}" for i in range(len(field_names))]
+
+    # Exclude the conflict column from the UPDATE clause
+    update_fields = [f for f in field_names if f != conflict_column]
+    update_set = ", ".join(f'"{field}" = EXCLUDED."{field}"' for field in update_fields)
+
+    sql = f"""
+    INSERT INTO {table_name} ({", ".join(quoted_fields)})
+    VALUES ({", ".join(placeholders)})
+    ON CONFLICT ("{conflict_column}")
+    DO UPDATE SET {update_set}
+    RETURNING *;
+    """
+
+    return sql.strip()
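These helpers are designed to compose: prepare each record, derive one statement from the union of the prepared columns, and write in batches. Below is a minimal sketch of that flow against asyncpg; the `resources` table name, the `upsert_resources` wrapper, and the transaction-per-batch layout are illustrative assumptions, not code taken from PostgresService.

    import asyncpg  # assumed driver; the package's real call sites live in rem.services.postgres

    from rem.models.entities import Resource
    from rem.utils.batch_ops import (
        batch_iterator,
        build_upsert_statement,
        prepare_record_for_upsert,
    )

    async def upsert_resources(conn: asyncpg.Connection, resources: list[Resource]) -> None:
        # Prepare every record up front. prepare_record_for_upsert drops None-valued
        # optional fields, so we take the union of columns and default missing ones
        # to None (a simplification: explicit NULLs override DB column defaults).
        rows = [prepare_record_for_upsert(r, Resource, entity_key_field="name") for r in resources]
        if not rows:
            return

        field_names = sorted({name for row in rows for name in row})
        sql = build_upsert_statement("resources", field_names)

        for batch in batch_iterator(rows, batch_size=100):
            async with conn.transaction():
                for row in batch:
                    # One round-trip per row for clarity; a production path could use
                    # conn.executemany with the RETURNING clause stripped from the SQL.
                    await conn.fetchrow(sql, *(row.get(name) for name in field_names))

Because the IDs come from generate_deterministic_id, re-running this on the same records hits the ON CONFLICT branch and updates in place rather than inserting duplicates.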
rem/utils/chunking.py ADDED
@@ -0,0 +1,108 @@
+"""
+Text chunking utilities for document storage.
+
+**ARCHITECTURE NOTE - Chunking vs Token Counting**:
+
+This module handles SEMANTIC CHUNKING for document storage:
+- Character-based limits (not tokens!)
+- Respects document structure (paragraphs, sections)
+- Creates 2-3 paragraph chunks for searchable resources
+- Stored in the database with embeddings
+
+TikToken (token counting) is used ELSEWHERE for LLM context management:
+- Agent flows preparing prompts (agentic/*)
+- Context window limits (128K tokens, 200K tokens)
+- Agentic chunking for large inputs (utils/agentic_chunking.py)
+
+DO NOT use tiktoken here - document chunks are storage units, not LLM inputs!
+"""
+
+import re
+
+from rem.settings import settings
+
+
+def chunk_text(text: str) -> list[str]:
+    """
+    Chunk text using semantic character-based chunking.
+
+    **IMPORTANT**: Uses CHARACTER limits, NOT tokens. This creates storage chunks
+    for database/embeddings. Token counting happens later in agent flows when
+    preparing LLM prompts.
+
+    Chunking strategy:
+    1. Split on double newlines (paragraph boundaries) - PRIMARY
+    2. Split oversized paragraphs on sentence endings (. ! ?)
+    3. A single sentence longer than max_chunk_size is kept as one oversized chunk
+
+    This creates natural 2-3 paragraph chunks suitable for semantic search.
+
+    Args:
+        text: Text to chunk
+
+    Returns:
+        List of text chunks (typically 10-50 chunks per document)
+
+    Example:
+        >>> text = "\\n\\n".join([f"Paragraph {i}. " + "Sentence. " * 20 for i in range(10)])
+        >>> chunks = chunk_text(text)  # ~10 paragraphs -> a handful of chunks
+        >>> 0 < len(chunks) < 100
+        True
+    """
+    if not text or not text.strip():
+        return []
+
+    chunks = []
+    current_chunk: list[str] = []
+    current_size = 0
+
+    # Split by paragraphs (double newline) first
+    paragraphs = re.split(r'\n\n+', text)
+
+    for para in paragraphs:
+        para = para.strip()
+        if not para:
+            continue
+
+        para_len = len(para)
+
+        # If adding this paragraph would exceed the target size, flush the current chunk
+        if current_size > 0 and current_size + para_len + 2 > settings.chunking.chunk_size:
+            joined = '\n\n'.join(current_chunk)
+            if len(joined) >= settings.chunking.min_chunk_size:
+                chunks.append(joined)
+            current_chunk = []
+            current_size = 0
+
+        # If the paragraph itself is too large, split it on sentences
+        if para_len > settings.chunking.max_chunk_size:
+            # re.split with a capture group alternates sentence / delimiter
+            sentences = re.split(r'([.!?]+\s+)', para)
+            sentence_chunk = ""
+
+            for i in range(0, len(sentences), 2):
+                sentence = sentences[i]
+                delimiter = sentences[i + 1] if i + 1 < len(sentences) else ""
+
+                if len(sentence_chunk) + len(sentence) + len(delimiter) > settings.chunking.max_chunk_size:
+                    if sentence_chunk:
+                        chunks.append(sentence_chunk.strip())
+                    sentence_chunk = sentence + delimiter
+                else:
+                    sentence_chunk += sentence + delimiter
+
+            if sentence_chunk.strip() and len(sentence_chunk) >= settings.chunking.min_chunk_size:
+                chunks.append(sentence_chunk.strip())
+        else:
+            # Add the paragraph to the current chunk
+            current_chunk.append(para)
+            current_size += para_len + 2  # +2 for the \n\n added when joining
+
+    # Flush the remaining chunk
+    if current_chunk:
+        joined = '\n\n'.join(current_chunk)
+        if len(joined) >= settings.chunking.min_chunk_size:
+            chunks.append(joined)
+
+    return chunks
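For a sense of the output, here is a small usage sketch. The paragraph text is synthetic, and the resulting chunk sizes depend on the configured `settings.chunking` thresholds (`chunk_size`, `min_chunk_size`, `max_chunk_size`), which this module reads but does not define.

    from rem.utils.chunking import chunk_text

    # Synthetic ~10-paragraph document; real inputs are full document texts
    doc = "\n\n".join(f"Paragraph {i}. " + "Filler sentence here. " * 20 for i in range(10))

    for n, chunk in enumerate(chunk_text(doc)):
        # Each chunk is a character-bounded group of whole paragraphs (or sentence
        # splits when a single paragraph exceeds max_chunk_size)
        print(n, len(chunk), repr(chunk[:40]))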