remdb-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of remdb might be problematic.

Files changed (187)
  1. rem/__init__.py +2 -0
  2. rem/agentic/README.md +650 -0
  3. rem/agentic/__init__.py +39 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +8 -0
  6. rem/agentic/context.py +148 -0
  7. rem/agentic/context_builder.py +329 -0
  8. rem/agentic/mcp/__init__.py +0 -0
  9. rem/agentic/mcp/tool_wrapper.py +107 -0
  10. rem/agentic/otel/__init__.py +5 -0
  11. rem/agentic/otel/setup.py +151 -0
  12. rem/agentic/providers/phoenix.py +674 -0
  13. rem/agentic/providers/pydantic_ai.py +572 -0
  14. rem/agentic/query.py +117 -0
  15. rem/agentic/query_helper.py +89 -0
  16. rem/agentic/schema.py +396 -0
  17. rem/agentic/serialization.py +245 -0
  18. rem/agentic/tools/__init__.py +5 -0
  19. rem/agentic/tools/rem_tools.py +231 -0
  20. rem/api/README.md +420 -0
  21. rem/api/main.py +324 -0
  22. rem/api/mcp_router/prompts.py +182 -0
  23. rem/api/mcp_router/resources.py +536 -0
  24. rem/api/mcp_router/server.py +213 -0
  25. rem/api/mcp_router/tools.py +584 -0
  26. rem/api/routers/auth.py +229 -0
  27. rem/api/routers/chat/__init__.py +5 -0
  28. rem/api/routers/chat/completions.py +281 -0
  29. rem/api/routers/chat/json_utils.py +76 -0
  30. rem/api/routers/chat/models.py +124 -0
  31. rem/api/routers/chat/streaming.py +185 -0
  32. rem/auth/README.md +258 -0
  33. rem/auth/__init__.py +26 -0
  34. rem/auth/middleware.py +100 -0
  35. rem/auth/providers/__init__.py +13 -0
  36. rem/auth/providers/base.py +376 -0
  37. rem/auth/providers/google.py +163 -0
  38. rem/auth/providers/microsoft.py +237 -0
  39. rem/cli/README.md +455 -0
  40. rem/cli/__init__.py +8 -0
  41. rem/cli/commands/README.md +126 -0
  42. rem/cli/commands/__init__.py +3 -0
  43. rem/cli/commands/ask.py +566 -0
  44. rem/cli/commands/configure.py +497 -0
  45. rem/cli/commands/db.py +493 -0
  46. rem/cli/commands/dreaming.py +324 -0
  47. rem/cli/commands/experiments.py +1302 -0
  48. rem/cli/commands/mcp.py +66 -0
  49. rem/cli/commands/process.py +245 -0
  50. rem/cli/commands/schema.py +183 -0
  51. rem/cli/commands/serve.py +106 -0
  52. rem/cli/dreaming.py +363 -0
  53. rem/cli/main.py +96 -0
  54. rem/config.py +237 -0
  55. rem/mcp_server.py +41 -0
  56. rem/models/core/__init__.py +49 -0
  57. rem/models/core/core_model.py +64 -0
  58. rem/models/core/engram.py +333 -0
  59. rem/models/core/experiment.py +628 -0
  60. rem/models/core/inline_edge.py +132 -0
  61. rem/models/core/rem_query.py +243 -0
  62. rem/models/entities/__init__.py +43 -0
  63. rem/models/entities/file.py +57 -0
  64. rem/models/entities/image_resource.py +88 -0
  65. rem/models/entities/message.py +35 -0
  66. rem/models/entities/moment.py +123 -0
  67. rem/models/entities/ontology.py +191 -0
  68. rem/models/entities/ontology_config.py +131 -0
  69. rem/models/entities/resource.py +95 -0
  70. rem/models/entities/schema.py +87 -0
  71. rem/models/entities/user.py +85 -0
  72. rem/py.typed +0 -0
  73. rem/schemas/README.md +507 -0
  74. rem/schemas/__init__.py +6 -0
  75. rem/schemas/agents/README.md +92 -0
  76. rem/schemas/agents/core/moment-builder.yaml +178 -0
  77. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  78. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  79. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  80. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  81. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  82. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  83. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  84. rem/schemas/agents/examples/hello-world.yaml +37 -0
  85. rem/schemas/agents/examples/query.yaml +54 -0
  86. rem/schemas/agents/examples/simple.yaml +21 -0
  87. rem/schemas/agents/examples/test.yaml +29 -0
  88. rem/schemas/agents/rem.yaml +128 -0
  89. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  90. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  91. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  92. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  93. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  94. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  95. rem/services/__init__.py +16 -0
  96. rem/services/audio/INTEGRATION.md +308 -0
  97. rem/services/audio/README.md +376 -0
  98. rem/services/audio/__init__.py +15 -0
  99. rem/services/audio/chunker.py +354 -0
  100. rem/services/audio/transcriber.py +259 -0
  101. rem/services/content/README.md +1269 -0
  102. rem/services/content/__init__.py +5 -0
  103. rem/services/content/providers.py +806 -0
  104. rem/services/content/service.py +676 -0
  105. rem/services/dreaming/README.md +230 -0
  106. rem/services/dreaming/__init__.py +53 -0
  107. rem/services/dreaming/affinity_service.py +336 -0
  108. rem/services/dreaming/moment_service.py +264 -0
  109. rem/services/dreaming/ontology_service.py +54 -0
  110. rem/services/dreaming/user_model_service.py +297 -0
  111. rem/services/dreaming/utils.py +39 -0
  112. rem/services/embeddings/__init__.py +11 -0
  113. rem/services/embeddings/api.py +120 -0
  114. rem/services/embeddings/worker.py +421 -0
  115. rem/services/fs/README.md +662 -0
  116. rem/services/fs/__init__.py +62 -0
  117. rem/services/fs/examples.py +206 -0
  118. rem/services/fs/examples_paths.py +204 -0
  119. rem/services/fs/git_provider.py +935 -0
  120. rem/services/fs/local_provider.py +760 -0
  121. rem/services/fs/parsing-hooks-examples.md +172 -0
  122. rem/services/fs/paths.py +276 -0
  123. rem/services/fs/provider.py +460 -0
  124. rem/services/fs/s3_provider.py +1042 -0
  125. rem/services/fs/service.py +186 -0
  126. rem/services/git/README.md +1075 -0
  127. rem/services/git/__init__.py +17 -0
  128. rem/services/git/service.py +469 -0
  129. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  130. rem/services/phoenix/README.md +453 -0
  131. rem/services/phoenix/__init__.py +46 -0
  132. rem/services/phoenix/client.py +686 -0
  133. rem/services/phoenix/config.py +88 -0
  134. rem/services/phoenix/prompt_labels.py +477 -0
  135. rem/services/postgres/README.md +575 -0
  136. rem/services/postgres/__init__.py +23 -0
  137. rem/services/postgres/migration_service.py +427 -0
  138. rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
  139. rem/services/postgres/register_type.py +352 -0
  140. rem/services/postgres/repository.py +337 -0
  141. rem/services/postgres/schema_generator.py +379 -0
  142. rem/services/postgres/service.py +802 -0
  143. rem/services/postgres/sql_builder.py +354 -0
  144. rem/services/rem/README.md +304 -0
  145. rem/services/rem/__init__.py +23 -0
  146. rem/services/rem/exceptions.py +71 -0
  147. rem/services/rem/executor.py +293 -0
  148. rem/services/rem/parser.py +145 -0
  149. rem/services/rem/queries.py +196 -0
  150. rem/services/rem/query.py +371 -0
  151. rem/services/rem/service.py +527 -0
  152. rem/services/session/README.md +374 -0
  153. rem/services/session/__init__.py +6 -0
  154. rem/services/session/compression.py +360 -0
  155. rem/services/session/reload.py +77 -0
  156. rem/settings.py +1235 -0
  157. rem/sql/002_install_models.sql +1068 -0
  158. rem/sql/background_indexes.sql +42 -0
  159. rem/sql/install_models.sql +1038 -0
  160. rem/sql/migrations/001_install.sql +503 -0
  161. rem/sql/migrations/002_install_models.sql +1202 -0
  162. rem/utils/AGENTIC_CHUNKING.md +597 -0
  163. rem/utils/README.md +583 -0
  164. rem/utils/__init__.py +43 -0
  165. rem/utils/agentic_chunking.py +622 -0
  166. rem/utils/batch_ops.py +343 -0
  167. rem/utils/chunking.py +108 -0
  168. rem/utils/clip_embeddings.py +276 -0
  169. rem/utils/dict_utils.py +98 -0
  170. rem/utils/embeddings.py +423 -0
  171. rem/utils/examples/embeddings_example.py +305 -0
  172. rem/utils/examples/sql_types_example.py +202 -0
  173. rem/utils/markdown.py +16 -0
  174. rem/utils/model_helpers.py +236 -0
  175. rem/utils/schema_loader.py +336 -0
  176. rem/utils/sql_types.py +348 -0
  177. rem/utils/user_id.py +81 -0
  178. rem/utils/vision.py +330 -0
  179. rem/workers/README.md +506 -0
  180. rem/workers/__init__.py +5 -0
  181. rem/workers/dreaming.py +502 -0
  182. rem/workers/engram_processor.py +312 -0
  183. rem/workers/sqs_file_processor.py +193 -0
  184. remdb-0.3.0.dist-info/METADATA +1455 -0
  185. remdb-0.3.0.dist-info/RECORD +187 -0
  186. remdb-0.3.0.dist-info/WHEEL +4 -0
  187. remdb-0.3.0.dist-info/entry_points.txt +2 -0
@@ -0,0 +1,597 @@
# Agentic Chunking

Token-aware chunking for agent inputs that exceed model context windows.

## Overview

When processing large documents, datasets, or session histories with LLM agents, you may encounter context window limits. Agentic chunking solves this by:

1. **Splitting** large inputs into token-aware chunks
2. **Processing** each chunk independently with the same agent
3. **Merging** results using configurable strategies

## Key Features

- **Tiktoken Integration**: Exact token counting for OpenAI models
- **Character Heuristic Fallback**: ~4 chars/token estimate for other providers
- **Model Limits Database**: Pre-configured limits for GPT, Claude, Gemini
- **Smart Chunking**: Preserves line/word boundaries to avoid splitting mid-sentence
- **Merge Strategies**: Concatenate lists, deep merge JSON, or use LLM for intelligent merging

## Quick Start (Recommended: Smart Chunking)

```python
from rem.utils.agentic_chunking import (
    smart_chunk_text,  # Recommended - auto-sizes based on model
    merge_results,
    MergeStrategy,
)

# Smart chunking - automatically handles sizing
chunks = smart_chunk_text(cv_text, model="gpt-4o")

# For most CVs/resumes: chunks = [full_cv] (no chunking needed!)
# For huge documents: automatically splits optimally

# Process each chunk with agent
results = []
for chunk in chunks:
    result = await agent.run(chunk)
    results.append(result.output.model_dump())  # Always serialize!

# Merge results (no-op if single chunk)
merged = merge_results(results, strategy=MergeStrategy.CONCATENATE_LIST)
```

## Quick Start (Manual Chunking)

```python
from rem.utils.agentic_chunking import (
    chunk_text,
    merge_results,
    MergeStrategy,
    get_model_limits,
    estimate_tokens,
)

# Check model limits
limits = get_model_limits("gpt-4o")
print(f"Max input tokens: {limits.max_input}")  # 111616

# Estimate tokens in text
text_tokens = estimate_tokens(large_document, model="gpt-4o")
print(f"Document: {text_tokens} tokens")

# Chunk if necessary
if text_tokens > limits.max_input:
    chunks = chunk_text(large_document, max_tokens=100000, model="gpt-4o")
    print(f"Split into {len(chunks)} chunks")
else:
    chunks = [large_document]

# Process each chunk with agent
results = []
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}/{len(chunks)}")
    result = await agent.run(chunk)
    results.append(result.output.model_dump())  # Always serialize!

# Merge results
merged = merge_results(results, strategy=MergeStrategy.CONCATENATE_LIST)
```

## Model Limits

Pre-configured context limits for major LLM providers:

| Model | Max Context | Max Output | Max Input |
|-------|-------------|------------|-----------|
| gpt-4o | 128K | 16K | 112K |
| gpt-4o-mini | 128K | 16K | 112K |
| o1 | 200K | 100K | 100K |
| claude-sonnet-4 | 200K | 8K | 192K |
| claude-3-5-sonnet | 200K | 8K | 192K |
| gemini-2.0-flash-exp | 1M | 8K | 992K |
| gemini-1.5-pro | 2M | 8K | 1.992M |

**Fuzzy Matching**: Models are matched by family (e.g., "gpt-4o-2024-05-13" → gpt-4o limits)

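A minimal sketch of the fuzzy matching described above. It only relies on `get_model_limits()` and the `max_input` attribute shown in the Quick Start; any behavior for models not in the table is an assumption to verify against `agentic_chunking.py`.

```python
from rem.utils.agentic_chunking import get_model_limits

# A dated snapshot should resolve to its family's limits
limits = get_model_limits("gpt-4o-2024-05-13")
print(limits.max_input)  # same input budget as the plain "gpt-4o" entry

# For unlisted or self-hosted models, check the fallback behavior in
# agentic_chunking.py before relying on the returned limits (assumption)
```
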
## Smart vs Manual Chunking

### Smart Chunking (Recommended)

**Use `smart_chunk_text()` for automatic, intelligent chunking:**

```python
chunks = smart_chunk_text(text, model="gpt-4o")
```

**Benefits:**
- ✅ Automatically calculates optimal chunk size from model limits
- ✅ Typical CVs/resumes fit in a single chunk (no unnecessary splitting!)
- ✅ Accounts for system prompt overhead
- ✅ Configurable buffer ratio for safety (see the sizing sketch below)
- ✅ Model-aware (adapts to GPT-4o, Claude, Gemini limits)

**When to use:**
- Processing user documents (CVs, reports, articles)
- When you want maximum utilization of model context
- When chunk size optimization is important

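To make the sizing behavior concrete, here is a rough sketch of how the per-chunk budget could be derived from the pieces listed above (model limits, system prompt overhead, buffer ratio). This is an illustration of the idea, not the actual logic in `agentic_chunking.py`; the helper name and exact arithmetic are assumptions.

```python
from rem.utils.agentic_chunking import estimate_tokens, get_model_limits

def illustrative_chunk_budget(model: str, system_prompt: str = "", buffer_ratio: float = 0.75) -> int:
    """Hypothetical sketch of the per-chunk budget smart_chunk_text() aims for."""
    limits = get_model_limits(model)
    prompt_overhead = estimate_tokens(system_prompt, model=model) if system_prompt else 0
    # Keep a safety margin (default 75% of max input) and reserve room for the system prompt
    return int(limits.max_input * buffer_ratio) - prompt_overhead

print(illustrative_chunk_budget("gpt-4o"))  # roughly 75% of gpt-4o's input budget
```
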
### Manual Chunking

**Use `chunk_text()` when you need explicit control:**

```python
chunks = chunk_text(text, max_tokens=1000, model="gpt-4o")
```

**Benefits:**
- ✅ Explicit control over chunk size
- ✅ Useful for testing with small chunks
- ✅ Good for constrained environments (rate limits, cost control)

**When to use:**
- Testing/development with small chunks
- Rate limit constraints (process X tokens/hour)
- Cost optimization (smaller chunks = predictable costs)
- Specific requirements (e.g., "split every 10K tokens")

### Comparison

| Feature | smart_chunk_text() | chunk_text() |
|---------|-------------------|--------------|
| **Chunk size** | Auto-calculated from model limits | Manual specification |
| **CV handling** | Single chunk (no splitting) | May split unnecessarily |
| **System prompt** | Automatically accounted | Must calculate manually |
| **Model-aware** | Yes (adapts to context windows) | No (fixed max_tokens) |
| **Buffer safety** | Configurable (default 75%) | Must calculate manually |
| **Use case** | Production, real documents | Testing, constraints |

## Token Estimation

### OpenAI Models (Exact)

Uses tiktoken for precise token counting:

```python
from rem.utils.agentic_chunking import estimate_tokens

tokens = estimate_tokens("Hello, world!", model="gpt-4o")
# Returns: 4 (exact count via tiktoken)
```

### Other Models (Heuristic)

Falls back to character-based estimation (~4 chars/token + 5% overhead):

```python
tokens = estimate_tokens("Hello, world!", model="claude-sonnet-4")
# Returns: 3 (heuristic estimate)
```

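The heuristic is simple enough to check by hand. The sketch below applies the stated approximation (~4 characters per token plus ~5% overhead); the exact rounding inside `agentic_chunking.py` may differ, so treat it as an approximation rather than the implementation.

```python
text = "Hello, world!"                    # 13 characters
approx_tokens = int(len(text) / 4 * 1.05)  # ~4 chars/token, +5% overhead -> 3
```
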
## Chunking Strategies

### Line-Preserving (Default)

Chunks by lines, preserving line boundaries:

```python
chunks = chunk_text(text, max_tokens=1000, model="gpt-4o", preserve_lines=True)
```

- Splits at `\n` boundaries
- Falls back to character chunking for oversized lines
- Best for structured text (code, markdown, logs)

### Character-Based

Chunks by characters with word boundary preservation:

```python
chunks = chunk_text(text, max_tokens=1000, model="gpt-4o", preserve_lines=False)
```

- Tries to break at spaces
- Useful for prose without newlines

## Merge Strategies

### 1. Concatenate List (Default)

**When to use**: Most structured extraction tasks (lists of items, entities, facts)

**Behavior**:
- Lists: Concatenate (`[1, 2]` + `[3, 4]` → `[1, 2, 3, 4]`)
- Dicts: Update (shallow merge)
- Scalars: Keep first non-None value

**Example**:
```python
results = [
    {"skills": ["Python", "SQL"], "experience_years": 5},
    {"skills": ["Docker", "K8s"], "experience_years": 3}
]

merged = merge_results(results, strategy=MergeStrategy.CONCATENATE_LIST)
# {"skills": ["Python", "SQL", "Docker", "K8s"], "experience_years": 5}
```

### 2. Deep JSON Merge

**When to use**: Nested object structures with hierarchies

**Behavior**:
- Lists: Concatenate
- Dicts: Recursively deep merge
- Scalars: Keep first non-None value

**Example**:
```python
results = [
    {"contract": {"parties": ["Alice"], "terms": {"duration": "1 year"}}},
    {"contract": {"parties": ["Bob"], "terms": {"renewal": "auto"}}}
]

merged = merge_results(results, strategy=MergeStrategy.MERGE_JSON)
# {
#     "contract": {
#         "parties": ["Alice", "Bob"],
#         "terms": {"duration": "1 year", "renewal": "auto"}
#     }
# }
```

### 3. LLM Merge (TODO)

**When to use**: Complex semantic merging requiring intelligence

**Behavior**: Use LLM to intelligently merge results (not yet implemented)

## Real-World Examples

### Example 1: Extract Skills from Long CV

```python
from rem.utils.agentic_chunking import smart_chunk_text, merge_results, MergeStrategy
from rem.agentic.providers.pydantic_ai import create_pydantic_ai_agent

# Long CV document
cv_text = load_cv_file("john-doe-cv.txt")  # 5K tokens (typical CV)

# Smart chunking - automatically sizes based on model
# For typical CVs: will return single chunk (no splitting!)
chunks = smart_chunk_text(cv_text, model="gpt-4o")

print(f"Processing CV in {len(chunks)} chunk(s)")
# Output: Processing CV in 1 chunk(s)

# Create agent (using existing schema)
agent = await create_pydantic_ai_agent(
    context=context,
    agent_schema_uri="cv-parser-v1"
)

# Process each chunk
results = []
for i, chunk in enumerate(chunks):
    result = await agent.run(chunk)
    # CRITICAL: Serialize Pydantic models!
    results.append(result.output.model_dump())

# Merge extracted skills (no-op if single chunk)
merged = merge_results(results, strategy=MergeStrategy.CONCATENATE_LIST)

print(f"Total skills found: {len(merged['skills'])}")
# Output: Total skills found: 12
```

### Example 2: Analyze Multi-Page Contract

```python
from rem.utils.agentic_chunking import smart_chunk_text, merge_results, MergeStrategy
from rem.agentic.providers.pydantic_ai import create_pydantic_ai_agent

# Large contract (120 pages, 80K tokens)
contract_text = load_contract("partnership-agreement.pdf")

# Smart chunking with system prompt awareness
system_prompt = """You are a contract analyzer. Extract parties, terms,
obligations, and risk flags from this legal agreement."""

chunks = smart_chunk_text(
    contract_text,
    model="claude-sonnet-4",  # 200K context
    system_prompt=system_prompt,
    buffer_ratio=0.75
)

print(f"Contract split into {len(chunks)} chunk(s)")
# For 80K tokens: likely 1 chunk (Claude has 200K context)

# Create contract analyzer agent
agent = await create_pydantic_ai_agent(
    context=context,
    agent_schema_uri="contract-analyzer-v1"
)

# Extract terms from each chunk
results = []
for chunk in chunks:
    result = await agent.run(chunk)
    results.append(result.output.model_dump())

# Deep merge nested contract structure
merged = merge_results(results, strategy=MergeStrategy.MERGE_JSON)

print(f"Parties: {merged['parties']}")
print(f"Key obligations: {len(merged['key_obligations'])}")
print(f"Risk flags: {len(merged['risk_flags'])}")
```

### Example 3: Process User Session History

```python
from rem.utils.agentic_chunking import (
    chunk_text,
    estimate_tokens,
    get_model_limits,
    merge_results,
    MergeStrategy,
)

# User's full session history (many conversations)
session_history = load_user_sessions(user_id="user-123")  # 200K tokens

# Get limits for Gemini (large context)
limits = get_model_limits("gemini-1.5-pro")  # 1.992M tokens

# Check if chunking needed
history_tokens = estimate_tokens(session_history, model="gemini-1.5-pro")

if history_tokens <= limits.max_input:
    # Fits in one shot!
    result = await agent.run(session_history)
else:
    # Need to chunk
    chunks = chunk_text(session_history, max_tokens=500000, model="gemini-1.5-pro")

    results = []
    for chunk in chunks:
        result = await agent.run(chunk)
        results.append(result.output.model_dump())

    # Merge user profile insights
    merged = merge_results(results, strategy=MergeStrategy.CONCATENATE_LIST)
```

## Integration with REM

### Ontology Extraction on Large Files

```python
from rem.utils.agentic_chunking import chunk_text, get_model_limits, merge_results, MergeStrategy
from rem.services.ontology_extractor import extract_from_file

# File, Schema, and Ontology are REM entity models (see rem/models/entities/)

async def extract_from_large_file(
    file: File,
    schema: Schema,
    tenant_id: str
) -> Ontology:
    """Extract ontology from large file using chunking."""

    # Get model from schema provider_configs
    provider = schema.provider_configs[0] if schema.provider_configs else {}
    model = provider.get("model_name", "gpt-4o")

    # Chunk file content if needed
    limits = get_model_limits(model)
    chunks = chunk_text(file.content, max_tokens=int(limits.max_input * 0.75), model=model)

    if len(chunks) == 1:
        # Single chunk - normal extraction
        return await extract_from_file(file, schema, tenant_id)

    # Multi-chunk extraction
    results = []
    for chunk in chunks:
        # Create temporary file for chunk
        chunk_file = File(
            name=f"{file.name} (chunk)",
            content=chunk,
            mime_type=file.mime_type,
            tenant_id=tenant_id
        )

        # Extract from chunk
        result = await extract_from_file(chunk_file, schema, tenant_id)
        results.append(result.extracted_data)

    # Merge extracted data
    merged_data = merge_results(results, strategy=MergeStrategy.CONCATENATE_LIST)

    # Create final ontology
    return Ontology(
        name=file.name,
        file_id=file.id,
        agent_schema_id=schema.id,
        provider_name=provider.get("provider_name"),
        model_name=model,
        extracted_data=merged_data,
        tenant_id=tenant_id
    )
```

### Dreaming Worker with Chunking

```python
from rem.utils.agentic_chunking import chunk_text, merge_results

async def extract_ontologies_with_chunking(
    user_id: str,
    lookback_hours: int = 24,
    limit: int | None = None
):
    """Extract ontologies with automatic chunking for large files."""

    # Load user's files
    files = await query_files(user_id, lookback_hours, limit)

    for file in files:
        # Find matching configs
        configs = await get_matching_configs(file, user_id)

        for config in configs:
            # Load schema
            schema = await load_schema(config.agent_schema_id, user_id)

            # Extract with chunking
            ontology = await extract_from_large_file(file, schema, user_id)

            # Generate embeddings
            embedding_text = extract_fields_for_embedding(
                ontology.extracted_data,
                schema.embedding_fields
            )
            ontology.embedding_text = embedding_text

            # Save
            await ontology_repo.upsert(ontology)
```

## Best Practices

### 1. Always Leave Buffer for System Prompt

```python
# BAD: Use full context window
chunks = chunk_text(text, max_tokens=limits.max_input, model="gpt-4o")

# GOOD: Leave buffer for system prompt, tools, etc.
chunks = chunk_text(text, max_tokens=int(limits.max_input * 0.75), model="gpt-4o")
```

### 2. Serialize Pydantic Models Before Merging

```python
# BAD: Merge Pydantic model instances directly
results = [result1.output, result2.output]  # Pydantic models
merged = merge_results(results)  # May lose fields!

# GOOD: Serialize first
results = [result1.output.model_dump(), result2.output.model_dump()]
merged = merge_results(results)  # All fields preserved
```

### 3. Choose Right Merge Strategy

```python
# Extracting list of items → CONCATENATE_LIST
skills = merge_results(skill_results, MergeStrategy.CONCATENATE_LIST)

# Nested hierarchy → MERGE_JSON
contract = merge_results(contract_results, MergeStrategy.MERGE_JSON)

# Complex semantic merging → LLM_MERGE (future)
summary = merge_results(summary_results, MergeStrategy.LLM_MERGE)
```

### 4. Handle Single Chunk Case

```python
chunks = chunk_text(text, max_tokens=100000, model="gpt-4o")

if len(chunks) == 1:
    # No chunking needed, faster path
    result = await agent.run(chunks[0])
    return result.output.model_dump()
else:
    # Multi-chunk processing
    results = [await agent.run(c) for c in chunks]
    return merge_results([r.output.model_dump() for r in results])
```

### 5. Respect Rate Limits

```python
import asyncio

# Process chunks with rate limiting
results = []
for i, chunk in enumerate(chunks):
    result = await agent.run(chunk)
    results.append(result.output.model_dump())

    # Wait between chunks (e.g., 1 second)
    if i < len(chunks) - 1:
        await asyncio.sleep(1.0)

merged = merge_results(results)
```

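If your provider quota allows some parallelism, the sequential loop above can be swapped for a bounded-concurrency variant using plain `asyncio`. Parallel chunk processing is not a built-in feature (see Future Enhancements), so this is a user-level sketch; the concurrency limit is an assumption to tune against your rate limits.

```python
import asyncio

async def run_chunks_bounded(agent, chunks, max_concurrent: int = 3) -> list[dict]:
    """Sketch: process chunks concurrently while capping in-flight requests."""
    semaphore = asyncio.Semaphore(max_concurrent)

    async def run_one(chunk: str) -> dict:
        async with semaphore:
            result = await agent.run(chunk)
            return result.output.model_dump()  # Always serialize!

    # Results come back in chunk order, ready for merge_results()
    return await asyncio.gather(*(run_one(c) for c in chunks))
```
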
## Performance Considerations

### Token Estimation

- **OpenAI (tiktoken)**: Exact count, ~50ms for 10K tokens
- **Heuristic**: Instant but ~5-10% error margin

### Chunking

- **Line-preserving**: O(n) where n = number of lines
- **Character-based**: O(n) where n = text length
- Both are fast (< 1ms for 100K chars)

### Merging

- **Concatenate**: O(n*m) where n = results, m = avg fields
- **Deep merge**: O(n*m*d) where d = nesting depth
- Both are fast for typical result sizes (< 10ms for 100 results)

## Troubleshooting

### Issue: Chunks Still Too Large

**Symptom**: Agent fails with context length error despite chunking

**Solution**: Reduce buffer ratio or account for multi-turn conversation

```python
# If agent uses multiple tool calls (grows context)
chunks = chunk_text(text, max_tokens=int(limits.max_input * 0.5), model="gpt-4o")
```

### Issue: Lost Fields After Merge

**Symptom**: Fields disappear from merged results

**Solution**: Always serialize Pydantic models with `.model_dump()`

```python
# Before merging
results = [r.output.model_dump() for r in agent_results]
merged = merge_results(results)
```

### Issue: Wrong Token Count

**Symptom**: Estimate significantly off from actual usage

**Solution**: Use tiktoken for OpenAI, increase buffer for others

```python
# For OpenAI: tiktoken is exact
chunks = chunk_text(text, max_tokens=100000, model="gpt-4o")

# For others: use larger buffer (60-70% instead of 75%)
chunks = chunk_text(text, max_tokens=int(limits.max_input * 0.6), model="claude-sonnet-4")
```

## Future Enhancements

- [ ] LLM merge strategy implementation
- [ ] Async parallel chunk processing
- [ ] Progress tracking and cancellation
- [ ] Chunk caching to avoid re-processing
- [ ] Smart section-based chunking for markdown/HTML
- [ ] Integration with semchunk for semantic boundaries

## Related Documentation

- [CLAUDE.md](../../../../CLAUDE.md) - Core design patterns (Pattern #11)
- [agentic_chunking.py](./agentic_chunking.py) - Implementation
- [dict_utils.py](./dict_utils.py) - Field extraction utilities
- [serialization.py](../agentic/serialization.py) - Pydantic serialization helpers