remdb 0.3.242__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of remdb has been flagged as potentially problematic.
- rem/__init__.py +129 -0
- rem/agentic/README.md +760 -0
- rem/agentic/__init__.py +54 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +38 -0
- rem/agentic/agents/agent_manager.py +311 -0
- rem/agentic/agents/sse_simulator.py +502 -0
- rem/agentic/context.py +425 -0
- rem/agentic/context_builder.py +360 -0
- rem/agentic/llm_provider_models.py +301 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +273 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +240 -0
- rem/agentic/providers/phoenix.py +926 -0
- rem/agentic/providers/pydantic_ai.py +854 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +737 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +242 -0
- rem/api/README.md +657 -0
- rem/api/deps.py +253 -0
- rem/api/main.py +460 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +820 -0
- rem/api/mcp_router/server.py +243 -0
- rem/api/mcp_router/tools.py +1605 -0
- rem/api/middleware/tracking.py +172 -0
- rem/api/routers/admin.py +520 -0
- rem/api/routers/auth.py +898 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/child_streaming.py +394 -0
- rem/api/routers/chat/completions.py +702 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +202 -0
- rem/api/routers/chat/otel_utils.py +33 -0
- rem/api/routers/chat/sse_events.py +546 -0
- rem/api/routers/chat/streaming.py +950 -0
- rem/api/routers/chat/streaming_utils.py +327 -0
- rem/api/routers/common.py +18 -0
- rem/api/routers/dev.py +87 -0
- rem/api/routers/feedback.py +276 -0
- rem/api/routers/messages.py +620 -0
- rem/api/routers/models.py +86 -0
- rem/api/routers/query.py +362 -0
- rem/api/routers/shared_sessions.py +422 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +36 -0
- rem/auth/jwt.py +367 -0
- rem/auth/middleware.py +318 -0
- rem/auth/providers/__init__.py +16 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/email.py +215 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +517 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +299 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +549 -0
- rem/cli/commands/cluster.py +1808 -0
- rem/cli/commands/configure.py +495 -0
- rem/cli/commands/db.py +828 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1698 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +388 -0
- rem/cli/commands/query.py +109 -0
- rem/cli/commands/scaffold.py +47 -0
- rem/cli/commands/schema.py +230 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/commands/session.py +453 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +123 -0
- rem/config.py +244 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +70 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +672 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +246 -0
- rem/models/entities/__init__.py +68 -0
- rem/models/entities/domain_resource.py +38 -0
- rem/models/entities/feedback.py +123 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +64 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +181 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/session.py +84 -0
- rem/models/entities/shared_session.py +180 -0
- rem/models/entities/subscriber.py +175 -0
- rem/models/entities/user.py +93 -0
- rem/py.typed +0 -0
- rem/registry.py +373 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/agent-builder.yaml +235 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +132 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +18 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +760 -0
- rem/services/content/service.py +762 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +322 -0
- rem/services/dreaming/moment_service.py +251 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/email/__init__.py +10 -0
- rem/services/email/service.py +522 -0
- rem/services/email/templates.py +360 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +127 -0
- rem/services/embeddings/worker.py +435 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +960 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +757 -0
- rem/services/postgres/__init__.py +49 -0
- rem/services/postgres/diff_service.py +599 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/programmable_diff_service.py +635 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +562 -0
- rem/services/postgres/register_type.py +353 -0
- rem/services/postgres/repository.py +481 -0
- rem/services/postgres/schema_generator.py +661 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +355 -0
- rem/services/rate_limit.py +113 -0
- rem/services/rem/README.md +318 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +180 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +608 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +13 -0
- rem/services/session/compression.py +488 -0
- rem/services/session/pydantic_messages.py +310 -0
- rem/services/session/reload.py +85 -0
- rem/services/user_service.py +130 -0
- rem/settings.py +1877 -0
- rem/sql/background_indexes.sql +52 -0
- rem/sql/migrations/001_install.sql +983 -0
- rem/sql/migrations/002_install_models.sql +3157 -0
- rem/sql/migrations/003_optional_extensions.sql +326 -0
- rem/sql/migrations/004_cache_system.sql +282 -0
- rem/sql/migrations/005_schema_update.sql +145 -0
- rem/sql/migrations/migrate_session_id_to_uuid.sql +45 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +628 -0
- rem/utils/__init__.py +61 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/constants.py +97 -0
- rem/utils/date_utils.py +228 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +436 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/files.py +323 -0
- rem/utils/markdown.py +16 -0
- rem/utils/mime_types.py +158 -0
- rem/utils/model_helpers.py +492 -0
- rem/utils/schema_loader.py +649 -0
- rem/utils/sql_paths.py +146 -0
- rem/utils/sql_types.py +350 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +325 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +7 -0
- rem/workers/db_listener.py +579 -0
- rem/workers/db_maintainer.py +74 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- rem/workers/unlogged_maintainer.py +463 -0
- remdb-0.3.242.dist-info/METADATA +1632 -0
- remdb-0.3.242.dist-info/RECORD +235 -0
- remdb-0.3.242.dist-info/WHEEL +4 -0
- remdb-0.3.242.dist-info/entry_points.txt +2 -0

@@ -0,0 +1,597 @@
# Agentic Chunking

Token-aware chunking for agent inputs that exceed model context windows.

## Overview

When processing large documents, datasets, or session histories with LLM agents, you may encounter context window limits. Agentic chunking solves this by:

1. **Splitting** large inputs into token-aware chunks
2. **Processing** each chunk independently with the same agent
3. **Merging** results using configurable strategies

## Key Features

- **Tiktoken Integration**: Exact token counting for OpenAI models
- **Character Heuristic Fallback**: ~4 chars/token estimate for other providers
- **Model Limits Database**: Pre-configured limits for GPT, Claude, Gemini
- **Smart Chunking**: Preserves line/word boundaries to avoid splitting mid-sentence
- **Merge Strategies**: Concatenate lists, deep merge JSON, or use LLM for intelligent merging

## Quick Start (Recommended: Smart Chunking)

```python
from rem.utils.agentic_chunking import (
    smart_chunk_text,  # Recommended - auto-sizes based on model
    merge_results,
    MergeStrategy,
)

# Smart chunking - automatically handles sizing
chunks = smart_chunk_text(cv_text, model="gpt-4o")

# For most CVs/resumes: chunks = [full_cv] (no chunking needed!)
# For huge documents: automatically splits optimally

# Process each chunk with agent
results = []
for chunk in chunks:
    result = await agent.run(chunk)
    results.append(result.output.model_dump())  # Always serialize!

# Merge results (no-op if single chunk)
merged = merge_results(results, strategy=MergeStrategy.CONCATENATE_LIST)
```

## Quick Start (Manual Chunking)

```python
from rem.utils.agentic_chunking import (
    chunk_text,
    merge_results,
    MergeStrategy,
    get_model_limits,
    estimate_tokens,
)

# Check model limits
limits = get_model_limits("gpt-4o")
print(f"Max input tokens: {limits.max_input}")  # 111616

# Estimate tokens in text
text_tokens = estimate_tokens(large_document, model="gpt-4o")
print(f"Document: {text_tokens} tokens")

# Chunk if necessary
if text_tokens > limits.max_input:
    chunks = chunk_text(large_document, max_tokens=100000, model="gpt-4o")
    print(f"Split into {len(chunks)} chunks")
else:
    chunks = [large_document]

# Process each chunk with agent
results = []
for i, chunk in enumerate(chunks):
    print(f"Processing chunk {i+1}/{len(chunks)}")
    result = await agent.run(chunk)
    results.append(result.output.model_dump())  # Always serialize!

# Merge results
merged = merge_results(results, strategy=MergeStrategy.CONCATENATE_LIST)
```

## Model Limits

Pre-configured context limits for major LLM providers:

| Model | Max Context | Max Output | Max Input |
|-------|-------------|------------|-----------|
| gpt-4o | 128K | 16K | 112K |
| gpt-4o-mini | 128K | 16K | 112K |
| o1 | 200K | 100K | 100K |
| claude-sonnet-4 | 200K | 8K | 192K |
| claude-3-5-sonnet | 200K | 8K | 192K |
| gemini-2.0-flash-exp | 1M | 8K | 992K |
| gemini-1.5-pro | 2M | 8K | 1.992M |

**Fuzzy Matching**: Models are matched by family (e.g., "gpt-4o-2024-05-13" → gpt-4o limits)
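
For illustration, family matching can be as simple as a longest-prefix lookup against the limits table. A minimal sketch under that assumption (the `MODEL_LIMITS` dict and `get_family_limits` helper here are hypothetical, not the shipped API):

```python
# Hypothetical sketch of family-based fuzzy matching; the real logic lives
# in rem/utils/agentic_chunking.py and may differ.
MODEL_LIMITS = {
    "gpt-4o": {"max_context": 128_000, "max_output": 16_384},
    "gpt-4o-mini": {"max_context": 128_000, "max_output": 16_384},
    "claude-sonnet-4": {"max_context": 200_000, "max_output": 8_192},
}

def get_family_limits(model: str) -> dict | None:
    # Longest matching prefix wins, so "gpt-4o-mini-2024-07-18" resolves
    # to "gpt-4o-mini" rather than the shorter "gpt-4o" family.
    candidates = [family for family in MODEL_LIMITS if model.startswith(family)]
    return MODEL_LIMITS[max(candidates, key=len)] if candidates else None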

## Smart vs Manual Chunking

### Smart Chunking (Recommended)

**Use `smart_chunk_text()` for automatic, intelligent chunking:**

```python
chunks = smart_chunk_text(text, model="gpt-4o")
```

**Benefits:**
- ✅ Automatically calculates optimal chunk size from model limits
- ✅ CVs/resumes fit in single chunk (no unnecessary splitting!)
- ✅ Accounts for system prompt overhead
- ✅ Configurable buffer ratio for safety
- ✅ Model-aware (adapts to GPT-4o, Claude, Gemini limits)

**When to use:**
- Processing user documents (CVs, reports, articles)
- When you want maximum utilization of model context
- When chunk size optimization is important

### Manual Chunking

**Use `chunk_text()` when you need explicit control:**

```python
chunks = chunk_text(text, max_tokens=1000, model="gpt-4o")
```

**Benefits:**
- ✅ Explicit control over chunk size
- ✅ Useful for testing with small chunks
- ✅ Good for constrained environments (rate limits, cost control)

**When to use:**
- Testing/development with small chunks
- Rate limit constraints (process X tokens/hour)
- Cost optimization (smaller chunks = predictable costs)
- Specific requirements (e.g., "split every 10K tokens")

### Comparison

| Feature | smart_chunk_text() | chunk_text() |
|---------|-------------------|--------------|
| **Chunk size** | Auto-calculated from model limits | Manual specification |
| **CV handling** | Single chunk (no splitting) | May split unnecessarily |
| **System prompt** | Automatically accounted | Must calculate manually |
| **Model-aware** | Yes (adapts to context windows) | No (fixed max_tokens) |
| **Buffer safety** | Configurable (default 75%) | Must calculate manually |
| **Use case** | Production, real documents | Testing, constraints |
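
The "auto-calculated" chunk size plausibly reduces to the pieces documented above: model limits, system prompt overhead, and the buffer ratio. A hedged sketch of that arithmetic (the `effective_chunk_budget` name is illustrative, not the library's API):

```python
from rem.utils.agentic_chunking import estimate_tokens, get_model_limits

def effective_chunk_budget(model: str, system_prompt: str = "",
                           buffer_ratio: float = 0.75) -> int:
    # Reserve a safety fraction of the model's input window, then
    # subtract the system prompt's token cost.
    limits = get_model_limits(model)
    overhead = estimate_tokens(system_prompt, model=model) if system_prompt else 0
    return int(limits.max_input * buffer_ratio) - overhead
```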

## Token Estimation

### OpenAI Models (Exact)

Uses tiktoken for precise token counting:

```python
from rem.utils.agentic_chunking import estimate_tokens

tokens = estimate_tokens("Hello, world!", model="gpt-4o")
# Returns: 4 (exact count via tiktoken)
```
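
For reference, counting with tiktoken directly gives the same figure (`estimate_tokens` presumably wraps something like this):

```python
import tiktoken

# Resolve the encoding for the model, then count encoded tokens.
encoding = tiktoken.encoding_for_model("gpt-4o")
print(len(encoding.encode("Hello, world!")))  # 4
```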

### Other Models (Heuristic)

Falls back to character-based estimation (~4 chars/token + 5% overhead):

```python
tokens = estimate_tokens("Hello, world!", model="claude-sonnet-4")
# Returns: 3 (heuristic estimate)
```
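
The heuristic itself is a one-liner; a sketch of the stated rule (the exact rounding behavior is an assumption, but truncating division reproduces the 3 for the 13-character example above):

```python
def heuristic_tokens(text: str) -> int:
    # ~4 chars/token, padded by 5% overhead; int() truncates the fraction.
    return int(len(text) / 4 * 1.05)

print(heuristic_tokens("Hello, world!"))  # 3
```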

## Chunking Strategies

### Line-Preserving (Default)

Chunks by lines, preserving line boundaries:

```python
chunks = chunk_text(text, max_tokens=1000, model="gpt-4o", preserve_lines=True)
```

- Splits at `\n` boundaries
- Falls back to character chunking for oversized lines
- Best for structured text (code, markdown, logs)
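
In spirit, line-preserving chunking accumulates whole lines until the token budget would overflow. A minimal sketch (omitting the oversized-line fallback the shipped `chunk_text` provides):

```python
from rem.utils.agentic_chunking import estimate_tokens

def chunk_by_lines(text: str, max_tokens: int, model: str) -> list[str]:
    chunks: list[str] = []
    current: list[str] = []
    used = 0
    for line in text.splitlines(keepends=True):
        line_tokens = estimate_tokens(line, model=model)
        if current and used + line_tokens > max_tokens:
            chunks.append("".join(current))  # close the chunk at a \n boundary
            current, used = [], 0
        current.append(line)
        used += line_tokens
    if current:
        chunks.append("".join(current))
    return chunks
```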

### Character-Based

Chunks by characters with word boundary preservation:

```python
chunks = chunk_text(text, max_tokens=1000, model="gpt-4o", preserve_lines=False)
```

- Tries to break at spaces
- Useful for prose without newlines

## Merge Strategies

### 1. Concatenate List (Default)

**When to use**: Most structured extraction tasks (lists of items, entities, facts)

**Behavior**:
- Lists: Concatenate (`[1, 2]` + `[3, 4]` → `[1, 2, 3, 4]`)
- Dicts: Update (shallow merge)
- Scalars: Keep first non-None value

**Example**:
```python
results = [
    {"skills": ["Python", "SQL"], "experience_years": 5},
    {"skills": ["Docker", "K8s"], "experience_years": 3}
]

merged = merge_results(results, strategy=MergeStrategy.CONCATENATE_LIST)
# {"skills": ["Python", "SQL", "Docker", "K8s"], "experience_years": 5}
```
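
The rules above are mechanical enough to sketch. An illustrative reimplementation (use the shipped `merge_results`; it may handle more edge cases):

```python
def concat_merge(results: list[dict]) -> dict:
    merged: dict = {}
    for result in results:
        for key, value in result.items():
            if key not in merged or merged[key] is None:
                merged[key] = value                      # first non-None wins
            elif isinstance(merged[key], list) and isinstance(value, list):
                merged[key] = merged[key] + value        # lists concatenate
            elif isinstance(merged[key], dict) and isinstance(value, dict):
                merged[key] = {**merged[key], **value}   # dicts shallow-merge
    return merged
```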

### 2. Deep JSON Merge

**When to use**: Nested object structures with hierarchies

**Behavior**:
- Lists: Concatenate
- Dicts: Recursively deep merge
- Scalars: Keep first non-None value

**Example**:
```python
results = [
    {"contract": {"parties": ["Alice"], "terms": {"duration": "1 year"}}},
    {"contract": {"parties": ["Bob"], "terms": {"renewal": "auto"}}}
]

merged = merge_results(results, strategy=MergeStrategy.MERGE_JSON)
# {
#   "contract": {
#     "parties": ["Alice", "Bob"],
#     "terms": {"duration": "1 year", "renewal": "auto"}
#   }
# }
```
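
Deep merge differs only in recursing into nested dicts. A sketch of the behavior described above (again illustrative, not the shipped code):

```python
def deep_merge(a: dict, b: dict) -> dict:
    merged = dict(a)
    for key, value in b.items():
        if isinstance(merged.get(key), dict) and isinstance(value, dict):
            merged[key] = deep_merge(merged[key], value)   # recurse into dicts
        elif isinstance(merged.get(key), list) and isinstance(value, list):
            merged[key] = merged[key] + value              # concatenate lists
        elif key not in merged or merged[key] is None:
            merged[key] = value                            # first non-None wins
    return merged
```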

### 3. LLM Merge (TODO)

**When to use**: Complex semantic merging requiring intelligence

**Behavior**: Use an LLM to intelligently merge results (not yet implemented)

## Real-World Examples

### Example 1: Extract Skills from Long CV

```python
from rem.utils.agentic_chunking import smart_chunk_text, merge_results, MergeStrategy
from rem.agentic.providers.pydantic_ai import create_pydantic_ai_agent

# Long CV document
cv_text = load_cv_file("john-doe-cv.txt")  # 5K tokens (typical CV)

# Smart chunking - automatically sizes based on model
# For typical CVs: will return single chunk (no splitting!)
chunks = smart_chunk_text(cv_text, model="gpt-4o")

print(f"Processing CV in {len(chunks)} chunk(s)")
# Output: Processing CV in 1 chunk(s)

# Create agent (using existing schema)
agent = await create_pydantic_ai_agent(
    context=context,
    agent_schema_uri="cv-parser-v1"
)

# Process each chunk
results = []
for i, chunk in enumerate(chunks):
    result = await agent.run(chunk)
    # CRITICAL: Serialize Pydantic models!
    results.append(result.output.model_dump())

# Merge extracted skills (no-op if single chunk)
merged = merge_results(results, strategy=MergeStrategy.CONCATENATE_LIST)

print(f"Total skills found: {len(merged['skills'])}")
# Output: Total skills found: 12
```

### Example 2: Analyze Multi-Page Contract

```python
from rem.utils.agentic_chunking import smart_chunk_text, merge_results, MergeStrategy

# Large contract (120 pages, 80K tokens)
contract_text = load_contract("partnership-agreement.pdf")

# Smart chunking with system prompt awareness
system_prompt = """You are a contract analyzer. Extract parties, terms,
obligations, and risk flags from this legal agreement."""

chunks = smart_chunk_text(
    contract_text,
    model="claude-sonnet-4",  # 200K context
    system_prompt=system_prompt,
    buffer_ratio=0.75
)

print(f"Contract split into {len(chunks)} chunk(s)")
# For 80K tokens: likely 1 chunk (Claude has 200K context)

# Create contract analyzer agent
agent = await create_pydantic_ai_agent(
    context=context,
    agent_schema_uri="contract-analyzer-v1"
)

# Extract terms from each chunk
results = []
for chunk in chunks:
    result = await agent.run(chunk)
    results.append(result.output.model_dump())

# Deep merge nested contract structure
merged = merge_results(results, strategy=MergeStrategy.MERGE_JSON)

print(f"Parties: {merged['parties']}")
print(f"Key obligations: {len(merged['key_obligations'])}")
print(f"Risk flags: {len(merged['risk_flags'])}")
```

### Example 3: Process User Session History

```python
from rem.utils.agentic_chunking import (
    chunk_text,
    estimate_tokens,
    get_model_limits,
    merge_results,
    MergeStrategy,
)

# User's full session history (many conversations)
session_history = load_user_sessions(user_id="user-123")  # 200K tokens

# Get limits for Gemini (large context)
limits = get_model_limits("gemini-1.5-pro")  # 1.992M tokens

# Check if chunking needed
history_tokens = estimate_tokens(session_history, model="gemini-1.5-pro")

if history_tokens <= limits.max_input:
    # Fits in one shot!
    result = await agent.run(session_history)
else:
    # Need to chunk
    chunks = chunk_text(session_history, max_tokens=500000, model="gemini-1.5-pro")

    results = []
    for chunk in chunks:
        result = await agent.run(chunk)
        results.append(result.output.model_dump())

    # Merge user profile insights
    merged = merge_results(results, strategy=MergeStrategy.CONCATENATE_LIST)
```

## Integration with REM

### Ontology Extraction on Large Files

```python
from rem.utils.agentic_chunking import (
    chunk_text,
    get_model_limits,
    merge_results,
    MergeStrategy,
)
from rem.services.ontology_extractor import extract_from_file

async def extract_from_large_file(
    file: File,
    schema: Schema,
    tenant_id: str
) -> Ontology:
    """Extract ontology from large file using chunking."""

    # Get model from schema provider_configs
    provider = schema.provider_configs[0] if schema.provider_configs else {}
    model = provider.get("model_name", "gpt-4o")

    # Chunk file content if needed
    limits = get_model_limits(model)
    chunks = chunk_text(file.content, max_tokens=int(limits.max_input * 0.75), model=model)

    if len(chunks) == 1:
        # Single chunk - normal extraction
        return await extract_from_file(file, schema, tenant_id)

    # Multi-chunk extraction
    results = []
    for chunk in chunks:
        # Create temporary file for chunk
        chunk_file = File(
            name=f"{file.name} (chunk)",
            content=chunk,
            mime_type=file.mime_type,
            tenant_id=tenant_id
        )

        # Extract from chunk
        result = await extract_from_file(chunk_file, schema, tenant_id)
        results.append(result.extracted_data)

    # Merge extracted data
    merged_data = merge_results(results, strategy=MergeStrategy.CONCATENATE_LIST)

    # Create final ontology
    return Ontology(
        name=file.name,
        file_id=file.id,
        agent_schema_id=schema.id,
        provider_name=provider.get("provider_name"),
        model_name=model,
        extracted_data=merged_data,
        tenant_id=tenant_id
    )
```

### Dreaming Worker with Chunking

```python
async def extract_ontologies_with_chunking(
    user_id: str,
    lookback_hours: int = 24,
    limit: int | None = None
):
    """Extract ontologies with automatic chunking for large files."""

    # Load user's files
    files = await query_files(user_id, lookback_hours, limit)

    for file in files:
        # Find matching configs
        configs = await get_matching_configs(file, user_id)

        for config in configs:
            # Load schema
            schema = await load_schema(config.agent_schema_id, user_id)

            # Extract with chunking
            ontology = await extract_from_large_file(file, schema, user_id)

            # Generate embeddings
            embedding_text = extract_fields_for_embedding(
                ontology.extracted_data,
                schema.embedding_fields
            )
            ontology.embedding_text = embedding_text

            # Save
            await ontology_repo.upsert(ontology)
```

## Best Practices

### 1. Always Leave Buffer for System Prompt

```python
limits = get_model_limits("gpt-4o")

# BAD: Use full context window
chunks = chunk_text(text, max_tokens=limits.max_input, model="gpt-4o")

# GOOD: Leave buffer for system prompt, tools, etc.
chunks = chunk_text(text, max_tokens=int(limits.max_input * 0.75), model="gpt-4o")
```

### 2. Serialize Pydantic Models Before Merging

```python
# BAD: Merge Pydantic model instances directly
results = [result1.output, result2.output]  # Pydantic models
merged = merge_results(results)  # May lose fields!

# GOOD: Serialize first
results = [result1.output.model_dump(), result2.output.model_dump()]
merged = merge_results(results)  # All fields preserved
```

### 3. Choose the Right Merge Strategy

```python
# Extracting a list of items → CONCATENATE_LIST
skills = merge_results(skill_results, MergeStrategy.CONCATENATE_LIST)

# Nested hierarchy → MERGE_JSON
contract = merge_results(contract_results, MergeStrategy.MERGE_JSON)

# Complex semantic merging → LLM_MERGE (future)
summary = merge_results(summary_results, MergeStrategy.LLM_MERGE)
```

### 4. Handle the Single-Chunk Case

```python
chunks = chunk_text(text, max_tokens=100000, model="gpt-4o")

if len(chunks) == 1:
    # No chunking needed, faster path
    result = await agent.run(chunks[0])
    return result.output.model_dump()
else:
    # Multi-chunk processing
    results = [await agent.run(c) for c in chunks]
    return merge_results([r.output.model_dump() for r in results])
```

### 5. Respect Rate Limits

```python
import asyncio

# Process chunks with rate limiting
results = []
for i, chunk in enumerate(chunks):
    result = await agent.run(chunk)
    results.append(result.output.model_dump())

    # Wait between chunks (e.g., 1 second)
    if i < len(chunks) - 1:
        await asyncio.sleep(1.0)

merged = merge_results(results)
```
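
If your provider quota tolerates some concurrency, a bounded-parallel variant is possible today. A sketch, assuming `agent.run` is safe to call concurrently (verify this for your agent):

```python
import asyncio

async def run_chunks(agent, chunks: list[str], max_concurrency: int = 3) -> list[dict]:
    # Semaphore caps in-flight requests to respect provider limits.
    semaphore = asyncio.Semaphore(max_concurrency)

    async def run_one(chunk: str) -> dict:
        async with semaphore:
            result = await agent.run(chunk)
            return result.output.model_dump()  # Always serialize!

    # gather preserves chunk order in the returned list.
    return list(await asyncio.gather(*(run_one(c) for c in chunks)))
```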

## Performance Considerations

### Token Estimation

- **OpenAI (tiktoken)**: Exact count, ~50ms for 10K tokens
- **Heuristic**: Instant but ~5-10% error margin

### Chunking

- **Line-preserving**: O(n) where n = number of lines
- **Character-based**: O(n) where n = text length
- Both are fast (< 1ms for 100K chars)

### Merging

- **Concatenate**: O(n*m) where n = results, m = avg fields
- **Deep merge**: O(n*m*d) where d = nesting depth
- Both are fast for typical result sizes (< 10ms for 100 results)

## Troubleshooting

### Issue: Chunks Still Too Large

**Symptom**: Agent fails with context length error despite chunking

**Solution**: Reduce buffer ratio or account for multi-turn conversation

```python
# If agent uses multiple tool calls (grows context)
chunks = chunk_text(text, max_tokens=int(limits.max_input * 0.5), model="gpt-4o")
```

### Issue: Lost Fields After Merge

**Symptom**: Fields disappear from merged results

**Solution**: Always serialize Pydantic models with `.model_dump()`

```python
# Before merging
results = [r.output.model_dump() for r in agent_results]
merged = merge_results(results)
```

### Issue: Wrong Token Count

**Symptom**: Estimate significantly off from actual usage

**Solution**: Use tiktoken for OpenAI, increase buffer for others

```python
# For OpenAI: tiktoken is exact
chunks = chunk_text(text, max_tokens=100000, model="gpt-4o")

# For others: use larger buffer (60-70% instead of 75%)
chunks = chunk_text(text, max_tokens=int(limits.max_input * 0.6), model="claude-sonnet-4")
```

## Future Enhancements

- [ ] LLM merge strategy implementation
- [ ] Async parallel chunk processing
- [ ] Progress tracking and cancellation
- [ ] Chunk caching to avoid re-processing
- [ ] Smart section-based chunking for markdown/HTML
- [ ] Integration with semchunk for semantic boundaries

## Related Documentation

- [CLAUDE.md](../../../../CLAUDE.md) - Core design patterns (Pattern #11)
- [agentic_chunking.py](./agentic_chunking.py) - Implementation
- [dict_utils.py](./dict_utils.py) - Field extraction utilities
- [serialization.py](../agentic/serialization.py) - Pydantic serialization helpers