remdb 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rem/__init__.py +2 -0
- rem/agentic/README.md +650 -0
- rem/agentic/__init__.py +39 -0
- rem/agentic/agents/README.md +155 -0
- rem/agentic/agents/__init__.py +8 -0
- rem/agentic/context.py +148 -0
- rem/agentic/context_builder.py +329 -0
- rem/agentic/mcp/__init__.py +0 -0
- rem/agentic/mcp/tool_wrapper.py +107 -0
- rem/agentic/otel/__init__.py +5 -0
- rem/agentic/otel/setup.py +151 -0
- rem/agentic/providers/phoenix.py +674 -0
- rem/agentic/providers/pydantic_ai.py +572 -0
- rem/agentic/query.py +117 -0
- rem/agentic/query_helper.py +89 -0
- rem/agentic/schema.py +396 -0
- rem/agentic/serialization.py +245 -0
- rem/agentic/tools/__init__.py +5 -0
- rem/agentic/tools/rem_tools.py +231 -0
- rem/api/README.md +420 -0
- rem/api/main.py +324 -0
- rem/api/mcp_router/prompts.py +182 -0
- rem/api/mcp_router/resources.py +536 -0
- rem/api/mcp_router/server.py +213 -0
- rem/api/mcp_router/tools.py +584 -0
- rem/api/routers/auth.py +229 -0
- rem/api/routers/chat/__init__.py +5 -0
- rem/api/routers/chat/completions.py +281 -0
- rem/api/routers/chat/json_utils.py +76 -0
- rem/api/routers/chat/models.py +124 -0
- rem/api/routers/chat/streaming.py +185 -0
- rem/auth/README.md +258 -0
- rem/auth/__init__.py +26 -0
- rem/auth/middleware.py +100 -0
- rem/auth/providers/__init__.py +13 -0
- rem/auth/providers/base.py +376 -0
- rem/auth/providers/google.py +163 -0
- rem/auth/providers/microsoft.py +237 -0
- rem/cli/README.md +455 -0
- rem/cli/__init__.py +8 -0
- rem/cli/commands/README.md +126 -0
- rem/cli/commands/__init__.py +3 -0
- rem/cli/commands/ask.py +566 -0
- rem/cli/commands/configure.py +497 -0
- rem/cli/commands/db.py +493 -0
- rem/cli/commands/dreaming.py +324 -0
- rem/cli/commands/experiments.py +1302 -0
- rem/cli/commands/mcp.py +66 -0
- rem/cli/commands/process.py +245 -0
- rem/cli/commands/schema.py +183 -0
- rem/cli/commands/serve.py +106 -0
- rem/cli/dreaming.py +363 -0
- rem/cli/main.py +96 -0
- rem/config.py +237 -0
- rem/mcp_server.py +41 -0
- rem/models/core/__init__.py +49 -0
- rem/models/core/core_model.py +64 -0
- rem/models/core/engram.py +333 -0
- rem/models/core/experiment.py +628 -0
- rem/models/core/inline_edge.py +132 -0
- rem/models/core/rem_query.py +243 -0
- rem/models/entities/__init__.py +43 -0
- rem/models/entities/file.py +57 -0
- rem/models/entities/image_resource.py +88 -0
- rem/models/entities/message.py +35 -0
- rem/models/entities/moment.py +123 -0
- rem/models/entities/ontology.py +191 -0
- rem/models/entities/ontology_config.py +131 -0
- rem/models/entities/resource.py +95 -0
- rem/models/entities/schema.py +87 -0
- rem/models/entities/user.py +85 -0
- rem/py.typed +0 -0
- rem/schemas/README.md +507 -0
- rem/schemas/__init__.py +6 -0
- rem/schemas/agents/README.md +92 -0
- rem/schemas/agents/core/moment-builder.yaml +178 -0
- rem/schemas/agents/core/rem-query-agent.yaml +226 -0
- rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
- rem/schemas/agents/core/simple-assistant.yaml +19 -0
- rem/schemas/agents/core/user-profile-builder.yaml +163 -0
- rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
- rem/schemas/agents/examples/contract-extractor.yaml +134 -0
- rem/schemas/agents/examples/cv-parser.yaml +263 -0
- rem/schemas/agents/examples/hello-world.yaml +37 -0
- rem/schemas/agents/examples/query.yaml +54 -0
- rem/schemas/agents/examples/simple.yaml +21 -0
- rem/schemas/agents/examples/test.yaml +29 -0
- rem/schemas/agents/rem.yaml +128 -0
- rem/schemas/evaluators/hello-world/default.yaml +77 -0
- rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
- rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
- rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
- rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
- rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
- rem/services/__init__.py +16 -0
- rem/services/audio/INTEGRATION.md +308 -0
- rem/services/audio/README.md +376 -0
- rem/services/audio/__init__.py +15 -0
- rem/services/audio/chunker.py +354 -0
- rem/services/audio/transcriber.py +259 -0
- rem/services/content/README.md +1269 -0
- rem/services/content/__init__.py +5 -0
- rem/services/content/providers.py +801 -0
- rem/services/content/service.py +676 -0
- rem/services/dreaming/README.md +230 -0
- rem/services/dreaming/__init__.py +53 -0
- rem/services/dreaming/affinity_service.py +336 -0
- rem/services/dreaming/moment_service.py +264 -0
- rem/services/dreaming/ontology_service.py +54 -0
- rem/services/dreaming/user_model_service.py +297 -0
- rem/services/dreaming/utils.py +39 -0
- rem/services/embeddings/__init__.py +11 -0
- rem/services/embeddings/api.py +120 -0
- rem/services/embeddings/worker.py +421 -0
- rem/services/fs/README.md +662 -0
- rem/services/fs/__init__.py +62 -0
- rem/services/fs/examples.py +206 -0
- rem/services/fs/examples_paths.py +204 -0
- rem/services/fs/git_provider.py +935 -0
- rem/services/fs/local_provider.py +760 -0
- rem/services/fs/parsing-hooks-examples.md +172 -0
- rem/services/fs/paths.py +276 -0
- rem/services/fs/provider.py +460 -0
- rem/services/fs/s3_provider.py +1042 -0
- rem/services/fs/service.py +186 -0
- rem/services/git/README.md +1075 -0
- rem/services/git/__init__.py +17 -0
- rem/services/git/service.py +469 -0
- rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
- rem/services/phoenix/README.md +453 -0
- rem/services/phoenix/__init__.py +46 -0
- rem/services/phoenix/client.py +686 -0
- rem/services/phoenix/config.py +88 -0
- rem/services/phoenix/prompt_labels.py +477 -0
- rem/services/postgres/README.md +575 -0
- rem/services/postgres/__init__.py +23 -0
- rem/services/postgres/migration_service.py +427 -0
- rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
- rem/services/postgres/register_type.py +352 -0
- rem/services/postgres/repository.py +337 -0
- rem/services/postgres/schema_generator.py +379 -0
- rem/services/postgres/service.py +802 -0
- rem/services/postgres/sql_builder.py +354 -0
- rem/services/rem/README.md +304 -0
- rem/services/rem/__init__.py +23 -0
- rem/services/rem/exceptions.py +71 -0
- rem/services/rem/executor.py +293 -0
- rem/services/rem/parser.py +145 -0
- rem/services/rem/queries.py +196 -0
- rem/services/rem/query.py +371 -0
- rem/services/rem/service.py +527 -0
- rem/services/session/README.md +374 -0
- rem/services/session/__init__.py +6 -0
- rem/services/session/compression.py +360 -0
- rem/services/session/reload.py +77 -0
- rem/settings.py +1235 -0
- rem/sql/002_install_models.sql +1068 -0
- rem/sql/background_indexes.sql +42 -0
- rem/sql/install_models.sql +1038 -0
- rem/sql/migrations/001_install.sql +503 -0
- rem/sql/migrations/002_install_models.sql +1202 -0
- rem/utils/AGENTIC_CHUNKING.md +597 -0
- rem/utils/README.md +583 -0
- rem/utils/__init__.py +43 -0
- rem/utils/agentic_chunking.py +622 -0
- rem/utils/batch_ops.py +343 -0
- rem/utils/chunking.py +108 -0
- rem/utils/clip_embeddings.py +276 -0
- rem/utils/dict_utils.py +98 -0
- rem/utils/embeddings.py +423 -0
- rem/utils/examples/embeddings_example.py +305 -0
- rem/utils/examples/sql_types_example.py +202 -0
- rem/utils/markdown.py +16 -0
- rem/utils/model_helpers.py +236 -0
- rem/utils/schema_loader.py +336 -0
- rem/utils/sql_types.py +348 -0
- rem/utils/user_id.py +81 -0
- rem/utils/vision.py +330 -0
- rem/workers/README.md +506 -0
- rem/workers/__init__.py +5 -0
- rem/workers/dreaming.py +502 -0
- rem/workers/engram_processor.py +312 -0
- rem/workers/sqs_file_processor.py +193 -0
- remdb-0.3.7.dist-info/METADATA +1473 -0
- remdb-0.3.7.dist-info/RECORD +187 -0
- remdb-0.3.7.dist-info/WHEEL +4 -0
- remdb-0.3.7.dist-info/entry_points.txt +2 -0
rem/utils/agentic_chunking.py
@@ -0,0 +1,622 @@
"""Agentic chunking utilities for splitting large inputs across model context windows.

This module provides token-aware chunking for agent inputs that exceed model
context limits. Chunks can be processed independently by agents and merged
back using configurable strategies.

Key Features:
- Token counting using tiktoken for OpenAI models
- Character estimation fallback for other providers
- Model-specific context window limits
- Smart section-based chunking for markdown
- Configurable merge strategies: concatenate, merge_json, llm_merge (planned)

Usage:
    from rem.utils.agentic_chunking import smart_chunk_text, merge_results, MergeStrategy

    # Smart chunking (recommended - auto-sizes based on model)
    chunks = smart_chunk_text(text, model="gpt-4o")

    # Process each chunk with agent
    results = [agent.run(chunk) for chunk in chunks]

    # Merge results
    merged = merge_results(results, strategy=MergeStrategy.CONCATENATE_LIST)
"""

import logging
from dataclasses import dataclass
from enum import Enum
from typing import Any

# Module logger
logger = logging.getLogger(__name__)

# Constants for token estimation and chunking
CHARS_PER_TOKEN_HEURISTIC = 4  # Conservative estimate: ~4 characters per token
TOKEN_OVERHEAD_MULTIPLIER = 1.05  # Add 5% overhead for special tokens/encoding
DEFAULT_BUFFER_RATIO = 0.75  # Use 75% of available tokens (conservative for safety)


@dataclass
class ModelLimits:
    """Token limits for a model."""

    max_context: int
    max_output: int

    @property
    def max_input(self) -> int:
        """Maximum tokens for input (context - output buffer)."""
        return self.max_context - self.max_output


# Model context limits (conservative estimates)
# Source: Provider documentation as of Jan 2025
MODEL_LIMITS = {
    # OpenAI
    "gpt-4o": ModelLimits(max_context=128000, max_output=16384),
    "gpt-4o-mini": ModelLimits(max_context=128000, max_output=16384),
    "gpt-4-turbo": ModelLimits(max_context=128000, max_output=4096),
    "gpt-3.5-turbo": ModelLimits(max_context=16385, max_output=4096),
    "o1": ModelLimits(max_context=200000, max_output=100000),
    "o1-mini": ModelLimits(max_context=128000, max_output=65536),
    # Anthropic
    "claude-sonnet-4-20250514": ModelLimits(max_context=200000, max_output=8192),
    "claude-sonnet-4": ModelLimits(max_context=200000, max_output=8192),
    "claude-3-5-sonnet-20241022": ModelLimits(max_context=200000, max_output=8192),
    "claude-3-opus-20240229": ModelLimits(max_context=200000, max_output=4096),
    "claude-3-sonnet-20240229": ModelLimits(max_context=200000, max_output=4096),
    # Google
    "gemini-2.0-flash-exp": ModelLimits(max_context=1000000, max_output=8192),
    "gemini-1.5-pro": ModelLimits(max_context=2000000, max_output=8192),
    # Default fallback
    "default": ModelLimits(max_context=32000, max_output=4096),
}


class MergeStrategy(str, Enum):
    """Strategy for merging chunked agent results.

    Available strategies:
    - CONCATENATE_LIST: Merge lists, shallow update dicts, keep first scalar (default)
    - MERGE_JSON: Deep recursive merge of nested JSON objects
    - LLM_MERGE: Use LLM for intelligent semantic merging (NOT YET IMPLEMENTED)
    """

    CONCATENATE_LIST = "concatenate_list"  # Default: merge lists, update dicts, keep first scalar
    MERGE_JSON = "merge_json"  # Deep merge JSON objects
    LLM_MERGE = "llm_merge"  # PLANNED: Use LLM to intelligently merge results


def get_model_limits(model: str) -> ModelLimits:
    """Get token limits for a model.

    Args:
        model: Model name (e.g., "gpt-4o", "claude-sonnet-4")

    Returns:
        ModelLimits for the model

    Examples:
        >>> limits = get_model_limits("gpt-4o")
        >>> limits.max_input
        111616

        >>> limits = get_model_limits("claude-sonnet-4")
        >>> limits.max_input
        191808
    """
    # Direct lookup
    if model in MODEL_LIMITS:
        return MODEL_LIMITS[model]

    # Fuzzy match by model family
    model_lower = model.lower()

    # OpenAI family
    if "gpt-4o-mini" in model_lower:
        return MODEL_LIMITS["gpt-4o-mini"]
    elif "gpt-4o" in model_lower:
        return MODEL_LIMITS["gpt-4o"]
    elif "gpt-4" in model_lower:
        return MODEL_LIMITS["gpt-4-turbo"]
    elif "gpt-3.5" in model_lower or "gpt-35" in model_lower:
        return MODEL_LIMITS["gpt-3.5-turbo"]
    elif "o1-mini" in model_lower:
        return MODEL_LIMITS["o1-mini"]
    elif "o1" in model_lower:
        return MODEL_LIMITS["o1"]

    # Anthropic family
    if "claude-sonnet-4" in model_lower or "claude-4" in model_lower:
        return MODEL_LIMITS["claude-sonnet-4"]
    elif "claude-3.5" in model_lower or "claude-3-5" in model_lower:
        return MODEL_LIMITS["claude-3-5-sonnet-20241022"]
    elif "claude-3" in model_lower:
        return MODEL_LIMITS["claude-3-sonnet-20240229"]
    elif "claude" in model_lower:
        return MODEL_LIMITS["claude-3-sonnet-20240229"]

    # Google family
    if "gemini-2" in model_lower:
        return MODEL_LIMITS["gemini-2.0-flash-exp"]
    elif "gemini" in model_lower:
        return MODEL_LIMITS["gemini-1.5-pro"]

    # Default fallback
    return MODEL_LIMITS["default"]


def estimate_tokens(text: str, model: str | None = None) -> int:
    """Estimate token count for text.

    Uses tiktoken for OpenAI models (exact count).
    Falls back to character-based heuristic for other providers.

    Args:
        text: Text to estimate tokens for
        model: Optional model name for tiktoken encoding selection

    Returns:
        Estimated token count

    Examples:
        >>> estimate_tokens("Hello world", model="gpt-4o")
        2

        >>> estimate_tokens("Hello world", model="claude-sonnet-4")
        3  # Heuristic estimate
    """
    if not text:
        return 0

    # Try tiktoken for OpenAI models (exact counting)
    if model and ("gpt" in model.lower() or "o1" in model.lower()):
        try:
            import tiktoken

            # Get encoding for model
            try:
                encoding = tiktoken.encoding_for_model(model)
                token_count = len(encoding.encode(text))
                logger.debug(f"Exact token count via tiktoken: {token_count} tokens (model: {model})")
                return token_count
            except KeyError:
                # Fall back to cl100k_base for unknown OpenAI models
                logger.warning(
                    f"Unknown OpenAI model '{model}', falling back to cl100k_base encoding. "
                    "Token counts may be inaccurate."
                )
                encoding = tiktoken.get_encoding("cl100k_base")
                return len(encoding.encode(text))
        except ImportError:
            # tiktoken not installed, fall through to heuristic
            logger.debug(
                "tiktoken not installed, using character-based heuristic for token estimation. "
                "Install tiktoken for exact OpenAI token counting: pip install tiktoken"
            )

    # Character-based heuristic
    base_estimate = len(text) / CHARS_PER_TOKEN_HEURISTIC
    token_estimate = int(base_estimate * TOKEN_OVERHEAD_MULTIPLIER)
    logger.debug(
        f"Heuristic token estimate: {token_estimate} tokens "
        f"(chars={len(text)}, ratio={CHARS_PER_TOKEN_HEURISTIC}, overhead={TOKEN_OVERHEAD_MULTIPLIER})"
    )
    return token_estimate


def smart_chunk_text(
    text: str,
    model: str,
    system_prompt: str = "",
    buffer_ratio: float = DEFAULT_BUFFER_RATIO,
    preserve_lines: bool = True,
) -> list[str]:
    """Intelligently chunk text based on model limits with automatic sizing.

    This is the recommended way to chunk text - it automatically calculates
    optimal chunk size based on the model's context window, accounting for
    system prompt overhead and safety buffers.

    Args:
        text: Text to chunk
        model: Model name (e.g., "gpt-4o", "claude-sonnet-4")
        system_prompt: System prompt that will be used (to account for overhead)
        buffer_ratio: Ratio of available tokens to use (default 0.75 = 75%)
        preserve_lines: If True, avoid splitting mid-line

    Returns:
        List of text chunks, each optimally sized for the model

    Examples:
        >>> # CV extraction - will fit in single chunk for GPT-4o (128K context)
        >>> cv_text = load_cv("john-doe.txt")  # 5K tokens
        >>> chunks = smart_chunk_text(cv_text, model="gpt-4o")
        >>> len(chunks)
        1

        >>> # Large contract - will split intelligently
        >>> contract = load_contract("agreement.pdf")  # 150K tokens
        >>> chunks = smart_chunk_text(contract, model="gpt-4o")
        >>> len(chunks)
        2

        >>> # With custom system prompt overhead
        >>> chunks = smart_chunk_text(
        ...     text,
        ...     model="gpt-4o",
        ...     system_prompt="Extract key terms from this contract...",
        ...     buffer_ratio=0.7  # More conservative for complex prompts
        ... )
    """
    try:
        if not text:
            logger.debug("smart_chunk_text called with empty text")
            return []

        # Get model limits
        limits = get_model_limits(model)

        # Calculate overhead from system prompt
        system_tokens = estimate_tokens(system_prompt, model) if system_prompt else 0

        # Calculate available tokens for content
        # Reserve space for: system prompt + output buffer + safety margin
        available_tokens = limits.max_input - system_tokens

        # Apply buffer ratio for safety
        max_chunk_tokens = int(available_tokens * buffer_ratio)

        # Check if text fits in single chunk
        text_tokens = estimate_tokens(text, model)

        logger.debug(
            f"Chunking analysis: model={model}, text_tokens={text_tokens}, "
            f"max_chunk_tokens={max_chunk_tokens} (buffer={buffer_ratio*100:.0f}%), "
            f"system_overhead={system_tokens}, available={available_tokens}"
        )

        if text_tokens <= max_chunk_tokens:
            logger.debug("Text fits in single chunk, no chunking needed")
            return [text]

        # Need to chunk
        strategy = "line-based" if preserve_lines else "character-based"
        logger.info(
            f"Chunking required: {text_tokens} tokens exceeds {max_chunk_tokens} limit "
            f"(model: {model}). Using {strategy} strategy."
        )

        if preserve_lines:
            chunks = _chunk_by_lines(text, max_chunk_tokens, model)
        else:
            chunks = _chunk_by_chars(text, max_chunk_tokens, model)

        logger.info(
            f"Created {len(chunks)} chunks from {text_tokens} token input "
            f"(avg {text_tokens//len(chunks)} tokens/chunk)"
        )
        return chunks

    except Exception as e:
        logger.exception(
            f"Chunking failed: model={model}, text_length={len(text)}, "
            f"buffer_ratio={buffer_ratio}"
        )
        raise


def chunk_text(
    text: str,
    max_tokens: int,
    model: str | None = None,
    preserve_lines: bool = True,
) -> list[str]:
    """Chunk text to fit within token limit.

    NOTE: Consider using smart_chunk_text() instead, which automatically
    calculates optimal chunk size based on model limits.

    Args:
        text: Text to chunk
        max_tokens: Maximum tokens per chunk
        model: Optional model name for token counting
        preserve_lines: If True, avoid splitting mid-line

    Returns:
        List of text chunks, each within token limit

    Examples:
        >>> text = "Line 1\\nLine 2\\nLine 3\\n" * 1000
        >>> chunks = chunk_text(text, max_tokens=1000, model="gpt-4o")
        >>> len(chunks) > 1
        True
        >>> all(estimate_tokens(c, "gpt-4o") <= 1000 for c in chunks)
        True
    """
    if not text:
        return []

    # Check if text fits in single chunk
    text_tokens = estimate_tokens(text, model)
    if text_tokens <= max_tokens:
        return [text]

    # Need to chunk - use line-based or character-based approach
    if preserve_lines:
        return _chunk_by_lines(text, max_tokens, model)
    else:
        return _chunk_by_chars(text, max_tokens, model)


def _chunk_by_lines(text: str, max_tokens: int, model: str | None) -> list[str]:
    """Chunk text by lines, preserving line boundaries.

    Args:
        text: Text to chunk
        max_tokens: Maximum tokens per chunk
        model: Optional model name for token counting

    Returns:
        List of text chunks
    """
    chunks = []
    lines = text.split("\n")
    current_chunk: list[str] = []
    current_tokens = 0

    logger.debug(f"Line-based chunking: {len(lines)} lines, max_tokens={max_tokens}")

    for line_num, line in enumerate(lines, 1):
        line_tokens = estimate_tokens(line + "\n", model)

        # If single line exceeds limit, split it by characters
        if line_tokens > max_tokens:
            logger.warning(
                f"Line {line_num} exceeds token limit ({line_tokens} > {max_tokens}), "
                f"falling back to character-based chunking. Line preview: {line[:100]}..."
            )

            # Save current chunk if any
            if current_chunk:
                chunks.append("\n".join(current_chunk))
                current_chunk = []
                current_tokens = 0

            # Split the large line
            line_chunks = _chunk_by_chars(line, max_tokens, model)
            chunks.extend(line_chunks)
            continue

        # Check if adding this line would exceed limit
        if current_tokens + line_tokens > max_tokens and current_chunk:
            # Save current chunk and start new one
            logger.debug(f"Chunk boundary at line {line_num} ({current_tokens} tokens)")
            chunks.append("\n".join(current_chunk))
            current_chunk = [line]
            current_tokens = line_tokens
        else:
            # Add to current chunk
            current_chunk.append(line)
            current_tokens += line_tokens

    # Add final chunk
    if current_chunk:
        chunks.append("\n".join(current_chunk))

    logger.debug(f"Line chunking complete: {len(chunks)} chunks created")
    return chunks if chunks else [text]


def _chunk_by_chars(text: str, max_tokens: int, model: str | None) -> list[str]:
    """Fallback: chunk text by characters when line-based chunking fails.

    Args:
        text: Text to chunk
        max_tokens: Maximum tokens per chunk
        model: Optional model name for token counting

    Returns:
        List of text chunks
    """
    # Convert tokens to approximate chars using heuristic
    max_chars = int(max_tokens * CHARS_PER_TOKEN_HEURISTIC)

    chunks = []
    start = 0
    text_len = len(text)

    logger.debug(
        f"Character-based chunking: text_length={text_len} chars, "
        f"max_chars={max_chars} (max_tokens={max_tokens})"
    )

    while start < text_len:
        # Calculate end position
        end = min(start + max_chars, text_len)

        # Try to break at word boundary if not at text end
        if end < text_len:
            # Look back for space
            space_pos = text.rfind(" ", start, end)
            if space_pos > start:
                original_end = end
                end = space_pos
                logger.debug(
                    f"Adjusted chunk boundary for word break: {original_end} → {end}"
                )

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        start = end

    logger.debug(
        f"Character chunking complete: {len(chunks)} chunks created "
        f"(avg {text_len//len(chunks) if chunks else 0} chars/chunk)"
    )
    return chunks if chunks else [text]


def merge_results(
    results: list[dict[str, Any]],
    strategy: MergeStrategy = MergeStrategy.CONCATENATE_LIST,
) -> dict[str, Any]:
    """Merge multiple agent results using specified strategy.

    Args:
        results: List of result dictionaries from agent chunks
        strategy: Merge strategy to use

    Returns:
        Merged result dictionary

    Examples:
        >>> results = [
        ...     {"items": [1, 2], "count": 2},
        ...     {"items": [3, 4], "count": 2}
        ... ]
        >>> merged = merge_results(results, MergeStrategy.CONCATENATE_LIST)
        >>> merged["items"]
        [1, 2, 3, 4]
        >>> merged["count"]
        2
    """
    try:
        if not results:
            logger.debug("merge_results called with empty results list")
            return {}

        if len(results) == 1:
            logger.debug("merge_results called with single result, returning as-is")
            return results[0]

        logger.info(
            f"Merging {len(results)} chunk results using strategy: {strategy.value}"
        )

        if strategy == MergeStrategy.CONCATENATE_LIST:
            merged = _merge_concatenate(results)
        elif strategy == MergeStrategy.MERGE_JSON:
            merged = _merge_json_deep(results)
        elif strategy == MergeStrategy.LLM_MERGE:
            raise NotImplementedError("LLM merge strategy not yet implemented")
        else:
            raise ValueError(f"Unknown merge strategy: {strategy}")

        logger.debug(f"Merge complete: result has {len(merged)} top-level keys")
        return merged

    except Exception as e:
        logger.exception(
            f"Merge failed: strategy={strategy.value}, num_results={len(results)}"
        )
        raise


def _merge_concatenate(results: list[dict[str, Any]]) -> dict[str, Any]:
    """Default merge: concatenate lists, update dicts, keep first scalar.

    Args:
        results: List of result dictionaries

    Returns:
        Merged result
    """
    merged = results[0].copy()
    logger.debug(f"Starting concatenate merge with {len(results)} results")

    for chunk_num, result in enumerate(results[1:], start=2):
        logger.debug(f"Merging chunk {chunk_num}/{len(results)}")

        for key, value in result.items():
            if key not in merged:
                merged[key] = value
                logger.debug(f"  Added new key '{key}' from chunk {chunk_num}")
                continue

            merged_value = merged[key]

            # Merge lists by concatenation
            if isinstance(merged_value, list) and isinstance(value, list):
                original_len = len(merged_value)
                merged[key] = merged_value + value
                logger.debug(
                    f"  Concatenated list '{key}': {original_len} + {len(value)} = {len(merged[key])} items"
                )

            # Merge dicts by update (shallow)
            elif isinstance(merged_value, dict) and isinstance(value, dict):
                merged[key].update(value)
                logger.debug(f"  Updated dict '{key}' with {len(value)} keys from chunk {chunk_num}")

            # For scalars, prefer non-None values, or keep first
            else:
                if merged_value is None:
                    merged[key] = value
                    logger.debug(f"  Replaced None value for '{key}' with value from chunk {chunk_num}")
                elif value is not None and merged_value != value:
                    # CRITICAL: Warn about silent data loss
                    logger.warning(
                        f"Scalar value conflict for key '{key}': "
                        f"keeping first value ({merged_value!r}), "
                        f"discarding chunk {chunk_num} value ({value!r})"
                    )

    logger.debug(f"Concatenate merge complete: {len(merged)} keys in result")
    return merged


def _merge_json_deep(results: list[dict[str, Any]]) -> dict[str, Any]:
    """Deep merge JSON objects recursively.

    Args:
        results: List of result dictionaries

    Returns:
        Deeply merged result
    """
    logger.debug(f"Starting deep JSON merge with {len(results)} results")

    def deep_merge(base: dict, update: dict, depth: int = 0) -> dict:
        """Recursively merge update into base."""
        merged = base.copy()
        indent = "  " * depth

        for key, value in update.items():
            if key not in merged:
                merged[key] = value
                logger.debug(f"{indent}Added new key '{key}' at depth {depth}")
            elif isinstance(merged[key], dict) and isinstance(value, dict):
                logger.debug(f"{indent}Deep merging dict '{key}' at depth {depth}")
                merged[key] = deep_merge(merged[key], value, depth + 1)
            elif isinstance(merged[key], list) and isinstance(value, list):
                original_len = len(merged[key])
                merged[key] = merged[key] + value
                logger.debug(
                    f"{indent}Concatenated list '{key}' at depth {depth}: "
                    f"{original_len} + {len(value)} = {len(merged[key])} items"
                )
            else:
                # Keep first non-None value
                if merged[key] is None:
                    merged[key] = value
                    logger.debug(f"{indent}Replaced None value for '{key}' at depth {depth}")
                elif value is not None and merged[key] != value:
                    logger.warning(
                        f"{indent}Scalar conflict at depth {depth} for '{key}': "
                        f"keeping first value ({merged[key]!r}), discarding ({value!r})"
                    )

        return merged

    result = results[0].copy()
    for chunk_num, r in enumerate(results[1:], start=2):
        logger.debug(f"Deep merging chunk {chunk_num}/{len(results)}")
        result = deep_merge(result, r, depth=0)

    logger.debug(f"Deep merge complete: {len(result)} top-level keys")
    return result