remdb-0.2.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of remdb might be problematic.

Files changed (187)
  1. rem/__init__.py +2 -0
  2. rem/agentic/README.md +650 -0
  3. rem/agentic/__init__.py +39 -0
  4. rem/agentic/agents/README.md +155 -0
  5. rem/agentic/agents/__init__.py +8 -0
  6. rem/agentic/context.py +148 -0
  7. rem/agentic/context_builder.py +329 -0
  8. rem/agentic/mcp/__init__.py +0 -0
  9. rem/agentic/mcp/tool_wrapper.py +107 -0
  10. rem/agentic/otel/__init__.py +5 -0
  11. rem/agentic/otel/setup.py +151 -0
  12. rem/agentic/providers/phoenix.py +674 -0
  13. rem/agentic/providers/pydantic_ai.py +572 -0
  14. rem/agentic/query.py +117 -0
  15. rem/agentic/query_helper.py +89 -0
  16. rem/agentic/schema.py +396 -0
  17. rem/agentic/serialization.py +245 -0
  18. rem/agentic/tools/__init__.py +5 -0
  19. rem/agentic/tools/rem_tools.py +231 -0
  20. rem/api/README.md +420 -0
  21. rem/api/main.py +324 -0
  22. rem/api/mcp_router/prompts.py +182 -0
  23. rem/api/mcp_router/resources.py +536 -0
  24. rem/api/mcp_router/server.py +213 -0
  25. rem/api/mcp_router/tools.py +584 -0
  26. rem/api/routers/auth.py +229 -0
  27. rem/api/routers/chat/__init__.py +5 -0
  28. rem/api/routers/chat/completions.py +281 -0
  29. rem/api/routers/chat/json_utils.py +76 -0
  30. rem/api/routers/chat/models.py +124 -0
  31. rem/api/routers/chat/streaming.py +185 -0
  32. rem/auth/README.md +258 -0
  33. rem/auth/__init__.py +26 -0
  34. rem/auth/middleware.py +100 -0
  35. rem/auth/providers/__init__.py +13 -0
  36. rem/auth/providers/base.py +376 -0
  37. rem/auth/providers/google.py +163 -0
  38. rem/auth/providers/microsoft.py +237 -0
  39. rem/cli/README.md +455 -0
  40. rem/cli/__init__.py +8 -0
  41. rem/cli/commands/README.md +126 -0
  42. rem/cli/commands/__init__.py +3 -0
  43. rem/cli/commands/ask.py +565 -0
  44. rem/cli/commands/configure.py +423 -0
  45. rem/cli/commands/db.py +493 -0
  46. rem/cli/commands/dreaming.py +324 -0
  47. rem/cli/commands/experiments.py +1124 -0
  48. rem/cli/commands/mcp.py +66 -0
  49. rem/cli/commands/process.py +245 -0
  50. rem/cli/commands/schema.py +183 -0
  51. rem/cli/commands/serve.py +106 -0
  52. rem/cli/dreaming.py +363 -0
  53. rem/cli/main.py +88 -0
  54. rem/config.py +237 -0
  55. rem/mcp_server.py +41 -0
  56. rem/models/core/__init__.py +49 -0
  57. rem/models/core/core_model.py +64 -0
  58. rem/models/core/engram.py +333 -0
  59. rem/models/core/experiment.py +628 -0
  60. rem/models/core/inline_edge.py +132 -0
  61. rem/models/core/rem_query.py +243 -0
  62. rem/models/entities/__init__.py +43 -0
  63. rem/models/entities/file.py +57 -0
  64. rem/models/entities/image_resource.py +88 -0
  65. rem/models/entities/message.py +35 -0
  66. rem/models/entities/moment.py +123 -0
  67. rem/models/entities/ontology.py +191 -0
  68. rem/models/entities/ontology_config.py +131 -0
  69. rem/models/entities/resource.py +95 -0
  70. rem/models/entities/schema.py +87 -0
  71. rem/models/entities/user.py +85 -0
  72. rem/py.typed +0 -0
  73. rem/schemas/README.md +507 -0
  74. rem/schemas/__init__.py +6 -0
  75. rem/schemas/agents/README.md +92 -0
  76. rem/schemas/agents/core/moment-builder.yaml +178 -0
  77. rem/schemas/agents/core/rem-query-agent.yaml +226 -0
  78. rem/schemas/agents/core/resource-affinity-assessor.yaml +99 -0
  79. rem/schemas/agents/core/simple-assistant.yaml +19 -0
  80. rem/schemas/agents/core/user-profile-builder.yaml +163 -0
  81. rem/schemas/agents/examples/contract-analyzer.yaml +317 -0
  82. rem/schemas/agents/examples/contract-extractor.yaml +134 -0
  83. rem/schemas/agents/examples/cv-parser.yaml +263 -0
  84. rem/schemas/agents/examples/hello-world.yaml +37 -0
  85. rem/schemas/agents/examples/query.yaml +54 -0
  86. rem/schemas/agents/examples/simple.yaml +21 -0
  87. rem/schemas/agents/examples/test.yaml +29 -0
  88. rem/schemas/agents/rem.yaml +128 -0
  89. rem/schemas/evaluators/hello-world/default.yaml +77 -0
  90. rem/schemas/evaluators/rem/faithfulness.yaml +219 -0
  91. rem/schemas/evaluators/rem/lookup-correctness.yaml +182 -0
  92. rem/schemas/evaluators/rem/retrieval-precision.yaml +199 -0
  93. rem/schemas/evaluators/rem/retrieval-recall.yaml +211 -0
  94. rem/schemas/evaluators/rem/search-correctness.yaml +192 -0
  95. rem/services/__init__.py +16 -0
  96. rem/services/audio/INTEGRATION.md +308 -0
  97. rem/services/audio/README.md +376 -0
  98. rem/services/audio/__init__.py +15 -0
  99. rem/services/audio/chunker.py +354 -0
  100. rem/services/audio/transcriber.py +259 -0
  101. rem/services/content/README.md +1269 -0
  102. rem/services/content/__init__.py +5 -0
  103. rem/services/content/providers.py +806 -0
  104. rem/services/content/service.py +657 -0
  105. rem/services/dreaming/README.md +230 -0
  106. rem/services/dreaming/__init__.py +53 -0
  107. rem/services/dreaming/affinity_service.py +336 -0
  108. rem/services/dreaming/moment_service.py +264 -0
  109. rem/services/dreaming/ontology_service.py +54 -0
  110. rem/services/dreaming/user_model_service.py +297 -0
  111. rem/services/dreaming/utils.py +39 -0
  112. rem/services/embeddings/__init__.py +11 -0
  113. rem/services/embeddings/api.py +120 -0
  114. rem/services/embeddings/worker.py +421 -0
  115. rem/services/fs/README.md +662 -0
  116. rem/services/fs/__init__.py +62 -0
  117. rem/services/fs/examples.py +206 -0
  118. rem/services/fs/examples_paths.py +204 -0
  119. rem/services/fs/git_provider.py +935 -0
  120. rem/services/fs/local_provider.py +760 -0
  121. rem/services/fs/parsing-hooks-examples.md +172 -0
  122. rem/services/fs/paths.py +276 -0
  123. rem/services/fs/provider.py +460 -0
  124. rem/services/fs/s3_provider.py +1042 -0
  125. rem/services/fs/service.py +186 -0
  126. rem/services/git/README.md +1075 -0
  127. rem/services/git/__init__.py +17 -0
  128. rem/services/git/service.py +469 -0
  129. rem/services/phoenix/EXPERIMENT_DESIGN.md +1146 -0
  130. rem/services/phoenix/README.md +453 -0
  131. rem/services/phoenix/__init__.py +46 -0
  132. rem/services/phoenix/client.py +686 -0
  133. rem/services/phoenix/config.py +88 -0
  134. rem/services/phoenix/prompt_labels.py +477 -0
  135. rem/services/postgres/README.md +575 -0
  136. rem/services/postgres/__init__.py +23 -0
  137. rem/services/postgres/migration_service.py +427 -0
  138. rem/services/postgres/pydantic_to_sqlalchemy.py +232 -0
  139. rem/services/postgres/register_type.py +352 -0
  140. rem/services/postgres/repository.py +337 -0
  141. rem/services/postgres/schema_generator.py +379 -0
  142. rem/services/postgres/service.py +802 -0
  143. rem/services/postgres/sql_builder.py +354 -0
  144. rem/services/rem/README.md +304 -0
  145. rem/services/rem/__init__.py +23 -0
  146. rem/services/rem/exceptions.py +71 -0
  147. rem/services/rem/executor.py +293 -0
  148. rem/services/rem/parser.py +145 -0
  149. rem/services/rem/queries.py +196 -0
  150. rem/services/rem/query.py +371 -0
  151. rem/services/rem/service.py +527 -0
  152. rem/services/session/README.md +374 -0
  153. rem/services/session/__init__.py +6 -0
  154. rem/services/session/compression.py +360 -0
  155. rem/services/session/reload.py +77 -0
  156. rem/settings.py +1235 -0
  157. rem/sql/002_install_models.sql +1068 -0
  158. rem/sql/background_indexes.sql +42 -0
  159. rem/sql/install_models.sql +1038 -0
  160. rem/sql/migrations/001_install.sql +503 -0
  161. rem/sql/migrations/002_install_models.sql +1202 -0
  162. rem/utils/AGENTIC_CHUNKING.md +597 -0
  163. rem/utils/README.md +583 -0
  164. rem/utils/__init__.py +43 -0
  165. rem/utils/agentic_chunking.py +622 -0
  166. rem/utils/batch_ops.py +343 -0
  167. rem/utils/chunking.py +108 -0
  168. rem/utils/clip_embeddings.py +276 -0
  169. rem/utils/dict_utils.py +98 -0
  170. rem/utils/embeddings.py +423 -0
  171. rem/utils/examples/embeddings_example.py +305 -0
  172. rem/utils/examples/sql_types_example.py +202 -0
  173. rem/utils/markdown.py +16 -0
  174. rem/utils/model_helpers.py +236 -0
  175. rem/utils/schema_loader.py +229 -0
  176. rem/utils/sql_types.py +348 -0
  177. rem/utils/user_id.py +81 -0
  178. rem/utils/vision.py +330 -0
  179. rem/workers/README.md +506 -0
  180. rem/workers/__init__.py +5 -0
  181. rem/workers/dreaming.py +502 -0
  182. rem/workers/engram_processor.py +312 -0
  183. rem/workers/sqs_file_processor.py +193 -0
  184. remdb-0.2.6.dist-info/METADATA +1191 -0
  185. remdb-0.2.6.dist-info/RECORD +187 -0
  186. remdb-0.2.6.dist-info/WHEEL +4 -0
  187. remdb-0.2.6.dist-info/entry_points.txt +2 -0
rem/utils/agentic_chunking.py
@@ -0,0 +1,622 @@
"""Agentic chunking utilities for splitting large inputs across model context windows.

This module provides token-aware chunking for agent inputs that exceed model
context limits. Chunks can be processed independently by agents and merged
back using configurable strategies.

Key Features:
- Token counting using tiktoken for OpenAI models
- Character estimation fallback for other providers
- Model-specific context window limits
- Smart section-based chunking for markdown
- Configurable merge strategies: concatenate, merge_json, llm_merge (planned)

Usage:
    from rem.utils.agentic_chunking import smart_chunk_text, merge_results, MergeStrategy

    # Smart chunking (recommended - auto-sizes based on model)
    chunks = smart_chunk_text(text, model="gpt-4o")

    # Process each chunk with agent
    results = [agent.run(chunk) for chunk in chunks]

    # Merge results
    merged = merge_results(results, strategy=MergeStrategy.CONCATENATE_LIST)
"""

import logging
from dataclasses import dataclass
from enum import Enum
from typing import Any

# Module logger
logger = logging.getLogger(__name__)

# Constants for token estimation and chunking
CHARS_PER_TOKEN_HEURISTIC = 4  # Conservative estimate: ~4 characters per token
TOKEN_OVERHEAD_MULTIPLIER = 1.05  # Add 5% overhead for special tokens/encoding
DEFAULT_BUFFER_RATIO = 0.75  # Use 75% of available tokens (conservative for safety)


@dataclass
class ModelLimits:
    """Token limits for a model."""

    max_context: int
    max_output: int

    @property
    def max_input(self) -> int:
        """Maximum tokens for input (context - output buffer)."""
        return self.max_context - self.max_output


# Model context limits (conservative estimates)
# Source: Provider documentation as of Jan 2025
MODEL_LIMITS = {
    # OpenAI
    "gpt-4o": ModelLimits(max_context=128000, max_output=16384),
    "gpt-4o-mini": ModelLimits(max_context=128000, max_output=16384),
    "gpt-4-turbo": ModelLimits(max_context=128000, max_output=4096),
    "gpt-3.5-turbo": ModelLimits(max_context=16385, max_output=4096),
    "o1": ModelLimits(max_context=200000, max_output=100000),
    "o1-mini": ModelLimits(max_context=128000, max_output=65536),
    # Anthropic
    "claude-sonnet-4-20250514": ModelLimits(max_context=200000, max_output=8192),
    "claude-sonnet-4": ModelLimits(max_context=200000, max_output=8192),
    "claude-3-5-sonnet-20241022": ModelLimits(max_context=200000, max_output=8192),
    "claude-3-opus-20240229": ModelLimits(max_context=200000, max_output=4096),
    "claude-3-sonnet-20240229": ModelLimits(max_context=200000, max_output=4096),
    # Google
    "gemini-2.0-flash-exp": ModelLimits(max_context=1000000, max_output=8192),
    "gemini-1.5-pro": ModelLimits(max_context=2000000, max_output=8192),
    # Default fallback
    "default": ModelLimits(max_context=32000, max_output=4096),
}


class MergeStrategy(str, Enum):
    """Strategy for merging chunked agent results.

    Available strategies:
    - CONCATENATE_LIST: Merge lists, shallow update dicts, keep first scalar (default)
    - MERGE_JSON: Deep recursive merge of nested JSON objects
    - LLM_MERGE: Use LLM for intelligent semantic merging (NOT YET IMPLEMENTED)
    """

    CONCATENATE_LIST = "concatenate_list"  # Default: merge lists, update dicts, keep first scalar
    MERGE_JSON = "merge_json"  # Deep merge JSON objects
    LLM_MERGE = "llm_merge"  # PLANNED: Use LLM to intelligently merge results


def get_model_limits(model: str) -> ModelLimits:
    """Get token limits for a model.

    Args:
        model: Model name (e.g., "gpt-4o", "claude-sonnet-4")

    Returns:
        ModelLimits for the model

    Examples:
        >>> limits = get_model_limits("gpt-4o")
        >>> limits.max_input
        111616

        >>> limits = get_model_limits("claude-sonnet-4")
        >>> limits.max_input
        191808
    """
    # Direct lookup
    if model in MODEL_LIMITS:
        return MODEL_LIMITS[model]

    # Fuzzy match by model family
    model_lower = model.lower()

    # OpenAI family
    if "gpt-4o-mini" in model_lower:
        return MODEL_LIMITS["gpt-4o-mini"]
    elif "gpt-4o" in model_lower:
        return MODEL_LIMITS["gpt-4o"]
    elif "gpt-4" in model_lower:
        return MODEL_LIMITS["gpt-4-turbo"]
    elif "gpt-3.5" in model_lower or "gpt-35" in model_lower:
        return MODEL_LIMITS["gpt-3.5-turbo"]
    elif "o1-mini" in model_lower:
        return MODEL_LIMITS["o1-mini"]
    elif "o1" in model_lower:
        return MODEL_LIMITS["o1"]

    # Anthropic family
    if "claude-sonnet-4" in model_lower or "claude-4" in model_lower:
        return MODEL_LIMITS["claude-sonnet-4"]
    elif "claude-3.5" in model_lower or "claude-3-5" in model_lower:
        return MODEL_LIMITS["claude-3-5-sonnet-20241022"]
    elif "claude-3" in model_lower:
        return MODEL_LIMITS["claude-3-sonnet-20240229"]
    elif "claude" in model_lower:
        return MODEL_LIMITS["claude-3-sonnet-20240229"]

    # Google family
    if "gemini-2" in model_lower:
        return MODEL_LIMITS["gemini-2.0-flash-exp"]
    elif "gemini" in model_lower:
        return MODEL_LIMITS["gemini-1.5-pro"]

    # Default fallback
    return MODEL_LIMITS["default"]


def estimate_tokens(text: str, model: str | None = None) -> int:
    """Estimate token count for text.

    Uses tiktoken for OpenAI models (exact count).
    Falls back to character-based heuristic for other providers.

    Args:
        text: Text to estimate tokens for
        model: Optional model name for tiktoken encoding selection

    Returns:
        Estimated token count

    Examples:
        >>> estimate_tokens("Hello world", model="gpt-4o")
        2

        >>> estimate_tokens("Hello world", model="claude-sonnet-4")
        3  # Heuristic estimate
    """
    if not text:
        return 0

    # Try tiktoken for OpenAI models (exact counting)
    if model and ("gpt" in model.lower() or "o1" in model.lower()):
        try:
            import tiktoken

            # Get encoding for model
            try:
                encoding = tiktoken.encoding_for_model(model)
                token_count = len(encoding.encode(text))
                logger.debug(f"Exact token count via tiktoken: {token_count} tokens (model: {model})")
                return token_count
            except KeyError:
                # Fall back to cl100k_base for unknown OpenAI models
                logger.warning(
                    f"Unknown OpenAI model '{model}', falling back to cl100k_base encoding. "
                    "Token counts may be inaccurate."
                )
                encoding = tiktoken.get_encoding("cl100k_base")
                return len(encoding.encode(text))
        except ImportError:
            # tiktoken not installed, fall through to heuristic
            logger.debug(
                "tiktoken not installed, using character-based heuristic for token estimation. "
                "Install tiktoken for exact OpenAI token counting: pip install tiktoken"
            )

    # Character-based heuristic
    base_estimate = len(text) / CHARS_PER_TOKEN_HEURISTIC
    token_estimate = int(base_estimate * TOKEN_OVERHEAD_MULTIPLIER)
    logger.debug(
        f"Heuristic token estimate: {token_estimate} tokens "
        f"(chars={len(text)}, ratio={CHARS_PER_TOKEN_HEURISTIC}, overhead={TOKEN_OVERHEAD_MULTIPLIER})"
    )
    return token_estimate


def smart_chunk_text(
    text: str,
    model: str,
    system_prompt: str = "",
    buffer_ratio: float = DEFAULT_BUFFER_RATIO,
    preserve_lines: bool = True,
) -> list[str]:
    """Intelligently chunk text based on model limits with automatic sizing.

    This is the recommended way to chunk text - it automatically calculates
    optimal chunk size based on the model's context window, accounting for
    system prompt overhead and safety buffers.

    Args:
        text: Text to chunk
        model: Model name (e.g., "gpt-4o", "claude-sonnet-4")
        system_prompt: System prompt that will be used (to account for overhead)
        buffer_ratio: Ratio of available tokens to use (default 0.75 = 75%)
        preserve_lines: If True, avoid splitting mid-line

    Returns:
        List of text chunks, each optimally sized for the model

    Examples:
        >>> # CV extraction - will fit in single chunk for GPT-4o (128K context)
        >>> cv_text = load_cv("john-doe.txt")  # 5K tokens
        >>> chunks = smart_chunk_text(cv_text, model="gpt-4o")
        >>> len(chunks)
        1

        >>> # Large contract - will split intelligently
        >>> contract = load_contract("agreement.pdf")  # 150K tokens
        >>> chunks = smart_chunk_text(contract, model="gpt-4o")
        >>> len(chunks)
        2

        >>> # With custom system prompt overhead
        >>> chunks = smart_chunk_text(
        ...     text,
        ...     model="gpt-4o",
        ...     system_prompt="Extract key terms from this contract...",
        ...     buffer_ratio=0.7  # More conservative for complex prompts
        ... )
    """
    try:
        if not text:
            logger.debug("smart_chunk_text called with empty text")
            return []

        # Get model limits
        limits = get_model_limits(model)

        # Calculate overhead from system prompt
        system_tokens = estimate_tokens(system_prompt, model) if system_prompt else 0

        # Calculate available tokens for content
        # Reserve space for: system prompt + output buffer + safety margin
        available_tokens = limits.max_input - system_tokens

        # Apply buffer ratio for safety
        max_chunk_tokens = int(available_tokens * buffer_ratio)

        # Check if text fits in single chunk
        text_tokens = estimate_tokens(text, model)

        logger.debug(
            f"Chunking analysis: model={model}, text_tokens={text_tokens}, "
            f"max_chunk_tokens={max_chunk_tokens} (buffer={buffer_ratio*100:.0f}%), "
            f"system_overhead={system_tokens}, available={available_tokens}"
        )

        if text_tokens <= max_chunk_tokens:
            logger.debug("Text fits in single chunk, no chunking needed")
            return [text]

        # Need to chunk
        strategy = "line-based" if preserve_lines else "character-based"
        logger.info(
            f"Chunking required: {text_tokens} tokens exceeds {max_chunk_tokens} limit "
            f"(model: {model}). Using {strategy} strategy."
        )

        if preserve_lines:
            chunks = _chunk_by_lines(text, max_chunk_tokens, model)
        else:
            chunks = _chunk_by_chars(text, max_chunk_tokens, model)

        logger.info(
            f"Created {len(chunks)} chunks from {text_tokens} token input "
            f"(avg {text_tokens//len(chunks)} tokens/chunk)"
        )
        return chunks

    except Exception as e:
        logger.exception(
            f"Chunking failed: model={model}, text_length={len(text)}, "
            f"buffer_ratio={buffer_ratio}"
        )
        raise


def chunk_text(
    text: str,
    max_tokens: int,
    model: str | None = None,
    preserve_lines: bool = True,
) -> list[str]:
    """Chunk text to fit within token limit.

    NOTE: Consider using smart_chunk_text() instead, which automatically
    calculates optimal chunk size based on model limits.

    Args:
        text: Text to chunk
        max_tokens: Maximum tokens per chunk
        model: Optional model name for token counting
        preserve_lines: If True, avoid splitting mid-line

    Returns:
        List of text chunks, each within token limit

    Examples:
        >>> text = "Line 1\\nLine 2\\nLine 3\\n" * 1000
        >>> chunks = chunk_text(text, max_tokens=1000, model="gpt-4o")
        >>> len(chunks) > 1
        True
        >>> all(estimate_tokens(c, "gpt-4o") <= 1000 for c in chunks)
        True
    """
    if not text:
        return []

    # Check if text fits in single chunk
    text_tokens = estimate_tokens(text, model)
    if text_tokens <= max_tokens:
        return [text]

    # Need to chunk - use line-based or character-based approach
    if preserve_lines:
        return _chunk_by_lines(text, max_tokens, model)
    else:
        return _chunk_by_chars(text, max_tokens, model)


def _chunk_by_lines(text: str, max_tokens: int, model: str | None) -> list[str]:
    """Chunk text by lines, preserving line boundaries.

    Args:
        text: Text to chunk
        max_tokens: Maximum tokens per chunk
        model: Optional model name for token counting

    Returns:
        List of text chunks
    """
    chunks = []
    lines = text.split("\n")
    current_chunk: list[str] = []
    current_tokens = 0

    logger.debug(f"Line-based chunking: {len(lines)} lines, max_tokens={max_tokens}")

    for line_num, line in enumerate(lines, 1):
        line_tokens = estimate_tokens(line + "\n", model)

        # If single line exceeds limit, split it by characters
        if line_tokens > max_tokens:
            logger.warning(
                f"Line {line_num} exceeds token limit ({line_tokens} > {max_tokens}), "
                f"falling back to character-based chunking. Line preview: {line[:100]}..."
            )

            # Save current chunk if any
            if current_chunk:
                chunks.append("\n".join(current_chunk))
                current_chunk = []
                current_tokens = 0

            # Split the large line
            line_chunks = _chunk_by_chars(line, max_tokens, model)
            chunks.extend(line_chunks)
            continue

        # Check if adding this line would exceed limit
        if current_tokens + line_tokens > max_tokens and current_chunk:
            # Save current chunk and start new one
            logger.debug(f"Chunk boundary at line {line_num} ({current_tokens} tokens)")
            chunks.append("\n".join(current_chunk))
            current_chunk = [line]
            current_tokens = line_tokens
        else:
            # Add to current chunk
            current_chunk.append(line)
            current_tokens += line_tokens

    # Add final chunk
    if current_chunk:
        chunks.append("\n".join(current_chunk))

    logger.debug(f"Line chunking complete: {len(chunks)} chunks created")
    return chunks if chunks else [text]


def _chunk_by_chars(text: str, max_tokens: int, model: str | None) -> list[str]:
    """Fallback: chunk text by characters when line-based chunking fails.

    Args:
        text: Text to chunk
        max_tokens: Maximum tokens per chunk
        model: Optional model name for token counting

    Returns:
        List of text chunks
    """
    # Convert tokens to approximate chars using heuristic
    max_chars = int(max_tokens * CHARS_PER_TOKEN_HEURISTIC)

    chunks = []
    start = 0
    text_len = len(text)

    logger.debug(
        f"Character-based chunking: text_length={text_len} chars, "
        f"max_chars={max_chars} (max_tokens={max_tokens})"
    )

    while start < text_len:
        # Calculate end position
        end = min(start + max_chars, text_len)

        # Try to break at word boundary if not at text end
        if end < text_len:
            # Look back for space
            space_pos = text.rfind(" ", start, end)
            if space_pos > start:
                original_end = end
                end = space_pos
                logger.debug(
                    f"Adjusted chunk boundary for word break: {original_end} → {end}"
                )

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        start = end

    logger.debug(
        f"Character chunking complete: {len(chunks)} chunks created "
        f"(avg {text_len//len(chunks) if chunks else 0} chars/chunk)"
    )
    return chunks if chunks else [text]


def merge_results(
    results: list[dict[str, Any]],
    strategy: MergeStrategy = MergeStrategy.CONCATENATE_LIST,
) -> dict[str, Any]:
    """Merge multiple agent results using specified strategy.

    Args:
        results: List of result dictionaries from agent chunks
        strategy: Merge strategy to use

    Returns:
        Merged result dictionary

    Examples:
        >>> results = [
        ...     {"items": [1, 2], "count": 2},
        ...     {"items": [3, 4], "count": 2}
        ... ]
        >>> merged = merge_results(results, MergeStrategy.CONCATENATE_LIST)
        >>> merged["items"]
        [1, 2, 3, 4]
        >>> merged["count"]
        2
    """
    try:
        if not results:
            logger.debug("merge_results called with empty results list")
            return {}

        if len(results) == 1:
            logger.debug("merge_results called with single result, returning as-is")
            return results[0]

        logger.info(
            f"Merging {len(results)} chunk results using strategy: {strategy.value}"
        )

        if strategy == MergeStrategy.CONCATENATE_LIST:
            merged = _merge_concatenate(results)
        elif strategy == MergeStrategy.MERGE_JSON:
            merged = _merge_json_deep(results)
        elif strategy == MergeStrategy.LLM_MERGE:
            raise NotImplementedError("LLM merge strategy not yet implemented")
        else:
            raise ValueError(f"Unknown merge strategy: {strategy}")

        logger.debug(f"Merge complete: result has {len(merged)} top-level keys")
        return merged

    except Exception as e:
        logger.exception(
            f"Merge failed: strategy={strategy.value}, num_results={len(results)}"
        )
        raise


def _merge_concatenate(results: list[dict[str, Any]]) -> dict[str, Any]:
    """Default merge: concatenate lists, update dicts, keep first scalar.

    Args:
        results: List of result dictionaries

    Returns:
        Merged result
    """
    merged = results[0].copy()
    logger.debug(f"Starting concatenate merge with {len(results)} results")

    for chunk_num, result in enumerate(results[1:], start=2):
        logger.debug(f"Merging chunk {chunk_num}/{len(results)}")

        for key, value in result.items():
            if key not in merged:
                merged[key] = value
                logger.debug(f"  Added new key '{key}' from chunk {chunk_num}")
                continue

            merged_value = merged[key]

            # Merge lists by concatenation
            if isinstance(merged_value, list) and isinstance(value, list):
                original_len = len(merged_value)
                merged[key] = merged_value + value
                logger.debug(
                    f"  Concatenated list '{key}': {original_len} + {len(value)} = {len(merged[key])} items"
                )

            # Merge dicts by update (shallow)
            elif isinstance(merged_value, dict) and isinstance(value, dict):
                merged[key].update(value)
                logger.debug(f"  Updated dict '{key}' with {len(value)} keys from chunk {chunk_num}")

            # For scalars, prefer non-None values, or keep first
            else:
                if merged_value is None:
                    merged[key] = value
                    logger.debug(f"  Replaced None value for '{key}' with value from chunk {chunk_num}")
                elif value is not None and merged_value != value:
                    # CRITICAL: Warn about silent data loss
                    logger.warning(
                        f"Scalar value conflict for key '{key}': "
                        f"keeping first value ({merged_value!r}), "
                        f"discarding chunk {chunk_num} value ({value!r})"
                    )

    logger.debug(f"Concatenate merge complete: {len(merged)} keys in result")
    return merged


def _merge_json_deep(results: list[dict[str, Any]]) -> dict[str, Any]:
    """Deep merge JSON objects recursively.

    Args:
        results: List of result dictionaries

    Returns:
        Deeply merged result
    """
    logger.debug(f"Starting deep JSON merge with {len(results)} results")

    def deep_merge(base: dict, update: dict, depth: int = 0) -> dict:
        """Recursively merge update into base."""
        merged = base.copy()
        indent = "  " * depth

        for key, value in update.items():
            if key not in merged:
                merged[key] = value
                logger.debug(f"{indent}Added new key '{key}' at depth {depth}")
            elif isinstance(merged[key], dict) and isinstance(value, dict):
                logger.debug(f"{indent}Deep merging dict '{key}' at depth {depth}")
                merged[key] = deep_merge(merged[key], value, depth + 1)
            elif isinstance(merged[key], list) and isinstance(value, list):
                original_len = len(merged[key])
                merged[key] = merged[key] + value
                logger.debug(
                    f"{indent}Concatenated list '{key}' at depth {depth}: "
                    f"{original_len} + {len(value)} = {len(merged[key])} items"
                )
            else:
                # Keep first non-None value
                if merged[key] is None:
                    merged[key] = value
                    logger.debug(f"{indent}Replaced None value for '{key}' at depth {depth}")
                elif value is not None and merged[key] != value:
                    logger.warning(
                        f"{indent}Scalar conflict at depth {depth} for '{key}': "
                        f"keeping first value ({merged[key]!r}), discarding ({value!r})"
                    )

        return merged

    result = results[0].copy()
    for chunk_num, r in enumerate(results[1:], start=2):
        logger.debug(f"Deep merging chunk {chunk_num}/{len(results)}")
        result = deep_merge(result, r, depth=0)

    logger.debug(f"Deep merge complete: {len(result)} top-level keys")
    return result
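
For context, a minimal end-to-end sketch of how the API added in this file is intended to be used, based on the module docstring above. Only smart_chunk_text, merge_results, and MergeStrategy come from rem.utils.agentic_chunking; the run_extraction_agent helper and the sample document are hypothetical stand-ins for whatever agent actually processes each chunk.

# Illustrative sketch only: run_extraction_agent and the sample document are
# hypothetical; the imported names are the ones defined in the module above.
from rem.utils.agentic_chunking import MergeStrategy, merge_results, smart_chunk_text


def run_extraction_agent(chunk: str) -> dict:
    """Hypothetical per-chunk agent call; returns a small dict result."""
    return {"items": [chunk[:40]], "source": "example"}


document = "Line of contract text.\n" * 50_000  # large enough to force chunking on gpt-4o

# Auto-size chunks for the model's context window, reserving room for the
# system prompt and the default 25% safety buffer.
chunks = smart_chunk_text(document, model="gpt-4o", system_prompt="Extract key terms.")

# Process chunks independently, then merge: CONCATENATE_LIST concatenates lists,
# shallow-updates dicts, and keeps the first value for conflicting scalars.
results = [run_extraction_agent(c) for c in chunks]
merged = merge_results(results, strategy=MergeStrategy.CONCATENATE_LIST)

print(f"{len(chunks)} chunks -> {len(merged['items'])} extracted items")

Under the limits tabled in MODEL_LIMITS, gpt-4o has max_input = 128000 - 16384 = 111616 tokens, so with the default buffer_ratio of 0.75 each chunk is capped at roughly 0.75 × (111616 − system prompt tokens) ≈ 83K tokens.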