contextual-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. contextual/__init__.py +18 -0
  2. contextual/__main__.py +11 -0
  3. contextual/cli.py +339 -0
  4. contextual/cli_docs.py +685 -0
  5. contextual/config.py +7 -0
  6. contextual/core/__init__.py +11 -0
  7. contextual/core/errors.py +470 -0
  8. contextual/core/models.py +590 -0
  9. contextual/docs/__init__.py +66 -0
  10. contextual/docs/chunker.py +550 -0
  11. contextual/docs/pipeline.py +513 -0
  12. contextual/docs/retrieval.py +654 -0
  13. contextual/docs/watcher.py +265 -0
  14. contextual/embedding/__init__.py +87 -0
  15. contextual/embedding/cache.py +455 -0
  16. contextual/embedding/embedder.py +414 -0
  17. contextual/embedding/helpers.py +252 -0
  18. contextual/git/__init__.py +22 -0
  19. contextual/git/blame.py +334 -0
  20. contextual/indexing/__init__.py +20 -0
  21. contextual/indexing/bug_sweep.py +119 -0
  22. contextual/indexing/chunker.py +691 -0
  23. contextual/indexing/embedder.py +271 -0
  24. contextual/indexing/file_watcher.py +154 -0
  25. contextual/indexing/incremental.py +260 -0
  26. contextual/indexing/index_writer.py +442 -0
  27. contextual/indexing/pipeline.py +438 -0
  28. contextual/indexing/processor.py +436 -0
  29. contextual/indexing/queries/readme.md +22 -0
  30. contextual/indexing/symbol_extractor.py +426 -0
  31. contextual/indexing/tokenizer.py +203 -0
  32. contextual/integrations/__init__.py +10 -0
  33. contextual/mcp/__init__.py +15 -0
  34. contextual/mcp/__main__.py +24 -0
  35. contextual/mcp/docs_tools.py +286 -0
  36. contextual/mcp/server.py +118 -0
  37. contextual/mcp/tools.py +443 -0
  38. contextual/observability/__init__.py +21 -0
  39. contextual/observability/logging.py +115 -0
  40. contextual/py.typed +0 -0
  41. contextual/retrieval/__init__.py +24 -0
  42. contextual/retrieval/context_assembler.py +372 -0
  43. contextual/retrieval/ranker.py +193 -0
  44. contextual/retrieval/search.py +548 -0
  45. contextual/security/__init__.py +52 -0
  46. contextual/security/paths.py +347 -0
  47. contextual/security/sanitize.py +349 -0
  48. contextual/security/workspace.py +348 -0
  49. contextual/storage/__init__.py +36 -0
  50. contextual/storage/fts_manager.py +273 -0
  51. contextual/storage/migration_v2.py +289 -0
  52. contextual/storage/migrations.py +316 -0
  53. contextual/storage/schema.py +210 -0
  54. contextual/storage/sqlite_pool.py +468 -0
  55. contextual/storage/vec0_manager.py +421 -0
  56. contextual_engine-0.1.0.dist-info/METADATA +297 -0
  57. contextual_engine-0.1.0.dist-info/RECORD +60 -0
  58. contextual_engine-0.1.0.dist-info/WHEEL +4 -0
  59. contextual_engine-0.1.0.dist-info/entry_points.txt +2 -0
  60. contextual_engine-0.1.0.dist-info/licenses/LICENSE +111 -0
@@ -0,0 +1,11 @@
1
+ """Core data models, errors, and orchestration engine.
2
+
3
+ This module contains the foundational contracts that all other modules depend on.
4
+ All data shapes (Pydantic models), error types, and the main indexing/retrieval
5
+ orchestrator live here.
6
+ """
7
+ from __future__ import annotations
8
+
9
+
10
# Public API of the ``contextual.core`` package. It is intentionally empty:
# this module imports nothing from its submodules, so nothing is re-exported.
# Exports will be added as models.py, errors.py, and engine.py are built.
# NOTE(review): the wheel's file list shows core/errors.py and core/models.py
# but no core/engine.py - confirm whether the orchestrator has landed yet.
__all__: list[str] = []
@@ -0,0 +1,470 @@
1
+ """Error types and exception hierarchy for Contextual.
2
+
3
+ Provides typed error codes and structured exceptions for consistent error handling
4
+ across all modules. Every error carries a code, message, and optional context.
5
+
6
+ Error handling philosophy:
7
+ - Fail fast with clear error messages
8
+ - Never swallow exceptions silently
9
+ - Provide actionable error context
10
+ - Distinguish retriable vs non-retriable errors
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from enum import StrEnum
16
+ from typing import Any
17
+
18
+
19
+ # ============================================================================
20
+ # ERROR CODES - Typed categorical error classification
21
+ # ============================================================================
22
+
23
+
24
class ErrorCode(StrEnum):
    """Categorized error codes for all subsystems.

    Format: SUBSYSTEM_SPECIFIC_CONDITION

    Each member's value is the lower-case form of its name. Because this is a
    ``StrEnum``, members are also ``str`` instances and compare equal to that
    value.

    These codes enable:
    - Structured error logging
    - Client-side error handling (MCP clients)
    - Metrics/alerting aggregation
    - Automatic retry logic (see the module-level ``RETRIABLE_ERRORS`` set)

    The "Nxx range" labels on the section headers below are organizational
    only; no numeric value is attached to any member.
    """

    # ========================================================================
    # STORAGE ERRORS (1xx range conceptually)
    # ========================================================================

    # SQLite errors
    STORAGE_SQLITE_CONNECTION_FAILED = "storage_sqlite_connection_failed"
    STORAGE_SQLITE_POOL_EXHAUSTED = "storage_sqlite_pool_exhausted"  # Retriable
    STORAGE_SQLITE_QUERY_FAILED = "storage_sqlite_query_failed"
    STORAGE_SQLITE_TRANSACTION_FAILED = "storage_sqlite_transaction_failed"
    STORAGE_SQLITE_SCHEMA_INVALID = "storage_sqlite_schema_invalid"
    STORAGE_SQLITE_WAL_CHECKPOINT_FAILED = "storage_sqlite_wal_checkpoint_failed"
    STORAGE_SQLITE_BUSY = "storage_sqlite_busy"  # Retriable

    # LanceDB errors
    STORAGE_LANCE_CONNECTION_FAILED = "storage_lance_connection_failed"
    STORAGE_LANCE_TABLE_NOT_FOUND = "storage_lance_table_not_found"
    STORAGE_LANCE_INSERT_FAILED = "storage_lance_insert_failed"
    STORAGE_LANCE_SEARCH_FAILED = "storage_lance_search_failed"
    STORAGE_LANCE_INDEX_FAILED = "storage_lance_index_failed"
    STORAGE_LANCE_OPTIMIZE_FAILED = "storage_lance_optimize_failed"

    # Vec0 errors
    STORAGE_VEC_FAILED = "storage_vec_failed"
    STORAGE_VEC_DIMENSION_MISMATCH = "storage_vec_dimension_mismatch"
    STORAGE_VEC_INVALID_PARAMETER = "storage_vec_invalid_parameter"

    # Tantivy errors
    STORAGE_TANTIVY_INDEX_OPEN_FAILED = "storage_tantivy_index_open_failed"
    STORAGE_TANTIVY_INDEX_WRITE_FAILED = "storage_tantivy_index_write_failed"
    STORAGE_TANTIVY_SEARCH_FAILED = "storage_tantivy_search_failed"

    # Migration errors
    STORAGE_MIGRATION_FAILED = "storage_migration_failed"
    STORAGE_MIGRATION_ROLLBACK_FAILED = "storage_migration_rollback_failed"
    STORAGE_MIGRATION_VERSION_CONFLICT = "storage_migration_version_conflict"

    # Temporal query errors
    STORAGE_TEMPORAL_QUERY_INVALID = "storage_temporal_query_invalid"
    STORAGE_TEMPORAL_CONTRADICTION_DETECTED = "storage_temporal_contradiction_detected"

    # ========================================================================
    # INDEXING ERRORS (2xx range conceptually)
    # ========================================================================

    # Parser errors
    INDEXING_PARSER_INIT_FAILED = "indexing_parser_init_failed"
    INDEXING_PARSER_PARSE_FAILED = "indexing_parser_parse_failed"
    INDEXING_PARSER_UNSUPPORTED_LANGUAGE = "indexing_parser_unsupported_language"
    INDEXING_PARSER_POOL_EXHAUSTED = "indexing_parser_pool_exhausted"  # Retriable

    # Chunking errors
    INDEXING_CHUNKER_FILE_TOO_LARGE = "indexing_chunker_file_too_large"
    INDEXING_CHUNKER_BINARY_FILE = "indexing_chunker_binary_file"
    INDEXING_CHUNKER_ENCODING_ERROR = "indexing_chunker_encoding_error"
    INDEXING_CHUNKER_SPLIT_FAILED = "indexing_chunker_split_failed"

    # File processing errors
    INDEXING_FILE_NOT_FOUND = "indexing_file_not_found"
    INDEXING_FILE_PERMISSION_DENIED = "indexing_file_permission_denied"
    INDEXING_FILE_READ_FAILED = "indexing_file_read_failed"
    INDEXING_FILE_IGNORED = "indexing_file_ignored"  # Not an error, just logged

    # Embedding errors
    INDEXING_EMBEDDING_MODEL_LOAD_FAILED = "indexing_embedding_model_load_failed"
    INDEXING_EMBEDDING_INFERENCE_FAILED = "indexing_embedding_inference_failed"
    INDEXING_EMBEDDING_BATCH_TOO_LARGE = "indexing_embedding_batch_too_large"
    INDEXING_EMBEDDING_OOM = "indexing_embedding_oom"  # Out of memory; retriable

    # Content hash errors
    INDEXING_HASH_COLLISION = "indexing_hash_collision"  # Extremely unlikely SHA-256 collision

    # ========================================================================
    # GIT ERRORS (3xx range conceptually)
    # ========================================================================

    GIT_REPO_NOT_FOUND = "git_repo_not_found"
    GIT_REPO_INVALID = "git_repo_invalid"
    GIT_COMMIT_WALK_FAILED = "git_commit_walk_failed"
    GIT_BLAME_FAILED = "git_blame_failed"
    GIT_DIFF_FAILED = "git_diff_failed"
    GIT_SHALLOW_CLONE = "git_shallow_clone"  # Warning, not error
    GIT_DETACHED_HEAD = "git_detached_head"  # Warning
    GIT_REBASE_DETECTED = "git_rebase_detected"  # Requires special handling
    GIT_FORCE_PUSH_DETECTED = "git_force_push_detected"  # Requires re-indexing

    # ========================================================================
    # RETRIEVAL ERRORS (4xx range conceptually)
    # ========================================================================

    # Search errors
    RETRIEVAL_QUERY_EMPTY = "retrieval_query_empty"
    RETRIEVAL_QUERY_TOO_LONG = "retrieval_query_too_long"
    RETRIEVAL_BM25_FAILED = "retrieval_bm25_failed"
    RETRIEVAL_DENSE_SEARCH_FAILED = "retrieval_dense_search_failed"
    RETRIEVAL_FUSION_FAILED = "retrieval_fusion_failed"
    RETRIEVAL_RERANK_FAILED = "retrieval_rerank_failed"
    RETRIEVAL_HYDRATION_FAILED = "retrieval_hydration_failed"
    RETRIEVAL_NO_RESULTS = "retrieval_no_results"  # Not an error, informational

    # Context assembly errors
    RETRIEVAL_CONTEXT_BUDGET_EXCEEDED = "retrieval_context_budget_exceeded"
    RETRIEVAL_CONTEXT_ASSEMBLY_FAILED = "retrieval_context_assembly_failed"

    # ========================================================================
    # SECURITY ERRORS (5xx range conceptually)
    # ========================================================================

    # Path safety errors
    SECURITY_PATH_TRAVERSAL = "security_path_traversal"
    SECURITY_PATH_OUTSIDE_ROOT = "security_path_outside_root"
    SECURITY_PATH_SYMLINK_LOOP = "security_path_symlink_loop"
    SECURITY_PATH_INVALID = "security_path_invalid"
    SECURITY_PATH_RESERVED_NAME = "security_path_reserved_name"  # Windows: CON, PRN, etc.

    # Sanitization errors
    SECURITY_SQL_INJECTION_ATTEMPT = "security_sql_injection_attempt"
    SECURITY_FTS5_INJECTION_ATTEMPT = "security_fts5_injection_attempt"
    SECURITY_UNICODE_ATTACK = "security_unicode_attack"

    # Workspace errors
    SECURITY_WORKSPACE_PERMISSION_DENIED = "security_workspace_permission_denied"
    SECURITY_WORKSPACE_INIT_FAILED = "security_workspace_init_failed"
    SECURITY_WORKSPACE_ISOLATION_VIOLATION = "security_workspace_isolation_violation"

    # Prompt injection
    SECURITY_PROMPT_INJECTION_DETECTED = "security_prompt_injection_detected"

    # ========================================================================
    # CONFIGURATION ERRORS (6xx range conceptually)
    # ========================================================================

    CONFIG_FILE_NOT_FOUND = "config_file_not_found"
    CONFIG_FILE_INVALID = "config_file_invalid"
    CONFIG_PARSE_FAILED = "config_parse_failed"
    CONFIG_VALIDATION_FAILED = "config_validation_failed"
    CONFIG_MISSING_REQUIRED = "config_missing_required"
    CONFIG_VALUE_OUT_OF_RANGE = "config_value_out_of_range"

    # ========================================================================
    # MCP SERVER ERRORS (7xx range conceptually)
    # ========================================================================

    MCP_TOOL_CALL_FAILED = "mcp_tool_call_failed"
    MCP_INVALID_PARAMETERS = "mcp_invalid_parameters"
    MCP_TRANSPORT_ERROR = "mcp_transport_error"
    MCP_TIMEOUT = "mcp_timeout"  # Retriable
    MCP_CLIENT_DISCONNECTED = "mcp_client_disconnected"  # Retriable

    # ========================================================================
    # SYSTEM ERRORS (8xx range conceptually)
    # ========================================================================

    SYSTEM_DISK_FULL = "system_disk_full"
    SYSTEM_OOM = "system_oom"  # Out of memory
    SYSTEM_PERMISSION_DENIED = "system_permission_denied"
    SYSTEM_RESOURCE_EXHAUSTED = "system_resource_exhausted"  # Retriable
    SYSTEM_SHUTDOWN = "system_shutdown"  # Graceful shutdown in progress

    # ========================================================================
    # UNKNOWN/INTERNAL ERRORS (9xx range conceptually)
    # ========================================================================

    INTERNAL_ASSERTION_FAILED = "internal_assertion_failed"
    INTERNAL_INVARIANT_VIOLATED = "internal_invariant_violated"
    INTERNAL_UNEXPECTED_STATE = "internal_unexpected_state"
    UNKNOWN = "unknown"
202
+
203
+
204
+ # ============================================================================
205
+ # RETRY CLASSIFICATION
206
+ # ============================================================================
207
+
208
+
209
# Error codes whose underlying condition is treated as transient. Membership
# in this immutable set is the single source of truth used by is_retriable().
RETRIABLE_ERRORS: frozenset[ErrorCode] = frozenset(
    (
        # Storage contention
        ErrorCode.STORAGE_SQLITE_BUSY,
        ErrorCode.STORAGE_SQLITE_POOL_EXHAUSTED,
        # Indexing resource pressure
        ErrorCode.INDEXING_PARSER_POOL_EXHAUSTED,
        ErrorCode.INDEXING_EMBEDDING_OOM,
        # MCP transport hiccups
        ErrorCode.MCP_TIMEOUT,
        ErrorCode.MCP_CLIENT_DISCONNECTED,
        # System-wide resource pressure
        ErrorCode.SYSTEM_RESOURCE_EXHAUSTED,
    )
)
218
+
219
+
220
def is_retriable(code: ErrorCode) -> bool:
    """Report whether *code* is classified as a retriable condition.

    The classification is a plain membership test against the module-level
    ``RETRIABLE_ERRORS`` set.

    Args:
        code: Error code to classify.

    Returns:
        True when ``code`` is a member of ``RETRIABLE_ERRORS``, otherwise False.
    """
    listed_as_transient = code in RETRIABLE_ERRORS
    return listed_as_transient
230
+
231
+
232
+ # ============================================================================
233
+ # EXCEPTION HIERARCHY
234
+ # ============================================================================
235
+
236
+
237
class ContextualExceptionError(Exception):
    """Base exception for all Contextual errors.

    All custom exceptions inherit from this to enable broad catch blocks
    while maintaining error code granularity.

    Instances can be pickled and copied: ``__reduce__`` rebuilds them from
    ``(code, message, context)`` rather than from the single formatted string
    stored in ``args``.

    Attributes:
        code: Typed error code for categorization.
        message: Human-readable error message (without code or context).
        context: Additional error context (file paths, query text, etc.).
            Always a dict owned by the exception; empty when none was given.
    """

    def __init__(
        self,
        code: ErrorCode,
        message: str,
        context: dict[str, Any] | None = None,
    ) -> None:
        """Initialize a Contextual exception.

        Args:
            code: Error code categorizing this error.
            message: Human-readable error description.
            context: Optional contextual information. The mapping is copied,
                so later changes to the caller's dict do not affect this
                exception.
        """
        self.code = code
        self.message = message
        # Copy the mapping: the formatted text handed to Exception below is
        # computed once, so sharing the caller's dict would let ``context``
        # drift out of sync with ``str(self)``.
        self.context = dict(context) if context else {}
        super().__init__(self._format_message())

    def __reduce__(self) -> tuple[Any, ...]:
        """Describe how to rebuild this exception for pickle and copy.

        The default ``BaseException.__reduce__`` replays ``self.args``, which
        holds only the formatted message. Replaying that through the
        three-parameter ``__init__`` raises ``TypeError``, so the original
        constructor arguments are supplied here instead.

        Returns:
            A ``(callable, args, state)`` tuple as defined by the pickle
            protocol.
        """
        # A subclass that defines its own __init__ may take different
        # arguments; leave such classes on the default behaviour.
        if type(self).__init__ is not ContextualExceptionError.__init__:
            return super().__reduce__()
        # The instance dict is passed as state so that any extra attributes
        # set on the exception are carried over as well.
        return (
            type(self),
            (self.code, self.message, self.context),
            dict(self.__dict__),
        )

    def _format_message(self) -> str:
        """Format the complete error message.

        Returns:
            ``"[<code value>] <message>"``, followed by
            ``" (key=value, ...)"`` when the context is non-empty.
        """
        text = f"[{self.code.value}] {self.message}"
        if not self.context:
            return text
        pairs = ", ".join(f"{key}={value}" for key, value in self.context.items())
        return f"{text} ({pairs})"

    def is_retriable(self) -> bool:
        """Check if this error represents a retriable condition.

        Returns:
            True if the error should be retried.
        """
        # Resolves to the module-level is_retriable() function, not this method.
        return is_retriable(self.code)
286
+
287
+
288
+ # ============================================================================
289
+ # SUBSYSTEM-SPECIFIC EXCEPTIONS
290
+ # ============================================================================
291
+
292
+
293
class StorageError(ContextualExceptionError):
    """Errors from the storage layer (SQLite, LanceDB, tantivy).

    Adds no behavior of its own; it exists so callers can catch storage
    failures separately from other Contextual errors. Presumably raised with
    the ``STORAGE_*`` error codes, although the constructor does not enforce
    that pairing.
    """
295
+
296
+
297
class IndexingError(ContextualExceptionError):
    """Errors from the indexing pipeline (parsing, chunking, embedding).

    Adds no behavior of its own; it exists so callers can catch indexing
    failures separately from other Contextual errors. Presumably raised with
    the ``INDEXING_*`` error codes, although the constructor does not enforce
    that pairing.
    """
299
+
300
+
301
class GitError(ContextualExceptionError):
    """Errors from git operations (commit walking, blame, diff).

    Adds no behavior of its own; it exists so callers can catch git failures
    separately from other Contextual errors. Presumably raised with the
    ``GIT_*`` error codes, although the constructor does not enforce that
    pairing.
    """
303
+
304
+
305
class RetrievalError(ContextualExceptionError):
    """Errors from the retrieval pipeline (search, fusion, reranking).

    Adds no behavior of its own; it exists so callers can catch retrieval
    failures separately from other Contextual errors. Presumably raised with
    the ``RETRIEVAL_*`` error codes, although the constructor does not enforce
    that pairing.
    """
307
+
308
+
309
class SecurityError(ContextualExceptionError):
    """Errors from security checks (path safety, sanitization, isolation).

    Adds no behavior of its own; it exists so callers can catch security
    failures separately from other Contextual errors. Presumably raised with
    the ``SECURITY_*`` error codes, although the constructor does not enforce
    that pairing.
    """
311
+
312
+
313
class ConfigError(ContextualExceptionError):
    """Errors from configuration loading and validation.

    Adds no behavior of its own; it exists so callers can catch configuration
    failures separately from other Contextual errors. Presumably raised with
    the ``CONFIG_*`` error codes, although the constructor does not enforce
    that pairing.
    """
315
+
316
+
317
class MCPError(ContextualExceptionError):
    """Errors from MCP server operations.

    Adds no behavior of its own; it exists so callers can catch MCP failures
    separately from other Contextual errors. Presumably raised with the
    ``MCP_*`` error codes, although the constructor does not enforce that
    pairing.
    """
319
+
320
+
321
class ContextualSystemError(ContextualExceptionError):
    """System-level errors (disk, memory, permissions).

    Adds no behavior of its own; it exists so callers can catch system-level
    failures separately from other Contextual errors. Presumably raised with
    the ``SYSTEM_*`` error codes, although the constructor does not enforce
    that pairing. This class is unrelated to the builtin ``SystemError``; it
    derives from ``ContextualExceptionError``.
    """
323
+
324
+
325
class InternalError(ContextualExceptionError):
    """Internal errors indicating bugs (assertion failures, invariant violations).

    Adds no behavior of its own; it exists so callers can catch internal
    failures separately from other Contextual errors. Presumably raised with
    the ``INTERNAL_*`` / ``UNKNOWN`` error codes, although the constructor
    does not enforce that pairing.
    """
327
+
328
+
329
+ # ============================================================================
330
+ # ERROR CONTEXT HELPERS
331
+ # ============================================================================
332
+
333
+
334
+ def storage_context(
335
+ db_path: str | None = None,
336
+ table: str | None = None,
337
+ query: str | None = None,
338
+ ) -> dict[str, Any]:
339
+ """Build context dict for storage errors.
340
+
341
+ Args:
342
+ db_path: Database file path.
343
+ table: Table name.
344
+ query: SQL query (truncated if too long).
345
+
346
+ Returns:
347
+ Context dictionary.
348
+ """
349
+ context = {}
350
+ if db_path:
351
+ context["db_path"] = db_path
352
+ if table:
353
+ context["table"] = table
354
+ # Context truncation limit for long queries
355
+ _query_truncate_length = 200
356
+ if query:
357
+ # Truncate long queries
358
+ context["query"] = (
359
+ query[:_query_truncate_length] + "..."
360
+ if len(query) > _query_truncate_length
361
+ else query
362
+ )
363
+ return context
364
+
365
+
366
+ def indexing_context(
367
+ file_path: str | None = None,
368
+ language: str | None = None,
369
+ line: int | None = None,
370
+ ) -> dict[str, Any]:
371
+ """Build context dict for indexing errors.
372
+
373
+ Args:
374
+ file_path: File being processed.
375
+ language: Programming language.
376
+ line: Line number where error occurred.
377
+
378
+ Returns:
379
+ Context dictionary.
380
+ """
381
+ context: dict[str, Any] = {}
382
+ if file_path:
383
+ context["file_path"] = file_path
384
+ if language:
385
+ context["language"] = language
386
+ if line is not None:
387
+ context["line"] = line
388
+ return context
389
+
390
+
391
+ def git_context(
392
+ repo_path: str | None = None,
393
+ commit_sha: str | None = None,
394
+ branch: str | None = None,
395
+ ) -> dict[str, Any]:
396
+ """Build context dict for git errors.
397
+
398
+ Args:
399
+ repo_path: Repository path.
400
+ commit_sha: Commit SHA (first 8 chars).
401
+ branch: Branch name.
402
+
403
+ Returns:
404
+ Context dictionary.
405
+ """
406
+ context = {}
407
+ if repo_path:
408
+ context["repo_path"] = repo_path
409
+ if commit_sha:
410
+ context["commit_sha"] = commit_sha[:8] # Short SHA
411
+ if branch:
412
+ context["branch"] = branch
413
+ return context
414
+
415
+
416
+ def retrieval_context(
417
+ query: str | None = None,
418
+ top_k: int | None = None,
419
+ phase: str | None = None,
420
+ ) -> dict[str, Any]:
421
+ """Build context dict for retrieval errors.
422
+
423
+ Args:
424
+ query: Search query (truncated).
425
+ top_k: Number of results requested.
426
+ phase: Pipeline phase (bm25, dense, fusion, rerank).
427
+
428
+ Returns:
429
+ Context dictionary.
430
+ """
431
+ # Query truncation limit for retrieval context
432
+ _query_truncate_length = 100
433
+ context: dict[str, Any] = {}
434
+ if query:
435
+ # Truncate long queries
436
+ context["query"] = (
437
+ query[:_query_truncate_length] + "..."
438
+ if len(query) > _query_truncate_length
439
+ else query
440
+ )
441
+ if top_k is not None:
442
+ context["top_k"] = top_k
443
+ if phase:
444
+ context["phase"] = phase
445
+ return context
446
+
447
+
448
+ def security_context(
449
+ path: str | None = None,
450
+ root: str | None = None,
451
+ attack_type: str | None = None,
452
+ ) -> dict[str, Any]:
453
+ """Build context dict for security errors.
454
+
455
+ Args:
456
+ path: Path that violated security check.
457
+ root: Expected root path.
458
+ attack_type: Type of attack detected.
459
+
460
+ Returns:
461
+ Context dictionary.
462
+ """
463
+ context = {}
464
+ if path:
465
+ context["path"] = path
466
+ if root:
467
+ context["root"] = root
468
+ if attack_type:
469
+ context["attack_type"] = attack_type
470
+ return context