contextual-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. contextual/__init__.py +18 -0
  2. contextual/__main__.py +11 -0
  3. contextual/cli.py +339 -0
  4. contextual/cli_docs.py +685 -0
  5. contextual/config.py +7 -0
  6. contextual/core/__init__.py +11 -0
  7. contextual/core/errors.py +470 -0
  8. contextual/core/models.py +590 -0
  9. contextual/docs/__init__.py +66 -0
  10. contextual/docs/chunker.py +550 -0
  11. contextual/docs/pipeline.py +513 -0
  12. contextual/docs/retrieval.py +654 -0
  13. contextual/docs/watcher.py +265 -0
  14. contextual/embedding/__init__.py +87 -0
  15. contextual/embedding/cache.py +455 -0
  16. contextual/embedding/embedder.py +414 -0
  17. contextual/embedding/helpers.py +252 -0
  18. contextual/git/__init__.py +22 -0
  19. contextual/git/blame.py +334 -0
  20. contextual/indexing/__init__.py +20 -0
  21. contextual/indexing/bug_sweep.py +119 -0
  22. contextual/indexing/chunker.py +691 -0
  23. contextual/indexing/embedder.py +271 -0
  24. contextual/indexing/file_watcher.py +154 -0
  25. contextual/indexing/incremental.py +260 -0
  26. contextual/indexing/index_writer.py +442 -0
  27. contextual/indexing/pipeline.py +438 -0
  28. contextual/indexing/processor.py +436 -0
  29. contextual/indexing/queries/readme.md +22 -0
  30. contextual/indexing/symbol_extractor.py +426 -0
  31. contextual/indexing/tokenizer.py +203 -0
  32. contextual/integrations/__init__.py +10 -0
  33. contextual/mcp/__init__.py +15 -0
  34. contextual/mcp/__main__.py +24 -0
  35. contextual/mcp/docs_tools.py +286 -0
  36. contextual/mcp/server.py +118 -0
  37. contextual/mcp/tools.py +443 -0
  38. contextual/observability/__init__.py +21 -0
  39. contextual/observability/logging.py +115 -0
  40. contextual/py.typed +0 -0
  41. contextual/retrieval/__init__.py +24 -0
  42. contextual/retrieval/context_assembler.py +372 -0
  43. contextual/retrieval/ranker.py +193 -0
  44. contextual/retrieval/search.py +548 -0
  45. contextual/security/__init__.py +52 -0
  46. contextual/security/paths.py +347 -0
  47. contextual/security/sanitize.py +349 -0
  48. contextual/security/workspace.py +348 -0
  49. contextual/storage/__init__.py +36 -0
  50. contextual/storage/fts_manager.py +273 -0
  51. contextual/storage/migration_v2.py +289 -0
  52. contextual/storage/migrations.py +316 -0
  53. contextual/storage/schema.py +210 -0
  54. contextual/storage/sqlite_pool.py +468 -0
  55. contextual/storage/vec0_manager.py +421 -0
  56. contextual_engine-0.1.0.dist-info/METADATA +297 -0
  57. contextual_engine-0.1.0.dist-info/RECORD +60 -0
  58. contextual_engine-0.1.0.dist-info/WHEEL +4 -0
  59. contextual_engine-0.1.0.dist-info/entry_points.txt +2 -0
  60. contextual_engine-0.1.0.dist-info/licenses/LICENSE +111 -0
@@ -0,0 +1,590 @@
1
+ """Core data models for Contextual.
2
+
3
+ This module defines all data shapes used throughout the system. Every other module
4
+ imports from here. These contracts are the architectural foundation - changing them
5
+ requires cascading updates across storage, indexing, and retrieval layers.
6
+
7
+ All models use Pydantic v2 for validation and serialization.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from datetime import UTC, datetime
13
+ from enum import StrEnum
14
+ from pathlib import Path
15
+ from typing import Any, Literal
16
+
17
+ from pydantic import BaseModel, Field, field_validator
18
+
19
+
20
+ # ============================================================================
21
+ # ENUMS - Type-safe categorical values
22
+ # ============================================================================
23
+
24
+
25
+ class EntityType(StrEnum):
26
+ """Types of entities that can be tracked in the knowledge graph.
27
+
28
+ These align with code structure and decision artifacts.
29
+ """
30
+
31
+ FUNCTION = "function"
32
+ CLASS = "class"
33
+ MODULE = "module"
34
+ FILE = "file"
35
+ SYMBOL = "symbol"
36
+ DECISION = "decision" # ADR, RFC, design doc
37
+ ADR = "adr" # Architecture Decision Record
38
+ DEPENDENCY = "dependency" # External package/library
39
+
40
+
41
+ class ChunkType(StrEnum):
42
+ """Types of code chunks extracted during indexing.
43
+
44
+ Determines chunking strategy and context assembly behavior.
45
+ """
46
+
47
+ FUNCTION = "function"
48
+ METHOD = "method"
49
+ CLASS = "class"
50
+ MODULE = "module"
51
+ IMPORT_BLOCK = "import_block"
52
+ TYPE_DEFINITION = "type_definition"
53
+ INTERFACE = "interface"
54
+ CONSTANT = "constant"
55
+
56
+
57
+ class EpisodeSource(StrEnum):
58
+ """Sources of ingestion events.
59
+
60
+ Tracks provenance of information entering the system.
61
+ """
62
+
63
+ COMMIT = "commit" # Git commit
64
+ FILE = "file" # Direct file ingestion
65
+ USER = "user" # User-provided context
66
+ HOOK = "hook" # Git hook trigger
67
+ MANUAL = "manual" # Manual re-index
68
+
69
+
70
+ class FactSource(StrEnum):
71
+ """Sources of factual assertions.
72
+
73
+ Determines confidence and invalidation rules.
74
+ """
75
+
76
+ TREE_SITTER = "tree-sitter" # AST parsing (high confidence)
77
+ GIT_BLAME = "git-blame" # Temporal attribution (high confidence)
78
+ USER = "user" # User annotation (medium confidence)
79
+ LLM_EXTRACT = "llm-extract" # LLM extraction (lower confidence)
80
+ HEURISTIC = "heuristic" # Pattern matching (medium confidence)
81
+
82
+
83
+ class Language(StrEnum):
84
+ """Supported programming languages for indexing.
85
+
86
+ Week 1 MVP: Python, TypeScript, JavaScript, Go, Java, Rust, C#
87
+ """
88
+
89
+ PYTHON = "python"
90
+ TYPESCRIPT = "typescript"
91
+ JAVASCRIPT = "javascript"
92
+ TSX = "tsx"
93
+ GO = "go"
94
+ JAVA = "java"
95
+ RUST = "rust"
96
+ CSHARP = "csharp"
97
+ # Config formats (free wins)
98
+ JSON = "json"
99
+ YAML = "yaml"
100
+ TOML = "toml"
101
+ MARKDOWN = "markdown"
102
+ DOCKERFILE = "dockerfile"
103
+
104
+
105
+ class ModelType(StrEnum):
106
+ """Embedding models used for vector generation.
107
+
108
+ Determines which index to search and dimension expectations.
109
+ """
110
+
111
+ JINA_CODE_V2 = "jina-v2-code" # 768d, code-specific
112
+
113
+
114
+ # ============================================================================
115
+ # CORE ENTITIES - Bi-temporal knowledge graph nodes
116
+ # ============================================================================
117
+
118
+
119
class Entity(BaseModel):
    """A node in the code knowledge graph.

    An entity stands for a code structure (function, class, module), a
    decision record, or a dependency. Facts are attached to entities.

    Attributes:
        id: Database primary key; ``None`` until the row is inserted.
        name: Fully qualified name, e.g. "myapp.utils.helpers.parse_json"
            (1-500 characters).
        entity_type: Category of the entity.
        metadata: Free-form JSON-style mapping of entity-specific data.
        created_at: First-observed time in unix milliseconds (UTC);
            defaults to the current time.
    """

    model_config = {"frozen": False}

    id: int | None = Field(default=None)
    name: str = Field(..., min_length=1, max_length=500)
    entity_type: EntityType
    metadata: dict[str, Any] = Field(default_factory=dict)
    created_at: int = Field(
        # Current UTC time, truncated to whole milliseconds.
        default_factory=lambda: int(datetime.now(tz=UTC).timestamp() * 1000),
    )
142
+
143
+
144
class Fact(BaseModel):
    """A bi-temporal assertion about an entity.

    A fact states something like "function X has signature Y" or
    "module A imports B". Four timestamps track it on two time axes:

    - ``valid_at`` / ``invalid_at``: when the statement was true in reality
      (``invalid_at`` is ``None`` while it still holds).
    - ``created_at`` / ``expired_at``: when the system recorded the statement
      and when it retracted that belief (``None`` while still believed).

    This follows the Snodgrass/Jensen bi-temporal formalism, so AS-OF queries
    and contradiction detection work without deleting history.

    Attributes:
        id: Database primary key.
        entity_id: Subject entity of the fact.
        object_id: Object entity for entity-entity relations, else ``None``.
        predicate: Relationship type, e.g. "has_signature", "calls", "imports"
            (1-100 characters).
        value: Literal value for entity-value facts, as a JSON string
            (at most 10000 characters).
        valid_at: When the fact became true (unix ms UTC, from git blame);
            must be non-negative.
        invalid_at: When the fact stopped being true, or ``None``.
        created_at: When the fact was recorded (unix ms UTC); defaults to now.
        expired_at: When the belief was retracted, or ``None``.
        episode_id: Episode that produced this fact.
        confidence: Confidence score in the range 0.0-1.0.
        source: Where the fact came from.
    """

    model_config = {"frozen": False}

    id: int | None = Field(default=None)
    entity_id: int
    object_id: int | None = Field(default=None)
    predicate: str = Field(..., min_length=1, max_length=100)
    value: str | None = Field(default=None, max_length=10000)
    valid_at: int  # unix milliseconds, UTC
    invalid_at: int | None = Field(default=None)
    created_at: int = Field(
        # Current UTC time, truncated to whole milliseconds.
        default_factory=lambda: int(datetime.now(tz=UTC).timestamp() * 1000),
    )
    expired_at: int | None = Field(default=None)
    episode_id: int | None = Field(default=None)
    confidence: float = Field(default=1.0, ge=0.0, le=1.0)
    source: FactSource

    @field_validator("valid_at")
    @classmethod
    def validate_valid_at(cls, v: int) -> int:
        """Reject negative ``valid_at`` timestamps.

        Raises:
            ValueError: If ``v`` is below zero.
        """
        if v >= 0:
            return v
        msg = "valid_at must be non-negative"
        raise ValueError(msg)
197
+
198
+
199
class Episode(BaseModel):
    """An ingestion event that produced facts.

    Episodes give provenance: each fact can be traced to the commit, file
    change, user annotation, or other event that created it.

    Attributes:
        id: Database primary key.
        source: Kind of ingestion event.
        content: Raw provenance data such as a commit SHA, file path, or
            user input (at most 50000 characters).
        timestamp: When the episode occurred (unix ms UTC); defaults to now.
        metadata: Free-form JSON-style mapping of episode-specific data.
    """

    model_config = {"frozen": False}

    id: int | None = Field(default=None)
    source: EpisodeSource
    content: str | None = Field(default=None, max_length=50000)
    timestamp: int = Field(
        # Current UTC time, truncated to whole milliseconds.
        default_factory=lambda: int(datetime.now(tz=UTC).timestamp() * 1000),
    )
    metadata: dict[str, Any] = Field(default_factory=dict)
222
+
223
+
224
+ # ============================================================================
225
+ # CODE CHUNKS - Vector search payloads
226
+ # ============================================================================
227
+
228
+
229
class Chunk(BaseModel):
    """An immutable code chunk with its embedding vector and metadata.

    A chunk is the atomic unit of retrieval: one coherent piece of code
    (function, class, module section) split into a structural header and a
    body. ``content_hash`` supports deduplication so that identical chunks
    are not embedded again.

    Attributes:
        vector: Embedding vector; the validator accepts exactly 256 or 768
            components.
        path: File path relative to the project root (1-500 characters).
        language: Language of the chunk.
        symbol_name: Function or class name, or ``None`` for module-level
            chunks (at most 200 characters).
        chunk_type: Category of code structure; a ``ChunkType`` or a free
            string, defaulting to "code".
        source_type: Defaults to "code". NOTE(review): presumably separates
            code chunks from documentation chunks - confirm against callers.
        heading_path: Defaults to "". NOTE(review): presumably the heading
            trail of a documentation chunk - confirm against callers.
        heading_level: Defaults to 0. NOTE(review): presumably the heading
            depth of a documentation chunk - confirm against callers.
        content_hash: SHA-256 hex digest of (header + body); exactly 64 hex
            characters, normalised to lowercase.
        header: Structural context such as file path, parent class, imports,
            and decorators (at most 2000 characters).
        body: The code itself (at most 50000 characters).
        model_type: Embedding model that produced ``vector``.
        start_line: First line of the chunk (non-negative).
        end_line: Last line of the chunk (non-negative).
    """

    # Chunks never change after construction.
    model_config = {"frozen": True}

    vector: list[float] = Field(..., min_length=256, max_length=768)
    path: str = Field(..., min_length=1, max_length=500)
    language: Language
    symbol_name: str | None = Field(default=None, max_length=200)
    chunk_type: ChunkType | str = "code"
    source_type: str = "code"
    heading_path: str = ""
    heading_level: int = 0
    content_hash: str = Field(..., min_length=64, max_length=64)  # SHA-256 hex
    header: str = Field(..., max_length=2000)
    body: str = Field(..., max_length=50000)
    model_type: ModelType
    start_line: int = Field(..., ge=0)
    end_line: int = Field(..., ge=0)

    @field_validator("vector")
    @classmethod
    def validate_vector_dimensions(cls, v: list[float]) -> list[float]:
        """Accept only 256- or 768-component vectors.

        Raises:
            ValueError: If the vector has any other length.
        """
        dims = len(v)
        if dims in {256, 768}:
            return v
        msg = f"Vector must be 256d or 768d, got {dims}d"
        raise ValueError(msg)

    @field_validator("content_hash")
    @classmethod
    def validate_content_hash_format(cls, v: str) -> str:
        """Check that the hash is hexadecimal and return it in lowercase.

        Raises:
            ValueError: If any character is not a hex digit.
        """
        lowered = v.lower()
        if any(ch not in "0123456789abcdef" for ch in lowered):
            msg = "content_hash must be valid SHA-256 hex string"
            raise ValueError(msg)
        return lowered
285
+
286
+
287
+ # ============================================================================
288
+ # CONFIGURATION MODELS
289
+ # ============================================================================
290
+
291
+
292
class IndexingConfig(BaseModel):
    """Settings for the code indexing pipeline.

    Groups the knobs for chunk sizing, file discovery, git integration, and
    parallelism. Every field has a default and a validated range.
    """

    model_config = {"frozen": False}

    # --- Chunk sizing ---
    target_chunk_size: int = Field(default=1500, ge=500, le=5000)
    max_chunk_size: int = Field(default=2000, ge=1000, le=10000)
    min_chunk_size: int = Field(default=50, ge=10, le=500)

    # --- File discovery ---
    # Default size limit: 2 MB.
    max_file_size_bytes: int = Field(default=2_097_152, ge=0)
    # Override size limit: 10 MB.
    max_file_size_override_bytes: int = Field(default=10_485_760, ge=0)
    respect_gitignore: bool = True
    respect_contextualignore: bool = True

    # --- Git integration ---
    enable_git_blame: bool = True
    enable_incremental_indexing: bool = True
    max_commits_per_walk: int = Field(default=1000, ge=1)

    # --- Performance ---
    parser_pool_size: int = Field(default=4, ge=1, le=32)
    batch_size_chunks: int = Field(default=100, ge=1, le=1000)
319
+
320
+
321
class EmbeddingConfig(BaseModel):
    """Settings for embedding models.

    Selects the model and sets inference and caching parameters.
    """

    model_config = {"frozen": False}

    # --- Model selection ---
    # Only "jina-v2-code" is accepted.
    code_model: Literal["jina-v2-code"] = "jina-v2-code"

    # --- Inference ---
    batch_size: int = Field(default=64, ge=1, le=256)
    max_seq_length: int = Field(default=512, ge=128, le=8192)

    # --- Caching ---
    cache_size: int = Field(default=2000, ge=0)
    enable_cache: bool = True
339
+
340
+
341
class RetrievalConfig(BaseModel):
    """Settings for the hybrid retrieval pipeline.

    Covers BM25 and dense candidate counts, rank fusion, MMR
    diversification, and context assembly budgets.
    """

    model_config = {"frozen": False}

    # --- Candidate and result counts ---
    bm25_top_k: int = Field(default=100, ge=1, le=500)
    dense_top_k: int = Field(default=100, ge=1, le=500)
    rerank_top_k: int = Field(default=20, ge=1, le=100)
    final_top_k: int = Field(default=10, ge=1, le=50)

    # --- RRF (rank fusion) ---
    rrf_k: int = Field(default=60, ge=1, le=100)
    bm25_weight: float = Field(default=0.6, ge=0.0, le=1.0)
    dense_weight: float = Field(default=0.4, ge=0.0, le=1.0)

    # --- MMR (diversification) ---
    mmr_lambda: float = Field(default=0.7, ge=0.0, le=1.0)
    max_chunks_per_file: int = Field(default=2, ge=1, le=10)
    max_chunks_per_symbol: int = Field(default=1, ge=1, le=5)

    # --- Context assembly ---
    max_context_tokens: int = Field(default=8000, ge=1000, le=200000)
    structural_context_ratio: float = Field(default=0.15, ge=0.0, le=0.5)
368
+
369
+
370
class StorageConfig(BaseModel):
    """Settings for the storage backends.

    Holds on-disk names plus tuning values for SQLite, LanceDB, and tantivy.
    """

    model_config = {"frozen": False}

    # --- Locations (relative to the project root) ---
    contextual_dir: Path = Path(".contextual")
    sqlite_db_name: str = "contextual.db"
    lance_db_name: str = "lance"
    tantivy_index_name: str = "tantivy"

    # --- SQLite tuning ---
    sqlite_cache_size_mb: int = Field(default=64, ge=8, le=1024)
    sqlite_mmap_size_mb: int = Field(default=256, ge=64, le=2048)
    sqlite_wal_autocheckpoint_pages: int = Field(default=4000, ge=1000, le=10000)

    # --- LanceDB tuning ---
    lance_ivf_pq_threshold: int = Field(default=100_000, ge=10000)
    # None means automatic: sqrt(n).
    lance_num_partitions: int | None = None
    lance_num_sub_vectors: int = Field(default=48, ge=8, le=96)
    lance_refine_factor: int = Field(default=20, ge=1, le=100)
394
+
395
+
396
class SecurityConfig(BaseModel):
    """Settings for security hardening.

    Toggles path validation and input sanitisation, and sets the permission
    modes used for workspace isolation.
    """

    model_config = {"frozen": False}

    # --- Path safety ---
    enable_path_traversal_check: bool = True
    enable_symlink_resolution: bool = True
    max_path_depth: int = Field(default=20, ge=5, le=50)

    # --- Sanitisation ---
    strip_unicode_control_chars: bool = True
    escape_fts5_special_chars: bool = True

    # --- Workspace isolation ---
    per_project_workspace: bool = True
    # Permission modes are validated as plain integers within the octal bounds.
    workspace_permissions: int = Field(default=0o700, ge=0o600, le=0o777)
    db_file_permissions: int = Field(default=0o600, ge=0o600, le=0o666)
417
+
418
+
419
class ContextualConfig(BaseModel):
    """Root configuration object for Contextual.

    Bundles the per-subsystem configurations together with the project-level
    setting ``project_root``. Each subsystem defaults to a freshly built
    configuration with its own defaults.

    Attributes:
        project_root: Absolute path of the project being indexed; must be an
            existing directory.
        indexing: Indexing pipeline configuration.
        embedding: Embedding model configuration.
        retrieval: Retrieval pipeline configuration.
        storage: Storage backend configuration.
        security: Security hardening configuration.
    """

    model_config = {"frozen": False}

    project_root: Path
    # The lambdas are kept on purpose: the suppressions below apply to the
    # call expression inside each one.
    indexing: IndexingConfig = Field(default_factory=lambda: IndexingConfig())  # type: ignore[call-arg]  # noqa: PLW0108
    embedding: EmbeddingConfig = Field(default_factory=lambda: EmbeddingConfig())  # type: ignore[call-arg]  # noqa: PLW0108
    retrieval: RetrievalConfig = Field(default_factory=lambda: RetrievalConfig())  # type: ignore[call-arg]  # noqa: PLW0108
    storage: StorageConfig = Field(default_factory=lambda: StorageConfig())  # type: ignore[call-arg]  # noqa: PLW0108
    security: SecurityConfig = Field(default_factory=lambda: SecurityConfig())  # type: ignore[call-arg]  # noqa: PLW0108

    @field_validator("project_root")
    @classmethod
    def validate_project_root_exists(cls, v: Path) -> Path:
        """Require an absolute path that points at an existing directory.

        The checks run in order: absolute, exists, is a directory. The first
        one that fails determines the error message.

        Raises:
            ValueError: If any of the three checks fails.
        """
        if not v.is_absolute():
            msg = "project_root must be an absolute path"
        elif not v.exists():
            msg = f"project_root does not exist: {v}"
        elif not v.is_dir():
            msg = f"project_root is not a directory: {v}"
        else:
            return v
        raise ValueError(msg)
456
+
457
+
458
+ # ============================================================================
459
+ # QUERY & RESULT MODELS - API contracts for search operations
460
+ # ============================================================================
461
+
462
+
463
class SearchQuery(BaseModel):
    """Immutable parameters of a semantic search request.

    Attributes:
        query: Natural-language or code query text (1-1000 characters).
        top_k: How many results to return (1-100, default 10).
        language_filter: Restrict results to one language, if set.
        path_filter: Restrict results to a path prefix, if set.
        include_scores: Whether similarity scores are included.
    """

    model_config = {"frozen": True}

    query: str = Field(..., min_length=1, max_length=1000)
    top_k: int = Field(default=10, ge=1, le=100)
    language_filter: Language | None = Field(default=None)
    path_filter: str | None = Field(default=None)
    include_scores: bool = True
481
+
482
+
483
class SearchResult(BaseModel):
    """One immutable search hit: a chunk with its score and rank.

    Attributes:
        chunk: The matched code chunk.
        score: Similarity score in the range 0.0-1.0; higher is better.
        rank: Position in the result list, starting at 1.
    """

    model_config = {"frozen": True}

    chunk: Chunk
    score: float = Field(..., ge=0.0, le=1.0)
    rank: int = Field(..., ge=1)
497
+
498
+
499
class SearchResponse(BaseModel):
    """Immutable envelope for a finished search: results plus run metadata.

    Attributes:
        query: The original query text.
        results: Search results in ranked order.
        total_candidates: Number of chunks considered (non-negative).
        pipeline_latency_ms: End-to-end latency in milliseconds
            (non-negative).
    """

    model_config = {"frozen": True}

    query: str
    results: list[SearchResult]
    total_candidates: int = Field(..., ge=0)
    pipeline_latency_ms: float = Field(..., ge=0.0)
515
+
516
+
517
class RecallQuery(BaseModel):
    """Immutable parameters of an entity recall (temporal) query.

    Attributes:
        entity_name: Name of the entity to recall (1-500 characters).
        predicate: Restrict facts to one predicate, if set (at most 100
            characters).
        as_of_timestamp: AS-OF time in unix ms UTC, if set.
    """

    model_config = {"frozen": True}

    entity_name: str = Field(..., min_length=1, max_length=500)
    predicate: str | None = Field(default=None, max_length=100)
    as_of_timestamp: int | None = Field(default=None)
531
+
532
+
533
class RecallResult(BaseModel):
    """Immutable answer to a recall query: an entity and its facts.

    Attributes:
        entity: The entity that was queried.
        facts: Facts valid at the query time.
        as_of: The query timestamp, or ``None`` for "current".
    """

    model_config = {"frozen": True}

    entity: Entity
    facts: list[Fact]
    as_of: int | None = Field(default=None)
547
+
548
+
549
class IndexProgress(BaseModel):
    """A progress snapshot emitted while indexing runs.

    All counters are required and must be non-negative.

    Attributes:
        phase: Name of the current indexing phase.
        files_processed: Files handled so far.
        total_files: Files to handle in total.
        chunks_created: Chunks created so far.
        facts_created: Facts created so far.
    """

    model_config = {"frozen": False}

    phase: str
    files_processed: int = Field(..., ge=0)
    total_files: int = Field(..., ge=0)
    chunks_created: int = Field(..., ge=0)
    facts_created: int = Field(..., ge=0)
567
+
568
+
569
class IndexResult(BaseModel):
    """Immutable summary of a completed indexing operation.

    Only ``success`` is required. Counters default to zero and must be
    non-negative.

    Attributes:
        success: Whether indexing finished successfully.
        files_indexed: Files indexed successfully.
        chunks_created: New chunks created.
        chunks_skipped: Chunks skipped because they were duplicates.
        facts_created: New facts created.
        duration_seconds: Total indexing time in seconds.
        error: Error message when indexing did not succeed.
    """

    model_config = {"frozen": True}

    success: bool
    files_indexed: int = Field(default=0, ge=0)
    chunks_created: int = Field(default=0, ge=0)
    chunks_skipped: int = Field(default=0, ge=0)
    facts_created: int = Field(default=0, ge=0)
    duration_seconds: float = Field(default=0.0, ge=0.0)
    error: str | None = Field(default=None)
@@ -0,0 +1,66 @@
1
+ """Docs module — Phase 2: heading-aware document indexing and retrieval.
2
+
3
+ Public API
4
+ ----------
5
+ Chunking:
6
+ DocsChunker Heading-aware markdown/MDX/RST/TXT chunker.
7
+ DocChunk Dataclass representing a single doc chunk.
8
+ chunk_doc_file Convenience: chunk a single file with defaults.
9
+
10
+ Indexing:
11
+ DocsPipeline Full indexing pipeline (file discovery → embed → store).
12
+ DocsIndexStats Stats dataclass returned by pipeline runs.
13
+ index_docs Convenience: index all docs in a repo.
14
+
15
+ Retrieval:
16
+ docs_search Hybrid BM25 + vector search over doc chunks.
17
+ docs_get_section Fetch exact section by heading path (5-tier resolver).
18
+ docs_list_files List all indexed doc files.
19
+ DocSearchHit Search result dataclass.
20
+ DocSection Resolved section dataclass.
21
+ DocFileInfo File metadata dataclass.
22
+
23
+ File Watching:
24
+ DocsFileWatcher Watchdog-based watcher; composable with code watcher.
25
+ """
26
+ from __future__ import annotations
27
+
28
+ from contextual.docs.chunker import (
29
+ DocChunk,
30
+ DocsChunker,
31
+ chunk_doc_file,
32
+ )
33
+ from contextual.docs.pipeline import (
34
+ DocsPipeline,
35
+ DocsIndexStats,
36
+ index_docs,
37
+ )
38
+ from contextual.docs.retrieval import (
39
+ DocSearchHit,
40
+ DocSection,
41
+ DocFileInfo,
42
+ docs_search,
43
+ docs_get_section,
44
+ docs_list_files,
45
+ )
46
+ from contextual.docs.watcher import DocsFileWatcher
47
+
48
# Public names re-exported by this package, grouped by concern.
__all__ = [
    # -- chunking --
    "DocChunk",
    "DocsChunker",
    "chunk_doc_file",
    # -- indexing --
    "DocsPipeline",
    "DocsIndexStats",
    "index_docs",
    # -- retrieval --
    "DocSearchHit",
    "DocSection",
    "DocFileInfo",
    "docs_search",
    "docs_get_section",
    "docs_list_files",
    # -- file watching --
    "DocsFileWatcher",
]