semantixrag 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. semantixrag/__init__.py +14 -0
  2. semantixrag/__main__.py +6 -0
  3. semantixrag/api/__init__.py +1 -0
  4. semantixrag/api/main.py +29 -0
  5. semantixrag/api/routes/__init__.py +0 -0
  6. semantixrag/api/routes/admin.py +68 -0
  7. semantixrag/api/routes/compliance.py +108 -0
  8. semantixrag/api/routes/ingestion.py +76 -0
  9. semantixrag/api/routes/observability.py +39 -0
  10. semantixrag/api/routes/retrieval.py +75 -0
  11. semantixrag/cdc/__init__.py +5 -0
  12. semantixrag/cdc/incremental.py +57 -0
  13. semantixrag/cdc/watcher.py +139 -0
  14. semantixrag/chunking/__init__.py +5 -0
  15. semantixrag/chunking/enricher.py +153 -0
  16. semantixrag/chunking/header_splitter.py +237 -0
  17. semantixrag/cli.py +309 -0
  18. semantixrag/compliance/__init__.py +1 -0
  19. semantixrag/compliance/dsar.py +202 -0
  20. semantixrag/compliance/masking.py +93 -0
  21. semantixrag/compliance/pii_scanner.py +165 -0
  22. semantixrag/config/__init__.py +4 -0
  23. semantixrag/config/opa/access.rego +45 -0
  24. semantixrag/config/opa/audit.rego +41 -0
  25. semantixrag/config/opa/masking.rego +39 -0
  26. semantixrag/config/settings.py +76 -0
  27. semantixrag/embeddings/__init__.py +4 -0
  28. semantixrag/embeddings/embedder.py +143 -0
  29. semantixrag/extractors/__init__.py +6 -0
  30. semantixrag/extractors/base.py +35 -0
  31. semantixrag/extractors/multimodal_extractor.py +237 -0
  32. semantixrag/extractors/table_extractor.py +170 -0
  33. semantixrag/extractors/unstructured_extractor.py +175 -0
  34. semantixrag/indexing/__init__.py +7 -0
  35. semantixrag/indexing/bulk_indexer.py +142 -0
  36. semantixrag/indexing/connection.py +96 -0
  37. semantixrag/indexing/graph_writer.py +192 -0
  38. semantixrag/indexing/hybrid_search.py +208 -0
  39. semantixrag/indexing/index_manager.py +163 -0
  40. semantixrag/knowledge/__init__.py +1 -0
  41. semantixrag/knowledge/entity_extractor.py +98 -0
  42. semantixrag/knowledge/ontology.py +100 -0
  43. semantixrag/models.py +146 -0
  44. semantixrag/monitoring/__init__.py +4 -0
  45. semantixrag/monitoring/logger.py +60 -0
  46. semantixrag/observability/__init__.py +1 -0
  47. semantixrag/observability/evaluator.py +110 -0
  48. semantixrag/observability/metrics.py +129 -0
  49. semantixrag/observability/tracer.py +133 -0
  50. semantixrag/pipeline.py +318 -0
  51. semantixrag/resources.py +98 -0
  52. semantixrag-2.0.0.dist-info/METADATA +433 -0
  53. semantixrag-2.0.0.dist-info/RECORD +57 -0
  54. semantixrag-2.0.0.dist-info/WHEEL +5 -0
  55. semantixrag-2.0.0.dist-info/entry_points.txt +2 -0
  56. semantixrag-2.0.0.dist-info/licenses/LICENSE +21 -0
  57. semantixrag-2.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,14 @@
1
+ """RAG Ingestion Pipeline Source Package."""
2
+ from .pipeline import IngestionPipeline
3
+ from .models import Chunk, ExtractionResult, ExtractedElement, ElementType, IndexedDocument
4
+ from .monitoring.logger import setup_logging
5
+
6
+ __all__ = [
7
+ "IngestionPipeline",
8
+ "Chunk",
9
+ "ExtractionResult",
10
+ "ExtractedElement",
11
+ "ElementType",
12
+ "IndexedDocument",
13
+ "setup_logging",
14
+ ]
@@ -0,0 +1,6 @@
1
+ """Allow semantixrag to be executed as a module: python -m semantixrag"""
2
+ import sys
3
+ from .cli import main
4
+
5
if __name__ == "__main__":
    # Run the CLI and hand its return value to the interpreter as the
    # process exit status.
    exit_status = main()
    sys.exit(exit_status)
@@ -0,0 +1 @@
1
+ """FastAPI application for SemantixRAG Platform."""
@@ -0,0 +1,29 @@
1
+ """FastAPI application entry point for SemantixRAG Platform."""
2
+ from fastapi import FastAPI
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from .routes import ingestion, retrieval, admin, observability, compliance
5
+
6
# The ASGI application object; route modules are mounted on it below.
app = FastAPI(
    title="SemantixRAG Platform",
    description="AI-native RAG ingestion, retrieval, and governance platform",
    version="2.0.0",
)

# CORS policy. A wildcard origin must not be combined with credentialed
# requests: that would let any website issue cookie/authorization-bearing
# calls on behalf of a logged-in user. Credentials therefore stay disabled
# while the origin list is "*"; list explicit origins here before turning
# allow_credentials back on.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Every feature router is mounted under the same versioned prefix.
app.include_router(ingestion.router, prefix="/v1", tags=["ingestion"])
app.include_router(retrieval.router, prefix="/v1", tags=["retrieval"])
app.include_router(admin.router, prefix="/v1", tags=["admin"])
app.include_router(observability.router, prefix="/v1", tags=["observability"])
app.include_router(compliance.router, prefix="/v1", tags=["compliance"])
25
+
26
+
27
@app.get("/health")
async def health_check():
    """Return a fixed liveness payload: status flag plus version string."""
    payload = {"status": "ok", "version": "2.0.0"}
    return payload
File without changes
@@ -0,0 +1,68 @@
1
+ """Admin API routes (AdminCopilot)."""
2
+ from fastapi import APIRouter, HTTPException
3
+ from pydantic import BaseModel
4
+ from typing import Optional
5
+ from ...pipeline import IngestionPipeline
6
+ from ...observability.metrics import metrics_collector
7
+
8
# Module-level objects created once at import time and shared by every
# request served through this router.
router = APIRouter()
pipeline = IngestionPipeline()
10
+
11
+
12
class AdminQueryRequest(BaseModel):
    """Request body for the natural-language admin endpoint.

    Only ``query`` is mandatory; every other field falls back to the
    default declared below.
    """

    # Free-text command; the handler matches keywords case-insensitively.
    query: str
    # Caller identity fields, taken as supplied in the request body.
    user_id: str = "anonymous"
    user_role: str = "viewer"
    tenant_id: str = "default"
    # NOTE(review): defaults to True, i.e. destructive actions are
    # pre-confirmed unless the caller opts out -- confirm this is intended.
    confirm_destructive: bool = True
18
+
19
+
20
class AdminQueryResponse(BaseModel):
    """Response body for the natural-language admin endpoint.

    All fields carry defaults, so a handler only sets what differs.
    """

    # What was done and what category of action it was.
    action_taken: str = "query_executed"
    action_type: str = "read"
    # Arbitrary payload produced by the matched command.
    result: dict = {}
    requires_confirmation: bool = False
    # Short human-readable description of the outcome.
    summary: str = ""
26
+
27
+
28
@router.post("/admin/query", response_model=AdminQueryResponse)
async def admin_query(request: AdminQueryRequest):
    """Answer a free-text administration command by keyword matching.

    The lower-cased query is tested against a fixed sequence of keyword
    rules; the first rule that matches decides the response. Queries that
    match nothing get a hint listing the supported commands.

    Raises:
        HTTPException: 500 carrying the error text if building a response
            fails.
    """
    text = request.query.lower()

    try:
        # Rule order matters: the first match wins.
        if "document" in text and "count" in text:
            return AdminQueryResponse(
                summary="Document count query processed",
                result={"message": "Use GET /metrics for document counts"},
            )

        mentions_pii_action = "scan" in text or "find" in text
        if "pii" in text and mentions_pii_action:
            return AdminQueryResponse(
                action_type="compliance",
                summary="PII scan initiated",
                result={"status": "PII scanning enabled by default on ingestion"},
            )

        if "health" in text or "status" in text:
            return AdminQueryResponse(
                action_type="read",
                summary="Platform health check",
                result=metrics_collector.snapshot(),
            )

        if "schema" in text or "ontology" in text:
            return AdminQueryResponse(
                action_type="read",
                summary="Knowledge graph schema",
                result=pipeline.ontology_manager.to_schema(),
            )

        # Nothing matched: echo the original (non-lowercased) query back.
        hint = f"Query '{request.query}' received. Use specific commands: 'document count', 'PII scan', 'health', 'schema'"
        return AdminQueryResponse(
            summary=f"Processed: {request.query}",
            result={"message": hint},
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@@ -0,0 +1,108 @@
1
+ """Compliance API routes (GuardRail)."""
2
+ from fastapi import APIRouter, HTTPException, Query
3
+ from pydantic import BaseModel
4
+ from typing import Optional
5
+ import uuid
6
+ from datetime import datetime
7
+ from ...compliance.pii_scanner import PIIScanner
8
+ from ...compliance.masking import masking_engine
9
+ from ...compliance.dsar import DSAREngine
10
+ from ...config.settings import settings
11
+
12
# Module-level objects created once at import time and reused by the
# compliance handlers below.
router = APIRouter()
pii_scanner = PIIScanner()
dsar_engine = DSAREngine()
15
+
16
+
17
class PIIscanRequest(BaseModel):
    """Request body for the PII scan endpoint."""

    # Optional reference to a stored document.
    document_id: Optional[str] = None
    # Raw text to scan.
    text: Optional[str] = None
    tenant_id: str = "default"
    scan_depth: str = "standard"
22
+
23
+
24
class DSARRequest(BaseModel):
    """Request body for a data-subject access/deletion request."""

    # Identifier of the data subject the request concerns.
    subject_id: str
    # Requested operation, passed through to the DSAR engine as given.
    action: str
    tenant_id: str = "default"
    # Who filed the request (mandatory).
    requested_by: str
    # Justification text; optional at the model level.
    reason: Optional[str] = None
30
+
31
+
32
@router.post("/compliance/pii/scan")
async def scan_pii(request: PIIscanRequest):
    """Scan the supplied text for PII and return the findings.

    Only ``request.text`` is scanned; the other request fields are not
    read by this handler.

    Returns:
        A dict with a fresh ``scan_id``, the serialized ``pii_findings``
        and the scanner's ``summary``.

    Raises:
        HTTPException: 400 when no (or empty) text is supplied.
    """
    text = request.text
    if not text:
        raise HTTPException(status_code=400, detail="No text provided for scanning")

    findings = await pii_scanner.scan(text)
    summary = pii_scanner.get_summary(findings)
    scan_id = str(uuid.uuid4())

    serialized = []
    for finding in findings:
        serialized.append(
            {
                "pii_type": finding.pii_type,
                "sensitivity": finding.sensitivity,
                "confidence": finding.confidence,
                "location": {"start": finding.start, "end": finding.end},
                # Context is capped at 100 characters in the response.
                "context": finding.context[:100],
                "recommended_action": finding.recommended_action,
                "masked_text": finding.masked_text,
            }
        )

    return {
        "scan_id": scan_id,
        "pii_findings": serialized,
        "summary": summary,
    }
57
+
58
+
59
@router.post("/compliance/dsar")
async def execute_dsar(request: DSARRequest):
    """Run a data-subject request through the DSAR engine.

    Returns:
        A dict describing the request (id, status, subject, action), the
        counts of affected records, and the estimated completion value.

    Raises:
        HTTPException: 400 when a ``delete`` action has no reason.
    """
    # Deletions must be justified; other actions may omit the reason.
    is_delete = request.action == "delete"
    if is_delete and not request.reason:
        raise HTTPException(
            status_code=400,
            detail="Reason is required for delete actions",
        )

    outcome = await dsar_engine.execute_dsar(
        subject_id=request.subject_id,
        action=request.action,
        tenant_id=request.tenant_id,
        requested_by=request.requested_by,
        reason=request.reason,
    )

    response = {
        "dsar_id": outcome.dsar_id,
        "status": outcome.status,
        "subject_id": outcome.subject_id,
        "action": outcome.action,
    }
    response["affected_records"] = {
        "documents": outcome.affected_documents,
        "chunks": outcome.affected_chunks,
        "embeddings": outcome.affected_embeddings,
        "agent_memories": outcome.affected_memories,
    }
    response["estimated_completion"] = outcome.estimated_completion
    return response
89
+
90
+
91
@router.get("/compliance/dsar/{dsar_id}")
async def get_dsar_status(dsar_id: str):
    """Look up a previously submitted DSAR request by id.

    Returns:
        A dict with the request id, status, subject, action and the
        counts of affected records.

    Raises:
        HTTPException: 404 when the engine returns nothing for the id.
    """
    record = dsar_engine.get_dsar_status(dsar_id)
    if not record:
        raise HTTPException(status_code=404, detail="DSAR request not found")

    response = {
        "dsar_id": record.dsar_id,
        "status": record.status,
        "subject_id": record.subject_id,
        "action": record.action,
    }
    response["affected_records"] = {
        "documents": record.affected_documents,
        "chunks": record.affected_chunks,
        "embeddings": record.affected_embeddings,
        "agent_memories": record.affected_memories,
    }
    return response
@@ -0,0 +1,76 @@
1
+ """Ingestion API routes."""
2
+ from fastapi import APIRouter, UploadFile, File, Form, HTTPException
3
+ from pathlib import Path
4
+ import uuid
5
+ from datetime import datetime
6
+ from typing import Optional
7
+ from ...pipeline import IngestionPipeline
8
+ from ...config.settings import settings
9
+
10
# Module-level objects created once at import time and shared by the
# ingestion handlers below.
router = APIRouter()
pipeline = IngestionPipeline()
12
+
13
+
14
@router.post("/ingest")
async def ingest_document(
    file: UploadFile = File(...),
    tenant_id: str = Form("default"),
    enrich: bool = Form(True),
    enable_entity_extraction: bool = Form(True),
    enable_pii_scan: bool = Form(True),
    enable_summarization: bool = Form(True),
):
    """Upload a single document and run it through the ingestion pipeline.

    The upload is written to a uniquely named temporary file under
    ``<watch_directory>/uploads``, processed, and the temporary file is
    removed again whether or not processing succeeded.

    Note: ``enrich`` and the ``enable_*`` flags are accepted as form fields
    but are not forwarded to the pipeline by this handler.

    Returns:
        A dict summarising the ingestion (ids, chunk/entity/PII counts,
        timestamp and trace id).

    Raises:
        HTTPException: 400 when the upload has no filename; 500 when the
            pipeline reports failure or any other error occurs.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    # The filename is client-controlled. Keep only its final path component
    # (treating backslashes as separators too) so that names containing
    # directories or ".." segments cannot redirect or break the temp write.
    safe_name = Path(file.filename.replace("\\", "/")).name

    # Save uploaded file temporarily
    temp_dir = Path(settings.watch_directory) / "uploads"
    temp_dir.mkdir(parents=True, exist_ok=True)
    temp_path = temp_dir / f"{uuid.uuid4()}_{safe_name}"

    try:
        content = await file.read()
        temp_path.write_bytes(content)

        document_id = str(uuid.uuid4())
        result = pipeline.process_document(
            temp_path,
            document_id=document_id,
            tenant_id=tenant_id,
        )

        if not result.get("success"):
            raise HTTPException(
                status_code=500,
                detail=result.get("error", "Processing failed"),
            )

        return {
            "document_id": result["document_id"],
            "filename": result["filename"],
            "status": "completed",
            "chunks_created": result["chunks_count"],
            "entities_extracted": result.get("entities_extracted", 0),
            "pii_findings": result.get("pii_findings", 0),
            "indexed_at": datetime.utcnow().isoformat(),
            "trace_id": result.get("trace_id", ""),
        }

    except HTTPException:
        # Already a well-formed HTTP error; do not re-wrap it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Single race-free call instead of exists() followed by unlink().
        temp_path.unlink(missing_ok=True)
67
+
68
+
69
@router.get("/ingest/{document_id}/status")
async def get_ingest_status(document_id: str):
    """Return a placeholder ingestion status for ``document_id``.

    No lookup is performed: the handler echoes the id back with a fixed
    status and message.
    """
    status_payload = {"document_id": document_id}
    status_payload["status"] = "completed"
    status_payload["message"] = "Status tracking requires OpenSearch connection"
    return status_payload
@@ -0,0 +1,39 @@
1
+ """Observability API routes (Obsidian)."""
2
+ from fastapi import APIRouter, HTTPException, Query
3
+ from typing import Optional, List
4
+ from datetime import datetime
5
+ from ...observability.metrics import metrics_collector
6
+
7
# Router that collects the observability endpoints defined in this module.
router = APIRouter()
8
+
9
+
10
@router.post("/observability/traces")
async def ingest_traces(traces: list[dict]):
    """Accept a batch of telemetry traces and count them.

    The traces themselves are not inspected; only their number is added
    to the ``traces.ingested`` counter.

    Returns:
        A dict with the number of accepted traces and a failure count
        that is always 0.
    """
    batch_size = len(traces)
    metrics_collector.increment("traces.ingested", batch_size)
    return {"accepted": batch_size, "failed": 0}
16
+
17
+
18
@router.get("/observability/metrics")
async def get_metrics(
    start_time: Optional[str] = Query(None),
    end_time: Optional[str] = Query(None),
    granularity: str = Query("5m"),
):
    """Return the current metrics snapshot.

    The time-range and granularity query parameters are accepted but are
    not applied by this handler; the full snapshot is always returned.
    """
    snapshot = metrics_collector.snapshot()
    return snapshot
26
+
27
+
28
@router.get("/observability/evaluation")
async def get_evaluation(
    start_time: Optional[str] = Query(None),
    end_time: Optional[str] = Query(None),
    evaluation_id: Optional[str] = Query(None),
    query_id: Optional[str] = Query(None),
):
    """Return a placeholder evaluation result.

    None of the query parameters are consulted; the response is always an
    empty evaluation list plus an explanatory message.
    """
    placeholder = {"evaluations": []}
    placeholder["message"] = "Evaluation requires LLM client for full functionality"
    return placeholder
@@ -0,0 +1,75 @@
1
+ """Retrieval API routes."""
2
+ from fastapi import APIRouter, HTTPException
3
+ from pydantic import BaseModel
4
+ from typing import Optional, List
5
+ import uuid
6
+ from ...pipeline import IngestionPipeline
7
+ from ...indexing.hybrid_search import HybridSearch
8
+ from ...indexing.graph_writer import GraphWriter
9
+ from ...config.settings import settings
10
+
11
# Module-level objects created once at import time and shared by the
# retrieval handlers below.
router = APIRouter()
pipeline = IngestionPipeline()
hybrid_search = HybridSearch()
14
+
15
+
16
class RetrievalRequest(BaseModel):
    """Request body for the semantic search endpoint.

    Only ``query`` is mandatory; the remaining fields fall back to the
    defaults declared below.
    """

    # The search text.
    query: str
    tenant_id: str = "default"
    strategy: str = "hybrid"
    # Candidate counts requested from each retrieval leg.
    vector_top_k: int = 50
    keyword_top_k: int = 50
    graph_hops: int = 2
    # Re-ranking switch and the number of results kept afterwards.
    rerank: bool = True
    rerank_top_k: int = 5
    # Free-form optional sections.
    filters: Optional[dict] = None
    generation: Optional[dict] = None
27
+
28
+
29
class RetrievalResponse(BaseModel):
    """Response body for the semantic search endpoint."""

    # Identifier generated for this query, and the query text echoed back.
    query_id: str
    query: str
    # One dict per returned chunk.
    results: list[dict] = []
    # Counters describing the retrieval run.
    retrieval_metrics: dict = {}
    generation: Optional[dict] = None
    trace_id: str = ""
36
+
37
+
38
@router.post("/query", response_model=RetrievalResponse)
async def query(request: RetrievalRequest):
    """Execute a search and return the top-ranked chunks.

    The search is run with ``request.query``, ``request.vector_top_k`` and
    ``request.tenant_id``; the first ``request.rerank_top_k`` hits are
    returned with their text truncated to 500 characters. The other request
    fields are not read by this handler.

    Returns:
        RetrievalResponse with fresh query/trace ids, the shaped results and
        candidate/returned counts.

    Raises:
        HTTPException: 500 carrying the error text if the search or the
            response shaping fails.
    """
    query_id = str(uuid.uuid4())
    trace_id = str(uuid.uuid4())

    try:
        search_results = hybrid_search.search(
            query_text=request.query,
            top_k=request.vector_top_k,
            tenant_id=request.tenant_id,
        )

        # A negative limit would slice from the end of the list and return
        # almost everything; treat it as "return nothing" instead.
        limit = max(request.rerank_top_k, 0)

        results = [
            {
                "chunk_id": hit.get("chunk_id", ""),
                "document_id": hit.get("document_id", ""),
                "document_title": hit.get("document_title", ""),
                "header_path": hit.get("header_path", ""),
                # `or ""` also covers a key that is present but None.
                "chunk_text": (hit.get("chunk_text") or "")[:500],
                "score": hit.get("score", 0.0),
                "entities": hit.get("entities", []),
            }
            for hit in search_results[:limit]
        ]

        return RetrievalResponse(
            query_id=query_id,
            query=request.query,
            results=results,
            retrieval_metrics={
                "total_candidates": len(search_results),
                "returned": len(results),
            },
            trace_id=trace_id,
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
@@ -0,0 +1,5 @@
1
+ """CDC and file-watching package."""
2
+ from .watcher import DirectoryWatcher
3
+ from .incremental import IncrementalUpdater
4
+
5
+ __all__ = ["DirectoryWatcher", "IncrementalUpdater"]
@@ -0,0 +1,57 @@
1
+ """Incremental update logic for keeping the vector store synchronized."""
2
+ import logging
3
+ from pathlib import Path
4
+ from typing import Optional
5
+
6
+ from ..config.settings import settings
7
+ from ..indexing.bulk_indexer import BulkIndexer
8
+ from ..monitoring.logger import get_logger
9
+
10
# Module-scoped logger, named after this module's import path.
logger = logging.getLogger(__name__)
11
+
12
+
13
class IncrementalUpdater:
    """Keeps the index in step with document updates and deletions.

    Both operations remove every chunk stored for a ``document_id`` via
    the bulk indexer; re-running extraction/chunking/embedding after an
    update is left to the caller.
    """

    def __init__(self, bulk_indexer: Optional[BulkIndexer] = None):
        # Fall back to a freshly constructed indexer when none is injected.
        if not bulk_indexer:
            bulk_indexer = BulkIndexer()
        self.bulk_indexer = bulk_indexer

    def before_reindex(self, file_path: Path, document_id: str) -> dict:
        """Clear a document's existing chunks ahead of re-indexing.

        Args:
            file_path: Path to the document (used for logging only).
            document_id: Unique document identifier.

        Returns:
            Whatever the bulk indexer reports for the deletion.
        """
        message = (
            f"Preparing to re-index '{file_path.name}' "
            f"(document_id={document_id})"
        )
        logger.info(message)
        return self.bulk_indexer.delete_document_chunks(document_id)

    def process_deletion(self, file_path: Path, document_id: str) -> dict:
        """Remove all chunks of a deleted file from the index.

        Args:
            file_path: Path to the deleted document (used for logging only).
            document_id: Unique document identifier.

        Returns:
            Whatever the bulk indexer reports for the deletion.
        """
        logger.info(
            f"Processing deletion of '{file_path.name}' "
            f"(document_id={document_id})"
        )
        outcome = self.bulk_indexer.delete_document_chunks(document_id)
        # A missing 'deleted' key is reported as zero chunks.
        removed = outcome.get('deleted', 0)
        logger.info(f"Deleted {removed} chunks for '{document_id}'")
        return outcome
@@ -0,0 +1,139 @@
1
+ """Directory watcher using watchdog for Change Data Capture on local files."""
2
+ import os
3
+ import logging
4
+ import time
5
+ from pathlib import Path
6
+ from typing import Callable, Optional
7
+ from watchdog.observers import Observer
8
+ from watchdog.events import FileSystemEventHandler, FileModifiedEvent, FileCreatedEvent
9
+
10
+ from ..config.settings import settings
11
+
12
# Module-scoped logger, named after this module's import path.
logger = logging.getLogger(__name__)
13
+
14
+
15
class DocumentEventHandler(FileSystemEventHandler):
    """Filters file system events and forwards them to user callbacks.

    Directory events and files whose extension is not supported are
    ignored. Creation and modification events for the same path are
    debounced so that a burst of events triggers a single callback.

    Args:
        on_created: Called with the file's ``Path`` when a file is created.
        on_modified: Called with the file's ``Path`` when a file changes.
        on_deleted: Called with the file's ``Path`` when a file is removed.
        supported_extensions: Lower-case suffixes (with leading dot) to
            react to; a built-in document set is used when omitted or empty.
    """

    def __init__(
        self,
        on_created: Callable[[Path], None],
        on_modified: Callable[[Path], None],
        on_deleted: Callable[[Path], None],
        supported_extensions: Optional[set[str]] = None,
    ):
        super().__init__()
        # The callbacks are stored under private names on purpose. Binding
        # them to self.on_created / on_modified / on_deleted would shadow
        # the handler methods below, so the observer would invoke the raw
        # callbacks with an event object and skip all filtering.
        self._created_callback = on_created
        self._modified_callback = on_modified
        self._deleted_callback = on_deleted
        self.supported_extensions = supported_extensions or {
            ".pdf", ".txt", ".md", ".docx", ".html", ".xml", ".csv"
        }
        # Path string -> monotonic timestamp of the last accepted event.
        self._last_events: dict[str, float] = {}
        self._debounce_seconds = 2.0

    def on_created(self, event):
        """Handle file creation events."""
        path = self._relevant_path(event)
        if path is None or self._is_debounced(str(path)):
            return
        logger.info(f"File created: {path.name}")
        self._created_callback(path)

    def on_modified(self, event):
        """Handle file modification events."""
        path = self._relevant_path(event)
        if path is None or self._is_debounced(str(path)):
            return
        logger.info(f"File modified: {path.name}")
        self._modified_callback(path)

    def on_deleted(self, event):
        """Handle file deletion events (never debounced)."""
        path = self._relevant_path(event)
        if path is None:
            return
        # The file is gone, so its debounce entry is no longer needed;
        # dropping it keeps the table from growing without bound.
        self._last_events.pop(str(path), None)
        logger.info(f"File deleted: {path.name}")
        self._deleted_callback(path)

    def _relevant_path(self, event) -> Optional[Path]:
        """Return the event's path, or None if the event should be ignored."""
        if event.is_directory:
            return None
        path = Path(event.src_path)
        if path.suffix.lower() not in self.supported_extensions:
            return None
        return path

    def _is_debounced(self, path_str: str) -> bool:
        """Debounce rapid-fire file system events.

        Returns True when an event for the same path was accepted less than
        the debounce interval ago; otherwise records the event and returns
        False.
        """
        # Monotonic clock: wall-clock adjustments must not affect intervals.
        now = time.monotonic()
        last = self._last_events.get(path_str)
        if last is not None and now - last < self._debounce_seconds:
            return True
        self._last_events[path_str] = now
        return False
76
+
77
+
78
class DirectoryWatcher:
    """Observes a directory and relays file changes to callbacks.

    A watchdog observer is created on ``start()`` and torn down on
    ``stop()``; creation, modification and deletion events are passed to
    the callbacks supplied at construction time.
    """

    def __init__(
        self,
        watch_dir: Optional[Path] = None,
        on_created: Optional[Callable] = None,
        on_modified: Optional[Callable] = None,
        on_deleted: Optional[Callable] = None,
    ):
        # Explicit directory wins; otherwise use the configured one.
        chosen_dir = watch_dir or settings.watch_directory
        self.watch_dir = Path(chosen_dir)
        self.recursive = settings.watch_recursive
        self._observer: Optional[Observer] = None
        self._running = False

        # Missing callbacks are replaced by do-nothing stand-ins.
        self._on_created = on_created or (lambda p: None)
        self._on_modified = on_modified or (lambda p: None)
        self._on_deleted = on_deleted or (lambda p: None)

    def start(self) -> None:
        """Begin watching; a second call while running only logs a warning."""
        if self._running:
            logger.warning("Watcher is already running")
            return

        # Make sure the target exists before handing it to the observer.
        self.watch_dir.mkdir(parents=True, exist_ok=True)
        logger.info(
            f"Starting directory watcher on '{self.watch_dir}' "
            f"(recursive={self.recursive})"
        )

        handler = DocumentEventHandler(
            on_created=self._on_created,
            on_modified=self._on_modified,
            on_deleted=self._on_deleted,
        )

        observer = Observer()
        self._observer = observer
        observer.schedule(handler, str(self.watch_dir), recursive=self.recursive)
        observer.start()
        self._running = True
        logger.info(f"Directory watcher started on '{self.watch_dir}'")

    def stop(self) -> None:
        """Stop watching; does nothing if no observer is running."""
        if self._observer is None or not self._running:
            return
        self._observer.stop()
        # Wait for the observer to finish before reporting it as stopped.
        self._observer.join()
        self._running = False
        logger.info("Directory watcher stopped")

    @property
    def is_running(self) -> bool:
        """True between a successful ``start()`` and the next ``stop()``."""
        return self._running
@@ -0,0 +1,5 @@
1
+ """Chunking and enrichment package."""
2
+ from .header_splitter import HeaderAwareSplitter
3
+ from .enricher import ContextualEnricher
4
+
5
+ __all__ = ["HeaderAwareSplitter", "ContextualEnricher"]