semantixrag 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- semantixrag/__init__.py +14 -0
- semantixrag/__main__.py +6 -0
- semantixrag/api/__init__.py +1 -0
- semantixrag/api/main.py +29 -0
- semantixrag/api/routes/__init__.py +0 -0
- semantixrag/api/routes/admin.py +68 -0
- semantixrag/api/routes/compliance.py +108 -0
- semantixrag/api/routes/ingestion.py +76 -0
- semantixrag/api/routes/observability.py +39 -0
- semantixrag/api/routes/retrieval.py +75 -0
- semantixrag/cdc/__init__.py +5 -0
- semantixrag/cdc/incremental.py +57 -0
- semantixrag/cdc/watcher.py +139 -0
- semantixrag/chunking/__init__.py +5 -0
- semantixrag/chunking/enricher.py +153 -0
- semantixrag/chunking/header_splitter.py +237 -0
- semantixrag/cli.py +309 -0
- semantixrag/compliance/__init__.py +1 -0
- semantixrag/compliance/dsar.py +202 -0
- semantixrag/compliance/masking.py +93 -0
- semantixrag/compliance/pii_scanner.py +165 -0
- semantixrag/config/__init__.py +4 -0
- semantixrag/config/opa/access.rego +45 -0
- semantixrag/config/opa/audit.rego +41 -0
- semantixrag/config/opa/masking.rego +39 -0
- semantixrag/config/settings.py +76 -0
- semantixrag/embeddings/__init__.py +4 -0
- semantixrag/embeddings/embedder.py +143 -0
- semantixrag/extractors/__init__.py +6 -0
- semantixrag/extractors/base.py +35 -0
- semantixrag/extractors/multimodal_extractor.py +237 -0
- semantixrag/extractors/table_extractor.py +170 -0
- semantixrag/extractors/unstructured_extractor.py +175 -0
- semantixrag/indexing/__init__.py +7 -0
- semantixrag/indexing/bulk_indexer.py +142 -0
- semantixrag/indexing/connection.py +96 -0
- semantixrag/indexing/graph_writer.py +192 -0
- semantixrag/indexing/hybrid_search.py +208 -0
- semantixrag/indexing/index_manager.py +163 -0
- semantixrag/knowledge/__init__.py +1 -0
- semantixrag/knowledge/entity_extractor.py +98 -0
- semantixrag/knowledge/ontology.py +100 -0
- semantixrag/models.py +146 -0
- semantixrag/monitoring/__init__.py +4 -0
- semantixrag/monitoring/logger.py +60 -0
- semantixrag/observability/__init__.py +1 -0
- semantixrag/observability/evaluator.py +110 -0
- semantixrag/observability/metrics.py +129 -0
- semantixrag/observability/tracer.py +133 -0
- semantixrag/pipeline.py +318 -0
- semantixrag/resources.py +98 -0
- semantixrag-2.0.0.dist-info/METADATA +433 -0
- semantixrag-2.0.0.dist-info/RECORD +57 -0
- semantixrag-2.0.0.dist-info/WHEEL +5 -0
- semantixrag-2.0.0.dist-info/entry_points.txt +2 -0
- semantixrag-2.0.0.dist-info/licenses/LICENSE +21 -0
- semantixrag-2.0.0.dist-info/top_level.txt +1 -0
semantixrag/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""RAG Ingestion Pipeline Source Package."""
|
|
2
|
+
from .pipeline import IngestionPipeline
|
|
3
|
+
from .models import Chunk, ExtractionResult, ExtractedElement, ElementType, IndexedDocument
|
|
4
|
+
from .monitoring.logger import setup_logging
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"IngestionPipeline",
|
|
8
|
+
"Chunk",
|
|
9
|
+
"ExtractionResult",
|
|
10
|
+
"ExtractedElement",
|
|
11
|
+
"ElementType",
|
|
12
|
+
"IndexedDocument",
|
|
13
|
+
"setup_logging",
|
|
14
|
+
]
|
semantixrag/__main__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""FastAPI application for SemantixRAG Platform."""
|
semantixrag/api/main.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""FastAPI application entry point for SemantixRAG Platform."""
|
|
2
|
+
from fastapi import FastAPI
|
|
3
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
4
|
+
from .routes import ingestion, retrieval, admin, observability, compliance
|
|
5
|
+
|
|
6
|
+
# Application object; the metadata below is handed to FastAPI unchanged.
app = FastAPI(
    title="SemantixRAG Platform",
    description="AI-native RAG ingestion, retrieval, and governance platform",
    version="2.0.0",
)

# NOTE(review): wildcard origins together with allow_credentials=True is a
# combination the FastAPI CORS documentation says must not be used; the
# policy is reproduced as-is here and should be tightened to an explicit
# origin list once the permitted front-ends are known.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount every route module under the shared /v1 prefix, tagged by area.
# Registration order matches the original explicit calls.
for _route_module, _tag in (
    (ingestion, "ingestion"),
    (retrieval, "retrieval"),
    (admin, "admin"),
    (observability, "observability"),
    (compliance, "compliance"),
):
    app.include_router(_route_module.router, prefix="/v1", tags=[_tag])
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@app.get("/health")
async def health_check():
    """Return a fixed liveness payload with the platform version.

    The handler performs no dependency checks; it always reports "ok".
    """
    payload = {"status": "ok", "version": "2.0.0"}
    return payload
|
|
File without changes
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Admin API routes (AdminCopilot)."""
|
|
2
|
+
from fastapi import APIRouter, HTTPException
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
from typing import Optional
|
|
5
|
+
from ...pipeline import IngestionPipeline
|
|
6
|
+
from ...observability.metrics import metrics_collector
|
|
7
|
+
|
|
8
|
+
router = APIRouter()
|
|
9
|
+
pipeline = IngestionPipeline()
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class AdminQueryRequest(BaseModel):
    """Request body accepted by the admin natural-language endpoint."""

    # Free-text command; the handler matches keywords in its lower-cased form.
    query: str
    # Caller identity fields. The handler visible in this module does not
    # read them, so they carry no authorization weight here.
    user_id: str = "anonymous"
    user_role: str = "viewer"
    tenant_id: str = "default"
    # NOTE(review): defaults to True, i.e. destructive actions are
    # pre-confirmed unless the caller opts out - confirm this is intended.
    confirm_destructive: bool = True
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class AdminQueryResponse(BaseModel):
    """Response body returned by the admin natural-language endpoint."""

    # Label for what was done; every branch of the handler keeps the default.
    action_taken: str = "query_executed"
    # Category of the action ("read" by default, "compliance" for PII scans).
    action_type: str = "read"
    # Arbitrary payload for the matched command.
    result: dict = {}
    requires_confirmation: bool = False
    # One-line human-readable description of the outcome.
    summary: str = ""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@router.post("/admin/query", response_model=AdminQueryResponse)
async def admin_query(request: AdminQueryRequest):
    """Answer a natural-language admin command by keyword matching.

    The lower-cased query is tested against a fixed list of keyword rules,
    first match wins: document count, PII scan, health/status,
    schema/ontology. Anything else gets a help-style reply.

    Raises:
        HTTPException: 500 carrying the error text if building a reply fails.
    """
    needle = request.query.lower()

    try:
        if "document" in needle and "count" in needle:
            return AdminQueryResponse(
                summary="Document count query processed",
                result={"message": "Use GET /metrics for document counts"},
            )

        if "pii" in needle and ("scan" in needle or "find" in needle):
            return AdminQueryResponse(
                action_type="compliance",
                summary="PII scan initiated",
                result={"status": "PII scanning enabled by default on ingestion"},
            )

        if "health" in needle or "status" in needle:
            return AdminQueryResponse(
                action_type="read",
                summary="Platform health check",
                result=metrics_collector.snapshot(),
            )

        if "schema" in needle or "ontology" in needle:
            return AdminQueryResponse(
                action_type="read",
                summary="Knowledge graph schema",
                result=pipeline.ontology_manager.to_schema(),
            )

        # No rule matched: echo the query back with the supported commands.
        return AdminQueryResponse(
            summary=f"Processed: {request.query}",
            result={"message": f"Query '{request.query}' received. Use specific commands: 'document count', 'PII scan', 'health', 'schema'"},
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Compliance API routes (GuardRail)."""
|
|
2
|
+
from fastapi import APIRouter, HTTPException, Query
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
from typing import Optional
|
|
5
|
+
import uuid
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from ...compliance.pii_scanner import PIIScanner
|
|
8
|
+
from ...compliance.masking import masking_engine
|
|
9
|
+
from ...compliance.dsar import DSAREngine
|
|
10
|
+
from ...config.settings import settings
|
|
11
|
+
|
|
12
|
+
router = APIRouter()
|
|
13
|
+
pii_scanner = PIIScanner()
|
|
14
|
+
dsar_engine = DSAREngine()
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class PIIscanRequest(BaseModel):
    """Request body for the PII scan endpoint."""

    # Optional document reference; the scan handler in this module only
    # reads `text`.
    document_id: Optional[str] = None
    # Raw text to scan; the handler rejects the request when this is empty.
    text: Optional[str] = None
    tenant_id: str = "default"
    scan_depth: str = "standard"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DSARRequest(BaseModel):
    """Request body for a data-subject access/deletion request."""

    # Identifier of the data subject the request concerns.
    subject_id: str
    # Requested operation; the handler special-cases the value "delete".
    action: str
    tenant_id: str = "default"
    # Who submitted the request.
    requested_by: str
    # Justification; the handler requires it when action is "delete".
    reason: Optional[str] = None
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@router.post("/compliance/pii/scan")
async def scan_pii(request: PIIscanRequest):
    """Scan the supplied text for PII and return the findings.

    Returns:
        A dict with a fresh ``scan_id``, one entry per finding under
        ``pii_findings`` (context truncated to 100 characters), and the
        scanner's ``summary`` of those findings.

    Raises:
        HTTPException: 400 when the request carries no text.
    """
    text = request.text
    if not text:
        raise HTTPException(status_code=400, detail="No text provided for scanning")

    findings = await pii_scanner.scan(text)
    overview = pii_scanner.get_summary(findings)

    scan_id = str(uuid.uuid4())
    serialized = []
    for finding in findings:
        serialized.append(
            {
                "pii_type": finding.pii_type,
                "sensitivity": finding.sensitivity,
                "confidence": finding.confidence,
                "location": {"start": finding.start, "end": finding.end},
                "context": finding.context[:100],
                "recommended_action": finding.recommended_action,
                "masked_text": finding.masked_text,
            }
        )

    return {
        "scan_id": scan_id,
        "pii_findings": serialized,
        "summary": overview,
    }
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@router.post("/compliance/dsar")
async def execute_dsar(request: DSARRequest):
    """Run a data-subject request through the DSAR engine.

    Returns:
        A dict describing the request (id, status, subject, action), the
        affected record figures reported by the engine, and the estimated
        completion value.

    Raises:
        HTTPException: 400 when a "delete" action is submitted without a
            reason.
    """
    missing_reason = request.action == "delete" and not request.reason
    if missing_reason:
        raise HTTPException(
            status_code=400,
            detail="Reason is required for delete actions",
        )

    outcome = await dsar_engine.execute_dsar(
        subject_id=request.subject_id,
        action=request.action,
        tenant_id=request.tenant_id,
        requested_by=request.requested_by,
        reason=request.reason,
    )

    # Assemble the payload key by key, in the original key order.
    response = {
        "dsar_id": outcome.dsar_id,
        "status": outcome.status,
        "subject_id": outcome.subject_id,
        "action": outcome.action,
    }
    response["affected_records"] = {
        "documents": outcome.affected_documents,
        "chunks": outcome.affected_chunks,
        "embeddings": outcome.affected_embeddings,
        "agent_memories": outcome.affected_memories,
    }
    response["estimated_completion"] = outcome.estimated_completion
    return response
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@router.get("/compliance/dsar/{dsar_id}")
async def get_dsar_status(dsar_id: str):
    """Look up a previously submitted DSAR request by its id.

    Returns:
        A dict with the request's id, status, subject, action and the
        affected record figures stored by the engine.

    Raises:
        HTTPException: 404 when the engine has no record for ``dsar_id``.
    """
    record = dsar_engine.get_dsar_status(dsar_id)
    if not record:
        raise HTTPException(status_code=404, detail="DSAR request not found")

    affected = {
        "documents": record.affected_documents,
        "chunks": record.affected_chunks,
        "embeddings": record.affected_embeddings,
        "agent_memories": record.affected_memories,
    }
    return {
        "dsar_id": record.dsar_id,
        "status": record.status,
        "subject_id": record.subject_id,
        "action": record.action,
        "affected_records": affected,
    }
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Ingestion API routes."""
|
|
2
|
+
from fastapi import APIRouter, UploadFile, File, Form, HTTPException
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import uuid
|
|
5
|
+
from datetime import datetime
|
|
6
|
+
from typing import Optional
|
|
7
|
+
from ...pipeline import IngestionPipeline
|
|
8
|
+
from ...config.settings import settings
|
|
9
|
+
|
|
10
|
+
router = APIRouter()
|
|
11
|
+
pipeline = IngestionPipeline()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@router.post("/ingest")
async def ingest_document(
    file: UploadFile = File(...),
    tenant_id: str = Form("default"),
    enrich: bool = Form(True),
    enable_entity_extraction: bool = Form(True),
    enable_pii_scan: bool = Form(True),
    enable_summarization: bool = Form(True),
):
    """Upload a single document, run it through the pipeline, and report.

    The upload is written to a uniquely named file under
    ``<watch_directory>/uploads``, processed, and the file is removed again
    whether or not processing succeeded.

    Args:
        file: The uploaded document.
        tenant_id: Tenant the document is processed for.
        enrich, enable_entity_extraction, enable_pii_scan,
        enable_summarization: Accepted for API compatibility; this handler
            does not forward them to the pipeline.

    Returns:
        A dict with the document id, filename, chunk/entity/PII counts,
        a UTC timestamp and the trace id reported by the pipeline.

    Raises:
        HTTPException: 400 when no usable filename is supplied; 500 when
            processing reports failure or raises.
    """
    if not file.filename:
        raise HTTPException(status_code=400, detail="No file provided")

    # The filename comes from the client and is untrusted: keep only its
    # final component so separators or ".." segments can neither escape the
    # upload directory nor point at a sub-directory that does not exist.
    # Backslashes are normalised first so Windows-style paths are handled
    # on any host OS.
    safe_name = Path(file.filename.replace("\\", "/")).name
    if not safe_name:
        raise HTTPException(status_code=400, detail="No file provided")

    # Save uploaded file temporarily
    temp_dir = Path(settings.watch_directory) / "uploads"
    temp_dir.mkdir(parents=True, exist_ok=True)
    temp_path = temp_dir / f"{uuid.uuid4()}_{safe_name}"

    try:
        content = await file.read()
        temp_path.write_bytes(content)

        document_id = str(uuid.uuid4())
        # NOTE(review): process_document is invoked without await, so it
        # runs on the event loop thread for its whole duration.
        result = pipeline.process_document(
            temp_path,
            document_id=document_id,
            tenant_id=tenant_id,
        )

        if not result.get("success"):
            raise HTTPException(
                status_code=500,
                detail=result.get("error", "Processing failed"),
            )

        return {
            "document_id": result["document_id"],
            "filename": result["filename"],
            "status": "completed",
            "chunks_created": result["chunks_count"],
            "entities_extracted": result.get("entities_extracted", 0),
            "pii_findings": result.get("pii_findings", 0),
            "indexed_at": datetime.utcnow().isoformat(),
            "trace_id": result.get("trace_id", ""),
        }

    except HTTPException:
        # Already a well-formed HTTP error; do not rewrap it as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # Remove the temporary copy; tolerate it never having been written.
        temp_path.unlink(missing_ok=True)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@router.get("/ingest/{document_id}/status")
async def get_ingest_status(document_id: str):
    """Report ingestion status for a document.

    This is a placeholder: it echoes the id and always answers
    "completed" without consulting any store.
    """
    status = {"document_id": document_id}
    status["status"] = "completed"
    status["message"] = "Status tracking requires OpenSearch connection"
    return status
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Observability API routes (Obsidian)."""
|
|
2
|
+
from fastapi import APIRouter, HTTPException, Query
|
|
3
|
+
from typing import Optional, List
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
from ...observability.metrics import metrics_collector
|
|
6
|
+
|
|
7
|
+
router = APIRouter()
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@router.post("/observability/traces")
async def ingest_traces(traces: list[dict]):
    """Accept a batch of telemetry traces and count them.

    The traces themselves are not stored by this handler; only the
    "traces.ingested" counter is advanced by the batch size.
    """
    batch_size = len(traces)
    metrics_collector.increment("traces.ingested", batch_size)
    return {"accepted": batch_size, "failed": 0}
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@router.get("/observability/metrics")
async def get_metrics(
    start_time: Optional[str] = Query(None),
    end_time: Optional[str] = Query(None),
    granularity: str = Query("5m"),
):
    """Return the current metrics snapshot.

    The time-window and granularity parameters are accepted but not applied
    by this handler; the collector's full snapshot is always returned.
    """
    snapshot = metrics_collector.snapshot()
    return snapshot
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@router.get("/observability/evaluation")
async def get_evaluation(
    start_time: Optional[str] = Query(None),
    end_time: Optional[str] = Query(None),
    evaluation_id: Optional[str] = Query(None),
    query_id: Optional[str] = Query(None),
):
    """Return RAG quality evaluation results.

    Placeholder: none of the filters are consulted and the evaluation list
    is always empty.
    """
    response = {"evaluations": []}
    response["message"] = "Evaluation requires LLM client for full functionality"
    return response
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Retrieval API routes."""
|
|
2
|
+
from fastapi import APIRouter, HTTPException
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
from typing import Optional, List
|
|
5
|
+
import uuid
|
|
6
|
+
from ...pipeline import IngestionPipeline
|
|
7
|
+
from ...indexing.hybrid_search import HybridSearch
|
|
8
|
+
from ...indexing.graph_writer import GraphWriter
|
|
9
|
+
from ...config.settings import settings
|
|
10
|
+
|
|
11
|
+
router = APIRouter()
|
|
12
|
+
pipeline = IngestionPipeline()
|
|
13
|
+
hybrid_search = HybridSearch()
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class RetrievalRequest(BaseModel):
    """Request body for the semantic search endpoint.

    Of these fields the query handler in this module reads only ``query``,
    ``tenant_id``, ``vector_top_k`` and ``rerank_top_k``.
    """

    # Search text.
    query: str
    tenant_id: str = "default"
    strategy: str = "hybrid"
    # Number of candidates requested from the search backend.
    vector_top_k: int = 50
    keyword_top_k: int = 50
    graph_hops: int = 2
    rerank: bool = True
    # Number of leading candidates returned to the caller.
    rerank_top_k: int = 5
    filters: Optional[dict] = None
    generation: Optional[dict] = None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class RetrievalResponse(BaseModel):
    """Response body returned by the semantic search endpoint."""

    # Identifier generated per request by the handler.
    query_id: str
    # The query text, echoed back.
    query: str
    # One dict per returned chunk.
    results: list[dict] = []
    # Counts describing the retrieval (candidates found, results returned).
    retrieval_metrics: dict = {}
    generation: Optional[dict] = None
    trace_id: str = ""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@router.post("/query", response_model=RetrievalResponse)
async def query(request: RetrievalRequest):
    """Run a hybrid search and return the leading results.

    ``vector_top_k`` candidates are requested from the search backend and
    the first ``rerank_top_k`` of them are returned, with chunk text cut to
    500 characters.

    Raises:
        HTTPException: 500 carrying the error text if the search or the
            response assembly fails.
    """
    query_id = str(uuid.uuid4())
    trace_id = str(uuid.uuid4())

    try:
        candidates = hybrid_search.search(
            query_text=request.query,
            top_k=request.vector_top_k,
            tenant_id=request.tenant_id,
        )

        # Project each hit onto the public result shape, defaulting any
        # field the backend did not supply.
        results = [
            {
                "chunk_id": hit.get("chunk_id", ""),
                "document_id": hit.get("document_id", ""),
                "document_title": hit.get("document_title", ""),
                "header_path": hit.get("header_path", ""),
                "chunk_text": hit.get("chunk_text", "")[:500],
                "score": hit.get("score", 0.0),
                "entities": hit.get("entities", []),
            }
            for hit in candidates[:request.rerank_top_k]
        ]

        metrics = {
            "total_candidates": len(candidates),
            "returned": len(results),
        }
        return RetrievalResponse(
            query_id=query_id,
            query=request.query,
            results=results,
            retrieval_metrics=metrics,
            trace_id=trace_id,
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Incremental update logic for keeping the vector store synchronized."""
|
|
2
|
+
import logging
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
from ..config.settings import settings
|
|
7
|
+
from ..indexing.bulk_indexer import BulkIndexer
|
|
8
|
+
from ..monitoring.logger import get_logger
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class IncrementalUpdater:
    """Keep the index in step with document updates and deletions.

    Both operations work by removing every indexed chunk that belongs to a
    document id; re-running extraction, chunking and embedding after an
    update is left to the caller.
    """

    def __init__(self, bulk_indexer: Optional[BulkIndexer] = None):
        # Fall back to a default indexer when none (or a falsy one) is given.
        if not bulk_indexer:
            bulk_indexer = BulkIndexer()
        self.bulk_indexer = bulk_indexer

    def before_reindex(self, file_path: Path, document_id: str) -> dict:
        """Clear a document's existing chunks ahead of re-indexing it.

        Args:
            file_path: Path to the document (used for logging only).
            document_id: Unique document identifier.

        Returns:
            Whatever the bulk indexer reports for the deletion.
        """
        logger.info(
            f"Preparing to re-index '{file_path.name}' "
            f"(document_id={document_id})"
        )
        removal = self.bulk_indexer.delete_document_chunks(document_id)
        return removal

    def process_deletion(self, file_path: Path, document_id: str) -> dict:
        """Remove a deleted document's chunks from the index.

        Args:
            file_path: Path to the deleted document (used for logging only).
            document_id: Unique document identifier.

        Returns:
            Whatever the bulk indexer reports for the deletion.
        """
        logger.info(
            f"Processing deletion of '{file_path.name}' "
            f"(document_id={document_id})"
        )
        removal = self.bulk_indexer.delete_document_chunks(document_id)
        logger.info(
            f"Deleted {removal.get('deleted', 0)} chunks for '{document_id}'"
        )
        return removal
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""Directory watcher using watchdog for Change Data Capture on local files."""
|
|
2
|
+
import os
|
|
3
|
+
import logging
|
|
4
|
+
import time
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Callable, Optional
|
|
7
|
+
from watchdog.observers import Observer
|
|
8
|
+
from watchdog.events import FileSystemEventHandler, FileModifiedEvent, FileCreatedEvent
|
|
9
|
+
|
|
10
|
+
from ..config.settings import settings
|
|
11
|
+
|
|
12
|
+
logger = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class DocumentEventHandler(FileSystemEventHandler):
    """Filter and debounce watchdog events, then forward paths to callbacks.

    Directory events and files whose extension is not supported are
    ignored. Created/modified events for the same path that arrive within
    the debounce window of the last accepted one are dropped. Each callback
    receives the affected file as a ``Path``.
    """

    def __init__(
        self,
        on_created: Callable[[Path], None],
        on_modified: Callable[[Path], None],
        on_deleted: Callable[[Path], None],
        supported_extensions: Optional[set[str]] = None,
    ):
        """Store the callbacks and the extension allow-list.

        Args:
            on_created: Called with the path of a newly created file.
            on_modified: Called with the path of a modified file.
            on_deleted: Called with the path of a deleted file.
            supported_extensions: Lower-case suffixes (with leading dot) to
                react to; a default document set is used when omitted.
        """
        super().__init__()
        # Callbacks are kept under private names. Assigning them to
        # self.on_created / on_modified / on_deleted would shadow the
        # handler methods below, so the observer would invoke the raw
        # callbacks with an event object and skip all filtering.
        self._created_callback = on_created
        self._modified_callback = on_modified
        self._deleted_callback = on_deleted
        self.supported_extensions = supported_extensions or {
            ".pdf", ".txt", ".md", ".docx", ".html", ".xml", ".csv"
        }
        # Path string -> timestamp of the last accepted create/modify event.
        self._last_events: dict[str, float] = {}
        self._debounce_seconds = 2.0

    def _relevant_path(self, event) -> Optional[Path]:
        """Return the event's file path, or None if it should be ignored."""
        if event.is_directory:
            return None
        # src_path may be bytes depending on how the watch was scheduled.
        path = Path(os.fsdecode(event.src_path))
        if path.suffix.lower() not in self.supported_extensions:
            return None
        return path

    def on_created(self, event):
        """Handle file creation events."""
        path = self._relevant_path(event)
        if path is None or self._is_debounced(str(path)):
            return
        logger.info(f"File created: {path.name}")
        self._created_callback(path)

    def on_modified(self, event):
        """Handle file modification events."""
        path = self._relevant_path(event)
        if path is None or self._is_debounced(str(path)):
            return
        logger.info(f"File modified: {path.name}")
        self._modified_callback(path)

    def on_deleted(self, event):
        """Handle file deletion events (never debounced)."""
        path = self._relevant_path(event)
        if path is None:
            return
        # The file is gone: drop its debounce record so the table does not
        # grow without bound and a later re-creation is reported.
        self._last_events.pop(str(path), None)
        logger.info(f"File deleted: {path.name}")
        self._deleted_callback(path)

    def _is_debounced(self, path_str: str) -> bool:
        """Return True if an event for this path was accepted too recently.

        When the event is accepted (False is returned) the current time is
        recorded as the path's new reference point.
        """
        now = time.time()
        last = self._last_events.get(path_str, 0)
        if now - last < self._debounce_seconds:
            return True
        self._last_events[path_str] = now
        return False
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class DirectoryWatcher:
    """Observe a directory and relay file changes to caller callbacks.

    A watchdog observer is created on ``start`` and torn down on ``stop``;
    creation, modification and deletion events go through a
    ``DocumentEventHandler``.
    """

    def __init__(
        self,
        watch_dir: Optional[Path] = None,
        on_created: Optional[Callable] = None,
        on_modified: Optional[Callable] = None,
        on_deleted: Optional[Callable] = None,
    ):
        def _ignore(_path):
            # Stand-in used for any callback the caller did not supply.
            return None

        self.watch_dir = Path(watch_dir or settings.watch_directory)
        self.recursive = settings.watch_recursive
        self._observer: Optional[Observer] = None
        self._running = False

        self._on_created = on_created if on_created else _ignore
        self._on_modified = on_modified if on_modified else _ignore
        self._on_deleted = on_deleted if on_deleted else _ignore

    def start(self) -> None:
        """Begin watching; a no-op with a warning if already running."""
        if self._running:
            logger.warning("Watcher is already running")
            return

        # Make sure the target exists before handing it to the observer.
        self.watch_dir.mkdir(parents=True, exist_ok=True)
        logger.info(
            f"Starting directory watcher on '{self.watch_dir}' "
            f"(recursive={self.recursive})"
        )

        handler = DocumentEventHandler(
            on_created=self._on_created,
            on_modified=self._on_modified,
            on_deleted=self._on_deleted,
        )

        self._observer = observer = Observer()
        observer.schedule(handler, str(self.watch_dir), recursive=self.recursive)
        observer.start()
        self._running = True
        logger.info(f"Directory watcher started on '{self.watch_dir}'")

    def stop(self) -> None:
        """Stop watching and wait for the observer thread to finish."""
        if self._observer is None or not self._running:
            return
        self._observer.stop()
        self._observer.join()
        self._running = False
        logger.info("Directory watcher stopped")

    @property
    def is_running(self) -> bool:
        """Whether the watcher has been started and not yet stopped."""
        return self._running
|