daytashield-0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- daytashield/__init__.py +55 -0
- daytashield/cli/__init__.py +5 -0
- daytashield/cli/main.py +541 -0
- daytashield/core/__init__.py +15 -0
- daytashield/core/audit.py +275 -0
- daytashield/core/pipeline.py +240 -0
- daytashield/core/result.py +185 -0
- daytashield/core/router.py +217 -0
- daytashield/integrations/__init__.py +7 -0
- daytashield/integrations/langchain.py +391 -0
- daytashield/processors/__init__.py +13 -0
- daytashield/processors/base.py +182 -0
- daytashield/processors/csv.py +269 -0
- daytashield/processors/json.py +260 -0
- daytashield/processors/pdf.py +232 -0
- daytashield/rules/__init__.py +14 -0
- daytashield/rules/base.py +67 -0
- daytashield/rules/gdpr.py +348 -0
- daytashield/rules/hipaa.py +229 -0
- daytashield/rules/pii.py +208 -0
- daytashield/validators/__init__.py +15 -0
- daytashield/validators/base.py +103 -0
- daytashield/validators/compliance.py +222 -0
- daytashield/validators/freshness.py +337 -0
- daytashield/validators/schema.py +176 -0
- daytashield/validators/semantic.py +256 -0
- daytashield-0.1.1.dist-info/METADATA +316 -0
- daytashield-0.1.1.dist-info/RECORD +31 -0
- daytashield-0.1.1.dist-info/WHEEL +4 -0
- daytashield-0.1.1.dist-info/entry_points.txt +2 -0
- daytashield-0.1.1.dist-info/licenses/LICENSE +190 -0

daytashield/core/router.py
@@ -0,0 +1,217 @@
"""Data routing based on validation results."""

from __future__ import annotations

from enum import Enum
from typing import Any, Callable

from pydantic import BaseModel, Field

from daytashield.core.result import ValidationResult, ValidationStatus


class RouteAction(str, Enum):
    """Actions that can be taken for routed data."""

    PASS = "pass"  # Send to destination
    REVIEW = "review"  # Send to review queue
    QUARANTINE = "quarantine"  # Isolate failed data
    RETRY = "retry"  # Attempt reprocessing
    DROP = "drop"  # Discard the data


class Route(BaseModel):
    """A routing rule with condition and action."""

    name: str = Field(..., description="Route name for identification")
    action: RouteAction = Field(..., description="Action to take")
    condition: Callable[[ValidationResult], bool] | None = Field(
        None, description="Custom condition function"
    )
    destination: str | None = Field(None, description="Destination identifier")
    metadata: dict[str, Any] = Field(default_factory=dict, description="Additional route metadata")

    model_config = {"arbitrary_types_allowed": True}


class RoutingDecision(BaseModel):
    """The result of routing a validation result."""

    route: Route = Field(..., description="The matched route")
    result: ValidationResult = Field(..., description="The validation result")
    reason: str = Field(..., description="Why this route was selected")

    model_config = {"arbitrary_types_allowed": True}


class RouterConfig(BaseModel):
    """Configuration for the data router."""

    default_action: RouteAction = Field(
        RouteAction.QUARANTINE, description="Default action when no route matches"
    )
    include_warnings_in_review: bool = Field(
        True, description="Route warnings to review queue"
    )


class DataRouter:
    """Routes data based on validation results.

    The router examines validation results and determines where data should
    go: pass to destination, send to review, quarantine, etc.

    Default routing logic:
    - PASSED → PASS (send to destination)
    - WARNING → REVIEW (send to review queue) if configured
    - FAILED → QUARANTINE (isolate for investigation)
    - ERROR → QUARANTINE

    Example:
        >>> router = DataRouter()
        >>> result = pipeline.validate(data)
        >>> decision = router.route(result)
        >>> if decision.route.action == RouteAction.PASS:
        ...     send_to_destination(result.data)
        >>> elif decision.route.action == RouteAction.QUARANTINE:
        ...     quarantine(result.data, decision.reason)
    """

    def __init__(
        self,
        routes: list[Route] | None = None,
        config: RouterConfig | dict[str, Any] | None = None,
    ):
        """Initialize the router.

        Args:
            routes: Custom routes (checked before defaults)
            config: Router configuration
        """
        self.custom_routes: list[Route] = routes or []

        if config is None:
            self.config = RouterConfig()
        elif isinstance(config, dict):
            self.config = RouterConfig(**config)
        else:
            self.config = config

        # Set up default routes
        self._default_routes = self._create_default_routes()

    def _create_default_routes(self) -> list[Route]:
        """Create the default routing rules."""
        routes = [
            Route(
                name="pass_valid",
                action=RouteAction.PASS,
                condition=lambda r: r.status == ValidationStatus.PASSED,
            ),
            Route(
                name="quarantine_failed",
                action=RouteAction.QUARANTINE,
                condition=lambda r: r.status == ValidationStatus.FAILED,
            ),
            Route(
                name="quarantine_error",
                action=RouteAction.QUARANTINE,
                condition=lambda r: r.status == ValidationStatus.ERROR,
            ),
        ]

        # Add warning route based on config
        if self.config.include_warnings_in_review:
            routes.insert(
                1,
                Route(
                    name="review_warnings",
                    action=RouteAction.REVIEW,
                    condition=lambda r: r.status == ValidationStatus.WARNING,
                ),
            )
        else:
            routes.insert(
                1,
                Route(
                    name="pass_warnings",
                    action=RouteAction.PASS,
                    condition=lambda r: r.status == ValidationStatus.WARNING,
                ),
            )

        return routes

    def add_route(self, route: Route) -> DataRouter:
        """Add a custom route.

        Custom routes are checked before default routes.

        Args:
            route: The route to add

        Returns:
            Self for method chaining
        """
        self.custom_routes.append(route)
        return self

    def route(self, result: ValidationResult) -> RoutingDecision:
        """Determine where to route the data.

        Args:
            result: The validation result to route

        Returns:
            RoutingDecision with the selected route
        """
        # Check custom routes first
        for route in self.custom_routes:
            if route.condition and route.condition(result):
                return RoutingDecision(
                    route=route,
                    result=result,
                    reason=f"Matched custom route: {route.name}",
                )

        # Check default routes
        for route in self._default_routes:
            if route.condition and route.condition(result):
                return RoutingDecision(
                    route=route,
                    result=result,
                    reason=f"Matched default route: {route.name}",
                )

        # Fallback to default action
        fallback_route = Route(
            name="fallback",
            action=self.config.default_action,
        )
        return RoutingDecision(
            route=fallback_route,
            result=result,
            reason=f"No route matched, using default action: {self.config.default_action.value}",
        )

    def route_batch(self, results: list[ValidationResult]) -> dict[RouteAction, list[RoutingDecision]]:
        """Route multiple results and group by action.

        Args:
            results: List of validation results

        Returns:
            Dict mapping actions to lists of routing decisions
        """
        grouped: dict[RouteAction, list[RoutingDecision]] = {
            action: [] for action in RouteAction
        }

        for result in results:
            decision = self.route(result)
            grouped[decision.route.action].append(decision)

        return grouped

    def __repr__(self) -> str:
        return f"DataRouter(custom_routes={len(self.custom_routes)}, config={self.config})"
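
For orientation, a minimal usage sketch of the router API above. Only DataRouter, Route, RouteAction, and route_batch come from the hunk; the `results` list is assumed to be ValidationResult objects produced elsewhere (e.g. by a ValidationPipeline), and the "synthetic" metadata key is a made-up illustration.

# Hypothetical sketch, not taken from the package's documentation.
from daytashield.core.router import DataRouter, Route, RouteAction

# Custom routes are checked before the defaults; the condition receives the ValidationResult.
drop_synthetic = Route(
    name="drop_synthetic",
    action=RouteAction.DROP,
    condition=lambda r: r.metadata.get("synthetic", False),  # assumes result.metadata is a dict
)

router = DataRouter(
    routes=[drop_synthetic],
    config={"include_warnings_in_review": True},  # dict config is coerced into RouterConfig
)

# `results` is assumed to come from a validation pipeline.
grouped = router.route_batch(results)  # dict keyed by every RouteAction
for decision in grouped[RouteAction.QUARANTINE]:
    print(decision.route.name, "->", decision.reason)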

daytashield/integrations/langchain.py
@@ -0,0 +1,391 @@
"""LangChain integration for validated retrieval."""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, Callable, Literal

from pydantic import Field

from daytashield.core.pipeline import ValidationPipeline
from daytashield.core.result import ValidationResult, ValidationStatus, create_result

if TYPE_CHECKING:
    from langchain_core.callbacks import CallbackManagerForRetrieverRun
    from langchain_core.documents import Document
    from langchain_core.retrievers import BaseRetriever

logger = logging.getLogger(__name__)


OnFailAction = Literal["filter", "raise", "warn", "tag"]


class ValidatedRetriever:
    """A LangChain retriever wrapper that validates documents before returning.

    Wraps any LangChain retriever and applies DaytaShield validation to
    retrieved documents. Invalid documents can be filtered, flagged, or
    cause exceptions based on configuration.

    Example:
        >>> from langchain_community.vectorstores import FAISS
        >>> from daytashield import SchemaValidator, FreshnessValidator
        >>> from daytashield.integrations.langchain import ValidatedRetriever
        >>>
        >>> # Create base retriever
        >>> vectorstore = FAISS.from_texts(texts, embeddings)
        >>> base_retriever = vectorstore.as_retriever()
        >>>
        >>> # Wrap with validation
        >>> retriever = ValidatedRetriever(
        ...     base_retriever=base_retriever,
        ...     validators=[
        ...         SchemaValidator(schema={"type": "object"}),
        ...         FreshnessValidator(max_age="7d"),
        ...     ],
        ...     on_fail="filter",
        ... )
        >>>
        >>> # Use like any retriever
        >>> docs = retriever.invoke("search query")

    Actions on validation failure:
    - "filter": Remove invalid documents from results
    - "raise": Raise ValidationError for any invalid document
    - "warn": Log warning but include document
    - "tag": Add validation metadata to document
    """

    def __init__(
        self,
        base_retriever: BaseRetriever,
        validators: list[Any] | None = None,
        pipeline: ValidationPipeline | None = None,
        on_fail: OnFailAction = "filter",
        validate_metadata: bool = True,
        validate_content: bool = True,
        min_confidence: float = 0.0,
    ):
        """Initialize the validated retriever.

        Args:
            base_retriever: The underlying LangChain retriever
            validators: List of validators to apply (creates pipeline)
            pipeline: Pre-configured ValidationPipeline (alternative to validators)
            on_fail: Action when validation fails
            validate_metadata: Validate document metadata
            validate_content: Validate document content
            min_confidence: Minimum confidence score to pass (for semantic validation)
        """
        self.base_retriever = base_retriever
        self.on_fail = on_fail
        self.validate_metadata = validate_metadata
        self.validate_content = validate_content
        self.min_confidence = min_confidence

        # Set up validation pipeline
        if pipeline is not None:
            self.pipeline = pipeline
        elif validators:
            self.pipeline = ValidationPipeline(validators=validators)
        else:
            self.pipeline = ValidationPipeline()

        # Statistics
        self._stats = {
            "total_retrieved": 0,
            "total_validated": 0,
            "total_passed": 0,
            "total_filtered": 0,
        }

    def invoke(
        self,
        input: str,  # noqa: A002
        config: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> list[Document]:
        """Retrieve and validate documents.

        Args:
            input: The search query
            config: Optional config for the retriever
            **kwargs: Additional arguments for base retriever

        Returns:
            List of validated Document objects
        """
        # Retrieve documents from base retriever
        docs = self.base_retriever.invoke(input, config=config, **kwargs)
        self._stats["total_retrieved"] += len(docs)

        # Validate each document
        validated_docs: list[Document] = []

        for doc in docs:
            result = self._validate_document(doc)
            self._stats["total_validated"] += 1

            if result.passed:
                self._stats["total_passed"] += 1
                validated_docs.append(self._tag_document(doc, result))
            else:
                self._handle_failed_validation(doc, result, validated_docs)

        return validated_docs

    def _validate_document(self, doc: Document) -> ValidationResult:
        """Validate a single document.

        Args:
            doc: LangChain Document to validate

        Returns:
            ValidationResult
        """
        # Build data dict from document
        data: dict[str, Any] = {}

        if self.validate_content:
            data["content"] = doc.page_content

        if self.validate_metadata and doc.metadata:
            data["metadata"] = doc.metadata
            # Also add metadata fields at top level for validators
            data.update(doc.metadata)

        # Run validation
        result = self.pipeline.validate(data)

        # Check confidence threshold
        confidence = result.metadata.get("semantic_confidence", 1.0)
        if confidence < self.min_confidence:
            result.add_message(
                code="LOW_CONFIDENCE",
                message=f"Confidence {confidence:.2f} below threshold {self.min_confidence}",
                severity=ValidationStatus.FAILED,
                validator="langchain_retriever",
            )
            result.status = ValidationStatus.FAILED

        return result

    def _handle_failed_validation(
        self,
        doc: Document,
        result: ValidationResult,
        validated_docs: list[Document],
    ) -> None:
        """Handle a document that failed validation.

        Args:
            doc: The failed document
            result: Validation result
            validated_docs: List to potentially add document to
        """
        if self.on_fail == "filter":
            self._stats["total_filtered"] += 1
            logger.debug(f"Filtered document due to validation failure: {result}")

        elif self.on_fail == "raise":
            error_messages = [str(m) for m in result.errors]
            raise ValidationError(
                f"Document validation failed: {'; '.join(error_messages)}",
                result=result,
            )

        elif self.on_fail == "warn":
            logger.warning(f"Document validation warning: {result}")
            validated_docs.append(self._tag_document(doc, result))

        elif self.on_fail == "tag":
            validated_docs.append(self._tag_document(doc, result))

    def _tag_document(self, doc: Document, result: ValidationResult) -> Document:
        """Add validation metadata to document.

        Args:
            doc: Document to tag
            result: Validation result

        Returns:
            Tagged document (same object, modified metadata)
        """
        doc.metadata["_daytashield_status"] = result.status.value
        doc.metadata["_daytashield_passed"] = result.passed
        doc.metadata["_daytashield_message_count"] = len(result.messages)

        if result.errors:
            doc.metadata["_daytashield_errors"] = [str(e) for e in result.errors[:5]]

        if "semantic_confidence" in result.metadata:
            doc.metadata["_daytashield_confidence"] = result.metadata["semantic_confidence"]

        return doc

    @property
    def stats(self) -> dict[str, int]:
        """Get retrieval and validation statistics."""
        return dict(self._stats)

    def reset_stats(self) -> None:
        """Reset statistics counters."""
        for key in self._stats:
            self._stats[key] = 0

    # LangChain Retriever interface methods

    def get_relevant_documents(
        self,
        query: str,
        *,
        callbacks: CallbackManagerForRetrieverRun | None = None,
        tags: list[str] | None = None,
        metadata: dict[str, Any] | None = None,
        run_name: str | None = None,
        **kwargs: Any,
    ) -> list[Document]:
        """Legacy method for LangChain compatibility."""
        return self.invoke(query, **kwargs)

    async def ainvoke(
        self,
        input: str,  # noqa: A002
        config: dict[str, Any] | None = None,
        **kwargs: Any,
    ) -> list[Document]:
        """Async version of invoke."""
        # For now, just call sync version
        # Could be optimized with async validation in the future
        return self.invoke(input, config=config, **kwargs)

    def __repr__(self) -> str:
        return (
            f"ValidatedRetriever(base={self.base_retriever.__class__.__name__}, "
            f"validators={len(self.pipeline.validators)}, on_fail={self.on_fail!r})"
        )


class ValidationError(Exception):
    """Raised when document validation fails and on_fail='raise'."""

    def __init__(self, message: str, result: ValidationResult | None = None):
        super().__init__(message)
        self.result = result


class ValidatedDocumentLoader:
    """A document loader wrapper that validates documents during loading.

    Wraps any LangChain document loader and validates documents as they're
    loaded. Useful for validating data at ingestion time.

    Example:
        >>> from langchain_community.document_loaders import PyPDFLoader
        >>> from daytashield.integrations.langchain import ValidatedDocumentLoader
        >>>
        >>> loader = ValidatedDocumentLoader(
        ...     base_loader=PyPDFLoader("document.pdf"),
        ...     validators=[ComplianceValidator(rules=["hipaa"])],
        ...     on_fail="filter",
        ... )
        >>> docs = loader.load()  # Only compliant documents returned
    """

    def __init__(
        self,
        base_loader: Any,
        validators: list[Any] | None = None,
        pipeline: ValidationPipeline | None = None,
        on_fail: OnFailAction = "filter",
        transform: Callable[[Document], dict[str, Any]] | None = None,
    ):
        """Initialize the validated loader.

        Args:
            base_loader: The underlying LangChain document loader
            validators: List of validators to apply
            pipeline: Pre-configured ValidationPipeline
            on_fail: Action when validation fails
            transform: Optional function to transform document to validation input
        """
        self.base_loader = base_loader
        self.on_fail = on_fail
        self.transform = transform

        if pipeline is not None:
            self.pipeline = pipeline
        elif validators:
            self.pipeline = ValidationPipeline(validators=validators)
        else:
            self.pipeline = ValidationPipeline()

    def load(self) -> list[Document]:
        """Load and validate documents.

        Returns:
            List of validated documents
        """
        docs = self.base_loader.load()
        return self._validate_docs(docs)

    def lazy_load(self) -> Any:
        """Lazily load and validate documents.

        Yields:
            Validated documents one at a time
        """
        for doc in self.base_loader.lazy_load():
            data = self._doc_to_data(doc)
            result = self.pipeline.validate(data)

            if result.passed:
                yield doc
            elif self.on_fail == "warn":
                logger.warning(f"Document validation warning: {result}")
                yield doc
            elif self.on_fail == "tag":
                doc.metadata["_daytashield_validation"] = result.to_dict()
                yield doc
            elif self.on_fail == "raise":
                raise ValidationError(f"Document validation failed: {result}", result)
            # "filter" - just don't yield

    def _validate_docs(self, docs: list[Document]) -> list[Document]:
        """Validate a list of documents."""
        validated = []
        for doc in docs:
            data = self._doc_to_data(doc)
            result = self.pipeline.validate(data)

            if result.passed:
                validated.append(doc)
            elif self.on_fail == "warn":
                logger.warning(f"Document validation warning: {result}")
                validated.append(doc)
            elif self.on_fail == "tag":
                doc.metadata["_daytashield_validation"] = result.to_dict()
                validated.append(doc)
            elif self.on_fail == "raise":
                raise ValidationError(f"Document validation failed: {result}", result)

        return validated

    def _doc_to_data(self, doc: Document) -> dict[str, Any]:
        """Convert document to validation data."""
        if self.transform:
            return self.transform(doc)

        return {
            "content": doc.page_content,
            "metadata": doc.metadata,
            **doc.metadata,
        }


# Type alias for the Document type (avoid import at module level)
try:
    from langchain_core.documents import Document
except ImportError:
    Document = Any  # type: ignore[misc, assignment]
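
The docstrings above already illustrate the "filter" mode; as a complement, here is a small hedged sketch of the "tag" mode and the stats counters. Only names that appear in the hunk are used (SchemaValidator and the _daytashield_* metadata keys); the `vectorstore` object and its contents are assumed to exist.

# Hypothetical sketch, not taken from the package's documentation.
from daytashield import SchemaValidator
from daytashield.integrations.langchain import ValidatedRetriever

retriever = ValidatedRetriever(
    base_retriever=vectorstore.as_retriever(),  # any LangChain retriever; vectorstore is assumed
    validators=[SchemaValidator(schema={"type": "object"})],
    on_fail="tag",  # keep failing documents but annotate them instead of filtering
)

docs = retriever.invoke("search query")
for doc in docs:
    # Metadata keys written by ValidatedRetriever._tag_document above.
    if not doc.metadata.get("_daytashield_passed", True):
        print("flagged:", doc.metadata["_daytashield_status"],
              doc.metadata.get("_daytashield_errors", []))

print(retriever.stats)  # e.g. {"total_retrieved": 4, "total_validated": 4, ...}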

daytashield/processors/__init__.py
@@ -0,0 +1,13 @@
"""DaytaShield processors for extracting and transforming data."""

from daytashield.processors.base import BaseProcessor
from daytashield.processors.csv import CSVProcessor
from daytashield.processors.json import JSONProcessor
from daytashield.processors.pdf import PDFProcessor

__all__ = [
    "BaseProcessor",
    "PDFProcessor",
    "CSVProcessor",
    "JSONProcessor",
]