daytashield-0.1.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,217 @@
1
+ """Data routing based on validation results."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from enum import Enum
6
+ from typing import Any, Callable
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+ from daytashield.core.result import ValidationResult, ValidationStatus
11
+
12
+
13
+ class RouteAction(str, Enum):
14
+ """Actions that can be taken for routed data."""
15
+
16
+ PASS = "pass" # Send to destination
17
+ REVIEW = "review" # Send to review queue
18
+ QUARANTINE = "quarantine" # Isolate failed data
19
+ RETRY = "retry" # Attempt reprocessing
20
+ DROP = "drop" # Discard the data
21
+
22
+
23
+ class Route(BaseModel):
24
+ """A routing rule with condition and action."""
25
+
26
+ name: str = Field(..., description="Route name for identification")
27
+ action: RouteAction = Field(..., description="Action to take")
28
+ condition: Callable[[ValidationResult], bool] | None = Field(
29
+ None, description="Custom condition function"
30
+ )
31
+ destination: str | None = Field(None, description="Destination identifier")
32
+ metadata: dict[str, Any] = Field(default_factory=dict, description="Additional route metadata")
33
+
34
+ model_config = {"arbitrary_types_allowed": True}
35
+
36
+
37
+ class RoutingDecision(BaseModel):
38
+ """The result of routing a validation result."""
39
+
40
+ route: Route = Field(..., description="The matched route")
41
+ result: ValidationResult = Field(..., description="The validation result")
42
+ reason: str = Field(..., description="Why this route was selected")
43
+
44
+ model_config = {"arbitrary_types_allowed": True}
45
+
46
+
47
+ class RouterConfig(BaseModel):
48
+ """Configuration for the data router."""
49
+
50
+ default_action: RouteAction = Field(
51
+ RouteAction.QUARANTINE, description="Default action when no route matches"
52
+ )
53
+ include_warnings_in_review: bool = Field(
54
+ True, description="Route warnings to review queue"
55
+ )
56
+
57
+
58
+ class DataRouter:
59
+ """Routes data based on validation results.
60
+
61
+ The router examines validation results and determines where data should
62
+ go: pass to destination, send to review, quarantine, etc.
63
+
64
+ Default routing logic:
65
+ - PASSED → PASS (send to destination)
66
+ - WARNING → REVIEW (send to review queue) if configured
67
+ - FAILED → QUARANTINE (isolate for investigation)
68
+ - ERROR → QUARANTINE
69
+
70
+ Example:
71
+ >>> router = DataRouter()
72
+ >>> result = pipeline.validate(data)
73
+ >>> decision = router.route(result)
74
+ >>> if decision.route.action == RouteAction.PASS:
75
+ ... send_to_destination(result.data)
76
+ ... elif decision.route.action == RouteAction.QUARANTINE:
77
+ ... quarantine(result.data, decision.reason)
78
+ """
79
+
80
+ def __init__(
81
+ self,
82
+ routes: list[Route] | None = None,
83
+ config: RouterConfig | dict[str, Any] | None = None,
84
+ ):
85
+ """Initialize the router.
86
+
87
+ Args:
88
+ routes: Custom routes (checked before defaults)
89
+ config: Router configuration
90
+ """
91
+ self.custom_routes: list[Route] = routes or []
92
+
93
+ if config is None:
94
+ self.config = RouterConfig()
95
+ elif isinstance(config, dict):
96
+ self.config = RouterConfig(**config)
97
+ else:
98
+ self.config = config
99
+
100
+ # Set up default routes
101
+ self._default_routes = self._create_default_routes()
102
+
103
+ def _create_default_routes(self) -> list[Route]:
104
+ """Create the default routing rules."""
105
+ routes = [
106
+ Route(
107
+ name="pass_valid",
108
+ action=RouteAction.PASS,
109
+ condition=lambda r: r.status == ValidationStatus.PASSED,
110
+ ),
111
+ Route(
112
+ name="quarantine_failed",
113
+ action=RouteAction.QUARANTINE,
114
+ condition=lambda r: r.status == ValidationStatus.FAILED,
115
+ ),
116
+ Route(
117
+ name="quarantine_error",
118
+ action=RouteAction.QUARANTINE,
119
+ condition=lambda r: r.status == ValidationStatus.ERROR,
120
+ ),
121
+ ]
122
+
123
+ # Add warning route based on config
124
+ if self.config.include_warnings_in_review:
125
+ routes.insert(
126
+ 1,
127
+ Route(
128
+ name="review_warnings",
129
+ action=RouteAction.REVIEW,
130
+ condition=lambda r: r.status == ValidationStatus.WARNING,
131
+ ),
132
+ )
133
+ else:
134
+ routes.insert(
135
+ 1,
136
+ Route(
137
+ name="pass_warnings",
138
+ action=RouteAction.PASS,
139
+ condition=lambda r: r.status == ValidationStatus.WARNING,
140
+ ),
141
+ )
142
+
143
+ return routes
144
+
145
+ def add_route(self, route: Route) -> DataRouter:
146
+ """Add a custom route.
147
+
148
+ Custom routes are checked before default routes.
149
+
150
+ Args:
151
+ route: The route to add
152
+
153
+ Returns:
154
+ Self for method chaining
155
+ """
156
+ self.custom_routes.append(route)
157
+ return self
158
+
159
+ def route(self, result: ValidationResult) -> RoutingDecision:
160
+ """Determine where to route the data.
161
+
162
+ Args:
163
+ result: The validation result to route
164
+
165
+ Returns:
166
+ RoutingDecision with the selected route
167
+ """
168
+ # Check custom routes first
169
+ for route in self.custom_routes:
170
+ if route.condition and route.condition(result):
171
+ return RoutingDecision(
172
+ route=route,
173
+ result=result,
174
+ reason=f"Matched custom route: {route.name}",
175
+ )
176
+
177
+ # Check default routes
178
+ for route in self._default_routes:
179
+ if route.condition and route.condition(result):
180
+ return RoutingDecision(
181
+ route=route,
182
+ result=result,
183
+ reason=f"Matched default route: {route.name}",
184
+ )
185
+
186
+ # Fallback to default action
187
+ fallback_route = Route(
188
+ name="fallback",
189
+ action=self.config.default_action,
190
+ )
191
+ return RoutingDecision(
192
+ route=fallback_route,
193
+ result=result,
194
+ reason=f"No route matched, using default action: {self.config.default_action.value}",
195
+ )
196
+
197
+ def route_batch(self, results: list[ValidationResult]) -> dict[RouteAction, list[RoutingDecision]]:
198
+ """Route multiple results and group by action.
199
+
200
+ Args:
201
+ results: List of validation results
202
+
203
+ Returns:
204
+ Dict mapping actions to lists of routing decisions
205
+ """
206
+ grouped: dict[RouteAction, list[RoutingDecision]] = {
207
+ action: [] for action in RouteAction
208
+ }
209
+
210
+ for result in results:
211
+ decision = self.route(result)
212
+ grouped[decision.route.action].append(decision)
213
+
214
+ return grouped
215
+
216
+ def __repr__(self) -> str:
217
+ return f"DataRouter(custom_routes={len(self.custom_routes)}, config={self.config})"
@@ -0,0 +1,7 @@
1
+ """DaytaShield integrations with external frameworks."""
2
+
3
+ from daytashield.integrations.langchain import ValidatedRetriever
4
+
5
+ __all__ = [
6
+ "ValidatedRetriever",
7
+ ]
@@ -0,0 +1,391 @@
1
+ """LangChain integration for validated retrieval."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import TYPE_CHECKING, Any, Callable, Literal
7
+
8
+ from pydantic import Field
9
+
10
+ from daytashield.core.pipeline import ValidationPipeline
11
+ from daytashield.core.result import ValidationResult, ValidationStatus, create_result
12
+
13
+ if TYPE_CHECKING:
14
+ from langchain_core.callbacks import CallbackManagerForRetrieverRun
15
+ from langchain_core.documents import Document
16
+ from langchain_core.retrievers import BaseRetriever
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ OnFailAction = Literal["filter", "raise", "warn", "tag"]
22
+
23
+
24
+ class ValidatedRetriever:
25
+ """A LangChain retriever wrapper that validates documents before returning.
26
+
27
+ Wraps any LangChain retriever and applies DaytaShield validation to
28
+ retrieved documents. Invalid documents can be filtered, flagged, or
29
+ rejected with an exception, depending on configuration.
30
+
31
+ Example:
32
+ >>> from langchain_community.vectorstores import FAISS
33
+ >>> from daytashield import SchemaValidator, FreshnessValidator
34
+ >>> from daytashield.integrations.langchain import ValidatedRetriever
35
+ >>>
36
+ >>> # Create base retriever
37
+ >>> vectorstore = FAISS.from_texts(texts, embeddings)
38
+ >>> base_retriever = vectorstore.as_retriever()
39
+ >>>
40
+ >>> # Wrap with validation
41
+ >>> retriever = ValidatedRetriever(
42
+ ... base_retriever=base_retriever,
43
+ ... validators=[
44
+ ... SchemaValidator(schema={"type": "object"}),
45
+ ... FreshnessValidator(max_age="7d"),
46
+ ... ],
47
+ ... on_fail="filter",
48
+ ... )
49
+ >>>
50
+ >>> # Use like any retriever
51
+ >>> docs = retriever.invoke("search query")
52
+
53
+ Actions on validation failure:
54
+ - "filter": Remove invalid documents from results
55
+ - "raise": Raise ValidationError for any invalid document
56
+ - "warn": Log warning but include document
57
+ - "tag": Add validation metadata to document
58
+ """
59
+
60
+ def __init__(
61
+ self,
62
+ base_retriever: BaseRetriever,
63
+ validators: list[Any] | None = None,
64
+ pipeline: ValidationPipeline | None = None,
65
+ on_fail: OnFailAction = "filter",
66
+ validate_metadata: bool = True,
67
+ validate_content: bool = True,
68
+ min_confidence: float = 0.0,
69
+ ):
70
+ """Initialize the validated retriever.
71
+
72
+ Args:
73
+ base_retriever: The underlying LangChain retriever
74
+ validators: List of validators to apply (creates pipeline)
75
+ pipeline: Pre-configured ValidationPipeline (alternative to validators)
76
+ on_fail: Action when validation fails
77
+ validate_metadata: Validate document metadata
78
+ validate_content: Validate document content
79
+ min_confidence: Minimum confidence score to pass (for semantic validation)
80
+ """
81
+ self.base_retriever = base_retriever
82
+ self.on_fail = on_fail
83
+ self.validate_metadata = validate_metadata
84
+ self.validate_content = validate_content
85
+ self.min_confidence = min_confidence
86
+
87
+ # Set up validation pipeline
88
+ if pipeline is not None:
89
+ self.pipeline = pipeline
90
+ elif validators:
91
+ self.pipeline = ValidationPipeline(validators=validators)
92
+ else:
93
+ self.pipeline = ValidationPipeline()
94
+
95
+ # Statistics
96
+ self._stats = {
97
+ "total_retrieved": 0,
98
+ "total_validated": 0,
99
+ "total_passed": 0,
100
+ "total_filtered": 0,
101
+ }
102
+
103
+ def invoke(
104
+ self,
105
+ input: str, # noqa: A002
106
+ config: dict[str, Any] | None = None,
107
+ **kwargs: Any,
108
+ ) -> list[Document]:
109
+ """Retrieve and validate documents.
110
+
111
+ Args:
112
+ input: The search query
113
+ config: Optional config for the retriever
114
+ **kwargs: Additional arguments for base retriever
115
+
116
+ Returns:
117
+ List of validated Document objects
118
+ """
119
+ # Retrieve documents from base retriever
120
+ docs = self.base_retriever.invoke(input, config=config, **kwargs)
121
+ self._stats["total_retrieved"] += len(docs)
122
+
123
+ # Validate each document
124
+ validated_docs: list[Document] = []
125
+
126
+ for doc in docs:
127
+ result = self._validate_document(doc)
128
+ self._stats["total_validated"] += 1
129
+
130
+ if result.passed:
131
+ self._stats["total_passed"] += 1
132
+ validated_docs.append(self._tag_document(doc, result))
133
+ else:
134
+ self._handle_failed_validation(doc, result, validated_docs)
135
+
136
+ return validated_docs
137
+
138
+ def _validate_document(self, doc: Document) -> ValidationResult:
139
+ """Validate a single document.
140
+
141
+ Args:
142
+ doc: LangChain Document to validate
143
+
144
+ Returns:
145
+ ValidationResult
146
+ """
147
+ # Build data dict from document
148
+ data: dict[str, Any] = {}
149
+
150
+ if self.validate_content:
151
+ data["content"] = doc.page_content
152
+
153
+ if self.validate_metadata and doc.metadata:
154
+ data["metadata"] = doc.metadata
155
+ # Also add metadata fields at top level for validators
156
+ data.update(doc.metadata)
157
+
158
+ # Run validation
159
+ result = self.pipeline.validate(data)
160
+
161
+ # Check confidence threshold
162
+ confidence = result.metadata.get("semantic_confidence", 1.0)
163
+ if confidence < self.min_confidence:
164
+ result.add_message(
165
+ code="LOW_CONFIDENCE",
166
+ message=f"Confidence {confidence:.2f} below threshold {self.min_confidence}",
167
+ severity=ValidationStatus.FAILED,
168
+ validator="langchain_retriever",
169
+ )
170
+ result.status = ValidationStatus.FAILED
171
+
172
+ return result
173
+
174
+ def _handle_failed_validation(
175
+ self,
176
+ doc: Document,
177
+ result: ValidationResult,
178
+ validated_docs: list[Document],
179
+ ) -> None:
180
+ """Handle a document that failed validation.
181
+
182
+ Args:
183
+ doc: The failed document
184
+ result: Validation result
185
+ validated_docs: List to potentially add document to
186
+ """
187
+ if self.on_fail == "filter":
188
+ self._stats["total_filtered"] += 1
189
+ logger.debug(f"Filtered document due to validation failure: {result}")
190
+
191
+ elif self.on_fail == "raise":
192
+ error_messages = [str(m) for m in result.errors]
193
+ raise ValidationError(
194
+ f"Document validation failed: {'; '.join(error_messages)}",
195
+ result=result,
196
+ )
197
+
198
+ elif self.on_fail == "warn":
199
+ logger.warning(f"Document validation warning: {result}")
200
+ validated_docs.append(self._tag_document(doc, result))
201
+
202
+ elif self.on_fail == "tag":
203
+ validated_docs.append(self._tag_document(doc, result))
204
+
205
+ def _tag_document(self, doc: Document, result: ValidationResult) -> Document:
206
+ """Add validation metadata to document.
207
+
208
+ Args:
209
+ doc: Document to tag
210
+ result: Validation result
211
+
212
+ Returns:
213
+ Tagged document (same object, modified metadata)
214
+ """
215
+ doc.metadata["_daytashield_status"] = result.status.value
216
+ doc.metadata["_daytashield_passed"] = result.passed
217
+ doc.metadata["_daytashield_message_count"] = len(result.messages)
218
+
219
+ if result.errors:
220
+ doc.metadata["_daytashield_errors"] = [str(e) for e in result.errors[:5]]
221
+
222
+ if "semantic_confidence" in result.metadata:
223
+ doc.metadata["_daytashield_confidence"] = result.metadata["semantic_confidence"]
224
+
225
+ return doc
226
+
227
+ @property
228
+ def stats(self) -> dict[str, int]:
229
+ """Get retrieval and validation statistics."""
230
+ return dict(self._stats)
231
+
232
+ def reset_stats(self) -> None:
233
+ """Reset statistics counters."""
234
+ for key in self._stats:
235
+ self._stats[key] = 0
236
+
237
+ # LangChain Retriever interface methods
238
+
239
+ def get_relevant_documents(
240
+ self,
241
+ query: str,
242
+ *,
243
+ callbacks: CallbackManagerForRetrieverRun | None = None,
244
+ tags: list[str] | None = None,
245
+ metadata: dict[str, Any] | None = None,
246
+ run_name: str | None = None,
247
+ **kwargs: Any,
248
+ ) -> list[Document]:
249
+ """Legacy method for LangChain compatibility."""
250
+ return self.invoke(query, **kwargs)
251
+
252
+ async def ainvoke(
253
+ self,
254
+ input: str, # noqa: A002
255
+ config: dict[str, Any] | None = None,
256
+ **kwargs: Any,
257
+ ) -> list[Document]:
258
+ """Async version of invoke."""
259
+ # For now, just call sync version
260
+ # Could be optimized with async validation in the future
261
+ return self.invoke(input, config=config, **kwargs)
262
+
263
+ def __repr__(self) -> str:
264
+ return (
265
+ f"ValidatedRetriever(base={self.base_retriever.__class__.__name__}, "
266
+ f"validators={len(self.pipeline.validators)}, on_fail={self.on_fail!r})"
267
+ )
268
+
269
+
270
+ class ValidationError(Exception):
271
+ """Raised when document validation fails and on_fail='raise'."""
272
+
273
+ def __init__(self, message: str, result: ValidationResult | None = None):
274
+ super().__init__(message)
275
+ self.result = result
276
+
277
+
278
+ class ValidatedDocumentLoader:
279
+ """A document loader wrapper that validates documents during loading.
280
+
281
+ Wraps any LangChain document loader and validates documents as they're
282
+ loaded. Useful for validating data at ingestion time.
283
+
284
+ Example:
285
+ >>> from langchain_community.document_loaders import PyPDFLoader
286
+ >>> from daytashield.integrations.langchain import ValidatedDocumentLoader
287
+ >>>
288
+ >>> loader = ValidatedDocumentLoader(
289
+ ... base_loader=PyPDFLoader("document.pdf"),
290
+ ... validators=[ComplianceValidator(rules=["hipaa"])],
291
+ ... on_fail="filter",
292
+ ... )
293
+ >>> docs = loader.load() # Only compliant documents returned
294
+ """
295
+
296
+ def __init__(
297
+ self,
298
+ base_loader: Any,
299
+ validators: list[Any] | None = None,
300
+ pipeline: ValidationPipeline | None = None,
301
+ on_fail: OnFailAction = "filter",
302
+ transform: Callable[[Document], dict[str, Any]] | None = None,
303
+ ):
304
+ """Initialize the validated loader.
305
+
306
+ Args:
307
+ base_loader: The underlying LangChain document loader
308
+ validators: List of validators to apply
309
+ pipeline: Pre-configured ValidationPipeline
310
+ on_fail: Action when validation fails
311
+ transform: Optional function to transform document to validation input
312
+ """
313
+ self.base_loader = base_loader
314
+ self.on_fail = on_fail
315
+ self.transform = transform
316
+
317
+ if pipeline is not None:
318
+ self.pipeline = pipeline
319
+ elif validators:
320
+ self.pipeline = ValidationPipeline(validators=validators)
321
+ else:
322
+ self.pipeline = ValidationPipeline()
323
+
324
+ def load(self) -> list[Document]:
325
+ """Load and validate documents.
326
+
327
+ Returns:
328
+ List of validated documents
329
+ """
330
+ docs = self.base_loader.load()
331
+ return self._validate_docs(docs)
332
+
333
+ def lazy_load(self) -> Any:
334
+ """Lazily load and validate documents.
335
+
336
+ Yields:
337
+ Validated documents one at a time
338
+ """
339
+ for doc in self.base_loader.lazy_load():
340
+ data = self._doc_to_data(doc)
341
+ result = self.pipeline.validate(data)
342
+
343
+ if result.passed:
344
+ yield doc
345
+ elif self.on_fail == "warn":
346
+ logger.warning(f"Document validation warning: {result}")
347
+ yield doc
348
+ elif self.on_fail == "tag":
349
+ doc.metadata["_daytashield_validation"] = result.to_dict()
350
+ yield doc
351
+ elif self.on_fail == "raise":
352
+ raise ValidationError(f"Document validation failed: {result}", result)
353
+ # "filter" - just don't yield
354
+
355
+ def _validate_docs(self, docs: list[Document]) -> list[Document]:
356
+ """Validate a list of documents."""
357
+ validated = []
358
+ for doc in docs:
359
+ data = self._doc_to_data(doc)
360
+ result = self.pipeline.validate(data)
361
+
362
+ if result.passed:
363
+ validated.append(doc)
364
+ elif self.on_fail == "warn":
365
+ logger.warning(f"Document validation warning: {result}")
366
+ validated.append(doc)
367
+ elif self.on_fail == "tag":
368
+ doc.metadata["_daytashield_validation"] = result.to_dict()
369
+ validated.append(doc)
370
+ elif self.on_fail == "raise":
371
+ raise ValidationError(f"Document validation failed: {result}", result)
372
+
373
+ return validated
374
+
375
+ def _doc_to_data(self, doc: Document) -> dict[str, Any]:
376
+ """Convert document to validation data."""
377
+ if self.transform:
378
+ return self.transform(doc)
379
+
380
+ return {
381
+ "content": doc.page_content,
382
+ "metadata": doc.metadata,
383
+ **doc.metadata,
384
+ }
385
+
386
+
387
+ # Runtime import of Document, guarded so this module still imports when langchain-core is not installed
388
+ try:
389
+ from langchain_core.documents import Document
390
+ except ImportError:
391
+ Document = Any # type: ignore[misc, assignment]
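
Complementing the docstring examples above, a short sketch of the "tag" failure mode and the statistics counters; `base_retriever` and FreshnessValidator are assumed to exist as in the ValidatedRetriever docstring:

    from daytashield import FreshnessValidator
    from daytashield.integrations.langchain import ValidatedRetriever

    retriever = ValidatedRetriever(
        base_retriever=base_retriever,   # any LangChain retriever (assumed to exist)
        validators=[FreshnessValidator(max_age="7d")],
        on_fail="tag",                   # keep failing documents, but annotate them
    )

    docs = retriever.invoke("quarterly revenue figures")

    # With on_fail="tag", every document is returned and _tag_document() records
    # the outcome in the metadata keys written above.
    for doc in docs:
        if not doc.metadata.get("_daytashield_passed", True):
            print(doc.metadata["_daytashield_status"],
                  doc.metadata.get("_daytashield_errors", []))

    # Counters accumulate across invoke() calls until reset_stats() is called.
    print(retriever.stats)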
@@ -0,0 +1,13 @@
1
+ """DaytaShield processors for extracting and transforming data."""
2
+
3
+ from daytashield.processors.base import BaseProcessor
4
+ from daytashield.processors.csv import CSVProcessor
5
+ from daytashield.processors.json import JSONProcessor
6
+ from daytashield.processors.pdf import PDFProcessor
7
+
8
+ __all__ = [
9
+ "BaseProcessor",
10
+ "PDFProcessor",
11
+ "CSVProcessor",
12
+ "JSONProcessor",
13
+ ]
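
The processors' own interfaces are not part of this diff, so the following is only a hypothetical sketch of feeding extracted records into the validation pipeline; CSVProcessor's constructor argument and its process() method are assumptions, not the package's documented API:

    from daytashield.core.pipeline import ValidationPipeline
    from daytashield.processors import CSVProcessor

    pipeline = ValidationPipeline()          # validators omitted for brevity
    processor = CSVProcessor("orders.csv")   # hypothetical constructor signature
    for record in processor.process():       # hypothetical method name
        result = pipeline.validate(record)
        if not result.passed:
            print(result.status, record)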