daytashield 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- daytashield/__init__.py +55 -0
- daytashield/cli/__init__.py +5 -0
- daytashield/cli/main.py +541 -0
- daytashield/core/__init__.py +15 -0
- daytashield/core/audit.py +275 -0
- daytashield/core/pipeline.py +240 -0
- daytashield/core/result.py +185 -0
- daytashield/core/router.py +217 -0
- daytashield/integrations/__init__.py +7 -0
- daytashield/integrations/langchain.py +391 -0
- daytashield/processors/__init__.py +13 -0
- daytashield/processors/base.py +182 -0
- daytashield/processors/csv.py +269 -0
- daytashield/processors/json.py +260 -0
- daytashield/processors/pdf.py +232 -0
- daytashield/rules/__init__.py +14 -0
- daytashield/rules/base.py +67 -0
- daytashield/rules/gdpr.py +348 -0
- daytashield/rules/hipaa.py +229 -0
- daytashield/rules/pii.py +208 -0
- daytashield/validators/__init__.py +15 -0
- daytashield/validators/base.py +103 -0
- daytashield/validators/compliance.py +222 -0
- daytashield/validators/freshness.py +337 -0
- daytashield/validators/schema.py +176 -0
- daytashield/validators/semantic.py +256 -0
- daytashield-0.1.1.dist-info/METADATA +316 -0
- daytashield-0.1.1.dist-info/RECORD +31 -0
- daytashield-0.1.1.dist-info/WHEEL +4 -0
- daytashield-0.1.1.dist-info/entry_points.txt +2 -0
- daytashield-0.1.1.dist-info/licenses/LICENSE +190 -0
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
"""Semantic validation using LLMs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import json
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from pydantic import Field
|
|
10
|
+
|
|
11
|
+
from daytashield.core.result import ValidationResult, ValidationStatus
|
|
12
|
+
from daytashield.validators.base import BaseValidator, ValidatorConfig
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SemanticValidatorConfig(ValidatorConfig):
    """Settings consumed by the semantic (LLM-backed) validator.

    Every field has a default, so the class can be instantiated without
    arguments.
    """

    # Model identifier forwarded to the LLM completion call.
    model: str = Field(default="gpt-4o-mini", description="LLM model to use")
    # Sampling temperature forwarded to the LLM completion call.
    temperature: float = Field(
        default=0.0, description="LLM temperature (0 for deterministic)"
    )
    # Upper bound on the number of tokens in the LLM response.
    max_tokens: int = Field(default=500, description="Maximum response tokens")
    # When true, parsed LLM verdicts are kept in the validator's in-memory cache.
    cache_results: bool = Field(default=True, description="Cache validation results")
    # Per-request timeout forwarded to the LLM completion call.
    timeout: int = Field(default=30, description="Request timeout in seconds")
    # Optional endpoint override; ``None`` leaves the provider default in place.
    api_base: str | None = Field(default=None, description="Custom API base URL")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class SemanticValidator(BaseValidator):
|
|
27
|
+
"""Validates data semantically using LLMs.
|
|
28
|
+
|
|
29
|
+
Uses language models to perform content-based validation that goes
|
|
30
|
+
beyond schema checking. Useful for:
|
|
31
|
+
- Checking if content is appropriate/relevant
|
|
32
|
+
- Verifying factual consistency
|
|
33
|
+
- Detecting anomalies or outliers
|
|
34
|
+
- Domain-specific validation rules
|
|
35
|
+
|
|
36
|
+
Example:
|
|
37
|
+
>>> validator = SemanticValidator(
|
|
38
|
+
... prompt="Check if this document is a valid invoice with required fields",
|
|
39
|
+
... criteria=["has_invoice_number", "has_date", "has_line_items", "has_total"],
|
|
40
|
+
... )
|
|
41
|
+
>>> result = validator.validate(document_data, result)
|
|
42
|
+
|
|
43
|
+
The validator uses LiteLLM for provider-agnostic LLM access, supporting
|
|
44
|
+
OpenAI, Anthropic, local models, and more.
|
|
45
|
+
"""
|
|
46
|
+
|
|
47
|
+
name = "semantic"
|
|
48
|
+
|
|
49
|
+
def __init__(
    self,
    prompt: str,
    criteria: list[str] | None = None,
    config: SemanticValidatorConfig | dict[str, Any] | None = None,
):
    """Set up the validator with its prompt, criteria and configuration.

    Args:
        prompt: The validation prompt describing what to check
        criteria: List of specific criteria to evaluate; a falsy value
            results in an empty list
        config: Either a ready ``SemanticValidatorConfig``, a dict of keyword
            arguments used to build one, or ``None`` for the defaults
    """
    # Normalise the three accepted config forms into one object first, then
    # hand it to the base class in a single place.
    if config is None:
        resolved_config = SemanticValidatorConfig()
    elif isinstance(config, dict):
        resolved_config = SemanticValidatorConfig(**config)
    else:
        resolved_config = config
    super().__init__(resolved_config)

    self.prompt = prompt
    self.criteria = criteria if criteria else []
    # In-memory store of parsed LLM verdicts, keyed by _get_cache_key().
    self._cache: dict[str, dict[str, Any]] = {}
|
|
72
|
+
|
|
73
|
+
def validate(self, data: Any, result: ValidationResult) -> ValidationResult:
    """Validate data semantically using an LLM.

    The data is rendered into a prompt, sent through ``litellm.completion``
    and the JSON verdict returned by the model is applied to ``result``.
    When caching is enabled, a verdict is stored only after it has been
    applied successfully, so a malformed payload can never be replayed from
    the cache.

    Args:
        data: The data to validate
        result: The ValidationResult to update

    Returns:
        Updated ValidationResult. Failures (litellm missing, unparsable
        response, any error raised while calling the model or applying its
        verdict) are reported as an ERROR message on the result instead of
        being raised.
    """
    config = self.config
    if not isinstance(config, SemanticValidatorConfig):
        # Fall back to defaults when the stored config is of another type.
        config = SemanticValidatorConfig()

    cache_key = self._get_cache_key(data)
    if config.cache_results and cache_key in self._cache:
        return self._apply_cached_result(result, self._cache[cache_key])

    system_prompt = self._build_system_prompt()
    user_prompt = self._build_user_prompt(data)

    try:
        # Imported lazily so the rest of the package works without litellm.
        # Kept in its own try so that an ImportError raised later, inside the
        # completion call, is not misreported as "litellm not installed".
        import litellm
    except ImportError:
        return self._record_error(
            result,
            "SEMANTIC_NO_LITELLM",
            "litellm package not installed. Install with: pip install litellm",
        )

    request: dict[str, Any] = {
        "model": config.model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": config.temperature,
        "max_tokens": config.max_tokens,
        "timeout": config.timeout,
        "response_format": {"type": "json_object"},
    }
    if config.api_base:
        # Scope the custom endpoint to this call; assigning litellm.api_base
        # would change it for every other litellm user in the process.
        request["api_base"] = config.api_base

    try:
        response = litellm.completion(**request)
        response_text = response.choices[0].message.content
        validation_result = json.loads(response_text)

        if not isinstance(validation_result, dict):
            # Valid JSON, but not the object shape the prompt asks for.
            return self._record_error(
                result,
                "SEMANTIC_PARSE_ERROR",
                "Failed to parse LLM response as JSON: expected an object, "
                f"got {type(validation_result).__name__}",
            )

        updated = self._apply_validation_result(result, validation_result)

        # Cache only after the verdict applied cleanly; the cached path runs
        # outside this try block and must not see a payload that raises.
        if config.cache_results:
            self._cache[cache_key] = validation_result
        return updated

    except json.JSONDecodeError as e:
        return self._record_error(
            result,
            "SEMANTIC_PARSE_ERROR",
            f"Failed to parse LLM response as JSON: {e}",
        )

    except Exception as e:
        # Deliberate catch-all: provider/network failures are reported on the
        # result rather than propagated to the caller.
        return self._record_error(
            result,
            "SEMANTIC_LLM_ERROR",
            f"LLM validation failed: {e}",
            details={"error": str(e)},
        )

def _record_error(
    self,
    result: ValidationResult,
    code: str,
    message: str,
    details: dict[str, Any] | None = None,
) -> ValidationResult:
    """Attach an ERROR message to ``result``, mark it ERROR and return it."""
    # Only forward ``details`` when supplied so add_message keeps its default.
    extra: dict[str, Any] = {} if details is None else {"details": details}
    result.add_message(
        code=code,
        message=message,
        severity=ValidationStatus.ERROR,
        validator=self.name,
        **extra,
    )
    result.status = ValidationStatus.ERROR
    return result
|
|
158
|
+
|
|
159
|
+
def _build_system_prompt(self) -> str:
|
|
160
|
+
"""Build the system prompt for the LLM."""
|
|
161
|
+
criteria_text = ""
|
|
162
|
+
if self.criteria:
|
|
163
|
+
criteria_list = "\n".join(f"- {c}" for c in self.criteria)
|
|
164
|
+
criteria_text = f"\n\nSpecific criteria to evaluate:\n{criteria_list}"
|
|
165
|
+
|
|
166
|
+
return f"""You are a data validation assistant. Your task is to validate data based on the given criteria and return a structured JSON response.
|
|
167
|
+
|
|
168
|
+
Validation task: {self.prompt}{criteria_text}
|
|
169
|
+
|
|
170
|
+
IMPORTANT: Respond ONLY with valid JSON in this exact format:
|
|
171
|
+
{{
|
|
172
|
+
"valid": true/false,
|
|
173
|
+
"confidence": 0.0-1.0,
|
|
174
|
+
"issues": [
|
|
175
|
+
{{
|
|
176
|
+
"criterion": "criterion_name",
|
|
177
|
+
"passed": true/false,
|
|
178
|
+
"message": "explanation"
|
|
179
|
+
}}
|
|
180
|
+
],
|
|
181
|
+
"summary": "brief overall assessment"
|
|
182
|
+
}}"""
|
|
183
|
+
|
|
184
|
+
def _build_user_prompt(self, data: Any) -> str:
|
|
185
|
+
"""Build the user prompt with the data to validate."""
|
|
186
|
+
if isinstance(data, dict):
|
|
187
|
+
data_str = json.dumps(data, indent=2, default=str)
|
|
188
|
+
elif isinstance(data, str):
|
|
189
|
+
data_str = data
|
|
190
|
+
else:
|
|
191
|
+
data_str = str(data)
|
|
192
|
+
|
|
193
|
+
return f"""Please validate the following data:
|
|
194
|
+
|
|
195
|
+
```
|
|
196
|
+
{data_str[:10000]}
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
Return your validation result as JSON."""
|
|
200
|
+
|
|
201
|
+
def _get_cache_key(self, data: Any) -> str:
|
|
202
|
+
"""Generate a cache key for the data."""
|
|
203
|
+
data_str = json.dumps(data, sort_keys=True, default=str) if isinstance(data, dict) else str(data)
|
|
204
|
+
content = f"{self.prompt}:{self.criteria}:{data_str}"
|
|
205
|
+
return hashlib.sha256(content.encode()).hexdigest()[:16]
|
|
206
|
+
|
|
207
|
+
def _apply_validation_result(
    self, result: ValidationResult, validation: dict[str, Any]
) -> ValidationResult:
    """Apply the LLM validation result to the ValidationResult.

    Reads the ``valid``, ``confidence``, ``issues`` and ``summary`` keys of
    the parsed LLM payload, records confidence and summary in
    ``result.metadata``, adds one message per issue whose ``passed`` flag is
    falsy, and updates ``result.status`` when the payload is not valid.

    The payload comes from a model, so it is read defensively: a missing,
    null or non-numeric ``confidence`` counts as 0.0, a non-list ``issues``
    counts as empty, and issue entries that are not dicts are skipped.

    Args:
        result: The ValidationResult to update
        validation: Parsed JSON object returned by the LLM

    Returns:
        The same ``result`` object, updated in place
    """
    is_valid = validation.get("valid", False)
    summary = validation.get("summary", "")

    try:
        confidence = float(validation.get("confidence", 0.0))
    except (TypeError, ValueError):
        # e.g. "confidence": null or "confidence": "high"
        confidence = 0.0

    issues = validation.get("issues", [])
    if not isinstance(issues, list):
        issues = []

    # Add metadata
    result.metadata["semantic_confidence"] = confidence
    result.metadata["semantic_summary"] = summary

    # One threshold decides both the per-issue severity and the overall
    # status, so the two cannot disagree at exactly 0.5.
    # NOTE(review): lower confidence maps to the harsher outcome (FAILED) and
    # higher confidence to WARNING; kept as found, confirm this is intended.
    low_confidence = confidence < 0.5
    severity = ValidationStatus.FAILED if low_confidence else ValidationStatus.WARNING

    # Process issues
    for issue in issues:
        if not isinstance(issue, dict) or issue.get("passed", True):
            continue
        result.add_message(
            code="SEMANTIC_CRITERION_FAILED",
            message=issue.get("message", "Criterion not met"),
            severity=severity,
            validator=self.name,
            field=issue.get("criterion"),
            details={"confidence": confidence},
        )

    # Update status
    if not is_valid:
        if low_confidence:
            result.status = ValidationStatus.FAILED
        elif result.status != ValidationStatus.FAILED:
            # Do not downgrade a result that has already failed.
            result.status = ValidationStatus.WARNING

    return result
|
|
243
|
+
|
|
244
|
+
def _apply_cached_result(
|
|
245
|
+
self, result: ValidationResult, cached: dict[str, Any]
|
|
246
|
+
) -> ValidationResult:
|
|
247
|
+
"""Apply a cached validation result."""
|
|
248
|
+
result.metadata["semantic_cached"] = True
|
|
249
|
+
return self._apply_validation_result(result, cached)
|
|
250
|
+
|
|
251
|
+
def clear_cache(self) -> None:
    """Drop every cached validation verdict held by this validator."""
    # Empty the existing dict in place rather than rebinding the attribute.
    cache = self._cache
    cache.clear()
|
|
254
|
+
|
|
255
|
+
def __repr__(self) -> str:
|
|
256
|
+
return f"SemanticValidator(prompt={self.prompt[:50]!r}..., criteria={len(self.criteria)})"
|
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: daytashield
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: The missing validation layer between unstructured data and AI systems
|
|
5
|
+
Project-URL: Homepage, https://github.com/daytashield/daytashield
|
|
6
|
+
Project-URL: Documentation, https://daytashield.dev/docs
|
|
7
|
+
Project-URL: Repository, https://github.com/daytashield/daytashield
|
|
8
|
+
Project-URL: Issues, https://github.com/daytashield/daytashield/issues
|
|
9
|
+
Author-email: DaytaShield Team <team@daytashield.dev>
|
|
10
|
+
License-Expression: Apache-2.0
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: ai,compliance,data-validation,gdpr,hipaa,langchain,llm,multimodal,rag
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
23
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
24
|
+
Classifier: Typing :: Typed
|
|
25
|
+
Requires-Python: >=3.9
|
|
26
|
+
Requires-Dist: click>=8.0
|
|
27
|
+
Requires-Dist: eval-type-backport>=0.2.0; python_version < '3.10'
|
|
28
|
+
Requires-Dist: jsonschema>=4.0
|
|
29
|
+
Requires-Dist: langchain-core>=0.1
|
|
30
|
+
Requires-Dist: litellm>=1.0
|
|
31
|
+
Requires-Dist: orjson>=3.9
|
|
32
|
+
Requires-Dist: pandas>=2.0
|
|
33
|
+
Requires-Dist: pdfplumber>=0.10
|
|
34
|
+
Requires-Dist: pydantic>=2.0
|
|
35
|
+
Requires-Dist: python-dateutil>=2.8
|
|
36
|
+
Requires-Dist: rich>=13.0
|
|
37
|
+
Provides-Extra: all
|
|
38
|
+
Requires-Dist: mkdocs-material>=9.0; extra == 'all'
|
|
39
|
+
Requires-Dist: mkdocs>=1.5; extra == 'all'
|
|
40
|
+
Requires-Dist: mkdocstrings[python]>=0.24; extra == 'all'
|
|
41
|
+
Requires-Dist: mypy>=1.0; extra == 'all'
|
|
42
|
+
Requires-Dist: pandas-stubs>=2.0; extra == 'all'
|
|
43
|
+
Requires-Dist: pillow; extra == 'all'
|
|
44
|
+
Requires-Dist: pre-commit>=3.0; extra == 'all'
|
|
45
|
+
Requires-Dist: pytesseract; extra == 'all'
|
|
46
|
+
Requires-Dist: pytest-asyncio>=0.21; extra == 'all'
|
|
47
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'all'
|
|
48
|
+
Requires-Dist: pytest>=7.0; extra == 'all'
|
|
49
|
+
Requires-Dist: ruff>=0.1; extra == 'all'
|
|
50
|
+
Provides-Extra: dev
|
|
51
|
+
Requires-Dist: mypy>=1.0; extra == 'dev'
|
|
52
|
+
Requires-Dist: pandas-stubs>=2.0; extra == 'dev'
|
|
53
|
+
Requires-Dist: pre-commit>=3.0; extra == 'dev'
|
|
54
|
+
Requires-Dist: pytest-asyncio>=0.21; extra == 'dev'
|
|
55
|
+
Requires-Dist: pytest-cov>=4.0; extra == 'dev'
|
|
56
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
57
|
+
Requires-Dist: ruff>=0.1; extra == 'dev'
|
|
58
|
+
Provides-Extra: docs
|
|
59
|
+
Requires-Dist: mkdocs-material>=9.0; extra == 'docs'
|
|
60
|
+
Requires-Dist: mkdocs>=1.5; extra == 'docs'
|
|
61
|
+
Requires-Dist: mkdocstrings[python]>=0.24; extra == 'docs'
|
|
62
|
+
Provides-Extra: ocr
|
|
63
|
+
Requires-Dist: pillow; extra == 'ocr'
|
|
64
|
+
Requires-Dist: pytesseract; extra == 'ocr'
|
|
65
|
+
Description-Content-Type: text/markdown
|
|
66
|
+
|
|
67
|
+
# 🛡️ DaytaShield
|
|
68
|
+
|
|
69
|
+
[PyPI version](https://badge.fury.io/py/daytashield)
[Python downloads](https://www.python.org/downloads/)
[License: Apache 2.0](https://opensource.org/licenses/Apache-2.0)
|
|
72
|
+
|
|
73
|
+
**The missing validation layer between unstructured data and AI systems.**
|
|
74
|
+
|
|
75
|
+
DaytaShield validates multimodal data (PDFs, CSVs, JSON, images) before it reaches your RAG pipelines, AI agents, or analytics systems. Stop hallucinations at the source.
|
|
76
|
+
|
|
77
|
+
## 🚀 Quick Start
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install daytashield
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from daytashield import ValidationPipeline, SchemaValidator, FreshnessValidator
|
|
85
|
+
|
|
86
|
+
# Create a validation pipeline
|
|
87
|
+
pipeline = ValidationPipeline([
|
|
88
|
+
SchemaValidator(schema={"type": "object", "required": ["id", "content"]}),
|
|
89
|
+
FreshnessValidator(max_age="7d"),
|
|
90
|
+
])
|
|
91
|
+
|
|
92
|
+
# Validate your data
|
|
93
|
+
result = pipeline.validate({
|
|
94
|
+
"id": 1,
|
|
95
|
+
"content": "Hello world",
|
|
96
|
+
"timestamp": "2024-01-15"
|
|
97
|
+
})
|
|
98
|
+
|
|
99
|
+
print(result.status) # ValidationStatus.PASSED
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## ✨ Features
|
|
103
|
+
|
|
104
|
+
- **📋 Schema Validation** - JSON Schema + Pydantic model validation
|
|
105
|
+
- **🧠 Semantic Validation** - LLM-powered content validation
|
|
106
|
+
- **⏰ Freshness Checks** - Detect stale data before it causes problems
|
|
107
|
+
- **🔒 Compliance Rules** - Built-in HIPAA, GDPR, and PII detection
|
|
108
|
+
- **📄 Document Processing** - PDF, CSV, JSON extraction and validation
|
|
109
|
+
- **🔗 LangChain Integration** - Validated retrievers for RAG pipelines
|
|
110
|
+
- **📝 Audit Trail** - Immutable logging for compliance
|
|
111
|
+
|
|
112
|
+
## 📖 Usage
|
|
113
|
+
|
|
114
|
+
### Validate Files
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from daytashield import ValidationPipeline, SchemaValidator, PDFProcessor
|
|
118
|
+
|
|
119
|
+
# Create pipeline with processors
|
|
120
|
+
pipeline = ValidationPipeline([
|
|
121
|
+
SchemaValidator(schema=invoice_schema),
|
|
122
|
+
])
|
|
123
|
+
pipeline.add_processor(".pdf", PDFProcessor())
|
|
124
|
+
|
|
125
|
+
# Validate a PDF
|
|
126
|
+
result = pipeline.validate_file("invoice.pdf")
|
|
127
|
+
if result.failed:
|
|
128
|
+
for error in result.errors:
|
|
129
|
+
print(f"Error: {error.message}")
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
### Compliance Checking
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
from daytashield import ValidationPipeline, ComplianceValidator
|
|
136
|
+
|
|
137
|
+
# Check for HIPAA and PII violations
|
|
138
|
+
pipeline = ValidationPipeline([
|
|
139
|
+
ComplianceValidator(rules=["hipaa", "pii"]),
|
|
140
|
+
])
|
|
141
|
+
|
|
142
|
+
result = pipeline.validate(patient_data)
|
|
143
|
+
for message in result.messages:
|
|
144
|
+
print(f"{message.severity}: {message.message}")
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### LangChain Integration
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
from langchain_community.vectorstores import FAISS
|
|
151
|
+
from daytashield import SchemaValidator, FreshnessValidator
|
|
152
|
+
from daytashield.integrations.langchain import ValidatedRetriever
|
|
153
|
+
|
|
154
|
+
# Wrap your retriever with validation
|
|
155
|
+
retriever = ValidatedRetriever(
|
|
156
|
+
base_retriever=vectorstore.as_retriever(),
|
|
157
|
+
validators=[
|
|
158
|
+
SchemaValidator(schema=doc_schema),
|
|
159
|
+
FreshnessValidator(max_age="7d"),
|
|
160
|
+
],
|
|
161
|
+
on_fail="filter", # Remove invalid documents
|
|
162
|
+
)
|
|
163
|
+
|
|
164
|
+
# Use like any LangChain retriever
|
|
165
|
+
docs = retriever.invoke("What is the refund policy?")
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### Routing Based on Validation
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
from daytashield import ValidationPipeline, DataRouter, RouteAction
|
|
172
|
+
|
|
173
|
+
pipeline = ValidationPipeline([...])
|
|
174
|
+
router = DataRouter()
|
|
175
|
+
|
|
176
|
+
result = pipeline.validate(data)
|
|
177
|
+
decision = router.route(result)
|
|
178
|
+
|
|
179
|
+
if decision.route.action == RouteAction.PASS:
|
|
180
|
+
send_to_destination(result.data)
|
|
181
|
+
elif decision.route.action == RouteAction.QUARANTINE:
|
|
182
|
+
quarantine_for_review(result.data, decision.reason)
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## 🖥️ CLI
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
# Validate files
|
|
189
|
+
daytashield validate invoice.pdf --schema invoice.json
|
|
190
|
+
|
|
191
|
+
# Validate with compliance rules
|
|
192
|
+
daytashield validate ./data/ --rules hipaa --rules pii
|
|
193
|
+
|
|
194
|
+
# Watch directory for new files
|
|
195
|
+
daytashield watch ./incoming/ --rules hipaa --audit audit.jsonl
|
|
196
|
+
|
|
197
|
+
# Query audit log
|
|
198
|
+
daytashield audit audit.jsonl --status failed --limit 10
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## 📦 Validators
|
|
202
|
+
|
|
203
|
+
| Validator | Description |
|
|
204
|
+
|-----------|-------------|
|
|
205
|
+
| `SchemaValidator` | JSON Schema and Pydantic validation |
|
|
206
|
+
| `SemanticValidator` | LLM-based content validation |
|
|
207
|
+
| `FreshnessValidator` | Timestamp and staleness checks |
|
|
208
|
+
| `ComplianceValidator` | HIPAA, GDPR, PII rule enforcement |
|
|
209
|
+
|
|
210
|
+
## 📄 Processors
|
|
211
|
+
|
|
212
|
+
| Processor | Formats | Description |
|
|
213
|
+
|-----------|---------|-------------|
|
|
214
|
+
| `PDFProcessor` | `.pdf` | Text extraction with pdfplumber |
|
|
215
|
+
| `CSVProcessor` | `.csv`, `.tsv` | Tabular data with pandas |
|
|
216
|
+
| `JSONProcessor` | `.json`, `.jsonl` | Structured data with orjson |
|
|
217
|
+
|
|
218
|
+
## 🔒 Compliance Rules
|
|
219
|
+
|
|
220
|
+
| Rule Pack | Coverage |
|
|
221
|
+
|-----------|----------|
|
|
222
|
+
| `hipaa` | PHI detection, medical record numbers, health plan IDs |
|
|
223
|
+
| `gdpr` | Consent checking, special category data, data minimization |
|
|
224
|
+
| `pii` | SSN, credit cards, emails, phone numbers, IP addresses |
|
|
225
|
+
|
|
226
|
+
## 📝 Audit Trail
|
|
227
|
+
|
|
228
|
+
DaytaShield maintains an immutable audit log of all validation operations:
|
|
229
|
+
|
|
230
|
+
```python
|
|
231
|
+
from daytashield import AuditTrail, ValidationPipeline
from daytashield.core.result import ValidationStatus
|
|
232
|
+
|
|
233
|
+
# Enable audit logging
|
|
234
|
+
audit = AuditTrail("./audit.jsonl")
|
|
235
|
+
pipeline = ValidationPipeline([...])
|
|
236
|
+
|
|
237
|
+
result = pipeline.validate(data)
|
|
238
|
+
audit.log(result)
|
|
239
|
+
|
|
240
|
+
# Query the audit trail
|
|
241
|
+
for entry in audit.query(status=ValidationStatus.FAILED):
|
|
242
|
+
print(f"Failed: {entry.source_id} at {entry.timestamp}")
|
|
243
|
+
|
|
244
|
+
# Get statistics
|
|
245
|
+
stats = audit.stats()
|
|
246
|
+
print(f"Pass rate: {stats['by_status']['passed'] / stats['total'] * 100:.1f}%")
|
|
247
|
+
```
|
|
248
|
+
|
|
249
|
+
## 🏗️ Architecture
|
|
250
|
+
|
|
251
|
+
```
|
|
252
|
+
┌─────────────┐     ┌─────────────┐     ┌─────────────┐
│   Source    │────▶│  Processor  │────▶│ Validators  │
│ PDF/CSV/JSON│     │   Extract   │     │   Schema    │
└─────────────┘     └─────────────┘     │  Semantic   │
                                        │  Freshness  │
                                        │ Compliance  │
                                        └──────┬──────┘
                                               │
                    ┌─────────────┐     ┌──────▼──────┐
                    │    Audit    │◀────│   Router    │
                    │    Trail    │     │  Pass/Warn  │
                    └─────────────┘     │    /Fail    │
                                        └─────────────┘
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
## 🔧 Configuration
|
|
268
|
+
|
|
269
|
+
```python
|
|
270
|
+
from daytashield import ValidationPipeline, PipelineConfig
|
|
271
|
+
|
|
272
|
+
pipeline = ValidationPipeline(
|
|
273
|
+
validators=[...],
|
|
274
|
+
config=PipelineConfig(
|
|
275
|
+
fail_fast=True, # Stop on first failure
|
|
276
|
+
include_original_data=True, # Keep original data in result
|
|
277
|
+
auto_detect_processor=True, # Auto-select processor by extension
|
|
278
|
+
),
|
|
279
|
+
)
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
## 🤝 Contributing
|
|
283
|
+
|
|
284
|
+
We welcome contributions! Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
|
285
|
+
|
|
286
|
+
```bash
|
|
287
|
+
# Clone the repo
|
|
288
|
+
git clone https://github.com/daytashield/daytashield.git
|
|
289
|
+
cd daytashield
|
|
290
|
+
|
|
291
|
+
# Install dev dependencies
|
|
292
|
+
pip install -e ".[dev]"
|
|
293
|
+
|
|
294
|
+
# Run tests
|
|
295
|
+
pytest
|
|
296
|
+
|
|
297
|
+
# Run linting
|
|
298
|
+
ruff check src tests
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
## 📄 License
|
|
302
|
+
|
|
303
|
+
Apache 2.0 - see [LICENSE](LICENSE) for details.
|
|
304
|
+
|
|
305
|
+
## 🔗 Links
|
|
306
|
+
|
|
307
|
+
- [Documentation](https://daytashield.dev/docs)
|
|
308
|
+
- [PyPI](https://pypi.org/project/daytashield/)
|
|
309
|
+
- [GitHub](https://github.com/daytashield/daytashield)
|
|
310
|
+
- [Discord](https://discord.gg/daytashield)
|
|
311
|
+
|
|
312
|
+
---
|
|
313
|
+
|
|
314
|
+
**Built with ❤️ for the AI community**
|
|
315
|
+
|
|
316
|
+
*Stop bad data at the source. Validate before you hallucinate.*
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
daytashield/__init__.py,sha256=ibXIZN0DYPLL6x4Et-zEWZOn3tdf3zekt4gRFHRZDds,2109
|
|
2
|
+
daytashield/cli/__init__.py,sha256=KGux-93hICSzY5wTnajn7z7S0rteqHk0G9LFgHIP_CY,99
|
|
3
|
+
daytashield/cli/main.py,sha256=Uzxk89lSBqUrYlJQV4NfH1-jgObUazhKuI0wCTa6bIs,15220
|
|
4
|
+
daytashield/core/__init__.py,sha256=swtK_Uxj6eet25AjcEJNTLP9k85HWdOTfDsjVLmmW4A,443
|
|
5
|
+
daytashield/core/audit.py,sha256=cTQYG3OUTmR-R7BQFHm9BwlX-CxfPfF3VjcdHl59u68,9232
|
|
6
|
+
daytashield/core/pipeline.py,sha256=tBuROvwVf0E68cQ0FrdqmKlQgtAPkonmjceIVRueCK0,8383
|
|
7
|
+
daytashield/core/result.py,sha256=DRqkob6NYKQW_tdZMXVes_KCtW_1TSS1eLW_IBRTwO8,6833
|
|
8
|
+
daytashield/core/router.py,sha256=szWpc7NTxyZ4iocbnrxzt0bGf5KgJ9CFYboyKDWKC4o,6972
|
|
9
|
+
daytashield/integrations/__init__.py,sha256=D3uD4WyCMM-UqaEgbfDlQVaEPjDbzwSUQUyOaKTVfwY,165
|
|
10
|
+
daytashield/integrations/langchain.py,sha256=x5aZwHkX0VFeGgq-OeBWGZULyoQQ2cq31jxpiuDzSKM,13340
|
|
11
|
+
daytashield/processors/__init__.py,sha256=8l7q8lzfwKes4FKzbUy0OwmGGlaojFjW_p3l33NRrpE,377
|
|
12
|
+
daytashield/processors/base.py,sha256=HIX_Qd-tinjkZLe-fSV46BeeH-lI1aJAI0dnlBh7EFo,6055
|
|
13
|
+
daytashield/processors/csv.py,sha256=IpbjsuXwc9jjSMO551-gq2SVYrxENapJuSszxlOYrk0,9151
|
|
14
|
+
daytashield/processors/json.py,sha256=gJAb4IhEJwt_U0NQ7du9LPC7HFfMhJzOpGIF0XcsPpU,8723
|
|
15
|
+
daytashield/processors/pdf.py,sha256=vGkD_q1Youib7YTsrCfmpoEvU6LBRXlGIOkgPmGDMUk,7698
|
|
16
|
+
daytashield/rules/__init__.py,sha256=lcmwGft7iVkI2Rplr6p7HxGBcuW1hv5PzGQmQA9bx3Q,369
|
|
17
|
+
daytashield/rules/base.py,sha256=fsliG9OzPfWO67mcUJqNmIchHD8AGr_ITB1Qmk1Sa_w,2375
|
|
18
|
+
daytashield/rules/gdpr.py,sha256=zzZFMZlfiE1uXa82Ekr0eabcJVOs54YmkEDGUNtvF14,11403
|
|
19
|
+
daytashield/rules/hipaa.py,sha256=F3sFcAano34CVp_Gbxzq0hpDYLx8ULnVs-xvYHYR6zc,8006
|
|
20
|
+
daytashield/rules/pii.py,sha256=YAvkD7yVDYNCHWJcuBTBKW6XuCBIV4LgmP9TJYlTTPI,7644
|
|
21
|
+
daytashield/validators/__init__.py,sha256=jb9YIiP8smesZW0lvfoYioSOJFJSjpNXG-TofU3mi80,499
|
|
22
|
+
daytashield/validators/base.py,sha256=E_qB174VaTxAJXYNj033CCo2MCAQCibreuaZeqmp2e8,3417
|
|
23
|
+
daytashield/validators/compliance.py,sha256=aX3sB6rTOklMmSSXZTfXImaqN26xSLsoyU5S7Jcw7vA,7781
|
|
24
|
+
daytashield/validators/freshness.py,sha256=U5Ta8vIWJErOJz1Ahodw2xpMGgTraJq7gxMnYs8yqQI,11279
|
|
25
|
+
daytashield/validators/schema.py,sha256=vJI5V_02PSq-L61fVVptr6M27C1EeBYoLmrqLiLz4M4,5948
|
|
26
|
+
daytashield/validators/semantic.py,sha256=cgTGALC7Jvqnzt_8RanyDBBspKij8LQ4GbcWR5RPCSI,9069
|
|
27
|
+
daytashield-0.1.1.dist-info/METADATA,sha256=wskiF0EK5H9Xh7BbXA2vTc77BEpXontsHxiyMgP-j5g,10320
|
|
28
|
+
daytashield-0.1.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
29
|
+
daytashield-0.1.1.dist-info/entry_points.txt,sha256=HDyDPfM8S9Vllxik1fCxrTQ3CAH5UKUDznqlPTVzTkk,57
|
|
30
|
+
daytashield-0.1.1.dist-info/licenses/LICENSE,sha256=OkFgJ1ml3343f-jCqVLj11Ihv4ieJrIr6leY8AiYTes,10764
|
|
31
|
+
daytashield-0.1.1.dist-info/RECORD,,
|