daytashield 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,55 @@
1
+ """DaytaShield: The missing validation layer between unstructured data and AI systems.
2
+
3
+ DaytaShield validates multimodal data before it reaches RAG/agents/analytics:
4
+ - Validates: Schema, semantic (LLM-based), freshness, compliance (HIPAA, GDPR)
5
+ - Cleans: Fixes OCR errors, missing fields, stale data automatically
6
+ - Routes: Pass → Destination | Warning → Review | Fail → Quarantine
7
+ - Tracks: Immutable audit trail (provenance tracking)
8
+ - Integrates: LangChain, RAGFlow, Unstructured.io, Anthropic MCP
9
+
10
+ Example:
11
+ >>> from daytashield import ValidationPipeline, SchemaValidator, FreshnessValidator
12
+ >>> pipeline = ValidationPipeline([
13
+ ... SchemaValidator(schema={"type": "object", "required": ["id", "content"]}),
14
+ ... FreshnessValidator(max_age="7d"),
15
+ ... ])
16
+ >>> result = pipeline.validate({"id": 1, "content": "Hello", "timestamp": "2024-01-01"})
17
+ >>> print(result.status)
18
+ ValidationStatus.PASSED
19
+ """
20
+
21
+ from daytashield.core.audit import AuditTrail
22
+ from daytashield.core.pipeline import ValidationPipeline
23
+ from daytashield.core.result import ValidationResult, ValidationStatus
24
+ from daytashield.core.router import DataRouter, RouteAction
25
+ from daytashield.processors.base import BaseProcessor
26
+ from daytashield.processors.csv import CSVProcessor
27
+ from daytashield.processors.json import JSONProcessor
28
+ from daytashield.processors.pdf import PDFProcessor
29
+ from daytashield.validators.base import BaseValidator
30
+ from daytashield.validators.compliance import ComplianceValidator
31
+ from daytashield.validators.freshness import FreshnessValidator
32
+ from daytashield.validators.schema import SchemaValidator
33
+ from daytashield.validators.semantic import SemanticValidator
34
+
35
+ __version__ = "0.1.1"
36
+ __all__ = [
37
+ # Core
38
+ "ValidationPipeline",
39
+ "ValidationResult",
40
+ "ValidationStatus",
41
+ "DataRouter",
42
+ "RouteAction",
43
+ "AuditTrail",
44
+ # Validators
45
+ "BaseValidator",
46
+ "SchemaValidator",
47
+ "SemanticValidator",
48
+ "FreshnessValidator",
49
+ "ComplianceValidator",
50
+ # Processors
51
+ "BaseProcessor",
52
+ "PDFProcessor",
53
+ "CSVProcessor",
54
+ "JSONProcessor",
55
+ ]
@@ -0,0 +1,5 @@
1
+ """DaytaShield command-line interface."""
2
+
3
+ from daytashield.cli.main import cli
4
+
5
+ __all__ = ["cli"]
@@ -0,0 +1,541 @@
1
+ """DaytaShield command-line interface."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import sys
7
+ import time
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ import click
12
+ from rich.console import Console
13
+ from rich.panel import Panel
14
+ from rich.progress import Progress, SpinnerColumn, TextColumn
15
+ from rich.table import Table
16
+ from rich.tree import Tree
17
+
18
+ from daytashield import __version__
19
+ from daytashield.core.audit import AuditTrail
20
+ from daytashield.core.pipeline import ValidationPipeline
21
+ from daytashield.core.result import ValidationResult, ValidationStatus
22
+ from daytashield.core.router import DataRouter, RouteAction
23
+ from daytashield.processors.csv import CSVProcessor
24
+ from daytashield.processors.json import JSONProcessor
25
+ from daytashield.processors.pdf import PDFProcessor
26
+ from daytashield.validators.compliance import ComplianceValidator
27
+ from daytashield.validators.freshness import FreshnessValidator
28
+ from daytashield.validators.schema import SchemaValidator
29
+
30
console = Console()  # Module-wide Rich console shared by all CLI commands.
31
+
32
+
33
def get_status_style(status: ValidationStatus) -> str:
    """Return the Rich markup style used to render *status*.

    Unknown statuses fall through to an empty style string.
    """
    if status is ValidationStatus.PASSED:
        return "bold green"
    if status is ValidationStatus.WARNING:
        return "bold yellow"
    # Failures and hard errors share the same alarming style.
    if status in (ValidationStatus.FAILED, ValidationStatus.ERROR):
        return "bold red"
    if status is ValidationStatus.SKIPPED:
        return "dim"
    return ""
43
+
44
+
45
def format_duration(ms: float | None) -> str:
    """Render a duration in milliseconds as a short human-readable string.

    Returns "-" for missing durations, "<x>.<y>ms" below one second,
    and seconds with two decimals otherwise.
    """
    if ms is None:
        return "-"
    return f"{ms:.1f}ms" if ms < 1000 else f"{ms / 1000:.2f}s"
52
+
53
+
54
@click.group()
@click.version_option(version=__version__, prog_name="daytashield")
def cli() -> None:
    """DaytaShield: Validate multimodal data for AI systems.

    The missing validation layer between unstructured data and AI.
    """
    # Click renders this docstring as the top-level --help text;
    # subcommands register themselves via @cli.command() below.
    pass
62
+
63
+
64
@cli.command()
@click.argument("paths", nargs=-1, required=True, type=click.Path(exists=True))
@click.option(
    "--schema",
    "-s",
    type=click.Path(exists=True),
    help="JSON Schema file for validation",
)
@click.option(
    "--rules",
    "-r",
    multiple=True,
    type=click.Choice(["hipaa", "gdpr", "pii"]),
    help="Compliance rules to apply",
)
@click.option(
    "--max-age",
    "-a",
    type=str,
    help="Maximum data age (e.g., 7d, 2w, 1M)",
)
@click.option(
    "--output",
    "-o",
    type=click.Path(),
    help="Output file for results (JSON)",
)
@click.option(
    "--format",
    "-f",
    "output_format",
    type=click.Choice(["table", "json", "summary"]),
    default="table",
    help="Output format",
)
@click.option(
    "--fail-fast",
    is_flag=True,
    help="Stop on first validation failure",
)
@click.option(
    "--quiet",
    "-q",
    is_flag=True,
    help="Suppress output except errors",
)
def validate(
    paths: tuple[str, ...],
    schema: str | None,
    rules: tuple[str, ...],
    max_age: str | None,
    output: str | None,
    output_format: str,
    fail_fast: bool,
    quiet: bool,
) -> None:
    """Validate files against schema and compliance rules.

    Examples:

        daytashield validate invoice.pdf --schema invoice.json

        daytashield validate ./data/ --rules hipaa --rules pii

        daytashield validate report.csv --max-age 7d --output results.json
    """
    # Assemble validators/processors from the CLI flags.
    pipeline = _build_pipeline(schema, rules, max_age, fail_fast)

    # Expand files and directories into the concrete file list.
    files = _collect_files(paths)

    if not files:
        console.print("[yellow]No files found to validate[/yellow]")
        return

    if not quiet:
        console.print(f"\n[bold]Validating {len(files)} file(s)...[/bold]\n")

    # Validate each file, showing a spinner unless --quiet was given.
    results: list[tuple[Path, ValidationResult]] = []
    start_time = time.time()

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console,
        disable=quiet,
    ) as progress:
        task = progress.add_task("Validating...", total=len(files))

        for file_path in files:
            progress.update(task, description=f"Validating {file_path.name}...")
            result = pipeline.validate_file(file_path)
            results.append((file_path, result))
            progress.advance(task)

    total_time = time.time() - start_time

    # Render results in the requested format.  In "json" mode the
    # renderer also handles writing to --output directly.
    if output_format == "json":
        _output_json(results, output)
    elif output_format == "summary":
        _output_summary(results, total_time, quiet)
    else:
        _output_table(results, total_time, quiet)

    # Persist results when --output was given.  FIX: previously the file
    # was written twice in "json" mode (once by _output_json, once here);
    # skip the redundant second write of identical content.
    if output:
        if output_format != "json":
            _write_output_file(results, output)
        if not quiet:
            console.print(f"\n[dim]Results written to {output}[/dim]")

    # Non-zero exit status signals failures to scripts/CI.
    failed_count = sum(1 for _, r in results if r.failed)
    if failed_count > 0:
        sys.exit(1)
181
+
182
+
183
@cli.command()
@click.argument("directory", type=click.Path(exists=True, file_okay=False))
@click.option(
    "--schema",
    "-s",
    type=click.Path(exists=True),
    help="JSON Schema file for validation",
)
@click.option(
    "--rules",
    "-r",
    multiple=True,
    type=click.Choice(["hipaa", "gdpr", "pii"]),
    help="Compliance rules to apply",
)
@click.option(
    "--pattern",
    "-p",
    default="*",
    help="File pattern to watch (e.g., *.pdf)",
)
@click.option(
    "--audit",
    type=click.Path(),
    help="Audit log file path",
)
def watch(
    directory: str,
    schema: str | None,
    rules: tuple[str, ...],
    pattern: str,
    audit: str | None,
) -> None:
    """Watch a directory for new files and validate them.

    Example:

        daytashield watch ./incoming/ --rules hipaa --audit ./audit.jsonl
    """
    # watchdog is an optional dependency; exit with install advice if missing.
    try:
        from watchdog.events import FileSystemEventHandler
        from watchdog.observers import Observer
    except ImportError:
        console.print(
            "[red]watchdog package required for watch mode.[/red]\n"
            "Install with: pip install watchdog"
        )
        sys.exit(1)

    # Watch mode never uses a freshness window (max_age=None) or fail-fast.
    pipeline = _build_pipeline(schema, rules, None, False)
    audit_trail = AuditTrail(audit) if audit else None
    router = DataRouter()

    console.print(f"\n[bold]Watching {directory} for {pattern} files...[/bold]")
    console.print("[dim]Press Ctrl+C to stop[/dim]\n")

    # Closure over pipeline/router/audit_trail: validates each new file
    # matching the glob pattern and reports its routing decision.
    class ValidationHandler(FileSystemEventHandler):
        def on_created(self, event: Any) -> None:
            if event.is_directory:
                return

            file_path = Path(event.src_path)
            if not file_path.match(pattern):
                return

            console.print(f"[cyan]New file: {file_path.name}[/cyan]")

            result = pipeline.validate_file(file_path)
            decision = router.route(result)

            status_style = get_status_style(result.status)
            console.print(
                f" Status: [{status_style}]{result.status.value}[/{status_style}] "
                f"→ {decision.route.action.value}"
            )

            # Show at most the first three validation messages.
            if result.messages:
                for msg in result.messages[:3]:
                    console.print(f" • {msg}")

            if audit_trail:
                audit_trail.log(result)

    handler = ValidationHandler()
    observer = Observer()
    # recursive=True: subdirectories of the watched directory are included.
    observer.schedule(handler, directory, recursive=True)
    observer.start()

    # Block until Ctrl+C, then stop cleanly and flush any buffered audit entries.
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        observer.stop()
        if audit_trail:
            audit_trail.flush()
        console.print("\n[dim]Stopped watching[/dim]")

    observer.join()
281
+
282
+
283
@cli.command()
@click.argument("audit_file", type=click.Path(exists=True))
@click.option(
    "--status",
    "-s",
    type=click.Choice(["passed", "warning", "failed", "error"]),
    help="Filter by status",
)
@click.option(
    "--limit",
    "-n",
    type=int,
    default=20,
    help="Maximum entries to show",
)
@click.option(
    "--stats",
    is_flag=True,
    help="Show statistics only",
)
def audit(
    audit_file: str,
    status: str | None,
    limit: int,
    stats: bool,
) -> None:
    """Query the audit trail.

    Example:

        daytashield audit ./audit.jsonl --status failed --limit 10
    """
    trail = AuditTrail(audit_file)

    # --stats: print aggregate counters and return without listing entries.
    if stats:
        audit_stats = trail.stats()

        table = Table(title="Audit Statistics")
        table.add_column("Metric", style="cyan")
        table.add_column("Value", style="green")

        table.add_row("Total Validations", str(audit_stats["total"]))
        table.add_row("Passed", str(audit_stats["by_status"]["passed"]))
        table.add_row("Warnings", str(audit_stats["by_status"]["warning"]))
        table.add_row("Failed", str(audit_stats["by_status"]["failed"]))
        table.add_row("Errors", str(audit_stats["by_status"]["error"]))
        table.add_row("Avg Duration", f"{audit_stats['avg_duration_ms']:.1f}ms")

        console.print(table)
        return

    # List entries, optionally filtered by status, capped at --limit.
    # The CLI status choice strings map 1:1 onto ValidationStatus values.
    status_filter = ValidationStatus(status) if status else None
    entries = list(trail.query(status=status_filter, limit=limit))

    if not entries:
        console.print("[yellow]No entries found[/yellow]")
        return

    table = Table(title=f"Audit Log ({len(entries)} entries)")
    table.add_column("Time", style="dim")
    table.add_column("Source")
    table.add_column("Status")
    table.add_column("Messages")
    table.add_column("Duration")

    for entry in entries:
        status_style = get_status_style(entry.status)
        table.add_row(
            entry.timestamp.strftime("%Y-%m-%d %H:%M"),
            entry.source_id or "-",
            f"[{status_style}]{entry.status.value}[/{status_style}]",
            str(entry.message_count),
            format_duration(entry.duration_ms),
        )

    console.print(table)
360
+
361
+
362
@cli.command()
def info() -> None:
    """Show DaytaShield configuration and status."""
    # Section title -> leaf labels, rendered as a Rich tree below.
    sections: tuple[tuple[str, tuple[str, ...]], ...] = (
        ("Version", (f"daytashield: {__version__}",)),
        (
            "Validators",
            (
                "SchemaValidator - JSON Schema / Pydantic validation",
                "SemanticValidator - LLM-based semantic validation",
                "FreshnessValidator - Timestamp/staleness checks",
                "ComplianceValidator - HIPAA/GDPR/PII rules",
            ),
        ),
        (
            "Processors",
            (
                "PDFProcessor - PDF text extraction",
                "CSVProcessor - CSV/TSV parsing",
                "JSONProcessor - JSON/JSONL parsing",
            ),
        ),
        (
            "Compliance Rules",
            (
                "hipaa - HIPAA PHI detection",
                "gdpr - GDPR data protection",
                "pii - PII pattern detection",
            ),
        ),
    )

    tree = Tree("[bold]DaytaShield[/bold]")
    for title, leaves in sections:
        branch = tree.add(f"[cyan]{title}[/cyan]")
        for leaf in leaves:
            branch.add(leaf)

    console.print(tree)
391
+
392
+
393
def _build_pipeline(
    schema: str | None,
    rules: tuple[str, ...],
    max_age: str | None,
    fail_fast: bool,
) -> ValidationPipeline:
    """Assemble a ValidationPipeline from CLI options.

    Each option that was supplied contributes one validator; the standard
    file processors are always registered.
    """
    validators = []

    # JSON Schema validation, loaded from the given schema file.
    if schema:
        validators.append(SchemaValidator(schema=json.loads(Path(schema).read_text())))

    # Staleness checks (e.g. "7d").
    if max_age:
        validators.append(FreshnessValidator(max_age=max_age))

    # Compliance rule checks (hipaa/gdpr/pii).
    if rules:
        validators.append(ComplianceValidator(rules=list(rules)))

    pipeline = ValidationPipeline(
        validators=validators,
        config={"fail_fast": fail_fast},
    )

    # Register a processor per supported file extension.
    for extension, processor in (
        (".pdf", PDFProcessor()),
        (".csv", CSVProcessor()),
        (".tsv", CSVProcessor({"delimiter": "\t"})),
        (".json", JSONProcessor()),
        (".jsonl", JSONProcessor()),
    ):
        pipeline.add_processor(extension, processor)

    return pipeline
430
+
431
+
432
def _collect_files(paths: tuple[str, ...]) -> list[Path]:
    """Expand files and directories into a sorted, de-duplicated file list.

    Explicit file paths are taken as-is; directories are searched
    recursively for the supported extensions only.
    """
    supported_extensions = {".pdf", ".csv", ".tsv", ".json", ".jsonl"}
    collected: set[Path] = set()

    for raw in paths:
        candidate = Path(raw)
        if candidate.is_dir():
            for ext in supported_extensions:
                collected.update(candidate.rglob(f"*{ext}"))
        elif candidate.is_file():
            collected.add(candidate)

    return sorted(collected)
446
+
447
+
448
def _output_table(
    results: list[tuple[Path, ValidationResult]],
    total_time: float,
    quiet: bool,
) -> None:
    """Render results as a Rich table followed by a one-line summary.

    No-op when quiet mode is active.
    """
    if quiet:
        return

    table = Table(title="Validation Results")
    table.add_column("File", style="cyan")
    table.add_column("Status")
    table.add_column("Messages")
    table.add_column("Duration")

    # Build the rows and tally the summary counts in a single pass.
    passed = warnings = failed = 0
    for path, res in results:
        style = get_status_style(res.status)
        table.add_row(
            path.name,
            f"[{style}]{res.status.value}[/{style}]",
            str(len(res.messages)),
            format_duration(res.duration_ms),
        )
        if res.status == ValidationStatus.PASSED:
            passed += 1
        elif res.status == ValidationStatus.WARNING:
            warnings += 1
        if res.failed:
            failed += 1

    console.print(table)

    console.print(
        f"\n[bold]Summary:[/bold] "
        f"[green]{passed} passed[/green], "
        f"[yellow]{warnings} warnings[/yellow], "
        f"[red]{failed} failed[/red] "
        f"[dim]({total_time:.2f}s)[/dim]"
    )
486
+
487
+
488
def _output_summary(
    results: list[tuple[Path, ValidationResult]],
    total_time: float,
    quiet: bool,
) -> None:
    """Print a single-line pass/warn/fail summary (suppressed when quiet)."""
    if quiet:
        return

    statuses = [res.status for _, res in results]
    passed = statuses.count(ValidationStatus.PASSED)
    warnings = statuses.count(ValidationStatus.WARNING)
    failed = sum(1 for _, res in results if res.failed)

    console.print(
        f"Validated {len(results)} files: "
        f"{passed} passed, {warnings} warnings, {failed} failed "
        f"({total_time:.2f}s)"
    )
504
+
505
+
506
def _output_json(
    results: list[tuple[Path, ValidationResult]],
    output: str | None,
) -> None:
    """Emit results as JSON — to *output* if given, else to the console.

    default=str keeps non-JSON-native values (paths, timestamps) serializable.
    """
    payload = []
    for file_path, result in results:
        payload.append({"file": str(file_path), "result": result.to_dict()})

    if output:
        Path(output).write_text(json.dumps(payload, indent=2, default=str))
    else:
        console.print_json(json.dumps(payload, default=str))
523
+
524
+
525
def _write_output_file(
    results: list[tuple[Path, ValidationResult]],
    output: str,
) -> None:
    """Serialize results to *output* as indented JSON.

    default=str keeps non-JSON-native values (paths, timestamps) serializable.
    """
    records = [
        {"file": str(file_path), "result": result.to_dict()}
        for file_path, result in results
    ]
    Path(output).write_text(json.dumps(records, indent=2, default=str))
538
+
539
+
540
+ if __name__ == "__main__":
541
+ cli()
@@ -0,0 +1,15 @@
1
+ """Core DaytaShield components for validation orchestration."""
2
+
3
+ from daytashield.core.audit import AuditTrail
4
+ from daytashield.core.pipeline import ValidationPipeline
5
+ from daytashield.core.result import ValidationResult, ValidationStatus
6
+ from daytashield.core.router import DataRouter, RouteAction
7
+
8
+ __all__ = [
9
+ "ValidationPipeline",
10
+ "ValidationResult",
11
+ "ValidationStatus",
12
+ "DataRouter",
13
+ "RouteAction",
14
+ "AuditTrail",
15
+ ]