daytashield-0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- daytashield/__init__.py +55 -0
- daytashield/cli/__init__.py +5 -0
- daytashield/cli/main.py +541 -0
- daytashield/core/__init__.py +15 -0
- daytashield/core/audit.py +275 -0
- daytashield/core/pipeline.py +240 -0
- daytashield/core/result.py +185 -0
- daytashield/core/router.py +217 -0
- daytashield/integrations/__init__.py +7 -0
- daytashield/integrations/langchain.py +391 -0
- daytashield/processors/__init__.py +13 -0
- daytashield/processors/base.py +182 -0
- daytashield/processors/csv.py +269 -0
- daytashield/processors/json.py +260 -0
- daytashield/processors/pdf.py +232 -0
- daytashield/rules/__init__.py +14 -0
- daytashield/rules/base.py +67 -0
- daytashield/rules/gdpr.py +348 -0
- daytashield/rules/hipaa.py +229 -0
- daytashield/rules/pii.py +208 -0
- daytashield/validators/__init__.py +15 -0
- daytashield/validators/base.py +103 -0
- daytashield/validators/compliance.py +222 -0
- daytashield/validators/freshness.py +337 -0
- daytashield/validators/schema.py +176 -0
- daytashield/validators/semantic.py +256 -0
- daytashield-0.1.1.dist-info/METADATA +316 -0
- daytashield-0.1.1.dist-info/RECORD +31 -0
- daytashield-0.1.1.dist-info/WHEEL +4 -0
- daytashield-0.1.1.dist-info/entry_points.txt +2 -0
- daytashield-0.1.1.dist-info/licenses/LICENSE +190 -0
daytashield/processors/json.py
@@ -0,0 +1,260 @@
"""JSON file processor."""

from __future__ import annotations

from pathlib import Path
from typing import Any, BinaryIO

from pydantic import Field

from daytashield.core.result import ValidationResult, ValidationStatus
from daytashield.processors.base import BaseProcessor, ProcessedData, ProcessorConfig


class JSONProcessorConfig(ProcessorConfig):
    """Configuration for JSON processing."""

    encoding: str = Field("utf-8", description="File encoding")
    allow_comments: bool = Field(False, description="Allow JSON5-style comments")
    max_depth: int = Field(100, description="Maximum nesting depth")
    flatten: bool = Field(False, description="Flatten nested structures")
    flatten_separator: str = Field(".", description="Separator for flattened keys")


class JSONProcessor(BaseProcessor):
    """Processes JSON files to extract structured data.

    Uses orjson for fast JSON parsing with:
    - Streaming support for large files
    - Schema extraction
    - Nested structure analysis
    - Optional flattening

    Example:
        >>> processor = JSONProcessor()
        >>> result = processor.process("data.json")
        >>> data = result.data.content  # Parsed JSON
        >>> structure = result.data.metadata["structure"]  # Inferred structure
    """

    name = "json"
    supported_extensions = [".json", ".jsonl", ".ndjson"]
    supported_mime_types = ["application/json", "application/x-ndjson"]

    def __init__(self, config: JSONProcessorConfig | dict[str, Any] | None = None):
        """Initialize the JSON processor.

        Args:
            config: Processor configuration
        """
        if config is None:
            super().__init__(JSONProcessorConfig())
        elif isinstance(config, dict):
            super().__init__(JSONProcessorConfig(**config))
        else:
            super().__init__(config)

    def process(
        self, source: str | Path | BinaryIO | bytes, result: ValidationResult | None = None
    ) -> ValidationResult:
        """Process a JSON file and extract content.

        Args:
            source: JSON file path, file object, or bytes
            result: Optional existing ValidationResult

        Returns:
            ValidationResult with ProcessedData containing parsed JSON
        """
        # Create result if not provided
        if result is None:
            result, provenance = self._create_result(source)
        else:
            provenance = result.provenance

        config = self.config
        if not isinstance(config, JSONProcessorConfig):
            config = JSONProcessorConfig()

        try:
            import orjson
        except ImportError:
            # Fall back to standard json
            import json as orjson  # type: ignore[no-redef]

        try:
            # Read raw bytes
            raw_bytes = self._read_source(source)

            # Compute checksum if configured
            if self.config.compute_checksum and provenance:
                provenance.checksum = self._compute_checksum(raw_bytes)

            # Determine if JSONL format
            is_jsonl = False
            if isinstance(source, (str, Path)):
                ext = Path(source).suffix.lower()
                is_jsonl = ext in (".jsonl", ".ndjson")

            # Parse JSON
            if is_jsonl:
                content = self._parse_jsonl(raw_bytes, config.encoding)
                content_type = "records"
                record_count = len(content)
            else:
                content = orjson.loads(raw_bytes)
                content_type = "array" if isinstance(content, list) else "object"
                record_count = len(content) if isinstance(content, list) else None

            # Analyze structure
            structure_info = self._analyze_structure(content, config.max_depth)

            # Optionally flatten
            if config.flatten and isinstance(content, dict):
                content = self._flatten(content, config.flatten_separator)

            # Create processed data
            processed = ProcessedData(
                content=content,
                content_type=content_type,
                source_type="json",
                metadata={
                    "structure": structure_info,
                    "is_jsonl": is_jsonl,
                },
                record_count=record_count,
                raw_size_bytes=len(raw_bytes),
            )

            result.data = processed

            # Add warnings for potential issues
            if structure_info["max_depth"] > 10:
                result.add_message(
                    code="JSON_DEEP_NESTING",
                    message=f"Deep nesting detected: {structure_info['max_depth']} levels",
                    severity=ValidationStatus.WARNING,
                    validator=self.name,
                )
                if result.status == ValidationStatus.PASSED:
                    result.status = ValidationStatus.WARNING

        except orjson.JSONDecodeError as e:  # type: ignore[union-attr]
            result.add_message(
                code="JSON_PARSE_ERROR",
                message=f"Invalid JSON: {e}",
                severity=ValidationStatus.ERROR,
                validator=self.name,
            )
            result.status = ValidationStatus.ERROR

        except Exception as e:
            result.add_message(
                code="JSON_PROCESSING_ERROR",
                message=f"Failed to process JSON: {e}",
                severity=ValidationStatus.ERROR,
                validator=self.name,
                details={"error": str(e)},
            )
            result.status = ValidationStatus.ERROR

        return result

    def _parse_jsonl(self, raw_bytes: bytes, encoding: str) -> list[Any]:
        """Parse JSON Lines format.

        Args:
            raw_bytes: Raw file bytes
            encoding: Text encoding

        Returns:
            List of parsed JSON objects
        """
        try:
            import orjson

            records = []
            for line in raw_bytes.decode(encoding).splitlines():
                line = line.strip()
                if line:  # Skip empty lines
                    records.append(orjson.loads(line))
            return records
        except ImportError:
            import json

            records = []
            for line in raw_bytes.decode(encoding).splitlines():
                line = line.strip()
                if line:
                    records.append(json.loads(line))
            return records

    def _analyze_structure(self, data: Any, max_depth: int) -> dict[str, Any]:
        """Analyze JSON structure.

        Args:
            data: Parsed JSON data
            max_depth: Maximum depth to analyze

        Returns:
            Structure analysis dict
        """
        info: dict[str, Any] = {
            "type": type(data).__name__,
            "max_depth": 0,
            "total_keys": 0,
            "array_lengths": [],
        }

        def analyze(obj: Any, depth: int) -> None:
            if depth > max_depth:
                return

            info["max_depth"] = max(info["max_depth"], depth)

            if isinstance(obj, dict):
                info["total_keys"] += len(obj)
                for value in obj.values():
                    analyze(value, depth + 1)
            elif isinstance(obj, list):
                info["array_lengths"].append(len(obj))
                for item in obj:
                    analyze(item, depth + 1)

        analyze(data, 0)

        # Summarize array lengths
        if info["array_lengths"]:
            info["min_array_length"] = min(info["array_lengths"])
            info["max_array_length"] = max(info["array_lengths"])
            info["array_count"] = len(info["array_lengths"])
            del info["array_lengths"]

        return info

    def _flatten(self, data: dict[str, Any], separator: str) -> dict[str, Any]:
        """Flatten a nested dictionary.

        Args:
            data: Nested dictionary
            separator: Key separator

        Returns:
            Flattened dictionary
        """
        result: dict[str, Any] = {}

        def flatten_recursive(obj: Any, prefix: str) -> None:
            if isinstance(obj, dict):
                for key, value in obj.items():
                    new_key = f"{prefix}{separator}{key}" if prefix else key
                    flatten_recursive(value, new_key)
            elif isinstance(obj, list):
                for i, item in enumerate(obj):
                    new_key = f"{prefix}[{i}]"
                    flatten_recursive(item, new_key)
            else:
                result[prefix] = obj

        flatten_recursive(data, "")
        return result
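
For orientation, a minimal usage sketch of the JSONProcessor above. This is illustrative only and not part of the published files; it assumes a local sample.json exists and uses the dict-style config accepted by __init__ and the metadata keys written by process().

# Illustrative sketch, not part of the package diff. Assumes "sample.json" exists locally.
from daytashield.processors.json import JSONProcessor

processor = JSONProcessor({"flatten": True, "flatten_separator": "."})
result = processor.process("sample.json")

print(result.status)                      # ValidationStatus (WARNING is set on deep nesting)
print(result.data.content)                # parsed (and here flattened) JSON
print(result.data.metadata["structure"])  # depth/key statistics from _analyze_structure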
daytashield/processors/pdf.py
@@ -0,0 +1,232 @@
"""PDF document processor."""

from __future__ import annotations

from pathlib import Path
from typing import Any, BinaryIO

from pydantic import Field

from daytashield.core.result import ValidationResult, ValidationStatus
from daytashield.processors.base import BaseProcessor, ProcessedData, ProcessorConfig


class PDFProcessorConfig(ProcessorConfig):
    """Configuration for PDF processing."""

    extract_images: bool = Field(False, description="Extract embedded images")
    extract_tables: bool = Field(True, description="Extract tables from PDF")
    ocr_fallback: bool = Field(False, description="Use OCR if text extraction fails")
    max_pages: int | None = Field(None, description="Maximum pages to process")
    password: str | None = Field(None, description="PDF password if encrypted")


class PDFProcessor(BaseProcessor):
    """Processes PDF documents to extract text and metadata.

    Uses pdfplumber for reliable text extraction with layout preservation.
    Supports:
    - Text extraction with layout
    - Metadata extraction (author, title, dates)
    - Table extraction
    - Page-by-page processing
    - Optional OCR fallback

    Example:
        >>> processor = PDFProcessor()
        >>> result = processor.process("invoice.pdf")
        >>> print(result.data.content)  # Extracted text
        >>> print(result.data.metadata)  # PDF metadata
    """

    name = "pdf"
    supported_extensions = [".pdf"]
    supported_mime_types = ["application/pdf"]

    def __init__(self, config: PDFProcessorConfig | dict[str, Any] | None = None):
        """Initialize the PDF processor.

        Args:
            config: Processor configuration
        """
        if config is None:
            super().__init__(PDFProcessorConfig())
        elif isinstance(config, dict):
            super().__init__(PDFProcessorConfig(**config))
        else:
            super().__init__(config)

    def process(
        self, source: str | Path | BinaryIO | bytes, result: ValidationResult | None = None
    ) -> ValidationResult:
        """Process a PDF file and extract content.

        Args:
            source: PDF file path, file object, or bytes
            result: Optional existing ValidationResult

        Returns:
            ValidationResult with ProcessedData containing extracted text
        """
        # Create result if not provided
        if result is None:
            result, provenance = self._create_result(source)
        else:
            provenance = result.provenance

        config = self.config
        if not isinstance(config, PDFProcessorConfig):
            config = PDFProcessorConfig()

        try:
            import pdfplumber
        except ImportError:
            result.add_message(
                code="PDF_NO_PDFPLUMBER",
                message="pdfplumber package not installed. Install with: pip install pdfplumber",
                severity=ValidationStatus.ERROR,
                validator=self.name,
            )
            result.status = ValidationStatus.ERROR
            return result

        try:
            # Read the source
            raw_bytes = self._read_source(source)

            # Compute checksum if configured
            if self.config.compute_checksum and provenance:
                provenance.checksum = self._compute_checksum(raw_bytes)

            # Open PDF
            pdf_file: Any
            if isinstance(source, (str, Path)):
                pdf_file = pdfplumber.open(source, password=config.password)
            else:
                import io

                pdf_file = pdfplumber.open(io.BytesIO(raw_bytes), password=config.password)

            with pdf_file as pdf:
                # Extract metadata
                metadata = self._extract_metadata(pdf)

                # Extract text from pages
                pages_text: list[str] = []
                tables: list[list[list[str]]] = []
                page_count = len(pdf.pages)

                max_pages = config.max_pages or page_count
                for i, page in enumerate(pdf.pages[:max_pages]):
                    # Extract text
                    page_text = page.extract_text() or ""
                    pages_text.append(page_text)

                    # Extract tables if configured
                    if config.extract_tables:
                        page_tables = page.extract_tables() or []
                        tables.extend(page_tables)

            # Combine all text
            full_text = "\n\n".join(pages_text)

            # Check if text extraction worked
            if not full_text.strip() and config.ocr_fallback:
                full_text = self._ocr_fallback(raw_bytes)
                metadata["ocr_used"] = True

            # Create processed data
            processed = ProcessedData(
                content=full_text,
                content_type="text",
                source_type="pdf",
                metadata=metadata,
                page_count=page_count,
                raw_size_bytes=len(raw_bytes),
            )

            # Add tables to metadata if extracted
            if tables:
                processed.metadata["tables"] = tables
                processed.metadata["table_count"] = len(tables)

            result.data = processed

            # Add info message about extraction
            result.metadata["pdf_pages"] = page_count
            result.metadata["pdf_text_length"] = len(full_text)

        except Exception as e:
            result.add_message(
                code="PDF_PROCESSING_ERROR",
                message=f"Failed to process PDF: {e}",
                severity=ValidationStatus.ERROR,
                validator=self.name,
                details={"error": str(e)},
            )
            result.status = ValidationStatus.ERROR

        return result

    def _extract_metadata(self, pdf: Any) -> dict[str, Any]:
        """Extract metadata from PDF.

        Args:
            pdf: pdfplumber PDF object

        Returns:
            Dict of metadata
        """
        metadata: dict[str, Any] = {}

        if hasattr(pdf, "metadata") and pdf.metadata:
            pdf_meta = pdf.metadata
            # Common PDF metadata fields
            field_mapping = {
                "Title": "title",
                "Author": "author",
                "Subject": "subject",
                "Creator": "creator",
                "Producer": "producer",
                "CreationDate": "created_at",
                "ModDate": "modified_at",
                "Keywords": "keywords",
            }

            for pdf_key, our_key in field_mapping.items():
                if pdf_key in pdf_meta and pdf_meta[pdf_key]:
                    metadata[our_key] = pdf_meta[pdf_key]

        metadata["page_count"] = len(pdf.pages)

        return metadata

    def _ocr_fallback(self, pdf_bytes: bytes) -> str:
        """Attempt OCR on PDF pages.

        Args:
            pdf_bytes: Raw PDF bytes

        Returns:
            Extracted text via OCR
        """
        try:
            import io

            from pdf2image import convert_from_bytes
            import pytesseract

            # Convert PDF to images
            images = convert_from_bytes(pdf_bytes)

            # OCR each page
            texts = []
            for img in images:
                text = pytesseract.image_to_string(img)
                texts.append(text)

            return "\n\n".join(texts)

        except ImportError:
            return "[OCR unavailable - install pytesseract and pdf2image]"
        except Exception:
            return "[OCR failed]"
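
A similar illustrative sketch for the PDFProcessor above, not part of the published files; it assumes pdfplumber is installed, a local invoice.pdf exists, and uses only the attributes written by process().

# Illustrative sketch, not part of the package diff. Assumes pdfplumber and "invoice.pdf".
from daytashield.processors.pdf import PDFProcessor

processor = PDFProcessor({"extract_tables": True, "max_pages": 5})
result = processor.process("invoice.pdf")

if result.data is not None:
    print(result.data.content[:200])                    # first 200 chars of extracted text
    print(result.data.metadata.get("table_count", 0))   # tables found, if any
print(result.metadata.get("pdf_pages"))                 # page count recorded on the result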
daytashield/rules/__init__.py
@@ -0,0 +1,14 @@
"""DaytaShield compliance rule packs."""

from daytashield.rules.base import ComplianceRule, ComplianceViolation
from daytashield.rules.gdpr import GDPRRules
from daytashield.rules.hipaa import HIPAARules
from daytashield.rules.pii import PIIDetector

__all__ = [
    "ComplianceRule",
    "ComplianceViolation",
    "HIPAARules",
    "GDPRRules",
    "PIIDetector",
]
daytashield/rules/base.py
@@ -0,0 +1,67 @@
"""Base compliance rule abstract class."""

from __future__ import annotations

from abc import ABC, abstractmethod
from typing import Any

from pydantic import BaseModel, Field


class ComplianceViolation(BaseModel):
    """A compliance violation detected by a rule."""

    code: str = Field(..., description="Machine-readable violation code")
    message: str = Field(..., description="Human-readable description")
    severity: str = Field("error", description="Severity: error, warning, info")
    category: str = Field(..., description="Violation category (e.g., PHI, PII)")
    field: str | None = Field(None, description="Field path where violation was found")
    matched_value: str | None = Field(None, description="The value that matched (redacted)")
    recommendation: str | None = Field(None, description="How to fix the violation")


class ComplianceRule(ABC):
    """Abstract base class for compliance rules.

    Compliance rules detect specific types of violations in data,
    such as exposed PII, missing consent, or regulatory violations.

    Example:
        >>> class MyRule(ComplianceRule):
        ...     name = "my_rule"
        ...     description = "Checks for custom violations"
        ...
        ...     def check(self, data, text_content):
        ...         violations = []
        ...         for field, text in text_content:
        ...             if "secret" in text.lower():
        ...                 violations.append(ComplianceViolation(
        ...                     code="SECRET_EXPOSED",
        ...                     message="Secret value detected",
        ...                     category="security",
        ...                     field=field,
        ...                 ))
        ...         return violations
    """

    name: str = "base_rule"
    description: str = "Base compliance rule"
    enabled: bool = True

    @abstractmethod
    def check(
        self, data: Any, text_content: list[tuple[str, str]]
    ) -> list[ComplianceViolation]:
        """Check data for compliance violations.

        Args:
            data: The original data structure
            text_content: List of (field_path, text_value) tuples

        Returns:
            List of ComplianceViolation objects
        """
        pass

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(name={self.name!r})"
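
To round out the rule interface above, a minimal sketch of defining and invoking a concrete rule through the public package imports. KeywordRule and the sample (field_path, text) tuples are hypothetical; only the ComplianceRule/ComplianceViolation API shown in the diff is assumed.

# Illustrative sketch, not part of the package diff. KeywordRule is hypothetical.
from daytashield.rules import ComplianceRule, ComplianceViolation


class KeywordRule(ComplianceRule):
    name = "keyword_rule"
    description = "Flags fields containing a banned keyword"

    def check(self, data, text_content):
        violations = []
        for field_path, text in text_content:
            if "confidential" in text.lower():
                violations.append(
                    ComplianceViolation(
                        code="CONFIDENTIAL_MARKER",
                        message="Confidential marker found in text",
                        category="security",
                        field=field_path,
                        recommendation="Review whether this field should be shared",
                    )
                )
        return violations


rule = KeywordRule()
print(rule)  # KeywordRule(name='keyword_rule')
for violation in rule.check(data=None, text_content=[("notes", "This file is CONFIDENTIAL")]):
    print(violation.code, violation.field)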