truthound-dashboard 1.2.1-py3-none-any.whl → 1.3.1-py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries; it is provided for informational purposes only.
Files changed (54)
  1. truthound_dashboard/api/deps.py +28 -0
  2. truthound_dashboard/api/drift.py +1 -0
  3. truthound_dashboard/api/mask.py +164 -0
  4. truthound_dashboard/api/profile.py +11 -3
  5. truthound_dashboard/api/router.py +22 -0
  6. truthound_dashboard/api/scan.py +168 -0
  7. truthound_dashboard/api/schemas.py +13 -4
  8. truthound_dashboard/api/validations.py +33 -1
  9. truthound_dashboard/api/validators.py +85 -0
  10. truthound_dashboard/core/__init__.py +8 -0
  11. truthound_dashboard/core/phase5/activity.py +1 -1
  12. truthound_dashboard/core/services.py +457 -7
  13. truthound_dashboard/core/truthound_adapter.py +441 -26
  14. truthound_dashboard/db/__init__.py +6 -0
  15. truthound_dashboard/db/models.py +250 -1
  16. truthound_dashboard/schemas/__init__.py +52 -1
  17. truthound_dashboard/schemas/collaboration.py +1 -1
  18. truthound_dashboard/schemas/drift.py +118 -3
  19. truthound_dashboard/schemas/mask.py +209 -0
  20. truthound_dashboard/schemas/profile.py +45 -2
  21. truthound_dashboard/schemas/scan.py +312 -0
  22. truthound_dashboard/schemas/schema.py +30 -2
  23. truthound_dashboard/schemas/validation.py +60 -3
  24. truthound_dashboard/schemas/validators/__init__.py +59 -0
  25. truthound_dashboard/schemas/validators/aggregate_validators.py +238 -0
  26. truthound_dashboard/schemas/validators/anomaly_validators.py +723 -0
  27. truthound_dashboard/schemas/validators/base.py +263 -0
  28. truthound_dashboard/schemas/validators/completeness_validators.py +269 -0
  29. truthound_dashboard/schemas/validators/cross_table_validators.py +375 -0
  30. truthound_dashboard/schemas/validators/datetime_validators.py +253 -0
  31. truthound_dashboard/schemas/validators/distribution_validators.py +422 -0
  32. truthound_dashboard/schemas/validators/drift_validators.py +615 -0
  33. truthound_dashboard/schemas/validators/geospatial_validators.py +486 -0
  34. truthound_dashboard/schemas/validators/multi_column_validators.py +706 -0
  35. truthound_dashboard/schemas/validators/privacy_validators.py +531 -0
  36. truthound_dashboard/schemas/validators/query_validators.py +510 -0
  37. truthound_dashboard/schemas/validators/registry.py +318 -0
  38. truthound_dashboard/schemas/validators/schema_validators.py +408 -0
  39. truthound_dashboard/schemas/validators/string_validators.py +396 -0
  40. truthound_dashboard/schemas/validators/table_validators.py +412 -0
  41. truthound_dashboard/schemas/validators/uniqueness_validators.py +355 -0
  42. truthound_dashboard/schemas/validators.py +59 -0
  43. truthound_dashboard/static/assets/index-BZG20KuF.js +586 -0
  44. truthound_dashboard/static/assets/index-D_HyZ3pb.css +1 -0
  45. truthound_dashboard/static/assets/unmerged_dictionaries-CtpqQBm0.js +1 -0
  46. truthound_dashboard/static/index.html +2 -2
  47. {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.1.dist-info}/METADATA +50 -11
  48. {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.1.dist-info}/RECORD +51 -27
  49. truthound_dashboard/static/assets/index-BqXVFyqj.js +0 -574
  50. truthound_dashboard/static/assets/index-o8qHVDte.css +0 -1
  51. truthound_dashboard/static/assets/unmerged_dictionaries-n_T3wZTf.js +0 -1
  52. {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.1.dist-info}/WHEEL +0 -0
  53. {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.1.dist-info}/entry_points.txt +0 -0
  54. {truthound_dashboard-1.2.1.dist-info → truthound_dashboard-1.3.1.dist-info}/licenses/LICENSE +0 -0
truthound_dashboard/schemas/mask.py (new file)
@@ -0,0 +1,209 @@
+"""Pydantic schemas for data masking (th.mask) operations.
+
+Provides schemas for masking requests, responses, and history.
+Supports three masking strategies: redact, hash, fake.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+from enum import Enum
+from typing import Literal
+
+from pydantic import Field
+
+from .base import BaseSchema, IDMixin, TimestampMixin
+
+
+class MaskingStrategy(str, Enum):
+    """Masking strategy options.
+
+    - redact: Replace values with asterisks (e.g., "john@example.com" -> "****")
+    - hash: Replace values with SHA256 hash (deterministic, can be used for joins)
+    - fake: Replace values with realistic fake data (e.g., "john@example.com" -> "alice@test.org")
+    """
+
+    REDACT = "redact"
+    HASH = "hash"
+    FAKE = "fake"
+
+
+MaskingStrategyLiteral = Literal["redact", "hash", "fake"]
+
+
+class MaskStatus(str, Enum):
+    """Status of a masking operation."""
+
+    PENDING = "pending"
+    RUNNING = "running"
+    SUCCESS = "success"
+    FAILED = "failed"
+    ERROR = "error"
+
+
+class MaskRequest(BaseSchema):
+    """Request body for running a masking operation.
+
+    Attributes:
+        columns: Optional list of columns to mask. If None, auto-detects PII.
+        strategy: Masking strategy to use. Defaults to "redact".
+        output_format: Output file format. Defaults to "csv".
+    """
+
+    columns: list[str] | None = Field(
+        default=None,
+        description="Columns to mask. If not specified, auto-detects PII columns.",
+    )
+    strategy: MaskingStrategyLiteral = Field(
+        default="redact",
+        description="Masking strategy: 'redact' (asterisks), 'hash' (SHA256), 'fake' (realistic data)",
+    )
+    output_format: Literal["csv", "parquet", "json"] = Field(
+        default="csv",
+        description="Output file format",
+    )
+
+
+class MaskSummary(BaseSchema):
+    """Summary of a masking operation.
+
+    Attributes:
+        source_id: ID of the source that was masked.
+        source_name: Name of the source.
+        status: Current status of the operation.
+        strategy: Masking strategy used.
+        columns_masked: Number of columns that were masked.
+        row_count: Number of rows processed.
+        duration_ms: Operation duration in milliseconds.
+    """
+
+    source_id: str
+    source_name: str | None = None
+    status: str
+    strategy: str
+    columns_masked: int
+    row_count: int | None = None
+    duration_ms: int | None = None
+
+
+class MaskResponse(BaseSchema, IDMixin, TimestampMixin):
+    """Response for a masking operation.
+
+    Attributes:
+        id: Unique identifier for the masking operation.
+        source_id: ID of the source that was masked.
+        status: Current status (pending, running, success, failed, error).
+        strategy: Masking strategy used.
+        output_path: Path to the masked output file.
+        columns_masked: List of columns that were masked.
+        auto_detected: Whether PII columns were auto-detected.
+        row_count: Number of rows processed.
+        column_count: Total number of columns.
+        duration_ms: Operation duration in milliseconds.
+        error_message: Error message if operation failed.
+        started_at: When the operation started.
+        completed_at: When the operation completed.
+    """
+
+    source_id: str
+    status: str
+    strategy: str
+    output_path: str | None = None
+    columns_masked: list[str] | None = None
+    auto_detected: bool = False
+    row_count: int | None = None
+    column_count: int | None = None
+    duration_ms: int | None = None
+    error_message: str | None = None
+    started_at: datetime | None = None
+    completed_at: datetime | None = None
+
+    @classmethod
+    def from_db(cls, db_mask: object) -> MaskResponse:
+        """Create response from database model.
+
+        Args:
+            db_mask: DataMask database model instance.
+
+        Returns:
+            MaskResponse instance.
+        """
+        return cls(
+            id=db_mask.id,
+            source_id=db_mask.source_id,
+            status=db_mask.status,
+            strategy=db_mask.strategy,
+            output_path=db_mask.output_path,
+            columns_masked=db_mask.columns_masked,
+            auto_detected=db_mask.auto_detected,
+            row_count=db_mask.row_count,
+            column_count=db_mask.column_count,
+            duration_ms=db_mask.duration_ms,
+            error_message=db_mask.error_message,
+            started_at=db_mask.started_at,
+            completed_at=db_mask.completed_at,
+            created_at=db_mask.created_at,
+            updated_at=getattr(db_mask, "updated_at", None),
+        )
+
+
+class MaskListItem(BaseSchema, IDMixin):
+    """List item for masking operations.
+
+    Attributes:
+        id: Unique identifier.
+        source_id: ID of the source.
+        source_name: Name of the source.
+        status: Current status.
+        strategy: Masking strategy used.
+        columns_masked: Number of columns masked.
+        row_count: Number of rows processed.
+        duration_ms: Operation duration in milliseconds.
+        created_at: When the operation was created.
+    """
+
+    source_id: str
+    source_name: str | None = None
+    status: str
+    strategy: str
+    columns_masked: int = 0
+    row_count: int | None = None
+    duration_ms: int | None = None
+    created_at: datetime
+
+    @classmethod
+    def from_db(cls, db_mask: object, source_name: str | None = None) -> MaskListItem:
+        """Create list item from database model.
+
+        Args:
+            db_mask: DataMask database model instance.
+            source_name: Optional source name.
+
+        Returns:
+            MaskListItem instance.
+        """
+        return cls(
+            id=db_mask.id,
+            source_id=db_mask.source_id,
+            source_name=source_name or getattr(db_mask.source, "name", None),
+            status=db_mask.status,
+            strategy=db_mask.strategy,
+            columns_masked=len(db_mask.columns_masked) if db_mask.columns_masked else 0,
+            row_count=db_mask.row_count,
+            duration_ms=db_mask.duration_ms,
+            created_at=db_mask.created_at,
+        )
+
+
+class MaskListResponse(BaseSchema):
+    """Response for listing masking operations.
+
+    Attributes:
+        data: List of masking operation items.
+        total: Total number of items.
+        limit: Maximum items per page.
+    """
+
+    data: list[MaskListItem]
+    total: int
+    limit: int = 20
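
To make the request shape concrete, here is a minimal usage sketch of the new MaskRequest schema. It assumes the 1.3.1 wheel is installed and that BaseSchema behaves like a standard Pydantic model; the column names are hypothetical.

from truthound_dashboard.schemas.mask import MaskRequest

# Hash masking is deterministic (per the MaskingStrategy docstring), so hashed
# columns stay joinable across datasets; "redact" and "fake" do not.
req = MaskRequest(columns=["email", "ssn"], strategy="hash", output_format="parquet")
print(req.columns, req.strategy, req.output_format)
# ['email', 'ssn'] hash parquet

# Omitting columns requests auto-detection of PII columns on the server side.
auto = MaskRequest(strategy="fake")
print(auto.columns, auto.strategy, auto.output_format)
# None fake csv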
truthound_dashboard/schemas/profile.py
@@ -12,6 +12,22 @@ from pydantic import Field
 from .base import BaseSchema
 
 
+class ProfileRequest(BaseSchema):
+    """Request schema for data profiling.
+
+    Provides optional configuration for profiling operations.
+    All fields are optional with sensible defaults.
+    """
+
+    sample_size: int | None = Field(
+        default=None,
+        ge=1,
+        description="Maximum number of rows to sample for profiling. "
+        "If None, profiles all data. Useful for large datasets.",
+        examples=[10000, 50000, 100000],
+    )
+
+
 class ColumnProfile(BaseSchema):
     """Profile information for a single column."""
 
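
A short sketch of the new ProfileRequest, assuming the installed wheel and Pydantic-style validation from BaseSchema; the sample sizes are illustrative.

from truthound_dashboard.schemas.profile import ProfileRequest

# No sample_size profiles the full dataset; a positive value caps the sample.
full = ProfileRequest()
sampled = ProfileRequest(sample_size=50_000)
print(full.sample_size, sampled.sample_size)
# None 50000

# Values below the ge=1 bound fail validation, e.g. ProfileRequest(sample_size=0).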
truthound_dashboard/schemas/profile.py (continued)
@@ -60,14 +76,41 @@ class ProfileResponse(BaseSchema):
 
     @classmethod
     def from_result(cls, result: Any) -> ProfileResponse:
-        """Create response from adapter result.
+        """Create response from adapter result or Profile model.
 
         Args:
-            result: ProfileResult from adapter.
+            result: ProfileResult from adapter or Profile model.
 
         Returns:
             ProfileResponse instance.
         """
+        # Handle Profile model (from database)
+        if hasattr(result, "profile_json"):
+            profile_json = result.profile_json
+            source_name = profile_json.get("source", result.source_id)
+            columns_data = profile_json.get("columns", [])
+            columns = [
+                ColumnProfile(
+                    name=col["name"],
+                    dtype=col["dtype"],
+                    null_pct=col.get("null_pct", "0%"),
+                    unique_pct=col.get("unique_pct", "0%"),
+                    min=col.get("min"),
+                    max=col.get("max"),
+                    mean=col.get("mean"),
+                    std=col.get("std"),
+                )
+                for col in columns_data
+            ]
+            return cls(
+                source=source_name,
+                row_count=result.row_count or 0,
+                column_count=result.column_count or 0,
+                size_bytes=result.size_bytes or 0,
+                columns=columns,
+            )
+
+        # Handle ProfileResult (from adapter)
         columns = [
             ColumnProfile(
                 name=col["name"],
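
The updated from_result() dispatches on the presence of a profile_json attribute: database Profile rows take the new branch, while adapter ProfileResult objects fall through to the existing path. A sketch with a hypothetical stub object (all IDs and values invented) illustrates the database branch, assuming the installed wheel:

from types import SimpleNamespace

from truthound_dashboard.schemas.profile import ProfileResponse

# Stub carrying only the attributes the database branch reads.
db_profile = SimpleNamespace(
    source_id="src-123",  # hypothetical
    row_count=1000,
    column_count=2,
    size_bytes=20_480,
    profile_json={
        "source": "customers.csv",  # hypothetical
        "columns": [
            {"name": "email", "dtype": "str", "null_pct": "0%", "unique_pct": "100%"},
            {"name": "age", "dtype": "int", "null_pct": "2%", "unique_pct": "8%",
             "min": 18, "max": 92, "mean": 41.3, "std": 12.7},
        ],
    },
)

# hasattr(db_profile, "profile_json") is True, so the database branch runs.
resp = ProfileResponse.from_result(db_profile)
print(resp.source, resp.row_count, len(resp.columns))
# customers.csv 1000 2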
truthound_dashboard/schemas/scan.py (new file)
@@ -0,0 +1,312 @@
+"""PII scan-related Pydantic schemas.
+
+This module defines schemas for PII scan API operations using th.scan().
+
+The scan functionality detects personally identifiable information (PII)
+in datasets and checks compliance with privacy regulations (GDPR, CCPA, LGPD).
+"""
+
+from __future__ import annotations
+
+from datetime import datetime
+from enum import Enum
+from typing import Any, Literal
+
+from pydantic import Field
+
+from .base import BaseSchema, IDMixin, ListResponseWrapper
+
+
+class Regulation(str, Enum):
+    """Supported privacy regulations for compliance checking."""
+
+    GDPR = "gdpr"
+    CCPA = "ccpa"
+    LGPD = "lgpd"
+
+
+# Type alias for regulation literal
+RegulationLiteral = Literal["gdpr", "ccpa", "lgpd"]
+
+# PII type categories commonly detected
+PII_TYPES = [
+    "email",
+    "phone",
+    "ssn",
+    "credit_card",
+    "ip_address",
+    "date_of_birth",
+    "address",
+    "name",
+    "passport",
+    "driver_license",
+    "national_id",
+    "bank_account",
+    "medical_record",
+    "biometric",
+]
+
+
+class PIIScanRequest(BaseSchema):
+    """Request to run PII scan on a data source.
+
+    This schema maps to truthound's th.scan() parameters for maximum flexibility.
+    All optional parameters default to None to use truthound's defaults.
+    """
+
+    # Column filtering
+    columns: list[str] | None = Field(
+        default=None,
+        description="Columns to scan. If None, all columns are scanned.",
+        examples=[["email", "phone", "ssn"]],
+    )
+
+    # Regulation compliance checking
+    regulations: list[RegulationLiteral] | None = Field(
+        default=None,
+        description="Privacy regulations to check compliance: gdpr, ccpa, lgpd",
+        examples=[["gdpr", "ccpa"]],
+    )
+
+    # Confidence threshold
+    min_confidence: float = Field(
+        default=0.8,
+        ge=0.0,
+        le=1.0,
+        description="Minimum confidence threshold for PII detection (0.0-1.0)",
+        examples=[0.8, 0.9],
+    )
+
+
+class PIIFinding(BaseSchema):
+    """Single PII finding detected in a column.
+
+    Represents one type of PII detected within a specific column,
+    including confidence score and sample information.
+    """
+
+    column: str = Field(..., description="Column where PII was detected")
+    pii_type: str = Field(
+        ...,
+        description="Type of PII detected",
+        examples=["email", "ssn", "phone", "credit_card"],
+    )
+    confidence: float = Field(
+        ...,
+        ge=0.0,
+        le=1.0,
+        description="Confidence score for this detection (0.0-1.0)",
+    )
+    sample_count: int = Field(
+        ...,
+        ge=0,
+        description="Number of values matching this PII type",
+    )
+    sample_values: list[str] | None = Field(
+        default=None,
+        description="Sample values that matched (redacted for privacy)",
+    )
+
+
+class RegulationViolation(BaseSchema):
+    """Regulation compliance violation.
+
+    Represents a violation of a specific privacy regulation
+    detected in the scanned data.
+    """
+
+    regulation: RegulationLiteral = Field(
+        ...,
+        description="Violated regulation",
+    )
+    column: str = Field(
+        ...,
+        description="Column with violation",
+    )
+    pii_type: str = Field(
+        ...,
+        description="Type of PII causing the violation",
+    )
+    message: str = Field(
+        ...,
+        description="Human-readable violation description",
+    )
+    severity: Literal["low", "medium", "high", "critical"] = Field(
+        default="high",
+        description="Severity level of the violation",
+    )
+
+
+class PIIScanSummary(BaseSchema):
+    """Summary statistics for a PII scan run."""
+
+    total_columns_scanned: int = Field(
+        default=0,
+        ge=0,
+        description="Total number of columns scanned",
+    )
+    columns_with_pii: int = Field(
+        default=0,
+        ge=0,
+        description="Number of columns containing PII",
+    )
+    total_findings: int = Field(
+        default=0,
+        ge=0,
+        description="Total number of PII findings",
+    )
+    has_violations: bool = Field(
+        default=False,
+        description="Whether any regulation violations were found",
+    )
+    total_violations: int = Field(
+        default=0,
+        ge=0,
+        description="Total number of regulation violations",
+    )
+
+
+class PIIScanResponse(IDMixin, PIIScanSummary):
+    """Full PII scan response with all details."""
+
+    source_id: str = Field(..., description="Source that was scanned")
+    status: Literal["pending", "running", "success", "failed", "error"] = Field(
+        ...,
+        description="Current scan status",
+    )
+
+    # Data statistics
+    row_count: int | None = Field(default=None, description="Number of rows scanned")
+    column_count: int | None = Field(default=None, description="Number of columns")
+
+    # Scan configuration used
+    min_confidence: float = Field(
+        default=0.8,
+        description="Confidence threshold used for this scan",
+    )
+    regulations_checked: list[str] | None = Field(
+        default=None,
+        description="Regulations that were checked",
+    )
+
+    # Findings (full details)
+    findings: list[PIIFinding] = Field(
+        default_factory=list,
+        description="List of PII findings",
+    )
+
+    # Regulation violations
+    violations: list[RegulationViolation] = Field(
+        default_factory=list,
+        description="List of regulation violations",
+    )
+
+    # Error info (if status is 'error')
+    error_message: str | None = Field(
+        default=None,
+        description="Error message if scan failed",
+    )
+
+    # Timing
+    duration_ms: int | None = Field(
+        default=None,
+        ge=0,
+        description="Scan duration in milliseconds",
+    )
+    started_at: datetime | None = Field(default=None, description="Start timestamp")
+    completed_at: datetime | None = Field(
+        default=None,
+        description="Completion timestamp",
+    )
+    created_at: datetime = Field(..., description="Record creation timestamp")
+
+    @classmethod
+    def from_model(cls, scan: Any) -> PIIScanResponse:
+        """Create response from model.
+
+        Args:
+            scan: PIIScan model instance.
+
+        Returns:
+            PIIScanResponse instance.
+        """
+        findings = []
+        if scan.result_json and "findings" in scan.result_json:
+            findings = [
+                PIIFinding(**finding) for finding in scan.result_json["findings"]
+            ]
+
+        violations = []
+        if scan.result_json and "violations" in scan.result_json:
+            violations = [
+                RegulationViolation(**violation)
+                for violation in scan.result_json["violations"]
+            ]
+
+        return cls(
+            id=scan.id,
+            source_id=scan.source_id,
+            status=scan.status,
+            total_columns_scanned=scan.total_columns_scanned or 0,
+            columns_with_pii=scan.columns_with_pii or 0,
+            total_findings=scan.total_findings or 0,
+            has_violations=scan.has_violations or False,
+            total_violations=scan.total_violations or 0,
+            row_count=scan.row_count,
+            column_count=scan.column_count,
+            min_confidence=scan.min_confidence or 0.8,
+            regulations_checked=scan.regulations_checked,
+            findings=findings,
+            violations=violations,
+            error_message=scan.error_message,
+            duration_ms=scan.duration_ms,
+            started_at=scan.started_at,
+            completed_at=scan.completed_at,
+            created_at=scan.created_at,
+        )
+
+
+class PIIScanListItem(IDMixin, PIIScanSummary):
+    """PII scan list item (without full findings/violations)."""
+
+    source_id: str
+    status: Literal["pending", "running", "success", "failed", "error"]
+    row_count: int | None = None
+    column_count: int | None = None
+    min_confidence: float = 0.8
+    regulations_checked: list[str] | None = None
+    duration_ms: int | None = None
+    created_at: datetime
+
+    @classmethod
+    def from_model(cls, scan: Any) -> PIIScanListItem:
+        """Create list item from model.
+
+        Args:
+            scan: PIIScan model instance.
+
+        Returns:
+            PIIScanListItem instance.
+        """
+        return cls(
+            id=scan.id,
+            source_id=scan.source_id,
+            status=scan.status,
+            total_columns_scanned=scan.total_columns_scanned or 0,
+            columns_with_pii=scan.columns_with_pii or 0,
+            total_findings=scan.total_findings or 0,
+            has_violations=scan.has_violations or False,
+            total_violations=scan.total_violations or 0,
+            row_count=scan.row_count,
+            column_count=scan.column_count,
+            min_confidence=scan.min_confidence or 0.8,
+            regulations_checked=scan.regulations_checked,
+            duration_ms=scan.duration_ms,
+            created_at=scan.created_at,
+        )
+
+
+class PIIScanListResponse(ListResponseWrapper[PIIScanListItem]):
+    """Paginated PII scan list response."""
+
+    pass
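
For reference, a minimal sketch of building a PIIScanRequest, assuming the installed wheel; the column list and thresholds are illustrative.

from truthound_dashboard.schemas.scan import PIIScanRequest

# Scan only the likely-PII columns, check GDPR and CCPA, and require 90% confidence.
req = PIIScanRequest(
    columns=["email", "phone", "ssn"],
    regulations=["gdpr", "ccpa"],
    min_confidence=0.9,
)
print(req.columns, req.regulations, req.min_confidence)
# ['email', 'phone', 'ssn'] ['gdpr', 'ccpa'] 0.9

# The ge/le bounds reject out-of-range values, e.g. PIIScanRequest(min_confidence=1.5).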
truthound_dashboard/schemas/schema.py
@@ -40,11 +40,39 @@ class ColumnSchema(BaseSchema):
 
 
 class SchemaLearnRequest(BaseSchema):
-    """Request to learn schema from source."""
+    """Request to learn schema from source.
+
+    Maps to truthound's th.learn() parameters for schema inference.
+
+    Attributes:
+        infer_constraints: If True, infers min/max, allowed values from data.
+        categorical_threshold: Maximum unique values for categorical detection.
+            Columns with unique values <= this threshold are treated as categorical.
+        sample_size: Number of rows to sample for large datasets.
+            If None, uses all rows. Useful for performance with large files.
+    """
 
     infer_constraints: bool = Field(
         default=True,
-        description="Infer constraints from data statistics",
+        description="Infer constraints (min/max, allowed values) from data statistics",
+    )
+    categorical_threshold: int | None = Field(
+        default=None,
+        ge=1,
+        le=1000,
+        description=(
+            "Maximum unique values for categorical detection. "
+            "Columns with unique values <= threshold are treated as categorical. "
+            "If None, uses truthound default (20)."
+        ),
+    )
+    sample_size: int | None = Field(
+        default=None,
+        ge=100,
+        description=(
+            "Number of rows to sample for schema learning. "
+            "If None, uses all rows. Useful for large datasets."
+        ),
     )
 
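
Finally, a sketch of the extended SchemaLearnRequest, assuming the installed wheel; the threshold and sample size are illustrative values within the declared bounds.

from truthound_dashboard.schemas.schema import SchemaLearnRequest

# Learn a schema from a 100k-row sample; columns with <= 50 distinct values are
# treated as categorical, and min/max/allowed-value constraints are still inferred.
req = SchemaLearnRequest(categorical_threshold=50, sample_size=100_000)
print(req.infer_constraints, req.categorical_threshold, req.sample_size)
# True 50 100000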