additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -177
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -352
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/deduce.py +0 -259
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -926
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a4.dist-info/METADATA +0 -311
- additory-0.1.0a4.dist-info/RECORD +0 -72
- additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/expressions/samples.py
DELETED
|
@@ -1,492 +0,0 @@
|
|
|
1
|
-
# sample_data_manager.py
|
|
2
|
-
# Enhanced sample data management for additory expressions
|
|
3
|
-
|
|
4
|
-
import os
|
|
5
|
-
import yaml
|
|
6
|
-
import pandas as pd
|
|
7
|
-
from typing import Dict, List, Optional, Any, Union, Tuple
|
|
8
|
-
from dataclasses import dataclass
|
|
9
|
-
import re
|
|
10
|
-
|
|
11
|
-
from .logging import log_info, log_warning
|
|
12
|
-
from .enhanced_version_manager import EnhancedVersionManager
|
|
13
|
-
from .namespace_manager import NamespaceManager
|
|
14
|
-
from .integrity_manager import IntegrityManager
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@dataclass
class SampleDataInfo:
    """Summary record describing the sample data available for one expression."""

    # Name of the expression the samples belong to
    expression_name: str
    # Version the summary was produced for
    version: str
    # Whether a clean sample section was found
    has_clean: bool
    # Whether an unclean sample section was found
    has_unclean: bool
    # Row count of the clean sample (0 when absent)
    clean_rows: int
    # Row count of the unclean sample (0 when absent)
    unclean_rows: int
    # Human-readable notes about the unclean sample
    educational_comments: List[str]
    # Validation problems found in either sample
    validation_errors: List[str]
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
class SampleDataError(Exception):
    """Error type signalling that a sample data operation could not be completed."""
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
class SampleDataManager:
    """
    Enhanced sample data management system

    Provides clean/unclean sample support with educational comments and validation.

    Sample data is a mapping of column name -> list of values. The optional
    ``_comments`` entry is free-form metadata (a list of notes) and is not
    required to have one entry per row.
    """

    # Key holding free-form notes; its length need not match the row count.
    _COMMENTS_KEY = "_comments"

    def __init__(self):
        self.version_manager = EnhancedVersionManager()
        self.namespace_manager = NamespaceManager()
        self.integrity_manager = IntegrityManager()

        # Sample data validation rules
        self.validation_rules = {
            "max_rows": 100,  # Maximum rows in sample data
            "min_rows": 1,  # Minimum rows in sample data
            "required_columns": [],  # Will be determined from expression
            "max_column_length": 50,  # Maximum string length in columns
        }

        # Educational comment patterns for unclean data
        self.educational_patterns = {
            "missing_values": "# Missing values to test null handling",
            "invalid_types": "# Invalid data types to test type validation",
            "edge_cases": "# Edge cases to test boundary conditions",
            "malformed_data": "# Malformed data to test error handling",
            "duplicate_values": "# Duplicate values to test deduplication",
            "extreme_values": "# Extreme values to test range validation",
        }

        log_info("[sample_data] Sample Data Manager initialized")

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _is_metadata_label(label: Any) -> bool:
        """Return True for underscore-prefixed string labels (metadata columns).

        Non-string labels (e.g. integer column names) are never metadata;
        checking the type first avoids calling ``startswith`` on them.
        """
        return isinstance(label, str) and label.startswith('_')

    @staticmethod
    def _has_mixed_types(series: pd.Series) -> bool:
        """Return True when the non-null values of *series* have more than one Python type."""
        return len({type(value).__name__ for value in series.dropna()}) > 1

    @classmethod
    def _build_frame(cls, sample_data: Any) -> pd.DataFrame:
        """Build a DataFrame from raw sample data.

        Data that pandas accepts as-is yields exactly ``pd.DataFrame(sample_data)``.
        If construction fails with ``ValueError`` and the mapping carries a
        ``_comments`` entry, the frame is rebuilt without that entry: the
        template produced by ``create_sample_template`` has more comments
        than rows, which pandas rejects as unequal column lengths.
        """
        try:
            return pd.DataFrame(sample_data)
        except ValueError:
            if not (isinstance(sample_data, dict) and cls._COMMENTS_KEY in sample_data):
                raise
            without_comments = {
                key: values for key, values in sample_data.items()
                if key != cls._COMMENTS_KEY
            }
            return pd.DataFrame(without_comments)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def get_clean_sample(self, expression_name: str, namespace: str = "builtin",
                         version: Optional[str] = None) -> pd.DataFrame:
        """
        Get clean sample data for an expression

        Args:
            expression_name: Name of the expression
            namespace: Namespace ("builtin" or "user")
            version: Specific version (optional)

        Returns:
            DataFrame with clean sample data; a generated default sample
            when the expression provides none

        Raises:
            SampleDataError: If sample data cannot be retrieved
        """
        try:
            sample_data = self._get_sample_data(expression_name, namespace, version, "clean")

            if sample_data is None:
                # Generate default clean sample if none exists
                return self._generate_default_clean_sample(expression_name)

            df = self._build_frame(sample_data)

            # Validation problems are only logged; the sample is still returned
            validation_errors = self._validate_clean_sample(df, expression_name)
            if validation_errors:
                log_warning(f"[sample_data] Clean sample validation issues for {expression_name}: {validation_errors}")

            log_info(f"[sample_data] Retrieved clean sample for {expression_name} ({len(df)} rows)")
            return df

        except Exception as e:
            log_warning(f"[sample_data] Failed to get clean sample for {expression_name}: {e}")
            raise SampleDataError(f"Failed to get clean sample data: {e}") from e

    def get_unclean_sample(self, expression_name: str, namespace: str = "builtin",
                           version: Optional[str] = None) -> pd.DataFrame:
        """
        Get unclean sample data with educational comments

        Args:
            expression_name: Name of the expression
            namespace: Namespace ("builtin" or "user")
            version: Specific version (optional)

        Returns:
            DataFrame with unclean sample data and an added
            ``_educational_comments`` column; a generated default sample
            when the expression provides none

        Raises:
            SampleDataError: If sample data cannot be retrieved
        """
        try:
            sample_data = self._get_sample_data(expression_name, namespace, version, "unclean")

            if sample_data is None:
                # Generate default unclean sample if none exists
                return self._generate_default_unclean_sample(expression_name)

            df = self._build_frame(sample_data)

            # Add educational comments as metadata
            df = self._add_educational_comments(df, expression_name)

            log_info(f"[sample_data] Retrieved unclean sample for {expression_name} ({len(df)} rows)")
            return df

        except Exception as e:
            log_warning(f"[sample_data] Failed to get unclean sample for {expression_name}: {e}")
            raise SampleDataError(f"Failed to get unclean sample data: {e}") from e

    def validate_sample_data(self, sample_data: Dict[str, Any],
                             expression_name: str, sample_type: str = "clean") -> Tuple[bool, List[str]]:
        """
        Validate sample data format and content

        Args:
            sample_data: Sample data dictionary
            expression_name: Name of the expression
            sample_type: "clean" or "unclean" (anything other than "clean"
                is validated as unclean)

        Returns:
            Tuple of (is_valid, list_of_errors)
        """
        errors = []

        try:
            # Check if sample_data is a dictionary
            if not isinstance(sample_data, dict):
                errors.append("Sample data must be a dictionary")
                return False, errors

            # Check if sample_data has columns
            if not sample_data:
                errors.append("Sample data cannot be empty")
                return False, errors

            # Convert to DataFrame for validation
            try:
                df = self._build_frame(sample_data)
            except Exception as e:
                errors.append(f"Cannot convert sample data to DataFrame: {e}")
                return False, errors

            # Validate row count
            row_count = len(df)
            min_rows = self.validation_rules["min_rows"]
            max_rows = self.validation_rules["max_rows"]
            if row_count < min_rows:
                errors.append(f"Sample data has too few rows: {row_count} < {min_rows}")

            if row_count > max_rows:
                errors.append(f"Sample data has too many rows: {row_count} > {max_rows}")

            # Validate column content
            max_length = self.validation_rules["max_column_length"]
            for column, values in sample_data.items():
                # Free-form notes that could not become a column are
                # metadata, not tabular data: skip the per-column checks.
                if column == self._COMMENTS_KEY and column not in df.columns:
                    continue

                if not isinstance(values, list):
                    errors.append(f"Column '{column}' must be a list")
                    continue

                # Check for consistent length
                if len(values) != row_count:
                    errors.append(f"Column '{column}' has inconsistent length")

                # Check string length limits
                for i, value in enumerate(values):
                    if isinstance(value, str) and len(value) > max_length:
                        errors.append(f"Column '{column}' row {i} exceeds max length")

            # Specific validation for clean vs unclean samples
            if sample_type == "clean":
                errors.extend(self._validate_clean_sample(df, expression_name))
            else:
                errors.extend(self._validate_unclean_sample(df, expression_name))

            is_valid = not errors

            if is_valid:
                log_info(f"[sample_data] Sample data validation passed for {expression_name}")
            else:
                log_warning(f"[sample_data] Sample data validation failed for {expression_name}: {errors}")

            return is_valid, errors

        except Exception as e:
            # Validation never raises: unexpected failures become an error entry
            errors.append(f"Validation error: {e}")
            return False, errors

    def get_sample_info(self, expression_name: str, namespace: str = "builtin",
                        version: Optional[str] = None) -> "SampleDataInfo":
        """
        Get comprehensive information about sample data

        Args:
            expression_name: Name of the expression
            namespace: Namespace ("builtin" or "user")
            version: Specific version (optional)

        Returns:
            SampleDataInfo object with comprehensive information. On any
            failure a SampleDataInfo with both samples marked absent and
            the failure recorded in ``validation_errors`` is returned
            instead of raising.
        """
        try:
            # Get sample data
            clean_data = self._get_sample_data(expression_name, namespace, version, "clean")
            unclean_data = self._get_sample_data(expression_name, namespace, version, "unclean")

            # Analyze clean sample
            has_clean = clean_data is not None
            clean_rows = len(self._build_frame(clean_data)) if has_clean else 0

            # Analyze unclean sample
            has_unclean = unclean_data is not None
            unclean_rows = len(self._build_frame(unclean_data)) if has_unclean else 0

            # Extract educational comments
            educational_comments = []
            if has_unclean:
                educational_comments = self._extract_educational_comments(unclean_data)

            # Validate samples
            validation_errors = []
            if has_clean:
                _, clean_errors = self.validate_sample_data(clean_data, expression_name, "clean")
                validation_errors.extend(f"Clean: {err}" for err in clean_errors)

            if has_unclean:
                _, unclean_errors = self.validate_sample_data(unclean_data, expression_name, "unclean")
                validation_errors.extend(f"Unclean: {err}" for err in unclean_errors)

            return SampleDataInfo(
                expression_name=expression_name,
                version=version or self.version_manager.default_version,
                has_clean=has_clean,
                has_unclean=has_unclean,
                clean_rows=clean_rows,
                unclean_rows=unclean_rows,
                educational_comments=educational_comments,
                validation_errors=validation_errors,
            )

        except Exception as e:
            log_warning(f"[sample_data] Failed to get sample info for {expression_name}: {e}")
            return SampleDataInfo(
                expression_name=expression_name,
                version=version or "unknown",
                has_clean=False,
                has_unclean=False,
                clean_rows=0,
                unclean_rows=0,
                educational_comments=[],
                validation_errors=[f"Failed to get sample info: {e}"],
            )

    def create_sample_template(self, expression_name: str, columns: List[str]) -> Dict[str, Dict[str, Any]]:
        """
        Create a template for sample data

        Args:
            expression_name: Name of the expression
            columns: List of required columns

        Returns:
            Dictionary with "clean" and "unclean" sample templates. The
            unclean template also carries a ``_comments`` list describing
            each row.

        Raises:
            SampleDataError: If the template cannot be built
        """
        try:
            # Create clean sample template
            clean_template = {
                column: [f"sample_{column}_1", f"sample_{column}_2", f"sample_{column}_3"]
                for column in columns
            }

            # Create unclean sample template with educational comments
            unclean_template = {
                column: [
                    f"valid_{column}",
                    None,  # Missing value
                    f"invalid_{column}_type",
                    f"extreme_{column}_value",
                ]
                for column in columns
            }

            # Add educational comments (one header line plus one per row)
            unclean_template["_comments"] = [
                "# This is unclean sample data for testing error handling",
                "# Row 1: Valid data",
                "# Row 2: Missing values (None/null)",
                "# Row 3: Invalid data types",
                "# Row 4: Extreme or edge case values",
            ]

            template = {
                "clean": clean_template,
                "unclean": unclean_template,
            }

            log_info(f"[sample_data] Created sample template for {expression_name}")
            return template

        except Exception as e:
            log_warning(f"[sample_data] Failed to create sample template for {expression_name}: {e}")
            raise SampleDataError(f"Failed to create sample template: {e}") from e

    # ------------------------------------------------------------------
    # Internals
    # ------------------------------------------------------------------

    def _get_sample_data(self, expression_name: str, namespace: str,
                         version: Optional[str], sample_type: str) -> Optional[Dict[str, Any]]:
        """Get raw sample data from expression file.

        Returns None when the file is missing, fails the integrity check,
        has no matching sample section, or cannot be read/parsed.
        """
        # Bound up front so the error handler can always reference it, even
        # when resolving the path itself is what failed.
        expression_path = None
        try:
            # Get expression file path
            expression_path = self.namespace_manager.get_expression_file_path(
                namespace, expression_name, version
            )

            if not expression_path or not os.path.exists(expression_path):
                return None

            # Validate integrity
            if not self.integrity_manager.validate_integrity(expression_path):
                log_warning(f"[sample_data] Integrity validation failed for {expression_path}")
                return None

            # Parse expression file
            with open(expression_path, 'r', encoding="utf-8") as f:
                content = yaml.safe_load(f)

            # Extract sample data; an empty file parses to None and an
            # empty "sample:" key maps to None, so fall back to {} for both.
            sample_section = (content or {}).get("sample") or {}
            return sample_section.get(sample_type)

        except Exception as e:
            source = expression_path or expression_name
            log_warning(f"[sample_data] Failed to get sample data from {source}: {e}")
            return None

    def _validate_clean_sample(self, df: pd.DataFrame, expression_name: str) -> List[str]:
        """Validate clean sample data: no missing values, one type per data column."""
        errors = []

        # Check for missing values in clean sample
        if df.isnull().any().any():
            errors.append("Clean sample should not contain missing values")

        # Check for mixed types (should be consistent in clean data)
        for column in df.columns:
            if self._is_metadata_label(column):  # Skip metadata columns
                continue
            if self._has_mixed_types(df[column]):
                errors.append(f"Column '{column}' has mixed data types in clean sample")

        return errors

    def _validate_unclean_sample(self, df: pd.DataFrame, expression_name: str) -> List[str]:
        """Validate unclean sample data: it must show at least one data quality issue."""
        errors = []

        # Unclean samples should have some issues for educational purposes
        has_nulls = df.isnull().any().any()
        has_mixed_types = any(
            self._has_mixed_types(df[column])
            for column in df.columns
            if not self._is_metadata_label(column)  # Skip metadata columns
        )

        # Unclean samples should demonstrate common data issues
        if not has_nulls and not has_mixed_types:
            errors.append("Unclean sample should contain some data quality issues for educational purposes")

        return errors

    def _add_educational_comments(self, df: pd.DataFrame, expression_name: str) -> pd.DataFrame:
        """Return a copy of *df* with an ``_educational_comments`` column.

        Best effort: if annotating fails, the failure is logged and the
        original frame is returned unchanged.
        """
        try:
            comments = []

            for _, row in df.iterrows():
                comment_parts = []

                # Check for missing values
                if row.isnull().any():
                    comment_parts.append("Contains missing values")

                # Check for potential type issues
                for col, val in row.items():
                    if self._is_metadata_label(col):
                        continue
                    if isinstance(val, str) and val.lower() in ['invalid', 'error', 'null']:
                        comment_parts.append(f"'{col}' has invalid value")

                if not comment_parts:
                    comment_parts.append("Valid data row")

                comments.append(" | ".join(comment_parts))

            # Add comments as a new column
            df_with_comments = df.copy()
            df_with_comments['_educational_comments'] = comments

            return df_with_comments

        except Exception as e:
            log_warning(f"[sample_data] Failed to add educational comments: {e}")
            return df

    def _extract_educational_comments(self, sample_data: Dict[str, Any]) -> List[str]:
        """Collect the stored ``_comments`` plus notes derived from the data itself."""
        comments = []

        # Look for comment fields
        if '_comments' in sample_data:
            stored = sample_data['_comments']
            if isinstance(stored, str):
                # A single note; extending with a str would add it char by char
                comments.append(stored)
            elif stored is not None:
                comments.extend(stored)

        # Generate comments based on data patterns
        try:
            df = pd.DataFrame({
                key: values for key, values in sample_data.items()
                if not self._is_metadata_label(key)
            })

            if df.isnull().any().any():
                comments.append("Contains missing values for null handling testing")

            for column in df.columns:
                if self._has_mixed_types(df[column]):
                    comments.append(f"Column '{column}' has mixed types for type validation testing")

        except Exception as e:
            # Derived notes are optional; keep whatever was collected so far
            log_warning(f"[sample_data] Failed to derive educational comments: {e}")

        return comments

    def _generate_default_clean_sample(self, expression_name: str) -> pd.DataFrame:
        """Generate default clean sample data when none exists"""
        return pd.DataFrame({
            "col_a": [1, 2, 3],
            "col_b": [4, 5, 6],
            "_info": [f"Default clean sample for '{expression_name}'"] * 3,
        })

    def _generate_default_unclean_sample(self, expression_name: str) -> pd.DataFrame:
        """Generate default unclean sample data when none exists"""
        return pd.DataFrame({
            "col_a": [1, None, "invalid"],
            "col_b": [4, 5, -999],
            "_educational_comments": [
                "Valid data row",
                "Missing value in col_a",
                "Invalid type in col_a, extreme value in col_b",
            ],
            "_info": [f"Default unclean sample for '{expression_name}'"] * 3,
        })
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
# Module-wide SampleDataManager; stays None until it is first requested
_sample_data_manager = None


def get_sample_data_manager() -> SampleDataManager:
    """Return the shared SampleDataManager, creating it on first use."""
    global _sample_data_manager
    if _sample_data_manager is not None:
        return _sample_data_manager
    _sample_data_manager = SampleDataManager()
    return _sample_data_manager
|
additory/synthetic/__init__.py
DELETED
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Synthetic Module - Synthetic Data Generation Functionality
|
|
3
|
-
|
|
4
|
-
This module provides synthetic data generation capabilities to add synthetic rows
|
|
5
|
-
to existing dataframes or create data from scratch by intelligently sampling
|
|
6
|
-
from existing data patterns.
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
from additory.synthetic.synthesizer import synthetic
|
|
10
|
-
|
|
11
|
-
__all__ = [
|
|
12
|
-
"synthetic"
|
|
13
|
-
]
|
|
additory/synthetic/column_name_resolver.py
DELETED
|
@@ -1,149 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Column Name Resolver for Linked Lists
|
|
3
|
-
|
|
4
|
-
Resolves column names for linked lists using priority order:
|
|
5
|
-
1. Column_Names row (explicit names)
|
|
6
|
-
2. Underscore parsing from list name
|
|
7
|
-
3. Fallback to {strategy_key}_1, {strategy_key}_2, etc.
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
from typing import List, Optional
|
|
11
|
-
import warnings
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def parse_column_names_from_underscores(list_name: str) -> Optional[List[str]]:
    """
    Derive column names by splitting a list name on underscores.

    Empty segments (from leading, trailing or doubled underscores) are
    discarded.

    Args:
        list_name: Name of the list variable (e.g., "AE_CM_SEV")

    Returns:
        The non-empty underscore-separated segments, or None when the name
        contains no underscore or consists only of underscores

    Examples:
        >>> parse_column_names_from_underscores("AE_CM_SEV")
        ['AE', 'CM', 'SEV']

        >>> parse_column_names_from_underscores("adverse_event_medication")
        ['adverse', 'event', 'medication']

        >>> parse_column_names_from_underscores("adverseconmed")
        None
    """
    # Without a delimiter the name carries no per-column information
    if list_name.find('_') < 0:
        return None

    segments = list(filter(None, list_name.split('_')))
    return segments if segments else None
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def generate_fallback_column_names(strategy_key: str, num_columns: int) -> List[str]:
    """
    Build numbered column names for use when no better naming is available.

    Names follow the pattern {strategy_key}_1, {strategy_key}_2, ...

    Args:
        strategy_key: Key from strategy dict (e.g., "col1")
        num_columns: How many names to produce

    Returns:
        List of column names, numbered from 1

    Examples:
        >>> generate_fallback_column_names("col1", 3)
        ['col1_1', 'col1_2', 'col1_3']

        >>> generate_fallback_column_names("adverse_events", 2)
        ['adverse_events_1', 'adverse_events_2']
    """
    return [f"{strategy_key}_{position}" for position in range(1, num_columns + 1)]
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def resolve_column_names(
    list_name: str,
    strategy_key: str,
    num_columns: int,
    explicit_names: Optional[List[str]] = None
) -> List[str]:
    """
    Resolve column names using priority order.

    Priority:
    1. explicit_names (from Column_Names row)
    2. Underscore parsing from list_name
    3. Fallback to {strategy_key}_1, {strategy_key}_2, etc.

    A UserWarning is emitted whenever the fallback naming is used, either
    because the list name has no underscores or because its number of
    underscore-separated parts does not equal num_columns.

    Args:
        list_name: Name of the list variable
        strategy_key: Key from strategy dict
        num_columns: Number of columns to generate
        explicit_names: Explicit column names from Column_Names row (optional)

    Returns:
        List of column names (a new list; the caller's explicit_names is
        never returned by reference)

    Raises:
        ValueError: If explicit_names count doesn't match num_columns

    Examples:
        >>> # Priority 1: Explicit names
        >>> resolve_column_names("AE_CM", "col1", 2, ["adverse_event", "medication"])
        ['adverse_event', 'medication']

        >>> # Priority 2: Underscore parsing
        >>> resolve_column_names("AE_CM_SEV", "col1", 3)
        ['AE', 'CM', 'SEV']

        >>> # Priority 3: Fallback
        >>> resolve_column_names("adverseconmed", "col1", 2)
        ['col1_1', 'col1_2']
    """
    # Priority 1: Explicit names from Column_Names row
    if explicit_names is not None:
        if len(explicit_names) != num_columns:
            raise ValueError(
                f"Column_Names row has {len(explicit_names)} names but "
                f"linked list generates {num_columns} columns. They must match."
            )
        # Copy so later mutation of the result cannot alter the caller's list
        return list(explicit_names)

    # Priority 2: Underscore parsing
    parsed_names = parse_column_names_from_underscores(list_name)
    if parsed_names is not None:
        if len(parsed_names) == num_columns:
            return parsed_names

        # Underscore count doesn't match - fall through to fallback.
        # stacklevel=2 attributes the warning to the calling code.
        warnings.warn(
            f"List name '{list_name}' has {len(parsed_names)} underscore-separated "
            f"parts but generates {num_columns} columns. "
            f"Using fallback naming: {strategy_key}_1, {strategy_key}_2, etc.\n"
            f"Suggestion: Use a list name with {num_columns-1} underscores, "
            f"or add a Column_Names row for explicit naming.",
            UserWarning,
            stacklevel=2,
        )
    else:
        # No underscores - emit warning
        warnings.warn(
            f"List name '{list_name}' has no underscores. "
            f"Using fallback naming: {strategy_key}_1, {strategy_key}_2, etc.\n"
            f"Suggestion: Use underscore-delimited naming (e.g., 'AE_CM_SEV') "
            f"or add a Column_Names row:\n"
            f"  {list_name} = [\n"
            f"    ['Column_Names:[col1,col2,col3]'],\n"
            f"    ['primary', ['attr1'], ['attr2']]\n"
            f"  ]",
            UserWarning,
            stacklevel=2,
        )

    # Priority 3: Fallback
    return generate_fallback_column_names(strategy_key, num_columns)
|