additory 0.1.0a3-py3-none-any.whl → 0.1.1a1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -176
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -304
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/distributions.py +0 -22
  100. additory/synthetic/forecast.py +0 -1132
  101. additory/synthetic/linked_list_parser.py +0 -415
  102. additory/synthetic/namespace_lookup.py +0 -129
  103. additory/synthetic/smote.py +0 -320
  104. additory/synthetic/strategies.py +0 -850
  105. additory/synthetic/synthesizer.py +0 -713
  106. additory/utilities/__init__.py +0 -53
  107. additory/utilities/encoding.py +0 -600
  108. additory/utilities/games.py +0 -300
  109. additory/utilities/keys.py +0 -8
  110. additory/utilities/lookup.py +0 -103
  111. additory/utilities/matchers.py +0 -216
  112. additory/utilities/resolvers.py +0 -286
  113. additory/utilities/settings.py +0 -167
  114. additory/utilities/units.py +0 -749
  115. additory/utilities/validators.py +0 -153
  116. additory-0.1.0a3.dist-info/METADATA +0 -288
  117. additory-0.1.0a3.dist-info/RECORD +0 -71
  118. additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
  119. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  120. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/core/registry.py DELETED
@@ -1,176 +0,0 @@
- # registry.py
- # Versioned registry for additory
-
- from dataclasses import dataclass
- import os
- import json
-
- from .logging import log_info, log_warning
- from .config import (
-     get_user_formula_root_override,
-     get_custom_formula_path,
-     get_default_version,
-     get_user_version_override,
- )
-
- from additory.core.loader import load_expression
- from additory.core.parser import parse_expression
-
-
- # ------------------------------------------------------------
- # Resolved Formula Object
- # ------------------------------------------------------------
-
- @dataclass
- class ResolvedFormula:
-     source: str
-     version: str
-     mode: str = "local"
-     ast: dict | None = None
-     sample_clean: dict | None = None
-     sample_unclean: dict | None = None
-
-
- # ------------------------------------------------------------
- # Manifest loading
- # ------------------------------------------------------------
-
- def _load_manifest(root: str, version: str):
-     manifest_path = os.path.join(root, version, "manifest.json")
-
-     if not os.path.exists(manifest_path):
-         raise FileNotFoundError(f"Manifest not found for version {version}")
-
-     with open(manifest_path, "r", encoding="utf-8") as f:
-         return json.load(f)
-
-
- # ------------------------------------------------------------
- # Resolve formula filename
- # ------------------------------------------------------------
-
- def _resolve_filename(formula_name: str, manifest: dict):
-     if formula_name not in manifest:
-         raise FileNotFoundError(f"Formula '{formula_name}' not found in manifest")
-
-     return manifest[formula_name]
-
-
- # ------------------------------------------------------------
- # Main resolver (attaches AST + sample data)
- # ------------------------------------------------------------
-
- def resolve_formula(formula_name: str, namespace="builtin", version=None):
-     """
-     Versioned resolver:
-     1. Custom override path
-     2. User-set formula root
-     3. Version override
-     4. Default version
-     5. Manifest lookup
-     6. Load + parse expression
-     7. Attach AST + sample data
-
-     Args:
-         formula_name: Name of the formula to resolve
-         namespace: "builtin" or "user" (default: "builtin")
-         version: Specific version to use (optional)
-
-     Returns:
-         ResolvedFormula object with AST and sample data
-     """
-
-     # --------------------------------------------------------
-     # 1. Custom override (FIXED)
-     # --------------------------------------------------------
-     custom = get_custom_formula_path()
-     if custom:
-         log_info(f"[registry] Using custom formula path: {custom}")
-
-         resolved = ResolvedFormula(source=custom, version="custom")
-
-         text = load_expression(resolved, namespace)
-         parsed = parse_expression(text)
-
-         resolved.ast = parsed.ast
-         resolved.sample_clean = parsed.sample_clean
-         resolved.sample_unclean = parsed.sample_unclean
-
-         return resolved
-
-     # --------------------------------------------------------
-     # 2. Root folder
-     # --------------------------------------------------------
-     root = get_user_formula_root_override()
-     if not root:
-         raise ValueError("Formula root not set. Use add.set_formula_root(path).")
-
-     # --------------------------------------------------------
-     # 3. Version override
-     # --------------------------------------------------------
-     version = (
-         version
-         or get_user_version_override()
-         or get_default_version()
-     )
-
-     # --------------------------------------------------------
-     # 4. Load manifest
-     # --------------------------------------------------------
-     manifest = _load_manifest(root, version)
-
-     # --------------------------------------------------------
-     # 5. Resolve filename
-     # --------------------------------------------------------
-     filename = _resolve_filename(formula_name, manifest)
-
-     # --------------------------------------------------------
-     # 6. Build full path
-     # --------------------------------------------------------
-     full_path = os.path.join(root, version, filename)
-
-     if not os.path.exists(full_path):
-         raise FileNotFoundError(f"Expression file not found: {full_path}")
-
-     resolved = ResolvedFormula(source=full_path, version=version)
-
-     # --------------------------------------------------------
-     # 7. Load + parse + attach AST + sample data
-     # --------------------------------------------------------
-     try:
-         text = load_expression(resolved, namespace)
-         parsed = parse_expression(text)
-
-         resolved.ast = parsed.ast
-         resolved.sample_clean = parsed.sample_clean
-         resolved.sample_unclean = parsed.sample_unclean
-
-         if resolved.ast is None:
-             log_warning(f"[registry] No AST parsed for '{formula_name}'")
-
-     except Exception as e:
-         log_warning(f"[registry] Failed to load/parse '{formula_name}': {e}")
-
-     return resolved
-
-
- # ------------------------------------------------------------
- # Public setters
- # ------------------------------------------------------------
-
- def set_formula_root(path: str):
-     from .config import set_user_formula_root_override
-     set_user_formula_root_override(path)
-     log_info(f"[registry] Formula root set to: {path}")
-
-
- def set_formula_version(v: str):
-     from .config import set_user_version_override
-     set_user_version_override(v)
-     log_info(f"[registry] Version override set to: {v}")
-
-
- def set_custom_formula_path(path: str):
-     from .config import set_custom_formula_path
-     set_custom_formula_path(path)
-     log_info(f"[registry] Custom formula path set to: {path}")
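The registry above was the 0.1.0a3 entry point for resolving versioned formula files. A minimal usage sketch of that API, reconstructed from the deleted code; the root path, version label, and formula name "bmi" are placeholders, not values shipped with the package:

    from additory.core.registry import resolve_formula, set_formula_root, set_formula_version

    set_formula_root("/path/to/formulas")   # placeholder root; must contain <version>/manifest.json
    set_formula_version("v1")               # optional: pins which version folder the manifest is read from

    resolved = resolve_formula("bmi", namespace="builtin")
    print(resolved.source, resolved.version)
    print(resolved.ast is not None, resolved.sample_clean)

Note that resolve_formula logs and swallows load/parse failures, so callers had to check resolved.ast for None rather than rely on an exception. This module is gone in 0.1.1a1; the new additory/expressions/loader.py and additory/expressions/resolver.py listed above presumably take over this role.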
additory/core/sample_data_manager.py DELETED
@@ -1,492 +0,0 @@
- # sample_data_manager.py
- # Enhanced sample data management for additory expressions
-
- import os
- import yaml
- import pandas as pd
- from typing import Dict, List, Optional, Any, Union, Tuple
- from dataclasses import dataclass
- import re
-
- from .logging import log_info, log_warning
- from .enhanced_version_manager import EnhancedVersionManager
- from .namespace_manager import NamespaceManager
- from .integrity_manager import IntegrityManager
-
-
- @dataclass
- class SampleDataInfo:
-     """Information about sample data"""
-     expression_name: str
-     version: str
-     has_clean: bool
-     has_unclean: bool
-     clean_rows: int
-     unclean_rows: int
-     educational_comments: List[str]
-     validation_errors: List[str]
-
-
- class SampleDataError(Exception):
-     """Raised when sample data operations fail"""
-     pass
-
-
- class SampleDataManager:
-     """
-     Enhanced sample data management system
-     Provides clean/unclean sample support with educational comments and validation
-     """
-
-     def __init__(self):
-         self.version_manager = EnhancedVersionManager()
-         self.namespace_manager = NamespaceManager()
-         self.integrity_manager = IntegrityManager()
-
-         # Sample data validation rules
-         self.validation_rules = {
-             "max_rows": 100, # Maximum rows in sample data
-             "min_rows": 1, # Minimum rows in sample data
-             "required_columns": [], # Will be determined from expression
-             "max_column_length": 50, # Maximum string length in columns
-         }
-
-         # Educational comment patterns for unclean data
-         self.educational_patterns = {
-             "missing_values": "# Missing values to test null handling",
-             "invalid_types": "# Invalid data types to test type validation",
-             "edge_cases": "# Edge cases to test boundary conditions",
-             "malformed_data": "# Malformed data to test error handling",
-             "duplicate_values": "# Duplicate values to test deduplication",
-             "extreme_values": "# Extreme values to test range validation"
-         }
-
-         log_info("[sample_data] Sample Data Manager initialized")
-
-     def get_clean_sample(self, expression_name: str, namespace: str = "builtin",
-                          version: Optional[str] = None) -> pd.DataFrame:
-         """
-         Get clean sample data for an expression
-
-         Args:
-             expression_name: Name of the expression
-             namespace: Namespace ("builtin" or "user")
-             version: Specific version (optional)
-
-         Returns:
-             DataFrame with clean sample data
-
-         Raises:
-             SampleDataError: If sample data cannot be retrieved
-         """
-         try:
-             sample_data = self._get_sample_data(expression_name, namespace, version, "clean")
-
-             if sample_data is None:
-                 # Generate default clean sample if none exists
-                 return self._generate_default_clean_sample(expression_name)
-
-             df = pd.DataFrame(sample_data)
-
-             # Validate clean sample data
-             validation_errors = self._validate_clean_sample(df, expression_name)
-             if validation_errors:
-                 log_warning(f"[sample_data] Clean sample validation issues for {expression_name}: {validation_errors}")
-
-             log_info(f"[sample_data] Retrieved clean sample for {expression_name} ({len(df)} rows)")
-             return df
-
-         except Exception as e:
-             log_warning(f"[sample_data] Failed to get clean sample for {expression_name}: {e}")
-             raise SampleDataError(f"Failed to get clean sample data: {e}")
-
-     def get_unclean_sample(self, expression_name: str, namespace: str = "builtin",
-                            version: Optional[str] = None) -> pd.DataFrame:
-         """
-         Get unclean sample data with educational comments
-
-         Args:
-             expression_name: Name of the expression
-             namespace: Namespace ("builtin" or "user")
-             version: Specific version (optional)
-
-         Returns:
-             DataFrame with unclean sample data and educational comments
-
-         Raises:
-             SampleDataError: If sample data cannot be retrieved
-         """
-         try:
-             sample_data = self._get_sample_data(expression_name, namespace, version, "unclean")
-
-             if sample_data is None:
-                 # Generate default unclean sample if none exists
-                 return self._generate_default_unclean_sample(expression_name)
-
-             df = pd.DataFrame(sample_data)
-
-             # Add educational comments as metadata
-             df = self._add_educational_comments(df, expression_name)
-
-             log_info(f"[sample_data] Retrieved unclean sample for {expression_name} ({len(df)} rows)")
-             return df
-
-         except Exception as e:
-             log_warning(f"[sample_data] Failed to get unclean sample for {expression_name}: {e}")
-             raise SampleDataError(f"Failed to get unclean sample data: {e}")
-
-     def validate_sample_data(self, sample_data: Dict[str, Any],
-                              expression_name: str, sample_type: str = "clean") -> Tuple[bool, List[str]]:
-         """
-         Validate sample data format and content
-
-         Args:
-             sample_data: Sample data dictionary
-             expression_name: Name of the expression
-             sample_type: "clean" or "unclean"
-
-         Returns:
-             Tuple of (is_valid, list_of_errors)
-         """
-         errors = []
-
-         try:
-             # Check if sample_data is a dictionary
-             if not isinstance(sample_data, dict):
-                 errors.append("Sample data must be a dictionary")
-                 return False, errors
-
-             # Check if sample_data has columns
-             if not sample_data:
-                 errors.append("Sample data cannot be empty")
-                 return False, errors
-
-             # Convert to DataFrame for validation
-             try:
-                 df = pd.DataFrame(sample_data)
-             except Exception as e:
-                 errors.append(f"Cannot convert sample data to DataFrame: {e}")
-                 return False, errors
-
-             # Validate row count
-             row_count = len(df)
-             if row_count < self.validation_rules["min_rows"]:
-                 errors.append(f"Sample data has too few rows: {row_count} < {self.validation_rules['min_rows']}")
-
-             if row_count > self.validation_rules["max_rows"]:
-                 errors.append(f"Sample data has too many rows: {row_count} > {self.validation_rules['max_rows']}")
-
-             # Validate column content
-             for column, values in sample_data.items():
-                 if not isinstance(values, list):
-                     errors.append(f"Column '{column}' must be a list")
-                     continue
-
-                 # Check for consistent length
-                 if len(values) != row_count:
-                     errors.append(f"Column '{column}' has inconsistent length")
-
-                 # Check string length limits
-                 for i, value in enumerate(values):
-                     if isinstance(value, str) and len(value) > self.validation_rules["max_column_length"]:
-                         errors.append(f"Column '{column}' row {i} exceeds max length")
-
-             # Specific validation for clean vs unclean samples
-             if sample_type == "clean":
-                 errors.extend(self._validate_clean_sample(df, expression_name))
-             else:
-                 errors.extend(self._validate_unclean_sample(df, expression_name))
-
-             is_valid = len(errors) == 0
-
-             if is_valid:
-                 log_info(f"[sample_data] Sample data validation passed for {expression_name}")
-             else:
-                 log_warning(f"[sample_data] Sample data validation failed for {expression_name}: {errors}")
-
-             return is_valid, errors
-
-         except Exception as e:
-             errors.append(f"Validation error: {e}")
-             return False, errors
-
-     def get_sample_info(self, expression_name: str, namespace: str = "builtin",
-                         version: Optional[str] = None) -> SampleDataInfo:
-         """
-         Get comprehensive information about sample data
-
-         Args:
-             expression_name: Name of the expression
-             namespace: Namespace ("builtin" or "user")
-             version: Specific version (optional)
-
-         Returns:
-             SampleDataInfo object with comprehensive information
-         """
-         try:
-             # Get sample data
-             clean_data = self._get_sample_data(expression_name, namespace, version, "clean")
-             unclean_data = self._get_sample_data(expression_name, namespace, version, "unclean")
-
-             # Analyze clean sample
-             has_clean = clean_data is not None
-             clean_rows = len(pd.DataFrame(clean_data)) if has_clean else 0
-
-             # Analyze unclean sample
-             has_unclean = unclean_data is not None
-             unclean_rows = len(pd.DataFrame(unclean_data)) if has_unclean else 0
-
-             # Extract educational comments
-             educational_comments = []
-             if has_unclean:
-                 educational_comments = self._extract_educational_comments(unclean_data)
-
-             # Validate samples
-             validation_errors = []
-             if has_clean:
-                 _, clean_errors = self.validate_sample_data(clean_data, expression_name, "clean")
-                 validation_errors.extend([f"Clean: {err}" for err in clean_errors])
-
-             if has_unclean:
-                 _, unclean_errors = self.validate_sample_data(unclean_data, expression_name, "unclean")
-                 validation_errors.extend([f"Unclean: {err}" for err in unclean_errors])
-
-             return SampleDataInfo(
-                 expression_name=expression_name,
-                 version=version or self.version_manager.default_version,
-                 has_clean=has_clean,
-                 has_unclean=has_unclean,
-                 clean_rows=clean_rows,
-                 unclean_rows=unclean_rows,
-                 educational_comments=educational_comments,
-                 validation_errors=validation_errors
-             )
-
-         except Exception as e:
-             log_warning(f"[sample_data] Failed to get sample info for {expression_name}: {e}")
-             return SampleDataInfo(
-                 expression_name=expression_name,
-                 version=version or "unknown",
-                 has_clean=False,
-                 has_unclean=False,
-                 clean_rows=0,
-                 unclean_rows=0,
-                 educational_comments=[],
-                 validation_errors=[f"Failed to get sample info: {e}"]
-             )
-
-     def create_sample_template(self, expression_name: str, columns: List[str]) -> Dict[str, Dict[str, Any]]:
-         """
-         Create a template for sample data
-
-         Args:
-             expression_name: Name of the expression
-             columns: List of required columns
-
-         Returns:
-             Dictionary with clean and unclean sample templates
-         """
-         try:
-             # Create clean sample template
-             clean_template = {}
-             for column in columns:
-                 clean_template[column] = [f"sample_{column}_1", f"sample_{column}_2", f"sample_{column}_3"]
-
-             # Create unclean sample template with educational comments
-             unclean_template = {}
-             for column in columns:
-                 unclean_template[column] = [
-                     f"valid_{column}",
-                     None, # Missing value
-                     f"invalid_{column}_type",
-                     f"extreme_{column}_value"
-                 ]
-
-             # Add educational comments
-             unclean_template["_comments"] = [
-                 "# This is unclean sample data for testing error handling",
-                 "# Row 1: Valid data",
-                 "# Row 2: Missing values (None/null)",
-                 "# Row 3: Invalid data types",
-                 "# Row 4: Extreme or edge case values"
-             ]
-
-             template = {
-                 "clean": clean_template,
-                 "unclean": unclean_template
-             }
-
-             log_info(f"[sample_data] Created sample template for {expression_name}")
-             return template
-
-         except Exception as e:
-             log_warning(f"[sample_data] Failed to create sample template for {expression_name}: {e}")
-             raise SampleDataError(f"Failed to create sample template: {e}")
-
-     def _get_sample_data(self, expression_name: str, namespace: str,
-                          version: Optional[str], sample_type: str) -> Optional[Dict[str, Any]]:
-         """Get raw sample data from expression file"""
-         try:
-             # Get expression file path
-             expression_path = self.namespace_manager.get_expression_file_path(
-                 namespace, expression_name, version
-             )
-
-             if not expression_path or not os.path.exists(expression_path):
-                 return None
-
-             # Validate integrity
-             if not self.integrity_manager.validate_integrity(expression_path):
-                 log_warning(f"[sample_data] Integrity validation failed for {expression_path}")
-                 return None
-
-             # Parse expression file
-             with open(expression_path, 'r') as f:
-                 content = yaml.safe_load(f)
-
-             # Extract sample data
-             sample_section = content.get("sample", {})
-             return sample_section.get(sample_type)
-
-         except Exception as e:
-             log_warning(f"[sample_data] Failed to get sample data from {expression_path}: {e}")
-             return None
-
-     def _validate_clean_sample(self, df: pd.DataFrame, expression_name: str) -> List[str]:
-         """Validate clean sample data"""
-         errors = []
-
-         # Check for missing values in clean sample
-         if df.isnull().any().any():
-             errors.append("Clean sample should not contain missing values")
-
-         # Check for reasonable data types
-         for column in df.columns:
-             if column.startswith('_'): # Skip metadata columns
-                 continue
-
-             series = df[column]
-
-             # Check for mixed types (should be consistent in clean data)
-             unique_types = set(type(x).__name__ for x in series.dropna())
-             if len(unique_types) > 1:
-                 errors.append(f"Column '{column}' has mixed data types in clean sample")
-
-         return errors
-
-     def _validate_unclean_sample(self, df: pd.DataFrame, expression_name: str) -> List[str]:
-         """Validate unclean sample data"""
-         errors = []
-
-         # Unclean samples should have some issues for educational purposes
-         has_nulls = df.isnull().any().any()
-         has_mixed_types = False
-
-         for column in df.columns:
-             if column.startswith('_'): # Skip metadata columns
-                 continue
-
-             series = df[column]
-             unique_types = set(type(x).__name__ for x in series.dropna())
-             if len(unique_types) > 1:
-                 has_mixed_types = True
-                 break
-
-         # Unclean samples should demonstrate common data issues
-         if not has_nulls and not has_mixed_types:
-             errors.append("Unclean sample should contain some data quality issues for educational purposes")
-
-         return errors
-
-     def _add_educational_comments(self, df: pd.DataFrame, expression_name: str) -> pd.DataFrame:
-         """Add educational comments to unclean sample data"""
-         try:
-             # Add a comments column with educational information
-             comments = []
-
-             for i, row in df.iterrows():
-                 comment_parts = []
-
-                 # Check for missing values
-                 if row.isnull().any():
-                     comment_parts.append("Contains missing values")
-
-                 # Check for potential type issues
-                 for col, val in row.items():
-                     if col.startswith('_'):
-                         continue
-                     if isinstance(val, str) and val.lower() in ['invalid', 'error', 'null']:
-                         comment_parts.append(f"'{col}' has invalid value")
-
-                 if not comment_parts:
-                     comment_parts.append("Valid data row")
-
-                 comments.append(" | ".join(comment_parts))
-
-             # Add comments as a new column
-             df_with_comments = df.copy()
-             df_with_comments['_educational_comments'] = comments
-
-             return df_with_comments
-
-         except Exception as e:
-             log_warning(f"[sample_data] Failed to add educational comments: {e}")
-             return df
-
-     def _extract_educational_comments(self, sample_data: Dict[str, Any]) -> List[str]:
-         """Extract educational comments from sample data"""
-         comments = []
-
-         # Look for comment fields
-         if '_comments' in sample_data:
-             comments.extend(sample_data['_comments'])
-
-         # Generate comments based on data patterns
-         try:
-             df = pd.DataFrame({k: v for k, v in sample_data.items() if not k.startswith('_')})
-
-             if df.isnull().any().any():
-                 comments.append("Contains missing values for null handling testing")
-
-             for column in df.columns:
-                 series = df[column]
-                 unique_types = set(type(x).__name__ for x in series.dropna())
-                 if len(unique_types) > 1:
-                     comments.append(f"Column '{column}' has mixed types for type validation testing")
-
-         except Exception:
-             pass # Ignore errors in comment extraction
-
-         return comments
-
-     def _generate_default_clean_sample(self, expression_name: str) -> pd.DataFrame:
-         """Generate default clean sample data when none exists"""
-         return pd.DataFrame({
-             "col_a": [1, 2, 3],
-             "col_b": [4, 5, 6],
-             "_info": [f"Default clean sample for '{expression_name}'"] * 3
-         })
-
-     def _generate_default_unclean_sample(self, expression_name: str) -> pd.DataFrame:
-         """Generate default unclean sample data when none exists"""
-         return pd.DataFrame({
-             "col_a": [1, None, "invalid"],
-             "col_b": [4, 5, -999],
-             "_educational_comments": [
-                 "Valid data row",
-                 "Missing value in col_a",
-                 "Invalid type in col_a, extreme value in col_b"
-             ],
-             "_info": [f"Default unclean sample for '{expression_name}'"] * 3
-         })
-
-
- # Global sample data manager instance
- _sample_data_manager = None
-
- def get_sample_data_manager() -> SampleDataManager:
-     """Get the global sample data manager instance"""
-     global _sample_data_manager
-     if _sample_data_manager is None:
-         _sample_data_manager = SampleDataManager()
-     return _sample_data_manager
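The sample-data subsystem above was consumed through a module-level singleton rather than by constructing SampleDataManager directly. A short sketch of how the 0.1.0a3 manager was driven, assuming an expression named "bmi" exists in the builtin namespace (the name is a placeholder):

    from additory.core.sample_data_manager import get_sample_data_manager

    mgr = get_sample_data_manager()             # lazily creates the shared SampleDataManager
    clean_df = mgr.get_clean_sample("bmi")      # pandas DataFrame; falls back to a default 3-row sample
    unclean_df = mgr.get_unclean_sample("bmi")  # gains an _educational_comments column
    info = mgr.get_sample_info("bmi")
    print(info.has_clean, info.clean_rows, info.validation_errors)

Since additory.core.sample_data_manager does not exist in 0.1.1a1, downstream imports of it will fail after the upgrade.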