additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/core/registry.py DELETED
@@ -1,177 +0,0 @@
1
- # registry.py
2
- # Versioned registry for additory
3
-
4
- from dataclasses import dataclass
5
- from typing import Optional
6
- import os
7
- import json
8
-
9
- from .logging import log_info, log_warning
10
- from .config import (
11
- get_user_formula_root_override,
12
- get_custom_formula_path,
13
- get_default_version,
14
- get_user_version_override,
15
- )
16
-
17
- from additory.core.loader import load_expression
18
- from additory.core.parser import parse_expression
19
-
20
-
21
- # ------------------------------------------------------------
22
- # Resolved Formula Object
23
- # ------------------------------------------------------------
24
-
25
- @dataclass
26
- class ResolvedFormula:
27
- source: str
28
- version: str
29
- mode: str = "local"
30
- ast: Optional[dict] = None
31
- sample_clean: Optional[dict] = None
32
- sample_unclean: Optional[dict] = None
33
-
34
-
35
- # ------------------------------------------------------------
36
- # Manifest loading
37
- # ------------------------------------------------------------
38
-
39
- def _load_manifest(root: str, version: str):
40
- manifest_path = os.path.join(root, version, "manifest.json")
41
-
42
- if not os.path.exists(manifest_path):
43
- raise FileNotFoundError(f"Manifest not found for version {version}")
44
-
45
- with open(manifest_path, "r", encoding="utf-8") as f:
46
- return json.load(f)
47
-
48
-
49
- # ------------------------------------------------------------
50
- # Resolve formula filename
51
- # ------------------------------------------------------------
52
-
53
- def _resolve_filename(formula_name: str, manifest: dict):
54
- if formula_name not in manifest:
55
- raise FileNotFoundError(f"Formula '{formula_name}' not found in manifest")
56
-
57
- return manifest[formula_name]
58
-
59
-
60
- # ------------------------------------------------------------
61
- # Main resolver (attaches AST + sample data)
62
- # ------------------------------------------------------------
63
-
64
- def resolve_formula(formula_name: str, namespace="builtin", version=None):
65
- """
66
- Versioned resolver:
67
- 1. Custom override path
68
- 2. User-set formula root
69
- 3. Version override
70
- 4. Default version
71
- 5. Manifest lookup
72
- 6. Load + parse expression
73
- 7. Attach AST + sample data
74
-
75
- Args:
76
- formula_name: Name of the formula to resolve
77
- namespace: "builtin" or "user" (default: "builtin")
78
- version: Specific version to use (optional)
79
-
80
- Returns:
81
- ResolvedFormula object with AST and sample data
82
- """
83
-
84
- # --------------------------------------------------------
85
- # 1. Custom override (FIXED)
86
- # --------------------------------------------------------
87
- custom = get_custom_formula_path()
88
- if custom:
89
- log_info(f"[registry] Using custom formula path: {custom}")
90
-
91
- resolved = ResolvedFormula(source=custom, version="custom")
92
-
93
- text = load_expression(resolved, namespace)
94
- parsed = parse_expression(text)
95
-
96
- resolved.ast = parsed.ast
97
- resolved.sample_clean = parsed.sample_clean
98
- resolved.sample_unclean = parsed.sample_unclean
99
-
100
- return resolved
101
-
102
- # --------------------------------------------------------
103
- # 2. Root folder
104
- # --------------------------------------------------------
105
- root = get_user_formula_root_override()
106
- if not root:
107
- raise ValueError("Formula root not set. Use add.set_formula_root(path).")
108
-
109
- # --------------------------------------------------------
110
- # 3. Version override
111
- # --------------------------------------------------------
112
- version = (
113
- version
114
- or get_user_version_override()
115
- or get_default_version()
116
- )
117
-
118
- # --------------------------------------------------------
119
- # 4. Load manifest
120
- # --------------------------------------------------------
121
- manifest = _load_manifest(root, version)
122
-
123
- # --------------------------------------------------------
124
- # 5. Resolve filename
125
- # --------------------------------------------------------
126
- filename = _resolve_filename(formula_name, manifest)
127
-
128
- # --------------------------------------------------------
129
- # 6. Build full path
130
- # --------------------------------------------------------
131
- full_path = os.path.join(root, version, filename)
132
-
133
- if not os.path.exists(full_path):
134
- raise FileNotFoundError(f"Expression file not found: {full_path}")
135
-
136
- resolved = ResolvedFormula(source=full_path, version=version)
137
-
138
- # --------------------------------------------------------
139
- # 7. Load + parse + attach AST + sample data
140
- # --------------------------------------------------------
141
- try:
142
- text = load_expression(resolved, namespace)
143
- parsed = parse_expression(text)
144
-
145
- resolved.ast = parsed.ast
146
- resolved.sample_clean = parsed.sample_clean
147
- resolved.sample_unclean = parsed.sample_unclean
148
-
149
- if resolved.ast is None:
150
- log_warning(f"[registry] No AST parsed for '{formula_name}'")
151
-
152
- except Exception as e:
153
- log_warning(f"[registry] Failed to load/parse '{formula_name}': {e}")
154
-
155
- return resolved
156
-
157
-
158
- # ------------------------------------------------------------
159
- # Public setters
160
- # ------------------------------------------------------------
161
-
162
- def set_formula_root(path: str):
163
- from .config import set_user_formula_root_override
164
- set_user_formula_root_override(path)
165
- log_info(f"[registry] Formula root set to: {path}")
166
-
167
-
168
- def set_formula_version(v: str):
169
- from .config import set_user_version_override
170
- set_user_version_override(v)
171
- log_info(f"[registry] Version override set to: {v}")
172
-
173
-
174
- def set_custom_formula_path(path: str):
175
- from .config import set_custom_formula_path
176
- set_custom_formula_path(path)
177
- log_info(f"[registry] Custom formula path set to: {path}")
additory/core/sample_data_manager.py DELETED
@@ -1,492 +0,0 @@
1
- # sample_data_manager.py
2
- # Enhanced sample data management for additory expressions
3
-
4
- import os
5
- import yaml
6
- import pandas as pd
7
- from typing import Dict, List, Optional, Any, Union, Tuple
8
- from dataclasses import dataclass
9
- import re
10
-
11
- from .logging import log_info, log_warning
12
- from .enhanced_version_manager import EnhancedVersionManager
13
- from .namespace_manager import NamespaceManager
14
- from .integrity_manager import IntegrityManager
15
-
16
-
17
- @dataclass
18
- class SampleDataInfo:
19
- """Information about sample data"""
20
- expression_name: str
21
- version: str
22
- has_clean: bool
23
- has_unclean: bool
24
- clean_rows: int
25
- unclean_rows: int
26
- educational_comments: List[str]
27
- validation_errors: List[str]
28
-
29
-
30
- class SampleDataError(Exception):
31
- """Raised when sample data operations fail"""
32
- pass
33
-
34
-
35
- class SampleDataManager:
36
- """
37
- Enhanced sample data management system
38
- Provides clean/unclean sample support with educational comments and validation
39
- """
40
-
41
- def __init__(self):
42
- self.version_manager = EnhancedVersionManager()
43
- self.namespace_manager = NamespaceManager()
44
- self.integrity_manager = IntegrityManager()
45
-
46
- # Sample data validation rules
47
- self.validation_rules = {
48
- "max_rows": 100, # Maximum rows in sample data
49
- "min_rows": 1, # Minimum rows in sample data
50
- "required_columns": [], # Will be determined from expression
51
- "max_column_length": 50, # Maximum string length in columns
52
- }
53
-
54
- # Educational comment patterns for unclean data
55
- self.educational_patterns = {
56
- "missing_values": "# Missing values to test null handling",
57
- "invalid_types": "# Invalid data types to test type validation",
58
- "edge_cases": "# Edge cases to test boundary conditions",
59
- "malformed_data": "# Malformed data to test error handling",
60
- "duplicate_values": "# Duplicate values to test deduplication",
61
- "extreme_values": "# Extreme values to test range validation"
62
- }
63
-
64
- log_info("[sample_data] Sample Data Manager initialized")
65
-
66
- def get_clean_sample(self, expression_name: str, namespace: str = "builtin",
67
- version: Optional[str] = None) -> pd.DataFrame:
68
- """
69
- Get clean sample data for an expression
70
-
71
- Args:
72
- expression_name: Name of the expression
73
- namespace: Namespace ("builtin" or "user")
74
- version: Specific version (optional)
75
-
76
- Returns:
77
- DataFrame with clean sample data
78
-
79
- Raises:
80
- SampleDataError: If sample data cannot be retrieved
81
- """
82
- try:
83
- sample_data = self._get_sample_data(expression_name, namespace, version, "clean")
84
-
85
- if sample_data is None:
86
- # Generate default clean sample if none exists
87
- return self._generate_default_clean_sample(expression_name)
88
-
89
- df = pd.DataFrame(sample_data)
90
-
91
- # Validate clean sample data
92
- validation_errors = self._validate_clean_sample(df, expression_name)
93
- if validation_errors:
94
- log_warning(f"[sample_data] Clean sample validation issues for {expression_name}: {validation_errors}")
95
-
96
- log_info(f"[sample_data] Retrieved clean sample for {expression_name} ({len(df)} rows)")
97
- return df
98
-
99
- except Exception as e:
100
- log_warning(f"[sample_data] Failed to get clean sample for {expression_name}: {e}")
101
- raise SampleDataError(f"Failed to get clean sample data: {e}")
102
-
103
- def get_unclean_sample(self, expression_name: str, namespace: str = "builtin",
104
- version: Optional[str] = None) -> pd.DataFrame:
105
- """
106
- Get unclean sample data with educational comments
107
-
108
- Args:
109
- expression_name: Name of the expression
110
- namespace: Namespace ("builtin" or "user")
111
- version: Specific version (optional)
112
-
113
- Returns:
114
- DataFrame with unclean sample data and educational comments
115
-
116
- Raises:
117
- SampleDataError: If sample data cannot be retrieved
118
- """
119
- try:
120
- sample_data = self._get_sample_data(expression_name, namespace, version, "unclean")
121
-
122
- if sample_data is None:
123
- # Generate default unclean sample if none exists
124
- return self._generate_default_unclean_sample(expression_name)
125
-
126
- df = pd.DataFrame(sample_data)
127
-
128
- # Add educational comments as metadata
129
- df = self._add_educational_comments(df, expression_name)
130
-
131
- log_info(f"[sample_data] Retrieved unclean sample for {expression_name} ({len(df)} rows)")
132
- return df
133
-
134
- except Exception as e:
135
- log_warning(f"[sample_data] Failed to get unclean sample for {expression_name}: {e}")
136
- raise SampleDataError(f"Failed to get unclean sample data: {e}")
137
-
138
- def validate_sample_data(self, sample_data: Dict[str, Any],
139
- expression_name: str, sample_type: str = "clean") -> Tuple[bool, List[str]]:
140
- """
141
- Validate sample data format and content
142
-
143
- Args:
144
- sample_data: Sample data dictionary
145
- expression_name: Name of the expression
146
- sample_type: "clean" or "unclean"
147
-
148
- Returns:
149
- Tuple of (is_valid, list_of_errors)
150
- """
151
- errors = []
152
-
153
- try:
154
- # Check if sample_data is a dictionary
155
- if not isinstance(sample_data, dict):
156
- errors.append("Sample data must be a dictionary")
157
- return False, errors
158
-
159
- # Check if sample_data has columns
160
- if not sample_data:
161
- errors.append("Sample data cannot be empty")
162
- return False, errors
163
-
164
- # Convert to DataFrame for validation
165
- try:
166
- df = pd.DataFrame(sample_data)
167
- except Exception as e:
168
- errors.append(f"Cannot convert sample data to DataFrame: {e}")
169
- return False, errors
170
-
171
- # Validate row count
172
- row_count = len(df)
173
- if row_count < self.validation_rules["min_rows"]:
174
- errors.append(f"Sample data has too few rows: {row_count} < {self.validation_rules['min_rows']}")
175
-
176
- if row_count > self.validation_rules["max_rows"]:
177
- errors.append(f"Sample data has too many rows: {row_count} > {self.validation_rules['max_rows']}")
178
-
179
- # Validate column content
180
- for column, values in sample_data.items():
181
- if not isinstance(values, list):
182
- errors.append(f"Column '{column}' must be a list")
183
- continue
184
-
185
- # Check for consistent length
186
- if len(values) != row_count:
187
- errors.append(f"Column '{column}' has inconsistent length")
188
-
189
- # Check string length limits
190
- for i, value in enumerate(values):
191
- if isinstance(value, str) and len(value) > self.validation_rules["max_column_length"]:
192
- errors.append(f"Column '{column}' row {i} exceeds max length")
193
-
194
- # Specific validation for clean vs unclean samples
195
- if sample_type == "clean":
196
- errors.extend(self._validate_clean_sample(df, expression_name))
197
- else:
198
- errors.extend(self._validate_unclean_sample(df, expression_name))
199
-
200
- is_valid = len(errors) == 0
201
-
202
- if is_valid:
203
- log_info(f"[sample_data] Sample data validation passed for {expression_name}")
204
- else:
205
- log_warning(f"[sample_data] Sample data validation failed for {expression_name}: {errors}")
206
-
207
- return is_valid, errors
208
-
209
- except Exception as e:
210
- errors.append(f"Validation error: {e}")
211
- return False, errors
212
-
213
- def get_sample_info(self, expression_name: str, namespace: str = "builtin",
214
- version: Optional[str] = None) -> SampleDataInfo:
215
- """
216
- Get comprehensive information about sample data
217
-
218
- Args:
219
- expression_name: Name of the expression
220
- namespace: Namespace ("builtin" or "user")
221
- version: Specific version (optional)
222
-
223
- Returns:
224
- SampleDataInfo object with comprehensive information
225
- """
226
- try:
227
- # Get sample data
228
- clean_data = self._get_sample_data(expression_name, namespace, version, "clean")
229
- unclean_data = self._get_sample_data(expression_name, namespace, version, "unclean")
230
-
231
- # Analyze clean sample
232
- has_clean = clean_data is not None
233
- clean_rows = len(pd.DataFrame(clean_data)) if has_clean else 0
234
-
235
- # Analyze unclean sample
236
- has_unclean = unclean_data is not None
237
- unclean_rows = len(pd.DataFrame(unclean_data)) if has_unclean else 0
238
-
239
- # Extract educational comments
240
- educational_comments = []
241
- if has_unclean:
242
- educational_comments = self._extract_educational_comments(unclean_data)
243
-
244
- # Validate samples
245
- validation_errors = []
246
- if has_clean:
247
- _, clean_errors = self.validate_sample_data(clean_data, expression_name, "clean")
248
- validation_errors.extend([f"Clean: {err}" for err in clean_errors])
249
-
250
- if has_unclean:
251
- _, unclean_errors = self.validate_sample_data(unclean_data, expression_name, "unclean")
252
- validation_errors.extend([f"Unclean: {err}" for err in unclean_errors])
253
-
254
- return SampleDataInfo(
255
- expression_name=expression_name,
256
- version=version or self.version_manager.default_version,
257
- has_clean=has_clean,
258
- has_unclean=has_unclean,
259
- clean_rows=clean_rows,
260
- unclean_rows=unclean_rows,
261
- educational_comments=educational_comments,
262
- validation_errors=validation_errors
263
- )
264
-
265
- except Exception as e:
266
- log_warning(f"[sample_data] Failed to get sample info for {expression_name}: {e}")
267
- return SampleDataInfo(
268
- expression_name=expression_name,
269
- version=version or "unknown",
270
- has_clean=False,
271
- has_unclean=False,
272
- clean_rows=0,
273
- unclean_rows=0,
274
- educational_comments=[],
275
- validation_errors=[f"Failed to get sample info: {e}"]
276
- )
277
-
278
- def create_sample_template(self, expression_name: str, columns: List[str]) -> Dict[str, Dict[str, Any]]:
279
- """
280
- Create a template for sample data
281
-
282
- Args:
283
- expression_name: Name of the expression
284
- columns: List of required columns
285
-
286
- Returns:
287
- Dictionary with clean and unclean sample templates
288
- """
289
- try:
290
- # Create clean sample template
291
- clean_template = {}
292
- for column in columns:
293
- clean_template[column] = [f"sample_{column}_1", f"sample_{column}_2", f"sample_{column}_3"]
294
-
295
- # Create unclean sample template with educational comments
296
- unclean_template = {}
297
- for column in columns:
298
- unclean_template[column] = [
299
- f"valid_{column}",
300
- None, # Missing value
301
- f"invalid_{column}_type",
302
- f"extreme_{column}_value"
303
- ]
304
-
305
- # Add educational comments
306
- unclean_template["_comments"] = [
307
- "# This is unclean sample data for testing error handling",
308
- "# Row 1: Valid data",
309
- "# Row 2: Missing values (None/null)",
310
- "# Row 3: Invalid data types",
311
- "# Row 4: Extreme or edge case values"
312
- ]
313
-
314
- template = {
315
- "clean": clean_template,
316
- "unclean": unclean_template
317
- }
318
-
319
- log_info(f"[sample_data] Created sample template for {expression_name}")
320
- return template
321
-
322
- except Exception as e:
323
- log_warning(f"[sample_data] Failed to create sample template for {expression_name}: {e}")
324
- raise SampleDataError(f"Failed to create sample template: {e}")
325
-
326
- def _get_sample_data(self, expression_name: str, namespace: str,
327
- version: Optional[str], sample_type: str) -> Optional[Dict[str, Any]]:
328
- """Get raw sample data from expression file"""
329
- try:
330
- # Get expression file path
331
- expression_path = self.namespace_manager.get_expression_file_path(
332
- namespace, expression_name, version
333
- )
334
-
335
- if not expression_path or not os.path.exists(expression_path):
336
- return None
337
-
338
- # Validate integrity
339
- if not self.integrity_manager.validate_integrity(expression_path):
340
- log_warning(f"[sample_data] Integrity validation failed for {expression_path}")
341
- return None
342
-
343
- # Parse expression file
344
- with open(expression_path, 'r') as f:
345
- content = yaml.safe_load(f)
346
-
347
- # Extract sample data
348
- sample_section = content.get("sample", {})
349
- return sample_section.get(sample_type)
350
-
351
- except Exception as e:
352
- log_warning(f"[sample_data] Failed to get sample data from {expression_path}: {e}")
353
- return None
354
-
355
- def _validate_clean_sample(self, df: pd.DataFrame, expression_name: str) -> List[str]:
356
- """Validate clean sample data"""
357
- errors = []
358
-
359
- # Check for missing values in clean sample
360
- if df.isnull().any().any():
361
- errors.append("Clean sample should not contain missing values")
362
-
363
- # Check for reasonable data types
364
- for column in df.columns:
365
- if column.startswith('_'): # Skip metadata columns
366
- continue
367
-
368
- series = df[column]
369
-
370
- # Check for mixed types (should be consistent in clean data)
371
- unique_types = set(type(x).__name__ for x in series.dropna())
372
- if len(unique_types) > 1:
373
- errors.append(f"Column '{column}' has mixed data types in clean sample")
374
-
375
- return errors
376
-
377
- def _validate_unclean_sample(self, df: pd.DataFrame, expression_name: str) -> List[str]:
378
- """Validate unclean sample data"""
379
- errors = []
380
-
381
- # Unclean samples should have some issues for educational purposes
382
- has_nulls = df.isnull().any().any()
383
- has_mixed_types = False
384
-
385
- for column in df.columns:
386
- if column.startswith('_'): # Skip metadata columns
387
- continue
388
-
389
- series = df[column]
390
- unique_types = set(type(x).__name__ for x in series.dropna())
391
- if len(unique_types) > 1:
392
- has_mixed_types = True
393
- break
394
-
395
- # Unclean samples should demonstrate common data issues
396
- if not has_nulls and not has_mixed_types:
397
- errors.append("Unclean sample should contain some data quality issues for educational purposes")
398
-
399
- return errors
400
-
401
- def _add_educational_comments(self, df: pd.DataFrame, expression_name: str) -> pd.DataFrame:
402
- """Add educational comments to unclean sample data"""
403
- try:
404
- # Add a comments column with educational information
405
- comments = []
406
-
407
- for i, row in df.iterrows():
408
- comment_parts = []
409
-
410
- # Check for missing values
411
- if row.isnull().any():
412
- comment_parts.append("Contains missing values")
413
-
414
- # Check for potential type issues
415
- for col, val in row.items():
416
- if col.startswith('_'):
417
- continue
418
- if isinstance(val, str) and val.lower() in ['invalid', 'error', 'null']:
419
- comment_parts.append(f"'{col}' has invalid value")
420
-
421
- if not comment_parts:
422
- comment_parts.append("Valid data row")
423
-
424
- comments.append(" | ".join(comment_parts))
425
-
426
- # Add comments as a new column
427
- df_with_comments = df.copy()
428
- df_with_comments['_educational_comments'] = comments
429
-
430
- return df_with_comments
431
-
432
- except Exception as e:
433
- log_warning(f"[sample_data] Failed to add educational comments: {e}")
434
- return df
435
-
436
- def _extract_educational_comments(self, sample_data: Dict[str, Any]) -> List[str]:
437
- """Extract educational comments from sample data"""
438
- comments = []
439
-
440
- # Look for comment fields
441
- if '_comments' in sample_data:
442
- comments.extend(sample_data['_comments'])
443
-
444
- # Generate comments based on data patterns
445
- try:
446
- df = pd.DataFrame({k: v for k, v in sample_data.items() if not k.startswith('_')})
447
-
448
- if df.isnull().any().any():
449
- comments.append("Contains missing values for null handling testing")
450
-
451
- for column in df.columns:
452
- series = df[column]
453
- unique_types = set(type(x).__name__ for x in series.dropna())
454
- if len(unique_types) > 1:
455
- comments.append(f"Column '{column}' has mixed types for type validation testing")
456
-
457
- except Exception:
458
- pass # Ignore errors in comment extraction
459
-
460
- return comments
461
-
462
- def _generate_default_clean_sample(self, expression_name: str) -> pd.DataFrame:
463
- """Generate default clean sample data when none exists"""
464
- return pd.DataFrame({
465
- "col_a": [1, 2, 3],
466
- "col_b": [4, 5, 6],
467
- "_info": [f"Default clean sample for '{expression_name}'"] * 3
468
- })
469
-
470
- def _generate_default_unclean_sample(self, expression_name: str) -> pd.DataFrame:
471
- """Generate default unclean sample data when none exists"""
472
- return pd.DataFrame({
473
- "col_a": [1, None, "invalid"],
474
- "col_b": [4, 5, -999],
475
- "_educational_comments": [
476
- "Valid data row",
477
- "Missing value in col_a",
478
- "Invalid type in col_a, extreme value in col_b"
479
- ],
480
- "_info": [f"Default unclean sample for '{expression_name}'"] * 3
481
- })
482
-
483
-
484
- # Global sample data manager instance
485
- _sample_data_manager = None
486
-
487
- def get_sample_data_manager() -> SampleDataManager:
488
- """Get the global sample data manager instance"""
489
- global _sample_data_manager
490
- if _sample_data_manager is None:
491
- _sample_data_manager = SampleDataManager()
492
- return _sample_data_manager