additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -176
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -304
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -850
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a3.dist-info/METADATA +0 -288
- additory-0.1.0a3.dist-info/RECORD +0 -71
- additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/core/registry.py
DELETED
|
@@ -1,176 +0,0 @@
|
|
|
1
|
-
# registry.py
|
|
2
|
-
# Versioned registry for additory
|
|
3
|
-
|
|
4
|
-
from dataclasses import dataclass
|
|
5
|
-
import os
|
|
6
|
-
import json
|
|
7
|
-
|
|
8
|
-
from .logging import log_info, log_warning
|
|
9
|
-
from .config import (
|
|
10
|
-
get_user_formula_root_override,
|
|
11
|
-
get_custom_formula_path,
|
|
12
|
-
get_default_version,
|
|
13
|
-
get_user_version_override,
|
|
14
|
-
)
|
|
15
|
-
|
|
16
|
-
from additory.core.loader import load_expression
|
|
17
|
-
from additory.core.parser import parse_expression
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
# ------------------------------------------------------------
|
|
21
|
-
# Resolved Formula Object
|
|
22
|
-
# ------------------------------------------------------------
|
|
23
|
-
|
|
24
|
-
@dataclass
class ResolvedFormula:
    """Outcome of resolving a formula name to a concrete expression source.

    ``resolve_formula`` creates the instance with ``source`` and ``version``
    and afterwards copies ``ast``, ``sample_clean`` and ``sample_unclean``
    from the parsed expression onto it.
    """

    # Path of the expression: the custom override path, or
    # <root>/<version>/<filename> taken from the manifest.
    source: str
    # Resolved version string, or the literal "custom" for a custom override.
    version: str
    # NOTE(review): never assigned in this module; presumably read by the
    # loader -- confirm what values other than "local" are valid.
    mode: str = "local"
    # Parse result; remains None if nothing was attached (for example when
    # load/parse failed on the versioned path, which only logs a warning).
    ast: dict | None = None
    # Sample sections copied from the parsed expression.
    sample_clean: dict | None = None
    sample_unclean: dict | None = None
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
# ------------------------------------------------------------
|
|
35
|
-
# Manifest loading
|
|
36
|
-
# ------------------------------------------------------------
|
|
37
|
-
|
|
38
|
-
def _load_manifest(root: str, version: str):
|
|
39
|
-
manifest_path = os.path.join(root, version, "manifest.json")
|
|
40
|
-
|
|
41
|
-
if not os.path.exists(manifest_path):
|
|
42
|
-
raise FileNotFoundError(f"Manifest not found for version {version}")
|
|
43
|
-
|
|
44
|
-
with open(manifest_path, "r", encoding="utf-8") as f:
|
|
45
|
-
return json.load(f)
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
# ------------------------------------------------------------
|
|
49
|
-
# Resolve formula filename
|
|
50
|
-
# ------------------------------------------------------------
|
|
51
|
-
|
|
52
|
-
def _resolve_filename(formula_name: str, manifest: dict):
|
|
53
|
-
if formula_name not in manifest:
|
|
54
|
-
raise FileNotFoundError(f"Formula '{formula_name}' not found in manifest")
|
|
55
|
-
|
|
56
|
-
return manifest[formula_name]
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
# ------------------------------------------------------------
|
|
60
|
-
# Main resolver (attaches AST + sample data)
|
|
61
|
-
# ------------------------------------------------------------
|
|
62
|
-
|
|
63
|
-
def _attach_parsed_expression(resolved, namespace):
    """Load, parse and attach the expression that ``resolved`` points to.

    Copies ``ast``, ``sample_clean`` and ``sample_unclean`` from the parse
    result onto ``resolved`` in place. Exceptions raised by
    ``load_expression`` / ``parse_expression`` propagate to the caller.
    """
    text = load_expression(resolved, namespace)
    parsed = parse_expression(text)

    resolved.ast = parsed.ast
    resolved.sample_clean = parsed.sample_clean
    resolved.sample_unclean = parsed.sample_unclean


def resolve_formula(formula_name: str, namespace="builtin", version=None):
    """
    Resolve a formula to a ``ResolvedFormula`` with AST and sample data.

    Resolution order:
        1. Custom formula path, if configured (short-circuits everything
           below; ``formula_name`` and ``version`` are not consulted)
        2. User-set formula root (required otherwise)
        3. Version: explicit argument, then user override, then default
        4. Manifest lookup of the file name for ``formula_name``
        5. Build and check the full expression path
        6. Load + parse the expression, attach AST + sample data

    Args:
        formula_name: Name of the formula to resolve
        namespace: "builtin" or "user" (default: "builtin"); forwarded to
            ``load_expression``
        version: Specific version to use (optional)

    Returns:
        ResolvedFormula object. On the versioned path a load/parse failure
        is only logged, so ``ast`` and the samples may still be ``None``.

    Raises:
        ValueError: If no custom path is set and no formula root is set.
        FileNotFoundError: If the manifest, the manifest entry, or the
            expression file is missing.
        Exception: On the custom path, load/parse errors are not caught.
    """
    # 1. Custom override: load/parse errors propagate on this path
    custom = get_custom_formula_path()
    if custom:
        log_info(f"[registry] Using custom formula path: {custom}")

        resolved = ResolvedFormula(source=custom, version="custom")
        _attach_parsed_expression(resolved, namespace)
        return resolved

    # 2. Root folder
    root = get_user_formula_root_override()
    if not root:
        raise ValueError("Formula root not set. Use add.set_formula_root(path).")

    # 3. Version: explicit argument > user override > default
    version = version or get_user_version_override() or get_default_version()

    # 4. Manifest lookup
    manifest = _load_manifest(root, version)
    filename = _resolve_filename(formula_name, manifest)

    # 5. Full path of the expression file
    full_path = os.path.join(root, version, filename)
    if not os.path.exists(full_path):
        raise FileNotFoundError(f"Expression file not found: {full_path}")

    resolved = ResolvedFormula(source=full_path, version=version)

    # 6. Best effort: a failure here is logged and the partially filled
    #    object is still returned to the caller.
    try:
        _attach_parsed_expression(resolved, namespace)

        if resolved.ast is None:
            log_warning(f"[registry] No AST parsed for '{formula_name}'")

    except Exception as e:
        log_warning(f"[registry] Failed to load/parse '{formula_name}': {e}")

    return resolved
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
# ------------------------------------------------------------
|
|
158
|
-
# Public setters
|
|
159
|
-
# ------------------------------------------------------------
|
|
160
|
-
|
|
161
|
-
def set_formula_root(path: str):
    """Store ``path`` as the user formula root override and log the change."""
    from .config import set_user_formula_root_override as _store_root

    _store_root(path)
    log_info(f"[registry] Formula root set to: {path}")
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
def set_formula_version(v: str):
    """Store ``v`` as the user version override and log the change."""
    from .config import set_user_version_override as _store_version

    _store_version(v)
    log_info(f"[registry] Version override set to: {v}")
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
def set_custom_formula_path(path: str):
    """Store ``path`` as the custom formula path in config and log the change."""
    # Aliased on import: the config setter has the same name as this wrapper.
    from .config import set_custom_formula_path as _store_custom_path

    _store_custom_path(path)
    log_info(f"[registry] Custom formula path set to: {path}")
|
|
additory/core/sample_data_manager.py
DELETED
|
@@ -1,492 +0,0 @@
|
|
|
1
|
-
# sample_data_manager.py
|
|
2
|
-
# Enhanced sample data management for additory expressions
|
|
3
|
-
|
|
4
|
-
import os
|
|
5
|
-
import yaml
|
|
6
|
-
import pandas as pd
|
|
7
|
-
from typing import Dict, List, Optional, Any, Union, Tuple
|
|
8
|
-
from dataclasses import dataclass
|
|
9
|
-
import re
|
|
10
|
-
|
|
11
|
-
from .logging import log_info, log_warning
|
|
12
|
-
from .enhanced_version_manager import EnhancedVersionManager
|
|
13
|
-
from .namespace_manager import NamespaceManager
|
|
14
|
-
from .integrity_manager import IntegrityManager
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@dataclass
class SampleDataInfo:
    """Information about the sample data of one expression.

    Built by ``SampleDataManager.get_sample_info``; when that call fails it
    returns an instance with both ``has_*`` flags False, zero row counts and
    a single entry in ``validation_errors`` describing the failure.
    """
    # Name of the expression the samples belong to.
    expression_name: str
    # Requested version, else the version manager's default version, else
    # "unknown" in the failure fallback.
    version: str
    # True when a "clean" / "unclean" sample section was found.
    has_clean: bool
    has_unclean: bool
    # Row counts of the corresponding samples; 0 when the sample is absent.
    clean_rows: int
    unclean_rows: int
    # Comments extracted from the unclean sample (empty without one).
    educational_comments: List[str]
    # Validation messages, prefixed with "Clean: " or "Unclean: ".
    validation_errors: List[str]
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
class SampleDataError(Exception):
    """Raised when sample data operations fail.

    ``SampleDataManager`` raises it from ``get_clean_sample``,
    ``get_unclean_sample`` and ``create_sample_template``; the message
    embeds the text of the underlying exception.
    """
    pass
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
class SampleDataManager:
    """
    Enhanced sample data management system.

    Provides clean/unclean sample support with educational comments and
    validation. Samples are read from the ``sample`` section of an
    expression's YAML file; columns whose label is a string starting with
    ``_`` are treated as metadata and skipped by the type checks.
    """

    def __init__(self):
        """Create the collaborating managers and the static rule tables."""
        self.version_manager = EnhancedVersionManager()
        self.namespace_manager = NamespaceManager()
        self.integrity_manager = IntegrityManager()

        # Sample data validation rules
        self.validation_rules = {
            "max_rows": 100,  # Maximum rows in sample data
            "min_rows": 1,  # Minimum rows in sample data
            "required_columns": [],  # Will be determined from expression
            "max_column_length": 50,  # Maximum string length in columns
        }

        # Educational comment patterns for unclean data
        self.educational_patterns = {
            "missing_values": "# Missing values to test null handling",
            "invalid_types": "# Invalid data types to test type validation",
            "edge_cases": "# Edge cases to test boundary conditions",
            "malformed_data": "# Malformed data to test error handling",
            "duplicate_values": "# Duplicate values to test deduplication",
            "extreme_values": "# Extreme values to test range validation"
        }

        log_info("[sample_data] Sample Data Manager initialized")

    @staticmethod
    def _is_metadata_column(label: Any) -> bool:
        """Return True for string labels that start with ``_``.

        Non-string labels (e.g. integer column names) are never metadata;
        checking the type first avoids ``AttributeError`` on ``startswith``.
        """
        return isinstance(label, str) and label.startswith('_')

    @staticmethod
    def _has_mixed_types(series: pd.Series) -> bool:
        """Return True if the non-null values have more than one Python type."""
        return len({type(value).__name__ for value in series.dropna()}) > 1

    def get_clean_sample(self, expression_name: str, namespace: str = "builtin",
                         version: Optional[str] = None) -> pd.DataFrame:
        """
        Get clean sample data for an expression.

        Args:
            expression_name: Name of the expression
            namespace: Namespace ("builtin" or "user")
            version: Specific version (optional)

        Returns:
            DataFrame with clean sample data; a generated default sample
            when the expression provides none. Validation issues are only
            logged, the data is returned regardless.

        Raises:
            SampleDataError: If sample data cannot be retrieved
        """
        try:
            sample_data = self._get_sample_data(expression_name, namespace, version, "clean")

            if sample_data is None:
                # Generate default clean sample if none exists
                return self._generate_default_clean_sample(expression_name)

            df = pd.DataFrame(sample_data)

            # Validate clean sample data (advisory only)
            validation_errors = self._validate_clean_sample(df, expression_name)
            if validation_errors:
                log_warning(f"[sample_data] Clean sample validation issues for {expression_name}: {validation_errors}")

            log_info(f"[sample_data] Retrieved clean sample for {expression_name} ({len(df)} rows)")
            return df

        except Exception as e:
            log_warning(f"[sample_data] Failed to get clean sample for {expression_name}: {e}")
            raise SampleDataError(f"Failed to get clean sample data: {e}") from e

    def get_unclean_sample(self, expression_name: str, namespace: str = "builtin",
                           version: Optional[str] = None) -> pd.DataFrame:
        """
        Get unclean sample data with educational comments.

        Args:
            expression_name: Name of the expression
            namespace: Namespace ("builtin" or "user")
            version: Specific version (optional)

        Returns:
            DataFrame with unclean sample data plus an
            ``_educational_comments`` column; a generated default sample
            when the expression provides none.

        Raises:
            SampleDataError: If sample data cannot be retrieved
        """
        try:
            sample_data = self._get_sample_data(expression_name, namespace, version, "unclean")

            if sample_data is None:
                # Generate default unclean sample if none exists
                return self._generate_default_unclean_sample(expression_name)

            df = pd.DataFrame(sample_data)

            # Add educational comments as metadata
            df = self._add_educational_comments(df, expression_name)

            log_info(f"[sample_data] Retrieved unclean sample for {expression_name} ({len(df)} rows)")
            return df

        except Exception as e:
            log_warning(f"[sample_data] Failed to get unclean sample for {expression_name}: {e}")
            raise SampleDataError(f"Failed to get unclean sample data: {e}") from e

    def validate_sample_data(self, sample_data: Dict[str, Any],
                             expression_name: str, sample_type: str = "clean") -> Tuple[bool, List[str]]:
        """
        Validate sample data format and content.

        Args:
            sample_data: Sample data dictionary (column name -> list of values)
            expression_name: Name of the expression
            sample_type: "clean" or "unclean" (anything other than "clean"
                is validated as unclean)

        Returns:
            Tuple of (is_valid, list_of_errors). Never raises: unexpected
            failures are reported as a "Validation error: ..." entry.
        """
        errors = []

        try:
            # Check if sample_data is a dictionary
            if not isinstance(sample_data, dict):
                errors.append("Sample data must be a dictionary")
                return False, errors

            # Check if sample_data has columns
            if not sample_data:
                errors.append("Sample data cannot be empty")
                return False, errors

            # pandas refuses to build a frame from lists of differing
            # length, so name the offending columns before converting
            # instead of surfacing only the generic conversion error.
            list_lengths = {
                column: len(values)
                for column, values in sample_data.items()
                if isinstance(values, list)
            }
            if len(set(list_lengths.values())) > 1:
                expected = next(iter(list_lengths.values()))
                errors.extend(
                    f"Column '{column}' has inconsistent length"
                    for column, length in list_lengths.items()
                    if length != expected
                )
                return False, errors

            # Convert to DataFrame for validation
            try:
                df = pd.DataFrame(sample_data)
            except Exception as e:
                errors.append(f"Cannot convert sample data to DataFrame: {e}")
                return False, errors

            # Validate row count
            row_count = len(df)
            if row_count < self.validation_rules["min_rows"]:
                errors.append(f"Sample data has too few rows: {row_count} < {self.validation_rules['min_rows']}")

            if row_count > self.validation_rules["max_rows"]:
                errors.append(f"Sample data has too many rows: {row_count} > {self.validation_rules['max_rows']}")

            # Validate column content
            max_length = self.validation_rules["max_column_length"]
            for column, values in sample_data.items():
                if not isinstance(values, list):
                    errors.append(f"Column '{column}' must be a list")
                    continue

                # Check string length limits
                for i, value in enumerate(values):
                    if isinstance(value, str) and len(value) > max_length:
                        errors.append(f"Column '{column}' row {i} exceeds max length")

            # Specific validation for clean vs unclean samples
            if sample_type == "clean":
                errors.extend(self._validate_clean_sample(df, expression_name))
            else:
                errors.extend(self._validate_unclean_sample(df, expression_name))

            is_valid = len(errors) == 0

            if is_valid:
                log_info(f"[sample_data] Sample data validation passed for {expression_name}")
            else:
                log_warning(f"[sample_data] Sample data validation failed for {expression_name}: {errors}")

            return is_valid, errors

        except Exception as e:
            errors.append(f"Validation error: {e}")
            return False, errors

    def get_sample_info(self, expression_name: str, namespace: str = "builtin",
                        version: Optional[str] = None) -> "SampleDataInfo":
        """
        Get comprehensive information about sample data.

        Args:
            expression_name: Name of the expression
            namespace: Namespace ("builtin" or "user")
            version: Specific version (optional)

        Returns:
            SampleDataInfo object. Never raises: on failure an "empty" info
            object carrying the error message is returned instead.
        """
        try:
            # Get sample data
            clean_data = self._get_sample_data(expression_name, namespace, version, "clean")
            unclean_data = self._get_sample_data(expression_name, namespace, version, "unclean")

            # Analyze clean sample
            has_clean = clean_data is not None
            clean_rows = len(pd.DataFrame(clean_data)) if has_clean else 0

            # Analyze unclean sample
            has_unclean = unclean_data is not None
            unclean_rows = len(pd.DataFrame(unclean_data)) if has_unclean else 0

            # Extract educational comments
            educational_comments = []
            if has_unclean:
                educational_comments = self._extract_educational_comments(unclean_data)

            # Validate samples
            validation_errors = []
            if has_clean:
                _, clean_errors = self.validate_sample_data(clean_data, expression_name, "clean")
                validation_errors.extend([f"Clean: {err}" for err in clean_errors])

            if has_unclean:
                _, unclean_errors = self.validate_sample_data(unclean_data, expression_name, "unclean")
                validation_errors.extend([f"Unclean: {err}" for err in unclean_errors])

            return SampleDataInfo(
                expression_name=expression_name,
                version=version or self.version_manager.default_version,
                has_clean=has_clean,
                has_unclean=has_unclean,
                clean_rows=clean_rows,
                unclean_rows=unclean_rows,
                educational_comments=educational_comments,
                validation_errors=validation_errors
            )

        except Exception as e:
            log_warning(f"[sample_data] Failed to get sample info for {expression_name}: {e}")
            return SampleDataInfo(
                expression_name=expression_name,
                version=version or "unknown",
                has_clean=False,
                has_unclean=False,
                clean_rows=0,
                unclean_rows=0,
                educational_comments=[],
                validation_errors=[f"Failed to get sample info: {e}"]
            )

    def create_sample_template(self, expression_name: str, columns: List[str]) -> Dict[str, Dict[str, Any]]:
        """
        Create a template for sample data.

        Args:
            expression_name: Name of the expression (used for logging only)
            columns: List of required columns

        Returns:
            Dictionary with "clean" and "unclean" sample templates. The
            clean template has three placeholder rows per column, the
            unclean one four. The unclean template also carries a
            ``_comments`` list with five entries (a header line plus one
            note per row), so it is not row-aligned with the data columns.

        Raises:
            SampleDataError: If the template cannot be built
        """
        try:
            # Create clean sample template
            clean_template = {
                column: [f"sample_{column}_1", f"sample_{column}_2", f"sample_{column}_3"]
                for column in columns
            }

            # Create unclean sample template with educational comments
            unclean_template = {
                column: [
                    f"valid_{column}",
                    None,  # Missing value
                    f"invalid_{column}_type",
                    f"extreme_{column}_value"
                ]
                for column in columns
            }

            # Add educational comments
            unclean_template["_comments"] = [
                "# This is unclean sample data for testing error handling",
                "# Row 1: Valid data",
                "# Row 2: Missing values (None/null)",
                "# Row 3: Invalid data types",
                "# Row 4: Extreme or edge case values"
            ]

            template = {
                "clean": clean_template,
                "unclean": unclean_template
            }

            log_info(f"[sample_data] Created sample template for {expression_name}")
            return template

        except Exception as e:
            log_warning(f"[sample_data] Failed to create sample template for {expression_name}: {e}")
            raise SampleDataError(f"Failed to create sample template: {e}") from e

    def _get_sample_data(self, expression_name: str, namespace: str,
                         version: Optional[str], sample_type: str) -> Optional[Dict[str, Any]]:
        """Get raw sample data from the expression file.

        Returns the ``sample.<sample_type>`` section of the expression's
        YAML file, or None when the file is missing, fails the integrity
        check, is empty, has no such section, or cannot be read (the last
        case is logged as a warning).
        """
        # Bound before the try block so the except handler can always
        # format it, even when the path lookup itself raises.
        expression_path = None
        try:
            # Get expression file path
            expression_path = self.namespace_manager.get_expression_file_path(
                namespace, expression_name, version
            )

            if not expression_path or not os.path.exists(expression_path):
                return None

            # Validate integrity
            if not self.integrity_manager.validate_integrity(expression_path):
                log_warning(f"[sample_data] Integrity validation failed for {expression_path}")
                return None

            # Parse expression file (explicit encoding: do not depend on
            # the platform's locale default)
            with open(expression_path, 'r', encoding="utf-8") as f:
                content = yaml.safe_load(f)

            # safe_load yields None for an empty document
            if content is None:
                return None

            # Extract sample data; a null "sample:" entry counts as absent
            sample_section = content.get("sample") or {}
            return sample_section.get(sample_type)

        except Exception as e:
            log_warning(f"[sample_data] Failed to get sample data from {expression_path}: {e}")
            return None

    def _validate_clean_sample(self, df: pd.DataFrame, expression_name: str) -> List[str]:
        """Validate clean sample data.

        Flags any missing value in the frame and any non-metadata column
        whose non-null values have more than one Python type. Returns the
        list of error messages (empty when the sample is clean).
        """
        errors = []

        # Check for missing values in clean sample
        if df.isnull().any().any():
            errors.append("Clean sample should not contain missing values")

        # Check for mixed types (should be consistent in clean data)
        for column in df.columns:
            if self._is_metadata_column(column):  # Skip metadata columns
                continue

            if self._has_mixed_types(df[column]):
                errors.append(f"Column '{column}' has mixed data types in clean sample")

        return errors

    def _validate_unclean_sample(self, df: pd.DataFrame, expression_name: str) -> List[str]:
        """Validate unclean sample data.

        An unclean sample is expected to contain at least one missing value
        or one non-metadata column with mixed types; otherwise a single
        error message is returned.
        """
        errors = []

        # Unclean samples should have some issues for educational purposes
        has_nulls = df.isnull().any().any()
        has_mixed_types = any(
            self._has_mixed_types(df[column])
            for column in df.columns
            if not self._is_metadata_column(column)  # Skip metadata columns
        )

        # Unclean samples should demonstrate common data issues
        if not has_nulls and not has_mixed_types:
            errors.append("Unclean sample should contain some data quality issues for educational purposes")

        return errors

    def _add_educational_comments(self, df: pd.DataFrame, expression_name: str) -> pd.DataFrame:
        """Add educational comments to unclean sample data.

        Returns a copy of ``df`` with an ``_educational_comments`` column
        describing each row ("Contains missing values", "'<col>' has invalid
        value", or "Valid data row"). On any failure the original frame is
        returned unchanged and a warning is logged.
        """
        try:
            # Add a comments column with educational information
            comments = []

            for _, row in df.iterrows():
                comment_parts = []

                # Check for missing values
                if row.isnull().any():
                    comment_parts.append("Contains missing values")

                # Check for potential type issues
                for col, val in row.items():
                    if self._is_metadata_column(col):
                        continue
                    if isinstance(val, str) and val.lower() in ['invalid', 'error', 'null']:
                        comment_parts.append(f"'{col}' has invalid value")

                if not comment_parts:
                    comment_parts.append("Valid data row")

                comments.append(" | ".join(comment_parts))

            # Add comments as a new column
            df_with_comments = df.copy()
            df_with_comments['_educational_comments'] = comments

            return df_with_comments

        except Exception as e:
            log_warning(f"[sample_data] Failed to add educational comments: {e}")
            return df

    def _extract_educational_comments(self, sample_data: Dict[str, Any]) -> List[str]:
        """Extract educational comments from sample data.

        Combines the explicit ``_comments`` entry (if any) with comments
        derived from the data itself (missing values, mixed-type columns).
        """
        comments = []

        # Look for comment fields. A single string is one comment (not a
        # sequence of characters) and a null entry means "no comments".
        explicit = sample_data.get('_comments') if '_comments' in sample_data else None
        if isinstance(explicit, str):
            comments.append(explicit)
        elif isinstance(explicit, (list, tuple)):
            comments.extend(explicit)
        elif explicit is not None:
            comments.append(str(explicit))

        # Generate comments based on data patterns
        try:
            df = pd.DataFrame({
                k: v for k, v in sample_data.items()
                if not self._is_metadata_column(k)
            })

            if df.isnull().any().any():
                comments.append("Contains missing values for null handling testing")

            for column in df.columns:
                if self._has_mixed_types(df[column]):
                    comments.append(f"Column '{column}' has mixed types for type validation testing")

        except Exception:
            # Deliberate best effort: derived comments are optional, so a
            # sample that cannot be framed still yields its explicit ones.
            pass

        return comments

    def _generate_default_clean_sample(self, expression_name: str) -> pd.DataFrame:
        """Generate default clean sample data when none exists."""
        return pd.DataFrame({
            "col_a": [1, 2, 3],
            "col_b": [4, 5, 6],
            "_info": [f"Default clean sample for '{expression_name}'"] * 3
        })

    def _generate_default_unclean_sample(self, expression_name: str) -> pd.DataFrame:
        """Generate default unclean sample data when none exists."""
        return pd.DataFrame({
            "col_a": [1, None, "invalid"],
            "col_b": [4, 5, -999],
            "_educational_comments": [
                "Valid data row",
                "Missing value in col_a",
                "Invalid type in col_a, extreme value in col_b"
            ],
            "_info": [f"Default unclean sample for '{expression_name}'"] * 3
        })
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
# Module-level SampleDataManager instance, created lazily on first request
_sample_data_manager = None


def get_sample_data_manager() -> SampleDataManager:
    """Return the module-level SampleDataManager, creating it on first use."""
    global _sample_data_manager
    if _sample_data_manager is not None:
        return _sample_data_manager

    _sample_data_manager = SampleDataManager()
    return _sample_data_manager
|