xelytics-core 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- xelytics_core-0.1.0/PKG-INFO +104 -0
- xelytics_core-0.1.0/README.md +70 -0
- xelytics_core-0.1.0/pyproject.toml +64 -0
- xelytics_core-0.1.0/setup.cfg +4 -0
- xelytics_core-0.1.0/tests/test_core.py +167 -0
- xelytics_core-0.1.0/tests/test_golden.py +178 -0
- xelytics_core-0.1.0/tests/test_llm.py +151 -0
- xelytics_core-0.1.0/tests/test_stats.py +225 -0
- xelytics_core-0.1.0/tests/test_viz_insights.py +181 -0
- xelytics_core-0.1.0/xelytics/__init__.py +16 -0
- xelytics_core-0.1.0/xelytics/__version__.py +3 -0
- xelytics_core-0.1.0/xelytics/cli/__init__.py +5 -0
- xelytics_core-0.1.0/xelytics/cli/main.py +149 -0
- xelytics_core-0.1.0/xelytics/core/__init__.py +14 -0
- xelytics_core-0.1.0/xelytics/core/features.py +359 -0
- xelytics_core-0.1.0/xelytics/core/ingestion.py +135 -0
- xelytics_core-0.1.0/xelytics/core/profiler.py +174 -0
- xelytics_core-0.1.0/xelytics/engine.py +139 -0
- xelytics_core-0.1.0/xelytics/exceptions.py +78 -0
- xelytics_core-0.1.0/xelytics/insights/__init__.py +10 -0
- xelytics_core-0.1.0/xelytics/insights/rules.py +203 -0
- xelytics_core-0.1.0/xelytics/llm/__init__.py +14 -0
- xelytics_core-0.1.0/xelytics/llm/narrator.py +135 -0
- xelytics_core-0.1.0/xelytics/llm/provider.py +125 -0
- xelytics_core-0.1.0/xelytics/llm/providers/__init__.py +8 -0
- xelytics_core-0.1.0/xelytics/llm/providers/openai.py +160 -0
- xelytics_core-0.1.0/xelytics/schemas/__init__.py +26 -0
- xelytics_core-0.1.0/xelytics/schemas/config.py +51 -0
- xelytics_core-0.1.0/xelytics/schemas/inputs.py +47 -0
- xelytics_core-0.1.0/xelytics/schemas/metadata.py +121 -0
- xelytics_core-0.1.0/xelytics/schemas/outputs.py +368 -0
- xelytics_core-0.1.0/xelytics/stats/__init__.py +24 -0
- xelytics_core-0.1.0/xelytics/stats/engine.py +213 -0
- xelytics_core-0.1.0/xelytics/stats/planner.py +309 -0
- xelytics_core-0.1.0/xelytics/stats/tests.py +378 -0
- xelytics_core-0.1.0/xelytics/viz/__init__.py +12 -0
- xelytics_core-0.1.0/xelytics/viz/generator.py +249 -0
- xelytics_core-0.1.0/xelytics/viz/selector.py +147 -0
- xelytics_core-0.1.0/xelytics_core.egg-info/PKG-INFO +104 -0
- xelytics_core-0.1.0/xelytics_core.egg-info/SOURCES.txt +42 -0
- xelytics_core-0.1.0/xelytics_core.egg-info/dependency_links.txt +1 -0
- xelytics_core-0.1.0/xelytics_core.egg-info/entry_points.txt +2 -0
- xelytics_core-0.1.0/xelytics_core.egg-info/requires.txt +19 -0
- xelytics_core-0.1.0/xelytics_core.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: xelytics-core
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Pure analytics engine for statistical analysis and insight generation
|
|
5
|
+
Author: Xelytics Team
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Requires-Python: >=3.9
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: pandas>=2.1.0
|
|
18
|
+
Requires-Dist: numpy>=1.24.0
|
|
19
|
+
Requires-Dist: scipy>=1.11.0
|
|
20
|
+
Requires-Dist: scikit-learn>=1.3.0
|
|
21
|
+
Requires-Dist: statsmodels>=0.14.0
|
|
22
|
+
Requires-Dist: pingouin>=0.5.3
|
|
23
|
+
Requires-Dist: plotly>=5.17.0
|
|
24
|
+
Provides-Extra: llm
|
|
25
|
+
Requires-Dist: openai>=1.6.0; extra == "llm"
|
|
26
|
+
Requires-Dist: groq>=0.4.0; extra == "llm"
|
|
27
|
+
Requires-Dist: httpx>=0.25.0; extra == "llm"
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
32
|
+
Requires-Dist: black>=23.11.0; extra == "dev"
|
|
33
|
+
Requires-Dist: mypy>=1.7.0; extra == "dev"
|
|
34
|
+
|
|
35
|
+
# Xelytics-Core
|
|
36
|
+
|
|
37
|
+
**Pure analytics engine for statistical analysis and insight generation.**
|
|
38
|
+
|
|
39
|
+
## Installation
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install -e .
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Quick Start
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from xelytics import analyze, AnalysisConfig
|
|
49
|
+
import pandas as pd
|
|
50
|
+
|
|
51
|
+
# Load your data
|
|
52
|
+
df = pd.read_csv("data.csv")
|
|
53
|
+
|
|
54
|
+
# Run automated analysis
|
|
55
|
+
result = analyze(df, mode="automated")
|
|
56
|
+
|
|
57
|
+
# Access results
|
|
58
|
+
print(f"Analyzed {result.summary.row_count} rows")
|
|
59
|
+
print(f"Found {len(result.statistics)} statistical tests")
|
|
60
|
+
print(f"Generated {len(result.visualizations)} visualizations")
|
|
61
|
+
print(f"Produced {len(result.insights)} insights")
|
|
62
|
+
|
|
63
|
+
# Export to JSON
|
|
64
|
+
json_output = result.to_json()
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## API Contract
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
from xelytics import analyze, AnalysisConfig, AnalysisResult
|
|
71
|
+
|
|
72
|
+
result = analyze(
|
|
73
|
+
data=df,
|
|
74
|
+
mode="automated", # or "semi-automated"
|
|
75
|
+
config=AnalysisConfig(
|
|
76
|
+
significance_level=0.05,
|
|
77
|
+
enable_llm_insights=True,
|
|
78
|
+
max_visualizations=10,
|
|
79
|
+
)
|
|
80
|
+
)
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Output Schema
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
AnalysisResult(
|
|
87
|
+
summary=DatasetSummary(...),
|
|
88
|
+
statistics=[StatisticalTestResult(...), ...],
|
|
89
|
+
visualizations=[VisualizationSpec(...), ...],
|
|
90
|
+
insights=[Insight(...), ...],
|
|
91
|
+
metadata=RunMetadata(...),
|
|
92
|
+
)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Design Principles
|
|
96
|
+
|
|
97
|
+
1. **Pure analytics engine** - No HTTP, no database, no auth
|
|
98
|
+
2. **Deterministic** - Same input = same output
|
|
99
|
+
3. **LLM is optional** - Rule-based insights work without LLM
|
|
100
|
+
4. **Type-safe** - All inputs/outputs are typed dataclasses
|
|
101
|
+
|
|
102
|
+
## License
|
|
103
|
+
|
|
104
|
+
MIT
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
# Xelytics-Core
|
|
2
|
+
|
|
3
|
+
**Pure analytics engine for statistical analysis and insight generation.**
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install -e .
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from xelytics import analyze, AnalysisConfig
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
# Load your data
|
|
18
|
+
df = pd.read_csv("data.csv")
|
|
19
|
+
|
|
20
|
+
# Run automated analysis
|
|
21
|
+
result = analyze(df, mode="automated")
|
|
22
|
+
|
|
23
|
+
# Access results
|
|
24
|
+
print(f"Analyzed {result.summary.row_count} rows")
|
|
25
|
+
print(f"Found {len(result.statistics)} statistical tests")
|
|
26
|
+
print(f"Generated {len(result.visualizations)} visualizations")
|
|
27
|
+
print(f"Produced {len(result.insights)} insights")
|
|
28
|
+
|
|
29
|
+
# Export to JSON
|
|
30
|
+
json_output = result.to_json()
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## API Contract
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from xelytics import analyze, AnalysisConfig, AnalysisResult
|
|
37
|
+
|
|
38
|
+
result = analyze(
|
|
39
|
+
data=df,
|
|
40
|
+
mode="automated", # or "semi-automated"
|
|
41
|
+
config=AnalysisConfig(
|
|
42
|
+
significance_level=0.05,
|
|
43
|
+
enable_llm_insights=True,
|
|
44
|
+
max_visualizations=10,
|
|
45
|
+
)
|
|
46
|
+
)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Output Schema
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
AnalysisResult(
|
|
53
|
+
summary=DatasetSummary(...),
|
|
54
|
+
statistics=[StatisticalTestResult(...), ...],
|
|
55
|
+
visualizations=[VisualizationSpec(...), ...],
|
|
56
|
+
insights=[Insight(...), ...],
|
|
57
|
+
metadata=RunMetadata(...),
|
|
58
|
+
)
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Design Principles
|
|
62
|
+
|
|
63
|
+
1. **Pure analytics engine** - No HTTP, no database, no auth
|
|
64
|
+
2. **Deterministic** - Same input = same output
|
|
65
|
+
3. **LLM is optional** - Rule-based insights work without LLM
|
|
66
|
+
4. **Type-safe** - All inputs/outputs are typed dataclasses
|
|
67
|
+
|
|
68
|
+
## License
|
|
69
|
+
|
|
70
|
+
MIT
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "xelytics-core"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Pure analytics engine for statistical analysis and insight generation"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.9"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Xelytics Team"}
|
|
14
|
+
]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 3 - Alpha",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.9",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
dependencies = [
|
|
27
|
+
"pandas>=2.1.0",
|
|
28
|
+
"numpy>=1.24.0",
|
|
29
|
+
"scipy>=1.11.0",
|
|
30
|
+
"scikit-learn>=1.3.0",
|
|
31
|
+
"statsmodels>=0.14.0",
|
|
32
|
+
"pingouin>=0.5.3",
|
|
33
|
+
"plotly>=5.17.0",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.optional-dependencies]
|
|
37
|
+
llm = [
|
|
38
|
+
"openai>=1.6.0",
|
|
39
|
+
"groq>=0.4.0",
|
|
40
|
+
"httpx>=0.25.0",
|
|
41
|
+
]
|
|
42
|
+
dev = [
|
|
43
|
+
"pytest>=7.4.0",
|
|
44
|
+
"pytest-asyncio>=0.21.0",
|
|
45
|
+
"pytest-cov>=4.1.0",
|
|
46
|
+
"black>=23.11.0",
|
|
47
|
+
"mypy>=1.7.0",
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
[project.scripts]
|
|
51
|
+
xelytics = "xelytics.cli.main:main"
|
|
52
|
+
|
|
53
|
+
[tool.setuptools.packages.find]
|
|
54
|
+
where = ["."]
|
|
55
|
+
include = ["xelytics*"]
|
|
56
|
+
|
|
57
|
+
[tool.black]
|
|
58
|
+
line-length = 100
|
|
59
|
+
target-version = ['py39', 'py310', 'py311', 'py312']
|
|
60
|
+
|
|
61
|
+
[tool.mypy]
|
|
62
|
+
python_version = "3.9"
|
|
63
|
+
warn_return_any = true
|
|
64
|
+
warn_unused_configs = true
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Unit tests for core modules.
|
|
2
|
+
|
|
3
|
+
Tests ingestion, profiling, and feature detection.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import pytest
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
from xelytics.core.ingestion import DataIngestion, IngestionResult
|
|
11
|
+
from xelytics.core.profiler import DataProfiler, ProfileResult
|
|
12
|
+
from xelytics.core.features import FeatureDetector, FeatureDetectionResult
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class TestDataIngestion:
    """Checks for ``DataIngestion.ingest`` on valid and invalid input."""

    def test_ingest_valid_dataframe(self, sample_mixed_df):
        """A well-formed DataFrame is ingested and its shape is reported."""
        outcome = DataIngestion().ingest(sample_mixed_df)

        assert isinstance(outcome, IngestionResult)
        assert outcome.row_count == 100
        assert outcome.column_count == 5
        assert len(outcome.column_dtypes) == 5

    def test_ingest_empty_dataframe_raises(self):
        """An empty DataFrame is rejected with a ValueError."""
        loader = DataIngestion()
        empty_frame = pd.DataFrame()
        with pytest.raises(ValueError, match="cannot be empty"):
            loader.ingest(empty_frame)

    def test_ingest_invalid_type_raises(self):
        """Input that is not a DataFrame is rejected with a ValueError."""
        loader = DataIngestion()
        not_a_frame = [1, 2, 3]
        with pytest.raises(ValueError, match="must be a pandas DataFrame"):
            loader.ingest(not_a_frame)  # type: ignore

    def test_type_normalization(self):
        """String-typed numeric and date columns are accepted by ingest."""
        raw_columns = {
            'numeric_str': ['1', '2', '3', '4', '5'],
            'date_str': ['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05'],
        }
        outcome = DataIngestion().ingest(pd.DataFrame(raw_columns))

        # Only the row count is verified here; the resulting column dtypes
        # are not asserted by this test.
        assert outcome.row_count == 5
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class TestDataProfiler:
    """Checks for ``DataProfiler.profile`` across column kinds."""

    def test_profile_numeric_columns(self, sample_numeric_df):
        """Numeric columns receive mean/std/min/max statistics."""
        report = DataProfiler().profile(sample_numeric_df)

        assert isinstance(report, ProfileResult)
        assert len(report.column_profiles) == 4

        # Look up the profile of the 'sales' column and check its summary stats.
        sales = next(
            profile
            for profile in report.column_profiles
            if profile.column_name == 'sales'
        )
        assert sales.mean is not None
        assert sales.std is not None
        assert sales.min is not None
        assert sales.max is not None

    def test_profile_categorical_columns(self, sample_categorical_df):
        """Categorical columns report their cardinality and data type."""
        report = DataProfiler().profile(sample_categorical_df)

        assert len(report.column_profiles) == 4

        # Look up the profile of the 'region' column.
        region = next(
            profile
            for profile in report.column_profiles
            if profile.column_name == 'region'
        )
        assert region.unique_count == 4
        assert region.data_type == 'categorical'

    def test_profile_missing_values(self, sample_with_missing_df):
        """Missing cells are counted both overall and per column."""
        report = DataProfiler().profile(sample_with_missing_df)

        assert report.total_missing_cells > 0

        # Per-column missing count for 'value1'.
        value1 = next(
            profile
            for profile in report.column_profiles
            if profile.column_name == 'value1'
        )
        assert value1.missing_count == 10
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
class TestFeatureDetector:
    """Tests for FeatureDetector module."""

    def test_detect_numeric_columns(self, sample_numeric_df):
        """Test detection of numeric columns."""
        detector = FeatureDetector()
        result = detector.detect(sample_numeric_df)

        assert isinstance(result, FeatureDetectionResult)
        assert len(result.numeric_columns) == 4
        assert 'sales' in result.numeric_columns

    def test_detect_categorical_columns(self, sample_categorical_df):
        """Test detection of categorical columns."""
        detector = FeatureDetector()
        result = detector.detect(sample_categorical_df)

        assert len(result.categorical_columns) == 4
        assert 'region' in result.categorical_columns

    def test_detect_datetime_columns(self, sample_mixed_df):
        """Test detection of datetime columns."""
        detector = FeatureDetector()
        result = detector.detect(sample_mixed_df)

        assert 'date' in result.datetime_columns

    def test_detect_groupable_columns(self, sample_mixed_df):
        """Test detection of groupable columns."""
        detector = FeatureDetector()
        result = detector.detect(sample_mixed_df)

        # Should detect region and category as groupable
        assert len(result.groupable_columns) >= 2

    def test_no_name_heuristics(self):
        """Test that feature detection uses data only, not column names.

        Per plan constraint: Feature detection must rely only on data.
        """
        detector = FeatureDetector()

        # A seeded generator keeps the fixture identical on every run, so a
        # failure here can always be reproduced.
        rng = np.random.default_rng(0)

        # Create DataFrame with misleading column names
        df = pd.DataFrame({
            'id_column': ['A', 'B', 'C'] * 10,  # Named like ID but low cardinality
            'date_field': [1.5, 2.5, 3.5] * 10,  # Named like date but numeric
            'target_var': rng.choice(['X', 'Y'], 30),  # Named like target but categorical
        })

        result = detector.detect(df)

        # 'id_column' should NOT be classified as identifier (low cardinality)
        assert 'id_column' not in result.identifier_columns
        assert 'id_column' in result.categorical_columns

        # 'date_field' should NOT be classified as datetime (it's numeric)
        assert 'date_field' not in result.datetime_columns
        assert 'date_field' in result.numeric_columns

    def test_deterministic_detection(self, sample_mixed_df, assert_deterministic):
        """Test that feature detection is deterministic.

        The ``assert_deterministic`` fixture is requested but not called; the
        parameter is kept so the test's fixture dependencies stay unchanged.
        """
        detector = FeatureDetector()

        # Run 5 times and collect a hashable snapshot of the detected roles.
        snapshots = [
            tuple(sorted(detector.detect(sample_mixed_df).column_roles.items()))
            for _ in range(5)
        ]

        assert len(set(snapshots)) == 1, "Feature detection is not deterministic"
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""Golden output and determinism tests.
|
|
2
|
+
|
|
3
|
+
Tests to ensure:
|
|
4
|
+
1. Same input → same output (determinism)
|
|
5
|
+
2. Output matches expected golden values (regression)
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import pytest
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import numpy as np
|
|
11
|
+
import json
|
|
12
|
+
import hashlib
|
|
13
|
+
|
|
14
|
+
from xelytics import analyze, AnalysisConfig
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TestDeterminism:
    """Tests that same input produces same output."""

    @staticmethod
    def _comparable_p_values(statistics):
        """Return the results' p-values as a tuple that compares reliably.

        NaN never compares equal to itself, so two otherwise identical runs
        that each produce a NaN p-value would look different when collected
        into a set. NaN entries are therefore replaced with ``None``.
        """
        raw_values = (entry.p_value for entry in statistics)
        # ``p != p`` is true only for NaN.
        return tuple(None if p != p else p for p in raw_values)

    def test_analyze_deterministic(self, golden_dataset):
        """Test that analyze() produces identical output across runs."""
        config = AnalysisConfig(
            mode="automated",
            enable_llm_insights=False,  # Disable LLM for determinism
        )

        hashes = []
        for _ in range(3):
            result = analyze(golden_dataset, mode="automated", config=config)
            # Hash the core output
            output = {
                "row_count": result.summary.row_count,
                "column_count": result.summary.column_count,
                "tests_executed": result.metadata.tests_executed,
                "numeric_columns": sorted(result.summary.numeric_columns),
                "categorical_columns": sorted(result.summary.categorical_columns),
            }
            payload = json.dumps(output, sort_keys=True).encode()
            # MD5 is only a fingerprint here; flagging it as non-security use
            # keeps the test runnable on FIPS-restricted Python builds.
            hash_val = hashlib.md5(payload, usedforsecurity=False).hexdigest()
            hashes.append(hash_val)

        assert len(set(hashes)) == 1, f"Non-deterministic output: {hashes}"

    def test_statistical_results_deterministic(self, golden_dataset):
        """Test that statistical test results are deterministic."""
        config = AnalysisConfig(mode="automated", enable_llm_insights=False)

        p_values_runs = []
        for _ in range(3):
            result = analyze(golden_dataset, mode="automated", config=config)
            p_values_runs.append(self._comparable_p_values(result.statistics))

        assert len(set(p_values_runs)) == 1, "P-values differ across runs"

    def test_insight_generation_deterministic(self, golden_dataset):
        """Test that insight generation is deterministic."""
        config = AnalysisConfig(mode="automated", enable_llm_insights=False)

        insight_titles_runs = []
        for _ in range(3):
            result = analyze(golden_dataset, mode="automated", config=config)
            # Titles are sorted so ordering differences are ignored.
            titles = sorted(i.title for i in result.insights)
            insight_titles_runs.append(tuple(titles))

        assert len(set(insight_titles_runs)) == 1, "Insights differ across runs"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class TestGoldenOutput:
    """Regression checks against the known golden dataset.

    Each test pins one expected property of the analysis output. When one
    of these values changes, find out why before updating the expectation.
    """

    def test_golden_dataset_row_count(self, golden_dataset):
        """The summary reports 10 rows."""
        summary = analyze(golden_dataset, mode="automated").summary
        assert summary.row_count == 10

    def test_golden_dataset_column_count(self, golden_dataset):
        """The summary reports 4 columns."""
        summary = analyze(golden_dataset, mode="automated").summary
        assert summary.column_count == 4

    def test_golden_dataset_numeric_columns(self, golden_dataset):
        """Exactly 'age' and 'income' are detected as numeric."""
        analysis = analyze(golden_dataset, mode="automated")

        wanted = ['age', 'income']
        detected = analysis.summary.numeric_columns
        assert sorted(detected) == sorted(wanted)

    def test_golden_dataset_categorical_columns(self, golden_dataset):
        """Exactly 'education' and 'region' are detected as categorical."""
        analysis = analyze(golden_dataset, mode="automated")

        wanted = ['education', 'region']
        detected = analysis.summary.categorical_columns
        assert sorted(detected) == sorted(wanted)

    def test_golden_dataset_no_missing(self, golden_dataset):
        """The golden dataset contains no missing cells."""
        summary = analyze(golden_dataset, mode="automated").summary
        assert summary.total_missing_cells == 0

    def test_json_serialization_roundtrip(self, golden_dataset):
        """A result survives to_json / from_json with key fields intact."""
        from xelytics.schemas.outputs import AnalysisResult

        original = analyze(golden_dataset, mode="automated")

        # Serialize, then rebuild the result from the JSON text.
        rebuilt = AnalysisResult.from_json(original.to_json())

        # Only shape and collection sizes are compared, not every field.
        assert rebuilt.summary.row_count == original.summary.row_count
        assert rebuilt.summary.column_count == original.summary.column_count
        assert len(rebuilt.statistics) == len(original.statistics)
        assert len(rebuilt.insights) == len(original.insights)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
class TestBackwardCompatibility:
    """Schema-stability checks.

    Guards the attribute and key names that existing integrations rely on.
    """

    def test_analysis_result_has_required_fields(self, sample_mixed_df):
        """AnalysisResult exposes every required top-level attribute."""
        analysis = analyze(sample_mixed_df, mode="automated")

        # Required top-level fields (per API_CONTRACT.md)
        for attr in ('summary', 'statistics', 'visualizations', 'insights', 'metadata'):
            assert hasattr(analysis, attr)

    def test_dataset_summary_fields(self, sample_mixed_df):
        """DatasetSummary exposes the expected attributes."""
        summary = analyze(sample_mixed_df, mode="automated").summary

        expected_attrs = (
            'row_count',
            'column_count',
            'numeric_columns',
            'categorical_columns',
            'column_profiles',
        )
        for attr in expected_attrs:
            assert hasattr(summary, attr)

    def test_statistical_result_fields(self, sample_mixed_df):
        """StatisticalTestResult exposes the expected attributes."""
        analysis = analyze(sample_mixed_df, mode="automated")

        # Nothing to inspect when no statistical test was produced.
        if not analysis.statistics:
            return

        first_stat = analysis.statistics[0]
        expected_attrs = (
            'test_name',
            'test_type',
            'statistic',
            'p_value',
            'significant',
            'interpretation',
        )
        for attr in expected_attrs:
            assert hasattr(first_stat, attr)

    def test_json_output_structure(self, sample_mixed_df):
        """The dict form of a result carries the expected keys."""
        payload = analyze(sample_mixed_df, mode="automated").to_dict()

        # Top-level keys
        for key in ('summary', 'statistics', 'visualizations', 'insights', 'metadata'):
            assert key in payload

        # Summary keys
        for key in ('row_count', 'column_count'):
            assert key in payload['summary']
|