additory-0.1.0a4-py3-none-any.whl → additory-0.1.1a1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/functions/analyze/features.py
@@ -0,0 +1,61 @@
+ """
+ Feature analysis module.
+
+ Analyzes feature types and provides transformation recommendations.
+ """
+
+ import polars as pl
+ from typing import Dict, Any
+
+
+ def analyze_features(df: pl.DataFrame) -> Dict[str, Any]:
+     """
+     Analyze features and provide recommendations.
+
+     Args:
+         df: Polars DataFrame to analyze
+
+     Returns:
+         Dictionary with feature analysis:
+         - feature_types: Dict of column -> feature type
+         - numeric_features: List of numeric features
+         - categorical_features: List of categorical features
+         - datetime_features: List of datetime features
+
+     Example:
+         >>> df = pl.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
+         >>> result = analyze_features(df)
+         >>> result['numeric_features']
+         ['a']
+     """
+     feature_types = {}
+     numeric_features = []
+     categorical_features = []
+     datetime_features = []
+
+     for col in df.columns:
+         dtype = df[col].dtype
+
+         if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64, pl.Float32, pl.Float64]:
+             feature_types[col] = 'numeric'
+             numeric_features.append(col)
+         elif dtype == pl.Utf8:
+             # Check if it's categorical (low cardinality)
+             unique_ratio = df[col].n_unique() / df.height if df.height > 0 else 0
+             if unique_ratio < 0.05:
+                 feature_types[col] = 'categorical'
+                 categorical_features.append(col)
+             else:
+                 feature_types[col] = 'text'
+         elif dtype in [pl.Date, pl.Datetime]:
+             feature_types[col] = 'datetime'
+             datetime_features.append(col)
+         else:
+             feature_types[col] = str(dtype)
+
+     return {
+         'feature_types': feature_types,
+         'numeric_features': numeric_features,
+         'categorical_features': categorical_features,
+         'datetime_features': datetime_features
+     }
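A minimal usage sketch for this module (the import path follows the file list above; whether `additory.functions.analyze` also re-exports `analyze_features` is not visible in this diff). It illustrates the 0.05 unique-ratio threshold that separates 'categorical' from 'text':

import polars as pl
from additory.functions.analyze.features import analyze_features

# 100 rows with 2 unique labels -> unique_ratio = 0.02 < 0.05 -> 'categorical';
# the same two labels over only 3 rows would give 0.67 -> 'text'.
df = pl.DataFrame({
    "age": list(range(100)),
    "group": ["a", "b"] * 50,
})
result = analyze_features(df)
print(result["feature_types"])  # {'age': 'numeric', 'group': 'categorical'}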
additory/functions/analyze/imputation.py
@@ -0,0 +1,66 @@
+ """
+ Imputation recommendation module.
+
+ Recommends imputation strategies for missing values.
+ """
+
+ import polars as pl
+ from typing import Dict, Any
+
+
+ def analyze_imputation(df: pl.DataFrame) -> Dict[str, Any]:
+     """
+     Recommend imputation strategies for missing values.
+
+     Args:
+         df: Polars DataFrame to analyze
+
+     Returns:
+         Dictionary with imputation recommendations:
+         - recommendations: Dict of column -> recommended strategy
+         - columns_needing_imputation: List of columns with missing values
+         - imputation_complexity: Overall complexity (low, medium, high)
+
+     Example:
+         >>> df = pl.DataFrame({'a': [1, None, 3], 'b': ['x', 'y', None]})
+         >>> result = analyze_imputation(df)
+         >>> 'a' in result['columns_needing_imputation']
+         True
+     """
+     recommendations = {}
+     columns_needing_imputation = []
+
+     for col in df.columns:
+         null_count = df[col].null_count()
+
+         if null_count == 0:
+             continue
+
+         columns_needing_imputation.append(col)
+         dtype = df[col].dtype
+
+         # Recommend strategy based on type
+         if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64, pl.Float32, pl.Float64]:
+             recommendations[col] = 'mean or median'
+         elif dtype == pl.Utf8:
+             recommendations[col] = 'mode or constant'
+         elif dtype == pl.Boolean:
+             recommendations[col] = 'mode'
+         else:
+             recommendations[col] = 'forward fill or constant'
+
+     # Determine complexity
+     if len(columns_needing_imputation) == 0:
+         complexity = 'none'
+     elif len(columns_needing_imputation) <= 2:
+         complexity = 'low'
+     elif len(columns_needing_imputation) <= 5:
+         complexity = 'medium'
+     else:
+         complexity = 'high'
+
+     return {
+         'recommendations': recommendations,
+         'columns_needing_imputation': columns_needing_imputation,
+         'imputation_complexity': complexity
+     }
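A sketch of how the recommendations map to dtypes and how the complexity tiers fall out (import path per the file list; output comments assume standard dict insertion order):

import polars as pl
from additory.functions.analyze.imputation import analyze_imputation

df = pl.DataFrame({
    "height": [170.0, None, 182.5],   # Float64 -> 'mean or median'
    "city": ["Oslo", None, None],     # Utf8    -> 'mode or constant'
    "active": [True, None, False],    # Boolean -> 'mode'
})
result = analyze_imputation(df)
print(result["recommendations"])
# {'height': 'mean or median', 'city': 'mode or constant', 'active': 'mode'}
print(result["imputation_complexity"])  # 3 affected columns -> 'medium'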
additory/functions/analyze/outliers.py
@@ -0,0 +1,65 @@
+ """
+ Outlier detection module.
+
+ Detects outliers in numeric columns using IQR method.
+ """
+
+ import polars as pl
+ from typing import Dict, Any
+
+
+ def analyze_outliers(df: pl.DataFrame) -> Dict[str, Any]:
+     """
+     Detect outliers in numeric columns using IQR method.
+
+     Args:
+         df: Polars DataFrame to analyze
+
+     Returns:
+         Dictionary with outlier detection results:
+         - outlier_counts: Dict of column -> outlier count
+         - outlier_percentages: Dict of column -> outlier percentage
+         - columns_with_outliers: List of columns with outliers
+
+     Example:
+         >>> df = pl.DataFrame({'a': [1, 2, 3, 4, 100]})
+         >>> result = analyze_outliers(df)
+         >>> result['outlier_counts']['a'] > 0
+         True
+     """
+     numeric_cols = [col for col in df.columns if df[col].dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64, pl.Float32, pl.Float64]]
+
+     outlier_counts = {}
+     outlier_percentages = {}
+
+     for col in numeric_cols:
+         col_data = df[col].drop_nulls()
+
+         if len(col_data) == 0:
+             outlier_counts[col] = 0
+             outlier_percentages[col] = 0.0
+             continue
+
+         # Calculate IQR
+         q25 = col_data.quantile(0.25)
+         q75 = col_data.quantile(0.75)
+         iqr = q75 - q25
+
+         # Calculate bounds
+         lower_bound = q25 - 1.5 * iqr
+         upper_bound = q75 + 1.5 * iqr
+
+         # Count outliers
+         outliers = col_data.filter((col_data < lower_bound) | (col_data > upper_bound))
+         outlier_count = len(outliers)
+
+         outlier_counts[col] = outlier_count
+         outlier_percentages[col] = (outlier_count / len(col_data) * 100) if len(col_data) > 0 else 0.0
+
+     columns_with_outliers = [col for col, count in outlier_counts.items() if count > 0]
+
+     return {
+         'outlier_counts': outlier_counts,
+         'outlier_percentages': {k: round(v, 2) for k, v in outlier_percentages.items()},
+         'columns_with_outliers': columns_with_outliers
+     }
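The docstring's own example, worked through, with the caveat that the exact quartile values depend on Polars' quantile interpolation (the default is 'nearest'):

import polars as pl
from additory.functions.analyze.outliers import analyze_outliers

# For [1, 2, 3, 4, 100], 'nearest' interpolation gives q25 = 2 and q75 = 4,
# so IQR = 2 and the fences are [2 - 1.5*2, 4 + 1.5*2] = [-1, 7];
# only 100 falls outside -> 1 outlier out of 5 values = 20%.
df = pl.DataFrame({"a": [1, 2, 3, 4, 100]})
result = analyze_outliers(df)
print(result["outlier_counts"])       # {'a': 1}
print(result["outlier_percentages"])  # {'a': 20.0}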
additory/functions/analyze/patterns.py
@@ -0,0 +1,65 @@
+ """
+ Pattern detection module.
+
+ Detects common patterns in string columns like emails, phone numbers, etc.
+ """
+
+ import polars as pl
+ import re
+ from typing import Dict, Any
+
+
+ def analyze_patterns(df: pl.DataFrame) -> Dict[str, Any]:
+     """
+     Detect patterns in string columns.
+
+     Args:
+         df: Polars DataFrame to analyze
+
+     Returns:
+         Dictionary with pattern detection results:
+         - email_columns: List of columns containing emails
+         - phone_columns: List of columns containing phone numbers
+         - date_string_columns: List of columns containing date strings
+         - id_columns: List of columns containing IDs
+
+     Example:
+         >>> df = pl.DataFrame({'email': ['a@b.com', 'c@d.com']})
+         >>> result = analyze_patterns(df)
+         >>> 'email' in result['email_columns']
+         True
+     """
+     email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
+     phone_pattern = r'^\+?[\d\s\-\(\)]{10,}$'
+
+     email_columns = []
+     phone_columns = []
+     date_string_columns = []
+     id_columns = []
+
+     for col in df.columns:
+         if df[col].dtype != pl.Utf8:
+             continue
+
+         # Sample first 100 non-null values
+         sample = df[col].drop_nulls().head(100).to_list()
+
+         if not sample:
+             continue
+
+         # Check for email pattern
+         email_matches = sum(1 for val in sample if re.match(email_pattern, str(val)))
+         if email_matches / len(sample) > 0.8:
+             email_columns.append(col)
+
+         # Check for phone pattern
+         phone_matches = sum(1 for val in sample if re.match(phone_pattern, str(val)))
+         if phone_matches / len(sample) > 0.8:
+             phone_columns.append(col)
+
+     return {
+         'email_columns': email_columns,
+         'phone_columns': phone_columns,
+         'date_string_columns': date_string_columns,
+         'id_columns': id_columns
+     }
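Note that `date_string_columns` and `id_columns` are initialized but never populated in this version, so they always come back empty. A small sketch of the strict > 0.8 match threshold (import path per the file list):

import polars as pl
from additory.functions.analyze.patterns import analyze_patterns

df = pl.DataFrame({
    "contact": ["a@b.com", "c@d.com", "e@f.com", "g@h.com", "not-an-email"],
    "phone": ["+47 22 12 34 56", "+47 22 65 43 21", None, None, None],
})
result = analyze_patterns(df)
print(result["email_columns"])  # [] -- 4/5 = 0.8 does not clear the strict > 0.8 bar
print(result["phone_columns"])  # ['phone'] -- 2 of 2 non-null values match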
additory/functions/analyze/presets.py
@@ -0,0 +1,72 @@
+ """
+ Preset analysis configurations.
+
+ Provides preset analysis configurations for quick and full analysis.
+ """
+
+ from typing import Dict, List
+
+
+ def get_preset_analyses(preset_name: str) -> Dict[str, bool]:
+     """
+     Get analyses for a preset.
+
+     Args:
+         preset_name: Name of preset ('quick' or 'full')
+
+     Returns:
+         Dictionary of analysis_name -> enabled
+
+     Example:
+         >>> analyses = get_preset_analyses('quick')
+         >>> analyses['quality']
+         True
+     """
+     presets = {
+         'quick': {
+             'quality': True,
+             'cardinality': True,
+             'types': True,
+             'distributions': False,
+             'correlations': False,
+             'features': False,
+             'patterns': False,
+             'outliers': False,
+             'duplicates': False,
+             'timeseries': False,
+             'imputation': False
+         },
+         'full': {
+             'quality': True,
+             'cardinality': True,
+             'types': True,
+             'distributions': True,
+             'correlations': True,
+             'features': True,
+             'patterns': True,
+             'outliers': True,
+             'duplicates': True,
+             'timeseries': False,  # Requires date_column
+             'imputation': True
+         }
+     }
+
+     if preset_name not in presets:
+         raise ValueError(f"Unknown preset '{preset_name}'. Available: {list(presets.keys())}")
+
+     return presets[preset_name]
+
+
+ def list_presets() -> List[str]:
+     """
+     List available presets.
+
+     Returns:
+         List of preset names
+
+     Example:
+         >>> presets = list_presets()
+         >>> 'quick' in presets
+         True
+     """
+     return ['quick', 'full']
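A short sketch of driving an analysis run from a preset (import path per the file list; how the `analyze` package dispatches on these flags is defined in its `__init__.py`, not shown here):

from additory.functions.analyze.presets import get_preset_analyses, list_presets

enabled = get_preset_analyses("quick")
print([name for name, on in enabled.items() if on])
# ['quality', 'cardinality', 'types']
print(list_presets())  # ['quick', 'full']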
additory/functions/analyze/quality.py
@@ -0,0 +1,59 @@
+ """
+ Data quality analysis module.
+
+ Analyzes data quality metrics including missing values, type consistency,
+ and format violations.
+ """
+
+ import polars as pl
+ from typing import Dict, Any
+
+
+ def analyze_quality(df: pl.DataFrame) -> Dict[str, Any]:
+     """
+     Analyze data quality metrics.
+
+     Args:
+         df: Polars DataFrame to analyze
+
+     Returns:
+         Dictionary with quality metrics:
+         - missing_values: Dict of column -> count
+         - missing_percentages: Dict of column -> percentage
+         - total_rows: Total number of rows
+         - columns_with_nulls: List of columns with missing values
+         - quality_score: Overall quality score (0-100)
+
+     Example:
+         >>> df = pl.DataFrame({'a': [1, 2, None], 'b': [1, 2, 3]})
+         >>> result = analyze_quality(df)
+         >>> result['missing_values']
+         {'a': 1, 'b': 0}
+     """
+     total_rows = df.height
+     total_cells = total_rows * df.width
+
+     # Calculate missing values per column
+     missing_values = {}
+     missing_percentages = {}
+
+     for col in df.columns:
+         null_count = df[col].null_count()
+         missing_values[col] = null_count
+         missing_percentages[col] = (null_count / total_rows * 100) if total_rows > 0 else 0.0
+
+     # Identify columns with nulls
+     columns_with_nulls = [col for col, count in missing_values.items() if count > 0]
+
+     # Calculate overall quality score (percentage of non-null cells)
+     total_nulls = sum(missing_values.values())
+     quality_score = ((total_cells - total_nulls) / total_cells * 100) if total_cells > 0 else 100.0
+
+     return {
+         'missing_values': missing_values,
+         'missing_percentages': missing_percentages,
+         'total_rows': total_rows,
+         'total_columns': df.width,
+         'columns_with_nulls': columns_with_nulls,
+         'quality_score': round(quality_score, 2)
+     }
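The quality score is simply the share of non-null cells. Worked on the docstring's example (import path per the file list):

import polars as pl
from additory.functions.analyze.quality import analyze_quality

# 3 rows x 2 columns = 6 cells with 1 null -> score = 5/6 * 100 = 83.33
df = pl.DataFrame({"a": [1, 2, None], "b": [1, 2, 3]})
result = analyze_quality(df)
print(result["quality_score"])       # 83.33
print(result["columns_with_nulls"])  # ['a']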
additory/functions/analyze/timeseries.py
@@ -0,0 +1,53 @@
+ """
+ Time series analysis module.
+
+ Analyzes time series data for trends and patterns.
+ """
+
+ import polars as pl
+ from typing import Dict, Any, Optional
+
+
+ def analyze_timeseries(df: pl.DataFrame, date_column: str) -> Dict[str, Any]:
+     """
+     Analyze time series data.
+
+     Args:
+         df: Polars DataFrame to analyze
+         date_column: Column containing dates
+
+     Returns:
+         Dictionary with timeseries analysis:
+         - date_column: Name of date column
+         - date_range: Tuple of (min_date, max_date)
+         - total_days: Number of days in range
+         - row_count: Number of rows
+         - frequency: Estimated frequency (daily, weekly, etc.)
+
+     Example:
+         >>> df = pl.DataFrame({'date': ['2024-01-01', '2024-01-02'], 'value': [1, 2]})
+         >>> df = df.with_columns(pl.col('date').str.strptime(pl.Date, '%Y-%m-%d'))
+         >>> result = analyze_timeseries(df, 'date')
+         >>> result['row_count']
+         2
+     """
+     if date_column not in df.columns:
+         raise ValueError(f"Column '{date_column}' not found in DataFrame")
+
+     # Get date range
+     min_date = df[date_column].min()
+     max_date = df[date_column].max()
+
+     # Calculate total days
+     if min_date is not None and max_date is not None:
+         total_days = (max_date - min_date).days if hasattr(max_date - min_date, 'days') else 0
+     else:
+         total_days = 0
+
+     return {
+         'date_column': date_column,
+         'date_range': (str(min_date), str(max_date)),
+         'total_days': total_days,
+         'row_count': df.height,
+         'frequency': 'unknown'
+     }
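Note that `frequency` is always 'unknown' in this version despite the docstring. A sketch with a one-week span (import path per the file list):

import polars as pl
from additory.functions.analyze.timeseries import analyze_timeseries

df = pl.DataFrame({"date": ["2024-01-01", "2024-01-08"], "value": [1, 2]})
df = df.with_columns(pl.col("date").str.strptime(pl.Date, "%Y-%m-%d"))
result = analyze_timeseries(df, "date")
print(result["date_range"])  # ('2024-01-01', '2024-01-08')
print(result["total_days"])  # 7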
additory/functions/analyze/types.py
@@ -0,0 +1,45 @@
+ """
+ Data type analysis module.
+
+ Analyzes data types and provides type recommendations.
+ """
+
+ import polars as pl
+ from typing import Dict, Any
+
+
+ def analyze_types(df: pl.DataFrame) -> Dict[str, Any]:
+     """
+     Analyze data types of columns.
+
+     Args:
+         df: Polars DataFrame to analyze
+
+     Returns:
+         Dictionary with type information:
+         - column_types: Dict of column -> type name
+         - numeric_columns: List of numeric columns
+         - string_columns: List of string columns
+         - boolean_columns: List of boolean columns
+         - date_columns: List of date/datetime columns
+
+     Example:
+         >>> df = pl.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
+         >>> result = analyze_types(df)
+         >>> result['numeric_columns']
+         ['a']
+     """
+     column_types = {col: str(df[col].dtype) for col in df.columns}
+
+     numeric_columns = [col for col in df.columns if df[col].dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64, pl.Float32, pl.Float64]]
+     string_columns = [col for col in df.columns if df[col].dtype == pl.Utf8]
+     boolean_columns = [col for col in df.columns if df[col].dtype == pl.Boolean]
+     date_columns = [col for col in df.columns if df[col].dtype in [pl.Date, pl.Datetime]]
+
+     return {
+         'column_types': column_types,
+         'numeric_columns': numeric_columns,
+         'string_columns': string_columns,
+         'boolean_columns': boolean_columns,
+         'date_columns': date_columns
+     }
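A small sketch (import path per the file list; the exact dtype strings in `column_types` vary with the Polars version, e.g. 'Utf8' vs 'String'):

import polars as pl
from additory.functions.analyze.types import analyze_types

df = pl.DataFrame({"n": [1.5, 2.0], "s": ["x", "y"], "flag": [True, False]})
result = analyze_types(df)
print(result["numeric_columns"])  # ['n']
print(result["boolean_columns"])  # ['flag']
print(result["column_types"])     # dtype names as strings, e.g. {'n': 'Float64', ...}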
additory/functions/expressions/__init__.py
@@ -0,0 +1,161 @@
+ """
+ Expressions function - Evaluate expressions and add result columns.
+
+ Supports both inline expressions and references to named expressions.
+ """
+
+ import polars as pl
+ import time
+ import re
+ from typing import List
+
+ from additory.core.backend import detect_backend, to_polars, from_polars
+ from additory.core.logging import get_logger
+ from additory.common.validation import validate_dataframe, validate_not_empty
+ from additory.common.result import wrap_result
+ from additory.expressions.engine import get_engine
+
+
+ logger = get_logger()
+
+
+ def expressions(df, *expressions_list):
+     """
+     Evaluate one or more expressions and add result columns.
+
+     Args:
+         df: Input DataFrame
+         *expressions_list: Variable number of expression strings
+
+     Returns:
+         DataFrameResult with computed columns
+
+     Examples:
+         >>> # Single expression
+         >>> result = expressions(df, 'inbuilt:bmi')
+
+         >>> # Multiple expressions
+         >>> result = expressions(df, 'inbuilt:bmi', 'inbuilt:bsa')
+
+         >>> # Inline expression
+         >>> result = expressions(df, 'weight / (height ** 2)')
+
+         >>> # Mix of inline and references
+         >>> result = expressions(df, 'inbuilt:bmi', 'age * 12')
+     """
+     start_time = time.time()
+
+     # Validate inputs
+     validate_dataframe(df, 'df')
+     validate_not_empty(df, 'df')
+
+     if not expressions_list:
+         raise ValueError("At least one expression required")
+
+     # Detect backend and convert to Polars
+     backend = detect_backend(df)
+     polars_df = to_polars(df)
+
+     # Get expression engine
+     engine = get_engine()
+
+     # Track columns added
+     columns_added = []
+
+     # Evaluate each expression
+     result_df = polars_df
+     for expr_str in expressions_list:
+         # Evaluate expression
+         result_series = engine.evaluate(result_df, expr_str)
+
+         # Determine column name
+         col_name = determine_column_name(expr_str)
+
+         # Add to DataFrame
+         result_df = result_df.with_columns(result_series.alias(col_name))
+         columns_added.append(col_name)
+
+         logger.info(f"Added column '{col_name}' from expression '{expr_str}'")
+
+     # Convert back to original backend
+     result_df = from_polars(result_df, backend)
+
+     # Calculate execution time
+     execution_time = time.time() - start_time
+
+     # Wrap result
+     metadata = {
+         'expressions_evaluated': len(expressions_list),
+         'columns_added': columns_added,
+         'input_shape': (polars_df.height, polars_df.width),
+         'execution_time': execution_time
+     }
+
+     return wrap_result(result_df, 'expressions', metadata)
+
+
+ def determine_column_name(expression: str) -> str:
+     """
+     Determine column name for expression result.
+
+     Args:
+         expression: Expression string
+
+     Returns:
+         Column name
+
+     Examples:
+         >>> determine_column_name('inbuilt:bmi')
+         'bmi'
+         >>> determine_column_name('weight / height')
+         'weight_div_height'
+     """
+     # If reference (e.g., 'inbuilt:bmi'), use expression name
+     if ':' in expression:
+         parts = expression.split(':', 1)
+         if len(parts) == 2 and parts[0] in ['inbuilt', 'user', 'company']:
+             return parts[1]
+
+     # If inline, infer from expression
+     return infer_column_name(expression)
+
+
+ def infer_column_name(expression: str) -> str:
+     """
+     Infer column name from inline expression.
+
+     Args:
+         expression: Inline expression string
+
+     Returns:
+         Inferred column name
+
+     Examples:
+         >>> infer_column_name('weight / height')
+         'weight_div_height'
+         >>> infer_column_name('sqrt(age)')
+         'sqrt_age'
+         >>> infer_column_name('age * 12')
+         'age_mul_12'
+     """
+     # Replace operators with words
+     name = expression
+     name = name.replace(' / ', '_div_')
+     name = name.replace(' * ', '_mul_')
+     name = name.replace(' + ', '_add_')
+     name = name.replace(' - ', '_sub_')
+     name = name.replace('**', '_pow_')
+
+     # Remove parentheses and spaces
+     name = name.replace('(', '_')
+     name = name.replace(')', '')
+     name = name.replace(' ', '')
+
+     # Remove trailing underscores
+     name = name.strip('_')
+
+     # If name is too long or complex, use generic name
+     if len(name) > 50 or not re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', name):
+         return 'expr_result'
+
+     return name
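A sketch of the name-inference rules, which are self-contained, plus a hedged call to `expressions` itself (the call depends on the engine and result modules added elsewhere in this release, so treat it as illustrative only):

import polars as pl
from additory.functions.expressions import expressions, infer_column_name

# Name inference follows the operator-to-word replacements above.
print(infer_column_name("weight / height"))  # 'weight_div_height'
print(infer_column_name("age * 12"))         # 'age_mul_12'
print(infer_column_name("a" * 60))           # 'expr_result' (over the 50-char limit)

df = pl.DataFrame({"weight": [70.0, 80.0], "height": [1.75, 1.80]})
result = expressions(df, "weight / (height ** 2)")  # BMI-style inline expression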