additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -176
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -304
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/distributions.py +0 -22
  100. additory/synthetic/forecast.py +0 -1132
  101. additory/synthetic/linked_list_parser.py +0 -415
  102. additory/synthetic/namespace_lookup.py +0 -129
  103. additory/synthetic/smote.py +0 -320
  104. additory/synthetic/strategies.py +0 -850
  105. additory/synthetic/synthesizer.py +0 -713
  106. additory/utilities/__init__.py +0 -53
  107. additory/utilities/encoding.py +0 -600
  108. additory/utilities/games.py +0 -300
  109. additory/utilities/keys.py +0 -8
  110. additory/utilities/lookup.py +0 -103
  111. additory/utilities/matchers.py +0 -216
  112. additory/utilities/resolvers.py +0 -286
  113. additory/utilities/settings.py +0 -167
  114. additory/utilities/units.py +0 -749
  115. additory/utilities/validators.py +0 -153
  116. additory-0.1.0a3.dist-info/METADATA +0 -288
  117. additory-0.1.0a3.dist-info/RECORD +0 -71
  118. additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
  119. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  120. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
@@ -1,182 +0,0 @@
1
- """
2
- Centralized Sample Dataset Management
3
-
4
- Provides sample datasets for demonstrations across all additory modules.
5
- Sample datasets are stored as .add files in reference/ directories and
6
- loaded on-demand using the existing .add file parser.
7
-
8
- Usage:
9
- from additory.common.sample_data import get_sample_dataset
10
-
11
- # For synthetic
12
- df = get_sample_dataset("synthetic", "sample")
13
-
14
- # For expressions (future)
15
- df = get_sample_dataset("expressions", "sample")
16
- df_unclean = get_sample_dataset("expressions", "sample", "unclean")
17
- """
18
-
19
- import polars as pl
20
- from pathlib import Path
21
- from typing import Optional
22
- import yaml
23
-
24
- from additory.common.exceptions import ValidationError
25
-
26
-
27
def get_sample_dataset(
    module: str = "synthetic",
    block: str = "sample",
    dataset_type: str = "clean"
) -> pl.DataFrame:
    """
    Load a sample dataset from a .add file.

    The file is looked up at
    ``<reference>/<module>_definitions/<block>_0.1.add``, where
    ``<reference>`` is the ``reference`` directory three levels above this
    module. The file is parsed as YAML and the ``dataset_type`` entry of its
    top-level ``sample`` section is converted to a Polars DataFrame.

    Args:
        module: Module name ("synthetic", "expressions", "utilities").
        block: Base name of the .add file; ``_0.1.add`` is appended.
            Note that the section read from the file is always ``sample``,
            independent of this value.
        dataset_type: Key inside the ``sample`` section, e.g. "clean" or
            "unclean".

    Returns:
        Polars DataFrame built from the selected sample data.

    Raises:
        ValidationError: If the module is unknown, the file is missing or
            cannot be parsed, the file has no usable ``sample`` section,
            the dataset type is absent, or the data cannot be turned into
            a DataFrame. The underlying error, when there is one, is
            chained as ``__cause__``.

    Examples:
        >>> df = get_sample_dataset("synthetic", "sample")
        >>> df_unclean = get_sample_dataset("expressions", "sample", "unclean")

    The synthetic sample is documented (not enforced here) as holding the
    columns id, emp_id, order_id, age, salary, first_name, last_name,
    department, status and region.
    """
    valid_modules = ("synthetic", "expressions", "utilities")
    if module not in valid_modules:
        raise ValidationError(
            f"Unknown module '{module}'. "
            f"Valid modules: synthetic, expressions, utilities"
        )

    # Every module keeps its definitions in "<module>_definitions".
    base_path = Path(__file__).parent.parent.parent / "reference"
    add_file_path = base_path / f"{module}_definitions" / f"{block}_0.1.add"

    if not add_file_path.exists():
        raise ValidationError(
            f"Sample dataset file not found: {add_file_path}\n"
            f"Module: {module}, Block: {block}"
        )

    # Load and parse the .add file (YAML). The encoding is pinned so the
    # result does not depend on the platform's default locale.
    try:
        with open(add_file_path, 'r', encoding="utf-8") as f:
            content = yaml.safe_load(f)
    except Exception as e:
        raise ValidationError(
            f"Failed to parse sample dataset file: {add_file_path}\n"
            f"Error: {e}"
        ) from e

    # safe_load yields None for an empty document and may yield a list or
    # scalar; only a mapping can carry a "sample" section.
    sample_section = content.get("sample") if isinstance(content, dict) else None

    if not isinstance(sample_section, dict) or not sample_section:
        raise ValidationError(
            f"No 'sample' section found in {add_file_path}"
        )

    # Get the requested dataset type (clean or unclean)
    dataset = sample_section.get(dataset_type)

    if dataset is None:
        available_types = list(sample_section.keys())
        raise ValidationError(
            f"Dataset type '{dataset_type}' not found in {add_file_path}\n"
            f"Available types: {available_types}"
        )

    # Convert to Polars DataFrame
    try:
        return pl.DataFrame(dataset)
    except Exception as e:
        raise ValidationError(
            f"Failed to create DataFrame from sample data: {e}"
        ) from e
131
-
132
-
133
def list_available_samples() -> dict:
    """
    Enumerate the sample definition files shipped per module.

    For each of the modules "synthetic", "expressions" and "utilities" the
    directory ``<reference>/<module>_definitions`` is scanned for ``*.add``
    files, where ``<reference>`` is the ``reference`` directory three levels
    above this module. Each file contributes its stem with everything after
    the last underscore removed (e.g. ``sample_0.1`` becomes ``sample``).

    Returns:
        Dictionary mapping each module name to the list of sample names
        found; a module whose directory does not exist maps to an empty
        list.

    Example:
        >>> samples = list_available_samples()
        >>> print(samples)
        {
        'synthetic': ['sample'],
        'expressions': ['sample'],
        'utilities': []
        }
    """
    reference_root = Path(__file__).parent.parent.parent / "reference"
    catalog = {}

    for module_name in ("synthetic", "expressions", "utilities"):
        definitions_dir = reference_root / f"{module_name}_definitions"
        sample_names = []
        if definitions_dir.exists():
            for add_file in definitions_dir.glob("*.add"):
                # Drop the version suffix that follows the last underscore.
                sample_names.append(add_file.stem.rsplit('_', 1)[0])
        catalog[module_name] = sample_names

    return catalog
@@ -1,165 +0,0 @@
1
- # ast_builder.py
2
- #
3
- # Extended AST builder for additory DSL.
4
- # Backward compatible with minimal arithmetic DSL.
5
- # Adds:
6
- # - comparisons
7
- # - boolean logic
8
- # - ternary (Python-style: a if cond else b)
9
- # - function calls (min, max, abs, log, exp)
10
- #
11
-
12
- import ast
13
-
14
-
15
def build_ast_from_expression(expr: str) -> dict:
    """
    Translate a Python-like expression string into the internal AST format.

    Parsing is delegated to ``ast.parse`` in "eval" mode; the resulting
    expression node is then handed to ``_convert``.

    Returns None (despite the ``dict`` annotation) when ``expr`` is empty,
    None, or contains only whitespace.
    """
    is_blank = not expr or not expr.strip()
    if is_blank:
        return None

    parsed = ast.parse(expr, mode="eval")
    return _convert(parsed.body)
26
-
27
-
28
def _convert(node):
    """
    Convert a Python ``ast`` expression node into the additory AST.

    The result is a nested dict whose ``"type"`` key is one of
    ``literal``, ``column``, ``binary``, ``unary_bool``, ``bool_op``,
    ``cmp``, ``if_expr`` or ``call``. Child nodes are converted
    recursively.

    Raises:
        NotImplementedError: For chained comparisons, calls whose target is
            not a plain name, calls with keyword arguments, unsupported
            unary operators, and any other node type not handled below.
    """

    # ------------------------------------------------------------
    # Literals
    # ------------------------------------------------------------
    if isinstance(node, ast.Constant):
        return {"type": "literal", "value": node.value}

    # ------------------------------------------------------------
    # Column reference
    # ------------------------------------------------------------
    if isinstance(node, ast.Name):
        return {"type": "column", "name": node.id}

    # ------------------------------------------------------------
    # Binary arithmetic: + - * / ** % //
    # ------------------------------------------------------------
    if isinstance(node, ast.BinOp):
        return {
            "type": "binary",
            "op": _op_symbol(node.op),
            "left": _convert(node.left),
            "right": _convert(node.right),
        }

    # ------------------------------------------------------------
    # Unary: -x, +x, not x
    # ------------------------------------------------------------
    if isinstance(node, ast.UnaryOp):
        if isinstance(node.op, ast.UAdd):
            # Unary plus is a no-op.
            return _convert(node.operand)
        if isinstance(node.op, ast.USub):
            # The internal AST has no negation node; encode -x as -1 * x.
            return {
                "type": "binary",
                "op": "*",
                "left": {"type": "literal", "value": -1},
                "right": _convert(node.operand),
            }
        if isinstance(node.op, ast.Not):
            return {
                "type": "unary_bool",
                "op": "not",
                "value": _convert(node.operand),
            }
        # e.g. ~x: report the operator rather than the generic node type.
        raise NotImplementedError(
            f"Unsupported unary operator: {type(node.op)}"
        )

    # ------------------------------------------------------------
    # Boolean operations: and/or
    # ------------------------------------------------------------
    if isinstance(node, ast.BoolOp):
        op = "and" if isinstance(node.op, ast.And) else "or"
        return {
            "type": "bool_op",
            "op": op,
            "values": [_convert(v) for v in node.values],
        }

    # ------------------------------------------------------------
    # Comparisons: == != > < >= <=
    # ------------------------------------------------------------
    if isinstance(node, ast.Compare):
        # Python allows chained comparisons: a < b < c
        # We only support simple binary comparisons
        if len(node.ops) != 1 or len(node.comparators) != 1:
            raise NotImplementedError("Chained comparisons not supported")

        return {
            "type": "cmp",
            "op": _cmp_symbol(node.ops[0]),
            "left": _convert(node.left),
            "right": _convert(node.comparators[0]),
        }

    # ------------------------------------------------------------
    # Ternary: a if cond else b
    # ------------------------------------------------------------
    if isinstance(node, ast.IfExp):
        return {
            "type": "if_expr",
            "cond": _convert(node.test),
            "then": _convert(node.body),
            "else": _convert(node.orelse),
        }

    # ------------------------------------------------------------
    # Function calls: name(arg, ...)
    # ------------------------------------------------------------
    if isinstance(node, ast.Call):
        if not isinstance(node.func, ast.Name):
            raise NotImplementedError("Only simple function calls supported")
        # The call node has no slot for keywords; dropping them silently
        # would change the meaning of the expression, so reject instead.
        if node.keywords:
            raise NotImplementedError(
                "Keyword arguments in function calls not supported"
            )

        return {
            "type": "call",
            "name": node.func.id,
            "args": [_convert(a) for a in node.args],
        }

    raise NotImplementedError(f"Unsupported AST node: {type(node)}")
130
-
131
-
132
def _op_symbol(op):
    """
    Return the string symbol for a Python AST binary-operator node.

    Supported operators: ``+ - * / ** % //``. Any other operator raises
    NotImplementedError.
    """
    symbol_table = (
        (ast.Add, "+"),
        (ast.Sub, "-"),
        (ast.Mult, "*"),
        (ast.Div, "/"),
        (ast.Pow, "**"),
        (ast.Mod, "%"),
        (ast.FloorDiv, "//"),
    )
    for node_type, symbol in symbol_table:
        if isinstance(op, node_type):
            return symbol
    raise NotImplementedError(f"Unsupported operator: {type(op)}")
149
-
150
-
151
def _cmp_symbol(op):
    """
    Return the string symbol for a Python AST comparison-operator node.

    Supported operators: ``== != > < >= <=``. Any other operator raises
    NotImplementedError.
    """
    symbol_table = (
        (ast.Eq, "=="),
        (ast.NotEq, "!="),
        (ast.Gt, ">"),
        (ast.Lt, "<"),
        (ast.GtE, ">="),
        (ast.LtE, "<="),
    )
    for node_type, symbol in symbol_table:
        if isinstance(op, node_type):
            return symbol
    raise NotImplementedError(f"Unsupported comparison operator: {type(op)}")
@@ -1,23 +0,0 @@
1
- # additory/core/backends/__init__.py
2
- # Backend support system
3
-
4
- """
5
- Backend Support Module
6
-
7
- This module provides universal backend support for dataframes:
8
- - Arrow bridge for cross-backend compatibility
9
- - Enhanced cuDF support with GPU acceleration
10
- - Memory management and cleanup
11
- """
12
-
13
- # Backend functionality
14
- from .arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
15
- from .cudf_bridge import get_cudf_bridge, EnhancedCuDFBridge, CuDFBridgeError
16
-
17
- __all__ = [
18
- 'EnhancedArrowBridge',
19
- 'ArrowBridgeError',
20
- 'get_cudf_bridge',
21
- 'EnhancedCuDFBridge',
22
- 'CuDFBridgeError'
23
- ]