additory 0.1.0a2__tar.gz → 0.1.0a3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. {additory-0.1.0a2 → additory-0.1.0a3}/PKG-INFO +10 -17
  2. {additory-0.1.0a2 → additory-0.1.0a3}/README.md +7 -15
  3. {additory-0.1.0a2 → additory-0.1.0a3}/additory/__init__.py +4 -0
  4. {additory-0.1.0a2 → additory-0.1.0a3}/additory/common/__init__.py +2 -2
  5. {additory-0.1.0a2 → additory-0.1.0a3}/additory/common/backend.py +20 -4
  6. {additory-0.1.0a2 → additory-0.1.0a3}/additory/common/distributions.py +1 -1
  7. {additory-0.1.0a2 → additory-0.1.0a3}/additory/common/sample_data.py +19 -19
  8. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/backends/arrow_bridge.py +7 -0
  9. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/polars_expression_engine.py +66 -16
  10. {additory-0.1.0a2 → additory-0.1.0a3}/additory/dynamic_api.py +42 -46
  11. {additory-0.1.0a2 → additory-0.1.0a3}/additory/expressions/proxy.py +4 -1
  12. additory-0.1.0a3/additory/synthetic/__init__.py +13 -0
  13. additory-0.1.0a3/additory/synthetic/column_name_resolver.py +149 -0
  14. {additory-0.1.0a2/additory/augment → additory-0.1.0a3/additory/synthetic}/distributions.py +2 -2
  15. {additory-0.1.0a2/additory/augment → additory-0.1.0a3/additory/synthetic}/forecast.py +1 -1
  16. additory-0.1.0a3/additory/synthetic/linked_list_parser.py +415 -0
  17. additory-0.1.0a3/additory/synthetic/namespace_lookup.py +129 -0
  18. {additory-0.1.0a2/additory/augment → additory-0.1.0a3/additory/synthetic}/smote.py +1 -1
  19. {additory-0.1.0a2/additory/augment → additory-0.1.0a3/additory/synthetic}/strategies.py +11 -44
  20. additory-0.1.0a2/additory/augment/augmentor.py → additory-0.1.0a3/additory/synthetic/synthesizer.py +75 -15
  21. {additory-0.1.0a2 → additory-0.1.0a3}/additory/utilities/units.py +4 -1
  22. {additory-0.1.0a2 → additory-0.1.0a3}/additory.egg-info/SOURCES.txt +9 -37
  23. {additory-0.1.0a2 → additory-0.1.0a3}/documentation/V0.1.0/add_onehotencoding_function.html +43 -40
  24. additory-0.1.0a2/documentation/V0.1.0/add_augment_function.html → additory-0.1.0a3/documentation/V0.1.0/add_synthetic_function.html +140 -147
  25. {additory-0.1.0a2 → additory-0.1.0a3}/documentation/V0.1.0/expressions.html +55 -79
  26. {additory-0.1.0a2 → additory-0.1.0a3}/pyproject.toml +3 -2
  27. {additory-0.1.0a2 → additory-0.1.0a3}/reference/expressions_definitions/age_category_0.1.add +23 -5
  28. {additory-0.1.0a2 → additory-0.1.0a3}/reference/expressions_definitions/blood_pressure_category_0.1.add +24 -5
  29. {additory-0.1.0a2 → additory-0.1.0a3}/reference/expressions_definitions/bmi2_0.1.add +3 -3
  30. additory-0.1.0a3/reference/expressions_definitions/bmi3_0.1.add +41 -0
  31. {additory-0.1.0a2 → additory-0.1.0a3}/reference/expressions_definitions/bmi_0.1.add +3 -3
  32. {additory-0.1.0a2 → additory-0.1.0a3}/reference/expressions_definitions/bmr_0.1.add +13 -5
  33. {additory-0.1.0a2 → additory-0.1.0a3}/reference/expressions_definitions/body_fat_percentage_0.1.add +13 -5
  34. {additory-0.1.0a2 → additory-0.1.0a3}/reference/expressions_definitions/bsa_0.1.add +11 -5
  35. {additory-0.1.0a2 → additory-0.1.0a3}/reference/expressions_definitions/cholesterol_ratio_0.1.add +11 -5
  36. {additory-0.1.0a2 → additory-0.1.0a3}/reference/expressions_definitions/fitness_score_0.1.add +24 -5
  37. {additory-0.1.0a2 → additory-0.1.0a3}/reference/expressions_definitions/ideal_body_weight_0.1.add +14 -5
  38. {additory-0.1.0a2 → additory-0.1.0a3}/reference/expressions_definitions/waist_hip_ratio_0.1.add +11 -5
  39. additory-0.1.0a2/additory/augment/__init__.py +0 -24
  40. additory-0.1.0a2/additory/augment/builtin_lists.py +0 -430
  41. additory-0.1.0a2/additory/augment/list_registry.py +0 -177
  42. additory-0.1.0a2/additory/synthetic/__init__.py +0 -101
  43. additory-0.1.0a2/additory/synthetic/api.py +0 -220
  44. additory-0.1.0a2/additory/synthetic/common_integration.py +0 -314
  45. additory-0.1.0a2/additory/synthetic/config.py +0 -262
  46. additory-0.1.0a2/additory/synthetic/engines.py +0 -529
  47. additory-0.1.0a2/additory/synthetic/exceptions.py +0 -180
  48. additory-0.1.0a2/additory/synthetic/file_managers.py +0 -518
  49. additory-0.1.0a2/additory/synthetic/generator.py +0 -702
  50. additory-0.1.0a2/additory/synthetic/generator_parser.py +0 -68
  51. additory-0.1.0a2/additory/synthetic/integration.py +0 -319
  52. additory-0.1.0a2/additory/synthetic/models.py +0 -241
  53. additory-0.1.0a2/additory/synthetic/pattern_resolver.py +0 -573
  54. additory-0.1.0a2/additory/synthetic/performance.py +0 -469
  55. additory-0.1.0a2/additory/synthetic/polars_integration.py +0 -464
  56. additory-0.1.0a2/additory/synthetic/proxy.py +0 -60
  57. additory-0.1.0a2/additory/synthetic/schema_parser.py +0 -685
  58. additory-0.1.0a2/additory/synthetic/validator.py +0 -553
  59. additory-0.1.0a2/documentation/V0.1.0/add_synth_function.html +0 -664
  60. additory-0.1.0a2/reference/expressions_definitions/bmi3_0.1.add +0 -26
  61. additory-0.1.0a2/reference/schema_definitions/ca.list +0 -41
  62. additory-0.1.0a2/reference/schema_definitions/ca.properties +0 -14
  63. additory-0.1.0a2/reference/schema_definitions/eu.list +0 -41
  64. additory-0.1.0a2/reference/schema_definitions/eu.properties +0 -13
  65. additory-0.1.0a2/reference/schema_definitions/finance.list +0 -31
  66. additory-0.1.0a2/reference/schema_definitions/finance.properties +0 -18
  67. additory-0.1.0a2/reference/schema_definitions/global.list +0 -57
  68. additory-0.1.0a2/reference/schema_definitions/global.properties +0 -11
  69. additory-0.1.0a2/reference/schema_definitions/healthcare.list +0 -28
  70. additory-0.1.0a2/reference/schema_definitions/us.list +0 -41
  71. additory-0.1.0a2/reference/schema_definitions/us.properties +0 -14
  72. {additory-0.1.0a2 → additory-0.1.0a3}/LICENSE +0 -0
  73. {additory-0.1.0a2 → additory-0.1.0a3}/additory/analysis/__init__.py +0 -0
  74. {additory-0.1.0a2 → additory-0.1.0a3}/additory/analysis/cardinality.py +0 -0
  75. {additory-0.1.0a2 → additory-0.1.0a3}/additory/analysis/correlations.py +0 -0
  76. {additory-0.1.0a2 → additory-0.1.0a3}/additory/analysis/distributions.py +0 -0
  77. {additory-0.1.0a2 → additory-0.1.0a3}/additory/analysis/quality.py +0 -0
  78. {additory-0.1.0a2 → additory-0.1.0a3}/additory/analysis/scan.py +0 -0
  79. {additory-0.1.0a2 → additory-0.1.0a3}/additory/common/column_utils.py +0 -0
  80. {additory-0.1.0a2 → additory-0.1.0a3}/additory/common/exceptions.py +0 -0
  81. {additory-0.1.0a2 → additory-0.1.0a3}/additory/common/lists.py +0 -0
  82. {additory-0.1.0a2 → additory-0.1.0a3}/additory/common/patterns.py +0 -0
  83. {additory-0.1.0a2 → additory-0.1.0a3}/additory/common/resolver.py +0 -0
  84. {additory-0.1.0a2 → additory-0.1.0a3}/additory/common/validation.py +0 -0
  85. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/__init__.py +0 -0
  86. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/ast_builder.py +0 -0
  87. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/backends/__init__.py +0 -0
  88. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/backends/cudf_bridge.py +0 -0
  89. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/column_positioning.py +0 -0
  90. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/compiler_polars.py +0 -0
  91. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/config.py +0 -0
  92. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/enhanced_cache_manager.py +0 -0
  93. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/enhanced_matchers.py +0 -0
  94. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/enhanced_version_manager.py +0 -0
  95. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/executor.py +0 -0
  96. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/integrity_manager.py +0 -0
  97. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/loader.py +0 -0
  98. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/logging.py +0 -0
  99. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/memory_manager.py +0 -0
  100. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/namespace_manager.py +0 -0
  101. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/parser.py +0 -0
  102. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/registry.py +0 -0
  103. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/sample_data_manager.py +0 -0
  104. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/user_namespace.py +0 -0
  105. {additory-0.1.0a2 → additory-0.1.0a3}/additory/core/validator.py +0 -0
  106. {additory-0.1.0a2 → additory-0.1.0a3}/additory/expressions/__init__.py +0 -0
  107. {additory-0.1.0a2 → additory-0.1.0a3}/additory/expressions/engine.py +0 -0
  108. {additory-0.1.0a2 → additory-0.1.0a3}/additory/expressions/parser.py +0 -0
  109. {additory-0.1.0a2 → additory-0.1.0a3}/additory/expressions/registry.py +0 -0
  110. {additory-0.1.0a2 → additory-0.1.0a3}/additory/expressions/samples.py +0 -0
  111. {additory-0.1.0a2 → additory-0.1.0a3}/additory/utilities/__init__.py +0 -0
  112. {additory-0.1.0a2 → additory-0.1.0a3}/additory/utilities/encoding.py +0 -0
  113. {additory-0.1.0a2 → additory-0.1.0a3}/additory/utilities/games.py +0 -0
  114. {additory-0.1.0a2 → additory-0.1.0a3}/additory/utilities/keys.py +0 -0
  115. {additory-0.1.0a2 → additory-0.1.0a3}/additory/utilities/lookup.py +0 -0
  116. {additory-0.1.0a2 → additory-0.1.0a3}/additory/utilities/matchers.py +0 -0
  117. {additory-0.1.0a2 → additory-0.1.0a3}/additory/utilities/resolvers.py +0 -0
  118. {additory-0.1.0a2 → additory-0.1.0a3}/additory/utilities/settings.py +0 -0
  119. {additory-0.1.0a2 → additory-0.1.0a3}/additory/utilities/validators.py +0 -0
  120. {additory-0.1.0a2 → additory-0.1.0a3}/documentation/V0.1.0/add_harmonize_units_function.html +0 -0
  121. {additory-0.1.0a2 → additory-0.1.0a3}/documentation/V0.1.0/add_scan_function.html +0 -0
  122. {additory-0.1.0a2 → additory-0.1.0a3}/documentation/V0.1.0/add_to_function.html +0 -0
  123. {additory-0.1.0a2 → additory-0.1.0a3}/reference/expressions_definitions/manifest.json +0 -0
  124. {additory-0.1.0a2 → additory-0.1.0a3}/setup.cfg +0 -0
  125. {additory-0.1.0a2 → additory-0.1.0a3}/user_expressions/bmi1_0.1.add +0 -0
  126. {additory-0.1.0a2 → additory-0.1.0a3}/user_expressions/bmi2_0.1.add +0 -0
  127. {additory-0.1.0a2 → additory-0.1.0a3}/user_expressions/bmi3_0.1.add +0 -0
  128. {additory-0.1.0a2 → additory-0.1.0a3}/user_expressions/bmi_0.1.add +0 -0
  129. {additory-0.1.0a2 → additory-0.1.0a3}/user_expressions/manifest.json +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: additory
3
- Version: 0.1.0a2
4
- Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, synthetic data, and sample-data support.
3
+ Version: 0.1.0a3
4
+ Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, and synthetic data generation support.
5
5
  Author: Krishnamoorthy Sankaran
6
6
  License: MIT
7
7
  Project-URL: homepage, https://github.com/sekarkrishna/additory
@@ -13,6 +13,7 @@ Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: pandas>=1.5
15
15
  Requires-Dist: polars>=0.20
16
+ Requires-Dist: pyarrow>=10.0
16
17
  Requires-Dist: pyyaml>=6.0
17
18
  Requires-Dist: requests>=2.31
18
19
  Requires-Dist: toml>=0.10
@@ -34,11 +35,11 @@ Dynamic: license-file
34
35
 
35
36
  # Additory
36
37
 
37
- **A semantic, extensible dataframe transformation engine with expressions, lookup, synthetic data, and sample-data support.**
38
+ **A semantic, extensible dataframe transformation engine with expressions, lookup, and augmentation support.**
38
39
 
39
40
  [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
40
41
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
41
- [![Version](https://img.shields.io/badge/version-0.1.0a1-orange.svg)](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/)
42
+ [![Version](https://img.shields.io/badge/version-0.1.0a2-orange.svg)](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/)
42
43
 
43
44
  **Author:** Krishnamoorthy Sankaran
44
45
 
@@ -51,17 +52,17 @@ Dynamic: license-file
51
52
  ## 📦 Installation
52
53
 
53
54
  ```bash
54
- pip install additory==0.1.0a1
55
+ pip install additory==0.1.0a2
55
56
  ```
56
57
 
57
58
  **Optional GPU support:**
58
59
  ```bash
59
- pip install additory[gpu]==0.1.0a1 # Includes cuDF for GPU acceleration
60
+ pip install additory[gpu]==0.1.0a2 # Includes cuDF for GPU acceleration
60
61
  ```
61
62
 
62
63
  **Development installation:**
63
64
  ```bash
64
- pip install additory[dev]==0.1.0a1 # Includes testing and development tools
65
+ pip install additory[dev]==0.1.0a2 # Includes testing and development tools
65
66
  ```
66
67
 
67
68
  ## 🎯 Core Functions
@@ -70,7 +71,6 @@ pip install additory[dev]==0.1.0a1 # Includes testing and development tools
70
71
  |----------|---------|---------|
71
72
  | `add.to()` | Lookup/join operations | `add.to(df1, from_df=df2, bring='col', against='key')` |
72
73
  | `add.augment()` | Generate additional data | `add.augment(df, n_rows=1000)` |
73
- | `add.synth()` | Synthetic data from schemas | `add.synth("schema.toml", rows=5000)` |
74
74
  | `add.scan()` | Data profiling & analysis | `add.scan(df, preset="full")` |
75
75
 
76
76
  ## 🧬 Available Expressions
@@ -193,13 +193,9 @@ patients_with_bsa = add.bsa(patients)
193
193
  result = add.fitness_score(add.bmr(add.bmi(patients)))
194
194
  ```
195
195
 
196
- ### 🔄 Augment and Synthetic Data
196
+ ### 🔄 Augment Data Generation
197
197
 
198
- **Augment** generates more data similar to your existing dataset, while **Synthetic** creates entirely new datasets from schema definitions.
199
-
200
- **Key Differences:**
201
- - **Augment**: Learns patterns from existing data to create similar rows
202
- - **Synthetic**: Uses predefined schemas to generate structured data
198
+ **Augment** generates additional data similar to your existing dataset using inline strategies.
203
199
 
204
200
  ```python
205
201
  # Augment existing data (learns from patterns)
@@ -211,9 +207,6 @@ new_data = add.augment("@new", n_rows=500, strategy={
211
207
  'name': 'choice:[John,Jane,Bob]',
212
208
  'age': 'range:18-65'
213
209
  })
214
-
215
- # Generate from schema file (structured approach)
216
- customers = add.synth("customer_schema.toml", rows=10000)
217
210
  ```
218
211
 
219
212
  ## 🧪 Examples
@@ -1,10 +1,10 @@
1
1
  # Additory
2
2
 
3
- **A semantic, extensible dataframe transformation engine with expressions, lookup, synthetic data, and sample-data support.**
3
+ **A semantic, extensible dataframe transformation engine with expressions, lookup, and augmentation support.**
4
4
 
5
5
  [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
6
6
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
- [![Version](https://img.shields.io/badge/version-0.1.0a1-orange.svg)](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/)
7
+ [![Version](https://img.shields.io/badge/version-0.1.0a2-orange.svg)](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/)
8
8
 
9
9
  **Author:** Krishnamoorthy Sankaran
10
10
 
@@ -17,17 +17,17 @@
17
17
  ## 📦 Installation
18
18
 
19
19
  ```bash
20
- pip install additory==0.1.0a1
20
+ pip install additory==0.1.0a2
21
21
  ```
22
22
 
23
23
  **Optional GPU support:**
24
24
  ```bash
25
- pip install additory[gpu]==0.1.0a1 # Includes cuDF for GPU acceleration
25
+ pip install additory[gpu]==0.1.0a2 # Includes cuDF for GPU acceleration
26
26
  ```
27
27
 
28
28
  **Development installation:**
29
29
  ```bash
30
- pip install additory[dev]==0.1.0a1 # Includes testing and development tools
30
+ pip install additory[dev]==0.1.0a2 # Includes testing and development tools
31
31
  ```
32
32
 
33
33
  ## 🎯 Core Functions
@@ -36,7 +36,6 @@ pip install additory[dev]==0.1.0a1 # Includes testing and development tools
36
36
  |----------|---------|---------|
37
37
  | `add.to()` | Lookup/join operations | `add.to(df1, from_df=df2, bring='col', against='key')` |
38
38
  | `add.augment()` | Generate additional data | `add.augment(df, n_rows=1000)` |
39
- | `add.synth()` | Synthetic data from schemas | `add.synth("schema.toml", rows=5000)` |
40
39
  | `add.scan()` | Data profiling & analysis | `add.scan(df, preset="full")` |
41
40
 
42
41
  ## 🧬 Available Expressions
@@ -159,13 +158,9 @@ patients_with_bsa = add.bsa(patients)
159
158
  result = add.fitness_score(add.bmr(add.bmi(patients)))
160
159
  ```
161
160
 
162
- ### 🔄 Augment and Synthetic Data
161
+ ### 🔄 Augment Data Generation
163
162
 
164
- **Augment** generates more data similar to your existing dataset, while **Synthetic** creates entirely new datasets from schema definitions.
165
-
166
- **Key Differences:**
167
- - **Augment**: Learns patterns from existing data to create similar rows
168
- - **Synthetic**: Uses predefined schemas to generate structured data
163
+ **Augment** generates additional data similar to your existing dataset using inline strategies.
169
164
 
170
165
  ```python
171
166
  # Augment existing data (learns from patterns)
@@ -177,9 +172,6 @@ new_data = add.augment("@new", n_rows=500, strategy={
177
172
  'name': 'choice:[John,Jane,Bob]',
178
173
  'age': 'range:18-65'
179
174
  })
180
-
181
- # Generate from schema file (structured approach)
182
- customers = add.synth("customer_schema.toml", rows=10000)
183
175
  ```
184
176
 
185
177
  ## 🧪 Examples
@@ -2,6 +2,9 @@
2
2
 
3
3
  from .dynamic_api import add as _api_instance
4
4
 
5
+ # Version information
6
+ __version__ = "0.1.0a3"
7
+
5
8
  # Expose the API instance normally
6
9
  add = _api_instance
7
10
 
@@ -12,4 +15,5 @@ def __getattr__(name):
12
15
 
13
16
  __all__ = [
14
17
  "add",
18
+ "__version__",
15
19
  ]
@@ -1,14 +1,14 @@
1
1
  """
2
2
  Common Utilities Module
3
3
 
4
- Shared functionality used by both augment and synthetic modules:
4
+ Shared functionality used by both synthetic and expressions modules:
5
5
  - Distribution functions (normal, uniform, skewed, etc.)
6
6
  - List file management (.list format)
7
7
  - Pattern file management (.properties format)
8
8
  - Fallback resolution logic
9
9
 
10
10
  This module eliminates code duplication and provides consistent behavior
11
- across augment and synthetic data generation.
11
+ across synthetic and expression data generation.
12
12
  """
13
13
 
14
14
  from .distributions import (
@@ -180,11 +180,14 @@ def get_arrow_bridge():
180
180
  - Use for all cross-backend conversions
181
181
  - Handles pandas/polars/cuDF via Arrow
182
182
  """
183
- from additory.core.backends.arrow_bridge import EnhancedArrowBridge
183
+ from additory.core.backends.arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
184
184
 
185
185
  # Singleton pattern
186
186
  if not hasattr(get_arrow_bridge, '_instance'):
187
- get_arrow_bridge._instance = EnhancedArrowBridge()
187
+ try:
188
+ get_arrow_bridge._instance = EnhancedArrowBridge()
189
+ except ArrowBridgeError:
190
+ get_arrow_bridge._instance = None
188
191
 
189
192
  return get_arrow_bridge._instance
190
193
 
@@ -194,7 +197,7 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
194
197
  Convert any dataframe to Polars via Arrow bridge.
195
198
 
196
199
  This is the primary conversion function for the Polars-only architecture.
197
- All operations (expressions, augment, etc.) use this to convert input
200
+ All operations (expressions, synthetic, etc.) use this to convert input
198
201
  dataframes to Polars for processing.
199
202
 
200
203
  Args:
@@ -224,7 +227,7 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
224
227
  )
225
228
 
226
229
  # Fast path: already Polars
227
- if isinstance(df, pl.DataFrame):
230
+ if HAS_POLARS and isinstance(df, pl.DataFrame):
228
231
  return df
229
232
 
230
233
  # Validate input
@@ -240,6 +243,13 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
240
243
  # Convert via Arrow bridge
241
244
  try:
242
245
  bridge = get_arrow_bridge()
246
+ if bridge is None:
247
+ # Fallback: direct conversion for pandas
248
+ if backend_type == "pandas":
249
+ if isinstance(df, pd.DataFrame):
250
+ return pl.from_pandas(df)
251
+ raise RuntimeError("Arrow bridge not available and cannot convert non-pandas DataFrame")
252
+
243
253
  arrow_table = bridge.to_arrow(df, backend_type)
244
254
  pl_df = bridge.from_arrow(arrow_table, "polars")
245
255
  return pl_df
@@ -309,6 +319,12 @@ def from_polars(pl_df: 'pl.DataFrame', target_backend: BackendType) -> Any:
309
319
  # Convert via Arrow bridge
310
320
  try:
311
321
  bridge = get_arrow_bridge()
322
+ if bridge is None:
323
+ # Fallback: direct conversion for pandas
324
+ if target_backend == "pandas":
325
+ return pl_df.to_pandas()
326
+ raise RuntimeError("Arrow bridge not available and cannot convert to non-pandas DataFrame")
327
+
312
328
  arrow_table = bridge.to_arrow(pl_df, "polars")
313
329
  result_df = bridge.from_arrow(arrow_table, target_backend)
314
330
  return result_df
@@ -1,5 +1,5 @@
1
1
  """
2
- Distribution Strategies for Data Augmentation
2
+ Distribution Strategies for Synthetic Data Generation
3
3
 
4
4
  Provides statistical distribution-based data generation:
5
5
  - Normal (Gaussian) distribution
@@ -8,8 +8,8 @@ loaded on-demand using the existing .add file parser.
8
8
  Usage:
9
9
  from additory.common.sample_data import get_sample_dataset
10
10
 
11
- # For augment
12
- df = get_sample_dataset("augment", "sample")
11
+ # For synthetic
12
+ df = get_sample_dataset("synthetic", "sample")
13
13
 
14
14
  # For expressions (future)
15
15
  df = get_sample_dataset("expressions", "sample")
@@ -25,7 +25,7 @@ from additory.common.exceptions import ValidationError
25
25
 
26
26
 
27
27
  def get_sample_dataset(
28
- module: str = "augment",
28
+ module: str = "synthetic",
29
29
  block: str = "sample",
30
30
  dataset_type: str = "clean"
31
31
  ) -> pl.DataFrame:
@@ -33,12 +33,12 @@ def get_sample_dataset(
33
33
  Load a sample dataset from .add files.
34
34
 
35
35
  This function provides centralized access to sample datasets across
36
- all additory modules (augment, expressions, utilities). Sample datasets
36
+ all additory modules (synthetic, expressions, utilities). Sample datasets
37
37
  are stored as .add files in the reference/ directory structure.
38
38
 
39
39
  Args:
40
- module: Module name ("augment", "expressions", "utilities")
41
- block: Block name within the .add file ("sample" for augment)
40
+ module: Module name ("synthetic", "expressions", "utilities")
41
+ block: Block name within the .add file ("sample" for synthetic)
42
42
  dataset_type: Type of sample data ("clean" or "unclean")
43
43
 
44
44
  Returns:
@@ -48,8 +48,8 @@ def get_sample_dataset(
48
48
  ValidationError: If module, block, or dataset_type not found
49
49
 
50
50
  Examples:
51
- >>> # Load augment sample dataset
52
- >>> df = get_sample_dataset("augment", "sample")
51
+ >>> # Load synthetic sample dataset
52
+ >>> df = get_sample_dataset("synthetic", "sample")
53
53
  >>> print(df.shape)
54
54
  (50, 10)
55
55
 
@@ -57,7 +57,7 @@ def get_sample_dataset(
57
57
  >>> df = get_sample_dataset("expressions", "sample", "clean")
58
58
  >>> df_unclean = get_sample_dataset("expressions", "sample", "unclean")
59
59
 
60
- Sample Dataset Structure (augment):
60
+ Sample Dataset Structure (synthetic):
61
61
  - id: Sequential numeric IDs (1-50)
62
62
  - emp_id: Employee IDs with pattern (EMP_001 - EMP_050)
63
63
  - order_id: Order IDs with different padding (ORD_0001 - ORD_0050)
@@ -72,8 +72,8 @@ def get_sample_dataset(
72
72
  # Construct path to .add file
73
73
  base_path = Path(__file__).parent.parent.parent / "reference"
74
74
 
75
- if module == "augment":
76
- add_file_path = base_path / "augment_definitions" / f"{block}_0.1.add"
75
+ if module == "synthetic":
76
+ add_file_path = base_path / "synthetic_definitions" / f"{block}_0.1.add"
77
77
  elif module == "expressions":
78
78
  add_file_path = base_path / "expressions_definitions" / f"{block}_0.1.add"
79
79
  elif module == "utilities":
@@ -81,7 +81,7 @@ def get_sample_dataset(
81
81
  else:
82
82
  raise ValidationError(
83
83
  f"Unknown module '{module}'. "
84
- f"Valid modules: augment, expressions, utilities"
84
+ f"Valid modules: synthetic, expressions, utilities"
85
85
  )
86
86
 
87
87
  # Check if file exists
@@ -141,7 +141,7 @@ def list_available_samples() -> dict:
141
141
  >>> samples = list_available_samples()
142
142
  >>> print(samples)
143
143
  {
144
- 'augment': ['sample'],
144
+ 'synthetic': ['sample'],
145
145
  'expressions': ['sample'],
146
146
  'utilities': []
147
147
  }
@@ -149,15 +149,15 @@ def list_available_samples() -> dict:
149
149
  base_path = Path(__file__).parent.parent.parent / "reference"
150
150
  available = {}
151
151
 
152
- # Check augment
153
- augment_path = base_path / "augment_definitions"
154
- if augment_path.exists():
155
- available['augment'] = [
152
+ # Check synthetic
153
+ synthetic_path = base_path / "synthetic_definitions"
154
+ if synthetic_path.exists():
155
+ available['synthetic'] = [
156
156
  f.stem.rsplit('_', 1)[0] # Remove version suffix
157
- for f in augment_path.glob("*.add")
157
+ for f in synthetic_path.glob("*.add")
158
158
  ]
159
159
  else:
160
- available['augment'] = []
160
+ available['synthetic'] = []
161
161
 
162
162
  # Check expressions
163
163
  expressions_path = base_path / "expressions_definitions"
@@ -16,6 +16,13 @@ try:
16
16
  except ImportError as e:
17
17
  ARROW_AVAILABLE = False
18
18
  IMPORT_ERROR = str(e)
19
+ # Create dummy classes for type annotations
20
+ class pa:
21
+ Table = Any
22
+ class pl:
23
+ DataFrame = Any
24
+ class pd:
25
+ DataFrame = Any
19
26
 
20
27
  from ..logging import log_info, log_warning
21
28
  from .cudf_bridge import get_cudf_bridge
@@ -32,7 +32,10 @@ class PolarsExpressionEngine:
32
32
  """Exclusive Polars-based expression processing engine"""
33
33
 
34
34
  def __init__(self):
35
- self.arrow_bridge = EnhancedArrowBridge()
35
+ try:
36
+ self.arrow_bridge = EnhancedArrowBridge()
37
+ except ArrowBridgeError:
38
+ self.arrow_bridge = None
36
39
  self.execution_stats = {
37
40
  "total_executions": 0,
38
41
  "total_time_ms": 0.0,
@@ -68,14 +71,28 @@ class PolarsExpressionEngine:
68
71
  try:
69
72
  # Auto-detect backend if not specified
70
73
  if backend_type is None:
71
- backend_type = self.arrow_bridge.detect_backend(df)
74
+ if self.arrow_bridge:
75
+ backend_type = self.arrow_bridge.detect_backend(df)
76
+ else:
77
+ backend_type = "pandas" # fallback
72
78
 
73
79
  # Get memory usage before processing
74
- memory_before = self.arrow_bridge._get_memory_usage_mb()
80
+ if self.arrow_bridge:
81
+ memory_before = self.arrow_bridge._get_memory_usage_mb()
82
+ else:
83
+ memory_before = 0
75
84
 
76
85
  # 1. Convert input to Arrow
77
86
  log_info(f"[polars_engine] Converting {backend_type} to Arrow")
78
- arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
87
+ if self.arrow_bridge:
88
+ arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
89
+ else:
90
+ # Fallback: assume pandas and convert directly
91
+ import pandas as pd
92
+ if isinstance(df, pd.DataFrame):
93
+ arrow_table = pl.from_pandas(df).to_arrow()
94
+ else:
95
+ raise RuntimeError("Arrow bridge not available and input is not pandas DataFrame")
79
96
 
80
97
  # 2. Convert Arrow to Polars
81
98
  log_info("[polars_engine] Converting Arrow to Polars")
@@ -93,11 +110,18 @@ class PolarsExpressionEngine:
93
110
 
94
111
  # 5. Convert to original backend format
95
112
  log_info(f"[polars_engine] Converting Arrow to {backend_type}")
96
- final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
113
+ if self.arrow_bridge:
114
+ final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
115
+ else:
116
+ # Fallback: convert back to pandas
117
+ final_result = pl.from_arrow(result_arrow).to_pandas()
97
118
 
98
119
  # Calculate execution statistics
99
120
  execution_time = (datetime.now() - start_time).total_seconds() * 1000
100
- memory_after = self.arrow_bridge._get_memory_usage_mb()
121
+ if self.arrow_bridge:
122
+ memory_after = self.arrow_bridge._get_memory_usage_mb()
123
+ else:
124
+ memory_after = 0
101
125
  memory_used = max(0, memory_after - memory_before)
102
126
 
103
127
  # Update global statistics
@@ -122,7 +146,8 @@ class PolarsExpressionEngine:
122
146
 
123
147
  finally:
124
148
  # 6. Always cleanup Arrow memory
125
- self.arrow_bridge.cleanup_arrow_memory()
149
+ if self.arrow_bridge:
150
+ self.arrow_bridge.cleanup_arrow_memory()
126
151
 
127
152
  def _execute_polars_expression(self, polars_df: pl.DataFrame,
128
153
  expression: str, output_column: str) -> pl.DataFrame:
@@ -381,14 +406,28 @@ class PolarsExpressionEngine:
381
406
  try:
382
407
  # Auto-detect backend if not specified
383
408
  if backend_type is None:
384
- backend_type = self.arrow_bridge.detect_backend(df)
409
+ if self.arrow_bridge:
410
+ backend_type = self.arrow_bridge.detect_backend(df)
411
+ else:
412
+ backend_type = "pandas"
385
413
 
386
414
  # Get memory usage before processing
387
- memory_before = self.arrow_bridge._get_memory_usage_mb()
415
+ if self.arrow_bridge:
416
+ memory_before = self.arrow_bridge._get_memory_usage_mb()
417
+ else:
418
+ memory_before = 0
388
419
 
389
420
  # Convert to Polars via Arrow
390
- arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
391
- polars_df = pl.from_arrow(arrow_table)
421
+ if self.arrow_bridge:
422
+ arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
423
+ polars_df = pl.from_arrow(arrow_table)
424
+ else:
425
+ # Fallback: assume pandas
426
+ import pandas as pd
427
+ if isinstance(df, pd.DataFrame):
428
+ polars_df = pl.from_pandas(df)
429
+ else:
430
+ raise RuntimeError("Arrow bridge not available and input is not pandas DataFrame")
392
431
 
393
432
  # Execute using AST
394
433
  polars_expr = self._ast_to_polars_expr(ast_tree)
@@ -396,11 +435,17 @@ class PolarsExpressionEngine:
396
435
 
397
436
  # Convert back to original format
398
437
  result_arrow = result_df.to_arrow()
399
- final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
438
+ if self.arrow_bridge:
439
+ final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
440
+ else:
441
+ final_result = pl.from_arrow(result_arrow).to_pandas()
400
442
 
401
443
  # Calculate statistics
402
444
  execution_time = (datetime.now() - start_time).total_seconds() * 1000
403
- memory_after = self.arrow_bridge._get_memory_usage_mb()
445
+ if self.arrow_bridge:
446
+ memory_after = self.arrow_bridge._get_memory_usage_mb()
447
+ else:
448
+ memory_after = 0
404
449
  memory_used = max(0, memory_after - memory_before)
405
450
 
406
451
  # Update statistics
@@ -422,7 +467,8 @@ class PolarsExpressionEngine:
422
467
  raise PolarsExpressionError(f"AST execution failed: {e}")
423
468
 
424
469
  finally:
425
- self.arrow_bridge.cleanup_arrow_memory()
470
+ if self.arrow_bridge:
471
+ self.arrow_bridge.cleanup_arrow_memory()
426
472
 
427
473
  def validate_expression(self, expression: str) -> bool:
428
474
  """
@@ -489,7 +535,10 @@ class PolarsExpressionEngine:
489
535
  Benchmark results
490
536
  """
491
537
  times = []
492
- backend_type = self.arrow_bridge.detect_backend(df)
538
+ if self.arrow_bridge:
539
+ backend_type = self.arrow_bridge.detect_backend(df)
540
+ else:
541
+ backend_type = "pandas"
493
542
 
494
543
  for i in range(iterations):
495
544
  try:
@@ -532,7 +581,8 @@ class PolarsExpressionEngine:
532
581
  """Cleanup callback for memory manager"""
533
582
  try:
534
583
  # Cleanup Arrow bridge memory
535
- self.arrow_bridge.cleanup_arrow_memory()
584
+ if self.arrow_bridge:
585
+ self.arrow_bridge.cleanup_arrow_memory()
536
586
 
537
587
  # Reset statistics if they get too large
538
588
  if self.execution_stats["total_executions"] > 10000: