additory 0.1.0a2__tar.gz → 0.1.0a4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. {additory-0.1.0a2 → additory-0.1.0a4}/PKG-INFO +44 -28
  2. {additory-0.1.0a2 → additory-0.1.0a4}/README.md +41 -26
  3. {additory-0.1.0a2 → additory-0.1.0a4}/additory/__init__.py +4 -0
  4. {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/__init__.py +2 -2
  5. {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/backend.py +20 -4
  6. {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/distributions.py +1 -1
  7. {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/sample_data.py +19 -19
  8. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/backends/arrow_bridge.py +7 -0
  9. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/config.py +3 -3
  10. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/polars_expression_engine.py +66 -16
  11. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/registry.py +4 -3
  12. {additory-0.1.0a2 → additory-0.1.0a4}/additory/dynamic_api.py +95 -51
  13. {additory-0.1.0a2 → additory-0.1.0a4}/additory/expressions/proxy.py +4 -1
  14. {additory-0.1.0a2 → additory-0.1.0a4}/additory/expressions/registry.py +3 -3
  15. additory-0.1.0a4/additory/synthetic/__init__.py +13 -0
  16. additory-0.1.0a4/additory/synthetic/column_name_resolver.py +149 -0
  17. additory-0.1.0a4/additory/synthetic/deduce.py +259 -0
  18. {additory-0.1.0a2/additory/augment → additory-0.1.0a4/additory/synthetic}/distributions.py +2 -2
  19. {additory-0.1.0a2/additory/augment → additory-0.1.0a4/additory/synthetic}/forecast.py +1 -1
  20. additory-0.1.0a4/additory/synthetic/linked_list_parser.py +415 -0
  21. additory-0.1.0a4/additory/synthetic/namespace_lookup.py +129 -0
  22. {additory-0.1.0a2/additory/augment → additory-0.1.0a4/additory/synthetic}/smote.py +1 -1
  23. {additory-0.1.0a2/additory/augment → additory-0.1.0a4/additory/synthetic}/strategies.py +87 -44
  24. additory-0.1.0a2/additory/augment/augmentor.py → additory-0.1.0a4/additory/synthetic/synthesizer.py +75 -15
  25. {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/units.py +4 -1
  26. {additory-0.1.0a2 → additory-0.1.0a4}/additory.egg-info/SOURCES.txt +11 -37
  27. additory-0.1.0a4/documentation/V0.1.0/add_deduce_function.html +759 -0
  28. {additory-0.1.0a2 → additory-0.1.0a4}/documentation/V0.1.0/add_onehotencoding_function.html +43 -40
  29. additory-0.1.0a2/documentation/V0.1.0/add_augment_function.html → additory-0.1.0a4/documentation/V0.1.0/add_synthetic_function.html +140 -147
  30. {additory-0.1.0a2 → additory-0.1.0a4}/documentation/V0.1.0/expressions.html +55 -79
  31. {additory-0.1.0a2 → additory-0.1.0a4}/pyproject.toml +3 -2
  32. {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/age_category_0.1.add +23 -5
  33. {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/blood_pressure_category_0.1.add +24 -5
  34. {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/bmi2_0.1.add +3 -3
  35. additory-0.1.0a4/reference/expressions_definitions/bmi3_0.1.add +41 -0
  36. {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/bmi_0.1.add +3 -3
  37. {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/bmr_0.1.add +13 -5
  38. {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/body_fat_percentage_0.1.add +13 -5
  39. {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/bsa_0.1.add +11 -5
  40. {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/cholesterol_ratio_0.1.add +11 -5
  41. {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/fitness_score_0.1.add +24 -5
  42. {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/ideal_body_weight_0.1.add +14 -5
  43. {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/waist_hip_ratio_0.1.add +11 -5
  44. additory-0.1.0a2/additory/augment/__init__.py +0 -24
  45. additory-0.1.0a2/additory/augment/builtin_lists.py +0 -430
  46. additory-0.1.0a2/additory/augment/list_registry.py +0 -177
  47. additory-0.1.0a2/additory/synthetic/__init__.py +0 -101
  48. additory-0.1.0a2/additory/synthetic/api.py +0 -220
  49. additory-0.1.0a2/additory/synthetic/common_integration.py +0 -314
  50. additory-0.1.0a2/additory/synthetic/config.py +0 -262
  51. additory-0.1.0a2/additory/synthetic/engines.py +0 -529
  52. additory-0.1.0a2/additory/synthetic/exceptions.py +0 -180
  53. additory-0.1.0a2/additory/synthetic/file_managers.py +0 -518
  54. additory-0.1.0a2/additory/synthetic/generator.py +0 -702
  55. additory-0.1.0a2/additory/synthetic/generator_parser.py +0 -68
  56. additory-0.1.0a2/additory/synthetic/integration.py +0 -319
  57. additory-0.1.0a2/additory/synthetic/models.py +0 -241
  58. additory-0.1.0a2/additory/synthetic/pattern_resolver.py +0 -573
  59. additory-0.1.0a2/additory/synthetic/performance.py +0 -469
  60. additory-0.1.0a2/additory/synthetic/polars_integration.py +0 -464
  61. additory-0.1.0a2/additory/synthetic/proxy.py +0 -60
  62. additory-0.1.0a2/additory/synthetic/schema_parser.py +0 -685
  63. additory-0.1.0a2/additory/synthetic/validator.py +0 -553
  64. additory-0.1.0a2/documentation/V0.1.0/add_synth_function.html +0 -664
  65. additory-0.1.0a2/reference/expressions_definitions/bmi3_0.1.add +0 -26
  66. additory-0.1.0a2/reference/schema_definitions/ca.list +0 -41
  67. additory-0.1.0a2/reference/schema_definitions/ca.properties +0 -14
  68. additory-0.1.0a2/reference/schema_definitions/eu.list +0 -41
  69. additory-0.1.0a2/reference/schema_definitions/eu.properties +0 -13
  70. additory-0.1.0a2/reference/schema_definitions/finance.list +0 -31
  71. additory-0.1.0a2/reference/schema_definitions/finance.properties +0 -18
  72. additory-0.1.0a2/reference/schema_definitions/global.list +0 -57
  73. additory-0.1.0a2/reference/schema_definitions/global.properties +0 -11
  74. additory-0.1.0a2/reference/schema_definitions/healthcare.list +0 -28
  75. additory-0.1.0a2/reference/schema_definitions/us.list +0 -41
  76. additory-0.1.0a2/reference/schema_definitions/us.properties +0 -14
  77. {additory-0.1.0a2 → additory-0.1.0a4}/LICENSE +0 -0
  78. {additory-0.1.0a2 → additory-0.1.0a4}/additory/analysis/__init__.py +0 -0
  79. {additory-0.1.0a2 → additory-0.1.0a4}/additory/analysis/cardinality.py +0 -0
  80. {additory-0.1.0a2 → additory-0.1.0a4}/additory/analysis/correlations.py +0 -0
  81. {additory-0.1.0a2 → additory-0.1.0a4}/additory/analysis/distributions.py +0 -0
  82. {additory-0.1.0a2 → additory-0.1.0a4}/additory/analysis/quality.py +0 -0
  83. {additory-0.1.0a2 → additory-0.1.0a4}/additory/analysis/scan.py +0 -0
  84. {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/column_utils.py +0 -0
  85. {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/exceptions.py +0 -0
  86. {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/lists.py +0 -0
  87. {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/patterns.py +0 -0
  88. {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/resolver.py +0 -0
  89. {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/validation.py +0 -0
  90. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/__init__.py +0 -0
  91. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/ast_builder.py +0 -0
  92. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/backends/__init__.py +0 -0
  93. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/backends/cudf_bridge.py +0 -0
  94. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/column_positioning.py +0 -0
  95. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/compiler_polars.py +0 -0
  96. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/enhanced_cache_manager.py +0 -0
  97. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/enhanced_matchers.py +0 -0
  98. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/enhanced_version_manager.py +0 -0
  99. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/executor.py +0 -0
  100. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/integrity_manager.py +0 -0
  101. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/loader.py +0 -0
  102. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/logging.py +0 -0
  103. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/memory_manager.py +0 -0
  104. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/namespace_manager.py +0 -0
  105. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/parser.py +0 -0
  106. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/sample_data_manager.py +0 -0
  107. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/user_namespace.py +0 -0
  108. {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/validator.py +0 -0
  109. {additory-0.1.0a2 → additory-0.1.0a4}/additory/expressions/__init__.py +0 -0
  110. {additory-0.1.0a2 → additory-0.1.0a4}/additory/expressions/engine.py +0 -0
  111. {additory-0.1.0a2 → additory-0.1.0a4}/additory/expressions/parser.py +0 -0
  112. {additory-0.1.0a2 → additory-0.1.0a4}/additory/expressions/samples.py +0 -0
  113. {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/__init__.py +0 -0
  114. {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/encoding.py +0 -0
  115. {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/games.py +0 -0
  116. {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/keys.py +0 -0
  117. {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/lookup.py +0 -0
  118. {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/matchers.py +0 -0
  119. {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/resolvers.py +0 -0
  120. {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/settings.py +0 -0
  121. {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/validators.py +0 -0
  122. {additory-0.1.0a2 → additory-0.1.0a4}/documentation/V0.1.0/add_harmonize_units_function.html +0 -0
  123. {additory-0.1.0a2 → additory-0.1.0a4}/documentation/V0.1.0/add_scan_function.html +0 -0
  124. {additory-0.1.0a2 → additory-0.1.0a4}/documentation/V0.1.0/add_to_function.html +0 -0
  125. {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/manifest.json +0 -0
  126. {additory-0.1.0a2 → additory-0.1.0a4}/setup.cfg +0 -0
  127. {additory-0.1.0a2 → additory-0.1.0a4}/user_expressions/bmi1_0.1.add +0 -0
  128. {additory-0.1.0a2 → additory-0.1.0a4}/user_expressions/bmi2_0.1.add +0 -0
  129. {additory-0.1.0a2 → additory-0.1.0a4}/user_expressions/bmi3_0.1.add +0 -0
  130. {additory-0.1.0a2 → additory-0.1.0a4}/user_expressions/bmi_0.1.add +0 -0
  131. {additory-0.1.0a2 → additory-0.1.0a4}/user_expressions/manifest.json +0 -0
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: additory
3
- Version: 0.1.0a2
4
- Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, synthetic data, and sample-data support.
3
+ Version: 0.1.0a4
4
+ Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, and synthetic data generation support.
5
5
  Author: Krishnamoorthy Sankaran
6
6
  License: MIT
7
7
  Project-URL: homepage, https://github.com/sekarkrishna/additory
@@ -13,6 +13,7 @@ Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: pandas>=1.5
15
15
  Requires-Dist: polars>=0.20
16
+ Requires-Dist: pyarrow>=10.0
16
17
  Requires-Dist: pyyaml>=6.0
17
18
  Requires-Dist: requests>=2.31
18
19
  Requires-Dist: toml>=0.10
@@ -34,11 +35,11 @@ Dynamic: license-file
34
35
 
35
36
  # Additory
36
37
 
37
- **A semantic, extensible dataframe transformation engine with expressions, lookup, synthetic data, and sample-data support.**
38
+ **A semantic, extensible dataframe transformation engine with expressions, lookup, and synthetic data generation support.**
38
39
 
39
40
  [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
40
41
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
41
- [![Version](https://img.shields.io/badge/version-0.1.0a1-orange.svg)](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/)
42
+ [![Version](https://img.shields.io/badge/version-0.1.0a4-orange.svg)](https://github.com/sekarkrishna/additory)
42
43
 
43
44
  **Author:** Krishnamoorthy Sankaran
44
45
 
@@ -51,17 +52,17 @@ Dynamic: license-file
51
52
  ## 📦 Installation
52
53
 
53
54
  ```bash
54
- pip install additory==0.1.0a1
55
+ pip install additory==0.1.0a4
55
56
  ```
56
57
 
57
58
  **Optional GPU support:**
58
59
  ```bash
59
- pip install additory[gpu]==0.1.0a1 # Includes cuDF for GPU acceleration
60
+ pip install additory[gpu]==0.1.0a4 # Includes cuDF for GPU acceleration
60
61
  ```
61
62
 
62
63
  **Development installation:**
63
64
  ```bash
64
- pip install additory[dev]==0.1.0a1 # Includes testing and development tools
65
+ pip install additory[dev]==0.1.0a4 # Includes testing and development tools
65
66
  ```
66
67
 
67
68
  ## 🎯 Core Functions
@@ -69,8 +70,8 @@ pip install additory[dev]==0.1.0a1 # Includes testing and development tools
69
70
  | Function | Purpose | Example |
70
71
  |----------|---------|---------|
71
72
  | `add.to()` | Lookup/join operations | `add.to(df1, from_df=df2, bring='col', against='key')` |
72
- | `add.augment()` | Generate additional data | `add.augment(df, n_rows=1000)` |
73
- | `add.synth()` | Synthetic data from schemas | `add.synth("schema.toml", rows=5000)` |
73
+ | `add.synthetic()` | Generate additional data | `add.synthetic(df, n_rows=1000)` |
74
+ | `add.deduce()` | Text-based label deduction | `add.deduce(df, from_column='text', to_column='label')` |
74
75
  | `add.scan()` | Data profiling & analysis | `add.scan(df, preset="full")` |
75
76
 
76
77
  ## 🧬 Available Expressions
@@ -119,7 +120,7 @@ import additory as add
119
120
 
120
121
  # Works with polars
121
122
  df_polars = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
122
- result = add.augment(df_polars, n_rows=100)
123
+ result = add.synthetic(df_polars, n_rows=100)
123
124
 
124
125
  # Automatic type detection and conversion
125
126
  ```
@@ -193,27 +194,42 @@ patients_with_bsa = add.bsa(patients)
193
194
  result = add.fitness_score(add.bmr(add.bmi(patients)))
194
195
  ```
195
196
 
196
- ### 🔄 Augment and Synthetic Data
197
+ ### 🔄 Synthetic Data Generation
197
198
 
198
- **Augment** generates more data similar to your existing dataset, while **Synthetic** creates entirely new datasets from schema definitions.
199
-
200
- **Key Differences:**
201
- - **Augment**: Learns patterns from existing data to create similar rows
202
- - **Synthetic**: Uses predefined schemas to generate structured data
199
+ **Synthetic** generates additional data similar to your existing dataset using inline strategies.
203
200
 
204
201
  ```python
205
- # Augment existing data (learns from patterns)
206
- more_customers = add.augment(customers, n_rows=1000)
202
+ # Extend existing data (learns from patterns)
203
+ more_customers = add.synthetic(customers, n_rows=1000)
207
204
 
208
205
  # Create data from scratch with strategies
209
- new_data = add.augment("@new", n_rows=500, strategy={
206
+ new_data = add.synthetic("@new", n_rows=500, strategy={
210
207
  'id': 'increment:start=1',
211
208
  'name': 'choice:[John,Jane,Bob]',
212
209
  'age': 'range:18-65'
213
210
  })
211
+ ```
212
+
213
+ ### 🤖 Text-Based Label Deduction
214
214
 
215
- # Generate from schema file (structured approach)
216
- customers = add.synth("customer_schema.toml", rows=10000)
215
+ **Deduce** automatically fills in missing labels by learning from your existing labeled examples. Pure Python, no LLMs, offline-first.
216
+
217
+ ```python
218
+ # Deduce missing labels from text
219
+ tickets = pd.DataFrame({
220
+ "ticket_text": ["Cannot log in", "Billing question", "App crashes", "Need invoice"],
221
+ "category": ["Technical", "Billing", None, None]
222
+ })
223
+
224
+ # Automatically fill in missing categories
225
+ result = add.deduce(tickets, from_column="ticket_text", to_column="category")
226
+
227
+ # Use multiple columns for better accuracy
228
+ result = add.deduce(
229
+ df,
230
+ from_column=["title", "description"],
231
+ to_column="category"
232
+ )
217
233
  ```
218
234
 
219
235
  ## 🧪 Examples
@@ -231,7 +247,7 @@ customers = pd.DataFrame({
231
247
  })
232
248
 
233
249
  # Generate more customers
234
- customers = add.augment(customers, n_rows=10000)
250
+ customers = add.synthetic(customers, n_rows=10000)
235
251
 
236
252
  # Add customer tiers
237
253
  tiers = pd.DataFrame({
@@ -257,7 +273,7 @@ strategy = {
257
273
  'height_cm': 'range:150-200' # Height in cm
258
274
  }
259
275
 
260
- patients = add.augment("@new", n_rows=1000, strategy=strategy)
276
+ patients = add.synthetic("@new", n_rows=1000, strategy=strategy)
261
277
 
262
278
  # Convert height to meters for expressions
263
279
  patients['height_m'] = patients['height_cm'] / 100
@@ -272,19 +288,19 @@ print(result.correlations)
272
288
 
273
289
  ## 📚 Documentation
274
290
 
275
- - **[Function Documentation](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/documentation/V0.1.0/)** - Detailed guides for each function
276
- - **[Expressions Guide](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/documentation/V0.1.0/expressions.html)** - Complete expressions reference
291
+ - **[Function Documentation](https://github.com/sekarkrishna/additory/tree/main/documentation/)** - Detailed guides for each function
292
+ - **[Expressions Guide](https://github.com/sekarkrishna/additory/tree/main/documentation/)** - Complete expressions reference
277
293
 
278
294
  ## 📄 License
279
295
 
280
- MIT License - see [LICENSE](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/LICENSE) file for details.
296
+ MIT License - see [LICENSE](LICENSE) file for details.
281
297
 
282
298
  ## 📞 Support
283
299
 
284
300
  - **Issues**: [GitHub Issues](https://github.com/sekarkrishna/additory/issues)
285
- - **Documentation**: [Full Documentation](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/documentation/V0.1.0)
301
+ - **Documentation**: [Full Documentation](https://github.com/sekarkrishna/additory/tree/main/documentation/)
286
302
 
287
- ## 🗺️ v0.1.1 (February 2025)
303
+ ## 🗺️ v0.1.1 (January 2026)
288
304
  - Enhanced documentation and tutorials
289
305
  - Performance optimizations
290
306
  - Additional expressions
@@ -1,10 +1,10 @@
1
1
  # Additory
2
2
 
3
- **A semantic, extensible dataframe transformation engine with expressions, lookup, synthetic data, and sample-data support.**
3
+ **A semantic, extensible dataframe transformation engine with expressions, lookup, and synthetic data generation support.**
4
4
 
5
5
  [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
6
6
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
- [![Version](https://img.shields.io/badge/version-0.1.0a1-orange.svg)](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/)
7
+ [![Version](https://img.shields.io/badge/version-0.1.0a4-orange.svg)](https://github.com/sekarkrishna/additory)
8
8
 
9
9
  **Author:** Krishnamoorthy Sankaran
10
10
 
@@ -17,17 +17,17 @@
17
17
  ## 📦 Installation
18
18
 
19
19
  ```bash
20
- pip install additory==0.1.0a1
20
+ pip install additory==0.1.0a4
21
21
  ```
22
22
 
23
23
  **Optional GPU support:**
24
24
  ```bash
25
- pip install additory[gpu]==0.1.0a1 # Includes cuDF for GPU acceleration
25
+ pip install additory[gpu]==0.1.0a4 # Includes cuDF for GPU acceleration
26
26
  ```
27
27
 
28
28
  **Development installation:**
29
29
  ```bash
30
- pip install additory[dev]==0.1.0a1 # Includes testing and development tools
30
+ pip install additory[dev]==0.1.0a4 # Includes testing and development tools
31
31
  ```
32
32
 
33
33
  ## 🎯 Core Functions
@@ -35,8 +35,8 @@ pip install additory[dev]==0.1.0a1 # Includes testing and development tools
35
35
  | Function | Purpose | Example |
36
36
  |----------|---------|---------|
37
37
  | `add.to()` | Lookup/join operations | `add.to(df1, from_df=df2, bring='col', against='key')` |
38
- | `add.augment()` | Generate additional data | `add.augment(df, n_rows=1000)` |
39
- | `add.synth()` | Synthetic data from schemas | `add.synth("schema.toml", rows=5000)` |
38
+ | `add.synthetic()` | Generate additional data | `add.synthetic(df, n_rows=1000)` |
39
+ | `add.deduce()` | Text-based label deduction | `add.deduce(df, from_column='text', to_column='label')` |
40
40
  | `add.scan()` | Data profiling & analysis | `add.scan(df, preset="full")` |
41
41
 
42
42
  ## 🧬 Available Expressions
@@ -85,7 +85,7 @@ import additory as add
85
85
 
86
86
  # Works with polars
87
87
  df_polars = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
88
- result = add.augment(df_polars, n_rows=100)
88
+ result = add.synthetic(df_polars, n_rows=100)
89
89
 
90
90
  # Automatic type detection and conversion
91
91
  ```
@@ -159,27 +159,42 @@ patients_with_bsa = add.bsa(patients)
159
159
  result = add.fitness_score(add.bmr(add.bmi(patients)))
160
160
  ```
161
161
 
162
- ### 🔄 Augment and Synthetic Data
162
+ ### 🔄 Synthetic Data Generation
163
163
 
164
- **Augment** generates more data similar to your existing dataset, while **Synthetic** creates entirely new datasets from schema definitions.
165
-
166
- **Key Differences:**
167
- - **Augment**: Learns patterns from existing data to create similar rows
168
- - **Synthetic**: Uses predefined schemas to generate structured data
164
+ **Synthetic** generates additional data similar to your existing dataset using inline strategies.
169
165
 
170
166
  ```python
171
- # Augment existing data (learns from patterns)
172
- more_customers = add.augment(customers, n_rows=1000)
167
+ # Extend existing data (learns from patterns)
168
+ more_customers = add.synthetic(customers, n_rows=1000)
173
169
 
174
170
  # Create data from scratch with strategies
175
- new_data = add.augment("@new", n_rows=500, strategy={
171
+ new_data = add.synthetic("@new", n_rows=500, strategy={
176
172
  'id': 'increment:start=1',
177
173
  'name': 'choice:[John,Jane,Bob]',
178
174
  'age': 'range:18-65'
179
175
  })
176
+ ```
177
+
178
+ ### 🤖 Text-Based Label Deduction
180
179
 
181
- # Generate from schema file (structured approach)
182
- customers = add.synth("customer_schema.toml", rows=10000)
180
+ **Deduce** automatically fills in missing labels by learning from your existing labeled examples. Pure Python, no LLMs, offline-first.
181
+
182
+ ```python
183
+ # Deduce missing labels from text
184
+ tickets = pd.DataFrame({
185
+ "ticket_text": ["Cannot log in", "Billing question", "App crashes", "Need invoice"],
186
+ "category": ["Technical", "Billing", None, None]
187
+ })
188
+
189
+ # Automatically fill in missing categories
190
+ result = add.deduce(tickets, from_column="ticket_text", to_column="category")
191
+
192
+ # Use multiple columns for better accuracy
193
+ result = add.deduce(
194
+ df,
195
+ from_column=["title", "description"],
196
+ to_column="category"
197
+ )
183
198
  ```
184
199
 
185
200
  ## 🧪 Examples
@@ -197,7 +212,7 @@ customers = pd.DataFrame({
197
212
  })
198
213
 
199
214
  # Generate more customers
200
- customers = add.augment(customers, n_rows=10000)
215
+ customers = add.synthetic(customers, n_rows=10000)
201
216
 
202
217
  # Add customer tiers
203
218
  tiers = pd.DataFrame({
@@ -223,7 +238,7 @@ strategy = {
223
238
  'height_cm': 'range:150-200' # Height in cm
224
239
  }
225
240
 
226
- patients = add.augment("@new", n_rows=1000, strategy=strategy)
241
+ patients = add.synthetic("@new", n_rows=1000, strategy=strategy)
227
242
 
228
243
  # Convert height to meters for expressions
229
244
  patients['height_m'] = patients['height_cm'] / 100
@@ -238,19 +253,19 @@ print(result.correlations)
238
253
 
239
254
  ## 📚 Documentation
240
255
 
241
- - **[Function Documentation](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/documentation/V0.1.0/)** - Detailed guides for each function
242
- - **[Expressions Guide](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/documentation/V0.1.0/expressions.html)** - Complete expressions reference
256
+ - **[Function Documentation](https://github.com/sekarkrishna/additory/tree/main/documentation/)** - Detailed guides for each function
257
+ - **[Expressions Guide](https://github.com/sekarkrishna/additory/tree/main/documentation/)** - Complete expressions reference
243
258
 
244
259
  ## 📄 License
245
260
 
246
- MIT License - see [LICENSE](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/LICENSE) file for details.
261
+ MIT License - see [LICENSE](LICENSE) file for details.
247
262
 
248
263
  ## 📞 Support
249
264
 
250
265
  - **Issues**: [GitHub Issues](https://github.com/sekarkrishna/additory/issues)
251
- - **Documentation**: [Full Documentation](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/documentation/V0.1.0)
266
+ - **Documentation**: [Full Documentation](https://github.com/sekarkrishna/additory/tree/main/documentation/)
252
267
 
253
- ## 🗺️ v0.1.1 (February 2025)
268
+ ## 🗺️ v0.1.1 (January 2026)
254
269
  - Enhanced documentation and tutorials
255
270
  - Performance optimizations
256
271
  - Additional expressions
@@ -2,6 +2,9 @@
2
2
 
3
3
  from .dynamic_api import add as _api_instance
4
4
 
5
+ # Version information
6
+ __version__ = "0.1.0a4"
7
+
5
8
  # Expose the API instance normally
6
9
  add = _api_instance
7
10
 
@@ -12,4 +15,5 @@ def __getattr__(name):
12
15
 
13
16
  __all__ = [
14
17
  "add",
18
+ "__version__",
15
19
  ]
@@ -1,14 +1,14 @@
1
1
  """
2
2
  Common Utilities Module
3
3
 
4
- Shared functionality used by both augment and synthetic modules:
4
+ Shared functionality used by both synthetic and expressions modules:
5
5
  - Distribution functions (normal, uniform, skewed, etc.)
6
6
  - List file management (.list format)
7
7
  - Pattern file management (.properties format)
8
8
  - Fallback resolution logic
9
9
 
10
10
  This module eliminates code duplication and provides consistent behavior
11
- across augment and synthetic data generation.
11
+ across synthetic and expression data generation.
12
12
  """
13
13
 
14
14
  from .distributions import (
@@ -180,11 +180,14 @@ def get_arrow_bridge():
180
180
  - Use for all cross-backend conversions
181
181
  - Handles pandas/polars/cuDF via Arrow
182
182
  """
183
- from additory.core.backends.arrow_bridge import EnhancedArrowBridge
183
+ from additory.core.backends.arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
184
184
 
185
185
  # Singleton pattern
186
186
  if not hasattr(get_arrow_bridge, '_instance'):
187
- get_arrow_bridge._instance = EnhancedArrowBridge()
187
+ try:
188
+ get_arrow_bridge._instance = EnhancedArrowBridge()
189
+ except ArrowBridgeError:
190
+ get_arrow_bridge._instance = None
188
191
 
189
192
  return get_arrow_bridge._instance
190
193
 
@@ -194,7 +197,7 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
194
197
  Convert any dataframe to Polars via Arrow bridge.
195
198
 
196
199
  This is the primary conversion function for the Polars-only architecture.
197
- All operations (expressions, augment, etc.) use this to convert input
200
+ All operations (expressions, synthetic, etc.) use this to convert input
198
201
  dataframes to Polars for processing.
199
202
 
200
203
  Args:
@@ -224,7 +227,7 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
224
227
  )
225
228
 
226
229
  # Fast path: already Polars
227
- if isinstance(df, pl.DataFrame):
230
+ if HAS_POLARS and isinstance(df, pl.DataFrame):
228
231
  return df
229
232
 
230
233
  # Validate input
@@ -240,6 +243,13 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
240
243
  # Convert via Arrow bridge
241
244
  try:
242
245
  bridge = get_arrow_bridge()
246
+ if bridge is None:
247
+ # Fallback: direct conversion for pandas
248
+ if backend_type == "pandas":
249
+ if isinstance(df, pd.DataFrame):
250
+ return pl.from_pandas(df)
251
+ raise RuntimeError("Arrow bridge not available and cannot convert non-pandas DataFrame")
252
+
243
253
  arrow_table = bridge.to_arrow(df, backend_type)
244
254
  pl_df = bridge.from_arrow(arrow_table, "polars")
245
255
  return pl_df
@@ -309,6 +319,12 @@ def from_polars(pl_df: 'pl.DataFrame', target_backend: BackendType) -> Any:
309
319
  # Convert via Arrow bridge
310
320
  try:
311
321
  bridge = get_arrow_bridge()
322
+ if bridge is None:
323
+ # Fallback: direct conversion for pandas
324
+ if target_backend == "pandas":
325
+ return pl_df.to_pandas()
326
+ raise RuntimeError("Arrow bridge not available and cannot convert to non-pandas DataFrame")
327
+
312
328
  arrow_table = bridge.to_arrow(pl_df, "polars")
313
329
  result_df = bridge.from_arrow(arrow_table, target_backend)
314
330
  return result_df
@@ -1,5 +1,5 @@
1
1
  """
2
- Distribution Strategies for Data Augmentation
2
+ Distribution Strategies for Synthetic Data Generation
3
3
 
4
4
  Provides statistical distribution-based data generation:
5
5
  - Normal (Gaussian) distribution
@@ -8,8 +8,8 @@ loaded on-demand using the existing .add file parser.
8
8
  Usage:
9
9
  from additory.common.sample_data import get_sample_dataset
10
10
 
11
- # For augment
12
- df = get_sample_dataset("augment", "sample")
11
+ # For synthetic
12
+ df = get_sample_dataset("synthetic", "sample")
13
13
 
14
14
  # For expressions (future)
15
15
  df = get_sample_dataset("expressions", "sample")
@@ -25,7 +25,7 @@ from additory.common.exceptions import ValidationError
25
25
 
26
26
 
27
27
  def get_sample_dataset(
28
- module: str = "augment",
28
+ module: str = "synthetic",
29
29
  block: str = "sample",
30
30
  dataset_type: str = "clean"
31
31
  ) -> pl.DataFrame:
@@ -33,12 +33,12 @@ def get_sample_dataset(
33
33
  Load a sample dataset from .add files.
34
34
 
35
35
  This function provides centralized access to sample datasets across
36
- all additory modules (augment, expressions, utilities). Sample datasets
36
+ all additory modules (synthetic, expressions, utilities). Sample datasets
37
37
  are stored as .add files in the reference/ directory structure.
38
38
 
39
39
  Args:
40
- module: Module name ("augment", "expressions", "utilities")
41
- block: Block name within the .add file ("sample" for augment)
40
+ module: Module name ("synthetic", "expressions", "utilities")
41
+ block: Block name within the .add file ("sample" for synthetic)
42
42
  dataset_type: Type of sample data ("clean" or "unclean")
43
43
 
44
44
  Returns:
@@ -48,8 +48,8 @@ def get_sample_dataset(
48
48
  ValidationError: If module, block, or dataset_type not found
49
49
 
50
50
  Examples:
51
- >>> # Load augment sample dataset
52
- >>> df = get_sample_dataset("augment", "sample")
51
+ >>> # Load synthetic sample dataset
52
+ >>> df = get_sample_dataset("synthetic", "sample")
53
53
  >>> print(df.shape)
54
54
  (50, 10)
55
55
 
@@ -57,7 +57,7 @@ def get_sample_dataset(
57
57
  >>> df = get_sample_dataset("expressions", "sample", "clean")
58
58
  >>> df_unclean = get_sample_dataset("expressions", "sample", "unclean")
59
59
 
60
- Sample Dataset Structure (augment):
60
+ Sample Dataset Structure (synthetic):
61
61
  - id: Sequential numeric IDs (1-50)
62
62
  - emp_id: Employee IDs with pattern (EMP_001 - EMP_050)
63
63
  - order_id: Order IDs with different padding (ORD_0001 - ORD_0050)
@@ -72,8 +72,8 @@ def get_sample_dataset(
72
72
  # Construct path to .add file
73
73
  base_path = Path(__file__).parent.parent.parent / "reference"
74
74
 
75
- if module == "augment":
76
- add_file_path = base_path / "augment_definitions" / f"{block}_0.1.add"
75
+ if module == "synthetic":
76
+ add_file_path = base_path / "synthetic_definitions" / f"{block}_0.1.add"
77
77
  elif module == "expressions":
78
78
  add_file_path = base_path / "expressions_definitions" / f"{block}_0.1.add"
79
79
  elif module == "utilities":
@@ -81,7 +81,7 @@ def get_sample_dataset(
81
81
  else:
82
82
  raise ValidationError(
83
83
  f"Unknown module '{module}'. "
84
- f"Valid modules: augment, expressions, utilities"
84
+ f"Valid modules: synthetic, expressions, utilities"
85
85
  )
86
86
 
87
87
  # Check if file exists
@@ -141,7 +141,7 @@ def list_available_samples() -> dict:
141
141
  >>> samples = list_available_samples()
142
142
  >>> print(samples)
143
143
  {
144
- 'augment': ['sample'],
144
+ 'synthetic': ['sample'],
145
145
  'expressions': ['sample'],
146
146
  'utilities': []
147
147
  }
@@ -149,15 +149,15 @@ def list_available_samples() -> dict:
149
149
  base_path = Path(__file__).parent.parent.parent / "reference"
150
150
  available = {}
151
151
 
152
- # Check augment
153
- augment_path = base_path / "augment_definitions"
154
- if augment_path.exists():
155
- available['augment'] = [
152
+ # Check synthetic
153
+ synthetic_path = base_path / "synthetic_definitions"
154
+ if synthetic_path.exists():
155
+ available['synthetic'] = [
156
156
  f.stem.rsplit('_', 1)[0] # Remove version suffix
157
- for f in augment_path.glob("*.add")
157
+ for f in synthetic_path.glob("*.add")
158
158
  ]
159
159
  else:
160
- available['augment'] = []
160
+ available['synthetic'] = []
161
161
 
162
162
  # Check expressions
163
163
  expressions_path = base_path / "expressions_definitions"
@@ -16,6 +16,13 @@ try:
16
16
  except ImportError as e:
17
17
  ARROW_AVAILABLE = False
18
18
  IMPORT_ERROR = str(e)
19
+ # Create dummy classes for type annotations
20
+ class pa:
21
+ Table = Any
22
+ class pl:
23
+ DataFrame = Any
24
+ class pd:
25
+ DataFrame = Any
19
26
 
20
27
  from ..logging import log_info, log_warning
21
28
  from .cudf_bridge import get_cudf_bridge
@@ -329,14 +329,14 @@ def set_custom_formula_path(path):
329
329
 
330
330
  # backend preference setting
331
331
 
332
- _backend_preference: str | None = None # "cpu", "gpu", or None
332
+ _backend_preference: Optional[str] = None # "cpu", "gpu", or None
333
333
 
334
- def set_backend_preference(mode: str | None):
334
+ def set_backend_preference(mode: Optional[str]):
335
335
  global _backend_preference
336
336
  if mode not in (None, "cpu", "gpu"):
337
337
  raise ValueError("backend must be 'cpu', 'gpu', or None")
338
338
  _backend_preference = mode
339
339
 
340
- def get_backend_preference() -> str | None:
340
+ def get_backend_preference() -> Optional[str]:
341
341
  return _backend_preference
342
342