additory 0.1.0a3-py3-none-any.whl → 0.1.1a1-py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only.
Files changed (120)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -176
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -304
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/distributions.py +0 -22
  100. additory/synthetic/forecast.py +0 -1132
  101. additory/synthetic/linked_list_parser.py +0 -415
  102. additory/synthetic/namespace_lookup.py +0 -129
  103. additory/synthetic/smote.py +0 -320
  104. additory/synthetic/strategies.py +0 -850
  105. additory/synthetic/synthesizer.py +0 -713
  106. additory/utilities/__init__.py +0 -53
  107. additory/utilities/encoding.py +0 -600
  108. additory/utilities/games.py +0 -300
  109. additory/utilities/keys.py +0 -8
  110. additory/utilities/lookup.py +0 -103
  111. additory/utilities/matchers.py +0 -216
  112. additory/utilities/resolvers.py +0 -286
  113. additory/utilities/settings.py +0 -167
  114. additory/utilities/units.py +0 -749
  115. additory/utilities/validators.py +0 -153
  116. additory-0.1.0a3.dist-info/METADATA +0 -288
  117. additory-0.1.0a3.dist-info/RECORD +0 -71
  118. additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
  119. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  120. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/functions/snapshot/__init__.py
@@ -0,0 +1,82 @@
+ """
+ Snapshot function - Filter and select data.
+
+ This module provides the main snapshot function for filtering and selecting data.
+ """
+
+ import polars as pl
+ from typing import List, Optional
+
+ from additory.common.validation import validate_dataframe, validate_not_empty
+ from additory.common.result import wrap_result
+ from additory.core.backend import detect_backend, to_polars, from_polars
+ from additory.core.logging import Logger
+ from additory.functions.snapshot.filter import apply_filter, select_columns
+
+
+ def snapshot(
+     df,
+     where: Optional[str] = None,
+     columns: Optional[List[str]] = None
+ ):
+     """
+     Filter and select data from DataFrame.
+
+     Args:
+         df: Input DataFrame (Polars, pandas, or cuDF)
+         where: Optional filter expression (SQL-like WHERE clause)
+         columns: Optional list of columns to select
+
+     Returns:
+         DataFrameResult with filtered/selected data
+
+     Examples:
+         >>> # Simple filter
+         >>> result = snapshot(df, where='age > 18')
+
+         >>> # Filter + select
+         >>> result = snapshot(df,
+         ...     where='age > 18 AND status == "active"',
+         ...     columns=['name', 'email', 'age'])
+
+         >>> # Complex conditions
+         >>> result = snapshot(df,
+         ...     where='(age >= 18 AND status == "active") OR country == "USA"')
+     """
+     logger = Logger()
+     logger.info("[snapshot] Starting snapshot() function")
+
+     try:
+         # Validate and convert
+         validate_dataframe(df)
+         validate_not_empty(df)
+         backend = detect_backend(df)
+         polars_df = to_polars(df)
+
+         logger.info(f"Input: {polars_df.shape[0]} rows × {polars_df.shape[1]} columns")
+
+         # Apply filter
+         if where:
+             polars_df = apply_filter(polars_df, where)
+
+         # Select columns
+         if columns:
+             polars_df = select_columns(polars_df, columns)
+
+         # Convert back
+         result = from_polars(polars_df, backend)
+
+         logger.info(f"Output: {polars_df.shape[0]} rows × {polars_df.shape[1]} columns")
+         logger.info("[snapshot] snapshot() function complete")
+
+         # Wrap result
+         return wrap_result(result, 'snapshot', metadata={
+             'where': where,
+             'columns': columns,
+             'rows_filtered': len(polars_df)
+         })
+
+     except Exception as e:
+         logger.error(f"Error in snapshot() function: {e}", error_location="snapshot")
+         raise
+
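For orientation, the filter-and-select that snapshot() performs maps onto an ordinary Polars filter/select chain. A minimal sketch with made-up data (not part of the package) shows the equivalent plain-Polars call; note that each comparison is parenthesized because & binds tighter than > and == in Python.

    import polars as pl

    df = pl.DataFrame({
        "name": ["Ada", "Bob"],
        "email": ["ada@example.com", "bob@example.com"],
        "age": [34, 16],
        "status": ["active", "active"],
    })

    # Plain-Polars equivalent of
    # snapshot(df, where='age > 18 AND status == "active"', columns=['name', 'email', 'age']).
    subset = df.filter(
        (pl.col("age") > 18) & (pl.col("status") == "active")
    ).select(["name", "email", "age"])
    print(subset)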
additory/functions/snapshot/filter.py
@@ -0,0 +1,119 @@
+ """
+ Filter and select data.
+
+ This module provides filtering functionality for the snapshot function.
+ """
+
+ import polars as pl
+ from typing import List, Optional
+
+ from additory.common.validation import validate_dataframe
+ from additory.common.column_selector import select_columns as select_cols
+ from additory.core.logging import Logger
+
+
+ def apply_filter(df: pl.DataFrame, where: str) -> pl.DataFrame:
+     """
+     Apply WHERE clause filter to DataFrame.
+
+     Args:
+         df: Input DataFrame
+         where: Filter expression (SQL-like WHERE clause)
+
+     Returns:
+         Filtered DataFrame
+
+     Example:
+         >>> result = apply_filter(df, 'age > 18 AND status == "active"')
+     """
+     logger = Logger()
+
+     # Validate
+     validate_dataframe(df)
+
+     if not where:
+         return df
+
+     logger.info(f"Applying filter: {where}")
+
+     # Parse and evaluate WHERE clause
+     where_expr = parse_where_clause(df, where)
+     result = df.filter(where_expr)
+
+     logger.info(f"Filter applied: {len(result)} rows remaining")
+
+     return result
+
+
+ def parse_where_clause(df: pl.DataFrame, where: str) -> pl.Expr:
+     """
+     Parse WHERE clause into Polars expression.
+
+     Args:
+         df: Input DataFrame (for column validation)
+         where: Filter expression
+
+     Returns:
+         Polars expression
+
+     Supports:
+         - Comparison: >, <, >=, <=, ==, !=
+         - Logical: AND, OR, NOT
+         - Parentheses for grouping
+
+     Example:
+         >>> expr = parse_where_clause(df, 'age > 18 AND status == "active"')
+     """
+     # Simple implementation - convert to Polars expression
+     # This is a simplified parser - a full implementation would use expressions.parser
+
+     # Replace SQL-style logical operators with Python/Polars operators
+     expr_str = where
+     expr_str = expr_str.replace(' AND ', ' & ')
+     expr_str = expr_str.replace(' OR ', ' | ')
+     expr_str = expr_str.replace(' NOT ', ' ~ ')
+
+     # Evaluate the rewritten string against a namespace of column references.
+     # (A fuller implementation could delegate to expressions.parser or Polars' SQL support.)
+     try:
+         # Build namespace with column references
+         namespace = {col: pl.col(col) for col in df.columns}
+         namespace['pl'] = pl
+
+         # Evaluate the expression
+         result = eval(expr_str, namespace)
+         return result
+     except Exception as e:
+         raise ValueError(f"Invalid WHERE clause: {where}. Error: {e}")
+
+
+ def select_columns(df: pl.DataFrame, columns: List[str]) -> pl.DataFrame:
+     """
+     Select specified columns from DataFrame.
+
+     Args:
+         df: Input DataFrame
+         columns: List of column names to select
+
+     Returns:
+         DataFrame with selected columns
+
+     Example:
+         >>> result = select_columns(df, ['name', 'age', 'email'])
+     """
+     logger = Logger()
+
+     # Validate
+     validate_dataframe(df)
+
+     if not columns:
+         return df
+
+     logger.info(f"Selecting {len(columns)} columns")
+
+     # Use Polars select directly
+     result = df.select(columns)
+
+     logger.info("Column selection complete")
+
+     return result
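For reference, the comment in parse_where_clause points at Polars' SQL support as a fuller alternative to the string-rewrite-plus-eval approach used here. A minimal sketch of that route, assuming a Polars version that ships pl.SQLContext; the data and table name below are made up and are not part of the package:

    import polars as pl

    df = pl.DataFrame({
        "name": ["Ada", "Bob", "Cho"],
        "age": [22, 15, 31],
        "status": ["active", "active", "inactive"],
    })

    # Register the frame under a table name and let the SQL engine parse the
    # WHERE clause (SQL uses a single '=' and single-quoted strings).
    ctx = pl.SQLContext(people=df)
    filtered = ctx.execute(
        "SELECT name, age FROM people WHERE age > 18 AND status = 'active'",
        eager=True,
    )
    print(filtered)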
additory/functions/synthetic/__init__.py
@@ -0,0 +1,113 @@
+ """
+ Synthetic function - Generate synthetic data.
+
+ This module provides the main synthetic function for generating synthetic data
+ in three modes: augment, create, and preset.
+ """
+
+ import polars as pl
+ from typing import Dict, Optional, Union
+
+ from additory.common.validation import validate_dataframe, validate_not_empty
+ from additory.common.result import wrap_result
+ from additory.core.backend import detect_backend, to_polars, from_polars
+ from additory.core.logging import Logger
+ from additory.functions.synthetic.mode_detector import detect_mode
+ from additory.functions.synthetic.strategies.generative import generate_data
+ from additory.functions.synthetic.strategies.augmentative import augment_data
+ from additory.functions.synthetic.strategies.presets import apply_preset
+
+
+ def synthetic(
+     df_or_mode,
+     n_rows: Optional[Union[int, str]] = None,
+     strategy: Optional[Dict] = None,
+     preset: Optional[str] = None,
+     **kwargs
+ ):
+     """
+     Generate synthetic data.
+
+     Args:
+         df_or_mode: DataFrame (augment mode) or '@new' (create mode)
+         n_rows: Number of rows to generate
+         strategy: Strategy dictionary
+         preset: Preset name
+         **kwargs: Mode-specific parameters
+
+     Returns:
+         DataFrameResult with synthetic data
+
+     Modes:
+         - augment: Add synthetic rows to existing data
+         - create: Create synthetic data from scratch
+         - preset: Use preset configuration
+
+     Examples:
+         >>> # Augment mode
+         >>> result = synthetic(df, n_rows=100)
+         >>> result = synthetic(df, n_rows="50%")
+
+         >>> # Create mode
+         >>> result = synthetic('@new', n_rows=100, strategy={
+         ...     'id': 'increment:start=1',
+         ...     'age': 'range:18-65',
+         ...     'status': 'choice:[Active,Inactive,Pending]'
+         ... })
+
+         >>> # Preset mode
+         >>> result = synthetic('@new', n_rows=100, preset='users')
+     """
+     logger = Logger()
+     logger.info("[synthetic] Starting synthetic() function")
+
+     try:
+         # Detect mode
+         mode = detect_mode(df_or_mode, strategy, preset)
+         logger.info(f"Mode detected: {mode}")
+
+         if mode == 'augment':
+             # Validate and convert
+             validate_dataframe(df_or_mode)
+             validate_not_empty(df_or_mode)
+             backend = detect_backend(df_or_mode)
+             polars_df = to_polars(df_or_mode)
+
+             # Augment
+             result = augment_data(polars_df, n_rows, strategy)
+
+         elif mode == 'create':
+             # Create from scratch
+             if not n_rows:
+                 raise ValueError("n_rows is required for create mode")
+
+             result = generate_data(n_rows, strategy)
+             backend = 'polars'
+
+         elif mode == 'preset':
+             # Use preset
+             if not n_rows:
+                 raise ValueError("n_rows is required for preset mode")
+
+             result = apply_preset(preset, n_rows)
+             backend = 'polars'
+
+         else:
+             raise ValueError(f"Unknown mode: {mode}")
+
+         # Convert back
+         result = from_polars(result, backend)
+
+         logger.info(f"Output: {len(result)} rows × {len(result.columns) if hasattr(result, 'columns') else 'N/A'} columns")
+         logger.info("[synthetic] synthetic() function complete")
+
+         # Wrap result
+         return wrap_result(result, 'synthetic', metadata={
+             'mode': mode,
+             'rows_generated': n_rows
+         })
+
+     except Exception as e:
+         logger.error(f"Error in synthetic() function: {e}", error_location="synthetic")
+         raise
+
additory/functions/synthetic/mode_detector.py
@@ -0,0 +1,47 @@
+ """
+ Mode detection for synthetic data generation.
+
+ This module detects which mode to use: augment, create, or preset.
+ """
+
+ from typing import Optional, Dict
+ from additory.common.validation import is_dataframe
+
+
+ def detect_mode(df_or_mode, strategy: Optional[Dict], preset: Optional[str]) -> str:
+     """
+     Detect synthetic data generation mode.
+
+     Args:
+         df_or_mode: DataFrame (augment mode) or '@new' (create mode)
+         strategy: Strategy dictionary
+         preset: Preset name
+
+     Returns:
+         Mode string ('augment', 'create', 'preset')
+
+     Raises:
+         ValueError: If parameters are invalid
+
+     Example:
+         >>> mode = detect_mode(df, None, None)  # Returns: 'augment'
+         >>> mode = detect_mode('@new', strategy, None)  # Returns: 'create'
+         >>> mode = detect_mode('@new', None, 'users')  # Returns: 'preset'
+     """
+     # Preset mode takes precedence
+     if preset is not None:
+         return 'preset'
+
+     # Check if it's a DataFrame first (augment mode)
+     elif is_dataframe(df_or_mode):
+         return 'augment'
+
+     # Create mode
+     elif df_or_mode == '@new':
+         return 'create'
+
+     else:
+         raise ValueError(
+             "Invalid parameters for synthetic(). "
+             "First parameter must be a DataFrame or '@new'"
+         )
additory/functions/synthetic/strategies/__init__.py
@@ -0,0 +1 @@
+ # Synthetic strategies module
additory/functions/synthetic/strategies/advanced.py
@@ -0,0 +1,35 @@
+ """
+ Advanced synthetic data generation strategies.
+
+ This module provides advanced strategies like SMOTE, correlations, etc.
+ (Placeholder for future implementation)
+ """
+
+ import polars as pl
+ from typing import Dict
+
+ from additory.core.logging import Logger
+
+
+ def apply_advanced_strategy(df: pl.DataFrame, strategy: Dict) -> pl.DataFrame:
+     """
+     Apply advanced synthetic data strategy.
+
+     Args:
+         df: Input DataFrame
+         strategy: Advanced strategy configuration
+
+     Returns:
+         DataFrame with synthetic data
+
+     Note:
+         This is a placeholder for future advanced strategies like:
+         - SMOTE (Synthetic Minority Over-sampling Technique)
+         - Conditional generation
+         - Time series forecasting
+         - Correlation preservation
+     """
+     logger = Logger()
+     logger.warning("Advanced strategies not yet implemented")
+
+     return df
additory/functions/synthetic/strategies/augmentative.py
@@ -0,0 +1,160 @@
+ """
+ Augment existing data with synthetic rows.
+
+ This module provides functionality to add synthetic rows to existing data.
+ """
+
+ import polars as pl
+ from typing import Dict, Optional, Union
+ import numpy as np
+
+ from additory.common.validation import validate_dataframe, validate_not_empty
+ from additory.common.distributions import generate_normal, generate_uniform
+ from additory.core.logging import Logger
+
+
+ def augment_data(
+     df: pl.DataFrame,
+     n_rows: Union[int, str],
+     strategy: Optional[Dict] = None
+ ) -> pl.DataFrame:
+     """
+     Add synthetic rows to existing data.
+
+     Args:
+         df: Input DataFrame
+         n_rows: Number of rows (int or percentage string like '50%')
+         strategy: Optional augmentation strategy
+
+     Returns:
+         DataFrame with original + synthetic rows
+
+     Example:
+         >>> result = augment_data(df, n_rows=100)  # Add 100 rows
+         >>> result = augment_data(df, n_rows="50%")  # Add 50% more rows
+     """
+     logger = Logger()
+
+     # Validate
+     validate_dataframe(df)
+     validate_not_empty(df)
+
+     # Parse n_rows
+     n_synthetic = parse_n_rows(n_rows, len(df))
+
+     logger.info(f"Augmenting {len(df)} rows with {n_synthetic} synthetic rows")
+
+     # Generate synthetic rows
+     synthetic_df = generate_synthetic_rows(df, n_synthetic, strategy)
+
+     # Concatenate
+     result = pl.concat([df, synthetic_df])
+
+     logger.info(f"Augmentation complete: {len(result)} total rows")
+
+     return result
+
+
+ def parse_n_rows(n_rows: Union[int, str], df_len: int) -> int:
+     """
+     Parse n_rows parameter.
+
+     Args:
+         n_rows: Number of rows (int or percentage string)
+         df_len: Length of DataFrame
+
+     Returns:
+         Number of rows to generate
+
+     Example:
+         >>> parse_n_rows(100, 1000)
+         100
+         >>> parse_n_rows("50%", 1000)
+         500
+     """
+     if isinstance(n_rows, int):
+         return n_rows
+
+     elif isinstance(n_rows, str) and n_rows.endswith('%'):
+         percentage = float(n_rows.rstrip('%'))
+         return int(df_len * percentage / 100)
+
+     else:
+         raise ValueError(f"Invalid n_rows: {n_rows}. Must be int or percentage string")
+
+
+ def generate_synthetic_rows(
+     df: pl.DataFrame,
+     n_rows: int,
+     strategy: Optional[Dict]
+ ) -> pl.DataFrame:
+     """
+     Generate synthetic rows matching existing data.
+
+     Args:
+         df: Input DataFrame
+         n_rows: Number of rows to generate
+         strategy: Optional strategy
+
+     Returns:
+         DataFrame with synthetic rows
+     """
+     logger = Logger()
+
+     # Generate each column
+     columns = {}
+
+     for col in df.columns:
+         logger.info(f"Generating synthetic column: {col}")
+
+         # Get column dtype
+         dtype = df[col].dtype
+
+         # Generate based on dtype
+         if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64,
+                      pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64]:
+             # Integer column - match distribution
+             col_data = df[col].drop_nulls()
+             if len(col_data) > 0:
+                 mean = float(col_data.mean())
+                 std = float(col_data.std())
+                 synthetic = generate_normal(n_rows, mean, std).cast(dtype)
+                 columns[col] = synthetic
+             else:
+                 columns[col] = pl.Series([None] * n_rows, dtype=dtype)
+
+         elif dtype in [pl.Float32, pl.Float64]:
+             # Float column - match distribution
+             col_data = df[col].drop_nulls()
+             if len(col_data) > 0:
+                 mean = float(col_data.mean())
+                 std = float(col_data.std())
+                 columns[col] = generate_normal(n_rows, mean, std)
+             else:
+                 columns[col] = pl.Series([None] * n_rows, dtype=dtype)
+
+         elif dtype == pl.Utf8:
+             # String column - sample from existing values
+             col_data = df[col].drop_nulls()
+             if len(col_data) > 0:
+                 values = col_data.to_list()
+                 synthetic = np.random.choice(values, n_rows)
+                 columns[col] = pl.Series(synthetic)
+             else:
+                 columns[col] = pl.Series([None] * n_rows, dtype=dtype)
+
+         elif dtype == pl.Boolean:
+             # Boolean column - match distribution
+             col_data = df[col].drop_nulls()
+             if len(col_data) > 0:
+                 true_ratio = float(col_data.sum()) / len(col_data)
+                 synthetic = np.random.random(n_rows) < true_ratio
+                 columns[col] = pl.Series(synthetic)
+             else:
+                 columns[col] = pl.Series([None] * n_rows, dtype=dtype)
+
+         else:
+             # Other types - just use nulls
+             columns[col] = pl.Series([None] * n_rows, dtype=dtype)
+
+     return pl.DataFrame(columns)
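To make the per-column behaviour of generate_synthetic_rows concrete: numeric columns appear to be drawn from a normal distribution matched to the observed mean and standard deviation (via generate_normal), and string columns are sampled from the observed values. A self-contained sketch of the same idea using numpy directly; generate_normal from additory.common.distributions is not reproduced here, and the data are made up:

    import numpy as np
    import polars as pl

    rng = np.random.default_rng(0)
    df = pl.DataFrame({
        "age": [23, 31, 44, 29, 51, 38],
        "status": ["active", "active", "inactive", "active", "inactive", "active"],
    })
    n_rows = 4

    # Numeric column: draw from a normal matched to the observed mean/std, cast back.
    mean, std = float(df["age"].mean()), float(df["age"].std())
    age_synth = pl.Series("age", rng.normal(mean, std, n_rows)).cast(pl.Int64)

    # String column: sample (with replacement) from the observed values.
    status_synth = pl.Series("status", rng.choice(df["status"].to_list(), n_rows))

    synthetic = pl.DataFrame([age_synth, status_synth])
    print(pl.concat([df, synthetic]))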