FlowerPower 0.11.6.20 (py3-none-any.whl) → 0.20.0 (py3-none-any.whl)

This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (80)
  1. flowerpower/cfg/__init__.py +3 -3
  2. flowerpower/cfg/pipeline/__init__.py +5 -3
  3. flowerpower/cfg/project/__init__.py +3 -3
  4. flowerpower/cfg/project/job_queue.py +1 -128
  5. flowerpower/cli/__init__.py +5 -5
  6. flowerpower/cli/cfg.py +0 -3
  7. flowerpower/cli/job_queue.py +400 -132
  8. flowerpower/cli/pipeline.py +14 -413
  9. flowerpower/cli/utils.py +0 -1
  10. flowerpower/flowerpower.py +537 -28
  11. flowerpower/job_queue/__init__.py +5 -94
  12. flowerpower/job_queue/base.py +201 -3
  13. flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -3
  14. flowerpower/job_queue/rq/manager.py +388 -77
  15. flowerpower/pipeline/__init__.py +2 -0
  16. flowerpower/pipeline/base.py +2 -2
  17. flowerpower/pipeline/io.py +14 -16
  18. flowerpower/pipeline/manager.py +21 -642
  19. flowerpower/pipeline/pipeline.py +571 -0
  20. flowerpower/pipeline/registry.py +242 -10
  21. flowerpower/pipeline/visualizer.py +1 -2
  22. flowerpower/plugins/_io/__init__.py +8 -0
  23. flowerpower/plugins/mqtt/manager.py +6 -6
  24. flowerpower/settings/backend.py +0 -2
  25. flowerpower/settings/job_queue.py +1 -57
  26. flowerpower/utils/misc.py +0 -256
  27. flowerpower/utils/monkey.py +1 -83
  28. {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/METADATA +308 -152
  29. flowerpower-0.20.0.dist-info/RECORD +58 -0
  30. flowerpower/fs/__init__.py +0 -29
  31. flowerpower/fs/base.py +0 -662
  32. flowerpower/fs/ext.py +0 -2143
  33. flowerpower/fs/storage_options.py +0 -1420
  34. flowerpower/job_queue/apscheduler/__init__.py +0 -11
  35. flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -110
  36. flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -93
  37. flowerpower/job_queue/apscheduler/manager.py +0 -1051
  38. flowerpower/job_queue/apscheduler/setup.py +0 -554
  39. flowerpower/job_queue/apscheduler/trigger.py +0 -169
  40. flowerpower/job_queue/apscheduler/utils.py +0 -311
  41. flowerpower/pipeline/job_queue.py +0 -583
  42. flowerpower/pipeline/runner.py +0 -603
  43. flowerpower/plugins/io/base.py +0 -2520
  44. flowerpower/plugins/io/helpers/datetime.py +0 -298
  45. flowerpower/plugins/io/helpers/polars.py +0 -875
  46. flowerpower/plugins/io/helpers/pyarrow.py +0 -570
  47. flowerpower/plugins/io/helpers/sql.py +0 -202
  48. flowerpower/plugins/io/loader/__init__.py +0 -28
  49. flowerpower/plugins/io/loader/csv.py +0 -37
  50. flowerpower/plugins/io/loader/deltatable.py +0 -190
  51. flowerpower/plugins/io/loader/duckdb.py +0 -19
  52. flowerpower/plugins/io/loader/json.py +0 -37
  53. flowerpower/plugins/io/loader/mqtt.py +0 -159
  54. flowerpower/plugins/io/loader/mssql.py +0 -26
  55. flowerpower/plugins/io/loader/mysql.py +0 -26
  56. flowerpower/plugins/io/loader/oracle.py +0 -26
  57. flowerpower/plugins/io/loader/parquet.py +0 -35
  58. flowerpower/plugins/io/loader/postgres.py +0 -26
  59. flowerpower/plugins/io/loader/pydala.py +0 -19
  60. flowerpower/plugins/io/loader/sqlite.py +0 -23
  61. flowerpower/plugins/io/metadata.py +0 -244
  62. flowerpower/plugins/io/saver/__init__.py +0 -28
  63. flowerpower/plugins/io/saver/csv.py +0 -36
  64. flowerpower/plugins/io/saver/deltatable.py +0 -186
  65. flowerpower/plugins/io/saver/duckdb.py +0 -19
  66. flowerpower/plugins/io/saver/json.py +0 -36
  67. flowerpower/plugins/io/saver/mqtt.py +0 -28
  68. flowerpower/plugins/io/saver/mssql.py +0 -26
  69. flowerpower/plugins/io/saver/mysql.py +0 -26
  70. flowerpower/plugins/io/saver/oracle.py +0 -26
  71. flowerpower/plugins/io/saver/parquet.py +0 -36
  72. flowerpower/plugins/io/saver/postgres.py +0 -26
  73. flowerpower/plugins/io/saver/pydala.py +0 -20
  74. flowerpower/plugins/io/saver/sqlite.py +0 -24
  75. flowerpower/utils/scheduler.py +0 -311
  76. flowerpower-0.11.6.20.dist-info/RECORD +0 -102
  77. {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/WHEEL +0 -0
  78. {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/entry_points.txt +0 -0
  79. {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/licenses/LICENSE +0 -0
  80. {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/top_level.txt +0 -0
flowerpower/plugins/io/helpers/pyarrow.py (deleted)
@@ -1,570 +0,0 @@
- import concurrent.futures
-
- import numpy as np
- import polars as pl
- import pyarrow as pa
- import pyarrow.compute as pc
-
- # Pre-compiled regex patterns (identical to original)
- INTEGER_REGEX = r"^[-+]?\d+$"
- FLOAT_REGEX = r"^[-+]?(?:\d*[.,])?\d+(?:[eE][-+]?\d+)?$"
- BOOLEAN_REGEX = r"^(true|false|1|0|yes|ja|no|nein|t|f|y|j|n|ok|nok)$"
- BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j|ok)$"
- DATETIME_REGEX = (
-     r"^("
-     r"\d{4}-\d{2}-\d{2}"  # ISO: 2023-12-31
-     r"|"
-     r"\d{2}/\d{2}/\d{4}"  # US: 12/31/2023
-     r"|"
-     r"\d{2}\.\d{2}\.\d{4}"  # German: 31.12.2023
-     r"|"
-     r"\d{8}"  # Compact: 20231231
-     r")"
-     r"([ T]\d{2}:\d{2}(:\d{2}(\.\d{1,6})?)?)?"  # Optional time: 23:59[:59[.123456]]
-     r"([+-]\d{2}:?\d{2}|Z)?"  # Optional timezone: +01:00, -0500, Z
-     r"$"
- )
-
- # Float32 range limits
- F32_MIN = float(np.finfo(np.float32).min)
- F32_MAX = float(np.finfo(np.float32).max)
-
-
- def dominant_timezone_per_column(
-     schemas: list[pa.Schema],
- ) -> dict[str, tuple[str | None, str | None]]:
-     """
-     For each timestamp column (by name) across all schemas, detect the most frequent timezone (including None).
-     If None and a timezone are tied, prefer the timezone.
-     Returns a dict: {column_name: dominant_timezone}
-     """
-     from collections import Counter, defaultdict
-
-     tz_counts = defaultdict(Counter)
-     units = {}
-
-     for schema in schemas:
-         for field in schema:
-             if pa.types.is_timestamp(field.type):
-                 tz = field.type.tz
-                 name = field.name
-                 tz_counts[name][tz] += 1
-                 # Track unit for each column (assume consistent)
-                 if name not in units:
-                     units[name] = field.type.unit
-
-     dominant = {}
-     for name, counter in tz_counts.items():
-         most_common = counter.most_common()
-         if not most_common:
-             continue
-         top_count = most_common[0][1]
-         # Find all with top_count
-         top_tzs = [tz for tz, cnt in most_common if cnt == top_count]
-         # If tie and one is not None, prefer not-None
-         if len(top_tzs) > 1 and any(tz is not None for tz in top_tzs):
-             tz = next(tz for tz in top_tzs if tz is not None)
-         else:
-             tz = most_common[0][0]
-         dominant[name] = (units[name], tz)
-     return dominant
-
-
- def standardize_schema_timezones_by_majority(
-     schemas: list[pa.Schema],
- ) -> list[pa.Schema]:
-     """
-     For each timestamp column (by name) across all schemas, set the timezone to the most frequent (with tie-breaking).
-     Returns a new list of schemas with updated timestamp timezones.
-     """
-     dom = dominant_timezone_per_column(schemas)
-     new_schemas = []
-     for schema in schemas:
-         fields = []
-         for field in schema:
-             if pa.types.is_timestamp(field.type) and field.name in dom:
-                 unit, tz = dom[field.name]
-                 fields.append(
-                     pa.field(
-                         field.name,
-                         pa.timestamp(unit, tz),
-                         field.nullable,
-                         field.metadata,
-                     )
-                 )
-             else:
-                 fields.append(field)
-         new_schemas.append(pa.schema(fields, schema.metadata))
-     return new_schemas
-
-
- def standardize_schema_timezones(
-     schemas: list[pa.Schema], timezone: str | None = None
- ) -> list[pa.Schema]:
-     """
-     Standardize timezone info for all timestamp columns in a list of PyArrow schemas.
-
-     Args:
-         schemas (list of pa.Schema): List of PyArrow schemas.
-         timezone (str or None): If None, remove timezone from all timestamp columns.
-             If str, set this timezone for all timestamp columns.
-             If "auto", use the most frequent timezone across schemas.
-
-     Returns:
-         list of pa.Schema: New schemas with standardized timezone info.
-     """
-     if timezone == "auto":
-         # Use the most frequent timezone for each column
-         return standardize_schema_timezones_by_majority(schemas)
-     new_schemas = []
-     for schema in schemas:
-         fields = []
-         for field in schema:
-             if pa.types.is_timestamp(field.type):
-                 fields.append(
-                     pa.field(
-                         field.name,
-                         pa.timestamp(field.type.unit, timezone),
-                         field.nullable,
-                         field.metadata,
-                     )
-                 )
-             else:
-                 fields.append(field)
-         new_schemas.append(pa.schema(fields, schema.metadata))
-     return new_schemas
-
-
- def unify_schemas(
-     schemas: list[pa.Schema],
-     use_large_dtypes: bool = False,
-     timezone: str | None = None,
-     standardize_timezones: bool = True,
- ) -> pa.Schema:
-     """
-     Unify a list of PyArrow schemas into a single schema.
-
-     Args:
-         schemas (list[pa.Schema]): List of PyArrow schemas to unify.
-         use_large_dtypes (bool): If True, keep large types like large_string.
-         timezone (str | None): If specified, standardize all timestamp columns to this timezone.
-             If "auto", use the most frequent timezone across schemas.
-             If None, remove timezone from all timestamp columns.
-         standardize_timezones (bool): If True, standardize all timestamp columns to the most frequent timezone.
-
-     Returns:
-         pa.Schema: A unified PyArrow schema.
-     """
-     if standardize_timezones:
-         schemas = standardize_schema_timezones(schemas, timezone)
-     try:
-         return pa.unify_schemas(schemas, promote_options="permissive")
-     except (pa.lib.ArrowInvalid, pa.lib.ArrowTypeError) as e:
-         _ = e.args[0]
-         # If unify_schemas fails, we can try to create a schema with empty tables
-         schema = (
-             pl.concat(
-                 [
-                     # pl.from_arrow(pa.Table.from_pylist([], schema=schema))
-                     pl.from_arrow(schema.empty_table())
-                     for schema in schemas
-                 ],
-                 how="diagonal_relaxed",
-             )
-             .to_arrow()
-             .schema
-         )
-         if not use_large_dtypes:
-             return convert_large_types_to_normal(schema)
-         return schema
-
-
- def cast_schema(table: pa.Table, schema: pa.Schema) -> pa.Table:
-     """
-     Cast a PyArrow table to a given schema, updating the schema to match the table's columns.
-
-     Args:
-         table (pa.Table): The PyArrow table to cast.
-         schema (pa.Schema): The target schema to cast the table to.
-
-     Returns:
-         pa.Table: A new PyArrow table with the specified schema.
-     """
-     # Filter schema fields to only those present in the table
-     table_columns = set(table.schema.names)
-     filtered_fields = [field for field in schema if field.name in table_columns]
-     updated_schema = pa.schema(filtered_fields)
-     return table.select(updated_schema.names).cast(updated_schema)
-
-
- def convert_large_types_to_normal(schema: pa.Schema) -> pa.Schema:
-     """
-     Convert large types in a PyArrow schema to their standard types.
-
-     Args:
-         schema (pa.Schema): The PyArrow schema to convert.
-
-     Returns:
-         pa.Schema: A new PyArrow schema with large types converted to standard types.
-     """
-     # Define mapping of large types to standard types
-     type_mapping = {
-         pa.large_string(): pa.string(),
-         pa.large_binary(): pa.binary(),
-         pa.large_utf8(): pa.utf8(),
-         pa.large_list(pa.null()): pa.list_(pa.null()),
-         pa.large_list_view(pa.null()): pa.list_view(pa.null()),
-     }
-     # Convert fields
-     new_fields = []
-     for field in schema:
-         field_type = field.type
-         # Check if type exists in mapping
-         if field_type in type_mapping:
-             new_field = pa.field(
-                 name=field.name,
-                 type=type_mapping[field_type],
-                 nullable=field.nullable,
-                 metadata=field.metadata,
-             )
-             new_fields.append(new_field)
-         # Handle large lists with nested types
-         elif isinstance(field_type, pa.LargeListType):
-             new_field = pa.field(
-                 name=field.name,
-                 type=pa.list_(
-                     type_mapping[field_type.value_type]
-                     if field_type.value_type in type_mapping
-                     else field_type.value_type
-                 ),
-                 nullable=field.nullable,
-                 metadata=field.metadata,
-             )
-             new_fields.append(new_field)
-         # Handle dictionary with large_string, large_utf8, or large_binary values
-         elif isinstance(field_type, pa.DictionaryType):
-             new_field = pa.field(
-                 name=field.name,
-                 type=pa.dictionary(
-                     field_type.index_type,
-                     type_mapping[field_type.value_type]
-                     if field_type.value_type in type_mapping
-                     else field_type.value_type,
-                     field_type.ordered,
-                 ),
-                 # nullable=field.nullable,
-                 metadata=field.metadata,
-             )
-             new_fields.append(new_field)
-         else:
-             new_fields.append(field)
-
-     return pa.schema(new_fields)
-
-
- def _clean_string_array(array: pa.Array) -> pa.DataType:
-     """
-     Clean string values in a PyArrow array using vectorized operations.
-     Returns the optimal dtype after cleaning.
-     """
-     if len(array) == 0 or array.null_count == len(array):
-         return array.type
-
-     # Trim whitespace using compute functions
-     trimmed = pc.utf8_trim_whitespace(array)
-
-     # Create mask for values to convert to null
-     empty_mask = pc.equal(trimmed, "")
-     dash_mask = pc.equal(trimmed, "-")
-     none_mask = pc.or_(
-         pc.equal(trimmed, "None"),
-         pc.equal(trimmed, "none"),
-         pc.equal(trimmed, "NONE"),
-         pc.equal(trimmed, "Nan"),
-         pc.equal(trimmed, "N/A"),
-         pc.equal(trimmed, "n/a"),
-         pc.equal(trimmed, "NaN"),
-         pc.equal(trimmed, "nan"),
-         pc.equal(trimmed, "NAN"),
-         pc.equal(trimmed, "Null"),
-         pc.equal(trimmed, "NULL"),
-         pc.equal(trimmed, "null"),
-     )
-
-     null_mask = pc.or_(pc.or_(empty_mask, dash_mask), none_mask)
-
-     # If all values are null after cleaning, return null type
-     if pc.all(null_mask).as_py():
-         return pa.null()
-
-     return array.type  # Default: keep string type if not all null
-
-
- def _can_downcast_to_float32(array: pa.Array) -> bool:
-     """
-     Check if float values are within Float32 range using vectorized operations.
-     """
-     if len(array) == 0 or array.null_count == len(array):
-         return True
-
-     is_finite = pc.is_finite(array)
-     if not pc.any(is_finite).as_py():
-         return True
-
-     finite_array = pc.filter(array, is_finite)
-     min_val = pc.min(finite_array).as_py()
-     max_val = pc.max(finite_array).as_py()
-
-     return F32_MIN <= min_val <= max_val <= F32_MAX
-
-
- def _get_optimal_int_type(
-     array: pa.Array, allow_unsigned: bool, allow_null: bool = True
- ) -> pa.DataType:
-     """
-     Determine the most efficient integer type based on data range.
-     """
-     if len(array) == 0 or array.null_count == len(array):
-         if allow_null:
-             return pa.null()
-         else:
-             # If all values are null and allow_null is False, default to int8
-             return pa.int8()
-
-     min_max = pc.min_max(array)
-     min_val = min_max["min"].as_py()
-     max_val = min_max["max"].as_py()
-
-     if allow_unsigned and min_val >= 0:
-         if max_val <= 255:
-             return pa.uint8()
-         elif max_val <= 65535:
-             return pa.uint16()
-         elif max_val <= 4294967295:
-             return pa.uint32()
-         else:
-             return pa.uint64()
-     else:
-         if -128 <= min_val and max_val <= 127:
-             return pa.int8()
-         elif -32768 <= min_val and max_val <= 32767:
-             return pa.int16()
-         elif -2147483648 <= min_val and max_val <= 2147483647:
-             return pa.int32()
-         else:
-             return pa.int64()
-
-
- def _optimize_numeric_array(
-     array: pa.Array, shrink: bool, allow_unsigned: bool = True, allow_null: bool = True
- ) -> pa.DataType:
-     """
-     Optimize numeric PyArrow array by downcasting when possible.
-     Returns the optimal dtype.
-     """
-
-     if not shrink or len(array) == 0 or array.null_count == len(array):
-         if allow_null:
-             return pa.null()
-         else:
-             return array.type
-
-     if pa.types.is_floating(array.type):
-         if array.type == pa.float64() and _can_downcast_to_float32(array):
-             return pa.float32()
-         return array.type
-
-     if pa.types.is_integer(array.type):
-         return _get_optimal_int_type(array, allow_unsigned, allow_null)
-
-     return array.type
-
-
- def _all_match_regex(array: pa.Array, pattern: str) -> bool:
-     """
-     Check if all non-null values in array match regex pattern.
-     """
-     if len(array) == 0 or array.null_count == len(array):
-         return False
-     return pc.all(pc.match_substring_regex(array, pattern, ignore_case=True)).as_py()
-
-
- def _optimize_string_array(
-     array: pa.Array,
-     col_name: str,
-     shrink_numerics: bool,
-     time_zone: str | None = None,
-     allow_unsigned: bool = True,
-     allow_null: bool = True,
- ) -> pa.DataType:
-     """
-     Convert string PyArrow array to appropriate type based on content analysis.
-     Returns the optimal dtype.
-     """
-     if len(array) == 0 or array.null_count == len(array):
-         if allow_null:
-             return pa.null()
-         else:
-             return array.type
-
-     cleaned_array = _clean_string_array(
-         array, allow_null
-     )  # pc.utf8_trim_whitespace(array)
-
-     try:
-         if _all_match_regex(cleaned_array, BOOLEAN_REGEX):
-             return pa.bool_()
-         elif _all_match_regex(cleaned_array, INTEGER_REGEX):
-             int_array = pc.cast(
-                 pc.replace_substring(cleaned_array, ",", "."), pa.int64()
-             )
-             return _optimize_numeric_array(
-                 int_array, allow_unsigned=allow_unsigned, allow_null=allow_null
-             )
-         elif _all_match_regex(cleaned_array, FLOAT_REGEX):
-             float_array = pc.cast(
-                 pc.replace_substring(cleaned_array, ",", "."), pa.float64()
-             )
-             return _optimize_numeric_array(
-                 float_array,
-                 shrink_numerics,
-                 allow_unsigned=allow_unsigned,
-                 allow_null=allow_null,
-             )
-         elif _all_match_regex(cleaned_array, DATETIME_REGEX):
-             pl_series = pl.Series(col_name, cleaned_array)
-             converted = pl_series.str.to_datetime(
-                 strict=False, time_unit="us", time_zone=time_zone
-             )
-             # Get the Arrow dtype from Polars
-             arrow_dtype = converted.to_arrow().type
-             return arrow_dtype
-     except Exception:
-         return pa.string()
-
-     return pa.string()
-
-
- def _process_column(
-     # table: pa.Table,
-     # col_name: str,
-     array: pa.Array,
-     col_name: str,
-     shrink_numerics: bool,
-     allow_unsigned: bool,
-     time_zone: str | None = None,
- ) -> pa.Field:
-     """
-     Process a single column for type optimization.
-     Returns a pyarrow.Field with the optimal dtype.
-     """
-     # array = table[col_name]
-     if array.null_count == len(array):
-         return pa.field(col_name, pa.null())
-
-     if pa.types.is_floating(array.type) or pa.types.is_integer(array.type):
-         dtype = _optimize_numeric_array(array, shrink_numerics, allow_unsigned)
-         return pa.field(col_name, dtype, nullable=array.null_count > 0)
-     elif pa.types.is_string(array.type):
-         dtype = _optimize_string_array(array, col_name, shrink_numerics, time_zone)
-         return pa.field(col_name, dtype, nullable=array.null_count > 0)
-
-     return pa.field(col_name, array.type, nullable=array.null_count > 0)
-
-
- def _process_column_for_opt_dtype(args):
-     (
-         array,
-         col_name,
-         cols_to_process,
-         shrink_numerics,
-         allow_unsigned,
-         time_zone,
-         strict,
-         allow_null,
-     ) = args
-     try:
-         if col_name in cols_to_process:
-             field = _process_column(
-                 array, col_name, shrink_numerics, allow_unsigned, time_zone
-             )
-             if pa.types.is_null(field.type):
-                 if allow_null:
-                     array = pa.nulls(array.length(), type=pa.null())
-                     return (col_name, field, array)
-                 else:
-                     orig_type = array.type
-                     # array = table[col_name]
-                     field = pa.field(col_name, orig_type, nullable=True)
-                     return (col_name, field, array)
-             else:
-                 array = array.cast(field.type)
-                 return (col_name, field, array)
-         else:
-             field = pa.field(col_name, array.type, nullable=True)
-             # array = table[col_name]
-             return (col_name, field, array)
-     except Exception as e:
-         if strict:
-             raise e
-         field = pa.field(col_name, array.type, nullable=True)
-         return (col_name, field, array)
-
-
- def opt_dtype(
-     table: pa.Table,
-     include: str | list[str] | None = None,
-     exclude: str | list[str] | None = None,
-     time_zone: str | None = None,
-     shrink_numerics: bool = True,
-     allow_unsigned: bool = True,
-     use_large_dtypes: bool = False,
-     strict: bool = False,
-     allow_null: bool = True,
- ) -> pa.Table:
-     """
-     Optimize data types of a PyArrow Table for performance and memory efficiency.
-     Returns a new table casted to the optimal schema.
-
-     Args:
-         allow_null (bool): If False, columns that only hold null-like values will not be converted to pyarrow.null().
-     """
-     if isinstance(include, str):
-         include = [include]
-     if isinstance(exclude, str):
-         exclude = [exclude]
-
-     cols_to_process = table.column_names
-     if include:
-         cols_to_process = [col for col in include if col in table.column_names]
-     if exclude:
-         cols_to_process = [col for col in cols_to_process if col not in exclude]
-
-     # Prepare arguments for parallel processing
-     args_list = [
-         (
-             table[col_name],
-             col_name,
-             cols_to_process,
-             shrink_numerics,
-             allow_unsigned,
-             time_zone,
-             strict,
-             allow_null,
-         )
-         for col_name in table.column_names
-     ]
-
-     # Parallelize column processing
-     with concurrent.futures.ThreadPoolExecutor() as executor:
-         results = list(executor.map(_process_column_for_opt_dtype, args_list))
-
-     # Sort results to preserve column order
-     results.sort(key=lambda x: table.column_names.index(x[0]))
-     fields = [field for _, field, _ in results]
-     arrays = [array for _, _, array in results]
-
-     schema = pa.schema(fields)
-     if use_large_dtypes:
-         schema = convert_large_types_to_normal(schema)
-     return pa.Table.from_arrays(arrays, schema=schema)
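
For context, the deleted module exposed plain functions over PyArrow tables and schemas (schema unification, timezone standardization, and dtype optimization). The sketch below is a minimal, hedged usage example based only on the signatures shown in the diff above; it assumes the module was importable as flowerpower.plugins.io.helpers.pyarrow in 0.11.6.20 (mirroring the deleted file path) and that pyarrow is installed. None of this exists in 0.20.0, where the whole flowerpower/plugins/io subpackage was removed.

# Hedged sketch: exercising the removed helpers as defined above (0.11.6.20 only).
import pyarrow as pa

# Module path assumed from the deleted file location; gone in 0.20.0.
from flowerpower.plugins.io.helpers.pyarrow import (
    cast_schema,
    opt_dtype,
    unify_schemas,
)

# Two table fragments with overlapping but unequal schemas.
t1 = pa.table({"id": pa.array([1, 2], type=pa.int64()), "name": ["a", "b"]})
t2 = pa.table({"id": pa.array([3], type=pa.int32())})

# Build a single schema covering both fragments, then align each table to it.
target = unify_schemas([t1.schema, t2.schema], standardize_timezones=False)
aligned = [cast_schema(t, target) for t in (t1, t2)]

# Ask opt_dtype to shrink numeric columns where the value range allows it
# (e.g. an int64 column holding 1..2 can be stored as a small unsigned int).
optimized = opt_dtype(aligned[0], shrink_numerics=True, allow_unsigned=True)
print(optimized.schema)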