FlowerPower flowerpower-0.11.6.20-py3-none-any.whl → flowerpower-0.21.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowerpower/__init__.py +2 -6
- flowerpower/cfg/__init__.py +7 -14
- flowerpower/cfg/base.py +29 -25
- flowerpower/cfg/pipeline/__init__.py +8 -6
- flowerpower/cfg/pipeline/_schedule.py +32 -0
- flowerpower/cfg/pipeline/adapter.py +0 -5
- flowerpower/cfg/pipeline/builder.py +377 -0
- flowerpower/cfg/pipeline/run.py +36 -0
- flowerpower/cfg/project/__init__.py +11 -24
- flowerpower/cfg/project/adapter.py +0 -12
- flowerpower/cli/__init__.py +2 -21
- flowerpower/cli/cfg.py +0 -3
- flowerpower/cli/mqtt.py +0 -6
- flowerpower/cli/pipeline.py +22 -415
- flowerpower/cli/utils.py +0 -1
- flowerpower/flowerpower.py +345 -146
- flowerpower/pipeline/__init__.py +2 -0
- flowerpower/pipeline/base.py +21 -12
- flowerpower/pipeline/io.py +58 -54
- flowerpower/pipeline/manager.py +165 -726
- flowerpower/pipeline/pipeline.py +643 -0
- flowerpower/pipeline/registry.py +285 -18
- flowerpower/pipeline/visualizer.py +5 -6
- flowerpower/plugins/io/__init__.py +8 -0
- flowerpower/plugins/mqtt/__init__.py +7 -11
- flowerpower/settings/__init__.py +0 -2
- flowerpower/settings/{backend.py → _backend.py} +0 -21
- flowerpower/settings/logging.py +1 -1
- flowerpower/utils/logging.py +24 -12
- flowerpower/utils/misc.py +17 -256
- flowerpower/utils/monkey.py +1 -83
- flowerpower-0.21.0.dist-info/METADATA +463 -0
- flowerpower-0.21.0.dist-info/RECORD +44 -0
- flowerpower/cfg/pipeline/schedule.py +0 -74
- flowerpower/cfg/project/job_queue.py +0 -238
- flowerpower/cli/job_queue.py +0 -1061
- flowerpower/fs/__init__.py +0 -29
- flowerpower/fs/base.py +0 -662
- flowerpower/fs/ext.py +0 -2143
- flowerpower/fs/storage_options.py +0 -1420
- flowerpower/job_queue/__init__.py +0 -294
- flowerpower/job_queue/apscheduler/__init__.py +0 -11
- flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -110
- flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -93
- flowerpower/job_queue/apscheduler/manager.py +0 -1051
- flowerpower/job_queue/apscheduler/setup.py +0 -554
- flowerpower/job_queue/apscheduler/trigger.py +0 -169
- flowerpower/job_queue/apscheduler/utils.py +0 -311
- flowerpower/job_queue/base.py +0 -413
- flowerpower/job_queue/rq/__init__.py +0 -10
- flowerpower/job_queue/rq/_trigger.py +0 -37
- flowerpower/job_queue/rq/concurrent_workers/gevent_worker.py +0 -226
- flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -231
- flowerpower/job_queue/rq/manager.py +0 -1582
- flowerpower/job_queue/rq/setup.py +0 -154
- flowerpower/job_queue/rq/utils.py +0 -69
- flowerpower/mqtt.py +0 -12
- flowerpower/pipeline/job_queue.py +0 -583
- flowerpower/pipeline/runner.py +0 -603
- flowerpower/plugins/io/base.py +0 -2520
- flowerpower/plugins/io/helpers/datetime.py +0 -298
- flowerpower/plugins/io/helpers/polars.py +0 -875
- flowerpower/plugins/io/helpers/pyarrow.py +0 -570
- flowerpower/plugins/io/helpers/sql.py +0 -202
- flowerpower/plugins/io/loader/__init__.py +0 -28
- flowerpower/plugins/io/loader/csv.py +0 -37
- flowerpower/plugins/io/loader/deltatable.py +0 -190
- flowerpower/plugins/io/loader/duckdb.py +0 -19
- flowerpower/plugins/io/loader/json.py +0 -37
- flowerpower/plugins/io/loader/mqtt.py +0 -159
- flowerpower/plugins/io/loader/mssql.py +0 -26
- flowerpower/plugins/io/loader/mysql.py +0 -26
- flowerpower/plugins/io/loader/oracle.py +0 -26
- flowerpower/plugins/io/loader/parquet.py +0 -35
- flowerpower/plugins/io/loader/postgres.py +0 -26
- flowerpower/plugins/io/loader/pydala.py +0 -19
- flowerpower/plugins/io/loader/sqlite.py +0 -23
- flowerpower/plugins/io/metadata.py +0 -244
- flowerpower/plugins/io/saver/__init__.py +0 -28
- flowerpower/plugins/io/saver/csv.py +0 -36
- flowerpower/plugins/io/saver/deltatable.py +0 -186
- flowerpower/plugins/io/saver/duckdb.py +0 -19
- flowerpower/plugins/io/saver/json.py +0 -36
- flowerpower/plugins/io/saver/mqtt.py +0 -28
- flowerpower/plugins/io/saver/mssql.py +0 -26
- flowerpower/plugins/io/saver/mysql.py +0 -26
- flowerpower/plugins/io/saver/oracle.py +0 -26
- flowerpower/plugins/io/saver/parquet.py +0 -36
- flowerpower/plugins/io/saver/postgres.py +0 -26
- flowerpower/plugins/io/saver/pydala.py +0 -20
- flowerpower/plugins/io/saver/sqlite.py +0 -24
- flowerpower/plugins/mqtt/cfg.py +0 -17
- flowerpower/plugins/mqtt/manager.py +0 -962
- flowerpower/settings/job_queue.py +0 -87
- flowerpower/utils/scheduler.py +0 -311
- flowerpower-0.11.6.20.dist-info/METADATA +0 -537
- flowerpower-0.11.6.20.dist-info/RECORD +0 -102
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.21.0.dist-info}/WHEEL +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.21.0.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.21.0.dist-info}/licenses/LICENSE +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.21.0.dist-info}/top_level.txt +0 -0
--- flowerpower/plugins/io/helpers/pyarrow.py
+++ /dev/null
@@ -1,570 +0,0 @@
-import concurrent.futures
-
-import numpy as np
-import polars as pl
-import pyarrow as pa
-import pyarrow.compute as pc
-
-# Pre-compiled regex patterns (identical to original)
-INTEGER_REGEX = r"^[-+]?\d+$"
-FLOAT_REGEX = r"^[-+]?(?:\d*[.,])?\d+(?:[eE][-+]?\d+)?$"
-BOOLEAN_REGEX = r"^(true|false|1|0|yes|ja|no|nein|t|f|y|j|n|ok|nok)$"
-BOOLEAN_TRUE_REGEX = r"^(true|1|yes|ja|t|y|j|ok)$"
-DATETIME_REGEX = (
-    r"^("
-    r"\d{4}-\d{2}-\d{2}"  # ISO: 2023-12-31
-    r"|"
-    r"\d{2}/\d{2}/\d{4}"  # US: 12/31/2023
-    r"|"
-    r"\d{2}\.\d{2}\.\d{4}"  # German: 31.12.2023
-    r"|"
-    r"\d{8}"  # Compact: 20231231
-    r")"
-    r"([ T]\d{2}:\d{2}(:\d{2}(\.\d{1,6})?)?)?"  # Optional time: 23:59[:59[.123456]]
-    r"([+-]\d{2}:?\d{2}|Z)?"  # Optional timezone: +01:00, -0500, Z
-    r"$"
-)
-
-# Float32 range limits
-F32_MIN = float(np.finfo(np.float32).min)
-F32_MAX = float(np.finfo(np.float32).max)
-
-
-def dominant_timezone_per_column(
-    schemas: list[pa.Schema],
-) -> dict[str, tuple[str | None, str | None]]:
-    """
-    For each timestamp column (by name) across all schemas, detect the most frequent timezone (including None).
-    If None and a timezone are tied, prefer the timezone.
-    Returns a dict: {column_name: dominant_timezone}
-    """
-    from collections import Counter, defaultdict
-
-    tz_counts = defaultdict(Counter)
-    units = {}
-
-    for schema in schemas:
-        for field in schema:
-            if pa.types.is_timestamp(field.type):
-                tz = field.type.tz
-                name = field.name
-                tz_counts[name][tz] += 1
-                # Track unit for each column (assume consistent)
-                if name not in units:
-                    units[name] = field.type.unit
-
-    dominant = {}
-    for name, counter in tz_counts.items():
-        most_common = counter.most_common()
-        if not most_common:
-            continue
-        top_count = most_common[0][1]
-        # Find all with top_count
-        top_tzs = [tz for tz, cnt in most_common if cnt == top_count]
-        # If tie and one is not None, prefer not-None
-        if len(top_tzs) > 1 and any(tz is not None for tz in top_tzs):
-            tz = next(tz for tz in top_tzs if tz is not None)
-        else:
-            tz = most_common[0][0]
-        dominant[name] = (units[name], tz)
-    return dominant
-
-
-def standardize_schema_timezones_by_majority(
-    schemas: list[pa.Schema],
-) -> list[pa.Schema]:
-    """
-    For each timestamp column (by name) across all schemas, set the timezone to the most frequent (with tie-breaking).
-    Returns a new list of schemas with updated timestamp timezones.
-    """
-    dom = dominant_timezone_per_column(schemas)
-    new_schemas = []
-    for schema in schemas:
-        fields = []
-        for field in schema:
-            if pa.types.is_timestamp(field.type) and field.name in dom:
-                unit, tz = dom[field.name]
-                fields.append(
-                    pa.field(
-                        field.name,
-                        pa.timestamp(unit, tz),
-                        field.nullable,
-                        field.metadata,
-                    )
-                )
-            else:
-                fields.append(field)
-        new_schemas.append(pa.schema(fields, schema.metadata))
-    return new_schemas
-
-
-def standardize_schema_timezones(
-    schemas: list[pa.Schema], timezone: str | None = None
-) -> list[pa.Schema]:
-    """
-    Standardize timezone info for all timestamp columns in a list of PyArrow schemas.
-
-    Args:
-        schemas (list of pa.Schema): List of PyArrow schemas.
-        timezone (str or None): If None, remove timezone from all timestamp columns.
-            If str, set this timezone for all timestamp columns.
-            If "auto", use the most frequent timezone across schemas.
-
-    Returns:
-        list of pa.Schema: New schemas with standardized timezone info.
-    """
-    if timezone == "auto":
-        # Use the most frequent timezone for each column
-        return standardize_schema_timezones_by_majority(schemas)
-    new_schemas = []
-    for schema in schemas:
-        fields = []
-        for field in schema:
-            if pa.types.is_timestamp(field.type):
-                fields.append(
-                    pa.field(
-                        field.name,
-                        pa.timestamp(field.type.unit, timezone),
-                        field.nullable,
-                        field.metadata,
-                    )
-                )
-            else:
-                fields.append(field)
-        new_schemas.append(pa.schema(fields, schema.metadata))
-    return new_schemas
-
-
-def unify_schemas(
-    schemas: list[pa.Schema],
-    use_large_dtypes: bool = False,
-    timezone: str | None = None,
-    standardize_timezones: bool = True,
-) -> pa.Schema:
-    """
-    Unify a list of PyArrow schemas into a single schema.
-
-    Args:
-        schemas (list[pa.Schema]): List of PyArrow schemas to unify.
-        use_large_dtypes (bool): If True, keep large types like large_string.
-        timezone (str | None): If specified, standardize all timestamp columns to this timezone.
-            If "auto", use the most frequent timezone across schemas.
-            If None, remove timezone from all timestamp columns.
-        standardize_timezones (bool): If True, standardize all timestamp columns to the most frequent timezone.
-
-    Returns:
-        pa.Schema: A unified PyArrow schema.
-    """
-    if standardize_timezones:
-        schemas = standardize_schema_timezones(schemas, timezone)
-    try:
-        return pa.unify_schemas(schemas, promote_options="permissive")
-    except (pa.lib.ArrowInvalid, pa.lib.ArrowTypeError) as e:
-        _ = e.args[0]
-        # If unify_schemas fails, we can try to create a schema with empty tables
-        schema = (
-            pl.concat(
-                [
-                    # pl.from_arrow(pa.Table.from_pylist([], schema=schema))
-                    pl.from_arrow(schema.empty_table())
-                    for schema in schemas
-                ],
-                how="diagonal_relaxed",
-            )
-            .to_arrow()
-            .schema
-        )
-        if not use_large_dtypes:
-            return convert_large_types_to_normal(schema)
-        return schema
-
-
-def cast_schema(table: pa.Table, schema: pa.Schema) -> pa.Table:
-    """
-    Cast a PyArrow table to a given schema, updating the schema to match the table's columns.
-
-    Args:
-        table (pa.Table): The PyArrow table to cast.
-        schema (pa.Schema): The target schema to cast the table to.
-
-    Returns:
-        pa.Table: A new PyArrow table with the specified schema.
-    """
-    # Filter schema fields to only those present in the table
-    table_columns = set(table.schema.names)
-    filtered_fields = [field for field in schema if field.name in table_columns]
-    updated_schema = pa.schema(filtered_fields)
-    return table.select(updated_schema.names).cast(updated_schema)
-
-
-def convert_large_types_to_normal(schema: pa.Schema) -> pa.Schema:
-    """
-    Convert large types in a PyArrow schema to their standard types.
-
-    Args:
-        schema (pa.Schema): The PyArrow schema to convert.
-
-    Returns:
-        pa.Schema: A new PyArrow schema with large types converted to standard types.
-    """
-    # Define mapping of large types to standard types
-    type_mapping = {
-        pa.large_string(): pa.string(),
-        pa.large_binary(): pa.binary(),
-        pa.large_utf8(): pa.utf8(),
-        pa.large_list(pa.null()): pa.list_(pa.null()),
-        pa.large_list_view(pa.null()): pa.list_view(pa.null()),
-    }
-    # Convert fields
-    new_fields = []
-    for field in schema:
-        field_type = field.type
-        # Check if type exists in mapping
-        if field_type in type_mapping:
-            new_field = pa.field(
-                name=field.name,
-                type=type_mapping[field_type],
-                nullable=field.nullable,
-                metadata=field.metadata,
-            )
-            new_fields.append(new_field)
-        # Handle large lists with nested types
-        elif isinstance(field_type, pa.LargeListType):
-            new_field = pa.field(
-                name=field.name,
-                type=pa.list_(
-                    type_mapping[field_type.value_type]
-                    if field_type.value_type in type_mapping
-                    else field_type.value_type
-                ),
-                nullable=field.nullable,
-                metadata=field.metadata,
-            )
-            new_fields.append(new_field)
-        # Handle dictionary with large_string, large_utf8, or large_binary values
-        elif isinstance(field_type, pa.DictionaryType):
-            new_field = pa.field(
-                name=field.name,
-                type=pa.dictionary(
-                    field_type.index_type,
-                    type_mapping[field_type.value_type]
-                    if field_type.value_type in type_mapping
-                    else field_type.value_type,
-                    field_type.ordered,
-                ),
-                # nullable=field.nullable,
-                metadata=field.metadata,
-            )
-            new_fields.append(new_field)
-        else:
-            new_fields.append(field)
-
-    return pa.schema(new_fields)
-
-
-def _clean_string_array(array: pa.Array) -> pa.DataType:
-    """
-    Clean string values in a PyArrow array using vectorized operations.
-    Returns the optimal dtype after cleaning.
-    """
-    if len(array) == 0 or array.null_count == len(array):
-        return array.type
-
-    # Trim whitespace using compute functions
-    trimmed = pc.utf8_trim_whitespace(array)
-
-    # Create mask for values to convert to null
-    empty_mask = pc.equal(trimmed, "")
-    dash_mask = pc.equal(trimmed, "-")
-    none_mask = pc.or_(
-        pc.equal(trimmed, "None"),
-        pc.equal(trimmed, "none"),
-        pc.equal(trimmed, "NONE"),
-        pc.equal(trimmed, "Nan"),
-        pc.equal(trimmed, "N/A"),
-        pc.equal(trimmed, "n/a"),
-        pc.equal(trimmed, "NaN"),
-        pc.equal(trimmed, "nan"),
-        pc.equal(trimmed, "NAN"),
-        pc.equal(trimmed, "Null"),
-        pc.equal(trimmed, "NULL"),
-        pc.equal(trimmed, "null"),
-    )
-
-    null_mask = pc.or_(pc.or_(empty_mask, dash_mask), none_mask)
-
-    # If all values are null after cleaning, return null type
-    if pc.all(null_mask).as_py():
-        return pa.null()
-
-    return array.type  # Default: keep string type if not all null
-
-
-def _can_downcast_to_float32(array: pa.Array) -> bool:
-    """
-    Check if float values are within Float32 range using vectorized operations.
-    """
-    if len(array) == 0 or array.null_count == len(array):
-        return True
-
-    is_finite = pc.is_finite(array)
-    if not pc.any(is_finite).as_py():
-        return True
-
-    finite_array = pc.filter(array, is_finite)
-    min_val = pc.min(finite_array).as_py()
-    max_val = pc.max(finite_array).as_py()
-
-    return F32_MIN <= min_val <= max_val <= F32_MAX
-
-
-def _get_optimal_int_type(
-    array: pa.Array, allow_unsigned: bool, allow_null: bool = True
-) -> pa.DataType:
-    """
-    Determine the most efficient integer type based on data range.
-    """
-    if len(array) == 0 or array.null_count == len(array):
-        if allow_null:
-            return pa.null()
-        else:
-            # If all values are null and allow_null is False, default to int8
-            return pa.int8()
-
-    min_max = pc.min_max(array)
-    min_val = min_max["min"].as_py()
-    max_val = min_max["max"].as_py()
-
-    if allow_unsigned and min_val >= 0:
-        if max_val <= 255:
-            return pa.uint8()
-        elif max_val <= 65535:
-            return pa.uint16()
-        elif max_val <= 4294967295:
-            return pa.uint32()
-        else:
-            return pa.uint64()
-    else:
-        if -128 <= min_val and max_val <= 127:
-            return pa.int8()
-        elif -32768 <= min_val and max_val <= 32767:
-            return pa.int16()
-        elif -2147483648 <= min_val and max_val <= 2147483647:
-            return pa.int32()
-        else:
-            return pa.int64()
-
-
-def _optimize_numeric_array(
-    array: pa.Array, shrink: bool, allow_unsigned: bool = True, allow_null: bool = True
-) -> pa.DataType:
-    """
-    Optimize numeric PyArrow array by downcasting when possible.
-    Returns the optimal dtype.
-    """
-
-    if not shrink or len(array) == 0 or array.null_count == len(array):
-        if allow_null:
-            return pa.null()
-        else:
-            return array.type
-
-    if pa.types.is_floating(array.type):
-        if array.type == pa.float64() and _can_downcast_to_float32(array):
-            return pa.float32()
-        return array.type
-
-    if pa.types.is_integer(array.type):
-        return _get_optimal_int_type(array, allow_unsigned, allow_null)
-
-    return array.type
-
-
-def _all_match_regex(array: pa.Array, pattern: str) -> bool:
-    """
-    Check if all non-null values in array match regex pattern.
-    """
-    if len(array) == 0 or array.null_count == len(array):
-        return False
-    return pc.all(pc.match_substring_regex(array, pattern, ignore_case=True)).as_py()
-
-
-def _optimize_string_array(
-    array: pa.Array,
-    col_name: str,
-    shrink_numerics: bool,
-    time_zone: str | None = None,
-    allow_unsigned: bool = True,
-    allow_null: bool = True,
-) -> pa.DataType:
-    """
-    Convert string PyArrow array to appropriate type based on content analysis.
-    Returns the optimal dtype.
-    """
-    if len(array) == 0 or array.null_count == len(array):
-        if allow_null:
-            return pa.null()
-        else:
-            return array.type
-
-    cleaned_array = _clean_string_array(
-        array, allow_null
-    )  # pc.utf8_trim_whitespace(array)
-
-    try:
-        if _all_match_regex(cleaned_array, BOOLEAN_REGEX):
-            return pa.bool_()
-        elif _all_match_regex(cleaned_array, INTEGER_REGEX):
-            int_array = pc.cast(
-                pc.replace_substring(cleaned_array, ",", "."), pa.int64()
-            )
-            return _optimize_numeric_array(
-                int_array, allow_unsigned=allow_unsigned, allow_null=allow_null
-            )
-        elif _all_match_regex(cleaned_array, FLOAT_REGEX):
-            float_array = pc.cast(
-                pc.replace_substring(cleaned_array, ",", "."), pa.float64()
-            )
-            return _optimize_numeric_array(
-                float_array,
-                shrink_numerics,
-                allow_unsigned=allow_unsigned,
-                allow_null=allow_null,
-            )
-        elif _all_match_regex(cleaned_array, DATETIME_REGEX):
-            pl_series = pl.Series(col_name, cleaned_array)
-            converted = pl_series.str.to_datetime(
-                strict=False, time_unit="us", time_zone=time_zone
-            )
-            # Get the Arrow dtype from Polars
-            arrow_dtype = converted.to_arrow().type
-            return arrow_dtype
-    except Exception:
-        return pa.string()
-
-    return pa.string()
-
-
-def _process_column(
-    # table: pa.Table,
-    # col_name: str,
-    array: pa.Array,
-    col_name: str,
-    shrink_numerics: bool,
-    allow_unsigned: bool,
-    time_zone: str | None = None,
-) -> pa.Field:
-    """
-    Process a single column for type optimization.
-    Returns a pyarrow.Field with the optimal dtype.
-    """
-    # array = table[col_name]
-    if array.null_count == len(array):
-        return pa.field(col_name, pa.null())
-
-    if pa.types.is_floating(array.type) or pa.types.is_integer(array.type):
-        dtype = _optimize_numeric_array(array, shrink_numerics, allow_unsigned)
-        return pa.field(col_name, dtype, nullable=array.null_count > 0)
-    elif pa.types.is_string(array.type):
-        dtype = _optimize_string_array(array, col_name, shrink_numerics, time_zone)
-        return pa.field(col_name, dtype, nullable=array.null_count > 0)
-
-    return pa.field(col_name, array.type, nullable=array.null_count > 0)
-
-
-def _process_column_for_opt_dtype(args):
-    (
-        array,
-        col_name,
-        cols_to_process,
-        shrink_numerics,
-        allow_unsigned,
-        time_zone,
-        strict,
-        allow_null,
-    ) = args
-    try:
-        if col_name in cols_to_process:
-            field = _process_column(
-                array, col_name, shrink_numerics, allow_unsigned, time_zone
-            )
-            if pa.types.is_null(field.type):
-                if allow_null:
-                    array = pa.nulls(array.length(), type=pa.null())
-                    return (col_name, field, array)
-                else:
-                    orig_type = array.type
-                    # array = table[col_name]
-                    field = pa.field(col_name, orig_type, nullable=True)
-                    return (col_name, field, array)
-            else:
-                array = array.cast(field.type)
-                return (col_name, field, array)
-        else:
-            field = pa.field(col_name, array.type, nullable=True)
-            # array = table[col_name]
-            return (col_name, field, array)
-    except Exception as e:
-        if strict:
-            raise e
-        field = pa.field(col_name, array.type, nullable=True)
-        return (col_name, field, array)
-
-
-def opt_dtype(
-    table: pa.Table,
-    include: str | list[str] | None = None,
-    exclude: str | list[str] | None = None,
-    time_zone: str | None = None,
-    shrink_numerics: bool = True,
-    allow_unsigned: bool = True,
-    use_large_dtypes: bool = False,
-    strict: bool = False,
-    allow_null: bool = True,
-) -> pa.Table:
-    """
-    Optimize data types of a PyArrow Table for performance and memory efficiency.
-    Returns a new table casted to the optimal schema.
-
-    Args:
-        allow_null (bool): If False, columns that only hold null-like values will not be converted to pyarrow.null().
-    """
-    if isinstance(include, str):
-        include = [include]
-    if isinstance(exclude, str):
-        exclude = [exclude]
-
-    cols_to_process = table.column_names
-    if include:
-        cols_to_process = [col for col in include if col in table.column_names]
-    if exclude:
-        cols_to_process = [col for col in cols_to_process if col not in exclude]
-
-    # Prepare arguments for parallel processing
-    args_list = [
-        (
-            table[col_name],
-            col_name,
-            cols_to_process,
-            shrink_numerics,
-            allow_unsigned,
-            time_zone,
-            strict,
-            allow_null,
-        )
-        for col_name in table.column_names
-    ]
-
-    # Parallelize column processing
-    with concurrent.futures.ThreadPoolExecutor() as executor:
-        results = list(executor.map(_process_column_for_opt_dtype, args_list))
-
-    # Sort results to preserve column order
-    results.sort(key=lambda x: table.column_names.index(x[0]))
-    fields = [field for _, field, _ in results]
-    arrays = [array for _, _, array in results]
-
-    schema = pa.schema(fields)
-    if use_large_dtypes:
-        schema = convert_large_types_to_normal(schema)
-    return pa.Table.from_arrays(arrays, schema=schema)
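The centerpiece of the removed module is opt_dtype, which infers tighter Arrow dtypes per column in parallel. A minimal usage sketch, assuming a pre-0.21 install (e.g. pip install "flowerpower<0.21") where flowerpower.plugins.io.helpers.pyarrow still ships; the sample table is hypothetical:

import pyarrow as pa

from flowerpower.plugins.io.helpers.pyarrow import opt_dtype

# An all-string table, as naive CSV ingestion often produces (hypothetical data).
table = pa.table(
    {
        "id": ["1", "2", "3"],
        "price": ["1.5", "2.25", "3.0"],
        "active": ["yes", "no", "yes"],
        "seen_at": ["2023-12-31 23:59", "2024-01-01 00:00", "2024-01-01 00:01"],
    }
)

# Per the docstrings above: infer bool/int/float/datetime types from string
# content, downcast numerics when shrink_numerics=True, and fall back to
# plain string for any column whose conversion fails.
optimized = opt_dtype(table, shrink_numerics=True, time_zone="UTC")
print(optimized.schema)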
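unify_schemas merged per-file schemas, optionally forcing timestamp columns onto a single timezone. A sketch under the same pre-0.21 assumption, with made-up schemas whose ts columns disagree on timezone:

import pyarrow as pa

from flowerpower.plugins.io.helpers.pyarrow import unify_schemas

schemas = [
    pa.schema([("ts", pa.timestamp("us", "UTC")), ("v", pa.int64())]),
    pa.schema([("ts", pa.timestamp("us", "UTC")), ("v", pa.float64())]),
    pa.schema([("ts", pa.timestamp("us")), ("w", pa.string())]),
]

# timezone="auto" picks the most frequent timezone per column (a tie prefers a
# concrete timezone over None); int64 and float64 are promoted permissively.
unified = unify_schemas(schemas, timezone="auto")
print(unified)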
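convert_large_types_to_normal is the normalization unify_schemas applies when use_large_dtypes is False, mapping large_* Arrow types to their standard counterparts. Again a sketch, same pre-0.21 assumption and made-up schema:

import pyarrow as pa

from flowerpower.plugins.io.helpers.pyarrow import convert_large_types_to_normal

schema = pa.schema(
    [
        ("name", pa.large_string()),
        ("payload", pa.large_binary()),
        ("tags", pa.large_list(pa.large_string())),
    ]
)

# large_string -> string, large_binary -> binary; list value types are
# rewritten only when they appear in the module's type mapping.
print(convert_large_types_to_normal(schema))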