tracdap-runtime 0.6.5__py3-none-any.whl → 0.7.0rc1__py3-none-any.whl

This diff compares the published contents of two package versions released to a supported public registry. It is provided for informational purposes only and reflects the packages as they appear in that registry.
Files changed (89)
  1. tracdap/rt/__init__.py +6 -5
  2. tracdap/rt/_exec/actors.py +6 -5
  3. tracdap/rt/_exec/context.py +278 -110
  4. tracdap/rt/_exec/dev_mode.py +237 -143
  5. tracdap/rt/_exec/engine.py +223 -64
  6. tracdap/rt/_exec/functions.py +31 -6
  7. tracdap/rt/_exec/graph.py +15 -5
  8. tracdap/rt/_exec/graph_builder.py +301 -203
  9. tracdap/rt/_exec/runtime.py +13 -10
  10. tracdap/rt/_exec/server.py +6 -5
  11. tracdap/rt/_impl/__init__.py +6 -5
  12. tracdap/rt/_impl/config_parser.py +17 -9
  13. tracdap/rt/_impl/data.py +284 -172
  14. tracdap/rt/_impl/ext/__init__.py +14 -0
  15. tracdap/rt/_impl/ext/sql.py +117 -0
  16. tracdap/rt/_impl/ext/storage.py +58 -0
  17. tracdap/rt/_impl/grpc/__init__.py +6 -5
  18. tracdap/rt/_impl/grpc/codec.py +6 -5
  19. tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.py +62 -54
  20. tracdap/rt/_impl/grpc/tracdap/metadata/job_pb2.pyi +37 -2
  21. tracdap/rt/_impl/guard_rails.py +6 -5
  22. tracdap/rt/_impl/models.py +6 -5
  23. tracdap/rt/_impl/repos.py +6 -5
  24. tracdap/rt/_impl/schemas.py +6 -5
  25. tracdap/rt/_impl/shim.py +6 -5
  26. tracdap/rt/_impl/static_api.py +30 -16
  27. tracdap/rt/_impl/storage.py +8 -7
  28. tracdap/rt/_impl/type_system.py +6 -5
  29. tracdap/rt/_impl/util.py +16 -5
  30. tracdap/rt/_impl/validation.py +72 -18
  31. tracdap/rt/_plugins/__init__.py +6 -5
  32. tracdap/rt/_plugins/_helpers.py +6 -5
  33. tracdap/rt/_plugins/config_local.py +6 -5
  34. tracdap/rt/_plugins/format_arrow.py +6 -5
  35. tracdap/rt/_plugins/format_csv.py +6 -5
  36. tracdap/rt/_plugins/format_parquet.py +6 -5
  37. tracdap/rt/_plugins/repo_git.py +6 -5
  38. tracdap/rt/_plugins/repo_local.py +6 -5
  39. tracdap/rt/_plugins/repo_pypi.py +6 -5
  40. tracdap/rt/_plugins/storage_aws.py +6 -5
  41. tracdap/rt/_plugins/storage_azure.py +6 -5
  42. tracdap/rt/_plugins/storage_gcp.py +6 -5
  43. tracdap/rt/_plugins/storage_local.py +6 -5
  44. tracdap/rt/_plugins/storage_sql.py +418 -0
  45. tracdap/rt/_plugins/storage_sql_dialects.py +118 -0
  46. tracdap/rt/_version.py +7 -6
  47. tracdap/rt/api/__init__.py +23 -5
  48. tracdap/rt/api/experimental.py +85 -37
  49. tracdap/rt/api/hook.py +16 -5
  50. tracdap/rt/api/model_api.py +110 -90
  51. tracdap/rt/api/static_api.py +142 -100
  52. tracdap/rt/config/common.py +26 -27
  53. tracdap/rt/config/job.py +5 -6
  54. tracdap/rt/config/platform.py +41 -42
  55. tracdap/rt/config/result.py +5 -6
  56. tracdap/rt/config/runtime.py +6 -7
  57. tracdap/rt/exceptions.py +13 -7
  58. tracdap/rt/ext/__init__.py +6 -5
  59. tracdap/rt/ext/config.py +6 -5
  60. tracdap/rt/ext/embed.py +6 -5
  61. tracdap/rt/ext/plugins.py +6 -5
  62. tracdap/rt/ext/repos.py +6 -5
  63. tracdap/rt/ext/storage.py +6 -5
  64. tracdap/rt/launch/__init__.py +10 -5
  65. tracdap/rt/launch/__main__.py +6 -5
  66. tracdap/rt/launch/cli.py +6 -5
  67. tracdap/rt/launch/launch.py +38 -15
  68. tracdap/rt/metadata/__init__.py +4 -0
  69. tracdap/rt/metadata/common.py +2 -3
  70. tracdap/rt/metadata/custom.py +3 -4
  71. tracdap/rt/metadata/data.py +30 -31
  72. tracdap/rt/metadata/file.py +6 -7
  73. tracdap/rt/metadata/flow.py +22 -23
  74. tracdap/rt/metadata/job.py +89 -45
  75. tracdap/rt/metadata/model.py +26 -27
  76. tracdap/rt/metadata/object.py +11 -12
  77. tracdap/rt/metadata/object_id.py +23 -24
  78. tracdap/rt/metadata/resource.py +0 -1
  79. tracdap/rt/metadata/search.py +15 -16
  80. tracdap/rt/metadata/stoarge.py +22 -23
  81. tracdap/rt/metadata/tag.py +8 -9
  82. tracdap/rt/metadata/tag_update.py +11 -12
  83. tracdap/rt/metadata/type.py +38 -38
  84. {tracdap_runtime-0.6.5.dist-info → tracdap_runtime-0.7.0rc1.dist-info}/LICENSE +1 -1
  85. {tracdap_runtime-0.6.5.dist-info → tracdap_runtime-0.7.0rc1.dist-info}/METADATA +4 -2
  86. tracdap_runtime-0.7.0rc1.dist-info/RECORD +121 -0
  87. {tracdap_runtime-0.6.5.dist-info → tracdap_runtime-0.7.0rc1.dist-info}/WHEEL +1 -1
  88. tracdap_runtime-0.6.5.dist-info/RECORD +0 -116
  89. {tracdap_runtime-0.6.5.dist-info → tracdap_runtime-0.7.0rc1.dist-info}/top_level.txt +0 -0
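The headline change below is in tracdap/rt/_impl/data.py: the per-framework helpers on DataMapping (view_to_pandas, pandas_to_arrow, polars_to_arrow, and so on) are replaced by a generic DataConverter hierarchy, with PandasArrowConverter and PolarsArrowConverter selected according to the dataset type. A minimal sketch of the new pattern, using only names that appear in the diff that follows; note this is internal API (tracdap.rt._impl) and may change, and the example dataframe is hypothetical:

import pandas as pd

import tracdap.rt._impl.data as _data

# Hypothetical example data, just to exercise the converter
df = pd.DataFrame({"id": [1, 2, 3], "value": [0.25, 0.5, 0.75]})

# Select a converter based on the dataset type (pandas or polars)
converter = _data.DataConverter.for_dataset(df)

# Infer a TRAC SchemaDefinition from the dataframe
trac_schema = converter.infer_schema(df)

# Round-trip through the internal Arrow representation
table = converter.to_internal(df)
df_back = converter.from_internal(table)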
tracdap/rt/_impl/data.py CHANGED
@@ -1,8 +1,9 @@
- # Copyright 2022 Accenture Global Solutions Limited
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
+ # Licensed to the Fintech Open Source Foundation (FINOS) under one or
+ # more contributor license agreements. See the NOTICE file distributed
+ # with this work for additional information regarding copyright ownership.
+ # FINOS licenses this file to you under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with the
+ # License. You may obtain a copy of the License at
  #
  # http://www.apache.org/licenses/LICENSE-2.0
  #
@@ -12,6 +13,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.

+ import abc
  import dataclasses as dc
  import typing as tp
  import datetime as dt
@@ -31,6 +33,7 @@ try:
  except ModuleNotFoundError:
  polars = None

+ import tracdap.rt.api.experimental as _api
  import tracdap.rt.metadata as _meta
  import tracdap.rt.exceptions as _ex
  import tracdap.rt._impl.util as _util
@@ -116,73 +119,19 @@ class DataMapping:

  # Matches TRAC_ARROW_TYPE_MAPPING in ArrowSchema, tracdap-lib-data

- __TRAC_DECIMAL_PRECISION = 38
- __TRAC_DECIMAL_SCALE = 12
- __TRAC_TIMESTAMP_UNIT = "ms"
- __TRAC_TIMESTAMP_ZONE = None
+ DEFAULT_DECIMAL_PRECISION = 38
+ DEFAULT_DECIMAL_SCALE = 12
+ DEFAULT_TIMESTAMP_UNIT = "ms"
+ DEFAULT_TIMESTAMP_ZONE = None

  __TRAC_TO_ARROW_BASIC_TYPE_MAPPING = {
  _meta.BasicType.BOOLEAN: pa.bool_(),
  _meta.BasicType.INTEGER: pa.int64(),
  _meta.BasicType.FLOAT: pa.float64(),
- _meta.BasicType.DECIMAL: pa.decimal128(__TRAC_DECIMAL_PRECISION, __TRAC_DECIMAL_SCALE),
+ _meta.BasicType.DECIMAL: pa.decimal128(DEFAULT_DECIMAL_PRECISION, DEFAULT_DECIMAL_SCALE),
  _meta.BasicType.STRING: pa.utf8(),
  _meta.BasicType.DATE: pa.date32(),
- _meta.BasicType.DATETIME: pa.timestamp(__TRAC_TIMESTAMP_UNIT, __TRAC_TIMESTAMP_ZONE)
- }
-
- # Check the Pandas dtypes for handling floats are available before setting up the type mapping
- __PANDAS_VERSION_ELEMENTS = pandas.__version__.split(".")
- __PANDAS_MAJOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[0])
- __PANDAS_MINOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[1])
-
- if __PANDAS_MAJOR_VERSION == 2:
-
- __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
- __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(__TRAC_TIMESTAMP_UNIT).dtype
-
- @classmethod
- def __pandas_datetime_type(cls, tz, unit):
- if tz is None and unit is None:
- return cls.__PANDAS_DATETIME_TYPE
- _unit = unit if unit is not None else cls.__TRAC_TIMESTAMP_UNIT
- if tz is None:
- return pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(_unit).dtype
- else:
- return pandas.DatetimeTZDtype(tz=tz, unit=_unit)
-
- # Minimum supported version for Pandas is 1.2, when pandas.Float64Dtype was introduced
- elif __PANDAS_MAJOR_VERSION == 1 and __PANDAS_MINOR_VERSION >= 2:
-
- __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).dtype
- __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).dtype
-
- @classmethod
- def __pandas_datetime_type(cls, tz, unit): # noqa
- if tz is None:
- return cls.__PANDAS_DATETIME_TYPE
- else:
- return pandas.DatetimeTZDtype(tz=tz)
-
- else:
- raise _ex.EStartup(f"Pandas version not supported: [{pandas.__version__}]")
-
- # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
- __ARROW_TO_PANDAS_TYPE_MAPPING = {
- pa.bool_(): pandas.BooleanDtype(),
- pa.int8(): pandas.Int8Dtype(),
- pa.int16(): pandas.Int16Dtype(),
- pa.int32(): pandas.Int32Dtype(),
- pa.int64(): pandas.Int64Dtype(),
- pa.uint8(): pandas.UInt8Dtype(),
- pa.uint16(): pandas.UInt16Dtype(),
- pa.uint32(): pandas.UInt32Dtype(),
- pa.uint64(): pandas.UInt64Dtype(),
- pa.float16(): pandas.Float32Dtype(),
- pa.float32(): pandas.Float32Dtype(),
- pa.float64(): pandas.Float64Dtype(),
- pa.string(): pandas.StringDtype(),
- pa.utf8(): pandas.StringDtype()
+ _meta.BasicType.DATETIME: pa.timestamp(DEFAULT_TIMESTAMP_UNIT, DEFAULT_TIMESTAMP_ZONE)
  }

  __ARROW_TO_TRAC_BASIC_TYPE_MAPPING = {
@@ -243,7 +192,7 @@ class DataMapping:
  return pa.float64()

  if python_type == decimal.Decimal:
- return pa.decimal128(cls.__TRAC_DECIMAL_PRECISION, cls.__TRAC_DECIMAL_SCALE)
+ return pa.decimal128(cls.DEFAULT_DECIMAL_PRECISION, cls.DEFAULT_DECIMAL_SCALE)

  if python_type == str:
  return pa.utf8()
@@ -252,7 +201,7 @@
  return pa.date32()

  if python_type == dt.datetime:
- return pa.timestamp(cls.__TRAC_TIMESTAMP_UNIT, cls.__TRAC_TIMESTAMP_ZONE)
+ return pa.timestamp(cls.DEFAULT_TIMESTAMP_UNIT, cls.DEFAULT_TIMESTAMP_ZONE)

  raise _ex.ETracInternal(f"No Arrow type mapping available for Python type [{python_type}]")

@@ -293,8 +242,8 @@
  def trac_arrow_decimal_type(cls) -> pa.Decimal128Type:

  return pa.decimal128(
- cls.__TRAC_DECIMAL_PRECISION,
- cls.__TRAC_DECIMAL_SCALE)
+ cls.DEFAULT_DECIMAL_PRECISION,
+ cls.DEFAULT_DECIMAL_SCALE,)

  @classmethod
  def arrow_to_trac_schema(cls, arrow_schema: pa.Schema) -> _meta.SchemaDefinition:
@@ -337,41 +286,6 @@ class DataMapping:

  raise _ex.ETracInternal(f"No data type mapping available for Arrow type [{arrow_type}]")

- @classmethod
- def pandas_date_type(cls):
- return cls.__PANDAS_DATE_TYPE
-
- @classmethod
- def pandas_datetime_type(cls, tz=None, unit=None):
- return cls.__pandas_datetime_type(tz, unit)
-
- @classmethod
- def view_to_pandas(
- cls, view: DataView, part: DataPartKey, schema: tp.Optional[pa.Schema],
- temporal_objects_flag: bool) -> "pandas.DataFrame":
-
- table = cls.view_to_arrow(view, part)
- return cls.arrow_to_pandas(table, schema, temporal_objects_flag)
-
- @classmethod
- def view_to_polars(
- cls, view: DataView, part: DataPartKey, schema: tp.Optional[pa.Schema]):
-
- table = cls.view_to_arrow(view, part)
- return cls.arrow_to_polars(table, schema)
-
- @classmethod
- def pandas_to_item(cls, df: "pandas.DataFrame", schema: tp.Optional[pa.Schema]) -> DataItem:
-
- table = cls.pandas_to_arrow(df, schema)
- return DataItem(table.schema, table)
-
- @classmethod
- def polars_to_item(cls, df: "polars.DataFrame", schema: tp.Optional[pa.Schema]) -> DataItem:
-
- table = cls.polars_to_arrow(df, schema)
- return DataItem(table.schema, table)
-
  @classmethod
  def add_item_to_view(cls, view: DataView, part: DataPartKey, item: DataItem) -> DataView:

@@ -420,108 +334,306 @@ class DataMapping:

  @classmethod
  def arrow_to_pandas(
- cls, table: pa.Table, schema: tp.Optional[pa.Schema] = None,
+ cls, table: pa.Table,
+ schema: tp.Optional[pa.Schema] = None,
  temporal_objects_flag: bool = False) -> "pandas.DataFrame":

- if schema is not None:
- table = DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
- else:
- DataConformance.check_duplicate_fields(table.schema.names, False)
+ # This is a legacy internal method and should be removed
+ # DataMapping is no longer responsible for individual data APIs

- # Use Arrow's built-in function to convert to Pandas
- return table.to_pandas(
+ # Maintained temporarily for compatibility with existing deployments

- # Mapping for arrow -> pandas types for core types
- types_mapper=cls.__ARROW_TO_PANDAS_TYPE_MAPPING.get,
+ converter = PandasArrowConverter(_api.PANDAS, use_temporal_objects=temporal_objects_flag)
+ return converter.from_internal(table, schema)

- # Use Python objects for dates and times if temporal_objects_flag is set
- date_as_object=temporal_objects_flag, # noqa
- timestamp_as_object=temporal_objects_flag, # noqa
+ @classmethod
+ def pandas_to_arrow(
+ cls, df: "pandas.DataFrame",
+ schema: tp.Optional[pa.Schema] = None) -> pa.Table:

- # Do not bring any Arrow metadata into Pandas dataframe
- ignore_metadata=True, # noqa
+ # This is a legacy internal method and should be removed
+ # DataMapping is no longer responsible for individual data APIs

- # Do not consolidate memory across columns when preparing the Pandas vectors
- # This is a significant performance win for very wide datasets
- split_blocks=True) # noqa
+ # Maintained temporarily for compatibility with existing deployments

- @classmethod
- def arrow_to_polars(
- cls, table: pa.Table, schema: tp.Optional[pa.Schema] = None) -> "polars.DataFrame":
+ converter = PandasArrowConverter(_api.PANDAS)
+ return converter.to_internal(df, schema)

- if schema is not None:
- table = DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
- else:
- DataConformance.check_duplicate_fields(table.schema.names, False)

- return polars.from_arrow(table)

- @classmethod
- def pandas_to_arrow(cls, df: "pandas.DataFrame", schema: tp.Optional[pa.Schema] = None) -> pa.Table:
+ T_DATA_API = tp.TypeVar("T_DATA_API")
+ T_INTERNAL_DATA = tp.TypeVar("T_INTERNAL_DATA")
+ T_INTERNAL_SCHEMA = tp.TypeVar("T_INTERNAL_SCHEMA")

- # Converting pandas -> arrow needs care to ensure type coercion is applied correctly
- # Calling Table.from_pandas with the supplied schema will very often reject data
- # Instead, we convert the dataframe as-is and then apply type conversion in a second step
- # This allows us to apply specific coercion rules for each data type

- # As an optimisation, the column filter means columns will not be converted if they are not needed
- # E.g. if a model outputs lots of undeclared columns, there is no need to convert them
+ class DataConverter(tp.Generic[T_DATA_API, T_INTERNAL_DATA, T_INTERNAL_SCHEMA]):

- column_filter = DataConformance.column_filter(df.columns, schema) # noqa
+ # Available per-framework args, to enable framework-specific type-checking in public APIs
+ # These should (for a purist point of view) be in the individual converter classes
+ # For now there are only a few converters, they are all defined here so this is OK
+ __FRAMEWORK_ARGS = {
+ _api.PANDAS: {"use_temporal_objects": tp.Optional[bool]},
+ _api.POLARS: {}
+ }

- if len(df) > 0:
+ @classmethod
+ def get_framework(cls, dataset: _api.DATA_API) -> _api.DataFramework[_api.DATA_API]:

- table = pa.Table.from_pandas(df, columns=column_filter, preserve_index=False) # noqa
+ if pandas is not None and isinstance(dataset, pandas.DataFrame):
+ return _api.PANDAS

- # Special case handling for converting an empty dataframe
- # These must flow through the pipe with valid schemas, like any other dataset
- # Type coercion and column filtering happen in conform_to_schema, if a schema has been supplied
+ if polars is not None and isinstance(dataset, polars.DataFrame):
+ return _api.POLARS

- else:
+ data_api_type = f"{type(dataset).__module__}.{type(dataset).__name__}"
+ raise _ex.EPluginNotAvailable(f"No data framework available for type [{data_api_type}]")

- empty_df = df.filter(column_filter) if column_filter else df
- empty_schema = pa.Schema.from_pandas(empty_df, preserve_index=False) # noqa
+ @classmethod
+ def get_framework_args(cls, framework: _api.DataFramework[_api.DATA_API]) -> tp.Dict[str, type]:

- table = pa.Table.from_batches(list(), empty_schema) # noqa
+ return cls.__FRAMEWORK_ARGS.get(framework) or {}

- # If there is no explict schema, give back the table exactly as it was received from Pandas
- # There could be an option here to infer and coerce for TRAC standard types
- # E.g. unsigned int 32 -> signed int 64, TRAC standard integer type
+ @classmethod
+ def for_framework(cls, framework: _api.DataFramework[_api.DATA_API], **framework_args) -> "DataConverter[_api.DATA_API, pa.Table, pa.Schema]":

- if schema is None:
- DataConformance.check_duplicate_fields(table.schema.names, False)
- return table
+ if framework == _api.PANDAS:
+ if pandas is not None:
+ return PandasArrowConverter(framework, **framework_args)
+ else:
+ raise _ex.EPluginNotAvailable(f"Optional package [{framework}] is not installed")

- # If a schema has been supplied, apply data conformance
- # If column filtering has been applied, we also need to filter the pandas dtypes used for hinting
+ if framework == _api.POLARS:
+ if polars is not None:
+ return PolarsArrowConverter(framework)
+ else:
+ raise _ex.EPluginNotAvailable(f"Optional package [{framework}] is not installed")

- else:
- df_types = df.dtypes.filter(column_filter) if column_filter else df.dtypes
- return DataConformance.conform_to_schema(table, schema, df_types)
+ raise _ex.EPluginNotAvailable(f"Data framework [{framework}] is not recognized")

  @classmethod
- def pandas_to_arrow_schema(cls, df: "pandas.DataFrame") -> pa.Schema:
+ def for_dataset(cls, dataset: _api.DATA_API) -> "DataConverter[_api.DATA_API, pa.Table, pa.Schema]":

- return pa.Schema.from_pandas(df, preserve_index=False) # noqa
+ return cls.for_framework(cls.get_framework(dataset))

  @classmethod
- def polars_to_arrow(cls, df: "polars.DataFrame", schema: tp.Optional[pa.Schema] = None) -> pa.Table:
+ def noop(cls) -> "DataConverter[T_INTERNAL_DATA, T_INTERNAL_DATA, T_INTERNAL_SCHEMA]":
+ return NoopConverter()
+
+ def __init__(self, framework: _api.DataFramework[T_DATA_API]):
+ self.framework = framework
+
+ @abc.abstractmethod
+ def from_internal(self, dataset: T_INTERNAL_DATA, schema: tp.Optional[T_INTERNAL_SCHEMA] = None) -> T_DATA_API:
+ pass
+
+ @abc.abstractmethod
+ def to_internal(self, dataset: T_DATA_API, schema: tp.Optional[T_INTERNAL_SCHEMA] = None) -> T_INTERNAL_DATA:
+ pass
+
+ @abc.abstractmethod
+ def infer_schema(self, dataset: T_DATA_API) -> _meta.SchemaDefinition:
+ pass
+
+
+ class NoopConverter(DataConverter[T_INTERNAL_DATA, T_INTERNAL_DATA, T_INTERNAL_SCHEMA]):
+
+ def __init__(self):
+ super().__init__(_api.DataFramework("internal", None)) # noqa
+
+ def from_internal(self, dataset: T_INTERNAL_DATA, schema: tp.Optional[T_INTERNAL_SCHEMA] = None) -> T_DATA_API:
+ return dataset
+
+ def to_internal(self, dataset: T_DATA_API, schema: tp.Optional[T_INTERNAL_SCHEMA] = None) -> T_INTERNAL_DATA:
+ return dataset
+
+ def infer_schema(self, dataset: T_DATA_API) -> _meta.SchemaDefinition:
+ raise _ex.EUnexpected() # A real converter should be selected before use
+
+
+ # Data frameworks are optional, do not blow up the module just because one framework is unavailable!
+ if pandas is not None:

- column_filter = DataConformance.column_filter(df.columns, schema)
+ class PandasArrowConverter(DataConverter[pandas.DataFrame, pa.Table, pa.Schema]):

- filtered_df = df.select(polars.col(*column_filter)) if column_filter else df
- table = filtered_df.to_arrow()
+ # Check the Pandas dtypes for handling floats are available before setting up the type mapping
+ __PANDAS_VERSION_ELEMENTS = pandas.__version__.split(".")
+ __PANDAS_MAJOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[0])
+ __PANDAS_MINOR_VERSION = int(__PANDAS_VERSION_ELEMENTS[1])
+
+ if __PANDAS_MAJOR_VERSION == 2:
+
+ __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).as_unit(DataMapping.DEFAULT_TIMESTAMP_UNIT).dtype
+ __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(DataMapping.DEFAULT_TIMESTAMP_UNIT).dtype
+
+ @classmethod
+ def __pandas_datetime_type(cls, tz, unit):
+ if tz is None and unit is None:
+ return cls.__PANDAS_DATETIME_TYPE
+ _unit = unit if unit is not None else DataMapping.DEFAULT_TIMESTAMP_UNIT
+ if tz is None:
+ return pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).as_unit(_unit).dtype
+ else:
+ return pandas.DatetimeTZDtype(tz=tz, unit=_unit)
+
+ # Minimum supported version for Pandas is 1.2, when pandas.Float64Dtype was introduced
+ elif __PANDAS_MAJOR_VERSION == 1 and __PANDAS_MINOR_VERSION >= 2:
+
+ __PANDAS_DATE_TYPE = pandas.to_datetime([dt.date(2000, 1, 1)]).dtype
+ __PANDAS_DATETIME_TYPE = pandas.to_datetime([dt.datetime(2000, 1, 1, 0, 0, 0)]).dtype
+
+ @classmethod
+ def __pandas_datetime_type(cls, tz, unit): # noqa
+ if tz is None:
+ return cls.__PANDAS_DATETIME_TYPE
+ else:
+ return pandas.DatetimeTZDtype(tz=tz)

- if schema is None:
- DataConformance.check_duplicate_fields(table.schema.names, False)
- return table
  else:
- return DataConformance.conform_to_schema(table, schema, None)
+ raise _ex.EStartup(f"Pandas version not supported: [{pandas.__version__}]")
+
+ # Only partial mapping is possible, decimal and temporal dtypes cannot be mapped this way
+ __ARROW_TO_PANDAS_TYPE_MAPPING = {
+ pa.bool_(): pandas.BooleanDtype(),
+ pa.int8(): pandas.Int8Dtype(),
+ pa.int16(): pandas.Int16Dtype(),
+ pa.int32(): pandas.Int32Dtype(),
+ pa.int64(): pandas.Int64Dtype(),
+ pa.uint8(): pandas.UInt8Dtype(),
+ pa.uint16(): pandas.UInt16Dtype(),
+ pa.uint32(): pandas.UInt32Dtype(),
+ pa.uint64(): pandas.UInt64Dtype(),
+ pa.float16(): pandas.Float32Dtype(),
+ pa.float32(): pandas.Float32Dtype(),
+ pa.float64(): pandas.Float64Dtype(),
+ pa.string(): pandas.StringDtype(),
+ pa.utf8(): pandas.StringDtype()
+ }
+
+ __DEFAULT_TEMPORAL_OBJECTS = False
+
+ # Expose date type for testing
+ @classmethod
+ def pandas_date_type(cls):
+ return cls.__PANDAS_DATE_TYPE

- @classmethod
- def polars_to_arrow_schema(cls, df: "polars.DataFrame") -> pa.Schema:
+ # Expose datetime type for testing
+ @classmethod
+ def pandas_datetime_type(cls, tz=None, unit=None):
+ return cls.__pandas_datetime_type(tz, unit)
+
+ def __init__(self, framework: _api.DataFramework[T_DATA_API], use_temporal_objects: tp.Optional[bool] = None):
+ super().__init__(framework)
+ if use_temporal_objects is None:
+ self.__temporal_objects_flag = self.__DEFAULT_TEMPORAL_OBJECTS
+ else:
+ self.__temporal_objects_flag = use_temporal_objects
+
+ def from_internal(self, table: pa.Table, schema: tp.Optional[pa.Schema] = None) -> pandas.DataFrame:
+
+ if schema is not None:
+ table = DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
+ else:
+ DataConformance.check_duplicate_fields(table.schema.names, False)
+
+ # Use Arrow's built-in function to convert to Pandas
+ return table.to_pandas(
+
+ # Mapping for arrow -> pandas types for core types
+ types_mapper=self.__ARROW_TO_PANDAS_TYPE_MAPPING.get,
+
+ # Use Python objects for dates and times if temporal_objects_flag is set
+ date_as_object=self.__temporal_objects_flag, # noqa
+ timestamp_as_object=self.__temporal_objects_flag, # noqa
+
+ # Do not bring any Arrow metadata into Pandas dataframe
+ ignore_metadata=True, # noqa
+
+ # Do not consolidate memory across columns when preparing the Pandas vectors
+ # This is a significant performance win for very wide datasets
+ split_blocks=True) # noqa
+
+ def to_internal(self, df: pandas.DataFrame, schema: tp.Optional[pa.Schema] = None) -> pa.Table:
+
+ # Converting pandas -> arrow needs care to ensure type coercion is applied correctly
+ # Calling Table.from_pandas with the supplied schema will very often reject data
+ # Instead, we convert the dataframe as-is and then apply type conversion in a second step
+ # This allows us to apply specific coercion rules for each data type
+
+ # As an optimisation, the column filter means columns will not be converted if they are not needed
+ # E.g. if a model outputs lots of undeclared columns, there is no need to convert them
+
+ column_filter = DataConformance.column_filter(df.columns, schema) # noqa
+
+ if len(df) > 0:
+
+ table = pa.Table.from_pandas(df, columns=column_filter, preserve_index=False) # noqa
+
+ # Special case handling for converting an empty dataframe
+ # These must flow through the pipe with valid schemas, like any other dataset
+ # Type coercion and column filtering happen in conform_to_schema, if a schema has been supplied
+
+ else:
+
+ empty_df = df.filter(column_filter) if column_filter else df
+ empty_schema = pa.Schema.from_pandas(empty_df, preserve_index=False) # noqa
+
+ table = pa.Table.from_batches(list(), empty_schema) # noqa
+
+ # If there is no explict schema, give back the table exactly as it was received from Pandas
+ # There could be an option here to infer and coerce for TRAC standard types
+ # E.g. unsigned int 32 -> signed int 64, TRAC standard integer type
+
+ if schema is None:
+ DataConformance.check_duplicate_fields(table.schema.names, False)
+ return table
+
+ # If a schema has been supplied, apply data conformance
+ # If column filtering has been applied, we also need to filter the pandas dtypes used for hinting
+
+ else:
+ df_types = df.dtypes.filter(column_filter) if column_filter else df.dtypes
+ return DataConformance.conform_to_schema(table, schema, df_types)
+
+ def infer_schema(self, dataset: pandas.DataFrame) -> _meta.SchemaDefinition:
+
+ arrow_schema = pa.Schema.from_pandas(dataset, preserve_index=False) # noqa
+ return DataMapping.arrow_to_trac_schema(arrow_schema)
+
+
+ # Data frameworks are optional, do not blow up the module just because one framework is unavailable!
+ if polars is not None:
+
+ class PolarsArrowConverter(DataConverter[polars.DataFrame, pa.Table, pa.Schema]):
+
+ def __init__(self, framework: _api.DataFramework[T_DATA_API]):
+ super().__init__(framework)
+
+ def from_internal(self, table: pa.Table, schema: tp.Optional[pa.Schema] = None) -> polars.DataFrame:
+
+ if schema is not None:
+ table = DataConformance.conform_to_schema(table, schema, warn_extra_columns=False)
+ else:
+ DataConformance.check_duplicate_fields(table.schema.names, False)
+
+ return polars.from_arrow(table)
+
+ def to_internal(self, df: polars.DataFrame, schema: tp.Optional[pa.Schema] = None,) -> pa.Table:
+
+ column_filter = DataConformance.column_filter(df.columns, schema)
+
+ filtered_df = df.select(polars.col(*column_filter)) if column_filter else df
+ table = filtered_df.to_arrow()
+
+ if schema is None:
+ DataConformance.check_duplicate_fields(table.schema.names, False)
+ return table
+ else:
+ return DataConformance.conform_to_schema(table, schema, None)
+
+ def infer_schema(self, dataset: T_DATA_API) -> _meta.SchemaDefinition:

- return df.top_k(1).to_arrow().schema
+ arrow_schema = dataset.top_k(1).to_arrow().schema
+ return DataMapping.arrow_to_trac_schema(arrow_schema)


  class DataConformance:
@@ -652,7 +764,7 @@ class DataConformance:
  # Columns not defined in the schema will not be included in the conformed output
  if warn_extra_columns and table.num_columns > len(schema.types):

- schema_columns = set(map(str.lower, schema.names))
+ schema_columns = set(map(lambda c: c.lower(), schema.names))
  extra_columns = [
  f"[{col}]"
  for col in table.schema.names
tracdap/rt/_impl/ext/__init__.py ADDED
@@ -0,0 +1,14 @@
+ # Licensed to the Fintech Open Source Foundation (FINOS) under one or
+ # more contributor license agreements. See the NOTICE file distributed
+ # with this work for additional information regarding copyright ownership.
+ # FINOS licenses this file to you under the Apache License, Version 2.0
+ # (the "License"); you may not use this file except in compliance with the
+ # License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
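Note that the legacy DataMapping.arrow_to_pandas and DataMapping.pandas_to_arrow entry points are kept as thin shims over the new converters ("maintained temporarily for compatibility with existing deployments"), so existing callers should see equivalent results from either path. A hedged sketch of that equivalence, assuming pandas is installed; the example data is hypothetical and this remains internal API:

import pandas as pd

import tracdap.rt._impl.data as _data
import tracdap.rt.api.experimental as _api

df = pd.DataFrame({"flag": [True, False, True]})

# Legacy entry point, retained for existing deployments
legacy_table = _data.DataMapping.pandas_to_arrow(df)

# New entry point, which the legacy shim now delegates to
new_table = _data.PandasArrowConverter(_api.PANDAS).to_internal(df)

# Both paths should produce the same Arrow table
assert legacy_table.equals(new_table)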