teradataml 20.0.0.6__py3-none-any.whl → 20.0.0.7__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
- teradataml/README.md +210 -0
- teradataml/__init__.py +1 -1
- teradataml/_version.py +1 -1
- teradataml/analytics/analytic_function_executor.py +162 -76
- teradataml/analytics/byom/__init__.py +1 -1
- teradataml/analytics/json_parser/__init__.py +2 -0
- teradataml/analytics/json_parser/analytic_functions_argument.py +95 -2
- teradataml/analytics/json_parser/metadata.py +22 -4
- teradataml/analytics/sqle/DecisionTreePredict.py +3 -2
- teradataml/analytics/sqle/NaiveBayesPredict.py +3 -2
- teradataml/analytics/sqle/__init__.py +3 -0
- teradataml/analytics/utils.py +4 -1
- teradataml/automl/__init__.py +2369 -464
- teradataml/automl/autodataprep/__init__.py +15 -0
- teradataml/automl/custom_json_utils.py +184 -112
- teradataml/automl/data_preparation.py +113 -58
- teradataml/automl/data_transformation.py +154 -53
- teradataml/automl/feature_engineering.py +113 -53
- teradataml/automl/feature_exploration.py +548 -25
- teradataml/automl/model_evaluation.py +260 -32
- teradataml/automl/model_training.py +399 -206
- teradataml/clients/auth_client.py +2 -2
- teradataml/common/aed_utils.py +11 -2
- teradataml/common/bulk_exposed_utils.py +4 -2
- teradataml/common/constants.py +62 -2
- teradataml/common/garbagecollector.py +50 -21
- teradataml/common/messagecodes.py +47 -2
- teradataml/common/messages.py +19 -1
- teradataml/common/sqlbundle.py +23 -6
- teradataml/common/utils.py +116 -10
- teradataml/context/aed_context.py +16 -10
- teradataml/data/Employee.csv +5 -0
- teradataml/data/Employee_Address.csv +4 -0
- teradataml/data/Employee_roles.csv +5 -0
- teradataml/data/JulesBelvezeDummyData.csv +100 -0
- teradataml/data/byom_example.json +5 -0
- teradataml/data/creditcard_data.csv +284618 -0
- teradataml/data/docs/byom/docs/ONNXSeq2Seq.py +255 -0
- teradataml/data/docs/sqle/docs_17_10/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/NGramSplitter.py +1 -1
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +1 -1
- teradataml/data/jsons/byom/ONNXSeq2Seq.json +287 -0
- teradataml/data/jsons/sqle/20.00/AI_AnalyzeSentiment.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_AskLLM.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_DetectLanguage.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_ExtractKeyPhrases.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_MaskPII.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizeEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_RecognizePIIEntities.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextClassifier.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextEmbeddings.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextSummarize.json +3 -7
- teradataml/data/jsons/sqle/20.00/AI_TextTranslate.json +3 -7
- teradataml/data/jsons/sqle/20.00/TD_API_AzureML.json +151 -0
- teradataml/data/jsons/sqle/20.00/TD_API_Sagemaker.json +182 -0
- teradataml/data/jsons/sqle/20.00/TD_API_VertexAI.json +183 -0
- teradataml/data/load_example_data.py +29 -11
- teradataml/data/payment_fraud_dataset.csv +10001 -0
- teradataml/data/teradataml_example.json +67 -0
- teradataml/dataframe/copy_to.py +714 -54
- teradataml/dataframe/dataframe.py +1153 -33
- teradataml/dataframe/dataframe_utils.py +8 -3
- teradataml/dataframe/functions.py +168 -1
- teradataml/dataframe/setop.py +4 -1
- teradataml/dataframe/sql.py +141 -9
- teradataml/dbutils/dbutils.py +470 -35
- teradataml/dbutils/filemgr.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +456 -142
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/lib/libaed_0_1.dylib +0 -0
- teradataml/lib/libaed_0_1.so +0 -0
- teradataml/lib/libaed_0_1_aarch64.so +0 -0
- teradataml/scriptmgmt/UserEnv.py +234 -34
- teradataml/scriptmgmt/lls_utils.py +43 -17
- teradataml/sdk/_json_parser.py +1 -1
- teradataml/sdk/api_client.py +9 -6
- teradataml/sdk/modelops/_client.py +3 -0
- teradataml/series/series.py +12 -7
- teradataml/store/feature_store/constants.py +601 -234
- teradataml/store/feature_store/feature_store.py +2886 -616
- teradataml/store/feature_store/mind_map.py +639 -0
- teradataml/store/feature_store/models.py +5831 -214
- teradataml/store/feature_store/utils.py +390 -0
- teradataml/table_operators/table_operator_util.py +1 -1
- teradataml/table_operators/templates/dataframe_register.template +6 -2
- teradataml/table_operators/templates/dataframe_udf.template +6 -2
- teradataml/utils/docstring.py +527 -0
- teradataml/utils/dtypes.py +93 -0
- teradataml/utils/internal_buffer.py +2 -2
- teradataml/utils/utils.py +41 -2
- teradataml/utils/validators.py +694 -17
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/METADATA +213 -2
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/RECORD +96 -81
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.6.dist-info → teradataml-20.0.0.7.dist-info}/zip-safe +0 -0
teradataml/dataframe/copy_to.py
CHANGED
@@ -14,9 +14,11 @@ import pandas.api.types as pt
 
 from sqlalchemy import MetaData, Table, Column
 from sqlalchemy.exc import OperationalError as sqlachemyOperationalError
+from teradataml.dataframe.sql import ColumnExpression
 from teradatasqlalchemy import (INTEGER, BIGINT, BYTEINT, FLOAT)
-from teradatasqlalchemy import (TIMESTAMP)
+from teradatasqlalchemy import (TIMESTAMP, DATE)
 from teradatasqlalchemy import (VARCHAR)
+from teradatasqlalchemy import (PERIOD_DATE,PERIOD_TIMESTAMP)
 from teradatasqlalchemy.dialect import TDCreateTablePost as post
 from teradataml.common.aed_utils import AedUtils
 from teradataml.context.context import *
@@ -25,13 +27,15 @@ from teradataml.dataframe.dataframe_utils import DataFrameUtils as df_utils
 from teradataml.dbutils.dbutils import _rename_table
 from teradataml.common.utils import UtilFuncs
 from teradataml.options.configure import configure
-from teradataml.common.constants import CopyToConstants, PTITableConstants
+from teradataml.common.constants import CopyToConstants, PTITableConstants, TeradataTypes
 from teradatasql import OperationalError
 from teradataml.common.wrapper_utils import AnalyticsWrapperUtils
 from teradataml.utils.utils import execute_sql
 from teradataml.utils.validators import _Validators
 from teradataml.telemetry_utils.queryband import collect_queryband
+from teradatasqlalchemy.dialect import dialect as td_dialect
 
+from teradataml.utils.dtypes import _TupleOf
 
 @collect_queryband(queryband="CpToSql")
 def copy_to_sql(df, table_name,
@@ -48,7 +52,12 @@ def copy_to_sql(df, table_name,
                 seq_max=None,
                 set_table=False,
                 chunksize=CopyToConstants.DBAPI_BATCHSIZE.value,
-                match_column_order=True):
+                match_column_order=True,
+                partition_by=None,
+                partition_by_case=None,
+                partition_by_range=None,
+                sub_partition=None,
+                **kwargs):
     """
     Writes records stored in a Pandas DataFrame or a teradataml DataFrame to Teradata Vantage.
 
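Taken together, the new signature adds four partitioning arguments and routes the temporal arguments through **kwargs. A minimal sketch of the two new call patterns, condensed from the examples later in this diff (table and column names are illustrative; pandas_df is assumed to be a pandas DataFrame with the referenced columns):

    from teradataml.dataframe.copy_to import copy_to_sql

    # Partitioned table: the three partition_by* arguments are mutually
    # exclusive, and "primary_index" must accompany them.
    copy_to_sql(df=pandas_df, table_name='orders_part', if_exists='replace',
                primary_index=['order_id'], partition_by='order_id')

    # Temporal table: valid_time_columns/derived_column are keyword-only,
    # read out of **kwargs inside the function.
    copy_to_sql(df=pandas_df, table_name='orders_temporal',
                valid_time_columns=('period_start', 'period_end'))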
@@ -284,6 +293,68 @@ def copy_to_sql(df, table_name,
             Default Value: True
             Types: bool
 
+        partition_by:
+            Optional Argument.
+            Specifies the columns on which partition should be created while creating the table.
+            Note:
+                1. "partition_by", "partition_by_case" and "partition_by_range" are mutually exclusive.
+                2. "primary_index" should be specified when "partition_by" is used.
+                3. Not applicable for PTI tables.
+            Types: str or ColumnExpression
+
+        partition_by_case:
+            Optional Argument.
+            Specifies different cases to partition the index while creating table.
+            Note:
+                1. "partition_by", "partition_by_case" and "partition_by_range" are mutually exclusive.
+                2. "primary_index" should be specified when "partition_by_case" is used.
+                3. Not applicable for PTI tables.
+            Types: str or ColumnExpression or tuple of ColumnExpression, str
+
+        partition_by_range:
+            Optional Argument.
+            Specifies the range of values on which partition should be created while creating a table.
+            Note:
+                1. "partition_by", "partition_by_case" and "partition_by_range" are mutually exclusive.
+                2. "primary_index" should be specified when "partition_by_range" is used.
+                3. Not applicable for PTI tables.
+            Types: str or ColumnExpression
+
+        sub_partition:
+            Optional Argument.
+            Specifies the details to subpartition the main partition according to the value provided while creating the table.
+            Note:
+                1. "sub_partition" is applicable only when "partition_by_range" is specified.
+                2. Not applicable for PTI tables.
+            Types: int or Teradata Interval datatypes
+
+        **kwargs:
+            Optional keyword arguments.
+
+            valid_time_columns:
+                Optional Argument.
+                Specifies the name(s) of the valid time columns to be referred in "df".
+                When "valid_time_columns" is specified, then function considers
+                these columns as valid time dimension columns and creates a
+                valid time dimension temporal table if table does not exist.
+                Notes:
+                    * If a string is provided, the column must be of PERIOD type.
+                Types: tuple of strings or str
+
+            derived_column:
+                Optional Argument.
+                Specifies the name of the derived column to be kept in the temporal table.
+                Notes:
+                    * Argument is ignored if "valid_time_columns" are not specified.
+                    * Argument is considered only if copy_to_sql() is creating a table.
+                    * If "valid_time_columns" is specified and "derived_column" is not specified,
+                      then copy_to_sql() automatically creates a derived column by adding "_" between
+                      the columns mentioned in "valid_time_columns". For example,
+                      if "valid_time_columns" is ('col1', 'col2') and "derived_column"
+                      is not specified, then copy_to_sql() creates table with
+                      derived column name as 'col1_col2'.
+                Types: str
+
     RETURNS:
         None
 
@@ -305,32 +376,32 @@ def copy_to_sql(df, table_name,
         >>> pandas_df = pd.DataFrame(df)
 
         a) Save a Pandas DataFrame using a dataframe & table name only:
-        >>> copy_to_sql(df
+        >>> copy_to_sql(df=pandas_df, table_name='my_table')
 
         b) Saving as a SET table
-        >>> copy_to_sql(df
+        >>> copy_to_sql(df=pandas_df, table_name='my_set_table', index=True,
                         primary_index='index_label', set_table=True)
 
         c) Save a Pandas DataFrame by specifying additional parameters:
-        >>> copy_to_sql(df
-        ... index
-        ... primary_index
-        ... types
-        ...
+        >>> copy_to_sql(df=pandas_df, table_name='my_table_2', schema_name='alice',
+        ...             index=True, index_label='my_index_label', temporary=False,
+        ...             primary_index=['emp_id'], if_exists='append',
+        ...             types={'emp_name': VARCHAR, 'emp_sage':INTEGER,
+        ...                    'emp_id': BIGINT, 'marks': DECIMAL})
 
         d) Saving with additional parameters as a SET table
-        >>> copy_to_sql(df
-        ... index
-        ... primary_index
-        ... types
-        ...
+        >>> copy_to_sql(df=pandas_df, table_name='my_table_3', schema_name='alice',
+        ...             index=True, index_label='my_index_label', temporary=False,
+        ...             primary_index=['emp_id'], if_exists='append',
+        ...             types={'emp_name': VARCHAR, 'emp_sage':INTEGER,
+        ...                    'emp_id': BIGINT, 'marks': DECIMAL},
         ...             set_table=True)
 
         e) Saving levels in index of type MultiIndex
         >>> pandas_df = pandas_df.set_index(['emp_id', 'emp_name'])
-        >>> copy_to_sql(df
-        ... index
-        ... primary_index
+        >>> copy_to_sql(df=pandas_df, table_name='my_table_4', schema_name='alice',
+        ...             index=True, index_label=['index1', 'index2'], temporary=False,
+        ...             primary_index=['index1'], if_exists = 'replace')
 
         f) Save a Pandas DataFrame with VECTOR datatype:
         >>> import pandas as pd
@@ -343,6 +414,68 @@ def copy_to_sql(df, table_name,
         >>> from teradatasqlalchemy import VECTOR
         >>> copy_to_sql(df=df, table_name='my_vector_table', types={'array_col': VECTOR})
 
+        g) Saving pandas DataFrame with partition_by:
+        >>> copy_to_sql(df=pandas_df, table_name='my_table_5', if_exists='replace',
+        ...             primary_index=['emp_id'],
+        ...             partition_by='emp_id')
+
+        h) Saving pandas DataFrame with partition_by_case:
+        >>> copy_to_sql(df=pandas_df, table_name='my_table_6', if_exists='replace',
+        ...             primary_index=['emp_id'],
+        ...             partition_by_case='emp_id > 100, emp_id < 500')
+
+        i) Saving pandas DataFrame with partition_by_range:
+        >>> copy_to_sql(df=pandas_df, table_name='my_table_7', if_exists='replace',
+        ...             primary_index=['emp_id'],
+        ...             partition_by_range='emp_id BETWEEN 100 AND 500')
+
+
+        j) Save a Pandas DataFrame with valid time columns of DATE type to a temporal table.
+        >>> import pandas as pd
+        >>> from teradataml.dataframe.copy_to import copy_to_sql
+        >>> df = pd.DataFrame({
+        ...     'id': [1, 2, 3],
+        ...     'start_date': pd.to_datetime(['2024-01-01', '2024-02-01', '2024-03-01']).date,
+        ...     'end_date': pd.to_datetime(['2024-01-10', '2024-02-10', '2024-03-10']).date,
+        ...     'description': ['a', 'b', 'c']
+        ... })
+        >>> copy_to_sql(
+        ...     df=df,
+        ...     table_name='temporal_table_pandas_date',
+        ...     valid_time_columns=('start_date', 'end_date')
+        ... )
+
+        k) Save a Pandas DataFrame with valid time columns of TIMESTAMP type
+           to a temporal table. Name the derived column as `valid_time`.
+        >>> import pandas as pd
+        >>> from teradataml.dataframe.copy_to import copy_to_sql
+        >>> df = pd.DataFrame({
+        ...     'id': [1, 2, 3],
+        ...     'start_time': pd.to_datetime(['2024-01-01 10:00:00', '2024-02-01 11:00:00', '2024-03-01 12:00:00']),
+        ...     'end_time': pd.to_datetime(['2024-01-01 12:00:00', '2024-02-01 13:00:00', '2024-03-01 14:00:00']),
+        ...     'description': ['a', 'b', 'c']
+        ... })
+        >>> copy_to_sql(
+        ...     df=df,
+        ...     table_name='temporal_table_pandas_timestamp',
+        ...     valid_time_columns=('start_time', 'end_time'),
+        ...     derived_column='valid_time'
+        ... )
+
+        l) Save a teradataml DataFrame with valid time column of PERIOD type to a temporal table.
+        >>> from teradataml.dataframe.dataframe import DataFrame
+        >>> from teradataml.dataframe.copy_to import copy_to_sql
+        >>> from teradataml.data.load_example_data import load_example_data
+        >>> load_example_data("teradataml", "Employee_roles")
+        >>> from teradatasqlalchemy.types import PERIOD_DATE
+        >>> df = DataFrame('Employee_roles')
+        >>> copy_to_sql(
+        ...     df,
+        ...     table_name = 'employee_roles_temporal',
+        ...     valid_time_columns='role_validity_period',
+        ...     types={'role_validity_period':PERIOD_DATE}
+        ... )
+
     2. Saving a teradataml DataFrame:
 
         >>> from teradataml.dataframe.dataframe import DataFrame
@@ -368,14 +501,62 @@ def copy_to_sql(df, table_name,
         >>> copy_to_sql(df2, 'my_tdml_table_2')
 
         d) Save a teradataml DataFrame by using copy_to_sql with additional parameters:
-        >>> copy_to_sql(df
-        ... temporary
-        ... types
+        >>> copy_to_sql(df=df2, table_name='my_tdml_table_3', schema_name='alice',
+        ...             temporary=False, primary_index=None, if_exists='append',
+        ...             types={'masters': VARCHAR, 'gpa':INTEGER})
 
         e) Saving as a SET table
-        >>> copy_to_sql(df = df2, table_name = 'my_tdml_set_table', schema_name
-        ... temporary
-        ... types
+        >>> copy_to_sql(df = df2, table_name = 'my_tdml_set_table', schema_name='alice',
+        ...             temporary=False, primary_index=['gpa'], if_exists='append',
+        ...             types={'masters': VARCHAR, 'gpa':INTEGER}, set_table = True)
+
+        f) Saving a teradataml DataFrame into a table by partitioning the table with column 'gpa':
+        >>> copy_to_sql(df=df, table_name='my_tdml_table_4', if_exists='replace',
+        ...             primary_index=['gpa'],
+        ...             partition_by=df.gpa)
+
+        g) Saving a teradataml DataFrame into a table with two partitions as below:
+        >>> copy_to_sql(df=df, table_name='my_tdml_table_5', if_exists='replace',
+        ...             primary_index=['id'],
+        ...             partition_by_case=(df.id < 100, df.gpa < 5.0))
+
+        h) Saving a teradataml DataFrame into a table by partitioning the table with different ranges:
+        >>> copy_to_sql(df=df, table_name='my_tdml_table_6', if_exists='replace',
+        ...             primary_index=['id'],
+        ...             partition_by_range=df.id.between(1, 100))
+
+        i) Saving a teradataml DataFrame into a table by partitioning the table with different ranges.
+           Also sub-partitioning based on INTERVAL:
+        >>> load_example_data("dataframe", "sales")
+        >>> df = DataFrame('sales')
+        >>> from teradatasqlalchemy import INTERVAL_DAY
+        >>> copy_to_sql(df=df, table_name='my_tdml_table_7', if_exists='replace',
+        ...             primary_index="Feb",
+        ...             partition_by_range=df.datetime.between('2017-01-01', '2017-01-31'),
+        ...             sub_partition=INTERVAL_DAY(1))
+
+        j) Save a teradataml DataFrame with valid time columns of DATE type to a temporal table.
+        >>> pdf = pd.DataFrame({
+        ...     'id': [1, 2, 3],
+        ...     'start_date': pd.to_datetime(['2024-01-01', '2024-02-01', '2024-03-01']).date,
+        ...     'end_date': pd.to_datetime(['2024-01-10', '2024-02-10', '2024-03-10']).date,
+        ...     'description': ['a', 'b', 'c']
+        ... })
+        >>> df_temporal = DataFrame(data = pdf)
+        >>> copy_to_sql(df=df_temporal, table_name='temporal_table_tdml_date',
+        ...             valid_time_columns=('start_date', 'end_date'))
+
+        k) Save a teradataml DataFrame with valid time columns of TIMESTAMP type
+           to a temporal table. Name the derived column as `validity_period`.
+        >>> df_temporal_ts = DataFrame(data = pd.DataFrame({
+        ...     'id': [1, 2, 3],
+        ...     'start_time': pd.to_datetime(['2024-01-01 10:00:00', '2024-02-01 11:00:00', '2024-03-01 12:00:00']),
+        ...     'end_time': pd.to_datetime(['2024-01-01 12:00:00', '2024-02-01 13:00:00', '2024-03-01 14:00:00']),
+        ...     'description': ['a', 'b', 'c']
+        ... }))
+        >>> copy_to_sql(df=df_temporal_ts, table_name='temporal_table_tdml_timestamp',
+        ...             valid_time_columns=('start_time', 'end_time'), derived_column='validity_period')
+
 
     3. Saving a teradataml DataFrame as a PTI table:
@@ -403,6 +584,10 @@ def copy_to_sql(df, table_name,
         ...             set_table=True)
 
     """
+    # Accept valid_time_columns and derived_column from kwargs
+    valid_time_columns = kwargs.get("valid_time_columns", None)
+    derived_column = kwargs.get("derived_column", None)
+
     # Deriving global connection using get_connection().
     con = get_connection()
 
@@ -460,6 +645,12 @@ def copy_to_sql(df, table_name,
 
     dt_obj._validate()
 
+    # Validate partition arguments
+    _validate_partition_arguments(partition_by=partition_by,
+                                  partition_by_case=partition_by_case,
+                                  partition_by_range=partition_by_range,
+                                  sub_partition=sub_partition)
+
     # If the table created must be a PTI table, then validate additional parameters
     # Note that if the required parameters for PTI are valid, then other parameters, though being validated,
     # will be ignored - for example, primary_index
@@ -473,6 +664,13 @@ def copy_to_sql(df, table_name,
         raise TeradataMlException(Messages.get_message(MessageCodes.SET_TABLE_NO_PI),
                                   MessageCodes.SET_TABLE_NO_PI)
 
+    # Check whether valid time columns are passed to consider it as temporal table.
+    is_temporal = False
+    if valid_time_columns is not None:
+        _validate_valid_time_columns(df, valid_time_columns, derived_column, types)
+        is_temporal = True
+
+
     # Check if destination table exists
     table_exists = dt_obj._table_exists(con)
 
@@ -503,35 +701,49 @@ def copy_to_sql(df, table_name,
     # failing with Blank name in quotation mark. Hence, extracted only the table name.
     table_name = UtilFuncs._extract_table_name(table_name)
 
+    partition_exp, partition_func = _build_partition_expression(partition_by=partition_by,
+                                                                partition_by_case=partition_by_case,
+                                                                partition_by_range=partition_by_range,
+                                                                sub_partition=sub_partition)
+
     # Let's create the SQLAlchemy table object to recreate the table
     if not table_exists or if_exists.lower() == 'replace':
-        if
-
-
-
-
-        table = _create_pti_table_object(df, con, table_name, schema_name, temporary,
-                                         primary_time_index_name, timecode_column, timezero_date,
-                                         timebucket_duration, sequence_column, seq_max,
-                                         columns_list, set_table, types,
-                                         None if not is_pandas_df else index,
-                                         None if not is_pandas_df else index_label)
-
-        if table is not None:
-            # If the table need to be replaced and there is no table name conflict,
-            # let's drop the existing table first
-            if table_exists and not is_conflict:
-                tbl_name = dt_obj._get_fully_qualified_table_name()
-                UtilFuncs._drop_table(tbl_name)
-            try:
-                table.create(bind=get_context())
-            except sqlachemyOperationalError as err:
-                raise TeradataMlException(Messages.get_message(MessageCodes.TABLE_OBJECT_CREATION_FAILED) +
-                                          '\n' + str(err),
-                                          MessageCodes.TABLE_OBJECT_CREATION_FAILED)
+        if is_temporal:
+            _create_temporal_table(df, table_name, con, primary_index,
+                                   schema_name, valid_time_columns, derived_column,
+                                   types, None if not is_pandas_df else index,
+                                   None if not is_pandas_df else index_label)
         else:
-
-
+            if is_pti:
+                table = _create_pti_table_object(df, con, table_name, schema_name, temporary,
+                                                 primary_time_index_name, timecode_column, timezero_date,
+                                                 timebucket_duration, sequence_column, seq_max,
+                                                 columns_list, set_table, types,
+                                                 None if not is_pandas_df else index,
+                                                 None if not is_pandas_df else index_label)
+            else:
+                table = _create_table_object(df, table_name, con, primary_index, temporary, schema_name, set_table,
+                                             types, None if not is_pandas_df else index,
+                                             None if not is_pandas_df else index_label,
+                                             partition_expression=partition_exp,
+                                             partition_function=partition_func
+                                             )
+
+            if table is not None:
+                # If the table need to be replaced and there is no table name conflict,
+                # let's drop the existing table first
+                if table_exists and not is_conflict:
+                    tbl_name = dt_obj._get_fully_qualified_table_name()
+                    UtilFuncs._drop_table(tbl_name)
+                try:
+                    table.create(bind=get_context())
+                except sqlachemyOperationalError as err:
+                    raise TeradataMlException(Messages.get_message(MessageCodes.TABLE_OBJECT_CREATION_FAILED) +
+                                              '\n' + str(err),
+                                              MessageCodes.TABLE_OBJECT_CREATION_FAILED)
+            else:
+                raise TeradataMlException(Messages.get_message(MessageCodes.TABLE_OBJECT_CREATION_FAILED),
+                                          MessageCodes.TABLE_OBJECT_CREATION_FAILED)
 
     # Check column compatibility for insertion when table exists and if_exists = 'append'
     if table_exists and if_exists.lower() == 'append':
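The rewritten block above gives temporal creation precedence over PTI, and only the non-temporal branches go through a SQLAlchemy Table object (the temporal path issues its CREATE TABLE DDL directly). A toy, runnable restatement of that precedence (not the library's code):

    def choose_creation_path(is_temporal, is_pti):
        # Mirrors the branch order above: temporal wins over PTI, PTI over plain.
        if is_temporal:
            return "_create_temporal_table"    # executes CREATE TABLE DDL itself
        if is_pti:
            return "_create_pti_table_object"  # returns a SQLAlchemy Table
        return "_create_table_object"          # plain table, optional PARTITION BY

    assert choose_creation_path(True, True) == "_create_temporal_table"
    assert choose_creation_path(False, False) == "_create_table_object"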
@@ -549,7 +761,7 @@ def copy_to_sql(df, table_name,
             cols, _ = df_utils._get_column_names_and_types_from_metaexpr(df._metaexpr)
         if match_column_order:
             cols_compatible = _check_columns_insertion_compatible(table.c, cols, is_pandas_df,
-                                                                  is_pti, timecode_column, sequence_column)
+                                                                  is_pti, timecode_column, sequence_column, derived_column)
 
         if not cols_compatible:
             raise TeradataMlException(Messages.get_message(MessageCodes.INSERTION_INCOMPATIBLE),
@@ -746,6 +958,143 @@ def _get_index_labels(df, index_label):
 
     return ind_names, ind_types
 
+def _validate_partition_arguments(partition_by=None,
+                                  partition_by_case=None,
+                                  partition_by_range=None,
+                                  sub_partition=None):
+    """
+    Internal function to validate the partition_by arguments.
+
+    PARAMETERS:
+        partition_by:
+            Optional argument.
+            Specifies the columns on which PARTITION BY should be created.
+            Types: str or ColumnExpression
+
+        partition_by_case:
+            Optional argument.
+            Specifies different cases to partition the index.
+            Types: str or ColumnExpression or tuple of ColumnExpression, str
+
+        partition_by_range:
+            Optional argument.
+            Specifies the range of values of Date columns on which partition to be created.
+            Types: str or ColumnExpression
+
+        sub_partition:
+            Optional argument.
+            Specifies the details to subpartition the main partition according to the value provided.
+            Types: int or Teradata Interval datatypes
+
+    RETURNS:
+        None
+
+    RAISES:
+        TeradataMlException
+
+    EXAMPLES:
+        >>> _validate_partition_arguments(partition_by='col1')
+        >>> _validate_partition_arguments(partition_by_case=(df.col1 < 100, df.col1 < 1000))
+    """
+    # Validate partition_by argument
+    arg_matrix = []
+    arg_matrix.append(['partition_by', partition_by, True, (str, ColumnExpression), True])
+    arg_matrix.append(['partition_by_case', partition_by_case, True, (ColumnExpression, str, _TupleOf((str, ColumnExpression))), True])
+    arg_matrix.append(['partition_by_range', partition_by_range, True, (ColumnExpression, str), True])
+    arg_matrix.append(['sub_partition', sub_partition, True, (int, TeradataTypes.TD_RANGE_N_CLAUSE_TYPES.value), True])
+
+    # Validate argument types
+    _Validators._validate_function_arguments(arg_matrix)
+
+    # Validate mutually exclusive arguments
+    _Validators._validate_mutually_exclusive_argument_groups({"partition_by":partition_by},
+                                                             {"partition_by_case":partition_by_case},
+                                                             {"partition_by_range":partition_by_range})
+
+def _build_partition_expression(partition_by=None,
+                                partition_by_case=None,
+                                partition_by_range=None,
+                                sub_partition=None):
+    """
+    DESCRIPTION:
+        Internal function to build the partitioning expression for the table.
+
+    PARAMETERS:
+        partition_by:
+            Optional argument.
+            Specifies the columns on which PARTITION BY should be created.
+            Types: str or ColumnExpression
+
+        partition_by_case:
+            Optional argument.
+            Specifies different cases to partition the index.
+            Types: str or ColumnExpression or tuple of ColumnExpression, str
+
+        partition_by_range:
+            Optional argument.
+            Specifies the range of values of Date columns on which partition to be created.
+            Types: str or ColumnExpression
+
+        sub_partition:
+            Optional argument.
+            Specifies the details to subpartition the main partition according to the value provided.
+            Types: int or Teradata Interval datatypes
+
+    RAISES:
+        None
+
+    RETURNS:
+        Strings containing the partitioning expression and partition function.
+
+    EXAMPLES:
+        >>> _build_partition_expression(partition_by='col1')
+        >>> _build_partition_expression(partition_by_case=(df.col1 < 100, df.col1 < 1000))
+    """
+    partition_exp = None
+    partition_fn = None
+    # Check if partition_by expression is a ColumnExpression,
+    # if so, compile it to a string
+    if partition_by:
+        partition_exp = partition_by.compile() if isinstance(partition_by, ColumnExpression) \
+            else partition_by
+
+    # Check if partition_by_case is a ColumnExpression or string,
+    # if string, join to partition_by expression
+    # if ColumnExpression, compile it to a string and join to partition_by expression
+    # if tuple, compile each expression to a string and join to partition_by expression
+    if partition_by_case:
+        partition_fn = "CASE_N"
+        partition_by_case = [partition_by_case] if isinstance(partition_by_case, (str, ColumnExpression)) \
+            else partition_by_case
+        partition_exp = "{}, NO CASE, UNKNOWN".format(
+            ", ".join(str(exp.compile()) if isinstance(exp, ColumnExpression) else str(exp)
+                      for exp in partition_by_case))
+
+    # Check if partition_by_range is a ColumnExpression or string,
+    # if so, compile it to a string
+    if partition_by_range:
+        partition_fn = "RANGE_N"
+        sub_partition_clause = ""
+        if isinstance(partition_by_range, ColumnExpression):
+            partition_by_range = partition_by_range.compile()
+
+        # Check if sub_partition is provided,
+        # if so, compile the EACH clause for RANGE_N.
+        # If sub_partition is an int, then convert to string and add to the clause.
+        # If sub_partition is a TeradataTypes.TD_RANGE_N_CLAUSE_TYPES,
+        # convert to string and extract the precision and add to the clause.
+        if sub_partition:
+            sub_partition_clause = (
+                f" EACH {str(sub_partition)}"
+                if isinstance(sub_partition, int)
+                else f" EACH INTERVAL '{sub_partition.precision}' {str(sub_partition).split(maxsplit=1)[1]}")
+
+        partition_exp = "{0}{1}".format(partition_by_range, sub_partition_clause)
+    # Return partition_by expression and partition function
+    return partition_exp, partition_fn
+
 
 def _validate_pti_copy_parameters(df, timecode_column, timebucket_duration,
                                   timezero_date, primary_time_index_name, columns_list,
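Reading the helper's logic off directly, string inputs (which skip the ColumnExpression.compile() path) should yield the following pairs. These are traced by hand from the code above, not captured from a live session:

    _build_partition_expression(partition_by="emp_id")
    # -> ("emp_id", None)

    _build_partition_expression(partition_by_case=("emp_id > 100", "emp_id < 500"))
    # -> ("emp_id > 100, emp_id < 500, NO CASE, UNKNOWN", "CASE_N")

    _build_partition_expression(partition_by_range="emp_id BETWEEN 1 AND 1000",
                                sub_partition=100)
    # -> ("emp_id BETWEEN 1 AND 1000 EACH 100", "RANGE_N")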
@@ -1010,7 +1359,7 @@ def _validate_column_type(df, col, col_arg, expected_types, types = None, index
 
 
 def _create_table_object(df, table_name, con, primary_index, temporary, schema_name, set_table, types, index=None,
-                         index_label=None):
+                         index_label=None, partition_expression=None, partition_function=None):
     """
     This is an internal function used to construct a SQLAlchemy Table Object.
     This function checks appropriate flags and supports creation of Teradata
@@ -1041,6 +1390,12 @@ def _create_table_object(df, table_name, con, primary_index, temporary, schema_n
         When True, an attempt to create a SET table is made.
         When False, an attempt to create a MULTISET table is made.
 
+    partition_expression:
+        Specifies the partitioning expression to be used for partition by clause.
+
+    partition_function:
+        Specifies the partitioning function to be used with partition by clause.
+
     types:
         Specifies a python dictionary with column-name(key) to column-type(value) mapping to create DataFrames.
 
@@ -1097,6 +1452,11 @@ def _create_table_object(df, table_name, con, primary_index, temporary, schema_n
     else:
         pti = pti.no_primary_index()
 
+    # Partitioning expression and function
+    if partition_expression:
+        pti = pti.partition_by(partition_expression=partition_expression,
+                               partition_fn=partition_function)
+
     # Create default Table construct with parameter dictionary
     table = Table(table_name, meta,
                   *(Column(col_name, col_type)
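With pti.partition_by() wired in, the expression and function from _build_partition_expression end up in the CREATE TABLE postfix. The exact rendering is owned by teradatasqlalchemy's TDCreateTablePost rather than shown in this diff, but the intended DDL tail has the standard Teradata shape; an assumed example for the RANGE_N case above, not captured from a live system:

    # Assumed final clause of the generated CREATE TABLE statement:
    "PRIMARY INDEX (emp_id) PARTITION BY RANGE_N(emp_id BETWEEN 1 AND 1000 EACH 100)"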
@@ -1243,6 +1603,142 @@ def _create_pti_table_object(df, con, table_name, schema_name, temporary, primar
 
     return table
 
+def _create_temporal_table(df, table_name, con, primary_index, schema_name,
+                           valid_time_columns, derived_column, types, index=None, index_label=None):
+    """
+    This is an internal function used to construct a CREATE TABLE statement for a Teradata temporal table.
+    Supports creation of tables with a PERIOD FOR derived column using the specified valid time columns.
+
+    PARAMETERS:
+        df:
+            Required Argument.
+            The teradataml or Pandas DataFrame object to be saved.
+            Types: pandas.DataFrame or teradataml.dataframe.dataframe.DataFrame
+
+        table_name:
+            Required Argument.
+            Name of SQL table.
+            Types: String
+
+        con:
+            Optional Argument.
+            A SQLAlchemy connectable (engine/connection) object.
+            Types: SQLAlchemy Engine or Connection
+
+        primary_index:
+            Optional Argument.
+            Creates Teradata Table(s) with Primary index column if specified.
+            Types: String or list of Strings
+
+        schema_name:
+            Optional Argument.
+            Specifies the name of the SQL schema in the database to write to.
+            Types: String
+
+        valid_time_columns:
+            Required Argument.
+            Specifies a tuple of two column names representing the temporal validity period.
+            Types: tuple of Strings or str
+
+        derived_column:
+            Optional Argument.
+            Specifies the name of the derived PERIOD FOR column to be created.
+            Types: String
+
+        types:
+            Optional Argument.
+            Specifies a python dictionary with column-name(key) to column-type(value) mapping to create DataFrames.
+            Types: dict
+
+        index:
+            Optional Argument.
+            Flag specifying whether to write Pandas DataFrame index as a column(s) or not.
+            Types: Boolean
+
+        index_label:
+            Optional Argument.
+            Column label(s) for index column(s).
+            Types: String or list of Strings
+
+    RETURNS:
+        None
+
+    RAISES:
+        TeradataMlException
+
+    EXAMPLES:
+        _create_temporal_table(
+            df=my_df,
+            table_name='temporal_table',
+            con=td_connection,
+            primary_index=['id'],
+            schema_name='my_schema',
+            valid_time_columns=('start_date', 'end_date'),
+            derived_column='validity_period',
+            types={'id': INTEGER, 'start_date': DATE, 'end_date': DATE},
+            index=False,
+            index_label=None
+        )
+
+    """
+
+    # Extract column names and types
+    if isinstance(df, pd.DataFrame):
+        col_names, col_types = _extract_column_info(df, types, index, index_label)
+    else:
+        col_names, col_types = df_utils._get_column_names_and_types_from_metaexpr(df._metaexpr)
+        if types is not None:
+            col_types = [types.get(col_name, col_type) for col_name, col_type in zip(col_names, col_types)]
+
+    columns_clause_ = []
+    # Ensure all col_types are instances, not classes
+    for i, col_type in enumerate(col_types):
+        if isinstance(col_type, type):
+            col_types[i] = col_type()
+    # Use col_names and col_types to build the columns clause.
+    # Compile column types to string using the dialect of the current connection.
+    # Add NOT NULL to valid_time_columns.
+    for col_name, col_type in zip(col_names, col_types):
+        col_def = '{} {}'.format(col_name, col_type.compile(dialect=td_dialect()))
+
+        if col_name in valid_time_columns:
+            col_def += ' NOT NULL'
+            if isinstance(col_type, (PERIOD_DATE, PERIOD_TIMESTAMP)):
+                col_def += ' AS VALIDTIME'
+        columns_clause_.append(col_def)
+
+    period_for_clause = []
+    if isinstance(valid_time_columns, tuple):
+        if derived_column is None:
+            derived_column = "_".join(valid_time_columns)
+        period_for_clause = ['PERIOD FOR {} ({}, {}) AS VALIDTIME'.format(
+            derived_column, valid_time_columns[0], valid_time_columns[1])
+        ]
+    columns_clause = ",\n ".join(columns_clause_ + period_for_clause)
+
+    # Prepare primary index clause.
+    if primary_index:
+        primary_index_clause = "PRIMARY INDEX ({})".format(
+            ", ".join(UtilFuncs._as_list(primary_index)))
+    else:
+        primary_index_clause = ""
+
+    # Prepare create table statement.
+    table_name = UtilFuncs._get_qualified_table_name(schema_name, table_name) if\
+        schema_name else table_name
+    sql = """
+    CREATE MULTISET TABLE {}
+    (\n{}\n)\n{}
+    """.format(table_name, columns_clause, primary_index_clause)
+    try:
+        execute_sql(sql)
+    except Exception as err:
+        raise TeradataMlException(
+            Messages.get_message(MessageCodes.TABLE_OBJECT_CREATION_FAILED) +
+            '\n' + str(err),
+            MessageCodes.TABLE_OBJECT_CREATION_FAILED
+        )
+
 
 def _rename_column(col_names, search_for, rename_to):
     """
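Tracing the format strings above for valid_time_columns=('start_date', 'end_date') with no derived_column, the statement handed to execute_sql() should look roughly like the sketch below. Whitespace is approximate, the non-period column types are illustrative, and the derived name defaults to 'start_date_end_date':

    expected_sql = """
    CREATE MULTISET TABLE temporal_table
    (
    id INTEGER,
    start_date DATE NOT NULL,
    end_date DATE NOT NULL,
    description VARCHAR(1024),
    PERIOD FOR start_date_end_date (start_date, end_date) AS VALIDTIME
    )
    PRIMARY INDEX (id)
    """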
@@ -1370,7 +1866,7 @@ def _reorder_insert_list_for_pti(df_column_list, timecode_column, sequence_colum
 
 
 def _check_columns_insertion_compatible(table1_col_object, table2_cols, is_pandas_df=False,
-                                        is_pti=False, timecode_column=None, sequence_column=None):
+                                        is_pti=False, timecode_column=None, sequence_column=None, derived_column=None):
     """
     Internal function used to extract column information from two lists of SQLAlchemy ColumnExpression objects;
     and check if the number of columns and their names are matching to determine table insertion compatibility.
@@ -1394,11 +1890,15 @@ def _check_columns_insertion_compatible(table1_col_object, table2_cols, is_panda
         timecode_column:
             timecode_column required to order the select expression for the insert.
             It should be the first column in the select expression.
-
+
         sequence_column:
             sequence_column required to order the select expression for the insert.
             It should be the second column in the select expression.
 
+        derived_column:
+            Specifies a derived column that is part of the table schema but not
+            part of insert.
+            Types: String
+
     RETURNS:
         a) True, when insertion compatible (number of columns and their names match)
@@ -1410,11 +1910,16 @@ def _check_columns_insertion_compatible(table1_col_object, table2_cols, is_panda
     EXAMPLES:
         _check_columns_insertion_compatible(table1.c, ['co1', 'col2'], False)
         _check_columns_insertion_compatible(table1.c, (['co1', 'col2'], [int, str]), True, True, 'ts', 'seq')
+        _check_columns_insertion_compatible(table1.c, (['co1', 'col2'], [int, str]), True, True, 'ts', 'seq', 'derived_col')
 
     """
     table1_col_names, _ = UtilFuncs._extract_table_object_column_info(table1_col_object)
     table2_col_names = table2_cols[0] if is_pandas_df else table2_cols
 
+    # Remove derived_column from table1_col_names if specified
+    if derived_column is not None and derived_column in table1_col_names:
+        table1_col_names.remove(derived_column)
+
     # Check for number of columns
     if len(table1_col_names) != len(table2_col_names):
         return False
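Why the removal matters: a temporal target table carries one extra PERIOD FOR column that the incoming frame does not. A runnable miniature of the check (names illustrative):

    table_cols = ['id', 'start_date', 'end_date', 'start_date_end_date']  # target table
    df_cols = ['id', 'start_date', 'end_date']                            # frame being appended
    table_cols.remove('start_date_end_date')   # derived_column, as in the hunk above
    assert len(table_cols) == len(df_cols)     # append now passes the count check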
@@ -1783,3 +2288,158 @@ def _validate_timezero_date(timezero_date):
 
     # Looks like the value is valid
     return True
+
+def _validate_valid_time_columns(df, valid_time_columns, derived_column=None, types=None):
+    """
+    Internal function to validate that the columns specified in valid_time_columns
+    exist in the DataFrame, are of type DATE or TIMESTAMP, and are of the same type.
+    Also checks that the derived_column, if specified, is not present in the DataFrame.
+
+    PARAMETERS:
+        df:
+            Required Argument.
+            Specifies the Pandas or teradataml DataFrame object to be validated.
+            Types: pandas.DataFrame or teradataml.dataframe.dataframe.DataFrame
+
+        valid_time_columns:
+            Required Argument.
+            Specifies a tuple of two column names representing the temporal validity period.
+            Types: tuple of Strings
+
+        derived_column:
+            Optional Argument.
+            Specifies the name of the derived column that should not be
+            present in the DataFrame.
+            Types: String
+
+        types:
+            Optional Argument.
+            Specifies a python dictionary with column-name(key) to column-type(value)
+            mapping to create DataFrames.
+            Types: dict
+
+    RETURNS:
+        None
+
+    RAISES:
+        TeradataMlException
+
+    EXAMPLES:
+        _validate_valid_time_columns(
+            df=my_df,
+            valid_time_columns=('start_date', 'end_date'),
+            derived_column='validity_period',
+            types={'start_date': DATE, 'end_date': DATE}
+        )
+    """
+    df_columns = _get_pd_df_column_names(df) if isinstance(df, pd.DataFrame) else df.columns
+    df_dtypes = (
+        {
+            col: _get_sqlalchemy_mapping_types(str(df.dtypes[col]))
+            for col in df.dtypes.keys()
+        }
+        if isinstance(df, pd.DataFrame)
+        else df._td_column_names_and_sqlalchemy_types
+    )
+    # If types argument is provided, override the dtypes for those columns
+    if types is not None:
+        for col, typ in types.items():
+            if col in df_columns:
+                df_dtypes[col] = typ
+
+
+    if derived_column is not None and derived_column in df_columns:
+        raise TeradataMlException(
+            Messages.get_message(MessageCodes.TDMLDF_COLUMN_IN_ARG_FOUND).format(
+                derived_column, 'derived_column', 'dataframe.', 'Provide value which is not part of DataFrame columns'
+            ),
+            MessageCodes.TDMLDF_COLUMN_IN_ARG_FOUND
+        )
+    # valid_time_columns can be a tuple of two column names or a single column name
+    if isinstance(valid_time_columns, tuple):
+        if len(valid_time_columns) != 2:
+            raise TeradataMlException(
+                Messages.get_message(MessageCodes.INVALID_ARG_VALUE).format(
+                    valid_time_columns, 'valid_time_columns', 'tuple of two column names'
+                ),
+                MessageCodes.INVALID_ARG_VALUE
+            )
+        # Check if both columns are present in the DataFrame
+        for col in valid_time_columns:
+            if col not in df_columns:
+                raise TeradataMlException(
+                    Messages.get_message(MessageCodes.TDMLDF_COLUMN_IN_ARG_NOT_FOUND).format(
+                        col, 'valid_time_columns', 'df', 'DataFrame'
+                    ),
+                    MessageCodes.TDMLDF_COLUMN_IN_ARG_NOT_FOUND
+                )
+
+        col1_type = df_dtypes[valid_time_columns[0]]
+        col2_type = df_dtypes[valid_time_columns[1]]
+
+        # When types are specified, ensure they are DATE or TIMESTAMP objects or classes.
+        if not (
+            isinstance(col1_type, TIMESTAMP) or isinstance(col1_type, DATE) or
+            col1_type is TIMESTAMP or col1_type is DATE
+        ):
+            raise TeradataMlException(
+                Messages.get_message(MessageCodes.INVALID_COLUMN_TYPE).format(
+                    'valid_time_columns',
+                    col1_type.__name__ if isinstance(col1_type, type)
+                    else col1_type.__class__.__name__, 'DATE or TIMESTAMP'
+                ),
+                MessageCodes.INVALID_COLUMN_TYPE
+            )
+        # When types are specified, ensure they are DATE or TIMESTAMP objects or classes.
+        if not (
+            isinstance(col2_type, TIMESTAMP) or isinstance(col2_type, DATE) or
+            col2_type is TIMESTAMP or col2_type is DATE
+        ):
+            raise TeradataMlException(
+                Messages.get_message(MessageCodes.INVALID_COLUMN_TYPE).format(
+                    'valid_time_columns',
+                    col2_type.__name__ if isinstance(col2_type, type)
+                    else col2_type.__class__.__name__, 'DATE or TIMESTAMP'
+                ),
+                MessageCodes.INVALID_COLUMN_TYPE
+            )
+
+        if type(col1_type) != type(col2_type):
+            raise ValueError(
+                Messages.get_message(MessageCodes.INVALID_ARG_VALUE).format(
+                    valid_time_columns, 'valid_time_columns', 'both columns of same type (DATE or TIMESTAMP)'
+                ),
+                MessageCodes.INVALID_ARG_VALUE
+            )
+    elif isinstance(valid_time_columns, str):
+        col = valid_time_columns
+        col_type = df_dtypes[col]
+
+        if col not in df_columns:
+            raise TeradataMlException(
+                Messages.get_message(MessageCodes.TDMLDF_COLUMN_IN_ARG_NOT_FOUND).format(
+                    col, 'valid_time_columns', 'df', 'DataFrame'
+                ),
+                MessageCodes.TDMLDF_COLUMN_IN_ARG_NOT_FOUND
+            )
+        # When types are specified, ensure they are PERIOD_DATE or PERIOD_TIMESTAMP objects or classes.
+        if not (
+            isinstance(col_type, PERIOD_TIMESTAMP) or isinstance(col_type, PERIOD_DATE) or
+            col_type is PERIOD_TIMESTAMP or col_type is PERIOD_DATE
+        ):
+            raise TeradataMlException(
+                Messages.get_message(MessageCodes.INVALID_COLUMN_TYPE).format(
+                    'valid_time_columns',
+                    col_type.__name__ if isinstance(col_type, type)
+                    else col_type.__class__.__name__, 'PERIOD_DATE or PERIOD_TIMESTAMP'
+                ),
+                MessageCodes.INVALID_COLUMN_TYPE
+            )
+    else:
+        raise TeradataMlException(
+            Messages.get_message(MessageCodes.INVALID_ARG_VALUE).format(
+                valid_time_columns, 'valid_time_columns', 'tuple of two column names or a single column name'
+            ),
+            MessageCodes.INVALID_ARG_VALUE
+        )
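The validator accepts exactly two shapes for valid_time_columns; everything else raises INVALID_ARG_VALUE, and a pair mixing DATE with TIMESTAMP is rejected because both members must share a type. A toy, runnable restatement of the accepted shapes (not the library's code):

    def accepted_valid_time_shape(value):
        # Pair of names -> both must exist and both be DATE, or both TIMESTAMP.
        if isinstance(value, tuple) and len(value) == 2:
            return "pair of DATE/TIMESTAMP columns; a PERIOD FOR column is derived"
        # Single name -> must already be a PERIOD_DATE/PERIOD_TIMESTAMP column.
        if isinstance(value, str):
            return "existing PERIOD column used as-is"
        raise ValueError("tuple of two column names or a single column name")

    assert accepted_valid_time_shape(('start_date', 'end_date')).startswith("pair")
    assert accepted_valid_time_shape('role_validity_period').startswith("existing")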