teradataml 20.0.0.1__py3-none-any.whl → 20.0.0.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Potentially problematic release: this version of teradataml might be problematic.
- teradataml/LICENSE-3RD-PARTY.pdf +0 -0
- teradataml/LICENSE.pdf +0 -0
- teradataml/README.md +306 -0
- teradataml/__init__.py +10 -3
- teradataml/_version.py +1 -1
- teradataml/analytics/__init__.py +3 -2
- teradataml/analytics/analytic_function_executor.py +299 -16
- teradataml/analytics/analytic_query_generator.py +92 -0
- teradataml/analytics/byom/__init__.py +3 -2
- teradataml/analytics/json_parser/metadata.py +13 -3
- teradataml/analytics/json_parser/utils.py +13 -6
- teradataml/analytics/meta_class.py +40 -1
- teradataml/analytics/sqle/DecisionTreePredict.py +1 -1
- teradataml/analytics/sqle/__init__.py +11 -2
- teradataml/analytics/table_operator/__init__.py +4 -3
- teradataml/analytics/uaf/__init__.py +21 -2
- teradataml/analytics/utils.py +66 -1
- teradataml/analytics/valib.py +1 -1
- teradataml/automl/__init__.py +1502 -323
- teradataml/automl/custom_json_utils.py +139 -61
- teradataml/automl/data_preparation.py +247 -307
- teradataml/automl/data_transformation.py +32 -12
- teradataml/automl/feature_engineering.py +325 -86
- teradataml/automl/model_evaluation.py +44 -35
- teradataml/automl/model_training.py +122 -153
- teradataml/catalog/byom.py +8 -8
- teradataml/clients/pkce_client.py +1 -1
- teradataml/common/__init__.py +2 -1
- teradataml/common/constants.py +72 -0
- teradataml/common/deprecations.py +13 -7
- teradataml/common/garbagecollector.py +152 -120
- teradataml/common/messagecodes.py +11 -2
- teradataml/common/messages.py +4 -1
- teradataml/common/sqlbundle.py +26 -4
- teradataml/common/utils.py +225 -14
- teradataml/common/wrapper_utils.py +1 -1
- teradataml/context/context.py +82 -2
- teradataml/data/SQL_Fundamentals.pdf +0 -0
- teradataml/data/complaints_test_tokenized.csv +353 -0
- teradataml/data/complaints_tokens_model.csv +348 -0
- teradataml/data/covid_confirm_sd.csv +83 -0
- teradataml/data/dataframe_example.json +27 -1
- teradataml/data/docs/sqle/docs_17_20/CFilter.py +132 -0
- teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +162 -0
- teradataml/data/docs/sqle/docs_17_20/OutlierFilterFit.py +2 -0
- teradataml/data/docs/sqle/docs_17_20/Pivoting.py +279 -0
- teradataml/data/docs/sqle/docs_17_20/Shap.py +203 -0
- teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +189 -0
- teradataml/data/docs/sqle/docs_17_20/TFIDF.py +142 -0
- teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
- teradataml/data/docs/sqle/docs_17_20/Unpivoting.py +216 -0
- teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
- teradataml/data/docs/uaf/docs_17_20/ACF.py +1 -10
- teradataml/data/docs/uaf/docs_17_20/ArimaEstimate.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaForecast.py +35 -5
- teradataml/data/docs/uaf/docs_17_20/ArimaValidate.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/ArimaXEstimate.py +293 -0
- teradataml/data/docs/uaf/docs_17_20/AutoArima.py +354 -0
- teradataml/data/docs/uaf/docs_17_20/BreuschGodfrey.py +3 -2
- teradataml/data/docs/uaf/docs_17_20/BreuschPaganGodfrey.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/Convolve.py +13 -10
- teradataml/data/docs/uaf/docs_17_20/Convolve2.py +4 -1
- teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
- teradataml/data/docs/uaf/docs_17_20/CumulPeriodogram.py +5 -4
- teradataml/data/docs/uaf/docs_17_20/DFFT2Conv.py +4 -4
- teradataml/data/docs/uaf/docs_17_20/DWT.py +235 -0
- teradataml/data/docs/uaf/docs_17_20/DWT2D.py +214 -0
- teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
- teradataml/data/docs/uaf/docs_17_20/DurbinWatson.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/ExtractResults.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/FilterFactory1d.py +160 -0
- teradataml/data/docs/uaf/docs_17_20/GenseriesSinusoids.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/GoldfeldQuandt.py +9 -31
- teradataml/data/docs/uaf/docs_17_20/HoltWintersForecaster.py +4 -2
- teradataml/data/docs/uaf/docs_17_20/IDFFT2.py +1 -8
- teradataml/data/docs/uaf/docs_17_20/IDWT.py +236 -0
- teradataml/data/docs/uaf/docs_17_20/IDWT2D.py +226 -0
- teradataml/data/docs/uaf/docs_17_20/IQR.py +134 -0
- teradataml/data/docs/uaf/docs_17_20/LineSpec.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/LinearRegr.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/MAMean.py +3 -3
- teradataml/data/docs/uaf/docs_17_20/Matrix2Image.py +297 -0
- teradataml/data/docs/uaf/docs_17_20/MatrixMultiply.py +15 -6
- teradataml/data/docs/uaf/docs_17_20/PACF.py +0 -1
- teradataml/data/docs/uaf/docs_17_20/Portman.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/PowerSpec.py +2 -2
- teradataml/data/docs/uaf/docs_17_20/Resample.py +9 -1
- teradataml/data/docs/uaf/docs_17_20/SAX.py +246 -0
- teradataml/data/docs/uaf/docs_17_20/SeasonalNormalize.py +17 -10
- teradataml/data/docs/uaf/docs_17_20/SignifPeriodicities.py +1 -1
- teradataml/data/docs/uaf/docs_17_20/WhitesGeneral.py +3 -1
- teradataml/data/docs/uaf/docs_17_20/WindowDFFT.py +368 -0
- teradataml/data/dwt2d_dataTable.csv +65 -0
- teradataml/data/dwt_dataTable.csv +8 -0
- teradataml/data/dwt_filterTable.csv +3 -0
- teradataml/data/finance_data4.csv +13 -0
- teradataml/data/grocery_transaction.csv +19 -0
- teradataml/data/idwt2d_dataTable.csv +5 -0
- teradataml/data/idwt_dataTable.csv +8 -0
- teradataml/data/idwt_filterTable.csv +3 -0
- teradataml/data/interval_data.csv +5 -0
- teradataml/data/jsons/paired_functions.json +14 -0
- teradataml/data/jsons/sqle/17.20/TD_CFilter.json +118 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayes.json +193 -0
- teradataml/data/jsons/sqle/17.20/TD_NaiveBayesPredict.json +212 -0
- teradataml/data/jsons/sqle/17.20/TD_OneClassSVM.json +9 -9
- teradataml/data/jsons/sqle/17.20/TD_Pivoting.json +280 -0
- teradataml/data/jsons/sqle/17.20/TD_Shap.json +222 -0
- teradataml/data/jsons/sqle/17.20/TD_TFIDF.json +162 -0
- teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
- teradataml/data/jsons/sqle/17.20/TD_Unpivoting.json +235 -0
- teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
- teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
- teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
- teradataml/data/jsons/storedprocedure/17.20/TD_FILTERFACTORY1D.json +150 -0
- teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
- teradataml/data/jsons/uaf/17.20/TD_ACF.json +1 -18
- teradataml/data/jsons/uaf/17.20/TD_ARIMAESTIMATE.json +3 -16
- teradataml/data/jsons/uaf/17.20/TD_ARIMAFORECAST.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAVALIDATE.json +5 -3
- teradataml/data/jsons/uaf/17.20/TD_ARIMAXESTIMATE.json +362 -0
- teradataml/data/jsons/uaf/17.20/TD_AUTOARIMA.json +469 -0
- teradataml/data/jsons/uaf/17.20/TD_BINARYMATRIXOP.json +0 -3
- teradataml/data/jsons/uaf/17.20/TD_BINARYSERIESOP.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_GODFREY.json +2 -1
- teradataml/data/jsons/uaf/17.20/TD_BREUSCH_PAGAN_GODFREY.json +2 -5
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_CONVOLVE2.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_CUMUL_PERIODOGRAM.json +0 -5
- teradataml/data/jsons/uaf/17.20/TD_DFFT.json +1 -4
- teradataml/data/jsons/uaf/17.20/TD_DFFT2.json +2 -7
- teradataml/data/jsons/uaf/17.20/TD_DFFT2CONV.json +1 -2
- teradataml/data/jsons/uaf/17.20/TD_DFFTCONV.json +0 -2
- teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
- teradataml/data/jsons/uaf/17.20/TD_DTW.json +3 -6
- teradataml/data/jsons/uaf/17.20/TD_DWT.json +173 -0
- teradataml/data/jsons/uaf/17.20/TD_DWT2D.json +160 -0
- teradataml/data/jsons/uaf/17.20/TD_FITMETRICS.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_GOLDFELD_QUANDT.json +16 -30
- teradataml/data/jsons/uaf/17.20/{TD_HOLT_WINTERS_FORECAST.json → TD_HOLT_WINTERS_FORECASTER.json} +1 -2
- teradataml/data/jsons/uaf/17.20/TD_IDFFT2.json +1 -15
- teradataml/data/jsons/uaf/17.20/TD_IDWT.json +162 -0
- teradataml/data/jsons/uaf/17.20/TD_IDWT2D.json +149 -0
- teradataml/data/jsons/uaf/17.20/TD_IQR.json +117 -0
- teradataml/data/jsons/uaf/17.20/TD_LINEAR_REGR.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_LINESPEC.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_MAMEAN.json +1 -3
- teradataml/data/jsons/uaf/17.20/TD_MATRIX2IMAGE.json +209 -0
- teradataml/data/jsons/uaf/17.20/TD_PACF.json +2 -2
- teradataml/data/jsons/uaf/17.20/TD_POWERSPEC.json +5 -5
- teradataml/data/jsons/uaf/17.20/TD_RESAMPLE.json +48 -28
- teradataml/data/jsons/uaf/17.20/TD_SAX.json +210 -0
- teradataml/data/jsons/uaf/17.20/TD_SEASONALNORMALIZE.json +12 -6
- teradataml/data/jsons/uaf/17.20/TD_SIMPLEEXP.json +0 -1
- teradataml/data/jsons/uaf/17.20/TD_TRACKINGOP.json +8 -8
- teradataml/data/jsons/uaf/17.20/TD_UNDIFF.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_UNNORMALIZE.json +1 -1
- teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +410 -0
- teradataml/data/load_example_data.py +8 -2
- teradataml/data/medical_readings.csv +101 -0
- teradataml/data/naivebayestextclassifier_example.json +1 -1
- teradataml/data/naivebayestextclassifierpredict_example.json +11 -0
- teradataml/data/patient_profile.csv +101 -0
- teradataml/data/peppers.png +0 -0
- teradataml/data/real_values.csv +14 -0
- teradataml/data/sax_example.json +8 -0
- teradataml/data/scripts/deploy_script.py +1 -1
- teradataml/data/scripts/lightgbm/dataset.template +157 -0
- teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
- teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
- teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
- teradataml/data/scripts/sklearn/sklearn_fit.py +194 -160
- teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
- teradataml/data/scripts/sklearn/sklearn_function.template +34 -16
- teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
- teradataml/data/scripts/sklearn/sklearn_neighbors.py +1 -1
- teradataml/data/scripts/sklearn/sklearn_score.py +12 -3
- teradataml/data/scripts/sklearn/sklearn_transform.py +162 -24
- teradataml/data/star_pivot.csv +8 -0
- teradataml/data/target_udt_data.csv +8 -0
- teradataml/data/templates/open_source_ml.json +3 -1
- teradataml/data/teradataml_example.json +20 -1
- teradataml/data/timestamp_data.csv +4 -0
- teradataml/data/titanic_dataset_unpivoted.csv +19 -0
- teradataml/data/uaf_example.json +55 -1
- teradataml/data/unpivot_example.json +15 -0
- teradataml/data/url_data.csv +9 -0
- teradataml/data/vectordistance_example.json +4 -0
- teradataml/data/windowdfft.csv +16 -0
- teradataml/dataframe/copy_to.py +1 -1
- teradataml/dataframe/data_transfer.py +5 -3
- teradataml/dataframe/dataframe.py +1002 -201
- teradataml/dataframe/fastload.py +3 -3
- teradataml/dataframe/functions.py +867 -0
- teradataml/dataframe/row.py +160 -0
- teradataml/dataframe/setop.py +2 -2
- teradataml/dataframe/sql.py +840 -33
- teradataml/dataframe/window.py +1 -1
- teradataml/dbutils/dbutils.py +878 -34
- teradataml/dbutils/filemgr.py +48 -1
- teradataml/geospatial/geodataframe.py +1 -1
- teradataml/geospatial/geodataframecolumn.py +1 -1
- teradataml/hyperparameter_tuner/optimizer.py +13 -13
- teradataml/lib/aed_0_1.dll +0 -0
- teradataml/opensource/__init__.py +1 -1
- teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
- teradataml/opensource/_lightgbm.py +950 -0
- teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
- teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
- teradataml/opensource/sklearn/__init__.py +0 -1
- teradataml/opensource/sklearn/_sklearn_wrapper.py +1019 -574
- teradataml/options/__init__.py +9 -23
- teradataml/options/configure.py +42 -4
- teradataml/options/display.py +2 -2
- teradataml/plot/axis.py +4 -4
- teradataml/scriptmgmt/UserEnv.py +13 -9
- teradataml/scriptmgmt/lls_utils.py +77 -23
- teradataml/store/__init__.py +13 -0
- teradataml/store/feature_store/__init__.py +0 -0
- teradataml/store/feature_store/constants.py +291 -0
- teradataml/store/feature_store/feature_store.py +2223 -0
- teradataml/store/feature_store/models.py +1505 -0
- teradataml/store/vector_store/__init__.py +1586 -0
- teradataml/table_operators/Script.py +2 -2
- teradataml/table_operators/TableOperator.py +106 -20
- teradataml/table_operators/query_generator.py +3 -0
- teradataml/table_operators/table_operator_query_generator.py +3 -1
- teradataml/table_operators/table_operator_util.py +102 -56
- teradataml/table_operators/templates/dataframe_register.template +69 -0
- teradataml/table_operators/templates/dataframe_udf.template +63 -0
- teradataml/telemetry_utils/__init__.py +0 -0
- teradataml/telemetry_utils/queryband.py +52 -0
- teradataml/utils/dtypes.py +4 -2
- teradataml/utils/validators.py +34 -2
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +311 -3
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +240 -157
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
- {teradataml-20.0.0.1.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
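
The headline additions in this release are the new `teradataml/store` subpackage (an Enterprise Feature Store plus a vector store), the new `dataframe/functions.py` and `dataframe/row.py` modules, LightGBM support under `opensource`, and a batch of new SQLE/UAF analytic functions (CFilter, NaiveBayes, Pivoting, Shap, TFIDF, Unpivoting, ArimaXEstimate, AutoArima, DWT, SAX, and others). The expanded hunk below shows the new feature-store models module. As orientation, the docstrings in that file suggest a workflow roughly like the following; this is a sketch stitched together from those docstring examples, not official documentation:

```python
from teradataml import (DataFrame, DataSource, Entity, Feature,
                        FeatureGroup, FeatureStore, load_example_data)

load_example_data("dataframe", "sales")
df = DataFrame("sales")

# Describe the data: features (columns), the entity (key column) and
# the data source (the query that produces the rows).
feature = Feature("sales:Feb", df.Feb)
entity = Entity("sales:accounts", df.accounts)
source = DataSource("Sales_Data", df)

# Group them and publish everything to a repository (a database schema).
fg = FeatureGroup("Sales", features=feature, entity=entity, data_source=source)
fg.publish("vfs_test")

# Later sessions can retrieve the published objects from the same repository.
fg = FeatureStore("vfs_test").get_feature_group("Sales")
```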
teradataml/store/feature_store/models.py
@@ -0,0 +1,1505 @@

```python
"""
Copyright (c) 2024 by Teradata Corporation. All rights reserved.
TERADATA CORPORATION CONFIDENTIAL AND TRADE SECRET

Primary Owner: pradeep.garre@teradata.com
Secondary Owner: adithya.avvaru@teradata.com

This file implements the models required for Teradata Enterprise Feature Store.
"""

from collections import OrderedDict
from datetime import datetime as dt
from teradatasqlalchemy import types as tdtypes
from teradataml.common.exceptions import TeradataMlException
from teradataml.common.messages import Messages
from teradataml.common.messagecodes import MessageCodes
from teradataml.common.utils import UtilFuncs
from teradataml.dataframe.dataframe import DataFrame, in_schema
from teradataml.dataframe.sql import _SQLColumnExpression
from teradataml.dbutils.dbutils import db_transaction, _delete_data, execute_sql, _insert_data, _upsert_data
from teradataml.store.feature_store.constants import *
from teradataml.utils.validators import _Validators
import inspect

class Feature:
    """Class for Feature. """
    def __init__(self,
                 name,
                 column,
                 feature_type=FeatureType.CONTINUOUS,
                 description=None,
                 tags=None,
                 status=FeatureStatus.ACTIVE):
        """
        DESCRIPTION:
            Constructor for Feature.

        PARAMETERS:
            name:
                Required Argument.
                Specifies the unique name of the Feature.
                Types: str.

            column:
                Required Argument.
                Specifies the DataFrame Column.
                Types: teradataml DataFrame Column

            feature_type:
                Optional Argument.
                Specifies whether feature is continuous or discrete.
                Default Value: FeatureType.CONTINUOUS
                Types: FeatureType Enum

            description:
                Optional Argument.
                Specifies human-readable description for Feature.
                Types: str

            tags:
                Optional Argument.
                Specifies the tags for Feature.
                Types: str OR list of str

            status:
                Optional Argument.
                Specifies whether feature is archived or active.
                Types: FeatureStatus Enum

        RETURNS:
            None.

        RAISES:
            None

        EXAMPLES:
            >>> from teradataml import DataFrame, Feature, FeatureType, load_example_data
            # Load the sales data to Vantage.
            >>> load_example_data("dataframe", "sales")
            # Create DataFrame on sales data.
            >>> df = DataFrame("sales")
            >>> df
                          Feb    Jan    Mar    Apr   datetime
            accounts
            Orange Inc  210.0    NaN    NaN  250.0 04/01/2017
            Jones LLC   200.0  150.0  140.0  180.0 04/01/2017
            Blue Inc     90.0   50.0   95.0  101.0 04/01/2017
            Alpha Co    210.0  200.0  215.0  250.0 04/01/2017
            Yellow Inc   90.0    NaN    NaN    NaN 04/01/2017

            # Create a categorical Feature named 'sales_Feb' for column 'Feb' of the
            # 'sales' DataFrame.
            >>> from teradataml import Feature
            >>> feature = Feature('sales_Feb', column=df.Feb, feature_type=FeatureType.CATEGORICAL)
            >>> feature
            Feature(name=sales_Feb)
            >>>
        """
        self.name = name
        self.column_name = column.name
        self.description = description
        self.tags = UtilFuncs._as_list(tags) if tags else None
        self.data_type = column.type
        self.feature_type = feature_type
        self.status = status

    @classmethod
    def _from_df(cls, df):
        """
        DESCRIPTION:
            Internal method to create object of Feature from DataFrame.

        PARAMETERS:
            df:
                Required Argument.
                Specifies teradataml DataFrame which has Feature details.
                Types: teradataml DataFrame.

        RETURNS:
            Feature or list of Feature.

        RAISES:
            None

        EXAMPLES:
            >>> Feature._from_df(df)
        """
        _features = []
        recs = [rec._asdict() for rec in df.itertuples()]

        for rec in recs:
            # Pop out unnecessary details.
            rec.pop("creation_time")
            rec.pop("modified_time")
            rec.pop("group_name")
            rec["column"] = _SQLColumnExpression(rec.pop("column_name"),
                                                 type=getattr(tdtypes, rec.pop("data_type"))())
            rec["feature_type"] = FeatureType.CONTINUOUS if rec["feature_type"] == FeatureType.CONTINUOUS.name \
                else FeatureType.CATEGORICAL
            rec["status"] = FeatureStatus.ACTIVE if rec["status"] == FeatureStatus.ACTIVE.name else FeatureStatus.INACTIVE
            _features.append(cls(**rec))

        return _features if len(_features) > 1 else _features[0]

    def __repr__(self):
        """
        DESCRIPTION:
            String representation for Feature object.

        PARAMETERS:
            None

        RETURNS:
            str

        RAISES:
            None
        """
        return "Feature(name={name})".format(name=self.name)

    def publish(self, repo):
        """
        DESCRIPTION:
            Method to publish the Feature details to repository.

        PARAMETERS:
            repo:
                Required Argument.
                Specifies the name of the repository to publish the Feature details.
                Types: str.

        RETURNS:
            bool.

        RAISES:
            TeradataMlException

        EXAMPLES:
            >>> load_example_data('dataframe', ['sales'])
            >>> df = DataFrame("sales")

            # Example 1: Publish the Feature details to repo 'vfs_test' for column
            # 'Feb' from 'sales' DataFrame.
            >>> from teradataml import Feature
            >>> feature = Feature('sales:Feb', df.Feb)
            >>> feature.publish('vfs_test')
            True
            >>>

            # Example 2: Republish the Feature published in Example 1 by updating
            # its tags.
            >>> # First, get the existing Feature.
            >>> from teradataml import FeatureStore
            >>> feature = FeatureStore('vfs_test').get_feature('sales:Feb')
            >>> # Update its tags.
            >>> feature.tags = ["sales_data", "monthly_sales"]
            >>> # Republish the details to the same repo.
            >>> feature.publish('vfs_test')
        """
        _upsert_data(schema_name=repo,
                     table_name=EFS_FEATURES_SPEC["table_name"],
                     insert_columns_values=OrderedDict({
                         'name': self.name,
                         'column_name': self.column_name,
                         'description': self.description,
                         'creation_time': dt.utcnow(),
                         'tags': ", ".join(self.tags) if self.tags else None,
                         'data_type': str(self.data_type),
                         'feature_type': self.feature_type.name,
                         'status': self.status.name}),
                     upsert_conditions=OrderedDict({
                         'name': self.name}),
                     update_columns_values=OrderedDict({
                         'column_name': self.column_name,
                         'description': self.description,
                         'modified_time': dt.utcnow(),
                         'tags': ", ".join(self.tags) if self.tags else None,
                         'data_type': str(self.data_type),
                         'feature_type': self.feature_type.name,
                         'status': self.status.name})
                     )
        return True

class Entity:
    """Class for Entity. """
    def __init__(self, name, columns, description=None):
        """
        DESCRIPTION:
            Constructor for creating Entity Object.

        PARAMETERS:
            name:
                Required Argument.
                Specifies the unique name of the entity.
                Types: str.

            columns:
                Required Argument.
                Specifies the names of the columns.
                Types: teradataml DataFrame Column OR list of teradataml DataFrame Columns.

            description:
                Optional Argument.
                Specifies human-readable description for Entity.
                Types: str

        RETURNS:
            Object of Entity.

        RAISES:
            None

        EXAMPLES:
            >>> load_example_data('dataframe', ['sales'])
            >>> df = DataFrame("sales")

            # Create an Entity named 'sales_accounts' with column 'accounts' of the
            # 'sales' DataFrame.
            >>> from teradataml import Entity
            >>> entity = Entity('sales_accounts', df.accounts)
            >>> entity
            Entity(name=sales_accounts)
            >>>
        """
        self.name = name
        self.columns = [col if isinstance(col, str) else col.name for col in UtilFuncs._as_list(columns)]
        self.description = description

    @classmethod
    def _from_df(cls, df):
        """
        DESCRIPTION:
            Internal method to create object of Entity from DataFrame.

        PARAMETERS:
            df:
                Required Argument.
                Specifies teradataml DataFrame which has details for Entity.
                Types: teradataml DataFrame.

        RETURNS:
            Entity

        RAISES:
            None

        EXAMPLES:
            >>> Entity._from_df(df)
        """
        entity_name = None
        description = None
        columns = []

        # Collect all the entity columns.
        for rec in df.itertuples():
            entity_name = rec.name
            description = rec.description
            columns.append(rec.entity_column)

        return cls(name=entity_name, description=description, columns=columns)

    def __repr__(self):
        """
        DESCRIPTION:
            String representation for Entity object.

        PARAMETERS:
            None

        RETURNS:
            str

        RAISES:
            None
        """
        return "Entity(name={})".format(self.name)

    @db_transaction
    def publish(self, repo):
        """
        DESCRIPTION:
            Method to publish the Entity details to repository.

        PARAMETERS:
            repo:
                Required Argument.
                Specifies the name of the repository to publish the Entity details.
                Types: str.

        RETURNS:
            bool.

        RAISES:
            TeradataMlException

        EXAMPLES:
            >>> load_example_data('dataframe', ['sales'])
            >>> df = DataFrame("sales")

            # Example 1: Publish the Entity details to repo 'vfs_test' for column
            # 'accounts' from 'sales' DataFrame.
            >>> from teradataml import Entity
            >>> entity = Entity('sales:accounts', 'accounts')
            >>> entity.publish('vfs_test')
            True
            >>>
        """
        # Upsert should be triggered for every corresponding entity ID and column.
        _upsert_data(schema_name=repo,
                     table_name=EFS_ENTITY_SPEC["table_name"],
                     insert_columns_values=OrderedDict({
                         'name': self.name,
                         'description': self.description,
                         'creation_time': dt.utcnow()}),
                     upsert_conditions=OrderedDict({
                         'name': self.name}),
                     update_columns_values=OrderedDict({
                         'description': self.description,
                         'modified_time': dt.utcnow()})
                     )

        # Insert into xref table now. Before that, delete for that key.
        _delete_data(schema_name=repo,
                     table_name=EFS_ENTITY_XREF_SPEC["table_name"],
                     delete_conditions=_SQLColumnExpression("entity_name") == self.name)

        values = [(self.name, col) for col in self.columns]
        _insert_data(EFS_ENTITY_XREF_SPEC["table_name"], values, schema_name=repo)

        return True

    def __eq__(self, other):
        """
        Compare the Entity with other Entity to check if both are
        same or not.

        PARAMETERS:
            other:
                Required Argument.
                Specifies another Entity.
                Types: Entity

        RETURNS:
            bool

        RAISES:
            None

        EXAMPLES:
            >>> load_example_data('dataframe', ['sales'])
            >>> df = DataFrame("sales")

            # Example 1: Create two entities and compare whether they are same or not.
            >>> from teradataml import Entity
            >>> entity1 = Entity('sales:accounts', 'accounts')
            >>> entity2 = Entity('sales:accounts', 'accounts')
            >>> entity1 == entity2
            True
            >>>
        """
        if not isinstance(other, Entity):
            return False
        # Both entities are the same only when the corresponding columns are same.
        return set(self.columns) == set(other.columns)

class DataSource:
    """Class for DataSource. """
    def __init__(self, name, source, description=None, timestamp_col_name=None):
        """
        DESCRIPTION:
            Constructor for creating DataSource Object.

        PARAMETERS:
            name:
                Required Argument.
                Specifies the unique name of the DataSource.
                Types: str.

            source:
                Required Argument.
                Specifies the source query of DataSource.
                Types: str OR teradataml DataFrame.

            description:
                Optional Argument.
                Specifies human-readable description for DataSource.
                Types: str

            timestamp_col_name:
                Optional Argument.
                Specifies the timestamp column indicating when the row was created.
                Types: str

        RETURNS:
            Object of DataSource.

        RAISES:
            None

        EXAMPLES:
            >>> load_example_data('dataframe', ['sales'])
            >>> df = DataFrame("sales")

            # Example 1: Create a DataSource named 'Sales_Data' for the above DataFrame.
            >>> from teradataml import DataSource
            >>> data_source = DataSource('Sales_Data', df)
            >>> data_source
            DataSource(Sales_Data)
            >>>
        """
        self.name = name
        self.timestamp_col_name = timestamp_col_name
        self.source = source if isinstance(source, str) else source.show_query()
        self.description = description

    @classmethod
    def _from_df(cls, df):
        """
        DESCRIPTION:
            Internal method to create object of DataSource from DataFrame.

        PARAMETERS:
            df:
                Required Argument.
                Specifies teradataml DataFrame which has a single
                record denoting DataSource.
                Types: teradataml DataFrame.

        RETURNS:
            DataSource

        RAISES:
            None

        EXAMPLES:
            >>> DataSource._from_df(df)
        """
        rec = next(df.itertuples())._asdict()
        rec.pop("creation_time")
        rec.pop("modified_time")
        return cls(**rec)

    def __repr__(self):
        """
        DESCRIPTION:
            String representation for DataSource object.

        PARAMETERS:
            None

        RETURNS:
            str

        RAISES:
            None
        """
        return "DataSource(name={})".format(self.name)

    def publish(self, repo):
        """
        DESCRIPTION:
            Method to publish the DataSource details to repository.

        PARAMETERS:
            repo:
                Required Argument.
                Specifies the name of the repository to publish the DataSource details.
                Types: str.

        RETURNS:
            bool.

        RAISES:
            TeradataMlException

        EXAMPLES:
            >>> load_example_data('dataframe', ['sales'])
            >>> df = DataFrame("sales")

            # Example 1: Publish the above DataFrame as a DataSource
            # named "Sales_Data".
            >>> from teradataml import DataSource
            >>> data_source = DataSource('Sales_Data', df)
            >>> data_source.publish('vfs_test')
            True
            >>>

            # Example 2: Republish the DataSource published in Example 1 with
            # an updated description.
            >>> # First, get the existing DataSource.
            >>> from teradataml import FeatureStore
            >>> data_source = FeatureStore('vfs_test').get_data_source('Sales_Data')
            >>> # Update its description.
            >>> data_source.description = "Pivoted sales data."
            >>> # Republish the details to the same repo.
            >>> data_source.publish('vfs_test')
        """
        _upsert_data(schema_name=repo,
                     table_name=EFS_DATA_SOURCE_SPEC["table_name"],
                     insert_columns_values=OrderedDict({
                         'name': self.name,
                         'description': self.description,
                         'timestamp_col_name': self.timestamp_col_name,
                         'source': self.source,
                         'creation_time': dt.utcnow()
                     }),
                     upsert_conditions={"name": self.name},
                     update_columns_values=OrderedDict({
                         'description': self.description,
                         'timestamp_col_name': self.timestamp_col_name,
                         'modified_time': dt.utcnow(),
                         'source': self.source})
                     )
        return True

class FeatureGroup:
    """Class for FeatureGroup. """
    def __init__(self, name, features, entity, data_source, description=None):
        """
        DESCRIPTION:
            Constructor for creating FeatureGroup Object.

        PARAMETERS:
            name:
                Required Argument.
                Specifies the unique name of the FeatureGroup.
                Types: str.

            features:
                Required Argument.
                Specifies the features required to create a group.
                Types: Feature or list of Feature.

            entity:
                Required Argument.
                Specifies the entity associated with corresponding features.
                Types: Entity

            data_source:
                Required Argument.
                Specifies the DataSource associated with Features.
                Types: DataSource

            description:
                Optional Argument.
                Specifies human-readable description for FeatureGroup.
                Types: str

        RETURNS:
            Object of FeatureGroup.

        RAISES:
            None

        EXAMPLES:
            >>> load_example_data('dataframe', ['sales'])
            >>> df = DataFrame("sales")

            # Example 1: Create a FeatureGroup for the above DataFrame.
            >>> # First create the features.
            >>> jan_feature = Feature("sales:Jan", df.Jan)
            >>> feb_feature = Feature("sales:Feb", df.Feb)
            >>> mar_feature = Feature("sales:Mar", df.Mar)
            >>> apr_feature = Feature("sales:Apr", df.Apr)
            >>> # Create Entity.
            >>> entity = Entity("sales:accounts", df.accounts)
            >>> # Create DataSource.
            >>> data_source = DataSource("sales_source", df.show_query())
            >>> # Create FeatureGroup.
            >>> fg = FeatureGroup('Sales',
            ...                   features=[jan_feature, feb_feature, mar_feature, apr_feature],
            ...                   entity=entity,
            ...                   data_source=data_source)
        """
        self.name = name
        self.features = UtilFuncs._as_list(features)
        self.entity = entity
        self.data_source = data_source
        self.description = description
        self.__redundant_features = []
        self._labels = []

    @property
    def features(self):
        """
        DESCRIPTION:
            Gets the features from FeatureGroup.

        PARAMETERS:
            None

        RETURNS:
            list

        RAISES:
            None

        EXAMPLES:
            >>> from teradataml import DataSource, Entity, Feature, FeatureGroup, load_example_data
            >>> load_example_data("dataframe", "sales")
            >>> # Let's create DataFrame first.
            >>> df = DataFrame("sales")
            >>> # Create the features.
            >>> jan_feature = Feature("sales:Jan", df.Jan)
            >>> feb_feature = Feature("sales:Feb", df.Feb)
            >>> mar_feature = Feature("sales:Mar", df.Mar)
            >>> apr_feature = Feature("sales:Apr", df.Apr)
            >>> # Create Entity.
            >>> entity = Entity("sales:accounts", df.accounts)
            >>> # Create DataSource.
            >>> data_source = DataSource("sales_source", df)
            >>> # Create FeatureGroup.
            >>> fg = FeatureGroup('Sales',
            ...                   features=[jan_feature, feb_feature, mar_feature, apr_feature],
            ...                   entity=entity,
            ...                   data_source=data_source)

            # Get the features from FeatureGroup.
            >>> fg.features
            [Feature(name=sales:Jan), Feature(name=sales:Feb), Feature(name=sales:Mar), Feature(name=sales:Apr)]
            >>>
        """
        return [feature for feature in self._features if feature.name not in self._labels]

    @property
    def labels(self):
        """
        DESCRIPTION:
            Gets the labels from FeatureGroup.
            Note:
                Use this function only after setting the labels using "set_labels".

        PARAMETERS:
            None

        RETURNS:
            Feature OR list

        RAISES:
            None

        EXAMPLES:
            >>> from teradataml import DataSource, Entity, Feature, FeatureGroup, load_example_data
            >>> load_example_data("dataframe", "admissions_train")
            >>> # Let's create DataFrame first.
            >>> df = DataFrame("admissions_train")
            >>> # Create the features.
            >>> masters_feature = Feature("masters", df.masters)
            >>> gpa_feature = Feature("gpa", df.gpa)
            >>> stats_feature = Feature("stats", df.stats)
            >>> admitted_feature = Feature("admitted", df.admitted)
            >>> # Create Entity.
            >>> entity = Entity("id", df.id)
            >>> # Create DataSource.
            >>> data_source = DataSource("admissions_source", df)
            >>> # Create FeatureGroup.
            >>> fg = FeatureGroup('Admissions',
            ...                   features=[masters_feature, gpa_feature, stats_feature, admitted_feature],
            ...                   entity=entity,
            ...                   data_source=data_source)
            >>> # Set feature 'admitted' as label.
            >>> fg.set_labels('admitted')
            True

            # Get the labels from FeatureGroup.
            >>> fg.labels
            Feature(name=admitted)
            >>>
        """
        labels = [feature for feature in self._features if feature.name in self._labels]
        if len(labels) == 1:
            return labels[0]
        return labels

    @features.setter
    def features(self, features):
        """ Setter for features. """
        self._features = UtilFuncs._as_list(features)
        return True

    def set_labels(self, labels):
        """
        DESCRIPTION:
            Sets the labels for FeatureGroup.
            This method is helpful when working with analytic functions to consume the Features.
            Note:
                Label is for the current session only.

        PARAMETERS:
            labels:
                Required Argument.
                Specifies the name(s) of the features to refer to as labels.
                Types: str or list of str

        RETURNS:
            bool

        RAISES:
            None

        EXAMPLES:
            >>> from teradataml import DataSource, Entity, Feature, FeatureGroup, load_example_data
            >>> load_example_data("dataframe", "admissions_train")
            >>> # Let's create DataFrame first.
            >>> df = DataFrame("admissions_train")
            >>> # Create the features.
            >>> masters_feature = Feature("masters", df.masters)
            >>> gpa_feature = Feature("gpa", df.gpa)
            >>> stats_feature = Feature("stats", df.stats)
            >>> admitted_feature = Feature("admitted", df.admitted)
            >>> # Create Entity.
            >>> entity = Entity("id", df.id)
            >>> # Create DataSource.
            >>> data_source = DataSource("admissions_source", df)
            >>> # Create FeatureGroup.
            >>> fg = FeatureGroup('Admissions',
            ...                   features=[masters_feature, gpa_feature, stats_feature, admitted_feature],
            ...                   entity=entity,
            ...                   data_source=data_source)

            >>> # Set feature 'admitted' as label.
            >>> fg.set_labels('admitted')
            True
        """
        self._labels = [] if labels is None else UtilFuncs._as_list(labels)
        return True

    @labels.setter
    def labels(self, labels):
        """
        DESCRIPTION:
            Sets the labels for FeatureGroup.
            This method is helpful when working with analytic functions to consume the Features.
            Note:
                Label is for the current session only.

        PARAMETERS:
            labels:
                Required Argument.
                Specifies the name(s) of the features to refer to as labels.
                Types: str or list of str

        RETURNS:
            bool

        RAISES:
            None

        EXAMPLES:
            >>> from teradataml import DataSource, Entity, Feature, FeatureGroup, load_example_data
            >>> load_example_data("dataframe", "admissions_train")
            # Let's create DataFrame first.
            >>> df = DataFrame("admissions_train")
            # Create the features.
            >>> masters_feature = Feature("masters", df.masters)
            >>> gpa_feature = Feature("gpa", df.gpa)
            >>> stats_feature = Feature("stats", df.stats)
            >>> admitted_feature = Feature("admitted", df.admitted)
            # Create Entity.
            >>> entity = Entity("id", df.id)
            # Create DataSource.
            >>> data_source = DataSource("admissions_source", df)
            # Create FeatureGroup.
            >>> fg = FeatureGroup('Admissions',
            ...                   features=[masters_feature, gpa_feature, stats_feature, admitted_feature],
            ...                   entity=entity,
            ...                   data_source=data_source)

            # Set feature 'admitted' as label.
            >>> fg.labels = 'admitted'
            True
        """
        return self.set_labels(labels)

    def reset_labels(self):
        """
        DESCRIPTION:
            Resets the labels for FeatureGroup.

        PARAMETERS:
            None

        RETURNS:
            bool

        RAISES:
            None

        EXAMPLES:
            >>> from teradataml import DataSource, Entity, Feature, FeatureGroup, load_example_data
            >>> load_example_data("dataframe", "admissions_train")
            >>> # Let's create DataFrame first.
            >>> df = DataFrame("admissions_train")
            >>> # Create the features.
            >>> masters_feature = Feature("masters", df.masters)
            >>> gpa_feature = Feature("gpa", df.gpa)
            >>> stats_feature = Feature("stats", df.stats)
            >>> admitted_feature = Feature("admitted", df.admitted)
            >>> # Create Entity.
            >>> entity = Entity("id", df.id)
            >>> # Create DataSource.
            >>> data_source = DataSource("admissions_source", df)
            >>> # Create FeatureGroup.
            >>> fg = FeatureGroup('Admissions',
            ...                   features=[masters_feature, gpa_feature, stats_feature, admitted_feature],
            ...                   entity=entity,
            ...                   data_source=data_source)
            >>> # Set feature 'admitted' as label.
            >>> fg.set_labels('admitted')
            True

            >>> # Remove the labels from FeatureGroup.
            >>> fg.reset_labels()
            True
            >>>
        """
        self._labels = []
        return True

    def apply(self, object):
        """
        DESCRIPTION:
            Register objects to FeatureGroup.

        PARAMETERS:
            object:
                Required Argument.
                Specifies the object to update the FeatureGroup.
                Types: Feature OR DataSource OR Entity.

        RETURNS:
            bool.

        RAISES:
            TeradataMlException

        EXAMPLES:
            >>> load_example_data('dataframe', ['sales'])
            >>> df = DataFrame("sales")
            >>> # Create FeatureGroup to use it in examples.
            >>> from teradataml import Feature, Entity, DataSource, FeatureGroup
            >>> feature = Feature('sales:Feb', df.Feb)
            >>> entity = Entity('sales:accounts', df.accounts)
            >>> data_source = DataSource('Sales_Data', df)
            >>> fg = FeatureGroup('Sales',
            ...                   features=feature,
            ...                   entity=entity,
            ...                   data_source=data_source)

            # Example 1: Create a new Feature for column df.Mar and
            # apply the feature to FeatureGroup.
            >>> # Create Feature.
            >>> feature = Feature('sales:Mar', df.Mar)
            >>> # Register the above Feature with FeatureGroup.
            >>> fg.apply(feature)
            True
            >>>
        """
        if isinstance(object, Feature):
            # Before adding the feature, check whether a feature with
            # the same name already exists or not.
            feature_exists = [i for i in range(len(self._features)) if self._features[i].name == object.name]
            if feature_exists:
                self._features[feature_exists[0]] = object
            else:
                self._features.append(object)
        elif isinstance(object, Entity):
            self.entity = object
        elif isinstance(object, DataSource):
            self.data_source = object
        else:
            raise TeradataMlException(Messages.get_message(MessageCodes.UNSUPPORTED_DATATYPE,
                                                           'object', "Feature or Entity or DataSource"),
                                      MessageCodes.UNSUPPORTED_DATATYPE)

        return True

    def remove(self, object):
        """
        DESCRIPTION:
            Method to remove objects from FeatureGroup. One can use this
            method to detach either Feature or DataSource or Entity from
            FeatureGroup. Most useful for removing existing Features from
            FeatureGroup.

        PARAMETERS:
            object:
                Required Argument.
                Specifies the object to be removed from FeatureGroup.
                Types: Feature OR Entity OR DataSource.

        RETURNS:
            bool.

        RAISES:
            TeradataMlException

        EXAMPLES:
            >>> load_example_data('dataframe', ['sales'])
            >>> df = DataFrame("sales")
            >>> # First create the features.
            >>> jan_feature = Feature("sales:Jan", df.Jan)
            >>> feb_feature = Feature("sales:Feb", df.Feb)
            >>> mar_feature = Feature("sales:Mar", df.Mar)
            >>> apr_feature = Feature("sales:Apr", df.Apr)
            >>> # Create Entity.
            >>> entity = Entity("sales:accounts", df.accounts)
            >>> # Create DataSource.
            >>> data_source = DataSource("sales_source", df.show_query())
            >>> # Create FeatureGroup.
            >>> fg = FeatureGroup('Sales',
            ...                   features=[jan_feature, feb_feature, mar_feature],
            ...                   entity=entity,
            ...                   data_source=data_source)

            # Example: Remove the Feature with name "sales:Feb" from FeatureGroup.
            >>> fg.remove(feb_feature)
            True
            >>>
        """
        get_msg = lambda object: "{} '{}' is not associated with FeatureGroup.".format(
            object.__class__.__name__, object.name)

        if isinstance(object, Feature):
            # Find the position of the feature first, then pop it.
            index = [i for i in range(len(self._features)) if self._features[i].name == object.name]
            if index:
                self.__redundant_features.append(self._features.pop(index[0]))
            else:
                print(get_msg(object))
                return False
        elif isinstance(object, DataSource):
            if self.data_source.name == object.name:
                self.data_source = None
            else:
                print(get_msg(object))
                return False
        elif isinstance(object, Entity):
            if self.entity.name == object.name:
                self.entity = None
            else:
                print(get_msg(object))
                return False
        else:
            raise TeradataMlException(Messages.get_message(MessageCodes.UNSUPPORTED_DATATYPE,
                                                           'object', "Feature or Entity or DataSource"),
                                      MessageCodes.UNSUPPORTED_DATATYPE)
        return True

@classmethod
|
|
996
|
+
def _from_df(cls, df, repo, features_df, entity_df, data_source_df):
|
|
997
|
+
"""
|
|
998
|
+
DESCRIPTION:
|
|
999
|
+
Internal method to create object of FeatureGroup from DataFrame.
|
|
1000
|
+
|
|
1001
|
+
PARAMETERS:
|
|
1002
|
+
df:
|
|
1003
|
+
Required Argument.
|
|
1004
|
+
Specifies teradataml DataFrame which has a single
|
|
1005
|
+
record denoting FeatureGroup.
|
|
1006
|
+
Types: teradataml DataFrame.
|
|
1007
|
+
|
|
1008
|
+
repo:
|
|
1009
|
+
Required Argument.
|
|
1010
|
+
Specifies the repo name of FeatureStore.
|
|
1011
|
+
Types: str
|
|
1012
|
+
|
|
1013
|
+
features_df:
|
|
1014
|
+
Required Argument.
|
|
1015
|
+
Specifies teradataml DataFrame which has features.
|
|
1016
|
+
Types: teradataml DataFrame.
|
|
1017
|
+
|
|
1018
|
+
entity_df:
|
|
1019
|
+
Required Argument.
|
|
1020
|
+
Specifies teradataml DataFrame which has entities.
|
|
1021
|
+
Types: teradataml DataFrame.
|
|
1022
|
+
|
|
1023
|
+
data_source_df:
|
|
1024
|
+
Required Argument.
|
|
1025
|
+
Specifies teradataml DataFrame which has data sources.
|
|
1026
|
+
Types: teradataml DataFrame.
|
|
1027
|
+
|
|
1028
|
+
RETURNS:
|
|
1029
|
+
FeatureGroup
|
|
1030
|
+
|
|
1031
|
+
RAISES:
|
|
1032
|
+
None
|
|
1033
|
+
|
|
1034
|
+
EXAMPLES:
|
|
1035
|
+
>>> FeatureGroup._from_df(df, "repo", features_df, entity_df, data_source_df)
|
|
1036
|
+
"""
|
|
1037
|
+
rec = next(df.itertuples())._asdict()
|
|
1038
|
+
|
|
1039
|
+
# Select active features.
|
|
1040
|
+
features_df = features_df[features_df.status != FeatureStatus.INACTIVE.name]
|
|
1041
|
+
req_features_df = features_df[features_df.group_name == rec["name"]]
|
|
1042
|
+
|
|
1043
|
+
features = Feature._from_df(req_features_df)
|
|
1044
|
+
entity = Entity._from_df(entity_df[entity_df.name==rec['entity_name']])
|
|
1045
|
+
data_source = DataSource._from_df(data_source_df[data_source_df.name==rec['data_source_name']])
|
|
1046
|
+
|
|
1047
|
+
return cls(name=rec["name"], features=features, entity=entity, data_source=data_source, description=rec["description"])
|
|
1048
|
+
|
|
1049
|
+
def __repr__(self):
|
|
1050
|
+
"""
|
|
1051
|
+
DESCRIPTION:
|
|
1052
|
+
String representation for FeatureGroup object.
|
|
1053
|
+
|
|
1054
|
+
PARAMETERS:
|
|
1055
|
+
None
|
|
1056
|
+
|
|
1057
|
+
RETURNS:
|
|
1058
|
+
str
|
|
1059
|
+
|
|
1060
|
+
RAISES:
|
|
1061
|
+
None
|
|
1062
|
+
"""
|
|
1063
|
+
return "FeatureGroup({}, features=[{}], entity={}, data_source={})".format(
|
|
1064
|
+
self.name, ", ".join((str(feature) for feature in self.features)), self.entity, self.data_source)
|
|
1065
|
+
|
|
1066
|
+
@db_transaction
|
|
1067
|
+
def publish(self, repo):
|
|
1068
|
+
"""
|
|
1069
|
+
DESCRIPTION:
|
|
1070
|
+
Method to publish the FeatureGroup details to repository.
|
|
1071
|
+
|
|
1072
|
+
PARAMETERS:
|
|
1073
|
+
repo:
|
|
1074
|
+
Required Argument.
|
|
1075
|
+
Specifies the name of the repository to publish the FeatureGroup details.
|
|
1076
|
+
Types: str.
|
|
1077
|
+
|
|
1078
|
+
RETURNS:
|
|
1079
|
+
bool.
|
|
1080
|
+
|
|
1081
|
+
RAISES:
|
|
1082
|
+
TeradataMlException
|
|
1083
|
+
|
|
1084
|
+
EXAMPLES:
|
|
1085
|
+
>>> load_example_data('dataframe', ['sales'])
|
|
1086
|
+
>>> df = DataFrame("sales")
|
|
1087
|
+
|
|
1088
|
+
# Example 1: create a FeatureGroup 'sales_data_fg' for above mentioned
|
|
1089
|
+
# DataFrame and publish it to 'vfs_v1'.
|
|
1090
|
+
>>> # First create the features.
|
|
1091
|
+
>>> jan_feature = Feature("sales:Jan", df.Jan)
|
|
1092
|
+
>>> feb_feature = Feature("sales:Fan", df.Feb)
|
|
1093
|
+
>>> mar_feature = Feature("sales:Mar", df.Mar)
|
|
1094
|
+
>>> apr_feature = Feature("sales:Jan", df.Apr)
|
|
1095
|
+
>>> # Create Entity.
|
|
1096
|
+
>>> entity = Entity("sales:accounts", df.accounts)
|
|
1097
|
+
>>> # Create DataSource
|
|
1098
|
+
>>> data_source = DataSource("sales_source", df.show_query())
|
|
1099
|
+
>>> # Create FeatureGroup.
|
|
1100
|
+
>>> fg = FeatureGroup('Sales',
|
|
1101
|
+
... features=[jan_feature, feb_feature, mar_feature],
|
|
1102
|
+
... entity=entity,
|
|
1103
|
+
... data_source=data_source)
|
|
1104
|
+
>>> feature_group.publish('vfs_v1')
|
|
1105
|
+
|
|
1106
|
+
# Example 2: Republish the FeatureGroup published in example1 with
|
|
1107
|
+
# updated description.
|
|
1108
|
+
>>> # First, Get the existing FeatureGroup.
|
|
1109
|
+
>>> from teradataml import FeatureStore
|
|
1110
|
+
>>> fg = FeatureStore('vfs_test').get_feature_group('Sales')
|
|
1111
|
+
>>> # Update it's description.
|
|
1112
|
+
>>> fg.description = "Feature group for Sales."
|
|
1113
|
+
>>> # Republish the details to same repo.
|
|
1114
|
+
>>> fg.publish('vfs_v1')
|
|
1115
|
+
"""
|
|
1116
|
+
|
|
1117
|
+
# Do not publish if any of required associated parameter does not exist.
|
|
1118
|
+
message = "FeatureGroup can not be published with out {}"
|
|
1119
|
+
if not self.features:
|
|
1120
|
+
raise TeradataMlException(Messages.get_message(
|
|
1121
|
+
MessageCodes.FUNC_EXECUTION_FAILED, 'publish', message.format("Features")),
|
|
1122
|
+
MessageCodes.FUNC_EXECUTION_FAILED)
|
|
1123
|
+
|
|
1124
|
+
if not self.data_source:
|
|
1125
|
+
raise TeradataMlException(Messages.get_message(
|
|
1126
|
+
MessageCodes.FUNC_EXECUTION_FAILED, 'publish', message.format("DataSource")),
|
|
1127
|
+
MessageCodes.FUNC_EXECUTION_FAILED)
|
|
1128
|
+
|
|
1129
|
+
if not self.entity:
|
|
1130
|
+
raise TeradataMlException(Messages.get_message(
|
|
1131
|
+
MessageCodes.FUNC_EXECUTION_FAILED, 'publish', message.format("Entity")),
|
|
1132
|
+
MessageCodes.FUNC_EXECUTION_FAILED)
|
|
1133
|
+
|
|
1134
|
+
# Before publish FeatureGroup, publish other elements.
|
|
1135
|
+
for feature in self.features:
|
|
1136
|
+
feature.publish(repo)
|
|
1137
|
+
|
|
1138
|
+
self.entity.publish(repo)
|
|
1139
|
+
self.data_source.publish(repo)
|
|
1140
|
+
_upsert_data(schema_name=repo,
|
|
1141
|
+
table_name=EFS_FEATURE_GROUP_SPEC["table_name"],
|
|
1142
|
+
insert_columns_values=OrderedDict({
|
|
1143
|
+
'name': self.name,
|
|
1144
|
+
'description': self.description,
|
|
1145
|
+
'data_source_name': self.data_source.name,
|
|
1146
|
+
'entity_name': self.entity.name,
|
|
1147
|
+
'creation_time': dt.utcnow()
|
|
1148
|
+
}),
|
|
1149
|
+
upsert_conditions={'name': self.name},
|
|
1150
|
+
update_columns_values=OrderedDict({
|
|
1151
|
+
'description': self.description,
|
|
1152
|
+
'data_source_name': self.data_source.name,
|
|
1153
|
+
'modified_time': dt.utcnow(),
|
|
1154
|
+
'entity_name': self.entity.name})
|
|
1155
|
+
)
|
|
1156
|
+
|
|
1157
|
+
for feature in self.features:
|
|
1158
|
+
_upsert_data(schema_name=repo,
|
|
1159
|
+
table_name=EFS_GROUP_FEATURES_SPEC["table_name"],
|
|
1160
|
+
insert_columns_values=OrderedDict({
|
|
1161
|
+
'feature_name': feature.name,
|
|
1162
|
+
'group_name': self.name,
|
|
1163
|
+
'modified_time': dt.utcnow()
|
|
1164
|
+
}),
|
|
1165
|
+
upsert_conditions={'feature_name': feature.name, "group_name": self.name},
|
|
1166
|
+
update_columns_values=OrderedDict({
|
|
1167
|
+
'modified_time': dt.utcnow()
|
|
1168
|
+
})
|
|
1169
|
+
)
|
|
1170
|
+
|
|
1171
|
+
        # Cut the link between a feature and the FeatureGroup if the
        # feature was removed from the FeatureGroup.
        if self.__redundant_features:
            col_expression = _SQLColumnExpression("feature_name") == self.__redundant_features[0].name
            for feature in self.__redundant_features[1:]:
                col_expression = col_expression | (_SQLColumnExpression("feature_name") == feature.name)
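            # The combined condition renders to SQL roughly like
            #   group_name = '<group>' AND (feature_name = '<f1>' OR feature_name = '<f2>' ...)
            # (illustrative only; the actual SQL text is produced by
            # _SQLColumnExpression).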
            _delete_data(schema_name=repo,
                         table_name=EFS_GROUP_FEATURES_SPEC["table_name"],
                         delete_conditions=((_SQLColumnExpression("group_name") == self.name) & (col_expression)))
            # After removing the data, reset the bookkeeping list.
            self.__redundant_features = []

        return True

    def __add__(self, other):
        """
        DESCRIPTION:
            Combines two FeatureGroups.

        PARAMETERS:
            other:
                Required Argument.
                Specifies another FeatureGroup.
                Types: FeatureGroup

        RETURNS:
            FeatureGroup

        RAISES:
            TypeError, ValueError

        EXAMPLES:
            >>> load_example_data("dataframe", "sales")
            >>> df = DataFrame("sales")
            >>> df
                          Feb    Jan    Mar    Apr    datetime
            accounts
            Alpha Co    210.0  200.0  215.0  250.0  04/01/2017
            Blue Inc     90.0   50.0   95.0  101.0  04/01/2017
            Jones LLC   200.0  150.0  140.0  180.0  04/01/2017
            Orange Inc  210.0    NaN    NaN  250.0  04/01/2017
            Yellow Inc   90.0    NaN    NaN    NaN  04/01/2017
            Red Inc     200.0  150.0  140.0    NaN  04/01/2017

            # Example 1: Create two FeatureGroups, then create a new FeatureGroup
            #            by combining them.
            # Creating the first FeatureGroup.
            >>> f1 = Feature("sales_Jan", column=df.Jan)
            >>> f2 = Feature("sales_Feb", column=df.Feb)
            >>> entity = Entity(name="sales", columns='accounts')
            >>> data_source = DataSource("sales", source=df.show_query())
            >>> fg1 = FeatureGroup(name="sales_jan_feb", entity=entity, features=[f1, f2], data_source=data_source)
            >>> fg1
            FeatureGroup(sales_jan_feb, features=[Feature(name=sales_Jan), Feature(name=sales_Feb)], entity=Entity(name=sales), data_source=DataSource(name=sales))

            >>> # Creating the second FeatureGroup.
            >>> f3 = Feature("sales_Mar", column=df.Mar)
            >>> f4 = Feature("sales_Apr", column=df.Apr)
            >>> data_source = DataSource("sales_Mar_Apr", source=df.show_query())
            >>> fg2 = FeatureGroup(name="sales_Mar_Apr", entity=entity, features=[f3, f4], data_source=data_source)
            >>> fg2
            FeatureGroup(sales_Mar_Apr, features=[Feature(name=sales_Mar), Feature(name=sales_Apr)], entity=Entity(name=sales), data_source=DataSource(name=sales_Mar_Apr))

            >>> # Combining the two FeatureGroups.
            >>> new_fg = fg1 + fg2
            >>> new_fg
            FeatureGroup(sales_jan_feb_sales_Mar_Apr, features=[Feature(name=sales_Jan), Feature(name=sales_Feb), Feature(name=sales_Mar), Feature(name=sales_Apr)], entity=Entity(name=sales_jan_feb_sales_Mar_Apr), data_source=DataSource(name=sales))
            >>>
        """
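        # Merge strategy: validate the operand, require matching entities and
        # timestamp columns, drop features duplicated across the two groups,
        # and join the two sources when their queries differ.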
        if not isinstance(other, FeatureGroup):
            err_ = Messages.get_message(MessageCodes.UNSUPPORTED_DATATYPE, "other",
                                        "FeatureGroup")
            raise TypeError(err_)

        if self.entity != other.entity:
            raise ValueError("Two FeatureGroups can be merged only when the corresponding entities are the same.")

        # While merging two FeatureGroups, the timestamp columns of their
        # DataSources must also match.
        if self.data_source.timestamp_col_name != other.data_source.timestamp_col_name:
            raise ValueError("Two FeatureGroups can be merged only when the corresponding "
                             "'timestamp_col_name' for the DataSources are the same.")

        if self.entity == other.entity:

            existing_columns = {feature.column_name for feature in self.features}
            # The new features are the combined features of both "self" and
            # "other". The two groups may share common features; in such cases,
            # consider only one copy.
            effective_other_features = [feature for feature in other.features
                                        if feature.column_name not in existing_columns]

            # Prepare the new DataSource.
            query_1 = self.data_source.source
            query_2 = other.data_source.source

            # If the two queries, i.e. the sources, are not the same, combine
            # them with a join. While combining, project only the columns that
            # are required.
            if query_2 != query_1:

                # Consider adding the timestamp column to the query.
                time_stamp_column = []
                if self.data_source.timestamp_col_name:
                    time_stamp_column.append("A.{}".format(self.data_source.timestamp_col_name))

                feature_columns = (["A.{}".format(feature.column_name) for feature in self.features] +
                                   ["B.{}".format(feature.column_name) for feature in effective_other_features])

                columns = ", ".join(["A.{}".format(col) for col in self.entity.columns] +
                                    time_stamp_column + feature_columns)
                on_clause_columns = [col for col in self.entity.columns]
                if self.data_source.timestamp_col_name:
                    on_clause_columns.append(self.data_source.timestamp_col_name)
                where_clause = " AND ".join(["A.{0} = B.{0}".format(column) for column in on_clause_columns])

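                # Illustrative shape of the generated SQL (placeholders, not a
                # fixed schema):
                #   SELECT A.<entity_col>, A.<timestamp_col>, A.<self_feature>, B.<other_feature>
                #   FROM (<query_1>) AS A, (<query_2>) AS B
                #   WHERE A.<entity_col> = B.<entity_col> AND A.<timestamp_col> = B.<timestamp_col>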
                query = f"""
                SELECT {columns}
                FROM ({query_1.strip(";")}) AS A, ({query_2.strip(";")}) AS B
                WHERE {where_clause}
                """
                data_source = DataSource(name="{}_{}".format(self.data_source.name, other.data_source.name),
                                         source=query,
                                         description="Combined DataSource for {} and {}".format(
                                             self.data_source.name, other.data_source.name),
                                         timestamp_col_name=self.data_source.timestamp_col_name
                                         )
            else:
                # Same source query on both sides; reuse the existing DataSource.
                data_source = self.data_source

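            # The combined group, its entity, and (when newly built) its data
            # source are all named "<self_name>_<other_name>".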
            # Create new feature group.
            feature_group = FeatureGroup(name="{}_{}".format(self.name, other.name),
                                         features=self.features + effective_other_features,
                                         data_source=data_source,
                                         entity=Entity(name="{}_{}".format(self.name, other.name),
                                                       columns=self.entity.columns),
                                         description="Combined FeatureGroup for groups {} and {}.".format(
                                             self.name, other.name)
                                         )
            return feature_group

    @classmethod
    def from_query(cls, name, entity_columns, query, timestamp_col_name=None):
        """
        DESCRIPTION:
            Method to create a FeatureGroup from a query.

        PARAMETERS:
            name:
                Required Argument.
                Specifies the name of the FeatureGroup.
                Note:
                    * Entity and DataSource also get the same name as "name".
                      Users can change the name of the Entity or DataSource by
                      accessing the object from the FeatureGroup.
                Types: str.

            entity_columns:
                Required Argument.
                Specifies the column names for the Entity.
                Types: str or list of str.

            query:
                Required Argument.
                Specifies the query for the DataSource.
                Types: str.

            timestamp_col_name:
                Optional Argument.
                Specifies the name of the column in the query which
                holds the record creation time.
                Types: str

        RETURNS:
            FeatureGroup

        RAISES:
            None

        EXAMPLES:
            >>> load_example_data('dataframe', ['sales'])
            >>> df = DataFrame("sales")

            # Example 1: Create a FeatureGroup from the query 'SELECT * FROM SALES',
            #            using the 'accounts' column as entity and the 'datetime'
            #            column as "timestamp_col_name".
            >>> from teradataml import FeatureGroup
            >>> query = 'SELECT * FROM SALES'
            >>> fg = FeatureGroup.from_query(
            ...     name='sales',
            ...     entity_columns='accounts',
            ...     query=query,
            ...     timestamp_col_name='datetime'
            ... )
        """
        return cls.__create_feature_group(name, entity_columns, query, timestamp_col_name)

    @classmethod
    def from_DataFrame(cls, name, entity_columns, df, timestamp_col_name=None):
        """
        DESCRIPTION:
            Method to create a FeatureGroup from a teradataml DataFrame.

        PARAMETERS:
            name:
                Required Argument.
                Specifies the name of the FeatureGroup.
                Note:
                    * Entity and DataSource also get the same name as "name".
                      Users can change the name of the Entity or DataSource by
                      accessing the object from the FeatureGroup.
                Types: str.

            entity_columns:
                Required Argument.
                Specifies the column names for the Entity.
                Types: str or list of str.

            df:
                Required Argument.
                Specifies the teradataml DataFrame for creating the DataSource.
                Types: teradataml DataFrame.

            timestamp_col_name:
                Optional Argument.
                Specifies the name of the column in the DataFrame which
                holds the record creation time.
                Types: str

        RETURNS:
            FeatureGroup

        RAISES:
            None

        EXAMPLES:
            >>> load_example_data('dataframe', ['sales'])
            >>> df = DataFrame("sales")

            # Example 1: Create a FeatureGroup from the DataFrame created on the
            #            'sales' table, using the 'accounts' column as entity and
            #            the 'datetime' column as "timestamp_col_name".
            >>> from teradataml import FeatureGroup
            >>> df = DataFrame("sales")
            >>> fg = FeatureGroup.from_DataFrame(
            ...     name='sales',
            ...     entity_columns='accounts',
            ...     df=df,
            ...     timestamp_col_name='datetime'
            ... )
        """
        return cls.__create_feature_group(name, entity_columns, df, timestamp_col_name)

    @classmethod
    def __create_feature_group(cls, name, entity_columns, obj, timestamp_col_name=None):
        """
        DESCRIPTION:
            Internal method to create a FeatureGroup from either a DataFrame
            or a query.

        PARAMETERS:
            name:
                Required Argument.
                Specifies the name of the FeatureGroup.
                Types: str.

            entity_columns:
                Required Argument.
                Specifies the column names for the Entity.
                Types: str or list of str.

            obj:
                Required Argument.
                Specifies either a teradataml DataFrame or a query for creating
                the DataSource.
                Types: teradataml DataFrame OR str.

            timestamp_col_name:
                Optional Argument.
                Specifies the name of the column in the query or DataFrame which
                holds the record creation time.
                Types: str

        RETURNS:
            FeatureGroup

        RAISES:
            None

        EXAMPLES:
            >>> load_example_data('dataframe', ['sales'])
            >>> df = DataFrame("sales")

            # Example 1: Create a FeatureGroup from the DataFrame created on the
            #            'sales' table, using the 'accounts' column as entity and
            #            the 'datetime' column as "timestamp_col_name".
            >>> from teradataml import FeatureGroup
            >>> df = DataFrame("sales")
            >>> fg = FeatureGroup.__create_feature_group(
            ...     name='sales',
            ...     entity_columns='accounts',
            ...     obj=df,
            ...     timestamp_col_name='datetime'
            ... )
        """
        # Check the caller and decide the type of 'obj'.
        is_obj_dataframe = False
        if inspect.stack()[1][3] == 'from_DataFrame':
            is_obj_dataframe = True

        # Perform the function validations.
        argument_validation_params = []
        argument_validation_params.append(["name", name, False, str, True])
        argument_validation_params.append(["entity_columns", entity_columns, False, (str, list), True])
        argument_validation_params.append(["timestamp_col_name", timestamp_col_name, True, str, True])
        param = ["df", obj, False, DataFrame, True] if is_obj_dataframe else ["query", obj, False, str, True]
        argument_validation_params.append(param)
        # Validate argument types.
        _Validators._validate_function_arguments(argument_validation_params)

        df = obj if is_obj_dataframe else DataFrame.from_query(obj)
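        # Every column that is neither an entity column nor the timestamp
        # column becomes a Feature of the group.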
        features = [Feature(name=col, column=df[col]) for col in df.columns if (
            col not in UtilFuncs._as_list(entity_columns) and col != timestamp_col_name)
        ]
        data_source = DataSource(
            name=name,
            source=df.show_query(),
            timestamp_col_name=timestamp_col_name
        )
        entity = Entity(name=name, columns=entity_columns)
        fg = FeatureGroup(
            name=name,
            features=features,
            data_source=data_source,
            entity=entity
        )
        return fg
|