teradataml 20.0.0.2__py3-none-any.whl → 20.0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of teradataml has been flagged as potentially problematic; see the registry's advisory page for more details.

Files changed (88):
  1. teradataml/LICENSE-3RD-PARTY.pdf +0 -0
  2. teradataml/README.md +196 -2
  3. teradataml/__init__.py +4 -0
  4. teradataml/_version.py +1 -1
  5. teradataml/analytics/analytic_function_executor.py +79 -4
  6. teradataml/analytics/json_parser/metadata.py +12 -3
  7. teradataml/analytics/json_parser/utils.py +7 -2
  8. teradataml/analytics/sqle/__init__.py +1 -0
  9. teradataml/analytics/table_operator/__init__.py +1 -1
  10. teradataml/analytics/uaf/__init__.py +1 -1
  11. teradataml/analytics/utils.py +4 -0
  12. teradataml/automl/data_preparation.py +3 -2
  13. teradataml/automl/feature_engineering.py +15 -7
  14. teradataml/automl/model_training.py +39 -33
  15. teradataml/common/__init__.py +2 -1
  16. teradataml/common/constants.py +35 -0
  17. teradataml/common/garbagecollector.py +2 -1
  18. teradataml/common/messagecodes.py +8 -2
  19. teradataml/common/messages.py +3 -1
  20. teradataml/common/sqlbundle.py +25 -3
  21. teradataml/common/utils.py +134 -9
  22. teradataml/context/context.py +20 -10
  23. teradataml/data/SQL_Fundamentals.pdf +0 -0
  24. teradataml/data/dataframe_example.json +18 -2
  25. teradataml/data/docs/sqle/docs_17_20/NaiveBayes.py +1 -1
  26. teradataml/data/docs/sqle/docs_17_20/Shap.py +7 -1
  27. teradataml/data/docs/sqle/docs_17_20/TDNaiveBayesPredict.py +4 -4
  28. teradataml/data/docs/sqle/docs_17_20/TextParser.py +3 -3
  29. teradataml/data/docs/tableoperator/docs_17_20/Image2Matrix.py +118 -0
  30. teradataml/data/docs/uaf/docs_17_20/CopyArt.py +145 -0
  31. teradataml/data/docs/uaf/docs_17_20/DickeyFuller.py +18 -21
  32. teradataml/data/jsons/sqle/17.20/TD_TextParser.json +1 -1
  33. teradataml/data/jsons/sqle/20.00/TD_KMeans.json +250 -0
  34. teradataml/data/jsons/sqle/20.00/TD_SMOTE.json +266 -0
  35. teradataml/data/jsons/sqle/20.00/TD_VectorDistance.json +278 -0
  36. teradataml/data/jsons/storedprocedure/17.20/TD_COPYART.json +71 -0
  37. teradataml/data/jsons/tableoperator/17.20/IMAGE2MATRIX.json +53 -0
  38. teradataml/data/jsons/uaf/17.20/TD_DICKEY_FULLER.json +10 -19
  39. teradataml/data/jsons/uaf/17.20/TD_SAX.json +3 -1
  40. teradataml/data/jsons/uaf/17.20/TD_WINDOWDFFT.json +15 -5
  41. teradataml/data/medical_readings.csv +101 -0
  42. teradataml/data/patient_profile.csv +101 -0
  43. teradataml/data/scripts/lightgbm/dataset.template +157 -0
  44. teradataml/data/scripts/lightgbm/lightgbm_class_functions.template +247 -0
  45. teradataml/data/scripts/lightgbm/lightgbm_function.template +216 -0
  46. teradataml/data/scripts/lightgbm/lightgbm_sklearn.template +159 -0
  47. teradataml/data/scripts/sklearn/sklearn_fit.py +194 -167
  48. teradataml/data/scripts/sklearn/sklearn_fit_predict.py +136 -115
  49. teradataml/data/scripts/sklearn/sklearn_function.template +14 -19
  50. teradataml/data/scripts/sklearn/sklearn_model_selection_split.py +155 -137
  51. teradataml/data/scripts/sklearn/sklearn_transform.py +129 -42
  52. teradataml/data/target_udt_data.csv +8 -0
  53. teradataml/data/templates/open_source_ml.json +3 -2
  54. teradataml/data/vectordistance_example.json +4 -0
  55. teradataml/dataframe/dataframe.py +543 -175
  56. teradataml/dataframe/functions.py +553 -25
  57. teradataml/dataframe/sql.py +184 -15
  58. teradataml/dbutils/dbutils.py +556 -18
  59. teradataml/dbutils/filemgr.py +48 -1
  60. teradataml/lib/aed_0_1.dll +0 -0
  61. teradataml/opensource/__init__.py +1 -1
  62. teradataml/opensource/{sklearn/_class.py → _class.py} +102 -17
  63. teradataml/opensource/_lightgbm.py +950 -0
  64. teradataml/opensource/{sklearn/_wrapper_utils.py → _wrapper_utils.py} +1 -2
  65. teradataml/opensource/{sklearn/constants.py → constants.py} +13 -10
  66. teradataml/opensource/sklearn/__init__.py +0 -1
  67. teradataml/opensource/sklearn/_sklearn_wrapper.py +798 -438
  68. teradataml/options/__init__.py +7 -23
  69. teradataml/options/configure.py +29 -3
  70. teradataml/scriptmgmt/UserEnv.py +3 -3
  71. teradataml/scriptmgmt/lls_utils.py +74 -21
  72. teradataml/store/__init__.py +13 -0
  73. teradataml/store/feature_store/__init__.py +0 -0
  74. teradataml/store/feature_store/constants.py +291 -0
  75. teradataml/store/feature_store/feature_store.py +2223 -0
  76. teradataml/store/feature_store/models.py +1505 -0
  77. teradataml/store/vector_store/__init__.py +1586 -0
  78. teradataml/table_operators/query_generator.py +3 -0
  79. teradataml/table_operators/table_operator_query_generator.py +3 -1
  80. teradataml/table_operators/table_operator_util.py +37 -38
  81. teradataml/table_operators/templates/dataframe_register.template +69 -0
  82. teradataml/utils/dtypes.py +4 -2
  83. teradataml/utils/validators.py +33 -1
  84. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/METADATA +200 -5
  85. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/RECORD +88 -65
  86. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/WHEEL +0 -0
  87. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/top_level.txt +0 -0
  88. {teradataml-20.0.0.2.dist-info → teradataml-20.0.0.3.dist-info}/zip-safe +0 -0
@@ -0,0 +1,1505 @@
1
+ """
2
+ Copyright (c) 2024 by Teradata Corporation. All rights reserved.
3
+ TERADATA CORPORATION CONFIDENTIAL AND TRADE SECRET
4
+
5
+ Primary Owner: pradeep.garre@teradata.com
6
+ Secondary Owner: adithya.avvaru@teradata.com
7
+
8
+ This file implements the models required for Teradata Enterprise Feature Store.
9
+ """
10
+
11
+ from collections import OrderedDict
12
+ from datetime import datetime as dt
13
+ from teradatasqlalchemy import types as tdtypes
14
+ from teradataml.common.exceptions import TeradataMlException
15
+ from teradataml.common.messages import Messages
16
+ from teradataml.common.messagecodes import MessageCodes
17
+ from teradataml.common.utils import UtilFuncs
18
+ from teradataml.dataframe.dataframe import DataFrame, in_schema
19
+ from teradataml.dataframe.sql import _SQLColumnExpression
20
+ from teradataml.dbutils.dbutils import db_transaction, _delete_data, execute_sql, _insert_data, _upsert_data
21
+ from teradataml.store.feature_store.constants import *
22
+ from teradataml.utils.validators import _Validators
23
+ import inspect
24
+
25
+
26
class Feature:
    """Represents a single Feature of the Teradata Enterprise Feature Store."""

    def __init__(self,
                 name,
                 column,
                 feature_type=FeatureType.CONTINUOUS,
                 description=None,
                 tags=None,
                 status=FeatureStatus.ACTIVE):
        """
        DESCRIPTION:
            Constructor for Feature.

        PARAMETERS:
            name:
                Required Argument.
                Specifies the unique name of the Feature.
                Types: str

            column:
                Required Argument.
                Specifies the teradataml DataFrame Column. Only the column's
                name and type are recorded on the Feature.
                Types: teradataml DataFrame Column

            feature_type:
                Optional Argument.
                Specifies whether the feature is continuous or discrete.
                Default Value: FeatureType.CONTINUOUS
                Types: FeatureType Enum

            description:
                Optional Argument.
                Specifies human readable description for Feature.
                Types: str

            tags:
                Optional Argument.
                Specifies the tags for Feature.
                Types: str OR list of str

            status:
                Optional Argument.
                Specifies whether feature is archived or active.
                Default Value: FeatureStatus.ACTIVE
                Types: FeatureStatus Enum

        RETURNS:
            None.

        RAISES:
            None

        EXAMPLES:
            >>> from teradataml import DataFrame, Feature, FeatureType, load_example_data
            >>> load_example_data("dataframe", "sales")
            >>> df = DataFrame("sales")
            >>> feature = Feature('sales_Feb', column=df.Feb, feature_type=FeatureType.CATEGORICAL)
            >>> feature
            Feature(name=sales_Feb)
        """
        self.name = name
        self.column_name = column.name
        self.description = description
        # Normalise tags to a list; an empty/None value is stored as None.
        self.tags = UtilFuncs._as_list(tags) if tags else None
        self.data_type = column.type
        self.feature_type = feature_type
        self.status = status

    @classmethod
    def _from_df(cls, df):
        """
        DESCRIPTION:
            Internal method to create object(s) of Feature from a teradataml
            DataFrame holding Feature details.

        PARAMETERS:
            df:
                Required Argument.
                Specifies teradataml DataFrame which has Feature details.
                Types: teradataml DataFrame

        RETURNS:
            Feature OR list of Feature.
            A single Feature is returned when the DataFrame has one row.

        RAISES:
            None

        EXAMPLES:
            >>> Feature._from_df(df)
        """
        features = []
        for row in df.itertuples():
            rec = row._asdict()
            # Drop bookkeeping columns that are not Feature constructor args.
            for unused in ("creation_time", "modified_time", "group_name"):
                rec.pop(unused)
            # Rebuild the column expression from the stored name and type.
            rec["column"] = _SQLColumnExpression(
                rec.pop("column_name"),
                type=getattr(tdtypes, rec.pop("data_type"))())
            if rec["feature_type"] == FeatureType.CONTINUOUS.name:
                rec["feature_type"] = FeatureType.CONTINUOUS
            else:
                rec["feature_type"] = FeatureType.CATEGORICAL
            if rec["status"] == FeatureStatus.ACTIVE.name:
                rec["status"] = FeatureStatus.ACTIVE
            else:
                rec["status"] = FeatureStatus.INACTIVE
            features.append(cls(**rec))

        return features if len(features) > 1 else features[0]

    def __repr__(self):
        """
        DESCRIPTION:
            String representation for Feature object.

        RETURNS:
            str

        RAISES:
            None
        """
        return f"Feature(name={self.name})"

    def publish(self, repo):
        """
        DESCRIPTION:
            Method to publish the Feature details to repository. Inserts a new
            row keyed on the Feature name, or updates the existing one.

        PARAMETERS:
            repo:
                Required Argument.
                Specifies the name of the repository to publish the Feature details.
                Types: str

        RETURNS:
            bool.

        RAISES:
            TeradataMlException

        EXAMPLES:
            >>> load_example_data('dataframe', ['sales'])
            >>> df = DataFrame("sales")
            >>> from teradataml import Feature
            >>> feature = Feature('sales:Feb', df.Feb)
            >>> feature.publish('vfs_test')
            True
        """
        joined_tags = ", ".join(self.tags) if self.tags else None
        insert_values = OrderedDict([
            ('name', self.name),
            ('column_name', self.column_name),
            ('description', self.description),
            ('creation_time', dt.utcnow()),
            ('tags', joined_tags),
            ('data_type', str(self.data_type)),
            ('feature_type', self.feature_type.name),
            ('status', self.status.name)])
        update_values = OrderedDict([
            ('column_name', self.column_name),
            ('description', self.description),
            ('modified_time', dt.utcnow()),
            ('tags', joined_tags),
            ('data_type', str(self.data_type)),
            ('feature_type', self.feature_type.name),
            ('status', self.status.name)])

        _upsert_data(schema_name=repo,
                     table_name=EFS_FEATURES_SPEC["table_name"],
                     insert_columns_values=insert_values,
                     upsert_conditions=OrderedDict([('name', self.name)]),
                     update_columns_values=update_values)
        return True
226
+
227
class Entity:
    """Represents an Entity (key columns) of the Enterprise Feature Store."""

    def __init__(self, name, columns, description=None):
        """
        DESCRIPTION:
            Constructor for creating Entity Object.

        PARAMETERS:
            name:
                Required Argument.
                Specifies the unique name of the entity.
                Types: str

            columns:
                Required Argument.
                Specifies the names of the columns.
                Types: teradataml DataFrame Column OR list of teradataml DataFrame Columns.

            description:
                Optional Argument.
                Specifies human readable description for Entity.
                Types: str

        RETURNS:
            Object of Entity.

        RAISES:
            None

        EXAMPLES:
            >>> load_example_data('dataframe', ['sales'])
            >>> df = DataFrame("sales")
            >>> from teradataml import Entity
            >>> entity = Entity('sales_accounts', df.accounts)
            >>> entity
            Entity(name=sales_accounts)
        """
        self.name = name
        # Store bare column names; accept plain strings or column expressions.
        self.columns = [c if isinstance(c, str) else c.name
                        for c in UtilFuncs._as_list(columns)]
        self.description = description

    @classmethod
    def _from_df(cls, df):
        """
        DESCRIPTION:
            Internal method to create object of Entity from DataFrame.
            Every row contributes one entity column; name/description are
            taken from the (shared) row values.

        PARAMETERS:
            df:
                Required Argument.
                Specifies teradataml DataFrame which has details for Entity.
                Types: teradataml DataFrame

        RETURNS:
            Entity

        RAISES:
            None

        EXAMPLES:
            >>> Entity._from_df(df)
        """
        entity_name, entity_description = None, None
        entity_columns = []
        for row in df.itertuples():
            entity_name = row.name
            entity_description = row.description
            entity_columns.append(row.entity_column)

        return cls(name=entity_name,
                   description=entity_description,
                   columns=entity_columns)

    def __repr__(self):
        """
        DESCRIPTION:
            String representation for Entity object.

        RETURNS:
            str

        RAISES:
            None
        """
        return f"Entity(name={self.name})"

    @db_transaction
    def publish(self, repo):
        """
        DESCRIPTION:
            Method to publish the Entity details to repository. Runs inside a
            single DB transaction: upserts the entity row, then rebuilds the
            entity-to-column xref rows.

        PARAMETERS:
            repo:
                Required Argument.
                Specifies the name of the repository to publish the Entity details.
                Types: str

        RETURNS:
            bool.

        RAISES:
            TeradataMlException

        EXAMPLES:
            >>> load_example_data('dataframe', ['sales'])
            >>> df = DataFrame("sales")
            >>> from teradataml import Entity
            >>> entity = Entity('sales:accounts', 'accounts')
            >>> entity.publish('vfs_test')
            True
        """
        # Upsert the entity record itself.
        _upsert_data(schema_name=repo,
                     table_name=EFS_ENTITY_SPEC["table_name"],
                     insert_columns_values=OrderedDict([
                         ('name', self.name),
                         ('description', self.description),
                         ('creation_time', dt.utcnow())]),
                     upsert_conditions=OrderedDict([('name', self.name)]),
                     update_columns_values=OrderedDict([
                         ('description', self.description),
                         ('modified_time', dt.utcnow())]))

        # Refresh the xref table: remove existing rows for this entity,
        # then insert one row per column.
        _delete_data(schema_name=repo,
                     table_name=EFS_ENTITY_XREF_SPEC["table_name"],
                     delete_conditions=_SQLColumnExpression("entity_name") == self.name)

        rows = [(self.name, column) for column in self.columns]
        _insert_data(EFS_ENTITY_XREF_SPEC["table_name"], rows, schema_name=repo)

        return True

    def __eq__(self, other):
        """
        DESCRIPTION:
            Compare this Entity with another Entity. Two entities are
            considered equal when their column sets match.

        PARAMETERS:
            other:
                Required Argument.
                Specifies another Entity.
                Types: Entity

        RETURNS:
            bool

        RAISES:
            None

        EXAMPLES:
            >>> Entity('sales:accounts', 'accounts') == Entity('sales:accounts', 'accounts')
            True
        """
        return isinstance(other, Entity) and set(self.columns) == set(other.columns)
409
+
410
class DataSource:
    """Represents a DataSource (a query) of the Enterprise Feature Store."""

    def __init__(self, name, source, description=None, timestamp_col_name=None):
        """
        DESCRIPTION:
            Constructor for creating DataSource Object.

        PARAMETERS:
            name:
                Required Argument.
                Specifies the unique name of the DataSource.
                Types: str

            source:
                Required Argument.
                Specifies the source query of DataSource. A DataFrame is
                converted to its underlying SQL query.
                Types: str OR teradataml DataFrame

            description:
                Optional Argument.
                Specifies human readable description for DataSource.
                Types: str

            timestamp_col_name:
                Optional Argument.
                Specifies the timestamp column indicating when the row was created.
                Types: str

        RETURNS:
            Object of DataSource.

        RAISES:
            None

        EXAMPLES:
            >>> load_example_data('dataframe', ['sales'])
            >>> df = DataFrame("sales")
            >>> from teradataml import DataSource
            >>> data_source = DataSource('Sales_Data', df)
            >>> data_source
            DataSource(name=Sales_Data)
        """
        self.name = name
        self.timestamp_col_name = timestamp_col_name
        if isinstance(source, str):
            self.source = source
        else:
            # A teradataml DataFrame: persist its SQL text.
            self.source = source.show_query()
        self.description = description

    @classmethod
    def _from_df(cls, df):
        """
        DESCRIPTION:
            Internal method to create object of DataSource from a DataFrame
            holding a single DataSource record.

        PARAMETERS:
            df:
                Required Argument.
                Specifies teradataml DataFrame which has a single record
                denoting DataSource.
                Types: teradataml DataFrame

        RETURNS:
            DataSource

        RAISES:
            None

        EXAMPLES:
            >>> DataSource._from_df(df)
        """
        record = next(df.itertuples())._asdict()
        # Bookkeeping columns are not constructor arguments.
        for key in ("creation_time", "modified_time"):
            record.pop(key)
        return cls(**record)

    def __repr__(self):
        """
        DESCRIPTION:
            String representation for DataSource object.

        RETURNS:
            str

        RAISES:
            None
        """
        return f"DataSource(name={self.name})"

    def publish(self, repo):
        """
        DESCRIPTION:
            Method to publish the DataSource details to repository. Inserts a
            new row keyed on the DataSource name, or updates the existing one.

        PARAMETERS:
            repo:
                Required Argument.
                Specifies the name of the repository to publish the DataSource details.
                Types: str

        RETURNS:
            bool.

        RAISES:
            TeradataMlException

        EXAMPLES:
            >>> load_example_data('dataframe', ['sales'])
            >>> df = DataFrame("sales")
            >>> from teradataml import DataSource
            >>> data_source = DataSource('Sales_Data', df)
            >>> data_source.publish('vfs_test')
            True
        """
        insert_values = OrderedDict([
            ('name', self.name),
            ('description', self.description),
            ('timestamp_col_name', self.timestamp_col_name),
            ('source', self.source),
            ('creation_time', dt.utcnow())])
        update_values = OrderedDict([
            ('description', self.description),
            ('timestamp_col_name', self.timestamp_col_name),
            ('modified_time', dt.utcnow()),
            ('source', self.source)])

        _upsert_data(schema_name=repo,
                     table_name=EFS_DATA_SOURCE_SPEC["table_name"],
                     insert_columns_values=insert_values,
                     upsert_conditions={"name": self.name},
                     update_columns_values=update_values)
        return True
559
+
560
+
561
+ class FeatureGroup:
562
+ """Class for FeatureGroup. """
563
+ def __init__(self, name, features, entity, data_source, description=None):
564
+ """
565
+ DESCRIPTION:
566
+ Constructor for creating FeatureGroup Object.
567
+
568
+ PARAMETERS:
569
+ name:
570
+ Required Argument.
571
+ Specifies the unique name of the FeatureGroup.
572
+ Types: str.
573
+
574
+ features:
575
+ Required Argument.
576
+ Specifies the features required to create a group.
577
+ Types: Feature or list of Feature.
578
+
579
+ entity:
580
+ Required Argument.
581
+ Specifies the entity associated with corresponding features.
582
+ Types: Entity
583
+
584
+ data_source:
585
+ Required Argument.
586
+ Specifies the DataSource associated with Features.
587
+ Types: str
588
+
589
+ description:
590
+ Optional Argument.
591
+ Specifies human readable description for DataSource.
592
+ Types: str
593
+
594
+ RETURNS:
595
+ Object of FeatureGroup.
596
+
597
+ RAISES:
598
+ None
599
+
600
+ EXAMPLES:
601
+ >>> load_example_data('dataframe', ['sales'])
602
+ >>> df = DataFrame("sales")
603
+
604
+ # Example 1: create a FeatureGroup for above mentioned DataFrame.
605
+ >>> # First create the features.
606
+ >>> jan_feature = Feature("sales:Jan", df.Jan)
607
+ >>> feb_feature = Feature("sales:Fan", df.Feb)
608
+ >>> mar_feature = Feature("sales:Mar", df.Mar)
609
+ >>> apr_feature = Feature("sales:Apr", df.Apr)
610
+ >>> # Create Entity.
611
+ >>> entity = Entity("sales:accounts", df.accounts)
612
+ >>> # Create DataSource
613
+ >>> data_source = DataSource("sales_source", df.show_query())
614
+ >>> # Create FeatureGroup.
615
+ >>> fg = FeatureGroup('Sales',
616
+ ... features=[jan_feature, feb_feature, mar_feature, apr_feature],
617
+ ... entity=entity,
618
+ ... data_source=data_source)
619
+ """
620
+ self.name = name
621
+ self.features = UtilFuncs._as_list(features)
622
+ self.entity = entity
623
+ self.data_source = data_source
624
+ self.description = description
625
+ self.__redundant_features = []
626
+ self._labels = []
627
+
628
+ @property
629
+ def features(self):
630
+ """
631
+ DESCRIPTION:
632
+ Get's the features from FeatureGroup.
633
+
634
+ PARAMETERS:
635
+ None
636
+
637
+ RETURNS:
638
+ list
639
+
640
+ RAISES:
641
+ None
642
+
643
+ EXAMPLES:
644
+ >>> from teradataml import DataSource, Entity, Feature, FeatureGroup, load_example_data
645
+ >>> load_example_data("dataframe", "sales")
646
+ >>> # Let's create DataFrame first.
647
+ >>> df = DataFrame("sales")
648
+ >>> # create the features.
649
+ >>> jan_feature = Feature("sales:Jan", df.Jan)
650
+ >>> feb_feature = Feature("sales:Fan", df.Feb)
651
+ >>> mar_feature = Feature("sales:Mar", df.Mar)
652
+ >>> apr_feature = Feature("sales:Apr", df.Apr)
653
+ >>> # Create Entity.
654
+ >>> entity = Entity("sales:accounts", df.accounts)
655
+ >>> # Create DataSource
656
+ >>> data_source = DataSource("sales_source", df)
657
+ >>> # Create FeatureGroup.
658
+ >>> fg = FeatureGroup('Sales',
659
+ ... features=[jan_feature, feb_feature, mar_feature, apr_feature],
660
+ ... entity=entity,
661
+ ... data_source=data_source)
662
+
663
+ # Get the features from FeatureGroup
664
+ >>> fg.features
665
+ [Feature(name=sales:Jan), Feature(name=sales:Fan), Feature(name=sales:Mar), Feature(name=sales:Apr)]
666
+ >>>
667
+ """
668
+ return [feature for feature in self._features if feature.name not in self._labels]
669
+
670
+ @property
671
+ def labels(self):
672
+ """
673
+ DESCRIPTION:
674
+ Get's the labels from FeatureGroup.
675
+ Note:
676
+ Use this function only after setting the labels using "set_labels".
677
+
678
+ PARAMETERS:
679
+ None
680
+
681
+ RETURNS:
682
+ Feature OR list
683
+
684
+ RAISES:
685
+ None
686
+
687
+ EXAMPLES:
688
+ >>> from teradataml import DataSource, Entity, Feature, FeatureGroup, load_example_data
689
+ >>> load_example_data("dataframe", "admissions_train")
690
+ >>> # Let's create DataFrame first.
691
+ >>> df = DataFrame("admissions_train")
692
+ >>> # create the features.
693
+ >>> masters_feature = Feature("masters", df.masters)
694
+ >>> gpa_feature = Feature("gpa", df.gpa)
695
+ >>> stats_feature = Feature("stats", df.stats)
696
+ >>> admitted_feature = Feature("admitted", df.admitted)
697
+ >>> # Create Entity.
698
+ >>> entity = Entity("id", df.id)
699
+ >>> # Create DataSource
700
+ >>> data_source = DataSource("admissions_source", df)
701
+ >>> # Create FeatureGroup.
702
+ >>> fg = FeatureGroup('Admissions',
703
+ ... features=[masters_feature, gpa_feature, stats_feature, admitted_feature],
704
+ ... entity=entity,
705
+ ... data_source=data_source)
706
+ >>> # Set feature 'admitted' as label.
707
+ >>> fg.set_labels('admitted')
708
+ True
709
+
710
+ # Get the labels from FeatureGroup
711
+ >>> fg.labels
712
+ Feature(name=admitted)
713
+ >>>
714
+ """
715
+ labels = [feature for feature in self._features if feature.name in self._labels]
716
+ if len(labels) == 1:
717
+ return labels[0]
718
+ return labels
719
+
720
+ @features.setter
721
+ def features(self, features):
722
+ """ Setter for features. """
723
+ self._features = UtilFuncs._as_list(features)
724
+ return True
725
+
726
+ def set_labels(self, labels):
727
+ """
728
+ DESCRIPTION:
729
+ Sets the labels for FeatureGroup.
730
+ This method is helpful, when working with analytic functions to consume the Features.
731
+ Note:
732
+ Label is for the current session only.
733
+
734
+ PARAMETERS:
735
+ labels:
736
+ Required Argument.
737
+ Specifies the name(s) of the features to refer as labels.
738
+ Types: str or list of str
739
+
740
+ RETURNS:
741
+ bool
742
+
743
+ RAISES:
744
+ None
745
+
746
+ EXAMPLES:
747
+ >>> from teradataml import DataSource, Entity, Feature, FeatureGroup, load_example_data
748
+ >>> load_example_data("dataframe", "admissions_train")
749
+ >>> # Let's create DataFrame first.
750
+ >>> df = DataFrame("admissions_train")
751
+ >>> # create the features.
752
+ >>> masters_feature = Feature("masters", df.masters)
753
+ >>> gpa_feature = Feature("gpa", df.gpa)
754
+ >>> stats_feature = Feature("stats", df.stats)
755
+ >>> admitted_feature = Feature("admitted", df.admitted)
756
+ >>> # Create Entity.
757
+ >>> entity = Entity("id", df.id)
758
+ >>> # Create DataSource
759
+ >>> data_source = DataSource("admissions_source", df)
760
+ >>> # Create FeatureGroup.
761
+ >>> fg = FeatureGroup('Admissions',
762
+ ... features=[masters_feature, gpa_feature, stats_feature, admitted_feature],
763
+ ... entity=entity,
764
+ ... data_source=data_source)
765
+
766
+ >>> # Set feature 'admitted' as label.
767
+ >>> fg.set_labels('admitted')
768
+ True
769
+ """
770
+ self._labels = [] if labels is None else UtilFuncs._as_list(labels)
771
+ return True
772
+
773
+ @labels.setter
774
+ def labels(self, labels):
775
+ """
776
+ DESCRIPTION:
777
+ Sets the labels for FeatureGroup.
778
+ This method is helpful, when working with analytic functions to consume the Features.
779
+ Note:
780
+ Label is for the current session only.
781
+
782
+ PARAMETERS:
783
+ labels:
784
+ Required Argument.
785
+ Specifies the name(s) of the features to refer as labels.
786
+ Types: str or list of str
787
+
788
+ RETURNS:
789
+ bool
790
+
791
+ RAISES:
792
+ None
793
+
794
+ EXAMPLES:
795
+ >>> from teradataml import DataSource, Entity, Feature, FeatureGroup, load_example_data
796
+ >>> load_example_data("dataframe", "admissions_train")
797
+ # Let's create DataFrame first.
798
+ >>> df = DataFrame("admissions_train")
799
+ # Create the features.
800
+ >>> masters_feature = Feature("masters", df.masters)
801
+ >>> gpa_feature = Feature("gpa", df.gpa)
802
+ >>> stats_feature = Feature("stats", df.stats)
803
+ >>> admitted_feature = Feature("admitted", df.admitted)
804
+ # Create Entity.
805
+ >>> entity = Entity("id", df.id)
806
+ # Create DataSource.
807
+ >>> data_source = DataSource("admissions_source", df)
808
+ # Create FeatureGroup.
809
+ >>> fg = FeatureGroup('Admissions',
810
+ ... features=[masters_feature, gpa_feature, stats_feature, admitted_feature],
811
+ ... entity=entity,
812
+ ... data_source=data_source)
813
+
814
+ # Set feature 'admitted' as label.
815
+ >>> fg.labels = 'admitted'
816
+ True
817
+ """
818
+ return self.set_labels(labels)
819
+
820
+ def reset_labels(self):
821
+ """
822
+ DESCRIPTION:
823
+ Resets the labels for FeatureGroup.
824
+
825
+ PARAMETERS:
826
+ None
827
+
828
+ RETURNS:
829
+ bool
830
+
831
+ RAISES:
832
+ None
833
+
834
+ EXAMPLES:
835
+ >>> from teradataml import DataSource, Entity, Feature, FeatureGroup, load_example_data
836
+ >>> load_example_data("dataframe", "admissions_train")
837
+ >>> # Let's create DataFrame first.
838
+ >>> df = DataFrame("admissions_train")
839
+ >>> # create the features.
840
+ >>> masters_feature = Feature("masters", df.masters)
841
+ >>> gpa_feature = Feature("gpa", df.gpa)
842
+ >>> stats_feature = Feature("stats", df.stats)
843
+ >>> admitted_feature = Feature("admitted", df.admitted)
844
+ >>> # Create Entity.
845
+ >>> entity = Entity("id", df.id)
846
+ >>> # Create DataSource
847
+ >>> data_source = DataSource("admissions_source", df)
848
+ >>> # Create FeatureGroup.
849
+ >>> fg = FeatureGroup('Admissions',
850
+ ... features=[masters_feature, gpa_feature, stats_feature, admitted_feature],
851
+ ... entity=entity,
852
+ ... data_source=data_source)
853
+ >>> # Set feature 'admitted' as label.
854
+ >>> fg.set_labels('admitted')
855
+ True
856
+
857
+ >>> # Remove the labels from FeatureGroup.
858
+ >>> fg.reset_labels()
859
+ True
860
+ >>>
861
+ """
862
+ self._labels = []
863
+ return True
864
+
865
+ def apply(self, object):
866
+ """
867
+ DESCRIPTION:
868
+ Register objects to FeatureGroup.
869
+
870
+ PARAMETERS:
871
+ object:
872
+ Required Argument.
873
+ Specifies the object to update the FeatureGroup.
874
+ Types: Feature OR DataSource OR Entity.
875
+
876
+ RETURNS:
877
+ bool.
878
+
879
+ RAISES:
880
+ TeradataMLException
881
+
882
+ EXAMPLES:
883
+ >>> load_example_data('dataframe', ['sales'])
884
+ >>> df = DataFrame("sales")
885
+ >>> # Create FeatureGroup to use it in examples.
886
+ >>> from teradataml import Feature, Entity, DataSource, FeatureGroup
887
+ >>> feature = Feature('sales:Feb', df.Feb)
888
+ >>> entity = Entity('sales:accounts', df.accounts)
889
+ >>> data_source = DataSource('Sales_Data', df)
890
+ >>> fg = FeatureGroup('Sales',
891
+ ... features=feature,
892
+ ... entity=entity,
893
+ ... data_source=data_source)
894
+
895
+ # Example 1: create a new Feature for column df.Mar and
896
+ # apply the feature to FeatueGroup.
897
+ >>> # Create Feature.
898
+ >>> feature = Feature('sales:Mar', df.Mar)
899
+ >>> # Register the above Feature with FeatureGroup.
900
+ >>> fg.apply(feature)
901
+ True
902
+ >>>
903
+ """
904
+ if isinstance(object, Feature):
905
+ # Before adding feature, check if already feature with
906
+ # the name exists or not.
907
+ feature_exists = [i for i in range(len(self._features)) if self._features[i].name == object.name]
908
+ if feature_exists:
909
+ self._features[feature_exists[0]] = object
910
+ else:
911
+ self._features.append(object)
912
+ elif isinstance(object, Entity):
913
+ self.entity = object
914
+ elif isinstance(object, DataSource):
915
+ self.data_source = object
916
+ else:
917
+ raise TeradataMlException(Messages.get_message(MessageCodes.UNSUPPORTED_DATATYPE,
918
+ 'object', "Feature or Entity or DataSource"),
919
+ MessageCodes.UNSUPPORTED_DATATYPE)
920
+
921
+ return True
922
+
923
+ def remove(self, object):
924
+ """
925
+ DESCRIPTION:
926
+ Method to remove the objects from FeatureGroup. One can use this
927
+ method to detach either Feature or DataSource or Entity from
928
+ FeatureGroup. Much useful to remove existing Features from
929
+ FeatureGroup.
930
+
931
+ PARAMETERS:
932
+ object:
933
+ Required Argument.
934
+ Specifies the object to be removed from FeatureGroup.
935
+ Types: Feature OR Entity OR DataSource OR FeatureGroup.
936
+
937
+ RETURNS:
938
+ bool.
939
+
940
+ RAISES:
941
+ TeradataMlException
942
+
943
+ EXAMPLES:
944
+ >>> load_example_data('dataframe', ['sales'])
945
+ >>> df = DataFrame("sales")
946
+ >>> # First create the features.
947
+ >>> jan_feature = Feature("sales:Jan", df.Jan)
948
+ >>> feb_feature = Feature("sales:Fan", df.Feb)
949
+ >>> mar_feature = Feature("sales:Mar", df.Mar)
950
+ >>> apr_feature = Feature("sales:Jan", df.Apr)
951
+ >>> # Create Entity.
952
+ >>> entity = Entity("sales:accounts", df.accounts)
953
+ >>> # Create DataSource
954
+ >>> data_source = DataSource("sales_source", df.show_query())
955
+ >>> # Create FeatureGroup.
956
+ >>> fg = FeatureGroup('Sales',
957
+ ... features=[jan_feature, feb_feature, mar_feature],
958
+ ... entity=entity,
959
+ ... data_source=data_source)
960
+
961
+ # Example: Remove the Feature with name "sales:Feb" from FeatureGroup.
962
+ >>> fg.remove(feb_feature)
963
+ True
964
+ >>>
965
+ """
966
+ get_msg = lambda object: "{} '{}' is not associated with FeatureGroup.".format(
967
+ object.__class__.__name__, object.name)
968
+
969
+ if isinstance(object, Feature):
970
+ # Find the position of feature first, then pop it.
971
+ index = [i for i in range(len(self._features)) if self._features[i].name == object.name]
972
+ if index:
973
+ self.__redundant_features.append(self._features.pop(index[0]))
974
+ else:
975
+ print(get_msg(object))
976
+ return False
977
+ elif isinstance(object, DataSource):
978
+ if self.data_source.name == object.name:
979
+ self.data_source = None
980
+ else:
981
+ print(get_msg(object))
982
+ return False
983
+ elif isinstance(object, Entity):
984
+ if self.entity.name == object.name:
985
+ self.entity = None
986
+ else:
987
+ print(get_msg(object))
988
+ return False
989
+ else:
990
+ raise TeradataMlException(Messages.get_message(MessageCodes.UNSUPPORTED_DATATYPE,
991
+ 'object', "Feature or Entity or DataSource"),
992
+ MessageCodes.UNSUPPORTED_DATATYPE)
993
+ return True
994
+
995
+ @classmethod
996
+ def _from_df(cls, df, repo, features_df, entity_df, data_source_df):
997
+ """
998
+ DESCRIPTION:
999
+ Internal method to create object of FeatureGroup from DataFrame.
1000
+
1001
+ PARAMETERS:
1002
+ df:
1003
+ Required Argument.
1004
+ Specifies teradataml DataFrame which has a single
1005
+ record denoting FeatureGroup.
1006
+ Types: teradataml DataFrame.
1007
+
1008
+ repo:
1009
+ Required Argument.
1010
+ Specifies the repo name of FeatureStore.
1011
+ Types: str
1012
+
1013
+ features_df:
1014
+ Required Argument.
1015
+ Specifies teradataml DataFrame which has features.
1016
+ Types: teradataml DataFrame.
1017
+
1018
+ entity_df:
1019
+ Required Argument.
1020
+ Specifies teradataml DataFrame which has entities.
1021
+ Types: teradataml DataFrame.
1022
+
1023
+ data_source_df:
1024
+ Required Argument.
1025
+ Specifies teradataml DataFrame which has data sources.
1026
+ Types: teradataml DataFrame.
1027
+
1028
+ RETURNS:
1029
+ FeatureGroup
1030
+
1031
+ RAISES:
1032
+ None
1033
+
1034
+ EXAMPLES:
1035
+ >>> FeatureGroup._from_df(df, "repo", features_df, entity_df, data_source_df)
1036
+ """
1037
+ rec = next(df.itertuples())._asdict()
1038
+
1039
+ # Select active features.
1040
+ features_df = features_df[features_df.status != FeatureStatus.INACTIVE.name]
1041
+ req_features_df = features_df[features_df.group_name == rec["name"]]
1042
+
1043
+ features = Feature._from_df(req_features_df)
1044
+ entity = Entity._from_df(entity_df[entity_df.name==rec['entity_name']])
1045
+ data_source = DataSource._from_df(data_source_df[data_source_df.name==rec['data_source_name']])
1046
+
1047
+ return cls(name=rec["name"], features=features, entity=entity, data_source=data_source, description=rec["description"])
1048
+
1049
+ def __repr__(self):
1050
+ """
1051
+ DESCRIPTION:
1052
+ String representation for FeatureGroup object.
1053
+
1054
+ PARAMETERS:
1055
+ None
1056
+
1057
+ RETURNS:
1058
+ str
1059
+
1060
+ RAISES:
1061
+ None
1062
+ """
1063
+ return "FeatureGroup({}, features=[{}], entity={}, data_source={})".format(
1064
+ self.name, ", ".join((str(feature) for feature in self.features)), self.entity, self.data_source)
1065
+
1066
+ @db_transaction
1067
+ def publish(self, repo):
1068
+ """
1069
+ DESCRIPTION:
1070
+ Method to publish the FeatureGroup details to repository.
1071
+
1072
+ PARAMETERS:
1073
+ repo:
1074
+ Required Argument.
1075
+ Specifies the name of the repository to publish the FeatureGroup details.
1076
+ Types: str.
1077
+
1078
+ RETURNS:
1079
+ bool.
1080
+
1081
+ RAISES:
1082
+ TeradataMlException
1083
+
1084
+ EXAMPLES:
1085
+ >>> load_example_data('dataframe', ['sales'])
1086
+ >>> df = DataFrame("sales")
1087
+
1088
+ # Example 1: create a FeatureGroup 'sales_data_fg' for above mentioned
1089
+ # DataFrame and publish it to 'vfs_v1'.
1090
+ >>> # First create the features.
1091
+ >>> jan_feature = Feature("sales:Jan", df.Jan)
1092
+ >>> feb_feature = Feature("sales:Fan", df.Feb)
1093
+ >>> mar_feature = Feature("sales:Mar", df.Mar)
1094
+ >>> apr_feature = Feature("sales:Jan", df.Apr)
1095
+ >>> # Create Entity.
1096
+ >>> entity = Entity("sales:accounts", df.accounts)
1097
+ >>> # Create DataSource
1098
+ >>> data_source = DataSource("sales_source", df.show_query())
1099
+ >>> # Create FeatureGroup.
1100
+ >>> fg = FeatureGroup('Sales',
1101
+ ... features=[jan_feature, feb_feature, mar_feature],
1102
+ ... entity=entity,
1103
+ ... data_source=data_source)
1104
+ >>> feature_group.publish('vfs_v1')
1105
+
1106
+ # Example 2: Republish the FeatureGroup published in example1 with
1107
+ # updated description.
1108
+ >>> # First, Get the existing FeatureGroup.
1109
+ >>> from teradataml import FeatureStore
1110
+ >>> fg = FeatureStore('vfs_test').get_feature_group('Sales')
1111
+ >>> # Update it's description.
1112
+ >>> fg.description = "Feature group for Sales."
1113
+ >>> # Republish the details to same repo.
1114
+ >>> fg.publish('vfs_v1')
1115
+ """
1116
+
1117
+ # Do not publish if any of required associated parameter does not exist.
1118
+ message = "FeatureGroup can not be published with out {}"
1119
+ if not self.features:
1120
+ raise TeradataMlException(Messages.get_message(
1121
+ MessageCodes.FUNC_EXECUTION_FAILED, 'publish', message.format("Features")),
1122
+ MessageCodes.FUNC_EXECUTION_FAILED)
1123
+
1124
+ if not self.data_source:
1125
+ raise TeradataMlException(Messages.get_message(
1126
+ MessageCodes.FUNC_EXECUTION_FAILED, 'publish', message.format("DataSource")),
1127
+ MessageCodes.FUNC_EXECUTION_FAILED)
1128
+
1129
+ if not self.entity:
1130
+ raise TeradataMlException(Messages.get_message(
1131
+ MessageCodes.FUNC_EXECUTION_FAILED, 'publish', message.format("Entity")),
1132
+ MessageCodes.FUNC_EXECUTION_FAILED)
1133
+
1134
+ # Before publish FeatureGroup, publish other elements.
1135
+ for feature in self.features:
1136
+ feature.publish(repo)
1137
+
1138
+ self.entity.publish(repo)
1139
+ self.data_source.publish(repo)
1140
+ _upsert_data(schema_name=repo,
1141
+ table_name=EFS_FEATURE_GROUP_SPEC["table_name"],
1142
+ insert_columns_values=OrderedDict({
1143
+ 'name': self.name,
1144
+ 'description': self.description,
1145
+ 'data_source_name': self.data_source.name,
1146
+ 'entity_name': self.entity.name,
1147
+ 'creation_time': dt.utcnow()
1148
+ }),
1149
+ upsert_conditions={'name': self.name},
1150
+ update_columns_values=OrderedDict({
1151
+ 'description': self.description,
1152
+ 'data_source_name': self.data_source.name,
1153
+ 'modified_time': dt.utcnow(),
1154
+ 'entity_name': self.entity.name})
1155
+ )
1156
+
1157
+ for feature in self.features:
1158
+ _upsert_data(schema_name=repo,
1159
+ table_name=EFS_GROUP_FEATURES_SPEC["table_name"],
1160
+ insert_columns_values=OrderedDict({
1161
+ 'feature_name': feature.name,
1162
+ 'group_name': self.name,
1163
+ 'modified_time': dt.utcnow()
1164
+ }),
1165
+ upsert_conditions={'feature_name': feature.name, "group_name": self.name},
1166
+ update_columns_values=OrderedDict({
1167
+ 'modified_time': dt.utcnow()
1168
+ })
1169
+ )
1170
+
1171
+ # Cut down the link between features and FeatureGroup if any of the
1172
+ # features is removed from FeatureGroup.
1173
+ if self.__redundant_features:
1174
+ col_expression = _SQLColumnExpression("feature_name") == self.__redundant_features[0].name
1175
+ for feature in self.__redundant_features[1:]:
1176
+ col_expression = ((col_expression) | (_SQLColumnExpression("feature_name") == feature.name))
1177
+ _delete_data(schema_name=repo,
1178
+ table_name=EFS_GROUP_FEATURES_SPEC["table_name"],
1179
+ delete_conditions=((_SQLColumnExpression("group_name") == self.name) & (col_expression)))
1180
+ # After removing the data, set this back.
1181
+ self.__redundant_features = []
1182
+
1183
+ return True
1184
+
1185
+ def __add__(self, other):
1186
+ """
1187
+ Combines two Feature groups.
1188
+
1189
+ PARAMETERS:
1190
+ other :
1191
+ Required Argument.
1192
+ Specifies another FeatureGroup.
1193
+ Types: FeatureGroup
1194
+
1195
+ RETURNS:
1196
+ FeatureGroup
1197
+
1198
+ RAISES:
1199
+ TypeError, ValueError
1200
+
1201
+ EXAMPLES:
1202
+ >>> load_example_data("dataframe", "sales")
1203
+ >>> df = DataFrame("sales")
1204
+ >>> df
1205
+ Feb Jan Mar Apr datetime
1206
+ accounts
1207
+ Alpha Co 210.0 200.0 215.0 250.0 04/01/2017
1208
+ Blue Inc 90.0 50.0 95.0 101.0 04/01/2017
1209
+ Jones LLC 200.0 150.0 140.0 180.0 04/01/2017
1210
+ Orange Inc 210.0 NaN NaN 250.0 04/01/2017
1211
+ Yellow Inc 90.0 NaN NaN NaN 04/01/2017
1212
+ Red Inc 200.0 150.0 140.0 NaN 04/01/2017
1213
+
1214
+ # Example 1: create two feature groups and then create a new feature
1215
+ # group by combining those two feature groups.
1216
+ # Creating first feature group.
1217
+ >>> f1 = Feature("sales_Jan", column=df.Jan)
1218
+ >>> f2 = Feature("sales_Feb", column=df.Feb)
1219
+ >>> entity = Entity(name="sales", columns='accounts')
1220
+ >>> data_source = DataSource("sales", source=df.show_query())
1221
+ >>> fg1 = FeatureGroup(name="sales_jan_feb", entity=entity, features=[f1, f2], data_source=data_source)
1222
+ >>> fg1
1223
+ FeatureGroup(sales_jan_feb, features=[Feature(name=sales_Jan), Feature(name=sales_Feb)], entity=Entity(name=sales), data_source=DataSource(name=sales))
1224
+
1225
+ >>> # Creating second feature group.
1226
+ >>> f3 = Feature("sales_Mar", column=df.Mar)
1227
+ >>> f4 = Feature("sales_Apr", column=df.Apr)
1228
+ >>> data_source = DataSource("sales_Mar_Apr", source=df.show_query())
1229
+ >>> fg2 = FeatureGroup(name="sales_Mar_Apr", entity=entity, features=[f3, f4], data_source=data_source)
1230
+ >>> fg2
1231
+ FeatureGroup(sales_Mar_Apr, features=[Feature(name=sales_Mar), Feature(name=sales_Apr)], entity=Entity(name=sales), data_source=DataSource(name=sales))
1232
+
1233
+ >>> # Combining two feature groups.
1234
+ >>> new_fg = feature_group1 + feature_group2
1235
+ >>> new_fg
1236
+ FeatureGroup(sales_jan_feb_sales_Mar_Apr, features=[Feature(name=sales_Jan), Feature(name=sales_Feb), Feature(name=sales_Mar), Feature(name=sales_Apr)], entity=Entity(name=sales), data_source=DataSource(name=sales))
1237
+ >>>
1238
+ """
1239
+ if not isinstance(other, FeatureGroup):
1240
+ err_ = Messages.get_message(MessageCodes.UNSUPPORTED_DATATYPE, "other",
1241
+ "FeatureGroup")
1242
+ raise TypeError(err_)
1243
+
1244
+ if self.entity != other.entity:
1245
+ raise ValueError("Two FeatureGroups can be merged only when the corresponding entities are same.")
1246
+
1247
+ # While merging two datasets, time stamp columns also should be same.
1248
+ if ((self.data_source.timestamp_col_name and not other.data_source.timestamp_col_name) or
1249
+ (other.data_source.timestamp_col_name and not self.data_source.timestamp_col_name) or
1250
+ (self.data_source.timestamp_col_name != other.data_source.timestamp_col_name)):
1251
+ raise ValueError("Two FeatureGroups can be merged only when the corresponding "
1252
+ "'timestamp_col_name' for the DataSources are same.")
1253
+
1254
+ if self.entity == other.entity:
1255
+
1256
+ existing_columns = {feature.column_name for feature in self.features}
1257
+ # New features should be combined features of both "self" and other.
1258
+ # However, these two features may share common features too. In such cases,
1259
+ # consider only one.
1260
+ effective_other_features = [feature for feature in other.features
1261
+ if feature.column_name not in existing_columns]
1262
+
1263
+ # Prepare new DataSource.
1264
+ query_1 = self.data_source.source
1265
+ query_2 = other.data_source.source
1266
+
1267
+ # If both the queries a.k.a sources are not same, then combine those
1268
+ # sources with join. While combining, make sure to specify only the
1269
+ # columns which are required.
1270
+ if query_2 != query_1:
1271
+
1272
+ # Consider adding timestamp column to query.
1273
+ time_stamp_column = []
1274
+ if self.data_source.timestamp_col_name:
1275
+ time_stamp_column.append("A.{}".format(self.data_source.timestamp_col_name))
1276
+
1277
+ feature_columns = (["A.{}".format(feature.column_name) for feature in self.features] +
1278
+ ["B.{}".format(feature.column_name) for feature in effective_other_features])
1279
+
1280
+ columns = ", ".join(["A.{}".format(col) for col in self.entity.columns] + time_stamp_column + feature_columns)
1281
+ on_clause_columns = [col for col in self.entity.columns]
1282
+ if self.data_source.timestamp_col_name:
1283
+ on_clause_columns.append(self.data_source.timestamp_col_name)
1284
+ where_clause = " AND ".join(["A.{0} = B.{0}".format(column) for column in on_clause_columns])
1285
+
1286
+ query = f"""
1287
+ SELECT {columns}
1288
+ FROM ({query_1.strip(";")}) AS A, ({query_2.strip(";")}) AS B
1289
+ WHERE {where_clause}
1290
+ """
1291
+ data_source = DataSource(name="{}_{}".format(self.data_source.name, other.data_source.name),
1292
+ source=query,
1293
+ description="Combined DataSource for {} and {}".format(
1294
+ self.data_source.name, other.data_source.name),
1295
+ timestamp_col_name=self.data_source.timestamp_col_name
1296
+ )
1297
+ else:
1298
+ data_source = self.data_source
1299
+
1300
+ # Create new feature group.
1301
+ feature_group = FeatureGroup(name="{}_{}".format(self.name, other.name),
1302
+ features=self.features + effective_other_features,
1303
+ data_source=data_source,
1304
+ entity=Entity(name="{}_{}".format(self.name, other.name),
1305
+ columns=self.entity.columns),
1306
+ description="Combined FeatureGroup for groups {} and {}.".format(
1307
+ self.name, other.name)
1308
+ )
1309
+ return feature_group
1310
+
1311
+ @classmethod
1312
+ def from_query(cls, name, entity_columns, query, timestamp_col_name=None):
1313
+ """
1314
+ DESCRIPTION:
1315
+ Method to create FeatureGroup from Query.
1316
+
1317
+ PARAMETERS:
1318
+ name:
1319
+ Required Argument.
1320
+ Specifies the name of the FeatureGroup.
1321
+ Note:
1322
+ * Entitiy, DataSource also get the same name as "name".
1323
+ Users can change the name of Entity or DataSource by accessing
1324
+ object from FeatureGroup.
1325
+ Types: str.
1326
+
1327
+ entity_columns:
1328
+ Required Argument.
1329
+ Specifies the column names for the Entity.
1330
+ Types: str or list of str.
1331
+
1332
+ query:
1333
+ Required Argument.
1334
+ Specifies the query for DataSource.
1335
+ Types: str.
1336
+
1337
+ timestamp_col_name:
1338
+ Optional Argument.
1339
+ Specifies the name of the column in the Query which
1340
+ holds the record creation time.
1341
+ Types: str
1342
+
1343
+ RETURNS:
1344
+ FeatureGroup
1345
+
1346
+ RAISES:
1347
+ None
1348
+
1349
+ EXAMPLES:
1350
+ >>> load_example_data('dataframe', ['sales'])
1351
+ >>> df = DataFrame("sales")
1352
+
1353
+ # Example 1: create a FeatureGroup from query 'SELECT * FROM SALES' and
1354
+ # consider 'accounts' column as entity and 'datetime' column
1355
+ # as timestamp_col_name.
1356
+ >>> from teradataml import FeatureGroup
1357
+ >>> query = 'SELECT * FROM SALES'
1358
+ >>> fg = FeatureGroup.from_query(
1359
+ ... name='sales',
1360
+ ... entity_columns='accounts',
1361
+ ... query=query,
1362
+ ... timestamp_col_name='datetime'
1363
+ ... )
1364
+ """
1365
+ return cls.__create_feature_group(name, entity_columns, query, timestamp_col_name)
1366
+
1367
+ @classmethod
1368
+ def from_DataFrame(cls, name, entity_columns, df, timestamp_col_name=None):
1369
+ """
1370
+ DESCRIPTION:
1371
+ Method to create FeatureGroup from DataFrame.
1372
+
1373
+ PARAMETERS:
1374
+ name:
1375
+ Required Argument.
1376
+ Specifies the name of the FeatureGroup.
1377
+ Note:
1378
+ * Entitiy, DataSource also get the same name as "name".
1379
+ User's can change the name of Entity or DataSource by accessing
1380
+ object from FeatureGroup.
1381
+ Types: str.
1382
+
1383
+ entity_columns:
1384
+ Required Argument.
1385
+ Specifies the column names for the Entity.
1386
+ Types: str or list of str.
1387
+
1388
+ df:
1389
+ Required Argument.
1390
+ Specifies teradataml DataFrame for creating DataSource.
1391
+ Types: teradataml DataFrame.
1392
+
1393
+ timestamp_col_name:
1394
+ Optional Argument.
1395
+ Specifies the name of the column in the Query which
1396
+ holds the record creation time.
1397
+ Types: str
1398
+
1399
+ RETURNS:
1400
+ FeatureGroup
1401
+
1402
+ RAISES:
1403
+ None
1404
+
1405
+ EXAMPLES:
1406
+ >>> load_example_data('dataframe', ['sales'])
1407
+ >>> df = DataFrame("sales")
1408
+
1409
+ # Example 1: create a FeatureGroup from DataFrame created on 'sales' table and
1410
+ # consider 'accounts' column as entity and 'datetime' column
1411
+ # as timestamp_col_name.
1412
+ >>> from teradataml import FeatureGroup
1413
+ >>> df = DataFrame("sales")
1414
+ >>> fg = FeatureGroup.from_DataFrame(
1415
+ ... name='sales',
1416
+ ... entity_columns='accounts',
1417
+ ... df=df,
1418
+ ... timestamp_col_name='datetime'
1419
+ ... )
1420
+ """
1421
+ return cls.__create_feature_group(name, entity_columns, df, timestamp_col_name)
1422
+
1423
+ @classmethod
1424
+ def __create_feature_group(cls, name, entity_columns, obj, timestamp_col_name=None):
1425
+ """
1426
+ DESCRIPTION:
1427
+ Internal method to create FeatureGroup from either DataFrame or from Query.
1428
+
1429
+ PARAMETERS:
1430
+ name:
1431
+ Required Argument.
1432
+ Specifies the name of the FeatureGroup.
1433
+ Types: str.
1434
+
1435
+ entity_columns:
1436
+ Required Argument.
1437
+ Specifies the column names for the Entity.
1438
+ Types: str or list of str.
1439
+
1440
+ obj:
1441
+ Required Argument.
1442
+ Specifies either teradataml DataFrame or Query for creating DataSource.
1443
+ Types: teradataml DataFrame OR str.
1444
+
1445
+ timestamp_col_name:
1446
+ Optional Argument.
1447
+ Specifies the name of the column in the Query or DataFrame which
1448
+ holds the record creation time.
1449
+ Types: str
1450
+
1451
+ RETURNS:
1452
+ FeatureGroup
1453
+
1454
+ RAISES:
1455
+ None
1456
+
1457
+ EXAMPLES:
1458
+ >>> load_example_data('dataframe', ['sales'])
1459
+ >>> df = DataFrame("sales")
1460
+
1461
+ # Example 1: create a FeatureGroup from DataFrame created on 'sales' table and
1462
+ # consider 'accounts' column as entity and 'datetime' column
1463
+ # as timestamp_col_name.
1464
+ >>> from teradataml import FeatureGroup
1465
+ >>> df = DataFrame("sales")
1466
+ >>> fg = FeatureGroup.__create_feature_group(
1467
+ ... name='sales',
1468
+ ... entity_columns='accounts',
1469
+ ... df=df,
1470
+ ... timestamp_col_name='datetime'
1471
+ ... )
1472
+ """
1473
+ # Check the caller. And decide the type of 'obj'.
1474
+ is_obj_dataframe = False
1475
+ if inspect.stack()[1][3] == 'from_DataFrame':
1476
+ # Perform the function validations.
1477
+ is_obj_dataframe = True
1478
+
1479
+ argument_validation_params = []
1480
+ argument_validation_params.append(["name", name, False, str, True])
1481
+ argument_validation_params.append(["entity_columns", entity_columns, False, (str, list), True])
1482
+ argument_validation_params.append(["timestamp_col_name", timestamp_col_name, True, str, True])
1483
+ param = ["df", obj, False, DataFrame, True] if is_obj_dataframe else ["query", obj, False, str, True]
1484
+ argument_validation_params.append(param)
1485
+ # Validate argument types
1486
+ _Validators._validate_function_arguments(argument_validation_params)
1487
+
1488
+ df = obj if is_obj_dataframe else DataFrame.from_query(obj)
1489
+
1490
+ features = [Feature(name=col, column=df[col]) for col in df.columns if (
1491
+ col not in UtilFuncs._as_list(entity_columns) and col != timestamp_col_name)
1492
+ ]
1493
+ data_source = DataSource(
1494
+ name=name,
1495
+ source=df.show_query(),
1496
+ timestamp_col_name=timestamp_col_name
1497
+ )
1498
+ entity = Entity(name=name, columns=entity_columns)
1499
+ fg = FeatureGroup(
1500
+ name=name,
1501
+ features=features,
1502
+ data_source=data_source,
1503
+ entity=entity
1504
+ )
1505
+ return fg