wedata-feature-engineering 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {feature_store → wedata}/__init__.py +1 -1
  2. {feature_store → wedata/feature_store}/client.py +113 -41
  3. {feature_store → wedata/feature_store}/constants/constants.py +19 -0
  4. {feature_store → wedata/feature_store}/entities/column_info.py +4 -4
  5. {feature_store → wedata/feature_store}/entities/feature_lookup.py +5 -1
  6. {feature_store → wedata/feature_store}/entities/feature_spec.py +46 -46
  7. wedata/feature_store/entities/feature_table.py +107 -0
  8. {feature_store → wedata/feature_store}/entities/training_set.py +13 -12
  9. {feature_store → wedata/feature_store}/feature_table_client/feature_table_client.py +85 -30
  10. {feature_store → wedata/feature_store}/spark_client/spark_client.py +30 -56
  11. wedata/feature_store/training_set_client/training_set_client.py +367 -0
  12. wedata/feature_store/utils/__init__.py +0 -0
  13. feature_store/utils/utils.py → wedata/feature_store/utils/common_utils.py +108 -54
  14. {feature_store → wedata/feature_store}/utils/feature_lookup_utils.py +6 -6
  15. {feature_store → wedata/feature_store}/utils/feature_spec_utils.py +6 -6
  16. {feature_store → wedata/feature_store}/utils/feature_utils.py +5 -5
  17. wedata/feature_store/utils/on_demand_utils.py +107 -0
  18. {feature_store → wedata/feature_store}/utils/schema_utils.py +1 -1
  19. wedata/feature_store/utils/signature_utils.py +205 -0
  20. {feature_store → wedata/feature_store}/utils/training_set_utils.py +18 -19
  21. {feature_store → wedata/feature_store}/utils/uc_utils.py +1 -1
  22. {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.6.dist-info}/METADATA +1 -1
  23. wedata_feature_engineering-0.1.6.dist-info/RECORD +43 -0
  24. wedata_feature_engineering-0.1.6.dist-info/top_level.txt +1 -0
  25. feature_store/entities/feature_table.py +0 -164
  26. feature_store/training_set_client/training_set_client.py +0 -196
  27. feature_store/utils/common_utils.py +0 -96
  28. wedata_feature_engineering-0.1.4.dist-info/RECORD +0 -41
  29. wedata_feature_engineering-0.1.4.dist-info/top_level.txt +0 -1
  30. {feature_store/constants → wedata/feature_store}/__init__.py +0 -0
  31. {feature_store/entities → wedata/feature_store/constants}/__init__.py +0 -0
  32. {feature_store/feature_table_client → wedata/feature_store/entities}/__init__.py +0 -0
  33. {feature_store → wedata/feature_store}/entities/data_type.py +0 -0
  34. {feature_store → wedata/feature_store}/entities/environment_variables.py +0 -0
  35. {feature_store → wedata/feature_store}/entities/feature.py +0 -0
  36. {feature_store → wedata/feature_store}/entities/feature_column_info.py +0 -0
  37. {feature_store → wedata/feature_store}/entities/feature_function.py +0 -0
  38. {feature_store → wedata/feature_store}/entities/feature_spec_constants.py +0 -0
  39. {feature_store → wedata/feature_store}/entities/feature_table_info.py +0 -0
  40. {feature_store → wedata/feature_store}/entities/function_info.py +0 -0
  41. {feature_store → wedata/feature_store}/entities/on_demand_column_info.py +0 -0
  42. {feature_store → wedata/feature_store}/entities/source_data_column_info.py +0 -0
  43. {feature_store/spark_client → wedata/feature_store/feature_table_client}/__init__.py +0 -0
  44. {feature_store/training_set_client → wedata/feature_store/spark_client}/__init__.py +0 -0
  45. {feature_store/utils → wedata/feature_store/training_set_client}/__init__.py +0 -0
  46. {feature_store → wedata/feature_store}/utils/topological_sort.py +0 -0
  47. {feature_store → wedata/feature_store}/utils/validation_utils.py +0 -0
  48. {wedata_feature_engineering-0.1.4.dist-info → wedata_feature_engineering-0.1.6.dist-info}/WHEEL +0 -0
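The rename pattern running through this list is the main change between 0.1.4 and 0.1.6: every module moves from the top-level `feature_store` package to `wedata/feature_store`, and the 0.1.4 `top_level.txt` (removed at the end of this diff) contained `feature_store`, while the 0.1.6 one adds a single entry, presumably `wedata`. A minimal sketch of what this means for callers, assuming the public names survive the move unchanged (`FeatureStoreClient` is a hypothetical class name; this diff only shows that `client.py` exists):

# Before (0.1.4): modules lived under the top-level `feature_store` package.
from feature_store.client import FeatureStoreClient          # hypothetical class name
from feature_store.entities.feature_lookup import FeatureLookup

# After (0.1.6): the same modules live under `wedata.feature_store`.
from wedata.feature_store.client import FeatureStoreClient   # hypothetical class name
from wedata.feature_store.entities.feature_lookup import FeatureLookup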
feature_store/training_set_client/training_set_client.py (deleted)
@@ -1,196 +0,0 @@
- import json
- import logging
- import os
- from collections import defaultdict
- from types import ModuleType
- from typing import Any, Dict, List, Optional, Set, Union
-
- import mlflow
- import yaml
- from mlflow.models import Model, ModelSignature
- from mlflow.utils.file_utils import TempDir, YamlSafeDumper, read_yaml
- from pyspark.sql import DataFrame
- from pyspark.sql.functions import struct
-
- from feature_store.entities.feature_column_info import FeatureColumnInfo
- from feature_store.entities.feature_function import FeatureFunction
- from feature_store.entities.feature_lookup import FeatureLookup
- from feature_store.entities.feature_spec import FeatureSpec
- from feature_store.entities.training_set import TrainingSet
- from feature_store.spark_client.spark_client import SparkClient
-
- from feature_store.constants.constants import (
-     _NO_RESULT_TYPE_PASSED,
-     _PREBUILT_ENV_URI,
-     _USE_SPARK_NATIVE_JOIN,
-     _WARN,
-     MODEL_DATA_PATH_ROOT,
-     PREDICTION_COLUMN_NAME,
- )
-
- from feature_store.utils import common_utils, training_set_utils
- from feature_store.utils.feature_spec_utils import convert_to_yaml_string
-
- _logger = logging.getLogger(__name__)
-
- FEATURE_SPEC_GRAPH_MAX_COLUMN_INFO = 1000
-
-
- class TrainingSetClient:
-     def __init__(
-         self,
-         spark_client: SparkClient
-     ):
-         self._spark_client = spark_client
-
-     def create_training_set(
-         self,
-         feature_spec: FeatureSpec,
-         feature_column_infos: List[FeatureColumnInfo],
-         label_names: List[str],
-         df: DataFrame,
-         ft_metadata: training_set_utils._FeatureTableMetadata,
-         kwargs,
-     ):
-         uc_function_infos = training_set_utils.get_uc_function_infos(
-             self._spark_client,
-             {odci.udf_name for odci in feature_spec.on_demand_column_infos},
-         )
-
-         # TODO(divyagupta-db): Move validation from _validate_join_feature_data in feature_lookup_utils.py
-         #  to a helper function called here and in score_batch.
-
-         # Add consumer of each feature and instrument as final step
-         consumer_feature_table_map = defaultdict(list)
-         for feature in feature_column_infos:
-             consumer_feature_table_map[feature.table_name].append(feature.feature_name)
-         consumed_udf_names = [f.udf_name for f in feature_spec.function_infos]
-
-         # Spark query planning is known to crash the Spark driver when there are many
-         # feature tables to point-in-time join, so native join is disabled by default.
-         # See https://docs.google.com/document/d/1EyA4vvlWikTJMeinsLkxmRAVNlXoF1eqoZElOdqlWyY/edit
-         training_set_utils.warn_if_non_photon_for_native_spark(
-             kwargs.get(_USE_SPARK_NATIVE_JOIN, False), self._spark_client
-         )
-         return TrainingSet(
-             feature_spec,
-             df,
-             label_names,
-             ft_metadata.feature_table_metadata_map,
-             ft_metadata.feature_table_data_map,
-             uc_function_infos,
-             kwargs.get(_USE_SPARK_NATIVE_JOIN, False),
-         )
-
-     def create_training_set_from_feature_lookups(
-         self,
-         df: DataFrame,
-         feature_lookups: List[Union[FeatureLookup, FeatureFunction]],
-         label: Union[str, List[str], None],
-         exclude_columns: List[str],
-         **kwargs,
-     ) -> TrainingSet:
-         # Split the inputs into feature lookups and feature functions
-         features = feature_lookups
-         feature_lookups = [f for f in features if isinstance(f, FeatureLookup)]
-         feature_functions = [f for f in features if isinstance(f, FeatureFunction)]
-
-         # If no label was provided, initialize label_names as an empty list
-         label_names = common_utils.as_list(label, [])
-         del label
-
-         # Validate the DataFrame and the labels
-         training_set_utils.verify_df_and_labels(df, label_names, exclude_columns)
-
-         # Fetch feature table metadata
-         ft_metadata = training_set_utils.get_table_metadata(
-             self._spark_client,
-             {fl.table_name for fl in feature_lookups}
-         )
-
-         column_infos = training_set_utils.get_column_infos(
-             feature_lookups,
-             feature_functions,
-             ft_metadata,
-             df_columns=df.columns,
-             label_names=label_names,
-         )
-
-         training_set_utils.validate_column_infos(
-             self._spark_client,
-             ft_metadata,
-             column_infos.source_data_column_infos,
-             column_infos.feature_column_infos,
-             column_infos.on_demand_column_infos,
-             label_names,
-         )
-
-         # Build feature_spec locally for comparison with the feature spec yaml generated by the
-         # FeatureStore backend. This will be removed once the migration is validated.
-         feature_spec = training_set_utils.build_feature_spec(
-             feature_lookups,
-             ft_metadata,
-             column_infos,
-             exclude_columns
-         )
-
-         return self.create_training_set(
-             feature_spec,
-             column_infos.feature_column_infos,
-             label_names,
-             df,
-             ft_metadata,
-             kwargs=kwargs,
-         )
-
-     def create_feature_spec(
-         self,
-         name: str,
-         features: List[Union[FeatureLookup, FeatureFunction]],
-         sparkClient: SparkClient,
-         exclude_columns: List[str] = [],
-     ) -> FeatureSpec:
-         feature_lookups = [f for f in features if isinstance(f, FeatureLookup)]
-         feature_functions = [f for f in features if isinstance(f, FeatureFunction)]
-
-         # A maximum of 100 FeatureFunctions is supported
-         if len(feature_functions) > training_set_utils.MAX_FEATURE_FUNCTIONS:
-             raise ValueError(
-                 f"A maximum of {training_set_utils.MAX_FEATURE_FUNCTIONS} FeatureFunctions are supported."
-             )
-
-         # Fetch feature table metadata and column infos
-         ft_metadata = training_set_utils.get_table_metadata(
-             self._spark_client,
-             {fl.table_name for fl in feature_lookups}
-         )
-         column_infos = training_set_utils.get_column_infos(
-             feature_lookups,
-             feature_functions,
-             ft_metadata,
-         )
-
-         column_infos = training_set_utils.add_inferred_source_columns(column_infos)
-
-         training_set_utils.validate_column_infos(
-             self._spark_client,
-             ft_metadata,
-             column_infos.source_data_column_infos,
-             column_infos.feature_column_infos,
-             column_infos.on_demand_column_infos,
-         )
-
-         feature_spec = training_set_utils.build_feature_spec(
-             feature_lookups,
-             ft_metadata,
-             column_infos,
-             exclude_columns
-         )
-
-         return feature_spec
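For context, the removed client wrapped a `SparkClient` and turned a mixed list of `FeatureLookup` and `FeatureFunction` objects into a `TrainingSet`. A minimal usage sketch against the old 0.1.4 layout; the `FeatureLookup` and `SparkClient` constructor arguments are not shown in this diff, so the keywords below are assumptions, and the table and column names are hypothetical:

from pyspark.sql import SparkSession

from feature_store.entities.feature_lookup import FeatureLookup
from feature_store.spark_client.spark_client import SparkClient
from feature_store.training_set_client.training_set_client import TrainingSetClient

spark = SparkSession.builder.getOrCreate()

# Hypothetical lookup: keyword arguments mirror the common feature-store
# convention (table to join, join key, features to pull) and may differ here.
lookups = [
    FeatureLookup(
        table_name="db.user_features",
        lookup_key="user_id",
        feature_names=["age", "country"],
    )
]

labels_df = spark.table("db.click_labels")  # hypothetical label source

# SparkClient's constructor is likewise not shown in this hunk.
client = TrainingSetClient(spark_client=SparkClient(spark))
training_set = client.create_training_set_from_feature_lookups(
    df=labels_df,
    feature_lookups=lookups,   # may mix FeatureLookup and FeatureFunction objects
    label="clicked",
    exclude_columns=[],
)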
feature_store/utils/common_utils.py (deleted)
@@ -1,96 +0,0 @@
- """
- Common utility functions
- """
-
- from collections import Counter
- from typing import Any, List
-
- from mlflow.store.artifact.models_artifact_repo import ModelsArtifactRepository
- from mlflow.store.artifact.runs_artifact_repo import RunsArtifactRepository
-
-
- def is_artifact_uri(uri):
-     """
-     Checks whether the artifact URI is associated with an MLflow model or run.
-     The actual URI can be a model URI, model URI + subdirectory, or model URI + path to artifact file.
-     """
-     return ModelsArtifactRepository.is_models_uri(
-         uri
-     ) or RunsArtifactRepository.is_runs_uri(uri)
-
-
- def as_list(obj, default=None):
-     if not obj:
-         return default
-     elif isinstance(obj, list):
-         return obj
-     else:
-         return [obj]
-
-
- def get_duplicates(elements: List[Any]) -> List[Any]:
-     """
-     Returns duplicate elements in the order they first appear.
-     """
-     element_counts = Counter(elements)
-     duplicates = []
-     for e in element_counts.keys():
-         if element_counts[e] > 1:
-             duplicates.append(e)
-     return duplicates
-
-
- def validate_strings_unique(strings: List[str], error_template: str):
-     """
-     Validates that all strings are unique, otherwise raises ValueError with the error template and duplicates.
-     Passes single-quoted, comma-delimited duplicates to the error template.
-     """
-     duplicate_strings = get_duplicates(strings)
-     if duplicate_strings:
-         duplicates_formatted = ", ".join([f"'{s}'" for s in duplicate_strings])
-         raise ValueError(error_template.format(duplicates_formatted))
-
-
- def sanitize_identifier(identifier: str):
-     """
-     Sanitize and wrap an identifier with backquotes. For example, "a`b" becomes "`a``b`".
-     Use this function to sanitize identifiers such as column names in SQL and PySpark.
-     """
-     return f"`{identifier.replace('`', '``')}`"
-
-
- def sanitize_identifiers(identifiers: List[str]):
-     """
-     Sanitize and wrap the identifiers in a list with backquotes.
-     """
-     return [sanitize_identifier(i) for i in identifiers]
-
-
- def sanitize_multi_level_name(multi_level_name: str):
-     """
-     Sanitize a multi-level name (such as a Unity Catalog table name) by sanitizing each segment
-     and joining the results. For example, "ca+t.fo`o.ba$r" becomes "`ca+t`.`fo``o`.`ba$r`".
-     """
-     segments = multi_level_name.split(".")
-     return ".".join(sanitize_identifiers(segments))
-
-
- def unsanitize_identifier(identifier: str):
-     """
-     Unsanitize an identifier. Useful when we get a possibly sanitized identifier from Spark or
-     somewhere else, but we need an unsanitized one.
-     Note: This function does not check the correctness of the identifier passed in. e.g. "foo`"
-     is not a valid sanitized identifier. When given such invalid input, this function returns
-     invalid output.
-     """
-     if len(identifier) >= 2 and identifier[0] == "`" and identifier[-1] == "`":
-         return identifier[1:-1].replace("``", "`")
-     else:
-         return identifier
-
-
- # Strings containing \ or ' can break SQL statements, so escape them.
- def escape_sql_string(input_str: str) -> str:
-     return input_str.replace("\\", "\\\\").replace("'", "\\'")
-
-
- def get_unique_list_order(elements: List[Any]) -> List[Any]:
-     """
-     Returns unique elements in the order they first appear.
-     """
-     return list(dict.fromkeys(elements))
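The sanitization helpers above are self-contained, so their documented behavior is easy to check directly; the expected values below follow the examples embedded in the docstrings. (In 0.1.6 the equivalent module appears to live at `wedata/feature_store/utils/common_utils.py`, per the file list.)

from feature_store.utils.common_utils import (
    sanitize_identifier,
    sanitize_multi_level_name,
    unsanitize_identifier,
)

# Backquotes inside an identifier are doubled, then the whole name is wrapped.
assert sanitize_identifier("a`b") == "`a``b`"

# Each dot-separated segment of a multi-level name is sanitized independently.
assert sanitize_multi_level_name("ca+t.fo`o.ba$r") == "`ca+t`.`fo``o`.`ba$r`"

# unsanitize_identifier reverses sanitize_identifier for well-formed input.
assert unsanitize_identifier("`a``b`") == "a`b"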
wedata_feature_engineering-0.1.4.dist-info/RECORD (deleted)
@@ -1,41 +0,0 @@
- feature_store/__init__.py,sha256=CP3YAMoy3pSTWRYzTza_CYBnGbTv_KzycVEBMQCeiD8,101
- feature_store/client.py,sha256=FG1xK460rD859iSY4VA75XeYhqStJD8Wlr0sRxk25LI,5267
- feature_store/constants/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- feature_store/constants/constants.py,sha256=exW3kiFLDyCmU9cYHFjcvIQhPWEpFtkogLXeB9Arfd8,827
- feature_store/entities/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- feature_store/entities/column_info.py,sha256=WezowI46YHDym5ZlbhCJDqhKbVcjXjnjt7dQdy3XqYM,4164
- feature_store/entities/data_type.py,sha256=VpHS6Fr3TphQQ8NbAcEnDJ-8eOZV6ivYuWxv3pAM2RM,3394
- feature_store/entities/environment_variables.py,sha256=ZEFml5H9MQuzBKM074mUrFYu-Sga4Knmxqiwpke2WGc,1679
- feature_store/entities/feature.py,sha256=wX8fTBlJq3GYdj9rrBDCY3kFgcVBBAiOOZdxEhnQkNQ,1241
- feature_store/entities/feature_column_info.py,sha256=-TGxRafYUaNKe0YzHus2XbfRaVrMv7pcffMdbtTT4nA,2031
- feature_store/entities/feature_function.py,sha256=R17INrCE-U_Uj9KLbFz69aYlOkTETTwQHMMo470F4lQ,1865
- feature_store/entities/feature_lookup.py,sha256=zUDMdDIboitOffYRZlurf_O_4UeBPmE5YS0PyCS2Fqg,7912
- feature_store/entities/feature_spec.py,sha256=F4MiKEyvKZSBh6Uv7V4vVLbamZ9fRClaC3HCrUeynDE,20079
- feature_store/entities/feature_spec_constants.py,sha256=YWDBfRiNDe6fUJFUBo3V4WYg2xsljoPAE-ZejfFZCgM,785
- feature_store/entities/feature_table.py,sha256=4ghopIvJcoIlyFiSEuTkOcDWn88c1Kt6q5LWM4BYEHI,6073
- feature_store/entities/feature_table_info.py,sha256=2vUaVdW_jw1dRAlmJWvBRueuMeuqWu_NYB9SlxLI7Uw,1126
- feature_store/entities/function_info.py,sha256=l0kmiq2R_QNfSMJ7y0xZohlMiemgYSr1dN5vzV8ijIs,7314
- feature_store/entities/on_demand_column_info.py,sha256=Eh5ieaj1TxC7DG6ipBZzH2ZyY0bwkLrDOkuZjgYr4gY,1297
- feature_store/entities/source_data_column_info.py,sha256=a9jQOJvehwDIrKPwsP6W9YRBSPNK2nZYypE6-p80CwA,542
- feature_store/entities/training_set.py,sha256=9H2uGnUxTAsk93Om50QxRELbeFCocwGMze2VexPVJWI,5569
- feature_store/feature_table_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- feature_store/feature_table_client/feature_table_client.py,sha256=uir33K7oigrSnjTT6VbNOp0Nb22-X3JHd1_92kWjrow,10754
- feature_store/spark_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- feature_store/spark_client/spark_client.py,sha256=vd-NCE9IGC0Ygqr-QSVY0teuWsQSkq_BFV4Mn6xMMNU,11578
- feature_store/training_set_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- feature_store/training_set_client/training_set_client.py,sha256=Aa80xVXVE1KBdgplL9qqR8ftD5A5r2pfBttAhmySrB0,6696
- feature_store/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- feature_store/utils/common_utils.py,sha256=ck8pJYeN6vrcZmTrcnmIOOJWzYZaY3ZjvRSVme4tplo,3314
- feature_store/utils/feature_lookup_utils.py,sha256=iILSP4AFHXrjNTuId6mT7wtMFAsZejyxThr_mZHPRF4,22330
- feature_store/utils/feature_spec_utils.py,sha256=jeWzEhmkVW-bMRySMx_5grepHAlLquMhYxpbbiaJR-g,11582
- feature_store/utils/feature_utils.py,sha256=8KhlkWax3KAi_xRnStVPlhCxeUHO08VW2fmT9jN8QUs,2761
- feature_store/utils/schema_utils.py,sha256=8NhNUsF4Z6UtmzFeaVBnmb7xut0LqZepK3M27PSEpfE,4484
- feature_store/utils/topological_sort.py,sha256=ebzKxmxeCLk9seB1zR0ASCGXsZsa-DjxJeTc4KUadtg,6475
- feature_store/utils/training_set_utils.py,sha256=V5yW-XQ9in7gNOo4xsWy7txnSw_Z9Zxm4mV7MQmrWnk,22466
- feature_store/utils/uc_utils.py,sha256=ets7YlrAtkhW9kKyYajDNo6iZasBIhFyxUT2MOyLuV8,10767
- feature_store/utils/utils.py,sha256=T6dOUX3oOYRsbvXyTIElFZ20kNO92KMYPUCrqY5eomE,8953
- feature_store/utils/validation_utils.py,sha256=FslvrNs3kstqvM6THScLOluEE6O9RWlDrD9xiihTzlw,1735
- wedata_feature_engineering-0.1.4.dist-info/METADATA,sha256=uwmHZ4fVVcncF5YH_p3kUG24D377eKLraAcOlx-KU5o,493
- wedata_feature_engineering-0.1.4.dist-info/WHEEL,sha256=tZoeGjtWxWRfdplE7E3d45VPlLNQnvbKiYnx7gwAy8A,92
- wedata_feature_engineering-0.1.4.dist-info/top_level.txt,sha256=15761LgVdJ7tJWbdlYk0EZ560G9k6C4TE42dfLx8d0I,14
- wedata_feature_engineering-0.1.4.dist-info/RECORD,,
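Each RECORD row has the form `path,sha256=<digest>,<size>`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with `=` padding stripped (per the wheel spec); the RECORD file itself is listed with empty hash and size fields, as on the last line above. A small sketch of how an entry can be recomputed to verify an unpacked wheel:

import base64
import hashlib

def record_entry(path: str) -> str:
    """Rebuild a RECORD-style line for `path`: urlsafe-b64 SHA-256 without '=' padding, plus byte size."""
    with open(path, "rb") as f:
        data = f.read()
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=").decode("ascii")
    return f"{path},sha256={digest},{len(data)}"

# Example: check one entry from the 0.1.4 RECORD after unzipping the wheel.
print(record_entry("feature_store/utils/common_utils.py"))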
wedata_feature_engineering-0.1.4.dist-info/top_level.txt (deleted)
@@ -1 +0,0 @@
- feature_store