upgini 1.2.57__py3-none-any.whl → 1.2.57a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This is a potentially problematic release.


This version of upgini might be problematic. See the release advisory for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.57"
1
+ __version__ = "1.2.57a2"
upgini/autofe/vector.py CHANGED
@@ -2,11 +2,7 @@ import abc
2
2
  from typing import Dict, List, Optional
3
3
 
4
4
  import pandas as pd
5
-
6
- try:
7
- from pydantic import field_validator as validator # V2
8
- except ImportError:
9
- from pydantic import validator # V1
5
+ from pydantic import validator
10
6
 
11
7
  from upgini.autofe.operand import PandasOperand, ParametrizedOperand, VectorizableMixin
12
8
 
@@ -33,16 +29,12 @@ class Sum(PandasOperand, VectorizableMixin):
33
29
  class TimeSeriesBase(PandasOperand, abc.ABC):
34
30
  is_vector: bool = True
35
31
  date_unit: Optional[str] = None
36
- offset_size: int = 0
37
- offset_unit: str = "D"
38
32
 
39
33
  def get_params(self) -> Dict[str, Optional[str]]:
40
34
  res = super().get_params()
41
35
  res.update(
42
36
  {
43
37
  "date_unit": self.date_unit,
44
- "offset_size": self.offset_size,
45
- "offset_unit": self.offset_unit,
46
38
  }
47
39
  )
48
40
  return res
@@ -54,31 +46,13 @@ class TimeSeriesBase(PandasOperand, abc.ABC):
54
46
  ts.drop_duplicates(subset=ts.columns[:-1], keep="first", inplace=True)
55
47
  ts.set_index(date.name, inplace=True)
56
48
  ts = ts[ts.index.notna()].sort_index()
57
- ts = (
58
- ts.groupby([c.name for c in data[1:-1]])
59
- .apply(self._shift)[data[-1].name]
60
- .to_frame()
61
- .reset_index()
62
- .set_index(date.name)
63
- .groupby([c.name for c in data[1:-1]])
64
- if len(data) > 2
65
- else self._shift(ts)
66
- )
49
+ ts = ts.groupby([c.name for c in data[1:-1]]) if len(data) > 2 else ts
67
50
  ts = self._aggregate(ts)
68
51
  ts = ts.reindex(data[1:-1] + [date] if len(data) > 2 else date).reset_index()
69
52
  ts.index = date.index
70
53
 
71
54
  return ts.iloc[:, -1]
72
55
 
73
- def _shift(self, ts: pd.DataFrame) -> pd.DataFrame:
74
- if self.offset_size > 0:
75
- return ts.iloc[:, :-1].merge(
76
- ts.iloc[:, -1].shift(freq=f"{self.offset_size}{self.offset_unit}"),
77
- left_index=True,
78
- right_index=True,
79
- )
80
- return ts
81
-
82
56
  @abc.abstractmethod
83
57
  def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
84
58
  pass
@@ -93,7 +67,6 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
93
67
  window_unit: str = "D"
94
68
 
95
69
  @validator("window_unit")
96
- @classmethod
97
70
  def validate_window_unit(cls, v: str) -> str:
98
71
  try:
99
72
  pd.tseries.frequencies.to_offset(v)
@@ -104,35 +77,12 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
104
77
  )
105
78
 
106
79
  def to_formula(self) -> str:
107
- roll_component = f"roll_{self.window_size}{self.window_unit}"
108
- if self.offset_size > 0:
109
- roll_component += f"_offset_{self.offset_size}{self.offset_unit}"
110
- return f"{roll_component}_{self.aggregation}"
80
+ return f"roll_{self.window_size}{self.window_unit}_{self.aggregation}"
111
81
 
112
82
  @classmethod
113
83
  def from_formula(cls, formula: str) -> Optional["Roll"]:
114
84
  import re
115
85
 
116
- # Try matching pattern with offset first
117
- pattern_with_offset = r"^roll_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])_(\w+)$"
118
- match_with_offset = re.match(pattern_with_offset, formula)
119
-
120
- if match_with_offset:
121
- window_size = int(match_with_offset.group(1))
122
- window_unit = match_with_offset.group(2)
123
- offset_size = int(match_with_offset.group(3))
124
- offset_unit = match_with_offset.group(4)
125
- aggregation = match_with_offset.group(5)
126
-
127
- return cls(
128
- window_size=window_size,
129
- window_unit=window_unit,
130
- offset_size=offset_size,
131
- offset_unit=offset_unit,
132
- aggregation=aggregation,
133
- )
134
-
135
- # If no offset pattern found, try basic pattern
136
86
  pattern = r"^roll_(\d+)([a-zA-Z])_(\w+)$"
137
87
  match = re.match(pattern, formula)
138
88
 
@@ -157,7 +107,7 @@ class Roll(TimeSeriesBase, ParametrizedOperand):
157
107
  return res
158
108
 
159
109
  def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
160
- return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=1).agg(
110
+ return ts.rolling(f"{self.window_size}{self.window_unit}", min_periods=self.window_size).agg(
161
111
  _roll_aggregations.get(self.aggregation, self.aggregation)
162
112
  )
163
113
 
@@ -167,33 +117,12 @@ class Lag(TimeSeriesBase, ParametrizedOperand):
167
117
  lag_unit: str = "D"
168
118
 
169
119
  def to_formula(self) -> str:
170
- lag_component = f"lag_{self.lag_size}{self.lag_unit}"
171
- if self.offset_size > 0:
172
- lag_component += f"_offset_{self.offset_size}{self.offset_unit}"
173
- return lag_component
120
+ return f"lag_{self.lag_size}{self.lag_unit}"
174
121
 
175
122
  @classmethod
176
123
  def from_formula(cls, formula: str) -> Optional["Lag"]:
177
124
  import re
178
125
 
179
- # Try matching pattern with offset first
180
- pattern_with_offset = r"^lag_(\d+)([a-zA-Z])_offset_(\d+)([a-zA-Z])$"
181
- match_with_offset = re.match(pattern_with_offset, formula)
182
-
183
- if match_with_offset:
184
- lag_size = int(match_with_offset.group(1))
185
- lag_unit = match_with_offset.group(2)
186
- offset_size = int(match_with_offset.group(3))
187
- offset_unit = match_with_offset.group(4)
188
-
189
- return cls(
190
- lag_size=lag_size,
191
- lag_unit=lag_unit,
192
- offset_size=offset_size,
193
- offset_unit=offset_unit,
194
- )
195
-
196
- # If no offset pattern found, try basic pattern
197
126
  pattern = r"^lag_(\d+)([a-zA-Z])$"
198
127
  match = re.match(pattern, formula)
199
128
 
@@ -207,12 +136,6 @@ class Lag(TimeSeriesBase, ParametrizedOperand):
207
136
 
208
137
  def get_params(self) -> Dict[str, Optional[str]]:
209
138
  res = super().get_params()
210
- res.update(
211
- {
212
- "lag_size": self.lag_size,
213
- "lag_unit": self.lag_unit,
214
- }
215
- )
216
139
  return res
217
140
 
218
141
  def _aggregate(self, ts: pd.DataFrame) -> pd.DataFrame:
@@ -386,6 +386,7 @@ class DataSourcePublisher:
386
386
  search_keys = [k.value.value for k in search_keys] if search_keys else None
387
387
  request = {"bqTableId": bq_table_id, "searchKeys": search_keys}
388
388
  task_id = self._rest_client.upload_online(request, trace_id)
389
+ print(f"Uploading online task created. task_id={task_id}")
389
390
  with Spinner():
390
391
  status_response = self._rest_client.poll_ads_management_task_status(task_id, trace_id)
391
392
  while status_response["status"] not in self.FINAL_STATUSES:
upgini/dataset.py CHANGED
@@ -587,15 +587,23 @@ class Dataset: # (pd.DataFrame):
587
587
  if (
588
588
  runtime_parameters is not None
589
589
  and runtime_parameters.properties is not None
590
- and "generate_features" in runtime_parameters.properties
591
590
  ):
592
- generate_features = runtime_parameters.properties["generate_features"].split(",")
593
- renamed_generate_features = []
594
- for f in generate_features:
595
- for new_column, orig_column in self.columns_renaming.items():
596
- if f == orig_column:
597
- renamed_generate_features.append(new_column)
598
- runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
591
+ if "generate_features" in runtime_parameters.properties:
592
+ generate_features = runtime_parameters.properties["generate_features"].split(",")
593
+ renamed_generate_features = []
594
+ for f in generate_features:
595
+ for new_column, orig_column in self.columns_renaming.items():
596
+ if f == orig_column:
597
+ renamed_generate_features.append(new_column)
598
+ runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
599
+ if "columns_for_online_api" in runtime_parameters.properties:
600
+ columns_for_online_api = runtime_parameters.properties["columns_for_online_api"].split(",")
601
+ renamed_columns_for_online_api = []
602
+ for f in columns_for_online_api:
603
+ for new_column, orig_column in self.columns_renaming.items():
604
+ if f == orig_column:
605
+ renamed_columns_for_online_api.append(new_column)
606
+ runtime_parameters.properties["columns_for_online_api"] = ",".join(renamed_columns_for_online_api)
599
607
 
600
608
  return runtime_parameters
601
609
 
@@ -222,6 +222,7 @@ class FeaturesEnricher(TransformerMixin):
222
222
  loss: Optional[str] = None,
223
223
  detect_missing_search_keys: bool = True,
224
224
  generate_features: Optional[List[str]] = None,
225
+ columns_for_online_api: Optional[List[str]] = None,
225
226
  round_embeddings: Optional[int] = None,
226
227
  logs_enabled: bool = True,
227
228
  raise_validation_error: bool = True,
@@ -345,6 +346,9 @@ class FeaturesEnricher(TransformerMixin):
345
346
  self.logger.error(msg)
346
347
  raise ValidationError(msg)
347
348
  self.runtime_parameters.properties["round_embeddings"] = round_embeddings
349
+ self.columns_for_online_api = columns_for_online_api
350
+ if columns_for_online_api is not None:
351
+ self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
348
352
  maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
349
353
  if maybe_downsampling_limit is not None:
350
354
  Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
@@ -2620,17 +2624,18 @@ if response.status_code == 200:
2620
2624
  checked_generate_features = []
2621
2625
  for gen_feature in self.generate_features:
2622
2626
  if gen_feature not in x_columns:
2623
- if gen_feature == self._get_phone_column(self.search_keys):
2624
- raise ValidationError(
2625
- self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
2626
- )
2627
- else:
2628
- self.__log_warning(self.bundle.get("missing_generate_feature").format(gen_feature, x_columns))
2627
+ msg = self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
2628
+ self.__log_warning(msg)
2629
2629
  else:
2630
2630
  checked_generate_features.append(gen_feature)
2631
2631
  self.generate_features = checked_generate_features
2632
2632
  self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
2633
2633
 
2634
+ if self.columns_for_online_api is not None and len(self.columns_for_online_api) > 0:
2635
+ for column in self.columns_for_online_api:
2636
+ if column not in validated_X.columns:
2637
+ raise ValidationError(self.bundle.get("missing_column_for_online_api").format(column))
2638
+
2634
2639
  if self.id_columns is not None:
2635
2640
  for id_column in self.id_columns:
2636
2641
  if id_column not in validated_X.columns:
@@ -111,6 +111,7 @@ x_is_empty=X is empty
111
111
  y_is_empty=y is empty
112
112
  x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
113
113
  missing_generate_feature=Feature {} specified in `generate_features` is not present in input columns: {}
114
+ missing_column_for_online_api=Column {} specified in `columns_for_online_api` is not present in input columns: {}
114
115
  x_unstable_by_date=Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
115
116
  train_unstable_target=Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
116
117
  eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
@@ -116,17 +116,17 @@ class EmailSearchKeyConverter:
116
116
  else:
117
117
  df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
118
118
 
119
- del self.search_keys[self.email_column]
120
- if self.email_column in self.unnest_search_keys:
121
- self.unnest_search_keys.remove(self.email_column)
119
+ # del self.search_keys[self.email_column]
120
+ # if self.email_column in self.unnest_search_keys:
121
+ # self.unnest_search_keys.remove(self.email_column)
122
122
 
123
123
  one_domain_name = self.email_column + self.ONE_DOMAIN_SUFFIX
124
124
  df[one_domain_name] = df[self.email_column].apply(self._email_to_one_domain)
125
125
  self.columns_renaming[one_domain_name] = original_email_column
126
126
  self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN
127
127
 
128
- if self.email_converted_to_hem:
129
- df = df.drop(columns=self.email_column)
130
- del self.columns_renaming[self.email_column]
128
+ # if self.email_converted_to_hem:
129
+ # df = df.drop(columns=self.email_column)
130
+ # del self.columns_renaming[self.email_column]
131
131
 
132
132
  return df
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.57
3
+ Version: 1.2.57a2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=4bvatwbfE15IIgVfHJZH8d-WXGATbSGcT6GSdTUc1l0,23
1
+ upgini/__about__.py,sha256=PD2lbh5FQufk15oyUAYIGJrdUHAs9qG5Btw3lTqrUtI,25
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=vT4JyHmafLNbj54SySXr93f5hNS6-t94aFslbBy-7No,33535
4
+ upgini/dataset.py,sha256=NP5vHqEfZQ1HWz3TcNAa_OhXG8wiMRdydm26D6UBiRU,34166
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=FkAKQV_XOXTobwOXpdy9BPfRkL4fkgoNa2B6NniiCrs,201554
6
+ upgini/features_enricher.py,sha256=qJhzMy_Z16wUduRrtAluawV8h_t4HCg9I7uDpRnhKjk,201884
7
7
  upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
@@ -21,16 +21,16 @@ upgini/autofe/feature.py,sha256=l8A8E3BH2BmYvqEC81zbcIEfH6KEEhcesJ2BH4fn0-4,1514
21
21
  upgini/autofe/groupby.py,sha256=G48_sQZw016eGx3cOy8YQrEIOp95puWqYUpFWd-gdeM,3595
22
22
  upgini/autofe/operand.py,sha256=8Ttrfxv_H91dMbS7J55zxluzAJHfGXU_Y2xCh4OHwb8,4774
23
23
  upgini/autofe/unary.py,sha256=T3E7F3dA_7o_rkdCFq7JV6nHLzcoHLHQTcxO7y5Opa4,4646
24
- upgini/autofe/vector.py,sha256=udkg4pP7IIeLjt0Cg6rzEKUmGaubOnqsEz3bz9R6E44,7110
24
+ upgini/autofe/vector.py,sha256=bvcop9b0uFFPfQ3FLTwXT2IYfxNl4dIfR8icvnBHvOA,4358
25
25
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
26
- upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lYQaGZbxDnOd4A3Q,22516
26
+ upgini/data_source/data_source_publisher.py,sha256=0vaYz5v3KclJnA6jAWiTUiMQO5mbBTBINWV9jr2F5xM,22591
27
27
  upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
28
28
  upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
29
29
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
30
30
  upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
- upgini/resource_bundle/strings.properties,sha256=0_KAExIi1u48N1CQ13LKJS3bgDlRs-MPOyU3VxcE-qY,27350
33
+ upgini/resource_bundle/strings.properties,sha256=UXMiaFP3p-WdiXyZJN3O_OZstb-F33BWVDxDiofyxd4,27464
34
34
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
35
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -46,7 +46,7 @@ upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
46
46
  upgini/utils/datetime_utils.py,sha256=RVAk4_rakK8X9zjybK3-rj0to0e3elye8tnBuA4wTWU,13491
47
47
  upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
48
48
  upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
49
- upgini/utils/email_utils.py,sha256=GbnhHJn1nhUBytmK6PophYqaoq4t7Lp6i0-O0Gd3RV8,5265
49
+ upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
50
50
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
51
51
  upgini/utils/feature_info.py,sha256=0rOXSyCj-sw-8migWP0ge8qrOzGU50dQvH0JUJUrDfQ,6766
52
52
  upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
59
59
  upgini/utils/target_utils.py,sha256=RlpKGss9kMibVSlA8iZuO_qxmyeplqzn7X8g6hiGGGs,14341
60
60
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
61
61
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
62
- upgini-1.2.57.dist-info/METADATA,sha256=oRGZz3JdygY9pgsN4tSc14GF7Iqhfp4lMXs2TBQX3Qw,49055
63
- upgini-1.2.57.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
64
- upgini-1.2.57.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
- upgini-1.2.57.dist-info/RECORD,,
62
+ upgini-1.2.57a2.dist-info/METADATA,sha256=-dEVxWnjwc3LcSqFVJGENL07YJDvWgH8mHQ0PaE93sI,49057
63
+ upgini-1.2.57a2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
64
+ upgini-1.2.57a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
+ upgini-1.2.57a2.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.24.2
2
+ Generator: hatchling 1.25.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any