upgini 1.2.55a1__py3-none-any.whl → 1.2.55a2__py3-none-any.whl

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.55a1"
1
+ __version__ = "1.2.55a2"
upgini/dataset.py CHANGED
@@ -587,23 +587,15 @@ class Dataset: # (pd.DataFrame):
587
587
  if (
588
588
  runtime_parameters is not None
589
589
  and runtime_parameters.properties is not None
590
+ and "generate_features" in runtime_parameters.properties
590
591
  ):
591
- if "generate_features" in runtime_parameters.properties:
592
- generate_features = runtime_parameters.properties["generate_features"].split(",")
593
- renamed_generate_features = []
594
- for f in generate_features:
595
- for new_column, orig_column in self.columns_renaming.items():
596
- if f == orig_column:
597
- renamed_generate_features.append(new_column)
598
- runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
599
- if "columns_for_online_api" in runtime_parameters.properties:
600
- columns_for_online_api = runtime_parameters.properties["columns_for_online_api"].split(",")
601
- renamed_columns_for_online_api = []
602
- for f in columns_for_online_api:
603
- for new_column, orig_column in self.columns_renaming.items():
604
- if f == orig_column:
605
- renamed_columns_for_online_api.append(new_column)
606
- runtime_parameters.properties["columns_for_online_api"] = ",".join(renamed_columns_for_online_api)
592
+ generate_features = runtime_parameters.properties["generate_features"].split(",")
593
+ renamed_generate_features = []
594
+ for f in generate_features:
595
+ for new_column, orig_column in self.columns_renaming.items():
596
+ if f == orig_column:
597
+ renamed_generate_features.append(new_column)
598
+ runtime_parameters.properties["generate_features"] = ",".join(renamed_generate_features)
607
599
 
608
600
  return runtime_parameters
609
601
 
@@ -222,7 +222,6 @@ class FeaturesEnricher(TransformerMixin):
222
222
  loss: Optional[str] = None,
223
223
  detect_missing_search_keys: bool = True,
224
224
  generate_features: Optional[List[str]] = None,
225
- columns_for_online_api: Optional[List[str]] = None,
226
225
  round_embeddings: Optional[int] = None,
227
226
  logs_enabled: bool = True,
228
227
  raise_validation_error: bool = True,
@@ -346,9 +345,6 @@ class FeaturesEnricher(TransformerMixin):
346
345
  self.logger.error(msg)
347
346
  raise ValidationError(msg)
348
347
  self.runtime_parameters.properties["round_embeddings"] = round_embeddings
349
- self.columns_for_online_api = columns_for_online_api
350
- if columns_for_online_api is not None:
351
- self.runtime_parameters.properties["columns_for_online_api"] = ",".join(columns_for_online_api)
352
348
  maybe_downsampling_limit = self.runtime_parameters.properties.get("downsampling_limit")
353
349
  if maybe_downsampling_limit is not None:
354
350
  Dataset.FIT_SAMPLE_THRESHOLD = int(maybe_downsampling_limit)
@@ -2464,7 +2460,19 @@ if response.status_code == 200:
2464
2460
  if add_fit_system_record_id:
2465
2461
  selecting_columns.append(SORT_ID)
2466
2462
 
2467
- result = result[selecting_columns]
2463
+ selecting_columns = list(set(selecting_columns))
2464
+ # sorting: first columns from X, then generated features, then enriched features
2465
+ sorted_selecting_columns = [c for c in validated_X.columns if c in selecting_columns]
2466
+ for c in generated_features:
2467
+ if c in selecting_columns and c not in sorted_selecting_columns:
2468
+ sorted_selecting_columns.append(c)
2469
+ for c in result.columns:
2470
+ if c in selecting_columns and c not in sorted_selecting_columns:
2471
+ sorted_selecting_columns.append(c)
2472
+
2473
+ self.logger.info(f"Transform sorted_selecting_columns: {sorted_selecting_columns}")
2474
+
2475
+ result = result[sorted_selecting_columns]
2468
2476
 
2469
2477
  if self.country_added:
2470
2478
  result = result.drop(columns=COUNTRY, errors="ignore")
@@ -2612,18 +2620,17 @@ if response.status_code == 200:
2612
2620
  checked_generate_features = []
2613
2621
  for gen_feature in self.generate_features:
2614
2622
  if gen_feature not in x_columns:
2615
- msg = self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
2616
- self.__log_warning(msg)
2623
+ if gen_feature == self._get_phone_column(self.search_keys):
2624
+ raise ValidationError(
2625
+ self.bundle.get("missing_generate_feature").format(gen_feature, x_columns)
2626
+ )
2627
+ else:
2628
+ self.__log_warning(self.bundle.get("missing_generate_feature").format(gen_feature, x_columns))
2617
2629
  else:
2618
2630
  checked_generate_features.append(gen_feature)
2619
2631
  self.generate_features = checked_generate_features
2620
2632
  self.runtime_parameters.properties["generate_features"] = ",".join(self.generate_features)
2621
2633
 
2622
- if self.columns_for_online_api is not None and len(self.columns_for_online_api) > 0:
2623
- for column in self.columns_for_online_api:
2624
- if column not in validated_X.columns:
2625
- raise ValidationError(self.bundle.get("missing_column_for_online_api").format(column))
2626
-
2627
2634
  if self.id_columns is not None:
2628
2635
  for id_column in self.id_columns:
2629
2636
  if id_column not in validated_X.columns:
@@ -3726,6 +3733,8 @@ if response.status_code == 200:
3726
3733
  features_info_without_links = []
3727
3734
  internal_features_info = []
3728
3735
 
3736
+ original_shaps = {fm.name: fm.shap_value for fm in features_meta}
3737
+
3729
3738
  if updated_shaps is not None:
3730
3739
  for fm in features_meta:
3731
3740
  fm.shap_value = updated_shaps.get(fm.name, 0.0)
@@ -3737,15 +3746,16 @@ if response.status_code == 200:
3737
3746
 
3738
3747
  is_client_feature = feature_meta.name in x_columns
3739
3748
 
3740
- if feature_meta.shap_value == 0.0:
3749
+ # TODO make a decision about selected features based on special flag from mlb
3750
+ if original_shaps.get(feature_meta.name, 0.0) == 0.0:
3741
3751
  if self.fit_select_features:
3742
3752
  self.dropped_client_feature_names_.append(feature_meta.name)
3743
3753
  continue
3744
3754
 
3745
3755
  # Use only important features
3746
3756
  if (
3747
- feature_meta.name in self.fit_generated_features
3748
- or feature_meta.name == COUNTRY
3757
+ # feature_meta.name in self.fit_generated_features or
3758
+ feature_meta.name == COUNTRY
3749
3759
  # In select_features mode we select also from etalon features and need to show them
3750
3760
  or (not self.fit_select_features and is_client_feature)
3751
3761
  ):
upgini/metadata.py CHANGED
@@ -89,7 +89,7 @@ class SearchKey(Enum):
89
89
  if meaning_type == FileColumnMeaningType.EMAIL:
90
90
  return SearchKey.EMAIL
91
91
  if meaning_type == FileColumnMeaningType.HEM:
92
- return SearchKey.HEM
92
+ return SearchKey.HEM # TODO check that it wasn't EMAIL
93
93
  if meaning_type == FileColumnMeaningType.IP_ADDRESS:
94
94
  return SearchKey.IP
95
95
  if meaning_type == FileColumnMeaningType.MSISDN:
@@ -105,27 +105,27 @@ class SearchKey(Enum):
105
105
  if meaning_type == FileColumnMeaningType.POSTAL_CODE:
106
106
  return SearchKey.POSTAL_CODE
107
107
  if meaning_type == FileColumnMeaningType.IPV6_ADDRESS:
108
- return SearchKey.IPV6_ADDRESS
109
- if meaning_type == FileColumnMeaningType.IPV6_RANGE_FROM:
110
- return SearchKey.IPV6_RANGE_FROM
111
- if meaning_type == FileColumnMeaningType.IPV6_RANGE_TO:
112
- return SearchKey.IPV6_RANGE_TO
113
- if meaning_type == FileColumnMeaningType.EMAIL_ONE_DOMAIN:
114
- return SearchKey.EMAIL_ONE_DOMAIN
115
- if meaning_type == FileColumnMeaningType.IP_RANGE_FROM:
116
- return SearchKey.IP_RANGE_FROM
117
- if meaning_type == FileColumnMeaningType.IP_RANGE_TO:
118
- return SearchKey.IP_RANGE_TO
119
- if meaning_type == FileColumnMeaningType.MSISDN_RANGE_FROM:
120
- return SearchKey.MSISDN_RANGE_FROM
121
- if meaning_type == FileColumnMeaningType.MSISDN_RANGE_TO:
122
- return SearchKey.MSISDN_RANGE_TO
108
+ return SearchKey.IP
109
+ # if meaning_type == FileColumnMeaningType.IPV6_RANGE_FROM:
110
+ # return SearchKey.IPV6_RANGE_FROM
111
+ # if meaning_type == FileColumnMeaningType.IPV6_RANGE_TO:
112
+ # return SearchKey.IPV6_RANGE_TO
113
+ # if meaning_type == FileColumnMeaningType.EMAIL_ONE_DOMAIN:
114
+ # return SearchKey.EMAIL_ONE_DOMAIN
115
+ # if meaning_type == FileColumnMeaningType.IP_RANGE_FROM:
116
+ # return SearchKey.IP_RANGE_FROM
117
+ # if meaning_type == FileColumnMeaningType.IP_RANGE_TO:
118
+ # return SearchKey.IP_RANGE_TO
119
+ # if meaning_type == FileColumnMeaningType.MSISDN_RANGE_FROM:
120
+ # return SearchKey.MSISDN_RANGE_FROM
121
+ # if meaning_type == FileColumnMeaningType.MSISDN_RANGE_TO:
122
+ # return SearchKey.MSISDN_RANGE_TO
123
123
  if meaning_type == FileColumnMeaningType.IP_BINARY:
124
- return SearchKey.IP_BINARY
125
- if meaning_type == FileColumnMeaningType.IP_RANGE_FROM_BINARY:
126
- return SearchKey.IP_RANGE_FROM_BINARY
127
- if meaning_type == FileColumnMeaningType.IP_RANGE_TO_BINARY:
128
- return SearchKey.IP_RANGE_TO_BINARY
124
+ return SearchKey.IP
125
+ # if meaning_type == FileColumnMeaningType.IP_RANGE_FROM_BINARY:
126
+ # return SearchKey.IP_RANGE_FROM_BINARY
127
+ # if meaning_type == FileColumnMeaningType.IP_RANGE_TO_BINARY:
128
+ # return SearchKey.IP_RANGE_TO_BINARY
129
129
 
130
130
  @staticmethod
131
131
  def find_key(search_keys: Dict[str, SearchKey], keys: Union[SearchKey, List[SearchKey]]) -> Optional[SearchKey]:
@@ -249,7 +249,9 @@ class FileMetadata(BaseModel):
249
249
  for key in keys_group:
250
250
  column = self.column_by_name(key)
251
251
  if column:
252
- search_keys[SearchKey.from_meaning_type(column.meaningType)] = column.name
252
+ search_key = SearchKey.from_meaning_type(column.meaningType)
253
+ if search_key is not None:
254
+ search_keys[search_key] = column.name
253
255
  return search_keys
254
256
 
255
257
 
@@ -111,7 +111,6 @@ x_is_empty=X is empty
111
111
  y_is_empty=y is empty
112
112
  x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
113
113
  missing_generate_feature=Feature {} specified in `generate_features` is not present in input columns: {}
114
- missing_column_for_online_api=Column {} specified in `columns_for_online_api` is not present in input columns: {}
115
114
  x_unstable_by_date=Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
116
115
  train_unstable_target=Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
117
116
  eval_unstable_target=Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
@@ -116,17 +116,17 @@ class EmailSearchKeyConverter:
116
116
  else:
117
117
  df[self.hem_column] = df[self.hem_column].astype("string").str.lower()
118
118
 
119
- # del self.search_keys[self.email_column]
120
- # if self.email_column in self.unnest_search_keys:
121
- # self.unnest_search_keys.remove(self.email_column)
119
+ del self.search_keys[self.email_column]
120
+ if self.email_column in self.unnest_search_keys:
121
+ self.unnest_search_keys.remove(self.email_column)
122
122
 
123
123
  one_domain_name = self.email_column + self.ONE_DOMAIN_SUFFIX
124
124
  df[one_domain_name] = df[self.email_column].apply(self._email_to_one_domain)
125
125
  self.columns_renaming[one_domain_name] = original_email_column
126
126
  self.search_keys[one_domain_name] = SearchKey.EMAIL_ONE_DOMAIN
127
127
 
128
- # if self.email_converted_to_hem:
129
- # df = df.drop(columns=self.email_column)
130
- # del self.columns_renaming[self.email_column]
128
+ if self.email_converted_to_hem:
129
+ df = df.drop(columns=self.email_column)
130
+ del self.columns_renaming[self.email_column]
131
131
 
132
132
  return df
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.55a1
3
+ Version: 1.2.55a2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,12 +1,12 @@
1
- upgini/__about__.py,sha256=86s5f_R4jUHavxr1KulObYZALbZX82z6GRJ0aiY88oY,25
1
+ upgini/__about__.py,sha256=AoPRhBb-_Kg5q3atFlIhgGYnF89Sf3FiehaPlpQQx5M,25
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=NP5vHqEfZQ1HWz3TcNAa_OhXG8wiMRdydm26D6UBiRU,34166
4
+ upgini/dataset.py,sha256=vT4JyHmafLNbj54SySXr93f5hNS6-t94aFslbBy-7No,33535
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=ZYBLKxkFZB-b94-7oiAqZ0b4sbA1wXKClfkxVU529e0,200957
6
+ upgini/features_enricher.py,sha256=Te4ZbFZ2RCEi9NHo1ddWaxfkTep_3O6Okct3U_DWeD0,201520
7
7
  upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
- upgini/metadata.py,sha256=-bulW4ay0qxOxR4_5oDvENtgKiZgZI-QnU2stGHrKhg,12130
9
+ upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
10
10
  upgini/metrics.py,sha256=hr7UwLphbZ_FEglLuO2lzr_pFgxOJ4c3WBeg7H-fNqY,35521
11
11
  upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
12
12
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
30
30
  upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
- upgini/resource_bundle/strings.properties,sha256=UXMiaFP3p-WdiXyZJN3O_OZstb-F33BWVDxDiofyxd4,27464
33
+ upgini/resource_bundle/strings.properties,sha256=0_KAExIi1u48N1CQ13LKJS3bgDlRs-MPOyU3VxcE-qY,27350
34
34
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
35
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -46,7 +46,7 @@ upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
46
46
  upgini/utils/datetime_utils.py,sha256=RVAk4_rakK8X9zjybK3-rj0to0e3elye8tnBuA4wTWU,13491
47
47
  upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
48
48
  upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
49
- upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
49
+ upgini/utils/email_utils.py,sha256=GbnhHJn1nhUBytmK6PophYqaoq4t7Lp6i0-O0Gd3RV8,5265
50
50
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
51
51
  upgini/utils/feature_info.py,sha256=0rOXSyCj-sw-8migWP0ge8qrOzGU50dQvH0JUJUrDfQ,6766
52
52
  upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
59
59
  upgini/utils/target_utils.py,sha256=RlpKGss9kMibVSlA8iZuO_qxmyeplqzn7X8g6hiGGGs,14341
60
60
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
61
61
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
62
- upgini-1.2.55a1.dist-info/METADATA,sha256=uESME_SjmqafjB1XBhRSZWYFrv4rPx5F1pKSxEKKsNs,49057
63
- upgini-1.2.55a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
64
- upgini-1.2.55a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
- upgini-1.2.55a1.dist-info/RECORD,,
62
+ upgini-1.2.55a2.dist-info/METADATA,sha256=j01z5ab2V_U_jFU4mNVMFcSmWqsb9ePRLJsgK6BunU0,49057
63
+ upgini-1.2.55a2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
64
+ upgini-1.2.55a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
+ upgini-1.2.55a2.dist-info/RECORD,,