upgini 1.2.87.dev3__py3-none-any.whl → 1.2.87.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.87.dev3"
1
+ __version__ = "1.2.87.dev4"
@@ -2299,11 +2299,16 @@ if response.status_code == 200:
2299
2299
 
2300
2300
  self.__log_debug_information(validated_X, validated_y, exclude_features_sources=exclude_features_sources)
2301
2301
 
2302
- self.__validate_search_keys(self.search_keys, self.search_id)
2302
+ filtered_columns = self.__filtered_enriched_features(
2303
+ importance_threshold, max_features, trace_id, validated_X
2304
+ )
2305
+ # If there are no important features, return original dataframe
2306
+ if not filtered_columns:
2307
+ msg = self.bundle.get("no_important_features_for_transform")
2308
+ self.__log_warning(msg, show_support_link=True)
2309
+ return X, {c: c for c in X.columns}, [], dict()
2303
2310
 
2304
- if len(self.feature_names_) == 0:
2305
- self.logger.warning(self.bundle.get("no_important_features_for_transform"))
2306
- return X, {c: c for c in X.columns}, [], {}
2311
+ self.__validate_search_keys(self.search_keys, self.search_id)
2307
2312
 
2308
2313
  if self._has_paid_features(exclude_features_sources):
2309
2314
  msg = self.bundle.get("transform_with_paid_features")
@@ -2444,6 +2449,8 @@ if response.status_code == 200:
2444
2449
  # Explode multiple search keys
2445
2450
  df, unnest_search_keys = self._explode_multiple_search_keys(df, search_keys, columns_renaming)
2446
2451
 
2452
+ # Convert search keys and generate features on them
2453
+
2447
2454
  email_column = self._get_email_column(search_keys)
2448
2455
  hem_column = self._get_hem_column(search_keys)
2449
2456
  if email_column:
@@ -2484,6 +2491,26 @@ if response.status_code == 200:
2484
2491
  converter = PostalCodeSearchKeyConverter(postal_code)
2485
2492
  df = converter.convert(df)
2486
2493
 
2494
+ # TODO return X + generated features
2495
+ # external_features = [fm for fm in features_meta if fm.shap_value > 0 and fm.source != "etalon"]
2496
+ # if not external_features:
2497
+ # # Unexplode dataframe back to original shape
2498
+ # if len(unnest_search_keys) > 0:
2499
+ # df = df.groupby(ENTITY_SYSTEM_RECORD_ID).first().reset_index()
2500
+
2501
+ # # Get important features from etalon source
2502
+ # etalon_features = [fm.name for fm in features_meta if fm.shap_value > 0 and fm.source == "etalon"]
2503
+
2504
+ # # Select only etalon features that exist in dataframe
2505
+ # available_etalon_features = [f for f in etalon_features if f in df.columns]
2506
+
2507
+ # # Return original dataframe with only important etalon features
2508
+ # result = df[available_etalon_features].copy()
2509
+ # result.index = validated_Xy.index
2510
+
2511
+ # return result, columns_renaming, generated_features, search_keys
2512
+ # ...
2513
+
2487
2514
  meaning_types = {}
2488
2515
  meaning_types.update({col: FileColumnMeaningType.FEATURE for col in features_for_transform})
2489
2516
  meaning_types.update({col: key.value for col, key in search_keys.items()})
@@ -2637,9 +2664,6 @@ if response.status_code == 200:
2637
2664
  for c in itertools.chain(validated_Xy.columns.tolist(), generated_features)
2638
2665
  if c not in self.dropped_client_feature_names_
2639
2666
  ]
2640
- filtered_columns = self.__filtered_enriched_features(
2641
- importance_threshold, max_features, trace_id, validated_X
2642
- )
2643
2667
  selecting_columns.extend(
2644
2668
  c for c in filtered_columns if c in result.columns and c not in validated_X.columns
2645
2669
  )
@@ -2942,7 +2966,10 @@ if response.status_code == 200:
2942
2966
  self.__log_warning(fintech_warning)
2943
2967
  df, full_duplicates_warning = clean_full_duplicates(df, self.logger, bundle=self.bundle)
2944
2968
  if full_duplicates_warning:
2945
- self.__log_warning(full_duplicates_warning)
2969
+ if len(df) == 0:
2970
+ raise ValidationError(full_duplicates_warning)
2971
+ else:
2972
+ self.__log_warning(full_duplicates_warning)
2946
2973
 
2947
2974
  # Explode multiple search keys
2948
2975
  df = self.__add_fit_system_record_id(
upgini/metrics.py CHANGED
@@ -807,10 +807,10 @@ class CatBoostWrapper(EstimatorWrapper):
807
807
  try:
808
808
  from catboost import Pool
809
809
 
810
+ cat_features = None
810
811
  if cat_encoder is not None:
811
812
  if isinstance(self.cv, TimeSeriesSplit) or isinstance(self.cv, BlockedTimeSeriesSplit):
812
813
  encoded = cat_encoder.transform(x[self.cat_features]).astype(int)
813
- cat_features = None
814
814
  else:
815
815
  encoded = cat_encoder.transform(x[self.cat_features])
816
816
  cat_features = encoded.columns.to_list()
@@ -156,7 +156,7 @@ dataset_too_few_rows=X size should be at least {} rows after validation
156
156
  dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample X
157
157
  dataset_empty_column_names=Some column names are empty. Add names please
158
158
  dataset_full_duplicates={:.5f}% of the rows are fully duplicated
159
- dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
159
+ dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nSample of incorrect row indexes: {}
160
160
  dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
161
161
  dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
162
162
  dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
@@ -192,7 +192,7 @@ def clean_full_duplicates(
192
192
  unique_columns.remove(TARGET)
193
193
  marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
194
194
  if marked_duplicates.sum() > 0:
195
- dups_indices = df[marked_duplicates].index.to_list()
195
+ dups_indices = df[marked_duplicates].index.to_list()[:100]
196
196
  nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
197
197
  num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
198
198
  share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.87.dev3
3
+ Version: 1.2.87.dev4
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,12 +1,12 @@
1
- upgini/__about__.py,sha256=-MoNpjvEXC0uIle8xxIgQduzBZJlNzuW-1rPMTm_xc8,28
1
+ upgini/__about__.py,sha256=snYX5GSOXf809cKcpmiRzx30DuIAydReavaEB237z1A,28
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=fRtqSkXNONLnPe6cCL967GMt349FTIpXzy_u8LUKncw,35354
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=n8KBoBgJApLiRv4wXeSgfS-PfbB1D5aDOJfFnL0q6v8,214487
6
+ upgini/features_enricher.py,sha256=8KJiPXTFSiJUl5hJPEhMwhpXqPnGm3LrX31pKwlYe3k,215900
7
7
  upgini/http.py,sha256=6Qcepv0tDC72mBBJxYHnA2xqw6QwFaKrXN8o4vju8Es,44372
8
8
  upgini/metadata.py,sha256=zt_9k0iQbWXuiRZcel4ORNPdQKt6Ou69ucZD_E1Q46o,12341
9
- upgini/metrics.py,sha256=CR_MKBcq1RlNMXeqc9S374JzHgunMl-mEmlTnZAm_VI,45236
9
+ upgini/metrics.py,sha256=64M7RGbr9dItbXPYqWmeKhpBGHO4B69eV9Rj6P18_qg,45228
10
10
  upgini/search_task.py,sha256=Q5HjBpLIB3OCxAD1zNv5yQ3ZNJx696WCK_-H35_y7Rs,17912
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
12
12
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
38
38
  upgini/normalizer/normalize_utils.py,sha256=g2TcDXZeJp9kAFO2sTqZ4CAsN4J1qHNgoJHZ8gtzUWo,7376
39
39
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
40
40
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
41
- upgini/resource_bundle/strings.properties,sha256=U_ewTI-qPww4X3WcFG3qDf_jv2vo6RrlCehVDjqtzEI,27991
41
+ upgini/resource_bundle/strings.properties,sha256=Q__3SNuespbG9bRJ9Gq4E_w665NPe8EZ7Pcng8B1V8Y,28001
42
42
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
43
43
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
44
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -52,7 +52,7 @@ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk
52
52
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
53
53
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
54
54
  upgini/utils/datetime_utils.py,sha256=UL1ernnawW0LV9mPDpCIc6sFy0HUhFscWVNwfH4V7rI,14366
55
- upgini/utils/deduplicate_utils.py,sha256=jm9ARZ0fbJFF3aJqj-xm_T6lNh-WErM0H0h6B_L1xQc,8948
55
+ upgini/utils/deduplicate_utils.py,sha256=EpBVCov42-FJIAPfa4jY_ZRct3N2MFaC7i-oJNZ_MGI,8954
56
56
  upgini/utils/display_utils.py,sha256=hAeWEcJtPDg8fAVcMNrNB-azFD2WJp1nvbPAhR7SeP4,12071
57
57
  upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
58
58
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
70
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
71
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
72
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
73
- upgini-1.2.87.dev3.dist-info/METADATA,sha256=Pm-acVK8TpDLvPsO0qluwSjmu0cb3FHmtXmqMj--2Ag,49167
74
- upgini-1.2.87.dev3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
- upgini-1.2.87.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
- upgini-1.2.87.dev3.dist-info/RECORD,,
73
+ upgini-1.2.87.dev4.dist-info/METADATA,sha256=PpZ-d4CiDjy-RnXvTGmyEXh-Q_Mjkdf1UaGyVFniqCw,49167
74
+ upgini-1.2.87.dev4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
+ upgini-1.2.87.dev4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.87.dev4.dist-info/RECORD,,