PyPI - upgini - Versions diffs - 1.2.124__py3-none-any.whl → 1.2.146a4__py3-none-any.whl - Mend

upgini-1.2.124-py3-none-any.whl → upgini-1.2.146a4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic. More details are linked from the original diff page.

Files changed (23)

upgini/__about__.py +1 -1
upgini/autofe/binary.py +4 -3
upgini/data_source/data_source_publisher.py +1 -9
upgini/dataset.py +56 -6
upgini/features_enricher.py +634 -556
upgini/http.py +2 -2
upgini/metadata.py +16 -2
upgini/normalizer/normalize_utils.py +6 -6
upgini/resource_bundle/strings.properties +15 -11
upgini/search_task.py +14 -2
upgini/utils/base_search_key_detector.py +5 -1
upgini/utils/datetime_utils.py +125 -39
upgini/utils/deduplicate_utils.py +8 -5
upgini/utils/display_utils.py +61 -20
upgini/utils/feature_info.py +18 -7
upgini/utils/features_validator.py +6 -4
upgini/utils/postal_code_utils.py +35 -2
upgini/utils/target_utils.py +3 -1
upgini/utils/track_info.py +29 -1
{upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/METADATA +123 -121
{upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/RECORD +23 -23
{upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/WHEEL +1 -1
{upgini-1.2.124.dist-info → upgini-1.2.146a4.dist-info}/licenses/LICENSE +0 -0

upgini/http.py CHANGED Viewed

@@ -433,8 +433,8 @@ class _RestClient:
             with open(file_path, "rb") as file:
                 content = file.read()
                 md5_hash.update(content)
-                digest = md5_hash.hexdigest()
-                metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest})
+                digest_md5 = md5_hash.hexdigest()
+                metadata_with_md5 = pydantic_copy_method(metadata)(update={"checksumMD5": digest_md5})
             digest_sha256 = file_hash(file_path)
             metadata_with_md5 = pydantic_copy_method(metadata_with_md5)(update={"digest": digest_sha256})

upgini/metadata.py CHANGED Viewed

@@ -12,10 +12,19 @@ SORT_ID = "sort_id"
 EVAL_SET_INDEX = "eval_set_index"
 TARGET = "target"
 COUNTRY = "country_iso_code"
+CURRENT_DATE_COL = "current_date_"
 RENAMED_INDEX = "index_col"
 DEFAULT_INDEX = "index"
 ORIGINAL_INDEX = "original_index"
-SYSTEM_COLUMNS = {SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST, EVAL_SET_INDEX, TARGET, COUNTRY}
+SYSTEM_COLUMNS = {
+    SYSTEM_RECORD_ID,
+    ENTITY_SYSTEM_RECORD_ID,
+    SEARCH_KEY_UNNEST,
+    EVAL_SET_INDEX,
+    TARGET,
+    COUNTRY,
+    CURRENT_DATE_COL,
+}
 class FileColumnMeaningType(Enum):
@@ -36,6 +45,8 @@ class FileColumnMeaningType(Enum):
     SCORE = "SCORE"
     TARGET = "TARGET"
     FEATURE = "FEATURE"
+    GENERATED_FEATURE = "GENERATED_FEATURE"
+    DATE_FEATURE = "DATE_FEATURE"
     CUSTOM_KEY = "CUSTOM_KEY"
     COUNTRY = "COUNTRY"
     POSTAL_CODE = "POSTAL_CODE"
@@ -85,7 +96,7 @@ class SearchKey(Enum):
         return [SearchKey.EMAIL, SearchKey.HEM, SearchKey.IP, SearchKey.PHONE]
     @staticmethod
-    def from_meaning_type(meaning_type: FileColumnMeaningType) -> "SearchKey":
+    def from_meaning_type(meaning_type: FileColumnMeaningType) -> Optional["SearchKey"]:
         if meaning_type == FileColumnMeaningType.EMAIL:
             return SearchKey.EMAIL
         if meaning_type == FileColumnMeaningType.HEM:
@@ -250,6 +261,9 @@ class FileMetadata(BaseModel):
     rowsCount: Optional[int] = None
     checksumMD5: Optional[str] = None
     digest: Optional[str] = None
+    deterministicDigest: Optional[str] = None
+    droppedColumns: Optional[List[str]] = None
+    autodetectedSearchKeys: Optional[Dict[str, str]] = None
     def column_by_name(self, name: str) -> Optional[FileColumnMetadata]:
         for c in self.columns:

upgini/normalizer/normalize_utils.py CHANGED Viewed

@@ -5,7 +5,6 @@ from typing import Dict, List, Tuple
 import numpy as np
 import pandas as pd
 from pandas.api.types import is_bool_dtype as is_bool
-from pandas.api.types import is_datetime64_any_dtype as is_datetime
 from pandas.api.types import (
     is_float_dtype,
     is_numeric_dtype,
@@ -25,7 +24,7 @@ from upgini.metadata import (
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 from upgini.utils import find_numbers_with_decimal_comma
 from upgini.utils.country_utils import CountrySearchKeyConverter
-from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
+from upgini.utils.datetime_utils import DateTimeConverter
 from upgini.utils.ip_utils import IpSearchKeyConverter
 from upgini.utils.phone_utils import PhoneSearchKeyConverter
 from upgini.utils.postal_code_utils import PostalCodeSearchKeyConverter
@@ -45,7 +44,7 @@ class Normalizer:
         self.columns_renaming = {}
         self.search_keys = {}
         self.generated_features = []
-        self.removed_features = []
+        self.removed_datetime_features = []
     def normalize(
         self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
@@ -89,7 +88,7 @@ class Normalizer:
                 SYSTEM_RECORD_ID,
                 ENTITY_SYSTEM_RECORD_ID,
                 SEARCH_KEY_UNNEST,
-                DateTimeSearchKeyConverter.DATETIME_COL,
+                DateTimeConverter.DATETIME_COL,
             ]:
                 self.columns_renaming[column] = column
                 new_columns.append(column)
@@ -134,8 +133,9 @@ class Normalizer:
         features = self._get_features(df)
         for f in features:
-            if is_datetime(df[f]) or isinstance(df[f].dtype, pd.PeriodDtype):
-                self.removed_features.append(f)
+            converter = DateTimeConverter(f)
+            if converter.is_datetime(df) and f != DateTimeConverter.DATETIME_COL:
+                self.removed_datetime_features.append(f)
                 df.drop(columns=f, inplace=True)
         return df

upgini/resource_bundle/strings.properties CHANGED Viewed

@@ -12,7 +12,8 @@ polling_unregister_information=We'll send email notification once it's completed
 ads_upload_finish=Thank you for your submission!\nWe'll check your data sharing proposal and get back to you
 demo_dataset_info=Demo training dataset detected. Registration for an API key is not required.\n
 transform_usage_info=You use Trial access to Upgini data enrichment. Limit for Trial: {} rows. You have already enriched: {} rows.
-transform_usage_warning=You are trying to launch enrichment for {} rows, which will exceed the rest limit {}.
+transform_usage_warning_demo=Unregistered-user limit: {} rows remaining; you requested {}.
+transform_usage_warning_registered=Free tier limit: {} rows remaining; you requested {}.
 # Warnings
 support_link=https://upgini.com/support
@@ -139,6 +140,7 @@ x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
 eval_x_has_train_samples=Eval set X has rows that are present in train set X
 oot_without_date_not_supported=Eval set {} provided as OOT but date column is missing. It will be ignored for stability check
 oot_with_online_sources_not_supported=Eval set {} provided as OOT and also provided columns for online API. It will be ignored for stability check
+autodetected_search_key_not_found=Autodetected on fit search key {} not found in X columns: {} for transform
 baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
 baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
@@ -174,7 +176,8 @@ dataset_invalid_multiclass_target=Unexpected dtype of target for multiclass task
 dataset_invalid_regression_target=Unexpected dtype of target for regression task type: {}. Expected float
 dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
 dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
-dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
+dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class
+#\nPlease, remove rows with rarest class from your dataframe
 dataset_rarest_class_less_threshold=Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
 dataset_date_features=Columns {} is a datetime or period type but not used as a search key, removed from X
 dataset_too_many_features=Too many features. Maximum number of features is {}
@@ -209,15 +212,16 @@ features_info_zero_important_features=Oops, we can't find any relevant external
 features_info_zero_hit_rate_search_keys=Oops, looks like values/formats of the search keys {} might be incorrect,\nas we won't be able to match any data source using these values\nPlease check docs https://github.com/upgini/upgini#-search-key-types-we-support-more-to-come or send us a help request in Support:
 features_not_generated=Following features didn't pass checks for automated feature generation: {}
 # Information
-postal_code_detected=Postal codes detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
-country_detected=Countries detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
-country_auto_determined=Search key country_code `{}` was automatically determined by client IP. \nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
-country_default_determined=Search key country_code `{}` was used as default. \nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
-email_detected=Emails detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
-email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
-phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
-phone_detected_not_registered=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
-target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
+datetime_detected=Datetime detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
+postal_code_detected=Postal codes detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
+country_detected=Countries detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
+country_auto_determined=Search key country_code `{}` was automatically determined by client IP. \nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
+country_default_determined=Search key country_code `{}` was used as default. \nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
+email_detected=Emails detected in column `{}`. It will be used as a search key\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
+email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
+phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
+phone_detected_not_registered=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns\n
+target_type_detected=Detected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
 binary_target_reason=only two unique label-values observed
 non_numeric_multiclass_reason=non-numeric label values observed
 few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical

upgini/search_task.py CHANGED Viewed

@@ -165,10 +165,21 @@ class SearchTask:
         return list(zero_hit_search_keys)
-    def get_features_for_transform(self) -> Optional[List[str]]:
+    def get_features_for_embeddings(self) -> Optional[List[str]]:
         if self.provider_metadata_v2 is None:
             return None
+        features_for_transform = set()
+        for meta in self.provider_metadata_v2:
+            if meta.features_used_for_embeddings is not None:
+                features_for_transform.update(meta.features_used_for_embeddings)
+        return list(features_for_transform)
+    def get_features_for_transform(self) -> List[str]:
+        if self.provider_metadata_v2 is None:
+            return []
         features_for_transform = set()
         for meta in self.provider_metadata_v2:
             if meta.features_used_for_embeddings is not None:
@@ -423,4 +434,5 @@ def _read_parquet(file_content: bytes, file_name: str = "features.parquet"):
         tmp_file_name = f"{tmp_dir}/{file_name}"
         with open(tmp_file_name, "wb") as gzip_file:
             gzip_file.write(file_content)
-        return pd.read_parquet(tmp_file_name, engine="fastparquet")
+        # Note: MLB writes files using pyarrow, so reading with fastparquet may cause errors.
+        return pd.read_parquet(tmp_file_name, engine="pyarrow")

upgini/utils/base_search_key_detector.py CHANGED Viewed

@@ -24,4 +24,8 @@ class BaseSearchKeyDetector:
         for column_name in other_columns:
             if self._is_search_key_by_values(df[column_name]):
                 columns_by_values.append(column_name)
-        return list(set(columns_by_names + columns_by_values))
+        both = [col for col in columns_by_names if col in columns_by_values]
+        only_values = [col for col in columns_by_values if col not in columns_by_names]
+        only_names = [col for col in columns_by_names if col not in columns_by_values]
+        return both + only_values + only_names

upgini/utils/datetime_utils.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import datetime
 import logging
-import re
 from typing import Dict, List, Optional
 import numpy as np
@@ -11,6 +10,7 @@ from pandas.api.types import is_numeric_dtype
 from upgini.errors import ValidationError
 from upgini.metadata import EVAL_SET_INDEX, SearchKey
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
+from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
 DATE_FORMATS = [
     "%Y-%m-%d",
@@ -30,7 +30,16 @@ DATE_FORMATS = [
 DATETIME_PATTERN = r"^[\d\s\.\-:T/+]+$"
-class DateTimeSearchKeyConverter:
+class DateSearchKeyDetector(BaseSearchKeyDetector):
+    def _is_search_key_by_name(self, column_name: str) -> bool:
+        lower_column_name = str(column_name).lower()
+        return "date" in lower_column_name or "time" in lower_column_name or "timestamp" in lower_column_name
+    def _is_search_key_by_values(self, column: pd.Series) -> bool:
+        return DateTimeConverter(column.name).is_datetime(column.to_frame(column.name))
+class DateTimeConverter:
     DATETIME_COL = "_date_time"
     # MIN_SUPPORTED_DATE_TS = datetime.datetime(1999, 12, 31)  # 946684800000  # 2000-01-01
     MIN_SUPPORTED_DATE_TS = pd.to_datetime(datetime.datetime(1999, 12, 31)).tz_localize(None)
@@ -67,47 +76,108 @@ class DateTimeSearchKeyConverter:
         try:
             if s is None or len(str(s).strip()) == 0:
                 return None
-            if not re.match(DATETIME_PATTERN, str(s)):
+            if sum(ch.isdigit() for ch in str(s)) < 6:
                 return None
             return s
         except Exception:
             return None
-    def convert(self, df: pd.DataFrame, keep_time=False) -> pd.DataFrame:
-        if len(df) == 0:
-            return df
+    def is_datetime(self, df: pd.DataFrame) -> bool:
+        if len(df) == 0 or df[self.date_column].isna().all():
+            return False
-        df = df.copy()
-        if df[self.date_column].apply(lambda x: isinstance(x, datetime.datetime)).all():
-            df[self.date_column] = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
-        elif isinstance(df[self.date_column].values[0], datetime.date):
-            df[self.date_column] = pd.to_datetime(df[self.date_column], errors="coerce")
-        elif isinstance(df[self.date_column].dtype, pd.PeriodDtype):
-            df[self.date_column] = df[self.date_column].dt.to_timestamp()
-        elif is_numeric_dtype(df[self.date_column]):
-            # 315532801 - 2524608001    - seconds
-            # 315532801000 - 2524608001000 - milliseconds
-            # 315532801000000 - 2524608001000000 - microseconds
-            # 315532801000000000 - 2524608001000000000 - nanoseconds
-            if df[self.date_column].apply(lambda x: 10**16 < x).all():
-                df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ns")
-            elif df[self.date_column].apply(lambda x: 10**14 < x < 10**16).all():
-                df[self.date_column] = pd.to_datetime(df[self.date_column], unit="us")
-            elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
-                df[self.date_column] = pd.to_datetime(df[self.date_column], unit="ms")
-            elif df[self.date_column].apply(lambda x: 0 < x < 10**11).all():
-                df[self.date_column] = pd.to_datetime(df[self.date_column], unit="s")
+        if pd.api.types.is_datetime64_any_dtype(df[self.date_column]):
+            return True
+        parsed = self.parse_datetime(df, raise_errors=False)
+        return parsed is not None and parsed.isna().mean() <= 0.5
+    def parse_datetime(self, df: pd.DataFrame, raise_errors=True) -> pd.Series | None:
+        if len(df) == 0 or df[self.date_column].isna().all():
+            return None
+        date_col = df[self.date_column].copy()
+        try:
+            if date_col.apply(lambda x: isinstance(x, datetime.datetime)).all():
+                parsed_datetime = date_col.apply(lambda x: x.replace(tzinfo=None))
+            elif isinstance(date_col.dropna().values[0], datetime.date):
+                parsed_datetime = pd.to_datetime(date_col, errors="coerce")
+            elif isinstance(date_col.dtype, pd.PeriodDtype):
+                parsed_datetime = date_col.dt.to_timestamp()
+            elif is_numeric_dtype(date_col):
+                # 315532801 - 2524608001    - seconds
+                # 315532801000 - 2524608001000 - milliseconds
+                # 315532801000000 - 2524608001000000 - microseconds
+                # 315532801000000000 - 2524608001000000000 - nanoseconds
+                if date_col.apply(lambda x: 10**16 < x).all():
+                    parsed_datetime = pd.to_datetime(date_col, unit="ns")
+                elif date_col.apply(lambda x: 10**14 < x < 10**16).all():
+                    parsed_datetime = pd.to_datetime(date_col, unit="us")
+                elif date_col.apply(lambda x: 10**11 < x < 10**14).all():
+                    parsed_datetime = pd.to_datetime(date_col, unit="ms")
+                elif date_col.apply(lambda x: 10**8 < x < 10**11).all():
+                    parsed_datetime = pd.to_datetime(date_col, unit="s")
+                else:
+                    msg = self.bundle.get("unsupported_date_type").format(self.date_column)
+                    if raise_errors:
+                        raise ValidationError(msg)
+                    else:
+                        return None
+            else:
+                date_col = date_col.astype("string").apply(self.clean_date)
+                parsed_datetime = self.parse_string_date(date_col.to_frame(self.date_column), raise_errors)
+                if parsed_datetime.isna().all():
+                    raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
+            parsed_datetime = parsed_datetime.dt.tz_localize(None)
+            return parsed_datetime
+        except Exception as e:
+            if raise_errors:
+                raise ValidationError(e)
             else:
-                msg = self.bundle.get("unsupported_date_type").format(self.date_column)
-                raise ValidationError(msg)
+                return None
+    def to_date_string(self, df: pd.DataFrame) -> pd.Series:
+        parsed_datetime = self.parse_datetime(df)
+        if parsed_datetime is None:
+            return df[self.date_column]
+        return parsed_datetime.dt.strftime("%Y-%m-%d")
+    def to_date_ms(self, df: pd.DataFrame) -> pd.Series:
+        parsed_datetime = self.parse_datetime(df)
+        if parsed_datetime is None:
+            return df[self.date_column]
+        return self.convert_datetime_to_date_ms(parsed_datetime)
+    def convert_datetime_to_datetime_ms(self, date_col: pd.Series) -> pd.Series:
+        if date_col.dt.unit == "ns":
+            date_col = date_col.astype(np.int64) // 1_000_000
+        elif date_col.dt.unit == "us":
+            date_col = date_col.astype(np.int64) // 1_000
+        elif date_col.dt.unit == "ms":
+            date_col = date_col.astype(np.int64)
+        elif date_col.dt.unit == "s":
+            date_col = date_col.astype(np.int64) * 1_000
         else:
-            df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
-            df[self.date_column] = self.parse_date(df)
+            raise ValueError(f"Unsupported date unit: {date_col.dt.unit}")
+        return date_col.apply(self._int_to_opt).astype("Int64")
+    def convert_datetime_to_date_ms(self, date_col: pd.Series) -> pd.Series:
+        date_col = date_col.dt.floor("D")
+        return self.convert_datetime_to_datetime_ms(date_col)
+    def convert(self, df: pd.DataFrame, keep_time=False) -> pd.DataFrame:
+        df = df.copy()
+        parsed_datetime = self.parse_datetime(df)
+        if parsed_datetime is None:
+            return df
+        df[self.date_column] = parsed_datetime
         # If column with date is datetime then extract seconds of the day and minute of the hour
         # as additional features
         seconds = "datetime_seconds"
-        df[self.date_column] = df[self.date_column].dt.tz_localize(None)
         df = self.clean_old_dates(df)
@@ -182,21 +252,22 @@ class DateTimeSearchKeyConverter:
             df.drop(columns=seconds, inplace=True)
         if keep_time:
-            df[self.DATETIME_COL] = df[self.date_column].astype(np.int64) // 1_000_000
-            df[self.DATETIME_COL] = df[self.DATETIME_COL].apply(self._int_to_opt).astype("Int64")
-        df[self.date_column] = df[self.date_column].dt.floor("D").astype(np.int64) // 1_000_000
-        df[self.date_column] = df[self.date_column].apply(self._int_to_opt).astype("Int64")
+            df[self.DATETIME_COL] = self.convert_datetime_to_datetime_ms(df[self.date_column])
+        df[self.date_column] = self.convert_datetime_to_date_ms(df[self.date_column])
         self.logger.info(f"Date after convertion to timestamp: {df[self.date_column]}")
         return df
-    def parse_date(self, df: pd.DataFrame):
+    def parse_string_date(self, df: pd.DataFrame, raise_errors=True) -> pd.Series | None:
         if self.date_format is not None:
             try:
                 return pd.to_datetime(df[self.date_column], format=self.date_format)
             except ValueError as e:
-                raise ValidationError(e)
+                if raise_errors:
+                    raise ValidationError(e)
+                else:
+                    return None
         else:
             for date_format in DATE_FORMATS:
                 try:
@@ -204,9 +275,20 @@ class DateTimeSearchKeyConverter:
                 except ValueError:
                     pass
             try:
-                return pd.to_datetime(df[self.date_column])
+                # Suppress warning for intentional fallback to dateutil parsing
+                import warnings
+                with warnings.catch_warnings():
+                    warnings.filterwarnings("ignore", message="Could not infer format")
+                    return pd.to_datetime(df[self.date_column])
             except ValueError:
-                raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
+                try:
+                    return pd.to_datetime(df[self.date_column], format="mixed", errors="raise")
+                except ValueError:
+                    if raise_errors:
+                        raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
+                    else:
+                        return None
     def clean_old_dates(self, df: pd.DataFrame) -> pd.DataFrame:
         condition = df[self.date_column] <= self.MIN_SUPPORTED_DATE_TS
@@ -339,6 +421,10 @@ def is_dates_distribution_valid(
         if maybe_date_col is None:
             return
+        # Don't check if date column is constant
+        if X[maybe_date_col].nunique() <= 1:
+            return
         if isinstance(X[maybe_date_col].dtype, pd.PeriodDtype):
             dates = X[maybe_date_col].dt.to_timestamp().dt.date
         elif pd.__version__ >= "2.0.0":

upgini/utils/deduplicate_utils.py CHANGED Viewed

@@ -14,7 +14,7 @@ from upgini.metadata import (
     SearchKey,
 )
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
-from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
+from upgini.utils.datetime_utils import DateTimeConverter
 from upgini.utils.target_utils import define_task
@@ -31,7 +31,7 @@ def remove_fintech_duplicates(
         logger = logging.getLogger()
         logger.setLevel(logging.FATAL)
     date_col = _get_column_by_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME])
-    if define_task(df[TARGET], date_col is not None, silent=True) != ModelTaskType.BINARY:
+    if define_task(df[TARGET], date_col is not None, logger=logger, silent=True) != ModelTaskType.BINARY:
         return df, []
     if date_col is None:
@@ -104,7 +104,7 @@ def remove_fintech_duplicates(
         sub_df = pd.merge(sub_df, nonunique_target_rows, on=personal_cols)
         # Convert date columns for further checks
-        sub_df = DateTimeSearchKeyConverter(
+        sub_df = DateTimeConverter(
             date_col, date_format=date_format, logger=logger, bundle=bundle, generate_cyclical_features=False
         ).convert(sub_df)
         grouped_by_personal_cols = sub_df.groupby(personal_cols, group_keys=False)
@@ -160,7 +160,10 @@ def remove_fintech_duplicates(
 def clean_full_duplicates(
-    df: pd.DataFrame, logger: Optional[Logger] = None, bundle: Optional[ResourceBundle] = None
+    df: pd.DataFrame,
+    is_transform: bool = False,
+    logger: Optional[Logger] = None,
+    bundle: Optional[ResourceBundle] = None,
 ) -> Tuple[pd.DataFrame, Optional[str]]:
     if logger is None:
         logger = logging.getLogger()
@@ -193,7 +196,7 @@ def clean_full_duplicates(
         logger.warning(bundle.get("dataset_full_duplicates").format(share_full_dedup))
     msg = None
-    if TARGET in df.columns:
+    if not is_transform and TARGET in df.columns:
         unique_columns.remove(TARGET)
         # Separate rows to exclude from deduplication:

upgini/utils/display_utils.py CHANGED Viewed

@@ -8,7 +8,6 @@ from io import StringIO
 from typing import Callable, List, Optional
 import pandas as pd
-from xhtml2pdf import pisa
 from upgini.__about__ import __version__
@@ -325,31 +324,73 @@ def show_button_download_pdf(
     # html = HTML(string=source)
     # html.write_pdf(file_name)
-    with open(file_name, "wb") as output:
-        pisa.CreatePDF(src=StringIO(source), dest=output, encoding="UTF-8")
-    with open(file_name, "rb") as f:
-        b64 = base64.b64encode(f.read())
-        payload = b64.decode()
-        html = f"""<a download="{file_name}" href="data:application/pdf;base64,{payload}" target="_blank">
-        <button>{title}</button></a>"""
-        if display_handle is not None:
-            display_handle.update(HTML(html))
-        else:
-            return display(HTML(html), display_id=display_id)
+    try:
+        from xhtml2pdf import pisa
+        with open(file_name, "wb") as output:
+            pisa.CreatePDF(src=StringIO(source), dest=output, encoding="UTF-8")
+        with open(file_name, "rb") as f:
+            b64 = base64.b64encode(f.read())
+            payload = b64.decode()
+            html = f"""<a download="{file_name}" href="data:application/pdf;base64,{payload}" target="_blank">
+            <button>{title}</button></a>"""
+            if display_handle is not None:
+                display_handle.update(HTML(html))
+            else:
+                return display(HTML(html), display_id=display_id)
+    except Exception:
+        pass
-def show_request_quote_button():
+def show_request_quote_button(is_registered: bool):
     if not ipython_available():
-        print("https://upgini.com/request-a-quote")
+        if is_registered:
+            print("https://upgini.com/request-a-quote")
+        else:
+            print("https://profile.upgini.com/login")
     else:
-        import ipywidgets as widgets
-        from IPython.display import Javascript, display
-        button = widgets.Button(description="Request a quote", button_style="danger")
+        from IPython.display import HTML, display, Javascript
+        from ipywidgets import Layout, Button
+        if is_registered:
+            display(HTML("""
+                <style>
+                    button.custom-button {
+                        border: 1px solid black !important;
+                        background: white !important;
+                        color: black !important;
+                        white-space: nowrap;
+                    }
+                </style>
+            """))
+            description = "Request a quote"
+            tooltip = "Ask a quote"
+            url = "https://upgini.com/request-a-quote"
+        else:
+            display(HTML("""
+                <style>
+                    button.custom-button {
+                        border: 1px solid #d00 !important;
+                        background: #fff !important;
+                        color: #d00 !important;
+                        white-space: nowrap;
+                    }
+                </style>
+            """))
+            description = "Get an API KEY"
+            tooltip = "Register"
+            url = "https://profile.upgini.com/login"
+        button = Button(
+            description=description,
+            layout=Layout(width='auto'),
+            tooltip=tooltip
+        )
+        button.add_class("custom-button")
         def on_button_clicked(b):
-            display(Javascript('window.open("https://upgini.com/request-a-quote");'))
+            display(Javascript('window.open("' + url + '");'))
         button.on_click(on_button_clicked)

upgini/utils/feature_info.py CHANGED Viewed

@@ -31,7 +31,10 @@ class FeatureInfo:
     @staticmethod
     def from_metadata(
-        feature_meta: FeaturesMetadataV2, data: Optional[pd.DataFrame], is_client_feature: bool
+        feature_meta: FeaturesMetadataV2,
+        data: Optional[pd.DataFrame],
+        is_client_feature: bool,
+        is_generated_feature: bool,
     ) -> "FeatureInfo":
         return FeatureInfo(
             name=_get_name(feature_meta),
@@ -41,8 +44,8 @@ class FeatureInfo:
             value_preview=_get_feature_sample(feature_meta, data),
             provider=_get_provider(feature_meta, is_client_feature),
             internal_provider=_get_internal_provider(feature_meta, is_client_feature),
-            source=_get_source(feature_meta, is_client_feature),
-            internal_source=_get_internal_source(feature_meta, is_client_feature),
+            source=_get_source(feature_meta, is_client_feature, is_generated_feature),
+            internal_source=_get_internal_source(feature_meta, is_client_feature, is_generated_feature),
             update_frequency=feature_meta.update_frequency,
             commercial_schema=feature_meta.commercial_schema,
             doc_link=feature_meta.doc_link,
@@ -139,22 +142,30 @@ def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature:
         return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
-def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
+def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool, is_generated_feature: bool) -> str:
+    if is_generated_feature:
+        return "AutoFE: features from Training dataset"
     sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
     source_links = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
     if sources:
         source = _make_links(sources, source_links)
     else:
-        source = _get_internal_source(feature_meta, is_client_feature)
+        source = _get_internal_source(feature_meta, is_client_feature, is_generated_feature)
     return source
-def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
+def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool, is_generated_feature: bool) -> str:
+    if is_generated_feature:
+        return "AutoFE: features from Training dataset"
     sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
     if sources:
         return ", ".join(sources)
+    elif feature_meta.data_source:
+        return feature_meta.data_source
     else:
-        return feature_meta.data_source or (
+        return (
             LLM_SOURCE
             if not feature_meta.name.endswith("_country")
             and not feature_meta.name.endswith("_postal_code")

upgini 1.2.124__py3-none-any.whl → 1.2.146a4__py3-none-any.whl

Potentially problematic release.

upgini 1.2.124__py3-none-any.whl → 1.2.146a4__py3-none-any.whl