py2ls 0.2.4.24__py3-none-any.whl → 0.2.4.25__py3-none-any.whl
- py2ls/.git/index +0 -0
- py2ls/ec2ls.py +61 -0
- py2ls/ips.py +105 -55
- py2ls/ml2ls.py +244 -110
- py2ls/nl2ls.py +283 -0
- py2ls/plot.py +351 -40
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.25.dist-info}/METADATA +1 -1
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.25.dist-info}/RECORD +9 -8
- py2ls/ml2ls copy.py +0 -2906
- {py2ls-0.2.4.24.dist-info → py2ls-0.2.4.25.dist-info}/WHEEL +0 -0
py2ls/.git/index
CHANGED
Binary file
py2ls/ec2ls.py
ADDED
@@ -0,0 +1,61 @@
+def get_trend(
+    keywords: list = None,  # ["AI", "Python", "Data Science"]
+    timezone: str = "Europe/Berlin",  # minutes differ from UTC
+    cat=0,
+    timeframe="today 12-m",
+    geo="DE",
+    gprop="",
+    **kwargs
+):
+    from pytrends.request import TrendReq
+    from pytrends.exceptions import TooManyRequestsError
+    import pytz
+    from datetime import datetime
+    import time
+    import requests
+    from urllib3.util.retry import Retry
+
+    if isinstance(timezone, str):
+        stadt = pytz.timezone(timezone)
+        current_time = datetime.now(stadt)  # This will be timezone-aware
+        # Convert the timezone-aware datetime to naive UTC datetime
+        naive_time = current_time.astimezone(pytz.utc).replace(tzinfo=None)
+        tz_offset = stadt.utcoffset(naive_time).seconds // 60  # in minutes
+    elif isinstance(timezone, int):
+        tz_offset = timezone
+
+    # Initialize TrendReq with correct timezone offset
+    pytrends = TrendReq(hl="en-US", tz=tz_offset)
+
+    # Ensure that keywords are in list form
+    if isinstance(keywords, str):
+        keywords = [keywords]
+
+    pytrends.build_payload(keywords, cat=cat, timeframe=timeframe, geo=geo, gprop=gprop)
+
+    res = {}
+    # Try fetching data with error handling
+    for func_name, fetch_func in [
+        ("interest_over_time", pytrends.interest_over_time),
+        ("related_topics", pytrends.related_topics),
+        ("related_queries", pytrends.related_queries),
+        ("categories", pytrends.categories)
+    ]:
+        try:
+            print(f"Fetching {func_name}...")
+            res[func_name] = fetch_func()
+            print(f"done: {func_name}")
+        except TooManyRequestsError:
+            print(f"Too many requests error for {func_name}. Retrying...")
+            time.sleep(5)  # Delay to avoid spamming the server
+            if retries > 0:
+                return get_trend(keywords, timezone, cat, timeframe, geo, gprop, retries=retries-1)
+            res[func_name] = None
+        except requests.exceptions.RequestException as e:
+            print(f"Request error for {func_name}: {e}")
+            res[func_name] = None
+        except Exception as e:
+            print(f"Error fetching {func_name}: {e}")
+            res[func_name] = None
+
+    return res
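Note on the added get_trend: the TooManyRequestsError branch checks a retries variable that is never defined as a parameter, so the first rate-limit hit would raise a NameError. A minimal sketch of a self-contained retry helper under that assumption (fetch_with_retry, retries, and delay are illustrative names, not part of the package):

    import time
    from pytrends.exceptions import TooManyRequestsError

    def fetch_with_retry(fetch_func, retries: int = 3, delay: int = 5):
        # Call one pytrends fetcher (e.g. pytrends.interest_over_time),
        # retrying a bounded number of times when the API rate-limits us.
        for attempt in range(retries + 1):
            try:
                return fetch_func()
            except TooManyRequestsError:
                if attempt == retries:
                    return None  # give up after the last attempt
                time.sleep(delay * (attempt + 1))  # simple linear back-off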
py2ls/ips.py
CHANGED
@@ -18,15 +18,17 @@ warnings.simplefilter("ignore", category=pd.errors.SettingWithCopyWarning)
 warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
 
 
-def run_once_within(duration=60):  # default 60s
+def run_once_within(duration=60, reverse=False):  # default 60s
     import time
 
     """
+    If reverse is True, do not run on the first call; only run from the second call onward.
    usage:
    if run_once_within():
        print("This code runs once per minute.")
    else:
        print("The code has already been run in the last minute.")
+
    """
    if not hasattr(run_once_within, "time_last"):
        run_once_within.time_last = None
@@ -36,9 +38,9 @@ def run_once_within(duration=60):  # default 60s
        time_curr - run_once_within.time_last >= duration
    ):
        run_once_within.time_last = time_curr  # Update the last execution time
-        return True
+        return False if reverse else True
    else:
-        return False
+        return True if reverse else False
 
 
 def plt_font(dir_font: str = "/System/Library/Fonts/Hiragino Sans GB.ttc"):
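Usage sketch for the new reverse flag (behaviour read directly from the diff above):

    # reverse=False (default): True on the first call in a window, False on repeats.
    # reverse=True: False on the first call in a window, True on repeats.
    if run_once_within(duration=60, reverse=True):
        print("already ran within the last minute")
    else:
        print("first call in this window")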
@@ -3497,12 +3499,8 @@ def figsave(*args, dpi=300):
         )
     else:
         plt.savefig(
-            fname, format=ftype.lower(), dpi=dpi, bbox_inches="tight", pad_inches=0
-        )
-        # elif ftype.lower() == "png":
-        #     plt.savefig(fname, format="png", dpi=dpi, bbox_inches="tight", transparent=True,pad_inches=0)
-        # elif ftype.lower() in ["tiff", "tif"]:
-        #     plt.savefig(fname, format="tiff", dpi=dpi, bbox_inches="tight",pad_inches=0)
+            fname, format=ftype.lower(), dpi=dpi, bbox_inches="tight", transparent=True,pad_inches=0
+        )
     elif ftype.lower() == "emf":
         plt.savefig(fname, format="emf", dpi=dpi, bbox_inches="tight", pad_inches=0)
     elif ftype.lower() == "fig":
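The fallback branch above now saves every remaining format with a transparent background. A minimal matplotlib illustration of the same call (figsave itself is the py2ls wrapper; the savefig keywords are standard matplotlib):

    import matplotlib.pyplot as plt

    fig, ax = plt.subplots()
    ax.plot([0, 1], [0, 1])
    # Mirrors the new fallback: tight bounding box, no padding, transparent background.
    plt.savefig("example.pdf", format="pdf", dpi=300,
                bbox_inches="tight", transparent=True, pad_inches=0)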
@@ -5230,16 +5228,16 @@ def df_extend(data: pd.DataFrame, column, axis=0, sep=None, prefix="col"):
     data = data.explode(column, ignore_index=True)
     return data
 
-def
+def df_cycle(data: pd.DataFrame, columns=None, max_val=None, inplace=False):
     """
     Purpose: transforms a datetime feature (like month or day) into a cyclic encoding for use in machine learning models, particularly neural networks.
     Usage:
         data = pd.DataFrame({'month': [1, 4, 7, 10, 12]})  # Just months as an example
-        #
-        data =
+        # df_cycle month cyclically
+        data = df_cycle(data, 'month', 12)
     """
     if columns is None:
-        columns = list(data.columns)  # If no columns specified, use all columns
+        columns = list(data.select_dtypes(include=np.number).columns)  # If no columns specified, use all columns
     if max_val is None:
         max_val = np.max(data[columns])  # If no max_val specified, use the maximum value across all columns
     if isinstance(columns, str):
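The body of df_cycle is not shown in this hunk; the cyclic encoding its docstring describes is usually a sine/cosine transform so that, e.g., month 12 ends up next to month 1. A standalone sketch of that transform (the month_sin/month_cos column names are illustrative, not necessarily what df_cycle emits):

    import numpy as np
    import pandas as pd

    def cyclic_encode(data: pd.DataFrame, column: str, max_val: int) -> pd.DataFrame:
        # Map a bounded feature (e.g. month in 1..12) onto the unit circle so a
        # model treats December (12) and January (1) as neighbours.
        out = data.copy()
        out[f"{column}_sin"] = np.sin(2 * np.pi * out[column] / max_val)
        out[f"{column}_cos"] = np.cos(2 * np.pi * out[column] / max_val)
        return out

    data = pd.DataFrame({"month": [1, 4, 7, 10, 12]})
    print(cyclic_encode(data, "month", 12))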
@@ -5424,7 +5422,7 @@ def df_astype(
             # print(f"Successfully converted '{column}' to timedelta.")
         elif astype == "circular":
             max_val = kwargs.get('max_val',None)
-            data[column]=
+            data[column]=df_cycle(data=data,columns=column,max_val=max_val)
         else:
             # Convert to other types (e.g., float, int)
             if astype=='int':
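Hedged usage of the new "circular" branch (the full df_astype signature is not shown in this diff, so the keyword names below are assumptions):

    import pandas as pd

    data = pd.DataFrame({"month": [1, 4, 7, 10, 12]})
    # Assumed call pattern: encode 'month' circularly with period 12 via df_cycle.
    data = df_astype(data, column="month", astype="circular", max_val=12)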
@@ -5910,11 +5908,13 @@ def df_encoder(
 
 def df_scaler(
     data: pd.DataFrame,  # should be numeric dtype
+    scaler=None,
     method="standard",
     columns=None,  # default, select all numeric col/row
     inplace=False,
     verbose=False,  # show usage
     axis=0,  # defalut column-wise
+    return_scaler:bool=False,# True: return both: return df, scaler
     **kwargs,
 ):
     """
@@ -5932,31 +5932,49 @@ def df_scaler(
     """
     if verbose:
         print('df_scaler(data, scaler="standard", inplace=False, axis=0, verbose=True)')
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if scaler is None:
+        methods = ["standard", "minmax", "robust","maxabs"]
+        method = strcmp(method, methods)[0]
+        if method == "standard":
+            from sklearn.preprocessing import StandardScaler
+            if verbose:
+                print("performs z-score normalization: This will standardize each feature to have a mean of 0 and a standard deviation of 1.")
+                print("Use when the data is approximately normally distributed (Gaussian).\nWorks well with algorithms sensitive to feature distribution, such as SVMs, linear regression, logistic regression, and neural networks.")
+            scaler = StandardScaler(**kwargs)
+        elif method == "minmax":
+            from sklearn.preprocessing import MinMaxScaler
+            if verbose:
+                print("don't forget to define the range: e.g., 'feature_range=(0, 1)'. ")
+                print("scales the features to the range [0, 1]. Adjust feature_range if you want a different range, like [-1, 1].")
+                print("Use when the data does not follow a normal distribution and you need all features in a specific range (e.g., [0, 1]).\nIdeal for algorithms that do not assume a particular distribution, such as k-nearest neighbors and neural networks.")
+            scaler = MinMaxScaler(**kwargs)
+        elif method == "robust":
+            from sklearn.preprocessing import RobustScaler
+            if verbose:
+                print("scales the data based on the median and interquartile range, which is robust to outliers.")
+                print("Use when the dataset contains outliers.\nThis method is useful because it scales based on the median and the interquartile range (IQR), which are more robust to outliers than the mean and standard deviation.")
+            scaler = RobustScaler(**kwargs)
+        elif method=="maxabs":
+            from sklearn.preprocessing import MaxAbsScaler
+            if verbose:
+                print("This scales each feature by its maximum absolute value, resulting in values within the range [-1, 1] for each feature.")
+                print("Use for data that is already sparse or when features have positive or negative values that need scaling without shifting the data.\nOften used with sparse data (data with many zeros), where preserving zero entries is essential, such as in text data or recommendation systems.")
+            scaler = MaxAbsScaler(**kwargs)
+    if axis not in [0, 1]:
+        raise ValueError("Axis must be 0 (column-wise) or 1 (row-wise).")
+    if verbose:
+        print(scaler)
     if axis == 0:
         # Column-wise scaling (default)
         if columns is None:
             columns = data.select_dtypes(include=np.number).columns.tolist()
         non_numeric_columns = data.columns.difference(columns)
 
-        scaled_data = scaler.fit_transform(data[columns])
+        # scaled_data = scaler.fit_transform(data[columns])
+        if scaler is None or not hasattr(scaler, 'mean_'):
+            scaled_data = scaler.fit_transform(data[columns])
+        else:
+            scaled_data = scaler.transform(data[columns])
 
         if inplace:
             data[columns] = scaled_data
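strcmp here is py2ls's own fuzzy matcher; judging from its use, strcmp(method, methods)[0] returns the closest entry of methods. A rough standard-library stand-in under that assumption (fuzzy_pick is a hypothetical name):

    import difflib

    def fuzzy_pick(query: str, options: list) -> str:
        # Return the option closest to the query, falling back to the first option.
        matches = difflib.get_close_matches(query.lower(), [o.lower() for o in options],
                                            n=1, cutoff=0.0)
        return matches[0] if matches else options[0]

    print(fuzzy_pick("standar", ["standard", "minmax", "robust", "maxabs"]))  # -> "standard"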
@@ -5970,7 +5988,10 @@ def df_scaler(
             axis=1,
         )
         scaled_df = scaled_df[data.columns]  # Maintain column order
-
+        if return_scaler:
+            return scaled_df,scaler
+        else:
+            return scaled_df
 
     elif axis == 1:
         # Row-wise scaling
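Usage sketch for the new scaler/return_scaler pair: fit once on training data, then pass the fitted scaler back in so the same transform is applied to new data (column names are illustrative):

    import pandas as pd

    train = pd.DataFrame({"height": [160, 170, 180], "weight": [55, 70, 85]})
    test = pd.DataFrame({"height": [165, 175], "weight": [60, 80]})

    # Fit on the training frame and keep the fitted scaler.
    train_scaled, fitted = df_scaler(train, method="standard", return_scaler=True)

    # Re-use the fitted scaler on new data instead of re-fitting it.
    test_scaled = df_scaler(test, scaler=fitted)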
@@ -5982,9 +6003,10 @@ def df_scaler(
 
             print(f"Scaling rows")
 
-        scaled_data = scaler.fit_transform(
-            numeric_rows.T
-        ).T  # Transpose for scaling and then back
+        # scaled_data = scaler.fit_transform(
+        #     numeric_rows.T
+        # ).T # Transpose for scaling and then back
+        scaled_data = scaler.fit_transform(numeric_rows.T).T if scaler is None or not hasattr(scaler, 'mean_') else scaler.transform(numeric_rows.T).T
 
         if inplace:
             data.loc[numeric_rows.index] = scaled_data
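One caveat with the fitted-state check above: hasattr(scaler, 'mean_') only recognises a fitted StandardScaler; fitted MinMaxScaler, RobustScaler, or MaxAbsScaler instances expose scale_ (not mean_), so a pre-fitted one would be silently re-fitted. A scaler-agnostic alternative using scikit-learn's own utility, offered as a suggestion rather than what the package does:

    from sklearn.exceptions import NotFittedError
    from sklearn.utils.validation import check_is_fitted

    def is_fitted(scaler) -> bool:
        # True if the estimator has been fitted, whatever scaler class it is.
        try:
            check_is_fitted(scaler)
            return True
        except NotFittedError:
            return False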
@@ -5992,7 +6014,10 @@ def df_scaler(
         else:
             scaled_df = data.copy()
             scaled_df.loc[numeric_rows.index] = scaled_data
-
+            if return_scaler:
+                return scaled_df,scaler
+            else:
+                return scaled_df
 
 
 def df_special_characters_cleaner(
@@ -7590,21 +7615,18 @@ def df_qc(
         for warning in res_qc["warnings"]:
             print(" -", warning)
     if plot_:
-        df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue)
-
-        try:
-            figsave(dir_save)
-        except Exception as e:
-            print(f"⚠️: {e}")
-    if output:
+        df_qc_plots(data=data, res_qc=res_qc, max_cols=max_cols,hue=hue,dir_save=dir_save)
+    if output or not plot_:
         return res_qc
     return None
 
 
-def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,hue=None):
+def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,hue=None,dir_save=None):
     import matplotlib.pyplot as plt
     import seaborn as sns
     from .plot import subplot, figsets, get_color
+    from datetime import datetime
+    now_ = datetime.now().strftime("%y%m%d_%H%M%S")
 
     if columns is not None:
         if isinstance(columns, (list,pd.core.indexes.base.Index)):
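Hedged usage of the revised df_qc flow (only the parameters visible in this hunk are used; the rest of the signature is not shown in the diff):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, None, 4], "b": ["x", "y", "y", None]})
    # With plot_=True the QC plots are drawn and written under dir_save;
    # the result dict comes back when output=True (or when plotting is off).
    res = df_qc(df, plot_=True, output=True, dir_save="./qc_reports/")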
@@ -7638,15 +7660,7 @@ def df_qc_plots(data: pd.DataFrame, columns=None,res_qc: dict=None, max_cols=20,
         ax=nexttile(),
     )
     figsets(ax=ax_outlier_num,title="Outliers (#)", xlabel="#",ylabel=None,fontsize=8 if len(outlier_num)<=20 else 6)
-
-    #!
-    try:
-        if data.select_dtypes(include=np.number).shape[1]<=10:
-            for col in data.select_dtypes(include=np.number).columns:
-                sns.histplot(data[col], kde=True, bins=30, ax=nexttile())
-                figsets(title=f"Distribution: {col}", xlabel=col, ylabel="Frequency")
-    except:
-        pass
+
     #!
     try:
         for col in data.select_dtypes(include='category').columns:
|
|
7857
7871
|
title="Correlation Heatmap",
|
7858
7872
|
ax=ax_heatmap
|
7859
7873
|
)
|
7874
|
+
# save figure
|
7875
|
+
if dir_save:
|
7876
|
+
figsave(dir_save,f"qc_plot_{now_}.pdf")
|
7860
7877
|
|
7878
|
+
if columns is not None:
|
7879
|
+
if isinstance(columns, (list,pd.core.indexes.base.Index)):
|
7880
|
+
data=data[columns]
|
7881
|
+
len_total = len(res_qc)
|
7882
|
+
n_row, n_col = int((len_total + 10) / 3), 3
|
7883
|
+
nexttile = subplot(n_row, n_col, figsize=[5 * n_col, 5 * n_row],verbose=False)
|
7884
|
+
#! check distribution
|
7885
|
+
data_num = data.select_dtypes(include=np.number)
|
7886
|
+
if len(data_num) > max_cols:
|
7887
|
+
data_num = data_num.iloc[:,:max_cols]
|
7888
|
+
|
7889
|
+
data_num = df_scaler(data=data_num, method='standard')
|
7890
|
+
|
7891
|
+
import scipy.stats as stats
|
7892
|
+
for column in data_num.columns:
|
7893
|
+
#* Shapiro-Wilk test for normality
|
7894
|
+
stat, p_value = stats.shapiro(data_num[column])
|
7895
|
+
normality = "norm" if p_value > 0.05 else "not_norm"
|
7896
|
+
#* Plot histogram
|
7897
|
+
ax_hist=sns.histplot(data_num[column], kde=True, ax=nexttile())
|
7898
|
+
x_min, x_max = ax_hist.get_xlim()
|
7899
|
+
y_min, y_max = ax_hist.get_ylim()
|
7900
|
+
ax_hist.text(x_min+(x_max-x_min)*0.5, y_min+(y_max-y_min)*0.75,
|
7901
|
+
f'p(Shapiro-Wilk)={p_value:.3f}\n{normality}',
|
7902
|
+
ha='center', va='top')
|
7903
|
+
figsets(title=column,ax=ax_hist)
|
7904
|
+
ax_twin=ax_hist.twinx()
|
7905
|
+
#* Q-Q plot
|
7906
|
+
stats.probplot(data_num[column], dist="norm", plot=ax_twin)
|
7907
|
+
figsets(ylabel=f'Q-Q Plot:{column}',title=None)
|
7908
|
+
# save figure
|
7909
|
+
if dir_save:
|
7910
|
+
figsave(dir_save,f"qq_plot_{now_}.pdf")
|
7861
7911
|
def use_pd(
|
7862
7912
|
func_name="excel",
|
7863
7913
|
verbose=True,
|
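The new distribution check combines a per-column Shapiro-Wilk test with a histogram and Q-Q plot. A self-contained sketch of the same idea using only matplotlib and scipy (no py2ls helpers), with toy data:

    import matplotlib.pyplot as plt
    import numpy as np
    import scipy.stats as stats

    rng = np.random.default_rng(0)
    values = rng.normal(loc=0.0, scale=1.0, size=200)  # stand-in for one numeric column

    # Shapiro-Wilk: p > 0.05 means no evidence against normality.
    stat, p_value = stats.shapiro(values)

    fig, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(8, 3))
    ax_hist.hist(values, bins=30)
    ax_hist.set_title(f"p(Shapiro-Wilk)={p_value:.3f}")
    stats.probplot(values, dist="norm", plot=ax_qq)  # Q-Q plot against the normal
    plt.tight_layout()
    plt.show()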