PyPI - tdfs4ds - Versions diffs - 0.2.4.17__py3-none-any.whl → 0.2.4.19__py3-none-any.whl - Mend

tdfs4ds 0.2.4.17__py3-none-any.whl → 0.2.4.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

tdfs4ds/__init__.py CHANGED Viewed

@@ -1,4 +1,4 @@
-__version__ = '0.2.4.17'
+__version__ = '0.2.4.19'
 import logging
 # Setup the logger
 logging.basicConfig(

tdfs4ds/utils/lineage.py CHANGED Viewed

@@ -4,6 +4,7 @@ import teradataml as tdml
 import tdfs4ds
 import tqdm
 import networkx as nx
+import sqlparse
 def query_change_case(query, case):
@@ -630,3 +631,165 @@ def get_ddl(view_name, schema_name, object_type='view'):
     # Replace carriage returns with newlines for consistent formatting
     return ddl.replace('\r', '\n')
+from datetime import datetime
+import sqlparse
+import re
+import os
+def generate_process_report(format="html", output_file=None, collapsible=False, sort_by="view_name"):
+    """
+    Generate a process catalog report with sidebar index grouped by database,
+    with sub-items for Entity, Features, and DDL.
+    """
+    processes = tdfs4ds.process_catalog()
+    processes = processes[processes.DATA_DOMAIN == tdfs4ds.DATA_DOMAIN].to_pandas()
+    processes['VIEW'] = processes['VIEW_NAME'].apply(lambda x: x.split('.')[1].replace('"', ""))
+    def split_view_name(full_name):
+        db, vw = full_name.replace('"', '').split('.')
+        return db, vw
+    processes["DB"], processes["VW"] = zip(*processes["VIEW_NAME"].map(split_view_name))
+    if sort_by:
+        if sort_by == "database":
+            processes = processes.sort_values(["DB", "VW"])
+        elif sort_by == "view_name":
+            processes = processes.sort_values(["VW"])
+        elif sort_by == "database,view_name":
+            processes = processes.sort_values(["DB", "VW"])
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    readable_ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    data_domain = tdfs4ds.DATA_DOMAIN
+    if output_file is None:
+        output_file = f"report_{data_domain}_{timestamp}.{ 'html' if format=='html' else format }"
+    report_parts = []
+    if format == "html":
+        report_parts.append(f"""
+        <html>
+        <head>
+            <title>Process Report - {data_domain}</title>
+            <style>
+                body {{
+                    margin: 0;
+                    font-family: Arial, sans-serif;
+                }}
+                .sidebar {{
+                    position: fixed;
+                    top: 0;
+                    left: 0;
+                    width: 280px;
+                    height: 100%;
+                    background: #f4f4f4;
+                    border-right: 1px solid #ccc;
+                    padding: 20px;
+                    overflow-y: auto;
+                }}
+                .sidebar h2 {{
+                    font-size: 18px;
+                    margin-top: 0;
+                }}
+                .sidebar ul {{
+                    list-style: none;
+                    padding-left: 15px;
+                }}
+                .sidebar li {{
+                    margin-bottom: 4px;
+                }}
+                .sidebar a {{
+                    text-decoration: none;
+                    color: #007BFF;
+                    font-size: 14px;
+                }}
+                .sidebar a:hover {{
+                    text-decoration: underline;
+                }}
+                .content {{
+                    margin-left: 300px;
+                    padding: 20px;
+                }}
+                pre {{
+                    background: #f8f8f8;
+                    border: 1px solid #ddd;
+                    padding: 10px;
+                    overflow-x: auto;
+                }}
+            </style>
+        </head>
+        <body>
+            <div class="sidebar">
+                <h2>Teradata Feature Store</h2>
+        """)
+        # Build structured index: group by database, with sub-items
+        grouped = processes.groupby("DB")
+        for db, group in grouped:
+            report_parts.append(f"<h3>DB: {db}</h3><ul>")
+            for _, row in group.iterrows():
+                view_name = row['VIEW_NAME']
+                vw = row["VW"]
+                safe_id = re.sub(r'[^A-Za-z0-9_]+', '_', view_name)
+                # Main view link
+                report_parts.append(f'<li><a href="#{safe_id}">{vw}</a>')
+                # Sub-links: entity, features, ddl
+                report_parts.append("<ul>")
+                report_parts.append(f'<li><a href="#{safe_id}_entity">Entity</a></li>')
+                report_parts.append(f'<li><a href="#{safe_id}_features">Features</a></li>')
+                report_parts.append(f'<li><a href="#{safe_id}_ddl">DDL</a></li>')
+                report_parts.append("</ul></li>")
+            report_parts.append("</ul>")
+        report_parts.append("""
+            </div>
+            <div class="content">
+        """)
+        # Header
+        report_parts.append(f"<h1>Process Catalog Report - {data_domain}</h1>")
+        report_parts.append(f"<p><em>Generated on {readable_ts}</em></p>")
+    for _, row in processes.iterrows():
+        view_name = row['VIEW_NAME']
+        db, vw = row["DB"], row["VW"]
+        entity_list = row['ENTITY_ID'].split(',')
+        features_list = row['FEATURE_NAMES'].split(',')
+        ddl_raw = tdml.execute_sql(f"SHOW VIEW {view_name}").fetchall()[0][0]
+        ddl = sqlparse.format(ddl_raw, reindent=True, keyword_case="upper")
+        safe_id = re.sub(r'[^A-Za-z0-9_]+', '_', view_name)
+        if format == "html":
+            section = [f'<h2 id="{safe_id}">{db}.{vw} ({data_domain})</h2>']
+            section.append(f'<h3 id="{safe_id}_entity">Entity</h3><ul>')
+            section.extend([f"<li>{t}</li>" for t in entity_list])
+            section.append("</ul>")
+            section.append(f'<h3 id="{safe_id}_features">Features ({len(features_list)} total)</h3><ul>')
+            section.extend([f"<li>{t}</li>" for t in features_list])
+            section.append("</ul>")
+            section.append(f'<h3 id="{safe_id}_ddl">DDL</h3>')
+            if collapsible:
+                section.append("<details><summary>Show/Hide DDL</summary><pre><code>")
+                section.append(ddl)
+                section.append("</code></pre></details>")
+            else:
+                section.append(f"<pre><code>{ddl}</code></pre>")
+            report_parts.append("\n".join(section))
+    if format == "html":
+        report_parts.append("</div></body></html>")
+    with open(output_file, "w", encoding="utf-8") as f:
+        f.write("\n".join(report_parts))
+    print(f"✅ Report generated: {os.path.abspath(output_file)}")

tdfs4ds/utils/time_management.py CHANGED Viewed

@@ -45,47 +45,89 @@ class TimeManager:
     def load_time_steps(self, df, time_column):
         """
-        Loads a new filter into the table and updates the view to reflect this filter.
+        Load time steps into the table and update the view accordingly.
-        This method takes a DataFrame as input, assigns filter IDs to each row, and updates or replaces the table and view to reflect the new filter configuration.
+        This method:
+        1. Creates a new DataFrame with a sequential time_id and BUSINESS_DATE.
+        2. Ensures BUSINESS_DATE has the correct SQL data type.
+        3. Drops and recreates the target table with the appropriate schema.
+        4. Inserts the new data into the table.
+        5. Updates the view to reference the first time step.
+        6. Stores the number of time steps in `self.nb_time_steps`.
         Args:
-            df (DataFrame): The data containing the new filter configuration.
+            df (pd.DataFrame): The input DataFrame containing time data.
+            time_column (str): The column name representing time.
         """
-        df_            = df.assign(**{
-            'time_id': tdml.sqlalchemy.literal_column(
-            f"ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY {time_column})", tdml.BIGINT()),
-            'BUSINESS_DATE' : df[time_column]
-        })[['time_id','BUSINESS_DATE']]
-        type_BUSINESS_DATE = tdfs4ds.utils.info.get_feature_types_sql_format(df_)['BUSINESS_DATE']
-        if 'TIMESTAMP' in type_BUSINESS_DATE.upper() and 'ZONE' not in type_BUSINESS_DATE.upper():
-            print(f"data type of the time colum has been modified from {type_BUSINESS_DATE} to {type_BUSINESS_DATE + ' WITH TIME ZONE'}")
-            type_BUSINESS_DATE = type_BUSINESS_DATE + ' WITH TIME ZONE'
-            df_ = df_.assign(type_BUSINESS_DATE = tdml.sqlalchemy.literal_column(f"CAST(BUSINESS_DATE AS {type_BUSINESS_DATE})"))
-        d_ = {x[0]: x[1] for x in df_._td_column_names_and_types}
-        self.data_type = type_BUSINESS_DATE #d_['BUSINESS_DATE']
+        # Step 1: Build DataFrame with time_id and BUSINESS_DATE
+        df_ = df.assign(
+            time_id=tdml.sqlalchemy.literal_column(
+                f"ROW_NUMBER() OVER (PARTITION BY 1 ORDER BY {time_column})",
+                tdml.BIGINT()
+            ),
+            BUSINESS_DATE=df[time_column]
+        )[["time_id", "BUSINESS_DATE"]]
+        # Step 2: Get SQL types and adjust BUSINESS_DATE if necessary
+        sql_types = tdfs4ds.utils.info.get_feature_types_sql_format(df_)
+        type_business_date = sql_types["BUSINESS_DATE"]
+        if "TIMESTAMP" in type_business_date.upper() and "ZONE" not in type_business_date.upper():
+            new_type = f"{type_business_date} WITH TIME ZONE"
+            print(
+                f"Data type of the time column modified from {type_business_date} "
+                f"to {new_type}"
+            )
+            type_business_date = new_type
+            sql_types["BUSINESS_DATE"] = new_type
+            df_ = df_.assign(
+                BUSINESS_DATE=tdml.sqlalchemy.literal_column(
+                    f"CAST(BUSINESS_DATE AS {new_type})"
+                )
+            )
+        self.data_type = type_business_date
+        # Step 3: Drop table if it exists
+        try:
+            tdml.execute_sql(f"DROP TABLE {self.schema_name}.{self.table_name}")
+        except Exception as e:
+            if tdfs4ds.DEBUG_MODE:
+                print(f"Error dropping table {self.schema_name}.{self.table_name}: {e}")
+        # Step 4: Recreate table
+        ddl = ",\n".join([f"{col} {dtype}" for col, dtype in sql_types.items()])
+        create_table_sql = f"""
+            CREATE TABLE {self.schema_name}.{self.table_name} (
+                {ddl}
+            )
+            PRIMARY INDEX (time_id)
+        """
+        tdml.execute_sql(create_table_sql)
-        df_.to_sql(
-            table_name    = self.table_name,
-            schema_name   = self.schema_name,
-            if_exists     = 'replace',
-            primary_index = ['time_id'],
+        # Step 5: Insert data
+        df_[list(sql_types.keys())].to_sql(
+            table_name=self.table_name,
+            schema_name=self.schema_name,
+            if_exists="append"
         )
-        query = f"""
-         REPLACE VIEW {self.schema_name}.{self.view_name} AS
-         SEL BUSINESS_DATE
-         FROM {self.schema_name}.{self.table_name}
-         WHERE time_id = 1
-         """
-        tdml.execute_sql(query)
+        # Step 6: Update view
+        create_view_sql = f"""
+            REPLACE VIEW {self.schema_name}.{self.view_name} AS
+            SELECT BUSINESS_DATE
+            FROM {self.schema_name}.{self.table_name}
+            WHERE time_id = 1
+        """
+        tdml.execute_sql(create_view_sql)
-        self.nb_time_steps = tdml.execute_sql(
-            f"SEL MAX(time_id) AS nb_filters FROM {self.schema_name}.{self.table_name}").fetchall()[0][0]
+        # Step 7: Store number of time steps
+        result = tdml.execute_sql(
+            f"SELECT MAX(time_id) AS nb_filters FROM {self.schema_name}.{self.table_name}"
+        ).fetchall()
+        self.nb_time_steps = result[0][0]
     def _exists(self):

{tdfs4ds-0.2.4.17.dist-info → tdfs4ds-0.2.4.19.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: tdfs4ds
-Version: 0.2.4.17
+Version: 0.2.4.19
 Summary: A python package to simplify the usage of feature store using Teradata Vantage ...
 Author: Denis Molin
 Requires-Python: >=3.6

{tdfs4ds-0.2.4.17.dist-info → tdfs4ds-0.2.4.19.dist-info}/RECORD RENAMED Viewed

@@ -2,7 +2,7 @@ tdfs/__init__.py,sha256=7AcO7uB1opRCt7t2JOHworKimfAaDeO3boRW7u9Geo8,23
 tdfs/datasets.py,sha256=-b2MPEKGki2V1M8iUcoDR9uc2krIK7u1CK-EhChvihs,985
 tdfs/feature_store.py,sha256=Honu7eOAXxP4Ivz0mRlhuNkfTDzgZl5HB1WlQUwzcZ0,31354
 tdfs/data/curves.csv,sha256=q0Tm-0yu7VMK4lHvHpgi1LMeRq0lO5gJy2Q17brKbEM,112488
-tdfs4ds/__init__.py,sha256=CU5AFwETPm__QJJhmyxIE35XgkKB8rNmJaSrA-0GgFk,64168
+tdfs4ds/__init__.py,sha256=_H_VK1ezxvUSLKuIq9WKdNwZuu_iWjNcBJ9IwXYkpDo,64168
 tdfs4ds/datasets.py,sha256=LE4Gn0muwdyrIrCrbkE92cnafUML63z1lj5bFIIVzmc,3524
 tdfs4ds/feature_engineering.py,sha256=oVnZ2V_XNGE12LKC_fNfkrWSQZLgtYRmaf8Dispi6S4,7081
 tdfs4ds/feature_store.py,sha256=y-oItPZw6nBkBcGAceaATZbkLPTsvpk0OnpzTxYofDs,68576
@@ -25,11 +25,11 @@ tdfs4ds/process_store/process_store_catalog_management.py,sha256=H135RRTYn-pyWIq
 tdfs4ds/utils/__init__.py,sha256=-yTMfDLZbQnIRQ64s_bczzT21tDW2A8FZeq9PX5SgFU,168
 tdfs4ds/utils/filter_management.py,sha256=7D47N_hnTSUVOkaV2XuKrlUFMxzWjDsCBvRYsH4lXdU,11073
 tdfs4ds/utils/info.py,sha256=sShnUxXMlvCtQ6xtShDhqdpTr6sMG0dZQhNBFgUENDY,12058
-tdfs4ds/utils/lineage.py,sha256=LI-5pG7D8lO3-YFa9qA6CrEackiYugV23_Vz9IpF5xw,28670
+tdfs4ds/utils/lineage.py,sha256=XvoiNyrVrsVhuSZTAJrCNjEZAQ4YVsoe61aIl2fBKzk,34757
 tdfs4ds/utils/query_management.py,sha256=nAcE8QY1GWAKgOtb-ubSfDVcnYbU7Ge8CruVRLoPtmY,6356
-tdfs4ds/utils/time_management.py,sha256=rVxtIXcFtQih2UabAtos4DK-j9MPqzYVieIz_SvySZE,9241
+tdfs4ds/utils/time_management.py,sha256=1eqGs7rT3SGag0F30R3PzwiC7Aa7DKia2Ud0aSNKcPg,10593
 tdfs4ds/utils/visualization.py,sha256=5S528KoKzzkrAdCxfy7ecyqKvAXBoibNvHwz_u5ISMs,23167
-tdfs4ds-0.2.4.17.dist-info/METADATA,sha256=p_BzFpsW4I4oBIQryiPuid20dyzJbHw23_R8hrj_quQ,11944
-tdfs4ds-0.2.4.17.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
-tdfs4ds-0.2.4.17.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
-tdfs4ds-0.2.4.17.dist-info/RECORD,,
+tdfs4ds-0.2.4.19.dist-info/METADATA,sha256=5ri5CNAhzx8igDa4PpsfwMQPeJSxhNsaM_c7_B8IYIs,11944
+tdfs4ds-0.2.4.19.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+tdfs4ds-0.2.4.19.dist-info/top_level.txt,sha256=wMyVkMvnBn8RRt1xBveGQxOpWFijPMPkMiE7G2mi8zo,8
+tdfs4ds-0.2.4.19.dist-info/RECORD,,

{tdfs4ds-0.2.4.17.dist-info → tdfs4ds-0.2.4.19.dist-info}/WHEEL RENAMED Viewed

File without changes

{tdfs4ds-0.2.4.17.dist-info → tdfs4ds-0.2.4.19.dist-info}/top_level.txt RENAMED Viewed

File without changes

tdfs4ds 0.2.4.17__py3-none-any.whl → 0.2.4.19__py3-none-any.whl

tdfs4ds 0.2.4.17__py3-none-any.whl → 0.2.4.19__py3-none-any.whl