ai-data-science-team 0.0.0.9008__py3-none-any.whl → 0.0.0.9010__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. ai_data_science_team/_version.py +1 -1
  2. ai_data_science_team/agents/__init__.py +0 -1
  3. ai_data_science_team/agents/data_cleaning_agent.py +50 -39
  4. ai_data_science_team/agents/data_loader_tools_agent.py +69 -0
  5. ai_data_science_team/agents/data_visualization_agent.py +45 -50
  6. ai_data_science_team/agents/data_wrangling_agent.py +50 -49
  7. ai_data_science_team/agents/feature_engineering_agent.py +48 -67
  8. ai_data_science_team/agents/sql_database_agent.py +130 -76
  9. ai_data_science_team/ml_agents/__init__.py +2 -0
  10. ai_data_science_team/ml_agents/h2o_ml_agent.py +852 -0
  11. ai_data_science_team/ml_agents/mlflow_tools_agent.py +327 -0
  12. ai_data_science_team/multiagents/sql_data_analyst.py +120 -9
  13. ai_data_science_team/parsers/__init__.py +0 -0
  14. ai_data_science_team/{tools → parsers}/parsers.py +0 -1
  15. ai_data_science_team/templates/__init__.py +1 -0
  16. ai_data_science_team/templates/agent_templates.py +78 -7
  17. ai_data_science_team/tools/data_loader.py +378 -0
  18. ai_data_science_team/tools/{metadata.py → dataframe.py} +0 -91
  19. ai_data_science_team/tools/h2o.py +643 -0
  20. ai_data_science_team/tools/mlflow.py +961 -0
  21. ai_data_science_team/tools/sql.py +126 -0
  22. ai_data_science_team/{tools → utils}/regex.py +59 -1
  23. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/METADATA +56 -24
  24. ai_data_science_team-0.0.0.9010.dist-info/RECORD +35 -0
  25. ai_data_science_team-0.0.0.9008.dist-info/RECORD +0 -26
  26. /ai_data_science_team/{tools → utils}/logging.py +0 -0
  27. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/LICENSE +0 -0
  28. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/WHEEL +0 -0
  29. {ai_data_science_team-0.0.0.9008.dist-info → ai_data_science_team-0.0.0.9010.dist-info}/top_level.txt +0 -0
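At a glance, 0.0.0.9010 adds two ML agents (h2o_ml_agent, mlflow_tools_agent), a data_loader_tools_agent backed by the new tools/data_loader.py, and new h2o.py, mlflow.py, and sql.py tool modules, while reorganizing internals: parsers.py moves into a new parsers package, regex.py and logging.py move into utils, and tools/metadata.py is renamed tools/dataframe.py. Code importing from the old paths will need updating; the sketch below maps the path changes implied by the renames in the file list. These new paths are inferred from the listing, not verified against the wheel:

    # Inferred import-path updates for 0.0.0.9010 (old -> new); verify
    # against the installed package before relying on them.
    #   ai_data_science_team.tools.parsers   -> ai_data_science_team.parsers.parsers
    #   ai_data_science_team.tools.regex     -> ai_data_science_team.utils.regex
    #   ai_data_science_team.tools.logging   -> ai_data_science_team.utils.logging
    #   ai_data_science_team.tools.metadata  -> ai_data_science_team.tools.dataframe
    from ai_data_science_team.tools.dataframe import get_dataframe_summary  # was tools.metadata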
--- /dev/null
+++ b/ai_data_science_team/tools/data_loader.py
@@ -0,0 +1,378 @@
+
+from langchain.tools import tool
+
+import pandas as pd
+
+from typing import Tuple, List, Dict
+
+
+@tool(response_format='content_and_artifact')
+def load_directory(dir_path: str) -> Tuple[str, Dict]:
+    """
+    Tool: load_directory
+    Description: Loads all recognized tabular files in a directory.
+
+    Parameters:
+    ----------
+    dir_path : str
+        The path to the directory to load.
+
+    Returns:
+    -------
+    Tuple[str, Dict]
+        A tuple containing a message and a dictionary of data frames.
+    """
+    print(" * Tool: load_directory")
+    import os
+    import pandas as pd
+    data_frames = {}
+    for filename in os.listdir(dir_path):
+        file_path = os.path.join(dir_path, filename)
+        # Skip directories
+        if os.path.isdir(file_path):
+            continue
+        try:
+            data_frames[filename] = auto_load_file(file_path).to_dict()
+        except Exception as e:
+            data_frames[filename] = f"Error loading file: {e}"
+    return f"Returned the following data frames: {list(data_frames.keys())}", data_frames
+
+@tool(response_format='content_and_artifact')
+def load_file(file_path: str) -> Tuple[str, Dict]:
+    """
+    Automatically loads a file based on its extension.
+
+    Parameters:
+    ----------
+    file_path : str
+        The path to the file to load.
+
+    Returns:
+    -------
+    Tuple[str, Dict]
+        A tuple containing a message and a dictionary of the data frame.
+    """
+    print(" * Tool: load_file")
+    return f"Returned the following data frame from this file: {file_path}", auto_load_file(file_path).to_dict()
+
+
+@tool(response_format='content_and_artifact')
+def list_directory_contents(directory_path: str, show_hidden: bool = False) -> Tuple[List[str], List[Dict]]:
+    """
+    Tool: list_directory_contents
+    Description: Lists all files and folders in the specified directory.
+    Args:
+        directory_path (str): The path of the directory to list.
+        show_hidden (bool): Whether to include hidden files (default: False).
+    Returns:
+        tuple:
+            - content (list[str]): A list of filenames/folders (suitable for display)
+            - artifact (list[dict]): A list of dictionaries where each dict has keys like {"filename": <name>}.
+              This structure can be easily converted to a pandas DataFrame.
+    """
+    print(" * Tool: list_directory_contents")
+    import os
+
+    items = []
+    for item in os.listdir(directory_path):
+        # If show_hidden is False, skip items starting with '.'
+        if not show_hidden and item.startswith('.'):
+            continue
+        items.append(item)
+
+    # content: just the raw list of filenames
+    content = items
+
+    # artifact: list of dicts (each row is {"filename": ...}), easily turned into a DataFrame
+    artifact = [{"filename": item} for item in items]
+
+    return content, artifact
+
+
+@tool(response_format='content_and_artifact')
+def list_directory_recursive(directory_path: str, show_hidden: bool = False) -> Tuple[str, List[Dict]]:
+    """
+    Tool: list_directory_recursive
+    Description:
+        Recursively lists all files and folders within the specified directory.
+        Returns a two-tuple:
+        (1) A human-readable tree representation of the directory (content).
+        (2) A list of dicts (artifact) that can be easily converted into a DataFrame.
+
+    Args:
+        directory_path (str): The path of the directory to list.
+        show_hidden (bool): Whether to include hidden files (default: False).
+
+    Returns:
+        Tuple[str, List[dict]]:
+            content: A multiline string showing the directory tree.
+            artifact: A list of dictionaries, each with information about a file or directory.
+
+    Example:
+        content, artifact = list_directory_recursive("/path/to/folder", show_hidden=False)
+    """
+    print(" * Tool: list_directory_recursive")
+
+    # We'll store two things as we recurse:
+    # 1) lines for building the "tree" string
+    # 2) records in a list of dicts for easy DataFrame creation
+    import os
+
+    lines = []
+    records = []
+
+    def recurse(path: str, indent_level: int = 0):
+        # List items in the current directory
+        try:
+            items = os.listdir(path)
+        except PermissionError:
+            # If we don't have permission to read the directory, just note it.
+            lines.append(" " * indent_level + "[Permission Denied]")
+            return
+
+        # Sort items for a consistent order (optional)
+        items.sort()
+
+        for item in items:
+            if not show_hidden and item.startswith('.'):
+                continue
+
+            full_path = os.path.join(path, item)
+            # Build an indented prefix for the tree
+            prefix = " " * indent_level
+
+            if os.path.isdir(full_path):
+                # Directory
+                lines.append(f"{prefix}{item}/")
+                records.append({
+                    "type": "directory",
+                    "name": item,
+                    "parent_path": path,
+                    "absolute_path": full_path
+                })
+                # Recursively descend
+                recurse(full_path, indent_level + 1)
+            else:
+                # File
+                lines.append(f"{prefix}- {item}")
+                records.append({
+                    "type": "file",
+                    "name": item,
+                    "parent_path": path,
+                    "absolute_path": full_path
+                })
+
+    # Kick off recursion
+    if os.path.isdir(directory_path):
+        # Add the top-level directory to lines/records if you like
+        dir_name = os.path.basename(os.path.normpath(directory_path)) or directory_path
+        lines.append(f"{dir_name}/")  # Show the root as well
+        records.append({
+            "type": "directory",
+            "name": dir_name,
+            "parent_path": os.path.dirname(directory_path),
+            "absolute_path": os.path.abspath(directory_path)
+        })
+        recurse(directory_path, indent_level=1)
+    else:
+        # If the given path is not a directory, just return a note
+        lines.append(f"{directory_path} is not a directory.")
+        records.append({
+            "type": "error",
+            "name": directory_path,
+            "parent_path": None,
+            "absolute_path": os.path.abspath(directory_path)
+        })
+
+    # content: multiline string with the entire tree
+    content = "\n".join(lines)
+    # artifact: list of dicts, easily converted into a DataFrame
+    artifact = records
+
+    return content, artifact
+
+
+@tool(response_format='content_and_artifact')
+def get_file_info(file_path: str) -> Tuple[str, List[Dict]]:
+    """
+    Tool: get_file_info
+    Description: Retrieves metadata (size, modification time, etc.) about a file.
+        Returns a tuple (content, artifact):
+        - content (str): A textual summary of the file info.
+        - artifact (List[Dict]): A list with a single dictionary of file metadata.
+          Useful for direct conversion into a DataFrame.
+    Args:
+        file_path (str): The path of the file to inspect.
+    Returns:
+        Tuple[str, List[dict]]:
+            content: Summary text
+            artifact: A list[dict] of file metadata
+    Example:
+        content, artifact = get_file_info("/path/to/mydata.csv")
+    """
+    print(" * Tool: get_file_info")
+
+    # Ensure the file exists
+    import os
+    import time
+
+    if not os.path.isfile(file_path):
+        raise FileNotFoundError(f"{file_path} is not a valid file.")
+
+    file_stats = os.stat(file_path)
+
+    # Construct the data dictionary
+    file_data = {
+        "file_name": os.path.basename(file_path),
+        "size_bytes": file_stats.st_size,
+        "modification_time": time.ctime(file_stats.st_mtime),
+        "absolute_path": os.path.abspath(file_path),
+    }
+
+    # Create a user-friendly summary (content)
+    content_str = (
+        f"File Name: {file_data['file_name']}\n"
+        f"Size (bytes): {file_data['size_bytes']}\n"
+        f"Last Modified: {file_data['modification_time']}\n"
+        f"Absolute Path: {file_data['absolute_path']}"
+    )
+
+    # Artifact should be a list of dict(s) to easily convert to DataFrame
+    artifact = [file_data]
+
+    return content_str, artifact
+
+
+@tool(response_format='content_and_artifact')
+def search_files_by_pattern(directory_path: str, pattern: str = "*.csv", recursive: bool = False) -> Tuple[str, List[Dict]]:
+    """
+    Tool: search_files_by_pattern
+    Description:
+        Searches for files (optionally in subdirectories) that match a given
+        wildcard pattern (e.g. "*.csv", "*.xlsx", etc.), returning a tuple:
+        (1) content (str): A multiline summary of the matched files.
+        (2) artifact (List[Dict]): A list of dicts with file path info.
+
+    Args:
+        directory_path (str): Directory path to start searching from.
+        pattern (str): A wildcard pattern, e.g. "*.csv". Default is "*.csv".
+        recursive (bool): Whether to search in subdirectories. Default is False.
+
+    Returns:
+        Tuple[str, List[Dict]]:
+            content: A user-friendly string showing matched file paths.
+            artifact: A list of dictionaries, each representing a matched file.
+
+    Example:
+        content, artifact = search_files_by_pattern("/path/to/folder", "*.csv", recursive=True)
+    """
+    print(" * Tool: search_files_by_pattern")
+
+    import os
+    import fnmatch
+
+    matched_files = []
+    if recursive:
+        for root, dirs, files in os.walk(directory_path):
+            for filename in files:
+                if fnmatch.fnmatch(filename, pattern):
+                    matched_files.append(os.path.join(root, filename))
+    else:
+        # Non-recursive
+        for filename in os.listdir(directory_path):
+            full_path = os.path.join(directory_path, filename)
+            if os.path.isfile(full_path) and fnmatch.fnmatch(filename, pattern):
+                matched_files.append(full_path)
+
+    # Create a human-readable summary (content)
+    if matched_files:
+        lines = [f"Found {len(matched_files)} file(s) matching '{pattern}':"]
+        for f in matched_files:
+            lines.append(f" - {f}")
+        content = "\n".join(lines)
+    else:
+        content = f"No files found matching '{pattern}'."
+
+    # Create artifact as a list of dicts for DataFrame conversion
+    artifact = [{"file_path": path} for path in matched_files]
+
+    return content, artifact
+
+
+# Loaders
+
+def auto_load_file(file_path: str) -> pd.DataFrame:
+    """
+    Auto loads a file based on its extension.
+
+    Parameters:
+    ----------
+    file_path : str
+        The path to the file to load.
+
+    Returns:
+    -------
+    pd.DataFrame
+    """
+    import pandas as pd
+    try:
+        ext = file_path.split(".")[-1].lower()
+        if ext == "csv":
+            return load_csv(file_path)
+        elif ext in ["xlsx", "xls"]:
+            return load_excel(file_path)
+        elif ext == "json":
+            return load_json(file_path)
+        elif ext == "parquet":
+            return load_parquet(file_path)
+        elif ext == "pkl":
+            return load_pickle(file_path)
+        else:
+            return f"Unsupported file extension: {ext}"
+    except Exception as e:
+        return f"Error loading file: {e}"
+
+def load_csv(file_path: str) -> pd.DataFrame:
+    """
+    Tool: load_csv
+    Description: Loads a CSV file into a pandas DataFrame.
+    Args:
+        file_path (str): Path to the CSV file.
+    Returns:
+        pd.DataFrame
+    """
+    import pandas as pd
+    return pd.read_csv(file_path)
+
+def load_excel(file_path: str, sheet_name=None) -> pd.DataFrame:
+    """
+    Tool: load_excel
+    Description: Loads an Excel file into a pandas DataFrame.
+    """
+    import pandas as pd
+    return pd.read_excel(file_path, sheet_name=sheet_name)
+
+def load_json(file_path: str) -> pd.DataFrame:
+    """
+    Tool: load_json
+    Description: Loads a JSON file or NDJSON into a pandas DataFrame.
+    """
+    import pandas as pd
+    # For simple JSON arrays
+    return pd.read_json(file_path, orient="records", lines=False)
+
+def load_parquet(file_path: str) -> pd.DataFrame:
+    """
+    Tool: load_parquet
+    Description: Loads a Parquet file into a pandas DataFrame.
+    """
+    import pandas as pd
+    return pd.read_parquet(file_path)
+
+def load_pickle(file_path: str) -> pd.DataFrame:
+    """
+    Tool: load_pickle
+    Description: Loads a Pickle file into a pandas DataFrame.
+    """
+    import pandas as pd
+    return pd.read_pickle(file_path)
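The tools above use LangChain's content_and_artifact response format, so each returns a (content, artifact) pair: a string for the model and a structured payload for downstream code. A minimal usage sketch (the file name "data.csv" is illustrative; the invoke behavior follows LangChain's documented handling of content_and_artifact tools, where plain args return only the content and a ToolCall dict returns a ToolMessage carrying the artifact):

    import pandas as pd
    from ai_data_science_team.tools.data_loader import load_file

    # Invoked with plain args, a content_and_artifact tool returns just the content string.
    print(load_file.invoke({"file_path": "data.csv"}))

    # Invoked with a ToolCall dict, it returns a ToolMessage whose .artifact holds
    # the DataFrame serialized via .to_dict() (as in the tool body above).
    msg = load_file.invoke({
        "name": "load_file",
        "args": {"file_path": "data.csv"},
        "id": "call-1",
        "type": "tool_call",
    })
    df = pd.DataFrame(msg.artifact)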
--- a/ai_data_science_team/tools/metadata.py
+++ b/ai_data_science_team/tools/dataframe.py
@@ -1,6 +1,5 @@
 import io
 import pandas as pd
-import sqlalchemy as sql
 from typing import Union, List, Dict
 
 def get_dataframe_summary(
@@ -138,93 +137,3 @@ def _summarize_dataframe(df: pd.DataFrame, dataset_name: str, n_sample=30, skip_
     return summary_text.strip()
 
 
-
-def get_database_metadata(connection: Union[sql.engine.base.Connection, sql.engine.base.Engine],
-                          n_samples: int = 10) -> str:
-    """
-    Collects metadata and sample data from a database, with safe identifier quoting and
-    basic dialect-aware row limiting. Prevents issues with spaces/reserved words in identifiers.
-
-    Parameters
-    ----------
-    connection : Union[sql.engine.base.Connection, sql.engine.base.Engine]
-        An active SQLAlchemy connection or engine.
-    n_samples : int
-        Number of sample values to retrieve for each column.
-
-    Returns
-    -------
-    str
-        A formatted string with database metadata, including some sample data from each column.
-    """
-
-    # If a connection is passed, use it; if an engine is passed, connect to it
-    is_engine = isinstance(connection, sql.engine.base.Engine)
-    conn = connection.connect() if is_engine else connection
-
-    output = []
-    try:
-        # Grab the engine off the connection
-        sql_engine = conn.engine
-        dialect_name = sql_engine.dialect.name.lower()
-
-        output.append(f"Database Dialect: {sql_engine.dialect.name}")
-        output.append(f"Driver: {sql_engine.driver}")
-        output.append(f"Connection URL: {sql_engine.url}")
-
-        # Inspect the database
-        inspector = sql.inspect(sql_engine)
-        tables = inspector.get_table_names()
-        output.append(f"Tables: {tables}")
-        output.append(f"Schemas: {inspector.get_schema_names()}")
-
-        # Helper to build a dialect-specific limit clause
-        def build_query(col_name_quoted: str, table_name_quoted: str, n: int) -> str:
-            """
-            Returns a SQL query string to select N rows from the given column/table
-            across different dialects (SQLite, MySQL, Postgres, MSSQL, Oracle, etc.)
-            """
-            if "sqlite" in dialect_name or "mysql" in dialect_name or "postgres" in dialect_name:
-                # Common dialects supporting LIMIT
-                return f"SELECT {col_name_quoted} FROM {table_name_quoted} LIMIT {n}"
-            elif "mssql" in dialect_name:
-                # Microsoft SQL Server syntax
-                return f"SELECT TOP {n} {col_name_quoted} FROM {table_name_quoted}"
-            elif "oracle" in dialect_name:
-                # Oracle syntax
-                return f"SELECT {col_name_quoted} FROM {table_name_quoted} WHERE ROWNUM <= {n}"
-            else:
-                # Fallback
-                return f"SELECT {col_name_quoted} FROM {table_name_quoted} LIMIT {n}"
-
-        # Prepare for quoting
-        preparer = inspector.bind.dialect.identifier_preparer
-
-        # For each table, get columns and sample data
-        for table_name in tables:
-            output.append(f"\nTable: {table_name}")
-            # Properly quote the table name
-            table_name_quoted = preparer.quote_identifier(table_name)
-
-            for column in inspector.get_columns(table_name):
-                col_name = column["name"]
-                col_type = column["type"]
-                output.append(f" Column: {col_name} Type: {col_type}")
-
-                # Properly quote the column name
-                col_name_quoted = preparer.quote_identifier(col_name)
-
-                # Build a dialect-aware query with safe quoting
-                query = build_query(col_name_quoted, table_name_quoted, n_samples)
-
-                # Read a few sample values
-                df = pd.read_sql(sql.text(query), conn)
-                first_values = df[col_name].tolist()
-                output.append(f" First {n_samples} Values: {first_values}")
-
-    finally:
-        # Close connection if created inside the function
-        if is_engine:
-            conn.close()
-
-    return "\n".join(output)