PyPI - datablade - Versions diffs - 0.0.0__tar.gz - Mend

datablade 0.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18)

datablade-0.0.0/LICENSE +21 -0
datablade-0.0.0/PKG-INFO +13 -0
datablade-0.0.0/pyproject.toml +3 -0
datablade-0.0.0/setup.cfg +4 -0
datablade-0.0.0/setup.py +12 -0
datablade-0.0.0/src/datablade/__init__.py +1 -0
datablade-0.0.0/src/datablade/core/__init__.py +7 -0
datablade-0.0.0/src/datablade/core/frames.py +236 -0
datablade-0.0.0/src/datablade/core/json.py +10 -0
datablade-0.0.0/src/datablade/core/lists.py +10 -0
datablade-0.0.0/src/datablade/core/messages.py +11 -0
datablade-0.0.0/src/datablade/core/strings.py +43 -0
datablade-0.0.0/src/datablade/core/zip.py +24 -0
datablade-0.0.0/src/datablade.egg-info/PKG-INFO +13 -0
datablade-0.0.0/src/datablade.egg-info/SOURCES.txt +16 -0
datablade-0.0.0/src/datablade.egg-info/dependency_links.txt +1 -0
datablade-0.0.0/src/datablade.egg-info/requires.txt +5 -0
datablade-0.0.0/src/datablade.egg-info/top_level.txt +1 -0

datablade-0.0.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2024 Brent Carpenetti
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

datablade-0.0.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,13 @@
+Metadata-Version: 2.1
+Name: datablade
+Version: 0.0.0
+Summary: datablade is a suite of functions to provide standard syntax across projects.
+Author: Brent Carpenetti
+Author-email: brentcarpenetti@gmail.com
+License: MIT
+License-File: LICENSE
+Requires-Dist: pandas
+Requires-Dist: pyarrow
+Requires-Dist: numpy
+Requires-Dist: openpyxl
+Requires-Dist: requests

datablade-0.0.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"

datablade-0.0.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

datablade-0.0.0/setup.py ADDED Viewed

@@ -0,0 +1,12 @@
+from setuptools import setup, find_packages
# Short summary shown on the package index page.
desc = """datablade is a suite of functions to provide standard syntax across projects."""

# Packages live under src/ (the "src layout"), so both find_packages and
# package_dir point there.
setup(
    name='datablade',
    version='0.0.0',
    packages=find_packages(where="src"),
    package_dir={'': 'src'},
    install_requires=['pandas', 'pyarrow', 'numpy', 'openpyxl', 'requests'],
    # The sources evaluate annotations such as `pd.DataFrame|None` and
    # `str|None` at import time; the `|` operator on types only exists from
    # Python 3.10, so older interpreters fail on import. Declare the floor so
    # installers refuse unsupported interpreters instead.
    python_requires='>=3.10',
    include_package_data=True,
    description=desc,
    author='Brent Carpenetti',
    author_email='brentcarpenetti@gmail.com',
    license='MIT',
)

datablade-0.0.0/src/datablade/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+#empty

datablade-0.0.0/src/datablade/core/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+import os, re
def find_python_files(path):
    """
    List the module names of the Python source files directly inside a directory.
    Parameters:
        path (str): Directory to scan (not recursive). It is made absolute before listing.
    Returns:
        list: File names with the trailing '.py' removed, in os.listdir order.
              '__init__.py' and files without a '.py' extension are skipped.
    """
    module_names = []
    for one_file_name in os.listdir(os.path.abspath(path)):
        if one_file_name == '__init__.py':
            continue
        # splitext removes only the final extension. The previous
        # str.replace('.py', '') deleted every '.py' occurrence in the name,
        # and a file literally named '.py' produced an empty module name.
        stem, extension = os.path.splitext(one_file_name)
        if extension == '.py':
            module_names.append(stem)
    return module_names
# Re-export the public names of every sibling module so callers can use
# `from datablade.core import <name>` without knowing which file defines it.
# importlib replaces the previous exec() of a string-built import statement:
# the file name is never interpreted as code.
import importlib as _importlib
for each_file in find_python_files(path=os.path.dirname(__file__)):
    _module = _importlib.import_module('.' + each_file, __name__)
    # Same name selection as `from module import *`: honour __all__ when the
    # module defines it, otherwise take every name without a leading underscore.
    _exported = getattr(_module, '__all__', None)
    if _exported is None:
        _exported = [_name for _name in vars(_module) if not _name.startswith('_')]
    globals().update({_name: getattr(_module, _name) for _name in _exported})

datablade-0.0.0/src/datablade/core/frames.py ADDED Viewed

@@ -0,0 +1,236 @@
+import pandas as pd
+import pyarrow as pa
+import numpy as np
+from .messages import print_verbose
+from .strings  import sql_quotename
def try_cast_string_columns_to_numeric(df: pd.DataFrame=None, convert_partial: bool=False, verbose: bool=False) -> pd.DataFrame|None:
    """
    Attempt to cast DataFrame string (object-dtype) columns to numeric values where possible.
    The DataFrame is modified in place and is also returned.
    Parameters:
        df (pd.DataFrame): The DataFrame to process.
        convert_partial (bool): If True, columns with some values convertible to numeric types
                                will be converted to numeric types with NaNs where conversion failed.
                                If False, only columns where all values can be converted will be converted.
        verbose (bool): If True, a message is printed for every column examined.
    Returns:
        pd.DataFrame: DataFrame with string columns converted to numeric types where possible,
                      or None when no DataFrame was provided.
    """
    if df is None:
        print_verbose("No DataFrame provided; exiting try_cast_string_columns_to_numeric.", verbose)
        # A bare `exit` only names the builtin and does nothing; return
        # explicitly so the code below never runs on None.
        return None
    for col in df.columns:
        if df[col].dtype != 'object':
            continue
        converted = pd.to_numeric(df[col], errors='coerce')
        # A conversion failure is a value that was present before coercion and
        # is null afterwards. Values that were already missing are not
        # failures, so a column such as ['1', None, '3'] still counts as fully
        # convertible.
        conversion_failed = (converted.isnull() & df[col].notnull()).any()
        if not conversion_failed:
            df[col] = converted
            print_verbose(f"Column '{col}' successfully converted to numeric.", verbose)
        elif convert_partial:
            df[col] = converted
            print_verbose(f"Column '{col}' partially converted to numeric with NaNs where conversion failed.", verbose)
        else:
            print_verbose(f"Column '{col}' could not be fully converted to numeric; leaving as is.", verbose)
    return df
def clean_dataframe_columns(df: pd.DataFrame=None, verbose: bool=False) -> pd.DataFrame|None:
    """
    Clean the DataFrame columns by:
    - Flattening MultiIndex columns
    - Converting non-string column names to strings
    - Removing duplicate columns, keeping the first occurrence
    The column labels of the DataFrame passed in are rewritten in place; when
    duplicates are dropped the returned object is a new DataFrame.
    Parameters:
        df (pd.DataFrame): The DataFrame to clean.
        verbose (bool): If True, a message is printed for every step.
    Returns:
        pd.DataFrame: The cleaned DataFrame, or None when no DataFrame was provided.
    """
    if df is None:
        print_verbose("No DataFrame provided; exiting clean_dataframe_columns.", verbose)
        # A bare `exit` is a no-op expression; return explicitly instead of
        # falling through to attribute access on None.
        return None
    # Step 1: Flatten MultiIndex columns by joining the levels with '_'
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = ['_'.join(map(str, col)).strip() for col in df.columns.values]
        print_verbose("Flattened MultiIndex columns.", verbose)
    # Step 2: Convert non-string column names to strings
    df.columns = df.columns.map(str)
    print_verbose("Converted column names to strings.", verbose)
    # Step 3: Remove duplicate columns, keeping the first occurrence
    duplicates = df.columns.duplicated()
    if duplicates.any():
        duplicate_cols = df.columns[duplicates]
        print_verbose(f"Duplicate columns found: {list(duplicate_cols)}", verbose)
        df = df.loc[:, ~duplicates]
        print_verbose("Removed duplicate columns, keeping the first occurrence.", verbose)
    return df
def _smallest_arrow_integer_type(col_data: pd.Series) -> pa.DataType:
    """Pick the narrowest PyArrow integer type able to hold every value of an integer column."""
    min_value = col_data.min()
    max_value = col_data.max()
    # Empty or all-missing columns have no usable range (min/max are NaN or
    # pd.NA, and comparing pd.NA raises); fall back to the widest signed type.
    if pd.isna(min_value) or pd.isna(max_value):
        return pa.int64()
    for numpy_type, arrow_type in ((np.int8, pa.int8()), (np.int16, pa.int16()), (np.int32, pa.int32())):
        limits = np.iinfo(numpy_type)
        if min_value >= limits.min and max_value <= limits.max:
            return arrow_type
    # Unsigned 64-bit data can exceed the signed 64-bit range.
    if int(max_value) > np.iinfo(np.int64).max:
        return pa.uint64()
    return pa.int64()
def generate_parquet_schema(df: pd.DataFrame=None, verbose: bool=False) -> pa.Schema|None:
    """
    Generate a PyArrow Schema from a pandas DataFrame.
    Integer columns get the narrowest integer type that fits their values,
    floats map to float64, booleans to bool, datetimes to millisecond
    timestamps, and everything else (including categorical and object
    columns) to string. A field is nullable only if its column currently
    contains nulls.
    Parameters:
        df (pandas.DataFrame): The DataFrame to generate the schema from.
        verbose (bool): If True, diagnostic messages are printed.
    Returns:
        pyarrow.Schema: The PyArrow Schema object, or None when no DataFrame was provided.
    """
    if df is None:
        print_verbose("No DataFrame provided; exiting generate_parquet_schema.", verbose)
        # A bare `exit` is a no-op expression; return explicitly.
        return None
    fields = []
    for column in df.columns:
        col_data = df[column]
        dtype = col_data.dtype
        # Determine if the column contains any nulls (plain bool, not numpy.bool_)
        nullable = bool(col_data.isnull().any())
        # Map pandas dtype to PyArrow type
        if pd.api.types.is_integer_dtype(dtype):
            pa_type = _smallest_arrow_integer_type(col_data)
        elif pd.api.types.is_float_dtype(dtype):
            pa_type = pa.float64()
        elif pd.api.types.is_bool_dtype(dtype):
            pa_type = pa.bool_()
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            pa_type = pa.timestamp('ms')
        else:
            # Categorical, object and any unrecognised dtype are stored as strings.
            pa_type = pa.string()
        fields.append(pa.field(column, pa_type, nullable=nullable))
    return pa.schema(fields)
def pandas_to_parquet_table(df: pd.DataFrame=None, convert: bool=True, partial: bool=False, preserve_index: bool=False, verbose: bool=False) -> pa.Table|None:
    """
    Generate a PyArrow Table from a pandas DataFrame.
    The column names are cleaned first, string columns are optionally cast to
    numeric, and a schema is derived from the result before conversion.
    Parameters:
        df (pandas.DataFrame): The DataFrame to generate the table from.
        convert (bool): If True, try to cast string columns to numeric first.
        partial (bool): Passed as convert_partial to the numeric cast; if True, columns that
                        only partly convert are cast with NaNs where conversion failed.
        preserve_index (bool): Whether the DataFrame index is stored in the table.
        verbose (bool): If True, diagnostic messages are printed.
    Returns:
        pyarrow.Table: The PyArrow Table object, or None when no DataFrame was
                       provided or the conversion failed.
    """
    if df is None:
        print_verbose("No DataFrame provided; exiting pandas_to_parquet_table.", verbose)
        # A bare `exit` is a no-op expression; return explicitly.
        return None
    df = clean_dataframe_columns(df=df, verbose=verbose)
    if convert:
        df = try_cast_string_columns_to_numeric(df=df, convert_partial=partial, verbose=verbose)
    schema = generate_parquet_schema(df=df, verbose=verbose)
    try:
        return pa.Table.from_pandas(df, schema=schema, preserve_index=preserve_index)
    except Exception as e:
        # Best-effort conversion: report the problem and signal failure with None.
        print_verbose(f"Error generating PyArrow Table: {e}", verbose)
        return None
def _sql_server_column_type(col_data: pd.Series) -> str:
    """Map one DataFrame column to a SQL Server column type based on its dtype and values."""
    dtype = col_data.dtype
    if pd.api.types.is_integer_dtype(dtype):
        min_value = col_data.min()
        max_value = col_data.max()
        # Empty or all-missing columns have no usable range (comparing pd.NA
        # raises); use the widest integer type.
        if pd.isna(min_value) or pd.isna(max_value):
            return "bigint"
        if min_value >= 0 and max_value <= 255:
            return "tinyint"
        if min_value >= -32768 and max_value <= 32767:
            return "smallint"
        if min_value >= -2147483648 and max_value <= 2147483647:
            return "int"
        # Unsigned 64-bit values may not fit in bigint.
        if int(max_value) > 9223372036854775807:
            return "decimal(20, 0)"
        return "bigint"
    if pd.api.types.is_float_dtype(dtype):
        return "float"
    if pd.api.types.is_bool_dtype(dtype):
        return "bit"
    if pd.api.types.is_datetime64_any_dtype(dtype):
        return "datetime2"
    if isinstance(dtype, pd.CategoricalDtype) or pd.api.types.is_object_dtype(dtype):
        # Determine maximum length of string data
        max_length = col_data.dropna().astype(str).map(len).max()
        # No data (NaN) or more than 4000 characters: use nvarchar(max).
        if pd.isna(max_length) or max_length > 4000:
            return "nvarchar(max)"
        # nvarchar(0) is not a valid declaration; the minimum length is 1.
        return f"nvarchar({max(int(max_length), 1)})"
    return "nvarchar(max)"
def generate_sql_server_create_table_string(df: pd.DataFrame=None, catalog: str='database', schema: str='dbo', table: str='table', dropexisting: bool=True, verbose: bool=False) -> str|None:
    """
    Generate a SQL Server CREATE TABLE string from a pandas DataFrame.
    Lines of the generated script are separated by carriage returns ('\\r').
    Parameters:
        df (pandas.DataFrame): The DataFrame to generate the schema from.
        catalog (str): The database name.
        schema (str): The schema name.
        table (str): The table name.
        dropexisting (bool): If True, the script first switches to the catalog and drops the
                             table when it already exists.
        verbose (bool): If True, diagnostic messages are printed.
    Returns:
        str: The SQL Server CREATE TABLE statement, or None when no DataFrame was provided.
    """
    if df is None:
        print_verbose("No DataFrame provided; exiting generate_sql_server_create_table_string.", verbose)
        # A bare `exit` is a no-op expression; return explicitly.
        return None
    table_name = f"{sql_quotename(catalog)}.{sql_quotename(schema)}.{sql_quotename(table)}"
    drop_statement = f"use {sql_quotename(catalog)}\rgo\rif object_id('{table_name}') is not null drop table {table_name};\r" if dropexisting else ""
    # NOTE(review): the ';' before "create table" yields an empty statement;
    # it is kept so the generated script stays unchanged for existing callers.
    create_statement = [f"{drop_statement};create table {table_name} ("]
    indent = "    "
    column_lines = []
    for column in df.columns:
        col_data = df[column]
        # A column is declared nullable only if it currently contains nulls
        null_str = f"{'   ' if col_data.isnull().any() else 'not'} null"
        sql_type = _sql_server_column_type(col_data)
        # Column labels may be non-strings (e.g. integers); quote their text form.
        column_lines.append(f"{indent}{sql_quotename(str(column))} {sql_type} {null_str}")
    # Every column definition except the last one is followed by a comma
    create_statement.extend(line + "," for line in column_lines[:-1])
    create_statement.extend(column_lines[-1:])
    create_statement.append(");")
    return "\r".join(create_statement)

datablade-0.0.0/src/datablade/core/json.py ADDED Viewed

@@ -0,0 +1,10 @@
+import requests
+from .messages import print_verbose
def get(url: str, verbose: bool = False, **kwargs) -> dict|None:
    """
    Get JSON data from a URL.
    Parameters:
        url (str): The address to request.
        verbose (bool): If True, errors are printed.
        **kwargs: Passed unchanged to requests.get (for example timeout or headers).
    Returns:
        dict: The decoded JSON body, or None when the request failed or the
              body was not valid JSON.
    """
    try:
        response = requests.get(url, **kwargs)
        return response.json()
    # ValueError covers JSON decoding failures on requests versions whose
    # decode error is not a RequestException subclass.
    except (requests.exceptions.RequestException, ValueError) as e:
        print_verbose(f"Error: {e}", verbose=verbose)
        return None

datablade-0.0.0/src/datablade/core/lists.py ADDED Viewed

@@ -0,0 +1,10 @@
def flatten(nest: list) -> list:
    """
    Flatten a nested list.
    Only `list` instances are expanded; tuples, strings and other containers
    found inside are kept as single items. The walk is iterative, so nesting
    depth is not limited by the interpreter's recursion limit.
    Parameters:
        nest (list): The (possibly nested) list to flatten.
    Returns:
        list: A new flat list with the items in their original order.
    Raises:
        ValueError: If a list contains itself, directly or indirectly.
    """
    result = []
    # One iterator per list currently being walked, innermost last.
    pending = [iter(nest)]
    # ids of the lists on the current path, used to detect circular nesting.
    path_ids = [id(nest)]
    open_ids = {id(nest)}
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                if id(item) in open_ids:
                    raise ValueError("Circular reference detected while flattening.")
                open_ids.add(id(item))
                path_ids.append(id(item))
                pending.append(iter(item))
                # Descend now; the parent iterator keeps its position.
                break
            result.append(item)
        else:
            # Innermost list exhausted: go back to its parent.
            pending.pop()
            open_ids.discard(path_ids.pop())
    return result

datablade-0.0.0/src/datablade/core/messages.py ADDED Viewed

@@ -0,0 +1,11 @@
def print_verbose(message: str, verbose: bool=True) -> None:
    """
    Write a message to standard output unless verbosity is switched off.
    Parameters:
        message (str): The text to emit; it is passed through str() before printing.
        verbose (bool): When falsy, nothing is printed.
    """
    if not verbose:
        return
    text = str(message)
    print(text)

datablade-0.0.0/src/datablade/core/strings.py ADDED Viewed

@@ -0,0 +1,43 @@
+from .messages import print_verbose
+import pathlib
+def sql_quotename(name: str=None, brackets: bool=True, ticks: bool=False, verbose: bool=False) -> str|None:
+    """
+    Quote a SQL Server name string.
+    Parameters:
+        name (str): The name to quote.
+        brackets (bool): Whether to use brackets.
+    Returns:
+        str: The quoted name.
+    """
+    if name is None:
+        print_verbose("No name provided; exiting sql_quotename.", verbose)
+        exit
+    return_value = f"{name.replace('[','').replace(']','')}"
+    if brackets:
+        return_value = f"[{return_value}]"
+    if ticks or not brackets:
+        return_value = f"'{return_value}'"
+    return return_value
+def pathing(input: str | pathlib.Path, verbose: bool=False) -> pathlib.Path|None:
+    """
+    Standardize a path string.
+    Parameters:
+        path (str): The path to standardize.
+    Returns:
+        str: The standardized path.
+    """
+    if input is None:
+        print_verbose("No path provided; exiting pathing.", verbose)
+        exit
+    if isinstance(input, str):
+        input.replace('\\','/')
+        input = pathlib.Path(input)
+    else:
+        input = input
+    if input.exists():
+        return input
+    else:
+        print_verbose(f"Path {input} does not exist; exiting pathing.", verbose)
+        exit

datablade-0.0.0/src/datablade/core/zip.py ADDED Viewed

@@ -0,0 +1,24 @@
+import requests, zipfile, io, pathlib
+from .messages import print_verbose
+from .strings import pathing
def get(url:str, path:str|pathlib.Path=None, verbose:bool=False, **kwargs) -> None|io.BytesIO:
    """
    Download a zip archive from a URL and either return it or extract it.
    Parameters:
        url (str): The address of the zip archive.
        path (str | pathlib.Path): Directory to extract into. It is created when missing.
                                   When None, nothing is written to disk.
        verbose (bool): If True, progress and error messages are printed.
        **kwargs: Passed unchanged to requests.get (for example timeout or headers).
    Returns:
        io.BytesIO: The downloaded bytes when path is None; otherwise None.
                    None is also returned when the request fails.
    Raises:
        ValueError: If an archive member would be written outside of path.
    """
    try:
        print_verbose(f"Downloading {url}", verbose=verbose)
        data = requests.get(url, **kwargs).content
    except requests.exceptions.RequestException as e:
        print_verbose(f"Error: {e}", verbose=verbose)
        return None
    zip_buffer = io.BytesIO(data)
    if path is None:
        return zip_buffer
    print_verbose(f"Saving data to {path}", verbose=verbose)
    # Resolve the target directory once, creating it if needed, instead of
    # re-validating it for every archive member.
    target_dir = pathlib.Path(path)
    target_dir.mkdir(parents=True, exist_ok=True)
    target_root = target_dir.resolve()
    with zipfile.ZipFile(zip_buffer, 'r') as zip_ref:
        for zip_info in zip_ref.infolist():
            extract_path = target_dir / zip_info.filename
            # The archive comes from the network: refuse members whose name
            # (absolute path or '..' components) escapes the target directory.
            if not extract_path.resolve().is_relative_to(target_root):
                raise ValueError(f"Unsafe path in zip archive: {zip_info.filename}")
            if zip_info.is_dir():
                # Directory entries carry no data; opening them as files fails.
                extract_path.mkdir(parents=True, exist_ok=True)
                continue
            extract_path.parent.mkdir(parents=True, exist_ok=True)
            with open(extract_path, 'wb') as f:
                f.write(zip_ref.read(zip_info.filename))
    return None

datablade-0.0.0/src/datablade.egg-info/PKG-INFO ADDED Viewed

@@ -0,0 +1,13 @@
+Metadata-Version: 2.1
+Name: datablade
+Version: 0.0.0
+Summary: datablade is a suite of functions to provide standard syntax across projects.
+Author: Brent Carpenetti
+Author-email: brentcarpenetti@gmail.com
+License: MIT
+License-File: LICENSE
+Requires-Dist: pandas
+Requires-Dist: pyarrow
+Requires-Dist: numpy
+Requires-Dist: openpyxl
+Requires-Dist: requests

datablade-0.0.0/src/datablade.egg-info/SOURCES.txt ADDED Viewed

@@ -0,0 +1,16 @@
+LICENSE
+pyproject.toml
+setup.py
+src/datablade/__init__.py
+src/datablade.egg-info/PKG-INFO
+src/datablade.egg-info/SOURCES.txt
+src/datablade.egg-info/dependency_links.txt
+src/datablade.egg-info/requires.txt
+src/datablade.egg-info/top_level.txt
+src/datablade/core/__init__.py
+src/datablade/core/frames.py
+src/datablade/core/json.py
+src/datablade/core/lists.py
+src/datablade/core/messages.py
+src/datablade/core/strings.py
+src/datablade/core/zip.py

datablade-0.0.0/src/datablade.egg-info/dependency_links.txt ADDED Viewed

@@ -0,0 +1 @@
+

datablade-0.0.0/src/datablade.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,5 @@
+pandas
+pyarrow
+numpy
+openpyxl
+requests

datablade-0.0.0/src/datablade.egg-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+datablade