dsgrid-toolkit 0.3.3__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- build_backend.py +93 -0
- dsgrid/__init__.py +22 -0
- dsgrid/api/__init__.py +0 -0
- dsgrid/api/api_manager.py +179 -0
- dsgrid/api/app.py +419 -0
- dsgrid/api/models.py +60 -0
- dsgrid/api/response_models.py +116 -0
- dsgrid/apps/__init__.py +0 -0
- dsgrid/apps/project_viewer/app.py +216 -0
- dsgrid/apps/registration_gui.py +444 -0
- dsgrid/chronify.py +32 -0
- dsgrid/cli/__init__.py +0 -0
- dsgrid/cli/common.py +120 -0
- dsgrid/cli/config.py +176 -0
- dsgrid/cli/download.py +13 -0
- dsgrid/cli/dsgrid.py +157 -0
- dsgrid/cli/dsgrid_admin.py +92 -0
- dsgrid/cli/install_notebooks.py +62 -0
- dsgrid/cli/query.py +729 -0
- dsgrid/cli/registry.py +1862 -0
- dsgrid/cloud/__init__.py +0 -0
- dsgrid/cloud/cloud_storage_interface.py +140 -0
- dsgrid/cloud/factory.py +31 -0
- dsgrid/cloud/fake_storage_interface.py +37 -0
- dsgrid/cloud/s3_storage_interface.py +156 -0
- dsgrid/common.py +36 -0
- dsgrid/config/__init__.py +0 -0
- dsgrid/config/annual_time_dimension_config.py +194 -0
- dsgrid/config/common.py +142 -0
- dsgrid/config/config_base.py +148 -0
- dsgrid/config/dataset_config.py +907 -0
- dsgrid/config/dataset_schema_handler_factory.py +46 -0
- dsgrid/config/date_time_dimension_config.py +136 -0
- dsgrid/config/dimension_config.py +54 -0
- dsgrid/config/dimension_config_factory.py +65 -0
- dsgrid/config/dimension_mapping_base.py +350 -0
- dsgrid/config/dimension_mappings_config.py +48 -0
- dsgrid/config/dimensions.py +1025 -0
- dsgrid/config/dimensions_config.py +71 -0
- dsgrid/config/file_schema.py +190 -0
- dsgrid/config/index_time_dimension_config.py +80 -0
- dsgrid/config/input_dataset_requirements.py +31 -0
- dsgrid/config/mapping_tables.py +209 -0
- dsgrid/config/noop_time_dimension_config.py +42 -0
- dsgrid/config/project_config.py +1462 -0
- dsgrid/config/registration_models.py +188 -0
- dsgrid/config/representative_period_time_dimension_config.py +194 -0
- dsgrid/config/simple_models.py +49 -0
- dsgrid/config/supplemental_dimension.py +29 -0
- dsgrid/config/time_dimension_base_config.py +192 -0
- dsgrid/data_models.py +155 -0
- dsgrid/dataset/__init__.py +0 -0
- dsgrid/dataset/dataset.py +123 -0
- dsgrid/dataset/dataset_expression_handler.py +86 -0
- dsgrid/dataset/dataset_mapping_manager.py +121 -0
- dsgrid/dataset/dataset_schema_handler_base.py +945 -0
- dsgrid/dataset/dataset_schema_handler_one_table.py +209 -0
- dsgrid/dataset/dataset_schema_handler_two_table.py +322 -0
- dsgrid/dataset/growth_rates.py +162 -0
- dsgrid/dataset/models.py +51 -0
- dsgrid/dataset/table_format_handler_base.py +257 -0
- dsgrid/dataset/table_format_handler_factory.py +17 -0
- dsgrid/dataset/unpivoted_table.py +121 -0
- dsgrid/dimension/__init__.py +0 -0
- dsgrid/dimension/base_models.py +230 -0
- dsgrid/dimension/dimension_filters.py +308 -0
- dsgrid/dimension/standard.py +252 -0
- dsgrid/dimension/time.py +352 -0
- dsgrid/dimension/time_utils.py +103 -0
- dsgrid/dsgrid_rc.py +88 -0
- dsgrid/exceptions.py +105 -0
- dsgrid/filesystem/__init__.py +0 -0
- dsgrid/filesystem/cloud_filesystem.py +32 -0
- dsgrid/filesystem/factory.py +32 -0
- dsgrid/filesystem/filesystem_interface.py +136 -0
- dsgrid/filesystem/local_filesystem.py +74 -0
- dsgrid/filesystem/s3_filesystem.py +118 -0
- dsgrid/loggers.py +132 -0
- dsgrid/minimal_patterns.cp313-win_amd64.pyd +0 -0
- dsgrid/notebooks/connect_to_dsgrid_registry.ipynb +949 -0
- dsgrid/notebooks/registration.ipynb +48 -0
- dsgrid/notebooks/start_notebook.sh +11 -0
- dsgrid/project.py +451 -0
- dsgrid/query/__init__.py +0 -0
- dsgrid/query/dataset_mapping_plan.py +142 -0
- dsgrid/query/derived_dataset.py +388 -0
- dsgrid/query/models.py +728 -0
- dsgrid/query/query_context.py +287 -0
- dsgrid/query/query_submitter.py +994 -0
- dsgrid/query/report_factory.py +19 -0
- dsgrid/query/report_peak_load.py +70 -0
- dsgrid/query/reports_base.py +20 -0
- dsgrid/registry/__init__.py +0 -0
- dsgrid/registry/bulk_register.py +165 -0
- dsgrid/registry/common.py +287 -0
- dsgrid/registry/config_update_checker_base.py +63 -0
- dsgrid/registry/data_store_factory.py +34 -0
- dsgrid/registry/data_store_interface.py +74 -0
- dsgrid/registry/dataset_config_generator.py +158 -0
- dsgrid/registry/dataset_registry_manager.py +950 -0
- dsgrid/registry/dataset_update_checker.py +16 -0
- dsgrid/registry/dimension_mapping_registry_manager.py +575 -0
- dsgrid/registry/dimension_mapping_update_checker.py +16 -0
- dsgrid/registry/dimension_registry_manager.py +413 -0
- dsgrid/registry/dimension_update_checker.py +16 -0
- dsgrid/registry/duckdb_data_store.py +207 -0
- dsgrid/registry/filesystem_data_store.py +150 -0
- dsgrid/registry/filter_registry_manager.py +123 -0
- dsgrid/registry/project_config_generator.py +57 -0
- dsgrid/registry/project_registry_manager.py +1623 -0
- dsgrid/registry/project_update_checker.py +48 -0
- dsgrid/registry/registration_context.py +223 -0
- dsgrid/registry/registry_auto_updater.py +316 -0
- dsgrid/registry/registry_database.py +667 -0
- dsgrid/registry/registry_interface.py +446 -0
- dsgrid/registry/registry_manager.py +558 -0
- dsgrid/registry/registry_manager_base.py +367 -0
- dsgrid/registry/versioning.py +92 -0
- dsgrid/rust_ext/__init__.py +14 -0
- dsgrid/rust_ext/find_minimal_patterns.py +129 -0
- dsgrid/spark/__init__.py +0 -0
- dsgrid/spark/functions.py +589 -0
- dsgrid/spark/types.py +110 -0
- dsgrid/tests/__init__.py +0 -0
- dsgrid/tests/common.py +140 -0
- dsgrid/tests/make_us_data_registry.py +265 -0
- dsgrid/tests/register_derived_datasets.py +103 -0
- dsgrid/tests/utils.py +25 -0
- dsgrid/time/__init__.py +0 -0
- dsgrid/time/time_conversions.py +80 -0
- dsgrid/time/types.py +67 -0
- dsgrid/units/__init__.py +0 -0
- dsgrid/units/constants.py +113 -0
- dsgrid/units/convert.py +71 -0
- dsgrid/units/energy.py +145 -0
- dsgrid/units/power.py +87 -0
- dsgrid/utils/__init__.py +0 -0
- dsgrid/utils/dataset.py +830 -0
- dsgrid/utils/files.py +179 -0
- dsgrid/utils/filters.py +125 -0
- dsgrid/utils/id_remappings.py +100 -0
- dsgrid/utils/py_expression_eval/LICENSE +19 -0
- dsgrid/utils/py_expression_eval/README.md +8 -0
- dsgrid/utils/py_expression_eval/__init__.py +847 -0
- dsgrid/utils/py_expression_eval/tests.py +283 -0
- dsgrid/utils/run_command.py +70 -0
- dsgrid/utils/scratch_dir_context.py +65 -0
- dsgrid/utils/spark.py +918 -0
- dsgrid/utils/spark_partition.py +98 -0
- dsgrid/utils/timing.py +239 -0
- dsgrid/utils/utilities.py +221 -0
- dsgrid/utils/versioning.py +36 -0
- dsgrid_toolkit-0.3.3.dist-info/METADATA +193 -0
- dsgrid_toolkit-0.3.3.dist-info/RECORD +157 -0
- dsgrid_toolkit-0.3.3.dist-info/WHEEL +4 -0
- dsgrid_toolkit-0.3.3.dist-info/entry_points.txt +4 -0
- dsgrid_toolkit-0.3.3.dist-info/licenses/LICENSE +29 -0
dsgrid/utils/spark_partition.py
ADDED
@@ -0,0 +1,98 @@
import logging
import math

from dsgrid.utils.timing import timed_info

logger = logging.getLogger(__name__)


class SparkPartition:
    def __init__(self):
        return

    def get_data_size(self, df, bytes_per_cell=8):
        """approximate dataset size

        Parameters
        ----------
        df : DataFrame
        bytes_per_cell : [float, int]
            Estimated number of bytes per cell in a dataframe.
            * 4-bytes = 32-bit = Single-precision Float = pyspark.sql.types.FloatType,
            * 8-bytes = 64-bit = Double-precision float = pyspark.sql.types.DoubleType,

        Returns
        -------
        n_rows : int
            Number of rows in df
        n_cols : int
            Number of columns in df
        data_MB : float
            Estimated size of df in memory in MB

        """
        n_rows = df.count()
        n_cols = len(df.columns)
        data_MB = n_rows * n_cols * bytes_per_cell / 1e6  # MB
        return n_rows, n_cols, data_MB

    @timed_info
    def get_optimal_number_of_files(self, df, MB_per_cmp_file=128, cmp_ratio=0.18):
        """calculate *optimal* number of files

        Parameters
        ----------
        df : DataFrame
        MB_per_cmp_file : float
            Desired size of compressed file on disk in MB
        cmp_ratio : float
            Ratio of file size after and before compression

        Returns
        -------
        n_files : int
            Number of files
        """
        _, _, data_MB = self.get_data_size(df)
        MB_per_file = MB_per_cmp_file / cmp_ratio
        n_files = math.ceil(data_MB / MB_per_file)

        logger.info(
            f"Dataframe is approximately {data_MB:.02f} MB in size, "
            f"ideal to split into {n_files} file(s) at {MB_per_cmp_file:.1f} MB compressed on disk "
            f"({MB_per_file:.1f} MB uncompressed in memory, {cmp_ratio} compression ratio)."
        )
        return n_files

    @timed_info
    def file_size_if_partition_by(self, df, key):
        """calculate sharded file size based on partitionBy key"""
        n_rows, n_cols, data_MB = self.get_data_size(df)
        n_partitions = df.select(key).distinct().count()
        avg_MB = round(data_MB / n_partitions, 2)

        n_rows_largest_part = df.groupBy(key).count().orderBy("count", ascending=False).first()[1]
        n_rows_smallest_part = df.groupBy(key).count().orderBy("count", ascending=True).first()[1]

        largest_MB = round(data_MB / n_rows * n_rows_largest_part, 2)
        smallest_MB = round(data_MB / n_rows * n_rows_smallest_part, 2)

        report = (
            f'Partitioning by "{key}" will yield: \n'
            + f"  - # of partitions: {n_partitions} \n"
            + f"  - avg partition size: {avg_MB} MB \n"
            + f"  - largest partition: {largest_MB} MB \n"
            + f"  - smallest partition: {smallest_MB} MB \n"
        )

        logger.info(report)

        output = {
            key: {
                "n_partitions": n_partitions,
                "avg_partition_MB": avg_MB,
                "max_partition_MB": largest_MB,
                "min_partition_MB": smallest_MB,
            }
        }

        return output
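For context on the sizing math: with the defaults, each output file targets MB_per_cmp_file / cmp_ratio = 128 / 0.18 ≈ 711 MB of uncompressed in-memory data. A minimal usage sketch follows, assuming a local pyspark installation; the session setup, toy DataFrame, and output path are illustrative and not part of the package.

from pyspark.sql import SparkSession

from dsgrid.utils.spark_partition import SparkPartition

# Illustrative local session and toy frame: one million rows of one
# 8-byte LongType column, so roughly 8 MB in memory by this estimate.
spark = SparkSession.builder.master("local[2]").getOrCreate()
df = spark.range(1_000_000).withColumnRenamed("id", "value")

part = SparkPartition()
n_rows, n_cols, data_MB = part.get_data_size(df)  # (1000000, 1, 8.0)
n_files = part.get_optimal_number_of_files(df)    # ceil(8 / 711.1) == 1
df.coalesce(n_files).write.parquet("/tmp/sized_output", mode="overwrite")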
dsgrid/utils/timing.py
ADDED
@@ -0,0 +1,239 @@
"""Utility functions for timing measurements."""

import functools
import logging
import time
from pathlib import Path

from dsgrid.utils.files import dump_line_delimited_json

logger = logging.getLogger(__name__)


def timed_info(func):
    """Decorator to measure and logger.info a function's execution time."""

    @functools.wraps(func)
    def timed_(*args, **kwargs):
        return _timed(func, logger.info, *args, **kwargs)

    return timed_


def timed_debug(func):
    """Decorator to measure and logger.debug a function's execution time."""

    @functools.wraps(func)
    def timed_(*args, **kwargs):
        return _timed(func, logger.debug, *args, **kwargs)

    return timed_


def _timed(func, log_func, *args, **kwargs):
    start = time.time()
    result = func(*args, **kwargs)
    total = time.time() - start
    log_func("execution-time=%s func=%s", get_time_duration_string(total), func.__name__)
    return result


def get_time_duration_string(seconds):
    """Returns a string with the time converted to reasonable units."""
    if seconds >= 1:
        val = "{:.3f} s".format(seconds)
    elif seconds >= 0.001:
        val = "{:.3f} ms".format(seconds * 1000)
    elif seconds >= 0.000001:
        val = "{:.3f} us".format(seconds * 1000000)
    elif seconds == 0:
        val = "0 s"
    else:
        val = "{:.3f} ns".format(seconds * 1000000000)

    return val


class TimerStats:
    """Tracks timing stats for one code block."""

    def __init__(self, name):
        self._name = name
        self._count = 0
        self._max = 0.0
        self._min = None
        self._avg = 0.0
        self._total = 0.0

    def get_stats(self):
        """Get the current stats summary.

        Returns
        -------
        dict

        """
        avg = 0 if self._count == 0 else self._total / self._count
        return {
            "min": self._min,
            "max": self._max,
            "total": self._total,
            "avg": avg,
            "count": self._count,
        }

    def log_stats(self):
        """Log a summary of the stats."""
        if self._count == 0:
            logger.info("No stats have been recorded for %s.", self._name)
            return

        x = self.get_stats()
        text = "total={:.3f}s avg={:.3f}ms max={:.3f}ms min={:.3f}ms count={}".format(
            x["total"], x["avg"] * 1000, x["max"] * 1000, x["min"] * 1000, x["count"]
        )
        logger.info("TimerStats summary: %s: %s", self._name, text)

    def update(self, duration):
        """Update the stats with a new timing."""
        self._count += 1
        self._total += duration
        if duration > self._max:
            self._max = duration
        if self._min is None or duration < self._min:
            self._min = duration


class Timer:
    """Times a code block."""

    def __init__(self, timer_stats, name):
        self._start = None
        self._timer_stat = timer_stats.get_stat(name)

    def __enter__(self):
        if self._timer_stat is not None:
            self._start = time.perf_counter()

    def __exit__(self, exc, value, tb):
        if self._timer_stat is not None:
            self._timer_stat.update(time.perf_counter() - self._start)


def track_timing(collector):
    """Decorator to track statistics on a function's execution time.

    Parameters
    ----------
    collector : TimerStatsCollector

    """

    def wrap(func):
        def timed_(*args, **kwargs):
            return _timed_func(collector, func, *args, **kwargs)

        return timed_

    return wrap


def _timed_func(timer_stats, func, *args, **kwargs):
    with Timer(timer_stats, func.__qualname__):
        return func(*args, **kwargs)


class TimerStatsCollector:
    """Collects statistics for timed code segments."""

    def __init__(self, is_enabled=False):
        self._stats = {}
        self._is_enabled = is_enabled

    def clear(self):
        """Clear all stats."""
        self._stats.clear()

    def disable(self):
        """Disable timing."""
        self._is_enabled = False

    def enable(self):
        """Enable timing."""
        self._is_enabled = True

    def get_stat(self, name):
        """Return a TimerStats. Return None if timing is disabled.

        Parameters
        ----------
        name : str

        Returns
        -------
        TimerStats | None

        """
        if not self._is_enabled:
            return None
        if name not in self._stats:
            self.register_stat(name)
        return self._stats[name]

    @property
    def is_enabled(self) -> bool:
        """Return True if timing is enabled."""
        return self._is_enabled

    def log_json_stats(self, filename: Path, clear=False):
        """Log line-delimited JSON stats to filename.

        Parameters
        ----------
        filename : Path
        clear : bool
            If True, clear all stats.
        """
        if self._is_enabled:
            rows = []
            for name, stat in self._stats.items():
                row = {"name": name}
                row.update(stat.get_stats())
                rows.append(row)
            dump_line_delimited_json(rows, filename, mode="a")
            if clear:
                self._stats.clear()

    def log_stats(self, clear=False):
        """Log statistics for all tracked stats.

        Parameters
        ----------
        clear : bool
            If True, clear all stats.
        """
        if self._is_enabled:
            for stat in self._stats.values():
                stat.log_stats()
            if clear:
                self._stats.clear()

    def register_stat(self, name):
        """Register tracking of a new stat.

        Parameters
        ----------
        name : str

        Returns
        -------
        TimerStats

        """
        if self._is_enabled:
            assert name not in self._stats
            stat = TimerStats(name)
            self._stats[name] = stat
            return stat


timer_stats_collector = TimerStatsCollector()
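A minimal usage sketch for the timing utilities above (illustrative, not from the package). The module-level timer_stats_collector starts disabled, so it must be enabled before any stats are recorded; the function names and sleep durations here are toy examples.

import logging
import time

from dsgrid.utils.timing import timed_info, timer_stats_collector, track_timing

logging.basicConfig(level=logging.INFO)
timer_stats_collector.enable()  # collector is disabled by default


@track_timing(timer_stats_collector)
def load_chunk():
    # Stand-in for real work; each call's duration is recorded.
    time.sleep(0.01)


@timed_info
def run():
    for _ in range(5):
        load_chunk()


run()                              # logs "execution-time=... func=run"
timer_stats_collector.log_stats()  # logs total/avg/max/min/count for load_chunk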
dsgrid/utils/utilities.py
ADDED
@@ -0,0 +1,221 @@
"""
Helpful utility functions for dsgrid
"""

import logging
import inspect
import json
import os
from enum import Enum
from typing import Iterable

from prettytable import PrettyTable

try:
    from IPython.display import display, HTML
    from IPython import get_ipython
    from ipykernel.zmqshell import ZMQInteractiveShell

    _IPYTHON_INSTALLED = True
except ImportError:
    _IPYTHON_INSTALLED = False


from dsgrid.exceptions import DSGJSONError

logger = logging.getLogger(__name__)


def safe_json_load(fpath):
    """Perform a json file load with better exception handling.

    Parameters
    ----------
    fpath : str
        Filepath to .json file.

    Returns
    -------
    j : dict
        Loaded json dictionary.

    Examples
    --------
    >>> json_path = "./path_to_json.json"
    >>> safe_json_load(json_path)
    {key1: value1,
     key2: value2}
    """
    if not isinstance(fpath, str):
        msg = "Filepath must be str to load json: {}".format(fpath)
        raise TypeError(msg)

    if not fpath.endswith(".json"):
        msg = "Filepath must end in .json to load json: {}".format(fpath)
        raise DSGJSONError(msg)

    if not os.path.isfile(fpath):
        msg = "Could not find json file to load: {}".format(fpath)
        raise DSGJSONError(msg)

    try:
        with open(fpath, "r") as f:
            j = json.load(f)
    except json.decoder.JSONDecodeError as e:
        emsg = 'JSON Error:\n{}\nCannot read json file: "{}"'.format(e, fpath)
        raise DSGJSONError(emsg)

    return j


def get_class_properties(cls):
    """Get all class properties

    Used to check against config keys

    Returns
    -------
    properties : list
        List of class properties, each of which should represent a valid
        config key/entry
    """
    properties = [
        attr for attr, attr_obj in inspect.getmembers(cls) if isinstance(attr_obj, property)
    ]

    return properties


def check_uniqueness(iterable: Iterable, tag: str) -> set[str]:
    """Raises ValueError if iterable has duplicate entries.

    Parameters
    ----------
    iterable : list | generator
    tag : str
        Tag to add to the exception string.

    Returns
    -------
    set[str]

    """
    values = set()
    for item in iterable:
        if item in values:
            msg = f"duplicate {tag}: {item}"
            raise ValueError(msg)
        values.add(item)
    return values


def convert_record_dicts_to_classes(iterable, cls, check_duplicates: None | list[str] = None):
    """Convert an iterable of dicts to instances of a data class.

    Parameters
    ----------
    iterable
        Any iterable of dicts that must have an 'id' field.
    cls : class
        Instantiate a class from each dict by splatting the dict to the constructor.
    check_duplicates : None | list[str]
        If it is a list of column names, ensure that there are no duplicates among the rows.

    Returns
    -------
    list
    """
    records = []
    check_duplicates = check_duplicates or []
    values = {x: set() for x in check_duplicates}
    length = None
    for row in iterable:
        if None in row:
            msg = f"row has a key that is None: {row=}"
            raise ValueError(msg)
        if length is None:
            length = len(row)
        elif len(row) != length:
            msg = f"Rows have inconsistent length: first_row_length={length} {row=}"
            raise ValueError(msg)
        record = cls(**row)
        for name in check_duplicates:
            val = getattr(record, name)
            if val in values[name]:
                msg = f"{val} is listed multiple times"
                raise ValueError(msg)
            values[name].add(val)
        records.append(record)

    return records


def list_enum_values(enum: Enum):
    """Return a list of the enum's values."""
    return [e.value for e in enum]


def in_jupyter_notebook():
    """Returns True if the current interpreter is running in a Jupyter notebook.

    Returns
    -------
    bool

    """
    if not _IPYTHON_INSTALLED:
        return False

    return isinstance(get_ipython(), ZMQInteractiveShell)


def display_table(table: PrettyTable):
    """Displays a table in an ASCII or HTML format as determined by the current interpreter.

    Parameters
    ----------
    table : PrettyTable

    """
    if in_jupyter_notebook():
        display(HTML(table.get_html_string()))
    else:
        print(table)


def make_unique_key(base_name: str, existing_keys: Iterable[str]) -> str:
    """Generate a unique key by appending an index if the base name already exists.

    Parameters
    ----------
    base_name : str
        The base name to use as a key.
    existing_keys : Iterable[str]
        Collection of existing keys to check against.

    Returns
    -------
    str
        A unique key, either the base name or base name with an appended index
        (e.g., 'name_1', 'name_2').

    Examples
    --------
    >>> make_unique_key("file", {"other", "another"})
    'file'
    >>> make_unique_key("file", {"file", "other"})
    'file_1'
    >>> make_unique_key("file", {"file", "file_1", "file_2"})
    'file_3'
    """
    existing = set(existing_keys)
    if base_name not in existing:
        return base_name

    index = 1
    while True:
        new_key = f"{base_name}_{index}"
        if new_key not in existing:
            return new_key
        index += 1
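A minimal sketch of the record-conversion and uniqueness helpers above, using a toy dataclass that is not part of dsgrid:

from dataclasses import dataclass

from dsgrid.utils.utilities import check_uniqueness, convert_record_dicts_to_classes


@dataclass
class Record:  # hypothetical record type for illustration
    id: str
    name: str


rows = [{"id": "a", "name": "x"}, {"id": "b", "name": "y"}]
# Splats each dict into Record(**row) and rejects duplicate "id" values.
records = convert_record_dicts_to_classes(rows, Record, check_duplicates=["id"])
# Raises ValueError("duplicate record name: ...") if any name repeats.
check_uniqueness((r.name for r in records), "record name")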
dsgrid/utils/versioning.py
ADDED
@@ -0,0 +1,36 @@
"""Utility functions for versioning"""

from semver import VersionInfo


def handle_version_or_str(version):
    """Return a VersionInfo, parsing version if it is a str."""
    if isinstance(version, str):
        return make_version(version)
    return version


def make_version(version):
    """Convert the string version to a VersionInfo object.

    Parameters
    ----------
    version : str

    Returns
    -------
    VersionInfo

    Raises
    ------
    ValueError
        Raised if parsing fails.

    """
    try:
        return VersionInfo.parse(version)
    except Exception as exc:
        msg = f"Failed to create VersionInfo: {exc}"
        raise ValueError(msg) from exc
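A minimal sketch of the versioning helpers, assuming the semver package is installed (bump_minor is semver's API, not dsgrid's):

from dsgrid.utils.versioning import handle_version_or_str, make_version

v = make_version("1.2.3")        # VersionInfo(major=1, minor=2, patch=3)
same = handle_version_or_str(v)  # already a VersionInfo; returned unchanged
bumped = v.bump_minor()          # VersionInfo(major=1, minor=3, patch=0)
assert handle_version_or_str("1.3.0") == bumped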