esgpull 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
esgpull/models/query.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  from collections.abc import Iterator, MutableMapping, Sequence
4
4
  from datetime import datetime, timezone
5
- from typing import Any, Literal
5
+ from typing import TYPE_CHECKING, Any, Literal
6
6
 
7
7
  import sqlalchemy as sa
8
8
  from rich.console import Console, ConsoleOptions
@@ -15,10 +15,15 @@ from typing_extensions import NotRequired, TypedDict
15
15
  from esgpull import utils
16
16
  from esgpull.exceptions import UntrackableQuery
17
17
  from esgpull.models.base import Base, Sha
18
+ from esgpull.models.dataset import Dataset
18
19
  from esgpull.models.file import FileDict, FileStatus
19
20
  from esgpull.models.options import Options
20
21
  from esgpull.models.selection import FacetValues, Selection
21
22
  from esgpull.models.tag import Tag
23
+
24
+ if TYPE_CHECKING:
25
+ from esgpull.models.dataset import Dataset
26
+
22
27
  from esgpull.models.utils import (
23
28
  find_int,
24
29
  find_str,
@@ -55,9 +60,14 @@ query_tag_proxy = sa.Table(
55
60
 
56
61
  class File(Base):
57
62
  __tablename__ = "file"
63
+ __table_args__ = (
64
+ sa.Index("ix_file_dataset_status", "dataset_id", "status"),
65
+ )
58
66
 
59
67
  file_id: Mapped[str] = mapped_column(sa.String(255), unique=True)
60
- dataset_id: Mapped[str] = mapped_column(sa.String(255))
68
+ dataset_id: Mapped[str] = mapped_column(
69
+ sa.String(255), sa.ForeignKey("dataset.dataset_id")
70
+ )
61
71
  master_id: Mapped[str] = mapped_column(sa.String(255))
62
72
  url: Mapped[str] = mapped_column(sa.String(255))
63
73
  version: Mapped[str] = mapped_column(sa.String(16))
@@ -76,6 +86,11 @@ class File(Base):
76
86
  back_populates="files",
77
87
  repr=False,
78
88
  )
89
+ dataset: Mapped["Dataset"] = relationship(
90
+ back_populates="files",
91
+ init=False,
92
+ repr=False,
93
+ )
79
94
 
80
95
  def _as_bytes(self) -> bytes:
81
96
  self_tuple = (self.file_id, self.checksum)
@@ -100,7 +115,7 @@ class File(Base):
100
115
  size=source["size"],
101
116
  )
102
117
  if "status" in source:
103
- result.status = FileStatus(source.get("source"))
118
+ result.status = FileStatus(source.get("status").lower())
104
119
  return result
105
120
 
106
121
  @classmethod
@@ -395,11 +410,6 @@ class Query(Base):
395
410
  self.tags.remove(tag)
396
411
  return tag is not None
397
412
 
398
- def no_require(self) -> Query:
399
- cl = self.clone(compute_sha=False)
400
- cl._rich_no_require = True # type: ignore [attr-defined]
401
- return cl
402
-
403
413
  def __lshift__(self, child: Query) -> Query:
404
414
  result = self.clone(compute_sha=False)
405
415
  # if self.name != child.require:
@@ -440,7 +450,7 @@ class Query(Base):
440
450
 
441
451
  __rich_measure__ = rich_measure_impl
442
452
 
443
- def _rich_tree(self) -> Tree:
453
+ def _rich_tree(self, hide_require: bool = False) -> Tree:
444
454
  title = Text.from_markup(self.rich_name)
445
455
  if not self.tracked:
446
456
  title.append(" untracked", style="i red")
@@ -449,7 +459,7 @@ class Query(Base):
449
459
  f"\n│ updated {format_date_iso(self.updated_at)}"
450
460
  )
451
461
  contents = Table.grid(padding=(0, 1))
452
- if not hasattr(self, "_rich_no_require") and self.require is not None:
462
+ if not hide_require and self.require is not None:
453
463
  if len(self.require) == 40:
454
464
  require = Text(short_sha(self.require), style="i green")
455
465
  else:
@@ -482,10 +492,44 @@ class Query(Base):
482
492
  count_ondisk, size_ondisk = self.files_count_size(FileStatus.Done)
483
493
  count_total, size_total = self.files_count_size()
484
494
  sizes = f"{format_size(size_ondisk)} / {format_size(size_total)}"
485
- lens = f"{count_ondisk}/{count_total}"
486
- contents.add_row(
487
- "files:", Text(f"{sizes} [{lens}]", style="magenta")
488
- )
495
+ lens = f"{count_ondisk} / {count_total}"
496
+
497
+ # Add dataset completion info
498
+ complete_datasets = 0
499
+ total_datasets = 0
500
+ session = object_session(self)
501
+ orphaned_dataset_count = 0
502
+
503
+ if session is not None:
504
+ from esgpull.models import sql
505
+
506
+ dataset_stats = session.execute(
507
+ sql.dataset.query_stats(self.sha)
508
+ ).all()
509
+
510
+ # Check for orphaned datasets (dataset_ids from files not in Dataset table)
511
+ orphaned_dataset_count = (
512
+ session.scalar(sql.dataset.orphaned(self.sha)) or 0
513
+ )
514
+
515
+ # Compute counts in Python - simpler and more maintainable
516
+ total_datasets = len(dataset_stats)
517
+ complete_datasets = sum(
518
+ 1 for d in dataset_stats if d.done_count == d.total_files
519
+ )
520
+
521
+ contents.add_row("files:", Text(f"{lens}", style="magenta"))
522
+ if orphaned_dataset_count > 0:
523
+ contents.add_row(
524
+ "datasets:",
525
+ "[magenta]? / ?[/] [yellow italic]<- update for accurate datasets[/]",
526
+ )
527
+ else:
528
+ contents.add_row(
529
+ "datasets:",
530
+ f"[magenta]{complete_datasets} / {total_datasets}",
531
+ )
532
+ contents.add_row("size:", Text(f"{sizes}", style="magenta"))
489
533
  tree = Tree("", hide_root=True, guide_style="dim").add(title)
490
534
  if contents.row_count:
491
535
  tree.add(contents)
esgpull/models/sql.py CHANGED
@@ -3,6 +3,7 @@ import functools
3
3
  import sqlalchemy as sa
4
4
 
5
5
  from esgpull.models import Table
6
+ from esgpull.models.dataset import Dataset
6
7
  from esgpull.models.facet import Facet
7
8
  from esgpull.models.file import FileStatus
8
9
  from esgpull.models.query import File, Query, query_file_proxy, query_tag_proxy
@@ -11,15 +12,6 @@ from esgpull.models.synda_file import SyndaFile
11
12
  from esgpull.models.tag import Tag
12
13
 
13
14
 
14
- def count(item: Table) -> sa.Select[tuple[int]]:
15
- table = item.__class__
16
- return (
17
- sa.select(sa.func.count("*"))
18
- .select_from(table)
19
- .filter_by(sha=item.sha)
20
- )
21
-
22
-
23
15
  def count_table(table: type[Table]) -> sa.Select[tuple[int]]:
24
16
  return sa.select(sa.func.count("*")).select_from(table)
25
17
 
@@ -148,6 +140,45 @@ class file:
148
140
  return stmt
149
141
 
150
142
 
143
+ class dataset:
144
+ @staticmethod
145
+ @functools.cache
146
+ def query_stats(query_sha: str) -> sa.Select[tuple[str, int, int]]:
147
+ return (
148
+ sa.select(
149
+ Dataset.dataset_id,
150
+ Dataset.total_files,
151
+ sa.func.count(
152
+ sa.case((File.status == FileStatus.Done, 1))
153
+ ).label("done_count"),
154
+ )
155
+ .join(File)
156
+ .join(query_file_proxy)
157
+ .filter(query_file_proxy.c.query_sha == query_sha)
158
+ .filter(File.dataset_id.isnot(None))
159
+ .group_by(Dataset.dataset_id, Dataset.total_files)
160
+ )
161
+
162
+ @staticmethod
163
+ @functools.cache
164
+ def orphaned(query_sha: str) -> sa.Select[tuple[int]]:
165
+ return (
166
+ sa.select(sa.func.count(sa.distinct(File.dataset_id)))
167
+ .join(query_file_proxy)
168
+ .filter(query_file_proxy.c.query_sha == query_sha)
169
+ .filter(File.dataset_id.isnot(None))
170
+ .filter(~File.dataset_id.in_(sa.select(Dataset.dataset_id)))
171
+ )
172
+
173
+ @staticmethod
174
+ @functools.cache
175
+ def is_complete(dataset: Dataset) -> sa.Select[tuple[bool]]:
176
+ return sa.select(
177
+ sa.func.count(sa.case((File.status == FileStatus.Done, 1)))
178
+ == dataset.total_files
179
+ ).where(File.dataset_id == dataset.dataset_id)
180
+
181
+
151
182
  class query:
152
183
  @staticmethod
153
184
  @functools.cache