esgpull 0.8.0__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
esgpull/fs.py CHANGED
@@ -10,7 +10,7 @@ from shutil import copyfile
 import aiofiles
 from aiofiles.threadpool.binary import AsyncBufferedIOBase
 
-from esgpull.config import Config
+from esgpull.config import Config, Paths
 from esgpull.models import File
 from esgpull.result import Err, Ok, Result
 from esgpull.tui import logger
@@ -63,45 +63,34 @@ class Digest:
 
 @dataclass
 class Filesystem:
-    auth: Path
-    data: Path
-    db: Path
-    log: Path
-    tmp: Path
+    paths: Paths
     disable_checksum: bool = False
     install: InitVar[bool] = True
 
     @staticmethod
     def from_config(config: Config, install: bool = False) -> Filesystem:
         return Filesystem(
-            auth=config.paths.auth,
-            data=config.paths.data,
-            db=config.paths.db,
-            log=config.paths.log,
-            tmp=config.paths.tmp,
+            paths=config.paths,
             disable_checksum=config.download.disable_checksum,
             install=install,
         )
 
     def __post_init__(self, install: bool = True) -> None:
         if install:
-            self.auth.mkdir(parents=True, exist_ok=True)
-            self.data.mkdir(parents=True, exist_ok=True)
-            self.db.mkdir(parents=True, exist_ok=True)
-            self.log.mkdir(parents=True, exist_ok=True)
-            self.tmp.mkdir(parents=True, exist_ok=True)
+            for path in self.paths.values():
+                path.mkdir(parents=True, exist_ok=True)
 
     def __getitem__(self, file: File) -> FilePath:
         if not isinstance(file, File):
            raise TypeError(file)
         return FilePath(
-            drs=self.data / file.local_path / file.filename,
-            tmp=self.tmp / f"{file.sha}.part",
+            drs=self.paths.data / file.local_path / file.filename,
+            tmp=self.paths.tmp / f"{file.sha}.part",
         )
 
     def glob_netcdf(self) -> Iterator[Path]:
-        for path in self.data.glob("**/*.nc"):
-            yield path.relative_to(self.data)
+        for path in self.paths.data.glob("**/*.nc"):
+            yield path.relative_to(self.paths.data)
 
     def open(self, file: File) -> FileObject:
         return FileObject(self[file])
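
The refactor collapses five separate Path attributes into one Paths object from esgpull.config, iterated with .values() to create directories. A minimal sketch of the shape this assumes (the real Paths class lives in esgpull.config and may differ in detail):

from dataclasses import dataclass, fields
from pathlib import Path
from typing import Iterator

@dataclass
class Paths:
    auth: Path
    data: Path
    db: Path
    log: Path
    tmp: Path

    def values(self) -> Iterator[Path]:
        # Iterating the dataclass fields lets Filesystem.__post_init__
        # create every directory with one loop instead of five mkdir calls.
        for f in fields(self):
            yield getattr(self, f.name)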
esgpull/graph.py CHANGED
@@ -418,7 +418,7 @@ class Graph:
         if keep_require:
             query_tree = query._rich_tree()
         else:
-            query_tree = query.no_require()._rich_tree()
+            query_tree = query._rich_tree(hide_require=True)
         if query_tree is not None:
             tree.add(query_tree)
             self.fill_tree(query, query_tree)
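
Graph.fill_tree now asks the query to hide its require line directly, instead of rendering a mutated clone; the query.py section below removes no_require() and gives _rich_tree() the hide_require flag. A before/after sketch of the call, with names as in the diff:

# old: render a clone tagged with a private attribute
# query_tree = query.no_require()._rich_tree()

# new: pass the intent explicitly
query_tree = query._rich_tree(hide_require=True)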
@@ -0,0 +1,28 @@
+"""update tables
+
+Revision ID: 0.9.0
+Revises: d14f179e553c
+Create Date: 2025-07-07 14:54:58.433022
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '0.9.0'
+down_revision = 'd14f179e553c'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
@@ -0,0 +1,28 @@
+"""update tables
+
+Revision ID: 0.9.1
+Revises: 0.9.0
+Create Date: 2025-08-08 10:38:14.204594
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '0.9.1'
+down_revision = '0.9.0'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
@@ -0,0 +1,32 @@
+"""file_add_composite_index_dataset_id_status
+
+Revision ID: d14f179e553c
+Revises: e7edab5d4e4b
+Create Date: 2025-06-18 16:05:35.721085
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'd14f179e553c'
+down_revision = 'e7edab5d4e4b'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.create_index('ix_file_dataset_status', ['dataset_id', 'status'], unique=False)
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.drop_index('ix_file_dataset_status')
+
+    # ### end Alembic commands ###
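
This revision adds a composite index over (dataset_id, status) on the file table, which matches the access pattern of the per-dataset done-file counts introduced in sql.py below. A hypothetical standalone illustration (not esgpull code; the schema is reduced to the relevant columns) of what such an index buys under SQLite:

import sqlite3

con = sqlite3.connect(":memory:")
con.execute(
    "CREATE TABLE file (sha TEXT PRIMARY KEY, dataset_id TEXT, status TEXT)"
)
con.execute(
    "CREATE INDEX ix_file_dataset_status ON file (dataset_id, status)"
)
# Counting finished files for one dataset can now be answered from the
# index alone, without touching the table rows.
plan = con.execute(
    "EXPLAIN QUERY PLAN "
    "SELECT count(*) FROM file WHERE dataset_id = ? AND status = ?",
    ("CMIP6.some.dataset.v1", "done"),
).fetchall()
print(plan)  # expect something like: SEARCH file USING COVERING INDEX ix_file_dataset_status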
@@ -0,0 +1,39 @@
+"""add_dataset_tracking
+
+Revision ID: e7edab5d4e4b
+Revises: 0.8.0
+Create Date: 2025-05-23 17:38:22.066153
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'e7edab5d4e4b'
+down_revision = '0.8.0'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('dataset',
+        sa.Column('dataset_id', sa.String(length=255), nullable=False),
+        sa.Column('total_files', sa.Integer(), nullable=False),
+        sa.Column('created_at', sa.DateTime(), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
+        sa.Column('updated_at', sa.DateTime(), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
+        sa.PrimaryKeyConstraint('dataset_id')
+    )
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.create_foreign_key('fk_file_dataset', 'dataset', ['dataset_id'], ['dataset_id'])
+
+    # ### end Alembic commands ###
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.drop_constraint('fk_file_dataset', type_='foreignkey')
+
+    op.drop_table('dataset')
+    # ### end Alembic commands ###
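
Taken together, the four new revisions form a linear chain on top of the existing 0.8.0 head; they apply in the reverse of the order the files appear in this diff. Sketched from the revision/down_revision pairs above:

# revision -> down_revision, copied from the migration files above
chain = {
    "e7edab5d4e4b": "0.8.0",         # add_dataset_tracking
    "d14f179e553c": "e7edab5d4e4b",  # composite index on file
    "0.9.0": "d14f179e553c",         # update tables (no-op)
    "0.9.1": "0.9.0",                # update tables (no-op)
}

# Walking from the new head back down to the 0.8.0 base:
rev = "0.9.1"
while rev in chain:
    print(rev)
    rev = chain[rev]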
esgpull/models/__init__.py CHANGED
@@ -1,7 +1,7 @@
 from typing import TypeVar
 
 from esgpull.models.base import Base
-from esgpull.models.dataset import Dataset
+from esgpull.models.dataset import Dataset, DatasetRecord
 from esgpull.models.facet import Facet
 from esgpull.models.file import FastFile, FileStatus
 from esgpull.models.options import Option, Options
@@ -15,6 +15,7 @@ Table = TypeVar("Table", bound=Base)
 __all__ = [
     "Base",
     "Dataset",
+    "DatasetRecord",
     "Facet",
     "FastFile",
     "File",
esgpull/models/base.py CHANGED
@@ -16,16 +16,10 @@ T = TypeVar("T")
 Sha = sa.String(40)
 
 
-class Base(MappedAsDataclass, DeclarativeBase):
+# Base class for all models - provides core SQLAlchemy functionality
+class _BaseModel(MappedAsDataclass, DeclarativeBase):
     __dataclass_fields__: ClassVar[dict[str, Field]]
-    __sql_attrs__ = ("id", "sha", "_sa_instance_state", "__dataclass_fields__")
-
-    sha: Mapped[str] = mapped_column(
-        Sha,
-        init=False,
-        repr=False,
-        primary_key=True,
-    )
+    __sql_attrs__ = ("id", "_sa_instance_state", "__dataclass_fields__")
 
     @property
     def _names(self) -> tuple[str, ...]:
@@ -36,15 +30,38 @@ class Base(MappedAsDataclass, DeclarativeBase):
             result += (name,)
         return result
 
+    @property
+    def state(self) -> InstanceState:
+        return cast(InstanceState, sa.inspect(self))
+
+    def asdict(self) -> Mapping[str, Any]:
+        raise NotImplementedError
+
+
+# Base class for models that use SHA as primary key
+class Base(_BaseModel):
+    __abstract__ = True
+    __sql_attrs__ = ("id", "sha", "_sa_instance_state", "__dataclass_fields__")
+
+    sha: Mapped[str] = mapped_column(
+        Sha,
+        init=False,
+        repr=False,
+        primary_key=True,
+    )
+
     def _as_bytes(self) -> bytes:
         raise NotImplementedError
 
     def compute_sha(self) -> None:
         self.sha = sha1(self._as_bytes()).hexdigest()
 
-    @property
-    def state(self) -> InstanceState:
-        return cast(InstanceState, sa.inspect(self))
 
-    def asdict(self) -> Mapping[str, Any]:
-        raise NotImplementedError
+# Base class for models that don't use SHA (e.g., Dataset)
+class BaseNoSHA(_BaseModel):
+    __abstract__ = True
+    __sql_attrs__ = ("id", "_sa_instance_state", "__dataclass_fields__")
+
+
+# Keep SHAKeyMixin for backward compatibility if needed
+SHAKeyMixin = Base
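
The old Base is split into a shared _BaseModel and two abstract bases: Base keeps the 40-character sha primary key and compute_sha(), while BaseNoSHA lets a model declare its own natural key. A hedged sketch of how a model would pick one or the other (the Example* names are illustrative, not esgpull models):

import sqlalchemy as sa
from sqlalchemy.orm import Mapped, mapped_column

from esgpull.models.base import Base, BaseNoSHA

class ExampleSha(Base):
    # Inherits the sha primary key; compute_sha() fills it from _as_bytes().
    __tablename__ = "example_sha"
    name: Mapped[str] = mapped_column(sa.String(64))

    def _as_bytes(self) -> bytes:
        return self.name.encode()

class ExampleNatural(BaseNoSHA):
    # No sha column; the model supplies its own primary key, as Dataset does.
    __tablename__ = "example_natural"
    key: Mapped[str] = mapped_column(sa.String(255), primary_key=True)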
esgpull/models/dataset.py CHANGED
@@ -1,12 +1,22 @@
 from __future__ import annotations
 
-from dataclasses import asdict, dataclass
+from collections.abc import Mapping
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any
 
+import sqlalchemy as sa
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+
+from esgpull.models.base import BaseNoSHA
 from esgpull.models.utils import find_int, find_str
 
+if TYPE_CHECKING:
+    from esgpull.models.query import File
+
 
 @dataclass
-class Dataset:
+class DatasetRecord:
     dataset_id: str
     master_id: str
     version: str
@@ -15,7 +25,7 @@ class Dataset:
     number_of_files: int
 
     @classmethod
-    def serialize(cls, source: dict) -> Dataset:
+    def serialize(cls, source: dict) -> DatasetRecord:
         dataset_id = find_str(source["instance_id"]).partition("|")[0]
         master_id, version = dataset_id.rsplit(".", 1)
         data_node = find_str(source["data_node"])
@@ -30,5 +40,38 @@ class Dataset:
             number_of_files=number_of_files,
         )
 
-    def asdict(self) -> dict:
-        return asdict(self)
+
+class Dataset(BaseNoSHA):
+    __tablename__ = "dataset"
+
+    dataset_id: Mapped[str] = mapped_column(sa.String(255), primary_key=True)
+    total_files: Mapped[int] = mapped_column(sa.Integer)
+    created_at: Mapped[datetime] = mapped_column(
+        server_default=sa.func.now(),
+        default_factory=lambda: datetime.now(timezone.utc),
+        init=False,
+    )
+    updated_at: Mapped[datetime] = mapped_column(
+        server_default=sa.func.now(),
+        default_factory=lambda: datetime.now(timezone.utc),
+        init=False,
+    )
+    files: Mapped[list[File]] = relationship(
+        back_populates="dataset",
+        foreign_keys="[File.dataset_id]",
+        primaryjoin="Dataset.dataset_id==File.dataset_id",
+        default_factory=list,
+        init=False,
+        repr=False,
+    )
+
+    def asdict(self) -> Mapping[str, Any]:
+        return {
+            "dataset_id": self.dataset_id,
+            "total_files": self.total_files,
+            "created_at": self.created_at.isoformat(),
+            "updated_at": self.updated_at.isoformat(),
+        }
+
+    def __hash__(self) -> int:
+        return hash(self.dataset_id)
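
The old plain dataclass Dataset is renamed DatasetRecord and keeps its role of parsing search-API documents; the new Dataset is an ORM row keyed on dataset_id that tracks total_files plus timestamps. A small usage sketch (the dataset_id value is made up):

from esgpull.models.dataset import Dataset

row = Dataset(dataset_id="CMIP6.some.dataset.v20200101", total_files=3)
# created_at / updated_at are init=False and default to timezone-aware UTC
assert row.created_at.tzinfo is not None
print(row.asdict())
# {'dataset_id': 'CMIP6.some.dataset.v20200101', 'total_files': 3,
#  'created_at': '...', 'updated_at': '...'}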
esgpull/models/query.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 from collections.abc import Iterator, MutableMapping, Sequence
 from datetime import datetime, timezone
-from typing import Any, Literal
+from typing import TYPE_CHECKING, Any, Literal
 
 import sqlalchemy as sa
 from rich.console import Console, ConsoleOptions
@@ -15,10 +15,15 @@ from typing_extensions import NotRequired, TypedDict
 from esgpull import utils
 from esgpull.exceptions import UntrackableQuery
 from esgpull.models.base import Base, Sha
+from esgpull.models.dataset import Dataset
 from esgpull.models.file import FileDict, FileStatus
 from esgpull.models.options import Options
 from esgpull.models.selection import FacetValues, Selection
 from esgpull.models.tag import Tag
+
+if TYPE_CHECKING:
+    from esgpull.models.dataset import Dataset
+
 from esgpull.models.utils import (
     find_int,
     find_str,
@@ -55,9 +60,14 @@ query_tag_proxy = sa.Table(
 
 class File(Base):
     __tablename__ = "file"
+    __table_args__ = (
+        sa.Index("ix_file_dataset_status", "dataset_id", "status"),
+    )
 
     file_id: Mapped[str] = mapped_column(sa.String(255), unique=True)
-    dataset_id: Mapped[str] = mapped_column(sa.String(255))
+    dataset_id: Mapped[str] = mapped_column(
+        sa.String(255), sa.ForeignKey("dataset.dataset_id")
+    )
     master_id: Mapped[str] = mapped_column(sa.String(255))
     url: Mapped[str] = mapped_column(sa.String(255))
     version: Mapped[str] = mapped_column(sa.String(16))
@@ -76,6 +86,11 @@ class File(Base):
         back_populates="files",
         repr=False,
     )
+    dataset: Mapped["Dataset"] = relationship(
+        back_populates="files",
+        init=False,
+        repr=False,
+    )
 
     def _as_bytes(self) -> bytes:
         self_tuple = (self.file_id, self.checksum)
@@ -100,7 +115,7 @@ class File(Base):
             size=source["size"],
         )
         if "status" in source:
-            result.status = FileStatus(source.get("source"))
+            result.status = FileStatus(source.get("status").lower())
         return result
 
     @classmethod
@@ -395,11 +410,6 @@ class Query(Base):
             self.tags.remove(tag)
         return tag is not None
 
-    def no_require(self) -> Query:
-        cl = self.clone(compute_sha=False)
-        cl._rich_no_require = True  # type: ignore [attr-defined]
-        return cl
-
     def __lshift__(self, child: Query) -> Query:
         result = self.clone(compute_sha=False)
         # if self.name != child.require:
@@ -440,7 +450,7 @@ class Query(Base):
 
     __rich_measure__ = rich_measure_impl
 
-    def _rich_tree(self) -> Tree:
+    def _rich_tree(self, hide_require: bool = False) -> Tree:
         title = Text.from_markup(self.rich_name)
         if not self.tracked:
             title.append(" untracked", style="i red")
@@ -449,7 +459,7 @@ class Query(Base):
             f"\n│ updated {format_date_iso(self.updated_at)}"
         )
         contents = Table.grid(padding=(0, 1))
-        if not hasattr(self, "_rich_no_require") and self.require is not None:
+        if not hide_require and self.require is not None:
             if len(self.require) == 40:
                 require = Text(short_sha(self.require), style="i green")
             else:
@@ -482,10 +492,44 @@ class Query(Base):
         count_ondisk, size_ondisk = self.files_count_size(FileStatus.Done)
         count_total, size_total = self.files_count_size()
         sizes = f"{format_size(size_ondisk)} / {format_size(size_total)}"
-        lens = f"{count_ondisk}/{count_total}"
-        contents.add_row(
-            "files:", Text(f"{sizes} [{lens}]", style="magenta")
-        )
+        lens = f"{count_ondisk} / {count_total}"
+
+        # Add dataset completion info
+        complete_datasets = 0
+        total_datasets = 0
+        session = object_session(self)
+        orphaned_dataset_count = 0
+
+        if session is not None:
+            from esgpull.models import sql
+
+            dataset_stats = session.execute(
+                sql.dataset.query_stats(self.sha)
+            ).all()
+
+            # Check for orphaned datasets (dataset_ids from files not in Dataset table)
+            orphaned_dataset_count = (
+                session.scalar(sql.dataset.orphaned(self.sha)) or 0
+            )
+
+            # Compute counts in Python - simpler and more maintainable
+            total_datasets = len(dataset_stats)
+            complete_datasets = sum(
+                1 for d in dataset_stats if d.done_count == d.total_files
+            )
+
+        contents.add_row("files:", Text(f"{lens}", style="magenta"))
+        if orphaned_dataset_count > 0:
+            contents.add_row(
+                "datasets:",
+                "[magenta]? / ?[/] [yellow italic]<- update for accurate datasets[/]",
+            )
+        else:
+            contents.add_row(
+                "datasets:",
+                f"[magenta]{complete_datasets} / {total_datasets}",
+            )
+        contents.add_row("size:", Text(f"{sizes}", style="magenta"))
         tree = Tree("", hide_root=True, guide_style="dim").add(title)
         if contents.row_count:
             tree.add(contents)
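
The display now reports per-dataset completion alongside file counts. The arithmetic in the new _rich_tree body reduces to comparing each dataset's done_count against its total_files; a minimal sketch over hand-written rows shaped like the query_stats result (dataset_id, total_files, done_count):

rows = [
    ("CMIP6.a.v1", 3, 3),  # every file Done -> dataset complete
    ("CMIP6.b.v1", 5, 2),  # partially downloaded
]
total_datasets = len(rows)
complete_datasets = sum(
    1 for _, total_files, done_count in rows if done_count == total_files
)
print(f"datasets: {complete_datasets} / {total_datasets}")  # datasets: 1 / 2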
esgpull/models/sql.py CHANGED
@@ -3,6 +3,7 @@ import functools
 import sqlalchemy as sa
 
 from esgpull.models import Table
+from esgpull.models.dataset import Dataset
 from esgpull.models.facet import Facet
 from esgpull.models.file import FileStatus
 from esgpull.models.query import File, Query, query_file_proxy, query_tag_proxy
@@ -11,15 +12,6 @@ from esgpull.models.synda_file import SyndaFile
 from esgpull.models.tag import Tag
 
 
-def count(item: Table) -> sa.Select[tuple[int]]:
-    table = item.__class__
-    return (
-        sa.select(sa.func.count("*"))
-        .select_from(table)
-        .filter_by(sha=item.sha)
-    )
-
-
 def count_table(table: type[Table]) -> sa.Select[tuple[int]]:
     return sa.select(sa.func.count("*")).select_from(table)
 
@@ -148,6 +140,45 @@ class file:
         return stmt
 
 
+class dataset:
+    @staticmethod
+    @functools.cache
+    def query_stats(query_sha: str) -> sa.Select[tuple[str, int, int]]:
+        return (
+            sa.select(
+                Dataset.dataset_id,
+                Dataset.total_files,
+                sa.func.count(
+                    sa.case((File.status == FileStatus.Done, 1))
+                ).label("done_count"),
+            )
+            .join(File)
+            .join(query_file_proxy)
+            .filter(query_file_proxy.c.query_sha == query_sha)
+            .filter(File.dataset_id.isnot(None))
+            .group_by(Dataset.dataset_id, Dataset.total_files)
+        )
+
+    @staticmethod
+    @functools.cache
+    def orphaned(query_sha: str) -> sa.Select[tuple[int]]:
+        return (
+            sa.select(sa.func.count(sa.distinct(File.dataset_id)))
+            .join(query_file_proxy)
+            .filter(query_file_proxy.c.query_sha == query_sha)
+            .filter(File.dataset_id.isnot(None))
+            .filter(~File.dataset_id.in_(sa.select(Dataset.dataset_id)))
+        )
+
+    @staticmethod
+    @functools.cache
+    def is_complete(dataset: Dataset) -> sa.Select[tuple[bool]]:
+        return sa.select(
+            sa.func.count(sa.case((File.status == FileStatus.Done, 1)))
+            == dataset.total_files
+        ).where(File.dataset_id == dataset.dataset_id)
+
+
 class query:
     @staticmethod
     @functools.cache
@@ -270,3 +301,11 @@ class query_file:
             .where(query_file_proxy.c.query_sha == query.sha)
             .where(query_file_proxy.c.file_sha == file.sha)
         )
+
+    @staticmethod
+    def is_linked(query: Query, file: File) -> sa.Select[tuple[bool]]:
+        return sa.select(
+            sa.exists()
+            .where(query_file_proxy.c.query_sha == query.sha)
+            .where(query_file_proxy.c.file_sha == file.sha)
+        )
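
The new sql.dataset statements back the display change above: query_stats returns one (dataset_id, total_files, done_count) row per dataset linked to a query, and orphaned counts dataset_ids on files that have no row in the dataset table yet, which is what triggers the "? / ?" hint. Note that is_complete is memoized with functools.cache on a Dataset instance, which is why dataset.py defines __hash__. A hedged usage sketch; the session wiring is illustrative:

from sqlalchemy.orm import Session

from esgpull.models import sql

def dataset_progress(session: Session, query_sha: str) -> dict[str, tuple[int, int]]:
    # Map dataset_id -> (done files, expected files) for one query.
    rows = session.execute(sql.dataset.query_stats(query_sha)).all()
    return {row.dataset_id: (row.done_count, row.total_files) for row in rows}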