esgpull 0.7.3__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. esgpull/cli/__init__.py +2 -2
  2. esgpull/cli/add.py +7 -1
  3. esgpull/cli/config.py +5 -21
  4. esgpull/cli/plugins.py +398 -0
  5. esgpull/cli/show.py +29 -0
  6. esgpull/cli/status.py +6 -4
  7. esgpull/cli/update.py +72 -18
  8. esgpull/cli/utils.py +16 -1
  9. esgpull/config.py +83 -25
  10. esgpull/constants.py +3 -0
  11. esgpull/context.py +15 -15
  12. esgpull/database.py +8 -2
  13. esgpull/download.py +3 -0
  14. esgpull/esgpull.py +49 -5
  15. esgpull/graph.py +1 -1
  16. esgpull/migrations/versions/0.8.0_update_tables.py +28 -0
  17. esgpull/migrations/versions/0.9.0_update_tables.py +28 -0
  18. esgpull/migrations/versions/14c72daea083_query_add_column_updated_at.py +36 -0
  19. esgpull/migrations/versions/c7c8541fa741_query_add_column_added_at.py +37 -0
  20. esgpull/migrations/versions/d14f179e553c_file_add_composite_index_dataset_id_.py +32 -0
  21. esgpull/migrations/versions/e7edab5d4e4b_add_dataset_tracking.py +39 -0
  22. esgpull/models/__init__.py +2 -1
  23. esgpull/models/base.py +31 -14
  24. esgpull/models/dataset.py +48 -5
  25. esgpull/models/options.py +1 -1
  26. esgpull/models/query.py +98 -15
  27. esgpull/models/sql.py +40 -9
  28. esgpull/plugin.py +574 -0
  29. esgpull/processor.py +3 -3
  30. esgpull/tui.py +23 -1
  31. esgpull/utils.py +19 -3
  32. {esgpull-0.7.3.dist-info → esgpull-0.9.0.dist-info}/METADATA +11 -2
  33. {esgpull-0.7.3.dist-info → esgpull-0.9.0.dist-info}/RECORD +36 -29
  34. {esgpull-0.7.3.dist-info → esgpull-0.9.0.dist-info}/WHEEL +1 -1
  35. esgpull/cli/datasets.py +0 -78
  36. {esgpull-0.7.3.dist-info → esgpull-0.9.0.dist-info}/entry_points.txt +0 -0
  37. {esgpull-0.7.3.dist-info → esgpull-0.9.0.dist-info}/licenses/LICENSE +0 -0
esgpull/migrations/versions/0.8.0_update_tables.py ADDED
@@ -0,0 +1,28 @@
+ """update tables
+
+ Revision ID: 0.8.0
+ Revises: 14c72daea083
+ Create Date: 2025-05-15 11:28:10.755003
+
+ """
+ from alembic import op
+ import sqlalchemy as sa
+
+
+ # revision identifiers, used by Alembic.
+ revision = '0.8.0'
+ down_revision = '14c72daea083'
+ branch_labels = None
+ depends_on = None
+
+
+ def upgrade() -> None:
+     # ### commands auto generated by Alembic - please adjust! ###
+     pass
+     # ### end Alembic commands ###
+
+
+ def downgrade() -> None:
+     # ### commands auto generated by Alembic - please adjust! ###
+     pass
+     # ### end Alembic commands ###
esgpull/migrations/versions/0.9.0_update_tables.py ADDED
@@ -0,0 +1,28 @@
+ """update tables
+
+ Revision ID: 0.9.0
+ Revises: d14f179e553c
+ Create Date: 2025-07-07 14:54:58.433022
+
+ """
+ from alembic import op
+ import sqlalchemy as sa
+
+
+ # revision identifiers, used by Alembic.
+ revision = '0.9.0'
+ down_revision = 'd14f179e553c'
+ branch_labels = None
+ depends_on = None
+
+
+ def upgrade() -> None:
+     # ### commands auto generated by Alembic - please adjust! ###
+     pass
+     # ### end Alembic commands ###
+
+
+ def downgrade() -> None:
+     # ### commands auto generated by Alembic - please adjust! ###
+     pass
+     # ### end Alembic commands ###
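
Note: both version-numbered revisions above are deliberately empty; upgrade() and downgrade() are no-ops whose only purpose is to pin the 0.8.0 and 0.9.0 release points into the Alembic revision chain. For reference, a generic programmatic upgrade through Alembic looks like this (the ini path is a placeholder, and esgpull normally drives its own migrations internally):

    from alembic import command
    from alembic.config import Config

    cfg = Config("alembic.ini")  # placeholder path, not esgpull's actual config
    command.upgrade(cfg, "head")  # walks every pending revision, no-op markers included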
esgpull/migrations/versions/14c72daea083_query_add_column_updated_at.py ADDED
@@ -0,0 +1,36 @@
+ """query_add_column_updated_at
+
+ Revision ID: 14c72daea083
+ Revises: c7c8541fa741
+ Create Date: 2025-05-07 14:49:43.993125
+
+ """
+ from alembic import op
+ import sqlalchemy as sa
+
+
+ # revision identifiers, used by Alembic.
+ revision = '14c72daea083'
+ down_revision = 'c7c8541fa741'
+ branch_labels = None
+ depends_on = None
+
+
+ def upgrade() -> None:
+     with op.batch_alter_table('query', schema=None) as batch_op:
+         batch_op.add_column(sa.Column('updated_at', sa.DateTime(), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=True))
+
+     # Backfill nulls
+     op.execute('UPDATE query SET updated_at = CURRENT_TIMESTAMP WHERE updated_at IS NULL')
+
+     # Make non-nullable
+     with op.batch_alter_table('query', schema=None) as batch_op:
+         batch_op.alter_column('updated_at', nullable=False)
+
+
+ def downgrade() -> None:
+     # ### commands auto generated by Alembic - please adjust! ###
+     with op.batch_alter_table('query', schema=None) as batch_op:
+         batch_op.drop_column('updated_at')
+
+     # ### end Alembic commands ###
esgpull/migrations/versions/c7c8541fa741_query_add_column_added_at.py ADDED
@@ -0,0 +1,37 @@
+ """query_add_column_added_at
+
+ Revision ID: c7c8541fa741
+ Revises: 0.7.3
+ Create Date: 2025-05-05 16:14:57.140262
+
+ """
+ from alembic import op
+ import sqlalchemy as sa
+
+
+ # revision identifiers, used by Alembic.
+ revision = 'c7c8541fa741'
+ down_revision = '0.7.3'
+ branch_labels = None
+ depends_on = None
+
+
+ def upgrade() -> None:
+     # Add as nullable first
+     with op.batch_alter_table('query', schema=None) as batch_op:
+         batch_op.add_column(sa.Column('added_at', sa.DateTime(), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=True))
+
+     # Backfill nulls
+     op.execute('UPDATE query SET added_at = CURRENT_TIMESTAMP WHERE added_at IS NULL')
+
+     # Make non-nullable
+     with op.batch_alter_table('query', schema=None) as batch_op:
+         batch_op.alter_column('added_at', nullable=False)
+
+
+ def downgrade() -> None:
+     # ### commands auto generated by Alembic - please adjust! ###
+     with op.batch_alter_table('query', schema=None) as batch_op:
+         batch_op.drop_column('added_at')
+
+     # ### end Alembic commands ###
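
Note: both timestamp migrations above use the same SQLite-friendly three-step shape: add the column as nullable with a server default, backfill existing rows, then tighten to NOT NULL. op.batch_alter_table is what makes this work on SQLite, which cannot alter columns in place, so Alembic rebuilds the table behind the scenes. A minimal generic sketch of the pattern (this helper is illustrative and not part of the package; table and column names are placeholders):

    import sqlalchemy as sa
    from alembic import op

    def add_backfilled_timestamp(table: str, column: str) -> None:
        # 1) add as nullable so existing rows don't violate the constraint
        with op.batch_alter_table(table, schema=None) as batch_op:
            batch_op.add_column(sa.Column(column, sa.DateTime(), nullable=True))
        # 2) backfill rows that predate the column
        op.execute(f"UPDATE {table} SET {column} = CURRENT_TIMESTAMP WHERE {column} IS NULL")
        # 3) tighten to NOT NULL (on SQLite the table is copied and rebuilt)
        with op.batch_alter_table(table, schema=None) as batch_op:
            batch_op.alter_column(column, existing_type=sa.DateTime(), nullable=False)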
esgpull/migrations/versions/d14f179e553c_file_add_composite_index_dataset_id_.py ADDED
@@ -0,0 +1,32 @@
+ """file_add_composite_index_dataset_id_status
+
+ Revision ID: d14f179e553c
+ Revises: e7edab5d4e4b
+ Create Date: 2025-06-18 16:05:35.721085
+
+ """
+ from alembic import op
+ import sqlalchemy as sa
+
+
+ # revision identifiers, used by Alembic.
+ revision = 'd14f179e553c'
+ down_revision = 'e7edab5d4e4b'
+ branch_labels = None
+ depends_on = None
+
+
+ def upgrade() -> None:
+     # ### commands auto generated by Alembic - please adjust! ###
+     with op.batch_alter_table('file', schema=None) as batch_op:
+         batch_op.create_index('ix_file_dataset_status', ['dataset_id', 'status'], unique=False)
+
+     # ### end Alembic commands ###
+
+
+ def downgrade() -> None:
+     # ### commands auto generated by Alembic - please adjust! ###
+     with op.batch_alter_table('file', schema=None) as batch_op:
+         batch_op.drop_index('ix_file_dataset_status')
+
+     # ### end Alembic commands ###
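
Note: this index pairs with the per-dataset completion counts added to esgpull/models/sql.py further down; grouping files by (dataset_id, status) can then be served from the index without scanning full rows. An illustrative query of that shape (a sketch, not code from the package):

    import sqlalchemy as sa

    from esgpull.models.query import File

    # Counts files per dataset and status; ix_file_dataset_status covers both columns.
    stmt = (
        sa.select(File.dataset_id, File.status, sa.func.count())
        .group_by(File.dataset_id, File.status)
    )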
esgpull/migrations/versions/e7edab5d4e4b_add_dataset_tracking.py ADDED
@@ -0,0 +1,39 @@
+ """add_dataset_tracking
+
+ Revision ID: e7edab5d4e4b
+ Revises: 0.8.0
+ Create Date: 2025-05-23 17:38:22.066153
+
+ """
+ from alembic import op
+ import sqlalchemy as sa
+
+
+ # revision identifiers, used by Alembic.
+ revision = 'e7edab5d4e4b'
+ down_revision = '0.8.0'
+ branch_labels = None
+ depends_on = None
+
+
+ def upgrade() -> None:
+     # ### commands auto generated by Alembic - please adjust! ###
+     op.create_table('dataset',
+         sa.Column('dataset_id', sa.String(length=255), nullable=False),
+         sa.Column('total_files', sa.Integer(), nullable=False),
+         sa.Column('created_at', sa.DateTime(), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
+         sa.Column('updated_at', sa.DateTime(), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
+         sa.PrimaryKeyConstraint('dataset_id')
+     )
+     with op.batch_alter_table('file', schema=None) as batch_op:
+         batch_op.create_foreign_key('fk_file_dataset', 'dataset', ['dataset_id'], ['dataset_id'])
+
+     # ### end Alembic commands ###
+
+
+ def downgrade() -> None:
+     # ### commands auto generated by Alembic - please adjust! ###
+     with op.batch_alter_table('file', schema=None) as batch_op:
+         batch_op.drop_constraint('fk_file_dataset', type_='foreignkey')
+
+     op.drop_table('dataset')
+     # ### end Alembic commands ###
esgpull/models/__init__.py CHANGED
@@ -1,7 +1,7 @@
  from typing import TypeVar
  
  from esgpull.models.base import Base
- from esgpull.models.dataset import Dataset
+ from esgpull.models.dataset import Dataset, DatasetRecord
  from esgpull.models.facet import Facet
  from esgpull.models.file import FastFile, FileStatus
  from esgpull.models.options import Option, Options
@@ -15,6 +15,7 @@ Table = TypeVar("Table", bound=Base)
  __all__ = [
      "Base",
      "Dataset",
+     "DatasetRecord",
      "Facet",
      "FastFile",
      "File",
esgpull/models/base.py CHANGED
@@ -16,16 +16,10 @@ T = TypeVar("T")
  Sha = sa.String(40)
  
  
- class Base(MappedAsDataclass, DeclarativeBase):
+ # Base class for all models - provides core SQLAlchemy functionality
+ class _BaseModel(MappedAsDataclass, DeclarativeBase):
      __dataclass_fields__: ClassVar[dict[str, Field]]
-     __sql_attrs__ = ("id", "sha", "_sa_instance_state", "__dataclass_fields__")
- 
-     sha: Mapped[str] = mapped_column(
-         Sha,
-         init=False,
-         repr=False,
-         primary_key=True,
-     )
+     __sql_attrs__ = ("id", "_sa_instance_state", "__dataclass_fields__")
  
      @property
      def _names(self) -> tuple[str, ...]:
@@ -36,15 +30,38 @@ class Base(MappedAsDataclass, DeclarativeBase):
              result += (name,)
          return result
  
+     @property
+     def state(self) -> InstanceState:
+         return cast(InstanceState, sa.inspect(self))
+ 
+     def asdict(self) -> Mapping[str, Any]:
+         raise NotImplementedError
+ 
+ 
+ # Base class for models that use SHA as primary key
+ class Base(_BaseModel):
+     __abstract__ = True
+     __sql_attrs__ = ("id", "sha", "_sa_instance_state", "__dataclass_fields__")
+ 
+     sha: Mapped[str] = mapped_column(
+         Sha,
+         init=False,
+         repr=False,
+         primary_key=True,
+     )
+ 
      def _as_bytes(self) -> bytes:
          raise NotImplementedError
  
      def compute_sha(self) -> None:
          self.sha = sha1(self._as_bytes()).hexdigest()
  
-     @property
-     def state(self) -> InstanceState:
-         return cast(InstanceState, sa.inspect(self))
  
-     def asdict(self) -> Mapping[str, Any]:
-         raise NotImplementedError
+ # Base class for models that don't use SHA (e.g., Dataset)
+ class BaseNoSHA(_BaseModel):
+     __abstract__ = True
+     __sql_attrs__ = ("id", "_sa_instance_state", "__dataclass_fields__")
+ 
+ 
+ # Keep SHAKeyMixin for backward compatibility if needed
+ SHAKeyMixin = Base
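
Note: after this split, content-addressed models keep subclassing Base and inherit the sha primary key computed from _as_bytes(), while natural-key tables subclass BaseNoSHA and declare their own primary key, as the new Dataset model does below. A hypothetical pair of models, purely for illustration:

    import sqlalchemy as sa
    from sqlalchemy.orm import Mapped, mapped_column

    from esgpull.models.base import Base, BaseNoSHA

    class HashedThing(Base):  # hypothetical: sha primary key, content-addressed
        __tablename__ = "hashed_thing"
        payload: Mapped[str] = mapped_column(sa.String(255))

        def _as_bytes(self) -> bytes:  # feeds compute_sha()
            return self.payload.encode()

    class PlainThing(BaseNoSHA):  # hypothetical: brings its own primary key
        __tablename__ = "plain_thing"
        name: Mapped[str] = mapped_column(sa.String(255), primary_key=True)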
esgpull/models/dataset.py CHANGED
@@ -1,12 +1,22 @@
  from __future__ import annotations
  
- from dataclasses import asdict, dataclass
+ from collections.abc import Mapping
+ from dataclasses import dataclass
+ from datetime import datetime, timezone
+ from typing import TYPE_CHECKING, Any
  
+ import sqlalchemy as sa
+ from sqlalchemy.orm import Mapped, mapped_column, relationship
+ 
+ from esgpull.models.base import BaseNoSHA
  from esgpull.models.utils import find_int, find_str
  
+ if TYPE_CHECKING:
+     from esgpull.models.query import File
+ 
  
  @dataclass
- class Dataset:
+ class DatasetRecord:
      dataset_id: str
      master_id: str
      version: str
@@ -15,7 +25,7 @@ class Dataset:
      number_of_files: int
  
      @classmethod
-     def serialize(cls, source: dict) -> Dataset:
+     def serialize(cls, source: dict) -> DatasetRecord:
          dataset_id = find_str(source["instance_id"]).partition("|")[0]
          master_id, version = dataset_id.rsplit(".", 1)
          data_node = find_str(source["data_node"])
@@ -30,5 +40,38 @@ class Dataset:
              number_of_files=number_of_files,
          )
  
-     def asdict(self) -> dict:
-         return asdict(self)
+ 
+ class Dataset(BaseNoSHA):
+     __tablename__ = "dataset"
+ 
+     dataset_id: Mapped[str] = mapped_column(sa.String(255), primary_key=True)
+     total_files: Mapped[int] = mapped_column(sa.Integer)
+     created_at: Mapped[datetime] = mapped_column(
+         server_default=sa.func.now(),
+         default_factory=lambda: datetime.now(timezone.utc),
+         init=False,
+     )
+     updated_at: Mapped[datetime] = mapped_column(
+         server_default=sa.func.now(),
+         default_factory=lambda: datetime.now(timezone.utc),
+         init=False,
+     )
+     files: Mapped[list[File]] = relationship(
+         back_populates="dataset",
+         foreign_keys="[File.dataset_id]",
+         primaryjoin="Dataset.dataset_id==File.dataset_id",
+         default_factory=list,
+         init=False,
+         repr=False,
+     )
+ 
+     def asdict(self) -> Mapping[str, Any]:
+         return {
+             "dataset_id": self.dataset_id,
+             "total_files": self.total_files,
+             "created_at": self.created_at.isoformat(),
+             "updated_at": self.updated_at.isoformat(),
+         }
+ 
+     def __hash__(self) -> int:
+         return hash(self.dataset_id)
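
Note: DatasetRecord stays a plain dataclass for parsing search results, while Dataset is the ORM table created by the add_dataset_tracking migration above. A rough usage sketch of DatasetRecord.serialize; the source dict mimics an ESGF search record, its values are invented, and the "size"/"number_of_files" keys are assumed from the elided middle of serialize:

    from esgpull.models.dataset import DatasetRecord

    source = {
        "instance_id": "CMIP6.CMIP.IPSL.IPSL-CM6A-LR.historical.r1i1p1f1.day.tas.gr.v20180803|esgf.node.example",
        "data_node": "esgf.node.example",
        "size": 123_456_789,    # assumed key
        "number_of_files": 12,  # assumed key
    }
    record = DatasetRecord.serialize(source)
    # instance_id is split on "|", then rsplit on "." for the version:
    assert record.version == "v20180803"
    assert record.master_id.endswith(".gr")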
esgpull/models/options.py CHANGED
@@ -53,7 +53,7 @@ class Options(Base):
      replica: Mapped[Option] = mapped_column(sa.Enum(Option))
      retracted: Mapped[Option] = mapped_column(sa.Enum(Option))
  
-     _distrib_ = Option(False)
+     _distrib_ = Option(True)
      _latest_ = Option(True)
      _replica_ = Option(None)
      _retracted_ = Option(False)
esgpull/models/query.py CHANGED
@@ -1,7 +1,8 @@
  from __future__ import annotations
  
  from collections.abc import Iterator, MutableMapping, Sequence
- from typing import Any, Literal
+ from datetime import datetime, timezone
+ from typing import TYPE_CHECKING, Any, Literal
  
  import sqlalchemy as sa
  from rich.console import Console, ConsoleOptions
@@ -11,12 +12,18 @@ from rich.tree import Tree
  from sqlalchemy.orm import Mapped, mapped_column, object_session, relationship
  from typing_extensions import NotRequired, TypedDict
  
+ from esgpull import utils
  from esgpull.exceptions import UntrackableQuery
  from esgpull.models.base import Base, Sha
+ from esgpull.models.dataset import Dataset
  from esgpull.models.file import FileDict, FileStatus
  from esgpull.models.options import Options
  from esgpull.models.selection import FacetValues, Selection
  from esgpull.models.tag import Tag
+ 
+ if TYPE_CHECKING:
+     from esgpull.models.dataset import Dataset
+ 
  from esgpull.models.utils import (
      find_int,
      find_str,
@@ -24,7 +31,18 @@ from esgpull.models.utils import (
      rich_measure_impl,
      short_sha,
  )
- from esgpull.utils import format_size
+ from esgpull.utils import format_date_iso, format_size
+ 
+ QUERY_DATE_FMT = "%Y-%m-%d %H:%M:%S"
+ 
+ 
+ def parse_date(d: datetime | str) -> datetime:
+     return utils.parse_date(d, fmt=QUERY_DATE_FMT)
+ 
+ 
+ def format_date(d: datetime | str) -> str:
+     return utils.format_date(d, fmt=QUERY_DATE_FMT)
+ 
  
  query_file_proxy = sa.Table(
      "query_file",
@@ -42,9 +60,14 @@ query_tag_proxy = sa.Table(
  
  class File(Base):
      __tablename__ = "file"
+     __table_args__ = (
+         sa.Index("ix_file_dataset_status", "dataset_id", "status"),
+     )
  
      file_id: Mapped[str] = mapped_column(sa.String(255), unique=True)
-     dataset_id: Mapped[str] = mapped_column(sa.String(255))
+     dataset_id: Mapped[str] = mapped_column(
+         sa.String(255), sa.ForeignKey("dataset.dataset_id")
+     )
      master_id: Mapped[str] = mapped_column(sa.String(255))
      url: Mapped[str] = mapped_column(sa.String(255))
      version: Mapped[str] = mapped_column(sa.String(16))
@@ -63,6 +86,11 @@ class File(Base):
          back_populates="files",
          repr=False,
      )
+     dataset: Mapped["Dataset"] = relationship(
+         back_populates="files",
+         init=False,
+         repr=False,
+     )
  
      def _as_bytes(self) -> bytes:
          self_tuple = (self.file_id, self.checksum)
@@ -87,7 +115,7 @@ class File(Base):
              size=source["size"],
          )
          if "status" in source:
-             result.status = FileStatus(source.get("source"))
+             result.status = FileStatus(source.get("status").lower())
          return result
  
      @classmethod
@@ -152,6 +180,8 @@ class QueryDict(TypedDict):
      options: NotRequired[MutableMapping[str, bool | None]]
      selection: NotRequired[MutableMapping[str, FacetValues]]
      files: NotRequired[list[FileDict]]
+     added_at: NotRequired[str]
+     updated_at: NotRequired[str]
  
  
  class Query(Base):
@@ -181,6 +211,14 @@ class Query(Base):
          back_populates="queries",
          repr=False,
      )
+     added_at: Mapped[datetime] = mapped_column(
+         server_default=sa.func.now(),
+         default_factory=lambda: datetime.now(timezone.utc),
+     )
+     updated_at: Mapped[datetime] = mapped_column(
+         server_default=sa.func.now(),
+         default_factory=lambda: datetime.now(timezone.utc),
+     )
  
      def __init__(
          self,
@@ -191,6 +229,8 @@ class Query(Base):
          options: Options | MutableMapping[str, bool | None] | None = None,
          selection: Selection | MutableMapping[str, FacetValues] | None = None,
          files: list[FileDict] | None = None,
+         added_at: datetime | str | None = None,
+         updated_at: datetime | str | None = None,
      ) -> None:
          self.tracked = tracked
          self.require = require
@@ -219,6 +259,14 @@ class Query(Base):
          if files is not None:
              for file in files:
                  self.files.append(File.fromdict(file))
+         if added_at is not None:
+             self.added_at = parse_date(added_at)
+         else:
+             self.added_at = datetime.now(timezone.utc)
+         if updated_at is not None:
+             self.updated_at = parse_date(updated_at)
+         else:
+             self.updated_at = datetime.now(timezone.utc)
  
      @property
      def has_files(self) -> bool:
@@ -313,6 +361,8 @@ class Query(Base):
              result["options"] = self.options.asdict()
          if self.selection:
              result["selection"] = self.selection.asdict()
+         result["added_at"] = format_date(self.added_at)
+         result["updated_at"] = format_date(self.updated_at)
          return result
  
      def clone(self, compute_sha: bool = True) -> Query:
@@ -360,11 +410,6 @@ class Query(Base):
              self.tags.remove(tag)
          return tag is not None
  
-     def no_require(self) -> Query:
-         cl = self.clone(compute_sha=False)
-         cl._rich_no_require = True  # type: ignore [attr-defined]
-         return cl
- 
      def __lshift__(self, child: Query) -> Query:
          result = self.clone(compute_sha=False)
          # if self.name != child.require:
@@ -405,12 +450,16 @@ class Query(Base):
  
      __rich_measure__ = rich_measure_impl
  
-     def _rich_tree(self) -> Tree:
+     def _rich_tree(self, hide_require: bool = False) -> Tree:
          title = Text.from_markup(self.rich_name)
          if not self.tracked:
              title.append(" untracked", style="i red")
+         title.append(
+             f"\n│ added {format_date_iso(self.added_at)}"
+             f"\n│ updated {format_date_iso(self.updated_at)}"
+         )
          contents = Table.grid(padding=(0, 1))
-         if not hasattr(self, "_rich_no_require") and self.require is not None:
+         if not hide_require and self.require is not None:
              if len(self.require) == 40:
                  require = Text(short_sha(self.require), style="i green")
              else:
@@ -443,10 +492,44 @@ class Query(Base):
              count_ondisk, size_ondisk = self.files_count_size(FileStatus.Done)
              count_total, size_total = self.files_count_size()
              sizes = f"{format_size(size_ondisk)} / {format_size(size_total)}"
-             lens = f"{count_ondisk}/{count_total}"
-             contents.add_row(
-                 "files:", Text(f"{sizes} [{lens}]", style="magenta")
-             )
+             lens = f"{count_ondisk} / {count_total}"
+ 
+             # Add dataset completion info
+             complete_datasets = 0
+             total_datasets = 0
+             session = object_session(self)
+             orphaned_dataset_count = 0
+ 
+             if session is not None:
+                 from esgpull.models import sql
+ 
+                 dataset_stats = session.execute(
+                     sql.dataset.query_stats(self.sha)
+                 ).all()
+ 
+                 # Check for orphaned datasets (dataset_ids from files not in Dataset table)
+                 orphaned_dataset_count = (
+                     session.scalar(sql.dataset.orphaned(self.sha)) or 0
+                 )
+ 
+                 # Compute counts in Python - simpler and more maintainable
+                 total_datasets = len(dataset_stats)
+                 complete_datasets = sum(
+                     1 for d in dataset_stats if d.done_count == d.total_files
+                 )
+ 
+             contents.add_row("files:", Text(f"{lens}", style="magenta"))
+             if orphaned_dataset_count > 0:
+                 contents.add_row(
+                     "datasets:",
+                     "[magenta]? / ?[/] [yellow italic]<- update for accurate datasets[/]",
+                 )
+             else:
+                 contents.add_row(
+                     "datasets:",
+                     f"[magenta]{complete_datasets} / {total_datasets}",
+                 )
+             contents.add_row("size:", Text(f"{sizes}", style="magenta"))
          tree = Tree("", hide_root=True, guide_style="dim").add(title)
          if contents.row_count:
              tree.add(contents)
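
Note: added_at/updated_at round-trip through QUERY_DATE_FMT whenever a query is serialized to or from a dict (asdict and __init__ above). A self-contained sketch of what the parse_date/format_date wrappers amount to; the real helpers live in esgpull.utils, and passing datetimes through unchanged is an assumption based on their datetime | str signatures:

    from datetime import datetime

    QUERY_DATE_FMT = "%Y-%m-%d %H:%M:%S"

    def parse_date_sketch(d: datetime | str) -> datetime:
        # pass datetimes through, parse strings with the query format
        return d if isinstance(d, datetime) else datetime.strptime(d, QUERY_DATE_FMT)

    def format_date_sketch(d: datetime | str) -> str:
        return parse_date_sketch(d).strftime(QUERY_DATE_FMT)

    assert format_date_sketch("2025-05-07 14:49:43") == "2025-05-07 14:49:43"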
esgpull/models/sql.py CHANGED
@@ -3,6 +3,7 @@ import functools
  import sqlalchemy as sa
  
  from esgpull.models import Table
+ from esgpull.models.dataset import Dataset
  from esgpull.models.facet import Facet
  from esgpull.models.file import FileStatus
  from esgpull.models.query import File, Query, query_file_proxy, query_tag_proxy
@@ -11,15 +12,6 @@ from esgpull.models.synda_file import SyndaFile
  from esgpull.models.tag import Tag
  
  
- def count(item: Table) -> sa.Select[tuple[int]]:
-     table = item.__class__
-     return (
-         sa.select(sa.func.count("*"))
-         .select_from(table)
-         .filter_by(sha=item.sha)
-     )
- 
- 
  def count_table(table: type[Table]) -> sa.Select[tuple[int]]:
      return sa.select(sa.func.count("*")).select_from(table)
  
@@ -148,6 +140,45 @@ class file:
          return stmt
  
  
+ class dataset:
+     @staticmethod
+     @functools.cache
+     def query_stats(query_sha: str) -> sa.Select[tuple[str, int, int]]:
+         return (
+             sa.select(
+                 Dataset.dataset_id,
+                 Dataset.total_files,
+                 sa.func.count(
+                     sa.case((File.status == FileStatus.Done, 1))
+                 ).label("done_count"),
+             )
+             .join(File)
+             .join(query_file_proxy)
+             .filter(query_file_proxy.c.query_sha == query_sha)
+             .filter(File.dataset_id.isnot(None))
+             .group_by(Dataset.dataset_id, Dataset.total_files)
+         )
+ 
+     @staticmethod
+     @functools.cache
+     def orphaned(query_sha: str) -> sa.Select[tuple[int]]:
+         return (
+             sa.select(sa.func.count(sa.distinct(File.dataset_id)))
+             .join(query_file_proxy)
+             .filter(query_file_proxy.c.query_sha == query_sha)
+             .filter(File.dataset_id.isnot(None))
+             .filter(~File.dataset_id.in_(sa.select(Dataset.dataset_id)))
+         )
+ 
+     @staticmethod
+     @functools.cache
+     def is_complete(dataset: Dataset) -> sa.Select[tuple[bool]]:
+         return sa.select(
+             sa.func.count(sa.case((File.status == FileStatus.Done, 1)))
+             == dataset.total_files
+         ).where(File.dataset_id == dataset.dataset_id)
+ 
+ 
  class query:
      @staticmethod
      @functools.cache
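
Note: the new sql.dataset statements are consumed by Query._rich_tree above. A sketch of the same flow outside of rich rendering; session (an open SQLAlchemy session) and query (a tracked Query) are assumed to exist:

    from esgpull.models import sql

    # session: an open SQLAlchemy session; query: a tracked Query instance.
    stats = session.execute(sql.dataset.query_stats(query.sha)).all()
    total_datasets = len(stats)
    complete_datasets = sum(1 for row in stats if row.done_count == row.total_files)
    orphaned = session.scalar(sql.dataset.orphaned(query.sha)) or 0
    # orphaned > 0 means some files reference dataset_ids not yet present in the
    # dataset table, which is what triggers the "? / ?" hint in _rich_tree.
    print(f"{complete_datasets} / {total_datasets} datasets complete")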