esgpull 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
esgpull/models/query.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  from collections.abc import Iterator, MutableMapping, Sequence
4
4
  from datetime import datetime, timezone
5
- from typing import Any, Literal
5
+ from typing import TYPE_CHECKING, Any, Literal
6
6
 
7
7
  import sqlalchemy as sa
8
8
  from rich.console import Console, ConsoleOptions
@@ -15,10 +15,15 @@ from typing_extensions import NotRequired, TypedDict
15
15
  from esgpull import utils
16
16
  from esgpull.exceptions import UntrackableQuery
17
17
  from esgpull.models.base import Base, Sha
18
+ from esgpull.models.dataset import Dataset
18
19
  from esgpull.models.file import FileDict, FileStatus
19
20
  from esgpull.models.options import Options
20
21
  from esgpull.models.selection import FacetValues, Selection
21
22
  from esgpull.models.tag import Tag
23
+
24
+ if TYPE_CHECKING:
25
+ from esgpull.models.dataset import Dataset
26
+
22
27
  from esgpull.models.utils import (
23
28
  find_int,
24
29
  find_str,
@@ -55,9 +60,14 @@ query_tag_proxy = sa.Table(
55
60
 
56
61
  class File(Base):
57
62
  __tablename__ = "file"
63
+ __table_args__ = (
64
+ sa.Index("ix_file_dataset_status", "dataset_id", "status"),
65
+ )
58
66
 
59
67
  file_id: Mapped[str] = mapped_column(sa.String(255), unique=True)
60
- dataset_id: Mapped[str] = mapped_column(sa.String(255))
68
+ dataset_id: Mapped[str] = mapped_column(
69
+ sa.String(255), sa.ForeignKey("dataset.dataset_id")
70
+ )
61
71
  master_id: Mapped[str] = mapped_column(sa.String(255))
62
72
  url: Mapped[str] = mapped_column(sa.String(255))
63
73
  version: Mapped[str] = mapped_column(sa.String(16))
@@ -76,6 +86,11 @@ class File(Base):
76
86
  back_populates="files",
77
87
  repr=False,
78
88
  )
89
+ dataset: Mapped["Dataset"] = relationship(
90
+ back_populates="files",
91
+ init=False,
92
+ repr=False,
93
+ )
79
94
 
80
95
  def _as_bytes(self) -> bytes:
81
96
  self_tuple = (self.file_id, self.checksum)
@@ -100,7 +115,7 @@ class File(Base):
100
115
  size=source["size"],
101
116
  )
102
117
  if "status" in source:
103
- result.status = FileStatus(source.get("source"))
118
+ result.status = FileStatus(source.get("status").lower())
104
119
  return result
105
120
 
106
121
  @classmethod
@@ -395,11 +410,6 @@ class Query(Base):
395
410
  self.tags.remove(tag)
396
411
  return tag is not None
397
412
 
398
- def no_require(self) -> Query:
399
- cl = self.clone(compute_sha=False)
400
- cl._rich_no_require = True # type: ignore [attr-defined]
401
- return cl
402
-
403
413
  def __lshift__(self, child: Query) -> Query:
404
414
  result = self.clone(compute_sha=False)
405
415
  # if self.name != child.require:
@@ -440,7 +450,7 @@ class Query(Base):
440
450
 
441
451
  __rich_measure__ = rich_measure_impl
442
452
 
443
- def _rich_tree(self) -> Tree:
453
+ def _rich_tree(self, hide_require: bool = False) -> Tree:
444
454
  title = Text.from_markup(self.rich_name)
445
455
  if not self.tracked:
446
456
  title.append(" untracked", style="i red")
@@ -449,7 +459,7 @@ class Query(Base):
449
459
  f"\n│ updated {format_date_iso(self.updated_at)}"
450
460
  )
451
461
  contents = Table.grid(padding=(0, 1))
452
- if not hasattr(self, "_rich_no_require") and self.require is not None:
462
+ if not hide_require and self.require is not None:
453
463
  if len(self.require) == 40:
454
464
  require = Text(short_sha(self.require), style="i green")
455
465
  else:
@@ -482,10 +492,44 @@ class Query(Base):
482
492
  count_ondisk, size_ondisk = self.files_count_size(FileStatus.Done)
483
493
  count_total, size_total = self.files_count_size()
484
494
  sizes = f"{format_size(size_ondisk)} / {format_size(size_total)}"
485
- lens = f"{count_ondisk}/{count_total}"
486
- contents.add_row(
487
- "files:", Text(f"{sizes} [{lens}]", style="magenta")
488
- )
495
+ lens = f"{count_ondisk} / {count_total}"
496
+
497
+ # Add dataset completion info
498
+ complete_datasets = 0
499
+ total_datasets = 0
500
+ session = object_session(self)
501
+ orphaned_dataset_count = 0
502
+
503
+ if session is not None:
504
+ from esgpull.models import sql
505
+
506
+ dataset_stats = session.execute(
507
+ sql.dataset.query_stats(self.sha)
508
+ ).all()
509
+
510
+ # Check for orphaned datasets (dataset_ids from files not in Dataset table)
511
+ orphaned_dataset_count = (
512
+ session.scalar(sql.dataset.orphaned(self.sha)) or 0
513
+ )
514
+
515
+ # Compute counts in Python - simpler and more maintainable
516
+ total_datasets = len(dataset_stats)
517
+ complete_datasets = sum(
518
+ 1 for d in dataset_stats if d.done_count == d.total_files
519
+ )
520
+
521
+ contents.add_row("files:", Text(f"{lens}", style="magenta"))
522
+ if orphaned_dataset_count > 0:
523
+ contents.add_row(
524
+ "datasets:",
525
+ "[magenta]? / ?[/] [yellow italic]<- update for accurate datasets[/]",
526
+ )
527
+ else:
528
+ contents.add_row(
529
+ "datasets:",
530
+ f"[magenta]{complete_datasets} / {total_datasets}",
531
+ )
532
+ contents.add_row("size:", Text(f"{sizes}", style="magenta"))
489
533
  tree = Tree("", hide_root=True, guide_style="dim").add(title)
490
534
  if contents.row_count:
491
535
  tree.add(contents)
esgpull/models/sql.py CHANGED
@@ -3,6 +3,7 @@ import functools
3
3
  import sqlalchemy as sa
4
4
 
5
5
  from esgpull.models import Table
6
+ from esgpull.models.dataset import Dataset
6
7
  from esgpull.models.facet import Facet
7
8
  from esgpull.models.file import FileStatus
8
9
  from esgpull.models.query import File, Query, query_file_proxy, query_tag_proxy
@@ -11,15 +12,6 @@ from esgpull.models.synda_file import SyndaFile
11
12
  from esgpull.models.tag import Tag
12
13
 
13
14
 
14
- def count(item: Table) -> sa.Select[tuple[int]]:
15
- table = item.__class__
16
- return (
17
- sa.select(sa.func.count("*"))
18
- .select_from(table)
19
- .filter_by(sha=item.sha)
20
- )
21
-
22
-
23
15
  def count_table(table: type[Table]) -> sa.Select[tuple[int]]:
24
16
  return sa.select(sa.func.count("*")).select_from(table)
25
17
 
@@ -148,6 +140,45 @@ class file:
148
140
  return stmt
149
141
 
150
142
 
143
+ class dataset:
144
+ @staticmethod
145
+ @functools.cache
146
+ def query_stats(query_sha: str) -> sa.Select[tuple[str, int, int]]:
147
+ return (
148
+ sa.select(
149
+ Dataset.dataset_id,
150
+ Dataset.total_files,
151
+ sa.func.count(
152
+ sa.case((File.status == FileStatus.Done, 1))
153
+ ).label("done_count"),
154
+ )
155
+ .join(File)
156
+ .join(query_file_proxy)
157
+ .filter(query_file_proxy.c.query_sha == query_sha)
158
+ .filter(File.dataset_id.isnot(None))
159
+ .group_by(Dataset.dataset_id, Dataset.total_files)
160
+ )
161
+
162
+ @staticmethod
163
+ @functools.cache
164
+ def orphaned(query_sha: str) -> sa.Select[tuple[int]]:
165
+ return (
166
+ sa.select(sa.func.count(sa.distinct(File.dataset_id)))
167
+ .join(query_file_proxy)
168
+ .filter(query_file_proxy.c.query_sha == query_sha)
169
+ .filter(File.dataset_id.isnot(None))
170
+ .filter(~File.dataset_id.in_(sa.select(Dataset.dataset_id)))
171
+ )
172
+
173
+ @staticmethod
174
+ @functools.cache
175
+ def is_complete(dataset: Dataset) -> sa.Select[tuple[bool]]:
176
+ return sa.select(
177
+ sa.func.count(sa.case((File.status == FileStatus.Done, 1)))
178
+ == dataset.total_files
179
+ ).where(File.dataset_id == dataset.dataset_id)
180
+
181
+
151
182
  class query:
152
183
  @staticmethod
153
184
  @functools.cache