esgpull 0.8.0__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- esgpull/cli/__init__.py +2 -2
- esgpull/cli/add.py +7 -1
- esgpull/cli/config.py +5 -21
- esgpull/cli/plugins.py +398 -0
- esgpull/cli/update.py +58 -15
- esgpull/cli/utils.py +16 -1
- esgpull/config.py +82 -25
- esgpull/constants.py +3 -0
- esgpull/context.py +9 -9
- esgpull/database.py +8 -2
- esgpull/download.py +3 -0
- esgpull/esgpull.py +49 -5
- esgpull/graph.py +1 -1
- esgpull/migrations/versions/0.9.0_update_tables.py +28 -0
- esgpull/migrations/versions/d14f179e553c_file_add_composite_index_dataset_id_.py +32 -0
- esgpull/migrations/versions/e7edab5d4e4b_add_dataset_tracking.py +39 -0
- esgpull/models/__init__.py +2 -1
- esgpull/models/base.py +31 -14
- esgpull/models/dataset.py +48 -5
- esgpull/models/query.py +58 -14
- esgpull/models/sql.py +40 -9
- esgpull/plugin.py +574 -0
- esgpull/processor.py +3 -3
- esgpull/tui.py +23 -1
- esgpull/utils.py +5 -1
- {esgpull-0.8.0.dist-info → esgpull-0.9.0.dist-info}/METADATA +2 -1
- {esgpull-0.8.0.dist-info → esgpull-0.9.0.dist-info}/RECORD +30 -26
- esgpull/cli/datasets.py +0 -78
- {esgpull-0.8.0.dist-info → esgpull-0.9.0.dist-info}/WHEEL +0 -0
- {esgpull-0.8.0.dist-info → esgpull-0.9.0.dist-info}/entry_points.txt +0 -0
- {esgpull-0.8.0.dist-info → esgpull-0.9.0.dist-info}/licenses/LICENSE +0 -0
esgpull/models/query.py
CHANGED
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
from collections.abc import Iterator, MutableMapping, Sequence
|
|
4
4
|
from datetime import datetime, timezone
|
|
5
|
-
from typing import Any, Literal
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
6
6
|
|
|
7
7
|
import sqlalchemy as sa
|
|
8
8
|
from rich.console import Console, ConsoleOptions
|
|
@@ -15,10 +15,15 @@ from typing_extensions import NotRequired, TypedDict
|
|
|
15
15
|
from esgpull import utils
|
|
16
16
|
from esgpull.exceptions import UntrackableQuery
|
|
17
17
|
from esgpull.models.base import Base, Sha
|
|
18
|
+
from esgpull.models.dataset import Dataset
|
|
18
19
|
from esgpull.models.file import FileDict, FileStatus
|
|
19
20
|
from esgpull.models.options import Options
|
|
20
21
|
from esgpull.models.selection import FacetValues, Selection
|
|
21
22
|
from esgpull.models.tag import Tag
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
from esgpull.models.dataset import Dataset
|
|
26
|
+
|
|
22
27
|
from esgpull.models.utils import (
|
|
23
28
|
find_int,
|
|
24
29
|
find_str,
|
|
@@ -55,9 +60,14 @@ query_tag_proxy = sa.Table(
|
|
|
55
60
|
|
|
56
61
|
class File(Base):
|
|
57
62
|
__tablename__ = "file"
|
|
63
|
+
__table_args__ = (
|
|
64
|
+
sa.Index("ix_file_dataset_status", "dataset_id", "status"),
|
|
65
|
+
)
|
|
58
66
|
|
|
59
67
|
file_id: Mapped[str] = mapped_column(sa.String(255), unique=True)
|
|
60
|
-
dataset_id: Mapped[str] = mapped_column(
|
|
68
|
+
dataset_id: Mapped[str] = mapped_column(
|
|
69
|
+
sa.String(255), sa.ForeignKey("dataset.dataset_id")
|
|
70
|
+
)
|
|
61
71
|
master_id: Mapped[str] = mapped_column(sa.String(255))
|
|
62
72
|
url: Mapped[str] = mapped_column(sa.String(255))
|
|
63
73
|
version: Mapped[str] = mapped_column(sa.String(16))
|
|
@@ -76,6 +86,11 @@ class File(Base):
|
|
|
76
86
|
back_populates="files",
|
|
77
87
|
repr=False,
|
|
78
88
|
)
|
|
89
|
+
dataset: Mapped["Dataset"] = relationship(
|
|
90
|
+
back_populates="files",
|
|
91
|
+
init=False,
|
|
92
|
+
repr=False,
|
|
93
|
+
)
|
|
79
94
|
|
|
80
95
|
def _as_bytes(self) -> bytes:
|
|
81
96
|
self_tuple = (self.file_id, self.checksum)
|
|
@@ -100,7 +115,7 @@ class File(Base):
|
|
|
100
115
|
size=source["size"],
|
|
101
116
|
)
|
|
102
117
|
if "status" in source:
|
|
103
|
-
result.status = FileStatus(source.get("status"))
|
|
118
|
+
result.status = FileStatus(source.get("status").lower())
|
|
104
119
|
return result
|
|
105
120
|
|
|
106
121
|
@classmethod
|
|
@@ -395,11 +410,6 @@ class Query(Base):
|
|
|
395
410
|
self.tags.remove(tag)
|
|
396
411
|
return tag is not None
|
|
397
412
|
|
|
398
|
-
def no_require(self) -> Query:
|
|
399
|
-
cl = self.clone(compute_sha=False)
|
|
400
|
-
cl._rich_no_require = True # type: ignore [attr-defined]
|
|
401
|
-
return cl
|
|
402
|
-
|
|
403
413
|
def __lshift__(self, child: Query) -> Query:
|
|
404
414
|
result = self.clone(compute_sha=False)
|
|
405
415
|
# if self.name != child.require:
|
|
@@ -440,7 +450,7 @@ class Query(Base):
|
|
|
440
450
|
|
|
441
451
|
__rich_measure__ = rich_measure_impl
|
|
442
452
|
|
|
443
|
-
def _rich_tree(self) -> Tree:
|
|
453
|
+
def _rich_tree(self, hide_require: bool = False) -> Tree:
|
|
444
454
|
title = Text.from_markup(self.rich_name)
|
|
445
455
|
if not self.tracked:
|
|
446
456
|
title.append(" untracked", style="i red")
|
|
@@ -449,7 +459,7 @@ class Query(Base):
|
|
|
449
459
|
f"\n│ updated {format_date_iso(self.updated_at)}"
|
|
450
460
|
)
|
|
451
461
|
contents = Table.grid(padding=(0, 1))
|
|
452
|
-
if not hasattr(self, "_rich_no_require") and self.require is not None:
|
|
462
|
+
if not hide_require and self.require is not None:
|
|
453
463
|
if len(self.require) == 40:
|
|
454
464
|
require = Text(short_sha(self.require), style="i green")
|
|
455
465
|
else:
|
|
@@ -482,10 +492,44 @@ class Query(Base):
|
|
|
482
492
|
count_ondisk, size_ondisk = self.files_count_size(FileStatus.Done)
|
|
483
493
|
count_total, size_total = self.files_count_size()
|
|
484
494
|
sizes = f"{format_size(size_ondisk)} / {format_size(size_total)}"
|
|
485
|
-
lens = f"{count_ondisk}/{count_total}"
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
495
|
+
lens = f"{count_ondisk} / {count_total}"
|
|
496
|
+
|
|
497
|
+
# Add dataset completion info
|
|
498
|
+
complete_datasets = 0
|
|
499
|
+
total_datasets = 0
|
|
500
|
+
session = object_session(self)
|
|
501
|
+
orphaned_dataset_count = 0
|
|
502
|
+
|
|
503
|
+
if session is not None:
|
|
504
|
+
from esgpull.models import sql
|
|
505
|
+
|
|
506
|
+
dataset_stats = session.execute(
|
|
507
|
+
sql.dataset.query_stats(self.sha)
|
|
508
|
+
).all()
|
|
509
|
+
|
|
510
|
+
# Check for orphaned datasets (dataset_ids from files not in Dataset table)
|
|
511
|
+
orphaned_dataset_count = (
|
|
512
|
+
session.scalar(sql.dataset.orphaned(self.sha)) or 0
|
|
513
|
+
)
|
|
514
|
+
|
|
515
|
+
# Compute counts in Python - simpler and more maintainable
|
|
516
|
+
total_datasets = len(dataset_stats)
|
|
517
|
+
complete_datasets = sum(
|
|
518
|
+
1 for d in dataset_stats if d.done_count == d.total_files
|
|
519
|
+
)
|
|
520
|
+
|
|
521
|
+
contents.add_row("files:", Text(f"{lens}", style="magenta"))
|
|
522
|
+
if orphaned_dataset_count > 0:
|
|
523
|
+
contents.add_row(
|
|
524
|
+
"datasets:",
|
|
525
|
+
"[magenta]? / ?[/] [yellow italic]<- update for accurate datasets[/]",
|
|
526
|
+
)
|
|
527
|
+
else:
|
|
528
|
+
contents.add_row(
|
|
529
|
+
"datasets:",
|
|
530
|
+
f"[magenta]{complete_datasets} / {total_datasets}",
|
|
531
|
+
)
|
|
532
|
+
contents.add_row("size:", Text(f"{sizes}", style="magenta"))
|
|
489
533
|
tree = Tree("", hide_root=True, guide_style="dim").add(title)
|
|
490
534
|
if contents.row_count:
|
|
491
535
|
tree.add(contents)
|
esgpull/models/sql.py
CHANGED
|
@@ -3,6 +3,7 @@ import functools
|
|
|
3
3
|
import sqlalchemy as sa
|
|
4
4
|
|
|
5
5
|
from esgpull.models import Table
|
|
6
|
+
from esgpull.models.dataset import Dataset
|
|
6
7
|
from esgpull.models.facet import Facet
|
|
7
8
|
from esgpull.models.file import FileStatus
|
|
8
9
|
from esgpull.models.query import File, Query, query_file_proxy, query_tag_proxy
|
|
@@ -11,15 +12,6 @@ from esgpull.models.synda_file import SyndaFile
|
|
|
11
12
|
from esgpull.models.tag import Tag
|
|
12
13
|
|
|
13
14
|
|
|
14
|
-
def count(item: Table) -> sa.Select[tuple[int]]:
|
|
15
|
-
table = item.__class__
|
|
16
|
-
return (
|
|
17
|
-
sa.select(sa.func.count("*"))
|
|
18
|
-
.select_from(table)
|
|
19
|
-
.filter_by(sha=item.sha)
|
|
20
|
-
)
|
|
21
|
-
|
|
22
|
-
|
|
23
15
|
def count_table(table: type[Table]) -> sa.Select[tuple[int]]:
|
|
24
16
|
return sa.select(sa.func.count("*")).select_from(table)
|
|
25
17
|
|
|
@@ -148,6 +140,45 @@ class file:
|
|
|
148
140
|
return stmt
|
|
149
141
|
|
|
150
142
|
|
|
143
|
+
class dataset:
|
|
144
|
+
@staticmethod
|
|
145
|
+
@functools.cache
|
|
146
|
+
def query_stats(query_sha: str) -> sa.Select[tuple[str, int, int]]:
|
|
147
|
+
return (
|
|
148
|
+
sa.select(
|
|
149
|
+
Dataset.dataset_id,
|
|
150
|
+
Dataset.total_files,
|
|
151
|
+
sa.func.count(
|
|
152
|
+
sa.case((File.status == FileStatus.Done, 1))
|
|
153
|
+
).label("done_count"),
|
|
154
|
+
)
|
|
155
|
+
.join(File)
|
|
156
|
+
.join(query_file_proxy)
|
|
157
|
+
.filter(query_file_proxy.c.query_sha == query_sha)
|
|
158
|
+
.filter(File.dataset_id.isnot(None))
|
|
159
|
+
.group_by(Dataset.dataset_id, Dataset.total_files)
|
|
160
|
+
)
|
|
161
|
+
|
|
162
|
+
@staticmethod
|
|
163
|
+
@functools.cache
|
|
164
|
+
def orphaned(query_sha: str) -> sa.Select[tuple[int]]:
|
|
165
|
+
return (
|
|
166
|
+
sa.select(sa.func.count(sa.distinct(File.dataset_id)))
|
|
167
|
+
.join(query_file_proxy)
|
|
168
|
+
.filter(query_file_proxy.c.query_sha == query_sha)
|
|
169
|
+
.filter(File.dataset_id.isnot(None))
|
|
170
|
+
.filter(~File.dataset_id.in_(sa.select(Dataset.dataset_id)))
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
@staticmethod
|
|
174
|
+
@functools.cache
|
|
175
|
+
def is_complete(dataset: Dataset) -> sa.Select[tuple[bool]]:
|
|
176
|
+
return sa.select(
|
|
177
|
+
sa.func.count(sa.case((File.status == FileStatus.Done, 1)))
|
|
178
|
+
== dataset.total_files
|
|
179
|
+
).where(File.dataset_id == dataset.dataset_id)
|
|
180
|
+
|
|
181
|
+
|
|
151
182
|
class query:
|
|
152
183
|
@staticmethod
|
|
153
184
|
@functools.cache
|