esgpull 0.8.0__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
- esgpull/cli/__init__.py +2 -2
- esgpull/cli/add.py +7 -1
- esgpull/cli/config.py +5 -21
- esgpull/cli/plugins.py +398 -0
- esgpull/cli/remove.py +9 -3
- esgpull/cli/self.py +1 -1
- esgpull/cli/update.py +78 -35
- esgpull/cli/utils.py +16 -1
- esgpull/config.py +83 -26
- esgpull/constants.py +3 -0
- esgpull/context.py +9 -9
- esgpull/database.py +21 -7
- esgpull/download.py +3 -0
- esgpull/esgpull.py +49 -5
- esgpull/fs.py +9 -20
- esgpull/graph.py +1 -1
- esgpull/migrations/versions/0.9.0_update_tables.py +28 -0
- esgpull/migrations/versions/0.9.1_update_tables.py +28 -0
- esgpull/migrations/versions/d14f179e553c_file_add_composite_index_dataset_id_.py +32 -0
- esgpull/migrations/versions/e7edab5d4e4b_add_dataset_tracking.py +39 -0
- esgpull/models/__init__.py +2 -1
- esgpull/models/base.py +31 -14
- esgpull/models/dataset.py +48 -5
- esgpull/models/query.py +58 -14
- esgpull/models/sql.py +48 -9
- esgpull/plugin.py +574 -0
- esgpull/processor.py +3 -3
- esgpull/tui.py +23 -1
- esgpull/utils.py +5 -1
- {esgpull-0.8.0.dist-info → esgpull-0.9.1.dist-info}/METADATA +19 -3
- {esgpull-0.8.0.dist-info → esgpull-0.9.1.dist-info}/RECORD +34 -29
- esgpull/cli/datasets.py +0 -78
- {esgpull-0.8.0.dist-info → esgpull-0.9.1.dist-info}/WHEEL +0 -0
- {esgpull-0.8.0.dist-info → esgpull-0.9.1.dist-info}/entry_points.txt +0 -0
- {esgpull-0.8.0.dist-info → esgpull-0.9.1.dist-info}/licenses/LICENSE +0 -0
esgpull/fs.py
CHANGED
@@ -10,7 +10,7 @@ from shutil import copyfile
 import aiofiles
 from aiofiles.threadpool.binary import AsyncBufferedIOBase
 
-from esgpull.config import Config
+from esgpull.config import Config, Paths
 from esgpull.models import File
 from esgpull.result import Err, Ok, Result
 from esgpull.tui import logger
@@ -63,45 +63,34 @@ class Digest:
 
 @dataclass
 class Filesystem:
-    auth: Path
-    data: Path
-    db: Path
-    log: Path
-    tmp: Path
+    paths: Paths
     disable_checksum: bool = False
     install: InitVar[bool] = True
 
     @staticmethod
     def from_config(config: Config, install: bool = False) -> Filesystem:
         return Filesystem(
-            auth=config.paths.auth,
-            data=config.paths.data,
-            db=config.paths.db,
-            log=config.paths.log,
-            tmp=config.paths.tmp,
+            paths=config.paths,
             disable_checksum=config.download.disable_checksum,
             install=install,
         )
 
     def __post_init__(self, install: bool = True) -> None:
         if install:
-            self.auth.mkdir(parents=True, exist_ok=True)
-            self.data.mkdir(parents=True, exist_ok=True)
-            self.db.mkdir(parents=True, exist_ok=True)
-            self.log.mkdir(parents=True, exist_ok=True)
-            self.tmp.mkdir(parents=True, exist_ok=True)
+            for path in self.paths.values():
+                path.mkdir(parents=True, exist_ok=True)
 
     def __getitem__(self, file: File) -> FilePath:
         if not isinstance(file, File):
             raise TypeError(file)
         return FilePath(
-            drs=self.data / file.local_path / file.filename,
-            tmp=self.tmp / f"{file.sha}.part",
+            drs=self.paths.data / file.local_path / file.filename,
+            tmp=self.paths.tmp / f"{file.sha}.part",
         )
 
     def glob_netcdf(self) -> Iterator[Path]:
-        for path in self.data.glob("**/*.nc"):
-            yield path.relative_to(self.data)
+        for path in self.paths.data.glob("**/*.nc"):
+            yield path.relative_to(self.paths.data)
 
     def open(self, file: File) -> FileObject:
         return FileObject(self[file])
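Net effect of the Filesystem refactor above: four separate Path fields collapse into one Paths container, and directory creation becomes a single loop, so a directory added to Paths is picked up without touching Filesystem again. A minimal runnable sketch of the pattern, where PathsSketch is a hypothetical stand-in for esgpull.config.Paths (the diff only shows that the real class exposes a values() iterator):

# Sketch of the Paths-based directory setup from Filesystem.__post_init__,
# assuming a mapping-like container similar to esgpull.config.Paths.
from dataclasses import dataclass, fields
from pathlib import Path


@dataclass
class PathsSketch:
    # Hypothetical stand-in for esgpull.config.Paths
    data: Path = Path("data")
    db: Path = Path("db")
    log: Path = Path("log")
    tmp: Path = Path("tmp")

    def values(self):
        # Yield every configured directory, mirroring Paths.values()
        for f in fields(self):
            yield getattr(self, f.name)


paths = PathsSketch()
for path in paths.values():
    # One loop replaces the per-attribute mkdir calls from 0.8.0
    path.mkdir(parents=True, exist_ok=True)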
esgpull/graph.py
CHANGED
@@ -418,7 +418,7 @@ class Graph:
         if keep_require:
             query_tree = query._rich_tree()
         else:
-            query_tree = query.no_require()._rich_tree()
+            query_tree = query._rich_tree(hide_require=True)
         if query_tree is not None:
             tree.add(query_tree)
             self.fill_tree(query, query_tree)
esgpull/migrations/versions/0.9.0_update_tables.py
ADDED
@@ -0,0 +1,28 @@
+"""update tables
+
+Revision ID: 0.9.0
+Revises: d14f179e553c
+Create Date: 2025-07-07 14:54:58.433022
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '0.9.0'
+down_revision = 'd14f179e553c'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
esgpull/migrations/versions/0.9.1_update_tables.py
ADDED
@@ -0,0 +1,28 @@
+"""update tables
+
+Revision ID: 0.9.1
+Revises: 0.9.0
+Create Date: 2025-08-08 10:38:14.204594
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '0.9.1'
+down_revision = '0.9.0'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
esgpull/migrations/versions/d14f179e553c_file_add_composite_index_dataset_id_.py
ADDED
@@ -0,0 +1,32 @@
+"""file_add_composite_index_dataset_id_status
+
+Revision ID: d14f179e553c
+Revises: e7edab5d4e4b
+Create Date: 2025-06-18 16:05:35.721085
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'd14f179e553c'
+down_revision = 'e7edab5d4e4b'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.create_index('ix_file_dataset_status', ['dataset_id', 'status'], unique=False)
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.drop_index('ix_file_dataset_status')
+
+    # ### end Alembic commands ###
esgpull/migrations/versions/e7edab5d4e4b_add_dataset_tracking.py
ADDED
@@ -0,0 +1,39 @@
+"""add_dataset_tracking
+
+Revision ID: e7edab5d4e4b
+Revises: 0.8.0
+Create Date: 2025-05-23 17:38:22.066153
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'e7edab5d4e4b'
+down_revision = '0.8.0'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('dataset',
+    sa.Column('dataset_id', sa.String(length=255), nullable=False),
+    sa.Column('total_files', sa.Integer(), nullable=False),
+    sa.Column('created_at', sa.DateTime(), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
+    sa.Column('updated_at', sa.DateTime(), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
+    sa.PrimaryKeyConstraint('dataset_id')
+    )
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.create_foreign_key('fk_file_dataset', 'dataset', ['dataset_id'], ['dataset_id'])
+
+    # ### end Alembic commands ###
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.drop_constraint('fk_file_dataset', type_='foreignkey')
+
+    op.drop_table('dataset')
+    # ### end Alembic commands ###
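The four revisions above extend the migration chain from the previous 0.8.0 head. One way to sanity-check the ordering is to follow the down_revision pointers; the sketch below hardcodes the (revision, down_revision) pairs from these files rather than importing the migration modules:

# Reconstruct the Alembic revision order from the (revision, down_revision)
# pairs declared in the four migration files above.
pairs = {
    "e7edab5d4e4b": "0.8.0",
    "d14f179e553c": "e7edab5d4e4b",
    "0.9.0": "d14f179e553c",
    "0.9.1": "0.9.0",
}
parents = set(pairs.values())
head = next(rev for rev in pairs if rev not in parents)  # "0.9.1"

chain = [head]
while chain[-1] in pairs:
    chain.append(pairs[chain[-1]])
print(" <- ".join(chain))
# 0.9.1 <- 0.9.0 <- d14f179e553c <- e7edab5d4e4b <- 0.8.0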
esgpull/models/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 from typing import TypeVar
 
 from esgpull.models.base import Base
-from esgpull.models.dataset import Dataset
+from esgpull.models.dataset import Dataset, DatasetRecord
 from esgpull.models.facet import Facet
 from esgpull.models.file import FastFile, FileStatus
 from esgpull.models.options import Option, Options
@@ -15,6 +15,7 @@ Table = TypeVar("Table", bound=Base)
 __all__ = [
     "Base",
     "Dataset",
+    "DatasetRecord",
     "Facet",
     "FastFile",
     "File",
esgpull/models/base.py
CHANGED
@@ -16,16 +16,10 @@ T = TypeVar("T")
 Sha = sa.String(40)
 
 
-class Base(MappedAsDataclass, DeclarativeBase):
+# Base class for all models - provides core SQLAlchemy functionality
+class _BaseModel(MappedAsDataclass, DeclarativeBase):
     __dataclass_fields__: ClassVar[dict[str, Field]]
-    __sql_attrs__ = ("id", "sha", "_sa_instance_state", "__dataclass_fields__")
-
-    sha: Mapped[str] = mapped_column(
-        Sha,
-        init=False,
-        repr=False,
-        primary_key=True,
-    )
+    __sql_attrs__ = ("id", "_sa_instance_state", "__dataclass_fields__")
 
     @property
     def _names(self) -> tuple[str, ...]:
@@ -36,15 +30,38 @@ class Base(MappedAsDataclass, DeclarativeBase):
             result += (name,)
         return result
 
+    @property
+    def state(self) -> InstanceState:
+        return cast(InstanceState, sa.inspect(self))
+
+    def asdict(self) -> Mapping[str, Any]:
+        raise NotImplementedError
+
+
+# Base class for models that use SHA as primary key
+class Base(_BaseModel):
+    __abstract__ = True
+    __sql_attrs__ = ("id", "sha", "_sa_instance_state", "__dataclass_fields__")
+
+    sha: Mapped[str] = mapped_column(
+        Sha,
+        init=False,
+        repr=False,
+        primary_key=True,
+    )
+
     def _as_bytes(self) -> bytes:
         raise NotImplementedError
 
     def compute_sha(self) -> None:
         self.sha = sha1(self._as_bytes()).hexdigest()
 
-    @property
-    def state(self) -> InstanceState:
-        return cast(InstanceState, sa.inspect(self))
 
-
-
+# Base class for models that don't use SHA (e.g., Dataset)
+class BaseNoSHA(_BaseModel):
+    __abstract__ = True
+    __sql_attrs__ = ("id", "_sa_instance_state", "__dataclass_fields__")
+
+
+# Keep SHAKeyMixin for backward compatibility if needed
+SHAKeyMixin = Base
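After this split, _BaseModel carries the shared dataclass and ORM plumbing, Base layers the SHA-1 primary key on top, and BaseNoSHA leaves key choice to the subclass (Dataset declares dataset_id as its own primary key). A hedged sketch of what that implies when defining models, assuming esgpull 0.9.1 is installed; ShaThing and PlainThing are hypothetical models, not part of the package:

# Sketch: the two bases differ only in primary-key handling.
import sqlalchemy as sa
from sqlalchemy.orm import Mapped, mapped_column

from esgpull.models.base import Base, BaseNoSHA


class ShaThing(Base):
    # Inherits the String(40) `sha` primary key from Base
    __tablename__ = "sha_thing"
    name: Mapped[str] = mapped_column(sa.String(64))


class PlainThing(BaseNoSHA):
    # Must declare its own primary key, as Dataset does with dataset_id
    __tablename__ = "plain_thing"
    name: Mapped[str] = mapped_column(sa.String(64), primary_key=True)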
esgpull/models/dataset.py
CHANGED
@@ -1,12 +1,22 @@
 from __future__ import annotations
 
-from dataclasses import dataclass
+from collections.abc import Mapping
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any
 
+import sqlalchemy as sa
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+
+from esgpull.models.base import BaseNoSHA
 from esgpull.models.utils import find_int, find_str
 
+if TYPE_CHECKING:
+    from esgpull.models.query import File
+
 
 @dataclass
-class Dataset:
+class DatasetRecord:
     dataset_id: str
     master_id: str
     version: str
@@ -15,7 +25,7 @@ class Dataset:
     number_of_files: int
 
     @classmethod
-    def serialize(cls, source: dict) -> Dataset:
+    def serialize(cls, source: dict) -> DatasetRecord:
        dataset_id = find_str(source["instance_id"]).partition("|")[0]
        master_id, version = dataset_id.rsplit(".", 1)
        data_node = find_str(source["data_node"])
@@ -30,5 +40,38 @@ class Dataset:
             number_of_files=number_of_files,
         )
 
-
-
+
+class Dataset(BaseNoSHA):
+    __tablename__ = "dataset"
+
+    dataset_id: Mapped[str] = mapped_column(sa.String(255), primary_key=True)
+    total_files: Mapped[int] = mapped_column(sa.Integer)
+    created_at: Mapped[datetime] = mapped_column(
+        server_default=sa.func.now(),
+        default_factory=lambda: datetime.now(timezone.utc),
+        init=False,
+    )
+    updated_at: Mapped[datetime] = mapped_column(
+        server_default=sa.func.now(),
+        default_factory=lambda: datetime.now(timezone.utc),
+        init=False,
+    )
+    files: Mapped[list[File]] = relationship(
+        back_populates="dataset",
+        foreign_keys="[File.dataset_id]",
+        primaryjoin="Dataset.dataset_id==File.dataset_id",
+        default_factory=list,
+        init=False,
+        repr=False,
+    )
+
+    def asdict(self) -> Mapping[str, Any]:
+        return {
+            "dataset_id": self.dataset_id,
+            "total_files": self.total_files,
+            "created_at": self.created_at.isoformat(),
+            "updated_at": self.updated_at.isoformat(),
+        }
+
+    def __hash__(self) -> int:
+        return hash(self.dataset_id)
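DatasetRecord keeps the old plain-dataclass role of parsing search-API results through serialize(), while the new ORM Dataset persists per-dataset file totals. A small usage sketch, assuming esgpull 0.9.1 is installed; the dataset_id value is illustrative:

from esgpull.models import Dataset

# created_at / updated_at are init=False and default to the current UTC time
ds = Dataset(
    dataset_id="CMIP6.CMIP.IPSL.IPSL-CM6A-LR.historical.r1i1p1f1.day.tas.gr.v20180803",
    total_files=3,
)
print(ds.asdict())
# {'dataset_id': 'CMIP6....v20180803', 'total_files': 3,
#  'created_at': '...', 'updated_at': '...'}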
esgpull/models/query.py
CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 from collections.abc import Iterator, MutableMapping, Sequence
 from datetime import datetime, timezone
-from typing import Any, Literal
+from typing import TYPE_CHECKING, Any, Literal
 
 import sqlalchemy as sa
 from rich.console import Console, ConsoleOptions
@@ -15,10 +15,15 @@ from typing_extensions import NotRequired, TypedDict
 from esgpull import utils
 from esgpull.exceptions import UntrackableQuery
 from esgpull.models.base import Base, Sha
+from esgpull.models.dataset import Dataset
 from esgpull.models.file import FileDict, FileStatus
 from esgpull.models.options import Options
 from esgpull.models.selection import FacetValues, Selection
 from esgpull.models.tag import Tag
+
+if TYPE_CHECKING:
+    from esgpull.models.dataset import Dataset
+
 from esgpull.models.utils import (
     find_int,
     find_str,
@@ -55,9 +60,14 @@ query_tag_proxy = sa.Table(
 
 class File(Base):
     __tablename__ = "file"
+    __table_args__ = (
+        sa.Index("ix_file_dataset_status", "dataset_id", "status"),
+    )
 
     file_id: Mapped[str] = mapped_column(sa.String(255), unique=True)
-    dataset_id: Mapped[str] = mapped_column(sa.String(255))
+    dataset_id: Mapped[str] = mapped_column(
+        sa.String(255), sa.ForeignKey("dataset.dataset_id")
+    )
     master_id: Mapped[str] = mapped_column(sa.String(255))
     url: Mapped[str] = mapped_column(sa.String(255))
     version: Mapped[str] = mapped_column(sa.String(16))
@@ -76,6 +86,11 @@ class File(Base):
         back_populates="files",
         repr=False,
     )
+    dataset: Mapped["Dataset"] = relationship(
+        back_populates="files",
+        init=False,
+        repr=False,
+    )
 
     def _as_bytes(self) -> bytes:
         self_tuple = (self.file_id, self.checksum)
@@ -100,7 +115,7 @@ class File(Base):
             size=source["size"],
         )
         if "status" in source:
-            result.status = FileStatus(source.get("status"))
+            result.status = FileStatus(source.get("status").lower())
         return result
 
     @classmethod
@@ -395,11 +410,6 @@ class Query(Base):
         self.tags.remove(tag)
         return tag is not None
 
-    def no_require(self) -> Query:
-        cl = self.clone(compute_sha=False)
-        cl._rich_no_require = True  # type: ignore [attr-defined]
-        return cl
-
     def __lshift__(self, child: Query) -> Query:
         result = self.clone(compute_sha=False)
         # if self.name != child.require:
@@ -440,7 +450,7 @@
 
     __rich_measure__ = rich_measure_impl
 
-    def _rich_tree(self) -> Tree:
+    def _rich_tree(self, hide_require: bool = False) -> Tree:
         title = Text.from_markup(self.rich_name)
         if not self.tracked:
             title.append(" untracked", style="i red")
@@ -449,7 +459,7 @@
             f"\n│ updated {format_date_iso(self.updated_at)}"
         )
         contents = Table.grid(padding=(0, 1))
-        if not getattr(self, "_rich_no_require", False) and self.require is not None:
+        if not hide_require and self.require is not None:
             if len(self.require) == 40:
                 require = Text(short_sha(self.require), style="i green")
             else:
@@ -482,10 +492,44 @@
         count_ondisk, size_ondisk = self.files_count_size(FileStatus.Done)
         count_total, size_total = self.files_count_size()
         sizes = f"{format_size(size_ondisk)} / {format_size(size_total)}"
-        lens = f"{count_ondisk}/{count_total}"
-        contents.add_row("files:", Text(f"{lens}", style="magenta"))
-        contents.add_row("size:", Text(f"{sizes}", style="magenta"))
-
+        lens = f"{count_ondisk} / {count_total}"
+
+        # Add dataset completion info
+        complete_datasets = 0
+        total_datasets = 0
+        session = object_session(self)
+        orphaned_dataset_count = 0
+
+        if session is not None:
+            from esgpull.models import sql
+
+            dataset_stats = session.execute(
+                sql.dataset.query_stats(self.sha)
+            ).all()
+
+            # Check for orphaned datasets (dataset_ids from files not in Dataset table)
+            orphaned_dataset_count = (
+                session.scalar(sql.dataset.orphaned(self.sha)) or 0
+            )
+
+            # Compute counts in Python - simpler and more maintainable
+            total_datasets = len(dataset_stats)
+            complete_datasets = sum(
+                1 for d in dataset_stats if d.done_count == d.total_files
+            )
+
+        contents.add_row("files:", Text(f"{lens}", style="magenta"))
+        if orphaned_dataset_count > 0:
+            contents.add_row(
+                "datasets:",
+                "[magenta]? / ?[/] [yellow italic]<- update for accurate datasets[/]",
+            )
+        else:
+            contents.add_row(
+                "datasets:",
+                f"[magenta]{complete_datasets} / {total_datasets}",
+            )
+        contents.add_row("size:", Text(f"{sizes}", style="magenta"))
         tree = Tree("", hide_root=True, guide_style="dim").add(title)
         if contents.row_count:
             tree.add(contents)
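The "? / ?" placeholder in the datasets: row appears whenever files reference a dataset_id that has no row in the new dataset table yet (typically files recorded before the e7edab5d4e4b migration) and disappears once an update repopulates the table. The completion numbers themselves reduce to a few lines; a sketch of the same logic outside rich rendering, where dataset_completion is a hypothetical helper, not an esgpull API:

# Sketch: compute the "complete / total datasets" counts for a query,
# mirroring the logic added to Query._rich_tree above.
from sqlalchemy.orm import object_session

from esgpull.models import Query, sql


def dataset_completion(query: Query) -> tuple[int, int]:
    # Requires a session-attached Query instance
    session = object_session(query)
    if session is None:
        return 0, 0
    stats = session.execute(sql.dataset.query_stats(query.sha)).all()
    complete = sum(1 for d in stats if d.done_count == d.total_files)
    return complete, len(stats)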
esgpull/models/sql.py
CHANGED
@@ -3,6 +3,7 @@ import functools
 import sqlalchemy as sa
 
 from esgpull.models import Table
+from esgpull.models.dataset import Dataset
 from esgpull.models.facet import Facet
 from esgpull.models.file import FileStatus
 from esgpull.models.query import File, Query, query_file_proxy, query_tag_proxy
@@ -11,15 +12,6 @@ from esgpull.models.synda_file import SyndaFile
 from esgpull.models.tag import Tag
 
 
-def count(item: Table) -> sa.Select[tuple[int]]:
-    table = item.__class__
-    return (
-        sa.select(sa.func.count("*"))
-        .select_from(table)
-        .filter_by(sha=item.sha)
-    )
-
-
 def count_table(table: type[Table]) -> sa.Select[tuple[int]]:
     return sa.select(sa.func.count("*")).select_from(table)
 
@@ -148,6 +140,45 @@ class file:
         return stmt
 
 
+class dataset:
+    @staticmethod
+    @functools.cache
+    def query_stats(query_sha: str) -> sa.Select[tuple[str, int, int]]:
+        return (
+            sa.select(
+                Dataset.dataset_id,
+                Dataset.total_files,
+                sa.func.count(
+                    sa.case((File.status == FileStatus.Done, 1))
+                ).label("done_count"),
+            )
+            .join(File)
+            .join(query_file_proxy)
+            .filter(query_file_proxy.c.query_sha == query_sha)
+            .filter(File.dataset_id.isnot(None))
+            .group_by(Dataset.dataset_id, Dataset.total_files)
+        )
+
+    @staticmethod
+    @functools.cache
+    def orphaned(query_sha: str) -> sa.Select[tuple[int]]:
+        return (
+            sa.select(sa.func.count(sa.distinct(File.dataset_id)))
+            .join(query_file_proxy)
+            .filter(query_file_proxy.c.query_sha == query_sha)
+            .filter(File.dataset_id.isnot(None))
+            .filter(~File.dataset_id.in_(sa.select(Dataset.dataset_id)))
+        )
+
+    @staticmethod
+    @functools.cache
+    def is_complete(dataset: Dataset) -> sa.Select[tuple[bool]]:
+        return sa.select(
+            sa.func.count(sa.case((File.status == FileStatus.Done, 1)))
+            == dataset.total_files
+        ).where(File.dataset_id == dataset.dataset_id)
+
+
 class query:
     @staticmethod
     @functools.cache
@@ -270,3 +301,11 @@ class query_file:
         .where(query_file_proxy.c.query_sha == query.sha)
         .where(query_file_proxy.c.file_sha == file.sha)
     )
+
+    @staticmethod
+    def is_linked(query: Query, file: File) -> sa.Select[tuple[bool]]:
+        return sa.select(
+            sa.exists()
+            .where(query_file_proxy.c.query_sha == query.sha)
+            .where(query_file_proxy.c.file_sha == file.sha)
+        )
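All three dataset helpers return plain sa.Select statements, cached per argument with functools.cache (the __hash__ added to Dataset above is presumably what lets is_complete cache on a model instance), so they compose with any session. A hedged usage sketch; print_dataset_stats is a hypothetical helper and session is assumed to be a SQLAlchemy session bound to the esgpull database:

from sqlalchemy.orm import Session

from esgpull.models import sql


def print_dataset_stats(session: Session, query_sha: str) -> None:
    # One (dataset_id, total_files, done_count) row per dataset in the query
    for dataset_id, total_files, done_count in session.execute(
        sql.dataset.query_stats(query_sha)
    ):
        print(f"{dataset_id}: {done_count}/{total_files} files done")

    # Distinct dataset_ids referenced by files but absent from the dataset table
    orphans = session.scalar(sql.dataset.orphaned(query_sha)) or 0
    if orphans:
        print(f"{orphans} dataset id(s) not tracked yet - run an update")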