esgpull 0.7.3__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- esgpull/cli/__init__.py +2 -2
- esgpull/cli/add.py +7 -1
- esgpull/cli/config.py +5 -21
- esgpull/cli/plugins.py +398 -0
- esgpull/cli/show.py +29 -0
- esgpull/cli/status.py +6 -4
- esgpull/cli/update.py +72 -18
- esgpull/cli/utils.py +16 -1
- esgpull/config.py +83 -25
- esgpull/constants.py +3 -0
- esgpull/context.py +15 -15
- esgpull/database.py +8 -2
- esgpull/download.py +3 -0
- esgpull/esgpull.py +49 -5
- esgpull/graph.py +1 -1
- esgpull/migrations/versions/0.8.0_update_tables.py +28 -0
- esgpull/migrations/versions/0.9.0_update_tables.py +28 -0
- esgpull/migrations/versions/14c72daea083_query_add_column_updated_at.py +36 -0
- esgpull/migrations/versions/c7c8541fa741_query_add_column_added_at.py +37 -0
- esgpull/migrations/versions/d14f179e553c_file_add_composite_index_dataset_id_.py +32 -0
- esgpull/migrations/versions/e7edab5d4e4b_add_dataset_tracking.py +39 -0
- esgpull/models/__init__.py +2 -1
- esgpull/models/base.py +31 -14
- esgpull/models/dataset.py +48 -5
- esgpull/models/options.py +1 -1
- esgpull/models/query.py +98 -15
- esgpull/models/sql.py +40 -9
- esgpull/plugin.py +574 -0
- esgpull/processor.py +3 -3
- esgpull/tui.py +23 -1
- esgpull/utils.py +19 -3
- {esgpull-0.7.3.dist-info → esgpull-0.9.0.dist-info}/METADATA +11 -2
- {esgpull-0.7.3.dist-info → esgpull-0.9.0.dist-info}/RECORD +36 -29
- {esgpull-0.7.3.dist-info → esgpull-0.9.0.dist-info}/WHEEL +1 -1
- esgpull/cli/datasets.py +0 -78
- {esgpull-0.7.3.dist-info → esgpull-0.9.0.dist-info}/entry_points.txt +0 -0
- {esgpull-0.7.3.dist-info → esgpull-0.9.0.dist-info}/licenses/LICENSE +0 -0
esgpull/migrations/versions/0.8.0_update_tables.py
ADDED
@@ -0,0 +1,28 @@
+"""update tables
+
+Revision ID: 0.8.0
+Revises: 14c72daea083
+Create Date: 2025-05-15 11:28:10.755003
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '0.8.0'
+down_revision = '14c72daea083'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
esgpull/migrations/versions/0.9.0_update_tables.py
ADDED
@@ -0,0 +1,28 @@
+"""update tables
+
+Revision ID: 0.9.0
+Revises: d14f179e553c
+Create Date: 2025-07-07 14:54:58.433022
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '0.9.0'
+down_revision = 'd14f179e553c'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
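Taken together, the two `update_tables` revisions are empty placeholders: esgpull pins each release as an Alembic revision so a database records which package version last touched it. Reading the `down_revision` field off all six new files gives the full upgrade chain. A minimal sketch of driving that chain programmatically; the `alembic.ini` path is a placeholder, since esgpull wires up its own Alembic config internally:

# Revision chain, read off each file's down_revision:
#   0.7.3 -> c7c8541fa741 (query.added_at) -> 14c72daea083 (query.updated_at)
#     -> 0.8.0 (empty marker) -> e7edab5d4e4b (dataset table + FK)
#     -> d14f179e553c (composite index) -> 0.9.0 (empty marker)
from alembic import command
from alembic.config import Config

cfg = Config("alembic.ini")  # placeholder config path
command.upgrade(cfg, "0.9.0")  # applies every pending revision up to the 0.9.0 marker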
esgpull/migrations/versions/14c72daea083_query_add_column_updated_at.py
ADDED
@@ -0,0 +1,36 @@
+"""query_add_column_updated_at
+
+Revision ID: 14c72daea083
+Revises: c7c8541fa741
+Create Date: 2025-05-07 14:49:43.993125
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '14c72daea083'
+down_revision = 'c7c8541fa741'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    with op.batch_alter_table('query', schema=None) as batch_op:
+        batch_op.add_column(sa.Column('updated_at', sa.DateTime(), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=True))
+
+    # Backfill nulls
+    op.execute('UPDATE query SET updated_at = CURRENT_TIMESTAMP WHERE updated_at IS NULL')
+
+    # Make non-nullable
+    with op.batch_alter_table('query', schema=None) as batch_op:
+        batch_op.alter_column('updated_at', nullable=False)
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('query', schema=None) as batch_op:
+        batch_op.drop_column('updated_at')
+
+    # ### end Alembic commands ###
esgpull/migrations/versions/c7c8541fa741_query_add_column_added_at.py
ADDED
@@ -0,0 +1,37 @@
+"""query_add_column_added_at
+
+Revision ID: c7c8541fa741
+Revises: 0.7.3
+Create Date: 2025-05-05 16:14:57.140262
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'c7c8541fa741'
+down_revision = '0.7.3'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # Add as nullable first
+    with op.batch_alter_table('query', schema=None) as batch_op:
+        batch_op.add_column(sa.Column('added_at', sa.DateTime(), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=True))
+
+    # Backfill nulls
+    op.execute('UPDATE query SET added_at = CURRENT_TIMESTAMP WHERE added_at IS NULL')
+
+    # Make non-nullable
+    with op.batch_alter_table('query', schema=None) as batch_op:
+        batch_op.alter_column('added_at', nullable=False)
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('query', schema=None) as batch_op:
+        batch_op.drop_column('added_at')
+
+    # ### end Alembic commands ###
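This revision and `14c72daea083` above share the same three-step shape: add the column as nullable with a server default, backfill existing rows, then tighten to NOT NULL. The `batch_alter_table` context matters because SQLite cannot alter a column in place; Alembic's batch mode rebuilds the table behind the scenes. A rough, illustrative sketch of what that rebuild amounts to, not the exact statements Alembic emits:

# Illustrative only - roughly what batch mode does on SQLite for the
# "make non-nullable" step (Alembic generates the real statements):
#
#   CREATE TABLE _alembic_tmp_query (..., added_at DATETIME NOT NULL, ...);
#   INSERT INTO _alembic_tmp_query SELECT ... FROM query;
#   DROP TABLE query;
#   ALTER TABLE _alembic_tmp_query RENAME TO query;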
esgpull/migrations/versions/d14f179e553c_file_add_composite_index_dataset_id_.py
ADDED
@@ -0,0 +1,32 @@
+"""file_add_composite_index_dataset_id_status
+
+Revision ID: d14f179e553c
+Revises: e7edab5d4e4b
+Create Date: 2025-06-18 16:05:35.721085
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'd14f179e553c'
+down_revision = 'e7edab5d4e4b'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.create_index('ix_file_dataset_status', ['dataset_id', 'status'], unique=False)
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.drop_index('ix_file_dataset_status')
+
+    # ### end Alembic commands ###
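The composite index exists to serve the per-dataset completion counts introduced in `sql.py` below, which filter and group on exactly `(dataset_id, status)`. A quick way to check that SQLite picks it up, assuming a local esgpull SQLite database; the path, dataset_id, and stored status spelling are placeholders:

import sqlite3

con = sqlite3.connect("esgpull.db")  # placeholder path to the esgpull database
rows = con.execute(
    "EXPLAIN QUERY PLAN "
    "SELECT count(*) FROM file WHERE dataset_id = ? AND status = ?",
    ("CMIP6.x.y.v20250101", "Done"),  # placeholders; status spelling depends on enum storage
).fetchall()
print(rows)  # the plan should mention ix_file_dataset_status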
esgpull/migrations/versions/e7edab5d4e4b_add_dataset_tracking.py
ADDED
@@ -0,0 +1,39 @@
+"""add_dataset_tracking
+
+Revision ID: e7edab5d4e4b
+Revises: 0.8.0
+Create Date: 2025-05-23 17:38:22.066153
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'e7edab5d4e4b'
+down_revision = '0.8.0'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('dataset',
+    sa.Column('dataset_id', sa.String(length=255), nullable=False),
+    sa.Column('total_files', sa.Integer(), nullable=False),
+    sa.Column('created_at', sa.DateTime(), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
+    sa.Column('updated_at', sa.DateTime(), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
+    sa.PrimaryKeyConstraint('dataset_id')
+    )
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.create_foreign_key('fk_file_dataset', 'dataset', ['dataset_id'], ['dataset_id'])
+
+    # ### end Alembic commands ###
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.drop_constraint('fk_file_dataset', type_='foreignkey')
+
+    op.drop_table('dataset')
+    # ### end Alembic commands ###
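One detail worth noting in this revision: the foreign key gets an explicit name, `fk_file_dataset`. SQLite can only drop a constraint it can address by name (again via batch mode), so `downgrade()` depends on it. The same effect can be had globally with a SQLAlchemy naming convention; a sketch, not esgpull's actual setup:

import sqlalchemy as sa

# With this convention, the file -> dataset foreign key would be
# auto-named "fk_file_dataset", matching the migration above.
metadata = sa.MetaData(
    naming_convention={"fk": "fk_%(table_name)s_%(referred_table_name)s"}
)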
esgpull/models/__init__.py
CHANGED
@@ -1,7 +1,7 @@
 from typing import TypeVar
 
 from esgpull.models.base import Base
-from esgpull.models.dataset import Dataset
+from esgpull.models.dataset import Dataset, DatasetRecord
 from esgpull.models.facet import Facet
 from esgpull.models.file import FastFile, FileStatus
 from esgpull.models.options import Option, Options
@@ -15,6 +15,7 @@ Table = TypeVar("Table", bound=Base)
 __all__ = [
     "Base",
     "Dataset",
+    "DatasetRecord",
     "Facet",
     "FastFile",
     "File",
esgpull/models/base.py
CHANGED
@@ -16,16 +16,10 @@ T = TypeVar("T")
 Sha = sa.String(40)
 
 
-class Base(MappedAsDataclass, DeclarativeBase):
+# Base class for all models - provides core SQLAlchemy functionality
+class _BaseModel(MappedAsDataclass, DeclarativeBase):
     __dataclass_fields__: ClassVar[dict[str, Field]]
-    __sql_attrs__ = ("id", "sha", "_sa_instance_state", "__dataclass_fields__")
-
-    sha: Mapped[str] = mapped_column(
-        Sha,
-        init=False,
-        repr=False,
-        primary_key=True,
-    )
+    __sql_attrs__ = ("id", "_sa_instance_state", "__dataclass_fields__")
 
     @property
     def _names(self) -> tuple[str, ...]:
@@ -36,15 +30,38 @@ class Base(MappedAsDataclass, DeclarativeBase):
             result += (name,)
         return result
 
+    @property
+    def state(self) -> InstanceState:
+        return cast(InstanceState, sa.inspect(self))
+
+    def asdict(self) -> Mapping[str, Any]:
+        raise NotImplementedError
+
+
+# Base class for models that use SHA as primary key
+class Base(_BaseModel):
+    __abstract__ = True
+    __sql_attrs__ = ("id", "sha", "_sa_instance_state", "__dataclass_fields__")
+
+    sha: Mapped[str] = mapped_column(
+        Sha,
+        init=False,
+        repr=False,
+        primary_key=True,
+    )
+
     def _as_bytes(self) -> bytes:
         raise NotImplementedError
 
     def compute_sha(self) -> None:
         self.sha = sha1(self._as_bytes()).hexdigest()
 
-    @property
-    def state(self) -> InstanceState:
-        return cast(InstanceState, sa.inspect(self))
 
-
-
+# Base class for models that don't use SHA (e.g., Dataset)
+class BaseNoSHA(_BaseModel):
+    __abstract__ = True
+    __sql_attrs__ = ("id", "_sa_instance_state", "__dataclass_fields__")
+
+
+# Keep SHAKeyMixin for backward compatibility if needed
+SHAKeyMixin = Base
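The net effect of the refactor: shared machinery (`_names`, `state`, `asdict`) moves up into `_BaseModel`, the SHA primary key becomes opt-in through `Base`, and `BaseNoSHA` leaves the key choice to the subclass, which is what the new `Dataset` model needs for its natural `dataset_id` key. A hypothetical pair of toy models (not part of esgpull) showing how the two bases differ:

import sqlalchemy as sa
from sqlalchemy.orm import Mapped, mapped_column

from esgpull.models.base import Base, BaseNoSHA


class Fingerprinted(Base):  # inherits the sha primary key + compute_sha()
    __tablename__ = "fingerprinted"
    name: Mapped[str] = mapped_column(sa.String(255))

    def _as_bytes(self) -> bytes:
        return self.name.encode()


class NaturalKey(BaseNoSHA):  # must declare its own primary key
    __tablename__ = "natural_key"
    key: Mapped[str] = mapped_column(sa.String(64), primary_key=True)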
esgpull/models/dataset.py
CHANGED
@@ -1,12 +1,22 @@
 from __future__ import annotations
 
-from dataclasses import dataclass
+from collections.abc import Mapping
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any
 
+import sqlalchemy as sa
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+
+from esgpull.models.base import BaseNoSHA
 from esgpull.models.utils import find_int, find_str
 
+if TYPE_CHECKING:
+    from esgpull.models.query import File
+
 
 @dataclass
-class Dataset:
+class DatasetRecord:
     dataset_id: str
     master_id: str
     version: str
@@ -15,7 +25,7 @@ class Dataset:
     number_of_files: int
 
     @classmethod
-    def serialize(cls, source: dict) -> Dataset:
+    def serialize(cls, source: dict) -> DatasetRecord:
         dataset_id = find_str(source["instance_id"]).partition("|")[0]
         master_id, version = dataset_id.rsplit(".", 1)
         data_node = find_str(source["data_node"])
@@ -30,5 +40,38 @@ class Dataset:
             number_of_files=number_of_files,
         )
 
-
-
+
+class Dataset(BaseNoSHA):
+    __tablename__ = "dataset"
+
+    dataset_id: Mapped[str] = mapped_column(sa.String(255), primary_key=True)
+    total_files: Mapped[int] = mapped_column(sa.Integer)
+    created_at: Mapped[datetime] = mapped_column(
+        server_default=sa.func.now(),
+        default_factory=lambda: datetime.now(timezone.utc),
+        init=False,
+    )
+    updated_at: Mapped[datetime] = mapped_column(
+        server_default=sa.func.now(),
+        default_factory=lambda: datetime.now(timezone.utc),
+        init=False,
+    )
+    files: Mapped[list[File]] = relationship(
+        back_populates="dataset",
+        foreign_keys="[File.dataset_id]",
+        primaryjoin="Dataset.dataset_id==File.dataset_id",
+        default_factory=list,
+        init=False,
+        repr=False,
+    )
+
+    def asdict(self) -> Mapping[str, Any]:
+        return {
+            "dataset_id": self.dataset_id,
+            "total_files": self.total_files,
+            "created_at": self.created_at.isoformat(),
+            "updated_at": self.updated_at.isoformat(),
+        }
+
+    def __hash__(self) -> int:
+        return hash(self.dataset_id)
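`DatasetRecord` stays a plain dataclass for parsing search results, while the new `Dataset` is the persisted tracking row. A minimal sketch of bridging the two, assuming `session` is an open SQLAlchemy session and `doc` is a search-result dict with the fields `serialize` expects:

from esgpull.models.dataset import Dataset, DatasetRecord

record = DatasetRecord.serialize(doc)  # doc: assumed search-result payload
ds = Dataset(
    dataset_id=record.dataset_id,
    total_files=record.number_of_files,
)  # created_at / updated_at are init=False and fall back to the defaults
session.add(ds)
session.commit()
print(ds.asdict())  # timestamps are serialized with .isoformat()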
esgpull/models/options.py
CHANGED
@@ -53,7 +53,7 @@ class Options(Base):
     replica: Mapped[Option] = mapped_column(sa.Enum(Option))
     retracted: Mapped[Option] = mapped_column(sa.Enum(Option))
 
-    _distrib_ = Option(
+    _distrib_ = Option(True)
     _latest_ = Option(True)
     _replica_ = Option(None)
     _retracted_ = Option(False)
esgpull/models/query.py
CHANGED
@@ -1,7 +1,8 @@
 from __future__ import annotations
 
 from collections.abc import Iterator, MutableMapping, Sequence
-from typing import Any, Literal
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any, Literal
 
 import sqlalchemy as sa
 from rich.console import Console, ConsoleOptions
@@ -11,12 +12,18 @@ from rich.tree import Tree
 from sqlalchemy.orm import Mapped, mapped_column, object_session, relationship
 from typing_extensions import NotRequired, TypedDict
 
+from esgpull import utils
 from esgpull.exceptions import UntrackableQuery
 from esgpull.models.base import Base, Sha
+from esgpull.models.dataset import Dataset
 from esgpull.models.file import FileDict, FileStatus
 from esgpull.models.options import Options
 from esgpull.models.selection import FacetValues, Selection
 from esgpull.models.tag import Tag
+
+if TYPE_CHECKING:
+    from esgpull.models.dataset import Dataset
+
 from esgpull.models.utils import (
     find_int,
     find_str,
@@ -24,7 +31,18 @@ from esgpull.models.utils import (
     rich_measure_impl,
     short_sha,
 )
-from esgpull.utils import format_size
+from esgpull.utils import format_date_iso, format_size
+
+QUERY_DATE_FMT = "%Y-%m-%d %H:%M:%S"
+
+
+def parse_date(d: datetime | str) -> datetime:
+    return utils.parse_date(d, fmt=QUERY_DATE_FMT)
+
+
+def format_date(d: datetime | str) -> str:
+    return utils.format_date(d, fmt=QUERY_DATE_FMT)
+
 
 query_file_proxy = sa.Table(
     "query_file",
@@ -42,9 +60,14 @@ query_tag_proxy = sa.Table(
 
 class File(Base):
     __tablename__ = "file"
+    __table_args__ = (
+        sa.Index("ix_file_dataset_status", "dataset_id", "status"),
+    )
 
     file_id: Mapped[str] = mapped_column(sa.String(255), unique=True)
-    dataset_id: Mapped[str] = mapped_column(sa.String(255))
+    dataset_id: Mapped[str] = mapped_column(
+        sa.String(255), sa.ForeignKey("dataset.dataset_id")
+    )
     master_id: Mapped[str] = mapped_column(sa.String(255))
     url: Mapped[str] = mapped_column(sa.String(255))
     version: Mapped[str] = mapped_column(sa.String(16))
@@ -63,6 +86,11 @@ class File(Base):
         back_populates="files",
         repr=False,
     )
+    dataset: Mapped["Dataset"] = relationship(
+        back_populates="files",
+        init=False,
+        repr=False,
+    )
 
     def _as_bytes(self) -> bytes:
         self_tuple = (self.file_id, self.checksum)
@@ -87,7 +115,7 @@ class File(Base):
             size=source["size"],
         )
         if "status" in source:
-            result.status = FileStatus(source.get("
+            result.status = FileStatus(source.get("status").lower())
         return result
 
     @classmethod
@@ -152,6 +180,8 @@ class QueryDict(TypedDict):
     options: NotRequired[MutableMapping[str, bool | None]]
    selection: NotRequired[MutableMapping[str, FacetValues]]
     files: NotRequired[list[FileDict]]
+    added_at: NotRequired[str]
+    updated_at: NotRequired[str]
 
 
 class Query(Base):
@@ -181,6 +211,14 @@ class Query(Base):
         back_populates="queries",
         repr=False,
     )
+    added_at: Mapped[datetime] = mapped_column(
+        server_default=sa.func.now(),
+        default_factory=lambda: datetime.now(timezone.utc),
+    )
+    updated_at: Mapped[datetime] = mapped_column(
+        server_default=sa.func.now(),
+        default_factory=lambda: datetime.now(timezone.utc),
+    )
 
     def __init__(
         self,
@@ -191,6 +229,8 @@ class Query(Base):
         options: Options | MutableMapping[str, bool | None] | None = None,
         selection: Selection | MutableMapping[str, FacetValues] | None = None,
         files: list[FileDict] | None = None,
+        added_at: datetime | str | None = None,
+        updated_at: datetime | str | None = None,
     ) -> None:
         self.tracked = tracked
         self.require = require
@@ -219,6 +259,14 @@ class Query(Base):
         if files is not None:
             for file in files:
                 self.files.append(File.fromdict(file))
+        if added_at is not None:
+            self.added_at = parse_date(added_at)
+        else:
+            self.added_at = datetime.now(timezone.utc)
+        if updated_at is not None:
+            self.updated_at = parse_date(updated_at)
+        else:
+            self.updated_at = datetime.now(timezone.utc)
 
     @property
     def has_files(self) -> bool:
@@ -313,6 +361,8 @@ class Query(Base):
             result["options"] = self.options.asdict()
         if self.selection:
             result["selection"] = self.selection.asdict()
+        result["added_at"] = format_date(self.added_at)
+        result["updated_at"] = format_date(self.updated_at)
         return result
 
     def clone(self, compute_sha: bool = True) -> Query:
@@ -360,11 +410,6 @@ class Query(Base):
            self.tags.remove(tag)
         return tag is not None
 
-    def no_require(self) -> Query:
-        cl = self.clone(compute_sha=False)
-        cl._rich_no_require = True  # type: ignore [attr-defined]
-        return cl
-
     def __lshift__(self, child: Query) -> Query:
         result = self.clone(compute_sha=False)
         # if self.name != child.require:
@@ -405,12 +450,16 @@ class Query(Base):
 
     __rich_measure__ = rich_measure_impl
 
-    def _rich_tree(self) -> Tree:
+    def _rich_tree(self, hide_require: bool = False) -> Tree:
         title = Text.from_markup(self.rich_name)
         if not self.tracked:
             title.append(" untracked", style="i red")
+        title.append(
+            f"\n│ added {format_date_iso(self.added_at)}"
+            f"\n│ updated {format_date_iso(self.updated_at)}"
+        )
         contents = Table.grid(padding=(0, 1))
-        if not 
+        if not hide_require and self.require is not None:
             if len(self.require) == 40:
                 require = Text(short_sha(self.require), style="i green")
             else:
@@ -443,10 +492,44 @@ class Query(Base):
         count_ondisk, size_ondisk = self.files_count_size(FileStatus.Done)
         count_total, size_total = self.files_count_size()
         sizes = f"{format_size(size_ondisk)} / {format_size(size_total)}"
-        lens = f"{count_ondisk}/{count_total}"
-        contents.add_row("files:", Text(f"{lens}", style="magenta"))
-        contents.add_row("size:", Text(f"{sizes}", style="magenta"))
-
+        lens = f"{count_ondisk} / {count_total}"
+
+        # Add dataset completion info
+        complete_datasets = 0
+        total_datasets = 0
+        session = object_session(self)
+        orphaned_dataset_count = 0
+
+        if session is not None:
+            from esgpull.models import sql
+
+            dataset_stats = session.execute(
+                sql.dataset.query_stats(self.sha)
+            ).all()
+
+            # Check for orphaned datasets (dataset_ids from files not in Dataset table)
+            orphaned_dataset_count = (
+                session.scalar(sql.dataset.orphaned(self.sha)) or 0
+            )
+
+            # Compute counts in Python - simpler and more maintainable
+            total_datasets = len(dataset_stats)
+            complete_datasets = sum(
+                1 for d in dataset_stats if d.done_count == d.total_files
+            )
+
+        contents.add_row("files:", Text(f"{lens}", style="magenta"))
+        if orphaned_dataset_count > 0:
+            contents.add_row(
+                "datasets:",
+                "[magenta]? / ?[/] [yellow italic]<- update for accurate datasets[/]",
+            )
+        else:
+            contents.add_row(
+                "datasets:",
+                f"[magenta]{complete_datasets} / {total_datasets}",
+            )
+        contents.add_row("size:", Text(f"{sizes}", style="magenta"))
         tree = Tree("", hide_root=True, guide_style="dim").add(title)
         if contents.row_count:
             tree.add(contents)
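Two things to keep in mind with the new timestamps: `asdict()` always emits `added_at`/`updated_at` through `format_date`, and `QUERY_DATE_FMT` has second precision, so values only round-trip losslessly to the second. A stdlib-only sketch of that round-trip; esgpull's own `utils.parse_date`/`utils.format_date` are assumed to wrap the same format:

from datetime import datetime, timezone

QUERY_DATE_FMT = "%Y-%m-%d %H:%M:%S"

now = datetime.now(timezone.utc)
text = now.strftime(QUERY_DATE_FMT)             # roughly what asdict() emits
back = datetime.strptime(text, QUERY_DATE_FMT)  # roughly what parse_date() recovers
assert back == now.replace(microsecond=0, tzinfo=None)  # sub-second precision is dropped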
esgpull/models/sql.py
CHANGED
@@ -3,6 +3,7 @@ import functools
 import sqlalchemy as sa
 
 from esgpull.models import Table
+from esgpull.models.dataset import Dataset
 from esgpull.models.facet import Facet
 from esgpull.models.file import FileStatus
 from esgpull.models.query import File, Query, query_file_proxy, query_tag_proxy
@@ -11,15 +12,6 @@ from esgpull.models.synda_file import SyndaFile
 from esgpull.models.tag import Tag
 
 
-def count(item: Table) -> sa.Select[tuple[int]]:
-    table = item.__class__
-    return (
-        sa.select(sa.func.count("*"))
-        .select_from(table)
-        .filter_by(sha=item.sha)
-    )
-
-
 def count_table(table: type[Table]) -> sa.Select[tuple[int]]:
     return sa.select(sa.func.count("*")).select_from(table)
 
@@ -148,6 +140,45 @@ class file:
         return stmt
 
 
+class dataset:
+    @staticmethod
+    @functools.cache
+    def query_stats(query_sha: str) -> sa.Select[tuple[str, int, int]]:
+        return (
+            sa.select(
+                Dataset.dataset_id,
+                Dataset.total_files,
+                sa.func.count(
+                    sa.case((File.status == FileStatus.Done, 1))
+                ).label("done_count"),
+            )
+            .join(File)
+            .join(query_file_proxy)
+            .filter(query_file_proxy.c.query_sha == query_sha)
+            .filter(File.dataset_id.isnot(None))
+            .group_by(Dataset.dataset_id, Dataset.total_files)
+        )
+
+    @staticmethod
+    @functools.cache
+    def orphaned(query_sha: str) -> sa.Select[tuple[int]]:
+        return (
+            sa.select(sa.func.count(sa.distinct(File.dataset_id)))
+            .join(query_file_proxy)
+            .filter(query_file_proxy.c.query_sha == query_sha)
+            .filter(File.dataset_id.isnot(None))
+            .filter(~File.dataset_id.in_(sa.select(Dataset.dataset_id)))
+        )
+
+    @staticmethod
+    @functools.cache
+    def is_complete(dataset: Dataset) -> sa.Select[tuple[bool]]:
+        return sa.select(
+            sa.func.count(sa.case((File.status == FileStatus.Done, 1)))
+            == dataset.total_files
+        ).where(File.dataset_id == dataset.dataset_id)
+
+
 class query:
     @staticmethod
     @functools.cache