esgpull 0.8.0__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
esgpull/fs.py CHANGED
@@ -10,7 +10,7 @@ from shutil import copyfile
 import aiofiles
 from aiofiles.threadpool.binary import AsyncBufferedIOBase
 
-from esgpull.config import Config
+from esgpull.config import Config, Paths
 from esgpull.models import File
 from esgpull.result import Err, Ok, Result
 from esgpull.tui import logger
@@ -63,45 +63,34 @@ class Digest:
 
 @dataclass
 class Filesystem:
-    auth: Path
-    data: Path
-    db: Path
-    log: Path
-    tmp: Path
+    paths: Paths
     disable_checksum: bool = False
     install: InitVar[bool] = True
 
     @staticmethod
     def from_config(config: Config, install: bool = False) -> Filesystem:
         return Filesystem(
-            auth=config.paths.auth,
-            data=config.paths.data,
-            db=config.paths.db,
-            log=config.paths.log,
-            tmp=config.paths.tmp,
+            paths=config.paths,
             disable_checksum=config.download.disable_checksum,
             install=install,
         )
 
     def __post_init__(self, install: bool = True) -> None:
         if install:
-            self.auth.mkdir(parents=True, exist_ok=True)
-            self.data.mkdir(parents=True, exist_ok=True)
-            self.db.mkdir(parents=True, exist_ok=True)
-            self.log.mkdir(parents=True, exist_ok=True)
-            self.tmp.mkdir(parents=True, exist_ok=True)
+            for path in self.paths.values():
+                path.mkdir(parents=True, exist_ok=True)
 
     def __getitem__(self, file: File) -> FilePath:
         if not isinstance(file, File):
            raise TypeError(file)
         return FilePath(
-            drs=self.data / file.local_path / file.filename,
-            tmp=self.tmp / f"{file.sha}.part",
+            drs=self.paths.data / file.local_path / file.filename,
+            tmp=self.paths.tmp / f"{file.sha}.part",
         )
 
     def glob_netcdf(self) -> Iterator[Path]:
-        for path in self.data.glob("**/*.nc"):
-            yield path.relative_to(self.data)
+        for path in self.paths.data.glob("**/*.nc"):
+            yield path.relative_to(self.paths.data)
 
     def open(self, file: File) -> FileObject:
         return FileObject(self[file])
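
The refactor collapses five separate Path attributes into one Paths object from esgpull.config, iterated with .values() to create directories. A minimal sketch of the shape this assumes (the real Paths class lives in esgpull.config and may differ in detail):

from dataclasses import dataclass, fields
from pathlib import Path
from typing import Iterator

@dataclass
class Paths:
    auth: Path
    data: Path
    db: Path
    log: Path
    tmp: Path

    def values(self) -> Iterator[Path]:
        # Iterating the dataclass fields lets Filesystem.__post_init__
        # create every directory with one loop instead of five mkdir calls.
        for f in fields(self):
            yield getattr(self, f.name)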
esgpull/graph.py CHANGED
@@ -418,7 +418,7 @@ class Graph:
         if keep_require:
             query_tree = query._rich_tree()
         else:
-            query_tree = query.no_require()._rich_tree()
+            query_tree = query._rich_tree(hide_require=True)
         if query_tree is not None:
             tree.add(query_tree)
             self.fill_tree(query, query_tree)
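
Graph.fill_tree now asks the query to hide its require line directly, instead of rendering a mutated clone; the query.py section below removes no_require() and gives _rich_tree() the hide_require flag. A before/after sketch of the call, with names as in the diff:

# old: render a clone tagged with a private attribute
# query_tree = query.no_require()._rich_tree()

# new: pass the intent explicitly
query_tree = query._rich_tree(hide_require=True)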
@@ -0,0 +1,28 @@
+"""update tables
+
+Revision ID: 0.9.0
+Revises: d14f179e553c
+Create Date: 2025-07-07 14:54:58.433022
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '0.9.0'
+down_revision = 'd14f179e553c'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
@@ -0,0 +1,28 @@
+"""update tables
+
+Revision ID: 0.9.1
+Revises: 0.9.0
+Create Date: 2025-08-08 10:38:14.204594
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '0.9.1'
+down_revision = '0.9.0'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    pass
+    # ### end Alembic commands ###
@@ -0,0 +1,32 @@
+"""file_add_composite_index_dataset_id_status
+
+Revision ID: d14f179e553c
+Revises: e7edab5d4e4b
+Create Date: 2025-06-18 16:05:35.721085
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'd14f179e553c'
+down_revision = 'e7edab5d4e4b'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.create_index('ix_file_dataset_status', ['dataset_id', 'status'], unique=False)
+
+    # ### end Alembic commands ###
+
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.drop_index('ix_file_dataset_status')
+
+    # ### end Alembic commands ###
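
This revision adds a composite index over (dataset_id, status) on the file table, which matches the access pattern of the per-dataset done-file counts introduced in sql.py below. A hypothetical standalone illustration (not esgpull code; the schema is reduced to the relevant columns) of what such an index buys under SQLite:

import sqlite3

con = sqlite3.connect(":memory:")
con.execute(
    "CREATE TABLE file (sha TEXT PRIMARY KEY, dataset_id TEXT, status TEXT)"
)
con.execute(
    "CREATE INDEX ix_file_dataset_status ON file (dataset_id, status)"
)
# Counting finished files for one dataset can now be answered from the
# index alone, without touching the table rows.
plan = con.execute(
    "EXPLAIN QUERY PLAN "
    "SELECT count(*) FROM file WHERE dataset_id = ? AND status = ?",
    ("CMIP6.some.dataset.v1", "done"),
).fetchall()
print(plan)  # expect something like: SEARCH file USING COVERING INDEX ix_file_dataset_status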
@@ -0,0 +1,39 @@
+"""add_dataset_tracking
+
+Revision ID: e7edab5d4e4b
+Revises: 0.8.0
+Create Date: 2025-05-23 17:38:22.066153
+
+"""
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = 'e7edab5d4e4b'
+down_revision = '0.8.0'
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    op.create_table('dataset',
+        sa.Column('dataset_id', sa.String(length=255), nullable=False),
+        sa.Column('total_files', sa.Integer(), nullable=False),
+        sa.Column('created_at', sa.DateTime(), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
+        sa.Column('updated_at', sa.DateTime(), server_default=sa.text('(CURRENT_TIMESTAMP)'), nullable=False),
+        sa.PrimaryKeyConstraint('dataset_id')
+    )
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.create_foreign_key('fk_file_dataset', 'dataset', ['dataset_id'], ['dataset_id'])
+
+    # ### end Alembic commands ###
+
+def downgrade() -> None:
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('file', schema=None) as batch_op:
+        batch_op.drop_constraint('fk_file_dataset', type_='foreignkey')
+
+    op.drop_table('dataset')
+    # ### end Alembic commands ###
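
Taken together, the four new revisions form a linear chain on top of the existing 0.8.0 head; they apply in the reverse of the order the files appear in this diff. Sketched from the revision/down_revision pairs above:

# revision -> down_revision, copied from the migration files above
chain = {
    "e7edab5d4e4b": "0.8.0",         # add_dataset_tracking
    "d14f179e553c": "e7edab5d4e4b",  # composite index on file
    "0.9.0": "d14f179e553c",         # update tables (no-op)
    "0.9.1": "0.9.0",                # update tables (no-op)
}

# Walking from the new head back down to the 0.8.0 base:
rev = "0.9.1"
while rev in chain:
    print(rev)
    rev = chain[rev]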
esgpull/models/__init__.py CHANGED
@@ -1,7 +1,7 @@
 from typing import TypeVar
 
 from esgpull.models.base import Base
-from esgpull.models.dataset import Dataset
+from esgpull.models.dataset import Dataset, DatasetRecord
 from esgpull.models.facet import Facet
 from esgpull.models.file import FastFile, FileStatus
 from esgpull.models.options import Option, Options
@@ -15,6 +15,7 @@ Table = TypeVar("Table", bound=Base)
 __all__ = [
     "Base",
     "Dataset",
+    "DatasetRecord",
     "Facet",
     "FastFile",
     "File",
esgpull/models/base.py CHANGED
@@ -16,16 +16,10 @@ T = TypeVar("T")
 Sha = sa.String(40)
 
 
-class Base(MappedAsDataclass, DeclarativeBase):
+# Base class for all models - provides core SQLAlchemy functionality
+class _BaseModel(MappedAsDataclass, DeclarativeBase):
     __dataclass_fields__: ClassVar[dict[str, Field]]
-    __sql_attrs__ = ("id", "sha", "_sa_instance_state", "__dataclass_fields__")
-
-    sha: Mapped[str] = mapped_column(
-        Sha,
-        init=False,
-        repr=False,
-        primary_key=True,
-    )
+    __sql_attrs__ = ("id", "_sa_instance_state", "__dataclass_fields__")
 
     @property
     def _names(self) -> tuple[str, ...]:
@@ -36,15 +30,38 @@ class Base(MappedAsDataclass, DeclarativeBase):
             result += (name,)
         return result
 
+    @property
+    def state(self) -> InstanceState:
+        return cast(InstanceState, sa.inspect(self))
+
+    def asdict(self) -> Mapping[str, Any]:
+        raise NotImplementedError
+
+
+# Base class for models that use SHA as primary key
+class Base(_BaseModel):
+    __abstract__ = True
+    __sql_attrs__ = ("id", "sha", "_sa_instance_state", "__dataclass_fields__")
+
+    sha: Mapped[str] = mapped_column(
+        Sha,
+        init=False,
+        repr=False,
+        primary_key=True,
+    )
+
     def _as_bytes(self) -> bytes:
         raise NotImplementedError
 
     def compute_sha(self) -> None:
         self.sha = sha1(self._as_bytes()).hexdigest()
 
-    @property
-    def state(self) -> InstanceState:
-        return cast(InstanceState, sa.inspect(self))
 
-    def asdict(self) -> Mapping[str, Any]:
-        raise NotImplementedError
+# Base class for models that don't use SHA (e.g., Dataset)
+class BaseNoSHA(_BaseModel):
+    __abstract__ = True
+    __sql_attrs__ = ("id", "_sa_instance_state", "__dataclass_fields__")
+
+
+# Keep SHAKeyMixin for backward compatibility if needed
+SHAKeyMixin = Base
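
The old Base is split into a shared _BaseModel and two abstract bases: Base keeps the 40-character sha primary key and compute_sha(), while BaseNoSHA lets a model declare its own natural key. A hedged sketch of how a model would pick one or the other (the Example* names are illustrative, not esgpull models):

import sqlalchemy as sa
from sqlalchemy.orm import Mapped, mapped_column

from esgpull.models.base import Base, BaseNoSHA

class ExampleSha(Base):
    # Inherits the sha primary key; compute_sha() fills it from _as_bytes().
    __tablename__ = "example_sha"
    name: Mapped[str] = mapped_column(sa.String(64))

    def _as_bytes(self) -> bytes:
        return self.name.encode()

class ExampleNatural(BaseNoSHA):
    # No sha column; the model supplies its own primary key, as Dataset does.
    __tablename__ = "example_natural"
    key: Mapped[str] = mapped_column(sa.String(255), primary_key=True)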
esgpull/models/dataset.py CHANGED
@@ -1,12 +1,22 @@
 from __future__ import annotations
 
-from dataclasses import asdict, dataclass
+from collections.abc import Mapping
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any
 
+import sqlalchemy as sa
+from sqlalchemy.orm import Mapped, mapped_column, relationship
+
+from esgpull.models.base import BaseNoSHA
 from esgpull.models.utils import find_int, find_str
 
+if TYPE_CHECKING:
+    from esgpull.models.query import File
+
 
 @dataclass
-class Dataset:
+class DatasetRecord:
     dataset_id: str
     master_id: str
     version: str
@@ -15,7 +25,7 @@ class Dataset:
     number_of_files: int
 
     @classmethod
-    def serialize(cls, source: dict) -> Dataset:
+    def serialize(cls, source: dict) -> DatasetRecord:
         dataset_id = find_str(source["instance_id"]).partition("|")[0]
         master_id, version = dataset_id.rsplit(".", 1)
         data_node = find_str(source["data_node"])
@@ -30,5 +40,38 @@ class Dataset:
             number_of_files=number_of_files,
         )
 
-    def asdict(self) -> dict:
-        return asdict(self)
+
+class Dataset(BaseNoSHA):
+    __tablename__ = "dataset"
+
+    dataset_id: Mapped[str] = mapped_column(sa.String(255), primary_key=True)
+    total_files: Mapped[int] = mapped_column(sa.Integer)
+    created_at: Mapped[datetime] = mapped_column(
+        server_default=sa.func.now(),
+        default_factory=lambda: datetime.now(timezone.utc),
+        init=False,
+    )
+    updated_at: Mapped[datetime] = mapped_column(
+        server_default=sa.func.now(),
+        default_factory=lambda: datetime.now(timezone.utc),
+        init=False,
+    )
+    files: Mapped[list[File]] = relationship(
+        back_populates="dataset",
+        foreign_keys="[File.dataset_id]",
+        primaryjoin="Dataset.dataset_id==File.dataset_id",
+        default_factory=list,
+        init=False,
+        repr=False,
+    )
+
+    def asdict(self) -> Mapping[str, Any]:
+        return {
+            "dataset_id": self.dataset_id,
+            "total_files": self.total_files,
+            "created_at": self.created_at.isoformat(),
+            "updated_at": self.updated_at.isoformat(),
+        }
+
+    def __hash__(self) -> int:
+        return hash(self.dataset_id)
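
The old plain dataclass Dataset is renamed DatasetRecord and keeps its role of parsing search-API documents; the new Dataset is an ORM row keyed on dataset_id that tracks total_files plus timestamps. A small usage sketch (the dataset_id value is made up):

from esgpull.models.dataset import Dataset

row = Dataset(dataset_id="CMIP6.some.dataset.v20200101", total_files=3)
# created_at / updated_at are init=False and default to timezone-aware UTC
assert row.created_at.tzinfo is not None
print(row.asdict())
# {'dataset_id': 'CMIP6.some.dataset.v20200101', 'total_files': 3,
#  'created_at': '...', 'updated_at': '...'}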
esgpull/models/query.py CHANGED
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 from collections.abc import Iterator, MutableMapping, Sequence
 from datetime import datetime, timezone
-from typing import Any, Literal
+from typing import TYPE_CHECKING, Any, Literal
 
 import sqlalchemy as sa
 from rich.console import Console, ConsoleOptions
@@ -15,10 +15,15 @@ from typing_extensions import NotRequired, TypedDict
 from esgpull import utils
 from esgpull.exceptions import UntrackableQuery
 from esgpull.models.base import Base, Sha
+from esgpull.models.dataset import Dataset
 from esgpull.models.file import FileDict, FileStatus
 from esgpull.models.options import Options
 from esgpull.models.selection import FacetValues, Selection
 from esgpull.models.tag import Tag
+
+if TYPE_CHECKING:
+    from esgpull.models.dataset import Dataset
+
 from esgpull.models.utils import (
     find_int,
     find_str,
@@ -55,9 +60,14 @@ query_tag_proxy = sa.Table(
 
 class File(Base):
     __tablename__ = "file"
+    __table_args__ = (
+        sa.Index("ix_file_dataset_status", "dataset_id", "status"),
+    )
 
     file_id: Mapped[str] = mapped_column(sa.String(255), unique=True)
-    dataset_id: Mapped[str] = mapped_column(sa.String(255))
+    dataset_id: Mapped[str] = mapped_column(
+        sa.String(255), sa.ForeignKey("dataset.dataset_id")
+    )
     master_id: Mapped[str] = mapped_column(sa.String(255))
     url: Mapped[str] = mapped_column(sa.String(255))
     version: Mapped[str] = mapped_column(sa.String(16))
@@ -76,6 +86,11 @@ class File(Base):
         back_populates="files",
         repr=False,
     )
+    dataset: Mapped["Dataset"] = relationship(
+        back_populates="files",
+        init=False,
+        repr=False,
+    )
 
     def _as_bytes(self) -> bytes:
         self_tuple = (self.file_id, self.checksum)
@@ -100,7 +115,7 @@ class File(Base):
             size=source["size"],
         )
         if "status" in source:
-            result.status = FileStatus(source.get("source"))
+            result.status = FileStatus(source.get("status").lower())
         return result
 
     @classmethod
@@ -395,11 +410,6 @@ class Query(Base):
             self.tags.remove(tag)
         return tag is not None
 
-    def no_require(self) -> Query:
-        cl = self.clone(compute_sha=False)
-        cl._rich_no_require = True  # type: ignore [attr-defined]
-        return cl
-
     def __lshift__(self, child: Query) -> Query:
         result = self.clone(compute_sha=False)
         # if self.name != child.require:
@@ -440,7 +450,7 @@ class Query(Base):
 
     __rich_measure__ = rich_measure_impl
 
-    def _rich_tree(self) -> Tree:
+    def _rich_tree(self, hide_require: bool = False) -> Tree:
         title = Text.from_markup(self.rich_name)
         if not self.tracked:
             title.append(" untracked", style="i red")
@@ -449,7 +459,7 @@ class Query(Base):
             f"\n│ updated {format_date_iso(self.updated_at)}"
         )
         contents = Table.grid(padding=(0, 1))
-        if not hasattr(self, "_rich_no_require") and self.require is not None:
+        if not hide_require and self.require is not None:
             if len(self.require) == 40:
                 require = Text(short_sha(self.require), style="i green")
             else:
@@ -482,10 +492,44 @@ class Query(Base):
         count_ondisk, size_ondisk = self.files_count_size(FileStatus.Done)
         count_total, size_total = self.files_count_size()
         sizes = f"{format_size(size_ondisk)} / {format_size(size_total)}"
-        lens = f"{count_ondisk}/{count_total}"
-        contents.add_row(
-            "files:", Text(f"{sizes} [{lens}]", style="magenta")
-        )
+        lens = f"{count_ondisk} / {count_total}"
+
+        # Add dataset completion info
+        complete_datasets = 0
+        total_datasets = 0
+        session = object_session(self)
+        orphaned_dataset_count = 0
+
+        if session is not None:
+            from esgpull.models import sql
+
+            dataset_stats = session.execute(
+                sql.dataset.query_stats(self.sha)
+            ).all()
+
+            # Check for orphaned datasets (dataset_ids from files not in Dataset table)
+            orphaned_dataset_count = (
+                session.scalar(sql.dataset.orphaned(self.sha)) or 0
+            )
+
+            # Compute counts in Python - simpler and more maintainable
+            total_datasets = len(dataset_stats)
+            complete_datasets = sum(
+                1 for d in dataset_stats if d.done_count == d.total_files
+            )
+
+        contents.add_row("files:", Text(f"{lens}", style="magenta"))
+        if orphaned_dataset_count > 0:
+            contents.add_row(
+                "datasets:",
+                "[magenta]? / ?[/] [yellow italic]<- update for accurate datasets[/]",
+            )
+        else:
+            contents.add_row(
+                "datasets:",
+                f"[magenta]{complete_datasets} / {total_datasets}",
+            )
+        contents.add_row("size:", Text(f"{sizes}", style="magenta"))
         tree = Tree("", hide_root=True, guide_style="dim").add(title)
         if contents.row_count:
             tree.add(contents)
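
The display now reports per-dataset completion alongside file counts. The arithmetic in the new _rich_tree body reduces to comparing each dataset's done_count against its total_files; a minimal sketch over hand-written rows shaped like the query_stats result (dataset_id, total_files, done_count):

rows = [
    ("CMIP6.a.v1", 3, 3),  # every file Done -> dataset complete
    ("CMIP6.b.v1", 5, 2),  # partially downloaded
]
total_datasets = len(rows)
complete_datasets = sum(
    1 for _, total_files, done_count in rows if done_count == total_files
)
print(f"datasets: {complete_datasets} / {total_datasets}")  # datasets: 1 / 2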
esgpull/models/sql.py CHANGED
@@ -3,6 +3,7 @@ import functools
 import sqlalchemy as sa
 
 from esgpull.models import Table
+from esgpull.models.dataset import Dataset
 from esgpull.models.facet import Facet
 from esgpull.models.file import FileStatus
 from esgpull.models.query import File, Query, query_file_proxy, query_tag_proxy
@@ -11,15 +12,6 @@ from esgpull.models.synda_file import SyndaFile
 from esgpull.models.tag import Tag
 
 
-def count(item: Table) -> sa.Select[tuple[int]]:
-    table = item.__class__
-    return (
-        sa.select(sa.func.count("*"))
-        .select_from(table)
-        .filter_by(sha=item.sha)
-    )
-
-
 def count_table(table: type[Table]) -> sa.Select[tuple[int]]:
     return sa.select(sa.func.count("*")).select_from(table)
 
@@ -148,6 +140,45 @@ class file:
         return stmt
 
 
+class dataset:
+    @staticmethod
+    @functools.cache
+    def query_stats(query_sha: str) -> sa.Select[tuple[str, int, int]]:
+        return (
+            sa.select(
+                Dataset.dataset_id,
+                Dataset.total_files,
+                sa.func.count(
+                    sa.case((File.status == FileStatus.Done, 1))
+                ).label("done_count"),
+            )
+            .join(File)
+            .join(query_file_proxy)
+            .filter(query_file_proxy.c.query_sha == query_sha)
+            .filter(File.dataset_id.isnot(None))
+            .group_by(Dataset.dataset_id, Dataset.total_files)
+        )
+
+    @staticmethod
+    @functools.cache
+    def orphaned(query_sha: str) -> sa.Select[tuple[int]]:
+        return (
+            sa.select(sa.func.count(sa.distinct(File.dataset_id)))
+            .join(query_file_proxy)
+            .filter(query_file_proxy.c.query_sha == query_sha)
+            .filter(File.dataset_id.isnot(None))
+            .filter(~File.dataset_id.in_(sa.select(Dataset.dataset_id)))
+        )
+
+    @staticmethod
+    @functools.cache
+    def is_complete(dataset: Dataset) -> sa.Select[tuple[bool]]:
+        return sa.select(
+            sa.func.count(sa.case((File.status == FileStatus.Done, 1)))
+            == dataset.total_files
+        ).where(File.dataset_id == dataset.dataset_id)
+
+
 class query:
     @staticmethod
     @functools.cache
@@ -270,3 +301,11 @@ class query_file:
             .where(query_file_proxy.c.query_sha == query.sha)
             .where(query_file_proxy.c.file_sha == file.sha)
         )
+
+    @staticmethod
+    def is_linked(query: Query, file: File) -> sa.Select[tuple[bool]]:
+        return sa.select(
+            sa.exists()
+            .where(query_file_proxy.c.query_sha == query.sha)
+            .where(query_file_proxy.c.file_sha == file.sha)
+        )
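
The new sql.dataset statements back the display change above: query_stats returns one (dataset_id, total_files, done_count) row per dataset linked to a query, and orphaned counts dataset_ids on files that have no row in the dataset table yet, which is what triggers the "? / ?" hint. Note that is_complete is memoized with functools.cache on a Dataset instance, which is why dataset.py defines __hash__. A hedged usage sketch; the session wiring is illustrative:

from sqlalchemy.orm import Session

from esgpull.models import sql

def dataset_progress(session: Session, query_sha: str) -> dict[str, tuple[int, int]]:
    # Map dataset_id -> (done files, expected files) for one query.
    rows = session.execute(sql.dataset.query_stats(query_sha)).all()
    return {row.dataset_id: (row.done_count, row.total_files) for row in rows}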