esgpull 0.6.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- esgpull/__init__.py +12 -0
- esgpull/auth.py +181 -0
- esgpull/cli/__init__.py +73 -0
- esgpull/cli/add.py +103 -0
- esgpull/cli/autoremove.py +38 -0
- esgpull/cli/config.py +116 -0
- esgpull/cli/convert.py +285 -0
- esgpull/cli/decorators.py +342 -0
- esgpull/cli/download.py +74 -0
- esgpull/cli/facet.py +23 -0
- esgpull/cli/get.py +28 -0
- esgpull/cli/install.py +85 -0
- esgpull/cli/link.py +105 -0
- esgpull/cli/login.py +56 -0
- esgpull/cli/remove.py +73 -0
- esgpull/cli/retry.py +43 -0
- esgpull/cli/search.py +201 -0
- esgpull/cli/self.py +238 -0
- esgpull/cli/show.py +66 -0
- esgpull/cli/status.py +67 -0
- esgpull/cli/track.py +87 -0
- esgpull/cli/update.py +184 -0
- esgpull/cli/utils.py +247 -0
- esgpull/config.py +410 -0
- esgpull/constants.py +56 -0
- esgpull/context.py +724 -0
- esgpull/database.py +161 -0
- esgpull/download.py +162 -0
- esgpull/esgpull.py +447 -0
- esgpull/exceptions.py +167 -0
- esgpull/fs.py +253 -0
- esgpull/graph.py +460 -0
- esgpull/install_config.py +185 -0
- esgpull/migrations/README +1 -0
- esgpull/migrations/env.py +82 -0
- esgpull/migrations/script.py.mako +24 -0
- esgpull/migrations/versions/0.3.0_update_tables.py +170 -0
- esgpull/migrations/versions/0.3.1_update_tables.py +25 -0
- esgpull/migrations/versions/0.3.2_update_tables.py +26 -0
- esgpull/migrations/versions/0.3.3_update_tables.py +25 -0
- esgpull/migrations/versions/0.3.4_update_tables.py +25 -0
- esgpull/migrations/versions/0.3.5_update_tables.py +25 -0
- esgpull/migrations/versions/0.3.6_update_tables.py +26 -0
- esgpull/migrations/versions/0.3.7_update_tables.py +26 -0
- esgpull/migrations/versions/0.3.8_update_tables.py +26 -0
- esgpull/migrations/versions/0.4.0_update_tables.py +25 -0
- esgpull/migrations/versions/0.5.0_update_tables.py +26 -0
- esgpull/migrations/versions/0.5.1_update_tables.py +26 -0
- esgpull/migrations/versions/0.5.2_update_tables.py +25 -0
- esgpull/migrations/versions/0.5.3_update_tables.py +26 -0
- esgpull/migrations/versions/0.5.4_update_tables.py +25 -0
- esgpull/migrations/versions/0.5.5_update_tables.py +25 -0
- esgpull/migrations/versions/0.6.0_update_tables.py +25 -0
- esgpull/migrations/versions/0.6.1_update_tables.py +25 -0
- esgpull/migrations/versions/0.6.2_update_tables.py +25 -0
- esgpull/migrations/versions/0.6.3_update_tables.py +25 -0
- esgpull/models/__init__.py +31 -0
- esgpull/models/base.py +50 -0
- esgpull/models/dataset.py +34 -0
- esgpull/models/facet.py +18 -0
- esgpull/models/file.py +65 -0
- esgpull/models/options.py +164 -0
- esgpull/models/query.py +481 -0
- esgpull/models/selection.py +201 -0
- esgpull/models/sql.py +258 -0
- esgpull/models/synda_file.py +85 -0
- esgpull/models/tag.py +19 -0
- esgpull/models/utils.py +54 -0
- esgpull/presets.py +13 -0
- esgpull/processor.py +172 -0
- esgpull/py.typed +0 -0
- esgpull/result.py +53 -0
- esgpull/tui.py +346 -0
- esgpull/utils.py +54 -0
- esgpull/version.py +1 -0
- esgpull-0.6.3.dist-info/METADATA +110 -0
- esgpull-0.6.3.dist-info/RECORD +80 -0
- esgpull-0.6.3.dist-info/WHEEL +4 -0
- esgpull-0.6.3.dist-info/entry_points.txt +3 -0
- esgpull-0.6.3.dist-info/licenses/LICENSE +28 -0
esgpull/database.py
ADDED
@@ -0,0 +1,161 @@
from __future__ import annotations

from collections.abc import Iterator, Sequence
from contextlib import contextmanager
from dataclasses import InitVar, dataclass, field
from pathlib import Path
from typing import TypeVar

import alembic.command
import sqlalchemy as sa
import sqlalchemy.orm
from alembic.config import Config as AlembicConfig
from alembic.migration import MigrationContext
from alembic.script import ScriptDirectory
from sqlalchemy.orm import Session, joinedload, make_transient

from esgpull import __file__
from esgpull.config import Config
from esgpull.models import File, Table, sql
from esgpull.version import __version__

# from esgpull.exceptions import NoClauseError
# from esgpull.models import Query

T = TypeVar("T")


@dataclass
class Database:
    """
    Main class to interact with esgpull's sqlite db.
    """

    url: str
    run_migrations: InitVar[bool] = True
    _engine: sa.Engine = field(init=False)
    session: Session = field(init=False)
    version: str | None = field(init=False, default=None)

    @staticmethod
    def from_config(config: Config, run_migrations: bool = True) -> Database:
        url = f"sqlite:///{config.paths.db / config.db.filename}"
        return Database(url, run_migrations=run_migrations)

    def __post_init__(self, run_migrations: bool) -> None:
        self._engine = sa.create_engine(self.url)
        self.session = Session(self._engine)
        if run_migrations:
            self._update()

    def _update(self) -> None:
        alembic_config = AlembicConfig()
        migrations_path = Path(__file__).parent / "migrations"
        alembic_config.set_main_option("script_location", str(migrations_path))
        alembic_config.attributes["connection"] = self._engine
        script = ScriptDirectory.from_config(alembic_config)
        head = script.get_current_head()
        with self._engine.begin() as conn:
            opts = {"version_table": "version"}
            ctx = MigrationContext.configure(conn, opts=opts)
            self.version = ctx.get_current_revision()
        if self.version != head:
            alembic.command.upgrade(alembic_config, __version__)
            self.version = head
        if self.version != __version__:
            alembic.command.revision(
                alembic_config,
                message="update tables",
                autogenerate=True,
                rev_id=__version__,
            )
            self.version = __version__

    @property
    @contextmanager
    def safe(self) -> Iterator[None]:
        try:
            yield
        except (sa.exc.SQLAlchemyError, KeyboardInterrupt):
            self.session.rollback()
            raise

    def get(
        self,
        table: type[Table],
        sha: str,
        lazy: bool = True,
        detached: bool = False,
    ) -> Table | None:
        if lazy:
            result = self.session.get(table, sha)
        else:
            stmt = sa.select(table).filter_by(sha=sha)
            match self.scalars(stmt.options(joinedload("*")), unique=True):
                case [result]:
                    ...
                case []:
                    result = None
                case [*many]:
                    raise ValueError(f"{len(many)} found, expected 1.")
        if detached and result is not None:
            result = table(**result.asdict())
        return result

    def scalars(
        self, statement: sa.Select[tuple[T]], unique: bool = False
    ) -> Sequence[T]:
        with self.safe:
            result = self.session.scalars(statement)
            if unique:
                result = result.unique()
            return result.all()

    SomeTuple = TypeVar("SomeTuple", bound=tuple)

    def rows(self, statement: sa.Select[SomeTuple]) -> list[sa.Row[SomeTuple]]:
        with self.safe:
            return list(self.session.execute(statement).all())

    def add(self, *items: Table) -> None:
        with self.safe:
            self.session.add_all(items)
            self.session.commit()
            for item in items:
                self.session.refresh(item)

    def delete(self, *items: Table) -> None:
        with self.safe:
            for item in items:
                self.session.delete(item)
            self.session.commit()
        for item in items:
            make_transient(item)

    def __contains__(self, item: Table) -> bool:
        return self.scalars(sql.count(item))[0] > 0

    def has_file_id(self, file: File) -> bool:
        return len(self.scalars(sql.file.with_file_id(file.file_id))) == 1

    def merge(self, item: Table, commit: bool = False) -> Table:
        with self.safe:
            result = self.session.merge(item)
            if commit:
                self.session.commit()
            return result

    def get_deprecated_files(self) -> list[File]:
        duplicates = self.scalars(sql.file.duplicates())
        duplicates_dict: dict[str, list[File]] = {}
        for file in duplicates:
            duplicates_dict.setdefault(file.master_id, [])
            duplicates_dict[file.master_id].append(file)
        deprecated: list[File] = []
        for files in duplicates_dict.values():
            versions = [int(f.version[1:]) for f in files]
            latest_version = "v" + str(max(versions))
            for file in files:
                if file.version != latest_version:
                    deprecated.append(file)
        return deprecated
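Note: a minimal usage sketch of the Database API above (not part of the package; the db path and sha are placeholders, and it assumes an initialized esgpull install so migrations can run):

    from esgpull.database import Database
    from esgpull.models import File

    # Open the sqlite db; alembic migrations run on init by default.
    db = Database("sqlite:////path/to/esgpull.db")

    # Primary-key lookup by sha; lazy=False eager-loads relationships.
    file = db.get(File, sha="<sha>", lazy=False)
    if file is not None:
        print(file in db)  # __contains__: a matching row exists
        db.merge(file, commit=True)

    # Drop file records superseded by a newer version.
    db.delete(*db.get_deprecated_files())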
esgpull/download.py
ADDED
@@ -0,0 +1,162 @@
# from math import ceil
from collections.abc import AsyncGenerator
from dataclasses import dataclass

from httpx import AsyncClient

from esgpull.fs import Digest
from esgpull.models import File

# import asyncio
# from urllib.parse import urlsplit
# from esgpull.auth import Auth
# from esgpull.context import Context


@dataclass
class DownloadCtx:
    file: File
    completed: int = 0
    chunk: bytes | None = None
    digest: Digest | None = None

    @property
    def finished(self) -> bool:
        return self.completed == self.file.size

    @property
    def error(self) -> bool:
        return self.completed > self.file.size

    def update_digest(self) -> None:
        if self.digest is not None and self.chunk is not None:
            self.digest.update(self.chunk)


class BaseDownloader:
    def stream(
        self,
        client: AsyncClient,
        ctx: DownloadCtx,
        chunk_size: int,
    ) -> AsyncGenerator[DownloadCtx, None]:
        raise NotImplementedError


class Simple(BaseDownloader):
    """
    Simple chunked async downloader.
    """

    async def stream(
        self,
        client: AsyncClient,
        ctx: DownloadCtx,
        chunk_size: int,
    ) -> AsyncGenerator[DownloadCtx, None]:
        async with client.stream("GET", ctx.file.url) as resp:
            resp.raise_for_status()
            async for chunk in resp.aiter_bytes(chunk_size=chunk_size):
                ctx.completed += len(chunk)
                ctx.chunk = chunk
                ctx.update_digest()
                yield ctx
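Note: a consumption sketch for Simple.stream (illustrative only; assumes `file` is an esgpull.models.File with `url` and `size` populated, and `dest` is a writable path):

    import asyncio

    from httpx import AsyncClient

    from esgpull.download import DownloadCtx, Simple

    async def download(file, dest: str) -> None:
        ctx = DownloadCtx(file)
        async with AsyncClient(follow_redirects=True) as client:
            with open(dest, "wb") as out:
                async for ctx in Simple().stream(client, ctx, chunk_size=1 << 16):
                    if ctx.chunk is not None:
                        out.write(ctx.chunk)
        # finished/error compare bytes received against file.size
        if ctx.error or not ctx.finished:
            raise RuntimeError("downloaded size does not match file.size")

    # asyncio.run(download(file, "out.nc"))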
# class Distributed(BaseDownloader):
#     """
#     Distributed chunked async downloader.
#     Fetches chunks from multiple URLs pointing to the same file.
#     """

#     def __init__(
#         self,
#         auth: Auth,
#         *,
#         file: File | None = None,
#         url: str | None = None,
#         config: Config | None = None,
#         max_ping: float = 5.0,
#     ) -> None:
#         super().__init__(auth, file=file, url=url, config=config)
#         self.max_ping = max_ping

#     async def try_url(self, url: str, client: AsyncClient) -> str | None:
#         result = None
#         node = urlsplit(url).netloc
#         print(f"trying url on '{node}'")
#         try:
#             resp = await client.head(url)
#             print(f"got response on '{node}'")
#             resp.raise_for_status()
#             accept_ranges = resp.headers.get("Accept-Ranges")
#             content_length = resp.headers.get("Content-Length")
#             if (
#                 accept_ranges == "bytes"
#                 and int(content_length) == self.file.size
#             ):
#                 result = str(resp.url)
#             else:
#                 print(dict(resp.headers))
#         except HTTPError as err:
#             print(type(err))
#             print(err.request.headers)
#         return result

#     async def process_queue(
#         self, url: str, queue: asyncio.Queue
#     ) -> tuple[list[tuple[int, bytes]], str]:
#         node = urlsplit(url).netloc
#         print(f"starting process on '{node}'")
#         chunks: list[tuple[int, bytes]] = []
#         async with self.make_client() as client:
#             final_url = await self.try_url(url, client)
#             if final_url is None:
#                 print(f"no url found for '{node}'")
#                 return chunks, url
#             else:
#                 url = final_url
#             while not queue.empty():
#                 chunk_idx = await queue.get()
#                 print(f"processing chunk {chunk_idx} on '{node}'")
#                 start = chunk_idx * self.config.download.chunk_size
#                 end = min(
#                     self.file.size,
#                     (chunk_idx + 1) * self.config.download.chunk_size - 1,
#                 )
#                 headers = {"Range": f"bytes={start}-{end}"}
#                 resp = await client.get(url, headers=headers)
#                 queue.task_done()
#                 if resp.status_code == 206:
#                     chunks.append((chunk_idx, resp.content))
#                 else:
#                     await queue.put(chunk_idx)
#                     print(f"error status {resp.status_code} on '{node}'")
#                     break
#         return chunks, url

#     async def fetch_urls(self) -> list[str]:
#         ctx = Context(distrib=True)
#         ctx.query.instance_id = self.file.file_id
#         results = await ctx._search(file=True)
#         files = [File.from_dict(item) for item in results]
#         return [file.url for file in files]

#     async def aget(self) -> bytes:
#         nb_chunks = ceil(self.file.size / self.config.download.chunk_size)
#         queue: asyncio.Queue[int] = asyncio.Queue(nb_chunks)
#         for chunk_idx in range(nb_chunks):
#             queue.put_nowait(chunk_idx)
#         completed: list[bool] = [False for _ in range(nb_chunks)]
#         chunks: list[bytes] = [bytes() for _ in range(nb_chunks)]
#         urls = await self.fetch_urls()
#         workers = [self.process_queue(url, queue) for url in urls]
#         for future in asyncio.as_completed(workers):
#             some_chunks, url = await future
#             print(f"got {len(some_chunks)} chunks from {url}")
#             for chunk_idx, chunk in some_chunks:
#                 completed[chunk_idx] = True
#                 chunks[chunk_idx] = chunk
#         if not all(completed):
#             raise ValueError("TODO: progressive write (with .part file)")
#         return b"".join(chunks)
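Note: the commented-out Distributed downloader above splits a file into fixed-size chunks and fetches them concurrently from mirror URLs via HTTP Range requests. A standalone sketch of that core primitive (hypothetical helper, not part of the package):

    from httpx import AsyncClient

    async def fetch_range(
        client: AsyncClient, url: str, start: int, end: int
    ) -> bytes:
        # Request bytes [start, end] (inclusive); servers that honor
        # Range reply with 206 Partial Content.
        resp = await client.get(url, headers={"Range": f"bytes={start}-{end}"})
        if resp.status_code != 206:
            raise RuntimeError(f"Range not honored: {resp.status_code}")
        return resp.content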