datashare-python 0.5.0__tar.gz → 0.6.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datashare_python-0.5.0 → datashare_python-0.6.0}/PKG-INFO +1 -1
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/cli/__init__.py +0 -2
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/conftest.py +8 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/objects.py +12 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/utils.py +24 -28
- datashare_python-0.6.0/datashare_python/worker-template.tar.gz +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/pyproject.toml +1 -1
- datashare_python-0.5.0/datashare_python/cli/local.py +0 -33
- datashare_python-0.5.0/datashare_python/local_client.py +0 -69
- datashare_python-0.5.0/datashare_python/worker-template.tar.gz +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/.gitignore +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/README.md +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/.gitignore +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/__init__.py +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/__main__.py +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/cli/project.py +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/cli/task.py +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/cli/utils.py +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/cli/worker.py +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/config.py +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/constants.py +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/dependencies.py +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/discovery.py +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/exceptions.py +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/task_client.py +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/template.py +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/types_.py +0 -0
- {datashare_python-0.5.0 → datashare_python-0.6.0}/datashare_python/worker.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datashare-python
|
|
3
|
-
Version: 0.5.0
|
|
3
|
+
Version: 0.6.0
|
|
4
4
|
Summary: Manage Python tasks and local resources in Datashare
|
|
5
5
|
Project-URL: Homepage, https://icij.github.io/datashare-python/
|
|
6
6
|
Project-URL: Documentation, https://icij.github.io/datashare-python/
|
|
@@ -6,7 +6,6 @@ import typer
|
|
|
6
6
|
from icij_common.logging_utils import setup_loggers
|
|
7
7
|
|
|
8
8
|
import datashare_python
|
|
9
|
-
from datashare_python.cli.local import local_app
|
|
10
9
|
from datashare_python.cli.project import project_app
|
|
11
10
|
from datashare_python.cli.task import task_app
|
|
12
11
|
from datashare_python.cli.utils import AsyncTyper
|
|
@@ -16,7 +15,6 @@ cli_app = AsyncTyper(
|
|
|
16
15
|
context_settings={"help_option_names": ["-h", "--help"]},
|
|
17
16
|
pretty_exceptions_enable=False,
|
|
18
17
|
)
|
|
19
|
-
cli_app.add_typer(local_app)
|
|
20
18
|
cli_app.add_typer(project_app)
|
|
21
19
|
cli_app.add_typer(task_app)
|
|
22
20
|
cli_app.add_typer(worker_app)
|
|
@@ -4,6 +4,7 @@ from collections.abc import AsyncGenerator, Generator, Iterator, Sequence
|
|
|
4
4
|
from pathlib import Path
|
|
5
5
|
|
|
6
6
|
import aiohttp
|
|
7
|
+
import nest_asyncio
|
|
7
8
|
import pytest
|
|
8
9
|
from elasticsearch._async.helpers import async_streaming_bulk
|
|
9
10
|
from icij_common.es import DOC_ROOT_ID, ES_DOCUMENT_TYPE, ID, ESClient
|
|
@@ -200,6 +201,8 @@ def index_docs_ops(
|
|
|
200
201
|
}
|
|
201
202
|
doc = doc.model_dump(by_alias=True) # noqa: PLW2901
|
|
202
203
|
op.update(doc)
|
|
204
|
+
if "path" in op:
|
|
205
|
+
op["path"] = str(op["path"])
|
|
203
206
|
op["_id"] = doc[ID]
|
|
204
207
|
op["routing"] = doc[DOC_ROOT_ID]
|
|
205
208
|
op["type"] = ES_DOCUMENT_TYPE
|
|
@@ -294,3 +297,8 @@ async def all_done(task_client: DatashareTaskClient, not_done: list[str]) -> boo
|
|
|
294
297
|
return False
|
|
295
298
|
not_done.remove(t_id)
|
|
296
299
|
return True
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
@pytest.fixture # noqa: F405
|
|
303
|
+
def typer_asyncio_patch() -> None:
|
|
304
|
+
nest_asyncio.apply()
|
|
@@ -3,6 +3,8 @@ from collections.abc import Awaitable, Callable
|
|
|
3
3
|
from dataclasses import dataclass
|
|
4
4
|
from datetime import UTC, datetime
|
|
5
5
|
from enum import StrEnum, unique
|
|
6
|
+
from io import BytesIO
|
|
7
|
+
from pathlib import Path
|
|
6
8
|
from typing import Any, Literal, Self, TypeVar
|
|
7
9
|
|
|
8
10
|
from temporalio import workflow
|
|
@@ -142,6 +144,7 @@ class Document(DatashareModel):
|
|
|
142
144
|
language: str
|
|
143
145
|
content: str | None = None
|
|
144
146
|
content_type: str | None = None
|
|
147
|
+
path: Path | None = None
|
|
145
148
|
tags: list[str] = Field(default_factory=list)
|
|
146
149
|
content_translated: dict[str, str] = Field(
|
|
147
150
|
default_factory=dict, alias="content_translated"
|
|
@@ -159,3 +162,12 @@ class Document(DatashareModel):
|
|
|
159
162
|
root_document=sources[DOC_ROOT_ID],
|
|
160
163
|
tags=sources.get("tags", []),
|
|
161
164
|
)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@dataclass(frozen=True)
|
|
168
|
+
class DocArtifact:
|
|
169
|
+
project: str
|
|
170
|
+
doc_id: str
|
|
171
|
+
artifact: bytes | BytesIO
|
|
172
|
+
filename: str
|
|
173
|
+
metadata_key: str
|
|
@@ -13,6 +13,7 @@ from datetime import timedelta
|
|
|
13
13
|
from functools import partial, wraps
|
|
14
14
|
from hashlib import sha256
|
|
15
15
|
from inspect import signature
|
|
16
|
+
from io import BytesIO
|
|
16
17
|
from pathlib import Path
|
|
17
18
|
from typing import Any, ParamSpec, TypeVar
|
|
18
19
|
from uuid import uuid4
|
|
@@ -34,6 +35,7 @@ from temporalio.common import RetryPolicy, SearchAttributeKey
|
|
|
34
35
|
from temporalio.exceptions import ApplicationError
|
|
35
36
|
|
|
36
37
|
from .constants import METADATA_JSON
|
|
38
|
+
from .objects import DocArtifact
|
|
37
39
|
from .types_ import ProgressRateHandler, RawProgressHandler
|
|
38
40
|
|
|
39
41
|
DependencyLabel = str | None
|
|
@@ -431,49 +433,43 @@ class LogWithWorkerIDMixin:
|
|
|
431
433
|
return handlers
|
|
432
434
|
|
|
433
435
|
|
|
434
|
-
def safe_dir(
|
|
435
|
-
|
|
436
|
-
|
|
436
|
+
def safe_dir(doc_id: str) -> Path:
|
|
437
|
+
if len(doc_id) < 4:
|
|
438
|
+
raise ValueError(f"expected doc_id to be at least 4, found {doc_id}")
|
|
439
|
+
parts = (p for p in (doc_id[:2], doc_id[2:4]) if p)
|
|
437
440
|
return Path(*parts)
|
|
438
441
|
|
|
439
442
|
|
|
440
|
-
def
|
|
441
|
-
return Path(project, safe_dir(
|
|
443
|
+
def _artifacts_dir(doc_id: str, *, project: str) -> Path:
|
|
444
|
+
return Path(project, safe_dir(doc_id), doc_id)
|
|
442
445
|
|
|
443
446
|
|
|
444
|
-
def
|
|
445
|
-
metadata_path =
|
|
447
|
+
def _metadata_path(doc_id: str, *, project: str) -> Path:
|
|
448
|
+
metadata_path = _artifacts_dir(doc_id, project=project) / METADATA_JSON
|
|
446
449
|
return metadata_path
|
|
447
450
|
|
|
448
451
|
|
|
449
|
-
def _read_artifact_metadata(root: Path,
|
|
450
|
-
m_path = root /
|
|
452
|
+
def _read_artifact_metadata(root: Path, artifact: DocArtifact) -> dict:
|
|
453
|
+
m_path = root / _metadata_path(artifact.filename, project=artifact.project)
|
|
451
454
|
return json.loads(m_path.read_text())
|
|
452
455
|
|
|
453
456
|
|
|
454
|
-
def write_artifact(
|
|
455
|
-
artifact
|
|
456
|
-
root: Path,
|
|
457
|
-
*,
|
|
458
|
-
project: str,
|
|
459
|
-
filename: str,
|
|
460
|
-
metadata_key: str,
|
|
461
|
-
metadata_value: str,
|
|
462
|
-
) -> Path:
|
|
463
|
-
artif_dir = root / artifacts_dir(project, filename=filename)
|
|
457
|
+
def write_artifact(root: Path, artifact: DocArtifact) -> Path:
|
|
458
|
+
artif_dir = root / _artifacts_dir(artifact.doc_id, project=artifact.project)
|
|
464
459
|
artif_dir.mkdir(exist_ok=True, parents=True)
|
|
465
460
|
# TODO: if transcriptions are too large we could also serialize them
|
|
466
461
|
# as jsonl
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
462
|
+
artifact_path: Path = artif_dir / artifact.filename
|
|
463
|
+
if isinstance(artifact.artifact, bytes):
|
|
464
|
+
artifact_path.write_bytes(artifact.artifact)
|
|
465
|
+
elif isinstance(artifact_path, BytesIO):
|
|
466
|
+
with artifact_path.open("wb") as f:
|
|
467
|
+
f.write(artifact.artifact.read())
|
|
468
|
+
meta_path = root / _metadata_path(artifact.doc_id, project=artifact.project)
|
|
469
|
+
meta = _read_artifact_metadata(root, artifact) if meta_path.exists() else dict()
|
|
470
|
+
meta[artifact.metadata_key] = artifact.filename
|
|
475
471
|
meta_path.write_text(json.dumps(meta))
|
|
476
|
-
return
|
|
472
|
+
return artifact_path.relative_to(artif_dir)
|
|
477
473
|
|
|
478
474
|
|
|
479
475
|
def debuggable_name(
|
|
Binary file
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
from typing import Annotated
|
|
2
|
-
|
|
3
|
-
import typer
|
|
4
|
-
|
|
5
|
-
from datashare_python.cli.utils import AsyncTyper
|
|
6
|
-
from datashare_python.constants import DEFAULT_NAMESPACE, DEFAULT_TEMPORAL_ADDRESS
|
|
7
|
-
from datashare_python.local_client import LocalClient
|
|
8
|
-
|
|
9
|
-
_REGISTER_NAMESPACE_HELP = "register namespace"
|
|
10
|
-
_TEMPORAL_URL_HELP = "address for temporal server"
|
|
11
|
-
_NAMESPACE_HELP = "namespace name"
|
|
12
|
-
_LOCAL = "local"
|
|
13
|
-
|
|
14
|
-
local_app = AsyncTyper(name=_LOCAL)
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@local_app.async_command(help=_REGISTER_NAMESPACE_HELP)
|
|
18
|
-
async def register_namespace(
|
|
19
|
-
namespace: Annotated[
|
|
20
|
-
str, typer.Option("--namespace", "-n", help=_NAMESPACE_HELP)
|
|
21
|
-
] = DEFAULT_NAMESPACE,
|
|
22
|
-
temporal_address: Annotated[
|
|
23
|
-
str, typer.Option("--temporal-address", "-a", help=_TEMPORAL_URL_HELP)
|
|
24
|
-
] = DEFAULT_TEMPORAL_ADDRESS,
|
|
25
|
-
) -> None:
|
|
26
|
-
"""Create namespace
|
|
27
|
-
|
|
28
|
-
:param namespace: namespace
|
|
29
|
-
:param temporal_address: target host
|
|
30
|
-
"""
|
|
31
|
-
client = LocalClient()
|
|
32
|
-
|
|
33
|
-
await client.register_namespace(temporal_address, namespace)
|
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
import asyncio
|
|
2
|
-
import importlib
|
|
3
|
-
import logging
|
|
4
|
-
from functools import partial
|
|
5
|
-
|
|
6
|
-
from google.protobuf.duration_pb2 import Duration
|
|
7
|
-
from temporalio.api.workflowservice.v1 import (
|
|
8
|
-
ListNamespacesRequest,
|
|
9
|
-
RegisterNamespaceRequest,
|
|
10
|
-
)
|
|
11
|
-
from temporalio.service import ConnectConfig, ServiceClient
|
|
12
|
-
|
|
13
|
-
LOGGER = logging.getLogger(__name__)
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
class LocalClient:
|
|
17
|
-
@staticmethod
|
|
18
|
-
async def start_workers(
|
|
19
|
-
temporal_address: str, worker_paths: list[str], *args, **kwargs
|
|
20
|
-
) -> None:
|
|
21
|
-
"""Start worker modules, defined as a list of worker paths
|
|
22
|
-
|
|
23
|
-
:param temporal_address: temporal address string (host:port)
|
|
24
|
-
:param worker_paths: list of worker modules
|
|
25
|
-
"""
|
|
26
|
-
workers = []
|
|
27
|
-
|
|
28
|
-
for worker_path in worker_paths:
|
|
29
|
-
try:
|
|
30
|
-
module_parts = worker_path.split(".")
|
|
31
|
-
module_path = ".".join(module_parts[:-1])
|
|
32
|
-
worker_method = module_parts[-1]
|
|
33
|
-
module = importlib.import_module(module_path)
|
|
34
|
-
worker = getattr(module, worker_method)
|
|
35
|
-
worker_partial = partial(worker, *args, **kwargs)
|
|
36
|
-
|
|
37
|
-
workers.append(worker_partial(target_host=temporal_address))
|
|
38
|
-
|
|
39
|
-
LOGGER.info("'%s' imported successfully.", worker_path)
|
|
40
|
-
except (ModuleNotFoundError, AttributeError):
|
|
41
|
-
LOGGER.error("'%s' not found in path. Skipping.", worker_path)
|
|
42
|
-
continue
|
|
43
|
-
|
|
44
|
-
await asyncio.gather(*workers, return_exceptions=True)
|
|
45
|
-
|
|
46
|
-
@staticmethod
|
|
47
|
-
async def register_namespace(temporal_address: str, namespace: str) -> None:
|
|
48
|
-
"""Register a temporal namespace
|
|
49
|
-
|
|
50
|
-
:param temporal_address: temporal address string
|
|
51
|
-
:param namespace: namespace string
|
|
52
|
-
"""
|
|
53
|
-
client = await ServiceClient.connect(
|
|
54
|
-
ConnectConfig(target_host=temporal_address)
|
|
55
|
-
)
|
|
56
|
-
list_resp = await client.workflow_service.list_namespaces(
|
|
57
|
-
ListNamespacesRequest()
|
|
58
|
-
)
|
|
59
|
-
|
|
60
|
-
if namespace in [ns.namespace_info.name for ns in list_resp.namespaces]:
|
|
61
|
-
return
|
|
62
|
-
|
|
63
|
-
await client.workflow_service.register_namespace(
|
|
64
|
-
RegisterNamespaceRequest(
|
|
65
|
-
namespace=namespace,
|
|
66
|
-
# retain for thirty days
|
|
67
|
-
workflow_execution_retention_period=Duration(seconds=30 * 24 * 60 * 60),
|
|
68
|
-
)
|
|
69
|
-
)
|
|
Binary file
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|