datashare-python 0.6.0__tar.gz → 0.6.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datashare_python-0.6.0 → datashare_python-0.6.2}/PKG-INFO +1 -1
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/cli/task.py +2 -3
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/config.py +5 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/conftest.py +1 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/constants.py +2 -3
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/objects.py +87 -6
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/utils.py +19 -13
- datashare_python-0.6.2/datashare_python/worker-template.tar.gz +0 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/pyproject.toml +1 -1
- datashare_python-0.6.0/datashare_python/worker-template.tar.gz +0 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/.gitignore +0 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/README.md +0 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/.gitignore +0 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/__init__.py +0 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/__main__.py +0 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/cli/__init__.py +0 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/cli/project.py +0 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/cli/utils.py +0 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/cli/worker.py +0 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/dependencies.py +0 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/discovery.py +0 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/exceptions.py +0 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/task_client.py +0 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/template.py +0 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/types_.py +0 -0
- {datashare_python-0.6.0 → datashare_python-0.6.2}/datashare_python/worker.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datashare-python
|
|
3
|
-
Version: 0.6.0
|
|
3
|
+
Version: 0.6.2
|
|
4
4
|
Summary: Manage Python tasks and local resources in Datashare
|
|
5
5
|
Project-URL: Homepage, https://icij.github.io/datashare-python/
|
|
6
6
|
Project-URL: Documentation, https://icij.github.io/datashare-python/
|
|
@@ -10,8 +10,7 @@ import typer
|
|
|
10
10
|
from alive_progress import alive_bar
|
|
11
11
|
|
|
12
12
|
from datashare_python.cli.utils import AsyncTyper, eprint
|
|
13
|
-
from datashare_python.
|
|
14
|
-
from datashare_python.objects import READY_STATES, Task, TaskError, TaskState
|
|
13
|
+
from datashare_python.objects import READY_STATES, Task, TaskError, TaskGroup, TaskState
|
|
15
14
|
from datashare_python.task_client import DatashareTaskClient
|
|
16
15
|
|
|
17
16
|
logger = logging.getLogger(__name__)
|
|
@@ -41,7 +40,7 @@ async def start(
|
|
|
41
40
|
group: Annotated[
|
|
42
41
|
str | None,
|
|
43
42
|
typer.Option("--group", "-g", help=_GROUP_HELP),
|
|
44
|
-
] =
|
|
43
|
+
] = TaskGroup.python, # noqa: F821
|
|
45
44
|
ds_address: Annotated[
|
|
46
45
|
str, typer.Option("--ds-address", "-a", help=_DS_URL_HELP)
|
|
47
46
|
] = DEFAULT_DS_ADDRESS,
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
from pathlib import Path
|
|
1
2
|
from typing import ClassVar
|
|
2
3
|
|
|
3
4
|
from icij_common.es import ESClient
|
|
@@ -87,6 +88,10 @@ class WorkerConfig(ICIJSettings, LogWithWorkerIDMixin, BaseModel):
|
|
|
87
88
|
|
|
88
89
|
max_concurrent_io_activities: int = 5
|
|
89
90
|
|
|
91
|
+
docs_root: Path | None = None
|
|
92
|
+
artifacts_root: Path | None = None
|
|
93
|
+
workdir: Path | None = None
|
|
94
|
+
|
|
90
95
|
def to_es_client(self) -> ESClient:
|
|
91
96
|
return self.elasticsearch.to_es_client(self.datashare.api_key)
|
|
92
97
|
|
|
@@ -1,11 +1,8 @@
|
|
|
1
1
|
from pathlib import Path
|
|
2
2
|
|
|
3
|
-
from .objects import TaskGroup
|
|
4
|
-
|
|
5
3
|
PACKAGE_DIR = Path(__file__).parent
|
|
6
4
|
PACKAGE_ROOT = PACKAGE_DIR.parent
|
|
7
5
|
|
|
8
|
-
PYTHON_TASK_GROUP = TaskGroup(name="PYTHON")
|
|
9
6
|
|
|
10
7
|
DEFAULT_TEMPORAL_ADDRESS = "temporal:7233"
|
|
11
8
|
|
|
@@ -14,3 +11,5 @@ DEFAULT_DS_ADDRESS = "http://localhost:8080"
|
|
|
14
11
|
DEFAULT_NAMESPACE = "datashare-default"
|
|
15
12
|
|
|
16
13
|
METADATA_JSON = "metadata.json"
|
|
14
|
+
|
|
15
|
+
TIKA_METADATA_RESOURCENAME = "tika_metadata_resourcename"
|
|
@@ -5,12 +5,24 @@ from datetime import UTC, datetime
|
|
|
5
5
|
from enum import StrEnum, unique
|
|
6
6
|
from io import BytesIO
|
|
7
7
|
from pathlib import Path
|
|
8
|
-
from typing import Any, Literal, Self, TypeVar
|
|
8
|
+
from typing import Any, Literal, Self, TypeVar, cast
|
|
9
9
|
|
|
10
10
|
from temporalio import workflow
|
|
11
11
|
|
|
12
|
+
from .constants import TIKA_METADATA_RESOURCENAME
|
|
13
|
+
|
|
12
14
|
with workflow.unsafe.imports_passed_through():
|
|
13
|
-
from icij_common.es import
|
|
15
|
+
from icij_common.es import (
|
|
16
|
+
DOC_CONTENT,
|
|
17
|
+
DOC_CONTENT_TRANSLATED,
|
|
18
|
+
DOC_LANGUAGE,
|
|
19
|
+
DOC_METADATA,
|
|
20
|
+
DOC_PATH,
|
|
21
|
+
DOC_ROOT_ID,
|
|
22
|
+
ID_,
|
|
23
|
+
INDEX_,
|
|
24
|
+
SOURCE,
|
|
25
|
+
)
|
|
14
26
|
|
|
15
27
|
from icij_common.pydantic_utils import (
|
|
16
28
|
icij_config,
|
|
@@ -137,11 +149,48 @@ class Task(Message):
|
|
|
137
149
|
class TaskGroup:
|
|
138
150
|
name: str
|
|
139
151
|
|
|
152
|
+
@property
|
|
153
|
+
@classmethod
|
|
154
|
+
def python(cls) -> Self:
|
|
155
|
+
return cls(name="PYTHON")
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
@unique
|
|
159
|
+
class DocumentLocation(StrEnum):
|
|
160
|
+
ORIGINAL = "original"
|
|
161
|
+
ARTIFACTS = "artifacts"
|
|
162
|
+
WORKDIR = "workdir"
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
class FilesystemDocument(DatashareModel):
|
|
166
|
+
id: str
|
|
167
|
+
path: Path
|
|
168
|
+
index: str
|
|
169
|
+
location: DocumentLocation
|
|
170
|
+
resource_name: str
|
|
171
|
+
|
|
172
|
+
def locate(
|
|
173
|
+
self, original_root: Path, *, artifacts_root: Path, workdir: Path
|
|
174
|
+
) -> Path:
|
|
175
|
+
from datashare_python.utils import artifacts_dir # noqa: PLC0415
|
|
176
|
+
|
|
177
|
+
project = self.index
|
|
178
|
+
match self.location:
|
|
179
|
+
case DocumentLocation.ORIGINAL:
|
|
180
|
+
return original_root / self.path
|
|
181
|
+
case DocumentLocation.ARTIFACTS:
|
|
182
|
+
return artifacts_root / artifacts_dir(self.id, project=project) / "raw"
|
|
183
|
+
case DocumentLocation.WORKDIR:
|
|
184
|
+
return workdir / self.path
|
|
185
|
+
case _:
|
|
186
|
+
raise ValueError(f"invalid location: {self.path}")
|
|
187
|
+
|
|
140
188
|
|
|
141
189
|
class Document(DatashareModel):
|
|
142
190
|
id: str
|
|
143
|
-
root_document: str
|
|
144
191
|
language: str
|
|
192
|
+
index: str | None = None
|
|
193
|
+
root_document: str | None = None
|
|
145
194
|
content: str | None = None
|
|
146
195
|
content_type: str | None = None
|
|
147
196
|
path: Path | None = None
|
|
@@ -149,6 +198,7 @@ class Document(DatashareModel):
|
|
|
149
198
|
content_translated: dict[str, str] = Field(
|
|
150
199
|
default_factory=dict, alias="content_translated"
|
|
151
200
|
)
|
|
201
|
+
metadata: dict[str, Any] | None = None
|
|
152
202
|
type: str = Field(default="Document", frozen=True)
|
|
153
203
|
|
|
154
204
|
@classmethod
|
|
@@ -156,11 +206,42 @@ class Document(DatashareModel):
|
|
|
156
206
|
sources = es_doc[SOURCE]
|
|
157
207
|
return cls(
|
|
158
208
|
id=es_doc[ID_],
|
|
159
|
-
|
|
160
|
-
|
|
209
|
+
index=es_doc.get(INDEX_),
|
|
210
|
+
content=sources.get(DOC_CONTENT),
|
|
211
|
+
content_translated=sources.get(DOC_CONTENT_TRANSLATED, dict()),
|
|
161
212
|
language=sources[DOC_LANGUAGE],
|
|
162
|
-
root_document=sources
|
|
213
|
+
root_document=sources.get(DOC_ROOT_ID),
|
|
163
214
|
tags=sources.get("tags", []),
|
|
215
|
+
path=sources.get(DOC_PATH),
|
|
216
|
+
metadata=sources.get(DOC_METADATA),
|
|
217
|
+
)
|
|
218
|
+
|
|
219
|
+
def to_filesystem(self) -> FilesystemDocument:
|
|
220
|
+
from .utils import artifacts_dir # noqa: PLC0415
|
|
221
|
+
|
|
222
|
+
if self.metadata is None:
|
|
223
|
+
raise ValueError(
|
|
224
|
+
"can't compute filesyste path for document withtout metadata"
|
|
225
|
+
)
|
|
226
|
+
resource_name = cast(str, self.metadata[TIKA_METADATA_RESOURCENAME])
|
|
227
|
+
if self.root_document is None:
|
|
228
|
+
path = self.path
|
|
229
|
+
location = DocumentLocation.ORIGINAL
|
|
230
|
+
else:
|
|
231
|
+
if self.index is None:
|
|
232
|
+
msg = (
|
|
233
|
+
f"can't compute filesystem path for embedded doc {self.id} without"
|
|
234
|
+
f" index"
|
|
235
|
+
)
|
|
236
|
+
raise ValueError(msg)
|
|
237
|
+
path = artifacts_dir(doc_id=self.id, project=self.index) / "raw"
|
|
238
|
+
location = DocumentLocation.ARTIFACTS
|
|
239
|
+
return FilesystemDocument(
|
|
240
|
+
id=self.id,
|
|
241
|
+
path=path,
|
|
242
|
+
index=self.index,
|
|
243
|
+
location=location,
|
|
244
|
+
resource_name=resource_name,
|
|
164
245
|
)
|
|
165
246
|
|
|
166
247
|
|
|
@@ -440,12 +440,12 @@ def safe_dir(doc_id: str) -> Path:
|
|
|
440
440
|
return Path(*parts)
|
|
441
441
|
|
|
442
442
|
|
|
443
|
-
def
|
|
443
|
+
def artifacts_dir(doc_id: str, *, project: str) -> Path:
|
|
444
444
|
return Path(project, safe_dir(doc_id), doc_id)
|
|
445
445
|
|
|
446
446
|
|
|
447
447
|
def _metadata_path(doc_id: str, *, project: str) -> Path:
|
|
448
|
-
metadata_path =
|
|
448
|
+
metadata_path = artifacts_dir(doc_id, project=project) / METADATA_JSON
|
|
449
449
|
return metadata_path
|
|
450
450
|
|
|
451
451
|
|
|
@@ -455,7 +455,7 @@ def _read_artifact_metadata(root: Path, artifact: DocArtifact) -> dict:
|
|
|
455
455
|
|
|
456
456
|
|
|
457
457
|
def write_artifact(root: Path, artifact: DocArtifact) -> Path:
|
|
458
|
-
artif_dir = root /
|
|
458
|
+
artif_dir = root / artifacts_dir(artifact.doc_id, project=artifact.project)
|
|
459
459
|
artif_dir.mkdir(exist_ok=True, parents=True)
|
|
460
460
|
# TODO: if transcriptions are too large we could also serialize them
|
|
461
461
|
# as jsonl
|
|
@@ -479,17 +479,23 @@ def debuggable_name(
|
|
|
479
479
|
displayable_file_name = [c[:component_size_limit] for c in path.parts]
|
|
480
480
|
uuid = sha256(str(path).encode()).hexdigest() if deterministic else uuid4().hex
|
|
481
481
|
uuid = uuid[:20]
|
|
482
|
-
return f"{uuid}-{'
|
|
482
|
+
return f"{uuid}-{'--'.join(displayable_file_name)}"
|
|
483
483
|
|
|
484
484
|
|
|
485
|
-
def activity_contextual_id(
|
|
485
|
+
def activity_contextual_id(
|
|
486
|
+
*, wf_context: bool = True, act_context: bool = False, run_context: bool = False
|
|
487
|
+
) -> str:
|
|
488
|
+
contextual_id = []
|
|
486
489
|
act_info = activity.info()
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
act_type = act_info.activity_type
|
|
490
|
-
contextual_id = f"{act_type}-{act_id}-{act_run_id}"
|
|
490
|
+
if not wf_context and not act_context:
|
|
491
|
+
raise ValueError("at least one of wf_context and act_context must be True")
|
|
491
492
|
if wf_context:
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
493
|
+
contextual_id.append(act_info.workflow_id)
|
|
494
|
+
if run_context:
|
|
495
|
+
contextual_id.append(act_info.workflow_run_id)
|
|
496
|
+
if act_context:
|
|
497
|
+
contextual_id.append(act_info.activity_type)
|
|
498
|
+
contextual_id.append(act_info.activity_id)
|
|
499
|
+
if run_context:
|
|
500
|
+
contextual_id.append(act_info.activity_run_id)
|
|
501
|
+
return "-".join(contextual_id)
|
|
Binary file
|
|
Binary file
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|