datashare-python 0.6.0__py3-none-any.whl → 0.6.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -10,8 +10,7 @@ import typer
10
10
  from alive_progress import alive_bar
11
11
 
12
12
  from datashare_python.cli.utils import AsyncTyper, eprint
13
- from datashare_python.constants import PYTHON_TASK_GROUP
14
- from datashare_python.objects import READY_STATES, Task, TaskError, TaskState
13
+ from datashare_python.objects import READY_STATES, Task, TaskError, TaskGroup, TaskState
15
14
  from datashare_python.task_client import DatashareTaskClient
16
15
 
17
16
  logger = logging.getLogger(__name__)
@@ -41,7 +40,7 @@ async def start(
41
40
  group: Annotated[
42
41
  str | None,
43
42
  typer.Option("--group", "-g", help=_GROUP_HELP),
44
- ] = PYTHON_TASK_GROUP.name,
43
+ ] = TaskGroup.python, # noqa: F821
45
44
  ds_address: Annotated[
46
45
  str, typer.Option("--ds-address", "-a", help=_DS_URL_HELP)
47
46
  ] = DEFAULT_DS_ADDRESS,
@@ -1,3 +1,4 @@
1
+ from pathlib import Path
1
2
  from typing import ClassVar
2
3
 
3
4
  from icij_common.es import ESClient
@@ -87,6 +88,10 @@ class WorkerConfig(ICIJSettings, LogWithWorkerIDMixin, BaseModel):
87
88
 
88
89
  max_concurrent_io_activities: int = 5
89
90
 
91
+ docs_root: Path | None = None
92
+ artifacts_root: Path | None = None
93
+ workdir: Path | None = None
94
+
90
95
  def to_es_client(self) -> ESClient:
91
96
  return self.elasticsearch.to_es_client(self.datashare.api_key)
92
97
 
@@ -262,6 +262,7 @@ def doc_2() -> Document:
262
262
  def doc_3() -> Document:
263
263
  return Document(
264
264
  id="doc-3",
265
+ index=TEST_PROJECT,
265
266
  root_document="root-3",
266
267
  language="SPANISH",
267
268
  content="traduce este texto al inglés",
@@ -1,11 +1,8 @@
1
1
  from pathlib import Path
2
2
 
3
- from .objects import TaskGroup
4
-
5
3
  PACKAGE_DIR = Path(__file__).parent
6
4
  PACKAGE_ROOT = PACKAGE_DIR.parent
7
5
 
8
- PYTHON_TASK_GROUP = TaskGroup(name="PYTHON")
9
6
 
10
7
  DEFAULT_TEMPORAL_ADDRESS = "temporal:7233"
11
8
 
@@ -14,3 +11,5 @@ DEFAULT_DS_ADDRESS = "http://localhost:8080"
14
11
  DEFAULT_NAMESPACE = "datashare-default"
15
12
 
16
13
  METADATA_JSON = "metadata.json"
14
+
15
+ TIKA_METADATA_RESOURCENAME = "tika_metadata_resourcename"
@@ -5,12 +5,24 @@ from datetime import UTC, datetime
5
5
  from enum import StrEnum, unique
6
6
  from io import BytesIO
7
7
  from pathlib import Path
8
- from typing import Any, Literal, Self, TypeVar
8
+ from typing import Any, Literal, Self, TypeVar, cast
9
9
 
10
10
  from temporalio import workflow
11
11
 
12
+ from .constants import TIKA_METADATA_RESOURCENAME
13
+
12
14
  with workflow.unsafe.imports_passed_through():
13
- from icij_common.es import DOC_CONTENT, DOC_LANGUAGE, DOC_ROOT_ID, ID_, SOURCE
15
+ from icij_common.es import (
16
+ DOC_CONTENT,
17
+ DOC_CONTENT_TRANSLATED,
18
+ DOC_LANGUAGE,
19
+ DOC_METADATA,
20
+ DOC_PATH,
21
+ DOC_ROOT_ID,
22
+ ID_,
23
+ INDEX_,
24
+ SOURCE,
25
+ )
14
26
 
15
27
  from icij_common.pydantic_utils import (
16
28
  icij_config,
@@ -137,11 +149,48 @@ class Task(Message):
137
149
  class TaskGroup:
138
150
  name: str
139
151
 
152
+ @property
153
+ @classmethod
154
+ def python(cls) -> Self:
155
+ return cls(name="PYTHON")
156
+
157
+
158
+ @unique
159
+ class DocumentLocation(StrEnum):
160
+ ORIGINAL = "original"
161
+ ARTIFACTS = "artifacts"
162
+ WORKDIR = "workdir"
163
+
164
+
165
+ class FilesystemDocument(DatashareModel):
166
+ id: str
167
+ path: Path
168
+ index: str
169
+ location: DocumentLocation
170
+ resource_name: str
171
+
172
+ def locate(
173
+ self, original_root: Path, *, artifacts_root: Path, workdir: Path
174
+ ) -> Path:
175
+ from datashare_python.utils import artifacts_dir # noqa: PLC0415
176
+
177
+ project = self.index
178
+ match self.location:
179
+ case DocumentLocation.ORIGINAL:
180
+ return original_root / self.path
181
+ case DocumentLocation.ARTIFACTS:
182
+ return artifacts_root / artifacts_dir(self.id, project=project) / "raw"
183
+ case DocumentLocation.WORKDIR:
184
+ return workdir / self.path
185
+ case _:
186
+ raise ValueError(f"invalid location: {self.path}")
187
+
140
188
 
141
189
  class Document(DatashareModel):
142
190
  id: str
143
- root_document: str
144
191
  language: str
192
+ index: str | None = None
193
+ root_document: str | None = None
145
194
  content: str | None = None
146
195
  content_type: str | None = None
147
196
  path: Path | None = None
@@ -149,6 +198,7 @@ class Document(DatashareModel):
149
198
  content_translated: dict[str, str] = Field(
150
199
  default_factory=dict, alias="content_translated"
151
200
  )
201
+ metadata: dict[str, Any] | None = None
152
202
  type: str = Field(default="Document", frozen=True)
153
203
 
154
204
  @classmethod
@@ -156,11 +206,42 @@ class Document(DatashareModel):
156
206
  sources = es_doc[SOURCE]
157
207
  return cls(
158
208
  id=es_doc[ID_],
159
- content=sources[DOC_CONTENT],
160
- content_translated=sources.get("content_translated", dict()),
209
+ index=es_doc.get(INDEX_),
210
+ content=sources.get(DOC_CONTENT),
211
+ content_translated=sources.get(DOC_CONTENT_TRANSLATED, dict()),
161
212
  language=sources[DOC_LANGUAGE],
162
- root_document=sources[DOC_ROOT_ID],
213
+ root_document=sources.get(DOC_ROOT_ID),
163
214
  tags=sources.get("tags", []),
215
+ path=sources.get(DOC_PATH),
216
+ metadata=sources.get(DOC_METADATA),
217
+ )
218
+
219
+ def to_filesystem(self) -> FilesystemDocument:
220
+ from .utils import artifacts_dir # noqa: PLC0415
221
+
222
+ if self.metadata is None:
223
+ raise ValueError(
224
+ "can't compute filesyste path for document withtout metadata"
225
+ )
226
+ resource_name = cast(str, self.metadata[TIKA_METADATA_RESOURCENAME])
227
+ if self.root_document is None:
228
+ path = self.path
229
+ location = DocumentLocation.ORIGINAL
230
+ else:
231
+ if self.index is None:
232
+ msg = (
233
+ f"can't compute filesystem path for embedded doc {self.id} without"
234
+ f" index"
235
+ )
236
+ raise ValueError(msg)
237
+ path = artifacts_dir(doc_id=self.id, project=self.index) / "raw"
238
+ location = DocumentLocation.ARTIFACTS
239
+ return FilesystemDocument(
240
+ id=self.id,
241
+ path=path,
242
+ index=self.index,
243
+ location=location,
244
+ resource_name=resource_name,
164
245
  )
165
246
 
166
247
 
datashare_python/utils.py CHANGED
@@ -440,12 +440,12 @@ def safe_dir(doc_id: str) -> Path:
440
440
  return Path(*parts)
441
441
 
442
442
 
443
- def _artifacts_dir(doc_id: str, *, project: str) -> Path:
443
+ def artifacts_dir(doc_id: str, *, project: str) -> Path:
444
444
  return Path(project, safe_dir(doc_id), doc_id)
445
445
 
446
446
 
447
447
  def _metadata_path(doc_id: str, *, project: str) -> Path:
448
- metadata_path = _artifacts_dir(doc_id, project=project) / METADATA_JSON
448
+ metadata_path = artifacts_dir(doc_id, project=project) / METADATA_JSON
449
449
  return metadata_path
450
450
 
451
451
 
@@ -455,7 +455,7 @@ def _read_artifact_metadata(root: Path, artifact: DocArtifact) -> dict:
455
455
 
456
456
 
457
457
  def write_artifact(root: Path, artifact: DocArtifact) -> Path:
458
- artif_dir = root / _artifacts_dir(artifact.doc_id, project=artifact.project)
458
+ artif_dir = root / artifacts_dir(artifact.doc_id, project=artifact.project)
459
459
  artif_dir.mkdir(exist_ok=True, parents=True)
460
460
  # TODO: if transcriptions are too large we could also serialize them
461
461
  # as jsonl
@@ -479,17 +479,23 @@ def debuggable_name(
479
479
  displayable_file_name = [c[:component_size_limit] for c in path.parts]
480
480
  uuid = sha256(str(path).encode()).hexdigest() if deterministic else uuid4().hex
481
481
  uuid = uuid[:20]
482
- return f"{uuid}-{'__'.join(displayable_file_name)}"
482
+ return f"{uuid}-{'--'.join(displayable_file_name)}"
483
483
 
484
484
 
485
- def activity_contextual_id(*, wf_context: bool = False) -> str:
485
+ def activity_contextual_id(
486
+ *, wf_context: bool = True, act_context: bool = False, run_context: bool = False
487
+ ) -> str:
488
+ contextual_id = []
486
489
  act_info = activity.info()
487
- act_id = act_info.activity_id
488
- act_run_id = act_info.activity_id
489
- act_type = act_info.activity_type
490
- contextual_id = f"{act_type}-{act_id}-{act_run_id}"
490
+ if not wf_context and not act_context:
491
+ raise ValueError("at least one of wf_context and act_context must be True")
491
492
  if wf_context:
492
- wf_id = act_info.workflow_id
493
- wf_run_id = act_info.workflow_run_id
494
- contextual_id += f"-{wf_id}-{wf_run_id}"
495
- return contextual_id
493
+ contextual_id.append(act_info.workflow_id)
494
+ if run_context:
495
+ contextual_id.append(act_info.workflow_run_id)
496
+ if act_context:
497
+ contextual_id.append(act_info.activity_type)
498
+ contextual_id.append(act_info.activity_id)
499
+ if run_context:
500
+ contextual_id.append(act_info.activity_run_id)
501
+ return "-".join(contextual_id)
Binary file
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datashare-python
3
- Version: 0.6.0
3
+ Version: 0.6.2
4
4
  Summary: Manage Python tasks and local resources in Datashare
5
5
  Project-URL: Homepage, https://icij.github.io/datashare-python/
6
6
  Project-URL: Documentation, https://icij.github.io/datashare-python/
@@ -1,25 +1,25 @@
1
1
  datashare_python/.gitignore,sha256=e-SRgnvGGdsjRrqgKsTzALz6Obx8IYiOjr0yaAxT6v8,22
2
2
  datashare_python/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
3
  datashare_python/__main__.py,sha256=g-fvS46zl9umKmGrSpl-OG-8PSuZgjqvTCqjpsZtSps,101
4
- datashare_python/config.py,sha256=p-uBTle30kbUdqrj8rXcYv2gHiNwtqcYMnGi4Kctumk,3683
5
- datashare_python/conftest.py,sha256=JfVoi2MXx5XVAyQV4XxRQ6bHh-26vj56Wtjn-Zrgefk,8484
6
- datashare_python/constants.py,sha256=e6Px11OUee9GSHwTgsgFMszGCMwpW-OznHSMgINvepc,338
4
+ datashare_python/config.py,sha256=Mn43zAqUH8bmVpPQ4DK42ffg5LmRtC7ZHHgBlIINlD4,3814
5
+ datashare_python/conftest.py,sha256=zUEB9d3CcYVVfsuEXkvF1G5JCEchTknB6pFUiM8FbXA,8512
6
+ datashare_python/constants.py,sha256=a8-ceZKBVMXydcoNQ35fSjFjxeJ7dt-N6eAvqtPpf9g,320
7
7
  datashare_python/dependencies.py,sha256=4UsVFKRjd2Q0ghg_fUU24P26tFYhg_SnAENj2mKErrY,4060
8
8
  datashare_python/discovery.py,sha256=UsfIb_pL56BQ5i5xvfuhOO0bDefjA_oToxrCeJQQbcU,6925
9
9
  datashare_python/exceptions.py,sha256=bVHEAXxDPKfxeeMC0hJXEsrJkgsKO2ESAhxWU96GA4M,496
10
- datashare_python/objects.py,sha256=hTtkmuQIlQycJsBFgK3wU9emrIXzWmc3sOuEIGpniNI,4581
10
+ datashare_python/objects.py,sha256=mz025_6DogMMXSk5PqEctp0slvY4EKkuMDm80Kpm5_A,7020
11
11
  datashare_python/task_client.py,sha256=oTmP8bvZW0UyhLNMi1AV3XIAx7hrdbxNRss2Mw2azEc,8435
12
12
  datashare_python/template.py,sha256=RxKTYLXoS_EQ8Jc41JkBXppPdbCFqDWfP3BmC0gvB5o,4024
13
13
  datashare_python/types_.py,sha256=9Hk1XqpdXbM1TnEzwvJ5G9ABbaCZW9KgBTtiPBVn_7k,649
14
- datashare_python/utils.py,sha256=qnBrK-oefv6OiL9XUBImsuTUdpwgi2lE97kVv4M5X94,16974
15
- datashare_python/worker-template.tar.gz,sha256=cH5LwVFfdhzooE81PtC2sL6NFThLS2274Wt63XRv5TE,274869
14
+ datashare_python/utils.py,sha256=ZGZKO9Q4_aLVVilZUCkmHQ21M_37hVOCr7G-qZPOflU,17234
15
+ datashare_python/worker-template.tar.gz,sha256=f_9jyCcZDTgQvzl7K3vD5Ved9MaHXUV9Ox_uZWzub10,274849
16
16
  datashare_python/worker.py,sha256=UdSCWZw8qhkhzok89oU7J35VTDZwxRAqef-5Z8yt95A,6333
17
17
  datashare_python/cli/__init__.py,sha256=9BPWtssDgsVfWMsZ1TtZCla0EC_kai4RHttr8oNLYOE,1401
18
18
  datashare_python/cli/project.py,sha256=w32Gy9AOL5B00uDT4in7YUCt2g68FnNbvwg2M3a8G6o,946
19
- datashare_python/cli/task.py,sha256=9If5OC7loG4C4gWWl4iOeqPJ4GOLlCWXQfuNLUHORrQ,5860
19
+ datashare_python/cli/task.py,sha256=8mvKGS21bZ14BgZ0Uo-dfameljkaI2ZBha80ywCy-E8,5822
20
20
  datashare_python/cli/utils.py,sha256=p69CQb0zfixuyBkiZprhdMCc_NuYwXyAn6vC9H1UzAw,911
21
21
  datashare_python/cli/worker.py,sha256=I4KTpFIpXFowioFn72Rm6LBCYlY-Dhp4NBIPvtRgUXE,5283
22
- datashare_python-0.6.0.dist-info/METADATA,sha256=JIEU1lFTYPVZA9chJpchkMc5ZalduhrI89v2uT8ZwIk,907
23
- datashare_python-0.6.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
24
- datashare_python-0.6.0.dist-info/entry_points.txt,sha256=ILE7auxabHWiu3GC-AunWnzjhOI_SbZp7D4GqZHlLw4,68
25
- datashare_python-0.6.0.dist-info/RECORD,,
22
+ datashare_python-0.6.2.dist-info/METADATA,sha256=-tF3-ZSn0AA_xOJe-25-cJUnE-am9-yqDBRZqeNWbFs,907
23
+ datashare_python-0.6.2.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
24
+ datashare_python-0.6.2.dist-info/entry_points.txt,sha256=ILE7auxabHWiu3GC-AunWnzjhOI_SbZp7D4GqZHlLw4,68
25
+ datashare_python-0.6.2.dist-info/RECORD,,