datashare-extract-worker 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
.idea/
# Python
*.log
venv
*.egg-info
.eggs
__pycache__
*.pytest_cache
*.pyc
build
dist
test/.env
.cache
tmp
# CSV files at the repository root only. A leading "/" anchors the pattern to
# the directory holding this file; a "./" prefix is not understood by git and
# makes the pattern match nothing.
/*.csv
.DS_Store

# VS code
.vscode
# Doc
site
@@ -0,0 +1,114 @@
1
# NOTE(review): this stage is not referenced by any later stage in this file.
FROM python:3.13-slim-trixie AS builder


# Shared base for every worker image: Python, uv settings and the uv binaries.
FROM python:3.13-slim-trixie AS extract-worker-builder

ENV PYTHONUNBUFFERED=1 \
    UV_HTTP_TIMEOUT=300 \
    UV_LINK_MODE=copy \
    UV_COMPILE_BYTECODE=1 \
    UV_NO_DEV=1

# Pull the pinned uv binaries from the official uv image
COPY --from=ghcr.io/astral-sh/uv:0.11.6 /uv /uvx /bin/

WORKDIR /app
15
+
16
+
17
FROM extract-worker-builder AS io-worker
# Install deps first to optimize layer cache.
# The cache mount target is an absolute path: BuildKit does not apply shell
# "~" expansion to mount targets, so "~/.cache/uv" would not cover uv's cache
# directory (assumes the build runs as root; no USER is set in this file).
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.dist.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync -v --frozen --no-editable --no-install-project

# Then copy code
ADD uv.dist.lock ./uv.lock
ADD pyproject.toml README.md ./
ADD extract_worker ./extract_worker/
ADD entrypoints/io_worker.sh ./entrypoints/io_worker.sh

# Then install service
RUN --mount=type=cache,target=/root/.cache/uv uv sync -v --frozen --no-editable
RUN rm -rf ~/.cache/pip

ENTRYPOINT ["entrypoints/io_worker.sh"]
35
+
36
+
37
FROM extract-worker-builder AS extract-cpu-worker
# Install deps first to optimize layer cache.
# The cache mount target is an absolute path: BuildKit does not apply shell
# "~" expansion to mount targets, so "~/.cache/uv" would not cover uv's cache
# directory (assumes the build runs as root; no USER is set in this file).
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.dist.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync -v --frozen --no-editable --no-install-project --extra base --extra cpu

# Then copy code
ADD uv.dist.lock ./uv.lock
ADD pyproject.toml README.md ./
ADD extract_worker ./extract_worker/
ADD entrypoints/extract_cpu_worker.sh ./entrypoints/extract_cpu_worker.sh

# Then install service
RUN --mount=type=cache,target=/root/.cache/uv uv sync -v --frozen --no-editable --extra base --extra cpu
RUN rm -rf ~/.cache/pip

ENTRYPOINT ["entrypoints/extract_cpu_worker.sh"]
55
+
56
+
57
FROM extract-worker-builder AS extract-gpu-worker
# Install deps first to optimize layer cache.
# The cache mount target is an absolute path: BuildKit does not apply shell
# "~" expansion to mount targets, so "~/.cache/uv" would not cover uv's cache
# directory (assumes the build runs as root; no USER is set in this file).
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.dist.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync -v --frozen --no-editable --no-install-project --extra base --extra gpu

# Then copy code
ADD uv.dist.lock ./uv.lock
ADD pyproject.toml README.md ./
ADD extract_worker ./extract_worker/
ADD entrypoints/extract_gpu_worker.sh ./entrypoints/extract_gpu_worker.sh

# Then install service
RUN --mount=type=cache,target=/root/.cache/uv uv sync -v --frozen --no-editable --extra base --extra gpu
RUN rm -rf ~/.cache/pip

ENTRYPOINT ["entrypoints/extract_gpu_worker.sh"]
75
+
76
+
77
FROM extract-worker-builder AS extract-cpu-mineru-worker
# Install deps first to optimize layer cache.
# The cache mount target is an absolute path: BuildKit does not apply shell
# "~" expansion to mount targets, so "~/.cache/uv" would not cover uv's cache
# directory (assumes the build runs as root; no USER is set in this file).
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.dist.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync -v --frozen --no-editable --no-install-project --extra mineru --extra cpu

# Then copy code
ADD uv.dist.lock ./uv.lock
ADD pyproject.toml README.md ./
ADD extract_worker ./extract_worker/
ADD entrypoints/extract_cpu_worker.sh ./entrypoints/extract_cpu_worker.sh

# Then install service
RUN --mount=type=cache,target=/root/.cache/uv uv sync -v --frozen --no-editable --extra mineru --extra cpu
RUN rm -rf ~/.cache/pip

ENTRYPOINT ["entrypoints/extract_cpu_worker.sh"]
95
+
96
+
97
FROM extract-worker-builder AS extract-gpu-mineru-worker
# Install deps first to optimize layer cache.
# The cache mount target is an absolute path: BuildKit does not apply shell
# "~" expansion to mount targets, so "~/.cache/uv" would not cover uv's cache
# directory (assumes the build runs as root; no USER is set in this file).
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.dist.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync -v --frozen --no-editable --no-install-project --extra mineru --extra gpu

# Then copy code
ADD uv.dist.lock ./uv.lock
ADD pyproject.toml README.md ./
ADD extract_worker ./extract_worker/
ADD entrypoints/extract_gpu_worker.sh ./entrypoints/extract_gpu_worker.sh

# Then install service
RUN --mount=type=cache,target=/root/.cache/uv uv sync -v --frozen --no-editable --extra mineru --extra gpu
RUN rm -rf ~/.cache/pip

ENTRYPOINT ["entrypoints/extract_gpu_worker.sh"]
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: datashare-extract-worker
3
+ Version: 0.1.3
4
+ Author-email: Clément Doumouro <cdoumouro@icij.org>, Clément Doumouro <clement.doumouro@gmail.com>, Lion Summerbell <lsummerbell@icij.org>
5
+ Requires-Python: <3.14,>=3.11
6
+ Requires-Dist: datashare-python~=0.8.20
7
+ Requires-Dist: extract-python==0.4.2
8
+ Requires-Dist: temporalio==1.23.0
9
+ Provides-Extra: base
10
+ Requires-Dist: extract-python[docling,marker]==0.4.2; extra == 'base'
11
+ Provides-Extra: cpu
12
+ Requires-Dist: torch==2.11.0; extra == 'cpu'
13
+ Requires-Dist: torchvision==0.26.0; extra == 'cpu'
14
+ Provides-Extra: gpu
15
+ Requires-Dist: cuda-bindings==12.9.4; (sys_platform == 'linux') and extra == 'gpu'
16
+ Requires-Dist: torch==2.11.0+cu129; (sys_platform == 'linux') and extra == 'gpu'
17
+ Requires-Dist: torchvision==0.26.0+cu129; (sys_platform == 'linux') and extra == 'gpu'
18
+ Provides-Extra: mineru
19
+ Requires-Dist: extract-python[mineru]==0.4.2; extra == 'mineru'
File without changes
@@ -0,0 +1,7 @@
1
#!/bin/bash
set -e

# Start the worker serving the extract.cpu queue.
# `exec` replaces this shell with the worker process, so signals sent to the
# container entrypoint (e.g. SIGTERM on stop) reach the worker directly.
exec uv run --no-sync datashare-python worker start \
    --dependencies extract.extract \
    --queue extract.cpu \
    --activity extract.extract-markdown-content
@@ -0,0 +1,7 @@
1
#!/bin/bash
set -e

# Start the worker serving the extract.gpu queue.
# `exec` replaces this shell with the worker process, so signals sent to the
# container entrypoint (e.g. SIGTERM on stop) reach the worker directly.
exec uv run --no-sync datashare-python worker start \
    --dependencies extract.extract \
    --queue extract.gpu \
    --activity extract.extract-markdown-content
@@ -0,0 +1,7 @@
1
#!/bin/bash
set -e

# Start the worker serving the extract.io queue.
# `exec` replaces this shell with the worker process, so signals sent to the
# container entrypoint (e.g. SIGTERM on stop) reach the worker directly.
exec uv run --no-sync datashare-python worker start \
    --dependencies extract.io \
    --queue extract.io \
    --activity extract.create-markdown-batches
@@ -0,0 +1,325 @@
1
+ import contextlib
2
+ import logging
3
+ import mimetypes
4
+ import os
5
+ from collections.abc import AsyncIterable
6
+ from functools import cache
7
+ from itertools import chain
8
+ from pathlib import Path
9
+ from typing import Any, cast
10
+
11
+ from datashare_python.dependencies import lifespan_es_client, lifespan_worker_config
12
+ from datashare_python.objects import DocArtifact, Document, DocumentLocation
13
+ from datashare_python.utils import (
14
+ ActivityWithProgress,
15
+ activity_defn,
16
+ activity_workdir,
17
+ read_jsonl,
18
+ write_artifact,
19
+ )
20
+ from extract_python import Pipeline
21
+ from extract_python.objects import InputDoc, OutputFormat, SupportedExt
22
+ from icij_common.es import (
23
+ DOC_CONTENT_TYPE,
24
+ DOC_LANGUAGE,
25
+ DOC_METADATA,
26
+ DOC_PATH,
27
+ DOC_ROOT_ID,
28
+ ES_DOCUMENT_TYPE,
29
+ HITS,
30
+ QUERY,
31
+ SOURCE,
32
+ ESClient,
33
+ ESSort,
34
+ and_query,
35
+ has_id,
36
+ has_type,
37
+ )
38
+ from pydantic import TypeAdapter
39
+
40
+ from .config import ExtractWorkerConfig
41
+ from .constants import MARKDOWN_DIRNAME, MARKDOWN_METADATA_KEY
42
+ from .objects import (
43
+ DocId,
44
+ DocumentSearchQuery,
45
+ ErrorReport,
46
+ MarkdownExtractResponse,
47
+ PipelineConfig,
48
+ ProcessedDoc,
49
+ ProcessingReport,
50
+ )
51
+
52
+ logger = logging.getLogger(__name__)
53
+
54
+ mimetypes.init()
55
+
56
+
57
class MarkdownExtract(ActivityWithProgress):
    """Activities for worker config lookup, batch creation and markdown extraction."""

    @activity_defn(name="extract.worker_config")
    async def extract_worker_config(self) -> ExtractWorkerConfig:
        """Return the worker configuration provided by the worker lifespan."""
        return cast(ExtractWorkerConfig, lifespan_worker_config())

    @activity_defn(name="extract.create-markdown-batches")
    async def create_markdown_extract_batches(
        self,
        project: str,
        docs: list[DocId] | DocumentSearchQuery | None,
        config: PipelineConfig,
    ) -> list[Path]:
        """Write the batch files for ``project`` and return their paths.

        The returned paths are relative to the worker ``workdir``.
        """
        es_client = lifespan_es_client()
        worker_config = cast(ExtractWorkerConfig, lifespan_worker_config())
        workdir = worker_config.workdir
        artifacts_root = worker_config.artifacts_root
        output_dir = activity_workdir(workdir, project)
        output_dir.mkdir(parents=True, exist_ok=True)
        written_batches = create_markdown_extract_batches_act(
            docs,
            project,
            config.supported_exts(),
            artifacts_root=artifacts_root,
            workdir=workdir,
            output_dir=output_dir,
            target_n_pages_per_batch=worker_config.markdown.target_n_pages_per_batch,
            es_client=es_client,
        )
        relative_paths = []
        async for batch_path in written_batches:
            relative_paths.append(batch_path.relative_to(workdir))
        return relative_paths

    @activity_defn(name="extract.extract-markdown-content")
    async def extract_markdown_content(
        self, batch: Path, project: str, config: PipelineConfig
    ) -> MarkdownExtractResponse:
        """Run the pipeline built from ``config`` on one batch file.

        ``batch`` is joined onto the worker ``workdir`` before being read.
        """
        pipeline = Pipeline.from_config(config)
        worker_config = cast(ExtractWorkerConfig, lifespan_worker_config())
        workdir = worker_config.workdir
        output_dir = activity_workdir(workdir, project)
        output_dir.mkdir(parents=True, exist_ok=True)
        batch_path = workdir / batch
        response = await extract_markdown_content_act(
            pipeline,
            batch_path,
            worker_config=worker_config,
            output_dir=output_dir,
        )
        return response
109
+
110
+
111
+ # Sort documents aiming for consistent processing type in a batch
112
+ _DOC_SORT = [f"{DOC_CONTENT_TYPE}:asc", f"{DOC_LANGUAGE}:asc", "_doc:asc"]
113
+ _DOC_CONTENT_SOURCES = [DOC_PATH, DOC_ROOT_ID, DOC_LANGUAGE, DOC_METADATA]
114
+
115
+
116
async def create_markdown_extract_batches_act(
    docs: list[DocId] | DocumentSearchQuery | None,
    project: str,
    supported_exts: set[SupportedExt],
    *,
    artifacts_root: Path,
    workdir: Path,
    output_dir: Path,
    target_n_pages_per_batch: int,
    es_client: ESClient | None = None,
) -> AsyncIterable[Path]:
    """Search the project's documents, batch them by pages and write the batches.

    Yields the path of each batch file written under ``output_dir``.
    """
    # TODO: supported content types should be args
    es_query = _build_doc_query(docs, supported_exts)
    found_docs = _search_docs(es_client, project, es_query, sort=_DOC_SORT)
    # Documents located in the artifacts are exposed through workdir symlinks
    localized_docs = (
        _symlink_embedded_processed_doc_to_workdir(
            doc, artifacts_root, workdir=workdir
        )
        async for doc in found_docs
    )
    page_batches = _batch_by_n_pages(
        localized_docs, target_n_pages_per_batch=target_n_pages_per_batch
    )
    async for batch_path in _write_batches(page_batches, output_dir):
        yield batch_path
136
+
137
+
138
+ _BatchTypeAdapter = TypeAdapter(list[ProcessedDoc])
139
+
140
+
141
async def extract_markdown_content_act(
    pipeline: Pipeline,
    batch: Path,
    *,
    worker_config: ExtractWorkerConfig,
    output_dir: Path,
) -> MarkdownExtractResponse:
    """Extract markdown for every document of a batch file and report the outcome.

    Each successful extraction is registered as a document artifact; each failed
    one is collected as an ``ErrorReport``.
    """
    batch_docs = _BatchTypeAdapter.validate_python(list(read_jsonl(batch)))
    docs_root = worker_config.docs_root
    artifacts_root = worker_config.artifacts_root
    workdir = worker_config.workdir

    def _as_input(doc: ProcessedDoc) -> InputDoc:
        located = doc.locate(
            original_root=docs_root, artifacts_root=artifacts_root, workdir=workdir
        )
        return InputDoc.from_path(located)

    results = pipeline.extract_content(
        map(_as_input, batch_docs),
        output_format=OutputFormat.MARKDOWN,
        output_path=output_dir,
    )
    # Results are paired with documents by position
    # NOTE(review): assumes the pipeline yields one result per input, in input
    # order - confirm against extract_python
    remaining_docs = iter(batch_docs)
    seen_docs = seen_pages = ok_docs = ok_pages = 0
    failures = []
    async for result in results:
        doc = next(remaining_docs)
        seen_docs += 1
        seen_pages += doc.n_pages
        if result.errors:
            failures.append(
                ErrorReport(doc=doc, status=result.status, errors=result.errors)
            )
            continue
        ok_docs += 1
        ok_pages += doc.n_pages
        markdown_artifact = DocArtifact(
            project=doc.index,
            doc_id=doc.id,
            artifact=output_dir / result.output.path,
            metadata_key=MARKDOWN_METADATA_KEY,
            filename=MARKDOWN_DIRNAME,
        )
        write_artifact(artifacts_root, markdown_artifact)
    return MarkdownExtractResponse(
        processed=ProcessingReport(n_docs=seen_docs, n_pages=seen_pages),
        successes=ProcessingReport(n_docs=ok_docs, n_pages=ok_pages),
        errors=failures,
    )
193
+
194
+
195
def _with_supported_exts_query(supported_exts: set[SupportedExt]) -> dict[str, Any]:
    """Build the query body matching documents whose content type is supported."""
    mime_types = sorted(
        chain.from_iterable(ext_to_mime_types(ext) for ext in supported_exts)
    )
    content_type_clause = {"terms": {DOC_CONTENT_TYPE: mime_types}}
    document_clause = has_type(type_field="type", type_value=ES_DOCUMENT_TYPE)
    # Callers only need the inner query body, not the {QUERY: ...} envelope
    return and_query(content_type_clause, document_clause)[QUERY]
202
+
203
+
204
def _build_doc_query(
    docs: list[DocId] | DocumentSearchQuery | None, supported_exts: set[SupportedExt]
) -> dict[str, Any]:
    """Combine the supported-format filter with the caller's document selection.

    ``docs`` may be ``None`` or an empty dict (format filter only), a non-empty
    query dict, or a list of document ids.

    Raises:
        ValueError: if ``docs`` is of any other type.
    """
    format_query = _with_supported_exts_query(supported_exts)
    if docs is None:
        return {QUERY: format_query}
    if isinstance(docs, dict):
        if docs:
            return and_query(format_query, docs)
        return {QUERY: format_query}
    if isinstance(docs, list):
        return and_query(format_query, has_id(docs))
    raise ValueError(f"unsupported format {docs.__class__.__name__}")
219
+
220
+
221
async def _search_docs(
    es_client: ESClient, project: str, query: dict[str, Any], sort: ESSort = None
) -> AsyncIterable[ProcessedDoc]:
    """Yield a ``ProcessedDoc`` for every hit of ``query`` in the project index."""
    result_pages = es_client.poll_search_pages(
        index=project,
        body=query,
        sort=sort,
        _source_includes=_DOC_CONTENT_SOURCES,
    )
    async for page in result_pages:
        for hit in page[HITS][HITS]:
            metadata = hit[SOURCE].get(DOC_METADATA)
            # The page count is optional: None lets from_fs_doc pick its default
            if metadata is None:
                n_pages = None
            else:
                n_pages = metadata.get("tika_metadata_xmptpg_npages")
            fs_doc = Document.from_es(hit).to_filesystem()
            yield ProcessedDoc.from_fs_doc(fs_doc, n_pages=n_pages)
238
+
239
+
240
async def _batch_by_n_pages(
    docs: AsyncIterable["ProcessedDoc"], target_n_pages_per_batch: int
) -> AsyncIterable[list["ProcessedDoc"]]:
    """Group documents into batches of roughly ``target_n_pages_per_batch`` pages.

    Documents are never split: a batch is closed when the next document arrives
    and the batch has already reached the target, so a batch may exceed it.
    Batches are never empty, even when the target is zero or negative (each
    document then gets its own batch).

    Args:
        docs: documents to batch, consumed lazily and in order.
        target_n_pages_per_batch: page count at which a batch is closed.

    Yields:
        Non-empty lists of documents, in input order.
    """
    current_n_pages = 0
    current_batch = []
    async for d in docs:
        # Require a non-empty batch: otherwise a target <= 0 would emit an
        # empty batch before the very first document
        if current_batch and current_n_pages >= target_n_pages_per_batch:
            yield current_batch
            current_n_pages = 0
            current_batch = []
        current_batch.append(d)
        current_n_pages += d.n_pages
    if current_batch:
        yield current_batch
254
+
255
+
256
async def _write_batches(
    batches: AsyncIterable[list["ProcessedDoc"]], root: Path
) -> AsyncIterable[Path]:
    """Write each batch as a JSONL file under ``root`` and yield the file path.

    Files are named ``0.jsonl``, ``1.jsonl``, ... in batch order, one JSON
    document per line.

    Args:
        batches: batches of documents to serialize.
        root: existing directory receiving the batch files.

    Yields:
        The path of each batch file, once fully written and closed.
    """
    batch_id = 0
    async for batch in batches:
        batch_path = root / f"{batch_id}.jsonl"
        # Pin UTF-8 so the file content does not depend on the worker's locale
        with batch_path.open("w", encoding="utf-8") as f:
            f.writelines(f"{fs_doc.model_dump_json()}\n" for fs_doc in batch)
        yield batch_path
        batch_id += 1
267
+
268
+
269
def _symlink_embedded_processed_doc_to_workdir(
    doc: ProcessedDoc, artifacts_root: Path, *, workdir: Path
) -> ProcessedDoc:
    """Expose an artifact-located document through a symlink inside ``workdir``.

    Documents at their original location are returned unchanged. Artifact
    documents are returned as a new ``ProcessedDoc`` located in the workdir,
    whose path is the symlink path relative to ``workdir``.

    Raises:
        ValueError: if the document location is neither ORIGINAL nor ARTIFACTS.
    """
    if doc.location == DocumentLocation.ARTIFACTS:
        symlinks_dir = workdir / doc.index / "symlinks"
        symlinks_dir.mkdir(parents=True, exist_ok=True)
        # Replace the "raw" with the doc id
        renamed = Path(*doc.path.parts[:-1], doc.id)
        doc_ext = Path(doc.resource_name).suffix
        inside_index = renamed.relative_to(Path(doc.index))
        link_path = symlinks_dir / f"{inside_index}{doc_ext}"
        link_path.parent.mkdir(parents=True, exist_ok=True)
        link_target = artifacts_root / doc.path
        try:
            os.symlink(link_target, link_path)
        except FileExistsError:
            # An entry already exists at that path: it is kept as is
            pass
        return ProcessedDoc(
            path=link_path.relative_to(workdir),
            id=doc.id,
            location=DocumentLocation.WORKDIR,
            index=doc.index,
            resource_name=doc.resource_name,
            n_pages=doc.n_pages,
        )
    if doc.location == DocumentLocation.ORIGINAL:
        return doc
    raise ValueError(f"unsupported location {doc.location}")
297
+
298
+
299
@cache
def ext_to_mime_types(ext: SupportedExt) -> set[str]:
    """Return the MIME types associated with a supported extension.

    The result is memoized, so the same set object is returned on every call
    for a given extension: callers must not mutate it.

    Raises:
        ValueError: if ``mimetypes.types_map`` has no entry for ``ext``.
    """
    # Particular cases: extensions aliased to another one or mapped by hand
    if ext == SupportedExt.NXML:
        return ext_to_mime_types(SupportedExt.XML)
    if ext in (SupportedExt.ADOC, SupportedExt.ASCIIDOC):
        return {"text/x-asciidoc"}
    if ext in (SupportedExt.QMD, SupportedExt.RMD):
        return ext_to_mime_types(SupportedExt.MD)
    if ext == SupportedExt.XBRL:
        # NOTE(review): "HTLM" looks like a typo for "HTML" - confirm the
        # member name declared by extract_python's SupportedExt
        return ext_to_mime_types(SupportedExt.HTLM)
    # General case: types_map entry, plus the common_types entry if there is one
    try:
        registered = mimetypes.types_map[ext]
    except KeyError as e:
        raise ValueError(f"unsupported mimetype {ext}") from e
    mime_types = {registered}
    common = mimetypes.common_types.get(ext)
    if common is not None:
        mime_types.add(common)
    return mime_types
319
+
320
+
321
+ ACTIVITIES = [
322
+ MarkdownExtract.extract_worker_config,
323
+ MarkdownExtract.create_markdown_extract_batches,
324
+ MarkdownExtract.extract_markdown_content,
325
+ ]
@@ -0,0 +1,18 @@
1
+ from datashare_python.config import WorkerConfig
2
+ from datashare_python.objects import DatashareModel
3
+ from pydantic import Field
4
+
5
+ from .constants import TorchDevice
6
+
7
+
8
# Settings of the markdown extraction activities.
class MarkdownExtractConfig(DatashareModel):
    # Number of pages aimed at for each batch of documents
    target_n_pages_per_batch: int = 100
10
+
11
+
12
# Worker configuration extended with the extraction-specific settings.
class ExtractWorkerConfig(WorkerConfig):
    # Torch device of the worker, CPU by default; declared as a frozen field
    device: TorchDevice = Field(default=TorchDevice.CPU, frozen=True)

    # Markdown extraction settings, defaults applied when not provided
    markdown: MarkdownExtractConfig = Field(default_factory=MarkdownExtractConfig)
16
+
17
+
18
+ WORKER_CONFIG_CLS = ExtractWorkerConfig
@@ -0,0 +1,34 @@
1
+ from enum import StrEnum
2
+
3
+ from extract_python import PipelineType
4
+
5
+ MARKDOWN_METADATA_KEY = "extract.markdown"
6
+ MARKDOWN_DIRNAME = "markdown"
7
+
8
+
9
+ class TaskQueue(StrEnum):
10
+ WORKFLOWS = "datashare.workflows"
11
+ IO = "extract.io"
12
+ EXTRACT_GPU_MINER_U = "extract.gpu.mineru"
13
+ EXTRACT_CPU_MINER_U = "extract.cpu.mineru"
14
+ EXTRACT_GPU = "extract.gpu"
15
+ EXTRACT_CPU = "extract.cpu"
16
+
17
+
18
class TorchDevice(StrEnum):
    """Torch devices a worker can run on."""

    CPU = "cpu"
    GPU = "cuda"

    def md_extract_queue(self, pipeline: PipelineType) -> TaskQueue:
        """Return the markdown extraction queue for this device and ``pipeline``.

        The MinerU pipeline has its own queue on each device; every other
        pipeline shares the device's generic queue.

        Raises:
            ValueError: if the device is neither CPU nor GPU.
        """
        uses_mineru = pipeline is PipelineType.MINER_U
        if self == TorchDevice.GPU:
            if uses_mineru:
                return TaskQueue.EXTRACT_GPU_MINER_U
            return TaskQueue.EXTRACT_GPU
        if self == TorchDevice.CPU:
            if uses_mineru:
                return TaskQueue.EXTRACT_CPU_MINER_U
            return TaskQueue.EXTRACT_CPU
        raise ValueError(f"unsupported TorchDevice {self}")
@@ -0,0 +1,11 @@
1
+ from datashare_python.dependencies import (
2
+ lifespan_es_client, # noqa: F401
3
+ set_es_client,
4
+ set_loggers,
5
+ set_worker_config,
6
+ )
7
+
8
+ IO = [set_worker_config, set_loggers, set_es_client]
9
+ EXTRACT = [set_worker_config, set_loggers]
10
+
11
+ DEPENDENCIES = {"extract.io": IO, "extract.extract": EXTRACT}
@@ -0,0 +1,68 @@
1
+ from typing import Annotated, Any, Self
2
+
3
+ from datashare_python.objects import DatashareModel, FilesystemDocument
4
+ from extract_python import (
5
+ DoclingPipelineConfig,
6
+ PipelineType,
7
+ Status,
8
+ )
9
+ from extract_python import (
10
+ PipelineConfig as ExtractPipelineConfig,
11
+ )
12
+ from extract_python.objects import Error
13
+ from icij_common.pydantic_utils import make_enum_discriminator, tagged_union
14
+ from pydantic import Discriminator, Field
15
+
16
+ DocumentSearchQuery = dict[str, Any]
17
+ DocId = str
18
+
19
+
20
+ pipeline_discriminator = make_enum_discriminator("pipeline", PipelineType)
21
+ PipelineConfig = Annotated[
22
+ tagged_union(ExtractPipelineConfig.__subclasses__(), lambda t: t.pipeline.default),
23
+ Discriminator(pipeline_discriminator),
24
+ ]
25
+
26
+
27
# Arguments of a markdown extraction request.
class MarkdownExtractArgs(DatashareModel):
    project: str
    # Document ids, a search query, or None
    docs: list[DocId] | DocumentSearchQuery | None
    # Pipeline configuration, a default DoclingPipelineConfig when not provided
    config: PipelineConfig = Field(default_factory=DoclingPipelineConfig)
31
+
32
+
33
# Counters of processed documents and pages; reports add up field by field.
class ProcessingReport(DatashareModel):
    n_docs: int = 0
    n_pages: int = 0

    def __add__(self, other: Self) -> Self:
        """Return a new report holding the field-wise sum of both reports.

        The result is built with the class of the left operand, as promised by
        the ``Self`` return annotation. Operands that are not reports are
        declined with ``NotImplemented`` so Python raises its usual TypeError.
        """
        if not isinstance(other, ProcessingReport):
            return NotImplemented
        return type(self)(
            n_docs=self.n_docs + other.n_docs,
            n_pages=self.n_pages + other.n_pages,
        )
41
+
42
+
43
# Filesystem document enriched with its number of pages.
class ProcessedDoc(FilesystemDocument):
    n_pages: int

    @classmethod
    def from_fs_doc(cls, fs_doc: FilesystemDocument, n_pages: int | None) -> Self:
        """Build a ``ProcessedDoc`` from ``fs_doc``; a missing page count is 1."""
        if n_pages is None:
            n_pages = 1
        fs_doc_fields = fs_doc.model_dump()
        return cls(n_pages=n_pages, **fs_doc_fields)
50
+
51
+
52
# Extraction failure of a single document.
class ErrorReport(DatashareModel):
    doc: ProcessedDoc
    status: Status
    # default_factory rather than a literal [], in line with
    # MarkdownExtractResponse.errors
    errors: list[Error] = Field(default_factory=list)
56
+
57
+
58
# Outcome of a markdown extraction: processed and successful counts plus errors.
class MarkdownExtractResponse(DatashareModel):
    processed: ProcessingReport = Field(default_factory=ProcessingReport)
    successes: ProcessingReport = Field(default_factory=ProcessingReport)
    errors: list[ErrorReport] = Field(default_factory=list)

    @classmethod
    def from_responses(cls, *responses: Self) -> Self:
        """Merge several responses into one.

        Counters are summed and error lists are concatenated in argument
        order. With no argument, the result holds zeroed counters and no error.
        """
        processed = sum((r.processed for r in responses), start=ProcessingReport())
        successes = sum((r.successes for r in responses), start=ProcessingReport())
        # Flatten in a single pass: sum(..., start=[]) copies the accumulated
        # list at every step
        errors = [error for r in responses for error in r.errors]
        return cls(processed=processed, successes=successes, errors=errors)