datashare-extract-worker 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datashare_extract_worker-0.1.3/.gitignore +21 -0
- datashare_extract_worker-0.1.3/Dockerfile +114 -0
- datashare_extract_worker-0.1.3/PKG-INFO +19 -0
- datashare_extract_worker-0.1.3/README.md +0 -0
- datashare_extract_worker-0.1.3/entrypoints/extract_cpu_worker.sh +7 -0
- datashare_extract_worker-0.1.3/entrypoints/extract_gpu_worker.sh +7 -0
- datashare_extract_worker-0.1.3/entrypoints/io_worker.sh +7 -0
- datashare_extract_worker-0.1.3/extract_worker/__init__.py +0 -0
- datashare_extract_worker-0.1.3/extract_worker/activities.py +325 -0
- datashare_extract_worker-0.1.3/extract_worker/config.py +18 -0
- datashare_extract_worker-0.1.3/extract_worker/constants.py +34 -0
- datashare_extract_worker-0.1.3/extract_worker/dependencies.py +11 -0
- datashare_extract_worker-0.1.3/extract_worker/objects.py +68 -0
- datashare_extract_worker-0.1.3/extract_worker/workflows.py +66 -0
- datashare_extract_worker-0.1.3/pyproject.toml +134 -0
- datashare_extract_worker-0.1.3/uv.dist.lock +4406 -0
- datashare_extract_worker-0.1.3/uv.lock +4434 -0
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# NOTE(review): none of the stages visible in this file build on or copy from
# `builder`; it is kept so that `--target builder` keeps working. Confirm
# whether it is still needed.
FROM python:3.13-slim-trixie AS builder


# Shared base for every worker image: Python runtime plus the uv binaries.
FROM python:3.13-slim-trixie AS extract-worker-builder

# Python / uv settings inherited by all the worker stages below.
ENV PYTHONUNBUFFERED=1 \
    UV_HTTP_TIMEOUT=300 \
    UV_LINK_MODE=copy \
    UV_COMPILE_BYTECODE=1 \
    UV_NO_DEV=1

# Take uv and uvx from the pinned upstream image instead of installing them.
COPY --from=ghcr.io/astral-sh/uv:0.11.6 /uv /uvx /bin/

WORKDIR /app
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
FROM extract-worker-builder AS io-worker

# Install third-party dependencies first so this layer stays cached until the
# lockfile or pyproject.toml changes.
# The cache mount target is an absolute path on purpose: mount options are not
# processed by a shell, so "~" is never expanded and "~/.cache/uv" would not
# point at uv's cache directory in root's home.
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.dist.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync -v --frozen --no-editable --no-install-project

# Then copy the code (COPY rather than ADD: these are plain local files).
COPY uv.dist.lock ./uv.lock
COPY pyproject.toml README.md ./
COPY extract_worker ./extract_worker/
COPY entrypoints/io_worker.sh ./entrypoints/io_worker.sh

# Then install the service itself. The uv cache lives on the cache mount, so
# nothing has to be cleaned up in a later layer.
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync -v --frozen --no-editable

ENTRYPOINT ["entrypoints/io_worker.sh"]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
FROM extract-worker-builder AS extract-cpu-worker

# Install third-party dependencies first so this layer stays cached until the
# lockfile or pyproject.toml changes.
# The cache mount target is an absolute path on purpose: mount options are not
# processed by a shell, so "~" is never expanded and "~/.cache/uv" would not
# point at uv's cache directory in root's home.
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.dist.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync -v --frozen --no-editable --no-install-project --extra base --extra cpu

# Then copy the code (COPY rather than ADD: these are plain local files).
COPY uv.dist.lock ./uv.lock
COPY pyproject.toml README.md ./
COPY extract_worker ./extract_worker/
COPY entrypoints/extract_cpu_worker.sh ./entrypoints/extract_cpu_worker.sh

# Then install the service itself. The uv cache lives on the cache mount, so
# nothing has to be cleaned up in a later layer.
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync -v --frozen --no-editable --extra base --extra cpu

ENTRYPOINT ["entrypoints/extract_cpu_worker.sh"]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
FROM extract-worker-builder AS extract-gpu-worker

# Install third-party dependencies first so this layer stays cached until the
# lockfile or pyproject.toml changes.
# The cache mount target is an absolute path on purpose: mount options are not
# processed by a shell, so "~" is never expanded and "~/.cache/uv" would not
# point at uv's cache directory in root's home.
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.dist.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync -v --frozen --no-editable --no-install-project --extra base --extra gpu

# Then copy the code (COPY rather than ADD: these are plain local files).
COPY uv.dist.lock ./uv.lock
COPY pyproject.toml README.md ./
COPY extract_worker ./extract_worker/
COPY entrypoints/extract_gpu_worker.sh ./entrypoints/extract_gpu_worker.sh

# Then install the service itself. The uv cache lives on the cache mount, so
# nothing has to be cleaned up in a later layer.
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync -v --frozen --no-editable --extra base --extra gpu

ENTRYPOINT ["entrypoints/extract_gpu_worker.sh"]
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
FROM extract-worker-builder AS extract-cpu-mineru-worker

# Install third-party dependencies first so this layer stays cached until the
# lockfile or pyproject.toml changes.
# The cache mount target is an absolute path on purpose: mount options are not
# processed by a shell, so "~" is never expanded and "~/.cache/uv" would not
# point at uv's cache directory in root's home.
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.dist.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync -v --frozen --no-editable --no-install-project --extra mineru --extra cpu

# Then copy the code (COPY rather than ADD: these are plain local files).
# This stage reuses the CPU worker entrypoint script.
COPY uv.dist.lock ./uv.lock
COPY pyproject.toml README.md ./
COPY extract_worker ./extract_worker/
COPY entrypoints/extract_cpu_worker.sh ./entrypoints/extract_cpu_worker.sh

# Then install the service itself. The uv cache lives on the cache mount, so
# nothing has to be cleaned up in a later layer.
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync -v --frozen --no-editable --extra mineru --extra cpu

ENTRYPOINT ["entrypoints/extract_cpu_worker.sh"]
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
FROM extract-worker-builder AS extract-gpu-mineru-worker

# Install third-party dependencies first so this layer stays cached until the
# lockfile or pyproject.toml changes.
# The cache mount target is an absolute path on purpose: mount options are not
# processed by a shell, so "~" is never expanded and "~/.cache/uv" would not
# point at uv's cache directory in root's home.
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=uv.dist.lock,target=uv.lock \
    --mount=type=bind,source=pyproject.toml,target=pyproject.toml \
    uv sync -v --frozen --no-editable --no-install-project --extra mineru --extra gpu

# Then copy the code (COPY rather than ADD: these are plain local files).
# This stage reuses the GPU worker entrypoint script.
COPY uv.dist.lock ./uv.lock
COPY pyproject.toml README.md ./
COPY extract_worker ./extract_worker/
COPY entrypoints/extract_gpu_worker.sh ./entrypoints/extract_gpu_worker.sh

# Then install the service itself. The uv cache lives on the cache mount, so
# nothing has to be cleaned up in a later layer.
RUN --mount=type=cache,target=/root/.cache/uv \
    uv sync -v --frozen --no-editable --extra mineru --extra gpu

ENTRYPOINT ["entrypoints/extract_gpu_worker.sh"]
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datashare-extract-worker
|
|
3
|
+
Version: 0.1.3
|
|
4
|
+
Author-email: Clément Doumouro <cdoumouro@icij.org>, Clément Doumouro <clement.doumouro@gmail.com>, Lion Summerbell <lsummerbell@icij.org>
|
|
5
|
+
Requires-Python: <3.14,>=3.11
|
|
6
|
+
Requires-Dist: datashare-python~=0.8.20
|
|
7
|
+
Requires-Dist: extract-python==0.4.2
|
|
8
|
+
Requires-Dist: temporalio==1.23.0
|
|
9
|
+
Provides-Extra: base
|
|
10
|
+
Requires-Dist: extract-python[docling,marker]==0.4.2; extra == 'base'
|
|
11
|
+
Provides-Extra: cpu
|
|
12
|
+
Requires-Dist: torch==2.11.0; extra == 'cpu'
|
|
13
|
+
Requires-Dist: torchvision==0.26.0; extra == 'cpu'
|
|
14
|
+
Provides-Extra: gpu
|
|
15
|
+
Requires-Dist: cuda-bindings==12.9.4; (sys_platform == 'linux') and extra == 'gpu'
|
|
16
|
+
Requires-Dist: torch==2.11.0+cu129; (sys_platform == 'linux') and extra == 'gpu'
|
|
17
|
+
Requires-Dist: torchvision==0.26.0+cu129; (sys_platform == 'linux') and extra == 'gpu'
|
|
18
|
+
Provides-Extra: mineru
|
|
19
|
+
Requires-Dist: extract-python[mineru]==0.4.2; extra == 'mineru'
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
import contextlib
|
|
2
|
+
import logging
|
|
3
|
+
import mimetypes
|
|
4
|
+
import os
|
|
5
|
+
from collections.abc import AsyncIterable
|
|
6
|
+
from functools import cache
|
|
7
|
+
from itertools import chain
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any, cast
|
|
10
|
+
|
|
11
|
+
from datashare_python.dependencies import lifespan_es_client, lifespan_worker_config
|
|
12
|
+
from datashare_python.objects import DocArtifact, Document, DocumentLocation
|
|
13
|
+
from datashare_python.utils import (
|
|
14
|
+
ActivityWithProgress,
|
|
15
|
+
activity_defn,
|
|
16
|
+
activity_workdir,
|
|
17
|
+
read_jsonl,
|
|
18
|
+
write_artifact,
|
|
19
|
+
)
|
|
20
|
+
from extract_python import Pipeline
|
|
21
|
+
from extract_python.objects import InputDoc, OutputFormat, SupportedExt
|
|
22
|
+
from icij_common.es import (
|
|
23
|
+
DOC_CONTENT_TYPE,
|
|
24
|
+
DOC_LANGUAGE,
|
|
25
|
+
DOC_METADATA,
|
|
26
|
+
DOC_PATH,
|
|
27
|
+
DOC_ROOT_ID,
|
|
28
|
+
ES_DOCUMENT_TYPE,
|
|
29
|
+
HITS,
|
|
30
|
+
QUERY,
|
|
31
|
+
SOURCE,
|
|
32
|
+
ESClient,
|
|
33
|
+
ESSort,
|
|
34
|
+
and_query,
|
|
35
|
+
has_id,
|
|
36
|
+
has_type,
|
|
37
|
+
)
|
|
38
|
+
from pydantic import TypeAdapter
|
|
39
|
+
|
|
40
|
+
from .config import ExtractWorkerConfig
|
|
41
|
+
from .constants import MARKDOWN_DIRNAME, MARKDOWN_METADATA_KEY
|
|
42
|
+
from .objects import (
|
|
43
|
+
DocId,
|
|
44
|
+
DocumentSearchQuery,
|
|
45
|
+
ErrorReport,
|
|
46
|
+
MarkdownExtractResponse,
|
|
47
|
+
PipelineConfig,
|
|
48
|
+
ProcessedDoc,
|
|
49
|
+
ProcessingReport,
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Load the MIME type tables up front; `ext_to_mime_types` reads
# `mimetypes.types_map` and `mimetypes.common_types` directly.
mimetypes.init()
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class MarkdownExtract(ActivityWithProgress):
    """Activities backing the markdown extraction steps.

    The activities read their dependencies (worker config, ES client) through
    the ``lifespan_*`` accessors. The batching and extraction activities then
    forward the actual work to the module-level ``*_act`` coroutines.
    """

    @activity_defn(name="extract.worker_config")
    async def extract_worker_config(self) -> ExtractWorkerConfig:
        """Return the worker configuration provided by the worker lifespan."""
        return cast(ExtractWorkerConfig, lifespan_worker_config())

    @activity_defn(name="extract.create-markdown-batches")
    async def create_markdown_extract_batches(
        self,
        project: str,
        docs: list[DocId] | DocumentSearchQuery | None,
        config: PipelineConfig,
    ) -> list[Path]:
        """Split the selected documents into batch files.

        Args:
            project: project name, used as the search index and to derive the
                activity working directory.
            docs: explicit doc ids, a search query, or ``None``.
            config: pipeline config; only its supported extensions are used
                here.

        Returns:
            The written batch file paths, relative to the worker workdir.
        """
        client = lifespan_es_client()
        settings = cast(ExtractWorkerConfig, lifespan_worker_config())
        root = settings.workdir
        artifacts_dir = settings.artifacts_root
        batches_dir = activity_workdir(root, project)
        batches_dir.mkdir(parents=True, exist_ok=True)
        pages_per_batch = settings.markdown.target_n_pages_per_batch
        extensions = config.supported_exts()
        written = create_markdown_extract_batches_act(
            docs,
            project,
            extensions,
            artifacts_root=artifacts_dir,
            workdir=root,
            output_dir=batches_dir,
            target_n_pages_per_batch=pages_per_batch,
            es_client=client,
        )
        relative_paths = []
        async for batch_path in written:
            # Paths are returned relative to the workdir
            relative_paths.append(batch_path.relative_to(root))
        return relative_paths

    @activity_defn(name="extract.extract-markdown-content")
    async def extract_markdown_content(
        self, batch: Path, project: str, config: PipelineConfig
    ) -> MarkdownExtractResponse:
        """Run the extraction pipeline on one batch file.

        Args:
            batch: batch file path, resolved against the worker workdir.
            project: project name, used to derive the activity working
                directory where outputs are written.
            config: configuration the pipeline is built from.

        Returns:
            The processing report built by `extract_markdown_content_act`.
        """
        extraction_pipeline = Pipeline.from_config(config)
        settings = cast(ExtractWorkerConfig, lifespan_worker_config())
        root = settings.workdir
        results_dir = activity_workdir(root, project)
        results_dir.mkdir(parents=True, exist_ok=True)
        batch_file = root / batch
        return await extract_markdown_content_act(
            extraction_pipeline,
            batch_file,
            worker_config=settings,
            output_dir=results_dir,
        )
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# Sort documents aiming for consistent processing type in a batch: content type
# first, then language, with `_doc` as the final sort key.
_DOC_SORT = [f"{DOC_CONTENT_TYPE}:asc", f"{DOC_LANGUAGE}:asc", "_doc:asc"]
# Source fields requested from ES when searching documents (see `_search_docs`).
_DOC_CONTENT_SOURCES = [DOC_PATH, DOC_ROOT_ID, DOC_LANGUAGE, DOC_METADATA]
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
async def create_markdown_extract_batches_act(
    docs: list[DocId] | DocumentSearchQuery | None,
    project: str,
    supported_exts: set[SupportedExt],
    *,
    artifacts_root: Path,
    workdir: Path,
    output_dir: Path,
    target_n_pages_per_batch: int,
    es_client: ESClient | None = None,
) -> AsyncIterable[Path]:
    """Search the documents to process and write them as batch files.

    The steps are chained lazily: search documents in ES, symlink the ones
    stored under the artifacts root into the workdir, group them by page count
    and write each group to ``output_dir``.

    Args:
        docs: explicit doc ids, a search query, or ``None``.
        project: index searched for documents.
        supported_exts: extensions used to restrict the search by content type.
        artifacts_root: root directory of artifact-located documents.
        workdir: worker working directory receiving the symlinks.
        output_dir: directory where batch files are written.
        target_n_pages_per_batch: page count at which a new batch is started.
        es_client: client used for the search; it is passed straight to
            `_search_docs`, which calls a method on it.

    Yields:
        The path of each written batch file.
    """
    # TODO: supported content types should be args
    search_body = _build_doc_query(docs, supported_exts)

    async def _workdir_docs() -> AsyncIterable[ProcessedDoc]:
        found_docs = _search_docs(es_client, project, search_body, sort=_DOC_SORT)
        async for found in found_docs:
            yield _symlink_embedded_processed_doc_to_workdir(
                found, artifacts_root, workdir=workdir
            )

    grouped = _batch_by_n_pages(
        _workdir_docs(), target_n_pages_per_batch=target_n_pages_per_batch
    )
    async for batch_path in _write_batches(grouped, output_dir):
        yield batch_path
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
# Validates the records read from a batch file into `ProcessedDoc` instances;
# built once at import time and reused by `extract_markdown_content_act`.
_BatchTypeAdapter = TypeAdapter(list[ProcessedDoc])
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
async def extract_markdown_content_act(
    pipeline: Pipeline,
    batch: Path,
    *,
    worker_config: ExtractWorkerConfig,
    output_dir: Path,
) -> MarkdownExtractResponse:
    """Extract markdown for every document of a batch file and report on it.

    Args:
        pipeline: extraction pipeline to run.
        batch: JSONL batch file listing the documents to process.
        worker_config: provides the docs root, artifacts root and workdir used
            to locate each document.
        output_dir: directory handed to the pipeline as output path.

    Returns:
        Counts of processed and successful documents/pages, plus one error
        report per document whose extraction result carries errors.
    """
    batch_docs = _BatchTypeAdapter.validate_python(list(read_jsonl(batch)))
    docs_root = worker_config.docs_root
    artifacts_root = worker_config.artifacts_root
    workdir = worker_config.workdir

    def _pipeline_inputs():
        # Documents are located lazily, as the pipeline consumes its input
        for batch_doc in batch_docs:
            located = batch_doc.locate(
                original_root=docs_root, artifacts_root=artifacts_root, workdir=workdir
            )
            yield InputDoc.from_path(located)

    results = pipeline.extract_content(
        _pipeline_inputs(), output_format=OutputFormat.MARKDOWN, output_path=output_dir
    )
    # Each result is paired with the next document of the batch, i.e. this
    # relies on results coming back in input order.
    remaining = iter(batch_docs)
    seen_docs = seen_pages = ok_docs = ok_pages = 0
    failures = []
    async for outcome in results:
        current = next(remaining)
        seen_docs += 1
        seen_pages += current.n_pages
        if outcome.errors:
            failures.append(
                ErrorReport(doc=current, status=outcome.status, errors=outcome.errors)
            )
            continue
        ok_docs += 1
        ok_pages += current.n_pages
        markdown_artifact = DocArtifact(
            project=current.index,
            doc_id=current.id,
            artifact=output_dir / outcome.output.path,
            metadata_key=MARKDOWN_METADATA_KEY,
            filename=MARKDOWN_DIRNAME,
        )
        write_artifact(artifacts_root, markdown_artifact)
    return MarkdownExtractResponse(
        processed=ProcessingReport(n_docs=seen_docs, n_pages=seen_pages),
        successes=ProcessingReport(n_docs=ok_docs, n_pages=ok_pages),
        errors=failures,
    )
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
def _with_supported_exts_query(supported_exts: set[SupportedExt]) -> dict[str, Any]:
    """Build the query clause selecting documents with a supported content type.

    Returns the content of the ``QUERY`` key of the combined query, not the
    full search body.
    """
    mime_types = []
    for ext in supported_exts:
        mime_types.extend(ext_to_mime_types(ext))
    # Sorted so the generated query does not depend on set iteration order
    mime_types.sort()
    content_type_clause = {"terms": {DOC_CONTENT_TYPE: mime_types}}
    document_clause = has_type(type_field="type", type_value=ES_DOCUMENT_TYPE)
    combined = and_query(content_type_clause, document_clause)
    return combined[QUERY]
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _build_doc_query(
    docs: list[DocId] | DocumentSearchQuery | None, supported_exts: set[SupportedExt]
) -> dict[str, Any]:
    """Build the search body selecting the documents to process.

    The supported content type restriction always applies. On top of it:
    ``None`` or an empty dict adds nothing, a non-empty dict is combined as an
    extra query, and a list is treated as document ids.

    Raises:
        ValueError: if ``docs`` is none of the above.
    """
    content_type_query = _with_supported_exts_query(supported_exts)
    if isinstance(docs, dict):
        if docs:
            return and_query(content_type_query, docs)
        return {QUERY: content_type_query}
    if docs is None:
        return {QUERY: content_type_query}
    if isinstance(docs, list):
        return and_query(content_type_query, has_id(docs))
    raise ValueError(f"unsupported format {docs.__class__.__name__}")
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
async def _search_docs(
    es_client: ESClient, project: str, query: dict[str, Any], sort: ESSort = None
) -> AsyncIterable[ProcessedDoc]:
    """Yield every document matching ``query`` in the ``project`` index.

    The page count is read from the ``tika_metadata_xmptpg_npages`` entry of
    the document metadata when the hit has metadata; otherwise ``None`` is
    passed on to `ProcessedDoc.from_fs_doc`.
    """
    pages = es_client.poll_search_pages(
        index=project,
        body=query,
        sort=sort,
        _source_includes=_DOC_CONTENT_SOURCES,
    )
    async for page in pages:
        for hit in page[HITS][HITS]:
            metadata = hit[SOURCE].get(DOC_METADATA)
            n_pages = (
                None
                if metadata is None
                else metadata.get("tika_metadata_xmptpg_npages")
            )
            fs_doc = Document.from_es(hit).to_filesystem()
            yield ProcessedDoc.from_fs_doc(fs_doc, n_pages=n_pages)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
async def _batch_by_n_pages(
|
|
241
|
+
docs: AsyncIterable[ProcessedDoc], target_n_pages_per_batch: int
|
|
242
|
+
) -> AsyncIterable[list[ProcessedDoc]]:
|
|
243
|
+
current_n_pages = 0
|
|
244
|
+
current_batch = []
|
|
245
|
+
async for d in docs:
|
|
246
|
+
if current_n_pages >= target_n_pages_per_batch:
|
|
247
|
+
yield current_batch
|
|
248
|
+
current_n_pages = 0
|
|
249
|
+
current_batch = []
|
|
250
|
+
current_batch.append(d)
|
|
251
|
+
current_n_pages += d.n_pages
|
|
252
|
+
if current_batch:
|
|
253
|
+
yield current_batch
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
async def _write_batches(
|
|
257
|
+
batches: AsyncIterable[list[ProcessedDoc]], root: Path
|
|
258
|
+
) -> AsyncIterable[Path]:
|
|
259
|
+
batch_id = 0
|
|
260
|
+
async for batch in batches:
|
|
261
|
+
batch_path = root / f"{batch_id}.jsonl"
|
|
262
|
+
with batch_path.open("w") as f:
|
|
263
|
+
for fs_doc in batch:
|
|
264
|
+
f.write(f"{fs_doc.model_dump_json()}\n")
|
|
265
|
+
yield batch_path
|
|
266
|
+
batch_id += 1
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def _symlink_embedded_processed_doc_to_workdir(
    doc: ProcessedDoc, artifacts_root: Path, *, workdir: Path
) -> ProcessedDoc:
    """Expose an artifact-located document through a symlink in the workdir.

    Documents located under the artifacts root are symlinked under
    ``<workdir>/<index>/symlinks/`` with a name carrying the extension of the
    document's resource name, and a copy of the document pointing at the
    symlink (located in the workdir) is returned. Documents at their original
    location are returned untouched.

    Raises:
        ValueError: if the document location is neither of the above.
    """
    location = doc.location
    if location == DocumentLocation.ARTIFACTS:
        links_root = workdir / doc.index / "symlinks"
        links_root.mkdir(parents=True, exist_ok=True)
        # Swap the last path component (the "raw" file) for the doc id
        renamed = Path(*doc.path.parts[:-1], doc.id)
        extension = Path(doc.resource_name).suffix
        # The path is made relative to the index dir; it is re-rooted under
        # the symlinks dir just below
        renamed = renamed.relative_to(Path(doc.index))
        link = links_root / f"{renamed}{extension}"
        link.parent.mkdir(parents=True, exist_ok=True)
        target = artifacts_root / doc.path
        # An already existing link is left as is
        with contextlib.suppress(FileExistsError):
            os.symlink(target, link)
        return ProcessedDoc(
            path=link.relative_to(workdir),
            id=doc.id,
            location=DocumentLocation.WORKDIR,
            index=doc.index,
            resource_name=doc.resource_name,
            n_pages=doc.n_pages,
        )
    if location == DocumentLocation.ORIGINAL:
        return doc
    raise ValueError(f"unsupported location {doc.location}")
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
@cache
def ext_to_mime_types(ext: SupportedExt) -> set[str]:
    """Return the MIME types associated with a supported extension.

    A few extensions are mapped by hand (aliased to another extension or to a
    fixed MIME type); the others are looked up in the `mimetypes` tables.
    Results are cached per extension, so the returned set is shared between
    calls.

    Raises:
        ValueError: if the extension is not in ``mimetypes.types_map``.
    """
    # All particular cases
    if ext == SupportedExt.NXML:
        return ext_to_mime_types(SupportedExt.XML)
    if ext in (SupportedExt.ADOC, SupportedExt.ASCIIDOC):
        return {"text/x-asciidoc"}
    if ext in (SupportedExt.QMD, SupportedExt.RMD):
        return ext_to_mime_types(SupportedExt.MD)
    if ext == SupportedExt.XBRL:
        # NOTE(review): `HTLM` looks like a typo of `HTML`; confirm the member
        # name exposed by `extract_python` before touching it.
        return ext_to_mime_types(SupportedExt.HTLM)
    # General case: standard type, plus the "common" one when there is one
    try:
        standard_type = mimetypes.types_map[ext]
    except KeyError as e:
        raise ValueError(f"unsupported mimetype {ext}") from e
    found = {standard_type}
    common_type = mimetypes.common_types.get(ext)
    if common_type is not None:
        found.add(common_type)
    return found
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
# Activities defined by this module, listed in one place.
# NOTE(review): presumably read by the worker runtime to register activities;
# confirm against the code loading this module.
ACTIVITIES = [
    MarkdownExtract.extract_worker_config,
    MarkdownExtract.create_markdown_extract_batches,
    MarkdownExtract.extract_markdown_content,
]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from datashare_python.config import WorkerConfig
|
|
2
|
+
from datashare_python.objects import DatashareModel
|
|
3
|
+
from pydantic import Field
|
|
4
|
+
|
|
5
|
+
from .constants import TorchDevice
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class MarkdownExtractConfig(DatashareModel):
    """Settings of the markdown extraction activities."""

    # Cumulated page count at which a batch of documents is closed when
    # documents are grouped into batches.
    target_n_pages_per_batch: int = 100
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ExtractWorkerConfig(WorkerConfig):
    """Worker configuration extended with extraction specific settings."""

    # Torch device of the worker; defaults to CPU and is declared frozen.
    device: TorchDevice = Field(default=TorchDevice.CPU, frozen=True)

    # Markdown extraction settings, built from defaults when not provided.
    markdown: MarkdownExtractConfig = Field(default_factory=MarkdownExtractConfig)


# NOTE(review): presumably looked up by name by the worker runtime to pick the
# config class; confirm before renaming.
WORKER_CONFIG_CLS = ExtractWorkerConfig
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
from enum import StrEnum
|
|
2
|
+
|
|
3
|
+
from extract_python import PipelineType
|
|
4
|
+
|
|
5
|
+
# Metadata key and file name given to the markdown artifact of a document.
MARKDOWN_METADATA_KEY = "extract.markdown"
MARKDOWN_DIRNAME = "markdown"
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class TaskQueue(StrEnum):
    """Names of the task queues used by the extract workers."""

    WORKFLOWS = "datashare.workflows"
    IO = "extract.io"
    # Extraction queues, one per device / pipeline family combination
    EXTRACT_GPU_MINER_U = "extract.gpu.mineru"
    EXTRACT_CPU_MINER_U = "extract.cpu.mineru"
    EXTRACT_GPU = "extract.gpu"
    EXTRACT_CPU = "extract.cpu"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class TorchDevice(StrEnum):
    """Device an extraction worker runs on."""

    CPU = "cpu"
    GPU = "cuda"

    def md_extract_queue(self, pipeline: PipelineType) -> TaskQueue:
        """Return the extraction task queue matching this device and pipeline.

        MinerU pipelines get a dedicated queue on each device; every other
        pipeline type goes to the generic queue of the device.

        Raises:
            ValueError: if the device is neither GPU nor CPU.
        """
        uses_mineru = pipeline is PipelineType.MINER_U
        if self == TorchDevice.GPU:
            return (
                TaskQueue.EXTRACT_GPU_MINER_U if uses_mineru else TaskQueue.EXTRACT_GPU
            )
        if self == TorchDevice.CPU:
            return (
                TaskQueue.EXTRACT_CPU_MINER_U if uses_mineru else TaskQueue.EXTRACT_CPU
            )
        raise ValueError(f"unsupported TorchDevice {self}")
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from datashare_python.dependencies import (
|
|
2
|
+
lifespan_es_client, # noqa: F401
|
|
3
|
+
set_es_client,
|
|
4
|
+
set_loggers,
|
|
5
|
+
set_worker_config,
|
|
6
|
+
)
|
|
7
|
+
|
|
8
|
+
# Dependency setup functions of the IO workers: same as the extract workers,
# plus the ES client.
IO = [set_worker_config, set_loggers, set_es_client]
# Dependency setup functions of the extract workers (no ES client).
EXTRACT = [set_worker_config, set_loggers]

# Named dependency sets.
# NOTE(review): the keys are presumably the names workers use to select their
# dependencies; confirm against the worker entrypoints.
DEPENDENCIES = {"extract.io": IO, "extract.extract": EXTRACT}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
from typing import Annotated, Any, Self
|
|
2
|
+
|
|
3
|
+
from datashare_python.objects import DatashareModel, FilesystemDocument
|
|
4
|
+
from extract_python import (
|
|
5
|
+
DoclingPipelineConfig,
|
|
6
|
+
PipelineType,
|
|
7
|
+
Status,
|
|
8
|
+
)
|
|
9
|
+
from extract_python import (
|
|
10
|
+
PipelineConfig as ExtractPipelineConfig,
|
|
11
|
+
)
|
|
12
|
+
from extract_python.objects import Error
|
|
13
|
+
from icij_common.pydantic_utils import make_enum_discriminator, tagged_union
|
|
14
|
+
from pydantic import Discriminator, Field
|
|
15
|
+
|
|
16
|
+
# A search query expressed as a plain dict.
DocumentSearchQuery = dict[str, Any]
# Document identifier.
DocId = str


# Pipeline configs form a tagged union over every direct subclass of the base
# pipeline config known at import time, discriminated on their `pipeline`
# field.
pipeline_discriminator = make_enum_discriminator("pipeline", PipelineType)
PipelineConfig = Annotated[
    tagged_union(ExtractPipelineConfig.__subclasses__(), lambda t: t.pipeline.default),
    Discriminator(pipeline_discriminator),
]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class MarkdownExtractArgs(DatashareModel):
    """Arguments of a markdown extraction request."""

    project: str
    # Explicit document ids, a search query, or None
    docs: list[DocId] | DocumentSearchQuery | None
    # Pipeline configuration; a default Docling config when omitted
    config: PipelineConfig = Field(default_factory=DoclingPipelineConfig)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ProcessingReport(DatashareModel):
    """Number of documents and pages covered by a processing step."""

    n_docs: int = 0
    n_pages: int = 0

    def __add__(self, other: Self) -> Self:
        """Return a new report summing the counts of both operands."""
        total_docs = other.n_docs + self.n_docs
        total_pages = other.n_pages + self.n_pages
        return ProcessingReport(n_docs=total_docs, n_pages=total_pages)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class ProcessedDoc(FilesystemDocument):
    """Filesystem document carrying its number of pages."""

    n_pages: int

    @classmethod
    def from_fs_doc(cls, fs_doc: FilesystemDocument, n_pages: int | None) -> Self:
        """Build a processed doc from a filesystem doc and a page count.

        A missing page count (``None``) is counted as a single page.
        """
        if n_pages is None:
            n_pages = 1
        fields = fs_doc.model_dump()
        return cls(n_pages=n_pages, **fields)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class ErrorReport(DatashareModel):
    """Extraction failure of a single document."""

    # Document whose extraction reported errors
    doc: ProcessedDoc
    # Status reported by the extraction result
    status: Status
    # Errors reported by the extraction result
    errors: list[Error] = []
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class MarkdownExtractResponse(DatashareModel):
    """Outcome of a markdown extraction: overall counts, successes and errors."""

    # Counts over every processed document, successful or not
    processed: ProcessingReport = Field(default_factory=ProcessingReport)
    # Counts over the documents extracted without errors
    successes: ProcessingReport = Field(default_factory=ProcessingReport)
    # One report per document whose extraction had errors
    errors: list[ErrorReport] = Field(default_factory=list)

    @classmethod
    def from_responses(cls, *responses: Self) -> Self:
        """Merge several responses into one.

        Counts are summed and error reports are concatenated in the order of
        ``responses``. Without any response, an empty response is returned.
        """
        processed = sum((r.processed for r in responses), start=ProcessingReport())
        successes = sum((r.successes for r in responses), start=ProcessingReport())
        # Flat comprehension rather than `sum(..., start=[])`, which copies the
        # accumulated list at every step.
        errors = [error for r in responses for error in r.errors]
        return cls(processed=processed, successes=successes, errors=errors)
|