datashare-extract-worker 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,19 @@
1
+ Metadata-Version: 2.4
2
+ Name: datashare-extract-worker
3
+ Version: 0.1.3
4
+ Author-email: Clément Doumouro <cdoumouro@icij.org>, Clément Doumouro <clement.doumouro@gmail.com>, Lion Summerbell <lsummerbell@icij.org>
5
+ Requires-Python: <3.14,>=3.11
6
+ Requires-Dist: datashare-python~=0.8.20
7
+ Requires-Dist: extract-python==0.4.2
8
+ Requires-Dist: temporalio==1.23.0
9
+ Provides-Extra: base
10
+ Requires-Dist: extract-python[docling,marker]==0.4.2; extra == 'base'
11
+ Provides-Extra: cpu
12
+ Requires-Dist: torch==2.11.0; extra == 'cpu'
13
+ Requires-Dist: torchvision==0.26.0; extra == 'cpu'
14
+ Provides-Extra: gpu
15
+ Requires-Dist: cuda-bindings==12.9.4; (sys_platform == 'linux') and extra == 'gpu'
16
+ Requires-Dist: torch==2.11.0+cu129; (sys_platform == 'linux') and extra == 'gpu'
17
+ Requires-Dist: torchvision==0.26.0+cu129; (sys_platform == 'linux') and extra == 'gpu'
18
+ Provides-Extra: mineru
19
+ Requires-Dist: extract-python[mineru]==0.4.2; extra == 'mineru'
@@ -0,0 +1,11 @@
1
+ extract_worker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ extract_worker/activities.py,sha256=W8fIP47XY39qaw_6X8Gem_AnfdbloRz5I6W8pgnqIho,10909
3
+ extract_worker/config.py,sha256=ZAcomS7KQD7SYxPZqSWGAuBOO4uuMOvj489Lyq6_2Ls,490
4
+ extract_worker/constants.py,sha256=GQue-4WG7auapHvxj2rk9x7wfQItf1LFaCEFeSe9Qak,1012
5
+ extract_worker/dependencies.py,sha256=_avB1j9IX-GvxUhJPA9JyqUhHnX7aUO8aY7OxMN16Qo,303
6
+ extract_worker/objects.py,sha256=xnNx_8BfRL8T10uAezIDLdyHWg1cF3XvUh8CFMohpCA,2185
7
+ extract_worker/workflows.py,sha256=58mTZiXwwBkcXwSEPAif0O032KxkDVaWCPVQJlnLcVw,2392
8
+ datashare_extract_worker-0.1.3.dist-info/METADATA,sha256=QOwnKv-AK4e9S85zQLNvj9A20UeRPG_y28a8vWurIBU,915
9
+ datashare_extract_worker-0.1.3.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
10
+ datashare_extract_worker-0.1.3.dist-info/entry_points.txt,sha256=2s1sCvz-0PQaepb3RmxFVgKxOuG203vJnTdlwIvx41I,317
11
+ datashare_extract_worker-0.1.3.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,11 @@
1
+ [datashare.activities]
2
+ activities = extract_worker.activities:ACTIVITIES
3
+
4
+ [datashare.dependencies]
5
+ dependencies = extract_worker.dependencies:DEPENDENCIES
6
+
7
+ [datashare.worker_config_cls]
8
+ worker_config_cls = extract_worker.config:WORKER_CONFIG_CLS
9
+
10
+ [datashare.workflows]
11
+ workflows = extract_worker.workflows:WORKFLOWS
File without changes
@@ -0,0 +1,325 @@
1
+ import contextlib
2
+ import logging
3
+ import mimetypes
4
+ import os
5
+ from collections.abc import AsyncIterable
6
+ from functools import cache
7
+ from itertools import chain
8
+ from pathlib import Path
9
+ from typing import Any, cast
10
+
11
+ from datashare_python.dependencies import lifespan_es_client, lifespan_worker_config
12
+ from datashare_python.objects import DocArtifact, Document, DocumentLocation
13
+ from datashare_python.utils import (
14
+ ActivityWithProgress,
15
+ activity_defn,
16
+ activity_workdir,
17
+ read_jsonl,
18
+ write_artifact,
19
+ )
20
+ from extract_python import Pipeline
21
+ from extract_python.objects import InputDoc, OutputFormat, SupportedExt
22
+ from icij_common.es import (
23
+ DOC_CONTENT_TYPE,
24
+ DOC_LANGUAGE,
25
+ DOC_METADATA,
26
+ DOC_PATH,
27
+ DOC_ROOT_ID,
28
+ ES_DOCUMENT_TYPE,
29
+ HITS,
30
+ QUERY,
31
+ SOURCE,
32
+ ESClient,
33
+ ESSort,
34
+ and_query,
35
+ has_id,
36
+ has_type,
37
+ )
38
+ from pydantic import TypeAdapter
39
+
40
+ from .config import ExtractWorkerConfig
41
+ from .constants import MARKDOWN_DIRNAME, MARKDOWN_METADATA_KEY
42
+ from .objects import (
43
+ DocId,
44
+ DocumentSearchQuery,
45
+ ErrorReport,
46
+ MarkdownExtractResponse,
47
+ PipelineConfig,
48
+ ProcessedDoc,
49
+ ProcessingReport,
50
+ )
51
+
52
+ logger = logging.getLogger(__name__)
53
+
54
+ mimetypes.init()
55
+
56
+
57
class MarkdownExtract(ActivityWithProgress):
    """Activities used by the Markdown extraction workflow."""

    @activity_defn(name="extract.worker_config")
    async def extract_worker_config(self) -> ExtractWorkerConfig:
        """Return the configuration of the worker executing this activity."""
        return cast(ExtractWorkerConfig, lifespan_worker_config())

    @activity_defn(name="extract.create-markdown-batches")
    async def create_markdown_extract_batches(
        self,
        project: str,
        docs: list[DocId] | DocumentSearchQuery | None,
        config: PipelineConfig,
    ) -> list[Path]:
        """Write the batch files for ``project`` and return their paths.

        The returned paths are relative to the worker's workdir.
        """
        es_client = lifespan_es_client()
        settings = cast(ExtractWorkerConfig, lifespan_worker_config())
        workdir = settings.workdir
        batches_dir = activity_workdir(workdir, project)
        batches_dir.mkdir(parents=True, exist_ok=True)
        batch_files = create_markdown_extract_batches_act(
            docs,
            project,
            config.supported_exts(),
            artifacts_root=settings.artifacts_root,
            workdir=workdir,
            output_dir=batches_dir,
            target_n_pages_per_batch=settings.markdown.target_n_pages_per_batch,
            es_client=es_client,
        )
        return [batch_file.relative_to(workdir) async for batch_file in batch_files]

    @activity_defn(name="extract.extract-markdown-content")
    async def extract_markdown_content(
        self, batch: Path, project: str, config: PipelineConfig
    ) -> MarkdownExtractResponse:
        """Run the configured pipeline on one batch file.

        ``batch`` is resolved against the worker's workdir before being read.
        """
        pipeline = Pipeline.from_config(config)
        settings = cast(ExtractWorkerConfig, lifespan_worker_config())
        workdir = settings.workdir
        markdown_dir = activity_workdir(workdir, project)
        markdown_dir.mkdir(parents=True, exist_ok=True)
        return await extract_markdown_content_act(
            pipeline,
            workdir / batch,
            worker_config=settings,
            output_dir=markdown_dir,
        )
109
+
110
+
111
# Sort by content type, then language, so that a batch tends to hold documents
# needing the same kind of processing ("_doc" is the final sort key)
_DOC_SORT = [f"{key}:asc" for key in (DOC_CONTENT_TYPE, DOC_LANGUAGE, "_doc")]
# Source fields requested for every document hit
_DOC_CONTENT_SOURCES = [DOC_PATH, DOC_ROOT_ID, DOC_LANGUAGE, DOC_METADATA]
114
+
115
+
116
async def create_markdown_extract_batches_act(
    docs: list[DocId] | DocumentSearchQuery | None,
    project: str,
    supported_exts: set[SupportedExt],
    *,
    artifacts_root: Path,
    workdir: Path,
    output_dir: Path,
    target_n_pages_per_batch: int,
    es_client: ESClient | None = None,
) -> AsyncIterable[Path]:
    """Search the project's documents, batch them by page count and write the batches.

    Yields the path of each batch file written under ``output_dir``.
    """
    # NOTE(review): es_client defaults to None but is handed as-is to
    # _search_docs, which calls a method on it; callers are expected to pass one.
    search_body = _build_doc_query(docs, supported_exts)
    found_docs = _search_docs(es_client, project, search_body, sort=_DOC_SORT)
    # Documents stored as artifacts are exposed through symlinks in the workdir
    workdir_docs = (
        _symlink_embedded_processed_doc_to_workdir(
            found, artifacts_root, workdir=workdir
        )
        async for found in found_docs
    )
    page_batches = _batch_by_n_pages(
        workdir_docs, target_n_pages_per_batch=target_n_pages_per_batch
    )
    async for batch_path in _write_batches(page_batches, output_dir):
        yield batch_path
136
+
137
+
138
_BatchTypeAdapter = TypeAdapter(list[ProcessedDoc])


async def extract_markdown_content_act(
    pipeline: Pipeline,
    batch: Path,
    *,
    worker_config: ExtractWorkerConfig,
    output_dir: Path,
) -> MarkdownExtractResponse:
    """Extract Markdown for every document listed in the ``batch`` JSONL file.

    Documents extracted without errors get their Markdown registered as an
    artifact; the others are reported in the response's ``errors``.
    """
    batch_docs = _BatchTypeAdapter.validate_python(list(read_jsonl(batch)))
    docs_root = worker_config.docs_root
    artifacts_root = worker_config.artifacts_root
    workdir = worker_config.workdir

    def _as_input(doc: ProcessedDoc) -> InputDoc:
        located = doc.locate(
            original_root=docs_root, artifacts_root=artifacts_root, workdir=workdir
        )
        return InputDoc.from_path(located)

    results = pipeline.extract_content(
        (_as_input(d) for d in batch_docs),
        output_format=OutputFormat.MARKDOWN,
        output_path=output_dir,
    )
    # Results are paired with the batch documents by position
    pending_docs = iter(batch_docs)
    seen_docs = seen_pages = ok_docs = ok_pages = 0
    failures = []
    async for outcome in results:
        doc = next(pending_docs)
        seen_docs += 1
        seen_pages += doc.n_pages
        if outcome.errors:
            failures.append(
                ErrorReport(doc=doc, status=outcome.status, errors=outcome.errors)
            )
            continue
        ok_docs += 1
        ok_pages += doc.n_pages
        markdown = DocArtifact(
            project=doc.index,
            doc_id=doc.id,
            artifact=output_dir / outcome.output.path,
            metadata_key=MARKDOWN_METADATA_KEY,
            filename=MARKDOWN_DIRNAME,
        )
        write_artifact(artifacts_root, markdown)
    return MarkdownExtractResponse(
        processed=ProcessingReport(n_docs=seen_docs, n_pages=seen_pages),
        successes=ProcessingReport(n_docs=ok_docs, n_pages=ok_pages),
        errors=failures,
    )
193
+
194
+
195
def _with_supported_exts_query(supported_exts: set[SupportedExt]) -> dict[str, Any]:
    """Build the query clause matching documents whose content type is supported."""
    mimes = sorted(
        chain.from_iterable(ext_to_mime_types(ext) for ext in supported_exts)
    )
    content_type_filter = {"terms": {DOC_CONTENT_TYPE: mimes}}
    is_document = has_type(type_field="type", type_value=ES_DOCUMENT_TYPE)
    # Only the inner clause is returned so that callers can combine it further
    return and_query(content_type_filter, is_document)[QUERY]
202
+
203
+
204
def _build_doc_query(
    docs: list[DocId] | DocumentSearchQuery | None, supported_exts: set[SupportedExt]
) -> dict[str, Any]:
    """Combine the supported-format filter with the caller's document selection.

    ``docs`` may be a search query (dict), a list of document ids, or None;
    None and an empty dict both select on format only.

    Raises:
        ValueError: if ``docs`` is of any other type.
    """
    format_query = _with_supported_exts_query(supported_exts)
    if docs is None:
        return {QUERY: format_query}
    if isinstance(docs, dict):
        if docs:
            return and_query(format_query, docs)
        return {QUERY: format_query}
    if isinstance(docs, list):
        return and_query(format_query, has_id(docs))
    raise ValueError(f"unsupported format {docs.__class__.__name__}")
219
+
220
+
221
async def _search_docs(
    es_client: ESClient, project: str, query: dict[str, Any], sort: ESSort = None
) -> AsyncIterable[ProcessedDoc]:
    """Yield a ProcessedDoc for every hit of ``query`` in the ``project`` index."""
    pages = es_client.poll_search_pages(
        index=project,
        body=query,
        sort=sort,
        _source_includes=_DOC_CONTENT_SOURCES,
    )
    async for page in pages:
        for hit in page[HITS][HITS]:
            metadata = hit[SOURCE].get(DOC_METADATA)
            # The page count is read from the document metadata when available
            n_pages = (
                None
                if metadata is None
                else metadata.get("tika_metadata_xmptpg_npages")
            )
            fs_doc = Document.from_es(hit).to_filesystem()
            yield ProcessedDoc.from_fs_doc(fs_doc, n_pages=n_pages)
238
+
239
+
240
async def _batch_by_n_pages(
    docs: AsyncIterable[ProcessedDoc], target_n_pages_per_batch: int
) -> AsyncIterable[list[ProcessedDoc]]:
    """Group ``docs`` into consecutive batches of roughly the target page count.

    A batch is closed once its cumulated number of pages has reached
    ``target_n_pages_per_batch``, so a batch may exceed the target by the size
    of its last document. Empty batches are never yielded.
    """
    batch: list[ProcessedDoc] = []
    batch_n_pages = 0
    async for doc in docs:
        # Only flush a non-empty batch: with a non-positive target the
        # threshold is already reached before any document was added, which
        # used to emit an empty batch.
        if batch and batch_n_pages >= target_n_pages_per_batch:
            yield batch
            batch, batch_n_pages = [], 0
        batch.append(doc)
        batch_n_pages += doc.n_pages
    if batch:
        yield batch
254
+
255
+
256
async def _write_batches(
    batches: AsyncIterable[list[ProcessedDoc]], root: Path
) -> AsyncIterable[Path]:
    """Write each batch as a JSONL file under ``root`` and yield its path.

    Files are named ``<batch index>.jsonl`` with indices starting at 0; each
    line is the JSON dump of one document.
    """
    batch_id = 0
    async for batch in batches:
        batch_path = root / f"{batch_id}.jsonl"
        # Pin the encoding: JSON is UTF-8 and the default text encoding depends
        # on the platform locale.
        # NOTE(review): confirm read_jsonl reads these files as UTF-8 too.
        with batch_path.open("w", encoding="utf-8") as f:
            f.writelines(f"{fs_doc.model_dump_json()}\n" for fs_doc in batch)
        yield batch_path
        batch_id += 1
267
+
268
+
269
def _symlink_embedded_processed_doc_to_workdir(
    doc: ProcessedDoc, artifacts_root: Path, *, workdir: Path
) -> ProcessedDoc:
    """Expose a document stored as an artifact through a symlink in the workdir.

    Documents at their original location are returned unchanged. For artifact
    documents a symlink named after the document id, carrying the extension of
    ``resource_name``, is created under ``<workdir>/<index>/symlinks`` and a
    new ProcessedDoc located in the workdir is returned.

    Raises:
        ValueError: if the document location is neither original nor artifacts.
    """
    if doc.location == DocumentLocation.ORIGINAL:
        return doc
    if doc.location != DocumentLocation.ARTIFACTS:
        raise ValueError(f"unsupported location {doc.location}")

    links_root = workdir / doc.index / "symlinks"
    links_root.mkdir(parents=True, exist_ok=True)
    # Swap the last path component (the "raw" file) for the doc id, then drop
    # the leading index directory
    renamed = Path(*doc.path.parts[:-1], doc.id)
    extension = Path(doc.resource_name).suffix
    renamed = renamed.relative_to(Path(doc.index))
    link = links_root / f"{renamed}{extension}"
    link.parent.mkdir(parents=True, exist_ok=True)
    target = artifacts_root / doc.path
    # An already existing link is left untouched
    with contextlib.suppress(FileExistsError):
        os.symlink(target, link)
    return ProcessedDoc(
        path=link.relative_to(workdir),
        id=doc.id,
        location=DocumentLocation.WORKDIR,
        index=doc.index,
        resource_name=doc.resource_name,
        n_pages=doc.n_pages,
    )
297
+
298
+
299
@cache
def ext_to_mime_types(ext: SupportedExt) -> set[str]:
    """Return the MIME types associated with a supported extension.

    A few extensions are mapped explicitly; the others are looked up in the
    ``mimetypes`` tables. Results are cached, so the returned set is shared
    between calls and must not be mutated.

    Raises:
        ValueError: if ``mimetypes.types_map`` has no entry for the extension.
    """
    # Extensions handled explicitly rather than through the mimetypes tables
    if ext == SupportedExt.NXML:
        return ext_to_mime_types(SupportedExt.XML)
    if ext == SupportedExt.ADOC or ext == SupportedExt.ASCIIDOC:
        return {"text/x-asciidoc"}
    if ext == SupportedExt.QMD or ext == SupportedExt.RMD:
        return ext_to_mime_types(SupportedExt.MD)
    if ext == SupportedExt.XBRL:
        # NOTE(review): "HTLM" looks like a typo for "HTML"; confirm the member
        # name against extract_python's SupportedExt.
        return ext_to_mime_types(SupportedExt.HTLM)

    try:
        primary = mimetypes.types_map[ext]
    except KeyError as e:
        raise ValueError(f"unsupported mimetype {ext}") from e
    alternative = mimetypes.common_types.get(ext)
    return {primary} if alternative is None else {primary, alternative}
319
+
320
+
321
# Activities referenced by the "datashare.activities" entry point
ACTIVITIES = [
    MarkdownExtract.extract_worker_config,
    MarkdownExtract.create_markdown_extract_batches,
    MarkdownExtract.extract_markdown_content,
]
@@ -0,0 +1,18 @@
1
+ from datashare_python.config import WorkerConfig
2
+ from datashare_python.objects import DatashareModel
3
+ from pydantic import Field
4
+
5
+ from .constants import TorchDevice
6
+
7
+
8
class MarkdownExtractConfig(DatashareModel):
    # Cumulated number of pages at which a batch of documents gets closed
    target_n_pages_per_batch: int = Field(default=100)
10
+
11
+
12
class ExtractWorkerConfig(WorkerConfig):
    # Torch device of the worker, declared as a frozen field
    device: TorchDevice = Field(TorchDevice.CPU, frozen=True)

    # Settings of the Markdown extraction
    markdown: MarkdownExtractConfig = Field(default_factory=MarkdownExtractConfig)


# Class referenced by the "datashare.worker_config_cls" entry point
WORKER_CONFIG_CLS = ExtractWorkerConfig
@@ -0,0 +1,34 @@
1
+ from enum import StrEnum
2
+
3
+ from extract_python import PipelineType
4
+
5
# Metadata key and filename under which the extracted Markdown artifact is
# registered (both are passed to DocArtifact by the extraction activity)
MARKDOWN_METADATA_KEY = "extract.markdown"
MARKDOWN_DIRNAME = "markdown"
7
+
8
+
9
class TaskQueue(StrEnum):
    # Task queue names; the extract queues are selected by
    # TorchDevice.md_extract_queue according to the device and the pipeline
    WORKFLOWS = "datashare.workflows"
    IO = "extract.io"
    EXTRACT_GPU_MINER_U = "extract.gpu.mineru"
    EXTRACT_CPU_MINER_U = "extract.cpu.mineru"
    EXTRACT_GPU = "extract.gpu"
    EXTRACT_CPU = "extract.cpu"
16
+
17
+
18
class TorchDevice(StrEnum):
    """Device a worker runs on."""

    CPU = "cpu"
    GPU = "cuda"

    def md_extract_queue(self, pipeline: PipelineType) -> TaskQueue:
        """Return the Markdown extraction task queue for this device.

        The MinerU pipeline gets a dedicated queue on each device.

        Raises:
            ValueError: if the device is neither CPU nor GPU.
        """
        uses_mineru = pipeline is PipelineType.MINER_U
        if self == TorchDevice.GPU:
            if uses_mineru:
                return TaskQueue.EXTRACT_GPU_MINER_U
            return TaskQueue.EXTRACT_GPU
        if self == TorchDevice.CPU:
            if uses_mineru:
                return TaskQueue.EXTRACT_CPU_MINER_U
            return TaskQueue.EXTRACT_CPU
        raise ValueError(f"unsupported TorchDevice {self}")
@@ -0,0 +1,11 @@
1
+ from datashare_python.dependencies import (
2
+ lifespan_es_client, # noqa: F401
3
+ set_es_client,
4
+ set_loggers,
5
+ set_worker_config,
6
+ )
7
+
8
# Dependencies including the ES client setup
IO = [set_worker_config, set_loggers, set_es_client]
# Same dependencies without the ES client
EXTRACT = [set_worker_config, set_loggers]

# Mapping referenced by the "datashare.dependencies" entry point.
# NOTE(review): confirm how datashare_python resolves these keys; apart from
# "extract.io" they do not match the task queue names in constants.TaskQueue.
DEPENDENCIES = {"extract.io": IO, "extract.extract": EXTRACT}
@@ -0,0 +1,68 @@
1
+ from typing import Annotated, Any, Self
2
+
3
+ from datashare_python.objects import DatashareModel, FilesystemDocument
4
+ from extract_python import (
5
+ DoclingPipelineConfig,
6
+ PipelineType,
7
+ Status,
8
+ )
9
+ from extract_python import (
10
+ PipelineConfig as ExtractPipelineConfig,
11
+ )
12
+ from extract_python.objects import Error
13
+ from icij_common.pydantic_utils import make_enum_discriminator, tagged_union
14
+ from pydantic import Discriminator, Field
15
+
16
# Search query used to select documents
DocumentSearchQuery = dict[str, Any]
# Document identifier
DocId = str


# Pipeline configs are discriminated on their "pipeline" field
pipeline_discriminator = make_enum_discriminator("pipeline", PipelineType)
# Union over the direct subclasses of ExtractPipelineConfig defined at the time
# this module is imported, each tagged with the default of its "pipeline" field
PipelineConfig = Annotated[
    tagged_union(ExtractPipelineConfig.__subclasses__(), lambda t: t.pipeline.default),
    Discriminator(pipeline_discriminator),
]
25
+
26
+
27
class MarkdownExtractArgs(DatashareModel):
    # Project whose documents are processed
    project: str
    # Documents to process: a list of ids, a search query, or None (required,
    # no default)
    docs: list[DocId] | DocumentSearchQuery | None
    # Pipeline configuration, a default DoclingPipelineConfig when omitted
    config: PipelineConfig = Field(default_factory=DoclingPipelineConfig)
31
+
32
+
33
class ProcessingReport(DatashareModel):
    # Counts of documents and pages, both starting at 0
    n_docs: int = 0
    n_pages: int = 0

    def __add__(self, other: Self) -> Self:
        # Field-wise sum, returned as a new report
        total_docs = self.n_docs + other.n_docs
        total_pages = self.n_pages + other.n_pages
        return ProcessingReport(n_docs=total_docs, n_pages=total_pages)
41
+
42
+
43
class ProcessedDoc(FilesystemDocument):
    # Number of pages of the document
    n_pages: int

    @classmethod
    def from_fs_doc(cls, fs_doc: FilesystemDocument, n_pages: int | None) -> Self:
        # A document with an unknown page count is counted as a single page
        if n_pages is None:
            n_pages = 1
        return cls(n_pages=n_pages, **fs_doc.model_dump())
50
+
51
+
52
class ErrorReport(DatashareModel):
    # Document whose extraction reported errors
    doc: ProcessedDoc
    # Status and errors taken from the extraction result
    status: Status
    errors: list[Error] = []
56
+
57
+
58
class MarkdownExtractResponse(DatashareModel):
    # Counts over all processed documents
    processed: ProcessingReport = Field(default_factory=ProcessingReport)
    # Counts over the documents extracted without errors
    successes: ProcessingReport = Field(default_factory=ProcessingReport)
    # One report per document whose extraction had errors
    errors: list[ErrorReport] = Field(default_factory=list)

    @classmethod
    def from_responses(cls, *responses: Self) -> Self:
        # Merge responses: counts are summed, error reports are concatenated
        # in the order of the responses
        processed = ProcessingReport()
        successes = ProcessingReport()
        errors = []
        for response in responses:
            processed = processed + response.processed
            successes = successes + response.successes
            errors.extend(response.errors)
        return cls(processed=processed, successes=successes, errors=errors)
@@ -0,0 +1,66 @@
1
+ import asyncio
2
+ import logging
3
+ from datetime import timedelta
4
+ from enum import StrEnum
5
+
6
+ from temporalio import workflow
7
+
8
+ with workflow.unsafe.imports_passed_through():
9
+ from datashare_python.utils import WorkflowWithProgress, execute_activity
10
+
11
+ from .activities import MarkdownExtract
12
+ from .objects import MarkdownExtractArgs, MarkdownExtractResponse
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class TaskQueues(StrEnum):
    # NOTE(review): the extract queue names below differ from the ones in
    # extract_worker.constants.TaskQueue ("extract.gpu" / "extract.cpu"); the
    # workflow in this module only uses IO from this enum and gets the extract
    # queue from TorchDevice.md_extract_queue. Consider consolidating.
    EXTRACT_GPU = "extract.extract-gpu"
    EXTRACT_CPU = "extract.extract-cpu"
    IO = "extract.io"
    WORKFLOWS = "datashare.workflows"
22
+
23
+
24
@workflow.defn(name="extract.markdown-content")
class ExtractMarkdownContentWorkflow(WorkflowWithProgress):
    """Extract the Markdown content of a project's documents, batch by batch."""

    @workflow.run
    async def run(self, args: MarkdownExtractArgs) -> MarkdownExtractResponse:
        """Create the document batches, extract each of them and merge the reports."""
        # The worker config tells which device, hence which queue, runs extraction
        worker_config = await execute_activity(
            MarkdownExtract.extract_worker_config,
            task_queue=TaskQueues.IO,
            start_to_close_timeout=timedelta(hours=1),
        )

        # Split the documents into batches of roughly constant number of pages
        logger.info("creating context extraction batches...")
        heartbeat_timeout = timedelta(seconds=30)
        extract_batches = await execute_activity(
            MarkdownExtract.create_markdown_extract_batches,
            args=[args.project, args.docs, args.config],
            task_queue=TaskQueues.IO,
            start_to_close_timeout=timedelta(hours=6),
            heartbeat_timeout=heartbeat_timeout,
        )

        # Run one extraction activity per batch, all awaited together
        task_queue = worker_config.device.md_extract_queue(args.config.pipeline)
        batch_extractions = [
            execute_activity(
                MarkdownExtract.extract_markdown_content,
                args=(batch, args.project, args.config),
                task_queue=task_queue,
                start_to_close_timeout=timedelta(hours=12),
                heartbeat_timeout=heartbeat_timeout,
            )
            for batch in extract_batches
        ]
        responses = await asyncio.gather(*batch_extractions)
        return MarkdownExtractResponse.from_responses(*responses)


# Workflows referenced by the "datashare.workflows" entry point
WORKFLOWS = [ExtractMarkdownContentWorkflow]