extract-python 0.5.5__tar.gz → 0.5.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.5.5
3
+ Version: 0.5.7
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
7
7
  Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
8
8
  Author-email: Clément Doumouro <cdoumouro@icij.org>
9
9
  Requires-Python: <3.14,>=3.11
10
- Requires-Dist: extract-core~=0.1
10
+ Requires-Dist: extract-core~=0.5.5
11
11
  Requires-Dist: icij-common~=0.8.2
12
12
  Provides-Extra: benches
13
13
  Requires-Dist: html2image~=2.0.7; extra == 'benches'
@@ -1,16 +1,20 @@
1
+ import asyncio
1
2
  import shutil
2
3
  import tempfile
3
4
  from collections.abc import AsyncGenerator, Iterable, Iterator
4
5
  from pathlib import Path
6
+ from typing import Any, Self
5
7
 
6
8
  from docling.datamodel.base_models import InputFormat
7
9
  from docling.datamodel.document import ConversionResult
8
- from docling.document_converter import DocumentConverter
10
+ from docling.datamodel.pipeline_options import PipelineOptions
11
+ from docling.document_converter import DocumentConverter, FormatOption
9
12
 
10
13
  # TODO: this is slow to load, improve it
11
14
  from docling_core.types.doc import ImageRefMode
12
15
  from docling_core.types.io import DocumentStream
13
16
  from extract_core import (
17
+ BaseModel,
14
18
  DoclingFormatOption,
15
19
  DoclingPipelineConfig,
16
20
  Error,
@@ -23,7 +27,10 @@ from extract_core import (
23
27
  Result,
24
28
  Status,
25
29
  )
30
+ from icij_common.pydantic_utils import merge_configs
26
31
  from icij_common.registrable import FromConfig
32
+ from pydantic import ConfigDict, field_serializer
33
+ from pydantic_core.core_schema import SerializerFunctionWrapHandler
27
34
 
28
35
  from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
29
36
  from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
@@ -49,7 +56,13 @@ class DoclingPipeline(Pipeline):
49
56
  ) -> AsyncGenerator[Result, None]:
50
57
  docs, path_or_streams = map_and_preserve(_to_docling, docs)
51
58
  outputs = self._converter.convert_all(path_or_streams, raises_on_error=False)
52
- for doc, res in zip(docs, outputs, strict=True):
59
+
60
+ sentinel = object()
61
+ while True:
62
+ res = await asyncio.to_thread(next, outputs, sentinel)
63
+ if res is sentinel:
64
+ return
65
+ doc = next(docs)
53
66
  yield _to_result(res, doc, output_format, output_path=output_path)
54
67
 
55
68
  @classmethod
@@ -128,3 +141,52 @@ def _to_markdown_doc(
128
141
  shutil.move(tmp_dir, md_dir)
129
142
  pages = PageIndexes.from_page_end_indices(end_indices)
130
143
  return MarkdownDoc(path=Path(md_dir_name), pages=pages)
144
+
145
+
146
class SerializableFormatOptions(DoclingFormatOption):
    # Utility class to serialize Python format options into JSON which can be
    # correctly deserialized into a docling FormatOption
    # via DoclingFormatOption.to_docling
    model_config = merge_configs(
        BaseModel.model_config, ConfigDict(polymorphic_serialization=True)
    )

    # Overrides the inherited field so that a docling PipelineOptions instance
    # can be stored directly and serialized by _serialize_pipeline_opts below.
    pipeline_options: PipelineOptions | None = None

    @classmethod
    def from_docling(cls, format_opts: FormatOption) -> Self:
        """Build a serializable copy of a docling ``FormatOption``.

        The pipeline and backend classes are recorded by their class name
        (``__name__``); the pipeline and backend options are passed through
        unchanged.

        Args:
            format_opts: the docling format option to convert.

        Returns:
            A new instance of this class mirroring ``format_opts``.
        """
        return cls(
            pipeline_cls=format_opts.pipeline_cls.__name__,
            pipeline_options=format_opts.pipeline_options,
            backend=format_opts.backend.__name__,
            backend_options=format_opts.backend_options,
        )

    @field_serializer("pipeline_options", mode="wrap")
    def _serialize_pipeline_opts(
        self, v: PipelineOptions | None, handler: SerializerFunctionWrapHandler
    ) -> Any:
        """Serialize ``pipeline_options`` and add each sub-option's ``kind``.

        The default serialization is produced by ``handler``; then, for every
        known nested options attribute that is set on ``v``, the attribute's
        ``kind`` value is written into the matching nested dict (created when
        absent from the default output).

        NOTE(review): ``kind`` is presumably the discriminator needed to
        rebuild the right options class on deserialization - confirm against
        ``DoclingFormatOption.to_docling``.

        Args:
            v: the pipeline options to serialize, or ``None``.
            handler: pydantic's default serializer for the field.

        Returns:
            The serialized value, with ``kind`` entries added when ``v`` is
            not ``None``.
        """
        serialized = handler(v)
        if v is None:
            return serialized
        # A single loop keeps the attribute name and the dict key identical;
        # the former hand-written stanza for layout options wrote to a
        # misspelled "layout_opts" key and raised KeyError.
        nested_option_fields = (
            "picture_description_options",
            "ocr_options",
            "layout_options",
            "table_structure_options",
        )
        for field_name in nested_option_fields:
            nested_opts = getattr(v, field_name, None)
            if nested_opts is None:
                continue
            serialized.setdefault(field_name, {})["kind"] = nested_opts.kind
        return serialized
@@ -1,3 +1,4 @@
1
+ import asyncio
1
2
  import gc
2
3
  from collections.abc import AsyncGenerator, Iterable
3
4
  from copy import deepcopy
@@ -86,7 +87,7 @@ class MarkerPipeline(Pipeline):
86
87
  renderer=renderer,
87
88
  )
88
89
  for doc in docs:
89
- yield _process_doc(doc, converter, output_format, output_path)
90
+ yield await _process_doc(doc, converter, output_format, output_path)
90
91
 
91
92
  @classmethod
92
93
  def _from_config(cls, config: MarkerPipelineConfig) -> Self:
@@ -94,7 +95,7 @@ class MarkerPipeline(Pipeline):
94
95
 
95
96
 
96
97
  @report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
97
- def _process_doc(
98
+ async def _process_doc(
98
99
  doc: InputDoc,
99
100
  converter: "PdfConverter",
100
101
  output_format: OutputFormat,
@@ -102,7 +103,7 @@ def _process_doc(
102
103
  ) -> Result:
103
104
  from marker.output import text_from_rendered # noqa: PLC0415
104
105
 
105
- rendered = converter(str(doc.path))
106
+ rendered = await asyncio.to_thread(converter, str(doc.path))
106
107
  content, _, images = text_from_rendered(rendered)
107
108
  match output_format:
108
109
  case OutputFormat.MARKDOWN:
@@ -9,7 +9,7 @@ readme = "README.md"
9
9
  requires-python = ">=3.11,<3.14"
10
10
  dependencies = [
11
11
  "icij-common~=0.8.2",
12
- "extract-core~=0.1",
12
+ "extract-core~=0.5.5",
13
13
  ]
14
14
 
15
15
  [project.optional-dependencies]
@@ -871,12 +871,14 @@ requires-dist = [
871
871
  [package.metadata.requires-dev]
872
872
  dev = [
873
873
  { name = "docling", specifier = "~=2.96" },
874
- { name = "huggingface-hub", specifier = "~=0.36.2" },
874
+ { name = "docling-slim", extras = ["models-vlm-inline"], specifier = "~=2.96" },
875
875
  { name = "mpmath", specifier = "~=1.3.0" },
876
876
  { name = "pytest", specifier = "~=8.3.5" },
877
877
  { name = "pytest-asyncio", specifier = "~=0.25.3" },
878
878
  { name = "ruff", specifier = "==0.15.2" },
879
879
  { name = "sympy", specifier = "~=1.14.0" },
880
+ { name = "torch", specifier = "==2.12.0", index = "https://download.pytorch.org/whl/cpu" },
881
+ { name = "torchvision", specifier = "==0.27.0", index = "https://download.pytorch.org/whl/cpu" },
880
882
  ]
881
883
 
882
884
  [[package]]
File without changes