extract-python 0.5.14__tar.gz → 0.6.0__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.5.14
3
+ Version: 0.6.0
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
7
7
  Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
8
8
  Author-email: Clément Doumouro <cdoumouro@icij.org>
9
9
  Requires-Python: <3.14,>=3.11
10
- Requires-Dist: extract-core~=0.5.5
10
+ Requires-Dist: extract-core~=0.6.0
11
11
  Requires-Dist: icij-common~=0.8.2
12
12
  Provides-Extra: benches
13
13
  Requires-Dist: html2image~=2.0.7; extra == 'benches'
@@ -1,11 +1,13 @@
1
1
  import asyncio
2
+ import json
3
+ import logging
2
4
  import shutil
3
5
  import tempfile
4
6
  from collections.abc import AsyncGenerator, Iterable, Iterator
7
+ from functools import partial
5
8
  from pathlib import Path
6
9
  from typing import Any, Self
7
10
 
8
- from docling.datamodel.base_models import InputFormat
9
11
  from docling.datamodel.document import ConversionResult
10
12
  from docling.datamodel.pipeline_options import PipelineOptions
11
13
  from docling.document_converter import DocumentConverter, FormatOption
@@ -15,7 +17,6 @@ from docling_core.types.doc import ImageRefMode
15
17
  from docling_core.types.io import DocumentStream
16
18
  from extract_core import (
17
19
  BaseModel,
18
- Device,
19
20
  DoclingFormatOption,
20
21
  DoclingPipelineConfig,
21
22
  Error,
@@ -29,28 +30,29 @@ from extract_core import (
29
30
  Status,
30
31
  )
31
32
  from icij_common.pydantic_utils import merge_configs
32
- from icij_common.registrable import FromConfig
33
33
  from pydantic import ConfigDict, field_serializer
34
34
  from pydantic_core.core_schema import SerializerFunctionWrapHandler
35
35
 
36
36
  from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
37
37
  from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
38
38
 
39
+ logger = logging.getLogger(__name__)
40
+
39
41
  DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
40
42
 
41
43
 
42
44
  @Pipeline.register(PipelineType.DOCLING)
43
45
  class DoclingPipeline(Pipeline):
44
- def __init__(
45
- self,
46
- format_options: dict["InputFormat", DoclingFormatOption] | None = None,
47
- *,
48
- device: Device = Device.CPU,
49
- ):
50
- super().__init__(device)
46
+ def __init__(self, config: DoclingPipelineConfig):
47
+ super().__init__(config)
51
48
  format_options = {
52
- k: v.to_docling(self._device) for k, v in format_options.items()
49
+ k: v.to_docling(self._device)
50
+ for k, v in self._config.format_options.items()
53
51
  }
52
+ logger.info(
53
+ "resolved format options to: %s",
54
+ lambda: partial(json.dumps, format_options, indent=2),
55
+ )
54
56
  allowed_format = [
55
57
  f.to_docling() for f in DoclingPipelineConfig.supported_exts()
56
58
  ]
@@ -72,15 +74,6 @@ class DoclingPipeline(Pipeline):
72
74
  doc = next(docs)
73
75
  yield _to_result(res, doc, output_format, output_path=output_path)
74
76
 
75
- @classmethod
76
- def _from_config(
77
- cls,
78
- config: DoclingPipelineConfig,
79
- *,
80
- device: Device = Device.CPU,
81
- ) -> FromConfig:
82
- return cls(config.format_options, device=device)
83
-
84
77
 
85
78
  def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
86
79
  for d in docs:
@@ -2,13 +2,10 @@ import asyncio
2
2
  import gc
3
3
  from collections.abc import AsyncGenerator, Iterable
4
4
  from copy import deepcopy
5
- from functools import cache
6
5
  from pathlib import Path
7
- from typing import TYPE_CHECKING, Any, ClassVar, Self
6
+ from typing import TYPE_CHECKING
8
7
 
9
8
  from extract_core import (
10
- BasePipelineConfig,
11
- Device,
12
9
  InputDoc,
13
10
  MarkdownDoc,
14
11
  OutputFormat,
@@ -17,9 +14,7 @@ from extract_core import (
17
14
  PipelineType,
18
15
  Result,
19
16
  Status,
20
- SupportedExt,
21
17
  )
22
- from pydantic import Field
23
18
 
24
19
  from .constants import ARTIFACTS
25
20
  from .utils import path_to_artifacts_dirname, report_recoverable_errors
@@ -29,55 +24,11 @@ if TYPE_CHECKING:
29
24
  from PIL import Image
30
25
 
31
26
 
32
- class MarkerPipelineConfig(BasePipelineConfig):
33
- pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
34
-
35
- config: dict[str, Any] = Field(default_factory=dict)
36
-
37
- @classmethod
38
- @cache
39
- def supported_exts(cls) -> set[SupportedExt]:
40
- # Subset of https://documentation.datalab.to/docs/common/supportedfiletypes
41
- return {
42
- SupportedExt.PDF,
43
- SupportedExt.XLS,
44
- SupportedExt.XLSX,
45
- SupportedExt.XLSM,
46
- SupportedExt.CSV,
47
- SupportedExt.ODS,
48
- SupportedExt.DOC,
49
- SupportedExt.DOCX,
50
- SupportedExt.ODT,
51
- SupportedExt.PPT,
52
- SupportedExt.PPTX,
53
- SupportedExt.ODP,
54
- SupportedExt.HTLM,
55
- SupportedExt.EPUB,
56
- SupportedExt.PNG,
57
- SupportedExt.JPG,
58
- SupportedExt.JPEG,
59
- SupportedExt.WEBP,
60
- SupportedExt.GIF,
61
- SupportedExt.TIFF,
62
- }
63
-
64
-
65
27
  _MARKER_CONVERSION_ERRORS = tuple()
66
28
 
67
29
 
68
30
  @Pipeline.register(PipelineType.MARKER)
69
31
  class MarkerPipeline(Pipeline):
70
- def __init__(
71
- self,
72
- marker_config: dict[str, Any] | None = None,
73
- *,
74
- device: Device = Device.CPU,
75
- ):
76
- super().__init__(device)
77
- if marker_config is None:
78
- marker_config = dict()
79
- self._marker_config = marker_config
80
-
81
32
  async def extract_content(
82
33
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
83
34
  ) -> AsyncGenerator[Result, None]:
@@ -85,7 +36,7 @@ class MarkerPipeline(Pipeline):
85
36
  from marker.converters.pdf import PdfConverter # noqa: PLC0415
86
37
  from marker.models import create_model_dict # noqa: PLC0415
87
38
 
88
- config = deepcopy(self._marker_config)
39
+ config = deepcopy(self._config.config)
89
40
  config["output_format"] = output_format.to_marker()
90
41
  config_parser = ConfigParser(config)
91
42
  renderer = config_parser.get_renderer()
@@ -98,15 +49,6 @@ class MarkerPipeline(Pipeline):
98
49
  for doc in docs:
99
50
  yield await _process_doc(doc, converter, output_format, output_path)
100
51
 
101
- @classmethod
102
- def _from_config(
103
- cls,
104
- config: MarkerPipelineConfig,
105
- *,
106
- device: Device = Device.CPU,
107
- ) -> Self:
108
- return cls(config.config, device=device)
109
-
110
52
 
111
53
  @report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
112
54
  async def _process_doc(
@@ -5,14 +5,11 @@ from collections.abc import AsyncGenerator, Callable, Iterable
5
5
  from functools import partial
6
6
  from pathlib import Path
7
7
  from tempfile import TemporaryDirectory
8
- from typing import Self
9
8
 
10
9
  from extract_core import (
11
10
  ConversionOutput,
12
- Device,
13
11
  InputDoc,
14
12
  MinerUBackend,
15
- MinerUConfig,
16
13
  MinerUPipelineConfig,
17
14
  OutputFormat,
18
15
  PageIndexes,
@@ -31,13 +28,10 @@ MDMakeFunction = Callable[[list, str, str], str | None]
31
28
 
32
29
  @Pipeline.register(PipelineType.MINER_U)
33
30
  class MinerUPipeline(Pipeline):
34
- def __init__(
35
- self, config: MinerUConfig, language: str, *, device: Device = Device.CPU
36
- ):
37
- super().__init__(device)
38
- self._config = config
39
- self._language = language
40
- self._md_make_fn = _parse_md_make_fn(config.backend)
31
+ def __init__(self, config: MinerUPipelineConfig):
32
+ super().__init__(config)
33
+ self._language = self._config.language
34
+ self._md_make_fn = _parse_md_make_fn(self._config.config.backend)
41
35
 
42
36
  async def extract_content(
43
37
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
@@ -59,7 +53,7 @@ class MinerUPipeline(Pipeline):
59
53
  pdf_file_names=pdfs_names,
60
54
  pdf_bytes_list=pdfs_bytes,
61
55
  p_lang_list=p_lang_list,
62
- **self._config.as_parse_kwargs(),
56
+ **self._config.config.as_parse_kwargs(),
63
57
  )
64
58
  res_paths = [
65
59
  _revert_mineru_output(workdir, pdf_filename=p) for p in pdfs_names
@@ -73,15 +67,6 @@ class MinerUPipeline(Pipeline):
73
67
  output_path=output_path,
74
68
  )
75
69
 
76
- @classmethod
77
- def _from_config(
78
- cls,
79
- config: MinerUPipelineConfig,
80
- *,
81
- device: Device = Device.CPU,
82
- ) -> Self:
83
- return cls(config.config, language=config.language, device=device)
84
-
85
70
 
86
71
  def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
87
72
  output_path = output_dir / pdf_filename
@@ -9,7 +9,7 @@ readme = "README.md"
9
9
  requires-python = ">=3.11,<3.14"
10
10
  dependencies = [
11
11
  "icij-common~=0.8.2",
12
- "extract-core~=0.5.5",
12
+ "extract-core~=0.6.0",
13
13
  ]
14
14
 
15
15
  [project.optional-dependencies]
@@ -51,14 +51,28 @@ override-dependencies = [
51
51
  "pillow==11.3.0",
52
52
  ]
53
53
 
54
+ [[tool.uv.index]]
55
+ name = "pytorch-cpu"
56
+ url = "https://download.pytorch.org/whl/cpu"
57
+ explicit = true
58
+
59
+
54
60
  [tool.uv.sources]
55
61
  extract-core = { path = "../extract-core", editable = true }
62
+ torch = [
63
+ { index = "pytorch-cpu" },
64
+ ]
65
+ torchvision = [
66
+ { index = "pytorch-cpu" },
67
+ ]
56
68
 
57
69
  [dependency-groups]
58
70
  dev = [
59
71
  "pytest~=8.3.5",
60
72
  "pytest-asyncio~=0.25.3",
61
73
  "ruff==0.15.2",
74
+ "torch==2.12.0",
75
+ "torchvision==0.27.0",
62
76
  ]
63
77
 
64
78
  [project.urls]