extract-python 0.4.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,41 +1,23 @@
1
- from .objects import InputDoc, OutputFormat, Status
2
- from .pipeline import Pipeline, PipelineConfig, PipelineType
3
-
4
1
  try:
5
- from .docling_ import (
6
- DOCLING_DEFAULT_ARTIFACTS_PATH,
7
- DoclingPipeline,
8
- DoclingPipelineConfig,
9
- )
2
+ from .docling_ import DOCLING_DEFAULT_ARTIFACTS_PATH, DoclingPipeline
10
3
  except ImportError:
11
- DOCKING_DEFAULT_ARTIFACTS_PATH, DoclingPipeline, DoclingPipelineConfig = (
12
- None,
13
- None,
14
- None,
15
- )
4
+ DOCKING_DEFAULT_ARTIFACTS_PATH, DoclingPipeline = None, None
16
5
 
17
6
  try:
18
- from .marker_ import MarkerPipeline, MarkerPipelineConfig
7
+ from .marker_ import MarkerPipeline
19
8
  except ImportError:
20
- MarkerPipeline, MarkerPipelineConfig = None, None
9
+ MarkerPipeline = None
21
10
 
22
11
 
23
12
  try:
24
- from .miner_u import MinerUPipeline, MinerUPipelineConfig
13
+ from .miner_u import MinerUPipeline
25
14
  except ImportError:
26
- MinerUPipeline, MinerUPipelineConfig = None, None
15
+ MinerUPipeline = None
27
16
 
28
17
 
29
18
  __all__ = [
30
19
  "DoclingPipeline",
31
- "DoclingPipelineConfig",
32
- "InputDoc",
33
20
  "DOCLING_DEFAULT_ARTIFACTS_PATH",
34
21
  "MarkerPipeline",
35
- "MarkerPipelineConfig",
36
- "OutputFormat",
37
- "Pipeline",
38
- "PipelineType",
39
- "PipelineConfig",
40
- "Status",
22
+ "MinerUPipeline",
41
23
  ]
@@ -1,6 +1,2 @@
1
1
  ARTIFACTS = "artifacts"
2
- CPU_GROUP = "cpu"
3
- MINER_U_GROUP = "miner-u"
4
- EXTRACT_CONTENT_TASK = "extract-content"
5
- EXTRACT_CONTENT_MINER_U_TASK = "extract-content-miner-u"
6
2
  DEFAULT_MD_PAGE_SEP = '<div style="page-break-after: always;"></div>'
@@ -1,190 +1,42 @@
1
- import importlib
2
1
  import shutil
3
2
  import tempfile
4
3
  from collections.abc import AsyncGenerator, Iterable, Iterator
5
- from functools import cache
6
4
  from pathlib import Path
7
- from typing import Annotated, Any, ClassVar, Self, TypeVar, get_type_hints
8
5
 
9
- from docling.backend.abstract_backend import AbstractDocumentBackend
10
- from docling.datamodel.backend_options import BackendOptions, BaseBackendOptions
11
-
12
- # Data model import are quick it's ok to leave it there
13
- from docling.datamodel.base_models import FormatToExtensions, InputFormat
6
+ from docling.datamodel.base_models import InputFormat
14
7
  from docling.datamodel.document import ConversionResult
15
- from docling.datamodel.pipeline_options import (
16
- EasyOcrOptions,
17
- PdfPipelineOptions,
18
- PipelineOptions,
19
- ThreadedPdfPipelineOptions,
20
- )
21
- from docling.document_converter import DocumentConverter, FormatOption
22
- from docling.pipeline.base_pipeline import BasePipeline
8
+ from docling.document_converter import DocumentConverter
23
9
 
24
10
  # TODO: this is long to load improve it
25
11
  from docling_core.types.doc import ImageRefMode
26
12
  from docling_core.types.io import DocumentStream
27
- from icij_common.pydantic_utils import to_lower_snake_case
28
- from icij_common.registrable import FromConfig
29
- from pydantic import (
30
- AfterValidator,
31
- BeforeValidator,
32
- Field,
33
- PlainSerializer,
34
- WrapSerializer,
35
- model_validator,
36
- )
37
- from pydantic_core.core_schema import SerializerFunctionWrapHandler
38
-
39
- from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
40
- from .objects import (
13
+ from extract_core import (
14
+ DoclingFormatOption,
15
+ DoclingPipelineConfig,
41
16
  Error,
42
17
  InputDoc,
43
18
  MarkdownDoc,
44
19
  OutputFormat,
45
20
  PageIndexes,
21
+ Pipeline,
22
+ PipelineType,
46
23
  Result,
47
24
  Status,
48
- SupportedExt,
49
25
  )
50
- from .pipeline import Pipeline, PipelineConfig, PipelineType
51
- from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_dirname
52
-
53
- DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
54
-
55
-
56
- def _validate_pipeline_opts(v: "PipelineOptions") -> None:
57
- if isinstance(v, PdfPipelineOptions) and not v.generate_picture_images:
58
- msg = "generate_picture_images should be set to true"
59
- raise ValueError(msg)
60
- return v
61
-
62
-
63
- T = TypeVar("T")
64
-
65
-
66
- def _find_subcls(cls: type[T], name: str) -> type[T]:
67
- # Check if the class available
68
- for c in all_subclasses(cls):
69
- if c.__name__ == name:
70
- return c
71
- # Then apply ad-hoc search
72
- if "pipeline" in cls.__name__.lower():
73
- module_name = f"docling.pipeline.{to_lower_snake_case(name)}"
74
- try:
75
- module = importlib.import_module(module_name)
76
- return getattr(module, name)
77
- except (ModuleNotFoundError, AttributeError):
78
- pass
79
- raise ValueError(f"unknown {cls.__name__} subclass {name}")
80
-
81
-
82
- def _find_init_arg_type(cls: type[Any], arg: str) -> type:
83
- hints = get_type_hints(cls.__init__)
84
- return hints[arg]
85
-
86
-
87
- def _resolve_pipeline_cls(v: Any) -> Any:
88
- if isinstance(v, str):
89
- return _find_subcls(BasePipeline, v)
90
- return v
91
-
92
-
93
- def _ser_class_as_str(v: Any) -> Any:
94
- if isinstance(v, type):
95
- return v.__name__
96
- return v
97
-
98
-
99
- def _ser_with_backend_option_kind(
100
- v: Any, handler: SerializerFunctionWrapHandler
101
- ) -> Any:
102
- serialized = handler(v)
103
- if isinstance(v, BaseBackendOptions):
104
- kind = getattr(v, "kind", None)
105
- if kind is not None:
106
- serialized["kind"] = kind
107
- return serialized
108
-
109
-
110
- def _resolve_backend(v: Any) -> Any:
111
- if isinstance(v, str):
112
- return _find_subcls(AbstractDocumentBackend, v)
113
- return v
114
-
115
-
116
- class DoclingFormatOption(FormatOption):
117
- pipeline_cls: Annotated[
118
- str | type[BasePipeline],
119
- BeforeValidator(_resolve_pipeline_cls),
120
- PlainSerializer(_ser_class_as_str),
121
- ]
122
- pipeline_options: Annotated[
123
- dict | PipelineOptions | None, AfterValidator(_validate_pipeline_opts)
124
- ] = None
125
- backend: Annotated[
126
- str | type[AbstractDocumentBackend],
127
- BeforeValidator(_resolve_backend),
128
- PlainSerializer(_ser_class_as_str),
129
- ]
130
- backend_options: Annotated[
131
- BackendOptions | None, WrapSerializer(_ser_with_backend_option_kind)
132
- ] = None
133
-
134
- @model_validator(mode="after")
135
- def _resolve_pipeline_options(self) -> Self:
136
- if isinstance(self.pipeline_options, dict):
137
- option_cls = _find_init_arg_type(self.pipeline_cls, "pipeline_options")
138
- self.pipeline_options = option_cls.model_validate(self.pipeline_options)
139
- return self
140
-
141
-
142
- @cache
143
- def _default_format_opts() -> dict[InputFormat, DoclingFormatOption]:
144
- from docling.backend.docling_parse_backend import ( # noqa: PLC0415
145
- DoclingParseDocumentBackend,
146
- )
147
- from docling.pipeline.standard_pdf_pipeline import ( # noqa: PLC0415
148
- StandardPdfPipeline,
149
- )
150
-
151
- return {
152
- InputFormat.PDF: DoclingFormatOption(
153
- pipeline_cls=StandardPdfPipeline,
154
- backend=DoclingParseDocumentBackend,
155
- pipeline_options=ThreadedPdfPipelineOptions(
156
- ocr_options=EasyOcrOptions(), generate_picture_images=True
157
- ),
158
- ),
159
- }
160
-
161
-
162
- class DoclingPipelineConfig(PipelineConfig):
163
- pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.DOCLING)
26
+ from icij_common.registrable import FromConfig
164
27
 
165
- format_options: dict[InputFormat, DoclingFormatOption | FormatOption] = Field(
166
- default_factory=_default_format_opts
167
- )
28
+ from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
29
+ from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
168
30
 
169
- @classmethod
170
- @cache
171
- def supported_exts(cls) -> set[SupportedExt]:
172
- unsupported = {InputFormat.AUDIO, InputFormat.METS_GBS, InputFormat.VTT}
173
- supported = set()
174
- for f in InputFormat:
175
- if f in unsupported:
176
- continue
177
- for ext in FormatToExtensions[f]:
178
- supported.add(SupportedExt(f".{ext.lower()}"))
179
- return supported
31
+ DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
180
32
 
181
33
 
182
34
  @Pipeline.register(PipelineType.DOCLING)
183
35
  class DoclingPipeline(Pipeline):
184
36
  def __init__(
185
- self, format_options: dict["InputFormat", "FormatOption"] | None = None
37
+ self, format_options: dict["InputFormat", DoclingFormatOption] | None = None
186
38
  ):
187
-
39
+ format_options = {k: v.to_docling() for k, v in format_options.items()}
188
40
  allowed_format = [
189
41
  f.to_docling() for f in DoclingPipelineConfig.supported_exts()
190
42
  ]
extract_python/marker_.py CHANGED
@@ -5,10 +5,8 @@ from functools import cache
5
5
  from pathlib import Path
6
6
  from typing import TYPE_CHECKING, Any, ClassVar, Self
7
7
 
8
- from pydantic import Field
9
-
10
- from .constants import ARTIFACTS
11
- from .objects import (
8
+ from extract_core import BasePipelineConfig, Pipeline, PipelineType
9
+ from extract_core.objects import (
12
10
  InputDoc,
13
11
  MarkdownDoc,
14
12
  OutputFormat,
@@ -17,7 +15,9 @@ from .objects import (
17
15
  Status,
18
16
  SupportedExt,
19
17
  )
20
- from .pipeline import Pipeline, PipelineConfig, PipelineType
18
+ from pydantic import Field
19
+
20
+ from .constants import ARTIFACTS
21
21
  from .utils import path_to_artifacts_dirname, report_recoverable_errors
22
22
 
23
23
  if TYPE_CHECKING:
@@ -25,10 +25,10 @@ if TYPE_CHECKING:
25
25
  from PIL import Image
26
26
 
27
27
 
28
- class MarkerPipelineConfig(PipelineConfig):
28
+ class MarkerPipelineConfig(BasePipelineConfig):
29
29
  pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
30
30
 
31
- config: dict[str, Any] = dict()
31
+ config: dict[str, Any] = Field(default_factory=dict)
32
32
 
33
33
  @classmethod
34
34
  @cache
extract_python/miner_u.py CHANGED
@@ -1,96 +1,32 @@
1
1
  import json
2
2
  import shutil
3
3
  from collections.abc import AsyncGenerator, Callable, Iterable
4
- from copy import copy
5
- from enum import StrEnum
6
- from functools import cache, partial
4
+ from functools import partial
7
5
  from pathlib import Path
8
6
  from tempfile import TemporaryDirectory
9
- from typing import Any, ClassVar, Self
7
+ from typing import Self
10
8
 
11
- from pydantic import Field
12
- from pydantic_extra_types.language_code import LanguageAlpha2
13
-
14
- from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
15
- from .objects import (
16
- BaseModel,
9
+ from extract_core import (
17
10
  ConversionOutput,
18
11
  InputDoc,
12
+ MinerUBackend,
13
+ MinerUConfig,
14
+ MinerUPipelineConfig,
19
15
  OutputFormat,
20
16
  PageIndexes,
17
+ Pipeline,
18
+ PipelineType,
21
19
  Result,
22
20
  Status,
23
- SupportedExt,
24
21
  )
25
- from .pipeline import Pipeline, PipelineConfig, PipelineType
22
+
23
+ from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
26
24
  from .utils import path_to_artifacts_dirname
27
25
 
28
26
  _MINER_U_CONVERSION_ERRORS = tuple()
29
27
  MDMakeFunction = Callable[[list, str, str], str | None]
30
28
 
31
29
 
32
- class MinerUBackend(StrEnum):
33
- PIPELINE = "pipeline"
34
- VLM = "vlm"
35
-
36
-
37
- class MinerUConfig(BaseModel):
38
- backend: MinerUBackend = MinerUBackend.PIPELINE
39
- enable_formula_extraction: bool = True
40
- enable_table_extraction: bool = True
41
- # TODO: use enum or literal here
42
- parse_method: str = "auto"
43
-
44
- def as_parse_kwargs(self) -> dict[str, Any]:
45
- kwargs = copy(self._get_default_kwargs())
46
- kwargs["backend"] = self.backend
47
- kwargs["parse_method"] = self.parse_method
48
- kwargs["formula_enable"] = self.enable_formula_extraction
49
- kwargs["table_enable"] = self.enable_table_extraction
50
- return kwargs
51
-
52
- @classmethod
53
- @cache
54
- def _get_default_kwargs(cls) -> dict[str, Any]:
55
- from mineru.utils.enum_class import MakeMode # noqa: PLC0415
56
-
57
- return {
58
- "server_url": None,
59
- # We don't dump md directly we process, we dump the middle json in order
60
- # to be able to get page indexes
61
- "parse_method": "auto",
62
- "dump_md": False,
63
- "dump_middle_json": True,
64
- "f_draw_layout_bbox": False,
65
- "f_draw_span_bbox": False,
66
- "f_dump_model_output": False, # might be useful for debug though
67
- "f_dump_orig_pdf": False,
68
- "f_dump_content_list": False, # might be useful for debug though
69
- "start_page_id": 0,
70
- "f_make_md_mode": MakeMode.MM_MD,
71
- "image_analysis": True,
72
- "end_page_id": None,
73
- "client_side_output_generation": False,
74
- }
75
-
76
-
77
- class MinerUPipelineConfig(PipelineConfig): # noqa: F821
78
- pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MINER_U)
79
-
80
- config: MinerUConfig = Field(frozen=True, default=MinerUConfig())
81
- language: LanguageAlpha2 = Field(frozen=True, default="en")
82
-
83
- @classmethod
84
- @cache
85
- def supported_exts(cls) -> set[SupportedExt]:
86
- return {
87
- SupportedExt.PDF,
88
- SupportedExt.DOCX,
89
- SupportedExt.PPTX,
90
- SupportedExt.XLSX,
91
- }
92
-
93
-
94
30
  @Pipeline.register(PipelineType.MINER_U)
95
31
  class MinerUPipeline(Pipeline):
96
32
  def __init__(self, config: MinerUConfig, language: str):
extract_python/utils.py CHANGED
@@ -6,26 +6,20 @@ from itertools import tee
6
6
  from pathlib import Path, PurePath
7
7
  from typing import Protocol, TypeVar
8
8
 
9
- from .objects import Error, InputDoc, Result, Status
9
+ from extract_core import Error, InputDoc, Result, Status
10
10
 
11
11
  R = TypeVar("R")
12
- T = TypeVar("T")
12
+ In = TypeVar("In")
13
13
 
14
14
 
15
15
  def map_and_preserve(
16
- fn: Callable[[Iterable[T]], Iterator[R]], inputs: Iterable[T]
17
- ) -> tuple[Iterable[T], Iterator[R]]:
16
+ fn: Callable[[Iterable[In]], Iterator[R]], inputs: Iterable[In]
17
+ ) -> tuple[Iterable[In], Iterator[R]]:
18
18
  save_inputs, function_inputs = tee(inputs)
19
19
  outputs = iter(fn(function_inputs))
20
20
  return save_inputs, outputs
21
21
 
22
22
 
23
- def all_subclasses(cls: type[T]) -> set[type[T]]:
24
- return set(cls.__subclasses__()).union(
25
- [s for c in cls.__subclasses__() for s in all_subclasses(c)]
26
- )
27
-
28
-
29
23
  def path_to_artifacts_dirname(path: PurePath, sep: str = "_") -> str:
30
24
  dirname = f"{path.name[: -len(path.suffix)]}"
31
25
  ext = path.suffix
@@ -1,12 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.4.2
3
+ Version: 0.5.4
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
7
7
  Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
8
8
  Author-email: Clément Doumouro <cdoumouro@icij.org>
9
9
  Requires-Python: <3.14,>=3.11
10
+ Requires-Dist: extract-core~=0.1
10
11
  Requires-Dist: icij-common~=0.8.2
11
12
  Provides-Extra: benches
12
13
  Requires-Dist: html2image~=2.0.7; extra == 'benches'
@@ -21,4 +22,5 @@ Provides-Extra: mineru
21
22
  Requires-Dist: mineru[mlx]~=3.2; (sys_platform == 'darwin') and extra == 'mineru'
22
23
  Requires-Dist: mineru[pipeline,vlm]~=3.2; extra == 'mineru'
23
24
  Requires-Dist: pydantic-extra-types[pycountry]~=2.11; extra == 'mineru'
25
+ Requires-Dist: python-pptx~=1.0; extra == 'mineru'
24
26
  Requires-Dist: six~=1.17; extra == 'mineru'
@@ -0,0 +1,9 @@
1
+ extract_python/__init__.py,sha256=CrqmcyLwD2JgtQNuGRIQ8wr1cWdlKkgMlCz_2reaPJo,470
2
+ extract_python/constants.py,sha256=Hxl2Bc-GJX71gjFgj3U7tRRA_nHhneoT6YPOx9CjYsc,94
3
+ extract_python/docling_.py,sha256=C4WP1AJrvS2n-KytlGc_1CShjdTGM077I6b9tvw4NhY,4727
4
+ extract_python/marker_.py,sha256=mLJA1m9G4JQtBs1wz8rmshdbaH81DhIwkRzDKZPJH8A,5058
5
+ extract_python/miner_u.py,sha256=jjHqHx7-2w0LSxYNcjvgWoLDTXsv_y1eeyteSfXqjk4,5771
6
+ extract_python/utils.py,sha256=NiYf65iCF7QO4loh7u4t38Ww3eVJUdBpWStL4eX_DqE,1781
7
+ extract_python-0.5.4.dist-info/METADATA,sha256=4EHPqAxM-8FnZ_Tco8QFpzJqCvNwe58ul_tO0C9aDN0,1216
8
+ extract_python-0.5.4.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
9
+ extract_python-0.5.4.dist-info/RECORD,,
extract_python/objects.py DELETED
@@ -1,323 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import logging
4
- import os
5
- import traceback
6
- import uuid
7
- from abc import ABC
8
- from enum import StrEnum
9
- from functools import cache
10
- from io import BytesIO
11
- from pathlib import Path
12
- from typing import Annotated, Any, NoReturn, Self
13
-
14
- from icij_common.pydantic_utils import (
15
- icij_config,
16
- merge_configs,
17
- no_enum_values_config,
18
- safe_copy,
19
- )
20
- from pydantic import AfterValidator, RootModel, TypeAdapter
21
- from pydantic import BaseModel as _BaseModel
22
-
23
- try:
24
- from docling.datamodel.base_models import (
25
- ConversionStatus,
26
- ErrorItem,
27
- FormatToExtensions,
28
- InputFormat,
29
- )
30
- from docling.datamodel.document import InputDocument
31
- from docling_core.types.io import DocumentStream
32
- except ImportError:
33
- ConversionStatus, ErrorItem, InputFormat = None, None, None
34
- InputDocument = None
35
- DocumentStream = None
36
-
37
- logger = logging.getLogger(__name__)
38
- base_config = merge_configs(icij_config(), no_enum_values_config())
39
-
40
-
41
- @cache
42
- def _ext_to_docling_input_format() -> dict:
43
- from .docling_ import DoclingPipelineConfig # noqa: PLC0415
44
-
45
- mapping = dict()
46
- supported = DoclingPipelineConfig.supported_exts()
47
- for input_f, exts in FormatToExtensions.items():
48
- for ext in exts:
49
- try:
50
- ext = SupportedExt(f".{ext.lower()}") # noqa: PLW2901
51
- except ValueError:
52
- continue
53
- if ext in supported:
54
- mapping[ext] = input_f
55
- return mapping
56
-
57
-
58
- class BaseModel(_BaseModel):
59
- model_config = base_config
60
-
61
-
62
- class SupportedExt(StrEnum):
63
- ADOC = ".adoc"
64
- ASC = ".asc"
65
- ASCIIDOC = ".asciidoc"
66
- BMP = ".bmp"
67
- CSV = ".csv"
68
- DOC = ".doc"
69
- DOCX = ".docx"
70
- DOTX = ".dotx"
71
- DOTM = ".dotm"
72
- DOCM = ".docm"
73
- EPUB = ".epub"
74
- EML = ".eml"
75
- GIF = ".gif"
76
- HTLM = ".html"
77
- HTM = ".htm"
78
- JPEG = ".jpeg"
79
- JPG = ".jpg"
80
- JSON = ".json"
81
- LATEX = ".latex"
82
- MD = ".md"
83
- NXML = ".nxml"
84
- ODP = ".odp"
85
- ODS = ".ods"
86
- ODT = ".odt"
87
- PDF = ".pdf"
88
- PNG = ".png"
89
- PPSX = ".ppsx"
90
- PPT = ".ppt"
91
- PPTM = ".pptm"
92
- PPSM = ".ppsm"
93
- POTX = ".potx"
94
- POTM = ".potm"
95
- PPTX = ".pptx"
96
- QMD = ".qmd"
97
- RMD = ".rmd"
98
- TEX = ".tex"
99
- TIF = ".tif"
100
- TIFF = ".tiff"
101
- TXT = ".txt"
102
- TEXT = ".text"
103
- WEBP = ".webp"
104
- XBRL = ".xbrl"
105
- XHTML = ".xhtml"
106
- XLS = ".xls"
107
- XLSM = ".xlsm"
108
- XLSX = ".xlsx"
109
- XLTX = ".xltx"
110
- XML = ".xml"
111
-
112
- def to_docling(self) -> InputFormat:
113
- return _ext_to_docling_input_format()[self]
114
-
115
-
116
- class OutputFormat(StrEnum):
117
- MARKDOWN = ".md"
118
-
119
- @property
120
- def suffix(self) -> str:
121
- return self.value[1:]
122
-
123
- def to_marker(self) -> str:
124
- match self:
125
- case OutputFormat.MARKDOWN:
126
- return "markdown"
127
- case _:
128
- raise ValueError(f"{self} is unsupported by marker")
129
-
130
-
131
- class Status(StrEnum):
132
- FAILURE = "failure"
133
- SUCCESS = "success"
134
- PARTIAL_SUCCESS = "partial_success"
135
-
136
- @classmethod
137
- def from_docling(cls, v: Any) -> Self:
138
- from docling.datamodel.base_models import ConversionStatus # noqa: PLC0415
139
-
140
- if v is ConversionStatus.SUCCESS:
141
- return cls.SUCCESS
142
- if v is ConversionStatus.PARTIAL_SUCCESS:
143
- return cls.PARTIAL_SUCCESS
144
- if isinstance(v, ConversionStatus):
145
- return cls.FAILURE
146
- raise TypeError(f"can't convert {v!r} to {cls.__name__!r}")
147
-
148
- @property
149
- def allows_conversion(self) -> bool:
150
- return self is Status.SUCCESS or self is Status.PARTIAL_SUCCESS
151
-
152
-
153
- class Error(BaseModel):
154
- id: str
155
- title: str
156
- detail: str
157
-
158
- @classmethod
159
- def from_exception(cls, exception: BaseException) -> Self:
160
- title = exception.__class__.__name__
161
- trace_lines = traceback.format_exception(
162
- None, value=exception, tb=exception.__traceback__
163
- )
164
- detail = f"{exception}\n{''.join(trace_lines)}"
165
- error_id = f"{_id_title(title)}-{uuid.uuid4().hex}"
166
- error = cls(id=error_id, title=title, detail=detail)
167
- return error
168
-
169
- @classmethod
170
- def from_docling(cls, docling_error: ErrorItem) -> Self:
171
- title = "DoclingConversionError"
172
- error_id = f"{_id_title(title)}-{uuid.uuid4().hex}"
173
- detail = (
174
- f"error in module {docling_error.module_name} of"
175
- f" {docling_error.component_type}:\n{docling_error.error_message}"
176
- )
177
- return cls(id=error_id, title=title, detail=detail)
178
-
179
-
180
- def _id_title(title: str) -> str:
181
- id_title = []
182
- for i, letter in enumerate(title):
183
- if i and letter.isupper():
184
- id_title.append("-")
185
- id_title.append(letter.lower())
186
- return "".join(id_title)
187
-
188
-
189
- class InputDoc(BaseModel):
190
- ext: SupportedExt
191
- path: Path
192
- content: bytes | None = None
193
-
194
- @classmethod
195
- def from_path(cls, path: str | Path) -> Self:
196
- if isinstance(path, str):
197
- path = Path(path)
198
- ext = SupportedExt(path.suffix)
199
- return cls(path=path, ext=ext)
200
-
201
- def to_docling(self) -> Path | DocumentStream:
202
- if self.content is not None:
203
- return DocumentStream(name=str(self.path), stream=BytesIO(self.content))
204
- if not self.path.suffix:
205
- return DocumentStream(
206
- name=str(self.path), stream=BytesIO(self.path.read_bytes())
207
- )
208
- return self.path
209
-
210
- def without_content(self) -> Self:
211
- return safe_copy(self, update={"content": None})
212
-
213
-
214
- class PageIndexes(RootModel[list[tuple[int, int]]]):
215
- # Stores page end index
216
- @classmethod
217
- def from_page_end_indices(cls, lengths: list[int]) -> Self:
218
- return [
219
- ((lengths[p - 1] if p > 0 else 0), lengths[p]) for p in range(len(lengths))
220
- ]
221
-
222
-
223
- class ConversionOutput(BaseModel):
224
- path: Path
225
- pages: PageIndexes = []
226
-
227
-
228
- class MarkdownDoc(ConversionOutput):
229
- @classmethod
230
- @property
231
- @cache
232
- def _valid_conversion_statuses(cls) -> set[ConversionStatus]:
233
- from docling.datamodel.base_models import ConversionStatus # noqa: PLC0415
234
-
235
- return {ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS}
236
-
237
-
238
- def _input_should_not_have_content(value: InputDoc) -> InputDoc:
239
- if value.content is not None:
240
- raise ValueError(f"response input can't have content, but got {value}")
241
- return value
242
-
243
-
244
- class _BaseResult(BaseModel, ABC):
245
- input: InputDoc
246
- status: Status
247
- errors: list[Error] = []
248
-
249
-
250
- class Result(_BaseResult):
251
- # TODO: we could also use generics here when we add more output formats
252
- output: ConversionOutput | None
253
-
254
- def to_response(self) -> ResponseResult:
255
- return ResponseResult(
256
- input=self.input.without_content(),
257
- status=self.status,
258
- errors=self.errors,
259
- output_path=self.output.path,
260
- )
261
-
262
-
263
- class ResponseResult(_BaseResult):
264
- input: Annotated[InputDoc, AfterValidator(func=_input_should_not_have_content)]
265
- output_path: Path
266
-
267
-
268
- class ExtractionResponse(BaseModel):
269
- results: list[ResponseResult]
270
-
271
-
272
- _INPUT_DOCS_ADAPTER = TypeAdapter(list[InputDoc | Path])
273
-
274
-
275
- def parse_extraction_request(
276
- docs: str | list[dict | str], *, data_dir: Path
277
- ) -> list[InputDoc]:
278
- if isinstance(docs, str):
279
- logger.debug("exploring files in %s", data_dir.absolute())
280
- docs_dir = Path(data_dir) / docs
281
- docs = _as_input_docs(docs_dir)
282
- msg = "found %s"
283
- if len(docs) > 10:
284
- msg = msg + ", and more..."
285
- logger.debug("found %s", docs[:10])
286
- return docs
287
- docs = _INPUT_DOCS_ADAPTER.validate_python(docs)
288
- if not docs:
289
- return []
290
- if isinstance(docs[0], Path):
291
- doc_meta = []
292
- unknown_exts = []
293
- for doc in docs:
294
- _, ext = os.path.splitext(str(doc))
295
- if not ext:
296
- unknown_exts.append(doc)
297
- else:
298
- doc_meta.append(InputDoc.from_path(path=doc.relative_to(data_dir)))
299
- if unknown_exts:
300
- raise ValueError(f"found files with unknown extensions {unknown_exts}")
301
- return doc_meta
302
- return docs
303
-
304
-
305
- def _raise(err: OSError) -> NoReturn:
306
- raise err
307
-
308
-
309
- def _as_input_docs(
310
- docs_dir: Path, *, supported_ext: set[str] | None = None
311
- ) -> list[InputDoc]:
312
- if supported_ext is None:
313
- supported_ext = {v.value for v in SupportedExt}
314
- docs = []
315
- for root, _, files in os.walk(docs_dir, onerror=_raise):
316
- root = Path(root) # noqa: PLW2901
317
- for f in files:
318
- ext = Path(f).suffix
319
- if not ext or ext not in supported_ext:
320
- continue
321
- docs.append(InputDoc.from_path(path=root / f))
322
- docs = sorted(docs, key=lambda x: x.path)
323
- return docs
@@ -1,38 +0,0 @@
1
- from abc import ABC, abstractmethod
2
- from collections.abc import AsyncGenerator, Iterable
3
- from enum import StrEnum
4
- from pathlib import Path
5
- from typing import ClassVar
6
-
7
- from icij_common.pydantic_utils import icij_config, merge_configs, no_enum_values_config
8
- from icij_common.registrable import RegistrableConfig, RegistrableFromConfig
9
- from pydantic import Field
10
-
11
- from .objects import InputDoc, OutputFormat, Result, SupportedExt
12
-
13
- StructuredContent = str
14
-
15
-
16
- class PipelineType(StrEnum):
17
- DOCLING = "docling"
18
- MARKER = "marker"
19
- MINER_U = "miner_u"
20
-
21
-
22
- class PipelineConfig(RegistrableConfig, ABC):
23
- # TODO: move this icij_config() to RegistrableConfig
24
- model_config = merge_configs(icij_config(), no_enum_values_config())
25
-
26
- registry_key: ClassVar[str] = Field(frozen=True, default="pipeline")
27
- pipeline: ClassVar[PipelineType]
28
-
29
- @classmethod
30
- @abstractmethod
31
- def supported_exts(cls) -> set[SupportedExt]: ...
32
-
33
-
34
- class Pipeline(RegistrableFromConfig, ABC):
35
- @abstractmethod
36
- async def extract_content(
37
- self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
38
- ) -> AsyncGenerator[Result, None]: ...
@@ -1,11 +0,0 @@
1
- extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
2
- extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
3
- extract_python/docling_.py,sha256=ys2vK4zgpWsPObIZWRFhHM4fNkojMYUa9QRevl8bd3c,9342
4
- extract_python/marker_.py,sha256=ACk9wa-wrEwYv4D7SKW4KjpZxrp2hBIt9_pheRhV0go,5014
5
- extract_python/miner_u.py,sha256=EcTXfdvArkoSw3bKkiWLerYAhXMU6ssJFn9kOsFVDPE,8007
6
- extract_python/objects.py,sha256=MHCUZ9L8LVXlSlHyDMnbuWV1KHWMhUEJQMEDTc9hYD0,8761
7
- extract_python/pipeline.py,sha256=ijQ8wI5x3kAzTfx3T-V52qSoAA_8IA_ihK1NPWVMwFM,1162
8
- extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
9
- extract_python-0.4.2.dist-info/METADATA,sha256=95THYq0jZgY2-1X2s8hDoFEo9_aNeukdHPxlcd8_rmI,1132
10
- extract_python-0.4.2.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
- extract_python-0.4.2.dist-info/RECORD,,