extract-python 0.2.1__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27) hide show
  1. {extract_python-0.2.1 → extract_python-0.3.1}/PKG-INFO +1 -1
  2. extract_python-0.3.1/extract_python/docling_.py +203 -0
  3. {extract_python-0.2.1 → extract_python-0.3.1}/extract_python/marker_.py +17 -11
  4. {extract_python-0.2.1 → extract_python-0.3.1}/extract_python/miner_u.py +43 -30
  5. {extract_python-0.2.1 → extract_python-0.3.1}/extract_python/objects.py +43 -2
  6. {extract_python-0.2.1 → extract_python-0.3.1}/extract_python/pipeline.py +1 -1
  7. {extract_python-0.2.1 → extract_python-0.3.1}/pyproject.toml +1 -0
  8. {extract_python-0.2.1 → extract_python-0.3.1}/uv.lock +28 -2
  9. extract_python-0.2.1/extract_python/docling_.py +0 -258
  10. {extract_python-0.2.1 → extract_python-0.3.1}/.dockerignore +0 -0
  11. {extract_python-0.2.1 → extract_python-0.3.1}/.github/workflows/publish.yml +0 -0
  12. {extract_python-0.2.1 → extract_python-0.3.1}/.github/workflows/tests.yml +0 -0
  13. {extract_python-0.2.1 → extract_python-0.3.1}/.gitignore +0 -0
  14. {extract_python-0.2.1 → extract_python-0.3.1}/.python-version +0 -0
  15. {extract_python-0.2.1 → extract_python-0.3.1}/Dockerfile +0 -0
  16. {extract_python-0.2.1 → extract_python-0.3.1}/README.md +0 -0
  17. {extract_python-0.2.1 → extract_python-0.3.1}/benches/__init__.py +0 -0
  18. {extract_python-0.2.1 → extract_python-0.3.1}/benches/compare.ipynb +0 -0
  19. {extract_python-0.2.1 → extract_python-0.3.1}/benches/compare.py +0 -0
  20. {extract_python-0.2.1 → extract_python-0.3.1}/benches/constants.py +0 -0
  21. {extract_python-0.2.1 → extract_python-0.3.1}/data/.gitignore +0 -0
  22. {extract_python-0.2.1 → extract_python-0.3.1}/docker-compose.yml +0 -0
  23. {extract_python-0.2.1 → extract_python-0.3.1}/extract +0 -0
  24. {extract_python-0.2.1 → extract_python-0.3.1}/extract_python/__init__.py +0 -0
  25. {extract_python-0.2.1 → extract_python-0.3.1}/extract_python/constants.py +0 -0
  26. {extract_python-0.2.1 → extract_python-0.3.1}/extract_python/utils.py +0 -0
  27. {extract_python-0.2.1 → extract_python-0.3.1}/qa/ruff.toml +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.2.1
3
+ Version: 0.3.1
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
@@ -0,0 +1,203 @@
1
+ import shutil
2
+ import tempfile
3
+ from collections.abc import AsyncGenerator, Iterable, Iterator
4
+ from functools import cache
5
+ from pathlib import Path
6
+ from typing import TYPE_CHECKING, Annotated, ClassVar, TypeVar
7
+
8
+ from icij_common.registrable import FromConfig
9
+ from pydantic import AfterValidator, Field
10
+
11
+ from .constants import ARTIFACTS, CPU_GROUP, DEFAULT_MD_PAGE_SEP
12
+ from .objects import (
13
+ Error,
14
+ InputDoc,
15
+ MarkdownDoc,
16
+ OutputFormat,
17
+ PageIndexes,
18
+ Result,
19
+ Status,
20
+ SupportedExt,
21
+ )
22
+ from .pipeline import Pipeline, PipelineConfig, PipelineType
23
+ from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_dirname
24
+
25
+ DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
26
+
27
+ if TYPE_CHECKING:
28
+ from docling.datamodel.base_models import InputFormat
29
+ from docling.datamodel.pipeline_options import PipelineOptions
30
+ from docling.document_converter import ConversionResult, FormatOption
31
+ from docling_core.types.io import DocumentStream
32
+
33
+
34
+ def _validate_pipeline_opts(opts: "PipelineOptions") -> None:
35
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
36
+
37
+ if isinstance(opts, PdfPipelineOptions) and not opts.generate_picture_images:
38
+ msg = "generate_picture_images should be set to true"
39
+ raise ValueError(msg)
40
+
41
+
42
+ def _validate_options(
43
+ data: dict["InputFormat", "FormatOption"],
44
+ ) -> dict["InputFormat", "FormatOption"]:
45
+ for opts in data.values():
46
+ _validate_pipeline_opts(opts.pipeline_options)
47
+ return data
48
+
49
+
50
+ @cache
51
+ def _default_format_opts() -> dict["InputFormat", "FormatOption"]:
52
+ from docling.datamodel.pipeline_options import (
53
+ EasyOcrOptions,
54
+ PdfPipelineOptions,
55
+ )
56
+ from docling.document_converter import PdfFormatOption
57
+
58
+ return {
59
+ InputFormat.PDF: PdfFormatOption(
60
+ pipeline_options=PdfPipelineOptions(
61
+ ocr_options=EasyOcrOptions(), generate_picture_images=True
62
+ )
63
+ ),
64
+ }
65
+
66
+
67
+ T = TypeVar("T")
68
+
69
+
70
+ def _find_subcls(cls: type[T], name: str) -> type[T]:
71
+ for c in all_subclasses(cls):
72
+ if c.__name__ == name:
73
+ return c
74
+ raise ValueError(f"unknown {cls.__name__} subclass {name}")
75
+
76
+
77
+ @PipelineConfig.register()
78
+ class DoclingPipelineConfig(PipelineConfig):
79
+ pipeline: PipelineType = Field(frozen=True, default=PipelineType.DOCLING)
80
+ task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
81
+
82
+ format_options: Annotated[
83
+ dict[InputFormat, FormatOption] | None, AfterValidator(_validate_options)
84
+ ] = Field(default_factory=_default_format_opts)
85
+
86
+ _unsupported_input_formats: ClassVar[set[InputFormat]] = {
87
+ InputFormat.AUDIO,
88
+ InputFormat.METS_GBS,
89
+ InputFormat.VTT,
90
+ }
91
+
92
+ @classmethod
93
+ @cache
94
+ def supported_exts(cls) -> set[SupportedExt]:
95
+ from docling.datamodel.base_models import FormatToExtensions, InputFormat
96
+
97
+ supported = set()
98
+ for f in InputFormat:
99
+ if f in cls._unsupported_input_formats:
100
+ continue
101
+ for ext in FormatToExtensions[f]:
102
+ supported.add(SupportedExt(f".{ext.lower()}"))
103
+ return supported
104
+
105
+
106
+ @Pipeline.register(PipelineType.DOCLING)
107
+ class DoclingPipeline(Pipeline):
108
+ def __init__(self, format_options: dict[InputFormat, FormatOption] | None = None):
109
+ from docling.document_converter import DocumentConverter
110
+
111
+ allowed_format = [
112
+ f.to_docling() for f in DoclingPipelineConfig.supported_exts()
113
+ ]
114
+ self._converter = DocumentConverter(
115
+ allowed_formats=allowed_format, format_options=format_options
116
+ )
117
+
118
+ async def extract_content(
119
+ self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
120
+ ) -> AsyncGenerator[Result, None]:
121
+ docs, path_or_streams = map_and_preserve(_to_docling, docs)
122
+ outputs = self._converter.convert_all(path_or_streams, raises_on_error=False)
123
+ for doc, res in zip(docs, outputs, strict=True):
124
+ yield _to_result(res, doc, output_format, output_path=output_path)
125
+
126
+ @classmethod
127
+ def _from_config(cls, config: DoclingPipelineConfig) -> FromConfig:
128
+ return cls(config.format_options)
129
+
130
+
131
+ def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | "DocumentStream"]:
132
+ for d in docs:
133
+ yield d.to_docling()
134
+
135
+
136
+ def _to_result(
137
+ res: "ConversionResult",
138
+ input_document: InputDoc,
139
+ output_format: OutputFormat,
140
+ output_path: Path,
141
+ **kwargs,
142
+ ) -> Result:
143
+ output_path.mkdir(parents=True, exist_ok=True)
144
+ output = None
145
+ status = Status.from_docling(res.status)
146
+ if status.allows_conversion:
147
+ match output_format:
148
+ case OutputFormat.MARKDOWN:
149
+ output = _to_markdown_doc(res, output_path, **kwargs)
150
+ case _:
151
+ raise NotImplementedError(f"unsupported output format {output_format}")
152
+ errors = [Error.from_docling(e) for e in res.errors]
153
+ input_doc = input_document.without_content()
154
+ return Result(input=input_doc, status=status, errors=errors, output=output)
155
+
156
+
157
+ def _to_markdown_doc(
158
+ res: "ConversionResult",
159
+ output_path: Path,
160
+ page_sep: str = DEFAULT_MD_PAGE_SEP,
161
+ **kwargs,
162
+ ) -> MarkdownDoc:
163
+ from docling_core.types.doc import ImageRefMode
164
+
165
+ # TODO: Should we add a hash to avoid collision between files with same names
166
+ # nested in the tree structured
167
+ md_dir_name = path_to_artifacts_dirname(res.input.file)
168
+ md_dir = output_path / md_dir_name
169
+ if md_dir.exists():
170
+ raise FileExistsError(f"directory {md_dir} already exists")
171
+ # Let's avoid issue of duplicated input file names flattened top level
172
+ md_filename = md_dir_name + OutputFormat.MARKDOWN
173
+ total_length = 0
174
+ n_pages = len(res.pages)
175
+
176
+ with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
177
+ tmp_dir = Path(td)
178
+ page_path = Path("page.md")
179
+ # We do a chdir to bypass a Docling bug which only allows to maintain relative
180
+ # image ref when saving the markdown to a relative path
181
+ with (tmp_dir / md_filename).open("w") as f, chdir(tmp_dir):
182
+ end_indices = []
183
+ for page_i in range(n_pages):
184
+ res.document.save_as_markdown(
185
+ page_path,
186
+ page_no=page_i + 1,
187
+ image_mode=ImageRefMode.REFERENCED,
188
+ artifacts_dir=Path(ARTIFACTS),
189
+ **kwargs,
190
+ )
191
+ content = page_path.read_text()
192
+ if page_i > 0:
193
+ content += "\n"
194
+ if page_i < n_pages - 1:
195
+ content += page_sep
196
+ total_length += len(content)
197
+ end_indices.append(total_length)
198
+ f.write(content)
199
+ f.flush()
200
+ page_path.unlink()
201
+ shutil.move(tmp_dir, md_dir)
202
+ pages = PageIndexes.from_page_end_indices(end_indices)
203
+ return MarkdownDoc(path=Path(md_dir_name), pages=pages)
@@ -3,14 +3,8 @@ from collections.abc import AsyncGenerator, Iterable
3
3
  from copy import deepcopy
4
4
  from functools import cache
5
5
  from pathlib import Path
6
- from typing import Any, ClassVar, Self
7
-
8
- from marker.config.parser import ConfigParser
9
- from marker.converters.pdf import PdfConverter
10
- from marker.models import create_model_dict
11
- from marker.output import text_from_rendered
12
- from marker.renderers.markdown import MarkdownRenderer
13
- from PIL.Image import Image
6
+ from typing import TYPE_CHECKING, Any, ClassVar, Self
7
+
14
8
  from pydantic import Field
15
9
 
16
10
  from .constants import ARTIFACTS, CPU_GROUP
@@ -26,6 +20,10 @@ from .objects import (
26
20
  from .pipeline import Pipeline, PipelineConfig, PipelineType
27
21
  from .utils import path_to_artifacts_dirname, report_recoverable_errors
28
22
 
23
+ if TYPE_CHECKING:
24
+ from marker.converters.pdf import PdfConverter
25
+ from PIL import Image
26
+
29
27
 
30
28
  @PipelineConfig.register()
31
29
  class MarkerPipelineConfig(PipelineConfig):
@@ -36,7 +34,7 @@ class MarkerPipelineConfig(PipelineConfig):
36
34
 
37
35
  @classmethod
38
36
  @cache
39
- def supported_formats(cls) -> set[SupportedExt]:
37
+ def supported_exts(cls) -> set[SupportedExt]:
40
38
  # Subset of https://documentation.datalab.to/docs/common/supportedfiletypes
41
39
  return {
42
40
  SupportedExt.PDF,
@@ -75,6 +73,10 @@ class MarkerPipeline(Pipeline):
75
73
  async def extract_content(
76
74
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
77
75
  ) -> AsyncGenerator[Result, None]:
76
+ from marker.config.parser import ConfigParser
77
+ from marker.converters.pdf import PdfConverter
78
+ from marker.models import create_model_dict
79
+
78
80
  config = deepcopy(self._marker_config)
79
81
  config["output_format"] = output_format.to_marker()
80
82
  config_parser = ConfigParser(config)
@@ -96,10 +98,12 @@ class MarkerPipeline(Pipeline):
96
98
  @report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
97
99
  def _process_doc(
98
100
  doc: InputDoc,
99
- converter: PdfConverter,
101
+ converter: "PdfConverter",
100
102
  output_format: OutputFormat,
101
103
  output_path: Path,
102
104
  ) -> Result:
105
+ from marker.output import text_from_rendered
106
+
103
107
  rendered = converter(str(doc.path))
104
108
  content, _, images = text_from_rendered(rendered)
105
109
  match output_format:
@@ -112,8 +116,10 @@ def _process_doc(
112
116
 
113
117
 
114
118
  def _to_markdown_doc(
115
- input_doc: InputDoc, content: str, images: dict[str, Image], output_path: Path
119
+ input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
116
120
  ) -> MarkdownDoc:
121
+ from marker.renderers.markdown import MarkdownRenderer
122
+
117
123
  # TODO: Should we add a hash to avoid collision between files with same names
118
124
  # nested in the tree structured
119
125
  md_dir_name = path_to_artifacts_dirname(input_doc.path)
@@ -8,12 +8,6 @@ from pathlib import Path
8
8
  from tempfile import TemporaryDirectory
9
9
  from typing import Any, ClassVar, Self
10
10
 
11
- from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
12
- union_make as pipeline_union_make,
13
- )
14
- from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
15
- from mineru.cli.common import aio_do_parse
16
- from mineru.utils.enum_class import MakeMode
17
11
  from pydantic import Field
18
12
  from pydantic_extra_types.language_code import LanguageAlpha2
19
13
 
@@ -47,33 +41,39 @@ class MinerUConfig(BaseModel):
47
41
  # TODO: use enum or literal here
48
42
  parse_method: str = "auto"
49
43
 
50
- default_kwargs: ClassVar[dict] = {
51
- "server_url": None,
52
- # We don't dump md directly we process, we dump the middle json in order to be
53
- # able to get page indexes
54
- "parse_method": "auto",
55
- "dump_md": False,
56
- "dump_middle_json": True,
57
- "f_draw_layout_bbox": False,
58
- "f_draw_span_bbox": False,
59
- "f_dump_model_output": False, # might be useful for debug though
60
- "f_dump_orig_pdf": False,
61
- "f_dump_content_list": False, # might be useful for debug though
62
- "start_page_id": 0,
63
- "f_make_md_mode": MakeMode.MM_MD,
64
- "image_analysis": True,
65
- "end_page_id": None,
66
- "client_side_output_generation": False,
67
- }
68
-
69
44
  def as_parse_kwargs(self) -> dict[str, Any]:
70
- kwargs = copy(self.default_kwargs)
45
+ kwargs = copy(self._get_default_kwargs())
71
46
  kwargs["backend"] = self.backend
72
47
  kwargs["parse_method"] = self.parse_method
73
48
  kwargs["formula_enable"] = self.enable_formula_extraction
74
49
  kwargs["table_enable"] = self.enable_table_extraction
75
50
  return kwargs
76
51
 
52
+ @classmethod
53
+ @cache
54
+ def _get_default_kwargs(cls) -> dict[str, Any]:
55
+
56
+ from mineru.utils.enum_class import MakeMode
57
+
58
+ return {
59
+ "server_url": None,
60
+ # We don't dump md directly we process, we dump the middle json in order to be
61
+ # able to get page indexes
62
+ "parse_method": "auto",
63
+ "dump_md": False,
64
+ "dump_middle_json": True,
65
+ "f_draw_layout_bbox": False,
66
+ "f_draw_span_bbox": False,
67
+ "f_dump_model_output": False, # might be useful for debug though
68
+ "f_dump_orig_pdf": False,
69
+ "f_dump_content_list": False, # might be useful for debug though
70
+ "start_page_id": 0,
71
+ "f_make_md_mode": MakeMode.MM_MD,
72
+ "image_analysis": True,
73
+ "end_page_id": None,
74
+ "client_side_output_generation": False,
75
+ }
76
+
77
77
 
78
78
  @PipelineConfig.register() # noqa: F821
79
79
  class MinerUPipelineConfig(PipelineConfig): # noqa: F821
@@ -85,7 +85,7 @@ class MinerUPipelineConfig(PipelineConfig): # noqa: F821
85
85
 
86
86
  @classmethod
87
87
  @cache
88
- def supported_formats(cls) -> set[SupportedExt]:
88
+ def supported_exts(cls) -> set[SupportedExt]:
89
89
  return {
90
90
  SupportedExt.PDF,
91
91
  SupportedExt.DOCX,
@@ -104,6 +104,8 @@ class MinerUPipeline(Pipeline):
104
104
  async def extract_content(
105
105
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
106
106
  ) -> AsyncGenerator[Result, None]:
107
+ from mineru.cli.common import aio_do_parse
108
+
107
109
  docs = list(docs)
108
110
  # TODO: exclude files which are not pdf and return an error
109
111
  pdfs_bytes = [d.path.read_bytes() for d in docs]
@@ -149,11 +151,18 @@ def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
149
151
 
150
152
 
151
153
  def _parse_md_make_fn(backend: MinerUBackend) -> MDMakeFunction:
154
+
152
155
  match backend:
153
156
  case MinerUBackend.PIPELINE:
154
- return pipeline_union_make
157
+ from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
158
+ union_make,
159
+ )
160
+
161
+ return union_make
155
162
  case MinerUBackend.VLM:
156
- return vlm_union_make
163
+ from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make
164
+
165
+ return union_make
157
166
  case _:
158
167
  raise ValueError(f"Unsupported backend: {backend}")
159
168
 
@@ -201,8 +210,12 @@ def _dump_md_content(
201
210
  output_path: Path,
202
211
  md_path: Path,
203
212
  im_dir: Path,
204
- md_make_mode: str = MakeMode.MM_MD,
213
+ md_make_mode: str | None = None,
205
214
  ) -> ConversionOutput:
215
+ from mineru.utils.enum_class import MakeMode
216
+
217
+ if md_make_mode is None:
218
+ md_make_mode = MakeMode.MM_MD
206
219
  total_length = 0
207
220
  end_indices = []
208
221
  with md_path.open("w") as f:
@@ -21,7 +21,12 @@ from pydantic import AfterValidator, RootModel, TypeAdapter
21
21
  from pydantic import BaseModel as _BaseModel
22
22
 
23
23
  try:
24
- from docling.datamodel.base_models import ConversionStatus, ErrorItem, InputFormat
24
+ from docling.datamodel.base_models import (
25
+ ConversionStatus,
26
+ ErrorItem,
27
+ FormatToExtensions,
28
+ InputFormat,
29
+ )
25
30
  from docling.datamodel.document import InputDocument
26
31
  from docling_core.types.io import DocumentStream
27
32
  except ImportError:
@@ -33,42 +38,78 @@ logger = logging.getLogger(__name__)
33
38
  base_config = merge_configs(icij_config(), no_enum_values_config())
34
39
 
35
40
 
41
+ @cache
42
+ def _ext_to_docling_input_format() -> dict:
43
+ from .docling_ import DoclingPipelineConfig # noqa: PLC0415
44
+
45
+ mapping = dict()
46
+ supported = DoclingPipelineConfig.supported_exts()
47
+ for input_f, exts in FormatToExtensions.items():
48
+ for ext in exts:
49
+ try:
50
+ ext = SupportedExt(f".{ext.lower()}") # noqa: PLW2901
51
+ except ValueError:
52
+ continue
53
+ if ext in supported:
54
+ mapping[ext] = input_f
55
+ return mapping
56
+
57
+
36
58
  class BaseModel(_BaseModel):
37
59
  model_config = base_config
38
60
 
39
61
 
40
62
  class SupportedExt(StrEnum):
41
63
  ADOC = ".adoc"
64
+ ASC = ".asc"
42
65
  ASCIIDOC = ".asciidoc"
43
66
  BMP = ".bmp"
44
67
  CSV = ".csv"
45
68
  DOC = ".doc"
46
69
  DOCX = ".docx"
70
+ DOTX = ".dotx"
71
+ DOTM = ".dotm"
72
+ DOCM = ".docm"
47
73
  EPUB = ".epub"
48
74
  GIF = ".gif"
49
75
  HTLM = ".html"
76
+ HTM = ".htm"
50
77
  JPEG = ".jpeg"
51
78
  JPG = ".jpg"
79
+ JSON = ".json"
80
+ LATEX = ".latex"
52
81
  MD = ".md"
82
+ NXML = ".nxml"
53
83
  ODP = ".odp"
54
84
  ODS = ".ods"
55
85
  ODT = ".odt"
56
86
  PDF = ".pdf"
57
87
  PNG = ".png"
88
+ PPSX = ".ppsx"
58
89
  PPT = ".ppt"
90
+ PPTM = ".pptm"
91
+ PPSM = ".ppsm"
92
+ POTX = ".potx"
93
+ POTM = ".potm"
59
94
  PPTX = ".pptx"
95
+ QMD = ".qmd"
96
+ RMD = ".rmd"
60
97
  TEX = ".tex"
98
+ TIF = ".tif"
61
99
  TIFF = ".tiff"
62
100
  TXT = ".txt"
101
+ TEXT = ".text"
63
102
  WEBP = ".webp"
103
+ XBRL = ".xbrl"
64
104
  XHTML = ".xhtml"
65
105
  XLS = ".xls"
66
106
  XLSM = ".xlsm"
67
107
  XLSX = ".xlsx"
68
108
  XLTX = ".xltx"
109
+ XML = ".xml"
69
110
 
70
111
  def to_docling(self) -> InputFormat:
71
- return InputFormat(self.value[1:])
112
+ return _ext_to_docling_input_format()[self]
72
113
 
73
114
 
74
115
  class OutputFormat(StrEnum):
@@ -30,7 +30,7 @@ class PipelineConfig(RegistrableConfig, ABC):
30
30
 
31
31
  @classmethod
32
32
  @abstractmethod
33
- def supported_formats(cls) -> set[SupportedExt]: ...
33
+ def supported_exts(cls) -> set[SupportedExt]: ...
34
34
 
35
35
 
36
36
  class Pipeline(RegistrableFromConfig, ABC):
@@ -51,6 +51,7 @@ required-environments = [
51
51
  dev = [
52
52
  "pytest~=8.3.5",
53
53
  "pytest-asyncio~=0.25.3",
54
+ "ruff==0.15.2",
54
55
  ]
55
56
 
56
57
  [project.urls]
@@ -868,7 +868,6 @@ wheels = [
868
868
 
869
869
  [[package]]
870
870
  name = "extract-python"
871
- version = "0.1.0"
872
871
  source = { editable = "." }
873
872
  dependencies = [
874
873
  { name = "icij-common" },
@@ -898,11 +897,12 @@ mineru = [
898
897
  dev = [
899
898
  { name = "pytest" },
900
899
  { name = "pytest-asyncio" },
900
+ { name = "ruff" },
901
901
  ]
902
902
 
903
903
  [package.metadata]
904
904
  requires-dist = [
905
- { name = "docling-slim", extras = ["standard", "feat-ocr-easyocr", "feat-ocr-tesserocr", "feat-ocr-mac"], marker = "extra == 'docling'", specifier = "~=2.96" },
905
+ { name = "docling-slim", extras = ["feat-ocr-easyocr", "feat-ocr-mac", "feat-ocr-tesserocr", "standard"], marker = "extra == 'docling'", specifier = "~=2.96" },
906
906
  { name = "html2image", marker = "extra == 'benches'", specifier = "~=2.0.7" },
907
907
  { name = "icij-common", specifier = "~=0.8.2" },
908
908
  { name = "markdown2", marker = "extra == 'benches'", specifier = ">=2.5.4" },
@@ -920,6 +920,7 @@ provides-extras = ["benches", "docling", "marker", "mineru"]
920
920
  dev = [
921
921
  { name = "pytest", specifier = "~=8.3.5" },
922
922
  { name = "pytest-asyncio", specifier = "~=0.25.3" },
923
+ { name = "ruff", specifier = "==0.15.2" },
923
924
  ]
924
925
 
925
926
  [[package]]
@@ -4334,6 +4335,31 @@ wheels = [
4334
4335
  { url = "https://files.pythonhosted.org/packages/3f/50/0a9e7e7afe7339bd5e36911f0ceb15fed51945836ed803ae5afd661057fd/rtree-1.4.1-py3-none-win_arm64.whl", hash = "sha256:3d46f55729b28138e897ffef32f7ce93ac335cb67f9120125ad3742a220800f0", size = 355253, upload-time = "2025-08-13T19:32:00.296Z" },
4335
4336
  ]
4336
4337
 
4338
+ [[package]]
4339
+ name = "ruff"
4340
+ version = "0.15.2"
4341
+ source = { registry = "https://pypi.org/simple" }
4342
+ sdist = { url = "https://files.pythonhosted.org/packages/06/04/eab13a954e763b0606f460443fcbf6bb5a0faf06890ea3754ff16523dce5/ruff-0.15.2.tar.gz", hash = "sha256:14b965afee0969e68bb871eba625343b8673375f457af4abe98553e8bbb98342", size = 4558148, upload-time = "2026-02-19T22:32:20.271Z" }
4343
+ wheels = [
4344
+ { url = "https://files.pythonhosted.org/packages/2f/70/3a4dc6d09b13cb3e695f28307e5d889b2e1a66b7af9c5e257e796695b0e6/ruff-0.15.2-py3-none-linux_armv6l.whl", hash = "sha256:120691a6fdae2f16d65435648160f5b81a9625288f75544dc40637436b5d3c0d", size = 10430565, upload-time = "2026-02-19T22:32:41.824Z" },
4345
+ { url = "https://files.pythonhosted.org/packages/71/0b/bb8457b56185ece1305c666dc895832946d24055be90692381c31d57466d/ruff-0.15.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:a89056d831256099658b6bba4037ac6dd06f49d194199215befe2bb10457ea5e", size = 10820354, upload-time = "2026-02-19T22:32:07.366Z" },
4346
+ { url = "https://files.pythonhosted.org/packages/2d/c1/e0532d7f9c9e0b14c46f61b14afd563298b8b83f337b6789ddd987e46121/ruff-0.15.2-py3-none-macosx_11_0_arm64.whl", hash = "sha256:e36dee3a64be0ebd23c86ffa3aa3fd3ac9a712ff295e192243f814a830b6bd87", size = 10170767, upload-time = "2026-02-19T22:32:13.188Z" },
4347
+ { url = "https://files.pythonhosted.org/packages/47/e8/da1aa341d3af017a21c7a62fb5ec31d4e7ad0a93ab80e3a508316efbcb23/ruff-0.15.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a9fb47b6d9764677f8c0a193c0943ce9a05d6763523f132325af8a858eadc2b9", size = 10529591, upload-time = "2026-02-19T22:32:02.547Z" },
4348
+ { url = "https://files.pythonhosted.org/packages/93/74/184fbf38e9f3510231fbc5e437e808f0b48c42d1df9434b208821efcd8d6/ruff-0.15.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f376990f9d0d6442ea9014b19621d8f2aaf2b8e39fdbfc79220b7f0c596c9b80", size = 10260771, upload-time = "2026-02-19T22:32:36.938Z" },
4349
+ { url = "https://files.pythonhosted.org/packages/05/ac/605c20b8e059a0bc4b42360414baa4892ff278cec1c91fff4be0dceedefd/ruff-0.15.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2dcc987551952d73cbf5c88d9fdee815618d497e4df86cd4c4824cc59d5dd75f", size = 11045791, upload-time = "2026-02-19T22:32:31.642Z" },
4350
+ { url = "https://files.pythonhosted.org/packages/fd/52/db6e419908f45a894924d410ac77d64bdd98ff86901d833364251bd08e22/ruff-0.15.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:42a47fd785cbe8c01b9ff45031af875d101b040ad8f4de7bbb716487c74c9a77", size = 11879271, upload-time = "2026-02-19T22:32:29.305Z" },
4351
+ { url = "https://files.pythonhosted.org/packages/3e/d8/7992b18f2008bdc9231d0f10b16df7dda964dbf639e2b8b4c1b4e91b83af/ruff-0.15.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cbe9f49354866e575b4c6943856989f966421870e85cd2ac94dccb0a9dcb2fea", size = 11303707, upload-time = "2026-02-19T22:32:22.492Z" },
4352
+ { url = "https://files.pythonhosted.org/packages/d7/02/849b46184bcfdd4b64cde61752cc9a146c54759ed036edd11857e9b8443b/ruff-0.15.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7a672c82b5f9887576087d97be5ce439f04bbaf548ee987b92d3a7dede41d3a", size = 11149151, upload-time = "2026-02-19T22:32:44.234Z" },
4353
+ { url = "https://files.pythonhosted.org/packages/70/04/f5284e388bab60d1d3b99614a5a9aeb03e0f333847e2429bebd2aaa1feec/ruff-0.15.2-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:72ecc64f46f7019e2bcc3cdc05d4a7da958b629a5ab7033195e11a438403d956", size = 11091132, upload-time = "2026-02-19T22:32:24.691Z" },
4354
+ { url = "https://files.pythonhosted.org/packages/fa/ae/88d844a21110e14d92cf73d57363fab59b727ebeabe78009b9ccb23500af/ruff-0.15.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:8dcf243b15b561c655c1ef2f2b0050e5d50db37fe90115507f6ff37d865dc8b4", size = 10504717, upload-time = "2026-02-19T22:32:26.75Z" },
4355
+ { url = "https://files.pythonhosted.org/packages/64/27/867076a6ada7f2b9c8292884ab44d08fd2ba71bd2b5364d4136f3cd537e1/ruff-0.15.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:dab6941c862c05739774677c6273166d2510d254dac0695c0e3f5efa1b5585de", size = 10263122, upload-time = "2026-02-19T22:32:10.036Z" },
4356
+ { url = "https://files.pythonhosted.org/packages/e7/ef/faf9321d550f8ebf0c6373696e70d1758e20ccdc3951ad7af00c0956be7c/ruff-0.15.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:1b9164f57fc36058e9a6806eb92af185b0697c9fe4c7c52caa431c6554521e5c", size = 10735295, upload-time = "2026-02-19T22:32:39.227Z" },
4357
+ { url = "https://files.pythonhosted.org/packages/2f/55/e8089fec62e050ba84d71b70e7834b97709ca9b7aba10c1a0b196e493f97/ruff-0.15.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:80d24fcae24d42659db7e335b9e1531697a7102c19185b8dc4a028b952865fd8", size = 11241641, upload-time = "2026-02-19T22:32:34.617Z" },
4358
+ { url = "https://files.pythonhosted.org/packages/23/01/1c30526460f4d23222d0fabd5888868262fd0e2b71a00570ca26483cd993/ruff-0.15.2-py3-none-win32.whl", hash = "sha256:fd5ff9e5f519a7e1bd99cbe8daa324010a74f5e2ebc97c6242c08f26f3714f6f", size = 10507885, upload-time = "2026-02-19T22:32:15.635Z" },
4359
+ { url = "https://files.pythonhosted.org/packages/5c/10/3d18e3bbdf8fc50bbb4ac3cc45970aa5a9753c5cb51bf9ed9a3cd8b79fa3/ruff-0.15.2-py3-none-win_amd64.whl", hash = "sha256:d20014e3dfa400f3ff84830dfb5755ece2de45ab62ecea4af6b7262d0fb4f7c5", size = 11623725, upload-time = "2026-02-19T22:32:04.947Z" },
4360
+ { url = "https://files.pythonhosted.org/packages/6d/78/097c0798b1dab9f8affe73da9642bb4500e098cb27fd8dc9724816ac747b/ruff-0.15.2-py3-none-win_arm64.whl", hash = "sha256:cabddc5822acdc8f7b5527b36ceac55cc51eec7b1946e60181de8fe83ca8876e", size = 10941649, upload-time = "2026-02-19T22:32:18.108Z" },
4361
+ ]
4362
+
4337
4363
  [[package]]
4338
4364
  name = "safetensors"
4339
4365
  version = "0.6.2"
@@ -1,258 +0,0 @@
1
- import shutil
2
- import tempfile
3
- from collections.abc import AsyncGenerator, Iterable, Iterator
4
- from functools import cache
5
- from pathlib import Path
6
- from typing import Any, ClassVar, Literal, TypeVar
7
-
8
- from docling.backend.abstract_backend import AbstractDocumentBackend
9
- from docling.datamodel.base_models import InputFormat
10
- from docling.datamodel.document import ConversionResult
11
- from docling.datamodel.pipeline_options import (
12
- EasyOcrOptions,
13
- PdfPipelineOptions,
14
- PipelineOptions,
15
- VlmPipelineOptions,
16
- )
17
- from docling.document_converter import DocumentConverter, FormatOption
18
- from docling.models.factories import get_ocr_factory
19
- from docling.pipeline.base_pipeline import BasePipeline
20
- from docling_core.types.doc import ImageRefMode
21
- from docling_core.types.io import DocumentStream
22
- from icij_common.registrable import FromConfig
23
- from pydantic import Field, model_validator
24
-
25
- from .constants import ARTIFACTS, CPU_GROUP, DEFAULT_MD_PAGE_SEP
26
- from .objects import (
27
- BaseModel,
28
- Error,
29
- InputDoc,
30
- MarkdownDoc,
31
- OutputFormat,
32
- PageIndexes,
33
- Result,
34
- Status,
35
- SupportedExt,
36
- )
37
- from .pipeline import Pipeline, PipelineConfig, PipelineType
38
- from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_dirname
39
-
40
# Default on-disk location of the Docling model artifacts, under the user's home.
DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home() / ".cache" / "docling" / "models"
41
-
42
-
43
class _PdfPipelineOptions(PdfPipelineOptions):
    """PDF pipeline options with picture image generation forced on.

    Also accepts ``ocr_options`` as a plain mapping carrying a ``kind`` key, which
    is resolved into a concrete OCR options object through Docling's OCR factory.
    """

    # Frozen and defaulting to True: this field can't be reassigned once the
    # model is built.
    generate_picture_images: bool = Field(default=True, frozen=True)

    @model_validator(mode="before")
    @classmethod
    def validate_ocr_options(cls, data: Any) -> Any:
        """Resolve a raw ``ocr_options`` mapping into an OCR options object.

        Args:
            data: raw validation input. Anything other than a dict, or a dict
                whose ``ocr_options`` entry is not itself a dict, is returned
                untouched.

        Returns:
            ``data`` unchanged, or a shallow copy of it in which ``ocr_options``
            has been replaced by the object built by the OCR factory.

        Raises:
            ValueError: if the ``ocr_options`` mapping has no ``kind`` key.
        """
        if not isinstance(data, dict):
            return data
        ocr_options = data.get("ocr_options")
        if not isinstance(ocr_options, dict):
            return data
        # Work on a copy: popping "kind" from the caller's mapping would make a
        # second validation of the same input fail.
        ocr_kwargs = dict(ocr_options)
        allow_external_plugins = ocr_kwargs.get("allow_external_plugins", False)
        ocr_factory = get_ocr_factory(allow_external_plugins=allow_external_plugins)
        try:
            kind = ocr_kwargs.pop("kind")
        except KeyError as e:
            # A ValueError is reported by pydantic as a validation error instead
            # of leaking a bare KeyError out of the validator.
            raise ValueError('missing "kind" key in ocr_options') from e
        resolved = ocr_factory.create_options(kind=kind, **ocr_kwargs)
        # Return a new dict rather than writing into the caller's input.
        return {**data, "ocr_options": resolved}
58
-
59
-
60
# Pipeline options are stored as (pipeline kind, options) pairs, where the kind
# literal states which options model the second item holds.
_PdfOptionsEntry = tuple[Literal["pdf"], _PdfPipelineOptions]
_VlmOptionsEntry = tuple[Literal["vlm"], VlmPipelineOptions]
OptionsByPipeline = list[_PdfOptionsEntry | _VlmOptionsEntry]
64
-
65
-
66
def _default_pipeline_options() -> OptionsByPipeline:
    """Build the default options: a PDF entry using EasyOCR, then a VLM entry."""
    pdf_options = _PdfPipelineOptions(ocr_options=EasyOcrOptions())
    vlm_options = VlmPipelineOptions()
    return [("pdf", pdf_options), ("vlm", vlm_options)]
69
-
70
-
71
class DoclingFormatOption(BaseModel):
    """Format option described by class names rather than by classes.

    Both attributes hold the ``__name__`` of a class, which is resolved to the
    actual class when converting to a Docling ``FormatOption``.
    """

    pipeline_cls: str
    backend_cls: str

    def to_docling(
        self, pipeline_options: dict[Literal["pdf", "vlm"], PipelineOptions]
    ) -> FormatOption:
        """Convert this description into a Docling ``FormatOption``.

        Args:
            pipeline_options: available options, keyed by pipeline kind.

        Returns:
            The format option made of the resolved pipeline and backend classes,
            together with the options registered for the pipeline kind (``None``
            when there are none).

        Raises:
            ValueError: if a class name can't be resolved, or if the pipeline
                class name contains neither "vlm" nor "pdf".
        """
        resolved_pipeline = _find_subcls(BasePipeline, self.pipeline_cls)
        resolved_backend = _find_subcls(AbstractDocumentBackend, self.backend_cls)
        lowered_name = self.pipeline_cls.lower()
        # "vlm" is tested first, so it wins when a name contains both markers.
        if "vlm" in lowered_name:
            kind, options_model = "vlm", VlmPipelineOptions
        elif "pdf" in lowered_name:
            kind, options_model = "pdf", _PdfPipelineOptions
        else:
            raise ValueError(
                f"invalid pipeline_cls: {resolved_pipeline}, expected a VLM or PDF pipeline"
            )
        selected = pipeline_options.get(kind)
        if selected is not None:
            selected = options_model.model_validate(selected)
        return FormatOption(
            pipeline_cls=resolved_pipeline,
            pipeline_options=selected,
            backend=resolved_backend,
        )
97
-
98
-
99
@cache
def _default_format_cls_names() -> tuple[tuple[InputFormat, str, str], ...]:
    """Return (format, pipeline class name, backend class name) for each default.

    Only the formats listed in ``supported_fmt`` are kept. The result is an
    immutable tuple, so caching it is safe, and ``DocumentConverter`` is only
    instantiated on the first call.
    """
    supported_fmt = {InputFormat.PDF}
    return tuple(
        (fmt, opt.pipeline_cls.__name__, opt.backend.__name__)
        for fmt, opt in DocumentConverter().format_to_options.items()
        if fmt in supported_fmt
    )


def _default_format_options() -> dict[InputFormat, DoclingFormatOption]:
    """Build the default format options from Docling's own converter defaults.

    This function is used as a field ``default_factory``: it returns a new dict
    holding new option objects on every call, so that configs never share (and
    can never alter) each other's defaults. Only the class names are cached.
    """
    return {
        fmt: DoclingFormatOption(pipeline_cls=pipeline_cls, backend_cls=backend_cls)
        for fmt, pipeline_cls, backend_cls in _default_format_cls_names()
    }
109
-
110
-
111
T = TypeVar("T")


def _find_subcls(cls: type[T], name: str) -> type[T]:
    """Return the first subclass of ``cls`` whose ``__name__`` equals ``name``.

    Candidates are the ones yielded by ``all_subclasses(cls)``.

    Raises:
        ValueError: if no candidate has that name.
    """
    candidates = (sub for sub in all_subclasses(cls) if sub.__name__ == name)
    found = next(candidates, None)
    if found is None:
        raise ValueError(f"unknown {cls.__name__} subclass {name}")
    return found
119
-
120
-
121
@PipelineConfig.register()
class DoclingPipelineConfig(PipelineConfig):
    """Configuration of the Docling pipeline."""

    pipeline: PipelineType = Field(frozen=True, default=PipelineType.DOCLING)
    # A ClassVar is not a pydantic field: it has to hold the plain value, since
    # assigning Field(...) would leave a FieldInfo object on the class instead of
    # the group name.
    task_group: ClassVar[str] = CPU_GROUP

    # (pipeline kind, options) pairs, turned into a dict in to_format_options
    pipeline_options: OptionsByPipeline = Field(
        default_factory=_default_pipeline_options
    )
    # Per input format, the names of the pipeline and backend classes to use
    format_options: dict[InputFormat, DoclingFormatOption] = Field(
        default_factory=_default_format_options
    )

    def to_format_options(self) -> dict[InputFormat, FormatOption]:
        """Convert the configured options into Docling ``FormatOption`` objects.

        Returns:
            One Docling format option per configured input format.

        Raises:
            ValueError: propagated from ``DoclingFormatOption.to_docling`` when a
                class name is unknown or the pipeline is neither VLM nor PDF.
        """
        pipeline_options = dict(self.pipeline_options)
        return {
            InputFormat(f): opt.to_docling(pipeline_options)
            for f, opt in self.format_options.items()
        }

    @classmethod
    @cache
    def supported_formats(cls) -> set[SupportedExt]:
        """Return the file extensions handled by this pipeline.

        The returned set is cached: callers share it and must not modify it.
        """
        # Subset of https://docling-project.github.io/docling/usage/supported_formats/
        return {
            SupportedExt.ADOC,
            SupportedExt.ASCIIDOC,
            SupportedExt.BMP,
            SupportedExt.CSV,
            SupportedExt.DOCX,
            # NOTE(review): "HTLM" looks like a misspelling of "HTML"; the member
            # is declared outside of this module, so the name is kept as is here.
            SupportedExt.HTLM,
            SupportedExt.JPG,
            SupportedExt.MD,
            SupportedExt.PDF,
            SupportedExt.PNG,
            SupportedExt.PPTX,
            SupportedExt.TEX,
            SupportedExt.TIFF,
            SupportedExt.TXT,
            SupportedExt.WEBP,
            SupportedExt.XHTML,
            SupportedExt.XLSX,
        }
163
-
164
-
165
# Format options of a default config, computed once when the module is imported
# and used by DoclingPipeline when it isn't given any.
_DEFAULT_CONFIG = DoclingPipelineConfig()
DEFAULT_FORMAT_OPTIONS = _DEFAULT_CONFIG.to_format_options()
166
-
167
-
168
@Pipeline.register(PipelineType.DOCLING)
class DoclingPipeline(Pipeline):
    """Extraction pipeline backed by a Docling ``DocumentConverter``."""

    def __init__(self, format_options: dict[InputFormat, FormatOption] | None = None):
        """Create the converter, using ``DEFAULT_FORMAT_OPTIONS`` if none given."""
        selected_options = (
            DEFAULT_FORMAT_OPTIONS if format_options is None else format_options
        )
        self._converter = DocumentConverter(format_options=selected_options)

    async def extract_content(
        self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
    ) -> AsyncGenerator[Result, None]:
        """Convert ``docs`` and yield one ``Result`` per input document.

        The converter runs with ``raises_on_error=False``; the strict ``zip``
        raises if the number of conversions differs from the number of documents.
        """
        docs, path_or_streams = map_and_preserve(_to_docling, docs)
        conversions = self._converter.convert_all(
            path_or_streams, raises_on_error=False
        )
        for source_doc, conversion in zip(docs, conversions, strict=True):
            yield _to_result(
                conversion, source_doc, output_format, output_path=output_path
            )

    @classmethod
    def _from_config(cls, config: DoclingPipelineConfig) -> FromConfig:
        """Instantiate the pipeline from the format options of ``config``."""
        docling_options = config.to_format_options()
        return cls(docling_options)
186
-
187
-
188
- def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | DocumentStream]:
189
- for d in docs:
190
- yield d.to_docling()
191
-
192
-
193
def _to_result(
    res: ConversionResult,
    input_document: InputDoc,
    output_format: OutputFormat,
    output_path: Path,
    **kwargs,
) -> Result:
    """Turn a Docling conversion result into a ``Result``.

    ``output_path`` is created if needed. The output document is only written
    when the conversion status allows it; otherwise the result has no output.
    Extra keyword arguments are forwarded to the output writer.

    Raises:
        NotImplementedError: if the status allows conversion and
            ``output_format`` is not markdown.
    """
    output_path.mkdir(parents=True, exist_ok=True)
    converted = None
    status = Status.from_docling(res.status)
    if status.allows_conversion:
        if output_format == OutputFormat.MARKDOWN:
            converted = _to_markdown_doc(res, output_path, **kwargs)
        else:
            raise NotImplementedError(f"unsupported output format {output_format}")
    conversion_errors = [Error.from_docling(err) for err in res.errors]
    stripped_input = input_document.without_content()
    return Result(
        input=stripped_input,
        status=status,
        errors=conversion_errors,
        output=converted,
    )
212
-
213
-
214
def _to_markdown_doc(
    res: ConversionResult,
    output_path: Path,
    page_sep: str = DEFAULT_MD_PAGE_SEP,
    **kwargs,
) -> MarkdownDoc:
    """Write the converted document as a single markdown file with its artifacts.

    Pages are exported one at a time and appended to a single markdown file,
    recording the offset at which each page ends. Everything is produced inside
    a temporary directory, which is then moved to its final location.

    Args:
        res: the Docling conversion result to export.
        output_path: directory under which the document directory is created.
        page_sep: separator appended after every page except the last one.
        **kwargs: forwarded to ``save_as_markdown``.

    Returns:
        The markdown document, with a path relative to ``output_path`` and the
        page indexes built from the end offset of each page.

    Raises:
        FileExistsError: if the destination directory already exists.
    """
    # TODO: Should we add a hash to avoid collisions between files with the same
    #  name nested in the tree structure
    md_dir_name = path_to_artifacts_dirname(res.input.file)
    md_dir = output_path / md_dir_name
    if md_dir.exists():
        raise FileExistsError(f"directory {md_dir} already exists")
    # Let's avoid issue of duplicated input file names flattened top level
    md_filename = md_dir_name + OutputFormat.MARKDOWN
    n_pages = len(res.pages)
    total_length = 0
    end_indices = []

    with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
        tmp_dir = Path(td)
        page_path = Path("page.md")
        # We do a chdir to bypass a Docling bug which only allows to maintain
        # relative image refs when saving the markdown to a relative path.
        # The encoding is explicit so that the output doesn't depend on the
        # locale of the machine.
        md_file_path = tmp_dir / md_filename
        with md_file_path.open("w", encoding="utf-8") as f, chdir(tmp_dir):
            for page_i in range(n_pages):
                res.document.save_as_markdown(
                    page_path,
                    page_no=page_i + 1,
                    image_mode=ImageRefMode.REFERENCED,
                    artifacts_dir=Path(ARTIFACTS),
                    **kwargs,
                )
                # NOTE(review): assumes save_as_markdown writes UTF-8 — confirm
                content = page_path.read_text(encoding="utf-8")
                if page_i > 0:
                    content += "\n"
                if page_i < n_pages - 1:
                    content += page_sep
                # Offsets are counted in characters of the text handed to write()
                total_length += len(content)
                end_indices.append(total_length)
                f.write(content)
                page_path.unlink()
        # The directory is moved once the markdown file is closed (hence flushed)
        # and the working directory is restored. It no longer exists at its
        # original location when leaving the TemporaryDirectory context, which
        # is why cleanup errors are ignored.
        shutil.move(tmp_dir, md_dir)
    pages = PageIndexes.from_page_end_indices(end_indices)
    return MarkdownDoc(path=Path(md_dir_name), pages=pages)
File without changes
File without changes