extract-python 0.5.13__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.5.13
3
+ Version: 0.7.0
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
7
7
  Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
8
8
  Author-email: Clément Doumouro <cdoumouro@icij.org>
9
9
  Requires-Python: <3.14,>=3.11
10
- Requires-Dist: extract-core~=0.5.5
10
+ Requires-Dist: extract-core~=0.6.0
11
11
  Requires-Dist: icij-common~=0.8.2
12
12
  Provides-Extra: benches
13
13
  Requires-Dist: html2image~=2.0.7; extra == 'benches'
@@ -3,7 +3,7 @@ from tempfile import TemporaryDirectory
3
3
 
4
4
  import markdown2
5
5
  import pypdfium2
6
- from extract_core import BaseModel, OutputFormat, PageIndexes
6
+ from extract_core import BaseModel, OutputFormat, Pages
7
7
  from extract_python.utils import chdir
8
8
  from html2image import Html2Image
9
9
  from PIL import Image, ImageDraw
@@ -93,7 +93,7 @@ def side_by_side_md_page_comp(
93
93
  if len(md_files) != 1:
94
94
  msg = f"unexpected number of md files ({len(md_files)}) in {compared_path}"
95
95
  raise ValueError(msg)
96
- md_content = md_files[0].read_text()[page_ix[0] : page_ix[1]]
96
+ md_content = (md_files[0].read_bytes()[page_ix[0] : page_ix[1]]).decode()
97
97
  # change the current dir so that the browser renders images properly
98
98
  with chdir(compared_path):
99
99
  md_page_im = _render_md(md_content, compared_path, html_size=ref_im.size)
@@ -140,9 +140,9 @@ def _scan_pages(
140
140
  root: Path, comparison: ComparisonItem
141
141
  ) -> list[dict[str, tuple[int, int]]]:
142
142
  all_pages = [
143
- PageIndexes.model_validate_json(
143
+ Pages.model_validate_json(
144
144
  (root / compared / "artifacts" / "pages.json").read_text()
145
- ).root
145
+ )
146
146
  for compared in comparison.compared
147
147
  ]
148
148
  all_pages = zip(*all_pages, strict=True)
@@ -0,0 +1,2 @@
1
+ ARTIFACTS = "artifacts"
2
+ DEFAULT_MD_PAGE_SEP = '\n<div style="page-break-after: always;"></div>\n'
@@ -1,11 +1,13 @@
1
1
  import asyncio
2
+ import json
3
+ import logging
2
4
  import shutil
3
5
  import tempfile
4
6
  from collections.abc import AsyncGenerator, Iterable, Iterator
7
+ from functools import partial
5
8
  from pathlib import Path
6
9
  from typing import Any, Self
7
10
 
8
- from docling.datamodel.base_models import InputFormat
9
11
  from docling.datamodel.document import ConversionResult
10
12
  from docling.datamodel.pipeline_options import PipelineOptions
11
13
  from docling.document_converter import DocumentConverter, FormatOption
@@ -15,42 +17,41 @@ from docling_core.types.doc import ImageRefMode
15
17
  from docling_core.types.io import DocumentStream
16
18
  from extract_core import (
17
19
  BaseModel,
18
- Device,
19
20
  DoclingFormatOption,
20
21
  DoclingPipelineConfig,
21
22
  Error,
22
23
  InputDoc,
23
24
  MarkdownDoc,
24
25
  OutputFormat,
25
- PageIndexes,
26
26
  Pipeline,
27
27
  PipelineType,
28
28
  Result,
29
29
  Status,
30
30
  )
31
31
  from icij_common.pydantic_utils import merge_configs
32
- from icij_common.registrable import FromConfig
33
32
  from pydantic import ConfigDict, field_serializer
34
33
  from pydantic_core.core_schema import SerializerFunctionWrapHandler
35
34
 
36
35
  from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
37
- from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
36
+ from .utils import chdir, map_and_preserve, path_to_artifacts_dirname, write_pages
37
+
38
+ logger = logging.getLogger(__name__)
38
39
 
39
40
  DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
40
41
 
41
42
 
42
43
  @Pipeline.register(PipelineType.DOCLING)
43
44
  class DoclingPipeline(Pipeline):
44
- def __init__(
45
- self,
46
- format_options: dict["InputFormat", DoclingFormatOption] | None = None,
47
- *,
48
- device: Device = Device.CPU,
49
- ):
50
- super().__init__(device)
45
+ def __init__(self, config: DoclingPipelineConfig):
46
+ super().__init__(config)
51
47
  format_options = {
52
- k: v.to_docling(self._device) for k, v in format_options.items()
48
+ k: v.to_docling(self._device)
49
+ for k, v in self._config.format_options.items()
53
50
  }
51
+ logger.info(
52
+ "resolved format options to: %s",
53
+ lambda: partial(json.dumps, format_options, indent=2),
54
+ )
54
55
  allowed_format = [
55
56
  f.to_docling() for f in DoclingPipelineConfig.supported_exts()
56
57
  ]
@@ -72,15 +73,6 @@ class DoclingPipeline(Pipeline):
72
73
  doc = next(docs)
73
74
  yield _to_result(res, doc, output_format, output_path=output_path)
74
75
 
75
- @classmethod
76
- def _from_config(
77
- cls,
78
- config: DoclingPipelineConfig,
79
- *,
80
- device: Device = Device.CPU,
81
- ) -> FromConfig:
82
- return cls(config.format_options, device=device)
83
-
84
76
 
85
77
  def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
86
78
  for d in docs:
@@ -122,39 +114,38 @@ def _to_markdown_doc(
122
114
  raise FileExistsError(f"directory {md_dir} already exists")
123
115
  # Let's avoid issue of duplicated input file names flattened top level
124
116
  md_filename = md_dir_name + OutputFormat.MARKDOWN
125
- total_length = 0
126
- n_pages = len(res.pages)
127
-
128
117
  with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
129
118
  tmp_dir = Path(td)
130
- page_path = Path("page.md")
131
- # We do a chdir to bypass a Docling bug which only allows to maintain relative
132
- # image ref when saving the markdown to a relative path
133
- with (tmp_dir / md_filename).open("w") as f, chdir(tmp_dir):
134
- end_indices = []
135
- for page_i in range(n_pages):
136
- res.document.save_as_markdown(
137
- page_path,
138
- page_no=page_i + 1,
139
- image_mode=ImageRefMode.REFERENCED,
140
- artifacts_dir=Path(ARTIFACTS),
141
- **kwargs,
142
- )
143
- content = page_path.read_text()
144
- if page_i > 0:
145
- content += "\n"
146
- if page_i < n_pages - 1:
147
- content += page_sep
148
- total_length += len(content)
149
- end_indices.append(total_length)
150
- f.write(content)
151
- f.flush()
152
- page_path.unlink()
119
+ md_path = tmp_dir / md_filename
120
+ current_page_path = tmp_dir / "page.md"
121
+ with chdir(tmp_dir):
122
+ # We do a chdir to bypass a Docling bug which only allows to maintain
123
+ # relative image ref when saving the markdown to a relative path
124
+ pages = _docling_pages_it(res, current_page_path, **kwargs)
125
+ with md_path.open("wb") as f:
126
+ pages = write_pages(pages, page_sep, f)
127
+ # Clean up the tmp page file before moving everything to the end destination
128
+ current_page_path.unlink(missing_ok=True)
153
129
  shutil.move(tmp_dir, md_dir)
154
- pages = PageIndexes.from_page_end_indices(end_indices)
155
130
  return MarkdownDoc(path=Path(md_dir_name), pages=pages)
156
131
 
157
132
 
133
def _docling_pages_it(
    res: ConversionResult, output_path: Path, **kwargs
) -> Iterable[str]:
    """Lazily yield the markdown of each page of a Docling conversion result.

    Each page is exported through ``res.document.save_as_markdown`` into
    ``output_path`` (the same file is overwritten for every page) and then read
    back. Images are exported as references under the ``ARTIFACTS`` directory.

    Args:
        res: the Docling conversion result; ``len(res.pages)`` gives the number
            of pages to export.
        output_path: scratch file receiving the markdown of the current page.
            It is left on disk after the last page; cleaning it up is the
            caller's job.
        **kwargs: forwarded untouched to ``save_as_markdown``.

    Yields:
        The markdown content of each page, in page order.
    """
    # Docling page numbers are 1-based
    for page_no in range(1, len(res.pages) + 1):
        res.document.save_as_markdown(
            output_path,
            page_no=page_no,
            image_mode=ImageRefMode.REFERENCED,
            artifacts_dir=Path(ARTIFACTS),
            **kwargs,
        )
        # Read back explicitly as UTF-8 instead of relying on the locale: the
        # content is re-encoded as UTF-8 downstream to compute page byte
        # offsets, so both ends have to agree on the encoding.
        yield output_path.read_text(encoding="utf-8")
147
+
148
+
158
149
  class SerializableFormatOptions(DoclingFormatOption):
159
150
  # Utility class to serialize Python format options into a JSON which can be
160
151
  # correctly deserialized into a docling FormatOption
@@ -2,82 +2,32 @@ import asyncio
2
2
  import gc
3
3
  from collections.abc import AsyncGenerator, Iterable
4
4
  from copy import deepcopy
5
- from functools import cache
6
5
  from pathlib import Path
7
- from typing import TYPE_CHECKING, Any, ClassVar, Self
6
+ from typing import TYPE_CHECKING
8
7
 
9
8
  from extract_core import (
10
- BasePipelineConfig,
11
- Device,
12
9
  InputDoc,
13
10
  MarkdownDoc,
14
11
  OutputFormat,
15
- PageIndexes,
16
12
  Pipeline,
17
13
  PipelineType,
18
14
  Result,
19
15
  Status,
20
- SupportedExt,
21
16
  )
22
- from pydantic import Field
23
17
 
24
- from .constants import ARTIFACTS
25
- from .utils import path_to_artifacts_dirname, report_recoverable_errors
18
+ from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
19
+ from .utils import path_to_artifacts_dirname, report_recoverable_errors, write_pages
26
20
 
27
21
  if TYPE_CHECKING:
28
22
  from marker.converters.pdf import PdfConverter
29
23
  from PIL import Image
30
24
 
31
25
 
32
- class MarkerPipelineConfig(BasePipelineConfig):
33
- pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
34
-
35
- config: dict[str, Any] = Field(default_factory=dict)
36
-
37
- @classmethod
38
- @cache
39
- def supported_exts(cls) -> set[SupportedExt]:
40
- # Subset of https://documentation.datalab.to/docs/common/supportedfiletypes
41
- return {
42
- SupportedExt.PDF,
43
- SupportedExt.XLS,
44
- SupportedExt.XLSX,
45
- SupportedExt.XLSM,
46
- SupportedExt.CSV,
47
- SupportedExt.ODS,
48
- SupportedExt.DOC,
49
- SupportedExt.DOCX,
50
- SupportedExt.ODT,
51
- SupportedExt.PPT,
52
- SupportedExt.PPTX,
53
- SupportedExt.ODP,
54
- SupportedExt.HTLM,
55
- SupportedExt.EPUB,
56
- SupportedExt.PNG,
57
- SupportedExt.JPG,
58
- SupportedExt.JPEG,
59
- SupportedExt.WEBP,
60
- SupportedExt.GIF,
61
- SupportedExt.TIFF,
62
- }
63
-
64
-
65
26
  _MARKER_CONVERSION_ERRORS = tuple()
66
27
 
67
28
 
68
29
  @Pipeline.register(PipelineType.MARKER)
69
30
  class MarkerPipeline(Pipeline):
70
- def __init__(
71
- self,
72
- marker_config: dict[str, Any] | None = None,
73
- *,
74
- device: Device = Device.CPU,
75
- ):
76
- super().__init__(device)
77
- if marker_config is None:
78
- marker_config = dict()
79
- self._marker_config = marker_config
80
-
81
31
  async def extract_content(
82
32
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
83
33
  ) -> AsyncGenerator[Result, None]:
@@ -85,7 +35,7 @@ class MarkerPipeline(Pipeline):
85
35
  from marker.converters.pdf import PdfConverter # noqa: PLC0415
86
36
  from marker.models import create_model_dict # noqa: PLC0415
87
37
 
88
- config = deepcopy(self._marker_config)
38
+ config = deepcopy(self._config.config)
89
39
  config["output_format"] = output_format.to_marker()
90
40
  config_parser = ConfigParser(config)
91
41
  renderer = config_parser.get_renderer()
@@ -98,15 +48,6 @@ class MarkerPipeline(Pipeline):
98
48
  for doc in docs:
99
49
  yield await _process_doc(doc, converter, output_format, output_path)
100
50
 
101
- @classmethod
102
- def _from_config(
103
- cls,
104
- config: MarkerPipelineConfig,
105
- *,
106
- device: Device = Device.CPU,
107
- ) -> Self:
108
- return cls(config.config, device=device)
109
-
110
51
 
111
52
  @report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
112
53
  async def _process_doc(
@@ -121,7 +62,9 @@ async def _process_doc(
121
62
  content, _, images = text_from_rendered(rendered)
122
63
  match output_format:
123
64
  case OutputFormat.MARKDOWN:
124
- output = _to_markdown_doc(doc, content, images, output_path)
65
+ output = _to_markdown_doc(
66
+ doc, content, images, output_path, page_sep=DEFAULT_MD_PAGE_SEP
67
+ )
125
68
  case _:
126
69
  raise NotImplementedError(f"unsupported output format {output_format}")
127
70
  input_doc = doc.without_content()
@@ -129,7 +72,12 @@ async def _process_doc(
129
72
 
130
73
 
131
74
  def _to_markdown_doc(
132
- input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
75
+ input_doc: InputDoc,
76
+ content: str,
77
+ images: dict[str, "Image"],
78
+ output_path: Path,
79
+ *,
80
+ page_sep: str = DEFAULT_MD_PAGE_SEP,
133
81
  ) -> MarkdownDoc:
134
82
  from marker.renderers.markdown import MarkdownRenderer # noqa: PLC0415
135
83
 
@@ -143,24 +91,9 @@ def _to_markdown_doc(
143
91
  im.save(artifacts_dir / im_name)
144
92
  del images
145
93
  gc.collect()
146
- page_sep = MarkdownRenderer.page_separator
147
- content = content.split(page_sep)
148
- n_pages = len(content)
149
- md_path = (output_path / md_dir_name / md_dir_name).with_suffix(
150
- OutputFormat.MARKDOWN.value
151
- )
152
- total_length = 0
153
- end_indices = []
154
- with md_path.open("w", encoding="utf-8") as f:
155
- for page_i, page_content in enumerate(content):
156
- content = page_content
157
- if page_i > 0:
158
- content += "\n"
159
- if page_i < n_pages - 1:
160
- content += page_sep
161
- total_length += len(content)
162
- end_indices.append(total_length)
163
- f.write(content)
164
- f.flush()
165
- pages = PageIndexes.from_page_end_indices(end_indices)
94
+ pages = content.split(MarkdownRenderer.page_separator)
95
+ md_path = output_path / md_dir_name / md_dir_name
96
+ md_path = md_path.with_suffix(OutputFormat.MARKDOWN.value)
97
+ with md_path.open("wb") as f:
98
+ pages = write_pages(pages, page_sep, f)
166
99
  return MarkdownDoc(path=Path(md_dir_name), pages=pages)
@@ -5,17 +5,13 @@ from collections.abc import AsyncGenerator, Callable, Iterable
5
5
  from functools import partial
6
6
  from pathlib import Path
7
7
  from tempfile import TemporaryDirectory
8
- from typing import Self
9
8
 
10
9
  from extract_core import (
11
10
  ConversionOutput,
12
- Device,
13
11
  InputDoc,
14
12
  MinerUBackend,
15
- MinerUConfig,
16
13
  MinerUPipelineConfig,
17
14
  OutputFormat,
18
- PageIndexes,
19
15
  Pipeline,
20
16
  PipelineType,
21
17
  Result,
@@ -23,7 +19,7 @@ from extract_core import (
23
19
  )
24
20
 
25
21
  from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
26
- from .utils import path_to_artifacts_dirname, reset_env
22
+ from .utils import path_to_artifacts_dirname, reset_env, write_pages
27
23
 
28
24
  _MINER_U_CONVERSION_ERRORS = tuple()
29
25
  MDMakeFunction = Callable[[list, str, str], str | None]
@@ -31,13 +27,10 @@ MDMakeFunction = Callable[[list, str, str], str | None]
31
27
 
32
28
  @Pipeline.register(PipelineType.MINER_U)
33
29
  class MinerUPipeline(Pipeline):
34
- def __init__(
35
- self, config: MinerUConfig, language: str, *, device: Device = Device.CPU
36
- ):
37
- super().__init__(device)
38
- self._config = config
39
- self._language = language
40
- self._md_make_fn = _parse_md_make_fn(config.backend)
30
+ def __init__(self, config: MinerUPipelineConfig):
31
+ super().__init__(config)
32
+ self._language = self._config.language
33
+ self._md_make_fn = _parse_md_make_fn(self._config.config.backend)
41
34
 
42
35
  async def extract_content(
43
36
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
@@ -59,7 +52,7 @@ class MinerUPipeline(Pipeline):
59
52
  pdf_file_names=pdfs_names,
60
53
  pdf_bytes_list=pdfs_bytes,
61
54
  p_lang_list=p_lang_list,
62
- **self._config.as_parse_kwargs(),
55
+ **self._config.config.as_parse_kwargs(),
63
56
  )
64
57
  res_paths = [
65
58
  _revert_mineru_output(workdir, pdf_filename=p) for p in pdfs_names
@@ -73,15 +66,6 @@ class MinerUPipeline(Pipeline):
73
66
  output_path=output_path,
74
67
  )
75
68
 
76
- @classmethod
77
- def _from_config(
78
- cls,
79
- config: MinerUPipelineConfig,
80
- *,
81
- device: Device = Device.CPU,
82
- ) -> Self:
83
- return cls(config.config, language=config.language, device=device)
84
-
85
69
 
86
70
  def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
87
71
  output_path = output_dir / pdf_filename
@@ -163,21 +147,9 @@ def _dump_md_content(
163
147
 
164
148
  if md_make_mode is None:
165
149
  md_make_mode = MakeMode.MM_MD
166
- total_length = 0
167
- end_indices = []
168
- with md_path.open("w") as f:
169
- n_pages = len(pdf_info)
170
- for page_i, page in enumerate(pdf_info):
171
- content = md_make_fn([page], md_make_mode, str(im_dir))
172
- if page_i > 0:
173
- content += "\n"
174
- if page_i < n_pages - 1:
175
- content += page_sep
176
- total_length += len(content)
177
- end_indices.append(total_length)
178
- f.write(content)
179
- f.flush()
180
- end_indices = PageIndexes.from_page_end_indices(end_indices)
150
+ pages = (md_make_fn([p], md_make_mode, str(im_dir)) for p in pdf_info)
151
+ with md_path.open("wb") as f:
152
+ pages = write_pages(pages, page_sep, f)
181
153
  output_path = md_path.parent.relative_to(output_path)
182
- output = ConversionOutput(path=output_path, pages=end_indices)
154
+ output = ConversionOutput(path=output_path, pages=pages)
183
155
  return output
@@ -5,9 +5,9 @@ from copy import copy
5
5
  from functools import wraps
6
6
  from itertools import tee
7
7
  from pathlib import Path, PurePath
8
- from typing import Protocol, TypeVar
8
+ from typing import BinaryIO, Protocol, TypeVar
9
9
 
10
- from extract_core import Error, InputDoc, Result, Status
10
+ from extract_core import Error, InputDoc, Pages, Result, Status
11
11
 
12
12
  R = TypeVar("R")
13
13
  In = TypeVar("In")
@@ -73,3 +73,16 @@ def reset_env() -> Generator[None, None, None]:
73
73
  finally:
74
74
  os.environ.clear()
75
75
  os.environ.update(old_env)
76
+
77
+
78
def write_pages(pages: Iterable[str], page_sep: str, out: BinaryIO) -> Pages:
    """Stream ``pages`` into ``out`` and index how many bytes each page takes.

    Every page but the last one is written followed by ``page_sep``; the
    separator is counted as part of the page it follows. Text is UTF-8 encoded
    before being written, so the recorded sizes are byte counts rather than
    character counts.

    Empty (or ``None``) pages are kept as pages without content instead of
    being dropped, so that the index always holds exactly one entry per input
    page and page numbers stay aligned with the source document.

    Args:
        pages: page contents, in page order; consumed lazily, one page at a
            time.
        page_sep: separator written between two consecutive pages.
        out: binary stream receiving the encoded pages.

    Returns:
        The ``Pages`` built from the per-page sizes reported by ``out.write``.
    """
    pages_byte_sizes = []
    pending = ""
    has_pending = False
    # Writing is delayed by one page: we only know whether a page must be
    # followed by the separator once we have seen that another page comes next.
    for page in pages:
        if has_pending:
            encoded = (pending + page_sep).encode("utf-8")
            pages_byte_sizes.append(out.write(encoded))
        pending = page or ""
        has_pending = True
    if has_pending:
        # The last page is written without a trailing separator
        pages_byte_sizes.append(out.write(pending.encode("utf-8")))
    return Pages.from_pages_bytes_sizes(pages_byte_sizes)
@@ -9,7 +9,7 @@ readme = "README.md"
9
9
  requires-python = ">=3.11,<3.14"
10
10
  dependencies = [
11
11
  "icij-common~=0.8.2",
12
- "extract-core~=0.5.5",
12
+ "extract-core~=0.6.0",
13
13
  ]
14
14
 
15
15
  [project.optional-dependencies]
@@ -51,14 +51,28 @@ override-dependencies = [
51
51
  "pillow==11.3.0",
52
52
  ]
53
53
 
54
+ [[tool.uv.index]]
55
+ name = "pytorch-cpu"
56
+ url = "https://download.pytorch.org/whl/cpu"
57
+ explicit = true
58
+
59
+
54
60
  [tool.uv.sources]
55
61
  extract-core = { path = "../extract-core", editable = true }
62
+ torch = [
63
+ { index = "pytorch-cpu" },
64
+ ]
65
+ torchvision = [
66
+ { index = "pytorch-cpu" },
67
+ ]
56
68
 
57
69
  [dependency-groups]
58
70
  dev = [
59
71
  "pytest~=8.3.5",
60
72
  "pytest-asyncio~=0.25.3",
61
73
  "ruff==0.15.2",
74
+ "torch==2.12.0",
75
+ "torchvision==0.27.0",
62
76
  ]
63
77
 
64
78
  [project.urls]