extract-python 0.6.0__tar.gz → 0.7.1__tar.gz

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.6.0
3
+ Version: 0.7.1
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
7
7
  Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
8
8
  Author-email: Clément Doumouro <cdoumouro@icij.org>
9
- Requires-Python: <3.14,>=3.11
10
- Requires-Dist: extract-core~=0.6.0
9
+ Requires-Python: <3.15,>=3.13
10
+ Requires-Dist: extract-core~=0.7.0
11
11
  Requires-Dist: icij-common~=0.8.2
12
12
  Provides-Extra: benches
13
13
  Requires-Dist: html2image~=2.0.7; extra == 'benches'
@@ -3,7 +3,7 @@ from tempfile import TemporaryDirectory
3
3
 
4
4
  import markdown2
5
5
  import pypdfium2
6
- from extract_core import BaseModel, OutputFormat, PageIndexes
6
+ from extract_core import BaseModel, OutputFormat, Pages
7
7
  from extract_python.utils import chdir
8
8
  from html2image import Html2Image
9
9
  from PIL import Image, ImageDraw
@@ -93,7 +93,7 @@ def side_by_side_md_page_comp(
93
93
  if len(md_files) != 1:
94
94
  msg = f"unexpected number of md files ({len(md_files)}) in {compared_path}"
95
95
  raise ValueError(msg)
96
- md_content = md_files[0].read_text()[page_ix[0] : page_ix[1]]
96
+ md_content = (md_files[0].read_bytes()[page_ix[0] : page_ix[1]]).decode()
97
97
  # change the current dir so that the browser renders images properly
98
98
  with chdir(compared_path):
99
99
  md_page_im = _render_md(md_content, compared_path, html_size=ref_im.size)
@@ -140,9 +140,9 @@ def _scan_pages(
140
140
  root: Path, comparison: ComparisonItem
141
141
  ) -> list[dict[str, tuple[int, int]]]:
142
142
  all_pages = [
143
- PageIndexes.model_validate_json(
143
+ Pages.model_validate_json(
144
144
  (root / compared / "artifacts" / "pages.json").read_text()
145
- ).root
145
+ )
146
146
  for compared in comparison.compared
147
147
  ]
148
148
  all_pages = zip(*all_pages, strict=True)
@@ -0,0 +1,2 @@
1
+ ARTIFACTS = "artifacts"
2
+ DEFAULT_MD_PAGE_SEP = '\n<div style="page-break-after: always;"></div>\n'
@@ -23,7 +23,6 @@ from extract_core import (
23
23
  InputDoc,
24
24
  MarkdownDoc,
25
25
  OutputFormat,
26
- PageIndexes,
27
26
  Pipeline,
28
27
  PipelineType,
29
28
  Result,
@@ -34,7 +33,7 @@ from pydantic import ConfigDict, field_serializer
34
33
  from pydantic_core.core_schema import SerializerFunctionWrapHandler
35
34
 
36
35
  from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
37
- from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
36
+ from .utils import chdir, map_and_preserve, path_to_artifacts_dirname, write_pages
38
37
 
39
38
  logger = logging.getLogger(__name__)
40
39
 
@@ -115,39 +114,38 @@ def _to_markdown_doc(
115
114
  raise FileExistsError(f"directory {md_dir} already exists")
116
115
  # Let's avoid issue of duplicated input file names flattened top level
117
116
  md_filename = md_dir_name + OutputFormat.MARKDOWN
118
- total_length = 0
119
- n_pages = len(res.pages)
120
-
121
117
  with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as td:
122
118
  tmp_dir = Path(td)
123
- page_path = Path("page.md")
124
- # We do a chdir to bypass a Docling bug which only allows to maintain relative
125
- # image ref when saving the markdown to a relative path
126
- with (tmp_dir / md_filename).open("w") as f, chdir(tmp_dir):
127
- end_indices = []
128
- for page_i in range(n_pages):
129
- res.document.save_as_markdown(
130
- page_path,
131
- page_no=page_i + 1,
132
- image_mode=ImageRefMode.REFERENCED,
133
- artifacts_dir=Path(ARTIFACTS),
134
- **kwargs,
135
- )
136
- content = page_path.read_text()
137
- if page_i > 0:
138
- content += "\n"
139
- if page_i < n_pages - 1:
140
- content += page_sep
141
- total_length += len(content)
142
- end_indices.append(total_length)
143
- f.write(content)
144
- f.flush()
145
- page_path.unlink()
119
+ md_path = tmp_dir / md_filename
120
+ current_page_path = tmp_dir / "page.md"
121
+ with chdir(tmp_dir):
122
+ # We do a chdir to bypass a Docling bug which only allows to maintain
123
+ # relative image ref when saving the markdown to a relative path
124
+ pages = _docling_pages_it(res, current_page_path, **kwargs)
125
+ with md_path.open("wb") as f:
126
+ pages = write_pages(pages, page_sep, f)
127
+ # Clean up the tmp page file before move everything to the end destination
128
+ current_page_path.unlink(missing_ok=True)
146
129
  shutil.move(tmp_dir, md_dir)
147
- pages = PageIndexes.from_page_end_indices(end_indices)
148
130
  return MarkdownDoc(path=Path(md_dir_name), pages=pages)
149
131
 
150
132
 
133
+ def _docling_pages_it(
134
+ res: ConversionResult, output_path: Path, **kwargs
135
+ ) -> Iterable[str]:
136
+ n_pages = len(res.pages)
137
+ for page_i in range(n_pages):
138
+ res.document.save_as_markdown(
139
+ output_path,
140
+ page_no=page_i + 1,
141
+ image_mode=ImageRefMode.REFERENCED,
142
+ artifacts_dir=Path(ARTIFACTS),
143
+ **kwargs,
144
+ )
145
+ content = output_path.read_text()
146
+ yield content
147
+
148
+
151
149
  class SerializableFormatOptions(DoclingFormatOption):
152
150
  # Utility class to serialize Python format options into a JSON which can be
153
151
  # correctly deserialized into a docling FormatOption
@@ -9,15 +9,14 @@ from extract_core import (
9
9
  InputDoc,
10
10
  MarkdownDoc,
11
11
  OutputFormat,
12
- PageIndexes,
13
12
  Pipeline,
14
13
  PipelineType,
15
14
  Result,
16
15
  Status,
17
16
  )
18
17
 
19
- from .constants import ARTIFACTS
20
- from .utils import path_to_artifacts_dirname, report_recoverable_errors
18
+ from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
19
+ from .utils import path_to_artifacts_dirname, report_recoverable_errors, write_pages
21
20
 
22
21
  if TYPE_CHECKING:
23
22
  from marker.converters.pdf import PdfConverter
@@ -63,7 +62,9 @@ async def _process_doc(
63
62
  content, _, images = text_from_rendered(rendered)
64
63
  match output_format:
65
64
  case OutputFormat.MARKDOWN:
66
- output = _to_markdown_doc(doc, content, images, output_path)
65
+ output = _to_markdown_doc(
66
+ doc, content, images, output_path, page_sep=DEFAULT_MD_PAGE_SEP
67
+ )
67
68
  case _:
68
69
  raise NotImplementedError(f"unsupported output format {output_format}")
69
70
  input_doc = doc.without_content()
@@ -71,7 +72,12 @@ async def _process_doc(
71
72
 
72
73
 
73
74
  def _to_markdown_doc(
74
- input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
75
+ input_doc: InputDoc,
76
+ content: str,
77
+ images: dict[str, "Image"],
78
+ output_path: Path,
79
+ *,
80
+ page_sep: str = DEFAULT_MD_PAGE_SEP,
75
81
  ) -> MarkdownDoc:
76
82
  from marker.renderers.markdown import MarkdownRenderer # noqa: PLC0415
77
83
 
@@ -85,24 +91,9 @@ def _to_markdown_doc(
85
91
  im.save(artifacts_dir / im_name)
86
92
  del images
87
93
  gc.collect()
88
- page_sep = MarkdownRenderer.page_separator
89
- content = content.split(page_sep)
90
- n_pages = len(content)
91
- md_path = (output_path / md_dir_name / md_dir_name).with_suffix(
92
- OutputFormat.MARKDOWN.value
93
- )
94
- total_length = 0
95
- end_indices = []
96
- with md_path.open("w", encoding="utf-8") as f:
97
- for page_i, page_content in enumerate(content):
98
- content = page_content
99
- if page_i > 0:
100
- content += "\n"
101
- if page_i < n_pages - 1:
102
- content += page_sep
103
- total_length += len(content)
104
- end_indices.append(total_length)
105
- f.write(content)
106
- f.flush()
107
- pages = PageIndexes.from_page_end_indices(end_indices)
94
+ pages = content.split(MarkdownRenderer.page_separator)
95
+ md_path = output_path / md_dir_name / md_dir_name
96
+ md_path = md_path.with_suffix(OutputFormat.MARKDOWN.value)
97
+ with md_path.open("wb") as f:
98
+ pages = write_pages(pages, page_sep, f)
108
99
  return MarkdownDoc(path=Path(md_dir_name), pages=pages)
@@ -12,7 +12,6 @@ from extract_core import (
12
12
  MinerUBackend,
13
13
  MinerUPipelineConfig,
14
14
  OutputFormat,
15
- PageIndexes,
16
15
  Pipeline,
17
16
  PipelineType,
18
17
  Result,
@@ -20,7 +19,7 @@ from extract_core import (
20
19
  )
21
20
 
22
21
  from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
23
- from .utils import path_to_artifacts_dirname, reset_env
22
+ from .utils import path_to_artifacts_dirname, reset_env, write_pages
24
23
 
25
24
  _MINER_U_CONVERSION_ERRORS = tuple()
26
25
  MDMakeFunction = Callable[[list, str, str], str | None]
@@ -148,21 +147,9 @@ def _dump_md_content(
148
147
 
149
148
  if md_make_mode is None:
150
149
  md_make_mode = MakeMode.MM_MD
151
- total_length = 0
152
- end_indices = []
153
- with md_path.open("w") as f:
154
- n_pages = len(pdf_info)
155
- for page_i, page in enumerate(pdf_info):
156
- content = md_make_fn([page], md_make_mode, str(im_dir))
157
- if page_i > 0:
158
- content += "\n"
159
- if page_i < n_pages - 1:
160
- content += page_sep
161
- total_length += len(content)
162
- end_indices.append(total_length)
163
- f.write(content)
164
- f.flush()
165
- end_indices = PageIndexes.from_page_end_indices(end_indices)
150
+ pages = (md_make_fn([p], md_make_mode, str(im_dir)) for p in pdf_info)
151
+ with md_path.open("wb") as f:
152
+ pages = write_pages(pages, page_sep, f)
166
153
  output_path = md_path.parent.relative_to(output_path)
167
- output = ConversionOutput(path=output_path, pages=end_indices)
154
+ output = ConversionOutput(path=output_path, pages=pages)
168
155
  return output
@@ -5,9 +5,9 @@ from copy import copy
5
5
  from functools import wraps
6
6
  from itertools import tee
7
7
  from pathlib import Path, PurePath
8
- from typing import Protocol, TypeVar
8
+ from typing import BinaryIO, Protocol, TypeVar
9
9
 
10
- from extract_core import Error, InputDoc, Result, Status
10
+ from extract_core import Error, InputDoc, Pages, Result, Status
11
11
 
12
12
  R = TypeVar("R")
13
13
  In = TypeVar("In")
@@ -73,3 +73,16 @@ def reset_env() -> Generator[None, None, None]:
73
73
  finally:
74
74
  os.environ.clear()
75
75
  os.environ.update(old_env)
76
+
77
+
78
+ def write_pages(pages: Iterable[str], page_sep: str, out: BinaryIO) -> Pages:
79
+ pages_byte_sizes = []
80
+ pages = iter(pages)
81
+ content = None
82
+ for p in pages:
83
+ if content:
84
+ pages_byte_sizes.append(out.write((content + page_sep).encode()))
85
+ content = p
86
+ if content:
87
+ pages_byte_sizes.append(out.write(content.encode()))
88
+ return Pages.from_pages_bytes_sizes(pages_byte_sizes)
@@ -6,10 +6,10 @@ authors = [
6
6
  { name = "Clément Doumouro", email = "cdoumouro@icij.org" },
7
7
  ]
8
8
  readme = "README.md"
9
- requires-python = ">=3.11,<3.14"
9
+ requires-python = ">=3.13,<3.15"
10
10
  dependencies = [
11
11
  "icij-common~=0.8.2",
12
- "extract-core~=0.6.0",
12
+ "extract-core~=0.7.0",
13
13
  ]
14
14
 
15
15
  [project.optional-dependencies]
@@ -1,2 +0,0 @@
1
- ARTIFACTS = "artifacts"
2
- DEFAULT_MD_PAGE_SEP = '<div style="page-break-after: always;"></div>'
File without changes
File without changes