extract-python 0.2.1__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,28 +3,13 @@ import tempfile
3
3
  from collections.abc import AsyncGenerator, Iterable, Iterator
4
4
  from functools import cache
5
5
  from pathlib import Path
6
- from typing import Any, ClassVar, Literal, TypeVar
7
-
8
- from docling.backend.abstract_backend import AbstractDocumentBackend
9
- from docling.datamodel.base_models import InputFormat
10
- from docling.datamodel.document import ConversionResult
11
- from docling.datamodel.pipeline_options import (
12
- EasyOcrOptions,
13
- PdfPipelineOptions,
14
- PipelineOptions,
15
- VlmPipelineOptions,
16
- )
17
- from docling.document_converter import DocumentConverter, FormatOption
18
- from docling.models.factories import get_ocr_factory
19
- from docling.pipeline.base_pipeline import BasePipeline
20
- from docling_core.types.doc import ImageRefMode
21
- from docling_core.types.io import DocumentStream
6
+ from typing import TYPE_CHECKING, Annotated, ClassVar, TypeVar
7
+
22
8
  from icij_common.registrable import FromConfig
23
- from pydantic import Field, model_validator
9
+ from pydantic import AfterValidator, Field
24
10
 
25
11
  from .constants import ARTIFACTS, CPU_GROUP, DEFAULT_MD_PAGE_SEP
26
12
  from .objects import (
27
- BaseModel,
28
13
  Error,
29
14
  InputDoc,
30
15
  MarkdownDoc,
@@ -39,72 +24,43 @@ from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_di
39
24
 
40
25
  DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
41
26
 
27
+ if TYPE_CHECKING:
28
+ from docling.datamodel.base_models import InputFormat
29
+ from docling.datamodel.pipeline_options import PipelineOptions
30
+ from docling.document_converter import ConversionResult, FormatOption
31
+ from docling_core.types.io import DocumentStream
42
32
 
43
- class _PdfPipelineOptions(PdfPipelineOptions):
44
- generate_picture_images: bool = Field(default=True, frozen=True)
45
33
 
46
- @model_validator(mode="before")
47
- @classmethod
48
- def validate_ocr_options(cls, data: Any) -> Any:
49
- if isinstance(data, dict):
50
- ocr_options = data.get("ocr_options")
51
- if not isinstance(ocr_options, dict):
52
- return data
53
- allow_external_plugins = ocr_options.get("allow_external_plugins", False)
54
- ocr_factory = get_ocr_factory(allow_external_plugins=allow_external_plugins)
55
- kind = ocr_options.pop("kind")
56
- data["ocr_options"] = ocr_factory.create_options(kind=kind, **ocr_options)
57
- return data
58
-
59
-
60
- OptionsByPipeline = list[
61
- tuple[Literal["pdf"], _PdfPipelineOptions]
62
- | tuple[Literal["vlm"], VlmPipelineOptions]
63
- ]
64
-
65
-
66
- def _default_pipeline_options() -> OptionsByPipeline:
67
- pipeline_options = _PdfPipelineOptions(ocr_options=EasyOcrOptions())
68
- return [("pdf", pipeline_options), ("vlm", VlmPipelineOptions())]
69
-
70
-
71
- class DoclingFormatOption(BaseModel):
72
- pipeline_cls: str
73
- backend_cls: str
74
-
75
- def to_docling(
76
- self, pipeline_options: dict[Literal["pdf", "vlm"], PipelineOptions]
77
- ) -> FormatOption:
78
- pipeline_cls = _find_subcls(BasePipeline, self.pipeline_cls)
79
- backend_cls = _find_subcls(AbstractDocumentBackend, self.backend_cls)
80
- if "vlm" in self.pipeline_cls.lower():
81
- pipeline_options = pipeline_options.get("vlm")
82
- if pipeline_options is not None:
83
- pipeline_options = VlmPipelineOptions.model_validate(pipeline_options)
84
- elif "pdf" in self.pipeline_cls.lower():
85
- pipeline_options = pipeline_options.get("pdf")
86
- if pipeline_options is not None:
87
- pipeline_options = _PdfPipelineOptions.model_validate(pipeline_options)
88
- else:
89
- raise ValueError(
90
- f"invalid pipeline_cls: {pipeline_cls}, expected a VLM or PDF pipeline"
91
- )
92
- return FormatOption(
93
- pipeline_cls=pipeline_cls,
94
- pipeline_options=pipeline_options,
95
- backend=backend_cls,
96
- )
34
+ def _validate_pipeline_opts(opts: "PipelineOptions") -> None:
35
+ from docling.datamodel.pipeline_options import PdfPipelineOptions
36
+
37
+ if isinstance(opts, PdfPipelineOptions) and not opts.generate_picture_images:
38
+ msg = "generate_picture_images should be set to true"
39
+ raise ValueError(msg)
40
+
41
+
42
+ def _validate_options(
43
+ data: dict["InputFormat", "FormatOption"],
44
+ ) -> dict["InputFormat", "FormatOption"]:
45
+ for opts in data.values():
46
+ _validate_pipeline_opts(opts.pipeline_options)
47
+ return data
97
48
 
98
49
 
99
50
  @cache
100
- def _default_format_options() -> dict[InputFormat, DoclingFormatOption]:
101
- supported_fmt = {InputFormat.PDF}
51
+ def _default_format_opts() -> dict["InputFormat", "FormatOption"]:
52
+ from docling.datamodel.pipeline_options import (
53
+ EasyOcrOptions,
54
+ PdfPipelineOptions,
55
+ )
56
+ from docling.document_converter import PdfFormatOption
57
+
102
58
  return {
103
- fmt: DoclingFormatOption(
104
- pipeline_cls=opt.pipeline_cls.__name__, backend_cls=opt.backend.__name__
105
- )
106
- for fmt, opt in DocumentConverter().format_to_options.items()
107
- if fmt in supported_fmt
59
+ InputFormat.PDF: PdfFormatOption(
60
+ pipeline_options=PdfPipelineOptions(
61
+ ocr_options=EasyOcrOptions(), generate_picture_images=True
62
+ )
63
+ ),
108
64
  }
109
65
 
110
66
 
@@ -123,54 +79,41 @@ class DoclingPipelineConfig(PipelineConfig):
123
79
  pipeline: PipelineType = Field(frozen=True, default=PipelineType.DOCLING)
124
80
  task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
125
81
 
126
- pipeline_options: OptionsByPipeline = Field(
127
- default_factory=_default_pipeline_options
128
- )
129
- format_options: dict[InputFormat, DoclingFormatOption] = Field(
130
- default_factory=_default_format_options
131
- )
82
+ format_options: Annotated[
83
+ dict[InputFormat, FormatOption] | None, AfterValidator(_validate_options)
84
+ ] = Field(default_factory=_default_format_opts)
132
85
 
133
- def to_format_options(self) -> dict[InputFormat, FormatOption]:
134
- pipeline_options = dict(self.pipeline_options)
135
- return {
136
- InputFormat(f): opt.to_docling(pipeline_options)
137
- for f, opt in self.format_options.items()
138
- }
86
+ _unsupported_input_formats: ClassVar[set[InputFormat]] = {
87
+ InputFormat.AUDIO,
88
+ InputFormat.METS_GBS,
89
+ InputFormat.VTT,
90
+ }
139
91
 
140
92
  @classmethod
141
93
  @cache
142
- def supported_formats(cls) -> set[SupportedExt]:
143
- # Subset of https://docling-project.github.io/docling/usage/supported_formats/
144
- return {
145
- SupportedExt.ADOC,
146
- SupportedExt.ASCIIDOC,
147
- SupportedExt.BMP,
148
- SupportedExt.CSV,
149
- SupportedExt.DOCX,
150
- SupportedExt.HTLM,
151
- SupportedExt.JPG,
152
- SupportedExt.MD,
153
- SupportedExt.PDF,
154
- SupportedExt.PNG,
155
- SupportedExt.PPTX,
156
- SupportedExt.TEX,
157
- SupportedExt.TIFF,
158
- SupportedExt.TXT,
159
- SupportedExt.WEBP,
160
- SupportedExt.XHTML,
161
- SupportedExt.XLSX,
162
- }
163
-
164
-
165
- DEFAULT_FORMAT_OPTIONS = DoclingPipelineConfig().to_format_options()
94
+ def supported_exts(cls) -> set[SupportedExt]:
95
+ from docling.datamodel.base_models import FormatToExtensions, InputFormat
96
+
97
+ supported = set()
98
+ for f in InputFormat:
99
+ if f in cls._unsupported_input_formats:
100
+ continue
101
+ for ext in FormatToExtensions[f]:
102
+ supported.add(SupportedExt(f".{ext.lower()}"))
103
+ return supported
166
104
 
167
105
 
168
106
  @Pipeline.register(PipelineType.DOCLING)
169
107
  class DoclingPipeline(Pipeline):
170
108
  def __init__(self, format_options: dict[InputFormat, FormatOption] | None = None):
171
- if format_options is None:
172
- format_options = DEFAULT_FORMAT_OPTIONS
173
- self._converter = DocumentConverter(format_options=format_options)
109
+ from docling.document_converter import DocumentConverter
110
+
111
+ allowed_format = [
112
+ f.to_docling() for f in DoclingPipelineConfig.supported_exts()
113
+ ]
114
+ self._converter = DocumentConverter(
115
+ allowed_formats=allowed_format, format_options=format_options
116
+ )
174
117
 
175
118
  async def extract_content(
176
119
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
@@ -182,16 +125,16 @@ class DoclingPipeline(Pipeline):
182
125
 
183
126
  @classmethod
184
127
  def _from_config(cls, config: DoclingPipelineConfig) -> FromConfig:
185
- return cls(config.to_format_options())
128
+ return cls(config.format_options)
186
129
 
187
130
 
188
- def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | DocumentStream]:
131
+ def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | "DocumentStream"]:
189
132
  for d in docs:
190
133
  yield d.to_docling()
191
134
 
192
135
 
193
136
  def _to_result(
194
- res: ConversionResult,
137
+ res: "ConversionResult",
195
138
  input_document: InputDoc,
196
139
  output_format: OutputFormat,
197
140
  output_path: Path,
@@ -212,11 +155,13 @@ def _to_result(
212
155
 
213
156
 
214
157
  def _to_markdown_doc(
215
- res: ConversionResult,
158
+ res: "ConversionResult",
216
159
  output_path: Path,
217
160
  page_sep: str = DEFAULT_MD_PAGE_SEP,
218
161
  **kwargs,
219
162
  ) -> MarkdownDoc:
163
+ from docling_core.types.doc import ImageRefMode
164
+
220
165
  # TODO: Should we add a hash to avoid collision between files with same names
221
166
  # nested in the tree structured
222
167
  md_dir_name = path_to_artifacts_dirname(res.input.file)
extract_python/marker_.py CHANGED
@@ -3,14 +3,8 @@ from collections.abc import AsyncGenerator, Iterable
3
3
  from copy import deepcopy
4
4
  from functools import cache
5
5
  from pathlib import Path
6
- from typing import Any, ClassVar, Self
7
-
8
- from marker.config.parser import ConfigParser
9
- from marker.converters.pdf import PdfConverter
10
- from marker.models import create_model_dict
11
- from marker.output import text_from_rendered
12
- from marker.renderers.markdown import MarkdownRenderer
13
- from PIL.Image import Image
6
+ from typing import TYPE_CHECKING, Any, ClassVar, Self
7
+
14
8
  from pydantic import Field
15
9
 
16
10
  from .constants import ARTIFACTS, CPU_GROUP
@@ -26,6 +20,10 @@ from .objects import (
26
20
  from .pipeline import Pipeline, PipelineConfig, PipelineType
27
21
  from .utils import path_to_artifacts_dirname, report_recoverable_errors
28
22
 
23
+ if TYPE_CHECKING:
24
+ from marker.converters.pdf import PdfConverter
25
+ from PIL import Image
26
+
29
27
 
30
28
  @PipelineConfig.register()
31
29
  class MarkerPipelineConfig(PipelineConfig):
@@ -36,7 +34,7 @@ class MarkerPipelineConfig(PipelineConfig):
36
34
 
37
35
  @classmethod
38
36
  @cache
39
- def supported_formats(cls) -> set[SupportedExt]:
37
+ def supported_exts(cls) -> set[SupportedExt]:
40
38
  # Subset of https://documentation.datalab.to/docs/common/supportedfiletypes
41
39
  return {
42
40
  SupportedExt.PDF,
@@ -75,6 +73,10 @@ class MarkerPipeline(Pipeline):
75
73
  async def extract_content(
76
74
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
77
75
  ) -> AsyncGenerator[Result, None]:
76
+ from marker.config.parser import ConfigParser
77
+ from marker.converters.pdf import PdfConverter
78
+ from marker.models import create_model_dict
79
+
78
80
  config = deepcopy(self._marker_config)
79
81
  config["output_format"] = output_format.to_marker()
80
82
  config_parser = ConfigParser(config)
@@ -96,10 +98,12 @@ class MarkerPipeline(Pipeline):
96
98
  @report_recoverable_errors(_MARKER_CONVERSION_ERRORS)
97
99
  def _process_doc(
98
100
  doc: InputDoc,
99
- converter: PdfConverter,
101
+ converter: "PdfConverter",
100
102
  output_format: OutputFormat,
101
103
  output_path: Path,
102
104
  ) -> Result:
105
+ from marker.output import text_from_rendered
106
+
103
107
  rendered = converter(str(doc.path))
104
108
  content, _, images = text_from_rendered(rendered)
105
109
  match output_format:
@@ -112,8 +116,10 @@ def _process_doc(
112
116
 
113
117
 
114
118
  def _to_markdown_doc(
115
- input_doc: InputDoc, content: str, images: dict[str, Image], output_path: Path
119
+ input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
116
120
  ) -> MarkdownDoc:
121
+ from marker.renderers.markdown import MarkdownRenderer
122
+
117
123
  # TODO: Should we add a hash to avoid collision between files with same names
118
124
  # nested in the tree structured
119
125
  md_dir_name = path_to_artifacts_dirname(input_doc.path)
extract_python/miner_u.py CHANGED
@@ -8,12 +8,6 @@ from pathlib import Path
8
8
  from tempfile import TemporaryDirectory
9
9
  from typing import Any, ClassVar, Self
10
10
 
11
- from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
12
- union_make as pipeline_union_make,
13
- )
14
- from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make as vlm_union_make
15
- from mineru.cli.common import aio_do_parse
16
- from mineru.utils.enum_class import MakeMode
17
11
  from pydantic import Field
18
12
  from pydantic_extra_types.language_code import LanguageAlpha2
19
13
 
@@ -47,33 +41,39 @@ class MinerUConfig(BaseModel):
47
41
  # TODO: use enum or literal here
48
42
  parse_method: str = "auto"
49
43
 
50
- default_kwargs: ClassVar[dict] = {
51
- "server_url": None,
52
- # We don't dump md directly we process, we dump the middle json in order to be
53
- # able to get page indexes
54
- "parse_method": "auto",
55
- "dump_md": False,
56
- "dump_middle_json": True,
57
- "f_draw_layout_bbox": False,
58
- "f_draw_span_bbox": False,
59
- "f_dump_model_output": False, # might be useful for debug though
60
- "f_dump_orig_pdf": False,
61
- "f_dump_content_list": False, # might be useful for debug though
62
- "start_page_id": 0,
63
- "f_make_md_mode": MakeMode.MM_MD,
64
- "image_analysis": True,
65
- "end_page_id": None,
66
- "client_side_output_generation": False,
67
- }
68
-
69
44
  def as_parse_kwargs(self) -> dict[str, Any]:
70
- kwargs = copy(self.default_kwargs)
45
+ kwargs = copy(self._get_default_kwargs())
71
46
  kwargs["backend"] = self.backend
72
47
  kwargs["parse_method"] = self.parse_method
73
48
  kwargs["formula_enable"] = self.enable_formula_extraction
74
49
  kwargs["table_enable"] = self.enable_table_extraction
75
50
  return kwargs
76
51
 
52
+ @classmethod
53
+ @cache
54
+ def _get_default_kwargs(cls) -> dict[str, Any]:
55
+
56
+ from mineru.utils.enum_class import MakeMode
57
+
58
+ return {
59
+ "server_url": None,
60
+ # We don't dump md directly we process, we dump the middle json in order to be
61
+ # able to get page indexes
62
+ "parse_method": "auto",
63
+ "dump_md": False,
64
+ "dump_middle_json": True,
65
+ "f_draw_layout_bbox": False,
66
+ "f_draw_span_bbox": False,
67
+ "f_dump_model_output": False, # might be useful for debug though
68
+ "f_dump_orig_pdf": False,
69
+ "f_dump_content_list": False, # might be useful for debug though
70
+ "start_page_id": 0,
71
+ "f_make_md_mode": MakeMode.MM_MD,
72
+ "image_analysis": True,
73
+ "end_page_id": None,
74
+ "client_side_output_generation": False,
75
+ }
76
+
77
77
 
78
78
  @PipelineConfig.register() # noqa: F821
79
79
  class MinerUPipelineConfig(PipelineConfig): # noqa: F821
@@ -85,7 +85,7 @@ class MinerUPipelineConfig(PipelineConfig): # noqa: F821
85
85
 
86
86
  @classmethod
87
87
  @cache
88
- def supported_formats(cls) -> set[SupportedExt]:
88
+ def supported_exts(cls) -> set[SupportedExt]:
89
89
  return {
90
90
  SupportedExt.PDF,
91
91
  SupportedExt.DOCX,
@@ -104,6 +104,8 @@ class MinerUPipeline(Pipeline):
104
104
  async def extract_content(
105
105
  self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
106
106
  ) -> AsyncGenerator[Result, None]:
107
+ from mineru.cli.common import aio_do_parse
108
+
107
109
  docs = list(docs)
108
110
  # TODO: exclude files which are not pdf and return an error
109
111
  pdfs_bytes = [d.path.read_bytes() for d in docs]
@@ -149,11 +151,18 @@ def _revert_mineru_output(output_dir: Path, *, pdf_filename: str) -> Path:
149
151
 
150
152
 
151
153
  def _parse_md_make_fn(backend: MinerUBackend) -> MDMakeFunction:
154
+
152
155
  match backend:
153
156
  case MinerUBackend.PIPELINE:
154
- return pipeline_union_make
157
+ from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
158
+ union_make,
159
+ )
160
+
161
+ return union_make
155
162
  case MinerUBackend.VLM:
156
- return vlm_union_make
163
+ from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make
164
+
165
+ return union_make
157
166
  case _:
158
167
  raise ValueError(f"Unsupported backend: {backend}")
159
168
 
@@ -201,8 +210,12 @@ def _dump_md_content(
201
210
  output_path: Path,
202
211
  md_path: Path,
203
212
  im_dir: Path,
204
- md_make_mode: str = MakeMode.MM_MD,
213
+ md_make_mode: str | None = None,
205
214
  ) -> ConversionOutput:
215
+ from mineru.utils.enum_class import MakeMode
216
+
217
+ if md_make_mode is None:
218
+ md_make_mode = MakeMode.MM_MD
206
219
  total_length = 0
207
220
  end_indices = []
208
221
  with md_path.open("w") as f:
extract_python/objects.py CHANGED
@@ -21,7 +21,12 @@ from pydantic import AfterValidator, RootModel, TypeAdapter
21
21
  from pydantic import BaseModel as _BaseModel
22
22
 
23
23
  try:
24
- from docling.datamodel.base_models import ConversionStatus, ErrorItem, InputFormat
24
+ from docling.datamodel.base_models import (
25
+ ConversionStatus,
26
+ ErrorItem,
27
+ FormatToExtensions,
28
+ InputFormat,
29
+ )
25
30
  from docling.datamodel.document import InputDocument
26
31
  from docling_core.types.io import DocumentStream
27
32
  except ImportError:
@@ -33,42 +38,78 @@ logger = logging.getLogger(__name__)
33
38
  base_config = merge_configs(icij_config(), no_enum_values_config())
34
39
 
35
40
 
41
+ @cache
42
+ def _ext_to_docling_input_format() -> dict:
43
+ from .docling_ import DoclingPipelineConfig # noqa: PLC0415
44
+
45
+ mapping = dict()
46
+ supported = DoclingPipelineConfig.supported_exts()
47
+ for input_f, exts in FormatToExtensions.items():
48
+ for ext in exts:
49
+ try:
50
+ ext = SupportedExt(f".{ext.lower()}") # noqa: PLW2901
51
+ except ValueError:
52
+ continue
53
+ if ext in supported:
54
+ mapping[ext] = input_f
55
+ return mapping
56
+
57
+
36
58
  class BaseModel(_BaseModel):
37
59
  model_config = base_config
38
60
 
39
61
 
40
62
  class SupportedExt(StrEnum):
41
63
  ADOC = ".adoc"
64
+ ASC = ".asc"
42
65
  ASCIIDOC = ".asciidoc"
43
66
  BMP = ".bmp"
44
67
  CSV = ".csv"
45
68
  DOC = ".doc"
46
69
  DOCX = ".docx"
70
+ DOTX = ".dotx"
71
+ DOTM = ".dotm"
72
+ DOCM = ".docm"
47
73
  EPUB = ".epub"
48
74
  GIF = ".gif"
49
75
  HTLM = ".html"
76
+ HTM = ".htm"
50
77
  JPEG = ".jpeg"
51
78
  JPG = ".jpg"
79
+ JSON = ".json"
80
+ LATEX = ".latex"
52
81
  MD = ".md"
82
+ NXML = ".nxml"
53
83
  ODP = ".odp"
54
84
  ODS = ".ods"
55
85
  ODT = ".odt"
56
86
  PDF = ".pdf"
57
87
  PNG = ".png"
88
+ PPSX = ".ppsx"
58
89
  PPT = ".ppt"
90
+ PPTM = ".pptm"
91
+ PPSM = ".ppsm"
92
+ POTX = ".potx"
93
+ POTM = ".potm"
59
94
  PPTX = ".pptx"
95
+ QMD = ".qmd"
96
+ RMD = ".rmd"
60
97
  TEX = ".tex"
98
+ TIF = ".tif"
61
99
  TIFF = ".tiff"
62
100
  TXT = ".txt"
101
+ TEXT = ".text"
63
102
  WEBP = ".webp"
103
+ XBRL = ".xbrl"
64
104
  XHTML = ".xhtml"
65
105
  XLS = ".xls"
66
106
  XLSM = ".xlsm"
67
107
  XLSX = ".xlsx"
68
108
  XLTX = ".xltx"
109
+ XML = ".xml"
69
110
 
70
111
  def to_docling(self) -> InputFormat:
71
- return InputFormat(self.value[1:])
112
+ return _ext_to_docling_input_format()[self]
72
113
 
73
114
 
74
115
  class OutputFormat(StrEnum):
@@ -30,7 +30,7 @@ class PipelineConfig(RegistrableConfig, ABC):
30
30
 
31
31
  @classmethod
32
32
  @abstractmethod
33
- def supported_formats(cls) -> set[SupportedExt]: ...
33
+ def supported_exts(cls) -> set[SupportedExt]: ...
34
34
 
35
35
 
36
36
  class Pipeline(RegistrableFromConfig, ABC):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: extract-python
3
- Version: 0.2.1
3
+ Version: 0.3.1
4
4
  Summary: Structured content extraction
5
5
  Project-URL: Homepage, https://github.com/ICIJ/extract-python
6
6
  Project-URL: Repository, https://github.com/ICIJ/extract-python
@@ -0,0 +1,11 @@
1
+ extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
2
+ extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
3
+ extract_python/docling_.py,sha256=lWWQ2PT5qOFUcJkeKw8ibF4JxzxQBgf93_CfvNcykDg,7041
4
+ extract_python/marker_.py,sha256=ocRFxWX__A-M31z7Qr67OMcWRvgGO_C3tyZpiKc-bXw,5027
5
+ extract_python/miner_u.py,sha256=hwRFTvtWGN_mRuv0p6H7nKS89dTErQxI1yOrvh6238M,8010
6
+ extract_python/objects.py,sha256=kCxg6m7j01Z1sNAGi8vniokMnw6Ry0gU2lXXH2uan8A,8744
7
+ extract_python/pipeline.py,sha256=VhDvfCxMEKvhFbMA-yxWO7FEeErDoLQCHiTRNnrbI8Y,1204
8
+ extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
9
+ extract_python-0.3.1.dist-info/METADATA,sha256=qtfZpwEIKgWzkfbxGYMVP-pNFMFAbLrZo1-hmDXcgvE,1132
10
+ extract_python-0.3.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
+ extract_python-0.3.1.dist-info/RECORD,,
@@ -1,11 +0,0 @@
1
- extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
2
- extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
3
- extract_python/docling_.py,sha256=L7RPOYTN4kuodsgfi7NVsXILRerbAZUDaYxqVofxOog,9267
4
- extract_python/marker_.py,sha256=EUGpBRAe9mE0QbSdMFdvE16_m-c-DeAtwZ8F79w2Mcg,4908
5
- extract_python/miner_u.py,sha256=Bse66I5Yj-PiOgejr3JjXXDkjCh46M9KuwTEB8QK5g4,7750
6
- extract_python/objects.py,sha256=hqa9ONk9KwylvQa6DvKIEQnnCgfy-T-d5SU2LpfmTcQ,7815
7
- extract_python/pipeline.py,sha256=0qkuqEcxEbc3_sy8gNbLPwq8IIlC8cfGaqk_5fNpOCM,1207
8
- extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
9
- extract_python-0.2.1.dist-info/METADATA,sha256=iaIXzaha4s-kqJzkedwJn5VsvQGZhy3KJQJpDns8pR4,1132
10
- extract_python-0.2.1.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
11
- extract_python-0.2.1.dist-info/RECORD,,