extract-python 0.3.1__tar.gz → 0.3.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {extract_python-0.3.1 → extract_python-0.3.2}/PKG-INFO +1 -1
- {extract_python-0.3.1 → extract_python-0.3.2}/extract_python/docling_.py +16 -16
- {extract_python-0.3.1 → extract_python-0.3.2}/extract_python/marker_.py +5 -5
- {extract_python-0.3.1 → extract_python-0.3.2}/extract_python/miner_u.py +9 -8
- {extract_python-0.3.1 → extract_python-0.3.2}/.dockerignore +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/.github/workflows/publish.yml +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/.github/workflows/tests.yml +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/.gitignore +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/.python-version +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/Dockerfile +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/README.md +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/benches/__init__.py +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/benches/compare.ipynb +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/benches/compare.py +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/benches/constants.py +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/data/.gitignore +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/docker-compose.yml +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/extract +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/extract_python/__init__.py +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/extract_python/constants.py +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/extract_python/objects.py +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/extract_python/pipeline.py +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/extract_python/utils.py +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/pyproject.toml +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/qa/ruff.toml +0 -0
- {extract_python-0.3.1 → extract_python-0.3.2}/uv.lock +0 -0
|
@@ -32,7 +32,7 @@ if TYPE_CHECKING:
|
|
|
32
32
|
|
|
33
33
|
|
|
34
34
|
def _validate_pipeline_opts(opts: "PipelineOptions") -> None:
|
|
35
|
-
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
|
35
|
+
from docling.datamodel.pipeline_options import PdfPipelineOptions # noqa: PLC0415
|
|
36
36
|
|
|
37
37
|
if isinstance(opts, PdfPipelineOptions) and not opts.generate_picture_images:
|
|
38
38
|
msg = "generate_picture_images should be set to true"
|
|
@@ -49,11 +49,11 @@ def _validate_options(
|
|
|
49
49
|
|
|
50
50
|
@cache
|
|
51
51
|
def _default_format_opts() -> dict["InputFormat", "FormatOption"]:
|
|
52
|
-
from docling.datamodel.pipeline_options import (
|
|
52
|
+
from docling.datamodel.pipeline_options import ( # noqa: PLC0415
|
|
53
53
|
EasyOcrOptions,
|
|
54
54
|
PdfPipelineOptions,
|
|
55
55
|
)
|
|
56
|
-
from docling.document_converter import PdfFormatOption
|
|
56
|
+
from docling.document_converter import PdfFormatOption # noqa: PLC0415
|
|
57
57
|
|
|
58
58
|
return {
|
|
59
59
|
InputFormat.PDF: PdfFormatOption(
|
|
@@ -80,23 +80,21 @@ class DoclingPipelineConfig(PipelineConfig):
|
|
|
80
80
|
task_group: ClassVar[str] = Field(frozen=True, default=CPU_GROUP)
|
|
81
81
|
|
|
82
82
|
format_options: Annotated[
|
|
83
|
-
dict[InputFormat, FormatOption] | None, AfterValidator(_validate_options)
|
|
83
|
+
dict["InputFormat", "FormatOption"] | None, AfterValidator(_validate_options)
|
|
84
84
|
] = Field(default_factory=_default_format_opts)
|
|
85
85
|
|
|
86
|
-
_unsupported_input_formats: ClassVar[set[InputFormat]] = {
|
|
87
|
-
InputFormat.AUDIO,
|
|
88
|
-
InputFormat.METS_GBS,
|
|
89
|
-
InputFormat.VTT,
|
|
90
|
-
}
|
|
91
|
-
|
|
92
86
|
@classmethod
|
|
93
87
|
@cache
|
|
94
88
|
def supported_exts(cls) -> set[SupportedExt]:
|
|
95
|
-
from docling.datamodel.base_models import
|
|
89
|
+
from docling.datamodel.base_models import ( # noqa: PLC0415
|
|
90
|
+
FormatToExtensions,
|
|
91
|
+
InputFormat,
|
|
92
|
+
)
|
|
96
93
|
|
|
94
|
+
unsupported = {InputFormat.AUDIO, InputFormat.METS_GBS, InputFormat.VTT}
|
|
97
95
|
supported = set()
|
|
98
96
|
for f in InputFormat:
|
|
99
|
-
if f in
|
|
97
|
+
if f in unsupported:
|
|
100
98
|
continue
|
|
101
99
|
for ext in FormatToExtensions[f]:
|
|
102
100
|
supported.add(SupportedExt(f".{ext.lower()}"))
|
|
@@ -105,8 +103,10 @@ class DoclingPipelineConfig(PipelineConfig):
|
|
|
105
103
|
|
|
106
104
|
@Pipeline.register(PipelineType.DOCLING)
|
|
107
105
|
class DoclingPipeline(Pipeline):
|
|
108
|
-
def __init__(
|
|
109
|
-
|
|
106
|
+
def __init__(
|
|
107
|
+
self, format_options: dict["InputFormat", "FormatOption"] | None = None
|
|
108
|
+
):
|
|
109
|
+
from docling.document_converter import DocumentConverter # noqa: PLC0415
|
|
110
110
|
|
|
111
111
|
allowed_format = [
|
|
112
112
|
f.to_docling() for f in DoclingPipelineConfig.supported_exts()
|
|
@@ -128,7 +128,7 @@ class DoclingPipeline(Pipeline):
|
|
|
128
128
|
return cls(config.format_options)
|
|
129
129
|
|
|
130
130
|
|
|
131
|
-
def _to_docling(docs: Iterable[InputDoc]) -> Iterator[Path | DocumentStream]:
|
|
131
|
+
def _to_docling(docs: Iterable[InputDoc]) -> Iterator["Path | DocumentStream"]:
|
|
132
132
|
for d in docs:
|
|
133
133
|
yield d.to_docling()
|
|
134
134
|
|
|
@@ -160,7 +160,7 @@ def _to_markdown_doc(
|
|
|
160
160
|
page_sep: str = DEFAULT_MD_PAGE_SEP,
|
|
161
161
|
**kwargs,
|
|
162
162
|
) -> MarkdownDoc:
|
|
163
|
-
from docling_core.types.doc import ImageRefMode
|
|
163
|
+
from docling_core.types.doc import ImageRefMode # noqa: PLC0415
|
|
164
164
|
|
|
165
165
|
# TODO: Should we add a hash to avoid collision between files with same names
|
|
166
166
|
# nested in the tree structured
|
|
@@ -73,9 +73,9 @@ class MarkerPipeline(Pipeline):
|
|
|
73
73
|
async def extract_content(
|
|
74
74
|
self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
|
|
75
75
|
) -> AsyncGenerator[Result, None]:
|
|
76
|
-
from marker.config.parser import ConfigParser
|
|
77
|
-
from marker.converters.pdf import PdfConverter
|
|
78
|
-
from marker.models import create_model_dict
|
|
76
|
+
from marker.config.parser import ConfigParser # noqa: PLC0415
|
|
77
|
+
from marker.converters.pdf import PdfConverter # noqa: PLC0415
|
|
78
|
+
from marker.models import create_model_dict # noqa: PLC0415
|
|
79
79
|
|
|
80
80
|
config = deepcopy(self._marker_config)
|
|
81
81
|
config["output_format"] = output_format.to_marker()
|
|
@@ -102,7 +102,7 @@ def _process_doc(
|
|
|
102
102
|
output_format: OutputFormat,
|
|
103
103
|
output_path: Path,
|
|
104
104
|
) -> Result:
|
|
105
|
-
from marker.output import text_from_rendered
|
|
105
|
+
from marker.output import text_from_rendered # noqa: PLC0415
|
|
106
106
|
|
|
107
107
|
rendered = converter(str(doc.path))
|
|
108
108
|
content, _, images = text_from_rendered(rendered)
|
|
@@ -118,7 +118,7 @@ def _process_doc(
|
|
|
118
118
|
def _to_markdown_doc(
|
|
119
119
|
input_doc: InputDoc, content: str, images: dict[str, "Image"], output_path: Path
|
|
120
120
|
) -> MarkdownDoc:
|
|
121
|
-
from marker.renderers.markdown import MarkdownRenderer
|
|
121
|
+
from marker.renderers.markdown import MarkdownRenderer # noqa: PLC0415
|
|
122
122
|
|
|
123
123
|
# TODO: Should we add a hash to avoid collision between files with same names
|
|
124
124
|
# nested in the tree structured
|
|
@@ -52,13 +52,12 @@ class MinerUConfig(BaseModel):
|
|
|
52
52
|
@classmethod
|
|
53
53
|
@cache
|
|
54
54
|
def _get_default_kwargs(cls) -> dict[str, Any]:
|
|
55
|
-
|
|
56
|
-
from mineru.utils.enum_class import MakeMode
|
|
55
|
+
from mineru.utils.enum_class import MakeMode # noqa: PLC0415
|
|
57
56
|
|
|
58
57
|
return {
|
|
59
58
|
"server_url": None,
|
|
60
|
-
# We don't dump md directly we process, we dump the middle json in order
|
|
61
|
-
# able to get page indexes
|
|
59
|
+
# We don't dump md directly we process, we dump the middle json in order
|
|
60
|
+
# to be able to get page indexes
|
|
62
61
|
"parse_method": "auto",
|
|
63
62
|
"dump_md": False,
|
|
64
63
|
"dump_middle_json": True,
|
|
@@ -104,7 +103,7 @@ class MinerUPipeline(Pipeline):
|
|
|
104
103
|
async def extract_content(
|
|
105
104
|
self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
|
|
106
105
|
) -> AsyncGenerator[Result, None]:
|
|
107
|
-
from mineru.cli.common import aio_do_parse
|
|
106
|
+
from mineru.cli.common import aio_do_parse # noqa: PLC0415
|
|
108
107
|
|
|
109
108
|
docs = list(docs)
|
|
110
109
|
# TODO: exclude files which are not pdf and return an error
|
|
@@ -154,13 +153,15 @@ def _parse_md_make_fn(backend: MinerUBackend) -> MDMakeFunction:
|
|
|
154
153
|
|
|
155
154
|
match backend:
|
|
156
155
|
case MinerUBackend.PIPELINE:
|
|
157
|
-
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import (
|
|
156
|
+
from mineru.backend.pipeline.pipeline_middle_json_mkcontent import ( # noqa: PLC0415
|
|
158
157
|
union_make,
|
|
159
158
|
)
|
|
160
159
|
|
|
161
160
|
return union_make
|
|
162
161
|
case MinerUBackend.VLM:
|
|
163
|
-
from mineru.backend.vlm.vlm_middle_json_mkcontent import union_make
|
|
162
|
+
from mineru.backend.vlm.vlm_middle_json_mkcontent import ( # noqa: PLC0415
|
|
163
|
+
union_make,
|
|
164
|
+
)
|
|
164
165
|
|
|
165
166
|
return union_make
|
|
166
167
|
case _:
|
|
@@ -212,7 +213,7 @@ def _dump_md_content(
|
|
|
212
213
|
im_dir: Path,
|
|
213
214
|
md_make_mode: str | None = None,
|
|
214
215
|
) -> ConversionOutput:
|
|
215
|
-
from mineru.utils.enum_class import MakeMode
|
|
216
|
+
from mineru.utils.enum_class import MakeMode # noqa: PLC0415
|
|
216
217
|
|
|
217
218
|
if md_make_mode is None:
|
|
218
219
|
md_make_mode = MakeMode.MM_MD
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|