PyPI - extract-python - Versions diffs - 0.4.2__py3-none-any.whl → 0.5.4__py3-none-any.whl - Mend

extract-python 0.4.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

extract_python/__init__.py +7 -25
extract_python/constants.py +0 -4
extract_python/docling_.py +13 -161
extract_python/marker_.py +7 -7
extract_python/miner_u.py +10 -74
extract_python/utils.py +4 -10
{extract_python-0.4.2.dist-info → extract_python-0.5.4.dist-info}/METADATA +3 -1
extract_python-0.5.4.dist-info/RECORD +9 -0
extract_python/objects.py +0 -323
extract_python/pipeline.py +0 -38
extract_python-0.4.2.dist-info/RECORD +0 -11
{extract_python-0.4.2.dist-info → extract_python-0.5.4.dist-info}/WHEEL +0 -0

extract_python/__init__.py CHANGED Viewed

@@ -1,41 +1,23 @@
-from .objects import InputDoc, OutputFormat, Status
-from .pipeline import Pipeline, PipelineConfig, PipelineType
 try:
-    from .docling_ import (
-        DOCLING_DEFAULT_ARTIFACTS_PATH,
-        DoclingPipeline,
-        DoclingPipelineConfig,
-    )
+    from .docling_ import DOCLING_DEFAULT_ARTIFACTS_PATH, DoclingPipeline
 except ImportError:
-    DOCKING_DEFAULT_ARTIFACTS_PATH, DoclingPipeline, DoclingPipelineConfig = (
-        None,
-        None,
-        None,
-    )
+    DOCKING_DEFAULT_ARTIFACTS_PATH, DoclingPipeline = None, None
 try:
-    from .marker_ import MarkerPipeline, MarkerPipelineConfig
+    from .marker_ import MarkerPipeline
 except ImportError:
-    MarkerPipeline, MarkerPipelineConfig = None, None
+    MarkerPipeline = None
 try:
-    from .miner_u import MinerUPipeline, MinerUPipelineConfig
+    from .miner_u import MinerUPipeline
 except ImportError:
-    MinerUPipeline, MinerUPipelineConfig = None, None
+    MinerUPipeline = None
 __all__ = [
     "DoclingPipeline",
-    "DoclingPipelineConfig",
-    "InputDoc",
     "DOCLING_DEFAULT_ARTIFACTS_PATH",
     "MarkerPipeline",
-    "MarkerPipelineConfig",
-    "OutputFormat",
-    "Pipeline",
-    "PipelineType",
-    "PipelineConfig",
-    "Status",
+    "MinerUPipeline",
 ]

extract_python/constants.py CHANGED Viewed

@@ -1,6 +1,2 @@
 ARTIFACTS = "artifacts"
-CPU_GROUP = "cpu"
-MINER_U_GROUP = "miner-u"
-EXTRACT_CONTENT_TASK = "extract-content"
-EXTRACT_CONTENT_MINER_U_TASK = "extract-content-miner-u"
 DEFAULT_MD_PAGE_SEP = '<div style="page-break-after: always;"></div>'

extract_python/docling_.py CHANGED Viewed

@@ -1,190 +1,42 @@
-import importlib
 import shutil
 import tempfile
 from collections.abc import AsyncGenerator, Iterable, Iterator
-from functools import cache
 from pathlib import Path
-from typing import Annotated, Any, ClassVar, Self, TypeVar, get_type_hints
-from docling.backend.abstract_backend import AbstractDocumentBackend
-from docling.datamodel.backend_options import BackendOptions, BaseBackendOptions
-# Data model import are quick it's ok to leave it there
-from docling.datamodel.base_models import FormatToExtensions, InputFormat
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import (
-    EasyOcrOptions,
-    PdfPipelineOptions,
-    PipelineOptions,
-    ThreadedPdfPipelineOptions,
-)
-from docling.document_converter import DocumentConverter, FormatOption
-from docling.pipeline.base_pipeline import BasePipeline
+from docling.document_converter import DocumentConverter
 # TODO: this is long to load improve it
 from docling_core.types.doc import ImageRefMode
 from docling_core.types.io import DocumentStream
-from icij_common.pydantic_utils import to_lower_snake_case
-from icij_common.registrable import FromConfig
-from pydantic import (
-    AfterValidator,
-    BeforeValidator,
-    Field,
-    PlainSerializer,
-    WrapSerializer,
-    model_validator,
-)
-from pydantic_core.core_schema import SerializerFunctionWrapHandler
-from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
-from .objects import (
+from extract_core import (
+    DoclingFormatOption,
+    DoclingPipelineConfig,
     Error,
     InputDoc,
     MarkdownDoc,
     OutputFormat,
     PageIndexes,
+    Pipeline,
+    PipelineType,
     Result,
     Status,
-    SupportedExt,
 )
-from .pipeline import Pipeline, PipelineConfig, PipelineType
-from .utils import all_subclasses, chdir, map_and_preserve, path_to_artifacts_dirname
-DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
-def _validate_pipeline_opts(v: "PipelineOptions") -> None:
-    if isinstance(v, PdfPipelineOptions) and not v.generate_picture_images:
-        msg = "generate_picture_images should be set to true"
-        raise ValueError(msg)
-    return v
-T = TypeVar("T")
-def _find_subcls(cls: type[T], name: str) -> type[T]:
-    # Check if the class available
-    for c in all_subclasses(cls):
-        if c.__name__ == name:
-            return c
-    # Then apply ad-hoc search
-    if "pipeline" in cls.__name__.lower():
-        module_name = f"docling.pipeline.{to_lower_snake_case(name)}"
-        try:
-            module = importlib.import_module(module_name)
-            return getattr(module, name)
-        except (ModuleNotFoundError, AttributeError):
-            pass
-    raise ValueError(f"unknown {cls.__name__} subclass {name}")
-def _find_init_arg_type(cls: type[Any], arg: str) -> type:
-    hints = get_type_hints(cls.__init__)
-    return hints[arg]
-def _resolve_pipeline_cls(v: Any) -> Any:
-    if isinstance(v, str):
-        return _find_subcls(BasePipeline, v)
-    return v
-def _ser_class_as_str(v: Any) -> Any:
-    if isinstance(v, type):
-        return v.__name__
-    return v
-def _ser_with_backend_option_kind(
-    v: Any, handler: SerializerFunctionWrapHandler
-) -> Any:
-    serialized = handler(v)
-    if isinstance(v, BaseBackendOptions):
-        kind = getattr(v, "kind", None)
-        if kind is not None:
-            serialized["kind"] = kind
-    return serialized
-def _resolve_backend(v: Any) -> Any:
-    if isinstance(v, str):
-        return _find_subcls(AbstractDocumentBackend, v)
-    return v
-class DoclingFormatOption(FormatOption):
-    pipeline_cls: Annotated[
-        str | type[BasePipeline],
-        BeforeValidator(_resolve_pipeline_cls),
-        PlainSerializer(_ser_class_as_str),
-    ]
-    pipeline_options: Annotated[
-        dict | PipelineOptions | None, AfterValidator(_validate_pipeline_opts)
-    ] = None
-    backend: Annotated[
-        str | type[AbstractDocumentBackend],
-        BeforeValidator(_resolve_backend),
-        PlainSerializer(_ser_class_as_str),
-    ]
-    backend_options: Annotated[
-        BackendOptions | None, WrapSerializer(_ser_with_backend_option_kind)
-    ] = None
-    @model_validator(mode="after")
-    def _resolve_pipeline_options(self) -> Self:
-        if isinstance(self.pipeline_options, dict):
-            option_cls = _find_init_arg_type(self.pipeline_cls, "pipeline_options")
-            self.pipeline_options = option_cls.model_validate(self.pipeline_options)
-        return self
-@cache
-def _default_format_opts() -> dict[InputFormat, DoclingFormatOption]:
-    from docling.backend.docling_parse_backend import (  # noqa: PLC0415
-        DoclingParseDocumentBackend,
-    )
-    from docling.pipeline.standard_pdf_pipeline import (  # noqa: PLC0415
-        StandardPdfPipeline,
-    )
-    return {
-        InputFormat.PDF: DoclingFormatOption(
-            pipeline_cls=StandardPdfPipeline,
-            backend=DoclingParseDocumentBackend,
-            pipeline_options=ThreadedPdfPipelineOptions(
-                ocr_options=EasyOcrOptions(), generate_picture_images=True
-            ),
-        ),
-    }
-class DoclingPipelineConfig(PipelineConfig):
-    pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.DOCLING)
+from icij_common.registrable import FromConfig
-    format_options: dict[InputFormat, DoclingFormatOption | FormatOption] = Field(
-        default_factory=_default_format_opts
-    )
+from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
+from .utils import chdir, map_and_preserve, path_to_artifacts_dirname
-    @classmethod
-    @cache
-    def supported_exts(cls) -> set[SupportedExt]:
-        unsupported = {InputFormat.AUDIO, InputFormat.METS_GBS, InputFormat.VTT}
-        supported = set()
-        for f in InputFormat:
-            if f in unsupported:
-                continue
-            for ext in FormatToExtensions[f]:
-                supported.add(SupportedExt(f".{ext.lower()}"))
-        return supported
+DOCLING_DEFAULT_ARTIFACTS_PATH = Path.home().joinpath(".cache", "docling", "models")
 @Pipeline.register(PipelineType.DOCLING)
 class DoclingPipeline(Pipeline):
     def __init__(
-        self, format_options: dict["InputFormat", "FormatOption"] | None = None
+        self, format_options: dict["InputFormat", DoclingFormatOption] | None = None
     ):
+        format_options = {k: v.to_docling() for k, v in format_options.items()}
         allowed_format = [
             f.to_docling() for f in DoclingPipelineConfig.supported_exts()
         ]

extract_python/marker_.py CHANGED Viewed

@@ -5,10 +5,8 @@ from functools import cache
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, ClassVar, Self
-from pydantic import Field
-from .constants import ARTIFACTS
-from .objects import (
+from extract_core import BasePipelineConfig, Pipeline, PipelineType
+from extract_core.objects import (
     InputDoc,
     MarkdownDoc,
     OutputFormat,
@@ -17,7 +15,9 @@ from .objects import (
     Status,
     SupportedExt,
 )
-from .pipeline import Pipeline, PipelineConfig, PipelineType
+from pydantic import Field
+from .constants import ARTIFACTS
 from .utils import path_to_artifacts_dirname, report_recoverable_errors
 if TYPE_CHECKING:
@@ -25,10 +25,10 @@ if TYPE_CHECKING:
     from PIL import Image
-class MarkerPipelineConfig(PipelineConfig):
+class MarkerPipelineConfig(BasePipelineConfig):
     pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MARKER)
-    config: dict[str, Any] = dict()
+    config: dict[str, Any] = Field(default_factory=dict)
     @classmethod
     @cache

extract_python/miner_u.py CHANGED Viewed

@@ -1,96 +1,32 @@
 import json
 import shutil
 from collections.abc import AsyncGenerator, Callable, Iterable
-from copy import copy
-from enum import StrEnum
-from functools import cache, partial
+from functools import partial
 from pathlib import Path
 from tempfile import TemporaryDirectory
-from typing import Any, ClassVar, Self
+from typing import Self
-from pydantic import Field
-from pydantic_extra_types.language_code import LanguageAlpha2
-from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
-from .objects import (
-    BaseModel,
+from extract_core import (
     ConversionOutput,
     InputDoc,
+    MinerUBackend,
+    MinerUConfig,
+    MinerUPipelineConfig,
     OutputFormat,
     PageIndexes,
+    Pipeline,
+    PipelineType,
     Result,
     Status,
-    SupportedExt,
 )
-from .pipeline import Pipeline, PipelineConfig, PipelineType
+from .constants import ARTIFACTS, DEFAULT_MD_PAGE_SEP
 from .utils import path_to_artifacts_dirname
 _MINER_U_CONVERSION_ERRORS = tuple()
 MDMakeFunction = Callable[[list, str, str], str | None]
-class MinerUBackend(StrEnum):
-    PIPELINE = "pipeline"
-    VLM = "vlm"
-class MinerUConfig(BaseModel):
-    backend: MinerUBackend = MinerUBackend.PIPELINE
-    enable_formula_extraction: bool = True
-    enable_table_extraction: bool = True
-    # TODO: use enum or literal here
-    parse_method: str = "auto"
-    def as_parse_kwargs(self) -> dict[str, Any]:
-        kwargs = copy(self._get_default_kwargs())
-        kwargs["backend"] = self.backend
-        kwargs["parse_method"] = self.parse_method
-        kwargs["formula_enable"] = self.enable_formula_extraction
-        kwargs["table_enable"] = self.enable_table_extraction
-        return kwargs
-    @classmethod
-    @cache
-    def _get_default_kwargs(cls) -> dict[str, Any]:
-        from mineru.utils.enum_class import MakeMode  # noqa: PLC0415
-        return {
-            "server_url": None,
-            # We don't dump md directly we process, we dump the middle json in order
-            # to be able to get page indexes
-            "parse_method": "auto",
-            "dump_md": False,
-            "dump_middle_json": True,
-            "f_draw_layout_bbox": False,
-            "f_draw_span_bbox": False,
-            "f_dump_model_output": False,  # might be useful for debug though
-            "f_dump_orig_pdf": False,
-            "f_dump_content_list": False,  # might be useful for debug though
-            "start_page_id": 0,
-            "f_make_md_mode": MakeMode.MM_MD,
-            "image_analysis": True,
-            "end_page_id": None,
-            "client_side_output_generation": False,
-        }
-class MinerUPipelineConfig(PipelineConfig):  # noqa: F821
-    pipeline: ClassVar[PipelineType] = Field(frozen=True, default=PipelineType.MINER_U)
-    config: MinerUConfig = Field(frozen=True, default=MinerUConfig())
-    language: LanguageAlpha2 = Field(frozen=True, default="en")
-    @classmethod
-    @cache
-    def supported_exts(cls) -> set[SupportedExt]:
-        return {
-            SupportedExt.PDF,
-            SupportedExt.DOCX,
-            SupportedExt.PPTX,
-            SupportedExt.XLSX,
-        }
 @Pipeline.register(PipelineType.MINER_U)
 class MinerUPipeline(Pipeline):
     def __init__(self, config: MinerUConfig, language: str):

extract_python/utils.py CHANGED Viewed

@@ -6,26 +6,20 @@ from itertools import tee
 from pathlib import Path, PurePath
 from typing import Protocol, TypeVar
-from .objects import Error, InputDoc, Result, Status
+from extract_core import Error, InputDoc, Result, Status
 R = TypeVar("R")
-T = TypeVar("T")
+In = TypeVar("In")
 def map_and_preserve(
-    fn: Callable[[Iterable[T]], Iterator[R]], inputs: Iterable[T]
-) -> tuple[Iterable[T], Iterator[R]]:
+    fn: Callable[[Iterable[In]], Iterator[R]], inputs: Iterable[In]
+) -> tuple[Iterable[In], Iterator[R]]:
     save_inputs, function_inputs = tee(inputs)
     outputs = iter(fn(function_inputs))
     return save_inputs, outputs
-def all_subclasses(cls: type[T]) -> set[type[T]]:
-    return set(cls.__subclasses__()).union(
-        [s for c in cls.__subclasses__() for s in all_subclasses(c)]
-    )
 def path_to_artifacts_dirname(path: PurePath, sep: str = "_") -> str:
     dirname = f"{path.name[: -len(path.suffix)]}"
     ext = path.suffix

{extract_python-0.4.2.dist-info → extract_python-0.5.4.dist-info}/METADATA RENAMED Viewed

@@ -1,12 +1,13 @@
 Metadata-Version: 2.4
 Name: extract-python
-Version: 0.4.2
+Version: 0.5.4
 Summary: Structured content extraction
 Project-URL: Homepage, https://github.com/ICIJ/extract-python
 Project-URL: Repository, https://github.com/ICIJ/extract-python
 Project-URL: Issues, https://github.com/ICIJ/extract-python/issues
 Author-email: Clément Doumouro <cdoumouro@icij.org>
 Requires-Python: <3.14,>=3.11
+Requires-Dist: extract-core~=0.1
 Requires-Dist: icij-common~=0.8.2
 Provides-Extra: benches
 Requires-Dist: html2image~=2.0.7; extra == 'benches'
@@ -21,4 +22,5 @@ Provides-Extra: mineru
 Requires-Dist: mineru[mlx]~=3.2; (sys_platform == 'darwin') and extra == 'mineru'
 Requires-Dist: mineru[pipeline,vlm]~=3.2; extra == 'mineru'
 Requires-Dist: pydantic-extra-types[pycountry]~=2.11; extra == 'mineru'
+Requires-Dist: python-pptx~=1.0; extra == 'mineru'
 Requires-Dist: six~=1.17; extra == 'mineru'

extract_python-0.5.4.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+extract_python/__init__.py,sha256=CrqmcyLwD2JgtQNuGRIQ8wr1cWdlKkgMlCz_2reaPJo,470
+extract_python/constants.py,sha256=Hxl2Bc-GJX71gjFgj3U7tRRA_nHhneoT6YPOx9CjYsc,94
+extract_python/docling_.py,sha256=C4WP1AJrvS2n-KytlGc_1CShjdTGM077I6b9tvw4NhY,4727
+extract_python/marker_.py,sha256=mLJA1m9G4JQtBs1wz8rmshdbaH81DhIwkRzDKZPJH8A,5058
+extract_python/miner_u.py,sha256=jjHqHx7-2w0LSxYNcjvgWoLDTXsv_y1eeyteSfXqjk4,5771
+extract_python/utils.py,sha256=NiYf65iCF7QO4loh7u4t38Ww3eVJUdBpWStL4eX_DqE,1781
+extract_python-0.5.4.dist-info/METADATA,sha256=4EHPqAxM-8FnZ_Tco8QFpzJqCvNwe58ul_tO0C9aDN0,1216
+extract_python-0.5.4.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+extract_python-0.5.4.dist-info/RECORD,,

extract_python/objects.py DELETED Viewed

@@ -1,323 +0,0 @@
-from __future__ import annotations
-import logging
-import os
-import traceback
-import uuid
-from abc import ABC
-from enum import StrEnum
-from functools import cache
-from io import BytesIO
-from pathlib import Path
-from typing import Annotated, Any, NoReturn, Self
-from icij_common.pydantic_utils import (
-    icij_config,
-    merge_configs,
-    no_enum_values_config,
-    safe_copy,
-)
-from pydantic import AfterValidator, RootModel, TypeAdapter
-from pydantic import BaseModel as _BaseModel
-try:
-    from docling.datamodel.base_models import (
-        ConversionStatus,
-        ErrorItem,
-        FormatToExtensions,
-        InputFormat,
-    )
-    from docling.datamodel.document import InputDocument
-    from docling_core.types.io import DocumentStream
-except ImportError:
-    ConversionStatus, ErrorItem, InputFormat = None, None, None
-    InputDocument = None
-    DocumentStream = None
-logger = logging.getLogger(__name__)
-base_config = merge_configs(icij_config(), no_enum_values_config())
-@cache
-def _ext_to_docling_input_format() -> dict:
-    from .docling_ import DoclingPipelineConfig  # noqa: PLC0415
-    mapping = dict()
-    supported = DoclingPipelineConfig.supported_exts()
-    for input_f, exts in FormatToExtensions.items():
-        for ext in exts:
-            try:
-                ext = SupportedExt(f".{ext.lower()}")  # noqa: PLW2901
-            except ValueError:
-                continue
-            if ext in supported:
-                mapping[ext] = input_f
-    return mapping
-class BaseModel(_BaseModel):
-    model_config = base_config
-class SupportedExt(StrEnum):
-    ADOC = ".adoc"
-    ASC = ".asc"
-    ASCIIDOC = ".asciidoc"
-    BMP = ".bmp"
-    CSV = ".csv"
-    DOC = ".doc"
-    DOCX = ".docx"
-    DOTX = ".dotx"
-    DOTM = ".dotm"
-    DOCM = ".docm"
-    EPUB = ".epub"
-    EML = ".eml"
-    GIF = ".gif"
-    HTLM = ".html"
-    HTM = ".htm"
-    JPEG = ".jpeg"
-    JPG = ".jpg"
-    JSON = ".json"
-    LATEX = ".latex"
-    MD = ".md"
-    NXML = ".nxml"
-    ODP = ".odp"
-    ODS = ".ods"
-    ODT = ".odt"
-    PDF = ".pdf"
-    PNG = ".png"
-    PPSX = ".ppsx"
-    PPT = ".ppt"
-    PPTM = ".pptm"
-    PPSM = ".ppsm"
-    POTX = ".potx"
-    POTM = ".potm"
-    PPTX = ".pptx"
-    QMD = ".qmd"
-    RMD = ".rmd"
-    TEX = ".tex"
-    TIF = ".tif"
-    TIFF = ".tiff"
-    TXT = ".txt"
-    TEXT = ".text"
-    WEBP = ".webp"
-    XBRL = ".xbrl"
-    XHTML = ".xhtml"
-    XLS = ".xls"
-    XLSM = ".xlsm"
-    XLSX = ".xlsx"
-    XLTX = ".xltx"
-    XML = ".xml"
-    def to_docling(self) -> InputFormat:
-        return _ext_to_docling_input_format()[self]
-class OutputFormat(StrEnum):
-    MARKDOWN = ".md"
-    @property
-    def suffix(self) -> str:
-        return self.value[1:]
-    def to_marker(self) -> str:
-        match self:
-            case OutputFormat.MARKDOWN:
-                return "markdown"
-            case _:
-                raise ValueError(f"{self} is unsupported by marker")
-class Status(StrEnum):
-    FAILURE = "failure"
-    SUCCESS = "success"
-    PARTIAL_SUCCESS = "partial_success"
-    @classmethod
-    def from_docling(cls, v: Any) -> Self:
-        from docling.datamodel.base_models import ConversionStatus  # noqa: PLC0415
-        if v is ConversionStatus.SUCCESS:
-            return cls.SUCCESS
-        if v is ConversionStatus.PARTIAL_SUCCESS:
-            return cls.PARTIAL_SUCCESS
-        if isinstance(v, ConversionStatus):
-            return cls.FAILURE
-        raise TypeError(f"can't convert {v!r} to {cls.__name__!r}")
-    @property
-    def allows_conversion(self) -> bool:
-        return self is Status.SUCCESS or self is Status.PARTIAL_SUCCESS
-class Error(BaseModel):
-    id: str
-    title: str
-    detail: str
-    @classmethod
-    def from_exception(cls, exception: BaseException) -> Self:
-        title = exception.__class__.__name__
-        trace_lines = traceback.format_exception(
-            None, value=exception, tb=exception.__traceback__
-        )
-        detail = f"{exception}\n{''.join(trace_lines)}"
-        error_id = f"{_id_title(title)}-{uuid.uuid4().hex}"
-        error = cls(id=error_id, title=title, detail=detail)
-        return error
-    @classmethod
-    def from_docling(cls, docling_error: ErrorItem) -> Self:
-        title = "DoclingConversionError"
-        error_id = f"{_id_title(title)}-{uuid.uuid4().hex}"
-        detail = (
-            f"error in module {docling_error.module_name} of"
-            f" {docling_error.component_type}:\n{docling_error.error_message}"
-        )
-        return cls(id=error_id, title=title, detail=detail)
-def _id_title(title: str) -> str:
-    id_title = []
-    for i, letter in enumerate(title):
-        if i and letter.isupper():
-            id_title.append("-")
-        id_title.append(letter.lower())
-    return "".join(id_title)
-class InputDoc(BaseModel):
-    ext: SupportedExt
-    path: Path
-    content: bytes | None = None
-    @classmethod
-    def from_path(cls, path: str | Path) -> Self:
-        if isinstance(path, str):
-            path = Path(path)
-        ext = SupportedExt(path.suffix)
-        return cls(path=path, ext=ext)
-    def to_docling(self) -> Path | DocumentStream:
-        if self.content is not None:
-            return DocumentStream(name=str(self.path), stream=BytesIO(self.content))
-        if not self.path.suffix:
-            return DocumentStream(
-                name=str(self.path), stream=BytesIO(self.path.read_bytes())
-            )
-        return self.path
-    def without_content(self) -> Self:
-        return safe_copy(self, update={"content": None})
-class PageIndexes(RootModel[list[tuple[int, int]]]):
-    # Stores page end index
-    @classmethod
-    def from_page_end_indices(cls, lengths: list[int]) -> Self:
-        return [
-            ((lengths[p - 1] if p > 0 else 0), lengths[p]) for p in range(len(lengths))
-        ]
-class ConversionOutput(BaseModel):
-    path: Path
-    pages: PageIndexes = []
-class MarkdownDoc(ConversionOutput):
-    @classmethod
-    @property
-    @cache
-    def _valid_conversion_statuses(cls) -> set[ConversionStatus]:
-        from docling.datamodel.base_models import ConversionStatus  # noqa: PLC0415
-        return {ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS}
-def _input_should_not_have_content(value: InputDoc) -> InputDoc:
-    if value.content is not None:
-        raise ValueError(f"response input can't have content, but got {value}")
-    return value
-class _BaseResult(BaseModel, ABC):
-    input: InputDoc
-    status: Status
-    errors: list[Error] = []
-class Result(_BaseResult):
-    # TODO: we could also use generics here when we add more output formats
-    output: ConversionOutput | None
-    def to_response(self) -> ResponseResult:
-        return ResponseResult(
-            input=self.input.without_content(),
-            status=self.status,
-            errors=self.errors,
-            output_path=self.output.path,
-        )
-class ResponseResult(_BaseResult):
-    input: Annotated[InputDoc, AfterValidator(func=_input_should_not_have_content)]
-    output_path: Path
-class ExtractionResponse(BaseModel):
-    results: list[ResponseResult]
-_INPUT_DOCS_ADAPTER = TypeAdapter(list[InputDoc | Path])
-def parse_extraction_request(
-    docs: str | list[dict | str], *, data_dir: Path
-) -> list[InputDoc]:
-    if isinstance(docs, str):
-        logger.debug("exploring files in %s", data_dir.absolute())
-        docs_dir = Path(data_dir) / docs
-        docs = _as_input_docs(docs_dir)
-        msg = "found %s"
-        if len(docs) > 10:
-            msg = msg + ", and more..."
-        logger.debug("found %s", docs[:10])
-        return docs
-    docs = _INPUT_DOCS_ADAPTER.validate_python(docs)
-    if not docs:
-        return []
-    if isinstance(docs[0], Path):
-        doc_meta = []
-        unknown_exts = []
-        for doc in docs:
-            _, ext = os.path.splitext(str(doc))
-            if not ext:
-                unknown_exts.append(doc)
-            else:
-                doc_meta.append(InputDoc.from_path(path=doc.relative_to(data_dir)))
-        if unknown_exts:
-            raise ValueError(f"found files with unknown extensions {unknown_exts}")
-        return doc_meta
-    return docs
-def _raise(err: OSError) -> NoReturn:
-    raise err
-def _as_input_docs(
-    docs_dir: Path, *, supported_ext: set[str] | None = None
-) -> list[InputDoc]:
-    if supported_ext is None:
-        supported_ext = {v.value for v in SupportedExt}
-    docs = []
-    for root, _, files in os.walk(docs_dir, onerror=_raise):
-        root = Path(root)  # noqa: PLW2901
-        for f in files:
-            ext = Path(f).suffix
-            if not ext or ext not in supported_ext:
-                continue
-            docs.append(InputDoc.from_path(path=root / f))
-    docs = sorted(docs, key=lambda x: x.path)
-    return docs

extract_python/pipeline.py DELETED Viewed

@@ -1,38 +0,0 @@
-from abc import ABC, abstractmethod
-from collections.abc import AsyncGenerator, Iterable
-from enum import StrEnum
-from pathlib import Path
-from typing import ClassVar
-from icij_common.pydantic_utils import icij_config, merge_configs, no_enum_values_config
-from icij_common.registrable import RegistrableConfig, RegistrableFromConfig
-from pydantic import Field
-from .objects import InputDoc, OutputFormat, Result, SupportedExt
-StructuredContent = str
-class PipelineType(StrEnum):
-    DOCLING = "docling"
-    MARKER = "marker"
-    MINER_U = "miner_u"
-class PipelineConfig(RegistrableConfig, ABC):
-    # TODO: move this icij_config() to RegistrableConfig
-    model_config = merge_configs(icij_config(), no_enum_values_config())
-    registry_key: ClassVar[str] = Field(frozen=True, default="pipeline")
-    pipeline: ClassVar[PipelineType]
-    @classmethod
-    @abstractmethod
-    def supported_exts(cls) -> set[SupportedExt]: ...
-class Pipeline(RegistrableFromConfig, ABC):
-    @abstractmethod
-    async def extract_content(
-        self, docs: Iterable[InputDoc], output_format: OutputFormat, output_path: Path
-    ) -> AsyncGenerator[Result, None]: ...

extract_python-0.4.2.dist-info/RECORD DELETED Viewed

@@ -1,11 +0,0 @@
-extract_python/__init__.py,sha256=Y-lcFbJd5sX3wt1UnyYAbm7H81Sgg7p2fG07CuuJUco,945
-extract_python/constants.py,sha256=JmAkjXyQMYwpTod9DCLc11zOc3caK2j_2ji_r3hGZws,236
-extract_python/docling_.py,sha256=ys2vK4zgpWsPObIZWRFhHM4fNkojMYUa9QRevl8bd3c,9342
-extract_python/marker_.py,sha256=ACk9wa-wrEwYv4D7SKW4KjpZxrp2hBIt9_pheRhV0go,5014
-extract_python/miner_u.py,sha256=EcTXfdvArkoSw3bKkiWLerYAhXMU6ssJFn9kOsFVDPE,8007
-extract_python/objects.py,sha256=MHCUZ9L8LVXlSlHyDMnbuWV1KHWMhUEJQMEDTc9hYD0,8761
-extract_python/pipeline.py,sha256=ijQ8wI5x3kAzTfx3T-V52qSoAA_8IA_ihK1NPWVMwFM,1162
-extract_python/utils.py,sha256=kXe0CyT6zGYDTOvxs3BJYxq2-cgJZ0d_IT3hdMbuXa8,1943
-extract_python-0.4.2.dist-info/METADATA,sha256=95THYq0jZgY2-1X2s8hDoFEo9_aNeukdHPxlcd8_rmI,1132
-extract_python-0.4.2.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
-extract_python-0.4.2.dist-info/RECORD,,

{extract_python-0.4.2.dist-info → extract_python-0.5.4.dist-info}/WHEEL RENAMED Viewed

File without changes

extract-python 0.4.2__py3-none-any.whl → 0.5.4__py3-none-any.whl

extract-python 0.4.2__py3-none-any.whl → 0.5.4__py3-none-any.whl