docling-jobkit 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- docling_jobkit/cli/local.py +14 -3
- docling_jobkit/cli/multiproc.py +504 -0
- docling_jobkit/connectors/google_drive_helper.py +5 -5
- docling_jobkit/connectors/google_drive_source_processor.py +30 -1
- docling_jobkit/connectors/http_source_processor.py +23 -3
- docling_jobkit/connectors/local_path_source_processor.py +126 -0
- docling_jobkit/connectors/local_path_target_processor.py +92 -0
- docling_jobkit/connectors/s3_source_processor.py +45 -24
- docling_jobkit/connectors/source_processor.py +52 -2
- docling_jobkit/connectors/source_processor_factory.py +6 -0
- docling_jobkit/connectors/target_processor_factory.py +6 -0
- docling_jobkit/convert/chunking.py +2 -1
- docling_jobkit/convert/manager.py +60 -9
- docling_jobkit/datamodel/task_sources.py +57 -2
- docling_jobkit/datamodel/task_targets.py +28 -1
- docling_jobkit/orchestrators/local/orchestrator.py +8 -0
- docling_jobkit/orchestrators/local/worker.py +6 -5
- docling_jobkit/orchestrators/rq/orchestrator.py +13 -3
- docling_jobkit/orchestrators/rq/worker.py +3 -0
- docling_jobkit/ray_job/main.py +12 -3
- {docling_jobkit-1.8.0.dist-info → docling_jobkit-1.9.0.dist-info}/METADATA +77 -7
- {docling_jobkit-1.8.0.dist-info → docling_jobkit-1.9.0.dist-info}/RECORD +25 -22
- {docling_jobkit-1.8.0.dist-info → docling_jobkit-1.9.0.dist-info}/WHEEL +1 -1
- {docling_jobkit-1.8.0.dist-info → docling_jobkit-1.9.0.dist-info}/entry_points.txt +1 -0
- {docling_jobkit-1.8.0.dist-info → docling_jobkit-1.9.0.dist-info}/licenses/LICENSE +0 -0

docling_jobkit/connectors/local_path_source_processor.py (new file)

@@ -0,0 +1,126 @@
+from pathlib import Path
+from typing import Iterator, TypedDict
+
+from docling_core.types.io import DocumentStream
+
+from docling_jobkit.connectors.source_processor import BaseSourceProcessor
+from docling_jobkit.datamodel.task_sources import TaskLocalPathSource
+
+
+def _should_ignore_file(file_path: Path) -> bool:
+    """
+    Check if a file should be ignored based on common patterns for
+    hidden files, temporary files, and system metadata files.
+
+    Returns True if the file should be ignored, False otherwise.
+    """
+    name = file_path.name
+
+    # Hidden files (starting with .)
+    if name.startswith("."):
+        return True
+
+    # Vim temporary files
+    if name.endswith(("~", ".swp", ".swo")):
+        return True
+
+    # Emacs temporary files
+    if name.startswith("#") and name.endswith("#"):
+        return True
+
+    # Microsoft Office temporary files
+    if name.startswith("~$"):
+        return True
+
+    # Windows thumbnail cache
+    if name.lower() == "thumbs.db":
+        return True
+
+    # Desktop.ini (Windows)
+    if name.lower() == "desktop.ini":
+        return True
+
+    return False
+
+
+class LocalPathFileIdentifier(TypedDict):
+    path: Path
+    size: int
+    last_modified: float
+
+
+class LocalPathSourceProcessor(BaseSourceProcessor[LocalPathFileIdentifier]):
+    def __init__(self, source: TaskLocalPathSource):
+        super().__init__()
+        self._source = source
+
+    def _initialize(self):
+        """Validate that the path exists."""
+        if not self._source.path.exists():
+            raise FileNotFoundError(f"Path does not exist: {self._source.path}")
+
+    def _finalize(self):
+        """No cleanup needed for local filesystem."""
+
+    def _list_document_ids(self) -> Iterator[LocalPathFileIdentifier]:
+        """
+        List all files based on the source configuration.
+        - If path is a file, yield that single file
+        - If path is a directory, discover files based on pattern and recursive settings
+        """
+        path = self._source.path
+
+        if path.is_file():
+            # Single file case
+            stat = path.stat()
+            yield LocalPathFileIdentifier(
+                path=path,
+                size=stat.st_size,
+                last_modified=stat.st_mtime,
+            )
+        elif path.is_dir():
+            # Directory case - use glob or rglob based on recursive setting
+            if self._source.recursive:
+                # Recursive traversal
+                files = path.rglob(self._source.pattern)
+            else:
+                # Non-recursive traversal
+                files = path.glob(self._source.pattern)
+
+            for file_path in files:
+                # Only yield actual files, not directories
+                # Skip hidden files, temporary files, and system metadata
+                if file_path.is_file() and not _should_ignore_file(file_path):
+                    stat = file_path.stat()
+                    yield LocalPathFileIdentifier(
+                        path=file_path,
+                        size=stat.st_size,
+                        last_modified=stat.st_mtime,
+                    )
+        else:
+            raise ValueError(f"Path is neither a file nor a directory: {path}")
+
+    def _count_documents(self) -> int:
+        """Count total number of documents."""
+        return sum(1 for _ in self._list_document_ids())
+
+    def _fetch_document_by_id(
+        self, identifier: LocalPathFileIdentifier
+    ) -> DocumentStream:
+        """Fetch a document by opening the file from the local filesystem."""
+        file_path = identifier["path"]
+
+        # Open file in binary mode and return as DocumentStream
+        with open(file_path, "rb") as f:
+            content = f.read()
+
+        from io import BytesIO
+
+        buffer = BytesIO(content)
+
+        return DocumentStream(name=str(file_path), stream=buffer)
+
+    def _fetch_documents(self) -> Iterator[DocumentStream]:
+        """Iterate through all documents."""
+        for identifier in self._list_document_ids():
+            yield self._fetch_document_by_id(identifier)
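
As a reading aid, here is a minimal usage sketch of the new local-path connector, based only on the APIs visible in this diff (the with-statement requirement comes from BaseSourceProcessor); the input folder and pattern are hypothetical.

    from pathlib import Path

    from docling_jobkit.connectors.local_path_source_processor import (
        LocalPathSourceProcessor,
    )
    from docling_jobkit.datamodel.task_sources import TaskLocalPathSource

    # Hypothetical input folder; pattern and recursive mirror the TaskLocalPathSource fields.
    source = TaskLocalPathSource(path=Path("./data/input/"), pattern="*.pdf", recursive=True)

    # iterate_documents() refuses to run unless the processor was opened with `with`.
    with LocalPathSourceProcessor(source) as processor:
        for doc in processor.iterate_documents():
            print(doc.name)  # the full path of each matched, non-ignored file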
docling_jobkit/connectors/local_path_target_processor.py (new file)

@@ -0,0 +1,92 @@
+from pathlib import Path
+from typing import BinaryIO
+
+from docling_jobkit.connectors.target_processor import BaseTargetProcessor
+from docling_jobkit.datamodel.task_targets import LocalPathTarget
+
+
+class LocalPathTargetProcessor(BaseTargetProcessor):
+    def __init__(self, target: LocalPathTarget):
+        super().__init__()
+        self._target = target
+
+    def _initialize(self) -> None:
+        """
+        Ensure the target directory exists.
+        If path is a directory, create it. If it's a file path, create parent directories.
+        """
+        path = self._target.path
+
+        # If path looks like a directory (ends with / or has no extension), treat as directory
+        # Otherwise, create parent directories for the file
+        if path.suffix == "" or str(path).endswith("/"):
+            # Treat as directory
+            path.mkdir(parents=True, exist_ok=True)
+        else:
+            # Treat as file - create parent directories
+            path.parent.mkdir(parents=True, exist_ok=True)
+
+    def _finalize(self) -> None:
+        """No cleanup needed for local filesystem."""
+
+    def upload_file(
+        self,
+        filename: str | Path,
+        target_filename: str,
+        content_type: str,
+    ) -> None:
+        """
+        Copy a file from local filesystem to the target location.
+        """
+        source_path = Path(filename)
+        target_path = self._get_target_path(target_filename)
+
+        # Ensure parent directory exists
+        target_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Copy file content
+        with open(source_path, "rb") as src:
+            with open(target_path, "wb") as dst:
+                dst.write(src.read())
+
+    def upload_object(
+        self,
+        obj: str | bytes | BinaryIO,
+        target_filename: str,
+        content_type: str,
+    ) -> None:
+        """
+        Write an in-memory object (bytes or file-like) to the target location.
+        """
+        target_path = self._get_target_path(target_filename)
+
+        # Ensure parent directory exists
+        target_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Write content based on type
+        if isinstance(obj, str):
+            with open(target_path, "w", encoding="utf-8") as f:
+                f.write(obj)
+        elif isinstance(obj, (bytes, bytearray)):
+            with open(target_path, "wb") as f:
+                f.write(obj)
+        else:
+            # Assume it's a file-like object
+            with open(target_path, "wb") as f:
+                f.write(obj.read())
+
+    def _get_target_path(self, target_filename: str) -> Path:
+        """
+        Determine the full target path based on the configured path.
+        - If path is a directory, append target_filename
+        - If path is a file, use it directly (ignore target_filename)
+        """
+        path = self._target.path
+
+        # Check if path is intended to be a directory
+        if path.is_dir() or path.suffix == "" or str(path).endswith("/"):
+            # Treat as directory - append target_filename
+            return path / target_filename
+        else:
+            # Treat as file - use the path directly
+            return path
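
A matching sketch for the new target connector: writing an in-memory result under a target directory. This assumes BaseTargetProcessor offers the same context-manager protocol as the source processors; the output directory is hypothetical.

    from pathlib import Path

    from docling_jobkit.connectors.local_path_target_processor import (
        LocalPathTargetProcessor,
    )
    from docling_jobkit.datamodel.task_targets import LocalPathTarget

    # Hypothetical output directory; it is created during initialization if missing.
    target = LocalPathTarget(path=Path("./data/output/"))

    with LocalPathTargetProcessor(target) as processor:
        # A string payload is written as UTF-8 text; bytes and file-like objects are
        # handled by the other branches of upload_object().
        processor.upload_object(
            obj='{"status": "ok"}',
            target_filename="result.json",
            content_type="application/json",
        )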
docling_jobkit/connectors/s3_source_processor.py

@@ -1,14 +1,20 @@
 from io import BytesIO
-from typing import Iterator
+from typing import Iterator, TypedDict
 
-from
+from docling_core.types.io import DocumentStream
 
-from docling_jobkit.connectors.s3_helper import get_s3_connection
+from docling_jobkit.connectors.s3_helper import get_s3_connection
 from docling_jobkit.connectors.source_processor import BaseSourceProcessor
 from docling_jobkit.datamodel.s3_coords import S3Coordinates
 
 
-class
+class S3FileIdentifier(TypedDict):
+    key: str  # S3 object key
+    size: int  # optional, include if available
+    last_modified: str | None  # ISO timestamp, optional
+
+
+class S3SourceProcessor(BaseSourceProcessor[S3FileIdentifier]):
     def __init__(self, coords: S3Coordinates):
         super().__init__()
         self._coords = coords

@@ -19,25 +25,40 @@ class S3SourceProcessor(BaseSourceProcessor):
     def _finalize(self):
         self._client.close()
 
-    def
-
-
-
-
+    def _list_document_ids(self) -> Iterator[S3FileIdentifier]:
+        paginator = self._client.get_paginator("list_objects_v2")
+        for page in paginator.paginate(
+            Bucket=self._coords.bucket,
+            Prefix=self._coords.key_prefix,
+        ):
+            for obj in page.get("Contents", []):
+                last_modified = obj.get("LastModified", None)
+                yield S3FileIdentifier(
+                    key=obj["Key"],  # type: ignore[typeddict-item] # Key is always present in S3 list_objects_v2 response
+                    size=obj.get("Size", 0),
+                    last_modified=last_modified.isoformat() if last_modified else None,
+                )
+
+    def _count_documents(self) -> int:
+        total = 0
+        paginator = self._client.get_paginator("list_objects_v2")
+        for page in paginator.paginate(
+            Bucket=self._coords.bucket,
+            Prefix=self._coords.key_prefix,
+        ):
+            total += len(page.get("Contents", []))
+        return total
+
+    # ----------------- Document fetch -----------------
+
+    def _fetch_document_by_id(self, identifier: S3FileIdentifier) -> DocumentStream:
+        buffer = BytesIO()
+        self._client.download_fileobj(
+            Bucket=self._coords.bucket, Key=identifier["key"], Fileobj=buffer
         )
+        buffer.seek(0)
+        return DocumentStream(name=identifier["key"], stream=buffer)
 
-
-        for
-
-            buffer = BytesIO()
-            self._client.download_fileobj(
-                Bucket=self._coords.bucket,
-                Key=obj_key,
-                Fileobj=buffer,
-            )
-            buffer.seek(0)
-            yield DocumentStream(
-                name=obj_key,
-                stream=buffer,
-            )
+    def _fetch_documents(self):
+        for key in self._list_document_ids():
+            yield self._fetch_document_by_id(key)
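
The new S3FileIdentifier TypedDict is what _list_document_ids() yields and what _fetch_document_by_id() consumes. A small sketch of its shape; the values below are hypothetical and only mirror what the connector fills from the S3 list_objects_v2 response.

    from docling_jobkit.connectors.s3_source_processor import S3FileIdentifier

    identifier = S3FileIdentifier(
        key="reports/2024/q1.pdf",                  # hypothetical object key
        size=1_204_833,                             # size in bytes, 0 if unknown
        last_modified="2024-03-31T12:00:00+00:00",  # ISO timestamp or None
    )
    print(identifier["key"])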
docling_jobkit/connectors/source_processor.py

@@ -1,11 +1,30 @@
 from abc import ABC, abstractmethod
 from contextlib import AbstractContextManager
-from
+from itertools import islice
+from typing import Callable, Generic, Iterator, Sequence, TypeVar
 
 from docling.datamodel.base_models import DocumentStream
 
+FileIdentifierT = TypeVar("FileIdentifierT")  # identifier type per connector
 
-
+
+class DocumentChunk(Generic[FileIdentifierT]):
+    def __init__(
+        self,
+        ids: Sequence[FileIdentifierT],
+        fetcher: Callable[[FileIdentifierT], DocumentStream],
+        chunk_index: int,
+    ):
+        self.ids = ids
+        self._fetcher = fetcher
+        self.index = chunk_index
+
+    def iter_documents(self) -> Iterator[DocumentStream]:
+        for doc_id in self.ids:
+            yield self._fetcher(doc_id)
+
+
+class BaseSourceProcessor(Generic[FileIdentifierT], AbstractContextManager, ABC):
     """
     Base class for source processors.
     Handles initialization state and context management.

@@ -35,9 +54,40 @@ class BaseSourceProcessor(AbstractContextManager, ABC):
     def _fetch_documents(self) -> Iterator[DocumentStream]:
         """Yield documents from the source."""
 
+    def _list_document_ids(self) -> Iterator[FileIdentifierT] | None:
+        return None
+
+    def _fetch_document_by_id(self, identifier: FileIdentifierT) -> DocumentStream:
+        raise NotImplementedError
+
+    def _count_documents(self) -> int | None:
+        return None
+
     def iterate_documents(self) -> Iterator[DocumentStream]:
         if not self._initialized:
             raise RuntimeError(
                 "Processor not initialized. Use 'with' to open it first."
             )
         yield from self._fetch_documents()
+
+    def iterate_document_chunks(
+        self, chunk_size: int
+    ) -> Iterator[DocumentChunk[FileIdentifierT]]:
+        ids_gen = self._list_document_ids()
+        if ids_gen is None:
+            raise RuntimeError("Connector does not support chunking.")
+
+        chunk_index = 0
+
+        while True:
+            ids = list(islice(ids_gen, chunk_size))
+            if not ids:
+                break
+
+            yield DocumentChunk(
+                ids=ids,
+                fetcher=self._fetch_document_by_id,
+                chunk_index=chunk_index,
+            )
+
+            chunk_index += 1
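
A minimal sketch of the new chunked iteration, reusing the local-path connector from this release: identifiers are gathered per chunk, but documents are only fetched when DocumentChunk.iter_documents() is consumed. The path is hypothetical.

    from pathlib import Path

    from docling_jobkit.connectors.local_path_source_processor import (
        LocalPathSourceProcessor,
    )
    from docling_jobkit.datamodel.task_sources import TaskLocalPathSource

    source = TaskLocalPathSource(path=Path("./data/input/"), pattern="*.pdf")

    with LocalPathSourceProcessor(source) as processor:
        for chunk in processor.iterate_document_chunks(chunk_size=8):
            for doc in chunk.iter_documents():
                print(chunk.index, doc.name)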
docling_jobkit/connectors/source_processor_factory.py

@@ -2,12 +2,16 @@ from docling_jobkit.connectors.google_drive_source_processor import (
     GoogleDriveSourceProcessor,
 )
 from docling_jobkit.connectors.http_source_processor import HttpSourceProcessor
+from docling_jobkit.connectors.local_path_source_processor import (
+    LocalPathSourceProcessor,
+)
 from docling_jobkit.connectors.s3_source_processor import S3SourceProcessor
 from docling_jobkit.connectors.source_processor import BaseSourceProcessor
 from docling_jobkit.datamodel.task_sources import (
     TaskFileSource,
     TaskGoogleDriveSource,
     TaskHttpSource,
+    TaskLocalPathSource,
     TaskS3Source,
     TaskSource,
 )

@@ -20,5 +24,7 @@ def get_source_processor(source: TaskSource) -> BaseSourceProcessor:
         return S3SourceProcessor(source)
     elif isinstance(source, TaskGoogleDriveSource):
         return GoogleDriveSourceProcessor(source)
+    elif isinstance(source, TaskLocalPathSource):
+        return LocalPathSourceProcessor(source)
 
     raise RuntimeError(f"No source processor for this source. {type(source)=}")
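
With the factory updated, the new source kind is resolved like the existing ones; a quick sketch, with a hypothetical path:

    from pathlib import Path

    from docling_jobkit.connectors.source_processor_factory import get_source_processor
    from docling_jobkit.datamodel.task_sources import TaskLocalPathSource

    processor = get_source_processor(TaskLocalPathSource(path=Path("/path/to/documents/")))
    # -> LocalPathSourceProcessor instance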
docling_jobkit/connectors/target_processor_factory.py

@@ -1,10 +1,14 @@
 from docling_jobkit.connectors.google_drive_target_processor import (
     GoogleDriveTargetProcessor,
 )
+from docling_jobkit.connectors.local_path_target_processor import (
+    LocalPathTargetProcessor,
+)
 from docling_jobkit.connectors.s3_target_processor import S3TargetProcessor
 from docling_jobkit.connectors.target_processor import BaseTargetProcessor
 from docling_jobkit.datamodel.task_targets import (
     GoogleDriveTarget,
+    LocalPathTarget,
     S3Target,
     TaskTarget,
 )

@@ -15,5 +19,7 @@ def get_target_processor(target: TaskTarget) -> BaseTargetProcessor:
         return S3TargetProcessor(target)
     if isinstance(target, GoogleDriveTarget):
         return GoogleDriveTargetProcessor(target)
+    if isinstance(target, LocalPathTarget):
+        return LocalPathTargetProcessor(target)
 
     raise RuntimeError(f"No target processor for this target. {type(target)=}")
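
The target factory gains the symmetric branch; sketch, path hypothetical:

    from pathlib import Path

    from docling_jobkit.connectors.target_processor_factory import get_target_processor
    from docling_jobkit.datamodel.task_targets import LocalPathTarget

    processor = get_target_processor(LocalPathTarget(path=Path("/path/to/output/")))
    # -> LocalPathTargetProcessor instance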
docling_jobkit/convert/chunking.py

@@ -220,6 +220,7 @@ def process_chunk_results(
     task: Task,
     conv_results: Iterable[ConversionResult],
     work_dir: Path,
+    chunker_manager: Optional[DocumentChunkerManager] = None,
 ) -> DoclingTaskResult:
     # Let's start by processing the documents
     start_time = time.monotonic()

@@ -234,7 +235,7 @@
     num_failed = 0
 
     # TODO: DocumentChunkerManager should be initialized outside for really working as a cache
-    chunker_manager = DocumentChunkerManager()
+    chunker_manager = chunker_manager or DocumentChunkerManager()
     for conv_res in conv_results:
         errors = conv_res.errors
         filename = conv_res.input.file.name
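
The new optional parameter lets a caller keep one DocumentChunkerManager alive across calls so its cache is actually reused instead of being rebuilt per call. A hedged sketch: only the parameters visible in this hunk are assumed, the import path for DocumentChunkerManager is an assumption, and pending_work/work_dir are hypothetical.

    from docling_jobkit.convert.chunking import (  # import path assumed
        DocumentChunkerManager,
        process_chunk_results,
    )

    shared_chunker = DocumentChunkerManager()

    for task, conv_results in pending_work:  # hypothetical iterable of (Task, results) pairs
        result = process_chunk_results(
            task=task,
            conv_results=conv_results,
            work_dir=work_dir,               # hypothetical Path prepared by the caller
            chunker_manager=shared_chunker,  # reused instead of recreated on every call
        )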
docling_jobkit/convert/manager.py

@@ -28,10 +28,16 @@ from docling.datamodel.pipeline_options import (
     PictureDescriptionVlmOptions,
     ProcessingPipeline,
     TableFormerMode,
+    TableStructureOptions,
     VlmPipelineOptions,
 )
 from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, InlineVlmOptions
-from docling.document_converter import
+from docling.document_converter import (
+    DocumentConverter,
+    FormatOption,
+    ImageFormatOption,
+    PdfFormatOption,
+)
 from docling.models.factories import get_ocr_factory
 from docling.pipeline.vlm_pipeline import VlmPipeline
 from docling_core.types.doc import ImageRefMode

@@ -68,12 +74,28 @@ def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
         data["pipeline_options"] = pdf_format_option.pipeline_options.model_dump(
             serialize_as_any=True, mode="json"
         )
+        data["pipeline_options_type"] = (
+            f"{pdf_format_option.pipeline_options.__class__.__module__}."
+            f"{pdf_format_option.pipeline_options.__class__.__qualname__}"
+        )
+    else:
+        data["pipeline_options_type"] = None
 
     # Replace `pipeline_cls` with a string representation
-
+    pipeline_cls = pdf_format_option.pipeline_cls
+    data["pipeline_cls"] = (
+        f"{pipeline_cls.__module__}.{pipeline_cls.__qualname__}"
+        if pipeline_cls is not None
+        else "None"
+    )
 
     # Replace `backend` with a string representation
-
+    backend = pdf_format_option.backend
+    data["backend"] = (
+        f"{backend.__module__}.{backend.__qualname__}"
+        if backend is not None
+        else "None"
+    )
 
     # Serialize the dictionary to JSON with sorted keys to have consistent hashes
     serialized_data = json.dumps(data, sort_keys=True)

@@ -121,9 +143,19 @@ class DoclingConverterManager:
         @lru_cache(maxsize=cache_size)
         def _get_converter_from_hash(options_hash: bytes) -> DocumentConverter:
             pdf_format_option = self._options_map[options_hash]
+            image_format_option: FormatOption = pdf_format_option
+            if isinstance(pdf_format_option.pipeline_cls, type) and issubclass(
+                pdf_format_option.pipeline_cls, VlmPipeline
+            ):
+                image_format_option = ImageFormatOption(
+                    pipeline_cls=pdf_format_option.pipeline_cls,
+                    pipeline_options=pdf_format_option.pipeline_options,
+                    backend_options=pdf_format_option.backend_options,
+                )
+
             format_options: dict[InputFormat, FormatOption] = {
                 InputFormat.PDF: pdf_format_option,
-                InputFormat.IMAGE:
+                InputFormat.IMAGE: image_format_option,
             }
 
             return DocumentConverter(format_options=format_options)

@@ -178,11 +210,9 @@ class DoclingConverterManager:
             do_picture_classification=request.do_picture_classification,
             do_picture_description=request.do_picture_description,
         )
-        pipeline_options.table_structure_options
-            request.table_mode
-
-        pipeline_options.table_structure_options.do_cell_matching = (
-            request.table_cell_matching
+        pipeline_options.table_structure_options = TableStructureOptions(
+            mode=TableFormerMode(request.table_mode),
+            do_cell_matching=request.table_cell_matching,
         )
 
         if request.image_export_mode != ImageRefMode.PLACEHOLDER:

@@ -282,6 +312,27 @@ class DoclingConverterManager:
                 request.vlm_pipeline_model_api.model_dump()
             )
 
+        pipeline_options.do_picture_classification = request.do_picture_classification
+        pipeline_options.do_picture_description = request.do_picture_description
+
+        if request.picture_description_local is not None:
+            pipeline_options.picture_description_options = (
+                PictureDescriptionVlmOptions.model_validate(
+                    request.picture_description_local.model_dump()
+                )
+            )
+
+        if request.picture_description_api is not None:
+            pipeline_options.picture_description_options = (
+                PictureDescriptionApiOptions.model_validate(
+                    request.picture_description_api.model_dump()
+                )
+            )
+
+        pipeline_options.picture_description_options.picture_area_threshold = (
+            request.picture_description_area_threshold
+        )
+
         return pipeline_options
 
     # Computes the PDF pipeline options and returns the PdfFormatOption and its hash
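
The table-options change replaces per-field mutation with a freshly constructed TableStructureOptions, so the mode and cell matching are always set together. Standalone, the constructed object looks like the sketch below; the concrete values stand in for the request fields.

    from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions

    table_options = TableStructureOptions(
        mode=TableFormerMode.ACCURATE,  # stands in for TableFormerMode(request.table_mode)
        do_cell_matching=True,          # stands in for request.table_cell_matching
    )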
docling_jobkit/datamodel/task_sources.py

@@ -1,6 +1,7 @@
+from pathlib import Path
 from typing import Annotated, Literal
 
-from pydantic import Field
+from pydantic import BaseModel, Field
 
 from docling_jobkit.datamodel.google_drive_coords import GoogleDriveCoordinates
 from docling_jobkit.datamodel.http_inputs import FileSource, HttpSource

@@ -23,7 +24,61 @@ class TaskGoogleDriveSource(GoogleDriveCoordinates):
     kind: Literal["google_drive"] = "google_drive"
 
 
+class TaskLocalPathSource(BaseModel):
+    kind: Literal["local_path"] = "local_path"
+
+    path: Annotated[
+        Path,
+        Field(
+            description=(
+                "Local filesystem path to a file or directory. "
+                "For files, the single file will be processed. "
+                "For directories, files will be discovered based on the pattern and recursive settings. "
+                "Required."
+            ),
+            examples=[
+                "/path/to/document.pdf",
+                "/path/to/documents/",
+                "./data/input/",
+            ],
+        ),
+    ]
+
+    pattern: Annotated[
+        str,
+        Field(
+            description=(
+                "Glob pattern for matching files within a directory. "
+                "Supports standard glob syntax (e.g., '*.pdf', '**/*.docx'). "
+                "Only applicable when path is a directory. "
+                "Optional, defaults to '*' (all files)."
+            ),
+            examples=[
+                "*.pdf",
+                "*.{pdf,docx}",
+                "**/*.pdf",
+                "report_*.pdf",
+            ],
+        ),
+    ] = "*"
+
+    recursive: Annotated[
+        bool,
+        Field(
+            description=(
+                "If True, recursively traverse subdirectories when path is a directory. "
+                "If False, only process files in the immediate directory. "
+                "Optional, defaults to True."
+            ),
+        ),
+    ] = True
+
+
 TaskSource = Annotated[
-    TaskFileSource
+    TaskFileSource
+    | TaskHttpSource
+    | TaskS3Source
+    | TaskGoogleDriveSource
+    | TaskLocalPathSource,
     Field(discriminator="kind"),
 ]
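
Because the union discriminates on kind, a task payload can select the new source with a plain JSON object; a sketch of validating one, with a hypothetical directory:

    from docling_jobkit.datamodel.task_sources import TaskLocalPathSource

    source = TaskLocalPathSource.model_validate(
        {
            "kind": "local_path",
            "path": "./data/input/",  # hypothetical directory
            "pattern": "*.pdf",
            "recursive": False,
        }
    )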
docling_jobkit/datamodel/task_targets.py

@@ -1,3 +1,4 @@
+from pathlib import Path
 from typing import Annotated, Literal
 
 from pydantic import AnyHttpUrl, BaseModel, Field

@@ -27,7 +28,33 @@ class PutTarget(BaseModel):
     url: AnyHttpUrl
 
 
+class LocalPathTarget(BaseModel):
+    kind: Literal["local_path"] = "local_path"
+
+    path: Annotated[
+        Path,
+        Field(
+            description=(
+                "Local filesystem path for output. "
+                "Can be a directory (outputs will be written inside) or a file path. "
+                "Directories will be created if they don't exist. "
+                "Required."
+            ),
+            examples=[
+                "/path/to/output/",
+                "./data/output/",
+                "/path/to/output.json",
+            ],
+        ),
+    ]
+
+
 TaskTarget = Annotated[
-    InBodyTarget
+    InBodyTarget
+    | ZipTarget
+    | S3Target
+    | GoogleDriveTarget
+    | PutTarget
+    | LocalPathTarget,
     Field(discriminator="kind"),
 ]
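
And the matching target payload, validated the same way; the path is hypothetical:

    from docling_jobkit.datamodel.task_targets import LocalPathTarget

    target = LocalPathTarget.model_validate({"kind": "local_path", "path": "/path/to/output/"})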