docling-jobkit 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- docling_jobkit/cli/local.py +14 -3
- docling_jobkit/cli/multiproc.py +504 -0
- docling_jobkit/connectors/google_drive_helper.py +5 -5
- docling_jobkit/connectors/google_drive_source_processor.py +30 -1
- docling_jobkit/connectors/http_source_processor.py +23 -3
- docling_jobkit/connectors/local_path_source_processor.py +126 -0
- docling_jobkit/connectors/local_path_target_processor.py +92 -0
- docling_jobkit/connectors/s3_source_processor.py +45 -24
- docling_jobkit/connectors/source_processor.py +52 -2
- docling_jobkit/connectors/source_processor_factory.py +6 -0
- docling_jobkit/connectors/target_processor_factory.py +6 -0
- docling_jobkit/convert/chunking.py +2 -1
- docling_jobkit/convert/manager.py +60 -9
- docling_jobkit/datamodel/task_sources.py +57 -2
- docling_jobkit/datamodel/task_targets.py +28 -1
- docling_jobkit/orchestrators/local/orchestrator.py +8 -0
- docling_jobkit/orchestrators/local/worker.py +6 -5
- docling_jobkit/orchestrators/rq/orchestrator.py +13 -3
- docling_jobkit/orchestrators/rq/worker.py +3 -0
- docling_jobkit/ray_job/main.py +12 -3
- {docling_jobkit-1.8.0.dist-info → docling_jobkit-1.9.0.dist-info}/METADATA +77 -7
- {docling_jobkit-1.8.0.dist-info → docling_jobkit-1.9.0.dist-info}/RECORD +25 -22
- {docling_jobkit-1.8.0.dist-info → docling_jobkit-1.9.0.dist-info}/WHEEL +1 -1
- {docling_jobkit-1.8.0.dist-info → docling_jobkit-1.9.0.dist-info}/entry_points.txt +1 -0
- {docling_jobkit-1.8.0.dist-info → docling_jobkit-1.9.0.dist-info}/licenses/LICENSE +0 -0

docling_jobkit/connectors/local_path_source_processor.py (new file)

@@ -0,0 +1,126 @@
+from pathlib import Path
+from typing import Iterator, TypedDict
+
+from docling_core.types.io import DocumentStream
+
+from docling_jobkit.connectors.source_processor import BaseSourceProcessor
+from docling_jobkit.datamodel.task_sources import TaskLocalPathSource
+
+
+def _should_ignore_file(file_path: Path) -> bool:
+    """
+    Check if a file should be ignored based on common patterns for
+    hidden files, temporary files, and system metadata files.
+
+    Returns True if the file should be ignored, False otherwise.
+    """
+    name = file_path.name
+
+    # Hidden files (starting with .)
+    if name.startswith("."):
+        return True
+
+    # Vim temporary files
+    if name.endswith(("~", ".swp", ".swo")):
+        return True
+
+    # Emacs temporary files
+    if name.startswith("#") and name.endswith("#"):
+        return True
+
+    # Microsoft Office temporary files
+    if name.startswith("~$"):
+        return True
+
+    # Windows thumbnail cache
+    if name.lower() == "thumbs.db":
+        return True
+
+    # Desktop.ini (Windows)
+    if name.lower() == "desktop.ini":
+        return True
+
+    return False
+
+
+class LocalPathFileIdentifier(TypedDict):
+    path: Path
+    size: int
+    last_modified: float
+
+
+class LocalPathSourceProcessor(BaseSourceProcessor[LocalPathFileIdentifier]):
+    def __init__(self, source: TaskLocalPathSource):
+        super().__init__()
+        self._source = source
+
+    def _initialize(self):
+        """Validate that the path exists."""
+        if not self._source.path.exists():
+            raise FileNotFoundError(f"Path does not exist: {self._source.path}")
+
+    def _finalize(self):
+        """No cleanup needed for local filesystem."""
+
+    def _list_document_ids(self) -> Iterator[LocalPathFileIdentifier]:
+        """
+        List all files based on the source configuration.
+        - If path is a file, yield that single file
+        - If path is a directory, discover files based on pattern and recursive settings
+        """
+        path = self._source.path
+
+        if path.is_file():
+            # Single file case
+            stat = path.stat()
+            yield LocalPathFileIdentifier(
+                path=path,
+                size=stat.st_size,
+                last_modified=stat.st_mtime,
+            )
+        elif path.is_dir():
+            # Directory case - use glob or rglob based on recursive setting
+            if self._source.recursive:
+                # Recursive traversal
+                files = path.rglob(self._source.pattern)
+            else:
+                # Non-recursive traversal
+                files = path.glob(self._source.pattern)
+
+            for file_path in files:
+                # Only yield actual files, not directories
+                # Skip hidden files, temporary files, and system metadata
+                if file_path.is_file() and not _should_ignore_file(file_path):
+                    stat = file_path.stat()
+                    yield LocalPathFileIdentifier(
+                        path=file_path,
+                        size=stat.st_size,
+                        last_modified=stat.st_mtime,
+                    )
+        else:
+            raise ValueError(f"Path is neither a file nor a directory: {path}")
+
+    def _count_documents(self) -> int:
+        """Count total number of documents."""
+        return sum(1 for _ in self._list_document_ids())
+
+    def _fetch_document_by_id(
+        self, identifier: LocalPathFileIdentifier
+    ) -> DocumentStream:
+        """Fetch a document by opening the file from the local filesystem."""
+        file_path = identifier["path"]
+
+        # Open file in binary mode and return as DocumentStream
+        with open(file_path, "rb") as f:
+            content = f.read()
+
+        from io import BytesIO
+
+        buffer = BytesIO(content)
+
+        return DocumentStream(name=str(file_path), stream=buffer)
+
+    def _fetch_documents(self) -> Iterator[DocumentStream]:
+        """Iterate through all documents."""
+        for identifier in self._list_document_ids():
+            yield self._fetch_document_by_id(identifier)
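
As a reading aid, here is a minimal usage sketch of the new local-path connector, based only on the APIs visible in this diff (the with-statement requirement comes from BaseSourceProcessor); the input folder and pattern are hypothetical.

    from pathlib import Path

    from docling_jobkit.connectors.local_path_source_processor import (
        LocalPathSourceProcessor,
    )
    from docling_jobkit.datamodel.task_sources import TaskLocalPathSource

    # Hypothetical input folder; pattern and recursive mirror the TaskLocalPathSource fields.
    source = TaskLocalPathSource(path=Path("./data/input/"), pattern="*.pdf", recursive=True)

    # iterate_documents() refuses to run unless the processor was opened with `with`.
    with LocalPathSourceProcessor(source) as processor:
        for doc in processor.iterate_documents():
            print(doc.name)  # the full path of each matched, non-ignored file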
docling_jobkit/connectors/local_path_target_processor.py (new file)

@@ -0,0 +1,92 @@
+from pathlib import Path
+from typing import BinaryIO
+
+from docling_jobkit.connectors.target_processor import BaseTargetProcessor
+from docling_jobkit.datamodel.task_targets import LocalPathTarget
+
+
+class LocalPathTargetProcessor(BaseTargetProcessor):
+    def __init__(self, target: LocalPathTarget):
+        super().__init__()
+        self._target = target
+
+    def _initialize(self) -> None:
+        """
+        Ensure the target directory exists.
+        If path is a directory, create it. If it's a file path, create parent directories.
+        """
+        path = self._target.path
+
+        # If path looks like a directory (ends with / or has no extension), treat as directory
+        # Otherwise, create parent directories for the file
+        if path.suffix == "" or str(path).endswith("/"):
+            # Treat as directory
+            path.mkdir(parents=True, exist_ok=True)
+        else:
+            # Treat as file - create parent directories
+            path.parent.mkdir(parents=True, exist_ok=True)
+
+    def _finalize(self) -> None:
+        """No cleanup needed for local filesystem."""
+
+    def upload_file(
+        self,
+        filename: str | Path,
+        target_filename: str,
+        content_type: str,
+    ) -> None:
+        """
+        Copy a file from local filesystem to the target location.
+        """
+        source_path = Path(filename)
+        target_path = self._get_target_path(target_filename)
+
+        # Ensure parent directory exists
+        target_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Copy file content
+        with open(source_path, "rb") as src:
+            with open(target_path, "wb") as dst:
+                dst.write(src.read())
+
+    def upload_object(
+        self,
+        obj: str | bytes | BinaryIO,
+        target_filename: str,
+        content_type: str,
+    ) -> None:
+        """
+        Write an in-memory object (bytes or file-like) to the target location.
+        """
+        target_path = self._get_target_path(target_filename)
+
+        # Ensure parent directory exists
+        target_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Write content based on type
+        if isinstance(obj, str):
+            with open(target_path, "w", encoding="utf-8") as f:
+                f.write(obj)
+        elif isinstance(obj, (bytes, bytearray)):
+            with open(target_path, "wb") as f:
+                f.write(obj)
+        else:
+            # Assume it's a file-like object
+            with open(target_path, "wb") as f:
+                f.write(obj.read())
+
+    def _get_target_path(self, target_filename: str) -> Path:
+        """
+        Determine the full target path based on the configured path.
+        - If path is a directory, append target_filename
+        - If path is a file, use it directly (ignore target_filename)
+        """
+        path = self._target.path
+
+        # Check if path is intended to be a directory
+        if path.is_dir() or path.suffix == "" or str(path).endswith("/"):
+            # Treat as directory - append target_filename
+            return path / target_filename
+        else:
+            # Treat as file - use the path directly
+            return path
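
A matching sketch for the new target connector: writing an in-memory result under a target directory. This assumes BaseTargetProcessor offers the same context-manager protocol as the source processors; the output directory is hypothetical.

    from pathlib import Path

    from docling_jobkit.connectors.local_path_target_processor import (
        LocalPathTargetProcessor,
    )
    from docling_jobkit.datamodel.task_targets import LocalPathTarget

    # Hypothetical output directory; it is created during initialization if missing.
    target = LocalPathTarget(path=Path("./data/output/"))

    with LocalPathTargetProcessor(target) as processor:
        # A string payload is written as UTF-8 text; bytes and file-like objects are
        # handled by the other branches of upload_object().
        processor.upload_object(
            obj='{"status": "ok"}',
            target_filename="result.json",
            content_type="application/json",
        )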
docling_jobkit/connectors/s3_source_processor.py

@@ -1,14 +1,20 @@
 from io import BytesIO
-from typing import Iterator
+from typing import Iterator, TypedDict
 
-from
+from docling_core.types.io import DocumentStream
 
-from docling_jobkit.connectors.s3_helper import get_s3_connection
+from docling_jobkit.connectors.s3_helper import get_s3_connection
 from docling_jobkit.connectors.source_processor import BaseSourceProcessor
 from docling_jobkit.datamodel.s3_coords import S3Coordinates
 
 
-class
+class S3FileIdentifier(TypedDict):
+    key: str  # S3 object key
+    size: int  # optional, include if available
+    last_modified: str | None  # ISO timestamp, optional
+
+
+class S3SourceProcessor(BaseSourceProcessor[S3FileIdentifier]):
     def __init__(self, coords: S3Coordinates):
         super().__init__()
         self._coords = coords

@@ -19,25 +25,40 @@ class S3SourceProcessor(BaseSourceProcessor):
     def _finalize(self):
         self._client.close()
 
-    def
-
-
-
-
+    def _list_document_ids(self) -> Iterator[S3FileIdentifier]:
+        paginator = self._client.get_paginator("list_objects_v2")
+        for page in paginator.paginate(
+            Bucket=self._coords.bucket,
+            Prefix=self._coords.key_prefix,
+        ):
+            for obj in page.get("Contents", []):
+                last_modified = obj.get("LastModified", None)
+                yield S3FileIdentifier(
+                    key=obj["Key"],  # type: ignore[typeddict-item] # Key is always present in S3 list_objects_v2 response
+                    size=obj.get("Size", 0),
+                    last_modified=last_modified.isoformat() if last_modified else None,
+                )
+
+    def _count_documents(self) -> int:
+        total = 0
+        paginator = self._client.get_paginator("list_objects_v2")
+        for page in paginator.paginate(
+            Bucket=self._coords.bucket,
+            Prefix=self._coords.key_prefix,
+        ):
+            total += len(page.get("Contents", []))
+        return total
+
+    # ----------------- Document fetch -----------------
+
+    def _fetch_document_by_id(self, identifier: S3FileIdentifier) -> DocumentStream:
+        buffer = BytesIO()
+        self._client.download_fileobj(
+            Bucket=self._coords.bucket, Key=identifier["key"], Fileobj=buffer
         )
+        buffer.seek(0)
+        return DocumentStream(name=identifier["key"], stream=buffer)
 
-
-        for
-
-            buffer = BytesIO()
-            self._client.download_fileobj(
-                Bucket=self._coords.bucket,
-                Key=obj_key,
-                Fileobj=buffer,
-            )
-            buffer.seek(0)
-            yield DocumentStream(
-                name=obj_key,
-                stream=buffer,
-            )
+    def _fetch_documents(self):
+        for key in self._list_document_ids():
+            yield self._fetch_document_by_id(key)
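
The new S3FileIdentifier TypedDict is what _list_document_ids() yields and what _fetch_document_by_id() consumes. A small sketch of its shape; the values below are hypothetical and only mirror what the connector fills from the S3 list_objects_v2 response.

    from docling_jobkit.connectors.s3_source_processor import S3FileIdentifier

    identifier = S3FileIdentifier(
        key="reports/2024/q1.pdf",                  # hypothetical object key
        size=1_204_833,                             # size in bytes, 0 if unknown
        last_modified="2024-03-31T12:00:00+00:00",  # ISO timestamp or None
    )
    print(identifier["key"])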
docling_jobkit/connectors/source_processor.py

@@ -1,11 +1,30 @@
 from abc import ABC, abstractmethod
 from contextlib import AbstractContextManager
-from
+from itertools import islice
+from typing import Callable, Generic, Iterator, Sequence, TypeVar
 
 from docling.datamodel.base_models import DocumentStream
 
+FileIdentifierT = TypeVar("FileIdentifierT")  # identifier type per connector
 
-
+
+class DocumentChunk(Generic[FileIdentifierT]):
+    def __init__(
+        self,
+        ids: Sequence[FileIdentifierT],
+        fetcher: Callable[[FileIdentifierT], DocumentStream],
+        chunk_index: int,
+    ):
+        self.ids = ids
+        self._fetcher = fetcher
+        self.index = chunk_index
+
+    def iter_documents(self) -> Iterator[DocumentStream]:
+        for doc_id in self.ids:
+            yield self._fetcher(doc_id)
+
+
+class BaseSourceProcessor(Generic[FileIdentifierT], AbstractContextManager, ABC):
     """
     Base class for source processors.
     Handles initialization state and context management.

@@ -35,9 +54,40 @@ class BaseSourceProcessor(AbstractContextManager, ABC):
     def _fetch_documents(self) -> Iterator[DocumentStream]:
         """Yield documents from the source."""
 
+    def _list_document_ids(self) -> Iterator[FileIdentifierT] | None:
+        return None
+
+    def _fetch_document_by_id(self, identifier: FileIdentifierT) -> DocumentStream:
+        raise NotImplementedError
+
+    def _count_documents(self) -> int | None:
+        return None
+
     def iterate_documents(self) -> Iterator[DocumentStream]:
         if not self._initialized:
             raise RuntimeError(
                 "Processor not initialized. Use 'with' to open it first."
             )
         yield from self._fetch_documents()
+
+    def iterate_document_chunks(
+        self, chunk_size: int
+    ) -> Iterator[DocumentChunk[FileIdentifierT]]:
+        ids_gen = self._list_document_ids()
+        if ids_gen is None:
+            raise RuntimeError("Connector does not support chunking.")
+
+        chunk_index = 0
+
+        while True:
+            ids = list(islice(ids_gen, chunk_size))
+            if not ids:
+                break
+
+            yield DocumentChunk(
+                ids=ids,
+                fetcher=self._fetch_document_by_id,
+                chunk_index=chunk_index,
+            )
+
+            chunk_index += 1
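
A minimal sketch of the new chunked iteration, reusing the local-path connector from this release: identifiers are gathered per chunk, but documents are only fetched when DocumentChunk.iter_documents() is consumed. The path is hypothetical.

    from pathlib import Path

    from docling_jobkit.connectors.local_path_source_processor import (
        LocalPathSourceProcessor,
    )
    from docling_jobkit.datamodel.task_sources import TaskLocalPathSource

    source = TaskLocalPathSource(path=Path("./data/input/"), pattern="*.pdf")

    with LocalPathSourceProcessor(source) as processor:
        for chunk in processor.iterate_document_chunks(chunk_size=8):
            for doc in chunk.iter_documents():
                print(chunk.index, doc.name)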
docling_jobkit/connectors/source_processor_factory.py

@@ -2,12 +2,16 @@ from docling_jobkit.connectors.google_drive_source_processor import (
     GoogleDriveSourceProcessor,
 )
 from docling_jobkit.connectors.http_source_processor import HttpSourceProcessor
+from docling_jobkit.connectors.local_path_source_processor import (
+    LocalPathSourceProcessor,
+)
 from docling_jobkit.connectors.s3_source_processor import S3SourceProcessor
 from docling_jobkit.connectors.source_processor import BaseSourceProcessor
 from docling_jobkit.datamodel.task_sources import (
     TaskFileSource,
     TaskGoogleDriveSource,
     TaskHttpSource,
+    TaskLocalPathSource,
     TaskS3Source,
     TaskSource,
 )

@@ -20,5 +24,7 @@ def get_source_processor(source: TaskSource) -> BaseSourceProcessor:
         return S3SourceProcessor(source)
     elif isinstance(source, TaskGoogleDriveSource):
         return GoogleDriveSourceProcessor(source)
+    elif isinstance(source, TaskLocalPathSource):
+        return LocalPathSourceProcessor(source)
 
     raise RuntimeError(f"No source processor for this source. {type(source)=}")
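
With the factory updated, the new source kind is resolved like the existing ones; a quick sketch, with a hypothetical path:

    from pathlib import Path

    from docling_jobkit.connectors.source_processor_factory import get_source_processor
    from docling_jobkit.datamodel.task_sources import TaskLocalPathSource

    processor = get_source_processor(TaskLocalPathSource(path=Path("/path/to/documents/")))
    # -> LocalPathSourceProcessor instance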
docling_jobkit/connectors/target_processor_factory.py

@@ -1,10 +1,14 @@
 from docling_jobkit.connectors.google_drive_target_processor import (
     GoogleDriveTargetProcessor,
 )
+from docling_jobkit.connectors.local_path_target_processor import (
+    LocalPathTargetProcessor,
+)
 from docling_jobkit.connectors.s3_target_processor import S3TargetProcessor
 from docling_jobkit.connectors.target_processor import BaseTargetProcessor
 from docling_jobkit.datamodel.task_targets import (
     GoogleDriveTarget,
+    LocalPathTarget,
     S3Target,
     TaskTarget,
 )

@@ -15,5 +19,7 @@ def get_target_processor(target: TaskTarget) -> BaseTargetProcessor:
         return S3TargetProcessor(target)
     if isinstance(target, GoogleDriveTarget):
         return GoogleDriveTargetProcessor(target)
+    if isinstance(target, LocalPathTarget):
+        return LocalPathTargetProcessor(target)
 
     raise RuntimeError(f"No target processor for this target. {type(target)=}")
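
The target factory gains the symmetric branch; sketch, path hypothetical:

    from pathlib import Path

    from docling_jobkit.connectors.target_processor_factory import get_target_processor
    from docling_jobkit.datamodel.task_targets import LocalPathTarget

    processor = get_target_processor(LocalPathTarget(path=Path("/path/to/output/")))
    # -> LocalPathTargetProcessor instance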
docling_jobkit/convert/chunking.py

@@ -220,6 +220,7 @@ def process_chunk_results(
     task: Task,
     conv_results: Iterable[ConversionResult],
     work_dir: Path,
+    chunker_manager: Optional[DocumentChunkerManager] = None,
 ) -> DoclingTaskResult:
     # Let's start by processing the documents
     start_time = time.monotonic()

@@ -234,7 +235,7 @@
     num_failed = 0
 
     # TODO: DocumentChunkerManager should be initialized outside for really working as a cache
-    chunker_manager = DocumentChunkerManager()
+    chunker_manager = chunker_manager or DocumentChunkerManager()
     for conv_res in conv_results:
         errors = conv_res.errors
         filename = conv_res.input.file.name
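
The new optional parameter lets a caller keep one DocumentChunkerManager alive across calls so its cache is actually reused instead of being rebuilt per call. A hedged sketch: only the parameters visible in this hunk are assumed, the import path for DocumentChunkerManager is an assumption, and pending_work/work_dir are hypothetical.

    from docling_jobkit.convert.chunking import (  # import path assumed
        DocumentChunkerManager,
        process_chunk_results,
    )

    shared_chunker = DocumentChunkerManager()

    for task, conv_results in pending_work:  # hypothetical iterable of (Task, results) pairs
        result = process_chunk_results(
            task=task,
            conv_results=conv_results,
            work_dir=work_dir,               # hypothetical Path prepared by the caller
            chunker_manager=shared_chunker,  # reused instead of recreated on every call
        )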
docling_jobkit/convert/manager.py

@@ -28,10 +28,16 @@ from docling.datamodel.pipeline_options import (
     PictureDescriptionVlmOptions,
     ProcessingPipeline,
     TableFormerMode,
+    TableStructureOptions,
     VlmPipelineOptions,
 )
 from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, InlineVlmOptions
-from docling.document_converter import
+from docling.document_converter import (
+    DocumentConverter,
+    FormatOption,
+    ImageFormatOption,
+    PdfFormatOption,
+)
 from docling.models.factories import get_ocr_factory
 from docling.pipeline.vlm_pipeline import VlmPipeline
 from docling_core.types.doc import ImageRefMode

@@ -68,12 +74,28 @@ def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
         data["pipeline_options"] = pdf_format_option.pipeline_options.model_dump(
             serialize_as_any=True, mode="json"
         )
+        data["pipeline_options_type"] = (
+            f"{pdf_format_option.pipeline_options.__class__.__module__}."
+            f"{pdf_format_option.pipeline_options.__class__.__qualname__}"
+        )
+    else:
+        data["pipeline_options_type"] = None
 
     # Replace `pipeline_cls` with a string representation
-
+    pipeline_cls = pdf_format_option.pipeline_cls
+    data["pipeline_cls"] = (
+        f"{pipeline_cls.__module__}.{pipeline_cls.__qualname__}"
+        if pipeline_cls is not None
+        else "None"
+    )
 
     # Replace `backend` with a string representation
-
+    backend = pdf_format_option.backend
+    data["backend"] = (
+        f"{backend.__module__}.{backend.__qualname__}"
+        if backend is not None
+        else "None"
+    )
 
     # Serialize the dictionary to JSON with sorted keys to have consistent hashes
     serialized_data = json.dumps(data, sort_keys=True)

@@ -121,9 +143,19 @@ class DoclingConverterManager:
         @lru_cache(maxsize=cache_size)
         def _get_converter_from_hash(options_hash: bytes) -> DocumentConverter:
             pdf_format_option = self._options_map[options_hash]
+            image_format_option: FormatOption = pdf_format_option
+            if isinstance(pdf_format_option.pipeline_cls, type) and issubclass(
+                pdf_format_option.pipeline_cls, VlmPipeline
+            ):
+                image_format_option = ImageFormatOption(
+                    pipeline_cls=pdf_format_option.pipeline_cls,
+                    pipeline_options=pdf_format_option.pipeline_options,
+                    backend_options=pdf_format_option.backend_options,
+                )
+
             format_options: dict[InputFormat, FormatOption] = {
                 InputFormat.PDF: pdf_format_option,
-                InputFormat.IMAGE:
+                InputFormat.IMAGE: image_format_option,
             }
 
             return DocumentConverter(format_options=format_options)

@@ -178,11 +210,9 @@ class DoclingConverterManager:
             do_picture_classification=request.do_picture_classification,
             do_picture_description=request.do_picture_description,
         )
-        pipeline_options.table_structure_options
-            request.table_mode
-
-        pipeline_options.table_structure_options.do_cell_matching = (
-            request.table_cell_matching
+        pipeline_options.table_structure_options = TableStructureOptions(
+            mode=TableFormerMode(request.table_mode),
+            do_cell_matching=request.table_cell_matching,
         )
 
         if request.image_export_mode != ImageRefMode.PLACEHOLDER:

@@ -282,6 +312,27 @@ class DoclingConverterManager:
                 request.vlm_pipeline_model_api.model_dump()
             )
 
+        pipeline_options.do_picture_classification = request.do_picture_classification
+        pipeline_options.do_picture_description = request.do_picture_description
+
+        if request.picture_description_local is not None:
+            pipeline_options.picture_description_options = (
+                PictureDescriptionVlmOptions.model_validate(
+                    request.picture_description_local.model_dump()
+                )
+            )
+
+        if request.picture_description_api is not None:
+            pipeline_options.picture_description_options = (
+                PictureDescriptionApiOptions.model_validate(
+                    request.picture_description_api.model_dump()
+                )
+            )
+
+        pipeline_options.picture_description_options.picture_area_threshold = (
+            request.picture_description_area_threshold
+        )
+
         return pipeline_options
 
     # Computes the PDF pipeline options and returns the PdfFormatOption and its hash
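
The table-options change replaces per-field mutation with a freshly constructed TableStructureOptions, so the mode and cell matching are always set together. Standalone, the constructed object looks like the sketch below; the concrete values stand in for the request fields.

    from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions

    table_options = TableStructureOptions(
        mode=TableFormerMode.ACCURATE,  # stands in for TableFormerMode(request.table_mode)
        do_cell_matching=True,          # stands in for request.table_cell_matching
    )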
docling_jobkit/datamodel/task_sources.py

@@ -1,6 +1,7 @@
+from pathlib import Path
 from typing import Annotated, Literal
 
-from pydantic import Field
+from pydantic import BaseModel, Field
 
 from docling_jobkit.datamodel.google_drive_coords import GoogleDriveCoordinates
 from docling_jobkit.datamodel.http_inputs import FileSource, HttpSource

@@ -23,7 +24,61 @@ class TaskGoogleDriveSource(GoogleDriveCoordinates):
     kind: Literal["google_drive"] = "google_drive"
 
 
+class TaskLocalPathSource(BaseModel):
+    kind: Literal["local_path"] = "local_path"
+
+    path: Annotated[
+        Path,
+        Field(
+            description=(
+                "Local filesystem path to a file or directory. "
+                "For files, the single file will be processed. "
+                "For directories, files will be discovered based on the pattern and recursive settings. "
+                "Required."
+            ),
+            examples=[
+                "/path/to/document.pdf",
+                "/path/to/documents/",
+                "./data/input/",
+            ],
+        ),
+    ]
+
+    pattern: Annotated[
+        str,
+        Field(
+            description=(
+                "Glob pattern for matching files within a directory. "
+                "Supports standard glob syntax (e.g., '*.pdf', '**/*.docx'). "
+                "Only applicable when path is a directory. "
+                "Optional, defaults to '*' (all files)."
+            ),
+            examples=[
+                "*.pdf",
+                "*.{pdf,docx}",
+                "**/*.pdf",
+                "report_*.pdf",
+            ],
+        ),
+    ] = "*"
+
+    recursive: Annotated[
+        bool,
+        Field(
+            description=(
+                "If True, recursively traverse subdirectories when path is a directory. "
+                "If False, only process files in the immediate directory. "
+                "Optional, defaults to True."
+            ),
+        ),
+    ] = True
+
+
 TaskSource = Annotated[
-    TaskFileSource
+    TaskFileSource
+    | TaskHttpSource
+    | TaskS3Source
+    | TaskGoogleDriveSource
+    | TaskLocalPathSource,
     Field(discriminator="kind"),
 ]
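
Because the union discriminates on kind, a task payload can select the new source with a plain JSON object; a sketch of validating one, with a hypothetical directory:

    from docling_jobkit.datamodel.task_sources import TaskLocalPathSource

    source = TaskLocalPathSource.model_validate(
        {
            "kind": "local_path",
            "path": "./data/input/",  # hypothetical directory
            "pattern": "*.pdf",
            "recursive": False,
        }
    )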
docling_jobkit/datamodel/task_targets.py

@@ -1,3 +1,4 @@
+from pathlib import Path
 from typing import Annotated, Literal
 
 from pydantic import AnyHttpUrl, BaseModel, Field

@@ -27,7 +28,33 @@ class PutTarget(BaseModel):
     url: AnyHttpUrl
 
 
+class LocalPathTarget(BaseModel):
+    kind: Literal["local_path"] = "local_path"
+
+    path: Annotated[
+        Path,
+        Field(
+            description=(
+                "Local filesystem path for output. "
+                "Can be a directory (outputs will be written inside) or a file path. "
+                "Directories will be created if they don't exist. "
+                "Required."
+            ),
+            examples=[
+                "/path/to/output/",
+                "./data/output/",
+                "/path/to/output.json",
+            ],
+        ),
+    ]
+
+
 TaskTarget = Annotated[
-    InBodyTarget
+    InBodyTarget
+    | ZipTarget
+    | S3Target
+    | GoogleDriveTarget
+    | PutTarget
+    | LocalPathTarget,
     Field(discriminator="kind"),
 ]
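
And the matching target payload, validated the same way; the path is hypothetical:

    from docling_jobkit.datamodel.task_targets import LocalPathTarget

    target = LocalPathTarget.model_validate({"kind": "local_path", "path": "/path/to/output/"})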