docling-jobkit 1.8.1__py3-none-any.whl → 1.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docling_jobkit/cli/local.py +14 -3
- docling_jobkit/cli/multiproc.py +504 -0
- docling_jobkit/connectors/google_drive_helper.py +5 -5
- docling_jobkit/connectors/google_drive_source_processor.py +30 -1
- docling_jobkit/connectors/http_source_processor.py +23 -3
- docling_jobkit/connectors/local_path_source_processor.py +126 -0
- docling_jobkit/connectors/local_path_target_processor.py +92 -0
- docling_jobkit/connectors/s3_source_processor.py +45 -24
- docling_jobkit/connectors/source_processor.py +52 -2
- docling_jobkit/connectors/source_processor_factory.py +6 -0
- docling_jobkit/connectors/target_processor_factory.py +6 -0
- docling_jobkit/convert/chunking.py +2 -1
- docling_jobkit/convert/manager.py +4 -5
- docling_jobkit/convert/results_processor.py +1 -1
- docling_jobkit/datamodel/task_sources.py +57 -2
- docling_jobkit/datamodel/task_targets.py +28 -1
- docling_jobkit/orchestrators/local/orchestrator.py +8 -0
- docling_jobkit/orchestrators/local/worker.py +6 -5
- docling_jobkit/orchestrators/rq/orchestrator.py +13 -3
- docling_jobkit/orchestrators/rq/worker.py +3 -0
- docling_jobkit/ray_job/main.py +12 -3
- {docling_jobkit-1.8.1.dist-info → docling_jobkit-1.9.1.dist-info}/METADATA +77 -7
- {docling_jobkit-1.8.1.dist-info → docling_jobkit-1.9.1.dist-info}/RECORD +26 -23
- {docling_jobkit-1.8.1.dist-info → docling_jobkit-1.9.1.dist-info}/entry_points.txt +1 -0
- {docling_jobkit-1.8.1.dist-info → docling_jobkit-1.9.1.dist-info}/WHEEL +0 -0
- {docling_jobkit-1.8.1.dist-info → docling_jobkit-1.9.1.dist-info}/licenses/LICENSE +0 -0
docling_jobkit/connectors/local_path_source_processor.py

@@ -0,0 +1,126 @@
+from pathlib import Path
+from typing import Iterator, TypedDict
+
+from docling_core.types.io import DocumentStream
+
+from docling_jobkit.connectors.source_processor import BaseSourceProcessor
+from docling_jobkit.datamodel.task_sources import TaskLocalPathSource
+
+
+def _should_ignore_file(file_path: Path) -> bool:
+    """
+    Check if a file should be ignored based on common patterns for
+    hidden files, temporary files, and system metadata files.
+
+    Returns True if the file should be ignored, False otherwise.
+    """
+    name = file_path.name
+
+    # Hidden files (starting with .)
+    if name.startswith("."):
+        return True
+
+    # Vim temporary files
+    if name.endswith(("~", ".swp", ".swo")):
+        return True
+
+    # Emacs temporary files
+    if name.startswith("#") and name.endswith("#"):
+        return True
+
+    # Microsoft Office temporary files
+    if name.startswith("~$"):
+        return True
+
+    # Windows thumbnail cache
+    if name.lower() == "thumbs.db":
+        return True
+
+    # Desktop.ini (Windows)
+    if name.lower() == "desktop.ini":
+        return True
+
+    return False
+
+
+class LocalPathFileIdentifier(TypedDict):
+    path: Path
+    size: int
+    last_modified: float
+
+
+class LocalPathSourceProcessor(BaseSourceProcessor[LocalPathFileIdentifier]):
+    def __init__(self, source: TaskLocalPathSource):
+        super().__init__()
+        self._source = source
+
+    def _initialize(self):
+        """Validate that the path exists."""
+        if not self._source.path.exists():
+            raise FileNotFoundError(f"Path does not exist: {self._source.path}")
+
+    def _finalize(self):
+        """No cleanup needed for local filesystem."""
+
+    def _list_document_ids(self) -> Iterator[LocalPathFileIdentifier]:
+        """
+        List all files based on the source configuration.
+        - If path is a file, yield that single file
+        - If path is a directory, discover files based on pattern and recursive settings
+        """
+        path = self._source.path
+
+        if path.is_file():
+            # Single file case
+            stat = path.stat()
+            yield LocalPathFileIdentifier(
+                path=path,
+                size=stat.st_size,
+                last_modified=stat.st_mtime,
+            )
+        elif path.is_dir():
+            # Directory case - use glob or rglob based on recursive setting
+            if self._source.recursive:
+                # Recursive traversal
+                files = path.rglob(self._source.pattern)
+            else:
+                # Non-recursive traversal
+                files = path.glob(self._source.pattern)
+
+            for file_path in files:
+                # Only yield actual files, not directories
+                # Skip hidden files, temporary files, and system metadata
+                if file_path.is_file() and not _should_ignore_file(file_path):
+                    stat = file_path.stat()
+                    yield LocalPathFileIdentifier(
+                        path=file_path,
+                        size=stat.st_size,
+                        last_modified=stat.st_mtime,
+                    )
+        else:
+            raise ValueError(f"Path is neither a file nor a directory: {path}")
+
+    def _count_documents(self) -> int:
+        """Count total number of documents."""
+        return sum(1 for _ in self._list_document_ids())
+
+    def _fetch_document_by_id(
+        self, identifier: LocalPathFileIdentifier
+    ) -> DocumentStream:
+        """Fetch a document by opening the file from the local filesystem."""
+        file_path = identifier["path"]
+
+        # Open file in binary mode and return as DocumentStream
+        with open(file_path, "rb") as f:
+            content = f.read()
+
+        from io import BytesIO
+
+        buffer = BytesIO(content)
+
+        return DocumentStream(name=str(file_path), stream=buffer)
+
+    def _fetch_documents(self) -> Iterator[DocumentStream]:
+        """Iterate through all documents."""
+        for identifier in self._list_document_ids():
+            yield self._fetch_document_by_id(identifier)
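How the new connector might be used end to end, as a sketch (not part of the diff). It assumes BaseSourceProcessor's context-manager protocol calls _initialize on entry and returns the processor, which is what the "Use 'with' to open it first" check in iterate_documents suggests; the paths are illustrative:

from docling_jobkit.connectors.local_path_source_processor import LocalPathSourceProcessor
from docling_jobkit.datamodel.task_sources import TaskLocalPathSource

# Discover all PDFs under ./data/input/, recursively (hypothetical local path).
source = TaskLocalPathSource(path="./data/input/", pattern="*.pdf", recursive=True)

with LocalPathSourceProcessor(source) as processor:
    for doc in processor.iterate_documents():
        # Each item is a DocumentStream holding the file bytes in memory.
        print(doc.name)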
docling_jobkit/connectors/local_path_target_processor.py

@@ -0,0 +1,92 @@
+from pathlib import Path
+from typing import BinaryIO
+
+from docling_jobkit.connectors.target_processor import BaseTargetProcessor
+from docling_jobkit.datamodel.task_targets import LocalPathTarget
+
+
+class LocalPathTargetProcessor(BaseTargetProcessor):
+    def __init__(self, target: LocalPathTarget):
+        super().__init__()
+        self._target = target
+
+    def _initialize(self) -> None:
+        """
+        Ensure the target directory exists.
+        If path is a directory, create it. If it's a file path, create parent directories.
+        """
+        path = self._target.path
+
+        # If path looks like a directory (ends with / or has no extension), treat as directory
+        # Otherwise, create parent directories for the file
+        if path.suffix == "" or str(path).endswith("/"):
+            # Treat as directory
+            path.mkdir(parents=True, exist_ok=True)
+        else:
+            # Treat as file - create parent directories
+            path.parent.mkdir(parents=True, exist_ok=True)
+
+    def _finalize(self) -> None:
+        """No cleanup needed for local filesystem."""
+
+    def upload_file(
+        self,
+        filename: str | Path,
+        target_filename: str,
+        content_type: str,
+    ) -> None:
+        """
+        Copy a file from local filesystem to the target location.
+        """
+        source_path = Path(filename)
+        target_path = self._get_target_path(target_filename)
+
+        # Ensure parent directory exists
+        target_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Copy file content
+        with open(source_path, "rb") as src:
+            with open(target_path, "wb") as dst:
+                dst.write(src.read())
+
+    def upload_object(
+        self,
+        obj: str | bytes | BinaryIO,
+        target_filename: str,
+        content_type: str,
+    ) -> None:
+        """
+        Write an in-memory object (bytes or file-like) to the target location.
+        """
+        target_path = self._get_target_path(target_filename)
+
+        # Ensure parent directory exists
+        target_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Write content based on type
+        if isinstance(obj, str):
+            with open(target_path, "w", encoding="utf-8") as f:
+                f.write(obj)
+        elif isinstance(obj, (bytes, bytearray)):
+            with open(target_path, "wb") as f:
+                f.write(obj)
+        else:
+            # Assume it's a file-like object
+            with open(target_path, "wb") as f:
+                f.write(obj.read())
+
+    def _get_target_path(self, target_filename: str) -> Path:
+        """
+        Determine the full target path based on the configured path.
+        - If path is a directory, append target_filename
+        - If path is a file, use it directly (ignore target_filename)
+        """
+        path = self._target.path
+
+        # Check if path is intended to be a directory
+        if path.is_dir() or path.suffix == "" or str(path).endswith("/"):
+            # Treat as directory - append target_filename
+            return path / target_filename
+        else:
+            # Treat as file - use the path directly
+            return path
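A corresponding sketch for the target side (not part of the diff). It assumes BaseTargetProcessor is likewise opened as a context manager so that _initialize can create the output directory before the first write; names and paths are illustrative:

from docling_jobkit.connectors.local_path_target_processor import LocalPathTargetProcessor
from docling_jobkit.datamodel.task_targets import LocalPathTarget

# No suffix / trailing slash means the path is treated as a directory.
target = LocalPathTarget(path="./data/output/")

with LocalPathTargetProcessor(target) as tp:
    # Writes ./data/output/result.json; str payloads are written as UTF-8 text.
    tp.upload_object('{"status": "ok"}', "result.json", "application/json")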
docling_jobkit/connectors/s3_source_processor.py

@@ -1,14 +1,20 @@
 from io import BytesIO
-from typing import Iterator
+from typing import Iterator, TypedDict
 
-from
+from docling_core.types.io import DocumentStream
 
-from docling_jobkit.connectors.s3_helper import get_s3_connection
+from docling_jobkit.connectors.s3_helper import get_s3_connection
 from docling_jobkit.connectors.source_processor import BaseSourceProcessor
 from docling_jobkit.datamodel.s3_coords import S3Coordinates
 
 
-class
+class S3FileIdentifier(TypedDict):
+    key: str  # S3 object key
+    size: int  # optional, include if available
+    last_modified: str | None  # ISO timestamp, optional
+
+
+class S3SourceProcessor(BaseSourceProcessor[S3FileIdentifier]):
     def __init__(self, coords: S3Coordinates):
         super().__init__()
         self._coords = coords

@@ -19,25 +25,40 @@ class S3SourceProcessor(BaseSourceProcessor):
     def _finalize(self):
         self._client.close()
 
-    def
-
-
-
-
-
+    def _list_document_ids(self) -> Iterator[S3FileIdentifier]:
+        paginator = self._client.get_paginator("list_objects_v2")
+        for page in paginator.paginate(
+            Bucket=self._coords.bucket,
+            Prefix=self._coords.key_prefix,
+        ):
+            for obj in page.get("Contents", []):
+                last_modified = obj.get("LastModified", None)
+                yield S3FileIdentifier(
+                    key=obj["Key"],  # type: ignore[typeddict-item]  # Key is always present in S3 list_objects_v2 response
+                    size=obj.get("Size", 0),
+                    last_modified=last_modified.isoformat() if last_modified else None,
+                )
+
+    def _count_documents(self) -> int:
+        total = 0
+        paginator = self._client.get_paginator("list_objects_v2")
+        for page in paginator.paginate(
+            Bucket=self._coords.bucket,
+            Prefix=self._coords.key_prefix,
+        ):
+            total += len(page.get("Contents", []))
+        return total
+
+    # ----------------- Document fetch -----------------
+
+    def _fetch_document_by_id(self, identifier: S3FileIdentifier) -> DocumentStream:
+        buffer = BytesIO()
+        self._client.download_fileobj(
+            Bucket=self._coords.bucket, Key=identifier["key"], Fileobj=buffer
         )
+        buffer.seek(0)
+        return DocumentStream(name=identifier["key"], stream=buffer)
 
-
-    for
-
-        buffer = BytesIO()
-        self._client.download_fileobj(
-            Bucket=self._coords.bucket,
-            Key=obj_key,
-            Fileobj=buffer,
-        )
-        buffer.seek(0)
-        yield DocumentStream(
-            name=obj_key,
-            stream=buffer,
-        )
+    def _fetch_documents(self):
+        for key in self._list_document_ids():
+            yield self._fetch_document_by_id(key)
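For reference, a small illustration (not from the package) of how one entry from boto3's list_objects_v2 paginator maps onto the new S3FileIdentifier, including the ISO-timestamp conversion performed in _list_document_ids; the object values are made up:

from datetime import datetime, timezone

# Shape of one "Contents" entry returned by list_objects_v2 (hypothetical values).
obj = {
    "Key": "incoming/report.pdf",
    "Size": 1_234_567,
    "LastModified": datetime(2024, 5, 1, tzinfo=timezone.utc),
}

identifier = {
    "key": obj["Key"],
    "size": obj.get("Size", 0),
    "last_modified": obj["LastModified"].isoformat() if obj.get("LastModified") else None,
}
# -> {'key': 'incoming/report.pdf', 'size': 1234567, 'last_modified': '2024-05-01T00:00:00+00:00'}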
docling_jobkit/connectors/source_processor.py

@@ -1,11 +1,30 @@
 from abc import ABC, abstractmethod
 from contextlib import AbstractContextManager
-from
+from itertools import islice
+from typing import Callable, Generic, Iterator, Sequence, TypeVar
 
 from docling.datamodel.base_models import DocumentStream
 
+FileIdentifierT = TypeVar("FileIdentifierT")  # identifier type per connector
 
-
+
+class DocumentChunk(Generic[FileIdentifierT]):
+    def __init__(
+        self,
+        ids: Sequence[FileIdentifierT],
+        fetcher: Callable[[FileIdentifierT], DocumentStream],
+        chunk_index: int,
+    ):
+        self.ids = ids
+        self._fetcher = fetcher
+        self.index = chunk_index
+
+    def iter_documents(self) -> Iterator[DocumentStream]:
+        for doc_id in self.ids:
+            yield self._fetcher(doc_id)
+
+
+class BaseSourceProcessor(Generic[FileIdentifierT], AbstractContextManager, ABC):
     """
     Base class for source processors.
     Handles initialization state and context management.

@@ -35,9 +54,40 @@ class BaseSourceProcessor(AbstractContextManager, ABC):
     def _fetch_documents(self) -> Iterator[DocumentStream]:
         """Yield documents from the source."""
 
+    def _list_document_ids(self) -> Iterator[FileIdentifierT] | None:
+        return None
+
+    def _fetch_document_by_id(self, identifier: FileIdentifierT) -> DocumentStream:
+        raise NotImplementedError
+
+    def _count_documents(self) -> int | None:
+        return None
+
     def iterate_documents(self) -> Iterator[DocumentStream]:
         if not self._initialized:
             raise RuntimeError(
                 "Processor not initialized. Use 'with' to open it first."
             )
         yield from self._fetch_documents()
+
+    def iterate_document_chunks(
+        self, chunk_size: int
+    ) -> Iterator[DocumentChunk[FileIdentifierT]]:
+        ids_gen = self._list_document_ids()
+        if ids_gen is None:
+            raise RuntimeError("Connector does not support chunking.")
+
+        chunk_index = 0
+
+        while True:
+            ids = list(islice(ids_gen, chunk_size))
+            if not ids:
+                break
+
+            yield DocumentChunk(
+                ids=ids,
+                fetcher=self._fetch_document_by_id,
+                chunk_index=chunk_index,
+            )
+
+            chunk_index += 1
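The split between _list_document_ids and _fetch_document_by_id is what makes iterate_document_chunks possible: identifiers are sliced into fixed-size DocumentChunk batches with islice, and each batch only fetches its documents when iter_documents is called. A usage sketch (not part of the diff), shown here with the local-path connector added in this release; the paths and chunk size are illustrative:

from docling_jobkit.connectors.local_path_source_processor import LocalPathSourceProcessor
from docling_jobkit.datamodel.task_sources import TaskLocalPathSource

source = TaskLocalPathSource(path="./data/input/", pattern="*.pdf")

with LocalPathSourceProcessor(source) as processor:
    # Batches of up to 16 identifiers; fetching is deferred until iteration.
    for chunk in processor.iterate_document_chunks(chunk_size=16):
        print(f"chunk {chunk.index}: {len(chunk.ids)} documents")
        for doc in chunk.iter_documents():
            ...  # convert / dispatch each DocumentStream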
docling_jobkit/connectors/source_processor_factory.py

@@ -2,12 +2,16 @@ from docling_jobkit.connectors.google_drive_source_processor import (
     GoogleDriveSourceProcessor,
 )
 from docling_jobkit.connectors.http_source_processor import HttpSourceProcessor
+from docling_jobkit.connectors.local_path_source_processor import (
+    LocalPathSourceProcessor,
+)
 from docling_jobkit.connectors.s3_source_processor import S3SourceProcessor
 from docling_jobkit.connectors.source_processor import BaseSourceProcessor
 from docling_jobkit.datamodel.task_sources import (
     TaskFileSource,
     TaskGoogleDriveSource,
     TaskHttpSource,
+    TaskLocalPathSource,
     TaskS3Source,
     TaskSource,
 )

@@ -20,5 +24,7 @@ def get_source_processor(source: TaskSource) -> BaseSourceProcessor:
         return S3SourceProcessor(source)
     elif isinstance(source, TaskGoogleDriveSource):
         return GoogleDriveSourceProcessor(source)
+    elif isinstance(source, TaskLocalPathSource):
+        return LocalPathSourceProcessor(source)
 
     raise RuntimeError(f"No source processor for this source. {type(source)=}")
docling_jobkit/connectors/target_processor_factory.py

@@ -1,10 +1,14 @@
 from docling_jobkit.connectors.google_drive_target_processor import (
     GoogleDriveTargetProcessor,
 )
+from docling_jobkit.connectors.local_path_target_processor import (
+    LocalPathTargetProcessor,
+)
 from docling_jobkit.connectors.s3_target_processor import S3TargetProcessor
 from docling_jobkit.connectors.target_processor import BaseTargetProcessor
 from docling_jobkit.datamodel.task_targets import (
     GoogleDriveTarget,
+    LocalPathTarget,
     S3Target,
     TaskTarget,
 )

@@ -15,5 +19,7 @@ def get_target_processor(target: TaskTarget) -> BaseTargetProcessor:
         return S3TargetProcessor(target)
     if isinstance(target, GoogleDriveTarget):
         return GoogleDriveTargetProcessor(target)
+    if isinstance(target, LocalPathTarget):
+        return LocalPathTargetProcessor(target)
 
     raise RuntimeError(f"No target processor for this target. {type(target)=}")
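With both factories extended, a "local_path" source or target resolves through the same dispatch as the existing kinds. A small sketch (not part of the diff; the paths are illustrative):

from docling_jobkit.connectors.source_processor_factory import get_source_processor
from docling_jobkit.connectors.target_processor_factory import get_target_processor
from docling_jobkit.datamodel.task_sources import TaskLocalPathSource
from docling_jobkit.datamodel.task_targets import LocalPathTarget

src_proc = get_source_processor(TaskLocalPathSource(path="./data/input/"))  # -> LocalPathSourceProcessor
tgt_proc = get_target_processor(LocalPathTarget(path="./data/output/"))     # -> LocalPathTargetProcessor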
docling_jobkit/convert/chunking.py

@@ -220,6 +220,7 @@ def process_chunk_results(
     task: Task,
     conv_results: Iterable[ConversionResult],
     work_dir: Path,
+    chunker_manager: Optional[DocumentChunkerManager] = None,
 ) -> DoclingTaskResult:
     # Let's start by processing the documents
     start_time = time.monotonic()

@@ -234,7 +235,7 @@ def process_chunk_results(
     num_failed = 0
 
     # TODO: DocumentChunkerManager should be initialized outside for really working as a cache
-    chunker_manager = DocumentChunkerManager()
+    chunker_manager = chunker_manager or DocumentChunkerManager()
     for conv_res in conv_results:
         errors = conv_res.errors
         filename = conv_res.input.file.name
docling_jobkit/convert/manager.py

@@ -28,6 +28,7 @@ from docling.datamodel.pipeline_options import (
     PictureDescriptionVlmOptions,
     ProcessingPipeline,
     TableFormerMode,
+    TableStructureOptions,
     VlmPipelineOptions,
 )
 from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, InlineVlmOptions

@@ -209,11 +210,9 @@ class DoclingConverterManager:
             do_picture_classification=request.do_picture_classification,
             do_picture_description=request.do_picture_description,
         )
-        pipeline_options.table_structure_options
-        request.table_mode
-
-        pipeline_options.table_structure_options.do_cell_matching = (
-            request.table_cell_matching
+        pipeline_options.table_structure_options = TableStructureOptions(
+            mode=TableFormerMode(request.table_mode),
+            do_cell_matching=request.table_cell_matching,
         )
 
         if request.image_export_mode != ImageRefMode.PLACEHOLDER:
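The new code builds a fresh TableStructureOptions instead of mutating attributes of the existing one, so the TableFormer mode and cell matching are set together from the request. A standalone sketch of that construction (the request values shown here are hypothetical):

from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions

table_structure_options = TableStructureOptions(
    mode=TableFormerMode("accurate"),  # e.g. request.table_mode == "accurate"
    do_cell_matching=True,             # e.g. request.table_cell_matching
)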
docling_jobkit/convert/results_processor.py

@@ -38,7 +38,7 @@ class ResultsProcessor:
         to_formats: list[str] | None = None,
         generate_page_images: bool = False,
         generate_picture_images: bool = False,
-        export_parquet_file: bool =
+        export_parquet_file: bool = False,
         scratch_dir: Path | None = None,
     ):
         self._target_processor = target_processor
docling_jobkit/datamodel/task_sources.py

@@ -1,6 +1,7 @@
+from pathlib import Path
 from typing import Annotated, Literal
 
-from pydantic import Field
+from pydantic import BaseModel, Field
 
 from docling_jobkit.datamodel.google_drive_coords import GoogleDriveCoordinates
 from docling_jobkit.datamodel.http_inputs import FileSource, HttpSource

@@ -23,7 +24,61 @@ class TaskGoogleDriveSource(GoogleDriveCoordinates):
     kind: Literal["google_drive"] = "google_drive"
 
 
+class TaskLocalPathSource(BaseModel):
+    kind: Literal["local_path"] = "local_path"
+
+    path: Annotated[
+        Path,
+        Field(
+            description=(
+                "Local filesystem path to a file or directory. "
+                "For files, the single file will be processed. "
+                "For directories, files will be discovered based on the pattern and recursive settings. "
+                "Required."
+            ),
+            examples=[
+                "/path/to/document.pdf",
+                "/path/to/documents/",
+                "./data/input/",
+            ],
+        ),
+    ]
+
+    pattern: Annotated[
+        str,
+        Field(
+            description=(
+                "Glob pattern for matching files within a directory. "
+                "Supports standard glob syntax (e.g., '*.pdf', '**/*.docx'). "
+                "Only applicable when path is a directory. "
+                "Optional, defaults to '*' (all files)."
+            ),
+            examples=[
+                "*.pdf",
+                "*.{pdf,docx}",
+                "**/*.pdf",
+                "report_*.pdf",
+            ],
+        ),
+    ] = "*"
+
+    recursive: Annotated[
+        bool,
+        Field(
+            description=(
+                "If True, recursively traverse subdirectories when path is a directory. "
+                "If False, only process files in the immediate directory. "
+                "Optional, defaults to True."
+            ),
+        ),
+    ] = True
+
+
 TaskSource = Annotated[
-    TaskFileSource
+    TaskFileSource
+    | TaskHttpSource
+    | TaskS3Source
+    | TaskGoogleDriveSource
+    | TaskLocalPathSource,
     Field(discriminator="kind"),
 ]
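TaskSource remains a discriminated union on kind, so a serialized payload with kind set to "local_path" now validates into the new model. A sketch (not from the package), assuming pydantic v2's TypeAdapter and an illustrative payload:

from pydantic import TypeAdapter
from docling_jobkit.datamodel.task_sources import TaskSource

payload = {"kind": "local_path", "path": "./data/input/", "pattern": "*.pdf", "recursive": False}
source = TypeAdapter(TaskSource).validate_python(payload)
print(type(source).__name__)  # TaskLocalPathSource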
docling_jobkit/datamodel/task_targets.py

@@ -1,3 +1,4 @@
+from pathlib import Path
 from typing import Annotated, Literal
 
 from pydantic import AnyHttpUrl, BaseModel, Field

@@ -27,7 +28,33 @@ class PutTarget(BaseModel):
     url: AnyHttpUrl
 
 
+class LocalPathTarget(BaseModel):
+    kind: Literal["local_path"] = "local_path"
+
+    path: Annotated[
+        Path,
+        Field(
+            description=(
+                "Local filesystem path for output. "
+                "Can be a directory (outputs will be written inside) or a file path. "
+                "Directories will be created if they don't exist. "
+                "Required."
+            ),
+            examples=[
+                "/path/to/output/",
+                "./data/output/",
+                "/path/to/output.json",
+            ],
+        ),
+    ]
+
+
 TaskTarget = Annotated[
-    InBodyTarget
+    InBodyTarget
+    | ZipTarget
+    | S3Target
+    | GoogleDriveTarget
+    | PutTarget
+    | LocalPathTarget,
     Field(discriminator="kind"),
 ]
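The TaskTarget union works the same way; a sketch under the same assumption:

from pydantic import TypeAdapter
from docling_jobkit.datamodel.task_targets import TaskTarget

target = TypeAdapter(TaskTarget).validate_python({"kind": "local_path", "path": "./data/output/"})
print(type(target).__name__)  # LocalPathTarget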
docling_jobkit/orchestrators/local/orchestrator.py

@@ -1,4 +1,5 @@
 import asyncio
+import gc
 import logging
 import tempfile
 import uuid

@@ -10,6 +11,7 @@ from pydantic import BaseModel
 
 from docling.datamodel.base_models import InputFormat
 
+from docling_jobkit.convert.chunking import DocumentChunkerManager
 from docling_jobkit.convert.manager import DoclingConverterManager
 from docling_jobkit.datamodel.chunking import BaseChunkerOptions, ChunkingExportOptions
 from docling_jobkit.datamodel.convert import ConvertDocumentsOptions

@@ -41,6 +43,8 @@ class LocalOrchestrator(BaseOrchestrator):
         self.task_queue: asyncio.Queue[str] = asyncio.Queue()
         self.queue_list: list[str] = []
         self.cm = converter_manager
+        self.chunker_manager = DocumentChunkerManager()
+        self.worker_cms: list[DoclingConverterManager] = []
         self._task_results: dict[str, DoclingTaskResult] = {}
         self.scratch_dir = self.config.scratch_dir or Path(
             tempfile.mkdtemp(prefix="docling_")

@@ -129,6 +133,10 @@ class LocalOrchestrator(BaseOrchestrator):
 
     async def clear_converters(self):
         self.cm.clear_cache()
+        self.chunker_manager.clear_cache()
+        for cm in self.worker_cms:
+            cm.clear_cache()
+        gc.collect()
 
     async def check_connection(self):
         pass
docling_jobkit/orchestrators/local/worker.py

@@ -34,11 +34,11 @@ class AsyncLocalWorker:
 
     async def loop(self):
         _log.debug(f"Starting loop for worker {self.worker_id}")
-
-        self.orchestrator.cm
-
-
-
+        if self.use_shared_manager:
+            cm = self.orchestrator.cm
+        else:
+            cm = DoclingConverterManager(self.orchestrator.cm.config)
+            self.orchestrator.worker_cms.append(cm)
         while True:
             task_id: str = await self.orchestrator.task_queue.get()
             self.orchestrator.queue_list.remove(task_id)

@@ -94,6 +94,7 @@ class AsyncLocalWorker:
                     task=task,
                     conv_results=conv_results,
                     work_dir=workdir,
+                    chunker_manager=self.orchestrator.chunker_manager,
                 )
 
                 return processed_results
docling_jobkit/orchestrators/rq/orchestrator.py

@@ -226,12 +226,22 @@ class RQOrchestrator(BaseOrchestrator):
 
     async def delete_task(self, task_id: str):
         _log.info(f"Deleting result of task {task_id=}")
+
+        # Delete the result data from Redis if it exists
         if task_id in self._task_result_keys:
             await self._async_redis_conn.delete(self._task_result_keys[task_id])
             del self._task_result_keys[task_id]
-
-
-
+
+        # Delete the RQ job itself to free up Redis memory
+        # This includes the job metadata and result stream
+        try:
+            job = Job.fetch(task_id, connection=self._redis_conn)
+            job.delete()
+            _log.debug(f"Deleted RQ job {task_id=}")
+        except Exception as e:
+            # Job may not exist or already be deleted - this is not an error
+            _log.debug(f"Could not delete RQ job {task_id=}: {e}")
+
         await super().delete_task(task_id)
 
     async def warm_up_caches(self):