docling-jobkit 1.8.1__py3-none-any.whl → 1.9.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. docling_jobkit/cli/local.py +14 -3
  2. docling_jobkit/cli/multiproc.py +504 -0
  3. docling_jobkit/connectors/google_drive_helper.py +5 -5
  4. docling_jobkit/connectors/google_drive_source_processor.py +30 -1
  5. docling_jobkit/connectors/http_source_processor.py +23 -3
  6. docling_jobkit/connectors/local_path_source_processor.py +126 -0
  7. docling_jobkit/connectors/local_path_target_processor.py +92 -0
  8. docling_jobkit/connectors/s3_source_processor.py +45 -24
  9. docling_jobkit/connectors/source_processor.py +52 -2
  10. docling_jobkit/connectors/source_processor_factory.py +6 -0
  11. docling_jobkit/connectors/target_processor_factory.py +6 -0
  12. docling_jobkit/convert/chunking.py +2 -1
  13. docling_jobkit/convert/manager.py +4 -5
  14. docling_jobkit/convert/results_processor.py +1 -1
  15. docling_jobkit/datamodel/task_sources.py +57 -2
  16. docling_jobkit/datamodel/task_targets.py +28 -1
  17. docling_jobkit/orchestrators/local/orchestrator.py +8 -0
  18. docling_jobkit/orchestrators/local/worker.py +6 -5
  19. docling_jobkit/orchestrators/rq/orchestrator.py +13 -3
  20. docling_jobkit/orchestrators/rq/worker.py +3 -0
  21. docling_jobkit/ray_job/main.py +12 -3
  22. {docling_jobkit-1.8.1.dist-info → docling_jobkit-1.9.1.dist-info}/METADATA +77 -7
  23. {docling_jobkit-1.8.1.dist-info → docling_jobkit-1.9.1.dist-info}/RECORD +26 -23
  24. {docling_jobkit-1.8.1.dist-info → docling_jobkit-1.9.1.dist-info}/entry_points.txt +1 -0
  25. {docling_jobkit-1.8.1.dist-info → docling_jobkit-1.9.1.dist-info}/WHEEL +0 -0
  26. {docling_jobkit-1.8.1.dist-info → docling_jobkit-1.9.1.dist-info}/licenses/LICENSE +0 -0

docling_jobkit/connectors/local_path_source_processor.py
@@ -0,0 +1,126 @@
+ from pathlib import Path
+ from typing import Iterator, TypedDict
+
+ from docling_core.types.io import DocumentStream
+
+ from docling_jobkit.connectors.source_processor import BaseSourceProcessor
+ from docling_jobkit.datamodel.task_sources import TaskLocalPathSource
+
+
+ def _should_ignore_file(file_path: Path) -> bool:
+     """
+     Check if a file should be ignored based on common patterns for
+     hidden files, temporary files, and system metadata files.
+
+     Returns True if the file should be ignored, False otherwise.
+     """
+     name = file_path.name
+
+     # Hidden files (starting with .)
+     if name.startswith("."):
+         return True
+
+     # Vim temporary files
+     if name.endswith(("~", ".swp", ".swo")):
+         return True
+
+     # Emacs temporary files
+     if name.startswith("#") and name.endswith("#"):
+         return True
+
+     # Microsoft Office temporary files
+     if name.startswith("~$"):
+         return True
+
+     # Windows thumbnail cache
+     if name.lower() == "thumbs.db":
+         return True
+
+     # Desktop.ini (Windows)
+     if name.lower() == "desktop.ini":
+         return True
+
+     return False
+
+
+ class LocalPathFileIdentifier(TypedDict):
+     path: Path
+     size: int
+     last_modified: float
+
+
+ class LocalPathSourceProcessor(BaseSourceProcessor[LocalPathFileIdentifier]):
+     def __init__(self, source: TaskLocalPathSource):
+         super().__init__()
+         self._source = source
+
+     def _initialize(self):
+         """Validate that the path exists."""
+         if not self._source.path.exists():
+             raise FileNotFoundError(f"Path does not exist: {self._source.path}")
+
+     def _finalize(self):
+         """No cleanup needed for local filesystem."""
+
+     def _list_document_ids(self) -> Iterator[LocalPathFileIdentifier]:
+         """
+         List all files based on the source configuration.
+         - If path is a file, yield that single file
+         - If path is a directory, discover files based on pattern and recursive settings
+         """
+         path = self._source.path
+
+         if path.is_file():
+             # Single file case
+             stat = path.stat()
+             yield LocalPathFileIdentifier(
+                 path=path,
+                 size=stat.st_size,
+                 last_modified=stat.st_mtime,
+             )
+         elif path.is_dir():
+             # Directory case - use glob or rglob based on recursive setting
+             if self._source.recursive:
+                 # Recursive traversal
+                 files = path.rglob(self._source.pattern)
+             else:
+                 # Non-recursive traversal
+                 files = path.glob(self._source.pattern)
+
+             for file_path in files:
+                 # Only yield actual files, not directories
+                 # Skip hidden files, temporary files, and system metadata
+                 if file_path.is_file() and not _should_ignore_file(file_path):
+                     stat = file_path.stat()
+                     yield LocalPathFileIdentifier(
+                         path=file_path,
+                         size=stat.st_size,
+                         last_modified=stat.st_mtime,
+                     )
+         else:
+             raise ValueError(f"Path is neither a file nor a directory: {path}")
+
+     def _count_documents(self) -> int:
+         """Count total number of documents."""
+         return sum(1 for _ in self._list_document_ids())
+
+     def _fetch_document_by_id(
+         self, identifier: LocalPathFileIdentifier
+     ) -> DocumentStream:
+         """Fetch a document by opening the file from the local filesystem."""
+         file_path = identifier["path"]
+
+         # Open file in binary mode and return as DocumentStream
+         with open(file_path, "rb") as f:
+             content = f.read()
+
+         from io import BytesIO
+
+         buffer = BytesIO(content)
+
+         return DocumentStream(name=str(file_path), stream=buffer)
+
+     def _fetch_documents(self) -> Iterator[DocumentStream]:
+         """Iterate through all documents."""
+         for identifier in self._list_document_ids():
+             yield self._fetch_document_by_id(identifier)
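
A minimal usage sketch of the new local-path connector, assuming the base class wires _initialize() into the context-manager protocol as its docstring suggests; the directory, pattern, and printed field below are illustrative only.

from docling_jobkit.connectors.local_path_source_processor import (
    LocalPathSourceProcessor,
)
from docling_jobkit.datamodel.task_sources import TaskLocalPathSource

# Hypothetical input folder; pydantic coerces the string into a Path.
source = TaskLocalPathSource(path="./data/input/", pattern="*.pdf", recursive=True)

with LocalPathSourceProcessor(source) as processor:
    # Entering the context validates the path; iteration skips hidden/temporary files.
    for stream in processor.iterate_documents():
        print(stream.name)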

docling_jobkit/connectors/local_path_target_processor.py
@@ -0,0 +1,92 @@
+ from pathlib import Path
+ from typing import BinaryIO
+
+ from docling_jobkit.connectors.target_processor import BaseTargetProcessor
+ from docling_jobkit.datamodel.task_targets import LocalPathTarget
+
+
+ class LocalPathTargetProcessor(BaseTargetProcessor):
+     def __init__(self, target: LocalPathTarget):
+         super().__init__()
+         self._target = target
+
+     def _initialize(self) -> None:
+         """
+         Ensure the target directory exists.
+         If path is a directory, create it. If it's a file path, create parent directories.
+         """
+         path = self._target.path
+
+         # If path looks like a directory (ends with / or has no extension), treat as directory
+         # Otherwise, create parent directories for the file
+         if path.suffix == "" or str(path).endswith("/"):
+             # Treat as directory
+             path.mkdir(parents=True, exist_ok=True)
+         else:
+             # Treat as file - create parent directories
+             path.parent.mkdir(parents=True, exist_ok=True)
+
+     def _finalize(self) -> None:
+         """No cleanup needed for local filesystem."""
+
+     def upload_file(
+         self,
+         filename: str | Path,
+         target_filename: str,
+         content_type: str,
+     ) -> None:
+         """
+         Copy a file from local filesystem to the target location.
+         """
+         source_path = Path(filename)
+         target_path = self._get_target_path(target_filename)
+
+         # Ensure parent directory exists
+         target_path.parent.mkdir(parents=True, exist_ok=True)
+
+         # Copy file content
+         with open(source_path, "rb") as src:
+             with open(target_path, "wb") as dst:
+                 dst.write(src.read())
+
+     def upload_object(
+         self,
+         obj: str | bytes | BinaryIO,
+         target_filename: str,
+         content_type: str,
+     ) -> None:
+         """
+         Write an in-memory object (bytes or file-like) to the target location.
+         """
+         target_path = self._get_target_path(target_filename)
+
+         # Ensure parent directory exists
+         target_path.parent.mkdir(parents=True, exist_ok=True)
+
+         # Write content based on type
+         if isinstance(obj, str):
+             with open(target_path, "w", encoding="utf-8") as f:
+                 f.write(obj)
+         elif isinstance(obj, (bytes, bytearray)):
+             with open(target_path, "wb") as f:
+                 f.write(obj)
+         else:
+             # Assume it's a file-like object
+             with open(target_path, "wb") as f:
+                 f.write(obj.read())
+
+     def _get_target_path(self, target_filename: str) -> Path:
+         """
+         Determine the full target path based on the configured path.
+         - If path is a directory, append target_filename
+         - If path is a file, use it directly (ignore target_filename)
+         """
+         path = self._target.path
+
+         # Check if path is intended to be a directory
+         if path.is_dir() or path.suffix == "" or str(path).endswith("/"):
+             # Treat as directory - append target_filename
+             return path / target_filename
+         else:
+             # Treat as file - use the path directly
+             return path
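
A matching sketch for the new local-path target, assuming it follows the same context-manager protocol as the other target processors; the paths, filenames, and payloads are placeholders.

from docling_jobkit.connectors.local_path_target_processor import (
    LocalPathTargetProcessor,
)
from docling_jobkit.datamodel.task_targets import LocalPathTarget

# Hypothetical output directory; a suffix-less path is treated as a directory.
target = LocalPathTarget(path="./data/output/")

with LocalPathTargetProcessor(target) as tp:
    # upload_object picks text vs. binary write mode from the object's type.
    tp.upload_object('{"status": "ok"}', "result.json", content_type="application/json")
    tp.upload_object(b"raw bytes", "blob.bin", content_type="application/octet-stream")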

docling_jobkit/connectors/s3_source_processor.py
@@ -1,14 +1,20 @@
  from io import BytesIO
- from typing import Iterator
+ from typing import Iterator, TypedDict
 
- from docling.datamodel.base_models import DocumentStream
+ from docling_core.types.io import DocumentStream
 
- from docling_jobkit.connectors.s3_helper import get_s3_connection, get_source_files
+ from docling_jobkit.connectors.s3_helper import get_s3_connection
  from docling_jobkit.connectors.source_processor import BaseSourceProcessor
  from docling_jobkit.datamodel.s3_coords import S3Coordinates
 
 
- class S3SourceProcessor(BaseSourceProcessor):
+ class S3FileIdentifier(TypedDict):
+     key: str  # S3 object key
+     size: int  # optional, include if available
+     last_modified: str | None  # ISO timestamp, optional
+
+
+ class S3SourceProcessor(BaseSourceProcessor[S3FileIdentifier]):
      def __init__(self, coords: S3Coordinates):
          super().__init__()
          self._coords = coords
@@ -19,25 +25,40 @@ class S3SourceProcessor(BaseSourceProcessor):
      def _finalize(self):
          self._client.close()
 
-     def _fetch_documents(self) -> Iterator[DocumentStream]:
-         # get list of object_keys
-         object_keys = get_source_files(
-             s3_source_client=self._client,
-             s3_source_resource=self._resource,
-             s3_coords=self._coords,
+     def _list_document_ids(self) -> Iterator[S3FileIdentifier]:
+         paginator = self._client.get_paginator("list_objects_v2")
+         for page in paginator.paginate(
+             Bucket=self._coords.bucket,
+             Prefix=self._coords.key_prefix,
+         ):
+             for obj in page.get("Contents", []):
+                 last_modified = obj.get("LastModified", None)
+                 yield S3FileIdentifier(
+                     key=obj["Key"],  # type: ignore[typeddict-item]  # Key is always present in S3 list_objects_v2 response
+                     size=obj.get("Size", 0),
+                     last_modified=last_modified.isoformat() if last_modified else None,
+                 )
+
+     def _count_documents(self) -> int:
+         total = 0
+         paginator = self._client.get_paginator("list_objects_v2")
+         for page in paginator.paginate(
+             Bucket=self._coords.bucket,
+             Prefix=self._coords.key_prefix,
+         ):
+             total += len(page.get("Contents", []))
+         return total
+
+     # ----------------- Document fetch -----------------
+
+     def _fetch_document_by_id(self, identifier: S3FileIdentifier) -> DocumentStream:
+         buffer = BytesIO()
+         self._client.download_fileobj(
+             Bucket=self._coords.bucket, Key=identifier["key"], Fileobj=buffer
          )
+         buffer.seek(0)
+         return DocumentStream(name=identifier["key"], stream=buffer)
 
-         # download and yield one document at the time
-         for obj_key in object_keys:
-             # todo. stream is BytesIO
-             buffer = BytesIO()
-             self._client.download_fileobj(
-                 Bucket=self._coords.bucket,
-                 Key=obj_key,
-                 Fileobj=buffer,
-             )
-             buffer.seek(0)
-             yield DocumentStream(
-                 name=obj_key,
-                 stream=buffer,
-             )
+     def _fetch_documents(self):
+         for key in self._list_document_ids():
+             yield self._fetch_document_by_id(key)
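
The refactor drops get_source_files in favor of direct list_objects_v2 pagination. A standalone boto3 sketch of that listing pattern, with a placeholder bucket and prefix:

import boto3

s3 = boto3.client("s3")
paginator = s3.get_paginator("list_objects_v2")

# Placeholder bucket/prefix; each entry carries Key, Size, and LastModified,
# which map directly onto the S3FileIdentifier fields above.
for page in paginator.paginate(Bucket="my-bucket", Prefix="incoming/"):
    for obj in page.get("Contents", []):
        print(obj["Key"], obj.get("Size", 0))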

docling_jobkit/connectors/source_processor.py
@@ -1,11 +1,30 @@
  from abc import ABC, abstractmethod
  from contextlib import AbstractContextManager
- from typing import Iterator
+ from itertools import islice
+ from typing import Callable, Generic, Iterator, Sequence, TypeVar
 
  from docling.datamodel.base_models import DocumentStream
 
+ FileIdentifierT = TypeVar("FileIdentifierT")  # identifier type per connector
 
- class BaseSourceProcessor(AbstractContextManager, ABC):
+
+ class DocumentChunk(Generic[FileIdentifierT]):
+     def __init__(
+         self,
+         ids: Sequence[FileIdentifierT],
+         fetcher: Callable[[FileIdentifierT], DocumentStream],
+         chunk_index: int,
+     ):
+         self.ids = ids
+         self._fetcher = fetcher
+         self.index = chunk_index
+
+     def iter_documents(self) -> Iterator[DocumentStream]:
+         for doc_id in self.ids:
+             yield self._fetcher(doc_id)
+
+
+ class BaseSourceProcessor(Generic[FileIdentifierT], AbstractContextManager, ABC):
      """
      Base class for source processors.
      Handles initialization state and context management.
@@ -35,9 +54,40 @@ class BaseSourceProcessor(AbstractContextManager, ABC):
      def _fetch_documents(self) -> Iterator[DocumentStream]:
          """Yield documents from the source."""
 
+     def _list_document_ids(self) -> Iterator[FileIdentifierT] | None:
+         return None
+
+     def _fetch_document_by_id(self, identifier: FileIdentifierT) -> DocumentStream:
+         raise NotImplementedError
+
+     def _count_documents(self) -> int | None:
+         return None
+
      def iterate_documents(self) -> Iterator[DocumentStream]:
          if not self._initialized:
              raise RuntimeError(
                  "Processor not initialized. Use 'with' to open it first."
              )
          yield from self._fetch_documents()
+
+     def iterate_document_chunks(
+         self, chunk_size: int
+     ) -> Iterator[DocumentChunk[FileIdentifierT]]:
+         ids_gen = self._list_document_ids()
+         if ids_gen is None:
+             raise RuntimeError("Connector does not support chunking.")
+
+         chunk_index = 0
+
+         while True:
+             ids = list(islice(ids_gen, chunk_size))
+             if not ids:
+                 break
+
+             yield DocumentChunk(
+                 ids=ids,
+                 fetcher=self._fetch_document_by_id,
+                 chunk_index=chunk_index,
+             )
+
+             chunk_index += 1
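
The identifier methods exist so listing can be decoupled from fetching: a DocumentChunk carries only identifiers and fetches documents lazily. A sketch driving the new API with the local-path connector; the directory and chunk size are arbitrary.

from docling_jobkit.connectors.local_path_source_processor import (
    LocalPathSourceProcessor,
)
from docling_jobkit.datamodel.task_sources import TaskLocalPathSource

source = TaskLocalPathSource(path="./data/input/", pattern="*.pdf")

with LocalPathSourceProcessor(source) as processor:
    for chunk in processor.iterate_document_chunks(chunk_size=10):
        # Each chunk could be handed to a separate worker; files are only opened
        # when iter_documents() is consumed.
        print(f"chunk {chunk.index}: {len(chunk.ids)} files")
        for stream in chunk.iter_documents():
            print("  ", stream.name)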

docling_jobkit/connectors/source_processor_factory.py
@@ -2,12 +2,16 @@ from docling_jobkit.connectors.google_drive_source_processor import (
      GoogleDriveSourceProcessor,
  )
  from docling_jobkit.connectors.http_source_processor import HttpSourceProcessor
+ from docling_jobkit.connectors.local_path_source_processor import (
+     LocalPathSourceProcessor,
+ )
  from docling_jobkit.connectors.s3_source_processor import S3SourceProcessor
  from docling_jobkit.connectors.source_processor import BaseSourceProcessor
  from docling_jobkit.datamodel.task_sources import (
      TaskFileSource,
      TaskGoogleDriveSource,
      TaskHttpSource,
+     TaskLocalPathSource,
      TaskS3Source,
      TaskSource,
  )
@@ -20,5 +24,7 @@ def get_source_processor(source: TaskSource) -> BaseSourceProcessor:
          return S3SourceProcessor(source)
      elif isinstance(source, TaskGoogleDriveSource):
          return GoogleDriveSourceProcessor(source)
+     elif isinstance(source, TaskLocalPathSource):
+         return LocalPathSourceProcessor(source)
 
      raise RuntimeError(f"No source processor for this source. {type(source)=}")

docling_jobkit/connectors/target_processor_factory.py
@@ -1,10 +1,14 @@
  from docling_jobkit.connectors.google_drive_target_processor import (
      GoogleDriveTargetProcessor,
  )
+ from docling_jobkit.connectors.local_path_target_processor import (
+     LocalPathTargetProcessor,
+ )
  from docling_jobkit.connectors.s3_target_processor import S3TargetProcessor
  from docling_jobkit.connectors.target_processor import BaseTargetProcessor
  from docling_jobkit.datamodel.task_targets import (
      GoogleDriveTarget,
+     LocalPathTarget,
      S3Target,
      TaskTarget,
  )
@@ -15,5 +19,7 @@ def get_target_processor(target: TaskTarget) -> BaseTargetProcessor:
          return S3TargetProcessor(target)
      if isinstance(target, GoogleDriveTarget):
          return GoogleDriveTargetProcessor(target)
+     if isinstance(target, LocalPathTarget):
+         return LocalPathTargetProcessor(target)
 
      raise RuntimeError(f"No target processor for this target. {type(target)=}")
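
With both factories extended, a local file tree can be wired end to end. A pass-through sketch, assuming both processors expose the same context-manager protocol; in the real service the converter runs between source and target, and all paths here are placeholders.

from docling_jobkit.connectors.source_processor_factory import get_source_processor
from docling_jobkit.connectors.target_processor_factory import get_target_processor
from docling_jobkit.datamodel.task_sources import TaskLocalPathSource
from docling_jobkit.datamodel.task_targets import LocalPathTarget

source_processor = get_source_processor(TaskLocalPathSource(path="./data/input/"))
target_processor = get_target_processor(LocalPathTarget(path="./data/output/"))

with source_processor, target_processor:
    for stream in source_processor.iterate_documents():
        # Copy each input stream verbatim into the output directory.
        target_processor.upload_object(
            stream.stream.read(), stream.name, content_type="application/octet-stream"
        )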

docling_jobkit/convert/chunking.py
@@ -220,6 +220,7 @@ def process_chunk_results(
      task: Task,
      conv_results: Iterable[ConversionResult],
      work_dir: Path,
+     chunker_manager: Optional[DocumentChunkerManager] = None,
  ) -> DoclingTaskResult:
      # Let's start by processing the documents
      start_time = time.monotonic()
@@ -234,7 +235,7 @@
      num_failed = 0
 
      # TODO: DocumentChunkerManager should be initialized outside for really working as a cache
-     chunker_manager = DocumentChunkerManager()
+     chunker_manager = chunker_manager or DocumentChunkerManager()
      for conv_res in conv_results:
          errors = conv_res.errors
          filename = conv_res.input.file.name

docling_jobkit/convert/manager.py
@@ -28,6 +28,7 @@ from docling.datamodel.pipeline_options import (
      PictureDescriptionVlmOptions,
      ProcessingPipeline,
      TableFormerMode,
+     TableStructureOptions,
      VlmPipelineOptions,
  )
  from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, InlineVlmOptions
@@ -209,11 +210,9 @@ class DoclingConverterManager:
              do_picture_classification=request.do_picture_classification,
              do_picture_description=request.do_picture_description,
          )
-         pipeline_options.table_structure_options.mode = TableFormerMode(
-             request.table_mode
-         )
-         pipeline_options.table_structure_options.do_cell_matching = (
-             request.table_cell_matching
+         pipeline_options.table_structure_options = TableStructureOptions(
+             mode=TableFormerMode(request.table_mode),
+             do_cell_matching=request.table_cell_matching,
          )
 
          if request.image_export_mode != ImageRefMode.PLACEHOLDER:
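
The table options are now assigned as a whole TableStructureOptions object instead of mutating fields on the default instance. A standalone sketch of the equivalent construction; the PdfPipelineOptions class and the ACCURATE mode are standard docling options used here only as example values.

from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TableFormerMode,
    TableStructureOptions,
)

pipeline_options = PdfPipelineOptions(do_table_structure=True)
# Build a fresh options object rather than mutating the shared default in place.
pipeline_options.table_structure_options = TableStructureOptions(
    mode=TableFormerMode.ACCURATE,
    do_cell_matching=True,
)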

docling_jobkit/convert/results_processor.py
@@ -38,7 +38,7 @@ class ResultsProcessor:
          to_formats: list[str] | None = None,
          generate_page_images: bool = False,
          generate_picture_images: bool = False,
-         export_parquet_file: bool = True,
+         export_parquet_file: bool = False,
          scratch_dir: Path | None = None,
      ):
          self._target_processor = target_processor

docling_jobkit/datamodel/task_sources.py
@@ -1,6 +1,7 @@
+ from pathlib import Path
  from typing import Annotated, Literal
 
- from pydantic import Field
+ from pydantic import BaseModel, Field
 
  from docling_jobkit.datamodel.google_drive_coords import GoogleDriveCoordinates
  from docling_jobkit.datamodel.http_inputs import FileSource, HttpSource
@@ -23,7 +24,61 @@ class TaskGoogleDriveSource(GoogleDriveCoordinates):
      kind: Literal["google_drive"] = "google_drive"
 
 
+ class TaskLocalPathSource(BaseModel):
+     kind: Literal["local_path"] = "local_path"
+
+     path: Annotated[
+         Path,
+         Field(
+             description=(
+                 "Local filesystem path to a file or directory. "
+                 "For files, the single file will be processed. "
+                 "For directories, files will be discovered based on the pattern and recursive settings. "
+                 "Required."
+             ),
+             examples=[
+                 "/path/to/document.pdf",
+                 "/path/to/documents/",
+                 "./data/input/",
+             ],
+         ),
+     ]
+
+     pattern: Annotated[
+         str,
+         Field(
+             description=(
+                 "Glob pattern for matching files within a directory. "
+                 "Supports standard glob syntax (e.g., '*.pdf', '**/*.docx'). "
+                 "Only applicable when path is a directory. "
+                 "Optional, defaults to '*' (all files)."
+             ),
+             examples=[
+                 "*.pdf",
+                 "*.{pdf,docx}",
+                 "**/*.pdf",
+                 "report_*.pdf",
+             ],
+         ),
+     ] = "*"
+
+     recursive: Annotated[
+         bool,
+         Field(
+             description=(
+                 "If True, recursively traverse subdirectories when path is a directory. "
+                 "If False, only process files in the immediate directory. "
+                 "Optional, defaults to True."
+             ),
+         ),
+     ] = True
+
+
  TaskSource = Annotated[
-     TaskFileSource | TaskHttpSource | TaskS3Source | TaskGoogleDriveSource,
+     TaskFileSource
+     | TaskHttpSource
+     | TaskS3Source
+     | TaskGoogleDriveSource
+     | TaskLocalPathSource,
      Field(discriminator="kind"),
  ]
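
Because TaskSource is a kind-discriminated union, a plain payload is routed to the new model automatically. A validation sketch with placeholder values:

from pydantic import TypeAdapter

from docling_jobkit.datamodel.task_sources import TaskLocalPathSource, TaskSource

payload = {"kind": "local_path", "path": "./data/input/", "pattern": "*.pdf", "recursive": False}
source = TypeAdapter(TaskSource).validate_python(payload)
assert isinstance(source, TaskLocalPathSource)
print(source.path, source.pattern, source.recursive)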

docling_jobkit/datamodel/task_targets.py
@@ -1,3 +1,4 @@
+ from pathlib import Path
  from typing import Annotated, Literal
 
  from pydantic import AnyHttpUrl, BaseModel, Field
@@ -27,7 +28,33 @@ class PutTarget(BaseModel):
      url: AnyHttpUrl
 
 
+ class LocalPathTarget(BaseModel):
+     kind: Literal["local_path"] = "local_path"
+
+     path: Annotated[
+         Path,
+         Field(
+             description=(
+                 "Local filesystem path for output. "
+                 "Can be a directory (outputs will be written inside) or a file path. "
+                 "Directories will be created if they don't exist. "
+                 "Required."
+             ),
+             examples=[
+                 "/path/to/output/",
+                 "./data/output/",
+                 "/path/to/output.json",
+             ],
+         ),
+     ]
+
+
  TaskTarget = Annotated[
-     InBodyTarget | ZipTarget | S3Target | GoogleDriveTarget | PutTarget,
+     InBodyTarget
+     | ZipTarget
+     | S3Target
+     | GoogleDriveTarget
+     | PutTarget
+     | LocalPathTarget,
      Field(discriminator="kind"),
  ]
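
The same discriminator mechanism applies to targets; a short validation sketch with a placeholder path:

from pydantic import TypeAdapter

from docling_jobkit.datamodel.task_targets import LocalPathTarget, TaskTarget

target = TypeAdapter(TaskTarget).validate_python({"kind": "local_path", "path": "./data/output/"})
assert isinstance(target, LocalPathTarget)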

docling_jobkit/orchestrators/local/orchestrator.py
@@ -1,4 +1,5 @@
  import asyncio
+ import gc
  import logging
  import tempfile
  import uuid
@@ -10,6 +11,7 @@ from pydantic import BaseModel
 
  from docling.datamodel.base_models import InputFormat
 
+ from docling_jobkit.convert.chunking import DocumentChunkerManager
  from docling_jobkit.convert.manager import DoclingConverterManager
  from docling_jobkit.datamodel.chunking import BaseChunkerOptions, ChunkingExportOptions
  from docling_jobkit.datamodel.convert import ConvertDocumentsOptions
@@ -41,6 +43,8 @@ class LocalOrchestrator(BaseOrchestrator):
          self.task_queue: asyncio.Queue[str] = asyncio.Queue()
          self.queue_list: list[str] = []
          self.cm = converter_manager
+         self.chunker_manager = DocumentChunkerManager()
+         self.worker_cms: list[DoclingConverterManager] = []
          self._task_results: dict[str, DoclingTaskResult] = {}
          self.scratch_dir = self.config.scratch_dir or Path(
              tempfile.mkdtemp(prefix="docling_")
@@ -129,6 +133,10 @@ class LocalOrchestrator(BaseOrchestrator):
 
      async def clear_converters(self):
          self.cm.clear_cache()
+         self.chunker_manager.clear_cache()
+         for cm in self.worker_cms:
+             cm.clear_cache()
+         gc.collect()
 
      async def check_connection(self):
          pass

docling_jobkit/orchestrators/local/worker.py
@@ -34,11 +34,11 @@ class AsyncLocalWorker:
 
      async def loop(self):
          _log.debug(f"Starting loop for worker {self.worker_id}")
-         cm = (
-             self.orchestrator.cm
-             if self.use_shared_manager
-             else DoclingConverterManager(self.orchestrator.cm.config)
-         )
+         if self.use_shared_manager:
+             cm = self.orchestrator.cm
+         else:
+             cm = DoclingConverterManager(self.orchestrator.cm.config)
+             self.orchestrator.worker_cms.append(cm)
          while True:
              task_id: str = await self.orchestrator.task_queue.get()
              self.orchestrator.queue_list.remove(task_id)
@@ -94,6 +94,7 @@ class AsyncLocalWorker:
                      task=task,
                      conv_results=conv_results,
                      work_dir=workdir,
+                     chunker_manager=self.orchestrator.chunker_manager,
                  )
 
                  return processed_results
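
The orchestrator now keeps a shared chunker manager plus a registry of per-worker converter managers so clear_converters can empty every cache and force a collection. A generic, hypothetical sketch of that registry pattern (the names here are invented for illustration and are not jobkit API):

import gc


class CacheRegistry:
    """Hypothetical helper: workers register their private caches so one clear()
    call can empty all of them and then trigger a GC pass."""

    def __init__(self) -> None:
        self._caches: list[dict] = []

    def register(self) -> dict:
        cache: dict = {}
        self._caches.append(cache)
        return cache

    def clear(self) -> None:
        for cache in self._caches:
            cache.clear()
        gc.collect()


registry = CacheRegistry()
worker_cache = registry.register()  # each non-shared worker keeps its own cache
worker_cache["converter"] = object()
registry.clear()  # mirrors clear_converters(): clear shared and per-worker caches, then gc.collect()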

docling_jobkit/orchestrators/rq/orchestrator.py
@@ -226,12 +226,22 @@ class RQOrchestrator(BaseOrchestrator):
 
      async def delete_task(self, task_id: str):
          _log.info(f"Deleting result of task {task_id=}")
+
+         # Delete the result data from Redis if it exists
          if task_id in self._task_result_keys:
              await self._async_redis_conn.delete(self._task_result_keys[task_id])
              del self._task_result_keys[task_id]
-         # TODO: consider also deleting the task
-         # job = Job.fetch(task_id, connection=self._redis_conn)
-         # job.delete()
+
+         # Delete the RQ job itself to free up Redis memory
+         # This includes the job metadata and result stream
+         try:
+             job = Job.fetch(task_id, connection=self._redis_conn)
+             job.delete()
+             _log.debug(f"Deleted RQ job {task_id=}")
+         except Exception as e:
+             # Job may not exist or already be deleted - this is not an error
+             _log.debug(f"Could not delete RQ job {task_id=}: {e}")
+
          await super().delete_task(task_id)
 
      async def warm_up_caches(self):
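
delete_task now removes the RQ job itself in addition to the stored result key. A standalone sketch of that call sequence; the Redis URL and job id are placeholders.

from redis import Redis
from rq.job import Job

redis_conn = Redis.from_url("redis://localhost:6379/0")  # placeholder connection
task_id = "example-task-id"  # placeholder job id

try:
    # Deleting the job frees its metadata and stored result from Redis.
    job = Job.fetch(task_id, connection=redis_conn)
    job.delete()
except Exception:
    # The job may already be gone; treat that as a no-op, as the orchestrator does.
    pass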