docling-jobkit 1.8.0__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -0,0 +1,126 @@
+ from pathlib import Path
+ from typing import Iterator, TypedDict
+
+ from docling_core.types.io import DocumentStream
+
+ from docling_jobkit.connectors.source_processor import BaseSourceProcessor
+ from docling_jobkit.datamodel.task_sources import TaskLocalPathSource
+
+
+ def _should_ignore_file(file_path: Path) -> bool:
+     """
+     Check if a file should be ignored based on common patterns for
+     hidden files, temporary files, and system metadata files.
+
+     Returns True if the file should be ignored, False otherwise.
+     """
+     name = file_path.name
+
+     # Hidden files (starting with .)
+     if name.startswith("."):
+         return True
+
+     # Vim temporary files
+     if name.endswith(("~", ".swp", ".swo")):
+         return True
+
+     # Emacs temporary files
+     if name.startswith("#") and name.endswith("#"):
+         return True
+
+     # Microsoft Office temporary files
+     if name.startswith("~$"):
+         return True
+
+     # Windows thumbnail cache
+     if name.lower() == "thumbs.db":
+         return True
+
+     # Desktop.ini (Windows)
+     if name.lower() == "desktop.ini":
+         return True
+
+     return False
+
+
+ class LocalPathFileIdentifier(TypedDict):
+     path: Path
+     size: int
+     last_modified: float
+
+
+ class LocalPathSourceProcessor(BaseSourceProcessor[LocalPathFileIdentifier]):
+     def __init__(self, source: TaskLocalPathSource):
+         super().__init__()
+         self._source = source
+
+     def _initialize(self):
+         """Validate that the path exists."""
+         if not self._source.path.exists():
+             raise FileNotFoundError(f"Path does not exist: {self._source.path}")
+
+     def _finalize(self):
+         """No cleanup needed for local filesystem."""
+
+     def _list_document_ids(self) -> Iterator[LocalPathFileIdentifier]:
+         """
+         List all files based on the source configuration.
+         - If path is a file, yield that single file
+         - If path is a directory, discover files based on pattern and recursive settings
+         """
+         path = self._source.path
+
+         if path.is_file():
+             # Single file case
+             stat = path.stat()
+             yield LocalPathFileIdentifier(
+                 path=path,
+                 size=stat.st_size,
+                 last_modified=stat.st_mtime,
+             )
+         elif path.is_dir():
+             # Directory case - use glob or rglob based on recursive setting
+             if self._source.recursive:
+                 # Recursive traversal
+                 files = path.rglob(self._source.pattern)
+             else:
+                 # Non-recursive traversal
+                 files = path.glob(self._source.pattern)
+
+             for file_path in files:
+                 # Only yield actual files, not directories
+                 # Skip hidden files, temporary files, and system metadata
+                 if file_path.is_file() and not _should_ignore_file(file_path):
+                     stat = file_path.stat()
+                     yield LocalPathFileIdentifier(
+                         path=file_path,
+                         size=stat.st_size,
+                         last_modified=stat.st_mtime,
+                     )
+         else:
+             raise ValueError(f"Path is neither a file nor a directory: {path}")
+
+     def _count_documents(self) -> int:
+         """Count total number of documents."""
+         return sum(1 for _ in self._list_document_ids())
+
+     def _fetch_document_by_id(
+         self, identifier: LocalPathFileIdentifier
+     ) -> DocumentStream:
+         """Fetch a document by opening the file from the local filesystem."""
+         file_path = identifier["path"]
+
+         # Open file in binary mode and return as DocumentStream
+         with open(file_path, "rb") as f:
+             content = f.read()
+
+         from io import BytesIO
+
+         buffer = BytesIO(content)
+
+         return DocumentStream(name=str(file_path), stream=buffer)
+
+     def _fetch_documents(self) -> Iterator[DocumentStream]:
+         """Iterate through all documents."""
+         for identifier in self._list_document_ids():
+             yield self._fetch_document_by_id(identifier)
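
The new docling_jobkit.connectors.local_path_source_processor module can be driven through the base-class context-manager protocol. A minimal usage sketch (the input path and glob pattern below are illustrative, not part of the package):

    from docling_jobkit.connectors.local_path_source_processor import (
        LocalPathSourceProcessor,
    )
    from docling_jobkit.datamodel.task_sources import TaskLocalPathSource

    # Hypothetical input location; adjust to your filesystem.
    source = TaskLocalPathSource(path="./data/input/", pattern="*.pdf", recursive=True)

    with LocalPathSourceProcessor(source) as processor:
        for stream in processor.iterate_documents():
            # Each DocumentStream carries the file name and an in-memory buffer.
            print(stream.name)
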
@@ -0,0 +1,92 @@
+ from pathlib import Path
+ from typing import BinaryIO
+
+ from docling_jobkit.connectors.target_processor import BaseTargetProcessor
+ from docling_jobkit.datamodel.task_targets import LocalPathTarget
+
+
+ class LocalPathTargetProcessor(BaseTargetProcessor):
+     def __init__(self, target: LocalPathTarget):
+         super().__init__()
+         self._target = target
+
+     def _initialize(self) -> None:
+         """
+         Ensure the target directory exists.
+         If path is a directory, create it. If it's a file path, create parent directories.
+         """
+         path = self._target.path
+
+         # If path looks like a directory (ends with / or has no extension), treat as directory
+         # Otherwise, create parent directories for the file
+         if path.suffix == "" or str(path).endswith("/"):
+             # Treat as directory
+             path.mkdir(parents=True, exist_ok=True)
+         else:
+             # Treat as file - create parent directories
+             path.parent.mkdir(parents=True, exist_ok=True)
+
+     def _finalize(self) -> None:
+         """No cleanup needed for local filesystem."""
+
+     def upload_file(
+         self,
+         filename: str | Path,
+         target_filename: str,
+         content_type: str,
+     ) -> None:
+         """
+         Copy a file from local filesystem to the target location.
+         """
+         source_path = Path(filename)
+         target_path = self._get_target_path(target_filename)
+
+         # Ensure parent directory exists
+         target_path.parent.mkdir(parents=True, exist_ok=True)
+
+         # Copy file content
+         with open(source_path, "rb") as src:
+             with open(target_path, "wb") as dst:
+                 dst.write(src.read())
+
+     def upload_object(
+         self,
+         obj: str | bytes | BinaryIO,
+         target_filename: str,
+         content_type: str,
+     ) -> None:
+         """
+         Write an in-memory object (bytes or file-like) to the target location.
+         """
+         target_path = self._get_target_path(target_filename)
+
+         # Ensure parent directory exists
+         target_path.parent.mkdir(parents=True, exist_ok=True)
+
+         # Write content based on type
+         if isinstance(obj, str):
+             with open(target_path, "w", encoding="utf-8") as f:
+                 f.write(obj)
+         elif isinstance(obj, (bytes, bytearray)):
+             with open(target_path, "wb") as f:
+                 f.write(obj)
+         else:
+             # Assume it's a file-like object
+             with open(target_path, "wb") as f:
+                 f.write(obj.read())
+
+     def _get_target_path(self, target_filename: str) -> Path:
+         """
+         Determine the full target path based on the configured path.
+         - If path is a directory, append target_filename
+         - If path is a file, use it directly (ignore target_filename)
+         """
+         path = self._target.path
+
+         # Check if path is intended to be a directory
+         if path.is_dir() or path.suffix == "" or str(path).endswith("/"):
+             # Treat as directory - append target_filename
+             return path / target_filename
+         else:
+             # Treat as file - use the path directly
+             return path
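
A corresponding sketch for the new local path target processor, writing an in-memory result into a local output directory. It assumes BaseTargetProcessor follows the same context-manager protocol as the source processors; paths are illustrative, and content_type is accepted but not used for local writes:

    from docling_jobkit.connectors.local_path_target_processor import (
        LocalPathTargetProcessor,
    )
    from docling_jobkit.datamodel.task_targets import LocalPathTarget

    # A path without a suffix is treated as a directory and created on initialization.
    target = LocalPathTarget(path="./data/output/")

    with LocalPathTargetProcessor(target) as processor:  # assumes context-manager support
        # Strings are written as UTF-8 text; bytes and file-like objects in binary mode.
        processor.upload_object('{"status": "ok"}', "result.json", "application/json")
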
@@ -1,14 +1,20 @@
  from io import BytesIO
- from typing import Iterator
+ from typing import Iterator, TypedDict

- from docling.datamodel.base_models import DocumentStream
+ from docling_core.types.io import DocumentStream

- from docling_jobkit.connectors.s3_helper import get_s3_connection, get_source_files
+ from docling_jobkit.connectors.s3_helper import get_s3_connection
  from docling_jobkit.connectors.source_processor import BaseSourceProcessor
  from docling_jobkit.datamodel.s3_coords import S3Coordinates


- class S3SourceProcessor(BaseSourceProcessor):
+ class S3FileIdentifier(TypedDict):
+     key: str # S3 object key
+     size: int # optional, include if available
+     last_modified: str | None # ISO timestamp, optional
+
+
+ class S3SourceProcessor(BaseSourceProcessor[S3FileIdentifier]):
      def __init__(self, coords: S3Coordinates):
          super().__init__()
          self._coords = coords
@@ -19,25 +25,40 @@ class S3SourceProcessor(BaseSourceProcessor):
      def _finalize(self):
          self._client.close()

-     def _fetch_documents(self) -> Iterator[DocumentStream]:
-         # get list of object_keys
-         object_keys = get_source_files(
-             s3_source_client=self._client,
-             s3_source_resource=self._resource,
-             s3_coords=self._coords,
+     def _list_document_ids(self) -> Iterator[S3FileIdentifier]:
+         paginator = self._client.get_paginator("list_objects_v2")
+         for page in paginator.paginate(
+             Bucket=self._coords.bucket,
+             Prefix=self._coords.key_prefix,
+         ):
+             for obj in page.get("Contents", []):
+                 last_modified = obj.get("LastModified", None)
+                 yield S3FileIdentifier(
+                     key=obj["Key"],  # type: ignore[typeddict-item] # Key is always present in S3 list_objects_v2 response
+                     size=obj.get("Size", 0),
+                     last_modified=last_modified.isoformat() if last_modified else None,
+                 )
+
+     def _count_documents(self) -> int:
+         total = 0
+         paginator = self._client.get_paginator("list_objects_v2")
+         for page in paginator.paginate(
+             Bucket=self._coords.bucket,
+             Prefix=self._coords.key_prefix,
+         ):
+             total += len(page.get("Contents", []))
+         return total
+
+     # ----------------- Document fetch -----------------
+
+     def _fetch_document_by_id(self, identifier: S3FileIdentifier) -> DocumentStream:
+         buffer = BytesIO()
+         self._client.download_fileobj(
+             Bucket=self._coords.bucket, Key=identifier["key"], Fileobj=buffer
          )
+         buffer.seek(0)
+         return DocumentStream(name=identifier["key"], stream=buffer)

-         # download and yield one document at the time
-         for obj_key in object_keys:
-             # todo. stream is BytesIO
-             buffer = BytesIO()
-             self._client.download_fileobj(
-                 Bucket=self._coords.bucket,
-                 Key=obj_key,
-                 Fileobj=buffer,
-             )
-             buffer.seek(0)
-             yield DocumentStream(
-                 name=obj_key,
-                 stream=buffer,
-             )
+     def _fetch_documents(self):
+         for key in self._list_document_ids():
+             yield self._fetch_document_by_id(key)
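
With the S3 connector now exposing _list_document_ids and _fetch_document_by_id, the chunked iteration added to BaseSourceProcessor works for S3 sources as well. A minimal sketch, assuming coords is an already-configured S3Coordinates instance (its construction is omitted here):

    from docling_jobkit.connectors.s3_source_processor import S3SourceProcessor

    with S3SourceProcessor(coords) as processor:
        for chunk in processor.iterate_document_chunks(chunk_size=16):
            print(f"chunk {chunk.index}: {len(chunk.ids)} objects")
            for stream in chunk.iter_documents():
                ...  # each object is downloaded lazily, one DocumentStream at a time
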
@@ -1,11 +1,30 @@
  from abc import ABC, abstractmethod
  from contextlib import AbstractContextManager
- from typing import Iterator
+ from itertools import islice
+ from typing import Callable, Generic, Iterator, Sequence, TypeVar

  from docling.datamodel.base_models import DocumentStream

+ FileIdentifierT = TypeVar("FileIdentifierT") # identifier type per connector

- class BaseSourceProcessor(AbstractContextManager, ABC):
+
+ class DocumentChunk(Generic[FileIdentifierT]):
+     def __init__(
+         self,
+         ids: Sequence[FileIdentifierT],
+         fetcher: Callable[[FileIdentifierT], DocumentStream],
+         chunk_index: int,
+     ):
+         self.ids = ids
+         self._fetcher = fetcher
+         self.index = chunk_index
+
+     def iter_documents(self) -> Iterator[DocumentStream]:
+         for doc_id in self.ids:
+             yield self._fetcher(doc_id)
+
+
+ class BaseSourceProcessor(Generic[FileIdentifierT], AbstractContextManager, ABC):
      """
      Base class for source processors.
      Handles initialization state and context management.
@@ -35,9 +54,40 @@ class BaseSourceProcessor(AbstractContextManager, ABC):
      def _fetch_documents(self) -> Iterator[DocumentStream]:
          """Yield documents from the source."""

+     def _list_document_ids(self) -> Iterator[FileIdentifierT] | None:
+         return None
+
+     def _fetch_document_by_id(self, identifier: FileIdentifierT) -> DocumentStream:
+         raise NotImplementedError
+
+     def _count_documents(self) -> int | None:
+         return None
+
      def iterate_documents(self) -> Iterator[DocumentStream]:
          if not self._initialized:
              raise RuntimeError(
                  "Processor not initialized. Use 'with' to open it first."
              )
          yield from self._fetch_documents()
+
+     def iterate_document_chunks(
+         self, chunk_size: int
+     ) -> Iterator[DocumentChunk[FileIdentifierT]]:
+         ids_gen = self._list_document_ids()
+         if ids_gen is None:
+             raise RuntimeError("Connector does not support chunking.")
+
+         chunk_index = 0
+
+         while True:
+             ids = list(islice(ids_gen, chunk_size))
+             if not ids:
+                 break
+
+             yield DocumentChunk(
+                 ids=ids,
+                 fetcher=self._fetch_document_by_id,
+                 chunk_index=chunk_index,
+             )
+
+             chunk_index += 1
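
The chunking API only requires a connector to implement _list_document_ids and _fetch_document_by_id; iterate_document_chunks then slices the identifier stream with islice and hands out lazy DocumentChunk objects. A toy in-memory connector, for illustration only (not part of the package):

    from io import BytesIO
    from typing import Iterator

    from docling.datamodel.base_models import DocumentStream

    from docling_jobkit.connectors.source_processor import BaseSourceProcessor


    class InMemorySourceProcessor(BaseSourceProcessor[str]):
        """Hypothetical connector keyed by document name."""

        def __init__(self, docs: dict[str, bytes]):
            super().__init__()
            self._docs = docs

        def _initialize(self):
            pass

        def _finalize(self):
            pass

        def _list_document_ids(self) -> Iterator[str]:
            yield from self._docs

        def _fetch_document_by_id(self, identifier: str) -> DocumentStream:
            return DocumentStream(name=identifier, stream=BytesIO(self._docs[identifier]))

        def _fetch_documents(self) -> Iterator[DocumentStream]:
            for doc_id in self._list_document_ids():
                yield self._fetch_document_by_id(doc_id)


    with InMemorySourceProcessor({"a.txt": b"A", "b.txt": b"B", "c.txt": b"C"}) as proc:
        for chunk in proc.iterate_document_chunks(chunk_size=2):
            print(chunk.index, [s.name for s in chunk.iter_documents()])
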
@@ -2,12 +2,16 @@ from docling_jobkit.connectors.google_drive_source_processor import (
      GoogleDriveSourceProcessor,
  )
  from docling_jobkit.connectors.http_source_processor import HttpSourceProcessor
+ from docling_jobkit.connectors.local_path_source_processor import (
+     LocalPathSourceProcessor,
+ )
  from docling_jobkit.connectors.s3_source_processor import S3SourceProcessor
  from docling_jobkit.connectors.source_processor import BaseSourceProcessor
  from docling_jobkit.datamodel.task_sources import (
      TaskFileSource,
      TaskGoogleDriveSource,
      TaskHttpSource,
+     TaskLocalPathSource,
      TaskS3Source,
      TaskSource,
  )
@@ -20,5 +24,7 @@ def get_source_processor(source: TaskSource) -> BaseSourceProcessor:
          return S3SourceProcessor(source)
      elif isinstance(source, TaskGoogleDriveSource):
          return GoogleDriveSourceProcessor(source)
+     elif isinstance(source, TaskLocalPathSource):
+         return LocalPathSourceProcessor(source)

      raise RuntimeError(f"No source processor for this source. {type(source)=}")
@@ -1,10 +1,14 @@
  from docling_jobkit.connectors.google_drive_target_processor import (
      GoogleDriveTargetProcessor,
  )
+ from docling_jobkit.connectors.local_path_target_processor import (
+     LocalPathTargetProcessor,
+ )
  from docling_jobkit.connectors.s3_target_processor import S3TargetProcessor
  from docling_jobkit.connectors.target_processor import BaseTargetProcessor
  from docling_jobkit.datamodel.task_targets import (
      GoogleDriveTarget,
+     LocalPathTarget,
      S3Target,
      TaskTarget,
  )
@@ -15,5 +19,7 @@ def get_target_processor(target: TaskTarget) -> BaseTargetProcessor:
          return S3TargetProcessor(target)
      if isinstance(target, GoogleDriveTarget):
          return GoogleDriveTargetProcessor(target)
+     if isinstance(target, LocalPathTarget):
+         return LocalPathTargetProcessor(target)

      raise RuntimeError(f"No target processor for this target. {type(target)=}")
@@ -220,6 +220,7 @@ def process_chunk_results(
      task: Task,
      conv_results: Iterable[ConversionResult],
      work_dir: Path,
+     chunker_manager: Optional[DocumentChunkerManager] = None,
  ) -> DoclingTaskResult:
      # Let's start by processing the documents
      start_time = time.monotonic()
@@ -234,7 +235,7 @@
      num_failed = 0

      # TODO: DocumentChunkerManager should be initialized outside for really working as a cache
-     chunker_manager = DocumentChunkerManager()
+     chunker_manager = chunker_manager or DocumentChunkerManager()
      for conv_res in conv_results:
          errors = conv_res.errors
          filename = conv_res.input.file.name
@@ -28,10 +28,16 @@ from docling.datamodel.pipeline_options import (
      PictureDescriptionVlmOptions,
      ProcessingPipeline,
      TableFormerMode,
+     TableStructureOptions,
      VlmPipelineOptions,
  )
  from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, InlineVlmOptions
- from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+ from docling.document_converter import (
+     DocumentConverter,
+     FormatOption,
+     ImageFormatOption,
+     PdfFormatOption,
+ )
  from docling.models.factories import get_ocr_factory
  from docling.pipeline.vlm_pipeline import VlmPipeline
  from docling_core.types.doc import ImageRefMode
@@ -68,12 +74,28 @@ def _hash_pdf_format_option(pdf_format_option: PdfFormatOption) -> bytes:
          data["pipeline_options"] = pdf_format_option.pipeline_options.model_dump(
              serialize_as_any=True, mode="json"
          )
+         data["pipeline_options_type"] = (
+             f"{pdf_format_option.pipeline_options.__class__.__module__}."
+             f"{pdf_format_option.pipeline_options.__class__.__qualname__}"
+         )
+     else:
+         data["pipeline_options_type"] = None

      # Replace `pipeline_cls` with a string representation
-     data["pipeline_cls"] = repr(data["pipeline_cls"])
+     pipeline_cls = pdf_format_option.pipeline_cls
+     data["pipeline_cls"] = (
+         f"{pipeline_cls.__module__}.{pipeline_cls.__qualname__}"
+         if pipeline_cls is not None
+         else "None"
+     )

      # Replace `backend` with a string representation
-     data["backend"] = repr(data["backend"])
+     backend = pdf_format_option.backend
+     data["backend"] = (
+         f"{backend.__module__}.{backend.__qualname__}"
+         if backend is not None
+         else "None"
+     )

      # Serialize the dictionary to JSON with sorted keys to have consistent hashes
      serialized_data = json.dumps(data, sort_keys=True)
@@ -121,9 +143,19 @@ class DoclingConverterManager:
          @lru_cache(maxsize=cache_size)
          def _get_converter_from_hash(options_hash: bytes) -> DocumentConverter:
              pdf_format_option = self._options_map[options_hash]
+             image_format_option: FormatOption = pdf_format_option
+             if isinstance(pdf_format_option.pipeline_cls, type) and issubclass(
+                 pdf_format_option.pipeline_cls, VlmPipeline
+             ):
+                 image_format_option = ImageFormatOption(
+                     pipeline_cls=pdf_format_option.pipeline_cls,
+                     pipeline_options=pdf_format_option.pipeline_options,
+                     backend_options=pdf_format_option.backend_options,
+                 )
+
              format_options: dict[InputFormat, FormatOption] = {
                  InputFormat.PDF: pdf_format_option,
-                 InputFormat.IMAGE: pdf_format_option,
+                 InputFormat.IMAGE: image_format_option,
              }

              return DocumentConverter(format_options=format_options)
@@ -178,11 +210,9 @@ class DoclingConverterManager:
              do_picture_classification=request.do_picture_classification,
              do_picture_description=request.do_picture_description,
          )
-         pipeline_options.table_structure_options.mode = TableFormerMode(
-             request.table_mode
-         )
-         pipeline_options.table_structure_options.do_cell_matching = (
-             request.table_cell_matching
+         pipeline_options.table_structure_options = TableStructureOptions(
+             mode=TableFormerMode(request.table_mode),
+             do_cell_matching=request.table_cell_matching,
          )

          if request.image_export_mode != ImageRefMode.PLACEHOLDER:
@@ -282,6 +312,27 @@ class DoclingConverterManager:
              request.vlm_pipeline_model_api.model_dump()
          )

+         pipeline_options.do_picture_classification = request.do_picture_classification
+         pipeline_options.do_picture_description = request.do_picture_description
+
+         if request.picture_description_local is not None:
+             pipeline_options.picture_description_options = (
+                 PictureDescriptionVlmOptions.model_validate(
+                     request.picture_description_local.model_dump()
+                 )
+             )
+
+         if request.picture_description_api is not None:
+             pipeline_options.picture_description_options = (
+                 PictureDescriptionApiOptions.model_validate(
+                     request.picture_description_api.model_dump()
+                 )
+             )
+
+         pipeline_options.picture_description_options.picture_area_threshold = (
+             request.picture_description_area_threshold
+         )
+
          return pipeline_options

      # Computes the PDF pipeline options and returns the PdfFormatOption and its hash
@@ -1,6 +1,7 @@
+ from pathlib import Path
  from typing import Annotated, Literal

- from pydantic import Field
+ from pydantic import BaseModel, Field

  from docling_jobkit.datamodel.google_drive_coords import GoogleDriveCoordinates
  from docling_jobkit.datamodel.http_inputs import FileSource, HttpSource
@@ -23,7 +24,61 @@ class TaskGoogleDriveSource(GoogleDriveCoordinates):
      kind: Literal["google_drive"] = "google_drive"


+ class TaskLocalPathSource(BaseModel):
+     kind: Literal["local_path"] = "local_path"
+
+     path: Annotated[
+         Path,
+         Field(
+             description=(
+                 "Local filesystem path to a file or directory. "
+                 "For files, the single file will be processed. "
+                 "For directories, files will be discovered based on the pattern and recursive settings. "
+                 "Required."
+             ),
+             examples=[
+                 "/path/to/document.pdf",
+                 "/path/to/documents/",
+                 "./data/input/",
+             ],
+         ),
+     ]
+
+     pattern: Annotated[
+         str,
+         Field(
+             description=(
+                 "Glob pattern for matching files within a directory. "
+                 "Supports standard glob syntax (e.g., '*.pdf', '**/*.docx'). "
+                 "Only applicable when path is a directory. "
+                 "Optional, defaults to '*' (all files)."
+             ),
+             examples=[
+                 "*.pdf",
+                 "*.{pdf,docx}",
+                 "**/*.pdf",
+                 "report_*.pdf",
+             ],
+         ),
+     ] = "*"
+
+     recursive: Annotated[
+         bool,
+         Field(
+             description=(
+                 "If True, recursively traverse subdirectories when path is a directory. "
+                 "If False, only process files in the immediate directory. "
+                 "Optional, defaults to True."
+             ),
+         ),
+     ] = True
+
+
  TaskSource = Annotated[
-     TaskFileSource | TaskHttpSource | TaskS3Source | TaskGoogleDriveSource,
+     TaskFileSource
+     | TaskHttpSource
+     | TaskS3Source
+     | TaskGoogleDriveSource
+     | TaskLocalPathSource,
      Field(discriminator="kind"),
  ]
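
Because TaskSource is a discriminated union on "kind", the new source can also arrive as a plain JSON payload and be validated with pydantic. A small sketch, assuming pydantic v2's TypeAdapter:

    from pydantic import TypeAdapter

    from docling_jobkit.datamodel.task_sources import TaskLocalPathSource, TaskSource

    payload = {"kind": "local_path", "path": "./data/input/", "pattern": "**/*.pdf"}
    source = TypeAdapter(TaskSource).validate_python(payload)
    assert isinstance(source, TaskLocalPathSource)
    print(source.path, source.pattern, source.recursive)  # recursive defaults to True
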
@@ -1,3 +1,4 @@
+ from pathlib import Path
  from typing import Annotated, Literal

  from pydantic import AnyHttpUrl, BaseModel, Field
@@ -27,7 +28,33 @@ class PutTarget(BaseModel):
      url: AnyHttpUrl


+ class LocalPathTarget(BaseModel):
+     kind: Literal["local_path"] = "local_path"
+
+     path: Annotated[
+         Path,
+         Field(
+             description=(
+                 "Local filesystem path for output. "
+                 "Can be a directory (outputs will be written inside) or a file path. "
+                 "Directories will be created if they don't exist. "
+                 "Required."
+             ),
+             examples=[
+                 "/path/to/output/",
+                 "./data/output/",
+                 "/path/to/output.json",
+             ],
+         ),
+     ]
+
+
  TaskTarget = Annotated[
-     InBodyTarget | ZipTarget | S3Target | GoogleDriveTarget | PutTarget,
+     InBodyTarget
+     | ZipTarget
+     | S3Target
+     | GoogleDriveTarget
+     | PutTarget
+     | LocalPathTarget,
      Field(discriminator="kind"),
  ]
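
The target union gains the matching discriminator entry, so "kind": "local_path" now routes to LocalPathTarget. A minimal sketch, again assuming pydantic v2:

    from pydantic import TypeAdapter

    from docling_jobkit.datamodel.task_targets import LocalPathTarget, TaskTarget

    target = TypeAdapter(TaskTarget).validate_python(
        {"kind": "local_path", "path": "/path/to/output/"}
    )
    assert isinstance(target, LocalPathTarget)
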