docling-jobkit 1.8.1__py3-none-any.whl → 1.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -18,9 +18,15 @@ from docling_jobkit.datamodel.task_sources import (
     TaskFileSource,
     TaskGoogleDriveSource,
     TaskHttpSource,
+    TaskLocalPathSource,
     TaskS3Source,
 )
-from docling_jobkit.datamodel.task_targets import GoogleDriveTarget, S3Target, ZipTarget
+from docling_jobkit.datamodel.task_targets import (
+    GoogleDriveTarget,
+    LocalPathTarget,
+    S3Target,
+    ZipTarget,
+)
 
 console = Console()
 err_console = Console(stderr=True)
@@ -34,12 +40,17 @@ app = typer.Typer(
 )
 
 JobTaskSource = Annotated[
-    TaskFileSource | TaskHttpSource | TaskS3Source | TaskGoogleDriveSource,
+    TaskFileSource
+    | TaskHttpSource
+    | TaskLocalPathSource
+    | TaskS3Source
+    | TaskGoogleDriveSource,
     Field(discriminator="kind"),
 ]
 
 JobTaskTarget = Annotated[
-    ZipTarget | S3Target | GoogleDriveTarget, Field(discriminator="kind")
+    ZipTarget | LocalPathTarget | S3Target | GoogleDriveTarget,
+    Field(discriminator="kind"),
 ]
 
 
@@ -0,0 +1,504 @@
+import logging
+import multiprocessing as mp
+import queue
+import time
+from pathlib import Path
+from typing import Annotated, Any, Optional
+
+import typer
+import yaml
+from pydantic import BaseModel, Field, ValidationError
+from rich.console import Console
+from rich.progress import (
+    BarColumn,
+    Progress,
+    SpinnerColumn,
+    TaskProgressColumn,
+    TextColumn,
+)
+
+from docling_jobkit.connectors.source_processor_factory import get_source_processor
+from docling_jobkit.connectors.target_processor_factory import get_target_processor
+from docling_jobkit.convert.manager import (
+    DoclingConverterManager,
+    DoclingConverterManagerConfig,
+)
+from docling_jobkit.convert.results_processor import ResultsProcessor
+from docling_jobkit.datamodel.convert import ConvertDocumentsOptions
+from docling_jobkit.datamodel.task_sources import (
+    TaskFileSource,
+    TaskGoogleDriveSource,
+    TaskHttpSource,
+    TaskLocalPathSource,
+    TaskS3Source,
+)
+from docling_jobkit.datamodel.task_targets import (
+    GoogleDriveTarget,
+    LocalPathTarget,
+    S3Target,
+    ZipTarget,
+)
+
+console = Console()
+err_console = Console(stderr=True)
+_log = logging.getLogger(__name__)
+
+app = typer.Typer(
+    name="Docling Jobkit Multiproc",
+    no_args_is_help=True,
+    add_completion=False,
+    pretty_exceptions_enable=False,
+)
+
+JobTaskSource = Annotated[
+    TaskFileSource
+    | TaskHttpSource
+    | TaskLocalPathSource
+    | TaskS3Source
+    | TaskGoogleDriveSource,
+    Field(discriminator="kind"),
+]
+
+JobTaskTarget = Annotated[
+    ZipTarget | LocalPathTarget | S3Target | GoogleDriveTarget,
+    Field(discriminator="kind"),
+]
+
+
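The `JobTaskSource` and `JobTaskTarget` aliases use pydantic's discriminated unions: each model carries a literal `kind` field, and `Field(discriminator="kind")` lets the validator pick the right class directly instead of trying every variant. A minimal, self-contained sketch of the same pattern (the `Demo*` models and their `kind` values are stand-ins, not docling-jobkit types):

```python
from typing import Annotated, Literal

from pydantic import BaseModel, Field, TypeAdapter


class DemoHttpSource(BaseModel):
    kind: Literal["http"] = "http"
    url: str


class DemoLocalPathSource(BaseModel):
    kind: Literal["local_path"] = "local_path"
    path: str


DemoSource = Annotated[
    DemoHttpSource | DemoLocalPathSource,
    Field(discriminator="kind"),
]

# The "kind" value alone decides which model is instantiated.
adapter = TypeAdapter(DemoSource)
src = adapter.validate_python({"kind": "local_path", "path": "/tmp/docs"})
assert isinstance(src, DemoLocalPathSource)
```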
+class JobConfig(BaseModel):
+    options: ConvertDocumentsOptions = ConvertDocumentsOptions()
+    sources: list[JobTaskSource]
+    target: JobTaskTarget
+
+
+class BatchResult(BaseModel):
+    """Result of processing a single batch"""
+
+    chunk_index: int
+    num_documents: int
+    num_succeeded: int
+    num_failed: int
+    failed_documents: list[str]
+    processing_time: float
+    error_message: Optional[str] = None
+
+
+def _load_config(config_file: Path) -> JobConfig:
+    """Load and validate configuration file."""
+    try:
+        with config_file.open("r") as f:
+            raw_data = yaml.safe_load(f)
+        return JobConfig(**raw_data)
+    except FileNotFoundError:
+        err_console.print(f"[red]❌ File not found: {config_file}[/red]")
+        raise typer.Exit(1)
+    except ValidationError as e:
+        err_console.print("[red]❌ Validation failed:[/red]")
+        err_console.print(e.json(indent=2))
+        raise typer.Exit(1)
+
+
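`_load_config` expects the YAML job description to validate against `JobConfig`: optional `options`, a list of `sources`, and a single `target`, each discriminated by its `kind`. A rough sketch of that load-and-validate path; the `kind` values and fields in the YAML string are illustrative assumptions, not the package's documented schema (the real models live in `docling_jobkit.datamodel.task_sources` and `task_targets`):

```python
import yaml
from pydantic import ValidationError

# Hypothetical job description; the source/target fields are assumptions.
RAW = """
sources:
  - kind: http
    url: https://example.com/report.pdf
target:
  kind: zip
"""

try:
    job = JobConfig(**yaml.safe_load(RAW))  # same call chain as _load_config
    print(len(job.sources), job.target.kind)
except ValidationError as e:
    # If the assumed fields don't match the real datamodel, pydantic reports it here.
    print(e.json(indent=2))
```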
+def _process_source(
+    source: JobTaskSource,
+    source_idx: int,
+    total_sources: int,
+    config: JobConfig,
+    batch_size: int,
+    num_processes: int,
+    artifacts_path: Optional[Path],
+    enable_remote_services: bool,
+    allow_external_plugins: bool,
+    quiet: bool,
+    log_level: int,
+    progress_queue: Optional[Any] = None,
+) -> list[BatchResult]:
+    """Process a single source and return batch results."""
+    if not quiet:
+        console.print(
+            f"[bold]Processing source {source_idx + 1}/{total_sources}[/bold]"
+        )
+
+    batch_results: list[BatchResult] = []
+
+    with get_source_processor(source) as source_processor:
+        # Check if source supports chunking
+        try:
+            chunks_iter = source_processor.iterate_document_chunks(batch_size)
+        except RuntimeError as e:
+            err_console.print(f"[red]❌ Source does not support chunking: {e}[/red]")
+            err_console.print(
+                "[yellow]Hint: Only S3 and Google Drive sources support batch processing[/yellow]"
+            )
+            raise typer.Exit(1)
+
+        # Collect all chunks first to know total count
+        chunks = list(chunks_iter)
+        num_chunks = len(chunks)
+
+        if num_chunks == 0:
+            if not quiet:
+                console.print("[yellow]No documents found in source[/yellow]")
+            return batch_results
+
+        # Calculate total number of documents across all chunks
+        total_documents = sum(len(list(chunk.ids)) for chunk in chunks)
+
+        if not quiet:
+            console.print(
+                f"Found {total_documents} documents in {num_chunks} batches to process"
+            )
+
+        # Prepare arguments for each batch
+        batch_args = [
+            (
+                chunk.index,
+                list(chunk.ids),
+                source,
+                config.target,
+                config.options,
+                artifacts_path,
+                enable_remote_services,
+                allow_external_plugins,
+                log_level,
+                progress_queue,
+            )
+            for chunk in chunks
+        ]
+
+        # Process batches in parallel with progress tracking
+        with Progress(
+            SpinnerColumn(),
+            TextColumn("[progress.description]{task.description}"),
+            BarColumn(),
+            TaskProgressColumn(),
+            console=console,
+            disable=quiet,
+        ) as progress:
+            task = progress.add_task("Processing documents...", total=total_documents)
+
+            with mp.Pool(processes=num_processes) as pool:
+                # Start async processing
+                async_results = [
+                    pool.apply_async(process_batch, args) for args in batch_args
+                ]
+
+                # Track which results have been collected
+                collected_indices = set()
+                completed_batches = 0
+
+                # Monitor progress queue while batches are processing
+                while completed_batches < len(batch_args):
+                    try:
+                        # Check for progress updates (non-blocking with timeout)
+                        if progress_queue:
+                            try:
+                                msg = progress_queue.get(timeout=0.1)
+                                if msg == "document_completed":
+                                    progress.update(task, advance=1)
+                            except queue.Empty:
+                                pass
+
+                        # Check if any batch has completed
+                        for idx, async_result in enumerate(async_results):
+                            if idx not in collected_indices and async_result.ready():
+                                batch_result = async_result.get()
+                                batch_results.append(batch_result)
+                                collected_indices.add(idx)
+                                completed_batches += 1
+                    except KeyboardInterrupt:
+                        pool.terminate()
+                        raise
+
+    return batch_results
+
+
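`_process_source` combines two standard multiprocessing ingredients: `Pool.apply_async` to fan the batches out and a `Manager().Queue()` proxy for per-document progress messages (a plain `multiprocessing.Queue` cannot be handed to pool workers as an argument, which is presumably why the CLI uses a manager queue). A stripped-down, runnable sketch of the same polling loop:

```python
import multiprocessing as mp
import queue


def _work(n: int, q) -> int:
    # Toy worker: emit one progress message per "document", then return a count.
    for _ in range(n):
        q.put("document_completed")
    return n


if __name__ == "__main__":
    with mp.Manager() as manager:
        q = manager.Queue()
        with mp.Pool(processes=2) as pool:
            results = [pool.apply_async(_work, (n, q)) for n in (3, 5, 2)]
            ticks = 0
            while not all(r.ready() for r in results):
                try:
                    q.get(timeout=0.1)  # drain progress without blocking on workers
                    ticks += 1
                except queue.Empty:
                    pass
            while True:  # drain anything left after the last worker finished
                try:
                    q.get_nowait()
                    ticks += 1
                except queue.Empty:
                    break
            print(ticks, [r.get() for r in results])  # 10 [3, 5, 2]
```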
+def _display_summary(
+    all_batch_results: list[BatchResult],
+    overall_time: float,
+    quiet: bool,
+) -> None:
+    """Display processing summary and failed documents."""
+    total_batches = len(all_batch_results)
+    total_documents = sum(r.num_documents for r in all_batch_results)
+    total_succeeded = sum(r.num_succeeded for r in all_batch_results)
+    total_failed = sum(r.num_failed for r in all_batch_results)
+
+    if not quiet:
+        console.print()
+        console.print("[bold]Processing Summary[/bold]")
+        console.print("=" * 50)
+
+        console.print(f"Total Batches: {total_batches}")
+        console.print(f"Total Documents: {total_documents}")
+        console.print(f"Successful: {total_succeeded}")
+        console.print(f"Failed: {total_failed}")
+        console.print(f"Total Processing Time: {overall_time:.2f}s")
+        if total_documents > 0:
+            console.print(f"Average per Document: {overall_time / total_documents:.2f}s")
+
+    # Display failed documents if any
+    if total_failed > 0:
+        if not quiet:
+            console.print()
+            console.print("[bold red]Failed Documents:[/bold red]")
+        for batch_result in all_batch_results:
+            if batch_result.num_failed > 0:
+                if not quiet:
+                    console.print(
+                        f"\n[yellow]Batch {batch_result.chunk_index}:[/yellow]"
+                    )
+                if batch_result.error_message:
+                    console.print(f" Batch Error: {batch_result.error_message}")
+                for failed_doc in batch_result.failed_documents:
+                    console.print(f" - {failed_doc}")
+
+    if total_failed > 0:
+        raise typer.Exit(1)
+
+
+def process_batch(
+    chunk_index: int,
+    document_ids: list[Any],
+    source: JobTaskSource,
+    target: JobTaskTarget,
+    options: ConvertDocumentsOptions,
+    artifacts_path: Optional[Path],
+    enable_remote_services: bool,
+    allow_external_plugins: bool,
+    log_level: int,
+    progress_queue: Optional[Any] = None,
+) -> BatchResult:
+    """
+    Process a single batch of documents in a subprocess.
+
+    This function is executed in a separate process and handles:
+    - Initializing source and target processors from config
+    - Converting documents in the batch
+    - Writing results to target
+    - Tracking successes and failures
+
+    Args:
+        chunk_index: Index of this batch/chunk
+        document_ids: List of document identifiers for this batch
+        source: Source configuration
+        target: Target configuration
+        options: Conversion options
+        artifacts_path: Optional path to model artifacts
+        enable_remote_services: Whether to enable remote services
+        allow_external_plugins: Whether to allow external plugins
+
+    Returns:
+        BatchResult with processing statistics and any errors
+    """
+    # Configure logging for this subprocess
+    logging.basicConfig(level=log_level, force=True)
+    logging.getLogger().setLevel(log_level)
+
+    start_time = time.time()
+    num_succeeded = 0
+    num_failed = 0
+    failed_documents: list[str] = []
+
+    try:
+        # Initialize converter manager
+        cm_config = DoclingConverterManagerConfig(
+            artifacts_path=artifacts_path,
+            enable_remote_services=enable_remote_services,
+            allow_external_plugins=allow_external_plugins,
+            options_cache_size=1,
+        )
+        manager = DoclingConverterManager(config=cm_config)
+
+        # Process documents in this batch using factories
+        with get_source_processor(source) as source_processor:
+            with get_target_processor(target) as target_processor:
+                result_processor = ResultsProcessor(
+                    target_processor=target_processor,
+                    to_formats=[v.value for v in options.to_formats],
+                    generate_page_images=options.include_images,
+                    generate_picture_images=options.include_images,
+                )
+
+                # Get a new chunk with the same document IDs
+                # This recreates the chunk in the subprocess context
+                chunk = None
+                for c in source_processor.iterate_document_chunks(len(document_ids)):
+                    # Find the chunk with matching IDs
+                    if list(c.ids) == document_ids:
+                        chunk = c
+                        break
+
+                if chunk is None:
+                    raise RuntimeError(
+                        f"Could not find documents for batch {chunk_index} with IDs: {document_ids}"
+                    )
+
+                # Use the chunk's iter_documents method to get documents
+                documents = list(chunk.iter_documents())
+
+                # Convert and process documents
+                for item in result_processor.process_documents(
+                    manager.convert_documents(
+                        sources=documents,
+                        options=options,
+                    )
+                ):
+                    if "SUCCESS" in item:
+                        num_succeeded += 1
+                    else:
+                        num_failed += 1
+                        failed_documents.append(item)
+
+                    # Send progress update after each document
+                    if progress_queue:
+                        progress_queue.put("document_completed")
+
+        processing_time = time.time() - start_time
+
+        return BatchResult(
+            chunk_index=chunk_index,
+            num_documents=len(document_ids),
+            num_succeeded=num_succeeded,
+            num_failed=num_failed,
+            failed_documents=failed_documents,
+            processing_time=processing_time,
+        )
+
+    except Exception as e:
+        processing_time = time.time() - start_time
+        _log.error(f"Batch {chunk_index} failed with error: {e}")
+        return BatchResult(
+            chunk_index=chunk_index,
+            num_documents=len(document_ids),
+            num_succeeded=num_succeeded,
+            num_failed=len(document_ids) - num_succeeded,
+            failed_documents=failed_documents or [f"Batch error: {e!s}"],
+            processing_time=processing_time,
+            error_message=str(e),
+        )
+
+
+@app.command(no_args_is_help=True)
+def convert(
+    config_file: Annotated[
+        Path,
+        typer.Argument(
+            help="Configuration file of the job", exists=True, readable=True
+        ),
+    ],
+    batch_size: Annotated[
+        int,
+        typer.Option(
+            "--batch-size",
+            "-b",
+            help="Number of documents to process in each batch",
+        ),
+    ] = 10,
+    num_processes: Annotated[
+        Optional[int],
+        typer.Option(
+            "--num-processes",
+            "-n",
+            help="Number of parallel processes (default: 4 or less depending on CPU count)",
+        ),
+    ] = None,
+    artifacts_path: Annotated[
+        Optional[Path],
+        typer.Option(..., help="If provided, the location of the model artifacts."),
+    ] = None,
+    enable_remote_services: Annotated[
+        bool,
+        typer.Option(
+            ..., help="Must be enabled when using models connecting to remote services."
+        ),
+    ] = False,
+    allow_external_plugins: Annotated[
+        bool,
+        typer.Option(
+            ..., help="Must be enabled for loading modules from third-party plugins."
+        ),
+    ] = False,
+    quiet: Annotated[
+        bool,
+        typer.Option(
+            "--quiet",
+            "-q",
+            help="Suppress progress bar and detailed output",
+        ),
+    ] = False,
+    verbose: Annotated[
+        int,
+        typer.Option(
+            "--verbose",
+            "-v",
+            count=True,
+            help="Set the verbosity level. -v for info logging, -vv for debug logging.",
+        ),
+    ] = 0,
+):
+    """
+    Convert documents using multiprocessing for parallel batch processing.
+
+    Each batch of documents is processed in a separate subprocess, allowing
+    for efficient parallel processing of large document collections.
+    """
+    # Configure logging based on verbosity level
+    # Default: WARNING (no -v flag)
+    # -v: INFO level
+    # -vv or more: DEBUG level
+    if verbose == 0:
+        log_level = logging.WARNING
+    elif verbose == 1:
+        log_level = logging.INFO
+    else:
+        log_level = logging.DEBUG
+
+    logging.basicConfig(level=log_level, force=True)
+    logging.getLogger().setLevel(log_level)
+
+    # Determine number of processes
+    if num_processes is None:
+        num_processes = min(mp.cpu_count(), 4)
+
+    if not quiet:
+        console.print("[bold blue]Docling Jobkit Multiproc[/bold blue]")
+        console.print(f"Batch size: {batch_size}")
+        console.print(f"Number of processes: {num_processes}")
+        console.print()
+
+    # Load and validate config file
+    config = _load_config(config_file)
+
+    # Create a queue for progress updates from worker processes
+    manager = mp.Manager()
+    progress_queue = manager.Queue()
+
+    # Process each source
+    all_batch_results: list[BatchResult] = []
+    overall_start_time = time.time()
+
+    for source_idx, source in enumerate(config.sources):
+        batch_results = _process_source(
+            source=source,
+            source_idx=source_idx,
+            total_sources=len(config.sources),
+            config=config,
+            batch_size=batch_size,
+            num_processes=num_processes,
+            artifacts_path=artifacts_path,
+            enable_remote_services=enable_remote_services,
+            allow_external_plugins=allow_external_plugins,
+            quiet=quiet,
+            log_level=log_level,
+            progress_queue=progress_queue,
+        )
+        all_batch_results.extend(batch_results)
+
+    overall_time = time.time() - overall_start_time
+
+    # Display summary
+    _display_summary(all_batch_results, overall_time, quiet)
+
+
+if __name__ == "__main__":
+    app()
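Because `convert` is the only registered command, typer should expose it directly at the top level of `app`. One quick way to exercise the CLI without installing an entry point is typer's test runner; the commented invocation uses a purely illustrative `job.yaml` path:

```python
from typer.testing import CliRunner

runner = CliRunner()

# Show the generated help, including the --batch-size/-b and -n options above.
result = runner.invoke(app, ["--help"])
print(result.output)

# Hypothetical run against a local job description:
# runner.invoke(app, ["job.yaml", "--batch-size", "20", "-n", "2", "-v"])
```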
@@ -13,7 +13,7 @@ from googleapiclient.http import MediaFileUpload, MediaIoBaseDownload, MediaIoBa
 from docling_jobkit.datamodel.google_drive_coords import GoogleDriveCoordinates
 
 
-class FileInfoType(TypedDict):
+class GoogleDriveFileIdentifier(TypedDict):
     id: str
     name: str
     mimeType: str
@@ -120,7 +120,7 @@ def _yield_children(service: Resource, folder_id: str):
 def _yield_files_infos(
     service: Resource,
     coords: GoogleDriveCoordinates,
-) -> Iterable[FileInfoType]:
+) -> Iterable[GoogleDriveFileIdentifier]:
     """
     Depth-first traversal of Google Drive.
     Yields dicts: {id, name, mimeType, path}
@@ -136,7 +136,7 @@ def _yield_files_infos(
         .execute()
     )
 
-    info: FileInfoType
+    info: GoogleDriveFileIdentifier
     if not (root_meta.get("mimeType") == "application/vnd.google-apps.folder"):
         info = {
             "id": root_meta["id"],
@@ -167,13 +167,13 @@ def _yield_files_infos(
 def get_source_files_infos(
     service: Resource,
     coords: GoogleDriveCoordinates,
-) -> List[FileInfoType]:
+) -> List[GoogleDriveFileIdentifier]:
     return list(_yield_files_infos(service, coords))
 
 
 def download_file(
     service: Resource,
-    file_info: FileInfoType,
+    file_info: GoogleDriveFileIdentifier,
     file_stream: BytesIO,
 ) -> None:
     """
@@ -3,11 +3,12 @@ from typing import Iterator
 
 from docling.datamodel.base_models import DocumentStream
 
+from docling_jobkit.connectors.google_drive_helper import GoogleDriveFileIdentifier
 from docling_jobkit.connectors.source_processor import BaseSourceProcessor
 from docling_jobkit.datamodel.google_drive_coords import GoogleDriveCoordinates
 
 
-class GoogleDriveSourceProcessor(BaseSourceProcessor):
+class GoogleDriveSourceProcessor(BaseSourceProcessor[GoogleDriveFileIdentifier]):
     def __init__(self, coords: GoogleDriveCoordinates):
         super().__init__()
         self._coords = coords
@@ -45,3 +46,31 @@ class GoogleDriveSourceProcessor(BaseSourceProcessor):
                 name=file_info["name"],
                 stream=buffer,
             )
+
+    def _list_document_ids(self) -> Iterator[GoogleDriveFileIdentifier]:
+        from docling_jobkit.connectors.google_drive_helper import get_source_files_infos
+
+        for info in get_source_files_infos(self._service, self._coords):
+            yield GoogleDriveFileIdentifier(
+                id=info["id"],
+                name=info["name"],
+                mimeType=info["mimeType"],
+                path=info["path"],
+            )
+
+    def _fetch_document_by_id(self, info: GoogleDriveFileIdentifier) -> DocumentStream:
+        from docling_jobkit.connectors.google_drive_helper import download_file
+
+        buffer = BytesIO()
+
+        download_file(
+            service=self._service,
+            file_info=info,
+            file_stream=buffer,
+        )
+        buffer.seek(0)
+
+        return DocumentStream(
+            name=info["name"],
+            stream=buffer,
+        )
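The `BaseSourceProcessor[GoogleDriveFileIdentifier]` parameterization, together with the new `_list_document_ids` and `_fetch_document_by_id` hooks, suggests the base class is now generic over a lightweight per-source identifier that chunks can carry across process boundaries. A sketch of that presumed contract; apart from the method names visible in this diff, everything here is an assumption:

```python
from abc import ABC, abstractmethod
from typing import Generic, Iterator, TypeVar

from docling_core.types.io import DocumentStream

IdT = TypeVar("IdT")


class SketchSourceProcessor(ABC, Generic[IdT]):
    """Illustrative stand-in for the presumed BaseSourceProcessor contract."""

    @abstractmethod
    def _list_document_ids(self) -> Iterator[IdT]:
        """Enumerate cheap identifiers without downloading any content."""

    @abstractmethod
    def _fetch_document_by_id(self, identifier: IdT) -> DocumentStream:
        """Materialize a single document from its identifier."""

    # iterate_document_chunks(batch_size) presumably groups these identifiers
    # into chunks exposing .index, .ids and .iter_documents(), which is what
    # the multiprocessing CLI above relies on.
```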
@@ -1,12 +1,17 @@
-from typing import Iterator
+from typing import Iterator, TypedDict
 
-from docling.datamodel.base_models import DocumentStream
+from docling_core.types.io import DocumentStream
 
 from docling_jobkit.connectors.source_processor import BaseSourceProcessor
 from docling_jobkit.datamodel.http_inputs import FileSource, HttpSource
 
 
-class HttpSourceProcessor(BaseSourceProcessor):
+class HttpFileIdentifier(TypedDict):
+    source: HttpSource | FileSource
+    index: int
+
+
+class HttpSourceProcessor(BaseSourceProcessor[HttpFileIdentifier]):
     def __init__(self, source: HttpSource | FileSource):
         super().__init__()
         self._source = source
@@ -17,6 +22,21 @@ class HttpSourceProcessor(BaseSourceProcessor):
     def _finalize(self):
         pass
 
+    def _list_document_ids(self) -> Iterator[HttpFileIdentifier]:
+        """Yield a single identifier for the HTTP/File source."""
+        yield HttpFileIdentifier(source=self._source, index=0)
+
+    def _fetch_document_by_id(self, identifier: HttpFileIdentifier) -> DocumentStream:
+        """Fetch document from the identifier."""
+        source = identifier["source"]
+        if isinstance(source, FileSource):
+            return source.to_document_stream()
+        elif isinstance(source, HttpSource):
+            # TODO: fetch, e.g. using the helpers in docling-core
+            raise NotImplementedError("HttpSource fetching is not yet implemented")
+        else:
+            raise ValueError(f"Unsupported source type: {type(source)}")
+
     def _fetch_documents(self) -> Iterator[DocumentStream]:
         if isinstance(self._source, FileSource):
             yield self._source.to_document_stream()
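The `HttpSource` branch above is deliberately left as a TODO. Purely as an illustration of one possible approach (not the docling-core helper the TODO refers to, and assuming the source carries a URL and optional headers), a fetch could look roughly like this:

```python
from io import BytesIO

import httpx  # third-party HTTP client, used only for this sketch

from docling_core.types.io import DocumentStream


def fetch_http_document(url: str, headers: dict | None = None) -> DocumentStream:
    # Download the payload and wrap it in a DocumentStream, deriving a name
    # from the last path segment (the fallback name is arbitrary).
    resp = httpx.get(url, headers=headers or {}, follow_redirects=True)
    resp.raise_for_status()
    name = url.rstrip("/").rsplit("/", 1)[-1] or "document.bin"
    return DocumentStream(name=name, stream=BytesIO(resp.content))
```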