nv-ingest 2025.8.13.dev20250813__py3-none-any.whl → 2025.8.15.dev20250815__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (56)
  1. nv_ingest/framework/orchestration/execution/__init__.py +3 -0
  2. nv_ingest/framework/orchestration/execution/helpers.py +85 -0
  3. nv_ingest/framework/orchestration/execution/options.py +112 -0
  4. nv_ingest/framework/orchestration/process/__init__.py +3 -0
  5. nv_ingest/framework/orchestration/process/dependent_services.py +55 -0
  6. nv_ingest/framework/orchestration/process/execution.py +497 -0
  7. nv_ingest/framework/orchestration/process/lifecycle.py +122 -0
  8. nv_ingest/framework/orchestration/process/strategies.py +182 -0
  9. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +1 -1
  10. nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
  11. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +23 -23
  12. nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
  13. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +8 -4
  14. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +16 -16
  15. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +9 -5
  16. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +8 -4
  17. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +10 -6
  18. nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
  19. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +18 -17
  20. nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
  21. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +14 -13
  22. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +15 -13
  23. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
  24. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
  25. nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +22 -13
  26. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +10 -7
  27. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +10 -8
  28. nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
  29. nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
  30. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +71 -61
  31. nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +7 -5
  32. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +8 -4
  33. nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
  34. nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
  35. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +7 -5
  36. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +13 -14
  37. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +18 -12
  38. nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
  39. nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
  40. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +33 -326
  41. nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
  42. nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
  43. nv_ingest/pipeline/__init__.py +3 -0
  44. nv_ingest/pipeline/config/__init__.py +3 -0
  45. nv_ingest/pipeline/config/loaders.py +198 -0
  46. nv_ingest/pipeline/config/replica_resolver.py +227 -0
  47. nv_ingest/pipeline/default_pipeline_impl.py +517 -0
  48. nv_ingest/pipeline/ingest_pipeline.py +389 -0
  49. nv_ingest/pipeline/pipeline_schema.py +398 -0
  50. {nv_ingest-2025.8.13.dev20250813.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/METADATA +1 -1
  51. {nv_ingest-2025.8.13.dev20250813.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/RECORD +54 -40
  52. nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
  53. nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
  54. {nv_ingest-2025.8.13.dev20250813.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/WHEEL +0 -0
  55. {nv_ingest-2025.8.13.dev20250813.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/licenses/LICENSE +0 -0
  56. {nv_ingest-2025.8.13.dev20250813.dist-info → nv_ingest-2025.8.15.dev20250815.dist-info}/top_level.txt +0 -0
nv_ingest/pipeline/default_pipeline_impl.py
@@ -0,0 +1,517 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # noqa
6
+ # flake8: noqa
7
+ # pylint: disable=line-too-long
8
+
9
+ """
10
+ Default pipeline implementation for libmode.
11
+
12
+ This module contains the default libmode pipeline configuration as a string,
13
+ allowing the pipeline to be loaded without requiring external YAML files.
14
+ """
15
+
16
+ DEFAULT_LIBMODE_PIPELINE_YAML = """# Default Ingestion Pipeline Configuration for Library Mode
17
+ # This configuration replicates the static pipeline formerly defined in pipeline_builders.py
18
+
19
+ name: "NVIngest default libmode pipeline"
20
+ description: "This is the default ingestion pipeline for NVIngest in library mode"
21
+ stages:
22
+ # Source
23
+ - name: "source_stage"
24
+ type: "source"
25
+ phase: 0 # PRE_PROCESSING
26
+ actor: "nv_ingest.framework.orchestration.ray.stages.sources.message_broker_task_source:MessageBrokerTaskSourceStage"
27
+ config:
28
+ broker_client:
29
+ client_type: "simple"
30
+ host: $MESSAGE_CLIENT_HOST|"0.0.0.0"
31
+ port: $MESSAGE_CLIENT_PORT|7671
32
+ task_queue: "ingest_task_queue"
33
+ poll_interval: 0.1
34
+ replicas:
35
+ min_replicas: 0
36
+ max_replicas:
37
+ strategy: "static"
38
+ value: 1
39
+ static_replicas:
40
+ strategy: "static"
41
+ value: 1
42
+ runs_after: []
43
+
44
+ # Pre-processing
45
+ - name: "metadata_injector"
46
+ type: "stage"
47
+ phase: 0 # PRE_PROCESSING
48
+ actor: "nv_ingest.framework.orchestration.ray.stages.injectors.metadata_injector:MetadataInjectionStage"
49
+ config: {}
50
+ replicas:
51
+ min_replicas: 0
52
+ max_replicas:
53
+ strategy: "static"
54
+ value: 1
55
+ static_replicas:
56
+ strategy: "static"
57
+ value: 1
58
+ runs_after:
59
+ - "source_stage"
60
+
61
+ # Primitive Extraction
62
+ - name: "pdf_extractor"
63
+ type: "stage"
64
+ phase: 1 # EXTRACTION
65
+ actor: "nv_ingest.framework.orchestration.ray.stages.extractors.pdf_extractor:PDFExtractorStage"
66
+ config:
67
+ pdfium_config:
68
+ auth_token: $NGC_API_KEY|""
69
+ yolox_endpoints: [
70
+ $YOLOX_GRPC_ENDPOINT|"",
71
+ $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2"
72
+ ]
73
+ yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
74
+ nemoretriever_parse_config:
75
+ auth_token: $NGC_API_KEY|""
76
+ nemoretriever_parse_endpoints: [
77
+ $NEMORETRIEVER_PARSE_GRPC_ENDPOINT|"",
78
+ $NEMORETRIEVER_PARSE_HTTP_ENDPOINT|"https://integrate.api.nvidia.com/v1/chat/completions"
79
+ ]
80
+ nemoretriever_parse_infer_protocol: $NEMORETRIEVER_PARSE_INFER_PROTOCOL|http
81
+ nemoretriever_parse_model_name: $NEMORETRIEVER_PARSE_MODEL_NAME|"nvidia/nemoretriever-parse"
82
+ yolox_endpoints: [
83
+ $YOLOX_GRPC_ENDPOINT|"",
84
+ $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2"
85
+ ]
86
+ yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
87
+ replicas:
88
+ min_replicas: 0
89
+ max_replicas:
90
+ strategy: "memory_thresholding"
91
+ memory_per_replica_mb: 10000 # Heuristic max consumption
92
+ static_replicas:
93
+ strategy: "memory_static_global_percent"
94
+ memory_per_replica_mb: 10000
95
+ limit: 16
96
+
97
+ - name: "audio_extractor"
98
+ type: "stage"
99
+ phase: 1 # EXTRACTION
100
+ actor: "nv_ingest.framework.orchestration.ray.stages.extractors.audio_extractor:AudioExtractorStage"
101
+ config:
102
+ audio_extraction_config:
103
+ audio_endpoints: [
104
+ $AUDIO_GRPC_ENDPOINT|"grpc.nvcf.nvidia.com:443",
105
+ $AUDIO_HTTP_ENDPOINT|""
106
+ ]
107
+ function_id: $AUDIO_FUNCTION_ID|"1598d209-5e27-4d3c-8079-4751568b1081"
108
+ audio_infer_protocol: $AUDIO_INFER_PROTOCOL|grpc
109
+ auth_token: $NGC_API_KEY|""
110
+ replicas:
111
+ min_replicas: 0
112
+ max_replicas:
113
+ strategy: "static"
114
+ value: 2
115
+ static_replicas:
116
+ strategy: "static"
117
+ value: 2
118
+
119
+ - name: "docx_extractor"
120
+ type: "stage"
121
+ phase: 1 # EXTRACTION
122
+ actor: "nv_ingest.framework.orchestration.ray.stages.extractors.docx_extractor:DocxExtractorStage"
123
+ config:
124
+ docx_extraction_config:
125
+ yolox_endpoints: [
126
+ $YOLOX_GRPC_ENDPOINT|"",
127
+ $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2"
128
+ ]
129
+ yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
130
+ auth_token: $NGC_API_KEY|""
131
+ replicas:
132
+ min_replicas: 0
133
+ max_replicas:
134
+ strategy: "static"
135
+ value: 2
136
+ static_replicas:
137
+ strategy: "static"
138
+ value: 1
139
+
140
+ - name: "pptx_extractor"
141
+ type: "stage"
142
+ phase: 1 # EXTRACTION
143
+ actor: "nv_ingest.framework.orchestration.ray.stages.extractors.pptx_extractor:PPTXExtractorStage"
144
+ config:
145
+ pptx_extraction_config:
146
+ yolox_endpoints: [
147
+ $YOLOX_GRPC_ENDPOINT|"",
148
+ $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2"
149
+ ]
150
+ yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
151
+ auth_token: $NGC_API_KEY|""
152
+ replicas:
153
+ min_replicas: 0
154
+ max_replicas:
155
+ strategy: "static"
156
+ value: 2
157
+ static_replicas:
158
+ strategy: "static"
159
+ value: 1
160
+
161
+ - name: "image_extractor"
162
+ type: "stage"
163
+ phase: 1 # EXTRACTION
164
+ actor: "nv_ingest.framework.orchestration.ray.stages.extractors.image_extractor:ImageExtractorStage"
165
+ config:
166
+ image_extraction_config:
167
+ yolox_endpoints: [
168
+ $YOLOX_GRPC_ENDPOINT|"",
169
+ $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2"
170
+ ]
171
+ yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
172
+ auth_token: $NGC_API_KEY|""
173
+ replicas:
174
+ min_replicas: 0
175
+ max_replicas:
176
+ strategy: "static"
177
+ value: 2
178
+ static_replicas:
179
+ strategy: "static"
180
+ value: 1
181
+
182
+ - name: "html_extractor"
183
+ type: "stage"
184
+ phase: 1 # EXTRACTION
185
+ actor: "nv_ingest.framework.orchestration.ray.stages.extractors.html_extractor:HtmlExtractorStage"
186
+ config: {}
187
+ replicas:
188
+ min_replicas: 0
189
+ max_replicas:
190
+ strategy: "static"
191
+ value: 2
192
+ static_replicas:
193
+ strategy: "static"
194
+ value: 1
195
+
196
+ - name: "infographic_extractor"
197
+ type: "stage"
198
+ phase: 1 # EXTRACTION
199
+ actor: "nv_ingest.framework.orchestration.ray.stages.extractors.infographic_extractor:InfographicExtractorStage"
200
+ config:
201
+ endpoint_config:
202
+ ocr_endpoints: [
203
+ $OCR_GRPC_ENDPOINT|"grpc.nvcf.nvidia.com:443",
204
+ $OCR_HTTP_ENDPOINT|""
205
+ ]
206
+ ocr_infer_protocol: $OCR_INFER_PROTOCOL|grpc
207
+ auth_token: $NGC_API_KEY|""
208
+ replicas:
209
+ min_replicas: 0
210
+ max_replicas:
211
+ strategy: "static"
212
+ value: 2
213
+ static_replicas:
214
+ strategy: "static"
215
+ value: 1
216
+
217
+ - name: "table_extractor"
218
+ type: "stage"
219
+ phase: 1 # EXTRACTION
220
+ actor: "nv_ingest.framework.orchestration.ray.stages.extractors.table_extractor:TableExtractorStage"
221
+ config:
222
+ endpoint_config:
223
+ yolox_endpoints: [
224
+ $YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT|"",
225
+ $YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1"
226
+ ]
227
+ yolox_infer_protocol: $YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL|"http"
228
+ ocr_endpoints: [
229
+ $OCR_GRPC_ENDPOINT|"",
230
+ $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/baidu/paddleocr"
231
+ ]
232
+ ocr_infer_protocol: $PADDLE_INFER_PROTOCOL|"http"
233
+ auth_token: $NGC_API_KEY|""
234
+ replicas:
235
+ min_replicas: 0
236
+ max_replicas:
237
+ strategy: "memory_thresholding"
238
+ memory_per_replica_mb: 10000
239
+ static_replicas:
240
+ strategy: "memory_static_global_percent"
241
+ memory_per_replica_mb: 10000
242
+ limit: 6
243
+
244
+ - name: "chart_extractor"
245
+ type: "stage"
246
+ phase: 1 # EXTRACTION
247
+ actor: "nv_ingest.framework.orchestration.ray.stages.extractors.chart_extractor:ChartExtractorStage"
248
+ config:
249
+ endpoint_config:
250
+ yolox_endpoints: [
251
+ $YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT|"",
252
+ $YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1"
253
+ ]
254
+ yolox_infer_protocol: $YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL|"http"
255
+ ocr_endpoints: [
256
+ $OCR_GRPC_ENDPOINT|"",
257
+ $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/baidu/paddleocr"
258
+ ]
259
+ ocr_infer_protocol: $OCR_INFER_PROTOCOL|"http"
260
+ auth_token: $NGC_API_KEY|""
261
+ replicas:
262
+ min_replicas: 0
263
+ max_replicas:
264
+ strategy: "memory_thresholding"
265
+ memory_per_replica_mb: 10000
266
+ static_replicas:
267
+ strategy: "memory_static_global_percent"
268
+ memory_per_replica_mb: 10000
269
+ limit: 6
270
+
271
+ # Post-processing / Mutators
272
+ - name: "image_filter"
273
+ type: "stage"
274
+ phase: 3 # MUTATION
275
+ actor: "nv_ingest.framework.orchestration.ray.stages.mutate.image_filter:ImageFilterStage"
276
+ replicas:
277
+ min_replicas: 0
278
+ max_replicas:
279
+ strategy: "static"
280
+ value: 1
281
+ static_replicas:
282
+ strategy: "static"
283
+ value: 1
284
+
285
+ - name: "image_dedup"
286
+ type: "stage"
287
+ phase: 3 # MUTATION
288
+ actor: "nv_ingest.framework.orchestration.ray.stages.mutate.image_dedup:ImageDedupStage"
289
+ replicas:
290
+ min_replicas: 0
291
+ max_replicas:
292
+ strategy: "static"
293
+ value: 1
294
+ static_replicas:
295
+ strategy: "static"
296
+ value: 1
297
+
298
+ - name: "text_splitter"
299
+ type: "stage"
300
+ phase: 4 # TRANSFORM
301
+ actor: "nv_ingest.framework.orchestration.ray.stages.transforms.text_splitter:TextSplitterStage"
302
+ config:
303
+ chunk_size: 512
304
+ chunk_overlap: 20
305
+ replicas:
306
+ min_replicas: 0
307
+ max_replicas:
308
+ strategy: "static"
309
+ value: 3
310
+ static_replicas:
311
+ strategy: "static"
312
+ value: 1
313
+
314
+ # Transforms and Synthesis
315
+ - name: "image_caption"
316
+ type: "stage"
317
+ phase: 4 # TRANSFORM
318
+ actor: "nv_ingest.framework.orchestration.ray.stages.transforms.image_caption:ImageCaptionTransformStage"
319
+ config:
320
+ api_key: $NGC_API_KEY|""
321
+ endpoint_url: $VLM_CAPTION_ENDPOINT|"https://ai.api.nvidia.com/v1/gr/nvidia/llama-3.1-nemotron-nano-vl-8b-v1/chat/completions"
322
+ model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
323
+ prompt: "Caption the content of this image:"
324
+ replicas:
325
+ min_replicas: 0
326
+ max_replicas:
327
+ strategy: "static"
328
+ value: 1
329
+ static_replicas:
330
+ strategy: "static"
331
+ value: 1
332
+
333
+ - name: "text_embedder"
334
+ type: "stage"
335
+ phase: 4 # TRANSFORM
336
+ actor: "nv_ingest.framework.orchestration.ray.stages.transforms.text_embed:TextEmbeddingTransformStage"
337
+ config:
338
+ api_key: $NGC_API_KEY|""
339
+ embedding_model: $EMBEDDING_NIM_MODEL_NAME|"nvidia/llama-3.2-nv-embedqa-1b-v2"
340
+ embedding_nim_endpoint: $EMBEDDING_NIM_ENDPOINT|"https://integrate.api.nvidia.com/v1"
341
+ replicas:
342
+ min_replicas: 0
343
+ max_replicas:
344
+ strategy: "static"
345
+ value: 2
346
+ static_replicas:
347
+ strategy: "static"
348
+ value: 1
349
+
350
+ # Storage and Output
351
+ - name: "image_storage"
352
+ type: "stage"
353
+ phase: 5 # RESPONSE
354
+ actor: "nv_ingest.framework.orchestration.ray.stages.storage.image_storage:ImageStorageStage"
355
+ replicas:
356
+ min_replicas: 0
357
+ max_replicas:
358
+ strategy: "static"
359
+ value: 1
360
+ static_replicas:
361
+ strategy: "static"
362
+ value: 1
363
+
364
+ - name: "embedding_storage"
365
+ type: "stage"
366
+ phase: 5 # RESPONSE
367
+ actor: "nv_ingest.framework.orchestration.ray.stages.storage.store_embeddings:EmbeddingStorageStage"
368
+ replicas:
369
+ min_replicas: 0
370
+ max_replicas:
371
+ strategy: "static"
372
+ value: 1
373
+ static_replicas:
374
+ strategy: "static"
375
+ value: 1
376
+
377
+ - name: "broker_response"
378
+ type: "stage"
379
+ phase: 5 # RESPONSE
380
+ actor: "nv_ingest.framework.orchestration.ray.stages.sinks.message_broker_task_sink:MessageBrokerTaskSinkStage"
381
+ config:
382
+ broker_client:
383
+ client_type: "simple"
384
+ host: "localhost"
385
+ port: 7671
386
+ replicas:
387
+ min_replicas: 1
388
+ max_replicas:
389
+ strategy: "static"
390
+ value: 2
391
+ static_replicas:
392
+ strategy: "static"
393
+ value: 1
394
+
395
+ # Telemetry and Drain
396
+ - name: "otel_tracer"
397
+ type: "stage"
398
+ phase: 6 # TELEMETRY
399
+ actor: "nv_ingest.framework.orchestration.ray.stages.telemetry.otel_tracer:OpenTelemetryTracerStage"
400
+ config:
401
+ otel_endpoint: $OTEL_EXPORTER_OTLP_ENDPOINT|"http://localhost:4317"
402
+ replicas:
403
+ min_replicas: 0
404
+ max_replicas:
405
+ strategy: "static"
406
+ value: 1
407
+ static_replicas:
408
+ strategy: "static"
409
+ value: 1
410
+ runs_after:
411
+ - "broker_response"
412
+
413
+ - name: "default_drain"
414
+ type: "sink"
415
+ phase: 7 # DRAIN
416
+ actor: "nv_ingest.framework.orchestration.ray.stages.sinks.default_drain:DefaultDrainSink"
417
+ config: {}
418
+ replicas:
419
+ min_replicas: 0
420
+ max_replicas:
421
+ strategy: "static"
422
+ value: 1
423
+ static_replicas:
424
+ strategy: "static"
425
+ value: 1
426
+
427
+ edges:
428
+ # Intake
429
+ - from: "source_stage"
430
+ to: "metadata_injector"
431
+ queue_size: 32
432
+
433
+ # Document Extractors
434
+ - from: "metadata_injector"
435
+ to: "pdf_extractor"
436
+ queue_size: 32
437
+ - from: "pdf_extractor"
438
+ to: "audio_extractor"
439
+ queue_size: 32
440
+ - from: "audio_extractor"
441
+ to: "docx_extractor"
442
+ queue_size: 32
443
+ - from: "docx_extractor"
444
+ to: "pptx_extractor"
445
+ queue_size: 32
446
+ - from: "pptx_extractor"
447
+ to: "image_extractor"
448
+ queue_size: 32
449
+ - from: "image_extractor"
450
+ to: "html_extractor"
451
+ queue_size: 32
452
+ - from: "html_extractor"
453
+ to: "infographic_extractor"
454
+ queue_size: 32
455
+
456
+ # Primitive Extractors
457
+ - from: "infographic_extractor"
458
+ to: "table_extractor"
459
+ queue_size: 32
460
+ - from: "table_extractor"
461
+ to: "chart_extractor"
462
+ queue_size: 32
463
+ - from: "chart_extractor"
464
+ to: "image_filter"
465
+ queue_size: 32
466
+
467
+ # Primitive Mutators
468
+ - from: "image_filter"
469
+ to: "image_dedup"
470
+ queue_size: 32
471
+ - from: "image_dedup"
472
+ to: "text_splitter"
473
+ queue_size: 32
474
+
475
+ # Primitive Transforms
476
+ - from: "text_splitter"
477
+ to: "image_caption"
478
+ queue_size: 32
479
+ - from: "image_caption"
480
+ to: "text_embedder"
481
+ queue_size: 32
482
+ - from: "text_embedder"
483
+ to: "image_storage"
484
+ queue_size: 32
485
+
486
+ # Primitive Storage
487
+ - from: "image_storage"
488
+ to: "embedding_storage"
489
+ queue_size: 32
490
+ - from: "embedding_storage"
491
+ to: "broker_response"
492
+ queue_size: 32
493
+
494
+ # Response and Telemetry
495
+ - from: "broker_response"
496
+ to: "otel_tracer"
497
+ queue_size: 32
498
+ - from: "otel_tracer"
499
+ to: "default_drain"
500
+ queue_size: 32
501
+
502
+ # Pipeline Runtime Configuration
503
+ # These parameters control dynamic scaling and PID controller behavior
504
+ # All values can be overridden by environment variables from env_config.py
505
+ pipeline:
506
+ disable_dynamic_scaling: $INGEST_DISABLE_DYNAMIC_SCALING|true
507
+ dynamic_memory_threshold: $INGEST_DYNAMIC_MEMORY_THRESHOLD|0.75
508
+ pid_controller:
509
+ kp: $INGEST_DYNAMIC_MEMORY_KP|0.2
510
+ ki: $INGEST_DYNAMIC_MEMORY_KI|0.01
511
+ ema_alpha: $INGEST_DYNAMIC_MEMORY_EMA_ALPHA|0.1
512
+ target_queue_depth: $INGEST_DYNAMIC_MEMORY_TARGET_QUEUE_DEPTH|0
513
+ penalty_factor: $INGEST_DYNAMIC_MEMORY_PENALTY_FACTOR|0.1
514
+ error_boost_factor: $INGEST_DYNAMIC_MEMORY_ERROR_BOOST_FACTOR|1.5
515
+ rcm_memory_safety_buffer_fraction: $INGEST_DYNAMIC_MEMORY_RCM_MEMORY_SAFETY_BUFFER_FRACTION|0.15
516
+ launch_simple_broker: true
517
+ """