gst-python-ml 1.0.3__tar.gz → 1.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/MANIFEST.in +0 -2
  2. {gst_python_ml-1.0.3/plugins/python/gst_python_ml.egg-info → gst_python_ml-1.0.4}/PKG-INFO +146 -15
  3. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/README.md +145 -14
  4. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_objectdetector.py +1 -1
  5. gst_python_ml-1.0.4/plugins/python/clip.py +309 -0
  6. gst_python_ml-1.0.4/plugins/python/depth.py +270 -0
  7. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4/plugins/python/gst_python_ml.egg-info}/PKG-INFO +146 -15
  8. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/gst_python_ml.egg-info/SOURCES.txt +4 -0
  9. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/gst_python_ml.egg-info/top_level.txt +0 -1
  10. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay_helper/overlay_cairo.py +23 -0
  11. gst_python_ml-1.0.4/plugins/python/pose.py +295 -0
  12. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/utils/analytics_utils.py +19 -2
  13. gst_python_ml-1.0.4/plugins/python/vad.py +194 -0
  14. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/pyproject.toml +1 -1
  15. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/COPYING +0 -0
  16. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/__init__.py +0 -0
  17. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_aggregator.py +0 -0
  18. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_caption.py +0 -0
  19. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_classifier.py +0 -0
  20. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_llm.py +0 -0
  21. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_separate.py +0 -0
  22. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_transcribe.py +0 -0
  23. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_transform.py +0 -0
  24. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_translate.py +0 -0
  25. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_tts.py +0 -0
  26. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/caption_phi.py +0 -0
  27. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/caption_qwen.py +0 -0
  28. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/classifier.py +0 -0
  29. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/coquitts.py +0 -0
  30. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/data/soccer/app.py +0 -0
  31. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/data/soccer/botsort_people_reid.yaml +0 -0
  32. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/data/soccer/bytetrack_ball.yaml +0 -0
  33. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/demo_soccer.py +0 -0
  34. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/demucs.py +0 -0
  35. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/__init__.py +0 -0
  36. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/device_queue_pool.py +0 -0
  37. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/engine_factory.py +0 -0
  38. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/engine_manager.py +0 -0
  39. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/litert_engine.py +0 -0
  40. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/ml_engine.py +0 -0
  41. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/onnx_engine.py +0 -0
  42. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/openvino_engine.py +0 -0
  43. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/pytorch_engine.py +0 -0
  44. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/pytorch_vision_engine.py +0 -0
  45. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/tensorflow_engine.py +0 -0
  46. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/gst_python_ml.egg-info/dependency_links.txt +0 -0
  47. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/gst_python_ml.egg-info/requires.txt +0 -0
  48. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/kafkasink.py +0 -0
  49. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/llm.py +0 -0
  50. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/llm_stream_filter.py +0 -0
  51. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/log/__init__.py +0 -0
  52. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/log/global_logger.py +0 -0
  53. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/log/gst_logger.py +0 -0
  54. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/log/logger.py +0 -0
  55. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/log/logger_factory.py +0 -0
  56. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/mariantranslate.py +0 -0
  57. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/maskrcnn.py +0 -0
  58. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/objectdetector.py +0 -0
  59. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay.py +0 -0
  60. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay_counter.py +0 -0
  61. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay_helper/__init__.py +0 -0
  62. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay_helper/overlay_opengl.py +0 -0
  63. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay_helper/overlay_skia.py +0 -0
  64. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay_helper/overlay_utils.py +0 -0
  65. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay_helper/overlay_utils_interface.py +0 -0
  66. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay_helper/overlay_vulkan.py +0 -0
  67. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/sepformer.py +0 -0
  68. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/stablediffusion.py +0 -0
  69. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/streamdemux.py +0 -0
  70. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/streammux.py +0 -0
  71. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/utils/__init__.py +0 -0
  72. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/utils/caption_utils.py +0 -0
  73. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/utils/format_converter.py +0 -0
  74. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/utils/gst_feature_manager.py +0 -0
  75. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/utils/metadata.py +0 -0
  76. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/utils/muxed_buffer_processor.py +0 -0
  77. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/utils/runtime_utils.py +0 -0
  78. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/video_transform.py +0 -0
  79. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/whisperlive.py +0 -0
  80. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/whisperspeechtts.py +0 -0
  81. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/whispertranscribe.py +0 -0
  82. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/yolo.py +0 -0
  83. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/setup.cfg +0 -0
  84. {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/tests/test_pipelines.py +0 -0
@@ -3,5 +3,3 @@ include COPYING
3
3
  recursive-include plugins/python *
4
4
  global-exclude __pycache__/*
5
5
  global-exclude *.pyc
6
- prune plugins/python/birdseye
7
- exclude plugins/python/birds_eye.py
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: gst-python-ml
3
- Version: 1.0.3
3
+ Version: 1.0.4
4
4
  Summary: An ML package for GStreamer
5
5
  Author-email: Aaron Boxer <aaron.boxer@collabora.com>
6
6
  Project-URL: Homepage, https://github.com/collabora/gst-python-ml
@@ -51,9 +51,13 @@ Supported functionality includes:
51
51
 
52
52
  1. object detection
53
53
  1. tracking
54
+ 1. pose estimation (COCO 17-keypoint skeleton)
55
+ 1. monocular depth estimation
56
+ 1. zero-shot classification (CLIP / SigLIP)
54
57
  1. video captioning
55
58
  1. translation
56
59
  1. transcription
60
+ 1. voice activity detection
57
61
  1. speech to text
58
62
  1. text to speech
59
63
  1. text to image
@@ -241,12 +245,6 @@ or
241
245
 
242
246
  Now, in the container shell, set up `uv` `venv` as detailed above.
243
247
 
244
- ## IMPORTANT NOTES
245
-
246
- ### Birdseye
247
-
248
- To use `pyml_birdseye`, additional pip requirements must be installed from the `plugins/python/birdseye` folder.
249
-
250
248
 
251
249
  ## Post Install
252
250
 
@@ -343,6 +341,147 @@ GST_DEBUG=4 gst-launch-1.0 filesrc location=data/soccer_tracking.mp4 ! decodebin
343
341
  ```
344
342
 
345
343
 
344
+ ### Pose Estimation
345
+
346
+ `pyml_yolo_pose` supports all YOLO pose models. Recommended model names:
347
+ ```
348
+ yolo11n-pose (fastest)
349
+ yolo11s-pose
350
+ yolo11m-pose (best accuracy)
351
+ ```
352
+
353
+ #### YOLO pose with skeleton visualization (rendered on frame)
354
+
355
+ ```
356
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
357
+ d. ! queue \
358
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
359
+ ! pyml_yolo_pose model-name=yolo11n-pose device=cuda \
360
+ ! videoconvert ! autovideosink sync=false
361
+ ```
362
+
363
+ #### YOLO pose with bounding box overlay (metadata only, no in-element rendering)
364
+
365
+ ```
366
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
367
+ d. ! queue \
368
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
369
+ ! pyml_yolo_pose model-name=yolo11n-pose device=cuda visualize=false \
370
+ ! videoconvert ! pyml_overlay ! videoconvert ! autovideosink sync=false
371
+ ```
372
+
373
+ ### Depth Estimation
374
+
375
+ `pyml_depth` supports DepthAnything V2 models from HuggingFace. Available model sizes:
376
+ ```
377
+ depth-anything/Depth-Anything-V2-Small-hf (fastest, ~100 MB)
378
+ depth-anything/Depth-Anything-V2-Base-hf
379
+ depth-anything/Depth-Anything-V2-Large-hf (most accurate)
380
+ ```
381
+
382
+ Available colormaps: `inferno` (default), `jet`, `viridis`, `plasma`, `magma`
383
+
384
+ #### DepthAnything V2 with inferno colormap
385
+
386
+ ```
387
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
388
+ d. ! queue \
389
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
390
+ ! pyml_depth model-name=depth-anything/Depth-Anything-V2-Small-hf device=cuda \
391
+ ! videoconvert ! autovideosink sync=false
392
+ ```
393
+
394
+ #### DepthAnything V2 with jet colormap
395
+
396
+ ```
397
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
398
+ d. ! queue \
399
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
400
+ ! pyml_depth model-name=depth-anything/Depth-Anything-V2-Small-hf device=cuda colormap=jet \
401
+ ! videoconvert ! autovideosink sync=false
402
+ ```
403
+
404
+ #### Depth with reduced compute via frame-stride
405
+
406
+ ```
407
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
408
+ d. ! queue \
409
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
410
+ ! pyml_depth model-name=depth-anything/Depth-Anything-V2-Small-hf device=cuda frame-stride=2 \
411
+ ! videoconvert ! autovideosink sync=false
412
+ ```
413
+
414
+ #### Depth with original video side-by-side (tee)
415
+
416
+ ```
417
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
418
+ d. ! queue \
419
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
420
+ ! tee name=t \
421
+ t. ! queue ! pyml_depth model-name=depth-anything/Depth-Anything-V2-Small-hf device=cuda ! videoconvert ! autovideosink sync=false \
422
+ t. ! queue ! videoconvert ! autovideosink sync=false
423
+ ```
424
+
425
+ ### Zero-Shot Classification (CLIP / SigLIP)
426
+
427
+ `pyml_clip` classifies each frame against a user-defined set of text labels —
428
+ there is no fixed label vocabulary; the labels are chosen at pipeline launch time.
429
+
430
+ Supported models:
431
+ ```
432
+ openai/clip-vit-base-patch32 (default, ~600 MB)
433
+ openai/clip-vit-large-patch14 (more accurate, ~1.7 GB)
434
+ google/siglip-base-patch16-224 (SigLIP, better zero-shot accuracy)
435
+ google/siglip-large-patch16-384 (SigLIP large)
436
+ ```
437
+
438
+ #### CLIP with custom labels
439
+
440
+ ```
441
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
442
+ d. ! queue \
443
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
444
+ ! pyml_clip model-name=openai/clip-vit-base-patch32 device=cuda \
445
+ labels="person, bicycle, car, dog, cat" top-k=3 \
446
+ ! videoconvert ! pyml_overlay ! videoconvert ! autovideosink sync=false
447
+ ```
448
+
449
+ #### SigLIP (better zero-shot accuracy than CLIP)
450
+
451
+ ```
452
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
453
+ d. ! queue \
454
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
455
+ ! pyml_clip model-name=google/siglip-base-patch16-224 device=cuda \
456
+ labels="people walking, empty street, crowd, indoor scene" top-k=1 \
457
+ ! videoconvert ! pyml_overlay ! videoconvert ! autovideosink sync=false
458
+ ```
459
+
460
+ #### CLIP with threshold (only report labels above 20% confidence)
461
+
462
+ ```
463
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
464
+ d. ! queue \
465
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
466
+ ! pyml_clip model-name=openai/clip-vit-base-patch32 device=cuda \
467
+ labels="person, bicycle, car, dog, cat" threshold=0.2 \
468
+ ! videoconvert ! pyml_overlay ! videoconvert ! autovideosink sync=false
469
+ ```
470
+
471
+ ### Voice Activity Detection
472
+
473
+ #### Standalone VAD with metadata (pass-through, speech probability attached to buffers)
474
+
475
+ ```
476
+ GST_DEBUG=4 gst-launch-1.0 pulsesrc ! audio/x-raw,format=S16LE,rate=16000,channels=1 ! pyml_vad threshold=0.7 ! fakesink
477
+ ```
478
+
479
+ #### VAD gating before transcription (mute silent audio, reduce Whisper latency)
480
+
481
+ ```
482
+ GST_DEBUG=4 gst-launch-1.0 filesrc location=data/air_traffic_korean_with_english.wav ! decodebin ! audioconvert ! audioresample ! audio/x-raw,format=S16LE,rate=16000,channels=1 ! pyml_vad threshold=0.6 gate=true ! pyml_whispertranscribe device=cuda language=ko ! fakesink
483
+ ```
484
+
346
485
  ### Transcription
347
486
 
348
487
  #### transcription with initial prompt set
@@ -417,14 +556,6 @@ https://huggingface.co/models?sort=trending&search=Helsinki
417
556
  GST_DEBUG=3 gst-launch-1.0 filesrc location=data/soccer_single_camera.mp4 ! decodebin ! videoconvertscale ! video/x-raw,width=640,height=480 ! tee name=t t. ! queue ! textoverlay name=overlay wait-text=false ! videoconvert ! autovideosink t. ! queue leaky=2 max-size-buffers=1 ! videoconvertscale ! video/x-raw,width=240,height=180 ! pyml_caption_qwen device=cuda:0 prompt="In one sentence, describe what you see?" model-name="Qwen/Qwen2.5-VL-3B-Instruct-AWQ" name=cap cap.src ! fakesink async=0 sync=0 cap.text_src ! queue ! coalescehistory history-length=10 ! pyml_llm model-name="Qwen/Qwen3-0.6B" device=cuda system-prompt="You receive the history of what happened in recent times, summarize it nicely with excitement but NEVER mention the specific times. Focus on the most recent events." ! queue ! overlay.text_sink
418
557
  ```
419
558
 
420
-
421
- ### Bird's Eye View
422
-
423
- `GST_DEBUG=4 gst-launch-1.0 filesrc location=data/soccer_single_camera.mp4 ! decodebin ! videoconvert ! pyml_birdseye ! videoconvert ! autovideosink`
424
-
425
- `GST_DEBUG=4 gst-launch-1.0 filesrc location=data/soccer_single_camera.mp4 ! decodebin ! videorate ! video/x-raw,framerate=30/1 ! videoconvert ! pyml_birdseye ! videoconvert ! openh264enc ! h264parse ! matroskamux ! filesink location=output.mkv`
426
-
427
-
428
559
  ### kafkasink
429
560
 
430
561
  #### Setting up kafka network
@@ -6,9 +6,13 @@ Supported functionality includes:
6
6
 
7
7
  1. object detection
8
8
  1. tracking
9
+ 1. pose estimation (COCO 17-keypoint skeleton)
10
+ 1. monocular depth estimation
11
+ 1. zero-shot classification (CLIP / SigLIP)
9
12
  1. video captioning
10
13
  1. translation
11
14
  1. transcription
15
+ 1. voice activity detection
12
16
  1. speech to text
13
17
  1. text to speech
14
18
  1. text to image
@@ -196,12 +200,6 @@ or
196
200
 
197
201
  Now, in the container shell, set up `uv` `venv` as detailed above.
198
202
 
199
- ## IMPORTANT NOTES
200
-
201
- ### Birdseye
202
-
203
- To use `pyml_birdseye`, additional pip requirements must be installed from the `plugins/python/birdseye` folder.
204
-
205
203
 
206
204
  ## Post Install
207
205
 
@@ -298,6 +296,147 @@ GST_DEBUG=4 gst-launch-1.0 filesrc location=data/soccer_tracking.mp4 ! decodebin
298
296
  ```
299
297
 
300
298
 
299
+ ### Pose Estimation
300
+
301
+ `pyml_yolo_pose` supports all YOLO pose models. Recommended model names:
302
+ ```
303
+ yolo11n-pose (fastest)
304
+ yolo11s-pose
305
+ yolo11m-pose (best accuracy)
306
+ ```
307
+
308
+ #### YOLO pose with skeleton visualization (rendered on frame)
309
+
310
+ ```
311
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
312
+ d. ! queue \
313
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
314
+ ! pyml_yolo_pose model-name=yolo11n-pose device=cuda \
315
+ ! videoconvert ! autovideosink sync=false
316
+ ```
317
+
318
+ #### YOLO pose with bounding box overlay (metadata only, no in-element rendering)
319
+
320
+ ```
321
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
322
+ d. ! queue \
323
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
324
+ ! pyml_yolo_pose model-name=yolo11n-pose device=cuda visualize=false \
325
+ ! videoconvert ! pyml_overlay ! videoconvert ! autovideosink sync=false
326
+ ```
327
+
328
+ ### Depth Estimation
329
+
330
+ `pyml_depth` supports DepthAnything V2 models from HuggingFace. Available model sizes:
331
+ ```
332
+ depth-anything/Depth-Anything-V2-Small-hf (fastest, ~100 MB)
333
+ depth-anything/Depth-Anything-V2-Base-hf
334
+ depth-anything/Depth-Anything-V2-Large-hf (most accurate)
335
+ ```
336
+
337
+ Available colormaps: `inferno` (default), `jet`, `viridis`, `plasma`, `magma`
338
+
339
+ #### DepthAnything V2 with inferno colormap
340
+
341
+ ```
342
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
343
+ d. ! queue \
344
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
345
+ ! pyml_depth model-name=depth-anything/Depth-Anything-V2-Small-hf device=cuda \
346
+ ! videoconvert ! autovideosink sync=false
347
+ ```
348
+
349
+ #### DepthAnything V2 with jet colormap
350
+
351
+ ```
352
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
353
+ d. ! queue \
354
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
355
+ ! pyml_depth model-name=depth-anything/Depth-Anything-V2-Small-hf device=cuda colormap=jet \
356
+ ! videoconvert ! autovideosink sync=false
357
+ ```
358
+
359
+ #### Depth with reduced compute via frame-stride
360
+
361
+ ```
362
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
363
+ d. ! queue \
364
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
365
+ ! pyml_depth model-name=depth-anything/Depth-Anything-V2-Small-hf device=cuda frame-stride=2 \
366
+ ! videoconvert ! autovideosink sync=false
367
+ ```
368
+
369
+ #### Depth with original video side-by-side (tee)
370
+
371
+ ```
372
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
373
+ d. ! queue \
374
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
375
+ ! tee name=t \
376
+ t. ! queue ! pyml_depth model-name=depth-anything/Depth-Anything-V2-Small-hf device=cuda ! videoconvert ! autovideosink sync=false \
377
+ t. ! queue ! videoconvert ! autovideosink sync=false
378
+ ```
379
+
380
+ ### Zero-Shot Classification (CLIP / SigLIP)
381
+
382
+ `pyml_clip` classifies each frame against a user-defined set of text labels
383
+ with no fixed label set — labels are set at pipeline launch time.
384
+
385
+ Supported models:
386
+ ```
387
+ openai/clip-vit-base-patch32 (default, ~600 MB)
388
+ openai/clip-vit-large-patch14 (more accurate, ~1.7 GB)
389
+ google/siglip-base-patch16-224 (SigLIP, better zero-shot accuracy)
390
+ google/siglip-large-patch16-384 (SigLIP large)
391
+ ```
392
+
393
+ #### CLIP with custom labels
394
+
395
+ ```
396
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
397
+ d. ! queue \
398
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
399
+ ! pyml_clip model-name=openai/clip-vit-base-patch32 device=cuda \
400
+ labels="person, bicycle, car, dog, cat" top-k=3 \
401
+ ! videoconvert ! pyml_overlay ! videoconvert ! autovideosink sync=false
402
+ ```
403
+
404
+ #### SigLIP (better zero-shot accuracy than CLIP)
405
+
406
+ ```
407
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
408
+ d. ! queue \
409
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
410
+ ! pyml_clip model-name=google/siglip-base-patch16-224 device=cuda \
411
+ labels="people walking, empty street, crowd, indoor scene" top-k=1 \
412
+ ! videoconvert ! pyml_overlay ! videoconvert ! autovideosink sync=false
413
+ ```
414
+
415
+ #### CLIP with threshold (only report labels above 20% confidence)
416
+
417
+ ```
418
+ gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
419
+ d. ! queue \
420
+ ! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
421
+ ! pyml_clip model-name=openai/clip-vit-base-patch32 device=cuda \
422
+ labels="person, bicycle, car, dog, cat" threshold=0.2 \
423
+ ! videoconvert ! pyml_overlay ! videoconvert ! autovideosink sync=false
424
+ ```
425
+
426
+ ### Voice Activity Detection
427
+
428
+ #### Standalone VAD with metadata (pass-through, speech probability attached to buffers)
429
+
430
+ ```
431
+ GST_DEBUG=4 gst-launch-1.0 pulsesrc ! audio/x-raw,format=S16LE,rate=16000,channels=1 ! pyml_vad threshold=0.7 ! fakesink
432
+ ```
433
+
434
+ #### VAD gating before transcription (mute silent audio, reduce Whisper latency)
435
+
436
+ ```
437
+ GST_DEBUG=4 gst-launch-1.0 filesrc location=data/air_traffic_korean_with_english.wav ! decodebin ! audioconvert ! audioresample ! audio/x-raw,format=S16LE,rate=16000,channels=1 ! pyml_vad threshold=0.6 gate=true ! pyml_whispertranscribe device=cuda language=ko ! fakesink
438
+ ```
439
+
301
440
  ### Transcription
302
441
 
303
442
  #### transcription with initial prompt set
@@ -372,14 +511,6 @@ https://huggingface.co/models?sort=trending&search=Helsinki
372
511
  GST_DEBUG=3 gst-launch-1.0 filesrc location=data/soccer_single_camera.mp4 ! decodebin ! videoconvertscale ! video/x-raw,width=640,height=480 ! tee name=t t. ! queue ! textoverlay name=overlay wait-text=false ! videoconvert ! autovideosink t. ! queue leaky=2 max-size-buffers=1 ! videoconvertscale ! video/x-raw,width=240,height=180 ! pyml_caption_qwen device=cuda:0 prompt="In one sentence, describe what you see?" model-name="Qwen/Qwen2.5-VL-3B-Instruct-AWQ" name=cap cap.src ! fakesink async=0 sync=0 cap.text_src ! queue ! coalescehistory history-length=10 ! pyml_llm model-name="Qwen/Qwen3-0.6B" device=cuda system-prompt="You receive the history of what happened in recent times, summarize it nicely with excitement but NEVER mention the specific times. Focus on the most recent events." ! queue ! overlay.text_sink
373
512
  ```
374
513
 
375
-
376
- ### Bird's Eye View
377
-
378
- `GST_DEBUG=4 gst-launch-1.0 filesrc location=data/soccer_single_camera.mp4 ! decodebin ! videoconvert ! pyml_birdseye ! videoconvert ! autovideosink`
379
-
380
- `GST_DEBUG=4 gst-launch-1.0 filesrc location=data/soccer_single_camera.mp4 ! decodebin ! videorate ! video/x-raw,framerate=30/1 ! videoconvert ! pyml_birdseye ! videoconvert ! openh264enc ! h264parse ! matroskamux ! filesink location=output.mkv`
381
-
382
-
383
514
  ### kafkasink
384
515
 
385
516
  #### Setting up kafka network
@@ -124,7 +124,7 @@ class BaseObjectDetector(VideoTransform):
124
124
  count = GstAnalytics.relation_get_length(attached_meta)
125
125
  self.logger.info(f"Total metadata relations attached: {count}")
126
126
  else:
127
- self.logger.error("No metadata attached to buffer")
127
+ self.logger.debug("No detections on this buffer")
128
128
 
129
129
  return Gst.FlowReturn.OK
130
130