gst-python-ml 1.0.3__tar.gz → 1.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/MANIFEST.in +0 -2
- {gst_python_ml-1.0.3/plugins/python/gst_python_ml.egg-info → gst_python_ml-1.0.4}/PKG-INFO +146 -15
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/README.md +145 -14
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_objectdetector.py +1 -1
- gst_python_ml-1.0.4/plugins/python/clip.py +309 -0
- gst_python_ml-1.0.4/plugins/python/depth.py +270 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4/plugins/python/gst_python_ml.egg-info}/PKG-INFO +146 -15
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/gst_python_ml.egg-info/SOURCES.txt +4 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/gst_python_ml.egg-info/top_level.txt +0 -1
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay_helper/overlay_cairo.py +23 -0
- gst_python_ml-1.0.4/plugins/python/pose.py +295 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/utils/analytics_utils.py +19 -2
- gst_python_ml-1.0.4/plugins/python/vad.py +194 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/pyproject.toml +1 -1
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/COPYING +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/__init__.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_aggregator.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_caption.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_classifier.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_llm.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_separate.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_transcribe.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_transform.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_translate.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/base_tts.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/caption_phi.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/caption_qwen.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/classifier.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/coquitts.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/data/soccer/app.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/data/soccer/botsort_people_reid.yaml +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/data/soccer/bytetrack_ball.yaml +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/demo_soccer.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/demucs.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/__init__.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/device_queue_pool.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/engine_factory.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/engine_manager.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/litert_engine.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/ml_engine.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/onnx_engine.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/openvino_engine.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/pytorch_engine.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/pytorch_vision_engine.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/engine/tensorflow_engine.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/gst_python_ml.egg-info/dependency_links.txt +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/gst_python_ml.egg-info/requires.txt +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/kafkasink.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/llm.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/llm_stream_filter.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/log/__init__.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/log/global_logger.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/log/gst_logger.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/log/logger.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/log/logger_factory.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/mariantranslate.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/maskrcnn.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/objectdetector.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay_counter.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay_helper/__init__.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay_helper/overlay_opengl.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay_helper/overlay_skia.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay_helper/overlay_utils.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay_helper/overlay_utils_interface.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/overlay_helper/overlay_vulkan.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/sepformer.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/stablediffusion.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/streamdemux.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/streammux.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/utils/__init__.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/utils/caption_utils.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/utils/format_converter.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/utils/gst_feature_manager.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/utils/metadata.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/utils/muxed_buffer_processor.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/utils/runtime_utils.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/video_transform.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/whisperlive.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/whisperspeechtts.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/whispertranscribe.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/plugins/python/yolo.py +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/setup.cfg +0 -0
- {gst_python_ml-1.0.3 → gst_python_ml-1.0.4}/tests/test_pipelines.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: gst-python-ml
|
|
3
|
-
Version: 1.0.3
|
|
3
|
+
Version: 1.0.4
|
|
4
4
|
Summary: An ML package for GStreamer
|
|
5
5
|
Author-email: Aaron Boxer <aaron.boxer@collabora.com>
|
|
6
6
|
Project-URL: Homepage, https://github.com/collabora/gst-python-ml
|
|
@@ -51,9 +51,13 @@ Supported functionality includes:
|
|
|
51
51
|
|
|
52
52
|
1. object detection
|
|
53
53
|
1. tracking
|
|
54
|
+
1. pose estimation (COCO 17-keypoint skeleton)
|
|
55
|
+
1. monocular depth estimation
|
|
56
|
+
1. zero-shot classification (CLIP / SigLIP)
|
|
54
57
|
1. video captioning
|
|
55
58
|
1. translation
|
|
56
59
|
1. transcription
|
|
60
|
+
1. voice activity detection
|
|
57
61
|
1. speech to text
|
|
58
62
|
1. text to speech
|
|
59
63
|
1. text to image
|
|
@@ -241,12 +245,6 @@ or
|
|
|
241
245
|
|
|
242
246
|
Now, in the container shell, set up `uv` `venv` as detailed above.
|
|
243
247
|
|
|
244
|
-
## IMPORTANT NOTES
|
|
245
|
-
|
|
246
|
-
### Birdseye
|
|
247
|
-
|
|
248
|
-
To use `pyml_birdseye`, additional pip requirements must be installed from the `plugins/python/birdseye` folder.
|
|
249
|
-
|
|
250
248
|
|
|
251
249
|
## Post Install
|
|
252
250
|
|
|
@@ -343,6 +341,147 @@ GST_DEBUG=4 gst-launch-1.0 filesrc location=data/soccer_tracking.mp4 ! decodebin
|
|
|
343
341
|
```
|
|
344
342
|
|
|
345
343
|
|
|
344
|
+
### Pose Estimation
|
|
345
|
+
|
|
346
|
+
`pyml_yolo_pose` supports all YOLO pose models. Recommended model names:
|
|
347
|
+
```
|
|
348
|
+
yolo11n-pose (fastest)
|
|
349
|
+
yolo11s-pose
|
|
350
|
+
yolo11m-pose (best accuracy)
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
#### YOLO pose with skeleton visualization (rendered on frame)
|
|
354
|
+
|
|
355
|
+
```
|
|
356
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
357
|
+
d. ! queue \
|
|
358
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
359
|
+
! pyml_yolo_pose model-name=yolo11n-pose device=cuda \
|
|
360
|
+
! videoconvert ! autovideosink sync=false
|
|
361
|
+
```
|
|
362
|
+
|
|
363
|
+
#### YOLO pose with bounding box overlay (metadata only, no in-element rendering)
|
|
364
|
+
|
|
365
|
+
```
|
|
366
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
367
|
+
d. ! queue \
|
|
368
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
369
|
+
! pyml_yolo_pose model-name=yolo11n-pose device=cuda visualize=false \
|
|
370
|
+
! videoconvert ! pyml_overlay ! videoconvert ! autovideosink sync=false
|
|
371
|
+
```
|
|
372
|
+
|
|
373
|
+
### Depth Estimation
|
|
374
|
+
|
|
375
|
+
`pyml_depth` supports DepthAnything V2 models from HuggingFace. Available model sizes:
|
|
376
|
+
```
|
|
377
|
+
depth-anything/Depth-Anything-V2-Small-hf (fastest, ~100 MB)
|
|
378
|
+
depth-anything/Depth-Anything-V2-Base-hf
|
|
379
|
+
depth-anything/Depth-Anything-V2-Large-hf (most accurate)
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
Available colormaps: `inferno` (default), `jet`, `viridis`, `plasma`, `magma`
|
|
383
|
+
|
|
384
|
+
#### DepthAnything V2 with inferno colormap
|
|
385
|
+
|
|
386
|
+
```
|
|
387
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
388
|
+
d. ! queue \
|
|
389
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
390
|
+
! pyml_depth model-name=depth-anything/Depth-Anything-V2-Small-hf device=cuda \
|
|
391
|
+
! videoconvert ! autovideosink sync=false
|
|
392
|
+
```
|
|
393
|
+
|
|
394
|
+
#### DepthAnything V2 with jet colormap
|
|
395
|
+
|
|
396
|
+
```
|
|
397
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
398
|
+
d. ! queue \
|
|
399
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
400
|
+
! pyml_depth model-name=depth-anything/Depth-Anything-V2-Small-hf device=cuda colormap=jet \
|
|
401
|
+
! videoconvert ! autovideosink sync=false
|
|
402
|
+
```
|
|
403
|
+
|
|
404
|
+
#### Depth with reduced compute via frame-stride
|
|
405
|
+
|
|
406
|
+
```
|
|
407
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
408
|
+
d. ! queue \
|
|
409
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
410
|
+
! pyml_depth model-name=depth-anything/Depth-Anything-V2-Small-hf device=cuda frame-stride=2 \
|
|
411
|
+
! videoconvert ! autovideosink sync=false
|
|
412
|
+
```
|
|
413
|
+
|
|
414
|
+
#### Depth with original video side-by-side (tee)
|
|
415
|
+
|
|
416
|
+
```
|
|
417
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
418
|
+
d. ! queue \
|
|
419
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
420
|
+
! tee name=t \
|
|
421
|
+
t. ! queue ! pyml_depth model-name=depth-anything/Depth-Anything-V2-Small-hf device=cuda ! videoconvert ! autovideosink sync=false \
|
|
422
|
+
t. ! queue ! videoconvert ! autovideosink sync=false
|
|
423
|
+
```
|
|
424
|
+
|
|
425
|
+
### Zero-Shot Classification (CLIP / SigLIP)
|
|
426
|
+
|
|
427
|
+
`pyml_clip` classifies each frame against a user-defined set of text labels
|
|
428
|
+
with no fixed label set — labels are set at pipeline launch time.
|
|
429
|
+
|
|
430
|
+
Supported models:
|
|
431
|
+
```
|
|
432
|
+
openai/clip-vit-base-patch32 (default, ~600 MB)
|
|
433
|
+
openai/clip-vit-large-patch14 (more accurate, ~1.7 GB)
|
|
434
|
+
google/siglip-base-patch16-224 (SigLIP, better zero-shot accuracy)
|
|
435
|
+
google/siglip-large-patch16-384 (SigLIP large)
|
|
436
|
+
```
|
|
437
|
+
|
|
438
|
+
#### CLIP with custom labels
|
|
439
|
+
|
|
440
|
+
```
|
|
441
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
442
|
+
d. ! queue \
|
|
443
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
444
|
+
! pyml_clip model-name=openai/clip-vit-base-patch32 device=cuda \
|
|
445
|
+
labels="person, bicycle, car, dog, cat" top-k=3 \
|
|
446
|
+
! videoconvert ! pyml_overlay ! videoconvert ! autovideosink sync=false
|
|
447
|
+
```
|
|
448
|
+
|
|
449
|
+
#### SigLIP (better zero-shot accuracy than CLIP)
|
|
450
|
+
|
|
451
|
+
```
|
|
452
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
453
|
+
d. ! queue \
|
|
454
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
455
|
+
! pyml_clip model-name=google/siglip-base-patch16-224 device=cuda \
|
|
456
|
+
labels="people walking, empty street, crowd, indoor scene" top-k=1 \
|
|
457
|
+
! videoconvert ! pyml_overlay ! videoconvert ! autovideosink sync=false
|
|
458
|
+
```
|
|
459
|
+
|
|
460
|
+
#### CLIP with threshold (only report labels above 20% confidence)
|
|
461
|
+
|
|
462
|
+
```
|
|
463
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
464
|
+
d. ! queue \
|
|
465
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
466
|
+
! pyml_clip model-name=openai/clip-vit-base-patch32 device=cuda \
|
|
467
|
+
labels="person, bicycle, car, dog, cat" threshold=0.2 \
|
|
468
|
+
! videoconvert ! pyml_overlay ! videoconvert ! autovideosink sync=false
|
|
469
|
+
```
|
|
470
|
+
|
|
471
|
+
### Voice Activity Detection
|
|
472
|
+
|
|
473
|
+
#### Standalone VAD with metadata (pass-through, speech probability attached to buffers)
|
|
474
|
+
|
|
475
|
+
```
|
|
476
|
+
GST_DEBUG=4 gst-launch-1.0 pulsesrc ! audio/x-raw,format=S16LE,rate=16000,channels=1 ! pyml_vad threshold=0.7 ! fakesink
|
|
477
|
+
```
|
|
478
|
+
|
|
479
|
+
#### VAD gating before transcription (mute silent audio, reduce Whisper latency)
|
|
480
|
+
|
|
481
|
+
```
|
|
482
|
+
GST_DEBUG=4 gst-launch-1.0 filesrc location=data/air_traffic_korean_with_english.wav ! decodebin ! audioconvert ! audioresample ! audio/x-raw,format=S16LE,rate=16000,channels=1 ! pyml_vad threshold=0.6 gate=true ! pyml_whispertranscribe device=cuda language=ko ! fakesink
|
|
483
|
+
```
|
|
484
|
+
|
|
346
485
|
### Transcription
|
|
347
486
|
|
|
348
487
|
#### transcription with initial prompt set
|
|
@@ -417,14 +556,6 @@ https://huggingface.co/models?sort=trending&search=Helsinki
|
|
|
417
556
|
GST_DEBUG=3 gst-launch-1.0 filesrc location=data/soccer_single_camera.mp4 ! decodebin ! videoconvertscale ! video/x-raw,width=640,height=480 ! tee name=t t. ! queue ! textoverlay name=overlay wait-text=false ! videoconvert ! autovideosink t. ! queue leaky=2 max-size-buffers=1 ! videoconvertscale ! video/x-raw,width=240,height=180 ! pyml_caption_qwen device=cuda:0 prompt="In one sentence, describe what you see?" model-name="Qwen/Qwen2.5-VL-3B-Instruct-AWQ" name=cap cap.src ! fakesink async=0 sync=0 cap.text_src ! queue ! coalescehistory history-length=10 ! pyml_llm model-name="Qwen/Qwen3-0.6B" device=cuda system-prompt="You receive the history of what happened in recent times, summarize it nicely with excitement but NEVER mention the specific times. Focus on the most recent events." ! queue ! overlay.text_sink
|
|
418
557
|
```
|
|
419
558
|
|
|
420
|
-
|
|
421
|
-
### Bird's Eye View
|
|
422
|
-
|
|
423
|
-
`GST_DEBUG=4 gst-launch-1.0 filesrc location=data/soccer_single_camera.mp4 ! decodebin ! videoconvert ! pyml_birdseye ! videoconvert ! autovideosink`
|
|
424
|
-
|
|
425
|
-
`GST_DEBUG=4 gst-launch-1.0 filesrc location=data/soccer_single_camera.mp4 ! decodebin ! videorate ! video/x-raw,framerate=30/1 ! videoconvert ! pyml_birdseye ! videoconvert ! openh264enc ! h264parse ! matroskamux ! filesink location=output.mkv`
|
|
426
|
-
|
|
427
|
-
|
|
428
559
|
### kafkasink
|
|
429
560
|
|
|
430
561
|
#### Setting up kafka network
|
|
@@ -6,9 +6,13 @@ Supported functionality includes:
|
|
|
6
6
|
|
|
7
7
|
1. object detection
|
|
8
8
|
1. tracking
|
|
9
|
+
1. pose estimation (COCO 17-keypoint skeleton)
|
|
10
|
+
1. monocular depth estimation
|
|
11
|
+
1. zero-shot classification (CLIP / SigLIP)
|
|
9
12
|
1. video captioning
|
|
10
13
|
1. translation
|
|
11
14
|
1. transcription
|
|
15
|
+
1. voice activity detection
|
|
12
16
|
1. speech to text
|
|
13
17
|
1. text to speech
|
|
14
18
|
1. text to image
|
|
@@ -196,12 +200,6 @@ or
|
|
|
196
200
|
|
|
197
201
|
Now, in the container shell, set up `uv` `venv` as detailed above.
|
|
198
202
|
|
|
199
|
-
## IMPORTANT NOTES
|
|
200
|
-
|
|
201
|
-
### Birdseye
|
|
202
|
-
|
|
203
|
-
To use `pyml_birdseye`, additional pip requirements must be installed from the `plugins/python/birdseye` folder.
|
|
204
|
-
|
|
205
203
|
|
|
206
204
|
## Post Install
|
|
207
205
|
|
|
@@ -298,6 +296,147 @@ GST_DEBUG=4 gst-launch-1.0 filesrc location=data/soccer_tracking.mp4 ! decodebin
|
|
|
298
296
|
```
|
|
299
297
|
|
|
300
298
|
|
|
299
|
+
### Pose Estimation
|
|
300
|
+
|
|
301
|
+
`pyml_yolo_pose` supports all YOLO pose models. Recommended model names:
|
|
302
|
+
```
|
|
303
|
+
yolo11n-pose (fastest)
|
|
304
|
+
yolo11s-pose
|
|
305
|
+
yolo11m-pose (best accuracy)
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
#### YOLO pose with skeleton visualization (rendered on frame)
|
|
309
|
+
|
|
310
|
+
```
|
|
311
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
312
|
+
d. ! queue \
|
|
313
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
314
|
+
! pyml_yolo_pose model-name=yolo11n-pose device=cuda \
|
|
315
|
+
! videoconvert ! autovideosink sync=false
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
#### YOLO pose with bounding box overlay (metadata only, no in-element rendering)
|
|
319
|
+
|
|
320
|
+
```
|
|
321
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
322
|
+
d. ! queue \
|
|
323
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
324
|
+
! pyml_yolo_pose model-name=yolo11n-pose device=cuda visualize=false \
|
|
325
|
+
! videoconvert ! pyml_overlay ! videoconvert ! autovideosink sync=false
|
|
326
|
+
```
|
|
327
|
+
|
|
328
|
+
### Depth Estimation
|
|
329
|
+
|
|
330
|
+
`pyml_depth` supports DepthAnything V2 models from HuggingFace. Available model sizes:
|
|
331
|
+
```
|
|
332
|
+
depth-anything/Depth-Anything-V2-Small-hf (fastest, ~100 MB)
|
|
333
|
+
depth-anything/Depth-Anything-V2-Base-hf
|
|
334
|
+
depth-anything/Depth-Anything-V2-Large-hf (most accurate)
|
|
335
|
+
```
|
|
336
|
+
|
|
337
|
+
Available colormaps: `inferno` (default), `jet`, `viridis`, `plasma`, `magma`
|
|
338
|
+
|
|
339
|
+
#### DepthAnything V2 with inferno colormap
|
|
340
|
+
|
|
341
|
+
```
|
|
342
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
343
|
+
d. ! queue \
|
|
344
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
345
|
+
! pyml_depth model-name=depth-anything/Depth-Anything-V2-Small-hf device=cuda \
|
|
346
|
+
! videoconvert ! autovideosink sync=false
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
#### DepthAnything V2 with jet colormap
|
|
350
|
+
|
|
351
|
+
```
|
|
352
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
353
|
+
d. ! queue \
|
|
354
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
355
|
+
! pyml_depth model-name=depth-anything/Depth-Anything-V2-Small-hf device=cuda colormap=jet \
|
|
356
|
+
! videoconvert ! autovideosink sync=false
|
|
357
|
+
```
|
|
358
|
+
|
|
359
|
+
#### Depth with reduced compute via frame-stride
|
|
360
|
+
|
|
361
|
+
```
|
|
362
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
363
|
+
d. ! queue \
|
|
364
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
365
|
+
! pyml_depth model-name=depth-anything/Depth-Anything-V2-Small-hf device=cuda frame-stride=2 \
|
|
366
|
+
! videoconvert ! autovideosink sync=false
|
|
367
|
+
```
|
|
368
|
+
|
|
369
|
+
#### Depth with original video side-by-side (tee)
|
|
370
|
+
|
|
371
|
+
```
|
|
372
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
373
|
+
d. ! queue \
|
|
374
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
375
|
+
! tee name=t \
|
|
376
|
+
t. ! queue ! pyml_depth model-name=depth-anything/Depth-Anything-V2-Small-hf device=cuda ! videoconvert ! autovideosink sync=false \
|
|
377
|
+
t. ! queue ! videoconvert ! autovideosink sync=false
|
|
378
|
+
```
|
|
379
|
+
|
|
380
|
+
### Zero-Shot Classification (CLIP / SigLIP)
|
|
381
|
+
|
|
382
|
+
`pyml_clip` classifies each frame against a user-defined set of text labels
|
|
383
|
+
with no fixed label set — labels are set at pipeline launch time.
|
|
384
|
+
|
|
385
|
+
Supported models:
|
|
386
|
+
```
|
|
387
|
+
openai/clip-vit-base-patch32 (default, ~600 MB)
|
|
388
|
+
openai/clip-vit-large-patch14 (more accurate, ~1.7 GB)
|
|
389
|
+
google/siglip-base-patch16-224 (SigLIP, better zero-shot accuracy)
|
|
390
|
+
google/siglip-large-patch16-384 (SigLIP large)
|
|
391
|
+
```
|
|
392
|
+
|
|
393
|
+
#### CLIP with custom labels
|
|
394
|
+
|
|
395
|
+
```
|
|
396
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
397
|
+
d. ! queue \
|
|
398
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
399
|
+
! pyml_clip model-name=openai/clip-vit-base-patch32 device=cuda \
|
|
400
|
+
labels="person, bicycle, car, dog, cat" top-k=3 \
|
|
401
|
+
! videoconvert ! pyml_overlay ! videoconvert ! autovideosink sync=false
|
|
402
|
+
```
|
|
403
|
+
|
|
404
|
+
#### SigLIP (better zero-shot accuracy than CLIP)
|
|
405
|
+
|
|
406
|
+
```
|
|
407
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
408
|
+
d. ! queue \
|
|
409
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
410
|
+
! pyml_clip model-name=google/siglip-base-patch16-224 device=cuda \
|
|
411
|
+
labels="people walking, empty street, crowd, indoor scene" top-k=1 \
|
|
412
|
+
! videoconvert ! pyml_overlay ! videoconvert ! autovideosink sync=false
|
|
413
|
+
```
|
|
414
|
+
|
|
415
|
+
#### CLIP with threshold (only report labels above 20% confidence)
|
|
416
|
+
|
|
417
|
+
```
|
|
418
|
+
gst-launch-1.0 filesrc location=data/people.mp4 ! decodebin name=d \
|
|
419
|
+
d. ! queue \
|
|
420
|
+
! videoconvert ! videoscale ! "video/x-raw,width=640,height=480" \
|
|
421
|
+
! pyml_clip model-name=openai/clip-vit-base-patch32 device=cuda \
|
|
422
|
+
labels="person, bicycle, car, dog, cat" threshold=0.2 \
|
|
423
|
+
! videoconvert ! pyml_overlay ! videoconvert ! autovideosink sync=false
|
|
424
|
+
```
|
|
425
|
+
|
|
426
|
+
### Voice Activity Detection
|
|
427
|
+
|
|
428
|
+
#### Standalone VAD with metadata (pass-through, speech probability attached to buffers)
|
|
429
|
+
|
|
430
|
+
```
|
|
431
|
+
GST_DEBUG=4 gst-launch-1.0 pulsesrc ! audio/x-raw,format=S16LE,rate=16000,channels=1 ! pyml_vad threshold=0.7 ! fakesink
|
|
432
|
+
```
|
|
433
|
+
|
|
434
|
+
#### VAD gating before transcription (mute silent audio, reduce Whisper latency)
|
|
435
|
+
|
|
436
|
+
```
|
|
437
|
+
GST_DEBUG=4 gst-launch-1.0 filesrc location=data/air_traffic_korean_with_english.wav ! decodebin ! audioconvert ! audioresample ! audio/x-raw,format=S16LE,rate=16000,channels=1 ! pyml_vad threshold=0.6 gate=true ! pyml_whispertranscribe device=cuda language=ko ! fakesink
|
|
438
|
+
```
|
|
439
|
+
|
|
301
440
|
### Transcription
|
|
302
441
|
|
|
303
442
|
#### transcription with initial prompt set
|
|
@@ -372,14 +511,6 @@ https://huggingface.co/models?sort=trending&search=Helsinki
|
|
|
372
511
|
GST_DEBUG=3 gst-launch-1.0 filesrc location=data/soccer_single_camera.mp4 ! decodebin ! videoconvertscale ! video/x-raw,width=640,height=480 ! tee name=t t. ! queue ! textoverlay name=overlay wait-text=false ! videoconvert ! autovideosink t. ! queue leaky=2 max-size-buffers=1 ! videoconvertscale ! video/x-raw,width=240,height=180 ! pyml_caption_qwen device=cuda:0 prompt="In one sentence, describe what you see?" model-name="Qwen/Qwen2.5-VL-3B-Instruct-AWQ" name=cap cap.src ! fakesink async=0 sync=0 cap.text_src ! queue ! coalescehistory history-length=10 ! pyml_llm model-name="Qwen/Qwen3-0.6B" device=cuda system-prompt="You receive the history of what happened in recent times, summarize it nicely with excitement but NEVER mention the specific times. Focus on the most recent events." ! queue ! overlay.text_sink
|
|
373
512
|
```
|
|
374
513
|
|
|
375
|
-
|
|
376
|
-
### Bird's Eye View
|
|
377
|
-
|
|
378
|
-
`GST_DEBUG=4 gst-launch-1.0 filesrc location=data/soccer_single_camera.mp4 ! decodebin ! videoconvert ! pyml_birdseye ! videoconvert ! autovideosink`
|
|
379
|
-
|
|
380
|
-
`GST_DEBUG=4 gst-launch-1.0 filesrc location=data/soccer_single_camera.mp4 ! decodebin ! videorate ! video/x-raw,framerate=30/1 ! videoconvert ! pyml_birdseye ! videoconvert ! openh264enc ! h264parse ! matroskamux ! filesink location=output.mkv`
|
|
381
|
-
|
|
382
|
-
|
|
383
514
|
### kafkasink
|
|
384
515
|
|
|
385
516
|
#### Setting up kafka network
|
|
@@ -124,7 +124,7 @@ class BaseObjectDetector(VideoTransform):
|
|
|
124
124
|
count = GstAnalytics.relation_get_length(attached_meta)
|
|
125
125
|
self.logger.info(f"Total metadata relations attached: {count}")
|
|
126
126
|
else:
|
|
127
|
-
self.logger.
|
|
127
|
+
self.logger.debug("No detections on this buffer")
|
|
128
128
|
|
|
129
129
|
return Gst.FlowReturn.OK
|
|
130
130
|
|