PyPI - mllm-annotator - Versions diffs - 0.1.0__tar.gz - Mend

mllm-annotator 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

mllm_annotator-0.1.0/LICENSE +21 -0
mllm_annotator-0.1.0/PKG-INFO +170 -0
mllm_annotator-0.1.0/README.md +138 -0
mllm_annotator-0.1.0/pyproject.toml +58 -0
mllm_annotator-0.1.0/setup.cfg +4 -0
mllm_annotator-0.1.0/src/mllm_annotator/__init__.py +33 -0
mllm_annotator-0.1.0/src/mllm_annotator/config.py +151 -0
mllm_annotator-0.1.0/src/mllm_annotator/core.py +786 -0
mllm_annotator-0.1.0/src/mllm_annotator/embedder.py +375 -0
mllm_annotator-0.1.0/src/mllm_annotator/ui.py +1524 -0
mllm_annotator-0.1.0/src/mllm_annotator.egg-info/PKG-INFO +170 -0
mllm_annotator-0.1.0/src/mllm_annotator.egg-info/SOURCES.txt +14 -0
mllm_annotator-0.1.0/src/mllm_annotator.egg-info/dependency_links.txt +1 -0
mllm_annotator-0.1.0/src/mllm_annotator.egg-info/entry_points.txt +5 -0
mllm_annotator-0.1.0/src/mllm_annotator.egg-info/requires.txt +13 -0
mllm_annotator-0.1.0/src/mllm_annotator.egg-info/top_level.txt +1 -0

mllm_annotator-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 University of Bern, Data Science Lab, Matteo Boi
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

mllm_annotator-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,170 @@
+Metadata-Version: 2.4
+Name: mllm-annotator
+Version: 0.1.0
+Summary: Resumable multimodal-LLM annotator and embedder for folders of audio or image files.
+Author-email: Matteo Boi <matteo.boi@unibe.ch>
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/BoiMat/mllm-annotator
+Project-URL: Repository, https://github.com/BoiMat/mllm-annotator
+Keywords: gemini,annotation,labeling,multimodal,audio,image,embeddings,llm
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Science/Research
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Multimedia
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: google-genai>=1.0.0
+Requires-Dist: keyring>=24
+Provides-Extra: ui
+Requires-Dist: customtkinter>=5.2.0; extra == "ui"
+Provides-Extra: viz
+Requires-Dist: umap-learn>=0.5; extra == "viz"
+Requires-Dist: matplotlib>=3.8; extra == "viz"
+Requires-Dist: numpy>=1.24; extra == "viz"
+Provides-Extra: all
+Requires-Dist: mllm-annotator[ui,viz]; extra == "all"
+Dynamic: license-file
+# mllm-annotator
+A small, resumable tool for sending folders of audio or image files to a
+multimodal LLM for **automatic annotation**, plus an **embedding + UMAP
+visualization** workflow. Gemini is the current backend; the design keeps the
+provider behind a thin seam so others can be added later.
+It ships both a command-line tool and a desktop GUI.
+## Install
+```powershell
+# CLI only
+pip install mllm-annotator
+# with the desktop GUI and the embed/visualize feature
+pip install "mllm-annotator[ui,viz]"
+```
+Or, for development from a clone:
+```powershell
+uv sync --extra ui --extra viz
+```
+The embed/visualize feature also needs **ffmpeg** on your `PATH` to handle
+audio formats Gemini can't embed directly (e.g. `.aac`, `.opus`). It is an
+optional system dependency, not a pip package; without it those files are
+skipped.
+## API key
+Provide a Gemini API key in any one of these ways (checked in this order):
+1. environment variable `GEMINI_API_KEY` (or `GOOGLE_API_KEY`):
+   ```powershell
+   $env:GEMINI_API_KEY="your_api_key"
+   ```
+2. a `.env` file in the current working directory:
+   ```text
+   GEMINI_API_KEY=your_api_key
+   ```
+3. saved from inside the GUI — click **API Key**, paste it, and it is stored
+   securely in your OS keyring (Windows Credential Manager / macOS Keychain /
+   Linux Secret Service). No plaintext file is written.
+`.env` is ignored by git, and keys are never written into the built package.
+## Command line
+```powershell
+mllm-annotator --help
+```
+### Examples
+Horse cough annotation:
+```powershell
+mllm-annotator `
+  --input-folder "C:\data\horse_audio" `
+  --media-type audio `
+  --instruction "Annotate if the audio contains a horse cough or another sound such as the horse smacking the microphone." `
+  --daily-limit 500
+```
+Swiss German transcription validation:
+```powershell
+mllm-annotator `
+  --input-folder "C:\data\swiss_german_audio" `
+  --media-type audio `
+  --labels-csv "C:\data\transcriptions.csv" `
+  --instruction "Confirm whether the attached Swiss German audio matches the associated transcription. If it is wrong, rewrite the correct transcription." `
+  --daily-limit 500
+```
+Image captioning:
+```powershell
+mllm-annotator `
+  --input-folder "C:\data\images" `
+  --media-type image `
+  --instruction "Caption the attached image." `
+  --daily-limit 500
+```
+## Desktop GUI
+```powershell
+mllm-annotator-ui
+```
+The GUI lets you browse for the data folder, choose audio or image mode,
+optionally select a `filename,label` CSV, write the instruction, preview the
+file table, and start or resume processing. It shows the rewritten prompt and
+updates each row as Gemini responses arrive, using the same JSONL result/state
+files as the CLI. A second tab embeds the media and shows an interactive 2-D
+UMAP projection (zoom/pan toolbar, hover a point for its file name).
+## CSV format
+The optional labels CSV must contain exactly one row per media file and these
+columns:
+```csv
+filename,label
+audio_001.wav,expected transcription or label
+audio_002.wav,another label
+```
+For `--recursive`, `filename` must be the relative path with forward slashes,
+for example `speaker_a/audio_001.wav`.
+## Resume behavior
+By default, results are appended to `runs/results.jsonl` and progress is saved
+in `runs/state.json`. If the daily limit is reached or the API returns a
+quota or rate-limit error, run the same command again later or the next day —
+already-processed files are skipped.
+The first run rewrites your natural-language instruction with `gemini-3.5-flash`
+and stores it in the state file. The media files are processed with
+`gemini-3.1-flash-lite`. Use `--no-rewrite` to skip the rewrite call.
+Before spending API calls, you can validate the folder and optional CSV:
+```powershell
+mllm-annotator `
+  --input-folder "C:\data\images" `
+  --media-type image `
+  --instruction "Caption the attached image." `
+  --dry-run
+```

mllm_annotator-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,138 @@
+# mllm-annotator
+A small, resumable tool for sending folders of audio or image files to a
+multimodal LLM for **automatic annotation**, plus an **embedding + UMAP
+visualization** workflow. Gemini is the current backend; the design keeps the
+provider behind a thin seam so others can be added later.
+It ships both a command-line tool and a desktop GUI.
+## Install
+```powershell
+# CLI only
+pip install mllm-annotator
+# with the desktop GUI and the embed/visualize feature
+pip install "mllm-annotator[ui,viz]"
+```
+Or, for development from a clone:
+```powershell
+uv sync --extra ui --extra viz
+```
+The embed/visualize feature also needs **ffmpeg** on your `PATH` to handle
+audio formats Gemini can't embed directly (e.g. `.aac`, `.opus`). It is an
+optional system dependency, not a pip package; without it those files are
+skipped.
+## API key
+Provide a Gemini API key in any one of these ways (checked in this order):
+1. environment variable `GEMINI_API_KEY` (or `GOOGLE_API_KEY`):
+   ```powershell
+   $env:GEMINI_API_KEY="your_api_key"
+   ```
+2. a `.env` file in the current working directory:
+   ```text
+   GEMINI_API_KEY=your_api_key
+   ```
+3. saved from inside the GUI — click **API Key**, paste it, and it is stored
+   securely in your OS keyring (Windows Credential Manager / macOS Keychain /
+   Linux Secret Service). No plaintext file is written.
+`.env` is ignored by git, and keys are never written into the built package.
+## Command line
+```powershell
+mllm-annotator --help
+```
+### Examples
+Horse cough annotation:
+```powershell
+mllm-annotator `
+  --input-folder "C:\data\horse_audio" `
+  --media-type audio `
+  --instruction "Annotate if the audio contains a horse cough or another sound such as the horse smacking the microphone." `
+  --daily-limit 500
+```
+Swiss German transcription validation:
+```powershell
+mllm-annotator `
+  --input-folder "C:\data\swiss_german_audio" `
+  --media-type audio `
+  --labels-csv "C:\data\transcriptions.csv" `
+  --instruction "Confirm whether the attached Swiss German audio matches the associated transcription. If it is wrong, rewrite the correct transcription." `
+  --daily-limit 500
+```
+Image captioning:
+```powershell
+mllm-annotator `
+  --input-folder "C:\data\images" `
+  --media-type image `
+  --instruction "Caption the attached image." `
+  --daily-limit 500
+```
+## Desktop GUI
+```powershell
+mllm-annotator-ui
+```
+The GUI lets you browse for the data folder, choose audio or image mode,
+optionally select a `filename,label` CSV, write the instruction, preview the
+file table, and start or resume processing. It shows the rewritten prompt and
+updates each row as Gemini responses arrive, using the same JSONL result/state
+files as the CLI. A second tab embeds the media and shows an interactive 2-D
+UMAP projection (zoom/pan toolbar, hover a point for its file name).
+## CSV format
+The optional labels CSV must contain exactly one row per media file and these
+columns:
+```csv
+filename,label
+audio_001.wav,expected transcription or label
+audio_002.wav,another label
+```
+For `--recursive`, `filename` must be the relative path with forward slashes,
+for example `speaker_a/audio_001.wav`.
+## Resume behavior
+By default, results are appended to `runs/results.jsonl` and progress is saved
+in `runs/state.json`. If the daily limit is reached or the API returns a
+quota or rate-limit error, run the same command again later or the next day —
+already-processed files are skipped.
+The first run rewrites your natural-language instruction with `gemini-3.5-flash`
+and stores it in the state file. The media files are processed with
+`gemini-3.1-flash-lite`. Use `--no-rewrite` to skip the rewrite call.
+Before spending API calls, you can validate the folder and optional CSV:
+```powershell
+mllm-annotator `
+  --input-folder "C:\data\images" `
+  --media-type image `
+  --instruction "Caption the attached image." `
+  --dry-run
+```

mllm_annotator-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,58 @@
+[project]
+name = "mllm-annotator"
+version = "0.1.0"
+description = "Resumable multimodal-LLM annotator and embedder for folders of audio or image files."
+readme = "README.md"
+requires-python = ">=3.10"
+license = "MIT"
+license-files = ["LICENSE"]
+authors = [{ name = "Matteo Boi", email = "matteo.boi@unibe.ch" }]
+keywords = [
+    "gemini",
+    "annotation",
+    "labeling",
+    "multimodal",
+    "audio",
+    "image",
+    "embeddings",
+    "llm",
+]
+classifiers = [
+    "Development Status :: 3 - Alpha",
+    "Intended Audience :: Science/Research",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Topic :: Multimedia",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+dependencies = [
+    "google-genai>=1.0.0",
+    "keyring>=24",
+]
+[project.optional-dependencies]
+ui = ["customtkinter>=5.2.0"]
+viz = ["umap-learn>=0.5", "matplotlib>=3.8", "numpy>=1.24"]
+all = ["mllm-annotator[ui,viz]"]
+[project.urls]
+Homepage = "https://github.com/BoiMat/mllm-annotator"
+Repository = "https://github.com/BoiMat/mllm-annotator"
+[project.scripts]
+mllm-annotator = "mllm_annotator.core:main"
+[project.gui-scripts]
+mllm-annotator-ui = "mllm_annotator.ui:main"
+[build-system]
+requires = ["setuptools>=77"]
+build-backend = "setuptools.build_meta"
+[tool.setuptools.packages.find]
+where = ["src"]
+[tool.uv]
+package = true

mllm_annotator-0.1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

mllm_annotator-0.1.0/src/mllm_annotator/__init__.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""mllm-annotator: resumable multimodal-LLM media annotation and embedding.
+Gemini is the first (and currently only) backend; the public surface is kept
+backend-agnostic so additional providers can be added later.
+"""
+from __future__ import annotations
+__version__ = "0.1.0"
+from .config import (
+    BACKENDS,
+    Backend,
+    ConfigError,
+    clear_api_key,
+    get_api_key,
+    has_api_key,
+    store_api_key,
+)
+from .core import MediaItem, RateLimitReached
+# Public API of the package: every name listed here is either defined in this
+# module (``__version__``) or imported above from ``.config`` / ``.core``.
+__all__ = [
+    "__version__",
+    "BACKENDS",
+    "Backend",
+    "ConfigError",
+    "MediaItem",
+    "RateLimitReached",
+    "get_api_key",
+    "has_api_key",
+    "store_api_key",
+    "clear_api_key",
+]

mllm_annotator-0.1.0/src/mllm_annotator/config.py ADDED Viewed

@@ -0,0 +1,151 @@
+"""API key resolution and secure storage, per backend.
+For each backend, a key is resolved in order from:
+1. an environment variable (e.g. ``GEMINI_API_KEY`` / ``GOOGLE_API_KEY``);
+2. a ``.env`` file in the current working directory;
+3. the OS keyring (Windows Credential Manager / macOS Keychain / Secret
+   Service), where the GUI stores keys the user enters.
+The keyring is optional at runtime: if the ``keyring`` package or its backend
+is unavailable, resolution silently falls back to the environment/``.env``.
+Backends are registered in ``BACKENDS``. Gemini is the only one today; adding
+another is just a new ``Backend`` entry plus its client wiring — key storage,
+resolution, and the GUI dialog all pick it up automatically.
+"""
+from __future__ import annotations
+import os
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+# Service name passed to every keyring call in this module; the backend id is
+# passed alongside it as the entry (user) name.
+KEYRING_SERVICE = "mllm-annotator"
+class ConfigError(Exception):
+    """Raised when user-provided inputs are inconsistent or incomplete.
+    Within this module it signals an unknown backend id, a missing API key,
+    or a failure to save a key to the OS keyring.
+    """
+@dataclass(frozen=True)
+class Backend:
+    """A model provider that needs an API key.
+    Instances are immutable (``frozen=True``) and are registered in
+    ``BACKENDS`` keyed by their ``id``.
+    """
+    # Stable identifier: the key in ``BACKENDS`` and the entry name under which
+    # the API key is stored in the OS keyring.
+    id: str
+    # Human-readable provider name, used in user-facing error messages.
+    label: str
+    # Environment variable names, checked in order; the first non-empty wins.
+    env_vars: tuple[str, ...]
+# The Gemini provider. Its key may come from either environment variable, with
+# ``GEMINI_API_KEY`` checked before ``GOOGLE_API_KEY``.
+GEMINI = Backend(
+    id="gemini",
+    label="Google Gemini",
+    env_vars=("GEMINI_API_KEY", "GOOGLE_API_KEY"),
+)
+# Registry of known backends, keyed by backend id.
+BACKENDS: dict[str, Backend] = {GEMINI.id: GEMINI}
+# Backend used when a caller does not pass ``backend_id`` explicitly.
+DEFAULT_BACKEND = GEMINI.id
+def _backend(backend_id: str) -> Backend:
+    try:
+        return BACKENDS[backend_id]
+    except KeyError:
+        raise ConfigError(f"Unknown backend: {backend_id}") from None
+def load_env_file(path: Path) -> None:
+    """Load ``KEY=VALUE`` lines from *path* into os.environ (real env wins)."""
+    if not path.exists() or not path.is_file():
+        return
+    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
+        line = raw_line.strip()
+        if not line or line.startswith("#") or "=" not in line:
+            continue
+        key, value = line.split("=", 1)
+        key = key.strip()
+        value = value.strip().strip('"').strip("'")
+        if key:
+            os.environ.setdefault(key, value)
+def _keyring() -> Any | None:
+    """Return the keyring module, or None if it (or its backend) is unusable."""
+    try:
+        import keyring
+    except Exception:
+        return None
+    return keyring
+def _keyring_get(backend_id: str) -> str | None:
+    kr = _keyring()
+    if kr is None:
+        return None
+    try:
+        return kr.get_password(KEYRING_SERVICE, backend_id)
+    except Exception:
+        return None
+def _env_key(backend: Backend) -> str | None:
+    load_env_file(Path.cwd() / ".env")
+    for var in backend.env_vars:
+        value = os.environ.get(var)
+        if value:
+            return value
+    return None
+def get_api_key(backend_id: str = DEFAULT_BACKEND) -> str:
+    """Return the API key for *backend_id*, or raise ConfigError with guidance."""
+    backend = _backend(backend_id)
+    key = _env_key(backend) or _keyring_get(backend.id)
+    if key:
+        return key
+    raise ConfigError(
+        f"No API key set for {backend.label}. Set {' or '.join(backend.env_vars)}, "
+        "add it to a .env file, or save it in the app (the 'Set API Key' button, "
+        "top-left)."
+    )
+def key_source(backend_id: str = DEFAULT_BACKEND) -> str | None:
+    """Where *backend_id*'s key comes from: 'environment', 'keyring', or None."""
+    backend = _backend(backend_id)
+    if _env_key(backend):
+        return "environment"
+    if _keyring_get(backend.id):
+        return "keyring"
+    return None
+def has_api_key(backend_id: str = DEFAULT_BACKEND) -> bool:
+    return key_source(backend_id) is not None
+def store_api_key(key: str, backend_id: str = DEFAULT_BACKEND) -> None:
+    """Persist *key* for *backend_id* in the OS keyring."""
+    _backend(backend_id)  # validate
+    kr = _keyring()
+    if kr is None:
+        raise ConfigError(
+            "Cannot save the API key: the 'keyring' package or its OS backend is "
+            "unavailable. Set an environment variable or use a .env file instead."
+        )
+    try:
+        kr.set_password(KEYRING_SERVICE, backend_id, key)
+    except Exception as exc:  # noqa: BLE001 - surface backend failures clearly.
+        raise ConfigError(f"Could not save the API key to the keyring: {exc}") from exc
+def clear_api_key(backend_id: str = DEFAULT_BACKEND) -> None:
+    """Remove *backend_id*'s key from the OS keyring (best effort)."""
+    kr = _keyring()
+    if kr is None:
+        return
+    try:
+        kr.delete_password(KEYRING_SERVICE, backend_id)
+    except Exception:
+        pass