mllm-annotator 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 University of Bern, Data Science Lab, Matteo Boi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,170 @@
1
+ Metadata-Version: 2.4
2
+ Name: mllm-annotator
3
+ Version: 0.1.0
4
+ Summary: Resumable multimodal-LLM annotator and embedder for folders of audio or image files.
5
+ Author-email: Matteo Boi <matteo.boi@unibe.ch>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/BoiMat/mllm-annotator
8
+ Project-URL: Repository, https://github.com/BoiMat/mllm-annotator
9
+ Keywords: gemini,annotation,labeling,multimodal,audio,image,embeddings,llm
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Multimedia
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: google-genai>=1.0.0
22
+ Requires-Dist: keyring>=24
23
+ Provides-Extra: ui
24
+ Requires-Dist: customtkinter>=5.2.0; extra == "ui"
25
+ Provides-Extra: viz
26
+ Requires-Dist: umap-learn>=0.5; extra == "viz"
27
+ Requires-Dist: matplotlib>=3.8; extra == "viz"
28
+ Requires-Dist: numpy>=1.24; extra == "viz"
29
+ Provides-Extra: all
30
+ Requires-Dist: mllm-annotator[ui,viz]; extra == "all"
31
+ Dynamic: license-file
32
+
33
+ # mllm-annotator
34
+
35
+ A small, resumable tool for sending folders of audio or image files to a
36
+ multimodal LLM for **automatic annotation**, plus an **embedding + UMAP
37
+ visualization** workflow. Gemini is the current backend; the design keeps the
38
+ provider behind a thin seam so others can be added later.
39
+
40
+ It ships both a command-line tool and a desktop GUI.
41
+
42
+ ## Install
43
+
44
+ ```powershell
45
+ # CLI only
46
+ pip install mllm-annotator
47
+
48
+ # with the desktop GUI and the embed/visualize feature
49
+ pip install "mllm-annotator[ui,viz]"
50
+ ```
51
+
52
+ Or, for development from a clone:
53
+
54
+ ```powershell
55
+ uv sync --extra ui --extra viz
56
+ ```
57
+
58
+ The embed/visualize feature also needs **ffmpeg** on your `PATH` to handle
59
+ audio formats Gemini can't embed directly (e.g. `.aac`, `.opus`). It is an
60
+ optional system dependency, not a pip package; without it those files are
61
+ skipped.
62
+
63
+ ## API key
64
+
65
+ Provide a Gemini API key in any one of these ways (checked in this order):
66
+
67
+ 1. environment variable `GEMINI_API_KEY` (or `GOOGLE_API_KEY`):
68
+
69
+ ```powershell
70
+ $env:GEMINI_API_KEY="your_api_key"
71
+ ```
72
+
73
+ 2. a `.env` file in the current working directory:
74
+
75
+ ```text
76
+ GEMINI_API_KEY=your_api_key
77
+ ```
78
+
79
+ 3. saved from inside the GUI — click **API Key**, paste it, and it is stored
80
+ securely in your OS keyring (Windows Credential Manager / macOS Keychain /
81
+ Linux Secret Service). No plaintext file is written.
82
+
83
+ `.env` is ignored by git, and keys are never written into the built package.
84
+
85
+ ## Command line
86
+
87
+ ```powershell
88
+ mllm-annotator --help
89
+ ```
90
+
91
+ ### Examples
92
+
93
+ Horse cough annotation:
94
+
95
+ ```powershell
96
+ mllm-annotator `
97
+ --input-folder "C:\data\horse_audio" `
98
+ --media-type audio `
99
+ --instruction "Annotate if the audio contains a horse cough or another sound such as the horse smacking the microphone." `
100
+ --daily-limit 500
101
+ ```
102
+
103
+ Swiss German transcription validation:
104
+
105
+ ```powershell
106
+ mllm-annotator `
107
+ --input-folder "C:\data\swiss_german_audio" `
108
+ --media-type audio `
109
+ --labels-csv "C:\data\transcriptions.csv" `
110
+ --instruction "Confirm whether the attached Swiss German audio matches the associated transcription. If it is wrong, rewrite the correct transcription." `
111
+ --daily-limit 500
112
+ ```
113
+
114
+ Image captioning:
115
+
116
+ ```powershell
117
+ mllm-annotator `
118
+ --input-folder "C:\data\images" `
119
+ --media-type image `
120
+ --instruction "Caption the attached image." `
121
+ --daily-limit 500
122
+ ```
123
+
124
+ ## Desktop GUI
125
+
126
+ ```powershell
127
+ mllm-annotator-ui
128
+ ```
129
+
130
+ The GUI lets you browse for the data folder, choose audio or image mode,
131
+ optionally select a `filename,label` CSV, write the instruction, preview the
132
+ file table, and start or resume processing. It shows the rewritten prompt and
133
+ updates each row as Gemini responses arrive, using the same JSONL result/state
134
+ files as the CLI. A second tab embeds the media and shows an interactive 2-D
135
+ UMAP projection (zoom/pan toolbar, hover a point for its file name).
136
+
137
+ ## CSV format
138
+
139
+ The optional labels CSV must contain exactly one row per media file and these
140
+ columns:
141
+
142
+ ```csv
143
+ filename,label
144
+ audio_001.wav,expected transcription or label
145
+ audio_002.wav,another label
146
+ ```
147
+
148
+ For `--recursive`, `filename` must be the relative path with forward slashes,
149
+ for example `speaker_a/audio_001.wav`.
150
+
151
+ ## Resume behavior
152
+
153
+ By default, results are appended to `runs/results.jsonl` and progress is saved
154
+ in `runs/state.json`. If the daily limit is reached or the API returns a
155
+ quota/rate-limit error, run the same command again later or the next day — already
156
+ processed files are skipped.
157
+
158
+ The first run rewrites your natural-language instruction with `gemini-3.5-flash`
159
+ and stores it in the state file. The media files are processed with
160
+ `gemini-3.1-flash-lite`. Use `--no-rewrite` to skip the rewrite call.
161
+
162
+ Before spending API calls, you can validate the folder and optional CSV:
163
+
164
+ ```powershell
165
+ mllm-annotator `
166
+ --input-folder "C:\data\images" `
167
+ --media-type image `
168
+ --instruction "Caption the attached image." `
169
+ --dry-run
170
+ ```
@@ -0,0 +1,138 @@
1
+ # mllm-annotator
2
+
3
+ A small, resumable tool for sending folders of audio or image files to a
4
+ multimodal LLM for **automatic annotation**, plus an **embedding + UMAP
5
+ visualization** workflow. Gemini is the current backend; the design keeps the
6
+ provider behind a thin seam so others can be added later.
7
+
8
+ It ships both a command-line tool and a desktop GUI.
9
+
10
+ ## Install
11
+
12
+ ```powershell
13
+ # CLI only
14
+ pip install mllm-annotator
15
+
16
+ # with the desktop GUI and the embed/visualize feature
17
+ pip install "mllm-annotator[ui,viz]"
18
+ ```
19
+
20
+ Or, for development from a clone:
21
+
22
+ ```powershell
23
+ uv sync --extra ui --extra viz
24
+ ```
25
+
26
+ The embed/visualize feature also needs **ffmpeg** on your `PATH` to handle
27
+ audio formats Gemini can't embed directly (e.g. `.aac`, `.opus`). It is an
28
+ optional system dependency, not a pip package; without it those files are
29
+ skipped.
30
+
31
+ ## API key
32
+
33
+ Provide a Gemini API key in any one of these ways (checked in this order):
34
+
35
+ 1. environment variable `GEMINI_API_KEY` (or `GOOGLE_API_KEY`):
36
+
37
+ ```powershell
38
+ $env:GEMINI_API_KEY="your_api_key"
39
+ ```
40
+
41
+ 2. a `.env` file in the current working directory:
42
+
43
+ ```text
44
+ GEMINI_API_KEY=your_api_key
45
+ ```
46
+
47
+ 3. saved from inside the GUI — click **API Key**, paste it, and it is stored
48
+ securely in your OS keyring (Windows Credential Manager / macOS Keychain /
49
+ Linux Secret Service). No plaintext file is written.
50
+
51
+ `.env` is ignored by git, and keys are never written into the built package.
52
+
53
+ ## Command line
54
+
55
+ ```powershell
56
+ mllm-annotator --help
57
+ ```
58
+
59
+ ### Examples
60
+
61
+ Horse cough annotation:
62
+
63
+ ```powershell
64
+ mllm-annotator `
65
+ --input-folder "C:\data\horse_audio" `
66
+ --media-type audio `
67
+ --instruction "Annotate if the audio contains a horse cough or another sound such as the horse smacking the microphone." `
68
+ --daily-limit 500
69
+ ```
70
+
71
+ Swiss German transcription validation:
72
+
73
+ ```powershell
74
+ mllm-annotator `
75
+ --input-folder "C:\data\swiss_german_audio" `
76
+ --media-type audio `
77
+ --labels-csv "C:\data\transcriptions.csv" `
78
+ --instruction "Confirm whether the attached Swiss German audio matches the associated transcription. If it is wrong, rewrite the correct transcription." `
79
+ --daily-limit 500
80
+ ```
81
+
82
+ Image captioning:
83
+
84
+ ```powershell
85
+ mllm-annotator `
86
+ --input-folder "C:\data\images" `
87
+ --media-type image `
88
+ --instruction "Caption the attached image." `
89
+ --daily-limit 500
90
+ ```
91
+
92
+ ## Desktop GUI
93
+
94
+ ```powershell
95
+ mllm-annotator-ui
96
+ ```
97
+
98
+ The GUI lets you browse for the data folder, choose audio or image mode,
99
+ optionally select a `filename,label` CSV, write the instruction, preview the
100
+ file table, and start or resume processing. It shows the rewritten prompt and
101
+ updates each row as Gemini responses arrive, using the same JSONL result/state
102
+ files as the CLI. A second tab embeds the media and shows an interactive 2-D
103
+ UMAP projection (zoom/pan toolbar, hover a point for its file name).
104
+
105
+ ## CSV format
106
+
107
+ The optional labels CSV must contain exactly one row per media file and these
108
+ columns:
109
+
110
+ ```csv
111
+ filename,label
112
+ audio_001.wav,expected transcription or label
113
+ audio_002.wav,another label
114
+ ```
115
+
116
+ For `--recursive`, `filename` must be the relative path with forward slashes,
117
+ for example `speaker_a/audio_001.wav`.
118
+
119
+ ## Resume behavior
120
+
121
+ By default, results are appended to `runs/results.jsonl` and progress is saved
122
+ in `runs/state.json`. If the daily limit is reached or the API returns a
123
+ quota/rate-limit error, run the same command again later or the next day — already
124
+ processed files are skipped.
125
+
126
+ The first run rewrites your natural-language instruction with `gemini-3.5-flash`
127
+ and stores it in the state file. The media files are processed with
128
+ `gemini-3.1-flash-lite`. Use `--no-rewrite` to skip the rewrite call.
129
+
130
+ Before spending API calls, you can validate the folder and optional CSV:
131
+
132
+ ```powershell
133
+ mllm-annotator `
134
+ --input-folder "C:\data\images" `
135
+ --media-type image `
136
+ --instruction "Caption the attached image." `
137
+ --dry-run
138
+ ```
@@ -0,0 +1,58 @@
1
+ [project]
2
+ name = "mllm-annotator"
3
+ version = "0.1.0"
4
+ description = "Resumable multimodal-LLM annotator and embedder for folders of audio or image files."
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ license = "MIT"
8
+ license-files = ["LICENSE"]
9
+ authors = [{ name = "Matteo Boi", email = "matteo.boi@unibe.ch" }]
10
+ keywords = [
11
+ "gemini",
12
+ "annotation",
13
+ "labeling",
14
+ "multimodal",
15
+ "audio",
16
+ "image",
17
+ "embeddings",
18
+ "llm",
19
+ ]
20
+ classifiers = [
21
+ "Development Status :: 3 - Alpha",
22
+ "Intended Audience :: Science/Research",
23
+ "Programming Language :: Python :: 3",
24
+ "Programming Language :: Python :: 3.10",
25
+ "Programming Language :: Python :: 3.11",
26
+ "Programming Language :: Python :: 3.12",
27
+ "Topic :: Multimedia",
28
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
29
+ ]
30
+ dependencies = [
31
+ "google-genai>=1.0.0",
32
+ "keyring>=24",
33
+ ]
34
+
35
+ [project.optional-dependencies]
36
+ ui = ["customtkinter>=5.2.0"]
37
+ viz = ["umap-learn>=0.5", "matplotlib>=3.8", "numpy>=1.24"]
38
+ all = ["mllm-annotator[ui,viz]"]
39
+
40
+ [project.urls]
41
+ Homepage = "https://github.com/BoiMat/mllm-annotator"
42
+ Repository = "https://github.com/BoiMat/mllm-annotator"
43
+
44
+ [project.scripts]
45
+ mllm-annotator = "mllm_annotator.core:main"
46
+
47
+ [project.gui-scripts]
48
+ mllm-annotator-ui = "mllm_annotator.ui:main"
49
+
50
+ [build-system]
51
+ requires = ["setuptools>=77"]
52
+ build-backend = "setuptools.build_meta"
53
+
54
+ [tool.setuptools.packages.find]
55
+ where = ["src"]
56
+
57
+ [tool.uv]
58
+ package = true
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,33 @@
1
+ """mllm-annotator: resumable multimodal-LLM media annotation and embedding.
2
+
3
+ Gemini is the first (and currently only) backend; the public surface is kept
4
+ backend-agnostic so additional providers can be added later.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ __version__ = "0.1.0"
10
+
11
+ from .config import (
12
+ BACKENDS,
13
+ Backend,
14
+ ConfigError,
15
+ clear_api_key,
16
+ get_api_key,
17
+ has_api_key,
18
+ store_api_key,
19
+ )
20
+ from .core import MediaItem, RateLimitReached
21
+
22
+ __all__ = [
23
+ "__version__",
24
+ "BACKENDS",
25
+ "Backend",
26
+ "ConfigError",
27
+ "MediaItem",
28
+ "RateLimitReached",
29
+ "get_api_key",
30
+ "has_api_key",
31
+ "store_api_key",
32
+ "clear_api_key",
33
+ ]
@@ -0,0 +1,151 @@
1
+ """API key resolution and secure storage, per backend.
2
+
3
+ For each backend, a key is resolved in order from:
4
+
5
+ 1. an environment variable (e.g. ``GEMINI_API_KEY`` / ``GOOGLE_API_KEY``);
6
+ 2. a ``.env`` file in the current working directory;
7
+ 3. the OS keyring (Windows Credential Manager / macOS Keychain / Secret
8
+ Service), where the GUI stores keys the user enters.
9
+
10
+ The keyring is optional at runtime: if the ``keyring`` package or its backend
11
+ is unavailable, that step is skipped and only the environment/``.env`` is used.
12
+
13
+ Backends are registered in ``BACKENDS``. Gemini is the only one today; adding
14
+ another is just a new ``Backend`` entry plus its client wiring — key storage,
15
+ resolution, and the GUI dialog all pick it up automatically.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import os
21
+ from dataclasses import dataclass
22
+ from pathlib import Path
23
+ from typing import Any
24
+
25
# Service name under which keys are filed in the OS keyring; the backend id is
# passed alongside it as the second argument of every keyring call below.
KEYRING_SERVICE = "mllm-annotator"
26
+
27
+
28
class ConfigError(Exception):
    """Signal inconsistent or incomplete user-provided configuration.

    Within this module it is raised for an unknown backend id, a missing API
    key, and a keyring that cannot store a key.
    """
30
+
31
+
32
@dataclass(frozen=True)
class Backend:
    """Immutable description of a model provider that requires an API key."""

    # Registry key; also the identifier passed to the keyring for this provider.
    id: str
    # Human-readable provider name, used in error messages.
    label: str
    # Environment variable names consulted, in order, for the provider's key.
    env_vars: tuple[str, ...]
39
+
40
+
41
# The only provider shipped today. GEMINI_API_KEY takes precedence over
# GOOGLE_API_KEY because env_vars is scanned in order.
GEMINI = Backend(
    id="gemini",
    label="Google Gemini",
    env_vars=("GEMINI_API_KEY", "GOOGLE_API_KEY"),
)

# Registry of known providers, keyed by Backend.id.
BACKENDS: dict[str, Backend] = {backend.id: backend for backend in (GEMINI,)}
# Backend used whenever a caller does not name one explicitly.
DEFAULT_BACKEND = GEMINI.id
49
+
50
+
51
def _backend(backend_id: str) -> Backend:
    """Look up *backend_id* in ``BACKENDS``.

    Raises:
        ConfigError: if no backend is registered under *backend_id*.
    """
    found = BACKENDS.get(backend_id)
    if found is None:
        raise ConfigError(f"Unknown backend: {backend_id}")
    return found
56
+
57
+
58
def load_env_file(path: Path) -> None:
    """Load ``KEY=VALUE`` lines from *path* into os.environ (real env wins).

    Blank lines, ``#`` comment lines, and lines without ``=`` are ignored. A
    shell-style ``export KEY=VALUE`` line is accepted, and exactly one pair of
    matching surrounding quotes is removed from the value. Variables that are
    already set in the environment are never overwritten.

    A missing, non-file, unreadable, or undecodable *path* is treated as
    absent: the ``.env`` file is an optional key source, so it must not make
    key resolution fail.
    """
    # is_file() is already False for a path that does not exist.
    if not path.is_file():
        return
    try:
        # utf-8-sig drops a leading BOM so it cannot end up in the first key.
        text = path.read_text(encoding="utf-8-sig")
    except (OSError, UnicodeDecodeError):
        return
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, _, value = line.partition("=")
        name_parts = key.split()
        if len(name_parts) == 2 and name_parts[0] == "export":
            # Accept the `export KEY=VALUE` form used by shell-sourced files.
            key = name_parts[1]
        else:
            key = key.strip()
        value = value.strip()
        # Remove one matching pair of quotes only; stripping every leading and
        # trailing quote character would also eat quotes that are part of the
        # value itself (nested or unbalanced quotes).
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
            value = value[1:-1]
        if key:
            os.environ.setdefault(key, value)
71
+
72
+
73
+ def _keyring() -> Any | None:
74
+ """Return the keyring module, or None if it (or its backend) is unusable."""
75
+ try:
76
+ import keyring
77
+ except Exception:
78
+ return None
79
+ return keyring
80
+
81
+
82
def _keyring_get(backend_id: str) -> str | None:
    """Read *backend_id*'s key from the OS keyring.

    Returns None when the keyring module is unavailable or the lookup raises.
    """
    module = _keyring()
    if module is None:
        return None
    try:
        secret = module.get_password(KEYRING_SERVICE, backend_id)
    except Exception:
        # A failing keyring lookup is treated the same as "no key stored".
        secret = None
    return secret
90
+
91
+
92
def _env_key(backend: Backend) -> str | None:
    """Return the first non-empty value among *backend*'s environment variables.

    The ``.env`` file in the current working directory is loaded into
    ``os.environ`` first, so its entries count as environment values here.
    Returns None when every variable is unset or empty.
    """
    load_env_file(Path.cwd() / ".env")
    candidates = (os.environ.get(name) for name in backend.env_vars)
    return next((value for value in candidates if value), None)
99
+
100
+
101
def get_api_key(backend_id: str = DEFAULT_BACKEND) -> str:
    """Resolve the API key for *backend_id*.

    The environment (including ``.env``) is consulted first; the OS keyring is
    only queried when that yields nothing.

    Raises:
        ConfigError: if *backend_id* is unknown or no key can be found; the
            message tells the user where a key can be provided.
    """
    backend = _backend(backend_id)
    key = _env_key(backend) or _keyring_get(backend.id)
    if not key:
        env_names = " or ".join(backend.env_vars)
        raise ConfigError(
            f"No API key set for {backend.label}. Set {env_names}, "
            "add it to a .env file, or save it in the app (the 'Set API Key' button, "
            "top-left)."
        )
    return key
112
+
113
+
114
def key_source(backend_id: str = DEFAULT_BACKEND) -> str | None:
    """Name the place *backend_id*'s key is resolved from.

    Returns ``"environment"`` (which also covers a ``.env`` entry, since that
    file is loaded into ``os.environ``), ``"keyring"``, or None when no key is
    available.

    Raises:
        ConfigError: if *backend_id* is unknown.
    """
    backend = _backend(backend_id)
    if _env_key(backend):
        return "environment"
    return "keyring" if _keyring_get(backend.id) else None
122
+
123
+
124
def has_api_key(backend_id: str = DEFAULT_BACKEND) -> bool:
    """Report whether any key source currently yields a key for *backend_id*."""
    source = key_source(backend_id)
    return source is not None
126
+
127
+
128
def store_api_key(key: str, backend_id: str = DEFAULT_BACKEND) -> None:
    """Persist *key* for *backend_id* in the OS keyring.

    Surrounding whitespace (for example a newline picked up when pasting) is
    removed before the key is stored.

    Raises:
        ConfigError: if *backend_id* is unknown, *key* is blank, the keyring
            module is unavailable, or the keyring refuses the write.
    """
    _backend(backend_id)  # validate
    cleaned = key.strip()
    if not cleaned:
        # A blank entry would be stored but never resolve as a usable key.
        raise ConfigError("Cannot save an empty API key.")
    kr = _keyring()
    if kr is None:
        raise ConfigError(
            "Cannot save the API key: the 'keyring' package or its OS backend is "
            "unavailable. Set an environment variable or use a .env file instead."
        )
    try:
        kr.set_password(KEYRING_SERVICE, backend_id, cleaned)
    except Exception as exc:  # noqa: BLE001 - surface backend failures clearly.
        raise ConfigError(f"Could not save the API key to the keyring: {exc}") from exc
141
+
142
+
143
def clear_api_key(backend_id: str = DEFAULT_BACKEND) -> None:
    """Delete *backend_id*'s key from the OS keyring, ignoring any failure.

    This is best effort: an unavailable keyring module or an error raised by
    the delete call is silently ignored. *backend_id* is not validated against
    the backend registry.
    """
    module = _keyring()
    if module is not None:
        try:
            module.delete_password(KEYRING_SERVICE, backend_id)
        except Exception:
            # Deliberately swallowed so that clearing never raises.
            pass