alt-text-llm 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of alt-text-llm might be problematic. Click here for more details.
- alt_text_llm/__init__.py +13 -0
- alt_text_llm/generate.py +208 -0
- alt_text_llm/label.py +347 -0
- alt_text_llm/main.py +235 -0
- alt_text_llm/scan.py +219 -0
- alt_text_llm/utils.py +515 -0
- alt_text_llm-0.1.0.dist-info/METADATA +181 -0
- alt_text_llm-0.1.0.dist-info/RECORD +12 -0
- alt_text_llm-0.1.0.dist-info/WHEEL +5 -0
- alt_text_llm-0.1.0.dist-info/entry_points.txt +2 -0
- alt_text_llm-0.1.0.dist-info/licenses/LICENSE +21 -0
- alt_text_llm-0.1.0.dist-info/top_level.txt +1 -0
alt_text_llm/utils.py
ADDED
|
@@ -0,0 +1,515 @@
|
|
|
1
|
+
"""Shared utilities for alt text generation and labeling."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import shutil
|
|
5
|
+
import subprocess
|
|
6
|
+
import textwrap
|
|
7
|
+
from dataclasses import asdict, dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import (
|
|
10
|
+
TYPE_CHECKING,
|
|
11
|
+
Collection,
|
|
12
|
+
Dict,
|
|
13
|
+
Iterable,
|
|
14
|
+
Optional,
|
|
15
|
+
Sequence,
|
|
16
|
+
)
|
|
17
|
+
from urllib.parse import urlparse
|
|
18
|
+
|
|
19
|
+
import git
|
|
20
|
+
import requests
|
|
21
|
+
from ruamel.yaml import YAML, YAMLError
|
|
22
|
+
|
|
23
|
+
if TYPE_CHECKING:
|
|
24
|
+
from alt_text_llm import scan
|
|
25
|
+
|
|
26
|
+
_executable_cache: Dict[str, str] = {}
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def find_executable(name: str) -> str:
    """
    Find and cache the absolute path of an executable.

    Args:
        name: The name of the executable to find.

    Returns:
        The absolute path to the executable.

    Raises:
        FileNotFoundError: If the executable cannot be found.
    """
    # Fast path: resolved once already during this process.
    cached = _executable_cache.get(name)
    if cached:
        return cached

    resolved = shutil.which(name)
    if not resolved:
        raise FileNotFoundError(
            f"Executable '{name}' not found. Please ensure it is in your PATH."
        )

    _executable_cache[name] = resolved
    return resolved
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def get_git_root(starting_dir: Optional[Path] = None) -> Path:
    """
    Returns the absolute path to the top-level directory of the Git repository.

    Args:
        starting_dir: Directory from which to start searching for the Git root.

    Returns:
        Path: Absolute path to the Git repository root.

    Raises:
        RuntimeError: If Git root cannot be determined.
    """
    git_executable = find_executable("git")
    try:
        completed_process = subprocess.run(
            [git_executable, "rev-parse", "--show-toplevel"],
            capture_output=True,
            text=True,
            check=True,
            cwd=starting_dir if starting_dir else Path.cwd(),
        )
    except subprocess.CalledProcessError as err:
        # With check=True a nonzero exit raises, so the original
        # `returncode == 0` test could never be false and its RuntimeError
        # branch was dead code. Translate the failure into the RuntimeError
        # promised by the docstring, preserving the cause for debugging.
        raise RuntimeError("Failed to get Git root") from err
    return Path(completed_process.stdout.strip())
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def get_files(
    dir_to_search: Optional[Path] = None,
    filetypes_to_match: Collection[str] = (".md",),
    use_git_ignore: bool = True,
    ignore_dirs: Optional[Collection[str]] = None,
) -> tuple[Path, ...]:
    """
    Returns a tuple of all files in the specified directory of the Git
    repository.

    Args:
        dir_to_search: A directory to search for files.
        filetypes_to_match: A collection of file types to search for.
        use_git_ignore: Whether to exclude files based on .gitignore.
        ignore_dirs: Directory names to ignore.

    Returns:
        tuple[Path, ...]: A tuple of all matching files.
    """
    matched: list[Path] = []
    if dir_to_search is not None:
        for suffix in filetypes_to_match:
            matched.extend(dir_to_search.rglob(f"*{suffix}"))

    # Drop anything living under one of the ignored directory names.
    if ignore_dirs:
        matched = [
            path
            for path in matched
            if not any(ignored in path.parts for ignored in ignore_dirs)
        ]

    if use_git_ignore:
        try:
            root = get_git_root(starting_dir=dir_to_search)
            repo = git.Repo(root)
            # Paths must be relative to the repo root for .gitignore checks.
            matched = [
                path
                for path in matched
                if not repo.ignored(path.relative_to(root))
            ]
        except (
            git.GitCommandError,
            ValueError,
            RuntimeError,
            subprocess.CalledProcessError,
        ):
            # Git is unavailable or the paths sit outside the repo;
            # fall back to the unfiltered listing.
            pass
    return tuple(matched)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def split_yaml(file_path: Path, verbose: bool = False) -> tuple[dict, str]:
    """
    Split a markdown file into its YAML frontmatter and content.

    Args:
        file_path: Path to the markdown file
        verbose: Whether to print error messages

    Returns:
        Tuple of (metadata dict, content string). Both are empty when no
        frontmatter is found or the YAML fails to parse.
    """
    yaml = YAML(
        typ="rt"
    )  # 'rt' means round-trip, preserving comments and formatting
    yaml.preserve_quotes = True  # Preserve quote style

    with file_path.open("r", encoding="utf-8") as f:
        content = f.read()

    # Split frontmatter and content on the first two "---" markers.
    # NOTE(review): this assumes the file starts with "---"; a stray "---"
    # appearing before real frontmatter would mis-split — confirm inputs.
    parts = content.split("---", 2)
    if len(parts) < 3:
        if verbose:
            print(f"Skipping {file_path}: No valid frontmatter found")
        return {}, ""

    try:
        metadata = yaml.load(parts[1])
        if not metadata:
            # An empty frontmatter block parses to None; normalize to {}.
            metadata = {}
    except YAMLError as e:
        print(f"Error parsing YAML in {file_path}: {str(e)}")
        return {}, ""

    return metadata, parts[2]
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def is_url(path: str) -> bool:
    """Check if path is a URL."""
    components = urlparse(path)
    # A real URL needs both a scheme (e.g. "https") and a network
    # location (host); plain filesystem paths have neither.
    has_scheme = bool(components.scheme)
    has_host = bool(components.netloc)
    return has_scheme and has_host
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _parse_paragraphs(
|
|
180
|
+
lines: Sequence[str],
|
|
181
|
+
) -> tuple[list[list[str]], list[int]]:
|
|
182
|
+
"""Parse lines into paragraphs and their start indices."""
|
|
183
|
+
paragraphs: list[list[str]] = []
|
|
184
|
+
paragraph_starts: list[int] = []
|
|
185
|
+
current: list[str] = []
|
|
186
|
+
|
|
187
|
+
for idx, line in enumerate(lines):
|
|
188
|
+
if line.strip() == "":
|
|
189
|
+
if current:
|
|
190
|
+
paragraphs.append(current)
|
|
191
|
+
paragraph_starts.append(idx - len(current))
|
|
192
|
+
current = []
|
|
193
|
+
else:
|
|
194
|
+
current.append(line.rstrip("\n"))
|
|
195
|
+
|
|
196
|
+
if current:
|
|
197
|
+
paragraphs.append(current)
|
|
198
|
+
paragraph_starts.append(len(lines) - len(current))
|
|
199
|
+
|
|
200
|
+
return paragraphs, paragraph_starts
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _find_target_paragraph(
|
|
204
|
+
lines: Sequence[str],
|
|
205
|
+
target_idx: int,
|
|
206
|
+
paragraphs: list[list[str]],
|
|
207
|
+
paragraph_starts: list[int],
|
|
208
|
+
) -> int | None:
|
|
209
|
+
"""Find the paragraph index for the target line."""
|
|
210
|
+
selected_line = lines[target_idx] if target_idx < len(lines) else ""
|
|
211
|
+
|
|
212
|
+
if selected_line.strip() != "":
|
|
213
|
+
selected_stripped = selected_line.rstrip("\n")
|
|
214
|
+
for i, paragraph in enumerate(paragraphs):
|
|
215
|
+
if selected_stripped in paragraph:
|
|
216
|
+
return i
|
|
217
|
+
else:
|
|
218
|
+
for i, start in enumerate(paragraph_starts):
|
|
219
|
+
if start > target_idx:
|
|
220
|
+
return i
|
|
221
|
+
return None
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def paragraph_context(
    lines: Sequence[str],
    target_idx: int,
    max_before: int | None = None,
    max_after: int = 2,
) -> str:
    """
    Return a slice of text around *target_idx* in **paragraph** units.

    A *paragraph* is any non-empty run of lines separated by at least one
    blank line. The snippet contains up to *max_before* paragraphs before
    the target (``None`` means unlimited, ``0`` means none), the target
    paragraph itself, and up to *max_after* paragraphs after it (``0``
    means none).

    A blank line at *target_idx* selects the **next** paragraph as the
    target. Out-of-bounds requests, or requests pointing past the last
    paragraph, yield an empty string instead of raising. Original line
    formatting (Markdown, punctuation, etc.) is preserved.

    Raises:
        ValueError: If any numeric argument is negative.
    """
    bad_target = target_idx < 0
    bad_before = max_before is not None and max_before < 0
    bad_after = max_after < 0
    if bad_target or bad_before or bad_after:  # pragma: no cover
        raise ValueError(
            f"{target_idx=}, {max_before=}, and {max_after=} must be non-negative"
        )

    paragraphs, paragraph_starts = _parse_paragraphs(lines)
    par_idx = _find_target_paragraph(
        lines, target_idx, paragraphs, paragraph_starts
    )
    if par_idx is None:
        return ""

    # max_before == 0 falls out of the max() arithmetic naturally.
    first = 0 if max_before is None else max(0, par_idx - max_before)
    last = min(len(paragraphs), par_idx + max_after + 1)

    # Re-join each paragraph's lines, separating paragraphs by one blank line.
    chunks = ["\n".join(paragraph) for paragraph in paragraphs[first:last]]
    return "\n\n".join(chunks).strip()
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
@dataclass(slots=True)
class AltGenerationResult:
    """Container for AI-generated alt text suggestions."""

    # Markdown file the asset was referenced from.
    markdown_file: str
    # Path or URL of the asset as written in the markdown.
    asset_path: str
    # Alt text proposed by the LLM.
    suggested_alt: str
    # Name of the model that produced the suggestion.
    model: str
    # Surrounding article text that was supplied as prompt context.
    context_snippet: str
    # Line number of the asset reference (presumably 1-based, matching
    # scan.QueueItem.line_number — confirm against scan.py).
    line_number: int
    # Human-approved alt text; None until labeling has happened.
    final_alt: str | None = None

    def to_json(self) -> dict[str, object]:
        """Convert to JSON-serializable dict."""
        return asdict(self)
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
class AltGenerationError(Exception):
    """Raised when caption generation fails.

    Raised by the asset-conversion helpers (AVIF→PNG, GIF→MP4) when the
    underlying tool invocation fails.
    """
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def _convert_avif_to_png(asset_path: Path, workspace: Path) -> Path:
    """Convert AVIF images to PNG format for LLM compatibility."""
    # Non-AVIF inputs pass through untouched.
    if asset_path.suffix.lower() != ".avif":
        return asset_path

    converted = workspace / f"{asset_path.stem}.png"
    magick = find_executable("magick")
    command = [magick, str(asset_path), str(converted)]

    try:
        subprocess.run(command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as err:
        detail = err.stderr or err.stdout
        raise AltGenerationError(
            f"Failed to convert AVIF to PNG: {detail}"
        ) from err
    return converted
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def _convert_gif_to_mp4(asset_path: Path, workspace: Path) -> Path:
    """Convert GIF files to MP4 format for LLM compatibility."""
    if asset_path.suffix.lower() != ".gif":
        raise ValueError(f"Unsupported file type '{asset_path.suffix}'.")

    converted = workspace / f"{asset_path.stem}.mp4"
    ffmpeg = find_executable("ffmpeg")
    # The scale filter rounds width/height down to even numbers,
    # presumably to satisfy encoder even-dimension requirements — confirm.
    command = [
        ffmpeg,
        "-i",
        str(asset_path),
        "-vf",
        "scale=trunc(iw/2)*2:trunc(ih/2)*2",
        "-y",
        str(converted),
    ]

    try:
        subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
            timeout=30,
        )
    except (subprocess.CalledProcessError, subprocess.TimeoutExpired) as err:
        raise AltGenerationError(
            f"Failed to convert GIF to MP4: {err}"
        ) from err
    return converted
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def _convert_asset_for_llm(asset_path: Path, workspace: Path) -> Path:
|
|
358
|
+
"""Converts asset to a format compatible with the LLM if needed."""
|
|
359
|
+
if asset_path.suffix.lower() == ".avif":
|
|
360
|
+
return _convert_avif_to_png(asset_path, workspace)
|
|
361
|
+
if asset_path.suffix.lower() == ".gif":
|
|
362
|
+
return _convert_gif_to_mp4(asset_path, workspace)
|
|
363
|
+
return asset_path
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def download_asset(queue_item: "scan.QueueItem", workspace: Path) -> Path:
    """
    Download or locate asset file, returning path to accessible copy.

    Remote URLs are streamed into *workspace*; local paths are resolved
    first relative to the markdown file, then relative to the git root.
    The result is passed through ``_convert_asset_for_llm`` so AVIF/GIF
    assets come back in an LLM-compatible format.

    Args:
        queue_item: Queued asset reference (provides ``asset_path`` and
            ``markdown_file``).
        workspace: Directory that receives downloaded/converted files.

    Returns:
        Path to a locally accessible (possibly converted) copy.

    Raises:
        FileNotFoundError: If a local asset cannot be located.
        requests.HTTPError: If downloading a remote asset fails.
    """
    asset_path = queue_item.asset_path

    if is_url(asset_path):
        # Browser-like User-Agent — presumably to avoid trivial bot
        # blocking on some hosts; confirm whether still needed.
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/91.0.4472.124 Safari/537.36"
            )
        }
        response = requests.get(
            asset_path, timeout=20, stream=True, headers=headers
        )
        response.raise_for_status()
        # Keep the URL's extension so converters can dispatch on suffix.
        suffix = Path(urlparse(asset_path).path).suffix or ".bin"
        target = workspace / f"asset{suffix}"
        with target.open("wb") as handle:
            for chunk in response.iter_content(chunk_size=8192):
                handle.write(chunk)
        return _convert_asset_for_llm(target, workspace)

    # Try relative to markdown file first
    markdown_path = Path(queue_item.markdown_file)
    candidate = markdown_path.parent / asset_path
    if candidate.exists():
        return _convert_asset_for_llm(candidate.resolve(), workspace)

    # Try relative to git root (site-absolute paths like "/static/img.png")
    git_root = get_git_root()
    alternative = git_root / asset_path.lstrip("/")
    if alternative.exists():
        return _convert_asset_for_llm(alternative.resolve(), workspace)

    raise FileNotFoundError(
        f"Unable to locate asset '{asset_path}' referenced in {queue_item.markdown_file}"
    )
|
|
404
|
+
|
|
405
|
+
|
|
406
|
+
def generate_article_context(
    queue_item: "scan.QueueItem",
    max_before: int | None = None,
    max_after: int = 2,
    trim_frontmatter: bool = False,
) -> str:
    """
    Generate paragraph context around the asset's line for LLM prompts.

    Defaults include every preceding paragraph and two following ones.
    With *trim_frontmatter*, the YAML frontmatter is dropped and the
    target line number is re-based onto the trimmed content.
    """
    markdown_path = Path(queue_item.markdown_file)
    all_lines = markdown_path.read_text(encoding="utf-8").splitlines()

    # queue_item.line_number is 1-based; paragraph_context expects 0-based.
    target_index = queue_item.line_number - 1
    context_lines = all_lines

    if trim_frontmatter:
        # Try to split YAML frontmatter and keep only the article body.
        _, body = split_yaml(markdown_path, verbose=False)
        if body.strip():
            context_lines = body.splitlines()
            # Shift the target index by however many lines were removed
            # from the top of the file.
            removed = len(all_lines) - len(context_lines)
            target_index = queue_item.line_number - 1 - removed

    return paragraph_context(
        context_lines,
        target_index,
        max_before=max_before,
        max_after=max_after,
    )
|
|
440
|
+
|
|
441
|
+
|
|
442
|
+
def build_prompt(
    queue_item: "scan.QueueItem",
    max_chars: int,
) -> str:
    """Build prompt for LLM caption generation."""
    instructions = textwrap.dedent(
        """
        Generate concise alt text for accessibility and SEO.
        Describe the intended information of the image clearly and accurately.
        """
    ).strip()

    # Pull in the surrounding article text so the model sees the image's
    # role in the document, not just the pixels.
    article_context = generate_article_context(
        queue_item, trim_frontmatter=False
    )
    requirements = textwrap.dedent(
        f"""
        Context from {queue_item.markdown_file}:
        {article_context}

        Critical requirements:
        - Under {max_chars} characters (aim for 1-2 sentences when possible)
        - Do not include redundant information (e.g. "image of", "picture of", "diagram illustrating", "a diagram of")
        - Return only the alt text, no quotes
        - For text-heavy images: transcribe key text content, then describe visual elements
        - Don't reintroduce acronyms
        - Don't describe purely visual elements unless directly relevant for
        understanding the content (e.g. don't say "the line in this scientific chart is green")
        - Describe spatial relationships and visual hierarchy when important

        Prioritize completeness over brevity - include both textual content and visual description as needed.
        While thinking quietly, propose a candidate alt text. Then critique the candidate alt text—
        does it accurately describe the information the image is meant to convey?
        Incorporate the critique into the alt text to improve it. Only output the improved alt text.
        """
    ).strip()

    return f"{instructions}\n{requirements}"
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
def load_existing_captions(captions_path: Path) -> set[str]:
    """Load existing asset paths from captions file."""
    try:
        raw = captions_path.read_text(encoding="utf-8")
        entries = json.loads(raw)
        return {
            entry["asset_path"] for entry in entries if "asset_path" in entry
        }
    except (FileNotFoundError, json.JSONDecodeError, KeyError, TypeError):
        # Missing, malformed, or unexpectedly-shaped file: no captions yet.
        return set()
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
def write_output(
    results: Iterable[AltGenerationResult],
    output_path: Path,
    append_mode: bool = False,
) -> None:
    """
    Write results to JSON file.

    With *append_mode*, list content already in *output_path* is kept and
    the new results are appended after it; a corrupted existing file is
    simply overwritten with the new data.
    """
    payload = [item.to_json() for item in results]

    if append_mode and output_path.exists():
        try:
            existing = json.loads(output_path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, TypeError):
            # If existing file is corrupted, just use new data
            print(f"Existing file {output_path} is corrupted, using new data")
        else:
            # Only merge when the file really holds a JSON list.
            if isinstance(existing, list):
                payload = existing + payload

    print(f"Writing {len(payload)} results to {output_path}")
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    output_path.write_text(serialized, encoding="utf-8")
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: alt-text-llm
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: AI-powered alt text generation and labeling tools for markdown content
|
|
5
|
+
Author: TurnTrout
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Repository, https://github.com/alexander-turner/alt-text-llm
|
|
8
|
+
Keywords: alt-text,accessibility,markdown,llm,ai
|
|
9
|
+
Requires-Python: >=3.11
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Requires-Dist: gitpython
|
|
13
|
+
Requires-Dist: requests
|
|
14
|
+
Requires-Dist: ruamel.yaml
|
|
15
|
+
Requires-Dist: markdown-it-py
|
|
16
|
+
Requires-Dist: rich
|
|
17
|
+
Requires-Dist: tqdm
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest; extra == "dev"
|
|
20
|
+
Requires-Dist: mypy; extra == "dev"
|
|
21
|
+
Requires-Dist: types-requests; extra == "dev"
|
|
22
|
+
Dynamic: license-file
|
|
23
|
+
|
|
24
|
+
# alt-text-llm
|
|
25
|
+
|
|
26
|
+
AI-powered alt text generation and labeling tools for markdown content. Originally developed for [my website](https://turntrout.com/design) ([repo](https://github.com/alexander-turner/TurnTrout.com)).
|
|
27
|
+
|
|
28
|
+
## Installation
|
|
29
|
+
|
|
30
|
+
### Quick install from GitHub
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
pip install git+https://github.com/alexander-turner/alt-text-llm.git
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
### Automated setup (includes system dependencies)
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
git clone https://github.com/alexander-turner/alt-text-llm.git
|
|
40
|
+
cd alt-text-llm
|
|
41
|
+
./setup.sh
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## Prerequisites
|
|
45
|
+
|
|
46
|
+
The following command-line tools must be installed:
|
|
47
|
+
|
|
48
|
+
- **`llm`** - LLM interface ([install instructions](https://llm.datasette.io/))
|
|
49
|
+
- **`git`** - Version control
|
|
50
|
+
- **`magick`** (ImageMagick) - Image processing
|
|
51
|
+
- **`ffmpeg`** - Video processing
|
|
52
|
+
- **`imgcat`** - Terminal image display
|
|
53
|
+
|
|
54
|
+
**macOS:**
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
brew install imagemagick ffmpeg imgcat
|
|
58
|
+
pip install llm
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
**Linux:**
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
sudo apt-get install imagemagick ffmpeg
|
|
65
|
+
pip install llm
|
|
66
|
+
# imgcat: curl -sL https://iterm2.com/utilities/imgcat -o ~/.local/bin/imgcat && chmod +x ~/.local/bin/imgcat
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Usage
|
|
70
|
+
|
|
71
|
+
The tool provides three main commands: `scan`, `generate`, and `label`.
|
|
72
|
+
|
|
73
|
+
### 1. Scan for missing alt text
|
|
74
|
+
|
|
75
|
+
Scan your markdown files to find images without meaningful alt text:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
alt-text-llm scan --root /path/to/markdown/files
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
This creates `asset_queue.json` with all assets needing alt text.
|
|
82
|
+
|
|
83
|
+
### 2. Generate AI suggestions
|
|
84
|
+
|
|
85
|
+
Generate alt text suggestions using an LLM:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
alt-text-llm generate \
|
|
89
|
+
--root /path/to/markdown/files \
|
|
90
|
+
--model gemini-2.5-flash \
|
|
91
|
+
--suggestions-file suggested_alts.json
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
**Available options:**
|
|
95
|
+
|
|
96
|
+
- `--model` (required) - LLM model to use (e.g., `gemini-2.5-flash`, `gpt-4o-mini`, `claude-3-5-sonnet`)
|
|
97
|
+
- `--max-chars` - Maximum characters for alt text (default: 300)
|
|
98
|
+
- `--timeout` - LLM timeout in seconds (default: 120)
|
|
99
|
+
- `--estimate-only` - Only show cost estimate without generating
|
|
100
|
+
- `--process-existing` - Also process assets that already have captions
|
|
101
|
+
|
|
102
|
+
**Cost estimation:**
|
|
103
|
+
|
|
104
|
+
```bash
|
|
105
|
+
alt-text-llm generate \
|
|
106
|
+
--root /path/to/markdown/files \
|
|
107
|
+
--model gemini-2.5-flash \
|
|
108
|
+
--estimate-only
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### 3. Label and approve suggestions
|
|
112
|
+
|
|
113
|
+
Interactively review and approve the AI-generated suggestions:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
alt-text-llm label \
|
|
117
|
+
--suggestions-file suggested_alts.json \
|
|
118
|
+
--output asset_captions.json
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
**Interactive commands:**
|
|
122
|
+
|
|
123
|
+
- Edit the suggested alt text (vim keybindings enabled)
|
|
124
|
+
- Press Enter to accept the suggestion as-is
|
|
125
|
+
- Submit `undo` or `u` to go back to the previous item
|
|
126
|
+
- Images display in your terminal (requires `imgcat`)
|
|
127
|
+
|
|
128
|
+
## Example workflow
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
# 1. Scan markdown files for missing alt text
|
|
132
|
+
alt-text-llm scan --root ./content
|
|
133
|
+
|
|
134
|
+
# 2. Estimate the cost
|
|
135
|
+
alt-text-llm generate \
|
|
136
|
+
--root ./content \
|
|
137
|
+
--model gemini-2.5-flash \
|
|
138
|
+
--estimate-only
|
|
139
|
+
|
|
140
|
+
# 3. Generate suggestions (if cost is acceptable)
|
|
141
|
+
alt-text-llm generate \
|
|
142
|
+
--root ./content \
|
|
143
|
+
--model gemini-2.5-flash
|
|
144
|
+
|
|
145
|
+
# 4. Review and approve suggestions
|
|
146
|
+
alt-text-llm label
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Configuration
|
|
150
|
+
|
|
151
|
+
### LLM Integration
|
|
152
|
+
|
|
153
|
+
This tool uses the [`llm` CLI tool](https://llm.datasette.io/) to generate alt text. This provides access to many different AI models including:
|
|
154
|
+
|
|
155
|
+
- **Gemini** (Google) via the [llm-gemini plugin](https://github.com/simonw/llm-gemini)
|
|
156
|
+
- **Claude** (Anthropic) via the [llm-claude-3 plugin](https://github.com/tomviner/llm-claude-3)
|
|
157
|
+
- And [many more via plugins](https://llm.datasette.io/en/stable/plugins/directory.html)
|
|
158
|
+
|
|
159
|
+
### Setting up your model
|
|
160
|
+
|
|
161
|
+
**For Gemini models (default):**
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
llm install llm-gemini
|
|
165
|
+
llm keys set gemini # enter API key
|
|
166
|
+
llm -m gemini-2.5-flash "Hello, world!"
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
**For other models:**
|
|
170
|
+
|
|
171
|
+
1. Install the appropriate llm plugin (e.g., `llm install llm-openai`)
|
|
172
|
+
2. Configure your API key (e.g., `llm keys set openai`)
|
|
173
|
+
3. Use the model name with `--model` flag (e.g., `--model gpt-4o-mini`)
|
|
174
|
+
|
|
175
|
+
See the [llm documentation](https://llm.datasette.io/en/stable/setup.html) for setup instructions and the [plugin directory](https://llm.datasette.io/en/stable/plugins/directory.html) for available models.
|
|
176
|
+
|
|
177
|
+
## Output files
|
|
178
|
+
|
|
179
|
+
- `asset_queue.json` - Queue of assets needing alt text (from `scan`)
|
|
180
|
+
- `suggested_alts.json` - AI-generated suggestions (from `generate`)
|
|
181
|
+
- `asset_captions.json` - Approved final captions (from `label`)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
alt_text_llm/__init__.py,sha256=vkNaW0Zx2C7JtXD9nG7NHFWBFYqYZ_iECgRtdJP4f5A,222
|
|
2
|
+
alt_text_llm/generate.py,sha256=dYLQMzF9qS4cNoyH4v4_mIZZa2bWeqoVpXYBnw2zlu0,6550
|
|
3
|
+
alt_text_llm/label.py,sha256=XvPINQfW-NFcxTbaa0rdaVKK2P6gE6UqrnIEDXV8T5k,11295
|
|
4
|
+
alt_text_llm/main.py,sha256=CQsRnwP2u2Jca4Kdj73DBntjYND_OUd1nkKxHv4qwQs,7146
|
|
5
|
+
alt_text_llm/scan.py,sha256=fOhfJb5rKLQejFaj1iCAu0vrqIe_bKx08jkeYXFGd-E,6233
|
|
6
|
+
alt_text_llm/utils.py,sha256=4xMFXviMvVB4XXZdMN-VeUB1TefdjpNpWQsWVBYCWMA,16418
|
|
7
|
+
alt_text_llm-0.1.0.dist-info/licenses/LICENSE,sha256=VCpqtaN5u5ulLyhFHpAIKHfYLkMYubaYtpK2m1Bss6c,1085
|
|
8
|
+
alt_text_llm-0.1.0.dist-info/METADATA,sha256=MYgTZlNC_6a9br6fLi19DWcEamB-ahXIk2vkR_UVLHg,4978
|
|
9
|
+
alt_text_llm-0.1.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
10
|
+
alt_text_llm-0.1.0.dist-info/entry_points.txt,sha256=SQyNVYF_LXPoleopqGrZOyR878rKcmGtUS9gIhNLRpY,56
|
|
11
|
+
alt_text_llm-0.1.0.dist-info/top_level.txt,sha256=SJh1xf4GM9seHJryaePMI469CUtALg30wM22vUIqnw4,13
|
|
12
|
+
alt_text_llm-0.1.0.dist-info/RECORD,,
|