memdex 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2427 @@
1
+ #!/usr/bin/env python3
2
+ """Project-level semantic retrieval helper.
3
+
4
+ This script intentionally depends only on Python stdlib for the control plane.
5
+ It shells out to `npx repomix`, `notebooklm`, `git`, and `rg` when needed.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import argparse
11
+ import concurrent.futures
12
+ import contextlib
13
+ import datetime as dt
14
+ import errno
15
+ import fnmatch
16
+ import hashlib
17
+ import json
18
+ import os
19
+ import re
20
+ import shlex
21
+ import shutil
22
+ import subprocess
23
+ import sys
24
+ import threading
25
+ import time
26
+ from pathlib import Path
27
+ from typing import Any
28
+
29
+
30
# Directory under the repo root that holds the config file, the lock file and
# (by default) the bundle cache.
CONFIG_DIR = ".memdex"
# Preferred config file name inside the config directory.
CONFIG_JSON = "config.json"
# Local state file name; it is looked up next to whichever config file is in use.
STATE_JSON = "state.local.json"
# NOTE(review): not referenced in the visible part of the file; presumably
# records an in-progress upload - confirm against its users.
PENDING_UPLOAD_JSON = "pending-upload.local.json"
# Default prefix used when deriving a notebook title ("<prefix>:<project>").
DEFAULT_NOTEBOOK_TITLE_PREFIX = "memdex"
# Absolute, resolved path of this script; used to render re-run commands.
SCRIPT_PATH = Path(__file__).resolve()
# Environment variables that override the command shown in rendered hints;
# the legacy name is consulted only when the primary one is empty.
SCRIPT_CMD_ENV = "MEMDEX_CMD"
LEGACY_SCRIPT_CMD_ENV = "CODEBASE_RETRIEVE_CMD"
# Install source advertised when the `notebooklm` executable is missing.
NOTEBOOKLM_PACKAGE = "git+https://github.com/teng-lin/notebooklm-py.git"
# Environment variable that overrides the `notebooklm` command line.
NOTEBOOKLM_BIN_ENV = "NOTEBOOKLM_BIN"
40
+
41
+
42
def now_utc() -> dt.datetime:
    """Return the current time as an aware UTC datetime truncated to whole seconds."""
    current = dt.datetime.now(tz=dt.timezone.utc)
    return current.replace(microsecond=0)
44
+
45
+
46
+ def iso(ts: dt.datetime | None = None) -> str:
47
+ return (ts or now_utc()).isoformat().replace("+00:00", "Z")
48
+
49
+
50
+ def parse_iso(value: str | None) -> dt.datetime | None:
51
+ if not value:
52
+ return None
53
+ return dt.datetime.fromisoformat(value.replace("Z", "+00:00"))
54
+
55
+
56
def die(message: str, code: int = 2) -> None:
    """Report ``error: <message>`` on stderr and exit by raising ``SystemExit(code)``."""
    sys.stderr.write(f"error: {message}\n")
    raise SystemExit(code)
59
+
60
+
61
def script_cmd() -> list[str]:
    """Return the argv prefix used to invoke this tool.

    The first non-blank value among the primary and legacy override
    environment variables is shell-split and returned; otherwise the current
    interpreter (or ``python3``) plus this script's path is used.
    """
    for env_name in (SCRIPT_CMD_ENV, LEGACY_SCRIPT_CMD_ENV):
        configured = os.environ.get(env_name, "").strip()
        if configured:
            return shlex.split(configured)
    interpreter = sys.executable if sys.executable else "python3"
    return [interpreter, str(SCRIPT_PATH)]
68
+
69
+
70
def command_line(repo: Path, command: str, *parts: str) -> str:
    """Render a shell-quoted command line that runs *command* for *repo*."""
    argv = script_cmd() + [command, "--repo", str(repo)] + list(parts)
    return " ".join(map(shlex.quote, argv))
73
+
74
+
75
def missing_config_message(repo: Path, config_file: Path, command: str = "") -> str:
    """Build the multi-line help text shown when the repo has no config file.

    When *command* is given, a line naming it is placed directly after the
    headline.
    """
    rendered = {
        "create": command_line(repo, "init", "--create-notebook"),
        "reuse": command_line(repo, "init", "--reuse-existing-notebook"),
        "ask": command_line(repo, "ask", "your question"),
        "ask_yes": command_line(repo, "ask", "--yes", "your question"),
        "locate": command_line(repo, "locate", "thing to find"),
    }
    message = [f"project is not initialized for project retrieval: {config_file}"]
    if command:
        message.append(f"Command `{command}` needs `.memdex/config.json` before it can run.")
    message += [
        "",
        "Initialize this repo first:",
        " " + rendered["create"],
        "",
        "Or reuse an existing NotebookLM notebook with the expected title:",
        " " + rendered["reuse"],
        "",
        "Then ask or locate directly; both commands run freshness preflight:",
        " " + rendered["ask"],
        " " + rendered["locate"],
        "",
        "If this is the first broad upload and you already approve it:",
        " " + rendered["ask_yes"],
    ]
    return "\n".join(message)
100
+
101
+
102
def uninitialized_status(repo: Path, config_file: Path) -> dict[str, Any]:
    """Return the status payload describing an uninitialized repo and its next-step commands."""
    suggestions = {
        "createNotebook": ("init", "--create-notebook"),
        "reuseExistingNotebook": ("init", "--reuse-existing-notebook"),
        "ask": ("ask", "your question"),
        "locate": ("locate", "thing to find"),
        "askWithFirstUploadApproval": ("ask", "--yes", "your question"),
    }
    next_steps = {key: command_line(repo, *args) for key, args in suggestions.items()}
    return {
        "status": "not-initialized",
        "initialized": False,
        "config": str(config_file),
        "message": "project is not initialized for project retrieval",
        "next": next_steps,
    }
116
+
117
+
118
@contextlib.contextmanager
def repo_lock(repo: Path, *, timeout_seconds: float = 300.0):
    """Hold an exclusive, file-based lock for *repo* while the body runs.

    The lock is the existence of ``<repo>/<CONFIG_DIR>/.lock``, created with
    ``O_CREAT | O_EXCL``. While another holder owns it, creation is retried
    every 0.2 seconds.

    Args:
        repo: Repository root.
        timeout_seconds: Maximum time to wait for the lock before exiting
            through ``die``.

    Raises:
        OSError: If the lock file cannot be created or written for a reason
            other than it already existing.
    """
    lock_path = repo / CONFIG_DIR / ".lock"
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    flags = os.O_CREAT | os.O_EXCL | os.O_WRONLY
    started = time.monotonic()
    while True:
        try:
            fd = os.open(str(lock_path), flags)
            break
        except FileExistsError:
            # Someone else holds the lock; any other OSError propagates.
            if time.monotonic() - started > timeout_seconds:
                die(f"timed out waiting for lock: {lock_path}")
            time.sleep(0.2)
    try:
        # From here on we own the lock file. Writing the metadata sits inside
        # the outer try so a failed write (e.g. disk full) still closes the
        # descriptor and removes the file instead of leaving a stale lock.
        try:
            os.write(fd, f"pid={os.getpid()}\ncreatedAt={iso()}\n".encode("utf-8"))
        finally:
            os.close(fd)
        yield
    finally:
        with contextlib.suppress(FileNotFoundError):
            lock_path.unlink()
144
+
145
+
146
+ def run(argv: list[str], cwd: Path, *, input_text: str | None = None, timeout: int | None = None) -> subprocess.CompletedProcess[str]:
147
+ return subprocess.run(
148
+ argv,
149
+ cwd=str(cwd),
150
+ input=input_text,
151
+ text=True,
152
+ stdout=subprocess.PIPE,
153
+ stderr=subprocess.PIPE,
154
+ timeout=timeout,
155
+ check=False,
156
+ )
157
+
158
+
159
def require_tool(name: str) -> None:
    """Exit through ``die`` unless the executable *name* can be found on PATH."""
    located = shutil.which(name)
    if located is not None:
        return
    die(f"required tool not found on PATH: {name}")
162
+
163
+
164
def notebooklm_cmd() -> list[str]:
    """Return the argv prefix for the ``notebooklm`` CLI.

    A non-blank override environment variable wins (shell-split); otherwise
    the executable is looked up on PATH. When neither is available the
    process exits through ``die`` with install instructions.
    """
    configured = os.environ.get(NOTEBOOKLM_BIN_ENV, "").strip()
    if configured:
        return shlex.split(configured)
    binary = shutil.which("notebooklm")
    if not binary:
        hint = (
            "required tool not found on PATH: notebooklm\n"
            f"Install persistently: uv tool install {NOTEBOOKLM_PACKAGE}\n"
            f"Or set {NOTEBOOKLM_BIN_ENV}='uvx --from {NOTEBOOKLM_PACKAGE} notebooklm'"
        )
        die(hint)
    return [binary]
176
+
177
+
178
def sha256_bytes(data: bytes) -> str:
    """Return the SHA-256 of *data* as ``sha256:<hex digest>``."""
    hasher = hashlib.sha256()
    hasher.update(data)
    return f"sha256:{hasher.hexdigest()}"
180
+
181
+
182
def sha256_text(data: str) -> str:
    """Return ``sha256:<hex digest>`` for the UTF-8 encoding of *data*."""
    encoded = data.encode("utf-8")
    return "sha256:" + hashlib.sha256(encoded).hexdigest()
184
+
185
+
186
def sha256_file(path: Path) -> str:
    """Return ``sha256:<hex digest>`` of the file at *path*, read in 1 MiB blocks."""
    block_size = 1024 * 1024
    hasher = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:
                break
            hasher.update(block)
    return f"sha256:{hasher.hexdigest()}"
192
+
193
+
194
def remove_file_quiet(path: Path) -> None:
    """Delete *path*, ignoring the case where it does not exist."""
    with contextlib.suppress(FileNotFoundError):
        path.unlink()
199
+
200
+
201
def default_include() -> list[str]:
    """Return a fresh list of the default include specs (directories first, then root files)."""
    directories = ("src", "crates", "packages", "apps", "bins", "docs", "scripts", "tests", "xtask")
    root_files = ("AGENTS.md", "CLAUDE.md", "README.md", "Cargo.toml", "package.json", "justfile")
    return [*directories, *root_files]
219
+
220
+
221
def default_groups() -> list[dict[str, Any]]:
    """Return a fresh list of the default chunk groups, each with an ``id`` and ``include`` globs."""
    layout = (
        ("docs", ("AGENTS.md", "CLAUDE.md", "README.md", "docs/**")),
        ("apps", ("apps/**",)),
        ("packages", ("packages/**",)),
        ("src", ("src/**", "crates/**", "bins/**", "xtask/**")),
        ("tests", ("tests/**", "testdata/**")),
        ("scripts", ("scripts/**",)),
    )
    return [{"id": group_id, "include": list(patterns)} for group_id, patterns in layout]
230
+
231
+
232
def slugify(value: str) -> str:
    """Lower-case *value* and reduce it to ``[a-z0-9._-]`` with single dashes.

    Runs of other characters become one dash, leading and trailing dashes are
    removed, and an empty result falls back to ``"repo"``.
    """
    text = re.sub(r"[^a-z0-9._-]+", "-", value.strip().lower())
    text = re.sub(r"-+", "-", text)
    text = text.strip("-")
    return text if text else "repo"
237
+
238
+
239
def default_notebook_title(project_name: str, title_prefix: str = DEFAULT_NOTEBOOK_TITLE_PREFIX) -> str:
    """Return the conventional notebook title ``<title_prefix>:<project_name>``."""
    return "{}:{}".format(title_prefix, project_name)
241
+
242
+
243
def default_source_title_prefix(project_name: str, title_prefix: str = DEFAULT_NOTEBOOK_TITLE_PREFIX) -> str:
    """Return ``<slug(title_prefix)>-<slug(project_name)>-repo``."""
    pieces = [slugify(title_prefix), slugify(project_name), "repo"]
    return "-".join(pieces)
245
+
246
+
247
def default_short_source_title_prefix() -> str:
    """Return the short prefix used for source titles when none is configured."""
    prefix = "memdex"
    return prefix
249
+
250
+
251
def default_config(
    repo: Path,
    notebook_id: str = "",
    *,
    project_name: str | None = None,
    notebook_title_prefix: str = DEFAULT_NOTEBOOK_TITLE_PREFIX,
    notebook_title: str | None = None,
) -> dict[str, Any]:
    """Build the default configuration document for *repo*.

    The project name defaults to the repo directory name, and the notebook
    title defaults to ``<notebook_title_prefix>:<project>``.
    """
    project = project_name if project_name else repo.name
    title = notebook_title if notebook_title else default_notebook_title(project, notebook_title_prefix)

    # Every blocked pattern is listed twice: once rooted at the repo top level
    # and once behind "**/" for nested locations.
    blocked_bases = (
        ".env*",
        ".git/**",
        "node_modules/**",
        "target/**",
        "dist/**",
        "build/**",
        "coverage/**",
        ".next/**",
        ".generated/**",
        "public/**",
        "*.png",
        "*.jpg",
        "*.jpeg",
        "*.gif",
        "*.webp",
        "*.svg",
        "*.ico",
        "*.otf",
        "*.ttf",
        "*.woff",
        "*.woff2",
        "*.mp4",
        "*.mov",
        "*.zip",
        "*.tar",
        "*.gz",
    )
    never_upload = [pattern for base in blocked_bases for pattern in (base, "**/" + base)]

    notebooklm_section = {
        "notebook_id": notebook_id,
        "notebook_title_prefix": notebook_title_prefix,
        "notebook_title": title,
        "source_title_prefix": default_short_source_title_prefix(),
        "wait_after_upload": True,
        "upload_parallelism": 4,
        "wait_parallelism": 8,
        "delete_parallelism": 4,
    }
    bundle_section = {
        "tool": "repomix",
        "mode": "chunked",
        "include": default_include(),
        "output": CONFIG_DIR + "/cache/{prefix}-{timestamp}.txt",
        "style": "",
        "compress": False,
        "target_chunk_bytes": 716800,
        "max_chunk_bytes": 900000,
        "source_title_template": "{prefix}--{set}--{group}--{chunk}--{hash}.md",
        "groups": default_groups(),
        "default_group": {"enabled": True, "id": "misc"},
    }
    refresh_section = {
        "auto": True,
        "mode": "replace",
        "check_ttl_seconds": 300,
        "min_upload_interval_seconds": 900,
        "max_staleness_seconds": 86400,
        "keep_previous_sources": 0,
        "delete_previous_after_success": True,
    }
    return {
        "version": 1,
        "project": {"name": project},
        "provider": "notebooklm",
        "notebooklm": notebooklm_section,
        "bundle": bundle_section,
        "refresh": refresh_section,
        "safety": {
            "require_user_approval_first_upload": True,
            "never_upload": never_upload,
        },
        "retrieval": {
            "line_numbers_require_local_verify": True,
            "max_local_matches": 80,
        },
    }
361
+
362
+
363
def config_path(repo: Path) -> Path:
    """Locate the config file for *repo*.

    Looks in the primary config directory, then in ``.notebooklm``, trying
    the JSON name before ``config.yaml`` and ``config.yml`` in each. If none
    exists, the primary JSON path is returned.
    """
    filenames = (CONFIG_JSON, "config.yaml", "config.yml")
    for directory in (CONFIG_DIR, ".notebooklm"):
        for filename in filenames:
            candidate = repo / directory / filename
            if candidate.exists():
                return candidate
    return repo / CONFIG_DIR / CONFIG_JSON
376
+
377
+
378
def load_config(repo: Path, *, command: str = "") -> tuple[dict[str, Any], Path]:
    """Load the repo's config and return ``(config, path)``.

    Exits through ``die`` with setup instructions when no config file exists,
    or when a YAML config is found but PyYAML cannot be imported.
    """
    path = config_path(repo)
    if not path.exists():
        die(missing_config_message(repo, path, command))
    if path.suffix != ".json":
        # YAML support is optional; the import happens only when needed.
        try:
            import yaml  # type: ignore
        except Exception as error:  # pragma: no cover - depends on host env
            die(f"YAML config requires PyYAML or use JSON config instead: {error}")
        return yaml.safe_load(path.read_text()), path
    return json.loads(path.read_text()), path
389
+
390
+
391
def write_json(path: Path, value: Any) -> None:
    """Write *value* to *path* as indented JSON with a trailing newline, creating parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(value, indent=2, ensure_ascii=False)
    path.write_text(f"{payload}\n")
394
+
395
+
396
def load_state(config_file: Path) -> tuple[dict[str, Any], Path]:
    """Load the state file stored next to *config_file*.

    Returns ``(state, state_path)``; a missing file yields ``{"sources": []}``.
    """
    state_path = config_file.parent / STATE_JSON
    if not state_path.exists():
        return {"sources": []}, state_path
    return json.loads(state_path.read_text()), state_path
401
+
402
+
403
def include_specs(config: dict[str, Any]) -> list[str]:
    """Return the bundle include specs from *config*, or the defaults when none are set.

    Entries are stringified, blank ones are dropped, and surrounding
    whitespace and slashes are removed.
    """
    configured = config.get("bundle", {}).get("include")
    entries = configured if configured else default_include()
    cleaned: list[str] = []
    for entry in entries:
        text = str(entry).strip()
        if text:
            cleaned.append(text.strip("/"))
    return cleaned
406
+
407
+
408
def group_specs(group: dict[str, Any]) -> list[str]:
    """Return the cleaned ``include`` specs of one group definition.

    Entries are stringified, blank ones are dropped, and surrounding
    whitespace and slashes are removed.
    """
    cleaned: list[str] = []
    for entry in group.get("include") or []:
        text = str(entry).strip()
        if text:
            cleaned.append(text.strip("/"))
    return cleaned
411
+
412
+
413
def never_upload_specs(config: dict[str, Any]) -> list[str]:
    """Return the built-in exclusion globs followed by ``safety.never_upload`` from *config*.

    Each built-in pattern appears twice: as written and behind ``**/``.
    Entries are stringified and stripped; blank ones are dropped.
    """
    bases = (
        ".git/**",
        ".env*",
        "node_modules/**",
        ".next/**",
        "dist/**",
        "build/**",
        "coverage/**",
        ".generated/**",
        "public/**",
        "*.png",
        "*.jpg",
        "*.jpeg",
        "*.gif",
        "*.webp",
        "*.svg",
        "*.ico",
        "*.otf",
        "*.ttf",
        "*.woff",
        "*.woff2",
        "*.mp4",
        "*.mov",
        "*.zip",
        "*.tar",
        "*.gz",
    )
    built_in = [pattern for base in bases for pattern in (base, "**/" + base)]
    configured = config.get("safety", {}).get("never_upload") or []
    combined: list[str] = []
    for entry in (*built_in, *configured):
        text = str(entry).strip()
        if text:
            combined.append(text)
    return combined
468
+
469
+
470
def _strip_relative_prefix(value: str) -> str:
    """Trim whitespace and any leading ``./`` or ``/`` segments, keeping dotfile names intact."""
    text = value.strip()
    while True:
        if text.startswith("./"):
            text = text[2:]
        elif text.startswith("/"):
            text = text[1:]
        else:
            return text


def path_matches_spec(path: str, spec: str) -> bool:
    """Return whether the repo-relative *path* is selected by *spec*.

    A spec matches when it is ``.`` or ``*`` (everything), equals the path,
    names a directory that contains the path, or matches as an ``fnmatch``
    glob.

    Args:
        path: Repo-relative POSIX path, optionally prefixed with ``./``.
        spec: Literal path, directory name, or glob pattern.

    Returns:
        ``True`` on a match; ``False`` otherwise, including for a blank spec.
    """
    # The prefix is removed explicitly: ``str.lstrip("./")`` strips a
    # *character set*, which turned ".env*" into "env*" and ".git/**" into
    # "git/**", so unrelated files such as "environment.py" matched.
    clean = _strip_relative_prefix(path)
    pattern = _strip_relative_prefix(spec)
    if not pattern:
        return False
    if pattern in {".", "*"}:
        return True
    if clean == pattern or clean.startswith(pattern.rstrip("/") + "/"):
        return True
    return fnmatch.fnmatch(clean, pattern) or fnmatch.fnmatch("./" + clean, pattern)
480
+
481
+
482
def path_is_included(path: str, includes: list[str]) -> bool:
    """Return whether *path* matches at least one spec in *includes*."""
    return any(path_matches_spec(path, spec) for spec in includes)
487
+
488
+
489
def path_is_ignored(path: str, ignores: list[str]) -> bool:
    """Return whether *path* matches at least one spec in *ignores*."""
    for spec in ignores:
        if path_matches_spec(path, spec):
            return True
    return False
491
+
492
+
493
def bundle_mode(config: dict[str, Any]) -> str:
    """Return ``bundle.mode`` from *config* as a string, defaulting to ``"chunked"``."""
    mode = config.get("bundle", {}).get("mode")
    return str(mode) if mode else "chunked"
495
+
496
+
497
def parse_size_bytes(value: Any, fallback: int) -> int:
    """Interpret *value* as a byte count.

    Integers are returned unchanged. Strings may be plain digits or digits
    followed by ``b``, ``kb``/``kib`` (x1024) or ``mb``/``mib`` (x1024^2),
    case-insensitively. Anything empty or unparseable yields *fallback*.
    """
    if isinstance(value, int):
        return value
    text = str(value or "").strip().lower()
    if not text:
        return fallback
    parsed = re.fullmatch(r"(\d+)(?:\s*(b|kb|kib|mb|mib))?", text)
    if parsed is None:
        return fallback
    digits, unit = parsed.groups()
    multipliers = {"kb": 1024, "kib": 1024, "mb": 1024 * 1024, "mib": 1024 * 1024}
    # A missing unit or "b" means plain bytes.
    return int(digits) * multipliers.get(unit, 1)
513
+
514
+
515
def positive_int(value: Any, fallback: int, *, minimum: int = 1, maximum: int = 32) -> int:
    """Coerce *value* to an int clamped to ``[minimum, maximum]``.

    Args:
        value: Anything ``int()`` accepts; unusable values select *fallback*.
        fallback: Used when *value* cannot be converted; it is clamped too.
        minimum: Lower bound of the result.
        maximum: Upper bound of the result.

    Returns:
        The clamped integer.
    """
    try:
        parsed = int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers infinities, e.g. a JSON literal such as 1e999.
        parsed = fallback
    return max(minimum, min(maximum, parsed))
521
+
522
+
523
def list_git_files(repo: Path) -> list[str]:
    """List repo-relative POSIX paths of candidate files under *repo*, sorted.

    Uses ``git ls-files`` for tracked plus untracked-but-not-ignored files.
    When git fails (for example outside a work tree), falls back to walking
    the directory and skipping everything under ``.git/``.
    """
    # -z yields NUL-terminated, unquoted paths. Without it git C-quotes names
    # with non-ASCII or special characters, and the quoted text no longer
    # refers to a real file, so such files were silently dropped.
    result = run(["git", "ls-files", "-z", "-co", "--exclude-standard"], repo)
    if result.returncode == 0:
        return sorted(entry for entry in result.stdout.split("\0") if entry)
    files: list[str] = []
    for path in repo.rglob("*"):
        if not path.is_file():
            continue
        rel = path.relative_to(repo).as_posix()
        if rel.startswith(".git/"):
            continue
        files.append(rel)
    return sorted(files)
536
+
537
+
538
def collect_bundle_files(repo: Path, config: dict[str, Any]) -> list[str]:
    """Return the sorted, de-duplicated files eligible for bundling.

    A file qualifies when it matches an include spec, matches no never-upload
    spec, and is a regular file that is not a symlink.
    """
    includes = include_specs(config)
    ignores = never_upload_specs(config)

    def eligible(rel: str) -> bool:
        if not path_is_included(rel, includes):
            return False
        if path_is_ignored(rel, ignores):
            return False
        target = repo / rel
        return target.is_file() and not target.is_symlink()

    return sorted({rel for rel in list_git_files(repo) if eligible(rel)})
552
+
553
+
554
def chunk_file_size(repo: Path, path: str) -> int:
    """Estimate the bytes *path* contributes to a chunk.

    The estimate is the on-disk size plus the UTF-8 length of the path plus a
    fixed 64-byte allowance (presumably for per-file framing in the bundle).
    """
    content_bytes = (repo / path).stat().st_size
    name_bytes = len(path.encode("utf-8"))
    overhead = 64
    return content_bytes + name_bytes + overhead
557
+
558
+
559
def file_bucket(path: str) -> str:
    """Return the directory prefix used to sort *path* next to its neighbours.

    Paths at least three segments deep under ``apps``, ``packages`` or
    ``crates`` keep three segments; every other path keeps at most two.
    """
    segments = path.split("/")
    workspace_roots = {"apps", "packages", "crates"}
    depth = 3 if len(segments) >= 3 and segments[0] in workspace_roots else 2
    return "/".join(segments[:depth])
566
+
567
+
568
def source_title_for_chunk(config: dict[str, Any], *, set_id: str, group: str, index: int, chunk_hash: str) -> str:
    """Render the source title for one chunk from the configured template.

    A missing prefix, or one starting with ``codebase-retrieve-``, is replaced
    by the short default prefix. The template may use ``{prefix}``,
    ``{set}``/``{set_id}``, ``{group}``, ``{chunk}``/``{idx}`` (zero-padded to
    three digits) and ``{hash}`` (first eight characters of *chunk_hash*).
    """
    configured = str(config.get("notebooklm", {}).get("source_title_prefix") or "").strip()
    if configured and not configured.startswith("codebase-retrieve-"):
        prefix = configured
    else:
        prefix = default_short_source_title_prefix()
    template = str(
        config.get("bundle", {}).get("source_title_template")
        or "{prefix}--{set}--{group}--{chunk}--{hash}.md"
    )
    padded = f"{index:03d}"
    fields = {
        "prefix": slugify(prefix),
        "set": set_id,
        "set_id": set_id,
        "group": slugify(group),
        "chunk": padded,
        "idx": padded,
        "hash": chunk_hash[:8],
    }
    return template.format(**fields)
585
+
586
+
587
def chunk_hash_for_files(repo: Path, files: list[str]) -> str:
    """Return the hex SHA-256 over the names and contents of *files*.

    For each path the digest absorbs the UTF-8 path, a NUL byte, the file
    contents (skipped when the path is not a regular file) and another NUL.
    """
    separator = b"\0"
    block_size = 1024 * 1024
    hasher = hashlib.sha256()
    for rel in files:
        hasher.update(rel.encode("utf-8"))
        hasher.update(separator)
        target = repo / rel
        if target.is_file():
            with target.open("rb") as stream:
                while True:
                    block = stream.read(block_size)
                    if not block:
                        break
                    hasher.update(block)
        hasher.update(separator)
    return hasher.hexdigest()
599
+
600
+
601
def assign_files_to_groups(files: list[str], config: dict[str, Any]) -> list[tuple[str, str]]:
    """Assign each file to the first configured group whose specs match it.

    Returns ``(group_id, path)`` pairs. Unmatched files go to the default
    group when it is enabled. When it is disabled and no groups are
    configured, every file is placed in ``"repo"``.
    """
    bundle = config.get("bundle", {})
    groups = (bundle.get("groups") if "groups" in bundle else default_groups()) or []
    assigned: list[tuple[str, str]] = []
    claimed: set[str] = set()

    for group in groups:
        group_id = slugify(str(group.get("id") or "group"))
        specs = group_specs(group)
        if not specs:
            # A group without specs can never claim a file.
            continue
        for path in files:
            if path not in claimed and path_is_included(path, specs):
                assigned.append((group_id, path))
                claimed.add(path)

    if "default_group" in bundle:
        fallback = bundle.get("default_group") or {}
    else:
        fallback = {"enabled": True, "id": "misc"}

    if fallback.get("enabled"):
        fallback_id = slugify(str(fallback.get("id") or "misc"))
        for path in files:
            if path in claimed:
                continue
            assigned.append((fallback_id, path))
            claimed.add(path)
    elif not groups:
        assigned.extend(("repo", path) for path in files)
    return assigned
628
+
629
+
630
def flush_chunk(chunks: list[dict[str, Any]], repo: Path, config: dict[str, Any], set_id: str, group: str, index: int, files: list[str], total: int) -> None:
    """Append a chunk record for *files* to *chunks*; do nothing when *files* is empty.

    The record stores a copy of the file list, the estimated size *total*,
    the content hash and the rendered source title.
    """
    if not files:
        return
    digest = chunk_hash_for_files(repo, files)
    title = source_title_for_chunk(config, set_id=set_id, group=group, index=index, chunk_hash=digest)
    record = {
        "group": group,
        "chunk": f"{index:03d}",
        "index": index,
        "files": list(files),
        "estimatedBytes": total,
        "sha256": "sha256:" + digest,
        "title": title,
    }
    chunks.append(record)
645
+
646
+
647
+ def active_chunk_file_members(state: dict[str, Any] | None, group: str) -> list[list[str]]:
648
+ if not state:
649
+ return []
650
+ members: list[tuple[int, list[str]]] = []
651
+ for source in active_sources(state):
652
+ if str(source.get("group") or "") != group:
653
+ continue
654
+ files = source.get("files")
655
+ if not isinstance(files, list) or not files:
656
+ continue
657
+ chunk = str(source.get("chunk") or "0")
658
+ try:
659
+ index = int(chunk)
660
+ except ValueError:
661
+ index = 0
662
+ clean_files = [str(path) for path in files if str(path)]
663
+ if clean_files:
664
+ members.append((index, clean_files))
665
+ return [files for _, files in sorted(members, key=lambda item: item[0])]
666
+
667
+
668
def append_greedy_chunks(
    chunks: list[dict[str, Any]],
    repo: Path,
    config: dict[str, Any],
    *,
    set_id: str,
    group: str,
    start_index: int,
    files: list[str],
    target: int,
    max_bytes: int,
) -> int:
    """Pack *files* into consecutive chunks in order and return the next free index.

    A chunk is closed as soon as adding the next file would push its
    estimated size above *target*. A single file larger than *max_bytes*
    aborts through ``die``.
    """
    next_index = start_index
    pending: list[str] = []
    pending_bytes = 0
    for path in files:
        size = chunk_file_size(repo, path)
        if size > max_bytes:
            die(f"file exceeds max chunk size ({max_bytes} bytes): {path} ({size} bytes)")
        if pending and pending_bytes + size > target:
            flush_chunk(chunks, repo, config, set_id, group, next_index, pending, pending_bytes)
            pending, pending_bytes = [], 0
            next_index += 1
        pending.append(path)
        pending_bytes += size
    if pending:
        flush_chunk(chunks, repo, config, set_id, group, next_index, pending, pending_bytes)
        next_index += 1
    return next_index
698
+
699
+
700
def plan_group_chunks(
    chunks: list[dict[str, Any]],
    repo: Path,
    config: dict[str, Any],
    *,
    set_id: str,
    group: str,
    files: list[str],
    target: int,
    max_bytes: int,
    state: dict[str, Any] | None,
) -> None:
    """Plan the chunks for one group, reusing previous chunk membership where possible.

    Each chunk recorded in *state* for this group keeps the files that still
    exist, provided their combined estimate fits in *max_bytes*. Such chunks
    are emitted first, numbered from 1. All other files are packed greedily
    afterwards in bucket/path order.

    Exits through ``die`` when a single file exceeds *max_bytes*.
    """
    ordered = sorted(files, key=lambda path: (file_bucket(path), path))
    available = set(ordered)
    kept: list[tuple[list[str], int]] = []
    for previous_files in active_chunk_file_members(state, group):
        retained = [path for path in previous_files if path in available]
        if not retained:
            continue
        # Stat every file once; the same sizes serve the per-file limit check,
        # the chunk total, and the later flush.
        sizes = [chunk_file_size(repo, path) for path in retained]
        for path, size in zip(retained, sizes):
            if size > max_bytes:
                die(f"file exceeds max chunk size ({max_bytes} bytes): {path} ({size} bytes)")
        total = sum(sizes)
        if total <= max_bytes:
            kept.append((retained, total))
            available.difference_update(retained)
        # Otherwise the files stay available and are re-packed greedily below.

    index = 1
    for files_in_chunk, total in kept:
        flush_chunk(chunks, repo, config, set_id, group, index, files_in_chunk, total)
        index += 1

    remaining = [path for path in ordered if path in available]
    append_greedy_chunks(
        chunks,
        repo,
        config,
        set_id=set_id,
        group=group,
        start_index=index,
        files=remaining,
        target=target,
        max_bytes=max_bytes,
    )
748
+
749
+
750
def plan_bundle_chunks(repo: Path, config: dict[str, Any], *, set_id: str, state: dict[str, Any] | None = None) -> list[dict[str, Any]]:
    """Plan every chunk of a bundle set and return the chunk records.

    Files are collected, assigned to groups, and each group is planned in
    alphabetical order of its id. The target size never exceeds the maximum.
    """
    bundle = config.get("bundle", {})
    max_bytes = parse_size_bytes(bundle.get("max_chunk_bytes"), 900000)
    target = min(parse_size_bytes(bundle.get("target_chunk_bytes"), 716800), max_bytes)

    grouped: dict[str, list[str]] = {}
    for group_id, path in assign_files_to_groups(collect_bundle_files(repo, config), config):
        grouped.setdefault(group_id, []).append(path)

    planned: list[dict[str, Any]] = []
    for group_id in sorted(grouped):
        plan_group_chunks(
            planned,
            repo,
            config,
            set_id=set_id,
            group=group_id,
            files=grouped[group_id],
            target=target,
            max_bytes=max_bytes,
            state=state,
        )
    return planned
774
+
775
+
776
def git_head(repo: Path) -> str:
    """Return the commit id of ``HEAD``, or ``"no-git-head"`` when git cannot resolve it."""
    outcome = run(["git", "rev-parse", "HEAD"], repo)
    if outcome.returncode == 0:
        return outcome.stdout.strip()
    return "no-git-head"
781
+
782
+
783
def git_status_records(repo: Path) -> list[tuple[str, str]]:
    """Return ``(status, path)`` pairs from ``git status --porcelain=v1 -z``.

    *status* is the two-character XY code. For renames and copies only the
    destination path is reported. A failing git command yields an empty list.
    """
    result = run(["git", "status", "--porcelain=v1", "-z", "--untracked-files=all"], repo)
    if result.returncode != 0:
        return []
    entries = iter(part for part in result.stdout.split("\0") if part)
    records: list[tuple[str, str]] = []
    for entry in entries:
        status, path = entry[:2], entry[3:]
        # In -z output a rename/copy entry is followed by a separate field
        # holding the original path. The marker may sit in either the index
        # column or the worktree column, so both are checked; otherwise the
        # original path would be parsed as a record of its own.
        if "R" in status or "C" in status:
            next(entries, None)
        records.append((status, path))
    return records
800
+
801
+
802
def fast_fingerprint(repo: Path, config: dict[str, Any], config_file: Path) -> tuple[str, list[str]]:
    """Fingerprint the upload-relevant state of the working tree.

    The fingerprint hashes the HEAD commit, the config file digest, and one
    line per changed path that is included and not ignored (its status, path
    and content digest, or ``dir``/``missing``).

    Returns:
        ``(fingerprint, relevant_paths)``.
    """
    includes = include_specs(config)
    ignores = never_upload_specs(config)

    def describe(target: Path) -> str:
        if target.is_file():
            return sha256_file(target)
        return "dir" if target.exists() else "missing"

    lines = [f"head={git_head(repo)}", f"config={sha256_file(config_file)}"]
    relevant: list[str] = []
    for status, path in git_status_records(repo):
        if path_is_included(path, includes) and not path_is_ignored(path, ignores):
            relevant.append(path)
            lines.append(f"{status} {path} {describe(repo / path)}")
    return sha256_text("\n".join(lines)), relevant
820
+
821
+
822
def seconds_since(value: str | None) -> float | None:
    """Return the seconds elapsed since the ISO timestamp *value*, or ``None`` when it is empty."""
    moment = parse_iso(value)
    if moment:
        return (now_utc() - moment).total_seconds()
    return None
827
+
828
+
829
+ def state_uploaded_fingerprint(state: dict[str, Any]) -> str | None:
830
+ return state.get("lastUploadedFastFingerprint")
831
+
832
+
833
def expand_bundle_path(repo: Path, config: dict[str, Any]) -> Path:
    """Resolve the output path of a single-file bundle from the ``bundle.output`` template.

    ``{prefix}`` is the configured source title prefix (default
    ``<repo name>-repo``) and ``{timestamp}`` is the current UTC time.
    """
    notebook_cfg = config.get("notebooklm", {})
    prefix = notebook_cfg.get("source_title_prefix") or f"{repo.name}-repo"
    timestamp = now_utc().strftime("%Y%m%dT%H%M%SZ")
    template = config.get("bundle", {}).get("output") or f"{CONFIG_DIR}/cache/{{prefix}}-{{timestamp}}.txt"
    return repo / template.format(prefix=prefix, timestamp=timestamp)
839
+
840
+
841
def expand_chunk_path(repo: Path, config: dict[str, Any], title: str) -> Path:
    """Resolve where the chunk named *title* is written.

    A ``bundle.output`` template containing ``{title}`` is formatted
    directly; otherwise the chunk is placed next to the templated file, using
    *title* as the file name.
    """
    template = config.get("bundle", {}).get("output") or f"{CONFIG_DIR}/cache/{{title}}"
    if "{title}" not in template:
        return (repo / template).parent / title
    prefix = config.get("notebooklm", {}).get("source_title_prefix") or default_short_source_title_prefix()
    timestamp = now_utc().strftime("%Y%m%dT%H%M%SZ")
    return repo / template.format(title=title, prefix=prefix, timestamp=timestamp)
848
+
849
+
850
def repomix_cmd() -> list[str]:
    """Return the argv prefix for repomix: the installed binary, else ``npx repomix``.

    Exits through ``die`` when neither ``repomix`` nor ``npx`` is on PATH.
    """
    binary = shutil.which("repomix")
    if binary:
        return [binary]
    if not shutil.which("npx"):
        die("required tool not found on PATH: repomix or npx")
    return ["npx", "repomix"]
857
+
858
+
859
def repomix_base_argv(config: dict[str, Any]) -> list[str]:
    """Build the repomix argv shared by all invocations.

    Adds ``--style`` when configured, ``--compress`` when enabled, and
    ``--ignore`` with the comma-joined never-upload specs.
    """
    bundle = config.get("bundle", {})
    argv = repomix_cmd()
    style = str(bundle.get("style") or "").strip()
    if style:
        argv += ["--style", style]
    if bundle.get("compress"):
        argv += ["--compress"]
    ignore = ",".join(never_upload_specs(config))
    if ignore:
        argv += ["--ignore", ignore]
    return argv
871
+
872
+
873
def build_bundle(repo: Path, config: dict[str, Any]) -> Path:
    """Render one repomix bundle for all include specs and return its path.

    Exits through ``die`` when repomix returns a non-zero status.
    """
    destination = expand_bundle_path(repo, config)
    destination.parent.mkdir(parents=True, exist_ok=True)
    include = ",".join(include_specs(config))
    argv = repomix_base_argv(config) + ["--include", include, "--output", str(destination)]
    outcome = run(argv, repo, timeout=600)
    if outcome.returncode != 0:
        die(f"repomix failed:\n{outcome.stdout}\n{outcome.stderr}")
    return destination
882
+
883
+
884
def build_bundle_set(repo: Path, config: dict[str, Any], *, set_id: str, state: dict[str, Any] | None = None) -> list[dict[str, Any]]:
    """Render every planned chunk with repomix and return the bundle records.

    Each record is the planned chunk plus ``path``, ``bundleSha256``,
    ``contentSha256``, ``fileListSha256``, ``actualBytes`` and ``fileCount``.

    If anything fails (repomix error, oversized output, interruption), all
    output files touched by this call are removed before the error
    propagates.
    """
    max_bytes = parse_size_bytes(config.get("bundle", {}).get("max_chunk_bytes"), 900000)
    chunks = plan_bundle_chunks(repo, config, set_id=set_id, state=state)
    bundles: list[dict[str, Any]] = []
    # Outputs are tracked from the moment repomix may create them, so the
    # file of the chunk that fails is cleaned up too, not only the completed
    # ones.
    written: list[Path] = []
    try:
        for chunk in chunks:
            title = str(chunk["title"])
            out = expand_chunk_path(repo, config, title)
            out.parent.mkdir(parents=True, exist_ok=True)
            written.append(out)
            input_text = "\n".join(str(path) for path in chunk["files"]) + "\n"
            argv = [*repomix_base_argv(config), "--stdin", "--output", str(out)]
            result = run(argv, repo, input_text=input_text, timeout=600)
            if result.returncode != 0:
                die(f"repomix failed for chunk {title}:\n{result.stdout}\n{result.stderr}")
            actual_size = out.stat().st_size
            if actual_size > max_bytes:
                die(f"rendered chunk exceeds max size ({max_bytes} bytes): {title} ({actual_size} bytes)")
            item = dict(chunk)
            item["path"] = str(out)
            item["bundleSha256"] = sha256_file(out)
            item["contentSha256"] = item["bundleSha256"]
            item["fileListSha256"] = item.get("sha256")
            item["actualBytes"] = actual_size
            item["fileCount"] = len(chunk["files"])
            bundles.append(item)
    except BaseException:
        # BaseException on purpose: die() raises SystemExit, and Ctrl-C
        # should clean up as well.
        for path in written:
            remove_file_quiet(path)
        raise
    return bundles
915
+
916
+
917
def notebook_id(config: dict[str, Any]) -> str:
    """Return ``notebooklm.notebook_id`` from *config* as a string; exit through ``die`` when unset."""
    configured = config.get("notebooklm", {}).get("notebook_id", "")
    if not configured:
        die("notebooklm.notebook_id missing in config")
    return str(configured)
922
+
923
+
924
def notebook_title(config: dict[str, Any]) -> str:
    """Return the configured notebook title, or derive it from the title prefix and project name."""
    notebook_cfg = config.get("notebooklm", {})
    project = str(config.get("project", {}).get("name") or "repo")
    prefix = str(notebook_cfg.get("notebook_title_prefix") or DEFAULT_NOTEBOOK_TITLE_PREFIX)
    return str(notebook_cfg.get("notebook_title") or default_notebook_title(project, prefix))
928
+
929
+
930
+ def parse_notebook_json(stdout: str, fallback_title: str) -> dict[str, Any] | None:
931
+ try:
932
+ data = json.loads(stdout)
933
+ except json.JSONDecodeError:
934
+ return None
935
+ candidates = [data]
936
+ if isinstance(data, dict):
937
+ for key in ("notebook", "data", "result"):
938
+ value = data.get(key)
939
+ if isinstance(value, dict):
940
+ candidates.append(value)
941
+ for item in candidates:
942
+ if not isinstance(item, dict):
943
+ continue
944
+ nid = item.get("id") or item.get("notebook_id") or item.get("notebookId")
945
+ title = item.get("title") or item.get("name") or fallback_title
946
+ if nid:
947
+ return {"id": str(nid), "title": str(title)}
948
+ return None
949
+
950
+
951
def list_notebooks(repo: Path) -> list[dict[str, Any]]:
    """Return the notebooks reported by ``notebooklm list --json``.

    Accepts either an object with a ``notebooks`` array or a bare array;
    entries that are not objects are dropped. Exits through ``die`` when the
    command fails or prints invalid JSON.
    """
    result = run([*notebooklm_cmd(), "list", "--json"], repo, timeout=120)
    if result.returncode != 0:
        die(f"notebooklm list failed:\n{result.stdout}\n{result.stderr}")
    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError as error:
        die(f"notebooklm list returned invalid JSON: {error}")
    # Branch on the payload type first: calling ``.get`` on a top-level JSON
    # array raised AttributeError before the list default could apply.
    if isinstance(data, dict):
        notebooks = data.get("notebooks")
    else:
        notebooks = data
    if not isinstance(notebooks, list):
        notebooks = []
    return [item for item in notebooks if isinstance(item, dict)]
961
+
962
+
963
def find_notebook_by_title(repo: Path, title: str) -> dict[str, Any] | None:
    """Return ``{"id", "title"}`` of the only notebook titled exactly *title*.

    Returns ``None`` when there is no match and exits through ``die`` when
    several notebooks share the title.
    """
    matches = []
    for item in list_notebooks(repo):
        if str(item.get("title", "")) == title:
            matches.append(item)
    if not matches:
        return None
    if len(matches) > 1:
        ids = ", ".join(str(item.get("id", "")) for item in matches)
        die(f"multiple notebooks found with title {title!r}: {ids}")
    only = matches[0]
    return {"id": str(only.get("id", "")), "title": str(only.get("title", title))}
972
+
973
+
974
def create_notebook(repo: Path, title: str) -> dict[str, Any]:
    """Create a notebook named ``title`` and return its ``{"id", "title"}``.

    The id is taken from the create command's JSON output when possible and
    otherwise resolved by listing notebooks and matching on title. Failures
    are reported through ``die``.
    """
    created = run([*notebooklm_cmd(), "create", title, "--json"], repo, timeout=180)
    if created.returncode != 0:
        die(f"notebooklm create failed:\n{created.stdout}\n{created.stderr}")
    # The title lookup only runs when the create output carried no usable id.
    resolved = parse_notebook_json(created.stdout, title) or find_notebook_by_title(repo, title)
    if resolved:
        return resolved
    die(f"created notebook but could not resolve notebook id for title {title!r}")
985
+
986
+
987
def list_sources(repo: Path, nbid: str) -> list[dict[str, Any]]:
    """Return the sources of notebook ``nbid`` per ``source list --json``.

    This is best-effort: a non-zero exit status, invalid JSON, or an
    unexpected payload shape all yield an empty list. The payload may be an
    object with a ``sources`` array or a bare array; non-dict entries are
    dropped.
    """
    result = run([*notebooklm_cmd(), "source", "list", "-n", nbid, "--json"], repo, timeout=120)
    if result.returncode != 0:
        return []
    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError:
        return []
    # Only call .get() on objects: a bare JSON array has no .get() and would
    # otherwise raise AttributeError before the list fallback is reached.
    sources = data.get("sources") if isinstance(data, dict) else data
    if not isinstance(sources, list):
        return []
    return [src for src in sources if isinstance(src, dict)]
997
+
998
+
999
def find_source_by_title(repo: Path, nbid: str, title: str) -> dict[str, Any] | None:
    """Return the first source in notebook ``nbid`` titled exactly ``title``.

    Sources without a truthy ``id`` are skipped. Returns ``None`` when no
    titled source with an id exists.
    """
    titled = (entry for entry in list_sources(repo, nbid) if str(entry.get("title", "")) == title)
    for entry in titled:
        identifier = entry.get("id")
        if identifier:
            return {"id": str(identifier), "title": title}
    return None
1007
+
1008
+
1009
+ def find_uploaded_source(before: list[dict[str, Any]], after: list[dict[str, Any]], bundle: Path, prefix: str, title_hint: str | None = None) -> dict[str, Any]:
1010
+ before_ids = {str(src.get("id")) for src in before if src.get("id")}
1011
+ basename = title_hint or bundle.name
1012
+ for src in after:
1013
+ title = str(src.get("title", ""))
1014
+ sid = str(src.get("id", ""))
1015
+ if sid and sid not in before_ids and (title == basename or title.startswith(prefix)):
1016
+ return {"id": sid, "title": title or basename}
1017
+ for src in after:
1018
+ title = str(src.get("title", ""))
1019
+ sid = str(src.get("id", ""))
1020
+ if sid and (title == basename or title.startswith(prefix)):
1021
+ return {"id": sid, "title": title or basename}
1022
+ return {"id": "", "title": basename}
1023
+
1024
+
1025
+ def source_from_add_json(stdout: str, bundle: Path, title_hint: str | None = None) -> dict[str, Any] | None:
1026
+ try:
1027
+ data = json.loads(stdout)
1028
+ except json.JSONDecodeError:
1029
+ return None
1030
+ candidates = [data]
1031
+ if isinstance(data, dict):
1032
+ for key in ("source", "data", "result"):
1033
+ value = data.get(key)
1034
+ if isinstance(value, dict):
1035
+ candidates.append(value)
1036
+ for item in candidates:
1037
+ if not isinstance(item, dict):
1038
+ continue
1039
+ sid = item.get("id") or item.get("source_id") or item.get("sourceId")
1040
+ title = item.get("title") or item.get("name") or title_hint or bundle.name
1041
+ if sid:
1042
+ return {"id": str(sid), "title": str(title)}
1043
+ return None
1044
+
1045
+
1046
def upload_bundle(repo: Path, config: dict[str, Any], state: dict[str, Any], bundle: Path, bundle_hash: str) -> dict[str, Any]:
    """Upload a single bundle file as a notebook source and describe it.

    The source is identified from the add command's JSON output, or failing
    that by comparing source listings taken before and after the upload. The
    returned dict gains ``bundleSha256`` and ``uploadedAt``. When the config
    enables ``wait_after_upload`` the source is waited on (a failed wait only
    warns). In ``replace`` refresh mode older sources are pruned and the
    pruned ids are returned under ``_prunedSourceIds``.
    """
    notebook = notebook_id(config)
    title_prefix = str(config.get("notebooklm", {}).get("source_title_prefix") or bundle.stem)
    listed_before = list_sources(repo, notebook)
    added = run(
        [*notebooklm_cmd(), "source", "add", str(bundle), "-n", notebook, "--json"],
        repo,
        timeout=600,
    )
    if added.returncode != 0:
        die(f"notebooklm source add failed:\n{added.stdout}\n{added.stderr}")
    listed_after = list_sources(repo, notebook)

    source = source_from_add_json(added.stdout, bundle)
    if not source:
        source = find_uploaded_source(listed_before, listed_after, bundle, title_prefix)
    source["bundleSha256"] = bundle_hash
    source["uploadedAt"] = iso()

    wait_requested = config.get("notebooklm", {}).get("wait_after_upload")
    if wait_requested and source.get("id"):
        waited = run(
            [*notebooklm_cmd(), "source", "wait", str(source["id"]), "-n", notebook],
            repo,
            timeout=600,
        )
        if waited.returncode != 0:
            print(f"warning: source wait failed for {source['id']}", file=sys.stderr)

    replace_mode = config.get("refresh", {}).get("mode", "replace") == "replace"
    if replace_mode:
        removed = prune_sources(repo, config, state, source)
        if removed:
            source["_prunedSourceIds"] = removed
    return source
1067
+
1068
+
1069
def upload_file_source(repo: Path, config: dict[str, Any], path: Path, title: str) -> dict[str, Any]:
    """Upload ``path`` as a source titled ``title`` and return its id/title.

    The id comes from the add command's JSON output, falling back to a lookup
    by title. A failed upload or an unresolvable id is reported through
    ``die``.
    """
    notebook = notebook_id(config)
    command = [*notebooklm_cmd(), "source", "add", str(path), "-n", notebook, "--title", title, "--json"]
    added = run(command, repo, timeout=600)
    if added.returncode != 0:
        die(f"notebooklm source add failed for {title}:\n{added.stdout}\n{added.stderr}")

    source = source_from_add_json(added.stdout, path, title)
    if not source:
        source = find_source_by_title(repo, notebook, title)
    has_id = bool(source) and bool(source.get("id"))
    if not has_id:
        die(f"uploaded source but could not resolve source id for {title}")
    return source
1078
+
1079
+
1080
def wait_source_ready(repo: Path, nbid: str, source_id: str) -> bool:
    """Run ``source wait`` for ``source_id`` and report whether it exited 0."""
    command = [*notebooklm_cmd(), "source", "wait", source_id, "-n", nbid]
    return run(command, repo, timeout=600).returncode == 0
1083
+
1084
+
1085
def source_content_sha(value: dict[str, Any]) -> str:
    """Return the first truthy of contentSha256/chunkSha256/bundleSha256, or ""."""
    for field in ("contentSha256", "chunkSha256", "bundleSha256"):
        digest = value.get(field)
        if digest:
            return str(digest)
    return ""
1087
+
1088
+
1089
def source_file_list_sha(value: dict[str, Any]) -> str:
    """Return ``fileListSha256`` if truthy, else ``sha256`` if truthy, else ""."""
    for field in ("fileListSha256", "sha256"):
        digest = value.get(field)
        if digest:
            return str(digest)
    return ""
1091
+
1092
+
1093
def chunk_key(value: dict[str, Any]) -> str:
    """Return ``"<group>/<chunk>"`` built from the dict's group and chunk."""
    group = value.get("group")
    chunk = value.get("chunk")
    return "/".join((str(group), str(chunk)))
1095
+
1096
+
1097
def temp_source_prefix(config: dict[str, Any]) -> str:
    """Return the title prefix used for temporary sources.

    A non-blank ``notebooklm.temporary_source_title_prefix`` is slugified and
    returned. Otherwise the regular source title prefix (configured value or
    the default short prefix) is stripped and suffixed with ``tmp``.
    """
    settings = config.get("notebooklm", {})
    explicit = str(settings.get("temporary_source_title_prefix") or "").strip()
    if explicit:
        return slugify(explicit)
    # The default is only computed when no prefix is configured.
    base = str(config.get("notebooklm", {}).get("source_title_prefix") or default_short_source_title_prefix())
    return base.strip() + "tmp"
1102
+
1103
+
1104
def temp_source_title(config: dict[str, Any], *, set_id: str, kind: str, title: str, content_sha: str) -> str:
    """Build the ``.md`` title for a temporary source.

    The title joins the temporary prefix, ``set_id``, the slugified ``kind``
    and ``title``, and the first eight characters of the digest with ``--``.
    Anything up to and including the first ``:`` in ``content_sha`` is
    dropped before truncating.
    """
    _, separator, tail = content_sha.partition(":")
    digest = tail if separator else content_sha
    pieces = [temp_source_prefix(config), set_id, slugify(kind), slugify(title), digest[:8]]
    return "--".join(str(piece) for piece in pieces) + ".md"
1107
+
1108
+
1109
def stage_temp_source_file(repo: Path, title: str, source_path: Path) -> Path:
    """Copy ``source_path`` into the repo's cache directory under ``title``.

    The cache directory is created on demand; the path of the copy is
    returned.
    """
    destination = repo.joinpath(CONFIG_DIR, "cache", title)
    destination.parent.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(source_path, destination)
    return destination
1114
+
1115
+
1116
def source_with_chunk_metadata(source: dict[str, Any], bundle: dict[str, Any], *, status: str, reused: bool = False) -> dict[str, Any]:
    """Return a copy of ``source`` annotated with the chunk fields of ``bundle``.

    The copy carries the bundle's group/chunk identity, hashes, file list and
    the given ``status``. A reused source is additionally marked ``reused``
    with a ``reusedAt`` timestamp; otherwise ``uploadedAt`` is set to now.
    The input dicts are not modified.
    """
    enriched = {
        **source,
        "group": bundle.get("group"),
        "chunk": bundle.get("chunk"),
        "chunkKey": chunk_key(bundle),
        "chunkSha256": bundle.get("bundleSha256"),
        "contentSha256": bundle.get("contentSha256") or bundle.get("bundleSha256"),
        "fileListSha256": bundle.get("fileListSha256") or bundle.get("sha256"),
        "fileCount": bundle.get("fileCount"),
        "files": list(bundle.get("files", [])),
        "status": status,
    }
    if not reused:
        enriched["uploadedAt"] = iso()
        return enriched
    enriched["reused"] = True
    enriched["reusedAt"] = iso()
    return enriched
1137
+
1138
+
1139
def upload_one_chunk(repo: Path, config: dict[str, Any], bundle: dict[str, Any]) -> dict[str, Any]:
    """Upload one chunk bundle and return its source record with chunk metadata.

    ``bundle`` must provide ``path`` and ``title``. The source id comes from
    the add command's JSON output or, failing that, a lookup by title. A
    failed upload or an unresolvable id is reported through ``die``. The
    result is marked with status ``uploaded``.
    """
    notebook = notebook_id(config)
    chunk_path = Path(str(bundle["path"]))
    chunk_title = str(bundle["title"])
    command = [*notebooklm_cmd(), "source", "add", str(chunk_path), "-n", notebook, "--title", chunk_title, "--json"]
    added = run(command, repo, timeout=600)
    if added.returncode != 0:
        die(f"notebooklm source add failed for chunk {chunk_title}:\n{added.stdout}\n{added.stderr}")

    source = source_from_add_json(added.stdout, chunk_path, chunk_title)
    if not source:
        source = find_source_by_title(repo, notebook, chunk_title)
    has_id = bool(source) and bool(source.get("id"))
    if not has_id:
        die(f"uploaded chunk but could not resolve source id for {chunk_title}")
    return source_with_chunk_metadata(source, bundle, status="uploaded")
1150
+
1151
+
1152
def source_set_hash(bundles: list[dict[str, Any]]) -> str:
    """Hash a bundle set from each bundle's group, chunk and content hashes.

    One line per bundle (``group chunk contentSha fileListSha``) is joined
    with newlines and passed to ``sha256_text``; the bundle order therefore
    affects the result.
    """
    lines = []
    for bundle in bundles:
        fields = (
            bundle.get("group"),
            bundle.get("chunk"),
            source_content_sha(bundle),
            source_file_list_sha(bundle),
        )
        lines.append(" ".join(str(field) for field in fields))
    return sha256_text("\n".join(lines))
1158
+
1159
+
1160
def active_sources(state: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the source records currently considered active in ``state``.

    ``state["activeSourceSet"]["sources"]`` is used when it is a list;
    otherwise the top-level ``state["sources"]`` list is used. Non-dict
    entries are dropped, and a missing or non-list value yields ``[]``.
    """
    source_set = state.get("activeSourceSet")
    if isinstance(source_set, dict):
        sources = source_set.get("sources")
        if isinstance(sources, list):
            return [src for src in sources if isinstance(src, dict)]
    # Guard the top-level key the same way: a null "sources" value would
    # otherwise raise TypeError when iterated.
    legacy = state.get("sources")
    if not isinstance(legacy, list):
        return []
    return [src for src in legacy if isinstance(src, dict)]
1167
+
1168
+
1169
def active_ready_source_ids(state: dict[str, Any]) -> list[str]:
    """Return ids of active sources whose status is ``ready``.

    A source with no status counts as ready; sources without an id are
    skipped.
    """
    ready_ids: list[str] = []
    for record in active_sources(state):
        identifier = str(record.get("id") or "")
        is_ready = str(record.get("status") or "ready") == "ready"
        if identifier and is_ready:
            ready_ids.append(identifier)
    return ready_ids
1179
+
1180
+
1181
def cleanup_pending_source_ids(state: dict[str, Any]) -> list[str]:
    """Return the de-duplicated source ids queued for deletion in ``state``.

    Reads ``state["cleanupPendingSourceIds"]``; anything other than a list
    yields ``[]``. Entries are stringified, first-seen order is kept, and
    null or empty entries are dropped.
    """
    raw = state.get("cleanupPendingSourceIds")
    if not isinstance(raw, list):
        return []
    # Skip None explicitly: str(None) is the truthy text "None", which would
    # otherwise be queued as if it were a real source id.
    return list(dict.fromkeys(str(item) for item in raw if item is not None and str(item)))
1186
+
1187
+
1188
def queue_cleanup_source_ids(state: dict[str, Any], source_ids: list[str]) -> list[str]:
    """Merge ``source_ids`` into the cleanup queue stored in ``state``.

    Already-queued ids come first, duplicates and empty ids are dropped, and
    ids of currently active sources are never queued. The state key is
    removed when the merged queue is empty. Returns the merged queue.
    """
    protected = {str(record.get("id") or "") for record in active_sources(state) if record.get("id")}
    combined = dict.fromkeys([*cleanup_pending_source_ids(state), *source_ids])
    queue = [sid for sid in combined if sid and sid not in protected]
    if not queue:
        state.pop("cleanupPendingSourceIds", None)
        return queue
    state["cleanupPendingSourceIds"] = queue
    return queue
1196
+
1197
+
1198
def pending_upload_path(repo: Path) -> Path:
    """Return the location of the pending-upload journal inside ``repo``."""
    return repo.joinpath(CONFIG_DIR, PENDING_UPLOAD_JSON)
1200
+
1201
+
1202
def clear_pending_upload(repo: Path) -> None:
    """Remove the pending-upload journal for ``repo`` via ``remove_file_quiet``."""
    journal_file = pending_upload_path(repo)
    remove_file_quiet(journal_file)
1204
+
1205
+
1206
def write_pending_upload(repo: Path, value: dict[str, Any]) -> None:
    """Write ``value`` as the pending-upload journal for ``repo``."""
    journal_file = pending_upload_path(repo)
    write_json(journal_file, value)
1208
+
1209
+
1210
def read_pending_upload(repo: Path) -> dict[str, Any] | None:
    """Load the pending-upload journal for ``repo``.

    Returns ``None`` when no journal file exists. A journal that is not valid
    JSON, or whose top level is not an object, is reported as an empty
    journal (``{"sources": []}``).
    """
    journal_file = pending_upload_path(repo)
    if not journal_file.exists():
        return None
    try:
        parsed = json.loads(journal_file.read_text())
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed
    return {"sources": []}
1219
+
1220
+
1221
def delete_source_ids_parallel(repo: Path, nbid: str, source_ids: list[str], *, parallelism: int) -> list[str]:
    """Delete sources from notebook ``nbid`` using a thread pool.

    Empty and duplicate ids are ignored. Each delete runs as its own
    ``source delete`` command; failures print a warning and are left out of
    the result. Returns the ids that were deleted, in completion order.
    """
    targets = list(dict.fromkeys(sid for sid in source_ids if sid))
    if not targets:
        return []
    total = len(targets)
    pool_size = min(total, max(1, parallelism))

    def attempt(sid: str) -> bool:
        outcome = run([*notebooklm_cmd(), "source", "delete", sid, "-n", nbid, "--yes"], repo, timeout=120)
        if outcome.returncode == 0:
            return True
        print(f"warning: failed to delete source {sid}", file=sys.stderr)
        return False

    removed: list[str] = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as pool:
        in_flight = {pool.submit(attempt, sid): sid for sid in targets}
        for finished in concurrent.futures.as_completed(in_flight):
            if not finished.result():
                continue
            removed.append(in_flight[finished])
            print(f"cleanup {len(removed)}/{total}", file=sys.stderr)
    return removed
1243
+
1244
+
1245
def recover_pending_cleanup(repo: Path, config: dict[str, Any], state: dict[str, Any], state_path: Path) -> list[str]:
    """Retry deletion of the source ids queued for cleanup in ``state``.

    Ids that belong to active sources are never deleted and are dropped from
    the queue. After the attempt the queue is rewritten with whatever could
    not be deleted (or removed when empty) and ``state`` is saved to
    ``state_path``. Returns the ids that were deleted.
    """
    queued = cleanup_pending_source_ids(state)
    if not queued:
        return []
    protected = {str(record.get("id") or "") for record in active_sources(state) if record.get("id")}
    targets = [sid for sid in queued if sid not in protected]
    if not targets:
        # Everything queued is active again, so there is nothing to clean up.
        state.pop("cleanupPendingSourceIds", None)
        write_json(state_path, state)
        return []

    workers = positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4)
    deleted = delete_source_ids_parallel(repo, notebook_id(config), targets, parallelism=workers)
    gone = set(deleted)
    leftover = [sid for sid in queued if sid not in gone and sid not in protected]
    if leftover:
        state["cleanupPendingSourceIds"] = leftover
    else:
        state.pop("cleanupPendingSourceIds", None)
    write_json(state_path, state)
    return deleted
1269
+
1270
+
1271
def recover_pending_upload(repo: Path, config: dict[str, Any], state: dict[str, Any] | None = None) -> list[str]:
    """Roll back sources recorded in a leftover pending-upload journal.

    When every journaled id is already active in ``state`` the journal is
    just cleared. Otherwise journaled ids that are not active are deleted
    from the journal's notebook (or the configured one). Entries that were
    not deleted are written back to the journal; the journal is cleared when
    none remain. Returns the ids that were deleted.
    """
    journal = read_pending_upload(repo)
    if not journal:
        return []
    entries = journal.get("sources")
    if not isinstance(entries, list):
        clear_pending_upload(repo)
        return []

    protected = {str(record.get("id")) for record in active_sources(state or {}) if record.get("id")}
    journaled = [str(entry.get("id")) for entry in entries if isinstance(entry, dict) and entry.get("id")]
    if journaled and set(journaled) <= protected:
        # The upload completed and was committed to state; nothing to undo.
        clear_pending_upload(repo)
        return []

    notebook = str(journal.get("notebookId") or notebook_id(config))
    targets = [sid for sid in journaled if sid not in protected]
    workers = positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4)
    deleted = delete_source_ids_parallel(repo, notebook, targets, parallelism=workers)

    gone = set(deleted)
    leftover = [entry for entry in entries if isinstance(entry, dict) and str(entry.get("id") or "") not in gone]
    if leftover:
        journal["sources"] = leftover
        write_pending_upload(repo, journal)
    else:
        clear_pending_upload(repo)
    return deleted
1299
+
1300
+
1301
def append_pending_source(repo: Path, journal: dict[str, Any], source: dict[str, Any], lock: threading.Lock) -> None:
    """Record ``source`` (id and title) in ``journal`` and save the journal.

    Both the in-memory update and the write happen while holding ``lock``.
    """
    entry = {"id": source.get("id"), "title": source.get("title")}
    with lock:
        recorded = journal.setdefault("sources", [])
        if isinstance(recorded, list):
            recorded.append(entry)
        write_pending_upload(repo, journal)
1307
+
1308
+
1309
def find_reusable_source(bundle: dict[str, Any], previous_sources: list[dict[str, Any]], used_ids: set[str]) -> dict[str, Any] | None:
    """Find a previously uploaded source with the same content as ``bundle``.

    A candidate must have an id not yet in ``used_ids``, a status of
    ``ready`` (a missing status counts as ready) and a matching content hash.
    The chosen id is added to ``used_ids`` so it is not handed out twice.
    Returns ``None`` when the bundle has no content hash or nothing matches.
    """
    target_sha = source_content_sha(bundle)
    if not target_sha:
        return None
    for candidate in previous_sources:
        candidate_id = str(candidate.get("id") or "")
        available = bool(candidate_id) and candidate_id not in used_ids
        is_ready = str(candidate.get("status") or "ready") == "ready"
        if available and is_ready and source_content_sha(candidate) == target_sha:
            used_ids.add(candidate_id)
            return candidate
    return None
1323
+
1324
+
1325
def upload_chunks_parallel(repo: Path, config: dict[str, Any], bundles: list[tuple[int, dict[str, Any]]], *, set_id: str) -> list[tuple[int, dict[str, Any]]]:
    """Upload ``(index, bundle)`` pairs concurrently, journaling each success.

    A pending-upload journal is written before any upload starts and updated
    after every successful upload. If any upload fails, the sources that did
    upload are deleted and the first error is re-raised. The journal is only
    cleared when every journaled source was deleted; entries whose delete
    failed stay in the journal instead of being forgotten.

    Returns the uploaded ``(index, source)`` pairs sorted by index.
    """
    if not bundles:
        return []
    nbid = notebook_id(config)
    workers = min(
        len(bundles),
        positive_int(config.get("notebooklm", {}).get("upload_parallelism"), 4),
    )
    journal: dict[str, Any] = {
        "version": 1,
        "setId": set_id,
        "notebookId": nbid,
        "startedAt": iso(),
        "sources": [],
    }
    write_pending_upload(repo, journal)
    journal_lock = threading.Lock()
    uploaded: list[tuple[int, dict[str, Any]]] = []
    errors: list[BaseException] = []

    def upload_pair(pair: tuple[int, dict[str, Any]]) -> tuple[int, dict[str, Any]]:
        index, bundle = pair
        source = upload_one_chunk(repo, config, bundle)
        append_pending_source(repo, journal, source, journal_lock)
        return index, source

    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(upload_pair, pair) for pair in bundles]
        for future in concurrent.futures.as_completed(futures):
            try:
                item = future.result()
                uploaded.append(item)
                print(f"upload {len(uploaded)}/{len(bundles)}", file=sys.stderr)
            except BaseException as error:
                # Collected rather than re-raised so the remaining futures are
                # still drained and can be rolled back below.
                errors.append(error)

    if errors:
        deleted = set(
            delete_source_ids_parallel(
                repo,
                nbid,
                [str(source.get("id") or "") for _, source in uploaded],
                parallelism=positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4),
            )
        )
        recorded = journal.get("sources")
        leftovers = [
            entry
            for entry in (recorded if isinstance(recorded, list) else [])
            if isinstance(entry, dict) and entry.get("id") and str(entry["id"]) not in deleted
        ]
        if leftovers:
            # Keep undeleted uploads on record so they are not orphaned.
            journal["sources"] = leftovers
            write_pending_upload(repo, journal)
        else:
            clear_pending_upload(repo)
        raise errors[0]
    return sorted(uploaded, key=lambda item: item[0])
1371
+
1372
+
1373
def wait_uploaded_sources_parallel(repo: Path, config: dict[str, Any], sources: list[tuple[int, dict[str, Any]]]) -> list[tuple[int, dict[str, Any]]]:
    """Wait concurrently for uploaded sources to finish processing.

    Returns ``sources`` untouched when it is empty or when
    ``notebooklm.wait_after_upload`` is disabled (it defaults to enabled
    here). Otherwise each source is waited on and returned as a copy with
    status ``ready``, sorted by index. If any wait fails, all failure
    messages are reported together through ``die``.
    """
    if not sources:
        return sources
    if not config.get("notebooklm", {}).get("wait_after_upload", True):
        return sources

    nbid = notebook_id(config)
    total = len(sources)
    pool_size = min(total, positive_int(config.get("notebooklm", {}).get("wait_parallelism"), 8))
    finished: list[tuple[int, dict[str, Any]]] = []
    failures: list[str] = []

    def wait_one(pair: tuple[int, dict[str, Any]]) -> tuple[int, dict[str, Any]]:
        position, record = pair
        sid = str(record.get("id") or "")
        if not sid:
            raise RuntimeError(f"missing source id for {record.get('title')}")
        if not wait_source_ready(repo, nbid, sid):
            raise RuntimeError(f"source processing failed for chunk {record.get('title')}: {sid}")
        return position, {**record, "status": "ready"}

    with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as pool:
        pending = [pool.submit(wait_one, pair) for pair in sources]
        for done in concurrent.futures.as_completed(pending):
            try:
                outcome = done.result()
                finished.append(outcome)
                print(f"wait {len(finished)}/{total}", file=sys.stderr)
            except Exception as error:
                failures.append(str(error))
    if failures:
        die("\n".join(failures))
    finished.sort(key=lambda entry: entry[0])
    return finished
1407
+
1408
+
1409
def upload_bundle_set(repo: Path, config: dict[str, Any], state: dict[str, Any], bundles: list[dict[str, Any]], *, set_id: str) -> dict[str, Any]:
    """Upload a chunked bundle set, reusing unchanged sources where possible.

    Any leftover pending-upload journal is recovered first. Bundles whose
    content hash matches a ready, previously active source reuse that source;
    the rest are uploaded and waited on in parallel. If waiting fails, the
    fresh uploads are deleted, the journal is cleared and the error is
    re-raised.

    Returns the new source-set record. In ``replace`` mode with
    ``delete_previous_after_success`` enabled it also carries
    ``_retiredSourceIds``: previous source ids that are neither reused nor
    kept by ``keep_previous_sources``.
    """
    nbid = notebook_id(config)
    recover_pending_upload(repo, config, state)
    previous = active_sources(state)

    claimed: set[str] = set()
    resolved: dict[int, dict[str, Any]] = {}
    to_upload: list[tuple[int, dict[str, Any]]] = []
    for position, bundle in enumerate(bundles):
        match = find_reusable_source(bundle, previous, claimed)
        if not match:
            to_upload.append((position, bundle))
            continue
        resolved[position] = source_with_chunk_metadata(match, bundle, status="ready", reused=True)

    fresh = upload_chunks_parallel(repo, config, to_upload, set_id=set_id)
    try:
        waited = wait_uploaded_sources_parallel(repo, config, fresh)
    except BaseException:
        # Roll back this run's uploads before propagating the failure.
        fresh_ids = [str(record.get("id") or "") for _, record in fresh]
        delete_source_ids_parallel(
            repo,
            nbid,
            fresh_ids,
            parallelism=positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4),
        )
        clear_pending_upload(repo)
        raise
    for position, record in waited:
        resolved[position] = record

    sources = [resolved[position] for position in sorted(resolved) if isinstance(resolved[position], dict)]
    current_ids = {str(record.get("id")) for record in sources if record.get("id")}
    previous_ids = [str(record.get("id")) for record in previous if record.get("id")]
    keep_count = int(config.get("refresh", {}).get("keep_previous_sources", 0))
    kept_ids = set(previous_ids[-keep_count:]) if keep_count > 0 else set()
    retired = [sid for sid in previous_ids if sid not in current_ids and sid not in kept_ids]

    source_set = {
        "id": set_id,
        "prefix": str(config.get("notebooklm", {}).get("source_title_prefix") or default_short_source_title_prefix()),
        "bundleSetSha256": source_set_hash(bundles),
        "uploadedAt": iso(),
        "sources": sources,
    }
    refresh = config.get("refresh", {})
    if refresh.get("mode", "replace") == "replace" and config.get("refresh", {}).get("delete_previous_after_success", True):
        source_set["_retiredSourceIds"] = retired
    return source_set
1452
+
1453
+
1454
def prune_sources(repo: Path, config: dict[str, Any], state: dict[str, Any], new_source: dict[str, Any]) -> list[str]:
    """Delete older recorded sources after a successful single-bundle upload.

    Does nothing when ``refresh.delete_previous_after_success`` is disabled.
    The newest ``refresh.keep_previous_sources`` recorded sources (default 1)
    and ``new_source`` are kept; the rest are deleted one by one. A failed
    delete only prints a warning. Returns the ids that were deleted.
    """
    refresh = config.get("refresh", {})
    if not refresh.get("delete_previous_after_success", True):
        return []

    keep_count = int(refresh.get("keep_previous_sources", 1))
    recorded = [record for record in state.get("sources", []) if record.get("id")]
    protected: set[str] = set()
    if keep_count > 0:
        protected = {str(record.get("id")) for record in recorded[-keep_count:]}
    protected.add(str(new_source.get("id", "")))

    notebook = notebook_id(config)
    removed: list[str] = []
    for record in recorded:
        sid = str(record.get("id", ""))
        if not sid or sid in protected:
            continue
        outcome = run([*notebooklm_cmd(), "source", "delete", sid, "-n", notebook, "--yes"], repo, timeout=120)
        if outcome.returncode == 0:
            removed.append(sid)
        else:
            print(f"warning: failed to delete old source {sid}", file=sys.stderr)
    return removed
1474
+
1475
+
1476
def ensure_index(
    repo: Path,
    *,
    force: bool = False,
    yes: bool = False,
    json_output: bool = False,
    command: str = "ensure",
    return_uninitialized: bool = False,
) -> dict[str, Any]:
    """Check the index for ``repo`` and refresh it if needed, under the repo lock.

    When the config file does not exist, an uninitialized status is returned
    if ``json_output`` or ``return_uninitialized`` is set; otherwise the
    missing-config message is reported through ``die``. All other work is
    delegated to ``ensure_index_locked`` while holding ``repo_lock``.
    """
    config_file = config_path(repo)
    config_missing = not config_file.exists()
    if config_missing:
        wants_status = json_output or return_uninitialized
        if wants_status:
            return uninitialized_status(repo, config_file)
        die(missing_config_message(repo, config_file, command))
    with repo_lock(repo):
        outcome = ensure_index_locked(repo, force=force, yes=yes, json_output=json_output, command=command)
    return outcome
1492
+
1493
+
1494
def ensure_index_locked(repo: Path, *, force: bool = False, yes: bool = False, json_output: bool = False, command: str = "ensure") -> dict[str, Any]:
    """Decide whether the uploaded index is still fresh and re-upload if not.

    The returned dict always carries ``status`` plus the config/state paths,
    the relevant changed paths and the fast fingerprint. ``status`` is one of
    ``fresh-ttl``, ``fresh-fingerprint``, ``needs-first-upload-approval``,
    ``stale-throttled``, ``auto-refresh-disabled``, ``fresh-bundle-hash`` or
    ``uploaded``. ``force`` bypasses every freshness/throttle shortcut;
    ``yes`` (or ``force``) satisfies the first-upload approval gate.
    ``json_output`` is accepted but not read in this function.
    """
    config, cfg_path = load_config(repo, command=command)
    state, state_path = load_state(cfg_path)
    # Finish any rollback/cleanup left over from an earlier run before
    # looking at freshness.
    recover_pending_upload(repo, config, state)
    recover_pending_cleanup(repo, config, state, state_path)
    fast_hash, relevant_paths = fast_fingerprint(repo, config, cfg_path)
    refresh = config.get("refresh", {})
    check_ttl = int(refresh.get("check_ttl_seconds", 300))
    min_interval = int(refresh.get("min_upload_interval_seconds", 900))
    max_staleness = int(refresh.get("max_staleness_seconds", 86400))
    checked_age = seconds_since(state.get("lastCheckedAt"))
    uploaded_age = seconds_since(state.get("lastUploadedAt"))
    uploaded_fingerprint = state_uploaded_fingerprint(state)

    result: dict[str, Any] = {
        "status": "unknown",
        "config": str(cfg_path),
        "state": str(state_path),
        "relevant_changed_paths": relevant_paths,
        "fast_fingerprint": fast_hash,
    }

    # Shortcut 1: checked recently and the fingerprint still matches the upload.
    if not force and checked_age is not None and checked_age < check_ttl and uploaded_fingerprint == fast_hash:
        state["lastCheckedAt"] = iso()
        state["lastCheckedFastFingerprint"] = fast_hash
        state["lastBundlePath"] = None
        write_json(state_path, state)
        result.update({"status": "fresh-ttl", "checked_age_seconds": checked_age})
        return result

    # Shortcut 2: fingerprint matches a recorded upload, regardless of TTL.
    if not force and uploaded_fingerprint == fast_hash and state.get("lastUploadedAt"):
        state["lastCheckedAt"] = iso()
        state["lastCheckedFastFingerprint"] = fast_hash
        state["lastBundlePath"] = None
        write_json(state_path, state)
        result.update({"status": "fresh-fingerprint"})
        return result

    # Approval gate: with no active sources yet, nothing is uploaded unless
    # approval is disabled in config or the caller passed yes/force. State is
    # not written on this path.
    first_upload = not active_sources(state)
    if first_upload and config.get("safety", {}).get("require_user_approval_first_upload", True) and not yes and not force:
        result.update({"status": "needs-first-upload-approval"})
        return result

    # Throttle: content changed, but the last upload is younger than both the
    # minimum upload interval and the maximum allowed staleness.
    if not force and uploaded_age is not None and uploaded_age < min_interval and uploaded_age < max_staleness:
        state["lastCheckedAt"] = iso()
        state["lastCheckedFastFingerprint"] = fast_hash
        state["lastBundlePath"] = None
        write_json(state_path, state)
        result.update({"status": "stale-throttled", "uploaded_age_seconds": uploaded_age})
        return result

    # Automatic refresh switched off in config: record the check only.
    if not refresh.get("auto", True) and not force:
        state["lastCheckedAt"] = iso()
        state["lastCheckedFastFingerprint"] = fast_hash
        state["lastBundlePath"] = None
        write_json(state_path, state)
        result.update({"status": "auto-refresh-disabled"})
        return result

    if bundle_mode(config) == "chunked":
        # Chunked mode: build every chunk bundle, then upload them as one set.
        set_id = now_utc().strftime("%y%m%d%H%M")
        bundles = build_bundle_set(repo, config, set_id=set_id, state=state)
        try:
            bundle_set_sha = source_set_hash(bundles)
            # The built bundles hash the same as the last uploaded set.
            if not force and state.get("lastBundleSetSha256") == bundle_set_sha:
                state.update({
                    "lastCheckedAt": iso(),
                    "lastCheckedFastFingerprint": fast_hash,
                    "lastBundlePath": None,
                })
                write_json(state_path, state)
                result.update({"status": "fresh-bundle-hash", "bundleSetSha256": bundle_set_sha, "bundleDeleted": True})
                return result

            source_set = upload_bundle_set(repo, config, state, bundles, set_id=set_id)
            # "_retiredSourceIds" is popped so it is not stored with the set.
            retired_ids = [str(sid) for sid in source_set.pop("_retiredSourceIds", []) if str(sid)]
            state.update({
                "lastCheckedAt": iso(),
                "lastUploadedAt": iso(),
                "lastConfigSha256": sha256_file(cfg_path),
                "lastCheckedFastFingerprint": fast_hash,
                "lastUploadedFastFingerprint": fast_hash,
                "lastFastFingerprint": fast_hash,
                "lastBundleSetSha256": bundle_set_sha,
                "lastBundleSha256": bundle_set_sha,
                "lastBundlePath": None,
                "activeSourceSet": source_set,
                "sources": [src for src in source_set.get("sources", []) if isinstance(src, dict)],
            })
            # Retired sources are queued here, not deleted; the queue is
            # processed by recover_pending_cleanup at the top of this function.
            cleanup_pending_ids = queue_cleanup_source_ids(state, retired_ids)
            # State is saved before the upload journal is cleared.
            write_json(state_path, state)
            clear_pending_upload(repo)
            result.update(
                {
                    "status": "uploaded",
                    "bundleSetSha256": bundle_set_sha,
                    "bundleDeleted": True,
                    "sourceSet": source_set,
                    "cleanupPendingSourceIds": cleanup_pending_ids,
                }
            )
            return result
        finally:
            # Remove the built bundle files on success and on failure.
            for bundle in bundles:
                if bundle.get("path"):
                    remove_file_quiet(Path(str(bundle["path"])))

    # Otherwise: build and upload one bundle file.
    bundle = build_bundle(repo, config)
    try:
        bundle_hash = sha256_file(bundle)
        # The built bundle hashes the same as the last uploaded one.
        if not force and state.get("lastBundleSha256") == bundle_hash:
            state.update({
                "lastCheckedAt": iso(),
                "lastCheckedFastFingerprint": fast_hash,
                "lastBundlePath": None,
            })
            write_json(state_path, state)
            result.update({"status": "fresh-bundle-hash", "bundleSha256": bundle_hash, "bundleDeleted": True})
            return result

        source = upload_bundle(repo, config, state, bundle, bundle_hash)
        # Drop sources that upload_bundle reported as pruned, then record the
        # new source.
        pruned_ids = set(source.pop("_prunedSourceIds", []))
        sources = [src for src in state.get("sources", []) if str(src.get("id", "")) not in pruned_ids]
        if source.get("id") or source.get("title"):
            sources.append(source)
        state.update({
            "lastCheckedAt": iso(),
            "lastUploadedAt": iso(),
            "lastConfigSha256": sha256_file(cfg_path),
            "lastCheckedFastFingerprint": fast_hash,
            "lastUploadedFastFingerprint": fast_hash,
            "lastFastFingerprint": fast_hash,
            "lastBundleSha256": bundle_hash,
            "lastBundlePath": None,
            "sources": sources,
        })
        write_json(state_path, state)
        result.update({"status": "uploaded", "bundleSha256": bundle_hash, "bundleDeleted": True, "source": source})
        return result
    finally:
        # Remove the built bundle file on success and on failure.
        remove_file_quiet(bundle)
1635
+
1636
+
1637
def ask_provider(repo: Path, question: str) -> dict[str, Any]:
    """Ask the configured notebook ``question``, limited to ready active sources.

    Each ready active source id is passed with ``-s``. Returns the parsed
    JSON output; if the output is not JSON it is wrapped as
    ``{"answer": <stdout>}``. A non-zero exit status yields an ``error``
    dict carrying stdout and stderr.
    """
    config, cfg_path = load_config(repo, command="ask")
    state, _ = load_state(cfg_path)
    notebook = notebook_id(config)

    command = [*notebooklm_cmd(), "ask", question, "-n", notebook]
    command += [part for sid in active_ready_source_ids(state) for part in ("-s", sid)]
    command.append("--json")

    outcome = run(command, repo, timeout=180)
    if outcome.returncode != 0:
        return {"error": True, "stdout": outcome.stdout, "stderr": outcome.stderr}
    try:
        parsed = json.loads(outcome.stdout)
    except json.JSONDecodeError:
        return {"answer": outcome.stdout}
    return parsed
1652
+
1653
+
1654
# Matches a relative file path: one or more "segment/" parts followed by a
# file name ending in one of the listed source/doc/config extensions.
PATH_RE = re.compile(r"(?:(?:[\w.-]+/)+[\w.@+-]+\.(?:rs|ts|tsx|js|jsx|py|go|java|kt|md|toml|yaml|yml|json|sh|sql|css|scss|html))")
# Matches either an identifier of at least 4 characters (letter/underscore
# first) or a token of at least 5 characters that may contain hyphens.
TERM_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]{3,}|[A-Za-z0-9][A-Za-z0-9_-]{4,}")
# Lower-case terms excluded from the search-term candidates produced by
# extract_candidates and high_signal_terms.
STOP_TERMS = {
    "agent",
    "authority",
    "btreemap",
    "bundle",
    "codex",
    "command",
    "docs",
    "fixture",
    "gate",
    "justfile",
    "keywords",
    "local",
    "names",
    "paths",
    "postgres",
    "postgresql",
    "real",
    "refs",
    "repo",
    "shell",
    "test",
    "trigger",
    "where",
    "which",
    "what",
    "when",
    "implemented",
    "implementation",
    "function",
    "tests",
    "files",
    "return",
    "likely",
    "line",
    "numbers",
    "source",
    "notebooklm",
}
1695
+
1696
+
1697
def answer_text(data: dict[str, Any]) -> str:
    """Return ``data["answer"]`` when it is a string, else ``data`` as JSON text."""
    answer = data.get("answer")
    if not isinstance(answer, str):
        return json.dumps(data, ensure_ascii=False)
    return answer
1702
+
1703
+
1704
def active_sources_by_id(repo: Path) -> dict[str, dict[str, Any]]:
    """Load the repo's state and index its active sources by id.

    Sources without an id are left out; for duplicate ids the later record
    wins.
    """
    _, config_file = load_config(repo, command="ask")
    state, _ = load_state(config_file)
    keyed = ((str(record.get("id") or ""), record) for record in active_sources(state))
    return {sid: record for sid, record in keyed if sid}
1713
+
1714
+
1715
def reference_path_candidates(repo: Path, source: dict[str, Any], text: str) -> list[tuple[str, int | None]]:
    """Guess which of a source's files the cited ``text`` came from.

    First, paths mentioned literally in ``text`` are matched against the
    source's ``files`` list (and must exist under ``repo``); up to five are
    returned, sorted, without line numbers. If none are mentioned, files of
    at most 2,000,000 bytes are searched for the text itself. An exact match
    yields a 1-based line number; a whitespace-normalized match yields
    ``None`` as the line. The search is skipped when the normalized text is
    shorter than 4 or longer than 240 characters, or contains
    ``<directory_structure>``. At most five results are returned.
    """
    listed = [str(entry) for entry in source.get("files", []) if str(entry)]
    known = set(listed)

    mentioned = set()
    for raw in PATH_RE.findall(text):
        candidate = raw.strip("`'\".,;:()[]{}<>")
        if candidate in known and (repo / candidate).is_file():
            mentioned.add((candidate, None))
    if mentioned:
        return sorted(mentioned)[:5]

    normalized = " ".join(text.split())
    searchable = 4 <= len(normalized) <= 240 and "<directory_structure>" not in text
    if not searchable:
        return []

    found: list[tuple[str, int | None]] = []
    for relative in listed:
        target = repo / relative
        if not target.is_file() or target.stat().st_size > 2_000_000:
            continue
        try:
            body = target.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue
        offset = body.find(text)
        if offset >= 0:
            found.append((relative, body.count("\n", 0, offset) + 1))
        elif normalized in " ".join(body.split()):
            found.append((relative, None))
        else:
            continue
        if len(found) >= 5:
            break
    return found
1750
+
1751
+
1752
def format_reference_paths(paths: list[tuple[str, int | None]]) -> str:
    """Render at most three ``path[:line]`` entries as a comma-separated string.

    A falsy line number is omitted. When more than three entries are given, a
    ``", ...(+N)"`` suffix reports how many were left out.
    """
    shown_limit = 3
    parts: list[str] = []
    for path, line in paths[:shown_limit]:
        parts.append(f"{path}:{line}" if line else path)
    text = ", ".join(parts)
    hidden = len(paths) - shown_limit
    if hidden > 0:
        text += f", ...(+{hidden})"
    return text
1756
+
1757
+
1758
def print_compact_references(repo: Path, answer: dict[str, Any]) -> None:
    """Print one ``[n] path[:line], ...`` row per resolvable citation.

    Reads ``answer["references"]``. Each citation number is handled once (first
    occurrence wins). A row is emitted only when the reference's ``source_id``
    is a known active source and at least one candidate path is found for its
    ``cited_text``. Nothing is printed when no row qualifies.
    """
    references = answer.get("references")
    if not (isinstance(references, list) and references):
        return

    known_sources = active_sources_by_id(repo)
    handled: set[str] = set()
    lines: list[str] = []
    for ref in references:
        if not isinstance(ref, dict):
            continue
        number = str(ref.get("citation_number") or "").strip()
        if not number or number in handled:
            continue
        handled.add(number)
        source = known_sources.get(str(ref.get("source_id") or ""))
        if not source:
            continue
        paths = reference_path_candidates(repo, source, str(ref.get("cited_text") or ""))
        if paths:
            lines.append(f"[{number}] {format_reference_paths(paths)}")

    if not lines:
        return
    print("\nreferences:")
    for line in lines:
        print(line)
1782
+
1783
+
1784
def extract_candidates(text: str, query: str) -> tuple[list[str], list[str]]:
    """Pull candidate paths and search terms out of a provider answer.

    Paths are the unique ``PATH_RE`` matches in ``text``, sorted. Terms are
    ``TERM_RE`` matches from ``text`` plus ``query`` with quotes and backticks
    stripped. A term is dropped when it is shorter than four characters, is
    listed in ``STOP_TERMS`` (case-insensitively), or contains ``/`` or ``.``.
    Only the first 24 terms in sorted order are returned.
    """
    unique_paths = sorted(set(PATH_RE.findall(text)))
    cleaned = (raw.strip("`'\"") for raw in TERM_RE.findall(text + "\n" + query))
    keep = {
        term
        for term in cleaned
        if len(term) >= 4
        and term.lower() not in STOP_TERMS
        and "/" not in term
        and "." not in term
    }
    return unique_paths, sorted(keep)[:24]
1795
+
1796
+
1797
def high_signal_terms(terms: list[str]) -> list[str]:
    """Prefer terms that look like identifiers or are long.

    A term counts as high-signal when it is not in ``STOP_TERMS`` and either
    contains ``_`` or ``-``, has an uppercase letter after its first
    character, or is at least 14 characters long. If none qualify, the first
    eight non-stop terms are returned instead.
    """

    def looks_like_symbol(term: str) -> bool:
        return "_" in term or "-" in term or any(ch.isupper() for ch in term[1:])

    usable = [term for term in terms if term.lower() not in STOP_TERMS]
    strong = [term for term in usable if looks_like_symbol(term) or len(term) >= 14]
    return strong or usable[:8]
1807
+
1808
+
1809
def rg_roots(repo: Path, config: dict[str, Any], candidate_paths: list[str]) -> list[list[str]]:
    """Build the ordered groups of search roots for ``rg``.

    The first group, present only when non-empty, holds the candidate paths
    that exist under ``repo``. The last group holds the configured include
    specs that exist under ``repo``, or ``["."]`` when none do.
    """
    existing_candidates = [path for path in candidate_paths if (repo / path).exists()]
    configured = [spec for spec in include_specs(config) if (repo / spec).exists()] or ["."]
    if existing_candidates:
        return [existing_candidates, configured]
    return [configured]
1819
+
1820
+
1821
def parse_rg_matches(stdout: str, seen: set[tuple[str, str, str]], remaining: int) -> list[dict[str, Any]]:
    """Parse ``path:line:text`` output lines into match records.

    Lines with fewer than two colons are ignored. ``seen`` is updated in place
    with ``(path, line, stripped_text)`` keys, and lines whose key is already
    present are skipped. Parsing stops once ``remaining`` records have been
    collected. ``line`` becomes an ``int`` when it is all digits and stays a
    string otherwise.
    """
    parsed: list[dict[str, Any]] = []
    for raw_line in stdout.splitlines():
        if len(parsed) >= remaining:
            break
        path, first_sep, rest = raw_line.partition(":")
        line_no, second_sep, text = rest.partition(":")
        if not (first_sep and second_sep):
            continue
        snippet = text.strip()
        key = (path, line_no, snippet)
        if key in seen:
            continue
        seen.add(key)
        line_value = int(line_no) if line_no.isdigit() else line_no
        parsed.append({"path": path, "line": line_value, "text": snippet})
    return parsed
1836
+
1837
+
1838
def local_rg(repo: Path, config: dict[str, Any], terms: list[str], candidate_paths: list[str] | None = None) -> list[dict[str, Any]]:
    """Search the repo with ripgrep for the high-signal terms.

    Each root group from ``rg_roots`` is searched in turn until
    ``retrieval.max_local_matches`` (default 80) records have been collected.
    Matches are de-duplicated across groups.

    Args:
        repo: Repository root; used as the working directory for ``rg``.
        config: Loaded project config.
        terms: Candidate search terms; at most 16 high-signal ones are used.
        candidate_paths: Optional paths to search before the configured roots.

    Returns:
        A list of ``{"path", "line", "text"}`` records. Returns ``[]`` when
        there is nothing to search for or ``rg`` is not installed, and
        ``[{"error": stderr}]`` when ``rg`` exits with a code other than 0 or 1.
    """
    if not terms or shutil.which("rg") is None:
        return []
    signal_terms = high_signal_terms(terms)
    if not signal_terms:
        # An empty alternation would match every line in the repo.
        return []
    pattern = "|".join(re.escape(term) for term in signal_terms[:16])
    max_matches = int(config.get("retrieval", {}).get("max_local_matches", 80))
    matches: list[dict[str, Any]] = []
    seen: set[tuple[str, str, str]] = set()
    for roots in rg_roots(repo, config, candidate_paths or []):
        remaining = max_matches - len(matches)
        if remaining <= 0:
            break
        # ripgrep omits the file name when it is given exactly one file, which
        # breaks the "path:line:text" parsing. Force the file name and the
        # one-line-per-match layout explicitly.
        cmd = ["rg", "-n", "-S", "--with-filename", "--no-heading", "-e", pattern, "--", *roots]
        result = run(cmd, repo, timeout=120)
        if result.returncode not in (0, 1):
            return [{"error": result.stderr.strip()}]
        matches.extend(parse_rg_matches(result.stdout, seen, remaining))
    return matches
1856
+
1857
+
1858
def print_result(data: Any, as_json: bool) -> None:
    """Print ``data`` either as indented JSON or as plain ``key: value`` lines.

    In plain mode a dict is printed one key per line, with nested dicts and
    lists rendered as compact JSON. Any other value is printed as-is.
    """
    if as_json:
        print(json.dumps(data, indent=2, ensure_ascii=False))
        return
    if not isinstance(data, dict):
        print(data)
        return
    for key, value in data.items():
        rendered = json.dumps(value, ensure_ascii=False) if isinstance(value, (dict, list)) else value
        print(f"{key}: {rendered}")
1870
+
1871
+
1872
def freshness_warning(freshness: dict[str, Any]) -> str | None:
    """Return a one-line warning for a freshness status, or ``None``.

    Warnings exist for ``stale-throttled``, ``needs-first-upload-approval``
    and ``auto-refresh-disabled``. The stale warning also reports the upload
    age (when present) and up to five changed paths.
    """
    status = str(freshness.get("status") or "")
    fixed_messages = {
        "needs-first-upload-approval": "warning: first broad upload requires approval; rerun with --yes or run refresh explicitly.",
        "auto-refresh-disabled": "warning: auto refresh is disabled; provider answer may lag local changes.",
    }
    if status != "stale-throttled":
        return fixed_messages.get(status)

    changed = freshness.get("relevant_changed_paths") or []
    uploaded_age = freshness.get("uploaded_age_seconds")
    age_text = "" if uploaded_age is None else f"; uploaded_age_seconds={uploaded_age}"
    changed_text = ""
    if isinstance(changed, list) and changed:
        preview = ", ".join(str(path) for path in changed[:5])
        extra = len(changed) - 5
        suffix = f", ...(+{extra})" if extra > 0 else ""
        changed_text = f"; changed={preview}{suffix}"
    return f"warning: index is stale-throttled{age_text}{changed_text}; provider answer may lag local changes. Use --force-refresh or refresh --force if needed."
1889
+
1890
+
1891
def provider_block_message(freshness: dict[str, Any]) -> str | None:
    """Return the reason the provider call must be skipped, or ``None``.

    Only the ``not-initialized`` and ``needs-first-upload-approval`` statuses
    block the provider; every other status yields ``None``.
    """
    reasons = {
        "not-initialized": "skipped; project is not initialized for project retrieval.",
        "needs-first-upload-approval": "skipped; first broad upload requires approval. Rerun ask/locate with --yes or run refresh explicitly.",
    }
    return reasons.get(str(freshness.get("status") or ""))
1898
+
1899
+
1900
def first_upload_next(repo: Path, command: str, query: str) -> dict[str, str]:
    """Suggest the follow-up command lines for approving the first upload.

    Returns two entries: ``<command>WithFirstUploadApproval`` (the same
    command rerun with ``--yes`` and the query) and ``refresh`` (a forced
    refresh).
    """
    rerun_with_approval = command_line(repo, command, "--yes", query)
    forced_refresh = command_line(repo, "refresh", "--force")
    suggestions: dict[str, str] = {}
    suggestions[f"{command}WithFirstUploadApproval"] = rerun_with_approval
    suggestions["refresh"] = forced_refresh
    return suggestions
1905
+
1906
+
1907
def provider_block_payload(freshness: dict[str, Any], *, next_steps: dict[str, str] | None = None) -> dict[str, Any]:
    """Build the stand-in answer used when the provider call is skipped.

    The payload always has ``error=True`` and a ``message`` (the block reason,
    or ``"skipped"``). A ``next`` entry is added from ``freshness["next"]``
    or, failing that, ``next_steps``, when either is truthy.
    """
    message = provider_block_message(freshness) or "skipped"
    follow_up = freshness.get("next") or next_steps
    payload: dict[str, Any] = {"error": True, "message": message}
    if not follow_up:
        return payload
    payload["next"] = follow_up
    return payload
1913
+
1914
+
1915
def print_ask_result(freshness: dict[str, Any], answer: dict[str, Any], args: argparse.Namespace) -> None:
    """Print the outcome of ``ask`` as JSON or as human-readable text.

    With ``args.json`` the freshness and answer are dumped together as JSON.
    Otherwise this prints any freshness warning, then (only with
    ``args.verbose``) the raw freshness and provider metadata, then the answer
    text and the compact reference list.
    """
    if args.json:
        print_result({"freshness": freshness, "provider_answer": answer}, True)
        return

    repo = Path(args.repo).resolve()
    warning = freshness_warning(freshness)
    if warning:
        print(warning)

    if args.verbose:
        print(f"freshness: {json.dumps(freshness, ensure_ascii=False)}")
        metadata: dict[str, Any] = {}
        for key in ("conversation_id", "turn_number", "is_follow_up"):
            if key in answer:
                metadata[key] = answer[key]
        references = answer.get("references")
        if isinstance(references, list):
            metadata["references_count"] = len(references)
        if metadata:
            print(f"provider: {json.dumps(metadata, ensure_ascii=False)}")

    print(answer_text(answer))
    print_compact_references(repo, answer)
1933
+
1934
+
1935
def print_locate_result(result: dict[str, Any], args: argparse.Namespace) -> None:
    """Print the outcome of ``locate`` as JSON or as ``key: value`` lines.

    In text mode the freshness block is shown only as a warning (and in full
    with ``args.verbose``); every other key of ``result`` is printed through
    ``print_result``.
    """
    if args.json:
        print_result(result, True)
        return

    warning = freshness_warning(result.get("freshness", {}))
    if warning:
        print(warning)
    if args.verbose:
        print(f"freshness: {json.dumps(result.get('freshness', {}), ensure_ascii=False)}")

    visible: dict[str, Any] = {}
    for key, value in result.items():
        if key != "freshness":
            visible[key] = value
    print_result(visible, False)
1946
+
1947
+
1948
def cmd_init(args: argparse.Namespace) -> None:
    """Create the project config and its ``.gitignore`` for ``args.repo``.

    Refuses to overwrite an existing config unless ``args.force`` is set. The
    notebook id comes from ``--notebook-id``. Without it, when
    ``--reuse-existing-notebook`` or ``--create-notebook`` is given, a
    notebook is looked up by title and created only if ``--create-notebook``
    is set. The command dies when neither yields a notebook. ``--include`` and
    ``--source-title-prefix`` override the matching config entries. Finishes
    by printing what was created and the suggested next commands.
    """
    repo = Path(args.repo).resolve()
    cfg_dir = repo / CONFIG_DIR
    cfg = cfg_dir / CONFIG_JSON
    if cfg.exists() and not args.force:
        die(f"config already exists: {cfg}")
    project_name = args.project_name or repo.name
    title_prefix = args.notebook_title_prefix or DEFAULT_NOTEBOOK_TITLE_PREFIX
    title = args.notebook_title or default_notebook_title(project_name, title_prefix)
    notebook_id_value = args.notebook_id or ""
    resolved_notebook: dict[str, Any] | None = None
    if not notebook_id_value and (args.reuse_existing_notebook or args.create_notebook):
        resolved_notebook = find_notebook_by_title(repo, title)
        if not resolved_notebook and args.create_notebook:
            resolved_notebook = create_notebook(repo, title)
        if not resolved_notebook:
            die(f"no NotebookLM notebook found with title {title!r}; pass --create-notebook or --notebook-id")
        notebook_id_value = str(resolved_notebook.get("id") or "")
    config = default_config(
        repo,
        notebook_id_value,
        project_name=project_name,
        notebook_title_prefix=title_prefix,
        notebook_title=title,
    )
    if args.include:
        config["bundle"]["include"] = [part.strip() for part in args.include.split(",") if part.strip()]
    if args.source_title_prefix:
        config["notebooklm"]["source_title_prefix"] = args.source_title_prefix
    write_json(cfg, config)
    # Build the ignore list from the shared file-name constants so it cannot
    # drift from the names the rest of the tool reads and writes.
    ignore_entries = [STATE_JSON, PENDING_UPLOAD_JSON, "cache/", "*.lock"]
    gitignore = cfg_dir / ".gitignore"
    gitignore.write_text("".join(f"{entry}\n" for entry in ignore_entries), encoding="utf-8")
    print(f"created: {cfg}")
    print(f"created: {gitignore}")
    print(f"notebook_title: {title}")
    if resolved_notebook:
        print(f"notebook_id: {notebook_id_value}")
    if notebook_id_value:
        print("next:")
        print(f"  {command_line(repo, 'ensure', '--yes')}")
        print(f"  {command_line(repo, 'ask', 'your question')}")
    else:
        print("next:")
        print("  set notebooklm.notebook_id in the config, or rerun init with --create-notebook / --reuse-existing-notebook / --notebook-id")
1991
+
1992
+
1993
def cmd_status(args: argparse.Namespace) -> None:
    """Print a status report for the project at ``args.repo``.

    When no config file exists, the uninitialized status is printed instead.
    Otherwise the report combines config values, recorded state timestamps and
    fingerprints, the current fast fingerprint with its changed paths, and the
    recorded sources.
    """
    repo = Path(args.repo).resolve()
    candidate = config_path(repo)
    if not candidate.exists():
        print_result(uninitialized_status(repo, candidate), args.json)
        return

    config, cfg_path = load_config(repo, command="status")
    state, state_path = load_state(cfg_path)
    fast_hash, changed = fast_fingerprint(repo, config, cfg_path)

    report: dict[str, Any] = {"initialized": True, "config": str(cfg_path), "state": str(state_path)}
    # Values taken from the config file.
    report["provider"] = config.get("provider")
    report["projectName"] = config.get("project", {}).get("name")
    report["notebook_id"] = config.get("notebooklm", {}).get("notebook_id")
    report["notebookTitle"] = notebook_title(config)
    report["sourceTitlePrefix"] = config.get("notebooklm", {}).get("source_title_prefix")
    # Values recorded in local state, plus the freshly computed fingerprint.
    report["lastCheckedAt"] = state.get("lastCheckedAt")
    report["lastUploadedAt"] = state.get("lastUploadedAt")
    report["lastBundleSha256"] = state.get("lastBundleSha256")
    report["fastFingerprint"] = fast_hash
    report["stateCheckedFastFingerprint"] = state.get("lastCheckedFastFingerprint")
    report["stateUploadedFastFingerprint"] = state_uploaded_fingerprint(state)
    report["stateFastFingerprint"] = state.get("lastFastFingerprint")
    report["relevantChangedPaths"] = changed
    report["sources"] = state.get("sources", [])
    print_result(report, args.json)
2022
+
2023
+
2024
def cmd_pack(args: argparse.Namespace) -> None:
    """Plan, and unless ``--dry-run`` is set build, the bundle set for the repo.

    The set id is ``args.set_id`` or the current UTC time as ``yymmddHHMM``.
    A dry run prints the planned chunks, with their file lists only when
    ``--include-files`` is given. Otherwise the bundles are built and a
    summary of each is printed.
    """
    repo = Path(args.repo).resolve()
    config, cfg_path = load_config(repo, command="pack")
    state, _ = load_state(cfg_path)
    set_id = args.set_id or now_utc().strftime("%y%m%d%H%M")
    chunks = plan_bundle_chunks(repo, config, set_id=set_id, state=state)

    if args.dry_run:
        planned: list[dict[str, Any]] = []
        for chunk in chunks:
            entry: dict[str, Any] = {
                "group": chunk.get("group"),
                "chunk": chunk.get("chunk"),
                "title": chunk.get("title"),
                "estimatedBytes": chunk.get("estimatedBytes"),
                "fileCount": len(chunk.get("files", [])),
            }
            if args.include_files:
                entry["files"] = chunk.get("files", [])
            planned.append(entry)
        plan = {"setId": set_id, "mode": "chunked", "chunkCount": len(chunks), "chunks": planned}
        print_result(plan, args.json)
        return

    bundles = build_bundle_set(repo, config, set_id=set_id, state=state)
    summary_fields = ("group", "chunk", "title", "path", "fileCount", "bundleSha256", "contentSha256")
    summaries = [{field: bundle.get(field) for field in summary_fields} for bundle in bundles]
    print_result({"setId": set_id, "bundleCount": len(bundles), "bundles": summaries}, args.json)
2071
+
2072
+
2073
def cmd_ensure(args: argparse.Namespace) -> None:
    """Run the index freshness check for ``args.repo`` and print its result.

    ``--force`` and ``--yes`` are passed through to ``ensure_index``.
    """
    repo = Path(args.repo).resolve()
    outcome = ensure_index(
        repo,
        force=args.force,
        yes=args.yes,
        json_output=args.json,
        command="ensure",
    )
    print_result(outcome, args.json)
2076
+
2077
+
2078
def cmd_refresh(args: argparse.Namespace) -> None:
    """Force a refresh of the index for ``args.repo`` and print the result.

    Always calls ``ensure_index`` with ``force=True`` and ``yes=True``,
    whatever flags were passed on the command line.
    """
    repo = Path(args.repo).resolve()
    outcome = ensure_index(
        repo,
        force=True,
        yes=True,
        json_output=args.json,
        command="refresh",
    )
    print_result(outcome, args.json)
2081
+
2082
+
2083
def cmd_ask(args: argparse.Namespace) -> None:
    """Answer ``args.question`` through the provider after a freshness check.

    When the freshness status blocks the provider, a stand-in payload is
    printed instead of calling it. For a pending first-upload approval the
    payload includes suggested next commands.
    """
    repo = Path(args.repo).resolve()
    freshness = ensure_index(
        repo,
        force=args.force_refresh,
        yes=args.yes,
        json_output=args.json,
        command="ask",
        return_uninitialized=True,
    )
    if provider_block_message(freshness):
        needs_approval = freshness.get("status") == "needs-first-upload-approval"
        next_steps = first_upload_next(repo, "ask", args.question) if needs_approval else None
        answer = provider_block_payload(freshness, next_steps=next_steps)
    else:
        answer = ask_provider(repo, args.question)
    print_ask_result(freshness, answer, args)
2102
+
2103
+
2104
def cmd_locate(args: argparse.Namespace) -> None:
    """Locate code for ``args.query`` using provider hints plus local ``rg``.

    After the freshness check, the provider is asked for likely paths and
    keywords. Candidate paths and terms are extracted from its answer and the
    terms are searched locally. Line references in the result always come
    from the local search. When the freshness status blocks the provider, an
    empty result explaining the block is printed instead.
    """
    repo = Path(args.repo).resolve()
    freshness = ensure_index(
        repo,
        force=args.force_refresh,
        yes=args.yes,
        json_output=args.json,
        command="locate",
        return_uninitialized=True,
    )

    blocked = provider_block_message(freshness)
    if blocked:
        next_steps = freshness.get("next")
        if not next_steps and freshness.get("status") == "needs-first-upload-approval":
            next_steps = first_upload_next(repo, "locate", args.query)
        blocked_result: dict[str, Any] = {
            "freshness": freshness,
            "notebooklm_candidates": {"paths": [], "existing_paths": [], "terms": []},
            "local_line_refs": [],
            "provider_misses_or_stale_paths": [],
            "provider_answer": f"({blocked})",
            "claim_boundary": "Semantic provider was not called because retrieval preflight is blocked.",
        }
        if next_steps:
            blocked_result["next"] = next_steps
        print_locate_result(blocked_result, args)
        return

    prompt = "".join(
        [
            "Find the code location for this repository question. Return likely repo paths, ",
            "function names, test names, command names, and keywords for rg. If exact line ",
            f"numbers are unavailable, say so. Question: {args.query}",
        ]
    )
    provider = ask_provider(repo, prompt)
    paths, terms = extract_candidates(answer_text(provider), args.query)
    config, _ = load_config(repo, command="locate")
    # Paths the provider named that exist locally vs. ones that do not.
    existing_paths = [path for path in paths if (repo / path).exists()]
    stale_paths = [path for path in paths if not (repo / path).exists()]
    matches = local_rg(repo, config, terms, existing_paths)

    if args.include_provider_answer:
        shown_answer: Any = provider
    else:
        shown_answer = "(hidden; pass --include-provider-answer)"
    result = {
        "freshness": freshness,
        "notebooklm_candidates": {"paths": paths, "existing_paths": existing_paths, "terms": terms},
        "local_line_refs": matches,
        "provider_misses_or_stale_paths": stale_paths,
        "provider_answer": shown_answer,
        "claim_boundary": "Line refs come from local rg results, not NotebookLM.",
    }
    print_locate_result(result, args)
2152
+
2153
+
2154
def temp_source_sets(state: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the dict entries of ``state["temporarySourceSets"]``.

    Yields an empty list when the key is missing or is not a list. Non-dict
    entries are dropped. The result is always a new list.
    """
    recorded = state.get("temporarySourceSets")
    if not isinstance(recorded, list):
        return []
    return [entry for entry in recorded if isinstance(entry, dict)]
2159
+
2160
+
2161
def temp_source_expires_at(ttl_seconds: int) -> str | None:
    """Return the expiry timestamp ``ttl_seconds`` from now, or ``None``.

    A zero or negative TTL means the source never expires.
    """
    if ttl_seconds > 0:
        deadline = now_utc() + dt.timedelta(seconds=ttl_seconds)
        return iso(deadline)
    return None
2165
+
2166
+
2167
def source_is_expired(source_set: dict[str, Any]) -> bool:
    """Tell whether a temporary source set's ``expiresAt`` is now or in the past.

    A set without an ``expiresAt`` value is never considered expired.
    """
    raw_expiry = source_set.get("expiresAt")
    if not raw_expiry:
        return False
    deadline = parse_iso(str(raw_expiry))
    if not deadline:
        return False
    return bool(deadline <= now_utc())
2171
+
2172
+
2173
def cmd_temp_source_upload(args: argparse.Namespace) -> None:
    """Upload one file as a temporary provider source and record it in state.

    ``args.file`` is resolved relative to ``args.repo`` when it is not
    absolute. The file is staged under a generated title and uploaded while
    holding the repo lock. When ``notebooklm.wait_after_upload`` is enabled
    (the default) and the upload returned an id, the command waits for the
    source to become ready. If it does not, the source is deleted again and
    the command dies. On success a new entry is appended to
    ``temporarySourceSets`` in the state file and printed.

    The staged copy is always removed, including when taking the repo lock
    fails.
    """
    repo = Path(args.repo).resolve()
    config, cfg_path = load_config(repo, command="temp-source upload")
    source_path = Path(args.file).expanduser()
    if not source_path.is_absolute():
        source_path = (repo / source_path).resolve()
    if not source_path.is_file():
        die(f"temp source file not found: {source_path}")
    set_id = now_utc().strftime("%y%m%d%H%M")
    content_sha = sha256_file(source_path)
    title = temp_source_title(config, set_id=set_id, kind=args.kind, title=args.title, content_sha=content_sha)
    staged_path = stage_temp_source_file(repo, title, source_path)
    # The try wraps the lock acquisition too, so a failure to obtain the lock
    # cannot leave the staged copy behind.
    try:
        with repo_lock(repo):
            # State is read only under the lock so the append below is applied
            # to the latest recorded sets.
            state, state_path = load_state(cfg_path)
            source = upload_file_source(repo, config, staged_path, title)
            status = "uploaded"
            if config.get("notebooklm", {}).get("wait_after_upload", True) and source.get("id"):
                status = "ready" if wait_source_ready(repo, notebook_id(config), str(source["id"])) else "error"
                if status != "ready":
                    # Remove the failed source before aborting.
                    delete_source_ids_parallel(
                        repo,
                        notebook_id(config),
                        [str(source.get("id") or "")],
                        parallelism=positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4),
                    )
                    die(f"source processing failed for temp source {title}: {source.get('id')}")
            active = state.get("activeSourceSet") if isinstance(state.get("activeSourceSet"), dict) else {}
            item = {
                "id": source.get("id"),
                "title": source.get("title") or title,
                "contentSha256": content_sha,
                "uploadedAt": iso(),
                "status": status,
                "origin": {
                    "activeSourceSetId": active.get("id"),
                    "chunkKeys": list(args.origin_chunk or []),
                    "filePaths": list(args.origin_file or []),
                },
            }
            source_set = {
                "id": set_id,
                "kind": slugify(args.kind),
                "purpose": args.title,
                "createdAt": iso(),
                "expiresAt": temp_source_expires_at(int(args.ttl_seconds or 0)),
                "sources": [item],
            }
            sets = temp_source_sets(state)
            sets.append(source_set)
            state["temporarySourceSets"] = sets
            write_json(state_path, state)
    finally:
        remove_file_quiet(staged_path)
    print_result({"sourceSet": source_set, "source": item}, args.json)
2229
+
2230
+
2231
def cmd_temp_source_list(args: argparse.Namespace) -> None:
    """List recorded temporary source sets and untracked provider sources.

    ``--kind`` filters the recorded sets by slugified kind. Untracked entries
    are provider sources whose title starts with the temp-source prefix
    followed by ``--`` and whose id is not in any recorded set, whatever the
    kind filter.
    """
    repo = Path(args.repo).resolve()
    config, cfg_path = load_config(repo, command="temp-source list")
    state, _ = load_state(cfg_path)

    all_sets = temp_source_sets(state)
    if args.kind:
        wanted = slugify(args.kind)
        shown_sets = [entry for entry in all_sets if str(entry.get("kind") or "") == wanted]
    else:
        shown_sets = all_sets

    title_marker = temp_source_prefix(config) + "--"
    provider_matches = []
    for src in list_sources(repo, notebook_id(config)):
        if str(src.get("title") or "").startswith(title_marker):
            provider_matches.append(src)

    tracked_ids: set[str] = set()
    for source_set in all_sets:
        for src in source_set.get("sources", []):
            if isinstance(src, dict) and src.get("id"):
                tracked_ids.add(str(src.get("id")))

    untracked = [src for src in provider_matches if str(src.get("id") or "") not in tracked_ids]
    print_result({"temporarySourceSets": shown_sets, "untrackedPrefixMatches": untracked}, args.json)
2249
+
2250
+
2251
def cmd_temp_source_cleanup(args: argparse.Namespace) -> None:
    """Delete temporary provider sources and drop them from local state.

    Recorded sets are selected by ``--set-id``, ``--kind`` and ``--expired``.
    A set must satisfy every filter that is given, and with no filter all sets
    are selected. Sources that were deleted are removed from state. A selected
    set that still has undeleted sources is kept with only those sources.
    With ``--include-untracked-prefix``, provider sources that carry the
    temp-source title prefix but are not recorded in state are deleted too.

    The command refuses to run without ``--yes``. That check happens before
    the repo lock is taken or state is read.
    """
    repo = Path(args.repo).resolve()
    config, cfg_path = load_config(repo, command="temp-source cleanup")
    if not args.yes:
        die("cleanup requires --yes")
    wanted_kind = slugify(args.kind) if args.kind else ""
    delete_parallelism = positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4)

    with repo_lock(repo):
        state, state_path = load_state(cfg_path)
        selected: list[dict[str, Any]] = []
        kept: list[dict[str, Any]] = []
        for source_set in temp_source_sets(state):
            rejected = (
                (args.set_id and str(source_set.get("id") or "") != str(args.set_id))
                or (wanted_kind and str(source_set.get("kind") or "") != wanted_kind)
                or (args.expired and not source_is_expired(source_set))
            )
            (kept if rejected else selected).append(source_set)

        source_ids = [
            str(src.get("id"))
            for source_set in selected
            for src in source_set.get("sources", [])
            if isinstance(src, dict) and src.get("id")
        ]
        deleted = delete_source_ids_parallel(
            repo,
            notebook_id(config),
            source_ids,
            parallelism=delete_parallelism,
        )
        deleted_set = set(deleted)

        # Keep selected sets that still hold sources the provider did not
        # delete, so a later cleanup can retry them.
        remaining_selected: list[dict[str, Any]] = []
        for source_set in selected:
            sources = [
                src
                for src in source_set.get("sources", [])
                if isinstance(src, dict) and str(src.get("id") or "") not in deleted_set
            ]
            if sources:
                item = dict(source_set)
                item["sources"] = sources
                remaining_selected.append(item)
        state["temporarySourceSets"] = kept + remaining_selected
        write_json(state_path, state)

        prefix = temp_source_prefix(config)
        provider_matches = [
            src
            for src in list_sources(repo, notebook_id(config))
            if str(src.get("title") or "").startswith(prefix + "--")
        ]
        tracked_ids = {
            str(src.get("id"))
            for source_set in temp_source_sets(state)
            for src in source_set.get("sources", [])
            if isinstance(src, dict) and src.get("id")
        }
        untracked = [
            src
            for src in provider_matches
            if str(src.get("id") or "") not in tracked_ids and str(src.get("id") or "") not in deleted_set
        ]
        if args.include_untracked_prefix:
            extra_ids = [str(src.get("id")) for src in untracked if src.get("id")]
            extra_deleted = delete_source_ids_parallel(
                repo,
                notebook_id(config),
                extra_ids,
                parallelism=delete_parallelism,
            )
            deleted.extend(extra_deleted)
            # Build the lookup set once rather than once per remaining source.
            extra_deleted_set = set(extra_deleted)
            untracked = [src for src in untracked if str(src.get("id") or "") not in extra_deleted_set]
    print_result({"deletedSourceIds": deleted, "untrackedPrefixMatches": untracked}, args.json)
2325
+
2326
+
2327
def build_parser() -> argparse.ArgumentParser:
    """Build the ``memdex`` command-line parser.

    Top-level subcommands are ``init``, ``status``, ``pack``, ``ensure``,
    ``refresh``, ``ask``, ``locate`` and ``temp-source`` (which has
    ``upload``, ``list`` and ``cleanup``). Each leaf parser stores its handler
    in ``func``.
    """

    def add_repo(target: argparse.ArgumentParser) -> None:
        target.add_argument("--repo", default=".")

    def add_switches(target: argparse.ArgumentParser, *names: str) -> None:
        for name in names:
            target.add_argument(name, action="store_true")

    def add_text(target: argparse.ArgumentParser, *names: str) -> None:
        for name in names:
            target.add_argument(name, default="")

    parser = argparse.ArgumentParser(prog="memdex")
    commands = parser.add_subparsers(dest="command", required=True)

    init_cmd = commands.add_parser("init")
    add_repo(init_cmd)
    add_text(init_cmd, "--notebook-id", "--project-name")
    init_cmd.add_argument("--notebook-title-prefix", default=DEFAULT_NOTEBOOK_TITLE_PREFIX)
    add_text(init_cmd, "--notebook-title")
    add_switches(init_cmd, "--reuse-existing-notebook", "--create-notebook")
    add_text(init_cmd, "--source-title-prefix", "--include")
    add_switches(init_cmd, "--force")
    init_cmd.set_defaults(func=cmd_init)

    status_cmd = commands.add_parser("status")
    add_repo(status_cmd)
    add_switches(status_cmd, "--json")
    status_cmd.set_defaults(func=cmd_status)

    pack_cmd = commands.add_parser("pack")
    add_repo(pack_cmd)
    add_text(pack_cmd, "--set-id")
    add_switches(pack_cmd, "--dry-run", "--include-files", "--json")
    pack_cmd.set_defaults(func=cmd_pack)

    ensure_cmd = commands.add_parser("ensure")
    add_repo(ensure_cmd)
    add_switches(ensure_cmd, "--force", "--yes", "--json")
    ensure_cmd.set_defaults(func=cmd_ensure)

    refresh_cmd = commands.add_parser("refresh")
    add_repo(refresh_cmd)
    add_switches(refresh_cmd, "--force", "--json")
    refresh_cmd.set_defaults(func=cmd_refresh)

    ask_cmd = commands.add_parser("ask")
    ask_cmd.add_argument("question")
    add_repo(ask_cmd)
    add_switches(ask_cmd, "--yes", "--force-refresh", "--json", "--verbose")
    ask_cmd.set_defaults(func=cmd_ask)

    locate_cmd = commands.add_parser("locate")
    locate_cmd.add_argument("query")
    add_repo(locate_cmd)
    add_switches(
        locate_cmd,
        "--yes",
        "--force-refresh",
        "--include-provider-answer",
        "--json",
        "--verbose",
    )
    locate_cmd.set_defaults(func=cmd_locate)

    temp_cmd = commands.add_parser("temp-source")
    temp_commands = temp_cmd.add_subparsers(dest="temp_command", required=True)

    upload_cmd = temp_commands.add_parser("upload")
    add_repo(upload_cmd)
    for required_flag in ("--kind", "--title", "--file"):
        upload_cmd.add_argument(required_flag, required=True)
    for repeatable_flag in ("--origin-chunk", "--origin-file"):
        upload_cmd.add_argument(repeatable_flag, action="append", default=[])
    upload_cmd.add_argument("--ttl-seconds", type=int, default=0)
    add_switches(upload_cmd, "--json")
    upload_cmd.set_defaults(func=cmd_temp_source_upload)

    list_cmd = temp_commands.add_parser("list")
    add_repo(list_cmd)
    add_text(list_cmd, "--kind")
    add_switches(list_cmd, "--json")
    list_cmd.set_defaults(func=cmd_temp_source_list)

    cleanup_cmd = temp_commands.add_parser("cleanup")
    add_repo(cleanup_cmd)
    add_text(cleanup_cmd, "--kind", "--set-id")
    add_switches(cleanup_cmd, "--expired", "--include-untracked-prefix", "--yes", "--json")
    cleanup_cmd.set_defaults(func=cmd_temp_source_cleanup)
    return parser
2419
+
2420
+
2421
def main() -> None:
    """Parse the command line and run the handler of the chosen subcommand."""
    namespace = build_parser().parse_args()
    handler = namespace.func
    handler(namespace)
2424
+
2425
+
2426
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()