memdex 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/scripts/memdex.py DELETED
@@ -1,2517 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Project-level semantic retrieval helper.
3
-
4
- This script intentionally depends only on Python stdlib for the control plane.
5
- It shells out to `npx repomix`, `notebooklm`, `git`, and `rg` when needed.
6
- """
7
-
8
- from __future__ import annotations
9
-
10
- import argparse
11
- import concurrent.futures
12
- import contextlib
13
- import datetime as dt
14
- import errno
15
- import fnmatch
16
- import hashlib
17
- import json
18
- import os
19
- import re
20
- import shlex
21
- import shutil
22
- import subprocess
23
- import sys
24
- import textwrap
25
- import threading
26
- import time
27
- from pathlib import Path
28
- from typing import Any
29
-
30
-
31
- CONFIG_DIR = ".memdex"
32
- CONFIG_JSON = "config.json"
33
- STATE_JSON = "state.local.json"
34
- PENDING_UPLOAD_JSON = "pending-upload.local.json"
35
- DEFAULT_NOTEBOOK_TITLE_PREFIX = "memdex"
36
- SCRIPT_PATH = Path(__file__).resolve()
37
- SCRIPT_CMD_ENV = "MEMDEX_CMD"
38
- LEGACY_SCRIPT_CMD_ENV = "CODEBASE_RETRIEVE_CMD"
39
- NOTEBOOKLM_PACKAGE = "git+https://github.com/teng-lin/notebooklm-py.git"
40
- NOTEBOOKLM_BIN_ENV = "NOTEBOOKLM_BIN"
41
-
42
-
43
def now_utc() -> dt.datetime:
    """Return the current time as a timezone-aware UTC datetime with microseconds zeroed."""
    current = dt.datetime.now(tz=dt.timezone.utc)
    return current.replace(microsecond=0)
45
-
46
-
47
- def iso(ts: dt.datetime | None = None) -> str:
48
- return (ts or now_utc()).isoformat().replace("+00:00", "Z")
49
-
50
-
51
- def parse_iso(value: str | None) -> dt.datetime | None:
52
- if not value:
53
- return None
54
- return dt.datetime.fromisoformat(value.replace("Z", "+00:00"))
55
-
56
-
57
def die(message: str, code: int = 2) -> None:
    """Write ``error: <message>`` to stderr and terminate by raising ``SystemExit(code)``.

    This function never returns normally.
    """
    sys.stderr.write(f"error: {message}\n")
    raise SystemExit(code)
60
-
61
-
62
def script_cmd() -> list[str]:
    """Return the argv prefix that invokes this script.

    The environment variable named by ``SCRIPT_CMD_ENV`` is consulted first,
    then the one named by ``LEGACY_SCRIPT_CMD_ENV``; the first non-blank value
    is shell-split and returned. Otherwise the current interpreter (or
    ``python3``) plus this script's resolved path is used.
    """
    for env_name in (SCRIPT_CMD_ENV, LEGACY_SCRIPT_CMD_ENV):
        configured = os.environ.get(env_name, "").strip()
        if configured:
            return shlex.split(configured)
    interpreter = sys.executable or "python3"
    return [interpreter, str(SCRIPT_PATH)]
69
-
70
-
71
def command_line(repo: Path, command: str, *parts: str) -> str:
    """Build a shell-quoted command line: script prefix, *command*, ``--repo <repo>``, then *parts*."""
    argv = script_cmd() + [command, "--repo", str(repo)] + list(parts)
    return " ".join(map(shlex.quote, argv))
74
-
75
-
76
def missing_config_message(repo: Path, config_file: Path, command: str = "") -> str:
    """Compose the multi-line help text shown when the repo has no config file.

    The text names the missing *config_file*, optionally mentions the
    *command* that needed it, and lists ready-to-run init/ask/locate commands.
    """
    header = [f"project is not initialized for project retrieval: {config_file}"]
    if command:
        # The command note sits directly under the headline.
        header.append(f"Command `{command}` needs `.memdex/config.json` before it can run.")

    create_cmd = command_line(repo, "init", "--create-notebook")
    reuse_cmd = command_line(repo, "init", "--reuse-existing-notebook")
    ask_cmd = command_line(repo, "ask", "your question")
    approve_cmd = command_line(repo, "ask", "--yes", "your question")
    locate_cmd = command_line(repo, "locate", "thing to find")

    guidance = [
        "",
        "Initialize this repo first:",
        f" {create_cmd}",
        "",
        "Or reuse an existing NotebookLM notebook with the expected title:",
        f" {reuse_cmd}",
        "",
        "Then ask or locate directly; both commands run freshness preflight:",
        f" {ask_cmd}",
        f" {locate_cmd}",
        "",
        "If this is the first broad upload and you already approve it:",
        f" {approve_cmd}",
    ]
    return "\n".join(header + guidance)
101
-
102
-
103
def uninitialized_status(repo: Path, config_file: Path) -> dict[str, Any]:
    """Return the machine-readable status payload for a repo without a config file.

    The ``next`` mapping holds suggested follow-up command lines for *repo*.
    """

    def suggest(*args: str) -> str:
        return command_line(repo, *args)

    next_steps = {
        "createNotebook": suggest("init", "--create-notebook"),
        "reuseExistingNotebook": suggest("init", "--reuse-existing-notebook"),
        "ask": suggest("ask", "your question"),
        "locate": suggest("locate", "thing to find"),
        "askWithFirstUploadApproval": suggest("ask", "--yes", "your question"),
    }
    return {
        "status": "not-initialized",
        "initialized": False,
        "config": str(config_file),
        "message": "project is not initialized for project retrieval",
        "next": next_steps,
    }
117
-
118
-
119
@contextlib.contextmanager
def repo_lock(repo: Path, *, timeout_seconds: float = 300.0):
    """Hold an exclusive, file-based lock for *repo* while the ``with`` body runs.

    The lock is the file ``<repo>/<CONFIG_DIR>/.lock``, created with
    ``O_CREAT | O_EXCL`` so only one process can create it. While the file
    exists, creation is retried every 0.2 seconds; once more than
    *timeout_seconds* have elapsed, ``die`` is called. Any other ``OSError``
    from the create propagates.

    The descriptor is closed and the lock file removed on exit, including when
    writing the diagnostic payload fails, so a failed acquisition does not
    leave a lock file behind to block later runs.
    """
    lock_path = repo / CONFIG_DIR / ".lock"
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    started = time.monotonic()
    while True:
        try:
            fd = os.open(str(lock_path), os.O_CREAT | os.O_EXCL | os.O_WRONLY)
        except FileExistsError:
            # Another holder owns the lock; wait unless the deadline has passed.
            if time.monotonic() - started > timeout_seconds:
                die(f"timed out waiting for lock: {lock_path}")
            time.sleep(0.2)
            continue
        break
    try:
        # The payload is informational (who holds the lock and since when).
        os.write(fd, f"pid={os.getpid()}\ncreatedAt={iso()}\n".encode("utf-8"))
        yield
    finally:
        os.close(fd)
        with contextlib.suppress(FileNotFoundError):
            lock_path.unlink()
145
-
146
-
147
- def run(argv: list[str], cwd: Path, *, input_text: str | None = None, timeout: int | None = None) -> subprocess.CompletedProcess[str]:
148
- return subprocess.run(
149
- argv,
150
- cwd=str(cwd),
151
- input=input_text,
152
- text=True,
153
- stdout=subprocess.PIPE,
154
- stderr=subprocess.PIPE,
155
- timeout=timeout,
156
- check=False,
157
- )
158
-
159
-
160
def require_tool(name: str) -> None:
    """Exit via ``die`` unless an executable called *name* is found on PATH."""
    if shutil.which(name) is not None:
        return
    die(f"required tool not found on PATH: {name}")
163
-
164
-
165
def notebooklm_cmd() -> list[str]:
    """Return the argv prefix for the ``notebooklm`` CLI.

    A non-blank value of the environment variable named by
    ``NOTEBOOKLM_BIN_ENV`` is shell-split and used as-is; otherwise the
    executable is looked up on PATH. When neither is available, ``die`` is
    called with installation hints.
    """
    configured = os.environ.get(NOTEBOOKLM_BIN_ENV, "").strip()
    if configured:
        return shlex.split(configured)
    located = shutil.which("notebooklm")
    if not located:
        die(
            "required tool not found on PATH: notebooklm\n"
            f"Install persistently: uv tool install {NOTEBOOKLM_PACKAGE}\n"
            f"Or set {NOTEBOOKLM_BIN_ENV}='uvx --from {NOTEBOOKLM_PACKAGE} notebooklm'"
        )
    return [located]
177
-
178
-
179
def sha256_bytes(data: bytes) -> str:
    """Return the SHA-256 hex digest of *data*, prefixed with ``sha256:``."""
    hex_digest = hashlib.sha256(data).hexdigest()
    return f"sha256:{hex_digest}"
181
-
182
-
183
def sha256_text(data: str) -> str:
    """Return the ``sha256:``-prefixed hex digest of *data* encoded as UTF-8."""
    encoded = data.encode("utf-8")
    return "sha256:" + hashlib.sha256(encoded).hexdigest()
185
-
186
-
187
def sha256_file(path: Path) -> str:
    """Return the ``sha256:``-prefixed hex digest of the file at *path*.

    The file is read in 1 MiB blocks so it is never loaded whole.
    """
    hasher = hashlib.sha256()
    block_size = 1024 * 1024
    with path.open("rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:
                break
            hasher.update(block)
    return "sha256:" + hasher.hexdigest()
193
-
194
-
195
def remove_file_quiet(path: Path) -> None:
    """Delete *path*, treating an already-missing file as success."""
    with contextlib.suppress(FileNotFoundError):
        path.unlink()
200
-
201
-
202
def default_include() -> list[str]:
    """Return a fresh list of the default include entries (directories first, then root files)."""
    directories = ["src", "crates", "packages", "apps", "bins", "docs", "scripts", "tests", "xtask"]
    root_files = ["AGENTS.md", "CLAUDE.md", "README.md", "Cargo.toml", "package.json", "justfile"]
    return directories + root_files
220
-
221
-
222
def default_groups() -> list[dict[str, Any]]:
    """Return a fresh list of the default chunk groups, each an ``id`` plus its ``include`` patterns."""
    layout = (
        ("docs", ("AGENTS.md", "CLAUDE.md", "README.md", "docs/**")),
        ("apps", ("apps/**",)),
        ("packages", ("packages/**",)),
        ("src", ("src/**", "crates/**", "bins/**", "xtask/**")),
        ("tests", ("tests/**", "testdata/**")),
        ("scripts", ("scripts/**",)),
    )
    return [{"id": group_id, "include": list(patterns)} for group_id, patterns in layout]
231
-
232
-
233
def slugify(value: str) -> str:
    """Turn *value* into a lowercase slug made of ``a-z``, ``0-9``, ``.``, ``_`` and ``-``.

    Runs of other characters become one hyphen, repeated hyphens collapse,
    and leading/trailing hyphens are dropped. An empty result becomes ``repo``.
    """
    normalized = value.strip().lower()
    hyphenated = re.sub(r"[^a-z0-9._-]+", "-", normalized)
    collapsed = re.sub(r"-+", "-", hyphenated).strip("-")
    if collapsed:
        return collapsed
    return "repo"
238
-
239
-
240
def default_notebook_title(project_name: str, title_prefix: str = DEFAULT_NOTEBOOK_TITLE_PREFIX) -> str:
    """Return the default notebook title in the form ``<title_prefix>:<project_name>``."""
    return "{}:{}".format(title_prefix, project_name)
242
-
243
-
244
def default_source_title_prefix(project_name: str, title_prefix: str = DEFAULT_NOTEBOOK_TITLE_PREFIX) -> str:
    """Return ``<slug(title_prefix)>-<slug(project_name)>-repo``."""
    pieces = [slugify(title_prefix), slugify(project_name), "repo"]
    return "-".join(pieces)
246
-
247
-
248
def default_short_source_title_prefix() -> str:
    """Return the short prefix used for uploaded source titles."""
    short_prefix = "memdex"
    return short_prefix
250
-
251
-
252
def default_config(
    repo: Path,
    notebook_id: str = "",
    *,
    project_name: str | None = None,
    notebook_title_prefix: str = DEFAULT_NOTEBOOK_TITLE_PREFIX,
    notebook_title: str | None = None,
) -> dict[str, Any]:
    """Build the default configuration mapping for *repo*.

    The project name falls back to the repository directory name, and the
    notebook title falls back to ``default_notebook_title(project, prefix)``.
    The result has the sections ``project``, ``notebooklm``, ``bundle``,
    ``refresh``, ``safety`` and ``retrieval``.
    """
    project = project_name if project_name else repo.name
    title = notebook_title if notebook_title else default_notebook_title(project, notebook_title_prefix)

    notebooklm_section = {
        "notebook_id": notebook_id,
        "notebook_title_prefix": notebook_title_prefix,
        "notebook_title": title,
        "source_title_prefix": default_short_source_title_prefix(),
        "wait_after_upload": True,
        "upload_parallelism": 4,
        "wait_parallelism": 8,
        "delete_parallelism": 4,
    }
    bundle_section = {
        "tool": "repomix",
        "mode": "chunked",
        "include": default_include(),
        "output": CONFIG_DIR + "/cache/{prefix}-{timestamp}.txt",
        "style": "",
        "compress": False,
        "target_chunk_bytes": 716800,
        "max_chunk_bytes": 900000,
        "source_title_template": "{prefix}--{set}--{group}--{chunk}--{hash}.md",
        "groups": default_groups(),
        "default_group": {"enabled": True, "id": "misc"},
    }
    refresh_section = {
        "auto": True,
        "mode": "replace",
        "check_ttl_seconds": 300,
        "min_upload_interval_seconds": 900,
        "max_staleness_seconds": 86400,
        "keep_previous_sources": 0,
        "delete_previous_after_success": True,
    }

    # Each base pattern is listed twice: once as-is (repo root) and once with a
    # "**/" prefix (any depth), in that order.
    blocked_bases = [
        ".env*",
        ".git/**",
        "node_modules/**",
        "target/**",
        "dist/**",
        "build/**",
        "coverage/**",
        ".next/**",
        ".generated/**",
        "public/**",
        "*.png",
        "*.jpg",
        "*.jpeg",
        "*.gif",
        "*.webp",
        "*.svg",
        "*.ico",
        "*.otf",
        "*.ttf",
        "*.woff",
        "*.woff2",
        "*.mp4",
        "*.mov",
        "*.zip",
        "*.tar",
        "*.gz",
    ]
    never_upload: list[str] = []
    for base in blocked_bases:
        never_upload.append(base)
        never_upload.append("**/" + base)

    return {
        "version": 1,
        "project": {
            "name": project,
        },
        "provider": "notebooklm",
        "notebooklm": notebooklm_section,
        "bundle": bundle_section,
        "refresh": refresh_section,
        "safety": {
            "require_user_approval_first_upload": True,
            "never_upload": never_upload,
        },
        "retrieval": {
            "line_numbers_require_local_verify": True,
            "max_local_matches": 80,
        },
    }
362
-
363
-
364
def config_path(repo: Path) -> Path:
    """Locate the config file for *repo*.

    Searches ``<repo>/<CONFIG_DIR>`` and then ``<repo>/.notebooklm`` for the
    JSON config name, ``config.yaml`` and ``config.yml`` (in that order) and
    returns the first existing file. When none exists, the default JSON path
    under ``CONFIG_DIR`` is returned even though it does not exist.
    """
    directories = (repo / CONFIG_DIR, repo / ".notebooklm")
    file_names = (CONFIG_JSON, "config.yaml", "config.yml")
    for directory in directories:
        for file_name in file_names:
            candidate = directory / file_name
            if candidate.exists():
                return candidate
    return repo / CONFIG_DIR / CONFIG_JSON
377
-
378
-
379
def load_config(repo: Path, *, command: str = "") -> tuple[dict[str, Any], Path]:
    """Load the repo configuration and return ``(config, path)``.

    Calls ``die`` with initialization guidance when no config file exists.
    ``.json`` files are parsed with ``json``; any other suffix is parsed with
    PyYAML's ``safe_load``, and ``die`` is called if PyYAML cannot be imported.
    """
    path = config_path(repo)
    if not path.exists():
        die(missing_config_message(repo, path, command))
    if path.suffix != ".json":
        # PyYAML is optional, so it is imported only when a YAML config is in use.
        try:
            import yaml  # type: ignore
        except Exception as error:  # pragma: no cover - depends on host env
            die(f"YAML config requires PyYAML or use JSON config instead: {error}")
        return yaml.safe_load(path.read_text()), path
    return json.loads(path.read_text()), path
390
-
391
-
392
def write_json(path: Path, value: Any) -> None:
    """Write *value* to *path* as 2-space-indented JSON with a trailing newline.

    Missing parent directories are created; non-ASCII characters are written
    unescaped.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(value, indent=2, ensure_ascii=False)
    path.write_text(rendered + "\n")
395
-
396
-
397
def load_state(config_file: Path) -> tuple[dict[str, Any], Path]:
    """Load the local state file that sits next to *config_file*.

    Returns ``(state, state_path)``; when the file does not exist the state is
    ``{"sources": []}``.
    """
    state_path = config_file.parent / STATE_JSON
    if not state_path.exists():
        return {"sources": []}, state_path
    return json.loads(state_path.read_text()), state_path
402
-
403
-
404
def include_specs(config: dict[str, Any]) -> list[str]:
    """Return the bundle include specs from *config*, or the defaults if none are configured.

    Entries are stringified and trimmed of whitespace, blank entries are
    dropped, and leading/trailing slashes are removed.
    """
    configured = config.get("bundle", {}).get("include")
    raw_items = configured if configured else default_include()
    specs: list[str] = []
    for item in raw_items:
        text = str(item).strip()
        if text:
            specs.append(text.strip("/"))
    return specs
407
-
408
-
409
def group_specs(group: dict[str, Any]) -> list[str]:
    """Return the cleaned include specs of one group (empty when the group has none).

    Entries are stringified and trimmed, blank entries are dropped, and
    leading/trailing slashes are removed.
    """
    raw_items = group.get("include") or []
    trimmed = (str(item).strip() for item in raw_items)
    return [text.strip("/") for text in trimmed if text]
412
-
413
-
414
def never_upload_specs(config: dict[str, Any]) -> list[str]:
    """Return the ignore patterns: the built-in list followed by ``safety.never_upload`` from *config*.

    Entries are stringified and trimmed; blank entries are dropped.
    """
    # Each base pattern appears twice in the built-in list: as-is (repo root)
    # and with a "**/" prefix (any depth), in that order.
    bases = [
        ".git/**",
        ".env*",
        "node_modules/**",
        ".next/**",
        "dist/**",
        "build/**",
        "coverage/**",
        ".generated/**",
        "public/**",
        "*.png",
        "*.jpg",
        "*.jpeg",
        "*.gif",
        "*.webp",
        "*.svg",
        "*.ico",
        "*.otf",
        "*.ttf",
        "*.woff",
        "*.woff2",
        "*.mp4",
        "*.mov",
        "*.zip",
        "*.tar",
        "*.gz",
    ]
    built_in = [variant for base in bases for variant in (base, "**/" + base)]
    configured = config.get("safety", {}).get("never_upload") or []
    specs: list[str] = []
    for item in (*built_in, *configured):
        text = str(item).strip()
        if text:
            specs.append(text)
    return specs
469
-
470
-
471
def path_matches_spec(path: str, spec: str) -> bool:
    """Return whether repo-relative *path* is selected by *spec*.

    A spec matches when it is ``.`` or ``*`` (everything), when it equals the
    path, when it names a directory that contains the path, or when it matches
    as an ``fnmatch`` glob. A blank spec matches nothing.

    Only literal ``./`` and ``/`` prefixes are removed during normalization.
    Stripping with ``lstrip("./")`` would remove every leading dot and slash
    as a character set, turning ``.env*`` into ``env*`` (which matches
    ``environment.py``) and reducing the ``.`` wildcard to an empty string.
    """

    def normalize(text: str) -> str:
        text = text.strip()
        while True:
            if text.startswith("./"):
                text = text[2:]
            elif text.startswith("/"):
                text = text[1:]
            else:
                return text

    clean = normalize(path)
    pattern = normalize(spec)
    if not pattern:
        return False
    if pattern in {".", "*"}:
        return True
    # Exact file match, or the spec is a directory prefix of the path.
    if clean == pattern or clean.startswith(pattern.rstrip("/") + "/"):
        return True
    return fnmatch.fnmatch(clean, pattern) or fnmatch.fnmatch("./" + clean, pattern)
481
-
482
-
483
def path_is_included(path: str, includes: list[str]) -> bool:
    """Return whether *path* matches at least one spec in *includes*."""
    return any(path_matches_spec(path, spec) for spec in includes)
488
-
489
-
490
def path_is_ignored(path: str, ignores: list[str]) -> bool:
    """Return whether *path* matches at least one spec in *ignores*."""
    for spec in ignores:
        if path_matches_spec(path, spec):
            return True
    return False
492
-
493
-
494
def bundle_mode(config: dict[str, Any]) -> str:
    """Return the configured ``bundle.mode`` as a string, defaulting to ``chunked``."""
    mode = config.get("bundle", {}).get("mode")
    if mode:
        return str(mode)
    return "chunked"
496
-
497
-
498
def parse_size_bytes(value: Any, fallback: int) -> int:
    """Convert a size setting to a byte count.

    Integers are returned unchanged. Strings of the form ``<digits>`` with an
    optional ``b``, ``kb``/``kib`` or ``mb``/``mib`` suffix (case-insensitive)
    are converted using 1024-based multipliers. Empty or unparseable values
    yield *fallback*.
    """
    if isinstance(value, int):
        return value
    text = str(value or "").strip().lower()
    if not text:
        return fallback
    parsed = re.fullmatch(r"(\d+)(?:\s*(b|kb|kib|mb|mib))?", text)
    if parsed is None:
        return fallback
    multipliers = {
        "b": 1,
        "kb": 1024,
        "kib": 1024,
        "mb": 1024 * 1024,
        "mib": 1024 * 1024,
    }
    unit = parsed.group(2) or "b"
    return int(parsed.group(1)) * multipliers[unit]
514
-
515
-
516
def positive_int(value: Any, fallback: int, *, minimum: int = 1, maximum: int = 32) -> int:
    """Coerce *value* to an int clamped to ``[minimum, maximum]``.

    *fallback* is used (and clamped the same way) when *value* cannot be
    converted. ``OverflowError`` is handled alongside ``TypeError`` and
    ``ValueError`` because ``int(float("inf"))`` raises it, and JSON configs
    may contain ``Infinity``.
    """
    try:
        parsed = int(value)
    except (TypeError, ValueError, OverflowError):
        parsed = fallback
    return max(minimum, min(maximum, parsed))
522
-
523
-
524
def list_git_files(repo: Path) -> list[str]:
    """Return the sorted repo-relative paths of tracked and untracked-but-not-ignored files.

    Uses ``git ls-files -co --exclude-standard``. If that command exits
    non-zero, falls back to walking *repo* recursively and listing every file
    whose relative path does not start with ``.git/``.
    """
    listing = run(["git", "ls-files", "-co", "--exclude-standard"], repo)
    if listing.returncode == 0:
        return sorted(line.strip() for line in listing.stdout.splitlines() if line.strip())
    walked = (entry.relative_to(repo).as_posix() for entry in repo.rglob("*") if entry.is_file())
    return sorted(rel for rel in walked if not rel.startswith(".git/"))
537
-
538
-
539
def collect_bundle_files(repo: Path, config: dict[str, Any]) -> list[str]:
    """Return the sorted, de-duplicated files eligible for bundling.

    A file qualifies when it matches an include spec, matches no never-upload
    spec, and is a regular file on disk that is not a symlink.
    """
    includes = include_specs(config)
    ignores = never_upload_specs(config)
    selected: set[str] = set()
    for rel in list_git_files(repo):
        if not path_is_included(rel, includes):
            continue
        if path_is_ignored(rel, ignores):
            continue
        candidate = repo / rel
        if candidate.is_file() and not candidate.is_symlink():
            selected.add(rel)
    return sorted(selected)
553
-
554
-
555
def chunk_file_size(repo: Path, path: str) -> int:
    """Estimate the bytes *path* adds to a chunk: file size + UTF-8 path length + 64."""
    content_bytes = (repo / path).stat().st_size
    name_bytes = len(path.encode("utf-8"))
    return content_bytes + name_bytes + 64
558
-
559
-
560
def file_bucket(path: str) -> str:
    """Return the leading directory prefix used to keep related files adjacent.

    Paths under ``apps``, ``packages`` or ``crates`` keep up to three leading
    segments; all other paths keep up to two.
    """
    segments = path.split("/")
    depth = 3 if segments[0] in {"apps", "packages", "crates"} else 2
    # Slicing never over-runs, so shorter paths are returned whole.
    return "/".join(segments[:depth])
567
-
568
-
569
def source_title_for_chunk(config: dict[str, Any], *, set_id: str, group: str, index: int, chunk_hash: str) -> str:
    """Render the upload title of one chunk from ``bundle.source_title_template``.

    The prefix comes from ``notebooklm.source_title_prefix``; a blank value or
    one starting with ``codebase-retrieve-`` is replaced by the short default.
    The template fields are ``prefix``, ``set``/``set_id``, ``group``,
    ``chunk``/``idx`` (index zero-padded to three digits) and ``hash`` (first
    eight characters of *chunk_hash*).
    """
    configured = str(config.get("notebooklm", {}).get("source_title_prefix") or "").strip()
    if not configured or configured.startswith("codebase-retrieve-"):
        prefix = default_short_source_title_prefix()
    else:
        prefix = configured
    template = str(
        config.get("bundle", {}).get("source_title_template")
        or "{prefix}--{set}--{group}--{chunk}--{hash}.md"
    )
    padded_index = f"{index:03d}"
    fields = {
        "prefix": slugify(prefix),
        "set": set_id,
        "set_id": set_id,
        "group": slugify(group),
        "chunk": padded_index,
        "idx": padded_index,
        "hash": chunk_hash[:8],
    }
    return template.format(**fields)
586
-
587
-
588
def chunk_hash_for_files(repo: Path, files: list[str]) -> str:
    """Return a SHA-256 hex digest covering the names and contents of *files*.

    For each path, in order, the digest is fed the UTF-8 path, a NUL byte, the
    file's bytes (when it is a regular file under *repo*), and another NUL byte.
    """
    hasher = hashlib.sha256()
    separator = b"\0"
    block_size = 1024 * 1024
    for rel in files:
        hasher.update(rel.encode("utf-8"))
        hasher.update(separator)
        target = repo / rel
        if target.is_file():
            with target.open("rb") as stream:
                while True:
                    block = stream.read(block_size)
                    if not block:
                        break
                    hasher.update(block)
        hasher.update(separator)
    return hasher.hexdigest()
600
-
601
-
602
def assign_files_to_groups(files: list[str], config: dict[str, Any]) -> list[tuple[str, str]]:
    """Assign each file to the first configured group whose specs match it.

    Returns ``(group_id, path)`` pairs. Groups default to ``default_groups()``
    only when the ``groups`` key is absent. Unmatched files go to the default
    group when it is enabled. When it is disabled and no groups are
    configured, every file is assigned to ``repo``.
    """
    bundle = config.get("bundle", {})
    groups = (bundle["groups"] if "groups" in bundle else default_groups()) or []
    assigned: list[tuple[str, str]] = []
    claimed: set[str] = set()

    for group in groups:
        group_id = slugify(str(group.get("id") or "group"))
        specs = group_specs(group)
        if not specs:
            # A group without specs can never claim a file.
            continue
        for path in files:
            if path not in claimed and path_is_included(path, specs):
                assigned.append((group_id, path))
                claimed.add(path)

    if "default_group" in bundle:
        catch_all = bundle["default_group"] or {}
    else:
        catch_all = {"enabled": True, "id": "misc"}

    if catch_all.get("enabled"):
        group_id = slugify(str(catch_all.get("id") or "misc"))
        for path in files:
            if path not in claimed:
                assigned.append((group_id, path))
                claimed.add(path)
    elif not groups:
        assigned.extend(("repo", path) for path in files)
    return assigned
629
-
630
-
631
def flush_chunk(chunks: list[dict[str, Any]], repo: Path, config: dict[str, Any], set_id: str, group: str, index: int, files: list[str], total: int) -> None:
    """Append a chunk record for *files* to *chunks*; do nothing when *files* is empty.

    The record holds the group, the zero-padded and numeric chunk index, a
    copy of the file list, the estimated byte total, the ``sha256:``-prefixed
    content hash, and the rendered source title.
    """
    if not files:
        return
    digest = chunk_hash_for_files(repo, files)
    title = source_title_for_chunk(config, set_id=set_id, group=group, index=index, chunk_hash=digest)
    record = {
        "group": group,
        "chunk": f"{index:03d}",
        "index": index,
        "files": list(files),
        "estimatedBytes": total,
        "sha256": "sha256:" + digest,
        "title": title,
    }
    chunks.append(record)
646
-
647
-
648
- def active_chunk_file_members(state: dict[str, Any] | None, group: str) -> list[list[str]]:
649
- if not state:
650
- return []
651
- members: list[tuple[int, list[str]]] = []
652
- for source in active_sources(state):
653
- if str(source.get("group") or "") != group:
654
- continue
655
- files = source.get("files")
656
- if not isinstance(files, list) or not files:
657
- continue
658
- chunk = str(source.get("chunk") or "0")
659
- try:
660
- index = int(chunk)
661
- except ValueError:
662
- index = 0
663
- clean_files = [str(path) for path in files if str(path)]
664
- if clean_files:
665
- members.append((index, clean_files))
666
- return [files for _, files in sorted(members, key=lambda item: item[0])]
667
-
668
-
669
def append_greedy_chunks(
    chunks: list[dict[str, Any]],
    repo: Path,
    config: dict[str, Any],
    *,
    set_id: str,
    group: str,
    start_index: int,
    files: list[str],
    target: int,
    max_bytes: int,
) -> int:
    """Pack *files*, in order, into chunks of roughly *target* bytes and append them to *chunks*.

    A chunk is closed as soon as adding the next file would push its estimated
    size past *target*. ``die`` is called for any single file larger than
    *max_bytes*. Chunk numbering starts at *start_index*; the return value is
    the next unused index.
    """
    pending: list[str] = []
    pending_bytes = 0
    next_index = start_index
    for path in files:
        size = chunk_file_size(repo, path)
        if size > max_bytes:
            die(f"file exceeds max chunk size ({max_bytes} bytes): {path} ({size} bytes)")
        if pending and pending_bytes + size > target:
            flush_chunk(chunks, repo, config, set_id, group, next_index, pending, pending_bytes)
            pending, pending_bytes = [], 0
            next_index += 1
        pending.append(path)
        pending_bytes += size
    if pending:
        flush_chunk(chunks, repo, config, set_id, group, next_index, pending, pending_bytes)
        next_index += 1
    return next_index
699
-
700
-
701
def plan_group_chunks(
    chunks: list[dict[str, Any]],
    repo: Path,
    config: dict[str, Any],
    *,
    set_id: str,
    group: str,
    files: list[str],
    target: int,
    max_bytes: int,
    state: dict[str, Any] | None,
) -> None:
    """Plan the chunks of one group, reusing previous chunk membership where possible.

    Each previously active chunk keeps its surviving files together as long as
    their combined estimated size fits in *max_bytes*. Those carried-over
    chunks are numbered from 1. The remaining files, ordered by bucket and
    path, are packed greedily after them. ``die`` is called for any single
    file larger than *max_bytes*.
    """
    ordered = sorted(files, key=lambda path: (file_bucket(path), path))
    unplaced = set(ordered)
    carried: list[list[str]] = []
    for previous_files in active_chunk_file_members(state, group):
        survivors = [path for path in previous_files if path in unplaced]
        if not survivors:
            continue
        sizes = [chunk_file_size(repo, path) for path in survivors]
        for path, size in zip(survivors, sizes):
            if size > max_bytes:
                die(f"file exceeds max chunk size ({max_bytes} bytes): {path} ({size} bytes)")
        if sum(sizes) <= max_bytes:
            carried.append(survivors)
            unplaced.difference_update(survivors)

    next_index = 1
    for members in carried:
        estimated = sum(chunk_file_size(repo, path) for path in members)
        flush_chunk(chunks, repo, config, set_id, group, next_index, members, estimated)
        next_index += 1

    leftovers = [path for path in ordered if path in unplaced]
    append_greedy_chunks(
        chunks,
        repo,
        config,
        set_id=set_id,
        group=group,
        start_index=next_index,
        files=leftovers,
        target=target,
        max_bytes=max_bytes,
    )
749
-
750
-
751
def plan_bundle_chunks(repo: Path, config: dict[str, Any], *, set_id: str, state: dict[str, Any] | None = None) -> list[dict[str, Any]]:
    """Plan every chunk of a bundle set and return the chunk records.

    Eligible files are assigned to groups, and each group (in sorted id order)
    is planned separately. The target chunk size is capped at the maximum
    chunk size.
    """
    bundle = config.get("bundle", {})
    max_bytes = parse_size_bytes(bundle.get("max_chunk_bytes"), 900000)
    target = min(parse_size_bytes(bundle.get("target_chunk_bytes"), 716800), max_bytes)

    grouped: dict[str, list[str]] = {}
    for group, path in assign_files_to_groups(collect_bundle_files(repo, config), config):
        grouped.setdefault(group, []).append(path)

    planned: list[dict[str, Any]] = []
    for group in sorted(grouped):
        plan_group_chunks(
            planned,
            repo,
            config,
            set_id=set_id,
            group=group,
            files=grouped[group],
            target=target,
            max_bytes=max_bytes,
            state=state,
        )
    return planned
775
-
776
-
777
def git_head(repo: Path) -> str:
    """Return the output of ``git rev-parse HEAD`` for *repo*, or ``no-git-head`` if it fails."""
    probe = run(["git", "rev-parse", "HEAD"], repo)
    return probe.stdout.strip() if probe.returncode == 0 else "no-git-head"
782
-
783
-
784
def git_status_records(repo: Path) -> list[tuple[str, str]]:
    """Return ``(status, path)`` pairs parsed from ``git status --porcelain=v1 -z``.

    Returns ``[]`` when the git command exits non-zero. *status* is the
    two-character XY code and *path* is the entry's current path.

    In the ``-z`` format a renamed or copied entry is followed by an extra
    NUL-terminated field holding the original path. The rename/copy marker can
    appear in either status column (index or worktree), so both are checked;
    testing only the first column would parse the original path of a
    worktree-column rename as a separate, malformed record.
    """
    result = run(["git", "status", "--porcelain=v1", "-z", "--untracked-files=all"], repo)
    if result.returncode != 0:
        return []
    fields = iter(part for part in result.stdout.split("\0") if part)
    records: list[tuple[str, str]] = []
    for entry in fields:
        status = entry[:2]
        path = entry[3:]
        if "R" in status or "C" in status:
            # Consume the trailing original-path field that belongs to this entry.
            next(fields, None)
        records.append((status, path))
    return records
801
-
802
-
803
def fast_fingerprint(repo: Path, config: dict[str, Any], config_file: Path) -> tuple[str, list[str]]:
    """Compute a fingerprint of the repo state relevant to the bundle.

    The fingerprint hashes the git HEAD, the config file digest, and one line
    per changed path that is included and not ignored: its status, its path,
    and a content marker (file digest, ``dir`` or ``missing``). Returns
    ``(fingerprint, relevant_changed_paths)``.
    """
    includes = include_specs(config)
    ignores = never_upload_specs(config)
    lines = [f"head={git_head(repo)}", f"config={sha256_file(config_file)}"]
    changed: list[str] = []
    for status, path in git_status_records(repo):
        if not path_is_included(path, includes) or path_is_ignored(path, ignores):
            continue
        changed.append(path)
        target = repo / path
        if target.is_file():
            marker = sha256_file(target)
        else:
            marker = "dir" if target.exists() else "missing"
        lines.append(f"{status} {path} {marker}")
    return sha256_text("\n".join(lines)), changed
821
-
822
-
823
def seconds_since(value: str | None) -> float | None:
    """Return the seconds elapsed since the ISO timestamp *value*, or ``None`` if it is empty."""
    moment = parse_iso(value)
    if moment:
        return (now_utc() - moment).total_seconds()
    return None
828
-
829
-
830
- def state_uploaded_fingerprint(state: dict[str, Any]) -> str | None:
831
- return state.get("lastUploadedFastFingerprint")
832
-
833
-
834
def expand_bundle_path(repo: Path, config: dict[str, Any]) -> Path:
    """Return the output path for a single-file bundle.

    ``bundle.output`` (or the default cache template) is formatted with
    ``prefix`` and a UTC ``timestamp`` and resolved under *repo*. The prefix
    falls back to ``<repo name>-repo`` when none is configured.
    """
    template = config.get("bundle", {}).get("output") or f"{CONFIG_DIR}/cache/{{prefix}}-{{timestamp}}.txt"
    prefix = config.get("notebooklm", {}).get("source_title_prefix") or f"{repo.name}-repo"
    stamp = now_utc().strftime("%Y%m%dT%H%M%SZ")
    return repo / template.format(prefix=prefix, timestamp=stamp)
840
-
841
-
842
def expand_chunk_path(repo: Path, config: dict[str, Any], title: str) -> Path:
    """Return the output path for the chunk called *title*.

    When the ``bundle.output`` template contains ``{title}``, it is formatted
    with ``title``, ``prefix`` and ``timestamp``. Otherwise only the
    template's parent directory is used and *title* becomes the file name.
    """
    template = config.get("bundle", {}).get("output") or f"{CONFIG_DIR}/cache/{{title}}"
    if "{title}" not in template:
        return (repo / template).parent / title
    prefix = config.get("notebooklm", {}).get("source_title_prefix") or default_short_source_title_prefix()
    stamp = now_utc().strftime("%Y%m%dT%H%M%SZ")
    return repo / template.format(title=title, prefix=prefix, timestamp=stamp)
849
-
850
-
851
def repomix_cmd() -> list[str]:
    """Return the argv prefix for repomix: the installed binary, else ``npx repomix``.

    Calls ``die`` when neither ``repomix`` nor ``npx`` is on PATH.
    """
    direct = shutil.which("repomix")
    if direct:
        return [direct]
    if not shutil.which("npx"):
        die("required tool not found on PATH: repomix or npx")
    return ["npx", "repomix"]
858
-
859
-
860
def repomix_base_argv(config: dict[str, Any]) -> list[str]:
    """Return the repomix argv shared by all bundle builds.

    Adds ``--style`` and ``--compress`` when configured, and always passes the
    never-upload patterns as a comma-joined ``--ignore`` value.
    """
    bundle = config.get("bundle", {})
    argv = repomix_cmd()
    style = str(bundle.get("style") or "").strip()
    if style:
        argv += ["--style", style]
    if bundle.get("compress"):
        argv += ["--compress"]
    ignore = ",".join(never_upload_specs(config))
    if ignore:
        argv += ["--ignore", ignore]
    return argv
872
-
873
-
874
def build_bundle(repo: Path, config: dict[str, Any]) -> Path:
    """Run repomix over the include specs and return the path of the single bundle it wrote.

    Calls ``die`` with repomix's output when the command exits non-zero.
    """
    out = expand_bundle_path(repo, config)
    out.parent.mkdir(parents=True, exist_ok=True)
    include = ",".join(include_specs(config))
    argv = repomix_base_argv(config) + ["--include", include, "--output", str(out)]
    result = run(argv, repo, timeout=600)
    if result.returncode != 0:
        die(f"repomix failed:\n{result.stdout}\n{result.stderr}")
    return out
883
-
884
-
885
def build_bundle_set(repo: Path, config: dict[str, Any], *, set_id: str, state: dict[str, Any] | None = None) -> list[dict[str, Any]]:
    """Render every planned chunk with repomix and return the bundle records.

    Each record is the planned chunk plus ``path``, ``bundleSha256``,
    ``contentSha256``, ``fileListSha256``, ``actualBytes`` and ``fileCount``.
    ``die`` is called when repomix fails or a rendered chunk exceeds
    ``bundle.max_chunk_bytes``.

    On any failure every output file touched by this call is removed before
    the exception propagates. That includes the file of the chunk that failed,
    which would be missed if cleanup only covered chunks already recorded as
    successful.
    """
    max_bytes = parse_size_bytes(config.get("bundle", {}).get("max_chunk_bytes"), 900000)
    chunks = plan_bundle_chunks(repo, config, set_id=set_id, state=state)
    bundles: list[dict[str, Any]] = []
    written: list[Path] = []
    try:
        for chunk in chunks:
            title = str(chunk["title"])
            out = expand_chunk_path(repo, config, title)
            out.parent.mkdir(parents=True, exist_ok=True)
            # Track the path before rendering so a failed or oversize chunk is cleaned up too.
            written.append(out)
            input_text = "\n".join(str(path) for path in chunk["files"]) + "\n"
            argv = [*repomix_base_argv(config), "--stdin", "--output", str(out)]
            result = run(argv, repo, input_text=input_text, timeout=600)
            if result.returncode != 0:
                die(f"repomix failed for chunk {title}:\n{result.stdout}\n{result.stderr}")
            actual_size = out.stat().st_size
            if actual_size > max_bytes:
                die(f"rendered chunk exceeds max size ({max_bytes} bytes): {title} ({actual_size} bytes)")
            item = dict(chunk)
            item["path"] = str(out)
            item["bundleSha256"] = sha256_file(out)
            item["contentSha256"] = item["bundleSha256"]
            item["fileListSha256"] = item.get("sha256")
            item["actualBytes"] = actual_size
            item["fileCount"] = len(chunk["files"])
            bundles.append(item)
    except BaseException:
        # BaseException is deliberate: die() raises SystemExit, which must also trigger cleanup.
        for path in written:
            remove_file_quiet(path)
        raise
    return bundles
916
-
917
-
918
def notebook_id(config: dict[str, Any]) -> str:
    """Return ``notebooklm.notebook_id`` from *config* as a string; ``die`` when it is missing or empty."""
    notebooklm_section = config.get("notebooklm", {})
    configured = notebooklm_section.get("notebook_id", "")
    if not configured:
        die("notebooklm.notebook_id missing in config")
    return str(configured)
923
-
924
-
925
def notebook_title(config: dict[str, Any]) -> str:
    """Return the notebook title for *config*.

    An explicit ``notebooklm.notebook_title`` wins; otherwise the title is
    derived from the title prefix and the project name (default ``repo``).
    """
    notebooklm_section = config.get("notebooklm", {})
    project = str(config.get("project", {}).get("name") or "repo")
    prefix = str(notebooklm_section.get("notebook_title_prefix") or DEFAULT_NOTEBOOK_TITLE_PREFIX)
    explicit = notebooklm_section.get("notebook_title")
    return str(explicit if explicit else default_notebook_title(project, prefix))
929
-
930
-
931
- def parse_notebook_json(stdout: str, fallback_title: str) -> dict[str, Any] | None:
932
- try:
933
- data = json.loads(stdout)
934
- except json.JSONDecodeError:
935
- return None
936
- candidates = [data]
937
- if isinstance(data, dict):
938
- for key in ("notebook", "data", "result"):
939
- value = data.get(key)
940
- if isinstance(value, dict):
941
- candidates.append(value)
942
- for item in candidates:
943
- if not isinstance(item, dict):
944
- continue
945
- nid = item.get("id") or item.get("notebook_id") or item.get("notebookId")
946
- title = item.get("title") or item.get("name") or fallback_title
947
- if nid:
948
- return {"id": str(nid), "title": str(title)}
949
- return None
950
-
951
-
952
def list_notebooks(repo: Path) -> list[dict[str, Any]]:
    """Return the notebooks reported by ``notebooklm list --json`` as a list of dicts.

    Accepts either an object with a ``notebooks`` array or a bare JSON array;
    non-dict entries are dropped. Calls ``die`` when the command fails or
    prints invalid JSON.

    The payload type is checked before any lookup, because calling ``.get``
    on a bare-array payload raises ``AttributeError``.
    """
    result = run([*notebooklm_cmd(), "list", "--json"], repo, timeout=120)
    if result.returncode != 0:
        die(f"notebooklm list failed:\n{result.stdout}\n{result.stderr}")
    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError as error:
        die(f"notebooklm list returned invalid JSON: {error}")
    if isinstance(data, dict):
        notebooks = data.get("notebooks", [])
    elif isinstance(data, list):
        notebooks = data
    else:
        notebooks = []
    if not isinstance(notebooks, list):
        # e.g. {"notebooks": null}: treat as no notebooks instead of failing to iterate.
        notebooks = []
    return [item for item in notebooks if isinstance(item, dict)]
962
-
963
-
964
def find_notebook_by_title(repo: Path, title: str) -> dict[str, Any] | None:
    """Return ``{"id", "title"}`` of the single notebook titled exactly *title*.

    Returns ``None`` when there is no such notebook and calls ``die`` when
    several notebooks share the title.
    """
    matches = [entry for entry in list_notebooks(repo) if str(entry.get("title", "")) == title]
    if len(matches) > 1:
        ids = ", ".join(str(entry.get("id", "")) for entry in matches)
        die(f"multiple notebooks found with title {title!r}: {ids}")
    if not matches:
        return None
    only = matches[0]
    return {"id": str(only.get("id", "")), "title": str(only.get("title", title))}
973
-
974
-
975
def create_notebook(repo: Path, title: str) -> dict[str, Any]:
    """Create a notebook named *title* and return its ``{"id", "title"}``.

    The id is taken from the create command's JSON output when possible,
    otherwise resolved by listing notebooks and matching the title. Calls
    ``die`` when creation fails or the id cannot be resolved.
    """
    create_cmd = [*notebooklm_cmd(), "create", title, "--json"]
    created = run(create_cmd, repo, timeout=180)
    if created.returncode != 0:
        die(f"notebooklm create failed:\n{created.stdout}\n{created.stderr}")

    from_output = parse_notebook_json(created.stdout, title)
    if from_output:
        return from_output
    from_listing = find_notebook_by_title(repo, title)
    if from_listing:
        return from_listing
    die(f"created notebook but could not resolve notebook id for title {title!r}")
986
-
987
-
988
def list_sources(repo: Path, nbid: str) -> list[dict[str, Any]]:
    """Return the sources of notebook *nbid* as a list of dicts.

    Best-effort: a failing command or unparseable output yields ``[]``
    rather than an error. The CLI output may be a top-level JSON array or
    an object with a ``sources`` array; non-dict entries are dropped.
    """
    result = run([*notebooklm_cmd(), "source", "list", "-n", nbid, "--json"], repo, timeout=120)
    if result.returncode != 0:
        return []
    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError:
        return []
    # A bare list has no ``.get``; branch on the payload type before reading
    # the ``sources`` key so the list form does not raise AttributeError.
    sources = data.get("sources") if isinstance(data, dict) else data
    if not isinstance(sources, list):
        return []
    return [src for src in sources if isinstance(src, dict)]
998
-
999
-
1000
def find_source_by_title(repo: Path, nbid: str, title: str) -> dict[str, Any] | None:
    """Return ``{"id", "title"}`` for the first source titled exactly *title*.

    Sources with a matching title but no id are skipped. Returns ``None``
    when no usable match exists.
    """
    ids_with_title = (
        src.get("id")
        for src in list_sources(repo, nbid)
        if str(src.get("title", "")) == title
    )
    first_id = next((sid for sid in ids_with_title if sid), None)
    if first_id is None:
        return None
    return {"id": str(first_id), "title": title}
1008
-
1009
-
1010
def find_uploaded_source(before: list[dict[str, Any]], after: list[dict[str, Any]], bundle: Path, prefix: str, title_hint: str | None = None) -> dict[str, Any]:
    """Identify the source created by an upload by diffing two listings.

    Args:
        before: Source listing taken before the upload.
        after: Source listing taken after the upload.
        bundle: Uploaded file; its name is the expected title by default.
        prefix: Title prefix that also counts as a match.
        title_hint: Expected title overriding ``bundle.name``.

    Returns:
        ``{"id", "title"}`` of the first matching source that is new in
        *after*; failing that, the first matching source of any age;
        failing that, ``{"id": "", "title": <expected title>}``.
    """
    before_ids = {str(src.get("id")) for src in before if src.get("id")}
    basename = title_hint or bundle.name

    def first_match(*, new_only: bool) -> dict[str, Any] | None:
        for src in after:
            # ``or ""`` so an explicit ``None`` id/title is treated as
            # missing instead of becoming the truthy string "None".
            title = str(src.get("title") or "")
            sid = str(src.get("id") or "")
            if not sid or (new_only and sid in before_ids):
                continue
            if title == basename or title.startswith(prefix):
                return {"id": sid, "title": title or basename}
        return None

    return first_match(new_only=True) or first_match(new_only=False) or {"id": "", "title": basename}
1024
-
1025
-
1026
def source_from_add_json(stdout: str, bundle: Path, title_hint: str | None = None) -> dict[str, Any] | None:
    """Extract ``{"id", "title"}`` from ``source add --json`` output.

    The id is read from the top-level object or from a nested ``source``,
    ``data`` or ``result`` object, under ``id``, ``source_id`` or
    ``sourceId``. The title falls back to ``name``, then *title_hint*, then
    the bundle file name. Returns ``None`` for invalid JSON or a missing id.
    """
    try:
        payload = json.loads(stdout)
    except json.JSONDecodeError:
        return None

    candidates = [payload]
    if isinstance(payload, dict):
        for wrapper_key in ("source", "data", "result"):
            inner = payload.get(wrapper_key)
            if isinstance(inner, dict):
                candidates.append(inner)

    for candidate in candidates:
        if not isinstance(candidate, dict):
            continue
        found_id = candidate.get("id") or candidate.get("source_id") or candidate.get("sourceId")
        if not found_id:
            continue
        found_title = candidate.get("title") or candidate.get("name") or title_hint or bundle.name
        return {"id": str(found_id), "title": str(found_title)}
    return None
1045
-
1046
-
1047
def upload_bundle(repo: Path, config: dict[str, Any], state: dict[str, Any], bundle: Path, bundle_hash: str) -> dict[str, Any]:
    """Upload a single bundle file as a notebook source.

    The source listing is captured before and after the upload so the new
    source can be identified when the add command's JSON carries no id.
    When ``notebooklm.wait_after_upload`` is set the source is waited on
    (a failed wait only warns). In ``replace`` refresh mode older sources
    are pruned and their ids returned under ``_prunedSourceIds``.

    Returns the source record, stamped with ``bundleSha256`` and
    ``uploadedAt``. Calls ``die`` when the add command fails.
    """
    nbid = notebook_id(config)
    notebooklm_cfg = config.get("notebooklm", {})
    title_prefix = str(notebooklm_cfg.get("source_title_prefix") or bundle.stem)

    sources_before = list_sources(repo, nbid)
    add_cmd = [*notebooklm_cmd(), "source", "add", str(bundle), "-n", nbid, "--json"]
    added = run(add_cmd, repo, timeout=600)
    if added.returncode != 0:
        die(f"notebooklm source add failed:\n{added.stdout}\n{added.stderr}")
    sources_after = list_sources(repo, nbid)

    source = source_from_add_json(added.stdout, bundle)
    if not source:
        source = find_uploaded_source(sources_before, sources_after, bundle, title_prefix)
    source["bundleSha256"] = bundle_hash
    source["uploadedAt"] = iso()

    if config.get("notebooklm", {}).get("wait_after_upload") and source.get("id"):
        wait_cmd = [*notebooklm_cmd(), "source", "wait", str(source["id"]), "-n", nbid]
        waited = run(wait_cmd, repo, timeout=600)
        if waited.returncode != 0:
            print(f"warning: source wait failed for {source['id']}", file=sys.stderr)

    refresh_mode = config.get("refresh", {}).get("mode", "replace")
    if refresh_mode == "replace":
        pruned = prune_sources(repo, config, state, source)
        if pruned:
            source["_prunedSourceIds"] = pruned
    return source
1068
-
1069
-
1070
def upload_file_source(repo: Path, config: dict[str, Any], path: Path, title: str) -> dict[str, Any]:
    """Upload *path* as a source titled *title* and return its record.

    The id comes from the add command's JSON output, or from a title
    lookup when the output carries none. Calls ``die`` when the upload
    fails or no id can be resolved.
    """
    nbid = notebook_id(config)
    add_cmd = [*notebooklm_cmd(), "source", "add", str(path), "-n", nbid, "--title", title, "--json"]
    added = run(add_cmd, repo, timeout=600)
    if added.returncode != 0:
        die(f"notebooklm source add failed for {title}:\n{added.stdout}\n{added.stderr}")

    source = source_from_add_json(added.stdout, path, title)
    if not source:
        source = find_source_by_title(repo, nbid, title)
    resolved = bool(source) and bool(source.get("id"))
    if not resolved:
        die(f"uploaded source but could not resolve source id for {title}")
    return source
1079
-
1080
-
1081
def wait_source_ready(repo: Path, nbid: str, source_id: str) -> bool:
    """Run ``notebooklm source wait`` and report whether it exited with 0."""
    wait_cmd = [*notebooklm_cmd(), "source", "wait", source_id, "-n", nbid]
    outcome = run(wait_cmd, repo, timeout=600)
    return outcome.returncode == 0
1084
-
1085
-
1086
def source_content_sha(value: dict[str, Any]) -> str:
    """Return the first non-empty content digest recorded on *value*.

    Keys are tried in order: ``contentSha256``, ``chunkSha256``,
    ``bundleSha256``. Returns ``""`` when none is set.
    """
    for key in ("contentSha256", "chunkSha256", "bundleSha256"):
        digest = value.get(key)
        if digest:
            return str(digest)
    return ""
1088
-
1089
-
1090
def source_file_list_sha(value: dict[str, Any]) -> str:
    """Return the file-list digest of *value*.

    Prefers ``fileListSha256`` and falls back to ``sha256``; returns ``""``
    when neither is set.
    """
    digest = value.get("fileListSha256")
    if not digest:
        digest = value.get("sha256")
    return str(digest) if digest else ""
1092
-
1093
-
1094
def chunk_key(value: dict[str, Any]) -> str:
    """Return the ``"<group>/<chunk>"`` key identifying a chunk record."""
    group = value.get("group")
    chunk = value.get("chunk")
    return f"{group}/{chunk}"
1096
-
1097
-
1098
def temp_source_prefix(config: dict[str, Any]) -> str:
    """Return the title prefix used for temporary sources.

    A non-blank ``notebooklm.temporary_source_title_prefix`` is slugified
    and used as-is. Otherwise the regular source title prefix (configured
    or default) is stripped and suffixed with ``tmp``.
    """
    notebooklm_cfg = config.get("notebooklm", {})
    override = str(notebooklm_cfg.get("temporary_source_title_prefix") or "").strip()
    if override:
        return slugify(override)
    regular_prefix = config.get("notebooklm", {}).get("source_title_prefix") or default_short_source_title_prefix()
    base = str(regular_prefix).strip()
    return f"{base}tmp"
1103
-
1104
-
1105
def temp_source_title(config: dict[str, Any], *, set_id: str, kind: str, title: str, content_sha: str) -> str:
    """Build the ``.md`` title of a temporary source.

    The title is ``<prefix>--<set_id>--<kind>--<title>--<digest8>.md`` where
    *kind* and *title* are slugified and ``digest8`` is the first eight
    characters of *content_sha* after any ``algo:`` prefix.
    """
    digest = content_sha.split(":", 1)[-1]
    prefix = temp_source_prefix(config)
    kind_slug = slugify(kind)
    title_slug = slugify(title)
    short_digest = digest[:8]
    return f"{prefix}--{set_id}--{kind_slug}--{title_slug}--{short_digest}.md"
1108
-
1109
-
1110
def stage_temp_source_file(repo: Path, title: str, source_path: Path) -> Path:
    """Copy *source_path* into the repo's cache directory under *title*.

    The destination's parent directory is created when missing. Returns
    the path of the staged copy.
    """
    cache_dir = repo / CONFIG_DIR / "cache"
    destination = cache_dir / title
    destination.parent.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(source_path, destination)
    return destination
1115
-
1116
-
1117
def source_with_chunk_metadata(source: dict[str, Any], bundle: dict[str, Any], *, status: str, reused: bool = False) -> dict[str, Any]:
    """Return a copy of *source* annotated with its chunk's metadata.

    Group, chunk, digests, file list and *status* are copied from *bundle*.
    A reused source is flagged with ``reused`` and ``reusedAt``; a fresh
    one gets ``uploadedAt``. *source* itself is not modified.
    """
    bundle_sha = bundle.get("bundleSha256")
    annotated = {
        **source,
        "group": bundle.get("group"),
        "chunk": bundle.get("chunk"),
        "chunkKey": chunk_key(bundle),
        "chunkSha256": bundle_sha,
        "contentSha256": bundle.get("contentSha256") or bundle.get("bundleSha256"),
        "fileListSha256": bundle.get("fileListSha256") or bundle.get("sha256"),
        "fileCount": bundle.get("fileCount"),
        "files": list(bundle.get("files", [])),
        "status": status,
    }
    if reused:
        annotated["reused"] = True
    timestamp_key = "reusedAt" if reused else "uploadedAt"
    annotated[timestamp_key] = iso()
    return annotated
1138
-
1139
-
1140
def upload_one_chunk(repo: Path, config: dict[str, Any], bundle: dict[str, Any]) -> dict[str, Any]:
    """Upload one chunk bundle and return its annotated source record.

    *bundle* must provide ``path`` and ``title``. The resulting record
    carries the chunk metadata with status ``"uploaded"``. Calls ``die``
    when the upload fails or the new source id cannot be resolved.
    """
    nbid = notebook_id(config)
    chunk_path = Path(str(bundle["path"]))
    chunk_title = str(bundle["title"])

    add_cmd = [*notebooklm_cmd(), "source", "add", str(chunk_path), "-n", nbid, "--title", chunk_title, "--json"]
    added = run(add_cmd, repo, timeout=600)
    if added.returncode != 0:
        die(f"notebooklm source add failed for chunk {chunk_title}:\n{added.stdout}\n{added.stderr}")

    source = source_from_add_json(added.stdout, chunk_path, chunk_title)
    if not source:
        source = find_source_by_title(repo, nbid, chunk_title)
    resolved = bool(source) and bool(source.get("id"))
    if not resolved:
        die(f"uploaded chunk but could not resolve source id for {chunk_title}")
    return source_with_chunk_metadata(source, bundle, status="uploaded")
1151
-
1152
-
1153
def source_set_hash(bundles: list[dict[str, Any]]) -> str:
    """Return one digest summarising an ordered list of bundles.

    Each bundle contributes a line of group, chunk, content digest and
    file-list digest; the joined lines are hashed with ``sha256_text``.
    """
    lines: list[str] = []
    for bundle in bundles:
        group = bundle.get("group")
        chunk = bundle.get("chunk")
        content_sha = source_content_sha(bundle)
        file_list_sha = source_file_list_sha(bundle)
        lines.append(f"{group} {chunk} {content_sha} {file_list_sha}")
    return sha256_text("\n".join(lines))
1159
-
1160
-
1161
def active_sources(state: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the source records currently considered active.

    Uses ``activeSourceSet.sources`` when that is a list and otherwise the
    top-level ``sources`` list. Non-dict entries are dropped.
    """
    source_set = state.get("activeSourceSet")
    entries = source_set.get("sources") if isinstance(source_set, dict) else None
    if not isinstance(entries, list):
        entries = state.get("sources", [])
    return [entry for entry in entries if isinstance(entry, dict)]
1168
-
1169
-
1170
def active_ready_source_ids(state: dict[str, Any]) -> list[str]:
    """Return the ids of active sources whose status is ``"ready"``.

    A source without a status counts as ready; a source without an id is
    skipped.
    """
    ready_ids: list[str] = []
    for src in active_sources(state):
        sid = str(src.get("id") or "")
        is_ready = str(src.get("status") or "ready") == "ready"
        if sid and is_ready:
            ready_ids.append(sid)
    return ready_ids
1180
-
1181
-
1182
def cleanup_pending_source_ids(state: dict[str, Any]) -> list[str]:
    """Return the queued cleanup ids as de-duplicated strings.

    Reads ``cleanupPendingSourceIds``; anything other than a list yields
    ``[]``. Entries are stringified, empty strings are dropped and first
    occurrence order is kept.
    """
    raw = state.get("cleanupPendingSourceIds")
    if not isinstance(raw, list):
        return []
    ordered: dict[str, None] = {}
    for item in raw:
        text = str(item)
        if text:
            ordered.setdefault(text, None)
    return list(ordered)
1187
-
1188
-
1189
def queue_cleanup_source_ids(state: dict[str, Any], source_ids: list[str]) -> list[str]:
    """Merge *source_ids* into the state's pending-cleanup queue.

    Already queued ids come first, duplicates and empty ids are dropped,
    and ids of currently active sources are never queued. The merged list
    is stored under ``cleanupPendingSourceIds`` (the key is removed when
    the list is empty) and returned.
    """
    active_ids: set[str] = set()
    for src in active_sources(state):
        if src.get("id"):
            active_ids.add(str(src.get("id") or ""))

    candidates = dict.fromkeys([*cleanup_pending_source_ids(state), *source_ids])
    queued = [sid for sid in candidates if sid and sid not in active_ids]

    if not queued:
        state.pop("cleanupPendingSourceIds", None)
    else:
        state["cleanupPendingSourceIds"] = queued
    return queued
1197
-
1198
-
1199
def pending_upload_path(repo: Path) -> Path:
    """Return the path of the pending-upload journal inside *repo*."""
    return repo.joinpath(CONFIG_DIR, PENDING_UPLOAD_JSON)
1201
-
1202
-
1203
def clear_pending_upload(repo: Path) -> None:
    """Remove the pending-upload journal via ``remove_file_quiet``."""
    journal_file = pending_upload_path(repo)
    remove_file_quiet(journal_file)
1205
-
1206
-
1207
def write_pending_upload(repo: Path, value: dict[str, Any]) -> None:
    """Write *value* to the pending-upload journal as JSON."""
    journal_file = pending_upload_path(repo)
    write_json(journal_file, value)
1209
-
1210
-
1211
def read_pending_upload(repo: Path) -> dict[str, Any] | None:
    """Load the pending-upload journal.

    Returns:
        ``None`` when no journal file exists; the parsed object when it is
        a JSON object; ``{"sources": []}`` when the file is unreadable as
        text, is not valid JSON, or holds a non-object value.
    """
    path = pending_upload_path(repo)
    try:
        # Read directly instead of checking ``exists()`` first: the journal
        # can be removed between the check and the read.
        raw = path.read_text()
    except FileNotFoundError:
        return None
    except UnicodeDecodeError:
        # A journal that cannot be decoded is as unusable as invalid JSON.
        return {"sources": []}
    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        return {"sources": []}
    return data if isinstance(data, dict) else {"sources": []}
1220
-
1221
-
1222
def delete_source_ids_parallel(repo: Path, nbid: str, source_ids: list[str], *, parallelism: int) -> list[str]:
    """Delete sources from notebook *nbid* using a thread pool.

    Empty and duplicate ids are ignored. A failed deletion prints a
    warning to stderr and is left out of the result. Progress is reported
    on stderr after each successful deletion.

    Returns the ids that were deleted, in completion order.
    """
    unique_ids = list(dict.fromkeys(sid for sid in source_ids if sid))
    if not unique_ids:
        return []
    total = len(unique_ids)
    pool_size = min(total, max(1, parallelism))

    def delete_one(sid: str) -> str | None:
        delete_cmd = [*notebooklm_cmd(), "source", "delete", sid, "-n", nbid, "--yes"]
        outcome = run(delete_cmd, repo, timeout=120)
        if outcome.returncode == 0:
            return sid
        print(f"warning: failed to delete source {sid}", file=sys.stderr)
        return None

    deleted: list[str] = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as executor:
        pending = [executor.submit(delete_one, sid) for sid in unique_ids]
        for done in concurrent.futures.as_completed(pending):
            removed = done.result()
            if not removed:
                continue
            deleted.append(removed)
            print(f"cleanup {len(deleted)}/{total}", file=sys.stderr)
    return deleted
1244
-
1245
-
1246
def recover_pending_cleanup(repo: Path, config: dict[str, Any], state: dict[str, Any], state_path: Path) -> list[str]:
    """Retry deletion of sources queued in ``cleanupPendingSourceIds``.

    Ids belonging to active sources are never deleted and are dropped from
    the queue. Ids whose deletion fails stay queued. The state file is
    rewritten whenever the queue was non-empty.

    Returns the ids that were deleted.
    """
    pending_ids = cleanup_pending_source_ids(state)
    if not pending_ids:
        return []

    active_ids = {str(src.get("id") or "") for src in active_sources(state) if src.get("id")}
    deletable = [sid for sid in pending_ids if sid not in active_ids]
    if not deletable:
        state.pop("cleanupPendingSourceIds", None)
        write_json(state_path, state)
        return []

    delete_parallelism = positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4)
    deleted = delete_source_ids_parallel(repo, notebook_id(config), deletable, parallelism=delete_parallelism)

    succeeded = set(deleted)
    still_pending = [sid for sid in deletable if sid not in succeeded]
    if not still_pending:
        state.pop("cleanupPendingSourceIds", None)
    else:
        state["cleanupPendingSourceIds"] = still_pending
    write_json(state_path, state)
    return deleted
1270
-
1271
-
1272
def recover_pending_upload(repo: Path, config: dict[str, Any], state: dict[str, Any] | None = None) -> list[str]:
    """Delete sources left behind by an interrupted chunk upload.

    The pending-upload journal lists sources that were uploaded but never
    recorded as active. Journalled ids that are not active in *state* are
    deleted from the notebook named in the journal (falling back to the
    configured notebook).

    Only entries whose deletion failed are kept in the journal for a later
    retry. Entries without an id, and entries that are already active, are
    dropped because there is nothing to delete for them.

    Returns the ids that were deleted.
    """
    pending = read_pending_upload(repo)
    if not pending:
        return []
    sources = pending.get("sources")
    if not isinstance(sources, list):
        clear_pending_upload(repo)
        return []

    active_ids = {str(src.get("id")) for src in active_sources(state or {}) if src.get("id")}
    entries = [src for src in sources if isinstance(src, dict) and src.get("id")]
    delete_ids = [sid for sid in (str(src.get("id")) for src in entries) if sid not in active_ids]
    if not delete_ids:
        # Nothing deletable remains, so the journal has served its purpose.
        clear_pending_upload(repo)
        return []

    nbid = str(pending.get("notebookId") or notebook_id(config))
    deleted = delete_source_ids_parallel(
        repo,
        nbid,
        delete_ids,
        parallelism=positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4),
    )

    # Build the lookup set once rather than once per journal entry.
    failed_ids = set(delete_ids) - set(deleted)
    remaining = [src for src in entries if str(src.get("id")) in failed_ids]
    if remaining:
        pending["sources"] = remaining
        write_pending_upload(repo, pending)
    else:
        clear_pending_upload(repo)
    return deleted
1300
-
1301
-
1302
def append_pending_source(repo: Path, journal: dict[str, Any], source: dict[str, Any], lock: threading.Lock) -> None:
    """Record an uploaded source in the journal and persist it.

    The id and title of *source* are appended to ``journal["sources"]``
    and the whole journal is rewritten, all while holding *lock*.
    """
    entry = {"id": source.get("id"), "title": source.get("title")}
    with lock:
        recorded = journal.setdefault("sources", [])
        if isinstance(recorded, list):
            recorded.append(entry)
        write_pending_upload(repo, journal)
1308
-
1309
-
1310
def find_reusable_source(bundle: dict[str, Any], previous_sources: list[dict[str, Any]], used_ids: set[str]) -> dict[str, Any] | None:
    """Find a previously uploaded source with the same content as *bundle*.

    A candidate must have an id not yet in *used_ids*, be ready (a missing
    status counts as ready) and carry the same content digest. The chosen
    id is added to *used_ids* so it is handed out only once.

    Returns the matching source record, or ``None``.
    """
    wanted_sha = source_content_sha(bundle)
    if not wanted_sha:
        return None
    for candidate in previous_sources:
        sid = str(candidate.get("id") or "")
        available = bool(sid) and sid not in used_ids
        ready = str(candidate.get("status") or "ready") == "ready"
        if available and ready and source_content_sha(candidate) == wanted_sha:
            used_ids.add(sid)
            return candidate
    return None
1324
-
1325
-
1326
def upload_chunks_parallel(repo: Path, config: dict[str, Any], bundles: list[tuple[int, dict[str, Any]]], *, set_id: str) -> list[tuple[int, dict[str, Any]]]:
    """Upload ``(index, bundle)`` pairs concurrently.

    A pending-upload journal is written before the first upload and
    extended after every successful one, so an interrupted run can be
    cleaned up later.

    If any upload fails, the chunks that did upload are deleted again and
    the first error is re-raised. The journal is cleared only when every
    one of those deletions succeeded; otherwise it is rewritten to list
    the sources that could not be deleted so that a later recovery pass
    can retry instead of leaving them orphaned in the notebook.

    Returns the ``(index, source)`` pairs sorted by index.
    """
    if not bundles:
        return []
    nbid = notebook_id(config)
    workers = min(
        len(bundles),
        positive_int(config.get("notebooklm", {}).get("upload_parallelism"), 4),
    )
    journal: dict[str, Any] = {
        "version": 1,
        "setId": set_id,
        "notebookId": nbid,
        "startedAt": iso(),
        "sources": [],
    }
    write_pending_upload(repo, journal)
    journal_lock = threading.Lock()
    uploaded: list[tuple[int, dict[str, Any]]] = []
    errors: list[BaseException] = []

    def upload_pair(pair: tuple[int, dict[str, Any]]) -> tuple[int, dict[str, Any]]:
        index, bundle = pair
        source = upload_one_chunk(repo, config, bundle)
        append_pending_source(repo, journal, source, journal_lock)
        return index, source

    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [executor.submit(upload_pair, pair) for pair in bundles]
        for future in concurrent.futures.as_completed(futures):
            try:
                item = future.result()
                uploaded.append(item)
                print(f"upload {len(uploaded)}/{len(bundles)}", file=sys.stderr)
            except BaseException as error:
                # Collected rather than raised so every finished upload is
                # known before cleanup starts.
                errors.append(error)

    if errors:
        deleted = set(
            delete_source_ids_parallel(
                repo,
                nbid,
                [str(source.get("id") or "") for _, source in uploaded],
                parallelism=positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4),
            )
        )
        leftovers = [
            {"id": source.get("id"), "title": source.get("title")}
            for _, source in uploaded
            if source.get("id") and str(source.get("id")) not in deleted
        ]
        if leftovers:
            # Keep a record of what is still in the notebook; all workers
            # have finished, so no lock is needed here.
            journal["sources"] = leftovers
            write_pending_upload(repo, journal)
        else:
            clear_pending_upload(repo)
        raise errors[0]
    return sorted(uploaded, key=lambda item: item[0])
1372
-
1373
-
1374
def wait_uploaded_sources_parallel(repo: Path, config: dict[str, Any], sources: list[tuple[int, dict[str, Any]]]) -> list[tuple[int, dict[str, Any]]]:
    """Wait concurrently until every uploaded source has been processed.

    Returns *sources* unchanged when it is empty or when
    ``notebooklm.wait_after_upload`` is disabled (it defaults to enabled
    here). Otherwise returns copies of the records with status
    ``"ready"``, sorted by index. Calls ``die`` with all collected
    messages when any source lacks an id or fails to become ready.
    """
    if not sources:
        return sources
    if not config.get("notebooklm", {}).get("wait_after_upload", True):
        return sources

    nbid = notebook_id(config)
    total = len(sources)
    pool_size = min(total, positive_int(config.get("notebooklm", {}).get("wait_parallelism"), 8))
    ready: list[tuple[int, dict[str, Any]]] = []
    failures: list[str] = []

    def wait_one(pair: tuple[int, dict[str, Any]]) -> tuple[int, dict[str, Any]]:
        index, source = pair
        sid = str(source.get("id") or "")
        if not sid:
            raise RuntimeError(f"missing source id for {source.get('title')}")
        if not wait_source_ready(repo, nbid, sid):
            raise RuntimeError(f"source processing failed for chunk {source.get('title')}: {sid}")
        return index, {**source, "status": "ready"}

    with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as executor:
        pending = [executor.submit(wait_one, pair) for pair in sources]
        for done in concurrent.futures.as_completed(pending):
            try:
                finished = done.result()
                ready.append(finished)
                print(f"wait {len(ready)}/{total}", file=sys.stderr)
            except Exception as error:
                failures.append(str(error))

    if failures:
        die("\n".join(failures))
    ready.sort(key=lambda pair: pair[0])
    return ready
1408
-
1409
-
1410
def upload_bundle_set(repo: Path, config: dict[str, Any], state: dict[str, Any], bundles: list[dict[str, Any]], *, set_id: str) -> dict[str, Any]:
    """Upload a chunked bundle set, reusing unchanged sources where possible.

    Steps, in order:
      1. Clean up leftovers of an interrupted upload.
      2. For each bundle, reuse a previous ready source with the same
         content digest, or schedule the bundle for upload.
      3. Upload the scheduled bundles in parallel and wait for processing.
      4. Work out which previous source ids are retired.

    Returns:
        The new source-set record (``id``, ``prefix``, ``bundleSetSha256``,
        ``uploadedAt``, ``sources``). When refresh mode is ``replace`` and
        ``delete_previous_after_success`` is enabled, the record also
        carries ``_retiredSourceIds`` for the caller to act on.
    """
    nbid = notebook_id(config)
    recover_pending_upload(repo, config, state)
    previous_sources = active_sources(state)
    # Ids already matched to a bundle, so one old source is never reused twice.
    used_reuse_ids: set[str] = set()
    # Slot per bundle, filled by reuse or upload, to keep the bundle order.
    sources_by_index: list[dict[str, Any] | None] = [None] * len(bundles)
    upload_pairs: list[tuple[int, dict[str, Any]]] = []
    for index, bundle in enumerate(bundles):
        reusable = find_reusable_source(bundle, previous_sources, used_reuse_ids)
        if reusable:
            sources_by_index[index] = source_with_chunk_metadata(reusable, bundle, status="ready", reused=True)
        else:
            upload_pairs.append((index, bundle))
    uploaded_sources = upload_chunks_parallel(repo, config, upload_pairs, set_id=set_id)
    try:
        ready_sources = wait_uploaded_sources_parallel(repo, config, uploaded_sources)
    except BaseException:
        # Waiting failed: remove everything uploaded in this run, then
        # re-raise the original error.
        # NOTE(review): the journal is cleared even if some of these
        # deletions fail - confirm that leaving such sources untracked is
        # acceptable.
        delete_source_ids_parallel(
            repo,
            nbid,
            [str(source.get("id") or "") for _, source in uploaded_sources],
            parallelism=positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4),
        )
        clear_pending_upload(repo)
        raise
    for index, source in ready_sources:
        sources_by_index[index] = source
    sources = [source for source in sources_by_index if isinstance(source, dict)]
    active_ids = {str(src.get("id")) for src in sources if src.get("id")}
    previous_ids = [str(src.get("id")) for src in previous_sources if src.get("id")]
    # The last ``keep_previous_sources`` previous ids are exempt from retirement.
    keep_previous = int(config.get("refresh", {}).get("keep_previous_sources", 0))
    keep_ids = set(previous_ids[-keep_previous:]) if keep_previous > 0 else set()
    # Retired = previously active, not part of the new set, not exempt.
    retired_ids = [sid for sid in previous_ids if sid not in active_ids and sid not in keep_ids]
    source_set = {
        "id": set_id,
        "prefix": str(config.get("notebooklm", {}).get("source_title_prefix") or default_short_source_title_prefix()),
        "bundleSetSha256": source_set_hash(bundles),
        "uploadedAt": iso(),
        "sources": sources,
    }
    if config.get("refresh", {}).get("mode", "replace") == "replace" and config.get("refresh", {}).get("delete_previous_after_success", True):
        source_set["_retiredSourceIds"] = retired_ids
    return source_set
1453
-
1454
-
1455
def prune_sources(repo: Path, config: dict[str, Any], state: dict[str, Any], new_source: dict[str, Any]) -> list[str]:
    """Delete older recorded sources after a successful single-bundle upload.

    Nothing is deleted when ``refresh.delete_previous_after_success`` is
    disabled. The newest ``refresh.keep_previous_sources`` recorded sources
    (default 1) and *new_source* are always kept. A failed deletion only
    prints a warning.

    Returns the ids that were deleted.
    """
    refresh_cfg = config.get("refresh", {})
    if not refresh_cfg.get("delete_previous_after_success", True):
        return []

    keep_count = int(refresh_cfg.get("keep_previous_sources", 1))
    recorded = [src for src in state.get("sources", []) if src.get("id")]
    protected: set[str] = set()
    if keep_count > 0:
        protected.update(str(src.get("id")) for src in recorded[-keep_count:])
    protected.add(str(new_source.get("id", "")))

    nbid = notebook_id(config)
    pruned: list[str] = []
    for src in recorded:
        sid = str(src.get("id", ""))
        if not sid or sid in protected:
            continue
        delete_cmd = [*notebooklm_cmd(), "source", "delete", sid, "-n", nbid, "--yes"]
        outcome = run(delete_cmd, repo, timeout=120)
        if outcome.returncode == 0:
            pruned.append(sid)
        else:
            print(f"warning: failed to delete old source {sid}", file=sys.stderr)
    return pruned
1475
-
1476
-
1477
def ensure_index(
    repo: Path,
    *,
    force: bool = False,
    yes: bool = False,
    json_output: bool = False,
    command: str = "ensure",
    return_uninitialized: bool = False,
) -> dict[str, Any]:
    """Bring the remote index up to date while holding the repo lock.

    When the config file is missing, an "uninitialized" status is returned
    if *json_output* or *return_uninitialized* is set; otherwise ``die`` is
    called with a message for *command*. All other arguments are forwarded
    to ``ensure_index_locked``.
    """
    config_file = config_path(repo)
    initialized = config_file.exists()
    if not initialized and (json_output or return_uninitialized):
        return uninitialized_status(repo, config_file)
    if not initialized:
        die(missing_config_message(repo, config_file, command))
    with repo_lock(repo):
        return ensure_index_locked(
            repo,
            force=force,
            yes=yes,
            json_output=json_output,
            command=command,
        )
1493
-
1494
-
1495
def ensure_index_locked(repo: Path, *, force: bool = False, yes: bool = False, json_output: bool = False, command: str = "ensure") -> dict[str, Any]:
    """Decide whether the index needs a refresh and upload it if so.

    The checks run in this order, each returning early with a ``status``:

    * ``fresh-ttl``: checked within ``check_ttl_seconds`` and the
      fingerprint equals the uploaded one.
    * ``fresh-fingerprint``: fingerprint equals the uploaded one.
    * ``needs-first-upload-approval``: no active sources yet and approval
      is required but neither *yes* nor *force* was given.
    * ``stale-throttled``: last upload is younger than both
      ``min_upload_interval_seconds`` and ``max_staleness_seconds``.
    * ``auto-refresh-disabled``: ``refresh.auto`` is off.
    * ``fresh-bundle-hash``: the freshly built bundle(s) hash to the last
      uploaded value.
    * ``uploaded``: a new bundle or bundle set was uploaded.

    *force* bypasses every freshness/throttle check. Built bundle files are
    always removed before returning.

    Returns:
        A result dict with ``status``, ``config``, ``state``,
        ``relevant_changed_paths``, ``fast_fingerprint`` and
        status-specific extras.
    """
    config, cfg_path = load_config(repo, command=command)
    state, state_path = load_state(cfg_path)
    # Finish any cleanup left over from earlier runs before deciding anything.
    recover_pending_upload(repo, config, state)
    recover_pending_cleanup(repo, config, state, state_path)
    fast_hash, relevant_paths = fast_fingerprint(repo, config, cfg_path)
    refresh = config.get("refresh", {})
    check_ttl = int(refresh.get("check_ttl_seconds", 300))
    min_interval = int(refresh.get("min_upload_interval_seconds", 900))
    max_staleness = int(refresh.get("max_staleness_seconds", 86400))
    checked_age = seconds_since(state.get("lastCheckedAt"))
    uploaded_age = seconds_since(state.get("lastUploadedAt"))
    uploaded_fingerprint = state_uploaded_fingerprint(state)

    result: dict[str, Any] = {
        "status": "unknown",
        "config": str(cfg_path),
        "state": str(state_path),
        "relevant_changed_paths": relevant_paths,
        "fast_fingerprint": fast_hash,
    }

    # Recently checked and nothing changed since the last upload.
    if not force and checked_age is not None and checked_age < check_ttl and uploaded_fingerprint == fast_hash:
        state["lastCheckedAt"] = iso()
        state["lastCheckedFastFingerprint"] = fast_hash
        state["lastBundlePath"] = None
        write_json(state_path, state)
        result.update({"status": "fresh-ttl", "checked_age_seconds": checked_age})
        return result

    # Fingerprint unchanged since the last recorded upload.
    if not force and uploaded_fingerprint == fast_hash and state.get("lastUploadedAt"):
        state["lastCheckedAt"] = iso()
        state["lastCheckedFastFingerprint"] = fast_hash
        state["lastBundlePath"] = None
        write_json(state_path, state)
        result.update({"status": "fresh-fingerprint"})
        return result

    # The first upload needs explicit consent unless the config opts out.
    first_upload = not active_sources(state)
    if first_upload and config.get("safety", {}).get("require_user_approval_first_upload", True) and not yes and not force:
        result.update({"status": "needs-first-upload-approval"})
        return result

    # Content changed, but the last upload is too recent to upload again.
    if not force and uploaded_age is not None and uploaded_age < min_interval and uploaded_age < max_staleness:
        state["lastCheckedAt"] = iso()
        state["lastCheckedFastFingerprint"] = fast_hash
        state["lastBundlePath"] = None
        write_json(state_path, state)
        result.update({"status": "stale-throttled", "uploaded_age_seconds": uploaded_age})
        return result

    if not refresh.get("auto", True) and not force:
        state["lastCheckedAt"] = iso()
        state["lastCheckedFastFingerprint"] = fast_hash
        state["lastBundlePath"] = None
        write_json(state_path, state)
        result.update({"status": "auto-refresh-disabled"})
        return result

    if bundle_mode(config) == "chunked":
        # The set id is a UTC timestamp with minute resolution.
        set_id = now_utc().strftime("%y%m%d%H%M")
        bundles = build_bundle_set(repo, config, set_id=set_id, state=state)
        try:
            bundle_set_sha = source_set_hash(bundles)
            if not force and state.get("lastBundleSetSha256") == bundle_set_sha:
                state.update({
                    "lastCheckedAt": iso(),
                    "lastCheckedFastFingerprint": fast_hash,
                    "lastBundlePath": None,
                })
                write_json(state_path, state)
                result.update({"status": "fresh-bundle-hash", "bundleSetSha256": bundle_set_sha, "bundleDeleted": True})
                return result

            source_set = upload_bundle_set(repo, config, state, bundles, set_id=set_id)
            # ``_retiredSourceIds`` is transient; pop it so it is not persisted.
            retired_ids = [str(sid) for sid in source_set.pop("_retiredSourceIds", []) if str(sid)]
            state.update({
                "lastCheckedAt": iso(),
                "lastUploadedAt": iso(),
                "lastConfigSha256": sha256_file(cfg_path),
                "lastCheckedFastFingerprint": fast_hash,
                "lastUploadedFastFingerprint": fast_hash,
                "lastFastFingerprint": fast_hash,
                "lastBundleSetSha256": bundle_set_sha,
                "lastBundleSha256": bundle_set_sha,
                "lastBundlePath": None,
                "activeSourceSet": source_set,
                "sources": [src for src in source_set.get("sources", []) if isinstance(src, dict)],
            })
            # Retired sources are only queued here; deletion happens in
            # recover_pending_cleanup on a later run.
            cleanup_pending_ids = queue_cleanup_source_ids(state, retired_ids)
            write_json(state_path, state)
            # State is saved, so the upload journal is no longer needed.
            clear_pending_upload(repo)
            result.update(
                {
                    "status": "uploaded",
                    "bundleSetSha256": bundle_set_sha,
                    "bundleDeleted": True,
                    "sourceSet": source_set,
                    "cleanupPendingSourceIds": cleanup_pending_ids,
                }
            )
            return result
        finally:
            # Remove the local chunk files whatever the outcome.
            for bundle in bundles:
                if bundle.get("path"):
                    remove_file_quiet(Path(str(bundle["path"])))

    # Single-bundle mode.
    bundle = build_bundle(repo, config)
    try:
        bundle_hash = sha256_file(bundle)
        if not force and state.get("lastBundleSha256") == bundle_hash:
            state.update({
                "lastCheckedAt": iso(),
                "lastCheckedFastFingerprint": fast_hash,
                "lastBundlePath": None,
            })
            write_json(state_path, state)
            result.update({"status": "fresh-bundle-hash", "bundleSha256": bundle_hash, "bundleDeleted": True})
            return result

        source = upload_bundle(repo, config, state, bundle, bundle_hash)
        # ``_prunedSourceIds`` is transient; pop it so it is not persisted.
        pruned_ids = set(source.pop("_prunedSourceIds", []))
        sources = [src for src in state.get("sources", []) if str(src.get("id", "")) not in pruned_ids]
        if source.get("id") or source.get("title"):
            sources.append(source)
        state.update({
            "lastCheckedAt": iso(),
            "lastUploadedAt": iso(),
            "lastConfigSha256": sha256_file(cfg_path),
            "lastCheckedFastFingerprint": fast_hash,
            "lastUploadedFastFingerprint": fast_hash,
            "lastFastFingerprint": fast_hash,
            "lastBundleSha256": bundle_hash,
            "lastBundlePath": None,
            "sources": sources,
        })
        write_json(state_path, state)
        result.update({"status": "uploaded", "bundleSha256": bundle_hash, "bundleDeleted": True, "source": source})
        return result
    finally:
        # Remove the local bundle file whatever the outcome.
        remove_file_quiet(bundle)
1636
-
1637
-
1638
def ask_provider(repo: Path, question: str) -> dict[str, Any]:
    """Ask *question* against the configured notebook.

    The query is restricted to the active ready sources via one ``-s`` flag
    per source id.

    Returns:
        The parsed JSON reply; ``{"answer": <raw stdout>}`` when the reply
        is not JSON; or ``{"error": True, "stdout", "stderr"}`` when the
        command fails.
    """
    config, cfg_path = load_config(repo, command="ask")
    state, _ = load_state(cfg_path)
    nbid = notebook_id(config)

    argv = [*notebooklm_cmd(), "ask", question, "-n", nbid]
    argv += [part for source_id in active_ready_source_ids(state) for part in ("-s", source_id)]
    argv.append("--json")

    reply = run(argv, repo, timeout=180)
    if reply.returncode != 0:
        return {"error": True, "stdout": reply.stdout, "stderr": reply.stderr}
    try:
        parsed = json.loads(reply.stdout)
    except json.JSONDecodeError:
        return {"answer": reply.stdout}
    return parsed
1653
-
1654
-
1655
# Matches a slash-separated relative path (at least one directory component)
# whose file name ends in one of the listed source/doc/config extensions.
PATH_RE = re.compile(r"(?:(?:[\w.-]+/)+[\w.@+-]+\.(?:rs|ts|tsx|js|jsx|py|go|java|kt|md|toml|yaml|yml|json|sh|sql|css|scss|html))")
# Matches candidate search terms: identifier-like words of 4+ characters, or
# alphanumeric tokens of 5+ characters that may contain "_" and "-".
TERM_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]{3,}|[A-Za-z0-9][A-Za-z0-9_-]{4,}")
# Lower-case terms that are skipped when extracting candidate terms; lookups
# compare against ``term.lower()``.
STOP_TERMS = {
    "agent",
    "authority",
    "btreemap",
    "bundle",
    "codex",
    "command",
    "docs",
    "fixture",
    "gate",
    "justfile",
    "keywords",
    "local",
    "names",
    "paths",
    "postgres",
    "postgresql",
    "real",
    "refs",
    "repo",
    "shell",
    "test",
    "trigger",
    "where",
    "which",
    "what",
    "when",
    "implemented",
    "implementation",
    "function",
    "tests",
    "files",
    "return",
    "likely",
    "line",
    "numbers",
    "source",
    "notebooklm",
}
1696
-
1697
-
1698
def answer_text(data: dict[str, Any]) -> str:
    """Return the textual answer contained in a provider reply.

    A string under ``answer`` is returned as-is; any other reply is
    serialised to JSON with non-ASCII characters preserved.
    """
    answer = data.get("answer")
    if not isinstance(answer, str):
        return json.dumps(data, ensure_ascii=False)
    return answer
1703
-
1704
-
1705
def active_sources_by_id(repo: Path) -> dict[str, dict[str, Any]]:
    """Load the repo state and index its active sources by id.

    Sources without an id are left out; when two sources share an id the
    later one wins.
    """
    _, config_file = load_config(repo, command="ask")
    state, _ = load_state(config_file)
    indexed: dict[str, dict[str, Any]] = {}
    for source in active_sources(state):
        sid = str(source.get("id") or "")
        if not sid:
            continue
        indexed[sid] = source
    return indexed
1714
-
1715
-
1716
def reference_path_candidates(repo: Path, source: dict[str, Any], text: str) -> list[tuple[str, int | None]]:
    """Map a cited text snippet to files of *source* that exist in *repo*.

    Two strategies, in order:

    1. Paths literally mentioned in *text* that belong to the source's
       file list and exist on disk. Returned sorted, without line numbers.
    2. Otherwise, for snippets of 4 to 240 characters (whitespace
       collapsed) that are not a directory listing, search each source
       file of at most 2,000,000 bytes. An exact match yields a 1-based
       line number; a whitespace-insensitive match yields ``None``.

    At most five ``(path, line)`` pairs are returned.
    """
    files = [str(path) for path in source.get("files", []) if str(path)]
    known_files = set(files)

    mentioned: set[str] = set()
    for raw in PATH_RE.findall(text):
        cleaned = raw.strip("`'\".,;:()[]{}<>")
        if cleaned in known_files and (repo / cleaned).is_file():
            mentioned.add(cleaned)
    if mentioned:
        return [(path, None) for path in sorted(mentioned)][:5]

    snippet = " ".join(text.split())
    if not 4 <= len(snippet) <= 240 or "<directory_structure>" in text:
        return []

    found: list[tuple[str, int | None]] = []
    for path in files:
        candidate = repo / path
        if not candidate.is_file() or candidate.stat().st_size > 2_000_000:
            continue
        try:
            content = candidate.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue
        offset = content.find(text)
        if offset >= 0:
            # Exact hit: count the newlines before it for a 1-based line.
            line_no: int | None = content.count("\n", 0, offset) + 1
        elif snippet in " ".join(content.split()):
            line_no = None
        else:
            continue
        found.append((path, line_no))
        if len(found) >= 5:
            break
    return found
1751
-
1752
-
1753
def format_reference_paths(paths: list[tuple[str, int | None]]) -> str:
    """Render at most three ``path[:line]`` entries, noting how many were omitted."""
    shown: list[str] = []
    for path, line in paths[:3]:
        shown.append(f"{path}:{line}" if line else path)
    hidden = len(paths) - 3
    tail = f", ...(+{hidden})" if hidden > 0 else ""
    return ", ".join(shown) + tail
1757
-
1758
-
1759
def print_compact_references(repo: Path, answer: dict[str, Any]) -> None:
    """Print one ``[n] path[:line], ...`` row per citation that maps to local files.

    Nothing is printed when the answer has no reference list, or when none of
    the citations can be tied to a file of a known active source. Each citation
    number is handled at most once.
    """
    references = answer.get("references")
    if not (isinstance(references, list) and references):
        return

    sources = active_sources_by_id(repo)
    handled_numbers: set[str] = set()
    lines: list[str] = []
    for ref in references:
        if not isinstance(ref, dict):
            continue
        number = str(ref.get("citation_number") or "").strip()
        if not number or number in handled_numbers:
            continue
        handled_numbers.add(number)
        source = sources.get(str(ref.get("source_id") or ""))
        if not source:
            # Unknown (or empty) source: there is nothing to resolve against.
            continue
        paths = reference_path_candidates(repo, source, str(ref.get("cited_text") or ""))
        if paths:
            lines.append(f"[{number}] {format_reference_paths(paths)}")

    if not lines:
        return
    print("\nreferences:")
    for line in lines:
        print(line)
1783
-
1784
-
1785
def extract_candidates(text: str, query: str) -> tuple[list[str], list[str]]:
    """Pull candidate paths and search terms out of a provider answer.

    Paths are every distinct ``PATH_RE`` hit in ``text``, sorted. Terms come
    from ``TERM_RE`` applied to the answer plus the query; a term is kept when
    it has at least four characters, is not a stop term, and contains neither
    ``/`` nor ``.``. At most the first 24 terms (sorted) are returned.
    """
    candidate_paths = sorted(set(PATH_RE.findall(text)))
    stripped = (raw.strip("`'\"") for raw in TERM_RE.findall(text + "\n" + query))
    keywords = {
        term
        for term in stripped
        if len(term) >= 4
        and term.lower() not in STOP_TERMS
        and "/" not in term
        and "." not in term
    }
    return candidate_paths, sorted(keywords)[:24]
1796
-
1797
-
1798
def high_signal_terms(terms: list[str]) -> list[str]:
    """Prefer terms that look like identifiers; otherwise fall back to the first eight.

    A non-stop term counts as high-signal when it contains ``_`` or ``-``, has
    an uppercase character after its first character, or is at least 14
    characters long. When no term qualifies, the first eight non-stop terms are
    returned instead.
    """

    def looks_like_symbol(term: str) -> bool:
        return "_" in term or "-" in term or any(char.isupper() for char in term[1:])

    usable = [term for term in terms if term.lower() not in STOP_TERMS]
    strong = [term for term in usable if looks_like_symbol(term) or len(term) >= 14]
    return strong if strong else usable[:8]
1808
-
1809
-
1810
def rg_roots(repo: Path, config: dict[str, Any], candidate_paths: list[str]) -> list[list[str]]:
    """Return the root groups to search, most specific first.

    The first group (present only when at least one candidate path exists on
    disk) holds those candidate paths. The last group holds the configured
    include specs that exist, or ``["."]`` when none do.
    """
    existing_candidates = [path for path in candidate_paths if (repo / path).exists()]
    configured = [spec for spec in include_specs(config) if (repo / spec).exists()] or ["."]
    return [existing_candidates, configured] if existing_candidates else [configured]
1820
-
1821
-
1822
def parse_rg_matches(stdout: str, seen: set[tuple[str, str, str]], remaining: int) -> list[dict[str, Any]]:
    """Parse ``path:line:text`` output lines into match records.

    Lines with fewer than two colons are ignored. ``seen`` is updated in place
    and used to drop duplicates (same path, line and stripped text). Parsing
    stops once ``remaining`` records have been collected. The ``line`` value is
    an ``int`` when it is all digits, otherwise the raw string.
    """
    found: list[dict[str, Any]] = []
    for raw_line in stdout.splitlines():
        if len(found) >= remaining:
            break
        path, first_sep, rest = raw_line.partition(":")
        line_no, second_sep, text = rest.partition(":")
        if not (first_sep and second_sep):
            continue
        stripped = text.strip()
        key = (path, line_no, stripped)
        if key in seen:
            continue
        seen.add(key)
        line_value = int(line_no) if line_no.isdigit() else line_no
        found.append({"path": path, "line": line_value, "text": stripped})
    return found
1837
-
1838
-
1839
def local_rg(repo: Path, config: dict[str, Any], terms: list[str], candidate_paths: list[str] | None = None) -> list[dict[str, Any]]:
    """Search the project with ripgrep for the strongest of ``terms``.

    Root groups from ``rg_roots`` are searched in order until
    ``retrieval.max_local_matches`` (default 80) records have been collected.

    Args:
        repo: Project root; used as the working directory for ``rg``.
        config: Loaded project config.
        terms: Candidate search terms.
        candidate_paths: Paths to search before the configured roots.

    Returns:
        Match records as produced by ``parse_rg_matches``; ``[]`` when there is
        nothing to search for or ``rg`` is not installed; or a single
        ``{"error": ...}`` record when ``rg`` fails.
    """
    if not terms or shutil.which("rg") is None:
        return []
    signal_terms = high_signal_terms(terms)
    if not signal_terms:
        # An empty alternation would match every line in the project.
        return []
    pattern = "|".join(re.escape(term) for term in signal_terms[:16])
    max_matches = int(config.get("retrieval", {}).get("max_local_matches", 80))
    matches: list[dict[str, Any]] = []
    seen: set[tuple[str, str, str]] = set()
    for roots in rg_roots(repo, config, candidate_paths or []):
        remaining = max_matches - len(matches)
        if remaining <= 0:
            break
        # ripgrep drops the path prefix when asked to search one explicit file,
        # which would break the path:line:text parsing below; force it on, and
        # pin the per-line (no heading) layout regardless of user rg config.
        cmd = ["rg", "-n", "-S", "--with-filename", "--no-heading", "-e", pattern, "--", *roots]
        result = run(cmd, repo, timeout=120)
        # Exit status 1 means "no matches"; anything else is a real failure.
        if result.returncode not in (0, 1):
            return [{"error": result.stderr.strip()}]
        matches.extend(parse_rg_matches(result.stdout, seen, remaining))
    return matches
1857
-
1858
-
1859
def print_result(data: Any, as_json: bool) -> None:
    """Print ``data`` as indented JSON, or as plain ``key: value`` lines.

    In plain mode a mapping is printed one entry per line, with nested dicts
    and lists rendered as compact JSON. Any other value is printed as-is.
    """
    if as_json:
        print(json.dumps(data, indent=2, ensure_ascii=False))
        return
    if not isinstance(data, dict):
        print(data)
        return
    for key, value in data.items():
        rendered = json.dumps(value, ensure_ascii=False) if isinstance(value, (dict, list)) else value
        print(f"{key}: {rendered}")
1871
-
1872
-
1873
def freshness_warning(freshness: dict[str, Any]) -> str | None:
    """Return a one-line warning for freshness states the user should know about.

    Only ``stale-throttled``, ``needs-first-upload-approval`` and
    ``auto-refresh-disabled`` produce a warning; every other status yields
    ``None``.
    """
    status = str(freshness.get("status") or "")
    if status == "needs-first-upload-approval":
        return "warning: first broad upload requires approval; rerun with --yes or run refresh explicitly."
    if status == "auto-refresh-disabled":
        return "warning: auto refresh is disabled; provider answer may lag local changes."
    if status != "stale-throttled":
        return None

    uploaded_age = freshness.get("uploaded_age_seconds")
    age_text = f"; uploaded_age_seconds={uploaded_age}" if uploaded_age is not None else ""

    changed = freshness.get("relevant_changed_paths") or []
    changed_text = ""
    if isinstance(changed, list) and changed:
        # Show at most five changed paths and summarise the rest as a count.
        preview = ", ".join(str(path) for path in changed[:5])
        suffix = "" if len(changed) <= 5 else f", ...(+{len(changed) - 5})"
        changed_text = f"; changed={preview}{suffix}"
    return f"warning: index is stale-throttled{age_text}{changed_text}; provider answer may lag local changes. Use --force-refresh or refresh --force if needed."
1890
-
1891
-
1892
def provider_block_message(freshness: dict[str, Any]) -> str | None:
    """Return why the provider must not be queried, or ``None`` when it may be."""
    reasons = {
        "not-initialized": "skipped; project is not initialized for project retrieval.",
        "needs-first-upload-approval": "skipped; first broad upload requires approval. Rerun ask/locate with --yes or run refresh explicitly.",
    }
    return reasons.get(str(freshness.get("status") or ""))
1899
-
1900
-
1901
def first_upload_next(repo: Path, command: str, query: str) -> dict[str, str]:
    """Suggest the two command lines that get past the first-upload approval gate."""
    suggestions: dict[str, str] = {}
    suggestions[f"{command}WithFirstUploadApproval"] = command_line(repo, command, "--yes", query)
    suggestions["refresh"] = command_line(repo, "refresh", "--force")
    return suggestions
1906
-
1907
-
1908
def provider_block_payload(freshness: dict[str, Any], *, next_steps: dict[str, str] | None = None) -> dict[str, Any]:
    """Build the stand-in answer used when the provider call is skipped.

    The payload always carries ``error`` and ``message``. A ``next`` entry is
    added when the freshness result supplies one, or failing that, when the
    caller passes ``next_steps``.
    """
    base: dict[str, Any] = {"error": True, "message": provider_block_message(freshness) or "skipped"}
    follow_up = freshness.get("next") or next_steps
    return {**base, "next": follow_up} if follow_up else base
1914
-
1915
-
1916
def print_ask_result(freshness: dict[str, Any], answer: dict[str, Any], args: argparse.Namespace) -> None:
    """Print the outcome of ``ask`` as JSON or as human-readable text.

    Text mode prints, in order: an optional freshness warning, optional verbose
    metadata (freshness plus selected provider fields), the answer text, and
    compact local references.
    """
    if args.json:
        print_result({"freshness": freshness, "provider_answer": answer}, True)
        return

    repo = Path(args.repo).resolve()
    warning = freshness_warning(freshness)
    if warning:
        print(warning)

    if args.verbose:
        print(f"freshness: {json.dumps(freshness, ensure_ascii=False)}")
        metadata: dict[str, Any] = {}
        for key in ("conversation_id", "turn_number", "is_follow_up"):
            if key in answer:
                metadata[key] = answer[key]
        references = answer.get("references")
        if isinstance(references, list):
            metadata["references_count"] = len(references)
        if metadata:
            print(f"provider: {json.dumps(metadata, ensure_ascii=False)}")

    print(answer_text(answer))
    print_compact_references(repo, answer)
1934
-
1935
-
1936
def print_locate_result(result: dict[str, Any], args: argparse.Namespace) -> None:
    """Print the outcome of ``locate`` as JSON or as ``key: value`` lines.

    In text mode the ``freshness`` entry is left out of the listing; it is
    shown only as a warning line and, with ``--verbose``, as a JSON line.
    """
    if args.json:
        print_result(result, True)
        return

    warning = freshness_warning(result.get("freshness", {}))
    if warning:
        print(warning)
    if args.verbose:
        print(f"freshness: {json.dumps(result.get('freshness', {}), ensure_ascii=False)}")

    visible: dict[str, Any] = {}
    for key, value in result.items():
        if key != "freshness":
            visible[key] = value
    print_result(visible, False)
1947
-
1948
-
1949
def cmd_init(args: argparse.Namespace) -> None:
    """Handle ``init``: write the project config and bind a NotebookLM notebook.

    Refuses to overwrite an existing config unless ``--force`` is given. The
    notebook ID comes from ``--notebook-id``, or from an exact-title lookup when
    ``--reuse-existing-notebook`` / ``--create-notebook`` is passed (creating
    the notebook only for ``--create-notebook``). Writes the config plus a
    ``.gitignore`` next to it, then prints what was created and what to run
    next.
    """
    repo = Path(args.repo).resolve()
    cfg_dir = repo / CONFIG_DIR
    cfg = cfg_dir / CONFIG_JSON
    if cfg.exists() and not args.force:
        die(f"config already exists: {cfg}")
    project_name = args.project_name or repo.name
    title_prefix = args.notebook_title_prefix or DEFAULT_NOTEBOOK_TITLE_PREFIX
    title = args.notebook_title or default_notebook_title(project_name, title_prefix)
    notebook_id_value = args.notebook_id or ""
    resolved_notebook: dict[str, Any] | None = None
    # An explicit --notebook-id wins; the title lookup runs only without one.
    if not notebook_id_value and (args.reuse_existing_notebook or args.create_notebook):
        resolved_notebook = find_notebook_by_title(repo, title)
        if not resolved_notebook and args.create_notebook:
            resolved_notebook = create_notebook(repo, title)
        if not resolved_notebook:
            die(f"no NotebookLM notebook found with title {title!r}; pass --create-notebook or --notebook-id")
        notebook_id_value = str(resolved_notebook.get("id") or "")
    config = default_config(
        repo,
        notebook_id_value,
        project_name=project_name,
        notebook_title_prefix=title_prefix,
        notebook_title=title,
    )
    if args.include:
        # --include is a comma-separated list; blank entries are dropped.
        config["bundle"]["include"] = [part.strip() for part in args.include.split(",") if part.strip()]
    if args.source_title_prefix:
        config["notebooklm"]["source_title_prefix"] = args.source_title_prefix
    write_json(cfg, config)
    # Keep machine-local state, pending uploads, caches and lock files out of git.
    (cfg_dir / ".gitignore").write_text("state.local.json\npending-upload.local.json\ncache/\n*.lock\n")
    print(f"created: {cfg}")
    print(f"created: {cfg_dir / '.gitignore'}")
    print(f"notebook_title: {title}")
    if resolved_notebook:
        print(f"notebook_id: {notebook_id_value}")
    if notebook_id_value:
        print("next:")
        print(f" {command_line(repo, 'ensure', '--yes')}")
        print(f" {command_line(repo, 'ask', 'your question')}")
    else:
        # No notebook bound yet: tell the user how to finish the setup.
        print("next:")
        print(" set notebooklm.notebook_id in the config, or rerun init with --create-notebook / --reuse-existing-notebook / --notebook-id")
1992
-
1993
-
1994
def cmd_status(args: argparse.Namespace) -> None:
    """Handle ``status``: report config, fingerprints and recorded source state.

    When no config file exists yet, the uninitialized status is printed
    instead.
    """
    repo = Path(args.repo).resolve()
    cfg_candidate = config_path(repo)
    if not cfg_candidate.exists():
        print_result(uninitialized_status(repo, cfg_candidate), args.json)
        return

    config, cfg_path = load_config(repo, command="status")
    state, state_path = load_state(cfg_path)
    fast_hash, changed = fast_fingerprint(repo, config, cfg_path)

    report: dict[str, Any] = {"initialized": True}
    report["config"] = str(cfg_path)
    report["state"] = str(state_path)
    report["provider"] = config.get("provider")
    report["projectName"] = config.get("project", {}).get("name")
    report["notebook_id"] = config.get("notebooklm", {}).get("notebook_id")
    report["notebookTitle"] = notebook_title(config)
    report["sourceTitlePrefix"] = config.get("notebooklm", {}).get("source_title_prefix")
    report["lastCheckedAt"] = state.get("lastCheckedAt")
    report["lastUploadedAt"] = state.get("lastUploadedAt")
    report["lastBundleSha256"] = state.get("lastBundleSha256")
    report["fastFingerprint"] = fast_hash
    report["stateCheckedFastFingerprint"] = state.get("lastCheckedFastFingerprint")
    report["stateUploadedFastFingerprint"] = state_uploaded_fingerprint(state)
    report["stateFastFingerprint"] = state.get("lastFastFingerprint")
    report["relevantChangedPaths"] = changed
    report["sources"] = state.get("sources", [])
    print_result(report, args.json)
2023
-
2024
-
2025
def cmd_pack(args: argparse.Namespace) -> None:
    """Handle ``pack``: preview the planned chunks, or build the bundle set.

    With ``--dry-run`` only the plan is printed (optionally with per-chunk file
    lists). Otherwise the bundles are built and a summary of each is printed.
    The set ID defaults to the current UTC time formatted as ``%y%m%d%H%M``.
    """
    repo = Path(args.repo).resolve()
    config, cfg_path = load_config(repo, command="pack")
    state, _ = load_state(cfg_path)
    set_id = args.set_id or now_utc().strftime("%y%m%d%H%M")
    chunks = plan_bundle_chunks(repo, config, set_id=set_id, state=state)

    if not args.dry_run:
        bundles = build_bundle_set(repo, config, set_id=set_id, state=state)
        bundle_fields = ("group", "chunk", "title", "path", "fileCount", "bundleSha256", "contentSha256")
        built = [{field: bundle.get(field) for field in bundle_fields} for bundle in bundles]
        print_result({"setId": set_id, "bundleCount": len(bundles), "bundles": built}, args.json)
        return

    planned: list[dict[str, Any]] = []
    for chunk in chunks:
        entry: dict[str, Any] = {
            "group": chunk.get("group"),
            "chunk": chunk.get("chunk"),
            "title": chunk.get("title"),
            "estimatedBytes": chunk.get("estimatedBytes"),
            "fileCount": len(chunk.get("files", [])),
        }
        if args.include_files:
            entry["files"] = chunk.get("files", [])
        planned.append(entry)
    print_result(
        {"setId": set_id, "mode": "chunked", "chunkCount": len(chunks), "chunks": planned},
        args.json,
    )
2072
-
2073
-
2074
def cmd_ensure(args: argparse.Namespace) -> None:
    """Handle ``ensure``: run the freshness preflight and print its result."""
    repo = Path(args.repo).resolve()
    outcome = ensure_index(
        repo,
        force=args.force,
        yes=args.yes,
        json_output=args.json,
        command="ensure",
    )
    print_result(outcome, args.json)
2077
-
2078
-
2079
def cmd_refresh(args: argparse.Namespace) -> None:
    """Handle ``refresh``: always run a forced, pre-approved index refresh."""
    repo = Path(args.repo).resolve()
    # Refresh is unconditional: force and approval are both fixed to True here.
    outcome = ensure_index(
        repo,
        force=True,
        yes=True,
        json_output=args.json,
        command="refresh",
    )
    print_result(outcome, args.json)
2082
-
2083
-
2084
def cmd_ask(args: argparse.Namespace) -> None:
    """Handle ``ask``: run the freshness preflight, then query the provider.

    When the preflight reports a blocking status, the provider is not called
    and a stand-in payload is printed instead (with suggested next commands
    when first-upload approval is what is missing).
    """
    repo = Path(args.repo).resolve()
    freshness = ensure_index(
        repo,
        force=args.force_refresh,
        yes=args.yes,
        json_output=args.json,
        command="ask",
        return_uninitialized=True,
    )

    if not provider_block_message(freshness):
        print_ask_result(freshness, ask_provider(repo, args.question), args)
        return

    needs_approval = freshness.get("status") == "needs-first-upload-approval"
    next_steps = first_upload_next(repo, "ask", args.question) if needs_approval else None
    print_ask_result(freshness, provider_block_payload(freshness, next_steps=next_steps), args)
2103
-
2104
-
2105
def cmd_locate(args: argparse.Namespace) -> None:
    """Handle ``locate``: ask the provider for candidates, then verify locally.

    After the freshness preflight, the provider is asked for likely paths and
    keywords. Paths and terms are extracted from its answer, and ripgrep is run
    locally to produce the line references. When the preflight is blocked, an
    empty result explaining why is printed and the provider is not called.
    """
    repo = Path(args.repo).resolve()
    freshness = ensure_index(
        repo,
        force=args.force_refresh,
        yes=args.yes,
        json_output=args.json,
        command="locate",
        return_uninitialized=True,
    )
    blocked = provider_block_message(freshness)
    if blocked:
        # Prefer next steps supplied by the preflight; otherwise suggest how to
        # approve the first upload.
        next_steps = freshness.get("next")
        if not next_steps and freshness.get("status") == "needs-first-upload-approval":
            next_steps = first_upload_next(repo, "locate", args.query)
        # Same keys as the normal result, but with empty candidates.
        result = {
            "freshness": freshness,
            "notebooklm_candidates": {"paths": [], "existing_paths": [], "terms": []},
            "local_line_refs": [],
            "provider_misses_or_stale_paths": [],
            "provider_answer": f"({blocked})",
            "claim_boundary": "Semantic provider was not called because retrieval preflight is blocked.",
        }
        if next_steps:
            result["next"] = next_steps
        print_locate_result(result, args)
        return
    prompt = (
        "Find the code location for this repository question. Return likely repo paths, "
        "function names, test names, command names, and keywords for rg. If exact line "
        f"numbers are unavailable, say so. Question: {args.query}"
    )
    provider = ask_provider(repo, prompt)
    text = answer_text(provider)
    paths, terms = extract_candidates(text, args.query)
    config, _ = load_config(repo, command="locate")
    # Split provider-suggested paths into those that exist locally and those
    # that do not (reported as misses / stale paths).
    existing_paths = [path for path in paths if (repo / path).exists()]
    stale_paths = [path for path in paths if not (repo / path).exists()]
    matches = local_rg(repo, config, terms, existing_paths)
    result = {
        "freshness": freshness,
        "notebooklm_candidates": {"paths": paths, "existing_paths": existing_paths, "terms": terms},
        "local_line_refs": matches,
        "provider_misses_or_stale_paths": stale_paths,
        # The raw provider answer is shown only on request.
        "provider_answer": provider if args.include_provider_answer else "(hidden; pass --include-provider-answer)",
        "claim_boundary": "Line refs come from local rg results, not NotebookLM.",
    }
    print_locate_result(result, args)
2153
-
2154
-
2155
def temp_source_sets(state: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the dict entries of ``state["temporarySourceSets"]``.

    A missing or non-list value yields an empty list; non-dict entries are
    dropped. The result is always a new list.
    """
    recorded = state.get("temporarySourceSets")
    if not isinstance(recorded, list):
        return []
    return list(filter(lambda entry: isinstance(entry, dict), recorded))
2160
-
2161
-
2162
def temp_source_expires_at(ttl_seconds: int) -> str | None:
    """Return the ISO expiry timestamp ``ttl_seconds`` from now, or ``None``.

    A TTL of zero or less means the source never expires.
    """
    if ttl_seconds > 0:
        return iso(now_utc() + dt.timedelta(seconds=ttl_seconds))
    return None
2166
-
2167
-
2168
def source_is_expired(source_set: dict[str, Any]) -> bool:
    """Tell whether a temporary source set's ``expiresAt`` time has been reached.

    Sets without an expiry never count as expired.
    """
    raw_expiry = source_set.get("expiresAt")
    if not raw_expiry:
        return False
    deadline = parse_iso(str(raw_expiry))
    return bool(deadline) and deadline <= now_utc()
2172
-
2173
-
2174
def cmd_temp_source_upload(args: argparse.Namespace) -> None:
    """Handle ``temp-source upload``: upload one local file as a temporary source.

    The file is staged under the generated title and uploaded while holding the
    repo lock. It is optionally waited on until the provider reports it ready,
    then recorded in local state as a new temporary source set. The staged copy
    is always removed afterwards.
    """
    repo = Path(args.repo).resolve()
    config, cfg_path = load_config(repo, command="temp-source upload")
    state, state_path = load_state(cfg_path)
    source_path = Path(args.file).expanduser()
    # Relative --file paths are resolved against the project root.
    if not source_path.is_absolute():
        source_path = (repo / source_path).resolve()
    if not source_path.is_file():
        die(f"temp source file not found: {source_path}")
    set_id = now_utc().strftime("%y%m%d%H%M")
    content_sha = sha256_file(source_path)
    title = temp_source_title(config, set_id=set_id, kind=args.kind, title=args.title, content_sha=content_sha)
    staged_path = stage_temp_source_file(repo, title, source_path)
    with repo_lock(repo):
        try:
            # Reload state under the lock so the write below starts from the
            # latest recorded state.
            state, state_path = load_state(cfg_path)
            source = upload_file_source(repo, config, staged_path, title)
            status = "uploaded"
            if config.get("notebooklm", {}).get("wait_after_upload", True) and source.get("id"):
                status = "ready" if wait_source_ready(repo, notebook_id(config), str(source["id"])) else "error"
                # NOTE(review): failure handling is assumed to apply only when
                # waiting is enabled — confirm against the original indentation.
                if status != "ready":
                    # Remove the source that failed processing before aborting.
                    delete_source_ids_parallel(
                        repo,
                        notebook_id(config),
                        [str(source.get("id") or "")],
                        parallelism=positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4),
                    )
                    die(f"source processing failed for temp source {title}: {source.get('id')}")
            active = state.get("activeSourceSet") if isinstance(state.get("activeSourceSet"), dict) else {}
            item = {
                "id": source.get("id"),
                "title": source.get("title") or title,
                "contentSha256": content_sha,
                "uploadedAt": iso(),
                "status": status,
                # Where this derived source came from.
                "origin": {
                    "activeSourceSetId": active.get("id"),
                    "chunkKeys": list(args.origin_chunk or []),
                    "filePaths": list(args.origin_file or []),
                },
            }
            source_set = {
                "id": set_id,
                "kind": slugify(args.kind),
                "purpose": args.title,
                "createdAt": iso(),
                "expiresAt": temp_source_expires_at(int(args.ttl_seconds or 0)),
                "sources": [item],
            }
            sets = temp_source_sets(state)
            sets.append(source_set)
            state["temporarySourceSets"] = sets
            write_json(state_path, state)
        finally:
            # The staged copy is only needed for the upload itself.
            remove_file_quiet(staged_path)
    print_result({"sourceSet": source_set, "source": item}, args.json)
2230
-
2231
-
2232
def cmd_temp_source_list(args: argparse.Namespace) -> None:
    """Handle ``temp-source list``: show recorded sets and untracked provider sources.

    ``--kind`` filters the recorded sets only. Untracked matches are provider
    sources whose title starts with the temp-source prefix followed by ``--``
    and whose ID is not recorded in any temporary source set.
    """
    repo = Path(args.repo).resolve()
    config, cfg_path = load_config(repo, command="temp-source list")
    state, _ = load_state(cfg_path)

    recorded = temp_source_sets(state)
    sets = recorded
    if args.kind:
        wanted = slugify(args.kind)
        sets = [entry for entry in recorded if str(entry.get("kind") or "") == wanted]

    prefix = temp_source_prefix(config)
    provider_matches = []
    for src in list_sources(repo, notebook_id(config)):
        if str(src.get("title") or "").startswith(prefix + "--"):
            provider_matches.append(src)

    # Tracking is judged against every recorded set, not just the filtered ones.
    tracked_ids = set()
    for source_set in recorded:
        for src in source_set.get("sources", []):
            if isinstance(src, dict) and src.get("id"):
                tracked_ids.add(str(src.get("id")))

    untracked = [src for src in provider_matches if str(src.get("id") or "") not in tracked_ids]
    print_result({"temporarySourceSets": sets, "untrackedPrefixMatches": untracked}, args.json)
2250
-
2251
-
2252
def cmd_temp_source_cleanup(args: argparse.Namespace) -> None:
    """Handle ``temp-source cleanup``: delete temporary sources and update state.

    A recorded set is selected when it passes every filter that was given
    (``--set-id``, ``--kind``, ``--expired``); with no filters, every set is
    selected. The command aborts unless ``--yes`` is passed. Sources of the
    selected sets are deleted from the provider. A selected set stays in state
    only with the sources that were not reported deleted. With
    ``--include-untracked-prefix``, provider sources that carry the temp-source
    title prefix but are not recorded locally are deleted as well.

    Prints the deleted source IDs and the remaining untracked prefix matches.
    """
    repo = Path(args.repo).resolve()
    config, cfg_path = load_config(repo, command="temp-source cleanup")
    with repo_lock(repo):
        state, state_path = load_state(cfg_path)
        sets = temp_source_sets(state)
        wanted_kind = slugify(args.kind) if args.kind else ""
        selected: list[dict[str, Any]] = []
        kept: list[dict[str, Any]] = []
        for source_set in sets:
            matches = True
            if args.set_id and str(source_set.get("id") or "") != str(args.set_id):
                matches = False
            if wanted_kind and str(source_set.get("kind") or "") != wanted_kind:
                matches = False
            if args.expired and not source_is_expired(source_set):
                matches = False
            if matches:
                selected.append(source_set)
            else:
                kept.append(source_set)
        if not args.yes:
            die("cleanup requires --yes")
        source_ids = [
            str(src.get("id"))
            for source_set in selected
            for src in source_set.get("sources", [])
            if isinstance(src, dict) and src.get("id")
        ]
        deleted = delete_source_ids_parallel(
            repo,
            notebook_id(config),
            source_ids,
            parallelism=positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4),
        )
        # Built once and reused for both the state rewrite and the untracked scan.
        deleted_set = set(deleted)
        remaining_selected: list[dict[str, Any]] = []
        for source_set in selected:
            sources = [
                src
                for src in source_set.get("sources", [])
                if isinstance(src, dict) and str(src.get("id") or "") not in deleted_set
            ]
            if sources:
                # Keep the set, but only with the sources that were not deleted.
                item = dict(source_set)
                item["sources"] = sources
                remaining_selected.append(item)
        state["temporarySourceSets"] = kept + remaining_selected
        write_json(state_path, state)
        prefix = temp_source_prefix(config)
        provider_matches = [src for src in list_sources(repo, notebook_id(config)) if str(src.get("title") or "").startswith(prefix + "--")]
        tracked_ids = {
            str(src.get("id"))
            for source_set in temp_source_sets(state)
            for src in source_set.get("sources", [])
            if isinstance(src, dict) and src.get("id")
        }
        untracked = [
            src
            for src in provider_matches
            if str(src.get("id") or "") not in tracked_ids and str(src.get("id") or "") not in deleted_set
        ]
        if args.include_untracked_prefix:
            extra_ids = [str(src.get("id")) for src in untracked if src.get("id")]
            extra_deleted = delete_source_ids_parallel(
                repo,
                notebook_id(config),
                extra_ids,
                parallelism=positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4),
            )
            deleted.extend(extra_deleted)
            # Hoisted out of the comprehension so the set is built only once.
            extra_deleted_set = set(extra_deleted)
            untracked = [src for src in untracked if str(src.get("id") or "") not in extra_deleted_set]
    print_result({"deletedSourceIds": deleted, "untrackedPrefixMatches": untracked}, args.json)
2326
-
2327
-
2328
- def build_parser() -> argparse.ArgumentParser:
2329
- parser = argparse.ArgumentParser(
2330
- prog="memdex",
2331
- formatter_class=argparse.RawDescriptionHelpFormatter,
2332
- description=textwrap.dedent(
2333
- """
2334
- Agent-facing semantic retrieval for projects and source sets.
2335
-
2336
- Memdex uses NotebookLM as a semantic locator, then treats local files,
2337
- command output, and project docs as authority for exact evidence.
2338
- Start with init once. For normal agent work, call ask or locate directly;
2339
- they run freshness preflight before querying the provider.
2340
- """
2341
- ).strip(),
2342
- epilog=textwrap.dedent(
2343
- """
2344
- Common agent paths:
2345
- memdex init --repo . --create-notebook
2346
- memdex ask --repo . "Where is retry/backfill documented?"
2347
- memdex locate --repo . "invoice export retry command"
2348
- memdex ask --repo . --yes "question" # approve first broad upload
2349
-
2350
- Command routing:
2351
- ask answer architecture/docs/status questions over the source set
2352
- locate find likely files or symbols and return local line refs
2353
- init create .memdex/config.json and bind a NotebookLM notebook
2354
- status inspect local config, freshness, and recorded source state
2355
- ensure prewarm or refresh the index when policy allows
2356
- refresh force a source replacement
2357
- pack preview deterministic repomix chunks without provider Q&A
2358
- """
2359
- ).strip(),
2360
- )
2361
- sub = parser.add_subparsers(title="commands", dest="command", metavar="<command>", required=True)
2362
-
2363
- ask = sub.add_parser(
2364
- "ask",
2365
- help="answer semantic project questions with freshness preflight",
2366
- description=textwrap.dedent(
2367
- """
2368
- Ask a question over the configured source set.
2369
-
2370
- Use this for architecture, docs, behavior, ownership, or status questions.
2371
- Memdex checks freshness first, queries NotebookLM, then prints compact
2372
- provider references. Verify exact claims from local evidence.
2373
- """
2374
- ).strip(),
2375
- )
2376
- ask.add_argument("question", help="natural-language question to ask over the source set")
2377
- ask.add_argument("--repo", default=".", help="project root (default: current directory)")
2378
- ask.add_argument("--yes", action="store_true", help="approve first broad upload if setup is otherwise ready")
2379
- ask.add_argument("--force-refresh", action="store_true", help="refresh managed sources before asking")
2380
- ask.add_argument("--json", action="store_true", help="print machine-readable JSON")
2381
- ask.add_argument("--verbose", action="store_true", help="include freshness and provider metadata")
2382
- ask.set_defaults(func=cmd_ask)
2383
-
2384
- locate = sub.add_parser(
2385
- "locate",
2386
- help="find likely files or symbols and verify local line refs",
2387
- description=textwrap.dedent(
2388
- """
2389
- Locate implementation, docs, tests, or symbols and verify local line refs.
2390
-
2391
- Use this when the user asks "where is X?" or needs candidate paths.
2392
- Memdex queries the semantic provider for candidates, then checks local
2393
- files with exact line references when possible.
2394
- """
2395
- ).strip(),
2396
- )
2397
- locate.add_argument("query", help="natural-language thing to find")
2398
- locate.add_argument("--repo", default=".", help="project root (default: current directory)")
2399
- locate.add_argument("--yes", action="store_true", help="approve first broad upload if setup is otherwise ready")
2400
- locate.add_argument("--force-refresh", action="store_true", help="refresh managed sources before locating")
2401
- locate.add_argument("--include-provider-answer", action="store_true", help="include the raw provider answer in output")
2402
- locate.add_argument("--json", action="store_true", help="print machine-readable JSON")
2403
- locate.add_argument("--verbose", action="store_true", help="include freshness metadata")
2404
- locate.set_defaults(func=cmd_locate)
2405
-
2406
- init = sub.add_parser(
2407
- "init",
2408
- formatter_class=argparse.RawDescriptionHelpFormatter,
2409
- help="create .memdex/config.json and bind a NotebookLM notebook",
2410
- description="Create project-local Memdex config and bind it to a NotebookLM notebook.",
2411
- epilog=textwrap.dedent(
2412
- """
2413
- Examples:
2414
- memdex init --repo . --create-notebook
2415
- memdex init --repo . --reuse-existing-notebook
2416
- memdex init --repo . --notebook-id <id>
2417
- """
2418
- ).strip(),
2419
- )
2420
- init.add_argument("--repo", default=".", help="project, repo, vault, or source-set root (default: current directory)")
2421
- init.add_argument("--notebook-id", default="", help="bind an existing NotebookLM notebook by ID")
2422
- init.add_argument("--project-name", default="", help="stable project key for notebook and source titles (default: repo basename)")
2423
- init.add_argument("--notebook-title-prefix", default=DEFAULT_NOTEBOOK_TITLE_PREFIX, help="NotebookLM title prefix (default: memdex)")
2424
- init.add_argument("--notebook-title", default="", help="exact NotebookLM title to create or reuse")
2425
- init.add_argument("--reuse-existing-notebook", action="store_true", help="reuse an exact title match; do not create cloud state")
2426
- init.add_argument("--create-notebook", action="store_true", help="create the NotebookLM notebook when no exact title match exists")
2427
- init.add_argument("--source-title-prefix", default="", help="prefix for managed NotebookLM source titles (default: memdex)")
2428
- init.add_argument("--include", default="", help="comma-separated include roots or files for the source set")
2429
- init.add_argument("--force", action="store_true", help="overwrite existing .memdex/config.json")
2430
- init.set_defaults(func=cmd_init)
2431
-
2432
- status = sub.add_parser(
2433
- "status",
2434
- help="inspect config, freshness, and recorded source state",
2435
- description="Inspect local Memdex config, freshness fingerprints, and NotebookLM source state.",
2436
- )
2437
- status.add_argument("--repo", default=".", help="project root (default: current directory)")
2438
- status.add_argument("--json", action="store_true", help="print machine-readable JSON")
2439
- status.set_defaults(func=cmd_status)
2440
-
2441
- pack = sub.add_parser(
2442
- "pack",
2443
- help="preview deterministic repomix chunks",
2444
- description="Preview or build deterministic whole-file chunks for the configured source set.",
2445
- )
2446
- pack.add_argument("--repo", default=".", help="project root (default: current directory)")
2447
- pack.add_argument("--set-id", default="", help="stable source-set ID for rendered chunk titles")
2448
- pack.add_argument("--dry-run", action="store_true", help="show planned chunks without running repomix")
2449
- pack.add_argument("--include-files", action="store_true", help="include per-chunk file lists in output")
2450
- pack.add_argument("--json", action="store_true", help="print machine-readable JSON")
2451
- pack.set_defaults(func=cmd_pack)
2452
-
2453
- ensure = sub.add_parser(
2454
- "ensure",
2455
- help="prewarm or refresh the index when policy allows",
2456
- description="Run freshness preflight and upload/refresh sources only when policy allows.",
2457
- )
2458
- ensure.add_argument("--repo", default=".", help="project root (default: current directory)")
2459
- ensure.add_argument("--force", action="store_true", help="bypass freshness TTL and rebuild source state")
2460
- ensure.add_argument("--yes", action="store_true", help="approve the first broad upload for this run")
2461
- ensure.add_argument("--json", action="store_true", help="print machine-readable JSON")
2462
- ensure.set_defaults(func=cmd_ensure)
2463
-
2464
- refresh = sub.add_parser(
2465
- "refresh",
2466
- help="force source replacement",
2467
- description="Refresh managed NotebookLM sources, replacing old recorded sources after success.",
2468
- )
2469
- refresh.add_argument("--repo", default=".", help="project root (default: current directory)")
2470
- refresh.add_argument("--force", action="store_true", help="force refresh even when freshness checks would skip it")
2471
- refresh.add_argument("--json", action="store_true", help="print machine-readable JSON")
2472
- refresh.set_defaults(func=cmd_refresh)
2473
-
2474
- temp = sub.add_parser(
2475
- "temp-source",
2476
- formatter_class=argparse.RawDescriptionHelpFormatter,
2477
- help="manage temporary derived NotebookLM sources",
2478
- description="Upload, list, or clean temporary derived sources such as notes or study aids.",
2479
- )
2480
- temp_sub = temp.add_subparsers(title="temp-source commands", dest="temp_command", metavar="<command>", required=True)
2481
-
2482
- temp_upload = temp_sub.add_parser("upload", help="upload a temporary source file")
2483
- temp_upload.add_argument("--repo", default=".", help="project root (default: current directory)")
2484
- temp_upload.add_argument("--kind", required=True, help="temporary source kind, for example notes or flashcard")
2485
- temp_upload.add_argument("--title", required=True, help="human-readable title slug for this temporary source")
2486
- temp_upload.add_argument("--file", required=True, help="local markdown/text file to upload")
2487
- temp_upload.add_argument("--origin-chunk", action="append", default=[], help="origin active chunk key; repeatable")
2488
- temp_upload.add_argument("--origin-file", action="append", default=[], help="origin local file path; repeatable")
2489
- temp_upload.add_argument("--ttl-seconds", type=int, default=0, help="optional expiry TTL in seconds")
2490
- temp_upload.add_argument("--json", action="store_true", help="print machine-readable JSON")
2491
- temp_upload.set_defaults(func=cmd_temp_source_upload)
2492
-
2493
- temp_list = temp_sub.add_parser("list", help="list recorded temporary sources")
2494
- temp_list.add_argument("--repo", default=".", help="project root (default: current directory)")
2495
- temp_list.add_argument("--kind", default="", help="filter by temporary source kind")
2496
- temp_list.add_argument("--json", action="store_true", help="print machine-readable JSON")
2497
- temp_list.set_defaults(func=cmd_temp_source_list)
2498
-
2499
- temp_cleanup = temp_sub.add_parser("cleanup", help="delete recorded temporary sources")
2500
- temp_cleanup.add_argument("--repo", default=".", help="project root (default: current directory)")
2501
- temp_cleanup.add_argument("--kind", default="", help="filter by temporary source kind")
2502
- temp_cleanup.add_argument("--set-id", default="", help="filter by temporary source-set ID")
2503
- temp_cleanup.add_argument("--expired", action="store_true", help="clean only expired temporary sources")
2504
- temp_cleanup.add_argument("--include-untracked-prefix", action="store_true", help="also delete untracked prefix matches; requires --yes")
2505
- temp_cleanup.add_argument("--yes", action="store_true", help="confirm deletion")
2506
- temp_cleanup.add_argument("--json", action="store_true", help="print machine-readable JSON")
2507
- temp_cleanup.set_defaults(func=cmd_temp_source_cleanup)
2508
- return parser
2509
-
2510
-
2511
def main() -> None:
    """Parse the command line and run the selected subcommand.

    Each subparser registers its handler through ``set_defaults(func=...)``,
    so the parsed namespace carries the callable to invoke. The handler
    receives the full namespace as its only argument.
    """
    parser = build_parser()
    namespace = parser.parse_args()
    handler = namespace.func
    handler(namespace)
2514
-
2515
-
2516
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()