memdex 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +34 -0
- package/bin/memdex.js +23 -0
- package/package.json +46 -0
- package/scripts/memdex.py +2427 -0
|
@@ -0,0 +1,2427 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Project-level semantic retrieval helper.
|
|
3
|
+
|
|
4
|
+
This script intentionally depends only on Python stdlib for the control plane.
|
|
5
|
+
It shells out to `npx repomix`, `notebooklm`, `git`, and `rg` when needed.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import argparse
|
|
11
|
+
import concurrent.futures
|
|
12
|
+
import contextlib
|
|
13
|
+
import datetime as dt
|
|
14
|
+
import errno
|
|
15
|
+
import fnmatch
|
|
16
|
+
import hashlib
|
|
17
|
+
import json
|
|
18
|
+
import os
|
|
19
|
+
import re
|
|
20
|
+
import shlex
|
|
21
|
+
import shutil
|
|
22
|
+
import subprocess
|
|
23
|
+
import sys
|
|
24
|
+
import threading
|
|
25
|
+
import time
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
from typing import Any
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Directory, relative to the repo root, where the config, lock file and default cache path live.
CONFIG_DIR = ".memdex"
# File names used inside the config directory.
CONFIG_JSON = "config.json"
STATE_JSON = "state.local.json"
PENDING_UPLOAD_JSON = "pending-upload.local.json"
# Default prefix for derived notebook titles ("<prefix>:<project>").
DEFAULT_NOTEBOOK_TITLE_PREFIX = "memdex"
# Absolute path of this script; used when rendering runnable command hints.
SCRIPT_PATH = Path(__file__).resolve()
# Environment variables that override the rendered command (current name, then legacy name).
SCRIPT_CMD_ENV = "MEMDEX_CMD"
LEGACY_SCRIPT_CMD_ENV = "CODEBASE_RETRIEVE_CMD"
# Install source shown in the "notebooklm not found" hint.
NOTEBOOKLM_PACKAGE = "git+https://github.com/teng-lin/notebooklm-py.git"
# Environment variable that overrides how the notebooklm CLI is invoked.
NOTEBOOKLM_BIN_ENV = "NOTEBOOKLM_BIN"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def now_utc() -> dt.datetime:
    """Return the current time as a UTC-aware datetime truncated to whole seconds."""
    current = dt.datetime.now(tz=dt.timezone.utc)
    return current.replace(microsecond=0)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def iso(ts: dt.datetime | None = None) -> str:
    """Render *ts* (or ``now_utc()`` when omitted) as ISO-8601 text.

    A ``+00:00`` offset is rewritten to the ``Z`` suffix.
    """
    moment = ts if ts else now_utc()
    rendered = moment.isoformat()
    return rendered.replace("+00:00", "Z")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def parse_iso(value: str | None) -> dt.datetime | None:
    """Parse an ISO-8601 string, accepting ``Z`` as a UTC offset.

    Returns ``None`` when *value* is ``None`` or empty.
    """
    if value:
        normalized = value.replace("Z", "+00:00")
        return dt.datetime.fromisoformat(normalized)
    return None
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def die(message: str, code: int = 2) -> None:
    """Report *message* on stderr with an ``error:`` prefix, then exit.

    Never returns: always raises ``SystemExit`` carrying *code*.
    """
    print(f"error: {message}", file=sys.stderr)
    sys.exit(code)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def script_cmd() -> list[str]:
    """Return the argv prefix used to invoke this tool in rendered hints.

    The first non-blank value among the ``SCRIPT_CMD_ENV`` and
    ``LEGACY_SCRIPT_CMD_ENV`` environment variables wins and is split with
    shell rules; otherwise the current interpreter plus this script is used.
    """
    for env_name in (SCRIPT_CMD_ENV, LEGACY_SCRIPT_CMD_ENV):
        configured = os.environ.get(env_name, "").strip()
        if configured:
            return shlex.split(configured)
    interpreter = sys.executable or "python3"
    return [interpreter, str(SCRIPT_PATH)]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def command_line(repo: Path, command: str, *parts: str) -> str:
    """Render a shell-quoted command line: ``<script> <command> --repo <repo> <parts...>``."""
    argv = script_cmd() + [command, "--repo", str(repo)] + list(parts)
    return " ".join(map(shlex.quote, argv))
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def missing_config_message(repo: Path, config_file: Path, command: str = "") -> str:
    """Build the multi-line help text shown when the repo has no config yet.

    The text names the missing *config_file*, optionally mentions the
    *command* that needed it, and lists ready-to-run init/ask/locate commands.
    """

    def hint(*argv: str) -> str:
        # Each suggested command is rendered on its own indented line.
        return f" {command_line(repo, *argv)}"

    header = [f"project is not initialized for project retrieval: {config_file}"]
    if command:
        header.append(f"Command `{command}` needs `.memdex/config.json` before it can run.")
    body = [
        "",
        "Initialize this repo first:",
        hint("init", "--create-notebook"),
        "",
        "Or reuse an existing NotebookLM notebook with the expected title:",
        hint("init", "--reuse-existing-notebook"),
        "",
        "Then ask or locate directly; both commands run freshness preflight:",
        hint("ask", "your question"),
        hint("locate", "thing to find"),
        "",
        "If this is the first broad upload and you already approve it:",
        hint("ask", "--yes", "your question"),
    ]
    return "\n".join(header + body)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def uninitialized_status(repo: Path, config_file: Path) -> dict[str, Any]:
    """Return the status payload for a repo that has no config file yet.

    The ``next`` mapping holds rendered command lines for the init, ask and
    locate steps the user can run.
    """
    next_steps = {
        "createNotebook": command_line(repo, "init", "--create-notebook"),
        "reuseExistingNotebook": command_line(repo, "init", "--reuse-existing-notebook"),
        "ask": command_line(repo, "ask", "your question"),
        "locate": command_line(repo, "locate", "thing to find"),
        "askWithFirstUploadApproval": command_line(repo, "ask", "--yes", "your question"),
    }
    payload: dict[str, Any] = {
        "status": "not-initialized",
        "initialized": False,
        "config": str(config_file),
        "message": "project is not initialized for project retrieval",
    }
    payload["next"] = next_steps
    return payload
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@contextlib.contextmanager
def repo_lock(repo: Path, *, timeout_seconds: float = 300.0):
    """Hold an exclusive, file-based lock for *repo* while the ``with`` body runs.

    The lock is the file ``<repo>/<CONFIG_DIR>/.lock``, created with
    ``O_CREAT | O_EXCL`` so only one process can create it. While it exists,
    other callers poll every 0.2 s and call ``die`` once *timeout_seconds*
    has elapsed. The file is always removed on exit, including when writing
    the owner details fails right after creation (previously that path
    leaked the descriptor and left a lock file behind).
    """
    lock_path = repo / CONFIG_DIR / ".lock"
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    start = time.monotonic()
    while True:
        try:
            fd = os.open(str(lock_path), os.O_CREAT | os.O_EXCL | os.O_WRONLY)
        except FileExistsError:
            # Someone else holds the lock: wait, unless we are out of time.
            if time.monotonic() - start > timeout_seconds:
                die(f"timed out waiting for lock: {lock_path}")
            time.sleep(0.2)
            continue
        break
    try:
        # Inside the try so a failed write still releases the lock below.
        os.write(fd, f"pid={os.getpid()}\ncreatedAt={iso()}\n".encode("utf-8"))
        yield
    finally:
        try:
            os.close(fd)
        finally:
            try:
                lock_path.unlink()
            except FileNotFoundError:
                pass
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def run(argv: list[str], cwd: Path, *, input_text: str | None = None, timeout: int | None = None) -> subprocess.CompletedProcess[str]:
    """Run *argv* in *cwd*, capturing stdout and stderr as text.

    A non-zero exit status does not raise (``check=False``); callers inspect
    ``returncode``. *input_text* is fed to stdin and *timeout* is passed
    through to ``subprocess.run``.
    """
    options: dict[str, Any] = {
        "cwd": str(cwd),
        "input": input_text,
        "text": True,
        "stdout": subprocess.PIPE,
        "stderr": subprocess.PIPE,
        "timeout": timeout,
        "check": False,
    }
    return subprocess.run(argv, **options)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def require_tool(name: str) -> None:
    """Exit via ``die`` unless an executable called *name* is found on PATH."""
    if shutil.which(name) is not None:
        return
    die(f"required tool not found on PATH: {name}")
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def notebooklm_cmd() -> list[str]:
    """Return the argv prefix for the ``notebooklm`` CLI.

    A non-blank ``NOTEBOOKLM_BIN_ENV`` environment variable takes precedence
    (split with shell rules); otherwise the executable is looked up on PATH.
    Exits via ``die`` with install hints when neither is available.
    """
    configured = os.environ.get(NOTEBOOKLM_BIN_ENV, "").strip()
    if configured:
        return shlex.split(configured)
    resolved = shutil.which("notebooklm")
    if not resolved:
        die(
            "required tool not found on PATH: notebooklm\n"
            f"Install persistently: uv tool install {NOTEBOOKLM_PACKAGE}\n"
            f"Or set {NOTEBOOKLM_BIN_ENV}='uvx --from {NOTEBOOKLM_PACKAGE} notebooklm'"
        )
    return [resolved]
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def sha256_bytes(data: bytes) -> str:
    """Return the SHA-256 of *data* as ``sha256:<hex digest>``."""
    hex_digest = hashlib.sha256(data).hexdigest()
    return f"sha256:{hex_digest}"
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def sha256_text(data: str) -> str:
    """Return the ``sha256:``-prefixed digest of *data* encoded as UTF-8."""
    encoded = data.encode("utf-8")
    return sha256_bytes(encoded)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def sha256_file(path: Path) -> str:
    """Return the ``sha256:``-prefixed digest of the file at *path*.

    The file is read in 1 MiB blocks so large files are not loaded whole.
    """
    hasher = hashlib.sha256()
    block_size = 1024 * 1024
    with path.open("rb") as stream:
        while True:
            block = stream.read(block_size)
            if not block:
                break
            hasher.update(block)
    return "sha256:" + hasher.hexdigest()
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def remove_file_quiet(path: Path) -> None:
    """Delete *path*; a file that is already missing is not an error."""
    try:
        path.unlink()
    except FileNotFoundError:
        return
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def default_include() -> list[str]:
    """Return a fresh list of the default include specs: directories first, then root files."""
    directories = ["src", "crates", "packages", "apps", "bins", "docs", "scripts", "tests", "xtask"]
    root_files = ["AGENTS.md", "CLAUDE.md", "README.md", "Cargo.toml", "package.json", "justfile"]
    return directories + root_files
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def default_groups() -> list[dict[str, Any]]:
    """Return the default group definitions as fresh ``{"id", "include"}`` dicts, in order."""
    table = (
        ("docs", ("AGENTS.md", "CLAUDE.md", "README.md", "docs/**")),
        ("apps", ("apps/**",)),
        ("packages", ("packages/**",)),
        ("src", ("src/**", "crates/**", "bins/**", "xtask/**")),
        ("tests", ("tests/**", "testdata/**")),
        ("scripts", ("scripts/**",)),
    )
    return [{"id": group_id, "include": list(patterns)} for group_id, patterns in table]
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def slugify(value: str) -> str:
    """Turn *value* into a lowercase slug of ``a-z``, ``0-9``, ``.``, ``_`` and ``-``.

    Runs of other characters become a single ``-``; leading and trailing
    dashes are removed. An empty result falls back to ``"repo"``.
    """
    text = value.strip().lower()
    text = re.sub(r"[^a-z0-9._-]+", "-", text)
    text = re.sub(r"-+", "-", text)
    text = text.strip("-")
    return text if text else "repo"
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def default_notebook_title(project_name: str, title_prefix: str = DEFAULT_NOTEBOOK_TITLE_PREFIX) -> str:
    """Return the default notebook title, ``<title_prefix>:<project_name>``."""
    title = f"{title_prefix}:{project_name}"
    return title
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def default_source_title_prefix(project_name: str, title_prefix: str = DEFAULT_NOTEBOOK_TITLE_PREFIX) -> str:
    """Return ``<slug(title_prefix)>-<slug(project_name)>-repo``."""
    prefix_slug = slugify(title_prefix)
    project_slug = slugify(project_name)
    return f"{prefix_slug}-{project_slug}-repo"
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def default_short_source_title_prefix() -> str:
    """Return the short source-title prefix used when none is configured."""
    short_prefix = "memdex"
    return short_prefix
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def default_config(
    repo: Path,
    notebook_id: str = "",
    *,
    project_name: str | None = None,
    notebook_title_prefix: str = DEFAULT_NOTEBOOK_TITLE_PREFIX,
    notebook_title: str | None = None,
) -> dict[str, Any]:
    """Build the default configuration mapping for *repo*.

    The project name defaults to the repo directory name, and the notebook
    title defaults to ``default_notebook_title(project, prefix)``. The
    returned dict has the sections ``project``, ``notebooklm``, ``bundle``,
    ``refresh``, ``safety`` and ``retrieval``.
    """
    project = project_name if project_name else repo.name
    title = notebook_title if notebook_title else default_notebook_title(project, notebook_title_prefix)

    # Each directory/extension contributes a root-level and an any-depth glob.
    never_upload = [".env*", "**/.env*"]
    for directory in (".git", "node_modules", "target", "dist", "build", "coverage", ".next", ".generated", "public"):
        never_upload += [f"{directory}/**", f"**/{directory}/**"]
    for extension in (
        "png", "jpg", "jpeg", "gif", "webp", "svg", "ico",
        "otf", "ttf", "woff", "woff2",
        "mp4", "mov",
        "zip", "tar", "gz",
    ):
        never_upload += [f"*.{extension}", f"**/*.{extension}"]

    notebooklm_section = {
        "notebook_id": notebook_id,
        "notebook_title_prefix": notebook_title_prefix,
        "notebook_title": title,
        "source_title_prefix": default_short_source_title_prefix(),
        "wait_after_upload": True,
        "upload_parallelism": 4,
        "wait_parallelism": 8,
        "delete_parallelism": 4,
    }
    bundle_section = {
        "tool": "repomix",
        "mode": "chunked",
        "include": default_include(),
        "output": f"{CONFIG_DIR}/cache/{{prefix}}-{{timestamp}}.txt",
        "style": "",
        "compress": False,
        "target_chunk_bytes": 716800,
        "max_chunk_bytes": 900000,
        "source_title_template": "{prefix}--{set}--{group}--{chunk}--{hash}.md",
        "groups": default_groups(),
        "default_group": {"enabled": True, "id": "misc"},
    }
    refresh_section = {
        "auto": True,
        "mode": "replace",
        "check_ttl_seconds": 300,
        "min_upload_interval_seconds": 900,
        "max_staleness_seconds": 86400,
        "keep_previous_sources": 0,
        "delete_previous_after_success": True,
    }
    safety_section = {
        "require_user_approval_first_upload": True,
        "never_upload": never_upload,
    }
    retrieval_section = {
        "line_numbers_require_local_verify": True,
        "max_local_matches": 80,
    }
    return {
        "version": 1,
        "project": {"name": project},
        "provider": "notebooklm",
        "notebooklm": notebooklm_section,
        "bundle": bundle_section,
        "refresh": refresh_section,
        "safety": safety_section,
        "retrieval": retrieval_section,
    }
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def config_path(repo: Path) -> Path:
    """Return the first existing config file for *repo*.

    Searches ``CONFIG_DIR`` and then ``.notebooklm``, trying the JSON name
    before ``config.yaml`` and ``config.yml`` in each. When nothing exists,
    returns the default ``<repo>/<CONFIG_DIR>/<CONFIG_JSON>`` location.
    """
    for directory in (CONFIG_DIR, ".notebooklm"):
        for filename in (CONFIG_JSON, "config.yaml", "config.yml"):
            candidate = repo / directory / filename
            if candidate.exists():
                return candidate
    return repo / CONFIG_DIR / CONFIG_JSON
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def load_config(repo: Path, *, command: str = "") -> tuple[dict[str, Any], Path]:
    """Load the project config and return ``(config, path)``.

    Exits via ``die`` with setup instructions when no config file exists
    (*command* is mentioned in that message), or when a YAML config is found
    but PyYAML cannot be imported. Files are always decoded as UTF-8 rather
    than the platform locale encoding, and an empty YAML document yields an
    empty dict instead of ``None``.
    """
    path = config_path(repo)
    if not path.exists():
        die(missing_config_message(repo, path, command))
    # JSON is UTF-8 by specification; do not depend on the locale default.
    text = path.read_text(encoding="utf-8")
    if path.suffix == ".json":
        return json.loads(text), path
    try:
        import yaml  # type: ignore
    except Exception as error:  # pragma: no cover - depends on host env
        die(f"YAML config requires PyYAML or use JSON config instead: {error}")
    # safe_load returns None for an empty document; callers expect a mapping.
    return yaml.safe_load(text) or {}, path
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def write_json(path: Path, value: Any) -> None:
    """Serialize *value* as indented JSON to *path*, creating parent directories.

    The output is always UTF-8 (non-ASCII characters are written as-is, so
    the locale default encoding must not be used) and ends with a newline.
    The data is written to a sibling temporary file and moved into place
    with ``os.replace`` so an interrupted write cannot leave a truncated
    file behind.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(value, indent=2, ensure_ascii=False) + "\n"
    scratch = path.with_name(f".{path.name}.{os.getpid()}.tmp")
    try:
        scratch.write_text(payload, encoding="utf-8")
        os.replace(scratch, path)
    finally:
        # After a successful replace the scratch file no longer exists.
        try:
            scratch.unlink()
        except FileNotFoundError:
            pass
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def load_state(config_file: Path) -> tuple[dict[str, Any], Path]:
    """Load the local state stored next to *config_file*.

    Returns ``(state, state_path)``. When the state file does not exist the
    state is ``{"sources": []}``. The file is decoded as UTF-8 regardless of
    the platform locale encoding.
    """
    state_path = config_file.parent / STATE_JSON
    if state_path.exists():
        return json.loads(state_path.read_text(encoding="utf-8")), state_path
    return {"sources": []}, state_path
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
def include_specs(config: dict[str, Any]) -> list[str]:
    """Return the cleaned ``bundle.include`` specs, or the defaults when unset/empty.

    Each entry is stringified and stripped of surrounding whitespace and
    slashes; entries that are blank after whitespace stripping are dropped.
    """
    configured = config.get("bundle", {}).get("include") or default_include()
    cleaned: list[str] = []
    for item in configured:
        text = str(item).strip()
        if text:
            cleaned.append(text.strip("/"))
    return cleaned
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def group_specs(group: dict[str, Any]) -> list[str]:
    """Return the cleaned ``include`` specs of one group definition.

    Each entry is stringified and stripped of surrounding whitespace and
    slashes; entries that are blank after whitespace stripping are dropped.
    """
    cleaned: list[str] = []
    for item in group.get("include") or []:
        text = str(item).strip()
        if text:
            cleaned.append(text.strip("/"))
    return cleaned
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def never_upload_specs(config: dict[str, Any]) -> list[str]:
    """Return the glob specs that must never be uploaded.

    The result is the built-in list followed by the entries of
    ``safety.never_upload`` from *config*. Entries are stringified and
    stripped, blanks are dropped, and duplicates are removed while keeping
    first-seen order, so a config that repeats the built-ins does not double
    the ignore list handed to other tools.
    """
    built_in = [".git/**", "**/.git/**", ".env*", "**/.env*"]
    # Each directory/extension contributes a root-level and an any-depth glob.
    for directory in ("node_modules", ".next", "dist", "build", "coverage", ".generated", "public"):
        built_in += [f"{directory}/**", f"**/{directory}/**"]
    for extension in (
        "png", "jpg", "jpeg", "gif", "webp", "svg", "ico",
        "otf", "ttf", "woff", "woff2",
        "mp4", "mov",
        "zip", "tar", "gz",
    ):
        built_in += [f"*.{extension}", f"**/*.{extension}"]

    never_upload = config.get("safety", {}).get("never_upload") or []
    cleaned = (str(item).strip() for item in [*built_in, *never_upload])
    # dict.fromkeys keeps insertion order while dropping repeats.
    return list(dict.fromkeys(spec for spec in cleaned if spec))
|
|
468
|
+
|
|
469
|
+
|
|
470
|
+
def path_matches_spec(path: str, spec: str) -> bool:
    """Return whether repo-relative *path* is selected by *spec*.

    A spec matches when it is ``.`` or ``*`` (everything), equals the path,
    names a parent directory of the path, or matches as an ``fnmatch`` glob
    (also tried against the path with a ``./`` prefix). An empty spec never
    matches.

    Only a literal leading ``./`` and leading slashes are removed before
    comparing. ``str.lstrip("./")`` would strip any run of dots and slashes,
    turning ``.env*`` into ``env*`` (which matches ``environment.md``) and
    ``.`` into the empty string.
    """

    def normalize(text: str) -> str:
        text = text.strip()
        while text.startswith("./"):
            text = text[2:]
        return text.lstrip("/")

    clean = normalize(path)
    pattern = normalize(spec)
    if not pattern:
        return False
    if pattern in {".", "*"}:
        return True
    if clean == pattern or clean.startswith(pattern.rstrip("/") + "/"):
        return True
    return fnmatch.fnmatch(clean, pattern) or fnmatch.fnmatch("./" + clean, pattern)
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
def path_is_included(path: str, includes: list[str]) -> bool:
    """Return True when *path* matches at least one spec in *includes*."""
    return any(path_matches_spec(path, spec) for spec in includes)
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
def path_is_ignored(path: str, ignores: list[str]) -> bool:
    """Return True when *path* matches at least one spec in *ignores*."""
    for spec in ignores:
        if path_matches_spec(path, spec):
            return True
    return False
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def bundle_mode(config: dict[str, Any]) -> str:
    """Return ``bundle.mode`` as a string, defaulting to ``"chunked"`` when unset or empty."""
    mode = config.get("bundle", {}).get("mode")
    if not mode:
        return "chunked"
    return str(mode)
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
def parse_size_bytes(value: Any, fallback: int) -> int:
    """Parse a byte size from a number or a string such as ``"700 KB"``.

    Accepted forms are non-negative integers, integral non-negative floats
    (as a hand-edited JSON config may contain), and digit strings with an
    optional ``b``, ``kb``/``kib`` or ``mb``/``mib`` suffix (case-insensitive;
    both spellings are treated as powers of 1024). Anything else returns
    *fallback*, including booleans and negative numbers.
    """
    if isinstance(value, bool):
        # bool is an int subclass; True/False are not meaningful sizes.
        return fallback
    if isinstance(value, int):
        return value if value >= 0 else fallback
    if isinstance(value, float) and value.is_integer() and value >= 0:
        return int(value)
    text = str(value or "").strip().lower()
    if not text:
        return fallback
    match = re.fullmatch(r"(\d+)(?:\s*(b|kb|kib|mb|mib))?", text)
    if not match:
        return fallback
    multipliers = {"b": 1, "kb": 1024, "kib": 1024, "mb": 1024 * 1024, "mib": 1024 * 1024}
    return int(match.group(1)) * multipliers[match.group(2) or "b"]
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
def positive_int(value: Any, fallback: int, *, minimum: int = 1, maximum: int = 32) -> int:
    """Coerce *value* to an int clamped to ``[minimum, maximum]``.

    *fallback* is used when *value* cannot be converted, including infinite
    floats, for which ``int()`` raises ``OverflowError``. The fallback is
    clamped to the same range.
    """
    try:
        parsed = int(value)
    except (TypeError, ValueError, OverflowError):
        parsed = fallback
    return max(minimum, min(maximum, parsed))
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
def list_git_files(repo: Path) -> list[str]:
    """Return the sorted repo-relative paths of candidate files in *repo*.

    Uses ``git ls-files -co --exclude-standard`` (cached plus untracked,
    honoring ignore rules). The ``-z`` flag makes git emit NUL-terminated,
    unquoted paths; without it, paths with non-ASCII or special characters
    are C-quoted and no longer name real files. When git fails, every file
    under *repo* is listed instead, except those below ``.git/``.
    """
    result = run(["git", "ls-files", "-z", "-co", "--exclude-standard"], repo)
    if result.returncode == 0:
        # Entries are taken verbatim: file names may contain whitespace.
        return sorted(entry for entry in result.stdout.split("\0") if entry)
    files: list[str] = []
    for path in repo.rglob("*"):
        if not path.is_file():
            continue
        rel = path.relative_to(repo).as_posix()
        if rel.startswith(".git/"):
            continue
        files.append(rel)
    return sorted(files)
|
|
536
|
+
|
|
537
|
+
|
|
538
|
+
def collect_bundle_files(repo: Path, config: dict[str, Any]) -> list[str]:
    """Return the sorted, de-duplicated repo-relative paths to bundle.

    A path is kept when it matches the include specs, matches none of the
    never-upload specs, and is an existing regular file that is not a
    symlink.
    """
    includes = include_specs(config)
    ignores = never_upload_specs(config)
    selected = {
        rel
        for rel in list_git_files(repo)
        if path_is_included(rel, includes)
        and not path_is_ignored(rel, ignores)
        and (repo / rel).is_file()
        and not (repo / rel).is_symlink()
    }
    return sorted(selected)
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
def chunk_file_size(repo: Path, path: str) -> int:
    """Estimate the bytes *path* adds to a chunk.

    The estimate is the file size plus the UTF-8 length of the path plus a
    fixed 64-byte allowance.
    """
    content_bytes = (repo / path).stat().st_size
    name_bytes = len(path.encode("utf-8"))
    return content_bytes + name_bytes + 64
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
def file_bucket(path: str) -> str:
    """Return the leading directory prefix used to order files for chunking.

    Paths with at least three segments under ``apps``, ``packages`` or
    ``crates`` use their first three segments; every other path uses its
    first two (or its only segment).
    """
    segments = path.split("/")
    workspace_roots = {"apps", "packages", "crates"}
    depth = 3 if len(segments) >= 3 and segments[0] in workspace_roots else 2
    return "/".join(segments[:depth])
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
def source_title_for_chunk(config: dict[str, Any], *, set_id: str, group: str, index: int, chunk_hash: str) -> str:
    """Render the source title for one chunk from ``bundle.source_title_template``.

    The template may use ``{prefix}``, ``{set}``/``{set_id}``, ``{group}``,
    ``{chunk}``/``{idx}`` (zero-padded to three digits) and ``{hash}`` (the
    first eight characters of *chunk_hash*). A missing prefix, or one that
    starts with ``codebase-retrieve-``, is replaced by the short default.
    """
    configured = str(config.get("notebooklm", {}).get("source_title_prefix") or "").strip()
    if not configured or configured.startswith("codebase-retrieve-"):
        prefix = default_short_source_title_prefix()
    else:
        prefix = configured
    template = str(
        config.get("bundle", {}).get("source_title_template")
        or "{prefix}--{set}--{group}--{chunk}--{hash}.md"
    )
    chunk_label = f"{index:03d}"
    fields = {
        "prefix": slugify(prefix),
        "set": set_id,
        "set_id": set_id,
        "group": slugify(group),
        "chunk": chunk_label,
        "idx": chunk_label,
        "hash": chunk_hash[:8],
    }
    return template.format(**fields)
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
def chunk_hash_for_files(repo: Path, files: list[str]) -> str:
    """Return the hex SHA-256 over the names and contents of *files*, in order.

    For each path the digest absorbs the UTF-8 path, a NUL byte, the file
    contents (skipped when the path is not a regular file) and another NUL.
    """
    hasher = hashlib.sha256()
    block_size = 1024 * 1024
    for rel in files:
        hasher.update(rel.encode("utf-8"))
        hasher.update(b"\0")
        target = repo / rel
        if target.is_file():
            with target.open("rb") as stream:
                while True:
                    block = stream.read(block_size)
                    if not block:
                        break
                    hasher.update(block)
        hasher.update(b"\0")
    return hasher.hexdigest()
|
|
599
|
+
|
|
600
|
+
|
|
601
|
+
def assign_files_to_groups(files: list[str], config: dict[str, Any]) -> list[tuple[str, str]]:
    """Assign each file to at most one group and return ``(group_id, path)`` pairs.

    Groups come from ``bundle.groups`` (the defaults when the key is absent)
    and are tried in order; a file goes to the first group whose specs match.
    Remaining files go to the default group when it is enabled. When it is
    disabled and no groups are defined, every file is assigned to ``"repo"``;
    otherwise unmatched files are left out.
    """
    bundle = config.get("bundle", {})
    groups = (bundle.get("groups") if "groups" in bundle else default_groups()) or []
    assigned: list[tuple[str, str]] = []
    claimed: set[str] = set()

    for group in groups:
        label = slugify(str(group.get("id") or "group"))
        patterns = group_specs(group)
        if not patterns:
            continue
        for path in files:
            if path not in claimed and path_is_included(path, patterns):
                assigned.append((label, path))
                claimed.add(path)

    if "default_group" in bundle:
        catch_all = bundle.get("default_group") or {}
    else:
        catch_all = {"enabled": True, "id": "misc"}

    if catch_all.get("enabled"):
        label = slugify(str(catch_all.get("id") or "misc"))
        for path in files:
            if path not in claimed:
                assigned.append((label, path))
                claimed.add(path)
    elif not groups:
        assigned.extend(("repo", path) for path in files)
    return assigned
|
|
628
|
+
|
|
629
|
+
|
|
630
|
+
def flush_chunk(chunks: list[dict[str, Any]], repo: Path, config: dict[str, Any], set_id: str, group: str, index: int, files: list[str], total: int) -> None:
    """Append a chunk record for *files* to *chunks*; do nothing when *files* is empty.

    The record stores a copy of the file list, the estimated size *total*,
    the content hash and the rendered source title.
    """
    if files:
        digest = chunk_hash_for_files(repo, files)
        record = {
            "group": group,
            "chunk": f"{index:03d}",
            "index": index,
            "files": list(files),
            "estimatedBytes": total,
            "sha256": "sha256:" + digest,
            "title": source_title_for_chunk(config, set_id=set_id, group=group, index=index, chunk_hash=digest),
        }
        chunks.append(record)
|
|
645
|
+
|
|
646
|
+
|
|
647
|
+
def active_chunk_file_members(state: dict[str, Any] | None, group: str) -> list[list[str]]:
    """Return the file lists of the active sources of *group*, ordered by chunk number.

    Sources without a non-empty ``files`` list are skipped. A ``chunk`` value
    that is not an integer sorts as 0. Returns ``[]`` when *state* is empty.
    """
    if not state:
        return []
    indexed: list[tuple[int, list[str]]] = []
    for source in active_sources(state):
        if str(source.get("group") or "") != group:
            continue
        listed = source.get("files")
        if not (isinstance(listed, list) and listed):
            continue
        try:
            position = int(str(source.get("chunk") or "0"))
        except ValueError:
            position = 0
        names = [str(entry) for entry in listed if str(entry)]
        if names:
            indexed.append((position, names))
    # list.sort is stable, so equal chunk numbers keep their source order.
    indexed.sort(key=lambda pair: pair[0])
    return [names for _, names in indexed]
|
|
666
|
+
|
|
667
|
+
|
|
668
|
+
def append_greedy_chunks(
    chunks: list[dict[str, Any]],
    repo: Path,
    config: dict[str, Any],
    *,
    set_id: str,
    group: str,
    start_index: int,
    files: list[str],
    target: int,
    max_bytes: int,
) -> int:
    """Pack *files* into consecutive chunks and return the next free chunk index.

    Files are taken in the given order. The current chunk is flushed before
    a file that would push its estimated size above *target*. Exits via
    ``die`` when a single file is larger than *max_bytes*.
    """
    pending: list[str] = []
    pending_bytes = 0
    next_index = start_index
    for path in files:
        size = chunk_file_size(repo, path)
        if size > max_bytes:
            die(f"file exceeds max chunk size ({max_bytes} bytes): {path} ({size} bytes)")
        would_overflow = bool(pending) and pending_bytes + size > target
        if would_overflow:
            flush_chunk(chunks, repo, config, set_id, group, next_index, pending, pending_bytes)
            pending, pending_bytes = [], 0
            next_index += 1
        pending.append(path)
        pending_bytes += size
    if pending:
        flush_chunk(chunks, repo, config, set_id, group, next_index, pending, pending_bytes)
        next_index += 1
    return next_index
|
|
698
|
+
|
|
699
|
+
|
|
700
|
+
def plan_group_chunks(
    chunks: list[dict[str, Any]],
    repo: Path,
    config: dict[str, Any],
    *,
    set_id: str,
    group: str,
    files: list[str],
    target: int,
    max_bytes: int,
    state: dict[str, Any] | None,
) -> None:
    """Plan the chunks of one group and append them to *chunks*.

    Chunk membership recorded in *state* is reused where possible: the files
    of a previous chunk that still exist stay together as long as their
    total stays within *max_bytes*. Those kept chunks are numbered from 1 in
    their previous order. All other files are packed greedily, ordered by
    ``(file_bucket(path), path)``. Exits via ``die`` when a single file is
    larger than *max_bytes*.

    File sizes are looked up once per path and reused, rather than calling
    ``stat`` repeatedly for the same file.
    """
    ordered = sorted(files, key=lambda path: (file_bucket(path), path))
    available = set(ordered)
    sizes: dict[str, int] = {}

    def size_of(path: str) -> int:
        if path not in sizes:
            sizes[path] = chunk_file_size(repo, path)
        return sizes[path]

    kept: list[tuple[list[str], int]] = []
    for previous_files in active_chunk_file_members(state, group):
        retained = [path for path in previous_files if path in available]
        if not retained:
            continue
        total = sum(size_of(path) for path in retained)
        for path in retained:
            if size_of(path) > max_bytes:
                die(f"file exceeds max chunk size ({max_bytes} bytes): {path} ({size_of(path)} bytes)")
        if total <= max_bytes:
            kept.append((retained, total))
            available.difference_update(retained)

    index = 1
    for files_in_chunk, total in kept:
        flush_chunk(chunks, repo, config, set_id, group, index, files_in_chunk, total)
        index += 1

    remaining = [path for path in ordered if path in available]
    append_greedy_chunks(
        chunks,
        repo,
        config,
        set_id=set_id,
        group=group,
        start_index=index,
        files=remaining,
        target=target,
        max_bytes=max_bytes,
    )
|
|
748
|
+
|
|
749
|
+
|
|
750
|
+
def plan_bundle_chunks(repo: Path, config: dict[str, Any], *, set_id: str, state: dict[str, Any] | None = None) -> list[dict[str, Any]]:
    """Plan every chunk of the bundle and return the chunk records.

    Sizes come from ``bundle.target_chunk_bytes`` and
    ``bundle.max_chunk_bytes``; the target is capped at the maximum. Files
    are collected, assigned to groups, and each group is planned in sorted
    group-id order.
    """
    bundle = config.get("bundle", {})
    max_bytes = parse_size_bytes(bundle.get("max_chunk_bytes"), 900000)
    target = min(parse_size_bytes(bundle.get("target_chunk_bytes"), 716800), max_bytes)

    grouped: dict[str, list[str]] = {}
    for group_id, path in assign_files_to_groups(collect_bundle_files(repo, config), config):
        if group_id not in grouped:
            grouped[group_id] = []
        grouped[group_id].append(path)

    planned: list[dict[str, Any]] = []
    for group_id in sorted(grouped):
        plan_group_chunks(
            planned,
            repo,
            config,
            set_id=set_id,
            group=group_id,
            files=grouped[group_id],
            target=target,
            max_bytes=max_bytes,
            state=state,
        )
    return planned
|
|
774
|
+
|
|
775
|
+
|
|
776
|
+
def git_head(repo: Path) -> str:
    """Return the output of ``git rev-parse HEAD`` for *repo*, or ``"no-git-head"`` when it fails."""
    outcome = run(["git", "rev-parse", "HEAD"], repo)
    return outcome.stdout.strip() if outcome.returncode == 0 else "no-git-head"
|
|
781
|
+
|
|
782
|
+
|
|
783
|
+
def git_status_records(repo: Path) -> list[tuple[str, str]]:
    """Return ``(status, path)`` pairs from ``git status --porcelain=v1 -z``.

    *status* is the two-character XY code. With ``-z`` a renamed or copied
    entry is followed by an extra NUL-terminated field holding the original
    path; that field is skipped. The rename/copy marker is checked in both
    the index (X) and worktree (Y) columns, so a worktree rename does not
    leave its original path to be parsed as a bogus record. Returns ``[]``
    when git fails.
    """
    result = run(["git", "status", "--porcelain=v1", "-z", "--untracked-files=all"], repo)
    if result.returncode != 0:
        return []
    entries = iter([part for part in result.stdout.split("\0") if part])
    records: list[tuple[str, str]] = []
    for entry in entries:
        status = entry[:2]
        path = entry[3:]
        if "R" in status or "C" in status:
            # Consume the trailing "original path" field of this entry.
            next(entries, None)
        records.append((status, path))
    return records
|
|
800
|
+
|
|
801
|
+
|
|
802
|
+
def fast_fingerprint(repo: Path, config: dict[str, Any], config_file: Path) -> tuple[str, list[str]]:
    """Return ``(fingerprint, changed_paths)`` describing the bundle-relevant repo state.

    The fingerprint hashes the git HEAD, the config file digest and one line
    per status entry that is included and not ignored. Each line carries the
    status code, the path and a content marker: the file digest, ``dir`` or
    ``missing``. *changed_paths* lists those status paths in order.
    """
    includes = include_specs(config)
    ignores = never_upload_specs(config)
    lines = [f"head={git_head(repo)}", f"config={sha256_file(config_file)}"]
    changed: list[str] = []
    for status, path in git_status_records(repo):
        if path_is_included(path, includes) and not path_is_ignored(path, ignores):
            changed.append(path)
            target = repo / path
            if target.is_file():
                marker = sha256_file(target)
            else:
                marker = "dir" if target.exists() else "missing"
            lines.append(f"{status} {path} {marker}")
    return sha256_text("\n".join(lines)), changed
|
|
820
|
+
|
|
821
|
+
|
|
822
|
+
def seconds_since(value: str | None) -> float | None:
    """Return the seconds elapsed since ISO timestamp *value*, or ``None`` when it is empty."""
    moment = parse_iso(value)
    if moment:
        elapsed = now_utc() - moment
        return elapsed.total_seconds()
    return None
|
|
827
|
+
|
|
828
|
+
|
|
829
|
+
def state_uploaded_fingerprint(state: dict[str, Any]) -> str | None:
    """Return the ``lastUploadedFastFingerprint`` value from *state*, or ``None`` when absent."""
    fingerprint = state.get("lastUploadedFastFingerprint")
    return fingerprint
|
|
831
|
+
|
|
832
|
+
|
|
833
|
+
def expand_bundle_path(repo: Path, config: dict[str, Any]) -> Path:
    """Return the output path for a single-file bundle.

    ``bundle.output`` is a template (relative to *repo*) that may use
    ``{prefix}`` and ``{timestamp}``. The prefix falls back to
    ``<repo name>-repo`` and the timestamp is the current UTC time.
    """
    notebook_cfg = config.get("notebooklm", {})
    prefix = notebook_cfg.get("source_title_prefix") or f"{repo.name}-repo"
    template = config.get("bundle", {}).get("output") or f"{CONFIG_DIR}/cache/{{prefix}}-{{timestamp}}.txt"
    stamp = now_utc().strftime("%Y%m%dT%H%M%SZ")
    return repo / template.format(prefix=prefix, timestamp=stamp)
|
|
839
|
+
|
|
840
|
+
|
|
841
|
+
def expand_chunk_path(repo: Path, config: dict[str, Any], title: str) -> Path:
    """Return the output path for the chunk file named *title*.

    When the ``bundle.output`` template contains ``{title}`` it is formatted
    with the title, prefix and current UTC timestamp. Otherwise only the
    template's parent directory is used and *title* becomes the file name.
    """
    template = config.get("bundle", {}).get("output") or f"{CONFIG_DIR}/cache/{{title}}"
    if "{title}" not in template:
        return (repo / template).parent / title
    prefix = config.get("notebooklm", {}).get("source_title_prefix") or default_short_source_title_prefix()
    stamp = now_utc().strftime("%Y%m%dT%H%M%SZ")
    return repo / template.format(title=title, prefix=prefix, timestamp=stamp)
|
|
848
|
+
|
|
849
|
+
|
|
850
|
+
def repomix_cmd() -> list[str]:
    """Return the argv prefix used to invoke repomix.

    Prefers a ``repomix`` executable on PATH, then ``npx repomix``; calls
    ``die`` when neither tool is found.
    """
    direct = shutil.which("repomix")
    if direct:
        return [direct]
    has_npx = shutil.which("npx")
    if has_npx:
        return ["npx", "repomix"]
    die("required tool not found on PATH: repomix or npx")
|
|
857
|
+
|
|
858
|
+
|
|
859
|
+
def repomix_base_argv(config: dict[str, Any]) -> list[str]:
    """Return the repomix argv shared by every bundle invocation.

    Adds ``--style``, ``--compress`` and ``--ignore`` according to the
    ``bundle`` config section and the never-upload specs.
    """
    command = repomix_cmd()
    bundle_cfg = config.get("bundle", {})
    style_name = str(bundle_cfg.get("style") or "").strip()
    if style_name:
        command += ["--style", style_name]
    if bundle_cfg.get("compress"):
        command.append("--compress")
    ignored = ",".join(never_upload_specs(config))
    if ignored:
        command += ["--ignore", ignored]
    return command
|
|
871
|
+
|
|
872
|
+
|
|
873
|
+
def build_bundle(repo: Path, config: dict[str, Any]) -> Path:
    """Render the whole include set into one repomix bundle and return its path.

    Calls ``die`` with repomix's stdout/stderr when the command exits non-zero.
    """
    target = expand_bundle_path(repo, config)
    target.parent.mkdir(parents=True, exist_ok=True)
    include_arg = ",".join(include_specs(config))
    command = repomix_base_argv(config) + ["--include", include_arg, "--output", str(target)]
    completed = run(command, repo, timeout=600)
    if completed.returncode != 0:
        die(f"repomix failed:\n{completed.stdout}\n{completed.stderr}")
    return target
|
|
882
|
+
|
|
883
|
+
|
|
884
|
+
def build_bundle_set(repo: Path, config: dict[str, Any], *, set_id: str, state: dict[str, Any] | None = None) -> list[dict[str, Any]]:
    """Render every planned chunk with repomix and return their descriptors.

    Each returned item is a copy of the planned chunk extended with the
    rendered file ``path``, its hashes, byte size and file count.

    On any failure (including a chunk exceeding ``bundle.max_chunk_bytes``)
    every chunk file rendered so far is removed before re-raising. This
    includes the file for the chunk that failed, which previously stayed
    behind in the cache directory because it had not yet been recorded.
    """
    max_bytes = parse_size_bytes(config.get("bundle", {}).get("max_chunk_bytes"), 900000)
    chunks = plan_bundle_chunks(repo, config, set_id=set_id, state=state)
    bundles: list[dict[str, Any]] = []
    # Every output path handed to repomix, so cleanup can cover partial work.
    rendered: list[Path] = []
    try:
        for chunk in chunks:
            title = str(chunk["title"])
            out = expand_chunk_path(repo, config, title)
            out.parent.mkdir(parents=True, exist_ok=True)
            rendered.append(out)
            # repomix reads the chunk's file list from stdin, one path per line.
            input_text = "\n".join(str(path) for path in chunk["files"]) + "\n"
            argv = [*repomix_base_argv(config), "--stdin", "--output", str(out)]
            result = run(argv, repo, input_text=input_text, timeout=600)
            if result.returncode != 0:
                die(f"repomix failed for chunk {title}:\n{result.stdout}\n{result.stderr}")
            actual_size = out.stat().st_size
            if actual_size > max_bytes:
                die(f"rendered chunk exceeds max size ({max_bytes} bytes): {title} ({actual_size} bytes)")
            item = dict(chunk)
            item["path"] = str(out)
            item["bundleSha256"] = sha256_file(out)
            item["contentSha256"] = item["bundleSha256"]
            item["fileListSha256"] = item.get("sha256")
            item["actualBytes"] = actual_size
            item["fileCount"] = len(chunk["files"])
            bundles.append(item)
    except BaseException:
        # BaseException so that a SystemExit/KeyboardInterrupt also cleans up.
        for path in rendered:
            if path.exists():
                remove_file_quiet(path)
        raise
    return bundles
|
|
915
|
+
|
|
916
|
+
|
|
917
|
+
def notebook_id(config: dict[str, Any]) -> str:
    """Return ``notebooklm.notebook_id`` from *config* as a string.

    Calls ``die`` when the value is missing or empty.
    """
    configured = config.get("notebooklm", {}).get("notebook_id", "")
    if configured:
        return str(configured)
    die("notebooklm.notebook_id missing in config")
    return str(configured)
|
|
922
|
+
|
|
923
|
+
|
|
924
|
+
def notebook_title(config: dict[str, Any]) -> str:
    """Return the notebook title for this project.

    An explicit ``notebooklm.notebook_title`` wins; otherwise the title is
    derived from the project name and the configured (or default) prefix.
    """
    notebook_cfg = config.get("notebooklm", {})
    project = str(config.get("project", {}).get("name") or "repo")
    prefix = str(notebook_cfg.get("notebook_title_prefix") or DEFAULT_NOTEBOOK_TITLE_PREFIX)
    explicit = notebook_cfg.get("notebook_title")
    if explicit:
        return str(explicit)
    return str(default_notebook_title(project, prefix))
|
|
928
|
+
|
|
929
|
+
|
|
930
|
+
def parse_notebook_json(stdout: str, fallback_title: str) -> dict[str, Any] | None:
    """Extract ``{"id", "title"}`` from the JSON printed by a notebook command.

    The top-level object and its ``notebook`` / ``data`` / ``result``
    sub-objects are inspected in that order; the first one carrying an id
    wins. Returns ``None`` for invalid JSON or when no id is found.
    """
    try:
        payload = json.loads(stdout)
    except json.JSONDecodeError:
        return None
    if not isinstance(payload, dict):
        return None
    nested = [payload.get(key) for key in ("notebook", "data", "result")]
    for entry in [payload, *(value for value in nested if isinstance(value, dict))]:
        found_id = next((entry[key] for key in ("id", "notebook_id", "notebookId") if entry.get(key)), None)
        if found_id:
            label = entry.get("title") or entry.get("name") or fallback_title
            return {"id": str(found_id), "title": str(label)}
    return None
|
|
949
|
+
|
|
950
|
+
|
|
951
|
+
def list_notebooks(repo: Path) -> list[dict[str, Any]]:
    """Return the notebooks reported by ``notebooklm list --json``.

    Accepts either ``{"notebooks": [...]}`` or a bare top-level list. The
    payload type is checked before calling ``.get`` because a list payload
    would otherwise raise ``AttributeError``. Non-dict entries are dropped.
    Calls ``die`` when the command fails or prints invalid JSON.
    """
    result = run([*notebooklm_cmd(), "list", "--json"], repo, timeout=120)
    if result.returncode != 0:
        die(f"notebooklm list failed:\n{result.stdout}\n{result.stderr}")
    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError as error:
        die(f"notebooklm list returned invalid JSON: {error}")
    if isinstance(data, dict):
        notebooks = data.get("notebooks", [])
    elif isinstance(data, list):
        notebooks = data
    else:
        notebooks = []
    if not isinstance(notebooks, list):
        # e.g. {"notebooks": null}: treat as "no notebooks" instead of crashing.
        notebooks = []
    return [item for item in notebooks if isinstance(item, dict)]
|
|
961
|
+
|
|
962
|
+
|
|
963
|
+
def find_notebook_by_title(repo: Path, title: str) -> dict[str, Any] | None:
    """Look up the single notebook whose title equals *title*.

    Returns ``None`` when there is no match and calls ``die`` when the title
    is ambiguous.
    """
    same_title = [nb for nb in list_notebooks(repo) if str(nb.get("title", "")) == title]
    if len(same_title) > 1:
        ids = ", ".join(str(nb.get("id", "")) for nb in same_title)
        die(f"multiple notebooks found with title {title!r}: {ids}")
    if same_title:
        chosen = same_title[0]
        return {"id": str(chosen.get("id", "")), "title": str(chosen.get("title", title))}
    return None
|
|
972
|
+
|
|
973
|
+
|
|
974
|
+
def create_notebook(repo: Path, title: str) -> dict[str, Any]:
    """Create a notebook named *title* and return its ``{"id", "title"}``.

    The id is taken from the command's JSON output when possible, otherwise
    resolved by listing notebooks by title. Calls ``die`` if creation fails
    or the id cannot be resolved.
    """
    completed = run([*notebooklm_cmd(), "create", title, "--json"], repo, timeout=180)
    if completed.returncode != 0:
        die(f"notebooklm create failed:\n{completed.stdout}\n{completed.stderr}")
    from_output = parse_notebook_json(completed.stdout, title)
    if from_output:
        return from_output
    from_listing = find_notebook_by_title(repo, title)
    if from_listing:
        return from_listing
    die(f"created notebook but could not resolve notebook id for title {title!r}")
|
|
985
|
+
|
|
986
|
+
|
|
987
|
+
def list_sources(repo: Path, nbid: str) -> list[dict[str, Any]]:
    """Return the sources of notebook *nbid* (best effort).

    Accepts either ``{"sources": [...]}`` or a bare top-level list. The
    payload type is checked before calling ``.get`` because a list payload
    would otherwise raise ``AttributeError``. A failed command or invalid
    JSON yields an empty list rather than an error. Non-dict entries are
    dropped.
    """
    result = run([*notebooklm_cmd(), "source", "list", "-n", nbid, "--json"], repo, timeout=120)
    if result.returncode != 0:
        return []
    try:
        data = json.loads(result.stdout)
    except json.JSONDecodeError:
        return []
    if isinstance(data, dict):
        sources = data.get("sources", [])
    elif isinstance(data, list):
        sources = data
    else:
        sources = []
    if not isinstance(sources, list):
        # e.g. {"sources": null}: keep the best-effort contract.
        sources = []
    return [src for src in sources if isinstance(src, dict)]
|
|
997
|
+
|
|
998
|
+
|
|
999
|
+
def find_source_by_title(repo: Path, nbid: str, title: str) -> dict[str, Any] | None:
    """Return ``{"id", "title"}`` for the first listed source titled *title* that has an id."""
    for candidate in list_sources(repo, nbid):
        source_id = candidate.get("id")
        if source_id and str(candidate.get("title", "")) == title:
            return {"id": str(source_id), "title": title}
    return None
|
|
1007
|
+
|
|
1008
|
+
|
|
1009
|
+
def find_uploaded_source(before: list[dict[str, Any]], after: list[dict[str, Any]], bundle: Path, prefix: str, title_hint: str | None = None) -> dict[str, Any]:
    """Guess which listed source corresponds to the bundle just uploaded.

    First pass: a source absent from *before* whose title equals the
    expected title or starts with *prefix*. Second pass: the same title
    test without the "new" requirement. If nothing matches, a record with
    an empty id is returned.
    """
    known_ids = {str(entry.get("id")) for entry in before if entry.get("id")}
    expected = title_hint or bundle.name

    def pick(require_new: bool) -> dict[str, Any] | None:
        for entry in after:
            entry_title = str(entry.get("title", ""))
            entry_id = str(entry.get("id", ""))
            if not entry_id or (require_new and entry_id in known_ids):
                continue
            if entry_title == expected or entry_title.startswith(prefix):
                return {"id": entry_id, "title": entry_title or expected}
        return None

    return pick(True) or pick(False) or {"id": "", "title": expected}
|
|
1023
|
+
|
|
1024
|
+
|
|
1025
|
+
def source_from_add_json(stdout: str, bundle: Path, title_hint: str | None = None) -> dict[str, Any] | None:
    """Extract ``{"id", "title"}`` from the JSON printed by ``source add``.

    The top-level object and its ``source`` / ``data`` / ``result``
    sub-objects are inspected in that order. The title falls back to
    *title_hint*, then to the bundle file name. Returns ``None`` for
    invalid JSON or when no id is present.
    """
    try:
        payload = json.loads(stdout)
    except json.JSONDecodeError:
        return None
    if not isinstance(payload, dict):
        return None
    nested = [payload.get(key) for key in ("source", "data", "result")]
    for entry in [payload, *(value for value in nested if isinstance(value, dict))]:
        found_id = next((entry[key] for key in ("id", "source_id", "sourceId") if entry.get(key)), None)
        if found_id:
            label = entry.get("title") or entry.get("name") or title_hint or bundle.name
            return {"id": str(found_id), "title": str(label)}
    return None
|
|
1044
|
+
|
|
1045
|
+
|
|
1046
|
+
def upload_bundle(repo: Path, config: dict[str, Any], state: dict[str, Any], bundle: Path, bundle_hash: str) -> dict[str, Any]:
    """Upload one bundle file as a notebook source and return its record.

    The source id comes from the command's JSON output, or failing that
    from comparing the source listing before and after the upload. When
    configured, waits for processing (a failed wait only warns) and, in
    ``replace`` refresh mode, prunes older sources; pruned ids are attached
    under ``_prunedSourceIds``.
    """
    nbid = notebook_id(config)
    notebook_cfg = config.get("notebooklm", {})
    prefix = str(notebook_cfg.get("source_title_prefix") or bundle.stem)
    listing_before = list_sources(repo, nbid)
    added = run([*notebooklm_cmd(), "source", "add", str(bundle), "-n", nbid, "--json"], repo, timeout=600)
    if added.returncode != 0:
        die(f"notebooklm source add failed:\n{added.stdout}\n{added.stderr}")
    listing_after = list_sources(repo, nbid)
    source = source_from_add_json(added.stdout, bundle)
    if not source:
        source = find_uploaded_source(listing_before, listing_after, bundle, prefix)
    source["bundleSha256"] = bundle_hash
    source["uploadedAt"] = iso()

    if config.get("notebooklm", {}).get("wait_after_upload") and source.get("id"):
        waited = run([*notebooklm_cmd(), "source", "wait", str(source["id"]), "-n", nbid], repo, timeout=600)
        if waited.returncode != 0:
            print(f"warning: source wait failed for {source['id']}", file=sys.stderr)

    replace_mode = config.get("refresh", {}).get("mode", "replace") == "replace"
    if replace_mode:
        removed = prune_sources(repo, config, state, source)
        if removed:
            source["_prunedSourceIds"] = removed
    return source
|
|
1067
|
+
|
|
1068
|
+
|
|
1069
|
+
def upload_file_source(repo: Path, config: dict[str, Any], path: Path, title: str) -> dict[str, Any]:
    """Upload *path* as a source titled *title* and return ``{"id", "title"}``.

    Calls ``die`` if the upload fails or no source id can be resolved.
    """
    nbid = notebook_id(config)
    command = [*notebooklm_cmd(), "source", "add", str(path), "-n", nbid, "--title", title, "--json"]
    completed = run(command, repo, timeout=600)
    if completed.returncode != 0:
        die(f"notebooklm source add failed for {title}:\n{completed.stdout}\n{completed.stderr}")
    record = source_from_add_json(completed.stdout, path, title) or find_source_by_title(repo, nbid, title)
    if not (record and record.get("id")):
        die(f"uploaded source but could not resolve source id for {title}")
    return record
|
|
1078
|
+
|
|
1079
|
+
|
|
1080
|
+
def wait_source_ready(repo: Path, nbid: str, source_id: str) -> bool:
    """Run ``source wait`` for *source_id*; return True when it exits with status 0."""
    command = [*notebooklm_cmd(), "source", "wait", source_id, "-n", nbid]
    completed = run(command, repo, timeout=600)
    return completed.returncode == 0
|
|
1083
|
+
|
|
1084
|
+
|
|
1085
|
+
def source_content_sha(value: dict[str, Any]) -> str:
    """Return the content hash of a source/bundle record, or "" if none is set.

    Checks ``contentSha256``, ``chunkSha256`` and ``bundleSha256`` in that order.
    """
    for key in ("contentSha256", "chunkSha256", "bundleSha256"):
        digest = value.get(key)
        if digest:
            return str(digest)
    return ""
|
|
1087
|
+
|
|
1088
|
+
|
|
1089
|
+
def source_file_list_sha(value: dict[str, Any]) -> str:
    """Return the file-list hash of a record (``fileListSha256``, then ``sha256``), or ""."""
    for key in ("fileListSha256", "sha256"):
        digest = value.get(key)
        if digest:
            return str(digest)
    return ""
|
|
1091
|
+
|
|
1092
|
+
|
|
1093
|
+
def chunk_key(value: dict[str, Any]) -> str:
    """Return the ``"<group>/<chunk>"`` key for a chunk record."""
    group = value.get("group")
    chunk = value.get("chunk")
    return "{}/{}".format(group, chunk)
|
|
1095
|
+
|
|
1096
|
+
|
|
1097
|
+
def temp_source_prefix(config: dict[str, Any]) -> str:
    """Return the title prefix used for temporary sources.

    A configured ``temporary_source_title_prefix`` is slugified and used
    as-is; otherwise ``"tmp"`` is appended to the regular source title
    prefix (or its default).
    """
    notebook_cfg = config.get("notebooklm", {})
    explicit = str(notebook_cfg.get("temporary_source_title_prefix") or "").strip()
    if explicit:
        return slugify(explicit)
    base = str(config.get("notebooklm", {}).get("source_title_prefix") or default_short_source_title_prefix())
    return base.strip() + "tmp"
|
|
1102
|
+
|
|
1103
|
+
|
|
1104
|
+
def temp_source_title(config: dict[str, Any], *, set_id: str, kind: str, title: str, content_sha: str) -> str:
    """Build the ``.md`` title of a temporary source.

    The title joins the temporary prefix, the set id, the slugified kind
    and title, and the first 8 characters of the digest (the part of
    *content_sha* after its first ``:``, if any).
    """
    _, colon, tail = content_sha.partition(":")
    digest = tail if colon else content_sha
    pieces = [temp_source_prefix(config), set_id, slugify(kind), slugify(title), digest[:8]]
    return "--".join(pieces) + ".md"
|
|
1107
|
+
|
|
1108
|
+
|
|
1109
|
+
def stage_temp_source_file(repo: Path, title: str, source_path: Path) -> Path:
    """Copy *source_path* into the repo's cache directory under the name *title* and return the copy's path."""
    cache_dir = repo / CONFIG_DIR / "cache"
    destination = cache_dir / title
    destination.parent.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(source_path, destination)
    return destination
|
|
1114
|
+
|
|
1115
|
+
|
|
1116
|
+
def source_with_chunk_metadata(source: dict[str, Any], bundle: dict[str, Any], *, status: str, reused: bool = False) -> dict[str, Any]:
    """Return a copy of *source* annotated with the chunk metadata of *bundle*.

    Reused sources are flagged with ``reused`` / ``reusedAt``; freshly
    uploaded ones get ``uploadedAt``. The input dicts are not modified.
    """
    annotated = {
        **source,
        "group": bundle.get("group"),
        "chunk": bundle.get("chunk"),
        "chunkKey": chunk_key(bundle),
        "chunkSha256": bundle.get("bundleSha256"),
        "contentSha256": bundle.get("contentSha256") or bundle.get("bundleSha256"),
        "fileListSha256": bundle.get("fileListSha256") or bundle.get("sha256"),
        "fileCount": bundle.get("fileCount"),
        "files": list(bundle.get("files", [])),
        "status": status,
    }
    if not reused:
        annotated["uploadedAt"] = iso()
        return annotated
    annotated["reused"] = True
    annotated["reusedAt"] = iso()
    return annotated
|
|
1137
|
+
|
|
1138
|
+
|
|
1139
|
+
def upload_one_chunk(repo: Path, config: dict[str, Any], bundle: dict[str, Any]) -> dict[str, Any]:
    """Upload one rendered chunk and return its source record with chunk metadata.

    Calls ``die`` if the upload fails or no source id can be resolved.
    """
    nbid = notebook_id(config)
    chunk_file = Path(str(bundle["path"]))
    title = str(bundle["title"])
    command = [*notebooklm_cmd(), "source", "add", str(chunk_file), "-n", nbid, "--title", title, "--json"]
    completed = run(command, repo, timeout=600)
    if completed.returncode != 0:
        die(f"notebooklm source add failed for chunk {title}:\n{completed.stdout}\n{completed.stderr}")
    record = source_from_add_json(completed.stdout, chunk_file, title) or find_source_by_title(repo, nbid, title)
    if not (record and record.get("id")):
        die(f"uploaded chunk but could not resolve source id for {title}")
    return source_with_chunk_metadata(record, bundle, status="uploaded")
|
|
1150
|
+
|
|
1151
|
+
|
|
1152
|
+
def source_set_hash(bundles: list[dict[str, Any]]) -> str:
    """Hash a bundle set from each bundle's group, chunk, content hash and file-list hash."""
    lines: list[str] = []
    for entry in bundles:
        group = entry.get("group")
        chunk = entry.get("chunk")
        lines.append(f"{group} {chunk} {source_content_sha(entry)} {source_file_list_sha(entry)}")
    return sha256_text("\n".join(lines))
|
|
1158
|
+
|
|
1159
|
+
|
|
1160
|
+
def active_sources(state: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the dict entries of the currently active source list.

    Uses ``activeSourceSet.sources`` when that is a list, otherwise falls
    back to the top-level ``sources`` list.
    """
    current_set = state.get("activeSourceSet")
    listed = current_set.get("sources") if isinstance(current_set, dict) else None
    if not isinstance(listed, list):
        listed = state.get("sources", [])
    return [entry for entry in listed if isinstance(entry, dict)]
|
|
1167
|
+
|
|
1168
|
+
|
|
1169
|
+
def active_ready_source_ids(state: dict[str, Any]) -> list[str]:
    """Return the ids of active sources whose status is ``ready`` (a missing status counts as ready)."""
    ready_ids: list[str] = []
    for entry in active_sources(state):
        source_id = str(entry.get("id") or "")
        if source_id and str(entry.get("status") or "ready") == "ready":
            ready_ids.append(source_id)
    return ready_ids
|
|
1179
|
+
|
|
1180
|
+
|
|
1181
|
+
def cleanup_pending_source_ids(state: dict[str, Any]) -> list[str]:
    """Return the queued cleanup ids as unique, non-empty strings in first-seen order.

    Anything other than a list under ``cleanupPendingSourceIds`` yields ``[]``.
    """
    queued = state.get("cleanupPendingSourceIds")
    if not isinstance(queued, list):
        return []
    seen: set[str] = set()
    ordered: list[str] = []
    for item in queued:
        source_id = str(item)
        if source_id and source_id not in seen:
            seen.add(source_id)
            ordered.append(source_id)
    return ordered
|
|
1186
|
+
|
|
1187
|
+
|
|
1188
|
+
def queue_cleanup_source_ids(state: dict[str, Any], source_ids: list[str]) -> list[str]:
    """Merge *source_ids* into the state's cleanup queue and return the new queue.

    Ids of currently active sources are never queued. The
    ``cleanupPendingSourceIds`` key is removed when the queue ends up empty.
    """
    active_ids = {str(entry.get("id") or "") for entry in active_sources(state) if entry.get("id")}
    combined = cleanup_pending_source_ids(state) + list(source_ids)
    # dict.fromkeys de-duplicates while keeping first-seen order.
    queue = [sid for sid in dict.fromkeys(combined) if sid and sid not in active_ids]
    if not queue:
        state.pop("cleanupPendingSourceIds", None)
    else:
        state["cleanupPendingSourceIds"] = queue
    return queue
|
|
1196
|
+
|
|
1197
|
+
|
|
1198
|
+
def pending_upload_path(repo: Path) -> Path:
    """Return the path of the pending-upload journal inside the repo's config directory."""
    config_dir = repo / CONFIG_DIR
    return config_dir / PENDING_UPLOAD_JSON
|
|
1200
|
+
|
|
1201
|
+
|
|
1202
|
+
def clear_pending_upload(repo: Path) -> None:
    """Remove the pending-upload journal for *repo*."""
    journal_file = pending_upload_path(repo)
    remove_file_quiet(journal_file)
|
|
1204
|
+
|
|
1205
|
+
|
|
1206
|
+
def write_pending_upload(repo: Path, value: dict[str, Any]) -> None:
    """Write *value* as the pending-upload journal for *repo*."""
    journal_file = pending_upload_path(repo)
    write_json(journal_file, value)
|
|
1208
|
+
|
|
1209
|
+
|
|
1210
|
+
def read_pending_upload(repo: Path) -> dict[str, Any] | None:
    """Load the pending-upload journal.

    Returns ``None`` when no journal file exists, and an empty journal
    (``{"sources": []}``) when the file is not valid JSON or does not hold
    a JSON object.
    """
    journal_file = pending_upload_path(repo)
    if not journal_file.exists():
        return None
    try:
        loaded = json.loads(journal_file.read_text())
    except json.JSONDecodeError:
        return {"sources": []}
    if isinstance(loaded, dict):
        return loaded
    return {"sources": []}
|
|
1219
|
+
|
|
1220
|
+
|
|
1221
|
+
def delete_source_ids_parallel(repo: Path, nbid: str, source_ids: list[str], *, parallelism: int) -> list[str]:
    """Delete the given sources from notebook *nbid* using a thread pool.

    Duplicate and empty ids are ignored. A failed deletion prints a warning
    and is left out of the result. Returns the ids that were deleted, in
    completion order.
    """
    unique_ids = [sid for sid in dict.fromkeys(source_ids) if sid]
    if not unique_ids:
        return []
    pool_size = min(len(unique_ids), max(1, parallelism))

    def remove(source_id: str) -> str | None:
        outcome = run([*notebooklm_cmd(), "source", "delete", source_id, "-n", nbid, "--yes"], repo, timeout=120)
        if outcome.returncode == 0:
            return source_id
        print(f"warning: failed to delete source {source_id}", file=sys.stderr)
        return None

    removed: list[str] = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as pool:
        pending = [pool.submit(remove, sid) for sid in unique_ids]
        for done in concurrent.futures.as_completed(pending):
            finished = done.result()
            if finished:
                removed.append(finished)
                # NOTE(review): nesting of this progress line was reconstructed
                # from a flattened listing — confirm it belongs under the `if`.
                print(f"cleanup {len(removed)}/{len(unique_ids)}", file=sys.stderr)
    return removed
|
|
1243
|
+
|
|
1244
|
+
|
|
1245
|
+
def recover_pending_cleanup(repo: Path, config: dict[str, Any], state: dict[str, Any], state_path: Path) -> list[str]:
    """Retry deletion of sources queued in ``cleanupPendingSourceIds``.

    Ids of currently active sources are dropped from the queue without
    being deleted. The state file is rewritten whenever there was a queue
    to process. Returns the ids deleted by this call.
    """
    queued = cleanup_pending_source_ids(state)
    if not queued:
        return []
    active_ids = {str(entry.get("id") or "") for entry in active_sources(state) if entry.get("id")}
    targets = [sid for sid in queued if sid not in active_ids]
    removed: list[str] = []
    if targets:
        removed = delete_source_ids_parallel(
            repo,
            notebook_id(config),
            targets,
            parallelism=positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4),
        )
    removed_set = set(removed)
    # Whatever could not be deleted stays queued for the next run.
    still_queued = [sid for sid in targets if sid not in removed_set]
    if still_queued:
        state["cleanupPendingSourceIds"] = still_queued
    else:
        state.pop("cleanupPendingSourceIds", None)
    write_json(state_path, state)
    return removed
|
|
1269
|
+
|
|
1270
|
+
|
|
1271
|
+
def recover_pending_upload(repo: Path, config: dict[str, Any], state: dict[str, Any] | None = None) -> list[str]:
    """Roll back sources left behind by an interrupted chunk upload.

    Reads the pending-upload journal and deletes every journaled source
    that is not part of the active source set. The journal is cleared when
    it is malformed, when all journaled ids are already active, or when no
    entries remain after deletion; otherwise it is rewritten with the
    remaining entries. Returns the ids deleted by this call.
    """
    pending = read_pending_upload(repo)
    if not pending:
        return []
    sources = pending.get("sources")
    if not isinstance(sources, list):
        clear_pending_upload(repo)
        return []
    active_ids = {str(src.get("id")) for src in active_sources(state or {}) if src.get("id")}
    ids = [str(src.get("id")) for src in sources if isinstance(src, dict) and src.get("id")]
    if ids and active_ids and all(sid in active_ids for sid in ids):
        # The upload was committed to state; only the journal is stale.
        clear_pending_upload(repo)
        return []
    # The journal records which notebook the upload targeted.
    nbid = str(pending.get("notebookId") or notebook_id(config))
    delete_ids = [sid for sid in ids if sid not in active_ids]
    deleted = delete_source_ids_parallel(
        repo,
        nbid,
        delete_ids,
        parallelism=positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4),
    )
    # Build the lookup set once rather than once per journal entry.
    deleted_ids = set(deleted)
    remaining = [src for src in sources if isinstance(src, dict) and str(src.get("id") or "") not in deleted_ids]
    if remaining:
        pending["sources"] = remaining
        write_pending_upload(repo, pending)
    else:
        clear_pending_upload(repo)
    return deleted
|
|
1299
|
+
|
|
1300
|
+
|
|
1301
|
+
def append_pending_source(repo: Path, journal: dict[str, Any], source: dict[str, Any], lock: threading.Lock) -> None:
    """Record an uploaded source's id and title in *journal* and persist the journal, holding *lock*."""
    entry = {"id": source.get("id"), "title": source.get("title")}
    with lock:
        recorded = journal.setdefault("sources", [])
        if isinstance(recorded, list):
            recorded.append(entry)
        # NOTE(review): nesting reconstructed from a flattened listing — the
        # write is assumed to happen inside the lock but outside the `if`.
        write_pending_upload(repo, journal)
|
|
1307
|
+
|
|
1308
|
+
|
|
1309
|
+
def find_reusable_source(bundle: dict[str, Any], previous_sources: list[dict[str, Any]], used_ids: set[str]) -> dict[str, Any] | None:
    """Find a previously uploaded source whose content hash matches *bundle*.

    Only sources with an id, a ``ready`` status (missing counts as ready)
    and an id not already in *used_ids* qualify. The chosen id is added to
    *used_ids* so it is not handed out twice.
    """
    target_sha = source_content_sha(bundle)
    if not target_sha:
        return None
    for candidate in previous_sources:
        candidate_id = str(candidate.get("id") or "")
        if not candidate_id or candidate_id in used_ids:
            continue
        is_ready = str(candidate.get("status") or "ready") == "ready"
        if is_ready and source_content_sha(candidate) == target_sha:
            used_ids.add(candidate_id)
            return candidate
    return None
|
|
1323
|
+
|
|
1324
|
+
|
|
1325
|
+
def upload_chunks_parallel(repo: Path, config: dict[str, Any], bundles: list[tuple[int, dict[str, Any]]], *, set_id: str) -> list[tuple[int, dict[str, Any]]]:
    """Upload ``(index, bundle)`` pairs concurrently and return ``(index, source)`` pairs sorted by index.

    A pending-upload journal is written before any upload starts and
    extended after each successful one. If any upload fails, the sources
    uploaded so far are deleted, the journal is cleared, and the first
    collected error is re-raised.
    """
    if not bundles:
        return []
    nbid = notebook_id(config)
    notebook_cfg = config.get("notebooklm", {})
    pool_size = min(len(bundles), positive_int(notebook_cfg.get("upload_parallelism"), 4))
    journal: dict[str, Any] = {
        "version": 1,
        "setId": set_id,
        "notebookId": nbid,
        "startedAt": iso(),
        "sources": [],
    }
    write_pending_upload(repo, journal)
    journal_lock = threading.Lock()
    done_pairs: list[tuple[int, dict[str, Any]]] = []
    failures: list[BaseException] = []

    def upload_pair(pair: tuple[int, dict[str, Any]]) -> tuple[int, dict[str, Any]]:
        position, chunk = pair
        record = upload_one_chunk(repo, config, chunk)
        append_pending_source(repo, journal, record, journal_lock)
        return position, record

    with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as pool:
        tasks = [pool.submit(upload_pair, pair) for pair in bundles]
        for task in concurrent.futures.as_completed(tasks):
            # BaseException: collected so the rollback below runs for any worker failure.
            try:
                done_pairs.append(task.result())
                print(f"upload {len(done_pairs)}/{len(bundles)}", file=sys.stderr)
            except BaseException as error:
                failures.append(error)

    if not failures:
        return sorted(done_pairs, key=lambda pair: pair[0])
    uploaded_ids = [str(record.get("id") or "") for _, record in done_pairs]
    delete_source_ids_parallel(
        repo,
        nbid,
        uploaded_ids,
        parallelism=positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4),
    )
    clear_pending_upload(repo)
    raise failures[0]
|
|
1371
|
+
|
|
1372
|
+
|
|
1373
|
+
def wait_uploaded_sources_parallel(repo: Path, config: dict[str, Any], sources: list[tuple[int, dict[str, Any]]]) -> list[tuple[int, dict[str, Any]]]:
    """Wait concurrently for uploaded sources to finish processing.

    Returns *sources* unchanged when it is empty or
    ``notebooklm.wait_after_upload`` is disabled (it defaults to enabled
    here). Otherwise returns copies marked ``status="ready"``, sorted by
    index; if any wait fails, calls ``die`` with all collected messages.
    """
    wait_enabled = config.get("notebooklm", {}).get("wait_after_upload", True)
    if not sources or not wait_enabled:
        return sources
    nbid = notebook_id(config)
    pool_size = min(len(sources), positive_int(config.get("notebooklm", {}).get("wait_parallelism"), 8))
    finished: list[tuple[int, dict[str, Any]]] = []
    problems: list[str] = []

    def wait_one(pair: tuple[int, dict[str, Any]]) -> tuple[int, dict[str, Any]]:
        position, record = pair
        source_id = str(record.get("id") or "")
        if not source_id:
            raise RuntimeError(f"missing source id for {record.get('title')}")
        if not wait_source_ready(repo, nbid, source_id):
            raise RuntimeError(f"source processing failed for chunk {record.get('title')}: {source_id}")
        return position, {**record, "status": "ready"}

    with concurrent.futures.ThreadPoolExecutor(max_workers=pool_size) as pool:
        tasks = [pool.submit(wait_one, pair) for pair in sources]
        for task in concurrent.futures.as_completed(tasks):
            try:
                finished.append(task.result())
                print(f"wait {len(finished)}/{len(sources)}", file=sys.stderr)
            except Exception as error:
                problems.append(str(error))
    if problems:
        die("\n".join(problems))
    return sorted(finished, key=lambda pair: pair[0])
|
|
1407
|
+
|
|
1408
|
+
|
|
1409
|
+
def upload_bundle_set(repo: Path, config: dict[str, Any], state: dict[str, Any], bundles: list[dict[str, Any]], *, set_id: str) -> dict[str, Any]:
    """Upload a chunk set, reusing unchanged sources, and return the new source-set record.

    Bundles whose content hash matches a ready source from the active set
    are reused instead of uploaded. If waiting for the new uploads fails,
    those uploads are deleted and the journal is cleared before re-raising.
    In ``replace`` mode with ``delete_previous_after_success`` enabled, the
    ids of superseded sources are returned under ``_retiredSourceIds``.
    """
    nbid = notebook_id(config)
    recover_pending_upload(repo, config, state)
    previous_sources = active_sources(state)
    claimed_ids: set[str] = set()
    slots: list[dict[str, Any] | None] = [None] * len(bundles)
    to_upload: list[tuple[int, dict[str, Any]]] = []
    for position, chunk in enumerate(bundles):
        match = find_reusable_source(chunk, previous_sources, claimed_ids)
        if not match:
            to_upload.append((position, chunk))
            continue
        slots[position] = source_with_chunk_metadata(match, chunk, status="ready", reused=True)

    uploaded_sources = upload_chunks_parallel(repo, config, to_upload, set_id=set_id)
    try:
        ready_sources = wait_uploaded_sources_parallel(repo, config, uploaded_sources)
    except BaseException:
        # Roll back this run's uploads so no half-processed sources remain.
        delete_source_ids_parallel(
            repo,
            nbid,
            [str(record.get("id") or "") for _, record in uploaded_sources],
            parallelism=positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4),
        )
        clear_pending_upload(repo)
        raise
    for position, record in ready_sources:
        slots[position] = record

    sources = [record for record in slots if isinstance(record, dict)]
    active_ids = {str(record.get("id")) for record in sources if record.get("id")}
    previous_ids = [str(record.get("id")) for record in previous_sources if record.get("id")]
    refresh_cfg = config.get("refresh", {})
    keep_previous = int(refresh_cfg.get("keep_previous_sources", 0))
    keep_ids = set(previous_ids[-keep_previous:]) if keep_previous > 0 else set()
    retired_ids = [sid for sid in previous_ids if not (sid in active_ids or sid in keep_ids)]
    source_set = {
        "id": set_id,
        "prefix": str(config.get("notebooklm", {}).get("source_title_prefix") or default_short_source_title_prefix()),
        "bundleSetSha256": source_set_hash(bundles),
        "uploadedAt": iso(),
        "sources": sources,
    }
    replace_mode = config.get("refresh", {}).get("mode", "replace") == "replace"
    if replace_mode and config.get("refresh", {}).get("delete_previous_after_success", True):
        source_set["_retiredSourceIds"] = retired_ids
    return source_set
|
|
1452
|
+
|
|
1453
|
+
|
|
1454
|
+
def prune_sources(repo: Path, config: dict[str, Any], state: dict[str, Any], new_source: dict[str, Any]) -> list[str]:
    """Delete previously recorded sources that are no longer needed.

    Does nothing when ``refresh.delete_previous_after_success`` is disabled.
    Keeps *new_source* and the last ``refresh.keep_previous_sources``
    (default 1) recorded sources. A failed deletion only prints a warning.
    Returns the ids that were deleted.
    """
    refresh_cfg = config.get("refresh", {})
    if not refresh_cfg.get("delete_previous_after_success", True):
        return []
    keep_previous = int(refresh_cfg.get("keep_previous_sources", 1))
    recorded = [entry for entry in state.get("sources", []) if entry.get("id")]
    protected: set[str] = set()
    if keep_previous > 0:
        protected = {str(entry.get("id")) for entry in recorded[-keep_previous:]}
    protected.add(str(new_source.get("id", "")))
    nbid = notebook_id(config)
    removed: list[str] = []
    for entry in recorded:
        source_id = str(entry.get("id", ""))
        if not source_id or source_id in protected:
            continue
        outcome = run([*notebooklm_cmd(), "source", "delete", source_id, "-n", nbid, "--yes"], repo, timeout=120)
        if outcome.returncode == 0:
            removed.append(source_id)
        else:
            print(f"warning: failed to delete old source {source_id}", file=sys.stderr)
    return removed
|
|
1474
|
+
|
|
1475
|
+
|
|
1476
|
+
def ensure_index(
    repo: Path,
    *,
    force: bool = False,
    yes: bool = False,
    json_output: bool = False,
    command: str = "ensure",
    return_uninitialized: bool = False,
) -> dict[str, Any]:
    """Check or refresh the index for *repo* while holding the repo lock.

    When the config file does not exist, returns the uninitialized status
    if *json_output* or *return_uninitialized* is set, and otherwise calls
    ``die`` with the missing-config message.
    """
    config_file = config_path(repo)
    if not config_file.exists():
        wants_status = json_output or return_uninitialized
        if wants_status:
            return uninitialized_status(repo, config_file)
        die(missing_config_message(repo, config_file, command))
    with repo_lock(repo):
        outcome = ensure_index_locked(repo, force=force, yes=yes, json_output=json_output, command=command)
        return outcome
|
|
1492
|
+
|
|
1493
|
+
|
|
1494
|
+
def _touch_check_state(state: dict[str, Any], state_path: Any, fast_hash: Any) -> None:
    """Stamp a completed freshness check on *state* and write it to *state_path*."""
    state["lastCheckedAt"] = iso()
    state["lastCheckedFastFingerprint"] = fast_hash
    state["lastBundlePath"] = None
    write_json(state_path, state)


def ensure_index_locked(repo: Path, *, force: bool = False, yes: bool = False, json_output: bool = False, command: str = "ensure") -> dict[str, Any]:
    """Decide whether the index needs a new upload and perform it if so.

    Returns a result dict whose ``status`` is one of ``fresh-ttl``,
    ``fresh-fingerprint``, ``needs-first-upload-approval``, ``stale-throttled``,
    ``auto-refresh-disabled``, ``fresh-bundle-hash`` or ``uploaded``. Every
    skip path except the approval one records the check in the state file.
    Temporary bundle files are removed in all cases once they were built.
    """
    config, cfg_path = load_config(repo, command=command)
    state, state_path = load_state(cfg_path)
    recover_pending_upload(repo, config, state)
    recover_pending_cleanup(repo, config, state, state_path)
    fast_hash, relevant_paths = fast_fingerprint(repo, config, cfg_path)

    refresh = config.get("refresh", {})
    check_ttl = int(refresh.get("check_ttl_seconds", 300))
    min_interval = int(refresh.get("min_upload_interval_seconds", 900))
    max_staleness = int(refresh.get("max_staleness_seconds", 86400))
    checked_age = seconds_since(state.get("lastCheckedAt"))
    uploaded_age = seconds_since(state.get("lastUploadedAt"))
    uploaded_fingerprint = state_uploaded_fingerprint(state)

    result: dict[str, Any] = {
        "status": "unknown",
        "config": str(cfg_path),
        "state": str(state_path),
        "relevant_changed_paths": relevant_paths,
        "fast_fingerprint": fast_hash,
    }

    # ``force`` bypasses every skip path below.
    skippable = not force

    if skippable and uploaded_fingerprint == fast_hash:
        # Nothing relevant changed since the last upload.
        if checked_age is not None and checked_age < check_ttl:
            _touch_check_state(state, state_path, fast_hash)
            result.update({"status": "fresh-ttl", "checked_age_seconds": checked_age})
            return result
        if state.get("lastUploadedAt"):
            _touch_check_state(state, state_path, fast_hash)
            result.update({"status": "fresh-fingerprint"})
            return result

    first_upload = not active_sources(state)
    if first_upload and config.get("safety", {}).get("require_user_approval_first_upload", True) and not (yes or force):
        result.update({"status": "needs-first-upload-approval"})
        return result

    if skippable and uploaded_age is not None and uploaded_age < min(min_interval, max_staleness):
        _touch_check_state(state, state_path, fast_hash)
        result.update({"status": "stale-throttled", "uploaded_age_seconds": uploaded_age})
        return result

    if skippable and not refresh.get("auto", True):
        _touch_check_state(state, state_path, fast_hash)
        result.update({"status": "auto-refresh-disabled"})
        return result

    if bundle_mode(config) == "chunked":
        set_id = now_utc().strftime("%y%m%d%H%M")
        bundles = build_bundle_set(repo, config, set_id=set_id, state=state)
        try:
            bundle_set_sha = source_set_hash(bundles)
            if skippable and state.get("lastBundleSetSha256") == bundle_set_sha:
                _touch_check_state(state, state_path, fast_hash)
                result.update({"status": "fresh-bundle-hash", "bundleSetSha256": bundle_set_sha, "bundleDeleted": True})
                return result

            source_set = upload_bundle_set(repo, config, state, bundles, set_id=set_id)
            retired_ids = [str(sid) for sid in source_set.pop("_retiredSourceIds", []) if str(sid)]
            uploaded_fields = {
                "lastCheckedAt": iso(),
                "lastUploadedAt": iso(),
                "lastConfigSha256": sha256_file(cfg_path),
                "lastCheckedFastFingerprint": fast_hash,
                "lastUploadedFastFingerprint": fast_hash,
                "lastFastFingerprint": fast_hash,
                "lastBundleSetSha256": bundle_set_sha,
                "lastBundleSha256": bundle_set_sha,
                "lastBundlePath": None,
                "activeSourceSet": source_set,
                "sources": [src for src in source_set.get("sources", []) if isinstance(src, dict)],
            }
            state.update(uploaded_fields)
            cleanup_pending_ids = queue_cleanup_source_ids(state, retired_ids)
            write_json(state_path, state)
            clear_pending_upload(repo)
            result.update(
                {
                    "status": "uploaded",
                    "bundleSetSha256": bundle_set_sha,
                    "bundleDeleted": True,
                    "sourceSet": source_set,
                    "cleanupPendingSourceIds": cleanup_pending_ids,
                }
            )
            return result
        finally:
            for built in bundles:
                if built.get("path"):
                    remove_file_quiet(Path(str(built["path"])))

    bundle = build_bundle(repo, config)
    try:
        bundle_hash = sha256_file(bundle)
        if skippable and state.get("lastBundleSha256") == bundle_hash:
            _touch_check_state(state, state_path, fast_hash)
            result.update({"status": "fresh-bundle-hash", "bundleSha256": bundle_hash, "bundleDeleted": True})
            return result

        source = upload_bundle(repo, config, state, bundle, bundle_hash)
        pruned_ids = set(source.pop("_prunedSourceIds", []))
        sources = [src for src in state.get("sources", []) if str(src.get("id", "")) not in pruned_ids]
        if source.get("id") or source.get("title"):
            sources.append(source)
        uploaded_fields = {
            "lastCheckedAt": iso(),
            "lastUploadedAt": iso(),
            "lastConfigSha256": sha256_file(cfg_path),
            "lastCheckedFastFingerprint": fast_hash,
            "lastUploadedFastFingerprint": fast_hash,
            "lastFastFingerprint": fast_hash,
            "lastBundleSha256": bundle_hash,
            "lastBundlePath": None,
            "sources": sources,
        }
        state.update(uploaded_fields)
        write_json(state_path, state)
        result.update({"status": "uploaded", "bundleSha256": bundle_hash, "bundleDeleted": True, "source": source})
        return result
    finally:
        remove_file_quiet(bundle)
|
|
1635
|
+
|
|
1636
|
+
|
|
1637
|
+
def ask_provider(repo: Path, question: str) -> dict[str, Any]:
    """Ask the configured notebook *question* and return the provider reply as a dict.

    The query is restricted to the ids returned by ``active_ready_source_ids``.
    A non-zero exit yields ``{"error": True, "stdout": ..., "stderr": ...}``.
    Output that is not a JSON object (invalid JSON, or a JSON list/scalar) is
    wrapped as ``{"answer": <raw stdout>}`` so callers can always use ``.get``.
    """
    config, cfg_path = load_config(repo, command="ask")
    state, _ = load_state(cfg_path)
    nbid = notebook_id(config)
    argv = [*notebooklm_cmd(), "ask", question, "-n", nbid]
    for source_id in active_ready_source_ids(state):
        argv.extend(["-s", source_id])
    argv.append("--json")
    result = run(argv, repo, timeout=180)
    if result.returncode != 0:
        return {"error": True, "stdout": result.stdout, "stderr": result.stderr}
    try:
        parsed = json.loads(result.stdout)
    except json.JSONDecodeError:
        return {"answer": result.stdout}
    # json.loads accepts any JSON value; only an object honours the return type.
    if not isinstance(parsed, dict):
        return {"answer": result.stdout}
    return parsed
|
|
1652
|
+
|
|
1653
|
+
|
|
1654
|
+
# Slash-separated relative paths ending in one of the listed extensions.
# The trailing ``(?!\w)`` forces the whole extension to match; without it the
# first alternative that fits wins, so ``config.json`` was captured as
# ``config.js`` and ``App.tsx`` as ``App.ts``.
PATH_RE = re.compile(
    r"(?:(?:[\w.-]+/)+[\w.@+-]+\.(?:rs|ts|tsx|js|jsx|py|go|java|kt|md|toml|yaml|yml|json|sh|sql|css|scss|html))(?!\w)"
)
# Identifier-like words (4+ chars) or hyphen/underscore tokens (5+ chars).
TERM_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]{3,}|[A-Za-z0-9][A-Za-z0-9_-]{4,}")
# Lower-case words that are never used as search terms.
STOP_TERMS = {
    "agent",
    "authority",
    "btreemap",
    "bundle",
    "codex",
    "command",
    "docs",
    "fixture",
    "gate",
    "justfile",
    "keywords",
    "local",
    "names",
    "paths",
    "postgres",
    "postgresql",
    "real",
    "refs",
    "repo",
    "shell",
    "test",
    "trigger",
    "where",
    "which",
    "what",
    "when",
    "implemented",
    "implementation",
    "function",
    "tests",
    "files",
    "return",
    "likely",
    "line",
    "numbers",
    "source",
    "notebooklm",
}
|
|
1695
|
+
|
|
1696
|
+
|
|
1697
|
+
def answer_text(data: dict[str, Any]) -> str:
    """Return ``data["answer"]`` when it is a string, else *data* serialized as JSON."""
    answer = data.get("answer")
    return answer if isinstance(answer, str) else json.dumps(data, ensure_ascii=False)
|
|
1702
|
+
|
|
1703
|
+
|
|
1704
|
+
def active_sources_by_id(repo: Path) -> dict[str, dict[str, Any]]:
    """Map each active source's non-empty string id to its state record."""
    _, cfg_file = load_config(repo, command="ask")
    state, _ = load_state(cfg_file)
    keyed = ((str(entry.get("id") or ""), entry) for entry in active_sources(state))
    # A later entry with the same id replaces an earlier one.
    return {sid: entry for sid, entry in keyed if sid}
|
|
1713
|
+
|
|
1714
|
+
|
|
1715
|
+
def reference_path_candidates(repo: Path, source: dict[str, Any], text: str) -> list[tuple[str, int | None]]:
    """Guess which files of *source* a cited *text* refers to.

    Paths named inside *text* win: any that are listed in ``source["files"]``
    and exist under *repo* are returned (sorted, at most five, no line number).
    Otherwise short citations (4-240 characters after whitespace collapsing,
    and not a ``<directory_structure>`` dump) are searched for in the listed
    files; an exact hit carries a 1-based line number, a hit that only matches
    after whitespace collapsing carries ``None``. Files that are missing,
    larger than 2,000,000 bytes, or unreadable are skipped.
    """
    files = [str(path) for path in (source.get("files") or []) if str(path)]
    file_set = set(files)

    named = {
        path
        for path in (raw.strip("`'\".,;:()[]{}<>") for raw in PATH_RE.findall(text))
        if path in file_set and (repo / path).is_file()
    }
    if named:
        return [(path, None) for path in sorted(named)[:5]]

    snippet = " ".join(text.split())
    if len(snippet) < 4 or len(snippet) > 240 or "<directory_structure>" in text:
        return []

    matches: list[tuple[str, int | None]] = []
    for path in files:
        full = repo / path
        try:
            # stat() lives inside the try: a file vanishing or becoming
            # unreadable mid-scan must skip that file, not abort the lookup.
            if not full.is_file() or full.stat().st_size > 2_000_000:
                continue
            content = full.read_text(encoding="utf-8", errors="ignore")
        except OSError:
            continue
        line_no: int | None = None
        index = content.find(text)
        if index >= 0:
            line_no = content.count("\n", 0, index) + 1
        elif snippet not in " ".join(content.split()):
            continue
        matches.append((path, line_no))
        if len(matches) >= 5:
            break
    return matches
|
|
1750
|
+
|
|
1751
|
+
|
|
1752
|
+
def format_reference_paths(paths: list[tuple[str, int | None]]) -> str:
    """Render up to three ``path[:line]`` entries, noting how many more were omitted."""
    shown: list[str] = []
    for path, line in paths[:3]:
        shown.append(f"{path}:{line}" if line else path)
    hidden = len(paths) - 3
    tail = f", ...(+{hidden})" if hidden > 0 else ""
    return ", ".join(shown) + tail
|
|
1756
|
+
|
|
1757
|
+
|
|
1758
|
+
def print_compact_references(repo: Path, answer: dict[str, Any]) -> None:
    """Print one ``[n] path[:line], ...`` row per citation that resolves to repo files.

    Prints nothing when *answer* has no non-empty ``references`` list or when
    no citation can be resolved. Each citation number is handled only once.
    """
    references = answer.get("references")
    if not (isinstance(references, list) and references):
        return

    known_sources = active_sources_by_id(repo)
    rows: list[str] = []
    handled: set[str] = set()
    for ref in references:
        if not isinstance(ref, dict):
            continue
        number = str(ref.get("citation_number") or "").strip()
        if not number or number in handled:
            continue
        handled.add(number)
        source = known_sources.get(str(ref.get("source_id") or ""))
        if not source:
            continue
        located = reference_path_candidates(repo, source, str(ref.get("cited_text") or ""))
        if located:
            rows.append(f"[{number}] {format_reference_paths(located)}")

    if not rows:
        return
    print("\nreferences:")
    for row in rows:
        print(row)
|
|
1782
|
+
|
|
1783
|
+
|
|
1784
|
+
def extract_candidates(text: str, query: str) -> tuple[list[str], list[str]]:
    """Pull candidate paths out of *text* and search terms out of *text* plus *query*.

    Returns ``(paths, terms)``: the sorted unique ``PATH_RE`` matches from
    *text*, and the first 24 (in sorted order) unique ``TERM_RE`` matches that
    are at least four characters long, are not stop terms, and contain neither
    ``/`` nor ``.``.
    """
    found_paths = sorted({*PATH_RE.findall(text)})
    haystack = text + "\n" + query
    stripped = (raw.strip("`'\"") for raw in TERM_RE.findall(haystack))
    kept = {
        term
        for term in stripped
        if len(term) >= 4
        and term.lower() not in STOP_TERMS
        and "/" not in term
        and "." not in term
    }
    return found_paths, sorted(kept)[:24]
|
|
1795
|
+
|
|
1796
|
+
|
|
1797
|
+
def high_signal_terms(terms: list[str]) -> list[str]:
    """Keep the terms that look like code symbols, falling back to the first eight.

    A term qualifies when it is not a stop term and either contains ``_`` or
    ``-``, has an upper-case letter after its first character, or is at least
    14 characters long. If none qualifies, the first eight non-stop terms are
    returned instead.
    """

    def symbol_shaped(term: str) -> bool:
        return "_" in term or "-" in term or any(ch.isupper() for ch in term[1:])

    usable = [term for term in terms if term.lower() not in STOP_TERMS]
    shaped = [term for term in usable if symbol_shaped(term) or len(term) >= 14]
    return shaped or usable[:8]
|
|
1807
|
+
|
|
1808
|
+
|
|
1809
|
+
def rg_roots(repo: Path, config: dict[str, Any], candidate_paths: list[str]) -> list[list[str]]:
    """Return the groups of search roots to try, most specific first.

    The existing *candidate_paths* form the first group when there are any.
    The last group is always the configured include specs that exist under
    *repo*, or ``["."]`` when none of them exists.
    """
    existing_candidates = [path for path in candidate_paths if (repo / path).exists()]
    configured = [spec for spec in include_specs(config) if (repo / spec).exists()] or ["."]
    if existing_candidates:
        return [existing_candidates, configured]
    return [configured]
|
|
1819
|
+
|
|
1820
|
+
|
|
1821
|
+
def parse_rg_matches(stdout: str, seen: set[tuple[str, str, str]], remaining: int) -> list[dict[str, Any]]:
    """Parse ``path:line:text`` output lines into match dicts, at most *remaining*.

    Lines with fewer than two colons are ignored. *seen* is updated in place
    with ``(path, line, stripped text)`` keys and used to drop duplicates. The
    ``line`` value is an ``int`` when the field is all digits, else the raw string.
    """
    collected: list[dict[str, Any]] = []
    for raw_line in stdout.splitlines():
        if len(collected) >= remaining:
            break
        path, first_sep, rest = raw_line.partition(":")
        line_no, second_sep, text = rest.partition(":")
        if not (first_sep and second_sep):
            continue
        body = text.strip()
        key = (path, line_no, body)
        if key in seen:
            continue
        seen.add(key)
        line_value = int(line_no) if line_no.isdigit() else line_no
        collected.append({"path": path, "line": line_value, "text": body})
    return collected
|
|
1836
|
+
|
|
1837
|
+
|
|
1838
|
+
def local_rg(repo: Path, config: dict[str, Any], terms: list[str], candidate_paths: list[str] | None = None) -> list[dict[str, Any]]:
    """Search the repo with ripgrep for the high-signal *terms*.

    Root groups from ``rg_roots`` are searched in order until
    ``retrieval.max_local_matches`` (default 80) matches are collected.
    Returns ``[]`` when there are no terms, no usable pattern, or ``rg`` is not
    installed, and ``[{"error": <stderr>}]`` when ripgrep exits with a code
    other than 0 (matches) or 1 (no matches).
    """
    if not terms or shutil.which("rg") is None:
        return []
    signal_terms = high_signal_terms(terms)
    pattern = "|".join(re.escape(term) for term in signal_terms[:16])
    if not pattern:
        # An empty pattern matches every line; treat it as "nothing to search".
        return []
    max_matches = int(config.get("retrieval", {}).get("max_local_matches", 80))
    matches: list[dict[str, Any]] = []
    seen: set[tuple[str, str, str]] = set()
    for roots in rg_roots(repo, config, candidate_paths or []):
        remaining = max_matches - len(matches)
        if remaining <= 0:
            break
        # --with-filename: rg drops the path prefix when given a single file,
        # which would break the path:line:text parsing in parse_rg_matches.
        cmd = ["rg", "-n", "--with-filename", "--no-heading", "-S", "-e", pattern, "--", *roots]
        result = run(cmd, repo, timeout=120)
        if result.returncode not in (0, 1):
            return [{"error": result.stderr.strip()}]
        matches.extend(parse_rg_matches(result.stdout, seen, remaining))
    return matches
|
|
1856
|
+
|
|
1857
|
+
|
|
1858
|
+
def print_result(data: Any, as_json: bool) -> None:
    """Print *data* as indented JSON, or as ``key: value`` lines for a dict.

    In text mode, dict and list values are rendered as compact JSON and
    non-dict *data* is printed as-is.
    """
    if as_json:
        print(json.dumps(data, indent=2, ensure_ascii=False))
        return
    if not isinstance(data, dict):
        print(data)
        return
    for key, value in data.items():
        rendered = json.dumps(value, ensure_ascii=False) if isinstance(value, (dict, list)) else value
        print(f"{key}: {rendered}")
|
|
1870
|
+
|
|
1871
|
+
|
|
1872
|
+
def freshness_warning(freshness: dict[str, Any]) -> str | None:
    """Return a one-line warning for freshness statuses that may give stale answers.

    Covers ``stale-throttled`` (with upload age and up to five changed paths),
    ``needs-first-upload-approval`` and ``auto-refresh-disabled``; any other
    status yields ``None``.
    """
    status = str(freshness.get("status") or "")
    if status == "needs-first-upload-approval":
        return "warning: first broad upload requires approval; rerun with --yes or run refresh explicitly."
    if status == "auto-refresh-disabled":
        return "warning: auto refresh is disabled; provider answer may lag local changes."
    if status != "stale-throttled":
        return None

    pieces = ["warning: index is stale-throttled"]
    uploaded_age = freshness.get("uploaded_age_seconds")
    if uploaded_age is not None:
        pieces.append(f"; uploaded_age_seconds={uploaded_age}")
    changed = freshness.get("relevant_changed_paths") or []
    if isinstance(changed, list) and changed:
        preview = ", ".join(str(path) for path in changed[:5])
        extra = len(changed) - 5
        suffix = f", ...(+{extra})" if extra > 0 else ""
        pieces.append(f"; changed={preview}{suffix}")
    pieces.append("; provider answer may lag local changes. Use --force-refresh or refresh --force if needed.")
    return "".join(pieces)
|
|
1889
|
+
|
|
1890
|
+
|
|
1891
|
+
def provider_block_message(freshness: dict[str, Any]) -> str | None:
    """Return why the provider call must be skipped, or ``None`` when it may proceed."""
    reasons = {
        "not-initialized": "skipped; project is not initialized for project retrieval.",
        "needs-first-upload-approval": "skipped; first broad upload requires approval. Rerun ask/locate with --yes or run refresh explicitly.",
    }
    return reasons.get(str(freshness.get("status") or ""))
|
|
1898
|
+
|
|
1899
|
+
|
|
1900
|
+
def first_upload_next(repo: Path, command: str, query: str) -> dict[str, str]:
    """Build the suggested follow-up command lines for a blocked first upload."""
    approve_key = f"{command}WithFirstUploadApproval"
    approve_cmd = command_line(repo, command, "--yes", query)
    refresh_cmd = command_line(repo, "refresh", "--force")
    return {approve_key: approve_cmd, "refresh": refresh_cmd}
|
|
1905
|
+
|
|
1906
|
+
|
|
1907
|
+
def provider_block_payload(freshness: dict[str, Any], *, next_steps: dict[str, str] | None = None) -> dict[str, Any]:
    """Build the error payload shown in place of a provider answer.

    ``next`` is taken from *freshness* when present, otherwise from
    *next_steps*; the key is omitted when neither provides a value.
    """
    message = provider_block_message(freshness) or "skipped"
    payload: dict[str, Any] = {"error": True, "message": message}
    follow_up = freshness.get("next") or next_steps
    if follow_up:
        payload["next"] = follow_up
    return payload
|
|
1913
|
+
|
|
1914
|
+
|
|
1915
|
+
def print_ask_result(freshness: dict[str, Any], answer: dict[str, Any], args: argparse.Namespace) -> None:
    """Print the outcome of ``ask`` as JSON or as human-readable text.

    Text mode prints the freshness warning (if any), optional verbose details,
    the answer text and the compact reference rows.
    """
    if args.json:
        print_result({"freshness": freshness, "provider_answer": answer}, True)
        return

    repo = Path(args.repo).resolve()
    warning = freshness_warning(freshness)
    if warning:
        print(warning)

    # NOTE(review): the listing this was recovered from had its indentation
    # stripped; the provider-metadata lines are assumed to belong to the
    # verbose branch together with the freshness dump — confirm.
    if args.verbose:
        print(f"freshness: {json.dumps(freshness, ensure_ascii=False)}")
        metadata: dict[str, Any] = {}
        for key in ("conversation_id", "turn_number", "is_follow_up"):
            if key in answer:
                metadata[key] = answer[key]
        references = answer.get("references")
        if isinstance(references, list):
            metadata["references_count"] = len(references)
        if metadata:
            print(f"provider: {json.dumps(metadata, ensure_ascii=False)}")

    print(answer_text(answer))
    print_compact_references(repo, answer)
|
|
1933
|
+
|
|
1934
|
+
|
|
1935
|
+
def print_locate_result(result: dict[str, Any], args: argparse.Namespace) -> None:
    """Print the outcome of ``locate`` as JSON or as ``key: value`` text.

    In text mode the ``freshness`` entry is only shown as a warning (and in
    full with ``--verbose``); every other entry is printed via ``print_result``.
    """
    if args.json:
        print_result(result, True)
        return

    warning = freshness_warning(result.get("freshness", {}))
    if warning:
        print(warning)
    if args.verbose:
        print(f"freshness: {json.dumps(result.get('freshness', {}), ensure_ascii=False)}")

    without_freshness = dict(result)
    without_freshness.pop("freshness", None)
    print_result(without_freshness, False)
|
|
1946
|
+
|
|
1947
|
+
|
|
1948
|
+
def cmd_init(args: argparse.Namespace) -> None:
    """Create the project config and its ``.gitignore``, then print next steps.

    Refuses to overwrite an existing config unless ``--force`` is given. When
    no notebook id is supplied and reuse/create was requested, the notebook is
    looked up by title and optionally created; failing both ends with ``die``.
    """
    repo = Path(args.repo).resolve()
    cfg_dir = repo / CONFIG_DIR
    cfg = cfg_dir / CONFIG_JSON
    if cfg.exists() and not args.force:
        die(f"config already exists: {cfg}")

    project_name = args.project_name or repo.name
    title_prefix = args.notebook_title_prefix or DEFAULT_NOTEBOOK_TITLE_PREFIX
    title = args.notebook_title or default_notebook_title(project_name, title_prefix)

    notebook_id_value = args.notebook_id or ""
    resolved_notebook: dict[str, Any] | None = None
    if not notebook_id_value and (args.reuse_existing_notebook or args.create_notebook):
        resolved_notebook = find_notebook_by_title(repo, title)
        if not resolved_notebook and args.create_notebook:
            resolved_notebook = create_notebook(repo, title)
        if not resolved_notebook:
            die(f"no NotebookLM notebook found with title {title!r}; pass --create-notebook or --notebook-id")
        notebook_id_value = str(resolved_notebook.get("id") or "")

    config = default_config(
        repo,
        notebook_id_value,
        project_name=project_name,
        notebook_title_prefix=title_prefix,
        notebook_title=title,
    )
    if args.include:
        stripped_parts = (part.strip() for part in args.include.split(","))
        config["bundle"]["include"] = [part for part in stripped_parts if part]
    if args.source_title_prefix:
        config["notebooklm"]["source_title_prefix"] = args.source_title_prefix

    write_json(cfg, config)
    gitignore = cfg_dir / ".gitignore"
    gitignore.write_text("state.local.json\npending-upload.local.json\ncache/\n*.lock\n")

    print(f"created: {cfg}")
    print(f"created: {gitignore}")
    print(f"notebook_title: {title}")
    if resolved_notebook:
        print(f"notebook_id: {notebook_id_value}")

    print("next:")
    if notebook_id_value:
        print(f" {command_line(repo, 'ensure', '--yes')}")
        print(f" {command_line(repo, 'ask', 'your question')}")
    else:
        print(" set notebooklm.notebook_id in the config, or rerun init with --create-notebook / --reuse-existing-notebook / --notebook-id")
|
|
1991
|
+
|
|
1992
|
+
|
|
1993
|
+
def cmd_status(args: argparse.Namespace) -> None:
    """Print the config, state and fingerprint summary for the repo.

    Prints the uninitialized status payload instead when no config file exists.
    """
    repo = Path(args.repo).resolve()
    cfg_candidate = config_path(repo)
    if not cfg_candidate.exists():
        print_result(uninitialized_status(repo, cfg_candidate), args.json)
        return

    config, cfg_path = load_config(repo, command="status")
    state, state_path = load_state(cfg_path)
    fast_hash, changed = fast_fingerprint(repo, config, cfg_path)

    report: dict[str, Any] = {
        "initialized": True,
        "config": str(cfg_path),
        "state": str(state_path),
        "provider": config.get("provider"),
        "projectName": config.get("project", {}).get("name"),
        "notebook_id": config.get("notebooklm", {}).get("notebook_id"),
        "notebookTitle": notebook_title(config),
        "sourceTitlePrefix": config.get("notebooklm", {}).get("source_title_prefix"),
    }
    # Values copied straight from the state file under the same key.
    for key in ("lastCheckedAt", "lastUploadedAt", "lastBundleSha256"):
        report[key] = state.get(key)
    report["fastFingerprint"] = fast_hash
    report["stateCheckedFastFingerprint"] = state.get("lastCheckedFastFingerprint")
    report["stateUploadedFastFingerprint"] = state_uploaded_fingerprint(state)
    report["stateFastFingerprint"] = state.get("lastFastFingerprint")
    report["relevantChangedPaths"] = changed
    report["sources"] = state.get("sources", [])
    print_result(report, args.json)
|
|
2022
|
+
|
|
2023
|
+
|
|
2024
|
+
def cmd_pack(args: argparse.Namespace) -> None:
    """Plan the bundle chunks and, unless ``--dry-run``, build them and print a summary.

    The dry run prints the planned chunks (with file lists when
    ``--include-files`` is set); otherwise the built bundles are listed with
    their paths and hashes.
    """
    repo = Path(args.repo).resolve()
    config, cfg_path = load_config(repo, command="pack")
    state, _ = load_state(cfg_path)
    set_id = args.set_id or now_utc().strftime("%y%m%d%H%M")
    chunks = plan_bundle_chunks(repo, config, set_id=set_id, state=state)

    if args.dry_run:
        planned: list[dict[str, Any]] = []
        for chunk in chunks:
            entry: dict[str, Any] = {
                "group": chunk.get("group"),
                "chunk": chunk.get("chunk"),
                "title": chunk.get("title"),
                "estimatedBytes": chunk.get("estimatedBytes"),
                "fileCount": len(chunk.get("files", [])),
            }
            if args.include_files:
                entry["files"] = chunk.get("files", [])
            planned.append(entry)
        plan_summary = {
            "setId": set_id,
            "mode": "chunked",
            "chunkCount": len(chunks),
            "chunks": planned,
        }
        print_result(plan_summary, args.json)
        return

    bundles = build_bundle_set(repo, config, set_id=set_id, state=state)
    reported_keys = ("group", "chunk", "title", "path", "fileCount", "bundleSha256", "contentSha256")
    build_summary = {
        "setId": set_id,
        "bundleCount": len(bundles),
        "bundles": [{key: bundle.get(key) for key in reported_keys} for bundle in bundles],
    }
    print_result(build_summary, args.json)
|
|
2071
|
+
|
|
2072
|
+
|
|
2073
|
+
def cmd_ensure(args: argparse.Namespace) -> None:
    """Run the freshness check with the CLI's ``--force``/``--yes`` flags and print it."""
    repo = Path(args.repo).resolve()
    outcome = ensure_index(
        repo,
        force=args.force,
        yes=args.yes,
        json_output=args.json,
        command="ensure",
    )
    print_result(outcome, args.json)
|
|
2076
|
+
|
|
2077
|
+
|
|
2078
|
+
def cmd_refresh(args: argparse.Namespace) -> None:
    """Run the index check with ``force`` and ``yes`` both enabled and print the result."""
    repo = Path(args.repo).resolve()
    outcome = ensure_index(
        repo,
        force=True,
        yes=True,
        json_output=args.json,
        command="refresh",
    )
    print_result(outcome, args.json)
|
|
2081
|
+
|
|
2082
|
+
|
|
2083
|
+
def cmd_ask(args: argparse.Namespace) -> None:
    """Run the index check, then ask the provider ``args.question`` and print the answer.

    When the preflight is blocked (project not initialized, or first upload
    not approved) a block payload is printed instead and the provider is not
    called.
    """
    repo = Path(args.repo).resolve()
    freshness = ensure_index(
        repo,
        force=args.force_refresh,
        yes=args.yes,
        json_output=args.json,
        command="ask",
        return_uninitialized=True,
    )

    if not provider_block_message(freshness):
        print_ask_result(freshness, ask_provider(repo, args.question), args)
        return

    needs_approval = freshness.get("status") == "needs-first-upload-approval"
    next_steps = first_upload_next(repo, "ask", args.question) if needs_approval else None
    print_ask_result(freshness, provider_block_payload(freshness, next_steps=next_steps), args)
|
|
2102
|
+
|
|
2103
|
+
|
|
2104
|
+
def cmd_locate(args: argparse.Namespace) -> None:
    """Ask the provider where ``args.query`` lives, then confirm with local ``rg``.

    Paths and terms are extracted from the provider answer, and line
    references come only from the local search. When the preflight is blocked
    an empty result with the block reason is printed and the provider is not
    called.
    """
    repo = Path(args.repo).resolve()
    freshness = ensure_index(
        repo,
        force=args.force_refresh,
        yes=args.yes,
        json_output=args.json,
        command="locate",
        return_uninitialized=True,
    )

    blocked = provider_block_message(freshness)
    if blocked:
        next_steps = freshness.get("next")
        if not next_steps and freshness.get("status") == "needs-first-upload-approval":
            next_steps = first_upload_next(repo, "locate", args.query)
        blocked_result: dict[str, Any] = {
            "freshness": freshness,
            "notebooklm_candidates": {"paths": [], "existing_paths": [], "terms": []},
            "local_line_refs": [],
            "provider_misses_or_stale_paths": [],
            "provider_answer": f"({blocked})",
            "claim_boundary": "Semantic provider was not called because retrieval preflight is blocked.",
        }
        if next_steps:
            blocked_result["next"] = next_steps
        print_locate_result(blocked_result, args)
        return

    prompt = (
        "Find the code location for this repository question. Return likely repo paths, "
        "function names, test names, command names, and keywords for rg. If exact line "
        f"numbers are unavailable, say so. Question: {args.query}"
    )
    provider = ask_provider(repo, prompt)
    paths, terms = extract_candidates(answer_text(provider), args.query)
    config, _ = load_config(repo, command="locate")

    # Split the provider's paths into ones that exist locally and ones that do not.
    existing_paths: list[str] = []
    stale_paths: list[str] = []
    for path in paths:
        bucket = existing_paths if (repo / path).exists() else stale_paths
        bucket.append(path)

    matches = local_rg(repo, config, terms, existing_paths)
    shown_answer = provider if args.include_provider_answer else "(hidden; pass --include-provider-answer)"
    located: dict[str, Any] = {
        "freshness": freshness,
        "notebooklm_candidates": {"paths": paths, "existing_paths": existing_paths, "terms": terms},
        "local_line_refs": matches,
        "provider_misses_or_stale_paths": stale_paths,
        "provider_answer": shown_answer,
        "claim_boundary": "Line refs come from local rg results, not NotebookLM.",
    }
    print_locate_result(located, args)
|
|
2152
|
+
|
|
2153
|
+
|
|
2154
|
+
def temp_source_sets(state: dict[str, Any]) -> list[dict[str, Any]]:
    """Return the dict entries of ``state["temporarySourceSets"]``, or ``[]`` if it is not a list."""
    recorded = state.get("temporarySourceSets")
    if not isinstance(recorded, list):
        return []
    return [entry for entry in recorded if isinstance(entry, dict)]
|
|
2159
|
+
|
|
2160
|
+
|
|
2161
|
+
def temp_source_expires_at(ttl_seconds: int) -> str | None:
    """Return the expiry timestamp *ttl_seconds* from now, or ``None`` for a non-positive TTL."""
    if ttl_seconds > 0:
        expiry = now_utc() + dt.timedelta(seconds=ttl_seconds)
        return iso(expiry)
    return None
|
|
2165
|
+
|
|
2166
|
+
|
|
2167
|
+
def source_is_expired(source_set: dict[str, Any]) -> bool:
    """Return True when ``source_set["expiresAt"]`` parses to a time at or before now.

    A missing, empty or unparseable ``expiresAt`` counts as not expired.
    """
    expires_at = source_set.get("expiresAt")
    if not expires_at:
        return False
    parsed = parse_iso(str(expires_at))
    return bool(parsed and parsed <= now_utc())
|
|
2171
|
+
|
|
2172
|
+
|
|
2173
|
+
def cmd_temp_source_upload(args: argparse.Namespace) -> None:
    """Upload a single file as a temporary source and record it in local state.

    Reads ``repo``, ``file``, ``kind``, ``title``, ``origin_chunk``,
    ``origin_file``, ``ttl_seconds`` and ``json`` from *args*. A relative
    ``file`` is resolved against the repository root. The file is staged via
    ``stage_temp_source_file`` and uploaded while holding ``repo_lock``; on
    success a new entry is appended to ``temporarySourceSets`` in the state
    file and the resulting source set is printed.

    Calls ``die`` when the file does not exist, or when waiting for the
    uploaded source reports that it is not ready (the uploaded source is
    deleted first in that case).
    """
    repo = Path(args.repo).resolve()
    config, cfg_path = load_config(repo, command="temp-source upload")
    # NOTE(review): this pre-lock load is superseded by the reload under the
    # lock below; it is kept in case load_state performs validation.
    state, state_path = load_state(cfg_path)

    source_path = Path(args.file).expanduser()
    if not source_path.is_absolute():
        source_path = (repo / source_path).resolve()
    if not source_path.is_file():
        die(f"temp source file not found: {source_path}")

    set_id = now_utc().strftime("%y%m%d%H%M")
    content_sha = sha256_file(source_path)
    title = temp_source_title(config, set_id=set_id, kind=args.kind, title=args.title, content_sha=content_sha)
    staged_path = stage_temp_source_file(repo, title, source_path)

    # The cleanup guard wraps the lock as well, so the staged copy is removed
    # even when the lock itself cannot be acquired.
    try:
        with repo_lock(repo):
            # Reload under the lock so the write below starts from current state.
            state, state_path = load_state(cfg_path)
            source = upload_file_source(repo, config, staged_path, title)
            status = "uploaded"
            if config.get("notebooklm", {}).get("wait_after_upload", True) and source.get("id"):
                status = "ready" if wait_source_ready(repo, notebook_id(config), str(source["id"])) else "error"
                if status != "ready":
                    # Do not leave a source behind that failed processing.
                    delete_source_ids_parallel(
                        repo,
                        notebook_id(config),
                        [str(source.get("id") or "")],
                        parallelism=positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4),
                    )
                    die(f"source processing failed for temp source {title}: {source.get('id')}")
            active = state.get("activeSourceSet") if isinstance(state.get("activeSourceSet"), dict) else {}
            item = {
                "id": source.get("id"),
                "title": source.get("title") or title,
                "contentSha256": content_sha,
                "uploadedAt": iso(),
                "status": status,
                "origin": {
                    "activeSourceSetId": active.get("id"),
                    "chunkKeys": list(args.origin_chunk or []),
                    "filePaths": list(args.origin_file or []),
                },
            }
            source_set = {
                "id": set_id,
                "kind": slugify(args.kind),
                "purpose": args.title,
                "createdAt": iso(),
                "expiresAt": temp_source_expires_at(int(args.ttl_seconds or 0)),
                "sources": [item],
            }
            sets = temp_source_sets(state)
            sets.append(source_set)
            state["temporarySourceSets"] = sets
            write_json(state_path, state)
    finally:
        remove_file_quiet(staged_path)
    print_result({"sourceSet": source_set, "source": item}, args.json)
|
|
2229
|
+
|
|
2230
|
+
|
|
2231
|
+
def cmd_temp_source_list(args: argparse.Namespace) -> None:
    """Print the recorded temporary source sets and untracked provider sources.

    The recorded sets come from the local state file and are narrowed to the
    slugified ``args.kind`` when one is given. Provider sources whose title
    starts with the temp-source prefix followed by ``--`` but whose id is not
    recorded in any local set are reported under ``untrackedPrefixMatches``.
    """
    repo = Path(args.repo).resolve()
    config, cfg_path = load_config(repo, command="temp-source list")
    state, _ = load_state(cfg_path)

    all_sets = temp_source_sets(state)
    shown = all_sets
    if args.kind:
        kind_slug = slugify(args.kind)
        shown = [entry for entry in all_sets if str(entry.get("kind") or "") == kind_slug]

    title_marker = temp_source_prefix(config) + "--"
    remote_matches = []
    for remote in list_sources(repo, notebook_id(config)):
        if str(remote.get("title") or "").startswith(title_marker):
            remote_matches.append(remote)

    # Ids are collected from every recorded set, not only the kind-filtered view.
    known_ids = set()
    for entry in all_sets:
        for record in entry.get("sources", []):
            if isinstance(record, dict) and record.get("id"):
                known_ids.add(str(record.get("id")))

    orphans = [remote for remote in remote_matches if str(remote.get("id") or "") not in known_ids]
    print_result({"temporarySourceSets": shown, "untrackedPrefixMatches": orphans}, args.json)
|
|
2249
|
+
|
|
2250
|
+
|
|
2251
|
+
def cmd_temp_source_cleanup(args: argparse.Namespace) -> None:
    """Delete temporary sources from the provider and prune them from local state.

    A recorded set is selected when it passes every filter that was supplied:
    ``args.set_id`` (exact id), ``args.kind`` (slugified) and ``args.expired``
    (``source_is_expired``). With no filters, every recorded set is selected.
    Sources that were successfully deleted are removed from state; a selected
    set that still has undeleted sources is kept with only those sources.
    With ``args.include_untracked_prefix``, provider sources that carry the
    temp-source title prefix but are not recorded locally are deleted as well.

    Calls ``die`` when ``args.yes`` is not set.
    """
    repo = Path(args.repo).resolve()
    config, cfg_path = load_config(repo, command="temp-source cleanup")
    # Refuse before taking the lock or reading state; nothing below is useful
    # without confirmation.
    if not args.yes:
        die("cleanup requires --yes")
    parallelism = positive_int(config.get("notebooklm", {}).get("delete_parallelism"), 4)

    with repo_lock(repo):
        state, state_path = load_state(cfg_path)
        wanted_kind = slugify(args.kind) if args.kind else ""
        selected: list[dict[str, Any]] = []
        kept: list[dict[str, Any]] = []
        for source_set in temp_source_sets(state):
            matches = True
            if args.set_id and str(source_set.get("id") or "") != str(args.set_id):
                matches = False
            if wanted_kind and str(source_set.get("kind") or "") != wanted_kind:
                matches = False
            if args.expired and not source_is_expired(source_set):
                matches = False
            (selected if matches else kept).append(source_set)

        source_ids = [
            str(src.get("id"))
            for source_set in selected
            for src in source_set.get("sources", [])
            if isinstance(src, dict) and src.get("id")
        ]
        deleted = delete_source_ids_parallel(
            repo,
            notebook_id(config),
            source_ids,
            parallelism=parallelism,
        )
        # Snapshot of the tracked deletions; reused for the untracked filter below.
        deleted_set = set(deleted)

        # Keep selected sets that still have sources which were not deleted.
        remaining_selected: list[dict[str, Any]] = []
        for source_set in selected:
            sources = [
                src
                for src in source_set.get("sources", [])
                if isinstance(src, dict) and str(src.get("id") or "") not in deleted_set
            ]
            if sources:
                item = dict(source_set)
                item["sources"] = sources
                remaining_selected.append(item)
        state["temporarySourceSets"] = kept + remaining_selected
        write_json(state_path, state)

        prefix = temp_source_prefix(config)
        provider_matches = [
            src
            for src in list_sources(repo, notebook_id(config))
            if str(src.get("title") or "").startswith(prefix + "--")
        ]
        tracked_ids = {
            str(src.get("id"))
            for source_set in temp_source_sets(state)
            for src in source_set.get("sources", [])
            if isinstance(src, dict) and src.get("id")
        }
        untracked = [
            src
            for src in provider_matches
            if str(src.get("id") or "") not in tracked_ids and str(src.get("id") or "") not in deleted_set
        ]
        if args.include_untracked_prefix:
            extra_ids = [str(src.get("id")) for src in untracked if src.get("id")]
            extra_deleted = delete_source_ids_parallel(
                repo,
                notebook_id(config),
                extra_ids,
                parallelism=parallelism,
            )
            deleted.extend(extra_deleted)
            # Build the lookup once instead of once per candidate.
            extra_deleted_set = set(extra_deleted)
            untracked = [src for src in untracked if str(src.get("id") or "") not in extra_deleted_set]
    print_result({"deletedSourceIds": deleted, "untrackedPrefixMatches": untracked}, args.json)
|
|
2325
|
+
|
|
2326
|
+
|
|
2327
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the ``memdex`` command-line parser.

    Registers the subcommands ``init``, ``status``, ``pack``, ``ensure``,
    ``refresh``, ``ask``, ``locate`` and ``temp-source`` (with its own
    ``upload``/``list``/``cleanup`` subcommands). Each leaf parser stores its
    handler under ``func`` via ``set_defaults``.
    """

    def add_flags(target: argparse.ArgumentParser, *names: str) -> None:
        # Register boolean switches in the order given.
        for name in names:
            target.add_argument(name, action="store_true")

    def add_text_options(target: argparse.ArgumentParser, *names: str) -> None:
        # Register string options that default to the empty string.
        for name in names:
            target.add_argument(name, default="")

    root = argparse.ArgumentParser(prog="memdex")
    commands = root.add_subparsers(dest="command", required=True)

    # init
    init_cmd = commands.add_parser("init")
    init_cmd.add_argument("--repo", default=".")
    add_text_options(init_cmd, "--notebook-id", "--project-name")
    init_cmd.add_argument("--notebook-title-prefix", default=DEFAULT_NOTEBOOK_TITLE_PREFIX)
    add_text_options(init_cmd, "--notebook-title")
    add_flags(init_cmd, "--reuse-existing-notebook", "--create-notebook")
    add_text_options(init_cmd, "--source-title-prefix", "--include")
    add_flags(init_cmd, "--force")
    init_cmd.set_defaults(func=cmd_init)

    # status
    status_cmd = commands.add_parser("status")
    status_cmd.add_argument("--repo", default=".")
    add_flags(status_cmd, "--json")
    status_cmd.set_defaults(func=cmd_status)

    # pack
    pack_cmd = commands.add_parser("pack")
    pack_cmd.add_argument("--repo", default=".")
    add_text_options(pack_cmd, "--set-id")
    add_flags(pack_cmd, "--dry-run", "--include-files", "--json")
    pack_cmd.set_defaults(func=cmd_pack)

    # ensure
    ensure_cmd = commands.add_parser("ensure")
    ensure_cmd.add_argument("--repo", default=".")
    add_flags(ensure_cmd, "--force", "--yes", "--json")
    ensure_cmd.set_defaults(func=cmd_ensure)

    # refresh
    refresh_cmd = commands.add_parser("refresh")
    refresh_cmd.add_argument("--repo", default=".")
    add_flags(refresh_cmd, "--force", "--json")
    refresh_cmd.set_defaults(func=cmd_refresh)

    # ask
    ask_cmd = commands.add_parser("ask")
    ask_cmd.add_argument("question")
    ask_cmd.add_argument("--repo", default=".")
    add_flags(ask_cmd, "--yes", "--force-refresh", "--json", "--verbose")
    ask_cmd.set_defaults(func=cmd_ask)

    # locate
    locate_cmd = commands.add_parser("locate")
    locate_cmd.add_argument("query")
    locate_cmd.add_argument("--repo", default=".")
    add_flags(
        locate_cmd,
        "--yes",
        "--force-refresh",
        "--include-provider-answer",
        "--json",
        "--verbose",
    )
    locate_cmd.set_defaults(func=cmd_locate)

    # temp-source and its nested subcommands
    temp_cmd = commands.add_parser("temp-source")
    temp_commands = temp_cmd.add_subparsers(dest="temp_command", required=True)

    upload_cmd = temp_commands.add_parser("upload")
    upload_cmd.add_argument("--repo", default=".")
    for required_option in ("--kind", "--title", "--file"):
        upload_cmd.add_argument(required_option, required=True)
    for repeatable_option in ("--origin-chunk", "--origin-file"):
        upload_cmd.add_argument(repeatable_option, action="append", default=[])
    upload_cmd.add_argument("--ttl-seconds", type=int, default=0)
    add_flags(upload_cmd, "--json")
    upload_cmd.set_defaults(func=cmd_temp_source_upload)

    list_cmd = temp_commands.add_parser("list")
    list_cmd.add_argument("--repo", default=".")
    add_text_options(list_cmd, "--kind")
    add_flags(list_cmd, "--json")
    list_cmd.set_defaults(func=cmd_temp_source_list)

    cleanup_cmd = temp_commands.add_parser("cleanup")
    cleanup_cmd.add_argument("--repo", default=".")
    add_text_options(cleanup_cmd, "--kind", "--set-id")
    add_flags(cleanup_cmd, "--expired", "--include-untracked-prefix", "--yes", "--json")
    cleanup_cmd.set_defaults(func=cmd_temp_source_cleanup)

    return root
|
|
2419
|
+
|
|
2420
|
+
|
|
2421
|
+
def main() -> None:
    """Parse the command line and run the handler of the chosen subcommand."""
    namespace = build_parser().parse_args()
    handler = namespace.func
    handler(namespace)
|
|
2424
|
+
|
|
2425
|
+
|
|
2426
|
+
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|