nogic-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nogic/__init__.py +3 -0
- nogic/api/__init__.py +23 -0
- nogic/api/client.py +390 -0
- nogic/commands/__init__.py +1 -0
- nogic/commands/init.py +125 -0
- nogic/commands/login.py +75 -0
- nogic/commands/projects.py +138 -0
- nogic/commands/reindex.py +117 -0
- nogic/commands/status.py +165 -0
- nogic/commands/sync.py +72 -0
- nogic/commands/telemetry_cmd.py +65 -0
- nogic/commands/watch.py +167 -0
- nogic/config.py +157 -0
- nogic/ignore.py +109 -0
- nogic/main.py +58 -0
- nogic/parsing/__init__.py +22 -0
- nogic/parsing/js_extractor.py +674 -0
- nogic/parsing/parser.py +220 -0
- nogic/parsing/python_extractor.py +484 -0
- nogic/parsing/types.py +80 -0
- nogic/storage/__init__.py +14 -0
- nogic/storage/relationships.py +322 -0
- nogic/storage/schema.py +154 -0
- nogic/storage/symbols.py +203 -0
- nogic/telemetry.py +142 -0
- nogic/ui.py +60 -0
- nogic/watcher/__init__.py +7 -0
- nogic/watcher/monitor.py +80 -0
- nogic/watcher/storage.py +185 -0
- nogic/watcher/sync.py +879 -0
- nogic-0.0.1.dist-info/METADATA +201 -0
- nogic-0.0.1.dist-info/RECORD +35 -0
- nogic-0.0.1.dist-info/WHEEL +4 -0
- nogic-0.0.1.dist-info/entry_points.txt +2 -0
- nogic-0.0.1.dist-info/licenses/LICENSE +21 -0
nogic/watcher/sync.py
ADDED
@@ -0,0 +1,879 @@
"""Sync service for file changes - sends to backend API."""

import asyncio
import hashlib
import json as _json
import logging
import os
import sys
import time
from pathlib import Path
from typing import Callable, Optional

import httpx
from rich.progress import (
    Progress,
    SpinnerColumn,
    TextColumn,
    BarColumn,
    TaskProgressColumn,
    TimeElapsedColumn,
    MofNCompleteColumn,
)
from rich.console import Console

from nogic.config import Config, get_language
from nogic.api import NogicClient, IndexProgress

logger = logging.getLogger(__name__)

def _int_env(name: str, default: int) -> int:
    """Read an integer from an environment variable, falling back to default on bad values."""
    raw = os.environ.get(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        return default

# Batch size for uploading files (configurable via NOGIC_BATCH_SIZE)
BATCH_SIZE = _int_env("NOGIC_BATCH_SIZE", 150)

# Max payload size per batch in bytes (~10MB default, well under GCP LB 32MB limit)
MAX_BATCH_BYTES = _int_env("NOGIC_MAX_BATCH_BYTES", 10 * 1024 * 1024)

# Hard cap per file in bytes (~30MB) - must fit in a single request under GCP LB 32MB limit
MAX_SINGLE_FILE_BYTES = _int_env("NOGIC_MAX_FILE_SIZE", 30 * 1024 * 1024)

# Max concurrent uploads (configurable via NOGIC_MAX_CONCURRENT)
MAX_CONCURRENT = _int_env("NOGIC_MAX_CONCURRENT", 8)

STAGE_LABELS = {
    "hashing": "Computing file hashes",
    "parsing": "Parsing files",
    "imports": "Resolving imports",
    "nodes": "Creating symbol nodes",
    "call_graph": "Building call graph",
    "edges": "Linking relationships",
    "embedding": "Generating embeddings",
    "saving": "Saving to database",
    "complete": "Complete",
}

# Check if legacy mode is forced via environment variable
LEGACY_UPLOAD = os.environ.get("NOGIC_LEGACY_UPLOAD", "").lower() in ("1", "true", "yes")


class SyncService:
    """Sync service that indexes files via the backend API.

    Uses backend as source of truth - no local sync state tracking.
    """

    def __init__(
        self,
        config: Config,
        root_path: Path,
        log: Optional[Callable[[str], None]] = None,
        json_mode: bool = False,
    ):
        self.config = config
        self.root_path = root_path
        self.log = log or print
        self.json_mode = json_mode
        self._client: Optional[NogicClient] = None
        # In-memory cache of file info for current session
        self._file_cache: dict[str, dict] = {}  # path -> {hash, content, language}

    def _emit_json(self, event: str, **kwargs):
        """Emit a single NDJSON progress line to stdout (json_mode only)."""
        payload = {"event": event, "timestamp": int(time.time()), **kwargs}
        sys.stdout.write(_json.dumps(payload) + "\n")
        sys.stdout.flush()

    @property
    def client(self) -> NogicClient:
        if self._client is None:
            self._client = NogicClient(self.config)
        return self._client

    def close(self):
        if self._client:
            self._client.close()
            self._client = None

    @staticmethod
    def compute_hash(content: str) -> str:
        """Compute SHA256 hash of content."""
        return hashlib.sha256(content.encode()).hexdigest()

    @staticmethod
    def _split_into_batches(files: list[dict]) -> list[list[dict]]:
        """Split files into batches respecting both count and payload size limits.

        Files larger than MAX_BATCH_BYTES get their own solo batch so they
        don't inflate a shared batch beyond the limit.
        """
        batches = []
        current_batch = []
        current_size = 0

        for f in files:
            file_size = len(f.get("content", "").encode("utf-8"))

            # If this single file exceeds the batch byte limit, send it solo
            if file_size > MAX_BATCH_BYTES:
                if current_batch:
                    batches.append(current_batch)
                    current_batch = []
                    current_size = 0
                batches.append([f])
                continue

            # Start new batch if adding this file exceeds limits
            if current_batch and (
                len(current_batch) >= BATCH_SIZE
                or current_size + file_size > MAX_BATCH_BYTES
            ):
                batches.append(current_batch)
                current_batch = []
                current_size = 0

            current_batch.append(f)
            current_size += file_size

        if current_batch:
            batches.append(current_batch)

        return batches

    def scan_files(
        self,
        root_path: Path,
        ignore_check: Callable[[Path], bool]
    ) -> list[dict]:
        """
        Scan directory and collect file info.

        Returns list of {path, hash, content, language} for supported files.
        """
        # Collect files using os.walk for directory pruning (skips ignored dirs entirely)
        if not self.json_mode:
            console = Console()
            ctx = console.status("[bold blue]Finding files...", spinner="dots")
            ctx.__enter__()

        all_files = []
        for dirpath, dirnames, filenames in os.walk(root_path):
            dp = Path(dirpath)
            dirnames[:] = [
                d for d in dirnames
                if not ignore_check(dp / d)
            ]
            for fname in filenames:
                fpath = dp / fname
                if not ignore_check(fpath):
                    all_files.append(fpath)

        if not self.json_mode:
            ctx.__exit__(None, None, None)

        files_info = []
        supported_count = 0
        skipped_large = 0
        total_files = len(all_files)

        if self.json_mode:
            # Emit progress via NDJSON
            for i, path in enumerate(all_files):
                if i % 50 == 0 or i == total_files - 1:
                    self._emit_json("progress", phase="scanning", current=i + 1, total=total_files)

                language = get_language(path)
                if not language:
                    continue

                try:
                    file_size = path.stat().st_size
                except OSError:
                    continue

                if file_size > MAX_SINGLE_FILE_BYTES:
                    skipped_large += 1
                    continue

                try:
                    content = path.read_text(encoding="utf-8")
                except (OSError, UnicodeDecodeError):
                    continue

                rel_path = str(path.relative_to(root_path))
                content_hash = self.compute_hash(content)

                files_info.append({
                    "path": rel_path,
                    "hash": content_hash,
                    "content": content,
                    "language": language,
                })
                supported_count += 1
        else:
            # Rich progress bar
            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                MofNCompleteColumn(),
                TimeElapsedColumn(),
                transient=False,
            ) as progress:
                task = progress.add_task("Scanning files", total=total_files)

                for path in all_files:
                    progress.update(task, advance=1)

                    language = get_language(path)
                    if not language:
                        continue

                    try:
                        file_size = path.stat().st_size
                    except OSError:
                        continue

                    if file_size > MAX_SINGLE_FILE_BYTES:
                        skipped_large += 1
                        continue

                    try:
                        content = path.read_text(encoding="utf-8")
                    except (OSError, UnicodeDecodeError):
                        continue

                    rel_path = str(path.relative_to(root_path))
                    content_hash = self.compute_hash(content)

                    files_info.append({
                        "path": rel_path,
                        "hash": content_hash,
                        "content": content,
                        "language": language,
                    })
                    supported_count += 1

        msg = f"Found {supported_count} supported files ({len(all_files)} total)"
        if skipped_large > 0:
            msg += f", {skipped_large} skipped (>{MAX_SINGLE_FILE_BYTES // (1024 * 1024)}MB)"
        self.log(msg)
        return files_info

    def sync_files(self, files_info: list[dict]) -> bool:
        """
        Sync files to backend.

        Uses two-phase parallel indexing if server supports it:
        1. Upload all batches in parallel to /v1/index/upload
        2. Call /v1/index/finalize/stream once for global resolution

        Falls back to legacy sequential flow if server doesn't support it.
        """
        if not files_info:
            self.log("No supported files to sync")
            return True

        if not self.config.project_id:
            self.log("Error: No project configured.")
            return False

        if not self.config.api_key:
            self.log("Error: No API key configured.")
            return False

        # Check if we should use legacy mode
        if LEGACY_UPLOAD:
            self.log("Using legacy upload mode (NOGIC_LEGACY_UPLOAD=1)")
            return self._sync_files_legacy(files_info)

        # Try two-phase parallel flow
        try:
            # Check if server supports new endpoints
            stats = self.client.get_staging_stats(self.config.project_id)
            if stats is None:
                # Server returned 404 - doesn't support two-phase indexing
                self.log("Server doesn't support two-phase indexing, using legacy mode")
                return self._sync_files_legacy(files_info)

            # Use new parallel flow
            return self._sync_files_parallel(files_info)
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 404:
                self.log("Server doesn't support two-phase indexing, using legacy mode")
                return self._sync_files_legacy(files_info)
            raise
        except Exception as e:
            logger.warning(f"Two-phase check failed ({e}), falling back to legacy mode")
            return self._sync_files_legacy(files_info)

    def _sync_files_legacy(self, files_info: list[dict]) -> bool:
        """Legacy sequential upload (old flow)."""
        return self._upload_files_batched(files_info)

    def _sync_files_parallel(self, files_info: list[dict]) -> bool:
        """
        Two-phase parallel upload flow.

        Phase 1: Upload all batches in parallel
        Phase 2: Finalize with streaming progress
        """
        # Prepare files for upload
        upload_files = [
            {
                "path": f["path"],
                "content": f["content"],
                "language": f["language"],
                "hash": f["hash"],
            }
            for f in files_info
        ]

        # Split into batches respecting both count and size limits
        batches = self._split_into_batches(upload_files)

        total_files = len(upload_files)
        num_batches = len(batches)

        # Clear any stale staging data from previous sessions
        try:
            self.client.clear_staging(self.config.project_id)
        except Exception as e:
            logger.debug(f"Could not clear staging (may not exist): {e}")

        # Phase 1: Parallel upload
        self.log(f"Phase 1: Uploading {total_files} files in {num_batches} batches...")

        if self.json_mode:
            self._emit_json("progress", phase="uploading", current=0, total=num_batches)

            try:
                result = asyncio.run(
                    self._upload_batches_parallel_json(batches, num_batches)
                )
            except Exception as e:
                self._emit_json("error", message=f"Parallel upload failed: {e}")
                try:
                    self.client.clear_staging(self.config.project_id)
                except Exception:
                    pass
                self.log("Falling back to legacy mode...")
                return self._sync_files_legacy(files_info)
        else:
            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                MofNCompleteColumn(),
                TimeElapsedColumn(),
                transient=False,
            ) as progress:
                upload_task = progress.add_task(
                    "Uploading batches",
                    total=num_batches
                )

                try:
                    result = asyncio.run(
                        self._upload_batches_parallel(batches, progress, upload_task)
                    )
                except Exception as e:
                    self.log(f"Parallel upload failed: {e}")
                    try:
                        self.client.clear_staging(self.config.project_id)
                    except Exception:
                        pass
                    self.log("Falling back to legacy mode...")
                    return self._sync_files_legacy(files_info)

        total_uploaded = result['total_parsed']
        total_staged = result['total_staged']
        total_skipped = result.get('total_skipped', 0)

        if total_skipped == 0 and total_staged < total_uploaded:
            total_skipped = total_uploaded - total_staged

        self.log(f"Uploaded {total_uploaded} files ({total_staged} staged, {total_skipped} unchanged)")

        if total_staged == 0:
            self.log("All files unchanged, nothing to index")
            if self.json_mode:
                self._emit_json("complete", files_indexed=0, nodes_created=0, edges_created=0)
            return True

        # Phase 2: Finalize with streaming progress
        self.log("Phase 2: Processing...")

        total_indexed = 0
        total_skipped = 0
        total_nodes = 0
        total_edges = 0
        all_errors = []

        if self.json_mode:
            try:
                for event in self.client.finalize_stream(self.config.project_id):
                    if event.stage == "complete":
                        total_indexed = event.files_indexed or 0
                        total_skipped = event.files_skipped or 0
                        total_nodes = event.nodes_created or 0
                        total_edges = event.edges_created or 0
                        if event.errors:
                            all_errors.extend(event.errors)
                        self._emit_json(
                            "complete",
                            files_indexed=total_indexed,
                            files_skipped=total_skipped,
                            nodes_created=total_nodes,
                            edges_created=total_edges,
                            errors=len(all_errors),
                        )
                        break

                    if event.stage == "error":
                        self._emit_json("error", message=event.message or "Processing error")
                        break

                    stage = event.stage
                    current = event.current or 0
                    total = event.total or 0
                    label = STAGE_LABELS.get(stage, stage)
                    self._emit_json("progress", phase="processing", stage=stage, label=label, current=current, total=total)

            except httpx.HTTPStatusError as e:
                self._emit_json("error", message=f"Finalize error: {e.response.status_code}")
                raise
            except httpx.RequestError as e:
                self._emit_json("error", message=f"Request error during finalize: {e}")
                raise
        else:
            # Rich progress for terminal
            stage_tasks: dict = {}

            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description:<12}"),
                BarColumn(),
                MofNCompleteColumn(),
                TimeElapsedColumn(),
                transient=False,
            ) as progress:
                try:
                    for event in self.client.finalize_stream(self.config.project_id):
                        if event.stage == "complete":
                            total_indexed = event.files_indexed or 0
                            total_skipped = event.files_skipped or 0
                            total_nodes = event.nodes_created or 0
                            total_edges = event.edges_created or 0
                            if event.errors:
                                all_errors.extend(event.errors)
                            for task_id in stage_tasks.values():
                                progress.update(task_id, completed=progress.tasks[task_id].total)
                            break

                        if event.stage == "error":
                            all_errors.append({"path": "finalize", "error": event.message})
                            break

                        stage = event.stage
                        current = event.current or 0
                        total = event.total or 0

                        if stage not in stage_tasks:
                            task_total = total if total > 0 else 100
                            label = STAGE_LABELS.get(stage, stage)
                            stage_tasks[stage] = progress.add_task(
                                label,
                                total=task_total,
                            )

                        task_id = stage_tasks[stage]

                        if total > 0 and progress.tasks[task_id].total != total:
                            progress.update(task_id, total=total)

                        progress.update(task_id, completed=current)

                except httpx.HTTPStatusError as e:
                    self.log(f"Finalize error: {e.response.status_code}")
                    raise
                except httpx.RequestError as e:
                    self.log(f"Request error during finalize: {e}")
                    raise

        # Summary
        if total_skipped > 0:
            self.log(f"Indexed {total_indexed} files ({total_skipped} unchanged), {total_nodes} nodes, {total_edges} edges")
        else:
            self.log(f"Indexed {total_indexed} files, {total_nodes} nodes, {total_edges} edges")

        if all_errors:
            self.log(f"Errors ({len(all_errors)}):")
            for err in all_errors[:10]:
                self.log(f"  {err.get('path', 'unknown')}: {err.get('error', 'unknown error')}")
            if len(all_errors) > 10:
                self.log(f"  ... and {len(all_errors) - 10} more errors")

        return True

    async def _upload_batches_parallel_json(
        self,
        batches: list[list[dict]],
        num_batches: int,
        max_concurrent: int = MAX_CONCURRENT,
    ) -> dict:
        """Upload all batches in parallel, emitting NDJSON progress."""
        semaphore = asyncio.Semaphore(max_concurrent)
        results = {"total_parsed": 0, "total_staged": 0, "total_skipped": 0, "errors": []}
        completed = 0
        lock = asyncio.Lock()

        async with httpx.AsyncClient(
            timeout=60.0,
            limits=httpx.Limits(max_connections=max_concurrent + 2, max_keepalive_connections=max_concurrent),
        ) as client:
            async def upload_one(batch: list[dict], batch_num: int) -> dict:
                nonlocal completed
                async with semaphore:
                    resp = await client.post(
                        f"{self.config.api_url}/v1/index/upload",
                        json={
                            "project_id": self.config.project_id,
                            "files": batch,
                        },
                        headers={
                            "Authorization": f"Bearer {self.config.api_key}",
                            "Content-Type": "application/json",
                        },
                    )
                    resp.raise_for_status()
                    data = resp.json()

                    async with lock:
                        completed += 1
                        self._emit_json("progress", phase="uploading", current=completed, total=num_batches)

                    return data

            tasks = [upload_one(batch, i) for i, batch in enumerate(batches)]
            responses = await asyncio.gather(*tasks, return_exceptions=True)

        for resp in responses:
            if isinstance(resp, Exception):
                results["errors"].append(str(resp))
                logger.error(f"Batch upload failed: {resp}")
            elif isinstance(resp, dict):
                results["total_parsed"] += resp.get("files_parsed", 0)
                results["total_staged"] = max(results["total_staged"], resp.get("total_staged", 0))
                results["total_skipped"] += resp.get("files_skipped", 0)

        if results["errors"]:
            raise Exception(f"{len(results['errors'])} batches failed: {results['errors'][0]}")

        return results

    async def _upload_batches_parallel(
        self,
        batches: list[list[dict]],
        progress: Progress,
        task_id,
        max_concurrent: int = MAX_CONCURRENT,
    ) -> dict:
        """Upload all batches in parallel using asyncio."""
        semaphore = asyncio.Semaphore(max_concurrent)
        results = {"total_parsed": 0, "total_staged": 0, "total_skipped": 0, "errors": []}
        completed = 0
        lock = asyncio.Lock()

        # Create ONE shared client with connection pooling for all requests
        # This enables HTTP/2 multiplexing and connection reuse
        async with httpx.AsyncClient(
            timeout=60.0,
            limits=httpx.Limits(max_connections=max_concurrent + 2, max_keepalive_connections=max_concurrent),
        ) as client:

            async def upload_one(batch: list[dict], batch_num: int) -> dict:
                nonlocal completed
                async with semaphore:
                    resp = await client.post(
                        f"{self.config.api_url}/v1/index/upload",
                        json={
                            "project_id": self.config.project_id,
                            "files": batch,
                        },
                        headers={
                            "Authorization": f"Bearer {self.config.api_key}",
                            "Content-Type": "application/json",
                        },
                    )
                    resp.raise_for_status()
                    data = resp.json()

                    # Update progress (use lock to ensure thread-safe counter)
                    async with lock:
                        completed += 1
                        progress.update(task_id, completed=completed)

                    return data

            # Create tasks for all batches
            tasks = [
                upload_one(batch, i)
                for i, batch in enumerate(batches)
            ]

            # Run all tasks in parallel and collect results
            responses = await asyncio.gather(*tasks, return_exceptions=True)

        for resp in responses:
            if isinstance(resp, Exception):
                results["errors"].append(str(resp))
                logger.error(f"Batch upload failed: {resp}")
            elif isinstance(resp, dict):
                results["total_parsed"] += resp.get("files_parsed", 0)
                results["total_staged"] = max(results["total_staged"], resp.get("total_staged", 0))  # Use max (cumulative, order not guaranteed)
                results["total_skipped"] += resp.get("files_skipped", 0)  # Sum skipped across batches

        if results["errors"]:
            # If any batch failed, raise an exception
            raise Exception(f"{len(results['errors'])} batches failed: {results['errors'][0]}")

        return results

    def _upload_files_batched(self, files: list[dict]) -> bool:
        """Upload files in batches with progress."""
        if not files:
            if self.json_mode:
                self._emit_json("complete", files_indexed=0, nodes_created=0, edges_created=0)
            return True

        upload_files = [
            {
                "path": f["path"],
                "content": f["content"],
                "language": f["language"],
                "hash": f["hash"],
            }
            for f in files
        ]

        total_files = len(upload_files)
        batches = self._split_into_batches(upload_files)
        num_batches = len(batches)

        total_indexed = 0
        total_skipped = 0
        total_nodes = 0
        total_edges = 0
        all_errors = []
        files_done = 0

        if self.json_mode:
            for batch_num, batch in enumerate(batches):
                self._emit_json("progress", phase="uploading", current=batch_num, total=num_batches,
                                description=f"Uploading batch {batch_num + 1}/{num_batches}")
                try:
                    result = self._send_batch_stream(batch)
                    if result:
                        total_indexed += result.files_indexed or 0
                        total_skipped += result.files_skipped or 0
                        total_nodes += result.nodes_created or 0
                        total_edges += result.edges_created or 0
                        if result.errors:
                            all_errors.extend(result.errors)
                except Exception as e:
                    self._emit_json("error", message=f"Batch {batch_num + 1} failed: {e}")

                files_done += len(batch)

            self._emit_json(
                "complete",
                files_indexed=total_indexed,
                files_skipped=total_skipped,
                nodes_created=total_nodes,
                edges_created=total_edges,
                errors=len(all_errors),
            )
        else:
            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                MofNCompleteColumn(),
                TimeElapsedColumn(),
                transient=False,
            ) as progress:
                upload_task = progress.add_task(
                    "Uploading files",
                    total=total_files
                )

                for batch_num, batch in enumerate(batches):
                    progress.update(
                        upload_task,
                        description=f"Uploading batch {batch_num + 1}/{num_batches}"
                    )

                    try:
                        result = self._send_batch_stream(batch, progress)
                        if result:
                            total_indexed += result.files_indexed or 0
                            total_skipped += result.files_skipped or 0
                            total_nodes += result.nodes_created or 0
                            total_edges += result.edges_created or 0
                            if result.errors:
                                all_errors.extend(result.errors)
                    except Exception as e:
                        self.log(f"  [ERROR] Batch {batch_num + 1} failed: {e}")

                    files_done += len(batch)
                    progress.update(upload_task, completed=files_done)

        # Summary
        if total_skipped > 0:
            self.log(f"Indexed {total_indexed} files ({total_skipped} unchanged), {total_nodes} nodes, {total_edges} edges")
        else:
            self.log(f"Indexed {total_indexed} files, {total_nodes} nodes, {total_edges} edges")
        if all_errors:
            self.log(f"Errors ({len(all_errors)}):")
            for err in all_errors[:10]:
                self.log(f"  {err.get('path', 'unknown')}: {err.get('error', 'unknown error')}")
            if len(all_errors) > 10:
                self.log(f"  ... and {len(all_errors) - 10} more errors")

        return True

    def _send_batch_stream(
        self,
        files: list[dict],
        parent_progress: Optional[Progress] = None
    ) -> Optional[IndexProgress]:
        """Send a batch using streaming endpoint, return final result."""
        final_result = None

        try:
            for event in self.client.index_files_stream(
                self.config.project_id, files
            ):
                if event.stage == "complete":
                    final_result = event
                    break
        except httpx.HTTPStatusError as e:
            logger.error(f"Stream error: {e.response.status_code}")
            # Try sync fallback
            try:
                result = self.client.index_files(self.config.project_id, files)
                final_result = IndexProgress(
                    stage="complete",
                    message="Done",
                    current=len(files),
                    total=len(files),
                    files_indexed=result.files_indexed,
                    files_skipped=result.files_skipped,
                    nodes_created=result.nodes_created,
                    edges_created=result.edges_created,
                    errors=result.errors,
                )
            except Exception as e2:
                logger.error(f"Sync fallback also failed: {e2}")
                raise
        except httpx.RequestError as e:
            logger.error(f"Request error: {e}")
            raise

        return final_result

    def initial_scan(
        self,
        root_path: Path,
        ignore_check: Callable[[Path], bool]
    ):
        """
        Full scan and sync with backend.

        This is the main entry point for watch --force or sync command.
        """
        # Step 1: Scan files locally
        files_info = self.scan_files(root_path, ignore_check)

        # Populate file cache so watch mode can skip unchanged files
        for f in files_info:
            self._file_cache[f["path"]] = {
                "hash": f["hash"],
                "language": f["language"],
            }

        # Step 2: Send to backend (includes hashes for server-side dedup)
        self.sync_files(files_info)

    def delete_file_immediate(self, path: Path) -> bool:
        """Delete a file's nodes from the graph (for watch mode)."""
        language = get_language(path)
        if not language:
            return False

        rel_path = str(path.relative_to(self.root_path))

        try:
            deleted = self.client.delete_files(self.config.project_id, [rel_path])
            # Remove from cache
            self._file_cache.pop(rel_path, None)
            return deleted > 0
        except Exception as e:
            logger.error(f"Failed to delete {rel_path}: {e}")
            return False

    def sync_file_immediate(self, path: Path) -> bool:
        """
        Sync a single file immediately (for watch mode).

        Returns True if file was synced.
        """
        language = get_language(path)
        if not language:
            return False

        try:
            if path.stat().st_size > MAX_SINGLE_FILE_BYTES:
                return False
        except OSError:
            return False

        try:
            content = path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            return False

        rel_path = str(path.relative_to(self.root_path))

        # Check if content changed from our cache
        content_hash = self.compute_hash(content)
        cached = self._file_cache.get(rel_path)

        if cached and cached.get("hash") == content_hash:
            return False  # No change

        # Update cache
        self._file_cache[rel_path] = {
            "hash": content_hash,
            "content": content,
            "language": language,
        }

        # Send to backend
        files = [{"path": rel_path, "content": content, "language": language, "hash": content_hash}]

        try:
            result = self.client.index_files(self.config.project_id, files)
            return result.files_indexed > 0
        except Exception as e:
            logger.error(f"Failed to sync {rel_path}: {e}")
            return False
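
For orientation, the sketch below shows how a caller might drive the class above: initial_scan performs the full scan-and-upload described in its docstring, and close() releases the underlying NogicClient. This is a minimal sketch, not code from the package: the Config construction is a placeholder (this diff does not show how nogic/config.py builds one, only that it must expose api_url, api_key, and project_id), and ignore_check is a hypothetical callback that returns True for paths to skip, matching how scan_files prunes directories.

    from pathlib import Path

    from nogic.config import Config
    from nogic.watcher.sync import SyncService

    IGNORED_NAMES = {".git", "node_modules", "__pycache__", ".venv"}  # hypothetical ignore set

    def ignore_check(path: Path) -> bool:
        # True means "skip this path"; scan_files uses this to prune whole directories.
        return path.name in IGNORED_NAMES

    config = Config(...)  # placeholder: must carry api_url, api_key, and project_id
    service = SyncService(config, root_path=Path("."))
    try:
        # Scans supported files, fills the in-memory hash cache, then calls sync_files(),
        # which prefers the two-phase /v1/index/upload + finalize flow and falls back
        # to the legacy batched flow when the server lacks it.
        service.initial_scan(Path("."), ignore_check)
    finally:
        service.close()

With json_mode=True, the same run would instead write the NDJSON lines produced by _emit_json to stdout: a stream of {"event": "progress", ...} records followed by a single {"event": "complete", ...} or {"event": "error", ...}.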