nogic 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nogic/watcher/sync.py ADDED
@@ -0,0 +1,879 @@
+ """Sync service for file changes - sends to backend API."""
+
+ import asyncio
+ import hashlib
+ import json as _json
+ import logging
+ import os
+ import sys
+ import time
+ from pathlib import Path
+ from typing import Callable, Optional
+
+ import httpx
+ from rich.progress import (
+     Progress,
+     SpinnerColumn,
+     TextColumn,
+     BarColumn,
+     TaskProgressColumn,
+     TimeElapsedColumn,
+     MofNCompleteColumn,
+ )
+ from rich.console import Console
+
+ from nogic.config import Config, get_language
+ from nogic.api import NogicClient, IndexProgress
+
+ logger = logging.getLogger(__name__)
+
+ def _int_env(name: str, default: int) -> int:
+     """Read an integer from an environment variable, falling back to default on bad values."""
+     raw = os.environ.get(name)
+     if raw is None:
+         return default
+     try:
+         return int(raw)
+     except ValueError:
+         return default
+
+ # Batch size for uploading files (configurable via NOGIC_BATCH_SIZE)
+ BATCH_SIZE = _int_env("NOGIC_BATCH_SIZE", 150)
+
+ # Max payload size per batch in bytes (~10MB default, well under GCP LB 32MB limit)
+ MAX_BATCH_BYTES = _int_env("NOGIC_MAX_BATCH_BYTES", 10 * 1024 * 1024)
+
+ # Hard cap per file in bytes (~30MB) - must fit in a single request under GCP LB 32MB limit
+ MAX_SINGLE_FILE_BYTES = _int_env("NOGIC_MAX_FILE_SIZE", 30 * 1024 * 1024)
+
+ # Max concurrent uploads (configurable via NOGIC_MAX_CONCURRENT)
+ MAX_CONCURRENT = _int_env("NOGIC_MAX_CONCURRENT", 8)
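+ # Illustrative shell override (values are examples; the env var names are the
+ # ones read above, but the `nogic` CLI entry point is an assumption):
+ #   NOGIC_BATCH_SIZE=50 NOGIC_MAX_CONCURRENT=4 nogic sync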
+
+ STAGE_LABELS = {
+     "hashing": "Computing file hashes",
+     "parsing": "Parsing files",
+     "imports": "Resolving imports",
+     "nodes": "Creating symbol nodes",
+     "call_graph": "Building call graph",
+     "edges": "Linking relationships",
+     "embedding": "Generating embeddings",
+     "saving": "Saving to database",
+     "complete": "Complete",
+ }
+
+ # Check if legacy mode is forced via environment variable
+ LEGACY_UPLOAD = os.environ.get("NOGIC_LEGACY_UPLOAD", "").lower() in ("1", "true", "yes")
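+ # e.g. NOGIC_LEGACY_UPLOAD=1 forces the sequential _sync_files_legacy() path
+ # below, even when the server supports two-phase indexing.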
+
+
+ class SyncService:
+     """Sync service that indexes files via the backend API.
+
+     Uses backend as source of truth - no local sync state tracking.
+     """
+
+     def __init__(
+         self,
+         config: Config,
+         root_path: Path,
+         log: Optional[Callable[[str], None]] = None,
+         json_mode: bool = False,
+     ):
+         self.config = config
+         self.root_path = root_path
+         self.log = log or print
+         self.json_mode = json_mode
+         self._client: Optional[NogicClient] = None
+         # In-memory cache of file info for current session
+         self._file_cache: dict[str, dict] = {}  # path -> {hash, language[, content]}
+
+     def _emit_json(self, event: str, **kwargs):
+         """Emit a single NDJSON progress line to stdout (json_mode only)."""
+         payload = {"event": event, "timestamp": int(time.time()), **kwargs}
+         sys.stdout.write(_json.dumps(payload) + "\n")
+         sys.stdout.flush()
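+     # Example emitted line (illustrative values):
+     #   {"event": "progress", "timestamp": 1700000000, "phase": "scanning", "current": 50, "total": 1200}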
+
+     @property
+     def client(self) -> NogicClient:
+         if self._client is None:
+             self._client = NogicClient(self.config)
+         return self._client
+
+     def close(self):
+         if self._client:
+             self._client.close()
+             self._client = None
+
+     @staticmethod
+     def compute_hash(content: str) -> str:
+         """Compute SHA256 hash of content."""
+         return hashlib.sha256(content.encode()).hexdigest()
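+     # e.g. compute_hash("hello")
+     #   == "2cf24dba5fb0a30e26e83b2ac5b9e29e1b161e5c1fa7425e73043362938b9824"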
+
+     @staticmethod
+     def _split_into_batches(files: list[dict]) -> list[list[dict]]:
+         """Split files into batches respecting both count and payload size limits.
+
+         Files larger than MAX_BATCH_BYTES get their own solo batch so they
+         don't inflate a shared batch beyond the limit.
+         """
+         batches = []
+         current_batch = []
+         current_size = 0
+
+         for f in files:
+             file_size = len(f.get("content", "").encode("utf-8"))
+
+             # If this single file exceeds the batch byte limit, send it solo
+             if file_size > MAX_BATCH_BYTES:
+                 if current_batch:
+                     batches.append(current_batch)
+                     current_batch = []
+                     current_size = 0
+                 batches.append([f])
+                 continue
+
+             # Start new batch if adding this file exceeds limits
+             if current_batch and (
+                 len(current_batch) >= BATCH_SIZE
+                 or current_size + file_size > MAX_BATCH_BYTES
+             ):
+                 batches.append(current_batch)
+                 current_batch = []
+                 current_size = 0
+
+             current_batch.append(f)
+             current_size += file_size
+
+         if current_batch:
+             batches.append(current_batch)
+
+         return batches
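+     # Worked example (hypothetical limits BATCH_SIZE=2, MAX_BATCH_BYTES=10):
+     # content sizes [4, 4, 12, 3] become [[4, 4], [12], [3]] - the first two
+     # share a batch, the oversized 12-byte file ships solo, and the last file
+     # starts a fresh batch because the solo flush emptied the accumulator.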
+
+     def scan_files(
+         self,
+         root_path: Path,
+         ignore_check: Callable[[Path], bool]
+     ) -> list[dict]:
+         """
+         Scan directory and collect file info.
+
+         Returns list of {path, hash, content, language} for supported files.
+         """
+         # Collect files using os.walk for directory pruning (skips ignored dirs entirely)
+         status = None
+         if not self.json_mode:
+             status = Console().status("[bold blue]Finding files...", spinner="dots")
+             status.start()
+
+         all_files = []
+         for dirpath, dirnames, filenames in os.walk(root_path):
+             dp = Path(dirpath)
+             dirnames[:] = [
+                 d for d in dirnames
+                 if not ignore_check(dp / d)
+             ]
+             for fname in filenames:
+                 fpath = dp / fname
+                 if not ignore_check(fpath):
+                     all_files.append(fpath)
+
+         if status is not None:
+             status.stop()
+
+         files_info = []
+         supported_count = 0
+         skipped_large = 0
+         total_files = len(all_files)
+
+         if self.json_mode:
+             # Emit progress via NDJSON
+             for i, path in enumerate(all_files):
+                 if i % 50 == 0 or i == total_files - 1:
+                     self._emit_json("progress", phase="scanning", current=i + 1, total=total_files)
+
+                 language = get_language(path)
+                 if not language:
+                     continue
+
+                 try:
+                     file_size = path.stat().st_size
+                 except OSError:
+                     continue
+
+                 if file_size > MAX_SINGLE_FILE_BYTES:
+                     skipped_large += 1
+                     continue
+
+                 try:
+                     content = path.read_text(encoding="utf-8")
+                 except (OSError, UnicodeDecodeError):
+                     continue
+
+                 rel_path = str(path.relative_to(root_path))
+                 content_hash = self.compute_hash(content)
+
+                 files_info.append({
+                     "path": rel_path,
+                     "hash": content_hash,
+                     "content": content,
+                     "language": language,
+                 })
+                 supported_count += 1
+         else:
+             # Rich progress bar
+             with Progress(
+                 SpinnerColumn(),
+                 TextColumn("[progress.description]{task.description}"),
+                 BarColumn(),
+                 MofNCompleteColumn(),
+                 TimeElapsedColumn(),
+                 transient=False,
+             ) as progress:
+                 task = progress.add_task("Scanning files", total=total_files)
+
+                 for path in all_files:
+                     progress.update(task, advance=1)
+
+                     language = get_language(path)
+                     if not language:
+                         continue
+
+                     try:
+                         file_size = path.stat().st_size
+                     except OSError:
+                         continue
+
+                     if file_size > MAX_SINGLE_FILE_BYTES:
+                         skipped_large += 1
+                         continue
+
+                     try:
+                         content = path.read_text(encoding="utf-8")
+                     except (OSError, UnicodeDecodeError):
+                         continue
+
+                     rel_path = str(path.relative_to(root_path))
+                     content_hash = self.compute_hash(content)
+
+                     files_info.append({
+                         "path": rel_path,
+                         "hash": content_hash,
+                         "content": content,
+                         "language": language,
+                     })
+                     supported_count += 1
+
+         msg = f"Found {supported_count} supported files ({len(all_files)} total)"
+         if skipped_large > 0:
+             msg += f", {skipped_large} skipped (>{MAX_SINGLE_FILE_BYTES // (1024 * 1024)}MB)"
+         self.log(msg)
+         return files_info
+
+     def sync_files(self, files_info: list[dict]) -> bool:
+         """
+         Sync files to backend.
+
+         Uses two-phase parallel indexing if server supports it:
+         1. Upload all batches in parallel to /v1/index/upload
+         2. Call /v1/index/finalize/stream once for global resolution
+
+         Falls back to legacy sequential flow if server doesn't support it.
+         """
+         if not files_info:
+             self.log("No supported files to sync")
+             return True
+
+         if not self.config.project_id:
+             self.log("Error: No project configured.")
+             return False
+
+         if not self.config.api_key:
+             self.log("Error: No API key configured.")
+             return False
+
+         # Check if we should use legacy mode
+         if LEGACY_UPLOAD:
+             self.log("Using legacy upload mode (NOGIC_LEGACY_UPLOAD=1)")
+             return self._sync_files_legacy(files_info)
+
+         # Try two-phase parallel flow
+         try:
+             # Check if server supports new endpoints
+             stats = self.client.get_staging_stats(self.config.project_id)
+             if stats is None:
+                 # Server returned 404 - doesn't support two-phase indexing
+                 self.log("Server doesn't support two-phase indexing, using legacy mode")
+                 return self._sync_files_legacy(files_info)
+
+             # Use new parallel flow
+             return self._sync_files_parallel(files_info)
+         except httpx.HTTPStatusError as e:
+             if e.response.status_code == 404:
+                 self.log("Server doesn't support two-phase indexing, using legacy mode")
+                 return self._sync_files_legacy(files_info)
+             raise
+         except Exception as e:
+             logger.warning(f"Two-phase check failed ({e}), falling back to legacy mode")
+             return self._sync_files_legacy(files_info)
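+     # Two-phase request flow at a glance (endpoints as in the docstring above;
+     # HTTP verbs other than the upload POST live inside NogicClient):
+     #   Phase 1: POST /v1/index/upload       - one call per batch, in parallel
+     #   Phase 2: /v1/index/finalize/stream   - one call, streams stage progress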
+
+     def _sync_files_legacy(self, files_info: list[dict]) -> bool:
+         """Legacy sequential upload (old flow)."""
+         return self._upload_files_batched(files_info)
+
+     def _sync_files_parallel(self, files_info: list[dict]) -> bool:
+         """
+         Two-phase parallel upload flow.
+
+         Phase 1: Upload all batches in parallel
+         Phase 2: Finalize with streaming progress
+         """
+         # Prepare files for upload
+         upload_files = [
+             {
+                 "path": f["path"],
+                 "content": f["content"],
+                 "language": f["language"],
+                 "hash": f["hash"],
+             }
+             for f in files_info
+         ]
+
+         # Split into batches respecting both count and size limits
+         batches = self._split_into_batches(upload_files)
+
+         total_files = len(upload_files)
+         num_batches = len(batches)
+
+         # Clear any stale staging data from previous sessions
+         try:
+             self.client.clear_staging(self.config.project_id)
+         except Exception as e:
+             logger.debug(f"Could not clear staging (may not exist): {e}")
+
+         # Phase 1: Parallel upload
+         self.log(f"Phase 1: Uploading {total_files} files in {num_batches} batches...")
+
+         if self.json_mode:
+             self._emit_json("progress", phase="uploading", current=0, total=num_batches)
+
+             try:
+                 result = asyncio.run(
+                     self._upload_batches_parallel_json(batches, num_batches)
+                 )
+             except Exception as e:
+                 self._emit_json("error", message=f"Parallel upload failed: {e}")
+                 try:
+                     self.client.clear_staging(self.config.project_id)
+                 except Exception:
+                     pass
+                 self.log("Falling back to legacy mode...")
+                 return self._sync_files_legacy(files_info)
+         else:
+             with Progress(
+                 SpinnerColumn(),
+                 TextColumn("[progress.description]{task.description}"),
+                 BarColumn(),
+                 MofNCompleteColumn(),
+                 TimeElapsedColumn(),
+                 transient=False,
+             ) as progress:
+                 upload_task = progress.add_task(
+                     "Uploading batches",
+                     total=num_batches
+                 )
+
+                 try:
+                     result = asyncio.run(
+                         self._upload_batches_parallel(batches, progress, upload_task)
+                     )
+                 except Exception as e:
+                     self.log(f"Parallel upload failed: {e}")
+                     try:
+                         self.client.clear_staging(self.config.project_id)
+                     except Exception:
+                         pass
+                     self.log("Falling back to legacy mode...")
+                     return self._sync_files_legacy(files_info)
+
+         total_uploaded = result["total_parsed"]
+         total_staged = result["total_staged"]
+         total_skipped = result.get("total_skipped", 0)
+
+         if total_skipped == 0 and total_staged < total_uploaded:
+             total_skipped = total_uploaded - total_staged
+
+         self.log(f"Uploaded {total_uploaded} files ({total_staged} staged, {total_skipped} unchanged)")
+
+         if total_staged == 0:
+             self.log("All files unchanged, nothing to index")
+             if self.json_mode:
+                 self._emit_json("complete", files_indexed=0, nodes_created=0, edges_created=0)
+             return True
+
+         # Phase 2: Finalize with streaming progress
+         self.log("Phase 2: Processing...")
+
+         total_indexed = 0
+         total_skipped = 0
+         total_nodes = 0
+         total_edges = 0
+         all_errors = []
+
+         if self.json_mode:
+             try:
+                 for event in self.client.finalize_stream(self.config.project_id):
+                     if event.stage == "complete":
+                         total_indexed = event.files_indexed or 0
+                         total_skipped = event.files_skipped or 0
+                         total_nodes = event.nodes_created or 0
+                         total_edges = event.edges_created or 0
+                         if event.errors:
+                             all_errors.extend(event.errors)
+                         self._emit_json(
+                             "complete",
+                             files_indexed=total_indexed,
+                             files_skipped=total_skipped,
+                             nodes_created=total_nodes,
+                             edges_created=total_edges,
+                             errors=len(all_errors),
+                         )
+                         break
+
+                     if event.stage == "error":
+                         self._emit_json("error", message=event.message or "Processing error")
+                         break
+
+                     stage = event.stage
+                     current = event.current or 0
+                     total = event.total or 0
+                     label = STAGE_LABELS.get(stage, stage)
+                     self._emit_json("progress", phase="processing", stage=stage, label=label, current=current, total=total)
+
+             except httpx.HTTPStatusError as e:
+                 self._emit_json("error", message=f"Finalize error: {e.response.status_code}")
+                 raise
+             except httpx.RequestError as e:
+                 self._emit_json("error", message=f"Request error during finalize: {e}")
+                 raise
+         else:
+             # Rich progress for terminal
+             stage_tasks: dict = {}
+
+             with Progress(
+                 SpinnerColumn(),
+                 TextColumn("[progress.description]{task.description:<12}"),
+                 BarColumn(),
+                 MofNCompleteColumn(),
+                 TimeElapsedColumn(),
+                 transient=False,
+             ) as progress:
+                 try:
+                     for event in self.client.finalize_stream(self.config.project_id):
+                         if event.stage == "complete":
+                             total_indexed = event.files_indexed or 0
+                             total_skipped = event.files_skipped or 0
+                             total_nodes = event.nodes_created or 0
+                             total_edges = event.edges_created or 0
+                             if event.errors:
+                                 all_errors.extend(event.errors)
+                             for task_id in stage_tasks.values():
+                                 progress.update(task_id, completed=progress.tasks[task_id].total)
+                             break
+
+                         if event.stage == "error":
+                             all_errors.append({"path": "finalize", "error": event.message})
+                             break
+
+                         stage = event.stage
+                         current = event.current or 0
+                         total = event.total or 0
+
+                         if stage not in stage_tasks:
+                             task_total = total if total > 0 else 100
+                             label = STAGE_LABELS.get(stage, stage)
+                             stage_tasks[stage] = progress.add_task(
+                                 label,
+                                 total=task_total,
+                             )
+
+                         task_id = stage_tasks[stage]
+
+                         if total > 0 and progress.tasks[task_id].total != total:
+                             progress.update(task_id, total=total)
+
+                         progress.update(task_id, completed=current)
+
+                 except httpx.HTTPStatusError as e:
+                     self.log(f"Finalize error: {e.response.status_code}")
+                     raise
+                 except httpx.RequestError as e:
+                     self.log(f"Request error during finalize: {e}")
+                     raise
+
+         # Summary
+         if total_skipped > 0:
+             self.log(f"Indexed {total_indexed} files ({total_skipped} unchanged), {total_nodes} nodes, {total_edges} edges")
+         else:
+             self.log(f"Indexed {total_indexed} files, {total_nodes} nodes, {total_edges} edges")
+
+         if all_errors:
+             self.log(f"Errors ({len(all_errors)}):")
+             for err in all_errors[:10]:
+                 self.log(f"  {err.get('path', 'unknown')}: {err.get('error', 'unknown error')}")
+             if len(all_errors) > 10:
+                 self.log(f"  ... and {len(all_errors) - 10} more errors")
+
+         return True
+
+     async def _upload_batches_parallel_json(
+         self,
+         batches: list[list[dict]],
+         num_batches: int,
+         max_concurrent: int = MAX_CONCURRENT,
+     ) -> dict:
+         """Upload all batches in parallel, emitting NDJSON progress."""
+         semaphore = asyncio.Semaphore(max_concurrent)
+         results = {"total_parsed": 0, "total_staged": 0, "total_skipped": 0, "errors": []}
+         completed = 0
+         lock = asyncio.Lock()
+
+         async with httpx.AsyncClient(
+             timeout=60.0,
+             limits=httpx.Limits(max_connections=max_concurrent + 2, max_keepalive_connections=max_concurrent),
+         ) as client:
+             async def upload_one(batch: list[dict], batch_num: int) -> dict:
+                 nonlocal completed
+                 async with semaphore:
+                     resp = await client.post(
+                         f"{self.config.api_url}/v1/index/upload",
+                         json={
+                             "project_id": self.config.project_id,
+                             "files": batch,
+                         },
+                         headers={
+                             "Authorization": f"Bearer {self.config.api_key}",
+                             "Content-Type": "application/json",
+                         },
+                     )
+                     resp.raise_for_status()
+                     data = resp.json()
+
+                     async with lock:
+                         completed += 1
+                         self._emit_json("progress", phase="uploading", current=completed, total=num_batches)
+
+                     return data
+
+             tasks = [upload_one(batch, i) for i, batch in enumerate(batches)]
+             responses = await asyncio.gather(*tasks, return_exceptions=True)
+
+         for resp in responses:
+             if isinstance(resp, Exception):
+                 results["errors"].append(str(resp))
+                 logger.error(f"Batch upload failed: {resp}")
+             elif isinstance(resp, dict):
+                 results["total_parsed"] += resp.get("files_parsed", 0)
+                 results["total_staged"] = max(results["total_staged"], resp.get("total_staged", 0))
+                 results["total_skipped"] += resp.get("files_skipped", 0)
+
+         if results["errors"]:
+             raise Exception(f"{len(results['errors'])} batches failed: {results['errors'][0]}")
+
+         return results
+
+     async def _upload_batches_parallel(
+         self,
+         batches: list[list[dict]],
+         progress: Progress,
+         task_id,
+         max_concurrent: int = MAX_CONCURRENT,
+     ) -> dict:
+         """Upload all batches in parallel using asyncio."""
+         semaphore = asyncio.Semaphore(max_concurrent)
+         results = {"total_parsed": 0, "total_staged": 0, "total_skipped": 0, "errors": []}
+         completed = 0
+         lock = asyncio.Lock()
+
+         # Create ONE shared client with connection pooling for all requests.
+         # This enables connection reuse across batches (HTTP/2 multiplexing
+         # would additionally require http2=True and the httpx h2 extra).
+         async with httpx.AsyncClient(
+             timeout=60.0,
+             limits=httpx.Limits(max_connections=max_concurrent + 2, max_keepalive_connections=max_concurrent),
+         ) as client:
+
+             async def upload_one(batch: list[dict], batch_num: int) -> dict:
+                 nonlocal completed
+                 async with semaphore:
+                     resp = await client.post(
+                         f"{self.config.api_url}/v1/index/upload",
+                         json={
+                             "project_id": self.config.project_id,
+                             "files": batch,
+                         },
+                         headers={
+                             "Authorization": f"Bearer {self.config.api_key}",
+                             "Content-Type": "application/json",
+                         },
+                     )
+                     resp.raise_for_status()
+                     data = resp.json()
+
+                     # Update progress (the lock keeps the shared counter consistent)
+                     async with lock:
+                         completed += 1
+                         progress.update(task_id, completed=completed)
+
+                     return data
+
+             # Create tasks for all batches
+             tasks = [
+                 upload_one(batch, i)
+                 for i, batch in enumerate(batches)
+             ]
+
+             # Run all tasks in parallel and collect results
+             responses = await asyncio.gather(*tasks, return_exceptions=True)
+
+         for resp in responses:
+             if isinstance(resp, Exception):
+                 results["errors"].append(str(resp))
+                 logger.error(f"Batch upload failed: {resp}")
+             elif isinstance(resp, dict):
+                 results["total_parsed"] += resp.get("files_parsed", 0)
+                 results["total_staged"] = max(results["total_staged"], resp.get("total_staged", 0))  # Use max (cumulative, order not guaranteed)
+                 results["total_skipped"] += resp.get("files_skipped", 0)  # Sum skipped across batches
+
+         if results["errors"]:
+             # If any batch failed, raise an exception
+             raise Exception(f"{len(results['errors'])} batches failed: {results['errors'][0]}")
+
+         return results
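+     # Note: asyncio.gather() starts every upload_one() coroutine at once; the
+     # semaphore is what actually bounds in-flight requests to max_concurrent,
+     # while httpx.Limits caps the pooled connections underneath.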
+
+     def _upload_files_batched(self, files: list[dict]) -> bool:
+         """Upload files in batches with progress."""
+         if not files:
+             if self.json_mode:
+                 self._emit_json("complete", files_indexed=0, nodes_created=0, edges_created=0)
+             return True
+
+         upload_files = [
+             {
+                 "path": f["path"],
+                 "content": f["content"],
+                 "language": f["language"],
+                 "hash": f["hash"],
+             }
+             for f in files
+         ]
+
+         total_files = len(upload_files)
+         batches = self._split_into_batches(upload_files)
+         num_batches = len(batches)
+
+         total_indexed = 0
+         total_skipped = 0
+         total_nodes = 0
+         total_edges = 0
+         all_errors = []
+         files_done = 0
+
+         if self.json_mode:
+             for batch_num, batch in enumerate(batches):
+                 self._emit_json("progress", phase="uploading", current=batch_num, total=num_batches,
+                                 description=f"Uploading batch {batch_num + 1}/{num_batches}")
+                 try:
+                     result = self._send_batch_stream(batch)
+                     if result:
+                         total_indexed += result.files_indexed or 0
+                         total_skipped += result.files_skipped or 0
+                         total_nodes += result.nodes_created or 0
+                         total_edges += result.edges_created or 0
+                         if result.errors:
+                             all_errors.extend(result.errors)
+                 except Exception as e:
+                     self._emit_json("error", message=f"Batch {batch_num + 1} failed: {e}")
+
+                 files_done += len(batch)
+
+             self._emit_json(
+                 "complete",
+                 files_indexed=total_indexed,
+                 files_skipped=total_skipped,
+                 nodes_created=total_nodes,
+                 edges_created=total_edges,
+                 errors=len(all_errors),
+             )
+         else:
+             with Progress(
+                 SpinnerColumn(),
+                 TextColumn("[progress.description]{task.description}"),
+                 BarColumn(),
+                 MofNCompleteColumn(),
+                 TimeElapsedColumn(),
+                 transient=False,
+             ) as progress:
+                 upload_task = progress.add_task(
+                     "Uploading files",
+                     total=total_files
+                 )
+
+                 for batch_num, batch in enumerate(batches):
+                     progress.update(
+                         upload_task,
+                         description=f"Uploading batch {batch_num + 1}/{num_batches}"
+                     )
+
+                     try:
+                         result = self._send_batch_stream(batch, progress)
+                         if result:
+                             total_indexed += result.files_indexed or 0
+                             total_skipped += result.files_skipped or 0
+                             total_nodes += result.nodes_created or 0
+                             total_edges += result.edges_created or 0
+                             if result.errors:
+                                 all_errors.extend(result.errors)
+                     except Exception as e:
+                         self.log(f"  [ERROR] Batch {batch_num + 1} failed: {e}")
+
+                     files_done += len(batch)
+                     progress.update(upload_task, completed=files_done)
+
+         # Summary
+         if total_skipped > 0:
+             self.log(f"Indexed {total_indexed} files ({total_skipped} unchanged), {total_nodes} nodes, {total_edges} edges")
+         else:
+             self.log(f"Indexed {total_indexed} files, {total_nodes} nodes, {total_edges} edges")
+         if all_errors:
+             self.log(f"Errors ({len(all_errors)}):")
+             for err in all_errors[:10]:
+                 self.log(f"  {err.get('path', 'unknown')}: {err.get('error', 'unknown error')}")
+             if len(all_errors) > 10:
+                 self.log(f"  ... and {len(all_errors) - 10} more errors")
+
+         return True
+
+     def _send_batch_stream(
+         self,
+         files: list[dict],
+         parent_progress: Optional[Progress] = None
+     ) -> Optional[IndexProgress]:
+         """Send a batch using streaming endpoint, return final result."""
+         final_result = None
+
+         try:
+             for event in self.client.index_files_stream(
+                 self.config.project_id, files
+             ):
+                 if event.stage == "complete":
+                     final_result = event
+                     break
+         except httpx.HTTPStatusError as e:
+             logger.error(f"Stream error: {e.response.status_code}")
+             # Try sync fallback
+             try:
+                 result = self.client.index_files(self.config.project_id, files)
+                 final_result = IndexProgress(
+                     stage="complete",
+                     message="Done",
+                     current=len(files),
+                     total=len(files),
+                     files_indexed=result.files_indexed,
+                     files_skipped=result.files_skipped,
+                     nodes_created=result.nodes_created,
+                     edges_created=result.edges_created,
+                     errors=result.errors,
+                 )
+             except Exception as e2:
+                 logger.error(f"Sync fallback also failed: {e2}")
+                 raise
+         except httpx.RequestError as e:
+             logger.error(f"Request error: {e}")
+             raise
+
+         return final_result
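+     # Fallback ladder: streaming endpoint first; on an HTTP status error, the
+     # one-shot index_files() call; if that also fails, the exception propagates
+     # to the per-batch handler in _upload_files_batched().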
+
+     def initial_scan(
+         self,
+         root_path: Path,
+         ignore_check: Callable[[Path], bool]
+     ):
+         """
+         Full scan and sync with backend.
+
+         This is the main entry point for watch --force or sync command.
+         """
+         # Step 1: Scan files locally
+         files_info = self.scan_files(root_path, ignore_check)
+
+         # Populate file cache so watch mode can skip unchanged files
+         for f in files_info:
+             self._file_cache[f["path"]] = {
+                 "hash": f["hash"],
+                 "language": f["language"],
+             }
+
+         # Step 2: Send to backend (includes hashes for server-side dedup)
+         self.sync_files(files_info)
+
+     def delete_file_immediate(self, path: Path) -> bool:
+         """Delete a file's nodes from the graph (for watch mode)."""
+         language = get_language(path)
+         if not language:
+             return False
+
+         rel_path = str(path.relative_to(self.root_path))
+
+         try:
+             deleted = self.client.delete_files(self.config.project_id, [rel_path])
+             # Remove from cache
+             self._file_cache.pop(rel_path, None)
+             return deleted > 0
+         except Exception as e:
+             logger.error(f"Failed to delete {rel_path}: {e}")
+             return False
+
+     def sync_file_immediate(self, path: Path) -> bool:
+         """
+         Sync a single file immediately (for watch mode).
+
+         Returns True if file was synced.
+         """
+         language = get_language(path)
+         if not language:
+             return False
+
+         try:
+             if path.stat().st_size > MAX_SINGLE_FILE_BYTES:
+                 return False
+         except OSError:
+             return False
+
+         try:
+             content = path.read_text(encoding="utf-8")
+         except (OSError, UnicodeDecodeError):
+             return False
+
+         rel_path = str(path.relative_to(self.root_path))
+
+         # Check if content changed from our cache
+         content_hash = self.compute_hash(content)
+         cached = self._file_cache.get(rel_path)
+
+         if cached and cached.get("hash") == content_hash:
+             return False  # No change
+
+         # Update cache
+         self._file_cache[rel_path] = {
+             "hash": content_hash,
+             "content": content,
+             "language": language,
+         }
+
+         # Send to backend
+         files = [{"path": rel_path, "content": content, "language": language, "hash": content_hash}]
+
+         try:
+             result = self.client.index_files(self.config.project_id, files)
+             return result.files_indexed > 0
+         except Exception as e:
+             logger.error(f"Failed to sync {rel_path}: {e}")
+             return False
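+
+
+ # Minimal driver sketch (illustrative only - how Config is obtained and the
+ # ignore predicate are assumptions, not part of this module):
+ #
+ #   config = Config(...)  # constructed/loaded elsewhere
+ #   svc = SyncService(config, root_path=Path("."))
+ #   svc.initial_scan(Path("."), ignore_check=lambda p: ".git" in p.parts)
+ #   svc.close()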