rosetta-cli 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,859 @@
1
+ """
2
+ IMS Publisher Module
3
+
4
+ Reads knowledge base content files and publishes them to RAGFlow with automatic metadata extraction
5
+ from the folder structure.
6
+
7
+ Features:
8
+ - RAGFlow SDK integration for document upload and management
9
+ - Tag-in-title format: [tag1][tag2] filename.ext
10
+ - Preserves dots in filenames (e.g., "agents.md" stays "agents.md")
11
+ - Two-location tag storage: title + meta_fields for optimal search performance
12
+ - Dataset-based organization
13
+ - MD5 hash-based change detection
14
+ """
15
+
16
16
+ import re
21
+ from collections import defaultdict
22
+ from dataclasses import dataclass
23
+ from pathlib import Path
24
+ from typing import cast
25
+
26
+ from .services.document_service import DocumentService
27
+ from .services.document_data import DocumentData
28
+ from .ragflow_client import DocumentMetadata, RAGFlowClient, RAGFlowClientError
29
+ from .typing_utils import DocumentLike, JsonDict
30
+
31
+ # Extensions RAGFlow can actually parse (from ragflow source: api/utils/file_utils.py).
32
+ # Files with other extensions are uploaded (server stores them) but must NOT be sent to parsing.
33
+ RAGFLOW_PARSABLE_EXTENSIONS = {
34
+ # Documents
35
+ ".pdf", ".doc", ".docx", ".ppt", ".pptx", ".pages",
36
+ ".xls", ".xlsx", ".csv",
37
+ # Text / Markdown / Code (parsed via TxtParser / MarkdownParser)
38
+ ".md", ".mdx", ".txt",
39
+ ".py", ".js", ".java", ".c", ".cpp", ".h", ".php", ".go",
40
+ ".ts", ".sh", ".cs", ".kt", ".sql",
41
+ # Web / Config / Data
42
+ ".htm", ".html", ".json", ".jsonl", ".ldjson",
43
+ ".ini",
44
+ # Email
45
+ ".msg", ".eml",
46
+ }
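+ # Usage sketch: both publish paths gate parsing on this set, e.g.
+ #
+ #     if Path("agents.md").suffix.lower() in RAGFLOW_PARSABLE_EXTENSIONS:
+ #         ...  # safe to trigger RAGFlow parsing
+ #     else:
+ #         ...  # upload only; the server stores the file unparsed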
47
+
48
+
49
+ @dataclass
50
+ class PublishResult:
51
+ """Result of publishing a single content file."""
52
+
53
+ success: bool
54
+ document_id: str
55
+ file_path: str
56
+ tags: list[str]
57
+ dataset_id: str = "" # Dataset ID where document was uploaded
58
+ error: str | None = None
59
+ skipped: bool = False # True if skipped due to no changes
60
+
61
+ def __str__(self) -> str:
62
+ """String representation of result."""
63
+ if self.skipped:
64
+ return f"⊘ {self.file_path} (unchanged)"
65
+
66
+ status = "✓" if self.success else "✗"
67
+ if self.success:
68
+ return f"{status} {self.file_path} → {self.document_id} (tags: {', '.join(self.tags)})"
69
+ else:
70
+ return f"{status} {self.file_path} → Error: {self.error}"
71
+
72
+
73
+ class ContentPublisher:
74
+ """Publishes knowledge base content files to RAGFlow with metadata extraction."""
75
+
76
+ def __init__(
77
+ self,
78
+ client: RAGFlowClient,
79
+ workspace_root: str,
80
+ dataset_default: str = "aia",
81
+ dataset_template: str = "aia-{release}",
82
+ enable_change_tracking: bool = True,
83
+ file_extensions: list[str] | None = None
84
+ ):
85
+ """
86
+ Initialize the publisher.
87
+
88
+ Args:
89
+ client: RAGFlow client instance
90
+ workspace_root: Root directory of the workspace
91
+ dataset_default: Default dataset name for docs without release
92
+ dataset_template: Dataset name template (can use {release} placeholder)
93
+ enable_change_tracking: Enable hash-based change detection (default: True)
94
+ file_extensions: List of file extensions to publish (default: None = all files)
95
+ """
96
+ self.client = client
97
+ self.workspace_root = Path(workspace_root).resolve()
98
+ self.dataset_default = dataset_default
99
+ self.dataset_template = dataset_template
100
+ self.enable_change_tracking = enable_change_tracking
101
+ self.file_extensions = file_extensions # None = all files (no extension filter)
102
+ self._skip_names = {'.DS_Store', 'Thumbs.db', '.gitkeep', '.mcp.json'}
103
+ self._skip_folders = {'.cursor-plugin', '.claude-plugin'}
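+ # Construction sketch (the RAGFlowClient arguments are assumptions; see
+ # ragflow_client.py for the real signature):
+ #
+ #     client = RAGFlowClient(base_url="http://localhost:9380", api_key="...")
+ #     publisher = ContentPublisher(
+ #         client,
+ #         workspace_root="/path/to/workspace",
+ #         dataset_template="aia-{release}",
+ #         file_extensions=[".md", ".txt"],  # None would publish every file
+ #     )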
104
+
105
+ def publish_folder(
106
+ self,
107
+ folder_path: str,
108
+ dry_run: bool = False,
109
+ recursive: bool = True,
110
+ force: bool = False,
111
+ parse_documents: bool = True,
112
+ wait_for_parsing: bool = True
113
+ ) -> list[PublishResult]:
114
+ """
115
+ Publish all content files in a folder to RAGFlow.
116
+
117
+ Args:
118
+ folder_path: Path to folder containing content files
119
+ dry_run: If True, only simulate publishing
120
+ recursive: If True, include subfolders
121
+ force: If True, force republish ignoring change detection
122
+ parse_documents: If True, trigger parsing after upload (default: True)
123
+ wait_for_parsing: If True, wait for parsing to complete (default: True)
124
+
125
+ Returns:
126
+ List of PublishResult for each file
127
+ """
128
+ folder = Path(folder_path).resolve()
129
+
130
+ if not folder.exists() or not folder.is_dir():
131
+ print(f"✗ Folder not found: {folder}")
132
+ return []
133
+
134
+ # Find all content files
135
+ files: list[Path] = []
136
+ if self.file_extensions:
137
+ # Explicit extension filter
138
+ for ext in self.file_extensions:
139
+ if recursive:
140
+ files.extend(folder.rglob(f"*{ext}"))
141
+ else:
142
+ files.extend(folder.glob(f"*{ext}"))
143
+ else:
144
+ # All files (skip junk and plugin folders)
145
+ glob_iter = folder.rglob("*") if recursive else folder.glob("*")
146
+ files = [
147
+ f for f in glob_iter
148
+ if f.is_file()
149
+ and f.name not in self._skip_names
150
+ and not any(part in self._skip_folders for part in f.parts)
151
+ ]
152
+
153
+ if not files:
154
+ print(f"No content files found in {folder}")
155
+ return []
156
+
157
+ # Sort files for consistent ordering
158
+ files = sorted(files)
159
+
160
+ print(f"Found {len(files)} file(s) to publish")
161
+ if dry_run:
162
+ print("DRY RUN MODE - No actual publishing will occur")
163
+ if force:
164
+ print("FORCE MODE - Republishing all files regardless of changes")
165
+ print()
166
+
167
+ # Build caches for all files upfront (read ONCE, extract ONCE, hash ONCE)
168
+ print("Reading files...")
169
+ all_caches = [] # ALL successfully read caches (for cleanup/orphan detection)
170
+ results = []
171
+
172
+ for file in files:
173
+ try:
174
+ # Create cache (reads file, extracts metadata, calculates hash - ALL ONCE)
175
+ cache = DocumentData.from_file(
176
+ file,
177
+ self.workspace_root,
178
+ self.file_extensions,
179
+ publish_root=folder,
180
+ )
181
+ all_caches.append(cache)
182
+
183
+ except Exception as e:
184
+ print(f"✗ Error reading {file.name}: {e}")
185
+ continue
186
+
187
+ # Duplicate cleanup: remove stale server copies of the same file (before publish)
188
+ self._cleanup_duplicates(all_caches, dry_run)
189
+
190
+ # Check for changes after cleanup so skip decisions reflect current server state.
191
+ doc_caches = [] # Only changed caches (for publishing)
192
+ skipped_count = 0
193
+
194
+ if self.enable_change_tracking and not force and not dry_run:
195
+ print("\nChecking for changes...")
196
+
197
+ for cache in all_caches:
198
+ if self.enable_change_tracking and not force and not dry_run:
199
+ if self._has_content_changed_cached(cache):
200
+ doc_caches.append(cache)
201
+ else:
202
+ skipped_count += 1
203
+ print(f"⊘ Skipped (unchanged): {cache.doc_title}")
204
+ # Add skipped file to results for summary
205
+ results.append(PublishResult(
206
+ success=True,
207
+ document_id=cache.ims_doc_id,
208
+ file_path=str(cache.file_path),
209
+ tags=cache.tags,
210
+ skipped=True
211
+ ))
212
+ else:
213
+ doc_caches.append(cache)
214
+
215
+ if skipped_count > 0:
216
+ print(f"\nSkipped {skipped_count} unchanged file(s)")
217
+ print(f"Publishing {len(doc_caches)} changed file(s)\n")
218
+
219
+ current_folder = None
220
+ docs_to_parse = []
221
+
222
+ for cache in doc_caches: # Iterate over caches, not files
223
+ # Print folder header when entering new folder
224
+ file_folder = str(cache.file_path.parent.relative_to(folder))
225
+ if file_folder != current_folder:
226
+ if current_folder is not None:
227
+ print() # Blank line between folders
228
+ folder_display = file_folder if file_folder != "." else "<root>"
229
+ # Show path relative to workspace root for cleaner output
230
+ folder_relative = folder.relative_to(self.workspace_root)
231
+ print(f"{folder_relative}" if file_folder == "." else f"{folder_relative}/{file_folder}")
232
+ current_folder = file_folder
233
+
234
+ # Upload file (skip redundant change check - already verified above)
235
+ result = self.publish_file(
236
+ cache=cache,
237
+ dry_run=dry_run,
238
+ force=force,
239
+ parse_documents=False,
240
+ skip_change_check=True
241
+ )
242
+ results.append(result)
243
+
244
+ # Collect documents for batch parsing — only parsable extensions
245
+ if parse_documents and result.success and not result.skipped and not dry_run:
246
+ if cache.file_path.suffix.lower() in RAGFLOW_PARSABLE_EXTENSIONS:
247
+ docs_to_parse.append({
248
+ "id": result.document_id,
249
+ "name": cache.file_path.name,
250
+ "dataset_id": result.dataset_id,
251
+ "folder": str(cache.file_path.parent.relative_to(folder))
252
+ })
253
+ else:
254
+ print(f" ⊘ Skipping parse (unsupported extension): {cache.doc_title}")
255
+
256
+ # Batch parse all uploaded documents at once
257
+ if parse_documents and not dry_run and docs_to_parse:
258
+ print(f"\nStarting parsing for {len(docs_to_parse)} document(s)...")
259
+ self._parse_documents(docs_to_parse, wait_for_completion=False, silent=True)
260
+ if wait_for_parsing:
261
+ self._wait_for_all_parsing_with_progress(docs_to_parse)
262
+
263
+ # Print summary
264
+ self._print_summary(results, dry_run)
265
+
266
+ # Orphan cleanup: only safe when publishing the full instructions root.
267
+ # When publishing a subfolder, local caches cover only a subset of server
268
+ # docs — running orphan detection would delete everything else.
269
+ is_full_publish = folder.name == "instructions" or folder.parent == self.workspace_root
270
+ if is_full_publish:
271
+ managed_domains_by_dataset: dict[str, set[str]] = {}
272
+ for cache in all_caches:
273
+ dataset_name = self._resolve_dataset_name({"release": cache.release})
274
+ managed_domains_by_dataset.setdefault(dataset_name, set()).add(cache.domain)
275
+ self._cleanup_orphans(
276
+ all_caches,
277
+ dry_run,
278
+ managed_domains_by_dataset=managed_domains_by_dataset,
279
+ )
280
+ else:
281
+ print("\nOrphan detection skipped (subfolder publish)")
282
+
283
+ return results
284
+
285
+ def publish_file(
286
+ self,
287
+ file_path: str | None = None,
288
+ metadata: JsonDict | None = None,
289
+ cache: DocumentData | None = None,
290
+ dry_run: bool = False,
291
+ force: bool = False,
292
+ parse_documents: bool = True,
293
+ wait_for_parsing: bool = True,
294
+ skip_change_check: bool = False
295
+ ) -> PublishResult:
296
+ """
297
+ Publish a single content file to RAGFlow.
298
+
299
+ Args:
300
+ file_path: Path to content file (legacy, for backward compatibility)
301
+ metadata: Optional metadata override (auto-extracted if not provided)
302
+ cache: Pre-built DocumentData (preferred, avoids re-reading)
303
+ dry_run: If True, only simulate publishing
304
+ force: If True, force republish ignoring change detection
305
+ parse_documents: If True, trigger parsing after upload (default: True)
306
+ wait_for_parsing: If True, wait for parsing to complete (default: True)
307
+ skip_change_check: If True, skip change verification (already verified)
308
+
309
+ Returns:
310
+ PublishResult with outcome
311
+ """
312
+ # If cache provided, use it (no re-reading, no re-calculation)
313
+ if cache:
314
+ file = cache.file_path
315
+ ims_doc_id = cache.ims_doc_id
316
+ content = cache.content
317
+ content_str = cache.content_str
318
+ is_text = cache.is_text
319
+
320
+ # Build metadata dict from cache if not provided
321
+ if metadata is None:
322
+ metadata = cache.to_metadata_dict()
323
+ else:
324
+ # Legacy path: file_path provided (for backward compatibility)
325
+ if not file_path:
326
+ return PublishResult(
327
+ success=False,
328
+ document_id="",
329
+ file_path="",
330
+ tags=[],
331
+ error="Either file_path or cache must be provided"
332
+ )
333
+
334
+ file = Path(file_path).resolve()
335
+
336
+ # Validate file exists
337
+ if not file.exists():
338
+ return PublishResult(
339
+ success=False,
340
+ document_id="",
341
+ file_path=str(file),
342
+ tags=[],
343
+ error="File not found"
344
+ )
345
+
346
+ # Create cache on-the-fly (still better than reading multiple times)
347
+ cache = DocumentData.from_file(
348
+ file,
349
+ self.workspace_root,
350
+ self.file_extensions,
351
+ publish_root=file.parent,
352
+ )
353
+
354
+ ims_doc_id = cache.ims_doc_id
355
+ content = cache.content
356
+ content_str = cache.content_str
357
+ is_text = cache.is_text
358
+
359
+ if metadata is None:
360
+ metadata = cache.to_metadata_dict()
361
+
362
+ # Determine dataset name from template and metadata
363
+ dataset_name = self._resolve_dataset_name(metadata)
364
+
365
+ try:
366
+ # Skip empty text files
367
+ if is_text and (not content_str or content_str.strip() == ""):
368
+ return PublishResult(
369
+ success=False,
370
+ document_id=ims_doc_id,
371
+ file_path=str(file),
372
+ tags=metadata.get('tags', []),
373
+ error="File is empty - skipping"
374
+ )
375
+
376
+ # Check if changed (only if not already verified)
377
+ if not skip_change_check and self.enable_change_tracking and not force and not dry_run:
378
+ if not self._has_content_changed_cached(cache):
379
+ print(f"⊘ Skipped (unchanged): {file.name}")
380
+ return PublishResult(
381
+ success=True,
382
+ document_id=ims_doc_id,
383
+ file_path=str(file),
384
+ tags=metadata.get('tags', []),
385
+ skipped=True
386
+ )
387
+
388
+ # Add file size for binary files
389
+ if not is_text:
390
+ metadata['file_size'] = len(content)
391
+
393
+ if dry_run:
394
+ print(f"[DRY RUN] Would publish: {file.name}")
395
+ print(f" Document ID: {ims_doc_id}")
396
+ print(f" Dataset: {dataset_name}")
397
+ print(f" File type: {'text' if is_text else 'binary'}")
398
+ print(f" Metadata: {metadata}")
399
+ if is_text and content_str:
400
+ print(f" Content size: {len(content_str)} characters")
401
+ else:
402
+ print(f" File size: {metadata.get('file_size', 0)} bytes")
403
+
404
+ return PublishResult(
405
+ success=True,
406
+ document_id=ims_doc_id,
407
+ file_path=str(file),
408
+ tags=metadata.get('tags', [])
409
+ )
410
+
411
+ original_path = metadata.get("original_path", "")
412
+
413
+ # Create DocumentMetadata for RAGFlow
414
+ doc_metadata = DocumentMetadata(
415
+ tags=cast(list[str], metadata.get('tags', [])),
416
+ domain=str(metadata.get('domain', 'general')),
417
+ release=str(metadata.get('release', '')),
418
+ content_hash=str(metadata.get('content_hash', '')),
419
+ ims_doc_id=ims_doc_id,
420
+ original_path=str(original_path),
421
+ resource_path=cast(str | None, metadata.get("resource_path")),
422
+ sort_order=cast(int | None, metadata.get("sort_order")),
423
+ frontmatter=cast(JsonDict | None, metadata.get("frontmatter")),
424
+ line_count=cast(int | None, metadata.get("line_count")),
425
+ doc_title=str(metadata.get('doc_title', file.name))
426
+ )
427
+
428
+ # Upload to RAGFlow using pre-read content from cache (no re-reading!)
429
+ result = self.client.upload_document(
430
+ file_path=file,
431
+ metadata=doc_metadata,
432
+ dataset_name=dataset_name,
433
+ dataset_template=self.dataset_template,
434
+ force=force,
435
+ content=content # Pass pre-read content from cache
436
+ )
437
+
438
+ # None means document was skipped (unchanged)
439
+ if result is None:
440
+ return PublishResult(
441
+ success=True,
442
+ document_id=ims_doc_id,
443
+ file_path=str(file),
444
+ tags=metadata.get('tags', []),
445
+ dataset_id="",
446
+ skipped=True
447
+ )
448
+
449
+ doc, dataset_id = result
450
+ # Use RAGFlow's internal doc.id for parsing (not ims_doc_id)
451
+ publish_result = PublishResult(
452
+ success=True,
453
+ document_id=doc.id, # RAGFlow internal ID for parsing
454
+ file_path=str(file),
455
+ tags=metadata.get('tags', []),
456
+ dataset_id=dataset_id
457
+ )
458
+
459
+ # Trigger parsing if enabled — only for parsable extensions
460
+ if parse_documents and not dry_run:
461
+ if file.suffix.lower() in RAGFLOW_PARSABLE_EXTENSIONS:
462
+ documents = [{
463
+ "id": doc.id,
464
+ "name": file.name,
465
+ "dataset_id": dataset_id
466
+ }]
467
+ self._parse_documents(
468
+ documents,
469
+ wait_for_completion=wait_for_parsing
470
+ )
471
+ else:
472
+ print(f" ⊘ Skipping parse (unsupported extension): {file.name}")
473
+
474
+ return publish_result
475
+
476
+ except RAGFlowClientError as e:
477
+ return PublishResult(
478
+ success=False,
479
+ document_id="",
480
+ file_path=str(file),
481
+ tags=[],
482
+ error=str(e)
483
+ )
484
+ except Exception as e:
485
+ return PublishResult(
486
+ success=False,
487
+ document_id="",
488
+ file_path=str(file),
489
+ tags=[],
490
+ error=str(e)
491
+ )
492
+
493
+ def _parse_documents(
494
+ self,
495
+ documents: list[JsonDict],
496
+ wait_for_completion: bool = True,
497
+ silent: bool = False
498
+ ) -> None:
499
+ """
500
+ Trigger parsing for one or more documents.
501
+
502
+ Args:
503
+ documents: List of {"id": doc_id, "name": name, "dataset_id": dataset_id}
504
+ wait_for_completion: Wait for parsing to complete
505
+ silent: If True, don't print progress messages (for streaming mode)
506
+ """
507
+ # Delegate to client's batch parsing method
508
+ result = self.client.parse_documents_batch(documents, silent=silent)
509
+
510
+ # Wait for completion if requested (only for successful datasets)
511
+ if wait_for_completion and not silent and result["success"]:
512
+ # Filter to only successfully triggered documents
513
+ docs_to_wait = [
514
+ doc for doc in documents
515
+ if doc["dataset_id"] in result["success"]
516
+ ]
517
+
518
+ if docs_to_wait:
519
+ # Use DocumentService for consistent waiting with progress bar
520
+ doc_service = DocumentService(self.client)
521
+ doc_service.wait_for_parsing(docs_to_wait)
522
+
523
+ def _wait_for_all_parsing_with_progress(
524
+ self,
525
+ documents: list[JsonDict],
526
+ timeout: int = 300, # 5 minutes
527
+ poll_interval: float = 0.5
528
+ ) -> None:
529
+ """
530
+ Wait for all documents to finish parsing with clean progress display grouped by folder.
531
+
532
+ Delegates to DocumentService.wait_for_parsing() for reusable implementation.
533
+
534
+ Args:
535
+ documents: List of {"id": doc_id, "name": name, "dataset_id": dataset_id, "folder": folder}
536
+ timeout: Max seconds to wait
537
+ poll_interval: Seconds between status checks (reduced to 0.5 for smoother progress)
538
+ """
539
+ doc_service = DocumentService(self.client)
540
+ doc_service.wait_for_parsing(documents, timeout, poll_interval)
541
+
542
+ def _has_content_changed_cached(self, cache: DocumentData) -> bool:
543
+ """
544
+ Check if document content has changed using pre-calculated hash.
545
+
546
+ Args:
547
+ cache: DocumentData with pre-calculated hash
548
+
549
+ Returns:
550
+ True if changed or new, False if unchanged
551
+ """
552
+ try:
553
+ # Resolve dataset name from cache metadata
554
+ dataset_name = self._resolve_dataset_name({
555
+ 'release': cache.release,
556
+ 'tags': cache.tags,
557
+ 'domain': cache.domain
558
+ })
559
+
560
+ # Get dataset
561
+ dataset = self.client.get_dataset(name=dataset_name)
562
+ if not dataset:
563
+ return True # New dataset = new document
564
+
565
+ # Search for existing document by ims_doc_id
566
+ metadata_filter = {
567
+ "logic": "and",
568
+ "conditions": [{
569
+ "name": "ims_doc_id",
570
+ "comparison_operator": "is",
571
+ "value": cache.ims_doc_id
572
+ }]
573
+ }
574
+
575
+ docs = self.client.list_documents(
576
+ dataset,
577
+ page_size=1,
578
+ metadata_condition=metadata_filter
579
+ )
580
+
581
+ if not docs:
582
+ return True # Document doesn't exist
583
+
584
+ # Compare hashes (use pre-calculated hash from cache)
585
+ existing_doc = docs[0]
586
+ existing_meta = getattr(existing_doc, 'meta_fields', {}) or {}
587
+
588
+ if isinstance(existing_meta, dict):
589
+ existing_hash = existing_meta.get("content_hash")
590
+ else:
591
+ existing_hash = getattr(existing_meta, 'content_hash', None)
592
+
593
+ if not existing_hash:
594
+ return True # No hash = changed
595
+
596
+ # Compare: cache.content_hash was already calculated in DocumentData
597
+ return cache.content_hash != str(existing_hash)
598
+
599
+ except Exception as e:
600
+ print(f" Warning: Could not check existing document: {e}")
601
+ return True # Assume changed on error
602
+
603
+ def _resolve_dataset_name(self, metadata: JsonDict) -> str:
604
+ """
605
+ Resolve dataset name from template using metadata.
606
+
607
+ Args:
608
+ metadata: Document metadata containing release info
609
+
610
+ Returns:
611
+ Resolved dataset name
612
+ """
613
+ release = metadata.get('release', '')
614
+
615
+ if release:
616
+ # Use template and replace {release} placeholder
617
+ return self.dataset_template.replace('{release}', release)
618
+ else:
619
+ # No release - use default dataset
620
+ return self.dataset_default
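+ # Example resolution with the constructor defaults (dataset_template="aia-{release}",
+ # dataset_default="aia"); the release value is illustrative:
+ #
+ #     self._resolve_dataset_name({"release": "2024.2"})  # -> "aia-2024.2"
+ #     self._resolve_dataset_name({"release": ""})        # -> "aia"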
621
+
622
+ def _cleanup_duplicates(self, all_caches: list[DocumentData], dry_run: bool) -> None:
623
+ """Remove stale duplicate server docs before publishing.
624
+
625
+ For each dataset, finds server docs where the same original_path appears
626
+ more than once. Keeps the copy whose ims_doc_id matches the local file
627
+ (authoritative). If no copy matches, keeps the most recently created one
628
+ (first in desc create_time order). Deletes the rest.
629
+
630
+ Only touches documents with a non-empty original_path.
631
+ Custom documents (no original_path) are never affected.
632
+
633
+ Args:
634
+ all_caches: DocumentData for every local file in the published folder tree.
635
+ dry_run: If True, report duplicates but do not delete.
636
+ """
637
+ # Build: dataset_name -> {original_path -> canonical ims_doc_id}
638
+ canonical_by_dataset: dict[str, dict[str, str]] = {}
639
+ for cache in all_caches:
640
+ if not cache.original_path:
641
+ continue
642
+ dataset_name = self._resolve_dataset_name({"release": cache.release})
643
+ canonical_by_dataset.setdefault(dataset_name, {})[cache.original_path] = cache.ims_doc_id
644
+
645
+ if not canonical_by_dataset:
646
+ return
647
+
648
+ print("\nDuplicate detection...")
649
+ for dataset_name, canonical_ids in canonical_by_dataset.items():
650
+ dataset = self.client.get_dataset(name=dataset_name)
651
+ if not dataset:
652
+ continue
653
+
654
+ try:
655
+ all_docs = self.client.list_documents(
656
+ dataset,
657
+ page_size=self.client.page_size,
658
+ )
659
+ except Exception as e:
660
+ print(f" Warning: Could not list documents in '{dataset_name}': {e}")
661
+ continue
662
+
663
+ def _strip_duplicate_suffix(name: str) -> str:
664
+ path = Path(name)
665
+ # Avoid a backslash inside the f-string expression (a SyntaxError before Python 3.12)
+ stem = re.sub(r"\(\d+\)$", "", path.stem)
+ new_name = f"{stem}{path.suffix}"
666
+ return name[: -len(path.name)] + new_name
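+ # e.g. _strip_duplicate_suffix("guide(2).md") -> "guide.md";
+ # names without a "(n)" suffix are returned unchanged.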
667
+
668
+ managed_docs = []
669
+ for doc in all_docs:
670
+ meta = getattr(doc, "meta_fields", {}) or {}
671
+ original_path = meta.get("original_path", "") if isinstance(meta, dict) else getattr(meta, "original_path", "")
672
+ if original_path:
673
+ managed_docs.append(doc)
674
+
675
+ # Group by original_path; list order is desc create_time (most recent first)
676
+ by_path: dict[str, list[DocumentLike]] = defaultdict(list)
677
+ for doc in managed_docs:
678
+ meta = getattr(doc, "meta_fields", {}) or {}
679
+ original_path = meta.get("original_path", "") if isinstance(meta, dict) else getattr(meta, "original_path", "")
680
+ if original_path:
681
+ by_path[original_path].append(doc)
682
+
683
+ duplicates = [] # (doc, original_path) to delete
684
+ for original_path, docs in by_path.items():
685
+ if len(docs) <= 1:
686
+ continue
687
+
688
+ canonical_ims_doc_id = canonical_ids.get(original_path)
689
+ # Find the authoritative doc (matching local ims_doc_id)
690
+ authoritative: DocumentLike | None = None
691
+ if canonical_ims_doc_id:
692
+ for doc in docs:
693
+ meta = getattr(doc, "meta_fields", {}) or {}
694
+ doc_ims_id = meta.get("ims_doc_id", "") if isinstance(meta, dict) else getattr(meta, "ims_doc_id", "")
695
+ if doc_ims_id == canonical_ims_doc_id:
696
+ authoritative = doc
697
+ break
698
+
699
+ # Fallback: keep most recent (first in desc order)
700
+ if authoritative is None:
701
+ authoritative = docs[0]
702
+
703
+ for doc in docs:
704
+ if doc.id != authoritative.id:
705
+ duplicates.append((doc, original_path))
706
+
707
+ # Name duplicates: foo.md + foo(1).md + foo(2).md ...
708
+ name_groups: dict[str, list[DocumentLike]] = defaultdict(list)
709
+ for doc in all_docs:
710
+ doc_name = getattr(doc, "name", "") or ""
711
+ if not doc_name:
712
+ continue
713
+ name_groups[_strip_duplicate_suffix(doc_name)].append(doc)
714
+
715
+ for doc_name_stripped, docs in name_groups.items():
716
+ if len(docs) > 1:
717
+ for doc in docs:
718
+ duplicates.append((doc, getattr(doc, "name", "") or ""))
719
+
720
+ if not duplicates:
721
+ print(f" '{dataset_name}': no duplicates")
722
+ continue
723
+
724
+ print(f" '{dataset_name}': {len(duplicates)} duplicate(s)")
725
+ seen_doc_ids = set()
726
+ for doc, original_path in duplicates:
727
+ if doc.id in seen_doc_ids:
728
+ continue
729
+ seen_doc_ids.add(doc.id)
730
+ if dry_run:
731
+ print(f" [DRY RUN] Would delete duplicate: {original_path}")
732
+ else:
733
+ try:
734
+ dataset.delete_documents([doc.id])
735
+ print(f" Deleted duplicate: {original_path}")
736
+ except Exception as e:
737
+ print(f" Warning: Failed to delete duplicate '{original_path}': {e}")
738
+
739
+ def _cleanup_orphans(
740
+ self,
741
+ all_caches: list[DocumentData],
742
+ dry_run: bool,
743
+ managed_domains_by_dataset: dict[str, set[str]] | None = None,
744
+ ) -> None:
745
+ """Delete server documents whose original_path is no longer present locally.
746
+
747
+ Only touches documents that have a non-empty original_path metadata field.
748
+ Custom documents (uploaded via put_document, no original_path) are NEVER deleted.
749
+
750
+ Args:
751
+ all_caches: DocumentData for every local file in the published folder tree.
752
+ dry_run: If True, report orphans but do not delete.
+ managed_domains_by_dataset: Optional map of dataset name to the set of domains
+ covered by this publish run; docs in other domains are never treated as orphans.
753
+ """
754
+ # Build: dataset_name -> set of local original_paths
755
+ local_paths_by_dataset: dict[str, set[str]] = {}
756
+ for cache in all_caches:
757
+ if not cache.original_path:
758
+ continue
759
+ dataset_name = self._resolve_dataset_name({"release": cache.release})
760
+ local_paths_by_dataset.setdefault(dataset_name, set()).add(cache.original_path)
761
+
762
+ if not local_paths_by_dataset:
763
+ return
764
+
765
+ print("\nOrphan detection...")
766
+ for dataset_name, local_paths in local_paths_by_dataset.items():
767
+ managed_domains = (
768
+ managed_domains_by_dataset.get(dataset_name, set())
769
+ if managed_domains_by_dataset
770
+ else set()
771
+ )
772
+ dataset = self.client.get_dataset(name=dataset_name)
773
+ if not dataset:
774
+ continue
775
+
776
+ # Fetch only publisher-managed docs (non-empty original_path) via server-side filter.
777
+ # Custom documents (no original_path) are skipped at the query level.
778
+ try:
779
+ managed_docs = self.client.list_documents(
780
+ dataset,
781
+ page_size=self.client.page_size,
782
+ metadata_condition={
783
+ "logic": "and",
784
+ "conditions": [{
785
+ "name": "original_path",
786
+ "comparison_operator": "not empty",
787
+ "value": ""
788
+ }]
789
+ }
790
+ )
791
+ except Exception as e:
792
+ print(f" Warning: Could not list documents in '{dataset_name}': {e}")
793
+ continue
794
+
795
+ orphans = []
796
+ for doc in managed_docs:
797
+ meta = getattr(doc, "meta_fields", {}) or {}
798
+ if isinstance(meta, dict):
799
+ original_path = meta.get("original_path", "")
800
+ domain = meta.get("domain", "")
801
+ else:
802
+ original_path = getattr(meta, "original_path", "")
803
+ domain = getattr(meta, "domain", "")
804
+
805
+ if not original_path:
806
+ continue
807
+
808
+ if managed_domains and domain not in managed_domains:
809
+ continue
810
+
811
+ if original_path not in local_paths:
812
+ orphans.append((doc, original_path))
813
+
814
+ if not orphans:
815
+ print(f" '{dataset_name}': no orphans")
816
+ continue
817
+
818
+ print(f" '{dataset_name}': {len(orphans)} orphan(s)")
819
+ for doc, original_path in orphans:
820
+ if dry_run:
821
+ print(f" [DRY RUN] Would delete: {original_path}")
822
+ else:
823
+ try:
824
+ dataset.delete_documents([doc.id])
825
+ print(f" Deleted orphan: {original_path}")
826
+ except Exception as e:
827
+ print(f" Warning: Failed to delete orphan '{original_path}': {e}")
828
+
829
+ @staticmethod
830
+ def _print_summary(results: list[PublishResult], dry_run: bool = False) -> None:
831
+ """
832
+ Print summary of publishing results.
833
+
834
+ Args:
835
+ results: List of PublishResult
836
+ dry_run: Whether this was a dry run
837
+ """
838
+ successful = [r for r in results if r.success and not r.skipped]
839
+ skipped = [r for r in results if r.skipped]
840
+ failed = [r for r in results if not r.success]
841
+
842
+ print("\n" + "="*60)
843
+ if dry_run:
844
+ print("DRY RUN SUMMARY")
845
+ else:
846
+ print("PUBLISHING SUMMARY")
847
+ print("="*60)
848
+
849
+ print(f"Total files: {len(results)}")
850
+ print(f"✓ Successful: {len(successful)}")
851
+ print(f"⊘ Skipped (unchanged): {len(skipped)}")
852
+ print(f"✗ Failed: {len(failed)}")
853
+
854
+ if failed:
855
+ print("\nFailed files:")
856
+ for result in failed:
857
+ print(f" {result}")
858
+
859
+ print("="*60)