rosetta_cli-2.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rosetta_cli/__init__.py +12 -0
- rosetta_cli/__main__.py +6 -0
- rosetta_cli/cli.py +379 -0
- rosetta_cli/commands/__init__.py +5 -0
- rosetta_cli/commands/base_command.py +82 -0
- rosetta_cli/commands/cleanup_command.py +214 -0
- rosetta_cli/commands/list_command.py +70 -0
- rosetta_cli/commands/parse_command.py +205 -0
- rosetta_cli/commands/publish_command.py +113 -0
- rosetta_cli/commands/verify_command.py +46 -0
- rosetta_cli/ims_auth.py +124 -0
- rosetta_cli/ims_config.py +317 -0
- rosetta_cli/ims_publisher.py +859 -0
- rosetta_cli/ims_utils.py +28 -0
- rosetta_cli/ragflow_client.py +928 -0
- rosetta_cli/services/__init__.py +8 -0
- rosetta_cli/services/auth_service.py +114 -0
- rosetta_cli/services/dataset_service.py +72 -0
- rosetta_cli/services/document_data.py +408 -0
- rosetta_cli/services/document_service.py +357 -0
- rosetta_cli/typing_utils.py +49 -0
- rosetta_cli-2.0.0.dist-info/METADATA +639 -0
- rosetta_cli-2.0.0.dist-info/RECORD +26 -0
- rosetta_cli-2.0.0.dist-info/WHEEL +5 -0
- rosetta_cli-2.0.0.dist-info/entry_points.txt +2 -0
- rosetta_cli-2.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,859 @@
"""
IMS Publisher Module

Reads knowledge base content files and publishes them to RAGFlow with automatic metadata extraction
from folder structure.

Features:
- RAGFlow SDK integration for document upload and management
- Tag-in-title format: [tag1][tag2] filename.ext
- Preserves dots in filenames (e.g., "agents.md" stays "agents.md")
- Two-location tag storage: title + meta_fields for optimal search performance
- Dataset-based organization
- MD5 hash-based change detection
"""

import hashlib
import json
import re
import time
import uuid
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import cast

from .services.document_service import DocumentService
from .services.document_data import DocumentData
from .ragflow_client import DocumentMetadata, RAGFlowClient, RAGFlowClientError
from .typing_utils import DocumentLike, JsonDict

# Extensions RAGFlow can actually parse (from ragflow source: api/utils/file_utils.py).
# Files with other extensions are uploaded (server stores them) but must NOT be sent to parsing.
RAGFLOW_PARSABLE_EXTENSIONS = {
    # Documents
    ".pdf", ".doc", ".docx", ".ppt", ".pptx", ".pages",
    ".xls", ".xlsx", ".csv",
    # Text / Markdown / Code (parsed via TxtParser / MarkdownParser)
    ".md", ".mdx", ".txt",
    ".py", ".js", ".java", ".c", ".cpp", ".h", ".php", ".go",
    ".ts", ".sh", ".cs", ".kt", ".sql",
    # Web / Config / Data
    ".htm", ".html", ".json", ".jsonl", ".ldjson",
    ".ini",
    # Email
    ".msg", ".eml",
}

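# Illustration of the "[tag1][tag2] filename.ext" title convention listed in the module
# docstring. Tag values here are hypothetical; the actual title assembly presumably happens
# where document metadata is built, not in this constant:
#
#     tags = ["python", "howto"]
#     "".join(f"[{t}]" for t in tags) + " agents.md"   # -> "[python][howto] agents.md"
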
@dataclass
class PublishResult:
    """Result of publishing a single content file."""

    success: bool
    document_id: str
    file_path: str
    tags: list[str]
    dataset_id: str = ""  # Dataset ID where document was uploaded
    error: str | None = None
    skipped: bool = False  # True if skipped due to no changes

    def __str__(self) -> str:
        """String representation of result."""
        if self.skipped:
            return f"⊘ {self.file_path} (unchanged)"

        status = "✓" if self.success else "✗"
        if self.success:
            return f"{status} {self.file_path} → {self.document_id} (tags: {', '.join(self.tags)})"
        else:
            return f"{status} {self.file_path} → Error: {self.error}"


class ContentPublisher:
    """Publishes knowledge base content files to RAGFlow with metadata extraction."""

    def __init__(
        self,
        client: RAGFlowClient,
        workspace_root: str,
        dataset_default: str = "aia",
        dataset_template: str = "aia-{release}",
        enable_change_tracking: bool = True,
        file_extensions: list[str] | None = None
    ):
        """
        Initialize the publisher.

        Args:
            client: RAGFlow client instance
            workspace_root: Root directory of the workspace
            dataset_default: Default dataset name for docs without release
            dataset_template: Dataset name template (can use {release} placeholder)
            enable_change_tracking: Enable hash-based change detection (default: True)
            file_extensions: List of file extensions to publish (default: None = all files)
        """
        self.client = client
        self.workspace_root = Path(workspace_root).resolve()
        self.dataset_default = dataset_default
        self.dataset_template = dataset_template
        self.enable_change_tracking = enable_change_tracking
        self.file_extensions = file_extensions  # None = all files (no extension filter)
        self._skip_names = {'.DS_Store', 'Thumbs.db', '.gitkeep', '.mcp.json'}
        self._skip_folders = {'.cursor-plugin', '.claude-plugin'}

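    # Typical call pattern (sketch only; paths and client construction are illustrative,
    # and the real wiring presumably lives in the CLI's publish command):
    #
    #     client = RAGFlowClient(...)  # configured elsewhere
    #     publisher = ContentPublisher(client, workspace_root="/path/to/workspace")
    #     publisher.publish_folder("/path/to/workspace/instructions", dry_run=True)
    #     publisher.publish_folder("/path/to/workspace/instructions")
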
    def publish_folder(
        self,
        folder_path: str,
        dry_run: bool = False,
        recursive: bool = True,
        force: bool = False,
        parse_documents: bool = True,
        wait_for_parsing: bool = True
    ) -> list[PublishResult]:
        """
        Publish all content files in a folder to RAGFlow.

        Args:
            folder_path: Path to folder containing content files
            dry_run: If True, only simulate publishing
            recursive: If True, include subfolders
            force: If True, force republish ignoring change detection
            parse_documents: If True, trigger parsing after upload (default: True)
            wait_for_parsing: If True, wait for parsing to complete (default: True)

        Returns:
            List of PublishResult for each file
        """
        folder = Path(folder_path).resolve()

        if not folder.exists() or not folder.is_dir():
            print(f"✗ Folder not found: {folder}")
            return []

        # Find all content files
        files: list[Path] = []
        if self.file_extensions:
            # Explicit extension filter
            for ext in self.file_extensions:
                if recursive:
                    files.extend(folder.rglob(f"*{ext}"))
                else:
                    files.extend(folder.glob(f"*{ext}"))
        else:
            # All files (skip junk and plugin folders)
            glob_iter = folder.rglob("*") if recursive else folder.glob("*")
            files = [
                f for f in glob_iter
                if f.is_file()
                and f.name not in self._skip_names
                and not any(part in self._skip_folders for part in f.parts)
            ]

        if not files:
            print(f"No content files found in {folder}")
            return []

        # Sort files for consistent ordering
        files = sorted(files)

        print(f"Found {len(files)} file(s) to publish")
        if dry_run:
            print("DRY RUN MODE - No actual publishing will occur")
        if force:
            print("FORCE MODE - Republishing all files regardless of changes")
        print()

        # Build caches for all files upfront (read ONCE, extract ONCE, hash ONCE)
        print("Reading files...")
        all_caches = []  # ALL successfully read caches (for cleanup/orphan detection)
        results = []

        for file in files:
            try:
                # Create cache (reads file, extracts metadata, calculates hash - ALL ONCE)
                cache = DocumentData.from_file(
                    file,
                    self.workspace_root,
                    self.file_extensions,
                    publish_root=folder,
                )
                all_caches.append(cache)

            except Exception as e:
                print(f"✗ Error reading {file.name}: {e}")
                continue

        # Duplicate cleanup: remove stale server copies of the same file (before publish)
        self._cleanup_duplicates(all_caches, dry_run)

        # Check for changes after cleanup so skip decisions reflect current server state.
        doc_caches = []  # Only changed caches (for publishing)
        skipped_count = 0

        if self.enable_change_tracking and not force and not dry_run:
            print("\nChecking for changes...")

        for cache in all_caches:
            if self.enable_change_tracking and not force and not dry_run:
                if self._has_content_changed_cached(cache):
                    doc_caches.append(cache)
                else:
                    skipped_count += 1
                    print(f"⊘ Skipped (unchanged): {cache.doc_title}")
                    # Add skipped file to results for summary
                    results.append(PublishResult(
                        success=True,
                        document_id=cache.ims_doc_id,
                        file_path=str(cache.file_path),
                        tags=cache.tags,
                        skipped=True
                    ))
            else:
                doc_caches.append(cache)

        if skipped_count > 0:
            print(f"\nSkipped {skipped_count} unchanged file(s)")
            print(f"Publishing {len(doc_caches)} changed file(s)\n")

        current_folder = None
        docs_to_parse = []

        for cache in doc_caches:  # Iterate over caches, not files
            # Print folder header when entering new folder
            file_folder = str(cache.file_path.parent.relative_to(folder))
            if file_folder != current_folder:
                if current_folder is not None:
                    print()  # Blank line between folders
                folder_display = file_folder if file_folder != "." else "<root>"
                # Show path relative to workspace root for cleaner output
                folder_relative = folder.relative_to(self.workspace_root)
                print(f"{folder_relative}" if file_folder == "." else f"{folder_relative}/{file_folder}")
                current_folder = file_folder

            # Upload file (skip redundant change check - already verified above)
            result = self.publish_file(
                cache=cache,
                dry_run=dry_run,
                force=force,
                parse_documents=False,
                skip_change_check=True
            )
            results.append(result)

            # Collect documents for batch parsing — only parsable extensions
            if parse_documents and result.success and not result.skipped and not dry_run:
                if cache.file_path.suffix.lower() in RAGFLOW_PARSABLE_EXTENSIONS:
                    docs_to_parse.append({
                        "id": result.document_id,
                        "name": cache.file_path.name,
                        "dataset_id": result.dataset_id,
                        "folder": str(cache.file_path.parent.relative_to(folder))
                    })
                else:
                    print(f"  ⊘ Skipping parse (unsupported extension): {cache.doc_title}")

        # Batch parse all uploaded documents at once
        if parse_documents and wait_for_parsing and not dry_run:
            if docs_to_parse:
                print(f"\nStarting parsing for {len(docs_to_parse)} document(s)...")
                self._parse_documents(docs_to_parse, wait_for_completion=False, silent=True)
                self._wait_for_all_parsing_with_progress(docs_to_parse)

        # Print summary
        self._print_summary(results, dry_run)

        # Orphan cleanup: only safe when publishing the full instructions root.
        # When publishing a subfolder, local caches cover only a subset of server
        # docs — running orphan detection would delete everything else.
        is_full_publish = folder.name == "instructions" or folder.parent == self.workspace_root
        if is_full_publish:
            managed_domains_by_dataset: dict[str, set[str]] = {}
            for cache in all_caches:
                dataset_name = self._resolve_dataset_name({"release": cache.release})
                managed_domains_by_dataset.setdefault(dataset_name, set()).add(cache.domain)
            self._cleanup_orphans(
                all_caches,
                dry_run,
                managed_domains_by_dataset=managed_domains_by_dataset,
            )
        else:
            print("\nOrphan detection skipped (subfolder publish)")

        return results

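    # Shape of each docs_to_parse entry built above and handed to _parse_documents /
    # _wait_for_all_parsing_with_progress (values hypothetical):
    #
    #     {"id": "<ragflow doc id>", "name": "agents.md",
    #      "dataset_id": "<dataset id>", "folder": "api"}
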
    def publish_file(
        self,
        file_path: str | None = None,
        metadata: JsonDict | None = None,
        cache: DocumentData | None = None,
        dry_run: bool = False,
        force: bool = False,
        parse_documents: bool = True,
        wait_for_parsing: bool = True,
        skip_change_check: bool = False
    ) -> PublishResult:
        """
        Publish a single content file to RAGFlow.

        Args:
            file_path: Path to content file (legacy, for backward compatibility)
            metadata: Optional metadata override (auto-extracted if not provided)
            cache: Pre-built DocumentData (preferred, avoids re-reading)
            dry_run: If True, only simulate publishing
            force: If True, force republish ignoring change detection
            parse_documents: If True, trigger parsing after upload (default: True)
            wait_for_parsing: If True, wait for parsing to complete (default: True)
            skip_change_check: If True, skip change verification (already verified)

        Returns:
            PublishResult with outcome
        """
        # If cache provided, use it (no re-reading, no re-calculation)
        if cache:
            file = cache.file_path
            ims_doc_id = cache.ims_doc_id
            content = cache.content
            content_str = cache.content_str
            is_text = cache.is_text

            # Build metadata dict from cache if not provided
            if metadata is None:
                metadata = cache.to_metadata_dict()
        else:
            # Legacy path: file_path provided (for backward compatibility)
            if not file_path:
                return PublishResult(
                    success=False,
                    document_id="",
                    file_path="",
                    tags=[],
                    error="Either file_path or cache must be provided"
                )

            file = Path(file_path).resolve()

            # Validate file exists
            if not file.exists():
                return PublishResult(
                    success=False,
                    document_id="",
                    file_path=str(file),
                    tags=[],
                    error="File not found"
                )

            # Create cache on-the-fly (still better than reading multiple times)
            cache = DocumentData.from_file(
                file,
                self.workspace_root,
                self.file_extensions,
                publish_root=file.parent,
            )

            ims_doc_id = cache.ims_doc_id
            content = cache.content
            content_str = cache.content_str
            is_text = cache.is_text

            if metadata is None:
                metadata = cache.to_metadata_dict()

        # Determine dataset name from template and metadata
        dataset_name = self._resolve_dataset_name(metadata)

        try:
            # Skip empty text files
            if is_text and (not content_str or content_str.strip() == ""):
                return PublishResult(
                    success=False,
                    document_id=ims_doc_id,
                    file_path=str(file),
                    tags=metadata.get('tags', []),
                    error="File is empty - skipping"
                )

            # Check if changed (only if not already verified)
            if not skip_change_check and self.enable_change_tracking and not force and not dry_run:
                if not self._has_content_changed_cached(cache):
                    print(f"⊘ Skipped (unchanged): {file.name}")
                    return PublishResult(
                        success=True,
                        document_id=ims_doc_id,
                        file_path=str(file),
                        tags=metadata.get('tags', []),
                        skipped=True
                    )

            # Add file size for binary files
            if not is_text:
                metadata['file_size'] = len(content)

            if dry_run:
                print(f"[DRY RUN] Would publish: {file.name}")
                print(f"  Document ID: {ims_doc_id}")
                print(f"  Dataset: {dataset_name}")
                print(f"  File type: {'text' if is_text else 'binary'}")
                print(f"  Metadata: {metadata}")
                if is_text and content_str:
                    print(f"  Content size: {len(content_str)} characters")
                else:
                    print(f"  File size: {metadata.get('file_size', 0)} bytes")

                return PublishResult(
                    success=True,
                    document_id=ims_doc_id,
                    file_path=str(file),
                    tags=metadata.get('tags', [])
                )

            original_path = metadata.get("original_path", "")

            # Create DocumentMetadata for RAGFlow
            doc_metadata = DocumentMetadata(
                tags=cast(list[str], metadata.get('tags', [])),
                domain=str(metadata.get('domain', 'general')),
                release=str(metadata.get('release', '')),
                content_hash=str(metadata.get('content_hash', '')),
                ims_doc_id=ims_doc_id,
                original_path=str(original_path),
                resource_path=cast(str | None, metadata.get("resource_path")),
                sort_order=cast(int | None, metadata.get("sort_order")),
                frontmatter=cast(JsonDict | None, metadata.get("frontmatter")),
                line_count=cast(int | None, metadata.get("line_count")),
                doc_title=str(metadata.get('doc_title', file.name))
            )

            # Upload to RAGFlow using pre-read content from cache (no re-reading!)
            result = self.client.upload_document(
                file_path=file,
                metadata=doc_metadata,
                dataset_name=dataset_name,
                dataset_template=self.dataset_template,
                force=force,
                content=content  # Pass pre-read content from cache
            )

            # None means document was skipped (unchanged)
            if result is None:
                return PublishResult(
                    success=True,
                    document_id=ims_doc_id,
                    file_path=str(file),
                    tags=metadata.get('tags', []),
                    dataset_id="",
                    skipped=True
                )

            doc, dataset_id = result
            # Use RAGFlow's internal doc.id for parsing (not ims_doc_id)
            publish_result = PublishResult(
                success=True,
                document_id=doc.id,  # RAGFlow internal ID for parsing
                file_path=str(file),
                tags=metadata.get('tags', []),
                dataset_id=dataset_id
            )

            # Trigger parsing if enabled — only for parsable extensions
            if parse_documents and not dry_run:
                if file.suffix.lower() in RAGFLOW_PARSABLE_EXTENSIONS:
                    documents = [{
                        "id": doc.id,
                        "name": file.name,
                        "dataset_id": dataset_id
                    }]
                    self._parse_documents(
                        documents,
                        wait_for_completion=wait_for_parsing
                    )
                else:
                    print(f"  ⊘ Skipping parse (unsupported extension): {file.name}")

            return publish_result

        except RAGFlowClientError as e:
            return PublishResult(
                success=False,
                document_id="",
                file_path=str(file),
                tags=[],
                error=str(e)
            )
        except Exception as e:
            return PublishResult(
                success=False,
                document_id="",
                file_path=str(file),
                tags=[],
                error=str(e)
            )

    def _parse_documents(
        self,
        documents: list[JsonDict],
        wait_for_completion: bool = True,
        silent: bool = False
    ) -> None:
        """
        Trigger parsing for one or more documents.

        Args:
            documents: List of {"id": doc_id, "name": name, "dataset_id": dataset_id}
            wait_for_completion: Wait for parsing to complete
            silent: If True, don't print progress messages (for streaming mode)
        """
        # Delegate to client's batch parsing method
        result = self.client.parse_documents_batch(documents, silent=silent)

        # Wait for completion if requested (only for successful datasets)
        if wait_for_completion and not silent and result["success"]:
            # Filter to only successfully triggered documents
            docs_to_wait = [
                doc for doc in documents
                if doc["dataset_id"] in result["success"]
            ]

            if docs_to_wait:
                # Use DocumentService for consistent waiting with progress bar
                doc_service = DocumentService(self.client)
                doc_service.wait_for_parsing(docs_to_wait)

    def _wait_for_all_parsing_with_progress(
        self,
        documents: list[JsonDict],
        timeout: int = 300,  # 5 minutes
        poll_interval: float = 0.5
    ) -> None:
        """
        Wait for all documents to finish parsing with clean progress display grouped by folder.

        Delegates to DocumentService.wait_for_parsing() for reusable implementation.

        Args:
            documents: List of {"id": doc_id, "name": name, "dataset_id": dataset_id, "folder": folder}
            timeout: Max seconds to wait
            poll_interval: Seconds between status checks (reduced to 0.5 for smoother progress)
        """
        doc_service = DocumentService(self.client)
        doc_service.wait_for_parsing(documents, timeout, poll_interval)

    def _has_content_changed_cached(self, cache: DocumentData) -> bool:
        """
        Check if document content has changed using pre-calculated hash.

        Args:
            cache: DocumentData with pre-calculated hash

        Returns:
            True if changed or new, False if unchanged
        """
        try:
            # Resolve dataset name from cache metadata
            dataset_name = self._resolve_dataset_name({
                'release': cache.release,
                'tags': cache.tags,
                'domain': cache.domain
            })

            # Get dataset
            dataset = self.client.get_dataset(name=dataset_name)
            if not dataset:
                return True  # New dataset = new document

            # Search for existing document by ims_doc_id
            metadata_filter = {
                "logic": "and",
                "conditions": [{
                    "name": "ims_doc_id",
                    "comparison_operator": "is",
                    "value": cache.ims_doc_id
                }]
            }

            docs = self.client.list_documents(
                dataset,
                page_size=1,
                metadata_condition=metadata_filter
            )

            if not docs:
                return True  # Document doesn't exist

            # Compare hashes (use pre-calculated hash from cache)
            existing_doc = docs[0]
            existing_meta = getattr(existing_doc, 'meta_fields', {}) or {}

            if isinstance(existing_meta, dict):
                existing_hash = existing_meta.get("content_hash")
            else:
                existing_hash = getattr(existing_meta, 'content_hash', None)

            if not existing_hash:
                return True  # No hash = changed

            # Compare: cache.content_hash was already calculated in DocumentData
            return cache.content_hash != str(existing_hash)

        except Exception as e:
            print(f"  Warning: Could not check existing document: {e}")
            return True  # Assume changed on error

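    # The content_hash compared above is calculated when DocumentData is built, not in this
    # method. Assuming the MD5 scheme named in the module docstring (presumably over the
    # file's bytes), the equivalent calculation would be:
    #
    #     hashlib.md5(file_bytes).hexdigest()
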
    def _resolve_dataset_name(self, metadata: JsonDict) -> str:
        """
        Resolve dataset name from template using metadata.

        Args:
            metadata: Document metadata containing release info

        Returns:
            Resolved dataset name
        """
        release = metadata.get('release', '')

        if release:
            # Use template and replace {release} placeholder
            return self.dataset_template.replace('{release}', release)
        else:
            # No release - use default dataset
            return self.dataset_default

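    # With the defaults from __init__ (dataset_default="aia", dataset_template="aia-{release}"),
    # resolution works like this (release value hypothetical):
    #
    #     "aia-{release}".replace("{release}", "2025.1")   # -> "aia-2025.1"
    #     # empty release -> dataset_default, i.e. "aia"
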
    def _cleanup_duplicates(self, all_caches: list[DocumentData], dry_run: bool) -> None:
        """Remove stale duplicate server docs before publishing.

        For each dataset, finds server docs where the same original_path appears
        more than once. Keeps the copy whose ims_doc_id matches the local file
        (authoritative). If no copy matches, keeps the most recently created one
        (first in desc create_time order). Deletes the rest.

        Only touches documents with a non-empty original_path.
        Custom documents (no original_path) are never affected.

        Args:
            all_caches: DocumentData for every local file in the published folder tree.
            dry_run: If True, report duplicates but do not delete.
        """
        # Build: dataset_name -> {original_path -> canonical ims_doc_id}
        canonical_by_dataset: dict[str, dict[str, str]] = {}
        for cache in all_caches:
            if not cache.original_path:
                continue
            dataset_name = self._resolve_dataset_name({"release": cache.release})
            canonical_by_dataset.setdefault(dataset_name, {})[cache.original_path] = cache.ims_doc_id

        if not canonical_by_dataset:
            return

        print("\nDuplicate detection...")
        for dataset_name, canonical_ids in canonical_by_dataset.items():
            dataset = self.client.get_dataset(name=dataset_name)
            if not dataset:
                continue

            try:
                all_docs = self.client.list_documents(
                    dataset,
                    page_size=self.client.page_size,
                )
            except Exception as e:
                print(f"  Warning: Could not list documents in '{dataset_name}': {e}")
                continue

            def _strip_duplicate_suffix(name: str) -> str:
                path = Path(name)
                new_name = f"{re.sub(r'\(\d+\)$', '', path.stem)}{path.suffix}"
                return name[: -len(path.name)] + new_name

            managed_docs = []
            for doc in all_docs:
                meta = getattr(doc, "meta_fields", {}) or {}
                original_path = meta.get("original_path", "") if isinstance(meta, dict) else getattr(meta, "original_path", "")
                if original_path:
                    managed_docs.append(doc)

            # Group by original_path; list order is desc create_time (most recent first)
            by_path: dict[str, list[DocumentLike]] = defaultdict(list)
            for doc in managed_docs:
                meta = getattr(doc, "meta_fields", {}) or {}
                original_path = meta.get("original_path", "") if isinstance(meta, dict) else getattr(meta, "original_path", "")
                if original_path:
                    by_path[original_path].append(doc)

            duplicates = []  # (doc, original_path) to delete
            for original_path, docs in by_path.items():
                if len(docs) <= 1:
                    continue

                canonical_ims_doc_id = canonical_ids.get(original_path)
                # Find the authoritative doc (matching local ims_doc_id)
                authoritative: DocumentLike | None = None
                if canonical_ims_doc_id:
                    for doc in docs:
                        meta = getattr(doc, "meta_fields", {}) or {}
                        doc_ims_id = meta.get("ims_doc_id", "") if isinstance(meta, dict) else getattr(meta, "ims_doc_id", "")
                        if doc_ims_id == canonical_ims_doc_id:
                            authoritative = doc
                            break

                # Fallback: keep most recent (first in desc order)
                if authoritative is None:
                    authoritative = docs[0]

                for doc in docs:
                    if doc.id != authoritative.id:
                        duplicates.append((doc, original_path))

            # Name duplicates: foo.md + foo(1).md + foo(2).md ...
            name_groups: dict[str, list[DocumentLike]] = defaultdict(list)
            for doc in all_docs:
                doc_name = getattr(doc, "name", "") or ""
                if not doc_name:
                    continue
                name_groups[_strip_duplicate_suffix(doc_name)].append(doc)

            for doc_name_stripped, docs in name_groups.items():
                if len(docs) > 1:
                    for doc in docs:
                        duplicates.append((doc, getattr(doc, "name", "") or ""))

            if not duplicates:
                print(f"  '{dataset_name}': no duplicates")
                continue

            print(f"  '{dataset_name}': {len(duplicates)} duplicate(s)")
            seen_doc_ids = set()
            for doc, original_path in duplicates:
                if doc.id in seen_doc_ids:
                    continue
                seen_doc_ids.add(doc.id)
                if dry_run:
                    print(f"    [DRY RUN] Would delete duplicate: {original_path}")
                else:
                    try:
                        dataset.delete_documents([doc.id])
                        print(f"    Deleted duplicate: {original_path}")
                    except Exception as e:
                        print(f"    Warning: Failed to delete duplicate '{original_path}': {e}")

    def _cleanup_orphans(
        self,
        all_caches: list[DocumentData],
        dry_run: bool,
        managed_domains_by_dataset: dict[str, set[str]] | None = None,
    ) -> None:
        """Delete server documents whose original_path is no longer present locally.

        Only touches documents that have a non-empty original_path metadata field.
        Custom documents (uploaded via put_document, no original_path) are NEVER deleted.

        Args:
            all_caches: DocumentData for every local file in the published folder tree.
            dry_run: If True, report orphans but do not delete.
        """
        # Build: dataset_name -> set of local original_paths
        local_paths_by_dataset: dict[str, set[str]] = {}
        for cache in all_caches:
            if not cache.original_path:
                continue
            dataset_name = self._resolve_dataset_name({"release": cache.release})
            local_paths_by_dataset.setdefault(dataset_name, set()).add(cache.original_path)

        if not local_paths_by_dataset:
            return

        print("\nOrphan detection...")
        for dataset_name, local_paths in local_paths_by_dataset.items():
            managed_domains = (
                managed_domains_by_dataset.get(dataset_name, set())
                if managed_domains_by_dataset
                else set()
            )
            dataset = self.client.get_dataset(name=dataset_name)
            if not dataset:
                continue

            # Fetch only publisher-managed docs (non-empty original_path) via server-side filter.
            # Custom documents (no original_path) are skipped at the query level.
            try:
                managed_docs = self.client.list_documents(
                    dataset,
                    page_size=self.client.page_size,
                    metadata_condition={
                        "logic": "and",
                        "conditions": [{
                            "name": "original_path",
                            "comparison_operator": "not empty",
                            "value": ""
                        }]
                    }
                )
            except Exception as e:
                print(f"  Warning: Could not list documents in '{dataset_name}': {e}")
                continue

            orphans = []
            for doc in managed_docs:
                meta = getattr(doc, "meta_fields", {}) or {}
                if isinstance(meta, dict):
                    original_path = meta.get("original_path", "")
                    domain = meta.get("domain", "")
                else:
                    original_path = getattr(meta, "original_path", "")
                    domain = getattr(meta, "domain", "")

                if not original_path:
                    continue

                if managed_domains and domain not in managed_domains:
                    continue

                if original_path not in local_paths:
                    orphans.append((doc, original_path))

            if not orphans:
                print(f"  '{dataset_name}': no orphans")
                continue

            print(f"  '{dataset_name}': {len(orphans)} orphan(s)")
            for doc, original_path in orphans:
                if dry_run:
                    print(f"    [DRY RUN] Would delete: {original_path}")
                else:
                    try:
                        dataset.delete_documents([doc.id])
                        print(f"    Deleted orphan: {original_path}")
                    except Exception as e:
                        print(f"    Warning: Failed to delete orphan '{original_path}': {e}")

    @staticmethod
    def _print_summary(results: list[PublishResult], dry_run: bool = False) -> None:
        """
        Print summary of publishing results.

        Args:
            results: List of PublishResult
            dry_run: Whether this was a dry run
        """
        successful = [r for r in results if r.success and not r.skipped]
        skipped = [r for r in results if r.skipped]
        failed = [r for r in results if not r.success]

        print("\n" + "="*60)
        if dry_run:
            print("DRY RUN SUMMARY")
        else:
            print("PUBLISHING SUMMARY")
        print("="*60)

        print(f"Total files: {len(results)}")
        print(f"✓ Successful: {len(successful)}")
        print(f"⊘ Skipped (unchanged): {len(skipped)}")
        print(f"✗ Failed: {len(failed)}")

        if failed:
            print("\nFailed files:")
            for result in failed:
                print(f"  {result}")

        print("="*60)