rosetta-cli 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rosetta_cli/__init__.py +12 -0
- rosetta_cli/__main__.py +6 -0
- rosetta_cli/cli.py +379 -0
- rosetta_cli/commands/__init__.py +5 -0
- rosetta_cli/commands/base_command.py +82 -0
- rosetta_cli/commands/cleanup_command.py +214 -0
- rosetta_cli/commands/list_command.py +70 -0
- rosetta_cli/commands/parse_command.py +205 -0
- rosetta_cli/commands/publish_command.py +113 -0
- rosetta_cli/commands/verify_command.py +46 -0
- rosetta_cli/ims_auth.py +124 -0
- rosetta_cli/ims_config.py +317 -0
- rosetta_cli/ims_publisher.py +859 -0
- rosetta_cli/ims_utils.py +28 -0
- rosetta_cli/ragflow_client.py +928 -0
- rosetta_cli/services/__init__.py +8 -0
- rosetta_cli/services/auth_service.py +114 -0
- rosetta_cli/services/dataset_service.py +72 -0
- rosetta_cli/services/document_data.py +408 -0
- rosetta_cli/services/document_service.py +357 -0
- rosetta_cli/typing_utils.py +49 -0
- rosetta_cli-2.0.0.dist-info/METADATA +639 -0
- rosetta_cli-2.0.0.dist-info/RECORD +26 -0
- rosetta_cli-2.0.0.dist-info/WHEEL +5 -0
- rosetta_cli-2.0.0.dist-info/entry_points.txt +2 -0
- rosetta_cli-2.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Document Service
|
|
3
|
+
|
|
4
|
+
Handles document operations including listing, filtering, and status management.
|
|
5
|
+
Eliminates code duplication and fixes naming inconsistencies.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import time
|
|
9
|
+
from typing import cast
|
|
10
|
+
|
|
11
|
+
from tqdm import tqdm
|
|
12
|
+
from ..ragflow_client import RAGFlowClient
|
|
13
|
+
from ..typing_utils import DatasetLike, DocumentLike, JsonDict
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DocumentService:
    """Service for handling document operations.

    Wraps a RAGFlow client to list and filter documents, build a printable
    summary of a single document, and poll parse status while showing a
    progress bar.
    """

    # Glyph shown next to each parse status by ``format_document_display``.
    _STATUS_ICONS = {
        'DONE': '✓',
        'RUNNING': '⏳',
        'FAIL': '✗',
        'UNSTART': '○',
        'CANCEL': '⊘',
    }

    def __init__(self, client: RAGFlowClient):
        """
        Initialize DocumentService.

        Args:
            client: RAGFlow client instance
        """
        self.client = client

    @staticmethod
    def _attr_or_default(obj: object, name: str, default: object) -> object:
        """Return ``obj.<name>``, using ``default`` when it is missing or None."""
        value = getattr(obj, name, default)
        return default if value is None else value

    @staticmethod
    def _meta_value(meta_fields: object, key: str, default: object = 'N/A') -> object:
        """Read ``key`` from metadata given either as a dict or an attribute object."""
        if isinstance(meta_fields, dict):
            value = meta_fields.get(key, default)
        else:
            value = getattr(meta_fields, key, default)
        return default if value is None else value

    @staticmethod
    def _display_path(info: JsonDict) -> str:
        """Return ``name`` for root-folder documents, else ``folder/name``."""
        if info["folder"] == ".":
            return info['name']
        return f"{info['folder']}/{info['name']}"

    def list_documents_by_status(
        self,
        dataset: DatasetLike,
        statuses: list[str] | None = None,
        limit: int = 1000
    ) -> list[DocumentLike]:
        """
        List documents filtered by parse status using server-side filtering.

        If the client call fails, a warning is printed and the documents are
        fetched through ``dataset.list_documents`` and filtered locally.

        Args:
            dataset: Dataset object
            statuses: List of status values to filter by (e.g., ["FAIL", "UNSTART", "CANCEL"])
                Supported values: "UNSTART", "RUNNING", "CANCEL", "DONE", "FAIL"
                When None or empty, no status filter is applied.
            limit: Maximum number of documents to return

        Returns:
            List of Document objects matching the criteria

        Raises:
            Exception: If the fallback listing through the dataset also fails
        """
        try:
            # Server-side filtering via the client's ``run`` parameter.
            documents = self.client.list_documents(
                dataset,
                run=statuses,
                page_size=limit
            )
        except Exception as e:
            # Deliberate best-effort: fall through to client-side filtering.
            print(f"Warning: Server-side filtering failed, using fallback: {e}")
        else:
            return documents or []

        all_documents = dataset.list_documents(page_size=limit)
        if not all_documents or not statuses:
            return all_documents or []

        # A document whose ``run`` is missing or None counts as not started.
        return [
            doc for doc in all_documents
            if (getattr(doc, 'run', None) or 'UNSTART') in statuses
        ]

    def filter_documents_by_prefix(
        self,
        dataset: DatasetLike,
        prefix: str,
        limit: int = 1000
    ) -> list[DocumentLike]:
        """
        Filter documents by title prefix using server-side filtering.

        Args:
            dataset: Dataset object
            prefix: Title prefix to filter by
            limit: Maximum number of documents to return

        Returns:
            List of Document objects with matching prefix
        """
        metadata_filter = {
            "logic": "and",
            "conditions": [{
                "name": "doc_title",
                "comparison_operator": "start with",
                "value": prefix
            }]
        }

        documents = self.client.list_documents(
            dataset,
            page_size=limit,
            metadata_condition=metadata_filter
        )
        return documents or []

    def filter_documents_by_tags(
        self,
        dataset: DatasetLike,
        tags: list[str],
        limit: int = 1000
    ) -> list[DocumentLike]:
        """
        Filter documents by tags using server-side metadata filtering.

        Uses "or" logic to find documents containing ANY of the specified tags.

        Args:
            dataset: Dataset object
            tags: List of tags to filter by (e.g., ["r1", "agents"])
            limit: Maximum number of documents to return

        Returns:
            List of Document objects containing any of the specified tags;
            an empty list when ``tags`` is empty (the client is not called).

        Example:
            tags=["r1", "agents"] finds documents with tags containing "r1" OR "agents"
        """
        if not tags:
            return []

        # One "contains" condition per tag, combined with "or".
        metadata_filter = {
            "logic": "or",
            "conditions": [
                {
                    "name": "tags",
                    "comparison_operator": "contains",
                    "value": tag
                }
                for tag in tags
            ]
        }

        documents = self.client.list_documents(
            dataset,
            page_size=limit,
            metadata_condition=metadata_filter
        )
        return documents or []

    def get_document_summary(self, document: DocumentLike) -> JsonDict:
        """
        Get summary information from a document.

        Attributes that are missing *or* set to None are replaced by the
        defaults below, so the summary can always be formatted.

        Args:
            document: Document object

        Returns:
            Dictionary with keys ``id``, ``name``, ``size``, ``run``,
            ``chunk_count`` and ``meta_fields``
        """
        defaults = (
            ('id', 'N/A'),
            ('name', 'Untitled'),
            ('size', 0),
            ('run', 'UNKNOWN'),
            ('chunk_count', 0),
            ('meta_fields', {}),
        )
        return {
            key: self._attr_or_default(document, key, default)
            for key, default in defaults
        }

    def format_document_display(self, document: DocumentLike, index: int) -> str:
        """
        Format document information for display.

        Args:
            document: Document object
            index: Display index number

        Returns:
            Multi-line string: name, ID, size and parse status, followed by
            any of tags / domain / release / source present in the metadata
        """
        summary = self.get_document_summary(document)
        status_icon = self._STATUS_ICONS.get(summary['run'], '?')

        lines = [
            f"{index}. {summary['name']}",
            f" ID: {summary['id']}",
            f" Size: {summary['size']:,} bytes",
            f" Parse Status: {status_icon} {summary['run']} (Chunks: {summary['chunk_count']})"
        ]

        meta_fields = summary['meta_fields']
        if meta_fields:
            # Metadata may be an SDK attribute object or a plain dict.
            raw_tags = self._meta_value(meta_fields, 'tags', [])
            if isinstance(raw_tags, str):
                # A bare string is one tag, not a sequence of characters.
                tags = [raw_tags] if raw_tags else []
            else:
                tags = [str(tag) for tag in raw_tags]

            domain = self._meta_value(meta_fields, 'domain')
            release = self._meta_value(meta_fields, 'release')
            # Prefer ``original_path``; fall back to ``source_path``.
            source = self._meta_value(
                meta_fields,
                'original_path',
                self._meta_value(meta_fields, 'source_path'),
            )

            if tags:
                lines.append(f" Tags: {', '.join(tags)}")
            if domain != 'N/A':
                lines.append(f" Domain: {domain}")
            if release != 'N/A':
                lines.append(f" Release: {release}")
            if source != 'N/A':
                lines.append(f" Source: {source}")

        return '\n'.join(lines)

    def _poll_pending(
        self,
        doc_info: dict[str, JsonDict],
        pending: list[str],
        pbar,
    ) -> tuple[list[str], list[str]]:
        """Query the status of each pending document once.

        Updates the matching ``doc_info`` entries in place and returns the IDs
        that reached "DONE" and "FAIL" during this pass.
        """
        newly_completed: list[str] = []
        newly_failed: list[str] = []

        for doc_id in pending:
            info = doc_info[doc_id]
            try:
                status = self.client.get_parse_status(info["dataset_id"], doc_id)
                run_status = status.get("run", "UNKNOWN")
            except Exception as e:
                # Don't fail the entire operation for a single status check error.
                pbar.write(f" ⚠️ Failed to get status for document {doc_id}: {e}")
                continue

            if run_status == "DONE":
                info["status"] = "done"
                info["chunks"] = status.get("chunk_count", 0)
                info["tokens"] = status.get("token_count", 0)
                newly_completed.append(doc_id)
            elif run_status == "FAIL":
                info["status"] = "failed"
                newly_failed.append(doc_id)
            elif run_status == "RUNNING":
                info["status"] = "parsing"
            # NOTE(review): "CANCEL" is not treated as terminal, so a cancelled
            # document is waited on until the timeout - confirm this is intended.

        return newly_completed, newly_failed

    def wait_for_parsing(
        self,
        documents: list[JsonDict],
        timeout: int = 300,
        poll_interval: float = 0.5
    ) -> tuple[int, int]:
        """
        Wait for documents to finish parsing, showing a progress bar.

        Documents are tracked by ID; an ID listed more than once is counted
        once, so the loop can always terminate when every document is done.

        Args:
            documents: List of {"id": doc_id, "name": name, "dataset_id": dataset_id, "folder": folder}
                ("folder" is optional and defaults to ".")
            timeout: Max seconds to wait (default: 300 = 5 minutes)
            poll_interval: Seconds between status checks (default: 0.5 for smooth progress)

        Returns:
            Tuple of (success_count, failed_count)

        Raises:
            KeyError: If a document lacks "id", "dataset_id" or "name"
            KeyboardInterrupt: Re-raised after a notice when the user presses Ctrl+C
        """
        if not documents:
            return 0, 0

        doc_info: dict[str, JsonDict] = {}
        for doc in documents:
            folder = str(doc.get("folder", "."))
            doc_info[str(doc["id"])] = {
                "dataset_id": str(doc["dataset_id"]),
                "name": str(doc["name"]),
                "folder": folder,
                "status": "queued",
                "chunks": 0,
                "tokens": 0
            }

        # Count unique IDs: completion is tracked per ID, so comparing against
        # len(documents) would never be satisfied when IDs repeat.
        total = len(doc_info)

        print(f"\n📄 Parsing {total} document(s)...\n")

        # Monotonic clock: elapsed time is immune to wall-clock adjustments.
        start_time = time.monotonic()
        completed: set[str] = set()
        failed: set[str] = set()

        try:
            # The context manager closes the bar on every exit path.
            with tqdm(
                total=total,
                desc="Overall progress",
                unit="doc",
                bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
                mininterval=0.1  # Update display at most every 0.1 seconds
            ) as pbar:
                while len(completed) + len(failed) < total:
                    if time.monotonic() - start_time > timeout:
                        pending_count = total - len(completed) - len(failed)
                        pbar.write(f"\n⚠️ Timeout after {timeout}s. {pending_count} documents still parsing.")
                        break

                    time.sleep(poll_interval)

                    pending = [
                        doc_id for doc_id in doc_info
                        if doc_id not in completed and doc_id not in failed
                    ]
                    newly_completed, newly_failed = self._poll_pending(doc_info, pending, pbar)
                    completed.update(newly_completed)
                    failed.update(newly_failed)

                    for doc_id in newly_completed:
                        info = doc_info[doc_id]
                        pbar.write(
                            f" ✅ {self._display_path(info)}: "
                            f"{info['chunks']} chunks, {info['tokens']} tokens"
                        )
                        pbar.update(1)
                        time.sleep(0.05)  # Small delay to make progress visible

                    for doc_id in newly_failed:
                        pbar.write(f" ✗ {self._display_path(doc_info[doc_id])}: Failed")
                        pbar.update(1)
                        time.sleep(0.05)  # Small delay to make progress visible
        except KeyboardInterrupt:
            print("\n\n⚠️ Parsing interrupted by user (Ctrl+C)")
            print("ℹ️ Parsing continues on server. Check RAGFlow UI for status.")
            raise

        elapsed = time.monotonic() - start_time
        if len(completed) == total:
            print(f"\n✅ All {total} documents parsed successfully ({elapsed:.1f}s)")
        else:
            print(f"\n⚠️ Completed: {len(completed)}, Failed: {len(failed)}, Total: {total}")

        return len(completed), len(failed)
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Shared typing helpers for IMS CLI tools."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
from collections.abc import Sequence
|
|
7
|
+
from typing import Any, Protocol, TypeAlias
|
|
8
|
+
|
|
9
|
+
JsonValue: TypeAlias = Any
|
|
10
|
+
JsonDict: TypeAlias = dict[str, Any]
|
|
11
|
+
JsonList: TypeAlias = list[Any]
|
|
12
|
+
CommandArgs: TypeAlias = argparse.Namespace
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class DocumentLike(Protocol):
    """Structural type for a document object.

    Any object exposing these attributes and an ``update`` method satisfies
    the protocol; no inheritance is required.
    """

    # Identity
    id: str
    name: str | None

    # Parse state and counters
    run: str | None
    size: int | None
    chunk_count: int | None
    token_count: int | None
    progress: float | None
    progress_msg: str | None

    # Metadata container; its concrete type is not constrained here.
    meta_fields: object

    def update(self, payload: dict[str, object]) -> object:
        """Apply ``payload`` to the document (semantics defined by the implementer)."""
        ...
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class DatasetLike(Protocol):
    """Structural type for a dataset object.

    Declares the attributes and methods a dataset must expose; matching is
    structural, so no inheritance is required.
    """

    id: str
    name: str

    def list_documents(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        page: int = 1,
        page_size: int = 30,
        keywords: str | None = None,
    ) -> list[DocumentLike]:
        """Return documents of this dataset; all filters are keyword-only."""
        ...

    def delete_documents(self, ids: Sequence[str]) -> object:
        """Delete the documents identified by ``ids``."""
        ...

    def upload_documents(self, documents: Sequence[dict[str, object]]) -> list[DocumentLike]:
        """Upload ``documents`` and return the resulting document objects."""
        ...

    def async_parse_documents(self, document_ids: Sequence[str]) -> object:
        """Request parsing of ``document_ids`` (presumably without waiting - confirm)."""
        ...

    def get(self, path: str, params: dict[str, object]) -> object:
        """Fetch ``path`` with ``params`` (presumably an HTTP GET helper - confirm)."""
        ...
|