rosetta-cli 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,357 @@
1
+ """
2
+ Document Service
3
+
4
+ Handles document operations including listing, filtering, and status management.
5
+ Eliminates code duplication and fixes naming inconsistencies.
6
+ """
7
+
8
+ import time
9
+ from typing import cast
10
+
11
+ from tqdm import tqdm
12
+ from ..ragflow_client import RAGFlowClient
13
+ from ..typing_utils import DatasetLike, DocumentLike, JsonDict
14
+
15
+
16
class DocumentService:
    """
    Service for handling document operations.

    Lists and filters documents through the wrapped client, formats them for
    console display, and polls parse status while showing a progress bar.
    """

    # Icon printed next to each known run status; any other value renders as '?'.
    _STATUS_ICONS = {
        'DONE': '✓',
        'RUNNING': '⏳',
        'FAIL': '✗',
        'UNSTART': '○',
        'CANCEL': '⊘',
    }

    def __init__(self, client: RAGFlowClient):
        """
        Initialize DocumentService.

        Args:
            client: RAGFlow client instance
        """
        self.client = client

    def list_documents_by_status(
        self,
        dataset: DatasetLike,
        statuses: list[str] | None = None,
        limit: int = 1000
    ) -> list[DocumentLike]:
        """
        List documents filtered by parse status using server-side filtering.

        If the client call fails for any reason, a warning is printed and the
        method falls back to ``dataset.list_documents`` plus client-side
        filtering on each document's ``run`` attribute.

        Args:
            dataset: Dataset object
            statuses: List of status values to filter by (e.g., ["FAIL", "UNSTART", "CANCEL"])
                Supported values: "UNSTART", "RUNNING", "CANCEL", "DONE", "FAIL"
                ``None`` (or an empty list in the fallback path) applies no filter.
            limit: Maximum number of documents to return

        Returns:
            List of Document objects matching the criteria (empty list if none)

        Raises:
            Exception: Only if the fallback ``dataset.list_documents`` call
                fails as well; a failure of the server-side call is swallowed.
        """
        try:
            # Server-side filtering via the client's ``run`` parameter.
            documents = self.client.list_documents(
                dataset,
                run=statuses,
                page_size=limit
            )
        except Exception as e:
            # Deliberate best-effort: any client failure triggers the fallback.
            print(f"Warning: Server-side filtering failed, using fallback: {e}")

            all_documents = dataset.list_documents(page_size=limit)
            if not all_documents or not statuses:
                return all_documents or []

            # A document without a ``run`` attribute is treated as 'UNSTART'.
            return [
                doc for doc in all_documents
                if getattr(doc, 'run', 'UNSTART') in statuses
            ]

        return documents or []

    def _list_by_metadata(
        self,
        dataset: DatasetLike,
        metadata_filter: JsonDict,
        limit: int
    ) -> list[DocumentLike]:
        """Run one metadata-filtered listing and normalise a falsy result to []."""
        documents = self.client.list_documents(
            dataset,
            page_size=limit,
            metadata_condition=metadata_filter
        )
        return documents or []

    def filter_documents_by_prefix(
        self,
        dataset: DatasetLike,
        prefix: str,
        limit: int = 1000
    ) -> list[DocumentLike]:
        """
        Filter documents by title prefix using server-side filtering.

        Sends a single "start with" condition on the ``doc_title`` metadata field.

        Args:
            dataset: Dataset object
            prefix: Title prefix to filter by
            limit: Maximum number of documents to return

        Returns:
            List of Document objects with matching prefix
        """
        metadata_filter = {
            "logic": "and",
            "conditions": [{
                "name": "doc_title",
                "comparison_operator": "start with",
                "value": prefix
            }]
        }
        return self._list_by_metadata(dataset, metadata_filter, limit)

    def filter_documents_by_tags(
        self,
        dataset: DatasetLike,
        tags: list[str],
        limit: int = 1000
    ) -> list[DocumentLike]:
        """
        Filter documents by tags using server-side metadata filtering.

        Uses "or" logic to find documents containing ANY of the specified tags.

        Args:
            dataset: Dataset object
            tags: List of tags to filter by (e.g., ["r1", "agents"])
            limit: Maximum number of documents to return

        Returns:
            List of Document objects containing any of the specified tags;
            an empty list when ``tags`` is empty (no request is made).

        Example:
            tags=["r1", "agents"] finds documents with tags containing "r1" OR "agents"
        """
        if not tags:
            return []

        # One "contains" condition per tag, combined with "or".
        metadata_filter = {
            "logic": "or",
            "conditions": [
                {
                    "name": "tags",
                    "comparison_operator": "contains",
                    "value": tag
                }
                for tag in tags
            ]
        }
        return self._list_by_metadata(dataset, metadata_filter, limit)

    def get_document_summary(self, document: DocumentLike) -> JsonDict:
        """
        Get summary information from a document.

        Each field is read with ``getattr`` so a missing attribute yields a
        default; an attribute that is present but ``None`` is returned as-is.

        Args:
            document: Document object

        Returns:
            Dictionary with keys 'id', 'name', 'size', 'run', 'chunk_count'
            and 'meta_fields'
        """
        return {
            'id': getattr(document, 'id', 'N/A'),
            'name': getattr(document, 'name', 'Untitled'),
            'size': getattr(document, 'size', 0),
            'run': getattr(document, 'run', 'UNKNOWN'),
            'chunk_count': getattr(document, 'chunk_count', 0),
            'meta_fields': getattr(document, 'meta_fields', {})
        }

    @staticmethod
    def _meta_value(meta_fields: object, key: str, default: object = 'N/A') -> object:
        """Read ``key`` from metadata given either as a dict or as an attribute object."""
        if isinstance(meta_fields, dict):
            return meta_fields.get(key, default)
        return getattr(meta_fields, key, default)

    def format_document_display(self, document: DocumentLike, index: int) -> str:
        """
        Format document information for display.

        Fields that are present but ``None`` fall back to the same defaults
        used for missing attributes, so a document with ``size=None`` no
        longer breaks the thousands-separator formatting.

        Args:
            document: Document object
            index: Display index number

        Returns:
            Formatted multi-line string for display
        """
        summary = self.get_document_summary(document)

        # Coalesce None so the format specs below always get usable values.
        name = 'Untitled' if summary['name'] is None else summary['name']
        size = 0 if summary['size'] is None else summary['size']
        run = 'UNKNOWN' if summary['run'] is None else summary['run']
        chunk_count = 0 if summary['chunk_count'] is None else summary['chunk_count']

        status_icon = self._STATUS_ICONS.get(run, '?')

        lines = [
            f"{index}. {name}",
            f" ID: {summary['id']}",
            f" Size: {size:,} bytes",
            f" Parse Status: {status_icon} {run} (Chunks: {chunk_count})"
        ]

        # Add metadata if available
        meta_fields = summary['meta_fields']
        if meta_fields:
            # meta_fields may be an SDK object (attribute access) or a plain dict.
            raw_tags = self._meta_value(meta_fields, 'tags', [])
            if isinstance(raw_tags, str):
                # A lone string would otherwise be joined character by character.
                raw_tags = [raw_tags]
            tags = cast(list[str], raw_tags)
            domain = self._meta_value(meta_fields, 'domain')
            release = self._meta_value(meta_fields, 'release')
            # Prefer original_path; fall back to source_path.
            source = self._meta_value(
                meta_fields,
                'original_path',
                self._meta_value(meta_fields, 'source_path')
            )

            if tags:
                lines.append(f" Tags: {', '.join(str(tag) for tag in tags)}")
            for label, value in (("Domain", domain), ("Release", release), ("Source", source)):
                # 'N/A' is the "not provided" sentinel; None is skipped as well.
                if value is not None and value != 'N/A':
                    lines.append(f" {label}: {value}")

        return '\n'.join(lines)

    @staticmethod
    def _display_path(info: JsonDict) -> str:
        """Return the bare filename for the root folder ("."), else folder/filename."""
        if info["folder"] == ".":
            return info['name']
        return f"{info['folder']}/{info['name']}"

    def _poll_pending(
        self,
        doc_info: dict[str, JsonDict],
        completed: set[str],
        failed: set[str],
        pbar: tqdm
    ) -> tuple[list[str], list[str]]:
        """Check every unfinished document once; return ids that just became DONE / FAIL.

        ``doc_info`` entries and the ``completed`` / ``failed`` sets are
        updated in place.
        """
        newly_completed: list[str] = []
        newly_failed: list[str] = []

        for doc_id, info in doc_info.items():
            if doc_id in completed or doc_id in failed:
                continue

            try:
                status = self.client.get_parse_status(info["dataset_id"], doc_id)
                run_status = status.get("run", "UNKNOWN")
            except Exception as e:
                # Don't fail entire operation for single status check error
                pbar.write(f" ⚠️ Failed to get status for document {doc_id}: {e}")
                continue

            if run_status == "DONE":
                info["status"] = "done"
                info["chunks"] = status.get("chunk_count", 0)
                info["tokens"] = status.get("token_count", 0)
                completed.add(doc_id)
                newly_completed.append(doc_id)
            elif run_status == "FAIL":
                info["status"] = "failed"
                failed.add(doc_id)
                newly_failed.append(doc_id)
            elif run_status == "RUNNING":
                info["status"] = "parsing"
            # Any other status leaves the document pending for the next poll.

        return newly_completed, newly_failed

    def wait_for_parsing(
        self,
        documents: list[JsonDict],
        timeout: int = 300,
        poll_interval: float = 0.5
    ) -> tuple[int, int]:
        """
        Wait for documents to finish parsing, showing a progress bar.

        Documents are tracked by id, so an id listed more than once is counted
        once. Polling stops when every tracked document is DONE or FAIL, or
        when ``timeout`` seconds have elapsed.

        Args:
            documents: List of {"id": doc_id, "name": name, "dataset_id": dataset_id, "folder": folder}
                ("folder" is optional and defaults to ".")
            timeout: Max seconds to wait (default: 300 = 5 minutes)
            poll_interval: Seconds between status checks (default: 0.5 for smooth progress)

        Returns:
            Tuple of (success_count, failed_count)

        Raises:
            KeyError: If a document dict lacks "id", "dataset_id" or "name"
            KeyboardInterrupt: Re-raised after printing a notice when the user
                interrupts the wait
        """
        if not documents:
            return 0, 0

        doc_info: dict[str, JsonDict] = {}
        for doc in documents:
            doc_info[str(doc["id"])] = {
                "dataset_id": str(doc["dataset_id"]),
                "name": str(doc["name"]),
                "folder": str(doc.get("folder", ".")),
                "status": "queued",
                "chunks": 0,
                "tokens": 0
            }

        # Count unique ids: the completed/failed sets can never exceed this,
        # whereas len(documents) would be unreachable with duplicate entries.
        total = len(doc_info)

        print(f"\n📄 Parsing {total} document(s)...\n")

        start_time = time.time()
        completed: set[str] = set()
        failed: set[str] = set()

        pbar = tqdm(
            total=total,
            desc="Overall progress",
            unit="doc",
            bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
            mininterval=0.1  # Update display at most every 0.1 seconds
        )

        try:
            while len(completed) + len(failed) < total:
                if time.time() - start_time > timeout:
                    pending_count = total - len(completed) - len(failed)
                    pbar.write(f"\n⚠️ Timeout after {timeout}s. {pending_count} documents still parsing.")
                    break

                time.sleep(poll_interval)

                newly_completed, newly_failed = self._poll_pending(
                    doc_info, completed, failed, pbar
                )

                for doc_id in newly_completed:
                    info = doc_info[doc_id]
                    pbar.write(
                        f" ✅ {self._display_path(info)}: "
                        f"{info['chunks']} chunks, {info['tokens']} tokens"
                    )
                    pbar.update(1)
                    time.sleep(0.05)  # Small delay to make progress visible

                for doc_id in newly_failed:
                    pbar.write(f" ✗ {self._display_path(doc_info[doc_id])}: Failed")
                    pbar.update(1)
                    time.sleep(0.05)  # Small delay to make progress visible
        except KeyboardInterrupt:
            # Close first so the notice is not overwritten by a bar redraw.
            pbar.close()
            print("\n\n⚠️ Parsing interrupted by user (Ctrl+C)")
            print("ℹ️ Parsing continues on server. Check RAGFlow UI for status.")
            raise
        finally:
            # Release the bar on every exit path, including unexpected errors.
            # NOTE(review): relies on tqdm's close() being safe to call twice.
            pbar.close()

        elapsed = time.time() - start_time
        if len(completed) == total:
            print(f"\n✅ All {total} documents parsed successfully ({elapsed:.1f}s)")
        else:
            print(f"\n⚠️ Completed: {len(completed)}, Failed: {len(failed)}, Total: {total}")

        return len(completed), len(failed)
@@ -0,0 +1,49 @@
1
+ """Shared typing helpers for IMS CLI tools."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ from collections.abc import Sequence
7
+ from typing import Any, Protocol, TypeAlias
8
+
9
# Loose aliases for JSON-shaped data: element types are deliberately left as Any.
JsonValue: TypeAlias = Any
# String-keyed mapping with arbitrary values.
JsonDict: TypeAlias = dict[str, Any]
# List with arbitrary elements.
JsonList: TypeAlias = list[Any]
# Alias for argparse's Namespace type.
CommandArgs: TypeAlias = argparse.Namespace
13
+
14
+
15
class DocumentLike(Protocol):
    """Structural type describing the document objects the CLI tools work with.

    Any object exposing these attributes and an ``update`` method satisfies
    the protocol; no inheritance is required.
    """

    # Identifier and display name.
    id: str
    name: str | None

    # Run status value and size/count figures; each may be None.
    run: str | None
    size: int | None
    chunk_count: int | None
    token_count: int | None

    # Progress value and its accompanying message.
    progress: float | None
    progress_msg: str | None

    # Metadata container; typed as object, so callers must narrow before use.
    meta_fields: object

    def update(self, payload: dict[str, object]) -> object:
        """Accept a string-keyed payload; the result type is left opaque."""
27
+
28
+
29
class DatasetLike(Protocol):
    """Structural type describing the dataset objects the CLI tools work with.

    Any object exposing ``id``, ``name`` and the methods below satisfies the
    protocol; no inheritance is required.
    """

    id: str
    name: str

    def list_documents(
        self,
        *,
        id: str | None = None,
        name: str | None = None,
        page: int = 1,
        page_size: int = 30,
        keywords: str | None = None,
    ) -> list[DocumentLike]:
        """Return a list of documents; every argument is keyword-only."""

    def delete_documents(self, ids: Sequence[str]) -> object:
        """Accept a sequence of document ids; the result type is left opaque."""

    def upload_documents(self, documents: Sequence[dict[str, object]]) -> list[DocumentLike]:
        """Accept a sequence of document payload dicts and return document objects."""

    def async_parse_documents(self, document_ids: Sequence[str]) -> object:
        """Accept a sequence of document ids; the result type is left opaque."""

    def get(self, path: str, params: dict[str, object]) -> object:
        """Accept a path and a parameter mapping; the result type is left opaque."""