rosetta-cli 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rosetta_cli/__init__.py +12 -0
- rosetta_cli/__main__.py +6 -0
- rosetta_cli/cli.py +379 -0
- rosetta_cli/commands/__init__.py +5 -0
- rosetta_cli/commands/base_command.py +82 -0
- rosetta_cli/commands/cleanup_command.py +214 -0
- rosetta_cli/commands/list_command.py +70 -0
- rosetta_cli/commands/parse_command.py +205 -0
- rosetta_cli/commands/publish_command.py +113 -0
- rosetta_cli/commands/verify_command.py +46 -0
- rosetta_cli/ims_auth.py +124 -0
- rosetta_cli/ims_config.py +317 -0
- rosetta_cli/ims_publisher.py +859 -0
- rosetta_cli/ims_utils.py +28 -0
- rosetta_cli/ragflow_client.py +928 -0
- rosetta_cli/services/__init__.py +8 -0
- rosetta_cli/services/auth_service.py +114 -0
- rosetta_cli/services/dataset_service.py +72 -0
- rosetta_cli/services/document_data.py +408 -0
- rosetta_cli/services/document_service.py +357 -0
- rosetta_cli/typing_utils.py +49 -0
- rosetta_cli-2.0.0.dist-info/METADATA +639 -0
- rosetta_cli-2.0.0.dist-info/RECORD +26 -0
- rosetta_cli-2.0.0.dist-info/WHEEL +5 -0
- rosetta_cli-2.0.0.dist-info/entry_points.txt +2 -0
- rosetta_cli-2.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,928 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RAGFlow Client Wrapper for IMS Publishing
|
|
3
|
+
|
|
4
|
+
This module provides a wrapper around the ragflow-sdk for IMS-specific operations.
|
|
5
|
+
|
|
6
|
+
Key Features:
|
|
7
|
+
- Dataset management with template resolution (aia-{release})
|
|
8
|
+
- Document upload with change detection (MD5 hashing)
|
|
9
|
+
- Tag-in-title format: [tag1][tag2][tag3] filename.ext
|
|
10
|
+
- Two-stage filtering support (server keyword + client metadata)
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import hashlib
|
|
14
|
+
import json
|
|
15
|
+
import time
|
|
16
|
+
import requests
|
|
17
|
+
from collections.abc import Sequence
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Any, Dict, List, Optional, cast
|
|
21
|
+
|
|
22
|
+
from ragflow_sdk import RAGFlow
|
|
23
|
+
from ragflow_sdk.modules.dataset import DataSet
|
|
24
|
+
from ragflow_sdk.modules.document import Document
|
|
25
|
+
from .typing_utils import DatasetLike, DocumentLike, JsonDict
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class DocumentMetadata:
    """Metadata record that accompanies an IMS document during publishing.

    The first five fields are mandatory; every later field is optional and
    falls back to an empty string or ``None``.
    """

    # --- required fields -------------------------------------------------
    tags: list[str]
    domain: str
    release: str  # substituted into "{release}" dataset name templates
    content_hash: str  # compared with the stored hash to detect changes
    ims_doc_id: str  # key used to look up a previously uploaded copy

    # --- optional fields -------------------------------------------------
    original_path: str = ""
    resource_path: str | None = None
    sort_order: int | None = None
    frontmatter: JsonDict | None = None
    line_count: int | None = None
    doc_title: str = ""  # Bare filename for server-side filtering
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class RAGFlowClientError(Exception):
    """Root of the exception hierarchy raised by the RAGFlow client wrapper."""
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class AuthenticationError(RAGFlowClientError):
    """Raised for authentication or authorization failures (HTTP 401, 403)."""
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class NotFoundError(RAGFlowClientError):
    """Raised when a requested resource does not exist (HTTP 404)."""
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class NetworkError(RAGFlowClientError):
    """Raised for network-related failures."""
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class RAGFlowClient:
|
|
66
|
+
"""
|
|
67
|
+
Wrapper class for RAGFlow SDK operations.
|
|
68
|
+
|
|
69
|
+
Provides high-level methods for dataset and document management
|
|
70
|
+
with IMS-specific functionality like tag-in-title format and
|
|
71
|
+
change detection.
|
|
72
|
+
|
|
73
|
+
Usage:
|
|
74
|
+
client = RAGFlowClient(api_key="ragflow-xxx", base_url="http://ragflow.local")
|
|
75
|
+
|
|
76
|
+
# Create/get dataset
|
|
77
|
+
dataset = client.create_dataset("aia-r1", "Release 1 instructions")
|
|
78
|
+
|
|
79
|
+
# Upload document with tags
|
|
80
|
+
doc = client.upload_document(
|
|
81
|
+
file_path=Path("agents.md"),
|
|
82
|
+
metadata=DocumentMetadata(...),
|
|
83
|
+
dataset_name=dataset.name
|
|
84
|
+
)
|
|
85
|
+
"""
|
|
86
|
+
|
|
87
|
+
def __init__(
|
|
88
|
+
self,
|
|
89
|
+
api_key: str,
|
|
90
|
+
base_url: str,
|
|
91
|
+
version: str = "v1",
|
|
92
|
+
timeout: int = 30,
|
|
93
|
+
embedding_model: str | None = None,
|
|
94
|
+
chunk_method: str = "naive",
|
|
95
|
+
parser_config: JsonDict | None = None,
|
|
96
|
+
page_size: int = 1000
|
|
97
|
+
):
|
|
98
|
+
"""
|
|
99
|
+
Initialize RAGFlow client.
|
|
100
|
+
|
|
101
|
+
Args:
|
|
102
|
+
api_key: RAGFlow API key (format: ragflow-xxxx)
|
|
103
|
+
base_url: RAGFlow instance URL (e.g., http://ragflow.local)
|
|
104
|
+
version: API version (default: v1)
|
|
105
|
+
timeout: Request timeout in seconds (default: 30)
|
|
106
|
+
embedding_model: Embedding model (format: model_name@provider, e.g., text-embedding-3-small@OpenAI)
|
|
107
|
+
chunk_method: Chunking method (default: naive)
|
|
108
|
+
parser_config: Parser configuration dict for chunk_method settings
|
|
109
|
+
page_size: Default page size for list operations (default: 1000)
|
|
110
|
+
|
|
111
|
+
Raises:
|
|
112
|
+
ValueError: If api_key or base_url is empty
|
|
113
|
+
"""
|
|
114
|
+
if not api_key:
|
|
115
|
+
raise ValueError("api_key cannot be empty")
|
|
116
|
+
if not base_url:
|
|
117
|
+
raise ValueError("base_url cannot be empty")
|
|
118
|
+
|
|
119
|
+
self.api_key = api_key
|
|
120
|
+
self.base_url = base_url
|
|
121
|
+
self.version = version
|
|
122
|
+
self.timeout = timeout
|
|
123
|
+
self.embedding_model = embedding_model
|
|
124
|
+
self.chunk_method = chunk_method
|
|
125
|
+
self.parser_config = parser_config or {}
|
|
126
|
+
self.page_size = page_size
|
|
127
|
+
|
|
128
|
+
# Initialize RAGFlow SDK client
|
|
129
|
+
self._client = RAGFlow(api_key=api_key, base_url=base_url, version=version)
|
|
130
|
+
|
|
131
|
+
def _handle_response_error(self, response: Any, operation: str) -> None:
|
|
132
|
+
"""
|
|
133
|
+
Handle API response errors uniformly.
|
|
134
|
+
|
|
135
|
+
Args:
|
|
136
|
+
response: Response object from requests
|
|
137
|
+
operation: Description of the operation for error messages
|
|
138
|
+
|
|
139
|
+
Raises:
|
|
140
|
+
AuthenticationError: For 401/403 errors
|
|
141
|
+
NotFoundError: For 404 errors
|
|
142
|
+
NetworkError: For network-related errors
|
|
143
|
+
RAGFlowClientError: For other errors
|
|
144
|
+
"""
|
|
145
|
+
try:
|
|
146
|
+
if hasattr(response, 'status_code'):
|
|
147
|
+
if response.status_code == 401:
|
|
148
|
+
raise AuthenticationError(
|
|
149
|
+
f"{operation} failed: Invalid API key or expired token"
|
|
150
|
+
)
|
|
151
|
+
elif response.status_code == 403:
|
|
152
|
+
raise AuthenticationError(
|
|
153
|
+
f"{operation} failed: Insufficient permissions"
|
|
154
|
+
)
|
|
155
|
+
elif response.status_code == 404:
|
|
156
|
+
raise NotFoundError(
|
|
157
|
+
f"{operation} failed: Resource not found"
|
|
158
|
+
)
|
|
159
|
+
elif response.status_code >= 500:
|
|
160
|
+
raise NetworkError(
|
|
161
|
+
f"{operation} failed: Server error (status {response.status_code})"
|
|
162
|
+
)
|
|
163
|
+
except Exception as e:
|
|
164
|
+
if isinstance(e, RAGFlowClientError):
|
|
165
|
+
raise
|
|
166
|
+
raise NetworkError(f"{operation} failed: {str(e)}")
|
|
167
|
+
|
|
168
|
+
def create_dataset(
|
|
169
|
+
self,
|
|
170
|
+
name: str,
|
|
171
|
+
description: str = "",
|
|
172
|
+
embedding_model: str | None = None,
|
|
173
|
+
permission: str = "team",
|
|
174
|
+
chunk_method: str | None = None,
|
|
175
|
+
parser_config: JsonDict | None = None
|
|
176
|
+
) -> DataSet:
|
|
177
|
+
"""
|
|
178
|
+
Create a new dataset.
|
|
179
|
+
|
|
180
|
+
Args:
|
|
181
|
+
name: Dataset name
|
|
182
|
+
description: Dataset description
|
|
183
|
+
embedding_model: Embedding model (uses client default if not specified)
|
|
184
|
+
permission: Access permission (default: "team" = shared)
|
|
185
|
+
chunk_method: Chunking method (uses client default if not specified)
|
|
186
|
+
parser_config: Parser configuration dict (uses client default if not specified)
|
|
187
|
+
|
|
188
|
+
Returns:
|
|
189
|
+
Created DataSet object
|
|
190
|
+
|
|
191
|
+
Raises:
|
|
192
|
+
RAGFlowClientError: If creation fails
|
|
193
|
+
"""
|
|
194
|
+
try:
|
|
195
|
+
# Use method parameters or fall back to client defaults
|
|
196
|
+
emb_model = embedding_model if embedding_model is not None else self.embedding_model
|
|
197
|
+
chunk_meth = chunk_method if chunk_method is not None else self.chunk_method
|
|
198
|
+
parser_cfg = parser_config if parser_config is not None else self.parser_config
|
|
199
|
+
|
|
200
|
+
# Build create_dataset kwargs
|
|
201
|
+
kwargs: dict[str, object] = {
|
|
202
|
+
"name": name,
|
|
203
|
+
"description": description,
|
|
204
|
+
"permission": permission,
|
|
205
|
+
"chunk_method": chunk_meth
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
# Add optional parameters if provided
|
|
209
|
+
if emb_model:
|
|
210
|
+
kwargs["embedding_model"] = emb_model
|
|
211
|
+
|
|
212
|
+
# Convert parser_config dict to DataSet.ParserConfig object if needed
|
|
213
|
+
if parser_cfg:
|
|
214
|
+
kwargs["parser_config"] = DataSet.ParserConfig(self._client, parser_cfg)
|
|
215
|
+
|
|
216
|
+
dataset = self._client.create_dataset(**kwargs)
|
|
217
|
+
|
|
218
|
+
return dataset
|
|
219
|
+
|
|
220
|
+
except Exception as e:
|
|
221
|
+
raise RAGFlowClientError(f"Failed to create dataset '{name}': {str(e)}")
|
|
222
|
+
|
|
223
|
+
def list_datasets(
|
|
224
|
+
self,
|
|
225
|
+
page: int = 1,
|
|
226
|
+
page_size: int = 30,
|
|
227
|
+
orderby: str = "create_time",
|
|
228
|
+
desc: bool = True,
|
|
229
|
+
id: str | None = None,
|
|
230
|
+
name: str | None = None
|
|
231
|
+
) -> list[DataSet]:
|
|
232
|
+
"""
|
|
233
|
+
List all datasets with optional filtering.
|
|
234
|
+
|
|
235
|
+
Args:
|
|
236
|
+
page: Page number (1-indexed)
|
|
237
|
+
page_size: Number of datasets per page
|
|
238
|
+
orderby: Field to sort by
|
|
239
|
+
desc: Sort in descending order
|
|
240
|
+
id: Filter by dataset ID (exact match)
|
|
241
|
+
name: Filter by dataset name (exact match lookup - will fail if not found)
|
|
242
|
+
|
|
243
|
+
Returns:
|
|
244
|
+
List of DataSet objects
|
|
245
|
+
|
|
246
|
+
Raises:
|
|
247
|
+
RAGFlowClientError: If listing fails
|
|
248
|
+
"""
|
|
249
|
+
try:
|
|
250
|
+
datasets = self._client.list_datasets(
|
|
251
|
+
page=page,
|
|
252
|
+
page_size=page_size,
|
|
253
|
+
orderby=orderby,
|
|
254
|
+
desc=desc,
|
|
255
|
+
id=id,
|
|
256
|
+
name=name
|
|
257
|
+
)
|
|
258
|
+
|
|
259
|
+
return cast(list[DataSet], datasets)
|
|
260
|
+
|
|
261
|
+
except Exception as e:
|
|
262
|
+
# If name/id filter is used and dataset doesn't exist, RAGFlow returns permission error
|
|
263
|
+
# This is expected behavior - return empty list instead of raising
|
|
264
|
+
if (name or id) and "lacks permission" in str(e):
|
|
265
|
+
return []
|
|
266
|
+
raise RAGFlowClientError(f"Failed to list datasets: {str(e)}")
|
|
267
|
+
|
|
268
|
+
def get_dataset(self, id: str | None = None, name: str | None = None) -> DataSet | None:
|
|
269
|
+
"""
|
|
270
|
+
Get a single dataset by ID or name using server-side filtering.
|
|
271
|
+
|
|
272
|
+
Args:
|
|
273
|
+
id: Dataset ID (exact match)
|
|
274
|
+
name: Dataset name (exact match)
|
|
275
|
+
|
|
276
|
+
Returns:
|
|
277
|
+
DataSet object if found, None otherwise
|
|
278
|
+
|
|
279
|
+
Note:
|
|
280
|
+
Provide either id OR name, not both. If both provided, id takes precedence.
|
|
281
|
+
"""
|
|
282
|
+
try:
|
|
283
|
+
if id:
|
|
284
|
+
# Filter by ID
|
|
285
|
+
datasets = self._client.list_datasets(id=id, page_size=1)
|
|
286
|
+
elif name:
|
|
287
|
+
# Filter by name (RAGFlow does substring, we verify exact match)
|
|
288
|
+
datasets = self._client.list_datasets(name=name, page_size=10)
|
|
289
|
+
# Filter for exact match
|
|
290
|
+
datasets = [ds for ds in datasets if ds.name == name]
|
|
291
|
+
else:
|
|
292
|
+
return None
|
|
293
|
+
|
|
294
|
+
if datasets and len(datasets) > 0:
|
|
295
|
+
return datasets[0]
|
|
296
|
+
return None
|
|
297
|
+
|
|
298
|
+
except Exception as e:
|
|
299
|
+
# Check if it's a permission/not found error
|
|
300
|
+
error_msg = str(e).lower()
|
|
301
|
+
if "lacks permission" in error_msg or "not found" in error_msg:
|
|
302
|
+
return None
|
|
303
|
+
raise RAGFlowClientError(f"Failed to get dataset: {str(e)}")
|
|
304
|
+
|
|
305
|
+
def delete_datasets(self, ids: list[str]) -> None:
|
|
306
|
+
"""
|
|
307
|
+
Delete datasets by IDs.
|
|
308
|
+
|
|
309
|
+
Args:
|
|
310
|
+
ids: List of dataset IDs to delete
|
|
311
|
+
|
|
312
|
+
Raises:
|
|
313
|
+
RAGFlowClientError: If deletion fails
|
|
314
|
+
"""
|
|
315
|
+
try:
|
|
316
|
+
self._client.delete_datasets(ids=ids)
|
|
317
|
+
|
|
318
|
+
except Exception as e:
|
|
319
|
+
raise RAGFlowClientError(f"Failed to delete datasets: {str(e)}")
|
|
320
|
+
|
|
321
|
+
def _ensure_dataset(self, name: str, description: str = "") -> DataSet:
|
|
322
|
+
"""
|
|
323
|
+
Get dataset if exists, create if not.
|
|
324
|
+
|
|
325
|
+
Args:
|
|
326
|
+
name: Dataset name
|
|
327
|
+
description: Dataset description (used if creating)
|
|
328
|
+
|
|
329
|
+
Returns:
|
|
330
|
+
DataSet object
|
|
331
|
+
"""
|
|
332
|
+
dataset = self.get_dataset(name=name)
|
|
333
|
+
if dataset is not None:
|
|
334
|
+
return dataset
|
|
335
|
+
|
|
336
|
+
# Dataset doesn't exist, create it
|
|
337
|
+
return self.create_dataset(name, description)
|
|
338
|
+
|
|
339
|
+
def _resolve_dataset_name(self, template: str, release: str | None) -> str:
|
|
340
|
+
"""
|
|
341
|
+
Resolve dataset name from template.
|
|
342
|
+
|
|
343
|
+
Args:
|
|
344
|
+
template: Name template (e.g., "aia-{release}")
|
|
345
|
+
release: Release identifier (e.g., "r1")
|
|
346
|
+
|
|
347
|
+
Returns:
|
|
348
|
+
Resolved dataset name
|
|
349
|
+
|
|
350
|
+
Examples:
|
|
351
|
+
>>> _resolve_dataset_name("aia-{release}", "r1")
|
|
352
|
+
"aia-r1"
|
|
353
|
+
>>> _resolve_dataset_name("aia", None)
|
|
354
|
+
"aia"
|
|
355
|
+
"""
|
|
356
|
+
if release and "{release}" in template:
|
|
357
|
+
return template.format(release=release)
|
|
358
|
+
return template
|
|
359
|
+
|
|
360
|
+
def _build_title_with_tags(self, tags: list[str], filename: str) -> str:
|
|
361
|
+
"""
|
|
362
|
+
Build document title.
|
|
363
|
+
|
|
364
|
+
Tags are stored in metadata only, not in the title.
|
|
365
|
+
|
|
366
|
+
Args:
|
|
367
|
+
tags: List of tags (unused, kept for compatibility)
|
|
368
|
+
filename: Original filename (with extension)
|
|
369
|
+
|
|
370
|
+
Returns:
|
|
371
|
+
Filename as title
|
|
372
|
+
"""
|
|
373
|
+
return filename
|
|
374
|
+
|
|
375
|
+
def upload_document(
|
|
376
|
+
self,
|
|
377
|
+
file_path: Path | None = None,
|
|
378
|
+
metadata: DocumentMetadata | None = None,
|
|
379
|
+
dataset_name: str | None = None,
|
|
380
|
+
dataset_template: str = "aia-{release}",
|
|
381
|
+
force: bool = False,
|
|
382
|
+
content: bytes | None = None # NEW: Pre-read content from cache
|
|
383
|
+
) -> tuple[DocumentLike, str] | None:
|
|
384
|
+
"""
|
|
385
|
+
Upload document with upsert semantics and change detection.
|
|
386
|
+
|
|
387
|
+
OPTIMIZED: Now accepts pre-read content to avoid redundant file I/O.
|
|
388
|
+
|
|
389
|
+
This method:
|
|
390
|
+
1. Resolves dataset name from template + release
|
|
391
|
+
2. Ensures dataset exists
|
|
392
|
+
3. Builds title with tag prefixes
|
|
393
|
+
4. Checks if document exists (by ims_doc_id)
|
|
394
|
+
5. Compares content hash (skip if unchanged, unless force=True)
|
|
395
|
+
6. Deletes existing document if changed
|
|
396
|
+
7. Uploads new document with metadata
|
|
397
|
+
|
|
398
|
+
Args:
|
|
399
|
+
file_path: Path to file (for filename, backward compatibility)
|
|
400
|
+
metadata: Document metadata with pre-calculated hash
|
|
401
|
+
dataset_name: Base dataset name or template
|
|
402
|
+
dataset_template: Template for dataset name resolution
|
|
403
|
+
force: Force upload even if unchanged
|
|
404
|
+
content: Pre-read file content (NEW - avoids re-reading file)
|
|
405
|
+
|
|
406
|
+
Returns:
|
|
407
|
+
Tuple of (Document, dataset_id), or None if skipped (unchanged)
|
|
408
|
+
|
|
409
|
+
Raises:
|
|
410
|
+
FileNotFoundError: If file_path does not exist (legacy path)
|
|
411
|
+
RAGFlowClientError: If upload fails
|
|
412
|
+
|
|
413
|
+
Examples:
|
|
414
|
+
>>> # New optimized way (with DocumentData)
|
|
415
|
+
>>> cache = DocumentData.from_file(path, workspace)
|
|
416
|
+
>>> doc, dataset_id = client.upload_document(
|
|
417
|
+
... file_path=cache.file_path,
|
|
418
|
+
... metadata=metadata,
|
|
419
|
+
... dataset_name="aia",
|
|
420
|
+
... content=cache.content # Pre-read content
|
|
421
|
+
... )
|
|
422
|
+
"""
|
|
423
|
+
# If content not provided, fall back to reading file (backward compatibility)
|
|
424
|
+
if content is None:
|
|
425
|
+
if file_path is None or not file_path.exists():
|
|
426
|
+
raise FileNotFoundError(f"File not found: {file_path}")
|
|
427
|
+
content = file_path.read_bytes()
|
|
428
|
+
if metadata is None:
|
|
429
|
+
raise ValueError("metadata is required")
|
|
430
|
+
|
|
431
|
+
# Hash should already be in metadata (calculated in DocumentData)
|
|
432
|
+
# No need to recalculate it here
|
|
433
|
+
actual_hash = metadata.content_hash
|
|
434
|
+
|
|
435
|
+
# Resolve dataset name
|
|
436
|
+
resolved_name = self._resolve_dataset_name(
|
|
437
|
+
dataset_template if "{release}" in dataset_template else (dataset_name or dataset_template),
|
|
438
|
+
metadata.release
|
|
439
|
+
)
|
|
440
|
+
|
|
441
|
+
# Ensure dataset exists
|
|
442
|
+
dataset = self._ensure_dataset(
|
|
443
|
+
resolved_name,
|
|
444
|
+
f"IMS Knowledge - Release {metadata.release}" if metadata.release else "IMS Knowledge"
|
|
445
|
+
)
|
|
446
|
+
|
|
447
|
+
# Build display name from normalized doc title when available.
|
|
448
|
+
# For R1, doc_title is filename; for R2, doc_title is logical path.
|
|
449
|
+
# This prevents R2 collisions like SKILL(7).md from repeated bare filenames.
|
|
450
|
+
filename = metadata.doc_title or (file_path.name if file_path else "")
|
|
451
|
+
title = self._build_title_with_tags(metadata.tags, filename)
|
|
452
|
+
|
|
453
|
+
# Check if document exists by searching for ims_doc_id in metadata
|
|
454
|
+
start_time = time.time()
|
|
455
|
+
|
|
456
|
+
# Use server-side metadata filtering to find document by ims_doc_id.
|
|
457
|
+
# RAGFlow may return ownership-style errors when the filtered lookup
|
|
458
|
+
# misses a document in team-shared datasets; treat that as "not found".
|
|
459
|
+
try:
|
|
460
|
+
existing_docs = self.list_documents(
|
|
461
|
+
dataset=dataset,
|
|
462
|
+
metadata_condition={
|
|
463
|
+
"logic": "and",
|
|
464
|
+
"conditions": [{
|
|
465
|
+
"name": "ims_doc_id",
|
|
466
|
+
"comparison_operator": "is",
|
|
467
|
+
"value": metadata.ims_doc_id
|
|
468
|
+
}]
|
|
469
|
+
},
|
|
470
|
+
page_size=1
|
|
471
|
+
)
|
|
472
|
+
except RAGFlowClientError as e:
|
|
473
|
+
msg = str(e).lower()
|
|
474
|
+
if (
|
|
475
|
+
"you don't own" in msg
|
|
476
|
+
or "you do not own" in msg
|
|
477
|
+
or "lacks permission" in msg
|
|
478
|
+
):
|
|
479
|
+
existing_docs = []
|
|
480
|
+
else:
|
|
481
|
+
raise
|
|
482
|
+
|
|
483
|
+
existing_doc = existing_docs[0] if existing_docs else None
|
|
484
|
+
|
|
485
|
+
if existing_doc:
|
|
486
|
+
# Check if content changed by comparing hashes
|
|
487
|
+
existing_meta = getattr(existing_doc, 'meta_fields', {}) or {}
|
|
488
|
+
|
|
489
|
+
# Handle both dict and Base object formats
|
|
490
|
+
if isinstance(existing_meta, dict):
|
|
491
|
+
existing_hash = existing_meta.get("content_hash")
|
|
492
|
+
else:
|
|
493
|
+
# It's a Base object, access as attribute
|
|
494
|
+
existing_hash = getattr(existing_meta, 'content_hash', None)
|
|
495
|
+
|
|
496
|
+
if not force and existing_hash and existing_hash == actual_hash:
|
|
497
|
+
# Content unchanged, skip upload
|
|
498
|
+
elapsed = time.time() - start_time
|
|
499
|
+
print(f" ⏩ Skipped (unchanged, {elapsed:.2f}s): {title}")
|
|
500
|
+
return None
|
|
501
|
+
|
|
502
|
+
# Content changed, delete old version
|
|
503
|
+
dataset.delete_documents([existing_doc.id])
|
|
504
|
+
print(f" 🔄 Updating: {title}")
|
|
505
|
+
else:
|
|
506
|
+
print(f" ⬆️ Uploading: {title}")
|
|
507
|
+
|
|
508
|
+
# Upload document
|
|
509
|
+
try:
|
|
510
|
+
documents = dataset.upload_documents([{
|
|
511
|
+
"display_name": title,
|
|
512
|
+
"blob": content
|
|
513
|
+
}])
|
|
514
|
+
|
|
515
|
+
if not documents:
|
|
516
|
+
raise RAGFlowClientError("Upload returned no documents")
|
|
517
|
+
|
|
518
|
+
doc = documents[0]
|
|
519
|
+
|
|
520
|
+
# Update metadata
|
|
521
|
+
meta_fields: JsonDict = {
|
|
522
|
+
"ims_doc_id": metadata.ims_doc_id,
|
|
523
|
+
"tags": metadata.tags,
|
|
524
|
+
"domain": metadata.domain,
|
|
525
|
+
"release": metadata.release,
|
|
526
|
+
"content_hash": metadata.content_hash,
|
|
527
|
+
"original_path": metadata.original_path,
|
|
528
|
+
"sort_order": metadata.sort_order,
|
|
529
|
+
"doc_title": metadata.doc_title,
|
|
530
|
+
}
|
|
531
|
+
if metadata.line_count is not None:
|
|
532
|
+
meta_fields["line_count"] = metadata.line_count
|
|
533
|
+
if metadata.resource_path is not None:
|
|
534
|
+
meta_fields["resource_path"] = metadata.resource_path
|
|
535
|
+
frontmatter_value = getattr(metadata, 'frontmatter', None)
|
|
536
|
+
if frontmatter_value is not None:
|
|
537
|
+
meta_fields["frontmatter"] = frontmatter_value
|
|
538
|
+
|
|
539
|
+
doc.update({"meta_fields": meta_fields})
|
|
540
|
+
updated_meta = getattr(doc, 'meta_fields', None)
|
|
541
|
+
if updated_meta:
|
|
542
|
+
# SDK may return a Base object or a dict; handle both
|
|
543
|
+
if isinstance(updated_meta, dict):
|
|
544
|
+
meta_tags = updated_meta.get('tags', [])
|
|
545
|
+
meta_fm = updated_meta.get('frontmatter')
|
|
546
|
+
else:
|
|
547
|
+
meta_tags = getattr(updated_meta, 'tags', []) or []
|
|
548
|
+
meta_fm = getattr(updated_meta, 'frontmatter', None)
|
|
549
|
+
tag_count = len(meta_tags) if isinstance(meta_tags, list) else 0
|
|
550
|
+
print(f" ✅ Metadata set: {tag_count} tags, frontmatter={'yes' if meta_fm else 'no'}")
|
|
551
|
+
else:
|
|
552
|
+
print(f" ⚠️ Metadata update returned empty meta_fields!")
|
|
553
|
+
|
|
554
|
+
elapsed = time.time() - start_time
|
|
555
|
+
print(f" ✅ Done ({elapsed:.2f}s): {title}")
|
|
556
|
+
|
|
557
|
+
# Return doc object and dataset ID for parsing
|
|
558
|
+
# doc.id is RAGFlow's internal document ID needed for parsing
|
|
559
|
+
return (cast(DocumentLike, doc), dataset.id)
|
|
560
|
+
|
|
561
|
+
except Exception as e:
|
|
562
|
+
raise RAGFlowClientError(f"Failed to upload document '{title}': {str(e)}")
|
|
563
|
+
|
|
564
|
+
def trigger_parse(self, dataset_id: str, document_ids: list[str]) -> None:
|
|
565
|
+
"""
|
|
566
|
+
Trigger async parsing for documents.
|
|
567
|
+
|
|
568
|
+
Args:
|
|
569
|
+
dataset_id: Dataset ID containing documents
|
|
570
|
+
document_ids: List of document IDs to parse
|
|
571
|
+
|
|
572
|
+
Raises:
|
|
573
|
+
RAGFlowClientError: If parsing trigger fails
|
|
574
|
+
"""
|
|
575
|
+
dataset = self.get_dataset(id=dataset_id)
|
|
576
|
+
if not dataset:
|
|
577
|
+
raise NotFoundError(f"Dataset not found: {dataset_id}")
|
|
578
|
+
|
|
579
|
+
try:
|
|
580
|
+
dataset.async_parse_documents(document_ids)
|
|
581
|
+
except Exception as e:
|
|
582
|
+
raise RAGFlowClientError(f"Failed to trigger parsing: {str(e)}")
|
|
583
|
+
|
|
584
|
+
def parse_documents_batch(
|
|
585
|
+
self,
|
|
586
|
+
documents: list[JsonDict],
|
|
587
|
+
silent: bool = False
|
|
588
|
+
) -> dict[str, list[str]]:
|
|
589
|
+
"""
|
|
590
|
+
Trigger parsing for multiple documents across datasets.
|
|
591
|
+
|
|
592
|
+
Groups documents by dataset and triggers parsing for each group.
|
|
593
|
+
This is more efficient than calling trigger_parse separately for each document.
|
|
594
|
+
|
|
595
|
+
Args:
|
|
596
|
+
documents: List of {"id": doc_id, "name": name, "dataset_id": dataset_id}
|
|
597
|
+
silent: If True, don't print progress messages
|
|
598
|
+
|
|
599
|
+
Returns:
|
|
600
|
+
Dict with "success" and "failed" lists of dataset_ids
|
|
601
|
+
|
|
602
|
+
Examples:
|
|
603
|
+
>>> documents = [
|
|
604
|
+
... {"id": "doc1", "name": "file1.md", "dataset_id": "dataset_a"},
|
|
605
|
+
... {"id": "doc2", "name": "file2.md", "dataset_id": "dataset_a"},
|
|
606
|
+
... {"id": "doc3", "name": "file3.md", "dataset_id": "dataset_b"}
|
|
607
|
+
... ]
|
|
608
|
+
>>> result = client.parse_documents_batch(documents)
|
|
609
|
+
>>> print(result["success"]) # ["dataset_a", "dataset_b"]
|
|
610
|
+
"""
|
|
611
|
+
# Group documents by dataset
|
|
612
|
+
by_dataset: dict[str, list[JsonDict]] = {}
|
|
613
|
+
for doc in documents:
|
|
614
|
+
dataset_id = str(doc["dataset_id"])
|
|
615
|
+
if dataset_id not in by_dataset:
|
|
616
|
+
by_dataset[dataset_id] = []
|
|
617
|
+
by_dataset[dataset_id].append(doc)
|
|
618
|
+
|
|
619
|
+
if not silent:
|
|
620
|
+
print(f"\n📄 Parsing {len(documents)} document(s)...")
|
|
621
|
+
|
|
622
|
+
# Track success/failures
|
|
623
|
+
success_datasets: list[str] = []
|
|
624
|
+
failed_datasets: list[str] = []
|
|
625
|
+
|
|
626
|
+
# Trigger parsing per dataset
|
|
627
|
+
for dataset_id, docs in by_dataset.items():
|
|
628
|
+
doc_ids = [str(d["id"]) for d in docs]
|
|
629
|
+
if not silent:
|
|
630
|
+
print(f" → Triggering parse for {len(doc_ids)} documents in dataset {dataset_id}")
|
|
631
|
+
print(f" → Document IDs: {doc_ids[:3]}{'...' if len(doc_ids) > 3 else ''}")
|
|
632
|
+
|
|
633
|
+
try:
|
|
634
|
+
self.trigger_parse(dataset_id, doc_ids)
|
|
635
|
+
success_datasets.append(dataset_id)
|
|
636
|
+
except Exception as e:
|
|
637
|
+
failed_datasets.append(dataset_id)
|
|
638
|
+
if not silent:
|
|
639
|
+
print(f" ✗ Parse trigger failed: {e}")
|
|
640
|
+
print(f" ℹ️ Documents uploaded but not parsed. Check RAGFlow UI.")
|
|
641
|
+
|
|
642
|
+
return {"success": success_datasets, "failed": failed_datasets}
|
|
643
|
+
|
|
644
|
+
def get_parse_status(self, dataset_id: str, document_id: str) -> JsonDict:
|
|
645
|
+
"""
|
|
646
|
+
Get parsing status for a document.
|
|
647
|
+
|
|
648
|
+
Args:
|
|
649
|
+
dataset_id: Dataset ID containing document
|
|
650
|
+
document_id: Document ID to check
|
|
651
|
+
|
|
652
|
+
Returns:
|
|
653
|
+
Dict with keys: id, name, run, progress, chunk_count, token_count, progress_msg
|
|
654
|
+
run values: "UNSTART", "RUNNING", "DONE", "FAIL", "CANCEL"
|
|
655
|
+
|
|
656
|
+
Raises:
|
|
657
|
+
NotFoundError: If document not found
|
|
658
|
+
RAGFlowClientError: If status check fails
|
|
659
|
+
"""
|
|
660
|
+
dataset = self.get_dataset(id=dataset_id)
|
|
661
|
+
if not dataset:
|
|
662
|
+
raise NotFoundError(f"Dataset not found: {dataset_id}")
|
|
663
|
+
|
|
664
|
+
try:
|
|
665
|
+
docs = dataset.list_documents(id=document_id, page_size=1)
|
|
666
|
+
if not docs or len(docs) == 0:
|
|
667
|
+
raise NotFoundError(f"Document not found: {document_id}")
|
|
668
|
+
|
|
669
|
+
doc = docs[0]
|
|
670
|
+
# Handle missing attributes gracefully
|
|
671
|
+
return {
|
|
672
|
+
"id": getattr(doc, 'id', document_id),
|
|
673
|
+
"name": getattr(doc, 'name', 'Unknown'),
|
|
674
|
+
"run": getattr(doc, 'run', 'UNSTART'),
|
|
675
|
+
"progress": getattr(doc, 'progress', 0.0),
|
|
676
|
+
"chunk_count": getattr(doc, 'chunk_count', 0),
|
|
677
|
+
"token_count": getattr(doc, 'token_count', 0),
|
|
678
|
+
"progress_msg": getattr(doc, 'progress_msg', '')
|
|
679
|
+
}
|
|
680
|
+
except NotFoundError:
|
|
681
|
+
raise
|
|
682
|
+
except Exception as e:
|
|
683
|
+
raise RAGFlowClientError(f"Failed to get parse status: {str(e)}")
|
|
684
|
+
|
|
685
|
+
def list_documents(
|
|
686
|
+
self,
|
|
687
|
+
dataset: DatasetLike,
|
|
688
|
+
id: str | None = None,
|
|
689
|
+
name: str | None = None,
|
|
690
|
+
keywords: str | None = None,
|
|
691
|
+
page: int = 1,
|
|
692
|
+
page_size: int = 30,
|
|
693
|
+
orderby: str = "create_time",
|
|
694
|
+
desc: bool = True,
|
|
695
|
+
create_time_from: int = 0,
|
|
696
|
+
create_time_to: int = 0,
|
|
697
|
+
run: list[str] | None = None,
|
|
698
|
+
suffix: list[str] | None = None,
|
|
699
|
+
metadata_condition: JsonDict | None = None
|
|
700
|
+
) -> list[DocumentLike]:
|
|
701
|
+
"""
|
|
702
|
+
List documents in a dataset with enhanced filtering.
|
|
703
|
+
|
|
704
|
+
This method extends the SDK's list_documents with server-side filtering
|
|
705
|
+
support for parse status (run), file types (suffix), and metadata queries.
|
|
706
|
+
|
|
707
|
+
Args:
|
|
708
|
+
dataset: DataSet object to list documents from
|
|
709
|
+
id: Filter by document ID
|
|
710
|
+
name: Filter by document name
|
|
711
|
+
keywords: Keyword search
|
|
712
|
+
page: Page number (1-indexed)
|
|
713
|
+
page_size: Number of documents per page
|
|
714
|
+
orderby: Field to sort by (default: "create_time")
|
|
715
|
+
desc: Sort in descending order
|
|
716
|
+
create_time_from: Unix timestamp for filtering documents created after this time
|
|
717
|
+
create_time_to: Unix timestamp for filtering documents created before this time
|
|
718
|
+
run: Filter by parse status (e.g., ["DONE"], ["FAIL", "UNSTART"])
|
|
719
|
+
Supported values: "UNSTART", "RUNNING", "CANCEL", "DONE", "FAIL"
|
|
720
|
+
suffix: Filter by file extension (e.g., ["pdf", "md"])
|
|
721
|
+
metadata_condition: Metadata filter dict with structure:
|
|
722
|
+
{
|
|
723
|
+
"logic": "and" | "or",
|
|
724
|
+
"conditions": [
|
|
725
|
+
{
|
|
726
|
+
"name": str, # Metadata field name
|
|
727
|
+
"comparison_operator": str, # "is", "contains", "start with", etc.
|
|
728
|
+
"value": any # Comparison value
|
|
729
|
+
}
|
|
730
|
+
]
|
|
731
|
+
}
|
|
732
|
+
|
|
733
|
+
Returns:
|
|
734
|
+
List of Document objects
|
|
735
|
+
|
|
736
|
+
Raises:
|
|
737
|
+
RAGFlowClientError: If listing fails
|
|
738
|
+
|
|
739
|
+
Examples:
|
|
740
|
+
# Filter by parse status
|
|
741
|
+
docs = client.list_documents(dataset, run=["DONE"])
|
|
742
|
+
|
|
743
|
+
# Filter by filename prefix using metadata
|
|
744
|
+
docs = client.list_documents(
|
|
745
|
+
dataset,
|
|
746
|
+
metadata_condition={
|
|
747
|
+
"logic": "and",
|
|
748
|
+
"conditions": [{
|
|
749
|
+
"name": "doc_title",
|
|
750
|
+
"comparison_operator": "start with",
|
|
751
|
+
"value": "agents"
|
|
752
|
+
}]
|
|
753
|
+
}
|
|
754
|
+
)
|
|
755
|
+
|
|
756
|
+
# Combined filters
|
|
757
|
+
docs = client.list_documents(
|
|
758
|
+
dataset,
|
|
759
|
+
run=["FAIL", "UNSTART"],
|
|
760
|
+
suffix=["md", "txt"],
|
|
761
|
+
page_size=self.page_size
|
|
762
|
+
)
|
|
763
|
+
"""
|
|
764
|
+
try:
|
|
765
|
+
# Build query parameters for HTTP API
|
|
766
|
+
params: dict[str, object] = {
|
|
767
|
+
"page": page,
|
|
768
|
+
"page_size": page_size,
|
|
769
|
+
"orderby": orderby,
|
|
770
|
+
"desc": desc,
|
|
771
|
+
}
|
|
772
|
+
|
|
773
|
+
# Add optional standard parameters
|
|
774
|
+
if id is not None:
|
|
775
|
+
params["id"] = id
|
|
776
|
+
if name is not None:
|
|
777
|
+
params["name"] = name
|
|
778
|
+
if keywords is not None:
|
|
779
|
+
params["keywords"] = keywords
|
|
780
|
+
if create_time_from > 0:
|
|
781
|
+
params["create_time_from"] = create_time_from
|
|
782
|
+
if create_time_to > 0:
|
|
783
|
+
params["create_time_to"] = create_time_to
|
|
784
|
+
|
|
785
|
+
# Add enhanced filtering parameters if provided
|
|
786
|
+
if run is not None:
|
|
787
|
+
params["run"] = run
|
|
788
|
+
if suffix is not None:
|
|
789
|
+
params["suffix"] = suffix
|
|
790
|
+
if metadata_condition is not None:
|
|
791
|
+
params["metadata_condition"] = json.dumps(metadata_condition)
|
|
792
|
+
|
|
793
|
+
# Bypass SDK and call HTTP API directly
|
|
794
|
+
# SDK doesn't support run, suffix, metadata_condition parameters
|
|
795
|
+
res = dataset.get(f"/datasets/{dataset.id}/documents", params=params)
|
|
796
|
+
res_json = cast(JsonDict, cast(Any, res).json())
|
|
797
|
+
|
|
798
|
+
if res_json.get("code") != 0:
|
|
799
|
+
raise RAGFlowClientError(f"API error: {res_json.get('message', 'Unknown error')}")
|
|
800
|
+
|
|
801
|
+
# Convert response to Document objects (same as SDK does)
|
|
802
|
+
documents: list[DocumentLike] = []
|
|
803
|
+
data = res_json.get("data", {})
|
|
804
|
+
docs = data.get("docs", []) if isinstance(data, dict) else []
|
|
805
|
+
for doc_dict in docs:
|
|
806
|
+
if isinstance(doc_dict, dict):
|
|
807
|
+
documents.append(cast(DocumentLike, Document(cast(Any, dataset).rag, doc_dict)))
|
|
808
|
+
|
|
809
|
+
return documents
|
|
810
|
+
|
|
811
|
+
except Exception as e:
|
|
812
|
+
raise RAGFlowClientError(f"Failed to list documents: {str(e)}")
|
|
813
|
+
|
|
814
|
+
def _filter_by_metadata(self, docs: list[DocumentLike], condition: JsonDict) -> list[DocumentLike]:
|
|
815
|
+
"""
|
|
816
|
+
Client-side fallback for metadata filtering.
|
|
817
|
+
|
|
818
|
+
Args:
|
|
819
|
+
docs: List of Document objects
|
|
820
|
+
condition: Metadata condition dict
|
|
821
|
+
|
|
822
|
+
Returns:
|
|
823
|
+
Filtered list of Document objects
|
|
824
|
+
"""
|
|
825
|
+
logic = condition.get("logic", "and")
|
|
826
|
+
conditions = condition.get("conditions", [])
|
|
827
|
+
|
|
828
|
+
filtered: list[DocumentLike] = []
|
|
829
|
+
for doc in docs:
|
|
830
|
+
# Get document metadata
|
|
831
|
+
meta = getattr(doc, 'meta_fields', {})
|
|
832
|
+
if isinstance(meta, str):
|
|
833
|
+
try:
|
|
834
|
+
meta = json.loads(meta)
|
|
835
|
+
except:
|
|
836
|
+
meta = {}
|
|
837
|
+
|
|
838
|
+
# Evaluate conditions
|
|
839
|
+
matches = []
|
|
840
|
+
for cond in conditions:
|
|
841
|
+
if not isinstance(cond, dict):
|
|
842
|
+
matches.append(False)
|
|
843
|
+
continue
|
|
844
|
+
field_name = cond.get("name")
|
|
845
|
+
operator = cond.get("comparison_operator")
|
|
846
|
+
value = cond.get("value")
|
|
847
|
+
|
|
848
|
+
field_value = meta.get(field_name)
|
|
849
|
+
|
|
850
|
+
# Evaluate condition
|
|
851
|
+
if operator == "is":
|
|
852
|
+
matches.append(field_value == value)
|
|
853
|
+
elif operator == "contains":
|
|
854
|
+
matches.append(str(value) in str(field_value) if field_value is not None else False)
|
|
855
|
+
elif operator == "start with":
|
|
856
|
+
matches.append(str(field_value).startswith(str(value)) if field_value else False)
|
|
857
|
+
elif operator == "end with":
|
|
858
|
+
matches.append(str(field_value).endswith(str(value)) if field_value else False)
|
|
859
|
+
else:
|
|
860
|
+
matches.append(False)
|
|
861
|
+
|
|
862
|
+
# Apply logic
|
|
863
|
+
if logic == "and":
|
|
864
|
+
if all(matches):
|
|
865
|
+
filtered.append(doc)
|
|
866
|
+
elif logic == "or":
|
|
867
|
+
if any(matches):
|
|
868
|
+
filtered.append(doc)
|
|
869
|
+
|
|
870
|
+
return filtered
|
|
871
|
+
|
|
872
|
+
def verify_connection(self) -> bool:
    """
    Probe the API to confirm connection and authentication work.

    Issues a minimal ``list_datasets`` call (``page_size=1``) and reports
    whether it completed without raising.

    Returns:
        True if the probe call succeeded, False if it raised any exception
    """
    try:
        self.list_datasets(page_size=1)
    except Exception:
        # Any failure of the probe is reported as "not connected".
        return False
    else:
        return True
|
|
884
|
+
|
|
885
|
+
def get_system_health(self) -> JsonDict | None:
    """
    Fetch the health status of RAGFlow's dependencies.

    Sends an unauthenticated GET to ``{base_url}/v1/system/healthz``,
    which reports on:
    - Database (MySQL/PostgreSQL)
    - Redis
    - Document Engine (Elasticsearch/Infinity/OpenSearch)
    - Object Storage (MinIO/S3/GCS)

    Note: This endpoint does NOT require authentication.

    Returns:
        The decoded JSON health status, of the form:
        {
            'status': 'ok' or 'nok',
            'db': 'ok' or 'nok',
            'redis': 'ok' or 'nok',
            'doc_engine': 'ok' or 'nok',
            'storage': 'ok' or 'nok',
            '_meta': {  # Optional: Only present if there are issues
                'db': {'elapsed': '12.3', 'error': '...'},
                'redis': {'elapsed': '8.5', 'error': '...'},
                ...
            }
        }
        None if the request fails, the body cannot be decoded, or the
        HTTP status is neither 200 nor 500 (a message is printed in
        those cases).
    """
    try:
        # Plain requests.get on purpose: no auth header is sent because
        # the healthz endpoint does not need one.
        response = requests.get(f"{self.base_url}/v1/system/healthz", timeout=self.timeout)

        # 200 (all OK) and 500 (some services down) both carry a valid
        # JSON health report; anything else is unexpected.
        if response.status_code not in (200, 500):
            print(f"Health check returned unexpected status {response.status_code}")
            return None

        return cast(JsonDict, response.json())
    except Exception as e:
        print(f"Health check failed: {e}")
        return None
|