rosetta-cli 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,928 @@
1
+ """
2
+ RAGFlow Client Wrapper for IMS Publishing
3
+
4
+ This module provides a wrapper around the ragflow-sdk for IMS-specific operations.
5
+
6
+ Key Features:
7
+ - Dataset management with template resolution (aia-{release})
8
+ - Document upload with change detection (MD5 hashing)
9
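+ - Plain-filename document titles; tags are kept in document metadata (the legacy tag-in-title format "[tag1][tag2][tag3] filename.ext" is no longer applied)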
10
+ - Two-stage filtering support (server keyword + client metadata)
11
+ """
12
+
13
+ import hashlib
14
+ import json
15
+ import time
16
+ import requests
17
+ from collections.abc import Sequence
18
+ from dataclasses import dataclass
19
+ from pathlib import Path
20
+ from typing import Any, Dict, List, Optional, cast
21
+
22
+ from ragflow_sdk import RAGFlow
23
+ from ragflow_sdk.modules.dataset import DataSet
24
+ from ragflow_sdk.modules.document import Document
25
+ from .typing_utils import DatasetLike, DocumentLike, JsonDict
26
+
27
+
28
+
29
@dataclass
class DocumentMetadata:
    """Metadata record that accompanies an IMS document uploaded to RAGFlow.

    The first five fields are required; the remaining ones carry defaults.
    Field order is part of the positional constructor and must not change.
    """

    # --- required ---
    tags: list[str]
    domain: str
    release: str
    # Compared against the stored "content_hash" to skip unchanged uploads.
    content_hash: str
    # Key used to look up an already-uploaded copy of the document.
    ims_doc_id: str

    # --- optional ---
    original_path: str = ""
    resource_path: str | None = None
    sort_order: int | None = None
    frontmatter: JsonDict | None = None
    line_count: int | None = None
    doc_title: str = ""  # Bare filename for server-side filtering
43
+
44
+
45
class RAGFlowClientError(Exception):
    """Root of the exception hierarchy raised by the RAGFlow client wrapper."""
48
+
49
+
50
class AuthenticationError(RAGFlowClientError):
    """Raised for authentication or authorization failures (HTTP 401, 403)."""
53
+
54
+
55
class NotFoundError(RAGFlowClientError):
    """Raised when a requested resource does not exist (HTTP 404)."""
58
+
59
+
60
class NetworkError(RAGFlowClientError):
    """Raised for network-level or server-side failures."""
63
+
64
+
65
+ class RAGFlowClient:
66
+ """
67
+ Wrapper class for RAGFlow SDK operations.
68
+
69
+ Provides high-level methods for dataset and document management
70
+ with IMS-specific functionality like tag-in-title format and
71
+ change detection.
72
+
73
+ Usage:
74
+ client = RAGFlowClient(api_key="ragflow-xxx", base_url="http://ragflow.local")
75
+
76
+ # Create/get dataset
77
+ dataset = client.create_dataset("aia-r1", "Release 1 instructions")
78
+
79
+ # Upload document with tags
80
+ doc = client.upload_document(
81
+ file_path=Path("agents.md"),
82
+ metadata=DocumentMetadata(...),
83
+ dataset_name=dataset.name
84
+ )
85
+ """
86
+
87
+ def __init__(
88
+ self,
89
+ api_key: str,
90
+ base_url: str,
91
+ version: str = "v1",
92
+ timeout: int = 30,
93
+ embedding_model: str | None = None,
94
+ chunk_method: str = "naive",
95
+ parser_config: JsonDict | None = None,
96
+ page_size: int = 1000
97
+ ):
98
+ """
99
+ Initialize RAGFlow client.
100
+
101
+ Args:
102
+ api_key: RAGFlow API key (format: ragflow-xxxx)
103
+ base_url: RAGFlow instance URL (e.g., http://ragflow.local)
104
+ version: API version (default: v1)
105
+ timeout: Request timeout in seconds (default: 30)
106
+ embedding_model: Embedding model (format: model_name@provider, e.g., text-embedding-3-small@OpenAI)
107
+ chunk_method: Chunking method (default: naive)
108
+ parser_config: Parser configuration dict for chunk_method settings
109
+ page_size: Default page size for list operations (default: 1000)
110
+
111
+ Raises:
112
+ ValueError: If api_key or base_url is empty
113
+ """
114
+ if not api_key:
115
+ raise ValueError("api_key cannot be empty")
116
+ if not base_url:
117
+ raise ValueError("base_url cannot be empty")
118
+
119
+ self.api_key = api_key
120
+ self.base_url = base_url
121
+ self.version = version
122
+ self.timeout = timeout
123
+ self.embedding_model = embedding_model
124
+ self.chunk_method = chunk_method
125
+ self.parser_config = parser_config or {}
126
+ self.page_size = page_size
127
+
128
+ # Initialize RAGFlow SDK client
129
+ self._client = RAGFlow(api_key=api_key, base_url=base_url, version=version)
130
+
131
+ def _handle_response_error(self, response: Any, operation: str) -> None:
132
+ """
133
+ Handle API response errors uniformly.
134
+
135
+ Args:
136
+ response: Response object from requests
137
+ operation: Description of the operation for error messages
138
+
139
+ Raises:
140
+ AuthenticationError: For 401/403 errors
141
+ NotFoundError: For 404 errors
142
+ NetworkError: For network-related errors
143
+ RAGFlowClientError: For other errors
144
+ """
145
+ try:
146
+ if hasattr(response, 'status_code'):
147
+ if response.status_code == 401:
148
+ raise AuthenticationError(
149
+ f"{operation} failed: Invalid API key or expired token"
150
+ )
151
+ elif response.status_code == 403:
152
+ raise AuthenticationError(
153
+ f"{operation} failed: Insufficient permissions"
154
+ )
155
+ elif response.status_code == 404:
156
+ raise NotFoundError(
157
+ f"{operation} failed: Resource not found"
158
+ )
159
+ elif response.status_code >= 500:
160
+ raise NetworkError(
161
+ f"{operation} failed: Server error (status {response.status_code})"
162
+ )
163
+ except Exception as e:
164
+ if isinstance(e, RAGFlowClientError):
165
+ raise
166
+ raise NetworkError(f"{operation} failed: {str(e)}")
167
+
168
def create_dataset(
    self,
    name: str,
    description: str = "",
    embedding_model: str | None = None,
    permission: str = "team",
    chunk_method: str | None = None,
    parser_config: JsonDict | None = None
) -> DataSet:
    """
    Create a dataset through the SDK, applying client-level defaults.

    Args:
        name: Dataset name
        description: Dataset description
        embedding_model: Embedding model; ``None`` selects the client default
        permission: Access permission (default: "team" = shared)
        chunk_method: Chunking method; ``None`` selects the client default
        parser_config: Parser configuration dict; ``None`` selects the
            client default

    Returns:
        Created DataSet object

    Raises:
        RAGFlowClientError: If creation fails for any reason
    """
    try:
        # An explicit argument wins; None means "use what the client was built with".
        effective_model = self.embedding_model if embedding_model is None else embedding_model
        effective_chunking = self.chunk_method if chunk_method is None else chunk_method
        effective_parser = self.parser_config if parser_config is None else parser_config

        request: dict[str, object] = dict(
            name=name,
            description=description,
            permission=permission,
            chunk_method=effective_chunking,
        )

        # Only forward the optional settings when they carry a value.
        if effective_model:
            request["embedding_model"] = effective_model
        if effective_parser:
            # The SDK expects a ParserConfig object rather than a raw dict.
            request["parser_config"] = DataSet.ParserConfig(self._client, effective_parser)

        return self._client.create_dataset(**request)

    except Exception as e:
        raise RAGFlowClientError(f"Failed to create dataset '{name}': {str(e)}")
222
+
223
def list_datasets(
    self,
    page: int = 1,
    page_size: int = 30,
    orderby: str = "create_time",
    desc: bool = True,
    id: str | None = None,
    name: str | None = None
) -> list[DataSet]:
    """
    Return one page of datasets, optionally filtered by id or name.

    Args:
        page: Page number (1-indexed)
        page_size: Number of datasets per page
        orderby: Field to sort by
        desc: Sort in descending order
        id: Filter by dataset ID
        name: Filter by dataset name

    Returns:
        List of DataSet objects. An empty list is returned when an id/name
        filter was given and the SDK error mentions "lacks permission",
        which is how RAGFlow reports a filtered lookup that found nothing.

    Raises:
        RAGFlowClientError: If listing fails for any other reason
    """
    try:
        listing = self._client.list_datasets(
            page=page,
            page_size=page_size,
            orderby=orderby,
            desc=desc,
            id=id,
            name=name
        )
        return cast(list[DataSet], listing)

    except Exception as e:
        # A filtered lookup that misses surfaces as a permission error;
        # treat that as "no results" rather than a failure.
        if (name or id) and "lacks permission" in str(e):
            return []
        raise RAGFlowClientError(f"Failed to list datasets: {str(e)}")
267
+
268
def get_dataset(self, id: str | None = None, name: str | None = None) -> DataSet | None:
    """
    Get a single dataset by ID or by exact name.

    Name lookups page through the server results until an exact name match
    is found. The server-side name filter may match by substring, so the
    wanted dataset is not guaranteed to be on the first page (for example
    "aia" when many "aia-<release>" datasets exist).

    Args:
        id: Dataset ID (exact match)
        name: Dataset name (exact match)

    Returns:
        DataSet object if found, None otherwise (also None when neither
        id nor name is given)

    Raises:
        RAGFlowClientError: If the lookup fails for a reason other than a
            permission / not-found style error

    Note:
        Provide either id OR name, not both. If both provided, id takes precedence.
    """
    if not id and not name:
        return None

    try:
        if id:
            found = self._client.list_datasets(id=id, page_size=1)
            return found[0] if found else None

        lookup_page_size = 10
        seen_ids: set[object] = set()
        page = 1
        while True:
            batch = self._client.list_datasets(
                name=name, page=page, page_size=lookup_page_size
            )
            if not batch:
                return None

            for candidate in batch:
                if candidate.name == name:
                    return candidate

            # A short page is the last page.
            if len(batch) < lookup_page_size:
                return None

            # Guard against a server that keeps returning the same page:
            # stop as soon as a page contributes nothing new.
            batch_ids = {getattr(candidate, "id", None) for candidate in batch}
            if batch_ids <= seen_ids:
                return None
            seen_ids |= batch_ids
            page += 1

    except Exception as e:
        # RAGFlow reports a filtered lookup miss as a permission / not-found error.
        error_msg = str(e).lower()
        if "lacks permission" in error_msg or "not found" in error_msg:
            return None
        raise RAGFlowClientError(f"Failed to get dataset: {str(e)}") from e
304
+
305
def delete_datasets(self, ids: list[str]) -> None:
    """
    Remove the datasets with the given IDs via the SDK.

    Args:
        ids: List of dataset IDs to delete

    Raises:
        RAGFlowClientError: If the SDK call fails
    """
    try:
        self._client.delete_datasets(ids=ids)
    except Exception as e:
        # Surface every SDK failure through the wrapper's own exception type.
        raise RAGFlowClientError(f"Failed to delete datasets: {str(e)}")
    return None
320
+
321
+ def _ensure_dataset(self, name: str, description: str = "") -> DataSet:
322
+ """
323
+ Get dataset if exists, create if not.
324
+
325
+ Args:
326
+ name: Dataset name
327
+ description: Dataset description (used if creating)
328
+
329
+ Returns:
330
+ DataSet object
331
+ """
332
+ dataset = self.get_dataset(name=name)
333
+ if dataset is not None:
334
+ return dataset
335
+
336
+ # Dataset doesn't exist, create it
337
+ return self.create_dataset(name, description)
338
+
339
+ def _resolve_dataset_name(self, template: str, release: str | None) -> str:
340
+ """
341
+ Resolve dataset name from template.
342
+
343
+ Args:
344
+ template: Name template (e.g., "aia-{release}")
345
+ release: Release identifier (e.g., "r1")
346
+
347
+ Returns:
348
+ Resolved dataset name
349
+
350
+ Examples:
351
+ >>> _resolve_dataset_name("aia-{release}", "r1")
352
+ "aia-r1"
353
+ >>> _resolve_dataset_name("aia", None)
354
+ "aia"
355
+ """
356
+ if release and "{release}" in template:
357
+ return template.format(release=release)
358
+ return template
359
+
360
+ def _build_title_with_tags(self, tags: list[str], filename: str) -> str:
361
+ """
362
+ Build document title.
363
+
364
+ Tags are stored in metadata only, not in the title.
365
+
366
+ Args:
367
+ tags: List of tags (unused, kept for compatibility)
368
+ filename: Original filename (with extension)
369
+
370
+ Returns:
371
+ Filename as title
372
+ """
373
+ return filename
374
+
375
def upload_document(
    self,
    file_path: Path | None = None,
    metadata: DocumentMetadata | None = None,
    dataset_name: str | None = None,
    dataset_template: str = "aia-{release}",
    force: bool = False,
    content: bytes | None = None  # Pre-read content from cache
) -> tuple[DocumentLike, str] | None:
    """
    Upload a document with upsert semantics and hash-based change detection.

    Steps:
    1. Read the file only if no pre-read ``content`` was supplied
    2. Resolve the dataset name (template + release) and ensure it exists
    3. Derive the title (``metadata.doc_title``, else the file name)
    4. Look up an existing document by ``ims_doc_id``
    5. Skip when the stored content hash matches (unless ``force``)
    6. Delete the existing document when it changed
    7. Upload the new content and attach the metadata fields

    Args:
        file_path: Path to file (supplies the fallback title; read only
            when ``content`` is None)
        metadata: Document metadata carrying the pre-calculated hash
        dataset_name: Dataset name used when ``dataset_template`` has no
            ``{release}`` placeholder
        dataset_template: Template for dataset name resolution
        force: Upload even if the stored hash matches
        content: Pre-read file content (avoids re-reading the file)

    Returns:
        Tuple of (Document, dataset_id), or None if skipped (unchanged)

    Raises:
        FileNotFoundError: If ``content`` is None and ``file_path`` is
            missing or does not exist
        ValueError: If ``metadata`` is None
        RAGFlowClientError: If the lookup or upload fails

    Examples:
        >>> cache = DocumentData.from_file(path, workspace)
        >>> doc, dataset_id = client.upload_document(
        ...     file_path=cache.file_path,
        ...     metadata=metadata,
        ...     dataset_name="aia",
        ...     content=cache.content  # Pre-read content
        ... )
    """
    # Legacy path: no pre-read content, so the file itself must be readable.
    if content is None:
        if file_path is None or not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        content = file_path.read_bytes()
    if metadata is None:
        raise ValueError("metadata is required")

    # The hash is pre-computed by the caller; it is not recalculated here.
    expected_hash = metadata.content_hash

    # A template with a {release} placeholder wins over dataset_name.
    if "{release}" in dataset_template:
        name_source = dataset_template
    else:
        name_source = dataset_name or dataset_template
    target_name = self._resolve_dataset_name(name_source, metadata.release)

    if metadata.release:
        dataset_description = f"IMS Knowledge - Release {metadata.release}"
    else:
        dataset_description = "IMS Knowledge"
    dataset = self._ensure_dataset(target_name, dataset_description)

    # Prefer the normalized doc title; for R1 it is the filename, for R2 the
    # logical path, which avoids collisions such as SKILL(7).md.
    filename = metadata.doc_title or (file_path.name if file_path else "")
    title = self._build_title_with_tags(metadata.tags, filename)

    start_time = time.time()

    # Server-side metadata filter locating a previous upload of this document.
    lookup_condition = {
        "logic": "and",
        "conditions": [{
            "name": "ims_doc_id",
            "comparison_operator": "is",
            "value": metadata.ims_doc_id
        }]
    }
    try:
        matches = self.list_documents(
            dataset=dataset,
            metadata_condition=lookup_condition,
            page_size=1
        )
    except RAGFlowClientError as e:
        # RAGFlow may answer a filtered miss in a team-shared dataset with
        # an ownership-style error; treat that as "not found".
        msg = str(e).lower()
        ownership_markers = ("you don't own", "you do not own", "lacks permission")
        if not any(marker in msg for marker in ownership_markers):
            raise
        matches = []

    existing_doc = matches[0] if matches else None

    if existing_doc:
        stored_meta = getattr(existing_doc, 'meta_fields', {}) or {}

        # meta_fields may be a dict or an attribute-style Base object.
        if isinstance(stored_meta, dict):
            stored_hash = stored_meta.get("content_hash")
        else:
            stored_hash = getattr(stored_meta, 'content_hash', None)

        should_skip = (not force) and bool(stored_hash) and stored_hash == expected_hash
        if should_skip:
            elapsed = time.time() - start_time
            print(f" ⏩ Skipped (unchanged, {elapsed:.2f}s): {title}")
            return None

        # Changed (or forced): remove the old version before re-uploading.
        dataset.delete_documents([existing_doc.id])
        print(f" 🔄 Updating: {title}")
    else:
        print(f" ⬆️ Uploading: {title}")

    try:
        documents = dataset.upload_documents([{
            "display_name": title,
            "blob": content
        }])

        if not documents:
            raise RAGFlowClientError("Upload returned no documents")

        doc = documents[0]

        # Always-present metadata fields.
        meta_fields: JsonDict = {
            "ims_doc_id": metadata.ims_doc_id,
            "tags": metadata.tags,
            "domain": metadata.domain,
            "release": metadata.release,
            "content_hash": metadata.content_hash,
            "original_path": metadata.original_path,
            "sort_order": metadata.sort_order,
            "doc_title": metadata.doc_title,
        }
        # Optional fields are only written when they carry a value.
        if metadata.line_count is not None:
            meta_fields["line_count"] = metadata.line_count
        if metadata.resource_path is not None:
            meta_fields["resource_path"] = metadata.resource_path
        frontmatter_value = getattr(metadata, 'frontmatter', None)
        if frontmatter_value is not None:
            meta_fields["frontmatter"] = frontmatter_value

        doc.update({"meta_fields": meta_fields})

        # Report what the document now carries.
        updated_meta = getattr(doc, 'meta_fields', None)
        if not updated_meta:
            print(f" ⚠️ Metadata update returned empty meta_fields!")
        else:
            # SDK may return a Base object or a dict; handle both
            if isinstance(updated_meta, dict):
                meta_tags = updated_meta.get('tags', [])
                meta_fm = updated_meta.get('frontmatter')
            else:
                meta_tags = getattr(updated_meta, 'tags', []) or []
                meta_fm = getattr(updated_meta, 'frontmatter', None)
            tag_count = len(meta_tags) if isinstance(meta_tags, list) else 0
            print(f" ✅ Metadata set: {tag_count} tags, frontmatter={'yes' if meta_fm else 'no'}")

        elapsed = time.time() - start_time
        print(f" ✅ Done ({elapsed:.2f}s): {title}")

        # doc.id is RAGFlow's internal document ID, needed later for parsing.
        return (cast(DocumentLike, doc), dataset.id)

    except Exception as e:
        raise RAGFlowClientError(f"Failed to upload document '{title}': {str(e)}")
563
+
564
def trigger_parse(self, dataset_id: str, document_ids: list[str]) -> None:
    """
    Start asynchronous parsing of documents in one dataset.

    Args:
        dataset_id: Dataset ID containing the documents
        document_ids: List of document IDs to parse

    Raises:
        NotFoundError: If the dataset cannot be found
        RAGFlowClientError: If the parse request fails
    """
    target = self.get_dataset(id=dataset_id)
    if not target:
        raise NotFoundError(f"Dataset not found: {dataset_id}")

    try:
        target.async_parse_documents(document_ids)
    except Exception as e:
        raise RAGFlowClientError(f"Failed to trigger parsing: {str(e)}")
583
+
584
def parse_documents_batch(
    self,
    documents: list[JsonDict],
    silent: bool = False
) -> dict[str, list[str]]:
    """
    Trigger parsing for many documents, one request per dataset.

    Documents are grouped by their ``dataset_id`` and ``trigger_parse`` is
    called once per group. A failure in one dataset is recorded and does
    not stop the remaining datasets.

    Args:
        documents: List of {"id": doc_id, "name": name, "dataset_id": dataset_id}
        silent: If True, don't print progress messages

    Returns:
        Dict with "success" and "failed" lists of dataset_ids

    Examples:
        >>> documents = [
        ...     {"id": "doc1", "name": "file1.md", "dataset_id": "dataset_a"},
        ...     {"id": "doc2", "name": "file2.md", "dataset_id": "dataset_a"},
        ...     {"id": "doc3", "name": "file3.md", "dataset_id": "dataset_b"}
        ... ]
        >>> result = client.parse_documents_batch(documents)
        >>> print(result["success"])  # ["dataset_a", "dataset_b"]
    """
    # Bucket the documents per dataset, keeping first-seen order.
    grouped: dict[str, list[JsonDict]] = {}
    for entry in documents:
        grouped.setdefault(str(entry["dataset_id"]), []).append(entry)

    verbose = not silent
    if verbose:
        print(f"\n📄 Parsing {len(documents)} document(s)...")

    outcome: dict[str, list[str]] = {"success": [], "failed": []}

    for dataset_id, members in grouped.items():
        doc_ids = [str(member["id"]) for member in members]
        if verbose:
            print(f" → Triggering parse for {len(doc_ids)} documents in dataset {dataset_id}")
            print(f" → Document IDs: {doc_ids[:3]}{'...' if len(doc_ids) > 3 else ''}")

        try:
            self.trigger_parse(dataset_id, doc_ids)
        except Exception as e:
            # Record the failure and carry on with the next dataset.
            outcome["failed"].append(dataset_id)
            if verbose:
                print(f" ✗ Parse trigger failed: {e}")
                print(f" ℹ️ Documents uploaded but not parsed. Check RAGFlow UI.")
        else:
            outcome["success"].append(dataset_id)

    return outcome
643
+
644
def get_parse_status(self, dataset_id: str, document_id: str) -> JsonDict:
    """
    Report the parsing status of one document.

    Args:
        dataset_id: Dataset ID containing the document
        document_id: Document ID to check

    Returns:
        Dict with keys: id, name, run, progress, chunk_count, token_count, progress_msg
        run values: "UNSTART", "RUNNING", "DONE", "FAIL", "CANCEL"
        Attributes missing on the document fall back to defaults.

    Raises:
        NotFoundError: If the dataset or the document is not found
        RAGFlowClientError: If the status check fails
    """
    owner = self.get_dataset(id=dataset_id)
    if not owner:
        raise NotFoundError(f"Dataset not found: {dataset_id}")

    try:
        matches = owner.list_documents(id=document_id, page_size=1)
        if not matches:
            raise NotFoundError(f"Document not found: {document_id}")

        doc = matches[0]
        # (field, fallback used when the document lacks the attribute)
        fallbacks = (
            ("id", document_id),
            ("name", 'Unknown'),
            ("run", 'UNSTART'),
            ("progress", 0.0),
            ("chunk_count", 0),
            ("token_count", 0),
            ("progress_msg", ''),
        )
        return {field: getattr(doc, field, default) for field, default in fallbacks}
    except NotFoundError:
        # Keep the specific error type instead of wrapping it below.
        raise
    except Exception as e:
        raise RAGFlowClientError(f"Failed to get parse status: {str(e)}")
684
+
685
def list_documents(
    self,
    dataset: DatasetLike,
    id: str | None = None,
    name: str | None = None,
    keywords: str | None = None,
    page: int = 1,
    page_size: int = 30,
    orderby: str = "create_time",
    desc: bool = True,
    create_time_from: int = 0,
    create_time_to: int = 0,
    run: list[str] | None = None,
    suffix: list[str] | None = None,
    metadata_condition: JsonDict | None = None
) -> list[DocumentLike]:
    """
    List documents in a dataset with server-side filtering.

    The request goes straight to the HTTP endpoint through ``dataset.get``
    so that the ``run``, ``suffix`` and ``metadata_condition`` filters can
    be sent along with the standard listing parameters.

    Args:
        dataset: DataSet object to list documents from
        id: Filter by document ID
        name: Filter by document name
        keywords: Keyword search
        page: Page number (1-indexed)
        page_size: Number of documents per page
        orderby: Field to sort by (default: "create_time")
        desc: Sort in descending order
        create_time_from: Unix timestamp lower bound; sent only when > 0
        create_time_to: Unix timestamp upper bound; sent only when > 0
        run: Filter by parse status (e.g., ["DONE"], ["FAIL", "UNSTART"])
            Supported values: "UNSTART", "RUNNING", "CANCEL", "DONE", "FAIL"
        suffix: Filter by file extension (e.g., ["pdf", "md"])
        metadata_condition: Metadata filter dict, sent JSON-encoded:
            {
                "logic": "and" | "or",
                "conditions": [
                    {
                        "name": str,                 # Metadata field name
                        "comparison_operator": str,  # "is", "contains", "start with", etc.
                        "value": any                 # Comparison value
                    }
                ]
            }

    Returns:
        List of Document objects

    Raises:
        RAGFlowClientError: If the request fails or the API reports a
            non-zero ``code``

    Examples:
        # Filter by parse status
        docs = client.list_documents(dataset, run=["DONE"])

        # Filter by filename prefix using metadata
        docs = client.list_documents(
            dataset,
            metadata_condition={
                "logic": "and",
                "conditions": [{
                    "name": "doc_title",
                    "comparison_operator": "start with",
                    "value": "agents"
                }]
            }
        )

        # Combined filters
        docs = client.list_documents(
            dataset,
            run=["FAIL", "UNSTART"],
            suffix=["md", "txt"],
            page_size=client.page_size
        )
    """
    try:
        # Parameters that are always sent.
        params: dict[str, object] = {
            "page": page,
            "page_size": page_size,
            "orderby": orderby,
            "desc": desc,
        }

        # (key, value, include?) for every optional filter, in request order.
        optional_filters = (
            ("id", id, id is not None),
            ("name", name, name is not None),
            ("keywords", keywords, keywords is not None),
            ("create_time_from", create_time_from, create_time_from > 0),
            ("create_time_to", create_time_to, create_time_to > 0),
            ("run", run, run is not None),
            ("suffix", suffix, suffix is not None),
        )
        for key, value, wanted in optional_filters:
            if wanted:
                params[key] = value

        # The metadata condition travels as a JSON string.
        if metadata_condition is not None:
            params["metadata_condition"] = json.dumps(metadata_condition)

        response = dataset.get(f"/datasets/{dataset.id}/documents", params=params)
        payload = cast(JsonDict, cast(Any, response).json())

        if payload.get("code") != 0:
            raise RAGFlowClientError(f"API error: {payload.get('message', 'Unknown error')}")

        data = payload.get("data", {})
        raw_docs = data.get("docs", []) if isinstance(data, dict) else []

        # Wrap each dict entry in an SDK Document; other entries are dropped.
        return [
            cast(DocumentLike, Document(cast(Any, dataset).rag, entry))
            for entry in raw_docs
            if isinstance(entry, dict)
        ]

    except Exception as e:
        raise RAGFlowClientError(f"Failed to list documents: {str(e)}")
813
+
814
+ def _filter_by_metadata(self, docs: list[DocumentLike], condition: JsonDict) -> list[DocumentLike]:
815
+ """
816
+ Client-side fallback for metadata filtering.
817
+
818
+ Args:
819
+ docs: List of Document objects
820
+ condition: Metadata condition dict
821
+
822
+ Returns:
823
+ Filtered list of Document objects
824
+ """
825
+ logic = condition.get("logic", "and")
826
+ conditions = condition.get("conditions", [])
827
+
828
+ filtered: list[DocumentLike] = []
829
+ for doc in docs:
830
+ # Get document metadata
831
+ meta = getattr(doc, 'meta_fields', {})
832
+ if isinstance(meta, str):
833
+ try:
834
+ meta = json.loads(meta)
835
+ except:
836
+ meta = {}
837
+
838
+ # Evaluate conditions
839
+ matches = []
840
+ for cond in conditions:
841
+ if not isinstance(cond, dict):
842
+ matches.append(False)
843
+ continue
844
+ field_name = cond.get("name")
845
+ operator = cond.get("comparison_operator")
846
+ value = cond.get("value")
847
+
848
+ field_value = meta.get(field_name)
849
+
850
+ # Evaluate condition
851
+ if operator == "is":
852
+ matches.append(field_value == value)
853
+ elif operator == "contains":
854
+ matches.append(str(value) in str(field_value) if field_value is not None else False)
855
+ elif operator == "start with":
856
+ matches.append(str(field_value).startswith(str(value)) if field_value else False)
857
+ elif operator == "end with":
858
+ matches.append(str(field_value).endswith(str(value)) if field_value else False)
859
+ else:
860
+ matches.append(False)
861
+
862
+ # Apply logic
863
+ if logic == "and":
864
+ if all(matches):
865
+ filtered.append(doc)
866
+ elif logic == "or":
867
+ if any(matches):
868
+ filtered.append(doc)
869
+
870
+ return filtered
871
+
872
def verify_connection(self) -> bool:
    """
    Check that the API is reachable and the credentials are accepted.

    Performs a minimal dataset listing; any exception counts as failure.

    Returns:
        True if the listing succeeds, False otherwise
    """
    try:
        self.list_datasets(page_size=1)
    except Exception:
        return False
    return True
884
+
885
def get_system_health(self) -> JsonDict | None:
    """
    Check the health status of RAGFlow's dependencies.

    Calls the /v1/system/healthz endpoint which checks:
    - Database (MySQL/PostgreSQL)
    - Redis
    - Document Engine (Elasticsearch/Infinity/OpenSearch)
    - Object Storage (MinIO/S3/GCS)

    Note: The request is sent without an authentication header.

    A trailing slash on ``base_url`` is stripped before the path is
    appended, so "http://host/" and "http://host" hit the same URL.

    Returns:
        Health status dict with format:
        {
            'status': 'ok' or 'nok',
            'db': 'ok' or 'nok',
            'redis': 'ok' or 'nok',
            'doc_engine': 'ok' or 'nok',
            'storage': 'ok' or 'nok',
            '_meta': {  # Optional: Only present if there are issues
                'db': {'elapsed': '12.3', 'error': '...'},
                'redis': {'elapsed': '8.5', 'error': '...'},
                ...
            }
        }
        Returns None if the request fails, the status code is unexpected,
        or the body is not valid JSON.
    """
    try:
        # Normalise the base URL so a trailing slash cannot yield "//v1/...".
        url = f"{self.base_url.rstrip('/')}/v1/system/healthz"
        response = requests.get(url, timeout=self.timeout)

        # Both 200 (all OK) and 500 (some services down) carry a JSON
        # health report; any other status is unexpected.
        if response.status_code in (200, 500):
            return cast(JsonDict, response.json())

        print(f"Health check returned unexpected status {response.status_code}")
        return None
    except Exception as e:
        # Best-effort probe: report the problem and signal failure with None.
        print(f"Health check failed: {e}")
        return None