rosetta-cli 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,70 @@
1
+ """
2
+ List Command - Display documents in a dataset
3
+ """
4
+
5
+ from ..services.dataset_service import DatasetService
6
+ from ..services.document_service import DocumentService
7
+
8
+ from .base_command import BaseCommand
9
+ from ..typing_utils import CommandArgs
10
+
11
+
12
class ListCommand(BaseCommand):
    """List documents in a RAGFlow dataset."""

    def execute(self, args: CommandArgs) -> int:
        """Execute list-dataset command.

        Args:
            args: Command arguments carrying the target dataset name.

        Returns:
            0 on success, 1 on failure.
        """
        self._start_timing()

        # Resolve dataset name
        dataset_service = DatasetService(self.client, self.config)
        dataset_name, _auto_detected = dataset_service.resolve_dataset_name(args.dataset)

        if not dataset_name:
            print("✗ Failed to resolve dataset name")
            return 1

        # Print header
        print(f"Listing Dataset: {dataset_name}")
        print(f"Environment: {self.config.environment}")
        print(f"RAGFlow Instance: {self.config.base_url}")
        print()

        # Verify authentication.
        # BUG FIX: was "from services.auth_service import AuthService" — an
        # absolute import that only resolves when the package directory itself
        # is on sys.path. Use the relative form like the sibling commands do.
        from ..services.auth_service import AuthService
        AuthService.verify_or_exit(self.client, self.config)

        try:
            # Get dataset
            dataset = self.client.get_dataset(name=dataset_name)

            if not dataset:
                print(f"✗ Dataset '{dataset_name}' not found")
                dataset_service.display_available_datasets()
                return 1

            # List documents
            document_service = DocumentService(self.client)
            documents = dataset.list_documents(page_size=self.config.page_size)

            if not documents:
                print("\nNo documents found in dataset")
                return 0

            print(f"\nFound {len(documents)} document(s):")
            print("=" * 80)

            for i, document in enumerate(documents, 1):
                print(document_service.format_document_display(document, i))

            print("=" * 80)
            self._print_timing()

            return 0

        except Exception as e:
            print(f"\n✗ Error listing dataset: {e}")
            self._print_timing()
            import traceback
            traceback.print_exc()
            return 1
@@ -0,0 +1,205 @@
1
+ """
2
+ Parse Command - Trigger document parsing without re-uploading
3
+ """
4
+
5
+ import time
6
+
7
+ from ..services.dataset_service import DatasetService
8
+ from ..services.document_service import DocumentService
9
+ from ..services.auth_service import AuthService
10
+
11
+
12
+
13
+ from .base_command import BaseCommand
14
+ from ..typing_utils import CommandArgs, DatasetLike, JsonDict
15
+
16
+
17
class ParseCommand(BaseCommand):
    """
    Trigger parsing for documents in a dataset.

    Handles selective parsing (failed/unparsed) or force re-parse all documents.
    """

    def execute(self, args: CommandArgs) -> int:
        """Execute parse command.

        Args:
            args: Command arguments (dataset, force, dry_run, yes, parse_timeout).

        Returns:
            0 on success, 1 on failure or cancellation.
        """
        self._start_timing()

        # CLI flag must override config default for this run.
        self.config.parse_timeout = args.parse_timeout

        # Resolve dataset name
        dataset_service = DatasetService(self.client, self.config)
        dataset_name, auto_detected = dataset_service.resolve_dataset_name(args.dataset)

        if not dataset_name:
            print("✗ Failed to resolve dataset name")
            return 1

        # Print header
        print(f"Parsing Documents in Dataset: {dataset_name}")
        if args.force:
            print("FORCE MODE - Will re-parse ALL documents")
        if args.dry_run:
            print("DRY-RUN MODE - No parsing will be triggered")
        print(f"Environment: {self.config.environment}")
        print(f"RAGFlow Instance: {self.config.base_url}\n")

        # Verify authentication
        AuthService.verify_or_exit(self.client, self.config)
        print()

        try:
            # Get dataset
            dataset = self.client.get_dataset(name=dataset_name)

            if not dataset:
                print(f"✗ Dataset '{dataset_name}' not found")
                dataset_service.display_available_datasets()
                return 1

            # Get documents to parse
            docs_to_parse, status_counts = self._get_documents_to_parse(
                dataset, args
            )

            # Print status
            self._print_parse_status(docs_to_parse, status_counts, args)

            # Dry-run mode
            if args.dry_run:
                return self._handle_dry_run(docs_to_parse, status_counts)

            # Check if anything to parse
            if not docs_to_parse:
                print("✓ All documents are already parsed or currently parsing")
                self._print_timing()
                return 0

            # Confirm only in force mode (re-parsing all documents is destructive)
            if args.force and not self._confirm_parsing(docs_to_parse, args):
                print("\n✗ Parsing cancelled")
                return 1

            # Parse documents
            success, failed = self._parse_and_wait(docs_to_parse, args)

            # Print summary
            self._print_parse_summary(success, failed, status_counts, args)

            return 0 if failed == 0 else 1

        except KeyboardInterrupt:
            print("\n\n✗ Parsing interrupted by user (Ctrl+C)")
            self._print_timing()
            return 1
        except Exception as e:
            print(f"\n✗ Error during parsing: {e}")
            self._print_timing()
            import traceback
            traceback.print_exc()
            return 1

    @staticmethod
    def _build_parse_entry(document, dataset: DatasetLike) -> JsonDict | None:
        """Build the dict entry expected by parse_documents_batch.

        Returns None when the document exposes no ``id`` attribute (such a
        document cannot be addressed for parsing and is skipped).
        """
        doc_id = getattr(document, 'id', None)
        if not doc_id:
            return None
        return {
            "id": doc_id,
            "name": getattr(document, 'name', 'Untitled'),
            "dataset_id": dataset.id,
            "folder": ".",
            "status": getattr(document, 'run', 'UNSTART'),
        }

    def _get_documents_to_parse(self, dataset: DatasetLike, args: CommandArgs) -> tuple[list[JsonDict], dict[str, int]]:
        """Get documents that need parsing.

        Returns:
            Tuple of (docs_to_parse, status_counts).

            NOTE(review): status_counts is never incremented anywhere, so the
            'done'/'running' figures printed by dry-run and the summary are
            always 0 — TODO populate these counts or drop them.
        """
        document_service = DocumentService(self.client)
        status_counts = {'done': 0, 'running': 0}

        print("Checking parsing status for documents...")

        if args.force:
            # Force mode: re-parse every document in the dataset.
            documents = dataset.list_documents(page_size=self.config.page_size)
            print(f"Found {len(documents)} document(s) (force mode - parsing all)\n")
        else:
            # Default mode: only documents whose last run failed, was
            # cancelled, or never started.
            documents = document_service.list_documents_by_status(
                dataset,
                statuses=["FAIL", "UNSTART", "CANCEL"],
                limit=self.config.page_size
            )
            print(f"Found {len(documents)} document(s) needing parsing\n")

        # Single construction path for both modes (was duplicated verbatim).
        docs_to_parse = [
            entry
            for entry in (self._build_parse_entry(document, dataset) for document in documents)
            if entry is not None
        ]

        return docs_to_parse, status_counts

    def _print_parse_status(self, docs_to_parse: list[JsonDict], status_counts: dict[str, int], args: CommandArgs) -> None:
        """Print the list of documents queued for parsing."""
        if docs_to_parse:
            print(f"Documents to parse ({len(docs_to_parse)}):")
            for doc in docs_to_parse:
                print(f" 📄 {doc['name']} ({doc['status']})")
            print()

    def _handle_dry_run(self, docs_to_parse: list[JsonDict], status_counts: dict[str, int]) -> int:
        """Print the dry-run summary and exit successfully (nothing triggered)."""
        print("=" * 80)
        print("Summary (DRY-RUN):")
        print(f" Would parse: {len(docs_to_parse)}")
        print(f" Already done: {status_counts['done']}")
        print(f" Currently running: {status_counts['running']}")
        print("=" * 80)
        self._print_timing()
        return 0

    def _confirm_parsing(self, docs_to_parse: list[JsonDict], args: CommandArgs) -> bool:
        """Ask the user to confirm parsing; --yes skips the prompt."""
        if args.yes:
            return True

        response = input(f"⚠️ Trigger parsing for {len(docs_to_parse)} documents? (yes/no): ")
        return response.lower() in ['yes', 'y']

    def _parse_and_wait(self, docs_to_parse: list[JsonDict], args: CommandArgs) -> tuple[int, int]:
        """Trigger batch parsing and block until completion.

        Returns:
            Tuple of (success_count, failed_count).
        """
        # Use client's batch parsing method (handles grouping and triggering)
        _ = self.client.parse_documents_batch(docs_to_parse, silent=True)

        # Wait with progress bar using DocumentService
        doc_service = DocumentService(self.client)
        success, failed = doc_service.wait_for_parsing(
            docs_to_parse,
            timeout=self.config.parse_timeout
        )

        return success, failed

    def _print_parse_summary(self, success: int, failed: int, status_counts: dict[str, int], args: CommandArgs) -> None:
        """Print parsing summary."""
        print("\n" + "=" * 80)
        print("Parse Summary:")
        print(f" ✓ Successfully parsed: {success}")
        if failed > 0:
            print(f" ✗ Failed: {failed}")
        if not args.force:
            # NOTE(review): these counts are currently always 0 (never
            # incremented) — see _get_documents_to_parse.
            if status_counts['done'] > 0:
                print(f" ⏭️ Skipped (already done): {status_counts['done']}")
            if status_counts['running'] > 0:
                print(f" ⏭️ Skipped (running): {status_counts['running']}")
        print("=" * 80)
        self._print_timing()
@@ -0,0 +1,113 @@
1
+ """
2
+ Publish Command - Upload knowledge base content to RAGFlow
3
+ """
4
+
5
+ from pathlib import Path
6
+
7
+ from ..ims_publisher import ContentPublisher
8
+ from ..services.auth_service import AuthService
9
+ from ..ims_utils import resolve_workspace_root
10
+
11
+ from .base_command import BaseCommand
12
+ from ..typing_utils import CommandArgs
13
+
14
+
15
+
16
class PublishCommand(BaseCommand):
    """
    Publish knowledge base content (files or folders) to RAGFlow.

    Handles single file or recursive folder publishing with optional dry-run mode.
    """

    def execute(self, args: CommandArgs) -> int:
        """
        Execute publish command.

        Args:
            args: Command arguments with path, dry_run, force, no_parse flags

        Returns:
            0 if successful, 1 if failed
        """
        self._start_timing()

        # Header lines: target path plus environment/API-key status.
        print(f"Publishing knowledge base content from: {args.path}")
        self._print_header_with_api_key()
        print()

        # Bail out early if the API key / connection is not valid.
        AuthService.verify_or_exit(self.client, self.config)

        target = Path(args.path.strip()).resolve()
        root = resolve_workspace_root(target)

        publisher = ContentPublisher(
            self.client,
            str(root),
            dataset_default=self.config.dataset_default,
            dataset_template=self.config.dataset_template
        )

        # Dispatch on file vs. folder and capture the exit status.
        status = self._publish_path(publisher, target, args)

        print()
        self._print_timing()

        return status

    def _publish_path(self, publisher: ContentPublisher, path: Path, args: CommandArgs) -> int:
        """
        Publish a file or directory.

        Args:
            publisher: ContentPublisher instance
            path: Path to publish
            args: Command arguments

        Returns:
            Exit code (0 success, 1 failure)
        """
        # Guard-clause dispatch: file, then directory, otherwise error.
        if path.is_file():
            return self._publish_file(publisher, path, args)
        if path.is_dir():
            return self._publish_folder(publisher, path, args)
        print(f"✗ Path not found: {path}")
        return 1

    def _publish_file(self, publisher: ContentPublisher, path: Path, args: CommandArgs) -> int:
        """Publish a single file and report its result."""
        outcome = publisher.publish_file(
            str(path),
            dry_run=args.dry_run,
            force=args.force,
            parse_documents=not args.no_parse,
            wait_for_parsing=True
        )
        print(f"\n{outcome}")
        return 0 if outcome.success else 1

    def _publish_folder(self, publisher: ContentPublisher, path: Path, args: CommandArgs) -> int:
        """Publish a folder recursively; success only if every item succeeded."""
        outcomes = publisher.publish_folder(
            str(path),
            dry_run=args.dry_run,
            recursive=True,
            force=args.force,
            parse_documents=not args.no_parse,
            wait_for_parsing=True
        )

        return 0 if all(r.success for r in outcomes) else 1

    def _print_header_with_api_key(self) -> None:
        """Print header including API key status."""
        print(f"Environment: {self.config.environment}")
        print(f"RAGFlow Instance: {self.config.base_url}")
        print(f"RAGFlow API Key: {'SET' if self.config.api_key else 'NOT SET'}")
@@ -0,0 +1,46 @@
1
+ """
2
+ Verify Command - RAGFlow connection verification
3
+ """
4
+
5
+ from ..services.auth_service import AuthService
6
+
7
+ from .base_command import BaseCommand
8
+ from ..typing_utils import CommandArgs
9
+
10
+
11
class VerifyCommand(BaseCommand):
    """
    Verify RAGFlow API connection and authentication.

    Tests connectivity and API key validity.
    """

    def execute(self, args: CommandArgs) -> int:
        """
        Execute verify command.

        Args:
            args: Command arguments (unused for verify)

        Returns:
            0 if verification successful, 1 if failed
        """
        self._start_timing()

        # Print header
        self._print_header("RAGFlow Connection Verification")
        print()

        # Verify connection using AuthService
        auth_service = AuthService(self.client, self.config)
        success = auth_service.verify_connection()

        # Print result. (Fixed: constant strings no longer carry a pointless
        # f-prefix, and the duplicated _print_timing() call is collapsed.)
        if success:
            print("\n✓ All systems operational")
        else:
            print("\n✗ Connection verification failed")
        self._print_timing()
        return 0 if success else 1
@@ -0,0 +1,124 @@
1
+ """
2
+ IMS Authentication Module
3
+
4
+ Handles API key verification and connection management for RAGFlow.
5
+
6
+ Features:
7
+ - API key authentication
8
+ - Connection verification
9
+ - System health checks
10
+ """
11
+
12
+ from .ims_config import IMSConfig
13
+ from .ragflow_client import AuthenticationError, RAGFlowClient
14
+ from .typing_utils import JsonDict
15
+
16
+
17
class IMSAuthManager:
    """Manages RAGFlow API key authentication and connection verification."""

    def __init__(self, client: RAGFlowClient, config: IMSConfig):
        """
        Initialize the authentication manager.

        Args:
            client: RAGFlow client instance
            config: RAGFlow configuration
        """
        self.client = client
        self.config = config

    def verify_api_key(self) -> tuple[bool, str | None]:
        """
        Verify API key is valid by attempting to list datasets.

        Returns:
            Tuple of (success: bool, error_message: str | None);
            error_message is None on success.
        """
        try:
            # A minimal authenticated call: listing one dataset exercises the key.
            self.client.list_datasets(page_size=1)
            return True, None
        except AuthenticationError as e:
            return False, f"Authentication failed: {e}"
        except Exception as e:
            return False, f"Verification error: {e}"

    def verify_connection(self) -> tuple[bool, str | None]:
        """
        Verify connection to RAGFlow server.

        Returns:
            Tuple of (success: bool, error_message: str | None);
            error_message is None on success.
        """
        try:
            if self.client.verify_connection():
                return True, None
            else:
                return False, f"Connection failed to {self.config.base_url}"
        except Exception as e:
            return False, f"Connection error: {e}"

    def get_server_info(self) -> JsonDict | None:
        """
        Get RAGFlow server information including basic health status.

        Returns:
            Server information dict or None if not available
        """
        try:
            # Get basic server info from the local configuration.
            info: JsonDict = {
                'base_url': self.config.base_url,
                'environment': self.config.environment,
                'dataset_default': self.config.dataset_default,
                'dataset_template': self.config.dataset_template,
            }

            # Try to get health status (non-blocking)
            try:
                health = self.get_system_health()
                if health:
                    info['health_status'] = health.get('status', 'unknown')
                    info['services'] = {
                        'database': health.get('db', 'unknown'),
                        'redis': health.get('redis', 'unknown'),
                        'doc_engine': health.get('doc_engine', 'unknown'),
                        'storage': health.get('storage', 'unknown'),
                    }
            except Exception:
                # Health check failed, but don't fail server info
                info['health_status'] = 'unavailable'

            return info
        except Exception:
            return None

    def get_system_health(self) -> JsonDict | None:
        """
        Check the health status of RAGFlow's dependencies.

        This endpoint checks:
        - Database (MySQL/PostgreSQL)
        - Redis
        - Document Engine (Elasticsearch/Infinity/OpenSearch)
        - Object Storage (MinIO/S3/GCS)

        Returns:
            Health status dict with format::

                {
                    'status': 'ok' or 'nok',
                    'db': 'ok' or 'nok',
                    'redis': 'ok' or 'nok',
                    'doc_engine': 'ok' or 'nok',
                    'storage': 'ok' or 'nok',
                }

            (Fixed: the original docstring example contained a stray extra
            closing brace.) Returns None if the health check is not available.
        """
        try:
            # Call the healthz endpoint (no authentication required)
            return self.client.get_system_health()
        except Exception:
            return None