rosetta-cli 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ """Rosetta CLI package."""
2
+
3
+ from __future__ import annotations
4
+
5
+ try:
6
+ from importlib.metadata import version
7
+
8
+ __version__ = version("rosetta-cli")
9
+ except Exception:
10
+ __version__ = "unknown"
11
+
12
+ __all__ = ["__version__"]
@@ -0,0 +1,6 @@
1
+ """Entry point for running rosetta-cli as a module."""
2
+
3
+ from .cli import main
4
+
5
+ if __name__ == "__main__":
6
+ raise SystemExit(main())
rosetta_cli/cli.py ADDED
@@ -0,0 +1,379 @@
1
+ """Rosetta CLI entry point."""
2
+
3
+
4
+ import argparse
5
+ import sys
6
+ from collections.abc import Callable
7
+ from typing import TypeAlias
8
+
9
+ from .commands.base_command import BaseCommand
10
+ from .commands.cleanup_command import CleanupCommand
11
+ from .commands.list_command import ListCommand
12
+ from .commands.parse_command import ParseCommand
13
+ from .commands.publish_command import PublishCommand
14
+ from .commands.verify_command import VerifyCommand
15
+ from .ims_config import IMSConfig
16
+ from .ragflow_client import RAGFlowClient
17
+ from .typing_utils import CommandArgs
18
+
19
# A "command class" is any callable taking (client, config) and returning an
# object that exposes execute(args) -> int (see BaseCommand).
CommandClass: TypeAlias = Callable[[RAGFlowClient, IMSConfig], BaseCommand]


# Command registry mapping command names to their classes
COMMAND_REGISTRY: dict[str, CommandClass] = {
    'publish': PublishCommand,
    'verify': VerifyCommand,
    'list-dataset': ListCommand,
    'cleanup-dataset': CleanupCommand,
    'parse': ParseCommand,
}
30
+
31
+
32
def execute_command(command_name: str, args: CommandArgs, client: RAGFlowClient, config: IMSConfig) -> int:
    """
    Execute a command by name using the command registry.

    Args:
        command_name: Name of the command to execute
        args: Parsed command-line arguments
        client: RAGFlow client instance
        config: RAGFlow configuration

    Returns:
        Exit code (0 for success, 1 for failure)
    """
    # Unknown names are reported and turned into an exit code rather than
    # raised, so main() never sees an exception from this path.
    factory = COMMAND_REGISTRY.get(command_name)
    if not factory:
        print(f"Unknown command: {command_name}")
        return 1

    # Instantiate the command and run it in one step.
    return factory(client, config).execute(args)
52
+
53
+
54
def _add_env_options(parser: argparse.ArgumentParser) -> None:
    """Attach the --env/--env-file options shared by every (sub)parser.

    The original code repeated these two add_argument calls verbatim on the
    root parser and on all five subparsers; this helper keeps them in sync.
    """
    parser.add_argument(
        '--env',
        type=str,
        default=None,
        help='Environment (local, dev, test, production)'
    )
    parser.add_argument(
        '--env-file',
        type=str,
        default=None,
        help='Explicit path to a .env file'
    )


def _build_parser() -> argparse.ArgumentParser:
    """Construct the top-level argument parser with all subcommands."""
    parser = argparse.ArgumentParser(
        description="Rosetta CLI - Publish knowledge base content to RAGFlow and manage datasets\n"
                    "All commands include performance timing measurements.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Publish knowledge base content from a folder (with timing)
  rosetta-cli publish ../instructions

  # Publish with dry-run (no actual upload)
  rosetta-cli publish ../business --dry-run

  # Force republish (ignore change detection)
  rosetta-cli publish ../instructions --force

  # List documents in dataset
  rosetta-cli list-dataset

  # List documents in specific dataset
  rosetta-cli list-dataset --dataset aia-r1

  # Cleanup (delete all documents from) a dataset
  rosetta-cli cleanup-dataset --dataset aia-r1

  # Preview cleanup with dry-run (shows what would be deleted)
  rosetta-cli cleanup-dataset --dataset aia-r1 --dry-run

  # Cleanup documents with specific prefix
  rosetta-cli cleanup-dataset --dataset aia-r1 --prefix "aqa-phase"

  # Cleanup documents with specific tags (space-separated)
  rosetta-cli cleanup-dataset --dataset aia-r1 --tags "r1 agents"

  # Cleanup documents with specific tags (comma-separated)
  rosetta-cli cleanup-dataset --dataset aia-r1 --tags "r1,agents"

  # Preview cleanup with prefix filter
  rosetta-cli cleanup-dataset --dataset aia-r1 --prefix "aqa-phase" --dry-run

  # Preview cleanup with tags filter
  rosetta-cli cleanup-dataset --dataset aia-r1 --tags "r1 agents" --dry-run

  # Cleanup with force (skip confirmation)
  rosetta-cli cleanup-dataset --dataset aia-r1 --force

  # Cleanup with prefix and force
  rosetta-cli cleanup-dataset --dataset aia-r1 --prefix "aqa-phase" --force

  # Cleanup with tags and force
  rosetta-cli cleanup-dataset --dataset aia-r1 --tags "r1,agents" --force

  # Parse documents in dataset (retry failed/unparsed)
  rosetta-cli parse --dataset aia-r1

  # Force re-parse all documents (e.g., after changing parser config)
  rosetta-cli parse --dataset aia-r1 --force

  # Preview which documents would be parsed
  rosetta-cli parse --dataset aia-r1 --dry-run

  # Verify connection
  rosetta-cli verify

  # Use different environment
  rosetta-cli publish ../instructions --env production

Performance Notes:
  - All commands show execution time (ā±ļø Total time: X.XXs)
  - Publishing ~10-15s per file (embedding generation)
  - Change detection skips unchanged files (77% faster)
  - API key verification timing shown when applicable

Tag-in-Title Format:
  - Documents are published with tags in title: [tag1][tag2] filename.ext
  - Example: [instructions][agents][r1] agents.md

Frontmatter Metadata (publish flow):
  - Supported keys: tags, sort_order
  - tags can be list or comma-separated string
  - tags are merged with path-based tags (case-insensitive dedupe)
  - sort_order is persisted to metadata and affects MCP bundling order
  - original_path/resource_path are normalized from instructions-relative path when applicable
"""
    )

    # Global arguments (also accepted after the subcommand name, below)
    _add_env_options(parser)

    # Subcommands
    subparsers = parser.add_subparsers(dest='command', help='Command to execute')

    # Publish command
    publish_parser = subparsers.add_parser(
        'publish',
        help='Publish knowledge base content to RAGFlow'
    )
    publish_parser.add_argument(
        'path',
        type=str,
        help='Path to content file or folder (e.g., instructions/, business/)'
    )
    publish_parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Simulate publishing without actual upload'
    )
    publish_parser.add_argument(
        '--force',
        action='store_true',
        help='Force republish all files, ignoring change detection'
    )
    publish_parser.add_argument(
        '--no-parse',
        action='store_true',
        help='Skip parsing documents after upload (for debugging)'
    )
    publish_parser.add_argument(
        '--parse-timeout',
        type=int,
        default=300,
        help='Timeout for parsing in seconds (default: 300)'
    )
    _add_env_options(publish_parser)

    # Verify command
    verify_parser = subparsers.add_parser(
        'verify',
        help='Verify RAGFlow connection and API key'
    )
    _add_env_options(verify_parser)

    # List dataset command
    list_parser = subparsers.add_parser(
        'list-dataset',
        help='List documents in a dataset'
    )
    list_parser.add_argument(
        '--dataset',
        type=str,
        default=None,
        help='Dataset name (defaults to configured dataset)'
    )
    _add_env_options(list_parser)

    # Cleanup dataset command
    cleanup_parser = subparsers.add_parser(
        'cleanup-dataset',
        help='Delete all documents from a dataset'
    )
    cleanup_parser.add_argument(
        '--dataset',
        type=str,
        default=None,
        help='Dataset name (defaults to configured dataset)'
    )
    cleanup_parser.add_argument(
        '--prefix',
        type=str,
        default=None,
        help='Only delete documents with titles starting with this prefix (e.g., "aqa-phase")'
    )
    cleanup_parser.add_argument(
        '--tags',
        type=str,
        default=None,
        help='Only delete documents with these tags (space or comma separated, e.g., "r1 agents" or "r1,agents")'
    )
    cleanup_parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show what would be deleted without actually deleting'
    )
    cleanup_parser.add_argument(
        '--force',
        action='store_true',
        help='Skip confirmation prompt'
    )
    _add_env_options(cleanup_parser)

    # Parse command
    parse_parser = subparsers.add_parser(
        'parse',
        help='Trigger parsing for documents in a dataset'
    )
    parse_parser.add_argument(
        '--dataset',
        type=str,
        default=None,
        help='Dataset name (defaults to configured dataset)'
    )
    parse_parser.add_argument(
        '--force',
        action='store_true',
        help='Force re-parse ALL documents, even if already parsed'
    )
    parse_parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Show which documents would be parsed without actually parsing'
    )
    parse_parser.add_argument(
        '--yes',
        action='store_true',
        help='Skip confirmation prompt'
    )
    parse_parser.add_argument(
        '--parse-timeout',
        type=int,
        default=300,
        help='Timeout for parsing in seconds (default: 300)'
    )
    _add_env_options(parse_parser)

    return parser


def main() -> int:
    """Main CLI entry point.

    Returns:
        Process exit code: 0 on success, 1 on any failure (missing command,
        configuration error, user interrupt, or unexpected exception).
    """
    parser = _build_parser()
    args = parser.parse_args()

    # A bare invocation gets the help text and a failure code.
    if not args.command:
        parser.print_help()
        return 1

    try:
        # Load configuration from the environment / .env file and validate it
        # before touching the network.
        config = IMSConfig.from_env(env_file=args.env_file, environment=args.env)
        config.validate()

        # Initialize RAGFlow client
        client = RAGFlowClient(
            api_key=config.api_key,
            base_url=config.base_url,
            embedding_model=config.embedding_model,
            chunk_method=config.chunk_method,
            parser_config=config.parser_config,
            page_size=config.page_size
        )

        return execute_command(args.command, args, client, config)

    except ValueError as e:
        # Raised by IMSConfig.from_env/validate on bad or missing settings.
        print(f"āœ— Configuration error: {e}")
        print("\nPlease ensure you have:")
        print("1. Created a .env file (copy from env.template)")
        print("2. Set RAGFLOW_BASE_URL and RAGFLOW_API_KEY")
        return 1
    except KeyboardInterrupt:
        print("\n\nOperation cancelled.")
        return 1
    except Exception as e:
        # Last-resort boundary: report with traceback but exit cleanly.
        print(f"āœ— Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        return 1


if __name__ == '__main__':
    sys.exit(main())
@@ -0,0 +1,5 @@
1
+ """IMS Commands Package - Command pattern implementations for IMS CLI"""
2
+
3
+ from .base_command import BaseCommand
4
+
5
+ __all__ = ['BaseCommand']
@@ -0,0 +1,82 @@
1
+ """
2
+ Base Command Abstract Class
3
+
4
+ Defines the interface and common functionality for all IMS CLI commands.
5
+ """
6
+
7
+ import time
8
+ from abc import ABC, abstractmethod
9
+
10
+ from ..ims_config import IMSConfig
11
+ from ..ragflow_client import RAGFlowClient
12
+ from ..typing_utils import CommandArgs
13
+
14
+
15
class BaseCommand(ABC):
    """
    Abstract base class for all IMS CLI commands.

    Provides common functionality for authentication, timing, and error handling.
    Each command must implement the execute() method.
    """

    def __init__(self, client: RAGFlowClient, config: IMSConfig):
        """
        Initialize command with RAGFlow client and configuration.

        Args:
            client: RAGFlow client instance
            config: IMS configuration
        """
        self.client = client
        self.config = config
        # Timing anchor set by _start_timing(); None until timing starts.
        self._start_time: float | None = None

    @abstractmethod
    def execute(self, args: CommandArgs) -> int:
        """
        Execute the command with given arguments.

        Args:
            args: Parsed command-line arguments (argparse.Namespace)

        Returns:
            Exit code (0 for success, non-zero for failure)
        """
        raise NotImplementedError

    def _start_timing(self) -> None:
        """Start timing measurement for command execution."""
        # perf_counter() is monotonic: unlike time.time(), the elapsed value
        # cannot go negative or jump if the system clock is adjusted (e.g. by
        # NTP) while a command is running.
        self._start_time = time.perf_counter()

    def _get_elapsed_time(self) -> float:
        """
        Get elapsed time since timing started.

        Returns:
            Elapsed time in seconds (0.0 if _start_timing was never called)
        """
        if self._start_time is None:
            return 0.0
        return time.perf_counter() - self._start_time

    def _print_timing(self, label: str = "Total time") -> None:
        """
        Print timing information.

        Args:
            label: Label for the timing output
        """
        elapsed = self._get_elapsed_time()
        print(f"ā±ļø {label}: {elapsed:.2f}s")

    def _print_header(self, title: str) -> None:
        """
        Print command header with configuration info.

        Args:
            title: Command title/description
        """
        print(title)
        print(f"Environment: {self.config.environment}")
        print(f"RAGFlow Instance: {self.config.base_url}")
@@ -0,0 +1,214 @@
1
+ """
2
+ Cleanup Command - Delete documents from a dataset
3
+ """
4
+
5
+ from ..services.dataset_service import DatasetService
6
+ from ..services.document_service import DocumentService
7
+
8
+ from .base_command import BaseCommand
9
+ from ..typing_utils import CommandArgs, DatasetLike, DocumentLike
10
+
11
+
12
class CleanupCommand(BaseCommand):
    """
    Delete documents from a RAGFlow dataset.

    CRITICAL: Implements safety measures (dry-run, confirmation) to prevent accidental data loss.
    """

    def execute(self, args: CommandArgs) -> int:
        """
        Execute cleanup-dataset command.

        Args:
            args: Parsed arguments (dataset, prefix, tags, dry_run, force)

        Returns:
            0 on success (including "nothing to delete"), 1 on failure or
            user cancellation
        """
        self._start_timing()

        # Resolve dataset name (falls back to the configured default)
        dataset_service = DatasetService(self.client, self.config)
        dataset_name, auto_detected = dataset_service.resolve_dataset_name(args.dataset)

        if not dataset_name:
            return 1

        # Print header
        print(f"Cleaning up Dataset: {dataset_name}")
        print(f"Environment: {self.config.environment}")
        print(f"RAGFlow Instance: {self.config.base_url}\n")

        # Verify authentication before any destructive operation.
        # BUGFIX: the import must be package-relative (``..services``) like the
        # sibling service imports at the top of this module; the previous bare
        # ``from services.auth_service import ...`` only worked when the
        # package directory happened to be on sys.path.
        from ..services.auth_service import AuthService
        AuthService.verify_or_exit(self.client, self.config)

        try:
            # Get dataset
            dataset = self.client.get_dataset(name=dataset_name)

            if not dataset:
                print(f"āœ— Dataset '{dataset_name}' not found")
                dataset_service.display_available_datasets()
                return 1

            # Apply --tags/--prefix filters (or fetch everything)
            filtered_documents = self._get_filtered_documents(
                dataset, dataset_service, args
            )

            if not filtered_documents:
                print("āœ“ No documents found")
                return 0

            # Show exactly what is about to be deleted
            self._display_documents(filtered_documents)

            # Dry-run mode stops before any deletion
            if args.dry_run:
                return self._handle_dry_run(filtered_documents)

            # Interactive confirmation (skipped with --force)
            if not self._confirm_deletion(filtered_documents, dataset_name, args):
                print("\nāœ— Cleanup cancelled")
                return 1

            # Delete documents
            deleted, failed = self._delete_documents(dataset, filtered_documents)

            # Print summary
            self._print_summary(deleted, failed)

            return 0 if failed == 0 else 1

        except Exception as e:
            print(f"\nāœ— Error cleaning up dataset: {e}")
            self._print_timing()
            import traceback
            traceback.print_exc()
            return 1

    def _get_filtered_documents(
        self,
        dataset: DatasetLike,
        dataset_service: DatasetService,
        args: CommandArgs,
    ) -> list[DocumentLike]:
        """Get documents to delete based on filters.

        --tags takes precedence over --prefix; with neither, every document
        in the dataset is returned.
        """
        document_service = DocumentService(self.client)

        if args.tags:
            # Filter by tags (metadata condition)
            tags_list = self._parse_tags(args.tags)
            filtered_documents = document_service.filter_documents_by_tags(
                dataset, tags_list
            )
            print(f"\nFiltered {len(filtered_documents)} document(s) with tags: {', '.join(tags_list)}\n")
        elif args.prefix:
            # Filter by title prefix
            filtered_documents = document_service.filter_documents_by_prefix(
                dataset, args.prefix
            )
            print(f"\nFiltered {len(filtered_documents)} document(s) matching prefix '{args.prefix}'\n")
        else:
            # No filter - fetch all
            filtered_documents = dataset.list_documents(page_size=self.config.page_size)
            if filtered_documents:
                print(f"\nFound {len(filtered_documents)} document(s) to delete\n")

        return filtered_documents

    def _parse_tags(self, tags_arg: str) -> list[str]:
        """Parse tags from a comma- or space-separated string.

        Comma takes precedence: "r1,agents" and "r1, agents" both split on
        commas (with whitespace stripped); "r1 agents" splits on whitespace.
        Empty fragments are discarded.
        """
        if ',' in tags_arg:
            tags = [tag.strip() for tag in tags_arg.split(',')]
        else:
            tags = tags_arg.split()

        # Filter out empty strings (e.g. from "a,,b" or trailing commas)
        return [tag for tag in tags if tag]

    def _display_documents(self, documents: list[DocumentLike]) -> None:
        """Display documents that will be deleted."""
        print("Documents to delete:")
        print("="*80)
        for document in documents:
            # Documents lacking a name attribute are shown as 'Untitled'
            print(f" • {getattr(document, 'name', 'Untitled')}")
        print("="*80)
        print()

    def _handle_dry_run(self, documents: list[DocumentLike]) -> int:
        """Handle dry-run mode: report the would-be deletions and exit 0."""
        print("šŸ” DRY-RUN MODE - No documents will be deleted\n")
        print(f"Summary: {len(documents)} document(s) would be deleted")
        print("="*80)
        self._print_timing()
        return 0

    def _confirm_deletion(
        self,
        documents: list[DocumentLike],
        dataset_name: str,
        args: CommandArgs,
    ) -> bool:
        """Confirm deletion with user (unless the --force flag is set).

        Returns:
            True to proceed with deletion, False to abort.
        """
        if args.force:
            return True

        # Describe the active filter (if any) in the prompt
        filter_msg = ""
        if args.tags:
            tags_list = self._parse_tags(args.tags)
            filter_msg = f" with tags ({', '.join(tags_list)})"
        elif args.prefix:
            filter_msg = f" with prefix '{args.prefix}'"

        response = input(
            f"āš ļø Delete {len(documents)} documents{filter_msg} from '{dataset_name}'? (yes/no): "
        )
        return response.lower() in ['yes', 'y']

    def _delete_documents(
        self,
        dataset: DatasetLike,
        documents: list[DocumentLike],
    ) -> tuple[int, int]:
        """Delete documents in one batch call.

        Returns:
            (deleted_count, failed_count). Documents without an id are
            counted as failures; a batch API failure fails the whole batch.
        """
        deleted_count = 0
        failed_count = 0

        # Collect document IDs (keep names for reporting)
        doc_ids_to_delete = []
        doc_id_to_name = {}

        for document in documents:
            doc_id = getattr(document, 'id', None)
            doc_name = getattr(document, 'name', 'Untitled')

            if not doc_id:
                print(f"āœ— Cannot delete {doc_name}: No document ID")
                failed_count += 1
                continue

            doc_ids_to_delete.append(doc_id)
            doc_id_to_name[doc_id] = doc_name

        # Delete as a single batch call
        if doc_ids_to_delete:
            try:
                dataset.delete_documents(ids=doc_ids_to_delete)
                deleted_count = len(doc_ids_to_delete)

                for doc_id in doc_ids_to_delete:
                    print(f"āœ“ Deleted: {doc_id_to_name.get(doc_id, 'Unknown')}")

            except Exception as e:
                # The batch is all-or-nothing from our point of view
                print(f"āœ— Failed to delete documents: {e}")
                failed_count = len(doc_ids_to_delete)

        return deleted_count, failed_count

    def _print_summary(self, deleted: int, failed: int) -> None:
        """Print cleanup summary."""
        print("\n" + "="*80)
        print("Cleanup Summary:")
        print(f"  āœ“ Deleted: {deleted}")
        print(f"  āœ— Failed: {failed}")
        print("="*80)
        self._print_timing()