rosetta-cli 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,70 @@
1
+ """
2
+ List Command - Display documents in a dataset
3
+ """
4
+
5
+ from ..services.dataset_service import DatasetService
6
+ from ..services.document_service import DocumentService
7
+
8
+ from .base_command import BaseCommand
9
+ from ..typing_utils import CommandArgs
10
+
11
+
12
class ListCommand(BaseCommand):
    """List documents in a RAGFlow dataset."""

    def execute(self, args: CommandArgs) -> int:
        """Execute list-dataset command.

        Args:
            args: Command arguments carrying the target dataset name.

        Returns:
            0 on success, 1 on failure.
        """
        self._start_timing()

        # Resolve dataset name
        dataset_service = DatasetService(self.client, self.config)
        dataset_name, _auto_detected = dataset_service.resolve_dataset_name(args.dataset)

        if not dataset_name:
            print("✗ Failed to resolve dataset name")
            return 1

        # Print header
        print(f"Listing Dataset: {dataset_name}")
        print(f"Environment: {self.config.environment}")
        print(f"RAGFlow Instance: {self.config.base_url}")
        print()

        # Verify authentication.
        # BUG FIX: was "from services.auth_service import AuthService" — an
        # absolute import that only resolves when the package directory itself
        # is on sys.path. Use the relative form like the sibling commands do.
        from ..services.auth_service import AuthService
        AuthService.verify_or_exit(self.client, self.config)

        try:
            # Get dataset
            dataset = self.client.get_dataset(name=dataset_name)

            if not dataset:
                print(f"✗ Dataset '{dataset_name}' not found")
                dataset_service.display_available_datasets()
                return 1

            # List documents
            document_service = DocumentService(self.client)
            documents = dataset.list_documents(page_size=self.config.page_size)

            if not documents:
                print("\nNo documents found in dataset")
                return 0

            print(f"\nFound {len(documents)} document(s):")
            print("=" * 80)

            for i, document in enumerate(documents, 1):
                print(document_service.format_document_display(document, i))

            print("=" * 80)
            self._print_timing()

            return 0

        except Exception as e:
            print(f"\n✗ Error listing dataset: {e}")
            self._print_timing()
            import traceback
            traceback.print_exc()
            return 1
@@ -0,0 +1,205 @@
1
+ """
2
+ Parse Command - Trigger document parsing without re-uploading
3
+ """
4
+
5
+ import time
6
+
7
+ from ..services.dataset_service import DatasetService
8
+ from ..services.document_service import DocumentService
9
+ from ..services.auth_service import AuthService
10
+
11
+
12
+
13
+ from .base_command import BaseCommand
14
+ from ..typing_utils import CommandArgs, DatasetLike, JsonDict
15
+
16
+
17
class ParseCommand(BaseCommand):
    """
    Trigger parsing for documents in a dataset.

    Handles selective parsing (failed/unparsed) or force re-parse all documents.
    """

    def execute(self, args: CommandArgs) -> int:
        """Execute parse command.

        Args:
            args: Command arguments (dataset, force, dry_run, yes, parse_timeout).

        Returns:
            0 on success, 1 on failure or cancellation.
        """
        self._start_timing()

        # CLI flag must override config default for this run.
        self.config.parse_timeout = args.parse_timeout

        # Resolve dataset name
        dataset_service = DatasetService(self.client, self.config)
        dataset_name, auto_detected = dataset_service.resolve_dataset_name(args.dataset)

        if not dataset_name:
            print("✗ Failed to resolve dataset name")
            return 1

        # Print header
        print(f"Parsing Documents in Dataset: {dataset_name}")
        if args.force:
            print("FORCE MODE - Will re-parse ALL documents")
        if args.dry_run:
            print("DRY-RUN MODE - No parsing will be triggered")
        print(f"Environment: {self.config.environment}")
        print(f"RAGFlow Instance: {self.config.base_url}\n")

        # Verify authentication
        AuthService.verify_or_exit(self.client, self.config)
        print()

        try:
            # Get dataset
            dataset = self.client.get_dataset(name=dataset_name)

            if not dataset:
                print(f"✗ Dataset '{dataset_name}' not found")
                dataset_service.display_available_datasets()
                return 1

            # Get documents to parse
            docs_to_parse, status_counts = self._get_documents_to_parse(
                dataset, args
            )

            # Print status
            self._print_parse_status(docs_to_parse, status_counts, args)

            # Dry-run mode
            if args.dry_run:
                return self._handle_dry_run(docs_to_parse, status_counts)

            # Check if anything to parse
            if not docs_to_parse:
                print("✓ All documents are already parsed or currently parsing")
                self._print_timing()
                return 0

            # Confirm only in force mode (re-parsing all documents is destructive)
            if args.force and not self._confirm_parsing(docs_to_parse, args):
                print("\n✗ Parsing cancelled")
                return 1

            # Parse documents
            success, failed = self._parse_and_wait(docs_to_parse, args)

            # Print summary
            self._print_parse_summary(success, failed, status_counts, args)

            return 0 if failed == 0 else 1

        except KeyboardInterrupt:
            print("\n\n✗ Parsing interrupted by user (Ctrl+C)")
            self._print_timing()
            return 1
        except Exception as e:
            print(f"\n✗ Error during parsing: {e}")
            self._print_timing()
            import traceback
            traceback.print_exc()
            return 1

    @staticmethod
    def _build_parse_entry(document, dataset: DatasetLike) -> JsonDict | None:
        """Build the dict entry expected by parse_documents_batch.

        Returns None when the document exposes no ``id`` attribute (such a
        document cannot be addressed for parsing and is skipped).
        """
        doc_id = getattr(document, 'id', None)
        if not doc_id:
            return None
        return {
            "id": doc_id,
            "name": getattr(document, 'name', 'Untitled'),
            "dataset_id": dataset.id,
            "folder": ".",
            "status": getattr(document, 'run', 'UNSTART'),
        }

    def _get_documents_to_parse(self, dataset: DatasetLike, args: CommandArgs) -> tuple[list[JsonDict], dict[str, int]]:
        """Get documents that need parsing.

        Returns:
            Tuple of (docs_to_parse, status_counts).

            NOTE(review): status_counts is never incremented anywhere, so the
            'done'/'running' figures printed by dry-run and the summary are
            always 0 — TODO populate these counts or drop them.
        """
        document_service = DocumentService(self.client)
        status_counts = {'done': 0, 'running': 0}

        print("Checking parsing status for documents...")

        if args.force:
            # Force mode: re-parse every document in the dataset.
            documents = dataset.list_documents(page_size=self.config.page_size)
            print(f"Found {len(documents)} document(s) (force mode - parsing all)\n")
        else:
            # Default mode: only documents whose last run failed, was
            # cancelled, or never started.
            documents = document_service.list_documents_by_status(
                dataset,
                statuses=["FAIL", "UNSTART", "CANCEL"],
                limit=self.config.page_size
            )
            print(f"Found {len(documents)} document(s) needing parsing\n")

        # Single construction path for both modes (was duplicated verbatim).
        docs_to_parse = [
            entry
            for entry in (self._build_parse_entry(document, dataset) for document in documents)
            if entry is not None
        ]

        return docs_to_parse, status_counts

    def _print_parse_status(self, docs_to_parse: list[JsonDict], status_counts: dict[str, int], args: CommandArgs) -> None:
        """Print the list of documents queued for parsing."""
        if docs_to_parse:
            print(f"Documents to parse ({len(docs_to_parse)}):")
            for doc in docs_to_parse:
                print(f" 📄 {doc['name']} ({doc['status']})")
            print()

    def _handle_dry_run(self, docs_to_parse: list[JsonDict], status_counts: dict[str, int]) -> int:
        """Print the dry-run summary and exit successfully (nothing triggered)."""
        print("=" * 80)
        print("Summary (DRY-RUN):")
        print(f" Would parse: {len(docs_to_parse)}")
        print(f" Already done: {status_counts['done']}")
        print(f" Currently running: {status_counts['running']}")
        print("=" * 80)
        self._print_timing()
        return 0

    def _confirm_parsing(self, docs_to_parse: list[JsonDict], args: CommandArgs) -> bool:
        """Ask the user to confirm parsing; --yes skips the prompt."""
        if args.yes:
            return True

        response = input(f"⚠️ Trigger parsing for {len(docs_to_parse)} documents? (yes/no): ")
        return response.lower() in ['yes', 'y']

    def _parse_and_wait(self, docs_to_parse: list[JsonDict], args: CommandArgs) -> tuple[int, int]:
        """Trigger batch parsing and block until completion.

        Returns:
            Tuple of (success_count, failed_count).
        """
        # Use client's batch parsing method (handles grouping and triggering)
        _ = self.client.parse_documents_batch(docs_to_parse, silent=True)

        # Wait with progress bar using DocumentService
        doc_service = DocumentService(self.client)
        success, failed = doc_service.wait_for_parsing(
            docs_to_parse,
            timeout=self.config.parse_timeout
        )

        return success, failed

    def _print_parse_summary(self, success: int, failed: int, status_counts: dict[str, int], args: CommandArgs) -> None:
        """Print parsing summary."""
        print("\n" + "=" * 80)
        print("Parse Summary:")
        print(f" ✓ Successfully parsed: {success}")
        if failed > 0:
            print(f" ✗ Failed: {failed}")
        if not args.force:
            # NOTE(review): these counts are currently always 0 (never
            # incremented) — see _get_documents_to_parse.
            if status_counts['done'] > 0:
                print(f" ⏭️ Skipped (already done): {status_counts['done']}")
            if status_counts['running'] > 0:
                print(f" ⏭️ Skipped (running): {status_counts['running']}")
        print("=" * 80)
        self._print_timing()
@@ -0,0 +1,113 @@
1
+ """
2
+ Publish Command - Upload knowledge base content to RAGFlow
3
+ """
4
+
5
+ from pathlib import Path
6
+
7
+ from ..ims_publisher import ContentPublisher
8
+ from ..services.auth_service import AuthService
9
+ from ..ims_utils import resolve_workspace_root
10
+
11
+ from .base_command import BaseCommand
12
+ from ..typing_utils import CommandArgs
13
+
14
+
15
+
16
class PublishCommand(BaseCommand):
    """
    Publish knowledge base content (files or folders) to RAGFlow.

    Handles single file or recursive folder publishing with optional dry-run mode.
    """

    def execute(self, args: CommandArgs) -> int:
        """
        Execute publish command.

        Args:
            args: Command arguments with path, dry_run, force, no_parse flags

        Returns:
            0 if successful, 1 if failed
        """
        self._start_timing()

        # Header lines: target path plus environment/API-key status.
        print(f"Publishing knowledge base content from: {args.path}")
        self._print_header_with_api_key()
        print()

        # Bail out early if the API key / connection is not valid.
        AuthService.verify_or_exit(self.client, self.config)

        target = Path(args.path.strip()).resolve()
        root = resolve_workspace_root(target)

        publisher = ContentPublisher(
            self.client,
            str(root),
            dataset_default=self.config.dataset_default,
            dataset_template=self.config.dataset_template
        )

        # Dispatch on file vs. folder and capture the exit status.
        status = self._publish_path(publisher, target, args)

        print()
        self._print_timing()

        return status

    def _publish_path(self, publisher: ContentPublisher, path: Path, args: CommandArgs) -> int:
        """
        Publish a file or directory.

        Args:
            publisher: ContentPublisher instance
            path: Path to publish
            args: Command arguments

        Returns:
            Exit code (0 success, 1 failure)
        """
        # Guard-clause dispatch: file, then directory, otherwise error.
        if path.is_file():
            return self._publish_file(publisher, path, args)
        if path.is_dir():
            return self._publish_folder(publisher, path, args)
        print(f"✗ Path not found: {path}")
        return 1

    def _publish_file(self, publisher: ContentPublisher, path: Path, args: CommandArgs) -> int:
        """Publish a single file and report its result."""
        outcome = publisher.publish_file(
            str(path),
            dry_run=args.dry_run,
            force=args.force,
            parse_documents=not args.no_parse,
            wait_for_parsing=True
        )
        print(f"\n{outcome}")
        return 0 if outcome.success else 1

    def _publish_folder(self, publisher: ContentPublisher, path: Path, args: CommandArgs) -> int:
        """Publish a folder recursively; success only if every item succeeded."""
        outcomes = publisher.publish_folder(
            str(path),
            dry_run=args.dry_run,
            recursive=True,
            force=args.force,
            parse_documents=not args.no_parse,
            wait_for_parsing=True
        )

        return 0 if all(r.success for r in outcomes) else 1

    def _print_header_with_api_key(self) -> None:
        """Print header including API key status."""
        print(f"Environment: {self.config.environment}")
        print(f"RAGFlow Instance: {self.config.base_url}")
        print(f"RAGFlow API Key: {'SET' if self.config.api_key else 'NOT SET'}")
@@ -0,0 +1,46 @@
1
+ """
2
+ Verify Command - RAGFlow connection verification
3
+ """
4
+
5
+ from ..services.auth_service import AuthService
6
+
7
+ from .base_command import BaseCommand
8
+ from ..typing_utils import CommandArgs
9
+
10
+
11
class VerifyCommand(BaseCommand):
    """
    Verify RAGFlow API connection and authentication.

    Tests connectivity and API key validity.
    """

    def execute(self, args: CommandArgs) -> int:
        """
        Execute verify command.

        Args:
            args: Command arguments (unused for verify)

        Returns:
            0 if verification successful, 1 if failed
        """
        self._start_timing()

        # Print header
        self._print_header("RAGFlow Connection Verification")
        print()

        # Verify connection using AuthService
        auth_service = AuthService(self.client, self.config)
        success = auth_service.verify_connection()

        # Print result. (Fixed: constant strings no longer carry a pointless
        # f-prefix, and the duplicated _print_timing() call is collapsed.)
        if success:
            print("\n✓ All systems operational")
        else:
            print("\n✗ Connection verification failed")
        self._print_timing()
        return 0 if success else 1
@@ -0,0 +1,124 @@
1
+ """
2
+ IMS Authentication Module
3
+
4
+ Handles API key verification and connection management for RAGFlow.
5
+
6
+ Features:
7
+ - API key authentication
8
+ - Connection verification
9
+ - System health checks
10
+ """
11
+
12
+ from .ims_config import IMSConfig
13
+ from .ragflow_client import AuthenticationError, RAGFlowClient
14
+ from .typing_utils import JsonDict
15
+
16
+
17
class IMSAuthManager:
    """Manages RAGFlow API key authentication and connection verification."""

    def __init__(self, client: RAGFlowClient, config: IMSConfig):
        """
        Initialize the authentication manager.

        Args:
            client: RAGFlow client instance
            config: RAGFlow configuration
        """
        self.client = client
        self.config = config

    def verify_api_key(self) -> tuple[bool, str | None]:
        """
        Verify API key is valid by attempting to list datasets.

        Returns:
            Tuple of (success: bool, error_message: str | None);
            error_message is None on success.
        """
        try:
            # A minimal authenticated call: listing one dataset exercises the key.
            self.client.list_datasets(page_size=1)
            return True, None
        except AuthenticationError as e:
            return False, f"Authentication failed: {e}"
        except Exception as e:
            return False, f"Verification error: {e}"

    def verify_connection(self) -> tuple[bool, str | None]:
        """
        Verify connection to RAGFlow server.

        Returns:
            Tuple of (success: bool, error_message: str | None);
            error_message is None on success.
        """
        try:
            if self.client.verify_connection():
                return True, None
            else:
                return False, f"Connection failed to {self.config.base_url}"
        except Exception as e:
            return False, f"Connection error: {e}"

    def get_server_info(self) -> JsonDict | None:
        """
        Get RAGFlow server information including basic health status.

        Returns:
            Server information dict or None if not available
        """
        try:
            # Get basic server info from the local configuration.
            info: JsonDict = {
                'base_url': self.config.base_url,
                'environment': self.config.environment,
                'dataset_default': self.config.dataset_default,
                'dataset_template': self.config.dataset_template,
            }

            # Try to get health status (non-blocking)
            try:
                health = self.get_system_health()
                if health:
                    info['health_status'] = health.get('status', 'unknown')
                    info['services'] = {
                        'database': health.get('db', 'unknown'),
                        'redis': health.get('redis', 'unknown'),
                        'doc_engine': health.get('doc_engine', 'unknown'),
                        'storage': health.get('storage', 'unknown'),
                    }
            except Exception:
                # Health check failed, but don't fail server info
                info['health_status'] = 'unavailable'

            return info
        except Exception:
            return None

    def get_system_health(self) -> JsonDict | None:
        """
        Check the health status of RAGFlow's dependencies.

        This endpoint checks:
        - Database (MySQL/PostgreSQL)
        - Redis
        - Document Engine (Elasticsearch/Infinity/OpenSearch)
        - Object Storage (MinIO/S3/GCS)

        Returns:
            Health status dict with format::

                {
                    'status': 'ok' or 'nok',
                    'db': 'ok' or 'nok',
                    'redis': 'ok' or 'nok',
                    'doc_engine': 'ok' or 'nok',
                    'storage': 'ok' or 'nok',
                }

            (Fixed: the original docstring example contained a stray extra
            closing brace.) Returns None if the health check is not available.
        """
        try:
            # Call the healthz endpoint (no authentication required)
            return self.client.get_system_health()
        except Exception:
            return None