PyPI - rdkit-cli - Versions diffs - 0.1.0__py3-none-any.whl - Mend

rdkit-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

rdkit_cli/__init__.py +4 -0
rdkit_cli/__main__.py +6 -0
rdkit_cli/cli.py +162 -0
rdkit_cli/commands/__init__.py +1 -0
rdkit_cli/commands/conformers.py +220 -0
rdkit_cli/commands/convert.py +162 -0
rdkit_cli/commands/depict.py +311 -0
rdkit_cli/commands/descriptors.py +251 -0
rdkit_cli/commands/diversity.py +232 -0
rdkit_cli/commands/enumerate.py +229 -0
rdkit_cli/commands/filter.py +384 -0
rdkit_cli/commands/fingerprints.py +179 -0
rdkit_cli/commands/fragment.py +284 -0
rdkit_cli/commands/mcs.py +162 -0
rdkit_cli/commands/reactions.py +191 -0
rdkit_cli/commands/scaffold.py +243 -0
rdkit_cli/commands/similarity.py +359 -0
rdkit_cli/commands/standardize.py +138 -0
rdkit_cli/core/__init__.py +1 -0
rdkit_cli/core/conformers.py +197 -0
rdkit_cli/core/depict.py +241 -0
rdkit_cli/core/descriptors.py +248 -0
rdkit_cli/core/diversity.py +174 -0
rdkit_cli/core/enumerate.py +190 -0
rdkit_cli/core/filters.py +443 -0
rdkit_cli/core/fingerprints.py +265 -0
rdkit_cli/core/fragment.py +237 -0
rdkit_cli/core/mcs.py +128 -0
rdkit_cli/core/reactions.py +159 -0
rdkit_cli/core/scaffold.py +174 -0
rdkit_cli/core/similarity.py +206 -0
rdkit_cli/core/standardizer.py +141 -0
rdkit_cli/io/__init__.py +7 -0
rdkit_cli/io/formats.py +109 -0
rdkit_cli/io/readers.py +352 -0
rdkit_cli/io/writers.py +275 -0
rdkit_cli/parallel/__init__.py +5 -0
rdkit_cli/parallel/batch.py +181 -0
rdkit_cli/parallel/executor.py +180 -0
rdkit_cli/progress/__init__.py +5 -0
rdkit_cli/progress/ninja.py +195 -0
rdkit_cli/utils/__init__.py +1 -0
rdkit_cli-0.1.0.dist-info/METADATA +380 -0
rdkit_cli-0.1.0.dist-info/RECORD +47 -0
rdkit_cli-0.1.0.dist-info/WHEEL +4 -0
rdkit_cli-0.1.0.dist-info/entry_points.txt +2 -0
rdkit_cli-0.1.0.dist-info/licenses/LICENSE +190 -0

rdkit_cli/parallel/batch.py ADDED Viewed

@@ -0,0 +1,181 @@
+"""Batch processing utilities."""
+from dataclasses import dataclass
+from typing import Callable, Any, Optional
+from rdkit_cli.io.readers import MoleculeReader, MoleculeRecord
+from rdkit_cli.io.writers import MoleculeWriter
+from rdkit_cli.progress.ninja import NinjaProgress
+from rdkit_cli.parallel.executor import ParallelExecutor
+@dataclass
+class BatchResult:
+    """Summary statistics returned by the batch-processing helpers in this module."""
+    # Record count reported for the run.
+    total_processed: int
+    # Records for which the processor returned a non-None result.
+    successful: int
+    # Records for which the processor returned None.
+    failed: int
+    # Seconds elapsed, as reported by the progress tracker's ``elapsed_time``.
+    elapsed_time: float
+def process_molecules(
+    reader: MoleculeReader,
+    writer: MoleculeWriter,
+    processor: Callable[[MoleculeRecord], Optional[dict[str, Any]]],
+    n_workers: int = -1,
+    quiet: bool = False,
+    batch_size: int = 1000,
+) -> BatchResult:
+    """
+    Process molecules from reader through processor and write to writer.
+    This is the main batch processing function used by most commands.
+    Args:
+        reader: MoleculeReader to read from
+        writer: MoleculeWriter to write to
+        processor: Function that takes MoleculeRecord and returns dict or None
+        n_workers: Number of worker processes (-1 for all)
+        quiet: Suppress progress output
+        batch_size: Number of records to process in each batch
+    Returns:
+        BatchResult with processing statistics
+    """
+    total = len(reader)
+    progress = NinjaProgress(total=total, quiet=quiet)
+    successful = 0
+    failed = 0
+    write_buffer: list[dict[str, Any]] = []
+    write_buffer_size = 1000
+    progress.start()
+    try:
+        if n_workers == 1:
+            # Sequential processing
+            for record in reader:
+                result = processor(record)
+                if result is not None:
+                    write_buffer.append(result)
+                    successful += 1
+                else:
+                    failed += 1
+                progress.update()
+                if len(write_buffer) >= write_buffer_size:
+                    writer.write_batch(write_buffer)
+                    write_buffer = []
+        else:
+            # Parallel processing - collect batch, process in parallel, write
+            executor = ParallelExecutor(processor, n_workers=n_workers)
+            batch: list[MoleculeRecord] = []
+            for record in reader:
+                batch.append(record)
+                if len(batch) >= batch_size:
+                    # Process batch in parallel
+                    results = executor.map_ordered(batch)
+                    for result in results:
+                        if result is not None:
+                            write_buffer.append(result)
+                            successful += 1
+                        else:
+                            failed += 1
+                        progress.update()
+                    if len(write_buffer) >= write_buffer_size:
+                        writer.write_batch(write_buffer)
+                        write_buffer = []
+                    batch = []
+            # Process remaining batch
+            if batch:
+                results = executor.map_ordered(batch)
+                for result in results:
+                    if result is not None:
+                        write_buffer.append(result)
+                        successful += 1
+                    else:
+                        failed += 1
+                    progress.update()
+        # Write remaining buffer
+        if write_buffer:
+            writer.write_batch(write_buffer)
+    finally:
+        progress.finish()
+    return BatchResult(
+        total_processed=total,
+        successful=successful,
+        failed=failed,
+        elapsed_time=progress.elapsed_time,
+    )
+def process_molecules_simple(
+    reader: MoleculeReader,
+    processor: Callable[[MoleculeRecord], Optional[dict[str, Any]]],
+    n_workers: int = -1,
+    quiet: bool = False,
+) -> tuple[list[dict[str, Any]], BatchResult]:
+    """
+    Process molecules and return results in memory (for small datasets).
+    Args:
+        reader: MoleculeReader to read from
+        processor: Function that takes MoleculeRecord and returns dict or None
+        n_workers: Number of worker processes (-1 for all)
+        quiet: Suppress progress output
+    Returns:
+        Tuple of (results list, BatchResult)
+    """
+    total = len(reader)
+    progress = NinjaProgress(total=total, quiet=quiet)
+    results: list[dict[str, Any]] = []
+    successful = 0
+    failed = 0
+    progress.start()
+    try:
+        if n_workers == 1:
+            for record in reader:
+                result = processor(record)
+                if result is not None:
+                    results.append(result)
+                    successful += 1
+                else:
+                    failed += 1
+                progress.update()
+        else:
+            executor = ParallelExecutor(processor, n_workers=n_workers)
+            records = list(reader)
+            progress.set_total(len(records))
+            for result in executor.map_ordered(records):
+                if result is not None:
+                    results.append(result)
+                    successful += 1
+                else:
+                    failed += 1
+                progress.update()
+    finally:
+        progress.finish()
+    return results, BatchResult(
+        total_processed=total,
+        successful=successful,
+        failed=failed,
+        elapsed_time=progress.elapsed_time,
+    )

rdkit_cli/parallel/executor.py ADDED Viewed

@@ -0,0 +1,180 @@
+"""Parallel processing executor."""
+import os
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from typing import Callable, Iterator, TypeVar, Optional, Any
+from dataclasses import dataclass
+T = TypeVar("T")
+R = TypeVar("R")
+@dataclass
+class ParallelConfig:
+    """Configuration for parallel processing."""
+    n_workers: int = -1  # -1 means auto-detect
+    chunk_size: int = 100
+    def __post_init__(self):
+        if self.n_workers == -1:
+            self.n_workers = os.cpu_count() or 1
+def get_worker_count(n_requested: int) -> int:
+    """
+    Get actual worker count based on request and system.
+    Args:
+        n_requested: Requested number of workers (-1 for all, 0 for 1)
+    Returns:
+        Actual number of workers to use
+    """
+    max_workers = os.cpu_count() or 1
+    if n_requested <= 0:
+        return max_workers
+    return min(n_requested, max_workers)
+# Global worker function storage for pickling.
+# Written by _init_worker() and read by _worker_wrapper() below.
+# NOTE(review): presumably _init_worker is meant to be passed as a pool
+# initializer so each worker process fills these in; nothing in the code shown
+# here wires that up -- confirm against callers.
+_worker_func: Optional[Callable] = None
+_worker_args: tuple = ()
+def _init_worker(func: Callable, args: tuple):
+    """Initialize worker process with function and extra args."""
+    global _worker_func, _worker_args
+    _worker_func = func
+    _worker_args = args
+def _worker_wrapper(item: Any) -> Any:
+    """Wrapper that calls the stored worker function."""
+    global _worker_func, _worker_args
+    if _worker_func is None:
+        raise RuntimeError("Worker function not initialized")
+    return _worker_func(item, *_worker_args)
+class ParallelExecutor:
+    """
+    Generic parallel executor for batch processing.
+    Uses ProcessPoolExecutor since RDKit operations are CPU-bound
+    and benefit from true parallelism (bypassing GIL).
+    """
+    def __init__(
+        self,
+        func: Callable[[T], R],
+        n_workers: int = -1,
+        initializer: Optional[Callable] = None,
+        initargs: tuple = (),
+    ):
+        """
+        Initialize parallel executor.
+        Args:
+            func: Function to apply to each item
+            n_workers: Number of worker processes (-1 for all CPUs)
+            initializer: Optional initializer for worker processes
+            initargs: Arguments for initializer
+        """
+        self.func = func
+        self.n_workers = get_worker_count(n_workers)
+        self.initializer = initializer
+        self.initargs = initargs
+    def map_unordered(
+        self,
+        items: list[T],
+        chunk_size: int = 100,
+    ) -> Iterator[R]:
+        """
+        Process items in parallel, yielding results as they complete.
+        Results may be returned in any order.
+        Args:
+            items: Items to process
+            chunk_size: Number of items per chunk
+        Yields:
+            Results as they complete
+        """
+        if not items:
+            return
+        # For single item or single worker, just run sequentially
+        if len(items) == 1 or self.n_workers == 1:
+            for item in items:
+                yield self.func(item)
+            return
+        with ProcessPoolExecutor(
+            max_workers=self.n_workers,
+            initializer=self.initializer,
+            initargs=self.initargs,
+        ) as executor:
+            # Submit all tasks
+            futures = {executor.submit(self.func, item): i for i, item in enumerate(items)}
+            # Yield results as they complete
+            for future in as_completed(futures):
+                try:
+                    yield future.result()
+                except Exception as e:
+                    # Yield None for failed items, let caller handle
+                    yield None
+    def map_ordered(
+        self,
+        items: list[T],
+        chunk_size: int = 100,
+    ) -> list[R]:
+        """
+        Process items and return results in original order.
+        Args:
+            items: Items to process
+            chunk_size: Number of items per chunk (unused, for API compatibility)
+        Returns:
+            Results in same order as input
+        """
+        if not items:
+            return []
+        # For single item or single worker, just run sequentially
+        if len(items) == 1 or self.n_workers == 1:
+            return [self.func(item) for item in items]
+        with ProcessPoolExecutor(max_workers=self.n_workers) as executor:
+            return list(executor.map(self.func, items, chunksize=max(1, len(items) // (self.n_workers * 4))))
+def parallel_map(
+    func: Callable[[T], R],
+    items: list[T],
+    n_workers: int = -1,
+    ordered: bool = True,
+) -> list[R]:
+    """
+    Simple parallel map with default settings.
+    Args:
+        func: Function to apply to each item
+        items: Items to process
+        n_workers: Number of workers (-1 for all CPUs)
+        ordered: If True, preserve order; if False, return as completed
+    Returns:
+        List of results
+    """
+    executor = ParallelExecutor(func, n_workers=n_workers)
+    if ordered:
+        return executor.map_ordered(items)
+    else:
+        return list(executor.map_unordered(items))

rdkit_cli/progress/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Progress monitoring utilities."""
+from rdkit_cli.progress.ninja import NinjaProgress
+__all__ = ["NinjaProgress"]

rdkit_cli/progress/ninja.py ADDED Viewed

@@ -0,0 +1,195 @@
+"""Ninja-style progress monitoring."""
+import sys
+import time
+import threading
+from dataclasses import dataclass
+from typing import Optional
+@dataclass
+class ProgressStats:
+    """Snapshot of the figures shown on one progress line."""
+    # Items completed so far.
+    completed: int
+    # Total number of items expected.
+    total: int
+    # Seconds elapsed since the tracker was started.
+    elapsed: float
+    # Completed items per second of elapsed time (0.0 when no time has elapsed).
+    rate: float
+    # Estimated seconds remaining, or None when it cannot be estimated.
+    eta: Optional[float]
+    # Completed share of the total on a 0-100 scale (0.0 when total is not positive).
+    percentage: float
+class NinjaProgress:
+    """
+    Ninja-style progress reporter.
+    Format: [42/100] 42% | 15.3 it/s | ETA: 3.8s | Elapsed: 2.8s
+    Features:
+    - No progress bar (just stats)
+    - Updates in-place on single line
+    - Thread-safe updates
+    """
+    def __init__(
+        self,
+        total: int,
+        quiet: bool = False,
+        update_interval: float = 0.1,
+        file=None,
+    ):
+        """
+        Initialize progress reporter.
+        Args:
+            total: Total number of items to process
+            quiet: If True, suppress all output
+            update_interval: Minimum seconds between display updates
+            file: File to write progress to (default: stderr)
+        """
+        self.total = total
+        self.quiet = quiet
+        self.update_interval = update_interval
+        self._file = file or sys.stderr
+        self._completed = 0
+        self._start_time: Optional[float] = None
+        self._last_update_time: float = 0
+        self._lock = threading.Lock()
+        self._finished = False
+        self._last_line_length = 0
+    def start(self):
+        """Start the progress tracker."""
+        self._start_time = time.perf_counter()
+        self._display()
+    def update(self, n: int = 1):
+        """
+        Update progress by n items.
+        Args:
+            n: Number of items completed
+        """
+        with self._lock:
+            self._completed += n
+            # Throttle display updates
+            now = time.perf_counter()
+            if now - self._last_update_time >= self.update_interval:
+                self._display()
+                self._last_update_time = now
+    def set_total(self, total: int):
+        """Update the total count (useful when count is discovered during processing)."""
+        with self._lock:
+            self.total = total
+    def finish(self):
+        """Complete the progress display."""
+        with self._lock:
+            self._finished = True
+            self._display(final=True)
+            if not self.quiet:
+                self._file.write("\n")
+                self._file.flush()
+    @property
+    def elapsed_time(self) -> float:
+        """Return elapsed time in seconds."""
+        if self._start_time is None:
+            return 0.0
+        return time.perf_counter() - self._start_time
+    @property
+    def completed(self) -> int:
+        """Return number of completed items."""
+        return self._completed
+    def _calculate_stats(self) -> ProgressStats:
+        """Calculate current progress statistics."""
+        elapsed = self.elapsed_time
+        completed = self._completed
+        # Calculate rate (items per second)
+        rate = completed / elapsed if elapsed > 0 else 0.0
+        # Calculate percentage
+        percentage = (completed / self.total * 100) if self.total > 0 else 0.0
+        # Calculate ETA
+        remaining = self.total - completed
+        eta = remaining / rate if rate > 0 and remaining > 0 else None
+        return ProgressStats(
+            completed=completed,
+            total=self.total,
+            elapsed=elapsed,
+            rate=rate,
+            eta=eta,
+            percentage=percentage,
+        )
+    def _display(self, final: bool = False):
+        """Display the progress line."""
+        if self.quiet:
+            return
+        stats = self._calculate_stats()
+        # Format: [42/100] 42% | 15.3 it/s | ETA: 3.8s | Elapsed: 2.8s
+        parts = [
+            f"[{stats.completed}/{stats.total}]",
+            f"{stats.percentage:.0f}%",
+            f"{stats.rate:.1f} it/s",
+        ]
+        if stats.eta is not None and not final:
+            parts.append(f"ETA: {self._format_time(stats.eta)}")
+        parts.append(f"Elapsed: {self._format_time(stats.elapsed)}")
+        line = " | ".join(parts)
+        # Clear previous line and write new one
+        clear = " " * self._last_line_length
+        self._file.write(f"\r{clear}\r{line}")
+        self._file.flush()
+        self._last_line_length = len(line)
+    @staticmethod
+    def _format_time(seconds: float) -> str:
+        """Format time in human-readable format."""
+        if seconds < 60:
+            return f"{seconds:.1f}s"
+        elif seconds < 3600:
+            mins = int(seconds // 60)
+            secs = seconds % 60
+            return f"{mins}m {secs:.0f}s"
+        else:
+            hours = int(seconds // 3600)
+            mins = int((seconds % 3600) // 60)
+            return f"{hours}h {mins}m"
+class progress_context:
+    """Context manager for progress tracking."""
+    def __init__(self, total: int, quiet: bool = False, description: str = ""):
+        """
+        Initialize progress context.
+        Args:
+            total: Total number of items
+            quiet: Suppress output
+            description: Optional description (currently unused, for future)
+        """
+        self.progress = NinjaProgress(total=total, quiet=quiet)
+        self._description = description
+    def __enter__(self) -> NinjaProgress:
+        self.progress.start()
+        return self.progress
+    def __exit__(self, *args):
+        self.progress.finish()

rdkit_cli/utils/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """Utility functions."""