rdkit-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. rdkit_cli/__init__.py +4 -0
  2. rdkit_cli/__main__.py +6 -0
  3. rdkit_cli/cli.py +162 -0
  4. rdkit_cli/commands/__init__.py +1 -0
  5. rdkit_cli/commands/conformers.py +220 -0
  6. rdkit_cli/commands/convert.py +162 -0
  7. rdkit_cli/commands/depict.py +311 -0
  8. rdkit_cli/commands/descriptors.py +251 -0
  9. rdkit_cli/commands/diversity.py +232 -0
  10. rdkit_cli/commands/enumerate.py +229 -0
  11. rdkit_cli/commands/filter.py +384 -0
  12. rdkit_cli/commands/fingerprints.py +179 -0
  13. rdkit_cli/commands/fragment.py +284 -0
  14. rdkit_cli/commands/mcs.py +162 -0
  15. rdkit_cli/commands/reactions.py +191 -0
  16. rdkit_cli/commands/scaffold.py +243 -0
  17. rdkit_cli/commands/similarity.py +359 -0
  18. rdkit_cli/commands/standardize.py +138 -0
  19. rdkit_cli/core/__init__.py +1 -0
  20. rdkit_cli/core/conformers.py +197 -0
  21. rdkit_cli/core/depict.py +241 -0
  22. rdkit_cli/core/descriptors.py +248 -0
  23. rdkit_cli/core/diversity.py +174 -0
  24. rdkit_cli/core/enumerate.py +190 -0
  25. rdkit_cli/core/filters.py +443 -0
  26. rdkit_cli/core/fingerprints.py +265 -0
  27. rdkit_cli/core/fragment.py +237 -0
  28. rdkit_cli/core/mcs.py +128 -0
  29. rdkit_cli/core/reactions.py +159 -0
  30. rdkit_cli/core/scaffold.py +174 -0
  31. rdkit_cli/core/similarity.py +206 -0
  32. rdkit_cli/core/standardizer.py +141 -0
  33. rdkit_cli/io/__init__.py +7 -0
  34. rdkit_cli/io/formats.py +109 -0
  35. rdkit_cli/io/readers.py +352 -0
  36. rdkit_cli/io/writers.py +275 -0
  37. rdkit_cli/parallel/__init__.py +5 -0
  38. rdkit_cli/parallel/batch.py +181 -0
  39. rdkit_cli/parallel/executor.py +180 -0
  40. rdkit_cli/progress/__init__.py +5 -0
  41. rdkit_cli/progress/ninja.py +195 -0
  42. rdkit_cli/utils/__init__.py +1 -0
  43. rdkit_cli-0.1.0.dist-info/METADATA +380 -0
  44. rdkit_cli-0.1.0.dist-info/RECORD +47 -0
  45. rdkit_cli-0.1.0.dist-info/WHEEL +4 -0
  46. rdkit_cli-0.1.0.dist-info/entry_points.txt +2 -0
  47. rdkit_cli-0.1.0.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,181 @@
1
+ """Batch processing utilities."""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Callable, Any, Optional
5
+
6
+ from rdkit_cli.io.readers import MoleculeReader, MoleculeRecord
7
+ from rdkit_cli.io.writers import MoleculeWriter
8
+ from rdkit_cli.progress.ninja import NinjaProgress
9
+ from rdkit_cli.parallel.executor import ParallelExecutor
10
+
11
+
12
@dataclass
class BatchResult:
    """Summary statistics describing one batch-processing run."""

    # Total record count reported for the run.
    total_processed: int
    # Records for which the processor returned a result.
    successful: int
    # Records for which the processor returned None.
    failed: int
    # Duration of the run in seconds.
    elapsed_time: float
20
+
21
+
22
def process_molecules(
    reader: MoleculeReader,
    writer: MoleculeWriter,
    processor: Callable[[MoleculeRecord], Optional[dict[str, Any]]],
    n_workers: int = -1,
    quiet: bool = False,
    batch_size: int = 1000,
) -> BatchResult:
    """
    Process molecules from reader through processor and write to writer.

    This is the main batch processing function used by most commands.
    Results are written in input order; a processor result of None is
    counted as a failure and nothing is written for that record.

    Args:
        reader: MoleculeReader to read from
        writer: MoleculeWriter to write to
        processor: Function that takes MoleculeRecord and returns dict or None
        n_workers: Number of worker processes. Exactly 1 runs the processor
            sequentially in this process; any other value is handed to
            ParallelExecutor (-1 for all)
        quiet: Suppress progress output
        batch_size: Number of records collected before each parallel
            dispatch (ignored when n_workers is 1)

    Returns:
        BatchResult with processing statistics. total_processed is the
        number of records that actually went through the processor
        (successful + failed).
    """
    progress = NinjaProgress(total=len(reader), quiet=quiet)

    successful = 0
    failed = 0
    write_buffer: list[dict[str, Any]] = []
    write_buffer_size = 1000

    def _iter_results():
        """Yield one processor result per record, in input order."""
        if n_workers == 1:
            # Sequential processing
            for record in reader:
                yield processor(record)
            return

        # Parallel processing - collect a batch, process it in parallel
        executor = ParallelExecutor(processor, n_workers=n_workers)
        batch: list[MoleculeRecord] = []
        for record in reader:
            batch.append(record)
            if len(batch) >= batch_size:
                yield from executor.map_ordered(batch)
                batch = []

        # Process the final, partially filled batch
        if batch:
            yield from executor.map_ordered(batch)

    progress.start()

    try:
        for result in _iter_results():
            if result is not None:
                write_buffer.append(result)
                successful += 1
            else:
                failed += 1

            progress.update()

            # Flush periodically so memory use stays bounded.
            if len(write_buffer) >= write_buffer_size:
                writer.write_batch(write_buffer)
                write_buffer = []

        # Write remaining buffer
        if write_buffer:
            writer.write_batch(write_buffer)

    finally:
        # Always terminate the progress line, even when processing raised.
        progress.finish()

    return BatchResult(
        # Count what was really processed instead of trusting len(reader).
        total_processed=successful + failed,
        successful=successful,
        failed=failed,
        elapsed_time=progress.elapsed_time,
    )
121
+
122
+
123
def process_molecules_simple(
    reader: MoleculeReader,
    processor: Callable[[MoleculeRecord], Optional[dict[str, Any]]],
    n_workers: int = -1,
    quiet: bool = False,
) -> tuple[list[dict[str, Any]], BatchResult]:
    """
    Process molecules and return results in memory (for small datasets).

    A processor result of None is counted as a failure and is left out of
    the returned list; the remaining results keep input order.

    Args:
        reader: MoleculeReader to read from
        processor: Function that takes MoleculeRecord and returns dict or None
        n_workers: Number of worker processes. Exactly 1 runs the processor
            sequentially in this process; any other value reads every
            record into memory and hands them to ParallelExecutor
            (-1 for all)
        quiet: Suppress progress output

    Returns:
        Tuple of (results list, BatchResult). total_processed is the number
        of records that actually went through the processor
        (successful + failed).
    """
    progress = NinjaProgress(total=len(reader), quiet=quiet)

    results: list[dict[str, Any]] = []
    failed = 0

    progress.start()

    try:
        if n_workers == 1:
            # Lazy so progress advances record by record.
            outcomes = (processor(record) for record in reader)
        else:
            executor = ParallelExecutor(processor, n_workers=n_workers)
            records = list(reader)
            # The materialized list gives the exact count to display.
            progress.set_total(len(records))
            outcomes = executor.map_ordered(records)

        for outcome in outcomes:
            if outcome is not None:
                results.append(outcome)
            else:
                failed += 1
            progress.update()

    finally:
        # Always terminate the progress line, even when processing raised.
        progress.finish()

    successful = len(results)
    return results, BatchResult(
        # Count what was really processed instead of trusting len(reader).
        total_processed=successful + failed,
        successful=successful,
        failed=failed,
        elapsed_time=progress.elapsed_time,
    )
@@ -0,0 +1,180 @@
1
+ """Parallel processing executor."""
2
+
3
+ import os
4
+ from concurrent.futures import ProcessPoolExecutor, as_completed
5
+ from typing import Callable, Iterator, TypeVar, Optional, Any
6
+ from dataclasses import dataclass
7
+
8
+ T = TypeVar("T")
9
+ R = TypeVar("R")
10
+
11
+
12
@dataclass
class ParallelConfig:
    """Configuration for parallel processing."""

    # Worker process count; any value <= 0 means auto-detect and is
    # replaced by the CPU count after construction.
    n_workers: int = -1
    # Number of items per chunk.
    chunk_size: int = 100

    def __post_init__(self):
        # Treat every non-positive request as "use all CPUs", the same rule
        # get_worker_count applies, so 0 never survives as a worker count.
        if self.n_workers <= 0:
            self.n_workers = os.cpu_count() or 1
22
+
23
+
24
def get_worker_count(n_requested: int) -> int:
    """
    Resolve a requested worker count against the CPUs on this machine.

    Args:
        n_requested: Requested number of workers; zero or any negative
            value selects every available CPU

    Returns:
        Number of workers to use, never more than the CPU count
    """
    available = os.cpu_count() or 1
    # Only a positive request below the CPU count is honored as-is.
    if 0 < n_requested < available:
        return n_requested
    return available
38
+
39
+
40
# Module-level storage for the worker callable and its extra positional
# arguments; written by _init_worker and read by _worker_wrapper.
_worker_func: Optional[Callable] = None
_worker_args: tuple = ()


def _init_worker(func: Callable, args: tuple):
    """Store func and its extra positional args in the module-level slots."""
    global _worker_func, _worker_args
    _worker_func, _worker_args = func, args


def _worker_wrapper(item: Any) -> Any:
    """
    Apply the stored worker function to item plus the stored extra args.

    Raises:
        RuntimeError: If no worker function has been stored yet
    """
    target = _worker_func
    if target is None:
        raise RuntimeError("Worker function not initialized")
    return target(item, *_worker_args)
58
+
59
+
60
class ParallelExecutor:
    """
    Generic parallel executor for batch processing.

    Uses ProcessPoolExecutor since RDKit operations are CPU-bound
    and benefit from true parallelism (bypassing GIL).

    Inputs of a single item, or an executor resolved to one worker, are
    run sequentially in the calling process without creating a pool; the
    initializer is not invoked on that path.
    """

    def __init__(
        self,
        func: Callable[[T], R],
        n_workers: int = -1,
        initializer: Optional[Callable] = None,
        initargs: tuple = (),
    ):
        """
        Initialize parallel executor.

        Args:
            func: Function to apply to each item
            n_workers: Number of worker processes (-1 for all CPUs)
            initializer: Optional initializer for worker processes
            initargs: Arguments for initializer
        """
        self.func = func
        self.n_workers = get_worker_count(n_workers)
        self.initializer = initializer
        self.initargs = initargs

    def _runs_inline(self, items: list[T]) -> bool:
        """Return True when a process pool would bring no benefit."""
        return len(items) == 1 or self.n_workers == 1

    def _new_pool(self) -> ProcessPoolExecutor:
        """Create a pool configured with this executor's worker settings."""
        return ProcessPoolExecutor(
            max_workers=self.n_workers,
            initializer=self.initializer,
            initargs=self.initargs,
        )

    def map_unordered(
        self,
        items: list[T],
        chunk_size: int = 100,
    ) -> Iterator[R]:
        """
        Process items in parallel, yielding results as they complete.

        Results may be returned in any order.

        Args:
            items: Items to process
            chunk_size: Unused; each item is submitted as its own task
                (kept for API compatibility)

        Yields:
            Results as they complete; None stands in for any item whose
            task raised an exception
        """
        if not items:
            return

        # For single item or single worker, just run sequentially
        if self._runs_inline(items):
            for item in items:
                yield self.func(item)
            return

        with self._new_pool() as pool:
            pending = [pool.submit(self.func, item) for item in items]

            for future in as_completed(pending):
                try:
                    outcome = future.result()
                except Exception:
                    # Deliberate best-effort: a failed item is reported as
                    # None and the caller decides how to handle it.
                    outcome = None
                yield outcome

    def map_ordered(
        self,
        items: list[T],
        chunk_size: int = 100,
    ) -> list[R]:
        """
        Process items and return results in original order.

        Args:
            items: Items to process
            chunk_size: Number of items per chunk (unused, for API compatibility)

        Returns:
            Results in same order as input
        """
        if not items:
            return []

        # For single item or single worker, just run sequentially
        if self._runs_inline(items):
            return [self.func(item) for item in items]

        # Roughly four chunks per worker, and never an empty chunk.
        chunksize = max(1, len(items) // (self.n_workers * 4))

        # The pool carries initializer/initargs so ordered and unordered
        # mapping set up their workers identically.
        with self._new_pool() as pool:
            return list(pool.map(self.func, items, chunksize=chunksize))
155
+
156
+
157
def parallel_map(
    func: Callable[[T], R],
    items: list[T],
    n_workers: int = -1,
    ordered: bool = True,
) -> list[R]:
    """
    Apply func to every item through a ParallelExecutor with default settings.

    Args:
        func: Function to apply to each item
        items: Items to process
        n_workers: Number of workers (-1 for all CPUs)
        ordered: If True, results follow input order; if False, they are
            collected in completion order

    Returns:
        List of results
    """
    runner = ParallelExecutor(func, n_workers=n_workers)

    if not ordered:
        # Drain the generator so the caller always receives a list.
        return list(runner.map_unordered(items))
    return runner.map_ordered(items)
@@ -0,0 +1,5 @@
1
+ """Progress monitoring utilities."""
2
+
3
+ from rdkit_cli.progress.ninja import NinjaProgress
4
+
5
+ __all__ = ["NinjaProgress"]
@@ -0,0 +1,195 @@
1
+ """Ninja-style progress monitoring."""
2
+
3
+ import sys
4
+ import time
5
+ import threading
6
+ from dataclasses import dataclass
7
+ from typing import Optional
8
+
9
+
10
@dataclass
class ProgressStats:
    """Snapshot of the numbers shown on one progress line."""

    # Items finished so far.
    completed: int
    # Items expected overall.
    total: int
    # Seconds elapsed.
    elapsed: float
    # Throughput in items per second.
    rate: float
    # Estimated seconds remaining, or None when no estimate is available.
    eta: Optional[float]
    # Completion expressed as a percentage.
    percentage: float
20
+
21
+
22
class NinjaProgress:
    """
    Ninja-style progress reporter.

    Format: [42/100] 42% | 15.3 it/s | ETA: 3.8s | Elapsed: 2.8s

    Features:
    - No progress bar (just stats)
    - Updates in-place on single line
    - Thread-safe updates
    """

    def __init__(
        self,
        total: int,
        quiet: bool = False,
        update_interval: float = 0.1,
        file=None,
    ):
        """
        Initialize progress reporter.

        Args:
            total: Total number of items to process
            quiet: If True, suppress all output
            update_interval: Minimum seconds between display updates
            file: File to write progress to (default: stderr)
        """
        self.total = total
        self.quiet = quiet
        self.update_interval = update_interval
        # Explicit None check so a falsy-but-valid file object is honored.
        self._file = sys.stderr if file is None else file

        self._completed = 0
        self._start_time: Optional[float] = None
        # Set by finish() so the reported duration stops growing afterwards.
        self._end_time: Optional[float] = None
        self._last_update_time: float = 0
        self._lock = threading.Lock()
        self._finished = False
        self._last_line_length = 0

    def start(self):
        """Start (or restart) the clock and draw the first progress line."""
        self._start_time = time.perf_counter()
        self._end_time = None
        self._finished = False
        self._display()

    def update(self, n: int = 1):
        """
        Update progress by n items.

        Args:
            n: Number of items completed
        """
        with self._lock:
            self._completed += n

            # Throttle display updates
            now = time.perf_counter()
            if now - self._last_update_time >= self.update_interval:
                self._display()
                self._last_update_time = now

    def set_total(self, total: int):
        """Update the total count (useful when count is discovered during processing)."""
        with self._lock:
            self.total = total

    def finish(self):
        """
        Complete the progress display.

        Draws the final line, terminates it with a newline and freezes
        elapsed_time. Calling finish() again is a no-op, so the trailing
        newline is written only once.
        """
        with self._lock:
            if self._finished:
                return
            self._finished = True
            self._end_time = time.perf_counter()
            self._display(final=True)
            if not self.quiet:
                self._file.write("\n")
                self._file.flush()

    @property
    def elapsed_time(self) -> float:
        """Return elapsed seconds since start(); fixed once finish() has run."""
        if self._start_time is None:
            return 0.0
        end = self._end_time if self._end_time is not None else time.perf_counter()
        return end - self._start_time

    @property
    def completed(self) -> int:
        """Return number of completed items."""
        return self._completed

    def _calculate_stats(self) -> ProgressStats:
        """Calculate current progress statistics."""
        elapsed = self.elapsed_time
        completed = self._completed

        # Calculate rate (items per second)
        rate = completed / elapsed if elapsed > 0 else 0.0

        # Calculate percentage
        percentage = (completed / self.total * 100) if self.total > 0 else 0.0

        # Calculate ETA; unknown until something has completed, and
        # meaningless once nothing remains.
        remaining = self.total - completed
        eta = remaining / rate if rate > 0 and remaining > 0 else None

        return ProgressStats(
            completed=completed,
            total=self.total,
            elapsed=elapsed,
            rate=rate,
            eta=eta,
            percentage=percentage,
        )

    def _display(self, final: bool = False):
        """
        Display the progress line.

        Args:
            final: If True, omit the ETA segment
        """
        if self.quiet:
            return

        stats = self._calculate_stats()

        # Format: [42/100] 42% | 15.3 it/s | ETA: 3.8s | Elapsed: 2.8s
        parts = [
            f"[{stats.completed}/{stats.total}]",
            f"{stats.percentage:.0f}%",
            f"{stats.rate:.1f} it/s",
        ]

        if stats.eta is not None and not final:
            parts.append(f"ETA: {self._format_time(stats.eta)}")

        parts.append(f"Elapsed: {self._format_time(stats.elapsed)}")

        line = " | ".join(parts)

        # Blank out the previous line first so a shorter new line leaves
        # no stale characters behind.
        clear = " " * self._last_line_length
        self._file.write(f"\r{clear}\r{line}")
        self._file.flush()
        self._last_line_length = len(line)

    @staticmethod
    def _format_time(seconds: float) -> str:
        """
        Format time in human-readable format.

        Rounding happens before the value is split into units, so results
        such as "60.0s" or "1m 60s" cannot occur.

        Args:
            seconds: Duration in seconds

        Returns:
            "S.Ss" below one minute, "Mm Ss" below one hour, otherwise "Hh Mm"
        """
        # Compare the value as it will be printed: 59.96 displays as 60.0
        # and therefore belongs in the minutes branch.
        if round(seconds, 1) < 60:
            return f"{seconds:.1f}s"

        whole = round(seconds)
        if whole < 3600:
            mins, secs = divmod(whole, 60)
            return f"{mins}m {secs}s"

        hours, rest = divmod(whole, 3600)
        return f"{hours}h {rest // 60}m"
173
+
174
+
175
class progress_context:
    """Context manager that starts a NinjaProgress on entry and finishes it on exit."""

    def __init__(self, total: int, quiet: bool = False, description: str = ""):
        """
        Build the underlying progress tracker.

        Args:
            total: Total number of items
            quiet: Suppress output
            description: Stored but not displayed (reserved for future use)
        """
        self._description = description
        self.progress = NinjaProgress(total=total, quiet=quiet)

    def __enter__(self) -> NinjaProgress:
        tracker = self.progress
        tracker.start()
        return tracker

    def __exit__(self, *args):
        # Returns None, so an exception raised in the with-body propagates.
        self.progress.finish()
@@ -0,0 +1 @@
1
+ """Utility functions."""