rdkit-cli 0.1.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdkit_cli-0.3.0/CHANGELOG.md +77 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/PKG-INFO +272 -9
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/README.md +270 -7
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/pyproject.toml +2 -2
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/__init__.py +1 -1
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/cli.py +102 -23
- rdkit_cli-0.3.0/src/rdkit_cli/commands/align.py +140 -0
- rdkit_cli-0.3.0/src/rdkit_cli/commands/deduplicate.py +123 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/depict.py +122 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/descriptors.py +38 -0
- rdkit_cli-0.3.0/src/rdkit_cli/commands/info.py +55 -0
- rdkit_cli-0.3.0/src/rdkit_cli/commands/merge.py +101 -0
- rdkit_cli-0.3.0/src/rdkit_cli/commands/mmp.py +311 -0
- rdkit_cli-0.3.0/src/rdkit_cli/commands/props.py +309 -0
- rdkit_cli-0.3.0/src/rdkit_cli/commands/protonate.py +121 -0
- rdkit_cli-0.3.0/src/rdkit_cli/commands/rgroup.py +102 -0
- rdkit_cli-0.3.0/src/rdkit_cli/commands/rings.py +235 -0
- rdkit_cli-0.3.0/src/rdkit_cli/commands/rmsd.py +324 -0
- rdkit_cli-0.3.0/src/rdkit_cli/commands/sample.py +152 -0
- rdkit_cli-0.3.0/src/rdkit_cli/commands/sascorer.py +103 -0
- rdkit_cli-0.3.0/src/rdkit_cli/commands/split.py +152 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/standardize.py +61 -0
- rdkit_cli-0.3.0/src/rdkit_cli/commands/stats.py +172 -0
- rdkit_cli-0.3.0/src/rdkit_cli/commands/validate.py +189 -0
- rdkit_cli-0.3.0/src/rdkit_cli/core/align.py +235 -0
- rdkit_cli-0.3.0/src/rdkit_cli/core/deduplicate.py +190 -0
- rdkit_cli-0.3.0/src/rdkit_cli/core/info.py +127 -0
- rdkit_cli-0.3.0/src/rdkit_cli/core/merge.py +119 -0
- rdkit_cli-0.3.0/src/rdkit_cli/core/mmp.py +228 -0
- rdkit_cli-0.3.0/src/rdkit_cli/core/protonate.py +283 -0
- rdkit_cli-0.3.0/src/rdkit_cli/core/rgroup.py +220 -0
- rdkit_cli-0.3.0/src/rdkit_cli/core/rings.py +274 -0
- rdkit_cli-0.3.0/src/rdkit_cli/core/rmsd.py +247 -0
- rdkit_cli-0.3.0/src/rdkit_cli/core/sample.py +156 -0
- rdkit_cli-0.3.0/src/rdkit_cli/core/sascorer.py +239 -0
- rdkit_cli-0.3.0/src/rdkit_cli/core/split.py +118 -0
- rdkit_cli-0.3.0/src/rdkit_cli/core/stats.py +96 -0
- rdkit_cli-0.3.0/src/rdkit_cli/core/validate.py +222 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/io/readers.py +11 -4
- rdkit_cli-0.3.0/src/rdkit_cli/utils/__init__.py +27 -0
- rdkit_cli-0.3.0/src/rdkit_cli/utils/logging.py +113 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/integration/test_cli.py +401 -0
- rdkit_cli-0.3.0/tests/integration/test_new_commands.py +355 -0
- rdkit_cli-0.3.0/tests/unit/test_align.py +135 -0
- rdkit_cli-0.3.0/tests/unit/test_deduplicate.py +161 -0
- rdkit_cli-0.3.0/tests/unit/test_filters.py +529 -0
- rdkit_cli-0.3.0/tests/unit/test_info.py +97 -0
- rdkit_cli-0.3.0/tests/unit/test_merge.py +79 -0
- rdkit_cli-0.3.0/tests/unit/test_mmp.py +177 -0
- rdkit_cli-0.3.0/tests/unit/test_protonate.py +166 -0
- rdkit_cli-0.3.0/tests/unit/test_rgroup.py +121 -0
- rdkit_cli-0.3.0/tests/unit/test_rings.py +158 -0
- rdkit_cli-0.3.0/tests/unit/test_rmsd.py +171 -0
- rdkit_cli-0.3.0/tests/unit/test_sample.py +155 -0
- rdkit_cli-0.3.0/tests/unit/test_sascorer.py +135 -0
- rdkit_cli-0.3.0/tests/unit/test_split.py +101 -0
- rdkit_cli-0.3.0/tests/unit/test_stats.py +108 -0
- rdkit_cli-0.3.0/tests/unit/test_validate.py +208 -0
- rdkit_cli-0.1.0/CHANGELOG.md +0 -43
- rdkit_cli-0.1.0/src/rdkit_cli/utils/__init__.py +0 -1
- rdkit_cli-0.1.0/tests/unit/test_filters.py +0 -211
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/.github/workflows/publish.yml +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/LICENSE +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/__main__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/__init__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/conformers.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/convert.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/diversity.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/enumerate.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/filter.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/fingerprints.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/fragment.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/mcs.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/reactions.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/scaffold.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/similarity.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/__init__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/conformers.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/depict.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/descriptors.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/diversity.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/enumerate.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/filters.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/fingerprints.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/fragment.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/mcs.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/reactions.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/scaffold.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/similarity.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/standardizer.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/io/__init__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/io/formats.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/io/writers.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/parallel/__init__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/parallel/batch.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/parallel/executor.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/progress/__init__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/progress/ninja.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/__init__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/conftest.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/fixtures/sample.csv +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/fixtures/sample.smi +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/integration/__init__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/integration/test_interop.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/__init__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_depict.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_descriptors.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_diversity.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_enumerate.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_fingerprints.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_fragment.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_io.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_mcs.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_reactions.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_scaffold.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_similarity.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_standardizer.py +0 -0
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.3.0] - 2026-01-10
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- **info**: Quick molecule information from SMILES (formula, MW, LogP, TPSA, stereocenters, Lipinski violations, InChI/InChIKey)
|
|
13
|
+
- **merge**: Combine multiple molecule files with optional deduplication and source tracking
|
|
14
|
+
- **sascorer**: Calculate Synthetic Accessibility (SA) Score, Natural Product-likeness (NPC), and QED scores
|
|
15
|
+
- **rgroup**: R-group decomposition around a core SMARTS pattern with labeled attachment points
|
|
16
|
+
- **rings**: Ring system analysis - extract ring systems (fused, spiro, bridged) and analyze frequencies
|
|
17
|
+
- **align**: 3D molecular alignment to a reference structure (MCS-based or Open3DAlign)
|
|
18
|
+
- **rmsd**: RMSD calculations between 3D structures (compare to reference, pairwise matrix, conformer analysis)
|
|
19
|
+
- **mmp**: Matched Molecular Pairs analysis - fragment molecules, find pairs, apply transformations
|
|
20
|
+
- **protonate**: Protonation state enumeration at specified pH with neutralization option
|
|
21
|
+
- **props**: Property column operations - add, rename, drop, keep columns in molecule files
|
|
22
|
+
|
|
23
|
+
### Changed
|
|
24
|
+
|
|
25
|
+
- Total command count increased from 19 to 29
|
|
26
|
+
|
|
27
|
+
## [0.2.0] - 2026-01-06
|
|
28
|
+
|
|
29
|
+
### Added
|
|
30
|
+
|
|
31
|
+
- **stats**: Calculate dataset statistics (MolWt, LogP, TPSA, etc. with min/max/mean/median/stdev)
|
|
32
|
+
- **split**: Split files into smaller chunks (by number of chunks or chunk size)
|
|
33
|
+
- **sample**: Randomly sample molecules (by count or fraction, with reservoir sampling for large files)
|
|
34
|
+
- **deduplicate**: Remove duplicate molecules (by SMILES, InChI, InChIKey, or scaffold)
|
|
35
|
+
- **validate**: Validate molecular structures (valence, kekulization, stereo, element constraints)
|
|
36
|
+
|
|
37
|
+
### Changed
|
|
38
|
+
|
|
39
|
+
- Commands are now displayed in alphabetical order in help output
|
|
40
|
+
- Total command count increased from 14 to 19
|
|
41
|
+
|
|
42
|
+
## [0.1.0] - 2026-01-06
|
|
43
|
+
|
|
44
|
+
### Added
|
|
45
|
+
|
|
46
|
+
- Initial release with 14 command categories
|
|
47
|
+
- **descriptors**: Compute molecular descriptors (200+ available)
|
|
48
|
+
- **fingerprints**: Generate molecular fingerprints (morgan, maccs, rdkit, atompair, torsion, pattern)
|
|
49
|
+
- **filter**: Filter molecules by substructure, properties, drug-likeness (Lipinski/Veber/Ghose), PAINS
|
|
50
|
+
- **convert**: Convert between molecular file formats (CSV, TSV, SMI, SDF, Parquet)
|
|
51
|
+
- **standardize**: Standardize and canonicalize molecules
|
|
52
|
+
- **similarity**: Similarity search, matrix computation, and clustering
|
|
53
|
+
- **conformers**: Generate and optimize 3D conformers
|
|
54
|
+
- **reactions**: SMIRKS transformations and reaction enumeration
|
|
55
|
+
- **scaffold**: Murcko scaffold extraction and decomposition
|
|
56
|
+
- **enumerate**: Stereoisomer and tautomer enumeration
|
|
57
|
+
- **fragment**: BRICS/RECAP fragmentation and functional group analysis
|
|
58
|
+
- **diversity**: MaxMin diversity picking and diversity analysis
|
|
59
|
+
- **mcs**: Maximum Common Substructure finding
|
|
60
|
+
- **depict**: SVG/PNG molecular depictions (single, batch, grid)
|
|
61
|
+
|
|
62
|
+
### Features
|
|
63
|
+
|
|
64
|
+
- Multi-core parallel processing via ProcessPoolExecutor
|
|
65
|
+
- Ninja-style progress display with speed and ETA
|
|
66
|
+
- Support for multiple I/O formats (CSV, TSV, SMI, SDF, Parquet)
|
|
67
|
+
- Automatic format detection from file extensions
|
|
68
|
+
- Lazy imports for fast CLI startup (~0.08s)
|
|
69
|
+
- Comprehensive test suite (182 tests)
|
|
70
|
+
|
|
71
|
+
### Dependencies
|
|
72
|
+
|
|
73
|
+
- rdkit>=2024.3.1
|
|
74
|
+
- rich-argparse>=1.4.0
|
|
75
|
+
- pandas>=2.0.0
|
|
76
|
+
- pyarrow>=14.0.0
|
|
77
|
+
- numpy>=1.24.0
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rdkit-cli
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: A comprehensive CLI tool for RDKit cheminformatics operations
|
|
5
5
|
Project-URL: Homepage, https://github.com/vitruves/rdkit-cli
|
|
6
6
|
Project-URL: Repository, https://github.com/vitruves/rdkit-cli
|
|
7
7
|
Project-URL: Issues, https://github.com/vitruves/rdkit-cli/issues
|
|
8
|
-
Author:
|
|
8
|
+
Author: Johan HG Natter
|
|
9
9
|
License-Expression: Apache-2.0
|
|
10
10
|
License-File: LICENSE
|
|
11
11
|
Keywords: cheminformatics,chemistry,cli,fingerprints,molecular-descriptors,rdkit
|
|
@@ -38,7 +38,7 @@ A comprehensive, high-performance CLI tool wrapping RDKit functionality for chem
|
|
|
38
38
|
|
|
39
39
|
## Features
|
|
40
40
|
|
|
41
|
-
- **
|
|
41
|
+
- **29 Command Categories**: align, conformers, convert, deduplicate, depict, descriptors, diversity, enumerate, filter, fingerprints, fragment, info, mcs, merge, mmp, props, protonate, reactions, rgroup, rings, rmsd, sample, sascorer, scaffold, similarity, split, standardize, stats, validate
|
|
42
42
|
- **Multiple Input/Output Formats**: CSV, TSV, SMI, SDF, Parquet
|
|
43
43
|
- **Parallel Processing**: Efficient multi-core support via ProcessPoolExecutor
|
|
44
44
|
- **Ninja-style Progress**: Real-time progress display with speed and ETA
|
|
@@ -290,6 +290,247 @@ rdkit-cli depict batch -i molecules.csv -o images/ -f svg
|
|
|
290
290
|
rdkit-cli depict grid -i molecules.csv -o grid.svg --mols-per-row 4
|
|
291
291
|
```
|
|
292
292
|
|
|
293
|
+
### stats
|
|
294
|
+
|
|
295
|
+
Calculate dataset statistics.
|
|
296
|
+
|
|
297
|
+
```bash
|
|
298
|
+
# Basic statistics
|
|
299
|
+
rdkit-cli stats -i molecules.csv -o stats.json --format json
|
|
300
|
+
|
|
301
|
+
# Specific properties
|
|
302
|
+
rdkit-cli stats -i molecules.csv -p MolWt,LogP,TPSA
|
|
303
|
+
|
|
304
|
+
# List available properties
|
|
305
|
+
rdkit-cli stats -i molecules.csv --list-properties
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
### split
|
|
309
|
+
|
|
310
|
+
Split files into smaller chunks.
|
|
311
|
+
|
|
312
|
+
```bash
|
|
313
|
+
# Split into N files
|
|
314
|
+
rdkit-cli split -i large.csv -o chunks/ -c 10
|
|
315
|
+
|
|
316
|
+
# Split by chunk size
|
|
317
|
+
rdkit-cli split -i large.csv -o chunks/ -s 1000
|
|
318
|
+
|
|
319
|
+
# With custom prefix
|
|
320
|
+
rdkit-cli split -i large.csv -o chunks/ -c 5 --prefix molecules
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
### sample
|
|
324
|
+
|
|
325
|
+
Randomly sample molecules.
|
|
326
|
+
|
|
327
|
+
```bash
|
|
328
|
+
# Sample by count
|
|
329
|
+
rdkit-cli sample -i molecules.csv -o sample.csv -k 100 --seed 42
|
|
330
|
+
|
|
331
|
+
# Sample by fraction
|
|
332
|
+
rdkit-cli sample -i molecules.csv -o sample.csv -f 0.1
|
|
333
|
+
|
|
334
|
+
# Memory-efficient streaming (reservoir sampling)
|
|
335
|
+
rdkit-cli sample -i huge.csv -o sample.csv -k 1000 --stream
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
### deduplicate
|
|
339
|
+
|
|
340
|
+
Remove duplicate molecules.
|
|
341
|
+
|
|
342
|
+
```bash
|
|
343
|
+
# Deduplicate by canonical SMILES (default)
|
|
344
|
+
rdkit-cli deduplicate -i molecules.csv -o unique.csv
|
|
345
|
+
|
|
346
|
+
# Deduplicate by InChIKey
|
|
347
|
+
rdkit-cli deduplicate -i molecules.csv -o unique.csv -b inchikey
|
|
348
|
+
|
|
349
|
+
# Deduplicate by scaffold
|
|
350
|
+
rdkit-cli deduplicate -i molecules.csv -o unique.csv -b scaffold
|
|
351
|
+
|
|
352
|
+
# Keep last occurrence instead of first
|
|
353
|
+
rdkit-cli deduplicate -i molecules.csv -o unique.csv --keep last
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
### validate
|
|
357
|
+
|
|
358
|
+
Validate molecular structures.
|
|
359
|
+
|
|
360
|
+
```bash
|
|
361
|
+
# Basic validation
|
|
362
|
+
rdkit-cli validate -i molecules.csv -o validated.csv
|
|
363
|
+
|
|
364
|
+
# Output only valid molecules
|
|
365
|
+
rdkit-cli validate -i molecules.csv -o valid.csv --valid-only
|
|
366
|
+
|
|
367
|
+
# With constraints
|
|
368
|
+
rdkit-cli validate -i molecules.csv -o validated.csv \
|
|
369
|
+
--max-atoms 100 --max-rings 8
|
|
370
|
+
|
|
371
|
+
# Check allowed elements
|
|
372
|
+
rdkit-cli validate -i molecules.csv -o validated.csv \
|
|
373
|
+
--allowed-elements C,H,N,O,S,F,Cl
|
|
374
|
+
|
|
375
|
+
# Check stereo and show summary
|
|
376
|
+
rdkit-cli validate -i molecules.csv -o validated.csv \
|
|
377
|
+
--check-stereo --summary
|
|
378
|
+
```
|
|
379
|
+
|
|
380
|
+
### info
|
|
381
|
+
|
|
382
|
+
Quick molecule information from SMILES.
|
|
383
|
+
|
|
384
|
+
```bash
|
|
385
|
+
# Basic info
|
|
386
|
+
rdkit-cli info "CCO"
|
|
387
|
+
|
|
388
|
+
# JSON output
|
|
389
|
+
rdkit-cli info "c1ccccc1" --json
|
|
390
|
+
|
|
391
|
+
# Shows: formula, MW, LogP, TPSA, stereocenters, Lipinski violations, InChI/InChIKey
|
|
392
|
+
```
|
|
393
|
+
|
|
394
|
+
### merge
|
|
395
|
+
|
|
396
|
+
Combine multiple molecule files.
|
|
397
|
+
|
|
398
|
+
```bash
|
|
399
|
+
# Merge two files
|
|
400
|
+
rdkit-cli merge -i file1.csv file2.csv -o merged.csv
|
|
401
|
+
|
|
402
|
+
# Merge with deduplication
|
|
403
|
+
rdkit-cli merge -i file1.csv file2.csv -o merged.csv --dedupe
|
|
404
|
+
|
|
405
|
+
# Track source file
|
|
406
|
+
rdkit-cli merge -i file1.csv file2.csv -o merged.csv --source-column source
|
|
407
|
+
```
|
|
408
|
+
|
|
409
|
+
### sascorer
|
|
410
|
+
|
|
411
|
+
Calculate synthetic accessibility and drug-likeness scores.
|
|
412
|
+
|
|
413
|
+
```bash
|
|
414
|
+
# SA Score only (default)
|
|
415
|
+
rdkit-cli sascorer -i molecules.csv -o scores.csv
|
|
416
|
+
|
|
417
|
+
# Include QED score
|
|
418
|
+
rdkit-cli sascorer -i molecules.csv -o scores.csv --qed
|
|
419
|
+
|
|
420
|
+
# Include Natural Product-likeness score
|
|
421
|
+
rdkit-cli sascorer -i molecules.csv -o scores.csv --npc
|
|
422
|
+
|
|
423
|
+
# All scores
|
|
424
|
+
rdkit-cli sascorer -i molecules.csv -o scores.csv --qed --npc
|
|
425
|
+
```
|
|
426
|
+
|
|
427
|
+
### rgroup
|
|
428
|
+
|
|
429
|
+
R-group decomposition around a core structure.
|
|
430
|
+
|
|
431
|
+
```bash
|
|
432
|
+
# Decompose around benzene core
|
|
433
|
+
rdkit-cli rgroup -i molecules.csv -o decomposed.csv --core "c1ccc([*:1])cc1"
|
|
434
|
+
|
|
435
|
+
# Multiple attachment points
|
|
436
|
+
rdkit-cli rgroup -i molecules.csv -o decomposed.csv \
|
|
437
|
+
--core "c1ccc([*:1])cc1[*:2]"
|
|
438
|
+
```
|
|
439
|
+
|
|
440
|
+
### rings
|
|
441
|
+
|
|
442
|
+
Ring system analysis.
|
|
443
|
+
|
|
444
|
+
```bash
|
|
445
|
+
# Extract ring systems
|
|
446
|
+
rdkit-cli rings extract -i molecules.csv -o rings.csv
|
|
447
|
+
|
|
448
|
+
# Ring information (counts, sizes, aromaticity)
|
|
449
|
+
rdkit-cli rings info -i molecules.csv -o ring_info.csv
|
|
450
|
+
|
|
451
|
+
# Frequency analysis
|
|
452
|
+
rdkit-cli rings frequency -i molecules.csv -o ring_freq.csv
|
|
453
|
+
```
|
|
454
|
+
|
|
455
|
+
### align
|
|
456
|
+
|
|
457
|
+
3D molecular alignment.
|
|
458
|
+
|
|
459
|
+
```bash
|
|
460
|
+
# Align to reference structure (MCS-based)
|
|
461
|
+
rdkit-cli align -i probes.sdf -o aligned.sdf -r reference.sdf
|
|
462
|
+
|
|
463
|
+
# Open3DAlign method
|
|
464
|
+
rdkit-cli align -i probes.sdf -o aligned.sdf -r reference.sdf --method o3a
|
|
465
|
+
```
|
|
466
|
+
|
|
467
|
+
### rmsd
|
|
468
|
+
|
|
469
|
+
RMSD calculations between 3D structures.
|
|
470
|
+
|
|
471
|
+
```bash
|
|
472
|
+
# Compare to reference
|
|
473
|
+
rdkit-cli rmsd compare -i molecules.sdf -o results.csv -r reference.sdf
|
|
474
|
+
|
|
475
|
+
# Pairwise RMSD matrix
|
|
476
|
+
rdkit-cli rmsd matrix -i molecules.sdf -o matrix.csv
|
|
477
|
+
|
|
478
|
+
# Conformer RMSD analysis
|
|
479
|
+
rdkit-cli rmsd conformers -i multi_conf.sdf -o conf_rmsd.csv
|
|
480
|
+
```
|
|
481
|
+
|
|
482
|
+
### mmp
|
|
483
|
+
|
|
484
|
+
Matched Molecular Pairs analysis.
|
|
485
|
+
|
|
486
|
+
```bash
|
|
487
|
+
# Fragment molecules for MMP
|
|
488
|
+
rdkit-cli mmp fragment -i molecules.csv -o fragments.csv
|
|
489
|
+
|
|
490
|
+
# Find matched pairs
|
|
491
|
+
rdkit-cli mmp pairs -i fragments.csv -o pairs.csv
|
|
492
|
+
|
|
493
|
+
# Apply MMP transformation
|
|
494
|
+
rdkit-cli mmp transform -i molecules.csv -o transformed.csv \
|
|
495
|
+
-t "[c:1][CH3]>>[c:1][NH2]"
|
|
496
|
+
```
|
|
497
|
+
|
|
498
|
+
### protonate
|
|
499
|
+
|
|
500
|
+
Protonation state enumeration.
|
|
501
|
+
|
|
502
|
+
```bash
|
|
503
|
+
# Enumerate at physiological pH
|
|
504
|
+
rdkit-cli protonate -i molecules.csv -o protonated.csv --ph 7.4
|
|
505
|
+
|
|
506
|
+
# Neutralize charged molecules
|
|
507
|
+
rdkit-cli protonate -i molecules.csv -o neutral.csv --neutralize
|
|
508
|
+
|
|
509
|
+
# Enumerate all states
|
|
510
|
+
rdkit-cli protonate -i molecules.csv -o states.csv --enumerate-all
|
|
511
|
+
```
|
|
512
|
+
|
|
513
|
+
### props
|
|
514
|
+
|
|
515
|
+
Property column operations.
|
|
516
|
+
|
|
517
|
+
```bash
|
|
518
|
+
# Add a column
|
|
519
|
+
rdkit-cli props add -i molecules.csv -o output.csv -c series -v "series_A"
|
|
520
|
+
|
|
521
|
+
# Rename a column
|
|
522
|
+
rdkit-cli props rename -i molecules.csv -o output.csv --from name --to mol_name
|
|
523
|
+
|
|
524
|
+
# Drop columns
|
|
525
|
+
rdkit-cli props drop -i molecules.csv -o output.csv -c col1,col2
|
|
526
|
+
|
|
527
|
+
# Keep only specific columns
|
|
528
|
+
rdkit-cli props keep -i molecules.csv -o output.csv -c smiles,name,MolWt
|
|
529
|
+
|
|
530
|
+
# List columns
|
|
531
|
+
rdkit-cli props list -i molecules.csv
|
|
532
|
+
```
|
|
533
|
+
|
|
293
534
|
## Global Options
|
|
294
535
|
|
|
295
536
|
| Option | Description |
|
|
@@ -319,19 +560,28 @@ rdkit-cli depict grid -i molecules.csv -o grid.svg --mols-per-row 4
|
|
|
319
560
|
### Cheminformatics Pipeline
|
|
320
561
|
|
|
321
562
|
```bash
|
|
322
|
-
# 1.
|
|
323
|
-
rdkit-cli
|
|
563
|
+
# 1. Validate and filter input
|
|
564
|
+
rdkit-cli validate -i raw.csv -o validated.csv --valid-only
|
|
565
|
+
|
|
566
|
+
# 2. Deduplicate
|
|
567
|
+
rdkit-cli deduplicate -i validated.csv -o unique.csv -b inchikey
|
|
568
|
+
|
|
569
|
+
# 3. Standardize molecules
|
|
570
|
+
rdkit-cli standardize -i unique.csv -o std.csv --cleanup --neutralize
|
|
324
571
|
|
|
325
|
-
#
|
|
572
|
+
# 4. Filter by drug-likeness
|
|
326
573
|
rdkit-cli filter druglike -i std.csv -o druglike.csv --rule lipinski
|
|
327
574
|
|
|
328
|
-
#
|
|
575
|
+
# 5. Compute descriptors
|
|
329
576
|
rdkit-cli descriptors compute -i druglike.csv -o desc.csv -d MolWt,MolLogP,TPSA,HBD,HBA
|
|
330
577
|
|
|
331
|
-
#
|
|
578
|
+
# 6. Get dataset statistics
|
|
579
|
+
rdkit-cli stats -i druglike.csv -o stats.json --format json
|
|
580
|
+
|
|
581
|
+
# 7. Select diverse subset
|
|
332
582
|
rdkit-cli diversity pick -i druglike.csv -o diverse.csv -k 500
|
|
333
583
|
|
|
334
|
-
#
|
|
584
|
+
# 8. Generate depictions
|
|
335
585
|
rdkit-cli depict grid -i diverse.csv -o library.svg --mols-per-row 10
|
|
336
586
|
```
|
|
337
587
|
|
|
@@ -358,6 +608,19 @@ rdkit-cli scaffold murcko -i library.csv -o scaffolds.csv
|
|
|
358
608
|
rdkit-cli diversity analyze -i scaffolds.csv --smiles-column scaffold
|
|
359
609
|
```
|
|
360
610
|
|
|
611
|
+
### Large Dataset Processing
|
|
612
|
+
|
|
613
|
+
```bash
|
|
614
|
+
# Sample from a huge dataset
|
|
615
|
+
rdkit-cli sample -i huge_library.csv -o sample.csv -k 10000 --stream
|
|
616
|
+
|
|
617
|
+
# Split for parallel processing
|
|
618
|
+
rdkit-cli split -i library.csv -o batches/ -c 10
|
|
619
|
+
|
|
620
|
+
# Process batches in parallel (using xargs)
|
|
621
|
+
ls batches/*.csv | xargs -P 4 -I {} rdkit-cli descriptors compute -i {} -o {}.desc.csv -d MolWt,MolLogP
|
|
622
|
+
```
|
|
623
|
+
|
|
361
624
|
## Development
|
|
362
625
|
|
|
363
626
|
```bash
|
|
@@ -4,7 +4,7 @@ A comprehensive, high-performance CLI tool wrapping RDKit functionality for chem
|
|
|
4
4
|
|
|
5
5
|
## Features
|
|
6
6
|
|
|
7
|
-
- **
|
|
7
|
+
- **29 Command Categories**: align, conformers, convert, deduplicate, depict, descriptors, diversity, enumerate, filter, fingerprints, fragment, info, mcs, merge, mmp, props, protonate, reactions, rgroup, rings, rmsd, sample, sascorer, scaffold, similarity, split, standardize, stats, validate
|
|
8
8
|
- **Multiple Input/Output Formats**: CSV, TSV, SMI, SDF, Parquet
|
|
9
9
|
- **Parallel Processing**: Efficient multi-core support via ProcessPoolExecutor
|
|
10
10
|
- **Ninja-style Progress**: Real-time progress display with speed and ETA
|
|
@@ -256,6 +256,247 @@ rdkit-cli depict batch -i molecules.csv -o images/ -f svg
|
|
|
256
256
|
rdkit-cli depict grid -i molecules.csv -o grid.svg --mols-per-row 4
|
|
257
257
|
```
|
|
258
258
|
|
|
259
|
+
### stats
|
|
260
|
+
|
|
261
|
+
Calculate dataset statistics.
|
|
262
|
+
|
|
263
|
+
```bash
|
|
264
|
+
# Basic statistics
|
|
265
|
+
rdkit-cli stats -i molecules.csv -o stats.json --format json
|
|
266
|
+
|
|
267
|
+
# Specific properties
|
|
268
|
+
rdkit-cli stats -i molecules.csv -p MolWt,LogP,TPSA
|
|
269
|
+
|
|
270
|
+
# List available properties
|
|
271
|
+
rdkit-cli stats -i molecules.csv --list-properties
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
### split
|
|
275
|
+
|
|
276
|
+
Split files into smaller chunks.
|
|
277
|
+
|
|
278
|
+
```bash
|
|
279
|
+
# Split into N files
|
|
280
|
+
rdkit-cli split -i large.csv -o chunks/ -c 10
|
|
281
|
+
|
|
282
|
+
# Split by chunk size
|
|
283
|
+
rdkit-cli split -i large.csv -o chunks/ -s 1000
|
|
284
|
+
|
|
285
|
+
# With custom prefix
|
|
286
|
+
rdkit-cli split -i large.csv -o chunks/ -c 5 --prefix molecules
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
### sample
|
|
290
|
+
|
|
291
|
+
Randomly sample molecules.
|
|
292
|
+
|
|
293
|
+
```bash
|
|
294
|
+
# Sample by count
|
|
295
|
+
rdkit-cli sample -i molecules.csv -o sample.csv -k 100 --seed 42
|
|
296
|
+
|
|
297
|
+
# Sample by fraction
|
|
298
|
+
rdkit-cli sample -i molecules.csv -o sample.csv -f 0.1
|
|
299
|
+
|
|
300
|
+
# Memory-efficient streaming (reservoir sampling)
|
|
301
|
+
rdkit-cli sample -i huge.csv -o sample.csv -k 1000 --stream
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
### deduplicate
|
|
305
|
+
|
|
306
|
+
Remove duplicate molecules.
|
|
307
|
+
|
|
308
|
+
```bash
|
|
309
|
+
# Deduplicate by canonical SMILES (default)
|
|
310
|
+
rdkit-cli deduplicate -i molecules.csv -o unique.csv
|
|
311
|
+
|
|
312
|
+
# Deduplicate by InChIKey
|
|
313
|
+
rdkit-cli deduplicate -i molecules.csv -o unique.csv -b inchikey
|
|
314
|
+
|
|
315
|
+
# Deduplicate by scaffold
|
|
316
|
+
rdkit-cli deduplicate -i molecules.csv -o unique.csv -b scaffold
|
|
317
|
+
|
|
318
|
+
# Keep last occurrence instead of first
|
|
319
|
+
rdkit-cli deduplicate -i molecules.csv -o unique.csv --keep last
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
### validate
|
|
323
|
+
|
|
324
|
+
Validate molecular structures.
|
|
325
|
+
|
|
326
|
+
```bash
|
|
327
|
+
# Basic validation
|
|
328
|
+
rdkit-cli validate -i molecules.csv -o validated.csv
|
|
329
|
+
|
|
330
|
+
# Output only valid molecules
|
|
331
|
+
rdkit-cli validate -i molecules.csv -o valid.csv --valid-only
|
|
332
|
+
|
|
333
|
+
# With constraints
|
|
334
|
+
rdkit-cli validate -i molecules.csv -o validated.csv \
|
|
335
|
+
--max-atoms 100 --max-rings 8
|
|
336
|
+
|
|
337
|
+
# Check allowed elements
|
|
338
|
+
rdkit-cli validate -i molecules.csv -o validated.csv \
|
|
339
|
+
--allowed-elements C,H,N,O,S,F,Cl
|
|
340
|
+
|
|
341
|
+
# Check stereo and show summary
|
|
342
|
+
rdkit-cli validate -i molecules.csv -o validated.csv \
|
|
343
|
+
--check-stereo --summary
|
|
344
|
+
```
|
|
345
|
+
|
|
346
|
+
### info
|
|
347
|
+
|
|
348
|
+
Quick molecule information from SMILES.
|
|
349
|
+
|
|
350
|
+
```bash
|
|
351
|
+
# Basic info
|
|
352
|
+
rdkit-cli info "CCO"
|
|
353
|
+
|
|
354
|
+
# JSON output
|
|
355
|
+
rdkit-cli info "c1ccccc1" --json
|
|
356
|
+
|
|
357
|
+
# Shows: formula, MW, LogP, TPSA, stereocenters, Lipinski violations, InChI/InChIKey
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
### merge
|
|
361
|
+
|
|
362
|
+
Combine multiple molecule files.
|
|
363
|
+
|
|
364
|
+
```bash
|
|
365
|
+
# Merge two files
|
|
366
|
+
rdkit-cli merge -i file1.csv file2.csv -o merged.csv
|
|
367
|
+
|
|
368
|
+
# Merge with deduplication
|
|
369
|
+
rdkit-cli merge -i file1.csv file2.csv -o merged.csv --dedupe
|
|
370
|
+
|
|
371
|
+
# Track source file
|
|
372
|
+
rdkit-cli merge -i file1.csv file2.csv -o merged.csv --source-column source
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
### sascorer
|
|
376
|
+
|
|
377
|
+
Calculate synthetic accessibility and drug-likeness scores.
|
|
378
|
+
|
|
379
|
+
```bash
|
|
380
|
+
# SA Score only (default)
|
|
381
|
+
rdkit-cli sascorer -i molecules.csv -o scores.csv
|
|
382
|
+
|
|
383
|
+
# Include QED score
|
|
384
|
+
rdkit-cli sascorer -i molecules.csv -o scores.csv --qed
|
|
385
|
+
|
|
386
|
+
# Include Natural Product-likeness score
|
|
387
|
+
rdkit-cli sascorer -i molecules.csv -o scores.csv --npc
|
|
388
|
+
|
|
389
|
+
# All scores
|
|
390
|
+
rdkit-cli sascorer -i molecules.csv -o scores.csv --qed --npc
|
|
391
|
+
```
|
|
392
|
+
|
|
393
|
+
### rgroup
|
|
394
|
+
|
|
395
|
+
R-group decomposition around a core structure.
|
|
396
|
+
|
|
397
|
+
```bash
|
|
398
|
+
# Decompose around benzene core
|
|
399
|
+
rdkit-cli rgroup -i molecules.csv -o decomposed.csv --core "c1ccc([*:1])cc1"
|
|
400
|
+
|
|
401
|
+
# Multiple attachment points
|
|
402
|
+
rdkit-cli rgroup -i molecules.csv -o decomposed.csv \
|
|
403
|
+
--core "c1ccc([*:1])cc1[*:2]"
|
|
404
|
+
```
|
|
405
|
+
|
|
406
|
+
### rings
|
|
407
|
+
|
|
408
|
+
Ring system analysis.
|
|
409
|
+
|
|
410
|
+
```bash
|
|
411
|
+
# Extract ring systems
|
|
412
|
+
rdkit-cli rings extract -i molecules.csv -o rings.csv
|
|
413
|
+
|
|
414
|
+
# Ring information (counts, sizes, aromaticity)
|
|
415
|
+
rdkit-cli rings info -i molecules.csv -o ring_info.csv
|
|
416
|
+
|
|
417
|
+
# Frequency analysis
|
|
418
|
+
rdkit-cli rings frequency -i molecules.csv -o ring_freq.csv
|
|
419
|
+
```
|
|
420
|
+
|
|
421
|
+
### align
|
|
422
|
+
|
|
423
|
+
3D molecular alignment.
|
|
424
|
+
|
|
425
|
+
```bash
|
|
426
|
+
# Align to reference structure (MCS-based)
|
|
427
|
+
rdkit-cli align -i probes.sdf -o aligned.sdf -r reference.sdf
|
|
428
|
+
|
|
429
|
+
# Open3DAlign method
|
|
430
|
+
rdkit-cli align -i probes.sdf -o aligned.sdf -r reference.sdf --method o3a
|
|
431
|
+
```
|
|
432
|
+
|
|
433
|
+
### rmsd
|
|
434
|
+
|
|
435
|
+
RMSD calculations between 3D structures.
|
|
436
|
+
|
|
437
|
+
```bash
|
|
438
|
+
# Compare to reference
|
|
439
|
+
rdkit-cli rmsd compare -i molecules.sdf -o results.csv -r reference.sdf
|
|
440
|
+
|
|
441
|
+
# Pairwise RMSD matrix
|
|
442
|
+
rdkit-cli rmsd matrix -i molecules.sdf -o matrix.csv
|
|
443
|
+
|
|
444
|
+
# Conformer RMSD analysis
|
|
445
|
+
rdkit-cli rmsd conformers -i multi_conf.sdf -o conf_rmsd.csv
|
|
446
|
+
```
|
|
447
|
+
|
|
448
|
+
### mmp
|
|
449
|
+
|
|
450
|
+
Matched Molecular Pairs analysis.
|
|
451
|
+
|
|
452
|
+
```bash
|
|
453
|
+
# Fragment molecules for MMP
|
|
454
|
+
rdkit-cli mmp fragment -i molecules.csv -o fragments.csv
|
|
455
|
+
|
|
456
|
+
# Find matched pairs
|
|
457
|
+
rdkit-cli mmp pairs -i fragments.csv -o pairs.csv
|
|
458
|
+
|
|
459
|
+
# Apply MMP transformation
|
|
460
|
+
rdkit-cli mmp transform -i molecules.csv -o transformed.csv \
|
|
461
|
+
-t "[c:1][CH3]>>[c:1][NH2]"
|
|
462
|
+
```
|
|
463
|
+
|
|
464
|
+
### protonate
|
|
465
|
+
|
|
466
|
+
Protonation state enumeration.
|
|
467
|
+
|
|
468
|
+
```bash
|
|
469
|
+
# Enumerate at physiological pH
|
|
470
|
+
rdkit-cli protonate -i molecules.csv -o protonated.csv --ph 7.4
|
|
471
|
+
|
|
472
|
+
# Neutralize charged molecules
|
|
473
|
+
rdkit-cli protonate -i molecules.csv -o neutral.csv --neutralize
|
|
474
|
+
|
|
475
|
+
# Enumerate all states
|
|
476
|
+
rdkit-cli protonate -i molecules.csv -o states.csv --enumerate-all
|
|
477
|
+
```
|
|
478
|
+
|
|
479
|
+
### props
|
|
480
|
+
|
|
481
|
+
Property column operations.
|
|
482
|
+
|
|
483
|
+
```bash
|
|
484
|
+
# Add a column
|
|
485
|
+
rdkit-cli props add -i molecules.csv -o output.csv -c series -v "series_A"
|
|
486
|
+
|
|
487
|
+
# Rename a column
|
|
488
|
+
rdkit-cli props rename -i molecules.csv -o output.csv --from name --to mol_name
|
|
489
|
+
|
|
490
|
+
# Drop columns
|
|
491
|
+
rdkit-cli props drop -i molecules.csv -o output.csv -c col1,col2
|
|
492
|
+
|
|
493
|
+
# Keep only specific columns
|
|
494
|
+
rdkit-cli props keep -i molecules.csv -o output.csv -c smiles,name,MolWt
|
|
495
|
+
|
|
496
|
+
# List columns
|
|
497
|
+
rdkit-cli props list -i molecules.csv
|
|
498
|
+
```
|
|
499
|
+
|
|
259
500
|
## Global Options
|
|
260
501
|
|
|
261
502
|
| Option | Description |
|
|
@@ -285,19 +526,28 @@ rdkit-cli depict grid -i molecules.csv -o grid.svg --mols-per-row 4
|
|
|
285
526
|
### Cheminformatics Pipeline
|
|
286
527
|
|
|
287
528
|
```bash
|
|
288
|
-
# 1.
|
|
289
|
-
rdkit-cli
|
|
529
|
+
# 1. Validate and filter input
|
|
530
|
+
rdkit-cli validate -i raw.csv -o validated.csv --valid-only
|
|
531
|
+
|
|
532
|
+
# 2. Deduplicate
|
|
533
|
+
rdkit-cli deduplicate -i validated.csv -o unique.csv -b inchikey
|
|
534
|
+
|
|
535
|
+
# 3. Standardize molecules
|
|
536
|
+
rdkit-cli standardize -i unique.csv -o std.csv --cleanup --neutralize
|
|
290
537
|
|
|
291
|
-
#
|
|
538
|
+
# 4. Filter by drug-likeness
|
|
292
539
|
rdkit-cli filter druglike -i std.csv -o druglike.csv --rule lipinski
|
|
293
540
|
|
|
294
|
-
#
|
|
541
|
+
# 5. Compute descriptors
|
|
295
542
|
rdkit-cli descriptors compute -i druglike.csv -o desc.csv -d MolWt,MolLogP,TPSA,HBD,HBA
|
|
296
543
|
|
|
297
|
-
#
|
|
544
|
+
# 6. Get dataset statistics
|
|
545
|
+
rdkit-cli stats -i druglike.csv -o stats.json --format json
|
|
546
|
+
|
|
547
|
+
# 7. Select diverse subset
|
|
298
548
|
rdkit-cli diversity pick -i druglike.csv -o diverse.csv -k 500
|
|
299
549
|
|
|
300
|
-
#
|
|
550
|
+
# 8. Generate depictions
|
|
301
551
|
rdkit-cli depict grid -i diverse.csv -o library.svg --mols-per-row 10
|
|
302
552
|
```
|
|
303
553
|
|
|
@@ -324,6 +574,19 @@ rdkit-cli scaffold murcko -i library.csv -o scaffolds.csv
|
|
|
324
574
|
rdkit-cli diversity analyze -i scaffolds.csv --smiles-column scaffold
|
|
325
575
|
```
|
|
326
576
|
|
|
577
|
+
### Large Dataset Processing
|
|
578
|
+
|
|
579
|
+
```bash
|
|
580
|
+
# Sample from a huge dataset
|
|
581
|
+
rdkit-cli sample -i huge_library.csv -o sample.csv -k 10000 --stream
|
|
582
|
+
|
|
583
|
+
# Split for parallel processing
|
|
584
|
+
rdkit-cli split -i library.csv -o batches/ -c 10
|
|
585
|
+
|
|
586
|
+
# Process batches in parallel (using xargs)
|
|
587
|
+
ls batches/*.csv | xargs -P 4 -I {} rdkit-cli descriptors compute -i {} -o {}.desc.csv -d MolWt,MolLogP
|
|
588
|
+
```
|
|
589
|
+
|
|
327
590
|
## Development
|
|
328
591
|
|
|
329
592
|
```bash
|