rdkit-cli 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/CHANGELOG.md +15 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/PKG-INFO +117 -8
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/README.md +116 -7
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/pyproject.toml +1 -1
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/__init__.py +1 -1
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/cli.py +55 -21
- rdkit_cli-0.2.0/src/rdkit_cli/commands/deduplicate.py +123 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/depict.py +122 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/descriptors.py +38 -0
- rdkit_cli-0.2.0/src/rdkit_cli/commands/sample.py +152 -0
- rdkit_cli-0.2.0/src/rdkit_cli/commands/split.py +152 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/standardize.py +61 -0
- rdkit_cli-0.2.0/src/rdkit_cli/commands/stats.py +172 -0
- rdkit_cli-0.2.0/src/rdkit_cli/commands/validate.py +189 -0
- rdkit_cli-0.2.0/src/rdkit_cli/core/deduplicate.py +190 -0
- rdkit_cli-0.2.0/src/rdkit_cli/core/sample.py +156 -0
- rdkit_cli-0.2.0/src/rdkit_cli/core/split.py +118 -0
- rdkit_cli-0.2.0/src/rdkit_cli/core/stats.py +96 -0
- rdkit_cli-0.2.0/src/rdkit_cli/core/validate.py +222 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/io/readers.py +11 -4
- rdkit_cli-0.2.0/src/rdkit_cli/utils/__init__.py +27 -0
- rdkit_cli-0.2.0/src/rdkit_cli/utils/logging.py +113 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/integration/test_cli.py +401 -0
- rdkit_cli-0.2.0/tests/unit/test_deduplicate.py +161 -0
- rdkit_cli-0.2.0/tests/unit/test_filters.py +529 -0
- rdkit_cli-0.2.0/tests/unit/test_sample.py +155 -0
- rdkit_cli-0.2.0/tests/unit/test_split.py +101 -0
- rdkit_cli-0.2.0/tests/unit/test_stats.py +108 -0
- rdkit_cli-0.2.0/tests/unit/test_validate.py +208 -0
- rdkit_cli-0.1.0/src/rdkit_cli/utils/__init__.py +0 -1
- rdkit_cli-0.1.0/tests/unit/test_filters.py +0 -211
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/.github/workflows/publish.yml +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/LICENSE +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/__main__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/__init__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/conformers.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/convert.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/diversity.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/enumerate.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/filter.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/fingerprints.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/fragment.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/mcs.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/reactions.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/scaffold.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/similarity.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/__init__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/conformers.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/depict.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/descriptors.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/diversity.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/enumerate.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/filters.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/fingerprints.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/fragment.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/mcs.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/reactions.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/scaffold.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/similarity.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/standardizer.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/io/__init__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/io/formats.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/io/writers.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/parallel/__init__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/parallel/batch.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/parallel/executor.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/progress/__init__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/progress/ninja.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/__init__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/conftest.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/fixtures/sample.csv +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/fixtures/sample.smi +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/integration/__init__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/integration/test_interop.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/__init__.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_depict.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_descriptors.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_diversity.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_enumerate.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_fingerprints.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_fragment.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_io.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_mcs.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_reactions.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_scaffold.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_similarity.py +0 -0
- {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_standardizer.py +0 -0
|
@@ -5,6 +5,21 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.2.0] - 2026-01-06
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
|
|
12
|
+
- **stats**: Calculate dataset statistics (MolWt, LogP, TPSA, etc. with min/max/mean/median/stdev)
|
|
13
|
+
- **split**: Split files into smaller chunks (by number of chunks or chunk size)
|
|
14
|
+
- **sample**: Randomly sample molecules (by count or fraction, with reservoir sampling for large files)
|
|
15
|
+
- **deduplicate**: Remove duplicate molecules (by SMILES, InChI, InChIKey, or scaffold)
|
|
16
|
+
- **validate**: Validate molecular structures (valence, kekulization, stereo, element constraints)
|
|
17
|
+
|
|
18
|
+
### Changed
|
|
19
|
+
|
|
20
|
+
- Commands are now displayed in alphabetical order in help output
|
|
21
|
+
- Total command count increased from 14 to 19
|
|
22
|
+
|
|
8
23
|
## [0.1.0] - 2026-01-06
|
|
9
24
|
|
|
10
25
|
### Added
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rdkit-cli
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: A comprehensive CLI tool for RDKit cheminformatics operations
|
|
5
5
|
Project-URL: Homepage, https://github.com/vitruves/rdkit-cli
|
|
6
6
|
Project-URL: Repository, https://github.com/vitruves/rdkit-cli
|
|
@@ -38,7 +38,7 @@ A comprehensive, high-performance CLI tool wrapping RDKit functionality for chem
|
|
|
38
38
|
|
|
39
39
|
## Features
|
|
40
40
|
|
|
41
|
-
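- **14 Command Categories**: descriptors, fingerprints, filter, convert, standardize, similarity, conformers, reactions, scaffold, enumerate, fragment, diversity, mcs, depict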
|
|
41
|
+
- **19 Command Categories**: descriptors, fingerprints, filter, convert, standardize, similarity, conformers, reactions, scaffold, enumerate, fragment, diversity, mcs, depict, stats, split, sample, deduplicate, validate
|
|
42
42
|
- **Multiple Input/Output Formats**: CSV, TSV, SMI, SDF, Parquet
|
|
43
43
|
- **Parallel Processing**: Efficient multi-core support via ProcessPoolExecutor
|
|
44
44
|
- **Ninja-style Progress**: Real-time progress display with speed and ETA
|
|
@@ -290,6 +290,93 @@ rdkit-cli depict batch -i molecules.csv -o images/ -f svg
|
|
|
290
290
|
rdkit-cli depict grid -i molecules.csv -o grid.svg --mols-per-row 4
|
|
291
291
|
```
|
|
292
292
|
|
|
293
|
+
### stats
|
|
294
|
+
|
|
295
|
+
Calculate dataset statistics.
|
|
296
|
+
|
|
297
|
+
```bash
|
|
298
|
+
# Basic statistics
|
|
299
|
+
rdkit-cli stats -i molecules.csv -o stats.json --format json
|
|
300
|
+
|
|
301
|
+
# Specific properties
|
|
302
|
+
rdkit-cli stats -i molecules.csv -p MolWt,LogP,TPSA
|
|
303
|
+
|
|
304
|
+
# List available properties
|
|
305
|
+
rdkit-cli stats -i molecules.csv --list-properties
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
### split
|
|
309
|
+
|
|
310
|
+
Split files into smaller chunks.
|
|
311
|
+
|
|
312
|
+
```bash
|
|
313
|
+
# Split into N files
|
|
314
|
+
rdkit-cli split -i large.csv -o chunks/ -c 10
|
|
315
|
+
|
|
316
|
+
# Split by chunk size
|
|
317
|
+
rdkit-cli split -i large.csv -o chunks/ -s 1000
|
|
318
|
+
|
|
319
|
+
# With custom prefix
|
|
320
|
+
rdkit-cli split -i large.csv -o chunks/ -c 5 --prefix molecules
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
### sample
|
|
324
|
+
|
|
325
|
+
Randomly sample molecules.
|
|
326
|
+
|
|
327
|
+
```bash
|
|
328
|
+
# Sample by count
|
|
329
|
+
rdkit-cli sample -i molecules.csv -o sample.csv -k 100 --seed 42
|
|
330
|
+
|
|
331
|
+
# Sample by fraction
|
|
332
|
+
rdkit-cli sample -i molecules.csv -o sample.csv -f 0.1
|
|
333
|
+
|
|
334
|
+
# Memory-efficient streaming (reservoir sampling)
|
|
335
|
+
rdkit-cli sample -i huge.csv -o sample.csv -k 1000 --stream
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
### deduplicate
|
|
339
|
+
|
|
340
|
+
Remove duplicate molecules.
|
|
341
|
+
|
|
342
|
+
```bash
|
|
343
|
+
# Deduplicate by canonical SMILES (default)
|
|
344
|
+
rdkit-cli deduplicate -i molecules.csv -o unique.csv
|
|
345
|
+
|
|
346
|
+
# Deduplicate by InChIKey
|
|
347
|
+
rdkit-cli deduplicate -i molecules.csv -o unique.csv -b inchikey
|
|
348
|
+
|
|
349
|
+
# Deduplicate by scaffold
|
|
350
|
+
rdkit-cli deduplicate -i molecules.csv -o unique.csv -b scaffold
|
|
351
|
+
|
|
352
|
+
# Keep last occurrence instead of first
|
|
353
|
+
rdkit-cli deduplicate -i molecules.csv -o unique.csv --keep last
|
|
354
|
+
```
|
|
355
|
+
|
|
356
|
+
### validate
|
|
357
|
+
|
|
358
|
+
Validate molecular structures.
|
|
359
|
+
|
|
360
|
+
```bash
|
|
361
|
+
# Basic validation
|
|
362
|
+
rdkit-cli validate -i molecules.csv -o validated.csv
|
|
363
|
+
|
|
364
|
+
# Output only valid molecules
|
|
365
|
+
rdkit-cli validate -i molecules.csv -o valid.csv --valid-only
|
|
366
|
+
|
|
367
|
+
# With constraints
|
|
368
|
+
rdkit-cli validate -i molecules.csv -o validated.csv \
|
|
369
|
+
--max-atoms 100 --max-rings 8
|
|
370
|
+
|
|
371
|
+
# Check allowed elements
|
|
372
|
+
rdkit-cli validate -i molecules.csv -o validated.csv \
|
|
373
|
+
--allowed-elements C,H,N,O,S,F,Cl
|
|
374
|
+
|
|
375
|
+
# Check stereo and show summary
|
|
376
|
+
rdkit-cli validate -i molecules.csv -o validated.csv \
|
|
377
|
+
--check-stereo --summary
|
|
378
|
+
```
|
|
379
|
+
|
|
293
380
|
## Global Options
|
|
294
381
|
|
|
295
382
|
| Option | Description |
|
|
@@ -319,19 +406,28 @@ rdkit-cli depict grid -i molecules.csv -o grid.svg --mols-per-row 4
|
|
|
319
406
|
### Cheminformatics Pipeline
|
|
320
407
|
|
|
321
408
|
```bash
|
|
322
|
-
# 1.
|
|
323
|
-
rdkit-cli
|
|
409
|
+
# 1. Validate and filter input
|
|
410
|
+
rdkit-cli validate -i raw.csv -o validated.csv --valid-only
|
|
411
|
+
|
|
412
|
+
# 2. Deduplicate
|
|
413
|
+
rdkit-cli deduplicate -i validated.csv -o unique.csv -b inchikey
|
|
324
414
|
|
|
325
|
-
#
|
|
415
|
+
# 3. Standardize molecules
|
|
416
|
+
rdkit-cli standardize -i unique.csv -o std.csv --cleanup --neutralize
|
|
417
|
+
|
|
418
|
+
# 4. Filter by drug-likeness
|
|
326
419
|
rdkit-cli filter druglike -i std.csv -o druglike.csv --rule lipinski
|
|
327
420
|
|
|
328
|
-
#
|
|
421
|
+
# 5. Compute descriptors
|
|
329
422
|
rdkit-cli descriptors compute -i druglike.csv -o desc.csv -d MolWt,MolLogP,TPSA,HBD,HBA
|
|
330
423
|
|
|
331
|
-
#
|
|
424
|
+
# 6. Get dataset statistics
|
|
425
|
+
rdkit-cli stats -i druglike.csv -o stats.json --format json
|
|
426
|
+
|
|
427
|
+
# 7. Select diverse subset
|
|
332
428
|
rdkit-cli diversity pick -i druglike.csv -o diverse.csv -k 500
|
|
333
429
|
|
|
334
|
-
#
|
|
430
|
+
# 8. Generate depictions
|
|
335
431
|
rdkit-cli depict grid -i diverse.csv -o library.svg --mols-per-row 10
|
|
336
432
|
```
|
|
337
433
|
|
|
@@ -358,6 +454,19 @@ rdkit-cli scaffold murcko -i library.csv -o scaffolds.csv
|
|
|
358
454
|
rdkit-cli diversity analyze -i scaffolds.csv --smiles-column scaffold
|
|
359
455
|
```
|
|
360
456
|
|
|
457
|
+
### Large Dataset Processing
|
|
458
|
+
|
|
459
|
+
```bash
|
|
460
|
+
# Sample from a huge dataset
|
|
461
|
+
rdkit-cli sample -i huge_library.csv -o sample.csv -k 10000 --stream
|
|
462
|
+
|
|
463
|
+
# Split for parallel processing
|
|
464
|
+
rdkit-cli split -i library.csv -o batches/ -c 10
|
|
465
|
+
|
|
466
|
+
# Process batches in parallel (using xargs)
|
|
467
|
+
ls batches/*.csv | xargs -P 4 -I {} rdkit-cli descriptors compute -i {} -o {}.desc.csv -d MolWt,LogP
|
|
468
|
+
```
|
|
469
|
+
|
|
361
470
|
## Development
|
|
362
471
|
|
|
363
472
|
```bash
|
|
@@ -4,7 +4,7 @@ A comprehensive, high-performance CLI tool wrapping RDKit functionality for chem
|
|
|
4
4
|
|
|
5
5
|
## Features
|
|
6
6
|
|
|
7
|
-
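- **14 Command Categories**: descriptors, fingerprints, filter, convert, standardize, similarity, conformers, reactions, scaffold, enumerate, fragment, diversity, mcs, depict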
|
|
7
|
+
- **19 Command Categories**: descriptors, fingerprints, filter, convert, standardize, similarity, conformers, reactions, scaffold, enumerate, fragment, diversity, mcs, depict, stats, split, sample, deduplicate, validate
|
|
8
8
|
- **Multiple Input/Output Formats**: CSV, TSV, SMI, SDF, Parquet
|
|
9
9
|
- **Parallel Processing**: Efficient multi-core support via ProcessPoolExecutor
|
|
10
10
|
- **Ninja-style Progress**: Real-time progress display with speed and ETA
|
|
@@ -256,6 +256,93 @@ rdkit-cli depict batch -i molecules.csv -o images/ -f svg
|
|
|
256
256
|
rdkit-cli depict grid -i molecules.csv -o grid.svg --mols-per-row 4
|
|
257
257
|
```
|
|
258
258
|
|
|
259
|
+
### stats
|
|
260
|
+
|
|
261
|
+
Calculate dataset statistics.
|
|
262
|
+
|
|
263
|
+
```bash
|
|
264
|
+
# Basic statistics
|
|
265
|
+
rdkit-cli stats -i molecules.csv -o stats.json --format json
|
|
266
|
+
|
|
267
|
+
# Specific properties
|
|
268
|
+
rdkit-cli stats -i molecules.csv -p MolWt,LogP,TPSA
|
|
269
|
+
|
|
270
|
+
# List available properties
|
|
271
|
+
rdkit-cli stats -i molecules.csv --list-properties
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
### split
|
|
275
|
+
|
|
276
|
+
Split files into smaller chunks.
|
|
277
|
+
|
|
278
|
+
```bash
|
|
279
|
+
# Split into N files
|
|
280
|
+
rdkit-cli split -i large.csv -o chunks/ -c 10
|
|
281
|
+
|
|
282
|
+
# Split by chunk size
|
|
283
|
+
rdkit-cli split -i large.csv -o chunks/ -s 1000
|
|
284
|
+
|
|
285
|
+
# With custom prefix
|
|
286
|
+
rdkit-cli split -i large.csv -o chunks/ -c 5 --prefix molecules
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
### sample
|
|
290
|
+
|
|
291
|
+
Randomly sample molecules.
|
|
292
|
+
|
|
293
|
+
```bash
|
|
294
|
+
# Sample by count
|
|
295
|
+
rdkit-cli sample -i molecules.csv -o sample.csv -k 100 --seed 42
|
|
296
|
+
|
|
297
|
+
# Sample by fraction
|
|
298
|
+
rdkit-cli sample -i molecules.csv -o sample.csv -f 0.1
|
|
299
|
+
|
|
300
|
+
# Memory-efficient streaming (reservoir sampling)
|
|
301
|
+
rdkit-cli sample -i huge.csv -o sample.csv -k 1000 --stream
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
### deduplicate
|
|
305
|
+
|
|
306
|
+
Remove duplicate molecules.
|
|
307
|
+
|
|
308
|
+
```bash
|
|
309
|
+
# Deduplicate by canonical SMILES (default)
|
|
310
|
+
rdkit-cli deduplicate -i molecules.csv -o unique.csv
|
|
311
|
+
|
|
312
|
+
# Deduplicate by InChIKey
|
|
313
|
+
rdkit-cli deduplicate -i molecules.csv -o unique.csv -b inchikey
|
|
314
|
+
|
|
315
|
+
# Deduplicate by scaffold
|
|
316
|
+
rdkit-cli deduplicate -i molecules.csv -o unique.csv -b scaffold
|
|
317
|
+
|
|
318
|
+
# Keep last occurrence instead of first
|
|
319
|
+
rdkit-cli deduplicate -i molecules.csv -o unique.csv --keep last
|
|
320
|
+
```
|
|
321
|
+
|
|
322
|
+
### validate
|
|
323
|
+
|
|
324
|
+
Validate molecular structures.
|
|
325
|
+
|
|
326
|
+
```bash
|
|
327
|
+
# Basic validation
|
|
328
|
+
rdkit-cli validate -i molecules.csv -o validated.csv
|
|
329
|
+
|
|
330
|
+
# Output only valid molecules
|
|
331
|
+
rdkit-cli validate -i molecules.csv -o valid.csv --valid-only
|
|
332
|
+
|
|
333
|
+
# With constraints
|
|
334
|
+
rdkit-cli validate -i molecules.csv -o validated.csv \
|
|
335
|
+
--max-atoms 100 --max-rings 8
|
|
336
|
+
|
|
337
|
+
# Check allowed elements
|
|
338
|
+
rdkit-cli validate -i molecules.csv -o validated.csv \
|
|
339
|
+
--allowed-elements C,H,N,O,S,F,Cl
|
|
340
|
+
|
|
341
|
+
# Check stereo and show summary
|
|
342
|
+
rdkit-cli validate -i molecules.csv -o validated.csv \
|
|
343
|
+
--check-stereo --summary
|
|
344
|
+
```
|
|
345
|
+
|
|
259
346
|
## Global Options
|
|
260
347
|
|
|
261
348
|
| Option | Description |
|
|
@@ -285,19 +372,28 @@ rdkit-cli depict grid -i molecules.csv -o grid.svg --mols-per-row 4
|
|
|
285
372
|
### Cheminformatics Pipeline
|
|
286
373
|
|
|
287
374
|
```bash
|
|
288
|
-
# 1.
|
|
289
|
-
rdkit-cli
|
|
375
|
+
# 1. Validate and filter input
|
|
376
|
+
rdkit-cli validate -i raw.csv -o validated.csv --valid-only
|
|
377
|
+
|
|
378
|
+
# 2. Deduplicate
|
|
379
|
+
rdkit-cli deduplicate -i validated.csv -o unique.csv -b inchikey
|
|
290
380
|
|
|
291
|
-
#
|
|
381
|
+
# 3. Standardize molecules
|
|
382
|
+
rdkit-cli standardize -i unique.csv -o std.csv --cleanup --neutralize
|
|
383
|
+
|
|
384
|
+
# 4. Filter by drug-likeness
|
|
292
385
|
rdkit-cli filter druglike -i std.csv -o druglike.csv --rule lipinski
|
|
293
386
|
|
|
294
|
-
#
|
|
387
|
+
# 5. Compute descriptors
|
|
295
388
|
rdkit-cli descriptors compute -i druglike.csv -o desc.csv -d MolWt,MolLogP,TPSA,HBD,HBA
|
|
296
389
|
|
|
297
|
-
#
|
|
390
|
+
# 6. Get dataset statistics
|
|
391
|
+
rdkit-cli stats -i druglike.csv -o stats.json --format json
|
|
392
|
+
|
|
393
|
+
# 7. Select diverse subset
|
|
298
394
|
rdkit-cli diversity pick -i druglike.csv -o diverse.csv -k 500
|
|
299
395
|
|
|
300
|
-
#
|
|
396
|
+
# 8. Generate depictions
|
|
301
397
|
rdkit-cli depict grid -i diverse.csv -o library.svg --mols-per-row 10
|
|
302
398
|
```
|
|
303
399
|
|
|
@@ -324,6 +420,19 @@ rdkit-cli scaffold murcko -i library.csv -o scaffolds.csv
|
|
|
324
420
|
rdkit-cli diversity analyze -i scaffolds.csv --smiles-column scaffold
|
|
325
421
|
```
|
|
326
422
|
|
|
423
|
+
### Large Dataset Processing
|
|
424
|
+
|
|
425
|
+
```bash
|
|
426
|
+
# Sample from a huge dataset
|
|
427
|
+
rdkit-cli sample -i huge_library.csv -o sample.csv -k 10000 --stream
|
|
428
|
+
|
|
429
|
+
# Split for parallel processing
|
|
430
|
+
rdkit-cli split -i library.csv -o batches/ -c 10
|
|
431
|
+
|
|
432
|
+
# Process batches in parallel (using xargs)
|
|
433
|
+
ls batches/*.csv | xargs -P 4 -I {} rdkit-cli descriptors compute -i {} -o {}.desc.csv -d MolWt,LogP
|
|
434
|
+
```
|
|
435
|
+
|
|
327
436
|
## Development
|
|
328
437
|
|
|
329
438
|
```bash
|
|
@@ -68,6 +68,18 @@ def add_common_processing_options(parser: argparse.ArgumentParser):
|
|
|
68
68
|
action="store_true",
|
|
69
69
|
help="Suppress progress output",
|
|
70
70
|
)
|
|
71
|
+
parser.add_argument(
|
|
72
|
+
"--no-warnings",
|
|
73
|
+
action="store_true",
|
|
74
|
+
help="Suppress RDKit warnings (kekulization errors, etc.)",
|
|
75
|
+
)
|
|
76
|
+
parser.add_argument(
|
|
77
|
+
"--log-level",
|
|
78
|
+
choices=["debug", "info", "warning", "error", "critical"],
|
|
79
|
+
default=None,
|
|
80
|
+
metavar="LEVEL",
|
|
81
|
+
help="RDKit log level (default: warning, use 'error' to suppress warnings)",
|
|
82
|
+
)
|
|
71
83
|
|
|
72
84
|
|
|
73
85
|
def create_parser() -> argparse.ArgumentParser:
|
|
@@ -100,39 +112,49 @@ def create_parser() -> argparse.ArgumentParser:
|
|
|
100
112
|
|
|
101
113
|
|
|
102
114
|
def _register_commands(subparsers):
|
|
103
|
-
"""Register all command subparsers."""
|
|
115
|
+
"""Register all command subparsers (alphabetical order)."""
|
|
104
116
|
from rdkit_cli.commands import (
|
|
105
|
-
descriptors,
|
|
106
|
-
fingerprints,
|
|
107
|
-
filter,
|
|
108
|
-
convert,
|
|
109
|
-
standardize,
|
|
110
|
-
similarity,
|
|
111
117
|
conformers,
|
|
112
|
-
|
|
113
|
-
|
|
118
|
+
convert,
|
|
119
|
+
deduplicate,
|
|
120
|
+
depict,
|
|
121
|
+
descriptors,
|
|
122
|
+
diversity,
|
|
114
123
|
enumerate,
|
|
124
|
+
filter,
|
|
125
|
+
fingerprints,
|
|
115
126
|
fragment,
|
|
116
|
-
diversity,
|
|
117
127
|
mcs,
|
|
118
|
-
|
|
128
|
+
reactions,
|
|
129
|
+
sample,
|
|
130
|
+
scaffold,
|
|
131
|
+
similarity,
|
|
132
|
+
split,
|
|
133
|
+
standardize,
|
|
134
|
+
stats,
|
|
135
|
+
validate,
|
|
119
136
|
)
|
|
120
137
|
|
|
121
138
|
# Each module has a register_parser(subparsers) function
|
|
122
|
-
descriptors.register_parser(subparsers)
|
|
123
|
-
fingerprints.register_parser(subparsers)
|
|
124
|
-
filter.register_parser(subparsers)
|
|
125
|
-
convert.register_parser(subparsers)
|
|
126
|
-
standardize.register_parser(subparsers)
|
|
127
|
-
similarity.register_parser(subparsers)
|
|
128
139
|
conformers.register_parser(subparsers)
|
|
129
|
-
|
|
130
|
-
|
|
140
|
+
convert.register_parser(subparsers)
|
|
141
|
+
deduplicate.register_parser(subparsers)
|
|
142
|
+
depict.register_parser(subparsers)
|
|
143
|
+
descriptors.register_parser(subparsers)
|
|
144
|
+
diversity.register_parser(subparsers)
|
|
131
145
|
enumerate.register_parser(subparsers)
|
|
146
|
+
filter.register_parser(subparsers)
|
|
147
|
+
fingerprints.register_parser(subparsers)
|
|
132
148
|
fragment.register_parser(subparsers)
|
|
133
|
-
diversity.register_parser(subparsers)
|
|
134
149
|
mcs.register_parser(subparsers)
|
|
135
|
-
|
|
150
|
+
reactions.register_parser(subparsers)
|
|
151
|
+
sample.register_parser(subparsers)
|
|
152
|
+
scaffold.register_parser(subparsers)
|
|
153
|
+
similarity.register_parser(subparsers)
|
|
154
|
+
split.register_parser(subparsers)
|
|
155
|
+
standardize.register_parser(subparsers)
|
|
156
|
+
stats.register_parser(subparsers)
|
|
157
|
+
validate.register_parser(subparsers)
|
|
136
158
|
|
|
137
159
|
|
|
138
160
|
def main(args: Optional[list[str]] = None) -> int:
|
|
@@ -144,6 +166,18 @@ def main(args: Optional[list[str]] = None) -> int:
|
|
|
144
166
|
parser.print_help()
|
|
145
167
|
return 1
|
|
146
168
|
|
|
169
|
+
# Configure logging based on --no-warnings or --log-level
|
|
170
|
+
from rdkit_cli.utils import configure_all_warnings, set_rdkit_log_level
|
|
171
|
+
no_warnings = getattr(parsed_args, "no_warnings", False)
|
|
172
|
+
log_level = getattr(parsed_args, "log_level", None)
|
|
173
|
+
|
|
174
|
+
if no_warnings:
|
|
175
|
+
# Suppress both RDKit and application warnings
|
|
176
|
+
configure_all_warnings(suppress=True)
|
|
177
|
+
elif log_level is not None:
|
|
178
|
+
# Only control RDKit log level
|
|
179
|
+
set_rdkit_log_level(log_level)
|
|
180
|
+
|
|
147
181
|
# Each command has a run(args) function via set_defaults(func=...)
|
|
148
182
|
try:
|
|
149
183
|
return parsed_args.func(parsed_args)
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Deduplicate command implementation."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from rdkit_cli.cli import RdkitHelpFormatter, add_common_io_options, add_common_processing_options
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def register_parser(subparsers):
|
|
10
|
+
"""Register the deduplicate command."""
|
|
11
|
+
parser = subparsers.add_parser(
|
|
12
|
+
"deduplicate",
|
|
13
|
+
help="Remove duplicate molecules",
|
|
14
|
+
description="Remove duplicate molecules from a dataset based on various molecular identifiers.",
|
|
15
|
+
formatter_class=RdkitHelpFormatter,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
add_common_io_options(parser)
|
|
19
|
+
add_common_processing_options(parser)
|
|
20
|
+
|
|
21
|
+
parser.add_argument(
|
|
22
|
+
"-b", "--by",
|
|
23
|
+
choices=["smiles", "inchi", "inchikey", "scaffold"],
|
|
24
|
+
default="smiles",
|
|
25
|
+
help="Deduplication key type (default: smiles)",
|
|
26
|
+
)
|
|
27
|
+
parser.add_argument(
|
|
28
|
+
"--keep",
|
|
29
|
+
choices=["first", "last"],
|
|
30
|
+
default="first",
|
|
31
|
+
help="Which duplicate to keep (default: first)",
|
|
32
|
+
)
|
|
33
|
+
parser.add_argument(
|
|
34
|
+
"--list-keys",
|
|
35
|
+
action="store_true",
|
|
36
|
+
help="List available key types and exit",
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
parser.set_defaults(func=run_deduplicate)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def run_deduplicate(args) -> int:
|
|
43
|
+
"""Run the deduplicate command."""
|
|
44
|
+
from rdkit_cli.core.deduplicate import Deduplicator
|
|
45
|
+
from rdkit_cli.io import create_reader, create_writer
|
|
46
|
+
from rdkit_cli.progress.ninja import NinjaProgress
|
|
47
|
+
|
|
48
|
+
# Handle --list-keys
|
|
49
|
+
if args.list_keys:
|
|
50
|
+
print("Available deduplication keys:")
|
|
51
|
+
print(" smiles - Canonical SMILES (default)")
|
|
52
|
+
print(" inchi - InChI string")
|
|
53
|
+
print(" inchikey - InChIKey (27 character hash)")
|
|
54
|
+
print(" scaffold - Murcko scaffold SMILES")
|
|
55
|
+
return 0
|
|
56
|
+
|
|
57
|
+
input_path = Path(args.input)
|
|
58
|
+
if not input_path.exists():
|
|
59
|
+
print(f"Error: Input file not found: {input_path}", file=sys.stderr)
|
|
60
|
+
return 1
|
|
61
|
+
|
|
62
|
+
# Create reader
|
|
63
|
+
reader = create_reader(
|
|
64
|
+
input_path,
|
|
65
|
+
smiles_column=args.smiles_column,
|
|
66
|
+
name_column=args.name_column,
|
|
67
|
+
has_header=not args.no_header,
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
# Read all records with progress
|
|
71
|
+
if not args.quiet:
|
|
72
|
+
print("Reading molecules...", file=sys.stderr)
|
|
73
|
+
|
|
74
|
+
records = []
|
|
75
|
+
with reader:
|
|
76
|
+
total = len(reader)
|
|
77
|
+
progress = NinjaProgress(total=total, quiet=args.quiet)
|
|
78
|
+
progress.start()
|
|
79
|
+
|
|
80
|
+
for record in reader:
|
|
81
|
+
records.append(record)
|
|
82
|
+
progress.update(1)
|
|
83
|
+
|
|
84
|
+
progress.finish()
|
|
85
|
+
|
|
86
|
+
if not records:
|
|
87
|
+
print("Error: No molecules found in input file", file=sys.stderr)
|
|
88
|
+
return 1
|
|
89
|
+
|
|
90
|
+
if not args.quiet:
|
|
91
|
+
print(f"Deduplicating {len(records)} molecules by {args.by}...", file=sys.stderr)
|
|
92
|
+
|
|
93
|
+
# Create deduplicator
|
|
94
|
+
deduplicator = Deduplicator(
|
|
95
|
+
key_type=args.by,
|
|
96
|
+
keep=args.keep,
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
# Deduplicate
|
|
100
|
+
unique_records, n_duplicates = deduplicator.deduplicate(records)
|
|
101
|
+
|
|
102
|
+
# Write output
|
|
103
|
+
output_path = Path(args.output)
|
|
104
|
+
writer = create_writer(output_path)
|
|
105
|
+
|
|
106
|
+
with writer:
|
|
107
|
+
for record in unique_records:
|
|
108
|
+
row = {"smiles": record.smiles}
|
|
109
|
+
if record.name:
|
|
110
|
+
row["name"] = record.name
|
|
111
|
+
for key, value in record.metadata.items():
|
|
112
|
+
if key not in row and key != "smiles":
|
|
113
|
+
row[key] = value
|
|
114
|
+
writer.write_row(row)
|
|
115
|
+
|
|
116
|
+
if not args.quiet:
|
|
117
|
+
print(
|
|
118
|
+
f"Removed {n_duplicates} duplicates. "
|
|
119
|
+
f"Wrote {len(unique_records)} unique molecules to {output_path}",
|
|
120
|
+
file=sys.stderr,
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
return 0
|