rdkit-cli 0.1.0__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. rdkit_cli-0.3.0/CHANGELOG.md +77 -0
  2. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/PKG-INFO +272 -9
  3. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/README.md +270 -7
  4. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/pyproject.toml +2 -2
  5. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/__init__.py +1 -1
  6. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/cli.py +102 -23
  7. rdkit_cli-0.3.0/src/rdkit_cli/commands/align.py +140 -0
  8. rdkit_cli-0.3.0/src/rdkit_cli/commands/deduplicate.py +123 -0
  9. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/depict.py +122 -0
  10. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/descriptors.py +38 -0
  11. rdkit_cli-0.3.0/src/rdkit_cli/commands/info.py +55 -0
  12. rdkit_cli-0.3.0/src/rdkit_cli/commands/merge.py +101 -0
  13. rdkit_cli-0.3.0/src/rdkit_cli/commands/mmp.py +311 -0
  14. rdkit_cli-0.3.0/src/rdkit_cli/commands/props.py +309 -0
  15. rdkit_cli-0.3.0/src/rdkit_cli/commands/protonate.py +121 -0
  16. rdkit_cli-0.3.0/src/rdkit_cli/commands/rgroup.py +102 -0
  17. rdkit_cli-0.3.0/src/rdkit_cli/commands/rings.py +235 -0
  18. rdkit_cli-0.3.0/src/rdkit_cli/commands/rmsd.py +324 -0
  19. rdkit_cli-0.3.0/src/rdkit_cli/commands/sample.py +152 -0
  20. rdkit_cli-0.3.0/src/rdkit_cli/commands/sascorer.py +103 -0
  21. rdkit_cli-0.3.0/src/rdkit_cli/commands/split.py +152 -0
  22. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/standardize.py +61 -0
  23. rdkit_cli-0.3.0/src/rdkit_cli/commands/stats.py +172 -0
  24. rdkit_cli-0.3.0/src/rdkit_cli/commands/validate.py +189 -0
  25. rdkit_cli-0.3.0/src/rdkit_cli/core/align.py +235 -0
  26. rdkit_cli-0.3.0/src/rdkit_cli/core/deduplicate.py +190 -0
  27. rdkit_cli-0.3.0/src/rdkit_cli/core/info.py +127 -0
  28. rdkit_cli-0.3.0/src/rdkit_cli/core/merge.py +119 -0
  29. rdkit_cli-0.3.0/src/rdkit_cli/core/mmp.py +228 -0
  30. rdkit_cli-0.3.0/src/rdkit_cli/core/protonate.py +283 -0
  31. rdkit_cli-0.3.0/src/rdkit_cli/core/rgroup.py +220 -0
  32. rdkit_cli-0.3.0/src/rdkit_cli/core/rings.py +274 -0
  33. rdkit_cli-0.3.0/src/rdkit_cli/core/rmsd.py +247 -0
  34. rdkit_cli-0.3.0/src/rdkit_cli/core/sample.py +156 -0
  35. rdkit_cli-0.3.0/src/rdkit_cli/core/sascorer.py +239 -0
  36. rdkit_cli-0.3.0/src/rdkit_cli/core/split.py +118 -0
  37. rdkit_cli-0.3.0/src/rdkit_cli/core/stats.py +96 -0
  38. rdkit_cli-0.3.0/src/rdkit_cli/core/validate.py +222 -0
  39. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/io/readers.py +11 -4
  40. rdkit_cli-0.3.0/src/rdkit_cli/utils/__init__.py +27 -0
  41. rdkit_cli-0.3.0/src/rdkit_cli/utils/logging.py +113 -0
  42. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/integration/test_cli.py +401 -0
  43. rdkit_cli-0.3.0/tests/integration/test_new_commands.py +355 -0
  44. rdkit_cli-0.3.0/tests/unit/test_align.py +135 -0
  45. rdkit_cli-0.3.0/tests/unit/test_deduplicate.py +161 -0
  46. rdkit_cli-0.3.0/tests/unit/test_filters.py +529 -0
  47. rdkit_cli-0.3.0/tests/unit/test_info.py +97 -0
  48. rdkit_cli-0.3.0/tests/unit/test_merge.py +79 -0
  49. rdkit_cli-0.3.0/tests/unit/test_mmp.py +177 -0
  50. rdkit_cli-0.3.0/tests/unit/test_protonate.py +166 -0
  51. rdkit_cli-0.3.0/tests/unit/test_rgroup.py +121 -0
  52. rdkit_cli-0.3.0/tests/unit/test_rings.py +158 -0
  53. rdkit_cli-0.3.0/tests/unit/test_rmsd.py +171 -0
  54. rdkit_cli-0.3.0/tests/unit/test_sample.py +155 -0
  55. rdkit_cli-0.3.0/tests/unit/test_sascorer.py +135 -0
  56. rdkit_cli-0.3.0/tests/unit/test_split.py +101 -0
  57. rdkit_cli-0.3.0/tests/unit/test_stats.py +108 -0
  58. rdkit_cli-0.3.0/tests/unit/test_validate.py +208 -0
  59. rdkit_cli-0.1.0/CHANGELOG.md +0 -43
  60. rdkit_cli-0.1.0/src/rdkit_cli/utils/__init__.py +0 -1
  61. rdkit_cli-0.1.0/tests/unit/test_filters.py +0 -211
  62. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/.github/workflows/publish.yml +0 -0
  63. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/LICENSE +0 -0
  64. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/__main__.py +0 -0
  65. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/__init__.py +0 -0
  66. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/conformers.py +0 -0
  67. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/convert.py +0 -0
  68. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/diversity.py +0 -0
  69. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/enumerate.py +0 -0
  70. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/filter.py +0 -0
  71. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/fingerprints.py +0 -0
  72. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/fragment.py +0 -0
  73. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/mcs.py +0 -0
  74. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/reactions.py +0 -0
  75. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/scaffold.py +0 -0
  76. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/commands/similarity.py +0 -0
  77. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/__init__.py +0 -0
  78. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/conformers.py +0 -0
  79. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/depict.py +0 -0
  80. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/descriptors.py +0 -0
  81. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/diversity.py +0 -0
  82. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/enumerate.py +0 -0
  83. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/filters.py +0 -0
  84. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/fingerprints.py +0 -0
  85. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/fragment.py +0 -0
  86. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/mcs.py +0 -0
  87. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/reactions.py +0 -0
  88. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/scaffold.py +0 -0
  89. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/similarity.py +0 -0
  90. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/core/standardizer.py +0 -0
  91. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/io/__init__.py +0 -0
  92. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/io/formats.py +0 -0
  93. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/io/writers.py +0 -0
  94. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/parallel/__init__.py +0 -0
  95. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/parallel/batch.py +0 -0
  96. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/parallel/executor.py +0 -0
  97. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/progress/__init__.py +0 -0
  98. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/src/rdkit_cli/progress/ninja.py +0 -0
  99. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/__init__.py +0 -0
  100. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/conftest.py +0 -0
  101. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/fixtures/sample.csv +0 -0
  102. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/fixtures/sample.smi +0 -0
  103. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/integration/__init__.py +0 -0
  104. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/integration/test_interop.py +0 -0
  105. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/__init__.py +0 -0
  106. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_depict.py +0 -0
  107. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_descriptors.py +0 -0
  108. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_diversity.py +0 -0
  109. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_enumerate.py +0 -0
  110. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_fingerprints.py +0 -0
  111. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_fragment.py +0 -0
  112. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_io.py +0 -0
  113. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_mcs.py +0 -0
  114. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_reactions.py +0 -0
  115. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_scaffold.py +0 -0
  116. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_similarity.py +0 -0
  117. {rdkit_cli-0.1.0 → rdkit_cli-0.3.0}/tests/unit/test_standardizer.py +0 -0
@@ -0,0 +1,77 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.3.0] - 2026-01-10
9
+
10
+ ### Added
11
+
12
+ - **info**: Quick molecule information from SMILES (formula, MW, LogP, TPSA, stereocenters, Lipinski violations, InChI/InChIKey)
13
+ - **merge**: Combine multiple molecule files with optional deduplication and source tracking
14
+ - **sascorer**: Calculate Synthetic Accessibility (SA) Score, Natural Product-likeness (NPC), and QED scores
15
+ - **rgroup**: R-group decomposition around a core SMARTS pattern with labeled attachment points
16
+ - **rings**: Ring system analysis - extract ring systems (fused, spiro, bridged) and analyze frequencies
17
+ - **align**: 3D molecular alignment to a reference structure (MCS-based or Open3DAlign)
18
+ - **rmsd**: RMSD calculations between 3D structures (compare to reference, pairwise matrix, conformer analysis)
19
+ - **mmp**: Matched Molecular Pairs analysis - fragment molecules, find pairs, apply transformations
20
+ - **protonate**: Protonation state enumeration at specified pH with neutralization option
21
+ - **props**: Property column operations - add, rename, drop, keep columns in molecule files
22
+
23
+ ### Changed
24
+
25
+ - Total command count increased from 19 to 29
26
+
27
+ ## [0.2.0] - 2026-01-06
28
+
29
+ ### Added
30
+
31
+ - **stats**: Calculate dataset statistics (MolWt, LogP, TPSA, etc. with min/max/mean/median/stdev)
32
+ - **split**: Split files into smaller chunks (by number of chunks or chunk size)
33
+ - **sample**: Randomly sample molecules (by count or fraction, with reservoir sampling for large files)
34
+ - **deduplicate**: Remove duplicate molecules (by SMILES, InChI, InChIKey, or scaffold)
35
+ - **validate**: Validate molecular structures (valence, kekulization, stereo, element constraints)
36
+
37
+ ### Changed
38
+
39
+ - Commands are now displayed in alphabetical order in help output
40
+ - Total command count increased from 14 to 19
41
+
42
+ ## [0.1.0] - 2026-01-06
43
+
44
+ ### Added
45
+
46
+ - Initial release with 14 command categories
47
+ - **descriptors**: Compute molecular descriptors (200+ available)
48
+ - **fingerprints**: Generate molecular fingerprints (morgan, maccs, rdkit, atompair, torsion, pattern)
49
+ - **filter**: Filter molecules by substructure, properties, drug-likeness (Lipinski/Veber/Ghose), PAINS
50
+ - **convert**: Convert between molecular file formats (CSV, TSV, SMI, SDF, Parquet)
51
+ - **standardize**: Standardize and canonicalize molecules
52
+ - **similarity**: Similarity search, matrix computation, and clustering
53
+ - **conformers**: Generate and optimize 3D conformers
54
+ - **reactions**: SMIRKS transformations and reaction enumeration
55
+ - **scaffold**: Murcko scaffold extraction and decomposition
56
+ - **enumerate**: Stereoisomer and tautomer enumeration
57
+ - **fragment**: BRICS/RECAP fragmentation and functional group analysis
58
+ - **diversity**: MaxMin diversity picking and diversity analysis
59
+ - **mcs**: Maximum Common Substructure finding
60
+ - **depict**: SVG/PNG molecular depictions (single, batch, grid)
61
+
62
+ ### Features
63
+
64
+ - Multi-core parallel processing via ProcessPoolExecutor
65
+ - Ninja-style progress display with speed and ETA
66
+ - Support for multiple I/O formats (CSV, TSV, SMI, SDF, Parquet)
67
+ - Automatic format detection from file extensions
68
+ - Lazy imports for fast CLI startup (~0.08s)
69
+ - Comprehensive test suite (182 tests)
70
+
71
+ ### Dependencies
72
+
73
+ - rdkit>=2024.3.1
74
+ - rich-argparse>=1.4.0
75
+ - pandas>=2.0.0
76
+ - pyarrow>=14.0.0
77
+ - numpy>=1.24.0
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rdkit-cli
3
- Version: 0.1.0
3
+ Version: 0.3.0
4
4
  Summary: A comprehensive CLI tool for RDKit cheminformatics operations
5
5
  Project-URL: Homepage, https://github.com/vitruves/rdkit-cli
6
6
  Project-URL: Repository, https://github.com/vitruves/rdkit-cli
7
7
  Project-URL: Issues, https://github.com/vitruves/rdkit-cli/issues
8
- Author: Vitruves
8
+ Author: Johan HG Natter
9
9
  License-Expression: Apache-2.0
10
10
  License-File: LICENSE
11
11
  Keywords: cheminformatics,chemistry,cli,fingerprints,molecular-descriptors,rdkit
@@ -38,7 +38,7 @@ A comprehensive, high-performance CLI tool wrapping RDKit functionality for chem
38
38
 
39
39
  ## Features
40
40
 
41
- - **14 Command Categories**: descriptors, fingerprints, filter, convert, standardize, similarity, conformers, reactions, scaffold, enumerate, fragment, diversity, mcs, depict
41
+ - **29 Command Categories**: align, conformers, convert, deduplicate, depict, descriptors, diversity, enumerate, filter, fingerprints, fragment, info, mcs, merge, mmp, props, protonate, reactions, rgroup, rings, rmsd, sample, sascorer, scaffold, similarity, split, standardize, stats, validate
42
42
  - **Multiple Input/Output Formats**: CSV, TSV, SMI, SDF, Parquet
43
43
  - **Parallel Processing**: Efficient multi-core support via ProcessPoolExecutor
44
44
  - **Ninja-style Progress**: Real-time progress display with speed and ETA
@@ -290,6 +290,247 @@ rdkit-cli depict batch -i molecules.csv -o images/ -f svg
290
290
  rdkit-cli depict grid -i molecules.csv -o grid.svg --mols-per-row 4
291
291
  ```
292
292
 
293
+ ### stats
294
+
295
+ Calculate dataset statistics.
296
+
297
+ ```bash
298
+ # Basic statistics
299
+ rdkit-cli stats -i molecules.csv -o stats.json --format json
300
+
301
+ # Specific properties
302
+ rdkit-cli stats -i molecules.csv -p MolWt,LogP,TPSA
303
+
304
+ # List available properties
305
+ rdkit-cli stats -i molecules.csv --list-properties
306
+ ```
307
+
308
+ ### split
309
+
310
+ Split files into smaller chunks.
311
+
312
+ ```bash
313
+ # Split into N files
314
+ rdkit-cli split -i large.csv -o chunks/ -c 10
315
+
316
+ # Split by chunk size
317
+ rdkit-cli split -i large.csv -o chunks/ -s 1000
318
+
319
+ # With custom prefix
320
+ rdkit-cli split -i large.csv -o chunks/ -c 5 --prefix molecules
321
+ ```
322
+
323
+ ### sample
324
+
325
+ Randomly sample molecules.
326
+
327
+ ```bash
328
+ # Sample by count
329
+ rdkit-cli sample -i molecules.csv -o sample.csv -k 100 --seed 42
330
+
331
+ # Sample by fraction
332
+ rdkit-cli sample -i molecules.csv -o sample.csv -f 0.1
333
+
334
+ # Memory-efficient streaming (reservoir sampling)
335
+ rdkit-cli sample -i huge.csv -o sample.csv -k 1000 --stream
336
+ ```
337
+
338
+ ### deduplicate
339
+
340
+ Remove duplicate molecules.
341
+
342
+ ```bash
343
+ # Deduplicate by canonical SMILES (default)
344
+ rdkit-cli deduplicate -i molecules.csv -o unique.csv
345
+
346
+ # Deduplicate by InChIKey
347
+ rdkit-cli deduplicate -i molecules.csv -o unique.csv -b inchikey
348
+
349
+ # Deduplicate by scaffold
350
+ rdkit-cli deduplicate -i molecules.csv -o unique.csv -b scaffold
351
+
352
+ # Keep last occurrence instead of first
353
+ rdkit-cli deduplicate -i molecules.csv -o unique.csv --keep last
354
+ ```
355
+
356
+ ### validate
357
+
358
+ Validate molecular structures.
359
+
360
+ ```bash
361
+ # Basic validation
362
+ rdkit-cli validate -i molecules.csv -o validated.csv
363
+
364
+ # Output only valid molecules
365
+ rdkit-cli validate -i molecules.csv -o valid.csv --valid-only
366
+
367
+ # With constraints
368
+ rdkit-cli validate -i molecules.csv -o validated.csv \
369
+ --max-atoms 100 --max-rings 8
370
+
371
+ # Check allowed elements
372
+ rdkit-cli validate -i molecules.csv -o validated.csv \
373
+ --allowed-elements C,H,N,O,S,F,Cl
374
+
375
+ # Check stereo and show summary
376
+ rdkit-cli validate -i molecules.csv -o validated.csv \
377
+ --check-stereo --summary
378
+ ```
379
+
380
+ ### info
381
+
382
+ Quick molecule information from SMILES.
383
+
384
+ ```bash
385
+ # Basic info
386
+ rdkit-cli info "CCO"
387
+
388
+ # JSON output
389
+ rdkit-cli info "c1ccccc1" --json
390
+
391
+ # Shows: formula, MW, LogP, TPSA, stereocenters, Lipinski violations, InChI/InChIKey
392
+ ```
393
+
394
+ ### merge
395
+
396
+ Combine multiple molecule files.
397
+
398
+ ```bash
399
+ # Merge two files
400
+ rdkit-cli merge -i file1.csv file2.csv -o merged.csv
401
+
402
+ # Merge with deduplication
403
+ rdkit-cli merge -i file1.csv file2.csv -o merged.csv --dedupe
404
+
405
+ # Track source file
406
+ rdkit-cli merge -i file1.csv file2.csv -o merged.csv --source-column source
407
+ ```
408
+
409
+ ### sascorer
410
+
411
+ Calculate synthetic accessibility and drug-likeness scores.
412
+
413
+ ```bash
414
+ # SA Score only (default)
415
+ rdkit-cli sascorer -i molecules.csv -o scores.csv
416
+
417
+ # Include QED score
418
+ rdkit-cli sascorer -i molecules.csv -o scores.csv --qed
419
+
420
+ # Include Natural Product-likeness score
421
+ rdkit-cli sascorer -i molecules.csv -o scores.csv --npc
422
+
423
+ # All scores
424
+ rdkit-cli sascorer -i molecules.csv -o scores.csv --qed --npc
425
+ ```
426
+
427
+ ### rgroup
428
+
429
+ R-group decomposition around a core structure.
430
+
431
+ ```bash
432
+ # Decompose around benzene core
433
+ rdkit-cli rgroup -i molecules.csv -o decomposed.csv --core "c1ccc([*:1])cc1"
434
+
435
+ # Multiple attachment points
436
+ rdkit-cli rgroup -i molecules.csv -o decomposed.csv \
437
+ --core "c1ccc([*:1])cc([*:2])1"
438
+ ```
439
+
440
+ ### rings
441
+
442
+ Ring system analysis.
443
+
444
+ ```bash
445
+ # Extract ring systems
446
+ rdkit-cli rings extract -i molecules.csv -o rings.csv
447
+
448
+ # Ring information (counts, sizes, aromaticity)
449
+ rdkit-cli rings info -i molecules.csv -o ring_info.csv
450
+
451
+ # Frequency analysis
452
+ rdkit-cli rings frequency -i molecules.csv -o ring_freq.csv
453
+ ```
454
+
455
+ ### align
456
+
457
+ 3D molecular alignment.
458
+
459
+ ```bash
460
+ # Align to reference structure (MCS-based)
461
+ rdkit-cli align -i probes.sdf -o aligned.sdf -r reference.sdf
462
+
463
+ # Open3DAlign method
464
+ rdkit-cli align -i probes.sdf -o aligned.sdf -r reference.sdf --method o3a
465
+ ```
466
+
467
+ ### rmsd
468
+
469
+ RMSD calculations between 3D structures.
470
+
471
+ ```bash
472
+ # Compare to reference
473
+ rdkit-cli rmsd compare -i molecules.sdf -o results.csv -r reference.sdf
474
+
475
+ # Pairwise RMSD matrix
476
+ rdkit-cli rmsd matrix -i molecules.sdf -o matrix.csv
477
+
478
+ # Conformer RMSD analysis
479
+ rdkit-cli rmsd conformers -i multi_conf.sdf -o conf_rmsd.csv
480
+ ```
481
+
482
+ ### mmp
483
+
484
+ Matched Molecular Pairs analysis.
485
+
486
+ ```bash
487
+ # Fragment molecules for MMP
488
+ rdkit-cli mmp fragment -i molecules.csv -o fragments.csv
489
+
490
+ # Find matched pairs
491
+ rdkit-cli mmp pairs -i fragments.csv -o pairs.csv
492
+
493
+ # Apply MMP transformation
494
+ rdkit-cli mmp transform -i molecules.csv -o transformed.csv \
495
+ -t "[c:1][CH3]>>[c:1][NH2]"
496
+ ```
497
+
498
+ ### protonate
499
+
500
+ Protonation state enumeration.
501
+
502
+ ```bash
503
+ # Enumerate at physiological pH
504
+ rdkit-cli protonate -i molecules.csv -o protonated.csv --ph 7.4
505
+
506
+ # Neutralize charged molecules
507
+ rdkit-cli protonate -i molecules.csv -o neutral.csv --neutralize
508
+
509
+ # Enumerate all states
510
+ rdkit-cli protonate -i molecules.csv -o states.csv --enumerate-all
511
+ ```
512
+
513
+ ### props
514
+
515
+ Property column operations.
516
+
517
+ ```bash
518
+ # Add a column
519
+ rdkit-cli props add -i molecules.csv -o output.csv -c series -v "series_A"
520
+
521
+ # Rename a column
522
+ rdkit-cli props rename -i molecules.csv -o output.csv --from name --to mol_name
523
+
524
+ # Drop columns
525
+ rdkit-cli props drop -i molecules.csv -o output.csv -c col1,col2
526
+
527
+ # Keep only specific columns
528
+ rdkit-cli props keep -i molecules.csv -o output.csv -c smiles,name,MolWt
529
+
530
+ # List columns
531
+ rdkit-cli props list -i molecules.csv
532
+ ```
533
+
293
534
  ## Global Options
294
535
 
295
536
  | Option | Description |
@@ -319,19 +560,28 @@ rdkit-cli depict grid -i molecules.csv -o grid.svg --mols-per-row 4
319
560
  ### Cheminformatics Pipeline
320
561
 
321
562
  ```bash
322
- # 1. Standardize input molecules
323
- rdkit-cli standardize -i raw.csv -o std.csv --cleanup --neutralize
563
+ # 1. Validate and filter input
564
+ rdkit-cli validate -i raw.csv -o validated.csv --valid-only
565
+
566
+ # 2. Deduplicate
567
+ rdkit-cli deduplicate -i validated.csv -o unique.csv -b inchikey
568
+
569
+ # 3. Standardize molecules
570
+ rdkit-cli standardize -i unique.csv -o std.csv --cleanup --neutralize
324
571
 
325
- # 2. Filter by drug-likeness
572
+ # 4. Filter by drug-likeness
326
573
  rdkit-cli filter druglike -i std.csv -o druglike.csv --rule lipinski
327
574
 
328
- # 3. Compute descriptors
575
+ # 5. Compute descriptors
329
576
  rdkit-cli descriptors compute -i druglike.csv -o desc.csv -d MolWt,MolLogP,TPSA,HBD,HBA
330
577
 
331
- # 4. Select diverse subset
578
+ # 6. Get dataset statistics
579
+ rdkit-cli stats -i druglike.csv -o stats.json --format json
580
+
581
+ # 7. Select diverse subset
332
582
  rdkit-cli diversity pick -i druglike.csv -o diverse.csv -k 500
333
583
 
334
- # 5. Generate depictions
584
+ # 8. Generate depictions
335
585
  rdkit-cli depict grid -i diverse.csv -o library.svg --mols-per-row 10
336
586
  ```
337
587
 
@@ -358,6 +608,19 @@ rdkit-cli scaffold murcko -i library.csv -o scaffolds.csv
358
608
  rdkit-cli diversity analyze -i scaffolds.csv --smiles-column scaffold
359
609
  ```
360
610
 
611
+ ### Large Dataset Processing
612
+
613
+ ```bash
614
+ # Sample from a huge dataset
615
+ rdkit-cli sample -i huge_library.csv -o sample.csv -k 10000 --stream
616
+
617
+ # Split for parallel processing
618
+ rdkit-cli split -i library.csv -o batches/ -c 10
619
+
620
+ # Process batches in parallel (using xargs)
621
+ ls batches/*.csv | xargs -P 4 -I {} rdkit-cli descriptors compute -i {} -o {}.desc.csv -d MolWt,MolLogP
622
+ ```
623
+
361
624
  ## Development
362
625
 
363
626
  ```bash
@@ -4,7 +4,7 @@ A comprehensive, high-performance CLI tool wrapping RDKit functionality for chem
4
4
 
5
5
  ## Features
6
6
 
7
- - **14 Command Categories**: descriptors, fingerprints, filter, convert, standardize, similarity, conformers, reactions, scaffold, enumerate, fragment, diversity, mcs, depict
7
+ - **29 Command Categories**: align, conformers, convert, deduplicate, depict, descriptors, diversity, enumerate, filter, fingerprints, fragment, info, mcs, merge, mmp, props, protonate, reactions, rgroup, rings, rmsd, sample, sascorer, scaffold, similarity, split, standardize, stats, validate
8
8
  - **Multiple Input/Output Formats**: CSV, TSV, SMI, SDF, Parquet
9
9
  - **Parallel Processing**: Efficient multi-core support via ProcessPoolExecutor
10
10
  - **Ninja-style Progress**: Real-time progress display with speed and ETA
@@ -256,6 +256,247 @@ rdkit-cli depict batch -i molecules.csv -o images/ -f svg
256
256
  rdkit-cli depict grid -i molecules.csv -o grid.svg --mols-per-row 4
257
257
  ```
258
258
 
259
+ ### stats
260
+
261
+ Calculate dataset statistics.
262
+
263
+ ```bash
264
+ # Basic statistics
265
+ rdkit-cli stats -i molecules.csv -o stats.json --format json
266
+
267
+ # Specific properties
268
+ rdkit-cli stats -i molecules.csv -p MolWt,LogP,TPSA
269
+
270
+ # List available properties
271
+ rdkit-cli stats -i molecules.csv --list-properties
272
+ ```
273
+
274
+ ### split
275
+
276
+ Split files into smaller chunks.
277
+
278
+ ```bash
279
+ # Split into N files
280
+ rdkit-cli split -i large.csv -o chunks/ -c 10
281
+
282
+ # Split by chunk size
283
+ rdkit-cli split -i large.csv -o chunks/ -s 1000
284
+
285
+ # With custom prefix
286
+ rdkit-cli split -i large.csv -o chunks/ -c 5 --prefix molecules
287
+ ```
288
+
289
+ ### sample
290
+
291
+ Randomly sample molecules.
292
+
293
+ ```bash
294
+ # Sample by count
295
+ rdkit-cli sample -i molecules.csv -o sample.csv -k 100 --seed 42
296
+
297
+ # Sample by fraction
298
+ rdkit-cli sample -i molecules.csv -o sample.csv -f 0.1
299
+
300
+ # Memory-efficient streaming (reservoir sampling)
301
+ rdkit-cli sample -i huge.csv -o sample.csv -k 1000 --stream
302
+ ```
303
+
304
+ ### deduplicate
305
+
306
+ Remove duplicate molecules.
307
+
308
+ ```bash
309
+ # Deduplicate by canonical SMILES (default)
310
+ rdkit-cli deduplicate -i molecules.csv -o unique.csv
311
+
312
+ # Deduplicate by InChIKey
313
+ rdkit-cli deduplicate -i molecules.csv -o unique.csv -b inchikey
314
+
315
+ # Deduplicate by scaffold
316
+ rdkit-cli deduplicate -i molecules.csv -o unique.csv -b scaffold
317
+
318
+ # Keep last occurrence instead of first
319
+ rdkit-cli deduplicate -i molecules.csv -o unique.csv --keep last
320
+ ```
321
+
322
+ ### validate
323
+
324
+ Validate molecular structures.
325
+
326
+ ```bash
327
+ # Basic validation
328
+ rdkit-cli validate -i molecules.csv -o validated.csv
329
+
330
+ # Output only valid molecules
331
+ rdkit-cli validate -i molecules.csv -o valid.csv --valid-only
332
+
333
+ # With constraints
334
+ rdkit-cli validate -i molecules.csv -o validated.csv \
335
+ --max-atoms 100 --max-rings 8
336
+
337
+ # Check allowed elements
338
+ rdkit-cli validate -i molecules.csv -o validated.csv \
339
+ --allowed-elements C,H,N,O,S,F,Cl
340
+
341
+ # Check stereo and show summary
342
+ rdkit-cli validate -i molecules.csv -o validated.csv \
343
+ --check-stereo --summary
344
+ ```
345
+
346
+ ### info
347
+
348
+ Quick molecule information from SMILES.
349
+
350
+ ```bash
351
+ # Basic info
352
+ rdkit-cli info "CCO"
353
+
354
+ # JSON output
355
+ rdkit-cli info "c1ccccc1" --json
356
+
357
+ # Shows: formula, MW, LogP, TPSA, stereocenters, Lipinski violations, InChI/InChIKey
358
+ ```
359
+
360
+ ### merge
361
+
362
+ Combine multiple molecule files.
363
+
364
+ ```bash
365
+ # Merge two files
366
+ rdkit-cli merge -i file1.csv file2.csv -o merged.csv
367
+
368
+ # Merge with deduplication
369
+ rdkit-cli merge -i file1.csv file2.csv -o merged.csv --dedupe
370
+
371
+ # Track source file
372
+ rdkit-cli merge -i file1.csv file2.csv -o merged.csv --source-column source
373
+ ```
374
+
375
+ ### sascorer
376
+
377
+ Calculate synthetic accessibility and drug-likeness scores.
378
+
379
+ ```bash
380
+ # SA Score only (default)
381
+ rdkit-cli sascorer -i molecules.csv -o scores.csv
382
+
383
+ # Include QED score
384
+ rdkit-cli sascorer -i molecules.csv -o scores.csv --qed
385
+
386
+ # Include Natural Product-likeness score
387
+ rdkit-cli sascorer -i molecules.csv -o scores.csv --npc
388
+
389
+ # All scores
390
+ rdkit-cli sascorer -i molecules.csv -o scores.csv --qed --npc
391
+ ```
392
+
393
+ ### rgroup
394
+
395
+ R-group decomposition around a core structure.
396
+
397
+ ```bash
398
+ # Decompose around benzene core
399
+ rdkit-cli rgroup -i molecules.csv -o decomposed.csv --core "c1ccc([*:1])cc1"
400
+
401
+ # Multiple attachment points
402
+ rdkit-cli rgroup -i molecules.csv -o decomposed.csv \
403
+ --core "c1ccc([*:1])cc([*:2])1"
404
+ ```
405
+
406
+ ### rings
407
+
408
+ Ring system analysis.
409
+
410
+ ```bash
411
+ # Extract ring systems
412
+ rdkit-cli rings extract -i molecules.csv -o rings.csv
413
+
414
+ # Ring information (counts, sizes, aromaticity)
415
+ rdkit-cli rings info -i molecules.csv -o ring_info.csv
416
+
417
+ # Frequency analysis
418
+ rdkit-cli rings frequency -i molecules.csv -o ring_freq.csv
419
+ ```
420
+
421
+ ### align
422
+
423
+ 3D molecular alignment.
424
+
425
+ ```bash
426
+ # Align to reference structure (MCS-based)
427
+ rdkit-cli align -i probes.sdf -o aligned.sdf -r reference.sdf
428
+
429
+ # Open3DAlign method
430
+ rdkit-cli align -i probes.sdf -o aligned.sdf -r reference.sdf --method o3a
431
+ ```
432
+
433
+ ### rmsd
434
+
435
+ RMSD calculations between 3D structures.
436
+
437
+ ```bash
438
+ # Compare to reference
439
+ rdkit-cli rmsd compare -i molecules.sdf -o results.csv -r reference.sdf
440
+
441
+ # Pairwise RMSD matrix
442
+ rdkit-cli rmsd matrix -i molecules.sdf -o matrix.csv
443
+
444
+ # Conformer RMSD analysis
445
+ rdkit-cli rmsd conformers -i multi_conf.sdf -o conf_rmsd.csv
446
+ ```
447
+
448
+ ### mmp
449
+
450
+ Matched Molecular Pairs analysis.
451
+
452
+ ```bash
453
+ # Fragment molecules for MMP
454
+ rdkit-cli mmp fragment -i molecules.csv -o fragments.csv
455
+
456
+ # Find matched pairs
457
+ rdkit-cli mmp pairs -i fragments.csv -o pairs.csv
458
+
459
+ # Apply MMP transformation
460
+ rdkit-cli mmp transform -i molecules.csv -o transformed.csv \
461
+ -t "[c:1][CH3]>>[c:1][NH2]"
462
+ ```
463
+
464
+ ### protonate
465
+
466
+ Protonation state enumeration.
467
+
468
+ ```bash
469
+ # Enumerate at physiological pH
470
+ rdkit-cli protonate -i molecules.csv -o protonated.csv --ph 7.4
471
+
472
+ # Neutralize charged molecules
473
+ rdkit-cli protonate -i molecules.csv -o neutral.csv --neutralize
474
+
475
+ # Enumerate all states
476
+ rdkit-cli protonate -i molecules.csv -o states.csv --enumerate-all
477
+ ```
478
+
479
+ ### props
480
+
481
+ Property column operations.
482
+
483
+ ```bash
484
+ # Add a column
485
+ rdkit-cli props add -i molecules.csv -o output.csv -c series -v "series_A"
486
+
487
+ # Rename a column
488
+ rdkit-cli props rename -i molecules.csv -o output.csv --from name --to mol_name
489
+
490
+ # Drop columns
491
+ rdkit-cli props drop -i molecules.csv -o output.csv -c col1,col2
492
+
493
+ # Keep only specific columns
494
+ rdkit-cli props keep -i molecules.csv -o output.csv -c smiles,name,MolWt
495
+
496
+ # List columns
497
+ rdkit-cli props list -i molecules.csv
498
+ ```
499
+
259
500
  ## Global Options
260
501
 
261
502
  | Option | Description |
@@ -285,19 +526,28 @@ rdkit-cli depict grid -i molecules.csv -o grid.svg --mols-per-row 4
285
526
  ### Cheminformatics Pipeline
286
527
 
287
528
  ```bash
288
- # 1. Standardize input molecules
289
- rdkit-cli standardize -i raw.csv -o std.csv --cleanup --neutralize
529
+ # 1. Validate and filter input
530
+ rdkit-cli validate -i raw.csv -o validated.csv --valid-only
531
+
532
+ # 2. Deduplicate
533
+ rdkit-cli deduplicate -i validated.csv -o unique.csv -b inchikey
534
+
535
+ # 3. Standardize molecules
536
+ rdkit-cli standardize -i unique.csv -o std.csv --cleanup --neutralize
290
537
 
291
- # 2. Filter by drug-likeness
538
+ # 4. Filter by drug-likeness
292
539
  rdkit-cli filter druglike -i std.csv -o druglike.csv --rule lipinski
293
540
 
294
- # 3. Compute descriptors
541
+ # 5. Compute descriptors
295
542
  rdkit-cli descriptors compute -i druglike.csv -o desc.csv -d MolWt,MolLogP,TPSA,HBD,HBA
296
543
 
297
- # 4. Select diverse subset
544
+ # 6. Get dataset statistics
545
+ rdkit-cli stats -i druglike.csv -o stats.json --format json
546
+
547
+ # 7. Select diverse subset
298
548
  rdkit-cli diversity pick -i druglike.csv -o diverse.csv -k 500
299
549
 
300
- # 5. Generate depictions
550
+ # 8. Generate depictions
301
551
  rdkit-cli depict grid -i diverse.csv -o library.svg --mols-per-row 10
302
552
  ```
303
553
 
@@ -324,6 +574,19 @@ rdkit-cli scaffold murcko -i library.csv -o scaffolds.csv
324
574
  rdkit-cli diversity analyze -i scaffolds.csv --smiles-column scaffold
325
575
  ```
326
576
 
577
+ ### Large Dataset Processing
578
+
579
+ ```bash
580
+ # Sample from a huge dataset
581
+ rdkit-cli sample -i huge_library.csv -o sample.csv -k 10000 --stream
582
+
583
+ # Split for parallel processing
584
+ rdkit-cli split -i library.csv -o batches/ -c 10
585
+
586
+ # Process batches in parallel (using xargs)
587
+ ls batches/*.csv | xargs -P 4 -I {} rdkit-cli descriptors compute -i {} -o {}.desc.csv -d MolWt,MolLogP
588
+ ```
589
+
327
590
  ## Development
328
591
 
329
592
  ```bash