rdkit-cli 0.1.0__tar.gz → 0.2.0__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/CHANGELOG.md +15 -0
  2. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/PKG-INFO +117 -8
  3. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/README.md +116 -7
  4. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/pyproject.toml +1 -1
  5. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/__init__.py +1 -1
  6. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/cli.py +55 -21
  7. rdkit_cli-0.2.0/src/rdkit_cli/commands/deduplicate.py +123 -0
  8. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/depict.py +122 -0
  9. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/descriptors.py +38 -0
  10. rdkit_cli-0.2.0/src/rdkit_cli/commands/sample.py +152 -0
  11. rdkit_cli-0.2.0/src/rdkit_cli/commands/split.py +152 -0
  12. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/standardize.py +61 -0
  13. rdkit_cli-0.2.0/src/rdkit_cli/commands/stats.py +172 -0
  14. rdkit_cli-0.2.0/src/rdkit_cli/commands/validate.py +189 -0
  15. rdkit_cli-0.2.0/src/rdkit_cli/core/deduplicate.py +190 -0
  16. rdkit_cli-0.2.0/src/rdkit_cli/core/sample.py +156 -0
  17. rdkit_cli-0.2.0/src/rdkit_cli/core/split.py +118 -0
  18. rdkit_cli-0.2.0/src/rdkit_cli/core/stats.py +96 -0
  19. rdkit_cli-0.2.0/src/rdkit_cli/core/validate.py +222 -0
  20. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/io/readers.py +11 -4
  21. rdkit_cli-0.2.0/src/rdkit_cli/utils/__init__.py +27 -0
  22. rdkit_cli-0.2.0/src/rdkit_cli/utils/logging.py +113 -0
  23. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/integration/test_cli.py +401 -0
  24. rdkit_cli-0.2.0/tests/unit/test_deduplicate.py +161 -0
  25. rdkit_cli-0.2.0/tests/unit/test_filters.py +529 -0
  26. rdkit_cli-0.2.0/tests/unit/test_sample.py +155 -0
  27. rdkit_cli-0.2.0/tests/unit/test_split.py +101 -0
  28. rdkit_cli-0.2.0/tests/unit/test_stats.py +108 -0
  29. rdkit_cli-0.2.0/tests/unit/test_validate.py +208 -0
  30. rdkit_cli-0.1.0/src/rdkit_cli/utils/__init__.py +0 -1
  31. rdkit_cli-0.1.0/tests/unit/test_filters.py +0 -211
  32. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/.github/workflows/publish.yml +0 -0
  33. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/LICENSE +0 -0
  34. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/__main__.py +0 -0
  35. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/__init__.py +0 -0
  36. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/conformers.py +0 -0
  37. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/convert.py +0 -0
  38. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/diversity.py +0 -0
  39. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/enumerate.py +0 -0
  40. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/filter.py +0 -0
  41. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/fingerprints.py +0 -0
  42. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/fragment.py +0 -0
  43. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/mcs.py +0 -0
  44. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/reactions.py +0 -0
  45. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/scaffold.py +0 -0
  46. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/commands/similarity.py +0 -0
  47. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/__init__.py +0 -0
  48. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/conformers.py +0 -0
  49. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/depict.py +0 -0
  50. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/descriptors.py +0 -0
  51. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/diversity.py +0 -0
  52. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/enumerate.py +0 -0
  53. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/filters.py +0 -0
  54. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/fingerprints.py +0 -0
  55. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/fragment.py +0 -0
  56. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/mcs.py +0 -0
  57. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/reactions.py +0 -0
  58. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/scaffold.py +0 -0
  59. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/similarity.py +0 -0
  60. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/core/standardizer.py +0 -0
  61. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/io/__init__.py +0 -0
  62. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/io/formats.py +0 -0
  63. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/io/writers.py +0 -0
  64. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/parallel/__init__.py +0 -0
  65. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/parallel/batch.py +0 -0
  66. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/parallel/executor.py +0 -0
  67. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/progress/__init__.py +0 -0
  68. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/src/rdkit_cli/progress/ninja.py +0 -0
  69. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/__init__.py +0 -0
  70. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/conftest.py +0 -0
  71. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/fixtures/sample.csv +0 -0
  72. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/fixtures/sample.smi +0 -0
  73. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/integration/__init__.py +0 -0
  74. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/integration/test_interop.py +0 -0
  75. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/__init__.py +0 -0
  76. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_depict.py +0 -0
  77. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_descriptors.py +0 -0
  78. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_diversity.py +0 -0
  79. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_enumerate.py +0 -0
  80. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_fingerprints.py +0 -0
  81. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_fragment.py +0 -0
  82. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_io.py +0 -0
  83. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_mcs.py +0 -0
  84. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_reactions.py +0 -0
  85. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_scaffold.py +0 -0
  86. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_similarity.py +0 -0
  87. {rdkit_cli-0.1.0 → rdkit_cli-0.2.0}/tests/unit/test_standardizer.py +0 -0
@@ -5,6 +5,21 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [0.2.0] - 2026-01-06
9
+
10
+ ### Added
11
+
12
+ - **stats**: Calculate dataset statistics (MolWt, LogP, TPSA, etc. with min/max/mean/median/stdev)
13
+ - **split**: Split files into smaller chunks (by number of chunks or chunk size)
14
+ - **sample**: Randomly sample molecules (by count or fraction, with reservoir sampling for large files)
15
+ - **deduplicate**: Remove duplicate molecules (by SMILES, InChI, InChIKey, or scaffold)
16
+ - **validate**: Validate molecular structures (valence, kekulization, stereo, element constraints)
17
+
18
+ ### Changed
19
+
20
+ - Commands are now displayed in alphabetical order in help output
21
+ - Total command count increased from 14 to 19
22
+
8
23
  ## [0.1.0] - 2026-01-06
9
24
 
10
25
  ### Added
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: rdkit-cli
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: A comprehensive CLI tool for RDKit cheminformatics operations
5
5
  Project-URL: Homepage, https://github.com/vitruves/rdkit-cli
6
6
  Project-URL: Repository, https://github.com/vitruves/rdkit-cli
@@ -38,7 +38,7 @@ A comprehensive, high-performance CLI tool wrapping RDKit functionality for chem
38
38
 
39
39
  ## Features
40
40
 
41
- - **14 Command Categories**: descriptors, fingerprints, filter, convert, standardize, similarity, conformers, reactions, scaffold, enumerate, fragment, diversity, mcs, depict
41
+ - **19 Command Categories**: descriptors, fingerprints, filter, convert, standardize, similarity, conformers, reactions, scaffold, enumerate, fragment, diversity, mcs, depict, stats, split, sample, deduplicate, validate
42
42
  - **Multiple Input/Output Formats**: CSV, TSV, SMI, SDF, Parquet
43
43
  - **Parallel Processing**: Efficient multi-core support via ProcessPoolExecutor
44
44
  - **Ninja-style Progress**: Real-time progress display with speed and ETA
@@ -290,6 +290,93 @@ rdkit-cli depict batch -i molecules.csv -o images/ -f svg
290
290
  rdkit-cli depict grid -i molecules.csv -o grid.svg --mols-per-row 4
291
291
  ```
292
292
 
293
+ ### stats
294
+
295
+ Calculate dataset statistics.
296
+
297
+ ```bash
298
+ # Basic statistics
299
+ rdkit-cli stats -i molecules.csv -o stats.json --format json
300
+
301
+ # Specific properties
302
+ rdkit-cli stats -i molecules.csv -p MolWt,LogP,TPSA
303
+
304
+ # List available properties
305
+ rdkit-cli stats -i molecules.csv --list-properties
306
+ ```
307
+
308
+ ### split
309
+
310
+ Split files into smaller chunks.
311
+
312
+ ```bash
313
+ # Split into N files
314
+ rdkit-cli split -i large.csv -o chunks/ -c 10
315
+
316
+ # Split by chunk size
317
+ rdkit-cli split -i large.csv -o chunks/ -s 1000
318
+
319
+ # With custom prefix
320
+ rdkit-cli split -i large.csv -o chunks/ -c 5 --prefix molecules
321
+ ```
322
+
323
+ ### sample
324
+
325
+ Randomly sample molecules.
326
+
327
+ ```bash
328
+ # Sample by count
329
+ rdkit-cli sample -i molecules.csv -o sample.csv -k 100 --seed 42
330
+
331
+ # Sample by fraction
332
+ rdkit-cli sample -i molecules.csv -o sample.csv -f 0.1
333
+
334
+ # Memory-efficient streaming (reservoir sampling)
335
+ rdkit-cli sample -i huge.csv -o sample.csv -k 1000 --stream
336
+ ```
337
+
338
+ ### deduplicate
339
+
340
+ Remove duplicate molecules.
341
+
342
+ ```bash
343
+ # Deduplicate by canonical SMILES (default)
344
+ rdkit-cli deduplicate -i molecules.csv -o unique.csv
345
+
346
+ # Deduplicate by InChIKey
347
+ rdkit-cli deduplicate -i molecules.csv -o unique.csv -b inchikey
348
+
349
+ # Deduplicate by scaffold
350
+ rdkit-cli deduplicate -i molecules.csv -o unique.csv -b scaffold
351
+
352
+ # Keep last occurrence instead of first
353
+ rdkit-cli deduplicate -i molecules.csv -o unique.csv --keep last
354
+ ```
355
+
356
+ ### validate
357
+
358
+ Validate molecular structures.
359
+
360
+ ```bash
361
+ # Basic validation
362
+ rdkit-cli validate -i molecules.csv -o validated.csv
363
+
364
+ # Output only valid molecules
365
+ rdkit-cli validate -i molecules.csv -o valid.csv --valid-only
366
+
367
+ # With constraints
368
+ rdkit-cli validate -i molecules.csv -o validated.csv \
369
+ --max-atoms 100 --max-rings 8
370
+
371
+ # Check allowed elements
372
+ rdkit-cli validate -i molecules.csv -o validated.csv \
373
+ --allowed-elements C,H,N,O,S,F,Cl
374
+
375
+ # Check stereo and show summary
376
+ rdkit-cli validate -i molecules.csv -o validated.csv \
377
+ --check-stereo --summary
378
+ ```
379
+
293
380
  ## Global Options
294
381
 
295
382
  | Option | Description |
@@ -319,19 +406,28 @@ rdkit-cli depict grid -i molecules.csv -o grid.svg --mols-per-row 4
319
406
  ### Cheminformatics Pipeline
320
407
 
321
408
  ```bash
322
- # 1. Standardize input molecules
323
- rdkit-cli standardize -i raw.csv -o std.csv --cleanup --neutralize
409
+ # 1. Validate and filter input
410
+ rdkit-cli validate -i raw.csv -o validated.csv --valid-only
411
+
412
+ # 2. Deduplicate
413
+ rdkit-cli deduplicate -i validated.csv -o unique.csv -b inchikey
324
414
 
325
- # 2. Filter by drug-likeness
415
+ # 3. Standardize molecules
416
+ rdkit-cli standardize -i unique.csv -o std.csv --cleanup --neutralize
417
+
418
+ # 4. Filter by drug-likeness
326
419
  rdkit-cli filter druglike -i std.csv -o druglike.csv --rule lipinski
327
420
 
328
- # 3. Compute descriptors
421
+ # 5. Compute descriptors
329
422
  rdkit-cli descriptors compute -i druglike.csv -o desc.csv -d MolWt,MolLogP,TPSA,HBD,HBA
330
423
 
331
- # 4. Select diverse subset
424
+ # 6. Get dataset statistics
425
+ rdkit-cli stats -i druglike.csv -o stats.json --format json
426
+
427
+ # 7. Select diverse subset
332
428
  rdkit-cli diversity pick -i druglike.csv -o diverse.csv -k 500
333
429
 
334
- # 5. Generate depictions
430
+ # 8. Generate depictions
335
431
  rdkit-cli depict grid -i diverse.csv -o library.svg --mols-per-row 10
336
432
  ```
337
433
 
@@ -358,6 +454,19 @@ rdkit-cli scaffold murcko -i library.csv -o scaffolds.csv
358
454
  rdkit-cli diversity analyze -i scaffolds.csv --smiles-column scaffold
359
455
  ```
360
456
 
457
+ ### Large Dataset Processing
458
+
459
+ ```bash
460
+ # Sample from a huge dataset
461
+ rdkit-cli sample -i huge_library.csv -o sample.csv -k 10000 --stream
462
+
463
+ # Split for parallel processing
464
+ rdkit-cli split -i library.csv -o batches/ -c 10
465
+
466
+ # Process batches in parallel (using xargs)
467
+ ls batches/*.csv | xargs -P 4 -I {} rdkit-cli descriptors compute -i {} -o {}.desc.csv -d MolWt,LogP
468
+ ```
469
+
361
470
  ## Development
362
471
 
363
472
  ```bash
@@ -4,7 +4,7 @@ A comprehensive, high-performance CLI tool wrapping RDKit functionality for chem
4
4
 
5
5
  ## Features
6
6
 
7
- - **14 Command Categories**: descriptors, fingerprints, filter, convert, standardize, similarity, conformers, reactions, scaffold, enumerate, fragment, diversity, mcs, depict
7
+ - **19 Command Categories**: descriptors, fingerprints, filter, convert, standardize, similarity, conformers, reactions, scaffold, enumerate, fragment, diversity, mcs, depict, stats, split, sample, deduplicate, validate
8
8
  - **Multiple Input/Output Formats**: CSV, TSV, SMI, SDF, Parquet
9
9
  - **Parallel Processing**: Efficient multi-core support via ProcessPoolExecutor
10
10
  - **Ninja-style Progress**: Real-time progress display with speed and ETA
@@ -256,6 +256,93 @@ rdkit-cli depict batch -i molecules.csv -o images/ -f svg
256
256
  rdkit-cli depict grid -i molecules.csv -o grid.svg --mols-per-row 4
257
257
  ```
258
258
 
259
+ ### stats
260
+
261
+ Calculate dataset statistics.
262
+
263
+ ```bash
264
+ # Basic statistics
265
+ rdkit-cli stats -i molecules.csv -o stats.json --format json
266
+
267
+ # Specific properties
268
+ rdkit-cli stats -i molecules.csv -p MolWt,LogP,TPSA
269
+
270
+ # List available properties
271
+ rdkit-cli stats -i molecules.csv --list-properties
272
+ ```
273
+
274
+ ### split
275
+
276
+ Split files into smaller chunks.
277
+
278
+ ```bash
279
+ # Split into N files
280
+ rdkit-cli split -i large.csv -o chunks/ -c 10
281
+
282
+ # Split by chunk size
283
+ rdkit-cli split -i large.csv -o chunks/ -s 1000
284
+
285
+ # With custom prefix
286
+ rdkit-cli split -i large.csv -o chunks/ -c 5 --prefix molecules
287
+ ```
288
+
289
+ ### sample
290
+
291
+ Randomly sample molecules.
292
+
293
+ ```bash
294
+ # Sample by count
295
+ rdkit-cli sample -i molecules.csv -o sample.csv -k 100 --seed 42
296
+
297
+ # Sample by fraction
298
+ rdkit-cli sample -i molecules.csv -o sample.csv -f 0.1
299
+
300
+ # Memory-efficient streaming (reservoir sampling)
301
+ rdkit-cli sample -i huge.csv -o sample.csv -k 1000 --stream
302
+ ```
303
+
304
+ ### deduplicate
305
+
306
+ Remove duplicate molecules.
307
+
308
+ ```bash
309
+ # Deduplicate by canonical SMILES (default)
310
+ rdkit-cli deduplicate -i molecules.csv -o unique.csv
311
+
312
+ # Deduplicate by InChIKey
313
+ rdkit-cli deduplicate -i molecules.csv -o unique.csv -b inchikey
314
+
315
+ # Deduplicate by scaffold
316
+ rdkit-cli deduplicate -i molecules.csv -o unique.csv -b scaffold
317
+
318
+ # Keep last occurrence instead of first
319
+ rdkit-cli deduplicate -i molecules.csv -o unique.csv --keep last
320
+ ```
321
+
322
+ ### validate
323
+
324
+ Validate molecular structures.
325
+
326
+ ```bash
327
+ # Basic validation
328
+ rdkit-cli validate -i molecules.csv -o validated.csv
329
+
330
+ # Output only valid molecules
331
+ rdkit-cli validate -i molecules.csv -o valid.csv --valid-only
332
+
333
+ # With constraints
334
+ rdkit-cli validate -i molecules.csv -o validated.csv \
335
+ --max-atoms 100 --max-rings 8
336
+
337
+ # Check allowed elements
338
+ rdkit-cli validate -i molecules.csv -o validated.csv \
339
+ --allowed-elements C,H,N,O,S,F,Cl
340
+
341
+ # Check stereo and show summary
342
+ rdkit-cli validate -i molecules.csv -o validated.csv \
343
+ --check-stereo --summary
344
+ ```
345
+
259
346
  ## Global Options
260
347
 
261
348
  | Option | Description |
@@ -285,19 +372,28 @@ rdkit-cli depict grid -i molecules.csv -o grid.svg --mols-per-row 4
285
372
  ### Cheminformatics Pipeline
286
373
 
287
374
  ```bash
288
- # 1. Standardize input molecules
289
- rdkit-cli standardize -i raw.csv -o std.csv --cleanup --neutralize
375
+ # 1. Validate and filter input
376
+ rdkit-cli validate -i raw.csv -o validated.csv --valid-only
377
+
378
+ # 2. Deduplicate
379
+ rdkit-cli deduplicate -i validated.csv -o unique.csv -b inchikey
290
380
 
291
- # 2. Filter by drug-likeness
381
+ # 3. Standardize molecules
382
+ rdkit-cli standardize -i unique.csv -o std.csv --cleanup --neutralize
383
+
384
+ # 4. Filter by drug-likeness
292
385
  rdkit-cli filter druglike -i std.csv -o druglike.csv --rule lipinski
293
386
 
294
- # 3. Compute descriptors
387
+ # 5. Compute descriptors
295
388
  rdkit-cli descriptors compute -i druglike.csv -o desc.csv -d MolWt,MolLogP,TPSA,HBD,HBA
296
389
 
297
- # 4. Select diverse subset
390
+ # 6. Get dataset statistics
391
+ rdkit-cli stats -i druglike.csv -o stats.json --format json
392
+
393
+ # 7. Select diverse subset
298
394
  rdkit-cli diversity pick -i druglike.csv -o diverse.csv -k 500
299
395
 
300
- # 5. Generate depictions
396
+ # 8. Generate depictions
301
397
  rdkit-cli depict grid -i diverse.csv -o library.svg --mols-per-row 10
302
398
  ```
303
399
 
@@ -324,6 +420,19 @@ rdkit-cli scaffold murcko -i library.csv -o scaffolds.csv
324
420
  rdkit-cli diversity analyze -i scaffolds.csv --smiles-column scaffold
325
421
  ```
326
422
 
423
+ ### Large Dataset Processing
424
+
425
+ ```bash
426
+ # Sample from a huge dataset
427
+ rdkit-cli sample -i huge_library.csv -o sample.csv -k 10000 --stream
428
+
429
+ # Split for parallel processing
430
+ rdkit-cli split -i library.csv -o batches/ -c 10
431
+
432
+ # Process batches in parallel (using xargs)
433
+ ls batches/*.csv | xargs -P 4 -I {} rdkit-cli descriptors compute -i {} -o {}.desc.csv -d MolWt,LogP
434
+ ```
435
+
327
436
  ## Development
328
437
 
329
438
  ```bash
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "rdkit-cli"
3
- version = "0.1.0"
3
+ version = "0.2.0"
4
4
  description = "A comprehensive CLI tool for RDKit cheminformatics operations"
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -1,4 +1,4 @@
1
1
  """rdkit-cli: A comprehensive CLI tool for RDKit cheminformatics operations."""
2
2
 
3
- __version__ = "0.1.0"
3
+ __version__ = "0.2.0"
4
4
  __author__ = "Vitruves"
@@ -68,6 +68,18 @@ def add_common_processing_options(parser: argparse.ArgumentParser):
68
68
  action="store_true",
69
69
  help="Suppress progress output",
70
70
  )
71
+ parser.add_argument(
72
+ "--no-warnings",
73
+ action="store_true",
74
+ help="Suppress RDKit warnings (kekulization errors, etc.)",
75
+ )
76
+ parser.add_argument(
77
+ "--log-level",
78
+ choices=["debug", "info", "warning", "error", "critical"],
79
+ default=None,
80
+ metavar="LEVEL",
81
+ help="RDKit log level (default: warning, use 'error' to suppress warnings)",
82
+ )
71
83
 
72
84
 
73
85
  def create_parser() -> argparse.ArgumentParser:
@@ -100,39 +112,49 @@ def create_parser() -> argparse.ArgumentParser:
100
112
 
101
113
 
102
114
  def _register_commands(subparsers):
103
- """Register all command subparsers."""
115
+ """Register all command subparsers (alphabetical order)."""
104
116
  from rdkit_cli.commands import (
105
- descriptors,
106
- fingerprints,
107
- filter,
108
- convert,
109
- standardize,
110
- similarity,
111
117
  conformers,
112
- reactions,
113
- scaffold,
118
+ convert,
119
+ deduplicate,
120
+ depict,
121
+ descriptors,
122
+ diversity,
114
123
  enumerate,
124
+ filter,
125
+ fingerprints,
115
126
  fragment,
116
- diversity,
117
127
  mcs,
118
- depict,
128
+ reactions,
129
+ sample,
130
+ scaffold,
131
+ similarity,
132
+ split,
133
+ standardize,
134
+ stats,
135
+ validate,
119
136
  )
120
137
 
121
138
  # Each module has a register_parser(subparsers) function
122
- descriptors.register_parser(subparsers)
123
- fingerprints.register_parser(subparsers)
124
- filter.register_parser(subparsers)
125
- convert.register_parser(subparsers)
126
- standardize.register_parser(subparsers)
127
- similarity.register_parser(subparsers)
128
139
  conformers.register_parser(subparsers)
129
- reactions.register_parser(subparsers)
130
- scaffold.register_parser(subparsers)
140
+ convert.register_parser(subparsers)
141
+ deduplicate.register_parser(subparsers)
142
+ depict.register_parser(subparsers)
143
+ descriptors.register_parser(subparsers)
144
+ diversity.register_parser(subparsers)
131
145
  enumerate.register_parser(subparsers)
146
+ filter.register_parser(subparsers)
147
+ fingerprints.register_parser(subparsers)
132
148
  fragment.register_parser(subparsers)
133
- diversity.register_parser(subparsers)
134
149
  mcs.register_parser(subparsers)
135
- depict.register_parser(subparsers)
150
+ reactions.register_parser(subparsers)
151
+ sample.register_parser(subparsers)
152
+ scaffold.register_parser(subparsers)
153
+ similarity.register_parser(subparsers)
154
+ split.register_parser(subparsers)
155
+ standardize.register_parser(subparsers)
156
+ stats.register_parser(subparsers)
157
+ validate.register_parser(subparsers)
136
158
 
137
159
 
138
160
  def main(args: Optional[list[str]] = None) -> int:
@@ -144,6 +166,18 @@ def main(args: Optional[list[str]] = None) -> int:
144
166
  parser.print_help()
145
167
  return 1
146
168
 
169
+ # Configure logging based on --no-warnings or --log-level
170
+ from rdkit_cli.utils import configure_all_warnings, set_rdkit_log_level
171
+ no_warnings = getattr(parsed_args, "no_warnings", False)
172
+ log_level = getattr(parsed_args, "log_level", None)
173
+
174
+ if no_warnings:
175
+ # Suppress both RDKit and application warnings
176
+ configure_all_warnings(suppress=True)
177
+ elif log_level is not None:
178
+ # Only control RDKit log level
179
+ set_rdkit_log_level(log_level)
180
+
147
181
  # Each command has a run(args) function via set_defaults(func=...)
148
182
  try:
149
183
  return parsed_args.func(parsed_args)
@@ -0,0 +1,123 @@
1
+ """Deduplicate command implementation."""
2
+
3
+ import sys
4
+ from pathlib import Path
5
+
6
+ from rdkit_cli.cli import RdkitHelpFormatter, add_common_io_options, add_common_processing_options
7
+
8
+
9
+ def register_parser(subparsers):
10
+ """Register the deduplicate command."""
11
+ parser = subparsers.add_parser(
12
+ "deduplicate",
13
+ help="Remove duplicate molecules",
14
+ description="Remove duplicate molecules from a dataset based on various molecular identifiers.",
15
+ formatter_class=RdkitHelpFormatter,
16
+ )
17
+
18
+ add_common_io_options(parser)
19
+ add_common_processing_options(parser)
20
+
21
+ parser.add_argument(
22
+ "-b", "--by",
23
+ choices=["smiles", "inchi", "inchikey", "scaffold"],
24
+ default="smiles",
25
+ help="Deduplication key type (default: smiles)",
26
+ )
27
+ parser.add_argument(
28
+ "--keep",
29
+ choices=["first", "last"],
30
+ default="first",
31
+ help="Which duplicate to keep (default: first)",
32
+ )
33
+ parser.add_argument(
34
+ "--list-keys",
35
+ action="store_true",
36
+ help="List available key types and exit",
37
+ )
38
+
39
+ parser.set_defaults(func=run_deduplicate)
40
+
41
+
42
+ def run_deduplicate(args) -> int:
43
+ """Run the deduplicate command."""
44
+ from rdkit_cli.core.deduplicate import Deduplicator
45
+ from rdkit_cli.io import create_reader, create_writer
46
+ from rdkit_cli.progress.ninja import NinjaProgress
47
+
48
+ # Handle --list-keys
49
+ if args.list_keys:
50
+ print("Available deduplication keys:")
51
+ print(" smiles - Canonical SMILES (default)")
52
+ print(" inchi - InChI string")
53
+ print(" inchikey - InChIKey (27 character hash)")
54
+ print(" scaffold - Murcko scaffold SMILES")
55
+ return 0
56
+
57
+ input_path = Path(args.input)
58
+ if not input_path.exists():
59
+ print(f"Error: Input file not found: {input_path}", file=sys.stderr)
60
+ return 1
61
+
62
+ # Create reader
63
+ reader = create_reader(
64
+ input_path,
65
+ smiles_column=args.smiles_column,
66
+ name_column=args.name_column,
67
+ has_header=not args.no_header,
68
+ )
69
+
70
+ # Read all records with progress
71
+ if not args.quiet:
72
+ print("Reading molecules...", file=sys.stderr)
73
+
74
+ records = []
75
+ with reader:
76
+ total = len(reader)
77
+ progress = NinjaProgress(total=total, quiet=args.quiet)
78
+ progress.start()
79
+
80
+ for record in reader:
81
+ records.append(record)
82
+ progress.update(1)
83
+
84
+ progress.finish()
85
+
86
+ if not records:
87
+ print("Error: No molecules found in input file", file=sys.stderr)
88
+ return 1
89
+
90
+ if not args.quiet:
91
+ print(f"Deduplicating {len(records)} molecules by {args.by}...", file=sys.stderr)
92
+
93
+ # Create deduplicator
94
+ deduplicator = Deduplicator(
95
+ key_type=args.by,
96
+ keep=args.keep,
97
+ )
98
+
99
+ # Deduplicate
100
+ unique_records, n_duplicates = deduplicator.deduplicate(records)
101
+
102
+ # Write output
103
+ output_path = Path(args.output)
104
+ writer = create_writer(output_path)
105
+
106
+ with writer:
107
+ for record in unique_records:
108
+ row = {"smiles": record.smiles}
109
+ if record.name:
110
+ row["name"] = record.name
111
+ for key, value in record.metadata.items():
112
+ if key not in row and key != "smiles":
113
+ row[key] = value
114
+ writer.write_row(row)
115
+
116
+ if not args.quiet:
117
+ print(
118
+ f"Removed {n_duplicates} duplicates. "
119
+ f"Wrote {len(unique_records)} unique molecules to {output_path}",
120
+ file=sys.stderr,
121
+ )
122
+
123
+ return 0