rdkit-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. rdkit_cli/__init__.py +4 -0
  2. rdkit_cli/__main__.py +6 -0
  3. rdkit_cli/cli.py +162 -0
  4. rdkit_cli/commands/__init__.py +1 -0
  5. rdkit_cli/commands/conformers.py +220 -0
  6. rdkit_cli/commands/convert.py +162 -0
  7. rdkit_cli/commands/depict.py +311 -0
  8. rdkit_cli/commands/descriptors.py +251 -0
  9. rdkit_cli/commands/diversity.py +232 -0
  10. rdkit_cli/commands/enumerate.py +229 -0
  11. rdkit_cli/commands/filter.py +384 -0
  12. rdkit_cli/commands/fingerprints.py +179 -0
  13. rdkit_cli/commands/fragment.py +284 -0
  14. rdkit_cli/commands/mcs.py +162 -0
  15. rdkit_cli/commands/reactions.py +191 -0
  16. rdkit_cli/commands/scaffold.py +243 -0
  17. rdkit_cli/commands/similarity.py +359 -0
  18. rdkit_cli/commands/standardize.py +138 -0
  19. rdkit_cli/core/__init__.py +1 -0
  20. rdkit_cli/core/conformers.py +197 -0
  21. rdkit_cli/core/depict.py +241 -0
  22. rdkit_cli/core/descriptors.py +248 -0
  23. rdkit_cli/core/diversity.py +174 -0
  24. rdkit_cli/core/enumerate.py +190 -0
  25. rdkit_cli/core/filters.py +443 -0
  26. rdkit_cli/core/fingerprints.py +265 -0
  27. rdkit_cli/core/fragment.py +237 -0
  28. rdkit_cli/core/mcs.py +128 -0
  29. rdkit_cli/core/reactions.py +159 -0
  30. rdkit_cli/core/scaffold.py +174 -0
  31. rdkit_cli/core/similarity.py +206 -0
  32. rdkit_cli/core/standardizer.py +141 -0
  33. rdkit_cli/io/__init__.py +7 -0
  34. rdkit_cli/io/formats.py +109 -0
  35. rdkit_cli/io/readers.py +352 -0
  36. rdkit_cli/io/writers.py +275 -0
  37. rdkit_cli/parallel/__init__.py +5 -0
  38. rdkit_cli/parallel/batch.py +181 -0
  39. rdkit_cli/parallel/executor.py +180 -0
  40. rdkit_cli/progress/__init__.py +5 -0
  41. rdkit_cli/progress/ninja.py +195 -0
  42. rdkit_cli/utils/__init__.py +1 -0
  43. rdkit_cli-0.1.0.dist-info/METADATA +380 -0
  44. rdkit_cli-0.1.0.dist-info/RECORD +47 -0
  45. rdkit_cli-0.1.0.dist-info/WHEEL +4 -0
  46. rdkit_cli-0.1.0.dist-info/entry_points.txt +2 -0
  47. rdkit_cli-0.1.0.dist-info/licenses/LICENSE +190 -0
@@ -0,0 +1,352 @@
1
+ """File readers for various molecular file formats."""
2
+
3
+ import sys
4
+ from abc import ABC, abstractmethod
5
+ from pathlib import Path
6
+ from typing import Iterator, Optional, Any
7
+
8
+ import pandas as pd
9
+ from rdkit import Chem
10
+
11
+ from rdkit_cli.io.formats import FileFormat, FormatConfig, detect_format
12
+
13
+
14
class MoleculeRecord:
    """A molecule together with the metadata that was read alongside it.

    Instances only carry the five attributes listed in ``__slots__``.

    Args:
        mol: The parsed molecule, or ``None`` when parsing failed.
        smiles: SMILES text associated with the molecule.
        name: Molecule name or title.
        metadata: Additional key/value data for the record.  When omitted a
            fresh empty dict is created for this record.
        row_idx: Position of the record in its source; defaults to ``-1``.
    """

    __slots__ = ("mol", "smiles", "name", "metadata", "row_idx")

    def __init__(
        self,
        mol: Optional["Chem.Mol"],
        smiles: str = "",
        name: str = "",
        metadata: Optional[dict[str, Any]] = None,
        row_idx: int = -1,
    ):
        self.mol = mol
        self.smiles = smiles
        self.name = name
        # Test against None rather than truthiness so that an empty dict
        # supplied by the caller is kept (not swapped for a new object).
        self.metadata = {} if metadata is None else metadata
        self.row_idx = row_idx

    @property
    def is_valid(self) -> bool:
        """Return True when a molecule object is attached to this record."""
        return self.mol is not None

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(smiles={self.smiles!r}, name={self.name!r}, "
            f"row_idx={self.row_idx!r}, is_valid={self.is_valid!r})"
        )
37
+
38
+
39
class MoleculeReader(ABC):
    """Common interface implemented by every molecule file reader.

    Subclasses must provide iteration, a length and ``close()``.  The class
    also works as a context manager: entering returns the reader itself and
    leaving calls ``close()``.
    """

    @abstractmethod
    def __iter__(self) -> Iterator["MoleculeRecord"]:
        """Produce the file's contents as MoleculeRecord objects."""

    @abstractmethod
    def __len__(self) -> int:
        """Report how many molecules the file holds (used for progress)."""

    @abstractmethod
    def close(self):
        """Release whatever resources the reader holds open."""

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        # Always close, whether or not the with-block raised.
        self.close()
62
+
63
+
64
class CSVReader(MoleculeReader):
    """Read molecules from delimited text files (CSV/TSV).

    The file is streamed in chunks through ``pandas.read_csv``.  Every cell
    is read as a string (``dtype=str``, ``na_filter=False``), so empty cells
    arrive as ``""``.  Each row is yielded as a ``MoleculeRecord`` whose
    ``metadata`` is the whole row as a dict.

    Args:
        path: Location of the input file.
        smiles_column: Column holding the SMILES strings.  When the file has
            no header, the first column is given this name and the remaining
            ones are named ``col_1``, ``col_2``, ...
        name_column: Optional column holding molecule names.
        delimiter: Field separator handed to ``pandas.read_csv``.
        has_header: Whether the first line of the file is a header row.
    """

    # Number of rows pandas loads per chunk while iterating.
    _CHUNK_ROWS = 10000

    def __init__(
        self,
        path: Path | str,
        smiles_column: str = "smiles",
        name_column: Optional[str] = None,
        delimiter: str = ",",
        has_header: bool = True,
    ):
        self.path = Path(path)
        self.smiles_column = smiles_column
        self.name_column = name_column
        self.delimiter = delimiter
        self.has_header = has_header
        self._count: Optional[int] = None
        # Not used by this class; kept so the attribute set stays unchanged.
        self._df: Optional[pd.DataFrame] = None

    def __len__(self) -> int:
        """Return the number of physical lines, minus the header line.

        This is a line count, not a parse: a quoted field that contains a
        newline is counted as several lines.  The result is cached.
        """
        if self._count is None:
            with open(self.path, "rb") as f:
                line_count = sum(1 for _ in f)
            header_lines = 1 if self.has_header else 0
            # An empty file with has_header=True would give -1, and len()
            # raises ValueError for negative values, so clamp at zero.
            self._count = max(0, line_count - header_lines)
        return self._count

    def __iter__(self) -> Iterator[MoleculeRecord]:
        """Yield one MoleculeRecord per data row.

        Rows whose SMILES is empty or cannot be parsed are still yielded,
        with ``mol=None``; unparsable non-empty SMILES are reported on
        stderr.
        """
        header = 0 if self.has_header else None
        warned_missing_column = False

        # The chunked reader is a context manager; using it as one closes
        # the underlying file even when the consumer stops iterating early.
        with pd.read_csv(
            self.path,
            delimiter=self.delimiter,
            header=header,
            chunksize=self._CHUNK_ROWS,
            dtype=str,
            na_filter=False,
        ) as reader:
            for chunk in reader:
                if not self.has_header:
                    # Without a header the first column is taken as SMILES.
                    chunk.columns = [self.smiles_column] + [
                        f"col_{i}" for i in range(1, len(chunk.columns))
                    ]
                elif (
                    not warned_missing_column
                    and self.smiles_column not in chunk.columns
                ):
                    # Otherwise every record would silently come out empty.
                    print(
                        f"Warning: SMILES column '{self.smiles_column}' "
                        f"not found in {self.path}",
                        file=sys.stderr,
                    )
                    warned_missing_column = True

                # Plain dicts per row avoid building a Series for every row.
                rows = chunk.to_dict(orient="records")
                for idx, row in zip(chunk.index, rows):
                    smiles = str(row.get(self.smiles_column, ""))
                    name = (
                        str(row.get(self.name_column, ""))
                        if self.name_column
                        else ""
                    )

                    mol = None
                    if smiles:
                        try:
                            mol = Chem.MolFromSmiles(smiles)
                        except Exception:
                            # Best effort: an exception from the parser is
                            # treated like any other parse failure below.
                            mol = None
                        if mol is None:
                            print(
                                f"Warning: Failed to parse SMILES at row {idx}: {smiles[:50]}",
                                file=sys.stderr,
                            )

                    yield MoleculeRecord(
                        mol=mol,
                        smiles=smiles,
                        name=name,
                        metadata=row,
                        row_idx=int(idx),
                    )

    def close(self):
        """Nothing to release: files are opened and closed per operation."""
131
+
132
+
133
class SMIReader(MoleculeReader):
    """Read molecules from SMILES (``.smi``) files, one molecule per line.

    Each non-blank line is split once into a SMILES string and an optional
    name (the remainder of the line).

    Args:
        path: Location of the input file.
        has_header: Whether the first line should be skipped.
        delimiter: Separator between SMILES and name.  With the default
            single space, any run of whitespace (including tabs) separates
            the two, since SMILES strings contain no whitespace.
    """

    def __init__(
        self,
        path: Path | str,
        has_header: bool = False,
        delimiter: str = " ",
    ):
        self.path = Path(path)
        self.has_header = has_header
        self.delimiter = delimiter
        self._count: Optional[int] = None

    def _open(self):
        """Open the file as UTF-8 text, independent of the platform locale.

        Undecodable bytes are replaced rather than raising, so a stray byte
        in a name does not abort the whole read.
        """
        return open(self.path, "r", encoding="utf-8", errors="replace")

    def __len__(self) -> int:
        """Return the number of non-blank data lines (cached).

        Blank lines are excluded because ``__iter__`` skips them too.
        """
        if self._count is None:
            with self._open() as f:
                if self.has_header:
                    next(f, None)
                self._count = sum(1 for line in f if line.strip())
        return self._count

    def __iter__(self) -> Iterator[MoleculeRecord]:
        """Yield one MoleculeRecord per non-blank line.

        Lines whose SMILES cannot be parsed are still yielded with
        ``mol=None`` and reported on stderr.  ``row_idx`` is the zero-based
        position of the line after the optional header, blank lines
        included.
        """
        # Line number of the first data line, used in warnings only.
        first_line_no = 2 if self.has_header else 1

        with self._open() as f:
            if self.has_header:
                # A bare next(f) on an empty file would raise StopIteration,
                # which surfaces as RuntimeError inside a generator (PEP 479).
                next(f, None)

            for idx, raw_line in enumerate(f):
                line = raw_line.strip()
                if not line:
                    continue

                if self.delimiter == " ":
                    parts = line.split(None, 1)
                else:
                    parts = line.split(self.delimiter, 1)
                smiles = parts[0] if parts else ""
                name = parts[1].strip() if len(parts) > 1 else ""

                mol = None
                if smiles:
                    try:
                        mol = Chem.MolFromSmiles(smiles)
                    except Exception:
                        # Best effort: treated as a parse failure below.
                        mol = None
                    if mol is None:
                        print(
                            f"Warning: Failed to parse SMILES at line {idx + first_line_no}: {smiles[:50]}",
                            file=sys.stderr,
                        )

                yield MoleculeRecord(
                    mol=mol,
                    smiles=smiles,
                    name=name,
                    metadata={"smiles": smiles, "name": name},
                    row_idx=idx,
                )

    def close(self):
        """Nothing to release: files are opened and closed per operation."""
188
+
189
+
190
class SDFReader(MoleculeReader):
    """Read molecules from SDF files via ``Chem.SDMolSupplier``.

    Args:
        path: Location of the input file.
    """

    def __init__(self, path: Path | str):
        self.path = Path(path)
        self._count: Optional[int] = None

    def __len__(self) -> int:
        """Return the number of ``$$$$`` record terminators (cached)."""
        if self._count is None:
            with open(self.path, "rb") as f:
                self._count = sum(1 for line in f if line.strip() == b"$$$$")
        return self._count

    def __iter__(self) -> Iterator[MoleculeRecord]:
        """Yield one MoleculeRecord per entry produced by the supplier.

        Entries the supplier returns as ``None`` are yielded with
        ``mol=None``, empty SMILES/name/metadata, and a warning on stderr.
        For parsed entries the metadata holds each property as the string
        returned by ``GetProp``, the name comes from ``_Name`` when present,
        and the SMILES is generated with ``Chem.MolToSmiles``.
        """
        supplier = Chem.SDMolSupplier(str(self.path))

        for idx, mol in enumerate(supplier):
            if mol is None:
                print(f"Warning: Failed to parse molecule at index {idx}", file=sys.stderr)
                yield MoleculeRecord(
                    mol=None,
                    smiles="",
                    name="",
                    metadata={},
                    row_idx=idx,
                )
                continue

            # Only the property names are needed here; asking for them
            # directly avoids converting every value just to discard it.
            metadata = {prop: mol.GetProp(prop) for prop in mol.GetPropNames()}
            name = mol.GetProp("_Name") if mol.HasProp("_Name") else ""

            yield MoleculeRecord(
                mol=mol,
                smiles=Chem.MolToSmiles(mol),
                name=name,
                metadata=metadata,
                row_idx=idx,
            )

    def close(self):
        """Nothing to release: no handle is kept between operations."""
232
+
233
+
234
class ParquetReader(MoleculeReader):
    """Read molecules from Parquet files.

    The file is streamed in record batches with pyarrow; each row is yielded
    as a ``MoleculeRecord`` whose ``metadata`` is the whole row as a dict.

    Args:
        path: Location of the input file.
        smiles_column: Column holding the SMILES strings.
        name_column: Optional column holding molecule names.
    """

    # Number of rows requested per record batch while iterating.
    _BATCH_ROWS = 10000

    def __init__(
        self,
        path: Path | str,
        smiles_column: str = "smiles",
        name_column: Optional[str] = None,
    ):
        self.path = Path(path)
        self.smiles_column = smiles_column
        self.name_column = name_column
        self._count: Optional[int] = None

    @staticmethod
    def _cell_to_text(value: Any) -> str:
        """Render a cell as text, mapping null/NaN cells to ``""``.

        A plain ``str(value)`` would turn a null cell into the literal text
        ``"None"`` or ``"nan"``, which would then be parsed as a SMILES.
        """
        if value is None:
            return ""
        # is_scalar guards pd.isna, which returns an array for list cells.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return str(value)

    def __len__(self) -> int:
        """Return the row count recorded in the Parquet metadata (cached)."""
        if self._count is None:
            import pyarrow.parquet as pq

            self._count = pq.read_metadata(self.path).num_rows
        return self._count

    def __iter__(self) -> Iterator[MoleculeRecord]:
        """Yield one MoleculeRecord per row.

        Rows whose SMILES is missing, null or unparsable are still yielded
        with ``mol=None``; unparsable non-empty SMILES are reported on
        stderr.  ``row_idx`` counts rows from zero across all batches.
        """
        import pyarrow.parquet as pq

        parquet_file = pq.ParquetFile(self.path)

        row_idx = 0
        for batch in parquet_file.iter_batches(batch_size=self._BATCH_ROWS):
            # Plain dicts per row avoid building a Series for every row.
            for row in batch.to_pandas().to_dict(orient="records"):
                smiles = self._cell_to_text(row.get(self.smiles_column))
                name = (
                    self._cell_to_text(row.get(self.name_column))
                    if self.name_column
                    else ""
                )

                mol = None
                if smiles:
                    try:
                        mol = Chem.MolFromSmiles(smiles)
                    except Exception:
                        # Best effort: treated as a parse failure below.
                        mol = None
                    if mol is None:
                        print(
                            f"Warning: Failed to parse SMILES at row {row_idx}: {smiles[:50]}",
                            file=sys.stderr,
                        )

                yield MoleculeRecord(
                    mol=mol,
                    smiles=smiles,
                    name=name,
                    metadata=row,
                    row_idx=row_idx,
                )
                row_idx += 1

    def close(self):
        """Nothing to release: no handle is kept between operations."""
289
+
290
+
291
def create_reader(
    path: str | Path,
    format_config: Optional[FormatConfig] = None,
    smiles_column: str = "smiles",
    name_column: Optional[str] = None,
    has_header: Optional[bool] = None,
) -> MoleculeReader:
    """
    Build the reader that matches the input file's format.

    Without ``format_config`` the format is detected from ``path``.  With
    it, the format, SMILES column and name column all come from the config
    (replacing the corresponding arguments), and its ``has_header`` is used
    unless ``has_header`` was passed explicitly.

    Args:
        path: Path to input file
        format_config: Optional format configuration
        smiles_column: Name of SMILES column (for CSV/Parquet)
        name_column: Name of name column
        has_header: Override header detection; when left as ``None`` the
            CSV/TSV readers assume a header and the SMI reader assumes none

    Returns:
        Appropriate MoleculeReader instance

    Raises:
        ValueError: If the format is not one of CSV, TSV, SMI, SDF, Parquet.
    """
    path = Path(path)

    if format_config is not None:
        file_format = format_config.format
        smiles_column = format_config.smiles_column
        name_column = format_config.name_column
        if has_header is None:
            has_header = format_config.has_header
    else:
        file_format = detect_format(path)

    # CSV and TSV share one reader and differ only in the separator.
    if file_format == FileFormat.CSV or file_format == FileFormat.TSV:
        separator = "," if file_format == FileFormat.CSV else "\t"
        return CSVReader(
            path,
            smiles_column=smiles_column,
            name_column=name_column,
            delimiter=separator,
            has_header=True if has_header is None else has_header,
        )

    if file_format == FileFormat.SMI:
        return SMIReader(
            path,
            has_header=False if has_header is None else has_header,
        )

    if file_format == FileFormat.SDF:
        return SDFReader(path)

    if file_format == FileFormat.PARQUET:
        return ParquetReader(
            path,
            smiles_column=smiles_column,
            name_column=name_column,
        )

    raise ValueError(f"Unsupported format: {file_format}")
@@ -0,0 +1,275 @@
1
+ """File writers for various molecular file formats."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from pathlib import Path
5
+ from typing import Any, Optional
6
+
7
+ import pandas as pd
8
+ from rdkit import Chem
9
+
10
+ from rdkit_cli.io.formats import FileFormat, detect_format
11
+
12
+
13
class MoleculeWriter(ABC):
    """Common interface implemented by every molecule file writer.

    Subclasses must provide ``write_row``, ``write_batch`` and ``close``.
    The class also works as a context manager: entering returns the writer
    itself and leaving calls ``close()``.
    """

    @abstractmethod
    def write_row(self, data: dict[str, Any]):
        """Emit one row of data."""

    @abstractmethod
    def write_batch(self, data: list[dict[str, Any]]):
        """Emit several rows of results at once."""

    @abstractmethod
    def close(self):
        """Finish writing and close the output."""

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        # Always close, whether or not the with-block raised.
        self.close()
36
+
37
+
38
class CSVWriter(MoleculeWriter):
    """Write result rows to a delimited text file (CSV/TSV).

    The output file is opened (and truncated) on construction.  The header
    is written together with the first non-empty batch; its columns are
    ``columns`` when given, otherwise the keys of the first row.  Keys that
    are not part of the header are ignored, and missing keys are written as
    empty cells.

    Args:
        path: Output file path.
        delimiter: Field separator.
        columns: Optional explicit column order.
    """

    def __init__(
        self,
        path: Path | str,
        delimiter: str = ",",
        columns: Optional[list[str]] = None,
    ):
        self.path = Path(path)
        self.delimiter = delimiter
        self.columns = columns
        # newline="" keeps the "\n" terminators written below untranslated.
        self._file = open(path, "w", newline="", encoding="utf-8")
        self._header_written = False
        self._column_order: Optional[list[str]] = None

    def _format_cell(self, value: Any) -> str:
        """Turn one value into its escaped on-disk text.

        ``None`` and float NaN become an empty cell.  Text containing the
        delimiter, a double quote or a line break is wrapped in double
        quotes, with embedded quotes doubled.
        """
        if value is None:
            return ""
        if isinstance(value, float) and pd.isna(value):
            return ""

        text = str(value)
        needs_quoting = (
            self.delimiter in text
            or '"' in text
            or "\n" in text
            or "\r" in text  # a bare carriage return also ends a record
        )
        if needs_quoting:
            text = '"' + text.replace('"', '""') + '"'
        return text

    def _format_line(self, cells) -> str:
        """Join already-ordered values into one terminated output line."""
        return self.delimiter.join(self._format_cell(c) for c in cells) + "\n"

    def write_row(self, data: dict[str, Any]):
        """Write a single row."""
        self.write_batch([data])

    def write_batch(self, data: list[dict[str, Any]]):
        """Write a batch of results; an empty batch writes nothing."""
        if not data:
            return

        # The column order is fixed once, from the first batch seen.
        if self._column_order is None:
            if self.columns:
                self._column_order = self.columns
            else:
                self._column_order = list(data[0].keys())

        lines = []
        if not self._header_written:
            # Header cells go through the same escaping as data cells.
            lines.append(self._format_line(self._column_order))
            self._header_written = True

        for row in data:
            lines.append(
                self._format_line(row.get(col, "") for col in self._column_order)
            )
        self._file.writelines(lines)

    def close(self):
        """Close the file; calling it again is a no-op."""
        if self._file:
            self._file.close()
            self._file = None
101
+
102
+
103
class SMIWriter(MoleculeWriter):
    """Write molecules to a SMILES file, one ``SMILES[ name]`` per line.

    The output file is opened (and truncated) on construction.

    Args:
        path: Output file path.
        smiles_column: Key under which each row stores its SMILES.
        name_column: Key under which each row stores its name; a falsy
            value disables names.
    """

    def __init__(
        self,
        path: Path | str,
        smiles_column: str = "smiles",
        name_column: Optional[str] = "name",
    ):
        self.path = Path(path)
        self.smiles_column = smiles_column
        self.name_column = name_column
        self._file = open(path, "w", encoding="utf-8")

    def write_row(self, data: dict[str, Any]):
        """Write one molecule; rows without a SMILES are skipped."""
        smiles = data.get(self.smiles_column, "")
        name = data.get(self.name_column, "") if self.name_column else ""

        if not smiles:
            return

        # The name is appended only when there is one.
        line = f"{smiles} {name}\n" if name else f"{smiles}\n"
        self._file.write(line)

    def write_batch(self, data: list[dict[str, Any]]):
        """Write every row of the batch, in order."""
        for entry in data:
            self.write_row(entry)

    def close(self):
        """Close the file; calling it again is a no-op."""
        if not self._file:
            return
        self._file.close()
        self._file = None
138
+
139
+
140
class SDFWriter(MoleculeWriter):
    """Write molecules to an SDF file via ``Chem.SDWriter``.

    Args:
        path: Output file path.
    """

    def __init__(self, path: Path | str):
        self.path = Path(path)
        self._writer = Chem.SDWriter(str(path))

    def write_row(self, data: dict[str, Any]):
        """Write one molecule with the row's other values as properties.

        The molecule is taken from ``data["mol"]``; when that is missing it
        is built from ``data["smiles"]``.  Rows that yield no molecule are
        skipped.  Every other non-``None`` value is stored as a string
        property; a property that cannot be set is left out.
        """
        mol = data.get("mol")

        if mol is None:
            # Try to create from SMILES
            smiles = data.get("smiles", "")
            if smiles:
                mol = Chem.MolFromSmiles(smiles)
        else:
            # Work on a copy so the properties set below do not leak onto
            # the molecule object the caller still holds.
            mol = Chem.Mol(mol)

        if mol is None:
            return

        for key, value in data.items():
            if key in ("mol", "smiles") or value is None:
                continue
            try:
                mol.SetProp(str(key), str(value))
            except Exception:
                # Best effort: skip this property, still write the molecule.
                pass
        self._writer.write(mol)

    def write_batch(self, data: list[dict[str, Any]]):
        """Write every row of the batch, in order."""
        for row in data:
            self.write_row(row)

    def close(self):
        """Close the underlying writer; calling it again is a no-op."""
        if self._writer:
            self._writer.close()
            self._writer = None
177
+
178
+
179
class ParquetWriter(MoleculeWriter):
    """Write result rows to a Parquet file.

    Rows are buffered in memory and written out in groups of
    ``_batch_size``.  The file is created on the first flush, replacing any
    file already at ``path``, and each later flush appends to the same open
    file.  If no row is ever written, no file is created.

    Args:
        path: Output file path.
        columns: Optional column order; listed columns come first, any
            other columns follow.
    """

    def __init__(
        self,
        path: Path | str,
        columns: Optional[list[str]] = None,
    ):
        self.path = Path(path)
        self.columns = columns
        self._batches: list[dict[str, Any]] = []
        self._batch_size = 100000  # Write in batches of 100k
        # pyarrow writer and schema, created lazily by the first flush.
        self._writer = None
        self._schema = None
        self._closed = False

    def write_row(self, data: dict[str, Any]):
        """Buffer a single row, flushing when the buffer is full.

        Raises:
            ValueError: If the writer has already been closed.
        """
        if self._closed:
            # Reopening would truncate what has been written so far.
            raise ValueError("ParquetWriter is closed")

        # Remove mol objects (not serializable)
        self._batches.append({k: v for k, v in data.items() if k != "mol"})

        if len(self._batches) >= self._batch_size:
            self._flush()

    def write_batch(self, data: list[dict[str, Any]]):
        """Buffer every row of the batch, in order."""
        for row in data:
            self.write_row(row)

    def _flush(self):
        """Write the buffered rows to the file and empty the buffer."""
        if not self._batches:
            return

        import pyarrow as pa
        import pyarrow.parquet as pq

        df = pd.DataFrame(self._batches)

        # Reorder columns if specified
        if self.columns:
            cols = [c for c in self.columns if c in df.columns]
            extra = [c for c in df.columns if c not in self.columns]
            df = df[cols + extra]

        if self._writer is None:
            # First flush: the schema of this batch defines the file.
            table = pa.Table.from_pandas(df, preserve_index=False)
            self._schema = table.schema
            self._writer = pq.ParquetWriter(str(self.path), self._schema)
        else:
            # Later batches are converted using the first batch's schema.
            table = pa.Table.from_pandas(
                df, schema=self._schema, preserve_index=False
            )

        self._writer.write_table(table)
        self._batches = []

    def close(self):
        """Flush remaining rows and finalize the file; repeat calls are no-ops."""
        if self._closed:
            return
        try:
            self._flush()
        finally:
            if self._writer is not None:
                self._writer.close()
                self._writer = None
            self._closed = True
239
+
240
+
241
def create_writer(
    path: str | Path,
    format_override: Optional[FileFormat] = None,
    columns: Optional[list[str]] = None,
    smiles_column: str = "smiles",
    name_column: Optional[str] = "name",
) -> MoleculeWriter:
    """
    Build the writer that matches the output file's format.

    The format is ``format_override`` when one is given, otherwise it is
    detected from ``path``.

    Args:
        path: Output file path
        format_override: Override auto-detected format
        columns: Column order for output (CSV, TSV and Parquet writers)
        smiles_column: Name of SMILES column (for SMI files)
        name_column: Name of name column (for SMI files)

    Returns:
        Appropriate MoleculeWriter instance

    Raises:
        ValueError: If the format is not one of CSV, TSV, SMI, SDF, Parquet.
    """
    path = Path(path)
    file_format = format_override if format_override else detect_format(path)

    # CSV and TSV share one writer and differ only in the separator.
    if file_format == FileFormat.CSV:
        return CSVWriter(path, delimiter=",", columns=columns)
    if file_format == FileFormat.TSV:
        return CSVWriter(path, delimiter="\t", columns=columns)

    if file_format == FileFormat.SMI:
        return SMIWriter(
            path,
            smiles_column=smiles_column,
            name_column=name_column,
        )

    if file_format == FileFormat.SDF:
        return SDFWriter(path)

    if file_format == FileFormat.PARQUET:
        return ParquetWriter(path, columns=columns)

    raise ValueError(f"Unsupported format: {file_format}")
@@ -0,0 +1,5 @@
1
+ """Parallel processing utilities."""
2
+
3
+ from rdkit_cli.parallel.executor import ParallelExecutor, parallel_map
4
+
5
+ __all__ = ["ParallelExecutor", "parallel_map"]