rdkit-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rdkit_cli/__init__.py +4 -0
- rdkit_cli/__main__.py +6 -0
- rdkit_cli/cli.py +162 -0
- rdkit_cli/commands/__init__.py +1 -0
- rdkit_cli/commands/conformers.py +220 -0
- rdkit_cli/commands/convert.py +162 -0
- rdkit_cli/commands/depict.py +311 -0
- rdkit_cli/commands/descriptors.py +251 -0
- rdkit_cli/commands/diversity.py +232 -0
- rdkit_cli/commands/enumerate.py +229 -0
- rdkit_cli/commands/filter.py +384 -0
- rdkit_cli/commands/fingerprints.py +179 -0
- rdkit_cli/commands/fragment.py +284 -0
- rdkit_cli/commands/mcs.py +162 -0
- rdkit_cli/commands/reactions.py +191 -0
- rdkit_cli/commands/scaffold.py +243 -0
- rdkit_cli/commands/similarity.py +359 -0
- rdkit_cli/commands/standardize.py +138 -0
- rdkit_cli/core/__init__.py +1 -0
- rdkit_cli/core/conformers.py +197 -0
- rdkit_cli/core/depict.py +241 -0
- rdkit_cli/core/descriptors.py +248 -0
- rdkit_cli/core/diversity.py +174 -0
- rdkit_cli/core/enumerate.py +190 -0
- rdkit_cli/core/filters.py +443 -0
- rdkit_cli/core/fingerprints.py +265 -0
- rdkit_cli/core/fragment.py +237 -0
- rdkit_cli/core/mcs.py +128 -0
- rdkit_cli/core/reactions.py +159 -0
- rdkit_cli/core/scaffold.py +174 -0
- rdkit_cli/core/similarity.py +206 -0
- rdkit_cli/core/standardizer.py +141 -0
- rdkit_cli/io/__init__.py +7 -0
- rdkit_cli/io/formats.py +109 -0
- rdkit_cli/io/readers.py +352 -0
- rdkit_cli/io/writers.py +275 -0
- rdkit_cli/parallel/__init__.py +5 -0
- rdkit_cli/parallel/batch.py +181 -0
- rdkit_cli/parallel/executor.py +180 -0
- rdkit_cli/progress/__init__.py +5 -0
- rdkit_cli/progress/ninja.py +195 -0
- rdkit_cli/utils/__init__.py +1 -0
- rdkit_cli-0.1.0.dist-info/METADATA +380 -0
- rdkit_cli-0.1.0.dist-info/RECORD +47 -0
- rdkit_cli-0.1.0.dist-info/WHEEL +4 -0
- rdkit_cli-0.1.0.dist-info/entry_points.txt +2 -0
- rdkit_cli-0.1.0.dist-info/licenses/LICENSE +190 -0
rdkit_cli/io/readers.py
ADDED
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
"""File readers for various molecular file formats."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from abc import ABC, abstractmethod
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Iterator, Optional, Any
|
|
7
|
+
|
|
8
|
+
import pandas as pd
|
|
9
|
+
from rdkit import Chem
|
|
10
|
+
|
|
11
|
+
from rdkit_cli.io.formats import FileFormat, FormatConfig, detect_format
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class MoleculeRecord:
    """A molecule together with the metadata it was read with.

    The class declares ``__slots__``, so only the five attributes below can
    be set on an instance.

    Attributes:
        mol: The parsed molecule, or ``None`` when parsing failed.
        smiles: SMILES string associated with the record.
        name: Molecule name (empty string when not provided).
        metadata: Additional per-record fields.
        row_idx: Index assigned by the reader; ``-1`` when not provided.
    """

    __slots__ = ("mol", "smiles", "name", "metadata", "row_idx")

    def __init__(
        self,
        mol: Optional["Chem.Mol"],
        smiles: str = "",
        name: str = "",
        metadata: Optional[dict[str, Any]] = None,
        row_idx: int = -1,
    ):
        """Store the molecule and its metadata.

        Args:
            mol: Parsed molecule, or ``None`` if parsing failed.
            smiles: SMILES string for the record.
            name: Molecule name.
            metadata: Extra fields; a fresh dict is created when ``None``.
            row_idx: Index of the record in its source.
        """
        self.mol = mol
        self.smiles = smiles
        self.name = name
        # Compare against None (not truthiness) so that an empty dict passed
        # by the caller is kept as-is instead of being replaced by a new one.
        self.metadata = metadata if metadata is not None else {}
        self.row_idx = row_idx

    @property
    def is_valid(self) -> bool:
        """Return ``True`` when a molecule object is present."""
        return self.mol is not None

    def __repr__(self) -> str:
        """Return a short debugging representation of the record."""
        return (
            f"{type(self).__name__}(smiles={self.smiles!r}, name={self.name!r}, "
            f"row_idx={self.row_idx!r}, is_valid={self.is_valid})"
        )
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class MoleculeReader(ABC):
    """Common interface for molecule file readers.

    Concrete readers must implement iteration, ``len()`` and ``close()``.
    Instances can be used as context managers; leaving the ``with`` block
    calls ``close()``.
    """

    @abstractmethod
    def __iter__(self) -> Iterator["MoleculeRecord"]:
        """Produce the file's molecules as MoleculeRecord objects."""
        ...

    @abstractmethod
    def __len__(self) -> int:
        """Report how many molecules the file holds (used for progress)."""
        ...

    @abstractmethod
    def close(self):
        """Release whatever resources the reader holds open."""
        ...

    def __enter__(self):
        # The reader itself is the object bound by ``with ... as``.
        return self

    def __exit__(self, *exc_info):
        # Exception details are ignored; the reader is always closed.
        self.close()
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class CSVReader(MoleculeReader):
    """Read molecules from delimited text (CSV/TSV) files.

    The file is read with pandas in chunks of 10000 rows, with every column
    loaded as ``str`` and NA detection disabled, so empty cells stay empty
    strings.
    """

    def __init__(
        self,
        path: Path | str,
        smiles_column: str = "smiles",
        name_column: Optional[str] = None,
        delimiter: str = ",",
        has_header: bool = True,
    ):
        """Configure the reader; the file is not opened here.

        Args:
            path: Input file path.
            smiles_column: Column holding the SMILES strings. When
                ``has_header`` is false, the first column is given this name.
            name_column: Optional column holding molecule names.
            delimiter: Field separator.
            has_header: Whether the first line holds column names.
        """
        self.path = Path(path)
        self.smiles_column = smiles_column
        self.name_column = name_column
        self.delimiter = delimiter
        self.has_header = has_header
        self._count: Optional[int] = None
        # Not used by this class; kept so the attribute remains available.
        self._df: Optional[pd.DataFrame] = None

    def __len__(self) -> int:
        """Return the number of data lines (physical lines minus the header).

        The count is computed once and cached.
        """
        if self._count is None:
            with open(self.path, "rb") as f:
                line_count = sum(1 for _ in f)
            # Clamp at zero: an empty file with has_header=True would give
            # -1 otherwise, and len() raises on negative values.
            self._count = max(line_count - (1 if self.has_header else 0), 0)
        return self._count

    def __iter__(self) -> Iterator[MoleculeRecord]:
        """Yield one MoleculeRecord per row.

        Rows whose SMILES is empty or cannot be parsed are still yielded,
        with ``mol=None``. Warnings are printed to stderr for non-empty
        SMILES that fail to parse, and once if the SMILES column is missing.
        """
        header = 0 if self.has_header else None
        warned_missing_column = False

        # Read in chunks for memory efficiency
        for chunk in pd.read_csv(
            self.path,
            delimiter=self.delimiter,
            header=header,
            chunksize=10000,
            dtype=str,
            na_filter=False,
        ):
            if not self.has_header:
                # Without a header the first column is assumed to be SMILES;
                # the remaining columns get positional names.
                chunk.columns = [self.smiles_column] + [
                    f"col_{i}" for i in range(1, len(chunk.columns))
                ]

            if not warned_missing_column and self.smiles_column not in chunk.columns:
                # Without this, a wrong column name produces only empty
                # records and no diagnostic at all.
                print(
                    f"Warning: SMILES column '{self.smiles_column}' not found in {self.path}",
                    file=sys.stderr,
                )
                warned_missing_column = True

            # Convert the chunk to plain dicts once instead of building a
            # Series per row with iterrows().
            for idx, row in zip(chunk.index, chunk.to_dict("records")):
                smiles = str(row.get(self.smiles_column, ""))
                name = str(row.get(self.name_column, "")) if self.name_column else ""

                mol = None
                if smiles:
                    try:
                        mol = Chem.MolFromSmiles(smiles)
                    except Exception:
                        # Best effort: a failing parse leaves mol as None and
                        # is reported by the warning below.
                        pass

                if mol is None and smiles:
                    print(
                        f"Warning: Failed to parse SMILES at row {idx}: {smiles[:50]}",
                        file=sys.stderr,
                    )

                yield MoleculeRecord(
                    mol=mol,
                    smiles=smiles,
                    name=name,
                    metadata=row,
                    row_idx=int(idx),
                )

    def close(self):
        """No-op: this reader keeps no file handle open between calls."""
        pass
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class SMIReader(MoleculeReader):
    """Read molecules from SMILES (.smi) files.

    Each non-blank line is split once into a SMILES string and an optional
    name, which is the remainder of the line.
    """

    def __init__(
        self,
        path: Path | str,
        has_header: bool = False,
        delimiter: str = " ",
    ):
        """Configure the reader; the file is not opened here.

        Args:
            path: Input file path.
            has_header: Whether the first line is a header to skip.
            delimiter: Separator between SMILES and name. The default
                single space means "any run of whitespace", so tab-separated
                files are split correctly too.
        """
        self.path = Path(path)
        self.has_header = has_header
        self.delimiter = delimiter
        self._count: Optional[int] = None

    def __len__(self) -> int:
        """Return the number of non-blank data lines (cached after first call).

        Blank lines are excluded because ``__iter__`` skips them.
        """
        if self._count is None:
            with open(self.path, "r", encoding="utf-8") as f:
                if self.has_header:
                    # Default avoids StopIteration on an empty file.
                    next(f, None)
                self._count = sum(1 for line in f if line.strip())
        return self._count

    def __iter__(self) -> Iterator[MoleculeRecord]:
        """Yield one MoleculeRecord per non-blank line.

        Lines whose SMILES cannot be parsed are still yielded with
        ``mol=None`` and a warning on stderr. ``row_idx`` is the zero-based
        position among the lines after the header, blank lines included.
        """
        # UTF-8 matches what the writers in this package produce.
        with open(self.path, "r", encoding="utf-8") as f:
            header_lines = 0
            if self.has_header:
                # next(f) would raise StopIteration on an empty file, which
                # surfaces as RuntimeError inside a generator.
                next(f, None)
                header_lines = 1

            # sep=None splits on any whitespace run (spaces or tabs).
            separator = None if self.delimiter == " " else self.delimiter

            for idx, line in enumerate(f):
                line = line.strip()
                if not line:
                    continue

                parts = line.split(separator, 1)
                smiles = parts[0] if parts else ""
                name = parts[1].strip() if len(parts) > 1 else ""

                mol = None
                if smiles:
                    try:
                        mol = Chem.MolFromSmiles(smiles)
                    except Exception:
                        # Best effort: a failing parse leaves mol as None and
                        # is reported by the warning below.
                        pass

                if mol is None and smiles:
                    # Report the physical line number, header included.
                    line_number = idx + 1 + header_lines
                    print(
                        f"Warning: Failed to parse SMILES at line {line_number}: {smiles[:50]}",
                        file=sys.stderr,
                    )

                yield MoleculeRecord(
                    mol=mol,
                    smiles=smiles,
                    name=name,
                    metadata={"smiles": smiles, "name": name},
                    row_idx=idx,
                )

    def close(self):
        """No-op: the file is opened and closed inside ``__iter__``."""
        pass
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
class SDFReader(MoleculeReader):
    """Reader that pulls molecules out of an SDF file via ``Chem.SDMolSupplier``."""

    def __init__(self, path: Path | str):
        self.path = Path(path)
        # Filled in lazily by __len__.
        self._count: Optional[int] = None

    def __len__(self) -> int:
        """Count the lines consisting of ``$$$$``; the result is cached."""
        if self._count is not None:
            return self._count

        total = 0
        with open(self.path, "rb") as handle:
            for raw_line in handle:
                if raw_line.strip() == b"$$$$":
                    total += 1
        self._count = total
        return total

    def __iter__(self) -> Iterator[MoleculeRecord]:
        """Yield a MoleculeRecord for every supplier entry.

        Entries the supplier returns as ``None`` are yielded with empty
        SMILES, name and metadata after a warning is printed to stderr.
        """
        for position, parsed in enumerate(Chem.SDMolSupplier(str(self.path))):
            if parsed is None:
                print(f"Warning: Failed to parse molecule at index {position}", file=sys.stderr)
                properties = {}
                title = ""
                smiles_text = ""
            else:
                # Only the keys of GetPropsAsDict() are used; each value is
                # re-read through GetProp().
                properties = {key: parsed.GetProp(key) for key in parsed.GetPropsAsDict()}
                title = parsed.GetProp("_Name") if parsed.HasProp("_Name") else ""
                smiles_text = Chem.MolToSmiles(parsed)

            yield MoleculeRecord(
                mol=parsed,
                smiles=smiles_text,
                name=title,
                metadata=properties,
                row_idx=position,
            )

    def close(self):
        """Nothing to release; the supplier lives only inside ``__iter__``."""
        pass
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
class ParquetReader(MoleculeReader):
    """Read molecules from Parquet files.

    The file is read in record batches of 10000 rows. ``pyarrow`` is
    imported lazily inside the methods that need it.
    """

    def __init__(
        self,
        path: Path | str,
        smiles_column: str = "smiles",
        name_column: Optional[str] = None,
    ):
        """Configure the reader; the file is not opened here.

        Args:
            path: Input file path.
            smiles_column: Column holding the SMILES strings.
            name_column: Optional column holding molecule names.
        """
        self.path = Path(path)
        self.smiles_column = smiles_column
        self.name_column = name_column
        self._count: Optional[int] = None

    def __len__(self) -> int:
        """Return the row count from the Parquet metadata (cached)."""
        if self._count is None:
            import pyarrow.parquet as pq
            self._count = pq.read_metadata(self.path).num_rows
        return self._count

    @staticmethod
    def _as_text(value: Any) -> str:
        """Convert a cell to ``str``, mapping null values to ``""``.

        A plain ``str(value)`` would turn nulls into the literal strings
        ``"None"``, ``"nan"`` or ``"<NA>"``, which would then be parsed as
        SMILES.
        """
        if value is None or value is pd.NA:
            return ""
        if isinstance(value, float) and value != value:  # NaN
            return ""
        return str(value)

    def __iter__(self) -> Iterator[MoleculeRecord]:
        """Yield one MoleculeRecord per row.

        Rows whose SMILES is null, empty or unparsable are still yielded,
        with ``mol=None``. Warnings are printed to stderr for non-empty
        SMILES that fail to parse, and once if the SMILES column is missing.
        """
        import pyarrow.parquet as pq

        # Read in batches for memory efficiency
        parquet_file = pq.ParquetFile(self.path)

        if self.smiles_column not in parquet_file.schema_arrow.names:
            # Without this, a wrong column name produces only empty records
            # and no diagnostic at all.
            print(
                f"Warning: SMILES column '{self.smiles_column}' not found in {self.path}",
                file=sys.stderr,
            )

        row_idx = 0
        for batch in parquet_file.iter_batches(batch_size=10000):
            # to_dict("records") keeps each column's own value type;
            # iterrows() builds one Series per row and may upcast values.
            for row in batch.to_pandas().to_dict("records"):
                smiles = self._as_text(row.get(self.smiles_column, ""))
                name = self._as_text(row.get(self.name_column, "")) if self.name_column else ""

                mol = None
                if smiles:
                    try:
                        mol = Chem.MolFromSmiles(smiles)
                    except Exception:
                        # Best effort: a failing parse leaves mol as None and
                        # is reported by the warning below.
                        pass

                if mol is None and smiles:
                    print(
                        f"Warning: Failed to parse SMILES at row {row_idx}: {smiles[:50]}",
                        file=sys.stderr,
                    )

                yield MoleculeRecord(
                    mol=mol,
                    smiles=smiles,
                    name=name,
                    metadata=row,
                    row_idx=row_idx,
                )
                row_idx += 1

    def close(self):
        """No-op: this reader keeps no handle as instance state."""
        pass
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def create_reader(
    path: str | Path,
    format_config: Optional[FormatConfig] = None,
    smiles_column: str = "smiles",
    name_column: Optional[str] = None,
    has_header: Optional[bool] = None,
) -> MoleculeReader:
    """
    Build the reader that matches the input file's format.

    When ``format_config`` is supplied, its ``format``, ``smiles_column`` and
    ``name_column`` take precedence over the corresponding arguments, and its
    ``has_header`` is used only if ``has_header`` was not passed explicitly.
    Without it, the format comes from ``detect_format(path)``.

    Args:
        path: Path to input file
        format_config: Optional format configuration
        smiles_column: Name of SMILES column (for CSV/Parquet)
        name_column: Name of name column
        has_header: Override header detection; when still unset, CSV/TSV
            default to ``True`` and SMI defaults to ``False``

    Returns:
        Appropriate MoleculeReader instance

    Raises:
        ValueError: If the format is not one of CSV, TSV, SMI, SDF, PARQUET.
    """
    path = Path(path)

    if format_config is not None:
        file_format = format_config.format
        smiles_column = format_config.smiles_column
        name_column = format_config.name_column
        if has_header is None:
            has_header = format_config.has_header
    else:
        file_format = detect_format(path)

    # CSV and TSV share one reader and differ only in the separator.
    if file_format == FileFormat.CSV or file_format == FileFormat.TSV:
        separator = "," if file_format == FileFormat.CSV else "\t"
        header_flag = True if has_header is None else has_header
        return CSVReader(
            path,
            smiles_column=smiles_column,
            name_column=name_column,
            delimiter=separator,
            has_header=header_flag,
        )

    if file_format == FileFormat.SMI:
        header_flag = False if has_header is None else has_header
        return SMIReader(path, has_header=header_flag)

    if file_format == FileFormat.SDF:
        return SDFReader(path)

    if file_format == FileFormat.PARQUET:
        return ParquetReader(
            path,
            smiles_column=smiles_column,
            name_column=name_column,
        )

    raise ValueError(f"Unsupported format: {file_format}")
|
rdkit_cli/io/writers.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
"""File writers for various molecular file formats."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Any, Optional
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
from rdkit import Chem
|
|
9
|
+
|
|
10
|
+
from rdkit_cli.io.formats import FileFormat, detect_format
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class MoleculeWriter(ABC):
    """Common interface for molecule file writers.

    Concrete writers must implement ``write_row``, ``write_batch`` and
    ``close``. Instances can be used as context managers; leaving the
    ``with`` block calls ``close()``.
    """

    @abstractmethod
    def write_row(self, data: dict[str, Any]):
        """Emit one record given as a mapping of field name to value."""
        ...

    @abstractmethod
    def write_batch(self, data: list[dict[str, Any]]):
        """Emit several records at once."""
        ...

    @abstractmethod
    def close(self):
        """Finish writing and release the output."""
        ...

    def __enter__(self):
        # The writer itself is the object bound by ``with ... as``.
        return self

    def __exit__(self, *exc_info):
        # Exception details are ignored; the writer is always closed.
        self.close()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class CSVWriter(MoleculeWriter):
    """Write results to CSV/TSV files.

    The output file is opened (and truncated) in the constructor. The header
    line is written with the first non-empty batch; its columns come from
    ``columns`` if given, otherwise from the keys of the first row.
    """

    def __init__(
        self,
        path: Path | str,
        delimiter: str = ",",
        columns: Optional[list[str]] = None,
    ):
        """Open the output file for writing as UTF-8.

        Args:
            path: Output file path.
            delimiter: Field separator.
            columns: Optional explicit column order; row keys not listed
                here are not written.
        """
        self.path = Path(path)
        self.delimiter = delimiter
        self.columns = columns
        # newline="" so the "\n" written below is not translated.
        self._file = open(self.path, "w", newline="", encoding="utf-8")
        self._header_written = False
        self._column_order: Optional[list[str]] = None

    def _format_field(self, value: Any) -> str:
        """Render one cell: null/NaN become empty, special characters are quoted."""
        if value is None:
            return ""
        if isinstance(value, float) and pd.isna(value):
            return ""
        text = str(value)
        # Quote when the text contains the delimiter, a quote or a line
        # break ("\r" included); embedded quotes are doubled.
        if self.delimiter in text or '"' in text or "\n" in text or "\r" in text:
            text = '"' + text.replace('"', '""') + '"'
        return text

    def _format_line(self, values) -> str:
        """Join already-ordered cell values into one output line."""
        return self.delimiter.join(self._format_field(v) for v in values) + "\n"

    def write_row(self, data: dict[str, Any]):
        """Write a single row."""
        self.write_batch([data])

    def write_batch(self, data: list[dict[str, Any]]):
        """Write a batch of rows; an empty batch writes nothing.

        Keys missing from a row are written as empty cells.
        """
        if not data:
            return

        # Determine column order from first row if not set
        if self._column_order is None:
            self._column_order = self.columns if self.columns else list(data[0].keys())

        lines = []
        if not self._header_written:
            # Header cells get the same escaping as data cells, so a column
            # name containing the delimiter does not shift the columns.
            lines.append(self._format_line(self._column_order))
            self._header_written = True

        for row in data:
            lines.append(self._format_line(row.get(col, "") for col in self._column_order))

        self._file.writelines(lines)

    def close(self):
        """Close the file; calling it again is a no-op."""
        if self._file:
            self._file.close()
            self._file = None
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class SMIWriter(MoleculeWriter):
    """Writer producing a SMILES file with one ``SMILES [name]`` entry per line."""

    def __init__(
        self,
        path: Path | str,
        smiles_column: str = "smiles",
        name_column: Optional[str] = "name",
    ):
        """Open ``path`` for writing (UTF-8) and remember which keys to read."""
        self.path = Path(path)
        self.smiles_column = smiles_column
        self.name_column = name_column
        self._file = open(path, "w", encoding="utf-8")

    def write_row(self, data: dict[str, Any]):
        """Write one record; rows with a falsy SMILES value are skipped."""
        smiles_value = data.get(self.smiles_column, "")
        label = data.get(self.name_column, "") if self.name_column else ""

        if not smiles_value:
            return

        # The name is appended after a single space only when it is truthy.
        entry = f"{smiles_value} {label}\n" if label else f"{smiles_value}\n"
        self._file.write(entry)

    def write_batch(self, data: list[dict[str, Any]]):
        """Write each record of ``data`` in order."""
        for record in data:
            self.write_row(record)

    def close(self):
        """Close the output file; later calls do nothing."""
        if not self._file:
            return
        self._file.close()
        self._file = None
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class SDFWriter(MoleculeWriter):
    """Write molecules to SDF files using ``Chem.SDWriter``."""

    def __init__(self, path: Path | str):
        """Open an ``Chem.SDWriter`` on ``path``."""
        self.path = Path(path)
        self._writer = Chem.SDWriter(str(path))

    def write_row(self, data: dict[str, Any]):
        """Write one record.

        The molecule is taken from ``data["mol"]``; if that is absent or
        ``None`` it is built from ``data["smiles"]``. Rows for which no
        molecule can be obtained are skipped. Every other non-``None`` entry
        of ``data`` is stored on the written molecule as a string property.
        """
        mol = data.get("mol")

        if mol is not None:
            # Copy first: the SetProp calls below would otherwise modify the
            # molecule object the caller handed in.
            mol = Chem.Mol(mol)
        else:
            # Try to create from SMILES
            smiles = data.get("smiles", "")
            if smiles:
                mol = Chem.MolFromSmiles(smiles)

        if mol is None:
            return

        # Set properties from data
        for key, value in data.items():
            if key in ("mol", "smiles") or value is None:
                continue
            try:
                mol.SetProp(str(key), str(value))
            except Exception:
                # Best effort: a property that cannot be set is dropped
                # rather than failing the whole record.
                pass

        self._writer.write(mol)

    def write_batch(self, data: list[dict[str, Any]]):
        """Write a batch of records, one ``write_row`` call each."""
        for row in data:
            self.write_row(row)

    def close(self):
        """Close the writer; calling it again is a no-op."""
        if self._writer:
            self._writer.close()
            self._writer = None
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
class ParquetWriter(MoleculeWriter):
    """Write results to Parquet files.

    Rows are buffered in memory and flushed once ``_batch_size`` rows have
    accumulated, and again on ``close()``. Flushes go through one
    ``pyarrow.parquet.ParquetWriter`` opened on the first flush, so:

    * a file already present at ``path`` is overwritten, not appended to;
    * earlier data is not re-read and rewritten on each flush;
    * the file is only complete after ``close()`` has been called.

    If no row is ever written, no file is created.
    """

    def __init__(
        self,
        path: Path | str,
        columns: Optional[list[str]] = None,
    ):
        """Configure the writer; the output file is opened on first flush.

        Args:
            path: Output file path.
            columns: Optional preferred column order. Listed columns come
                first; any other columns present in the data follow.
        """
        self.path = Path(path)
        self.columns = columns
        self._batches: list[dict[str, Any]] = []
        self._batch_size = 100000  # Write in batches of 100k
        # pyarrow.parquet.ParquetWriter, created lazily in _flush().
        self._pq_writer = None

    def write_row(self, data: dict[str, Any]):
        """Buffer a single row (same handling as a one-element batch)."""
        self.write_batch([data])

    def write_batch(self, data: list[dict[str, Any]]):
        """Buffer a batch of rows, flushing if the buffer is full."""
        # Remove mol objects (not serializable)
        self._batches.extend(
            {k: v for k, v in row.items() if k != "mol"} for row in data
        )

        if len(self._batches) >= self._batch_size:
            self._flush()

    def _flush(self):
        """Write the buffered rows to the file and clear the buffer."""
        if not self._batches:
            return

        import pyarrow as pa
        import pyarrow.parquet as pq

        df = pd.DataFrame(self._batches)

        # Reorder columns if specified
        if self.columns:
            cols = [c for c in self.columns if c in df.columns]
            extra = [c for c in df.columns if c not in self.columns]
            df = df[cols + extra]

        table = pa.Table.from_pandas(df, preserve_index=False)

        if self._pq_writer is None:
            # The first flush fixes the schema and truncates any old file.
            self._pq_writer = pq.ParquetWriter(str(self.path), table.schema)
        else:
            target = self._pq_writer.schema
            # Same columns but different order or inferred types: align to
            # the schema fixed by the first flush. A different column set is
            # left untouched so write_table() rejects it instead of silently
            # dropping data.
            if set(table.schema.names) == set(target.names):
                table = table.select(target.names).cast(target)

        self._pq_writer.write_table(table)
        self._batches = []

    def close(self):
        """Flush remaining rows and finalize the file."""
        try:
            self._flush()
        finally:
            # Close even if the last flush failed so the handle is released.
            if self._pq_writer is not None:
                self._pq_writer.close()
                self._pq_writer = None
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def create_writer(
    path: str | Path,
    format_override: Optional[FileFormat] = None,
    columns: Optional[list[str]] = None,
    smiles_column: str = "smiles",
    name_column: Optional[str] = "name",
) -> MoleculeWriter:
    """
    Build the writer that matches the output file's format.

    The format is ``format_override`` when that is truthy, otherwise the
    result of ``detect_format(path)``.

    Args:
        path: Output file path
        format_override: Override auto-detected format
        columns: Column order for output (CSV, TSV and Parquet writers)
        smiles_column: Name of SMILES column (for SMI files)
        name_column: Name of name column (for SMI files)

    Returns:
        Appropriate MoleculeWriter instance

    Raises:
        ValueError: If the format is not one of CSV, TSV, SMI, SDF, PARQUET.
    """
    target = Path(path)
    chosen_format = format_override or detect_format(target)

    # Guard-clause dispatch: each supported format returns immediately.
    if chosen_format == FileFormat.CSV:
        return CSVWriter(target, delimiter=",", columns=columns)
    if chosen_format == FileFormat.TSV:
        return CSVWriter(target, delimiter="\t", columns=columns)
    if chosen_format == FileFormat.SMI:
        return SMIWriter(target, smiles_column=smiles_column, name_column=name_column)
    if chosen_format == FileFormat.SDF:
        return SDFWriter(target)
    if chosen_format == FileFormat.PARQUET:
        return ParquetWriter(target, columns=columns)

    raise ValueError(f"Unsupported format: {chosen_format}")
|