stjames: 0.0.65 (py3-none-any.whl) → 0.0.67 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of stjames might be problematic. Click here for more details.

stjames/molecule.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import re
2
2
  from pathlib import Path
3
- from typing import Annotated, Iterable, Optional, Self, TypeAlias
3
+ from typing import Annotated, Any, Iterable, Optional, Self, Sequence, TypeAlias, TypedDict, TypeVar
4
4
 
5
5
  import pydantic
6
6
  from pydantic import AfterValidator, NonNegativeInt, PositiveInt, ValidationError
@@ -9,6 +9,7 @@ from rdkit.Chem import AllChem
9
9
 
10
10
  from .atom import Atom
11
11
  from .base import Base, round_float, round_optional_float
12
+ from .data import SYMBOL_ELEMENT
12
13
  from .periodic_cell import PeriodicCell
13
14
  from .types import (
14
15
  FloatPerAtom,
@@ -246,28 +247,102 @@ class Molecule(Base):
246
247
  return cls.from_extxyz_lines(extxyz.strip().splitlines(), charge=charge, multiplicity=multiplicity)
247
248
 
248
249
  @classmethod
249
- def from_extxyz_lines(cls: type[Self], lines: Iterable[str], charge: int = 0, multiplicity: PositiveInt = 1) -> Self:
250
- # ensure first line is number of atoms
251
- lines = list(lines)
250
+ def from_extxyz_lines(
251
+ cls: type[Self],
252
+ lines: Iterable[str],
253
+ charge: int | None = None,
254
+ multiplicity: PositiveInt | None = None,
255
+ cell: PeriodicCell | None = None,
256
+ ) -> Self:
257
+ """
258
+ Parses an EXTXYZ file, extracting atom positions, forces (if present), and metadata.
259
+
260
+ Supports:
261
+ - Lattice vectors (cell information)
262
+ - Properties field (species, positions, forces, etc.)
263
+ - Other metadata like charge, multiplicity, energy, etc.
264
+
265
+ :param lines: Iterable of lines from an EXTXYZ file
266
+ :param charge: total charge of the molecule (default: 0 if not found)
267
+ :param multiplicity: spin multiplicity of the molecule (default: 1 if not found)
268
+ :param cell: PeriodicCell containing lattice vectors
269
+ :return: Molecule
270
+ :raises MoleculeReadError: if the file is not in the correct format
271
+ """
272
+ if not isinstance(lines, Sequence):
273
+ lines = list(lines)
274
+
275
+ # Ensure first line contains number of atoms
252
276
  if len(lines[0].split()) == 1:
253
277
  natoms = lines[0].strip()
254
- if not natoms.isdigit() or (int(lines[0]) != len(lines) - 2):
255
- raise MoleculeReadError(f"First line of EXTXYZ file should be the number of atoms, got: {lines[0]} != {len(lines) - 2}")
256
- lines = lines[1:]
278
+ if not natoms.isdigit() or (int(natoms) != len(lines) - 2):
279
+ raise MoleculeReadError(f"First line should be number of atoms, got: {lines[0]} != {len(lines) - 2}")
280
+ data_line, *lines = lines[1:]
257
281
  else:
258
- raise MoleculeReadError(f"First line of EXTXYZ should be only an int denoting number of atoms. Got {lines[0].split()}")
282
+ raise MoleculeReadError(f"First line should be an integer denoting atom count. Got {lines[0].split()}")
259
283
 
260
- # ensure second line contains key-value pairs
261
- if "=" not in lines[0]:
262
- raise MoleculeReadError(f"Invalid property line, got {lines[0]}")
284
+ metadata = parse_extxyz_comment_line(data_line)
263
285
 
264
- cell = parse_comment_line(lines[0])
265
- lines = lines[1:]
286
+ T = TypeVar("T")
266
287
 
267
- try:
268
- return cls(atoms=[Atom.from_xyz(line) for line in lines], cell=cell, charge=charge, multiplicity=multiplicity)
269
- except (ValueError, ValidationError) as e:
270
- raise MoleculeReadError("Error reading molecule from extxyz") from e
288
+ def metadata_optional_get(key: str, value: T | None, default: T) -> T:
289
+ """Set key to default if not found in metadata"""
290
+ if value is None:
291
+ return metadata.get(key, default) # type: ignore [return-value]
292
+
293
+ return value
294
+
295
+ charge = metadata_optional_get("total_charge", charge, 0)
296
+ multiplicity = metadata_optional_get("multiplicity", multiplicity, 1)
297
+ cell = cell or metadata.get("cell")
298
+ energy = metadata.get("energy", None)
299
+
300
+ force_idx = None
301
+ if properties := metadata.get("properties", "").split(":"):
302
+ if properties[0].lower() != "species":
303
+ raise MoleculeReadError(f"Invalid or missing 'Properties' field in EXTXYZ, got: {properties}")
304
+
305
+ # Identify column indices for position and force data
306
+ pos_idx = None
307
+ current_idx = 0 # Start after 'species:S'
308
+
309
+ while current_idx < len(properties):
310
+ if properties[current_idx].lower() == "pos" and properties[current_idx + 1].lower() == "r" and properties[current_idx + 2] == "3":
311
+ pos_idx = current_idx
312
+ elif properties[current_idx].lower() == "forces" and properties[current_idx + 1].lower() == "r" and properties[current_idx + 2] == "3":
313
+ force_idx = current_idx
314
+ current_idx += 3
315
+
316
+ if pos_idx is None:
317
+ raise MoleculeReadError("No position data ('pos:R:3') found in Properties field.")
318
+
319
+ def parse_line_atoms(line: str) -> Atom:
320
+ symbol, sx, sy, sz, *_ = line.split()
321
+ atomic_number = SYMBOL_ELEMENT[symbol.title()]
322
+ x, y, z = map(float, (sx, sy, sz))
323
+
324
+ return Atom(atomic_number=atomic_number, position=(x, y, z))
325
+
326
+ def parse_line_with_grad(line: str) -> tuple[Atom, Vector3D]:
327
+ symbol, sx, sy, sz, sgx, sgy, sgz, *_ = line.split()
328
+ atomic_number = SYMBOL_ELEMENT[symbol.title()]
329
+ x, y, z = map(float, (sx, sy, sz))
330
+ gx, gy, gz = map(float, (sgx, sgy, sgz))
331
+
332
+ return (
333
+ Atom(atomic_number=atomic_number, position=(x, y, z)),
334
+ (-gx, -gy, -gz),
335
+ )
336
+
337
+ atoms: list[Atom]
338
+ gradients: list[Vector3D] | None
339
+ if force_idx is not None:
340
+ atoms, gradients = zip(*map(parse_line_with_grad, lines), strict=True) # type: ignore [assignment]
341
+ else:
342
+ atoms = [parse_line_atoms(line) for line in lines]
343
+ gradients = None
344
+
345
+ return cls(atoms=atoms, cell=cell, charge=charge, multiplicity=multiplicity, energy=energy, gradient=gradients)
271
346
 
272
347
  @classmethod
273
348
  def from_rdkit(cls: type[Self], rdkm: RdkitMol, cid: int = 0) -> Self:
@@ -313,43 +388,62 @@ def _embed_rdkit_mol(rdkm: RdkitMol) -> RdkitMol:
313
388
  return rdkm
314
389
 
315
390
 
316
- def parse_comment_line(line: str) -> PeriodicCell:
317
- """
318
- currently only supporting lattice and porperites fields from comment line
319
- modify in future to support other fields from comment from_xyz_lines
320
- ex: name, mulitplicity, charge, etc.
391
+ class EXTXYZMetadata(TypedDict, total=False):
392
+ properties: Any
393
+ total_charge: int
394
+ multiplicity: int
395
+ energy: float
396
+ cell: PeriodicCell
397
+
398
+
399
+ def parse_extxyz_comment_line(line: str) -> EXTXYZMetadata:
321
400
  """
322
- cell = None
401
+ Parse the comment line of an EXTXYZ file, extracting lattice, properties, and metadata.
402
+
403
+ Supports:
404
+ - Lattice vectors (cell information)
405
+ - Properties field (species, positions, forces, etc.)
406
+ - Other metadata fields like charge, multiplicity, energy, etc.
407
+
408
+ :param line: comment line from an EXTXYZ file
409
+ :return: parsed properties
410
+
411
+ >>> parse_extxyz_comment_line('Lattice="6.0 0.0 0.0 6.0 0.0 0.0 6.0 0.0 0.0"Properties=species:S:1:pos:R:3')
412
+ {'cell': PeriodicCell(lattice_vectors=((6.0, 0.0, 0.0), (6.0, 0.0, 0.0), (6.0, 0.0, 0.0)), is_periodic=(True, True, True), volume=0.0), 'properties': 'species:S:1:pos:R:3'}
413
+ """ # noqa: E501
414
+
323
415
  # Regular expression to match key="value", key='value', or key=value
324
416
  pattern = r"(\S+?=(?:\".*?\"|\'.*?\'|\S+))"
325
417
  pairs = re.findall(pattern, line)
326
418
 
327
- prop_dict = {}
419
+ prop_dict: EXTXYZMetadata = {}
328
420
  for pair in pairs:
329
421
  key, value = pair.split("=", 1)
330
- if key.lower() == "lattice":
331
- value = value.strip("'\"").split()
332
- if len(value) != 9:
333
- raise MoleculeReadError(f"Lattice should have 9 entries got {len(value)}")
422
+ key = key.lower().strip()
423
+ value = value.strip("'\"")
424
+
425
+ if key == "lattice":
426
+ lattice_values = value.split()
427
+ if len(lattice_values) != 9:
428
+ raise MoleculeReadError(f"Lattice should have 9 entries, got {len(lattice_values)}")
334
429
 
335
- # Convert the value to a 3x3 tuple of tuples of floats
336
430
  try:
337
- cell = tuple(tuple(map(float, value[i : i + 3])) for i in range(0, 9, 3))
431
+ cell = tuple(tuple(map(float, lattice_values[i : i + 3])) for i in range(0, 9, 3))
338
432
  except ValueError:
339
- raise MoleculeReadError(f"Lattice should be floats, got {value}")
433
+ raise MoleculeReadError(f"Lattice should be floats, got {lattice_values}")
340
434
 
341
- prop_dict[key] = value
435
+ prop_dict["cell"] = PeriodicCell(lattice_vectors=cell)
342
436
 
343
- elif key.lower() == "properties":
344
- if value.lower() != "species:s:1:pos:r:3":
345
- raise MoleculeReadError(f"Only accepting properties of form species:S:1:pos:R:3, got {value}")
346
- prop_dict[key] = value
347
- else:
348
- raise MoleculeReadError(f"Currently only accepting lattice and propery keys. Got {key}")
437
+ elif key == "properties":
438
+ prop_dict["properties"] = value
349
439
 
350
- if cell is None:
351
- raise MoleculeReadError("Lattice field is required but missing.")
440
+ elif key == "total_charge":
441
+ prop_dict["total_charge"] = int(value)
442
+ elif key == "multiplicity":
443
+ prop_dict["multiplicity"] = int(value)
444
+ elif key == "energy":
445
+ prop_dict["energy"] = float(value)
446
+ else:
447
+ prop_dict[key] = value # type: ignore [literal-required]
352
448
 
353
- if "properties" not in [key.lower() for key in prop_dict.keys()]:
354
- raise MoleculeReadError(f"Property field is required, got keys {prop_dict.keys()}")
355
- return PeriodicCell(lattice_vectors=cell)
449
+ return prop_dict
stjames/pdb.py CHANGED
@@ -1,3 +1,4 @@
1
+ import re
1
2
  from datetime import date, datetime
2
3
  from pathlib import Path
3
4
  from typing import Any, Literal
@@ -276,7 +277,7 @@ def pdb_object_to_pdb_filestring(
276
277
  atom=atom,
277
278
  chain_id=this_chain_id,
278
279
  res_name=residue.name,
279
- res_num=int(_residue_id[2:]),
280
+ res_num=_residue_id[2:],
280
281
  alt_loc=atom.alt_loc or "",
281
282
  )
282
283
  pdb_lines.append(line)
@@ -286,12 +287,12 @@ def pdb_object_to_pdb_filestring(
286
287
  atom=atom,
287
288
  chain_id=this_chain_id,
288
289
  res_name=residue.name,
289
- res_num=int(_residue_id[2:]),
290
+ res_num=_residue_id[2:],
290
291
  alt_loc=atom.alt_loc or "",
291
292
  )
292
293
  pdb_lines.append(line)
293
294
 
294
- pdb_lines.append(f"TER {_atom_id + 1:>5} {residue.name:>3} {this_chain_id}{int(_residue_id[2:]):>4}")
295
+ pdb_lines.append(f"TER {_atom_id + 1:>5} {residue.name:>3} {this_chain_id}{_residue_id[2:]:>4}")
295
296
 
296
297
  # === 2) Non-polymers (e.g. ligands, ions) ===
297
298
  for _np_id, nonpoly in model.non_polymer.items():
@@ -308,7 +309,7 @@ def pdb_object_to_pdb_filestring(
308
309
  atom=atom,
309
310
  chain_id=chain_id_for_np,
310
311
  res_name=nonpoly.name,
311
- res_num=int(_np_id[2:]),
312
+ res_num=_np_id[2:],
312
313
  )
313
314
  pdb_lines.append(line)
314
315
  if atom.anisotropy and atom.anisotropy != [0, 0, 0, 0, 0, 0]:
@@ -317,7 +318,7 @@ def pdb_object_to_pdb_filestring(
317
318
  atom=atom,
318
319
  chain_id=chain_id_for_np,
319
320
  res_name=nonpoly.name,
320
- res_num=int(_np_id[2:]),
321
+ res_num=_np_id[2:],
321
322
  )
322
323
  pdb_lines.append(line)
323
324
 
@@ -330,7 +331,7 @@ def pdb_object_to_pdb_filestring(
330
331
  atom=atom,
331
332
  chain_id=_w_id[0], # Or you can use water.polymer if set
332
333
  res_name="HOH",
333
- res_num=int(_w_id[2:]), # or an incrementing value
334
+ res_num=_w_id[2:], # or an incrementing value
334
335
  )
335
336
  pdb_lines.append(line)
336
337
  if atom.anisotropy and atom.anisotropy != [0, 0, 0, 0, 0, 0]:
@@ -339,7 +340,7 @@ def pdb_object_to_pdb_filestring(
339
340
  atom=atom,
340
341
  chain_id=_w_id[0],
341
342
  res_name="HOH",
342
- res_num=int(_w_id[2:]),
343
+ res_num=_w_id[2:],
343
344
  )
344
345
  pdb_lines.append(line)
345
346
 
@@ -357,7 +358,7 @@ def pdb_object_to_pdb_filestring(
357
358
  atom=atom,
358
359
  chain_id="B",
359
360
  res_name="BRN", # or branched_obj.get("name", "BRN")
360
- res_num=1,
361
+ res_num="1",
361
362
  )
362
363
  pdb_lines.append(line)
363
364
  if atom.anisotropy and atom.anisotropy != [0, 0, 0, 0, 0, 0]:
@@ -366,7 +367,7 @@ def pdb_object_to_pdb_filestring(
366
367
  atom=atom,
367
368
  chain_id="B",
368
369
  res_name="BRN",
369
- res_num=1,
370
+ res_num="1",
370
371
  )
371
372
  pdb_lines.append(line)
372
373
 
@@ -407,7 +408,7 @@ def _format_atom_line(
407
408
  atom: PDBAtom,
408
409
  chain_id: str,
409
410
  res_name: str,
410
- res_num: int | None,
411
+ res_num: str | None,
411
412
  alt_loc: str = "",
412
413
  ) -> str:
413
414
  """
@@ -423,7 +424,15 @@ def _format_atom_line(
423
424
  alt_loc_char = alt_loc if alt_loc else " "
424
425
  residue_name = (res_name or "UNK")[:3] # limit to 3 chars
425
426
  chain_char = (chain_id or "A")[:1] # PDB chain ID is 1 char
426
- residue_num = res_num if res_num is not None else 1
427
+ residue_num_str = "1"
428
+ insertion_code = " "
429
+ if res_num:
430
+ match = re.match(r"(\d+)([a-zA-Z]*)", res_num)
431
+ if match:
432
+ residue_num_str, insertion_code = match.groups()
433
+ insertion_code = insertion_code if insertion_code != "" else " "
434
+
435
+ residue_num = int(residue_num_str)
427
436
 
428
437
  # Format charge: PDB uses e.g. " 2-", " 1+" in columns 79-80
429
438
  # If your model stores charges differently, adapt as needed.
@@ -451,7 +460,8 @@ def _format_atom_line(
451
460
  f"{residue_name:>3}" # residue name (columns 18-20)
452
461
  f" {chain_char}" # chain ID (column 22)
453
462
  f"{residue_num:4d}" # residue sequence number (columns 23-26)
454
- f" " # columns 27-30 (insertion code plus spacing)
463
+ f"{insertion_code}"
464
+ f" " # columns 27-30 (spacing)
455
465
  f"{atom.x:8.3f}" # x (columns 31-38)
456
466
  f"{atom.y:8.3f}" # y (columns 39-46)
457
467
  f"{atom.z:8.3f}" # z (columns 47-54)
@@ -469,7 +479,7 @@ def _format_anisou_line(
469
479
  atom: PDBAtom,
470
480
  chain_id: str,
471
481
  res_name: str,
472
- res_num: int | None,
482
+ res_num: str | None,
473
483
  alt_loc: str = "",
474
484
  ) -> str:
475
485
  """
@@ -485,7 +495,15 @@ def _format_anisou_line(
485
495
  alt_loc_char = alt_loc if alt_loc else " "
486
496
  residue_name = (res_name or "UNK")[:3] # limit to 3 chars
487
497
  chain_char = (chain_id or "A")[:1] # PDB chain ID is 1 char
488
- residue_num = res_num if res_num is not None else 1
498
+ residue_num_str = "1"
499
+ insertion_code = " "
500
+ if res_num:
501
+ match = re.match(r"(\d+)([a-zA-Z]*)", res_num)
502
+ if match:
503
+ residue_num_str, insertion_code = match.groups()
504
+ insertion_code = insertion_code if insertion_code != "" else " "
505
+
506
+ residue_num = int(residue_num_str)
489
507
 
490
508
  chg = ""
491
509
  if atom.charge and abs(atom.charge) > 0:
@@ -528,7 +546,8 @@ def _format_anisou_line(
528
546
  f"{residue_name:>3}" # residue name (columns 18-20)
529
547
  f" {chain_char}" # chain ID (column 22)
530
548
  f"{residue_num:4d}" # residue sequence number (columns 23-26)
531
- f" " # columns 27-28 (insertion code plus spacing)
549
+ f"{insertion_code}"
550
+ f" " # columns 27-28 (plus spacing)
532
551
  f"{aniso_lines}"
533
552
  f" " # columns 70-76 (padding)
534
553
  f"{atom.element:>2}" # element (columns 77-78)
{stjames-0.0.65.dist-info → stjames-0.0.67.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: stjames
3
- Version: 0.0.65
3
+ Version: 0.0.67
4
4
  Summary: standardized JSON atom/molecule encoding scheme
5
5
  Author-email: Corin Wagen <corin@rowansci.com>
6
6
  Project-URL: Homepage, https://github.com/rowansci/stjames
@@ -12,6 +12,7 @@ Requires-Dist: pydantic>=2.4
12
12
  Requires-Dist: numpy
13
13
  Requires-Dist: requests
14
14
  Requires-Dist: rdkit
15
+ Dynamic: license-file
15
16
 
16
17
  # stjames
17
18
 
{stjames-0.0.65.dist-info → stjames-0.0.67.dist-info}/RECORD CHANGED
@@ -13,9 +13,9 @@ stjames/int_settings.py,sha256=5HXp8opt5ZyY1UpmfaK7NVloWVLM5jkG0elEEqpVLUo,896
13
13
  stjames/message.py,sha256=Rq6QqmHZKecWxYH8fVyXmuoCCPZv8YinvgykSeorXSU,216
14
14
  stjames/method.py,sha256=5hBHk2xQLpxZ52LwJ9FHWaqQMdFKnsbQEOxaVe6O4Go,2321
15
15
  stjames/mode.py,sha256=xw46Cc7f3eTS8i35qECi-8DocAlANhayK3w4akD4HBU,496
16
- stjames/molecule.py,sha256=R0BalcXdvyhuyffoH11Nml_49qiVl4mD0dcGcLqfQYM,14371
16
+ stjames/molecule.py,sha256=4dakMkn-_I5bSWsijLLY0tn5NkBEuZhmtYDj-MDSJE0,17987
17
17
  stjames/opt_settings.py,sha256=gxXGtjy9l-Q5Wen9eO6T6HHRCuS8rfOofdVQIJj0JcI,550
18
- stjames/pdb.py,sha256=YxJgheU37H74cAPjRcYS16Z_-fZo2Yel9V_trluYq9Q,25938
18
+ stjames/pdb.py,sha256=_pIdJCMhIzS4t2HWQa_susDWjZEl0oLn4Njb1QoKvKw,26460
19
19
  stjames/periodic_cell.py,sha256=eV_mArsY_MPEFSrFEsTC-CyCc6V8ITAXdk7yhjjNI7M,1080
20
20
  stjames/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  stjames/scf_settings.py,sha256=WotVgVrayQ_8PUHP39zVtG7iLT9PV41lpzruttFACP8,2356
@@ -60,8 +60,8 @@ stjames/workflows/solubility.py,sha256=kGfVyPPGDLRpf2j6dSY7woCkfsoXSbUzdSImA4mcM
60
60
  stjames/workflows/spin_states.py,sha256=0degmE-frovgoXweshZyjfjqL7nkbaFoO9YoJhvQnaI,4748
61
61
  stjames/workflows/tautomer.py,sha256=7eYKziGPg8Km6lfowTzSkgJfJ4SHUPrAmnTf8Bi-SB0,1164
62
62
  stjames/workflows/workflow.py,sha256=sk2BUz59wdIkT_EyKOnMt5woNrjo3aHVK38cU8x8I7Q,1423
63
- stjames-0.0.65.dist-info/LICENSE,sha256=i7ehYBS-6gGmbTcgU4mgk28pyOx2kScJ0kcx8n7bWLM,1084
64
- stjames-0.0.65.dist-info/METADATA,sha256=OQsAzZU4ZxSwkHhpyaHaiI-3SvcHB5VS_nDKKlv-3FM,1672
65
- stjames-0.0.65.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
66
- stjames-0.0.65.dist-info/top_level.txt,sha256=FYCwxl6quhYOAgG-mnPQcCK8vsVM7B8rIUrO-WrQ_PI,8
67
- stjames-0.0.65.dist-info/RECORD,,
63
+ stjames-0.0.67.dist-info/licenses/LICENSE,sha256=i7ehYBS-6gGmbTcgU4mgk28pyOx2kScJ0kcx8n7bWLM,1084
64
+ stjames-0.0.67.dist-info/METADATA,sha256=6njJyhDxM_XVsH0R-2HzU98usLv4FbAcJw9GI20Kono,1694
65
+ stjames-0.0.67.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
66
+ stjames-0.0.67.dist-info/top_level.txt,sha256=FYCwxl6quhYOAgG-mnPQcCK8vsVM7B8rIUrO-WrQ_PI,8
67
+ stjames-0.0.67.dist-info/RECORD,,
{stjames-0.0.65.dist-info → stjames-0.0.67.dist-info}/WHEEL CHANGED
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (76.1.0)
2
+ Generator: setuptools (77.0.3)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5