molforge 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. molforge-0.0.1/.gitignore +74 -0
  2. molforge-0.0.1/CHANGELOG.md +240 -0
  3. molforge-0.0.1/LICENSE +21 -0
  4. molforge-0.0.1/PKG-INFO +246 -0
  5. molforge-0.0.1/README.md +156 -0
  6. molforge-0.0.1/data/README.md +7 -0
  7. molforge-0.0.1/notebooks/README.md +13 -0
  8. molforge-0.0.1/plugins/README.md +15 -0
  9. molforge-0.0.1/plugins/example_plugin/README.md +20 -0
  10. molforge-0.0.1/plugins/example_plugin/pyproject.toml +16 -0
  11. molforge-0.0.1/pyproject.toml +212 -0
  12. molforge-0.0.1/requirements/README.md +8 -0
  13. molforge-0.0.1/scripts/README.md +5 -0
  14. molforge-0.0.1/src/molforge/__init__.py +18 -0
  15. molforge-0.0.1/src/molforge/core/__init__.py +64 -0
  16. molforge-0.0.1/src/molforge/core/atom.py +154 -0
  17. molforge-0.0.1/src/molforge/core/atom_array.py +318 -0
  18. molforge-0.0.1/src/molforge/core/chain.py +149 -0
  19. molforge-0.0.1/src/molforge/core/constants.py +145 -0
  20. molforge-0.0.1/src/molforge/core/protein.py +166 -0
  21. molforge-0.0.1/src/molforge/core/residue.py +160 -0
  22. molforge-0.0.1/src/molforge/docking/__init__.py +143 -0
  23. molforge-0.0.1/src/molforge/io/__init__.py +88 -0
  24. molforge-0.0.1/src/molforge/io/dispatch.py +156 -0
  25. molforge-0.0.1/src/molforge/io/fasta.py +185 -0
  26. molforge-0.0.1/src/molforge/io/mmcif.py +559 -0
  27. molforge-0.0.1/src/molforge/io/mol2.py +33 -0
  28. molforge-0.0.1/src/molforge/io/pdb.py +635 -0
  29. molforge-0.0.1/src/molforge/io/pdb_alphafold.py +91 -0
  30. molforge-0.0.1/src/molforge/io/pdbqt.py +33 -0
  31. molforge-0.0.1/src/molforge/io/pqr.py +33 -0
  32. molforge-0.0.1/src/molforge/io/sdf.py +33 -0
  33. molforge-0.0.1/src/molforge/md/__init__.py +23 -0
  34. molforge-0.0.1/src/molforge/metrics/__init__.py +25 -0
  35. molforge-0.0.1/src/molforge/ml/__init__.py +25 -0
  36. molforge-0.0.1/src/molforge/plugins/__init__.py +34 -0
  37. molforge-0.0.1/src/molforge/plugins/registry.py +48 -0
  38. molforge-0.0.1/src/molforge/py.typed +0 -0
  39. molforge-0.0.1/src/molforge/sequence/__init__.py +69 -0
  40. molforge-0.0.1/src/molforge/sequence/alignment.py +385 -0
  41. molforge-0.0.1/src/molforge/sequence/composition.py +132 -0
  42. molforge-0.0.1/src/molforge/sequence/matrices.py +99 -0
  43. molforge-0.0.1/src/molforge/sequence/mutations.py +195 -0
  44. molforge-0.0.1/src/molforge/structure/__init__.py +85 -0
  45. molforge-0.0.1/src/molforge/structure/contacts.py +175 -0
  46. molforge-0.0.1/src/molforge/structure/dssp.py +428 -0
  47. molforge-0.0.1/src/molforge/structure/geometry.py +144 -0
  48. molforge-0.0.1/src/molforge/structure/rmsd.py +171 -0
  49. molforge-0.0.1/src/molforge/structure/superposition.py +135 -0
  50. molforge-0.0.1/src/molforge/wrappers/__init__.py +11 -0
  51. molforge-0.0.1/src/molforge/wrappers/docking/__init__.py +34 -0
  52. molforge-0.0.1/src/molforge/wrappers/docking/_base.py +7 -0
  53. molforge-0.0.1/src/molforge/wrappers/docking/diffdock.py +17 -0
  54. molforge-0.0.1/src/molforge/wrappers/docking/vina.py +364 -0
  55. molforge-0.0.1/src/molforge/wrappers/folding/__init__.py +38 -0
  56. molforge-0.0.1/src/molforge/wrappers/folding/_base.py +102 -0
  57. molforge-0.0.1/src/molforge/wrappers/folding/alphafold.py +20 -0
  58. molforge-0.0.1/src/molforge/wrappers/folding/boltz.py +20 -0
  59. molforge-0.0.1/src/molforge/wrappers/folding/esmfold.py +234 -0
  60. molforge-0.0.1/src/molforge/wrappers/folding/rosetta.py +19 -0
  61. molforge-0.0.1/src/molforge/wrappers/md/__init__.py +8 -0
  62. molforge-0.0.1/src/molforge/wrappers/md/_base.py +13 -0
  63. molforge-0.0.1/src/molforge/wrappers/md/gromacs.py +18 -0
  64. molforge-0.0.1/src/molforge/wrappers/md/openmm.py +18 -0
  65. molforge-0.0.1/tests/__init__.py +0 -0
  66. molforge-0.0.1/tests/conftest.py +23 -0
  67. molforge-0.0.1/tests/fixtures/cif/dipeptide.cif +39 -0
  68. molforge-0.0.1/tests/fixtures/fasta/.gitkeep +0 -0
  69. molforge-0.0.1/tests/fixtures/fasta/multiline_with_digits.fasta +3 -0
  70. molforge-0.0.1/tests/fixtures/fasta/simple.fasta +6 -0
  71. molforge-0.0.1/tests/fixtures/pdb/.gitkeep +0 -0
  72. molforge-0.0.1/tests/fixtures/pdb/alphafold_mock.pdb +8 -0
  73. molforge-0.0.1/tests/fixtures/pdb/dipeptide.pdb +16 -0
  74. molforge-0.0.1/tests/fixtures/pdb/helix.pdb +62 -0
  75. molforge-0.0.1/tests/fixtures/pdb/multi_model.pdb +12 -0
  76. molforge-0.0.1/tests/fixtures/pdb/tripeptide.pdb +17 -0
  77. molforge-0.0.1/tests/fixtures/pdb/with_altloc.pdb +9 -0
  78. molforge-0.0.1/tests/fixtures/pdb/with_insertion_code.pdb +6 -0
  79. molforge-0.0.1/tests/integration/__init__.py +0 -0
  80. molforge-0.0.1/tests/integration/test_smoke.py +11 -0
  81. molforge-0.0.1/tests/unit/__init__.py +0 -0
  82. molforge-0.0.1/tests/unit/core/__init__.py +0 -0
  83. molforge-0.0.1/tests/unit/core/test_atom_array.py +210 -0
  84. molforge-0.0.1/tests/unit/core/test_constants.py +61 -0
  85. molforge-0.0.1/tests/unit/core/test_core_smoke.py +27 -0
  86. molforge-0.0.1/tests/unit/core/test_core_types.py +33 -0
  87. molforge-0.0.1/tests/unit/core/test_hierarchy.py +260 -0
  88. molforge-0.0.1/tests/unit/docking/__init__.py +0 -0
  89. molforge-0.0.1/tests/unit/docking/test_docking_smoke.py +9 -0
  90. molforge-0.0.1/tests/unit/io/__init__.py +0 -0
  91. molforge-0.0.1/tests/unit/io/test_alphafold.py +56 -0
  92. molforge-0.0.1/tests/unit/io/test_dispatch.py +74 -0
  93. molforge-0.0.1/tests/unit/io/test_fasta.py +135 -0
  94. molforge-0.0.1/tests/unit/io/test_io_smoke.py +9 -0
  95. molforge-0.0.1/tests/unit/io/test_mmcif.py +171 -0
  96. molforge-0.0.1/tests/unit/io/test_pdb.py +254 -0
  97. molforge-0.0.1/tests/unit/md/__init__.py +0 -0
  98. molforge-0.0.1/tests/unit/md/test_md_smoke.py +9 -0
  99. molforge-0.0.1/tests/unit/metrics/__init__.py +0 -0
  100. molforge-0.0.1/tests/unit/metrics/test_metrics_smoke.py +9 -0
  101. molforge-0.0.1/tests/unit/ml/__init__.py +0 -0
  102. molforge-0.0.1/tests/unit/ml/test_ml_smoke.py +9 -0
  103. molforge-0.0.1/tests/unit/plugins/__init__.py +0 -0
  104. molforge-0.0.1/tests/unit/plugins/test_plugins_smoke.py +9 -0
  105. molforge-0.0.1/tests/unit/plugins/test_registry.py +14 -0
  106. molforge-0.0.1/tests/unit/sequence/__init__.py +0 -0
  107. molforge-0.0.1/tests/unit/sequence/test_alignment.py +123 -0
  108. molforge-0.0.1/tests/unit/sequence/test_composition.py +92 -0
  109. molforge-0.0.1/tests/unit/sequence/test_matrices.py +41 -0
  110. molforge-0.0.1/tests/unit/sequence/test_mutations.py +135 -0
  111. molforge-0.0.1/tests/unit/structure/__init__.py +0 -0
  112. molforge-0.0.1/tests/unit/structure/test_contacts.py +90 -0
  113. molforge-0.0.1/tests/unit/structure/test_dssp.py +118 -0
  114. molforge-0.0.1/tests/unit/structure/test_geometry.py +110 -0
  115. molforge-0.0.1/tests/unit/structure/test_rmsd.py +107 -0
  116. molforge-0.0.1/tests/unit/structure/test_structure_smoke.py +9 -0
  117. molforge-0.0.1/tests/unit/structure/test_superposition.py +114 -0
  118. molforge-0.0.1/tests/unit/wrappers/__init__.py +0 -0
  119. molforge-0.0.1/tests/unit/wrappers/test_docking_base.py +82 -0
  120. molforge-0.0.1/tests/unit/wrappers/test_esmfold.py +114 -0
  121. molforge-0.0.1/tests/unit/wrappers/test_folding_base.py +63 -0
  122. molforge-0.0.1/tests/unit/wrappers/test_vina.py +201 -0
@@ -0,0 +1,74 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ *.egg
8
+ *.egg-info/
9
+ dist/
10
+ build/
11
+ develop-eggs/
12
+ downloads/
13
+ eggs/
14
+ .eggs/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ share/python-wheels/
20
+ MANIFEST
21
+ pip-log.txt
22
+ pip-delete-this-directory.txt
23
+
24
+ # Virtual environments
25
+ .venv/
26
+ venv/
27
+ env/
28
+ ENV/
29
+
30
+ # Testing / coverage
31
+ .tox/
32
+ .nox/
33
+ .coverage
34
+ .coverage.*
35
+ .cache
36
+ nosetests.xml
37
+ coverage.xml
38
+ *.cover
39
+ *.py,cover
40
+ .hypothesis/
41
+ .pytest_cache/
42
+ htmlcov/
43
+
44
+ # Type checkers
45
+ .mypy_cache/
46
+ .pytype/
47
+ .pyre/
48
+ .ruff_cache/
49
+
50
+ # Jupyter
51
+ .ipynb_checkpoints
52
+ *.ipynb_checkpoints/
53
+
54
+ # Editors / OS
55
+ .idea/
56
+ .vscode/
57
+ *.swp
58
+ *.swo
59
+ .DS_Store
60
+ Thumbs.db
61
+
62
+ # Docs build output
63
+ docs/_build/
64
+ docs/site/
65
+ site/
66
+
67
+ # Data files (keep small fixtures only; ignore bulk data)
68
+ data/*
69
+ !data/.gitkeep
70
+ !data/README.md
71
+
72
+ # Project-specific
73
+ *.log
74
+ *.tmp
@@ -0,0 +1,240 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ### Added
11
+ - **`molforge.structure.dssp`: Kabsch-Sander secondary-structure assignment.**
12
+ - Pure-NumPy implementation of the canonical DSSP algorithm
13
+ (Kabsch & Sander 1983) with no external dependencies — no DSSP
14
+ binary required, no Biopython, no mkdssp install.
15
+ - Returns both the full 8-state DSSP alphabet (`H` α-helix,
16
+ `G` 3-10 helix, `I` π-helix, `E` β-strand, `B` β-bridge,
17
+ `T` turn, `S` bend, `-` coil) and the 3-state collapse
18
+ (`H` / `E` / `C`) via `dssp_3state`.
19
+ - Geometric backbone amide-H placement (no need for explicit H atoms
20
+ in input), Kabsch-Sander electrostatic H-bond energy model, both
21
+ parallel and antiparallel β-bridge detection.
22
+ - Non-protein residues (water, ligands, ions) and residues with
23
+ incomplete backbones get `-` rather than crashing.
24
+ - Result dict also exposes the full ``(n_res, n_res)`` H-bond energy
25
+ matrix for downstream analyses (custom topology metrics, contact-
26
+ map enrichment, etc.).
27
+ - Replaces the previous stub that raised `NotImplementedError`.
28
+ - New test fixture `tests/fixtures/pdb/helix.pdb` — an idealized
29
+ 15-residue poly-alanine α-helix built from canonical (φ, ψ) values
30
+ via NeRF placement. Produces the expected 3-state DSSP `CHHHHHHHHHHHHHC`
31
+ pattern.
32
+ - 12 unit tests covering empty / tiny inputs, helix recognition (≥7 of
33
+ the middle 9 residues classified as H), 3-state collapse, alphabet
34
+ validity, residue labels, H-bond matrix shape, and graceful handling
35
+ of non-protein residues.
36
+ - **`molforge.wrappers.docking.Vina`: second fully-implemented engine wrapper.**
37
+ - Wraps AutoDock Vina via the
38
+ [`vina`](https://pypi.org/project/vina/) PyPI package, which bundles
39
+ the Vina binary so no manual install is required.
40
+ - Configurable scoring function (`vina` or `vinardo`), seed, CPU
41
+ thread count, and verbosity.
42
+ - Takes either a prepared `.pdbqt` file path or (eventually) a
43
+ `Protein` plus charges; the receptor / ligand preparation path for
44
+ `Protein` and `.pdb` / `.sdf` inputs raises a clear
45
+ `NotImplementedError` pointing users at meeko / AutoDockTools.
46
+ - Search box specified by `center` and `box_size` in Å.
47
+ - Multi-pose PDBQT output parsed back into `DockingResult` / `Pose`
48
+ objects with score (kcal/mol), RMSD lower/upper bounds vs the
49
+ best pose, rank, and the ligand atoms as a `Protein`.
50
+ - **`molforge.docking`: completed ABC and result types.**
51
+ - `Pose` and `DockingResult` dataclasses with `best`, `top_n`,
52
+ iteration, and length helpers — replacing the previous stub classes
53
+ that raised `NotImplementedError` on construction.
54
+ - `DockingEngine` ABC with the formal `dock` contract; mirrors
55
+ `FoldingEngine` for API consistency across wrapper categories.
56
+ - `DockingEngineNotInstalledError` for missing-dependency error paths.
57
+ - 28 unit tests (1 marked `@pytest.mark.slow` for the real engine):
58
+ construction is dependency-free, lazy import behavior, materialization
59
+ helpers (path passthrough plus clear errors for unsupported inputs),
60
+ and exhaustive PDBQT output parsing (multi-MODEL, single-pose-no-MODEL,
61
+ score/RMSD extraction, best-first sorting, rank reassignment, empty
62
+ input).
63
+ - **`molforge.sequence`: sequence operations subpackage.**
64
+ - **Pairwise alignment** (`align`, `needleman_wunsch`, `smith_waterman`,
65
+ `Alignment`, `identity`): pure-NumPy Needleman-Wunsch (global) and
66
+ Smith-Waterman (local) with affine gap penalties, BLOSUM62 / PAM250
67
+ substitution matrices, and a `format()` method for human-readable
68
+ alignment blocks. No external dependencies (no Biopython, no
69
+ parasail) so it works in the minimal install.
70
+ - **Substitution matrices** (`BLOSUM62`, `PAM250`, `get_matrix`,
71
+ `available_matrices`): hardcoded as NumPy arrays from the NCBI BLAST
72
+ distribution. No runtime data-file dependency.
73
+ - **Mutations** (`Mutation`, `parse_mutations`, `apply_mutation`,
74
+ `apply_mutations`, `mutate_protein`): protein-engineering notation
75
+ (`A123V`, `A123V/T56K`, chain-prefixed `H:K42N`) with parsing,
76
+ sequence-level application, and `Protein`-level application that
77
+ updates the canonical `AtomArray` (sequence-only — atoms stay put;
78
+ side-chain rebuilding is a job for Rosetta or OpenMM).
79
+ - **Composition / properties** (`composition`, `length`,
80
+ `molecular_weight`, `gravy`, `aromaticity`): the everyday sequence
81
+ stats — per-residue counts/fractions, monoisotopic MW with
82
+ terminal water, Kyte-Doolittle GRAVY score, aromatic fraction.
83
+ - 65 unit tests covering alignment correctness (identity, gaps, full and
84
+ local coverage), matrix lookup and symmetry, mutation parsing
85
+ (including chain prefix and multi-mutant syntax), `Protein` mutation
86
+ with original-unchanged semantics, and all composition helpers.
87
+ - **`molforge.structure`: structural analysis subpackage.**
88
+ - **Superposition** (`superpose`, `kabsch_rmsd`, `SuperpositionResult`):
89
+ Kabsch / Umeyama optimal rigid-body alignment via SVD with proper-
90
+ rotation guarantee (no reflections), optional per-point weights for
91
+ masking outliers, returns rotation + translation + aligned coords +
92
+ RMSD.
93
+ - **RMSD** (`rmsd`, `rmsd_raw`, `rmsd_per_residue`): structure-to-
94
+ structure RMSD with five atom-subset selectors (``ca``, ``backbone``,
95
+ ``backbone_o``, ``all_heavy``, ``all``), optional alignment, and
96
+ per-residue breakdown for localizing structural differences.
97
+ - **Contacts** (`contact_map`, `distance_map`, `residue_contacts`):
98
+ binary contact maps at configurable cutoff, continuous distance
99
+ maps, and all-atom inter-residue contact listings with chain-pair
100
+ filtering for interface analysis.
101
+ - **Geometry** (`centroid`, `center_of_mass`, `radius_of_gyration`,
102
+ `bounding_box`, `translate`, `rotate`, `center_at_origin`): bulk
103
+ geometric properties (mass-weighted or geometric) and in-place
104
+ coordinate transforms that mutate the canonical `AtomArray` directly.
105
+ - Stubs (`NotImplementedError` as of this entry): `sasa`, `dssp` (`dssp` has since been implemented; see the `molforge.structure.dssp` entry above).
106
+ - 43 unit tests covering superposition correctness (identity, translation,
107
+ rotation, noisy alignment, proper-rotation guarantee, weighted),
108
+ RMSD computations across atom subsets, contact / distance map
109
+ symmetry and chain filtering, and all geometry operations.
110
+
111
+ - **`molforge.io.read_cif` / `write_cif`: mmCIF / PDBx implementation.**
112
+ - Full read/write of the ``_atom_site`` loop, the only mmCIF block
113
+ that holds atomic coordinate data.
114
+ - Hand-written tokenizer handles quoted strings, comments,
115
+ semicolon-bounded multi-line text fields, and the ``.``/``?`` sentinel
116
+ values for missing/unknown.
117
+ - Header metadata extracted: ``_entry.id``, ``_struct.title``,
118
+ ``_exptl.method``, ``_refine.ls_d_res_high``.
119
+ - Preference for ``auth_*`` columns (matching PDB conventions) with
120
+ fallback to ``label_*``, so PDB↔mmCIF round-trips preserve
121
+ author-assigned chain IDs and residue numbers.
122
+ - Reuses the same altloc resolution strategies and entity-type
123
+ classification as the PDB parser for behavioural consistency.
124
+ - ``CIFParseError`` and ``CIFWriteError`` for typed error handling.
125
+ - Wired into the top-level `load()` / `save()` dispatcher so
126
+ ``.cif`` and ``.mmcif`` extensions just work.
127
+ - 27 unit tests covering the tokenizer, parsing, write, round-trip
128
+ (CIF→CIF and PDB→CIF→Protein), dispatch, and error paths.
129
+ - **`molforge.wrappers.folding.ESMFold`: first fully-implemented engine wrapper.**
130
+ - Wraps Meta AI's ``facebook/esmfold_v1`` via HuggingFace
131
+ ``transformers``. Single-sequence folding (no MSA needed), fast,
132
+ GPU-friendly.
133
+ - Lazy import of ``torch`` and ``transformers`` keeps ``import
134
+ molforge`` cheap; missing-dependency errors point users at the
135
+ correct ``pip install 'molforge[ml]'`` extra.
136
+ - Configurable device (``cuda``/``cpu``/``mps``/auto), chunk size for
137
+ long-sequence memory management, and dtype (``float32``/``float16``).
138
+ - pLDDT exposed uniformly: per-atom in
139
+ ``metadata["confidence_per_atom"]``, per-residue in
140
+ ``metadata["confidence_per_residue"]``, scalar mean in
141
+ ``metadata["mean_confidence"]``.
142
+ - **`FoldingEngine` ABC**: full contract definition with ``predict`` (abstract),
143
+ ``predict_many`` (overridable batch), and a uniform per-residue
144
+ confidence convention so downstream code reads engine output the same
145
+ way regardless of which engine produced it.
146
+ - **`FoldingEngineNotInstalledError`**: dedicated exception type for missing
147
+ heavy dependencies, with actionable error messages.
148
+ - 17 unit tests covering construction, lazy loading, sequence
149
+ validation, missing-dependency error paths, and post-processing
150
+ (PDB-to-Protein conversion with confidence metadata). The end-to-end
151
+ fold test is marked ``@pytest.mark.slow`` and skipped unless ``torch``
152
+ is installed.
153
+
154
+ ### Changed
155
+ - **Project renamed from `biocore` to `molforge`** (PyPI name collision; the
156
+ `biocore` GitHub organization is a separate, established scientific
157
+ Python community). Import path is now `molforge`.
158
+ - README rewritten around the cross-tool workflow thesis: molforge is
159
+ positioned as connective tissue between docking, MD, folding, design,
160
+ and experimental tools, rather than primarily as a data-representation
161
+ library.
162
+
163
+ ### Added
164
+ - **`molforge.io`: file I/O subsystem.**
165
+ - **PDB reader and writer** (`read_pdb`, `write_pdb`, plus their
166
+ `*_string` variants). Handles the full wwPDB v3.30 column layout,
167
+ HEADER/TITLE/EXPDTA/REMARK 2 metadata, NMR multi-model files,
168
+ alternate locations (with three resolution strategies:
169
+ `highest_occupancy`, `first`, `all`, or a specific altloc id),
170
+ insertion codes, gzipped input/output, hydrogen filtering, and
171
+ automatic entity-type classification (protein / dna / rna / water /
172
+ ion / ligand) per residue.
173
+ - **FASTA reader and writer** (`read_fasta`, `write_fasta`, `*_string`
174
+ variants) with `FastaRecord` dataclass. Tolerant of multi-line
175
+ sequences, embedded digits, comments, and blank lines.
176
+ - **AlphaFold helpers** (`load_alphafold`, `is_alphafold_pdb`). Lifts
177
+ pLDDT out of the B-factor column into `protein.metadata["plddt"]`
178
+ (per atom), `protein.metadata["plddt_per_residue"]`, and
179
+ `protein.metadata["mean_plddt"]`. B-factor column preserved for
180
+ downstream-tool compatibility.
181
+ - **Top-level `load()`, `save()`, `fetch()`** dispatch by file
182
+ extension or explicit `format=` keyword. `fetch()` is stubbed
183
+ pending an HTTP utility.
184
+ - Format stubs (raising `NotImplementedError` with clear pointers):
185
+ `mmcif` (since implemented; see `read_cif` / `write_cif` above), `pdbqt`, `pqr`, `sdf`, `mol2`. The API surface is committed
186
+ so user code targeting these formats won't break when
187
+ implementations land.
188
+ - `PDBParseError`, `PDBWriteError` for typed error handling.
189
+ - 73 unit tests covering PDB parsing, writing, round-trip correctness on
190
+ real fixtures (dipeptide, NMR ensemble, altloc, insertion-coded),
191
+ FASTA edge cases (multiline, digits, comments, malformed input),
192
+ AlphaFold detection and pLDDT extraction, and dispatch behavior.
193
+ - `ACKNOWLEDGEMENTS.md` crediting Protkit, Biotite, Biopython,
194
+ BioPandas, MDAnalysis, OpenMM, RDKit, and the file-format
195
+ specifications we implement.
196
+ - **`molforge.core`: full implementation of the canonical data model.**
197
+ - `AtomArray`: flat, NumPy-backed source of truth with 15 typed fields
198
+ (coords, element, atom_name, residue_name, residue_id, insertion_code,
199
+ chain_id, b_factor, occupancy, charge, serial, record_type, entity_type,
200
+ altloc, model_id). Supports construction from a dict of arrays, boolean
201
+ selection (`select`, `where`), slicing, fancy indexing, and concatenation
202
+ (`append`). Lazily-computed and cached residue / chain boundary indices
203
+ (`residue_starts`, `chain_starts`) with explicit invalidation.
204
+ - `Atom`, `Residue`, `Chain`, `Protein`: lightweight hierarchical *views*
205
+ that hold a reference to a shared `AtomArray` plus an index range.
206
+ Mutations on the hierarchical side write through to the array, so the
207
+ two views never go out of sync.
208
+ - `Protein.select(**filters)`, `protein_only()`, `remove_water()` for
209
+ common substructure operations.
210
+ - First-class support for heterogeneous content: HETATM records,
211
+ ligands, waters, and ions are represented in the same array via
212
+ `record_type` and `entity_type` fields.
213
+ - Insertion codes, alternate locations (altloc), and multi-model
214
+ (NMR / trajectory) structures are modeled from day one.
215
+ - Constants module with three-letter ↔ one-letter mappings for the 20
216
+ canonical amino acids, common non-canonical residues (MSE, SEC,
217
+ phospho-S/T/Y, force-field-specific His variants), DNA / RNA
218
+ nucleotides, waters, and ions. Helper functions `three_to_one`,
219
+ `is_standard_amino_acid`, `is_water`, `is_ion`.
220
+ - 74 unit tests covering `AtomArray`, hierarchical views, constants, and
221
+ cross-cutting hierarchical ↔ linear consistency invariants.
222
+ - Initial repository skeleton with src-layout package structure.
223
+ - Hierarchical data model stubs for `Protein`, `Chain`, `Residue`, `Atom`.
224
+ - Linear / array view stubs (`AtomArray`) alongside hierarchical views.
225
+ - Top-level subpackages: `core`, `sequence`, `structure`, `md`, `docking`,
226
+ `ml`, `io`, `plugins`, `metrics`.
227
+ - Wrapper interface stubs for folding (AlphaFold/ColabFold, ESMFold, Boltz),
228
+ docking (AutoDock Vina, DiffDock), and MD (OpenMM, GROMACS).
229
+ - Plugin registry stub with entry-point discovery.
230
+ - `pyproject.toml` with PEP 621 metadata, extras (`structure`, `sequence`,
231
+ `md`, `docking`, `ml`, `io`, `all`, `dev`, `docs`), and tool config for
232
+ ruff, mypy, pytest, and coverage.
233
+ - GitHub Actions workflows for CI (lint, type-check, tests on Python 3.10-3.12),
234
+ documentation build, and release-to-PyPI on tag.
235
+ - Issue templates (bug, feature, question) and PR template.
236
+ - `CONTRIBUTING.md`, `CODE_OF_CONDUCT.md`, `SECURITY.md`, `CODEOWNERS`.
237
+ - Walkthrough notebook stubs for sequences, structures, MD, and docking.
238
+ - Pinned requirements files per extra under `requirements/`.
239
+
240
+ [Unreleased]: https://github.com/DoctorDean/molforge/compare/HEAD...HEAD
molforge-0.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Dean Sherry
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,246 @@
1
+ Metadata-Version: 2.4
2
+ Name: molforge
3
+ Version: 0.0.1
4
+ Summary: A unified Python library for structural bioinformatics, MD, protein engineering, and ML.
5
+ Project-URL: Homepage, https://github.com/DoctorDean/molforge
6
+ Project-URL: Documentation, https://molforge.readthedocs.io
7
+ Project-URL: Repository, https://github.com/DoctorDean/molforge
8
+ Project-URL: Issues, https://github.com/DoctorDean/molforge/issues
9
+ Project-URL: Changelog, https://github.com/DoctorDean/molforge/blob/main/CHANGELOG.md
10
+ Author: Dean Sherry
11
+ License: MIT License
12
+
13
+ Copyright (c) 2026 Dean Sherry
14
+
15
+ Permission is hereby granted, free of charge, to any person obtaining a copy
16
+ of this software and associated documentation files (the "Software"), to deal
17
+ in the Software without restriction, including without limitation the rights
18
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19
+ copies of the Software, and to permit persons to whom the Software is
20
+ furnished to do so, subject to the following conditions:
21
+
22
+ The above copyright notice and this permission notice shall be included in all
23
+ copies or substantial portions of the Software.
24
+
25
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31
+ SOFTWARE.
32
+ License-File: LICENSE
33
+ Keywords: bioinformatics,computational-biology,docking,machine-learning,molecular-dynamics,protein-engineering,structural-biology
34
+ Classifier: Development Status :: 3 - Alpha
35
+ Classifier: Intended Audience :: Science/Research
36
+ Classifier: License :: OSI Approved :: MIT License
37
+ Classifier: Operating System :: OS Independent
38
+ Classifier: Programming Language :: Python :: 3
39
+ Classifier: Programming Language :: Python :: 3.10
40
+ Classifier: Programming Language :: Python :: 3.11
41
+ Classifier: Programming Language :: Python :: 3.12
42
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
43
+ Classifier: Topic :: Scientific/Engineering :: Chemistry
44
+ Classifier: Typing :: Typed
45
+ Requires-Python: >=3.10
46
+ Requires-Dist: numpy>=1.24
47
+ Requires-Dist: typing-extensions>=4.7
48
+ Provides-Extra: all
49
+ Requires-Dist: biopython>=1.83; extra == 'all'
50
+ Requires-Dist: biotite>=0.40; extra == 'all'
51
+ Requires-Dist: mdtraj>=1.10; extra == 'all'
52
+ Requires-Dist: openmm>=8.1; (platform_system != 'Windows') and extra == 'all'
53
+ Requires-Dist: rdkit>=2023.9; extra == 'all'
54
+ Requires-Dist: scipy>=1.11; extra == 'all'
55
+ Requires-Dist: torch>=2.1; extra == 'all'
56
+ Requires-Dist: transformers>=4.40; extra == 'all'
57
+ Provides-Extra: dev
58
+ Requires-Dist: build>=1.0; extra == 'dev'
59
+ Requires-Dist: hypothesis>=6.92; extra == 'dev'
60
+ Requires-Dist: mypy>=1.8; extra == 'dev'
61
+ Requires-Dist: pre-commit>=3.6; extra == 'dev'
62
+ Requires-Dist: pytest-cov>=4.1; extra == 'dev'
63
+ Requires-Dist: pytest-xdist>=3.5; extra == 'dev'
64
+ Requires-Dist: pytest>=7.4; extra == 'dev'
65
+ Requires-Dist: ruff>=0.5; extra == 'dev'
66
+ Requires-Dist: twine>=4.0; extra == 'dev'
67
+ Provides-Extra: docking
68
+ Requires-Dist: rdkit>=2023.9; extra == 'docking'
69
+ Provides-Extra: docs
70
+ Requires-Dist: mkdocs-jupyter>=0.24; extra == 'docs'
71
+ Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
72
+ Requires-Dist: mkdocs>=1.5; extra == 'docs'
73
+ Requires-Dist: mkdocstrings[python]>=0.24; extra == 'docs'
74
+ Provides-Extra: io
75
+ Requires-Dist: biopython>=1.83; extra == 'io'
76
+ Requires-Dist: biotite>=0.40; extra == 'io'
77
+ Provides-Extra: md
78
+ Requires-Dist: mdtraj>=1.10; extra == 'md'
79
+ Requires-Dist: openmm>=8.1; (platform_system != 'Windows') and extra == 'md'
80
+ Provides-Extra: ml
81
+ Requires-Dist: torch>=2.1; extra == 'ml'
82
+ Requires-Dist: transformers>=4.40; extra == 'ml'
83
+ Provides-Extra: sequence
84
+ Requires-Dist: biopython>=1.83; extra == 'sequence'
85
+ Provides-Extra: structure
86
+ Requires-Dist: biopython>=1.83; extra == 'structure'
87
+ Requires-Dist: biotite>=0.40; extra == 'structure'
88
+ Requires-Dist: scipy>=1.11; extra == 'structure'
89
+ Description-Content-Type: text/markdown
90
+
91
+ # molforge
92
+
93
+ [![CI](https://github.com/DoctorDean/molforge/actions/workflows/ci.yml/badge.svg)](https://github.com/DoctorDean/molforge/actions/workflows/ci.yml)
94
+ [![PyPI version](https://img.shields.io/pypi/v/molforge.svg)](https://pypi.org/project/molforge/)
95
+ [![Python versions](https://img.shields.io/pypi/pyversions/molforge.svg)](https://pypi.org/project/molforge/)
96
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE)
97
+ [![Code style: ruff](https://img.shields.io/badge/code%20style-ruff-000000.svg)](https://github.com/astral-sh/ruff)
98
+
99
+ ![Molforge Logo](molforge.png)
100
+
101
+ > **A forge for protein workflows.** One Python script, every tool in your stack: docking, MD, folding, antibody and nanobody engineering, de novo design — without the format-conversion tax.
102
+
103
+ `molforge` is an open-source Python library that lets you compose protein workflows across the tools you already use. Bring your structures and sequences in, plug in your engines of choice (Vina, OpenMM, ESMFold, AlphaFold, RFdiffusion, ProteinMPNN, your own model), and walk out with a coherent pipeline instead of five incompatible Python environments and a graveyard of conversion scripts.
104
+
105
+ ---
106
+
107
+ ## Why molforge exists
108
+
109
+ Modern protein work is multi-tool by nature. A real antibody-design loop might fold a sequence with ESMFold, identify CDR loops with anarci, score binding with FoldX, dock against a target with AutoDock Vina, relax with OpenMM, then evaluate with Rosetta. **Each of those tools speaks its own dialect**: different file formats, different atom-naming conventions, different ideas of what "the structure" is. Most of an engineer's day disappears into glue code.
110
+
111
+ `molforge` is the connective tissue. It provides:
112
+
113
+ - A **canonical, NumPy-backed data model** that's cheap to convert in and out of — so every engine in your pipeline reads from and writes to the same representation.
114
+ - **Thin wrappers** around the engines you already trust, with consistent interfaces (so swapping ESMFold for AlphaFold is one line, not a refactor).
115
+ - **First-class IO** for the messy reality of structural-bio files: PDB, mmCIF, FASTA, PDBQT, PQR, SDF, MOL2, and AlphaFold predictions with pLDDT.
116
+ - A **plugin registry** so the next docking engine, folding model, or scoring function can slot into your pipeline without forking molforge.
117
+
118
+ Built as a library, not a framework: there's no orchestrator, no DAG runtime, no decorators you have to import to make things work. Use whatever workflow tool you like — Snakemake, Nextflow, Prefect, a shell script — molforge is just imports.
119
+
120
+ ## Design principles
121
+
122
+ 1. **Workflows over silos.** Every design decision is judged by "does this make it easier to chain N tools together?"
123
+ 2. **Wrappers, not reimplementations.** We don't rebuild OpenMM or AutoDock. We give them a shared vocabulary.
124
+ 3. **One data model, two views.** Hierarchical (`protein.chains["A"].residues[42]`) for biology, linear (`protein.atom_array.coords`) for ML — same data, no conversion.
125
+ 4. **Heterogeneous content is first-class.** Antibodies have glycans. Drug targets have ligands and ions. Membrane proteins have lipids. The data model handles all of it without an awkward special case for "non-protein."
126
+ 5. **Typed, tested, documented.** Strict mypy, ruff-clean, >90% coverage target.
127
+
128
+ ## Installation
129
+
130
+ ```bash
131
+ # minimal core (data model + sequence + basic IO)
132
+ pip install molforge
133
+
134
+ # with structure analysis (RMSD, SASA, contacts)
135
+ pip install "molforge[structure]"
136
+
137
+ # with ML wrappers (torch, transformers, esm)
138
+ pip install "molforge[ml]"
139
+
140
+ # with MD support (openmm, mdtraj)
141
+ pip install "molforge[md]"
142
+
143
+ # with docking (rdkit for small molecules)
144
+ pip install "molforge[docking]"
145
+
146
+ # everything
147
+ pip install "molforge[all]"
148
+
149
+ # development
150
+ git clone https://github.com/DoctorDean/molforge.git
151
+ cd molforge
152
+ pip install -e ".[dev,all]"
153
+ ```
154
+
155
+ ## Quickstart
156
+
157
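+ The smallest end-to-end example that shows the cross-tool point (the OpenMM step illustrates the planned wrapper API; per the Status section, only the ESMFold and Vina wrappers work end-to-end today):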
158
+
159
+ ```python
160
+ import molforge as mf
161
+ from molforge.wrappers.folding import ESMFold
162
+ from molforge.wrappers.docking import Vina
163
+ from molforge.wrappers.md import OpenMM
164
+
165
+ # 1. Fold a sequence
166
+ folded = ESMFold().predict("MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVS...")
167
+
168
+ # 2. Save as PDB, save as mmCIF, hand to anything
169
+ mf.save(folded, "candidate.pdb")
170
+ mf.save(folded, "candidate.cif")
171
+
172
+ # 3. Dock a ligand against it (Vina-prepared PDBQT files)
173
+ result = Vina().dock(
174
+ receptor="receptor.pdbqt",
175
+ ligand="ligand.pdbqt",
176
+ center=(10.0, 5.0, -2.0),
177
+ box_size=(20.0, 20.0, 20.0),
178
+ )
179
+ top_pose = result.best
180
+
181
+ # 4. Drop into MD for relaxation
182
+ trajectory = OpenMM().simulate(top_pose.complex, steps=10_000)
183
+
184
+ # 5. Inspect — hierarchical or linear, your call
185
+ print(folded.sequence) # one-letter per chain
186
+ print(folded.atom_array.coords.shape) # (N, 3) NumPy array
187
+ ca = folded.chains["A"].residues[42].atoms["CA"] # specific atom
188
+ ```
189
+
190
+ Notice what *isn't* there: file-format conversions, atom-name remapping, hand-rolled PDB parsers, custom data classes per engine. molforge does that work so your script reads like the science you're actually doing.
191
+
192
+ ## Repository structure
193
+
194
+ ```
195
+ molforge/
196
+ ├── src/molforge/ # Library source (src-layout)
197
+ │ ├── core/ # Hierarchical + linear data model
198
+ │ ├── sequence/ # Sequence operations, alignment, mutations
199
+ │ ├── structure/ # RMSD, SASA, contacts, geometry
200
+ │ ├── md/ # MD trajectories and analysis
201
+ │ ├── docking/ # Docking abstractions and pose handling
202
+ │ ├── ml/ # ML utilities, featurizers, tensor views
203
+ │ ├── io/ # PDB, mmCIF, FASTA, PDBQT, PQR, SDF, MOL2
204
+ │ ├── plugins/ # Plugin registry and entry-point discovery
205
+ │ ├── metrics/ # TM-score, lDDT, GDT-TS, docking metrics
206
+ │ └── wrappers/ # Thin interfaces to external engines
207
+ │ ├── folding/ # AlphaFold, ESMFold, Boltz, Rosetta
208
+ │ ├── docking/ # AutoDock Vina, DiffDock
209
+ │ └── md/ # OpenMM, GROMACS
210
+ ├── tests/ # pytest suite (unit + integration)
211
+ ├── docs/ # Architecture docs and reference
212
+ ├── notebooks/ # Walkthroughs and worked examples
213
+ ├── plugins/ # Example external plugins
214
+ ├── pyproject.toml # Build config, deps, tool config
215
+ └── ACKNOWLEDGEMENTS.md # Prior art and intellectual debts
216
+ ```
217
+
218
+ A deeper architecture walkthrough is in [`docs/architecture/overview.md`](docs/architecture/overview.md).
219
+
220
+ ## Status
221
+
222
+ molforge is **pre-1.0** and under active development. What's working today:
223
+
224
+ - **Core data model** — `Protein` / `Chain` / `Residue` / `Atom` over a canonical NumPy-backed `AtomArray`, with first-class heterogeneous content (ligands, water, ions, modified residues).
225
+ - **File I/O** — full read/write for **PDB** (with NMR ensembles, altlocs, insertion codes) and **mmCIF** (the modern format for large structures); **FASTA** sequence I/O; **AlphaFold** loader that surfaces pLDDT as a first-class field. PDBQT, PQR, SDF, MOL2 are stubbed with committed APIs.
226
+ - **Sequence operations** — pairwise **alignment** (Needleman-Wunsch / Smith-Waterman with BLOSUM62 / PAM250), point **mutations** with protein-engineering notation (`A123V`, `A123V/T56K`, `H:K42N`), composition and property helpers (MW, GRAVY, aromaticity).
227
+ - **Structural analysis** — Kabsch/Umeyama **superposition**, **RMSD** (whole-structure and per-residue, multiple atom subsets), **contact and distance maps**, **radius of gyration**, **centroid / center of mass**, in-place **translate / rotate**, **DSSP** secondary-structure assignment (8-state and 3-state, no external binary).
228
+ - **Two engine wrappers working end-to-end** — **ESMFold** (sequence → folded `Protein`, `pip install 'molforge[ml]'`) and **AutoDock Vina** (receptor + ligand → ranked docking poses, `pip install vina meeko`). The wrapper pattern is now validated across both folding and docking categories; the rest will follow the same template.
229
+
230
+ Coming next: SASA, backbone dihedrals (φ/ψ/ω), `meeko`-based receptor/ligand prep for Vina, additional engine wrappers (AlphaFold, OpenMM). See [`CHANGELOG.md`](CHANGELOG.md) for the full picture.
231
+
232
+ ## Acknowledgements
233
+
234
+ molforge is inspired by [Protkit](https://github.com/silicogenesis/protkit) (SilicoGenesis), which pioneered the idea of a unified, hierarchical representation for protein structures in Python. molforge extends that direction toward cross-tool, cross-format workflows and a different internal architecture (NumPy-backed linear store, hierarchical views as accessors). See [`ACKNOWLEDGEMENTS.md`](ACKNOWLEDGEMENTS.md) for the longer list of projects we've learned from.
235
+
236
+ ## Contributing
237
+
238
+ We welcome contributions. See [`CONTRIBUTING.md`](CONTRIBUTING.md) and the [Code of Conduct](CODE_OF_CONDUCT.md) before opening an issue or PR.
239
+
240
+ ## License
241
+
242
+ MIT — see [`LICENSE`](LICENSE).
243
+
244
+ ## Citation
245
+
246
+ If you use `molforge` in academic work, please cite us (BibTeX coming with the first tagged release).