pymisha 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pymisha-0.1.0/LICENSE +21 -0
- pymisha-0.1.0/MANIFEST.in +4 -0
- pymisha-0.1.0/PKG-INFO +115 -0
- pymisha-0.1.0/README.md +76 -0
- pymisha-0.1.0/pymisha/__init__.py +354 -0
- pymisha-0.1.0/pymisha/_crc64.py +34 -0
- pymisha-0.1.0/pymisha/_name_validation.py +22 -0
- pymisha-0.1.0/pymisha/_pymisha.pyi +550 -0
- pymisha-0.1.0/pymisha/_quadtree.py +592 -0
- pymisha-0.1.0/pymisha/_safe_eval.py +147 -0
- pymisha-0.1.0/pymisha/_safe_pickle.py +49 -0
- pymisha-0.1.0/pymisha/_shared.py +248 -0
- pymisha-0.1.0/pymisha/analysis.py +559 -0
- pymisha-0.1.0/pymisha/dataset.py +552 -0
- pymisha-0.1.0/pymisha/db.py +414 -0
- pymisha-0.1.0/pymisha/db_attrs.py +245 -0
- pymisha-0.1.0/pymisha/db_create.py +989 -0
- pymisha-0.1.0/pymisha/expr.py +104 -0
- pymisha-0.1.0/pymisha/extract.py +648 -0
- pymisha-0.1.0/pymisha/gdir.py +319 -0
- pymisha-0.1.0/pymisha/gsynth.py +1411 -0
- pymisha-0.1.0/pymisha/intervals.py +4202 -0
- pymisha-0.1.0/pymisha/liftover.py +1650 -0
- pymisha-0.1.0/pymisha/lookup.py +513 -0
- pymisha-0.1.0/pymisha/sequence.py +804 -0
- pymisha-0.1.0/pymisha/summary.py +1742 -0
- pymisha-0.1.0/pymisha/tracks.py +3173 -0
- pymisha-0.1.0/pymisha/vtracks.py +1435 -0
- pymisha-0.1.0/pymisha.egg-info/PKG-INFO +115 -0
- pymisha-0.1.0/pymisha.egg-info/SOURCES.txt +190 -0
- pymisha-0.1.0/pymisha.egg-info/dependency_links.txt +1 -0
- pymisha-0.1.0/pymisha.egg-info/not-zip-safe +1 -0
- pymisha-0.1.0/pymisha.egg-info/requires.txt +13 -0
- pymisha-0.1.0/pymisha.egg-info/top_level.txt +2 -0
- pymisha-0.1.0/pyproject.toml +92 -0
- pymisha-0.1.0/setup.cfg +4 -0
- pymisha-0.1.0/setup.py +25 -0
- pymisha-0.1.0/src/BinFinder.cpp +36 -0
- pymisha-0.1.0/src/BinFinder.h +108 -0
- pymisha-0.1.0/src/BinsManager.h +127 -0
- pymisha-0.1.0/src/BufferedFile.cpp +72 -0
- pymisha-0.1.0/src/BufferedFile.h +269 -0
- pymisha-0.1.0/src/CRC64.h +85 -0
- pymisha-0.1.0/src/DnaPSSM.cpp +1242 -0
- pymisha-0.1.0/src/DnaPSSM.h +271 -0
- pymisha-0.1.0/src/GInterval.cpp +115 -0
- pymisha-0.1.0/src/GInterval.h +127 -0
- pymisha-0.1.0/src/GenomeChromKey.h +129 -0
- pymisha-0.1.0/src/GenomeIndex.cpp +193 -0
- pymisha-0.1.0/src/GenomeIndex.h +75 -0
- pymisha-0.1.0/src/GenomeSeqFetch.cpp +181 -0
- pymisha-0.1.0/src/GenomeSeqFetch.h +50 -0
- pymisha-0.1.0/src/GenomeSeqScorer.cpp +36 -0
- pymisha-0.1.0/src/GenomeSeqScorer.h +33 -0
- pymisha-0.1.0/src/GenomeTrack.cpp +317 -0
- pymisha-0.1.0/src/GenomeTrack.h +104 -0
- pymisha-0.1.0/src/GenomeTrack1D.h +106 -0
- pymisha-0.1.0/src/GenomeTrackFixedBin.cpp +429 -0
- pymisha-0.1.0/src/GenomeTrackFixedBin.h +116 -0
- pymisha-0.1.0/src/GenomeTrackSparse.cpp +281 -0
- pymisha-0.1.0/src/GenomeTrackSparse.h +177 -0
- pymisha-0.1.0/src/GenomeUtils.cpp +56 -0
- pymisha-0.1.0/src/GenomeUtils.h +18 -0
- pymisha-0.1.0/src/HashFunc.h +49 -0
- pymisha-0.1.0/src/IncrementalWilcox.cpp +150 -0
- pymisha-0.1.0/src/IncrementalWilcox.h +61 -0
- pymisha-0.1.0/src/KmerCounter.cpp +231 -0
- pymisha-0.1.0/src/KmerCounter.h +49 -0
- pymisha-0.1.0/src/MaskUtils.h +53 -0
- pymisha-0.1.0/src/MaskedBpCounter.cpp +64 -0
- pymisha-0.1.0/src/MaskedBpCounter.h +30 -0
- pymisha-0.1.0/src/PMDataFrame.cpp +195 -0
- pymisha-0.1.0/src/PMDataFrame.h +253 -0
- pymisha-0.1.0/src/PMDb.cpp +277 -0
- pymisha-0.1.0/src/PMDb.h +83 -0
- pymisha-0.1.0/src/PMFindNeighbors.cpp +387 -0
- pymisha-0.1.0/src/PMGsynth.cpp +923 -0
- pymisha-0.1.0/src/PMObject.h +96 -0
- pymisha-0.1.0/src/PMStubs.cpp +4827 -0
- pymisha-0.1.0/src/PMTrackCreate.cpp +708 -0
- pymisha-0.1.0/src/PMTrackExpressionIterator.cpp +257 -0
- pymisha-0.1.0/src/PMTrackExpressionIterator.h +126 -0
- pymisha-0.1.0/src/PMTrackExpressionScanner.cpp +481 -0
- pymisha-0.1.0/src/PMTrackExpressionScanner.h +149 -0
- pymisha-0.1.0/src/PMTrackExpressionVars.cpp +311 -0
- pymisha-0.1.0/src/PMTrackExpressionVars.h +104 -0
- pymisha-0.1.0/src/PMTrackIndexedFormat.cpp +357 -0
- pymisha-0.1.0/src/PMVTrack.cpp +1138 -0
- pymisha-0.1.0/src/PMWilcox.cpp +539 -0
- pymisha-0.1.0/src/PWMScorer.cpp +1415 -0
- pymisha-0.1.0/src/PWMScorer.h +182 -0
- pymisha-0.1.0/src/RaList.h +132 -0
- pymisha-0.1.0/src/Random.h +47 -0
- pymisha-0.1.0/src/RandomShuffle.h +26 -0
- pymisha-0.1.0/src/Segment.h +62 -0
- pymisha-0.1.0/src/SegmentFinder.h +283 -0
- pymisha-0.1.0/src/StratifiedMarkovModel.cpp +341 -0
- pymisha-0.1.0/src/StratifiedMarkovModel.h +190 -0
- pymisha-0.1.0/src/StreamPercentiler.h +242 -0
- pymisha-0.1.0/src/StreamSampler.h +81 -0
- pymisha-0.1.0/src/TGLException.cpp +60 -0
- pymisha-0.1.0/src/TGLException.h +112 -0
- pymisha-0.1.0/src/TrackIndex.cpp +201 -0
- pymisha-0.1.0/src/TrackIndex.h +112 -0
- pymisha-0.1.0/src/config.h +46 -0
- pymisha-0.1.0/src/pmutils.h +68 -0
- pymisha-0.1.0/src/port.h +29 -0
- pymisha-0.1.0/src/pymisha.cpp +667 -0
- pymisha-0.1.0/src/pymisha.h +303 -0
- pymisha-0.1.0/src/pymisha_init.cpp +169 -0
- pymisha-0.1.0/src/util.h +102 -0
- pymisha-0.1.0/src/utils/RunningLogSumExp.h +179 -0
- pymisha-0.1.0/src/utils/RunningMaxDeque.h +107 -0
- pymisha-0.1.0/tests/test_band_intersect.py +166 -0
- pymisha-0.1.0/tests/test_benchmarks.py +837 -0
- pymisha-0.1.0/tests/test_dataset_and_alias.py +750 -0
- pymisha-0.1.0/tests/test_dataset_resolution.py +40 -0
- pymisha-0.1.0/tests/test_db_admin.py +147 -0
- pymisha-0.1.0/tests/test_expr_aliasing.py +103 -0
- pymisha-0.1.0/tests/test_fd_safety.py +22 -0
- pymisha-0.1.0/tests/test_gbins.py +375 -0
- pymisha-0.1.0/tests/test_gcis_decay.py +604 -0
- pymisha-0.1.0/tests/test_gcor.py +306 -0
- pymisha-0.1.0/tests/test_gdb_convert_to_indexed.py +560 -0
- pymisha-0.1.0/tests/test_gdb_create.py +1066 -0
- pymisha-0.1.0/tests/test_gdb_info.py +150 -0
- pymisha-0.1.0/tests/test_gdir.py +225 -0
- pymisha-0.1.0/tests/test_gdist.py +156 -0
- pymisha-0.1.0/tests/test_gdist_vtrack_streaming.py +413 -0
- pymisha-0.1.0/tests/test_gextract.py +220 -0
- pymisha-0.1.0/tests/test_gextract_2d.py +1253 -0
- pymisha-0.1.0/tests/test_gextract_colnames.py +59 -0
- pymisha-0.1.0/tests/test_gintervals.py +1172 -0
- pymisha-0.1.0/tests/test_gintervals_constructors.py +97 -0
- pymisha-0.1.0/tests/test_gintervals_import_genes.py +605 -0
- pymisha-0.1.0/tests/test_gintervals_load_save.py +186 -0
- pymisha-0.1.0/tests/test_gintervals_management.py +63 -0
- pymisha-0.1.0/tests/test_gintervals_mapply.py +141 -0
- pymisha-0.1.0/tests/test_gintervals_neighbors.py +481 -0
- pymisha-0.1.0/tests/test_gintervals_neighbors_directional.py +360 -0
- pymisha-0.1.0/tests/test_gintervals_summary_quantiles.py +180 -0
- pymisha-0.1.0/tests/test_gintervals_update.py +88 -0
- pymisha-0.1.0/tests/test_gintervals_utils.py +554 -0
- pymisha-0.1.0/tests/test_giterator_cartesian_grid.py +59 -0
- pymisha-0.1.0/tests/test_glookup.py +240 -0
- pymisha-0.1.0/tests/test_glookup_streaming.py +351 -0
- pymisha-0.1.0/tests/test_golden_master.py +785 -0
- pymisha-0.1.0/tests/test_golden_master_advanced_intervals.py +128 -0
- pymisha-0.1.0/tests/test_golden_master_liftover.py +134 -0
- pymisha-0.1.0/tests/test_golden_master_sequence.py +145 -0
- pymisha-0.1.0/tests/test_golden_master_stats.py +303 -0
- pymisha-0.1.0/tests/test_golden_master_vtracks.py +252 -0
- pymisha-0.1.0/tests/test_gpartition.py +228 -0
- pymisha-0.1.0/tests/test_gquantiles.py +42 -0
- pymisha-0.1.0/tests/test_gsample.py +321 -0
- pymisha-0.1.0/tests/test_gsegment.py +200 -0
- pymisha-0.1.0/tests/test_gseq.py +127 -0
- pymisha-0.1.0/tests/test_gseq_kmer.py +186 -0
- pymisha-0.1.0/tests/test_gseq_pwm.py +1169 -0
- pymisha-0.1.0/tests/test_gsummary.py +148 -0
- pymisha-0.1.0/tests/test_gsynth.py +1669 -0
- pymisha-0.1.0/tests/test_gsynth_parallel.py +629 -0
- pymisha-0.1.0/tests/test_gtrack_attr.py +135 -0
- pymisha-0.1.0/tests/test_gtrack_attr_import.py +117 -0
- pymisha-0.1.0/tests/test_gtrack_create_empty_indexed.py +48 -0
- pymisha-0.1.0/tests/test_gtrack_create_pwm_energy.py +431 -0
- pymisha-0.1.0/tests/test_gtrack_exists.py +48 -0
- pymisha-0.1.0/tests/test_gtrack_lookup.py +354 -0
- pymisha-0.1.0/tests/test_gtrack_ls.py +230 -0
- pymisha-0.1.0/tests/test_gtrack_var.py +213 -0
- pymisha-0.1.0/tests/test_gvtrack_filter.py +1424 -0
- pymisha-0.1.0/tests/test_gwilcox.py +229 -0
- pymisha-0.1.0/tests/test_import_contacts.py +184 -0
- pymisha-0.1.0/tests/test_init_exports.py +13 -0
- pymisha-0.1.0/tests/test_intervals_2d.py +144 -0
- pymisha-0.1.0/tests/test_intervals_indexed.py +126 -0
- pymisha-0.1.0/tests/test_iterator_policy.py +25 -0
- pymisha-0.1.0/tests/test_liftover.py +2129 -0
- pymisha-0.1.0/tests/test_multi_db.py +663 -0
- pymisha-0.1.0/tests/test_multitask.py +203 -0
- pymisha-0.1.0/tests/test_optimization_summary.py +90 -0
- pymisha-0.1.0/tests/test_pwm_sliding_window.py +1021 -0
- pymisha-0.1.0/tests/test_pwm_spatial.py +356 -0
- pymisha-0.1.0/tests/test_security_robustness.py +150 -0
- pymisha-0.1.0/tests/test_track2d.py +362 -0
- pymisha-0.1.0/tests/test_track_create_import.py +385 -0
- pymisha-0.1.0/tests/test_track_indexed.py +33 -0
- pymisha-0.1.0/tests/test_track_liftover.py +1313 -0
- pymisha-0.1.0/tests/test_track_modify_smooth.py +425 -0
- pymisha-0.1.0/tests/test_vtrack_iterator_2d.py +60 -0
- pymisha-0.1.0/tests/test_vtrack_lse.py +1313 -0
- pymisha-0.1.0/tests/test_vtracks.py +2372 -0
pymisha-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024-2026 Weizmann Institute of Science
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
pymisha-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pymisha
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Python interface for misha genomic databases with C++ streaming backends
|
|
5
|
+
Author-email: Aviezer Lifshitz <aviezerl@weizmann.ac.il>
|
|
6
|
+
Maintainer-email: Aviezer Lifshitz <aviezerl@weizmann.ac.il>
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/tanaylab/pymisha
|
|
9
|
+
Project-URL: Documentation, https://tanaylab.github.io/pymisha/
|
|
10
|
+
Project-URL: Repository, https://github.com/tanaylab/pymisha.git
|
|
11
|
+
Project-URL: Issues, https://github.com/tanaylab/pymisha/issues
|
|
12
|
+
Keywords: genomics,bioinformatics,misha,tracks
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: C++
|
|
22
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
23
|
+
Classifier: Operating System :: MacOS :: MacOS X
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Requires-Dist: numpy>=1.20
|
|
28
|
+
Requires-Dist: pandas>=1.3
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
32
|
+
Requires-Dist: ruff>=0.6; extra == "dev"
|
|
33
|
+
Requires-Dist: mypy; extra == "dev"
|
|
34
|
+
Provides-Extra: docs
|
|
35
|
+
Requires-Dist: sphinx>=7.0; extra == "docs"
|
|
36
|
+
Requires-Dist: myst-parser>=2.0; extra == "docs"
|
|
37
|
+
Requires-Dist: furo>=2024.8.6; extra == "docs"
|
|
38
|
+
Dynamic: license-file
|
|
39
|
+
|
|
40
|
+
# PyMisha
|
|
41
|
+
|
|
42
|
+
Python interface for [misha](https://github.com/tanaylab/misha) genomic databases. PyMisha provides full read/write access to misha track databases with C++ streaming backends for genome-scale operations.
|
|
43
|
+
|
|
44
|
+
## Features
|
|
45
|
+
|
|
46
|
+
- **1D and 2D track support:** Dense, sparse, and 2D (rectangle/point) tracks with full CRUD operations.
|
|
47
|
+
- **C++ streaming backends:** Extraction, summary, quantiles, distribution, lookup, segmentation, Wilcoxon tests, correlation, and sampling all stream through C++ for performance.
|
|
48
|
+
- **Virtual tracks:** Computed-on-the-fly track views with filtering, shifting, and 30+ aggregation functions.
|
|
49
|
+
- **Interval operations:** Union, intersection, difference, canonicalization, neighbors, annotation, normalization, random generation, and liftover.
|
|
50
|
+
- **Sequence analysis:** Extraction, k-mer counting, PWM/PSSM scoring, and Markov-chain synthesis (`gsynth`).
|
|
51
|
+
- **Database management:** Create, link, convert, and manage misha-compatible genomic databases.
|
|
52
|
+
- **R misha compatibility:** Reads and writes the same on-disk formats as R misha (123/145 R exports covered).
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
Prerequisites:
|
|
57
|
+
- Python 3.10+
|
|
58
|
+
- C++17 compiler (GCC 8+, Clang 7+, or Apple Clang 11+)
|
|
59
|
+
- `numpy`, `pandas`
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install .
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
For development:
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install -e ".[dev]"
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Quick start
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
import pymisha as pm
|
|
75
|
+
|
|
76
|
+
# Initialize the database
|
|
77
|
+
pm.gdb_init("/path/to/misha_db")
|
|
78
|
+
|
|
79
|
+
# Create intervals and extract data
|
|
80
|
+
intervals = pm.gintervals_from_strings(["chr1:0-1000", "chr1:2000-2600"])
|
|
81
|
+
out = pm.gextract("track1", intervals, iterator=100)
|
|
82
|
+
|
|
83
|
+
# Filter and summarize
|
|
84
|
+
filtered = pm.gscreen("track1 > 0.5", intervals)
|
|
85
|
+
stats = pm.gsummary("track1", intervals)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Examples
|
|
89
|
+
|
|
90
|
+
Using the built-in example database:
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
import pymisha as pm
|
|
94
|
+
|
|
95
|
+
pm.gdb_init_examples()
|
|
96
|
+
print(pm.gtrack_ls())
|
|
97
|
+
print(pm.gextract("dense_track", pm.gintervals("chr1", 0, 1000)))
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Optional dependencies
|
|
101
|
+
|
|
102
|
+
- `pyBigWig`: For BigWig import in `gtrack_import`.
|
|
103
|
+
- `pyreadr` + `Rscript`: For loading R-serialized big interval sets.
|
|
104
|
+
- `PyYAML`: For richer `gdataset_info` metadata parsing.
|
|
105
|
+
|
|
106
|
+
## Missing features
|
|
107
|
+
|
|
108
|
+
Compared to R misha, the following are not yet implemented:
|
|
109
|
+
|
|
110
|
+
- **Track Arrays:** `gtrack.array.*` and `gvtrack.array.slice`.
|
|
111
|
+
- **Legacy Conversion:** `gtrack.convert` (for migrating old 2D formats).
|
|
112
|
+
|
|
113
|
+
## License
|
|
114
|
+
|
|
115
|
+
MIT. See [LICENSE](LICENSE) for details.
|
pymisha-0.1.0/README.md
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# PyMisha
|
|
2
|
+
|
|
3
|
+
Python interface for [misha](https://github.com/tanaylab/misha) genomic databases. PyMisha provides full read/write access to misha track databases with C++ streaming backends for genome-scale operations.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **1D and 2D track support:** Dense, sparse, and 2D (rectangle/point) tracks with full CRUD operations.
|
|
8
|
+
- **C++ streaming backends:** Extraction, summary, quantiles, distribution, lookup, segmentation, Wilcoxon tests, correlation, and sampling all stream through C++ for performance.
|
|
9
|
+
- **Virtual tracks:** Computed-on-the-fly track views with filtering, shifting, and 30+ aggregation functions.
|
|
10
|
+
- **Interval operations:** Union, intersection, difference, canonicalization, neighbors, annotation, normalization, random generation, and liftover.
|
|
11
|
+
- **Sequence analysis:** Extraction, k-mer counting, PWM/PSSM scoring, and Markov-chain synthesis (`gsynth`).
|
|
12
|
+
- **Database management:** Create, link, convert, and manage misha-compatible genomic databases.
|
|
13
|
+
- **R misha compatibility:** Reads and writes the same on-disk formats as R misha (123/145 R exports covered).
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
Prerequisites:
|
|
18
|
+
- Python 3.10+
|
|
19
|
+
- C++17 compiler (GCC 8+, Clang 7+, or Apple Clang 11+)
|
|
20
|
+
- `numpy`, `pandas`
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install .
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
For development:
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install -e ".[dev]"
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Quick start
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
import pymisha as pm
|
|
36
|
+
|
|
37
|
+
# Initialize the database
|
|
38
|
+
pm.gdb_init("/path/to/misha_db")
|
|
39
|
+
|
|
40
|
+
# Create intervals and extract data
|
|
41
|
+
intervals = pm.gintervals_from_strings(["chr1:0-1000", "chr1:2000-2600"])
|
|
42
|
+
out = pm.gextract("track1", intervals, iterator=100)
|
|
43
|
+
|
|
44
|
+
# Filter and summarize
|
|
45
|
+
filtered = pm.gscreen("track1 > 0.5", intervals)
|
|
46
|
+
stats = pm.gsummary("track1", intervals)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Examples
|
|
50
|
+
|
|
51
|
+
Using the built-in example database:
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
import pymisha as pm
|
|
55
|
+
|
|
56
|
+
pm.gdb_init_examples()
|
|
57
|
+
print(pm.gtrack_ls())
|
|
58
|
+
print(pm.gextract("dense_track", pm.gintervals("chr1", 0, 1000)))
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Optional dependencies
|
|
62
|
+
|
|
63
|
+
- `pyBigWig`: For BigWig import in `gtrack_import`.
|
|
64
|
+
- `pyreadr` + `Rscript`: For loading R-serialized big interval sets.
|
|
65
|
+
- `PyYAML`: For richer `gdataset_info` metadata parsing.
|
|
66
|
+
|
|
67
|
+
## Missing features
|
|
68
|
+
|
|
69
|
+
Compared to R misha, the following are not yet implemented:
|
|
70
|
+
|
|
71
|
+
- **Track Arrays:** `gtrack.array.*` and `gvtrack.array.slice`.
|
|
72
|
+
- **Legacy Conversion:** `gtrack.convert` (for migrating old 2D formats).
|
|
73
|
+
|
|
74
|
+
## License
|
|
75
|
+
|
|
76
|
+
MIT. See [LICENSE](LICENSE) for details.
|
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PyMisha - Python wrapper for the misha Genomic Data Analysis Toolkit
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
__version__ = '0.1.0'
|
|
6
|
+
|
|
7
|
+
# Make numpy available for expressions
|
|
8
|
+
import numpy as np # noqa: F401
|
|
9
|
+
|
|
10
|
+
from . import _shared
|
|
11
|
+
from ._shared import (
|
|
12
|
+
CONFIG,
|
|
13
|
+
_bound_colname,
|
|
14
|
+
_checkroot,
|
|
15
|
+
_chunk_slices,
|
|
16
|
+
_df2pymisha,
|
|
17
|
+
_iterated_intervals,
|
|
18
|
+
_itr2pymisha,
|
|
19
|
+
_make_progress_callback,
|
|
20
|
+
_progress_context,
|
|
21
|
+
_pymisha,
|
|
22
|
+
_pymisha2df,
|
|
23
|
+
)
|
|
24
|
+
from .analysis import gcis_decay, gsegment, gwilcox
|
|
25
|
+
from .dataset import (
|
|
26
|
+
gdataset_info,
|
|
27
|
+
gdataset_load,
|
|
28
|
+
gdataset_ls,
|
|
29
|
+
gdataset_save,
|
|
30
|
+
gdataset_unload,
|
|
31
|
+
)
|
|
32
|
+
from .db import (
|
|
33
|
+
gdb_examples_path,
|
|
34
|
+
gdb_info,
|
|
35
|
+
gdb_init,
|
|
36
|
+
gdb_init_examples,
|
|
37
|
+
gdb_reload,
|
|
38
|
+
gdb_unload,
|
|
39
|
+
gsetroot,
|
|
40
|
+
)
|
|
41
|
+
from .db_attrs import gdb_get_readonly_attrs, gdb_set_readonly_attrs
|
|
42
|
+
from .db_create import gdb_convert_to_indexed, gdb_create, gdb_create_genome, gdb_create_linked
|
|
43
|
+
from .extract import gextract, gscreen
|
|
44
|
+
from .gdir import (
|
|
45
|
+
gdir_cd,
|
|
46
|
+
gdir_create,
|
|
47
|
+
gdir_cwd,
|
|
48
|
+
gdir_rm,
|
|
49
|
+
gtrack_create_dirs,
|
|
50
|
+
)
|
|
51
|
+
from .gsynth import (
|
|
52
|
+
GsynthModel,
|
|
53
|
+
gsynth_bin_map,
|
|
54
|
+
gsynth_load,
|
|
55
|
+
gsynth_random,
|
|
56
|
+
gsynth_replace_kmer,
|
|
57
|
+
gsynth_sample,
|
|
58
|
+
gsynth_save,
|
|
59
|
+
gsynth_train,
|
|
60
|
+
)
|
|
61
|
+
from .intervals import (
|
|
62
|
+
gintervals,
|
|
63
|
+
gintervals_2d,
|
|
64
|
+
gintervals_2d_all,
|
|
65
|
+
gintervals_2d_band_intersect,
|
|
66
|
+
gintervals_2d_convert_to_indexed,
|
|
67
|
+
gintervals_all,
|
|
68
|
+
gintervals_annotate,
|
|
69
|
+
gintervals_canonic,
|
|
70
|
+
gintervals_chrom_sizes,
|
|
71
|
+
gintervals_convert_to_indexed,
|
|
72
|
+
gintervals_coverage_fraction,
|
|
73
|
+
gintervals_covered_bp,
|
|
74
|
+
gintervals_dataset,
|
|
75
|
+
gintervals_diff,
|
|
76
|
+
gintervals_exists,
|
|
77
|
+
gintervals_force_range,
|
|
78
|
+
gintervals_from_bed,
|
|
79
|
+
gintervals_from_strings,
|
|
80
|
+
gintervals_from_tuples,
|
|
81
|
+
gintervals_import_genes,
|
|
82
|
+
gintervals_intersect,
|
|
83
|
+
gintervals_is_indexed,
|
|
84
|
+
gintervals_load,
|
|
85
|
+
gintervals_ls,
|
|
86
|
+
gintervals_mapply,
|
|
87
|
+
gintervals_mark_overlaps,
|
|
88
|
+
gintervals_neighbors,
|
|
89
|
+
gintervals_neighbors_directional,
|
|
90
|
+
gintervals_neighbors_downstream,
|
|
91
|
+
gintervals_neighbors_upstream,
|
|
92
|
+
gintervals_normalize,
|
|
93
|
+
gintervals_random,
|
|
94
|
+
gintervals_rbind,
|
|
95
|
+
gintervals_rm,
|
|
96
|
+
gintervals_save,
|
|
97
|
+
gintervals_union,
|
|
98
|
+
gintervals_update,
|
|
99
|
+
gintervals_window,
|
|
100
|
+
giterator_cartesian_grid,
|
|
101
|
+
giterator_intervals,
|
|
102
|
+
)
|
|
103
|
+
from .liftover import (
|
|
104
|
+
gintervals_as_chain,
|
|
105
|
+
gintervals_liftover,
|
|
106
|
+
gintervals_load_chain,
|
|
107
|
+
gtrack_liftover,
|
|
108
|
+
)
|
|
109
|
+
from .lookup import glookup, gtrack_lookup
|
|
110
|
+
from .sequence import (
|
|
111
|
+
gseq_comp,
|
|
112
|
+
gseq_extract,
|
|
113
|
+
gseq_kmer,
|
|
114
|
+
gseq_kmer_dist,
|
|
115
|
+
gseq_pwm,
|
|
116
|
+
gseq_rev,
|
|
117
|
+
gseq_revcomp,
|
|
118
|
+
)
|
|
119
|
+
from .summary import (
|
|
120
|
+
gbins_quantiles,
|
|
121
|
+
gbins_summary,
|
|
122
|
+
gcor,
|
|
123
|
+
gdist,
|
|
124
|
+
gintervals_quantiles,
|
|
125
|
+
gintervals_summary,
|
|
126
|
+
gpartition,
|
|
127
|
+
gquantiles,
|
|
128
|
+
gsample,
|
|
129
|
+
gsummary,
|
|
130
|
+
)
|
|
131
|
+
from .tracks import (
|
|
132
|
+
gtrack_2d_create,
|
|
133
|
+
gtrack_2d_import,
|
|
134
|
+
gtrack_2d_import_contacts,
|
|
135
|
+
gtrack_attr_export,
|
|
136
|
+
gtrack_attr_get,
|
|
137
|
+
gtrack_attr_import,
|
|
138
|
+
gtrack_attr_set,
|
|
139
|
+
gtrack_convert_to_indexed,
|
|
140
|
+
gtrack_copy,
|
|
141
|
+
gtrack_create,
|
|
142
|
+
gtrack_create_dense,
|
|
143
|
+
gtrack_create_empty_indexed,
|
|
144
|
+
gtrack_create_pwm_energy,
|
|
145
|
+
gtrack_create_sparse,
|
|
146
|
+
gtrack_dataset,
|
|
147
|
+
gtrack_exists,
|
|
148
|
+
gtrack_import,
|
|
149
|
+
gtrack_import_mappedseq,
|
|
150
|
+
gtrack_import_set,
|
|
151
|
+
gtrack_info,
|
|
152
|
+
gtrack_ls,
|
|
153
|
+
gtrack_modify,
|
|
154
|
+
gtrack_mv,
|
|
155
|
+
gtrack_rm,
|
|
156
|
+
gtrack_smooth,
|
|
157
|
+
gtrack_var_get,
|
|
158
|
+
gtrack_var_ls,
|
|
159
|
+
gtrack_var_rm,
|
|
160
|
+
gtrack_var_set,
|
|
161
|
+
)
|
|
162
|
+
from .vtracks import (
|
|
163
|
+
gvtrack_clear,
|
|
164
|
+
gvtrack_create,
|
|
165
|
+
gvtrack_filter,
|
|
166
|
+
gvtrack_info,
|
|
167
|
+
gvtrack_iterator,
|
|
168
|
+
gvtrack_iterator_2d,
|
|
169
|
+
gvtrack_ls,
|
|
170
|
+
gvtrack_rm,
|
|
171
|
+
)
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def __getattr__(name):
    """Resolve selected state attributes lazily from the ``_shared`` module.

    Called by Python only when normal module attribute lookup fails.  The
    names ``_GROOT``, ``_UROOT`` and ``_VTRACKS`` are forwarded to
    ``_shared`` on every access, so callers see the current value rather
    than a copy captured when this package was imported.

    Raises
    ------
    AttributeError
        For any other name, mirroring the default module behaviour.
    """
    if name not in {"_GROOT", "_UROOT", "_VTRACKS"}:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    # Forward to the live value held by the shared-state module.
    return getattr(_shared, name)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
__all__ = [
|
|
182
|
+
# Configuration
|
|
183
|
+
'CONFIG',
|
|
184
|
+
|
|
185
|
+
# Database functions
|
|
186
|
+
'gdb_init',
|
|
187
|
+
'gdb_reload',
|
|
188
|
+
'gdb_unload',
|
|
189
|
+
'gdb_info',
|
|
190
|
+
'gdb_examples_path',
|
|
191
|
+
'gdb_init_examples',
|
|
192
|
+
'gsetroot',
|
|
193
|
+
'gdb_create',
|
|
194
|
+
'gdb_create_genome',
|
|
195
|
+
'gdb_create_linked',
|
|
196
|
+
'gdb_convert_to_indexed',
|
|
197
|
+
'gdb_get_readonly_attrs',
|
|
198
|
+
'gdb_set_readonly_attrs',
|
|
199
|
+
'gdataset_load',
|
|
200
|
+
'gdataset_unload',
|
|
201
|
+
'gdataset_ls',
|
|
202
|
+
'gdataset_save',
|
|
203
|
+
'gdataset_info',
|
|
204
|
+
|
|
205
|
+
# Track functions
|
|
206
|
+
'gextract',
|
|
207
|
+
'gscreen',
|
|
208
|
+
'gsummary',
|
|
209
|
+
'gquantiles',
|
|
210
|
+
'gdist',
|
|
211
|
+
'gpartition',
|
|
212
|
+
'gsample',
|
|
213
|
+
'gcor',
|
|
214
|
+
'gbins_summary',
|
|
215
|
+
'gbins_quantiles',
|
|
216
|
+
'gcis_decay',
|
|
217
|
+
'gsegment',
|
|
218
|
+
'gwilcox',
|
|
219
|
+
'gtrack_ls',
|
|
220
|
+
'gtrack_info',
|
|
221
|
+
'gtrack_exists',
|
|
222
|
+
'gtrack_dataset',
|
|
223
|
+
'gtrack_create',
|
|
224
|
+
'gtrack_create_dense',
|
|
225
|
+
'gtrack_create_sparse',
|
|
226
|
+
'gtrack_import',
|
|
227
|
+
'gtrack_import_mappedseq',
|
|
228
|
+
'gtrack_import_set',
|
|
229
|
+
'gtrack_rm',
|
|
230
|
+
'gtrack_mv',
|
|
231
|
+
'gtrack_copy',
|
|
232
|
+
'gtrack_convert_to_indexed',
|
|
233
|
+
'gtrack_create_empty_indexed',
|
|
234
|
+
'gtrack_attr_get',
|
|
235
|
+
'gtrack_attr_set',
|
|
236
|
+
'gtrack_attr_export',
|
|
237
|
+
'gtrack_attr_import',
|
|
238
|
+
'gtrack_var_ls',
|
|
239
|
+
'gtrack_var_get',
|
|
240
|
+
'gtrack_var_set',
|
|
241
|
+
'gtrack_var_rm',
|
|
242
|
+
'gtrack_modify',
|
|
243
|
+
'gtrack_smooth',
|
|
244
|
+
'gtrack_2d_create',
|
|
245
|
+
'gtrack_2d_import',
|
|
246
|
+
'gtrack_2d_import_contacts',
|
|
247
|
+
'gtrack_create_pwm_energy',
|
|
248
|
+
|
|
249
|
+
# Interval functions
|
|
250
|
+
'gintervals',
|
|
251
|
+
'gintervals_all',
|
|
252
|
+
'gintervals_2d',
|
|
253
|
+
'gintervals_2d_all',
|
|
254
|
+
'gintervals_2d_band_intersect',
|
|
255
|
+
'gintervals_union',
|
|
256
|
+
'gintervals_intersect',
|
|
257
|
+
'gintervals_diff',
|
|
258
|
+
'gintervals_canonic',
|
|
259
|
+
'gintervals_force_range',
|
|
260
|
+
'gintervals_summary',
|
|
261
|
+
'gintervals_quantiles',
|
|
262
|
+
'gintervals_covered_bp',
|
|
263
|
+
'gintervals_coverage_fraction',
|
|
264
|
+
'gintervals_neighbors',
|
|
265
|
+
'gintervals_neighbors_upstream',
|
|
266
|
+
'gintervals_neighbors_downstream',
|
|
267
|
+
'gintervals_neighbors_directional',
|
|
268
|
+
'gintervals_from_tuples',
|
|
269
|
+
'gintervals_from_strings',
|
|
270
|
+
'gintervals_from_bed',
|
|
271
|
+
'gintervals_import_genes',
|
|
272
|
+
'gintervals_window',
|
|
273
|
+
'gintervals_ls',
|
|
274
|
+
'gintervals_exists',
|
|
275
|
+
'gintervals_dataset',
|
|
276
|
+
'gintervals_chrom_sizes',
|
|
277
|
+
'gintervals_load',
|
|
278
|
+
'gintervals_convert_to_indexed',
|
|
279
|
+
'gintervals_2d_convert_to_indexed',
|
|
280
|
+
'gintervals_is_indexed',
|
|
281
|
+
'gintervals_save',
|
|
282
|
+
'gintervals_update',
|
|
283
|
+
'gintervals_mapply',
|
|
284
|
+
'gintervals_rm',
|
|
285
|
+
'giterator_cartesian_grid',
|
|
286
|
+
'giterator_intervals',
|
|
287
|
+
'gintervals_rbind',
|
|
288
|
+
'gintervals_mark_overlaps',
|
|
289
|
+
'gintervals_annotate',
|
|
290
|
+
'gintervals_normalize',
|
|
291
|
+
'gintervals_random',
|
|
292
|
+
|
|
293
|
+
# Virtual track functions
|
|
294
|
+
'gvtrack_create',
|
|
295
|
+
'gvtrack_ls',
|
|
296
|
+
'gvtrack_info',
|
|
297
|
+
'gvtrack_iterator',
|
|
298
|
+
'gvtrack_iterator_2d',
|
|
299
|
+
'gvtrack_filter',
|
|
300
|
+
'gvtrack_rm',
|
|
301
|
+
'gvtrack_clear',
|
|
302
|
+
|
|
303
|
+
# Sequence functions
|
|
304
|
+
'gseq_extract',
|
|
305
|
+
'gseq_rev',
|
|
306
|
+
'gseq_comp',
|
|
307
|
+
'gseq_revcomp',
|
|
308
|
+
'gseq_kmer',
|
|
309
|
+
'gseq_kmer_dist',
|
|
310
|
+
'gseq_pwm',
|
|
311
|
+
|
|
312
|
+
# Lookup functions
|
|
313
|
+
'glookup',
|
|
314
|
+
'gtrack_lookup',
|
|
315
|
+
|
|
316
|
+
# Liftover functions
|
|
317
|
+
'gintervals_load_chain',
|
|
318
|
+
'gintervals_as_chain',
|
|
319
|
+
'gintervals_liftover',
|
|
320
|
+
'gtrack_liftover',
|
|
321
|
+
|
|
322
|
+
# Directory management
|
|
323
|
+
'gdir_cwd',
|
|
324
|
+
'gdir_cd',
|
|
325
|
+
'gdir_create',
|
|
326
|
+
'gdir_rm',
|
|
327
|
+
'gtrack_create_dirs',
|
|
328
|
+
|
|
329
|
+
# Genome synthesis functions
|
|
330
|
+
'GsynthModel',
|
|
331
|
+
'gsynth_bin_map',
|
|
332
|
+
'gsynth_train',
|
|
333
|
+
'gsynth_sample',
|
|
334
|
+
'gsynth_random',
|
|
335
|
+
'gsynth_replace_kmer',
|
|
336
|
+
'gsynth_save',
|
|
337
|
+
'gsynth_load',
|
|
338
|
+
|
|
339
|
+
# Internal (shared)
|
|
340
|
+
'_bound_colname',
|
|
341
|
+
'_checkroot',
|
|
342
|
+
'_chunk_slices',
|
|
343
|
+
'_df2pymisha',
|
|
344
|
+
'_iterated_intervals',
|
|
345
|
+
'_itr2pymisha',
|
|
346
|
+
'_make_progress_callback',
|
|
347
|
+
'_progress_context',
|
|
348
|
+
'_pymisha',
|
|
349
|
+
'_pymisha2df',
|
|
350
|
+
]
|
|
351
|
+
|
|
352
|
+
# Export module locals to the C extension for access to Python functions
|
|
353
|
+
# This must be at the end of the file after all functions are defined
|
|
354
|
+
_pymisha._PMLOCALS = locals()
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Shared CRC64-ECMA helpers (parity with C++ CRC64.h)."""
|
|
2
|
+
|
|
3
|
+
_CRC64_POLY = 0xC96C5795D7870F42
_CRC64_MASK = 0xFFFFFFFFFFFFFFFF  # keeps Python's unbounded ints within 64 bits
_CRC64_TABLE = None  # built lazily by crc64_incremental() on first use


def _crc64_table():
    """Build the 256-entry lookup table for byte-at-a-time CRC updates.

    Each entry is the result of pushing one byte value through eight
    right-shift steps, XOR-ing in ``_CRC64_POLY`` whenever the low bit is set.

    Returns
    -------
    list of int
        256 values, each confined to 64 bits.
    """
    table = []
    for i in range(256):
        crc = i
        for _ in range(8):
            crc = (crc >> 1) ^ _CRC64_POLY if crc & 1 else crc >> 1
        table.append(crc & _CRC64_MASK)
    return table


def crc64_incremental(crc, data):
    """Fold *data* into a running CRC64 value and return the new value.

    Parameters
    ----------
    crc : int
        Running CRC state, normally obtained from ``crc64_init()`` or from a
        previous call to this function.
    data : iterable of int
        Byte values to process (e.g. ``bytes``, ``bytearray``, ``memoryview``).

    Returns
    -------
    int
        Updated running CRC in the range ``0 .. 2**64 - 1``.  Pass it to
        ``crc64_finalize()`` to obtain the final checksum.
    """
    global _CRC64_TABLE
    if _CRC64_TABLE is None:
        _CRC64_TABLE = _crc64_table()
    table = _CRC64_TABLE  # local binding: avoids a global lookup per byte

    # Normalise the incoming state to 64 bits *before* shifting.  Python ints
    # are unbounded, so a negative or over-wide value would otherwise leak
    # high bits into the result (presumably unlike the fixed-width C++
    # counterpart in CRC64.h, which is not visible here).
    crc &= _CRC64_MASK
    for byte in data:
        crc = (crc >> 8) ^ table[(crc ^ byte) & 0xFF]
    # Every step combines a <=56-bit shift with a 64-bit table entry, so the
    # state is already within 64 bits here.
    return crc


def crc64_init():
    """Return the initial running CRC state (all 64 bits set)."""
    return _CRC64_MASK


def crc64_finalize(crc):
    """Return the final checksum: the bitwise inverse of *crc*, in 64 bits."""
    return (~crc) & _CRC64_MASK
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Name validation helpers."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
# Accepts a leading ASCII letter followed by ASCII letters, digits,
# underscores, or dots.
_DOTTED_NAME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9_.]*$")


def validate_dotted_name(name, kind):
    """Check that *name* is a well-formed dot-separated identifier.

    Parameters
    ----------
    name : object
        Candidate name; anything other than a non-empty ``str`` is rejected.
    kind : str
        Human-readable label for the thing being named; it is interpolated
        into the error messages.

    Returns
    -------
    None
        The function only validates; it returns nothing on success.

    Raises
    ------
    ValueError
        If *name* is not a non-empty string, contains characters outside the
        allowed set (or does not start with a letter), or has an empty
        component between dots (e.g. ``"a..b"`` or ``"a."``).
    """
    is_nonempty_str = isinstance(name, str) and bool(name)
    if not is_nonempty_str:
        raise ValueError(f"{kind} must be a non-empty string")

    if _DOTTED_NAME_RE.fullmatch(name) is None:
        message = (
            f"Invalid {kind} '{name}'. Must start with a letter and contain "
            "only alphanumeric characters, underscores, and dots."
        )
        raise ValueError(message)

    # The pattern allows dots anywhere after the first character, so doubled
    # or trailing dots must be rejected separately.
    if "" in name.split("."):
        raise ValueError(
            f"Invalid {kind} '{name}'. Empty dot-separated components are not allowed."
        )
|