pymisha 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192) hide show
  1. pymisha-0.1.0/LICENSE +21 -0
  2. pymisha-0.1.0/MANIFEST.in +4 -0
  3. pymisha-0.1.0/PKG-INFO +115 -0
  4. pymisha-0.1.0/README.md +76 -0
  5. pymisha-0.1.0/pymisha/__init__.py +354 -0
  6. pymisha-0.1.0/pymisha/_crc64.py +34 -0
  7. pymisha-0.1.0/pymisha/_name_validation.py +22 -0
  8. pymisha-0.1.0/pymisha/_pymisha.pyi +550 -0
  9. pymisha-0.1.0/pymisha/_quadtree.py +592 -0
  10. pymisha-0.1.0/pymisha/_safe_eval.py +147 -0
  11. pymisha-0.1.0/pymisha/_safe_pickle.py +49 -0
  12. pymisha-0.1.0/pymisha/_shared.py +248 -0
  13. pymisha-0.1.0/pymisha/analysis.py +559 -0
  14. pymisha-0.1.0/pymisha/dataset.py +552 -0
  15. pymisha-0.1.0/pymisha/db.py +414 -0
  16. pymisha-0.1.0/pymisha/db_attrs.py +245 -0
  17. pymisha-0.1.0/pymisha/db_create.py +989 -0
  18. pymisha-0.1.0/pymisha/expr.py +104 -0
  19. pymisha-0.1.0/pymisha/extract.py +648 -0
  20. pymisha-0.1.0/pymisha/gdir.py +319 -0
  21. pymisha-0.1.0/pymisha/gsynth.py +1411 -0
  22. pymisha-0.1.0/pymisha/intervals.py +4202 -0
  23. pymisha-0.1.0/pymisha/liftover.py +1650 -0
  24. pymisha-0.1.0/pymisha/lookup.py +513 -0
  25. pymisha-0.1.0/pymisha/sequence.py +804 -0
  26. pymisha-0.1.0/pymisha/summary.py +1742 -0
  27. pymisha-0.1.0/pymisha/tracks.py +3173 -0
  28. pymisha-0.1.0/pymisha/vtracks.py +1435 -0
  29. pymisha-0.1.0/pymisha.egg-info/PKG-INFO +115 -0
  30. pymisha-0.1.0/pymisha.egg-info/SOURCES.txt +190 -0
  31. pymisha-0.1.0/pymisha.egg-info/dependency_links.txt +1 -0
  32. pymisha-0.1.0/pymisha.egg-info/not-zip-safe +1 -0
  33. pymisha-0.1.0/pymisha.egg-info/requires.txt +13 -0
  34. pymisha-0.1.0/pymisha.egg-info/top_level.txt +2 -0
  35. pymisha-0.1.0/pyproject.toml +92 -0
  36. pymisha-0.1.0/setup.cfg +4 -0
  37. pymisha-0.1.0/setup.py +25 -0
  38. pymisha-0.1.0/src/BinFinder.cpp +36 -0
  39. pymisha-0.1.0/src/BinFinder.h +108 -0
  40. pymisha-0.1.0/src/BinsManager.h +127 -0
  41. pymisha-0.1.0/src/BufferedFile.cpp +72 -0
  42. pymisha-0.1.0/src/BufferedFile.h +269 -0
  43. pymisha-0.1.0/src/CRC64.h +85 -0
  44. pymisha-0.1.0/src/DnaPSSM.cpp +1242 -0
  45. pymisha-0.1.0/src/DnaPSSM.h +271 -0
  46. pymisha-0.1.0/src/GInterval.cpp +115 -0
  47. pymisha-0.1.0/src/GInterval.h +127 -0
  48. pymisha-0.1.0/src/GenomeChromKey.h +129 -0
  49. pymisha-0.1.0/src/GenomeIndex.cpp +193 -0
  50. pymisha-0.1.0/src/GenomeIndex.h +75 -0
  51. pymisha-0.1.0/src/GenomeSeqFetch.cpp +181 -0
  52. pymisha-0.1.0/src/GenomeSeqFetch.h +50 -0
  53. pymisha-0.1.0/src/GenomeSeqScorer.cpp +36 -0
  54. pymisha-0.1.0/src/GenomeSeqScorer.h +33 -0
  55. pymisha-0.1.0/src/GenomeTrack.cpp +317 -0
  56. pymisha-0.1.0/src/GenomeTrack.h +104 -0
  57. pymisha-0.1.0/src/GenomeTrack1D.h +106 -0
  58. pymisha-0.1.0/src/GenomeTrackFixedBin.cpp +429 -0
  59. pymisha-0.1.0/src/GenomeTrackFixedBin.h +116 -0
  60. pymisha-0.1.0/src/GenomeTrackSparse.cpp +281 -0
  61. pymisha-0.1.0/src/GenomeTrackSparse.h +177 -0
  62. pymisha-0.1.0/src/GenomeUtils.cpp +56 -0
  63. pymisha-0.1.0/src/GenomeUtils.h +18 -0
  64. pymisha-0.1.0/src/HashFunc.h +49 -0
  65. pymisha-0.1.0/src/IncrementalWilcox.cpp +150 -0
  66. pymisha-0.1.0/src/IncrementalWilcox.h +61 -0
  67. pymisha-0.1.0/src/KmerCounter.cpp +231 -0
  68. pymisha-0.1.0/src/KmerCounter.h +49 -0
  69. pymisha-0.1.0/src/MaskUtils.h +53 -0
  70. pymisha-0.1.0/src/MaskedBpCounter.cpp +64 -0
  71. pymisha-0.1.0/src/MaskedBpCounter.h +30 -0
  72. pymisha-0.1.0/src/PMDataFrame.cpp +195 -0
  73. pymisha-0.1.0/src/PMDataFrame.h +253 -0
  74. pymisha-0.1.0/src/PMDb.cpp +277 -0
  75. pymisha-0.1.0/src/PMDb.h +83 -0
  76. pymisha-0.1.0/src/PMFindNeighbors.cpp +387 -0
  77. pymisha-0.1.0/src/PMGsynth.cpp +923 -0
  78. pymisha-0.1.0/src/PMObject.h +96 -0
  79. pymisha-0.1.0/src/PMStubs.cpp +4827 -0
  80. pymisha-0.1.0/src/PMTrackCreate.cpp +708 -0
  81. pymisha-0.1.0/src/PMTrackExpressionIterator.cpp +257 -0
  82. pymisha-0.1.0/src/PMTrackExpressionIterator.h +126 -0
  83. pymisha-0.1.0/src/PMTrackExpressionScanner.cpp +481 -0
  84. pymisha-0.1.0/src/PMTrackExpressionScanner.h +149 -0
  85. pymisha-0.1.0/src/PMTrackExpressionVars.cpp +311 -0
  86. pymisha-0.1.0/src/PMTrackExpressionVars.h +104 -0
  87. pymisha-0.1.0/src/PMTrackIndexedFormat.cpp +357 -0
  88. pymisha-0.1.0/src/PMVTrack.cpp +1138 -0
  89. pymisha-0.1.0/src/PMWilcox.cpp +539 -0
  90. pymisha-0.1.0/src/PWMScorer.cpp +1415 -0
  91. pymisha-0.1.0/src/PWMScorer.h +182 -0
  92. pymisha-0.1.0/src/RaList.h +132 -0
  93. pymisha-0.1.0/src/Random.h +47 -0
  94. pymisha-0.1.0/src/RandomShuffle.h +26 -0
  95. pymisha-0.1.0/src/Segment.h +62 -0
  96. pymisha-0.1.0/src/SegmentFinder.h +283 -0
  97. pymisha-0.1.0/src/StratifiedMarkovModel.cpp +341 -0
  98. pymisha-0.1.0/src/StratifiedMarkovModel.h +190 -0
  99. pymisha-0.1.0/src/StreamPercentiler.h +242 -0
  100. pymisha-0.1.0/src/StreamSampler.h +81 -0
  101. pymisha-0.1.0/src/TGLException.cpp +60 -0
  102. pymisha-0.1.0/src/TGLException.h +112 -0
  103. pymisha-0.1.0/src/TrackIndex.cpp +201 -0
  104. pymisha-0.1.0/src/TrackIndex.h +112 -0
  105. pymisha-0.1.0/src/config.h +46 -0
  106. pymisha-0.1.0/src/pmutils.h +68 -0
  107. pymisha-0.1.0/src/port.h +29 -0
  108. pymisha-0.1.0/src/pymisha.cpp +667 -0
  109. pymisha-0.1.0/src/pymisha.h +303 -0
  110. pymisha-0.1.0/src/pymisha_init.cpp +169 -0
  111. pymisha-0.1.0/src/util.h +102 -0
  112. pymisha-0.1.0/src/utils/RunningLogSumExp.h +179 -0
  113. pymisha-0.1.0/src/utils/RunningMaxDeque.h +107 -0
  114. pymisha-0.1.0/tests/test_band_intersect.py +166 -0
  115. pymisha-0.1.0/tests/test_benchmarks.py +837 -0
  116. pymisha-0.1.0/tests/test_dataset_and_alias.py +750 -0
  117. pymisha-0.1.0/tests/test_dataset_resolution.py +40 -0
  118. pymisha-0.1.0/tests/test_db_admin.py +147 -0
  119. pymisha-0.1.0/tests/test_expr_aliasing.py +103 -0
  120. pymisha-0.1.0/tests/test_fd_safety.py +22 -0
  121. pymisha-0.1.0/tests/test_gbins.py +375 -0
  122. pymisha-0.1.0/tests/test_gcis_decay.py +604 -0
  123. pymisha-0.1.0/tests/test_gcor.py +306 -0
  124. pymisha-0.1.0/tests/test_gdb_convert_to_indexed.py +560 -0
  125. pymisha-0.1.0/tests/test_gdb_create.py +1066 -0
  126. pymisha-0.1.0/tests/test_gdb_info.py +150 -0
  127. pymisha-0.1.0/tests/test_gdir.py +225 -0
  128. pymisha-0.1.0/tests/test_gdist.py +156 -0
  129. pymisha-0.1.0/tests/test_gdist_vtrack_streaming.py +413 -0
  130. pymisha-0.1.0/tests/test_gextract.py +220 -0
  131. pymisha-0.1.0/tests/test_gextract_2d.py +1253 -0
  132. pymisha-0.1.0/tests/test_gextract_colnames.py +59 -0
  133. pymisha-0.1.0/tests/test_gintervals.py +1172 -0
  134. pymisha-0.1.0/tests/test_gintervals_constructors.py +97 -0
  135. pymisha-0.1.0/tests/test_gintervals_import_genes.py +605 -0
  136. pymisha-0.1.0/tests/test_gintervals_load_save.py +186 -0
  137. pymisha-0.1.0/tests/test_gintervals_management.py +63 -0
  138. pymisha-0.1.0/tests/test_gintervals_mapply.py +141 -0
  139. pymisha-0.1.0/tests/test_gintervals_neighbors.py +481 -0
  140. pymisha-0.1.0/tests/test_gintervals_neighbors_directional.py +360 -0
  141. pymisha-0.1.0/tests/test_gintervals_summary_quantiles.py +180 -0
  142. pymisha-0.1.0/tests/test_gintervals_update.py +88 -0
  143. pymisha-0.1.0/tests/test_gintervals_utils.py +554 -0
  144. pymisha-0.1.0/tests/test_giterator_cartesian_grid.py +59 -0
  145. pymisha-0.1.0/tests/test_glookup.py +240 -0
  146. pymisha-0.1.0/tests/test_glookup_streaming.py +351 -0
  147. pymisha-0.1.0/tests/test_golden_master.py +785 -0
  148. pymisha-0.1.0/tests/test_golden_master_advanced_intervals.py +128 -0
  149. pymisha-0.1.0/tests/test_golden_master_liftover.py +134 -0
  150. pymisha-0.1.0/tests/test_golden_master_sequence.py +145 -0
  151. pymisha-0.1.0/tests/test_golden_master_stats.py +303 -0
  152. pymisha-0.1.0/tests/test_golden_master_vtracks.py +252 -0
  153. pymisha-0.1.0/tests/test_gpartition.py +228 -0
  154. pymisha-0.1.0/tests/test_gquantiles.py +42 -0
  155. pymisha-0.1.0/tests/test_gsample.py +321 -0
  156. pymisha-0.1.0/tests/test_gsegment.py +200 -0
  157. pymisha-0.1.0/tests/test_gseq.py +127 -0
  158. pymisha-0.1.0/tests/test_gseq_kmer.py +186 -0
  159. pymisha-0.1.0/tests/test_gseq_pwm.py +1169 -0
  160. pymisha-0.1.0/tests/test_gsummary.py +148 -0
  161. pymisha-0.1.0/tests/test_gsynth.py +1669 -0
  162. pymisha-0.1.0/tests/test_gsynth_parallel.py +629 -0
  163. pymisha-0.1.0/tests/test_gtrack_attr.py +135 -0
  164. pymisha-0.1.0/tests/test_gtrack_attr_import.py +117 -0
  165. pymisha-0.1.0/tests/test_gtrack_create_empty_indexed.py +48 -0
  166. pymisha-0.1.0/tests/test_gtrack_create_pwm_energy.py +431 -0
  167. pymisha-0.1.0/tests/test_gtrack_exists.py +48 -0
  168. pymisha-0.1.0/tests/test_gtrack_lookup.py +354 -0
  169. pymisha-0.1.0/tests/test_gtrack_ls.py +230 -0
  170. pymisha-0.1.0/tests/test_gtrack_var.py +213 -0
  171. pymisha-0.1.0/tests/test_gvtrack_filter.py +1424 -0
  172. pymisha-0.1.0/tests/test_gwilcox.py +229 -0
  173. pymisha-0.1.0/tests/test_import_contacts.py +184 -0
  174. pymisha-0.1.0/tests/test_init_exports.py +13 -0
  175. pymisha-0.1.0/tests/test_intervals_2d.py +144 -0
  176. pymisha-0.1.0/tests/test_intervals_indexed.py +126 -0
  177. pymisha-0.1.0/tests/test_iterator_policy.py +25 -0
  178. pymisha-0.1.0/tests/test_liftover.py +2129 -0
  179. pymisha-0.1.0/tests/test_multi_db.py +663 -0
  180. pymisha-0.1.0/tests/test_multitask.py +203 -0
  181. pymisha-0.1.0/tests/test_optimization_summary.py +90 -0
  182. pymisha-0.1.0/tests/test_pwm_sliding_window.py +1021 -0
  183. pymisha-0.1.0/tests/test_pwm_spatial.py +356 -0
  184. pymisha-0.1.0/tests/test_security_robustness.py +150 -0
  185. pymisha-0.1.0/tests/test_track2d.py +362 -0
  186. pymisha-0.1.0/tests/test_track_create_import.py +385 -0
  187. pymisha-0.1.0/tests/test_track_indexed.py +33 -0
  188. pymisha-0.1.0/tests/test_track_liftover.py +1313 -0
  189. pymisha-0.1.0/tests/test_track_modify_smooth.py +425 -0
  190. pymisha-0.1.0/tests/test_vtrack_iterator_2d.py +60 -0
  191. pymisha-0.1.0/tests/test_vtrack_lse.py +1313 -0
  192. pymisha-0.1.0/tests/test_vtracks.py +2372 -0
pymisha-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024-2026 Weizmann Institute of Science
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,4 @@
1
+ include README.md
2
+ include LICENSE
3
+ include setup.py
4
+ recursive-include src *.cpp *.h
pymisha-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,115 @@
1
+ Metadata-Version: 2.4
2
+ Name: pymisha
3
+ Version: 0.1.0
4
+ Summary: Python interface for misha genomic databases with C++ streaming backends
5
+ Author-email: Aviezer Lifshitz <aviezerl@weizmann.ac.il>
6
+ Maintainer-email: Aviezer Lifshitz <aviezerl@weizmann.ac.il>
7
+ License: MIT
8
+ Project-URL: Homepage, https://github.com/tanaylab/pymisha
9
+ Project-URL: Documentation, https://tanaylab.github.io/pymisha/
10
+ Project-URL: Repository, https://github.com/tanaylab/pymisha.git
11
+ Project-URL: Issues, https://github.com/tanaylab/pymisha/issues
12
+ Keywords: genomics,bioinformatics,misha,tracks
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Topic :: Scientific/Engineering :: Bio-Informatics
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: C++
22
+ Classifier: Operating System :: POSIX :: Linux
23
+ Classifier: Operating System :: MacOS :: MacOS X
24
+ Requires-Python: >=3.10
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: numpy>=1.20
28
+ Requires-Dist: pandas>=1.3
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest>=7.0; extra == "dev"
31
+ Requires-Dist: pytest-cov; extra == "dev"
32
+ Requires-Dist: ruff>=0.6; extra == "dev"
33
+ Requires-Dist: mypy; extra == "dev"
34
+ Provides-Extra: docs
35
+ Requires-Dist: sphinx>=7.0; extra == "docs"
36
+ Requires-Dist: myst-parser>=2.0; extra == "docs"
37
+ Requires-Dist: furo>=2024.8.6; extra == "docs"
38
+ Dynamic: license-file
39
+
40
+ # PyMisha
41
+
42
+ Python interface for [misha](https://github.com/tanaylab/misha) genomic databases. PyMisha provides full read/write access to misha track databases with C++ streaming backends for genome-scale operations.
43
+
44
+ ## Features
45
+
46
+ - **1D and 2D track support:** Dense, sparse, and 2D (rectangle/point) tracks with full CRUD operations.
47
+ - **C++ streaming backends:** Extraction, summary, quantiles, distribution, lookup, segmentation, Wilcoxon tests, correlation, and sampling all stream through C++ for performance.
48
+ - **Virtual tracks:** Computed-on-the-fly track views with filtering, shifting, and 30+ aggregation functions.
49
+ - **Interval operations:** Union, intersection, difference, canonicalization, neighbors, annotation, normalization, random generation, and liftover.
50
+ - **Sequence analysis:** Extraction, k-mer counting, PWM/PSSM scoring, and Markov-chain synthesis (`gsynth`).
51
+ - **Database management:** Create, link, convert, and manage misha-compatible genomic databases.
52
+ - **R misha compatibility:** Reads and writes the same on-disk formats as R misha (123/145 R exports covered).
53
+
54
+ ## Installation
55
+
56
+ Prerequisites:
57
+ - Python 3.10+
58
+ - C++17 compiler (GCC 8+, Clang 7+, or Apple Clang 11+)
59
+ - `numpy`, `pandas`
60
+
61
+ ```bash
62
+ pip install .
63
+ ```
64
+
65
+ For development:
66
+
67
+ ```bash
68
+ pip install -e ".[dev]"
69
+ ```
70
+
71
+ ## Quick start
72
+
73
+ ```python
74
+ import pymisha as pm
75
+
76
+ # Initialize the database
77
+ pm.gdb_init("/path/to/misha_db")
78
+
79
+ # Create intervals and extract data
80
+ intervals = pm.gintervals_from_strings(["chr1:0-1000", "chr1:2000-2600"])
81
+ out = pm.gextract("track1", intervals, iterator=100)
82
+
83
+ # Filter and summarize
84
+ filtered = pm.gscreen("track1 > 0.5", intervals)
85
+ stats = pm.gsummary("track1", intervals)
86
+ ```
87
+
88
+ ## Examples
89
+
90
+ Using the built-in example database:
91
+
92
+ ```python
93
+ import pymisha as pm
94
+
95
+ pm.gdb_init_examples()
96
+ print(pm.gtrack_ls())
97
+ print(pm.gextract("dense_track", pm.gintervals("chr1", 0, 1000)))
98
+ ```
99
+
100
+ ## Optional dependencies
101
+
102
+ - `pyBigWig`: For BigWig import in `gtrack_import`.
103
+ - `pyreadr` + `Rscript`: For loading R-serialized big interval sets.
104
+ - `PyYAML`: For richer `gdataset_info` metadata parsing.
105
+
106
+ ## Missing features
107
+
108
+ Compared to R misha, the following are not yet implemented:
109
+
110
+ - **Track Arrays:** `gtrack.array.*` and `gvtrack.array.slice`.
111
+ - **Legacy Conversion:** `gtrack.convert` (for migrating old 2D formats).
112
+
113
+ ## License
114
+
115
+ MIT. See [LICENSE](LICENSE) for details.
@@ -0,0 +1,76 @@
1
+ # PyMisha
2
+
3
+ Python interface for [misha](https://github.com/tanaylab/misha) genomic databases. PyMisha provides full read/write access to misha track databases with C++ streaming backends for genome-scale operations.
4
+
5
+ ## Features
6
+
7
+ - **1D and 2D track support:** Dense, sparse, and 2D (rectangle/point) tracks with full CRUD operations.
8
+ - **C++ streaming backends:** Extraction, summary, quantiles, distribution, lookup, segmentation, Wilcoxon tests, correlation, and sampling all stream through C++ for performance.
9
+ - **Virtual tracks:** Computed-on-the-fly track views with filtering, shifting, and 30+ aggregation functions.
10
+ - **Interval operations:** Union, intersection, difference, canonicalization, neighbors, annotation, normalization, random generation, and liftover.
11
+ - **Sequence analysis:** Extraction, k-mer counting, PWM/PSSM scoring, and Markov-chain synthesis (`gsynth`).
12
+ - **Database management:** Create, link, convert, and manage misha-compatible genomic databases.
13
+ - **R misha compatibility:** Reads and writes the same on-disk formats as R misha (123/145 R exports covered).
14
+
15
+ ## Installation
16
+
17
+ Prerequisites:
18
+ - Python 3.10+
19
+ - C++17 compiler (GCC 8+, Clang 7+, or Apple Clang 11+)
20
+ - `numpy`, `pandas`
21
+
22
+ ```bash
23
+ pip install .
24
+ ```
25
+
26
+ For development:
27
+
28
+ ```bash
29
+ pip install -e ".[dev]"
30
+ ```
31
+
32
+ ## Quick start
33
+
34
+ ```python
35
+ import pymisha as pm
36
+
37
+ # Initialize the database
38
+ pm.gdb_init("/path/to/misha_db")
39
+
40
+ # Create intervals and extract data
41
+ intervals = pm.gintervals_from_strings(["chr1:0-1000", "chr1:2000-2600"])
42
+ out = pm.gextract("track1", intervals, iterator=100)
43
+
44
+ # Filter and summarize
45
+ filtered = pm.gscreen("track1 > 0.5", intervals)
46
+ stats = pm.gsummary("track1", intervals)
47
+ ```
48
+
49
+ ## Examples
50
+
51
+ Using the built-in example database:
52
+
53
+ ```python
54
+ import pymisha as pm
55
+
56
+ pm.gdb_init_examples()
57
+ print(pm.gtrack_ls())
58
+ print(pm.gextract("dense_track", pm.gintervals("chr1", 0, 1000)))
59
+ ```
60
+
61
+ ## Optional dependencies
62
+
63
+ - `pyBigWig`: For BigWig import in `gtrack_import`.
64
+ - `pyreadr` + `Rscript`: For loading R-serialized big interval sets.
65
+ - `PyYAML`: For richer `gdataset_info` metadata parsing.
66
+
67
+ ## Missing features
68
+
69
+ Compared to R misha, the following are not yet implemented:
70
+
71
+ - **Track Arrays:** `gtrack.array.*` and `gvtrack.array.slice`.
72
+ - **Legacy Conversion:** `gtrack.convert` (for migrating old 2D formats).
73
+
74
+ ## License
75
+
76
+ MIT. See [LICENSE](LICENSE) for details.
@@ -0,0 +1,354 @@
1
+ """
2
+ PyMisha - Python wrapper for the misha Genomic Data Analysis Toolkit
3
+ """
4
+
5
+ __version__ = '0.1.0'
6
+
7
+ # Make numpy available for expressions
8
+ import numpy as np # noqa: F401
9
+
10
+ from . import _shared
11
+ from ._shared import (
12
+ CONFIG,
13
+ _bound_colname,
14
+ _checkroot,
15
+ _chunk_slices,
16
+ _df2pymisha,
17
+ _iterated_intervals,
18
+ _itr2pymisha,
19
+ _make_progress_callback,
20
+ _progress_context,
21
+ _pymisha,
22
+ _pymisha2df,
23
+ )
24
+ from .analysis import gcis_decay, gsegment, gwilcox
25
+ from .dataset import (
26
+ gdataset_info,
27
+ gdataset_load,
28
+ gdataset_ls,
29
+ gdataset_save,
30
+ gdataset_unload,
31
+ )
32
+ from .db import (
33
+ gdb_examples_path,
34
+ gdb_info,
35
+ gdb_init,
36
+ gdb_init_examples,
37
+ gdb_reload,
38
+ gdb_unload,
39
+ gsetroot,
40
+ )
41
+ from .db_attrs import gdb_get_readonly_attrs, gdb_set_readonly_attrs
42
+ from .db_create import gdb_convert_to_indexed, gdb_create, gdb_create_genome, gdb_create_linked
43
+ from .extract import gextract, gscreen
44
+ from .gdir import (
45
+ gdir_cd,
46
+ gdir_create,
47
+ gdir_cwd,
48
+ gdir_rm,
49
+ gtrack_create_dirs,
50
+ )
51
+ from .gsynth import (
52
+ GsynthModel,
53
+ gsynth_bin_map,
54
+ gsynth_load,
55
+ gsynth_random,
56
+ gsynth_replace_kmer,
57
+ gsynth_sample,
58
+ gsynth_save,
59
+ gsynth_train,
60
+ )
61
+ from .intervals import (
62
+ gintervals,
63
+ gintervals_2d,
64
+ gintervals_2d_all,
65
+ gintervals_2d_band_intersect,
66
+ gintervals_2d_convert_to_indexed,
67
+ gintervals_all,
68
+ gintervals_annotate,
69
+ gintervals_canonic,
70
+ gintervals_chrom_sizes,
71
+ gintervals_convert_to_indexed,
72
+ gintervals_coverage_fraction,
73
+ gintervals_covered_bp,
74
+ gintervals_dataset,
75
+ gintervals_diff,
76
+ gintervals_exists,
77
+ gintervals_force_range,
78
+ gintervals_from_bed,
79
+ gintervals_from_strings,
80
+ gintervals_from_tuples,
81
+ gintervals_import_genes,
82
+ gintervals_intersect,
83
+ gintervals_is_indexed,
84
+ gintervals_load,
85
+ gintervals_ls,
86
+ gintervals_mapply,
87
+ gintervals_mark_overlaps,
88
+ gintervals_neighbors,
89
+ gintervals_neighbors_directional,
90
+ gintervals_neighbors_downstream,
91
+ gintervals_neighbors_upstream,
92
+ gintervals_normalize,
93
+ gintervals_random,
94
+ gintervals_rbind,
95
+ gintervals_rm,
96
+ gintervals_save,
97
+ gintervals_union,
98
+ gintervals_update,
99
+ gintervals_window,
100
+ giterator_cartesian_grid,
101
+ giterator_intervals,
102
+ )
103
+ from .liftover import (
104
+ gintervals_as_chain,
105
+ gintervals_liftover,
106
+ gintervals_load_chain,
107
+ gtrack_liftover,
108
+ )
109
+ from .lookup import glookup, gtrack_lookup
110
+ from .sequence import (
111
+ gseq_comp,
112
+ gseq_extract,
113
+ gseq_kmer,
114
+ gseq_kmer_dist,
115
+ gseq_pwm,
116
+ gseq_rev,
117
+ gseq_revcomp,
118
+ )
119
+ from .summary import (
120
+ gbins_quantiles,
121
+ gbins_summary,
122
+ gcor,
123
+ gdist,
124
+ gintervals_quantiles,
125
+ gintervals_summary,
126
+ gpartition,
127
+ gquantiles,
128
+ gsample,
129
+ gsummary,
130
+ )
131
+ from .tracks import (
132
+ gtrack_2d_create,
133
+ gtrack_2d_import,
134
+ gtrack_2d_import_contacts,
135
+ gtrack_attr_export,
136
+ gtrack_attr_get,
137
+ gtrack_attr_import,
138
+ gtrack_attr_set,
139
+ gtrack_convert_to_indexed,
140
+ gtrack_copy,
141
+ gtrack_create,
142
+ gtrack_create_dense,
143
+ gtrack_create_empty_indexed,
144
+ gtrack_create_pwm_energy,
145
+ gtrack_create_sparse,
146
+ gtrack_dataset,
147
+ gtrack_exists,
148
+ gtrack_import,
149
+ gtrack_import_mappedseq,
150
+ gtrack_import_set,
151
+ gtrack_info,
152
+ gtrack_ls,
153
+ gtrack_modify,
154
+ gtrack_mv,
155
+ gtrack_rm,
156
+ gtrack_smooth,
157
+ gtrack_var_get,
158
+ gtrack_var_ls,
159
+ gtrack_var_rm,
160
+ gtrack_var_set,
161
+ )
162
+ from .vtracks import (
163
+ gvtrack_clear,
164
+ gvtrack_create,
165
+ gvtrack_filter,
166
+ gvtrack_info,
167
+ gvtrack_iterator,
168
+ gvtrack_iterator_2d,
169
+ gvtrack_ls,
170
+ gvtrack_rm,
171
+ )
172
+
173
+
174
def __getattr__(name):
    """Resolve selected private state attributes lazily from ``_shared``.

    Called by Python only when normal module attribute lookup fails.
    The names ``_GROOT``, ``_UROOT`` and ``_VTRACKS`` are forwarded to the
    ``_shared`` module on every access, so callers see the current value
    rather than a copy captured at import time.

    Raises:
        AttributeError: for any other name.
    """
    forwarded = ("_GROOT", "_UROOT", "_VTRACKS")
    if name not in forwarded:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
    # Live lookup: read the value from _shared at access time.
    return getattr(_shared, name)
179
+
180
+
181
+ __all__ = [
182
+ # Configuration
183
+ 'CONFIG',
184
+
185
+ # Database functions
186
+ 'gdb_init',
187
+ 'gdb_reload',
188
+ 'gdb_unload',
189
+ 'gdb_info',
190
+ 'gdb_examples_path',
191
+ 'gdb_init_examples',
192
+ 'gsetroot',
193
+ 'gdb_create',
194
+ 'gdb_create_genome',
195
+ 'gdb_create_linked',
196
+ 'gdb_convert_to_indexed',
197
+ 'gdb_get_readonly_attrs',
198
+ 'gdb_set_readonly_attrs',
199
+ 'gdataset_load',
200
+ 'gdataset_unload',
201
+ 'gdataset_ls',
202
+ 'gdataset_save',
203
+ 'gdataset_info',
204
+
205
+ # Track functions
206
+ 'gextract',
207
+ 'gscreen',
208
+ 'gsummary',
209
+ 'gquantiles',
210
+ 'gdist',
211
+ 'gpartition',
212
+ 'gsample',
213
+ 'gcor',
214
+ 'gbins_summary',
215
+ 'gbins_quantiles',
216
+ 'gcis_decay',
217
+ 'gsegment',
218
+ 'gwilcox',
219
+ 'gtrack_ls',
220
+ 'gtrack_info',
221
+ 'gtrack_exists',
222
+ 'gtrack_dataset',
223
+ 'gtrack_create',
224
+ 'gtrack_create_dense',
225
+ 'gtrack_create_sparse',
226
+ 'gtrack_import',
227
+ 'gtrack_import_mappedseq',
228
+ 'gtrack_import_set',
229
+ 'gtrack_rm',
230
+ 'gtrack_mv',
231
+ 'gtrack_copy',
232
+ 'gtrack_convert_to_indexed',
233
+ 'gtrack_create_empty_indexed',
234
+ 'gtrack_attr_get',
235
+ 'gtrack_attr_set',
236
+ 'gtrack_attr_export',
237
+ 'gtrack_attr_import',
238
+ 'gtrack_var_ls',
239
+ 'gtrack_var_get',
240
+ 'gtrack_var_set',
241
+ 'gtrack_var_rm',
242
+ 'gtrack_modify',
243
+ 'gtrack_smooth',
244
+ 'gtrack_2d_create',
245
+ 'gtrack_2d_import',
246
+ 'gtrack_2d_import_contacts',
247
+ 'gtrack_create_pwm_energy',
248
+
249
+ # Interval functions
250
+ 'gintervals',
251
+ 'gintervals_all',
252
+ 'gintervals_2d',
253
+ 'gintervals_2d_all',
254
+ 'gintervals_2d_band_intersect',
255
+ 'gintervals_union',
256
+ 'gintervals_intersect',
257
+ 'gintervals_diff',
258
+ 'gintervals_canonic',
259
+ 'gintervals_force_range',
260
+ 'gintervals_summary',
261
+ 'gintervals_quantiles',
262
+ 'gintervals_covered_bp',
263
+ 'gintervals_coverage_fraction',
264
+ 'gintervals_neighbors',
265
+ 'gintervals_neighbors_upstream',
266
+ 'gintervals_neighbors_downstream',
267
+ 'gintervals_neighbors_directional',
268
+ 'gintervals_from_tuples',
269
+ 'gintervals_from_strings',
270
+ 'gintervals_from_bed',
271
+ 'gintervals_import_genes',
272
+ 'gintervals_window',
273
+ 'gintervals_ls',
274
+ 'gintervals_exists',
275
+ 'gintervals_dataset',
276
+ 'gintervals_chrom_sizes',
277
+ 'gintervals_load',
278
+ 'gintervals_convert_to_indexed',
279
+ 'gintervals_2d_convert_to_indexed',
280
+ 'gintervals_is_indexed',
281
+ 'gintervals_save',
282
+ 'gintervals_update',
283
+ 'gintervals_mapply',
284
+ 'gintervals_rm',
285
+ 'giterator_cartesian_grid',
286
+ 'giterator_intervals',
287
+ 'gintervals_rbind',
288
+ 'gintervals_mark_overlaps',
289
+ 'gintervals_annotate',
290
+ 'gintervals_normalize',
291
+ 'gintervals_random',
292
+
293
+ # Virtual track functions
294
+ 'gvtrack_create',
295
+ 'gvtrack_ls',
296
+ 'gvtrack_info',
297
+ 'gvtrack_iterator',
298
+ 'gvtrack_iterator_2d',
299
+ 'gvtrack_filter',
300
+ 'gvtrack_rm',
301
+ 'gvtrack_clear',
302
+
303
+ # Sequence functions
304
+ 'gseq_extract',
305
+ 'gseq_rev',
306
+ 'gseq_comp',
307
+ 'gseq_revcomp',
308
+ 'gseq_kmer',
309
+ 'gseq_kmer_dist',
310
+ 'gseq_pwm',
311
+
312
+ # Lookup functions
313
+ 'glookup',
314
+ 'gtrack_lookup',
315
+
316
+ # Liftover functions
317
+ 'gintervals_load_chain',
318
+ 'gintervals_as_chain',
319
+ 'gintervals_liftover',
320
+ 'gtrack_liftover',
321
+
322
+ # Directory management
323
+ 'gdir_cwd',
324
+ 'gdir_cd',
325
+ 'gdir_create',
326
+ 'gdir_rm',
327
+ 'gtrack_create_dirs',
328
+
329
+ # Genome synthesis functions
330
+ 'GsynthModel',
331
+ 'gsynth_bin_map',
332
+ 'gsynth_train',
333
+ 'gsynth_sample',
334
+ 'gsynth_random',
335
+ 'gsynth_replace_kmer',
336
+ 'gsynth_save',
337
+ 'gsynth_load',
338
+
339
+ # Internal (shared)
340
+ '_bound_colname',
341
+ '_checkroot',
342
+ '_chunk_slices',
343
+ '_df2pymisha',
344
+ '_iterated_intervals',
345
+ '_itr2pymisha',
346
+ '_make_progress_callback',
347
+ '_progress_context',
348
+ '_pymisha',
349
+ '_pymisha2df',
350
+ ]
351
+
352
+ # Export module locals to the C extension for access to Python functions
353
+ # This must be at the end of the file after all functions are defined
354
+ _pymisha._PMLOCALS = locals()
@@ -0,0 +1,34 @@
1
+ """Shared CRC64-ECMA helpers (parity with C++ CRC64.h)."""
2
+
3
# Polynomial used by the right-shifting (LSB-first) table construction below.
_CRC64_POLY = 0xC96C5795D7870F42
# Lookup table cache; populated on the first call to crc64_incremental().
_CRC64_TABLE = None


def _crc64_table():
    """Build and return the 256-entry CRC lookup table as a list of ints.

    Entry ``i`` is the result of pushing the byte value ``i`` through eight
    shift-right steps, XOR-ing in ``_CRC64_POLY`` whenever the bit shifted
    out was set.
    """
    mask64 = 0xFFFFFFFFFFFFFFFF
    entries = []
    for seed in range(256):
        value = seed
        for _bit in range(8):
            low_bit_set = value & 1
            value >>= 1
            if low_bit_set:
                value ^= _CRC64_POLY
        entries.append(value & mask64)
    return entries


def crc64_incremental(crc, data):
    """Fold ``data`` into the running CRC register ``crc`` and return it.

    Args:
        crc: Current register value (start from ``crc64_init()``).
        data: Iterable yielding integer byte values (e.g. ``bytes``).

    Returns:
        The updated register, masked to 64 bits. Feeding data in several
        chunks gives the same result as feeding it in one call.
    """
    global _CRC64_TABLE
    table = _CRC64_TABLE
    if table is None:
        # First use: build the table once and keep it for later calls.
        table = _CRC64_TABLE = _crc64_table()
    state = crc
    for octet in data:
        state = (state >> 8) ^ table[(state ^ octet) & 0xFF]
    return state & 0xFFFFFFFFFFFFFFFF
27
+
28
+
29
def crc64_init():
    """Return the starting CRC register value: all 64 bits set."""
    all_ones_64 = (1 << 64) - 1
    return all_ones_64
31
+
32
+
33
def crc64_finalize(crc):
    """Return the bitwise complement of ``crc``, truncated to 64 bits."""
    mask64 = 0xFFFFFFFFFFFFFFFF
    inverted = ~crc
    # Python ints are unbounded, so mask to keep a non-negative 64-bit value.
    return inverted & mask64
@@ -0,0 +1,22 @@
1
+ """Name validation helpers."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
# A letter first, then any mix of letters, digits, underscores and dots.
_DOTTED_NAME_RE = re.compile(r"^[A-Za-z][A-Za-z0-9_.]*$")


def validate_dotted_name(name, kind):
    """Check that ``name`` is a well-formed dot-separated identifier.

    Args:
        name: Candidate name to validate.
        kind: Human-readable label for the thing being named; it is only
            interpolated into the error messages.

    Returns:
        None when the name is valid.

    Raises:
        ValueError: if ``name`` is not a non-empty string, does not match
            the allowed character pattern, or contains an empty component
            between dots (for example ``"a..b"`` or ``"a."``).
    """
    if not (isinstance(name, str) and name):
        raise ValueError(f"{kind} must be a non-empty string")

    if _DOTTED_NAME_RE.fullmatch(name) is None:
        raise ValueError(
            f"Invalid {kind} '{name}'. Must start with a letter and contain "
            "only alphanumeric characters, underscores, and dots."
        )

    # The pattern permits consecutive or trailing dots, so reject those here.
    if "" in name.split("."):
        raise ValueError(
            f"Invalid {kind} '{name}'. Empty dot-separated components are not allowed."
        )