biotite 1.5.0__cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic.

Files changed (354)
  1. biotite/__init__.py +18 -0
  2. biotite/application/__init__.py +69 -0
  3. biotite/application/application.py +276 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +500 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +92 -0
  8. biotite/application/blast/webapp.py +428 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +223 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +216 -0
  13. biotite/application/localapp.py +342 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +116 -0
  16. biotite/application/msaapp.py +363 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +227 -0
  19. biotite/application/muscle/app5.py +163 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +447 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +199 -0
  24. biotite/application/util.py +77 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +310 -0
  27. biotite/application/viennarna/rnafold.py +254 -0
  28. biotite/application/viennarna/rnaplot.py +208 -0
  29. biotite/application/viennarna/util.py +77 -0
  30. biotite/application/webapp.py +76 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/afdb/__init__.py +12 -0
  34. biotite/database/afdb/download.py +197 -0
  35. biotite/database/entrez/__init__.py +15 -0
  36. biotite/database/entrez/check.py +60 -0
  37. biotite/database/entrez/dbnames.py +101 -0
  38. biotite/database/entrez/download.py +228 -0
  39. biotite/database/entrez/key.py +44 -0
  40. biotite/database/entrez/query.py +263 -0
  41. biotite/database/error.py +16 -0
  42. biotite/database/pubchem/__init__.py +21 -0
  43. biotite/database/pubchem/download.py +258 -0
  44. biotite/database/pubchem/error.py +30 -0
  45. biotite/database/pubchem/query.py +819 -0
  46. biotite/database/pubchem/throttle.py +98 -0
  47. biotite/database/rcsb/__init__.py +13 -0
  48. biotite/database/rcsb/download.py +161 -0
  49. biotite/database/rcsb/query.py +963 -0
  50. biotite/database/uniprot/__init__.py +13 -0
  51. biotite/database/uniprot/check.py +40 -0
  52. biotite/database/uniprot/download.py +126 -0
  53. biotite/database/uniprot/query.py +292 -0
  54. biotite/file.py +244 -0
  55. biotite/interface/__init__.py +19 -0
  56. biotite/interface/openmm/__init__.py +20 -0
  57. biotite/interface/openmm/state.py +93 -0
  58. biotite/interface/openmm/system.py +227 -0
  59. biotite/interface/pymol/__init__.py +201 -0
  60. biotite/interface/pymol/cgo.py +346 -0
  61. biotite/interface/pymol/convert.py +185 -0
  62. biotite/interface/pymol/display.py +267 -0
  63. biotite/interface/pymol/object.py +1228 -0
  64. biotite/interface/pymol/shapes.py +178 -0
  65. biotite/interface/pymol/startup.py +169 -0
  66. biotite/interface/rdkit/__init__.py +19 -0
  67. biotite/interface/rdkit/mol.py +490 -0
  68. biotite/interface/version.py +94 -0
  69. biotite/interface/warning.py +19 -0
  70. biotite/sequence/__init__.py +84 -0
  71. biotite/sequence/align/__init__.py +199 -0
  72. biotite/sequence/align/alignment.py +702 -0
  73. biotite/sequence/align/banded.cpython-314-x86_64-linux-gnu.so +0 -0
  74. biotite/sequence/align/banded.pyx +652 -0
  75. biotite/sequence/align/buckets.py +71 -0
  76. biotite/sequence/align/cigar.py +425 -0
  77. biotite/sequence/align/kmeralphabet.cpython-314-x86_64-linux-gnu.so +0 -0
  78. biotite/sequence/align/kmeralphabet.pyx +595 -0
  79. biotite/sequence/align/kmersimilarity.cpython-314-x86_64-linux-gnu.so +0 -0
  80. biotite/sequence/align/kmersimilarity.pyx +233 -0
  81. biotite/sequence/align/kmertable.cpython-314-x86_64-linux-gnu.so +0 -0
  82. biotite/sequence/align/kmertable.pyx +3411 -0
  83. biotite/sequence/align/localgapped.cpython-314-x86_64-linux-gnu.so +0 -0
  84. biotite/sequence/align/localgapped.pyx +892 -0
  85. biotite/sequence/align/localungapped.cpython-314-x86_64-linux-gnu.so +0 -0
  86. biotite/sequence/align/localungapped.pyx +279 -0
  87. biotite/sequence/align/matrix.py +631 -0
  88. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  89. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  93. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  94. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  95. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  96. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  97. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  98. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  99. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  100. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  101. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  102. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  103. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  104. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  105. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  106. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  107. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  108. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  109. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  110. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  111. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  112. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  113. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  114. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  115. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  116. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  117. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  118. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  119. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  120. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  121. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  122. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  154. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  155. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  156. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  157. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  158. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  159. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  160. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  161. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  162. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  163. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  164. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  165. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  166. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  167. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  168. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  169. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  170. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  171. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  172. biotite/sequence/align/matrix_data/PB.license +21 -0
  173. biotite/sequence/align/matrix_data/PB.mat +18 -0
  174. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  175. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  176. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  177. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  178. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  179. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  180. biotite/sequence/align/multiple.cpython-314-x86_64-linux-gnu.so +0 -0
  181. biotite/sequence/align/multiple.pyx +619 -0
  182. biotite/sequence/align/pairwise.cpython-314-x86_64-linux-gnu.so +0 -0
  183. biotite/sequence/align/pairwise.pyx +585 -0
  184. biotite/sequence/align/permutation.cpython-314-x86_64-linux-gnu.so +0 -0
  185. biotite/sequence/align/permutation.pyx +313 -0
  186. biotite/sequence/align/primes.txt +821 -0
  187. biotite/sequence/align/selector.cpython-314-x86_64-linux-gnu.so +0 -0
  188. biotite/sequence/align/selector.pyx +954 -0
  189. biotite/sequence/align/statistics.py +264 -0
  190. biotite/sequence/align/tracetable.cpython-314-x86_64-linux-gnu.so +0 -0
  191. biotite/sequence/align/tracetable.pxd +64 -0
  192. biotite/sequence/align/tracetable.pyx +370 -0
  193. biotite/sequence/alphabet.py +555 -0
  194. biotite/sequence/annotation.py +836 -0
  195. biotite/sequence/codec.cpython-314-x86_64-linux-gnu.so +0 -0
  196. biotite/sequence/codec.pyx +155 -0
  197. biotite/sequence/codon.py +476 -0
  198. biotite/sequence/codon_tables.txt +202 -0
  199. biotite/sequence/graphics/__init__.py +33 -0
  200. biotite/sequence/graphics/alignment.py +1101 -0
  201. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  202. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  203. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  204. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  205. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  206. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  207. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  208. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  209. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  210. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  211. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  212. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  213. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  214. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  215. biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
  216. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  217. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  218. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  219. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  220. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  221. biotite/sequence/graphics/colorschemes.py +170 -0
  222. biotite/sequence/graphics/dendrogram.py +231 -0
  223. biotite/sequence/graphics/features.py +544 -0
  224. biotite/sequence/graphics/logo.py +102 -0
  225. biotite/sequence/graphics/plasmid.py +712 -0
  226. biotite/sequence/io/__init__.py +12 -0
  227. biotite/sequence/io/fasta/__init__.py +22 -0
  228. biotite/sequence/io/fasta/convert.py +283 -0
  229. biotite/sequence/io/fasta/file.py +265 -0
  230. biotite/sequence/io/fastq/__init__.py +19 -0
  231. biotite/sequence/io/fastq/convert.py +117 -0
  232. biotite/sequence/io/fastq/file.py +507 -0
  233. biotite/sequence/io/genbank/__init__.py +17 -0
  234. biotite/sequence/io/genbank/annotation.py +269 -0
  235. biotite/sequence/io/genbank/file.py +573 -0
  236. biotite/sequence/io/genbank/metadata.py +336 -0
  237. biotite/sequence/io/genbank/sequence.py +173 -0
  238. biotite/sequence/io/general.py +201 -0
  239. biotite/sequence/io/gff/__init__.py +26 -0
  240. biotite/sequence/io/gff/convert.py +128 -0
  241. biotite/sequence/io/gff/file.py +449 -0
  242. biotite/sequence/phylo/__init__.py +36 -0
  243. biotite/sequence/phylo/nj.cpython-314-x86_64-linux-gnu.so +0 -0
  244. biotite/sequence/phylo/nj.pyx +221 -0
  245. biotite/sequence/phylo/tree.cpython-314-x86_64-linux-gnu.so +0 -0
  246. biotite/sequence/phylo/tree.pyx +1169 -0
  247. biotite/sequence/phylo/upgma.cpython-314-x86_64-linux-gnu.so +0 -0
  248. biotite/sequence/phylo/upgma.pyx +164 -0
  249. biotite/sequence/profile.py +561 -0
  250. biotite/sequence/search.py +117 -0
  251. biotite/sequence/seqtypes.py +720 -0
  252. biotite/sequence/sequence.py +373 -0
  253. biotite/setup_ccd.py +197 -0
  254. biotite/structure/__init__.py +135 -0
  255. biotite/structure/alphabet/__init__.py +25 -0
  256. biotite/structure/alphabet/encoder.py +332 -0
  257. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  258. biotite/structure/alphabet/i3d.py +109 -0
  259. biotite/structure/alphabet/layers.py +86 -0
  260. biotite/structure/alphabet/pb.license +21 -0
  261. biotite/structure/alphabet/pb.py +170 -0
  262. biotite/structure/alphabet/unkerasify.py +128 -0
  263. biotite/structure/atoms.py +1562 -0
  264. biotite/structure/basepairs.py +1403 -0
  265. biotite/structure/bonds.cpython-314-x86_64-linux-gnu.so +0 -0
  266. biotite/structure/bonds.pyx +2036 -0
  267. biotite/structure/box.py +724 -0
  268. biotite/structure/celllist.cpython-314-x86_64-linux-gnu.so +0 -0
  269. biotite/structure/celllist.pyx +864 -0
  270. biotite/structure/chains.py +310 -0
  271. biotite/structure/charges.cpython-314-x86_64-linux-gnu.so +0 -0
  272. biotite/structure/charges.pyx +520 -0
  273. biotite/structure/compare.py +683 -0
  274. biotite/structure/density.py +109 -0
  275. biotite/structure/dotbracket.py +213 -0
  276. biotite/structure/error.py +39 -0
  277. biotite/structure/filter.py +591 -0
  278. biotite/structure/geometry.py +817 -0
  279. biotite/structure/graphics/__init__.py +13 -0
  280. biotite/structure/graphics/atoms.py +243 -0
  281. biotite/structure/graphics/rna.py +298 -0
  282. biotite/structure/hbond.py +425 -0
  283. biotite/structure/info/__init__.py +24 -0
  284. biotite/structure/info/atom_masses.json +121 -0
  285. biotite/structure/info/atoms.py +98 -0
  286. biotite/structure/info/bonds.py +149 -0
  287. biotite/structure/info/ccd.py +200 -0
  288. biotite/structure/info/components.bcif +0 -0
  289. biotite/structure/info/groups.py +128 -0
  290. biotite/structure/info/masses.py +121 -0
  291. biotite/structure/info/misc.py +137 -0
  292. biotite/structure/info/radii.py +267 -0
  293. biotite/structure/info/standardize.py +185 -0
  294. biotite/structure/integrity.py +213 -0
  295. biotite/structure/io/__init__.py +29 -0
  296. biotite/structure/io/dcd/__init__.py +13 -0
  297. biotite/structure/io/dcd/file.py +67 -0
  298. biotite/structure/io/general.py +243 -0
  299. biotite/structure/io/gro/__init__.py +14 -0
  300. biotite/structure/io/gro/file.py +343 -0
  301. biotite/structure/io/mol/__init__.py +20 -0
  302. biotite/structure/io/mol/convert.py +112 -0
  303. biotite/structure/io/mol/ctab.py +420 -0
  304. biotite/structure/io/mol/header.py +120 -0
  305. biotite/structure/io/mol/mol.py +149 -0
  306. biotite/structure/io/mol/sdf.py +940 -0
  307. biotite/structure/io/netcdf/__init__.py +13 -0
  308. biotite/structure/io/netcdf/file.py +64 -0
  309. biotite/structure/io/pdb/__init__.py +20 -0
  310. biotite/structure/io/pdb/convert.py +389 -0
  311. biotite/structure/io/pdb/file.py +1380 -0
  312. biotite/structure/io/pdb/hybrid36.cpython-314-x86_64-linux-gnu.so +0 -0
  313. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  314. biotite/structure/io/pdbqt/__init__.py +15 -0
  315. biotite/structure/io/pdbqt/convert.py +113 -0
  316. biotite/structure/io/pdbqt/file.py +688 -0
  317. biotite/structure/io/pdbx/__init__.py +23 -0
  318. biotite/structure/io/pdbx/bcif.py +674 -0
  319. biotite/structure/io/pdbx/cif.py +1091 -0
  320. biotite/structure/io/pdbx/component.py +251 -0
  321. biotite/structure/io/pdbx/compress.py +362 -0
  322. biotite/structure/io/pdbx/convert.py +2113 -0
  323. biotite/structure/io/pdbx/encoding.cpython-314-x86_64-linux-gnu.so +0 -0
  324. biotite/structure/io/pdbx/encoding.pyx +1078 -0
  325. biotite/structure/io/trajfile.py +696 -0
  326. biotite/structure/io/trr/__init__.py +13 -0
  327. biotite/structure/io/trr/file.py +43 -0
  328. biotite/structure/io/util.py +38 -0
  329. biotite/structure/io/xtc/__init__.py +13 -0
  330. biotite/structure/io/xtc/file.py +43 -0
  331. biotite/structure/mechanics.py +72 -0
  332. biotite/structure/molecules.py +337 -0
  333. biotite/structure/pseudoknots.py +622 -0
  334. biotite/structure/rdf.py +245 -0
  335. biotite/structure/repair.py +302 -0
  336. biotite/structure/residues.py +716 -0
  337. biotite/structure/rings.py +451 -0
  338. biotite/structure/sasa.cpython-314-x86_64-linux-gnu.so +0 -0
  339. biotite/structure/sasa.pyx +322 -0
  340. biotite/structure/segments.py +328 -0
  341. biotite/structure/sequence.py +110 -0
  342. biotite/structure/spacegroups.json +1567 -0
  343. biotite/structure/spacegroups.license +26 -0
  344. biotite/structure/sse.py +306 -0
  345. biotite/structure/superimpose.py +511 -0
  346. biotite/structure/tm.py +581 -0
  347. biotite/structure/transform.py +736 -0
  348. biotite/structure/util.py +160 -0
  349. biotite/version.py +34 -0
  350. biotite/visualize.py +375 -0
  351. biotite-1.5.0.dist-info/METADATA +162 -0
  352. biotite-1.5.0.dist-info/RECORD +354 -0
  353. biotite-1.5.0.dist-info/WHEEL +6 -0
  354. biotite-1.5.0.dist-info/licenses/LICENSE.rst +30 -0
biotite/structure/io/pdbx/encoding.pyx
@@ -0,0 +1,1078 @@
+# This source code is part of the Biotite package and is distributed
+# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+# information.
+
+"""
+This module contains data encodings for BinaryCIF files.
+"""
+
+__name__ = "biotite.structure.io.pdbx"
+__author__ = "Patrick Kunzmann"
+__all__ = ["ByteArrayEncoding", "FixedPointEncoding",
+           "IntervalQuantizationEncoding", "RunLengthEncoding",
+           "DeltaEncoding", "IntegerPackingEncoding", "StringArrayEncoding",
+           "TypeCode"]
+
+cimport cython
+cimport numpy as np
+
+from dataclasses import dataclass
+from abc import ABCMeta, abstractmethod
+from numbers import Integral
+from enum import IntEnum
+import re
+import numpy as np
+from .component import _Component
+from ....file import InvalidFileError
+
+ctypedef np.int8_t int8
+ctypedef np.int16_t int16
+ctypedef np.int32_t int32
+ctypedef np.uint8_t uint8
+ctypedef np.uint16_t uint16
+ctypedef np.uint32_t uint32
+ctypedef np.float32_t float32
+ctypedef np.float64_t float64
+
+ctypedef fused Integer:
+    uint8
+    uint16
+    uint32
+    int8
+    int16
+    int32
+
+# Used to create cartesian product of type combinations
+# in run-length encoding
+ctypedef fused OutputInteger:
+    uint8
+    uint16
+    uint32
+    int8
+    int16
+    int32
+
+ctypedef fused Float:
+    float32
+    float64
+
+
+CAMEL_CASE_PATTERN = re.compile(r"(?<!^)(?=[A-Z])")
+
+
+class TypeCode(IntEnum):
+    """
+    This enum type represents the integer codes that denote data types in
+    *BinaryCIF*.
+    """
+    INT8 = 1
+    INT16 = 2
+    INT32 = 3
+    UINT8 = 4
+    UINT16 = 5
+    UINT32 = 6
+    FLOAT32 = 32
+    FLOAT64 = 33
+
+    @staticmethod
+    def from_dtype(dtype):
+        """
+        Convert a *NumPy* dtype to a *BinaryCIF* type code.
+
+        Parameters
+        ----------
+        dtype : dtype or int or TypeCode
+            The data type to be converted.
+            If already a type code, it is simply returned.
+
+        Returns
+        -------
+        type_code : TypeCode
+            The corresponding type code.
+        """
+        if isinstance(dtype, Integral):
+            # Already a type code
+            return TypeCode(dtype)
+        else:
+            dtype = np.dtype(dtype)
+            # Find the closest dtype supported by the format
+            if np.issubdtype(dtype, np.integer):
+                # int64 is not supported by the format
+                if dtype == np.int64:
+                    supported_dtype = np.int32
+                elif dtype == np.uint64:
+                    supported_dtype = np.uint32
+                else:
+                    supported_dtype = dtype
+            elif np.issubdtype(dtype, np.floating):
+                if dtype == np.float16:
+                    supported_dtype = np.float32
+                # float128 is not available on all architectures
+                elif hasattr(np, "float128") and dtype == np.float128:
+                    supported_dtype = np.float64
+                else:
+                    supported_dtype = dtype
+            else:
+                raise ValueError(
+                    f"dtype '{dtype}' is not supported by BinaryCIF"
+                )
+            return _DTYPE_TO_TYPE_CODE[
+                np.dtype(supported_dtype).newbyteorder("<").str
+            ]
+
+    def to_dtype(self):
+        """
+        Convert this type code to a *NumPy* dtype.
+
+        Returns
+        -------
+        dtype : dtype
+            The corresponding data type.
+        """
+        return _TYPE_CODE_TO_DTYPE[self]
+
+# Converts BCIF integers representing the type to an actual NumPy dtype
+_TYPE_CODE_TO_DTYPE = {
+    # All data types are little-endian
+    TypeCode.INT8: "|i1",
+    TypeCode.INT16: "<i2",
+    TypeCode.INT32: "<i4",
+    TypeCode.UINT8: "|u1",
+    TypeCode.UINT16: "<u2",
+    TypeCode.UINT32: "<u4",
+    TypeCode.FLOAT32: "<f4",
+    TypeCode.FLOAT64: "<f8"
+}
+_DTYPE_TO_TYPE_CODE = {val: key for key, val in _TYPE_CODE_TO_DTYPE.items()}
+
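The two lookup tables above close the loop between NumPy dtypes and the integer type codes stored in a BinaryCIF file. A minimal usage sketch (not part of encoding.pyx, and assuming that TypeCode is re-exported from biotite.structure.io.pdbx, as the __all__ list and the __name__ assignment above suggest):

    import numpy as np
    from biotite.structure.io.pdbx import TypeCode

    # int64 has no BinaryCIF type code, so it is narrowed to the closest supported type
    assert TypeCode.from_dtype(np.int64) is TypeCode.INT32
    # float16 is widened to the smallest supported floating point type
    assert TypeCode.from_dtype(np.float16) is TypeCode.FLOAT32
    # to_dtype() yields the corresponding little-endian NumPy dtype string
    assert np.dtype(TypeCode.INT32.to_dtype()) == np.dtype("<i4")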
+
+class Encoding(_Component, metaclass=ABCMeta):
+    """
+    Abstract base class for *BinaryCIF* data encodings.
+
+    Notes
+    -----
+    The encoding classes do not omit bound checks for decoding,
+    since the file content may be invalid/malicious.
+    """
+
+    @classmethod
+    def deserialize(cls, content):
+        params = {
+            _camel_to_snake_case(param): value
+            for param, value in content.items()
+        }
+        # 'kind' is not a parameter, but indicates the class itself
+        params.pop("kind")
+        try:
+            encoding = cls(**params)
+        except TypeError as e:
+            raise InvalidFileError(
+                f"Invalid encoding parameters for {cls.__name__}"
+            )
+        except ValueError:
+            raise InvalidFileError(
+                f"Missing encoding parameters for {cls.__name__}"
+            )
+        return encoding
+
+    def serialize(self):
+        for param in self.__annotations__:
+            if getattr(self, param) is None:
+                raise ValueError(
+                    f"'{param}' must be explicitly given or needs to be "
+                    "determined from first encoding pass, before it is "
+                    "serialized"
+                )
+
+        serialized = {
+            _snake_to_camel_case(param): getattr(self, param)
+            for param in self.__annotations__
+        }
+        serialized.update({
+            "kind": _encoding_classes_kinds[type(self).__name__]
+        })
+        return serialized
+
+    @abstractmethod
+    def encode(self, data):
+        """
+        Apply this encoding to the given data.
+
+        Parameters
+        ----------
+        data : ndarray
+            The data to be encoded.
+
+        Returns
+        -------
+        encoded_data : ndarray or bytes
+            The encoded data.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def decode(self, data):
+        """
+        Apply the inverse of this encoding to the given data.
+
+        Parameters
+        ----------
+        data : ndarray or bytes
+            The data to be decoded.
+
+        Returns
+        -------
+        decoded_data : ndarray
+            The decoded data.
+
+        Warnings
+        --------
+        When overriding this method, do not omit bound checks with
+        ``@cython.boundscheck(False)`` or ``@cython.wraparound(False)``,
+        since the file content may be invalid/malicious.
+        """
+        raise NotImplementedError()
+
+    def __str__(self):
+        # Restore original behavior, as `__str__()` implementation of `_Component`
+        # may require serialization, which is not possible for some encodings prior
+        # to the first encoding pass
+        return object.__str__(self)
+
+
+@dataclass
+class ByteArrayEncoding(Encoding):
+    r"""
+    Encoding that encodes an array into bytes.
+
+    Parameters
+    ----------
+    type : dtype or TypeCode, optional
+        The data type of the array to be encoded.
+        Either a NumPy dtype or a *BinaryCIF* type code is accepted.
+        If omitted, the data type is taken from the data the
+        first time :meth:`encode()` is called.
+
+    Attributes
+    ----------
+    type : TypeCode
+
+    Examples
+    --------
+
+    >>> data = np.arange(3)
+    >>> print(data)
+    [0 1 2]
+    >>> print(ByteArrayEncoding().encode(data))
+    b'\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00'
+    """
+    type: ... = None
+
+    def __post_init__(self):
+        if self.type is not None:
+            self.type = TypeCode.from_dtype(self.type)
+
+    def encode(self, data):
+        if self.type is None:
+            self.type = TypeCode.from_dtype(data.dtype)
+        return _safe_cast(data, self.type.to_dtype()).tobytes()
+
+    def decode(self, data):
+        # Data is raw bytes in this case
+        return np.frombuffer(data, dtype=self.type.to_dtype())
+
+
+@dataclass
+class FixedPointEncoding(Encoding):
+    """
+    Lossy encoding that multiplies floating point values by a given
+    factor and subsequently rounds them to the nearest integer.
+
+    Parameters
+    ----------
+    factor : float
+        The factor by which the data is multiplied before rounding.
+    src_type : dtype or TypeCode, optional
+        The data type of the array to be encoded.
+        Either a NumPy dtype or a *BinaryCIF* type code is accepted.
+        The dtype must be a float type.
+        If omitted, the data type is taken from the data the
+        first time :meth:`encode()` is called.
+
+    Attributes
+    ----------
+    factor : float
+    src_type : TypeCode
+
+    Examples
+    --------
+
+    >>> data = np.array([9.87, 6.543])
+    >>> print(data)
+    [9.870 6.543]
+    >>> print(FixedPointEncoding(factor=100).encode(data))
+    [987 654]
+    """
+    factor: ...
+    src_type: ... = None
+
+    def __post_init__(self):
+        if self.src_type is not None:
+            self.src_type = TypeCode.from_dtype(self.src_type)
+            if self.src_type not in (TypeCode.FLOAT32, TypeCode.FLOAT64):
+                raise ValueError(
+                    "Only floating point types are supported"
+                )
+
+    def encode(self, data):
+        # If not given in constructor, it is determined from the data
+        if self.src_type is None:
+            self.src_type = TypeCode.from_dtype(data.dtype)
+            if self.src_type not in (TypeCode.FLOAT32, TypeCode.FLOAT64):
+                raise ValueError(
+                    "Only floating point types are supported"
+                )
+
+        # Round to avoid wrong values due to floating point inaccuracies
+        scaled_data = np.round(data * self.factor)
+        return _safe_cast(scaled_data, np.int32, allow_decimal_loss=True)
+
+    def decode(self, data):
+        return (data / self.factor).astype(
+            dtype=self.src_type.to_dtype(), copy=False
+        )
+
+
+@dataclass
+class IntervalQuantizationEncoding(Encoding):
+    """
+    Lossy encoding that sorts floating point values into bins.
+    Each bin is represented by an integer.
+
+    Parameters
+    ----------
+    min, max : float
+        The minimum and maximum value the bins comprise.
+    num_steps : int
+        The number of bins.
+    src_type : dtype or TypeCode, optional
+        The data type of the array to be encoded.
+        Either a NumPy dtype or a *BinaryCIF* type code is accepted.
+        The dtype must be a float type.
+        If omitted, the data type is taken from the data the
+        first time :meth:`encode()` is called.
+
+    Attributes
+    ----------
+    min, max : float
+    num_steps : int
+    src_type : TypeCode
+
+    Examples
+    --------
+
+    >>> data = np.linspace(11, 12, 6)
+    >>> print(data)
+    [11.0 11.2 11.4 11.6 11.8 12.0]
+    >>> # Use 0.5 as step size
+    >>> encoding = IntervalQuantizationEncoding(min=10, max=20, num_steps=21)
+    >>> # The encoding is lossy, as different values are mapped to the same bin
+    >>> encoded = encoding.encode(data)
+    >>> print(encoded)
+    [2 3 3 4 4 4]
+    >>> decoded = encoding.decode(encoded)
+    >>> print(decoded)
+    [11.0 11.5 11.5 12.0 12.0 12.0]
+    """
+    min: ...
+    max: ...
+    num_steps: ...
+    src_type: ... = None
+
+    def __post_init__(self):
+        if self.src_type is not None:
+            self.src_type = TypeCode.from_dtype(self.src_type)
+
+    def encode(self, data):
+        # If not given in constructor, it is determined from the data
+        if self.src_type is None:
+            self.src_type = TypeCode.from_dtype(data.dtype)
+
+        steps = np.linspace(
+            self.min, self.max, self.num_steps, dtype=data.dtype
+        )
+        indices = np.searchsorted(steps, data, side="left")
+        return _safe_cast(indices, np.int32)
+
+    def decode(self, data):
+        output = data * (self.max - self.min) / (self.num_steps - 1)
+        output = output.astype(self.src_type.to_dtype(), copy=False)
+        output += self.min
+        return output
+
+
+@dataclass
+class RunLengthEncoding(Encoding):
+    """
+    Encoding that compresses runs of equal values into pairs of
+    (value, run length).
+
+    Parameters
+    ----------
+    src_size : int, optional
+        The size of the array to be encoded.
+        If omitted, the size is determined from the data the
+        first time :meth:`encode()` is called.
+    src_type : dtype or TypeCode, optional
+        The data type of the array to be encoded.
+        Either a NumPy dtype or a *BinaryCIF* type code is accepted.
+        The dtype must be an integer type.
+        If omitted, the data type is taken from the data the
+        first time :meth:`encode()` is called.
+
+    Attributes
+    ----------
+    src_size : int
+    src_type : TypeCode
+
+    Examples
+    --------
+
+    >>> data = np.array([1, 1, 1, 5, 3, 3])
+    >>> print(data)
+    [1 1 1 5 3 3]
+    >>> encoded = RunLengthEncoding().encode(data)
+    >>> print(encoded)
+    [1 3 5 1 3 2]
+    >>> # Emphasize the pairs
+    >>> print(encoded.reshape(-1, 2))
+    [[1 3]
+     [5 1]
+     [3 2]]
+    """
+    src_size: ... = None
+    src_type: ... = None
+
+    def __post_init__(self):
+        if self.src_type is not None:
+            self.src_type = TypeCode.from_dtype(self.src_type)
+
+    def encode(self, data):
+        # If not given in constructor, it is determined from the data
+        if self.src_type is None:
+            self.src_type = TypeCode.from_dtype(data.dtype)
+        if self.src_size is None:
+            self.src_size = data.shape[0]
+        elif self.src_size != data.shape[0]:
+            raise IndexError(
+                "Given source size does not match actual data size"
+            )
+        return self._encode(_safe_cast(data, self.src_type.to_dtype()))
+
+    def decode(self, data):
+        return self._decode(
+            data, np.empty(0, dtype=self.src_type.to_dtype())
+        )
+
+    def _encode(self, const Integer[:] data):
+        # Pessimistic allocation of output array
+        # -> Run length is 1 for every element
+        cdef int32[:] output = np.zeros(data.shape[0] * 2, dtype=np.int32)
+        cdef int i=0, j=0
+        cdef int val = data[0]
+        cdef int run_length = 0
+        cdef int curr_val
+        for i in range(data.shape[0]):
+            curr_val = data[i]
+            if curr_val == val:
+                run_length += 1
+            else:
+                # New element -> Write element with run-length
+                output[j] = val
+                output[j+1] = run_length
+                j += 2
+                val = curr_val
+                run_length = 1
+        # Write last element
+        output[j] = val
+        output[j+1] = run_length
+        j += 2
+        # Trim to correct size
+        return np.asarray(output)[:j]
+
+    def _decode(self, const Integer[:] data, OutputInteger[:] output_type):
+        """
+        `output_type` is merely a typed placeholder to allow for static
+        typing of output.
+        """
+        if data.shape[0] % 2 != 0:
+            raise ValueError("Invalid run-length encoded data")
+
+        cdef int length = 0
+        cdef int i, j
+        cdef int value, repeat
+
+        if self.src_size is None:
+            # Determine length of output array by summing run lengths
+            for i in range(1, data.shape[0], 2):
+                length += data[i]
+        else:
+            length = self.src_size
+
+        cdef OutputInteger[:] output = np.zeros(
+            length, dtype=np.asarray(output_type).dtype
+        )
+        # Fill output array
+        j = 0
+        for i in range(0, data.shape[0], 2):
+            value = data[i]
+            repeat = data[i+1]
+            output[j : j+repeat] = value
+            j += repeat
+        return np.asarray(output)
+
+
+@dataclass
+class DeltaEncoding(Encoding):
+    """
+    Encoding that encodes an array of integers into an array of
+    consecutive differences.
+
+    Parameters
+    ----------
+    src_type : dtype or TypeCode, optional
+        The data type of the array to be encoded.
+        Either a NumPy dtype or a *BinaryCIF* type code is accepted.
+        The dtype must be an integer type.
+        If omitted, the data type is taken from the data the
+        first time :meth:`encode()` is called.
+    origin : int, optional
+        The starting value from which the differences are calculated.
+        If omitted, the value is taken from the first array element the
+        first time :meth:`encode()` is called.
+
+    Attributes
+    ----------
+    src_type : TypeCode
+    origin : int
+
+    Examples
+    --------
+
+    >>> data = np.array([1, 1, 2, 3, 5, 8])
+    >>> encoding = DeltaEncoding()
+    >>> print(encoding.encode(data))
+    [0 0 1 1 2 3]
+    >>> print(encoding.origin)
+    1
+    """
+    src_type: ... = None
+    origin: ... = None
+
+    def __post_init__(self):
+        if self.src_type is not None:
+            self.src_type = TypeCode.from_dtype(self.src_type)
+
+    def encode(self, data):
+        # If not given in constructor, it is determined from the data
+        if self.src_type is None:
+            self.src_type = TypeCode.from_dtype(data.dtype)
+        if self.origin is None:
+            self.origin = data[0]
+
+        # Differences (including `np.diff`) return an array with the same dtype as the
+        # input array
+        # As the input dtype may be unsigned, the output dtype could underflow,
+        # if the difference is negative
+        # -> cast to int64 to avoid this
+        data = data.astype(np.int64, copy=False)
+        data = data - self.origin
+        return _safe_cast(np.diff(data, prepend=0), np.int32)
+
+    def decode(self, data):
+        output = np.cumsum(data, dtype=self.src_type.to_dtype())
+        output += self.origin
+        return output
+
+
+@dataclass
+class IntegerPackingEncoding(Encoding):
+    """
+    Encoding that compresses an array of 32-bit integers into an array
+    of smaller sized integers.
+
+    If a value does not fit into the smaller integer type,
+    the integer is represented by a sum of consecutive elements
+    in the compressed array.
+
+    Parameters
+    ----------
+    byte_count : int
+        The number of bytes the packed integers should occupy.
+        Supported values are 1 and 2 for 8-bit and 16-bit integers,
+        respectively.
+    src_size : int, optional
+        The size of the array to be encoded.
+        If omitted, the size is determined from the data the
+        first time :meth:`encode()` is called.
+    is_unsigned : bool, optional
+        Whether the values should be packed into signed or unsigned
+        integers.
+        If omitted, it is determined the first time :meth:`encode()` is
+        called, based on whether the values fit into unsigned integers.
+
+    Attributes
+    ----------
+    byte_count : int
+    src_size : int
+    is_unsigned : bool
+
+    Examples
+    --------
+
+    >>> data = np.array([1, 2, -3, 128])
+    >>> print(data)
+    [  1   2  -3 128]
+    >>> print(IntegerPackingEncoding(byte_count=1).encode(data))
+    [  1   2  -3 127   1]
+    """
+    byte_count: ...
+    src_size: ... = None
+    is_unsigned: ... = None
+
+    def encode(self, data):
+        if self.src_size is None:
+            self.src_size = len(data)
+        elif self.src_size != len(data):
+            raise IndexError(
+                "Given source size does not match actual data size"
+            )
+        if self.is_unsigned is None:
+            # Only positive values -> use unsigned integers
+            self.is_unsigned = data.min().item() >= 0
+
+        data = _safe_cast(data, np.int32)
+        return self._encode(
+            data, np.empty(0, dtype=self._determine_packed_dtype())
+        )
+
+    def decode(self, const Integer[:] data):
+        cdef int i, j
+        cdef int min_val, max_val
+        cdef int packed_val, unpacked_val
+        bounds = self._get_bounds(data)
+        min_val = bounds[0]
+        max_val = bounds[1]
+        # For unsigned integers the lower bound (0) is a regular value, not a sentinel
+        # -> Set lower bound to value that is never reached
+        if min_val == 0:
+            min_val = -1
+
+        cdef int32[:] output = np.zeros(self.src_size, dtype=np.int32)
+        j = 0
+        unpacked_val = 0
+        for i in range(data.shape[0]):
+            packed_val = data[i]
+            if packed_val == max_val or packed_val == min_val:
+                unpacked_val += packed_val
+            else:
+                unpacked_val += packed_val
+                output[j] = unpacked_val
+                unpacked_val = 0
+                j += 1
+        # Trim to correct size and return
+        return np.asarray(output)
+
+    def _determine_packed_dtype(self):
+        if self.byte_count == 1:
+            if self.is_unsigned:
+                return np.uint8
+            else:
+                return np.int8
+        elif self.byte_count == 2:
+            if self.is_unsigned:
+                return np.uint16
+            else:
+                return np.int16
+        else:
+            raise ValueError("Unsupported byte count")
+
+    @cython.cdivision(True)
+    def _encode(self, const Integer[:] data, OutputInteger[:] output_type):
+        """
+        `output_type` is merely a typed placeholder to allow for static
+        typing of output.
+        """
+        cdef int i=0, j=0
+
+        packed_type = np.asarray(output_type).dtype
+        cdef int min_val = np.iinfo(packed_type).min
+        cdef int max_val = np.iinfo(packed_type).max
+
+        # Get length of output array
+        # by summing up required length of each element
+        cdef int number
+        cdef long length = 0
+        for i in range(data.shape[0]):
+            number = data[i]
+            if number < 0:
+                if min_val == 0:
+                    raise ValueError(
+                        "Cannot pack negative numbers into unsigned type"
+                    )
+                # The required packed length for an element is the
+                # number of times min_val/max_val need to be repeated
+                length += number // min_val + 1
+            elif number > 0:
+                length += number // max_val + 1
+            else:
+                # number = 0
+                length += 1
+
+        # Fill output
+        cdef OutputInteger[:] output = np.zeros(length, dtype=packed_type)
+        cdef int remainder
+        j = 0
+        for i in range(data.shape[0]):
+            remainder = data[i]
+            if remainder < 0:
+                if min_val == 0:
+                    raise ValueError(
+                        "Cannot pack negative numbers into unsigned type"
+                    )
+                while remainder <= min_val:
+                    remainder -= min_val
+                    output[j] = min_val
+                    j += 1
+            elif remainder > 0:
+                while remainder >= max_val:
+                    remainder -= max_val
+                    output[j] = max_val
+                    j += 1
+            output[j] = remainder
+            j += 1
+        return np.asarray(output)
+
+    @staticmethod
+    def _get_bounds(const Integer[:] data):
+        if Integer is int8:
+            info = np.iinfo(np.int8)
+        elif Integer is int16:
+            info = np.iinfo(np.int16)
+        elif Integer is int32:
+            info = np.iinfo(np.int32)
+        elif Integer is uint8:
+            info = np.iinfo(np.uint8)
+        elif Integer is uint16:
+            info = np.iinfo(np.uint16)
+        elif Integer is uint32:
+            info = np.iinfo(np.uint32)
+        else:
+            raise ValueError("Unsupported integer type")
+        return info.min, info.max
+
+
+@dataclass
+class StringArrayEncoding(Encoding):
+    """
+    Encoding that compresses an array of strings into an array of
+    indices that point to the unique strings in that array.
+
+    The unique strings themselves are stored as part of the
+    :class:`StringArrayEncoding` as a concatenated string.
+    The start index of each unique string in the concatenated string
+    is stored in an *offset* array.
+
+    Parameters
+    ----------
+    strings : ndarray, optional
+        The unique strings that are used for encoding.
+        If omitted, the unique strings are determined from the data the
+        first time :meth:`encode()` is called.
+    data_encoding : list of Encoding, optional
+        The encodings that are applied to the index array.
+        If omitted, the array is directly encoded into bytes without
+        further compression.
+    offset_encoding : list of Encoding, optional
+        The encodings that are applied to the offset array.
+        If omitted, the array is directly encoded into bytes without
+        further compression.
+
+    Attributes
+    ----------
+    strings : ndarray
+    data_encoding : list of Encoding
+    offset_encoding : list of Encoding
+
+    Examples
+    --------
+
+    >>> data = np.array(["apple", "banana", "cherry", "apple", "banana", "apple"])
+    >>> print(data)
+    ['apple' 'banana' 'cherry' 'apple' 'banana' 'apple']
+    >>> # By default the indices would directly be encoded into bytes
+    >>> # However, the indices should be printed here -> data_encoding=[]
+    >>> encoding = StringArrayEncoding(data_encoding=[])
+    >>> encoded = encoding.encode(data)
+    >>> print(encoding.strings)
+    ['apple' 'banana' 'cherry']
+    >>> print(encoded)
+    [0 1 2 0 1 0]
+    """
+
+    strings: ... = None
+    data_encoding: ... = None
+    offset_encoding: ... = None
+
+    def __init__(self, strings=None, data_encoding=None, offset_encoding=None):
+        self.strings = strings
+        if data_encoding is None:
+            data_encoding = [ByteArrayEncoding(TypeCode.INT32)]
+        self.data_encoding = data_encoding
+        if offset_encoding is None:
+            offset_encoding = [ByteArrayEncoding(TypeCode.INT32)]
+        self.offset_encoding = offset_encoding
+
+    @staticmethod
+    def deserialize(content):
+        data_encoding = [
+            deserialize_encoding(e) for e in content["dataEncoding"]
+        ]
+        offset_encoding = [
+            deserialize_encoding(e) for e in content["offsetEncoding"]
+        ]
+        cdef str concatenated_strings = content["stringData"]
+        cdef np.ndarray offsets = decode_stepwise(
+            content["offsets"], offset_encoding
+        )
+
+        strings = np.array([
+            concatenated_strings[offsets[i]:offsets[i+1]]
+            # The final offset is the exclusive stop index
+            for i in range(len(offsets)-1)
+        ], dtype="U")
+
+        return StringArrayEncoding(strings, data_encoding, offset_encoding)
+
+    def serialize(self):
+        if self.strings is None:
+            raise ValueError(
+                "'strings' must be explicitly given or needs to be "
+                "determined from first encoding pass, before it is serialized"
+            )
+
+        string_data = "".join(self.strings)
+        offsets = np.cumsum([0] + [len(s) for s in self.strings])
+
+        return {
+            "kind": "StringArray",
+            "dataEncoding": [e.serialize() for e in self.data_encoding],
+            "stringData": string_data,
+            "offsets": encode_stepwise(offsets, self.offset_encoding),
+            "offsetEncoding": [e.serialize() for e in self.offset_encoding],
+        }
+
+    def encode(self, data):
+        if not np.issubdtype(data.dtype, np.str_):
+            raise TypeError("Data must be of string type")
+
+        if self.strings is None:
+            # 'unique()' already sorts the strings, but this is not necessarily
+            # desired, as this makes efficient encoding of the indices more difficult
+            # -> Bring into the original order
+            _, unique_indices = np.unique(data, return_index=True)
+            self.strings = data[np.sort(unique_indices)]
+            check_present = False
+        else:
+            check_present = True
+
+        if len(self.strings) > 0:
+            string_order = _safe_cast(np.argsort(self.strings), np.int32)
+            sorted_strings = self.strings[string_order]
+            sorted_indices = np.searchsorted(sorted_strings, data)
+            indices = string_order[sorted_indices]
+            # `"" not in self.strings` can be quite costly and is only necessary,
+            # if the `strings` were given by the user, as otherwise we always
+            # include an empty string explicitly when we compute them in this function
+            # -> Only run if `check_present` is True
+            if check_present and "" not in self.strings:
+                # Represent empty strings as -1
+                indices[data == ""] = -1
+        else:
+            # There are no strings -> The indices can only ever be -1 to indicate
+            # missing values
+            # The check if this is correct is done below
+            indices = np.full(data.shape[0], -1, dtype=np.int32)
+
+        valid_indices_mask = indices != -1
+        if check_present and not np.all(
+            self.strings[indices[valid_indices_mask]] == data[valid_indices_mask]
+        ):
+            raise ValueError("Data contains strings not present in 'strings'")
+        return encode_stepwise(indices, self.data_encoding)
+
+    def decode(self, data):
+        indices = decode_stepwise(data, self.data_encoding)
+        # Initialize with empty strings
+        strings = np.zeros(indices.shape[0], dtype=self.strings.dtype)
+        # ``-1`` indices indicate missing values
+        valid_indices_mask = indices != -1
+        strings[valid_indices_mask] = self.strings[indices[valid_indices_mask]]
+        return strings
+
+    def __eq__(self, other):
+        if not isinstance(other, type(self)):
+            return False
+        if not np.array_equal(self.strings, other.strings):
+            return False
+        if self.data_encoding != other.data_encoding:
+            return False
+        if self.offset_encoding != other.offset_encoding:
+            return False
+        return True
+
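Because a StringArrayEncoding records the unique strings and keeps its inner data_encoding instances, the same object can decode what it previously encoded. A minimal round-trip sketch (not part of encoding.pyx, and assuming the class is importable from biotite.structure.io.pdbx):

    import numpy as np
    from biotite.structure.io.pdbx import StringArrayEncoding

    data = np.array(["apple", "banana", "", "apple"])
    encoding = StringArrayEncoding()
    # With the default data_encoding the index array is packed into raw bytes
    packed = encoding.encode(data)
    print(encoding.strings)         # unique strings in order of first appearance
    print(encoding.decode(packed))  # restores the original array, including ""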
+
+_encoding_classes = {
+    "ByteArray": ByteArrayEncoding,
+    "FixedPoint": FixedPointEncoding,
+    "IntervalQuantization": IntervalQuantizationEncoding,
+    "RunLength": RunLengthEncoding,
+    "Delta": DeltaEncoding,
+    "IntegerPacking": IntegerPackingEncoding,
+    "StringArray": StringArrayEncoding,
+}
+_encoding_classes_kinds = {
+    "ByteArrayEncoding": "ByteArray",
+    "FixedPointEncoding": "FixedPoint",
+    "IntervalQuantizationEncoding": "IntervalQuantization",
+    "RunLengthEncoding": "RunLength",
+    "DeltaEncoding": "Delta",
+    "IntegerPackingEncoding": "IntegerPacking",
+    "StringArrayEncoding": "StringArray",
+}
+
+
+def deserialize_encoding(content):
+    """
+    Create a :class:`Encoding` by deserializing the given *BinaryCIF* content.
+
+    Parameters
+    ----------
+    content : dict
+        The encoding represented as a *BinaryCIF* dictionary.
+
+    Returns
+    -------
+    encoding : Encoding
+        The deserialized encoding.
+    """
+    try:
+        encoding_class = _encoding_classes[content["kind"]]
+    except KeyError:
+        raise ValueError(
+            f"Unknown encoding kind '{content['kind']}'"
+        )
+    return encoding_class.deserialize(content)
+
+
+def create_uncompressed_encoding(array):
+    """
+    Create a simple encoding for the given array that does not compress the data.
+
+    Parameters
+    ----------
+    array : ndarray
+        The array to create the encoding for.
+
+    Returns
+    -------
+    encoding : list of Encoding
+        The encoding for the data.
+    """
+    if np.issubdtype(array.dtype, np.str_):
+        return [StringArrayEncoding()]
+    else:
+        return [ByteArrayEncoding()]
+
+
+def encode_stepwise(data, encoding):
+    """
+    Apply a list of encodings stepwise to the given data.
+
+    Parameters
+    ----------
+    data : ndarray
+        The data to be encoded.
+    encoding : list of Encoding
+        The encodings to be applied.
+
+    Returns
+    -------
+    encoded_data : ndarray or bytes
+        The encoded data.
+    """
+    for encoding in encoding:
+        data = encoding.encode(data)
+    return data
+
+
+def decode_stepwise(data, encoding):
+    """
+    Apply the inverse of a list of encodings stepwise to the given data.
+
+    Parameters
+    ----------
+    data : ndarray or bytes
+        The data to be decoded.
+    encoding : list of Encoding
+        The encodings to be applied.
+
+    Returns
+    -------
+    decoded_data : ndarray
+        The decoded data.
+    """
+    for enc in reversed(encoding):
+        data = enc.decode(data)
+    # ByteArrayEncoding may decode into a non-writable array,
+    # as it creates the ndarray cheaply from a buffer
+    if not data.flags.writeable:
+        # Make the resulting ndarray writable, by copying the underlying buffer
+        data = data.copy()
+    return data
+
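The two helpers above chain the individual encodings: encode_stepwise applies them front to back, decode_stepwise inverts them back to front. A minimal round-trip sketch (not part of encoding.pyx, and assuming the encoding classes are importable from biotite.structure.io.pdbx); the same encoding instances must be reused for decoding, since they record the parameters (origin, src_size, src_type) learned during the first encoding pass:

    import numpy as np
    from biotite.structure.io.pdbx import (
        ByteArrayEncoding, DeltaEncoding, RunLengthEncoding
    )

    data = np.array([1, 1, 2, 3, 5, 8])
    # A typical BinaryCIF column pipeline: deltas, then run lengths, then raw bytes
    pipeline = [DeltaEncoding(), RunLengthEncoding(), ByteArrayEncoding()]

    packed = data
    for enc in pipeline:              # equivalent to encode_stepwise(data, pipeline)
        packed = enc.encode(packed)   # the final step yields a bytes object

    restored = packed
    for enc in reversed(pipeline):    # equivalent to decode_stepwise(packed, pipeline)
        restored = enc.decode(restored)
    print(restored)                   # [1 1 2 3 5 8]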
+
+def _camel_to_snake_case(attribute_name):
+    return CAMEL_CASE_PATTERN.sub("_", attribute_name).lower()
+
+
+def _snake_to_camel_case(attribute_name):
+    attribute_name = "".join(
+        word.capitalize() for word in attribute_name.split("_")
+    )
+    return attribute_name[0].lower() + attribute_name[1:]
+
+
+def _safe_cast(array, dtype, allow_decimal_loss=False):
+    source_dtype = array.dtype
+    target_dtype = np.dtype(dtype)
+
+    if target_dtype == source_dtype:
+        return array
+
+    if np.issubdtype(target_dtype, np.integer):
+        if np.issubdtype(source_dtype, np.floating):
+            if not allow_decimal_loss:
+                raise ValueError("Cannot cast floating point to integer")
+            if not np.isfinite(array).all():
+                raise ValueError("Data contains non-finite values")
+        elif not np.issubdtype(source_dtype, np.integer):
+            # Neither float, nor integer -> cannot cast
+            raise ValueError(f"Cannot cast '{source_dtype}' to integer")
+        dtype_info = np.iinfo(target_dtype)
+        # Check if an integer underflow/overflow would occur during conversion
+        if np.max(array) > dtype_info.max or np.min(array) < dtype_info.min:
+            raise ValueError("Values do not fit into the given dtype")
+
+    return array.astype(target_dtype)