biotite 1.3.0__cp312-cp312-macosx_10_13_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. See the release advisory for more details.

Files changed (354)
  1. biotite/__init__.py +18 -0
  2. biotite/application/__init__.py +69 -0
  3. biotite/application/application.py +276 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +500 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +92 -0
  8. biotite/application/blast/webapp.py +428 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +223 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +159 -0
  13. biotite/application/localapp.py +342 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +116 -0
  16. biotite/application/msaapp.py +363 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +227 -0
  19. biotite/application/muscle/app5.py +163 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +447 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +199 -0
  24. biotite/application/util.py +77 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +310 -0
  27. biotite/application/viennarna/rnafold.py +254 -0
  28. biotite/application/viennarna/rnaplot.py +208 -0
  29. biotite/application/viennarna/util.py +77 -0
  30. biotite/application/webapp.py +76 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/afdb/__init__.py +12 -0
  34. biotite/database/afdb/download.py +191 -0
  35. biotite/database/entrez/__init__.py +15 -0
  36. biotite/database/entrez/check.py +60 -0
  37. biotite/database/entrez/dbnames.py +101 -0
  38. biotite/database/entrez/download.py +228 -0
  39. biotite/database/entrez/key.py +44 -0
  40. biotite/database/entrez/query.py +263 -0
  41. biotite/database/error.py +16 -0
  42. biotite/database/pubchem/__init__.py +21 -0
  43. biotite/database/pubchem/download.py +258 -0
  44. biotite/database/pubchem/error.py +30 -0
  45. biotite/database/pubchem/query.py +819 -0
  46. biotite/database/pubchem/throttle.py +98 -0
  47. biotite/database/rcsb/__init__.py +13 -0
  48. biotite/database/rcsb/download.py +160 -0
  49. biotite/database/rcsb/query.py +963 -0
  50. biotite/database/uniprot/__init__.py +13 -0
  51. biotite/database/uniprot/check.py +40 -0
  52. biotite/database/uniprot/download.py +126 -0
  53. biotite/database/uniprot/query.py +292 -0
  54. biotite/file.py +244 -0
  55. biotite/interface/__init__.py +19 -0
  56. biotite/interface/openmm/__init__.py +20 -0
  57. biotite/interface/openmm/state.py +93 -0
  58. biotite/interface/openmm/system.py +227 -0
  59. biotite/interface/pymol/__init__.py +201 -0
  60. biotite/interface/pymol/cgo.py +346 -0
  61. biotite/interface/pymol/convert.py +185 -0
  62. biotite/interface/pymol/display.py +267 -0
  63. biotite/interface/pymol/object.py +1226 -0
  64. biotite/interface/pymol/shapes.py +178 -0
  65. biotite/interface/pymol/startup.py +169 -0
  66. biotite/interface/rdkit/__init__.py +19 -0
  67. biotite/interface/rdkit/mol.py +490 -0
  68. biotite/interface/version.py +94 -0
  69. biotite/interface/warning.py +19 -0
  70. biotite/sequence/__init__.py +84 -0
  71. biotite/sequence/align/__init__.py +199 -0
  72. biotite/sequence/align/alignment.py +702 -0
  73. biotite/sequence/align/banded.cpython-312-darwin.so +0 -0
  74. biotite/sequence/align/banded.pyx +652 -0
  75. biotite/sequence/align/buckets.py +71 -0
  76. biotite/sequence/align/cigar.py +425 -0
  77. biotite/sequence/align/kmeralphabet.cpython-312-darwin.so +0 -0
  78. biotite/sequence/align/kmeralphabet.pyx +595 -0
  79. biotite/sequence/align/kmersimilarity.cpython-312-darwin.so +0 -0
  80. biotite/sequence/align/kmersimilarity.pyx +233 -0
  81. biotite/sequence/align/kmertable.cpython-312-darwin.so +0 -0
  82. biotite/sequence/align/kmertable.pyx +3411 -0
  83. biotite/sequence/align/localgapped.cpython-312-darwin.so +0 -0
  84. biotite/sequence/align/localgapped.pyx +892 -0
  85. biotite/sequence/align/localungapped.cpython-312-darwin.so +0 -0
  86. biotite/sequence/align/localungapped.pyx +279 -0
  87. biotite/sequence/align/matrix.py +631 -0
  88. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  89. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  93. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  94. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  95. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  96. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  97. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  98. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  99. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  100. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  101. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  102. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  103. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  104. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  105. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  106. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  107. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  108. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  109. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  110. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  111. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  112. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  113. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  114. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  115. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  116. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  117. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  118. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  119. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  120. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  121. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  122. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  154. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  155. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  156. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  157. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  158. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  159. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  160. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  161. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  162. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  163. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  164. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  165. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  166. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  167. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  168. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  169. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  170. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  171. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  172. biotite/sequence/align/matrix_data/PB.license +21 -0
  173. biotite/sequence/align/matrix_data/PB.mat +18 -0
  174. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  175. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  176. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  177. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  178. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  179. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  180. biotite/sequence/align/multiple.cpython-312-darwin.so +0 -0
  181. biotite/sequence/align/multiple.pyx +619 -0
  182. biotite/sequence/align/pairwise.cpython-312-darwin.so +0 -0
  183. biotite/sequence/align/pairwise.pyx +585 -0
  184. biotite/sequence/align/permutation.cpython-312-darwin.so +0 -0
  185. biotite/sequence/align/permutation.pyx +313 -0
  186. biotite/sequence/align/primes.txt +821 -0
  187. biotite/sequence/align/selector.cpython-312-darwin.so +0 -0
  188. biotite/sequence/align/selector.pyx +954 -0
  189. biotite/sequence/align/statistics.py +264 -0
  190. biotite/sequence/align/tracetable.cpython-312-darwin.so +0 -0
  191. biotite/sequence/align/tracetable.pxd +64 -0
  192. biotite/sequence/align/tracetable.pyx +370 -0
  193. biotite/sequence/alphabet.py +555 -0
  194. biotite/sequence/annotation.py +836 -0
  195. biotite/sequence/codec.cpython-312-darwin.so +0 -0
  196. biotite/sequence/codec.pyx +155 -0
  197. biotite/sequence/codon.py +476 -0
  198. biotite/sequence/codon_tables.txt +202 -0
  199. biotite/sequence/graphics/__init__.py +33 -0
  200. biotite/sequence/graphics/alignment.py +1101 -0
  201. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  202. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  203. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  204. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  205. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  206. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  207. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  208. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  209. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  210. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  211. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  212. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  213. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  214. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  215. biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
  216. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  217. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  218. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  219. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  220. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  221. biotite/sequence/graphics/colorschemes.py +170 -0
  222. biotite/sequence/graphics/dendrogram.py +231 -0
  223. biotite/sequence/graphics/features.py +544 -0
  224. biotite/sequence/graphics/logo.py +102 -0
  225. biotite/sequence/graphics/plasmid.py +712 -0
  226. biotite/sequence/io/__init__.py +12 -0
  227. biotite/sequence/io/fasta/__init__.py +22 -0
  228. biotite/sequence/io/fasta/convert.py +283 -0
  229. biotite/sequence/io/fasta/file.py +265 -0
  230. biotite/sequence/io/fastq/__init__.py +19 -0
  231. biotite/sequence/io/fastq/convert.py +117 -0
  232. biotite/sequence/io/fastq/file.py +507 -0
  233. biotite/sequence/io/genbank/__init__.py +17 -0
  234. biotite/sequence/io/genbank/annotation.py +269 -0
  235. biotite/sequence/io/genbank/file.py +573 -0
  236. biotite/sequence/io/genbank/metadata.py +336 -0
  237. biotite/sequence/io/genbank/sequence.py +173 -0
  238. biotite/sequence/io/general.py +201 -0
  239. biotite/sequence/io/gff/__init__.py +26 -0
  240. biotite/sequence/io/gff/convert.py +128 -0
  241. biotite/sequence/io/gff/file.py +449 -0
  242. biotite/sequence/phylo/__init__.py +36 -0
  243. biotite/sequence/phylo/nj.cpython-312-darwin.so +0 -0
  244. biotite/sequence/phylo/nj.pyx +221 -0
  245. biotite/sequence/phylo/tree.cpython-312-darwin.so +0 -0
  246. biotite/sequence/phylo/tree.pyx +1169 -0
  247. biotite/sequence/phylo/upgma.cpython-312-darwin.so +0 -0
  248. biotite/sequence/phylo/upgma.pyx +164 -0
  249. biotite/sequence/profile.py +561 -0
  250. biotite/sequence/search.py +117 -0
  251. biotite/sequence/seqtypes.py +720 -0
  252. biotite/sequence/sequence.py +373 -0
  253. biotite/setup_ccd.py +197 -0
  254. biotite/structure/__init__.py +135 -0
  255. biotite/structure/alphabet/__init__.py +25 -0
  256. biotite/structure/alphabet/encoder.py +332 -0
  257. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  258. biotite/structure/alphabet/i3d.py +109 -0
  259. biotite/structure/alphabet/layers.py +86 -0
  260. biotite/structure/alphabet/pb.license +21 -0
  261. biotite/structure/alphabet/pb.py +170 -0
  262. biotite/structure/alphabet/unkerasify.py +128 -0
  263. biotite/structure/atoms.py +1562 -0
  264. biotite/structure/basepairs.py +1403 -0
  265. biotite/structure/bonds.cpython-312-darwin.so +0 -0
  266. biotite/structure/bonds.pyx +1975 -0
  267. biotite/structure/box.py +724 -0
  268. biotite/structure/celllist.cpython-312-darwin.so +0 -0
  269. biotite/structure/celllist.pyx +864 -0
  270. biotite/structure/chains.py +276 -0
  271. biotite/structure/charges.cpython-312-darwin.so +0 -0
  272. biotite/structure/charges.pyx +520 -0
  273. biotite/structure/compare.py +681 -0
  274. biotite/structure/density.py +109 -0
  275. biotite/structure/dotbracket.py +213 -0
  276. biotite/structure/error.py +39 -0
  277. biotite/structure/filter.py +590 -0
  278. biotite/structure/geometry.py +655 -0
  279. biotite/structure/graphics/__init__.py +13 -0
  280. biotite/structure/graphics/atoms.py +243 -0
  281. biotite/structure/graphics/rna.py +298 -0
  282. biotite/structure/hbond.py +425 -0
  283. biotite/structure/info/__init__.py +24 -0
  284. biotite/structure/info/atom_masses.json +121 -0
  285. biotite/structure/info/atoms.py +90 -0
  286. biotite/structure/info/bonds.py +149 -0
  287. biotite/structure/info/ccd.py +200 -0
  288. biotite/structure/info/components.bcif +0 -0
  289. biotite/structure/info/groups.py +128 -0
  290. biotite/structure/info/masses.py +121 -0
  291. biotite/structure/info/misc.py +137 -0
  292. biotite/structure/info/radii.py +267 -0
  293. biotite/structure/info/standardize.py +185 -0
  294. biotite/structure/integrity.py +213 -0
  295. biotite/structure/io/__init__.py +29 -0
  296. biotite/structure/io/dcd/__init__.py +13 -0
  297. biotite/structure/io/dcd/file.py +67 -0
  298. biotite/structure/io/general.py +243 -0
  299. biotite/structure/io/gro/__init__.py +14 -0
  300. biotite/structure/io/gro/file.py +343 -0
  301. biotite/structure/io/mol/__init__.py +20 -0
  302. biotite/structure/io/mol/convert.py +112 -0
  303. biotite/structure/io/mol/ctab.py +420 -0
  304. biotite/structure/io/mol/header.py +120 -0
  305. biotite/structure/io/mol/mol.py +149 -0
  306. biotite/structure/io/mol/sdf.py +940 -0
  307. biotite/structure/io/netcdf/__init__.py +13 -0
  308. biotite/structure/io/netcdf/file.py +64 -0
  309. biotite/structure/io/pdb/__init__.py +20 -0
  310. biotite/structure/io/pdb/convert.py +388 -0
  311. biotite/structure/io/pdb/file.py +1356 -0
  312. biotite/structure/io/pdb/hybrid36.cpython-312-darwin.so +0 -0
  313. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  314. biotite/structure/io/pdbqt/__init__.py +15 -0
  315. biotite/structure/io/pdbqt/convert.py +113 -0
  316. biotite/structure/io/pdbqt/file.py +688 -0
  317. biotite/structure/io/pdbx/__init__.py +23 -0
  318. biotite/structure/io/pdbx/bcif.py +671 -0
  319. biotite/structure/io/pdbx/cif.py +1088 -0
  320. biotite/structure/io/pdbx/component.py +251 -0
  321. biotite/structure/io/pdbx/compress.py +358 -0
  322. biotite/structure/io/pdbx/convert.py +2097 -0
  323. biotite/structure/io/pdbx/encoding.cpython-312-darwin.so +0 -0
  324. biotite/structure/io/pdbx/encoding.pyx +1047 -0
  325. biotite/structure/io/trajfile.py +696 -0
  326. biotite/structure/io/trr/__init__.py +13 -0
  327. biotite/structure/io/trr/file.py +43 -0
  328. biotite/structure/io/util.py +38 -0
  329. biotite/structure/io/xtc/__init__.py +13 -0
  330. biotite/structure/io/xtc/file.py +43 -0
  331. biotite/structure/mechanics.py +72 -0
  332. biotite/structure/molecules.py +337 -0
  333. biotite/structure/pseudoknots.py +622 -0
  334. biotite/structure/rdf.py +245 -0
  335. biotite/structure/repair.py +302 -0
  336. biotite/structure/residues.py +544 -0
  337. biotite/structure/rings.py +335 -0
  338. biotite/structure/sasa.cpython-312-darwin.so +0 -0
  339. biotite/structure/sasa.pyx +322 -0
  340. biotite/structure/segments.py +292 -0
  341. biotite/structure/sequence.py +110 -0
  342. biotite/structure/spacegroups.json +1567 -0
  343. biotite/structure/spacegroups.license +26 -0
  344. biotite/structure/sse.py +306 -0
  345. biotite/structure/superimpose.py +511 -0
  346. biotite/structure/tm.py +581 -0
  347. biotite/structure/transform.py +736 -0
  348. biotite/structure/util.py +168 -0
  349. biotite/version.py +21 -0
  350. biotite/visualize.py +375 -0
  351. biotite-1.3.0.dist-info/METADATA +162 -0
  352. biotite-1.3.0.dist-info/RECORD +354 -0
  353. biotite-1.3.0.dist-info/WHEEL +6 -0
  354. biotite-1.3.0.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,1047 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ """
6
+ This module contains data encodings for BinaryCIF files.
7
+ """
8
+
9
+ __name__ = "biotite.structure.io.pdbx"
10
+ __author__ = "Patrick Kunzmann"
11
+ __all__ = ["ByteArrayEncoding", "FixedPointEncoding",
12
+ "IntervalQuantizationEncoding", "RunLengthEncoding",
13
+ "DeltaEncoding", "IntegerPackingEncoding", "StringArrayEncoding",
14
+ "TypeCode"]
15
+
16
+ cimport cython
17
+ cimport numpy as np
18
+
19
+ from dataclasses import dataclass
20
+ from abc import ABCMeta, abstractmethod
21
+ from numbers import Integral
22
+ from enum import IntEnum
23
+ import re
24
+ import numpy as np
25
+ from .component import _Component
26
+ from ....file import InvalidFileError
27
+
28
+ ctypedef np.int8_t int8
29
+ ctypedef np.int16_t int16
30
+ ctypedef np.int32_t int32
31
+ ctypedef np.uint8_t uint8
32
+ ctypedef np.uint16_t uint16
33
+ ctypedef np.uint32_t uint32
34
+ ctypedef np.float32_t float32
35
+ ctypedef np.float64_t float64
36
+
37
+ ctypedef fused Integer:
38
+ uint8
39
+ uint16
40
+ uint32
41
+ int8
42
+ int16
43
+ int32
44
+
45
+ # Used to create cartesian product of type combinations
46
+ # in run-length encoding
47
+ ctypedef fused OutputInteger:
48
+ uint8
49
+ uint16
50
+ uint32
51
+ int8
52
+ int16
53
+ int32
54
+
55
+ ctypedef fused Float:
56
+ float32
57
+ float64
58
+
59
+
60
+ CAMEL_CASE_PATTERN = re.compile(r"(?<!^)(?=[A-Z])")
61
+
62
+
63
class TypeCode(IntEnum):
    """
    This enum type represents integers that represent data types in
    *BinaryCIF*.
    """
    INT8 = 1
    INT16 = 2
    INT32 = 3
    UINT8 = 4
    UINT16 = 5
    UINT32 = 6
    FLOAT32 = 32
    FLOAT64 = 33

    @staticmethod
    def from_dtype(dtype):
        """
        Convert a *NumPy* dtype to a *BinaryCIF* type code.

        Parameters
        ----------
        dtype : dtype or int or TypeCode
            The data type to be converted.
            If already a type code, it is simply returned.

        Returns
        -------
        type_code : TypeCode
            The corresponding type code.
        """
        # Integers (including existing `TypeCode` instances)
        # are interpreted as type codes directly
        if isinstance(dtype, Integral):
            return TypeCode(dtype)
        dtype = np.dtype(dtype)
        # Find the closest dtype supported by the format
        if np.issubdtype(dtype, np.integer):
            # The format has no 64-bit integers -> narrow to 32-bit
            if dtype == np.int64:
                narrowed = np.int32
            elif dtype == np.uint64:
                narrowed = np.uint32
            else:
                narrowed = dtype
        elif np.issubdtype(dtype, np.floating):
            # Half precision is not supported -> widen to single
            if dtype == np.float16:
                narrowed = np.float32
            # float128 is not available on all architectures
            elif hasattr(np, "float128") and dtype == np.float128:
                narrowed = np.float64
            else:
                narrowed = dtype
        else:
            raise ValueError(
                f"dtype '{dtype}' is not supported by BinaryCIF"
            )
        # The format stores all multi-byte types in little-endian order
        lookup_key = np.dtype(narrowed).newbyteorder("<").str
        return _DTYPE_TO_TYPE_CODE[lookup_key]

    def to_dtype(self):
        """
        Convert this type code to a *NumPy* dtype.

        Returns
        -------
        dtype : dtype
            The corresponding data type.
        """
        return _TYPE_CODE_TO_DTYPE[self]


# Converts BCIF integers representing the type to an actual NumPy dtype
_TYPE_CODE_TO_DTYPE = {
    # All data types are little-endian
    TypeCode.INT8: "|i1",
    TypeCode.INT16: "<i2",
    TypeCode.INT32: "<i4",
    TypeCode.UINT8: "|u1",
    TypeCode.UINT16: "<u2",
    TypeCode.UINT32: "<u4",
    TypeCode.FLOAT32: "<f4",
    TypeCode.FLOAT64: "<f8",
}
_DTYPE_TO_TYPE_CODE = {val: key for key, val in _TYPE_CODE_TO_DTYPE.items()}
147
+
148
+
149
class Encoding(_Component, metaclass=ABCMeta):
    """
    Abstract base class for *BinaryCIF* data encodings.

    Notes
    -----
    The encoding classes do not omit bound checks for decoding,
    since the file content may be invalid/malicious.
    """

    @classmethod
    def deserialize(cls, content):
        """
        Create an encoding object from its serialized *BinaryCIF*
        representation.

        Parameters
        ----------
        content : dict
            The serialized encoding parameters with camel-case keys.

        Returns
        -------
        encoding : Encoding
            The deserialized encoding.

        Raises
        ------
        InvalidFileError
            If the given parameters do not fit this encoding class.
        """
        params = {
            _camel_to_snake_case(param): value
            for param, value in content.items()
        }
        # 'kind' is no parameter, but indicates the class itself;
        # tolerate malformed files where it is absent
        params.pop("kind", None)
        try:
            encoding = cls(**params)
        except TypeError as e:
            # A `TypeError` from the constructor call indicates
            # missing or unexpected parameters
            raise InvalidFileError(
                f"Missing encoding parameters for {cls.__name__}"
            ) from e
        except ValueError as e:
            # A `ValueError` (e.g. from `__post_init__()`) indicates
            # parameter values that are not allowed
            raise InvalidFileError(
                f"Invalid encoding parameters for {cls.__name__}"
            ) from e
        return encoding

    def serialize(self):
        """
        Convert this encoding into its serializable *BinaryCIF*
        representation.

        Returns
        -------
        serialized : dict
            The encoding parameters with camel-case keys, plus the
            'kind' identifying this encoding class.

        Raises
        ------
        ValueError
            If a parameter is still ``None``, i.e. it was neither given
            explicitly nor determined in a first encoding pass.
        """
        for param in self.__annotations__:
            if getattr(self, param) is None:
                raise ValueError(
                    f"'{param}' must be explicitly given or needs to be "
                    "determined from first encoding pass, before it is "
                    "serialized"
                )

        serialized = {
            _snake_to_camel_case(param): getattr(self, param)
            for param in self.__annotations__
        }
        serialized["kind"] = _encoding_classes_kinds[type(self).__name__]
        return serialized

    @abstractmethod
    def encode(self, data):
        """
        Apply this encoding to the given data.

        Parameters
        ----------
        data : ndarray
            The data to be encoded.

        Returns
        -------
        encoded_data : ndarray or bytes
            The encoded data.
        """
        raise NotImplementedError()

    @abstractmethod
    def decode(self, data):
        """
        Apply the inverse of this encoding to the given data.

        Parameters
        ----------
        data : ndarray or bytes
            The data to be decoded.

        Returns
        -------
        decoded_data : ndarray
            The decoded data.
        """
        # Important: Do not omit bound checks for decoding,
        # since the file content may be invalid/malicious.
        raise NotImplementedError()

    def __str__(self):
        # Restore original behavior, as `__str__()` implementation of
        # `_Component` may require serialization, which is not possible
        # for some encodings prior to the first encoding pass
        return object.__str__(self)
238
+
239
+
240
@dataclass
class ByteArrayEncoding(Encoding):
    r"""
    Encoding that converts an array into its raw bytes.

    Parameters
    ----------
    type : dtype or TypeCode, optional
        The data type of the array to be encoded.
        May be given as NumPy dtype or as *BinaryCIF* type code.
        If not given, it is inferred from the data on the first call
        of :meth:`encode()`.

    Attributes
    ----------
    type : TypeCode

    Examples
    --------

    >>> data = np.arange(3)
    >>> print(data)
    [0 1 2]
    >>> print(ByteArrayEncoding().encode(data))
    b'\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00'
    """
    type: ... = None

    def __post_init__(self):
        if self.type is None:
            return
        # Normalize a user-given dtype into a `TypeCode`
        self.type = TypeCode.from_dtype(self.type)

    def encode(self, data):
        if self.type is None:
            # First encoding pass determines the type from the data
            self.type = TypeCode.from_dtype(data.dtype)
        target_dtype = self.type.to_dtype()
        return _safe_cast(data, target_dtype).tobytes()

    def decode(self, data):
        # `data` is a raw byte string in this case
        return np.frombuffer(data, dtype=self.type.to_dtype())
280
+
281
+
282
@dataclass
class FixedPointEncoding(Encoding):
    """
    Lossy encoding that multiplies floating point values with a given
    factor and subsequently rounds them to the nearest integer.

    Parameters
    ----------
    factor : float
        The factor by which the data is multiplied before rounding.
    src_type : dtype or TypeCode, optional
        The data type of the array to be encoded.
        May be given as NumPy dtype or as *BinaryCIF* type code.
        Must be a floating point type.
        If not given, it is inferred from the data on the first call
        of :meth:`encode()`.

    Attributes
    ----------
    factor : float
    src_type : TypeCode

    Examples
    --------

    >>> data = np.array([9.87, 6.543])
    >>> print(data)
    [9.870 6.543]
    >>> print(FixedPointEncoding(factor=100).encode(data))
    [987 654]
    """
    factor: ...
    src_type: ... = None

    def __post_init__(self):
        if self.src_type is not None:
            self.src_type = self._checked_type_code(self.src_type)

    def encode(self, data):
        if self.src_type is None:
            # First encoding pass determines the type from the data
            self.src_type = self._checked_type_code(data.dtype)
        # Round to avoid wrong values due to floating point inaccuracies
        scaled = np.round(data * self.factor)
        return _safe_cast(scaled, np.int32, allow_decimal_loss=True)

    def decode(self, data):
        return (data / self.factor).astype(
            dtype=self.src_type.to_dtype(), copy=False
        )

    @staticmethod
    def _checked_type_code(dtype):
        # Convert to a type code and reject non-float types,
        # as fixed-point encoding is only defined for them
        type_code = TypeCode.from_dtype(dtype)
        if type_code not in (TypeCode.FLOAT32, TypeCode.FLOAT64):
            raise ValueError(
                "Only floating point types are supported"
            )
        return type_code
341
+
342
+
343
@dataclass
class IntervalQuantizationEncoding(Encoding):
    """
    Lossy encoding that sorts floating point values into bins.
    Each bin is represented by an integer.

    Parameters
    ----------
    min, max : float
        The minimum and maximum value the bins comprise.
    num_steps : int
        The number of bins.
    src_type : dtype or TypeCode, optional
        The data type of the array to be encoded.
        May be given as NumPy dtype or as *BinaryCIF* type code.
        Must be a floating point type.
        If not given, it is inferred from the data on the first call
        of :meth:`encode()`.

    Attributes
    ----------
    min, max : float
    num_steps : int
    src_type : TypeCode

    Examples
    --------

    >>> data = np.linspace(11, 12, 6)
    >>> print(data)
    [11.0 11.2 11.4 11.6 11.8 12.0]
    >>> # Use 0.5 as step size
    >>> encoding = IntervalQuantizationEncoding(min=10, max=20, num_steps=21)
    >>> # The encoding is lossy, as different values are mapped to the same bin
    >>> encoded = encoding.encode(data)
    >>> print(encoded)
    [2 3 3 4 4 4]
    >>> decoded = encoding.decode(encoded)
    >>> print(decoded)
    [11.0 11.5 11.5 12.0 12.0 12.0]
    """
    min: ...
    max: ...
    num_steps: ...
    src_type: ... = None

    def __post_init__(self):
        if self.src_type is None:
            return
        self.src_type = TypeCode.from_dtype(self.src_type)

    def encode(self, data):
        if self.src_type is None:
            # First encoding pass determines the type from the data
            self.src_type = TypeCode.from_dtype(data.dtype)

        # The bin edges are evenly spaced over [min, max]
        bin_edges = np.linspace(
            self.min, self.max, self.num_steps, dtype=data.dtype
        )
        bin_indices = np.searchsorted(bin_edges, data, side="left")
        return _safe_cast(bin_indices, np.int32)

    def decode(self, data):
        # Keep this exact operation order to reproduce the original
        # floating point results bit-for-bit
        reconstructed = data * (self.max - self.min) / (self.num_steps - 1)
        reconstructed = reconstructed.astype(
            self.src_type.to_dtype(), copy=False
        )
        reconstructed += self.min
        return reconstructed
409
+
410
+
411
@dataclass
class RunLengthEncoding(Encoding):
    """
    Encoding that compresses runs of equal values into pairs of
    (value, run length).

    Parameters
    ----------
    src_size : int, optional
        The size of the array to be encoded.
        If omitted, the size is determined from the data the
        first time :meth:`encode()` is called.
    src_type : dtype or TypeCode, optional
        The data type of the array to be encoded.
        Either a NumPy dtype or a *BinaryCIF* type code is accepted.
        The dtype must be a integer type.
        If omitted, the data type is taken from the data the
        first time :meth:`encode()` is called.

    Attributes
    ----------
    src_size : int
    src_type : TypeCode

    Examples
    --------

    >>> data = np.array([1, 1, 1, 5, 3, 3])
    >>> print(data)
    [1 1 1 5 3 3]
    >>> encoded = RunLengthEncoding().encode(data)
    >>> print(encoded)
    [1 3 5 1 3 2]
    >>> # Emphasize the pairs
    >>> print(encoded.reshape(-1, 2))
    [[1 3]
     [5 1]
     [3 2]]
    """
    src_size: ... = None
    src_type: ... = None

    def __post_init__(self):
        # Normalize a user-supplied NumPy dtype to a `TypeCode`
        if self.src_type is not None:
            self.src_type = TypeCode.from_dtype(self.src_type)

    def encode(self, data):
        # If not given in constructor, it is determined from the data
        if self.src_type is None:
            self.src_type = TypeCode.from_dtype(data.dtype)
        if self.src_size is None:
            self.src_size = data.shape[0]
        elif self.src_size != data.shape[0]:
            raise IndexError(
                "Given source size does not match actual data size"
            )
        return self._encode(_safe_cast(data, self.src_type.to_dtype()))

    def decode(self, data):
        # The empty array merely carries the target dtype for the
        # fused-type dispatch of `_decode()`
        return self._decode(
            data, np.empty(0, dtype=self.src_type.to_dtype())
        )

    def _encode(self, const Integer[:] data):
        """
        Compress `data` into a flat (value, run length) pair array.
        """
        # NOTE(review): assumes `data` is non-empty -- `data[0]` below
        # would fail for a zero-length input; confirm callers guarantee this
        # Pessimistic allocation of output array
        # -> Run length is 1 for every element
        cdef int32[:] output = np.zeros(data.shape[0] * 2, dtype=np.int32)
        cdef int i=0, j=0
        cdef int val = data[0]
        cdef int run_length = 0
        cdef int curr_val
        for i in range(data.shape[0]):
            curr_val = data[i]
            if curr_val == val:
                run_length += 1
            else:
                # New element -> Write element with run-length
                output[j] = val
                output[j+1] = run_length
                j += 2
                val = curr_val
                run_length = 1
        # Write last element
        output[j] = val
        output[j+1] = run_length
        j += 2
        # Trim to correct size
        return np.asarray(output)[:j]

    def _decode(self, const Integer[:] data, OutputInteger[:] output_type):
        """
        Expand (value, run length) pairs back into the original array.
        `output_type` is merely a typed placeholder to allow for static
        typing of output.
        """
        # Pairs require an even number of elements
        if data.shape[0] % 2 != 0:
            raise ValueError("Invalid run-length encoded data")

        cdef int length = 0
        cdef int i, j
        cdef int value, repeat

        if self.src_size is None:
            # Determine length of output array by summing run lengths
            for i in range(1, data.shape[0], 2):
                length += data[i]
        else:
            length = self.src_size

        cdef OutputInteger[:] output = np.zeros(
            length, dtype=np.asarray(output_type).dtype
        )
        # Fill output array: each pair (value, repeat) becomes a run
        j = 0
        for i in range(0, data.shape[0], 2):
            value = data[i]
            repeat = data[i+1]
            output[j : j+repeat] = value
            j += repeat
        return np.asarray(output)
530
+
531
+
532
@dataclass
class DeltaEncoding(Encoding):
    """
    Encoding that stores an array of integers as the differences
    between consecutive elements.

    Parameters
    ----------
    src_type : dtype or TypeCode, optional
        The data type of the array to be encoded.
        Either a NumPy dtype or a *BinaryCIF* type code is accepted.
        The dtype must be a integer type.
        If omitted, the data type is taken from the data the
        first time :meth:`encode()` is called.
    origin : int, optional
        The starting value from which the differences are calculated.
        If omitted, the value is taken from the first array element the
        first time :meth:`encode()` is called.

    Attributes
    ----------
    src_type : TypeCode
    origin : int

    Examples
    --------

    >>> data = np.array([1, 1, 2, 3, 5, 8])
    >>> encoding = DeltaEncoding()
    >>> print(encoding.encode(data))
    [0 0 1 1 2 3]
    >>> print(encoding.origin)
    1
    """
    src_type: ... = None
    origin: ... = None

    def __post_init__(self):
        # A dtype given by the user is normalized to a `TypeCode`
        if self.src_type is not None:
            self.src_type = TypeCode.from_dtype(self.src_type)

    def encode(self, data):
        # Parameters omitted in the constructor are inferred from the
        # first array that gets encoded
        if self.src_type is None:
            self.src_type = TypeCode.from_dtype(data.dtype)
        if self.origin is None:
            self.origin = data[0]

        # `np.diff()` keeps the dtype of its input, so an unsigned
        # input dtype could underflow on negative differences
        # -> operate on an int64 view/copy instead
        shifted = data.astype(np.int64, copy=False) - self.origin
        deltas = np.diff(shifted, prepend=0)
        return _safe_cast(deltas, np.int32)

    def decode(self, data):
        # Accumulating the differences reverses the delta encoding;
        # finally shift back by the origin
        decoded = np.cumsum(data, dtype=self.src_type.to_dtype())
        decoded += self.origin
        return decoded
593
+
594
+
595
@dataclass
class IntegerPackingEncoding(Encoding):
    """
    Encoding that compresses an array of 32-bit integers into an array
    of smaller sized integers.

    If a value does not fit into smaller integer type,
    the integer is represented by a sum of consecutive elements
    in the compressed array.

    Parameters
    ----------
    byte_count : int
        The number of bytes the packed integers should occupy.
        Supported values are 1 and 2 for 8-bit and 16-bit integers,
        respectively.
    src_size : int, optional
        The size of the array to be encoded.
        If omitted, the size is determined from the data the
        first time :meth:`encode()` is called.
    is_unsigned : bool, optional
        Whether the values should be packed into signed or unsigned
        integers.
        If omitted, first time :meth:`encode()` is called, determines whether
        the values fit into unsigned integers.

    Attributes
    ----------
    byte_count : int
    src_size : int
    is_unsigned : bool

    Examples
    --------

    >>> data = np.array([1, 2, -3, 128])
    >>> print(data)
    [  1   2  -3 128]
    >>> print(IntegerPackingEncoding(byte_count=1).encode(data))
    [  1   2  -3 127   1]
    """
    byte_count: ...
    src_size: ... = None
    is_unsigned: ... = None

    def encode(self, data):
        # If not given in constructor, parameters are inferred from data
        if self.src_size is None:
            self.src_size = len(data)
        elif self.src_size != len(data):
            raise IndexError(
                "Given source size does not match actual data size"
            )
        if self.is_unsigned is None:
            # Only positive values -> use unsigned integers
            self.is_unsigned = data.min().item() >= 0

        data = _safe_cast(data, np.int32)
        # The empty array merely carries the packed dtype for the
        # fused-type dispatch of `_encode()`
        return self._encode(
            data, np.empty(0, dtype=self._determine_packed_dtype())
        )

    def decode(self, const Integer[:] data):
        """
        Unpack by summing runs of min/max sentinel values with the
        following terminating element.
        """
        cdef int i, j
        cdef int min_val, max_val
        cdef int packed_val, unpacked_val
        bounds = self._get_bounds(data)
        min_val = bounds[0]
        max_val = bounds[1]
        # For unsigned types the lower bound is 0, which is a regular
        # value and must not act as continuation marker
        # -> Set lower bound to value that is never reached
        if min_val == 0:
            min_val = -1

        cdef int32[:] output = np.zeros(self.src_size, dtype=np.int32)
        j = 0
        unpacked_val = 0
        for i in range(data.shape[0]):
            packed_val = data[i]
            if packed_val == max_val or packed_val == min_val:
                # Saturated element -> the value continues in the
                # next packed element
                unpacked_val += packed_val
            else:
                # Terminating element -> flush the accumulated value
                unpacked_val += packed_val
                output[j] = unpacked_val
                unpacked_val = 0
                j += 1
        # `output` was already allocated with exactly `src_size`
        # elements, so no trimming is necessary
        return np.asarray(output)

    def _determine_packed_dtype(self):
        # Map (byte_count, is_unsigned) to the corresponding NumPy dtype
        if self.byte_count == 1:
            if self.is_unsigned:
                return np.uint8
            else:
                return np.int8
        elif self.byte_count == 2:
            if self.is_unsigned:
                return np.uint16
            else:
                return np.int16
        else:
            raise ValueError("Unsupported byte count")

    @cython.cdivision(True)
    def _encode(self, const Integer[:] data, OutputInteger[:] output_type):
        """
        Pack each value into one or more elements of the smaller dtype.
        `output_type` is merely a typed placeholder to allow for static
        typing of output.
        """
        cdef int i=0, j=0

        packed_type = np.asarray(output_type).dtype
        cdef int min_val = np.iinfo(packed_type).min
        cdef int max_val = np.iinfo(packed_type).max

        # Get length of output array
        # by summing up required length of each element
        cdef int number
        cdef long length = 0
        for i in range(data.shape[0]):
            number = data[i]
            if number < 0:
                if min_val == 0:
                    raise ValueError(
                        "Cannot pack negative numbers into unsigned type"
                    )
                # The required packed length for an element is the
                # number of times min_val/max_val need to be repeated
                length += number // min_val + 1
            elif number > 0:
                length += number // max_val + 1
            else:
                # number = 0
                length += 1

        # Fill output
        cdef OutputInteger[:] output = np.zeros(length, dtype=packed_type)
        cdef int remainder
        j = 0
        for i in range(data.shape[0]):
            remainder = data[i]
            if remainder < 0:
                if min_val == 0:
                    raise ValueError(
                        "Cannot pack negative numbers into unsigned type"
                    )
                # Emit saturated `min_val` elements until the rest fits
                while remainder <= min_val:
                    remainder -= min_val
                    output[j] = min_val
                    j += 1
            elif remainder > 0:
                # Emit saturated `max_val` elements until the rest fits
                while remainder >= max_val:
                    remainder -= max_val
                    output[j] = max_val
                    j += 1
            # Terminating (non-saturated) element
            output[j] = remainder
            j += 1
        return np.asarray(output)

    @staticmethod
    def _get_bounds(const Integer[:] data):
        # Resolve the compile-time fused integer type of `data` to the
        # min/max bounds of the corresponding NumPy dtype
        if Integer is int8:
            info = np.iinfo(np.int8)
        elif Integer is int16:
            info = np.iinfo(np.int16)
        elif Integer is int32:
            info = np.iinfo(np.int32)
        elif Integer is uint8:
            info = np.iinfo(np.uint8)
        elif Integer is uint16:
            info = np.iinfo(np.uint16)
        elif Integer is uint32:
            info = np.iinfo(np.uint32)
        else:
            raise ValueError("Unsupported integer type")
        return info.min, info.max
770
+
771
+
772
@dataclass
class StringArrayEncoding(Encoding):
    """
    Encoding that compresses an array of strings into an array of
    indices that point to the unique strings in that array.

    The unique strings themselves are stored as part of the
    :class:`StringArrayEncoding` as concatenated string.
    The start index of each unique string in the concatenated string
    is stored in an *offset* array.

    Parameters
    ----------
    strings : ndarray, optional
        The unique strings that are used for encoding.
        If omitted, the unique strings are determined from the data the
        first time :meth:`encode()` is called.
    data_encoding : list of Encoding, optional
        The encodings that are applied to the index array.
        If omitted, the array is directly encoded into bytes without
        further compression.
    offset_encoding : list of Encoding, optional
        The encodings that are applied to the offset array.
        If omitted, the array is directly encoded into bytes without
        further compression.

    Attributes
    ----------
    strings : ndarray
    data_encoding : list of Encoding
    offset_encoding : list of Encoding

    Examples
    --------

    >>> data = np.array(["apple", "banana", "cherry", "apple", "banana", "apple"])
    >>> print(data)
    ['apple' 'banana' 'cherry' 'apple' 'banana' 'apple']
    >>> # By default the indices would directly be encoded into bytes
    >>> # However, the indices should be printed here -> data_encoding=[]
    >>> encoding = StringArrayEncoding(data_encoding=[])
    >>> encoded = encoding.encode(data)
    >>> print(encoding.strings)
    ['apple' 'banana' 'cherry']
    >>> print(encoded)
    [0 1 2 0 1 0]
    """

    strings: ... = None
    data_encoding: ... = None
    offset_encoding: ... = None

    # A hand-written `__init__()` replaces the dataclass-generated one,
    # in order to fill in the default `ByteArrayEncoding` lists when the
    # encodings are omitted
    def __init__(self, strings=None, data_encoding=None, offset_encoding=None):
        self.strings = strings
        if data_encoding is None:
            data_encoding = [ByteArrayEncoding(TypeCode.INT32)]
        self.data_encoding = data_encoding
        if offset_encoding is None:
            offset_encoding = [ByteArrayEncoding(TypeCode.INT32)]
        self.offset_encoding = offset_encoding

    @staticmethod
    def deserialize(content):
        # Reconstruct the nested encodings first, as they are needed
        # to decode the offset array below
        data_encoding = [
            deserialize_encoding(e) for e in content["dataEncoding"]
        ]
        offset_encoding = [
            deserialize_encoding(e) for e in content["offsetEncoding"]
        ]
        cdef str concatenated_strings = content["stringData"]
        cdef np.ndarray offsets = decode_stepwise(
            content["offsets"], offset_encoding
        )

        # Split the concatenated string back into the unique strings
        strings = np.array([
            concatenated_strings[offsets[i]:offsets[i+1]]
            # The final offset is the exclusive stop index
            for i in range(len(offsets)-1)
        ], dtype="U")

        return StringArrayEncoding(strings, data_encoding, offset_encoding)

    def serialize(self):
        if self.strings is None:
            raise ValueError(
                "'strings' must be explicitly given or needs to be "
                "determined from first encoding pass, before it is serialized"
            )

        string_data = "".join(self.strings)
        # Offsets mark the start of each unique string within
        # `string_data`, plus a final exclusive stop index
        offsets = np.cumsum([0] + [len(s) for s in self.strings])

        return {
            "kind": "StringArray",
            "dataEncoding": [e.serialize() for e in self.data_encoding],
            "stringData": string_data,
            "offsets": encode_stepwise(offsets, self.offset_encoding),
            "offsetEncoding": [e.serialize() for e in self.offset_encoding],
        }

    def encode(self, data):
        if not np.issubdtype(data.dtype, np.str_):
            raise TypeError("Data must be of string type")

        if self.strings is None:
            # 'unique()' already sorts the strings, but this is not necessarily
            # desired, as this makes efficient encoding of the indices more difficult
            # -> Bring into the original order
            _, unique_indices = np.unique(data, return_index=True)
            self.strings = data[np.sort(unique_indices)]
            check_present = False
        else:
            check_present = True

        # Map each element to the index of its string in `self.strings`:
        # search in the sorted unique strings, then translate the
        # sorted positions back to positions in the original order
        string_order = _safe_cast(np.argsort(self.strings), np.int32)
        sorted_strings = self.strings[string_order]
        sorted_indices = np.searchsorted(sorted_strings, data)
        indices = string_order[sorted_indices]
        # `searchsorted` gives an insertion point even for absent
        # strings, so verify the round trip when `strings` was user-given
        if check_present and not np.all(self.strings[indices] == data):
            raise ValueError("Data contains strings not present in 'strings'")
        return encode_stepwise(indices, self.data_encoding)

    def decode(self, data):
        # Decode the index array, then map indices back to strings
        indices = decode_stepwise(data, self.data_encoding)
        return self.strings[indices]

    def __eq__(self, other):
        # `strings` is an ndarray, hence the fields are compared
        # manually instead of relying on a generated `__eq__`
        if not isinstance(other, type(self)):
            return False
        if not np.array_equal(self.strings, other.strings):
            return False
        if self.data_encoding != other.data_encoding:
            return False
        if self.offset_encoding != other.offset_encoding:
            return False
        return True
908
+
909
+
910
# Mapping from the "kind" field of a serialized *BinaryCIF* encoding
# to the `Encoding` subclass used for deserialization
_encoding_classes = {
    "ByteArray": ByteArrayEncoding,
    "FixedPoint": FixedPointEncoding,
    "IntervalQuantization": IntervalQuantizationEncoding,
    "RunLength": RunLengthEncoding,
    "Delta": DeltaEncoding,
    "IntegerPacking": IntegerPackingEncoding,
    "StringArray": StringArrayEncoding,
}
# Reverse direction: mapping from `Encoding` subclass name to the
# "kind" string used in serialized *BinaryCIF* encodings
_encoding_classes_kinds = {
    "ByteArrayEncoding": "ByteArray",
    "FixedPointEncoding": "FixedPoint",
    "IntervalQuantizationEncoding": "IntervalQuantization",
    "RunLengthEncoding": "RunLength",
    "DeltaEncoding": "Delta",
    "IntegerPackingEncoding": "IntegerPacking",
    "StringArrayEncoding": "StringArray",
}
928
+
929
+
930
def deserialize_encoding(content):
    """
    Create a :class:`Encoding` by deserializing the given *BinaryCIF* content.

    Parameters
    ----------
    content : dict
        The encoding represented as *BinaryCIF* dictionary.

    Returns
    -------
    encoding : Encoding
        The deserialized encoding.

    Raises
    ------
    ValueError
        If the encoding kind is not supported.
    """
    # Look up the kind outside the `try` block:
    # otherwise a missing "kind" key would raise a second KeyError
    # inside the exception handler below
    kind = content["kind"]
    try:
        encoding_class = _encoding_classes[kind]
    except KeyError:
        # The KeyError is an implementation detail; the unknown kind
        # is the actual error -> suppress exception chaining
        raise ValueError(f"Unknown encoding kind '{kind}'") from None
    return encoding_class.deserialize(content)
951
+
952
+
953
def create_uncompressed_encoding(array):
    """
    Create a simple encoding for the given array that does not compress the data.

    Parameters
    ----------
    array : ndarray
        The array to create the encoding for.

    Returns
    -------
    encoding : list of Encoding
        The encoding for the data.
    """
    # String arrays need an index-based encoding first,
    # everything else can be converted into bytes directly
    is_string_array = np.issubdtype(array.dtype, np.str_)
    return [StringArrayEncoding()] if is_string_array else [ByteArrayEncoding()]
971
+
972
+
973
def encode_stepwise(data, encoding):
    """
    Apply a list of encodings stepwise to the given data.

    Parameters
    ----------
    data : ndarray
        The data to be encoded.
    encoding : list of Encoding
        The encodings to be applied, in order.

    Returns
    -------
    encoded_data : ndarray or bytes
        The encoded data.
    """
    # Use a distinct loop variable:
    # previously the loop variable shadowed the `encoding` parameter
    for enc in encoding:
        data = enc.encode(data)
    return data
992
+
993
+
994
def decode_stepwise(data, encoding):
    """
    Revert a list of encodings stepwise on the given data.

    Parameters
    ----------
    data : ndarray or bytes
        The data to be decoded.
    encoding : list of Encoding
        The encodings whose decoding steps are applied.
        They are reverted in reverse order, i.e. the encoding applied
        last during encoding is decoded first.

    Returns
    -------
    decoded_data : ndarray
        The decoded data.
    """
    current = data
    for step in reversed(encoding):
        current = step.decode(current)
    return current
1013
+
1014
+
1015
def _camel_to_snake_case(attribute_name):
    # Convert a camelCase attribute name to snake_case
    # NOTE(review): the exact substitution depends on the module-level
    # `CAMEL_CASE_PATTERN` regex, defined outside this view -- presumably
    # it inserts "_" at each word boundary; verify against its definition
    return CAMEL_CASE_PATTERN.sub("_", attribute_name).lower()
1017
+
1018
+
1019
+ def _snake_to_camel_case(attribute_name):
1020
+ attribute_name = "".join(
1021
+ word.capitalize() for word in attribute_name.split("_")
1022
+ )
1023
+ return attribute_name[0].lower() + attribute_name[1:]
1024
+
1025
+
1026
+ def _safe_cast(array, dtype, allow_decimal_loss=False):
1027
+ source_dtype = array.dtype
1028
+ target_dtype = np.dtype(dtype)
1029
+
1030
+ if target_dtype == source_dtype:
1031
+ return array
1032
+
1033
+ if np.issubdtype(target_dtype, np.integer):
1034
+ if np.issubdtype(source_dtype, np.floating):
1035
+ if not allow_decimal_loss:
1036
+ raise ValueError("Cannot cast floating point to integer")
1037
+ if not np.isfinite(array).all():
1038
+ raise ValueError("Data contains non-finite values")
1039
+ elif not np.issubdtype(source_dtype, np.integer):
1040
+ # Neither float, nor integer -> cannot cast
1041
+ raise ValueError(f"Cannot cast '{source_dtype}' to integer")
1042
+ dtype_info = np.iinfo(target_dtype)
1043
+ # Check if an integer underflow/overflow would occur during conversion
1044
+ if np.max(array) > dtype_info.max or np.min(array) < dtype_info.min:
1045
+ raise ValueError("Values do not fit into the given dtype")
1046
+
1047
+ return array.astype(target_dtype)