biotite 1.1.0__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of biotite might be problematic. Click here for more details.

Files changed (332) hide show
  1. biotite/__init__.py +18 -0
  2. biotite/application/__init__.py +69 -0
  3. biotite/application/application.py +276 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +500 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +92 -0
  8. biotite/application/blast/webapp.py +428 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +223 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +159 -0
  13. biotite/application/localapp.py +342 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +116 -0
  16. biotite/application/msaapp.py +363 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +227 -0
  19. biotite/application/muscle/app5.py +163 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +452 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +199 -0
  24. biotite/application/util.py +57 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +310 -0
  27. biotite/application/viennarna/rnafold.py +254 -0
  28. biotite/application/viennarna/rnaplot.py +206 -0
  29. biotite/application/viennarna/util.py +77 -0
  30. biotite/application/webapp.py +76 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +60 -0
  35. biotite/database/entrez/dbnames.py +91 -0
  36. biotite/database/entrez/download.py +229 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +262 -0
  39. biotite/database/error.py +16 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +258 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +830 -0
  44. biotite/database/pubchem/throttle.py +98 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +159 -0
  47. biotite/database/rcsb/query.py +964 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +40 -0
  50. biotite/database/uniprot/download.py +129 -0
  51. biotite/database/uniprot/query.py +293 -0
  52. biotite/file.py +232 -0
  53. biotite/sequence/__init__.py +84 -0
  54. biotite/sequence/align/__init__.py +203 -0
  55. biotite/sequence/align/alignment.py +680 -0
  56. biotite/sequence/align/banded.cp313-win_amd64.pyd +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +71 -0
  59. biotite/sequence/align/cigar.py +425 -0
  60. biotite/sequence/align/kmeralphabet.cp313-win_amd64.pyd +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +595 -0
  62. biotite/sequence/align/kmersimilarity.cp313-win_amd64.pyd +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cp313-win_amd64.pyd +0 -0
  65. biotite/sequence/align/kmertable.pyx +3411 -0
  66. biotite/sequence/align/localgapped.cp313-win_amd64.pyd +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cp313-win_amd64.pyd +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +622 -0
  71. biotite/sequence/align/matrix_data/3Di.mat +24 -0
  72. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  81. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  87. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  93. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  99. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  100. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  101. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  102. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  103. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  104. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  105. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  154. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  155. biotite/sequence/align/matrix_data/PB.license +21 -0
  156. biotite/sequence/align/matrix_data/PB.mat +18 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  160. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  161. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  162. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  163. biotite/sequence/align/multiple.cp313-win_amd64.pyd +0 -0
  164. biotite/sequence/align/multiple.pyx +620 -0
  165. biotite/sequence/align/pairwise.cp313-win_amd64.pyd +0 -0
  166. biotite/sequence/align/pairwise.pyx +587 -0
  167. biotite/sequence/align/permutation.cp313-win_amd64.pyd +0 -0
  168. biotite/sequence/align/permutation.pyx +313 -0
  169. biotite/sequence/align/primes.txt +821 -0
  170. biotite/sequence/align/selector.cp313-win_amd64.pyd +0 -0
  171. biotite/sequence/align/selector.pyx +954 -0
  172. biotite/sequence/align/statistics.py +264 -0
  173. biotite/sequence/align/tracetable.cp313-win_amd64.pyd +0 -0
  174. biotite/sequence/align/tracetable.pxd +64 -0
  175. biotite/sequence/align/tracetable.pyx +370 -0
  176. biotite/sequence/alphabet.py +555 -0
  177. biotite/sequence/annotation.py +830 -0
  178. biotite/sequence/codec.cp313-win_amd64.pyd +0 -0
  179. biotite/sequence/codec.pyx +155 -0
  180. biotite/sequence/codon.py +477 -0
  181. biotite/sequence/codon_tables.txt +202 -0
  182. biotite/sequence/graphics/__init__.py +33 -0
  183. biotite/sequence/graphics/alignment.py +1115 -0
  184. biotite/sequence/graphics/color_schemes/3di_flower.json +48 -0
  185. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  186. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  187. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  188. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  189. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  190. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  192. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  193. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  194. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  195. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  196. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  197. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  198. biotite/sequence/graphics/color_schemes/pb_flower.json +40 -0
  199. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  200. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  201. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  202. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  203. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  204. biotite/sequence/graphics/colorschemes.py +170 -0
  205. biotite/sequence/graphics/dendrogram.py +229 -0
  206. biotite/sequence/graphics/features.py +544 -0
  207. biotite/sequence/graphics/logo.py +104 -0
  208. biotite/sequence/graphics/plasmid.py +712 -0
  209. biotite/sequence/io/__init__.py +12 -0
  210. biotite/sequence/io/fasta/__init__.py +22 -0
  211. biotite/sequence/io/fasta/convert.py +284 -0
  212. biotite/sequence/io/fasta/file.py +265 -0
  213. biotite/sequence/io/fastq/__init__.py +19 -0
  214. biotite/sequence/io/fastq/convert.py +117 -0
  215. biotite/sequence/io/fastq/file.py +507 -0
  216. biotite/sequence/io/genbank/__init__.py +17 -0
  217. biotite/sequence/io/genbank/annotation.py +269 -0
  218. biotite/sequence/io/genbank/file.py +573 -0
  219. biotite/sequence/io/genbank/metadata.py +336 -0
  220. biotite/sequence/io/genbank/sequence.py +171 -0
  221. biotite/sequence/io/general.py +201 -0
  222. biotite/sequence/io/gff/__init__.py +26 -0
  223. biotite/sequence/io/gff/convert.py +128 -0
  224. biotite/sequence/io/gff/file.py +450 -0
  225. biotite/sequence/phylo/__init__.py +36 -0
  226. biotite/sequence/phylo/nj.cp313-win_amd64.pyd +0 -0
  227. biotite/sequence/phylo/nj.pyx +221 -0
  228. biotite/sequence/phylo/tree.cp313-win_amd64.pyd +0 -0
  229. biotite/sequence/phylo/tree.pyx +1169 -0
  230. biotite/sequence/phylo/upgma.cp313-win_amd64.pyd +0 -0
  231. biotite/sequence/phylo/upgma.pyx +164 -0
  232. biotite/sequence/profile.py +567 -0
  233. biotite/sequence/search.py +118 -0
  234. biotite/sequence/seqtypes.py +713 -0
  235. biotite/sequence/sequence.py +374 -0
  236. biotite/setup_ccd.py +197 -0
  237. biotite/structure/__init__.py +133 -0
  238. biotite/structure/alphabet/__init__.py +25 -0
  239. biotite/structure/alphabet/encoder.py +332 -0
  240. biotite/structure/alphabet/encoder_weights_3di.kerasify +0 -0
  241. biotite/structure/alphabet/i3d.py +110 -0
  242. biotite/structure/alphabet/layers.py +86 -0
  243. biotite/structure/alphabet/pb.license +21 -0
  244. biotite/structure/alphabet/pb.py +171 -0
  245. biotite/structure/alphabet/unkerasify.py +122 -0
  246. biotite/structure/atoms.py +1554 -0
  247. biotite/structure/basepairs.py +1404 -0
  248. biotite/structure/bonds.cp313-win_amd64.pyd +0 -0
  249. biotite/structure/bonds.pyx +1972 -0
  250. biotite/structure/box.py +588 -0
  251. biotite/structure/celllist.cp313-win_amd64.pyd +0 -0
  252. biotite/structure/celllist.pyx +849 -0
  253. biotite/structure/chains.py +314 -0
  254. biotite/structure/charges.cp313-win_amd64.pyd +0 -0
  255. biotite/structure/charges.pyx +520 -0
  256. biotite/structure/compare.py +274 -0
  257. biotite/structure/density.py +109 -0
  258. biotite/structure/dotbracket.py +214 -0
  259. biotite/structure/error.py +39 -0
  260. biotite/structure/filter.py +590 -0
  261. biotite/structure/geometry.py +655 -0
  262. biotite/structure/graphics/__init__.py +13 -0
  263. biotite/structure/graphics/atoms.py +243 -0
  264. biotite/structure/graphics/rna.py +295 -0
  265. biotite/structure/hbond.py +428 -0
  266. biotite/structure/info/__init__.py +24 -0
  267. biotite/structure/info/atom_masses.json +121 -0
  268. biotite/structure/info/atoms.py +81 -0
  269. biotite/structure/info/bonds.py +149 -0
  270. biotite/structure/info/ccd.py +202 -0
  271. biotite/structure/info/components.bcif +0 -0
  272. biotite/structure/info/groups.py +131 -0
  273. biotite/structure/info/masses.py +121 -0
  274. biotite/structure/info/misc.py +138 -0
  275. biotite/structure/info/radii.py +197 -0
  276. biotite/structure/info/standardize.py +186 -0
  277. biotite/structure/integrity.py +215 -0
  278. biotite/structure/io/__init__.py +29 -0
  279. biotite/structure/io/dcd/__init__.py +13 -0
  280. biotite/structure/io/dcd/file.py +67 -0
  281. biotite/structure/io/general.py +243 -0
  282. biotite/structure/io/gro/__init__.py +14 -0
  283. biotite/structure/io/gro/file.py +344 -0
  284. biotite/structure/io/mol/__init__.py +20 -0
  285. biotite/structure/io/mol/convert.py +112 -0
  286. biotite/structure/io/mol/ctab.py +415 -0
  287. biotite/structure/io/mol/header.py +120 -0
  288. biotite/structure/io/mol/mol.py +149 -0
  289. biotite/structure/io/mol/sdf.py +914 -0
  290. biotite/structure/io/netcdf/__init__.py +13 -0
  291. biotite/structure/io/netcdf/file.py +64 -0
  292. biotite/structure/io/pdb/__init__.py +20 -0
  293. biotite/structure/io/pdb/convert.py +307 -0
  294. biotite/structure/io/pdb/file.py +1290 -0
  295. biotite/structure/io/pdb/hybrid36.cp313-win_amd64.pyd +0 -0
  296. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  297. biotite/structure/io/pdbqt/__init__.py +15 -0
  298. biotite/structure/io/pdbqt/convert.py +113 -0
  299. biotite/structure/io/pdbqt/file.py +688 -0
  300. biotite/structure/io/pdbx/__init__.py +23 -0
  301. biotite/structure/io/pdbx/bcif.py +656 -0
  302. biotite/structure/io/pdbx/cif.py +1075 -0
  303. biotite/structure/io/pdbx/component.py +245 -0
  304. biotite/structure/io/pdbx/compress.py +321 -0
  305. biotite/structure/io/pdbx/convert.py +1745 -0
  306. biotite/structure/io/pdbx/encoding.cp313-win_amd64.pyd +0 -0
  307. biotite/structure/io/pdbx/encoding.pyx +1031 -0
  308. biotite/structure/io/trajfile.py +693 -0
  309. biotite/structure/io/trr/__init__.py +13 -0
  310. biotite/structure/io/trr/file.py +43 -0
  311. biotite/structure/io/xtc/__init__.py +13 -0
  312. biotite/structure/io/xtc/file.py +43 -0
  313. biotite/structure/mechanics.py +73 -0
  314. biotite/structure/molecules.py +352 -0
  315. biotite/structure/pseudoknots.py +628 -0
  316. biotite/structure/rdf.py +245 -0
  317. biotite/structure/repair.py +304 -0
  318. biotite/structure/residues.py +572 -0
  319. biotite/structure/sasa.cp313-win_amd64.pyd +0 -0
  320. biotite/structure/sasa.pyx +322 -0
  321. biotite/structure/segments.py +178 -0
  322. biotite/structure/sequence.py +111 -0
  323. biotite/structure/sse.py +308 -0
  324. biotite/structure/superimpose.py +689 -0
  325. biotite/structure/transform.py +530 -0
  326. biotite/structure/util.py +168 -0
  327. biotite/version.py +16 -0
  328. biotite/visualize.py +265 -0
  329. biotite-1.1.0.dist-info/METADATA +190 -0
  330. biotite-1.1.0.dist-info/RECORD +332 -0
  331. biotite-1.1.0.dist-info/WHEEL +4 -0
  332. biotite-1.1.0.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,1031 @@
1
+ # This source code is part of the Biotite package and is distributed
2
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
3
+ # information.
4
+
5
+ """
6
+ This module contains data encodings for BinaryCIF files.
7
+ """
8
+
9
+ __name__ = "biotite.structure.io.pdbx"
10
+ __author__ = "Patrick Kunzmann"
11
+ __all__ = ["ByteArrayEncoding", "FixedPointEncoding",
12
+ "IntervalQuantizationEncoding", "RunLengthEncoding",
13
+ "DeltaEncoding", "IntegerPackingEncoding", "StringArrayEncoding",
14
+ "TypeCode"]
15
+
16
+ cimport cython
17
+ cimport numpy as np
18
+
19
+ from dataclasses import dataclass
20
+ from abc import ABCMeta, abstractmethod
21
+ from numbers import Integral
22
+ from enum import IntEnum
23
+ import re
24
+ import numpy as np
25
+ from .component import _Component
26
+ from ....file import InvalidFileError
27
+
28
+ ctypedef np.int8_t int8
29
+ ctypedef np.int16_t int16
30
+ ctypedef np.int32_t int32
31
+ ctypedef np.uint8_t uint8
32
+ ctypedef np.uint16_t uint16
33
+ ctypedef np.uint32_t uint32
34
+ ctypedef np.float32_t float32
35
+ ctypedef np.float64_t float64
36
+
37
+ ctypedef fused Integer:
38
+ uint8
39
+ uint16
40
+ uint32
41
+ int8
42
+ int16
43
+ int32
44
+
45
+ # Used to create cartesian product of type combinations
46
+ # in run-length encoding
47
+ ctypedef fused OutputInteger:
48
+ uint8
49
+ uint16
50
+ uint32
51
+ int8
52
+ int16
53
+ int32
54
+
55
+ ctypedef fused Float:
56
+ float32
57
+ float64
58
+
59
+
60
+ CAMEL_CASE_PATTERN = re.compile(r"(?<!^)(?=[A-Z])")
61
+
62
+
63
+ class TypeCode(IntEnum):
64
+ """
65
+ This enum type represents integers that represent data types in
66
+ *BinaryCIF*.
67
+ """
68
+ INT8 = 1
69
+ INT16 = 2
70
+ INT32 = 3
71
+ UINT8 = 4
72
+ UINT16 = 5
73
+ UINT32 = 6
74
+ FLOAT32 = 32
75
+ FLOAT64 = 33
76
+
77
+ @staticmethod
78
+ def from_dtype(dtype):
79
+ """
80
+ Convert a *NumPy* dtype to a *BinaryCIF* type code.
81
+
82
+ Parameters
83
+ ----------
84
+ dtype : dtype or int or TypeCode
85
+ The data type to be converted.
86
+ If already a type code, it is simply returned.
87
+
88
+ Returns
89
+ -------
90
+ type_code : TypeCode
91
+ The corresponding type code.
92
+ """
93
+ if isinstance(dtype, Integral):
94
+ # Already a type code
95
+ return TypeCode(dtype)
96
+ else:
97
+ dtype = np.dtype(dtype)
98
+ # Find the closest dtype supported by the format
99
+ if np.issubdtype(dtype, np.integer):
100
+ # int64 is not supported by format
101
+ if dtype == np.int64:
102
+ supported_dtype = np.int32
103
+ elif dtype == np.uint64:
104
+ supported_dtype = np.uint32
105
+ else:
106
+ supported_dtype = dtype
107
+ elif np.issubdtype(dtype, np.floating):
108
+ if dtype == np.float16:
109
+ supported_dtype = np.float32
110
+ # float128 is not available on all architectures
111
+ elif hasattr(np, "float128") and dtype == np.float128:
112
+ supported_dtype = np.float64
113
+ else:
114
+ supported_dtype = dtype
115
+ else:
116
+ raise ValueError(
117
+ f"dtype '{dtype}' is not supported by BinaryCIF"
118
+ )
119
+ return _DTYPE_TO_TYPE_CODE[
120
+ np.dtype(supported_dtype).newbyteorder("<").str
121
+ ]
122
+
123
+ def to_dtype(self):
124
+ """
125
+ Convert this type code to a *NumPy* dtype.
126
+
127
+ Returns
128
+ -------
129
+ dtype : dtype
130
+ The corresponding data type.
131
+ """
132
+ return _TYPE_CODE_TO_DTYPE[self]
133
+
134
+ # Converts BCIF integers representing the type to an actual NumPy dtype
135
+ _TYPE_CODE_TO_DTYPE = {
136
+ # All data types are little-endian
137
+ TypeCode.INT8: "|i1",
138
+ TypeCode.INT16: "<i2",
139
+ TypeCode.INT32: "<i4",
140
+ TypeCode.UINT8: "|u1",
141
+ TypeCode.UINT16: "<u2",
142
+ TypeCode.UINT32: "<u4",
143
+ TypeCode.FLOAT32: "<f4",
144
+ TypeCode.FLOAT64: "<f8"
145
+ }
146
+ _DTYPE_TO_TYPE_CODE = {val: key for key, val in _TYPE_CODE_TO_DTYPE.items()}
147
+
148
+
149
+ class Encoding(_Component, metaclass=ABCMeta):
150
+ """
151
+ Abstract base class for *BinaryCIF* data encodings.
152
+
153
+ Notes
154
+ -----
155
+ The encoding classes do not omit bound checks for decoding,
156
+ since the file content may be invalid/malicious.
157
+ """
158
+
159
+ @classmethod
160
+ def deserialize(cls, content):
161
+ params = {
162
+ _camel_to_snake_case(param): value
163
+ for param, value in content.items()
164
+ }
165
+ # 'kind' is not a parameter; it indicates the class itself
166
+ params.pop("kind")
167
+ try:
168
+ encoding = cls(**params)
169
+ except TypeError as e:
170
+ raise InvalidFileError(
171
+ f"Invalid encoding parameters for {cls.__name__}"
172
+ )
173
+ except ValueError:
174
+ raise InvalidFileError(
175
+ f"Missing encoding parameters for {cls.__name__}"
176
+ )
177
+ return encoding
178
+
179
+ def serialize(self):
180
+ for param in self.__annotations__:
181
+ if getattr(self, param) is None:
182
+ raise ValueError(
183
+ f"'{param}' must be explicitly given or needs to be "
184
+ "determined from first encoding pass, before it is "
185
+ "serialized"
186
+ )
187
+
188
+ serialized = {
189
+ _snake_to_camel_case(param): getattr(self, param)
190
+ for param in self.__annotations__
191
+ }
192
+ serialized.update({
193
+ "kind": _encoding_classes_kinds[type(self).__name__]
194
+ })
195
+ return serialized
196
+
197
+ @abstractmethod
198
+ def encode(self, data):
199
+ """
200
+ Apply this encoding to the given data.
201
+
202
+ Parameters
203
+ ----------
204
+ data : ndarray
205
+ The data to be encoded.
206
+
207
+ Returns
208
+ -------
209
+ encoded_data : ndarray or bytes
210
+ The encoded data.
211
+ """
212
+ raise NotImplementedError()
213
+
214
+ @abstractmethod
215
+ def decode(self, data):
216
+ """
217
+ Apply the inverse of this encoding to the given data.
218
+
219
+ Parameters
220
+ ----------
221
+ data : ndarray or bytes
222
+ The data to be decoded.
223
+
224
+ Returns
225
+ -------
226
+ decoded_data : ndarray
227
+ The decoded data.
228
+ """
229
+ # Important: Do not omit bound checks for decoding,
230
+ # since the file content may be invalid/malicious.
231
+ raise NotImplementedError()
232
+
233
+
234
+ @dataclass
235
+ class ByteArrayEncoding(Encoding):
236
+ r"""
237
+ Encoding that encodes an array into bytes.
238
+
239
+ Parameters
240
+ ----------
241
+ type : dtype or TypeCode, optional
242
+ The data type of the array to be encoded.
243
+ Either a NumPy dtype or a *BinaryCIF* type code is accepted.
244
+ If omitted, the data type is taken from the data the
245
+ first time :meth:`encode()` is called.
246
+
247
+ Attributes
248
+ ----------
249
+ type : TypeCode
250
+
251
+ Examples
252
+ --------
253
+
254
+ >>> data = np.arange(3)
255
+ >>> print(data)
256
+ [0 1 2]
257
+ >>> print(ByteArrayEncoding().encode(data))
258
+ b'\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00'
259
+ """
260
+ type: ... = None
261
+
262
+ def __post_init__(self):
263
+ if self.type is not None:
264
+ self.type = TypeCode.from_dtype(self.type)
265
+
266
+ def encode(self, data):
267
+ if self.type is None:
268
+ self.type = TypeCode.from_dtype(data.dtype)
269
+ return _safe_cast(data, self.type.to_dtype()).tobytes()
270
+
271
+ def decode(self, data):
272
+ # Data is raw bytes in this case
273
+ return np.frombuffer(data, dtype=self.type.to_dtype())
274
+
275
+
276
+ @dataclass
277
+ class FixedPointEncoding(Encoding):
278
+ """
279
+ Lossy encoding that multiplies floating point values with a given
280
+ factor and subsequently rounds them to the nearest integer.
281
+
282
+ Parameters
283
+ ----------
284
+ factor : float
285
+ The factor by which the data is multiplied before rounding.
286
+ src_type : dtype or TypeCode, optional
287
+ The data type of the array to be encoded.
288
+ Either a NumPy dtype or a *BinaryCIF* type code is accepted.
289
+ The dtype must be a float type.
290
+ If omitted, the data type is taken from the data the
291
+ first time :meth:`encode()` is called.
292
+
293
+ Attributes
294
+ ----------
295
+ factor : float
296
+ src_type : TypeCode
297
+
298
+ Examples
299
+ --------
300
+
301
+ >>> data = np.array([9.87, 6.543])
302
+ >>> print(data)
303
+ [9.870 6.543]
304
+ >>> print(FixedPointEncoding(factor=100).encode(data))
305
+ [987 654]
306
+ """
307
+ factor: ...
308
+ src_type: ... = None
309
+
310
+ def __post_init__(self):
311
+ if self.src_type is not None:
312
+ self.src_type = TypeCode.from_dtype(self.src_type)
313
+ if self.src_type not in (TypeCode.FLOAT32, TypeCode.FLOAT64):
314
+ raise ValueError(
315
+ "Only floating point types are supported"
316
+ )
317
+
318
+ def encode(self, data):
319
+ # If not given in constructor, it is determined from the data
320
+ if self.src_type is None:
321
+ self.src_type = TypeCode.from_dtype(data.dtype)
322
+ if self.src_type not in (TypeCode.FLOAT32, TypeCode.FLOAT64):
323
+ raise ValueError(
324
+ "Only floating point types are supported"
325
+ )
326
+
327
+ # Round to avoid wrong values due to floating point inaccuracies
328
+ return np.round(data * self.factor).astype(np.int32)
329
+
330
+ def decode(self, data):
331
+ return (data / self.factor).astype(
332
+ dtype=self.src_type.to_dtype(), copy=False
333
+ )
334
+
335
+
336
+ @dataclass
337
+ class IntervalQuantizationEncoding(Encoding):
338
+ """
339
+ Lossy encoding that sorts floating point values into bins.
340
+ Each bin is represented by an integer.
341
+
342
+ Parameters
343
+ ----------
344
+ min, max : float
345
+ The minimum and maximum value the bins comprise.
346
+ num_steps : int
347
+ The number of bins.
348
+ src_type : dtype or TypeCode, optional
349
+ The data type of the array to be encoded.
350
+ Either a NumPy dtype or a *BinaryCIF* type code is accepted.
351
+ The dtype must be a float type.
352
+ If omitted, the data type is taken from the data the
353
+ first time :meth:`encode()` is called.
354
+
355
+ Attributes
356
+ ----------
357
+ min, max : float
358
+ num_steps : int
359
+ src_type : TypeCode
360
+
361
+ Examples
362
+ --------
363
+
364
+ >>> data = np.linspace(11, 12, 6)
365
+ >>> print(data)
366
+ [11.0 11.2 11.4 11.6 11.8 12.0]
367
+ >>> # Use 0.5 as step size
368
+ >>> encoding = IntervalQuantizationEncoding(min=10, max=20, num_steps=21)
369
+ >>> # The encoding is lossy, as different values are mapped to the same bin
370
+ >>> encoded = encoding.encode(data)
371
+ >>> print(encoded)
372
+ [2 3 3 4 4 4]
373
+ >>> decoded = encoding.decode(encoded)
374
+ >>> print(decoded)
375
+ [11.0 11.5 11.5 12.0 12.0 12.0]
376
+ """
377
+ min: ...
378
+ max: ...
379
+ num_steps: ...
380
+ src_type: ... = None
381
+
382
+ def __post_init__(self):
383
+ if self.src_type is not None:
384
+ self.src_type = TypeCode.from_dtype(self.src_type)
385
+
386
+ def encode(self, data):
387
+ # If not given in constructor, it is determined from the data
388
+ if self.src_type is None:
389
+ self.src_type = TypeCode.from_dtype(data.dtype)
390
+
391
+ steps = np.linspace(
392
+ self.min, self.max, self.num_steps, dtype=data.dtype
393
+ )
394
+ indices = np.searchsorted(steps, data, side="left")
395
+ return indices.astype(np.int32, copy=False)
396
+
397
+ def decode(self, data):
398
+ output = data * (self.max - self.min) / (self.num_steps - 1)
399
+ output = output.astype(self.src_type.to_dtype(), copy=False)
400
+ output += self.min
401
+ return output
402
+
403
+
404
+ @dataclass
405
+ class RunLengthEncoding(Encoding):
406
+ """
407
+ Encoding that compresses runs of equal values into pairs of
408
+ (value, run length).
409
+
410
+ Parameters
411
+ ----------
412
+ src_size : int, optional
413
+ The size of the array to be encoded.
414
+ If omitted, the size is determined from the data the
415
+ first time :meth:`encode()` is called.
416
+ src_type : dtype or TypeCode, optional
417
+ The data type of the array to be encoded.
418
+ Either a NumPy dtype or a *BinaryCIF* type code is accepted.
419
+ The dtype must be an integer type.
420
+ If omitted, the data type is taken from the data the
421
+ first time :meth:`encode()` is called.
422
+
423
+ Attributes
424
+ ----------
425
+ src_size : int
426
+ src_type : TypeCode
427
+
428
+ Examples
429
+ --------
430
+
431
+ >>> data = np.array([1, 1, 1, 5, 3, 3])
432
+ >>> print(data)
433
+ [1 1 1 5 3 3]
434
+ >>> encoded = RunLengthEncoding().encode(data)
435
+ >>> print(encoded)
436
+ [1 3 5 1 3 2]
437
+ >>> # Emphasize the pairs
438
+ >>> print(encoded.reshape(-1, 2))
439
+ [[1 3]
440
+ [5 1]
441
+ [3 2]]
442
+ """
443
+ src_size: ... = None
444
+ src_type: ... = None
445
+
446
+ def __post_init__(self):
447
+ if self.src_type is not None:
448
+ self.src_type = TypeCode.from_dtype(self.src_type)
449
+
450
+ def encode(self, data):
451
+ # If not given in constructor, it is determined from the data
452
+ if self.src_type is None:
453
+ self.src_type = TypeCode.from_dtype(data.dtype)
454
+ if self.src_size is None:
455
+ self.src_size = data.shape[0]
456
+ elif self.src_size != data.shape[0]:
457
+ raise IndexError(
458
+ "Given source size does not match actual data size"
459
+ )
460
+ return self._encode(_safe_cast(data, self.src_type.to_dtype()))
461
+
462
+ def decode(self, data):
463
+ return self._decode(
464
+ data, np.empty(0, dtype=self.src_type.to_dtype())
465
+ )
466
+
467
def _encode(self, const Integer[:] data):
    """
    Convert `data` into consecutive ``(value, run length)`` pairs.

    Parameters
    ----------
    data : memoryview of integers
        The array to be encoded.

    Returns
    -------
    encoded : ndarray, dtype=np.int32
        The flat array of alternating values and run lengths.
        Empty, if `data` is empty.
    """
    cdef int32[:] output
    cdef int i=0, j=0
    cdef int val
    cdef int run_length = 0
    cdef int curr_val

    # An empty array contains no runs:
    # Accessing 'data[0]' or writing a final pair would be out of bounds
    if data.shape[0] == 0:
        return np.zeros(0, dtype=np.int32)

    # Pessimistic allocation of output array
    # -> Run length is 1 for every element
    output = np.zeros(data.shape[0] * 2, dtype=np.int32)
    val = data[0]
    for i in range(data.shape[0]):
        curr_val = data[i]
        if curr_val == val:
            run_length += 1
        else:
            # New element -> Write element with run-length
            output[j] = val
            output[j+1] = run_length
            j += 2
            val = curr_val
            run_length = 1
    # Write last element
    output[j] = val
    output[j+1] = run_length
    j += 2
    # Trim to correct size
    return np.asarray(output)[:j]
492
+
493
def _decode(self, const Integer[:] data, OutputInteger[:] output_type):
    """
    Expand ``(value, run length)`` pairs into the original array.

    `output_type` is merely a typed placeholder to allow for static
    typing of output.
    """
    if data.shape[0] % 2 != 0:
        raise ValueError("Invalid run-length encoded data")

    cdef int n_total = 0
    cdef int pair_start, write_pos
    cdef int run_value, run_count

    if self.src_size is not None:
        n_total = self.src_size
    else:
        # Unknown source size -> The output length is the sum of all run lengths
        for pair_start in range(0, data.shape[0], 2):
            n_total += data[pair_start + 1]

    cdef OutputInteger[:] output = np.zeros(
        n_total, dtype=np.asarray(output_type).dtype
    )
    # Write each run into its slice of the output
    write_pos = 0
    for pair_start in range(0, data.shape[0], 2):
        run_value = data[pair_start]
        run_count = data[pair_start + 1]
        output[write_pos : write_pos + run_count] = run_value
        write_pos += run_count
    return np.asarray(output)
523
+
524
+
525
@dataclass
class DeltaEncoding(Encoding):
    """
    Encoding that encodes an array of integers into an array of
    consecutive differences.

    Parameters
    ----------
    src_type : dtype or TypeCode, optional
        The data type of the array to be encoded.
        Either a NumPy dtype or a *BinaryCIF* type code is accepted.
        The dtype must be an integer type.
        If omitted, the data type is taken from the data the
        first time :meth:`encode()` is called.
    origin : int, optional
        The starting value from which the differences are calculated.
        If omitted, the value is taken from the first array element the
        first time :meth:`encode()` is called.
        If that array is empty, ``0`` is used.

    Attributes
    ----------
    src_type : TypeCode
    origin : int

    Examples
    --------

    >>> data = np.array([1, 1, 2, 3, 5, 8])
    >>> encoding = DeltaEncoding()
    >>> print(encoding.encode(data))
    [0 0 1 1 2 3]
    >>> print(encoding.origin)
    1
    """
    src_type: ... = None
    origin: ... = None

    def __post_init__(self):
        if self.src_type is not None:
            self.src_type = TypeCode.from_dtype(self.src_type)

    def encode(self, data):
        # If not given in constructor, it is determined from the data
        if self.src_type is None:
            self.src_type = TypeCode.from_dtype(data.dtype)
        if self.origin is None:
            # An empty array has no first element -> use a neutral origin
            self.origin = data[0] if len(data) > 0 else 0

        # Differences of unsigned integers wrap around instead of
        # becoming negative
        # -> Calculate in a signed type that is wide enough
        data = data.astype(np.int64, copy=False)
        # A Python integer keeps the subtraction in 'int64', independent
        # of the NumPy scalar type the origin may have
        data = data - int(self.origin)
        return np.diff(data, prepend=0).astype(np.int32, copy=False)

    def decode(self, data):
        # The cumulative sum reverts the differences,
        # adding the origin restores the absolute values
        output = np.cumsum(data, dtype=self.src_type.to_dtype())
        output += self.origin
        return output
580
+
581
+
582
@dataclass
class IntegerPackingEncoding(Encoding):
    """
    Encoding that compresses an array of 32-bit integers into an array
    of smaller sized integers.

    If a value does not fit into smaller integer type,
    the integer is represented by a sum of consecutive elements
    in the compressed array.

    Parameters
    ----------
    byte_count : int
        The number of bytes the packed integers should occupy.
        Supported values are 1 and 2 for 8-bit and 16-bit integers,
        respectively.
    src_size : int, optional
        The size of the array to be encoded.
        If omitted, the size is determined from the data the
        first time :meth:`encode()` is called.
    is_unsigned : bool, optional
        Whether the values should be packed into signed or unsigned
        integers.
        If omitted, first time :meth:`encode()` is called, determines whether
        the values fit into unsigned integers.

    Attributes
    ----------
    byte_count : int
    src_size : int
    is_unsigned : bool

    Examples
    --------

    >>> data = np.array([1, 2, -3, 128])
    >>> print(data)
    [  1   2  -3 128]
    >>> print(IntegerPackingEncoding(byte_count=1).encode(data))
    [  1   2  -3 127   1]
    """
    byte_count: ...
    src_size: ... = None
    is_unsigned: ... = None

    def encode(self, data):
        if self.src_size is None:
            self.src_size = len(data)
        elif self.src_size != len(data):
            raise IndexError(
                "Given source size does not match actual data size"
            )
        if self.is_unsigned is None:
            # Only positive values -> use unsigned integers
            # An empty array has no minimum, but trivially fits
            self.is_unsigned = len(data) == 0 or data.min().item() >= 0

        data = data.astype(np.int32, copy=False)
        return self._encode(
            data, np.empty(0, dtype=self._determine_packed_dtype())
        )

    def decode(self, const Integer[:] data):
        cdef int i, j
        cdef int min_val, max_val
        cdef int packed_val, unpacked_val
        cdef int length = 0
        bounds = self._get_bounds(data)
        min_val = bounds[0]
        max_val = bounds[1]
        # For unsigned integers, do not check lower bound (is always 0):
        # 0 is a regular value and must not be treated as continuation
        # -> Set lower bound to value that is never reached
        if min_val == 0:
            min_val = -1

        if self.src_size is None:
            # Determine length of output array:
            # Each packed value that is not a bound completes one element
            for i in range(data.shape[0]):
                packed_val = data[i]
                if packed_val != max_val and packed_val != min_val:
                    length += 1
        else:
            length = self.src_size

        cdef int32[:] output = np.zeros(length, dtype=np.int32)
        j = 0
        unpacked_val = 0
        for i in range(data.shape[0]):
            packed_val = data[i]
            if packed_val == max_val or packed_val == min_val:
                # Bound value -> The element continues in the next packed value
                unpacked_val += packed_val
            else:
                unpacked_val += packed_val
                output[j] = unpacked_val
                unpacked_val = 0
                j += 1
        return np.asarray(output)

    def _determine_packed_dtype(self):
        if self.byte_count == 1:
            if self.is_unsigned:
                return np.uint8
            else:
                return np.int8
        elif self.byte_count == 2:
            if self.is_unsigned:
                return np.uint16
            else:
                return np.int16
        else:
            raise ValueError("Unsupported byte count")

    @cython.cdivision(True)
    def _encode(self, const Integer[:] data, OutputInteger[:] output_type):
        """
        `output_type` is merely a typed placeholder to allow for static
        typing of output.
        """
        cdef int i=0, j=0

        packed_type = np.asarray(output_type).dtype
        cdef int min_val = np.iinfo(packed_type).min
        cdef int max_val = np.iinfo(packed_type).max

        # Get length of output array
        # by summing up required length of each element
        cdef int number
        cdef long length = 0
        for i in range(data.shape[0]):
            number = data[i]
            if number < 0:
                if min_val == 0:
                    raise ValueError(
                        "Cannot pack negative numbers into unsigned type"
                    )
                # The required packed length for an element is the
                # number of times min_val/max_val need to be repeated
                length += number // min_val + 1
            elif number > 0:
                length += number // max_val + 1
            else:
                # number = 0
                length += 1

        # Fill output
        cdef OutputInteger[:] output = np.zeros(length, dtype=packed_type)
        cdef int remainder
        j = 0
        for i in range(data.shape[0]):
            remainder = data[i]
            if remainder < 0:
                # Negative values in combination with an unsigned type
                # were already rejected while computing the length
                while remainder <= min_val:
                    remainder -= min_val
                    output[j] = min_val
                    j += 1
            elif remainder > 0:
                while remainder >= max_val:
                    remainder -= max_val
                    output[j] = max_val
                    j += 1
            # The final value is never a bound and hence terminates the element
            output[j] = remainder
            j += 1
        return np.asarray(output)

    @staticmethod
    def _get_bounds(const Integer[:] data):
        # Select the value range from the compile-time type of `data`
        if Integer is int8:
            info = np.iinfo(np.int8)
        elif Integer is int16:
            info = np.iinfo(np.int16)
        elif Integer is int32:
            info = np.iinfo(np.int32)
        elif Integer is uint8:
            info = np.iinfo(np.uint8)
        elif Integer is uint16:
            info = np.iinfo(np.uint16)
        elif Integer is uint32:
            info = np.iinfo(np.uint32)
        else:
            raise ValueError("Unsupported integer type")
        return info.min, info.max
757
+
758
+
759
@dataclass
class StringArrayEncoding(Encoding):
    """
    Encoding that compresses an array of strings into an array of
    indices that point to the unique strings in that array.

    The unique strings themselves are stored as part of the
    :class:`StringArrayEncoding` as concatenated string.
    The start index of each unique string in the concatenated string
    is stored in an *offset* array.

    Parameters
    ----------
    strings : ndarray, optional
        The unique strings that are used for encoding.
        If omitted, the unique strings are determined from the data the
        first time :meth:`encode()` is called.
    data_encoding : list of Encoding, optional
        The encodings that are applied to the index array.
        If omitted, the array is directly encoded into bytes without
        further compression.
    offset_encoding : list of Encoding, optional
        The encodings that are applied to the offset array.
        If omitted, the array is directly encoded into bytes without
        further compression.

    Attributes
    ----------
    strings : ndarray
    data_encoding : list of Encoding
    offset_encoding : list of Encoding

    Examples
    --------

    >>> data = np.array(["apple", "banana", "cherry", "apple", "banana", "apple"])
    >>> print(data)
    ['apple' 'banana' 'cherry' 'apple' 'banana' 'apple']
    >>> # By default the indices would directly be encoded into bytes
    >>> # However, the indices should be printed here -> data_encoding=[]
    >>> encoding = StringArrayEncoding(data_encoding=[])
    >>> encoded = encoding.encode(data)
    >>> print(encoding.strings)
    ['apple' 'banana' 'cherry']
    >>> print(encoded)
    [0 1 2 0 1 0]
    """

    strings: ... = None
    data_encoding: ... = None
    offset_encoding: ... = None

    def __init__(self, strings=None, data_encoding=None, offset_encoding=None):
        self.strings = strings
        if data_encoding is None:
            data_encoding = [ByteArrayEncoding(TypeCode.INT32)]
        self.data_encoding = data_encoding
        if offset_encoding is None:
            offset_encoding = [ByteArrayEncoding(TypeCode.INT32)]
        self.offset_encoding = offset_encoding

    @staticmethod
    def deserialize(content):
        data_encoding = [
            deserialize_encoding(e) for e in content["dataEncoding"]
        ]
        offset_encoding = [
            deserialize_encoding(e) for e in content["offsetEncoding"]
        ]
        cdef str concatenated_strings = content["stringData"]
        cdef np.ndarray offsets = decode_stepwise(
            content["offsets"], offset_encoding
        )

        strings = np.array([
            concatenated_strings[offsets[i]:offsets[i+1]]
            # The final offset is the exclusive stop index
            for i in range(len(offsets)-1)
        ], dtype="U")

        return StringArrayEncoding(strings, data_encoding, offset_encoding)

    def serialize(self):
        if self.strings is None:
            raise ValueError(
                "'strings' must be explicitly given or needs to be "
                "determined from first encoding pass, before it is serialized"
            )

        string_data = "".join(self.strings)
        # Start index of each string plus the exclusive stop of the last one
        offsets = np.cumsum([0] + [len(s) for s in self.strings])

        return {
            "kind": "StringArray",
            "dataEncoding": [e.serialize() for e in self.data_encoding],
            "stringData": string_data,
            "offsets": encode_stepwise(offsets, self.offset_encoding),
            "offsetEncoding": [e.serialize() for e in self.offset_encoding],
        }

    def encode(self, data):
        if not np.issubdtype(data.dtype, np.str_):
            raise TypeError("Data must be of string type")

        if self.strings is None:
            # 'unique()' already sorts the strings, but this is not necessarily
            # desired, as this makes efficient encoding of the indices more difficult
            # -> Bring into the original order
            _, unique_indices = np.unique(data, return_index=True)
            self.strings = data[np.sort(unique_indices)]
            check_present = False
        else:
            check_present = True

        string_order = np.argsort(self.strings).astype(np.int32)
        sorted_strings = self.strings[string_order]
        sorted_indices = np.searchsorted(sorted_strings, data)
        # A string that sorts behind all known strings gets the insertion
        # index 'len(strings)', which would be out of bounds below
        # -> Report it as missing string instead
        if check_present and np.any(sorted_indices >= len(sorted_strings)):
            raise ValueError("Data contains strings not present in 'strings'")
        indices = string_order[sorted_indices]
        # 'searchsorted()' only gives insertion points
        # -> Verify that each point actually hits an equal string
        if check_present and not np.all(self.strings[indices] == data):
            raise ValueError("Data contains strings not present in 'strings'")
        return encode_stepwise(indices, self.data_encoding)

    def decode(self, data):
        indices = decode_stepwise(data, self.data_encoding)
        return self.strings[indices]

    def __eq__(self, other):
        if not isinstance(other, type(self)):
            return False
        # Arrays cannot be compared with '==' in a boolean context
        if not np.array_equal(self.strings, other.strings):
            return False
        if self.data_encoding != other.data_encoding:
            return False
        if self.offset_encoding != other.offset_encoding:
            return False
        return True
895
+
896
+
897
# Maps the BinaryCIF 'kind' of an encoding to the class implementing it
_encoding_classes = {
    "ByteArray": ByteArrayEncoding,
    "FixedPoint": FixedPointEncoding,
    "IntervalQuantization": IntervalQuantizationEncoding,
    "RunLength": RunLengthEncoding,
    "Delta": DeltaEncoding,
    "IntegerPacking": IntegerPackingEncoding,
    "StringArray": StringArrayEncoding,
}
# The reverse direction: Maps the class name to the BinaryCIF 'kind'
_encoding_classes_kinds = {
    encoding_class.__name__: kind
    for kind, encoding_class in _encoding_classes.items()
}
915
+
916
+
917
def deserialize_encoding(content):
    """
    Create a :class:`Encoding` by deserializing the given *BinaryCIF* content.

    Parameters
    ----------
    content : dict
        The encoding represented as *BinaryCIF* dictionary.

    Returns
    -------
    encoding : Encoding
        The deserialized encoding.

    Raises
    ------
    KeyError
        If `content` has no ``"kind"`` entry.
    ValueError
        If the ``"kind"`` of `content` is not a known encoding.
    """
    # Read the kind outside of the 'try' block, so that a missing
    # "kind" entry is not mistaken for an unknown kind
    kind = content["kind"]
    try:
        encoding_class = _encoding_classes[kind]
    except KeyError:
        # The failed lookup adds no information to the traceback
        raise ValueError(
            f"Unknown encoding kind '{kind}'"
        ) from None
    return encoding_class.deserialize(content)
938
+
939
+
940
def create_uncompressed_encoding(array):
    """
    Create a simple encoding for the given array that does not compress the data.

    Parameters
    ----------
    array : ndarray
        The array to create the encoding for.

    Returns
    -------
    encoding : list of Encoding
        The encoding for the data.
    """
    # Strings get a string array encoding, everything else is written as bytes
    is_string_array = np.issubdtype(array.dtype, np.str_)
    encoding = StringArrayEncoding() if is_string_array else ByteArrayEncoding()
    return [encoding]
958
+
959
+
960
def encode_stepwise(data, encoding):
    """
    Apply a list of encodings stepwise to the given data.

    The encodings are applied in the given order, i.e. the output of
    one encoding is the input of the next one.

    Parameters
    ----------
    data : ndarray
        The data to be encoded.
    encoding : list of Encoding
        The encodings to be applied.

    Returns
    -------
    encoded_data : ndarray or bytes
        The encoded data.
    """
    # Use a separate loop variable to avoid shadowing the 'encoding' parameter
    for enc in encoding:
        data = enc.encode(data)
    return data
979
+
980
+
981
def decode_stepwise(data, encoding):
    """
    Revert a list of encodings stepwise on the given data.

    The encodings are processed from the last one to the first one.

    Parameters
    ----------
    data : ndarray or bytes
        The data to be decoded.
    encoding : list of Encoding
        The encodings to be applied.

    Returns
    -------
    decoded_data : ndarray
        The decoded data.
    """
    decoded = data
    # The encoding applied last must be reverted first
    for step in encoding[::-1]:
        decoded = step.decode(decoded)
    return decoded
1000
+
1001
+
1002
def _camel_to_snake_case(attribute_name):
    """
    Convert a name into lower case after substituting each match of
    ``CAMEL_CASE_PATTERN`` with an underscore.
    """
    with_underscores = CAMEL_CASE_PATTERN.sub("_", attribute_name)
    return with_underscores.lower()
1004
+
1005
+
1006
def _snake_to_camel_case(attribute_name):
    """
    Convert a ``snake_case`` name into ``camelCase``.

    Each underscore-separated word is capitalized and the words are
    joined, then the first character of the result is lowered.
    An empty name (or one consisting only of underscores) gives an
    empty string.
    """
    camel_name = "".join(
        word.capitalize() for word in attribute_name.split("_")
    )
    # Slicing instead of indexing: The joined name may be empty
    return camel_name[:1].lower() + camel_name[1:]
1011
+
1012
+
1013
def _safe_cast(array, dtype):
    """
    Cast `array` to `dtype`, refusing casts into an integer type that
    would lose information.

    The array itself is returned, if it already has the requested dtype.

    Raises
    ------
    ValueError
        If the target is an integer type and the array is either not of
        integer type or contains values outside of the target range.
    """
    target_dtype = np.dtype(dtype)
    if array.dtype == target_dtype:
        return array

    # Only casts into integer types are checked
    if not np.issubdtype(target_dtype, np.integer):
        return array.astype(target_dtype)

    if not np.issubdtype(array.dtype, np.integer):
        raise ValueError("Cannot cast floating point to integer")
    limits = np.iinfo(target_dtype)
    if np.any(array < limits.min) or np.any(array > limits.max):
        raise ValueError("Integer values do not fit into the given dtype")
    return array.astype(target_dtype)
1024
+
1025
+
1026
def _get_n_decimals(value, tolerance):
    """
    Get the smallest number of decimals, for which rounding `value`
    deviates less than `tolerance` from it.

    At most 10 decimals are tried; 10 is returned, if none suffices.
    """
    MAX_DECIMALS = 10
    sufficient_decimals = (
        n_decimals
        for n_decimals in range(MAX_DECIMALS)
        if abs(value - round(value, n_decimals)) < tolerance
    )
    # The first match is the smallest sufficient number of decimals
    return next(sufficient_decimals, MAX_DECIMALS)