biotite-0.41.1-cp310-cp310-macosx_10_16_x86_64.whl

This diff shows the content of publicly released package versions as they appear in the supported public registries. It is provided for informational purposes only and reflects the changes between package versions.

Potentially problematic release.


This version of biotite might be problematic.

Files changed (340)
  1. biotite/__init__.py +19 -0
  2. biotite/application/__init__.py +43 -0
  3. biotite/application/application.py +265 -0
  4. biotite/application/autodock/__init__.py +12 -0
  5. biotite/application/autodock/app.py +505 -0
  6. biotite/application/blast/__init__.py +14 -0
  7. biotite/application/blast/alignment.py +83 -0
  8. biotite/application/blast/webapp.py +421 -0
  9. biotite/application/clustalo/__init__.py +12 -0
  10. biotite/application/clustalo/app.py +238 -0
  11. biotite/application/dssp/__init__.py +12 -0
  12. biotite/application/dssp/app.py +152 -0
  13. biotite/application/localapp.py +306 -0
  14. biotite/application/mafft/__init__.py +12 -0
  15. biotite/application/mafft/app.py +122 -0
  16. biotite/application/msaapp.py +374 -0
  17. biotite/application/muscle/__init__.py +13 -0
  18. biotite/application/muscle/app3.py +254 -0
  19. biotite/application/muscle/app5.py +171 -0
  20. biotite/application/sra/__init__.py +18 -0
  21. biotite/application/sra/app.py +456 -0
  22. biotite/application/tantan/__init__.py +12 -0
  23. biotite/application/tantan/app.py +222 -0
  24. biotite/application/util.py +59 -0
  25. biotite/application/viennarna/__init__.py +18 -0
  26. biotite/application/viennarna/rnaalifold.py +304 -0
  27. biotite/application/viennarna/rnafold.py +269 -0
  28. biotite/application/viennarna/rnaplot.py +187 -0
  29. biotite/application/viennarna/util.py +72 -0
  30. biotite/application/webapp.py +77 -0
  31. biotite/copyable.py +71 -0
  32. biotite/database/__init__.py +23 -0
  33. biotite/database/entrez/__init__.py +15 -0
  34. biotite/database/entrez/check.py +61 -0
  35. biotite/database/entrez/dbnames.py +89 -0
  36. biotite/database/entrez/download.py +223 -0
  37. biotite/database/entrez/key.py +44 -0
  38. biotite/database/entrez/query.py +223 -0
  39. biotite/database/error.py +15 -0
  40. biotite/database/pubchem/__init__.py +21 -0
  41. biotite/database/pubchem/download.py +260 -0
  42. biotite/database/pubchem/error.py +20 -0
  43. biotite/database/pubchem/query.py +827 -0
  44. biotite/database/pubchem/throttle.py +99 -0
  45. biotite/database/rcsb/__init__.py +13 -0
  46. biotite/database/rcsb/download.py +167 -0
  47. biotite/database/rcsb/query.py +959 -0
  48. biotite/database/uniprot/__init__.py +13 -0
  49. biotite/database/uniprot/check.py +32 -0
  50. biotite/database/uniprot/download.py +134 -0
  51. biotite/database/uniprot/query.py +209 -0
  52. biotite/file.py +251 -0
  53. biotite/sequence/__init__.py +73 -0
  54. biotite/sequence/align/__init__.py +49 -0
  55. biotite/sequence/align/alignment.py +658 -0
  56. biotite/sequence/align/banded.cpython-310-darwin.so +0 -0
  57. biotite/sequence/align/banded.pyx +652 -0
  58. biotite/sequence/align/buckets.py +69 -0
  59. biotite/sequence/align/cigar.py +434 -0
  60. biotite/sequence/align/kmeralphabet.cpython-310-darwin.so +0 -0
  61. biotite/sequence/align/kmeralphabet.pyx +574 -0
  62. biotite/sequence/align/kmersimilarity.cpython-310-darwin.so +0 -0
  63. biotite/sequence/align/kmersimilarity.pyx +233 -0
  64. biotite/sequence/align/kmertable.cpython-310-darwin.so +0 -0
  65. biotite/sequence/align/kmertable.pyx +3400 -0
  66. biotite/sequence/align/localgapped.cpython-310-darwin.so +0 -0
  67. biotite/sequence/align/localgapped.pyx +892 -0
  68. biotite/sequence/align/localungapped.cpython-310-darwin.so +0 -0
  69. biotite/sequence/align/localungapped.pyx +279 -0
  70. biotite/sequence/align/matrix.py +405 -0
  71. biotite/sequence/align/matrix_data/BLOSUM100.mat +31 -0
  72. biotite/sequence/align/matrix_data/BLOSUM30.mat +31 -0
  73. biotite/sequence/align/matrix_data/BLOSUM35.mat +31 -0
  74. biotite/sequence/align/matrix_data/BLOSUM40.mat +31 -0
  75. biotite/sequence/align/matrix_data/BLOSUM45.mat +31 -0
  76. biotite/sequence/align/matrix_data/BLOSUM50.mat +31 -0
  77. biotite/sequence/align/matrix_data/BLOSUM50_13p.mat +25 -0
  78. biotite/sequence/align/matrix_data/BLOSUM50_14.3.mat +25 -0
  79. biotite/sequence/align/matrix_data/BLOSUM50_5.0.mat +25 -0
  80. biotite/sequence/align/matrix_data/BLOSUM55.mat +31 -0
  81. biotite/sequence/align/matrix_data/BLOSUM60.mat +31 -0
  82. biotite/sequence/align/matrix_data/BLOSUM62.mat +31 -0
  83. biotite/sequence/align/matrix_data/BLOSUM62_13p.mat +25 -0
  84. biotite/sequence/align/matrix_data/BLOSUM62_14.3.mat +25 -0
  85. biotite/sequence/align/matrix_data/BLOSUM62_5.0.mat +25 -0
  86. biotite/sequence/align/matrix_data/BLOSUM65.mat +31 -0
  87. biotite/sequence/align/matrix_data/BLOSUM70.mat +31 -0
  88. biotite/sequence/align/matrix_data/BLOSUM75.mat +31 -0
  89. biotite/sequence/align/matrix_data/BLOSUM80.mat +31 -0
  90. biotite/sequence/align/matrix_data/BLOSUM85.mat +31 -0
  91. biotite/sequence/align/matrix_data/BLOSUM90.mat +31 -0
  92. biotite/sequence/align/matrix_data/BLOSUMN.mat +31 -0
  93. biotite/sequence/align/matrix_data/CorBLOSUM49_5.0.mat +25 -0
  94. biotite/sequence/align/matrix_data/CorBLOSUM57_13p.mat +25 -0
  95. biotite/sequence/align/matrix_data/CorBLOSUM57_14.3.mat +25 -0
  96. biotite/sequence/align/matrix_data/CorBLOSUM61_5.0.mat +25 -0
  97. biotite/sequence/align/matrix_data/CorBLOSUM66_13p.mat +25 -0
  98. biotite/sequence/align/matrix_data/CorBLOSUM67_14.3.mat +25 -0
  99. biotite/sequence/align/matrix_data/DAYHOFF.mat +32 -0
  100. biotite/sequence/align/matrix_data/GONNET.mat +26 -0
  101. biotite/sequence/align/matrix_data/IDENTITY.mat +25 -0
  102. biotite/sequence/align/matrix_data/MATCH.mat +25 -0
  103. biotite/sequence/align/matrix_data/NUC.mat +25 -0
  104. biotite/sequence/align/matrix_data/PAM10.mat +34 -0
  105. biotite/sequence/align/matrix_data/PAM100.mat +34 -0
  106. biotite/sequence/align/matrix_data/PAM110.mat +34 -0
  107. biotite/sequence/align/matrix_data/PAM120.mat +34 -0
  108. biotite/sequence/align/matrix_data/PAM130.mat +34 -0
  109. biotite/sequence/align/matrix_data/PAM140.mat +34 -0
  110. biotite/sequence/align/matrix_data/PAM150.mat +34 -0
  111. biotite/sequence/align/matrix_data/PAM160.mat +34 -0
  112. biotite/sequence/align/matrix_data/PAM170.mat +34 -0
  113. biotite/sequence/align/matrix_data/PAM180.mat +34 -0
  114. biotite/sequence/align/matrix_data/PAM190.mat +34 -0
  115. biotite/sequence/align/matrix_data/PAM20.mat +34 -0
  116. biotite/sequence/align/matrix_data/PAM200.mat +34 -0
  117. biotite/sequence/align/matrix_data/PAM210.mat +34 -0
  118. biotite/sequence/align/matrix_data/PAM220.mat +34 -0
  119. biotite/sequence/align/matrix_data/PAM230.mat +34 -0
  120. biotite/sequence/align/matrix_data/PAM240.mat +34 -0
  121. biotite/sequence/align/matrix_data/PAM250.mat +34 -0
  122. biotite/sequence/align/matrix_data/PAM260.mat +34 -0
  123. biotite/sequence/align/matrix_data/PAM270.mat +34 -0
  124. biotite/sequence/align/matrix_data/PAM280.mat +34 -0
  125. biotite/sequence/align/matrix_data/PAM290.mat +34 -0
  126. biotite/sequence/align/matrix_data/PAM30.mat +34 -0
  127. biotite/sequence/align/matrix_data/PAM300.mat +34 -0
  128. biotite/sequence/align/matrix_data/PAM310.mat +34 -0
  129. biotite/sequence/align/matrix_data/PAM320.mat +34 -0
  130. biotite/sequence/align/matrix_data/PAM330.mat +34 -0
  131. biotite/sequence/align/matrix_data/PAM340.mat +34 -0
  132. biotite/sequence/align/matrix_data/PAM350.mat +34 -0
  133. biotite/sequence/align/matrix_data/PAM360.mat +34 -0
  134. biotite/sequence/align/matrix_data/PAM370.mat +34 -0
  135. biotite/sequence/align/matrix_data/PAM380.mat +34 -0
  136. biotite/sequence/align/matrix_data/PAM390.mat +34 -0
  137. biotite/sequence/align/matrix_data/PAM40.mat +34 -0
  138. biotite/sequence/align/matrix_data/PAM400.mat +34 -0
  139. biotite/sequence/align/matrix_data/PAM410.mat +34 -0
  140. biotite/sequence/align/matrix_data/PAM420.mat +34 -0
  141. biotite/sequence/align/matrix_data/PAM430.mat +34 -0
  142. biotite/sequence/align/matrix_data/PAM440.mat +34 -0
  143. biotite/sequence/align/matrix_data/PAM450.mat +34 -0
  144. biotite/sequence/align/matrix_data/PAM460.mat +34 -0
  145. biotite/sequence/align/matrix_data/PAM470.mat +34 -0
  146. biotite/sequence/align/matrix_data/PAM480.mat +34 -0
  147. biotite/sequence/align/matrix_data/PAM490.mat +34 -0
  148. biotite/sequence/align/matrix_data/PAM50.mat +34 -0
  149. biotite/sequence/align/matrix_data/PAM500.mat +34 -0
  150. biotite/sequence/align/matrix_data/PAM60.mat +34 -0
  151. biotite/sequence/align/matrix_data/PAM70.mat +34 -0
  152. biotite/sequence/align/matrix_data/PAM80.mat +34 -0
  153. biotite/sequence/align/matrix_data/PAM90.mat +34 -0
  154. biotite/sequence/align/matrix_data/RBLOSUM52_5.0.mat +25 -0
  155. biotite/sequence/align/matrix_data/RBLOSUM59_13p.mat +25 -0
  156. biotite/sequence/align/matrix_data/RBLOSUM59_14.3.mat +25 -0
  157. biotite/sequence/align/matrix_data/RBLOSUM64_5.0.mat +25 -0
  158. biotite/sequence/align/matrix_data/RBLOSUM69_13p.mat +25 -0
  159. biotite/sequence/align/matrix_data/RBLOSUM69_14.3.mat +25 -0
  160. biotite/sequence/align/multiple.cpython-310-darwin.so +0 -0
  161. biotite/sequence/align/multiple.pyx +620 -0
  162. biotite/sequence/align/pairwise.cpython-310-darwin.so +0 -0
  163. biotite/sequence/align/pairwise.pyx +587 -0
  164. biotite/sequence/align/permutation.cpython-310-darwin.so +0 -0
  165. biotite/sequence/align/permutation.pyx +305 -0
  166. biotite/sequence/align/primes.txt +821 -0
  167. biotite/sequence/align/selector.cpython-310-darwin.so +0 -0
  168. biotite/sequence/align/selector.pyx +956 -0
  169. biotite/sequence/align/statistics.py +265 -0
  170. biotite/sequence/align/tracetable.cpython-310-darwin.so +0 -0
  171. biotite/sequence/align/tracetable.pxd +64 -0
  172. biotite/sequence/align/tracetable.pyx +370 -0
  173. biotite/sequence/alphabet.py +566 -0
  174. biotite/sequence/annotation.py +829 -0
  175. biotite/sequence/codec.cpython-310-darwin.so +0 -0
  176. biotite/sequence/codec.pyx +155 -0
  177. biotite/sequence/codon.py +466 -0
  178. biotite/sequence/codon_tables.txt +202 -0
  179. biotite/sequence/graphics/__init__.py +33 -0
  180. biotite/sequence/graphics/alignment.py +1034 -0
  181. biotite/sequence/graphics/color_schemes/autumn.json +51 -0
  182. biotite/sequence/graphics/color_schemes/blossom.json +51 -0
  183. biotite/sequence/graphics/color_schemes/clustalx_dna.json +11 -0
  184. biotite/sequence/graphics/color_schemes/clustalx_protein.json +28 -0
  185. biotite/sequence/graphics/color_schemes/flower.json +51 -0
  186. biotite/sequence/graphics/color_schemes/jalview_buried.json +31 -0
  187. biotite/sequence/graphics/color_schemes/jalview_hydrophobicity.json +31 -0
  188. biotite/sequence/graphics/color_schemes/jalview_prop_helix.json +31 -0
  189. biotite/sequence/graphics/color_schemes/jalview_prop_strand.json +31 -0
  190. biotite/sequence/graphics/color_schemes/jalview_prop_turn.json +31 -0
  191. biotite/sequence/graphics/color_schemes/jalview_taylor.json +28 -0
  192. biotite/sequence/graphics/color_schemes/jalview_zappo.json +28 -0
  193. biotite/sequence/graphics/color_schemes/ocean.json +51 -0
  194. biotite/sequence/graphics/color_schemes/pb_flower.json +39 -0
  195. biotite/sequence/graphics/color_schemes/rainbow_dna.json +11 -0
  196. biotite/sequence/graphics/color_schemes/rainbow_protein.json +30 -0
  197. biotite/sequence/graphics/color_schemes/spring.json +51 -0
  198. biotite/sequence/graphics/color_schemes/sunset.json +51 -0
  199. biotite/sequence/graphics/color_schemes/wither.json +51 -0
  200. biotite/sequence/graphics/colorschemes.py +139 -0
  201. biotite/sequence/graphics/dendrogram.py +184 -0
  202. biotite/sequence/graphics/features.py +510 -0
  203. biotite/sequence/graphics/logo.py +110 -0
  204. biotite/sequence/graphics/plasmid.py +661 -0
  205. biotite/sequence/io/__init__.py +12 -0
  206. biotite/sequence/io/fasta/__init__.py +22 -0
  207. biotite/sequence/io/fasta/convert.py +273 -0
  208. biotite/sequence/io/fasta/file.py +278 -0
  209. biotite/sequence/io/fastq/__init__.py +19 -0
  210. biotite/sequence/io/fastq/convert.py +120 -0
  211. biotite/sequence/io/fastq/file.py +551 -0
  212. biotite/sequence/io/genbank/__init__.py +17 -0
  213. biotite/sequence/io/genbank/annotation.py +277 -0
  214. biotite/sequence/io/genbank/file.py +575 -0
  215. biotite/sequence/io/genbank/metadata.py +324 -0
  216. biotite/sequence/io/genbank/sequence.py +172 -0
  217. biotite/sequence/io/general.py +192 -0
  218. biotite/sequence/io/gff/__init__.py +26 -0
  219. biotite/sequence/io/gff/convert.py +133 -0
  220. biotite/sequence/io/gff/file.py +434 -0
  221. biotite/sequence/phylo/__init__.py +36 -0
  222. biotite/sequence/phylo/nj.cpython-310-darwin.so +0 -0
  223. biotite/sequence/phylo/nj.pyx +221 -0
  224. biotite/sequence/phylo/tree.cpython-310-darwin.so +0 -0
  225. biotite/sequence/phylo/tree.pyx +1169 -0
  226. biotite/sequence/phylo/upgma.cpython-310-darwin.so +0 -0
  227. biotite/sequence/phylo/upgma.pyx +164 -0
  228. biotite/sequence/profile.py +456 -0
  229. biotite/sequence/search.py +116 -0
  230. biotite/sequence/seqtypes.py +556 -0
  231. biotite/sequence/sequence.py +374 -0
  232. biotite/structure/__init__.py +132 -0
  233. biotite/structure/atoms.py +1455 -0
  234. biotite/structure/basepairs.py +1415 -0
  235. biotite/structure/bonds.cpython-310-darwin.so +0 -0
  236. biotite/structure/bonds.pyx +1933 -0
  237. biotite/structure/box.py +592 -0
  238. biotite/structure/celllist.cpython-310-darwin.so +0 -0
  239. biotite/structure/celllist.pyx +849 -0
  240. biotite/structure/chains.py +298 -0
  241. biotite/structure/charges.cpython-310-darwin.so +0 -0
  242. biotite/structure/charges.pyx +520 -0
  243. biotite/structure/compare.py +274 -0
  244. biotite/structure/density.py +114 -0
  245. biotite/structure/dotbracket.py +216 -0
  246. biotite/structure/error.py +31 -0
  247. biotite/structure/filter.py +585 -0
  248. biotite/structure/geometry.py +697 -0
  249. biotite/structure/graphics/__init__.py +13 -0
  250. biotite/structure/graphics/atoms.py +226 -0
  251. biotite/structure/graphics/rna.py +282 -0
  252. biotite/structure/hbond.py +409 -0
  253. biotite/structure/info/__init__.py +25 -0
  254. biotite/structure/info/atom_masses.json +121 -0
  255. biotite/structure/info/atoms.py +82 -0
  256. biotite/structure/info/bonds.py +145 -0
  257. biotite/structure/info/ccd/README.rst +8 -0
  258. biotite/structure/info/ccd/amino_acids.txt +1663 -0
  259. biotite/structure/info/ccd/carbohydrates.txt +1135 -0
  260. biotite/structure/info/ccd/components.bcif +0 -0
  261. biotite/structure/info/ccd/nucleotides.txt +798 -0
  262. biotite/structure/info/ccd.py +95 -0
  263. biotite/structure/info/groups.py +90 -0
  264. biotite/structure/info/masses.py +123 -0
  265. biotite/structure/info/misc.py +144 -0
  266. biotite/structure/info/radii.py +197 -0
  267. biotite/structure/info/standardize.py +196 -0
  268. biotite/structure/integrity.py +268 -0
  269. biotite/structure/io/__init__.py +30 -0
  270. biotite/structure/io/ctab.py +72 -0
  271. biotite/structure/io/dcd/__init__.py +13 -0
  272. biotite/structure/io/dcd/file.py +65 -0
  273. biotite/structure/io/general.py +257 -0
  274. biotite/structure/io/gro/__init__.py +14 -0
  275. biotite/structure/io/gro/file.py +343 -0
  276. biotite/structure/io/mmtf/__init__.py +21 -0
  277. biotite/structure/io/mmtf/assembly.py +214 -0
  278. biotite/structure/io/mmtf/convertarray.cpython-310-darwin.so +0 -0
  279. biotite/structure/io/mmtf/convertarray.pyx +341 -0
  280. biotite/structure/io/mmtf/convertfile.cpython-310-darwin.so +0 -0
  281. biotite/structure/io/mmtf/convertfile.pyx +501 -0
  282. biotite/structure/io/mmtf/decode.cpython-310-darwin.so +0 -0
  283. biotite/structure/io/mmtf/decode.pyx +152 -0
  284. biotite/structure/io/mmtf/encode.cpython-310-darwin.so +0 -0
  285. biotite/structure/io/mmtf/encode.pyx +183 -0
  286. biotite/structure/io/mmtf/file.py +233 -0
  287. biotite/structure/io/mol/__init__.py +20 -0
  288. biotite/structure/io/mol/convert.py +115 -0
  289. biotite/structure/io/mol/ctab.py +414 -0
  290. biotite/structure/io/mol/header.py +116 -0
  291. biotite/structure/io/mol/mol.py +193 -0
  292. biotite/structure/io/mol/sdf.py +916 -0
  293. biotite/structure/io/netcdf/__init__.py +13 -0
  294. biotite/structure/io/netcdf/file.py +63 -0
  295. biotite/structure/io/npz/__init__.py +20 -0
  296. biotite/structure/io/npz/file.py +152 -0
  297. biotite/structure/io/pdb/__init__.py +20 -0
  298. biotite/structure/io/pdb/convert.py +293 -0
  299. biotite/structure/io/pdb/file.py +1240 -0
  300. biotite/structure/io/pdb/hybrid36.cpython-310-darwin.so +0 -0
  301. biotite/structure/io/pdb/hybrid36.pyx +242 -0
  302. biotite/structure/io/pdbqt/__init__.py +15 -0
  303. biotite/structure/io/pdbqt/convert.py +107 -0
  304. biotite/structure/io/pdbqt/file.py +640 -0
  305. biotite/structure/io/pdbx/__init__.py +23 -0
  306. biotite/structure/io/pdbx/bcif.py +648 -0
  307. biotite/structure/io/pdbx/cif.py +1032 -0
  308. biotite/structure/io/pdbx/component.py +246 -0
  309. biotite/structure/io/pdbx/convert.py +1597 -0
  310. biotite/structure/io/pdbx/encoding.cpython-310-darwin.so +0 -0
  311. biotite/structure/io/pdbx/encoding.pyx +950 -0
  312. biotite/structure/io/pdbx/legacy.py +267 -0
  313. biotite/structure/io/tng/__init__.py +13 -0
  314. biotite/structure/io/tng/file.py +46 -0
  315. biotite/structure/io/trajfile.py +710 -0
  316. biotite/structure/io/trr/__init__.py +13 -0
  317. biotite/structure/io/trr/file.py +46 -0
  318. biotite/structure/io/xtc/__init__.py +13 -0
  319. biotite/structure/io/xtc/file.py +46 -0
  320. biotite/structure/mechanics.py +75 -0
  321. biotite/structure/molecules.py +353 -0
  322. biotite/structure/pseudoknots.py +642 -0
  323. biotite/structure/rdf.py +243 -0
  324. biotite/structure/repair.py +253 -0
  325. biotite/structure/residues.py +562 -0
  326. biotite/structure/resutil.py +178 -0
  327. biotite/structure/sasa.cpython-310-darwin.so +0 -0
  328. biotite/structure/sasa.pyx +322 -0
  329. biotite/structure/sequence.py +112 -0
  330. biotite/structure/sse.py +327 -0
  331. biotite/structure/superimpose.py +727 -0
  332. biotite/structure/transform.py +504 -0
  333. biotite/structure/util.py +98 -0
  334. biotite/temp.py +86 -0
  335. biotite/version.py +16 -0
  336. biotite/visualize.py +251 -0
  337. biotite-0.41.1.dist-info/METADATA +187 -0
  338. biotite-0.41.1.dist-info/RECORD +340 -0
  339. biotite-0.41.1.dist-info/WHEEL +4 -0
  340. biotite-0.41.1.dist-info/licenses/LICENSE.rst +30 -0
@@ -0,0 +1,950 @@
+ # This source code is part of the Biotite package and is distributed
+ # under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
+ # information.
+
+ """
+ This module contains data encodings for BinaryCIF files.
+ """
+
+ __name__ = "biotite.structure.io.pdbx"
+ __author__ = "Patrick Kunzmann"
+ __all__ = ["ByteArrayEncoding", "FixedPointEncoding",
+            "IntervalQuantizationEncoding", "RunLengthEncoding",
+            "DeltaEncoding", "IntegerPackingEncoding", "StringArrayEncoding",
+            "TypeCode"]
+
+ cimport cython
+ cimport numpy as np
+
+ from dataclasses import dataclass
+ from abc import ABCMeta, abstractmethod
+ from numbers import Integral
+ from enum import IntEnum
+ import re
+ import numpy as np
+ from .component import _Component
+ from ....file import InvalidFileError
+
+ ctypedef np.int8_t int8
+ ctypedef np.int16_t int16
+ ctypedef np.int32_t int32
+ ctypedef np.uint8_t uint8
+ ctypedef np.uint16_t uint16
+ ctypedef np.uint32_t uint32
+ ctypedef np.float32_t float32
+ ctypedef np.float64_t float64
+
+ ctypedef fused Integer:
+     uint8
+     uint16
+     uint32
+     int8
+     int16
+     int32
+
+ # Used to create cartesian product of type combinations
+ # in run-length encoding
+ ctypedef fused OutputInteger:
+     uint8
+     uint16
+     uint32
+     int8
+     int16
+     int32
+
+ ctypedef fused Float:
+     float32
+     float64
+
+
+ CAMEL_CASE_PATTERN = re.compile(r"(?<!^)(?=[A-Z])")
+
+
+ class TypeCode(IntEnum):
+     """
+     This enum type represents the data types supported by *BinaryCIF*
+     as integer type codes.
+     """
+     INT8 = 1
+     INT16 = 2
+     INT32 = 3
+     UINT8 = 4
+     UINT16 = 5
+     UINT32 = 6
+     FLOAT32 = 32
+     FLOAT64 = 33
+
+     @staticmethod
+     def from_dtype(dtype):
+         """
+         Convert a *NumPy* dtype to a *BinaryCIF* type code.
+
+         Parameters
+         ----------
+         dtype : dtype or int or TypeCode
+             The data type to be converted.
+             If already a type code, it is simply returned.
+
+         Returns
+         -------
+         type_code : TypeCode
+             The corresponding type code.
+         """
+         if isinstance(dtype, Integral):
+             # Already a type code
+             return TypeCode(dtype)
+         else:
+             dtype = np.dtype(dtype)
+             # Find the closest dtype supported by the format
+             if np.issubdtype(dtype, np.integer):
+                 # int64 is not supported by format
+                 if dtype == np.int64:
+                     supported_dtype = np.int32
+                 elif dtype == np.uint64:
+                     supported_dtype = np.uint32
+                 else:
+                     supported_dtype = dtype
+             elif np.issubdtype(dtype, np.floating):
+                 if dtype == np.float16:
+                     supported_dtype = np.float32
+                 # float128 is not available on all architectures
+                 elif hasattr(np, "float128") and dtype == np.float128:
+                     supported_dtype = np.float64
+                 else:
+                     supported_dtype = dtype
+             else:
+                 raise ValueError(
+                     f"dtype '{dtype}' is not supported by BinaryCIF"
+                 )
+             return _DTYPE_TO_TYPE_CODE[
+                 np.dtype(supported_dtype).newbyteorder("<").str
+             ]
+
+     def to_dtype(self):
+         """
+         Convert this type code to a *NumPy* dtype.
+
+         Returns
+         -------
+         dtype : dtype
+             The corresponding data type.
+         """
+         return _TYPE_CODE_TO_DTYPE[self]
+
+ # Converts BCIF integers representing the type to an actual NumPy dtype
+ _TYPE_CODE_TO_DTYPE = {
+     # All data types are little-endian
+     TypeCode.INT8: "|i1",
+     TypeCode.INT16: "<i2",
+     TypeCode.INT32: "<i4",
+     TypeCode.UINT8: "|u1",
+     TypeCode.UINT16: "<u2",
+     TypeCode.UINT32: "<u4",
+     TypeCode.FLOAT32: "<f4",
+     TypeCode.FLOAT64: "<f8"
+ }
+ _DTYPE_TO_TYPE_CODE = {val: key for key, val in _TYPE_CODE_TO_DTYPE.items()}
+
+
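A minimal sketch of how the TypeCode conversions above behave; it assumes TypeCode is re-exported from biotite.structure.io.pdbx, as its presence in __all__ suggests:

    import numpy as np
    from biotite.structure.io.pdbx import TypeCode

    # Supported dtypes map directly; 64-bit integers are narrowed to 32-bit
    print(TypeCode.from_dtype(np.float64))  # TypeCode.FLOAT64
    print(TypeCode.from_dtype(np.int64))    # TypeCode.INT32
    # Type codes map back to little-endian NumPy dtype strings
    print(TypeCode.INT16.to_dtype())        # <i2
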
+ class Encoding(_Component, metaclass=ABCMeta):
+     """
+     Abstract base class for *BinaryCIF* data encodings.
+
+     Notes
+     -----
+     The encoding classes do not omit bound checks for decoding,
+     since the file content may be invalid/malicious.
+     """
+
+     @classmethod
+     def deserialize(cls, content):
+         params = {
+             _camel_to_snake_case(param): value
+             for param, value in content.items()
+         }
+         # 'kind' is not a parameter, but indicates the class itself
+         params.pop("kind")
+         try:
+             encoding = cls(**params)
+         except TypeError as e:
+             raise InvalidFileError(
+                 f"Invalid encoding parameters for {cls.__name__}"
+             )
+         except ValueError:
+             raise InvalidFileError(
+                 f"Missing encoding parameters for {cls.__name__}"
+             )
+         return encoding
+
+     def serialize(self):
+         for param in self.__annotations__:
+             if getattr(self, param) is None:
+                 raise ValueError(
+                     f"'{param}' must be explicitly given or needs to be "
+                     "determined from first encoding pass, before it is "
+                     "serialized"
+                 )
+
+         serialized = {
+             _snake_to_camel_case(param): getattr(self, param)
+             for param in self.__annotations__
+         }
+         serialized.update({
+             "kind": _encoding_classes_kinds[type(self).__name__]
+         })
+         return serialized
+
+     @abstractmethod
+     def encode(self, data):
+         """
+         Apply this encoding to the given data.
+
+         Parameters
+         ----------
+         data : ndarray
+             The data to be encoded.
+
+         Returns
+         -------
+         encoded_data : ndarray or bytes
+             The encoded data.
+         """
+         raise NotImplementedError()
+
+     @abstractmethod
+     def decode(self, data):
+         """
+         Apply the inverse of this encoding to the given data.
+
+         Parameters
+         ----------
+         data : ndarray or bytes
+             The data to be decoded.
+
+         Returns
+         -------
+         decoded_data : ndarray
+             The decoded data.
+         """
+         # Important: Do not omit bound checks for decoding,
+         # since the file content may be invalid/malicious.
+         raise NotImplementedError()
+
+
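A sketch of the serialize()/deserialize() round trip defined above, using DeltaEncoding (defined further below) as the concrete class; the camelCase keys and the 'kind' entry come from the helper functions at the end of this file. The import path is assumed, as above:

    from biotite.structure.io.pdbx import DeltaEncoding

    content = {"kind": "Delta", "origin": 10, "srcType": 3}  # 3 == TypeCode.INT32
    encoding = DeltaEncoding.deserialize(content)
    print(encoding.origin, encoding.src_type)   # 10 TypeCode.INT32
    # serialize() converts parameter names back to camelCase and re-adds 'kind'
    print(encoding.serialize())
    # e.g. {'srcType': <TypeCode.INT32: 3>, 'origin': 10, 'kind': 'Delta'}
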
+ @dataclass
+ class ByteArrayEncoding(Encoding):
+     r"""
+     Encoding that encodes an array into bytes.
+
+     Parameters
+     ----------
+     type : dtype or TypeCode, optional
+         The data type of the array to be encoded.
+         Either a NumPy dtype or a *BinaryCIF* type code is accepted.
+         If omitted, the data type is taken from the data the
+         first time :meth:`encode()` is called.
+
+     Attributes
+     ----------
+     type : TypeCode
+
+     Examples
+     --------
+
+     >>> data = np.arange(3)
+     >>> print(data)
+     [0 1 2]
+     >>> print(ByteArrayEncoding().encode(data))
+     b'\x00\x00\x00\x00\x01\x00\x00\x00\x02\x00\x00\x00'
+     """
+     type: ... = None
+
+     def __post_init__(self):
+         if self.type is not None:
+             self.type = TypeCode.from_dtype(self.type)
+
+     def encode(self, data):
+         if self.type is None:
+             self.type = TypeCode.from_dtype(data.dtype)
+         return _safe_cast(data, self.type.to_dtype()).tobytes()
+
+     def decode(self, data):
+         # Data is raw bytes in this case
+         return np.frombuffer(data, dtype=self.type.to_dtype())
+
+
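A round-trip sketch for ByteArrayEncoding; decode() simply reinterprets the raw bytes with the stored type code (import path assumed as above):

    import numpy as np
    from biotite.structure.io.pdbx import ByteArrayEncoding

    encoding = ByteArrayEncoding()
    encoded = encoding.encode(np.arange(3))  # int64 input is stored as int32
    print(encoding.type)                     # TypeCode.INT32
    print(encoding.decode(encoded))          # [0 1 2]
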
+ @dataclass
+ class FixedPointEncoding(Encoding):
+     """
+     Lossy encoding that multiplies floating point values by a given
+     factor and subsequently rounds them to the nearest integer.
+
+     Parameters
+     ----------
+     factor : float
+         The factor by which the data is multiplied before rounding.
+     src_type : dtype or TypeCode, optional
+         The data type of the array to be encoded.
+         Either a NumPy dtype or a *BinaryCIF* type code is accepted.
+         The dtype must be a float type.
+         If omitted, 32-bit floats are assumed.
+
+     Attributes
+     ----------
+     factor : float
+     src_type : TypeCode
+
+     Examples
+     --------
+
+     >>> data = np.array([9.87, 6.543])
+     >>> print(data)
+     [9.870 6.543]
+     >>> print(FixedPointEncoding(factor=100).encode(data))
+     [987 654]
+     """
+     factor: ...
+     src_type: ... = TypeCode.FLOAT32
+
+     def __post_init__(self):
+         if self.src_type is not None:
+             self.src_type = TypeCode.from_dtype(self.src_type)
+         if self.src_type not in (TypeCode.FLOAT32, TypeCode.FLOAT64):
+             raise ValueError(
+                 "Only floating point types are supported"
+             )
+
+     def encode(self, data):
+         # Round to avoid wrong values due to floating point inaccuracies
+         return np.round(data * self.factor).astype(np.int32)
+
+     def decode(self, data):
+         return (data / self.factor).astype(
+             dtype=self.src_type.to_dtype(), copy=False
+         )
+
+
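A round-trip sketch for FixedPointEncoding, showing where the loss happens: decoding divides by the factor again, so any precision beyond 1/factor is gone (import path assumed as above):

    import numpy as np
    from biotite.structure.io.pdbx import FixedPointEncoding

    encoding = FixedPointEncoding(factor=100)
    encoded = encoding.encode(np.array([9.87, 6.543]))
    print(encoded)                   # [987 654]
    print(encoding.decode(encoded))  # [9.87 6.54] as float32; the 0.003 is lost
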
+ @dataclass
+ class IntervalQuantizationEncoding(Encoding):
+     """
+     Lossy encoding that sorts floating point values into bins.
+     Each bin is represented by an integer.
+
+     Parameters
+     ----------
+     min, max : float
+         The minimum and maximum value the bins comprise.
+     num_steps : int
+         The number of bins.
+     src_type : dtype or TypeCode, optional
+         The data type of the array to be encoded.
+         Either a NumPy dtype or a *BinaryCIF* type code is accepted.
+         The dtype must be a float type.
+         If omitted, 32-bit floats are assumed.
+
+     Attributes
+     ----------
+     min, max : float
+     num_steps : int
+     src_type : TypeCode
+
+     Examples
+     --------
+
+     >>> data = np.linspace(11, 12, 6)
+     >>> print(data)
+     [11.0 11.2 11.4 11.6 11.8 12.0]
+     >>> # Use 0.5 as step size
+     >>> encoding = IntervalQuantizationEncoding(min=10, max=20, num_steps=21)
+     >>> # The encoding is lossy, as different values are mapped to the same bin
+     >>> encoded = encoding.encode(data)
+     >>> print(encoded)
+     [2 3 3 4 4 4]
+     >>> decoded = encoding.decode(encoded)
+     >>> print(decoded)
+     [11.0 11.5 11.5 12.0 12.0 12.0]
+     """
+     min: ...
+     max: ...
+     num_steps: ...
+     src_type: ... = TypeCode.FLOAT32
+
+     def __post_init__(self):
+         if self.src_type is not None:
+             self.src_type = TypeCode.from_dtype(self.src_type)
+
+     def encode(self, data):
+         steps = np.linspace(
+             self.min, self.max, self.num_steps, dtype=data.dtype
+         )
+         indices = np.searchsorted(steps, data, side="left")
+         return indices.astype(np.int32, copy=False)
+
+     def decode(self, data):
+         output = data * (self.max - self.min) / (self.num_steps - 1)
+         output = output.astype(self.src_type.to_dtype(), copy=False)
+         output += self.min
+         return output
+
+
+ @dataclass
+ class RunLengthEncoding(Encoding):
+     """
+     Encoding that compresses runs of equal values into pairs of
+     (value, run length).
+
+     Parameters
+     ----------
+     src_size : int, optional
+         The size of the array to be encoded.
+         If omitted, the size is determined from the data the
+         first time :meth:`encode()` is called.
+     src_type : dtype or TypeCode, optional
+         The data type of the array to be encoded.
+         Either a NumPy dtype or a *BinaryCIF* type code is accepted.
+         The dtype must be an integer type.
+         If omitted, the data type is taken from the data the
+         first time :meth:`encode()` is called.
+
+     Attributes
+     ----------
+     src_size : int
+     src_type : TypeCode
+
+     Examples
+     --------
+
+     >>> data = np.array([1, 1, 1, 5, 3, 3])
+     >>> print(data)
+     [1 1 1 5 3 3]
+     >>> encoded = RunLengthEncoding().encode(data)
+     >>> print(encoded)
+     [1 3 5 1 3 2]
+     >>> # Emphasize the pairs
+     >>> print(encoded.reshape(-1, 2))
+     [[1 3]
+      [5 1]
+      [3 2]]
+     """
+     src_size: ... = None
+     src_type: ... = None
+
+     def __post_init__(self):
+         if self.src_type is not None:
+             self.src_type = TypeCode.from_dtype(self.src_type)
+
+     def encode(self, data):
+         # If not given in constructor, it is determined from the data
+         if self.src_type is None:
+             self.src_type = TypeCode.from_dtype(data.dtype)
+         if self.src_size is None:
+             self.src_size = data.shape[0]
+         elif self.src_size != data.shape[0]:
+             raise IndexError(
+                 "Given source size does not match actual data size"
+             )
+         return self._encode(_safe_cast(data, self.src_type.to_dtype()))
+
+     def decode(self, data):
+         return self._decode(
+             data, np.empty(0, dtype=self.src_type.to_dtype())
+         )
+
+     def _encode(self, const Integer[:] data):
+         # Pessimistic allocation of output array
+         # -> Run length is 1 for every element
+         cdef int32[:] output = np.zeros(data.shape[0] * 2, dtype=np.int32)
+         cdef int i=0, j=0
+         cdef int val = data[0]
+         cdef int run_length = 0
+         cdef int curr_val
+         for i in range(data.shape[0]):
+             curr_val = data[i]
+             if curr_val == val:
+                 run_length += 1
+             else:
+                 # New element -> Write element with run-length
+                 output[j] = val
+                 output[j+1] = run_length
+                 j += 2
+                 val = curr_val
+                 run_length = 1
+         # Write last element
+         output[j] = val
+         output[j+1] = run_length
+         j += 2
+         # Trim to correct size
+         return np.asarray(output)[:j]
+
+     def _decode(self, const Integer[:] data, OutputInteger[:] output_type):
+         """
+         `output_type` is merely a typed placeholder to allow for static
+         typing of output.
+         """
+         if data.shape[0] % 2 != 0:
+             raise ValueError("Invalid run-length encoded data")
+
+         cdef int length = 0
+         cdef int i, j
+         cdef int value, repeat
+
+         if self.src_size is None:
+             # Determine length of output array by summing run lengths
+             for i in range(1, data.shape[0], 2):
+                 length += data[i]
+         else:
+             length = self.src_size
+
+         cdef OutputInteger[:] output = np.zeros(
+             length, dtype=np.asarray(output_type).dtype
+         )
+         # Fill output array
+         j = 0
+         for i in range(0, data.shape[0], 2):
+             value = data[i]
+             repeat = data[i+1]
+             output[j : j+repeat] = value
+             j += repeat
+         return np.asarray(output)
+
+
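A round-trip sketch for RunLengthEncoding; src_size and src_type are captured during the first encode() call, so the same instance can decode its own output (import path assumed as above):

    import numpy as np
    from biotite.structure.io.pdbx import RunLengthEncoding

    encoding = RunLengthEncoding()
    encoded = encoding.encode(np.array([1, 1, 1, 5, 3, 3], dtype=np.int32))
    print(encoded)                   # [1 3 5 1 3 2] -> (value, run length) pairs
    print(encoding.decode(encoded))  # [1 1 1 5 3 3]
    print(encoding.src_size)         # 6
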
+ @dataclass
+ class DeltaEncoding(Encoding):
+     """
+     Encoding that encodes an array of integers into an array of
+     consecutive differences.
+
+     Parameters
+     ----------
+     src_type : dtype or TypeCode, optional
+         The data type of the array to be encoded.
+         Either a NumPy dtype or a *BinaryCIF* type code is accepted.
+         The dtype must be an integer type.
+         If omitted, the data type is taken from the data the
+         first time :meth:`encode()` is called.
+     origin : int, optional
+         The starting value from which the differences are calculated.
+         If omitted, the origin is set to 0.
+
+     Attributes
+     ----------
+     src_type : TypeCode
+     origin : int
+
+     Examples
+     --------
+
+     >>> data = np.array([1, 1, 2, 3, 5, 8])
+     >>> print(DeltaEncoding().encode(data))
+     [1 0 1 1 2 3]
+     """
+     src_type: ... = None
+     origin: ... = 0
+
+     def __post_init__(self):
+         if self.src_type is not None:
+             self.src_type = TypeCode.from_dtype(self.src_type)
+
+     def encode(self, data):
+         # If not given in constructor, it is determined from the data
+         if self.src_type is None:
+             self.src_type = TypeCode.from_dtype(data.dtype)
+
+         data = data - self.origin
+         return np.diff(data, prepend=0).astype(np.int32, copy=False)
+
+     def decode(self, data):
+         output = np.cumsum(data, dtype=self.src_type.to_dtype())
+         output += self.origin
+         return output
+
+
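A sketch of the origin parameter of DeltaEncoding, which the docstring example above does not cover; the first stored difference is taken relative to origin, and decode() adds it back (import path assumed as above):

    import numpy as np
    from biotite.structure.io.pdbx import DeltaEncoding

    encoding = DeltaEncoding(origin=100)
    encoded = encoding.encode(np.array([100, 101, 103], dtype=np.int32))
    print(encoded)                   # [0 1 2]
    print(encoding.decode(encoded))  # [100 101 103]
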
+ @dataclass
+ class IntegerPackingEncoding(Encoding):
+     """
+     Encoding that compresses an array of 32-bit integers into an array
+     of smaller sized integers.
+
+     If a value does not fit into the smaller integer type,
+     the integer is represented by a sum of consecutive elements
+     in the compressed array.
+
+     Parameters
+     ----------
+     byte_count : int
+         The number of bytes the packed integers should occupy.
+         Supported values are 1 and 2 for 8-bit and 16-bit integers,
+         respectively.
+     src_size : int, optional
+         The size of the array to be encoded.
+         If omitted, the size is determined from the data the
+         first time :meth:`encode()` is called.
+     is_unsigned : bool, optional
+         Whether the values should be packed into signed or unsigned
+         integers.
+         If omitted, the values are packed into signed integers.
+
+     Attributes
+     ----------
+     byte_count : int
+     src_size : int
+     is_unsigned : bool
+
+     Examples
+     --------
+
+     >>> data = np.array([1, 2, -3, 128])
+     >>> print(data)
+     [  1   2  -3 128]
+     >>> print(IntegerPackingEncoding(byte_count=1).encode(data))
+     [  1   2  -3 127   1]
+     """
+     byte_count: ...
+     src_size: ... = None
+     is_unsigned: ... = False
+
+     def encode(self, data):
+         if self.src_size is None:
+             self.src_size = len(data)
+         elif self.src_size != len(data):
+             raise IndexError(
+                 "Given source size does not match actual data size"
+             )
+
+         data = data.astype(np.int32, copy=False)
+         return self._encode(
+             data, np.empty(0, dtype=self._determine_packed_dtype())
+         )
+
+     def decode(self, const Integer[:] data):
+         cdef int i, j
+         cdef int min_val, max_val
+         cdef int packed_val, unpacked_val
+         bounds = self._get_bounds(data)
+         min_val = bounds[0]
+         max_val = bounds[1]
+         # For unsigned integers, do not check the lower bound (it is always 0)
+         # -> Set lower bound to value that is never reached
+         if min_val == 0:
+             min_val = -1
+
+         cdef int32[:] output = np.zeros(self.src_size, dtype=np.int32)
+         j = 0
+         unpacked_val = 0
+         for i in range(data.shape[0]):
+             packed_val = data[i]
+             if packed_val == max_val or packed_val == min_val:
+                 unpacked_val += packed_val
+             else:
+                 unpacked_val += packed_val
+                 output[j] = unpacked_val
+                 unpacked_val = 0
+                 j += 1
+         # Trim to correct size and return
+         return np.asarray(output)
+
+     def _determine_packed_dtype(self):
+         if self.byte_count == 1:
+             if self.is_unsigned:
+                 return np.uint8
+             else:
+                 return np.int8
+         elif self.byte_count == 2:
+             if self.is_unsigned:
+                 return np.uint16
+             else:
+                 return np.int16
+         else:
+             raise ValueError("Unsupported byte count")
+
+     @cython.cdivision(True)
+     def _encode(self, const Integer[:] data, OutputInteger[:] output_type):
+         """
+         `output_type` is merely a typed placeholder to allow for static
+         typing of output.
+         """
+         cdef int i=0, j=0
+
+         packed_type = np.asarray(output_type).dtype
+         cdef int min_val = np.iinfo(packed_type).min
+         cdef int max_val = np.iinfo(packed_type).max
+
+         # Get length of output array
+         # by summing up required length of each element
+         cdef int number
+         cdef int length = 0
+         for i in range(data.shape[0]):
+             number = data[i]
+             if number < 0:
+                 if min_val == 0:
+                     raise ValueError(
+                         "Cannot pack negative numbers into unsigned type"
+                     )
+                 # The required packed length for an element is the
+                 # number of times min_val/max_val need to be repeated
+                 length += number // min_val + 1
+             elif number > 0:
+                 length += number // max_val + 1
+             else:
+                 # number = 0
+                 length += 1
+
+         # Fill output
+         cdef OutputInteger[:] output = np.zeros(length, dtype=packed_type)
+         cdef int remainder
+         j = 0
+         for i in range(data.shape[0]):
+             remainder = data[i]
+             if remainder < 0:
+                 if min_val == 0:
+                     raise ValueError(
+                         "Cannot pack negative numbers into unsigned type"
+                     )
+                 while remainder <= min_val:
+                     remainder -= min_val
+                     output[j] = min_val
+                     j += 1
+             elif remainder > 0:
+                 while remainder >= max_val:
+                     remainder -= max_val
+                     output[j] = max_val
+                     j += 1
+             output[j] = remainder
+             j += 1
+         return np.asarray(output)
+
+     @staticmethod
+     def _get_bounds(const Integer[:] data):
+         if Integer is int8:
+             info = np.iinfo(np.int8)
+         elif Integer is int16:
+             info = np.iinfo(np.int16)
+         elif Integer is int32:
+             info = np.iinfo(np.int32)
+         elif Integer is uint8:
+             info = np.iinfo(np.uint8)
+         elif Integer is uint16:
+             info = np.iinfo(np.uint16)
+         elif Integer is uint32:
+             info = np.iinfo(np.uint32)
+         else:
+             raise ValueError("Unsupported integer type")
+         return info.min, info.max
+
+
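A round-trip sketch for IntegerPackingEncoding; 128 does not fit into a signed byte, so it is split into 127 + 1 on encoding, and decode() sums such runs of boundary values back up (import path assumed as above):

    import numpy as np
    from biotite.structure.io.pdbx import IntegerPackingEncoding

    encoding = IntegerPackingEncoding(byte_count=1)
    encoded = encoding.encode(np.array([1, 2, -3, 128], dtype=np.int32))
    print(encoded)                   # [  1   2  -3 127   1] as int8
    print(encoding.decode(encoded))  # [  1   2  -3 128]
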
+ @dataclass
+ class StringArrayEncoding(Encoding):
+     """
+     Encoding that compresses an array of strings into an array of
+     indices that point to the unique strings in that array.
+
+     The unique strings themselves are stored as part of the
+     :class:`StringArrayEncoding` as a concatenated string.
+     The start index of each unique string in the concatenated string
+     is stored in an *offset* array.
+
+     Parameters
+     ----------
+     strings : ndarray, optional
+         The unique strings that are used for encoding.
+         If omitted, the unique strings are determined from the data the
+         first time :meth:`encode()` is called.
+     data_encoding : list of Encoding, optional
+         The encodings that are applied to the index array.
+         If omitted, the array is directly encoded into bytes without
+         further compression.
+     offset_encoding : list of Encoding, optional
+         The encodings that are applied to the offset array.
+         If omitted, the array is directly encoded into bytes without
+         further compression.
+
+     Attributes
+     ----------
+     strings : ndarray
+     data_encoding : list of Encoding
+     offset_encoding : list of Encoding
+
+     Examples
+     --------
+
+     >>> data = np.array(["apple", "banana", "cherry", "apple", "banana", "apple"])
+     >>> print(data)
+     ['apple' 'banana' 'cherry' 'apple' 'banana' 'apple']
+     >>> # By default the indices would directly be encoded into bytes
+     >>> # However, the indices should be printed here -> data_encoding=[]
+     >>> encoding = StringArrayEncoding(data_encoding=[])
+     >>> encoded = encoding.encode(data)
+     >>> print(encoding.strings)
+     ['apple' 'banana' 'cherry']
+     >>> print(encoded)
+     [0 1 2 0 1 0]
+     """
+
+     strings: ... = None
+     data_encoding: ... = None
+     offset_encoding: ... = None
+
+     def __init__(self, strings=None, data_encoding=None, offset_encoding=None):
+         self.strings = strings
+         if data_encoding is None:
+             data_encoding = [ByteArrayEncoding(TypeCode.INT32)]
+         self.data_encoding = data_encoding
+         if offset_encoding is None:
+             offset_encoding = [ByteArrayEncoding(TypeCode.INT32)]
+         self.offset_encoding = offset_encoding
+
+     @staticmethod
+     def deserialize(content):
+         data_encoding = [
+             deserialize_encoding(e) for e in content["dataEncoding"]
+         ]
+         offset_encoding = [
+             deserialize_encoding(e) for e in content["offsetEncoding"]
+         ]
+         cdef str concatenated_strings = content["stringData"]
+         cdef np.ndarray offsets = decode_stepwise(
+             content["offsets"], offset_encoding
+         )
+
+         strings = np.array([
+             concatenated_strings[offsets[i]:offsets[i+1]]
+             # The final offset is the exclusive stop index
+             for i in range(len(offsets)-1)
+         ], dtype="U")
+
+         return StringArrayEncoding(strings, data_encoding, offset_encoding)
+
+     def serialize(self):
+         if self.strings is None:
+             raise ValueError(
+                 "'strings' must be explicitly given or needs to be "
+                 "determined from first encoding pass, before it is serialized"
+             )
+
+         string_data = "".join(self.strings)
+         offsets = np.cumsum([0] + [len(s) for s in self.strings])
+
+         return {
+             "kind": "StringArray",
+             "dataEncoding": [e.serialize() for e in self.data_encoding],
+             "stringData": string_data,
+             "offsets": encode_stepwise(offsets, self.offset_encoding),
+             "offsetEncoding": [e.serialize() for e in self.offset_encoding],
+         }
+
+     def encode(self, data):
+         if not np.issubdtype(data.dtype, np.str_):
+             raise TypeError("Data must be of string type")
+
+         if self.strings is None:
+             # 'unique()' already sorts the strings
+             self.strings = np.unique(data)
+             check_present = False
+         else:
+             check_present = True
+
+         string_order = np.argsort(self.strings).astype(np.int32)
+         sorted_strings = self.strings[string_order]
+         sorted_indices = np.searchsorted(sorted_strings, data)
+         indices = string_order[sorted_indices]
+         if check_present and not np.all(self.strings[indices] == data):
+             raise ValueError("Data contains strings not present in 'strings'")
+         return encode_stepwise(indices, self.data_encoding)
+
+     def decode(self, data):
+         indices = decode_stepwise(data, self.data_encoding)
+         return self.strings[indices]
+
+     def __eq__(self, other):
+         if not isinstance(other, type(self)):
+             return False
+         if not np.array_equal(self.strings, other.strings):
+             return False
+         if self.data_encoding != other.data_encoding:
+             return False
+         if self.offset_encoding != other.offset_encoding:
+             return False
+         return True
+
+
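A decode round trip for StringArrayEncoding, continuing the docstring example above; the stored strings array is the lookup table for the decoded indices (import path assumed as above):

    import numpy as np
    from biotite.structure.io.pdbx import StringArrayEncoding

    data = np.array(["apple", "banana", "cherry", "apple", "banana", "apple"])
    encoding = StringArrayEncoding(data_encoding=[])  # keep the indices readable
    encoded = encoding.encode(data)
    print(encoded)                   # [0 1 2 0 1 0]
    print(encoding.decode(encoded))  # ['apple' 'banana' 'cherry' 'apple' 'banana' 'apple']
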
+ _encoding_classes = {
+     "ByteArray": ByteArrayEncoding,
+     "FixedPoint": FixedPointEncoding,
+     "IntervalQuantization": IntervalQuantizationEncoding,
+     "RunLength": RunLengthEncoding,
+     "Delta": DeltaEncoding,
+     "IntegerPacking": IntegerPackingEncoding,
+     "StringArray": StringArrayEncoding,
+ }
+ _encoding_classes_kinds = {
+     "ByteArrayEncoding": "ByteArray",
+     "FixedPointEncoding": "FixedPoint",
+     "IntervalQuantizationEncoding": "IntervalQuantization",
+     "RunLengthEncoding": "RunLength",
+     "DeltaEncoding": "Delta",
+     "IntegerPackingEncoding": "IntegerPacking",
+     "StringArrayEncoding": "StringArray",
+ }
+
+
+ def deserialize_encoding(content):
+     try:
+         encoding_class = _encoding_classes[content["kind"]]
+     except KeyError:
+         raise ValueError(
+             f"Unknown encoding kind '{content['kind']}'"
+         )
+     return encoding_class.deserialize(content)
+
+
+ def create_uncompressed_encoding(array):
+     dtype = array.dtype
+
+     if np.issubdtype(dtype, np.str_):
+         return [StringArrayEncoding()]
+     else:
+         return [ByteArrayEncoding()]
+
+
+ def encode_stepwise(data, encoding):
+     for enc in encoding:
+         data = enc.encode(data)
+     return data
+
+
+ def decode_stepwise(data, encoding):
+     for enc in reversed(encoding):
+         data = enc.decode(data)
+     return data
+
+
+ def _camel_to_snake_case(attribute_name):
+     return re.sub(CAMEL_CASE_PATTERN, "_", attribute_name).lower()
+
+
+ def _snake_to_camel_case(attribute_name):
+     attribute_name = "".join(
+         word.capitalize() for word in attribute_name.split("_")
+     )
+     return attribute_name[0].lower() + attribute_name[1:]
+
+
+ def _safe_cast(array, dtype):
+     dtype = np.dtype(dtype)
+     if dtype == array.dtype:
+         return array
+     if np.issubdtype(dtype, np.integer):
+         if not np.issubdtype(array.dtype, np.integer):
+             raise ValueError("Cannot cast floating point to integer")
+         dtype_info = np.iinfo(dtype)
+         if np.any(array < dtype_info.min) or np.any(array > dtype_info.max):
+             raise ValueError("Integer values do not fit into the given dtype")
+     return array.astype(dtype)
+
+
+ def _get_n_decimals(value, tolerance):
+     MAX_DECIMALS = 10
+     for n in range(MAX_DECIMALS):
+         if abs(value - round(value, n)) < tolerance:
+             return n
+     return MAX_DECIMALS