pyscf 2.7.0__py3-none-macosx_11_0_arm64.whl → 2.9.0__py3-none-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234) hide show
  1. pyscf/__init__.py +1 -1
  2. pyscf/ao2mo/__init__.py +13 -2
  3. pyscf/ao2mo/_ao2mo.py +10 -1
  4. pyscf/ao2mo/incore.py +3 -0
  5. pyscf/ao2mo/nrr_outcore.py +2 -2
  6. pyscf/ao2mo/outcore.py +3 -3
  7. pyscf/ao2mo/r_outcore.py +2 -2
  8. pyscf/cc/__init__.py +2 -3
  9. pyscf/cc/ccsd.py +13 -5
  10. pyscf/cc/ccsd_rdm.py +6 -1
  11. pyscf/cc/dfccsd.py +3 -3
  12. pyscf/cc/dfuccsd.py +310 -0
  13. pyscf/cc/gccsd.py +2 -2
  14. pyscf/cc/rccsd.py +5 -1
  15. pyscf/cc/uccsd.py +36 -27
  16. pyscf/cc/uccsd_rdm.py +2 -2
  17. pyscf/df/addons.py +8 -3
  18. pyscf/df/autoaux.py +4 -0
  19. pyscf/df/df_jk.py +56 -25
  20. pyscf/df/grad/rhf.py +31 -1
  21. pyscf/df/hessian/uhf.py +2 -2
  22. pyscf/df/incore.py +2 -2
  23. pyscf/df/outcore.py +6 -6
  24. pyscf/dft/gks.py +25 -21
  25. pyscf/dft/libxc.py +31 -11
  26. pyscf/dft/numint.py +33 -16
  27. pyscf/dft/radi.py +9 -2
  28. pyscf/dft/rks.py +28 -24
  29. pyscf/dft/roks.py +7 -1
  30. pyscf/dft/uks.py +34 -25
  31. pyscf/fci/direct_spin1.py +0 -1
  32. pyscf/fci/fci_dhf_slow.py +15 -1
  33. pyscf/grad/ccsd.py +3 -7
  34. pyscf/grad/ccsd_slow.py +2 -3
  35. pyscf/grad/mp2.py +12 -3
  36. pyscf/grad/sacasscf.py +2 -0
  37. pyscf/grad/uccsd.py +3 -7
  38. pyscf/grad/ump2.py +2 -4
  39. pyscf/gto/basis/__init__.py +32 -5
  40. pyscf/gto/basis/def2-mtzvp.dat +4719 -0
  41. pyscf/gto/basis/def2-mtzvpp.dat +4739 -0
  42. pyscf/gto/basis/dyall-basis/__init__.py +0 -0
  43. pyscf/gto/basis/dyall-basis/dyall_2zp.py +6492 -0
  44. pyscf/gto/basis/dyall-basis/dyall_3zp.py +8343 -0
  45. pyscf/gto/basis/dyall-basis/dyall_4zp.py +10055 -0
  46. pyscf/gto/basis/dyall-basis/dyall_aae2z.py +1818 -0
  47. pyscf/gto/basis/dyall-basis/dyall_aae3z.py +2521 -0
  48. pyscf/gto/basis/dyall-basis/dyall_aae4z.py +3351 -0
  49. pyscf/gto/basis/dyall-basis/dyall_acv2z.py +1790 -0
  50. pyscf/gto/basis/dyall-basis/dyall_acv3z.py +2417 -0
  51. pyscf/gto/basis/dyall-basis/dyall_acv4z.py +3085 -0
  52. pyscf/gto/basis/dyall-basis/dyall_ae2z.py +6619 -0
  53. pyscf/gto/basis/dyall-basis/dyall_ae3z.py +9027 -0
  54. pyscf/gto/basis/dyall-basis/dyall_ae4z.py +11839 -0
  55. pyscf/gto/basis/dyall-basis/dyall_av2z.py +1742 -0
  56. pyscf/gto/basis/dyall-basis/dyall_av3z.py +2318 -0
  57. pyscf/gto/basis/dyall-basis/dyall_av4z.py +2905 -0
  58. pyscf/gto/basis/dyall-basis/dyall_cv2z.py +6558 -0
  59. pyscf/gto/basis/dyall-basis/dyall_cv3z.py +8767 -0
  60. pyscf/gto/basis/dyall-basis/dyall_cv4z.py +11098 -0
  61. pyscf/gto/basis/dyall-basis/dyall_v2z.py +6472 -0
  62. pyscf/gto/basis/dyall-basis/dyall_v3z.py +8539 -0
  63. pyscf/gto/basis/dyall-basis/dyall_v4z.py +10658 -0
  64. pyscf/gto/basis/ma-def2-qzvp.dat +5959 -0
  65. pyscf/gto/basis/ma-def2-qzvpp.dat +6195 -0
  66. pyscf/gto/basis/ma-def2-svp.dat +3504 -0
  67. pyscf/gto/basis/ma-def2-svpp.dat +3504 -0
  68. pyscf/gto/basis/ma-def2-tzvp.dat +4347 -0
  69. pyscf/gto/basis/ma-def2-tzvpp.dat +4549 -0
  70. pyscf/gto/basis/parse_cp2k.py +8 -7
  71. pyscf/gto/basis/parse_nwchem.py +25 -10
  72. pyscf/gto/eval_gto.py +1 -1
  73. pyscf/gto/ft_ao.py +6 -6
  74. pyscf/gto/mole.py +32 -35
  75. pyscf/gto/moleintor.py +26 -1
  76. pyscf/gw/rpa.py +133 -244
  77. pyscf/gw/urpa.py +84 -131
  78. pyscf/hessian/uks.py +1 -1
  79. pyscf/lib/CMakeLists.txt +8 -4
  80. pyscf/lib/config.h +0 -1
  81. pyscf/lib/config.h.in +0 -1
  82. pyscf/lib/deps/include/xc.h +28 -18
  83. pyscf/lib/deps/include/xc_funcs.h +50 -2
  84. pyscf/lib/deps/include/xc_version.h +3 -3
  85. pyscf/lib/deps/lib/libcint.6.dylib +0 -0
  86. pyscf/lib/deps/lib/{libxc.12.dylib → libxc.15.dylib} +0 -0
  87. pyscf/lib/deps/lib/libxcfun.2.dylib +0 -0
  88. pyscf/lib/dft/libxc_itrf.c +25 -21
  89. pyscf/lib/dft/nr_numint_sparse.c +3 -3
  90. pyscf/lib/diis.py +1 -1
  91. pyscf/lib/exceptions.py +3 -0
  92. pyscf/lib/libagf2.dylib +0 -0
  93. pyscf/lib/libao2mo.dylib +0 -0
  94. pyscf/lib/libcc.dylib +0 -0
  95. pyscf/lib/libcgto.dylib +0 -0
  96. pyscf/lib/libcvhf.dylib +0 -0
  97. pyscf/lib/libdft.dylib +0 -0
  98. pyscf/lib/libfci.dylib +0 -0
  99. pyscf/lib/libmcscf.dylib +0 -0
  100. pyscf/lib/libmp.dylib +0 -0
  101. pyscf/lib/libnp_helper.dylib +0 -0
  102. pyscf/lib/libpbc.dylib +0 -0
  103. pyscf/lib/libri.dylib +0 -0
  104. pyscf/lib/libxc_itrf.dylib +0 -0
  105. pyscf/lib/libxcfun_itrf.dylib +0 -0
  106. pyscf/lib/linalg_helper.py +5 -6
  107. pyscf/lib/logger.py +2 -1
  108. pyscf/lib/mcscf/fci_contract.c +8 -1
  109. pyscf/lib/misc.py +16 -8
  110. pyscf/lib/mp/CMakeLists.txt +22 -0
  111. pyscf/lib/mp/mp2.c +518 -0
  112. pyscf/lib/mp/mp2.h +44 -0
  113. pyscf/lib/np_helper/CMakeLists.txt +1 -1
  114. pyscf/lib/np_helper/imatcopy.c +360 -0
  115. pyscf/lib/np_helper/np_helper.c +94 -0
  116. pyscf/lib/np_helper/np_helper.h +26 -0
  117. pyscf/lib/numpy_helper.py +194 -10
  118. pyscf/lib/pbc/nr_direct.c +2 -7
  119. pyscf/lib/vhf/fblas.h +3 -0
  120. pyscf/lib/vhf/nr_sr_vhf.c +8 -12
  121. pyscf/lib/vhf/rkb_screen.c +139 -0
  122. pyscf/mcscf/__init__.py +1 -1
  123. pyscf/mcscf/casci.py +7 -3
  124. pyscf/mcscf/chkfile.py +2 -3
  125. pyscf/mcscf/mc1step.py +12 -8
  126. pyscf/mcscf/newton_casscf.py +1 -1
  127. pyscf/mcscf/umc1step.py +5 -3
  128. pyscf/mp/__init__.py +2 -2
  129. pyscf/mp/dfmp2.py +498 -59
  130. pyscf/mp/dfmp2_native.py +11 -1
  131. pyscf/mp/dfmp2_slow.py +133 -0
  132. pyscf/mp/dfump2.py +672 -0
  133. pyscf/mp/dfump2_native.py +9 -0
  134. pyscf/mp/dfump2_slow.py +161 -0
  135. pyscf/mp/gmp2.py +6 -47
  136. pyscf/mp/mp2.py +19 -5
  137. pyscf/mp/ump2.py +23 -18
  138. pyscf/mrpt/nevpt2.py +11 -0
  139. pyscf/pbc/df/aft.py +9 -7
  140. pyscf/pbc/df/df.py +5 -6
  141. pyscf/pbc/df/df_jk.py +12 -6
  142. pyscf/pbc/df/fft.py +3 -3
  143. pyscf/pbc/df/fft_jk.py +7 -7
  144. pyscf/pbc/df/incore.py +1 -1
  145. pyscf/pbc/df/mdf_jk.py +2 -1
  146. pyscf/pbc/df/outcore.py +10 -10
  147. pyscf/pbc/df/rsdf.py +1 -0
  148. pyscf/pbc/df/rsdf_builder.py +3 -3
  149. pyscf/pbc/df/rsdf_helper.py +5 -5
  150. pyscf/pbc/df/rsdf_jk.py +2 -1
  151. pyscf/pbc/dft/gen_grid.py +3 -2
  152. pyscf/pbc/dft/gks.py +14 -3
  153. pyscf/pbc/dft/kgks.py +15 -4
  154. pyscf/pbc/dft/krks.py +28 -10
  155. pyscf/pbc/dft/krks_ksymm.py +21 -9
  156. pyscf/pbc/dft/krkspu.py +1 -30
  157. pyscf/pbc/dft/krkspu_ksymm.py +0 -30
  158. pyscf/pbc/dft/kuks.py +30 -13
  159. pyscf/pbc/dft/kuks_ksymm.py +22 -10
  160. pyscf/pbc/dft/kukspu.py +0 -27
  161. pyscf/pbc/dft/kukspu_ksymm.py +0 -30
  162. pyscf/pbc/dft/multigrid/multigrid.py +17 -7
  163. pyscf/pbc/dft/multigrid/multigrid_pair.py +6 -1
  164. pyscf/pbc/dft/numint.py +26 -10
  165. pyscf/pbc/dft/rks.py +20 -26
  166. pyscf/pbc/dft/uks.py +21 -4
  167. pyscf/pbc/gto/_pbcintor.py +1 -0
  168. pyscf/pbc/gto/cell.py +170 -5
  169. pyscf/pbc/gto/eval_gto.py +1 -1
  170. pyscf/pbc/gto/neighborlist.py +4 -1
  171. pyscf/pbc/mpitools/mpi.py +0 -1
  172. pyscf/pbc/scf/_response_functions.py +141 -34
  173. pyscf/pbc/scf/hf.py +13 -10
  174. pyscf/pbc/scf/khf.py +32 -3
  175. pyscf/pbc/scf/khf_ksymm.py +15 -1
  176. pyscf/pbc/scf/kuhf.py +1 -1
  177. pyscf/pbc/scf/kuhf_ksymm.py +1 -1
  178. pyscf/pbc/scf/rsjk.py +1 -1
  179. pyscf/pbc/scf/stability.py +26 -14
  180. pyscf/pbc/tdscf/krhf.py +58 -56
  181. pyscf/pbc/tdscf/kuhf.py +273 -78
  182. pyscf/pbc/tdscf/rhf.py +17 -12
  183. pyscf/pbc/tdscf/uhf.py +46 -35
  184. pyscf/pbc/tools/k2gamma.py +15 -3
  185. pyscf/pbc/tools/lattice.py +3 -3
  186. pyscf/pbc/tools/pbc.py +48 -35
  187. pyscf/pbc/x2c/sfx2c1e.py +5 -0
  188. pyscf/scf/_response_functions.py +85 -44
  189. pyscf/scf/_vhf.py +1 -0
  190. pyscf/scf/addons.py +21 -2
  191. pyscf/scf/dhf.py +82 -28
  192. pyscf/scf/dispersion.py +1 -1
  193. pyscf/scf/hf.py +19 -3
  194. pyscf/scf/uhf.py +9 -3
  195. pyscf/solvent/__init__.py +2 -2
  196. pyscf/solvent/_attach_solvent.py +2 -0
  197. pyscf/solvent/cosmors.py +378 -0
  198. pyscf/solvent/grad/pcm.py +75 -19
  199. pyscf/solvent/hessian/pcm.py +957 -108
  200. pyscf/solvent/hessian/smd.py +7 -43
  201. pyscf/solvent/pcm.py +4 -4
  202. pyscf/solvent/smd.py +5 -3
  203. pyscf/soscf/ciah.py +2 -10
  204. pyscf/soscf/newton_ah.py +4 -1
  205. pyscf/symm/geom.py +58 -13
  206. pyscf/tdscf/_lr_eig.py +561 -57
  207. pyscf/tdscf/dhf.py +58 -65
  208. pyscf/tdscf/ghf.py +63 -71
  209. pyscf/tdscf/gks.py +12 -10
  210. pyscf/tdscf/rhf.py +68 -68
  211. pyscf/tdscf/rks.py +12 -9
  212. pyscf/tdscf/uhf.py +59 -58
  213. pyscf/tdscf/uks.py +15 -13
  214. pyscf/tools/fcidump.py +36 -9
  215. pyscf/tools/finite_diff.py +175 -0
  216. pyscf/tools/qcschema.py +265 -0
  217. pyscf/x2c/tdscf.py +37 -37
  218. pyscf/x2c/x2c.py +101 -34
  219. {pyscf-2.7.0.dist-info → pyscf-2.9.0.dist-info}/METADATA +30 -26
  220. {pyscf-2.7.0.dist-info → pyscf-2.9.0.dist-info}/RECORD +224 -192
  221. {pyscf-2.7.0.dist-info → pyscf-2.9.0.dist-info}/WHEEL +2 -1
  222. {pyscf-2.7.0.dist-info → pyscf-2.9.0.dist-info/licenses}/NOTICE +13 -0
  223. pyscf/pbc/tdscf/kproxy.py +0 -189
  224. pyscf/pbc/tdscf/kproxy_supercell.py +0 -664
  225. pyscf/pbc/tdscf/krhf_slow.py +0 -300
  226. pyscf/pbc/tdscf/krhf_slow_gamma.py +0 -175
  227. pyscf/pbc/tdscf/krhf_slow_supercell.py +0 -250
  228. pyscf/pbc/tdscf/proxy.py +0 -39
  229. pyscf/pbc/tdscf/rhf_slow.py +0 -35
  230. pyscf/tdscf/common_slow.py +0 -799
  231. pyscf/tdscf/proxy.py +0 -258
  232. pyscf/tdscf/rhf_slow.py +0 -181
  233. {pyscf-2.7.0.dist-info → pyscf-2.9.0.dist-info/licenses}/LICENSE +0 -0
  234. {pyscf-2.7.0.dist-info → pyscf-2.9.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,360 @@
1
+ /* Copyright 2014-2018 The PySCF Developers. All Rights Reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
14
+
15
+ *
16
+ * Author: Christopher Hillenbrand <chillenbrand15@gmail.com>
17
+ */
18
+
19
+ #include <complex.h>
20
+ #include <math.h>
21
+ #include "np_helper.h"
22
+
23
/* Edge length (in elements) of the square tiles used by the double routines. */
const int TILESIZE = 32;
/* Edge length (in elements) of the square tiles used by the double complex routines. */
const int TILESIZE_CPLX = 16;
25
+
26
+ /*
27
+ * Calculate the largest integer i such that
28
+ * i * (i - 1) / 2 <= ijouter.
29
+ */
30
+ static inline int uncollapse_loop_index(const long long ijouter)
31
+ {
32
+ return (int) floor((sqrt(0.25 + 2.0 * ijouter) + 0.5));
33
+ }
34
+
35
+ static inline void dtranspose_scale_tile_offdiag(double *A, const int ii,
36
+ const int jj,
37
+ const size_t lda_w,
38
+ const double alpha) {
39
+ for (int j = jj; j < jj + TILESIZE; j++) {
40
+ #pragma omp simd
41
+ for(int i = ii; i < ii + TILESIZE; i++) {
42
+ const double tmp = A[i * lda_w + j];
43
+ A[i * lda_w + j] = alpha * A[j * lda_w + i];
44
+ A[j * lda_w + i] = alpha * tmp;
45
+ }
46
+ }
47
+ }
48
+
49
+ static inline void dtranspose_scale_tile_diag(double *A, const int ii,
50
+ const int jj, const size_t lda_w,
51
+ const double alpha) {
52
+ for (int j = jj; j < jj + TILESIZE; j++) {
53
+ #pragma omp simd
54
+ for(int i = ii; i < j; i++) {
55
+ const double tmp = A[i * lda_w + j];
56
+ A[i * lda_w + j] = alpha * A[j * lda_w + i];
57
+ A[j * lda_w + i]= alpha * tmp;
58
+ }
59
+ A[j * lda_w + j] *= alpha;
60
+ }
61
+ }
62
+
63
+ static inline void dtranspose_tile_diag(double *A, const int ii, const int jj,
64
+ const size_t lda_w) {
65
+ for (int j = jj; j < jj + TILESIZE; j++) {
66
+ #pragma omp simd
67
+ for(int i = ii; i < j; i++) {
68
+ const double tmp = A[i * lda_w + j];
69
+ A[i * lda_w + j] = A[j * lda_w + i];
70
+ A[j * lda_w + i] = tmp;
71
+ }
72
+ }
73
+ }
74
+
75
+ static inline void ztranspose_scale_tile_offdiag(double complex *A,
76
+ const int ii, const int jj,
77
+ const size_t lda_w,
78
+ const double complex alpha) {
79
+ for (int j = jj; j < jj + TILESIZE_CPLX; j++) {
80
+ #pragma omp simd
81
+ for(int i = ii; i < ii + TILESIZE_CPLX; i++) {
82
+ const double complex tmp = A[i * lda_w + j];
83
+ A[i * lda_w + j] = alpha * A[j * lda_w + i];
84
+ A[j * lda_w + i] = alpha * tmp;
85
+ }
86
+ }
87
+ }
88
+
89
+ static inline void ztranspose_scale_tile_diag(double complex *A, const int ii,
90
+ const int jj, const size_t lda_w,
91
+ const double complex alpha) {
92
+ for (int j = jj; j < jj + TILESIZE_CPLX; j++) {
93
+ #pragma omp simd
94
+ for(int i = ii; i < j; i++) {
95
+ const double complex tmp = A[i * lda_w + j];
96
+ A[i * lda_w + j] = alpha * A[j * lda_w + i];
97
+ A[j * lda_w + i] = alpha * tmp;
98
+ }
99
+ A[j * lda_w + j] *= alpha;
100
+ }
101
+ }
102
+
103
+ static inline void ztranspose_tile_diag(double complex *A, const int ii,
104
+ const int jj, const size_t lda_w) {
105
+ for (int j = jj; j < jj + TILESIZE_CPLX; j++) {
106
+ #pragma omp simd
107
+ for(int i = ii; i < j; i++) {
108
+ const double complex tmp = A[i * lda_w + j];
109
+ A[i * lda_w + j] = A[j * lda_w + i];
110
+ A[j * lda_w + i] = tmp;
111
+ }
112
+ }
113
+ }
114
+
115
+ /*
116
+ * In-place parallel matrix transpose, double version.
117
+ * See https://colfaxresearch.com/multithreaded-transposition-of-square-matrices-with-common-code-for-intel-xeon-processors-and-intel-xeon-phi-coprocessors/
118
+ */
119
+ void NPomp_d_itranspose_scale(const int n, const double alpha, double *A, int lda)
120
+ {
121
+ const int nclean = n - n % TILESIZE;
122
+ const int ntiles = nclean / TILESIZE;
123
+ const size_t lda_w = (size_t) lda;
124
+
125
+ #pragma omp parallel
126
+ {
127
+
128
+ /*
129
+ * ---------------------------
130
+ * | ****************** |
131
+ * | ****************** |
132
+ * | ****************** |
133
+ * | ****** ************ |
134
+ * | ****** ************ |
135
+ * | ****** ************ |
136
+ * | ************ ****** |
137
+ * | ************ ****** |
138
+ * | ************ ****** |
139
+ * | ****************** |
140
+ * | ****************** |
141
+ * | ****************** |
142
+ * | |
143
+ * ----------------------------
144
+ */
145
+
146
+ /*
147
+ * The following loop nest is equivalent to:
148
+ * for(int iouter = 1; iouter < ntiles; iouter++)
149
+ * for(int jouter = 0; jouter < iouter; jouter++)
150
+ *
151
+ * See 10.1109/IPDPS.2017.34.
152
+ */
153
+ int first_iteration = 1;
154
+ int iouter, jouter;
155
+ #pragma omp for schedule(static) nowait
156
+ for(long long ijouter = 0; ijouter < (ntiles*(ntiles-1))/2; ijouter++) {
157
+ if(first_iteration) {
158
+ iouter = uncollapse_loop_index(ijouter);
159
+ jouter = ijouter - iouter * (iouter - 1) / 2;
160
+ first_iteration = 0;
161
+ } else {
162
+ jouter++;
163
+ if(jouter == iouter) {
164
+ iouter++;
165
+ jouter = 0;
166
+ }
167
+ }
168
+ dtranspose_scale_tile_offdiag(A, iouter * TILESIZE, jouter * TILESIZE, lda_w, alpha);
169
+ }
170
+
171
+
172
+ /*
173
+ * ---------------------------
174
+ * | ****** |
175
+ * | ****** |
176
+ * | ****** |
177
+ * | ****** |
178
+ * | ****** |
179
+ * | ****** |
180
+ * | ****** |
181
+ * | ****** |
182
+ * | ****** |
183
+ * | ****** |
184
+ * | ****** |
185
+ * | ****** |
186
+ * | |
187
+ * ----------------------------
188
+ */
189
+
190
+ if(alpha != 1.0) {
191
+ #pragma omp for schedule(static) nowait
192
+ for(int ii = 0; ii < nclean; ii+=TILESIZE) {
193
+ dtranspose_scale_tile_diag(A, ii, ii, lda_w, alpha);
194
+ }
195
+ } else {
196
+ #pragma omp for schedule(static) nowait
197
+ for(int ii = 0; ii < nclean; ii+=TILESIZE) {
198
+ dtranspose_tile_diag(A, ii, ii, lda_w);
199
+ }
200
+ }
201
+
202
+
203
+ /*
204
+ * --------------------------
205
+ * | ***|
206
+ * | ***|
207
+ * | ***|
208
+ * | ***|
209
+ * | ***|
210
+ * | ***|
211
+ * | ***|
212
+ * | ***|
213
+ * | ***|
214
+ * | ***|
215
+ * | *********************** |
216
+ * | *********************** |
217
+ * ---------------------------
218
+ */
219
+
220
+ #pragma omp for schedule(static) nowait
221
+ for(int j = 0; j < nclean; j++) {
222
+ for(int i = nclean; i < n; i++) {
223
+ const double tmp = A[i * lda_w + j];
224
+ A[i * lda_w + j] = alpha * A[j * lda_w + i];
225
+ A[j * lda_w + i] = alpha * tmp;
226
+ }
227
+ }
228
+ } // end parallel region
229
+
230
+ /*
231
+ * --------------------------
232
+ * | |
233
+ * | |
234
+ * | |
235
+ * | |
236
+ * | |
237
+ * | |
238
+ * | |
239
+ * | |
240
+ * | |
241
+ * | |
242
+ * | ***|
243
+ * | ***|
244
+ * ---------------------------
245
+ */
246
+
247
+ for(int j = nclean; j < n; j++) {
248
+ for(int i = nclean; i < j; i++) {
249
+ const double tmp = A[i * lda_w + j];
250
+ A[i * lda_w + j] = alpha * A[j * lda_w + i];
251
+ A[j * lda_w + i] = alpha * tmp;
252
+ }
253
+ }
254
+
255
+ if(alpha != 1.0) {
256
+ for(int i = nclean; i < n; i++) {
257
+ A[i * lda_w + i] *= alpha;
258
+ }
259
+ }
260
+
261
+ }
262
+
263
+ /*
264
+ * In-place parallel matrix transpose, double complex version.
265
+ * See https://colfaxresearch.com/multithreaded-transposition-of-square-matrices-with-common-code-for-intel-xeon-processors-and-intel-xeon-phi-coprocessors/
266
+ */
267
+ void NPomp_z_itranspose_scale(const int n, const double complex *alphaptr, double complex *A, int lda)
268
+ {
269
+ const double complex alpha = *alphaptr;
270
+ const int nclean = n - n % TILESIZE_CPLX;
271
+ const int ntiles = nclean / TILESIZE_CPLX;
272
+ const size_t lda_w = (size_t) lda;
273
+
274
+ #pragma omp parallel
275
+ {
276
+
277
+ /*
278
+ * The following loop nest is equivalent to:
279
+ * for(int iouter = 1; iouter < ntiles; iouter++)
280
+ * for(int jouter = 0; jouter < iouter; jouter++)
281
+ *
282
+ * See 10.1109/IPDPS.2017.34.
283
+ */
284
+ int first_iteration = 1;
285
+ int iouter, jouter;
286
+ #pragma omp for schedule(static) nowait
287
+ for(long long ijouter = 0; ijouter < (ntiles*(ntiles-1))/2; ijouter++) {
288
+ if(first_iteration) {
289
+ iouter = uncollapse_loop_index(ijouter);
290
+ jouter = ijouter - iouter * (iouter - 1) / 2;
291
+ first_iteration = 0;
292
+ } else {
293
+ jouter++;
294
+ if(jouter == iouter) {
295
+ iouter++;
296
+ jouter = 0;
297
+ }
298
+ }
299
+ ztranspose_scale_tile_offdiag(A, iouter * TILESIZE_CPLX, jouter * TILESIZE_CPLX, lda_w, alpha);
300
+ }
301
+
302
+ if(alpha != 1.0) {
303
+ #pragma omp for schedule(static) nowait
304
+ for(int ii = 0; ii < nclean; ii+=TILESIZE_CPLX) {
305
+ ztranspose_scale_tile_diag(A, ii, ii, lda_w, alpha);
306
+ }
307
+ } else {
308
+ #pragma omp for schedule(static) nowait
309
+ for(int ii = 0; ii < nclean; ii+=TILESIZE_CPLX) {
310
+ ztranspose_tile_diag(A, ii, ii, lda_w);
311
+ }
312
+ }
313
+
314
+ #pragma omp for schedule(static) nowait
315
+ for(int j = 0; j < nclean; j++) {
316
+ for(int i = nclean; i < n; i++) {
317
+ const double complex tmp = A[i * lda_w + j];
318
+ A[i * lda_w + j] = alpha * A[j * lda_w + i];
319
+ A[j * lda_w + i] = alpha * tmp;
320
+ }
321
+ }
322
+
323
+ } // end parallel region
324
+
325
+ for(int j = nclean; j < n; j++) {
326
+ for(int i = nclean; i < j; i++) {
327
+ const double complex tmp = A[i * lda_w + j];
328
+ A[i * lda_w + j] = alpha * A[j * lda_w + i];
329
+ A[j * lda_w + i] = alpha * tmp;
330
+ }
331
+ }
332
+
333
+ if(alpha != 1.0) {
334
+ for(int i = nclean; i < n; i++) {
335
+ A[i * lda_w + i] *= alpha;
336
+ }
337
+ }
338
+ }
339
+
340
+
341
+
342
+ /*
343
+ * Batched versions for 3D tensors
344
+ */
345
+
346
/*
 * Batched in-place transpose-and-scale for double data.
 *
 * Calls NPomp_d_itranspose_scale(n, alpha, ., lda) on nmat matrices, one after
 * another; matrix number imat starts at A + imat * matstride.
 */
void NPomp_dtensor_itranspose_scale021(const long long matstride, int nmat, int n, const double alpha,
                                       double *A, int lda)
{
    int imat = 0;
    while (imat < nmat) {
        double *mat = A + imat * matstride;
        NPomp_d_itranspose_scale(n, alpha, mat, lda);
        imat++;
    }
}
353
+
354
/*
 * Batched in-place transpose-and-scale for double complex data.
 *
 * Calls NPomp_z_itranspose_scale(n, alpha, ., lda) on nmat matrices, one after
 * another; matrix number imat starts at A + imat * matstride.  The scale
 * factor is passed through by pointer.
 */
void NPomp_ztensor_itranspose_scale021(const long long matstride, int nmat, int n, const double complex *alpha,
                                       double complex *A, int lda)
{
    int imat = 0;
    while (imat < nmat) {
        double complex *mat = A + imat * matstride;
        NPomp_z_itranspose_scale(n, alpha, mat, lda);
        imat++;
    }
}
@@ -14,6 +14,7 @@
14
14
  */
15
15
 
16
16
  #include <stdlib.h>
17
+ #include <complex.h>
17
18
  #include "np_helper/np_helper.h"
18
19
 
19
20
  void NPdset0(double *p, const size_t n)
@@ -47,3 +48,96 @@ void NPzcopy(double complex *out, const double complex *in, const size_t n)
47
48
  out[i] = in[i];
48
49
  }
49
50
  }
51
+
52
+ /*
53
+ * These are mostly useful for first-touch array allocation on NUMA systems.
54
+ * Use with numpy.empty.
55
+ */
56
/*
 * Store 0.0 into the first n elements of out, with the index range divided
 * statically among OpenMP threads.
 */
void NPomp_dset0(const size_t n, double *out)
{
    size_t idx;
    #pragma omp parallel for schedule(static)
    for (idx = 0; idx < n; idx++) {
        out[idx] = 0.0;
    }
}
63
+
64
/*
 * Store zero into the first n double complex elements of out, with the index
 * range divided statically among OpenMP threads.
 */
void NPomp_zset0(const size_t n, double complex *out)
{
    size_t idx;
    #pragma omp parallel for schedule(static)
    for (idx = 0; idx < n; idx++) {
        out[idx] = 0.0;
    }
}
71
+
72
+
73
/*
 * Multithreaded copy of an m x n double matrix.
 *
 * Row i of the source starts at in + i * in_stride and row i of the
 * destination at out + i * out_stride; n elements are copied per row.
 * Rows are divided statically among OpenMP threads.
 */
void NPomp_dcopy(const size_t m,
                 const size_t n,
                 const double *__restrict in, const size_t in_stride,
                 double *__restrict out, const size_t out_stride)
{
    #pragma omp parallel for schedule(static)
    for (size_t row = 0; row < m; row++) {
        const size_t src_off = row * in_stride;
        const size_t dst_off = row * out_stride;
        #pragma omp simd
        for (size_t col = 0; col < n; col++) {
            out[dst_off + col] = in[src_off + col];
        }
    }
}
89
+
90
/*
 * Multithreaded copy of an m x n double complex matrix.
 *
 * Row i of the source starts at in + i * in_stride and row i of the
 * destination at out + i * out_stride; n elements are copied per row.
 * Rows are divided statically among OpenMP threads.
 */
void NPomp_zcopy(const size_t m,
                 const size_t n,
                 const double complex *__restrict in, const size_t in_stride,
                 double complex *__restrict out, const size_t out_stride)
{
    #pragma omp parallel for schedule(static)
    for (size_t row = 0; row < m; row++) {
        const size_t src_off = row * in_stride;
        const size_t dst_off = row * out_stride;
        #pragma omp simd
        for (size_t col = 0; col < n; col++) {
            out[dst_off + col] = in[src_off + col];
        }
    }
}
106
+
107
/*
 * Elementwise (Hadamard) product of two m x n double matrices:
 *     out <- A \circ B
 *
 * Row i of each operand starts at base + i * stride (a_stride, b_stride and
 * out_stride respectively); n elements are processed per row.  Rows are
 * divided statically among OpenMP threads.
 *
 * The pointers are deliberately not restrict-qualified so that the in-place
 * forms B <- A \circ B (out == b, out_stride == b_stride) and
 * A <- A \circ B (out == a, out_stride == a_stride) are well defined: every
 * output element depends only on the input elements at the same position.
 * Partially overlapping buffers are not supported.
 */
void NPomp_dmul(const size_t m,
                const size_t n,
                const double *a, const size_t a_stride,
                double *b, const size_t b_stride,
                double *out, const size_t out_stride)
{
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < m; i++) {
        #pragma omp simd
        for (size_t j = 0; j < n; j++) {
            out[i * out_stride + j] = b[i * b_stride + j] * a[i * a_stride + j];
        }
    }
}
125
+
126
/*
 * Elementwise (Hadamard) product of two m x n double complex matrices:
 *     out <- A \circ B
 *
 * Row i of each operand starts at base + i * stride (a_stride, b_stride and
 * out_stride respectively); n elements are processed per row.  Rows are
 * divided statically among OpenMP threads.
 *
 * The pointers are deliberately not restrict-qualified so that the in-place
 * forms B <- A \circ B (out == b, out_stride == b_stride) and
 * A <- A \circ B (out == a, out_stride == a_stride) are well defined: every
 * output element depends only on the input elements at the same position.
 * Partially overlapping buffers are not supported.
 */
void NPomp_zmul(const size_t m,
                const size_t n,
                const double complex *a, const size_t a_stride,
                double complex *b, const size_t b_stride,
                double complex *out, const size_t out_stride)
{
    #pragma omp parallel for schedule(static)
    for (size_t i = 0; i < m; i++) {
        #pragma omp simd
        for (size_t j = 0; j < n; j++) {
            out[i * out_stride + j] = b[i * b_stride + j] * a[i * a_stride + j];
        }
    }
}
@@ -16,6 +16,7 @@
16
16
  * Author: Qiming Sun <osirpt.sun@gmail.com>
17
17
  */
18
18
 
19
+ #include <stdlib.h>
19
20
  #include <complex.h>
20
21
 
21
22
  #define BLOCK_DIM 104
@@ -46,6 +47,13 @@ void NPztranspose(int n, int m, double complex *a, double complex *at);
46
47
  void NPdtranspose_021(int *shape, double *a, double *at);
47
48
  void NPztranspose_021(int *shape, double complex *a, double complex *at);
48
49
 
50
+ void NPomp_d_itranspose_scale(const int n, const double alpha, double *A, int lda);
51
+ void NPomp_z_itranspose_scale(const int n, const double complex *alphaptr, double complex *A, int lda);
52
+ void NPomp_dtensor_itranspose_scale021(const long long matstride, int nmat, int n, const double alpha,
53
+ double *A, int lda);
54
+ void NPomp_ztensor_itranspose_scale021(const long long matstride, int nmat, int n, const double complex *alpha,
55
+ double complex *A, int lda);
56
+
49
57
  void NPdunpack_tril_2d(int count, int n, double *tril, double *mat, int hermi);
50
58
  void NPzunpack_tril_2d(int count, int n,
51
59
  double complex *tril, double complex *mat, int hermi);
@@ -62,6 +70,24 @@ void NPzset0(double complex *p, const size_t n);
62
70
  void NPdcopy(double *out, const double *in, const size_t n);
63
71
  void NPzcopy(double complex *out, const double complex *in, const size_t n);
64
72
 
73
+ void NPomp_dset0(const size_t n, double *out);
74
+ void NPomp_zset0(const size_t n, double complex *out);
75
+
76
+ void NPomp_dcopy(const size_t m, const size_t n,
77
+ const double *in, const size_t in_stride,
78
+ double *out, const size_t out_stride);
79
+ void NPomp_zcopy(const size_t m, const size_t n,
80
+ const double complex *in, const size_t in_stride,
81
+ double complex *out, const size_t out_stride);
82
+ void NPomp_dmul(const size_t m, const size_t n,
83
+ const double *a, const size_t a_stride,
84
+ double *b, const size_t b_stride,
85
+ double *out, const size_t out_stride);
86
+ void NPomp_zmul(const size_t m, const size_t n,
87
+ const double complex *a, const size_t a_stride,
88
+ double complex *b, const size_t b_stride,
89
+ double complex *out, const size_t out_stride);
90
+
65
91
  void NPdgemm(const char trans_a, const char trans_b,
66
92
  const int m, const int n, const int k,
67
93
  const int lda, const int ldb, const int ldc,