pyscf 2.7.0__py3-none-macosx_11_0_arm64.whl → 2.9.0__py3-none-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyscf/__init__.py +1 -1
- pyscf/ao2mo/__init__.py +13 -2
- pyscf/ao2mo/_ao2mo.py +10 -1
- pyscf/ao2mo/incore.py +3 -0
- pyscf/ao2mo/nrr_outcore.py +2 -2
- pyscf/ao2mo/outcore.py +3 -3
- pyscf/ao2mo/r_outcore.py +2 -2
- pyscf/cc/__init__.py +2 -3
- pyscf/cc/ccsd.py +13 -5
- pyscf/cc/ccsd_rdm.py +6 -1
- pyscf/cc/dfccsd.py +3 -3
- pyscf/cc/dfuccsd.py +310 -0
- pyscf/cc/gccsd.py +2 -2
- pyscf/cc/rccsd.py +5 -1
- pyscf/cc/uccsd.py +36 -27
- pyscf/cc/uccsd_rdm.py +2 -2
- pyscf/df/addons.py +8 -3
- pyscf/df/autoaux.py +4 -0
- pyscf/df/df_jk.py +56 -25
- pyscf/df/grad/rhf.py +31 -1
- pyscf/df/hessian/uhf.py +2 -2
- pyscf/df/incore.py +2 -2
- pyscf/df/outcore.py +6 -6
- pyscf/dft/gks.py +25 -21
- pyscf/dft/libxc.py +31 -11
- pyscf/dft/numint.py +33 -16
- pyscf/dft/radi.py +9 -2
- pyscf/dft/rks.py +28 -24
- pyscf/dft/roks.py +7 -1
- pyscf/dft/uks.py +34 -25
- pyscf/fci/direct_spin1.py +0 -1
- pyscf/fci/fci_dhf_slow.py +15 -1
- pyscf/grad/ccsd.py +3 -7
- pyscf/grad/ccsd_slow.py +2 -3
- pyscf/grad/mp2.py +12 -3
- pyscf/grad/sacasscf.py +2 -0
- pyscf/grad/uccsd.py +3 -7
- pyscf/grad/ump2.py +2 -4
- pyscf/gto/basis/__init__.py +32 -5
- pyscf/gto/basis/def2-mtzvp.dat +4719 -0
- pyscf/gto/basis/def2-mtzvpp.dat +4739 -0
- pyscf/gto/basis/dyall-basis/__init__.py +0 -0
- pyscf/gto/basis/dyall-basis/dyall_2zp.py +6492 -0
- pyscf/gto/basis/dyall-basis/dyall_3zp.py +8343 -0
- pyscf/gto/basis/dyall-basis/dyall_4zp.py +10055 -0
- pyscf/gto/basis/dyall-basis/dyall_aae2z.py +1818 -0
- pyscf/gto/basis/dyall-basis/dyall_aae3z.py +2521 -0
- pyscf/gto/basis/dyall-basis/dyall_aae4z.py +3351 -0
- pyscf/gto/basis/dyall-basis/dyall_acv2z.py +1790 -0
- pyscf/gto/basis/dyall-basis/dyall_acv3z.py +2417 -0
- pyscf/gto/basis/dyall-basis/dyall_acv4z.py +3085 -0
- pyscf/gto/basis/dyall-basis/dyall_ae2z.py +6619 -0
- pyscf/gto/basis/dyall-basis/dyall_ae3z.py +9027 -0
- pyscf/gto/basis/dyall-basis/dyall_ae4z.py +11839 -0
- pyscf/gto/basis/dyall-basis/dyall_av2z.py +1742 -0
- pyscf/gto/basis/dyall-basis/dyall_av3z.py +2318 -0
- pyscf/gto/basis/dyall-basis/dyall_av4z.py +2905 -0
- pyscf/gto/basis/dyall-basis/dyall_cv2z.py +6558 -0
- pyscf/gto/basis/dyall-basis/dyall_cv3z.py +8767 -0
- pyscf/gto/basis/dyall-basis/dyall_cv4z.py +11098 -0
- pyscf/gto/basis/dyall-basis/dyall_v2z.py +6472 -0
- pyscf/gto/basis/dyall-basis/dyall_v3z.py +8539 -0
- pyscf/gto/basis/dyall-basis/dyall_v4z.py +10658 -0
- pyscf/gto/basis/ma-def2-qzvp.dat +5959 -0
- pyscf/gto/basis/ma-def2-qzvpp.dat +6195 -0
- pyscf/gto/basis/ma-def2-svp.dat +3504 -0
- pyscf/gto/basis/ma-def2-svpp.dat +3504 -0
- pyscf/gto/basis/ma-def2-tzvp.dat +4347 -0
- pyscf/gto/basis/ma-def2-tzvpp.dat +4549 -0
- pyscf/gto/basis/parse_cp2k.py +8 -7
- pyscf/gto/basis/parse_nwchem.py +25 -10
- pyscf/gto/eval_gto.py +1 -1
- pyscf/gto/ft_ao.py +6 -6
- pyscf/gto/mole.py +32 -35
- pyscf/gto/moleintor.py +26 -1
- pyscf/gw/rpa.py +133 -244
- pyscf/gw/urpa.py +84 -131
- pyscf/hessian/uks.py +1 -1
- pyscf/lib/CMakeLists.txt +8 -4
- pyscf/lib/config.h +0 -1
- pyscf/lib/config.h.in +0 -1
- pyscf/lib/deps/include/xc.h +28 -18
- pyscf/lib/deps/include/xc_funcs.h +50 -2
- pyscf/lib/deps/include/xc_version.h +3 -3
- pyscf/lib/deps/lib/libcint.6.dylib +0 -0
- pyscf/lib/deps/lib/{libxc.12.dylib → libxc.15.dylib} +0 -0
- pyscf/lib/deps/lib/libxcfun.2.dylib +0 -0
- pyscf/lib/dft/libxc_itrf.c +25 -21
- pyscf/lib/dft/nr_numint_sparse.c +3 -3
- pyscf/lib/diis.py +1 -1
- pyscf/lib/exceptions.py +3 -0
- pyscf/lib/libagf2.dylib +0 -0
- pyscf/lib/libao2mo.dylib +0 -0
- pyscf/lib/libcc.dylib +0 -0
- pyscf/lib/libcgto.dylib +0 -0
- pyscf/lib/libcvhf.dylib +0 -0
- pyscf/lib/libdft.dylib +0 -0
- pyscf/lib/libfci.dylib +0 -0
- pyscf/lib/libmcscf.dylib +0 -0
- pyscf/lib/libmp.dylib +0 -0
- pyscf/lib/libnp_helper.dylib +0 -0
- pyscf/lib/libpbc.dylib +0 -0
- pyscf/lib/libri.dylib +0 -0
- pyscf/lib/libxc_itrf.dylib +0 -0
- pyscf/lib/libxcfun_itrf.dylib +0 -0
- pyscf/lib/linalg_helper.py +5 -6
- pyscf/lib/logger.py +2 -1
- pyscf/lib/mcscf/fci_contract.c +8 -1
- pyscf/lib/misc.py +16 -8
- pyscf/lib/mp/CMakeLists.txt +22 -0
- pyscf/lib/mp/mp2.c +518 -0
- pyscf/lib/mp/mp2.h +44 -0
- pyscf/lib/np_helper/CMakeLists.txt +1 -1
- pyscf/lib/np_helper/imatcopy.c +360 -0
- pyscf/lib/np_helper/np_helper.c +94 -0
- pyscf/lib/np_helper/np_helper.h +26 -0
- pyscf/lib/numpy_helper.py +194 -10
- pyscf/lib/pbc/nr_direct.c +2 -7
- pyscf/lib/vhf/fblas.h +3 -0
- pyscf/lib/vhf/nr_sr_vhf.c +8 -12
- pyscf/lib/vhf/rkb_screen.c +139 -0
- pyscf/mcscf/__init__.py +1 -1
- pyscf/mcscf/casci.py +7 -3
- pyscf/mcscf/chkfile.py +2 -3
- pyscf/mcscf/mc1step.py +12 -8
- pyscf/mcscf/newton_casscf.py +1 -1
- pyscf/mcscf/umc1step.py +5 -3
- pyscf/mp/__init__.py +2 -2
- pyscf/mp/dfmp2.py +498 -59
- pyscf/mp/dfmp2_native.py +11 -1
- pyscf/mp/dfmp2_slow.py +133 -0
- pyscf/mp/dfump2.py +672 -0
- pyscf/mp/dfump2_native.py +9 -0
- pyscf/mp/dfump2_slow.py +161 -0
- pyscf/mp/gmp2.py +6 -47
- pyscf/mp/mp2.py +19 -5
- pyscf/mp/ump2.py +23 -18
- pyscf/mrpt/nevpt2.py +11 -0
- pyscf/pbc/df/aft.py +9 -7
- pyscf/pbc/df/df.py +5 -6
- pyscf/pbc/df/df_jk.py +12 -6
- pyscf/pbc/df/fft.py +3 -3
- pyscf/pbc/df/fft_jk.py +7 -7
- pyscf/pbc/df/incore.py +1 -1
- pyscf/pbc/df/mdf_jk.py +2 -1
- pyscf/pbc/df/outcore.py +10 -10
- pyscf/pbc/df/rsdf.py +1 -0
- pyscf/pbc/df/rsdf_builder.py +3 -3
- pyscf/pbc/df/rsdf_helper.py +5 -5
- pyscf/pbc/df/rsdf_jk.py +2 -1
- pyscf/pbc/dft/gen_grid.py +3 -2
- pyscf/pbc/dft/gks.py +14 -3
- pyscf/pbc/dft/kgks.py +15 -4
- pyscf/pbc/dft/krks.py +28 -10
- pyscf/pbc/dft/krks_ksymm.py +21 -9
- pyscf/pbc/dft/krkspu.py +1 -30
- pyscf/pbc/dft/krkspu_ksymm.py +0 -30
- pyscf/pbc/dft/kuks.py +30 -13
- pyscf/pbc/dft/kuks_ksymm.py +22 -10
- pyscf/pbc/dft/kukspu.py +0 -27
- pyscf/pbc/dft/kukspu_ksymm.py +0 -30
- pyscf/pbc/dft/multigrid/multigrid.py +17 -7
- pyscf/pbc/dft/multigrid/multigrid_pair.py +6 -1
- pyscf/pbc/dft/numint.py +26 -10
- pyscf/pbc/dft/rks.py +20 -26
- pyscf/pbc/dft/uks.py +21 -4
- pyscf/pbc/gto/_pbcintor.py +1 -0
- pyscf/pbc/gto/cell.py +170 -5
- pyscf/pbc/gto/eval_gto.py +1 -1
- pyscf/pbc/gto/neighborlist.py +4 -1
- pyscf/pbc/mpitools/mpi.py +0 -1
- pyscf/pbc/scf/_response_functions.py +141 -34
- pyscf/pbc/scf/hf.py +13 -10
- pyscf/pbc/scf/khf.py +32 -3
- pyscf/pbc/scf/khf_ksymm.py +15 -1
- pyscf/pbc/scf/kuhf.py +1 -1
- pyscf/pbc/scf/kuhf_ksymm.py +1 -1
- pyscf/pbc/scf/rsjk.py +1 -1
- pyscf/pbc/scf/stability.py +26 -14
- pyscf/pbc/tdscf/krhf.py +58 -56
- pyscf/pbc/tdscf/kuhf.py +273 -78
- pyscf/pbc/tdscf/rhf.py +17 -12
- pyscf/pbc/tdscf/uhf.py +46 -35
- pyscf/pbc/tools/k2gamma.py +15 -3
- pyscf/pbc/tools/lattice.py +3 -3
- pyscf/pbc/tools/pbc.py +48 -35
- pyscf/pbc/x2c/sfx2c1e.py +5 -0
- pyscf/scf/_response_functions.py +85 -44
- pyscf/scf/_vhf.py +1 -0
- pyscf/scf/addons.py +21 -2
- pyscf/scf/dhf.py +82 -28
- pyscf/scf/dispersion.py +1 -1
- pyscf/scf/hf.py +19 -3
- pyscf/scf/uhf.py +9 -3
- pyscf/solvent/__init__.py +2 -2
- pyscf/solvent/_attach_solvent.py +2 -0
- pyscf/solvent/cosmors.py +378 -0
- pyscf/solvent/grad/pcm.py +75 -19
- pyscf/solvent/hessian/pcm.py +957 -108
- pyscf/solvent/hessian/smd.py +7 -43
- pyscf/solvent/pcm.py +4 -4
- pyscf/solvent/smd.py +5 -3
- pyscf/soscf/ciah.py +2 -10
- pyscf/soscf/newton_ah.py +4 -1
- pyscf/symm/geom.py +58 -13
- pyscf/tdscf/_lr_eig.py +561 -57
- pyscf/tdscf/dhf.py +58 -65
- pyscf/tdscf/ghf.py +63 -71
- pyscf/tdscf/gks.py +12 -10
- pyscf/tdscf/rhf.py +68 -68
- pyscf/tdscf/rks.py +12 -9
- pyscf/tdscf/uhf.py +59 -58
- pyscf/tdscf/uks.py +15 -13
- pyscf/tools/fcidump.py +36 -9
- pyscf/tools/finite_diff.py +175 -0
- pyscf/tools/qcschema.py +265 -0
- pyscf/x2c/tdscf.py +37 -37
- pyscf/x2c/x2c.py +101 -34
- {pyscf-2.7.0.dist-info → pyscf-2.9.0.dist-info}/METADATA +30 -26
- {pyscf-2.7.0.dist-info → pyscf-2.9.0.dist-info}/RECORD +224 -192
- {pyscf-2.7.0.dist-info → pyscf-2.9.0.dist-info}/WHEEL +2 -1
- {pyscf-2.7.0.dist-info → pyscf-2.9.0.dist-info/licenses}/NOTICE +13 -0
- pyscf/pbc/tdscf/kproxy.py +0 -189
- pyscf/pbc/tdscf/kproxy_supercell.py +0 -664
- pyscf/pbc/tdscf/krhf_slow.py +0 -300
- pyscf/pbc/tdscf/krhf_slow_gamma.py +0 -175
- pyscf/pbc/tdscf/krhf_slow_supercell.py +0 -250
- pyscf/pbc/tdscf/proxy.py +0 -39
- pyscf/pbc/tdscf/rhf_slow.py +0 -35
- pyscf/tdscf/common_slow.py +0 -799
- pyscf/tdscf/proxy.py +0 -258
- pyscf/tdscf/rhf_slow.py +0 -181
- {pyscf-2.7.0.dist-info → pyscf-2.9.0.dist-info/licenses}/LICENSE +0 -0
- {pyscf-2.7.0.dist-info → pyscf-2.9.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
/* Copyright 2014-2018 The PySCF Developers. All Rights Reserved.
|
|
2
|
+
|
|
3
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
you may not use this file except in compliance with the License.
|
|
5
|
+
You may obtain a copy of the License at
|
|
6
|
+
|
|
7
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
|
|
9
|
+
Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
See the License for the specific language governing permissions and
|
|
13
|
+
limitations under the License.
|
|
14
|
+
|
|
15
|
+
*
|
|
16
|
+
* Author: Christopher Hillenbrand <chillenbrand15@gmail.com>
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
#include <complex.h>
|
|
20
|
+
#include <math.h>
|
|
21
|
+
#include "np_helper.h"
|
|
22
|
+
|
|
23
|
+
const int TILESIZE = 32;  /* tile edge length (in elements) used by the double-precision transpose kernels */
|
|
24
|
+
const int TILESIZE_CPLX = 16;  /* tile edge length (in elements) used by the double complex transpose kernels */
|
|
25
|
+
|
|
26
|
+
/*
|
|
27
|
+
* Calculate the largest integer i such that
|
|
28
|
+
* i * (i - 1) / 2 <= ijouter.
|
|
29
|
+
*/
|
|
30
|
+
static inline int uncollapse_loop_index(const long long ijouter)
|
|
31
|
+
{
|
|
32
|
+
return (int) floor((sqrt(0.25 + 2.0 * ijouter) + 0.5));
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
static inline void dtranspose_scale_tile_offdiag(double *A, const int ii,
|
|
36
|
+
const int jj,
|
|
37
|
+
const size_t lda_w,
|
|
38
|
+
const double alpha) {
|
|
39
|
+
for (int j = jj; j < jj + TILESIZE; j++) {
|
|
40
|
+
#pragma omp simd
|
|
41
|
+
for(int i = ii; i < ii + TILESIZE; i++) {
|
|
42
|
+
const double tmp = A[i * lda_w + j];
|
|
43
|
+
A[i * lda_w + j] = alpha * A[j * lda_w + i];
|
|
44
|
+
A[j * lda_w + i] = alpha * tmp;
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
static inline void dtranspose_scale_tile_diag(double *A, const int ii,
|
|
50
|
+
const int jj, const size_t lda_w,
|
|
51
|
+
const double alpha) {
|
|
52
|
+
for (int j = jj; j < jj + TILESIZE; j++) {
|
|
53
|
+
#pragma omp simd
|
|
54
|
+
for(int i = ii; i < j; i++) {
|
|
55
|
+
const double tmp = A[i * lda_w + j];
|
|
56
|
+
A[i * lda_w + j] = alpha * A[j * lda_w + i];
|
|
57
|
+
A[j * lda_w + i]= alpha * tmp;
|
|
58
|
+
}
|
|
59
|
+
A[j * lda_w + j] *= alpha;
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
static inline void dtranspose_tile_diag(double *A, const int ii, const int jj,
|
|
64
|
+
const size_t lda_w) {
|
|
65
|
+
for (int j = jj; j < jj + TILESIZE; j++) {
|
|
66
|
+
#pragma omp simd
|
|
67
|
+
for(int i = ii; i < j; i++) {
|
|
68
|
+
const double tmp = A[i * lda_w + j];
|
|
69
|
+
A[i * lda_w + j] = A[j * lda_w + i];
|
|
70
|
+
A[j * lda_w + i] = tmp;
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
static inline void ztranspose_scale_tile_offdiag(double complex *A,
|
|
76
|
+
const int ii, const int jj,
|
|
77
|
+
const size_t lda_w,
|
|
78
|
+
const double complex alpha) {
|
|
79
|
+
for (int j = jj; j < jj + TILESIZE_CPLX; j++) {
|
|
80
|
+
#pragma omp simd
|
|
81
|
+
for(int i = ii; i < ii + TILESIZE_CPLX; i++) {
|
|
82
|
+
const double complex tmp = A[i * lda_w + j];
|
|
83
|
+
A[i * lda_w + j] = alpha * A[j * lda_w + i];
|
|
84
|
+
A[j * lda_w + i] = alpha * tmp;
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
static inline void ztranspose_scale_tile_diag(double complex *A, const int ii,
|
|
90
|
+
const int jj, const size_t lda_w,
|
|
91
|
+
const double complex alpha) {
|
|
92
|
+
for (int j = jj; j < jj + TILESIZE_CPLX; j++) {
|
|
93
|
+
#pragma omp simd
|
|
94
|
+
for(int i = ii; i < j; i++) {
|
|
95
|
+
const double complex tmp = A[i * lda_w + j];
|
|
96
|
+
A[i * lda_w + j] = alpha * A[j * lda_w + i];
|
|
97
|
+
A[j * lda_w + i] = alpha * tmp;
|
|
98
|
+
}
|
|
99
|
+
A[j * lda_w + j] *= alpha;
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
static inline void ztranspose_tile_diag(double complex *A, const int ii,
|
|
104
|
+
const int jj, const size_t lda_w) {
|
|
105
|
+
for (int j = jj; j < jj + TILESIZE_CPLX; j++) {
|
|
106
|
+
#pragma omp simd
|
|
107
|
+
for(int i = ii; i < j; i++) {
|
|
108
|
+
const double complex tmp = A[i * lda_w + j];
|
|
109
|
+
A[i * lda_w + j] = A[j * lda_w + i];
|
|
110
|
+
A[j * lda_w + i] = tmp;
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
/*
|
|
116
|
+
* In-place parallel matrix transpose, double version.
|
|
117
|
+
* See https://colfaxresearch.com/multithreaded-transposition-of-square-matrices-with-common-code-for-intel-xeon-processors-and-intel-xeon-phi-coprocessors/
|
|
118
|
+
*/
|
|
119
|
+
void NPomp_d_itranspose_scale(const int n, const double alpha, double *A, int lda)
|
|
120
|
+
{
|
|
121
|
+
const int nclean = n - n % TILESIZE;
|
|
122
|
+
const int ntiles = nclean / TILESIZE;
|
|
123
|
+
const size_t lda_w = (size_t) lda;
|
|
124
|
+
|
|
125
|
+
#pragma omp parallel
|
|
126
|
+
{
|
|
127
|
+
|
|
128
|
+
/*
|
|
129
|
+
* ---------------------------
|
|
130
|
+
* | ****************** |
|
|
131
|
+
* | ****************** |
|
|
132
|
+
* | ****************** |
|
|
133
|
+
* | ****** ************ |
|
|
134
|
+
* | ****** ************ |
|
|
135
|
+
* | ****** ************ |
|
|
136
|
+
* | ************ ****** |
|
|
137
|
+
* | ************ ****** |
|
|
138
|
+
* | ************ ****** |
|
|
139
|
+
* | ****************** |
|
|
140
|
+
* | ****************** |
|
|
141
|
+
* | ****************** |
|
|
142
|
+
* | |
|
|
143
|
+
* ----------------------------
|
|
144
|
+
*/
|
|
145
|
+
|
|
146
|
+
/*
|
|
147
|
+
* The following loop nest is equivalent to:
|
|
148
|
+
* for(int iouter = 1; iouter < ntiles; iouter++)
|
|
149
|
+
* for(int jouter = 0; jouter < iouter; jouter++)
|
|
150
|
+
*
|
|
151
|
+
* See 10.1109/IPDPS.2017.34.
|
|
152
|
+
*/
|
|
153
|
+
int first_iteration = 1;
|
|
154
|
+
int iouter, jouter;
|
|
155
|
+
#pragma omp for schedule(static) nowait
|
|
156
|
+
for(long long ijouter = 0; ijouter < (ntiles*(ntiles-1))/2; ijouter++) {
|
|
157
|
+
if(first_iteration) {
|
|
158
|
+
iouter = uncollapse_loop_index(ijouter);
|
|
159
|
+
jouter = ijouter - iouter * (iouter - 1) / 2;
|
|
160
|
+
first_iteration = 0;
|
|
161
|
+
} else {
|
|
162
|
+
jouter++;
|
|
163
|
+
if(jouter == iouter) {
|
|
164
|
+
iouter++;
|
|
165
|
+
jouter = 0;
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
dtranspose_scale_tile_offdiag(A, iouter * TILESIZE, jouter * TILESIZE, lda_w, alpha);
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
/*
|
|
173
|
+
* ---------------------------
|
|
174
|
+
* | ****** |
|
|
175
|
+
* | ****** |
|
|
176
|
+
* | ****** |
|
|
177
|
+
* | ****** |
|
|
178
|
+
* | ****** |
|
|
179
|
+
* | ****** |
|
|
180
|
+
* | ****** |
|
|
181
|
+
* | ****** |
|
|
182
|
+
* | ****** |
|
|
183
|
+
* | ****** |
|
|
184
|
+
* | ****** |
|
|
185
|
+
* | ****** |
|
|
186
|
+
* | |
|
|
187
|
+
* ----------------------------
|
|
188
|
+
*/
|
|
189
|
+
|
|
190
|
+
if(alpha != 1.0) {
|
|
191
|
+
#pragma omp for schedule(static) nowait
|
|
192
|
+
for(int ii = 0; ii < nclean; ii+=TILESIZE) {
|
|
193
|
+
dtranspose_scale_tile_diag(A, ii, ii, lda_w, alpha);
|
|
194
|
+
}
|
|
195
|
+
} else {
|
|
196
|
+
#pragma omp for schedule(static) nowait
|
|
197
|
+
for(int ii = 0; ii < nclean; ii+=TILESIZE) {
|
|
198
|
+
dtranspose_tile_diag(A, ii, ii, lda_w);
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
/*
|
|
204
|
+
* --------------------------
|
|
205
|
+
* | ***|
|
|
206
|
+
* | ***|
|
|
207
|
+
* | ***|
|
|
208
|
+
* | ***|
|
|
209
|
+
* | ***|
|
|
210
|
+
* | ***|
|
|
211
|
+
* | ***|
|
|
212
|
+
* | ***|
|
|
213
|
+
* | ***|
|
|
214
|
+
* | ***|
|
|
215
|
+
* | *********************** |
|
|
216
|
+
* | *********************** |
|
|
217
|
+
* ---------------------------
|
|
218
|
+
*/
|
|
219
|
+
|
|
220
|
+
#pragma omp for schedule(static) nowait
|
|
221
|
+
for(int j = 0; j < nclean; j++) {
|
|
222
|
+
for(int i = nclean; i < n; i++) {
|
|
223
|
+
const double tmp = A[i * lda_w + j];
|
|
224
|
+
A[i * lda_w + j] = alpha * A[j * lda_w + i];
|
|
225
|
+
A[j * lda_w + i] = alpha * tmp;
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
} // end parallel region
|
|
229
|
+
|
|
230
|
+
/*
|
|
231
|
+
* --------------------------
|
|
232
|
+
* | |
|
|
233
|
+
* | |
|
|
234
|
+
* | |
|
|
235
|
+
* | |
|
|
236
|
+
* | |
|
|
237
|
+
* | |
|
|
238
|
+
* | |
|
|
239
|
+
* | |
|
|
240
|
+
* | |
|
|
241
|
+
* | |
|
|
242
|
+
* | ***|
|
|
243
|
+
* | ***|
|
|
244
|
+
* ---------------------------
|
|
245
|
+
*/
|
|
246
|
+
|
|
247
|
+
for(int j = nclean; j < n; j++) {
|
|
248
|
+
for(int i = nclean; i < j; i++) {
|
|
249
|
+
const double tmp = A[i * lda_w + j];
|
|
250
|
+
A[i * lda_w + j] = alpha * A[j * lda_w + i];
|
|
251
|
+
A[j * lda_w + i] = alpha * tmp;
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
|
|
255
|
+
if(alpha != 1.0) {
|
|
256
|
+
for(int i = nclean; i < n; i++) {
|
|
257
|
+
A[i * lda_w + i] *= alpha;
|
|
258
|
+
}
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
/*
|
|
264
|
+
* In-place parallel matrix transpose, double complex version.
|
|
265
|
+
* See https://colfaxresearch.com/multithreaded-transposition-of-square-matrices-with-common-code-for-intel-xeon-processors-and-intel-xeon-phi-coprocessors/
|
|
266
|
+
*/
|
|
267
|
+
void NPomp_z_itranspose_scale(const int n, const double complex *alphaptr, double complex *A, int lda)
|
|
268
|
+
{
|
|
269
|
+
const double complex alpha = *alphaptr;
|
|
270
|
+
const int nclean = n - n % TILESIZE_CPLX;
|
|
271
|
+
const int ntiles = nclean / TILESIZE_CPLX;
|
|
272
|
+
const size_t lda_w = (size_t) lda;
|
|
273
|
+
|
|
274
|
+
#pragma omp parallel
|
|
275
|
+
{
|
|
276
|
+
|
|
277
|
+
/*
|
|
278
|
+
* The following loop nest is equivalent to:
|
|
279
|
+
* for(int iouter = 1; iouter < ntiles; iouter++)
|
|
280
|
+
* for(int jouter = 0; jouter < iouter; jouter++)
|
|
281
|
+
*
|
|
282
|
+
* See 10.1109/IPDPS.2017.34.
|
|
283
|
+
*/
|
|
284
|
+
int first_iteration = 1;
|
|
285
|
+
int iouter, jouter;
|
|
286
|
+
#pragma omp for schedule(static) nowait
|
|
287
|
+
for(long long ijouter = 0; ijouter < (ntiles*(ntiles-1))/2; ijouter++) {
|
|
288
|
+
if(first_iteration) {
|
|
289
|
+
iouter = uncollapse_loop_index(ijouter);
|
|
290
|
+
jouter = ijouter - iouter * (iouter - 1) / 2;
|
|
291
|
+
first_iteration = 0;
|
|
292
|
+
} else {
|
|
293
|
+
jouter++;
|
|
294
|
+
if(jouter == iouter) {
|
|
295
|
+
iouter++;
|
|
296
|
+
jouter = 0;
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
ztranspose_scale_tile_offdiag(A, iouter * TILESIZE_CPLX, jouter * TILESIZE_CPLX, lda_w, alpha);
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
if(alpha != 1.0) {
|
|
303
|
+
#pragma omp for schedule(static) nowait
|
|
304
|
+
for(int ii = 0; ii < nclean; ii+=TILESIZE_CPLX) {
|
|
305
|
+
ztranspose_scale_tile_diag(A, ii, ii, lda_w, alpha);
|
|
306
|
+
}
|
|
307
|
+
} else {
|
|
308
|
+
#pragma omp for schedule(static) nowait
|
|
309
|
+
for(int ii = 0; ii < nclean; ii+=TILESIZE_CPLX) {
|
|
310
|
+
ztranspose_tile_diag(A, ii, ii, lda_w);
|
|
311
|
+
}
|
|
312
|
+
}
|
|
313
|
+
|
|
314
|
+
#pragma omp for schedule(static) nowait
|
|
315
|
+
for(int j = 0; j < nclean; j++) {
|
|
316
|
+
for(int i = nclean; i < n; i++) {
|
|
317
|
+
const double complex tmp = A[i * lda_w + j];
|
|
318
|
+
A[i * lda_w + j] = alpha * A[j * lda_w + i];
|
|
319
|
+
A[j * lda_w + i] = alpha * tmp;
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
} // end parallel region
|
|
324
|
+
|
|
325
|
+
for(int j = nclean; j < n; j++) {
|
|
326
|
+
for(int i = nclean; i < j; i++) {
|
|
327
|
+
const double complex tmp = A[i * lda_w + j];
|
|
328
|
+
A[i * lda_w + j] = alpha * A[j * lda_w + i];
|
|
329
|
+
A[j * lda_w + i] = alpha * tmp;
|
|
330
|
+
}
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
if(alpha != 1.0) {
|
|
334
|
+
for(int i = nclean; i < n; i++) {
|
|
335
|
+
A[i * lda_w + i] *= alpha;
|
|
336
|
+
}
|
|
337
|
+
}
|
|
338
|
+
}
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
/*
|
|
343
|
+
* Batched versions for 3D tensors
|
|
344
|
+
*/
|
|
345
|
+
|
|
346
|
+
/*
 * Apply NPomp_d_itranspose_scale to each of nmat matrices stored in A.
 * Matrix k starts at A + k * matstride (offset in elements); n, alpha and
 * lda are forwarded unchanged. The matrices are processed one after
 * another in this function.
 */
void NPomp_dtensor_itranspose_scale021(const long long matstride, int nmat, int n, const double alpha,
                                       double *A, int lda)
{
    int k = 0;
    while (k < nmat) {
        double *slab = A + k * matstride;
        NPomp_d_itranspose_scale(n, alpha, slab, lda);
        k++;
    }
}
|
|
353
|
+
|
|
354
|
+
/*
 * Apply NPomp_z_itranspose_scale to each of nmat complex matrices stored
 * in A. Matrix k starts at A + k * matstride (offset in elements); n, the
 * alpha pointer and lda are forwarded unchanged. The matrices are processed
 * one after another in this function.
 */
void NPomp_ztensor_itranspose_scale021(const long long matstride, int nmat, int n, const double complex *alpha,
                                       double complex *A, int lda)
{
    int k = 0;
    while (k < nmat) {
        double complex *slab = A + k * matstride;
        NPomp_z_itranspose_scale(n, alpha, slab, lda);
        k++;
    }
}
|
pyscf/lib/np_helper/np_helper.c
CHANGED
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
*/
|
|
15
15
|
|
|
16
16
|
#include <stdlib.h>
|
|
17
|
+
#include <complex.h>
|
|
17
18
|
#include "np_helper/np_helper.h"
|
|
18
19
|
|
|
19
20
|
void NPdset0(double *p, const size_t n)
|
|
@@ -47,3 +48,96 @@ void NPzcopy(double complex *out, const double complex *in, const size_t n)
|
|
|
47
48
|
out[i] = in[i];
|
|
48
49
|
}
|
|
49
50
|
}
|
|
51
|
+
|
|
52
|
+
/*
|
|
53
|
+
* These are mostly useful for first-touch array allocation on NUMA systems.
|
|
54
|
+
* Use with numpy.empty.
|
|
55
|
+
*/
|
|
56
|
+
void NPomp_dset0(const size_t n, double *out)
{
    /* Zero the first n doubles of out; iterations are split statically
     * across the OpenMP team. */
#pragma omp parallel
    {
#pragma omp for schedule(static)
        for (size_t idx = 0; idx < n; idx++) {
            out[idx] = 0.0;
        }
    }
}
|
|
63
|
+
|
|
64
|
+
void NPomp_zset0(const size_t n, double complex *out)
{
    /* Zero the first n complex doubles of out; iterations are split
     * statically across the OpenMP team. */
#pragma omp parallel
    {
#pragma omp for schedule(static)
        for (size_t idx = 0; idx < n; idx++) {
            out[idx] = 0.0;
        }
    }
}
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
/*
 * Multithreaded copy of an m x n double precision matrix.
 * Row i of the source starts at in + i * in_stride and row i of the
 * destination at out + i * out_stride (strides in elements); only the
 * first n entries of each row are copied. The buffers are declared
 * __restrict, i.e. they are expected not to overlap.
 */
void NPomp_dcopy(const size_t m,
                 const size_t n,
                 const double *__restrict in, const size_t in_stride,
                 double *__restrict out, const size_t out_stride)
{
#pragma omp parallel for schedule(static)
    for (size_t row = 0; row < m; row++) {
        const size_t src_base = row * in_stride;
        const size_t dst_base = row * out_stride;
#pragma omp simd
        for (size_t col = 0; col < n; col++) {
            out[dst_base + col] = in[src_base + col];
        }
    }
}
|
|
89
|
+
|
|
90
|
+
/*
 * Multithreaded copy of an m x n complex double precision matrix.
 * Row i of the source starts at in + i * in_stride and row i of the
 * destination at out + i * out_stride (strides in elements); only the
 * first n entries of each row are copied. The buffers are declared
 * __restrict, i.e. they are expected not to overlap.
 */
void NPomp_zcopy(const size_t m,
                 const size_t n,
                 const double complex *__restrict in, const size_t in_stride,
                 double complex *__restrict out, const size_t out_stride)
{
#pragma omp parallel for schedule(static)
    for (size_t row = 0; row < m; row++) {
        const size_t src_base = row * in_stride;
        const size_t dst_base = row * out_stride;
#pragma omp simd
        for (size_t col = 0; col < n; col++) {
            out[dst_base + col] = in[src_base + col];
        }
    }
}
|
|
106
|
+
|
|
107
|
+
/*
 * Elementwise (Hadamard) product of two m x n double matrices:
 *     out <- A \circ B
 * Each operand has its own row stride, given in elements. Neither a nor b
 * is written by this routine; the result goes to out. All three buffers
 * are declared __restrict, i.e. they are expected not to overlap.
 */
void NPomp_dmul(const size_t m,
                const size_t n,
                const double *__restrict a, const size_t a_stride,
                double *__restrict b, const size_t b_stride,
                double *__restrict out, const size_t out_stride)
{
#pragma omp parallel for schedule(static)
    for (size_t row = 0; row < m; row++) {
        const size_t a_base = row * a_stride;
        const size_t b_base = row * b_stride;
        const size_t out_base = row * out_stride;
#pragma omp simd
        for (size_t col = 0; col < n; col++) {
            out[out_base + col] = b[b_base + col] * a[a_base + col];
        }
    }
}
|
|
125
|
+
|
|
126
|
+
/*
 * Elementwise (Hadamard) product of two m x n complex double matrices:
 *     out <- A \circ B      (no conjugation)
 * Each operand has its own row stride, given in elements. Neither a nor b
 * is written by this routine; the result goes to out. All three buffers
 * are declared __restrict, i.e. they are expected not to overlap.
 */
void NPomp_zmul(const size_t m,
                const size_t n,
                const double complex *__restrict a, const size_t a_stride,
                double complex *__restrict b, const size_t b_stride,
                double complex *__restrict out, const size_t out_stride)
{
#pragma omp parallel for schedule(static)
    for (size_t row = 0; row < m; row++) {
        const size_t a_base = row * a_stride;
        const size_t b_base = row * b_stride;
        const size_t out_base = row * out_stride;
#pragma omp simd
        for (size_t col = 0; col < n; col++) {
            out[out_base + col] = b[b_base + col] * a[a_base + col];
        }
    }
}
|
pyscf/lib/np_helper/np_helper.h
CHANGED
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
* Author: Qiming Sun <osirpt.sun@gmail.com>
|
|
17
17
|
*/
|
|
18
18
|
|
|
19
|
+
#include <stdlib.h>
|
|
19
20
|
#include <complex.h>
|
|
20
21
|
|
|
21
22
|
#define BLOCK_DIM 104
|
|
@@ -46,6 +47,13 @@ void NPztranspose(int n, int m, double complex *a, double complex *at);
|
|
|
46
47
|
void NPdtranspose_021(int *shape, double *a, double *at);
|
|
47
48
|
void NPztranspose_021(int *shape, double complex *a, double complex *at);
|
|
48
49
|
|
|
50
|
+
void NPomp_d_itranspose_scale(const int n, const double alpha, double *A, int lda);
|
|
51
|
+
void NPomp_z_itranspose_scale(const int n, const double complex *alphaptr, double complex *A, int lda);
|
|
52
|
+
void NPomp_dtensor_itranspose_scale021(const long long matstride, int nmat, int n, const double alpha,
|
|
53
|
+
double *A, int lda);
|
|
54
|
+
void NPomp_ztensor_itranspose_scale021(const long long matstride, int nmat, int n, const double complex *alpha,
|
|
55
|
+
double complex *A, int lda);
|
|
56
|
+
|
|
49
57
|
void NPdunpack_tril_2d(int count, int n, double *tril, double *mat, int hermi);
|
|
50
58
|
void NPzunpack_tril_2d(int count, int n,
|
|
51
59
|
double complex *tril, double complex *mat, int hermi);
|
|
@@ -62,6 +70,24 @@ void NPzset0(double complex *p, const size_t n);
|
|
|
62
70
|
void NPdcopy(double *out, const double *in, const size_t n);
|
|
63
71
|
void NPzcopy(double complex *out, const double complex *in, const size_t n);
|
|
64
72
|
|
|
73
|
+
void NPomp_dset0(const size_t n, double *out);
|
|
74
|
+
void NPomp_zset0(const size_t n, double complex *out);
|
|
75
|
+
|
|
76
|
+
void NPomp_dcopy(const size_t m, const size_t n,
|
|
77
|
+
const double *in, const size_t in_stride,
|
|
78
|
+
double *out, const size_t out_stride);
|
|
79
|
+
void NPomp_zcopy(const size_t m, const size_t n,
|
|
80
|
+
const double complex *in, const size_t in_stride,
|
|
81
|
+
double complex *out, const size_t out_stride);
|
|
82
|
+
void NPomp_dmul(const size_t m, const size_t n,
|
|
83
|
+
const double *a, const size_t a_stride,
|
|
84
|
+
double *b, const size_t b_stride,
|
|
85
|
+
double *out, const size_t out_stride);
|
|
86
|
+
void NPomp_zmul(const size_t m, const size_t n,
|
|
87
|
+
const double complex *a, const size_t a_stride,
|
|
88
|
+
double complex *b, const size_t b_stride,
|
|
89
|
+
double complex *out, const size_t out_stride);
|
|
90
|
+
|
|
65
91
|
void NPdgemm(const char trans_a, const char trans_b,
|
|
66
92
|
const int m, const int n, const int k,
|
|
67
93
|
const int lda, const int ldb, const int ldc,
|