fastlisaresponse 1.0.9__cp312-cp312-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,919 @@
+ #include "stdio.h"
+ #include "cuda_complex.hpp"
+ #include "LISAResponse.hh"
+ #include <iostream>
+
+ #ifdef __CUDACC__
+ #define CUDA_CALLABLE_MEMBER __device__
+ #define CUDA_KERNEL __global__
+ #define CUDA_SHARED __shared__
+ #define CUDA_SYNC_THREADS __syncthreads()
+ #else
+ #define CUDA_CALLABLE_MEMBER
+ #define CUDA_KERNEL
+ #define CUDA_SHARED
+ #define CUDA_SYNC_THREADS
+ #endif
+
+ #ifdef __CUDACC__
+ #define gpuErrchk(ans)                         \
+     {                                          \
+         gpuAssert2((ans), __FILE__, __LINE__); \
+     }
+ inline void gpuAssert2(cudaError_t code, const char *file, int line, bool abort = true)
+ {
+     if (code != cudaSuccess)
+     {
+         fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
+         if (abort)
+             exit(code);
+     }
+ }
+
+ #endif
+
+ CUDA_CALLABLE_MEMBER
+ void get_basis_vecs(double lam, double beta, double u[], double v[], double k[])
+ {
+     long i;
+
+     double cosbeta, sinbeta, coslam, sinlam;
+
+     for (i = 0; i < 3; i++)
+     {
+         u[i] = 0.;
+         v[i] = 0.;
+         k[i] = 0.;
+     }
+
+     cosbeta = cos(beta);
+     sinbeta = sin(beta);
+
+     coslam = cos(lam);
+     sinlam = sin(lam);
+
+     u[0] = -sinbeta * coslam;
+     u[1] = -sinbeta * sinlam;
+     u[2] = cosbeta;
+     v[0] = sinlam;
+     v[1] = -coslam;
+     v[2] = 0.;
+     k[0] = -cosbeta * coslam;
+     k[1] = -cosbeta * sinlam;
+     k[2] = -sinbeta;
+
+     return;
+ }
+
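A quick way to sanity-check the geometry above is to confirm that (u, v, k) come back as an orthonormal triad: k is the propagation direction from the source toward the origin, and (u, v) span the polarization plane. The host-side sketch below is illustrative only and is not part of the package; it assumes it is compiled together with the source above so that get_basis_vecs is visible, and the names dot3 and check_basis are invented here.

    #include <cmath>
    #include <cstdio>

    // defined in the source above
    void get_basis_vecs(double lam, double beta, double u[], double v[], double k[]);

    static double dot3(const double a[3], const double b[3])
    {
        return a[0] * b[0] + a[1] * b[1] + a[2] * b[2];
    }

    void check_basis()
    {
        double u[3], v[3], k[3];
        get_basis_vecs(1.0 /* lam */, 0.5 /* beta */, u, v, k);

        // unit norms: all three values should print as 1.0
        printf("|u| = %f  |v| = %f  |k| = %f\n",
               sqrt(dot3(u, u)), sqrt(dot3(v, v)), sqrt(dot3(k, k)));
        // mutual orthogonality: all three dot products should be ~0
        printf("u.v = %e  u.k = %e  v.k = %e\n",
               dot3(u, v), dot3(u, k), dot3(v, k));
    }
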
+ CUDA_CALLABLE_MEMBER
+ double dot_product_1d(double *arr1, double *arr2)
+ {
+     double out = 0.0;
+     for (int i = 0; i < 3; i++)
+     {
+         out += arr1[i] * arr2[i];
+     }
+     return out;
+ }
+
+ CUDA_CALLABLE_MEMBER
+ void xi_projections(double *xi_p, double *xi_c, double *u, double *v, double *n)
+ {
+     double u_dot_n = dot_product_1d(u, n);
+     double v_dot_n = dot_product_1d(v, n);
+
+     *xi_p = 0.5 * ((u_dot_n * u_dot_n) - (v_dot_n * v_dot_n));
+     *xi_c = u_dot_n * v_dot_n;
+ }
+
+ CUDA_CALLABLE_MEMBER
+ double interp_h(double delay, double out)
+ {
+
+     return out;
+ }
+
+ // with uneven spacing in t in the sparse arrays, we need to determine which sparse
+ // timestep each dense sample falls into for interpolation: effectively the boundaries
+ // and length of each interpolation segment of the dense array within the sparse array
+ void find_start_inds(int start_inds[], int unit_length[], double *t_arr, double delta_t, int *length, int new_length)
+ {
+
+     double T = (new_length - 1) * delta_t;
+     start_inds[0] = 0;
+     int i = 1;
+     for (i = 1;
+          i < *length;
+          i += 1)
+     {
+
+         double t = t_arr[i];
+
+         // adjust for waveforms that hit the end of the trajectory
+         if (t < T)
+         {
+             start_inds[i] = (int)std::ceil(t / delta_t);
+             unit_length[i - 1] = start_inds[i] - start_inds[i - 1];
+         }
+         else
+         {
+             start_inds[i] = new_length;
+             unit_length[i - 1] = new_length - start_inds[i - 1];
+             break;
+         }
+     }
+
+     // fixes for not using certain segments for the interpolation
+     *length = i + 1;
+ }
+
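To make the bookkeeping in find_start_inds concrete, here is a small illustrative driver (not part of the package; the numbers are invented for this sketch). A sparse, unevenly spaced t_arr is mapped onto a dense grid with spacing delta_t, and the function reports where each sparse segment starts in the dense array and how many dense samples it contains.

    #include <cstdio>

    // defined in the source above
    void find_start_inds(int start_inds[], int unit_length[], double *t_arr, double delta_t, int *length, int new_length);

    void find_start_inds_demo()
    {
        double t_arr[4] = {0.0, 2.5, 6.0, 11.0}; // unevenly spaced sparse times
        int length = 4;
        int start_inds[4];
        int unit_length[3];

        // dense grid: delta_t = 1.0 and new_length = 10, so it ends at T = 9.0,
        // before the last sparse time; the final segment is truncated there
        find_start_inds(start_inds, unit_length, t_arr, 1.0, &length, 10);

        // expected here: segments starting at dense indices 0, 3, 6 with lengths 3, 3, 4
        for (int j = 0; j < length - 1; j += 1)
            printf("segment %d: dense start %d, %d dense points\n", j, start_inds[j], unit_length[j]);
    }
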
+ CUDA_CALLABLE_MEMBER
+ void interp_single(double *result, double *input, int h, int d, double e, double *A_arr, double deps, double *E_arr, int start_input_ind)
+ {
+
+     int ind = (int)(e / deps);
+
+     double frac = (e - ind * deps) / deps;
+     double A = A_arr[ind] * (1. - frac) + A_arr[ind + 1] * frac;
+
+     double B = 1.0 - e;
+     double C = e;
+     double D = e * (1.0 - e);
+
+     double sum = 0.0;
+     double temp_up, temp_down;
+     // printf("in: %d %d\n", d, start_input_ind);
+     for (int j = 1; j < h; j += 1)
+     {
+
+         // get constants
+
+         double E = E_arr[j - 1];
+
+         double F = j + e;
+         double G = j + (1 - e);
+
+         // printf("mid: %d %d %d\n", j, d, start_input_ind);
+
+         // perform calculation
+         temp_up = input[d + 1 + j - start_input_ind];
+         temp_down = input[d - j - start_input_ind];
+         sum += E * (temp_up / F + temp_down / G);
+     }
+     temp_up = input[d + 1 - start_input_ind];
+     temp_down = input[d - start_input_ind];
+     // printf("out: %d %d\n", d, start_input_ind);
+     *result = A * (B * temp_up + C * temp_down + D * sum);
+ }
+
+ CUDA_CALLABLE_MEMBER
+ void interp(double *result_hp, double *result_hc, cmplx *input, int h, int d, double e, double *A_arr, double deps, double *E_arr, int start_input_ind, int i, int link_i)
+ {
+     /*
+     double A = 1.0;
+     for (int i = 1; i < h; i += 1){
+         A *= (i + e) * (i + 1 - e);
+     }
+     double denominator = factorials[h - 1] * factorials[h];
+     A /= denominator;
+     */
+
+     int ind = (int)(e / deps);
+
+     double frac = (e - ind * deps) / deps;
+     double A = A_arr[ind] * (1. - frac) + A_arr[ind + 1] * frac;
+
+     double B = 1.0 - e;
+     double C = e;
+     double D = e * (1.0 - e);
+
+     double sum_hp = 0.0;
+     double sum_hc = 0.0;
+     cmplx temp_up, temp_down;
+     // if ((i == 100) && (link_i == 0)) printf("%d %e %e %e %e %e\n", d, e, A, B, C, D);
+     // printf("in: %d %d\n", d, start_input_ind);
+     for (int j = 1; j < h; j += 1)
+     {
+
+         // get constants
+
+         /*
+         double first_term = factorials[h - 1] / factorials[h - 1 - j];
+         double second_term = factorials[h] / factorials[h + j];
+         double value = first_term * second_term;
+
+         value = value * pow(-1.0, (double)j);
+         */
+
+         double E = E_arr[j - 1];
+
+         double F = j + e;
+         double G = j + (1 - e);
+
+         // perform calculation
+         temp_up = input[d + 1 + j - start_input_ind];
+         temp_down = input[d - j - start_input_ind];
+
+         // if ((i == 100) && (link_i == 0)) printf("mid: %d %d %d %e %e %e %e %e %e %e\n", j, d + 1 + j - start_input_ind, d - j - start_input_ind, temp_up, temp_down, E, F, G);
+         sum_hp += E * (temp_up.real() / F + temp_down.real() / G);
+         sum_hc += E * (temp_up.imag() / F + temp_down.imag() / G);
+     }
+     temp_up = input[d + 1 - start_input_ind];
+     temp_down = input[d - start_input_ind];
+     // printf("out: %d %d\n", d, start_input_ind);
+     *result_hp = A * (B * temp_up.real() + C * temp_down.real() + D * sum_hp);
+     *result_hc = A * (B * temp_up.imag() + C * temp_down.imag() + D * sum_hc);
+     // if ((i == 100) && (link_i == 0)) printf("end: %e %e\n", *result_hp, *result_hc);
+ }
+
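The kernels below receive the Lagrange fractional-delay coefficient tables A_in and E_in already built. For reference, here is a host-side sketch of how such tables could be generated from the factorial expressions left commented out inside interp(). It is illustrative only: build_lagrange_tables is a name invented here, and deps = 1/(num_A - 1) is an assumption that is merely consistent with MAX_A_VALS = 1001; the released package constructs these inputs elsewhere.

    #include <cmath>
    #include <vector>

    // Sketch: tabulate A(e) on a grid of the fractional delay e, plus the fixed
    // per-offset weights E_j, following the commented-out factorial formulas above.
    void build_lagrange_tables(int order, int num_A,
                               std::vector<double> &A_arr, std::vector<double> &E_arr)
    {
        int h = (order + 1) / 2;         // half the number of interpolation points
        double deps = 1.0 / (num_A - 1); // assumed grid spacing in e

        // factorials 0! .. (2h - 1)!
        std::vector<double> fact(2 * h, 1.0);
        for (int n = 1; n < 2 * h; n += 1)
            fact[n] = n * fact[n - 1];

        // A(e) = prod_{i=1}^{h-1} (i + e)(i + 1 - e) / ((h - 1)! h!)
        A_arr.assign(num_A, 0.0);
        for (int m = 0; m < num_A; m += 1)
        {
            double e = m * deps;
            double A = 1.0;
            for (int i = 1; i < h; i += 1)
                A *= (i + e) * (i + 1 - e);
            A_arr[m] = A / (fact[h - 1] * fact[h]);
        }

        // E_j = (-1)^j * (h - 1)!/(h - 1 - j)! * h!/(h + j)!, stored at E_arr[j - 1]
        E_arr.assign(h - 1, 0.0);
        for (int j = 1; j < h; j += 1)
            E_arr[j - 1] = std::pow(-1.0, j) * (fact[h - 1] / fact[h - 1 - j]) * (fact[h] / fact[h + j]);
    }
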
+ #define NUM_PARS 33
+ #define NUM_COEFFS 4
+ #define NLINKS 6
+ #define BUFFER_SIZE 1000
+ #define MAX_UNITS 200
+
+ #define MAX_A_VALS 1001
+ #define MAX_ORDER 40
+
+ CUDA_KERNEL
+ void TDI_delay(double *delayed_links, double *input_links, int num_inputs, int num_delays, double *t_arr, int *tdi_base_link, int *tdi_link_combinations, int *tdi_signs_in, int *channels, int num_units, int num_channels,
+                int order, double sampling_frequency, int buffer_integer, double *A_in, double deps, int num_A, double *E_in, int tdi_start_ind, Orbits *orbits_in)
+ {
+     Orbits orbits = *orbits_in;
+
+ #ifdef __CUDACC__
+     CUDA_SHARED double input[BUFFER_SIZE];
+ #endif
+     CUDA_SHARED double first_delay;
+     CUDA_SHARED double last_delay;
+     CUDA_SHARED int start_input_ind;
+     CUDA_SHARED int end_input_ind;
+     CUDA_SHARED double A_arr[MAX_A_VALS];
+     CUDA_SHARED double E_arr[MAX_ORDER];
+
+     int start, increment;
+ #ifdef __CUDACC__
+     start = threadIdx.x;
+     increment = blockDim.x;
+ #else
+     start = 0;
+     increment = 1;
+     // #pragma omp parallel for
+ #endif
+ #ifdef __CUDACC__
+ #else
+     // #pragma omp parallel for
+ #endif
+
+ #ifdef __CUDACC__
+ #else
+     // #pragma omp parallel for
+ #endif
+     for (int i = start; i < num_A; i += increment)
+     {
+         A_arr[i] = A_in[i];
+         // if (threadIdx.x == 1) printf("%e %e %e\n", k[i], u[i], v[i]);
+     }
+     CUDA_SYNC_THREADS;
+
+ #ifdef __CUDACC__
+ #else
+     // #pragma omp parallel for
+ #endif
+     for (int i = start; i < (order + 1) / 2 - 1; i += increment)
+     {
+         E_arr[i] = E_in[i];
+         // if (threadIdx.x == 1) printf("%e %e %e\n", k[i], u[i], v[i]);
+     }
+     CUDA_SYNC_THREADS;
+
+     int start1, increment1;
+ #ifdef __CUDACC__
+     start1 = blockIdx.y;
+     increment1 = gridDim.y;
+ #else
+     start1 = 0;
+     increment1 = 1;
+     // #pragma omp parallel for
+ #endif
+
+     for (int unit_i = start1; unit_i < num_units; unit_i += increment1)
+     {
+         int base_link = tdi_base_link[unit_i];
+         int base_link_index = orbits.get_link_ind(base_link);
+
+         int combination_link = tdi_link_combinations[unit_i];
+
+         int combination_link_index;
+         if (combination_link == -11)
+         {
+             combination_link_index = -1;
+         }
+         else
+         {
+             combination_link_index = orbits.get_link_ind(combination_link);
+         }
+         int sign = tdi_signs_in[unit_i];
+         int channel = channels[unit_i];
+
+         int point_count = order + 1;
+         int half_point_count = int(point_count / 2);
+
+         int start2, increment2;
+ #ifdef __CUDACC__
+         start2 = tdi_start_ind + threadIdx.x + blockDim.x * blockIdx.x;
+         increment2 = blockDim.x * gridDim.x;
+ #else
+         start2 = tdi_start_ind;
+         increment2 = 1;
+         // #pragma omp parallel for
+ #endif
+         for (int i = start2;
+              i < num_delays - tdi_start_ind;
+              i += increment2)
+         {
+             double t, L, delay;
+
+             double large_factor, pre_factor;
+             double clipped_delay, out, fraction;
+             double link_delayed_out;
+             int integer_delay, max_integer_delay, min_integer_delay;
+             int start, end, increment;
+
+             // at i = 0, delay ind should be at TDI_buffer = total_buffer - projection_buffer
+             t = t_arr[i];
+             if (combination_link == -11)
+             {
+                 delay = t;
+             }
+             else
+             {
+                 delay = t - orbits.get_light_travel_time(t, combination_link);
+             }
+
+             // delays are still with respect to projection start
+             clipped_delay = delay;
+             integer_delay = (int)ceil(clipped_delay * sampling_frequency) - 1;
+             fraction = 1.0 + integer_delay - clipped_delay * sampling_frequency;
+
+             max_integer_delay = integer_delay;
+             max_integer_delay += 2; // encompass all
+             min_integer_delay = integer_delay;
+
+ #ifdef __CUDACC__
+             int max_thread_num = ((num_delays - 2 * tdi_start_ind) - blockDim.x * blockIdx.x > NUM_THREADS) ? NUM_THREADS : (num_delays - 2 * tdi_start_ind) - blockDim.x * blockIdx.x;
+             CUDA_SYNC_THREADS;
+             if (threadIdx.x == 0)
+             {
+                 start_input_ind = min_integer_delay - buffer_integer;
+                 // printf("BAD1: %d %d %d %d %e %d \n", i, unit_i, blockIdx.x, start_input_ind, delay, max_integer_delay);
+             }
+             CUDA_SYNC_THREADS;
+             if (threadIdx.x == max_thread_num - 1)
+             {
+                 // if (blockIdx.x == gridDim.x - 1)
+                 // printf("%e %e %d %d\n", clipped_delay0, clipped_delay1, integer_delay0, integer_delay1);
+                 end_input_ind = max_integer_delay + buffer_integer;
+                 // printf("BAD2: %d %d %d %d %e %d %d\n", i, unit_i, blockIdx.x, start_input_ind, delay, max_integer_delay, start_input_ind);
+             }
+
+             CUDA_SYNC_THREADS;
+
+             for (int jj = threadIdx.x + start_input_ind; jj < end_input_ind; jj += max_thread_num)
+             {
+                 // need to subtract out the projection buffer
+
+                 input[jj - start_input_ind] = input_links[base_link_index * num_inputs + jj];
+             }
+
+             CUDA_SYNC_THREADS;
+ #else
+             start_input_ind = 0;
+             double *input = &input_links[base_link_index * num_inputs];
+ #endif
+             // printf("bef: %d %d %d\n", channel, i, unit_i);
+             interp_single(&link_delayed_out, input, half_point_count, integer_delay, fraction, A_arr, deps, E_arr, start_input_ind);
+
+             link_delayed_out *= sign;
+
+             // if ((channel == 0) && (unit_i == 2) & (i > 237790)){
+
+             // printf("aft: %d %d %d %e\n", channel, i, unit_i, delayed_links[channel * num_delays + i]);
+             // }
+
+ #ifdef __CUDACC__
+             atomicAdd(&delayed_links[channel * num_delays + i], link_delayed_out);
+ #else
+             // #pragma omp atomic
+             delayed_links[channel * num_delays + i] += link_delayed_out;
+ #endif
+             CUDA_SYNC_THREADS;
+         }
+     }
+ }
+
+ void get_tdi_delays(double *delayed_links, double *input_links, int num_inputs, int num_delays, double *t_arr, int *tdi_base_link, int *tdi_link_combinations, int *tdi_signs_in, int *channels, int num_units, int num_channels,
+                     int order, double sampling_frequency, int buffer_integer, double *A_in, double deps, int num_A, double *E_in, int tdi_start_ind, Orbits *orbits_in)
+ {
+
+ #ifdef __CUDACC__
+     int num_blocks = std::ceil((num_delays - 2 * tdi_start_ind + NUM_THREADS - 1) / NUM_THREADS);
+
+     dim3 gridDim(num_blocks, num_units * num_channels);
+
+     Orbits *orbits_gpu;
+     gpuErrchk(cudaMalloc(&orbits_gpu, sizeof(Orbits)));
+     gpuErrchk(cudaMemcpy(orbits_gpu, orbits_in, sizeof(Orbits), cudaMemcpyHostToDevice));
+
+     // printf("RUNNING: %d\n", i);
+     TDI_delay<<<gridDim, NUM_THREADS>>>(delayed_links, input_links, num_inputs, num_delays, t_arr, tdi_base_link, tdi_link_combinations, tdi_signs_in, channels, num_units, num_channels,
+                                         order, sampling_frequency, buffer_integer, A_in, deps, num_A, E_in, tdi_start_ind, orbits_gpu);
+
+     cudaDeviceSynchronize();
+     gpuErrchk(cudaGetLastError());
+     gpuErrchk(cudaFree(orbits_gpu));
+
+ #else
+     TDI_delay(delayed_links, input_links, num_inputs, num_delays, t_arr, tdi_base_link, tdi_link_combinations, tdi_signs_in, channels, num_units, num_channels,
+               order, sampling_frequency, buffer_integer, A_in, deps, num_A, E_in, tdi_start_ind, orbits_in);
+
+ #endif
+ }
+
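Both TDI_delay above and the response kernel below reduce a delay in seconds to an integer sample index plus a fractional remainder in [0, 1), which is exactly what interp_single()/interp() consume. A standalone numeric sketch of that split (the values are illustrative only and not taken from the package):

    #include <cmath>
    #include <cstdio>

    void delay_split_demo()
    {
        double sampling_frequency = 0.1; // samples per second (illustrative)
        double delay = 12345.678;        // seconds

        // delay * fs = 1234.5678 samples
        int integer_delay = (int)std::ceil(delay * sampling_frequency) - 1;
        double fraction = 1.0 + integer_delay - delay * sampling_frequency;

        // prints: integer_delay = 1234, fraction = 0.4322
        printf("integer_delay = %d, fraction = %.4f\n", integer_delay, fraction);
    }
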
+ CUDA_KERNEL
+ void response(double *y_gw, double *t_data, double *k_in, double *u_in, double *v_in, double dt,
+               int num_delays,
+               cmplx *input_in, int num_inputs, int order, double sampling_frequency,
+               int buffer_integer, double *A_in, double deps, int num_A, double *E_in, int projections_start_ind,
+               Orbits *orbits_in)
+ {
+ #ifdef __CUDACC__
+     CUDA_SHARED cmplx input[BUFFER_SIZE];
+ #endif
+     CUDA_SHARED double A_arr[MAX_A_VALS];
+     CUDA_SHARED double E_arr[MAX_ORDER];
+     CUDA_SHARED double first_delay;
+     CUDA_SHARED double last_delay;
+     CUDA_SHARED int start_input_ind;
+     CUDA_SHARED int end_input_ind;
+
+     CUDA_SHARED double k[3];
+     CUDA_SHARED double u[3];
+     CUDA_SHARED double v[3];
+     CUDA_SHARED int link_space_craft_0[NLINKS];
+     CUDA_SHARED int link_space_craft_1[NLINKS];
+     CUDA_SHARED int links[NLINKS];
+
+ #ifdef __CUDACC__
+     CUDA_SHARED double x0_all[NUM_THREADS * 3];
+     CUDA_SHARED double x1_all[NUM_THREADS * 3];
+     CUDA_SHARED double n_all[NUM_THREADS * 3];
+
+     double *x0 = &x0_all[3 * threadIdx.x];
+     double *x1 = &x1_all[3 * threadIdx.x];
+     double *n = &n_all[3 * threadIdx.x];
+ #endif
+
+     int start, increment;
+
+     CUDA_SYNC_THREADS;
+
+ #ifdef __CUDACC__
+     start = threadIdx.x;
+     increment = blockDim.x;
+ #else
+     start = 0;
+     increment = 1;
+ #endif
+     for (int i = start; i < 3; i += increment)
+     {
+         k[i] = k_in[i];
+         u[i] = u_in[i];
+         v[i] = v_in[i];
+         // if (threadIdx.x == 1) printf("%e %e %e\n", k[i], u[i], v[i]);
+     }
+     CUDA_SYNC_THREADS;
+
+     for (int i = start; i < num_A; i += increment)
+     {
+         A_arr[i] = A_in[i];
+         // if (threadIdx.x == 1) printf("%e %e %e\n", k[i], u[i], v[i]);
+     }
+     CUDA_SYNC_THREADS;
+
+     for (int i = start; i < (order + 1) / 2 - 1; i += increment)
+     {
+         E_arr[i] = E_in[i];
+         // if (threadIdx.x == 1) printf("%e %e %e\n", k[i], u[i], v[i]);
+     }
+     CUDA_SYNC_THREADS;
+
+     Orbits orbits = *orbits_in;
+     for (int i = start; i < NLINKS; i += increment)
+     {
+         link_space_craft_0[i] = orbits.sc_r[i];
+         link_space_craft_1[i] = orbits.sc_e[i];
+         links[i] = orbits.links[i];
+         // if (threadIdx.x == 1)
+         // printf("%d %d %d %d\n", orbits.sc_r[i], orbits.sc_e[i], link_space_craft_1[i], link_space_craft_0[i]);
+     }
+     CUDA_SYNC_THREADS;
+     int point_count = order + 1;
+     int half_point_count = int(point_count / 2);
+
+ #ifdef __CUDACC__
+     start = blockIdx.y;
+     increment = gridDim.y;
+ #else
+     start = 0;
+     increment = 1;
+ #endif
+     for (int link_i = start; link_i < NLINKS; link_i += increment)
+     {
+         int sc0 = link_space_craft_0[link_i];
+         int sc1 = link_space_craft_1[link_i];
+         int link = links[link_i];
+
+         int start2, increment2;
+ #ifdef __CUDACC__
+         start2 = projections_start_ind + threadIdx.x + blockDim.x * blockIdx.x;
+         increment2 = blockDim.x * gridDim.x;
+ #else
+         start2 = projections_start_ind;
+         increment2 = 1;
+ #endif
+         for (int i = start2;
+              i < num_delays - projections_start_ind;
+              i += increment2)
+         {
+
+ #ifdef __CUDACC__
+ #else
+             double x0_all[3];
+             CUDA_SHARED double x1_all[3];
+             CUDA_SHARED double n_all[3];
+
+             double *x0 = &x0_all[0];
+             double *x1 = &x1_all[0];
+             double *n = &n_all[0];
+
+ #endif
+
+             double xi_p, xi_c;
+             double k_dot_n, k_dot_x0, k_dot_x1;
+             double t, L, delay0, delay1;
+             double hp_del0, hp_del1, hc_del0, hc_del1;
+
+             double large_factor, pre_factor;
+             double clipped_delay0, clipped_delay1, out, fraction0, fraction1;
+             int integer_delay0, integer_delay1, max_integer_delay, min_integer_delay;
+
+             t = t_data[i];
+
+             Vec out_vec(0.0, 0.0, 0.0);
+             double norm = 0.0;
+             double n_temp;
+
+             out_vec = orbits.get_pos(t, sc0);
+             x0[0] = out_vec.x;
+             x0[1] = out_vec.y;
+             x0[2] = out_vec.z;
+
+             out_vec = orbits.get_pos(t, sc1);
+             x1[0] = out_vec.x;
+             x1[1] = out_vec.y;
+             x1[2] = out_vec.z;
+
+             for (int coord = 0; coord < 3; coord += 1)
+             {
+                 n_temp = x0[coord] - x1[coord];
+                 n[coord] = n_temp;
+                 norm += n_temp * n_temp;
+             }
+
+             norm = sqrt(norm);
+
+ #pragma unroll
+             for (int coord = 0; coord < 3; coord += 1)
+             {
+                 n[coord] = n[coord] / norm;
+             }
+
+             L = orbits.get_light_travel_time(t, link);
+             // if (i % 10000 == 0)
+             // printf("%d %d %e %e %d %e %e %e %d\n", i, link_i, L, t, link, x0[0], x1[0], norm, sc0);
+
+             // if (i <500) printf("%d %d: start \n", i, link_i);
+
+             xi_projections(&xi_p, &xi_c, u, v, n);
+
+             k_dot_n = dot_product_1d(k, n);
+             k_dot_x0 = dot_product_1d(k, x0); // receiver
+             k_dot_x1 = dot_product_1d(k, x1); // emitter
+
+             delay0 = t - k_dot_x0 * C_inv;
+             delay1 = t - L - k_dot_x1 * C_inv;
+
+             // start time for hp hx is really -(projection_buffer * dt)
+
+             // if ((i == 0) && (link_i == 0)) printf("%.10e %.10e %.10e %.10e %.10e %.10e %.10e %.10e %.10e\n", L, delay0, delay1, x0[0], x0[1], x0[2], x1[0], x1[1], x1[2]);
+             clipped_delay0 = delay0; // - start_wave_time;
+             integer_delay0 = (int)ceil(clipped_delay0 * sampling_frequency) - 1;
+             fraction0 = 1.0 + integer_delay0 - clipped_delay0 * sampling_frequency;
+
+             clipped_delay1 = delay1; // - start_wave_time;
+             integer_delay1 = (int)ceil(clipped_delay1 * sampling_frequency) - 1;
+             fraction1 = 1.0 + integer_delay1 - clipped_delay1 * sampling_frequency;
+
+             max_integer_delay = (integer_delay0 < integer_delay1) ? integer_delay1 : integer_delay0;
+             max_integer_delay += 2; // encompass all
+             min_integer_delay = (integer_delay0 < integer_delay1) ? integer_delay0 : integer_delay1;
+
+ #ifdef __CUDACC__
+             int max_thread_num = ((num_delays - 2 * projections_start_ind) - blockDim.x * blockIdx.x > NUM_THREADS) ? NUM_THREADS : (num_delays - 2 * projections_start_ind) - blockDim.x * blockIdx.x;
+
+             if (threadIdx.x == 0)
+             {
+                 start_input_ind = min_integer_delay - buffer_integer;
+             }
+
+             if (threadIdx.x == max_thread_num - 1)
+             {
+                 // if (blockIdx.x == gridDim.x - 1)
+                 // printf("%e %e %d %d\n", clipped_delay0, clipped_delay1, integer_delay0, integer_delay1);
+                 end_input_ind = max_integer_delay + buffer_integer;
+             }
+
+             CUDA_SYNC_THREADS;
+
+             // if (blockIdx.x == gridDim.x - 1) printf("%d %e %d %d %d %d %d %d %d %d %d %d %d\n", i, L, blockIdx.x, gridDim.x, threadIdx.x, blockDim.x*blockIdx.x, num_delays, num_delays - blockDim.x*blockIdx.x, max_thread_num, start_input_ind, end_input_ind, integer_delay0, integer_delay1);
+             if (end_input_ind - start_input_ind > BUFFER_SIZE)
+                 printf("%d %d %d %d %d %d %d %d\n", threadIdx.x, max_integer_delay, start_input_ind, end_input_ind, i, max_thread_num, num_delays, blockIdx.x * blockDim.x);
+
+             for (int jj = threadIdx.x + start_input_ind; jj < end_input_ind; jj += max_thread_num)
+             {
+                 // cmplx temp = input_in[jj];
+                 input[jj - start_input_ind] = input_in[jj];
+             }
+
+             CUDA_SYNC_THREADS;
+ #else
+             start_input_ind = 0;
+             cmplx *input = input_in;
+ #endif
+
+             interp(&hp_del0, &hc_del0, input, half_point_count, integer_delay0, fraction0, A_arr, deps, E_arr, start_input_ind, i, link_i);
+             interp(&hp_del1, &hc_del1, input, half_point_count, integer_delay1, fraction1, A_arr, deps, E_arr, start_input_ind, i, link_i);
+
+             pre_factor = 1. / (1. - k_dot_n);
+             large_factor = (hp_del0 - hp_del1) * xi_p + (hc_del0 - hc_del1) * xi_c;
+             // if (i % 10000 == 0)
+             // printf("%d %d %e %e %e %e %e %e\n", i, link_i, pre_factor, large_factor, delay0, delay1, L, xi_p);
+             y_gw[link_i * num_delays + i] = pre_factor * large_factor;
+             CUDA_SYNC_THREADS;
+         }
+     }
+ }
+
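Restating the arithmetic of the kernel above in equation form (this is only a transcription of the code, with c the speed of light so that C_inv = 1/c, x_0 the receiver spacecraft position, x_1 the emitter position, and L the light travel time along the link):

    y_{\mathrm{gw}}(t) = \frac{\xi_{+}\left[h_{+}\!\left(t - \hat{k}\cdot\vec{x}_0/c\right) - h_{+}\!\left(t - L - \hat{k}\cdot\vec{x}_1/c\right)\right]
                              + \xi_{\times}\left[h_{\times}\!\left(t - \hat{k}\cdot\vec{x}_0/c\right) - h_{\times}\!\left(t - L - \hat{k}\cdot\vec{x}_1/c\right)\right]}{1 - \hat{k}\cdot\hat{n}},
    \qquad
    \xi_{+} = \tfrac{1}{2}\left[(\hat{u}\cdot\hat{n})^2 - (\hat{v}\cdot\hat{n})^2\right],
    \quad
    \xi_{\times} = (\hat{u}\cdot\hat{n})(\hat{v}\cdot\hat{n}).
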
+ void get_response(double *y_gw, double *t_data, double *k_in, double *u_in, double *v_in, double dt,
+                   int num_delays,
+                   cmplx *input_in, int num_inputs, int order,
+                   double sampling_frequency, int buffer_integer,
+                   double *A_in, double deps, int num_A, double *E_in, int projections_start_ind,
+                   Orbits *orbits)
+ {
+
+ #ifdef __CUDACC__
+
+     int num_delays_here = (num_delays - 2 * projections_start_ind);
+     int num_blocks = std::ceil((num_delays_here + NUM_THREADS - 1) / NUM_THREADS);
+
+     // copy self to GPU
+     Orbits *orbits_gpu;
+     gpuErrchk(cudaMalloc(&orbits_gpu, sizeof(Orbits)));
+     gpuErrchk(cudaMemcpy(orbits_gpu, orbits, sizeof(Orbits), cudaMemcpyHostToDevice));
+
+     dim3 gridDim(num_blocks, 1);
+
+     // printf("RUNNING: %d\n", i);
+     response<<<gridDim, NUM_THREADS>>>(y_gw, t_data, k_in, u_in, v_in, dt,
+                                        num_delays,
+                                        input_in, num_inputs, order, sampling_frequency, buffer_integer,
+                                        A_in, deps, num_A, E_in, projections_start_ind,
+                                        orbits_gpu);
+     cudaDeviceSynchronize();
+     gpuErrchk(cudaGetLastError());
+
+     gpuErrchk(cudaFree(orbits_gpu));
+ #else
+
+     // CPU waveform generation
+     // std::cout << num_delays << " " << NLINKS << std::endl;
+     response(y_gw, t_data, k_in, u_in, v_in, dt,
+              num_delays,
+              input_in, num_inputs, order, sampling_frequency, buffer_integer,
+              A_in, deps, num_A, E_in, projections_start_ind,
+              orbits);
+ #endif
+ }
+
+ /*
+ int main()
+ {
+
+     int num_fac = 100;
+     double factorials_in[num_fac];
+
+     factorials_in[0] = 1.0;
+
+     for (int i=1; i<num_fac; i+=1){
+         factorials_in[i] = i*factorials_in[i-1];
+     }
+
+     double *d_factorials_in;
+     gpuErrchk(cudaMalloc(&d_factorials_in, num_fac*sizeof(double)));
+     gpuErrchk(cudaMemcpy(d_factorials_in, factorials_in, num_fac*sizeof(double), cudaMemcpyHostToDevice));
+
+     int num_pts_in = int(1e6);
+
+
+     cmplx *input_in = new cmplx[num_pts_in];
+     cmplx *d_input_in;
+
+     gpuErrchk(cudaMalloc(&d_input_in, num_pts_in*sizeof(cmplx)));
+
+     double sampling_frequency = 1.0;
+     double dt = 1./sampling_frequency;
+     double input_start_time = -10000.0;
+     cmplx I(0.0, 1.0);
+     for (int i=0; i<num_pts_in; i+=1) input_in[i] = sin(i*dt + input_start_time) + I*cos(i*dt + input_start_time);
+
+     gpuErrchk(cudaMemcpy(d_input_in, input_in, num_pts_in*sizeof(cmplx), cudaMemcpyHostToDevice));
+
+     int num_delays = int(1e5);
+
+     int order = 25;
+     int buffer_integer = order + 1;
+
+
+
+     double beta = 0.5;
+     double lam = 1.0;
+
+     double k[3];
+     double u[3];
+     double v[3];
+
+     get_basis_vecs(lam, beta, u, v, k);
+
+     int nlinks = NLINKS;
+     double *n_in = new double[num_delays*nlinks*3];
+     double *x = new double[num_delays*3*3];
+     double *L_vals = new double[num_delays*nlinks];
+     int *link_space_craft_0 = new int[nlinks];
+     int *link_space_craft_1 = new int[nlinks];
+
+     link_space_craft_0[0] = 0; link_space_craft_1[0] = 1;
+     link_space_craft_0[1] = 1; link_space_craft_1[1] = 0;
+
+     link_space_craft_0[2] = 0; link_space_craft_1[2] = 2;
+     link_space_craft_0[3] = 2; link_space_craft_1[3] = 0;
+
+     link_space_craft_0[4] = 1; link_space_craft_1[4] = 2;
+     link_space_craft_0[5] = 2; link_space_craft_1[5] = 1;
+
+     double Re = 1.496e+11; // meters
+     double Phi0 = 0.0;
+
+     double Omega0 = 1/(365.25*24.0*3600.0);
+
+     double center_vec[3];
+
+     double L = 2.5e9;
+
+     double sc0_delta[2] = {L/2, -L/(2.*sqrt(3.))};
+
+     double sc1_delta[2] = {-L/2, -L/(2.*sqrt(3.))};
+     double sc2_delta[2] = {0.0, L/(sqrt(3.))};
+
+     double Rnew, xnew, ynew, znew, t;
+     double norm;
+     int link_ind_0, link_ind_1;
+     for (int i=0; i<num_delays; i++){
+         t = i*dt;
+
+         // sc 1
+         Rnew = Re + sc0_delta[0];
+         xnew = Rnew*cos(Omega0*t + Phi0);
+         ynew = Rnew*sin(Omega0*t + Phi0);
+         znew = sc0_delta[1];
+
+         x[(0*3 + 0)*num_delays + i] = xnew;
+         x[(0*3 + 1)*num_delays + i] = ynew;
+         x[(0*3 + 2)*num_delays + i] = znew;
+
+         Rnew = Re + sc1_delta[0];
+         xnew = Rnew*cos(Omega0*t + Phi0);
+         ynew = Rnew*sin(Omega0*t + Phi0);
+         znew = sc1_delta[1];
+
+         x[(1*3 + 0)*num_delays + i] = xnew;
+         x[(1*3 + 1)*num_delays + i] = ynew;
+         x[(1*3 + 2)*num_delays + i] = znew;
+
+         Rnew = Re + sc2_delta[0];
+         xnew = Rnew*cos(Omega0*t + Phi0);
+         ynew = Rnew*sin(Omega0*t + Phi0);
+         znew = sc2_delta[1];
+
+         x[(2*3 + 0)*num_delays + i] = xnew;
+         x[(2*3 + 1)*num_delays + i] = ynew;
+         x[(2*3 + 2)*num_delays + i] = znew;
+
+         for (int j=0; j<NLINKS; j++){
+             link_ind_0 = link_space_craft_0[j];
+             link_ind_1 = link_space_craft_1[j];
+
+             xnew = x[(link_ind_0*3 + 0)*num_delays + i] - x[(link_ind_1*3 + 0)*num_delays + i];
+             ynew = x[(link_ind_0*3 + 1)*num_delays + i] - x[(link_ind_1*3 + 1)*num_delays + i];
+             znew = x[(link_ind_0*3 + 2)*num_delays + i] - x[(link_ind_1*3 + 2)*num_delays + i];
+
+             norm = sqrt(xnew*xnew + ynew*ynew + znew*znew);
+
+             n_in[(j*3 + 0)*num_delays + i] = xnew/norm;
+             n_in[(j*3 + 1)*num_delays + i] = ynew/norm;
+             n_in[(j*3 + 2)*num_delays + i] = znew/norm;
+             L_vals[j*num_delays + i] = L;
+         }
+     }
+
+     double *d_k, *d_u, *d_v, *d_x, *d_n_in;
+     double *d_L_vals, *d_y_gw;
+
+     int *d_link_space_craft_0, *d_link_space_craft_1;
+
+     gpuErrchk(cudaMalloc(&d_k, 3*sizeof(double)));
+     gpuErrchk(cudaMalloc(&d_u, 3*sizeof(double)));
+     gpuErrchk(cudaMalloc(&d_v, 3*sizeof(double)));
+
+     gpuErrchk(cudaMalloc(&d_x, 3*3*num_delays*sizeof(double)));
+     gpuErrchk(cudaMalloc(&d_n_in, nlinks*3*num_delays*sizeof(double)));
+
+     gpuErrchk(cudaMalloc(&d_link_space_craft_0, nlinks*sizeof(int)));
+     gpuErrchk(cudaMalloc(&d_link_space_craft_1, nlinks*sizeof(int)));
+
+     gpuErrchk(cudaMalloc(&d_L_vals, nlinks*num_delays*sizeof(double)));
+
+     gpuErrchk(cudaMemcpy(d_k, &k, 3*sizeof(double), cudaMemcpyHostToDevice));
+     gpuErrchk(cudaMemcpy(d_u, &u, 3*sizeof(double), cudaMemcpyHostToDevice));
+     gpuErrchk(cudaMemcpy(d_v, &v, 3*sizeof(double), cudaMemcpyHostToDevice));
+
+     gpuErrchk(cudaMemcpy(d_x, x, 3*3*num_delays*sizeof(double), cudaMemcpyHostToDevice));
+     gpuErrchk(cudaMemcpy(d_n_in, n_in, nlinks*3*num_delays*sizeof(double), cudaMemcpyHostToDevice));
+
+     gpuErrchk(cudaMemcpy(d_link_space_craft_0, link_space_craft_0, nlinks*sizeof(int), cudaMemcpyHostToDevice));
+     gpuErrchk(cudaMemcpy(d_link_space_craft_1, link_space_craft_1, nlinks*sizeof(int), cudaMemcpyHostToDevice));
+
+     gpuErrchk(cudaMemcpy(d_L_vals, L_vals, num_delays*nlinks*sizeof(double), cudaMemcpyHostToDevice));
+
+     gpuErrchk(cudaMalloc(&d_y_gw, nlinks*num_delays*sizeof(double)));
+
+     for (int i=0; i<1; i++){
+
+         get_response(d_y_gw, d_k, d_u, d_v, dt, d_x, d_n_in,
+                      num_delays, d_link_space_craft_0, d_link_space_craft_1,
+                      d_L_vals, d_input_in, num_pts_in, order, sampling_frequency, buffer_integer, d_factorials_in, num_fac, input_start_time);
+     }
+
+     double *y_gw = new double[num_delays];
+
+     gpuErrchk(cudaMemcpy(y_gw, d_y_gw, num_delays*sizeof(double), cudaMemcpyDeviceToHost));
+     for (int i=0; i<1; i++) printf("%e\n", y_gw[i]);
+
+     delete[] n_in;
+     delete[] x;
+     delete[] L_vals;
+     delete[] link_space_craft_0;
+     delete[] link_space_craft_1;
+
+     gpuErrchk(cudaFree(d_k));
+     gpuErrchk(cudaFree(d_u));
+     gpuErrchk(cudaFree(d_v));
+
+     gpuErrchk(cudaFree(d_x));
+     gpuErrchk(cudaFree(d_n_in));
+
+     gpuErrchk(cudaFree(d_link_space_craft_0));
+     gpuErrchk(cudaFree(d_link_space_craft_1));
+
+     gpuErrchk(cudaFree(d_L_vals));
+
+     gpuErrchk(cudaFree(d_y_gw));
+     delete[] y_gw;
+
+     gpuErrchk(cudaFree(d_input_in));
+     gpuErrchk(cudaFree(d_factorials_in));
+
+     delete[] input_in;
+ }
+ */