timewise 0.5.3__py3-none-any.whl → 1.0.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- timewise/__init__.py +1 -5
- timewise/backend/__init__.py +6 -0
- timewise/backend/base.py +36 -0
- timewise/backend/filesystem.py +80 -0
- timewise/chunking.py +50 -0
- timewise/cli.py +117 -11
- timewise/config.py +34 -0
- timewise/io/__init__.py +1 -0
- timewise/io/config.py +64 -0
- timewise/io/download.py +302 -0
- timewise/io/stable_tap.py +121 -0
- timewise/plot/__init__.py +3 -0
- timewise/plot/diagnostic.py +242 -0
- timewise/plot/lightcurve.py +112 -0
- timewise/plot/panstarrs.py +260 -0
- timewise/plot/sdss.py +109 -0
- timewise/process/__init__.py +2 -0
- timewise/process/config.py +30 -0
- timewise/process/interface.py +143 -0
- timewise/process/keys.py +10 -0
- timewise/process/stacking.py +310 -0
- timewise/process/template.yml +49 -0
- timewise/query/__init__.py +6 -0
- timewise/query/base.py +45 -0
- timewise/query/positional.py +40 -0
- timewise/tables/__init__.py +10 -0
- timewise/tables/allwise_p3as_mep.py +22 -0
- timewise/tables/base.py +9 -0
- timewise/tables/neowiser_p1bs_psd.py +22 -0
- timewise/types.py +30 -0
- timewise/util/backoff.py +12 -0
- timewise/util/csv_utils.py +12 -0
- timewise/util/error_threading.py +70 -0
- timewise/util/visits.py +33 -0
- timewise-1.0.0a1.dist-info/METADATA +205 -0
- timewise-1.0.0a1.dist-info/RECORD +39 -0
- {timewise-0.5.3.dist-info → timewise-1.0.0a1.dist-info}/WHEEL +1 -1
- timewise-1.0.0a1.dist-info/entry_points.txt +3 -0
- timewise/big_parent_sample.py +0 -106
- timewise/config_loader.py +0 -157
- timewise/general.py +0 -52
- timewise/parent_sample_base.py +0 -89
- timewise/point_source_utils.py +0 -68
- timewise/utils.py +0 -558
- timewise/wise_bigdata_desy_cluster.py +0 -1407
- timewise/wise_data_base.py +0 -2027
- timewise/wise_data_by_visit.py +0 -672
- timewise/wise_flux_conversion_correction.dat +0 -19
- timewise-0.5.3.dist-info/METADATA +0 -55
- timewise-0.5.3.dist-info/RECORD +0 -17
- timewise-0.5.3.dist-info/entry_points.txt +0 -3
- {timewise-0.5.3.dist-info → timewise-1.0.0a1.dist-info/licenses}/LICENSE +0 -0
|
@@ -1,1407 +0,0 @@
|
|
|
1
|
-
import getpass
|
|
2
|
-
import os
|
|
3
|
-
import json
|
|
4
|
-
import subprocess
|
|
5
|
-
import math
|
|
6
|
-
import pickle
|
|
7
|
-
import queue
|
|
8
|
-
import threading
|
|
9
|
-
import argparse
|
|
10
|
-
import time
|
|
11
|
-
import seaborn as sns
|
|
12
|
-
import backoff
|
|
13
|
-
import shutil
|
|
14
|
-
import gc
|
|
15
|
-
import tqdm
|
|
16
|
-
import sys
|
|
17
|
-
from pathlib import Path
|
|
18
|
-
|
|
19
|
-
from functools import cache
|
|
20
|
-
from scipy.stats import chi2, f
|
|
21
|
-
import matplotlib.pyplot as plt
|
|
22
|
-
import numpy as np
|
|
23
|
-
import pandas as pd
|
|
24
|
-
import pyvo as vo
|
|
25
|
-
import traceback as tb
|
|
26
|
-
import gzip
|
|
27
|
-
import logging
|
|
28
|
-
|
|
29
|
-
from typing import List
|
|
30
|
-
|
|
31
|
-
from timewise.general import get_directories, backoff_hndlr
|
|
32
|
-
from timewise.wise_data_by_visit import WiseDataByVisit
|
|
33
|
-
from timewise.utils import StableAsyncTAPJob, ErrorQueue, ExceptionSafeThread
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
logger = logging.getLogger(__name__)
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
class WISEDataDESYCluster(WiseDataByVisit):
|
|
40
|
-
"""
|
|
41
|
-
A class to download WISE data with multiple threads and do the binning on the DESY cluster.
|
|
42
|
-
In addition to the attributes of `WiseDataByVisit` this class has the following attributes:
|
|
43
|
-
|
|
44
|
-
:param executable_filename: the filename of the executable that will be submitted to the cluster
|
|
45
|
-
:type executable_filename: Path
|
|
46
|
-
:param submit_file_filename: the filename of the submit file that will be submitted to the cluster
|
|
47
|
-
:type submit_file_filename: Path
|
|
48
|
-
:param job_id: the job id of the submitted job
|
|
49
|
-
:type job_id: str
|
|
50
|
-
:param cluster_jobID_map: a dictionary mapping the chunk number to the cluster job id
|
|
51
|
-
:type cluster_jobID_map: dict
|
|
52
|
-
:param clusterJob_chunk_map: a dictionary mapping the cluster job id to the chunk number
|
|
53
|
-
:type clusterJob_chunk_map: dict
|
|
54
|
-
:param cluster_info_file: the filename of the file that stores the cluster info, loaded by the cluster jobs
|
|
55
|
-
:type cluster_info_file: Path
|
|
56
|
-
:param start_time: the time when the download started
|
|
57
|
-
:type start_time: float
|
|
58
|
-
"""
|
|
59
|
-
status_cmd = f'qstat -u {getpass.getuser()}'
|
|
60
|
-
# finding the file that contains the setup function
|
|
61
|
-
if (env_file := os.getenv('TIMEWISE_DESY_CLUSTER_BASHFILE')) is not None:
|
|
62
|
-
BASHFILE = Path(env_file)
|
|
63
|
-
else:
|
|
64
|
-
BASHFILE = Path("~/.bashrc").expanduser()
|
|
65
|
-
|
|
66
|
-
def __init__(
        self,
        base_name,
        parent_sample_class,
        min_sep_arcsec,
        n_chunks,
        clean_outliers_when_binning=True,
        multiply_flux_error=True
):
    """
    Initialise the parent class and reset all cluster, queue and status bookkeeping.

    :param base_name: the base name of the data directory
    :type base_name: str
    :param parent_sample_class: the parent sample class
    :type parent_sample_class: ParentSampleBase
    :param min_sep_arcsec: query region around source for positional query
    :type min_sep_arcsec: float
    :param n_chunks: number of chunks to split the sample into
    :type n_chunks: int
    :param clean_outliers_when_binning: if True, clean outliers when binning
    :type clean_outliers_when_binning: bool
    :param multiply_flux_error: forwarded unchanged to the parent constructor
    """
    parent_kwargs = dict(
        base_name=base_name,
        parent_sample_class=parent_sample_class,
        min_sep_arcsec=min_sep_arcsec,
        n_chunks=n_chunks,
        clean_outliers_when_binning=clean_outliers_when_binning,
        multiply_flux_error=multiply_flux_error,
    )
    super().__init__(**parent_kwargs)

    # ----- cluster bookkeeping ----- #
    self._status_output = None
    # NOTE(review): the return value is not used here; the call is kept in case it has side effects
    get_directories()
    self.executable_filename = self.cluster_dir / "run_timewise.sh"
    self.submit_file_filename = self.cluster_dir / "submit_file.submit"
    self.job_id = None

    self.cluster_jobID_map = None
    self.clusterJob_chunk_map = None
    self.cluster_info_file = self.cluster_dir / 'cluster_info.pkl'
    self._overwrite = True

    # filled in by get_sample_photometric_data() so the worker threads can read them
    self._n_cluster_jobs_per_chunk = None
    self._storage_dir = None

    # ----- progress / status ----- #
    self.start_time = None
    self._total_tasks = None
    self._done_tasks = None

    # ----- work queues, created when a download run starts ----- #
    self._tap_queue = None
    self._cluster_queue = None
    self._io_queue = None
    self._io_queue_done = None
    self._combining_queue = None
|
|
120
|
-
|
|
121
|
-
# ---------------------------------------------------------------------------------- #
|
|
122
|
-
# START using gzip to compress the data when saving #
|
|
123
|
-
# ----------------------------------------------------- #
|
|
124
|
-
|
|
125
|
-
def _data_product_filename(self, service, chunk_number=None, jobID=None, use_bigdata_dir=False):
    """
    Return the parent class' data product filename with a ``.gz`` suffix appended.

    :param service: passed through to the parent implementation
    :param chunk_number: passed through to the parent implementation
    :param jobID: passed through to the parent implementation
    :param use_bigdata_dir: if True, replace the ``data_dir`` part of the path with ``bigdata_dir``
    :type use_bigdata_dir: bool
    :return: the filename of the gzipped data product
    :rtype: Path
    """
    plain_name = str(
        super(WISEDataDESYCluster, self)._data_product_filename(service, chunk_number=chunk_number, jobID=jobID)
    )

    if use_bigdata_dir:
        # plain string substitution of the directory prefix
        dirs = get_directories()
        plain_name = plain_name.replace(str(dirs["data_dir"]), str(dirs["bigdata_dir"]))

    return Path(plain_name + ".gz")
|
|
133
|
-
|
|
134
|
-
def load_data_product(
        self,
        service,
        chunk_number=None,
        jobID=None,
        return_filename=False,
        use_bigdata_dir=False,
        verify_contains_lightcurves=False
):
    """
    Load a gzipped JSON data product.

    :param service: used to build the filename
    :param chunk_number: used to build the filename
    :param jobID: used to build the filename
    :param return_filename: if True, return a tuple of data product and filename
    :type return_filename: bool
    :param use_bigdata_dir: if True, look for the file in the big-data directory
    :type use_bigdata_dir: bool
    :param verify_contains_lightcurves: if True, run `_verify_contains_lightcurves` on the loaded product
    :type verify_contains_lightcurves: bool
    :return: the data product (and the filename if requested); None if the file does not exist
    :raises KeyError: if the verification fails, the message is prefixed with the filename
    """
    fn = self._data_product_filename(service, chunk_number, jobID, use_bigdata_dir=use_bigdata_dir)
    logger.debug(f"loading {fn}")

    try:
        with gzip.open(fn, 'rt', encoding="utf-8") as fzip:
            data_product = json.load(fzip)

        if verify_contains_lightcurves:
            try:
                self._verify_contains_lightcurves(data_product)
            except KeyError as e:
                raise KeyError(f"{fn}: {e}")

    except FileNotFoundError:
        # a missing file is only reported, the caller receives None
        logger.warning(f"No file {fn}")
        return None

    return (data_product, fn) if return_filename else data_product
|
|
166
|
-
|
|
167
|
-
def _save_data_product(
        self,
        data_product,
        service,
        chunk_number=None,
        jobID=None,
        overwrite=False,
        use_bigdata_dir=False
):
    """
    Save a data product as gzipped JSON.

    If ``overwrite`` is False and a product already exists at the target location, the
    existing entries are merged into ``data_product`` (in place) before writing.

    :param data_product: the data product to save
    :type data_product: dict
    :param service: used to build the filename
    :param chunk_number: used to build the filename
    :param jobID: used to build the filename
    :param overwrite: if False, combine with an already existing data product
    :type overwrite: bool
    :param use_bigdata_dir: if True, write to (and merge with) the file in the big-data directory
    :type use_bigdata_dir: bool
    """
    fn = self._data_product_filename(
        service,
        chunk_number,
        jobID,
        use_bigdata_dir=use_bigdata_dir
    )
    logger.debug(f"saving {len(data_product)} new objects to {fn}")

    # the un-chunked product is also kept in the in-memory cache
    if fn == self._data_product_filename(service):
        self._cached_final_products['lightcurves'][service] = data_product

    if not overwrite:
        try:
            # look at the same location we are about to write to
            old_data_product = self.load_data_product(
                service=service,
                chunk_number=chunk_number,
                jobID=jobID,
                use_bigdata_dir=use_bigdata_dir
            )

            if old_data_product is not None:
                logger.debug(f"Found {len(old_data_product)}. Combining")
                # dict.update() works in place and returns None, so its result must
                # not be assigned back (that would write ``null`` to disk)
                data_product.update(old_data_product)

        except FileNotFoundError as e:
            logger.info(f"FileNotFoundError: {e}. Making new binned lightcurves.")

    with gzip.open(fn, 'wt', encoding="utf-8") as fzip:
        json.dump(data_product, fzip)
|
|
200
|
-
|
|
201
|
-
# ----------------------------------------------------- #
|
|
202
|
-
# END using gzip to compress the data when saving #
|
|
203
|
-
# ---------------------------------------------------------------------------------- #
|
|
204
|
-
|
|
205
|
-
def get_sample_photometric_data(self, max_nTAPjobs=8, perc=1, tables=None, chunks=None,
                                cluster_jobs_per_chunk=100, wait=5, remove_chunks=False,
                                query_type='positional', overwrite=True,
                                storage_directory=None,
                                node_memory='8G',
                                skip_download=False,
                                skip_input=False,
                                mask_by_position=False):
    """
    An alternative to `get_photometric_data()` that uses the DESY cluster and is optimised for large datasets.

    The work is distributed over TAP threads, cluster threads, one IO thread, one combining
    thread and one status thread; this method blocks until the TAP, cluster and combining
    queues have been joined.

    :param max_nTAPjobs: The maximum number of TAP jobs active at the same time.
    :type max_nTAPjobs: int
    :param perc: The percentage of chunks to download
    :type perc: float
    :param tables: The tables to query
    :type tables: str or list-like
    :param chunks: chunks to download, default is all of the chunks
    :type chunks: list-like
    :param cluster_jobs_per_chunk: number of cluster jobs per chunk
    :type cluster_jobs_per_chunk: int
    :param wait: time in hours to wait after submitting TAP jobs
    :type wait: float
    :param remove_chunks: remove single chunk files after binning
    :type remove_chunks: bool
    :param query_type: 'positional': query photometry based on distance from object, 'by_allwise_id': select all photometry points within a radius of 50 arcsec with the corresponding AllWISE ID
    :type query_type: str
    :param overwrite: overwrite already existing lightcurves and metadata
    :type overwrite: bool
    :param storage_directory: move binned files and raw data here after work is done, defaults to TIMEWISE_BIGDATA_DIR
    :type storage_directory: str | Path
    :param node_memory: memory per node on the cluster, default is 8G
    :type node_memory: str
    :param skip_download: if True, assume data is already downloaded, only do binning in that case
    :type skip_download: bool
    :param skip_input: if True do not ask if data is correct before download
    :type skip_input: bool
    :param mask_by_position: if `True` mask single exposures that are too far away from the bulk
    :type mask_by_position: bool
    :raises ValueError: if a requested chunk is not in the chunk map or the query type is unknown
    :raises NotImplementedError: if ``remove_chunks`` is set
    """

    # --------------------- set defaults --------------------------- #

    # magnitudes and fluxes are always both requested
    mag = flux = True

    default_tables = [
        'AllWISE Multiepoch Photometry Table',
        'NEOWISE-R Single Exposure (L1b) Source Table'
    ]
    tables = np.atleast_1d(default_tables if tables is None else tables)

    if chunks is None:
        # int() already yields an integer; the former round() around it had no effect
        chunks = list(range(int(self.n_chunks * perc)))
    else:
        not_in_map = [c not in self.chunk_map for c in chunks]
        if np.any(not_in_map):
            raise ValueError(f"Chunks {np.array(chunks)[not_in_map]} are not in chunk map. "
                             f"Probably they are larger than the set chunk number of {self._n_chunks}")

    if remove_chunks:
        raise NotImplementedError("Removing chunks is not implemented yet!")

    if query_type not in self.query_types:
        raise ValueError(f"Unknown query type {query_type}! Choose one of {self.query_types}")

    service = 'tap'

    # one dictionary per table to keep track of the submitted TAP jobs
    self.tap_jobs = {t: dict() for t in tables}

    logger.debug(f"Getting {perc * 100:.2f}% of lightcurve chunks ({len(chunks)}) via {service} "
                 f"in {'magnitude' if mag else ''} {'flux' if flux else ''} "
                 f"from {tables}\nskipping download: {skip_download}")

    if not skip_input:
        input('Correct? [hit enter] ')

    # --------------------------- set up cluster info --------------------------- #

    self.n_cluster_jobs_per_chunk = cluster_jobs_per_chunk
    self.clear_cluster_log_dir()
    self._save_cluster_info()
    self._overwrite = overwrite
    if storage_directory is None:
        self._storage_dir = get_directories()['bigdata_dir']
    else:
        self._storage_dir = Path(storage_directory)

    # --------------------------- set up queues --------------------------- #

    self.queue = queue.Queue()
    self._tap_queue = ErrorQueue()
    self._cluster_queue = ErrorQueue()
    self._io_queue = queue.PriorityQueue()
    self._io_queue_done = queue.Queue()
    self._combining_queue = ErrorQueue()

    # --------------------------- starting threads --------------------------- #

    def make_workers(error_queue, target, name_prefix):
        # one exception-safe daemon thread per allowed TAP job
        return [
            ExceptionSafeThread(error_queue=error_queue, target=target, daemon=True, name=f"{name_prefix}{k}")
            for k in range(max_nTAPjobs)
        ]

    tap_threads = make_workers(self._tap_queue, self._tap_thread, "TAPThread")
    cluster_threads = make_workers(self._cluster_queue, self._cluster_thread, "ClusterThread")
    io_thread = threading.Thread(target=self._io_thread, daemon=True, name="IOThread")
    combining_thread = ExceptionSafeThread(
        error_queue=self._combining_queue,
        target=self._combining_thread,
        daemon=True,
        name="CombiningThread"
    )
    status_thread = threading.Thread(target=self._status_thread, daemon=True, name='StatusThread')

    for worker in [*tap_threads, *cluster_threads, io_thread, combining_thread]:
        logger.debug('starting thread')
        worker.start()

    logger.debug(f'started {len(tap_threads)} TAP threads and {len(cluster_threads)} cluster threads.')

    # --------------------------- filling queue with tasks --------------------------- #

    self.start_time = time.time()
    self._total_tasks = len(chunks)
    self._done_tasks = 0

    for c in chunks:
        if skip_download:
            # data is assumed to be present already, go straight to the cluster step
            self._cluster_queue.put((node_memory, c, mask_by_position))
        else:
            self._tap_queue.put((tables, c, wait, mag, flux, node_memory, query_type, mask_by_position))

    # the status thread reads start_time and the task counters, so it is started last
    status_thread.start()

    # --------------------------- wait for completion --------------------------- #

    logger.debug(f'added {self._tap_queue.qsize()} tasks to tap queue')
    for work_queue, done_msg in (
            (self._tap_queue, 'TAP done'),
            (self._cluster_queue, 'cluster done'),
            (self._combining_queue, 'combining done'),
    ):
        work_queue.join()
        logger.debug(done_msg)

    # unset queues
    self.queue = None
    self._tap_queue = None
    self._cluster_queue = None
    self._io_queue = None
    self._io_queue_done = None
    self._combining_queue = None
|
|
352
|
-
|
|
353
|
-
@backoff.on_exception(
|
|
354
|
-
backoff.expo,
|
|
355
|
-
vo.dal.exceptions.DALServiceError,
|
|
356
|
-
giveup=WiseDataByVisit._give_up_tap,
|
|
357
|
-
max_tries=50,
|
|
358
|
-
on_backoff=backoff_hndlr
|
|
359
|
-
)
|
|
360
|
-
def _wait_for_job(self, t, i):
    """
    Block until the TAP job stored under ``self.tap_jobs[t][i]`` has finished waiting.

    :param t: key of the table in ``self.tap_jobs``
    :param i: key of the query within that table
    """
    logger.info(f"Waiting on {i}th query of {t} ........")
    # the job is re-created from the URL stored in the bookkeeping dictionary
    tap_job = StableAsyncTAPJob(url=self.tap_jobs[t][i])
    tap_job.wait()
    logger.info(f'{i}th query of {t}: Done!')
|
|
365
|
-
|
|
366
|
-
def _get_results_from_job(self, t, i):
    """
    Fetch the result of a TAP job and write it to the chunk photometry cache as CSV.

    :param t: key of the table in ``self.tap_jobs``
    :param i: key of the query within that table
    """
    logger.debug(f"getting results for {i}th query of {t} .........")
    tap_job = StableAsyncTAPJob(url=self.tap_jobs[t][i])
    lightcurve = tap_job.fetch_result().to_table().to_pandas()

    cache_fn = self._chunk_photometry_cache_filename(t, i)
    table_nice_name = self.get_db_name(t, nice=True)
    logger.debug(f"{i}th query of {table_nice_name}: saving under {cache_fn}")

    # column renaming: magnitude keys first, then flux keys
    column_names = dict(self.photometry_table_keymap[table_nice_name]['mag'])
    column_names.update(self.photometry_table_keymap[table_nice_name]['flux'])
    if 'allwise' in t:
        column_names['cntr_mf'] = 'allwise_cntr'

    renamed = lightcurve.rename(columns=column_names)
    renamed.to_csv(cache_fn)
|
|
381
|
-
|
|
382
|
-
def _io_queue_hash(self, method_name, args):
    """
    Build the string key that identifies an IO task.

    :param method_name: name of the method the IO thread should call
    :type method_name: str
    :param args: arguments of that call
    :return: ``method_name`` and ``args`` joined by an underscore
    :rtype: str
    """
    return "{}_{}".format(method_name, args)
|
|
384
|
-
|
|
385
|
-
def _wait_for_io_task(self, method_name, args):
    """
    Block until the IO task identified by ``method_name`` and ``args`` is listed as done.

    The list of finished tasks is polled every 30 seconds.

    :param method_name: name of the method the IO thread should call
    :type method_name: str
    :param args: arguments of that call
    """
    task_key = self._io_queue_hash(method_name, args)
    logger.debug(f"waiting on io-task {task_key}")

    # snapshot the finished tasks on every poll
    while task_key not in list(self._io_queue_done.queue):
        time.sleep(30)

    logger.debug(f"{task_key} done!")
|
|
397
|
-
|
|
398
|
-
def _io_thread(self):
    """
    Worker loop of the IO thread.

    Takes ``(priority, method_name, args)`` items from the IO queue, calls the named method
    of this object and records the finished task. If the call raises, the error is logged
    and the same task is put back into the queue.
    """
    logger.debug("started in-out thread")
    while True:
        task = self._io_queue.get(block=True)
        priority, method_name, args = task
        logger.debug(f"executing {method_name} with arguments {args} (priority {priority})")

        try:
            io_method = self.__getattribute__(method_name)
            io_method(*args)
            self._io_queue_done.put(self._io_queue_hash(method_name, args))
        except Exception as e:
            formatted_traceback = ''.join(tb.format_exception(None, e, e.__traceback__))
            msg = "".join([
                f"#################################################################\n",
                f" !!! ATTENTION !!! \n",
                f" ----------------- {method_name}({args}) ---------------- \n",
                f" AN ERROR OCCURED \n",
                f"\n{formatted_traceback}\n\n",
                f"putting {method_name}({args}) back into IO-queue\n",
                f"#################################################################\n",
            ])
            logger.error(msg)
            # retry later with the same priority
            self._io_queue.put((priority, method_name, args))
        finally:
            self._io_queue.task_done()

        gc.collect()
|
|
422
|
-
|
|
423
|
-
def _tap_thread(self):
    """
    Worker loop of a TAP thread.

    For every task from the TAP queue the tables are processed in a pipelined fashion:
    in step ``i`` the job for table ``i`` is submitted, the results of table ``i - 1`` are
    fetched, and then the thread waits for the job of table ``i``. The chunk is passed on to
    the cluster queue only if every job completed.
    """
    logger.debug('started tap thread')

    def run_io_task(priority, method_name, method_args):
        # hand the call over to the IO thread and block until it is reported as done
        self._io_queue.put((priority, method_name, method_args))
        self._wait_for_io_task(method_name, method_args)

    while True:
        (tables, chunk, wait, mag, flux,
         node_memory, query_type, mask_by_position) = self._tap_queue.get(block=True)
        logger.debug(f'querying IRSA for chunk {chunk}')

        submit_to_cluster = True
        n_tables = len(tables)

        # one extra iteration so the results of the last table are collected as well
        for i in range(n_tables + 1):
            has_current_table = i < n_tables

            # ----------- submit jobs for chunk i via the IRSA TAP ---------- #
            if has_current_table:
                run_io_task(1, "_submit_job_to_TAP", [chunk, tables[i], mag, flux, query_type])

            # -------------- get results of TAP job for chunk i-1 ------------- #
            if i > 0:
                t_before = tables[i - 1]
                phase = StableAsyncTAPJob(url=self.tap_jobs[t_before][chunk]).phase
                if phase != "COMPLETED":
                    logger.warning(
                        f"No completion for {chunk}th query of {t_before}! "
                        f"Phase is {phase}!"
                    )
                    submit_to_cluster = False
                else:
                    run_io_task(2, "_get_results_from_job", [t_before, chunk])

            # --------------- wait for the TAP job of chunk i -------------- #
            if has_current_table:
                t = tables[i]
                logger.info(f'waiting for {wait} hours')
                time.sleep(wait * 3600)

                try:
                    self._wait_for_job(t, chunk)
                except vo.dal.exceptions.DALServiceError:
                    logger.warning(f"could not wait for {chunk}th query of {t}! Not submitting to cluster.")
                    # move on with the next table, but do not submit this chunk to the cluster
                    submit_to_cluster = False

        self._tap_queue.task_done()
        if submit_to_cluster:
            self._cluster_queue.put((node_memory, chunk, mask_by_position))

        gc.collect()
|
|
477
|
-
|
|
478
|
-
def _move_file_to_storage(self, filename):
    """
    Copy a file into the storage directory and remove the source if the sizes match.

    The destination is built by replacing the ``data_dir`` part of ``filename`` with
    ``self._storage_dir``. A missing file is logged as a warning and otherwise ignored.

    :param filename: the file to move
    :type filename: str | Path
    """
    data_dir = str(get_directories()['data_dir'])
    destination = Path(str(filename).replace(data_dir, str(self._storage_dir)))
    destination.parent.mkdir(parents=True, exist_ok=True)

    logger.debug(f"copy {filename} to {destination}")

    try:
        shutil.copy2(filename, destination)

        # only delete the source when the copy has the same size
        source_size = Path(filename).stat().st_size
        if source_size != destination.stat().st_size:
            logger.warning(f"copy from {filename} to {destination} gone wrong! Not removing source.")
            return

        logger.debug(f"copy successful, removing {filename}")
        os.remove(filename)

    except FileNotFoundError as e:
        logger.warning(f"FileNotFoundError: {e}!")
|
|
496
|
-
|
|
497
|
-
def _cluster_thread(self):
    """
    Worker loop of a cluster thread.

    Submits the chunk taken from the cluster queue to the cluster, waits for the cluster job,
    moves the job's log files from the working directory to ``self.cluster_log_dir`` and hands
    the chunk to the combining queue. If the submission returns no job id the task is queued again.
    """
    logger.debug('started cluster thread')
    while True:
        node_memory, chunk, mask_by_position = self._cluster_queue.get(block=True)

        logger.info(f'got all TAP results for chunk {chunk}. submitting to cluster')
        job_id = self.submit_to_cluster(
            node_memory=node_memory,
            single_chunk=chunk,
            mask_by_position=mask_by_position
        )

        if not job_id:
            # re-queue the task so it is tried again later
            logger.warning(f"could not submit {chunk} to cluster! Try later")
            self._cluster_queue.put((node_memory, chunk, mask_by_position))
            self._cluster_queue.task_done()
            continue

        logger.debug(f'waiting for chunk {chunk} (Cluster job {job_id})')
        self.wait_for_job(job_id)
        logger.debug(f'cluster done for chunk {chunk} (Cluster job {job_id}).')

        # log files are looked up relative to the current working directory
        log_files_abs = [p.absolute() for p in Path("./").glob(f"{job_id}_*")]
        logger.debug(f"moving {len(log_files_abs)} log files to {self.cluster_log_dir}")
        for log_file in log_files_abs:
            shutil.move(log_file, self.cluster_log_dir)

        gc.collect()

        logger.debug(f"cluster thread done for chunk {chunk} (Cluster job {job_id}). "
                     f"Submitting to combining queue")
        self._combining_queue.put(chunk)
        self._cluster_queue.task_done()
|
|
531
|
-
|
|
532
|
-
def _combining_thread(self):
    """
    Worker loop of the combining thread.

    Combines the data products of every chunk taken from the combining queue and, when that
    succeeded and a storage directory is set, moves the resulting files to storage. The loop
    ends when the combining queue has been reset to None.
    """
    logger.debug('started combining thread')
    while True:

        try:
            chunk = self._combining_queue.get(block=True)
        except AttributeError:  # when self._combining_queue is None, meaning it was reset in main thread
            return
        logger.debug(f"combining chunk {chunk}")

        try:
            success = self._combine_data_products('tap', chunk_number=chunk, remove=True, overwrite=self._overwrite)

            if not success:
                msg = f"Chunk {chunk}: Combining data products not successfully!"
                if self._storage_dir:
                    msg += " Not moving files to storage."
                logger.warning(msg)

            elif self._storage_dir:
                # the combined product first, then the cached photometry of every table
                filenames_to_move = [self._data_product_filename(service='tap', chunk_number=chunk)]
                filenames_to_move += [
                    self._chunk_photometry_cache_filename(t, chunk)
                    for t in self.photometry_table_keymap.keys()
                ]

                for fn in filenames_to_move:
                    try:
                        self._move_file_to_storage(fn)
                    except shutil.SameFileError as e:
                        logger.error(f"{e}. Not moving.")

        finally:
            # the task counts as done whether or not combining worked
            self._combining_queue.task_done()
            self._done_tasks += 1
            gc.collect()
|
|
570
|
-
|
|
571
|
-
def _status_thread(self):
    """
    Worker loop of the status thread: logs a progress summary every five hours.

    The remaining time is extrapolated from the average time per finished task and is NaN
    as long as no task has finished.
    """
    logger.debug('started status thread')
    while True:
        n_tap_tasks_queued = self._tap_queue.qsize()
        n_cluster_tasks_queued = self._cluster_queue.qsize()
        n_remaining = self._total_tasks - self._done_tasks
        elapsed_time = time.time() - self.start_time

        if self._done_tasks > 0:
            time_per_task = elapsed_time / self._done_tasks
        else:
            time_per_task = np.nan
        remaining_time = n_remaining * time_per_task

        msg = "".join([
            f"\n----------------- STATUS -----------------\n",
            f"\ttasks in TAP queue:_______{n_tap_tasks_queued}\n",
            f"\ttasks in cluster queue:___{n_cluster_tasks_queued}\n",
            f"\tperformed io tasks:_______{len(list(self._io_queue_done.queue))}\n",
            f"\tdone total:_______________{self._done_tasks}/{self._total_tasks}\n",
            f"\truntime:__________________{elapsed_time/3600:.2f} hours\n",
            f"\tremaining:________________{remaining_time/3600:.2f} hours",
        ])

        logger.info(msg)
        time.sleep(5*3600)
|
|
591
|
-
|
|
592
|
-
# ----------------------------------------------------------------------------------- #
|
|
593
|
-
# START using cluster for downloading and binning #
|
|
594
|
-
# ---------------------------------------------------- #
|
|
595
|
-
|
|
596
|
-
@staticmethod
|
|
597
|
-
@backoff.on_exception(
|
|
598
|
-
backoff.expo,
|
|
599
|
-
OSError,
|
|
600
|
-
max_time=2*3600,
|
|
601
|
-
on_backoff=backoff_hndlr,
|
|
602
|
-
jitter=backoff.full_jitter,
|
|
603
|
-
)
|
|
604
|
-
def _execute_bash_command(cmd):
    """
    Run a command through the shell and return everything it wrote to stdout.

    :param cmd: the command line to execute
    :type cmd: str
    :return: the decoded stdout of the command
    :rtype: str
    """
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    with process:
        # read until stdout is closed, then make sure the process is gone
        output = process.stdout.read().decode()
        process.terminate()
    return output
|
|
609
|
-
|
|
610
|
-
@staticmethod
|
|
611
|
-
def get_condor_status():
    """
    Run ``condor_q`` and return its output.

    :return: str, output of query command
    """
    return WISEDataDESYCluster._execute_bash_command("condor_q")
|
|
618
|
-
|
|
619
|
-
def collect_condor_status(self):
    """Query the condor status and store the output in ``self._status_output``."""
    latest_output = self.get_condor_status()
    self._status_output = latest_output
|
|
622
|
-
|
|
623
|
-
def condor_status(self, job_id):
    """
    Get the status of jobs running on condor.

    Parses the output stored in ``self._status_output``. The first four and the last six
    lines are skipped; every remaining line is split at spaces and matched against
    ``job_id`` via its third field. If several lines match, the last one wins.

    :param job_id: the job id as it appears in the third field of the status line
    :return: number of jobs that are done, running, waiting, total, held; the values are the
        strings found in the output (``held`` is the integer 0 for lines with exactly ten
        fields); all five are None if no line matches
    :rtype: tuple
    """
    done = running = waiting = total = held = None

    for line in self._status_output.split("\n")[4:-6]:
        fields = [token for token in line.split(" ") if token]

        # blank or truncated lines have no job id field; skip them instead of
        # failing with an IndexError
        if len(fields) < 3 or fields[2] != job_id:
            continue

        done, running, waiting = fields[5:8]
        # lines with ten fields carry no separate "held" column
        held = 0 if len(fields) == 10 else fields[8]
        total = fields[-2]

    return done, running, waiting, total, held
|
|
641
|
-
|
|
642
|
-
def wait_for_job(self, job_id=None):
    """
    Wait until the cluster job is done

    The condor status is refreshed every 90 seconds; the job counts as done as soon as
    `condor_status` no longer finds it, i.e. returns only None values. The full status
    output is logged on every eighth poll.

    :param job_id: the id of the job to wait for, defaults to ``self.job_id``
    """
    _job_id = job_id or self.job_id

    if not _job_id:
        logger.info(f"No Job ID!")
        return

    logger.info("waiting for job with ID " + str(_job_id))
    time.sleep(5)

    self.collect_condor_status()
    polls_since_full_log = 0
    while True:
        # parse the status once per iteration and test for None by identity
        status = self.condor_status(_job_id)
        if all(value is None for value in status):
            break

        d, r, w, t, h = status
        logger.info(
            f"{time.asctime(time.localtime())} - Job{_job_id}: "
            f"{d} done, {r} running, {w} waiting, {h} held of total {t}"
        )
        polls_since_full_log += 1
        if polls_since_full_log > 7:
            logger.info(self._status_output)
            polls_since_full_log = 0
        time.sleep(90)
        self.collect_condor_status()

    logger.info("Done waiting for job with ID " + str(_job_id))
|
|
672
|
-
|
|
673
|
-
@property
|
|
674
|
-
def n_cluster_jobs_per_chunk(self):
    """Return the currently set number of cluster jobs per chunk (None if not set yet)."""
    jobs_per_chunk = self._n_cluster_jobs_per_chunk
    return jobs_per_chunk
|
|
676
|
-
|
|
677
|
-
@n_cluster_jobs_per_chunk.setter
|
|
678
|
-
def n_cluster_jobs_per_chunk(self, value):
    """
    Set the number of cluster jobs per chunk and rebuild the job maps.

    ``cluster_jobID_map`` gets one entry per row of the parent sample holding the job number
    of that row; ``clusterJob_chunk_map`` maps every job number to its chunk number. Job
    numbers start at 1. A falsy value only triggers a warning.

    :param value: number of cluster jobs per chunk
    :type value: int
    """
    self._n_cluster_jobs_per_chunk = value

    if not value:
        logger.warning(f'Invalid value for n_cluster_jobs_per_chunk: {value}')
        return

    n_jobs = self.n_chunks * int(value)
    logger.debug(f'setting {n_jobs} jobs.')

    # the maps are attached first and then filled in place
    self.cluster_jobID_map = np.zeros(len(self.parent_sample.df), dtype=int)
    self.clusterJob_chunk_map = pd.DataFrame(columns=['chunk_number'])
    jobs_per_chunk = self._n_cluster_jobs_per_chunk

    for chunk_number in range(self.n_chunks):
        chunk_indices = np.where(self.chunk_map == chunk_number)[0]
        # every job gets the same number of objects, the last one takes the remainder
        n_per_job = int(math.ceil(len(chunk_indices) / jobs_per_chunk))

        for j in range(jobs_per_chunk):
            job_nr = chunk_number * jobs_per_chunk + j + 1
            self.clusterJob_chunk_map.loc[job_nr] = [chunk_number]

            first = j * n_per_job
            self.cluster_jobID_map[chunk_indices[first:first + n_per_job]] = job_nr
|
|
699
|
-
|
|
700
|
-
def _get_chunk_number_for_job(self, jobID):
|
|
701
|
-
chunk_number = self.clusterJob_chunk_map.loc[jobID, 'chunk_number']
|
|
702
|
-
return chunk_number
|
|
703
|
-
|
|
704
|
-
def _save_cluster_info(self):
    """
    Pickle the cluster bookkeeping to ``self.cluster_info_file``.

    The file holds the tuple
    ``(cluster_jobID_map, clusterJob_chunk_map, clean_outliers_when_binning)``.
    """
    logger.debug(f"writing cluster info to {self.cluster_info_file}")
    cluster_info = (
        self.cluster_jobID_map,
        self.clusterJob_chunk_map,
        self.clean_outliers_when_binning,
    )
    with open(self.cluster_info_file, "wb") as info_file:
        pickle.dump(cluster_info, info_file)
|
|
708
|
-
|
|
709
|
-
def _load_cluster_info(self):
    """
    Restore the cluster bookkeeping from ``self.cluster_info_file``.

    Sets ``cluster_jobID_map``, ``clusterJob_chunk_map`` and
    ``clean_outliers_when_binning`` from the pickled 3-tuple.
    """
    logger.debug(f"loading cluster info from {self.cluster_info_file}")
    # NOTE(review): pickle executes arbitrary code on load; the file must only
    # ever come from a trusted source.
    with open(self.cluster_info_file, "rb") as info_file:
        cluster_info = pickle.load(info_file)
    self.cluster_jobID_map, self.clusterJob_chunk_map, self.clean_outliers_when_binning = cluster_info
|
|
713
|
-
|
|
714
|
-
def clear_cluster_log_dir(self):
    """
    Clears the directory where cluster logs are stored

    Only direct children of ``self.cluster_log_dir`` that are files or
    symlinks are removed; sub-directories are left untouched.
    """
    # Materialise the listing first so entries are not removed while the
    # directory is still being iterated.
    for entry in list(self.cluster_log_dir.glob("*")):
        # glob() already yields paths that include the directory, so they must
        # not be joined with it again (that doubles a relative path).
        if entry.is_file() or entry.is_symlink():
            entry.unlink()
|
|
721
|
-
|
|
722
|
-
def make_executable_file(self):
    """
    Write the executable that is submitted to the cluster.

    The file contains a single command line that runs this very module with
    the current interpreter, forwarding the logger level, ``base_name``, the
    minimum separation in arcsec and the number of chunks. The job ID and the
    ``mask_by_position`` flag are taken from the positional arguments
    ``$1`` and ``$2``.
    """
    level = logger.getEffectiveLevel()
    this_script = os.path.realpath(__file__)

    command_parts = [
        f'{sys.executable} {this_script} ',
        f'--logging_level {level} ',
        f'--base_name {self.base_name} ',
        f'--min_sep_arcsec {self.min_sep.to("arcsec").value} ',
        f'--n_chunks {self._n_chunks} ',
        '--job_id $1 ',
        '--mask_by_position $2',
    ]
    command = "".join(command_parts)

    logger.debug("writing executable to " + str(self.executable_filename))
    with self.executable_filename.open("w") as handle:
        handle.write(command)
|
|
742
|
-
|
|
743
|
-
def get_submit_file_filename(self, ids):
    """
    Build the path of the submit file for the given job ids

    The file lives in ``self.cluster_dir`` and is named after the smallest
    and largest id, e.g. ``ids1-8.submit``.

    :param ids: a single job id or a list of job ids
    :type ids: int or list
    :return: ``self.cluster_dir`` joined with the submit file name
    """
    id_array = np.atleast_1d(ids)
    lowest = min(id_array)
    highest = max(id_array)
    submit_name = f"ids{lowest}-{highest}.submit"
    return self.cluster_dir / submit_name
|
|
755
|
-
|
|
756
|
-
def make_submit_file(
        self,
        job_ids: "int | list[int]",
        node_memory: str = '8G',
        mask_by_position: bool = False
):
    """
    Produces the submit file that will be submitted to the NPX cluster.

    The file is written to the path returned by
    ``get_submit_file_filename(job_ids)`` and queues one job per job ID.

    :param job_ids: The job ID or list of job IDs to submit
    :type job_ids: int or list of ints
    :param node_memory: The amount of memory to request for each node
    :type node_memory: str
    :param mask_by_position: if `True` mask single exposures that are too far away from the bulk
    :type mask_by_position: bool
    """

    # queue statement: one job per ID, the ID is exposed as $(job_id)
    q = "1 job_id in " + ", ".join(np.atleast_1d(job_ids).astype(str))
    d = get_directories()
    data_dir = str(d['data_dir'])
    bigdata_dir = str(d['bigdata_dir'])

    text = (
        f"executable = {self.executable_filename} \n"
        f"environment = \"TIMEWISE_DATA={data_dir} TIMEWISE_BIGDATA={bigdata_dir}\" \n"
        f"log = $(cluster)_$(process)job.log \n"
        f"output = $(cluster)_$(process)job.out \n"
        f"error = $(cluster)_$(process)job.err \n"
        f"should_transfer_files   = YES \n"
        f"when_to_transfer_output = ON_EXIT \n"
        f"arguments = $(job_id) {mask_by_position}\n"
        f"RequestMemory = {node_memory} \n"
        f"\n"
        f"queue {q}"
    )

    fn = self.get_submit_file_filename(job_ids)
    # fn is a path object, so it has to be formatted rather than
    # concatenated to a str
    logger.debug(f"writing submitfile at {fn}")
    with open(fn, "w") as submit_file:
        submit_file.write(text)
|
|
796
|
-
|
|
797
|
-
def submit_to_cluster(self, node_memory, single_chunk=None, mask_by_position=False):
    """
    Submit jobs to cluster

    Prepares the per-job data product files (and, if requested, the position
    masks), writes the executable and the submit file and finally calls
    ``condor_submit``.

    :param node_memory: memory per node
    :type node_memory: str
    :param single_chunk: number of single chunk to run on the cluster, all chunks if `None`
    :type single_chunk: int
    :param mask_by_position: if `True` mask single exposures that are too far away from the bulk
    :type mask_by_position: bool
    :return: ID of the cluster job as parsed from the ``condor_submit`` output,
        `None` if the submission raised an ``OSError``
    :rtype: str
    """

    # job IDs are 1-based and consecutive within a chunk
    if single_chunk is None:
        _start_id = 1
        _end_id = int(self.n_chunks*self.n_cluster_jobs_per_chunk)
    else:
        _start_id = int(single_chunk*self.n_cluster_jobs_per_chunk) + 1
        _end_id = int(_start_id + self.n_cluster_jobs_per_chunk) - 1

    job_ids = list(range(_start_id, _end_id + 1))

    # make data_product files, storing essential info from parent_sample
    for jobID in job_ids:
        indices = self.parent_sample.df.index[self.cluster_jobID_map == jobID]
        logger.debug(f"starting data_product for {len(indices)} objects.")
        data_product = self._start_data_product(parent_sample_indices=indices)
        chunk_number = self._get_chunk_number_for_job(jobID)
        self._save_data_product(data_product, service="tap", chunk_number=chunk_number, jobID=jobID)

    # make position mask files
    if mask_by_position:
        # compare against None: chunk number 0 is a valid single chunk
        if single_chunk is not None:
            chunk_numbers = [single_chunk]
        else:
            chunk_numbers = list(range(self.n_chunks))
        for c in chunk_numbers:
            self.get_position_mask(service="tap", chunk_number=c)

    self.make_executable_file()
    self.make_submit_file(job_ids=job_ids, node_memory=node_memory, mask_by_position=mask_by_position)

    # the submit file name is a path object, so format instead of concatenating
    submit_cmd = f"condor_submit {self.get_submit_file_filename(job_ids)}"
    logger.info(f"{time.asctime(time.localtime())}: {submit_cmd}")

    try:
        msg = self._execute_bash_command(submit_cmd)
    except OSError as err:
        # keep the best-effort behaviour (no exception for the caller) but
        # leave a trace of why nothing was submitted
        logger.warning(f"could not submit to cluster: {err}")
        return None

    logger.info(str(msg))
    # the cluster ID is the number following "cluster " in the submit message
    job_id = str(msg).split("cluster ")[-1].split(".")[0]
    logger.info(f"Running on cluster with ID {job_id}")
    self.job_id = job_id
    return job_id
|
|
853
|
-
|
|
854
|
-
def run_cluster(self, node_memory, service):
    """
    Run the full cluster workflow

    Clears the cluster log directory, saves the cluster info, submits the
    jobs, waits for them and then combines the data products of every chunk
    (removing the per-job files and overwriting existing combined ones).

    :param node_memory: memory per node
    :type node_memory: str
    :param service: service passed on to ``_combine_data_products``
    :type service: str
    """
    self.clear_cluster_log_dir()
    self._save_cluster_info()
    self.submit_to_cluster(node_memory)
    self.wait_for_job()

    chunk_number = 0
    while chunk_number < self.n_chunks:
        self._combine_data_products(service, chunk_number=chunk_number, remove=True, overwrite=True)
        chunk_number += 1
|
|
870
|
-
|
|
871
|
-
# ---------------------------------------------------- #
|
|
872
|
-
# END using cluster for downloading and binning #
|
|
873
|
-
# ----------------------------------------------------------------------------------- #
|
|
874
|
-
|
|
875
|
-
###########################################################################################################
|
|
876
|
-
# START MAKE PLOTTING FUNCTIONS #
|
|
877
|
-
#####################################
|
|
878
|
-
|
|
879
|
-
def plot_lc(
        self,
        parent_sample_idx,
        service='tap',
        plot_unbinned=False,
        plot_binned=True,
        interactive=False,
        fn=None,
        ax=None,
        save=True,
        lum_key='flux_density',
        load_from_bigdata_dir=False,
        **kwargs
):
    """Make a pretty plot of a lightcurve

    :param parent_sample_idx: The index in the parent sample of the lightcurve
    :type parent_sample_idx: int or str
    :param service: the service with which the lightcurves were downloaded
    :type service: str
    :param plot_unbinned: plot unbinned data
    :type plot_unbinned: bool
    :param plot_binned: plot binned lightcurve
    :type plot_binned: bool
    :param interactive: interactive mode
    :type interactive: bool
    :param fn: filename, defaults to </path/to/timewise/data/dir>/output/plots/<base_name>/<parent_sample_index>_<lum_key>.pdf
    :type fn: str
    :param ax: pre-existing matplotlib.Axis
    :param save: save the plot
    :type save: bool
    :param lum_key: the unit of luminosity to use in the plot, either of 'mag', 'flux_density' or 'luminosity'
    :type lum_key: str
    :param load_from_bigdata_dir: load from the big data storage directory
    :type load_from_bigdata_dir: bool
    :param kwargs: any additional kwargs are forwarded to ``self._plot_lc``
    :return: whatever ``self._plot_lc`` returns
    """

    logger.debug(f"loading binned lightcurves")

    # the WISE ID is only looked up for the debug message below
    wise_id = self.parent_sample.df.loc[int(parent_sample_idx), self.parent_wise_source_id_key]
    if isinstance(wise_id, float) and not np.isnan(wise_id):
        wise_id = int(wise_id)
    logger.debug(f"{wise_id} for {parent_sample_idx}")

    _chunk_number = self._get_chunk_number(parent_sample_index=parent_sample_idx)
    data_product = self.load_data_product(
        service,
        chunk_number=_chunk_number,
        use_bigdata_dir=load_from_bigdata_dir
    )
    lc = pd.DataFrame.from_dict(data_product[parent_sample_idx]["timewise_lightcurve"])

    if plot_unbinned:
        # pick the loader that matches the download service
        _get_unbinned_lcs_fct = self.get_unbinned_lightcurves \
            if service == 'tap' else self._get_unbinned_lightcurves_gator
        unbinned_lcs = _get_unbinned_lcs_fct(_chunk_number)
        unbinned_lc = unbinned_lcs[unbinned_lcs[self._tap_orig_id_key] == int(parent_sample_idx)]
    else:
        unbinned_lc = None

    _lc = lc if plot_binned else None

    if not fn:
        fn = self.plots_dir / f"{parent_sample_idx}_{lum_key}.pdf"

    return self._plot_lc(lightcurve=_lc, unbinned_lc=unbinned_lc, interactive=interactive, fn=fn, ax=ax,
                         save=save, lum_key=lum_key, **kwargs)
|
|
956
|
-
|
|
957
|
-
# --------------------------------------------------------------------------------------
|
|
958
|
-
# START Chi2 plots
|
|
959
|
-
# -------------------------------------------
|
|
960
|
-
|
|
961
|
-
@cache
def get_red_chi2(self, chunk, lum_key, use_bigdata_dir=False):
    """
    Get the reduced chi2 for a given chunk

    The result is memoised by ``functools.cache``, so all arguments have to
    be hashable (a list of chunks is therefore not accepted).
    NOTE(review): caching on a method keeps ``self`` alive for the lifetime
    of the cache.

    :param chunk: the chunk number
    :type chunk: int
    :param lum_key: suffix that is appended verbatim to the metadata key names,
        e.g. ``"_flux_density"``
    :type lum_key: str
    :param use_bigdata_dir: load from the big data storage directory, default is False
    :type use_bigdata_dir: bool, optional
    :return: the reduced chi2 for each band, the DataFrame will have columns `chi2`, `med_lum` and `N_datapoints`
    :rtype: dict[str, pd.DataFrame]
    """

    logger.info(f"getting reduced chi2 for chunk {chunk}")
    data_product = self.load_data_product(service="tap", chunk_number=chunk, use_bigdata_dir=use_bigdata_dir)

    chi2_val = {b: dict() for b in self.bands}

    for b in self.bands:
        key1 = f"{b}_chi2_to_med{lum_key}"
        key2 = f"{b}_N_datapoints{lum_key}"
        key3 = f"{b}_median{lum_key}"
        logger.debug(f"{key1}, {key2}")

        for i, idata_product in tqdm.tqdm(
                data_product.items(),
                total=len(data_product),
                desc="collecting chi2 values"
        ):
            if "timewise_metadata" not in idata_product:
                continue
            imetadata = idata_product["timewise_metadata"]

            if (key1 in imetadata) and (key2 in imetadata):
                ndof = (imetadata[key2] - 1)
                chi2_val[b][i] = {
                    # reduced chi2 is undefined for a single datapoint
                    "chi2": imetadata[key1] / ndof if ndof > 0 else np.nan,
                    # the median is not covered by the membership test above,
                    # so fall back to NaN instead of raising a KeyError
                    "med_lum": imetadata.get(key3, np.nan),
                    "N_datapoints": imetadata[key2]
                }

    return {b: pd.DataFrame.from_dict(chi2_val[b], orient='index') for b in self.bands}
|
|
1005
|
-
|
|
1006
|
-
def make_chi2_plot(
        self,
        index_mask=None,
        chunks=None,
        load_from_bigdata_dir=False,
        lum_key="_flux_density",
        interactive=False,
        save=False,
        nbins=100,
        cumulative=True,
        upper_bound=4
):
    """
    Make a plot of the reduced chi2 distribution for a given chunk or multiple chunks

    One figure is produced for every number of datapoints that occurs in all
    bands (lightcurves with a single datapoint are skipped), with one panel
    per band. Each panel shows the histogram of the reduced chi2, an
    F-distribution fitted to the values below `upper_bound` and the expected
    chi2 distribution.

    :param index_mask: a mask to apply to the parent sample, eg {'AGNs': agn_mask}
    :type index_mask: dict
    :param chunks: the chunk number or list of chunk numbers, all chunks if `None`
    :type chunks: int or list
    :param load_from_bigdata_dir: load from the big data storage directory, default is False
    :type load_from_bigdata_dir: bool, optional
    :param lum_key: suffix of the metadata keys to use, passed on to `get_red_chi2`,
        default is "_flux_density"
    :type lum_key: str
    :param interactive: return the figure and axes if True, default is False
    :type interactive: bool
    :param save: save the plot, default is False
    :type save: bool
    :param nbins: the number of bins to use in the histogram, default is 100
    :type nbins: int
    :param cumulative: plot the cumulative distribution, default is True
    :type cumulative: bool
    :param upper_bound: the upper bound of the x-axis, default is 4
    :type upper_bound: float
    :return: a list of (matplotlib.Figure, matplotlib.Axes) tuples if `interactive=True`
    :rtype: list[tuple[mpl.Figure, mpl.Axes]]
    """

    if chunks is None:
        chunks = list(range(self.n_chunks))
    elif isinstance(chunks, (int, np.integer)):
        # a single chunk number is documented as valid input
        chunks = [chunks]

    chi2_data_list = [self.get_red_chi2(chunk, lum_key, load_from_bigdata_dir) for chunk in chunks]
    chi2_data = {b: pd.concat([d[b] for d in chi2_data_list]) for b in self.bands}

    # only consider numbers of datapoints that occur in every band
    N_datapoints = set.intersection(*[set(df["N_datapoints"].unique()) for df in chi2_data.values()])

    res = list()

    for n in N_datapoints:

        # the reduced chi2 is undefined for a single datapoint
        if n == 1:
            continue

        chi2_df_sel = {b: df[df["N_datapoints"] == n]["chi2"] for b, df in chi2_data.items()}

        logger.info(f"making chi2 histogram for lightcurves with {n} datapoints")

        fig, axs = plt.subplots(
            ncols=len(self.bands),
            figsize=(10, 5),
            sharey="all",
            sharex="all"
        )

        index_colors = (
            {k: f"C{(i+1)*2}"
             for i, k in enumerate(index_mask.keys())}
            if index_mask is not None else None
        )

        # regular bins up to `upper_bound` plus one overflow bin
        x = np.linspace(0, upper_bound, nbins)
        x = np.concatenate([x, [1e6]])

        # plt.subplots returns a bare Axes for a single column
        for ax, band in zip(np.atleast_1d(axs), self.bands):
            band_values = chi2_df_sel[band].values.flatten()
            h, b, _ = ax.hist(
                band_values,
                label="all",
                density=True,
                cumulative=cumulative,
                color="k",
                bins=x,
                lw=3,
                histtype="step",
                zorder=20,
            )
            bmids = (b[1:] + b[:-1]) / 2

            # if cumulative then also calculate the histogram of the PDF
            # this will be used later to calculate the goodness of fit to the F- and Chi2-distribution
            hpdf = h if not cumulative else \
                np.histogram(band_values, bins=x, density=True)[0]
            nonzero_m = hpdf > 0

            # we need the absolute histogram numbers to calculate the uncertainties of the density bins
            # The uncertainty of the density bin d_i is
            #
            #   u_di = u_ci * sum_{j not i}(c_j) / [(sum_{j}c_j)^2 * (b_{i+1} - b_{i})]
            #
            # where u_ci = sqrt(c_i) is the uncertainty of the counts bin c_i
            #
            h_abs = np.histogram(band_values, bins=x, density=False)[0][nonzero_m]
            h_abs_sum = np.sum(h_abs)
            h_sum_not_i = h_abs_sum - h_abs
            u_density = np.sqrt(h_abs) * h_sum_not_i / (h_abs_sum ** 2 * (np.diff(x))[nonzero_m])

            if index_mask is not None:
                for label, indices in index_mask.items():
                    _indices = chi2_df_sel[band].index.intersection(indices)
                    kwargs = (
                        dict()
                        if cumulative else
                        {"edgecolor": "k"}
                    )

                    sns.histplot(
                        chi2_df_sel[band].loc[_indices].values.flatten(),
                        label=label,
                        stat="density",
                        bins=b,
                        ax=ax,
                        color=index_colors[label],
                        element="step" if cumulative else "bars",
                        alpha=0.7,
                        fill=not cumulative,
                        zorder=10,
                        lw=3 if cumulative else 1,
                        cumulative=cumulative,
                        **kwargs
                    )

            # select non-NaN's and values below `upper_bound`
            x_dense = np.linspace(min(x), upper_bound, 1000)
            sel = chi2_df_sel[band][(~chi2_df_sel[band].isna()) & (chi2_df_sel[band] < upper_bound)]
            if len(sel) > 0:

                # fit an F-distribution
                fpars = f.fit(sel, n-1, 1e5, f0=n-1, floc=0)
                frozenf = f(*fpars)
                fpdf = frozenf.pdf

                # if cumulative then draw the CDF instead of the PDF
                ffunc = frozenf.cdf if cumulative else fpdf

                # To see how well the distribution fits the data we'll calculate the chi2
                # to the PDF (not to the CDF because the bins in CDF are correlated)
                ndof_fit = len(bmids[nonzero_m]) - 2
                F_chi2fit = sum((hpdf[nonzero_m] - fpdf(bmids[nonzero_m])) ** 2 / u_density**2) / ndof_fit
                logger.debug(f"{band}, {n} datapoints: chi2/ndof of F-distribution fit is {F_chi2fit}")

                # plot the fitted distribution
                ax.plot(x_dense, ffunc(x_dense), color='deepskyblue', ls="--", lw=3,
                        label=(
                            rf"F-distribution" + "\n" +
                            rf"$\nu_1$={fpars[0]:.0f}, $\nu_2$={fpars[1]:.2f}, scale={fpars[-1]:.2f}"
                        ),
                        zorder=30
                        )

            # we will also show the expected chi2 distribution
            pars_expected = (n - 1, 0, 1 / (n - 1))
            chi2_expected = chi2(*pars_expected)
            r = chi2_expected.cdf(x_dense) if cumulative else chi2_expected.pdf(x_dense)
            chi2_fitchi2 = sum((hpdf[nonzero_m] - chi2_expected.pdf(bmids[nonzero_m])) ** 2 / hpdf[nonzero_m])
            logger.debug(f"{band}, {n} datapoints: chi2 of expected chi2-distribution is {chi2_fitchi2}")
            ax.plot(x_dense, r, color="deepskyblue", ls=":", lw=3,
                    label=rf"$\chi^2$-distribution" + "\n" + rf"$\nu$: {n - 1:.0f}",
                    zorder=30)

            ax.legend()
            ax.set_xlabel(r"$\chi^2_{" + band + "} / N_{visits," + band + "}$")
            ax.set_xlim(0, upper_bound)

            for loc in ["top", "right"]:
                ax.spines[loc].set_visible(False)

        fig.suptitle(f"{n} datapoints")
        fig.tight_layout()

        if save:
            kind = "cdf" if cumulative else "pdf"
            chunk_str = "chunks_" + "_".join([str(c) for c in chunks]) \
                if len(chunks) != self.n_chunks \
                else "all_chunks"
            fn = self.plots_dir / f"chi2_plots" / lum_key / f"{n}_datapoints_{kind}_{chunk_str}.pdf"
            fn.parent.mkdir(parents=True, exist_ok=True)
            logger.debug(f"saving under {fn}")
            fig.savefig(fn)

        if interactive:
            res.append((fig, axs))
        else:
            # close exactly this figure, independent of pyplot's current figure
            plt.close(fig)

    if interactive:
        return res
|
|
1198
|
-
|
|
1199
|
-
# -------------------------------------------
|
|
1200
|
-
# END Chi2 plots
|
|
1201
|
-
# --------------------------------------------------------------------------------------
|
|
1202
|
-
|
|
1203
|
-
# --------------------------------------------------------------------------------------
|
|
1204
|
-
# START coverage plots
|
|
1205
|
-
# -------------------------------------------
|
|
1206
|
-
|
|
1207
|
-
@cache
def get_coverage(self, chunk, lum_key, load_from_bigdata_dir=False):
    """
    Collect the coverage of the MEASURED median for one chunk and lum_key

    For every band the metadata entry ``<band>_coverage_of_median<lum_key>``
    is read from each object of the chunk's data product that has one.

    :param chunk: chunk number
    :type chunk: int
    :param lum_key: luminosity key, appended verbatim to the metadata key name
    :type lum_key: str
    :param load_from_bigdata_dir: if True, load the data product from the bigdata directory
    :type load_from_bigdata_dir: bool, optional
    :return: per band a DataFrame with the column `coverage`, indexed by the data product keys
    :rtype: dict[str, pd.DataFrame]
    """
    logger.info(f"getting coverage for chunk {chunk}")
    data_product = self.load_data_product(service="tap", chunk_number=chunk, use_bigdata_dir=load_from_bigdata_dir)

    per_band = {b: dict() for b in self.bands}

    for b in self.bands:
        coverage_key = f"{b}_coverage_of_median{lum_key}"
        progress = tqdm.tqdm(
            data_product.items(),
            total=len(data_product),
            desc="collecting coverage values"
        )
        for obj_index, obj_product in progress:
            if "timewise_metadata" not in obj_product:
                continue
            obj_metadata = obj_product["timewise_metadata"]
            if coverage_key not in obj_metadata:
                continue
            per_band[b][obj_index] = {"coverage": obj_metadata[coverage_key]}

    return {b: pd.DataFrame.from_dict(per_band[b], orient='index') for b in self.bands}
|
|
1241
|
-
|
|
1242
|
-
@staticmethod
|
|
1243
|
-
def get_quantiles_label(df, cl=0.68):
|
|
1244
|
-
"""
|
|
1245
|
-
Get the quantiles label for a given coverage level
|
|
1246
|
-
"""
|
|
1247
|
-
med = np.nanmedian(df)
|
|
1248
|
-
ic = np.nanpercentile(df, [50 - cl / 2 * 100, 50 + cl / 2 * 100]) - med
|
|
1249
|
-
label = rf"$ {med:.2f} ^{{ +{ic[1]:.2f} }} _{{ {ic[0]:.2f} }}$"
|
|
1250
|
-
return label
|
|
1251
|
-
|
|
1252
|
-
def make_coverage_plots(
        self,
        index_mask=None,
        chunks=None,
        load_from_bigdata_dir=False,
        lum_key="_flux_density",
        interactive=False,
        save=False,
        nbins=100,
):
    """
    Make the coverage plots for the measured median of the specified luminosity unit

    One panel per band shows the density histogram of the coverage values of
    all objects and, if `index_mask` is given, of each masked subsample.

    :param index_mask: index mask to apply to the data, e.g. {"AGNs": agn_mask}
    :type index_mask: dict, optional
    :param chunks: chunks to use, if None use all chunks
    :type chunks: list[int], int, optional
    :param load_from_bigdata_dir: if True, load the coverage from the bigdata directory
    :type load_from_bigdata_dir: bool, optional
    :param lum_key: luminosity key, either of "_flux_density" or "_mag", default is "_flux_density"
    :type lum_key: str, optional
    :param interactive: if True, return the figure and axes, otherwise close the figure
    :type interactive: bool, optional
    :param save: if True, save the figure
    :type save: bool, optional
    :param nbins: number of bins for the histograms
    :type nbins: int, optional
    :return: the figure and axes if `interactive=True`, otherwise `None`
    :rtype: tuple[matplotlib.figure.Figure, matplotlib.axes.Axes]
    """

    if chunks is None:
        chunks = list(range(self.n_chunks))
    elif isinstance(chunks, (int, np.integer)):
        # a single chunk number is documented as valid input
        chunks = [chunks]

    coverages = [self.get_coverage(chunk, lum_key, load_from_bigdata_dir=load_from_bigdata_dir) for chunk in chunks]
    coverages_df = {b: pd.concat([c[b] for c in coverages]) for b in self.bands}

    fig, axs = plt.subplots(
        1, len(self.bands),
        figsize=(len(self.bands) * 4, 4),
        sharey="all",
        sharex="all"
    )
    # plt.subplots returns a bare Axes for a single column
    axes_list = np.atleast_1d(axs)

    for ax, band in zip(axes_list, self.bands):
        _coverages = coverages_df[band].values.flatten()
        label = "all\n" + self.get_quantiles_label(_coverages)

        sns.histplot(
            _coverages,
            label=label,
            stat="density",
            bins=nbins,
            ax=ax,
            element="step",
            fill=False,
            lw=3,
            color="k",
            zorder=20,
        )

        ax.set_xlabel("coverage " + band)
        ax.set_xlim(0, 1)

        # overlay the masked subsamples of this band
        if index_mask is not None:
            for i, (mask_name, indices) in enumerate(index_mask.items()):
                _indices = coverages_df[band].index.intersection(indices)
                _mask_coverages = coverages_df[band].loc[_indices].values.flatten()
                _label = mask_name + "\n" + self.get_quantiles_label(_mask_coverages)
                sns.histplot(
                    _mask_coverages,
                    label=_label,
                    stat="density",
                    bins=nbins,
                    ax=ax,
                    color=f"C{(i+1)*2}",
                    element="bars",
                    alpha=0.7,
                    fill=True,
                    zorder=10
                )

        ax.legend()
        for loc in ["top", "right"]:
            ax.spines[loc].set_visible(False)

        ax.grid("on", axis="y", ls=":", lw=0.5, color="k", alpha=0.5, zorder=0)

    axes_list[0].set_ylabel("density")

    # figure-level layout once, after all panels are fully drawn
    fig.suptitle(f"coverage of median")
    fig.tight_layout()

    if save:
        chunk_str = "chunks_" + "_".join([str(c) for c in chunks]) \
            if len(chunks) != self.n_chunks \
            else "all_chunks"
        fn = self.plots_dir / f"coverage_plots" / lum_key / f"{chunk_str}.pdf"
        fn.parent.mkdir(parents=True, exist_ok=True)
        logger.debug(f"saving under {fn}")
        fig.savefig(fn)

    if interactive:
        return fig, axs
    else:
        # close exactly this figure, independent of pyplot's current figure
        plt.close(fig)
|
|
1357
|
-
|
|
1358
|
-
# -------------------------------------------
|
|
1359
|
-
# END coverage plots
|
|
1360
|
-
# --------------------------------------------------------------------------------------
|
|
1361
|
-
|
|
1362
|
-
#####################################
|
|
1363
|
-
# END MAKE PLOTTING FUNCTIONS #
|
|
1364
|
-
###########################################################################################################
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
if __name__ == '__main__':
|
|
1368
|
-
parser = argparse.ArgumentParser()
|
|
1369
|
-
parser.add_argument('--job_id', type=int)
|
|
1370
|
-
parser.add_argument('--base_name', type=str)
|
|
1371
|
-
parser.add_argument('--min_sep_arcsec', type=float)
|
|
1372
|
-
parser.add_argument('--n_chunks', type=int)
|
|
1373
|
-
parser.add_argument('--mask_by_position', type=str, default=False)
|
|
1374
|
-
parser.add_argument('--logging_level', type=str, default='INFO')
|
|
1375
|
-
cfg = parser.parse_args()
|
|
1376
|
-
|
|
1377
|
-
try:
|
|
1378
|
-
logging_level = int(cfg.logging_level)
|
|
1379
|
-
except ValueError:
|
|
1380
|
-
logging_level = cfg.logging_level.upper()
|
|
1381
|
-
|
|
1382
|
-
logging.getLogger("timewise").setLevel(logging_level)
|
|
1383
|
-
logger = logging.getLogger("timewise.main")
|
|
1384
|
-
logger.info(json.dumps(vars(cfg), indent=4))
|
|
1385
|
-
|
|
1386
|
-
wd = WISEDataDESYCluster(base_name=cfg.base_name,
|
|
1387
|
-
min_sep_arcsec=cfg.min_sep_arcsec,
|
|
1388
|
-
n_chunks=cfg.n_chunks,
|
|
1389
|
-
parent_sample_class=None)
|
|
1390
|
-
wd._load_cluster_info()
|
|
1391
|
-
chunk_number = wd._get_chunk_number_for_job(cfg.job_id)
|
|
1392
|
-
|
|
1393
|
-
match cfg.mask_by_position:
|
|
1394
|
-
case "True":
|
|
1395
|
-
mask_by_position = True
|
|
1396
|
-
case "False":
|
|
1397
|
-
mask_by_position = False
|
|
1398
|
-
case other:
|
|
1399
|
-
raise ValueError(f"mask_by_position has to be either of 'True' or 'False', not {cfg.mask_by_position}")
|
|
1400
|
-
|
|
1401
|
-
wd._subprocess_select_and_bin(
|
|
1402
|
-
service='tap',
|
|
1403
|
-
chunk_number=chunk_number,
|
|
1404
|
-
jobID=cfg.job_id,
|
|
1405
|
-
mask_by_position=mask_by_position
|
|
1406
|
-
)
|
|
1407
|
-
wd.calculate_metadata(service='tap', chunk_number=chunk_number, jobID=cfg.job_id)
|