timewise 0.5.4__py3-none-any.whl → 1.0.0a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. timewise/__init__.py +1 -5
  2. timewise/backend/__init__.py +6 -0
  3. timewise/backend/base.py +36 -0
  4. timewise/backend/filesystem.py +80 -0
  5. timewise/chunking.py +50 -0
  6. timewise/cli.py +117 -11
  7. timewise/config.py +34 -0
  8. timewise/io/__init__.py +1 -0
  9. timewise/io/config.py +64 -0
  10. timewise/io/download.py +302 -0
  11. timewise/io/stable_tap.py +121 -0
  12. timewise/plot/__init__.py +3 -0
  13. timewise/plot/diagnostic.py +242 -0
  14. timewise/plot/lightcurve.py +112 -0
  15. timewise/plot/panstarrs.py +260 -0
  16. timewise/plot/sdss.py +109 -0
  17. timewise/process/__init__.py +2 -0
  18. timewise/process/config.py +34 -0
  19. timewise/process/interface.py +143 -0
  20. timewise/process/keys.py +10 -0
  21. timewise/process/stacking.py +322 -0
  22. timewise/process/template.yml +49 -0
  23. timewise/query/__init__.py +6 -0
  24. timewise/query/base.py +45 -0
  25. timewise/query/positional.py +40 -0
  26. timewise/tables/__init__.py +10 -0
  27. timewise/tables/allwise_p3as_mep.py +22 -0
  28. timewise/tables/base.py +9 -0
  29. timewise/tables/neowiser_p1bs_psd.py +22 -0
  30. timewise/types.py +30 -0
  31. timewise/util/backoff.py +12 -0
  32. timewise/util/csv_utils.py +12 -0
  33. timewise/util/error_threading.py +70 -0
  34. timewise/util/visits.py +33 -0
  35. timewise-1.0.0a2.dist-info/METADATA +205 -0
  36. timewise-1.0.0a2.dist-info/RECORD +39 -0
  37. timewise-1.0.0a2.dist-info/entry_points.txt +3 -0
  38. timewise/big_parent_sample.py +0 -106
  39. timewise/config_loader.py +0 -157
  40. timewise/general.py +0 -52
  41. timewise/parent_sample_base.py +0 -89
  42. timewise/point_source_utils.py +0 -68
  43. timewise/utils.py +0 -558
  44. timewise/wise_bigdata_desy_cluster.py +0 -1407
  45. timewise/wise_data_base.py +0 -2027
  46. timewise/wise_data_by_visit.py +0 -672
  47. timewise/wise_flux_conversion_correction.dat +0 -19
  48. timewise-0.5.4.dist-info/METADATA +0 -56
  49. timewise-0.5.4.dist-info/RECORD +0 -17
  50. timewise-0.5.4.dist-info/entry_points.txt +0 -3
  51. {timewise-0.5.4.dist-info → timewise-1.0.0a2.dist-info}/WHEEL +0 -0
  52. {timewise-0.5.4.dist-info → timewise-1.0.0a2.dist-info}/licenses/LICENSE +0 -0
@@ -1,1407 +0,0 @@
1
- import getpass
2
- import os
3
- import json
4
- import subprocess
5
- import math
6
- import pickle
7
- import queue
8
- import threading
9
- import argparse
10
- import time
11
- import seaborn as sns
12
- import backoff
13
- import shutil
14
- import gc
15
- import tqdm
16
- import sys
17
- from pathlib import Path
18
-
19
- from functools import cache
20
- from scipy.stats import chi2, f
21
- import matplotlib.pyplot as plt
22
- import numpy as np
23
- import pandas as pd
24
- import pyvo as vo
25
- import traceback as tb
26
- import gzip
27
- import logging
28
-
29
- from typing import List
30
-
31
- from timewise.general import get_directories, backoff_hndlr
32
- from timewise.wise_data_by_visit import WiseDataByVisit
33
- from timewise.utils import StableAsyncTAPJob, ErrorQueue, ExceptionSafeThread
34
-
35
-
36
- logger = logging.getLogger(__name__)
37
-
38
-
39
- class WISEDataDESYCluster(WiseDataByVisit):
40
- """
41
- A class to download WISE data with multiple threads and do the binning on the DESY cluster.
42
- In addition to the attributes of `WiseDataByVisit` this class has the following attributes:
43
-
44
- :param executable_filename: the filename of the executable that will be submitted to the cluster
45
- :type executable_filename: Path
46
- :param submit_file_filename: the filename of the submit file that will be submitted to the cluster
47
- :type submit_file_filename: Path
48
- :param job_id: the job id of the submitted job
49
- :type job_id: str
50
- :param cluster_jobID_map: a dictionary mapping the chunk number to the cluster job id
51
- :type cluster_jobID_map: dict
52
- :param clusterJob_chunk_map: a dictionary mapping the cluster job id to the chunk number
53
- :type clusterJob_chunk_map: dict
54
- :param cluster_info_file: the filename of the file that stores the cluster info, loaded by the cluster jobs
55
- :type cluster_info_file: Path
56
- :param start_time: the time when the download started
57
- :type start_time: float
58
- """
59
- status_cmd = f'qstat -u {getpass.getuser()}'
60
- # finding the file that contains the setup function
61
- if (env_file := os.getenv('TIMEWISE_DESY_CLUSTER_BASHFILE')) is not None:
62
- BASHFILE = Path(env_file)
63
- else:
64
- BASHFILE = Path("~/.bashrc").expanduser()
65
-
66
- def __init__(
67
- self,
68
- base_name,
69
- parent_sample_class,
70
- min_sep_arcsec,
71
- n_chunks,
72
- clean_outliers_when_binning=True,
73
- multiply_flux_error=True
74
- ):
75
- """
76
- Constructor of the class.
77
-
78
- :param base_name: the base name of the data directory
79
- :type base_name: str
80
- :param parent_sample_class: the parent sample class
81
- :type parent_sample_class: ParentSampleBase
82
- :param min_sep_arcsec: query region around source for positional query
83
- :type min_sep_arcsec: float
84
- :param n_chunks: number of chunks to split the sample into
85
- :type n_chunks: int
86
- :param clean_outliers_when_binning: if True, clean outliers when binning
87
- :type clean_outliers_when_binning: bool
88
- """
89
-
90
- super().__init__(base_name=base_name, parent_sample_class=parent_sample_class, min_sep_arcsec=min_sep_arcsec,
91
- n_chunks=n_chunks, clean_outliers_when_binning=clean_outliers_when_binning,
92
- multiply_flux_error=multiply_flux_error)
93
-
94
- # set up cluster stuff
95
- self._status_output = None
96
- directories = get_directories()
97
- self.executable_filename = self.cluster_dir / "run_timewise.sh"
98
- self.submit_file_filename = self.cluster_dir / "submit_file.submit"
99
- self.job_id = None
100
-
101
- self.cluster_jobID_map = None
102
- self.clusterJob_chunk_map = None
103
- self.cluster_info_file = self.cluster_dir / 'cluster_info.pkl'
104
- self._overwrite = True
105
-
106
- # these attributes will be set later and are used to pass them to the threads
107
- self._n_cluster_jobs_per_chunk = None
108
- self._storage_dir = None
109
-
110
- # status attributes
111
- self.start_time = None
112
- self._total_tasks = None
113
- self._done_tasks = None
114
-
115
- self._tap_queue = None
116
- self._cluster_queue = None
117
- self._io_queue = None
118
- self._io_queue_done = None
119
- self._combining_queue = None
120
-
121
- # ---------------------------------------------------------------------------------- #
122
- # START using gzip to compress the data when saving #
123
- # ----------------------------------------------------- #
124
-
125
- def _data_product_filename(self, service, chunk_number=None, jobID=None, use_bigdata_dir=False):
126
- fn = super(WISEDataDESYCluster, self)._data_product_filename(service, chunk_number=chunk_number, jobID=jobID)
127
-
128
- if use_bigdata_dir:
129
- d = get_directories()
130
- fn = str(fn).replace(str(d["data_dir"]), str(d["bigdata_dir"]))
131
-
132
- return Path(str(fn) + ".gz")
133
-
134
- def load_data_product(
135
- self,
136
- service,
137
- chunk_number=None,
138
- jobID=None,
139
- return_filename=False,
140
- use_bigdata_dir=False,
141
- verify_contains_lightcurves=False
142
- ):
143
- fn = self._data_product_filename(
144
- service,
145
- chunk_number,
146
- jobID,
147
- use_bigdata_dir=use_bigdata_dir
148
- )
149
-
150
- logger.debug(f"loading {fn}")
151
- try:
152
- with gzip.open(fn, 'rt', encoding="utf-8") as fzip:
153
- data_product = json.load(fzip)
154
-
155
- if verify_contains_lightcurves:
156
- try:
157
- self._verify_contains_lightcurves(data_product)
158
- except KeyError as e:
159
- raise KeyError(f"{fn}: {e}")
160
-
161
- if return_filename:
162
- return data_product, fn
163
- return data_product
164
- except FileNotFoundError:
165
- logger.warning(f"No file {fn}")
166
-
167
- def _save_data_product(
168
- self,
169
- data_product,
170
- service,
171
- chunk_number=None,
172
- jobID=None,
173
- overwrite=False,
174
- use_bigdata_dir=False
175
- ):
176
- fn = self._data_product_filename(
177
- service,
178
- chunk_number,
179
- jobID,
180
- use_bigdata_dir=use_bigdata_dir
181
- )
182
- logger.debug(f"saving {len(data_product)} new objects to {fn}")
183
-
184
- if fn == self._data_product_filename(service):
185
- self._cached_final_products['lightcurves'][service] = data_product
186
-
187
- if not overwrite:
188
- try:
189
- old_data_product = self.load_data_product(service=service, chunk_number=chunk_number, jobID=jobID)
190
-
191
- if old_data_product is not None:
192
- logger.debug(f"Found {len(old_data_product)}. Combining")
193
- data_product = data_product.update(old_data_product)
194
-
195
- except FileNotFoundError as e:
196
- logger.info(f"FileNotFoundError: {e}. Making new binned lightcurves.")
197
-
198
- with gzip.open(fn, 'wt', encoding="utf-8") as fzip:
199
- json.dump(data_product, fzip)
200
-
201
- # ----------------------------------------------------- #
202
- # END using gzip to compress the data when saving #
203
- # ---------------------------------------------------------------------------------- #
204
-
205
- def get_sample_photometric_data(self, max_nTAPjobs=8, perc=1, tables=None, chunks=None,
206
- cluster_jobs_per_chunk=100, wait=5, remove_chunks=False,
207
- query_type='positional', overwrite=True,
208
- storage_directory=None,
209
- node_memory='8G',
210
- skip_download=False,
211
- skip_input=False,
212
- mask_by_position=False):
213
- """
214
- An alternative to `get_photometric_data()` that uses the DESY cluster and is optimised for large datasets.
215
-
216
- :param max_nTAPjobs: The maximum number of TAP jobs active at the same time.
217
- :type max_nTAPjobs: int
218
- :param perc: The percentage of chunks to download
219
- :type perc: float
220
- :param tables: The tables to query
221
- :type tables: str or list-like
222
- :param chunks: chunks to download, default is all of the chunks
223
- :type chunks: list-like
224
- :param cluster_jobs_per_chunk: number of cluster jobs per chunk
225
- :type cluster_jobs_per_chunk: int
226
- :param wait: time in hours to wait after submitting TAP jobs
227
- :type wait: float
228
- :param remove_chunks: remove single chunk files after binning
229
- :type remove_chunks: bool
230
- :param query_type: 'positional': query photometry based on distance from object, 'by_allwise_id': select all photometry points within a radius of 50 arcsec with the corresponding AllWISE ID
231
- :type query_type: str
232
- :param overwrite: overwrite already existing lightcurves and metadata
233
- :type overwrite: bool
234
- :param storage_directory: move binned files and raw data here after work is done, defaults to TIMEWISE_BIGDATA_DIR
235
- :type storage_directory: str | Path
236
- :param node_memory: memory per node on the cluster, default is 8G
237
- :type node_memory: str
238
- :param skip_download: if True, assume data is already downloaded, only do binning in that case
239
- :type skip_download: bool
240
- :param skip_input: if True do not ask if data is correct before download
241
- :type skip_input: bool
242
- :param mask_by_position: if `True` mask single exposures that are too far away from the bulk
243
- :type mask_by_position: bool
244
- """
245
-
246
- # --------------------- set defaults --------------------------- #
247
-
248
- mag = True
249
- flux = True
250
-
251
- if tables is None:
252
- tables = [
253
- 'AllWISE Multiepoch Photometry Table',
254
- 'NEOWISE-R Single Exposure (L1b) Source Table'
255
- ]
256
- tables = np.atleast_1d(tables)
257
-
258
- if chunks is None:
259
- chunks = list(range(round(int(self.n_chunks * perc))))
260
- else:
261
- cm = [c not in self.chunk_map for c in chunks]
262
- if np.any(cm):
263
- raise ValueError(f"Chunks {np.array(chunks)[cm]} are not in chunk map. "
264
- f"Probably they are larger than the set chunk number of {self._n_chunks}")
265
-
266
- if remove_chunks:
267
- raise NotImplementedError("Removing chunks is not implemented yet!")
268
-
269
- if query_type not in self.query_types:
270
- raise ValueError(f"Unknown query type {query_type}! Choose one of {self.query_types}")
271
-
272
- service = 'tap'
273
-
274
- # set up dictionary to store jobs in
275
- self.tap_jobs = {t: dict() for t in tables}
276
-
277
- logger.debug(f"Getting {perc * 100:.2f}% of lightcurve chunks ({len(chunks)}) via {service} "
278
- f"in {'magnitude' if mag else ''} {'flux' if flux else ''} "
279
- f"from {tables}\nskipping download: {skip_download}")
280
-
281
- if not skip_input:
282
- input('Correct? [hit enter] ')
283
-
284
- # --------------------------- set up cluster info --------------------------- #
285
-
286
- self.n_cluster_jobs_per_chunk = cluster_jobs_per_chunk
287
- self.clear_cluster_log_dir()
288
- self._save_cluster_info()
289
- self._overwrite = overwrite
290
- self._storage_dir = get_directories()['bigdata_dir'] if storage_directory is None else Path(storage_directory)
291
-
292
- # --------------------------- set up queues --------------------------- #
293
-
294
- self.queue = queue.Queue()
295
- self._tap_queue = ErrorQueue()
296
- self._cluster_queue = ErrorQueue()
297
- self._io_queue = queue.PriorityQueue()
298
- self._io_queue_done = queue.Queue()
299
- self._combining_queue = ErrorQueue()
300
-
301
- # --------------------------- starting threads --------------------------- #
302
-
303
- tap_threads = [
304
- ExceptionSafeThread(error_queue=self._tap_queue, target=self._tap_thread, daemon=True, name=f"TAPThread{_}")
305
- for _ in range(max_nTAPjobs)
306
- ]
307
- cluster_threads = [
308
- ExceptionSafeThread(error_queue=self._cluster_queue, target=self._cluster_thread, daemon=True, name=f"ClusterThread{_}")
309
- for _ in range(max_nTAPjobs)
310
- ]
311
- io_thread = threading.Thread(target=self._io_thread, daemon=True, name="IOThread")
312
- combining_thread = ExceptionSafeThread(error_queue=self._combining_queue, target=self._combining_thread, daemon=True, name="CombiningThread")
313
- status_thread = threading.Thread(target=self._status_thread, daemon=True, name='StatusThread')
314
-
315
- for t in tap_threads + cluster_threads + [io_thread, combining_thread]:
316
- logger.debug('starting thread')
317
- t.start()
318
-
319
- logger.debug(f'started {len(tap_threads)} TAP threads and {len(cluster_threads)} cluster threads.')
320
-
321
- # --------------------------- filling queue with tasks --------------------------- #
322
-
323
- self.start_time = time.time()
324
- self._total_tasks = len(chunks)
325
- self._done_tasks = 0
326
-
327
- for c in chunks:
328
- if not skip_download:
329
- self._tap_queue.put((tables, c, wait, mag, flux, node_memory, query_type, mask_by_position))
330
- else:
331
- self._cluster_queue.put((node_memory, c, mask_by_position))
332
-
333
- status_thread.start()
334
-
335
- # --------------------------- wait for completion --------------------------- #
336
-
337
- logger.debug(f'added {self._tap_queue.qsize()} tasks to tap queue')
338
- self._tap_queue.join()
339
- logger.debug('TAP done')
340
- self._cluster_queue.join()
341
- logger.debug('cluster done')
342
- self._combining_queue.join()
343
- logger.debug('combining done')
344
-
345
- # unset queues
346
- self.queue = None
347
- self._tap_queue = None
348
- self._cluster_queue = None
349
- self._io_queue = None
350
- self._io_queue_done = None
351
- self._combining_queue = None
352
-
353
- @backoff.on_exception(
354
- backoff.expo,
355
- vo.dal.exceptions.DALServiceError,
356
- giveup=WiseDataByVisit._give_up_tap,
357
- max_tries=50,
358
- on_backoff=backoff_hndlr
359
- )
360
- def _wait_for_job(self, t, i):
361
- logger.info(f"Waiting on {i}th query of {t} ........")
362
- _job = StableAsyncTAPJob(url=self.tap_jobs[t][i])
363
- _job.wait()
364
- logger.info(f'{i}th query of {t}: Done!')
365
-
366
- def _get_results_from_job(self, t, i):
367
- logger.debug(f"getting results for {i}th query of {t} .........")
368
- _job = StableAsyncTAPJob(url=self.tap_jobs[t][i])
369
- lightcurve = _job.fetch_result().to_table().to_pandas()
370
- fn = self._chunk_photometry_cache_filename(t, i)
371
- table_nice_name = self.get_db_name(t, nice=True)
372
- logger.debug(f"{i}th query of {table_nice_name}: saving under {fn}")
373
- cols = dict(self.photometry_table_keymap[table_nice_name]['mag'])
374
- cols.update(self.photometry_table_keymap[table_nice_name]['flux'])
375
-
376
- if 'allwise' in t:
377
- cols['cntr_mf'] = 'allwise_cntr'
378
-
379
- lightcurve.rename(columns=cols).to_csv(fn)
380
- return
381
-
382
- def _io_queue_hash(self, method_name, args):
383
- return f"{method_name}_{args}"
384
-
385
- def _wait_for_io_task(self, method_name, args):
386
- h = self._io_queue_hash(method_name, args)
387
- logger.debug(f"waiting on io-task {h}")
388
-
389
- while True:
390
- _io_queue_done = list(self._io_queue_done.queue)
391
- if h in _io_queue_done:
392
- break
393
-
394
- time.sleep(30)
395
-
396
- logger.debug(f"{h} done!")
397
-
398
- def _io_thread(self):
399
- logger.debug("started in-out thread")
400
- while True:
401
- priority, method_name, args = self._io_queue.get(block=True)
402
- logger.debug(f"executing {method_name} with arguments {args} (priority {priority})")
403
-
404
- try:
405
- self.__getattribute__(method_name)(*args)
406
- self._io_queue_done.put(self._io_queue_hash(method_name, args))
407
- except Exception as e:
408
- msg = (
409
- f"#################################################################\n"
410
- f" !!! ATTENTION !!! \n"
411
- f" ----------------- {method_name}({args}) ---------------- \n"
412
- f" AN ERROR OCCURED \n"
413
- f"\n{''.join(tb.format_exception(None, e, e.__traceback__))}\n\n"
414
- f"putting {method_name}({args}) back into IO-queue\n"
415
- f"#################################################################\n"
416
- )
417
- logger.error(msg)
418
- self._io_queue.put((priority, method_name, args))
419
- finally:
420
- self._io_queue.task_done()
421
- gc.collect()
422
-
423
- def _tap_thread(self):
424
- logger.debug(f'started tap thread')
425
- while True:
426
- tables, chunk, wait, mag, flux, node_memory, query_type, mask_by_position = self._tap_queue.get(block=True)
427
- logger.debug(f'querying IRSA for chunk {chunk}')
428
-
429
- submit_to_cluster = True
430
-
431
- for i in range(len(tables) + 1):
432
-
433
- # ----------- submit jobs for chunk i via the IRSA TAP ---------- #
434
- if i < len(tables):
435
- t = tables[i]
436
- submit_method = "_submit_job_to_TAP"
437
- submit_args = [chunk, t, mag, flux, query_type]
438
- self._io_queue.put((1, submit_method, submit_args))
439
- self._wait_for_io_task(submit_method, submit_args)
440
-
441
- # -------------- get results of TAP job for chunk i-1 ------------- #
442
- if i > 0:
443
- t_before = tables[i - 1]
444
- phase = StableAsyncTAPJob(url=self.tap_jobs[t_before][chunk]).phase
445
- if phase == "COMPLETED":
446
- result_method = "_get_results_from_job"
447
- result_args = [t_before, chunk]
448
- self._io_queue.put((2, result_method, result_args))
449
- self._wait_for_io_task(result_method, result_args)
450
-
451
- else:
452
- logger.warning(
453
- f"No completion for {chunk}th query of {t_before}! "
454
- f"Phase is {phase}!"
455
- )
456
- submit_to_cluster = False
457
-
458
- # --------------- wait for the TAP job of chunk i -------------- #
459
- if i < len(tables):
460
- t = tables[i]
461
- logger.info(f'waiting for {wait} hours')
462
- time.sleep(wait * 3600)
463
-
464
- try:
465
- self._wait_for_job(t, chunk)
466
- except vo.dal.exceptions.DALServiceError:
467
- logger.warning(f"could not wait for {chunk}th query of {t}! Not submitting to cluster.")
468
- # mark task as done and move on without submission to cluster
469
- submit_to_cluster = False
470
- continue
471
-
472
- self._tap_queue.task_done()
473
- if submit_to_cluster:
474
- self._cluster_queue.put((node_memory, chunk, mask_by_position))
475
-
476
- gc.collect()
477
-
478
- def _move_file_to_storage(self, filename):
479
- data_dir = str(get_directories()['data_dir'])
480
- dst_fn = Path(str(filename).replace(str(data_dir), str(self._storage_dir)))
481
- dst_fn.parent.mkdir(parents=True, exist_ok=True)
482
-
483
- logger.debug(f"copy {filename} to {dst_fn}")
484
-
485
- try:
486
- shutil.copy2(filename, dst_fn)
487
-
488
- if Path(filename).stat().st_size == dst_fn.stat().st_size:
489
- logger.debug(f"copy successful, removing {filename}")
490
- os.remove(filename)
491
- else:
492
- logger.warning(f"copy from {filename} to {dst_fn} gone wrong! Not removing source.")
493
-
494
- except FileNotFoundError as e:
495
- logger.warning(f"FileNotFoundError: {e}!")
496
-
497
- def _cluster_thread(self):
498
- logger.debug(f'started cluster thread')
499
- while True:
500
- node_memory, chunk, mask_by_position = self._cluster_queue.get(block=True)
501
-
502
- logger.info(f'got all TAP results for chunk {chunk}. submitting to cluster')
503
- job_id = self.submit_to_cluster(
504
- node_memory=node_memory,
505
- single_chunk=chunk,
506
- mask_by_position=mask_by_position
507
- )
508
-
509
- if not job_id:
510
- logger.warning(f"could not submit {chunk} to cluster! Try later")
511
- self._cluster_queue.put((node_memory, chunk, mask_by_position))
512
- self._cluster_queue.task_done()
513
-
514
- else:
515
- logger.debug(f'waiting for chunk {chunk} (Cluster job {job_id})')
516
- self.wait_for_job(job_id)
517
- logger.debug(f'cluster done for chunk {chunk} (Cluster job {job_id}).')
518
-
519
- log_files = Path("./").glob(f"{job_id}_*")
520
- log_files_abs = [p.absolute() for p in log_files]
521
- logger.debug(f"moving {len(log_files_abs)} log files to {self.cluster_log_dir}")
522
- for f in log_files_abs:
523
- shutil.move(f, self.cluster_log_dir)
524
-
525
- gc.collect()
526
-
527
- logger.debug(f"cluster thread done for chunk {chunk} (Cluster job {job_id}). "
528
- f"Submitting to combining queue")
529
- self._combining_queue.put(chunk)
530
- self._cluster_queue.task_done()
531
-
532
- def _combining_thread(self):
533
- logger.debug(f'started combining thread')
534
- while True:
535
-
536
- try:
537
- chunk = self._combining_queue.get(block=True)
538
- except AttributeError: # when self._combining_queue is None, meaning it was reset in main thread
539
- break
540
- logger.debug(f"combining chunk {chunk}")
541
-
542
- try:
543
- success = self._combine_data_products('tap', chunk_number=chunk, remove=True, overwrite=self._overwrite)
544
-
545
- if success:
546
- if self._storage_dir:
547
- filenames_to_move = [
548
- self._data_product_filename(service='tap', chunk_number=chunk),
549
- ]
550
-
551
- for t in self.photometry_table_keymap.keys():
552
- filenames_to_move.append(self._chunk_photometry_cache_filename(t, chunk))
553
-
554
- for fn in filenames_to_move:
555
- try:
556
- self._move_file_to_storage(fn)
557
- except shutil.SameFileError as e:
558
- logger.error(f"{e}. Not moving.")
559
-
560
- else:
561
- msg = f"Chunk {chunk}: Combining data products not successfully!"
562
- if self._storage_dir:
563
- msg += " Not moving files to storage."
564
- logger.warning(msg)
565
-
566
- finally:
567
- self._combining_queue.task_done()
568
- self._done_tasks += 1
569
- gc.collect()
570
-
571
- def _status_thread(self):
572
- logger.debug('started status thread')
573
- while True:
574
- n_tap_tasks_queued = self._tap_queue.qsize()
575
- n_cluster_tasks_queued = self._cluster_queue.qsize()
576
- n_remaining = self._total_tasks - self._done_tasks
577
- elapsed_time = time.time() - self.start_time
578
- time_per_task = elapsed_time / self._done_tasks if self._done_tasks > 0 else np.nan
579
- remaining_time = n_remaining * time_per_task
580
-
581
- msg = f"\n----------------- STATUS -----------------\n" \
582
- f"\ttasks in TAP queue:_______{n_tap_tasks_queued}\n" \
583
- f"\ttasks in cluster queue:___{n_cluster_tasks_queued}\n" \
584
- f"\tperformed io tasks:_______{len(list(self._io_queue_done.queue))}\n" \
585
- f"\tdone total:_______________{self._done_tasks}/{self._total_tasks}\n" \
586
- f"\truntime:__________________{elapsed_time/3600:.2f} hours\n" \
587
- f"\tremaining:________________{remaining_time/3600:.2f} hours"
588
-
589
- logger.info(msg)
590
- time.sleep(5*3600)
591
-
592
- # ----------------------------------------------------------------------------------- #
593
- # START using cluster for downloading and binning #
594
- # ---------------------------------------------------- #
595
-
596
- @staticmethod
597
- @backoff.on_exception(
598
- backoff.expo,
599
- OSError,
600
- max_time=2*3600,
601
- on_backoff=backoff_hndlr,
602
- jitter=backoff.full_jitter,
603
- )
604
- def _execute_bash_command(cmd):
605
- with subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) as process:
606
- msg = process.stdout.read().decode()
607
- process.terminate()
608
- return msg
609
-
610
- @staticmethod
611
- def get_condor_status():
612
- """
613
- Queries condor to get cluster status.
614
- :return: str, output of query command
615
- """
616
- cmd = "condor_q"
617
- return WISEDataDESYCluster._execute_bash_command(cmd)
618
-
619
- def collect_condor_status(self):
620
- """Gets the condor status and saves it to private attribute"""
621
- self._status_output = self.get_condor_status()
622
-
623
- def condor_status(self, job_id):
624
- """
625
- Get the status of jobs running on condor.
626
- :return: number of jobs that are done, running, waiting, total, held
627
- """
628
- status_list = [
629
- [y for y in ii.split(" ") if y]
630
- for ii in self._status_output.split("\n")[4:-6]
631
- ]
632
- done = running = waiting = total = held = None
633
-
634
- for li in status_list:
635
- if li[2] == job_id:
636
- done, running, waiting = li[5:8]
637
- held = 0 if len(li) == 10 else li[8]
638
- total = li[-2]
639
-
640
- return done, running, waiting, total, held
641
-
642
- def wait_for_job(self, job_id=None):
643
- """
644
- Wait until the cluster job is done
645
- """
646
-
647
- _job_id = job_id or self.job_id
648
-
649
- if _job_id:
650
- logger.info("waiting for job with ID " + str(_job_id))
651
- time.sleep(5)
652
-
653
- self.collect_condor_status()
654
- j = 0
655
- while not np.all(np.array(self.condor_status(_job_id)) == None):
656
- d, r, w, t, h = self.condor_status(_job_id)
657
- logger.info(
658
- f"{time.asctime(time.localtime())} - Job{_job_id}: "
659
- f"{d} done, {r} running, {w} waiting, {h} held of total {t}"
660
- )
661
- j += 1
662
- if j > 7:
663
- logger.info(self._status_output)
664
- j = 0
665
- time.sleep(90)
666
- self.collect_condor_status()
667
-
668
- logger.info("Done waiting for job with ID " + str(_job_id))
669
-
670
- else:
671
- logger.info(f"No Job ID!")
672
-
673
- @property
674
- def n_cluster_jobs_per_chunk(self):
675
- return self._n_cluster_jobs_per_chunk
676
-
677
- @n_cluster_jobs_per_chunk.setter
678
- def n_cluster_jobs_per_chunk(self, value):
679
- self._n_cluster_jobs_per_chunk = value
680
-
681
- if value:
682
- n_jobs = self.n_chunks * int(value)
683
- logger.debug(f'setting {n_jobs} jobs.')
684
- self.cluster_jobID_map = np.zeros(len(self.parent_sample.df), dtype=int)
685
- self.clusterJob_chunk_map = pd.DataFrame(columns=['chunk_number'])
686
-
687
- for chunk_number in range(self.n_chunks):
688
- indices = np.where(self.chunk_map == chunk_number)[0]
689
- N_inds_per_job = int(math.ceil(len(indices) / self._n_cluster_jobs_per_chunk))
690
- for j in range(self._n_cluster_jobs_per_chunk):
691
- job_nr = chunk_number*self._n_cluster_jobs_per_chunk + j + 1
692
- self.clusterJob_chunk_map.loc[job_nr] = [chunk_number]
693
- start_ind = j * N_inds_per_job
694
- end_ind = start_ind + N_inds_per_job
695
- self.cluster_jobID_map[indices[start_ind:end_ind]] = job_nr
696
-
697
- else:
698
- logger.warning(f'Invalid value for n_cluster_jobs_per_chunk: {value}')
699
-
700
- def _get_chunk_number_for_job(self, jobID):
701
- chunk_number = self.clusterJob_chunk_map.loc[jobID, 'chunk_number']
702
- return chunk_number
703
-
704
- def _save_cluster_info(self):
705
- logger.debug(f"writing cluster info to {self.cluster_info_file}")
706
- with open(self.cluster_info_file, "wb") as f:
707
- pickle.dump((self.cluster_jobID_map, self.clusterJob_chunk_map, self.clean_outliers_when_binning), f)
708
-
709
- def _load_cluster_info(self):
710
- logger.debug(f"loading cluster info from {self.cluster_info_file}")
711
- with open(self.cluster_info_file, "rb") as f:
712
- self.cluster_jobID_map, self.clusterJob_chunk_map, self.clean_outliers_when_binning = pickle.load(f)
713
-
714
- def clear_cluster_log_dir(self):
715
- """
716
- Clears the directory where cluster logs are stored
717
- """
718
- fns = self.cluster_log_dir.glob("*")
719
- for fn in fns:
720
- (self.cluster_log_dir / fn).unlink()
721
-
722
- def make_executable_file(self):
723
- """
724
- Produces the executable that will be submitted to the NPX cluster.
725
- """
726
- logging_level = logger.getEffectiveLevel()
727
- script_fn = os.path.realpath(__file__)
728
-
729
- txt = (
730
- f'{sys.executable} {script_fn} '
731
- f'--logging_level {logging_level} '
732
- f'--base_name {self.base_name} '
733
- f'--min_sep_arcsec {self.min_sep.to("arcsec").value} '
734
- f'--n_chunks {self._n_chunks} '
735
- f'--job_id $1 '
736
- f'--mask_by_position $2'
737
- )
738
-
739
- logger.debug("writing executable to " + str(self.executable_filename))
740
- with self.executable_filename.open("w") as f:
741
- f.write(txt)
742
-
743
- def get_submit_file_filename(self, ids):
744
- """
745
- Get the filename of the submit file for given job ids
746
-
747
- :param ids: list of job ids
748
- :type ids: list
749
- :return: filename
750
- :rtype: str
751
- """
752
- ids = np.atleast_1d(ids)
753
- ids_string = f"{min(ids)}-{max(ids)}"
754
- return self.cluster_dir / f"ids{ids_string}.submit"
755
-
756
- def make_submit_file(
757
- self,
758
- job_ids: (int, List[int]),
759
- node_memory: str = '8G',
760
- mask_by_position: bool = False
761
- ):
762
- """
763
- Produces the submit file that will be submitted to the NPX cluster.
764
-
765
- :param job_ids: The job ID or list of job IDs to submit
766
- :type job_ids: int or list of ints
767
- :param node_memory: The amount of memory to request for each node
768
- :type node_memory: str
769
- :param mask_by_position: if `True` mask single exposures that are too far away from the bulk
770
- :type mask_by_position: bool
771
- """
772
-
773
- q = "1 job_id in " + ", ".join(np.atleast_1d(job_ids).astype(str))
774
- d = get_directories()
775
- data_dir = str(d['data_dir'])
776
- bigdata_dir = str(d['bigdata_dir'])
777
-
778
- text = (
779
- f"executable = {self.executable_filename} \n"
780
- f"environment = \"TIMEWISE_DATA={data_dir} TIMEWISE_BIGDATA={bigdata_dir}\" \n"
781
- f"log = $(cluster)_$(process)job.log \n"
782
- f"output = $(cluster)_$(process)job.out \n"
783
- f"error = $(cluster)_$(process)job.err \n"
784
- f"should_transfer_files = YES \n"
785
- f"when_to_transfer_output = ON_EXIT \n"
786
- f"arguments = $(job_id) {mask_by_position}\n"
787
- f"RequestMemory = {node_memory} \n"
788
- f"\n"
789
- f"queue {q}"
790
- )
791
-
792
- fn = self.get_submit_file_filename(job_ids)
793
- logger.debug("writing submitfile at " + fn)
794
- with open(fn, "w") as f:
795
- f.write(text)
796
-
797
def submit_to_cluster(self, node_memory, single_chunk=None, mask_by_position=False):
    """
    Submit jobs to cluster

    Writes one data product file per cluster job, optionally prepares the
    position masks, creates the executable and submit files and finally
    calls `condor_submit`.

    :param node_memory: memory per node
    :type node_memory: str
    :param single_chunk: number of single chunk to run on the cluster, all chunks if `None`
    :type single_chunk: int
    :param mask_by_position: if `True` mask single exposures that are too far away from the bulk
    :type mask_by_position: bool
    :return: ID of the cluster job, `None` if the submit command raised an `OSError`
    :rtype: str
    """

    # job IDs are 1-based; each chunk owns `n_cluster_jobs_per_chunk` consecutive IDs
    if single_chunk is None:
        _start_id = 1
        _end_id = int(self.n_chunks * self.n_cluster_jobs_per_chunk)
    else:
        _start_id = int(single_chunk * self.n_cluster_jobs_per_chunk) + 1
        _end_id = int(_start_id + self.n_cluster_jobs_per_chunk) - 1

    job_ids = list(range(_start_id, _end_id + 1))

    # make data_product files, storing essential info from parent_sample
    for jobID in job_ids:
        indices = self.parent_sample.df.index[self.cluster_jobID_map == jobID]
        logger.debug(f"starting data_product for {len(indices)} objects.")
        data_product = self._start_data_product(parent_sample_indices=indices)
        chunk_number = self._get_chunk_number_for_job(jobID)
        self._save_data_product(data_product, service="tap", chunk_number=chunk_number, jobID=jobID)

    # make position mask files
    if mask_by_position:
        # compare against None explicitly: chunk number 0 is a valid selection
        if single_chunk is not None:
            chunk_numbers = [single_chunk]
        else:
            chunk_numbers = list(range(self.n_chunks))
        for c in chunk_numbers:
            self.get_position_mask(service="tap", chunk_number=c)

    self.make_executable_file()
    self.make_submit_file(job_ids=job_ids, node_memory=node_memory, mask_by_position=mask_by_position)

    # format the filename instead of using `+`, which fails for path objects
    submit_cmd = f"condor_submit {self.get_submit_file_filename(job_ids)}"
    logger.info(f"{time.asctime(time.localtime())}: {submit_cmd}")

    try:
        msg = self._execute_bash_command(submit_cmd)
    except OSError:
        # keep the best-effort behaviour (no job ID is returned) but leave a trace
        logger.exception(f"could not execute '{submit_cmd}'")
        return None

    logger.info(str(msg))
    # the ID is taken from the text between the last "cluster " and the next "."
    job_id = str(msg).split("cluster ")[-1].split(".")[0]
    logger.info(f"Running on cluster with ID {job_id}")
    self.job_id = job_id
    return job_id
853
-
854
def run_cluster(self, node_memory, service):
    """
    Run the DESY cluster

    Clears the cluster log directory, saves the cluster info, submits all
    jobs, waits for them and then combines the data products of every chunk.

    :param node_memory: memory per node
    :type node_memory: str
    :param service: service to use for querying the data
    :type service: str
    """

    # preparation
    self.clear_cluster_log_dir()
    self._save_cluster_info()

    # submission, blocks until `wait_for_job` returns
    self.submit_to_cluster(node_memory)
    self.wait_for_job()

    # merge the per-job results chunk by chunk
    for chunk_number in range(self.n_chunks):
        self._combine_data_products(
            service,
            chunk_number=chunk_number,
            remove=True,
            overwrite=True,
        )
870
-
871
- # ---------------------------------------------------- #
872
- # END using cluster for downloading and binning #
873
- # ----------------------------------------------------------------------------------- #
874
-
875
- ###########################################################################################################
876
- # START MAKE PLOTTING FUNCTIONS #
877
- #####################################
878
-
879
def plot_lc(
        self,
        parent_sample_idx,
        service='tap',
        plot_unbinned=False,
        plot_binned=True,
        interactive=False,
        fn=None,
        ax=None,
        save=True,
        lum_key='flux_density',
        load_from_bigdata_dir=False,
        **kwargs
):
    """Make a pretty plot of a lightcurve

    :param parent_sample_idx: The index in the parent sample of the lightcurve
    :type parent_sample_idx: int or str
    :param service: the service with which the lightcurves were downloaded
    :type service: str
    :param plot_unbinned: plot unbinned data
    :type plot_unbinned: bool
    :param plot_binned: plot binned lightcurve
    :type plot_binned: bool
    :param interactive: interactive mode
    :type interactive: bool
    :param fn: filename, defaults to </path/to/timewise/data/dir>/output/plots/<base_name>/<parent_sample_index>_<lum_key>.pdf
    :type fn: str
    :param ax: pre-existing matplotlib.Axis
    :param save: save the plot
    :type save: bool
    :param lum_key: the unit of luminosity to use in the plot, either of 'mag', 'flux_density' or 'luminosity'
    :type lum_key: str
    :param load_from_bigdata_dir: load from the big data storage directory
    :type load_from_bigdata_dir: bool
    :param kwargs: any additional kwargs are forwarded to `self._plot_lc()`
    :return: whatever `self._plot_lc()` returns
    """

    logger.debug("loading binned lightcurves")

    # the WISE ID is only looked up for the debug message below
    wise_id = self.parent_sample.df.loc[int(parent_sample_idx), self.parent_wise_source_id_key]
    if isinstance(wise_id, float) and not np.isnan(wise_id):
        wise_id = int(wise_id)
    logger.debug(f"{wise_id} for {parent_sample_idx}")

    _chunk_number = self._get_chunk_number(parent_sample_index=parent_sample_idx)
    data_product = self.load_data_product(
        service,
        chunk_number=_chunk_number,
        use_bigdata_dir=load_from_bigdata_dir
    )
    lc = pd.DataFrame.from_dict(data_product[parent_sample_idx]["timewise_lightcurve"])

    if plot_unbinned:
        # pick the loader that matches the download service
        _get_unbinned_lcs_fct = (
            self.get_unbinned_lightcurves
            if service == 'tap' else
            self._get_unbinned_lightcurves_gator
        )
        unbinned_lcs = _get_unbinned_lcs_fct(_chunk_number)
        # keep only the rows that belong to the requested object
        unbinned_lc = unbinned_lcs[unbinned_lcs[self._tap_orig_id_key] == int(parent_sample_idx)]
    else:
        unbinned_lc = None

    _lc = lc if plot_binned else None

    if not fn:
        fn = self.plots_dir / f"{parent_sample_idx}_{lum_key}.pdf"

    return self._plot_lc(lightcurve=_lc, unbinned_lc=unbinned_lc, interactive=interactive, fn=fn, ax=ax,
                         save=save, lum_key=lum_key, **kwargs)
956
-
957
- # --------------------------------------------------------------------------------------
958
- # START Chi2 plots
959
- # -------------------------------------------
960
-
961
@cache
def get_red_chi2(self, chunk, lum_key, use_bigdata_dir=False):
    """
    Get the reduced chi2 for a given chunk

    The result is memoized with `functools.cache`, so all arguments
    (including `self`) have to be hashable; a list of chunks can not be
    passed here.

    :param chunk: the chunk number
    :type chunk: int
    :param lum_key: suffix of the metadata keys, it is appended verbatim to e.g. `<band>_chi2_to_med`
        (`make_chi2_plot` passes `"_flux_density"` by default)
    :type lum_key: str
    :param use_bigdata_dir: load from the big data storage directory, default is False
    :type use_bigdata_dir: bool, optional
    :return: the reduced chi2 for each band, the DataFrame will have columns `chi2`, `med_lum` and `N_datapoints`
    :rtype: dict[str, pd.DataFrame]
    """

    logger.info(f"getting reduced chi2 for chunk {chunk}")
    data_product = self.load_data_product(service="tap", chunk_number=chunk, use_bigdata_dir=use_bigdata_dir)

    chi2_val = {b: dict() for b in self.bands}

    for b in self.bands:
        key1 = f"{b}_chi2_to_med{lum_key}"
        key2 = f"{b}_N_datapoints{lum_key}"
        key3 = f"{b}_median{lum_key}"
        logger.debug(f"{key1}, {key2}")

        for i, idata_product in tqdm.tqdm(
                data_product.items(),
                total=len(data_product),
                desc="collecting chi2 values"
        ):
            if "timewise_metadata" not in idata_product:
                continue

            imetadata = idata_product["timewise_metadata"]
            if (key1 not in imetadata) or (key2 not in imetadata):
                continue

            # one degree of freedom is used up by the median
            ndof = (imetadata[key2] - 1)
            chi2_val[b][i] = {
                "chi2": imetadata[key1] / ndof if ndof > 0 else np.nan,
                # the median is not covered by the presence check above,
                # so fall back to NaN instead of raising a KeyError
                "med_lum": imetadata[key3] if key3 in imetadata else np.nan,
                "N_datapoints": imetadata[key2]
            }

    return {b: pd.DataFrame.from_dict(chi2_val[b], orient='index') for b in self.bands}
1005
-
1006
def make_chi2_plot(
        self,
        index_mask=None,
        chunks=None,
        load_from_bigdata_dir=False,
        lum_key="_flux_density",
        interactive=False,
        save=False,
        nbins=100,
        cumulative=True,
        upper_bound=4
):
    """
    Make a plot of the reduced chi2 distribution for a given chunk or multiple chunks

    One figure is made for every number of datapoints that occurs in all
    bands (lightcurves with a single datapoint are skipped). Each figure
    has one panel per band showing the histogram of the reduced chi2, a
    fitted F-distribution and the expected chi2-distribution.

    :param index_mask: a mask to apply to the parent sample, eg {'AGNs': agn_mask}
    :type index_mask: dict
    :param chunks: the chunk number or list of chunk numbers, all chunks if `None`
    :type chunks: int or list
    :param load_from_bigdata_dir: load from the big data storage directory, default is False
    :type load_from_bigdata_dir: bool, optional
    :param lum_key: suffix of the metadata keys that selects the unit of luminosity, default is "_flux_density"
    :type lum_key: str
    :param interactive: return the figures and axes if True, default is False
    :type interactive: bool
    :param save: save the plot, default is False
    :type save: bool
    :param nbins: the number of bins to use in the histogram, default is 100
    :type nbins: int
    :param cumulative: plot the cumulative distribution, default is True
    :type cumulative: bool
    :param upper_bound: the upper bound of the x-axis, default is 4
    :type upper_bound: float
    :return: a list with one (figure, axes) tuple per plot if `interactive=True`, otherwise `None`
    :rtype: list[tuple[mpl.Figure, mpl.Axes]]
    """

    if chunks is None:
        chunks = list(range(self.n_chunks))
    elif isinstance(chunks, (int, np.integer)):
        # a single chunk number is documented as valid input
        chunks = [chunks]

    chi2_data_list = [self.get_red_chi2(chunk, lum_key, load_from_bigdata_dir) for chunk in chunks]
    chi2_data = {b: pd.concat([d[b] for d in chi2_data_list]) for b in self.bands}

    # only consider numbers of datapoints that are present in every band
    N_datapoints = set.intersection(*[set(df["N_datapoints"].unique()) for df in chi2_data.values()])

    # the colors, bin edges and the dense x-grid are the same for every figure
    index_colors = (
        {k: f"C{(i + 1) * 2}"
         for i, k in enumerate(index_mask.keys())}
        if index_mask is not None else None
    )

    # regular bins up to `upper_bound` plus one overflow bin
    x = np.linspace(0, upper_bound, nbins)
    x = np.concatenate([x, [1e6]])
    x_dense = np.linspace(min(x), upper_bound, 1000)

    res = list()

    for n in N_datapoints:

        if n == 1:
            continue

        chi2_df_sel = {b: df[df["N_datapoints"] == n]["chi2"] for b, df in chi2_data.items()}

        logger.info(f"making chi2 histogram for lightcurves with {n} datapoints")

        fig, axs = plt.subplots(
            ncols=len(self.bands),
            figsize=(10, 5),
            sharey="all",
            sharex="all"
        )

        # `plt.subplots` returns a bare Axes object for a single band
        for ax, band in zip(np.atleast_1d(axs), self.bands):
            band_values = chi2_df_sel[band].values.flatten()

            h, b, _ = ax.hist(
                band_values,
                label="all",
                density=True,
                cumulative=cumulative,
                color="k",
                bins=x,
                lw=3,
                histtype="step",
                zorder=20,
            )
            bmids = (b[1:] + b[:-1]) / 2

            # if cumulative then also calculate the histogram of the PDF
            # this is used below to calculate the goodness of fit to the F- and Chi2-distribution
            hpdf = h if not cumulative else \
                np.histogram(band_values, bins=x, density=True)[0]
            nonzero_m = hpdf > 0

            # we need the absolute histogram numbers to calculate the uncertainties of the density bins
            # The uncertainty of the density bin d_i is
            #
            #    u_di = u_ci * sum_{j not i}(c_j) / [(sum_{j}c_j)^2 * (b_{i+1} - b_{i})]
            #
            # where u_ci = sqrt(c_i) is the uncertainty of the counts bin c_i
            #
            h_abs = np.histogram(band_values, bins=x, density=False)[0][nonzero_m]
            h_abs_sum = np.sum(h_abs)
            h_sum_not_i = h_abs_sum - h_abs
            u_density = np.sqrt(h_abs) * h_sum_not_i / (h_abs_sum ** 2 * (np.diff(x))[nonzero_m])

            if index_mask is not None:
                # bars get a black edge, the cumulative step lines do not
                edge_kwargs = (
                    dict()
                    if cumulative else
                    {"edgecolor": "k"}
                )

                for label, indices in index_mask.items():
                    _indices = chi2_df_sel[band].index.intersection(indices)

                    sns.histplot(
                        chi2_df_sel[band].loc[_indices].values.flatten(),
                        label=label,
                        stat="density",
                        bins=b,
                        ax=ax,
                        color=index_colors[label],
                        element="step" if cumulative else "bars",
                        alpha=0.7,
                        fill=not cumulative,
                        zorder=10,
                        lw=3 if cumulative else 1,
                        cumulative=cumulative,
                        **edge_kwargs
                    )

            # select non-NaN's and values below `upper_bound`
            sel = chi2_df_sel[band][(~chi2_df_sel[band].isna()) & (chi2_df_sel[band] < upper_bound)]
            if len(sel) > 0:

                # fit an F-distribution, the first shape parameter and the location are fixed
                fpars = f.fit(sel, n - 1, 1e5, f0=n - 1, floc=0)
                frozenf = f(*fpars)
                fpdf = frozenf.pdf

                # if cumulative then draw the CDF instead of the PDF
                ffunc = frozenf.cdf if cumulative else fpdf

                # To see how well the distribution fits the data we'll calculate the chi2
                # to the PDF (not to the CDF because the bins in CDF are correlated)
                ndof_fit = len(bmids[nonzero_m]) - 2
                F_chi2fit = sum((hpdf[nonzero_m] - fpdf(bmids[nonzero_m])) ** 2 / u_density ** 2) / ndof_fit

                # plot the fitted distribution
                ax.plot(x_dense, ffunc(x_dense), color='deepskyblue', ls="--", lw=3,
                        label=(
                            rf"F-distribution" + "\n" +
                            rf"$\nu_1$={fpars[0]:.0f}, $\nu_2$={fpars[1]:.2f}, scale={fpars[-1]:.2f}"
                        ),
                        zorder=30
                        )

                # we will also show the expected chi2 distribution
                pars_expected = (n - 1, 0, 1 / (n - 1))
                chi2_expected = chi2(*pars_expected)
                r = chi2_expected.cdf(x_dense) if cumulative else chi2_expected.pdf(x_dense)
                chi2_fitchi2 = sum((hpdf[nonzero_m] - chi2_expected.pdf(bmids[nonzero_m])) ** 2 / hpdf[nonzero_m])
                ax.plot(x_dense, r, color="deepskyblue", ls=":", lw=3,
                        label=rf"$\chi^2$-distribution" + "\n" + rf"$\nu$: {n - 1:.0f}",
                        zorder=30)

                # report the goodness-of-fit values instead of discarding them
                logger.debug(
                    f"{band}, {n} datapoints: F-distribution fit chi2/ndof = {F_chi2fit}, "
                    f"chi2 to expected chi2-distribution = {chi2_fitchi2}"
                )

            ax.legend()
            ax.set_xlabel(r"$\chi^2_{" + band + "} / N_{visits," + band + "}$")
            ax.set_xlim(0, upper_bound)

            for loc in ["top", "right"]:
                ax.spines[loc].set_visible(False)

        fig.suptitle(f"{n} datapoints")
        fig.tight_layout()

        if save:
            kind = "cdf" if cumulative else "pdf"
            chunk_str = "chunks_" + "_".join([str(c) for c in chunks]) \
                if len(chunks) != self.n_chunks \
                else "all_chunks"
            fn = self.plots_dir / f"chi2_plots" / lum_key / f"{n}_datapoints_{kind}_{chunk_str}.pdf"
            fn.parent.mkdir(parents=True, exist_ok=True)
            logger.debug(f"saving under {fn}")
            fig.savefig(fn)

        if interactive:
            res.append((fig, axs))
        else:
            # close exactly this figure, not whichever one happens to be current
            plt.close(fig)

    if interactive:
        return res
1198
-
1199
- # -------------------------------------------
1200
- # END Chi2 plots
1201
- # --------------------------------------------------------------------------------------
1202
-
1203
- # --------------------------------------------------------------------------------------
1204
- # START coverage plots
1205
- # -------------------------------------------
1206
-
1207
@cache
def get_coverage(self, chunk, lum_key, load_from_bigdata_dir=False):
    """
    Get the coverage of the MEASURED median for a given chunk and lum_key

    The result is memoized with `functools.cache`, so the arguments have to
    be hashable.

    :param chunk: chunk number
    :type chunk: int
    :param lum_key: luminosity key, appended verbatim to `<band>_coverage_of_median`
    :type lum_key: str
    :param load_from_bigdata_dir: if True, load the coverage from the bigdata directory
    :type load_from_bigdata_dir: bool, optional
    :return: one DataFrame per band with the column `coverage`
    :rtype: dict[str, pd.DataFrame]
    """
    logger.info(f"getting coverage for chunk {chunk}")
    data_product = self.load_data_product(service="tap", chunk_number=chunk, use_bigdata_dir=load_from_bigdata_dir)

    frames = dict()

    for band in self.bands:
        metadata_key = f"{band}_coverage_of_median{lum_key}"
        per_object = dict()

        progress = tqdm.tqdm(
            data_product.items(),
            total=len(data_product),
            desc="collecting coverage values"
        )
        for idx, entry in progress:
            # objects without metadata or without this key are left out
            if "timewise_metadata" not in entry:
                continue
            metadata = entry["timewise_metadata"]
            if metadata_key not in metadata:
                continue
            per_object[idx] = {"coverage": metadata[metadata_key]}

        frames[band] = pd.DataFrame.from_dict(per_object, orient='index')

    return frames
1241
-
1242
- @staticmethod
1243
- def get_quantiles_label(df, cl=0.68):
1244
- """
1245
- Get the quantiles label for a given coverage level
1246
- """
1247
- med = np.nanmedian(df)
1248
- ic = np.nanpercentile(df, [50 - cl / 2 * 100, 50 + cl / 2 * 100]) - med
1249
- label = rf"$ {med:.2f} ^{{ +{ic[1]:.2f} }} _{{ {ic[0]:.2f} }}$"
1250
- return label
1251
-
1252
def make_coverage_plots(
        self,
        index_mask=None,
        chunks=None,
        load_from_bigdata_dir=False,
        lum_key="_flux_density",
        interactive=False,
        save=False,
        nbins=100,
):
    """
    Make the coverage plots for the measured median of the specified luminosity unit

    A single figure with one panel per band is made. Every panel shows the
    histogram of the coverage values of all objects and, if `index_mask` is
    given, one additional histogram per entry of the mask.

    :param index_mask: index mask to apply to the data, e.g. {"AGNs": agn_mask}
    :type index_mask: dict, optional
    :param chunks: chunks to use, if None use all chunks
    :type chunks: list[int], int, optional
    :param load_from_bigdata_dir: if True, load the coverage from the bigdata directory
    :type load_from_bigdata_dir: bool, optional
    :param lum_key: luminosity key, either of "_flux_density" or "_mag", default is "_flux_density"
    :type lum_key: str, optional
    :param interactive: if True, return the figure and axes, otherwise close the figure
    :type interactive: bool, optional
    :param save: if True, save the figure
    :type save: bool, optional
    :param nbins: number of bins for the histograms
    :type nbins: int, optional
    :return: if interactive, the figure and axes, otherwise `None`
    :rtype: tuple[matplotlib.figure.Figure, matplotlib.axes.Axes]
    """

    if chunks is None:
        chunks = list(range(self.n_chunks))
    elif isinstance(chunks, (int, np.integer)):
        # a single chunk number is documented as valid input
        chunks = [chunks]

    coverages = [self.get_coverage(chunk, lum_key, load_from_bigdata_dir=load_from_bigdata_dir) for chunk in chunks]
    coverages_df = {b: pd.concat([c[b] for c in coverages]) for b in self.bands}

    fig, axs = plt.subplots(
        1, len(self.bands),
        figsize=(len(self.bands) * 4, 4),
        sharey="all",
        sharex="all"
    )
    # `plt.subplots` returns a bare Axes object for a single band
    axes_list = np.atleast_1d(axs)

    for ax, band in zip(axes_list, self.bands):
        _coverages = coverages_df[band].values.flatten()
        label = "all\n" + self.get_quantiles_label(_coverages)

        sns.histplot(
            _coverages,
            label=label,
            stat="density",
            bins=nbins,
            ax=ax,
            element="step",
            fill=False,
            lw=3,
            color="k",
            zorder=20,
        )

        ax.set_xlabel("coverage " + band)
        ax.set_xlim(0, 1)

        if index_mask is not None:
            for i, (mask_label, indices) in enumerate(index_mask.items()):
                _indices = coverages_df[band].index.intersection(indices)
                _masked_coverages = coverages_df[band].loc[_indices].values.flatten()
                _label = mask_label + "\n" + self.get_quantiles_label(_masked_coverages)
                sns.histplot(
                    _masked_coverages,
                    label=_label,
                    stat="density",
                    bins=nbins,
                    ax=ax,
                    color=f"C{(i + 1) * 2}",
                    element="bars",
                    alpha=0.7,
                    fill=True,
                    zorder=10
                )

        ax.legend()
        for loc in ["top", "right"]:
            ax.spines[loc].set_visible(False)

        ax.grid(True, axis="y", ls=":", lw=0.5, color="k", alpha=0.5, zorder=0)

    axes_list[0].set_ylabel("density")

    # do the layout once, after all labels exist, so that they are taken into account
    fig.suptitle(f"coverage of median")
    fig.tight_layout()

    if save:
        chunk_str = "chunks_" + "_".join([str(c) for c in chunks]) \
            if len(chunks) != self.n_chunks \
            else "all_chunks"
        fn = self.plots_dir / f"coverage_plots" / lum_key / f"{chunk_str}.pdf"
        fn.parent.mkdir(parents=True, exist_ok=True)
        logger.debug(f"saving under {fn}")
        fig.savefig(fn)

    if interactive:
        return fig, axs
    else:
        # close exactly this figure, not whichever one happens to be current
        plt.close(fig)
1357
-
1358
- # -------------------------------------------
1359
- # END coverage plots
1360
- # --------------------------------------------------------------------------------------
1361
-
1362
- #####################################
1363
- # END MAKE PLOTTING FUNCTIONS #
1364
- ###########################################################################################################
1365
-
1366
-
1367
- if __name__ == '__main__':
1368
- parser = argparse.ArgumentParser()
1369
- parser.add_argument('--job_id', type=int)
1370
- parser.add_argument('--base_name', type=str)
1371
- parser.add_argument('--min_sep_arcsec', type=float)
1372
- parser.add_argument('--n_chunks', type=int)
1373
- parser.add_argument('--mask_by_position', type=str, default=False)
1374
- parser.add_argument('--logging_level', type=str, default='INFO')
1375
- cfg = parser.parse_args()
1376
-
1377
- try:
1378
- logging_level = int(cfg.logging_level)
1379
- except ValueError:
1380
- logging_level = cfg.logging_level.upper()
1381
-
1382
- logging.getLogger("timewise").setLevel(logging_level)
1383
- logger = logging.getLogger("timewise.main")
1384
- logger.info(json.dumps(vars(cfg), indent=4))
1385
-
1386
- wd = WISEDataDESYCluster(base_name=cfg.base_name,
1387
- min_sep_arcsec=cfg.min_sep_arcsec,
1388
- n_chunks=cfg.n_chunks,
1389
- parent_sample_class=None)
1390
- wd._load_cluster_info()
1391
- chunk_number = wd._get_chunk_number_for_job(cfg.job_id)
1392
-
1393
- match cfg.mask_by_position:
1394
- case "True":
1395
- mask_by_position = True
1396
- case "False":
1397
- mask_by_position = False
1398
- case other:
1399
- raise ValueError(f"mask_by_position has to be either of 'True' or 'False', not {cfg.mask_by_position}")
1400
-
1401
- wd._subprocess_select_and_bin(
1402
- service='tap',
1403
- chunk_number=chunk_number,
1404
- jobID=cfg.job_id,
1405
- mask_by_position=mask_by_position
1406
- )
1407
- wd.calculate_metadata(service='tap', chunk_number=chunk_number, jobID=cfg.job_id)