sedlib 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sedlib/__init__.py +47 -0
- sedlib/bol2rad.py +552 -0
- sedlib/catalog.py +1141 -0
- sedlib/core.py +6059 -0
- sedlib/data/__init__.py +2 -0
- sedlib/data/temp_to_bc_coefficients.yaml +62 -0
- sedlib/filter/__init__.py +5 -0
- sedlib/filter/core.py +1064 -0
- sedlib/filter/data/__init__.py +2 -0
- sedlib/filter/data/svo_all_filter_database.pickle +0 -0
- sedlib/filter/data/svo_filter_catalog.pickle +0 -0
- sedlib/filter/data/svo_meta_data.xml +1282 -0
- sedlib/filter/utils.py +71 -0
- sedlib/helper.py +361 -0
- sedlib/utils.py +789 -0
- sedlib/version.py +12 -0
- sedlib-1.0.0.dist-info/METADATA +611 -0
- sedlib-1.0.0.dist-info/RECORD +21 -0
- sedlib-1.0.0.dist-info/WHEEL +5 -0
- sedlib-1.0.0.dist-info/licenses/LICENSE +201 -0
- sedlib-1.0.0.dist-info/top_level.txt +1 -0
sedlib/catalog.py
ADDED
@@ -0,0 +1,1141 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
|
3
|
+
"""
|
4
|
+
Catalog class for organizing photometric data
|
5
|
+
"""
|
6
|
+
|
7
|
+
__all__ = ['Catalog']
|
8
|
+
|
9
|
+
import logging
import operator
import re
import sqlite3

import numpy as np
import pandas as pd

from astropy import units as u
from astropy.modeling.physical_models import BlackBody
from astropy.table import Table
|
19
|
+
|
20
|
+
|
21
|
+
# # Set up logging
|
22
|
+
# logger = logging.getLogger(__name__)
|
23
|
+
# logger.setLevel(logging.DEBUG)
|
24
|
+
|
25
|
+
# # Create a file handler
|
26
|
+
# fh = logging.FileHandler('sed.log')
|
27
|
+
# fh.setLevel(logging.DEBUG)
|
28
|
+
|
29
|
+
# # Create a formatter
|
30
|
+
# formatter = logging.Formatter(
|
31
|
+
# '%(asctime)s - %(levelname)s - %(message)s',
|
32
|
+
# datefmt='%Y-%m-%dT%H:%M:%S'
|
33
|
+
# )
|
34
|
+
# fh.setFormatter(formatter)
|
35
|
+
|
36
|
+
# # Add the handler to the logger
|
37
|
+
# logger.addHandler(fh)
|
38
|
+
|
39
|
+
|
40
|
+
class Catalog:
|
41
|
+
"""
|
42
|
+
A class to manage and organize photometric catalog data.
|
43
|
+
|
44
|
+
This class provides a wrapper around Astropy Table objects for managing
|
45
|
+
photometric data, including methods for data cleaning, filtering, and
|
46
|
+
statistical analysis.
|
47
|
+
|
48
|
+
Parameters
|
49
|
+
----------
|
50
|
+
name : str, optional
|
51
|
+
Name identifier for the catalog.
|
52
|
+
table : astropy.table.Table, optional
|
53
|
+
Existing Astropy Table to use as the catalog data.
|
54
|
+
logger : logging.Logger, optional
|
55
|
+
Logger instance for logging operations.
|
56
|
+
**kwargs : dict, optional
|
57
|
+
Additional keyword arguments to set as attributes.
|
58
|
+
|
59
|
+
Attributes
|
60
|
+
----------
|
61
|
+
name : str
|
62
|
+
Catalog name identifier.
|
63
|
+
table : astropy.table.Table
|
64
|
+
The underlying Astropy Table containing photometric data.
|
65
|
+
teff : astropy.units.Quantity, optional
|
66
|
+
Effective temperature of the target object.
|
67
|
+
teff_error : astropy.units.Quantity, optional
|
68
|
+
Error in effective temperature.
|
69
|
+
radius : astropy.units.Quantity, optional
|
70
|
+
Stellar radius of the target object.
|
71
|
+
radius_error : astropy.units.Quantity, optional
|
72
|
+
Error in stellar radius.
|
73
|
+
distance : astropy.units.Quantity, optional
|
74
|
+
Distance to the target object.
|
75
|
+
distance_error : astropy.units.Quantity, optional
|
76
|
+
Error in distance.
|
77
|
+
rejected_data : astropy.table.Table, optional
|
78
|
+
Data points that were rejected during filtering operations.
|
79
|
+
is_rejected : bool
|
80
|
+
Flag indicating if data has been rejected.
|
81
|
+
|
82
|
+
Methods
|
83
|
+
-------
|
84
|
+
from_table(table)
|
85
|
+
Initialize catalog from existing Astropy Table.
|
86
|
+
add_rows(new_rows)
|
87
|
+
Add new rows to the catalog.
|
88
|
+
select_rows(criteria, as_dataframe=False)
|
89
|
+
Select rows based on criteria.
|
90
|
+
update_rows(criteria, new_data)
|
91
|
+
Update rows matching criteria.
|
92
|
+
delete_rows(criteria=None, row_numbers=None)
|
93
|
+
Delete rows based on criteria or row numbers.
|
94
|
+
find_missing_data_rows(columns, as_dataframe=True)
|
95
|
+
Find rows with missing data.
|
96
|
+
delete_missing_data_rows(columns)
|
97
|
+
Delete rows with missing data.
|
98
|
+
combine_fluxes(method='mean', overwrite=False)
|
99
|
+
Combine duplicate filter measurements.
|
100
|
+
filter_outliers(sigma_threshold=3.0)
|
101
|
+
Apply sigma clipping to filter outliers.
|
102
|
+
flux_to_magnitude()
|
103
|
+
Convert flux values to magnitudes.
|
104
|
+
get_column_stats(column_name)
|
105
|
+
Calculate statistics for a column.
|
106
|
+
sql_query(query)
|
107
|
+
Execute SQL query on the catalog.
|
108
|
+
|
109
|
+
Examples
|
110
|
+
--------
|
111
|
+
>>> from sedlib import Catalog
|
112
|
+
>>> from astropy.table import Table
|
113
|
+
>>> import numpy as np
|
114
|
+
>>>
|
115
|
+
>>> # Create catalog from data
|
116
|
+
>>> data = {
|
117
|
+
>>> 'RA': [180.0, 180.1],
|
118
|
+
>>> 'DEC': [30.0, 30.1],
|
119
|
+
>>> 'filter': ['V', 'B'],
|
120
|
+
>>> 'flux': [1e-12, 8e-13],
|
121
|
+
>>> 'eflux': [1e-13, 8e-14]
|
122
|
+
>>> }
|
123
|
+
>>> table = Table(data)
|
124
|
+
>>> catalog = Catalog('test_catalog', table)
|
125
|
+
>>>
|
126
|
+
>>> # Filter outliers
|
127
|
+
>>> catalog.filter_outliers(sigma_threshold=2.0)
|
128
|
+
>>>
|
129
|
+
>>> # Get statistics
|
130
|
+
>>> stats = catalog.get_column_stats('flux')
|
131
|
+
>>> print(f"Mean flux: {stats['mean']:.2e}")
|
132
|
+
"""
|
133
|
+
|
134
|
+
def __init__(self, name=None, table=None, logger=None, **kwargs):
|
135
|
+
self.name = name
|
136
|
+
self.table = table if table is not None else Table()
|
137
|
+
|
138
|
+
self._logger = logger
|
139
|
+
self._logger.info(f"Initialized Catalog with name: {name}")
|
140
|
+
|
141
|
+
self.teff = None
|
142
|
+
self.teff_error = None
|
143
|
+
self.radius = None
|
144
|
+
self.radius_error = None
|
145
|
+
self.distance = None
|
146
|
+
self.distance_error = None
|
147
|
+
|
148
|
+
for key, val in kwargs.items():
|
149
|
+
setattr(self, key, val)
|
150
|
+
|
151
|
+
if len(self.table) > 0:
|
152
|
+
self.table['RA'].info.format = '.3f'
|
153
|
+
self.table['DEC'].info.format = '.3f'
|
154
|
+
self.table['wavelength'].info.format = '.3f'
|
155
|
+
|
156
|
+
columns = [
|
157
|
+
'RA', 'DEC', 'vizier_filter', 'filter',
|
158
|
+
'frequency', 'wavelength', 'flux', 'eflux'
|
159
|
+
]
|
160
|
+
self.table = table[columns]
|
161
|
+
|
162
|
+
self.rejected_data = None
|
163
|
+
self.is_rejected = False
|
164
|
+
self._rejected_sigma_threshold = 3.0
|
165
|
+
|
166
|
+
def __str__(self):
|
167
|
+
return str(self.table)
|
168
|
+
|
169
|
+
def __repr__(self):
|
170
|
+
return str(self.table)
|
171
|
+
|
172
|
+
def __setitem__(self, key, value):
|
173
|
+
if self.table is None:
|
174
|
+
raise ValueError("Table is not initialized.")
|
175
|
+
|
176
|
+
try:
|
177
|
+
self.table[key] = value
|
178
|
+
except ValueError:
|
179
|
+
raise ValueError(f"Column '{key}' does not exist in the table.")
|
180
|
+
|
181
|
+
def __getitem__(self, item):
|
182
|
+
if self.table is None:
|
183
|
+
raise ValueError("Table is not initialized.")
|
184
|
+
|
185
|
+
try:
|
186
|
+
return self.table[item]
|
187
|
+
except ValueError:
|
188
|
+
raise ValueError(f"Column '{item}' does not exist in the table.")
|
189
|
+
|
190
|
+
# def __getattr__(self, name):
|
191
|
+
# try:
|
192
|
+
# return getattr(self.table, name)
|
193
|
+
# except AttributeError:
|
194
|
+
# raise AttributeError(
|
195
|
+
# f"'{type(self).__name__}' object has no attribute '{name}'"
|
196
|
+
# )
|
197
|
+
|
198
|
+
def __len__(self):
|
199
|
+
if self.table is None:
|
200
|
+
raise ValueError("Table is not initialized.")
|
201
|
+
return len(self.table)
|
202
|
+
|
203
|
+
def __call__(self, max_rows=20):
|
204
|
+
return self.table.show_in_notebook(display_length=max_rows)
|
205
|
+
|
206
|
+
def from_table(self, table):
|
207
|
+
"""
|
208
|
+
Initialize Catalog with an existing astropy Table.
|
209
|
+
|
210
|
+
Parameters
|
211
|
+
----------
|
212
|
+
table : astropy.table.Table
|
213
|
+
The astropy Table to use as the catalog.
|
214
|
+
|
215
|
+
Raises
|
216
|
+
------
|
217
|
+
TypeError
|
218
|
+
If table is not an astropy.table.Table object.
|
219
|
+
"""
|
220
|
+
if not isinstance(table, Table):
|
221
|
+
raise TypeError("Argument must be an astropy.table.Table object")
|
222
|
+
|
223
|
+
self.table = table
|
224
|
+
self._self._logger.info(
|
225
|
+
f"Loaded table with {len(table)} rows and {len(table.columns)} columns"
|
226
|
+
)
|
227
|
+
|
228
|
+
def sql_query(self, query):
|
229
|
+
"""
|
230
|
+
Execute a SQL query on the catalog to retrieve data
|
231
|
+
based on specific conditions.
|
232
|
+
|
233
|
+
Parameters
|
234
|
+
----------
|
235
|
+
query : str
|
236
|
+
A SQL query string to execute.
|
237
|
+
|
238
|
+
Returns
|
239
|
+
-------
|
240
|
+
result : pandas.DataFrame
|
241
|
+
A DataFrame containing the query result.
|
242
|
+
|
243
|
+
Raises
|
244
|
+
------
|
245
|
+
ValueError
|
246
|
+
If the catalog table is not initialized or if the query fails.
|
247
|
+
|
248
|
+
Examples
|
249
|
+
--------
|
250
|
+
>>> catalog.sql_query("SELECT * FROM catalog WHERE flux > 1.5")
|
251
|
+
>>> catalog.sql_query(
|
252
|
+
"SELECT eflux FROM catalog WHERE vizier_filter LIKE '%Johnson%'"
|
253
|
+
)
|
254
|
+
"""
|
255
|
+
if self.table is None:
|
256
|
+
raise ValueError("Catalog table is not initialized.")
|
257
|
+
|
258
|
+
self._logger.debug(f"Executing SQL query: {query}")
|
259
|
+
|
260
|
+
# Convert the astropy table to a pandas DataFrame
|
261
|
+
self._logger.debug("Converting astropy table to pandas DataFrame")
|
262
|
+
df = self.table.to_pandas()
|
263
|
+
|
264
|
+
# Convert unsupported column types to strings for SQLite compatibility
|
265
|
+
self._logger.debug("Converting unsupported column types to strings")
|
266
|
+
for col in df.columns:
|
267
|
+
if (not pd.api.types.is_numeric_dtype(df[col]) and
|
268
|
+
not pd.api.types.is_string_dtype(df[col])):
|
269
|
+
df[col] = df[col].astype(str)
|
270
|
+
|
271
|
+
# Load the DataFrame into an in-memory SQLite database
|
272
|
+
self._logger.debug("Loading DataFrame into SQLite database")
|
273
|
+
connection = sqlite3.connect(":memory:")
|
274
|
+
try:
|
275
|
+
df.to_sql(
|
276
|
+
"catalog",
|
277
|
+
connection,
|
278
|
+
index=False,
|
279
|
+
if_exists="replace"
|
280
|
+
)
|
281
|
+
# Execute the query and fetch the results into a DataFrame
|
282
|
+
self._logger.debug("Executing SQL query and fetching results")
|
283
|
+
result_df = pd.read_sql_query(query, connection)
|
284
|
+
self._logger.info(f"SQL query returned {len(result_df)} rows")
|
285
|
+
return result_df
|
286
|
+
except Exception as e:
|
287
|
+
self._logger.error(f"SQL query failed: {str(e)}")
|
288
|
+
raise ValueError(f"Failed to execute query: {e}")
|
289
|
+
finally:
|
290
|
+
connection.close() # Ensure the SQLite connection is closed
|
291
|
+
|
292
|
+
def add_rows(self, new_rows):
|
293
|
+
"""
|
294
|
+
Add new rows to the catalog.
|
295
|
+
|
296
|
+
Parameters
|
297
|
+
----------
|
298
|
+
new_rows : list of dict
|
299
|
+
List of dictionaries, where each dictionary represents a new row
|
300
|
+
with keys as column names and values as row data.
|
301
|
+
|
302
|
+
Raises
|
303
|
+
------
|
304
|
+
ValueError
|
305
|
+
If the catalog table has not been initialized or columns do not match.
|
306
|
+
"""
|
307
|
+
if self.table is None:
|
308
|
+
raise ValueError("Catalog table is not initialized.")
|
309
|
+
|
310
|
+
self._logger.info(f"Adding {len(new_rows)} new rows to catalog")
|
311
|
+
|
312
|
+
try:
|
313
|
+
self._logger.debug("Validating column names in new rows")
|
314
|
+
for row_data in new_rows:
|
315
|
+
if not all(column in self.table.colnames
|
316
|
+
for column in row_data.keys()):
|
317
|
+
raise ValueError(
|
318
|
+
"One or more columns in the new row data do not exist "
|
319
|
+
"in the table."
|
320
|
+
)
|
321
|
+
self.table.add_row(row_data)
|
322
|
+
self._logger.debug("Successfully added new rows")
|
323
|
+
except Exception as e:
|
324
|
+
self._logger.error(f"Failed to add rows: {str(e)}")
|
325
|
+
raise
|
326
|
+
|
327
|
+
def select_rows(self, criteria, as_dataframe=False):
|
328
|
+
"""
|
329
|
+
Select rows from the catalog that meet the specified criteria.
|
330
|
+
|
331
|
+
Parameters
|
332
|
+
----------
|
333
|
+
criteria : dict
|
334
|
+
A dictionary specifying conditions to select rows for extraction.
|
335
|
+
Keys are column names, and values are conditions (e.g., {"flux": "<15"}).
|
336
|
+
|
337
|
+
as_dataframe : bool, optional
|
338
|
+
If True, returns the result as a Pandas DataFrame; if False, as an
|
339
|
+
astropy.table.Table. Default is False.
|
340
|
+
|
341
|
+
Returns
|
342
|
+
-------
|
343
|
+
result : astropy.table.Table or pandas.DataFrame
|
344
|
+
The rows that match the criteria, either as a Table or DataFrame.
|
345
|
+
|
346
|
+
Raises
|
347
|
+
------
|
348
|
+
ValueError
|
349
|
+
If the catalog table is not initialized.
|
350
|
+
"""
|
351
|
+
|
352
|
+
if self.table is None:
|
353
|
+
raise ValueError("Catalog table is not initialized.")
|
354
|
+
|
355
|
+
self._logger.debug(f"Selecting rows with criteria: {criteria}")
|
356
|
+
|
357
|
+
# Get row indices that match the criteria
|
358
|
+
self._logger.debug("Filtering rows based on criteria")
|
359
|
+
row_indices = self._filter_by_criteria(criteria)
|
360
|
+
|
361
|
+
# Extract the rows from the table
|
362
|
+
self._logger.debug("Extracting matching rows")
|
363
|
+
extracted_table = self.table[row_indices]
|
364
|
+
|
365
|
+
# Convert to DataFrame if requested
|
366
|
+
if as_dataframe:
|
367
|
+
self._logger.debug("Converting result to pandas DataFrame")
|
368
|
+
return extracted_table.to_pandas()
|
369
|
+
|
370
|
+
self._logger.info(f"Selected {len(extracted_table)} rows matching criteria")
|
371
|
+
return extracted_table
|
372
|
+
|
373
|
+
def update_rows(self, criteria, new_data):
|
374
|
+
"""
|
375
|
+
Update rows in the catalog that match the criteria with new data.
|
376
|
+
|
377
|
+
Parameters
|
378
|
+
----------
|
379
|
+
criteria : dict
|
380
|
+
A dictionary specifying conditions to select rows for updating.
|
381
|
+
Keys are column names, and values are conditions
|
382
|
+
(e.g., {"flux": "<15"} or {"eflux": None}).
|
383
|
+
|
384
|
+
new_data : dict
|
385
|
+
Dictionary where keys are column names and values are the new data
|
386
|
+
to be set in the selected rows.
|
387
|
+
|
388
|
+
Raises
|
389
|
+
------
|
390
|
+
ValueError
|
391
|
+
If the criteria contain invalid operators or
|
392
|
+
if `new_data` contains invalid columns.
|
393
|
+
"""
|
394
|
+
|
395
|
+
self._logger.debug(f"Updating rows matching {criteria} with new data: {new_data}")
|
396
|
+
|
397
|
+
self._logger.debug("Finding rows matching criteria")
|
398
|
+
rows_to_update = self._filter_by_criteria(criteria)
|
399
|
+
self._logger.info(f"Found {len(rows_to_update)} rows to update")
|
400
|
+
|
401
|
+
self._logger.debug("Updating matched rows with new data")
|
402
|
+
for row_index in rows_to_update:
|
403
|
+
for column, value in new_data.items():
|
404
|
+
if column in self.table.colnames:
|
405
|
+
self.table[row_index][column] = value
|
406
|
+
else:
|
407
|
+
raise ValueError(
|
408
|
+
f"Column '{column}' does not exist in the table."
|
409
|
+
)
|
410
|
+
|
411
|
+
def delete_rows(self, criteria=None, row_numbers=None):
|
412
|
+
"""
|
413
|
+
Delete rows in the catalog based on criteria or specific row numbers.
|
414
|
+
|
415
|
+
Parameters
|
416
|
+
----------
|
417
|
+
criteria : dict, optional
|
418
|
+
A dictionary specifying conditions to select rows for deletion.
|
419
|
+
Keys are column names, and values are conditions
|
420
|
+
(e.g., {"flux": "<15"} or {"eflux": None}).
|
421
|
+
|
422
|
+
row_numbers : int or list of int, optional
|
423
|
+
Specific row index or list of row indices to delete.
|
424
|
+
|
425
|
+
Returns
|
426
|
+
-------
|
427
|
+
int
|
428
|
+
Number of rows deleted
|
429
|
+
|
430
|
+
Raises
|
431
|
+
------
|
432
|
+
ValueError
|
433
|
+
If neither `criteria` nor `row_numbers` is provided, or if criteria
|
434
|
+
contain invalid operators, or if arguments have invalid types.
|
435
|
+
"""
|
436
|
+
if criteria is not None and not isinstance(criteria, dict):
|
437
|
+
raise ValueError("`criteria` must be a dictionary")
|
438
|
+
|
439
|
+
if row_numbers is not None:
|
440
|
+
if isinstance(row_numbers, int):
|
441
|
+
row_numbers = [row_numbers]
|
442
|
+
elif not isinstance(row_numbers, list):
|
443
|
+
raise ValueError(
|
444
|
+
"`row_numbers` must be an integer or list of integers"
|
445
|
+
)
|
446
|
+
|
447
|
+
if not all(isinstance(x, int) for x in row_numbers):
|
448
|
+
raise ValueError("All elements in `row_numbers` must be integers")
|
449
|
+
|
450
|
+
if criteria is None and row_numbers is None:
|
451
|
+
raise ValueError("Either `criteria` or `row_numbers` must be provided")
|
452
|
+
|
453
|
+
rows_to_delete = []
|
454
|
+
if criteria:
|
455
|
+
self._logger.debug(f"Finding rows matching criteria: {criteria}")
|
456
|
+
rows_to_delete.extend(self._filter_by_criteria(criteria))
|
457
|
+
|
458
|
+
if row_numbers:
|
459
|
+
self._logger.debug(f"Adding specified row numbers: {row_numbers}")
|
460
|
+
rows_to_delete.extend(row_numbers)
|
461
|
+
|
462
|
+
rows_to_delete = list(set(rows_to_delete))
|
463
|
+
self._logger.debug(f"Removing {len(rows_to_delete)} rows")
|
464
|
+
self.table.remove_rows(rows_to_delete)
|
465
|
+
deleted_count = len(rows_to_delete)
|
466
|
+
self._logger.info(f"Deleted {deleted_count} rows")
|
467
|
+
|
468
|
+
return deleted_count
|
469
|
+
|
470
|
+
def find_missing_data_rows(self, columns, as_dataframe=True):
|
471
|
+
"""
|
472
|
+
Finds rows with missing (None or NaN) data in any of the specified columns.
|
473
|
+
|
474
|
+
Parameters
|
475
|
+
----------
|
476
|
+
columns : str or list of str
|
477
|
+
Column name or list of column names to check
|
478
|
+
for missing data (either None or NaN).
|
479
|
+
|
480
|
+
as_dataframe : bool, optional
|
481
|
+
If True, returns the result as a table-like structure;
|
482
|
+
if False, as a catalog table.
|
483
|
+
Default is True.
|
484
|
+
|
485
|
+
Returns
|
486
|
+
-------
|
487
|
+
table-like structure or catalog table
|
488
|
+
A table containing rows with missing data (None or NaN)
|
489
|
+
in any of the specified columns.
|
490
|
+
|
491
|
+
Raises
|
492
|
+
------
|
493
|
+
ValueError
|
494
|
+
If the catalog table is not initialized or
|
495
|
+
if `columns` is not a valid type.
|
496
|
+
|
497
|
+
Notes
|
498
|
+
-----
|
499
|
+
This method allows you to identify rows that contain
|
500
|
+
missing data (None or NaN) in specified columns for data quality
|
501
|
+
checks or cleaning purposes.
|
502
|
+
"""
|
503
|
+
if self.table is None:
|
504
|
+
raise ValueError("Catalog table is not initialized.")
|
505
|
+
|
506
|
+
# Convert columns to a list if it's provided as a single string
|
507
|
+
if isinstance(columns, str):
|
508
|
+
columns = [columns]
|
509
|
+
elif not isinstance(columns, list) or not all(
|
510
|
+
isinstance(col, str) for col in columns):
|
511
|
+
raise ValueError(
|
512
|
+
"`columns` must be a string or a list of strings "
|
513
|
+
"representing column names."
|
514
|
+
)
|
515
|
+
|
516
|
+
self._logger.debug(f"Converting catalog table to pandas DataFrame")
|
517
|
+
# Convert the catalog table to a DataFrame for easier manipulation
|
518
|
+
df = self.table.to_pandas()
|
519
|
+
|
520
|
+
self._logger.debug(f"Checking for missing data in columns: {columns}")
|
521
|
+
# Build the condition to check for either None or NaN in specified columns
|
522
|
+
conditions = []
|
523
|
+
for col in columns:
|
524
|
+
if pd.api.types.is_numeric_dtype(df[col]):
|
525
|
+
# Numeric columns check for NaN
|
526
|
+
conditions.append(df[col].isnull() | np.isnan(df[col]))
|
527
|
+
else:
|
528
|
+
# Non-numeric columns check only for None
|
529
|
+
conditions.append(df[col].isnull())
|
530
|
+
|
531
|
+
# Combine all conditions to create a final selection mask
|
532
|
+
combined_condition = conditions[0]
|
533
|
+
for cond in conditions[1:]:
|
534
|
+
combined_condition |= cond
|
535
|
+
|
536
|
+
# Extract rows where any of the conditions are True (indicating None/NaN)
|
537
|
+
result_df = df[combined_condition]
|
538
|
+
self._logger.info(f"Found {len(result_df)} rows with missing data")
|
539
|
+
|
540
|
+
# Convert to a catalog table or keep as table-like structure
|
541
|
+
if as_dataframe:
|
542
|
+
return result_df
|
543
|
+
|
544
|
+
return Table.from_pandas(result_df)
|
545
|
+
|
546
|
+
def delete_missing_data_rows(self, columns):
|
547
|
+
"""
|
548
|
+
Delete rows from the catalog that have missing (None or NaN) data
|
549
|
+
in any of the specified columns.
|
550
|
+
|
551
|
+
Warning
|
552
|
+
-------
|
553
|
+
This method permanently deletes rows from the catalog and cannot be undone.
|
554
|
+
Use with caution, as there is no way to recover deleted rows.
|
555
|
+
Proceed at your own risk.
|
556
|
+
|
557
|
+
Parameters
|
558
|
+
----------
|
559
|
+
columns : str or list of str
|
560
|
+
Column name or list of column names to check for missing data.
|
561
|
+
If a single string is provided, it will be converted to a list.
|
562
|
+
|
563
|
+
Returns
|
564
|
+
-------
|
565
|
+
int
|
566
|
+
The number of rows deleted from the catalog.
|
567
|
+
|
568
|
+
Raises
|
569
|
+
------
|
570
|
+
ValueError
|
571
|
+
If the catalog table is not initialized or if `columns` is not valid.
|
572
|
+
|
573
|
+
Examples
|
574
|
+
--------
|
575
|
+
>>> catalog = Catalog(table=some_table)
|
576
|
+
>>> deleted_count = catalog.delete_missing_data_rows(['flux', 'wavelength'])
|
577
|
+
>>> print(f"Deleted {deleted_count} rows with missing data.")
|
578
|
+
"""
|
579
|
+
# Type checking and conversion for columns parameter
|
580
|
+
if isinstance(columns, str):
|
581
|
+
columns = [columns]
|
582
|
+
elif not isinstance(columns, list) or not all(
|
583
|
+
isinstance(col, str) for col in columns):
|
584
|
+
raise ValueError(
|
585
|
+
"`columns` must be a string or list of strings "
|
586
|
+
"representing column names."
|
587
|
+
)
|
588
|
+
|
589
|
+
self._logger.debug(f"Finding rows with missing data in columns: {columns}")
|
590
|
+
# Find rows with missing data in specified columns
|
591
|
+
missing_data_df = self.find_missing_data_rows(columns, as_dataframe=True)
|
592
|
+
|
593
|
+
# Get the indices of rows with missing data
|
594
|
+
missing_indices = missing_data_df.index.tolist()
|
595
|
+
|
596
|
+
self._logger.debug(f"Removing {len(missing_indices)} rows with missing data")
|
597
|
+
# Delete the rows with missing data
|
598
|
+
self.table.remove_rows(missing_indices)
|
599
|
+
|
600
|
+
# Log and return the count of deleted rows
|
601
|
+
deleted_count = len(missing_indices)
|
602
|
+
self._logger.info(
|
603
|
+
f"Deleted {deleted_count} rows with missing data in columns: {columns}"
|
604
|
+
)
|
605
|
+
return deleted_count
|
606
|
+
|
607
|
+
def combine_fluxes(
|
608
|
+
self, method="mean", default_eflux_ratio=0.01, overwrite=False
|
609
|
+
):
|
610
|
+
"""Combine flux values from the same filter in the catalog table.
|
611
|
+
|
612
|
+
Parameters
|
613
|
+
----------
|
614
|
+
method : str, optional
|
615
|
+
The method to combine flux values. Options are:
|
616
|
+
- "mean": Calculate the weighted mean.
|
617
|
+
- "median": Calculate the median.
|
618
|
+
Default is "mean".
|
619
|
+
|
620
|
+
default_eflux_ratio : float
|
621
|
+
The default ratio of flux to eflux to use when eflux is missing or
|
622
|
+
non-positive. Default is 0.01.
|
623
|
+
|
624
|
+
overwrite : bool, optional
|
625
|
+
If True, the table in the class is overwritten with the combined data.
|
626
|
+
If False, the original table remains unchanged and the combined data
|
627
|
+
is returned as a new table. Default is False.
|
628
|
+
|
629
|
+
Returns
|
630
|
+
-------
|
631
|
+
int
|
632
|
+
If `overwrite=True`, returns the number of unique filters that were
|
633
|
+
combined.
|
634
|
+
astropy.table.Table
|
635
|
+
If `overwrite=False`, returns a new table containing the combined data.
|
636
|
+
|
637
|
+
Raises
|
638
|
+
------
|
639
|
+
ValueError
|
640
|
+
If the method is not "mean" or "median".
|
641
|
+
|
642
|
+
Notes
|
643
|
+
-----
|
644
|
+
- The method combines flux values for rows with the same `vizier_filter`
|
645
|
+
column value. The returned table will contain only the `wavelength`,
|
646
|
+
`frequency`, `flux`, `eflux`, `vizier_filter`, and `filter` columns.
|
647
|
+
- If `overwrite=True`, the original table in the class will be replaced
|
648
|
+
with the combined table. If `overwrite=False`, the original table
|
649
|
+
remains unchanged.
|
650
|
+
- In the absence of valid `eflux` values (e.g., missing or non-positive
|
651
|
+
values), uniform weights are applied during flux combination. This
|
652
|
+
approach assumes equal uncertainty for all flux values, which may lead
|
653
|
+
to less accurate results. It is recommended to handle missing or invalid
|
654
|
+
`eflux` values prior to calling this method for better accuracy.
|
655
|
+
- The `default_eflux_ratio` parameter allows you to specify a default
|
656
|
+
ratio of flux to eflux to use when eflux is missing or non-positive.
|
657
|
+
This can be useful if you want to assume a default uncertainty for the
|
658
|
+
flux values.
|
659
|
+
|
660
|
+
Examples
|
661
|
+
--------
|
662
|
+
>>> from sedlib import SED
|
663
|
+
|
664
|
+
>>> sed = SED("Vega")
|
665
|
+
>>> sed.combine_fluxes(method="mean", overwrite=True)
|
666
|
+
"""
|
667
|
+
if not isinstance(method, str):
|
668
|
+
raise ValueError("`method` must be a string.")
|
669
|
+
|
670
|
+
if not isinstance(default_eflux_ratio, float):
|
671
|
+
raise ValueError("`default_eflux_ratio` must be a float.")
|
672
|
+
|
673
|
+
if not isinstance(overwrite, bool):
|
674
|
+
raise ValueError("`overwrite` must be a boolean.")
|
675
|
+
|
676
|
+
if self.table is None:
|
677
|
+
raise ValueError("Catalog table is not initialized.")
|
678
|
+
|
679
|
+
if method not in {"mean", "median"}:
|
680
|
+
raise ValueError(
|
681
|
+
f"Invalid method: {method}. Choose 'mean' or 'median'."
|
682
|
+
)
|
683
|
+
|
684
|
+
self._logger.debug("Finding unique filters")
|
685
|
+
|
686
|
+
# Group rows by filter
|
687
|
+
unique_filters = set(self.table["vizier_filter"])
|
688
|
+
new_rows = []
|
689
|
+
|
690
|
+
self._logger.debug(f"Processing {len(unique_filters)} unique filters")
|
691
|
+
for filt in unique_filters:
|
692
|
+
# Extract rows for this filter
|
693
|
+
rows = self.table[self.table["vizier_filter"] == filt]
|
694
|
+
|
695
|
+
# Flux and eflux values
|
696
|
+
fluxes = np.array(rows["flux"])
|
697
|
+
efluxes = np.array(rows["eflux"])
|
698
|
+
|
699
|
+
# Replace NaN or zero eflux with default values
|
700
|
+
invalid_mask = (np.isnan(efluxes)) | (efluxes == 0)
|
701
|
+
if np.any(invalid_mask):
|
702
|
+
self._logger.warning(
|
703
|
+
f"Found invalid eflux values (NaN or zero) for filter "
|
704
|
+
f"'{filt}'. Replacing with default."
|
705
|
+
)
|
706
|
+
efluxes[invalid_mask] = fluxes[invalid_mask] * default_eflux_ratio
|
707
|
+
|
708
|
+
# Calculate weights (inverse of variance)
|
709
|
+
weights = 1 / efluxes**2
|
710
|
+
|
711
|
+
self._logger.debug(f"Combining fluxes for filter {filt} using {method}")
|
712
|
+
# Combine fluxes
|
713
|
+
if method == "mean":
|
714
|
+
combined_flux = np.average(fluxes, weights=weights)
|
715
|
+
else: # method == "median"
|
716
|
+
combined_flux = np.median(fluxes)
|
717
|
+
|
718
|
+
# Combine efluxes using RMS (root-mean-square)
|
719
|
+
combined_eflux = np.sqrt(np.sum(efluxes**2)) / len(efluxes)
|
720
|
+
|
721
|
+
# Add a new row
|
722
|
+
new_row = {
|
723
|
+
"wavelength": rows["wavelength"][0],
|
724
|
+
"frequency": rows["frequency"][0],
|
725
|
+
"flux": combined_flux,
|
726
|
+
"eflux": combined_eflux,
|
727
|
+
"vizier_filter": filt,
|
728
|
+
"filter": rows["filter"][0],
|
729
|
+
}
|
730
|
+
|
731
|
+
new_rows.append(new_row)
|
732
|
+
|
733
|
+
self._logger.debug("Creating new table with combined rows")
|
734
|
+
# Replace table with combined rows
|
735
|
+
combined_table = Table(rows=new_rows)
|
736
|
+
|
737
|
+
# Sort the table by "wavelength" column
|
738
|
+
combined_table.sort("wavelength")
|
739
|
+
|
740
|
+
# Set units for the new table columns
|
741
|
+
for col in ["wavelength", "frequency", "flux", "eflux"]:
|
742
|
+
if col in self.table.colnames:
|
743
|
+
combined_table[col].unit = self.table[col].unit
|
744
|
+
|
745
|
+
combined_table['wavelength'].info.format = '.3f'
|
746
|
+
combined_table['frequency'].info.format = '.3e'
|
747
|
+
combined_table['flux'].info.format = '.3e'
|
748
|
+
combined_table['eflux'].info.format = '.3e'
|
749
|
+
|
750
|
+
if overwrite:
|
751
|
+
self.table = combined_table
|
752
|
+
self.flux_to_magnitude()
|
753
|
+
|
754
|
+
return len(unique_filters)
|
755
|
+
|
756
|
+
return combined_table
|
757
|
+
|
758
|
+
def _filter_by_criteria(self, criteria):
|
759
|
+
"""
|
760
|
+
Filter rows based on given criteria.
|
761
|
+
|
762
|
+
Parameters
|
763
|
+
----------
|
764
|
+
criteria : dict
|
765
|
+
Dictionary where keys are column names and values are filter
|
766
|
+
conditions (e.g., {"flux": "<15"} or {"name": "== 'star'"}).
|
767
|
+
|
768
|
+
Returns
|
769
|
+
-------
|
770
|
+
list of int
|
771
|
+
List of row indices that match the criteria.
|
772
|
+
|
773
|
+
Raises
|
774
|
+
------
|
775
|
+
ValueError
|
776
|
+
If a condition in `criteria` contains an invalid operator or if there is
|
777
|
+
a type mismatch between the table data and the condition.
|
778
|
+
"""
|
779
|
+
valid_operators = {
|
780
|
+
"<", ">", "<=", ">=", "==", "!=", "is None", "is not None"
|
781
|
+
}
|
782
|
+
matched_rows = []
|
783
|
+
|
784
|
+
for row_index, row in enumerate(self.table):
|
785
|
+
if self._row_matches_criteria(row, criteria, valid_operators):
|
786
|
+
matched_rows.append(row_index)
|
787
|
+
|
788
|
+
return matched_rows
|
789
|
+
|
790
|
+
def _row_matches_criteria(self, row, criteria, valid_operators):
|
791
|
+
"""
|
792
|
+
Helper function to determine if a row matches all criteria.
|
793
|
+
"""
|
794
|
+
|
795
|
+
for column, condition in criteria.items():
|
796
|
+
if not self._matches_condition(row[column], condition, valid_operators):
|
797
|
+
return False # If any condition fails, the row does not match
|
798
|
+
|
799
|
+
return True # All conditions matched
|
800
|
+
|
801
|
+
def _matches_condition(self, value, condition, valid_operators):
|
802
|
+
"""
|
803
|
+
Helper function to check if a value meets a specified condition.
|
804
|
+
"""
|
805
|
+
|
806
|
+
# Check for None conditions
|
807
|
+
if condition in {None, "is None"}:
|
808
|
+
return value is None
|
809
|
+
elif condition == "is not None":
|
810
|
+
return value is not None
|
811
|
+
|
812
|
+
# Extract the operator and the comparison value
|
813
|
+
operator_found = re.search(r'([<>!]=?|==)', condition)
|
814
|
+
if not operator_found or operator_found.group(0) not in valid_operators:
|
815
|
+
raise ValueError(
|
816
|
+
f"Invalid operator in condition: '{condition}'. "
|
817
|
+
f"Allowed operators are: {', '.join(valid_operators)}"
|
818
|
+
)
|
819
|
+
|
820
|
+
operator = operator_found.group(0)
|
821
|
+
value_str = condition.replace(operator, "", 1).strip()
|
822
|
+
|
823
|
+
# Attempt to convert the condition value to match the type of row's value
|
824
|
+
try:
|
825
|
+
condition_value = self._convert_condition_value(value, value_str)
|
826
|
+
return eval(f"value {operator} condition_value")
|
827
|
+
except ValueError:
|
828
|
+
raise ValueError(
|
829
|
+
f"Failed to convert '{value_str}' to the same type as column value "
|
830
|
+
f"'{value}'."
|
831
|
+
)
|
832
|
+
except TypeError as e:
|
833
|
+
raise TypeError(str(e))
|
834
|
+
|
835
|
+
def _convert_condition_value(self, value, value_str):
|
836
|
+
"""
|
837
|
+
Convert the condition value to the same type as the row's value.
|
838
|
+
"""
|
839
|
+
|
840
|
+
if isinstance(value, (float, np.floating)):
|
841
|
+
return float(value_str)
|
842
|
+
|
843
|
+
if isinstance(value, (int, np.integer)):
|
844
|
+
return int(value_str)
|
845
|
+
|
846
|
+
if isinstance(value, (str, np.str_)):
|
847
|
+
return value_str.strip("'\"")
|
848
|
+
|
849
|
+
raise TypeError(f"Unsupported column data type: {type(value)}")
|
850
|
+
|
851
|
+
def get_column_stats(self, column_name):
    """
    Calculate basic statistics for a specified numerical column.

    Parameters
    ----------
    column_name : str
        The name of the column to calculate statistics for.

    Returns
    -------
    dict
        A dictionary with mean, median, and standard deviation of the column.

    Raises
    ------
    ValueError
        If the specified column does not exist, is empty, or is non-numeric.
    """
    if column_name not in self.table.colnames:
        raise ValueError(f"Column '{column_name}' does not exist in the table.")

    col_data = self.table[column_name]

    # Guard against empty columns: the numeric-type probe below inspects
    # the first element and would otherwise fail with an opaque IndexError
    # instead of the documented ValueError.
    if len(col_data) == 0:
        raise ValueError(f"Column '{column_name}' is empty.")

    # Type is probed from the first element because the column may be of
    # object dtype; accept both Python and numpy numeric scalars.
    if not isinstance(col_data[0], (int, float, np.integer, np.floating)):
        raise ValueError(f"Column '{column_name}' is non-numeric.")

    return {
        'mean': np.mean(col_data),
        'median': np.median(col_data),
        'std_dev': np.std(col_data),  # population std (numpy default ddof=0)
    }
|
882
|
+
|
883
|
+
def flux_to_magnitude(self):
    """
    Convert all fluxes to magnitudes using the filter's zero point.

    This method converts flux values in the catalog table to magnitudes using
    each filter's zero point. It creates two new columns in the table:
    'mag' for magnitudes and 'mag_err' for magnitude errors.

    The conversion is done using the filter's flux_to_mag() method, which
    assumes a Pogson magnitude system.

    Notes
    -----
    - For rows where the filter is None, magnitude values will be set to None
    - The flux values must be in units compatible with the filter's zero point
    - The magnitude system (AB, Vega, etc.) depends on the filter's zero point

    Examples
    --------
    >>> from sedlib import SED
    >>> from astropy import units as u
    >>>
    >>> sed = SED(name='Vega')
    >>> sed.teff = 10070 * u.K
    >>> sed.radius = 2.766 * u.Rsun
    >>> sed.distance = 7.68 * u.pc
    >>>
    >>> sed.catalog.flux_to_magnitude()
    """
    self._logger.info("Converting all fluxes to magnitudes")

    if self.table is None:
        self._logger.error("Catalog table is not initialized")
        raise ValueError('Catalog table is not initialized')

    mags = []
    mag_errs = []

    # Column values iterate as bare numbers; re-attach the units once here.
    flux_unit = self.table['flux'].unit
    eflux_unit = self.table['eflux'].unit

    self._logger.debug(f"Processing {len(self.table)} rows")

    # Initialize magnitude columns if they don't exist
    if 'mag' not in self.table.colnames:
        self.table['mag'] = [None] * len(self.table)
    if 'mag_err' not in self.table.colnames:
        self.table['mag_err'] = [None] * len(self.table)

    success_count = 0

    for row in self.table:
        f = row['filter']
        flux = row['flux'] * flux_unit
        eflux = row['eflux'] * eflux_unit

        if f is None:
            self._logger.warning(
                f"Skipping row with filter {row['vizier_filter']}: "
                "No filter object found"
            )
            # Keep list lengths aligned with the table by appending None.
            mags.append(None)
            mag_errs.append(None)
            continue

        success_count += 1

        # Convert flux to magnitude using filter's zero point
        mags.append(f.flux_to_mag(flux))

        # Calculate magnitude error using error propagation formula:
        # dm = (2.5 / ln 10) * (dF / F)
        mag_err = (2.5 / (flux * np.log(10))) * eflux
        mag_errs.append(mag_err)

    self.table['mag'] = mags
    self.table['mag_err'] = mag_errs

    try:
        self.table['mag'].info.format = '.3f'
        self.table['mag_err'].info.format = '.3f'
    # Was a bare `except:`, which also swallowed KeyboardInterrupt and
    # SystemExit; `Exception` keeps the original best-effort fallback
    # without trapping interpreter-exit signals.
    except Exception:
        # if there are None/nan values the columns are object dtype;
        # convert them to float (None becomes nan)
        self.table['mag'] = np.array(self.table['mag'], dtype=float)
        self.table['mag_err'] = np.array(self.table['mag_err'], dtype=float)

        # then format the columns
        self.table['mag'].info.format = '.3f'
        self.table['mag_err'].info.format = '.3f'

    self._logger.info(
        f"Successfully converted {success_count} fluxes to magnitudes"
    )
|
975
|
+
|
976
|
+
def filter_outliers(
    self,
    sigma_threshold=3.0,
    over_write=False,
    verbose=False
):
    """Filter out outlier data points from the SED catalog using iterative
    sigma clipping in logarithmic space.

    This method computes the residuals between the observed fluxes in the
    catalog and the predicted fluxes from the blackbody model computed with
    the current effective temperature (teff), radius, and distance of the
    object. The residual is defined as:

        r = log10(F_obs) - log10(F_model)

    where:
        - F_obs is the observed flux.
        - F_model is the flux predicted by the blackbody model:
              F_model = (π * BlackBody(temperature=teff, scale=scale)
                         (wavelength)) / dR,
          with dR = (distance / radius)^2.

    An iterative sigma clipping is performed on the residuals, flagging any
    data point for which

        |r - median(r)| > sigma_threshold * σ

    Data points that do not meet this criterion are considered outliers. The
    process is repeated until no new points are flagged. This allows us to
    robustly identify points deviating from the continuum—even if some
    extreme values initially skew the statistics.

    Parameters
    ----------
    sigma_threshold : float, optional
        The sigma threshold for clipping (default is 3.0). Data points with
        residuals deviating more than sigma_threshold times the standard
        deviation from the median are flagged as outliers.
    over_write : bool, optional
        If True, the outlier points are permanently removed from the SED
        object's catalog. If False, the method returns an Astropy Table of
        outlier points without modifying the catalog.
    verbose : bool, optional
        If True, prints a summary of the clipping result to stdout
        (default is False).

    Returns
    -------
    outliers : astropy.table.Table or None
        If over_write is False, returns an Astropy Table containing the
        outlier data points. If over_write is True, updates the catalog in
        place and returns None.

    Raises
    ------
    ValueError
        If required parameters (teff, radius, distance) are not set or if the
        catalog is missing the required 'wavelength' and 'flux' columns.

    Examples
    --------
    >>> from sedlib import SED
    >>> from astropy import units as u
    >>> sed = SED(name='Vega')
    >>> sed.teff = 9600 * u.K
    >>> sed.radius = 2.818 * u.Rsun
    >>> sed.distance = 7.68 * u.pc
    >>> # Flag outliers using a 3-sigma threshold without modifying catalog:
    >>> outlier_table = sed.filter_outliers(
    ...     sigma_threshold=3.0, over_write=False
    ... )
    """
    # Verify that required parameters are set.
    if self.teff is None or self.radius is None or self.distance is None:
        self._logger.error(
            "Effective temperature, radius, and distance must be set before "
            "filtering outliers."
        )
        raise ValueError(
            "Effective temperature, radius, and distance must be set before "
            "filtering outliers."
        )

    # verify that the catalog is initialized
    if self.table is None:
        self._logger.error("Catalog data is required for filtering outliers.")
        raise ValueError("Catalog data is required for filtering outliers.")

    # Remember the threshold used for this run (presumably consumed
    # elsewhere in the class, e.g. for reporting — not re-read here).
    self._rejected_sigma_threshold = sigma_threshold

    # geometric dilution factor: dimensionless (distance / radius)^2,
    # both converted to cm so the ratio's units cancel
    dR = (self.distance.to(u.cm) / self.radius.to(u.cm)) ** 2

    # scale factor
    scale = 1.0 * u.erg / (u.cm**2 * u.AA * u.s * u.sr)

    # blackbody model
    bb_model = BlackBody(temperature=self.teff, scale=scale)

    if "wavelength" not in self.table.colnames or "flux" not in self.table.colnames:
        self._logger.error("Catalog must contain 'wavelength' and 'flux' columns.")
        raise ValueError(
            "Catalog must contain 'wavelength' and 'flux' columns."
        )

    wavelengths = self.table["wavelength"]
    observed_flux = self.table["flux"]

    # Compute the predicted flux for each wavelength.
    # π converts specific intensity (per steradian) to surface flux;
    # dividing by dR dilutes it to the observed flux at Earth.
    predicted_flux = bb_model(wavelengths) * np.pi / dR

    # Calculate residuals in logarithmic space.
    residuals = (
        np.log10(observed_flux.value) - np.log10(predicted_flux.value)
    )

    # Perform iterative sigma clipping.
    # True indicates an "inlier" (continuum point).
    mask = np.ones(len(residuals), dtype=bool)
    iteration = 0
    while True:
        iteration += 1
        current_residuals = residuals[mask]
        if len(current_residuals) == 0:
            self._logger.warning("No data points remain after clipping.")
            break
        median_val = np.median(current_residuals)
        # Sample standard deviation (ddof=1) of the surviving residuals.
        sigma_val = np.std(current_residuals, ddof=1)

        # Create a new mask: keep points within sigma_threshold * sigma of
        # the median.  Note the test is applied to ALL residuals, so a
        # previously clipped point can be re-admitted on a later pass.
        new_mask = (
            np.abs(residuals - median_val) <= sigma_threshold * sigma_val
        )

        # Check for convergence.
        if np.array_equal(mask, new_mask):
            break
        mask = new_mask

    # Identify outliers (those not in the final mask).
    outlier_indices = np.where(~mask)[0]
    num_outliers = len(outlier_indices)
    self._logger.info(
        f"Total outliers detected: {num_outliers} out of {len(residuals)} "
        "data points."
    )

    # save the rejected data (a copy, so later catalog edits don't alias it)
    self.rejected_data = self.table[~mask].copy()

    if verbose:
        print(
            f"Total outliers detected: {num_outliers} out of "
            f"{len(residuals)} data points after {iteration} iterations."
        )

    if over_write:
        # Permanently update the catalog to keep only the inliers.
        self.table = self.table[mask]
        self._logger.info("Outlier points have been removed from the catalog.")
        return None

    self._logger.info("Returning a table of outlier points for inspection.")
    return self.rejected_data
|