sedlib-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sedlib/catalog.py ADDED
@@ -0,0 +1,1141 @@
+ #!/usr/bin/env python
+
+ """
+ Catalog class for organizing photometric data
+ """
+
+ __all__ = ['Catalog']
+
+ import re
+ import logging
+ import sqlite3
+
+ import numpy as np
+ import pandas as pd
+
+ from astropy import units as u
+ from astropy.table import Table
+ from astropy.modeling.physical_models import BlackBody
+
+
+ # # Set up logging
+ # logger = logging.getLogger(__name__)
+ # logger.setLevel(logging.DEBUG)
+
+ # # Create a file handler
+ # fh = logging.FileHandler('sed.log')
+ # fh.setLevel(logging.DEBUG)
+
+ # # Create a formatter
+ # formatter = logging.Formatter(
+ #     '%(asctime)s - %(levelname)s - %(message)s',
+ #     datefmt='%Y-%m-%dT%H:%M:%S'
+ # )
+ # fh.setFormatter(formatter)
+
+ # # Add the handler to the logger
+ # logger.addHandler(fh)
+
+
+ class Catalog:
+     """
+     A class to manage and organize photometric catalog data.
+
+     This class provides a wrapper around Astropy Table objects for managing
+     photometric data, including methods for data cleaning, filtering, and
+     statistical analysis.
+
+     Parameters
+     ----------
+     name : str, optional
+         Name identifier for the catalog.
+     table : astropy.table.Table, optional
+         Existing Astropy Table to use as the catalog data.
+     logger : logging.Logger, optional
+         Logger instance for logging operations.
+     **kwargs : dict, optional
+         Additional keyword arguments to set as attributes.
+
+     Attributes
+     ----------
+     name : str
+         Catalog name identifier.
+     table : astropy.table.Table
+         The underlying Astropy Table containing photometric data.
+     teff : astropy.units.Quantity, optional
+         Effective temperature of the target object.
+     teff_error : astropy.units.Quantity, optional
+         Error in effective temperature.
+     radius : astropy.units.Quantity, optional
+         Stellar radius of the target object.
+     radius_error : astropy.units.Quantity, optional
+         Error in stellar radius.
+     distance : astropy.units.Quantity, optional
+         Distance to the target object.
+     distance_error : astropy.units.Quantity, optional
+         Error in distance.
+     rejected_data : astropy.table.Table, optional
+         Data points that were rejected during filtering operations.
+     is_rejected : bool
+         Flag indicating if data has been rejected.
+
+     Methods
+     -------
+     from_table(table)
+         Initialize catalog from existing Astropy Table.
+     add_rows(new_rows)
+         Add new rows to the catalog.
+     select_rows(criteria, as_dataframe=False)
+         Select rows based on criteria.
+     update_rows(criteria, new_data)
+         Update rows matching criteria.
+     delete_rows(criteria=None, row_numbers=None)
+         Delete rows based on criteria or row numbers.
+     find_missing_data_rows(columns, as_dataframe=True)
+         Find rows with missing data.
+     delete_missing_data_rows(columns)
+         Delete rows with missing data.
+     combine_fluxes(method='mean', overwrite=False)
+         Combine duplicate filter measurements.
+     filter_outliers(sigma_threshold=3.0)
+         Apply sigma clipping to filter outliers.
+     flux_to_magnitude()
+         Convert flux values to magnitudes.
+     get_column_stats(column_name)
+         Calculate statistics for a column.
+     sql_query(query)
+         Execute SQL query on the catalog.
+
+     Examples
+     --------
+     >>> from sedlib import Catalog
+     >>> from astropy.table import Table
+     >>> from astropy import units as u
+     >>>
+     >>> # Create catalog from data (the columns match the set this
+     >>> # class keeps internally)
+     >>> data = {
+     ...     'RA': [180.0, 180.1],
+     ...     'DEC': [30.0, 30.1],
+     ...     'vizier_filter': ['Johnson:V', 'Johnson:B'],
+     ...     'filter': ['V', 'B'],
+     ...     'frequency': [5.45e14, 6.81e14],
+     ...     'wavelength': [5500.0, 4400.0],
+     ...     'flux': [1e-12, 8e-13],
+     ...     'eflux': [1e-13, 8e-14]
+     ... }
+     >>> table = Table(data)
+     >>> catalog = Catalog('test_catalog', table)
+     >>>
+     >>> # Filter outliers (requires a stellar model to compare against)
+     >>> catalog.teff = 9600 * u.K
+     >>> catalog.radius = 2.818 * u.Rsun
+     >>> catalog.distance = 7.68 * u.pc
+     >>> outliers = catalog.filter_outliers(sigma_threshold=2.0)
+     >>>
+     >>> # Get statistics
+     >>> stats = catalog.get_column_stats('flux')
+     >>> print(f"Mean flux: {stats['mean']:.2e}")
+     """
+
+     def __init__(self, name=None, table=None, logger=None, **kwargs):
+         self.name = name
+         self.table = table if table is not None else Table()
+
+         # fall back to a module-level logger so `logger` is truly optional
+         self._logger = logger if logger is not None else logging.getLogger(__name__)
+         self._logger.info(f"Initialized Catalog with name: {name}")
+
+         self.teff = None
+         self.teff_error = None
+         self.radius = None
+         self.radius_error = None
+         self.distance = None
+         self.distance_error = None
+
+         for key, val in kwargs.items():
+             setattr(self, key, val)
+
+         if len(self.table) > 0:
+             self.table['RA'].info.format = '.3f'
+             self.table['DEC'].info.format = '.3f'
+             self.table['wavelength'].info.format = '.3f'
+
+             columns = [
+                 'RA', 'DEC', 'vizier_filter', 'filter',
+                 'frequency', 'wavelength', 'flux', 'eflux'
+             ]
+             self.table = table[columns]
+
+         self.rejected_data = None
+         self.is_rejected = False
+         self._rejected_sigma_threshold = 3.0
+
+     def __str__(self):
+         return str(self.table)
+
+     def __repr__(self):
+         return str(self.table)
+
+     def __setitem__(self, key, value):
+         if self.table is None:
+             raise ValueError("Table is not initialized.")
+
+         try:
+             self.table[key] = value
+         except (KeyError, ValueError):
+             raise ValueError(f"Column '{key}' does not exist in the table.")
+
+     def __getitem__(self, item):
+         if self.table is None:
+             raise ValueError("Table is not initialized.")
+
+         try:
+             return self.table[item]
+         except (KeyError, ValueError):
+             raise ValueError(f"Column '{item}' does not exist in the table.")
+
+     # def __getattr__(self, name):
+     #     try:
+     #         return getattr(self.table, name)
+     #     except AttributeError:
+     #         raise AttributeError(
+     #             f"'{type(self).__name__}' object has no attribute '{name}'"
+     #         )
+
+     def __len__(self):
+         if self.table is None:
+             raise ValueError("Table is not initialized.")
+         return len(self.table)
+
+     def __call__(self, max_rows=20):
+         return self.table.show_in_notebook(display_length=max_rows)
+
+     def from_table(self, table):
+         """
+         Initialize Catalog with an existing astropy Table.
+
+         Parameters
+         ----------
+         table : astropy.table.Table
+             The astropy Table to use as the catalog.
+
+         Raises
+         ------
+         TypeError
+             If table is not an astropy.table.Table object.
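+
+         Examples
+         --------
+         A minimal sketch; `phot_table` stands in for any astropy Table
+         holding photometric columns:
+
+         >>> phot_table = Table({'flux': [1e-12], 'eflux': [1e-13]})
+         >>> catalog = Catalog('demo')
+         >>> catalog.from_table(phot_table)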
+         """
+         if not isinstance(table, Table):
+             raise TypeError("Argument must be an astropy.table.Table object")
+
+         self.table = table
+         self._logger.info(
+             f"Loaded table with {len(table)} rows and {len(table.columns)} columns"
+         )
+
+     def sql_query(self, query):
+         """
+         Execute a SQL query on the catalog to retrieve data
+         based on specific conditions.
+
+         Parameters
+         ----------
+         query : str
+             A SQL query string to execute.
+
+         Returns
+         -------
+         result : pandas.DataFrame
+             A DataFrame containing the query result.
+
+         Raises
+         ------
+         ValueError
+             If the catalog table is not initialized or if the query fails.
+
+         Examples
+         --------
+         >>> catalog.sql_query("SELECT * FROM catalog WHERE flux > 1.5")
+         >>> catalog.sql_query(
+         ...     "SELECT eflux FROM catalog WHERE vizier_filter LIKE '%Johnson%'"
+         ... )
+         """
+         if self.table is None:
+             raise ValueError("Catalog table is not initialized.")
+
+         self._logger.debug(f"Executing SQL query: {query}")
+
+         # Convert the astropy table to a pandas DataFrame
+         self._logger.debug("Converting astropy table to pandas DataFrame")
+         df = self.table.to_pandas()
+
+         # Convert unsupported column types to strings for SQLite compatibility
+         self._logger.debug("Converting unsupported column types to strings")
+         for col in df.columns:
+             if (not pd.api.types.is_numeric_dtype(df[col]) and
+                     not pd.api.types.is_string_dtype(df[col])):
+                 df[col] = df[col].astype(str)
+
+         # Load the DataFrame into an in-memory SQLite database
+         self._logger.debug("Loading DataFrame into SQLite database")
+         connection = sqlite3.connect(":memory:")
+         try:
+             df.to_sql(
+                 "catalog",
+                 connection,
+                 index=False,
+                 if_exists="replace"
+             )
+             # Execute the query and fetch the results into a DataFrame
+             self._logger.debug("Executing SQL query and fetching results")
+             result_df = pd.read_sql_query(query, connection)
+             self._logger.info(f"SQL query returned {len(result_df)} rows")
+             return result_df
+         except Exception as e:
+             self._logger.error(f"SQL query failed: {str(e)}")
+             raise ValueError(f"Failed to execute query: {e}")
+         finally:
+             connection.close()  # Ensure the SQLite connection is closed
+
+     def add_rows(self, new_rows):
+         """
+         Add new rows to the catalog.
+
+         Parameters
+         ----------
+         new_rows : list of dict
+             List of dictionaries, where each dictionary represents a new row
+             with keys as column names and values as row data.
+
+         Raises
+         ------
+         ValueError
+             If the catalog table has not been initialized or columns do not match.
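+
+         Examples
+         --------
+         A minimal sketch; the keys are assumed to match the catalog's
+         existing columns:
+
+         >>> catalog.add_rows([
+         ...     {'RA': 180.2, 'DEC': 30.2, 'vizier_filter': 'Johnson:R',
+         ...      'filter': 'R', 'frequency': 4.28e14, 'wavelength': 7000.0,
+         ...      'flux': 9e-13, 'eflux': 9e-14}
+         ... ])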
+         """
+         if self.table is None:
+             raise ValueError("Catalog table is not initialized.")
+
+         self._logger.info(f"Adding {len(new_rows)} new rows to catalog")
+
+         try:
+             self._logger.debug("Validating column names in new rows")
+             for row_data in new_rows:
+                 if not all(column in self.table.colnames
+                            for column in row_data.keys()):
+                     raise ValueError(
+                         "One or more columns in the new row data do not exist "
+                         "in the table."
+                     )
+                 self.table.add_row(row_data)
+             self._logger.debug("Successfully added new rows")
+         except Exception as e:
+             self._logger.error(f"Failed to add rows: {str(e)}")
+             raise
+
+     def select_rows(self, criteria, as_dataframe=False):
+         """
+         Select rows from the catalog that meet the specified criteria.
+
+         Parameters
+         ----------
+         criteria : dict
+             A dictionary specifying conditions to select rows for extraction.
+             Keys are column names, and values are conditions (e.g., {"flux": "<15"}).
+         as_dataframe : bool, optional
+             If True, returns the result as a Pandas DataFrame; if False, as an
+             astropy.table.Table. Default is False.
+
+         Returns
+         -------
+         result : astropy.table.Table or pandas.DataFrame
+             The rows that match the criteria, either as a Table or DataFrame.
+
+         Raises
+         ------
+         ValueError
+             If the catalog table is not initialized.
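+
+         Examples
+         --------
+         A minimal sketch of the criteria syntax this method accepts:
+
+         >>> bright = catalog.select_rows({'flux': '>1e-13'})
+         >>> v_band = catalog.select_rows(
+         ...     {'vizier_filter': "== 'Johnson:V'"}, as_dataframe=True
+         ... )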
+         """
+         if self.table is None:
+             raise ValueError("Catalog table is not initialized.")
+
+         self._logger.debug(f"Selecting rows with criteria: {criteria}")
+
+         # Get row indices that match the criteria
+         self._logger.debug("Filtering rows based on criteria")
+         row_indices = self._filter_by_criteria(criteria)
+
+         # Extract the rows from the table
+         self._logger.debug("Extracting matching rows")
+         extracted_table = self.table[row_indices]
+
+         # Convert to DataFrame if requested
+         if as_dataframe:
+             self._logger.debug("Converting result to pandas DataFrame")
+             return extracted_table.to_pandas()
+
+         self._logger.info(f"Selected {len(extracted_table)} rows matching criteria")
+         return extracted_table
+
+     def update_rows(self, criteria, new_data):
+         """
+         Update rows in the catalog that match the criteria with new data.
+
+         Parameters
+         ----------
+         criteria : dict
+             A dictionary specifying conditions to select rows for updating.
+             Keys are column names, and values are conditions
+             (e.g., {"flux": "<15"} or {"eflux": None}).
+         new_data : dict
+             Dictionary where keys are column names and values are the new data
+             to be set in the selected rows.
+
+         Raises
+         ------
+         ValueError
+             If the criteria contain invalid operators or
+             if `new_data` contains invalid columns.
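+
+         Examples
+         --------
+         A minimal sketch; the values are illustrative only:
+
+         >>> # clip unphysical negative fluxes to zero
+         >>> catalog.update_rows({'flux': '<0'}, {'flux': 0.0})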
+         """
+         self._logger.debug(f"Updating rows matching {criteria} with new data: {new_data}")
+
+         self._logger.debug("Finding rows matching criteria")
+         rows_to_update = self._filter_by_criteria(criteria)
+         self._logger.info(f"Found {len(rows_to_update)} rows to update")
+
+         self._logger.debug("Updating matched rows with new data")
+         for row_index in rows_to_update:
+             for column, value in new_data.items():
+                 if column in self.table.colnames:
+                     self.table[row_index][column] = value
+                 else:
+                     raise ValueError(
+                         f"Column '{column}' does not exist in the table."
+                     )
+
+     def delete_rows(self, criteria=None, row_numbers=None):
+         """
+         Delete rows in the catalog based on criteria or specific row numbers.
+
+         Parameters
+         ----------
+         criteria : dict, optional
+             A dictionary specifying conditions to select rows for deletion.
+             Keys are column names, and values are conditions
+             (e.g., {"flux": "<15"} or {"eflux": None}).
+         row_numbers : int or list of int, optional
+             Specific row index or list of row indices to delete.
+
+         Returns
+         -------
+         int
+             Number of rows deleted.
+
+         Raises
+         ------
+         ValueError
+             If neither `criteria` nor `row_numbers` is provided, or if criteria
+             contain invalid operators, or if arguments have invalid types.
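+
+         Examples
+         --------
+         A minimal sketch of both deletion modes:
+
+         >>> catalog.delete_rows(criteria={'flux': '<1e-14'})
+         >>> catalog.delete_rows(row_numbers=[0, 2])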
+         """
+         if criteria is not None and not isinstance(criteria, dict):
+             raise ValueError("`criteria` must be a dictionary")
+
+         if row_numbers is not None:
+             if isinstance(row_numbers, int):
+                 row_numbers = [row_numbers]
+             elif not isinstance(row_numbers, list):
+                 raise ValueError(
+                     "`row_numbers` must be an integer or list of integers"
+                 )
+
+             if not all(isinstance(x, int) for x in row_numbers):
+                 raise ValueError("All elements in `row_numbers` must be integers")
+
+         if criteria is None and row_numbers is None:
+             raise ValueError("Either `criteria` or `row_numbers` must be provided")
+
+         rows_to_delete = []
+         if criteria:
+             self._logger.debug(f"Finding rows matching criteria: {criteria}")
+             rows_to_delete.extend(self._filter_by_criteria(criteria))
+
+         if row_numbers:
+             self._logger.debug(f"Adding specified row numbers: {row_numbers}")
+             rows_to_delete.extend(row_numbers)
+
+         rows_to_delete = list(set(rows_to_delete))
+         self._logger.debug(f"Removing {len(rows_to_delete)} rows")
+         self.table.remove_rows(rows_to_delete)
+         deleted_count = len(rows_to_delete)
+         self._logger.info(f"Deleted {deleted_count} rows")
+
+         return deleted_count
+
+     def find_missing_data_rows(self, columns, as_dataframe=True):
+         """
+         Find rows with missing (None or NaN) data in any of the specified columns.
+
+         Parameters
+         ----------
+         columns : str or list of str
+             Column name or list of column names to check
+             for missing data (either None or NaN).
+         as_dataframe : bool, optional
+             If True, returns the result as a pandas DataFrame;
+             if False, as an astropy.table.Table.
+             Default is True.
+
+         Returns
+         -------
+         pandas.DataFrame or astropy.table.Table
+             A table containing rows with missing data (None or NaN)
+             in any of the specified columns.
+
+         Raises
+         ------
+         ValueError
+             If the catalog table is not initialized or
+             if `columns` is not a valid type.
+
+         Notes
+         -----
+         This method allows you to identify rows that contain
+         missing data (None or NaN) in specified columns for data quality
+         checks or cleaning purposes.
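+
+         Examples
+         --------
+         A minimal sketch:
+
+         >>> incomplete = catalog.find_missing_data_rows(['flux', 'eflux'])
+         >>> print(f"{len(incomplete)} rows are missing flux data")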
+         """
+         if self.table is None:
+             raise ValueError("Catalog table is not initialized.")
+
+         # Convert columns to a list if it's provided as a single string
+         if isinstance(columns, str):
+             columns = [columns]
+         elif not isinstance(columns, list) or not all(
+                 isinstance(col, str) for col in columns):
+             raise ValueError(
+                 "`columns` must be a string or a list of strings "
+                 "representing column names."
+             )
+
+         self._logger.debug("Converting catalog table to pandas DataFrame")
+         # Convert the catalog table to a DataFrame for easier manipulation
+         df = self.table.to_pandas()
+
+         self._logger.debug(f"Checking for missing data in columns: {columns}")
+         # Build the condition to check for either None or NaN in specified columns
+         conditions = []
+         for col in columns:
+             if pd.api.types.is_numeric_dtype(df[col]):
+                 # Numeric columns check for NaN
+                 conditions.append(df[col].isnull() | np.isnan(df[col]))
+             else:
+                 # Non-numeric columns check only for None
+                 conditions.append(df[col].isnull())
+
+         # Combine all conditions to create a final selection mask
+         combined_condition = conditions[0]
+         for cond in conditions[1:]:
+             combined_condition |= cond
+
+         # Extract rows where any of the conditions are True (indicating None/NaN)
+         result_df = df[combined_condition]
+         self._logger.info(f"Found {len(result_df)} rows with missing data")
+
+         # Convert to an astropy Table or keep as a pandas DataFrame
+         if as_dataframe:
+             return result_df
+
+         return Table.from_pandas(result_df)
+
+     def delete_missing_data_rows(self, columns):
+         """
+         Delete rows from the catalog that have missing (None or NaN) data
+         in any of the specified columns.
+
+         Warning
+         -------
+         This method permanently deletes rows from the catalog and cannot be undone.
+         Use with caution, as there is no way to recover deleted rows.
+         Proceed at your own risk.
+
+         Parameters
+         ----------
+         columns : str or list of str
+             Column name or list of column names to check for missing data.
+             If a single string is provided, it will be converted to a list.
+
+         Returns
+         -------
+         int
+             The number of rows deleted from the catalog.
+
+         Raises
+         ------
+         ValueError
+             If the catalog table is not initialized or if `columns` is not valid.
+
+         Examples
+         --------
+         >>> catalog = Catalog(table=some_table)
+         >>> deleted_count = catalog.delete_missing_data_rows(['flux', 'wavelength'])
+         >>> print(f"Deleted {deleted_count} rows with missing data.")
+         """
+         # Type checking and conversion for columns parameter
+         if isinstance(columns, str):
+             columns = [columns]
+         elif not isinstance(columns, list) or not all(
+                 isinstance(col, str) for col in columns):
+             raise ValueError(
+                 "`columns` must be a string or list of strings "
+                 "representing column names."
+             )
+
+         self._logger.debug(f"Finding rows with missing data in columns: {columns}")
+         # Find rows with missing data in specified columns
+         missing_data_df = self.find_missing_data_rows(columns, as_dataframe=True)
+
+         # Get the indices of rows with missing data
+         missing_indices = missing_data_df.index.tolist()
+
+         self._logger.debug(f"Removing {len(missing_indices)} rows with missing data")
+         # Delete the rows with missing data
+         self.table.remove_rows(missing_indices)
+
+         # Log and return the count of deleted rows
+         deleted_count = len(missing_indices)
+         self._logger.info(
+             f"Deleted {deleted_count} rows with missing data in columns: {columns}"
+         )
+         return deleted_count
+
+     def combine_fluxes(
+         self, method="mean", default_eflux_ratio=0.01, overwrite=False
+     ):
+         """Combine flux values from the same filter in the catalog table.
+
+         Parameters
+         ----------
+         method : str, optional
+             The method to combine flux values. Options are:
+             - "mean": Calculate the weighted mean.
+             - "median": Calculate the median.
+             Default is "mean".
+         default_eflux_ratio : float, optional
+             The default ratio of flux to eflux to use when eflux is missing or
+             non-positive. Default is 0.01.
+         overwrite : bool, optional
+             If True, the table in the class is overwritten with the combined data.
+             If False, the original table remains unchanged and the combined data
+             is returned as a new table. Default is False.
+
+         Returns
+         -------
+         int
+             If `overwrite=True`, returns the number of unique filters that were
+             combined.
+         astropy.table.Table
+             If `overwrite=False`, returns a new table containing the combined data.
+
+         Raises
+         ------
+         ValueError
+             If the method is not "mean" or "median".
+
+         Notes
+         -----
+         - The method combines flux values for rows with the same `vizier_filter`
+           column value. The returned table will contain only the `wavelength`,
+           `frequency`, `flux`, `eflux`, `vizier_filter`, and `filter` columns.
+         - If `overwrite=True`, the original table in the class will be replaced
+           with the combined table. If `overwrite=False`, the original table
+           remains unchanged.
+         - Missing or non-positive `eflux` values are replaced by
+           `flux * default_eflux_ratio` before weighting, i.e. those points are
+           assigned the same relative uncertainty. This may lead to less accurate
+           results, so it is recommended to handle missing or invalid `eflux`
+           values prior to calling this method.
+
+         Examples
+         --------
+         >>> from sedlib import SED
+
+         >>> sed = SED("Vega")
+         >>> sed.combine_fluxes(method="mean", overwrite=True)
+         """
+         if not isinstance(method, str):
+             raise ValueError("`method` must be a string.")
+
+         if not isinstance(default_eflux_ratio, float):
+             raise ValueError("`default_eflux_ratio` must be a float.")
+
+         if not isinstance(overwrite, bool):
+             raise ValueError("`overwrite` must be a boolean.")
+
+         if self.table is None:
+             raise ValueError("Catalog table is not initialized.")
+
+         if method not in {"mean", "median"}:
+             raise ValueError(
+                 f"Invalid method: {method}. Choose 'mean' or 'median'."
+             )
+
+         self._logger.debug("Finding unique filters")
+
+         # Group rows by filter
+         unique_filters = set(self.table["vizier_filter"])
+         new_rows = []
+
+         self._logger.debug(f"Processing {len(unique_filters)} unique filters")
+         for filt in unique_filters:
+             # Extract rows for this filter
+             rows = self.table[self.table["vizier_filter"] == filt]
+
+             # Flux and eflux values
+             fluxes = np.array(rows["flux"])
+             efluxes = np.array(rows["eflux"])
+
+             # Replace NaN or non-positive eflux with default values
+             invalid_mask = np.isnan(efluxes) | (efluxes <= 0)
+             if np.any(invalid_mask):
+                 self._logger.warning(
+                     f"Found invalid eflux values (NaN or non-positive) for filter "
+                     f"'{filt}'. Replacing with default."
+                 )
+                 efluxes[invalid_mask] = fluxes[invalid_mask] * default_eflux_ratio
+
+             # Calculate weights (inverse of variance)
+             weights = 1 / efluxes**2
+
+             self._logger.debug(f"Combining fluxes for filter {filt} using {method}")
+             # Combine fluxes
+             if method == "mean":
+                 combined_flux = np.average(fluxes, weights=weights)
+             else:  # method == "median"
+                 combined_flux = np.median(fluxes)
+
+             # Combine efluxes using RMS (root-mean-square)
+             combined_eflux = np.sqrt(np.sum(efluxes**2)) / len(efluxes)
+
+             # Add a new row
+             new_row = {
+                 "wavelength": rows["wavelength"][0],
+                 "frequency": rows["frequency"][0],
+                 "flux": combined_flux,
+                 "eflux": combined_eflux,
+                 "vizier_filter": filt,
+                 "filter": rows["filter"][0],
+             }
+
+             new_rows.append(new_row)
+
+         self._logger.debug("Creating new table with combined rows")
+         # Replace table with combined rows
+         combined_table = Table(rows=new_rows)
+
+         # Sort the table by "wavelength" column
+         combined_table.sort("wavelength")
+
+         # Set units for the new table columns
+         for col in ["wavelength", "frequency", "flux", "eflux"]:
+             if col in self.table.colnames:
+                 combined_table[col].unit = self.table[col].unit
+
+         combined_table['wavelength'].info.format = '.3f'
+         combined_table['frequency'].info.format = '.3e'
+         combined_table['flux'].info.format = '.3e'
+         combined_table['eflux'].info.format = '.3e'
+
+         if overwrite:
+             self.table = combined_table
+             self.flux_to_magnitude()
+
+             return len(unique_filters)
+
+         return combined_table
+
+     def _filter_by_criteria(self, criteria):
+         """
+         Filter rows based on given criteria.
+
+         Parameters
+         ----------
+         criteria : dict
+             Dictionary where keys are column names and values are filter
+             conditions (e.g., {"flux": "<15"} or {"name": "== 'star'"}).
+
+         Returns
+         -------
+         list of int
+             List of row indices that match the criteria.
+
+         Raises
+         ------
+         ValueError
+             If a condition in `criteria` contains an invalid operator or if there is
+             a type mismatch between the table data and the condition.
+         """
+         valid_operators = {
+             "<", ">", "<=", ">=", "==", "!=", "is None", "is not None"
+         }
+         matched_rows = []
+
+         for row_index, row in enumerate(self.table):
+             if self._row_matches_criteria(row, criteria, valid_operators):
+                 matched_rows.append(row_index)
+
+         return matched_rows
+
+     def _row_matches_criteria(self, row, criteria, valid_operators):
+         """
+         Helper function to determine if a row matches all criteria.
+         """
+         for column, condition in criteria.items():
+             if not self._matches_condition(row[column], condition, valid_operators):
+                 return False  # If any condition fails, the row does not match
+
+         return True  # All conditions matched
+
+     def _matches_condition(self, value, condition, valid_operators):
+         """
+         Helper function to check if a value meets a specified condition.
+         """
+         # Check for None conditions
+         if condition in {None, "is None"}:
+             return value is None
+         elif condition == "is not None":
+             return value is not None
+
+         # Extract the operator and the comparison value
+         operator_found = re.search(r'([<>!]=?|==)', condition)
+         if not operator_found or operator_found.group(0) not in valid_operators:
+             raise ValueError(
+                 f"Invalid operator in condition: '{condition}'. "
+                 f"Allowed operators are: {', '.join(valid_operators)}"
+             )
+
+         operator = operator_found.group(0)
+         value_str = condition.replace(operator, "", 1).strip()
+
+         # Attempt to convert the condition value to match the type of row's value
+         try:
+             condition_value = self._convert_condition_value(value, value_str)
+             # The operator has been validated against the whitelist above and
+             # both operands are local variables, which limits what eval can do
+             return eval(f"value {operator} condition_value")
+         except ValueError:
+             raise ValueError(
+                 f"Failed to convert '{value_str}' to the same type as column value "
+                 f"'{value}'."
+             )
+         except TypeError as e:
+             raise TypeError(str(e))
+
+     def _convert_condition_value(self, value, value_str):
+         """
+         Convert the condition value to the same type as the row's value.
+         """
+         if isinstance(value, (float, np.floating)):
+             return float(value_str)
+
+         if isinstance(value, (int, np.integer)):
+             return int(value_str)
+
+         if isinstance(value, (str, np.str_)):
+             return value_str.strip("'\"")
+
+         raise TypeError(f"Unsupported column data type: {type(value)}")
+
+     def get_column_stats(self, column_name):
+         """
+         Calculate basic statistics for a specified numerical column.
+
+         Parameters
+         ----------
+         column_name : str
+             The name of the column to calculate statistics for.
+
+         Returns
+         -------
+         dict
+             A dictionary with mean, median, and standard deviation of the column.
+
+         Raises
+         ------
+         ValueError
+             If the specified column does not exist or is non-numeric.
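+
+         Examples
+         --------
+         A minimal sketch:
+
+         >>> stats = catalog.get_column_stats('flux')
+         >>> print(stats['mean'], stats['median'], stats['std_dev'])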
+         """
+         if column_name not in self.table.colnames:
+             raise ValueError(f"Column '{column_name}' does not exist in the table.")
+
+         col_data = self.table[column_name]
+         if not isinstance(col_data[0], (int, float, np.integer, np.floating)):
+             raise ValueError(f"Column '{column_name}' is non-numeric.")
+
+         return {
+             'mean': np.mean(col_data),
+             'median': np.median(col_data),
+             'std_dev': np.std(col_data),
+         }
+
+     def flux_to_magnitude(self):
+         """
+         Convert all fluxes to magnitudes using the filter's zero point.
+
+         This method converts flux values in the catalog table to magnitudes using
+         each filter's zero point. It creates two new columns in the table:
+         'mag' for magnitudes and 'mag_err' for magnitude errors.
+
+         The conversion is done using the filter's flux_to_mag() method, which
+         assumes a Pogson magnitude system.
+
+         Notes
+         -----
+         - For rows where the filter is None, magnitude values will be set to None
+         - The flux values must be in units compatible with the filter's zero point
+         - The magnitude system (AB, Vega, etc.) depends on the filter's zero point
+
+         Examples
+         --------
+         >>> from sedlib import SED
+         >>> from astropy import units as u
+         >>>
+         >>> sed = SED(name='Vega')
+         >>> sed.teff = 10070 * u.K
+         >>> sed.radius = 2.766 * u.Rsun
+         >>> sed.distance = 7.68 * u.pc
+         >>>
+         >>> sed.catalog.flux_to_magnitude()
+         """
+         self._logger.info("Converting all fluxes to magnitudes")
+
+         if self.table is None:
+             self._logger.error("Catalog table is not initialized")
+             raise ValueError('Catalog table is not initialized')
+
+         mags = []
+         mag_errs = []
+
+         flux_unit = self.table['flux'].unit
+         eflux_unit = self.table['eflux'].unit
+
+         self._logger.debug(f"Processing {len(self.table)} rows")
+
+         # Initialize magnitude columns if they don't exist
+         if 'mag' not in self.table.colnames:
+             self.table['mag'] = [None] * len(self.table)
+         if 'mag_err' not in self.table.colnames:
+             self.table['mag_err'] = [None] * len(self.table)
+
+         success_count = 0
+
+         for row in self.table:
+             f = row['filter']
+             flux = row['flux'] * flux_unit
+             eflux = row['eflux'] * eflux_unit
+
+             if f is None:
+                 self._logger.warning(
+                     f"Skipping row with filter {row['vizier_filter']}: "
+                     "No filter object found"
+                 )
+                 mags.append(None)
+                 mag_errs.append(None)
+                 continue
+
+             success_count += 1
+
+             # Convert flux to magnitude using filter's zero point
+             mags.append(f.flux_to_mag(flux))
+
+             # Calculate magnitude error using the error propagation formula:
+             # sigma_m = (2.5 / ln(10)) * (sigma_F / F)
+             mag_err = (2.5 / (flux * np.log(10))) * eflux
+             mag_errs.append(mag_err)
+
+         self.table['mag'] = mags
+         self.table['mag_err'] = mag_errs
+
+         try:
+             self.table['mag'].info.format = '.3f'
+             self.table['mag_err'].info.format = '.3f'
+         except (TypeError, ValueError):
+             # if there are nan values in columns, convert them to float
+             self.table['mag'] = np.array(self.table['mag'], dtype=float)
+             self.table['mag_err'] = np.array(self.table['mag_err'], dtype=float)
+
+             # then format the columns
+             self.table['mag'].info.format = '.3f'
+             self.table['mag_err'].info.format = '.3f'
+
+         self._logger.info(
+             f"Successfully converted {success_count} fluxes to magnitudes"
+         )
+
+     def filter_outliers(
+         self,
+         sigma_threshold=3.0,
+         over_write=False,
+         verbose=False
+     ):
+         """Filter out outlier data points from the SED catalog using iterative
+         sigma clipping in logarithmic space.
+
+         This method computes the residuals between the observed fluxes in the
+         catalog and the predicted fluxes from the blackbody model computed with
+         the current effective temperature (teff), radius, and distance of the
+         object. The residual is defined as:
+
+             r = log10(F_obs) - log10(F_model)
+
+         where:
+         - F_obs is the observed flux.
+         - F_model is the flux predicted by the blackbody model:
+           F_model = (π * BlackBody(temperature=teff, scale=scale)(wavelength)) / dR,
+           with dR = (distance / radius)^2.
+
+         An iterative sigma clipping is performed on the residuals, flagging any
+         data point for which
+
+             |r - median(r)| > sigma_threshold * σ
+
+         Data points satisfying this condition are considered outliers. The
+         process is repeated until no new points are flagged. This allows us to
+         robustly identify points deviating from the continuum, even if some
+         extreme values initially skew the statistics.
+
+         Parameters
+         ----------
+         sigma_threshold : float, optional
+             The sigma threshold for clipping (default is 3.0). Data points with
+             residuals deviating more than sigma_threshold times the standard
+             deviation from the median are flagged as outliers.
+         over_write : bool, optional
+             If True, the outlier points are permanently removed from the SED
+             object's catalog. If False, the method returns an Astropy Table of
+             outlier points without modifying the catalog.
+         verbose : bool, optional
+             If True, prints a summary of the filtering process
+             (default is False).
+
+         Returns
+         -------
+         outliers : astropy.table.Table or None
+             If over_write is False, returns an Astropy Table containing the
+             outlier data points. If over_write is True, updates the catalog in
+             place and returns None.
+
+         Raises
+         ------
+         ValueError
+             If required parameters (teff, radius, distance) are not set or if the
+             catalog is missing the required 'wavelength' and 'flux' columns.
+
+         Examples
+         --------
+         >>> from sedlib import SED
+         >>> from astropy import units as u
+         >>> sed = SED(name='Vega')
+         >>> sed.teff = 9600 * u.K
+         >>> sed.radius = 2.818 * u.Rsun
+         >>> sed.distance = 7.68 * u.pc
+         >>> # Flag outliers using a 3-sigma threshold without modifying catalog:
+         >>> outlier_table = sed.filter_outliers(
+         ...     sigma_threshold=3.0, over_write=False
+         ... )
+         """
+         # Verify that required parameters are set.
+         if self.teff is None or self.radius is None or self.distance is None:
+             self._logger.error(
+                 "Effective temperature, radius, and distance must be set before "
+                 "filtering outliers."
+             )
+             raise ValueError(
+                 "Effective temperature, radius, and distance must be set before "
+                 "filtering outliers."
+             )
+
+         # verify that the catalog is initialized
+         if self.table is None:
+             self._logger.error("Catalog data is required for filtering outliers.")
+             raise ValueError("Catalog data is required for filtering outliers.")
+
+         self._rejected_sigma_threshold = sigma_threshold
+
+         # geometric dilution factor
+         dR = (self.distance.to(u.cm) / self.radius.to(u.cm)) ** 2
+
+         # scale factor
+         scale = 1.0 * u.erg / (u.cm**2 * u.AA * u.s * u.sr)
+
+         # blackbody model
+         bb_model = BlackBody(temperature=self.teff, scale=scale)
+
+         if "wavelength" not in self.table.colnames or "flux" not in self.table.colnames:
+             self._logger.error("Catalog must contain 'wavelength' and 'flux' columns.")
+             raise ValueError(
+                 "Catalog must contain 'wavelength' and 'flux' columns."
+             )
+
+         wavelengths = self.table["wavelength"]
+         observed_flux = self.table["flux"]
+
+         # Compute the predicted flux for each wavelength.
+         predicted_flux = bb_model(wavelengths) * np.pi / dR
+
+         # Calculate residuals in logarithmic space.
+         residuals = (
+             np.log10(observed_flux.value) - np.log10(predicted_flux.value)
+         )
+
+         # Perform iterative sigma clipping.
+         # True indicates an "inlier" (continuum point).
+         mask = np.ones(len(residuals), dtype=bool)
+         iteration = 0
+         while True:
+             iteration += 1
+             current_residuals = residuals[mask]
+             if len(current_residuals) == 0:
+                 self._logger.warning("No data points remain after clipping.")
+                 break
+             median_val = np.median(current_residuals)
+             sigma_val = np.std(current_residuals, ddof=1)
+
+             # Create a new mask: keep points within sigma_threshold * sigma of
+             # the median.
+             new_mask = (
+                 np.abs(residuals - median_val) <= sigma_threshold * sigma_val
+             )
+
+             # Check for convergence.
+             if np.array_equal(mask, new_mask):
+                 break
+             mask = new_mask
+
+         # Identify outliers (those not in the final mask).
+         outlier_indices = np.where(~mask)[0]
+         num_outliers = len(outlier_indices)
+         self._logger.info(
+             f"Total outliers detected: {num_outliers} out of {len(residuals)} "
+             "data points."
+         )
+
+         # save the rejected data and flag the catalog accordingly
+         self.rejected_data = self.table[~mask].copy()
+         self.is_rejected = num_outliers > 0
+
+         if verbose:
+             print(
+                 f"Total outliers detected: {num_outliers} out of "
+                 f"{len(residuals)} data points after {iteration} iterations."
+             )
+
+         if over_write:
+             # Permanently update the catalog to keep only the inliers.
+             self.table = self.table[mask]
+             self._logger.info("Outlier points have been removed from the catalog.")
+             return None
+
+         self._logger.info("Returning a table of outlier points for inspection.")
+         return self.rejected_data