astro-otter 0.0.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of astro-otter might be problematic.

otter/io/transient.py CHANGED
@@ -9,6 +9,7 @@ from copy import deepcopy
 import re
 from collections.abc import MutableMapping
 from typing_extensions import Self
+import logging

 import numpy as np
 import pandas as pd
@@ -17,9 +18,6 @@ import astropy.units as u
 from astropy.time import Time
 from astropy.coordinates import SkyCoord

-from synphot.units import VEGAMAG, convert_flux
-from synphot.spectrum import SourceSpectrum
-
 from ..exceptions import (
     FailedQueryError,
     IOError,
@@ -27,10 +25,12 @@ from ..exceptions import (
     TransientMergeError,
 )
 from ..util import XRAY_AREAS
+from .host import Host

 warnings.simplefilter("once", RuntimeWarning)
 warnings.simplefilter("once", UserWarning)
 np.seterr(divide="ignore")
+logger = logging.getLogger(__name__)


 class Transient(MutableMapping):
@@ -196,6 +196,49 @@ class Transient(MutableMapping):
                     + " You can set strict_merge=False to override the check"
                 )

+        # create set of the allowed keywords
+        allowed_keywords = {
+            "name",
+            "date_reference",
+            "coordinate",
+            "distance",
+            "filter_alias",
+            "schema_version",
+            "photometry",
+            "classification",
+            "host",
+        }
+
+        merge_subkeys_map = {
+            "name": None,
+            "date_reference": ["value", "date_format", "date_type"],
+            "coordinate": None,  # may need to update this if we run into problems
+            "distance": ["value", "distance_type", "unit"],
+            "filter_alias": None,
+            "schema_version": None,
+            "photometry": None,
+            "classification": None,
+            "host": [
+                "host_ra",
+                "host_dec",
+                "host_ra_units",
+                "host_dec_units",
+                "host_name",
+            ],
+        }
+
+        groupby_key_for_default_map = {
+            "name": None,
+            "date_reference": "date_type",
+            "coordinate": "coordinate_type",
+            "distance": "distance_type",
+            "filter_alias": None,
+            "schema_version": None,
+            "photometry": None,
+            "classification": None,
+            "host": None,
+        }
+
         # create a blank dictionary since we don't want to overwrite this object
         out = {}

@@ -230,31 +273,20 @@ class Transient(MutableMapping):
                 continue

             # There are some special keys that we are expecting
-            if key == "name":
-                self._merge_names(other, out)
-            elif key == "coordinate":
-                self._merge_coords(other, out)
-            elif key == "date_reference":
-                self._merge_date(other, out)
-            elif key == "distance":
-                self._merge_distance(other, out)
-            elif key == "filter_alias":
-                self._merge_filter_alias(other, out)
-            elif key == "schema_version":
-                self._merge_schema_version(other, out)
-            elif key == "photometry":
-                self._merge_photometry(other, out)
-            elif key == "spectra":
-                self._merge_spectra(other, out)
-            elif key == "classification":
-                self._merge_class(other, out)
+            if key in allowed_keywords:
+                Transient._merge_arbitrary(
+                    key,
+                    self,
+                    other,
+                    out,
+                    merge_subkeys=merge_subkeys_map[key],
+                    groupby_key=groupby_key_for_default_map[key],
+                )
             else:
                 # this is an unexpected key!
                 if strict_merge:
                     # since this is a strict merge we don't want unexpected data!
-                    raise TransientMergeError(
-                        f"{key} was not expected! Only keeping the old information!"
-                    )
+                    raise TransientMergeError(f"{key} was not expected! Can not merge!")
                 else:
                     # Throw a warning and only keep the old stuff
                     warnings.warn(
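
The two hunks above replace nine hard-coded `_merge_*` branches with a table-driven design: `merge_subkeys_map` names the columns that identify a duplicate record for each key, and `groupby_key_for_default_map` names the column used to pick a default per group. A minimal standalone sketch of the pattern (the `merge_records` helper and the sample records are illustrative, not astro-otter's API):

```python
# Sketch of the table-driven merge: deduplicate two record lists on the
# subkeys that identify a duplicate. merge_records() is hypothetical.
merge_subkeys_map = {"distance": ["value", "distance_type", "unit"]}

def merge_records(records, merge_subkeys):
    seen, merged = set(), []
    for rec in records:
        sig = tuple(rec.get(k) for k in merge_subkeys)
        if sig not in seen:  # keep the first copy of each signature
            seen.add(sig)
            merged.append(rec)
    return merged

old = [{"value": 0.0206, "distance_type": "redshift", "unit": None}]
new = [
    {"value": 0.0206, "distance_type": "redshift", "unit": None},  # duplicate
    {"value": 90.4, "distance_type": "luminosity", "unit": "Mpc"},
]
print(merge_records(old + new, merge_subkeys_map["distance"]))
# the duplicate redshift collapses; two records survive
```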
@@ -335,13 +367,20 @@ class Transient(MutableMapping):
             astropy.time.Time of the default discovery date
         """
         key = "date_reference"
-        date = self._get_default(key, filt='df["date_type"] == "discovery"')
+        try:
+            date = self._get_default(key, filt='df["date_type"] == "discovery"')
+        except KeyError:
+            return None
+
+        if date is None:
+            return date
+
         if "date_format" in date:
             f = date["date_format"]
         else:
             f = "mjd"

-        return Time(date["value"], format=f)
+        return Time(str(date["value"]).strip(), format=f)

     def get_redshift(self) -> float:
         """
@@ -357,7 +396,62 @@ class Transient(MutableMapping):
         else:
             return default["value"]

-    def _get_default(self, key, filt=""):
+    def get_classification(self) -> tuple[str, float, list]:
+        """
+        Get the default classification of this Transient.
+        This normally corresponds to the highest confidence classification that we
+        have stored for the transient.
+
+        Returns:
+            The default object class as a string, the confidence level in that class,
+            and a list of the bibcodes corresponding to that classification. Or, None
+            if there is no classification.
+        """
+        default = self._get_default("classification")
+        if default is None:
+            return default
+        return default.object_class, default.confidence, default.reference
+
+    def get_host(self, max_hosts=3, search=False, **kwargs) -> list[Host]:
+        """
+        Gets the default host information of this Transient. This returns an
+        otter.Host object. If search=True, it will also check the BLAST host
+        association database for the best match and return it as well. Note that if
+        search is True then this has the potential to return max_hosts + 1, if BLAST
+        also returns a result. The BLAST result will always be the last value in the
+        returned list.
+
+        Args:
+            max_hosts [int] : The maximum number of hosts to return
+            **kwargs : keyword arguments to be passed to getGHOST
+
+        Returns:
+            A list of otter.Host objects. This is useful because the Host objects
+            have useful methods for querying public catalogs for data of the host.
+        """
+        # first try to get the host information from our local database
+        host = []
+        if "host" in self:
+            max_hosts = min([max_hosts, len(self["host"])])
+            for h in self["host"][:max_hosts]:
+                host.append(Host(transient_name=self.default_name, **dict(h)))
+
+        # then try BLAST
+        if search:
+            logger.warning(
+                "Trying to find a host with BLAST/astro-ghost. Note"
+                " that this won't work for older targets! See https://blast.scimma.org"
+            )
+
+            # default_name should always be the TNS name if we have one
+            print(self.default_name)
+            blast_host = Host.query_blast(self.default_name)
+            print(blast_host)
+            if blast_host is not None:
+                host.append(blast_host)
+
+        return host
+
+    def _get_default(self, key, filt=None):
         """
         Get the default of key

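For context, here is how the new accessors might be exercised. The record below is a hand-built guess at the schema these methods read (the `classification` and `host` entries are inferred from the diff, and the constructor signature is assumed), so it may not run verbatim against a real OTTER install:

```python
# Hedged usage sketch for get_classification()/get_host(); all field
# values are invented for illustration.
t = Transient({
    "name": {"default_name": "ASASSN-14li", "alias": []},
    "classification": [
        {"object_class": "TDE", "confidence": 1.0,
         "reference": ["2016MNRAS.455.2918H"], "default": True},
    ],
    "host": [
        {"host_name": "PGC 043234", "host_ra": 192.0633, "host_dec": 17.774,
         "host_ra_units": "deg", "host_dec_units": "deg"},
    ],
})

print(t.get_classification())   # expected: ("TDE", 1.0, ["2016MNRAS.455.2918H"])
print(t.get_host(max_hosts=1))  # expected: [Host(...)] built from the "host" entry
```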
@@ -370,7 +464,11 @@
             raise KeyError(f"This transient does not have {key} associated with it!")

         df = pd.DataFrame(self[key])
-        df = df[eval(filt)]  # apply the filters
+        if len(df) == 0:
+            raise KeyError(f"This transient does not have {key} associated with it!")
+
+        if filt is not None:
+            df = df[eval(filt)]  # apply the filters

         if "default" in df:
             # first try to get the default
@@ -382,6 +480,7 @@

         if len(df_filtered) == 0:
             return None
+
         return df_filtered.iloc[0]

     def _reformat_coordinate(self, item):
@@ -441,12 +540,19 @@
         Returns:
             A pandas DataFrame of the cleaned up photometry in the requested units
         """
+        # these imports need to be here for some reason
+        # otherwise the code breaks
+        from synphot.units import VEGAMAG, convert_flux
+        from synphot.spectrum import SourceSpectrum

         # check inputs
         if by not in {"value", "raw"}:
             raise IOError("Please choose either value or raw!")

         # turn the photometry key into a pandas dataframe
+        if "photometry" not in self:
+            raise FailedQueryError("No photometry for this object!")
+
         dfs = []
         for item in self["photometry"]:
             max_len = 0
@@ -463,9 +569,29 @@
             df = pd.DataFrame(item)
             dfs.append(df)

+        if len(dfs) == 0:
+            raise FailedQueryError("No photometry for this object!")
         c = pd.concat(dfs)

+        # extract the filter information and substitute in any missing columns
+        # because of how we handle this later, we just need to make sure the effective
+        # wavelengths are never nan
+        def fill_wave(row):
+            if "wave_eff" not in row or (
+                pd.isna(row.wave_eff) and not pd.isna(row.freq_eff)
+            ):
+                freq_eff = row.freq_eff * u.Unit(row.freq_units)
+                wave_eff = freq_eff.to(u.Unit(wave_unit), equivalencies=u.spectral())
+                return wave_eff.value, wave_unit
+            elif not pd.isna(row.wave_eff):
+                return row.wave_eff, row.wave_units
+            else:
+                raise ValueError("Missing frequency or wavelength information!")
+
         filters = pd.DataFrame(self["filter_alias"])
+        res = filters.apply(fill_wave, axis=1)
+        filters["wave_eff"], filters["wave_units"] = zip(*res)
+        # merge the photometry with the filter information
         df = c.merge(filters, on="filter_key")

         # make sure 'by' is in df
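
`fill_wave` leans on astropy's `u.spectral()` equivalency to backfill a missing effective wavelength from an effective frequency. A standalone check of that conversion:

```python
# Wavelength <-> frequency via u.spectral(); this is the same equivalency
# fill_wave uses to guarantee wave_eff is never NaN.
import astropy.units as u

freq_eff = 1.4 * u.GHz  # e.g. an effective radio frequency
wave_eff = freq_eff.to(u.AA, equivalencies=u.spectral())
print(wave_eff)  # ~2.141e+09 Angstrom

# and the round trip recovers the frequency
print(wave_eff.to(u.GHz, equivalencies=u.spectral()))  # 1.4 GHz
```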
@@ -478,6 +604,14 @@
         # skip rows where 'by' is nan
         df = df[df[by].notna()]

+        # remove rows where the flux is less than zero since this is nonphysical
+        # See Mummery et al. (2023) Section 5.2 for why we need to do this when using
+        # ZTF data:
+        # "Because the origin of the negative late-time flux is currently un-
+        # known (and under investigation), we have not attempted to correct
+        # the TDE lightcurves for this systematic effect."
+        df = df[df[by].astype(float) > 0]
+
         # drop irrelevant obs_types before continuing
         if obs_type is not None:
             valid_obs_types = {"radio", "uvoir", "xray"}
@@ -500,6 +634,7 @@

         # Figure out what columns are good to groupby in the photometry
         outdata = []
+
         if "telescope" in df:
             tele = True
             to_grp_by = ["obs_type", by + "_units", "telescope"]
@@ -523,8 +658,9 @@
                 )

             unit = unit[0]
+            isvegamag = "vega" in unit.lower()
             try:
-                if "vega" in unit.lower():
+                if isvegamag:
                     astropy_units = VEGAMAG
                 else:
                     astropy_units = u.Unit(unit)
@@ -550,30 +686,27 @@
                 indata_err = np.array(data[by + "_err"].astype(float))
             else:
                 indata_err = np.zeros(len(data))
+
+            # convert to an astropy quantity
             q = indata * u.Unit(astropy_units)
             q_err = indata_err * u.Unit(
                 astropy_units
             )  # assume error and values have the same unit

-            # get the effective wavelength
-            if "freq_eff" in data and not np.isnan(data["freq_eff"].iloc[0]):
-                freq_units = data["freq_units"]
-                if len(np.unique(freq_units)) > 1:
-                    raise OtterLimitationError(
-                        "Can not convert different units to the same unit!"
-                    )
-
-                freq_eff = np.array(data["freq_eff"]) * u.Unit(freq_units.iloc[0])
-                wave_eff = freq_eff.to(u.AA, equivalencies=u.spectral())
+            # get and save the effective wavelength
+            # because of cleaning we did to the filter dataframe above wave_eff
+            # should NEVER be nan!
+            if np.any(pd.isna(data["wave_eff"])):
+                raise ValueError("Flushing out the effective wavelength array failed!")

-            elif "wave_eff" in data and not np.isnan(data["wave_eff"].iloc[0]):
-                wave_units = data["wave_units"]
-                if len(np.unique(wave_units)) > 1:
-                    raise OtterLimitationError(
-                        "Can not convert different units to the same unit!"
-                    )
+            zz = zip(data["wave_eff"], data["wave_units"])
+            wave_eff = u.Quantity([vv * u.Unit(uu) for vv, uu in zz], wave_unit)
+            freq_eff = wave_eff.to(freq_unit, equivalencies=u.spectral())

-                wave_eff = np.array(data["wave_eff"]) * u.Unit(wave_units.iloc[0])
+            data["converted_wave"] = wave_eff.value
+            data["converted_wave_unit"] = wave_unit
+            data["converted_freq"] = freq_eff.value
+            data["converted_freq_unit"] = freq_unit

             # convert using synphot
             # stuff has to be done slightly differently for xray than for the others
@@ -588,44 +721,61 @@
                     )
                 else:
                     raise OtterLimitationError(
-                        "Can not convert x-ray data without a " + "telescope"
+                        "Can not convert x-ray data without a telescope"
                     )

                 # we also need to make this wave_min and wave_max
                 # instead of just the effective wavelength like for radio and uvoir
-                wave_eff = np.array(
-                    list(zip(data["wave_min"], data["wave_max"]))
-                ) * u.Unit(wave_units.iloc[0])
+                zz = zip(data["wave_min"], data["wave_max"], data["wave_units"])
+                wave_eff = u.Quantity(
+                    [np.array([m, M]) * u.Unit(uu) for m, M, uu in zz],
+                    u.Unit(wave_unit),
+                )

             else:
                 area = None

-            # we unfortunately have to loop over the points here because
-            # syncphot does not work with a 2D array of min max wavelengths
-            # for converting counts to other flux units. It also can't convert
-            # vega mags with a wavelength array because it then interprets that as the
-            # wavelengths corresponding to the SourceSpectrum.from_vega()
-            flux, flux_err = [], []
-            for wave, xray_point, xray_point_err in zip(wave_eff, q, q_err):
-                f_val = convert_flux(
-                    wave,
-                    xray_point,
-                    u.Unit(flux_unit),
-                    vegaspec=SourceSpectrum.from_vega(),
-                    area=area,
-                )
-                f_err = convert_flux(
-                    wave,
-                    xray_point_err,
-                    u.Unit(flux_unit),
-                    vegaspec=SourceSpectrum.from_vega(),
-                    area=area,
-                )
+            if obstype == "xray" or isvegamag:
+                # we unfortunately have to loop over the points here because
+                # syncphot does not work with a 2D array of min max wavelengths
+                # for converting counts to other flux units. It also can't convert
+                # vega mags with a wavelength array because it interprets that as the
+                # wavelengths corresponding to the SourceSpectrum.from_vega()

+                flux, flux_err = [], []
+                for wave, xray_point, xray_point_err in zip(wave_eff, q, q_err):
+                    f_val = convert_flux(
+                        wave,
+                        xray_point,
+                        u.Unit(flux_unit),
+                        vegaspec=SourceSpectrum.from_vega(),
+                        area=area,
+                    ).value
+
+                    # approximate the uncertainty as dX = dY/Y * X
+                    f_err = np.multiply(
+                        f_val, np.divide(xray_point_err.value, xray_point.value)
+                    )
+
+                    # then we take the average of the minimum and maximum values
+                    # computed by syncphot
+                    flux.append(np.mean(f_val))
+                    flux_err.append(np.mean(f_err))

-                # then we take the average of the minimum and maximum values
-                # computed by syncphot
-                flux.append(np.mean(f_val).value)
-                flux_err.append(np.mean(f_err).value)
+            else:
+                # this will be faster and cover most cases
+                flux = convert_flux(wave_eff, q, u.Unit(flux_unit)).value
+
+                # since the error propagation is different between logarithmic units
+                # and linear units, unfortunately
+                if isinstance(u.Unit(flux_unit), u.LogUnit):
+                    # approximate the uncertainty as dX = dY/Y * |ln(10)/2.5|
+                    prefactor = np.abs(np.log(10) / 2.5)  # this is basically 1
+                else:
+                    # approximate the uncertainty as dX = dY/Y * X
+                    prefactor = flux
+
+                flux_err = np.multiply(prefactor, np.divide(q_err.value, q.value))

             flux = np.array(flux) * u.Unit(flux_unit)
             flux_err = np.array(flux_err) * u.Unit(flux_unit)
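
The uncertainty handling above follows the two first-order rules stated in the diff's comments: for linear output units dX = (dY/Y) * X, and for logarithmic (magnitude-like) units dX = (dY/Y) * |ln(10)/2.5|, a constant relative-to-absolute factor. A NumPy-only sketch with made-up numbers:

```python
# First-order error propagation as used in the diff (values invented).
import numpy as np

q = np.array([2.0e-26, 5.0e-27])      # input fluxes, arbitrary linear unit
q_err = np.array([2.0e-27, 1.0e-27])  # their 1-sigma uncertainties

# linear output unit: the relative error carries over, dX = (dY/Y) * X
flux = q * 1.0e26                # pretend the unit conversion rescaled values
flux_err = flux * (q_err / q)    # -> [0.2, 0.1]

# logarithmic output unit: absolute error is (dY/Y) * |ln 10 / 2.5|,
# independent of the converted value itself
prefactor = np.abs(np.log(10) / 2.5)  # ~0.921, "basically 1"
mag_err = prefactor * (q_err / q)     # -> [0.092, 0.184]
print(flux_err, mag_err)
```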
@@ -639,7 +789,7 @@
         outdata = pd.concat(outdata)

         # copy over the flux units
-        outdata["converted_flux_unit"] = [flux_unit] * len(outdata)
+        outdata["converted_flux_unit"] = flux_unit

         # make sure all the datetimes are in the same format here too!!
         times = [
@@ -647,27 +797,28 @@
             for d, f in zip(outdata.date, outdata.date_format.str.lower())
         ]
         outdata["converted_date"] = times
-        outdata["converted_date_unit"] = [date_unit] * len(outdata)
-
-        # same with frequencies and wavelengths
-        freqs = []
-        waves = []
-
-        for _, row in df.iterrows():
-            if "freq_eff" in row and not np.isnan(row["freq_eff"]):
-                val = row["freq_eff"] * u.Unit(row["freq_units"])
-            elif "wave_eff" in df and not np.isnan(row["wave_eff"]):
-                val = row["wave_eff"] * u.Unit(row["wave_units"])
-            else:
-                raise ValueError("No known frequency or wavelength, please fix!")
+        outdata["converted_date_unit"] = date_unit
+
+        # compute the upperlimit value based on a 3 sigma detection
+        # this is just for rows where we don't already know if it is an upperlimit
+        if isinstance(u.Unit(flux_unit), u.LogUnit):
+            # this uses the following formula (which is surprising because it means
+            # magnitude upperlimits are independent of the actual measurement!)
+            # sigma_m > (1/3) * (ln(10)/2.5)
+            def is_upperlimit(row):
+                if pd.isna(row.upperlimit):
+                    return row.converted_flux_err > np.log(10) / (3 * 2.5)
+                else:
+                    return row.upperlimit
+        else:

-            freqs.append(val.to(freq_unit, equivalencies=u.spectral()).value)
-            waves.append(val.to(wave_unit, equivalencies=u.spectral()).value)
+            def is_upperlimit(row):
+                if pd.isna(row.upperlimit):
+                    return row.converted_flux < 3 * row.converted_flux_err
+                else:
+                    return row.upperlimit

-        outdata["converted_freq"] = freqs
-        outdata["converted_wave"] = waves
-        outdata["converted_wave_unit"] = [wave_unit] * len(outdata)
-        outdata["converted_freq_unit"] = [freq_unit] * len(outdata)
+        outdata["upperlimit"] = outdata.apply(is_upperlimit, axis=1)

         return outdata

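The two `is_upperlimit` branches encode the same 3-sigma criterion in different units. In linear units a point is an upper limit when flux < 3 * flux_err; pushing that through the magnitude error rule above (sigma_m = (ln 10 / 2.5) * sigma_F/F, as the diff's comments state) gives the constant threshold used in the log branch: sigma_m > (1/3) * (ln(10)/2.5) ≈ 0.307 mag. A small check with toy rows:

```python
# Checking the 3-sigma upper-limit rule from the hunk above (toy values).
import numpy as np
import pandas as pd

rows = pd.DataFrame({
    "converted_flux":     [10.0, 2.0],
    "converted_flux_err": [1.0, 1.0],
})

# linear-unit branch: flag anything below 3 sigma
print((rows.converted_flux < 3 * rows.converted_flux_err).tolist())
# -> [False, True]

# log-unit branch: the threshold is a constant magnitude error
print(np.log(10) / (3 * 2.5))  # ~0.307 mag, independent of the measurement
```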
@@ -756,16 +907,6 @@
         bothlines = [{"value": k, "reference": t1map[k] + t2map[k]} for k in inboth]
         out[key]["alias"] = line2 + line1 + bothlines

-    def _merge_coords(t1, t2, out):  # noqa: N805
-        """
-        Merge the coordinates subdictionaries for t1 and t2 and put it in out
-
-        Use pandas to drop any duplicates
-        """
-        key = "coordinate"
-
-        Transient._merge_arbitrary(key, t1, t2, out)
-
     def _merge_filter_alias(t1, t2, out):  # noqa: N805
         """
         Combine the filter alias lists across the transient objects
@@ -784,11 +925,21 @@
         Just keep whichever schema version is greater
         """
         key = "schema_version/value"
-        if int(t1[key]) > int(t2[key]):
+        if "comment" not in t1["schema_version"]:
+            t1["schema_version/comment"] = ""
+
+        if "comment" not in t2["schema_version"]:
+            t2["schema_version/comment"] = ""
+
+        if key in t1 and key in t2 and int(t1[key]) > int(t2[key]):
             out["schema_version"] = deepcopy(t1["schema_version"])
         else:
             out["schema_version"] = deepcopy(t2["schema_version"])

+        out["schema_version"]["comment"] = (
+            t1["schema_version/comment"] + ";" + t2["schema_version/comment"]
+        )
+
     def _merge_photometry(t1, t2, out):  # noqa: N805
         """
         Combine photometry sources
@@ -797,8 +948,15 @@
         key = "photometry"

         out[key] = deepcopy(t1[key])
-        refs = np.array([d["reference"] for d in out[key]])
+        refs = []  # np.array([d["reference"] for d in out[key]])
         # merge_dups = lambda val: np.sum(val) if np.any(val.isna()) else val.iloc[0]
+        for val in out[key]:
+            if isinstance(val, list):
+                refs += val
+            elif isinstance(val, np.ndarray):
+                refs += list(val)
+            else:
+                refs.append(val)

         for val in t2[key]:
             # first check if t2's reference is in out
@@ -823,12 +981,6 @@

         out[key][i1] = newdict  # replace the dictionary at i1 with the new dict

-    def _merge_spectra(t1, t2, out):  # noqa: N805
-        """
-        Combine spectra sources
-        """
-        pass
-
     def _merge_class(t1, t2, out):  # noqa: N805
         """
         Combine the classification attribute
@@ -864,24 +1016,8 @@
         else:
             item["default"] = False

-    def _merge_date(t1, t2, out):  # noqa: N805
-        """
-        Combine epoch data across two transients and write it to "out"
-        """
-        key = "date_reference"
-
-        Transient._merge_arbitrary(key, t1, t2, out)
-
-    def _merge_distance(t1, t2, out):  # noqa: N805
-        """
-        Combine distance information for these two transients
-        """
-        key = "distance"
-
-        Transient._merge_arbitrary(key, t1, t2, out)
-
     @staticmethod
-    def _merge_arbitrary(key, t1, t2, out):
+    def _merge_arbitrary(key, t1, t2, out, merge_subkeys=None, groupby_key=None):
         """
         Merge two arbitrary datasets inside the json file using pandas

@@ -889,44 +1025,81 @@
         a NxM pandas dataframe!
         """

-        df1 = pd.DataFrame(t1[key])
-        df2 = pd.DataFrame(t2[key])
-
-        merged_with_dups = pd.concat([df1, df2]).reset_index(drop=True)
-
-        # have to get the indexes to drop using a string rep of the df
-        # this is cause we have lists in some cells
-        to_drop = merged_with_dups.astype(str).drop_duplicates().index
-
-        merged = merged_with_dups.iloc[to_drop].reset_index(drop=True)
-
-        outdict = merged.to_dict(orient="records")
+        if key == "name":
+            t1._merge_names(t2, out)
+        elif key == "filter_alias":
+            t1._merge_filter_alias(t2, out)
+        elif key == "schema_version":
+            t1._merge_schema_version(t2, out)
+        elif key == "photometry":
+            t1._merge_photometry(t2, out)
+        elif key == "classification":
+            t1._merge_class(t2, out)
+        else:
+            # this is where we can standardize some of the merging
+            df1 = pd.DataFrame(t1[key])
+            df2 = pd.DataFrame(t2[key])
+
+            merged_with_dups = pd.concat([df1, df2]).reset_index(drop=True)
+
+            # have to get the indexes to drop using a string rep of the df
+            # this is cause we have lists in some cells
+            # We also need to deal with merging the lists of references across rows
+            # that we deem to be duplicates. This solution to do this quickly is from
+            # https://stackoverflow.com/questions/36271413/ \
+            # pandas-merge-nearly-duplicate-rows-based-on-column-value
+            if merge_subkeys is None:
+                merge_subkeys = merged_with_dups.columns.tolist()
+                merge_subkeys.remove("reference")
+            else:
+                for k in merge_subkeys:
+                    if k not in merged_with_dups:
+                        merge_subkeys.remove(k)
+
+            merged = (
+                merged_with_dups.astype(str)
+                .groupby(merge_subkeys)["reference"]
+                .apply(lambda x: x.sum())
+                .reset_index()
+            )

-        outdict_cleaned = Transient._remove_nans(
-            outdict
-        )  # clear out the nans from pandas conversion
+            # then we have to turn the merged reference strings into a string list
+            merged["reference"] = merged.reference.str.replace("][", ",")

-        out[key] = outdict_cleaned
+            # then eval the string of a list to get back an actual list of sources
+            merged["reference"] = merged.reference.apply(
+                lambda v: np.unique(eval(v)).tolist()
+            )

-    @staticmethod
-    def _remove_nans(d):
-        """
-        Remove nans from a record dictionary
+            # decide on default values
+            if groupby_key is None:
+                iterate_through = [(0, merged)]
+            else:
+                iterate_through = merged.groupby(groupby_key)
+
+            # we will make whichever value has more references the default
+            outdict = []
+            for data_type, df in iterate_through:
+                lengths = df.reference.map(len)
+                max_idx_arr = np.argmax(lengths)
+
+                if isinstance(max_idx_arr, np.int64):
+                    max_idx = max_idx_arr
+                elif len(max_idx_arr) == 0:
+                    raise ValueError("Something went wrong with deciding the default")
+                else:
+                    max_idx = max_idx_arr[0]  # arbitrarily choose the first

-        THIS IS SLOW: O(n^2)!!! WILL NEED TO BE SPED UP LATER
-        """
+                defaults = np.full(len(df), False, dtype=bool)
+                defaults[max_idx] = True

-        outd = []
-        for item in d:
-            outsubd = {}
-            for key, val in item.items():
-                if not isinstance(val, float):
-                    # this definitely is not NaN
-                    outsubd[key] = val
+                df["default"] = defaults
+                outdict.append(df)
+            outdict = pd.concat(outdict)

-                else:
-                    if not np.isnan(val):
-                        outsubd[key] = val
-            outd.append(outsubd)
+            # from https://stackoverflow.com/questions/52504972/ \
+            # converting-a-pandas-df-to-json-without-nan
+            outdict = outdict.replace("nan", np.nan)
+            outdict_cleaned = [{**x[i]} for i, x in outdict.stack().groupby(level=0)]

-        return outd
+            out[key] = outdict_cleaned
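
The core trick in the new `_merge_arbitrary` is worth seeing in isolation: stringify the frame so list-valued cells become hashable, group near-duplicate rows on the identifying subkeys, concatenate the stringified reference lists, then rebuild a deduplicated Python list via the `"][" -> ","` splice and an `eval`. A standalone pandas demo of just that step (sample rows invented):

```python
# Standalone demo of the reference-merging step in _merge_arbitrary.
import numpy as np
import pandas as pd

df = pd.DataFrame([
    {"value": 0.0206, "distance_type": "redshift", "reference": ["2016A&A...1B"]},
    {"value": 0.0206, "distance_type": "redshift", "reference": ["2019ApJ...2C"]},
])

merged = (
    df.astype(str)                       # lists become their string reprs
    .groupby(["value", "distance_type"])["reference"]
    .apply(lambda x: x.sum())            # "['2016A&A...1B']['2019ApJ...2C']"
    .reset_index()
)
# splice the back-to-back list reprs into one list literal, then eval it
merged["reference"] = merged.reference.str.replace("][", ",", regex=False)
merged["reference"] = merged.reference.apply(lambda v: np.unique(eval(v)).tolist())
print(merged.reference.iloc[0])          # ['2016A&A...1B', '2019ApJ...2C']
```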