PyPI - ASDCache - Versions diffs - 0.2.2__tar.gz → 0.2.4__tar.gz - Mend

ASDCache 0.2.2tar.gz → 0.2.4tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

{asdcache-0.2.2 → asdcache-0.2.4}/ASDCache/ASDCache.py +174 -226
asdcache-0.2.4/ASDCache/__init__.py +61 -0
asdcache-0.2.4/ASDCache/_version.py +24 -0
asdcache-0.2.4/ASDCache/utils.py +66 -0
{asdcache-0.2.2 → asdcache-0.2.4}/PKG-INFO +21 -16
{asdcache-0.2.2 → asdcache-0.2.4}/pyproject.toml +66 -29
asdcache-0.2.2/ASDCache/__init__.py +0 -10
asdcache-0.2.2/ASDCache/_version.py +0 -21
{asdcache-0.2.2 → asdcache-0.2.4}/.gitignore +0 -0
{asdcache-0.2.2 → asdcache-0.2.4}/LICENSE +0 -0

{asdcache-0.2.2 → asdcache-0.2.4}/ASDCache/ASDCache.py RENAMED Viewed

@@ -1,51 +1,22 @@
-r"""`ASDcache` is a module to fetch data from the  NIST Atomic Spectra Database (ASD), utlizing caching for fast responses.
+"""The ASDCache module.
-To make the most use out of the cache, `ASDcache` is opinionated in the information it retrieves from the ASD; it always requests the same schema of information and locally computes additional fields.
-Data is initially fetched from the online published NIST page, using the tab-separated ASCII output format.
-The benefit of this format is that it is more 'machine readable' than the formatted ASCII of HTML options.
-This means it requires far less bespoke parsing to get rid of 'human readable' features such as repeated page column headers, or empty lines.
-To ensure a consistent schema of the retrieved data, lines are always retrieved as a function of wavelength, using `vacuum wavelength`, even between 200 to 2000 nm.
-Wavenumbers and Ritz wavelength will be included in the response.
-In the range $5000 \mathrm{cm}^{-1}<\nu<50000 \mathrm{cm}^{-1}$ the air equivalent observed and Ritz wavelengths are calculated using the same Sellmeier equation as the NIST ASD (see [here][ASDcache.readASD.ASDCache.wn_to_n_refractive]).
-This is consistent with the approach of the ASD.
-Each response from the NIST page is cached (1 week by default) on the local system.
-This makes it much faster to load the same data, even across different script runs and/or user programs/sessions.
-As an example: reading all spectra between 200 and 1000 nm can take over 2 minutes without using the cache, but can be as fast as 0.2 seconds using the `polars` backend.
-In addition, it means that an internet connection is not required after initial data fetching.
-The cached response is only updated upon succesfull retrieval of a new response of the NIST page.
-If unable to succesfully fetch new data, we fall back to a 'stale' cached response.
-The cache can be shared to another system, to give offline/airgapped systems access to the same data.
-To that end, the file `NIST_ASD_cache.sqlite` in the user's cache directory has to be copied over.
-The standard cache directories are as follows:
-=== "Windows"
-    `%USERPROFILE%/AppData/Local`
-=== "Linux"
-    `~/.cache/http_cache/`
-=== "MacOS"
-    `/Users/user/Library/Caches/http_cache/`
-Queries to the NIST ASD are hashed by the keys (or parameters) of the requests.
-This means that any change to either one of these parameters, will result in a new cache entry, even if the returned data is equivalent.
+It contains both the [SpectraCache][(m).] and [BibCache][(m).] classes which allow you to interact with the ASD and the relevant bibliographic databases.
 """
-import importlib
+from pathlib import Path
+import importlib.util
 import warnings
+import numpy as np
 import pandas as pd
-from requests_cache import CachedSession, CachedResponse
+from requests_cache import CachedSession, CachedResponse, OriginalResponse
+from requests import Response
 from io import StringIO
 from datetime import timedelta
 import re
-import numpy as np
 from bs4 import BeautifulSoup
 import sys
 import logging
-from typing import Any, Optional
+from typing import Any, Optional, Union
 if importlib.util.find_spec("polars"):
     POLARS_AVAILABLE = True
@@ -54,12 +25,10 @@ if importlib.util.find_spec("polars"):
 else:
     POLARS_AVAILABLE = False
-logging.basicConfig(
-    level=logging.INFO,
-    format="[%(asctime)s] %(levelname)s [%(name)s.%(funcName)s:%(lineno)d] %(message)s",
-    datefmt="%d/%b/%Y %H:%M:%S",
-    stream=sys.stdout,
-)
+from .utils import wavenumber_to_refractive_index, extract_state_from_response
+from ._version import version
+logger = logging.getLogger("ASDCache")
 ASDSchema = {
     "element": str,
@@ -92,20 +61,22 @@ ASDSchema = {
     "line_ref": str,
 }
-STATE_EXPR = r"spectra=([\w]+)\+?([IVX]+)?"
-"""Regex pattern for extracting (element,charge) tuple for a single-state query, which uses roman numerals."""
 SCI_EXPR = r"([+-]?\d*\.?\d+(?:[eE][+-]?\d+)?)"
 """Regex pattern for processing scientific notation"""
+class ASDQueryError(Exception):
+    """Exception raised when the NIST ASD has indicated an error with a query."""
 class SpectraCache:
     """A class acting as the entrypoint to retrieve data from the NIST Atomic Spectra Database that uses caching.
     The `SpectraCache` instance acts as an access point to the cache, which stores responses on the local system in a SQLite database.
-    Data retrieval from cache is much faster (order milliseconds) than fetching from the internet (order seconds), and avoids wastefull requests to the server.
+    Data retrieval from cache is much faster (order of milliseconds) than fetching from the internet (order of seconds to minutes), and avoids wasteful requests to the server.
-    Cache time-to-live is one week by default.
+    Cache time-to-live is two weeks by default.
     Since the NIST ASD is usually updated less frequently than that, this is a compromise between having the latest data, and overall fast performance.
@@ -115,14 +86,15 @@ class SpectraCache:
     nist_url = "https://physics.nist.gov/cgi-bin/ASD/lines1.pl"
     species_expr = re.compile(r"spectra=([\w\+\-\%3]+)&")
     query_params = {
+        "submit": "Retrieve Data",
         "unit": 1,
         "de": 0,
-        "plot_out": 0,
+        # "plot_out": 0,
         "I_scale_type": 1,
         "format": 3,
         "line_out": 0,
-        "remove_js": "on",
-        "no_spaces": "on",
+        # "remove_js": "on",
+        # "no_spaces": "on",
         "en_unit": 0,
         "output": 0,
         "bibrefs": 1,
@@ -143,54 +115,37 @@ class SpectraCache:
         "enrg_out": "on",
         "J_out": "on",
         "g_out": "on",
-        "diag_out": "on",
+        # "diag_out": "on",  # avoid diagnostic data, it leads to multi-species queries failing; which can appear as if keys below are needed. See issue #1
         "allowed_out": 1,
         "forbid_out": 1,
-        "submit": "Retrieve Data",
+        # "show_diff_obs_calc": 1, # Does not appear mandatory in retrospect,  see issue #1
+        # "include_Ritz_E1": 1, # Does not appear mandatory in retrospect,  see issue #1
     }
     """Request parameters used by the NIST ASD form."""
-    column_order = [
-        "element",
-        "sp_num",
-        "obs_wl_vac(nm)",
-        "unc_obs_wl",
-        "obs_wl_air(nm)",
-        "ritz_wl_vac(nm)",
-        "unc_ritz_wl",
-        "ritz_wl_air(nm)",
-        "wn(cm-1)",
-        "intens",
-        "Aki(s^-1)",
-        "fik",
-        "S(a.u.)",
-        "log_gf",
-        "Acc",
-        "Ei(cm-1)",
-        "Ek(cm-1)",
-        "conf_i",
-        "term_i",
-        "J_i",
-        "conf_k",
-        "term_k",
-        "J_k",
-        "g_i",
-        "g_k",
-        "Type",
-        "tp_ref",
-        "line_ref",
-    ]
-    """Fixed order of columns for consistent schema of data."""
-    def __init__(self, use_polars_backend=False, cache_expiry=timedelta(weeks=1), strict_matching=True):
-        """Initialize an instance that handles cached data lookup of the NIST ASD."""
+    def __init__(
+        self,
+        use_polars_backend=False,
+        cache_expiry=timedelta(weeks=2),
+        strict_matching=True,
+        cache_path: Optional[Path] = None,
+    ):
+        """Initialize an instance that handles cached data lookup of the NIST ASD.
+        Args:
+            use_polars_backend (bool): Flag to use polars as DataFrame backend, if available
+            cache_expiry (timedelta): Span of time beyond which an entry will be considered expired, and a refresh attempted
+            strict_matching (bool): If true, use all request parameters to hash urls for cache matching (recommended).
+            cache_path (Path, Optional): Path to a location to store the cache in
+        """
         self.strict_matching = strict_matching
         self.session = CachedSession(
-            "NIST_ASD_cache",
+            "NIST_ASD_cache" if cache_path is None else cache_path,
             use_cache_dir=True,
             expire_after=cache_expiry,
             stale_if_error=True,
             filter_fn=self._check_response_success,
-            ignored_parameters=list(self.query_params.keys()) if self.strict_matching is False else None,
+            ignored_parameters=list(self.query_params.keys()) if self.strict_matching is False else [],
         )
         if (use_polars_backend) & (not POLARS_AVAILABLE):
             warnings.warn("Cannot find `polars` as a backend, falling back to `pandas`", stacklevel=2)
@@ -209,7 +164,7 @@ class SpectraCache:
         """
         return self.session.settings.expire_after
-    def set_cache_expiry(self, new: timedelta = None, **kwargs):
+    def set_cache_expiry(self, new: Optional[timedelta] = None, **kwargs):
         """Set the cache expiry to a different interval (default: 1 week).
         Can be done by either passing in a `timedelta` object, or valid keyword arguments for `timedelta` itself.
@@ -219,12 +174,59 @@ class SpectraCache:
         self.session.settings.expire_after = new
     @staticmethod
-    def _check_response_success(response: "CachedResponse") -> bool:
+    def _check_response_success(response: Response) -> bool:
         """Validate that data has been fetched succesfully.
         If this check fails, the cache should not update with this response, even when marked as stale.
+        The first obvious way to check success is if an error is indicated by the HTTP status code.
+        However, when a query for data is incorrect, the NIST ASD returns an HTML page indicating `<title>NIST ASD : Input Error</title>` in the `<head>` tag, or "Error Message".
+        A successful query would not receive HTML as a response, but raw ASCII values instead.
+        We can thus check for the start of an HTML document.
+        Note that this only works for data queries, not for bibliographic metadata by `BibCache`.
+        """
+        return not (not response.ok or response.content.startswith(b"<!DOCTYPE"))
+    def _get_data(self, species: str, wl_range: tuple[float, float] = (170, 1000), **kwargs) -> Response:
+        """Retrieve raw, ASCII-formatted data from the NIST ASD with a GET request.
+        To retrieve data and parse it into a DataFrame, use [fetch][..] instead.
+        Returns the raw response, which will be cached if it contains valid data (see [_check_response_success][..]).
+        If the response does not contain ASCII data, but HTML instead, an [ASDQueryError][(m).] will be raised.
+        It is possible to override any standard query parameter (see [query_params][..]) by passing them as kwargs.
         """
-        return (response.status_code == 200) & (b"Error Message" not in response.content)
+        query_params = {
+            "spectra": species,
+            "output_type": 0,
+            "low_w": min(wl_range),
+            "upp_w": max(wl_range),
+            **{k: v for k, v in self.query_params.items() if k not in kwargs},
+            **{k: v for k, v in kwargs.items() if k in self.query_params},
+        }
+        response: Response = self.session.get(self.nist_url, params=query_params)
+        response.raise_for_status()
+        # Check whether the response is an HTML document instead of ASCII-formatted data, which indicates a query error.
+        if response.content.startswith(b"<!DOCTYPE"):
+            body = BeautifulSoup(response.text, features="html.parser").body
+            reason = body.text.strip().replace("\n", " ") if body else ""
+            logger.error(
+                "NIST ASD responded with %s instead of ASCII-data for species=%s, wl_range=%s\nQuery: %s",
+                reason,
+                species,
+                wl_range,
+                response.url,
+            )
+            raise ASDQueryError(
+                f"Query for {species=} {wl_range=} did not receive ASCII-data. {reason=} This means the ASD could not interpret your query. Check if your query is malformed."
+            )
+        return response
     @property
     def cached_species(self) -> list[str]:
@@ -239,35 +241,22 @@ class SpectraCache:
             for elem in self.species_expr.search(u).group(1).split("%3B")
         ]
-    def fetch(self, species, wl_range=(170, 1000), **kwargs) -> "pd.DataFrame|pl.DataFrame|CachedResponse":
+    def fetch(self, species, wl_range=(170, 1000)) -> "pd.DataFrame|pl.DataFrame":
         """Fetch information on a species from the ASD, first checking the cache.
-        This supports loading multiple species in one go by using the same notation as the NIST ASD page.
+        This supports loading multiple species in one go by using the same notation as the NIST ASD form.
         Note however that cache keys are computed for unique options for `species` and `wl_range`.
         This means that you won't get caching benefits by using different queries.
-        In other words: the cache cannot deduplicate queries such as `ASD.fetch('H', (200,1000))` followed by `ASD.fetch('H I', (650,660))`.
+        In other words: the cache cannot deduplicate queries such as `ASD.fetch('H', (200,1000))` followed by `ASD.fetch('H I', (650,660))` (or vice versa).
         Both these operations will fetch data online and be stored as separate cache entries.
         """
-        query_params = {
-            "spectra": species,
-            "output_type": 0,
-            "low_w": min(wl_range),
-            "upp_w": max(wl_range),
-            **self.query_params,
-        }
-        response = self.session.get(self.nist_url, params=query_params)
-        # if response.status_code == 200:
-        response.raise_for_status()
+        # TODO: add kwargs for read-only/offline access etc.
+        response = self._get_data(species, wl_range)
         return self.create_dataframe(response)
-        # else:
-        #     print(f"Error: Received status code {response.status_code}")
-        #     print(response.url)
-        #     return response
     def create_dataframe(self, response) -> "pd.DataFrame|pl.DataFrame":
         """Create a dataframe from the (cached) NIST ASD response, using the chosen backend at class instantiation."""
@@ -276,14 +265,14 @@ class SpectraCache:
         return self._from_pandas(response)
     @classmethod
-    def _from_pandas(cls, response: "CachedResponse") -> "pd.DataFrame":
+    def _from_pandas(cls, response: Response) -> "pd.DataFrame":
         r"""Transform a (cached) NIST ASD response into a pandas DataFrame.
         Calculates the air equivalent wavelength from the vacuum wavelength using the same Sellmeier equation as the NIST ASD.
         Note that this conversion is only performed for lines with $200 nm < \lambda < 2000 nm$, like the ASD.
-        For lines outside of this range, the conversion falls back to their vacuum wavelength.
+        For lines outside of this range, it uses NaN values.
         """
         schema = {
             "obs_wl_vac(nm)": str,
@@ -311,38 +300,48 @@ class SpectraCache:
             "": str,
         }
         df = pd.read_csv(StringIO(response.text), sep="\t", dtype=schema)
+        # Detect if pandas uses new `StringDtype`, or legacy `object` dtype for strings.
+        # This affects NaN handling for strings.
+        # Pandas 3.0 and up use the StringDtype, while pandas 2 can opt-in to this
+        # The 'Type' column should exist, 'element' may not.
+        uses_new_string_dtype = pd.api.types.is_string_dtype(df["Type"])
         for col in ["obs_wl_vac(nm)", "ritz_wl_vac(nm)", "intens", "Ei(cm-1)", "Ek(cm-1)"]:
             df[col] = df.loc[:, col].str.extract(SCI_EXPR).astype(float)
-        df["Type"] = df.loc[:, "Type"].astype(str).replace("nan", "E1")
+        # Any missing value implies line is an E1 (electric dipole) transition
+        if uses_new_string_dtype:
+            df["Type"] = df.loc[:, "Type"].fillna("E1")
+        else:
+            df["Type"] = df.loc[:, "Type"].astype(str).replace("nan", "E1")
         df["tp_ref"] = df.loc[:, "tp_ref"].fillna("")
-        df["obs_wl_air(nm)"] = df["obs_wl_vac(nm)"]
-        df["obs_wl_air(nm)"] = df[df["wn(cm-1)"].between(5000, 50000)]["obs_wl_air(nm)"] / cls.wn_to_n_refractive(
-            df[df["wn(cm-1)"].between(5000, 50000)]["wn(cm-1)"]
+        df["obs_wl_air(nm)"] = np.nan
+        air_equiv_range = df["wn(cm-1)"].between(5000, 50000)  # range where air wavelength is computed.
+        df["obs_wl_air(nm)"] = df.loc[air_equiv_range, "obs_wl_vac(nm)"] / wavenumber_to_refractive_index(
+            df.loc[air_equiv_range, "wn(cm-1)"]
         )
-        df["ritz_wl_air(nm)"] = df["ritz_wl_vac(nm)"]
-        df["ritz_wl_air(nm)"] = df[df["wn(cm-1)"].between(5000, 50000)]["ritz_wl_air(nm)"] / cls.wn_to_n_refractive(
-            df[df["wn(cm-1)"].between(5000, 50000)]["wn(cm-1)"]
+        df["ritz_wl_air(nm)"] = np.nan
+        df["ritz_wl_air(nm)"] = df.loc[air_equiv_range, "ritz_wl_vac(nm)"] / wavenumber_to_refractive_index(
+            df.loc[air_equiv_range, "wn(cm-1)"]
         )
         df = df.drop([c for c in df.columns if "Unnamed" in c], axis=1).reset_index(drop=True)
         if "element" not in df.columns:
-            element, numeral = re.search(STATE_EXPR, response.url).groups()
-            df["element"] = element
-            df["sp_num"] = numeral
             # cast roman numerals to int for consistency with queries with multiple ionization states, e.g. Ar I vs Ar I-II
-            df["sp_num"] = df["sp_num"].map(cls.roman_to_int)
+            # As 'element' and 'sp_num' columns are only missing for single-species queries, assign as constants, not vectors.
+            element, numeric = extract_state_from_response(response)
+            df["element"] = element
+            df["sp_num"] = numeric
         df["unc_obs_wl"] = pd.to_numeric(df["unc_obs_wl"]) if "unc_obs_wl" in df.columns else np.nan
         df["unc_ritz_wl"] = pd.to_numeric(df["unc_ritz_wl"]) if "unc_ritz_wl" in df.columns else np.nan
-        return df.loc[:, cls.column_order]
+        return df.loc[:, list(ASDSchema)]
     @classmethod
-    def _from_polars(cls, response: "CachedResponse") -> "pl.DataFrame":
+    def _from_polars(cls, response: Response) -> "pl.DataFrame":
         r"""Transform a (cached) NIST ASD response into a polars DataFrame.
         Calculates the air equivalent wavelength from the vacuum wavelength using the same Sellmeier equation as the NIST ASD.
         Note that this conversion is only performed for lines with $200 nm < \lambda < 2000 nm$, like the ASD.
-        For lines outside of this range, the conversion falls back to their vacuum wavelength.
+        For lines outside of this range, it uses NaN values.
         """
         schema = {
             "obs_wl_vac(nm)": pl.String,
@@ -366,97 +365,45 @@ class SpectraCache:
             "J_k": pl.String,
             "": pl.String,
         }
-        # annotation_chars_to_strip = "(?i)()[]?*w,bGhilmprsq:+xzgacHd "
-        df = (
-            pl.read_csv(
-                StringIO(response.text),
-                separator="\t",
-                schema_overrides=schema,
-                null_values="",
-            )
-            .with_columns(
-                pl.col("obs_wl_vac(nm)", "Ei(cm-1)", "Ek(cm-1)", "intens")
-                # .str.strip_chars(annotation_chars_to_strip).str.replace("&dagger;", "", literal=True)
-                .str.extract(SCI_EXPR)
-                # .str.extract(r"([+-]?\d*\.?\d+e[+-]?\d+)")
-                .replace("", None)
-                .cast(pl.Float64),
-                pl.col("ritz_wl_vac(nm)").str.strip_chars('"+*').replace("", None).cast(pl.Float64),
-                pl.col("S(a.u.)").cast(pl.Float64),
-                pl.col("Type").replace(None, "E1"),
-                pl.col("tp_ref").replace(None, ""),
-            )
-            .drop([""])
-        ).with_columns(
-            pl.when(pl.col("wn(cm-1)").is_between(5000, 50000))
-            .then(
-                pl.col("obs_wl_vac(nm)").cast(pl.Float64)
-                / pl.col("wn(cm-1)").map_elements(cls.wn_to_n_refractive, return_dtype=pl.Float64)
-            )
-            .otherwise(pl.col("obs_wl_vac(nm)"))
-            .cast(pl.Float64)
+        df = pl.read_csv(
+            StringIO(response.text),
+            separator="\t",
+            schema_overrides=schema,
+            null_values="",
+        )
+        sci_cols = ["obs_wl_vac(nm)", "Ei(cm-1)", "Ek(cm-1)", "intens", "ritz_wl_vac(nm)"]
+        cast_to_scientific_notation = [
+            pl.col(c).str.extract(SCI_EXPR).replace("", None).cast(pl.Float64).alias(c) for c in sci_cols
+        ]
+        df = df.with_columns(
+            *cast_to_scientific_notation,
+            pl.col("S(a.u.)").cast(pl.Float64),
+            pl.col("Type").replace(None, "E1"),
+            pl.col("tp_ref").replace(None, ""),
+        ).drop([""])
+        # compute air wavelengths between 5000 cm-1 and 50000 cm-1
+        air_equiv_range = pl.col("wn(cm-1)").is_between(5000, 50000)
+        df = df.with_columns(
+            pl.when(air_equiv_range)
+            .then(pl.col("obs_wl_vac(nm)") / wavenumber_to_refractive_index(pl.col("wn(cm-1)")))
+            .otherwise(np.nan)
             .alias("obs_wl_air(nm)"),
-            pl.when(pl.col("wn(cm-1)").is_between(5000, 50000))
-            .then(
-                pl.col("ritz_wl_vac(nm)").cast(pl.Float64)
-                / pl.col("wn(cm-1)").map_elements(cls.wn_to_n_refractive, return_dtype=pl.Float64)
-            )
-            .otherwise(pl.col("ritz_wl_vac(nm)"))
-            .cast(pl.Float64)
+            pl.when(air_equiv_range)
+            .then(pl.col("ritz_wl_vac(nm)") / wavenumber_to_refractive_index(pl.col("wn(cm-1)")))
+            .otherwise(np.nan)
             .alias("ritz_wl_air(nm)"),
         )
         if "element" not in df.columns:
-            element, numeral = re.search(STATE_EXPR, response.url).groups()
-            # cast roman numerals to int for consistency with queries with multiple ionization states, e.g. Ar I vs Ar I-II
-            df = df.with_columns(
-                pl.lit(element).alias("element"),
-                pl.lit("I" if numeral is None else numeral)
-                .cast(pl.String)
-                .alias("sp_num")
-                .map_elements(cls.roman_to_int, return_dtype=pl.Int64)
-                .first(),
-            )
-        df = df.with_columns(
-            unc_obs_wl=pl.col("unc_obs_wl") if "unc_obs_wl" in df.columns else None,
-            unc_ritz_wl=pl.col("unc_ritz_wl") if "unc_ritz_wl" in df.columns else None,
-        ).with_columns(pl.col("unc_obs_wl").cast(pl.Float64), pl.col("unc_ritz_wl").cast(pl.Float64))
-        return df.select(*cls.column_order)
-    @staticmethod
-    def roman_to_int(roman: str) -> int:
-        """Transform Roman numerals to integers.
-        Does only support numerals including up to `L`.
-        """
-        roman_numerals = {"I": 1, "V": 5, "X": 10, "L": 50}
-        total = 0
-        previous = 0
-        for char in reversed(roman):
-            current_value = roman_numerals[char]
-            if current_value < previous:
-                total -= current_value  # Subtract if the current value is less than the previous value
-            else:
-                total += current_value
-            previous = current_value
-        return total
-    @staticmethod
-    def wn_to_n_refractive(wavenumbers: float) -> float:
-        r"""Calculate the refractive index $n$ in air for a transition, using the 5-term Sellmeier formula used by NIST.
-        The used Sellmeier formula is the one from E.R. Peck and K. Reeder [J. Opt. Soc. Am. 62, 958 (1972)](http://dx.doi.org/10.1364/JOSA.62.000958).
-        This formula is fitted to data in the range of 185 nm to 1700 nm for  air at 15 °C, 101 325 Pa pressure, with 0.033 % CO2.
-        This is the same formula used by the NIST ASD to calculate air wavelengths in the interval of 200 nm to 2000 nm.
-        See also [the ASD documentation on the topic](https://physics.nist.gov/PhysRefData/ASD/Html/lineshelp.html#Conversion%20between%20air%20and%20vacuum%20wavelengths).
-        Using this refractive index, air equivalent wavelengths consistent with the ASD can be calculated, without the need to query them separately.
-        """
-        sigma = wavenumbers * 1e-4  # um^-1
-        return 1 + 1e-8 * (8060.51 + 2480990 / (132.274 - sigma**2) + 17455.7 / (39.32957 - sigma**2))
+            element, numeric = extract_state_from_response(response)
+            df = df.with_columns(pl.lit(element).alias("element"), pl.lit(numeric, dtype=pl.Int64).alias("sp_num"))
+        # Cast to float, or create column filled with `null` if missing.
+        exprs = [
+            (pl.col(c) if c in df.columns else pl.lit(None).alias(c)).cast(pl.Float64)
+            for c in ["unc_obs_wl", "unc_ritz_wl"]
+        ]
+        df = df.with_columns(exprs)
+        return df.select(*ASDSchema)
     def get_all_cached(self) -> "pd.DataFrame|pl.DataFrame":
         """Retrieve all cached data into a single dataframe."""
@@ -508,7 +455,7 @@ class BibCache:
         """
         return self.session.settings.expire_after
-    def set_cache_expiry(self, new: timedelta = None, **kwargs):
+    def set_cache_expiry(self, new: Optional[timedelta] = None, **kwargs):
         """Set the cache expiry to a different interval (default: 1 week).
         Can be done by either passing in a `timedelta` object, or valid keyword arguments for `timedelta` itself.
@@ -518,14 +465,14 @@ class BibCache:
         self.session.settings.expire_after = new
     @staticmethod
-    def _check_response_success(response: "CachedResponse") -> bool:
+    def _check_response_success(response: Response) -> bool:
         """Validate that data has been fetched succesfully.
         If this check fails, the cache should not update with this response, even when marked as stale.
         """
         is_success = (response.status_code == 200) & (b"There was a problem" not in response.content)
         if not is_success:
-            logging.warning(f"Request was unsuccesful status:{response.status_code} , url:{response.url}")
+            logger.warning(f"Request was unsuccesful status:{response.status_code} , url:{response.url}")
         return is_success
     @classmethod
@@ -533,17 +480,18 @@ class BibCache:
         r"""Parse a reference code from the NIST ASD into the constituent parts that can be used to look up references.
         Args:
-            * reference_code (str): A NIST ASD bibliographic reference string, such as `L13456n3`, or `T6936n`.
+            reference_code (str): A NIST ASD bibliographic reference string, such as `L13456n3`, or `T6936n`.
         Returns:
-            * db    (str)   :   A label for which bibliographic database to target
-            * ref   (str)   :   The database ID for the reference to look up
-            * comment (str) :   An additional comment included in the reference, can be fetched separately.
+            db (str):   A label for which bibliographic database to target
+            ref (str|None):   The database ID for the reference to look up
+            comment (str):   An additional comment included in the reference, can be fetched separately.
         """
         if reference_code.startswith("n"):
-            db, ref, comment = "T", None, "n"
-        elif (not reference_code.startswith("LS")) & (cls.reference_expr.match(reference_code) is not None):
-            db, ref, comment = cls.reference_expr.match(reference_code).groups()
+            return ("T", None, "n")
+        matched = cls.reference_expr.match(reference_code)
+        if (not reference_code.startswith("LS")) and (matched is not None):
+            db, ref, comment = matched.groups()
             comment = comment if "LS" not in reference_code else "LS"
         else:
             db, ref, comment = "T", None, "LS"
@@ -553,12 +501,12 @@ class BibCache:
         """Look up a reference code for a given element state.
         Args:
-            element (str)           :   The element name, e.g. `H`
-            sp_num (int)            :   The ionization state of the element, with 1 corresponding to the atom
-            reference_code (str)    :   The bibliographic reference code from the ASD columns `tp_ref` or `line_ref`.
+            element (str):   The element name, e.g. `H`
+            sp_num (int):   The ionization state of the element, with 1 corresponding to the atom
+            reference_code (str):   The bibliographic reference code from the ASD columns `tp_ref` or `line_ref`.
         Returns:
-            bib_data (dict)         : A dictionary containing bibliographic metadata for the reference, if available/applicable. Contains a url to look it up.
+            bib_data (dict[str,Any]): A dictionary containing bibliographic metadata for the reference, if available/applicable. Contains a url to look it up.
         """
         db, ref, comment = self.parse_reference_code(reference_code)
         params = {

asdcache-0.2.4/ASDCache/__init__.py ADDED Viewed

@@ -0,0 +1,61 @@
+r"""`ASDCache` is a package to fetch data from the NIST Atomic Spectra Database (ASD), utilizing caching for fast responses.
+To make the most use out of the cache, `ASDCache` is opinionated in the information it retrieves from the ASD; it always requests the same schema of information and locally computes additional fields.
+Data is initially fetched from the online published NIST page, using the tab-separated ASCII output format.
+The benefit of this format is that it is more 'machine readable' than the formatted ASCII of HTML options.
+This means it requires far less bespoke parsing to get rid of 'human readable' features such as repeated page column headers, or empty lines.
+## Air wavelength
+To ensure a consistent schema of the retrieved data, lines are always retrieved as a function of wavelength, using `vacuum wavelength`, even between 200 to 2000 nm.
+Wavenumbers and Ritz wavelength will be included in the response.
+In the range $5000\ \mathrm{cm}^{-1}<\nu<50000\ \mathrm{cm}^{-1}$ the air equivalent observed and Ritz wavelengths are calculated using the same Sellmeier equation as the NIST ASD (see [here][.utils.wavenumber_to_refractive_index]).
+This is consistent with the approach of the ASD.
+## Making use of the cache
+Each response from the NIST page is cached (2 weeks by default) on the local system.
+This makes it much faster to load the same data, even across different script runs and/or user programs/sessions.
+As an example: retrieving and parsing the data for all spectra between 200 and 1000 nm can take over 2 minutes without using the cache, but can be as fast as 0.2 seconds using the `polars` backend.
+In addition, it means that an internet connection is not required after initial data fetching.
+The cached response is only updated upon successful retrieval of a new response from the NIST page.
+If unable to successfully fetch new data, we fall back to a 'stale' cached response.
+The cache can be shared to another system, to give offline/airgapped systems access to the same data.
+To that end, the file `NIST_ASD_cache.sqlite` in the user's cache directory has to be copied over.
+### Default cache locations
+The standard cache directories are as follows:
+=== "Windows"
+    `%USERPROFILE%/AppData/Local`
+=== "Linux"
+    `~/.cache/http_cache/`
+=== "MacOS"
+    `/Users/user/Library/Caches/http_cache/`
+### Cache keys and uniqueness
+Queries to the NIST ASD are hashed by the keys (or parameters) of the requests.
+This means that any change to any of these parameters will result in a new cache entry, even if the returned data is equivalent.
+In other words: the cache cannot deduplicate queries such as `SpectraCache().fetch('H', (200,1000))` followed by `SpectraCache().fetch('H I', (650,660))` (or vice versa).
+It is often better (and faster) to fetch a range of data beyond what you need, and then filter down the dataframe you retrieve according to your needs.
+"""
+from .ASDCache import SpectraCache, BibCache
+__all__ = ["SpectraCache", "BibCache"]

asdcache-0.2.4/ASDCache/_version.py ADDED Viewed

@@ -0,0 +1,24 @@
+# file generated by vcs-versioning
+# don't change, don't track in version control
+from __future__ import annotations
+__all__ = [
+    "__version__",
+    "__version_tuple__",
+    "version",
+    "version_tuple",
+    "__commit_id__",
+    "commit_id",
+]
+version: str
+__version__: str
+__version_tuple__: tuple[int | str, ...]
+version_tuple: tuple[int | str, ...]
+commit_id: str | None
+__commit_id__: str | None
+__version__ = version = '0.2.4'
+__version_tuple__ = version_tuple = (0, 2, 4)
+__commit_id__ = commit_id = None

asdcache-0.2.4/ASDCache/utils.py ADDED Viewed

@@ -0,0 +1,66 @@
+"""Module containing small helper utility functions for extracting and processing input from the ASD."""
+import re
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from requests import Response
+ROMAN_NUMERALS = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
+STATE_EXPR = r"spectra=([\w]+)\+?([IVX]+)?"
+"""Regex pattern for extracting (element,charge) tuple for a single-state query, which uses roman numerals."""
+def roman_to_int(roman: str) -> int:
+    """Parse a Roman numeral into an integer.
+    Supports numerals up to "M".
+    """
+    roman = roman.upper().strip()
+    total = 0
+    previous = 0
+    for char in reversed(roman):
+        current_value = ROMAN_NUMERALS[char]
+        if current_value < previous:
+            total -= current_value  # Subtract if the current value is less than the previous value
+        else:
+            total += current_value
+        previous = current_value
+    return total
+def wavenumber_to_refractive_index(wavenumbers: float) -> float:
+    r"""Calculate the refractive index $n$ in air for a transition, using the 5-term Sellmeier formula used by NIST.
+    The used Sellmeier formula is the one from E.R. Peck and K. Reeder [J. Opt. Soc. Am. 62, 958 (1972)](http://dx.doi.org/10.1364/JOSA.62.000958).
+    This formula is fitted to data in the range of 185 nm to 1700 nm for  air at 15 °C, 101 325 Pa pressure, with 0.033 % CO2.
+    This is the same formula used by the NIST ASD to calculate air wavelengths in the interval of 200 nm to 2000 nm.
+    See also [the ASD documentation on the topic](https://physics.nist.gov/PhysRefData/ASD/Html/lineshelp.html#Conversion%20between%20air%20and%20vacuum%20wavelengths).
+    Using this refractive index, air equivalent wavelengths consistent with the ASD can be calculated, without the need to query them separately.
+    """
+    sigma = wavenumbers * 1e-4  # um^-1
+    return 1 + 1e-8 * (8060.51 + 2480990 / (132.274 - sigma**2) + 17455.7 / (39.32957 - sigma**2))
+def extract_state_from_response(response: "Response") -> tuple[str, int]:
+    """Extract the element and ionization state from the url of a response.
+    When querying only a single state, e.g. 'H I', this information will not be present as a column in data: the `element` and `sp_num` columns will not be included.
+    This information is parsed from the query url instead, so it can be added.
+    Since the `sp_num` column is of an integer type, the roman numerals in the url are converted to integers.
+    """
+    matched = re.search(STATE_EXPR, str(response.url))
+    if not matched:
+        raise ValueError(
+            "URL did not contain a `spectra` parameter satisfying %s; Could not identify element and sp_num",
+            STATE_EXPR,
+        )
+    element, numeral = matched.groups()
+    numeric: int = roman_to_int(numeral) if numeral else 1
+    return element, numeric

{asdcache-0.2.2 → asdcache-0.2.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ASDCache
-Version: 0.2.2
+Version: 0.2.4
 Summary: A Python module to retrieve data from the NIST Atomic Spectra Database (ASD), using caching for fast, efficient data handling
 Project-URL: Documentation, https://antoinetue.github.io/asdcache
 Project-URL: Source, https://github.com/AntoineTUE/asdcache
@@ -17,28 +17,30 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Classifier: Topic :: Scientific/Engineering
 Requires-Python: >=3.9
-Requires-Dist: bs4
-Requires-Dist: numpy
-Requires-Dist: pandas
-Requires-Dist: requests
-Requires-Dist: requests-cache
+Requires-Dist: beautifulsoup4>=4.12
+Requires-Dist: numpy>=1.20.3
+Requires-Dist: pandas>=2.0
+Requires-Dist: requests-cache>=1.2.0
 Provides-Extra: docs
-Requires-Dist: black; extra == 'docs'
-Requires-Dist: mkdocs; extra == 'docs'
+Requires-Dist: mkdocs-api-autonav; extra == 'docs'
 Requires-Dist: mkdocs-autorefs; extra == 'docs'
-Requires-Dist: mkdocs-gen-files; extra == 'docs'
 Requires-Dist: mkdocs-git-revision-date-localized-plugin; extra == 'docs'
 Requires-Dist: mkdocs-include-markdown-plugin; extra == 'docs'
-Requires-Dist: mkdocs-jupyter; extra == 'docs'
-Requires-Dist: mkdocs-literate-nav; extra == 'docs'
-Requires-Dist: mkdocs-material; extra == 'docs'
+Requires-Dist: mkdocs-jupyter>=0.26.3; extra == 'docs'
+Requires-Dist: mkdocs-material==9.7.6; extra == 'docs'
 Requires-Dist: mkdocs-section-index; extra == 'docs'
 Requires-Dist: mkdocstrings; extra == 'docs'
-Requires-Dist: mkdocstrings-python; extra == 'docs'
+Requires-Dist: mkdocstrings-python-xref>=2.1.1; extra == 'docs'
+Requires-Dist: properdocs>=1.6.7; extra == 'docs'
+Requires-Dist: pygments>=2.20.0; extra == 'docs'
+Requires-Dist: ruff>=0.15.13; extra == 'docs'
 Provides-Extra: polars
-Requires-Dist: polars; extra == 'polars'
+Requires-Dist: polars[pandas]; extra == 'polars'
+Provides-Extra: polars-compat
+Requires-Dist: polars[pandas,rtcompat]; extra == 'polars-compat'
 Description-Content-Type: text/markdown
 # ASDCache
@@ -51,7 +53,8 @@ Description-Content-Type: text/markdown
 [![GitHub Workflow Status docs](https://img.shields.io/github/actions/workflow/status/AntoineTUE/ASDCache/documentation.yml?label=Documentation%20build)](https://antoinetue.github.io/ASDCache)
 [![PyPI - Version](https://img.shields.io/pypi/v/ASDCache)](https://pypi.python.org/pypi/ASDCache)
 [![PyPI - Python versions](https://img.shields.io/pypi/pyversions/ASDCache.svg)](https://pypi.python.org/pypi/ASDCache)
-[![PyPI - Downloads](https://img.shields.io/pypi/dw/ASDCache)](https://pypistats.org/packages/ASDCache)
+[![PyPI - Downloads](https://img.shields.io/pypi/dm/ASDCache)](https://pypistats.org/packages/asdcache)
+[![Pepy Total Downloads](https://img.shields.io/pepy/dt/asdcache)](https://pepy.tech/projects/asdcache)
 [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
 [![Hatch project](https://img.shields.io/badge/%F0%9F%A5%9A-Hatch-4051b5.svg)](https://github.com/pypa/hatch)
@@ -69,7 +72,7 @@ The main goals and benefits of `ASDCache` are:
 - [x] Retrieve a consistent schema of the data that represents the 'human readable' format, but enforce strictly numeric data for important columns
     - [ ] This removes footnotes and other annotations, be sure to check the ASD itself as well for this information.
 - [x]  Use caching to dramatically speed up data retrieval, from minutes down to milliseconds in some cases
-    - [x] Cache time-to-live is 1 week by default, meaning you still get updates to the ASD in a reasonable time frame
+    - [x] Cache time-to-live is two weeks by default, meaning you still get updates to the ASD in a reasonable time frame
     - [x] The cache time-to-live can be adjusted
 - [x]  Cache data to allow working offline, or even transferring the ASD data to an offline system.
     - [x] The cache is only updated when a request for new data succeeds
@@ -78,6 +81,7 @@ The main goals and benefits of `ASDCache` are:
 `ASDCache` is not affiliated with NIST or the NIST ASD in any way, it simply tries to help make it more accessible.
 ## Installing
 `ASDCache` can be installed with `pip`.
 ```console
@@ -99,6 +103,7 @@ Installing the `polars` feature is not required, in case `polars` is already ins
 Documentation for `ASDCache` is available on [this page](https://antoinetue.github.io/ASDCache).
 ### Example
 A brief example below demonstrates how to use `SpectraCache` to query the NIST ASD for spectroscopic data for different species and plot their respective relative intensities.
 Note that these relative intensities are in principle not comparable between different species or sources and merely serve as a guide.

{asdcache-0.2.2 → asdcache-0.2.4}/pyproject.toml RENAMED Viewed

@@ -24,34 +24,35 @@ classifiers = [
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
     "Programming Language :: Python :: 3.13",
+    "Programming Language :: Python :: 3.14",
 ]
-dependencies = ["requests","requests_cache", "pandas","numpy", "bs4"]
+dependencies = ["requests_cache>=1.2.0", "pandas>=2.0","numpy>=1.20.3", "beautifulsoup4>=4.12"]
 dynamic = ["version"]
 [project.optional-dependencies]
-polars = ["polars"]
+polars = ["polars[pandas]"]
+polars-compat = ["polars[rtcompat,pandas]"]
 docs = [
-    "mkdocs",
+    "properdocs>=1.6.7",
+    "mkdocs-material==9.7.6",
     "mkdocs-autorefs",
-    "mkdocs-gen-files",
+    # "mkdocs-gen-files",
     "mkdocs-git-revision-date-localized-plugin",
     "mkdocs-include-markdown-plugin",
-    "mkdocs-jupyter",
-    "mkdocs-literate-nav",
-    "mkdocs-material",
+    "mkdocs-jupyter>=0.26.3",
+    # "mkdocs-literate-nav",
     "mkdocs-section-index",
     "mkdocstrings",
-    "mkdocstrings-python",
-    "black"
+    "mkdocstrings-python-xref>=2.1.1",
+    "mkdocs-api-autonav",
+    "ruff>=0.15.13",
+    "pygments>=2.20.0"
 ]
 [project.urls]
 Documentation = "https://antoinetue.github.io/asdcache"
 Source = "https://github.com/AntoineTUE/asdcache"
-[tool.hatch.metadata]
-# direct dependency references, e.g `pip @ git+https://github.com/pypa/pip.git@master`
-allow-direct-references = true
 [tool.hatch.version]
 source = "vcs"
@@ -70,7 +71,7 @@ exclude = ["/.github"]
 minversion = "6.0"
 addopts = "-ra -q --doctest-glob='*.md'"
 testpaths = ["tests"]
-markers = ["full: test using the full NIST ASD"]
+markers = ["online: run test that retrieve data online from the ASD"]
 [tool.coverage.run]
 branch = true
@@ -110,7 +111,7 @@ extend-exclude = ["docs/assets/scripts/gen_ref_pages.py"]
 [tool.ruff.lint]
 select = ["E4", "E7", "E9", "F","C4", "SIM", "NPY", "PD","B","UP","D"]
-ignore = ["PD901","F401"]
+ignore = ["F401"]
 [tool.ruff.lint.pydocstyle]
 convention = "pep257"
@@ -138,16 +139,6 @@ dependencies = ["matplotlib", "ipython","ipykernel","pre-commit"]
 installer = "uv"
 features = ["polars"]
-[tool.hatch.envs.test]
-dependencies = [
-    "coverage[toml]>=6.2",
-    "pytest",
-    "pytest-cov",
-    "pytest-mock",
-    "pytest-recording",
-    "pytest-sugar",
-    "hypothesis",
-]
 [tool.hatch.envs.hatch-test]
 randomize = false
@@ -155,6 +146,22 @@ parallel = false # avoid cache access conflicts
 retries = 2
 retry-delay = 1
 features = ["polars"]
+dependencies = [
+    "coverage-enable-subprocess==1.0",
+    'coverage[toml]>=6.2,<7.11; python_version<"3.10"',
+    'coverage[toml]~=7.11; python_version>="3.10"',
+    'pytest~=8.4; python_version<"3.10"',
+    'pytest~=9.0; python_version>="3.10"',
+    "pytest-mock~=3.12",
+    "pytest-randomly~=3.15",
+    "pytest-rerunfailures~=14.0",
+    "pytest-xdist[psutil]~=3.5",
+    'pytest-cov~=7.1.0; python_version>="3.10"',
+    "pytest-recording",
+    "pytest-sugar~=1.1.1",
+    "hypothesis",
+]
 [tool.hatch.envs.docs]
 skip-install = true
@@ -162,13 +169,13 @@ features = ["docs"]
 dependencies = ["mike"]
 [tool.hatch.envs.docs.scripts]
-serve = "mkdocs serve -f mkdocs.yml {args}"
-build = "mkdocs build --clean -f mkdocs.yml {args}"
-ci-build = "mike deploy --config-file mkdocs.yml --update-aliases {args}"
+serve = "properdocs serve -f mkdocs.yml {args}"
+build = "properdocs build --clean -f mkdocs.yml {args}"
+ci-build = "mike deploy --config-file mkdocs.yml {args}"
 [tool.hatch.envs.lint]
 template = "lint"
-dependencies = ["ruff>=0.7.0"]
+dependencies = ["ruff>=0.15.13"]
 [tool.hatch.envs.lint.scripts]
 style = [
@@ -182,5 +189,35 @@ fix = [
     "style",  # feedback on what is not fixable
 ]
+[tool.hatch.envs.hatch-test.overrides]
+matrix.pandas.dependencies = [
+    { value = "pandas>=2.0", if = ["pandas-2.x"] },
+    { value = "pandas>=3.0", if = ["pandas-3.x"] },
+]
+matrix.numpy.dependencies = [
+    { value = "numpy>=2.0", if = ["numpy-2.x"] },
+    {value = "numpy<2.0", if =  ["numpy-legacy"]},
+]
+matrix.polars.features = [
+    { value = "polars", if = ["polars"]},
+    { value = "polars-compat", if = ["polars-compat"]},
+]
+[[tool.hatch.envs.hatch-test.matrix]]
+python = ["3.9","3.10"]
+numpy = ["numpy-2.x","numpy-legacy"]
+pandas = ["pandas-2.x"]
+polars = ["polars","polars-compat"]
+[[tool.hatch.envs.hatch-test.matrix]]
+python = ["3.11","3.12","3.13", "3.14"]
+numpy = ["numpy-2.x","numpy-legacy"]
+pandas = ["pandas-2.x","pandas-3.x"]
+polars = ["polars","polars-compat"]
 [[tool.hatch.envs.hatch-test.matrix]]
-python = ["3.9", "3.10", "3.11", "3.12","3.13"]
+python = ["3.13", "3.14"]
+numpy = ["numpy-2.x"]
+pandas = ["pandas-2.x","pandas-3.x"]
+polars = ["polars","polars-compat"]

asdcache-0.2.2/ASDCache/__init__.py DELETED Viewed

@@ -1,10 +0,0 @@
-"""ASDCache is a module to retrieve data from the NIST Atomic Spectra Database that uses caching for fast local access.
-To make the most use out of the cache, `ASDCache` is opinionated in the information it retrieves from the ASD; it always requests the same schema of information and locally computes additional fields.
-The `SpectraCache` class acts as the entrypoint to retrieve this data.
-"""
-from .ASDCache import SpectraCache, BibCache
-__all__ = ["SpectraCache", "BibCache"]

asdcache-0.2.2/ASDCache/_version.py DELETED Viewed

@@ -1,21 +0,0 @@
-# file generated by setuptools-scm
-# don't change, don't track in version control
-__all__ = ["__version__", "__version_tuple__", "version", "version_tuple"]
-TYPE_CHECKING = False
-if TYPE_CHECKING:
-    from typing import Tuple
-    from typing import Union
-    VERSION_TUPLE = Tuple[Union[int, str], ...]
-else:
-    VERSION_TUPLE = object
-version: str
-__version__: str
-__version_tuple__: VERSION_TUPLE
-version_tuple: VERSION_TUPLE
-__version__ = version = '0.2.2'
-__version_tuple__ = version_tuple = (0, 2, 2)

{asdcache-0.2.2 → asdcache-0.2.4}/.gitignore RENAMED Viewed

File without changes

{asdcache-0.2.2 → asdcache-0.2.4}/LICENSE RENAMED Viewed

File without changes

ASDCache 0.2.2__tar.gz → 0.2.4__tar.gz

ASDCache 0.2.2tar.gz → 0.2.4tar.gz