PyPI - hydroserverpy - Versions diffs - 1.1.0b1__tar.gz → 1.1.2__tar.gz - Mend

hydroserverpy 1.1.0b1 (tar.gz) → 1.1.2 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of hydroserverpy might be problematic. Click here for more details.

Files changed (77)

{hydroserverpy-1.1.0b1/src/hydroserverpy.egg-info → hydroserverpy-1.1.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hydroserverpy
-Version: 1.1.0b1
+Version: 1.1.2
 Requires-Python: <4,>=3.9
 License-File: LICENSE
 Requires-Dist: requests>=2

{hydroserverpy-1.1.0b1 → hydroserverpy-1.1.2}/setup.cfg RENAMED Viewed

@@ -1,6 +1,6 @@
 [metadata]
 name = hydroserverpy
-version = 1.1.0b1
+version = 1.1.2
 [options]
 package_dir =

{hydroserverpy-1.1.0b1 → hydroserverpy-1.1.2}/src/hydroserverpy/api/services/iam/workspace.py RENAMED Viewed

@@ -178,7 +178,7 @@ class WorkspaceService(EndpointService):
                     None,
                     ...,
                 )
-                else None
+                else expires_at
             )
         }
         headers = {"Content-type": "application/json"}

{hydroserverpy-1.1.0b1 → hydroserverpy-1.1.2}/src/hydroserverpy/api/services/sta/datastream.py RENAMED Viewed

@@ -195,7 +195,7 @@ class DatastreamService(SensorThingsService):
                     None,
                     ...,
                 )
-                else None
+                else phenomenon_begin_time
             ),
             "phenomenonEndTime": (
                 phenomenon_end_time.isoformat()
@@ -204,7 +204,7 @@ class DatastreamService(SensorThingsService):
                     None,
                     ...,
                 )
-                else None
+                else phenomenon_end_time
             ),
             "resultBeginTime": (
                 result_begin_time.isoformat()
@@ -213,7 +213,7 @@ class DatastreamService(SensorThingsService):
                     None,
                     ...,
                 )
-                else None
+                else result_begin_time
             ),
             "resultEndTime": (
                 result_end_time.isoformat()
@@ -222,7 +222,7 @@ class DatastreamService(SensorThingsService):
                     None,
                     ...,
                 )
-                else None
+                else result_end_time
             ),
             "isPrivate": is_private,
             "isVisible": is_visible,

hydroserverpy-1.1.2/src/hydroserverpy/etl/extractors/http_extractor.py ADDED Viewed

@@ -0,0 +1,99 @@
+import logging
+from hydroserverpy.etl.types import TimeRange
+import requests
+from io import BytesIO
+from typing import Dict
+from .base import Extractor
+class HTTPExtractor(Extractor):
+    def __init__(self, settings: object):
+        self.url = settings["urlTemplate"]
+        # self.url = self.format_url(url, url_variables or {})
+        # self.params = settings.get('params', )
+        # self.headers = headers
+        # self.auth = auth
+    def prepare_params(self, data_requirements: Dict[str, TimeRange]):
+        pass
+        # TODO: Uncomment this once url templates work on in the Data Management App
+        # start_times = [
+        #     req["start_time"] for req in data_requirements.values() if req["start_time"]
+        # ]
+        # if start_times:
+        #     oldest_start_time = min(start_times)
+        #     start_time_key = self.params.pop("start_time_key", None)
+        #     if start_time_key:
+        #         self.params[start_time_key] = oldest_start_time
+        #         logging.info(
+        #             f"Set start_time to {oldest_start_time} and removed 'start_time_key'"
+        #         )
+        #     else:
+        #         logging.warning("'start_time_key' not found in params.")
+        # end_times = [
+        #     req["end_time"] for req in data_requirements.values() if req["end_time"]
+        # ]
+        # if end_times:
+        #     newest_end_time = max(end_times)
+        #     end_time_key = self.params.pop("end_time_key", None)
+        #     if end_time_key:
+        #         self.params[end_time_key] = newest_end_time
+        #         logging.info(
+        #             f"Set end_time to {newest_end_time} and removed 'end_time_key'"
+        #         )
+        #     else:
+        #         logging.warning("'end_time_key' not found in params.")
+    def extract(self):
+        """
+        Downloads the file from the HTTP/HTTPS server and returns a file-like object.
+        """
+        logging.info(f"Requesting data from → {self.url}")
+        # endpoints = [
+        #     "https://httpbin.org/get",
+        #     "https://jsonplaceholder.typicode.com/posts/1",
+        #     "https://api.github.com",
+        #     "https://api.ipify.org?format=json",
+        #     "https://www.python.org/",
+        #     "https://waterservices.usgs.gov/nwis/iv/?&format=json&sites=01646500&parameterCd=00060",
+        #     "https://datahub.io/core/country-list/r/data.csv",
+        #     "https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv",
+        #     # "https://rain-flow.slco.org/export/file/?delimiter=comma&site_id=68&data_start=2025-04-09&data_end=2025-05-09&device_id=2",
+        #     # "https://rain-flow.slco.org/export/file/?mime=txt&delimiter=comma&site_id=68&data_start=2025-05-09%2000:00:00&data_end=2025-05-09%2023:59:59&device_id=2"
+        # ]
+        # for url in endpoints:
+        #     try:
+        #         r = requests.get(url, timeout=10)
+        #         print(f"{url:50} → {r.status_code}")
+        #     except Exception as e:
+        #         print(f"{url:50} → ERROR: {e}")
+        try:
+            response = requests.get(self.url)
+        except Exception as e:
+            logging.error(f"Failed to fetch {repr(self.url)}: {e}")
+            raise
+        logging.info(f"Received response")
+        data = BytesIO()
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                data.write(chunk)
+        data.seek(0)
+        return data
+    @staticmethod
+    def format_url(url_template, url_variables):
+        try:
+            url = url_template.format(**url_variables)
+        except KeyError as e:
+            missing_key = e.args[0]
+            raise KeyError(f"Missing configuration url_variable: {missing_key}")
+        return url

hydroserverpy-1.1.2/src/hydroserverpy/etl/extractors/local_file_extractor.py ADDED Viewed

@@ -0,0 +1,29 @@
+import logging
+from typing import Dict
+from .base import Extractor
+from ..types import TimeRange
+class LocalFileExtractor(Extractor):
+    def __init__(self, settings: object):
+        if "path" not in settings:
+            message = "Missing required setting 'path' in LocalFileExtractor settings."
+            logging.error(message)
+            raise ValueError(message)
+        self.path = settings["path"]
+    def prepare_params(self, data_requirements: Dict[str, TimeRange]):
+        pass
+    def extract(self):
+        """
+        Opens the file and returns a file-like object.
+        """
+        try:
+            file_handle = open(self.path, "r")
+            logging.info(f"Successfully opened file '{self.path}'.")
+            return file_handle
+        except Exception as e:
+            logging.error(f"Error opening file '{self.path}': {e}")
+            return None

{hydroserverpy-1.1.0b1 → hydroserverpy-1.1.2}/src/hydroserverpy/etl/loaders/hydroserver_loader.py RENAMED Viewed

@@ -1,5 +1,8 @@
+import datetime
 from hydroserverpy import HydroServer
 from typing import Dict, Optional
+from hydroserverpy.etl.types import TimeRange
 from .base import Loader
 import logging
 import pandas as pd
@@ -13,20 +16,25 @@ class HydroServerLoader(HydroServer, Loader):
     def __init__(
         self,
         host: str,
-        username: Optional[str] = None,
+        email: Optional[str] = None,
         password: Optional[str] = None,
         apikey: Optional[str] = None,
-        api_route: str = "api",
     ):
-        super().__init__(host, username, password, apikey, api_route)
+        super().__init__(
+            host=host,
+            email=email,
+            password=password,
+            apikey=apikey,
+        )
-    def load(self, data: pd.DataFrame, source_target_map) -> None:
+    def load(self, data: pd.DataFrame, payload_settings) -> None:
         """
         Load observations from a DataFrame to the HydroServer.
         :param data: A Pandas DataFrame where each column corresponds to a datastream.
         """
-        data_requirements = self.get_data_requirements(source_target_map)
+        mappings = payload_settings["mappings"]
+        time_ranges = self.get_data_requirements(mappings)
         for ds_id in data.columns:
             if ds_id == "timestamp":
                 continue
@@ -35,9 +43,17 @@ class HydroServerLoader(HydroServer, Loader):
             df.rename(columns={ds_id: "value"}, inplace=True)
             df.dropna(subset=["value"], inplace=True)
-            phenomenon_end_time = data_requirements[ds_id]["start_time"]
-            if phenomenon_end_time:
-                df = df[df["timestamp"] > phenomenon_end_time]
+            # ensure the timestamp column is UTC‑aware
+            timestamp_column = df["timestamp"]
+            if timestamp_column.dt.tz is None:
+                df["timestamp"] = timestamp_column.dt.tz_localize("UTC")
+            time_range = time_ranges[ds_id]
+            start_ts = pd.to_datetime(time_range["start_time"], utc=True)
+            if start_ts:
+                df = df[df["timestamp"] > start_ts]
+            logging.info(f"start cutoff for data loading {start_ts}")
             if df.empty:
                 logging.warning(
                     f"No new data to upload for datastream {ds_id}. Skipping."
@@ -45,24 +61,31 @@ class HydroServerLoader(HydroServer, Loader):
                 continue
             self.datastreams.load_observations(uid=ds_id, observations=df)
-    def get_data_requirements(
-        self, source_target_map
-    ) -> Dict[str, Dict[str, pd.Timestamp]]:
+    def get_data_requirements(self, source_target_map) -> Dict[str, TimeRange]:
         """
         Each target system needs to be able to answer the question: 'What data do you need?'
         and return a time range for each target time series. Usually the answer will be
         'anything newer than my most recent observation'.
         """
         data_requirements = {}
-        for ds_id in source_target_map.values():
-            datastream = self.datastreams.get(uid=ds_id)
+        target_ids = [mapping["targetIdentifier"] for mapping in source_target_map]
+        for id in target_ids:
+            datastream = self.datastreams.get(uid=id)
             if not datastream:
                 message = "Couldn't fetch target datastream. ETL process aborted."
                 logging.error(message)
                 raise message
-            start_time = pd.Timestamp(
+            start_ts = pd.Timestamp(
                 datastream.phenomenon_end_time or "1970-01-01T00:00:00Z"
             )
-            end_time = pd.Timestamp.now()
-            data_requirements[ds_id] = {"start_time": start_time, "end_time": end_time}
+            if start_ts.tzinfo is None:
+                start_ts = start_ts.tz_localize("UTC")
+            end_ts = pd.Timestamp.now(tz="UTC")
+            data_requirements[id] = {
+                "start_time": start_ts.isoformat(),
+                "end_time": end_ts.isoformat(),
+            }
         return data_requirements

hydroserverpy-1.1.2/src/hydroserverpy/etl/transformers/base.py ADDED Viewed

@@ -0,0 +1,117 @@
+from abc import ABC, abstractmethod
+from datetime import timedelta, timezone
+import logging
+from typing import Union
+import pandas as pd
+class Transformer(ABC):
+    def __init__(self, settings: object):
+        # timestampFormat will be the strs: 'utc', 'ISO8601', 'constant', or some custom openStrftime.
+        # If 'constant', then the system will append the timestamp_offset to the end of it.
+        self.timestamp_format = settings.get("timestampFormat", "ISO8601")
+        self.timestamp_offset: str = settings.get("timestampOffset", "+0000")
+        self.timestamp_key: Union[str, int] = settings["timestampKey"]
+        if isinstance(self.timestamp_key, int):
+            # Users will always interact in 1-based, so if the key is a column index, convert to 0-based
+            self.timestamp_key = self.timestamp_key - 1
+    @abstractmethod
+    def transform(self, *args, **kwargs) -> None:
+        pass
+    @property
+    def needs_datastreams(self) -> bool:
+        return False
+    def standardize_dataframe(self, df, payload_mappings):
+        rename_map = {
+            mapping["sourceIdentifier"]: mapping["targetIdentifier"]
+            for mapping in payload_mappings
+        }
+        df.rename(
+            columns={self.timestamp_key: "timestamp", **rename_map},
+            inplace=True,
+        )
+        # Verify timestamp column is present in the DataFrame
+        if "timestamp" not in df.columns:
+            message = f"Timestamp column '{self.timestamp_key}' not found in data."
+            logging.error(message)
+            raise ValueError(message)
+        # verify datastream columns
+        expected = set(rename_map.values())
+        missing = expected - set(df.columns)
+        if missing:
+            raise ValueError(
+                "The following datastream IDs are specified in the config file but their related keys could not be "
+                f"found in the source system's extracted data: {missing}"
+            )
+        # keep only timestamp + datastream columns; remove the rest inplace
+        to_keep = ["timestamp", *expected]
+        df.drop(columns=df.columns.difference(to_keep), inplace=True)
+        df["timestamp"] = self._parse_timestamps(df["timestamp"])
+        df.drop_duplicates(subset=["timestamp"], keep="last")
+        logging.info(f"standardized dataframe created: {df.shape}")
+        logging.info(f"{df.info()}")
+        logging.info(f"{df.head()}")
+        return df
+    def _parse_timestamps(self, raw_series: pd.Series) -> pd.Series:
+        """Return a Series of pandas UTC datetimes for the four supported modes."""
+        logging.info(f"parsing timestamps. Format: {self.timestamp_format}")
+        fmt = self.timestamp_format.lower()
+        VALID_KEYS = {"utc", "iso8601", "constant"}
+        if fmt not in VALID_KEYS and "%" not in self.timestamp_format:
+            raise ValueError(
+                f"timestamp_format must be one of {', '.join(VALID_KEYS)} "
+                "or a valid strftime pattern."
+            )
+        series = raw_series.str.strip()
+        if fmt == "utc":
+            # Accept Z-suffix, no offset, fractional seconds, etc.
+            parsed = pd.to_datetime(series, utc=True, errors="coerce")
+        elif fmt == "iso8601":
+            # pandas reads the embedded offset, then we shift to UTC
+            parsed = pd.to_datetime(series, errors="coerce").dt.tz_convert("UTC")
+        elif fmt == "constant":
+            offset = str(self.timestamp_offset).strip()
+            if not (len(offset) == 5 and offset[0] in "+-"):
+                raise ValueError(f"Invalid timestampOffset: {self.timestamp_offset}")
+            sign_multiplier = 1 if offset[0] == "+" else -1
+            hours = int(offset[1:3])
+            minutes = int(offset[3:5])
+            total_minutes = sign_multiplier * (hours * 60 + minutes)
+            local_timezone = timezone(timedelta(minutes=total_minutes))
+            naive_times = pd.to_datetime(series, errors="coerce")
+            localized_times = naive_times.dt.tz_localize(local_timezone)
+            parsed = localized_times.dt.tz_convert("UTC")
+        else:
+            logging.info(f"timestamp format is custom {self.timestamp_format}")
+            parsed = pd.to_datetime(
+                series, format=self.timestamp_format, errors="coerce"
+            ).dt.tz_localize("UTC")
+        if parsed.isna().any():
+            bad_rows = series[parsed.isna()].head(5).tolist()
+            logging.warning(
+                f"{parsed.isna().sum()} timestamps failed to parse. Sample bad values: {bad_rows}"
+            )
+        return parsed

hydroserverpy-1.1.2/src/hydroserverpy/etl/transformers/csv_transformer.py ADDED Viewed

@@ -0,0 +1,77 @@
+from io import StringIO
+import logging
+import pandas as pd
+from typing import Iterable, Union
+from .base import Transformer
+class CSVTransformer(Transformer):
+    def __init__(self, settings: object):
+        super().__init__(settings)
+        # Pandas is zero-based while CSV is one-based so convert
+        self.header_row = (
+            None if settings.get("headerRow") is None else settings["headerRow"] - 1
+        )
+        self.data_start_row = (
+            settings["dataStartRow"] - 1 if "dataStartRow" in settings else 0
+        )
+        self.delimiter = settings.get("delimiter", ",")
+        self.identifier_type = settings.get("identifierType", "name")
+    def transform(self, data_file, mappings) -> Union[pd.DataFrame, None]:
+        """
+        Transforms a CSV file-like object into a Pandas DataFrame where the column
+        names are replaced with their target datastream ids.
+        Parameters:
+            data_file: File-like object containing CSV data.
+        Returns:
+            observations_map (dict): Dict mapping datastream IDs to pandas DataFrames.
+        """
+        clean_file = self._strip_comments(data_file)
+        source_identifiers = [mapping["sourceIdentifier"] for mapping in mappings]
+        try:
+            # Pandas’ heuristics strip offsets and silently coerce failures to strings.
+            # Reading as pure text guarantees we always start with exactly what was in the file.
+            # Timestamps will be parsed at df standardization time.
+            df = pd.read_csv(
+                clean_file,
+                sep=self.delimiter,
+                header=self.header_row,
+                skiprows=self._build_skiprows(),
+                usecols=[self.timestamp_key] + source_identifiers,
+                dtype={self.timestamp_key: "string"},
+            )
+            logging.info(f"CSV file read into dataframe: {df.shape}")
+        except Exception as e:
+            logging.error(f"Error reading CSV data: {e}")
+            return None
+        if self.header_row is None:
+            df.columns = list(range(1, len(df.columns) + 1))
+        return self.standardize_dataframe(df, mappings)
+    def _strip_comments(self, stream: Iterable[Union[str, bytes]]) -> StringIO:
+        """
+        Remove lines whose first non-blank char is '#'.
+        Works for both text and binary iterables.
+        """
+        clean: list[str] = []
+        for raw in stream:
+            # normalize to bytes
+            b = raw if isinstance(raw, bytes) else raw.encode("utf-8", "ignore")
+            if b.lstrip().startswith(b"#"):
+                continue
+            clean.append(
+                raw.decode("utf-8", "ignore") if isinstance(raw, bytes) else raw
+            )
+        return StringIO("".join(clean))
+    def _build_skiprows(self):
+        return lambda idx: idx != self.header_row and idx < self.data_start_row

{hydroserverpy-1.1.0b1 → hydroserverpy-1.1.2}/src/hydroserverpy/etl/transformers/json_transformer.py RENAMED Viewed

@@ -7,27 +7,11 @@ import jmespath
 class JSONTransformer(Transformer):
-    def __init__(
-        self,
-        query_string: str,
-        datastream_ids: Dict[str, str],
-        timestamp_format: Optional[str] = "ISO8601",
-    ):
-        """
-        Initializes the JSONTransformer.
-        Parameters:
-            query_string (str): JMESPath to the data array containing time series data.
-            Since JMESPath can natively rename column names, the assumption is the timestamp column
-            is always named 'timestamp' or converted to 'timestamp' in the JMESPath query.
-            datastream_ids (dict): Mapping from JSON field names to datastream IDs.
-            timestamp_format (str, optional): The format of the timestamp, if it needs special parsing.
-        """
-        self.query_string = query_string
-        self.datastream_ids = datastream_ids
-        self.timestamp_format = timestamp_format
+    def __init__(self, settings: object):
+        super().__init__(settings)
+        self.JMESPath = settings["JMESPath"]
-    def transform(self, data_file):
+    def transform(self, data_file, mappings):
         """
         Transforms a JSON file-like object into the standard Pandas dataframe format.
         Since JMESPath can natively rename column names, the assumption is the timestamp column
@@ -47,15 +31,11 @@ class JSONTransformer(Transformer):
         df = pd.DataFrame(data_points)
-        return self.standardize_dataframe(
-            df,
-            self.datastream_ids,
-            timestamp_format=self.timestamp_format,
-        )
+        return self.standardize_dataframe(df, mappings)
     def extract_data_points(self, json_data: Any) -> Optional[List[dict]]:
         """Extracts data points from the JSON data using the data_path."""
-        data_points = jmespath.search(self.query_string, json_data)
+        data_points = jmespath.search(self.JMESPath, json_data)
         if isinstance(data_points, dict):
             data_points = [data_points]

{hydroserverpy-1.1.0b1 → hydroserverpy-1.1.2/src/hydroserverpy.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hydroserverpy
-Version: 1.1.0b1
+Version: 1.1.2
 Requires-Python: <4,>=3.9
 License-File: LICENSE
 Requires-Dist: requests>=2

hydroserverpy-1.1.0b1/src/hydroserverpy/etl/extractors/http_extractor.py DELETED Viewed

@@ -1,84 +0,0 @@
-import logging
-from hydroserverpy.etl.types import TimeRange
-import requests
-from io import BytesIO
-from typing import Dict
-from .base import Extractor
-class HTTPExtractor(Extractor):
-    def __init__(
-        self,
-        url: str,
-        url_variables: dict = None,
-        params: dict = None,
-        headers: dict = None,
-        auth: tuple = None,
-    ):
-        self.url = self.format_url(url, url_variables or {})
-        self.params = params
-        self.headers = headers
-        self.auth = auth
-        self.start_date = None
-    def prepare_params(self, data_requirements: Dict[str, TimeRange]):
-        start_times = [
-            req["start_time"] for req in data_requirements.values() if req["start_time"]
-        ]
-        if start_times:
-            oldest_start_time = min(start_times).isoformat()
-            start_time_key = self.params.pop("start_time_key", None)
-            if start_time_key:
-                self.params[start_time_key] = oldest_start_time
-                logging.info(
-                    f"Set start_time to {oldest_start_time} and removed 'start_time_key'"
-                )
-            else:
-                logging.warning("'start_time_key' not found in params.")
-        end_times = [
-            req["end_time"] for req in data_requirements.values() if req["end_time"]
-        ]
-        if end_times:
-            newest_end_time = max(end_times).isoformat()
-            end_time_key = self.params.pop("end_time_key", None)
-            if end_time_key:
-                self.params[end_time_key] = newest_end_time
-                logging.info(
-                    f"Set end_time to {newest_end_time} and removed 'end_time_key'"
-                )
-            else:
-                logging.warning("'end_time_key' not found in params.")
-    def extract(self):
-        """
-        Downloads the file from the HTTP/HTTPS server and returns a file-like object.
-        """
-        response = requests.get(
-            url=self.url,
-            params=self.params,
-            headers=self.headers,
-            auth=self.auth,
-            stream=True,
-        )
-        response.raise_for_status()
-        logging.info(f"Successfully downloaded file from {response.url}")
-        data = BytesIO()
-        for chunk in response.iter_content(chunk_size=8192):
-            if chunk:
-                data.write(chunk)
-        data.seek(0)
-        return data
-    @staticmethod
-    def format_url(url_template, url_variables):
-        try:
-            url = url_template.format(**url_variables)
-        except KeyError as e:
-            missing_key = e.args[0]
-            raise KeyError(f"Missing configuration url_variable: {missing_key}")
-        return url

hydroserverpy-1.1.0b1/src/hydroserverpy/etl/extractors/local_file_extractor.py DELETED Viewed

@@ -1,25 +0,0 @@
-import logging
-from typing import Dict
-from .base import Extractor
-from ..types import TimeRange
-class LocalFileExtractor(Extractor):
-    def __init__(self, filepath: str):
-        self.filepath = filepath
-    def prepare_params(self, data_requirements: Dict[str, TimeRange]):
-        pass
-    def extract(self):
-        """
-        Opens the file and returns a file-like object.
-        """
-        try:
-            file_handle = open(self.filepath, "r")
-            logging.info(f"Successfully opened file '{self.filepath}'.")
-            return file_handle
-        except Exception as e:
-            logging.error(f"Error opening file '{self.filepath}': {e}")
-            return None

hydroserverpy-1.1.0b1/src/hydroserverpy/etl/transformers/base.py DELETED Viewed

@@ -1,52 +0,0 @@
-from abc import ABC, abstractmethod
-import logging
-import pandas as pd
-class Transformer(ABC):
-    @abstractmethod
-    def transform(self, *args, **kwargs) -> None:
-        pass
-    @property
-    def needs_datastreams(self) -> bool:
-        return False
-    @staticmethod
-    def standardize_dataframe(
-        df,
-        datastream_ids,
-        timestamp_column: str = "timestamp",
-        timestamp_format: str = "ISO8601",
-    ):
-        df.rename(
-            columns={timestamp_column: "timestamp", **datastream_ids},
-            inplace=True,
-        )
-        # Verify timestamp column is present in the DataFrame
-        if "timestamp" not in df.columns:
-            message = f"Timestamp column '{timestamp_column}' not found in data."
-            logging.error(message)
-            raise ValueError(message)
-        # Verify that all datastream_ids are present in the DataFrame
-        expected_columns = set(datastream_ids.values())
-        actual_columns = set(df.columns)
-        missing_datastream_ids = expected_columns - actual_columns
-        if missing_datastream_ids:
-            raise ValueError(
-                "The following datastream IDs are specified in the config file but their related keys could not be "
-                f"found in the source system's extracted data: {missing_datastream_ids}"
-            )
-        # Keep only 'timestamp' and datastream_id columns
-        columns_to_keep = ["timestamp"] + list(expected_columns)
-        df = df[columns_to_keep]
-        # Convert timestamp column to datetime if not already
-        if not pd.api.types.is_datetime64_any_dtype(df["timestamp"]):
-            df["timestamp"] = pd.to_datetime(df["timestamp"], format=timestamp_format)
-        return df

hydroserverpy-1.1.0b1/src/hydroserverpy/etl/transformers/csv_transformer.py DELETED Viewed

@@ -1,88 +0,0 @@
-import logging
-import pandas as pd
-from typing import Dict, Optional, Union
-from .base import Transformer
-class CSVTransformer(Transformer):
-    def __init__(
-        self,
-        header_row: Optional[int],
-        data_start_row: int,
-        timestamp_column: Union[str, int],
-        datastream_ids: Dict[Union[str, int], str],
-        delimiter: Optional[str] = ",",
-        timestamp_format: Optional[str] = "ISO8601",
-    ):
-        # Pandas is zero-based while CSV is one-based so convert
-        self.header_row = None if header_row is None else header_row - 1
-        self.data_start_row = data_start_row - 1
-        self.timestamp_column = self.convert_to_zero_based(timestamp_column)
-        self.datastream_ids = datastream_ids
-        self.timestamp_format = timestamp_format
-        self.delimiter = delimiter
-    def transform(self, data_file) -> Union[pd.DataFrame, None]:
-        """
-        Transforms a CSV file-like object into a Pandas DataFrame where the column
-        names are replaced with their target datastream ids.
-        Parameters:
-            data_file: File-like object containing CSV data.
-        Returns:
-            observations_map (dict): Dict mapping datastream IDs to pandas DataFrames.
-        """
-        try:
-            df = pd.read_csv(
-                data_file,
-                delimiter=self.delimiter,
-                header=self.header_row,
-                parse_dates=[self.timestamp_column],
-                date_format=self.timestamp_format,
-                skiprows=self.calculate_skiprows(),
-                usecols=[self.timestamp_column] + list(self.datastream_ids.keys()),
-            )
-        except Exception as e:
-            logging.error(f"Error reading CSV data: {e}")
-            return None
-        if self.header_row is None:
-            df.columns = list(range(1, len(df.columns) + 1))
-        return self.standardize_dataframe(
-            df, self.datastream_ids, self.timestamp_column, self.timestamp_format
-        )
-    def calculate_skiprows(self):
-        """
-        Calculates the skiprows parameter for pd.read_csv.
-        Returns:
-            skiprows (list or None): List of row indices to skip, or None if no rows need to be skipped.
-        Raises:
-            ValueError: If header_row is not compatible with data_start_row.
-        """
-        if self.data_start_row == 0:
-            if self.header_row is not None:
-                # Cannot have a header row if data starts at the first row
-                raise ValueError(
-                    "header_row must be None when data_start_row is 1 (first row)"
-                )
-            return None  # No rows to skip
-        skiprows = list(range(self.data_start_row))
-        if self.header_row is not None:
-            if self.header_row >= self.data_start_row:
-                raise ValueError("header_row must be less than data_start_row")
-            if self.header_row in skiprows:
-                # Do not skip the header row
-                skiprows.remove(self.header_row)
-        return skiprows
-    @staticmethod
-    def convert_to_zero_based(index: Union[str, int]) -> Union[str, int]:
-        if isinstance(index, int):
-            return index - 1
-        return index