direl-ts-tool-kit 0.3.0__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: direl-ts-tool-kit
3
- Version: 0.3.0
3
+ Version: 0.4.1
4
4
  Summary: A toolbox for time series analysis and visualization.
5
5
  Home-page: https://gitlab.com/direl/direl_tool_kit
6
6
  Author: Diego Restrepo-Leal
@@ -100,6 +100,10 @@ def plot_time_series(
100
100
  if time_unit == "Day":
101
101
  ax.xaxis.set_major_locator(mdates.DayLocator())
102
102
  ax.xaxis.set_minor_locator(mdates.HourLocator())
103
+
104
+ if time_unit == "Hour":
105
+ ax.xaxis.set_major_locator(mdates.HourLocator())
106
+ ax.xaxis.set_minor_locator(mdates.MinuteLocator())
103
107
 
104
108
  ax.tick_params(axis="x", rotation=rot)
105
109
  ax.grid(which="both")
@@ -1,12 +1,13 @@
1
+ import numpy as np
1
2
  import pandas as pd
2
3
 
3
4
 
4
- def parse_datetime_index(df_raw, date_column="date"):
5
+ def parse_datetime_index(df_raw, date_column="date", format=None):
5
6
  """
6
7
  Parses a specified column into datetime objects and sets it as the DataFrame index.
7
8
 
8
- This function is crucial for preparing raw data (df_raw) for time series analysis
9
- by ensuring the DataFrame is indexed by the correct datetime type.
9
+ This function prepares raw data for time series analysis by ensuring the
10
+ DataFrame is indexed by the correct datetime type.
10
11
 
11
12
  Parameters
12
13
  ----------
@@ -15,17 +16,24 @@ def parse_datetime_index(df_raw, date_column="date"):
15
16
  date_column : str, optional
16
17
  The name of the column in 'df_raw' that contains the date/time information.
17
18
  Defaults to "date".
19
+ format : str, optional
20
+ The explicit format string (e.g., '%Y%m%d', '%Y-%m-%d %H:%M:%S')
21
+ to parse the dates, passed to `pd.to_datetime`. If None (default),
22
+ Pandas attempts to infer the format automatically.
18
23
 
19
24
  Returns
20
25
  -------
21
26
  df_ts : pd.DataFrame
22
27
  A copy of the original DataFrame with the specified date column removed
23
- and set as the DatetimeIndex. Ready for time series plotting.
28
+ and set as the DatetimeIndex. The returned DataFrame is ready for
29
+ time series operations.
24
30
  """
31
+ if not format:
32
+ date_parsed = pd.to_datetime(df_raw[date_column])
33
+ else:
34
+ date_parsed = pd.to_datetime(df_raw[date_column], format=format)
25
35
 
26
- date_parsed = pd.to_datetime(df_raw[date_column])
27
36
  df_ts = df_raw.copy()
28
- original_dates = df_raw[date_column]
29
37
  df_ts.drop(columns=[date_column], inplace=True)
30
38
  df_ts.set_index(date_parsed, inplace=True)
31
39
 
@@ -116,3 +124,38 @@ def reindex_and_aggregate(df_ts, column_name, freq="MS"):
116
124
  df_ts_new.notnull().apply(pd.Series.value_counts)
117
125
 
118
126
  return df_ts_new
127
+
128
+
129
+ def remove_outliers_by_threshold(df_ts, column_name, lower_bound, upper_bound):
130
+ """
131
+ Replaces values in a specified column with NaN if they fall outside
132
+ a defined range (outlier removal).
133
+
134
+ This function identifies data points that are either below the lower
135
+ bound or above the upper bound and treats them as missing data.
136
+
137
+ Parameters
138
+ ----------
139
+ df_ts : pd.DataFrame
140
+ The time series DataFrame (must have a DatetimeIndex).
141
+ column_name : str
142
+ The name of the column where outlier detection will be performed (e.g., 'Temperature').
143
+ lower_bound : float or int
144
+ The minimum acceptable value. Values strictly below this bound are replaced by NaN.
145
+ upper_bound : float or int
146
+ The maximum acceptable value. Values strictly above this bound are replaced by NaN.
147
+
148
+ Returns
149
+ -------
150
+ pd.DataFrame
151
+ The DataFrame with outlier values in the specified column replaced by np.nan.
152
+ """
153
+ df_out = df_ts.copy()
154
+
155
+ outlier_index = df_out[
156
+ (df_out[column_name] < lower_bound) | (df_out[column_name] > upper_bound)
157
+ ].index
158
+
159
+ df_out.loc[outlier_index, column_name] = np.nan
160
+
161
+ return df_out
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: direl-ts-tool-kit
3
- Version: 0.3.0
3
+ Version: 0.4.1
4
4
  Summary: A toolbox for time series analysis and visualization.
5
5
  Home-page: https://gitlab.com/direl/direl_tool_kit
6
6
  Author: Diego Restrepo-Leal
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name="direl-ts-tool-kit",
5
- version="0.3.0",
5
+ version="0.4.1",
6
6
  description="A toolbox for time series analysis and visualization.",
7
7
  long_description=open("README.md", encoding="utf-8").read(),
8
8
  long_description_content_type="text/markdown",