tseda 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tseda/change_point/change_point_estimator.py +77 -0
- tseda/config/prompts_config.csv +5 -0
- tseda/data_writers/kmds_writer.py +58 -0
- tseda/dataloader/coffee_prices_data_loader.py +30 -0
- tseda/dataloader/kaggle_data_loader.py +25 -0
- tseda/dataloader/kmds_data_loader.py +82 -0
- tseda/dataloader/local_dataloader.py +21 -0
- tseda/dataloader/synthetic_series_data_loader.py +39 -0
- tseda/dataloader/white_noise_data_loader.py +26 -0
- tseda/decomposition/ssa_decomposition.py +401 -0
- tseda/decomposition/ssa_result_summary.py +258 -0
- tseda/images/tseda_workflow.png +0 -0
- tseda/periodicity/fft_analyzer.py +60 -0
- tseda/series_stats/sampling_prop.py +131 -0
- tseda/series_stats/summary_statistics.py +23 -0
- tseda/user_interface/analysis.py +163 -0
- tseda/user_interface/callback_services.py +296 -0
- tseda/user_interface/components/analysis_assessment.py +157 -0
- tseda/user_interface/components/initial_eval_components.py +491 -0
- tseda/user_interface/gemini_chat.py +45 -0
- tseda/user_interface/initial_assessment.py +36 -0
- tseda/user_interface/initial_assessment_layout.py +95 -0
- tseda/user_interface/kmds_capture.py +426 -0
- tseda/user_interface/ts_analyze_ui.py +616 -0
- tseda/visualization/autocorrelation_vis.py +51 -0
- tseda/visualization/series_histogram_visualizer.py +44 -0
- tseda/visualization/series_kde_visualizer.py +92 -0
- tseda/visualization/series_visualizer.py +66 -0
- tseda-0.1.0.dist-info/METADATA +145 -0
- tseda-0.1.0.dist-info/RECORD +32 -0
- tseda-0.1.0.dist-info/WHEEL +5 -0
- tseda-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import ruptures as rpt
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PELT_ChangePointEstimator:
    """Detect change points with PELT and expose one segment label per observation.

    Detection runs once, at construction time, with a penalty of
    ``2 * log(n)`` where ``n`` is the number of observations.
    """

    def __init__(self, series: pd.Series, model: str = "rbf") -> None:
        """Fit PELT on ``series`` and cache the resulting segment labels.

        Args:
            series: Non-empty series of observations to segment.
            model: Cost model name handed to ``rpt.Pelt``.

        Raises:
            ValueError: If ``series`` is ``None`` or has no observations.
        """
        if series is None or not len(series):
            raise ValueError("Input series must be a non-empty pandas Series.")

        n_obs = len(series)
        self._series = series
        self._model = model
        self._n = n_obs
        self._penalty = float(2 * np.log(n_obs))

        # The values are reshaped into a single-column 2-D array before fitting.
        signal = series.to_numpy().reshape(-1, 1)
        self._algo = rpt.Pelt(model=model).fit(signal)
        self._change_pts = self._algo.predict(pen=self._penalty)
        self._predicted_series = self._build_predicted_series(self._change_pts)

    def _build_predicted_series(self, change_points: list[int]) -> pd.Series:
        """Turn segment end positions into a ``segment-<k>`` label per observation.

        Args:
            change_points: End position (exclusive) of each segment, in order.

        Returns:
            A series named ``"segment"`` that shares the input series' index.
        """
        labels: list[str] = []

        # Pair every segment end with the end of the previous segment (0 for the first).
        spans = zip([0, *change_points], change_points)
        for seg_no, (lo, hi) in enumerate(spans, start=1):
            labels += [f"segment-{seg_no}"] * max(hi - lo, 0)

        # Observations past the last end position fall into one extra trailing segment.
        missing = self._n - len(labels)
        if missing > 0:
            labels += [f"segment-{len(change_points) + 1}"] * missing

        # Trim in case the last end position overshoots the series length.
        trimmed = labels[: self._n]
        return pd.Series(trimmed, index=self._series.index, name="segment")

    def predict_series(self) -> pd.Series:
        """Return a copy of the cached segment label series."""
        return self._predicted_series.copy()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class ChangePointEstimator:
    """Compatibility wrapper used by existing tests and call sites.

    Keeps a ``date``/``signal`` frame built from the input series and adds a
    ``segment`` column once change points have been estimated.
    """

    def __init__(self, series: pd.Series) -> None:
        """Store ``series`` and build the working frame.

        Raises:
            ValueError: If ``series`` is ``None`` or has no observations.
        """
        if series is None or not len(series):
            raise ValueError("Input series must be a non-empty pandas Series.")

        self._series = series
        self._df = pd.DataFrame({"date": series.index, "signal": series.values})
        # Filled in by estimate_change_points().
        self._change_pts: list[int] | None = None

    def estimate_change_points(self, penalty_coeff: float = 2.0) -> pd.Series:
        """Run PELT (``rbf`` cost) and label every observation with its segment.

        Args:
            penalty_coeff: Multiplier applied to ``log(n)`` to obtain the penalty.

        Returns:
            The ``segment`` column of the internal frame.
        """
        n_obs = len(self._series)
        signal = self._series.to_numpy().reshape(-1, 1)
        pen_value = float(penalty_coeff * np.log(n_obs))

        detector = rpt.Pelt(model="rbf").fit(signal)
        self._change_pts = detector.predict(pen=pen_value)
        breakpoints = self._change_pts

        labels: list[str] = []
        # Pair every segment end with the end of the previous segment (0 for the first).
        for seg_no, (lo, hi) in enumerate(zip([0, *breakpoints], breakpoints), start=1):
            labels += [f"segment-{seg_no}"] * max(hi - lo, 0)

        # Observations past the last end position fall into one extra trailing segment.
        missing = n_obs - len(labels)
        if missing > 0:
            labels += [f"segment-{len(breakpoints) + 1}"] * missing

        # Trim in case the last end position overshoots the series length.
        self._df["segment"] = labels[:n_obs]
        return self._df["segment"]
|
|
76
|
+
|
|
77
|
+
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
previous_state,prompt
|
|
2
|
+
START,You should upload a regularly sampled csv file for analysis
|
|
3
|
+
FILE_PROCESSED,"You should estimate the density for the series and inspect its plot to see if there are multiple modes. If you see this, it means there were different regimes in your time series and you should analyze and explore these regimes independently. You can get an approximate estimate of the inflection points in the density estimate, so that you can sense if there are multiple regimes. You can then apply a change point detection algorithm to estimate the location of these changes. Shall I proceed with the inflection point estimation in the density curve?"
|
|
4
|
+
INFLECTION_POINTS_EST,You should run a change point detection algorithm to locate the estimated time of change of your time series behavior. Shall I do that?
|
|
5
|
+
CHANGE_POINTS_IDENTIFIED,You should plot the change points on the time series with the different segments color-coded. Shall I do that?
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from kmds.tagging.tag_types import ExploratoryTags
|
|
2
|
+
from kmds.ontology.intent_types import IntentType
|
|
3
|
+
from owlready2 import *
|
|
4
|
+
from kmds.utils.load_utils import *
|
|
5
|
+
|
|
6
|
+
class KMDSDataWriter:
    """Add, delete and update exploratory observations in a KMDS knowledge base.

    Observations are addressed by a 1-based sequence number that mirrors
    their position in the workflow's ``has_exploratory_observations`` list.
    """

    def __init__(self, file_path: str):
        """Remember ``file_path`` and load the knowledge base stored there."""
        self._file_path = file_path
        self._onto = self.load_kb()

    def load_kb(self) -> Ontology:
        """Load and return the ontology found at the configured file path."""
        # Inside a method the bare name ``load_kb`` resolves to the module-level
        # helper (presumably supplied by the kmds.utils.load_utils star import),
        # not to this method.
        onto2: Ontology = load_kb(self._file_path)
        return onto2

    @staticmethod
    def _check_obs_seq(obs_seq: int, count: int) -> None:
        """Raise ``IndexError`` unless ``1 <= obs_seq <= count``."""
        # Without this check, obs_seq <= 0 turns into a negative list index
        # and silently targets an observation counted from the end.
        if not 1 <= obs_seq <= count:
            raise IndexError(
                f"Observation sequence {obs_seq} is out of range; expected a value between 1 and {count}."
            )

    def add_exploratory_obs(self, obs: str, file_path: str) -> None:
        """Append a new exploratory observation and save the ontology.

        Args:
            obs: Text of the finding.
            file_path: Destination the ontology is saved to (RDF/XML). Note
                that this is the argument, not the path given at construction.
        """
        the_workflow: Workflow = get_workflow(self._onto)

        with self._onto:
            # The new observation takes the next 1-based sequence number.
            observation_count: int = len(the_workflow.has_exploratory_observations) + 1
            e1 = ExploratoryObservation(namespace=self._onto)

            e1.finding = obs
            e1.finding_sequence = observation_count
            e1.exploratory_observation_type = ExploratoryTags.DATA_QUALITY_OBSERVATION.value
            e1.intent = IntentType.DATA_UNDERSTANDING.value
            the_workflow.has_exploratory_observations.append(e1)

        self._onto.save(file=file_path, format="rdfxml")

    def delete_exploratory_obs(self, obs_seq: int) -> None:
        """Delete the observation at 1-based ``obs_seq`` and renumber the rest.

        The ontology is saved back to the path given at construction.

        Raises:
            IndexError: If ``obs_seq`` is not between 1 and the number of
                stored observations.
        """
        the_workflow: Workflow = get_workflow(self._onto)
        self._check_obs_seq(obs_seq, len(the_workflow.has_exploratory_observations))

        with self._onto:
            del the_workflow.has_exploratory_observations[obs_seq - 1]

            # Close the gap so sequence numbers stay contiguous from 1.
            for position, observation in enumerate(the_workflow.has_exploratory_observations, start=1):
                observation.finding_sequence = position

        self._onto.save(file=self._file_path, format="rdfxml")

    def update_exploratory_obs(self, obs: str, obs_seq: int) -> None:
        """Replace the finding text of the observation at 1-based ``obs_seq``.

        The ontology is saved back to the path given at construction.

        Raises:
            IndexError: If ``obs_seq`` is not between 1 and the number of
                stored observations.
        """
        the_workflow: Workflow = get_workflow(self._onto)
        self._check_obs_seq(obs_seq, len(the_workflow.has_exploratory_observations))

        with self._onto:
            the_workflow.has_exploratory_observations[obs_seq - 1].finding = obs

        self._onto.save(file=self._file_path, format="rdfxml")
|
|
57
|
+
|
|
58
|
+
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from .local_dataloader import LocalDataLoader
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
class CoffeePricesDataLoader(LocalDataLoader):
    """Load a two-column coffee price CSV as a ``date``/``signal`` frame or series."""

    def __init__(self, file_path: str = "data/coffee_prices.csv"):
        super().__init__(file_path)

    def load_coffee_prices(self) -> pd.DataFrame:
        """Load coffee prices data from a local CSV file.

        Returns:
            A frame with columns ``date`` (parsed to datetimes) and ``signal``,
            or an empty frame when nothing could be loaded.
        """
        data = self.load_data()

        # Check for an empty result first: a failed load yields a frame with
        # no columns, and renaming its columns would raise ValueError.
        if data.empty:
            print("No data loaded.")
            return pd.DataFrame()

        data.columns = ["date", "signal"]
        data["date"] = pd.to_datetime(data["date"])
        return data

    def get_series(self) -> pd.Series:
        """Get the 'signal' series from the coffee prices data, indexed by date.

        Returns:
            The ``signal`` column indexed by ``date``, or an empty float
            series when no data is available.
        """
        data = self.load_coffee_prices()

        # Guard before touching the ``date`` column, which an empty frame lacks.
        if data.empty:
            print("No data available to extract series.")
            return pd.Series(dtype=float)

        data.index = data["date"]
        return data["signal"]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from kaggle.api.kaggle_api_extended import KaggleApi
|
|
3
|
+
from dotenv import load_dotenv
|
|
4
|
+
from sys import exit
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
def download_kaggle_dataset(dataset_slug, download_path):
    """
    Download a Kaggle dataset into an existing directory and unzip it there.

    dataset_slug: The part of the dataset URL after kaggle.com/datasets/,
        for example 'arashnic/max-planck-weather-dataset'.
    download_path: Directory that receives the files; it must already exist.

    When download_path is not a directory, a message is printed and the
    process exits with status 1.
    """
    target_dir = Path(download_path)
    if not target_dir.is_dir():
        print("Directory does not exist or is not valid., please check the path and try again.")
        exit(1)

    # Load environment variables from a .env file before authenticating.
    load_dotenv()
    client = KaggleApi()
    client.authenticate()

    print(f"Downloading {dataset_slug}...")
    client.dataset_download_files(dataset_slug, path=download_path, unzip=True)
    print("Download complete.")
|
|
25
|
+
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from owlready2 import *
|
|
2
|
+
from kmds.utils.load_utils import *
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
class KMDSDataLoader:
    """Read the observations recorded in a KMDS knowledge base into DataFrames."""

    def __init__(self, file_path: str):
        """Remember ``file_path`` and load the knowledge base stored there."""
        self._file_path = file_path
        self._onto = self.load_kb()

    def load_kb(self) -> Ontology:
        """Load and return the ontology found at the configured file path."""
        # Inside a method the bare name ``load_kb`` resolves to the module-level
        # helper (presumably supplied by the kmds.utils.load_utils star import),
        # not to this method.
        onto2: Ontology = load_kb(self._file_path)
        return onto2

    @staticmethod
    def _observations_to_frame(observations, type_attr=None) -> pd.DataFrame:
        """Build one row per observation.

        Args:
            observations: Iterable of observation objects exposing
                ``finding_sequence`` and ``finding``.
            type_attr: Name of the attribute holding the observation type, or
                ``None`` to leave the ``obs_type`` column out.

        Returns:
            A frame with columns ``finding_seq``, optionally ``obs_type``, and
            ``finding``; a frame without columns when there are no observations.
        """
        records = []
        for observation in observations:
            row = {"finding_seq": observation.finding_sequence}
            if type_attr is not None:
                row["obs_type"] = getattr(observation, type_attr)
            row["finding"] = observation.finding
            records.append(row)
        return pd.DataFrame(records)

    def load_exploratory_obs(self) -> pd.DataFrame:
        """Return the exploratory observations (no ``obs_type`` column)."""
        the_workflow: Workflow = get_workflow(self._onto)
        return self._observations_to_frame(the_workflow.has_exploratory_observations)

    def load_data_rep_obs(self) -> pd.DataFrame:
        """Return the data representation observations."""
        the_workflow: Workflow = get_workflow(self._onto)
        return self._observations_to_frame(
            the_workflow.has_data_representation_observations,
            "data_representation_observation_type",
        )

    def load_modelling_choice_obs(self) -> pd.DataFrame:
        """Return the modelling choice observations."""
        the_workflow: Workflow = get_workflow(self._onto)
        return self._observations_to_frame(
            the_workflow.has_modelling_choice_observations,
            "modelling_choice_observation_type",
        )

    def load_modelling_selection_obs(self) -> pd.DataFrame:
        """Return the modelling selection observations."""
        the_workflow: Workflow = get_workflow(self._onto)
        return self._observations_to_frame(
            the_workflow.has_modelling_selection_observations,
            "modelling_selection_observation_type",
        )

    def export_all_observations(self) -> pd.DataFrame:
        """Concatenate the four kmds observation exports into one frame.

        The per-category frames come from the kmds loader helpers rather than
        from the ``load_*`` methods of this class; the result is re-indexed
        from zero.
        """
        frames = [
            load_exp_observations(self._onto),
            load_data_rep_observations(self._onto),
            load_modelling_choice_observations(self._onto),
            load_model_selection_observations(self._onto),
        ]
        df_consolidated = pd.concat(frames, ignore_index=True)
        return df_consolidated
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
class LocalDataLoader:
    """Best-effort reader for a CSV file on the local file system."""

    def __init__(self, file_path: str):
        self.file_path = file_path

    def load_data(self) -> pd.DataFrame:
        """Read the CSV at ``file_path`` into a DataFrame.

        Failures are not raised: a message is printed and an empty DataFrame
        is returned instead.
        """
        try:
            return pd.read_csv(self.file_path)
        except FileNotFoundError:
            problem = f"Error: The file at {self.file_path} was not found."
        except pd.errors.EmptyDataError:
            problem = "Error: The file is empty."
        except Exception as exc:
            problem = f"An unexpected error occurred: {exc}"

        # Reached only when one of the handlers above ran.
        print(problem)
        return pd.DataFrame()
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from .local_dataloader import LocalDataLoader
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from datetime import datetime, timedelta
|
|
4
|
+
import numpy as np
|
|
5
|
+
from scipy.stats import norm
|
|
6
|
+
from math import ceil
|
|
7
|
+
|
|
8
|
+
class SyntheticSeriesDataLoader(LocalDataLoader):
    """Generate a synthetic hourly series: constant level, two sinusoids and Gaussian noise."""

    def __init__(self, file_path: str = "data/synthetic_series.csv"):
        # The path is stored by the base class; get_series() does not read it.
        super().__init__(file_path)

    def get_series(self) -> pd.Series:
        """Get the 'signal' series from the synthetic series data.

        Returns:
            A series named ``signal`` with 2160 hourly samples (90 days)
            starting at the current time, on an index named ``time``. The
            noise is drawn afresh on every call, so results differ per call.
        """
        start = datetime.now()
        level = 20
        p1_peak, p1 = 5, 6    # amplitude and period (in samples) of the first sinusoid
        p2_peak, p2 = 7, 10   # amplitude and period (in samples) of the second sinusoid
        n_samples = 24 * 90   # 90 days of hourly data

        # Vectorized equivalents of the per-sample sine evaluations.
        hours = np.arange(n_samples, dtype=float)
        p1_vals = p1_peak * np.sin((2 * np.pi * hours) / p1)
        p2_vals = p2_peak * np.sin((2 * np.pi * hours) / p2)
        noise = norm.rvs(loc=0, scale=0.5, size=n_samples)
        signal = level + p1_vals + p2_vals + noise

        time_vals = [start + timedelta(hours=i) for i in range(n_samples)]
        series = pd.Series(signal, index=time_vals, name="signal")
        series.index.name = "time"
        return series
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from .local_dataloader import LocalDataLoader
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from datetime import datetime, timedelta
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
6
|
+
class WhiteNoiseDataLoader(LocalDataLoader):
    """Produce a standard-normal white noise series on an hourly time index."""

    def __init__(self, file_path: str = "data/white_noise_series.csv"):
        super().__init__(file_path)

    def get_series(self) -> pd.Series:
        """Generate 30 days of hourly white noise starting at the current time."""
        hours_per_day = 24
        day_count = 30
        n_points = day_count * hours_per_day

        # Hourly timestamps counted forward from "now".
        origin = datetime.now()
        stamps = [origin + timedelta(hours=step) for step in range(n_points)]
        hourly_index = pd.to_datetime(stamps)

        # Independent draws with mean 0 and standard deviation 1.
        draws = np.random.normal(loc=0, scale=1, size=n_points)

        return pd.Series(draws, index=hourly_index)
|