tseda 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. tseda-0.1.0/PKG-INFO +145 -0
  2. tseda-0.1.0/README.md +103 -0
  3. tseda-0.1.0/pyproject.toml +63 -0
  4. tseda-0.1.0/setup.cfg +4 -0
  5. tseda-0.1.0/src/tseda/change_point/change_point_estimator.py +77 -0
  6. tseda-0.1.0/src/tseda/config/prompts_config.csv +5 -0
  7. tseda-0.1.0/src/tseda/data_writers/kmds_writer.py +58 -0
  8. tseda-0.1.0/src/tseda/dataloader/coffee_prices_data_loader.py +30 -0
  9. tseda-0.1.0/src/tseda/dataloader/kaggle_data_loader.py +25 -0
  10. tseda-0.1.0/src/tseda/dataloader/kmds_data_loader.py +82 -0
  11. tseda-0.1.0/src/tseda/dataloader/local_dataloader.py +21 -0
  12. tseda-0.1.0/src/tseda/dataloader/synthetic_series_data_loader.py +39 -0
  13. tseda-0.1.0/src/tseda/dataloader/white_noise_data_loader.py +26 -0
  14. tseda-0.1.0/src/tseda/decomposition/ssa_decomposition.py +401 -0
  15. tseda-0.1.0/src/tseda/decomposition/ssa_result_summary.py +258 -0
  16. tseda-0.1.0/src/tseda/images/tseda_workflow.png +0 -0
  17. tseda-0.1.0/src/tseda/periodicity/fft_analyzer.py +60 -0
  18. tseda-0.1.0/src/tseda/series_stats/sampling_prop.py +131 -0
  19. tseda-0.1.0/src/tseda/series_stats/summary_statistics.py +23 -0
  20. tseda-0.1.0/src/tseda/user_interface/analysis.py +163 -0
  21. tseda-0.1.0/src/tseda/user_interface/callback_services.py +296 -0
  22. tseda-0.1.0/src/tseda/user_interface/components/analysis_assessment.py +157 -0
  23. tseda-0.1.0/src/tseda/user_interface/components/initial_eval_components.py +491 -0
  24. tseda-0.1.0/src/tseda/user_interface/gemini_chat.py +45 -0
  25. tseda-0.1.0/src/tseda/user_interface/initial_assessment.py +36 -0
  26. tseda-0.1.0/src/tseda/user_interface/initial_assessment_layout.py +95 -0
  27. tseda-0.1.0/src/tseda/user_interface/kmds_capture.py +426 -0
  28. tseda-0.1.0/src/tseda/user_interface/ts_analyze_ui.py +616 -0
  29. tseda-0.1.0/src/tseda/visualization/autocorrelation_vis.py +51 -0
  30. tseda-0.1.0/src/tseda/visualization/series_histogram_visualizer.py +44 -0
  31. tseda-0.1.0/src/tseda/visualization/series_kde_visualizer.py +92 -0
  32. tseda-0.1.0/src/tseda/visualization/series_visualizer.py +66 -0
  33. tseda-0.1.0/src/tseda.egg-info/PKG-INFO +145 -0
  34. tseda-0.1.0/src/tseda.egg-info/SOURCES.txt +44 -0
  35. tseda-0.1.0/src/tseda.egg-info/dependency_links.txt +1 -0
  36. tseda-0.1.0/src/tseda.egg-info/requires.txt +29 -0
  37. tseda-0.1.0/src/tseda.egg-info/top_level.txt +1 -0
  38. tseda-0.1.0/tests/test_autocorrelation_vis.py +58 -0
  39. tseda-0.1.0/tests/test_change_point_estimator.py +47 -0
  40. tseda-0.1.0/tests/test_fft_analyzer.py +70 -0
  41. tseda-0.1.0/tests/test_sampling_prop.py +56 -0
  42. tseda-0.1.0/tests/test_series_histogram_visualizer.py +99 -0
  43. tseda-0.1.0/tests/test_series_kde_visualizer.py +59 -0
  44. tseda-0.1.0/tests/test_ssa_decomposition.py +90 -0
  45. tseda-0.1.0/tests/test_ssa_result_summary.py +83 -0
  46. tseda-0.1.0/tests/test_ts_analyze_ui.py +174 -0
tseda-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,145 @@
1
+ Metadata-Version: 2.4
2
+ Name: tseda
3
+ Version: 0.1.0
4
+ Summary: A package for exploration of regularly sampled time series.
5
+ License-Expression: LicenseRef-Proprietary
6
+ Keywords: time-series,ssa,dash,analytics,decomposition
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Programming Language :: Python :: 3 :: Only
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Intended Audience :: Science/Research
11
+ Classifier: Topic :: Scientific/Engineering :: Information Analysis
12
+ Requires-Python: >=3.13
13
+ Description-Content-Type: text/markdown
14
+ Requires-Dist: dash>=4.1.0
15
+ Requires-Dist: dash-bootstrap-components>=2.0.4
16
+ Requires-Dist: google-genai>=1.56.0
17
+ Requires-Dist: importlib-resources>=6.5.2
18
+ Requires-Dist: ipython>=9.8.0
19
+ Requires-Dist: jupyter>=1.1.1
20
+ Requires-Dist: jupyter-ai>=2.31.7
21
+ Requires-Dist: jupyter-dash>=0.4.2
22
+ Requires-Dist: kaggle>=2.0.0
23
+ Requires-Dist: kdepy>=1.1.12
24
+ Requires-Dist: kmds>=0.3.1
25
+ Requires-Dist: matplotlib>=3.10.8
26
+ Requires-Dist: numpy>=2.4.0
27
+ Requires-Dist: pandas>=2.3.3
28
+ Requires-Dist: plotly>=6.5.0
29
+ Requires-Dist: python-dotenv>=1.2.1
30
+ Requires-Dist: req>=1.0.0
31
+ Requires-Dist: ruptures>=1.1.10
32
+ Requires-Dist: scikit-learn>=1.8.0
33
+ Requires-Dist: scipy<1.16.0
34
+ Requires-Dist: seaborn>=0.13.2
35
+ Requires-Dist: skrub>=0.7.2
36
+ Requires-Dist: ssalib>=0.1.3
37
+ Requires-Dist: statsmodels>=0.14.6
38
+ Requires-Dist: streamlit>=1.52.2
39
+ Provides-Extra: dev
40
+ Requires-Dist: pytest>=8.0; extra == "dev"
41
+ Requires-Dist: pytest-cov>=7.0.0; extra == "dev"
42
+
43
+ # Time Series Explorer (`tseda`)
44
+
45
+ An application for time series exploration.
46
+
47
+ ## Overview
48
+
49
+ `tseda` lets you explore regularly sampled time series with a sampling frequency of one hour or greater. It is currently limited to 2,000 samples (this is configurable).
50
+
51
+ ## Three-Step Exploration Workflow
52
+
53
+ ### (a) Initial Assessment
54
+
55
+ Explore the distribution and spread of values using a kernel density estimate and box plot. You get to see the raw distribution of the values. The PACF and ACF provide clues about seasonality and autoregressive components.
56
+
57
+ ### (b) Decomposition Using Singular Spectral Analysis
58
+
59
+ On the basis of the sampling frequency, a window for SSA is determined. This is a heuristic assignment. For example:
60
+
61
+ | Sampling Frequency | Window Size |
62
+ |--------------------|-------------|
63
+ | Hourly | 24 |
64
+ | Monthly | 12 |
65
+ | Quarterly | 4 |
66
+
67
+ This can be changed in the UI. Based on the eigenvalue distribution, observations from the ACF plot and the eigenvector plot, the seasonal components can be determined if present. Based on these initial plots, the user needs to input a set of groupings and reconstruct the series with these groupings. The reconstruction plots are shown. If there is structure in the series, then change point analysis can be done using the fact that the components are smooth. A change point plot is shown. The explained variance from signal and noise components and the assessment of the noise structure (independent or correlated) is provided.
68
+
69
+ ### (c) Observation Logging
70
+
71
+ The SSA is based on the eigen decomposition of the trajectory matrix. Though the raw signal is correlated, the eigenvectors are uncorrelated. If we assume that the signal is Gaussian, this also implies independence. We can use the Akaike Information Criterion for model selection and determine the AIC as a function of the rank of the model. This is shown in the observation page. An automatic summary of all the observations is provided.
72
+
73
+ ## Notebook Interface
74
+
75
+ The package also provides a notebook interface to these features. If you have a new dataset that you want to analyze, look at the data loader directory for examples. Download your dataset, clean it, produce your time series, and analyze it with `tseda`.
76
+
77
+ ## Getting Started
78
+
79
+ ### 1. Install Dependencies
80
+
81
+ Create and activate a virtual environment, then install the required packages:
82
+
83
+ ```bash
84
+ python -m venv .venv
85
+ source .venv/bin/activate # Windows: .venv\Scripts\activate
86
+ pip install -r requirements.txt
87
+ ```
88
+
89
+ Or, if you are using the package in editable/development mode:
90
+
91
+ ```bash
92
+ pip install -e .
93
+ ```
94
+
95
+ ### 2. Run the App
96
+
97
+ ```bash
98
+ python src/tseda/user_interface/ts_analyze_ui.py
99
+ ```
100
+
101
+ The app will start a local web server. Open your browser and navigate to the URL printed in the terminal (typically `http://127.0.0.1:8050`).
102
+
103
+ ### 3. Upload Your Data
104
+
105
+ - Click **"Drag and Drop or Select Files"** in the Initial Assessment panel.
106
+ - Your file must be a **CSV or Excel** file with at least two columns: a **timestamp** column (first) and a **numeric value** column (second).
107
+ - The data must be **regularly sampled at hourly or lower frequency** (e.g., hourly, daily, monthly).
108
+ - The dataset must contain **no missing values** (NA / NaN). Clean your data before uploading.
109
+ - Files are limited to **2,000 rows** (configurable via `MAX_FILE_LINES` in `ts_analyze_ui.py`).
110
+
111
+ ### 4. Explore in Three Steps
112
+
113
+ | Step | Panel | What to do |
114
+ |------|-------|------------|
115
+ | 1 | **Initial Assessment of Time Series** | Review distribution plots (KDE, box plot) and the ACF / PACF for autocorrelation patterns. |
116
+ | 2 | **Time Series Decomposition** | Review the eigenvalue plot, then enter component groupings (e.g., Trend, Seasonal, Noise) and click **Apply Grouping**. |
117
+ | 3 | **Observation Logging** | Review the AIC rank diagnostics, read the auto-generated summary, and add your own observations before saving the report. |
118
+
119
+ ## Build And Publish With uv
120
+
121
+ 1. Build source and wheel distributions:
122
+
123
+ ```bash
124
+ uv build
125
+ ```
126
+
127
+ 2. Validate distributions before upload:
128
+
129
+ ```bash
130
+ uvx twine check dist/*
131
+ ```
132
+
133
+ 3. Publish to PyPI using an API token:
134
+
135
+ ```bash
136
+ export UV_PUBLISH_TOKEN="pypi-..."
137
+ uv publish
138
+ ```
139
+
140
+ 4. Publish to TestPyPI first (recommended):
141
+
142
+ ```bash
143
+ export UV_PUBLISH_TOKEN="pypi-..."
144
+ uv publish --publish-url https://test.pypi.org/legacy/
145
+ ```
tseda-0.1.0/README.md ADDED
@@ -0,0 +1,103 @@
1
+ # Time Series Explorer (`tseda`)
2
+
3
+ An application for time series exploration.
4
+
5
+ ## Overview
6
+
7
+ `tseda` lets you explore regularly sampled time series with a sampling frequency of one hour or greater. It is currently limited to 2,000 samples (this is configurable).
8
+
9
+ ## Three-Step Exploration Workflow
10
+
11
+ ### (a) Initial Assessment
12
+
13
+ Explore the distribution and spread of values using a kernel density estimate and box plot. You get to see the raw distribution of the values. The PACF and ACF provide clues about seasonality and autoregressive components.
14
+
15
+ ### (b) Decomposition Using Singular Spectral Analysis
16
+
17
+ On the basis of the sampling frequency, a window for SSA is determined. This is a heuristic assignment. For example:
18
+
19
+ | Sampling Frequency | Window Size |
20
+ |--------------------|-------------|
21
+ | Hourly | 24 |
22
+ | Monthly | 12 |
23
+ | Quarterly | 4 |
24
+
25
+ This can be changed in the UI. Based on the eigenvalue distribution, observations from the ACF plot and the eigenvector plot, the seasonal components can be determined if present. Based on these initial plots, the user needs to input a set of groupings and reconstruct the series with these groupings. The reconstruction plots are shown. If there is structure in the series, then change point analysis can be done using the fact that the components are smooth. A change point plot is shown. The explained variance from signal and noise components and the assessment of the noise structure (independent or correlated) is provided.
26
+
27
+ ### (c) Observation Logging
28
+
29
+ The SSA is based on the eigen decomposition of the trajectory matrix. Though the raw signal is correlated, the eigenvectors are uncorrelated. If we assume that the signal is Gaussian, this also implies independence. We can use the Akaike Information Criterion for model selection and determine the AIC as a function of the rank of the model. This is shown in the observation page. An automatic summary of all the observations is provided.
30
+
31
+ ## Notebook Interface
32
+
33
+ The package also provides a notebook interface to these features. If you have a new dataset that you want to analyze, look at the data loader directory for examples. Download your dataset, clean it, produce your time series, and analyze it with `tseda`.
34
+
35
+ ## Getting Started
36
+
37
+ ### 1. Install Dependencies
38
+
39
+ Create and activate a virtual environment, then install the required packages:
40
+
41
+ ```bash
42
+ python -m venv .venv
43
+ source .venv/bin/activate # Windows: .venv\Scripts\activate
44
+ pip install -r requirements.txt
45
+ ```
46
+
47
+ Or, if you are using the package in editable/development mode:
48
+
49
+ ```bash
50
+ pip install -e .
51
+ ```
52
+
53
+ ### 2. Run the App
54
+
55
+ ```bash
56
+ python src/tseda/user_interface/ts_analyze_ui.py
57
+ ```
58
+
59
+ The app will start a local web server. Open your browser and navigate to the URL printed in the terminal (typically `http://127.0.0.1:8050`).
60
+
61
+ ### 3. Upload Your Data
62
+
63
+ - Click **"Drag and Drop or Select Files"** in the Initial Assessment panel.
64
+ - Your file must be a **CSV or Excel** file with at least two columns: a **timestamp** column (first) and a **numeric value** column (second).
65
+ - The data must be **regularly sampled at hourly or lower frequency** (e.g., hourly, daily, monthly).
66
+ - The dataset must contain **no missing values** (NA / NaN). Clean your data before uploading.
67
+ - Files are limited to **2,000 rows** (configurable via `MAX_FILE_LINES` in `ts_analyze_ui.py`).
68
+
69
+ ### 4. Explore in Three Steps
70
+
71
+ | Step | Panel | What to do |
72
+ |------|-------|------------|
73
+ | 1 | **Initial Assessment of Time Series** | Review distribution plots (KDE, box plot) and the ACF / PACF for autocorrelation patterns. |
74
+ | 2 | **Time Series Decomposition** | Review the eigenvalue plot, then enter component groupings (e.g., Trend, Seasonal, Noise) and click **Apply Grouping**. |
75
+ | 3 | **Observation Logging** | Review the AIC rank diagnostics, read the auto-generated summary, and add your own observations before saving the report. |
76
+
77
+ ## Build And Publish With uv
78
+
79
+ 1. Build source and wheel distributions:
80
+
81
+ ```bash
82
+ uv build
83
+ ```
84
+
85
+ 2. Validate distributions before upload:
86
+
87
+ ```bash
88
+ uvx twine check dist/*
89
+ ```
90
+
91
+ 3. Publish to PyPI using an API token:
92
+
93
+ ```bash
94
+ export UV_PUBLISH_TOKEN="pypi-..."
95
+ uv publish
96
+ ```
97
+
98
+ 4. Publish to TestPyPI first (recommended):
99
+
100
+ ```bash
101
+ export UV_PUBLISH_TOKEN="pypi-..."
102
+ uv publish --publish-url https://test.pypi.org/legacy/
103
+ ```
@@ -0,0 +1,63 @@
1
+ [project]
2
+ name = "tseda"
3
+ version = "0.1.0"
4
+ description = "A package for exploration of regularly sampled time series."
5
+ readme = "README.md"
6
+ requires-python = ">=3.13"
7
+ license = "LicenseRef-Proprietary"
8
+ keywords = ["time-series", "ssa", "dash", "analytics", "decomposition"]
9
+ classifiers = [
10
+ "Programming Language :: Python :: 3",
11
+ "Programming Language :: Python :: 3 :: Only",
12
+ "Operating System :: OS Independent",
13
+ "Intended Audience :: Science/Research",
14
+ "Topic :: Scientific/Engineering :: Information Analysis",
15
+ ]
16
+ dependencies = [
17
+ "dash>=4.1.0",
18
+ "dash-bootstrap-components>=2.0.4",
19
+ "google-genai>=1.56.0",
20
+ "importlib-resources>=6.5.2",
21
+ "ipython>=9.8.0",
22
+ "jupyter>=1.1.1",
23
+ "jupyter-ai>=2.31.7",
24
+ "jupyter-dash>=0.4.2",
25
+ "kaggle>=2.0.0",
26
+ "kdepy>=1.1.12",
27
+ "kmds>=0.3.1",
28
+ "matplotlib>=3.10.8",
29
+ "numpy>=2.4.0",
30
+ "pandas>=2.3.3",
31
+ "plotly>=6.5.0",
32
+ "python-dotenv>=1.2.1",
33
+ "req>=1.0.0",
34
+ "ruptures>=1.1.10",
35
+ "scikit-learn>=1.8.0",
36
+ "scipy<1.16.0",
37
+ "seaborn>=0.13.2",
38
+ "skrub>=0.7.2",
39
+ "ssalib>=0.1.3",
40
+ "statsmodels>=0.14.6",
41
+ "streamlit>=1.52.2",
42
+ ]
43
+
44
+ [project.optional-dependencies]
45
+ dev = [
46
+ "pytest>=8.0",
47
+ "pytest-cov>=7.0.0",
48
+ ]
49
+
50
+ [build-system]
51
+ requires = ["setuptools>=68", "wheel"]
52
+ build-backend = "setuptools.build_meta"
53
+
54
+
55
+ [tool.setuptools.packages.find]
56
+ where = ["src"]
57
+ include = ["tseda*"]
58
+
59
+ [tool.setuptools.package-data]
60
+ tseda = [
61
+ "config/*.csv",
62
+ "images/*",
63
+ ]
tseda-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,77 @@
1
+ from __future__ import annotations
2
+
3
+ import numpy as np
4
+ import pandas as pd
5
+ import ruptures as rpt
6
+
7
+
8
class PELT_ChangePointEstimator:
    """Detect change points via the PELT algorithm and expose per-observation segment labels.

    The penalty is fixed at ``2 * log(n)`` (a BIC-style heuristic); the cost
    model defaults to ``"rbf"`` and may be overridden by the caller.
    """

    def __init__(self, series: pd.Series, model: str = "rbf") -> None:
        if series is None or len(series) == 0:
            raise ValueError("Input series must be a non-empty pandas Series.")

        self._series = series
        self._model = model
        self._n = len(series)
        self._penalty = float(2 * np.log(self._n))

        # ruptures expects a 2-D (n_samples, n_features) array.
        observations = series.to_numpy().reshape(-1, 1)
        self._algo = rpt.Pelt(model=model).fit(observations)
        self._change_pts = self._algo.predict(pen=self._penalty)
        self._predicted_series = self._build_predicted_series(self._change_pts)

    def _build_predicted_series(self, change_points: list[int]) -> pd.Series:
        """Translate PELT breakpoints (exclusive end indices) into "segment-k" labels."""
        labels: list[str] = []
        previous_end = 0
        for seg_idx, boundary in enumerate(change_points, start=1):
            span = boundary - previous_end
            if span > 0:
                labels += [f"segment-{seg_idx}"] * span
            previous_end = boundary

        # Pad the tail if the final breakpoint fell short of the series length.
        shortfall = self._n - len(labels)
        if shortfall > 0:
            labels += [f"segment-{len(change_points) + 1}"] * shortfall

        return pd.Series(labels[: self._n], index=self._series.index, name="segment")

    def predict_series(self) -> pd.Series:
        """Return a defensive copy of the predicted segment label series."""
        return self._predicted_series.copy()
42
+
43
+
44
class ChangePointEstimator:
    """Compatibility wrapper used by existing tests and call sites."""

    def __init__(self, series: pd.Series) -> None:
        if series is None or len(series) == 0:
            raise ValueError("Input series must be a non-empty pandas Series.")

        self._series = series
        self._df = pd.DataFrame({"date": series.index, "signal": series.values})
        self._change_pts: list[int] | None = None

    def estimate_change_points(self, penalty_coeff: float = 2.0) -> pd.Series:
        """Run PELT and assign segment labels for each observation."""
        num_obs = len(self._series)
        pen_value = float(penalty_coeff * np.log(num_obs))

        # ruptures expects a 2-D (n_samples, n_features) array.
        detector = rpt.Pelt(model="rbf").fit(self._series.to_numpy().reshape(-1, 1))
        self._change_pts = detector.predict(pen=pen_value)

        labels: list[str] = []
        left = 0
        for seg_no, right in enumerate(self._change_pts, start=1):
            width = right - left
            if width > 0:
                labels.extend([f"segment-{seg_no}"] * width)
            left = right

        # Pad the tail if the final breakpoint fell short of the series length.
        missing = num_obs - len(labels)
        if missing > 0:
            labels.extend([f"segment-{len(self._change_pts) + 1}"] * missing)

        self._df["segment"] = labels[:num_obs]
        return self._df["segment"]
76
+
77
+
@@ -0,0 +1,5 @@
1
+ previous_state,prompt
2
+ START,You should upload a regularly sampled csv file for analysis
3
+ FILE_PROCESSED,"You should estimate the density for the series and inspect its plot to see if there are multiple modes. If you see this, it means there were different regimes in your time series and you should analyze and explore these regimes independently. You can get an approximate estimate of the inflection points in the density estimate, so that you can sense if there are multiple regimes. You can then apply a change point detection algorithm to estimate the location of these changes. Shall I proceed with the inflection point estimation in the density curve?"
4
+ INFLECTION_POINTS_EST,You should run a change point detection algorithm to locate the estimated time of change of your time series behavior. Shall I do that?
5
+ CHANGE_POINTS_IDENTIFIED,You should plot the change points on the time series with the different segment color coded. Shall I do that?
@@ -0,0 +1,58 @@
1
+ from kmds.tagging.tag_types import ExploratoryTags
2
+ from kmds.ontology.intent_types import IntentType
3
+ from owlready2 import *
4
+ from kmds.utils.load_utils import *
5
+
6
class KMDSDataWriter:
    """Append, delete, and update exploratory observations in a KMDS knowledge base."""

    def __init__(self, file_path: str):
        self._file_path = file_path
        self._onto = self.load_kb()
        return

    def load_kb(self) -> Ontology:
        """Load the ontology backing this writer from its configured file path."""
        # Resolves to the module-level load_kb imported from kmds.utils.load_utils.
        kb: Ontology = load_kb(self._file_path)
        return kb

    def add_exploratory_obs(self, obs: str, file_path: str) -> None:
        """Append *obs* as a new exploratory observation and persist to *file_path*."""
        the_workflow: Workflow = get_workflow(self._onto)

        with self._onto:
            # finding_sequence is 1-based and follows the current count.
            next_seq: int = len(the_workflow.has_exploratory_observations) + 1
            new_obs = ExploratoryObservation(namespace=self._onto)
            new_obs.finding = obs
            new_obs.finding_sequence = next_seq
            new_obs.exploratory_observation_type = ExploratoryTags.DATA_QUALITY_OBSERVATION.value
            new_obs.intent = IntentType.DATA_UNDERSTANDING.value
            the_workflow.has_exploratory_observations.append(new_obs)

        self._onto.save(file=file_path, format="rdfxml")
        return

    def delete_exploratory_obs(self, obs_seq: int) -> None:
        """Remove the observation at 1-based position *obs_seq* and renumber the rest."""
        the_workflow: Workflow = get_workflow(self._onto)
        with self._onto:
            del the_workflow.has_exploratory_observations[obs_seq - 1]

        # Keep finding_sequence contiguous after the removal.
        for pos, remaining in enumerate(the_workflow.has_exploratory_observations, start=1):
            remaining.finding_sequence = pos

        self._onto.save(file=self._file_path, format="rdfxml")
        return

    def update_exploratory_obs(self, obs: str, obs_seq: int) -> None:
        """Replace the finding text of the observation at 1-based position *obs_seq*."""
        the_workflow: Workflow = get_workflow(self._onto)
        with self._onto:
            the_workflow.has_exploratory_observations[obs_seq - 1].finding = obs

        self._onto.save(file=self._file_path, format="rdfxml")
        return
57
+
58
+
@@ -0,0 +1,30 @@
1
+ from .local_dataloader import LocalDataLoader
2
+ import pandas as pd
3
+
4
class CoffeePricesDataLoader(LocalDataLoader):
    """Load the local coffee-prices CSV and shape it into a (date, signal) frame."""

    def __init__(self, file_path: str = "data/coffee_prices.csv"):
        super().__init__(file_path)

    def load_coffee_prices(self) -> pd.DataFrame:
        """Load coffee prices data from a local CSV file.

        Returns:
            A DataFrame with columns ``date`` (datetime64) and ``signal``,
            or an empty DataFrame when the underlying file is missing/empty.
        """
        data = self.load_data()
        # Guard BEFORE renaming: assigning two column names to an empty
        # (zero-column) frame raises ValueError in pandas.
        if data.empty:
            print("No data loaded.")
            return pd.DataFrame()

        data.columns = ["date", "signal"]
        data["date"] = pd.to_datetime(data["date"])
        # Additional processing specific to coffee prices can be added here
        return data

    def get_series(self) -> pd.Series:
        """Get the 'signal' series from the coffee prices data, indexed by date."""
        data = self.load_coffee_prices()
        # Guard BEFORE indexing: the empty-path frame has no 'date' column.
        if data.empty:
            print("No data available to extract series.")
            return pd.Series(dtype=float)

        data.index = data["date"]
        return data["signal"]
@@ -0,0 +1,25 @@
1
+ import os
2
+ from kaggle.api.kaggle_api_extended import KaggleApi
3
+ from dotenv import load_dotenv
4
+ from sys import exit
5
+ from pathlib import Path
6
+
7
def download_kaggle_dataset(dataset_slug, download_path):
    """Download and unzip a Kaggle dataset into *download_path*.

    Args:
        dataset_slug: The part of the URL after kaggle.com/datasets/
            Example: 'arashnic/max-planck-weather-dataset'
        download_path: Existing directory that receives the unzipped files.

    Raises:
        NotADirectoryError: if *download_path* does not exist or is not a directory.
    """
    path = Path(download_path)
    if not path.is_dir():
        # Raise instead of print + exit(1): library code should not kill the
        # caller's interpreter, and an exception is catchable/testable.
        raise NotADirectoryError(
            f"Download path {download_path!r} does not exist or is not a directory."
        )

    load_dotenv()  # Load Kaggle credentials from a .env file, if present
    api = KaggleApi()
    api.authenticate()

    print(f"Downloading {dataset_slug}...")
    api.dataset_download_files(dataset_slug, path=download_path, unzip=True)
    print("Download complete.")
+
@@ -0,0 +1,82 @@
1
+ from owlready2 import *
2
+ from kmds.utils.load_utils import *
3
+ import pandas as pd
4
+
5
+ class KMDSDataLoader:
6
+ def __init__(self, file_path: str):
7
+ self._file_path = file_path
8
+ self._onto = self.load_kb()
9
+
10
+
11
+ return
12
+
13
+ def load_kb(self) -> Ontology:
14
+ onto2 :Ontology = load_kb(self._file_path)
15
+ return onto2
16
+
17
+ def load_exploratory_obs(self) -> pd.DataFrame:
18
+
19
+
20
+ the_workflow: Workflow = get_workflow(self._onto)
21
+ exp_obs: List[ExploratoryObservation] = the_workflow.has_exploratory_observations
22
+ records = []
23
+
24
+ for o in exp_obs:
25
+ a_row = {}
26
+ a_row["finding_seq"] = o.finding_sequence
27
+ #a_row["obs_type"] = o.exploratory_observation_type
28
+ a_row["finding"] = o.finding
29
+ records.append(a_row)
30
+ df = pd.DataFrame(records)
31
+
32
+ return df
33
+
34
+ def load_data_rep_obs(self) -> pd.DataFrame:
35
+ the_workflow: Workflow = get_workflow(self._onto)
36
+ dr_obs: List[DataRepresentationObservation] = the_workflow.has_data_representation_observations
37
+ records = []
38
+ for o in dr_obs:
39
+ a_row = {}
40
+ a_row["finding_seq"] = o.finding_sequence
41
+ a_row["obs_type"] = o.data_representation_observation_type
42
+ a_row["finding"] = o.finding
43
+ records.append(a_row)
44
+ df = pd.DataFrame(records)
45
+
46
+ return df
47
+ def load_modelling_choice_obs(self) -> pd.DataFrame:
48
+ the_workflow: Workflow = get_workflow(self._onto)
49
+ mc_obs: List[ModellingChoiceObservation] = the_workflow.has_modelling_choice_observations
50
+ records = []
51
+ for o in mc_obs:
52
+ a_row = {}
53
+ a_row["finding_seq"] = o.finding_sequence
54
+ a_row["obs_type"] = o.modelling_choice_observation_type
55
+ a_row["finding"] = o.finding
56
+ records.append(a_row)
57
+ df = pd.DataFrame(records)
58
+
59
+ return df
60
+
61
+ def load_modelling_selection_obs(self) -> pd.DataFrame:
62
+ the_workflow: Workflow = get_workflow(self._onto)
63
+ ms_obs: List[ModellingSelectionObservation] = the_workflow.has_modelling_selection_observations
64
+ records = []
65
+ for o in ms_obs:
66
+ a_row = {}
67
+ a_row["finding_seq"] = o.finding_sequence
68
+ a_row["obs_type"] = o.modelling_selection_observation_type
69
+ a_row["finding"] = o.finding
70
+ records.append(a_row)
71
+ df = pd.DataFrame(records)
72
+
73
+ return df
74
+
75
+ def export_all_observations(self) -> pd.DataFrame:
76
+ exp_df = load_exp_observations(self._onto)
77
+ dr_df = load_data_rep_observations(self._onto)
78
+ mc_df = load_modelling_choice_observations(self._onto)
79
+ ms_df = load_model_selection_observations(self._onto)
80
+ df_consolidated = pd.concat([exp_df, dr_df, mc_df, ms_df], ignore_index=True)
81
+
82
+ return df_consolidated
@@ -0,0 +1,21 @@
1
+
2
+ import pandas as pd
3
+
4
class LocalDataLoader:
    """Thin wrapper around pandas CSV reading with best-effort error handling."""

    def __init__(self, file_path: str):
        self.file_path = file_path

    def load_data(self) -> pd.DataFrame:
        """Load data from a local CSV file into a pandas DataFrame.

        Returns an empty DataFrame (after printing a message) when the file
        is missing, empty, or unreadable, rather than raising.
        """
        try:
            return pd.read_csv(self.file_path)
        except FileNotFoundError:
            print(f"Error: The file at {self.file_path} was not found.")
        except pd.errors.EmptyDataError:
            print("Error: The file is empty.")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
        return pd.DataFrame()
@@ -0,0 +1,39 @@
1
+ from .local_dataloader import LocalDataLoader
2
+ import pandas as pd
3
+ from datetime import datetime, timedelta
4
+ import numpy as np
5
+ from scipy.stats import norm
6
+ from math import ceil
7
+
8
class SyntheticSeriesDataLoader(LocalDataLoader):
    """Generate a synthetic hourly series: constant level + two sinusoids + Gaussian noise."""

    def __init__(self, file_path: str = "data/synthetic_series.csv"):
        super().__init__(file_path)

    def get_series(
        self,
        *,
        days: int = 90,
        period_1: float = 6.0,
        period_2: float = 10.0,
        peak_1: float = 5.0,
        peak_2: float = 7.0,
        level: float = 20.0,
        noise_scale: float = 0.5,
        seed: int | None = None,
    ) -> pd.Series:
        """Return a synthetic 'signal' series indexed by hourly timestamps.

        All generation constants from the original implementation are now
        keyword-only parameters with the same defaults, so existing callers
        are unaffected.

        Args:
            days: number of days of hourly samples to generate.
            period_1, period_2: sinusoid periods in hours.
            peak_1, peak_2: sinusoid amplitudes.
            level: constant offset added to the signal.
            noise_scale: standard deviation of the Gaussian noise.
            seed: optional RNG seed for reproducible noise (default: random).
        """
        now = datetime.now()
        n_samples = ceil(24 * days)  # hourly samples over *days* days

        # Vectorized equivalent of the original per-element list comprehensions.
        t = np.arange(n_samples, dtype=float)
        wave_1 = peak_1 * np.sin(2 * np.pi * t / period_1)
        wave_2 = peak_2 * np.sin(2 * np.pi * t / period_2)
        noise = norm.rvs(loc=0, scale=noise_scale, size=n_samples, random_state=seed)
        signal = level + wave_1 + wave_2 + noise

        time_vals = [now + timedelta(hours=i) for i in range(n_samples)]
        # n_samples is always >= 1, so the original "empty" branch was dead code.
        return pd.Series(signal, index=pd.Index(time_vals, name="time"), name="signal")