tseda 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tseda-0.1.0/PKG-INFO +145 -0
- tseda-0.1.0/README.md +103 -0
- tseda-0.1.0/pyproject.toml +63 -0
- tseda-0.1.0/setup.cfg +4 -0
- tseda-0.1.0/src/tseda/change_point/change_point_estimator.py +77 -0
- tseda-0.1.0/src/tseda/config/prompts_config.csv +5 -0
- tseda-0.1.0/src/tseda/data_writers/kmds_writer.py +58 -0
- tseda-0.1.0/src/tseda/dataloader/coffee_prices_data_loader.py +30 -0
- tseda-0.1.0/src/tseda/dataloader/kaggle_data_loader.py +25 -0
- tseda-0.1.0/src/tseda/dataloader/kmds_data_loader.py +82 -0
- tseda-0.1.0/src/tseda/dataloader/local_dataloader.py +21 -0
- tseda-0.1.0/src/tseda/dataloader/synthetic_series_data_loader.py +39 -0
- tseda-0.1.0/src/tseda/dataloader/white_noise_data_loader.py +26 -0
- tseda-0.1.0/src/tseda/decomposition/ssa_decomposition.py +401 -0
- tseda-0.1.0/src/tseda/decomposition/ssa_result_summary.py +258 -0
- tseda-0.1.0/src/tseda/images/tseda_workflow.png +0 -0
- tseda-0.1.0/src/tseda/periodicity/fft_analyzer.py +60 -0
- tseda-0.1.0/src/tseda/series_stats/sampling_prop.py +131 -0
- tseda-0.1.0/src/tseda/series_stats/summary_statistics.py +23 -0
- tseda-0.1.0/src/tseda/user_interface/analysis.py +163 -0
- tseda-0.1.0/src/tseda/user_interface/callback_services.py +296 -0
- tseda-0.1.0/src/tseda/user_interface/components/analysis_assessment.py +157 -0
- tseda-0.1.0/src/tseda/user_interface/components/initial_eval_components.py +491 -0
- tseda-0.1.0/src/tseda/user_interface/gemini_chat.py +45 -0
- tseda-0.1.0/src/tseda/user_interface/initial_assessment.py +36 -0
- tseda-0.1.0/src/tseda/user_interface/initial_assessment_layout.py +95 -0
- tseda-0.1.0/src/tseda/user_interface/kmds_capture.py +426 -0
- tseda-0.1.0/src/tseda/user_interface/ts_analyze_ui.py +616 -0
- tseda-0.1.0/src/tseda/visualization/autocorrelation_vis.py +51 -0
- tseda-0.1.0/src/tseda/visualization/series_histogram_visualizer.py +44 -0
- tseda-0.1.0/src/tseda/visualization/series_kde_visualizer.py +92 -0
- tseda-0.1.0/src/tseda/visualization/series_visualizer.py +66 -0
- tseda-0.1.0/src/tseda.egg-info/PKG-INFO +145 -0
- tseda-0.1.0/src/tseda.egg-info/SOURCES.txt +44 -0
- tseda-0.1.0/src/tseda.egg-info/dependency_links.txt +1 -0
- tseda-0.1.0/src/tseda.egg-info/requires.txt +29 -0
- tseda-0.1.0/src/tseda.egg-info/top_level.txt +1 -0
- tseda-0.1.0/tests/test_autocorrelation_vis.py +58 -0
- tseda-0.1.0/tests/test_change_point_estimator.py +47 -0
- tseda-0.1.0/tests/test_fft_analyzer.py +70 -0
- tseda-0.1.0/tests/test_sampling_prop.py +56 -0
- tseda-0.1.0/tests/test_series_histogram_visualizer.py +99 -0
- tseda-0.1.0/tests/test_series_kde_visualizer.py +59 -0
- tseda-0.1.0/tests/test_ssa_decomposition.py +90 -0
- tseda-0.1.0/tests/test_ssa_result_summary.py +83 -0
- tseda-0.1.0/tests/test_ts_analyze_ui.py +174 -0
tseda-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tseda
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A package for exploration of regularly sampled time series.
|
|
5
|
+
License-Expression: LicenseRef-Proprietary
|
|
6
|
+
Keywords: time-series,ssa,dash,analytics,decomposition
|
|
7
|
+
Classifier: Programming Language :: Python :: 3
|
|
8
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Intended Audience :: Science/Research
|
|
11
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
12
|
+
Requires-Python: >=3.13
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
Requires-Dist: dash>=4.1.0
|
|
15
|
+
Requires-Dist: dash-bootstrap-components>=2.0.4
|
|
16
|
+
Requires-Dist: google-genai>=1.56.0
|
|
17
|
+
Requires-Dist: importlib-resources>=6.5.2
|
|
18
|
+
Requires-Dist: ipython>=9.8.0
|
|
19
|
+
Requires-Dist: jupyter>=1.1.1
|
|
20
|
+
Requires-Dist: jupyter-ai>=2.31.7
|
|
21
|
+
Requires-Dist: jupyter-dash>=0.4.2
|
|
22
|
+
Requires-Dist: kaggle>=2.0.0
|
|
23
|
+
Requires-Dist: kdepy>=1.1.12
|
|
24
|
+
Requires-Dist: kmds>=0.3.1
|
|
25
|
+
Requires-Dist: matplotlib>=3.10.8
|
|
26
|
+
Requires-Dist: numpy>=2.4.0
|
|
27
|
+
Requires-Dist: pandas>=2.3.3
|
|
28
|
+
Requires-Dist: plotly>=6.5.0
|
|
29
|
+
Requires-Dist: python-dotenv>=1.2.1
|
|
30
|
+
Requires-Dist: req>=1.0.0
|
|
31
|
+
Requires-Dist: ruptures>=1.1.10
|
|
32
|
+
Requires-Dist: scikit-learn>=1.8.0
|
|
33
|
+
Requires-Dist: scipy<1.16.0
|
|
34
|
+
Requires-Dist: seaborn>=0.13.2
|
|
35
|
+
Requires-Dist: skrub>=0.7.2
|
|
36
|
+
Requires-Dist: ssalib>=0.1.3
|
|
37
|
+
Requires-Dist: statsmodels>=0.14.6
|
|
38
|
+
Requires-Dist: streamlit>=1.52.2
|
|
39
|
+
Provides-Extra: dev
|
|
40
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
41
|
+
Requires-Dist: pytest-cov>=7.0.0; extra == "dev"
|
|
42
|
+
|
|
43
|
+
# Time Series Explorer (`tseda`)
|
|
44
|
+
|
|
45
|
+
An application for time series exploration.
|
|
46
|
+
|
|
47
|
+
## Overview
|
|
48
|
+
|
|
49
|
+
`tseda` lets you explore regularly sampled time series with a sampling frequency of one hour or greater. It is currently limited to 2,000 samples (this is configurable).
|
|
50
|
+
|
|
51
|
+
## Three-Step Exploration Workflow
|
|
52
|
+
|
|
53
|
+
### (a) Initial Assessment
|
|
54
|
+
|
|
55
|
+
Explore the distribution and spread of values using a kernel density estimate and box plot. You get to see the raw distribution of the values. The PACF and ACF provide clues about seasonality and autoregressive components.
|
|
56
|
+
|
|
57
|
+
### (b) Decomposition Using Singular Spectral Analysis
|
|
58
|
+
|
|
59
|
+
On the basis of the sampling frequency, a window for SSA is determined. This is a heuristic assignment. For example:
|
|
60
|
+
|
|
61
|
+
| Sampling Frequency | Window Size |
|
|
62
|
+
|--------------------|-------------|
|
|
63
|
+
| Hourly | 24 |
|
|
64
|
+
| Monthly | 12 |
|
|
65
|
+
| Quarterly | 4 |
|
|
66
|
+
|
|
67
|
+
This can be changed in the UI. Based on the eigen value distribution, observations from the ACF plot and the eigen vector plot, the seasonal components can be determined if present. Based on these initial plots, the user needs to input a set of groupings and reconstruct the series with these groupings. The reconstruction plots are shown. If there is structure in the series, then change point analysis can be done using the fact that the components are smooth. A change point plot is shown. The explained variance from signal and noise components and the assessment of the noise structure (independent or correlated) is provided.
|
|
68
|
+
|
|
69
|
+
### (c) Observation Logging
|
|
70
|
+
|
|
71
|
+
The SSA is based on the eigen decomposition of the trajectory matrix. Though the raw signal is correlated, the eigenvectors are uncorrelated. If we assume that the signal is Gaussian, this also implies independence. We can use the Akaike Information Criterion for model selection and determine the AIC as a function of the rank of the model. This is shown in the observation page. An automatic summary of all the observations is provided.
|
|
72
|
+
|
|
73
|
+
## Notebook Interface
|
|
74
|
+
|
|
75
|
+
The package also provides a notebook interface to these features. If you have a new dataset that you want to analyze, look at the data loader directory for examples. Download your dataset, clean it, produce your time series, and analyze it with `tseda`.
|
|
76
|
+
|
|
77
|
+
## Getting Started
|
|
78
|
+
|
|
79
|
+
### 1. Install Dependencies
|
|
80
|
+
|
|
81
|
+
Create and activate a virtual environment, then install the required packages:
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
python -m venv .venv
|
|
85
|
+
source .venv/bin/activate # Windows: .venv\Scripts\activate
|
|
86
|
+
pip install -r requirements.txt
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Or, if you are using the package in editable/development mode:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
pip install -e .
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### 2. Run the App
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
python src/tseda/user_interface/ts_analyze_ui.py
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
The app will start a local web server. Open your browser and navigate to the URL printed in the terminal (typically `http://127.0.0.1:8050`).
|
|
102
|
+
|
|
103
|
+
### 3. Upload Your Data
|
|
104
|
+
|
|
105
|
+
- Click **"Drag and Drop or Select Files"** in the Initial Assessment panel.
|
|
106
|
+
- Your file must be a **CSV or Excel** file with at least two columns: a **timestamp** column (first) and a **numeric value** column (second).
|
|
107
|
+
- The data must be **regularly sampled at hourly or lower frequency** (e.g., hourly, daily, monthly).
|
|
108
|
+
- The dataset must contain **no missing values** (NA / NaN). Clean your data before uploading.
|
|
109
|
+
- Files are limited to **2,000 rows** (configurable via `MAX_FILE_LINES` in `ts_analyze_ui.py`).
|
|
110
|
+
|
|
111
|
+
### 4. Explore in Three Steps
|
|
112
|
+
|
|
113
|
+
| Step | Panel | What to do |
|
|
114
|
+
|------|-------|------------|
|
|
115
|
+
| 1 | **Initial Assessment of Time Series** | Review distribution plots (KDE, box plot) and the ACF / PACF for autocorrelation patterns. |
|
|
116
|
+
| 2 | **Time Series Decomposition** | Review the eigenvalue plot, then enter component groupings (e.g., Trend, Seasonal, Noise) and click **Apply Grouping**. |
|
|
117
|
+
| 3 | **Observation Logging** | Review the AIC rank diagnostics, read the auto-generated summary, and add your own observations before saving the report. |
|
|
118
|
+
|
|
119
|
+
## Build And Publish With uv
|
|
120
|
+
|
|
121
|
+
1. Build source and wheel distributions:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
uv build
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
2. Validate distributions before upload:
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
uvx twine check dist/*
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
3. Publish to PyPI using an API token:
|
|
134
|
+
|
|
135
|
+
```bash
|
|
136
|
+
export UV_PUBLISH_TOKEN="pypi-..."
|
|
137
|
+
uv publish
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
4. Publish to TestPyPI first (recommended):
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
export UV_PUBLISH_TOKEN="pypi-..."
|
|
144
|
+
uv publish --publish-url https://test.pypi.org/legacy/
|
|
145
|
+
```
|
tseda-0.1.0/README.md
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# Time Series Explorer (`tseda`)
|
|
2
|
+
|
|
3
|
+
An application for time series exploration.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
`tseda` lets you explore regularly sampled time series with a sampling frequency of one hour or greater. It is currently limited to 2,000 samples (this is configurable).
|
|
8
|
+
|
|
9
|
+
## Three-Step Exploration Workflow
|
|
10
|
+
|
|
11
|
+
### (a) Initial Assessment
|
|
12
|
+
|
|
13
|
+
Explore the distribution and spread of values using a kernel density estimate and box plot. You get to see the raw distribution of the values. The PACF and ACF provide clues about seasonality and autoregressive components.
|
|
14
|
+
|
|
15
|
+
### (b) Decomposition Using Singular Spectral Analysis
|
|
16
|
+
|
|
17
|
+
On the basis of the sampling frequency, a window for SSA is determined. This is a heuristic assignment. For example:
|
|
18
|
+
|
|
19
|
+
| Sampling Frequency | Window Size |
|
|
20
|
+
|--------------------|-------------|
|
|
21
|
+
| Hourly | 24 |
|
|
22
|
+
| Monthly | 12 |
|
|
23
|
+
| Quarterly | 4 |
|
|
24
|
+
|
|
25
|
+
This can be changed in the UI. Based on the eigen value distribution, observations from the ACF plot and the eigen vector plot, the seasonal components can be determined if present. Based on these initial plots, the user needs to input a set of groupings and reconstruct the series with these groupings. The reconstruction plots are shown. If there is structure in the series, then change point analysis can be done using the fact that the components are smooth. A change point plot is shown. The explained variance from signal and noise components and the assessment of the noise structure (independent or correlated) is provided.
|
|
26
|
+
|
|
27
|
+
### (c) Observation Logging
|
|
28
|
+
|
|
29
|
+
The SSA is based on the eigen decomposition of the trajectory matrix. Though the raw signal is correlated, the eigenvectors are uncorrelated. If we assume that the signal is Gaussian, this also implies independence. We can use the Akaike Information Criterion for model selection and determine the AIC as a function of the rank of the model. This is shown in the observation page. An automatic summary of all the observations is provided.
|
|
30
|
+
|
|
31
|
+
## Notebook Interface
|
|
32
|
+
|
|
33
|
+
The package also provides a notebook interface to these features. If you have a new dataset that you want to analyze, look at the data loader directory for examples. Download your dataset, clean it, produce your time series, and analyze it with `tseda`.
|
|
34
|
+
|
|
35
|
+
## Getting Started
|
|
36
|
+
|
|
37
|
+
### 1. Install Dependencies
|
|
38
|
+
|
|
39
|
+
Create and activate a virtual environment, then install the required packages:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
python -m venv .venv
|
|
43
|
+
source .venv/bin/activate # Windows: .venv\Scripts\activate
|
|
44
|
+
pip install -r requirements.txt
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Or, if you are using the package in editable/development mode:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install -e .
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### 2. Run the App
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
python src/tseda/user_interface/ts_analyze_ui.py
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
The app will start a local web server. Open your browser and navigate to the URL printed in the terminal (typically `http://127.0.0.1:8050`).
|
|
60
|
+
|
|
61
|
+
### 3. Upload Your Data
|
|
62
|
+
|
|
63
|
+
- Click **"Drag and Drop or Select Files"** in the Initial Assessment panel.
|
|
64
|
+
- Your file must be a **CSV or Excel** file with at least two columns: a **timestamp** column (first) and a **numeric value** column (second).
|
|
65
|
+
- The data must be **regularly sampled at hourly or lower frequency** (e.g., hourly, daily, monthly).
|
|
66
|
+
- The dataset must contain **no missing values** (NA / NaN). Clean your data before uploading.
|
|
67
|
+
- Files are limited to **2,000 rows** (configurable via `MAX_FILE_LINES` in `ts_analyze_ui.py`).
|
|
68
|
+
|
|
69
|
+
### 4. Explore in Three Steps
|
|
70
|
+
|
|
71
|
+
| Step | Panel | What to do |
|
|
72
|
+
|------|-------|------------|
|
|
73
|
+
| 1 | **Initial Assessment of Time Series** | Review distribution plots (KDE, box plot) and the ACF / PACF for autocorrelation patterns. |
|
|
74
|
+
| 2 | **Time Series Decomposition** | Review the eigenvalue plot, then enter component groupings (e.g., Trend, Seasonal, Noise) and click **Apply Grouping**. |
|
|
75
|
+
| 3 | **Observation Logging** | Review the AIC rank diagnostics, read the auto-generated summary, and add your own observations before saving the report. |
|
|
76
|
+
|
|
77
|
+
## Build And Publish With uv
|
|
78
|
+
|
|
79
|
+
1. Build source and wheel distributions:
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
uv build
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
2. Validate distributions before upload:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
uvx twine check dist/*
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
3. Publish to PyPI using an API token:
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
export UV_PUBLISH_TOKEN="pypi-..."
|
|
95
|
+
uv publish
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
4. Publish to TestPyPI first (recommended):
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
export UV_PUBLISH_TOKEN="pypi-..."
|
|
102
|
+
uv publish --publish-url https://test.pypi.org/legacy/
|
|
103
|
+
```
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "tseda"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "A package for exploration of regularly sampled time series."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.13"
|
|
7
|
+
license = "LicenseRef-Proprietary"
|
|
8
|
+
keywords = ["time-series", "ssa", "dash", "analytics", "decomposition"]
|
|
9
|
+
classifiers = [
|
|
10
|
+
"Programming Language :: Python :: 3",
|
|
11
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
12
|
+
"Operating System :: OS Independent",
|
|
13
|
+
"Intended Audience :: Science/Research",
|
|
14
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
15
|
+
]
|
|
16
|
+
dependencies = [
|
|
17
|
+
"dash>=4.1.0",
|
|
18
|
+
"dash-bootstrap-components>=2.0.4",
|
|
19
|
+
"google-genai>=1.56.0",
|
|
20
|
+
"importlib-resources>=6.5.2",
|
|
21
|
+
"ipython>=9.8.0",
|
|
22
|
+
"jupyter>=1.1.1",
|
|
23
|
+
"jupyter-ai>=2.31.7",
|
|
24
|
+
"jupyter-dash>=0.4.2",
|
|
25
|
+
"kaggle>=2.0.0",
|
|
26
|
+
"kdepy>=1.1.12",
|
|
27
|
+
"kmds>=0.3.1",
|
|
28
|
+
"matplotlib>=3.10.8",
|
|
29
|
+
"numpy>=2.4.0",
|
|
30
|
+
"pandas>=2.3.3",
|
|
31
|
+
"plotly>=6.5.0",
|
|
32
|
+
"python-dotenv>=1.2.1",
|
|
33
|
+
"req>=1.0.0",
|
|
34
|
+
"ruptures>=1.1.10",
|
|
35
|
+
"scikit-learn>=1.8.0",
|
|
36
|
+
"scipy<1.16.0",
|
|
37
|
+
"seaborn>=0.13.2",
|
|
38
|
+
"skrub>=0.7.2",
|
|
39
|
+
"ssalib>=0.1.3",
|
|
40
|
+
"statsmodels>=0.14.6",
|
|
41
|
+
"streamlit>=1.52.2",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
[project.optional-dependencies]
|
|
45
|
+
dev = [
|
|
46
|
+
"pytest>=8.0",
|
|
47
|
+
"pytest-cov>=7.0.0",
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
[build-system]
|
|
51
|
+
requires = ["setuptools>=68", "wheel"]
|
|
52
|
+
build-backend = "setuptools.build_meta"
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
[tool.setuptools.packages.find]
|
|
56
|
+
where = ["src"]
|
|
57
|
+
include = ["tseda*"]
|
|
58
|
+
|
|
59
|
+
[tool.setuptools.package-data]
|
|
60
|
+
tseda = [
|
|
61
|
+
"config/*.csv",
|
|
62
|
+
"images/*",
|
|
63
|
+
]
|
tseda-0.1.0/setup.cfg
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import ruptures as rpt
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class PELT_ChangePointEstimator:
    """Estimate change points with the PELT algorithm and return a predicted segment series."""

    def __init__(self, series: pd.Series, model: str = "rbf") -> None:
        """Fit PELT on *series* and precompute the per-observation segment labels.

        Args:
            series: Non-empty pandas Series of observations.
            model: Cost model name passed to ``ruptures.Pelt``.

        Raises:
            ValueError: If *series* is None or empty.
        """
        if series is None or len(series) == 0:
            raise ValueError("Input series must be a non-empty pandas Series.")

        self._series = series
        self._model = model
        self._n = len(series)
        # BIC-style penalty: 2 * log(n).
        self._penalty = float(2 * np.log(self._n))

        observations = series.to_numpy().reshape(-1, 1)
        self._algo = rpt.Pelt(model=model).fit(observations)
        self._change_pts = self._algo.predict(pen=self._penalty)
        self._predicted_series = self._build_predicted_series(self._change_pts)

    def _build_predicted_series(self, change_points: list[int]) -> pd.Series:
        """Translate breakpoint end-indices into a labeled series aligned to the input index."""
        labels: list[str] = []
        previous_end = 0
        for seg_no, boundary in enumerate(change_points, start=1):
            labels += [f"segment-{seg_no}"] * max(0, boundary - previous_end)
            previous_end = boundary

        # Defensive padding: label any trailing observations past the last breakpoint.
        shortfall = self._n - len(labels)
        if shortfall > 0:
            labels += [f"segment-{len(change_points) + 1}"] * shortfall

        return pd.Series(labels[: self._n], index=self._series.index, name="segment")

    def predict_series(self) -> pd.Series:
        """Return a copy of the predicted segment label series."""
        return self._predicted_series.copy()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class ChangePointEstimator:
    """Compatibility wrapper used by existing tests and call sites."""

    def __init__(self, series: pd.Series) -> None:
        """Store *series* and build a date/signal frame; raise ValueError on empty input."""
        if series is None or len(series) == 0:
            raise ValueError("Input series must be a non-empty pandas Series.")

        self._series = series
        self._df = pd.DataFrame({"date": series.index, "signal": series.values})
        self._change_pts: list[int] | None = None

    def estimate_change_points(self, penalty_coeff: float = 2.0) -> pd.Series:
        """Run PELT and assign segment labels for each observation.

        Args:
            penalty_coeff: Multiplier on log(n) used as the PELT penalty.

        Returns:
            The ``segment`` label column, aligned with the stored frame.
        """
        n_obs = len(self._series)
        pen = float(penalty_coeff * np.log(n_obs))
        observations = self._series.to_numpy().reshape(-1, 1)

        detector = rpt.Pelt(model="rbf").fit(observations)
        self._change_pts = detector.predict(pen=pen)

        labels: list[str] = []
        previous_end = 0
        for seg_no, boundary in enumerate(self._change_pts, start=1):
            labels += [f"segment-{seg_no}"] * max(0, boundary - previous_end)
            previous_end = boundary

        # Defensive padding for any trailing observations past the last breakpoint.
        if len(labels) < n_obs:
            labels += [f"segment-{len(self._change_pts) + 1}"] * (n_obs - len(labels))

        self._df["segment"] = labels[:n_obs]
        return self._df["segment"]
|
|
76
|
+
|
|
77
|
+
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
previous_state,prompt
|
|
2
|
+
START,You should upload a regularly sampled csv file for analysis
|
|
3
|
+
FILE_PROCESSED,"You should estimate the density for the series and inspect its plot to see if there are multiple modes. If you see this, it means there were different regimes in your time series and you should analyze and explore these regimes independently. You can get an approximate estimate of the inflection points in the density estimate, so that you can sense if there are multiple regimes. You can then apply a change point detection algorithm to estimate the location of these changes. Shall I proceed with the inflection point estimation in the density curve?"
|
|
4
|
+
INFLECTION_POINTS_EST,You should run a change point detection algorithm to locate the estimated time of change of your time series behavior. Shall I do that?
|
|
5
|
+
CHANGE_POINTS_IDENTIFIED,You should plot the change points on the time series with the different segment color coded. Shall I do that?
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
from kmds.tagging.tag_types import ExploratoryTags
|
|
2
|
+
from kmds.ontology.intent_types import IntentType
|
|
3
|
+
from owlready2 import *
|
|
4
|
+
from kmds.utils.load_utils import *
|
|
5
|
+
|
|
6
|
+
class KMDSDataWriter:
    """Persist exploratory observations into a KMDS knowledge base (ontology) file."""

    def __init__(self, file_path: str):
        """Load the knowledge base stored at *file_path*."""
        self._file_path = file_path
        self._onto = self.load_kb()

    def load_kb(self) -> Ontology:
        """Load and return the ontology from the configured file path.

        NOTE: this method intentionally shadows the module-level ``load_kb``
        brought in by the ``kmds.utils.load_utils`` star import; the call
        below resolves to that module-level helper.
        """
        onto: Ontology = load_kb(self._file_path)
        return onto

    def add_exploratory_obs(self, obs: str, file_path: str | None = None) -> None:
        """Append an exploratory observation and persist the knowledge base.

        Args:
            obs: The finding text to record.
            file_path: Destination file for the saved ontology. Defaults to
                the path given at construction time, for consistency with
                ``delete_exploratory_obs`` and ``update_exploratory_obs``.
        """
        destination = self._file_path if file_path is None else file_path
        the_workflow: Workflow = get_workflow(self._onto)

        with self._onto:
            # Sequence numbers are 1-based and continue from the current count.
            observation_count: int = len(the_workflow.has_exploratory_observations) + 1
            e1 = ExploratoryObservation(namespace=self._onto)
            e1.finding = obs
            e1.finding_sequence = observation_count
            e1.exploratory_observation_type = ExploratoryTags.DATA_QUALITY_OBSERVATION.value
            e1.intent = IntentType.DATA_UNDERSTANDING.value
            the_workflow.has_exploratory_observations.append(e1)

        self._onto.save(file=destination, format="rdfxml")

    def delete_exploratory_obs(self, obs_seq: int) -> None:
        """Delete the observation with 1-based sequence *obs_seq* and renumber the rest."""
        the_workflow: Workflow = get_workflow(self._onto)
        with self._onto:
            del the_workflow.has_exploratory_observations[obs_seq - 1]

            # Re-sequence the remaining observations so numbering stays contiguous.
            for idx in range(len(the_workflow.has_exploratory_observations)):
                the_workflow.has_exploratory_observations[idx].finding_sequence = idx + 1

        self._onto.save(file=self._file_path, format="rdfxml")

    def update_exploratory_obs(self, obs: str, obs_seq: int) -> None:
        """Replace the finding text of the observation with 1-based sequence *obs_seq*."""
        the_workflow: Workflow = get_workflow(self._onto)
        with self._onto:
            the_workflow.has_exploratory_observations[obs_seq - 1].finding = obs

        self._onto.save(file=self._file_path, format="rdfxml")
|
|
57
|
+
|
|
58
|
+
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
from .local_dataloader import LocalDataLoader
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
class CoffeePricesDataLoader(LocalDataLoader):
    """Load a local coffee-prices CSV into a date-indexed signal series."""

    def __init__(self, file_path: str = "data/coffee_prices.csv"):
        super().__init__(file_path)

    def load_coffee_prices(self) -> pd.DataFrame:
        """Load coffee prices data from a local CSV file.

        Returns:
            A DataFrame with columns ``date`` (parsed to datetime) and
            ``signal``, or an empty DataFrame when nothing could be loaded.
        """
        data = self.load_data()

        # Bug fix: check emptiness BEFORE renaming columns. A failed load
        # returns a zero-column frame, and assigning two column names to it
        # raises a ValueError.
        if data.empty:
            print("No data loaded.")
            return pd.DataFrame()

        data.columns = ["date", "signal"]
        data.date = pd.to_datetime(data.date)
        # Additional processing specific to coffee prices can be added here
        return data

    def get_series(self) -> pd.Series:
        """Get the 'signal' series from the coffee prices data, indexed by date."""
        data = self.load_coffee_prices()

        # Bug fix: guard before touching ``data.date`` — an empty frame has
        # no such column and the original index assignment would raise.
        if data.empty:
            print("No data available to extract series.")
            return pd.Series(dtype=float)

        data.index = data.date
        return data["signal"]
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from kaggle.api.kaggle_api_extended import KaggleApi
|
|
3
|
+
from dotenv import load_dotenv
|
|
4
|
+
from sys import exit
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
def download_kaggle_dataset(dataset_slug, download_path):
    """Download and unzip a Kaggle dataset into *download_path*.

    Args:
        dataset_slug: The part of the URL after kaggle.com/datasets/
            Example: 'arashnic/max-planck-weather-dataset'
        download_path: Existing directory the dataset is extracted into.

    Exits the process with status 1 when *download_path* is not a directory.
    """
    path = Path(download_path)
    if not path.is_dir():
        # Fixed garbled punctuation in the original message ("is not valid., please").
        print("Directory does not exist or is not valid. Please check the path and try again.")
        exit(1)

    load_dotenv()  # Kaggle credentials may be supplied via a .env file
    api = KaggleApi()
    api.authenticate()

    print(f"Downloading {dataset_slug}...")
    api.dataset_download_files(dataset_slug, path=download_path, unzip=True)
    print("Download complete.")
|
|
25
|
+
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from owlready2 import *
|
|
2
|
+
from kmds.utils.load_utils import *
|
|
3
|
+
import pandas as pd
|
|
4
|
+
|
|
5
|
+
class KMDSDataLoader:
    """Read observation records out of a KMDS knowledge base (ontology) file."""

    def __init__(self, file_path: str):
        """Load the knowledge base stored at *file_path*."""
        self._file_path = file_path
        self._onto = self.load_kb()

    def load_kb(self) -> Ontology:
        """Load and return the ontology from the configured file path.

        NOTE: this method intentionally shadows the module-level ``load_kb``
        brought in by the ``kmds.utils.load_utils`` star import; the call
        below resolves to that module-level helper.
        """
        onto: Ontology = load_kb(self._file_path)
        return onto

    @staticmethod
    def _observations_to_frame(observations, type_attr: str | None = None) -> pd.DataFrame:
        """Build a finding_seq/[obs_type]/finding DataFrame from *observations*.

        Args:
            observations: Iterable of observation individuals with
                ``finding_sequence`` and ``finding`` attributes.
            type_attr: Optional attribute name providing the ``obs_type``
                column; omitted for exploratory observations.
        """
        records = []
        for o in observations:
            a_row = {"finding_seq": o.finding_sequence}
            if type_attr is not None:
                a_row["obs_type"] = getattr(o, type_attr)
            a_row["finding"] = o.finding
            records.append(a_row)
        return pd.DataFrame(records)

    def load_exploratory_obs(self) -> pd.DataFrame:
        """Return the workflow's exploratory observations (no obs_type column)."""
        the_workflow: Workflow = get_workflow(self._onto)
        return self._observations_to_frame(the_workflow.has_exploratory_observations)

    def load_data_rep_obs(self) -> pd.DataFrame:
        """Return the workflow's data-representation observations."""
        the_workflow: Workflow = get_workflow(self._onto)
        return self._observations_to_frame(
            the_workflow.has_data_representation_observations,
            "data_representation_observation_type",
        )

    def load_modelling_choice_obs(self) -> pd.DataFrame:
        """Return the workflow's modelling-choice observations."""
        the_workflow: Workflow = get_workflow(self._onto)
        return self._observations_to_frame(
            the_workflow.has_modelling_choice_observations,
            "modelling_choice_observation_type",
        )

    def load_modelling_selection_obs(self) -> pd.DataFrame:
        """Return the workflow's modelling-selection observations."""
        the_workflow: Workflow = get_workflow(self._onto)
        return self._observations_to_frame(
            the_workflow.has_modelling_selection_observations,
            "modelling_selection_observation_type",
        )

    def export_all_observations(self) -> pd.DataFrame:
        """Concatenate all observation categories into one DataFrame.

        NOTE(review): relies on module-level helpers (``load_exp_observations``
        etc.) expected from the ``kmds.utils.load_utils`` star import —
        confirm they exist; otherwise these calls should be replaced with the
        instance methods above.
        """
        exp_df = load_exp_observations(self._onto)
        dr_df = load_data_rep_observations(self._onto)
        mc_df = load_modelling_choice_observations(self._onto)
        ms_df = load_model_selection_observations(self._onto)
        return pd.concat([exp_df, dr_df, mc_df, ms_df], ignore_index=True)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
|
|
2
|
+
import pandas as pd
|
|
3
|
+
|
|
4
|
+
class LocalDataLoader:
    """Thin wrapper around ``pd.read_csv`` that reports failures instead of raising."""

    def __init__(self, file_path: str):
        self.file_path = file_path

    def load_data(self) -> pd.DataFrame:
        """Load data from a local CSV file into a pandas DataFrame.

        On any failure a diagnostic is printed and an empty DataFrame is
        returned, so callers never see an exception.
        """
        try:
            return pd.read_csv(self.file_path)
        except FileNotFoundError:
            print(f"Error: The file at {self.file_path} was not found.")
        except pd.errors.EmptyDataError:
            print("Error: The file is empty.")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
        return pd.DataFrame()
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
from .local_dataloader import LocalDataLoader
|
|
2
|
+
import pandas as pd
|
|
3
|
+
from datetime import datetime, timedelta
|
|
4
|
+
import numpy as np
|
|
5
|
+
from scipy.stats import norm
|
|
6
|
+
from math import ceil
|
|
7
|
+
|
|
8
|
+
class SyntheticSeriesDataLoader(LocalDataLoader):
    """Generate a synthetic hourly series: constant level + two sinusoids + Gaussian noise."""

    def __init__(self, file_path: str = "data/synthetic_series.csv"):
        super().__init__(file_path)

    def get_series(self, seed: int | None = None) -> pd.Series:
        """Get the 'signal' series from the synthetic series data.

        Args:
            seed: Optional seed for the noise component. Defaults to None,
                preserving the original non-reproducible behaviour; pass an
                int for reproducible output.

        Returns:
            An hourly datetime-indexed series of length 24 * 90 (90 days of
            hourly data), composed of a constant level of 20, sinusoids of
            period 6 and 10 samples (amplitudes 5 and 7), and N(0, 0.5) noise.
        """
        now = datetime.now()
        p1_peak = 5   # amplitude of the first sinusoid
        p2_peak = 7   # amplitude of the second sinusoid
        p1 = 6        # period of the first sinusoid (in samples)
        p2 = 10       # period of the second sinusoid (in samples)
        N = ceil(24 * 90 / 1)  # 90 days of hourly data

        # Vectorized construction replaces the original per-element list loops.
        t = np.arange(N, dtype=float)
        p1_vals = p1_peak * np.sin(2 * np.pi * t / p1)
        p2_vals = p2_peak * np.sin(2 * np.pi * t / p2)
        noise = norm.rvs(loc=0, scale=0.5, size=N, random_state=seed)
        signal = 20.0 + p1_vals + p2_vals + noise

        # The original's empty-frame branch was unreachable (N = 2160 > 0),
        # so the series is built directly.
        time_vals = [now + timedelta(hours=i) for i in range(N)]
        return pd.Series(signal, index=pd.Index(time_vals, name="time"), name="signal")
|