timeseries-qc 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- timeseries_qc-0.1.0/PKG-INFO +198 -0
- timeseries_qc-0.1.0/README.md +166 -0
- timeseries_qc-0.1.0/pyproject.toml +72 -0
- timeseries_qc-0.1.0/setup.cfg +4 -0
- timeseries_qc-0.1.0/tests/test_checker.py +183 -0
- timeseries_qc-0.1.0/tests/test_export.py +70 -0
- timeseries_qc-0.1.0/tests/test_rules.py +207 -0
- timeseries_qc-0.1.0/tests/test_time_health.py +194 -0
- timeseries_qc-0.1.0/tests/test_viz.py +150 -0
- timeseries_qc-0.1.0/tests/test_yaml_parser.py +193 -0
- timeseries_qc-0.1.0/timeseries_qc.egg-info/PKG-INFO +198 -0
- timeseries_qc-0.1.0/timeseries_qc.egg-info/SOURCES.txt +26 -0
- timeseries_qc-0.1.0/timeseries_qc.egg-info/dependency_links.txt +1 -0
- timeseries_qc-0.1.0/timeseries_qc.egg-info/requires.txt +11 -0
- timeseries_qc-0.1.0/timeseries_qc.egg-info/top_level.txt +1 -0
- timeseries_qc-0.1.0/tsqc/__init__.py +23 -0
- timeseries_qc-0.1.0/tsqc/checker.py +272 -0
- timeseries_qc-0.1.0/tsqc/config/__init__.py +0 -0
- timeseries_qc-0.1.0/tsqc/config/yaml_parser.py +163 -0
- timeseries_qc-0.1.0/tsqc/result.py +368 -0
- timeseries_qc-0.1.0/tsqc/rules/__init__.py +4 -0
- timeseries_qc-0.1.0/tsqc/rules/base.py +37 -0
- timeseries_qc-0.1.0/tsqc/rules/builtins.py +169 -0
- timeseries_qc-0.1.0/tsqc/time_health/__init__.py +0 -0
- timeseries_qc-0.1.0/tsqc/time_health/checker.py +204 -0
- timeseries_qc-0.1.0/tsqc/viz/__init__.py +0 -0
- timeseries_qc-0.1.0/tsqc/viz/rle.py +93 -0
- timeseries_qc-0.1.0/tsqc/viz/timeline.py +139 -0
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: timeseries-qc
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Classify timeseries rows as Good / Sus / Bad and render a multi-tag quality timeline chart.
|
|
5
|
+
Author: timeseries-qc contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://nagusubra.github.io/timeseries-qc/
|
|
8
|
+
Project-URL: Repository, https://github.com/nagusubra/timeseries-qc
|
|
9
|
+
Project-URL: Issues, https://github.com/nagusubra/timeseries-qc/issues
|
|
10
|
+
Keywords: timeseries,data quality,QC,SCADA,IoT,pandas
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
18
|
+
Classifier: Intended Audience :: Science/Research
|
|
19
|
+
Classifier: Operating System :: OS Independent
|
|
20
|
+
Requires-Python: >=3.9
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
Requires-Dist: pandas>=1.5
|
|
23
|
+
Requires-Dist: plotly>=5.0
|
|
24
|
+
Requires-Dist: pyyaml>=6.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
27
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
28
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
29
|
+
Requires-Dist: mypy>=1.0; extra == "dev"
|
|
30
|
+
Requires-Dist: build>=1.0; extra == "dev"
|
|
31
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
32
|
+
|
|
33
|
+
# timeseries-qc
|
|
34
|
+
|
|
35
|
+
[timeseries-qc on PyPI](https://pypi.org/project/timeseries-qc/)
|
|
36
|
+
[MIT License](LICENSE)
|
|
37
|
+
|
|
38
|
+
**The open source data quality-control layer for SCADA, DCS, IoT, and historian timeseries data.**
|
|
39
|
+
|
|
40
|
+
Add `good / sus / bad` quality labels to every row of a pandas DataFrame in five lines. Then render a multi-tag horizontal status timeline, the chart that no other open-source library produces.
|
|
41
|
+
|
|
42
|
+
`timeseries-qc` is a timeseries data quality check that is simple to digest and understand. Catch the issues in your process data before they affect your downstream analytics and business decisions. Build data quality checks based on business rules and monitor them through interactive graph components.
|
|
43
|
+
|
|
44
|
+
**Sample Input - Solar farm SCADA data:**
|
|
45
|
+
|
|
46
|
+
```text
|
|
47
|
+
| timestamp | tag_name | value |
|
|
48
|
+
| :------------------------ | :------------- | :------ |
|
|
49
|
+
| 2026-01-01 00:00:00+00:00 | INVERTER.MW | 42.1 |
|
|
50
|
+
| 2026-01-01 01:00:00+00:00 | INVERTER.MW | NULL | <-- timeseries_qc will catch this (Null value)
|
|
51
|
+
| 2026-01-01 02:00:00+00:00 | INVERTER.MW | 52.3 |
|
|
52
|
+
| 2026-01-01 00:00:00+00:00 | MET.IRRADIANCE | 600.001 |
|
|
53
|
+
| 2026-01-01 01:00:00+00:00 | MET.IRRADIANCE | 600.001 | <-- timeseries_qc will catch this (Stale/Frozen value)
|
|
54
|
+
| 2026-01-01 02:00:00+00:00 | MET.IRRADIANCE | 810.818 |
|
|
55
|
+
| 2026-01-01 00:00:00+00:00 | TRACKER.ANGLE | 30.22 |
|
|
56
|
+
| 2026-01-01 01:00:00+00:00 | TRACKER.ANGLE | 45.31 |
|
|
57
|
+
| 2026-01-01 02:00:00+00:00 | TRACKER.ANGLE | 60.22 |
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
**Sample Output - Solar farm SCADA data:**
|
|
61
|
+
|
|
62
|
+

|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
**Sample Input - Oil field SCADA data:**
|
|
66
|
+
|
|
67
|
+
```text
|
|
68
|
+
| timestamp | tag_name | value |
|
|
69
|
+
| :------------------------ | :----------- | :----- |
|
|
70
|
+
| 2026-01-01 00:00:00+00:00 | WHP.PSIG | 0 | <-- timeseries_qc will catch this (Flatline/Zero)
|
|
71
|
+
| 2026-01-01 01:00:00+00:00 | WHP.PSIG | 0 | <-- timeseries_qc will catch this (Flatline/Zero)
|
|
72
|
+
| 2026-01-01 02:00:00+00:00 | WHP.PSIG | 0 | <-- timeseries_qc will catch this (Flatline/Zero)
|
|
73
|
+
| 2026-01-01 00:00:00+00:00 | FMRATE.MSCFD | 12.1 |
|
|
74
|
+
| 2026-01-01 01:00:00+00:00 | FMRATE.MSCFD | 90.99 | <-- timeseries_qc will catch this (Rate-of-change spike)
|
|
75
|
+
| 2026-01-01 02:00:00+00:00 | FMRATE.MSCFD | 12.3 |
|
|
76
|
+
| 2026-01-01 00:00:00+00:00 | OHT.TEMP_F | 30.2 |
|
|
77
|
+
| 2026-01-01 01:00:00+00:00 | OHT.TEMP_F | 45.2 |
|
|
78
|
+
| 2026-01-01 02:00:00+00:00 | OHT.TEMP_F | 6000.2 | <-- timeseries_qc will catch this (Out of bounds)
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
**Sample Output - Oil field SCADA data:**
|
|
82
|
+
|
|
83
|
+

|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
## Features
|
|
87
|
+
|
|
88
|
+
- **Four built-in rules** cover ≥80% of real-world bad data: `NullRule`, `FlatlineRule`, `DeltaRule`, `RangeRule`
|
|
89
|
+
- **Timeline chart** (`result.plot()`) — Plotly Gantt-style, one row per tag, Green/Yellow/Red, hover tooltips
|
|
90
|
+
- **YAML config** — non-coders set thresholds in a text file, no Python required
|
|
91
|
+
- **Timestamp health** (`result.check_timestamps()`) — detects gaps, duplicates, non-monotonic ordering, frequency drift, and DST ambiguity
|
|
92
|
+
- **Self-contained HTML export** (`result.export_report("report.html")`) — offline, no CDN, includes per-issue summary table
|
|
93
|
+
- **Per-issue breakdown** (`result.issue_summary()`) — start/end times, row count, duration, and status for each contiguous bad/sus segment
|
|
94
|
+
- **Pandas-native** — works with any DataFrame that has `timestamp`, `tag_name`, `value` columns
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## Installation
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
pip install timeseries-qc
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## Quickstart (5 lines)
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
import tsqc
|
|
110
|
+
import pandas as pd
|
|
111
|
+
|
|
112
|
+
df = pd.read_csv("sensor_data.csv") # columns: timestamp, tag_name, value
|
|
113
|
+
result = tsqc.check(df, assume_tz="UTC") # assume_tz required for tz-naive CSVs
|
|
114
|
+
result.plot().show() # renders the multi-tag quality timeline
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
If your CSV already contains tz-aware timestamps (ISO 8601 with `+00:00`), omit `assume_tz`.
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
## YAML Config Example
|
|
122
|
+
|
|
123
|
+
```yaml
|
|
124
|
+
# tsqc_rules.yaml
|
|
125
|
+
default_rules:
|
|
126
|
+
- check: null
|
|
127
|
+
level: bad
|
|
128
|
+
- check: flatline
|
|
129
|
+
window: 1h
|
|
130
|
+
min_delta: 0.001
|
|
131
|
+
level: sus
|
|
132
|
+
- check: delta
|
|
133
|
+
threshold: 50.0
|
|
134
|
+
level: sus
|
|
135
|
+
|
|
136
|
+
tag_rules:
|
|
137
|
+
FOREBAY.LEVEL:
|
|
138
|
+
- check: range
|
|
139
|
+
min: 900
|
|
140
|
+
max: 1100
|
|
141
|
+
level: bad
|
|
142
|
+
"GENERATOR.*":
|
|
143
|
+
- check: range
|
|
144
|
+
min: 0
|
|
145
|
+
max: 200
|
|
146
|
+
level: bad
|
|
147
|
+
- check: flatline
|
|
148
|
+
window: 30min
|
|
149
|
+
min_delta: 0.5 # 0 MW for <30min is valid; longer flatline at non-zero is suspect
|
|
150
|
+
level: sus
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
result = tsqc.check(df, rules="tsqc_rules.yaml")
|
|
155
|
+
result.summary() # DataFrame: pct_good/sus/bad per tag
|
|
156
|
+
result.issue_summary() # DataFrame: per-issue runs (start, end, rows, duration)
|
|
157
|
+
result.check_timestamps() # DataFrame: gap/duplicate/non_monotonic issues
|
|
158
|
+
result.export_report("report.html") # Full HTML with chart + all tables
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## Output Schema
|
|
164
|
+
|
|
165
|
+
`result.df` adds two columns to your DataFrame:
|
|
166
|
+
|
|
167
|
+
| Column | Values | Notes |
|
|
168
|
+
|--------|--------|-------|
|
|
169
|
+
| `quality` | `"good"`, `"sus"`, `"bad"` | Worst-level rule wins |
|
|
170
|
+
| `quality_reasons` | e.g. `"flatline\|range"` | Pipe-delimited triggered rule names |
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
## Comparison with Alternatives
|
|
175
|
+
|
|
176
|
+
**Pecos** (Sandia Labs) offers binary pass/fail and has been in maintenance mode since 2021 — no timeline chart and no YAML config. **SaQC** (Helmholtz UFZ) is a rich flagging engine for environmental science but has an environmental-domain API, no timeline visualization, and an LGPL license. **Great Expectations** is not timeseries-native and produces no visualization. `timeseries-qc` is the only library that combines (1) Good/Sus/Bad classification, (2) the multi-tag horizontal status timeline, and (3) YAML-driven configuration in a single `pip install`.
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## Examples
|
|
181
|
+
|
|
182
|
+
- [examples/solar_farm.ipynb](examples/solar_farm.ipynb) — solar farm SCADA data with anomaly injection
|
|
183
|
+
- [examples/oilfield.ipynb](examples/oilfield.ipynb) — oil well pad SCADA data with anomaly injection
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
## Known Limitations (v0.1.0)
|
|
188
|
+
|
|
189
|
+
1. **Pandas only.** PySpark and Polars support are deferred.
|
|
190
|
+
2. **No YAML override of default rules.** Tag-specific rules are applied in addition to the default rules; they do not replace them.
|
|
191
|
+
3. **Visualization requires Plotly ≥ 5.0.** Matplotlib output not supported.
|
|
192
|
+
4. **`DeltaRule` is point-to-point diff only.** Rolling-window delta is a v0.2 feature.
|
|
193
|
+
|
|
194
|
+
---
|
|
195
|
+
|
|
196
|
+
## License
|
|
197
|
+
|
|
198
|
+
MIT © timeseries-qc contributors
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# timeseries-qc
|
|
2
|
+
|
|
3
|
+
[timeseries-qc on PyPI](https://pypi.org/project/timeseries-qc/)
|
|
4
|
+
[MIT License](LICENSE)
|
|
5
|
+
|
|
6
|
+
**The open source data quality-control layer for SCADA, DCS, IoT, and historian timeseries data.**
|
|
7
|
+
|
|
8
|
+
Add `good / sus / bad` quality labels to every row of a pandas DataFrame in five lines. Then render a multi-tag horizontal status timeline, the chart that no other open-source library produces.
|
|
9
|
+
|
|
10
|
+
`timeseries-qc` is a timeseries data quality check that is simple to digest and understand. Catch the issues in your process data before they affect your downstream analytics and business decisions. Build data quality checks based on business rules and monitor them through interactive graph components.
|
|
11
|
+
|
|
12
|
+
**Sample Input - Solar farm SCADA data:**
|
|
13
|
+
|
|
14
|
+
```text
|
|
15
|
+
| timestamp | tag_name | value |
|
|
16
|
+
| :------------------------ | :------------- | :------ |
|
|
17
|
+
| 2026-01-01 00:00:00+00:00 | INVERTER.MW | 42.1 |
|
|
18
|
+
| 2026-01-01 01:00:00+00:00 | INVERTER.MW | NULL | <-- timeseries_qc will catch this (Null value)
|
|
19
|
+
| 2026-01-01 02:00:00+00:00 | INVERTER.MW | 52.3 |
|
|
20
|
+
| 2026-01-01 00:00:00+00:00 | MET.IRRADIANCE | 600.001 |
|
|
21
|
+
| 2026-01-01 01:00:00+00:00 | MET.IRRADIANCE | 600.001 | <-- timeseries_qc will catch this (Stale/Frozen value)
|
|
22
|
+
| 2026-01-01 02:00:00+00:00 | MET.IRRADIANCE | 810.818 |
|
|
23
|
+
| 2026-01-01 00:00:00+00:00 | TRACKER.ANGLE | 30.22 |
|
|
24
|
+
| 2026-01-01 01:00:00+00:00 | TRACKER.ANGLE | 45.31 |
|
|
25
|
+
| 2026-01-01 02:00:00+00:00 | TRACKER.ANGLE | 60.22 |
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
**Sample Output - Solar farm SCADA data:**
|
|
29
|
+
|
|
30
|
+

|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
**Sample Input - Oil field SCADA data:**
|
|
34
|
+
|
|
35
|
+
```text
|
|
36
|
+
| timestamp | tag_name | value |
|
|
37
|
+
| :------------------------ | :----------- | :----- |
|
|
38
|
+
| 2026-01-01 00:00:00+00:00 | WHP.PSIG | 0 | <-- timeseries_qc will catch this (Flatline/Zero)
|
|
39
|
+
| 2026-01-01 01:00:00+00:00 | WHP.PSIG | 0 | <-- timeseries_qc will catch this (Flatline/Zero)
|
|
40
|
+
| 2026-01-01 02:00:00+00:00 | WHP.PSIG | 0 | <-- timeseries_qc will catch this (Flatline/Zero)
|
|
41
|
+
| 2026-01-01 00:00:00+00:00 | FMRATE.MSCFD | 12.1 |
|
|
42
|
+
| 2026-01-01 01:00:00+00:00 | FMRATE.MSCFD | 90.99 | <-- timeseries_qc will catch this (Rate-of-change spike)
|
|
43
|
+
| 2026-01-01 02:00:00+00:00 | FMRATE.MSCFD | 12.3 |
|
|
44
|
+
| 2026-01-01 00:00:00+00:00 | OHT.TEMP_F | 30.2 |
|
|
45
|
+
| 2026-01-01 01:00:00+00:00 | OHT.TEMP_F | 45.2 |
|
|
46
|
+
| 2026-01-01 02:00:00+00:00 | OHT.TEMP_F | 6000.2 | <-- timeseries_qc will catch this (Out of bounds)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
**Sample Output - Oil field SCADA data:**
|
|
50
|
+
|
|
51
|
+

|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
## Features
|
|
55
|
+
|
|
56
|
+
- **Four built-in rules** cover ≥80% of real-world bad data: `NullRule`, `FlatlineRule`, `DeltaRule`, `RangeRule`
|
|
57
|
+
- **Timeline chart** (`result.plot()`) — Plotly Gantt-style, one row per tag, Green/Yellow/Red, hover tooltips
|
|
58
|
+
- **YAML config** — non-coders set thresholds in a text file, no Python required
|
|
59
|
+
- **Timestamp health** (`result.check_timestamps()`) — detects gaps, duplicates, non-monotonic ordering, frequency drift, and DST ambiguity
|
|
60
|
+
- **Self-contained HTML export** (`result.export_report("report.html")`) — offline, no CDN, includes per-issue summary table
|
|
61
|
+
- **Per-issue breakdown** (`result.issue_summary()`) — start/end times, row count, duration, and status for each contiguous bad/sus segment
|
|
62
|
+
- **Pandas-native** — works with any DataFrame that has `timestamp`, `tag_name`, `value` columns
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## Installation
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
pip install timeseries-qc
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## Quickstart (5 lines)
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
import tsqc
|
|
78
|
+
import pandas as pd
|
|
79
|
+
|
|
80
|
+
df = pd.read_csv("sensor_data.csv") # columns: timestamp, tag_name, value
|
|
81
|
+
result = tsqc.check(df, assume_tz="UTC") # assume_tz required for tz-naive CSVs
|
|
82
|
+
result.plot().show() # renders the multi-tag quality timeline
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
If your CSV already contains tz-aware timestamps (ISO 8601 with `+00:00`), omit `assume_tz`.
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
## YAML Config Example
|
|
90
|
+
|
|
91
|
+
```yaml
|
|
92
|
+
# tsqc_rules.yaml
|
|
93
|
+
default_rules:
|
|
94
|
+
- check: null
|
|
95
|
+
level: bad
|
|
96
|
+
- check: flatline
|
|
97
|
+
window: 1h
|
|
98
|
+
min_delta: 0.001
|
|
99
|
+
level: sus
|
|
100
|
+
- check: delta
|
|
101
|
+
threshold: 50.0
|
|
102
|
+
level: sus
|
|
103
|
+
|
|
104
|
+
tag_rules:
|
|
105
|
+
FOREBAY.LEVEL:
|
|
106
|
+
- check: range
|
|
107
|
+
min: 900
|
|
108
|
+
max: 1100
|
|
109
|
+
level: bad
|
|
110
|
+
"GENERATOR.*":
|
|
111
|
+
- check: range
|
|
112
|
+
min: 0
|
|
113
|
+
max: 200
|
|
114
|
+
level: bad
|
|
115
|
+
- check: flatline
|
|
116
|
+
window: 30min
|
|
117
|
+
min_delta: 0.5 # 0 MW for <30min is valid; longer flatline at non-zero is suspect
|
|
118
|
+
level: sus
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
result = tsqc.check(df, rules="tsqc_rules.yaml")
|
|
123
|
+
result.summary() # DataFrame: pct_good/sus/bad per tag
|
|
124
|
+
result.issue_summary() # DataFrame: per-issue runs (start, end, rows, duration)
|
|
125
|
+
result.check_timestamps() # DataFrame: gap/duplicate/non_monotonic issues
|
|
126
|
+
result.export_report("report.html") # Full HTML with chart + all tables
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
---
|
|
130
|
+
|
|
131
|
+
## Output Schema
|
|
132
|
+
|
|
133
|
+
`result.df` adds two columns to your DataFrame:
|
|
134
|
+
|
|
135
|
+
| Column | Values | Notes |
|
|
136
|
+
|--------|--------|-------|
|
|
137
|
+
| `quality` | `"good"`, `"sus"`, `"bad"` | Worst-level rule wins |
|
|
138
|
+
| `quality_reasons` | e.g. `"flatline\|range"` | Pipe-delimited triggered rule names |
|
|
139
|
+
|
|
140
|
+
---
|
|
141
|
+
|
|
142
|
+
## Comparison with Alternatives
|
|
143
|
+
|
|
144
|
+
**Pecos** (Sandia Labs) offers binary pass/fail and has been in maintenance mode since 2021 — no timeline chart and no YAML config. **SaQC** (Helmholtz UFZ) is a rich flagging engine for environmental science but has an environmental-domain API, no timeline visualization, and an LGPL license. **Great Expectations** is not timeseries-native and produces no visualization. `timeseries-qc` is the only library that combines (1) Good/Sus/Bad classification, (2) the multi-tag horizontal status timeline, and (3) YAML-driven configuration in a single `pip install`.
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
## Examples
|
|
149
|
+
|
|
150
|
+
- [examples/solar_farm.ipynb](examples/solar_farm.ipynb) — solar farm SCADA data with anomaly injection
|
|
151
|
+
- [examples/oilfield.ipynb](examples/oilfield.ipynb) — oil well pad SCADA data with anomaly injection
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## Known Limitations (v0.1.0)
|
|
156
|
+
|
|
157
|
+
1. **Pandas only.** PySpark and Polars support are deferred.
|
|
158
|
+
2. **No YAML override of default rules.** Tag-specific rules are applied in addition to the default rules; they do not replace them.
|
|
159
|
+
3. **Visualization requires Plotly ≥ 5.0.** Matplotlib output not supported.
|
|
160
|
+
4. **`DeltaRule` is point-to-point diff only.** Rolling-window delta is a v0.2 feature.
|
|
161
|
+
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
## License
|
|
165
|
+
|
|
166
|
+
MIT © timeseries-qc contributors
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "timeseries-qc"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Classify timeseries rows as Good / Sus / Bad and render a multi-tag quality timeline chart."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
keywords = ["timeseries", "data quality", "QC", "SCADA", "IoT", "pandas"]
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "timeseries-qc contributors" },
|
|
15
|
+
]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.9",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
24
|
+
"Intended Audience :: Science/Research",
|
|
25
|
+
"Operating System :: OS Independent",
|
|
26
|
+
]
|
|
27
|
+
dependencies = [
|
|
28
|
+
"pandas>=1.5",
|
|
29
|
+
"plotly>=5.0",
|
|
30
|
+
"pyyaml>=6.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
Homepage = "https://nagusubra.github.io/timeseries-qc/"
|
|
35
|
+
Repository = "https://github.com/nagusubra/timeseries-qc"
|
|
36
|
+
Issues = "https://github.com/nagusubra/timeseries-qc/issues"
|
|
37
|
+
|
|
38
|
+
[project.optional-dependencies]
|
|
39
|
+
dev = [
|
|
40
|
+
"pytest>=7.0",
|
|
41
|
+
"pytest-cov>=4.0",
|
|
42
|
+
"ruff>=0.4",
|
|
43
|
+
"mypy>=1.0",
|
|
44
|
+
"build>=1.0",
|
|
45
|
+
"twine>=5.0",
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
[tool.setuptools.packages.find]
|
|
49
|
+
where = ["."]
|
|
50
|
+
include = ["tsqc*"]
|
|
51
|
+
exclude = ["synthetic_data_generation*", "examples*", "tests*"]
|
|
52
|
+
|
|
53
|
+
[tool.ruff]
|
|
54
|
+
target-version = "py39"
|
|
55
|
+
line-length = 100
|
|
56
|
+
|
|
57
|
+
[tool.ruff.lint]
|
|
58
|
+
select = ["E", "F", "W", "I"]
|
|
59
|
+
ignore = ["E501"]
|
|
60
|
+
|
|
61
|
+
[tool.mypy]
|
|
62
|
+
python_version = "3.10"
|
|
63
|
+
ignore_missing_imports = true
|
|
64
|
+
strict = false
|
|
65
|
+
|
|
66
|
+
[tool.pytest.ini_options]
|
|
67
|
+
testpaths = ["tests"]
|
|
68
|
+
addopts = "--tb=short"
|
|
69
|
+
|
|
70
|
+
[tool.coverage.run]
|
|
71
|
+
source = ["tsqc"]
|
|
72
|
+
omit = ["tests/*"]
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""End-to-end tests for tsqc.check() (tsqc/checker.py)."""
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import pandas as pd
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
7
|
+
import tsqc
|
|
8
|
+
from tsqc.rules.builtins import CustomRule, NullRule, RangeRule
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _utc_df(n=50, tag="SENSOR_A", with_nan=True, with_spike=True):
    """Build a deterministic single-tag frame of 1-minute UTC samples.

    The ``value`` column is ``10.0`` plus Gaussian noise (sigma 0.5) drawn
    from a generator seeded with 0, so repeated calls yield identical data.
    When *with_nan* is true, position 5 is overwritten with NaN; when
    *with_spike* is true, position 20 is overwritten with 999.0. *n* must be
    large enough to contain every enabled position, otherwise the assignment
    raises ``IndexError``.

    Returns a DataFrame with columns ``timestamp``, ``tag_name``, ``value``.
    """
    stamps = pd.date_range("2026-01-01", periods=n, freq="1min", tz="UTC")
    values = 10.0 + np.random.default_rng(0).normal(0, 0.5, n)

    # Fixed positions keep the injected anomalies easy to locate in assertions.
    injections = (
        (with_nan, 5, float("nan")),
        (with_spike, 20, 999.0),
    )
    for enabled, position, replacement in injections:
        if enabled:
            values[position] = replacement

    return pd.DataFrame({"timestamp": stamps, "tag_name": tag, "value": values})
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ───────────────────────────── Basic ─────────────────────────────────────
|
|
24
|
+
|
|
25
|
+
class TestCheckBasic:
    """Smoke tests for the shape of what ``tsqc.check`` returns."""

    def test_returns_qcresult(self, single_tag_df):
        """``check`` hands back a ``tsqc.QCResult`` instance."""
        assert isinstance(tsqc.check(single_tag_df), tsqc.QCResult)

    def test_output_has_quality_columns(self, single_tag_df):
        """The result frame exposes both quality columns."""
        columns = tsqc.check(single_tag_df).df.columns
        assert "quality" in columns
        assert "quality_reasons" in columns

    def test_quality_values_valid(self, single_tag_df):
        """Only the three known quality labels ever appear."""
        allowed = {"good", "sus", "bad"}
        observed = set(tsqc.check(single_tag_df).df["quality"].unique())
        assert observed.issubset(allowed)

    def test_does_not_modify_original(self, single_tag_df):
        """The caller's frame keeps its original column list after ``check``."""
        columns_before = list(single_tag_df.columns)
        tsqc.check(single_tag_df)
        columns_after = list(single_tag_df.columns)
        assert columns_after == columns_before

    def test_row_count_preserved(self, single_tag_df):
        """``check`` neither drops nor adds rows."""
        checked = tsqc.check(single_tag_df)
        assert len(checked.df) == len(single_tag_df)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# ───────────────────────────── Multi-tag ─────────────────────────────────
|
|
50
|
+
|
|
51
|
+
class TestCheckMultiTag:
    """``tsqc.check`` on a frame that carries several tags."""

    def test_multi_tag_all_tags_present(self, multi_tag_df):
        """Every tag from the input fixture survives into the result frame."""
        tags_seen = set(tsqc.check(multi_tag_df).df["tag_name"].unique())
        assert tags_seen == {"TAG_A", "TAG_B", "TAG_C"}

    def test_multi_tag_quality_column_populated(self, multi_tag_df):
        """No row is left without a quality label."""
        quality = tsqc.check(multi_tag_df).df["quality"]
        assert quality.notna().all()
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# ───────────────────────────── quality_reasons ───────────────────────────
|
|
62
|
+
|
|
63
|
+
class TestQualityReasons:
    """Checks on the ``quality_reasons`` column produced by ``tsqc.check``."""

    def test_null_reason_set(self):
        """A NaN value flagged by ``NullRule`` carries a reason containing ``null``."""
        df = _utc_df(with_nan=True, with_spike=False)
        result = tsqc.check(df, rules=[NullRule()])
        nan_row = result.df[result.df["value"].isna()]
        assert "null" in nan_row["quality_reasons"].iloc[0]

    def test_pipe_delimited_multiple_reasons(self):
        """Two rules firing on the same row yield two pipe-separated reasons."""
        df = _utc_df(with_nan=True, with_spike=False)
        rules = [
            NullRule(level="bad"),
            CustomRule(fn=lambda s: s.isna(), name="also_null", level="sus"),
        ]
        result = tsqc.check(df, rules=rules)
        nan_row = result.df[result.df["value"].isna()]
        reasons = nan_row["quality_reasons"].iloc[0]
        assert "|" in reasons

        # Compare individual tokens rather than the joined string: "also_null"
        # itself contains "null", so a substring check on the whole value would
        # pass even if NullRule's reason were missing.
        tokens = [token.strip() for token in reasons.split("|")]
        assert "also_null" in tokens
        assert any("null" in token for token in tokens if token != "also_null")

    def test_good_rows_have_empty_reasons(self):
        """Rows that trigger no rule have an empty reasons string."""
        df = _utc_df(with_nan=False, with_spike=False, n=10)
        result = tsqc.check(df, rules=[NullRule()])
        assert (result.df["quality_reasons"] == "").all()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# ───────────────────────────── Custom rules ──────────────────────────────
|
|
90
|
+
|
|
91
|
+
class TestCheckCustomRules:
    """Passing an explicit rule list to ``tsqc.check``."""

    def test_custom_rule_list_applied(self):
        """A ``RangeRule`` of 0..100 marks at least one row of spiked data bad."""
        spiked = _utc_df(with_nan=False, with_spike=True)
        range_only = [RangeRule(min_val=0.0, max_val=100.0, level="bad")]
        checked = tsqc.check(spiked, rules=range_only)
        is_bad = checked.df["quality"] == "bad"
        # Spike at value=999 should be flagged
        assert len(checked.df[is_bad]) >= 1
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# ───────────────────────────── Error handling ────────────────────────────
|
|
102
|
+
|
|
103
|
+
class TestCheckErrors:
    """Input validation and timezone handling of ``tsqc.check``."""

    def test_missing_value_column_raises(self):
        """A frame with only timestamps is rejected with a message naming ``value``."""
        stamps = pd.date_range("2026-01-01", periods=3, freq="1min", tz="UTC")
        frame = pd.DataFrame({"timestamp": stamps})
        with pytest.raises(ValueError, match="value"):
            tsqc.check(frame)

    def test_missing_timestamp_column_raises(self):
        """A frame with only values is rejected with a message naming ``timestamp``."""
        frame = pd.DataFrame({"value": [1.0, 2.0]})
        with pytest.raises(ValueError, match="timestamp"):
            tsqc.check(frame)

    def test_tz_naive_without_assume_tz_raises(self):
        """tz-naive timestamps without ``assume_tz`` raise, pointing at ``assume_tz``."""
        naive_stamps = pd.date_range("2026-01-01", periods=5, freq="1min")  # tz-naive
        frame = pd.DataFrame(
            {"timestamp": naive_stamps, "tag_name": "T", "value": [1.0] * 5}
        )
        with pytest.raises(ValueError, match="assume_tz"):
            tsqc.check(frame)

    def test_tz_naive_with_assume_tz_utc_succeeds(self):
        """With ``assume_tz="UTC"`` the result timestamps are tz-aware UTC."""
        naive_stamps = pd.date_range("2026-01-01", periods=5, freq="1min")  # tz-naive
        frame = pd.DataFrame(
            {"timestamp": naive_stamps, "tag_name": "T", "value": [1.0] * 5}
        )
        result_tz = tsqc.check(frame, assume_tz="UTC").df["timestamp"].dt.tz
        assert result_tz is not None
        assert str(result_tz) == "UTC"

    def test_invalid_assume_tz_raises(self):
        """An unknown timezone name is rejected with a message mentioning IANA."""
        naive_stamps = pd.date_range("2026-01-01", periods=3, freq="1min")
        frame = pd.DataFrame(
            {"timestamp": naive_stamps, "tag_name": "T", "value": [1.0] * 3}
        )
        with pytest.raises(ValueError, match="IANA"):
            tsqc.check(frame, assume_tz="NotATimezone/Bogus")

    def test_tz_aware_non_utc_converted_to_utc(self):
        """tz-aware input in another zone comes back with UTC timestamps."""
        chicago_stamps = pd.date_range(
            "2026-01-01", periods=5, freq="1min", tz="America/Chicago"
        )
        frame = pd.DataFrame(
            {"timestamp": chicago_stamps, "tag_name": "T", "value": [1.0] * 5}
        )
        checked = tsqc.check(frame)
        assert str(checked.df["timestamp"].dt.tz) == "UTC"
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# ───────────────────────────── No tag column ─────────────────────────────
|
|
145
|
+
|
|
146
|
+
class TestCheckNoTagCol:
    """``tsqc.check`` with ``tag_col=None`` on a frame that has no tag column."""

    @staticmethod
    def _untagged_frame():
        """Return ten 1-minute UTC samples with values 0..9 and no tag column."""
        stamps = pd.date_range("2026-01-01", periods=10, freq="1min", tz="UTC")
        return pd.DataFrame({"timestamp": stamps, "value": list(range(10))})

    def test_single_tag_no_tag_col(self):
        """The quality column is still produced when no tag column exists."""
        checked = tsqc.check(self._untagged_frame(), tag_col=None)
        assert "quality" in checked.df.columns

    def test_summary_with_no_tag_col(self):
        """``summary()`` reports a single row that still has a ``tag_name`` column."""
        checked = tsqc.check(self._untagged_frame(), tag_col=None)
        overview = checked.summary()
        assert "tag_name" in overview.columns
        assert len(overview) == 1
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# ───────────────────────────── summary() ─────────────────────────────────
|
|
163
|
+
|
|
164
|
+
class TestSummary:
    """Shape and ordering of the frame returned by ``QCResult.summary()``."""

    def test_summary_columns(self, single_tag_df):
        """The summary carries the per-tag count and percentage columns."""
        required = {
            "tag_name",
            "total_rows",
            "pct_good",
            "pct_sus",
            "pct_bad",
            "n_good",
            "n_sus",
            "n_bad",
        }
        overview = tsqc.check(single_tag_df).summary()
        assert required.issubset(set(overview.columns))

    def test_summary_pcts_sum_to_100(self, single_tag_df):
        """For each tag, the three percentages add up to 100 within 0.5."""
        overview = tsqc.check(single_tag_df).summary()
        for _, record in overview.iterrows():
            combined = record["pct_good"] + record["pct_sus"] + record["pct_bad"]
            assert abs(combined - 100.0) < 0.5, f"Percentages don't sum to 100 for {record['tag_name']}"

    def test_summary_sorted_by_pct_bad(self, multi_tag_df):
        """Rows are ordered from the highest ``pct_bad`` to the lowest."""
        overview = tsqc.check(multi_tag_df).summary()
        observed = list(overview["pct_bad"])
        descending = sorted(observed, reverse=True)
        assert observed == descending
|