taskclf 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- taskclf-0.1.0/PKG-INFO +281 -0
- taskclf-0.1.0/README.md +248 -0
- taskclf-0.1.0/pyproject.toml +66 -0
- taskclf-0.1.0/src/taskclf/README.md +20 -0
- taskclf-0.1.0/src/taskclf/__init__.py +2 -0
- taskclf-0.1.0/src/taskclf/adapters/README.md +41 -0
- taskclf-0.1.0/src/taskclf/adapters/__init__.py +0 -0
- taskclf-0.1.0/src/taskclf/adapters/activitywatch/__init__.py +0 -0
- taskclf-0.1.0/src/taskclf/adapters/activitywatch/client.py +296 -0
- taskclf-0.1.0/src/taskclf/adapters/activitywatch/mapping.py +178 -0
- taskclf-0.1.0/src/taskclf/adapters/activitywatch/types.py +48 -0
- taskclf-0.1.0/src/taskclf/adapters/input/__init__.py +0 -0
- taskclf-0.1.0/src/taskclf/adapters/input/linux.py +0 -0
- taskclf-0.1.0/src/taskclf/adapters/input/macos.py +1 -0
- taskclf-0.1.0/src/taskclf/adapters/input/windows.py +0 -0
- taskclf-0.1.0/src/taskclf/cli/README.md +54 -0
- taskclf-0.1.0/src/taskclf/cli/__init__.py +0 -0
- taskclf-0.1.0/src/taskclf/cli/main.py +2169 -0
- taskclf-0.1.0/src/taskclf/core/README.md +22 -0
- taskclf-0.1.0/src/taskclf/core/__init__.py +0 -0
- taskclf-0.1.0/src/taskclf/core/defaults.py +84 -0
- taskclf-0.1.0/src/taskclf/core/drift.py +331 -0
- taskclf-0.1.0/src/taskclf/core/hashing.py +42 -0
- taskclf-0.1.0/src/taskclf/core/logging.py +88 -0
- taskclf-0.1.0/src/taskclf/core/metrics.py +395 -0
- taskclf-0.1.0/src/taskclf/core/model_io.py +220 -0
- taskclf-0.1.0/src/taskclf/core/schema.py +163 -0
- taskclf-0.1.0/src/taskclf/core/store.py +51 -0
- taskclf-0.1.0/src/taskclf/core/telemetry.py +237 -0
- taskclf-0.1.0/src/taskclf/core/time.py +67 -0
- taskclf-0.1.0/src/taskclf/core/types.py +244 -0
- taskclf-0.1.0/src/taskclf/core/validation.py +314 -0
- taskclf-0.1.0/src/taskclf/features/README.md +36 -0
- taskclf-0.1.0/src/taskclf/features/__init__.py +0 -0
- taskclf-0.1.0/src/taskclf/features/build.py +439 -0
- taskclf-0.1.0/src/taskclf/features/domain.py +138 -0
- taskclf-0.1.0/src/taskclf/features/dynamics.py +140 -0
- taskclf-0.1.0/src/taskclf/features/sessions.py +69 -0
- taskclf-0.1.0/src/taskclf/features/text.py +56 -0
- taskclf-0.1.0/src/taskclf/features/windows.py +75 -0
- taskclf-0.1.0/src/taskclf/infer/README.md +38 -0
- taskclf-0.1.0/src/taskclf/infer/__init__.py +0 -0
- taskclf-0.1.0/src/taskclf/infer/baseline.py +199 -0
- taskclf-0.1.0/src/taskclf/infer/batch.py +289 -0
- taskclf-0.1.0/src/taskclf/infer/calibration.py +326 -0
- taskclf-0.1.0/src/taskclf/infer/monitor.py +328 -0
- taskclf-0.1.0/src/taskclf/infer/online.py +515 -0
- taskclf-0.1.0/src/taskclf/infer/prediction.py +44 -0
- taskclf-0.1.0/src/taskclf/infer/resolve.py +161 -0
- taskclf-0.1.0/src/taskclf/infer/smooth.py +253 -0
- taskclf-0.1.0/src/taskclf/infer/taxonomy.py +341 -0
- taskclf-0.1.0/src/taskclf/labels/README.md +28 -0
- taskclf-0.1.0/src/taskclf/labels/__init__.py +0 -0
- taskclf-0.1.0/src/taskclf/labels/projection.py +99 -0
- taskclf-0.1.0/src/taskclf/labels/queue.py +247 -0
- taskclf-0.1.0/src/taskclf/labels/store.py +254 -0
- taskclf-0.1.0/src/taskclf/labels/weak_rules.py +1 -0
- taskclf-0.1.0/src/taskclf/model_registry.py +760 -0
- taskclf-0.1.0/src/taskclf/report/README.md +18 -0
- taskclf-0.1.0/src/taskclf/report/__init__.py +0 -0
- taskclf-0.1.0/src/taskclf/report/daily.py +144 -0
- taskclf-0.1.0/src/taskclf/report/export.py +125 -0
- taskclf-0.1.0/src/taskclf/train/README.md +31 -0
- taskclf-0.1.0/src/taskclf/train/__init__.py +0 -0
- taskclf-0.1.0/src/taskclf/train/build_dataset.py +172 -0
- taskclf-0.1.0/src/taskclf/train/calibrate.py +267 -0
- taskclf-0.1.0/src/taskclf/train/dataset.py +98 -0
- taskclf-0.1.0/src/taskclf/train/evaluate.py +394 -0
- taskclf-0.1.0/src/taskclf/train/lgbm.py +231 -0
- taskclf-0.1.0/src/taskclf/train/retrain.py +650 -0
- taskclf-0.1.0/src/taskclf/ui/__init__.py +0 -0
- taskclf-0.1.0/src/taskclf/ui/labeling.py +310 -0
- taskclf-0.1.0/src/taskclf/ui/tray.py +503 -0
taskclf-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: taskclf
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Local-first task classifier that infers your work type from computer activity signals
|
|
5
|
+
Author: fruitiecutiepie
|
|
6
|
+
Author-email: fruitiecutiepie <104437268+fruitiecutiepie@users.noreply.github.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
11
|
+
Classifier: Topic :: Office/Business
|
|
12
|
+
Classifier: Topic :: System :: Monitoring
|
|
13
|
+
Requires-Dist: duckdb>=1.4.4
|
|
14
|
+
Requires-Dist: lightgbm>=4.6.0
|
|
15
|
+
Requires-Dist: matplotlib>=3.10.8
|
|
16
|
+
Requires-Dist: pandas>=3.0.0
|
|
17
|
+
Requires-Dist: pyarrow>=23.0.0
|
|
18
|
+
Requires-Dist: pydantic>=2.12.5
|
|
19
|
+
Requires-Dist: rich>=14.3.2
|
|
20
|
+
Requires-Dist: scipy>=1.15.0
|
|
21
|
+
Requires-Dist: scikit-learn>=1.8.0
|
|
22
|
+
Requires-Dist: streamlit>=1.54.0
|
|
23
|
+
Requires-Dist: typer>=0.23.1
|
|
24
|
+
Requires-Dist: pyyaml>=6.0
|
|
25
|
+
Requires-Dist: pystray>=0.19.5
|
|
26
|
+
Requires-Dist: pillow>=11.0.0
|
|
27
|
+
Requires-Dist: plyer>=2.1.0
|
|
28
|
+
Requires-Python: >=3.14
|
|
29
|
+
Project-URL: Homepage, https://github.com/fruitiecutiepie/taskclf
|
|
30
|
+
Project-URL: Documentation, https://fruitiecutiepie.github.io/taskclf
|
|
31
|
+
Project-URL: Repository, https://github.com/fruitiecutiepie/taskclf
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# taskclf — Task Type Classifier from Local Activity Signals
|
|
35
|
+
|
|
36
|
+
Train and run a personal task-type classifier (e.g. coding / writing / meetings) using privacy-preserving computer activity signals such as foreground app/window metadata and aggregated input statistics (counts/rates only).
|
|
37
|
+
|
|
38
|
+
This project is intentionally scoped as a **personalized classifier** (single-user first). The architecture keeps:
|
|
39
|
+
- **Collectors** (platform/tool dependent) isolated behind adapters
|
|
40
|
+
- **Features** as a versioned, validated contract
|
|
41
|
+
- **Models** as bundled artifacts with schema checks
|
|
42
|
+
- **Inference** as a small, stable loop that emits task segments and daily summaries
|
|
43
|
+
|
|
44
|
+
## Goals
|
|
45
|
+
- Fast iteration: first useful model from less than one week of data
|
|
46
|
+
- Privacy: no raw keystrokes, no raw window titles persisted
|
|
47
|
+
- Stability: feature schema versioning + schema hash gates
|
|
48
|
+
- Extensibility: add new collectors and models without breaking consumers
|
|
49
|
+
|
|
50
|
+
## Non-Goals
|
|
51
|
+
- Universal (multi-user) generalization out of the box
|
|
52
|
+
- Storing or analyzing raw typed content
|
|
53
|
+
- "Perfect" labeling UI (start minimal, iterate later)
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Labels (v1)
|
|
58
|
+
|
|
59
|
+
Eight core labels defined in `schema/labels_v1.json`:
|
|
60
|
+
|
|
61
|
+
| ID | Label | Description |
|
|
62
|
+
|----|-------|-------------|
|
|
63
|
+
| 0 | `Build` | Writing or implementing structured content in editor/terminal |
|
|
64
|
+
| 1 | `Debug` | Investigating issues, terminal-heavy troubleshooting |
|
|
65
|
+
| 2 | `Review` | Reviewing technical material or diffs with light edits |
|
|
66
|
+
| 3 | `Write` | Writing structured non-code content |
|
|
67
|
+
| 4 | `ReadResearch` | Consuming information with minimal production |
|
|
68
|
+
| 5 | `Communicate` | Asynchronous coordination (chat/email) |
|
|
69
|
+
| 6 | `Meet` | Synchronous meetings or calls |
|
|
70
|
+
| 7 | `BreakIdle` | Idle or break period |
|
|
71
|
+
|
|
72
|
+
Labels are stored as **time spans** (not per-keystroke events). Users can remap
|
|
73
|
+
core labels to personal categories via a **taxonomy config**
|
|
74
|
+
(see `configs/user_taxonomy_example.yaml`).
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## Data Flow Overview
|
|
79
|
+
|
|
80
|
+
### Structures (pipelines)
|
|
81
|
+
* ETL pipeline reads raw → produces features parquet
|
|
82
|
+
* Training pipeline reads features + labels → produces model
|
|
83
|
+
* Inference pipeline reads new events → emits predictions + segments
|
|
84
|
+
|
|
85
|
+
### Batch (repeatable)
|
|
86
|
+
1. **Ingest**: pull ActivityWatch export → `data/raw/aw/`
|
|
87
|
+
2. **Feature build**: events → per-minute features → `data/processed/features_v1/`
|
|
88
|
+
3. **Label import**: label spans → `data/processed/labels_v1/`
|
|
89
|
+
4. **Build dataset**: join features + labels, split by time → training arrays
|
|
90
|
+
5. **Train**: fit model → `models/<run_id>/`
|
|
91
|
+
6. **Evaluate**: metrics, acceptance checks, calibration
|
|
92
|
+
7. **Report**: daily summaries → `artifacts/`
|
|
93
|
+
|
|
94
|
+
### Online (real-time)
|
|
95
|
+
Every N seconds:
|
|
96
|
+
- read the last minute(s) of events
|
|
97
|
+
- compute the latest feature bucket
|
|
98
|
+
- predict + smooth (with optional calibration and taxonomy mapping)
|
|
99
|
+
- append predictions → `artifacts/`
|
|
100
|
+
|
|
101
|
+
At end-of-day:
|
|
102
|
+
- produce report
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## Privacy & Safety
|
|
107
|
+
This repo enforces the following:
|
|
108
|
+
- **No raw keystrokes** are stored (only aggregate counts/rates).
|
|
109
|
+
- **No raw window titles** are stored by default.
|
|
110
|
+
- Titles are hashed or locally tokenized; you can keep a local mapping if you choose.
|
|
111
|
+
- Dataset artifacts stay **local-first**.
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## Quick Start
|
|
116
|
+
|
|
117
|
+
### Requirements
|
|
118
|
+
- Python >= 3.14
|
|
119
|
+
- `uv` installed
|
|
120
|
+
|
|
121
|
+
### Setup
|
|
122
|
+
```bash
|
|
123
|
+
uv sync
|
|
124
|
+
uv run taskclf --help
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
### Ingest (ActivityWatch)
|
|
128
|
+
|
|
129
|
+
```bash
|
|
130
|
+
uv run taskclf ingest aw --input /path/to/activitywatch-export.json
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
This parses an ActivityWatch JSON export, normalizes app names to reverse-domain
|
|
134
|
+
identifiers, hashes window titles (never storing raw text), and writes
|
|
135
|
+
privacy-safe events to `data/raw/aw/<YYYY-MM-DD>/events.parquet` partitioned by
|
|
136
|
+
date.
|
|
137
|
+
|
|
138
|
+
Options:
|
|
139
|
+
- `--out-dir` — output directory (default: `data/raw/aw`)
|
|
140
|
+
- `--title-salt` — salt for hashing window titles (default: `taskclf-default-salt`)
|
|
141
|
+
|
|
142
|
+
### Build features
|
|
143
|
+
|
|
144
|
+
```bash
|
|
145
|
+
uv run taskclf features build --date 2026-02-16
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### Import labels
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
uv run taskclf labels import --file labels.csv
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
Or add individual label blocks:
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
uv run taskclf labels add-block \
|
|
158
|
+
--start 2026-02-16T09:00:00 --end 2026-02-16T10:00:00 --label Build
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Or label what you're doing right now (no timestamps needed):
|
|
162
|
+
|
|
163
|
+
```bash
|
|
164
|
+
uv run taskclf labels label-now --minutes 10 --label Build
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
This queries ActivityWatch for a live summary of apps used in the last N minutes
|
|
168
|
+
and creates the label span automatically.
|
|
169
|
+
|
|
170
|
+
### Train
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
uv run taskclf train lgbm --from 2026-02-01 --to 2026-02-16
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
### Run batch inference
|
|
177
|
+
|
|
178
|
+
```bash
|
|
179
|
+
uv run taskclf infer batch --model-dir models/<run_id> --from 2026-02-01 --to 2026-02-16
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
### Run online inference
|
|
183
|
+
|
|
184
|
+
```bash
|
|
185
|
+
uv run taskclf infer online --model-dir models/<run_id>
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
Starts a polling loop that queries a running ActivityWatch server, builds
|
|
189
|
+
feature rows from live window events, predicts task types using a trained model,
|
|
190
|
+
smooths predictions, and writes running outputs to `artifacts/`. Press Ctrl+C
|
|
191
|
+
to stop; a final daily report is generated on shutdown.
|
|
192
|
+
|
|
193
|
+
Options:
|
|
194
|
+
- `--poll-seconds` — seconds between polls (default: 60)
|
|
195
|
+
- `--aw-host` — ActivityWatch server URL (default: `http://localhost:5600`)
|
|
196
|
+
- `--smooth-window` — rolling majority window size (default: 3)
|
|
197
|
+
- `--title-salt` — salt for hashing window titles (default: `taskclf-default-salt`)
|
|
198
|
+
- `--out-dir` — output directory (default: `artifacts`)
|
|
199
|
+
- `--label-queue` / `--no-label-queue` — auto-enqueue low-confidence predictions for manual labeling
|
|
200
|
+
- `--label-confidence` — confidence threshold for auto-enqueue (default: 0.55)
|
|
201
|
+
|
|
202
|
+
### Run baseline (no model needed)
|
|
203
|
+
|
|
204
|
+
```bash
|
|
205
|
+
uv run taskclf infer baseline --from 2026-02-01 --to 2026-02-16
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
Rule-based classifier useful for day-1 bootstrapping before you have a trained model.
|
|
209
|
+
|
|
210
|
+
### Produce report
|
|
211
|
+
|
|
212
|
+
```bash
|
|
213
|
+
uv run taskclf report daily --segments-file artifacts/segments.json
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
---
|
|
217
|
+
|
|
218
|
+
## CLI Reference
|
|
219
|
+
|
|
220
|
+
All commands: `uv run taskclf --help`
|
|
221
|
+
|
|
222
|
+
| Group | Commands | Purpose |
|
|
223
|
+
|-------|----------|---------|
|
|
224
|
+
| `ingest` | `aw` | Import ActivityWatch exports |
|
|
225
|
+
| `features` | `build` | Build per-minute feature rows |
|
|
226
|
+
| `labels` | `import`, `add-block`, `label-now`, `show-queue`, `project` | Manage label spans and labeling queue |
|
|
227
|
+
| `train` | `build-dataset`, `lgbm`, `evaluate`, `tune-reject`, `calibrate`, `retrain`, `check-retrain` | Training, evaluation, and retraining pipeline |
|
|
228
|
+
| `taxonomy` | `validate`, `show`, `init` | User-defined label groupings |
|
|
229
|
+
| `infer` | `batch`, `online`, `baseline`, `compare` | Prediction (ML, rule-based, comparison) |
|
|
230
|
+
| `report` | `daily` | Daily summaries (JSON/CSV/Parquet) |
|
|
231
|
+
| `monitor` | `drift-check`, `telemetry`, `show` | Feature drift and telemetry tracking |
|
|
232
|
+
| *(top-level)* | `tray` | System tray labeling app with activity transition detection |
|
|
233
|
+
|
|
234
|
+
Full CLI docs: `docs/api/cli/main.md`
|
|
235
|
+
|
|
236
|
+
---
|
|
237
|
+
|
|
238
|
+
## Repo Layout
|
|
239
|
+
|
|
240
|
+
* `src/taskclf/` — application code (adapters, cli, core, features, labels, train, infer, report, ui)
|
|
241
|
+
* `schema/` — versioned JSON schemas for features and labels
|
|
242
|
+
* `configs/` — configuration files (model params, retrain policy, taxonomy examples)
|
|
243
|
+
* `docs/` — API reference and guides (served via `make docs-serve`)
|
|
244
|
+
* `data/` — raw and processed datasets (local, gitignored)
|
|
245
|
+
* `models/` — trained model bundles (one folder per run)
|
|
246
|
+
* `artifacts/` — predictions, segments, reports, evaluation outputs
|
|
247
|
+
* `tests/` — test suite
|
|
248
|
+
|
|
249
|
+
---
|
|
250
|
+
|
|
251
|
+
## Model Artifact Contract
|
|
252
|
+
|
|
253
|
+
Every saved model bundle (`models/<run_id>/`) contains:
|
|
254
|
+
|
|
255
|
+
* the model file
|
|
256
|
+
* `metadata.json`: feature schema version + hash, label set, training date range, params, dataset hash
|
|
257
|
+
* `metrics.json`: macro/weighted F1, per-class metrics
|
|
258
|
+
* `confusion_matrix.csv`
|
|
259
|
+
* categorical encoders (if applicable)
|
|
260
|
+
|
|
261
|
+
Inference refuses to run if the schema hash mismatches the model bundle.
|
|
262
|
+
|
|
263
|
+
---
|
|
264
|
+
|
|
265
|
+
## Development
|
|
266
|
+
|
|
267
|
+
Common tasks are in the `Makefile`:
|
|
268
|
+
|
|
269
|
+
```bash
|
|
270
|
+
make lint # ruff check .
|
|
271
|
+
make test # pytest
|
|
272
|
+
make typecheck # mypy src
|
|
273
|
+
make docs-serve # local preview at http://127.0.0.1:8000
|
|
274
|
+
make docs-build # static site in site/
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
---
|
|
278
|
+
|
|
279
|
+
## License
|
|
280
|
+
|
|
281
|
+
MIT (local-first personal project by default).
|
taskclf-0.1.0/README.md
ADDED
|
@@ -0,0 +1,248 @@
|
|
|
1
|
+
# taskclf — Task Type Classifier from Local Activity Signals
|
|
2
|
+
|
|
3
|
+
Train and run a personal task-type classifier (e.g. coding / writing / meetings) using privacy-preserving computer activity signals such as foreground app/window metadata and aggregated input statistics (counts/rates only).
|
|
4
|
+
|
|
5
|
+
This project is intentionally scoped as a **personalized classifier** (single-user first). The architecture keeps:
|
|
6
|
+
- **Collectors** (platform/tool dependent) isolated behind adapters
|
|
7
|
+
- **Features** as a versioned, validated contract
|
|
8
|
+
- **Models** as bundled artifacts with schema checks
|
|
9
|
+
- **Inference** as a small, stable loop that emits task segments and daily summaries
|
|
10
|
+
|
|
11
|
+
## Goals
|
|
12
|
+
- Fast iteration: first useful model from less than one week of data
|
|
13
|
+
- Privacy: no raw keystrokes, no raw window titles persisted
|
|
14
|
+
- Stability: feature schema versioning + schema hash gates
|
|
15
|
+
- Extensibility: add new collectors and models without breaking consumers
|
|
16
|
+
|
|
17
|
+
## Non-Goals
|
|
18
|
+
- Universal (multi-user) generalization out of the box
|
|
19
|
+
- Storing or analyzing raw typed content
|
|
20
|
+
- "Perfect" labeling UI (start minimal, iterate later)
|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Labels (v1)
|
|
25
|
+
|
|
26
|
+
Eight core labels defined in `schema/labels_v1.json`:
|
|
27
|
+
|
|
28
|
+
| ID | Label | Description |
|
|
29
|
+
|----|-------|-------------|
|
|
30
|
+
| 0 | `Build` | Writing or implementing structured content in editor/terminal |
|
|
31
|
+
| 1 | `Debug` | Investigating issues, terminal-heavy troubleshooting |
|
|
32
|
+
| 2 | `Review` | Reviewing technical material or diffs with light edits |
|
|
33
|
+
| 3 | `Write` | Writing structured non-code content |
|
|
34
|
+
| 4 | `ReadResearch` | Consuming information with minimal production |
|
|
35
|
+
| 5 | `Communicate` | Asynchronous coordination (chat/email) |
|
|
36
|
+
| 6 | `Meet` | Synchronous meetings or calls |
|
|
37
|
+
| 7 | `BreakIdle` | Idle or break period |
|
|
38
|
+
|
|
39
|
+
Labels are stored as **time spans** (not per-keystroke events). Users can remap
|
|
40
|
+
core labels to personal categories via a **taxonomy config**
|
|
41
|
+
(see `configs/user_taxonomy_example.yaml`).
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Data Flow Overview
|
|
46
|
+
|
|
47
|
+
### Structures (pipelines)
|
|
48
|
+
* ETL pipeline reads raw → produces features parquet
|
|
49
|
+
* Training pipeline reads features + labels → produces model
|
|
50
|
+
* Inference pipeline reads new events → emits predictions + segments
|
|
51
|
+
|
|
52
|
+
### Batch (repeatable)
|
|
53
|
+
1. **Ingest**: pull ActivityWatch export → `data/raw/aw/`
|
|
54
|
+
2. **Feature build**: events → per-minute features → `data/processed/features_v1/`
|
|
55
|
+
3. **Label import**: label spans → `data/processed/labels_v1/`
|
|
56
|
+
4. **Build dataset**: join features + labels, split by time → training arrays
|
|
57
|
+
5. **Train**: fit model → `models/<run_id>/`
|
|
58
|
+
6. **Evaluate**: metrics, acceptance checks, calibration
|
|
59
|
+
7. **Report**: daily summaries → `artifacts/`
|
|
60
|
+
|
|
61
|
+
### Online (real-time)
|
|
62
|
+
Every N seconds:
|
|
63
|
+
- read the last minute(s) of events
|
|
64
|
+
- compute the latest feature bucket
|
|
65
|
+
- predict + smooth (with optional calibration and taxonomy mapping)
|
|
66
|
+
- append predictions → `artifacts/`
|
|
67
|
+
|
|
68
|
+
At end-of-day:
|
|
69
|
+
- produce report
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## Privacy & Safety
|
|
74
|
+
This repo enforces the following:
|
|
75
|
+
- **No raw keystrokes** are stored (only aggregate counts/rates).
|
|
76
|
+
- **No raw window titles** are stored by default.
|
|
77
|
+
- Titles are hashed or locally tokenized; you can keep a local mapping if you choose.
|
|
78
|
+
- Dataset artifacts stay **local-first**.
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## Quick Start
|
|
83
|
+
|
|
84
|
+
### Requirements
|
|
85
|
+
- Python >= 3.14
|
|
86
|
+
- `uv` installed
|
|
87
|
+
|
|
88
|
+
### Setup
|
|
89
|
+
```bash
|
|
90
|
+
uv sync
|
|
91
|
+
uv run taskclf --help
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### Ingest (ActivityWatch)
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
uv run taskclf ingest aw --input /path/to/activitywatch-export.json
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
This parses an ActivityWatch JSON export, normalizes app names to reverse-domain
|
|
101
|
+
identifiers, hashes window titles (never storing raw text), and writes
|
|
102
|
+
privacy-safe events to `data/raw/aw/<YYYY-MM-DD>/events.parquet` partitioned by
|
|
103
|
+
date.
|
|
104
|
+
|
|
105
|
+
Options:
|
|
106
|
+
- `--out-dir` — output directory (default: `data/raw/aw`)
|
|
107
|
+
- `--title-salt` — salt for hashing window titles (default: `taskclf-default-salt`)
|
|
108
|
+
|
|
109
|
+
### Build features
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
uv run taskclf features build --date 2026-02-16
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### Import labels
|
|
116
|
+
|
|
117
|
+
```bash
|
|
118
|
+
uv run taskclf labels import --file labels.csv
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
Or add individual label blocks:
|
|
122
|
+
|
|
123
|
+
```bash
|
|
124
|
+
uv run taskclf labels add-block \
|
|
125
|
+
--start 2026-02-16T09:00:00 --end 2026-02-16T10:00:00 --label Build
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Or label what you're doing right now (no timestamps needed):
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
uv run taskclf labels label-now --minutes 10 --label Build
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
This queries ActivityWatch for a live summary of apps used in the last N minutes
|
|
135
|
+
and creates the label span automatically.
|
|
136
|
+
|
|
137
|
+
### Train
|
|
138
|
+
|
|
139
|
+
```bash
|
|
140
|
+
uv run taskclf train lgbm --from 2026-02-01 --to 2026-02-16
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Run batch inference
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
uv run taskclf infer batch --model-dir models/<run_id> --from 2026-02-01 --to 2026-02-16
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### Run online inference
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
uv run taskclf infer online --model-dir models/<run_id>
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
Starts a polling loop that queries a running ActivityWatch server, builds
|
|
156
|
+
feature rows from live window events, predicts task types using a trained model,
|
|
157
|
+
smooths predictions, and writes running outputs to `artifacts/`. Press Ctrl+C
|
|
158
|
+
to stop; a final daily report is generated on shutdown.
|
|
159
|
+
|
|
160
|
+
Options:
|
|
161
|
+
- `--poll-seconds` — seconds between polls (default: 60)
|
|
162
|
+
- `--aw-host` — ActivityWatch server URL (default: `http://localhost:5600`)
|
|
163
|
+
- `--smooth-window` — rolling majority window size (default: 3)
|
|
164
|
+
- `--title-salt` — salt for hashing window titles (default: `taskclf-default-salt`)
|
|
165
|
+
- `--out-dir` — output directory (default: `artifacts`)
|
|
166
|
+
- `--label-queue` / `--no-label-queue` — auto-enqueue low-confidence predictions for manual labeling
|
|
167
|
+
- `--label-confidence` — confidence threshold for auto-enqueue (default: 0.55)
|
|
168
|
+
|
|
169
|
+
### Run baseline (no model needed)
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
uv run taskclf infer baseline --from 2026-02-01 --to 2026-02-16
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
Rule-based classifier useful for day-1 bootstrapping before you have a trained model.
|
|
176
|
+
|
|
177
|
+
### Produce report
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
uv run taskclf report daily --segments-file artifacts/segments.json
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## CLI Reference
|
|
186
|
+
|
|
187
|
+
All commands: `uv run taskclf --help`
|
|
188
|
+
|
|
189
|
+
| Group | Commands | Purpose |
|
|
190
|
+
|-------|----------|---------|
|
|
191
|
+
| `ingest` | `aw` | Import ActivityWatch exports |
|
|
192
|
+
| `features` | `build` | Build per-minute feature rows |
|
|
193
|
+
| `labels` | `import`, `add-block`, `label-now`, `show-queue`, `project` | Manage label spans and labeling queue |
|
|
194
|
+
| `train` | `build-dataset`, `lgbm`, `evaluate`, `tune-reject`, `calibrate`, `retrain`, `check-retrain` | Training, evaluation, and retraining pipeline |
|
|
195
|
+
| `taxonomy` | `validate`, `show`, `init` | User-defined label groupings |
|
|
196
|
+
| `infer` | `batch`, `online`, `baseline`, `compare` | Prediction (ML, rule-based, comparison) |
|
|
197
|
+
| `report` | `daily` | Daily summaries (JSON/CSV/Parquet) |
|
|
198
|
+
| `monitor` | `drift-check`, `telemetry`, `show` | Feature drift and telemetry tracking |
|
|
199
|
+
| *(top-level)* | `tray` | System tray labeling app with activity transition detection |
|
|
200
|
+
|
|
201
|
+
Full CLI docs: `docs/api/cli/main.md`
|
|
202
|
+
|
|
203
|
+
---
|
|
204
|
+
|
|
205
|
+
## Repo Layout
|
|
206
|
+
|
|
207
|
+
* `src/taskclf/` — application code (adapters, cli, core, features, labels, train, infer, report, ui)
|
|
208
|
+
* `schema/` — versioned JSON schemas for features and labels
|
|
209
|
+
* `configs/` — configuration files (model params, retrain policy, taxonomy examples)
|
|
210
|
+
* `docs/` — API reference and guides (served via `make docs-serve`)
|
|
211
|
+
* `data/` — raw and processed datasets (local, gitignored)
|
|
212
|
+
* `models/` — trained model bundles (one folder per run)
|
|
213
|
+
* `artifacts/` — predictions, segments, reports, evaluation outputs
|
|
214
|
+
* `tests/` — test suite
|
|
215
|
+
|
|
216
|
+
---
|
|
217
|
+
|
|
218
|
+
## Model Artifact Contract
|
|
219
|
+
|
|
220
|
+
Every saved model bundle (`models/<run_id>/`) contains:
|
|
221
|
+
|
|
222
|
+
* the model file
|
|
223
|
+
* `metadata.json`: feature schema version + hash, label set, training date range, params, dataset hash
|
|
224
|
+
* `metrics.json`: macro/weighted F1, per-class metrics
|
|
225
|
+
* `confusion_matrix.csv`
|
|
226
|
+
* categorical encoders (if applicable)
|
|
227
|
+
|
|
228
|
+
Inference refuses to run if the schema hash mismatches the model bundle.
|
|
229
|
+
|
|
230
|
+
---
|
|
231
|
+
|
|
232
|
+
## Development
|
|
233
|
+
|
|
234
|
+
Common tasks are in the `Makefile`:
|
|
235
|
+
|
|
236
|
+
```bash
|
|
237
|
+
make lint # ruff check .
|
|
238
|
+
make test # pytest
|
|
239
|
+
make typecheck # mypy src
|
|
240
|
+
make docs-serve # local preview at http://127.0.0.1:8000
|
|
241
|
+
make docs-build # static site in site/
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
---
|
|
245
|
+
|
|
246
|
+
## License
|
|
247
|
+
|
|
248
|
+
MIT (local-first personal project by default).
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "taskclf"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Local-first task classifier that infers your work type from computer activity signals"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = {text = "MIT"}
|
|
7
|
+
authors = [
|
|
8
|
+
{ name = "fruitiecutiepie", email = "104437268+fruitiecutiepie@users.noreply.github.com" }
|
|
9
|
+
]
|
|
10
|
+
requires-python = ">=3.14"
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Programming Language :: Python :: 3",
|
|
13
|
+
"Operating System :: OS Independent",
|
|
14
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
15
|
+
"Topic :: Office/Business",
|
|
16
|
+
"Topic :: System :: Monitoring",
|
|
17
|
+
]
|
|
18
|
+
dependencies = [
|
|
19
|
+
"duckdb>=1.4.4",
|
|
20
|
+
"lightgbm>=4.6.0",
|
|
21
|
+
"matplotlib>=3.10.8",
|
|
22
|
+
"pandas>=3.0.0",
|
|
23
|
+
"pyarrow>=23.0.0",
|
|
24
|
+
"pydantic>=2.12.5",
|
|
25
|
+
"rich>=14.3.2",
|
|
26
|
+
"scipy>=1.15.0",
|
|
27
|
+
"scikit-learn>=1.8.0",
|
|
28
|
+
"streamlit>=1.54.0",
|
|
29
|
+
"typer>=0.23.1",
|
|
30
|
+
"pyyaml>=6.0",
|
|
31
|
+
"pystray>=0.19.5",
|
|
32
|
+
"Pillow>=11.0.0",
|
|
33
|
+
"plyer>=2.1.0",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Homepage = "https://github.com/fruitiecutiepie/taskclf"
|
|
38
|
+
Documentation = "https://fruitiecutiepie.github.io/taskclf"
|
|
39
|
+
Repository = "https://github.com/fruitiecutiepie/taskclf"
|
|
40
|
+
|
|
41
|
+
[project.scripts]
|
|
42
|
+
taskclf = "taskclf.cli.main:app"
|
|
43
|
+
|
|
44
|
+
[tool.uv]
|
|
45
|
+
override-dependencies = [
|
|
46
|
+
"pandas>=3.0.0",
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
[build-system]
|
|
50
|
+
requires = ["uv_build>=0.9.5,<0.10.0"]
|
|
51
|
+
build-backend = "uv_build"
|
|
52
|
+
|
|
53
|
+
[dependency-groups]
|
|
54
|
+
dev = [
|
|
55
|
+
"black>=26.1.0",
|
|
56
|
+
"mypy>=1.19.1",
|
|
57
|
+
"pandas-stubs>=3.0.0.260204",
|
|
58
|
+
"pre-commit>=4.5.1",
|
|
59
|
+
"pytest>=9.0.2",
|
|
60
|
+
"pytest-xdist>=3.8.0",
|
|
61
|
+
"ruff>=0.15.1",
|
|
62
|
+
]
|
|
63
|
+
docs = [
|
|
64
|
+
"zensical",
|
|
65
|
+
"mkdocstrings-python",
|
|
66
|
+
]
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# src/taskclf/
|
|
2
|
+
|
|
3
|
+
Main package.
|
|
4
|
+
|
|
5
|
+
## Design principles
|
|
6
|
+
- **Adapters** isolate unstable platform/tool integrations.
|
|
7
|
+
- **Core** defines contracts, validation, and data/model IO.
|
|
8
|
+
- **Pipelines** compose pure transforms into repeatable runs.
|
|
9
|
+
- **CLI** is the stable interface for humans and automation.
|
|
10
|
+
|
|
11
|
+
## Subpackages
|
|
12
|
+
- `core/` — schemas, validation, storage primitives, model IO, metrics, drift detection, telemetry
|
|
13
|
+
- `adapters/` — ActivityWatch + input collectors
|
|
14
|
+
- `features/` — feature computation (event -> bucketed features, rolling windows, sessions)
|
|
15
|
+
- `labels/` — label span formats, import/export, projection onto feature windows, active labeling queue, weak label rules
|
|
16
|
+
- `train/` — dataset construction, splits, training, evaluation, calibration, retraining pipeline
|
|
17
|
+
- `infer/` — batch and online inference, rule-based baseline, smoothing, calibration, taxonomy mapping, drift monitoring
|
|
18
|
+
- `report/` — daily summaries and exports (JSON/CSV/Parquet)
|
|
19
|
+
- `cli/` — Typer entrypoint and commands
|
|
20
|
+
- `ui/` — labeling UI (Streamlit) and system tray labeling app
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# adapters/
|
|
2
|
+
|
|
3
|
+
Integrations with external tools or OS APIs.
|
|
4
|
+
|
|
5
|
+
## Subpackages
|
|
6
|
+
|
|
7
|
+
### `activitywatch/`
|
|
8
|
+
- `types.py` -- `AWEvent` (window events) and `AWInputEvent` (keyboard/mouse
|
|
9
|
+
aggregate counts from `aw-watcher-input`).
|
|
10
|
+
- `mapping.py` -- App-name normalization, browser/editor/terminal classification,
|
|
11
|
+
and semantic `app_category` assignment (browser, editor, terminal, chat, email,
|
|
12
|
+
meeting, docs, design, devtools, media, file_manager, utilities, project_mgmt,
|
|
13
|
+
other).
|
|
14
|
+
- `client.py` -- AW JSON export parser and REST API client. Supports both
|
|
15
|
+
`currentwindow` (window watcher) and `os.hid.input` (input watcher) bucket
|
|
16
|
+
types for file-based and REST-based ingestion.
|
|
17
|
+
|
|
18
|
+
### `input/`
|
|
19
|
+
- Optional OS-specific input aggregators (counts only, not yet implemented).
|
|
20
|
+
The `aw-watcher-input` integration in `activitywatch/` covers the same
|
|
21
|
+
signals when ActivityWatch is running.
|
|
22
|
+
|
|
23
|
+
## Invariants
|
|
24
|
+
- Adapters must output normalized events satisfying the `core.types.Event` protocol.
|
|
25
|
+
- Raw window titles are never persisted -- they are replaced with salted hashes.
|
|
26
|
+
- App names are mapped to reverse-domain identifiers via the known-app registry.
|
|
27
|
+
- Input events carry only aggregate counts (presses, clicks, movement, scroll)
|
|
28
|
+
-- never individual key identities.
|
|
29
|
+
- Keep adapter-specific quirks out of `core/`.
|
|
30
|
+
- Adapters should be swappable without changing feature or model code.
|
|
31
|
+
|
|
32
|
+
## Event Protocol
|
|
33
|
+
The `Event` protocol (`core.types.Event`) defines the minimal attribute set
|
|
34
|
+
that any adapter event must expose. `AWEvent` satisfies this protocol
|
|
35
|
+
structurally (no inheritance required). New adapters should likewise expose
|
|
36
|
+
`timestamp`, `duration_seconds`, `app_id`, `window_title_hash`, `is_browser`,
|
|
37
|
+
`is_editor`, `is_terminal`, and `app_category`.
|
|
38
|
+
|
|
39
|
+
`AWInputEvent` is a separate type that does not implement the `Event` protocol
|
|
40
|
+
-- it feeds into the feature builder as a supplementary data source via the
|
|
41
|
+
`input_events` parameter of `build_features_from_aw_events()`.
|
|
File without changes
|
|
File without changes
|