taskclf 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. taskclf-0.1.0/PKG-INFO +281 -0
  2. taskclf-0.1.0/README.md +248 -0
  3. taskclf-0.1.0/pyproject.toml +66 -0
  4. taskclf-0.1.0/src/taskclf/README.md +20 -0
  5. taskclf-0.1.0/src/taskclf/__init__.py +2 -0
  6. taskclf-0.1.0/src/taskclf/adapters/README.md +41 -0
  7. taskclf-0.1.0/src/taskclf/adapters/__init__.py +0 -0
  8. taskclf-0.1.0/src/taskclf/adapters/activitywatch/__init__.py +0 -0
  9. taskclf-0.1.0/src/taskclf/adapters/activitywatch/client.py +296 -0
  10. taskclf-0.1.0/src/taskclf/adapters/activitywatch/mapping.py +178 -0
  11. taskclf-0.1.0/src/taskclf/adapters/activitywatch/types.py +48 -0
  12. taskclf-0.1.0/src/taskclf/adapters/input/__init__.py +0 -0
  13. taskclf-0.1.0/src/taskclf/adapters/input/linux.py +0 -0
  14. taskclf-0.1.0/src/taskclf/adapters/input/macos.py +1 -0
  15. taskclf-0.1.0/src/taskclf/adapters/input/windows.py +0 -0
  16. taskclf-0.1.0/src/taskclf/cli/README.md +54 -0
  17. taskclf-0.1.0/src/taskclf/cli/__init__.py +0 -0
  18. taskclf-0.1.0/src/taskclf/cli/main.py +2169 -0
  19. taskclf-0.1.0/src/taskclf/core/README.md +22 -0
  20. taskclf-0.1.0/src/taskclf/core/__init__.py +0 -0
  21. taskclf-0.1.0/src/taskclf/core/defaults.py +84 -0
  22. taskclf-0.1.0/src/taskclf/core/drift.py +331 -0
  23. taskclf-0.1.0/src/taskclf/core/hashing.py +42 -0
  24. taskclf-0.1.0/src/taskclf/core/logging.py +88 -0
  25. taskclf-0.1.0/src/taskclf/core/metrics.py +395 -0
  26. taskclf-0.1.0/src/taskclf/core/model_io.py +220 -0
  27. taskclf-0.1.0/src/taskclf/core/schema.py +163 -0
  28. taskclf-0.1.0/src/taskclf/core/store.py +51 -0
  29. taskclf-0.1.0/src/taskclf/core/telemetry.py +237 -0
  30. taskclf-0.1.0/src/taskclf/core/time.py +67 -0
  31. taskclf-0.1.0/src/taskclf/core/types.py +244 -0
  32. taskclf-0.1.0/src/taskclf/core/validation.py +314 -0
  33. taskclf-0.1.0/src/taskclf/features/README.md +36 -0
  34. taskclf-0.1.0/src/taskclf/features/__init__.py +0 -0
  35. taskclf-0.1.0/src/taskclf/features/build.py +439 -0
  36. taskclf-0.1.0/src/taskclf/features/domain.py +138 -0
  37. taskclf-0.1.0/src/taskclf/features/dynamics.py +140 -0
  38. taskclf-0.1.0/src/taskclf/features/sessions.py +69 -0
  39. taskclf-0.1.0/src/taskclf/features/text.py +56 -0
  40. taskclf-0.1.0/src/taskclf/features/windows.py +75 -0
  41. taskclf-0.1.0/src/taskclf/infer/README.md +38 -0
  42. taskclf-0.1.0/src/taskclf/infer/__init__.py +0 -0
  43. taskclf-0.1.0/src/taskclf/infer/baseline.py +199 -0
  44. taskclf-0.1.0/src/taskclf/infer/batch.py +289 -0
  45. taskclf-0.1.0/src/taskclf/infer/calibration.py +326 -0
  46. taskclf-0.1.0/src/taskclf/infer/monitor.py +328 -0
  47. taskclf-0.1.0/src/taskclf/infer/online.py +515 -0
  48. taskclf-0.1.0/src/taskclf/infer/prediction.py +44 -0
  49. taskclf-0.1.0/src/taskclf/infer/resolve.py +161 -0
  50. taskclf-0.1.0/src/taskclf/infer/smooth.py +253 -0
  51. taskclf-0.1.0/src/taskclf/infer/taxonomy.py +341 -0
  52. taskclf-0.1.0/src/taskclf/labels/README.md +28 -0
  53. taskclf-0.1.0/src/taskclf/labels/__init__.py +0 -0
  54. taskclf-0.1.0/src/taskclf/labels/projection.py +99 -0
  55. taskclf-0.1.0/src/taskclf/labels/queue.py +247 -0
  56. taskclf-0.1.0/src/taskclf/labels/store.py +254 -0
  57. taskclf-0.1.0/src/taskclf/labels/weak_rules.py +1 -0
  58. taskclf-0.1.0/src/taskclf/model_registry.py +760 -0
  59. taskclf-0.1.0/src/taskclf/report/README.md +18 -0
  60. taskclf-0.1.0/src/taskclf/report/__init__.py +0 -0
  61. taskclf-0.1.0/src/taskclf/report/daily.py +144 -0
  62. taskclf-0.1.0/src/taskclf/report/export.py +125 -0
  63. taskclf-0.1.0/src/taskclf/train/README.md +31 -0
  64. taskclf-0.1.0/src/taskclf/train/__init__.py +0 -0
  65. taskclf-0.1.0/src/taskclf/train/build_dataset.py +172 -0
  66. taskclf-0.1.0/src/taskclf/train/calibrate.py +267 -0
  67. taskclf-0.1.0/src/taskclf/train/dataset.py +98 -0
  68. taskclf-0.1.0/src/taskclf/train/evaluate.py +394 -0
  69. taskclf-0.1.0/src/taskclf/train/lgbm.py +231 -0
  70. taskclf-0.1.0/src/taskclf/train/retrain.py +650 -0
  71. taskclf-0.1.0/src/taskclf/ui/__init__.py +0 -0
  72. taskclf-0.1.0/src/taskclf/ui/labeling.py +310 -0
  73. taskclf-0.1.0/src/taskclf/ui/tray.py +503 -0
taskclf-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,281 @@
1
+ Metadata-Version: 2.3
2
+ Name: taskclf
3
+ Version: 0.1.0
4
+ Summary: Local-first task classifier that infers your work type from computer activity signals
5
+ Author: fruitiecutiepie
6
+ Author-email: fruitiecutiepie <104437268+fruitiecutiepie@users.noreply.github.com>
7
+ License: MIT
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Operating System :: OS Independent
10
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
11
+ Classifier: Topic :: Office/Business
12
+ Classifier: Topic :: System :: Monitoring
13
+ Requires-Dist: duckdb>=1.4.4
14
+ Requires-Dist: lightgbm>=4.6.0
15
+ Requires-Dist: matplotlib>=3.10.8
16
+ Requires-Dist: pandas>=3.0.0
17
+ Requires-Dist: pyarrow>=23.0.0
18
+ Requires-Dist: pydantic>=2.12.5
19
+ Requires-Dist: rich>=14.3.2
20
+ Requires-Dist: scipy>=1.15.0
21
+ Requires-Dist: scikit-learn>=1.8.0
22
+ Requires-Dist: streamlit>=1.54.0
23
+ Requires-Dist: typer>=0.23.1
24
+ Requires-Dist: pyyaml>=6.0
25
+ Requires-Dist: pystray>=0.19.5
26
+ Requires-Dist: pillow>=11.0.0
27
+ Requires-Dist: plyer>=2.1.0
28
+ Requires-Python: >=3.14
29
+ Project-URL: Homepage, https://github.com/fruitiecutiepie/taskclf
30
+ Project-URL: Documentation, https://fruitiecutiepie.github.io/taskclf
31
+ Project-URL: Repository, https://github.com/fruitiecutiepie/taskclf
32
+ Description-Content-Type: text/markdown
33
+
34
+ # taskclf — Task Type Classifier from Local Activity Signals
35
+
36
+ Train and run a personal task-type classifier (e.g. coding / writing / meetings) using privacy-preserving computer activity signals such as foreground app/window metadata and aggregated input statistics (counts/rates only).
37
+
38
+ This project is intentionally scoped as a **personalized classifier** (single-user first). The architecture keeps:
39
+ - **Collectors** (platform/tool dependent) isolated behind adapters
40
+ - **Features** as a versioned, validated contract
41
+ - **Models** as bundled artifacts with schema checks
42
+ - **Inference** as a small, stable loop that emits task segments and daily summaries
43
+
44
+ ## Goals
45
+ - Fast iteration: first useful model in < 1 week of data
46
+ - Privacy: no raw keystrokes, no raw window titles persisted
47
+ - Stability: feature schema versioning + schema hash gates
48
+ - Extensibility: add new collectors and models without breaking consumers
49
+
50
+ ## Non-Goals
51
+ - Universal (multi-user) generalization out of the box
52
+ - Storing or analyzing raw typed content
53
+ - "Perfect" labeling UI (start minimal, iterate later)
54
+
55
+ ---
56
+
57
+ ## Labels (v1)
58
+
59
+ Eight core labels defined in `schema/labels_v1.json`:
60
+
61
+ | ID | Label | Description |
62
+ |----|-------|-------------|
63
+ | 0 | `Build` | Writing or implementing structured content in editor/terminal |
64
+ | 1 | `Debug` | Investigating issues, terminal-heavy troubleshooting |
65
+ | 2 | `Review` | Reviewing technical material or diffs with light edits |
66
+ | 3 | `Write` | Writing structured non-code content |
67
+ | 4 | `ReadResearch` | Consuming information with minimal production |
68
+ | 5 | `Communicate` | Asynchronous coordination (chat/email) |
69
+ | 6 | `Meet` | Synchronous meetings or calls |
70
+ | 7 | `BreakIdle` | Idle or break period |
71
+
72
+ Labels are stored as **time spans** (not per-keystroke events). Users can remap
73
+ core labels to personal categories via a **taxonomy config**
74
+ (see `configs/user_taxonomy_example.yaml`).
75
+
76
+ ---
77
+
78
+ ## Data Flow Overview
79
+
80
+ ### Structures (pipelines)
81
+ * ETL pipeline reads raw → produces features parquet
82
+ * Training pipeline reads features + labels → produces model
83
+ * Inference pipeline reads new events → emits predictions + segments
84
+
85
+ ### Batch (repeatable)
86
+ 1. **Ingest**: pull ActivityWatch export → `data/raw/aw/`
87
+ 2. **Feature build**: events → per-minute features → `data/processed/features_v1/`
88
+ 3. **Label import**: label spans → `data/processed/labels_v1/`
89
+ 4. **Build dataset**: join features + labels, split by time → training arrays
90
+ 5. **Train**: fit model → `models/<run_id>/`
91
+ 6. **Evaluate**: metrics, acceptance checks, calibration
92
+ 7. **Report**: daily summaries → `artifacts/`
93
+
94
+ ### Online (real-time)
95
+ Every N seconds:
96
+ - read the last minute(s) of events
97
+ - compute the latest feature bucket
98
+ - predict + smooth (with optional calibration and taxonomy mapping)
99
+ - append predictions → `artifacts/`
100
+
101
+ At end-of-day:
102
+ - produce report
103
+
104
+ ---
105
+
106
+ ## Privacy & Safety
107
+ This repo enforces the following:
108
+ - **No raw keystrokes** are stored (only aggregate counts/rates).
109
+ - **No raw window titles** are stored by default.
110
+ - Titles are hashed or locally tokenized; you can keep a local mapping if you choose.
111
+ - Dataset artifacts stay **local-first**.
112
+
113
+ ---
114
+
115
+ ## Quick Start
116
+
117
+ ### Requirements
118
+ - Python >= 3.14
119
+ - `uv` installed
120
+
121
+ ### Setup
122
+ ```bash
123
+ uv sync
124
+ uv run taskclf --help
125
+ ```
126
+
127
+ ### Ingest (ActivityWatch)
128
+
129
+ ```bash
130
+ uv run taskclf ingest aw --input /path/to/activitywatch-export.json
131
+ ```
132
+
133
+ This parses an ActivityWatch JSON export, normalizes app names to reverse-domain
134
+ identifiers, hashes window titles (never storing raw text), and writes
135
+ privacy-safe events to `data/raw/aw/<YYYY-MM-DD>/events.parquet` partitioned by
136
+ date.
137
+
138
+ Options:
139
+ - `--out-dir` — output directory (default: `data/raw/aw`)
140
+ - `--title-salt` — salt for hashing window titles (default: `taskclf-default-salt`)
141
+
142
+ ### Build features
143
+
144
+ ```bash
145
+ uv run taskclf features build --date 2026-02-16
146
+ ```
147
+
148
+ ### Import labels
149
+
150
+ ```bash
151
+ uv run taskclf labels import --file labels.csv
152
+ ```
153
+
154
+ Or add individual label blocks:
155
+
156
+ ```bash
157
+ uv run taskclf labels add-block \
158
+ --start 2026-02-16T09:00:00 --end 2026-02-16T10:00:00 --label Build
159
+ ```
160
+
161
+ Or label what you're doing right now (no timestamps needed):
162
+
163
+ ```bash
164
+ uv run taskclf labels label-now --minutes 10 --label Build
165
+ ```
166
+
167
+ This queries ActivityWatch for a live summary of apps used in the last N minutes
168
+ and creates the label span automatically.
169
+
170
+ ### Train
171
+
172
+ ```bash
173
+ uv run taskclf train lgbm --from 2026-02-01 --to 2026-02-16
174
+ ```
175
+
176
+ ### Run batch inference
177
+
178
+ ```bash
179
+ uv run taskclf infer batch --model-dir models/<run_id> --from 2026-02-01 --to 2026-02-16
180
+ ```
181
+
182
+ ### Run online inference
183
+
184
+ ```bash
185
+ uv run taskclf infer online --model-dir models/<run_id>
186
+ ```
187
+
188
+ Starts a polling loop that queries a running ActivityWatch server, builds
189
+ feature rows from live window events, predicts task types using a trained model,
190
+ smooths predictions, and writes running outputs to `artifacts/`. Press Ctrl+C
191
+ to stop; a final daily report is generated on shutdown.
192
+
193
+ Options:
194
+ - `--poll-seconds` — seconds between polls (default: 60)
195
+ - `--aw-host` — ActivityWatch server URL (default: `http://localhost:5600`)
196
+ - `--smooth-window` — rolling majority window size (default: 3)
197
+ - `--title-salt` — salt for hashing window titles (default: `taskclf-default-salt`)
198
+ - `--out-dir` — output directory (default: `artifacts`)
199
+ - `--label-queue` / `--no-label-queue` — auto-enqueue low-confidence predictions for manual labeling
200
+ - `--label-confidence` — confidence threshold for auto-enqueue (default: 0.55)
201
+
202
+ ### Run baseline (no model needed)
203
+
204
+ ```bash
205
+ uv run taskclf infer baseline --from 2026-02-01 --to 2026-02-16
206
+ ```
207
+
208
+ Rule-based classifier useful for day-1 bootstrapping before you have a trained model.
209
+
210
+ ### Produce report
211
+
212
+ ```bash
213
+ uv run taskclf report daily --segments-file artifacts/segments.json
214
+ ```
215
+
216
+ ---
217
+
218
+ ## CLI Reference
219
+
220
+ All commands: `uv run taskclf --help`
221
+
222
+ | Group | Commands | Purpose |
223
+ |-------|----------|---------|
224
+ | `ingest` | `aw` | Import ActivityWatch exports |
225
+ | `features` | `build` | Build per-minute feature rows |
226
+ | `labels` | `import`, `add-block`, `label-now`, `show-queue`, `project` | Manage label spans and labeling queue |
227
+ | `train` | `build-dataset`, `lgbm`, `evaluate`, `tune-reject`, `calibrate`, `retrain`, `check-retrain` | Training, evaluation, and retraining pipeline |
228
+ | `taxonomy` | `validate`, `show`, `init` | User-defined label groupings |
229
+ | `infer` | `batch`, `online`, `baseline`, `compare` | Prediction (ML, rule-based, comparison) |
230
+ | `report` | `daily` | Daily summaries (JSON/CSV/Parquet) |
231
+ | `monitor` | `drift-check`, `telemetry`, `show` | Feature drift and telemetry tracking |
232
+ | *(top-level)* | `tray` | System tray labeling app with activity transition detection |
233
+
234
+ Full CLI docs: `docs/api/cli/main.md`
235
+
236
+ ---
237
+
238
+ ## Repo Layout
239
+
240
+ * `src/taskclf/` — application code (adapters, cli, core, features, labels, train, infer, report, ui)
241
+ * `schema/` — versioned JSON schemas for features and labels
242
+ * `configs/` — configuration files (model params, retrain policy, taxonomy examples)
243
+ * `docs/` — API reference and guides (served via `make docs-serve`)
244
+ * `data/` — raw and processed datasets (local, gitignored)
245
+ * `models/` — trained model bundles (one folder per run)
246
+ * `artifacts/` — predictions, segments, reports, evaluation outputs
247
+ * `tests/` — test suite
248
+
249
+ ---
250
+
251
+ ## Model Artifact Contract
252
+
253
+ Every saved model bundle (`models/<run_id>/`) contains:
254
+
255
+ * the model file
256
+ * `metadata.json`: feature schema version + hash, label set, training date range, params, dataset hash
257
+ * `metrics.json`: macro/weighted F1, per-class metrics
258
+ * `confusion_matrix.csv`
259
+ * categorical encoders (if applicable)
260
+
261
+ Inference refuses to run if the schema hash mismatches the model bundle.
262
+
263
+ ---
264
+
265
+ ## Development
266
+
267
+ Common tasks are in the `Makefile`:
268
+
269
+ ```bash
270
+ make lint # ruff check .
271
+ make test # pytest
272
+ make typecheck # mypy src
273
+ make docs-serve # local preview at http://127.0.0.1:8000
274
+ make docs-build # static site in site/
275
+ ```
276
+
277
+ ---
278
+
279
+ ## License
280
+
281
+ MIT — a local-first personal project by default.
@@ -0,0 +1,248 @@
1
+ # taskclf — Task Type Classifier from Local Activity Signals
2
+
3
+ Train and run a personal task-type classifier (e.g. coding / writing / meetings) using privacy-preserving computer activity signals such as foreground app/window metadata and aggregated input statistics (counts/rates only).
4
+
5
+ This project is intentionally scoped as a **personalized classifier** (single-user first). The architecture keeps:
6
+ - **Collectors** (platform/tool dependent) isolated behind adapters
7
+ - **Features** as a versioned, validated contract
8
+ - **Models** as bundled artifacts with schema checks
9
+ - **Inference** as a small, stable loop that emits task segments and daily summaries
10
+
11
+ ## Goals
12
+ - Fast iteration: first useful model in < 1 week of data
13
+ - Privacy: no raw keystrokes, no raw window titles persisted
14
+ - Stability: feature schema versioning + schema hash gates
15
+ - Extensibility: add new collectors and models without breaking consumers
16
+
17
+ ## Non-Goals
18
+ - Universal (multi-user) generalization out of the box
19
+ - Storing or analyzing raw typed content
20
+ - "Perfect" labeling UI (start minimal, iterate later)
21
+
22
+ ---
23
+
24
+ ## Labels (v1)
25
+
26
+ Eight core labels defined in `schema/labels_v1.json`:
27
+
28
+ | ID | Label | Description |
29
+ |----|-------|-------------|
30
+ | 0 | `Build` | Writing or implementing structured content in editor/terminal |
31
+ | 1 | `Debug` | Investigating issues, terminal-heavy troubleshooting |
32
+ | 2 | `Review` | Reviewing technical material or diffs with light edits |
33
+ | 3 | `Write` | Writing structured non-code content |
34
+ | 4 | `ReadResearch` | Consuming information with minimal production |
35
+ | 5 | `Communicate` | Asynchronous coordination (chat/email) |
36
+ | 6 | `Meet` | Synchronous meetings or calls |
37
+ | 7 | `BreakIdle` | Idle or break period |
38
+
39
+ Labels are stored as **time spans** (not per-keystroke events). Users can remap
40
+ core labels to personal categories via a **taxonomy config**
41
+ (see `configs/user_taxonomy_example.yaml`).
42
+
43
+ ---
44
+
45
+ ## Data Flow Overview
46
+
47
+ ### Structures (pipelines)
48
+ * ETL pipeline reads raw → produces features parquet
49
+ * Training pipeline reads features + labels → produces model
50
+ * Inference pipeline reads new events → emits predictions + segments
51
+
52
+ ### Batch (repeatable)
53
+ 1. **Ingest**: pull ActivityWatch export → `data/raw/aw/`
54
+ 2. **Feature build**: events → per-minute features → `data/processed/features_v1/`
55
+ 3. **Label import**: label spans → `data/processed/labels_v1/`
56
+ 4. **Build dataset**: join features + labels, split by time → training arrays
57
+ 5. **Train**: fit model → `models/<run_id>/`
58
+ 6. **Evaluate**: metrics, acceptance checks, calibration
59
+ 7. **Report**: daily summaries → `artifacts/`
60
+
61
+ ### Online (real-time)
62
+ Every N seconds:
63
+ - read the last minute(s) of events
64
+ - compute the latest feature bucket
65
+ - predict + smooth (with optional calibration and taxonomy mapping)
66
+ - append predictions → `artifacts/`
67
+
68
+ At end-of-day:
69
+ - produce report
70
+
71
+ ---
72
+
73
+ ## Privacy & Safety
74
+ This repo enforces the following:
75
+ - **No raw keystrokes** are stored (only aggregate counts/rates).
76
+ - **No raw window titles** are stored by default.
77
+ - Titles are hashed or locally tokenized; you can keep a local mapping if you choose.
78
+ - Dataset artifacts stay **local-first**.
79
+
80
+ ---
81
+
82
+ ## Quick Start
83
+
84
+ ### Requirements
85
+ - Python >= 3.14
86
+ - `uv` installed
87
+
88
+ ### Setup
89
+ ```bash
90
+ uv sync
91
+ uv run taskclf --help
92
+ ```
93
+
94
+ ### Ingest (ActivityWatch)
95
+
96
+ ```bash
97
+ uv run taskclf ingest aw --input /path/to/activitywatch-export.json
98
+ ```
99
+
100
+ This parses an ActivityWatch JSON export, normalizes app names to reverse-domain
101
+ identifiers, hashes window titles (never storing raw text), and writes
102
+ privacy-safe events to `data/raw/aw/<YYYY-MM-DD>/events.parquet` partitioned by
103
+ date.
104
+
105
+ Options:
106
+ - `--out-dir` — output directory (default: `data/raw/aw`)
107
+ - `--title-salt` — salt for hashing window titles (default: `taskclf-default-salt`)
108
+
109
+ ### Build features
110
+
111
+ ```bash
112
+ uv run taskclf features build --date 2026-02-16
113
+ ```
114
+
115
+ ### Import labels
116
+
117
+ ```bash
118
+ uv run taskclf labels import --file labels.csv
119
+ ```
120
+
121
+ Or add individual label blocks:
122
+
123
+ ```bash
124
+ uv run taskclf labels add-block \
125
+ --start 2026-02-16T09:00:00 --end 2026-02-16T10:00:00 --label Build
126
+ ```
127
+
128
+ Or label what you're doing right now (no timestamps needed):
129
+
130
+ ```bash
131
+ uv run taskclf labels label-now --minutes 10 --label Build
132
+ ```
133
+
134
+ This queries ActivityWatch for a live summary of apps used in the last N minutes
135
+ and creates the label span automatically.
136
+
137
+ ### Train
138
+
139
+ ```bash
140
+ uv run taskclf train lgbm --from 2026-02-01 --to 2026-02-16
141
+ ```
142
+
143
+ ### Run batch inference
144
+
145
+ ```bash
146
+ uv run taskclf infer batch --model-dir models/<run_id> --from 2026-02-01 --to 2026-02-16
147
+ ```
148
+
149
+ ### Run online inference
150
+
151
+ ```bash
152
+ uv run taskclf infer online --model-dir models/<run_id>
153
+ ```
154
+
155
+ Starts a polling loop that queries a running ActivityWatch server, builds
156
+ feature rows from live window events, predicts task types using a trained model,
157
+ smooths predictions, and writes running outputs to `artifacts/`. Press Ctrl+C
158
+ to stop; a final daily report is generated on shutdown.
159
+
160
+ Options:
161
+ - `--poll-seconds` — seconds between polls (default: 60)
162
+ - `--aw-host` — ActivityWatch server URL (default: `http://localhost:5600`)
163
+ - `--smooth-window` — rolling majority window size (default: 3)
164
+ - `--title-salt` — salt for hashing window titles (default: `taskclf-default-salt`)
165
+ - `--out-dir` — output directory (default: `artifacts`)
166
+ - `--label-queue` / `--no-label-queue` — auto-enqueue low-confidence predictions for manual labeling
167
+ - `--label-confidence` — confidence threshold for auto-enqueue (default: 0.55)
168
+
169
+ ### Run baseline (no model needed)
170
+
171
+ ```bash
172
+ uv run taskclf infer baseline --from 2026-02-01 --to 2026-02-16
173
+ ```
174
+
175
+ Rule-based classifier useful for day-1 bootstrapping before you have a trained model.
176
+
177
+ ### Produce report
178
+
179
+ ```bash
180
+ uv run taskclf report daily --segments-file artifacts/segments.json
181
+ ```
182
+
183
+ ---
184
+
185
+ ## CLI Reference
186
+
187
+ All commands: `uv run taskclf --help`
188
+
189
+ | Group | Commands | Purpose |
190
+ |-------|----------|---------|
191
+ | `ingest` | `aw` | Import ActivityWatch exports |
192
+ | `features` | `build` | Build per-minute feature rows |
193
+ | `labels` | `import`, `add-block`, `label-now`, `show-queue`, `project` | Manage label spans and labeling queue |
194
+ | `train` | `build-dataset`, `lgbm`, `evaluate`, `tune-reject`, `calibrate`, `retrain`, `check-retrain` | Training, evaluation, and retraining pipeline |
195
+ | `taxonomy` | `validate`, `show`, `init` | User-defined label groupings |
196
+ | `infer` | `batch`, `online`, `baseline`, `compare` | Prediction (ML, rule-based, comparison) |
197
+ | `report` | `daily` | Daily summaries (JSON/CSV/Parquet) |
198
+ | `monitor` | `drift-check`, `telemetry`, `show` | Feature drift and telemetry tracking |
199
+ | *(top-level)* | `tray` | System tray labeling app with activity transition detection |
200
+
201
+ Full CLI docs: `docs/api/cli/main.md`
202
+
203
+ ---
204
+
205
+ ## Repo Layout
206
+
207
+ * `src/taskclf/` — application code (adapters, cli, core, features, labels, train, infer, report, ui)
208
+ * `schema/` — versioned JSON schemas for features and labels
209
+ * `configs/` — configuration files (model params, retrain policy, taxonomy examples)
210
+ * `docs/` — API reference and guides (served via `make docs-serve`)
211
+ * `data/` — raw and processed datasets (local, gitignored)
212
+ * `models/` — trained model bundles (one folder per run)
213
+ * `artifacts/` — predictions, segments, reports, evaluation outputs
214
+ * `tests/` — test suite
215
+
216
+ ---
217
+
218
+ ## Model Artifact Contract
219
+
220
+ Every saved model bundle (`models/<run_id>/`) contains:
221
+
222
+ * the model file
223
+ * `metadata.json`: feature schema version + hash, label set, training date range, params, dataset hash
224
+ * `metrics.json`: macro/weighted F1, per-class metrics
225
+ * `confusion_matrix.csv`
226
+ * categorical encoders (if applicable)
227
+
228
+ Inference refuses to run if the schema hash mismatches the model bundle.
229
+
230
+ ---
231
+
232
+ ## Development
233
+
234
+ Common tasks are in the `Makefile`:
235
+
236
+ ```bash
237
+ make lint # ruff check .
238
+ make test # pytest
239
+ make typecheck # mypy src
240
+ make docs-serve # local preview at http://127.0.0.1:8000
241
+ make docs-build # static site in site/
242
+ ```
243
+
244
+ ---
245
+
246
+ ## License
247
+
248
+ MIT — a local-first personal project by default.
@@ -0,0 +1,66 @@
1
+ [project]
2
+ name = "taskclf"
3
+ version = "0.1.0"
4
+ description = "Local-first task classifier that infers your work type from computer activity signals"
5
+ readme = "README.md"
6
+ license = {text = "MIT"}
7
+ authors = [
8
+ { name = "fruitiecutiepie", email = "104437268+fruitiecutiepie@users.noreply.github.com" }
9
+ ]
10
+ requires-python = ">=3.14"
11
+ classifiers = [
12
+ "Programming Language :: Python :: 3",
13
+ "Operating System :: OS Independent",
14
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
15
+ "Topic :: Office/Business",
16
+ "Topic :: System :: Monitoring",
17
+ ]
18
+ dependencies = [
19
+ "duckdb>=1.4.4",
20
+ "lightgbm>=4.6.0",
21
+ "matplotlib>=3.10.8",
22
+ "pandas>=3.0.0",
23
+ "pyarrow>=23.0.0",
24
+ "pydantic>=2.12.5",
25
+ "rich>=14.3.2",
26
+ "scipy>=1.15.0",
27
+ "scikit-learn>=1.8.0",
28
+ "streamlit>=1.54.0",
29
+ "typer>=0.23.1",
30
+ "pyyaml>=6.0",
31
+ "pystray>=0.19.5",
32
+ "Pillow>=11.0.0",
33
+ "plyer>=2.1.0",
34
+ ]
35
+
36
+ [project.urls]
37
+ Homepage = "https://github.com/fruitiecutiepie/taskclf"
38
+ Documentation = "https://fruitiecutiepie.github.io/taskclf"
39
+ Repository = "https://github.com/fruitiecutiepie/taskclf"
40
+
41
+ [project.scripts]
42
+ taskclf = "taskclf.cli.main:app"
43
+
44
+ [tool.uv]
45
+ override-dependencies = [
46
+ "pandas>=3.0.0",
47
+ ]
48
+
49
+ [build-system]
50
+ requires = ["uv_build>=0.9.5,<0.10.0"]
51
+ build-backend = "uv_build"
52
+
53
+ [dependency-groups]
54
+ dev = [
55
+ "black>=26.1.0",
56
+ "mypy>=1.19.1",
57
+ "pandas-stubs>=3.0.0.260204",
58
+ "pre-commit>=4.5.1",
59
+ "pytest>=9.0.2",
60
+ "pytest-xdist>=3.8.0",
61
+ "ruff>=0.15.1",
62
+ ]
63
+ docs = [
64
+ "zensical",
65
+ "mkdocstrings-python",
66
+ ]
@@ -0,0 +1,20 @@
1
+ # src/taskclf/
2
+
3
+ Main package.
4
+
5
+ ## Design principles
6
+ - **Adapters** isolate unstable platform/tool integrations.
7
+ - **Core** defines contracts, validation, and data/model IO.
8
+ - **Pipelines** compose pure transforms into repeatable runs.
9
+ - **CLI** is the stable interface for humans and automation.
10
+
11
+ ## Subpackages
12
+ - `core/` — schemas, validation, storage primitives, model IO, metrics, drift detection, telemetry
13
+ - `adapters/` — ActivityWatch + input collectors
14
+ - `features/` — feature computation (event -> bucketed features, rolling windows, sessions)
15
+ - `labels/` — label span formats, import/export, projection onto feature windows, active labeling queue, weak label rules
16
+ - `train/` — dataset construction, splits, training, evaluation, calibration, retraining pipeline
17
+ - `infer/` — batch and online inference, rule-based baseline, smoothing, calibration, taxonomy mapping, drift monitoring
18
+ - `report/` — daily summaries and exports (JSON/CSV/Parquet)
19
+ - `cli/` — Typer entrypoint and commands
20
+ - `ui/` — labeling UI (Streamlit) and system tray labeling app
@@ -0,0 +1,2 @@
1
+ def main() -> None:
2
+ print("Hello from taskclf!")
@@ -0,0 +1,41 @@
1
+ # adapters/
2
+
3
+ Integrations with external tools or OS APIs.
4
+
5
+ ## Subpackages
6
+
7
+ ### `activitywatch/`
8
+ - `types.py` -- `AWEvent` (window events) and `AWInputEvent` (keyboard/mouse
9
+ aggregate counts from `aw-watcher-input`).
10
+ - `mapping.py` -- App-name normalization, browser/editor/terminal classification,
11
+ and semantic `app_category` assignment (browser, editor, terminal, chat, email,
12
+ meeting, docs, design, devtools, media, file_manager, utilities, project_mgmt,
13
+ other).
14
+ - `client.py` -- AW JSON export parser and REST API client. Supports both
15
+ `currentwindow` (window watcher) and `os.hid.input` (input watcher) bucket
16
+ types for file-based and REST-based ingestion.
17
+
18
+ ### `input/`
19
+ - Optional OS-specific input aggregators (counts only, not yet implemented).
20
+ The `aw-watcher-input` integration in `activitywatch/` covers the same
21
+ signals when ActivityWatch is running.
22
+
23
+ ## Invariants
24
+ - Adapters must output normalized events satisfying the `core.types.Event` protocol.
25
+ - Raw window titles are never persisted -- they are replaced with salted hashes.
26
+ - App names are mapped to reverse-domain identifiers via the known-app registry.
27
+ - Input events carry only aggregate counts (presses, clicks, movement, scroll)
28
+ -- never individual key identities.
29
+ - Keep adapter-specific quirks out of `core/`.
30
+ - Adapters should be swappable without changing feature or model code.
31
+
32
+ ## Event Protocol
33
+ The `Event` protocol (`core.types.Event`) defines the minimal attribute set
34
+ that any adapter event must expose. `AWEvent` satisfies this protocol
35
+ structurally (no inheritance required). New adapters should likewise expose
36
+ `timestamp`, `duration_seconds`, `app_id`, `window_title_hash`, `is_browser`,
37
+ `is_editor`, `is_terminal`, and `app_category`.
38
+
39
+ `AWInputEvent` is a separate type that does not implement the `Event` protocol
40
+ -- it feeds into the feature builder as a supplementary data source via the
41
+ `input_events` parameter of `build_features_from_aw_events()`.
File without changes