hopscotch-analytics 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hopscotch_analytics-0.3.0/.github/workflows/release.yml +36 -0
- hopscotch_analytics-0.3.0/CHANGELOG.md +130 -0
- hopscotch_analytics-0.3.0/PKG-INFO +20 -0
- hopscotch_analytics-0.3.0/pyproject.toml +42 -0
- hopscotch_analytics-0.3.0/src/hopscotch/__init__.py +4 -0
- hopscotch_analytics-0.3.0/src/hopscotch/data_processors/__init__.py +20 -0
- hopscotch_analytics-0.3.0/src/hopscotch/data_processors/add_clusters.py +238 -0
- hopscotch_analytics-0.3.0/src/hopscotch/data_processors/add_events.py +159 -0
- hopscotch_analytics-0.3.0/src/hopscotch/data_processors/add_segment.py +158 -0
- hopscotch_analytics-0.3.0/src/hopscotch/data_processors/add_start_end_events.py +45 -0
- hopscotch_analytics-0.3.0/src/hopscotch/data_processors/collapse_events.py +579 -0
- hopscotch_analytics-0.3.0/src/hopscotch/data_processors/data_processor.py +8 -0
- hopscotch_analytics-0.3.0/src/hopscotch/data_processors/drop_segment.py +26 -0
- hopscotch_analytics-0.3.0/src/hopscotch/data_processors/edit_events.py +64 -0
- hopscotch_analytics-0.3.0/src/hopscotch/data_processors/filter_events.py +111 -0
- hopscotch_analytics-0.3.0/src/hopscotch/data_processors/filter_paths.py +269 -0
- hopscotch_analytics-0.3.0/src/hopscotch/data_processors/rename_events.py +47 -0
- hopscotch_analytics-0.3.0/src/hopscotch/data_processors/sample_paths.py +70 -0
- hopscotch_analytics-0.3.0/src/hopscotch/data_processors/split_sessions.py +124 -0
- hopscotch_analytics-0.3.0/src/hopscotch/data_processors/truncate_paths.py +104 -0
- hopscotch_analytics-0.3.0/src/hopscotch/data_processors/url_events.py +348 -0
- hopscotch_analytics-0.3.0/src/hopscotch/datasets/__init__.py +0 -0
- hopscotch_analytics-0.3.0/src/hopscotch/datasets/ecom.csv.gz +0 -0
- hopscotch_analytics-0.3.0/src/hopscotch/datasets/ecom.py +359 -0
- hopscotch_analytics-0.3.0/src/hopscotch/eventstream/__init__.py +4 -0
- hopscotch_analytics-0.3.0/src/hopscotch/eventstream/event_type.py +34 -0
- hopscotch_analytics-0.3.0/src/hopscotch/eventstream/eventstream.py +496 -0
- hopscotch_analytics-0.3.0/src/hopscotch/eventstream/schema.py +46 -0
- hopscotch_analytics-0.3.0/src/hopscotch/exceptions.py +69 -0
- hopscotch_analytics-0.3.0/src/hopscotch/metrics/__init__.py +0 -0
- hopscotch_analytics-0.3.0/src/hopscotch/metrics/metric_builder.py +767 -0
- hopscotch_analytics-0.3.0/src/hopscotch/static/widget.css +1 -0
- hopscotch_analytics-0.3.0/src/hopscotch/tools/__init__.py +7 -0
- hopscotch_analytics-0.3.0/src/hopscotch/tools/cluster_analysis.py +280 -0
- hopscotch_analytics-0.3.0/src/hopscotch/tools/funnel.py +84 -0
- hopscotch_analytics-0.3.0/src/hopscotch/tools/segment_overview.py +602 -0
- hopscotch_analytics-0.3.0/src/hopscotch/tools/step_matrix.py +312 -0
- hopscotch_analytics-0.3.0/src/hopscotch/tools/transition_matrix.py +153 -0
- hopscotch_analytics-0.3.0/src/hopscotch/tools/types.py +14 -0
- hopscotch_analytics-0.3.0/src/hopscotch/utils/__init__.py +0 -0
- hopscotch_analytics-0.3.0/src/hopscotch/utils/sequences.py +35 -0
- hopscotch_analytics-0.3.0/src/hopscotch/utils/session_detection.py +326 -0
- hopscotch_analytics-0.3.0/src/hopscotch/widgets/__init__.py +3 -0
- hopscotch_analytics-0.3.0/src/hopscotch/widgets/_esm.py +34 -0
- hopscotch_analytics-0.3.0/src/hopscotch/widgets/cluster_analysis.py +288 -0
- hopscotch_analytics-0.3.0/src/hopscotch/widgets/funnel.py +217 -0
- hopscotch_analytics-0.3.0/src/hopscotch/widgets/segment_overview.py +225 -0
- hopscotch_analytics-0.3.0/src/hopscotch/widgets/step_sankey.py +296 -0
- hopscotch_analytics-0.3.0/src/hopscotch/widgets/transition_graph.py +312 -0
- hopscotch_analytics-0.3.0/tests/__init__.py +0 -0
- hopscotch_analytics-0.3.0/tests/conftest.py +20 -0
- hopscotch_analytics-0.3.0/tests/data_processors/__init__.py +0 -0
- hopscotch_analytics-0.3.0/tests/data_processors/add_clusters_test.py +387 -0
- hopscotch_analytics-0.3.0/tests/data_processors/add_events_test.py +279 -0
- hopscotch_analytics-0.3.0/tests/data_processors/add_start_end_events_test.py +83 -0
- hopscotch_analytics-0.3.0/tests/data_processors/collapse_events_test.py +697 -0
- hopscotch_analytics-0.3.0/tests/data_processors/edit_events_test.py +115 -0
- hopscotch_analytics-0.3.0/tests/data_processors/filter_events_test.py +187 -0
- hopscotch_analytics-0.3.0/tests/data_processors/filter_paths_test.py +278 -0
- hopscotch_analytics-0.3.0/tests/data_processors/rename_events_test.py +98 -0
- hopscotch_analytics-0.3.0/tests/data_processors/sample_paths.py +89 -0
- hopscotch_analytics-0.3.0/tests/data_processors/segments_test.py +196 -0
- hopscotch_analytics-0.3.0/tests/data_processors/split_sessions_test.py +254 -0
- hopscotch_analytics-0.3.0/tests/data_processors/truncate_paths_test.py +173 -0
- hopscotch_analytics-0.3.0/tests/data_processors/url_events_test.py +681 -0
- hopscotch_analytics-0.3.0/tests/test_eventstream.py +101 -0
- hopscotch_analytics-0.3.0/tests/tools/__init__.py +0 -0
- hopscotch_analytics-0.3.0/tests/tools/cluster_analysis_test.py +292 -0
- hopscotch_analytics-0.3.0/tests/tools/funnel_test.py +120 -0
- hopscotch_analytics-0.3.0/tests/tools/segment_overview_test.py +1117 -0
- hopscotch_analytics-0.3.0/tests/tools/step_matrix_input.csv +21 -0
- hopscotch_analytics-0.3.0/tests/tools/step_matrix_test.py +300 -0
- hopscotch_analytics-0.3.0/tests/tools/transition_matrix_input.csv +19 -0
- hopscotch_analytics-0.3.0/tests/tools/transition_matrix_test.py +296 -0
- hopscotch_analytics-0.3.0/uv.lock +2814 -0
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
name: Release
|
|
2
|
+
|
|
3
|
+
# Triggered when hopscotch-lib-dev pushes a version tag here
|
|
4
|
+
# after syncing the filtered Python history.
|
|
5
|
+
# Can also be triggered manually via workflow_dispatch.
|
|
6
|
+
|
|
7
|
+
on:
|
|
8
|
+
push:
|
|
9
|
+
tags:
|
|
10
|
+
- 'v*'
|
|
11
|
+
workflow_dispatch:
|
|
12
|
+
|
|
13
|
+
jobs:
|
|
14
|
+
publish:
|
|
15
|
+
if: ${{ !contains(github.ref, 'rc') }}
|
|
16
|
+
runs-on: ubuntu-latest
|
|
17
|
+
environment: pypi
|
|
18
|
+
permissions:
|
|
19
|
+
id-token: write # required for OIDC Trusted Publisher
|
|
20
|
+
|
|
21
|
+
steps:
|
|
22
|
+
- uses: actions/checkout@v5
|
|
23
|
+
|
|
24
|
+
- name: Set up Python 3.13
|
|
25
|
+
uses: actions/setup-python@v6
|
|
26
|
+
with:
|
|
27
|
+
python-version: '3.13'
|
|
28
|
+
|
|
29
|
+
- name: Install uv
|
|
30
|
+
run: pip install uv
|
|
31
|
+
|
|
32
|
+
- name: Build package
|
|
33
|
+
run: uv build
|
|
34
|
+
|
|
35
|
+
- name: Publish to PyPI
|
|
36
|
+
run: uv publish
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
Format: [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
|
|
5
|
+
|
|
6
|
+
## [Unreleased]
|
|
7
|
+
|
|
8
|
+
## [0.3.0] - 2026-06-21
|
|
9
|
+
|
|
10
|
+
### Added — Cluster Analysis
|
|
11
|
+
- **`ClusterAnalysis` tool**: KMeans and HDBSCAN clustering with optional NMF
|
|
12
|
+
decomposition; supports `n_clusters` / `nmf_k` as single value, range
|
|
13
|
+
(`"3-8"`), or comma-separated list (`"3,5,7"`) for silhouette grid search
|
|
14
|
+
- **`ClusterAnalysisWidget`** (anywidget): interactive sidebar with Configure
|
|
15
|
+
Features, Configure Metrics, clustering method, scaler, N Clusters, NMF
|
|
16
|
+
Decomposition fields; heatmap overview, silhouette chart, NMF H/W matrices
|
|
17
|
+
- **Silhouette grid search**: selects best parameter set automatically and
|
|
18
|
+
highlights it in the bar chart
|
|
19
|
+
|
|
20
|
+
### Added — Segment Overview
|
|
21
|
+
- **`segment_levels` traitlet**: segment column values available in
|
|
22
|
+
Configure Metrics `belongs_to` fields without extra round-trips
|
|
23
|
+
- **`active_days` event selector** in Configure Metrics and Configure
|
|
24
|
+
Features: optional filter to count only days with specific events
|
|
25
|
+
|
|
26
|
+
### Added — shared metric form components
|
|
27
|
+
- `metric_config_row.tsx`: unified `MetricRow`, `MultiSelect`, `SingleSelect`,
|
|
28
|
+
`InfoTip`, `validateMetricCfg` shared across Configure Features (CA),
|
|
29
|
+
Configure Metrics (CA), and Configure Metrics (SO) — ~650 lines removed
|
|
30
|
+
- `showAgg=false` for Configure Features (no aggregation dropdown)
|
|
31
|
+
- All three forms default new rows to `event_count` with all events selected
|
|
32
|
+
- InfoTip on Cluster Analysis sidebar Metrics label explaining overview metrics
|
|
33
|
+
|
|
34
|
+
### Changed
|
|
35
|
+
- Package renamed from `hopscotch` to `hopscotch-analytics` on PyPI
|
|
36
|
+
- Aggregation global dropdown removed from Cluster Analysis sidebar;
|
|
37
|
+
aggregation is now configured per-metric in Configure Metrics
|
|
38
|
+
|
|
39
|
+
### Fixed — Segment Overview
|
|
40
|
+
- `segLevels` ReferenceError crash in metrics overlay
|
|
41
|
+
- `SidebarSH` inline component causing remount and focus loss on
|
|
42
|
+
Configure Metrics open
|
|
43
|
+
|
|
44
|
+
### Fixed — Cluster Analysis
|
|
45
|
+
- Default `n_clusters=""` causing KMeans to fail on widget init; now
|
|
46
|
+
defaults to `"3-8"`
|
|
47
|
+
- NMF K text field losing focus on each keystroke
|
|
48
|
+
|
|
49
|
+
## [0.2.2] - 2026-06-19
|
|
50
|
+
|
|
51
|
+
### Fixed
|
|
52
|
+
- `_get_esm()`: fetch widget.js source and return as string to anywidget
|
|
53
|
+
(anywidget requires inline JS string, not a URL)
|
|
54
|
+
|
|
55
|
+
## [0.2.1] - 2026-06-19
|
|
56
|
+
|
|
57
|
+
### Fixed
|
|
58
|
+
- `_get_esm()`: download widget.js to local cache instead of passing URL
|
|
59
|
+
to anywidget directly
|
|
60
|
+
|
|
61
|
+
## [0.2.0] - 2026-06-19
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
### Added — Transition Graph
|
|
65
|
+
- **Diff mode node coloring**: nodes tinted red/blue based on event share
|
|
66
|
+
difference between groups; inner circle radius shrinks proportionally
|
|
67
|
+
to diff magnitude (zero diff = normal donut, max diff = solid circle)
|
|
68
|
+
- **Node hover tooltip** in diff mode: shows share breakdown per group
|
|
69
|
+
with subtitle "share of event in group"
|
|
70
|
+
- **Colored dots** (● blue / ● red) next to segment value dropdowns so
|
|
71
|
+
it's immediately clear which color maps to which group
|
|
72
|
+
- **Fit to canvas** button (expand icon, top-right toolbar) replaces the
|
|
73
|
+
old Reset Layout button; calls `cy.fit` with 12 px padding
|
|
74
|
+
- **Auto-fit on load**: graph fits the canvas automatically on every
|
|
75
|
+
render (both auto-layout and saved-positions paths)
|
|
76
|
+
- **Edge Weight Type** label (renamed from "Value Type") in settings sidebar
|
|
77
|
+
|
|
78
|
+
### Fixed — Transition Graph
|
|
79
|
+
- `value1Label` / `value2Label` falling back to "group1"/"group2" when
|
|
80
|
+
diff values are boolean `false` — now uses `String()` coercion
|
|
81
|
+
- Diff tooltip label now correctly shows the actual segment value
|
|
82
|
+
|
|
83
|
+
### Added — Step Sankey
|
|
84
|
+
- **`step_window` parameter**: frontend-only slider in sidebar Visibility
|
|
85
|
+
Settings (Radix single-thumb slider, amber track); defaults to 3;
|
|
86
|
+
limits displayed columns per anchor without recomputing backend data
|
|
87
|
+
- **Event Count filter**: sidebar RangeSlider now populated with real
|
|
88
|
+
`COUNT(DISTINCT path_id)` per event from Python backend;
|
|
89
|
+
`_populationCustomized` flag prevents reset on recompute
|
|
90
|
+
- **Pattern edit menu** matching platform UX: path_start/path_end show
|
|
91
|
+
insert panel directly; internal events show Insert Before / Insert
|
|
92
|
+
After / Replace / Delete first-level menu with Event / Gap+Event tabs
|
|
93
|
+
- `path_start` and `path_end` included in all event dropdowns
|
|
94
|
+
- Diff tooltip label fix (same boolean coercion fix as transition graph)
|
|
95
|
+
|
|
96
|
+
### Fixed — Step Sankey
|
|
97
|
+
- **Gap+Event for `path_end`** now inserts `event->.*->path_end`
|
|
98
|
+
(own matrix block) instead of collapsing with existing wildcard
|
|
99
|
+
- **`PatternStore`**: `addWithTrailingGap` method; default display
|
|
100
|
+
pattern (`path_start->.*->path_end`) no longer leaks into edit state
|
|
101
|
+
when no real pattern is set — adding events from path_start no longer
|
|
102
|
+
appends path_end
|
|
103
|
+
- **Column filtering**: end-aligned and start-aligned matrices now
|
|
104
|
+
correctly limit variable columns on both sides by `stepWindow`
|
|
105
|
+
- **`path_start`/`path_end`** excluded from regular column event nodes
|
|
106
|
+
(appear only as fixed anchors)
|
|
107
|
+
- **`_find_center_position`**: regex split handles leading/trailing `.*`
|
|
108
|
+
wildcards — fixes `PatternNoMatchError` for `.*->path_end` patterns
|
|
109
|
+
- **Diff mode** with `.*->path_end`: `original_pattern` passed to sub-
|
|
110
|
+
calls so `skip_first_matrix` applies correctly in each group
|
|
111
|
+
- **`path_start->.*->path_end`** default no longer shown as a third
|
|
112
|
+
matrix block when user adds a central event via GUI
|
|
113
|
+
|
|
114
|
+
### Fixed — Python widget
|
|
115
|
+
- **Save initial widget state** on creation with `object_name` —
|
|
116
|
+
state is now written synchronously so a subsequent load always
|
|
117
|
+
restores `path_pattern`, `diff`, etc.
|
|
118
|
+
|
|
119
|
+
## [0.1.0] - 2026-06-17
|
|
120
|
+
|
|
121
|
+
### Added
|
|
122
|
+
- `Eventstream` class with DuckDB-powered step matrix and transition matrix
|
|
123
|
+
- `StepSankeyWidget` (anywidget): interactive step sankey with pattern editing,
|
|
124
|
+
diff mode, segment filters, `max_steps`, persistence via `object_name`
|
|
125
|
+
- `TransitionGraphWidget` (anywidget): Cytoscape.js force-directed graph,
|
|
126
|
+
edge weight types, diff mode, event count filter, node/edge color picker
|
|
127
|
+
- Supabase OTP authentication paywall baked into bundle at build time
|
|
128
|
+
- GitHub Actions release pipeline: builds JS bundle, creates GitHub Release
|
|
129
|
+
with `widget.js` asset, syncs Python history to public repo via
|
|
130
|
+
`git filter-repo`
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hopscotch-analytics
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Clickstream analysis library for Jupyter / Colab / VS Code
|
|
5
|
+
Requires-Python: >=3.11
|
|
6
|
+
Requires-Dist: anywidget>=0.9
|
|
7
|
+
Requires-Dist: duckdb>=1.1
|
|
8
|
+
Requires-Dist: matplotlib>=3.11.0
|
|
9
|
+
Requires-Dist: numpy>=1.26
|
|
10
|
+
Requires-Dist: pandas>=2.2
|
|
11
|
+
Requires-Dist: pyarrow>=14.0
|
|
12
|
+
Requires-Dist: scikit-learn>=1.3
|
|
13
|
+
Requires-Dist: scipy>=1.11
|
|
14
|
+
Requires-Dist: traitlets>=5.0
|
|
15
|
+
Provides-Extra: dev
|
|
16
|
+
Requires-Dist: ipykernel; extra == 'dev'
|
|
17
|
+
Requires-Dist: jupyterlab; extra == 'dev'
|
|
18
|
+
Requires-Dist: notebook; extra == 'dev'
|
|
19
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
20
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "hopscotch-analytics"
|
|
7
|
+
version = "0.3.0"
|
|
8
|
+
description = "Clickstream analysis library for Jupyter / Colab / VS Code"
|
|
9
|
+
requires-python = ">=3.11"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"pandas>=2.2",
|
|
12
|
+
"duckdb>=1.1",
|
|
13
|
+
"anywidget>=0.9",
|
|
14
|
+
"traitlets>=5.0",
|
|
15
|
+
"pyarrow>=14.0",
|
|
16
|
+
"scikit-learn>=1.3",
|
|
17
|
+
"numpy>=1.26",
|
|
18
|
+
"scipy>=1.11",
|
|
19
|
+
"matplotlib>=3.11.0",
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
[project.optional-dependencies]
|
|
23
|
+
dev = [
|
|
24
|
+
"pytest>=8.0",
|
|
25
|
+
"pytest-cov",
|
|
26
|
+
"ipykernel",
|
|
27
|
+
"jupyterlab",
|
|
28
|
+
"notebook",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[tool.hatch.build.targets.wheel]
|
|
32
|
+
packages = ["src/hopscotch"]
|
|
33
|
+
|
|
34
|
+
[tool.pytest.ini_options]
|
|
35
|
+
testpaths = ["tests"]
|
|
36
|
+
|
|
37
|
+
[dependency-groups]
|
|
38
|
+
dev = [
|
|
39
|
+
"ipykernel>=7.3.0",
|
|
40
|
+
"jupyterlab>=4.5.8",
|
|
41
|
+
"pytest>=9.1.0",
|
|
42
|
+
]
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
from hopscotch.data_processors.add_clusters import AddClusters
|
|
2
|
+
from hopscotch.data_processors.add_events import AddEvents
|
|
3
|
+
from hopscotch.data_processors.add_segment import AddSegment
|
|
4
|
+
from hopscotch.data_processors.add_start_end_events import AddStartEndEvents
|
|
5
|
+
from hopscotch.data_processors.collapse_events import CollapseEvents
|
|
6
|
+
from hopscotch.data_processors.drop_segment import DropSegment
|
|
7
|
+
from hopscotch.data_processors.edit_events import EditEvents
|
|
8
|
+
from hopscotch.data_processors.filter_events import FilterEvents
|
|
9
|
+
from hopscotch.data_processors.filter_paths import FilterPaths
|
|
10
|
+
from hopscotch.data_processors.rename_events import RenameEvents
|
|
11
|
+
from hopscotch.data_processors.sample_paths import SamplePaths
|
|
12
|
+
from hopscotch.data_processors.split_sessions import SplitSessions
|
|
13
|
+
from hopscotch.data_processors.truncate_paths import TruncatePaths
|
|
14
|
+
from hopscotch.data_processors.url_events import UrlEvents
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"AddClusters", "AddEvents", "AddSegment", "AddStartEndEvents", "CollapseEvents", "DropSegment",
|
|
18
|
+
"EditEvents", "FilterEvents", "FilterPaths", "RenameEvents", "SamplePaths",
|
|
19
|
+
"SplitSessions", "TruncatePaths", "UrlEvents",
|
|
20
|
+
]
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
"""
|
|
2
|
+
AddClusters - data processor for clustering trajectories based on metrics.
|
|
3
|
+
|
|
4
|
+
Clusters trajectories and adds a new segment column with cluster labels.
|
|
5
|
+
Uses MetricBuilder for feature calculation.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Any, Dict, List, Literal, Tuple
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
import pandas as pd
|
|
12
|
+
from sklearn.cluster import HDBSCAN, KMeans
|
|
13
|
+
from sklearn.decomposition import NMF
|
|
14
|
+
from sklearn.preprocessing import MinMaxScaler, StandardScaler
|
|
15
|
+
|
|
16
|
+
from hopscotch.data_processors.data_processor import DataProcessor
|
|
17
|
+
from hopscotch.eventstream.schema import EventstreamSchema
|
|
18
|
+
from hopscotch.exceptions import PreprocessingConfigError
|
|
19
|
+
from hopscotch.metrics.metric_builder import MetricBuilder
|
|
20
|
+
|
|
21
|
+
PROCESSOR_NAME = "add_clusters"
|
|
22
|
+
|
|
23
|
+
T_ClusteringMethod = Literal["kmeans", "hdbscan"]
|
|
24
|
+
T_Scaler = Literal["minmax", "std"] | None
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class AddClusters(DataProcessor):
    """
    Data processor that clusters trajectories based on computed metrics.

    Adds a new segment column containing cluster labels for each trajectory.

    Attributes:
        segment_name: Name of the new segment column
        metrics: List of metric configurations (MetricBuilder format)
        method: Clustering method ("kmeans" or "hdbscan")
        scaler: Feature scaler method ("minmax", "std", or None)
        method_params: Parameters for the clustering algorithm
        nmf_k: Number of NMF components used to reduce the scaled features
            before clustering (None disables the NMF step)
        eventstream: Eventstream instance (needed for MetricBuilder)
        path_id_col: Path ID column name (optional)
        event_col: Event column name (optional)
    """

    segment_name: str
    metrics: List[Dict[str, Any]]
    method: T_ClusteringMethod
    scaler: T_Scaler
    method_params: Dict[str, Any]
    nmf_k: int | None
    eventstream: Any
    path_id_col: str | None
    event_col: str | None

    def __init__(
        self,
        eventstream: Any,
        segment_name: str,
        metrics: List[Dict[str, Any]],
        method: T_ClusteringMethod = "kmeans",
        scaler: T_Scaler = "minmax",
        n_clusters: int | None = None,
        min_cluster_size: int | None = None,
        cluster_selection_epsilon: float | None = None,
        nmf_k: int | None = None,
        path_id_col: str | None = None,
        event_col: str | None = None,
    ) -> None:
        """
        Initialize AddClusters processor.

        Args:
            eventstream: Eventstream instance for metric calculation
            segment_name: Name of the new segment column with cluster labels
            metrics: List of metric configurations for MetricBuilder.
                Each config is a dict with 'metric' and optional 'metric_args'.
                Example: [
                    {"metric": "length"},
                    {"metric": "duration"},
                    {"metric": "event_count", "metric_args": {"event": "purchase"}}
                ]
            method: Clustering method - "kmeans" or "hdbscan"
            scaler: Feature scaler - "minmax", "std", or None.
                Default is "minmax".
            n_clusters: Number of clusters for k-means (required for kmeans)
            min_cluster_size: Minimum cluster size for HDBSCAN
            cluster_selection_epsilon: Cluster selection epsilon for HDBSCAN
            nmf_k: Number of NMF components to reduce the scaled features to
                before clustering. None (default) skips the NMF step. NMF
                needs non-negative features, so combine it with "minmax"
                scaling (or non-negative unscaled metrics).
            path_id_col: Path ID column (if None, taken from schema)
            event_col: Event column (if None, taken from schema)

        Raises:
            PreprocessingConfigError: If ``method`` is "kmeans" without
                ``n_clusters``, or if ``method`` is not a known method.
        """
        self.eventstream = eventstream
        self.segment_name = segment_name
        self.metrics = metrics
        self.method = method
        self.scaler = scaler
        self.nmf_k = nmf_k
        self.path_id_col = path_id_col
        self.event_col = event_col

        # Validate method and collect method-specific parameters
        if method == "kmeans":
            if n_clusters is None:
                raise PreprocessingConfigError(PROCESSOR_NAME, "n_clusters is required for kmeans method")
            self.method_params = {"n_clusters": n_clusters}
        elif method == "hdbscan":
            # Only explicitly provided values are stored; _cluster() supplies
            # the defaults for anything left out.
            self.method_params = {}
            if min_cluster_size is not None:
                self.method_params["min_cluster_size"] = min_cluster_size
            if cluster_selection_epsilon is not None:
                self.method_params["cluster_selection_epsilon"] = cluster_selection_epsilon
        else:
            raise PreprocessingConfigError(PROCESSOR_NAME, f"Unknown clustering method: {method}. Use 'kmeans' or 'hdbscan'.")

        super().__init__()

    def apply(
        self, df: pd.DataFrame, schema: EventstreamSchema
    ) -> Tuple[pd.DataFrame, EventstreamSchema]:
        """
        Apply clustering to trajectories and add cluster labels as a new segment.

        Args:
            df: Input DataFrame with eventstream data
            schema: EventstreamSchema with column definitions

        Returns:
            Tuple of (new_df, new_schema) with added cluster segment

        Raises:
            PreprocessingConfigError: If the segment name is already taken,
                if no metrics/features could be computed, or if NMF was
                requested but the scaled features contain negative values.
        """
        # Validate segment name doesn't exist
        if self.segment_name in df.columns:
            if self.segment_name in schema.segment_cols:
                raise PreprocessingConfigError(PROCESSOR_NAME, f"Segment '{self.segment_name}' already exists.")
            else:
                raise PreprocessingConfigError(
                    PROCESSOR_NAME,
                    f"Name '{self.segment_name}' is already reserved in the eventstream."
                )

        path_id_col = self.path_id_col or schema.path_col
        event_col = self.event_col or schema.event_col  # noqa: F841 - reserved for future use

        # Build metrics using MetricBuilder
        metric_builder = MetricBuilder(self.eventstream)
        metrics_df = metric_builder.build_metrics(self.metrics, path_id_col)

        if metrics_df.empty:
            raise PreprocessingConfigError(PROCESSOR_NAME, "No metrics were computed. Check metric configurations.")

        # Handle NaN values - fill with 0 for clustering
        features = metrics_df.fillna(0).values

        if features.shape[1] == 0:
            raise PreprocessingConfigError(PROCESSOR_NAME, "No feature columns were generated from metrics.")

        # Apply scaling
        features_scaled = self._scale_features(features)

        # Apply NMF dimensionality reduction if requested
        if self.nmf_k is not None:
            # NMF is only defined for non-negative input. "std" scaling (or
            # unscaled metrics with negative values) violates that, and
            # scikit-learn would fail with a generic ValueError; report it as
            # a configuration error like every other misconfiguration here.
            if np.any(features_scaled < 0):
                raise PreprocessingConfigError(
                    PROCESSOR_NAME,
                    "NMF requires non-negative features, but the scaled feature matrix contains "
                    "negative values. Use scaler='minmax' or disable NMF (nmf_k=None)."
                )
            nmf = NMF(n_components=self.nmf_k, random_state=42)
            features_scaled = nmf.fit_transform(features_scaled)

        # Perform clustering
        cluster_labels = self._cluster(features_scaled)

        # Create cluster labels Series indexed like the metrics frame
        # (presumably one row per path id - the mapping below relies on it)
        cluster_series = pd.Series(
            cluster_labels,
            index=metrics_df.index,
            name=self.segment_name
        )

        # Convert labels to string for categorical representation
        # HDBSCAN uses -1 for noise points, handle specially
        cluster_series = cluster_series.apply(
            lambda x: f"cluster_{x}" if x >= 0 else "noise"
        )

        # Map cluster labels to all events in the dataframe; path ids that are
        # absent from the metrics index end up as NaN in the new column.
        new_df = df.copy()
        new_df[self.segment_name] = new_df[path_id_col].map(cluster_series)
        new_df[self.segment_name] = new_df[self.segment_name].astype("category")

        # Update schema
        new_schema = schema.copy()
        new_schema.segment_cols.append(self.segment_name)

        return new_df, new_schema

    def _scale_features(self, features: np.ndarray) -> np.ndarray:
        """
        Scale features based on the configured method.

        Args:
            features: Raw feature matrix

        Returns:
            Scaled feature matrix (the input itself when no scaler is set)

        Raises:
            PreprocessingConfigError: If ``self.scaler`` is not a known scaler.
        """
        if self.scaler is None:
            return features
        elif self.scaler == "minmax":
            scaler = MinMaxScaler()
            return scaler.fit_transform(features)
        elif self.scaler == "std":
            scaler = StandardScaler()
            return scaler.fit_transform(features)
        else:
            raise PreprocessingConfigError(PROCESSOR_NAME, f"Unknown scaler method: {self.scaler}")

    def _cluster(self, features: np.ndarray) -> np.ndarray:
        """
        Perform clustering using the configured method.

        Args:
            features: Feature matrix after scaling (and NMF, if enabled)

        Returns:
            Array of cluster labels

        Raises:
            PreprocessingConfigError: If ``self.method`` is not a known method.
        """
        if self.method == "kmeans":
            # Fixed random_state keeps the assignment reproducible across runs.
            clusterer = KMeans(
                n_clusters=self.method_params["n_clusters"],
                random_state=42,
                n_init="auto"
            )
            return clusterer.fit_predict(features)
        elif self.method == "hdbscan":
            # Set defaults for HDBSCAN if not provided
            min_cluster_size = self.method_params.get("min_cluster_size", 5)
            cluster_selection_epsilon = self.method_params.get("cluster_selection_epsilon", 0.0)

            clusterer = HDBSCAN(
                min_cluster_size=min_cluster_size,
                cluster_selection_epsilon=cluster_selection_epsilon
            )
            return clusterer.fit_predict(features)
        else:
            raise PreprocessingConfigError(PROCESSOR_NAME, f"Unknown clustering method: {self.method}")
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
from typing import List, Tuple
|
|
2
|
+
|
|
3
|
+
import duckdb
|
|
4
|
+
import pandas as pd
|
|
5
|
+
|
|
6
|
+
from hopscotch.data_processors.data_processor import DataProcessor
|
|
7
|
+
from hopscotch.eventstream.event_type import EventTypes
|
|
8
|
+
from hopscotch.eventstream.schema import EventstreamSchema
|
|
9
|
+
from hopscotch.exceptions import PreprocessingConfigError
|
|
10
|
+
|
|
11
|
+
PROCESSOR_NAME = "add_events"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class AddEvents(DataProcessor):
    """
    Data processor that adds synthetic events to an eventstream.

    Source rows are selected in exactly one of three ways, and a copy of each
    selected row is inserted with its event name replaced by ``new_event_name``:

    * ``source_events`` - rows whose event name is in the given list
    * ``sql`` - rows returned by a user-supplied DuckDB query
    * ``churn`` - rows followed by a period of inactivity
    """

    def __init__(
        self,
        new_event_name: str,
        source_events: List[str] | None = None,
        sql: str | None = None,
        churn: dict | None = None,
    ) -> None:
        """
        Validate the configuration and store it on the instance.

        Args:
            new_event_name: Name given to the added events (non-empty string).
            source_events: Event names whose rows are duplicated.
            sql: DuckDB query selecting the rows to duplicate. The input frame
                is referenced in the query as ``eventstream``.
            churn: Dict with a required positive ``inactivity_days`` and an
                optional ``active_events`` list of event names.

        Raises:
            PreprocessingConfigError: If an argument has the wrong type or
                value, or if the number of provided modes is not exactly one.
        """
        if not isinstance(new_event_name, str) or not new_event_name:
            raise PreprocessingConfigError(PROCESSOR_NAME, "Argument 'new_event_name' must be a non-empty string.")

        # Each comparison is a bool, so the sum counts the provided modes.
        n_modes = sum([source_events is not None, sql is not None, churn is not None])
        if n_modes != 1:
            raise PreprocessingConfigError(
                PROCESSOR_NAME,
                "Exactly one of 'source_events', 'sql', or 'churn' must be provided."
            )

        if source_events is not None:
            if not isinstance(source_events, list):
                raise PreprocessingConfigError(PROCESSOR_NAME, "Argument 'source_events' must be a list.")
            if not all(isinstance(e, str) for e in source_events):
                raise PreprocessingConfigError(PROCESSOR_NAME, "All elements in 'source_events' must be strings.")

        if sql is not None and not isinstance(sql, str):
            raise PreprocessingConfigError(PROCESSOR_NAME, "Argument 'sql' must be a string.")

        if churn is not None:
            if not isinstance(churn, dict):
                raise PreprocessingConfigError(PROCESSOR_NAME, "Argument 'churn' must be a dictionary.")
            if "inactivity_days" not in churn:
                raise PreprocessingConfigError(PROCESSOR_NAME, "Argument 'churn' must contain 'inactivity_days'.")
            inactivity_days = churn["inactivity_days"]
            if not isinstance(inactivity_days, (int, float)) or inactivity_days <= 0:
                raise PreprocessingConfigError(
                    PROCESSOR_NAME, "Value 'churn.inactivity_days' must be a positive number."
                )
            active_events = churn.get("active_events")
            if active_events is not None:
                if not isinstance(active_events, list):
                    raise PreprocessingConfigError(
                        PROCESSOR_NAME, "Value 'churn.active_events' must be a list."
                    )
                if not all(isinstance(e, str) for e in active_events):
                    raise PreprocessingConfigError(
                        PROCESSOR_NAME, "All elements in 'churn.active_events' must be strings."
                    )

        self.new_event_name = new_event_name
        self.source_events = source_events
        self.sql = sql
        self.churn = churn
        super().__init__()

    def apply(
        self, df: pd.DataFrame, schema: EventstreamSchema
    ) -> Tuple[pd.DataFrame, EventstreamSchema]:
        """
        Insert the synthetic events into ``df``.

        Args:
            df: Input DataFrame with eventstream data.
            schema: EventstreamSchema with column definitions.

        Returns:
            Tuple of (DataFrame, schema). The schema is returned unchanged.
            When no source rows are selected, ``df`` itself is returned
            unchanged as well.
        """
        if self.source_events is not None:
            df_source = self._get_by_source_events(df, schema)
        elif self.sql is not None:
            df_source = self._get_by_sql(df, schema)
        else:
            df_source = self._get_by_churn(df, schema)

        if df_source.empty:
            return df, schema

        # New rows keep every column of their source row (path, timestamp,
        # ...) except the event name, event type and subindex.
        event_types = EventTypes()
        df_new = df_source.copy()
        df_new[schema.event_col] = self.new_event_name
        df_new[schema.event_type] = event_types.SYNTHETIC_EVENT.type
        df_new[schema.subindex] = event_types.SYNTHETIC_EVENT.index

        # Rows that share a path and timestamp are ordered by subindex.
        df = (
            pd.concat([df, df_new])
            .sort_values([schema.path_col, schema.timestamp, schema.subindex])
            .reset_index(drop=True)
        )

        # Re-cast after the concat so the event column is categorical again
        # and includes the new event name.
        df[schema.event_col] = df[schema.event_col].astype("category")

        return df, schema

    def _get_by_source_events(self, df: pd.DataFrame, schema: EventstreamSchema) -> pd.DataFrame:
        """
        Return a copy of the rows whose event name is in ``self.source_events``.

        An empty ``source_events`` list yields an empty frame with the same
        columns as ``df``.

        Raises:
            PreprocessingConfigError: If any requested name is not among the
                categories of the event column.
        """
        if not self.source_events:
            return df.iloc[0:0]

        # NOTE(review): relies on the event column having a categorical dtype.
        existing = set(df[schema.event_col].cat.categories.tolist())
        unknown = set(self.source_events) - existing
        if unknown:
            raise PreprocessingConfigError(
                PROCESSOR_NAME,
                f"Unknown event names in 'source_events': {sorted(unknown)}. "
                f"Available events: {sorted(existing)}."
            )

        return df[df[schema.event_col].isin(self.source_events)].copy()

    def _get_by_sql(self, df: pd.DataFrame, schema: EventstreamSchema) -> pd.DataFrame:
        """
        Return the rows selected by the user-supplied SQL query.

        Raises:
            PreprocessingConfigError: If the set of result columns differs from
                the set of columns in ``df``.
        """
        columns_old = set(df.columns)
        eventstream = df  # noqa: F841 — referenced by user SQL as "eventstream"
        result = duckdb.sql(self.sql).df()
        # Only the set of column names is compared; column order is not checked.
        if set(result.columns) != columns_old:
            raise PreprocessingConfigError(
                PROCESSOR_NAME,
                "The SQL query must return the same columns as the eventstream."
            )
        return result

    def _get_by_churn(self, df: pd.DataFrame, schema: EventstreamSchema) -> pd.DataFrame:
        """
        Return the rows that are followed by a long enough period of inactivity.

        A row qualifies when the gap between its timestamp and the next row of
        the same path (or, for the last row of a path, the maximum timestamp of
        the whole frame) exceeds ``inactivity_days`` days. When
        ``active_events`` is given, only rows with those event names are
        considered; an empty list yields an empty frame.
        """
        path_col = schema.path_col
        ts_col = schema.timestamp
        subindex_col = schema.subindex
        event_col = schema.event_col

        inactivity_days = self.churn["inactivity_days"]
        active_events = self.churn.get("active_events")

        threshold_seconds = inactivity_days * 86400

        # Filter to active events only if specified; otherwise all events count.
        # LEAD looks only within the filtered set, so the "next active event"
        # is found correctly. The overall dataset max comes from the full df.
        active_filter = ""
        if active_events is not None:
            if not active_events:
                return df.iloc[0:0]
            # Double embedded single quotes so an event name such as
            # "user's cart" stays one valid SQL string literal instead of
            # breaking (or altering) the query.
            quoted = ", ".join("'" + e.replace("'", "''") + "'" for e in active_events)
            active_filter = f"WHERE {event_col} IN ({quoted})"

        # "df" inside the query refers to the local DataFrame of that name.
        query = f"""
            WITH windowed AS (
                SELECT *,
                    LEAD({ts_col}) OVER (
                        PARTITION BY {path_col} ORDER BY {ts_col}, {subindex_col}
                    ) AS _hop_next_ts,
                    (SELECT MAX({ts_col}) FROM df) AS _hop_dataset_end
                FROM df
                {active_filter}
            )
            SELECT * EXCLUDE (_hop_next_ts, _hop_dataset_end) FROM windowed
            WHERE epoch(COALESCE(_hop_next_ts, _hop_dataset_end)) - epoch({ts_col})
                > {threshold_seconds}
        """
        return duckdb.sql(query).df()
|