hopscotch-analytics 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. hopscotch_analytics-0.3.0/.github/workflows/release.yml +36 -0
  2. hopscotch_analytics-0.3.0/CHANGELOG.md +130 -0
  3. hopscotch_analytics-0.3.0/PKG-INFO +20 -0
  4. hopscotch_analytics-0.3.0/pyproject.toml +42 -0
  5. hopscotch_analytics-0.3.0/src/hopscotch/__init__.py +4 -0
  6. hopscotch_analytics-0.3.0/src/hopscotch/data_processors/__init__.py +20 -0
  7. hopscotch_analytics-0.3.0/src/hopscotch/data_processors/add_clusters.py +238 -0
  8. hopscotch_analytics-0.3.0/src/hopscotch/data_processors/add_events.py +159 -0
  9. hopscotch_analytics-0.3.0/src/hopscotch/data_processors/add_segment.py +158 -0
  10. hopscotch_analytics-0.3.0/src/hopscotch/data_processors/add_start_end_events.py +45 -0
  11. hopscotch_analytics-0.3.0/src/hopscotch/data_processors/collapse_events.py +579 -0
  12. hopscotch_analytics-0.3.0/src/hopscotch/data_processors/data_processor.py +8 -0
  13. hopscotch_analytics-0.3.0/src/hopscotch/data_processors/drop_segment.py +26 -0
  14. hopscotch_analytics-0.3.0/src/hopscotch/data_processors/edit_events.py +64 -0
  15. hopscotch_analytics-0.3.0/src/hopscotch/data_processors/filter_events.py +111 -0
  16. hopscotch_analytics-0.3.0/src/hopscotch/data_processors/filter_paths.py +269 -0
  17. hopscotch_analytics-0.3.0/src/hopscotch/data_processors/rename_events.py +47 -0
  18. hopscotch_analytics-0.3.0/src/hopscotch/data_processors/sample_paths.py +70 -0
  19. hopscotch_analytics-0.3.0/src/hopscotch/data_processors/split_sessions.py +124 -0
  20. hopscotch_analytics-0.3.0/src/hopscotch/data_processors/truncate_paths.py +104 -0
  21. hopscotch_analytics-0.3.0/src/hopscotch/data_processors/url_events.py +348 -0
  22. hopscotch_analytics-0.3.0/src/hopscotch/datasets/__init__.py +0 -0
  23. hopscotch_analytics-0.3.0/src/hopscotch/datasets/ecom.csv.gz +0 -0
  24. hopscotch_analytics-0.3.0/src/hopscotch/datasets/ecom.py +359 -0
  25. hopscotch_analytics-0.3.0/src/hopscotch/eventstream/__init__.py +4 -0
  26. hopscotch_analytics-0.3.0/src/hopscotch/eventstream/event_type.py +34 -0
  27. hopscotch_analytics-0.3.0/src/hopscotch/eventstream/eventstream.py +496 -0
  28. hopscotch_analytics-0.3.0/src/hopscotch/eventstream/schema.py +46 -0
  29. hopscotch_analytics-0.3.0/src/hopscotch/exceptions.py +69 -0
  30. hopscotch_analytics-0.3.0/src/hopscotch/metrics/__init__.py +0 -0
  31. hopscotch_analytics-0.3.0/src/hopscotch/metrics/metric_builder.py +767 -0
  32. hopscotch_analytics-0.3.0/src/hopscotch/static/widget.css +1 -0
  33. hopscotch_analytics-0.3.0/src/hopscotch/tools/__init__.py +7 -0
  34. hopscotch_analytics-0.3.0/src/hopscotch/tools/cluster_analysis.py +280 -0
  35. hopscotch_analytics-0.3.0/src/hopscotch/tools/funnel.py +84 -0
  36. hopscotch_analytics-0.3.0/src/hopscotch/tools/segment_overview.py +602 -0
  37. hopscotch_analytics-0.3.0/src/hopscotch/tools/step_matrix.py +312 -0
  38. hopscotch_analytics-0.3.0/src/hopscotch/tools/transition_matrix.py +153 -0
  39. hopscotch_analytics-0.3.0/src/hopscotch/tools/types.py +14 -0
  40. hopscotch_analytics-0.3.0/src/hopscotch/utils/__init__.py +0 -0
  41. hopscotch_analytics-0.3.0/src/hopscotch/utils/sequences.py +35 -0
  42. hopscotch_analytics-0.3.0/src/hopscotch/utils/session_detection.py +326 -0
  43. hopscotch_analytics-0.3.0/src/hopscotch/widgets/__init__.py +3 -0
  44. hopscotch_analytics-0.3.0/src/hopscotch/widgets/_esm.py +34 -0
  45. hopscotch_analytics-0.3.0/src/hopscotch/widgets/cluster_analysis.py +288 -0
  46. hopscotch_analytics-0.3.0/src/hopscotch/widgets/funnel.py +217 -0
  47. hopscotch_analytics-0.3.0/src/hopscotch/widgets/segment_overview.py +225 -0
  48. hopscotch_analytics-0.3.0/src/hopscotch/widgets/step_sankey.py +296 -0
  49. hopscotch_analytics-0.3.0/src/hopscotch/widgets/transition_graph.py +312 -0
  50. hopscotch_analytics-0.3.0/tests/__init__.py +0 -0
  51. hopscotch_analytics-0.3.0/tests/conftest.py +20 -0
  52. hopscotch_analytics-0.3.0/tests/data_processors/__init__.py +0 -0
  53. hopscotch_analytics-0.3.0/tests/data_processors/add_clusters_test.py +387 -0
  54. hopscotch_analytics-0.3.0/tests/data_processors/add_events_test.py +279 -0
  55. hopscotch_analytics-0.3.0/tests/data_processors/add_start_end_events_test.py +83 -0
  56. hopscotch_analytics-0.3.0/tests/data_processors/collapse_events_test.py +697 -0
  57. hopscotch_analytics-0.3.0/tests/data_processors/edit_events_test.py +115 -0
  58. hopscotch_analytics-0.3.0/tests/data_processors/filter_events_test.py +187 -0
  59. hopscotch_analytics-0.3.0/tests/data_processors/filter_paths_test.py +278 -0
  60. hopscotch_analytics-0.3.0/tests/data_processors/rename_events_test.py +98 -0
  61. hopscotch_analytics-0.3.0/tests/data_processors/sample_paths.py +89 -0
  62. hopscotch_analytics-0.3.0/tests/data_processors/segments_test.py +196 -0
  63. hopscotch_analytics-0.3.0/tests/data_processors/split_sessions_test.py +254 -0
  64. hopscotch_analytics-0.3.0/tests/data_processors/truncate_paths_test.py +173 -0
  65. hopscotch_analytics-0.3.0/tests/data_processors/url_events_test.py +681 -0
  66. hopscotch_analytics-0.3.0/tests/test_eventstream.py +101 -0
  67. hopscotch_analytics-0.3.0/tests/tools/__init__.py +0 -0
  68. hopscotch_analytics-0.3.0/tests/tools/cluster_analysis_test.py +292 -0
  69. hopscotch_analytics-0.3.0/tests/tools/funnel_test.py +120 -0
  70. hopscotch_analytics-0.3.0/tests/tools/segment_overview_test.py +1117 -0
  71. hopscotch_analytics-0.3.0/tests/tools/step_matrix_input.csv +21 -0
  72. hopscotch_analytics-0.3.0/tests/tools/step_matrix_test.py +300 -0
  73. hopscotch_analytics-0.3.0/tests/tools/transition_matrix_input.csv +19 -0
  74. hopscotch_analytics-0.3.0/tests/tools/transition_matrix_test.py +296 -0
  75. hopscotch_analytics-0.3.0/uv.lock +2814 -0
@@ -0,0 +1,36 @@
1
+ name: Release
2
+
3
+ # Triggered when hopscotch-lib-dev pushes a version tag here
4
+ # after syncing the filtered Python history.
5
+ # Can also be triggered manually via workflow_dispatch.
6
+
7
+ on:
8
+ push:
9
+ tags:
10
+ - 'v*'
11
+ workflow_dispatch:
12
+
13
+ jobs:
14
+ publish:
15
+ if: ${{ !contains(github.ref, 'rc') }}
16
+ runs-on: ubuntu-latest
17
+ environment: pypi
18
+ permissions:
19
+ id-token: write # required for OIDC Trusted Publisher
20
+
21
+ steps:
22
+ - uses: actions/checkout@v5
23
+
24
+ - name: Set up Python 3.13
25
+ uses: actions/setup-python@v6
26
+ with:
27
+ python-version: '3.13'
28
+
29
+ - name: Install uv
30
+ run: pip install uv
31
+
32
+ - name: Build package
33
+ run: uv build
34
+
35
+ - name: Publish to PyPI
36
+ run: uv publish
@@ -0,0 +1,130 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+ Format: [Keep a Changelog](https://keepachangelog.com/en/1.0.0/)
5
+
6
+ ## [Unreleased]
7
+
8
+ ## [0.3.0] - 2026-06-21
9
+
10
+ ### Added — Cluster Analysis
11
+ - **`ClusterAnalysis` tool**: KMeans and HDBSCAN clustering with optional NMF
12
+ decomposition; supports `n_clusters` / `nmf_k` as single value, range
13
+ (`"3-8"`), or comma-separated list (`"3,5,7"`) for silhouette grid search
14
+ - **`ClusterAnalysisWidget`** (anywidget): interactive sidebar with Configure
15
+ Features, Configure Metrics, clustering method, scaler, N Clusters, NMF
16
+ Decomposition fields; heatmap overview, silhouette chart, NMF H/W matrices
17
+ - **Silhouette grid search**: selects best parameter set automatically and
18
+ highlights it in the bar chart
19
+
20
+ ### Added — Segment Overview
21
+ - **`segment_levels` traitlet**: segment column values available in
22
+ Configure Metrics `belongs_to` fields without extra round-trips
23
+ - **`active_days` event selector** in Configure Metrics and Configure
24
+ Features: optional filter to count only days with specific events
25
+
26
+ ### Added — shared metric form components
27
+ - `metric_config_row.tsx`: unified `MetricRow`, `MultiSelect`, `SingleSelect`,
28
+ `InfoTip`, `validateMetricCfg` shared across Configure Features (CA),
29
+ Configure Metrics (CA), and Configure Metrics (SO) — ~650 lines removed
30
+ - `showAgg=false` for Configure Features (no aggregation dropdown)
31
+ - All three forms default new rows to `event_count` with all events selected
32
+ - InfoTip on Cluster Analysis sidebar Metrics label explaining overview metrics
33
+
34
+ ### Changed
35
+ - Package renamed from `hopscotch` to `hopscotch-analytics` on PyPI
36
+ - Aggregation global dropdown removed from Cluster Analysis sidebar;
37
+ aggregation is now configured per-metric in Configure Metrics
38
+
39
+ ### Fixed — Segment Overview
40
+ - `segLevels` ReferenceError crash in metrics overlay
41
+ - `SidebarSH` inline component causing remount and focus loss on
42
+ Configure Metrics open
43
+
44
+ ### Fixed — Cluster Analysis
45
+ - Default `n_clusters=""` causing KMeans to fail on widget init; now
46
+ defaults to `"3-8"`
47
+ - NMF K text field losing focus on each keystroke
48
+
49
+ ## [0.2.2] - 2026-06-19
50
+
51
+ ### Fixed
52
+ - `_get_esm()`: fetch widget.js source and return as string to anywidget
53
+ (anywidget requires inline JS string, not a URL)
54
+
55
+ ## [0.2.1] - 2026-06-19
56
+
57
+ ### Fixed
58
+ - `_get_esm()`: download widget.js to local cache instead of passing URL
59
+ to anywidget directly
60
+
61
+ ## [0.2.0] - 2026-06-19
62
+
63
+
64
+ ### Added — Transition Graph
65
+ - **Diff mode node coloring**: nodes tinted red/blue based on event share
66
+ difference between groups; inner circle radius shrinks proportionally
67
+ to diff magnitude (zero diff = normal donut, max diff = solid circle)
68
+ - **Node hover tooltip** in diff mode: shows share breakdown per group
69
+ with subtitle "share of event in group"
70
+ - **Colored dots** (● blue / ● red) next to segment value dropdowns so
71
+ it's immediately clear which color maps to which group
72
+ - **Fit to canvas** button (expand icon, top-right toolbar) replaces the
73
+ old Reset Layout button; calls `cy.fit` with 12 px padding
74
+ - **Auto-fit on load**: graph fits the canvas automatically on every
75
+ render (both auto-layout and saved-positions paths)
76
+ - **Edge Weight Type** label (renamed from "Value Type") in settings sidebar
77
+
78
+ ### Fixed — Transition Graph
79
+ - `value1Label` / `value2Label` falling back to "group1"/"group2" when
80
+ diff values are boolean `false` — now uses `String()` coercion
81
+ - Diff tooltip label now correctly shows the actual segment value
82
+
83
+ ### Added — Step Sankey
84
+ - **`step_window` parameter**: frontend-only slider in sidebar Visibility
85
+ Settings (Radix single-thumb slider, amber track); defaults to 3;
86
+ limits displayed columns per anchor without recomputing backend data
87
+ - **Event Count filter**: sidebar RangeSlider now populated with real
88
+ `COUNT(DISTINCT path_id)` per event from Python backend;
89
+ `_populationCustomized` flag prevents reset on recompute
90
+ - **Pattern edit menu** matching platform UX: path_start/path_end show
91
+ insert panel directly; internal events show Insert Before / Insert
92
+ After / Replace / Delete first-level menu with Event / Gap+Event tabs
93
+ - `path_start` and `path_end` included in all event dropdowns
94
+ - Diff tooltip label fix (same boolean coercion fix as transition graph)
95
+
96
+ ### Fixed — Step Sankey
97
+ - **Gap+Event for `path_end`** now inserts `event->.*->path_end`
98
+ (own matrix block) instead of collapsing with existing wildcard
99
+ - **`PatternStore`**: `addWithTrailingGap` method; default display
100
+ pattern (`path_start->.*->path_end`) no longer leaks into edit state
101
+ when no real pattern is set — adding events from path_start no longer
102
+ appends path_end
103
+ - **Column filtering**: end-aligned and start-aligned matrices now
104
+ correctly limit variable columns on both sides by `stepWindow`
105
+ - **`path_start`/`path_end`** excluded from regular column event nodes
106
+ (appear only as fixed anchors)
107
+ - **`_find_center_position`**: regex split handles leading/trailing `.*`
108
+ wildcards — fixes `PatternNoMatchError` for `.*->path_end` patterns
109
+ - **Diff mode** with `.*->path_end`: `original_pattern` passed to sub-
110
+ calls so `skip_first_matrix` applies correctly in each group
111
+ - **`path_start->.*->path_end`** default no longer shown as a third
112
+ matrix block when user adds a central event via GUI
113
+
114
+ ### Fixed — Python widget
115
+ - **Save initial widget state** on creation with `object_name` —
116
+ state is now written synchronously so a subsequent load always
117
+ restores `path_pattern`, `diff`, etc.
118
+
119
+ ## [0.1.0] - 2026-06-17
120
+
121
+ ### Added
122
+ - `Eventstream` class with DuckDB-powered step matrix and transition matrix
123
+ - `StepSankeyWidget` (anywidget): interactive step sankey with pattern editing,
124
+ diff mode, segment filters, `max_steps`, persistence via `object_name`
125
+ - `TransitionGraphWidget` (anywidget): Cytoscape.js force-directed graph,
126
+ edge weight types, diff mode, event count filter, node/edge color picker
127
+ - Supabase OTP authentication paywall baked into bundle at build time
128
+ - GitHub Actions release pipeline: builds JS bundle, creates GitHub Release
129
+ with `widget.js` asset, syncs Python history to public repo via
130
+ `git filter-repo`
@@ -0,0 +1,20 @@
1
+ Metadata-Version: 2.4
2
+ Name: hopscotch-analytics
3
+ Version: 0.3.0
4
+ Summary: Clickstream analysis library for Jupyter / Colab / VS Code
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: anywidget>=0.9
7
+ Requires-Dist: duckdb>=1.1
8
+ Requires-Dist: matplotlib>=3.11.0
9
+ Requires-Dist: numpy>=1.26
10
+ Requires-Dist: pandas>=2.2
11
+ Requires-Dist: pyarrow>=14.0
12
+ Requires-Dist: scikit-learn>=1.3
13
+ Requires-Dist: scipy>=1.11
14
+ Requires-Dist: traitlets>=5.0
15
+ Provides-Extra: dev
16
+ Requires-Dist: ipykernel; extra == 'dev'
17
+ Requires-Dist: jupyterlab; extra == 'dev'
18
+ Requires-Dist: notebook; extra == 'dev'
19
+ Requires-Dist: pytest-cov; extra == 'dev'
20
+ Requires-Dist: pytest>=8.0; extra == 'dev'
@@ -0,0 +1,42 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "hopscotch-analytics"
7
+ version = "0.3.0"
8
+ description = "Clickstream analysis library for Jupyter / Colab / VS Code"
9
+ requires-python = ">=3.11"
10
+ dependencies = [
11
+ "pandas>=2.2",
12
+ "duckdb>=1.1",
13
+ "anywidget>=0.9",
14
+ "traitlets>=5.0",
15
+ "pyarrow>=14.0",
16
+ "scikit-learn>=1.3",
17
+ "numpy>=1.26",
18
+ "scipy>=1.11",
19
+ "matplotlib>=3.11.0",
20
+ ]
21
+
22
+ [project.optional-dependencies]
23
+ dev = [
24
+ "pytest>=8.0",
25
+ "pytest-cov",
26
+ "ipykernel",
27
+ "jupyterlab",
28
+ "notebook",
29
+ ]
30
+
31
+ [tool.hatch.build.targets.wheel]
32
+ packages = ["src/hopscotch"]
33
+
34
+ [tool.pytest.ini_options]
35
+ testpaths = ["tests"]
36
+
37
+ [dependency-groups]
38
+ dev = [
39
+ "ipykernel>=7.3.0",
40
+ "jupyterlab>=4.5.8",
41
+ "pytest>=9.1.0",
42
+ ]
@@ -0,0 +1,4 @@
1
+ from hopscotch.eventstream.eventstream import Eventstream
2
+ from hopscotch.eventstream.schema import EventstreamSchema
3
+
4
+ __all__ = ["Eventstream", "EventstreamSchema"]
@@ -0,0 +1,20 @@
1
+ from hopscotch.data_processors.add_clusters import AddClusters
2
+ from hopscotch.data_processors.add_events import AddEvents
3
+ from hopscotch.data_processors.add_segment import AddSegment
4
+ from hopscotch.data_processors.add_start_end_events import AddStartEndEvents
5
+ from hopscotch.data_processors.collapse_events import CollapseEvents
6
+ from hopscotch.data_processors.drop_segment import DropSegment
7
+ from hopscotch.data_processors.edit_events import EditEvents
8
+ from hopscotch.data_processors.filter_events import FilterEvents
9
+ from hopscotch.data_processors.filter_paths import FilterPaths
10
+ from hopscotch.data_processors.rename_events import RenameEvents
11
+ from hopscotch.data_processors.sample_paths import SamplePaths
12
+ from hopscotch.data_processors.split_sessions import SplitSessions
13
+ from hopscotch.data_processors.truncate_paths import TruncatePaths
14
+ from hopscotch.data_processors.url_events import UrlEvents
15
+
16
+ __all__ = [
17
+ "AddClusters", "AddEvents", "AddSegment", "AddStartEndEvents", "CollapseEvents", "DropSegment",
18
+ "EditEvents", "FilterEvents", "FilterPaths", "RenameEvents", "SamplePaths",
19
+ "SplitSessions", "TruncatePaths", "UrlEvents",
20
+ ]
@@ -0,0 +1,238 @@
1
+ """
2
+ AddClusters - data processor for clustering trajectories based on metrics.
3
+
4
+ Clusters trajectories and adds a new segment column with cluster labels.
5
+ Uses MetricBuilder for feature calculation.
6
+ """
7
+
8
+ from typing import Any, Dict, List, Literal, Tuple
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ from sklearn.cluster import HDBSCAN, KMeans
13
+ from sklearn.decomposition import NMF
14
+ from sklearn.preprocessing import MinMaxScaler, StandardScaler
15
+
16
+ from hopscotch.data_processors.data_processor import DataProcessor
17
+ from hopscotch.eventstream.schema import EventstreamSchema
18
+ from hopscotch.exceptions import PreprocessingConfigError
19
+ from hopscotch.metrics.metric_builder import MetricBuilder
20
+
21
+ PROCESSOR_NAME = "add_clusters"
22
+
23
+ T_ClusteringMethod = Literal["kmeans", "hdbscan"]
24
+ T_Scaler = Literal["minmax", "std"] | None
25
+
26
+
27
class AddClusters(DataProcessor):
    """
    Data processor that clusters trajectories based on computed metrics.

    Adds a new segment column containing cluster labels for each trajectory.

    Attributes:
        segment_name: Name of the new segment column
        metrics: List of metric configurations (MetricBuilder format)
        method: Clustering method ("kmeans" or "hdbscan")
        scaler: Feature scaler method ("minmax", "std", or None)
        method_params: Parameters for the clustering algorithm
        nmf_k: Number of NMF components the scaled features are reduced to
            before clustering (None skips the NMF step)
        eventstream: Eventstream instance (needed for MetricBuilder)
        path_id_col: Path ID column name (optional)
        event_col: Event column name (optional)
    """

    segment_name: str
    metrics: List[Dict[str, Any]]
    method: T_ClusteringMethod
    scaler: T_Scaler
    method_params: Dict[str, Any]
    nmf_k: int | None
    eventstream: Any
    path_id_col: str | None
    event_col: str | None

    def __init__(
        self,
        eventstream: Any,
        segment_name: str,
        metrics: List[Dict[str, Any]],
        method: T_ClusteringMethod = "kmeans",
        scaler: T_Scaler = "minmax",
        n_clusters: int | None = None,
        min_cluster_size: int | None = None,
        cluster_selection_epsilon: float | None = None,
        nmf_k: int | None = None,
        path_id_col: str | None = None,
        event_col: str | None = None,
    ) -> None:
        """
        Initialize AddClusters processor.

        Args:
            eventstream: Eventstream instance for metric calculation
            segment_name: Name of the new segment column with cluster labels
            metrics: List of metric configurations for MetricBuilder.
                Each config is a dict with 'metric' and optional 'metric_args'.
                Example: [
                    {"metric": "length"},
                    {"metric": "duration"},
                    {"metric": "event_count", "metric_args": {"event": "purchase"}}
                ]
            method: Clustering method - "kmeans" or "hdbscan"
            scaler: Feature scaler - "minmax", "std", or None.
                Default is "minmax".
            n_clusters: Number of clusters for k-means (required for kmeans)
            min_cluster_size: Minimum cluster size for HDBSCAN
            cluster_selection_epsilon: Cluster selection epsilon for HDBSCAN
            nmf_k: Number of NMF components to reduce the scaled features to
                before clustering. None (default) skips the NMF step.
            path_id_col: Path ID column (if None, taken from schema)
            event_col: Event column (if None, taken from schema)

        Raises:
            PreprocessingConfigError: If `method` is "kmeans" and `n_clusters`
                is None, or if `method` is not "kmeans" / "hdbscan".
        """
        self.eventstream = eventstream
        self.segment_name = segment_name
        self.metrics = metrics
        self.method = method
        self.scaler = scaler
        self.nmf_k = nmf_k
        self.path_id_col = path_id_col
        self.event_col = event_col

        # Validate method and collect method-specific parameters
        if method == "kmeans":
            if n_clusters is None:
                raise PreprocessingConfigError(PROCESSOR_NAME, "n_clusters is required for kmeans method")
            self.method_params = {"n_clusters": n_clusters}
        elif method == "hdbscan":
            # Only explicitly provided values are stored; defaults are
            # applied later in _cluster().
            self.method_params = {}
            if min_cluster_size is not None:
                self.method_params["min_cluster_size"] = min_cluster_size
            if cluster_selection_epsilon is not None:
                self.method_params["cluster_selection_epsilon"] = cluster_selection_epsilon
        else:
            raise PreprocessingConfigError(PROCESSOR_NAME, f"Unknown clustering method: {method}. Use 'kmeans' or 'hdbscan'.")

        super().__init__()

    def apply(
        self, df: pd.DataFrame, schema: EventstreamSchema
    ) -> Tuple[pd.DataFrame, EventstreamSchema]:
        """
        Apply clustering to trajectories and add cluster labels as a new segment.

        Args:
            df: Input DataFrame with eventstream data
            schema: EventstreamSchema with column definitions

        Returns:
            Tuple of (new_df, new_schema) with added cluster segment

        Raises:
            PreprocessingConfigError: If `segment_name` is already a column of
                `df`, if no metrics / feature columns were produced, or if NMF
                was requested but the scaled features contain negative values.
        """
        # Validate segment name doesn't exist
        if self.segment_name in df.columns:
            if self.segment_name in schema.segment_cols:
                raise PreprocessingConfigError(PROCESSOR_NAME, f"Segment '{self.segment_name}' already exists.")
            else:
                raise PreprocessingConfigError(
                    PROCESSOR_NAME,
                    f"Name '{self.segment_name}' is already reserved in the eventstream."
                )

        path_id_col = self.path_id_col or schema.path_col
        event_col = self.event_col or schema.event_col  # noqa: F841 - reserved for future use

        # Build metrics using MetricBuilder
        metric_builder = MetricBuilder(self.eventstream)
        metrics_df = metric_builder.build_metrics(self.metrics, path_id_col)

        if metrics_df.empty:
            raise PreprocessingConfigError(PROCESSOR_NAME, "No metrics were computed. Check metric configurations.")

        # Handle NaN values - fill with 0 for clustering
        features = metrics_df.fillna(0).values

        if features.shape[1] == 0:
            raise PreprocessingConfigError(PROCESSOR_NAME, "No feature columns were generated from metrics.")

        # Apply scaling
        features_scaled = self._scale_features(features)

        # Apply NMF dimensionality reduction if requested
        if self.nmf_k is not None:
            # NMF only factorizes non-negative matrices. Standard scaling (or
            # unscaled features with negative values) would otherwise fail
            # inside scikit-learn with a generic ValueError, so report it as a
            # configuration problem of this processor instead.
            if (features_scaled < 0).any():
                raise PreprocessingConfigError(
                    PROCESSOR_NAME,
                    "NMF decomposition requires non-negative features. "
                    "Use scaler='minmax' or leave 'nmf_k' unset."
                )
            nmf = NMF(n_components=self.nmf_k, random_state=42)
            features_scaled = nmf.fit_transform(features_scaled)

        # Perform clustering
        cluster_labels = self._cluster(features_scaled)

        # Create cluster labels Series indexed like metrics_df
        # (presumably one row per path id; verify against MetricBuilder)
        cluster_series = pd.Series(
            cluster_labels,
            index=metrics_df.index,
            name=self.segment_name
        )

        # Convert labels to string for categorical representation
        # HDBSCAN uses -1 for noise points, handle specially
        cluster_series = cluster_series.apply(
            lambda x: f"cluster_{x}" if x >= 0 else "noise"
        )

        # Map cluster labels to all events in the dataframe; path ids that are
        # absent from cluster_series end up as missing values.
        new_df = df.copy()
        new_df[self.segment_name] = new_df[path_id_col].map(cluster_series)
        new_df[self.segment_name] = new_df[self.segment_name].astype("category")

        # Update schema
        # NOTE(review): assumes schema.copy() returns an independent
        # segment_cols list; if the copy is shallow this append also mutates
        # the input schema - confirm against EventstreamSchema.copy().
        new_schema = schema.copy()
        new_schema.segment_cols.append(self.segment_name)

        return new_df, new_schema

    def _scale_features(self, features: np.ndarray) -> np.ndarray:
        """
        Scale features based on the configured method.

        Args:
            features: Raw feature matrix

        Returns:
            Scaled feature matrix (the input itself when no scaler is set)

        Raises:
            PreprocessingConfigError: If `self.scaler` is not "minmax", "std"
                or None.
        """
        if self.scaler is None:
            return features
        elif self.scaler == "minmax":
            scaler = MinMaxScaler()
            return scaler.fit_transform(features)
        elif self.scaler == "std":
            scaler = StandardScaler()
            return scaler.fit_transform(features)
        else:
            raise PreprocessingConfigError(PROCESSOR_NAME, f"Unknown scaler method: {self.scaler}")

    def _cluster(self, features: np.ndarray) -> np.ndarray:
        """
        Perform clustering using the configured method.

        Args:
            features: Feature matrix after scaling and optional NMF reduction

        Returns:
            Array of cluster labels

        Raises:
            PreprocessingConfigError: If `self.method` is not "kmeans" or
                "hdbscan".
        """
        if self.method == "kmeans":
            # Fixed random_state keeps k-means results reproducible.
            clusterer = KMeans(
                n_clusters=self.method_params["n_clusters"],
                random_state=42,
                n_init="auto"
            )
            return clusterer.fit_predict(features)
        elif self.method == "hdbscan":
            # Set defaults for HDBSCAN if not provided
            min_cluster_size = self.method_params.get("min_cluster_size", 5)
            cluster_selection_epsilon = self.method_params.get("cluster_selection_epsilon", 0.0)

            clusterer = HDBSCAN(
                min_cluster_size=min_cluster_size,
                cluster_selection_epsilon=cluster_selection_epsilon
            )
            return clusterer.fit_predict(features)
        else:
            raise PreprocessingConfigError(PROCESSOR_NAME, f"Unknown clustering method: {self.method}")
@@ -0,0 +1,159 @@
1
+ from typing import List, Tuple
2
+
3
+ import duckdb
4
+ import pandas as pd
5
+
6
+ from hopscotch.data_processors.data_processor import DataProcessor
7
+ from hopscotch.eventstream.event_type import EventTypes
8
+ from hopscotch.eventstream.schema import EventstreamSchema
9
+ from hopscotch.exceptions import PreprocessingConfigError
10
+
11
+ PROCESSOR_NAME = "add_events"
12
+
13
+
14
class AddEvents(DataProcessor):
    """
    Data processor that adds synthetic events to the eventstream.

    Rows are selected by exactly one of three modes; each selected row is
    copied, renamed to `new_event_name`, marked as a synthetic event and merged
    back into the eventstream.

    Attributes:
        new_event_name: Event name written to the added rows
        source_events: Existing event names whose rows are copied (mode 1)
        sql: SQL query selecting rows from a table named "eventstream" (mode 2)
        churn: Dict with 'inactivity_days' and optional 'active_events';
            selects rows followed by a long enough gap (mode 3)
    """

    def __init__(
        self,
        new_event_name: str,
        source_events: List[str] | None = None,
        sql: str | None = None,
        churn: dict | None = None,
    ) -> None:
        """
        Initialize AddEvents processor.

        Args:
            new_event_name: Non-empty name of the event to add
            source_events: List of existing event names to copy rows from
            sql: SQL query; must return the same columns as the eventstream
            churn: Dict with a positive numeric 'inactivity_days' and an
                optional 'active_events' list of event names

        Raises:
            PreprocessingConfigError: If an argument has the wrong type or
                value, or if not exactly one of `source_events`, `sql`,
                `churn` is provided.
        """
        if not isinstance(new_event_name, str) or not new_event_name:
            raise PreprocessingConfigError(PROCESSOR_NAME, "Argument 'new_event_name' must be a non-empty string.")

        n_modes = sum([source_events is not None, sql is not None, churn is not None])
        if n_modes != 1:
            raise PreprocessingConfigError(
                PROCESSOR_NAME,
                "Exactly one of 'source_events', 'sql', or 'churn' must be provided."
            )

        if source_events is not None:
            if not isinstance(source_events, list):
                raise PreprocessingConfigError(PROCESSOR_NAME, "Argument 'source_events' must be a list.")
            if not all(isinstance(e, str) for e in source_events):
                raise PreprocessingConfigError(PROCESSOR_NAME, "All elements in 'source_events' must be strings.")

        if sql is not None and not isinstance(sql, str):
            raise PreprocessingConfigError(PROCESSOR_NAME, "Argument 'sql' must be a string.")

        if churn is not None:
            if not isinstance(churn, dict):
                raise PreprocessingConfigError(PROCESSOR_NAME, "Argument 'churn' must be a dictionary.")
            if "inactivity_days" not in churn:
                raise PreprocessingConfigError(PROCESSOR_NAME, "Argument 'churn' must contain 'inactivity_days'.")
            inactivity_days = churn["inactivity_days"]
            # bool is a subclass of int, so True would otherwise be accepted
            # silently as "1 day".
            if (
                isinstance(inactivity_days, bool)
                or not isinstance(inactivity_days, (int, float))
                or inactivity_days <= 0
            ):
                raise PreprocessingConfigError(
                    PROCESSOR_NAME, "Value 'churn.inactivity_days' must be a positive number."
                )
            active_events = churn.get("active_events")
            if active_events is not None:
                if not isinstance(active_events, list):
                    raise PreprocessingConfigError(
                        PROCESSOR_NAME, "Value 'churn.active_events' must be a list."
                    )
                if not all(isinstance(e, str) for e in active_events):
                    raise PreprocessingConfigError(
                        PROCESSOR_NAME, "All elements in 'churn.active_events' must be strings."
                    )

        self.new_event_name = new_event_name
        self.source_events = source_events
        self.sql = sql
        self.churn = churn
        super().__init__()

    def apply(
        self, df: pd.DataFrame, schema: EventstreamSchema
    ) -> Tuple[pd.DataFrame, EventstreamSchema]:
        """
        Add the synthetic events to the eventstream.

        Args:
            df: Input DataFrame with eventstream data
            schema: EventstreamSchema with column definitions

        Returns:
            Tuple of (df, schema). When no source rows are selected the inputs
            are returned unchanged; otherwise the DataFrame contains the added
            rows, is sorted by path, timestamp and subindex, and has a
            categorical event column. The schema is returned as passed in.
        """
        if self.source_events is not None:
            df_source = self._get_by_source_events(df, schema)
        elif self.sql is not None:
            df_source = self._get_by_sql(df, schema)
        else:
            df_source = self._get_by_churn(df, schema)

        if df_source.empty:
            return df, schema

        # Every selected row becomes one synthetic event; all columns other
        # than event name, event type and subindex are kept from the source row.
        event_types = EventTypes()
        df_new = df_source.copy()
        df_new[schema.event_col] = self.new_event_name
        df_new[schema.event_type] = event_types.SYNTHETIC_EVENT.type
        df_new[schema.subindex] = event_types.SYNTHETIC_EVENT.index

        df = (
            pd.concat([df, df_new])
            .sort_values([schema.path_col, schema.timestamp, schema.subindex])
            .reset_index(drop=True)
        )

        # Re-cast after the concat so the event column is categorical again
        # and includes the new event name.
        df[schema.event_col] = df[schema.event_col].astype("category")

        return df, schema

    @staticmethod
    def _sql_literal(value: str) -> str:
        """Return `value` as a single-quoted SQL string literal with quotes escaped."""
        return "'" + value.replace("'", "''") + "'"

    @staticmethod
    def _sql_identifier(name: str) -> str:
        """Return `name` as a double-quoted SQL identifier with quotes escaped."""
        return '"' + name.replace('"', '""') + '"'

    def _get_by_source_events(self, df: pd.DataFrame, schema: EventstreamSchema) -> pd.DataFrame:
        """
        Select the rows whose event name is listed in `source_events`.

        Raises:
            PreprocessingConfigError: If a requested event name is not known
                to the event column.
        """
        if not self.source_events:
            return df.iloc[0:0]

        events = df[schema.event_col]
        if isinstance(events.dtype, pd.CategoricalDtype):
            existing = set(events.cat.categories.tolist())
        else:
            # The `.cat` accessor only exists on categorical columns; fall
            # back to the distinct values for any other dtype.
            existing = set(events.dropna().unique().tolist())

        unknown = set(self.source_events) - existing
        if unknown:
            raise PreprocessingConfigError(
                PROCESSOR_NAME,
                f"Unknown event names in 'source_events': {sorted(unknown)}. "
                f"Available events: {sorted(existing)}."
            )

        return df[events.isin(self.source_events)].copy()

    def _get_by_sql(self, df: pd.DataFrame, schema: EventstreamSchema) -> pd.DataFrame:
        """
        Select rows with the user-provided SQL query.

        The query is executed as written by the caller of this library and can
        refer to the input DataFrame as "eventstream".

        Raises:
            PreprocessingConfigError: If the result's column set differs from
                the eventstream's column set.
        """
        columns_old = set(df.columns)
        eventstream = df  # noqa: F841 — referenced by user SQL as "eventstream"
        result = duckdb.sql(self.sql).df()
        if set(result.columns) != columns_old:
            raise PreprocessingConfigError(
                PROCESSOR_NAME,
                "The SQL query must return the same columns as the eventstream."
            )
        return result

    def _get_by_churn(self, df: pd.DataFrame, schema: EventstreamSchema) -> pd.DataFrame:
        """
        Select rows that are followed by an inactivity gap.

        A row qualifies when the time to the next row of the same path (or, if
        there is none, to the latest timestamp in the whole dataset) exceeds
        `inactivity_days` days. With 'active_events' set, only rows of those
        events are considered, both as candidates and as "next" rows.
        """
        # Quote identifiers so column names that are not plain SQL identifiers
        # (spaces, reserved words, ...) do not break the query.
        path_col = self._sql_identifier(schema.path_col)
        ts_col = self._sql_identifier(schema.timestamp)
        subindex_col = self._sql_identifier(schema.subindex)
        event_col = self._sql_identifier(schema.event_col)

        inactivity_days = self.churn["inactivity_days"]
        active_events = self.churn.get("active_events")

        threshold_seconds = inactivity_days * 86400

        # Filter to active events only if specified; otherwise all events count.
        # LEAD looks only within the filtered set, so the "next active event"
        # is found correctly. The overall dataset max comes from the full df.
        active_filter = ""
        if active_events is not None:
            if not active_events:
                return df.iloc[0:0]
            # Escape embedded single quotes so event names such as
            # "user's cart" yield valid SQL instead of a syntax error.
            quoted = ", ".join(self._sql_literal(e) for e in active_events)
            active_filter = f"WHERE {event_col} IN ({quoted})"

        # `df` is referenced by name inside the query.
        query = f"""
            WITH windowed AS (
                SELECT *,
                    LEAD({ts_col}) OVER (
                        PARTITION BY {path_col} ORDER BY {ts_col}, {subindex_col}
                    ) AS _hop_next_ts,
                    (SELECT MAX({ts_col}) FROM df) AS _hop_dataset_end
                FROM df
                {active_filter}
            )
            SELECT * EXCLUDE (_hop_next_ts, _hop_dataset_end) FROM windowed
            WHERE epoch(COALESCE(_hop_next_ts, _hop_dataset_end)) - epoch({ts_col})
                > {threshold_seconds}
        """
        return duckdb.sql(query).df()