pegasource 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pegasource-0.2.0 → pegasource-0.2.2}/PKG-INFO +5 -11
- {pegasource-0.2.0 → pegasource-0.2.2}/README.md +4 -10
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/__init__.py +1 -1
- pegasource-0.2.2/pegasource/dataset_clustering/__init__.py +61 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/cluster_hardware.py +141 -19
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/server.py +10 -0
- pegasource-0.2.2/pegasource/path_estimation/__init__.py +45 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/evaluate.py +147 -39
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/generate_synthetic_datasets.py +0 -2
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/graph_utils.py +16 -1
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/io.py +103 -12
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/metrics.py +48 -3
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/run_method_evaluation.py +0 -2
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/types.py +22 -3
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource.egg-info/PKG-INFO +5 -11
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource.egg-info/SOURCES.txt +0 -1
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource.egg-info/entry_points.txt +0 -1
- {pegasource-0.2.0 → pegasource-0.2.2}/pyproject.toml +1 -2
- pegasource-0.2.0/pegasource/dataset_clustering/__init__.py +0 -44
- pegasource-0.2.0/pegasource/path_estimation/__init__.py +0 -20
- pegasource-0.2.0/pegasource/path_estimation/__main__.py +0 -65
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/data/README.md +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/data/israel_roads.pkl.gz +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/_paths.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/cluster_viz/app.js +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/cluster_viz/data.json +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/cluster_viz/index.html +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/cluster_viz/styles.css +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/custom_devices.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/data/clustered_output.csv +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/data/dirty_hardware_data_20k.csv +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/data/dirty_hardware_data_40k.csv +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/data/dirty_hardware_data_hebrew_40k.csv +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/data/hardware_data_40_percent_dirty.csv +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/data/hardware_data_hebrew_40_percent_dirty.csv +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/data/test_output.csv +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/dataset_generator.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/fetch_ifixit_devices.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/ifixit_devices.json +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/prepare_viz_data.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/geo/__init__.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/geo/_rv_graph_builder.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/geo/_rv_preprocessing.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/geo/_rv_visualization.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/geo/distance.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/geo/israel_roads.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/geo/projection.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/geo/vectorizer.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/cache/london_walk_bbox.graphml +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/filters/__init__.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/filters/ekf.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/filters/kf.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/filters/particle.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/filters/ukf.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/geo_reference.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/gnn/__init__.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/gnn/estimate.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/graph_stitch.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/hmm_map_match.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/london_street_path.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/nn/__init__.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/nn/dataset.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/nn/lstm_model.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/nn/transformer_model.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/plotting_utils.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/sample_observations.csv +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/sample_true_path.csv +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/viz.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/pcap/__init__.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/pcap/patterns.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/pcap/reader.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/pcap/report.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/pcap/stats.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/timeseries/__init__.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/timeseries/auto.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/timeseries/models.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/timeseries/utils.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource.egg-info/dependency_links.txt +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource.egg-info/requires.txt +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/pegasource.egg-info/top_level.txt +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/setup.cfg +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/tests/test_geo.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/tests/test_path_estimation.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/tests/test_pcap.py +0 -0
- {pegasource-0.2.0 → pegasource-0.2.2}/tests/test_timeseries.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pegasource
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Offline-capable toolkit: PCAP, geo, time-series, clustering, and path estimation
|
|
5
5
|
Author: Josef Berman
|
|
6
6
|
License: MIT
|
|
@@ -240,13 +240,10 @@ python -m pegasource.dataset_clustering.prepare_viz_data
|
|
|
240
240
|
|
|
241
241
|
### Path estimation (trajectories)
|
|
242
242
|
|
|
243
|
-
Evaluate reconstruction methods on observation CSVs vs a 1 Hz ground-truth path (requires **`[path_estimation]`** for full
|
|
243
|
+
Evaluate reconstruction methods on observation CSVs vs a 1 Hz ground-truth path (requires **`[path_estimation]`** for the full estimator stack):
|
|
244
244
|
|
|
245
245
|
```bash
|
|
246
246
|
pip install -e ".[path_estimation]"
|
|
247
|
-
pegasource-path-estimation --observations run_1_observations.csv --true-path run_1_true_path.csv --output-dir out/
|
|
248
|
-
# or
|
|
249
|
-
python -m pegasource.path_estimation --observations ... --true-path ... --output-dir out/
|
|
250
247
|
```
|
|
251
248
|
|
|
252
249
|
```python
|
|
@@ -283,12 +280,9 @@ results = estimate_paths_only(
|
|
|
283
280
|
# results["kf"] is an EstimationResult with times_s, east_m, north_m (or {"error": "..."})
|
|
284
281
|
```
|
|
285
282
|
|
|
286
|
-
Synthetic data
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
python -m pegasource.path_estimation.generate_synthetic_datasets --help
|
|
290
|
-
python -m pegasource.path_estimation.run_method_evaluation --help # writes ./method_eval/
|
|
291
|
-
```
|
|
283
|
+
Synthetic data (``generate_dataset`` in ``pegasource.path_estimation.generate_synthetic_datasets``)
|
|
284
|
+
and batch method comparison (``main`` in ``pegasource.path_estimation.run_method_evaluation``) are
|
|
285
|
+
invoked from your own Python code, not as installed console commands.
|
|
292
286
|
|
|
293
287
|
---
|
|
294
288
|
|
|
@@ -189,13 +189,10 @@ python -m pegasource.dataset_clustering.prepare_viz_data
|
|
|
189
189
|
|
|
190
190
|
### Path estimation (trajectories)
|
|
191
191
|
|
|
192
|
-
Evaluate reconstruction methods on observation CSVs vs a 1 Hz ground-truth path (requires **`[path_estimation]`** for full
|
|
192
|
+
Evaluate reconstruction methods on observation CSVs vs a 1 Hz ground-truth path (requires **`[path_estimation]`** for the full estimator stack):
|
|
193
193
|
|
|
194
194
|
```bash
|
|
195
195
|
pip install -e ".[path_estimation]"
|
|
196
|
-
pegasource-path-estimation --observations run_1_observations.csv --true-path run_1_true_path.csv --output-dir out/
|
|
197
|
-
# or
|
|
198
|
-
python -m pegasource.path_estimation --observations ... --true-path ... --output-dir out/
|
|
199
196
|
```
|
|
200
197
|
|
|
201
198
|
```python
|
|
@@ -232,12 +229,9 @@ results = estimate_paths_only(
|
|
|
232
229
|
# results["kf"] is an EstimationResult with times_s, east_m, north_m (or {"error": "..."})
|
|
233
230
|
```
|
|
234
231
|
|
|
235
|
-
Synthetic data
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
python -m pegasource.path_estimation.generate_synthetic_datasets --help
|
|
239
|
-
python -m pegasource.path_estimation.run_method_evaluation --help # writes ./method_eval/
|
|
240
|
-
```
|
|
232
|
+
Synthetic data (``generate_dataset`` in ``pegasource.path_estimation.generate_synthetic_datasets``)
|
|
233
|
+
and batch method comparison (``main`` in ``pegasource.path_estimation.run_method_evaluation``) are
|
|
234
|
+
invoked from your own Python code, not as installed console commands.
|
|
241
235
|
|
|
242
236
|
---
|
|
243
237
|
|
|
@@ -10,5 +10,5 @@ pegasource.dataset_clustering Hardware inventory embedding + clustering (opti
|
|
|
10
10
|
pegasource.path_estimation GPS/cellular path reconstruction (filters, graph, NN; optional torch stack)
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
|
-
__version__ = "0.2.0"
|
|
13
|
+
__version__ = "0.2.2"
|
|
14
14
|
__all__ = ["pcap", "geo", "timeseries", "dataset_clustering", "path_estimation"]
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Hardware inventory clustering: text embeddings + hierarchical clustering.
|
|
3
|
+
|
|
4
|
+
Reads a CSV of string cells, concatenates each row into one text line, encodes with a
|
|
5
|
+
`sentence-transformers <https://www.sbert.net/>`_ model (cosine-normalized vectors),
|
|
6
|
+
then clusters with scikit-learn: **agglomerative** clustering for smaller tables, or a
|
|
7
|
+
**two-phase** MiniBatchKMeans + agglomerative pipeline when row count exceeds
|
|
8
|
+
`DIRECT_CLUSTERING_LIMIT`.
|
|
9
|
+
|
|
10
|
+
**Public API** (import from ``pegasource.dataset_clustering``):
|
|
11
|
+
|
|
12
|
+
- `load_data` — load CSV as strings
|
|
13
|
+
- `build_text_representations` — one string per row
|
|
14
|
+
- `generate_embeddings` — requires optional ``[clustering]`` (sentence-transformers)
|
|
15
|
+
- `cluster_direct` / `cluster_twophase` / `cluster_embeddings` — label vectors
|
|
16
|
+
- `print_summary` — CLI-style cluster size table
|
|
17
|
+
|
|
18
|
+
**Optional tooling**: FastAPI server (``server``), iFixit catalog fetch, Excel export,
|
|
19
|
+
bundled ``cluster_viz`` UI. Install extras with ``pip install -e ".[clustering]"``.
|
|
20
|
+
|
|
21
|
+
Examples
|
|
22
|
+
--------
|
|
23
|
+
>>> # After: pip install -e ".[clustering]"
|
|
24
|
+
>>> from pegasource.dataset_clustering import (
|
|
25
|
+
... load_data,
|
|
26
|
+
... build_text_representations,
|
|
27
|
+
... generate_embeddings,
|
|
28
|
+
... cluster_embeddings,
|
|
29
|
+
... )
|
|
30
|
+
>>> # df = load_data("data.csv")
|
|
31
|
+
>>> # emb = generate_embeddings(build_text_representations(df), "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", 256)
|
|
32
|
+
>>> # labels = cluster_embeddings(emb, 0.3)
|
|
33
|
+
|
|
34
|
+
CLI and server::
|
|
35
|
+
|
|
36
|
+
python -m pegasource.dataset_clustering.cluster_hardware --help
|
|
37
|
+
python -m pegasource.dataset_clustering.server
|
|
38
|
+
pegasource-cluster-viz --port 8001
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
from .cluster_hardware import (
|
|
42
|
+
DIRECT_CLUSTERING_LIMIT,
|
|
43
|
+
build_text_representations,
|
|
44
|
+
cluster_direct,
|
|
45
|
+
cluster_embeddings,
|
|
46
|
+
cluster_twophase,
|
|
47
|
+
generate_embeddings,
|
|
48
|
+
load_data,
|
|
49
|
+
print_summary,
|
|
50
|
+
)
|
|
51
|
+
|
|
52
|
+
__all__ = [
|
|
53
|
+
"DIRECT_CLUSTERING_LIMIT",
|
|
54
|
+
"build_text_representations",
|
|
55
|
+
"cluster_direct",
|
|
56
|
+
"cluster_embeddings",
|
|
57
|
+
"cluster_twophase",
|
|
58
|
+
"generate_embeddings",
|
|
59
|
+
"load_data",
|
|
60
|
+
"print_summary",
|
|
61
|
+
]
|
|
@@ -1,14 +1,15 @@
|
|
|
1
1
|
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
|
|
2
|
+
r"""Cluster dirty hardware records using embeddings + agglomerative clustering.
|
|
3
|
+
|
|
4
|
+
This module is the core library behind ``python -m pegasource.dataset_clustering.cluster_hardware``
|
|
5
|
+
and the symbols re-exported from :mod:`pegasource.dataset_clustering`.
|
|
4
6
|
|
|
5
|
-
For
|
|
6
|
-
|
|
7
|
-
Phase 2: Run agglomerative clustering within each chunk, then merge labels
|
|
7
|
+
**Scaling:** For :math:`n >` ``DIRECT_CLUSTERING_LIMIT``, use :func:`cluster_twophase`
|
|
8
|
+
(via :func:`cluster_embeddings`) so memory stays bounded.
|
|
8
9
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
10
|
+
See Also
|
|
11
|
+
--------
|
|
12
|
+
pegasource.dataset_clustering.server : interactive FastAPI + static UI
|
|
12
13
|
"""
|
|
13
14
|
|
|
14
15
|
import argparse
|
|
@@ -23,7 +24,8 @@ import numpy as np
|
|
|
23
24
|
from sklearn.cluster import AgglomerativeClustering, MiniBatchKMeans
|
|
24
25
|
|
|
25
26
|
|
|
26
|
-
|
|
27
|
+
#: Maximum row count for single-pass agglomerative clustering. Above this,
|
|
28
|
+
#: :func:`cluster_embeddings` calls :func:`cluster_twophase` automatically.
|
|
27
29
|
DIRECT_CLUSTERING_LIMIT = 15_000
|
|
28
30
|
|
|
29
31
|
_DEFAULT_INPUT = PACKAGE_DIR / "data" / "dirty_hardware_data_40k.csv"
|
|
@@ -71,7 +73,24 @@ def parse_args():
|
|
|
71
73
|
|
|
72
74
|
|
|
73
75
|
def load_data(path, sample_size=None):
|
|
74
|
-
"""Load CSV
|
|
76
|
+
"""Load a CSV with all columns as strings (missing values become empty strings).
|
|
77
|
+
|
|
78
|
+
Parameters
|
|
79
|
+
----------
|
|
80
|
+
path : str or os.PathLike
|
|
81
|
+
Input ``.csv`` path.
|
|
82
|
+
sample_size : int, optional
|
|
83
|
+
If set, only the first ``sample_size`` rows are kept (for quick tests).
|
|
84
|
+
|
|
85
|
+
Returns
|
|
86
|
+
-------
|
|
87
|
+
pandas.DataFrame
|
|
88
|
+
All columns have ``dtype`` str; ``NaN`` replaced with ``""``.
|
|
89
|
+
|
|
90
|
+
Notes
|
|
91
|
+
-----
|
|
92
|
+
Prints progress to stdout (row × column counts).
|
|
93
|
+
"""
|
|
75
94
|
print(f"📂 Loading data from {path}...")
|
|
76
95
|
df = pd.read_csv(path, dtype=str).fillna("")
|
|
77
96
|
if sample_size is not None:
|
|
@@ -81,7 +100,24 @@ def load_data(path, sample_size=None):
|
|
|
81
100
|
|
|
82
101
|
|
|
83
102
|
def build_text_representations(df):
|
|
84
|
-
"""
|
|
103
|
+
"""Join every column of each row into one text line for embedding.
|
|
104
|
+
|
|
105
|
+
Columns are separated by ``" | "`` (space-pipe-space), preserving column order.
|
|
106
|
+
|
|
107
|
+
Parameters
|
|
108
|
+
----------
|
|
109
|
+
df : pandas.DataFrame
|
|
110
|
+
Input table (typically all strings).
|
|
111
|
+
|
|
112
|
+
Returns
|
|
113
|
+
-------
|
|
114
|
+
list of str
|
|
115
|
+
Length ``len(df)``; one concatenated string per row.
|
|
116
|
+
|
|
117
|
+
Notes
|
|
118
|
+
-----
|
|
119
|
+
Prints the first row’s text to stdout as a sanity check.
|
|
120
|
+
"""
|
|
85
121
|
print("📝 Building text representations...")
|
|
86
122
|
texts = df.apply(lambda row: " | ".join(row.values), axis=1).tolist()
|
|
87
123
|
print(f" Example: {texts[0]!r}")
|
|
@@ -89,7 +125,32 @@ def build_text_representations(df):
|
|
|
89
125
|
|
|
90
126
|
|
|
91
127
|
def generate_embeddings(texts, model_name, batch_size, device="cpu"):
|
|
92
|
-
"""
|
|
128
|
+
"""Encode texts with SentenceTransformer (L2-normalized for cosine distance).
|
|
129
|
+
|
|
130
|
+
Requires the optional **sentence-transformers** dependency
|
|
131
|
+
(``pip install -e ".[clustering]"``).
|
|
132
|
+
|
|
133
|
+
Parameters
|
|
134
|
+
----------
|
|
135
|
+
texts : list of str
|
|
136
|
+
One sentence/line per inventory row.
|
|
137
|
+
model_name : str
|
|
138
|
+
Hugging Face model id (e.g. ``sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2``)
|
|
139
|
+
or path to a local model directory.
|
|
140
|
+
batch_size : int
|
|
141
|
+
``model.encode`` batch size (GPU memory vs throughput).
|
|
142
|
+
device : str, default ``"cpu"``
|
|
143
|
+
``"cpu"``, ``"cuda"``, or another torch device string.
|
|
144
|
+
|
|
145
|
+
Returns
|
|
146
|
+
-------
|
|
147
|
+
numpy.ndarray
|
|
148
|
+
Shape ``(n_rows, dim)``; rows are unit vectors if the backend normalizes.
|
|
149
|
+
|
|
150
|
+
Notes
|
|
151
|
+
-----
|
|
152
|
+
Local paths are resolved with :func:`os.path.abspath` when the path exists on disk.
|
|
153
|
+
"""
|
|
93
154
|
from sentence_transformers import SentenceTransformer
|
|
94
155
|
|
|
95
156
|
# Resolve local paths to absolute (avoids cwd issues)
|
|
@@ -115,7 +176,22 @@ def generate_embeddings(texts, model_name, batch_size, device="cpu"):
|
|
|
115
176
|
|
|
116
177
|
|
|
117
178
|
def cluster_direct(embeddings, threshold):
|
|
118
|
-
"""
|
|
179
|
+
"""Agglomerative clustering with cosine distance and average linkage.
|
|
180
|
+
|
|
181
|
+
Parameters
|
|
182
|
+
----------
|
|
183
|
+
embeddings : numpy.ndarray
|
|
184
|
+
Shape ``(n, dim)``; rows are typically L2-normalized.
|
|
185
|
+
threshold : float
|
|
186
|
+
``distance_threshold`` in ``AgglomerativeClustering`` (cosine metric). Smaller
|
|
187
|
+
values yield **more** clusters (tighter merges); larger values yield **fewer**
|
|
188
|
+
clusters. Typical range ~0.2–0.5 for normalized vectors.
|
|
189
|
+
|
|
190
|
+
Returns
|
|
191
|
+
-------
|
|
192
|
+
numpy.ndarray
|
|
193
|
+
Integer labels, shape ``(n,)``, not necessarily starting at 0 or contiguous.
|
|
194
|
+
"""
|
|
119
195
|
print(f"🔗 Clustering {len(embeddings):,} rows with distance_threshold={threshold}...")
|
|
120
196
|
t0 = time.time()
|
|
121
197
|
clustering = AgglomerativeClustering(
|
|
@@ -132,10 +208,24 @@ def cluster_direct(embeddings, threshold):
|
|
|
132
208
|
|
|
133
209
|
|
|
134
210
|
def cluster_twophase(embeddings, threshold, n_pre_clusters=None):
|
|
135
|
-
"""
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
211
|
+
"""Scale to large *n*: KMeans pre-groups, then agglomerative within each group.
|
|
212
|
+
|
|
213
|
+
Global cluster ids are formed by offsetting each group’s local labels so they do not
|
|
214
|
+
collide across groups.
|
|
215
|
+
|
|
216
|
+
Parameters
|
|
217
|
+
----------
|
|
218
|
+
embeddings : numpy.ndarray
|
|
219
|
+
Shape ``(n, dim)``.
|
|
220
|
+
threshold : float
|
|
221
|
+
Same meaning as in :func:`cluster_direct` (cosine / average linkage).
|
|
222
|
+
n_pre_clusters : int, optional
|
|
223
|
+
Number of KMeans buckets. If omitted, uses ``max(10, n // 5000)`` capped at ``n``.
|
|
224
|
+
|
|
225
|
+
Returns
|
|
226
|
+
-------
|
|
227
|
+
numpy.ndarray
|
|
228
|
+
Integer labels, shape ``(n,)``.
|
|
139
229
|
"""
|
|
140
230
|
n = len(embeddings)
|
|
141
231
|
if n_pre_clusters is None:
|
|
@@ -190,7 +280,25 @@ def cluster_twophase(embeddings, threshold, n_pre_clusters=None):
|
|
|
190
280
|
|
|
191
281
|
|
|
192
282
|
def cluster_embeddings(embeddings, threshold, n_pre_clusters=None):
|
|
193
|
-
"""
|
|
283
|
+
"""Dispatch to :func:`cluster_direct` or :func:`cluster_twophase` by row count.
|
|
284
|
+
|
|
285
|
+
If ``len(embeddings) <= DIRECT_CLUSTERING_LIMIT`` uses direct agglomerative
|
|
286
|
+
clustering; otherwise uses the two-phase pipeline.
|
|
287
|
+
|
|
288
|
+
Parameters
|
|
289
|
+
----------
|
|
290
|
+
embeddings : numpy.ndarray
|
|
291
|
+
Row embedding matrix.
|
|
292
|
+
threshold : float
|
|
293
|
+
Cosine distance threshold for agglomerative steps.
|
|
294
|
+
n_pre_clusters : int, optional
|
|
295
|
+
Forwarded to :func:`cluster_twophase` when the two-phase path is used.
|
|
296
|
+
|
|
297
|
+
Returns
|
|
298
|
+
-------
|
|
299
|
+
numpy.ndarray
|
|
300
|
+
Cluster label per row.
|
|
301
|
+
"""
|
|
194
302
|
n = len(embeddings)
|
|
195
303
|
if n <= DIRECT_CLUSTERING_LIMIT:
|
|
196
304
|
return cluster_direct(embeddings, threshold)
|
|
@@ -200,7 +308,21 @@ def cluster_embeddings(embeddings, threshold, n_pre_clusters=None):
|
|
|
200
308
|
|
|
201
309
|
|
|
202
310
|
def print_summary(df, max_clusters=30, samples_per_cluster=3):
|
|
203
|
-
"""Print
|
|
311
|
+
"""Print cluster sizes and sample rows to stdout (requires ``cluster_id`` column).
|
|
312
|
+
|
|
313
|
+
Parameters
|
|
314
|
+
----------
|
|
315
|
+
df : pandas.DataFrame
|
|
316
|
+
Must include a ``cluster_id`` column (e.g. after assigning labels).
|
|
317
|
+
max_clusters : int, default 30
|
|
318
|
+
Maximum number of largest clusters to list.
|
|
319
|
+
samples_per_cluster : int, default 3
|
|
320
|
+
Rows shown per cluster (excluding ``cluster_id`` in the display).
|
|
321
|
+
|
|
322
|
+
Returns
|
|
323
|
+
-------
|
|
324
|
+
None
|
|
325
|
+
"""
|
|
204
326
|
cluster_sizes = df.groupby("cluster_id").size().sort_values(ascending=False)
|
|
205
327
|
n_clusters = len(cluster_sizes)
|
|
206
328
|
|
|
@@ -1,3 +1,13 @@
|
|
|
1
|
+
"""Interactive clustering UI: FastAPI backend + static ``cluster_viz`` frontend.
|
|
2
|
+
|
|
3
|
+
Endpoints (JSON under ``/api``) load or upload a CSV, track embedding status, re-cluster
|
|
4
|
+
by threshold, and export Excel. Static files are served from the package
|
|
5
|
+
``cluster_viz/`` directory. Run with ``python -m pegasource.dataset_clustering.server``
|
|
6
|
+
or the ``pegasource-cluster-viz`` console script (requires ``[clustering]`` extras).
|
|
7
|
+
|
|
8
|
+
The module exposes ``app`` (FastAPI instance) for ASGI servers and ``run()`` for CLI.
|
|
9
|
+
"""
|
|
10
|
+
|
|
1
11
|
import argparse
|
|
2
12
|
import asyncio
|
|
3
13
|
import json
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Trajectory reconstruction from asynchronous observations (local ENU meters).
|
|
2
|
+
|
|
3
|
+
Re-exports :func:`run_evaluation`, :func:`evaluate_path_estimation`, and
|
|
4
|
+
:func:`estimate_paths_only` from :mod:`pegasource.path_estimation.evaluate`, same pattern
|
|
5
|
+
as :mod:`pegasource.pcap` / :mod:`pegasource.geo`. Requires the optional **path_estimation**
|
|
6
|
+
dependencies (filterpy, torch, …); install with ``pip install -e ".[path_estimation]"``.
|
|
7
|
+
|
|
8
|
+
Because Python loads this file when importing any submodule (e.g.
|
|
9
|
+
``import pegasource.path_estimation.metrics``), loading the package also pulls in
|
|
10
|
+
``evaluate`` and those dependencies. Previously, lazy :func:`__getattr__` deferred that
|
|
11
|
+
until you accessed ``run_evaluation``; explicit imports match the rest of pegasource at
|
|
12
|
+
the cost of a heavier first import.
|
|
13
|
+
|
|
14
|
+
Quick start::
|
|
15
|
+
|
|
16
|
+
from pegasource.path_estimation import run_evaluation
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
run_evaluation(
|
|
20
|
+
Path("obs.csv"),
|
|
21
|
+
Path("true.csv"),
|
|
22
|
+
Path("out"),
|
|
23
|
+
methods=["kf", "dijkstra"],
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
Examples
|
|
27
|
+
--------
|
|
28
|
+
>>> from pegasource.path_estimation.metrics import compute_all_metrics
|
|
29
|
+
>>> import numpy as np
|
|
30
|
+
>>> t = np.zeros((10, 2))
|
|
31
|
+
>>> out = compute_all_metrics(t, t + 0.1)
|
|
32
|
+
>>> assert "rmse_m" in out
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
from .evaluate import (
|
|
36
|
+
estimate_paths_only,
|
|
37
|
+
evaluate_path_estimation,
|
|
38
|
+
run_evaluation,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
__all__ = [
|
|
42
|
+
"estimate_paths_only",
|
|
43
|
+
"evaluate_path_estimation",
|
|
44
|
+
"run_evaluation",
|
|
45
|
+
]
|
|
@@ -1,4 +1,21 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""High-level evaluation API: run estimators, compare to ground truth, save artifacts.
|
|
2
|
+
|
|
3
|
+
**Method registry** (``METHOD_REGISTRY``): graph stitchers ``dijkstra`` / ``astar``,
|
|
4
|
+
``hmm``, Kalman-family filters ``kf`` / ``ekf`` / ``ukf`` / ``particle``, ``gnn``, plus
|
|
5
|
+
``lstm`` / ``transformer`` implemented in this file. Supervised keys ``lstm``,
|
|
6
|
+
``transformer``, ``gnn`` require a real ``*_true_path.csv`` (see ``METHODS_REQUIRING_GROUND_TRUTH``).
|
|
7
|
+
|
|
8
|
+
**Typical entry points**
|
|
9
|
+
|
|
10
|
+
- :func:`run_evaluation` — loads CSVs and the default projected OSM graph from
|
|
11
|
+
:func:`~pegasource.path_estimation.graph_utils.get_projected_graph`, writes
|
|
12
|
+
``metrics.json`` and figures.
|
|
13
|
+
- :func:`evaluate_path_estimation` — same pipeline but you pass the ``road_graph``.
|
|
14
|
+
- :func:`estimate_paths_only` — no true path file; cannot run supervised methods.
|
|
15
|
+
|
|
16
|
+
Output dicts map method name → either metric scores (see :func:`pegasource.path_estimation.metrics.compute_all_metrics`)
|
|
17
|
+
plus ``meta``, or ``{"error": "..."}`` if the estimator raised.
|
|
18
|
+
"""
|
|
2
19
|
|
|
3
20
|
from __future__ import annotations
|
|
4
21
|
|
|
@@ -85,6 +102,18 @@ def _estimate_transformer(
|
|
|
85
102
|
|
|
86
103
|
|
|
87
104
|
def torch_device(name: Optional[str] = None):
|
|
105
|
+
"""Return a ``torch.device``, defaulting to CUDA when available.
|
|
106
|
+
|
|
107
|
+
Parameters
|
|
108
|
+
----------
|
|
109
|
+
name : str, optional
|
|
110
|
+
If given (e.g. ``"cuda"``, ``"cpu"``), that device is returned. If ``None``,
|
|
111
|
+
uses CUDA when :func:`torch.cuda.is_available` else CPU.
|
|
112
|
+
|
|
113
|
+
Returns
|
|
114
|
+
-------
|
|
115
|
+
torch.device
|
|
116
|
+
"""
|
|
88
117
|
import torch as _torch
|
|
89
118
|
|
|
90
119
|
if name:
|
|
@@ -199,23 +228,43 @@ def estimate_paths_only(
|
|
|
199
228
|
) -> Dict[str, Any]:
|
|
200
229
|
"""Estimate paths from observations only (no ground-truth CSV).
|
|
201
230
|
|
|
202
|
-
Builds
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
231
|
+
Builds a **stub** ground-truth frame with :func:`pegasource.path_estimation.io.stub_true_path_from_observations`
|
|
232
|
+
so filter/graph code paths see a consistent time axis; **no real positions** exist,
|
|
233
|
+
so metrics against ``true_x``/``true_y`` are meaningless. Do **not** use for
|
|
234
|
+
``lstm``, ``transformer``, or ``gnn``.
|
|
235
|
+
|
|
236
|
+
Parameters
|
|
237
|
+
----------
|
|
238
|
+
observations_csv : pathlib.Path
|
|
239
|
+
``*_observations.csv`` with ``timestamp_s`` and source columns (see IO loaders).
|
|
240
|
+
road_graph : networkx.MultiDiGraph or compatible
|
|
241
|
+
Projected street graph for map-matching methods; ignored by pure filters.
|
|
242
|
+
methods : list of str
|
|
243
|
+
Lower-case names (e.g. ``["kf","dijkstra","hmm"]``). Raises if any of
|
|
244
|
+
``lstm``, ``transformer``, ``gnn`` appear.
|
|
245
|
+
output_hz : float, default 1.0
|
|
246
|
+
Stub timeline sampling rate (Hz).
|
|
247
|
+
output_dir : pathlib.Path, optional
|
|
248
|
+
If set and ``plot`` is True, creates ``output_dir/figures/<method>_path_enu.png``.
|
|
249
|
+
plot : bool, default False
|
|
250
|
+
If True and ``output_dir`` is set, writes ENU overlays **without** a true path polyline.
|
|
251
|
+
plot_map : bool, default False
|
|
252
|
+
Must stay False (no real lon/lat); raises ``ValueError`` otherwise.
|
|
253
|
+
device : str, optional
|
|
254
|
+
Passed to neural estimators (unused for filter-only methods).
|
|
255
|
+
seed : int, default 0
|
|
256
|
+
RNG seed for stochastic methods.
|
|
257
|
+
|
|
258
|
+
Returns
|
|
259
|
+
-------
|
|
260
|
+
dict
|
|
261
|
+
Keys are method names. Values are :class:`~pegasource.path_estimation.types.EstimationResult`
|
|
262
|
+
or ``{"error": "<message>"}``.
|
|
263
|
+
|
|
264
|
+
Raises
|
|
265
|
+
------
|
|
266
|
+
ValueError
|
|
267
|
+
If ``plot_map`` is True, or if supervised method names are requested.
|
|
219
268
|
"""
|
|
220
269
|
names = [m.strip().lower() for m in methods]
|
|
221
270
|
forbidden = sorted({m for m in names if m in METHODS_REQUIRING_GROUND_TRUTH})
|
|
@@ -261,26 +310,39 @@ def evaluate_path_estimation(
|
|
|
261
310
|
device: Optional[str] = None,
|
|
262
311
|
seed: int = 0,
|
|
263
312
|
) -> Dict[str, Dict]:
|
|
264
|
-
"""
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
``
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
313
|
+
"""Load CSVs and run estimators with a **caller-supplied** projected road graph.
|
|
314
|
+
|
|
315
|
+
Graph-based methods (``dijkstra``, ``astar``, ``hmm``, ``gnn``) use ``road_graph``;
|
|
316
|
+
filters (``kf``, ``ekf``, ``ukf``, ``particle``) ignore it. Neural sequence models
|
|
317
|
+
train against ``true_path_csv``.
|
|
318
|
+
|
|
319
|
+
Parameters
|
|
320
|
+
----------
|
|
321
|
+
observations_csv : pathlib.Path
|
|
322
|
+
Event table: must include ``timestamp_s`` and columns per ``source_type``.
|
|
323
|
+
true_path_csv : pathlib.Path
|
|
324
|
+
1 Hz (or regular) ground truth: ``timestamp_s``, ``true_x``, ``true_y``, etc.
|
|
325
|
+
road_graph : networkx.MultiDiGraph
|
|
326
|
+
Projected OSM-style graph (``x``, ``y`` node attrs, ``crs`` on ``G.graph``).
|
|
327
|
+
methods : list of str
|
|
328
|
+
Subset of registered method names; unknown names cause errors inside :func:`_run_methods`.
|
|
329
|
+
output_dir : pathlib.Path, optional
|
|
330
|
+
If set, writes ``metrics.json`` and, when ``plot``/``plot_map``, PNGs under ``figures/``.
|
|
331
|
+
plot : bool, default False
|
|
332
|
+
Save ENU matplotlib overlays (estimate vs true + observations).
|
|
333
|
+
plot_map : bool, default False
|
|
334
|
+
Save Web-Mercator basemap figures (may download tiles; slower).
|
|
335
|
+
device : str, optional
|
|
336
|
+
Torch device for ``lstm``, ``transformer``, ``gnn``.
|
|
337
|
+
seed : int, default 0
|
|
338
|
+
Base seed for RNGs inside stochastic estimators.
|
|
339
|
+
|
|
340
|
+
Returns
|
|
341
|
+
-------
|
|
342
|
+
dict[str, dict]
|
|
343
|
+
Each key is a method name. On success, value is a flat dict of metrics from
|
|
344
|
+
:func:`pegasource.path_estimation.metrics.compute_all_metrics` plus a ``meta`` field;
|
|
345
|
+
on failure, ``{"error": "<message>"}``.
|
|
284
346
|
"""
|
|
285
347
|
obs_df = load_observations_csv(observations_csv)
|
|
286
348
|
true_df = load_true_path_csv(true_path_csv)
|
|
@@ -309,7 +371,40 @@ def run_evaluation(
|
|
|
309
371
|
device: Optional[str] = None,
|
|
310
372
|
seed: int = 0,
|
|
311
373
|
) -> Dict[str, Dict]:
|
|
312
|
-
"""
|
|
374
|
+
"""Convenience wrapper: load CSVs, obtain default graph, evaluate, write ``output_dir``.
|
|
375
|
+
|
|
376
|
+
Calls :func:`pegasource.path_estimation.graph_utils.get_projected_graph` for the same
|
|
377
|
+
walkable OSM graph used by synthetic data generation (cached under the package).
|
|
378
|
+
|
|
379
|
+
Parameters
|
|
380
|
+
----------
|
|
381
|
+
observations_csv : pathlib.Path
|
|
382
|
+
See :func:`evaluate_path_estimation`.
|
|
383
|
+
true_path_csv : pathlib.Path
|
|
384
|
+
See :func:`evaluate_path_estimation`.
|
|
385
|
+
output_dir : pathlib.Path
|
|
386
|
+
Created if needed; receives ``metrics.json`` and ``figures/`` when plotting.
|
|
387
|
+
methods : list of str, optional
|
|
388
|
+
If ``None``, uses ``list(METHOD_REGISTRY.keys())`` (dijkstra, astar, hmm, kf, ekf,
|
|
389
|
+
ukf, particle, gnn). Pass explicit names to add ``lstm`` or ``transformer``.
|
|
390
|
+
plot : bool, default True
|
|
391
|
+
Write ENU figures to ``output_dir/figures``.
|
|
392
|
+
plot_map : bool, default False
|
|
393
|
+
If True, also writes map tiles figures (requires network for contextily).
|
|
394
|
+
device : str, optional
|
|
395
|
+
Torch device for neural estimators.
|
|
396
|
+
seed : int, default 0
|
|
397
|
+
RNG seed forwarded to estimators.
|
|
398
|
+
|
|
399
|
+
Returns
|
|
400
|
+
-------
|
|
401
|
+
dict[str, dict]
|
|
402
|
+
Same structure as :func:`evaluate_path_estimation`.
|
|
403
|
+
|
|
404
|
+
See Also
|
|
405
|
+
--------
|
|
406
|
+
evaluate_path_estimation : supply your own ``road_graph``.
|
|
407
|
+
"""
|
|
313
408
|
obs_df = load_observations_csv(observations_csv)
|
|
314
409
|
true_df = load_true_path_csv(true_path_csv)
|
|
315
410
|
G = get_projected_graph()
|
|
@@ -330,6 +425,19 @@ def run_evaluation(
|
|
|
330
425
|
|
|
331
426
|
|
|
332
427
|
def print_summary(summary: Dict[str, Dict]) -> None:
|
|
428
|
+
"""Print one line per method: RMSE/MAE or an error string (CLI helper).
|
|
429
|
+
|
|
430
|
+
Parameters
|
|
431
|
+
----------
|
|
432
|
+
summary : dict[str, dict]
|
|
433
|
+
Mapping produced by :func:`run_evaluation` / :func:`evaluate_path_estimation`
|
|
434
|
+
(or compatible metric dicts).
|
|
435
|
+
|
|
436
|
+
Returns
|
|
437
|
+
-------
|
|
438
|
+
None
|
|
439
|
+
Writes to stdout only.
|
|
440
|
+
"""
|
|
333
441
|
for k, v in summary.items():
|
|
334
442
|
if "error" in v:
|
|
335
443
|
print(f"{k}: ERROR — {v['error']}")
|