pegasource 0.2.0__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {pegasource-0.2.0 → pegasource-0.2.2}/PKG-INFO +5 -11
  2. {pegasource-0.2.0 → pegasource-0.2.2}/README.md +4 -10
  3. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/__init__.py +1 -1
  4. pegasource-0.2.2/pegasource/dataset_clustering/__init__.py +61 -0
  5. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/cluster_hardware.py +141 -19
  6. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/server.py +10 -0
  7. pegasource-0.2.2/pegasource/path_estimation/__init__.py +45 -0
  8. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/evaluate.py +147 -39
  9. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/generate_synthetic_datasets.py +0 -2
  10. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/graph_utils.py +16 -1
  11. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/io.py +103 -12
  12. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/metrics.py +48 -3
  13. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/run_method_evaluation.py +0 -2
  14. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/types.py +22 -3
  15. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource.egg-info/PKG-INFO +5 -11
  16. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource.egg-info/SOURCES.txt +0 -1
  17. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource.egg-info/entry_points.txt +0 -1
  18. {pegasource-0.2.0 → pegasource-0.2.2}/pyproject.toml +1 -2
  19. pegasource-0.2.0/pegasource/dataset_clustering/__init__.py +0 -44
  20. pegasource-0.2.0/pegasource/path_estimation/__init__.py +0 -20
  21. pegasource-0.2.0/pegasource/path_estimation/__main__.py +0 -65
  22. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/data/README.md +0 -0
  23. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/data/israel_roads.pkl.gz +0 -0
  24. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/_paths.py +0 -0
  25. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/cluster_viz/app.js +0 -0
  26. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/cluster_viz/data.json +0 -0
  27. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/cluster_viz/index.html +0 -0
  28. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/cluster_viz/styles.css +0 -0
  29. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/custom_devices.py +0 -0
  30. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/data/clustered_output.csv +0 -0
  31. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/data/dirty_hardware_data_20k.csv +0 -0
  32. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/data/dirty_hardware_data_40k.csv +0 -0
  33. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/data/dirty_hardware_data_hebrew_40k.csv +0 -0
  34. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/data/hardware_data_40_percent_dirty.csv +0 -0
  35. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/data/hardware_data_hebrew_40_percent_dirty.csv +0 -0
  36. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/data/test_output.csv +0 -0
  37. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/dataset_generator.py +0 -0
  38. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/fetch_ifixit_devices.py +0 -0
  39. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/ifixit_devices.json +0 -0
  40. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/dataset_clustering/prepare_viz_data.py +0 -0
  41. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/geo/__init__.py +0 -0
  42. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/geo/_rv_graph_builder.py +0 -0
  43. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/geo/_rv_preprocessing.py +0 -0
  44. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/geo/_rv_visualization.py +0 -0
  45. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/geo/distance.py +0 -0
  46. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/geo/israel_roads.py +0 -0
  47. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/geo/projection.py +0 -0
  48. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/geo/vectorizer.py +0 -0
  49. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/cache/london_walk_bbox.graphml +0 -0
  50. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/filters/__init__.py +0 -0
  51. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/filters/ekf.py +0 -0
  52. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/filters/kf.py +0 -0
  53. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/filters/particle.py +0 -0
  54. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/filters/ukf.py +0 -0
  55. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/geo_reference.py +0 -0
  56. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/gnn/__init__.py +0 -0
  57. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/gnn/estimate.py +0 -0
  58. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/graph_stitch.py +0 -0
  59. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/hmm_map_match.py +0 -0
  60. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/london_street_path.py +0 -0
  61. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/nn/__init__.py +0 -0
  62. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/nn/dataset.py +0 -0
  63. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/nn/lstm_model.py +0 -0
  64. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/nn/transformer_model.py +0 -0
  65. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/plotting_utils.py +0 -0
  66. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/sample_observations.csv +0 -0
  67. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/sample_true_path.csv +0 -0
  68. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/path_estimation/viz.py +0 -0
  69. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/pcap/__init__.py +0 -0
  70. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/pcap/patterns.py +0 -0
  71. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/pcap/reader.py +0 -0
  72. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/pcap/report.py +0 -0
  73. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/pcap/stats.py +0 -0
  74. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/timeseries/__init__.py +0 -0
  75. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/timeseries/auto.py +0 -0
  76. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/timeseries/models.py +0 -0
  77. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource/timeseries/utils.py +0 -0
  78. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource.egg-info/dependency_links.txt +0 -0
  79. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource.egg-info/requires.txt +0 -0
  80. {pegasource-0.2.0 → pegasource-0.2.2}/pegasource.egg-info/top_level.txt +0 -0
  81. {pegasource-0.2.0 → pegasource-0.2.2}/setup.cfg +0 -0
  82. {pegasource-0.2.0 → pegasource-0.2.2}/tests/test_geo.py +0 -0
  83. {pegasource-0.2.0 → pegasource-0.2.2}/tests/test_path_estimation.py +0 -0
  84. {pegasource-0.2.0 → pegasource-0.2.2}/tests/test_pcap.py +0 -0
  85. {pegasource-0.2.0 → pegasource-0.2.2}/tests/test_timeseries.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pegasource
3
- Version: 0.2.0
3
+ Version: 0.2.2
4
4
  Summary: Offline-capable toolkit: PCAP, geo, time-series, clustering, and path estimation
5
5
  Author: Josef Berman
6
6
  License: MIT
@@ -240,13 +240,10 @@ python -m pegasource.dataset_clustering.prepare_viz_data
240
240
 
241
241
  ### Path estimation (trajectories)
242
242
 
243
- Evaluate reconstruction methods on observation CSVs vs a 1 Hz ground-truth path (requires **`[path_estimation]`** for full method set):
243
+ Evaluate reconstruction methods on observation CSVs vs a 1 Hz ground-truth path (requires **`[path_estimation]`** for the full estimator stack):
244
244
 
245
245
  ```bash
246
246
  pip install -e ".[path_estimation]"
247
- pegasource-path-estimation --observations run_1_observations.csv --true-path run_1_true_path.csv --output-dir out/
248
- # or
249
- python -m pegasource.path_estimation --observations ... --true-path ... --output-dir out/
250
247
  ```
251
248
 
252
249
  ```python
@@ -283,12 +280,9 @@ results = estimate_paths_only(
283
280
  # results["kf"] is an EstimationResult with times_s, east_m, north_m (or {"error": "..."})
284
281
  ```
285
282
 
286
- Synthetic data generation and batch method comparison:
287
-
288
- ```bash
289
- python -m pegasource.path_estimation.generate_synthetic_datasets --help
290
- python -m pegasource.path_estimation.run_method_evaluation --help # writes ./method_eval/
291
- ```
283
+ Synthetic data (``generate_dataset`` in ``pegasource.path_estimation.generate_synthetic_datasets``)
284
+ and batch method comparison (``main`` in ``pegasource.path_estimation.run_method_evaluation``) are
285
+ invoked from your own Python code, not as installed console commands.
292
286
 
293
287
  ---
294
288
 
@@ -189,13 +189,10 @@ python -m pegasource.dataset_clustering.prepare_viz_data
189
189
 
190
190
  ### Path estimation (trajectories)
191
191
 
192
- Evaluate reconstruction methods on observation CSVs vs a 1 Hz ground-truth path (requires **`[path_estimation]`** for full method set):
192
+ Evaluate reconstruction methods on observation CSVs vs a 1 Hz ground-truth path (requires **`[path_estimation]`** for the full estimator stack):
193
193
 
194
194
  ```bash
195
195
  pip install -e ".[path_estimation]"
196
- pegasource-path-estimation --observations run_1_observations.csv --true-path run_1_true_path.csv --output-dir out/
197
- # or
198
- python -m pegasource.path_estimation --observations ... --true-path ... --output-dir out/
199
196
  ```
200
197
 
201
198
  ```python
@@ -232,12 +229,9 @@ results = estimate_paths_only(
232
229
  # results["kf"] is an EstimationResult with times_s, east_m, north_m (or {"error": "..."})
233
230
  ```
234
231
 
235
- Synthetic data generation and batch method comparison:
236
-
237
- ```bash
238
- python -m pegasource.path_estimation.generate_synthetic_datasets --help
239
- python -m pegasource.path_estimation.run_method_evaluation --help # writes ./method_eval/
240
- ```
232
+ Synthetic data (``generate_dataset`` in ``pegasource.path_estimation.generate_synthetic_datasets``)
233
+ and batch method comparison (``main`` in ``pegasource.path_estimation.run_method_evaluation``) are
234
+ invoked from your own Python code, not as installed console commands.
241
235
 
242
236
  ---
243
237
 
@@ -10,5 +10,5 @@ pegasource.dataset_clustering Hardware inventory embedding + clustering (opti
10
10
  pegasource.path_estimation GPS/cellular path reconstruction (filters, graph, NN; optional torch stack)
11
11
  """
12
12
 
13
- __version__ = "0.2.0"
13
+ __version__ = "0.2.2"
14
14
  __all__ = ["pcap", "geo", "timeseries", "dataset_clustering", "path_estimation"]
@@ -0,0 +1,61 @@
1
+ """
2
+ Hardware inventory clustering: text embeddings + hierarchical clustering.
3
+
4
+ Reads a CSV of string cells, concatenates each row into one text line, encodes with a
5
+ `sentence-transformers <https://www.sbert.net/>`_ model (cosine-normalized vectors),
6
+ then clusters with scikit-learn: **agglomerative** clustering for smaller tables, or a
7
+ **two-phase** MiniBatchKMeans + agglomerative pipeline when row count exceeds
8
+ `DIRECT_CLUSTERING_LIMIT`.
9
+
10
+ **Public API** (import from ``pegasource.dataset_clustering``):
11
+
12
+ - `load_data` — load CSV as strings
13
+ - `build_text_representations` — one string per row
14
+ - `generate_embeddings` — requires optional ``[clustering]`` (sentence-transformers)
15
+ - `cluster_direct` / `cluster_twophase` / `cluster_embeddings` — label vectors
16
+ - `print_summary` — CLI-style cluster size table
17
+
18
+ **Optional tooling**: FastAPI server (``server``), iFixit catalog fetch, Excel export,
19
+ bundled ``cluster_viz`` UI. Install extras with ``pip install -e ".[clustering]"``.
20
+
21
+ Examples
22
+ --------
23
+ >>> # After: pip install -e ".[clustering]"
24
+ >>> from pegasource.dataset_clustering import (
25
+ ... load_data,
26
+ ... build_text_representations,
27
+ ... generate_embeddings,
28
+ ... cluster_embeddings,
29
+ ... )
30
+ >>> # df = load_data("data.csv")
31
+ >>> # emb = generate_embeddings(build_text_representations(df), "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", 256)
32
+ >>> # labels = cluster_embeddings(emb, 0.3)
33
+
34
+ CLI and server::
35
+
36
+ python -m pegasource.dataset_clustering.cluster_hardware --help
37
+ python -m pegasource.dataset_clustering.server
38
+ pegasource-cluster-viz --port 8001
39
+ """
40
+
41
+ from .cluster_hardware import (
42
+ DIRECT_CLUSTERING_LIMIT,
43
+ build_text_representations,
44
+ cluster_direct,
45
+ cluster_embeddings,
46
+ cluster_twophase,
47
+ generate_embeddings,
48
+ load_data,
49
+ print_summary,
50
+ )
51
+
52
+ __all__ = [
53
+ "DIRECT_CLUSTERING_LIMIT",
54
+ "build_text_representations",
55
+ "cluster_direct",
56
+ "cluster_embeddings",
57
+ "cluster_twophase",
58
+ "generate_embeddings",
59
+ "load_data",
60
+ "print_summary",
61
+ ]
@@ -1,14 +1,15 @@
1
1
  #!/usr/bin/env python3
2
- """
3
- Cluster dirty hardware records using text embeddings + agglomerative hierarchical clustering.
2
+ r"""Cluster dirty hardware records using embeddings + agglomerative clustering.
3
+
4
+ This module is the core library behind ``python -m pegasource.dataset_clustering.cluster_hardware``
5
+ and the symbols re-exported from :mod:`pegasource.dataset_clustering`.
4
6
 
5
- For large datasets (>10K rows), uses a two-phase approach:
6
- Phase 1: Pre-group rows with MiniBatchKMeans into manageable chunks
7
- Phase 2: Run agglomerative clustering within each chunk, then merge labels
7
+ **Scaling:** When the row count exceeds ``DIRECT_CLUSTERING_LIMIT``, use :func:`cluster_twophase`
8
+ (via :func:`cluster_embeddings`) so memory stays bounded.
8
9
 
9
- Usage:
10
- python cluster_hardware.py --input data/dirty_hardware_data_40k.csv --threshold 0.3
11
- python cluster_hardware.py --input data/dirty_hardware_data_40k.csv --threshold 0.2 --sample-size 5000
10
+ See Also
11
+ --------
12
+ pegasource.dataset_clustering.server : interactive FastAPI + static UI
12
13
  """
13
14
 
14
15
  import argparse
@@ -23,7 +24,8 @@ import numpy as np
23
24
  from sklearn.cluster import AgglomerativeClustering, MiniBatchKMeans
24
25
 
25
26
 
26
- # Maximum rows for direct agglomerative clustering (above this, use two-phase approach)
27
+ #: Maximum row count for single-pass agglomerative clustering. Above this,
28
+ #: :func:`cluster_embeddings` calls :func:`cluster_twophase` automatically.
27
29
  DIRECT_CLUSTERING_LIMIT = 15_000
28
30
 
29
31
  _DEFAULT_INPUT = PACKAGE_DIR / "data" / "dirty_hardware_data_40k.csv"
@@ -71,7 +73,24 @@ def parse_args():
71
73
 
72
74
 
73
75
  def load_data(path, sample_size=None):
74
- """Load CSV and optionally sample rows."""
76
+ """Load a CSV with all columns as strings (missing values become empty strings).
77
+
78
+ Parameters
79
+ ----------
80
+ path : str or os.PathLike
81
+ Input ``.csv`` path.
82
+ sample_size : int, optional
83
+ If set, only the first ``sample_size`` rows are kept (for quick tests).
84
+
85
+ Returns
86
+ -------
87
+ pandas.DataFrame
88
+ All columns have ``dtype`` str; ``NaN`` replaced with ``""``.
89
+
90
+ Notes
91
+ -----
92
+ Prints progress to stdout (row × column counts).
93
+ """
75
94
  print(f"📂 Loading data from {path}...")
76
95
  df = pd.read_csv(path, dtype=str).fillna("")
77
96
  if sample_size is not None:
@@ -81,7 +100,24 @@ def load_data(path, sample_size=None):
81
100
 
82
101
 
83
102
  def build_text_representations(df):
84
- """Concatenate all columns into a single text string per row."""
103
+ """Join every column of each row into one text line for embedding.
104
+
105
+ Columns are separated by ``" | "`` (space-pipe-space), preserving column order.
106
+
107
+ Parameters
108
+ ----------
109
+ df : pandas.DataFrame
110
+ Input table (typically all strings).
111
+
112
+ Returns
113
+ -------
114
+ list of str
115
+ Length ``len(df)``; one concatenated string per row.
116
+
117
+ Notes
118
+ -----
119
+ Prints the first row’s text to stdout as a sanity check.
120
+ """
85
121
  print("📝 Building text representations...")
86
122
  texts = df.apply(lambda row: " | ".join(row.values), axis=1).tolist()
87
123
  print(f" Example: {texts[0]!r}")
@@ -89,7 +125,32 @@ def build_text_representations(df):
89
125
 
90
126
 
91
127
  def generate_embeddings(texts, model_name, batch_size, device="cpu"):
92
- """Generate embeddings using a sentence-transformer model."""
128
+ """Encode texts with SentenceTransformer (L2-normalized for cosine distance).
129
+
130
+ Requires the optional **sentence-transformers** dependency
131
+ (``pip install -e ".[clustering]"``).
132
+
133
+ Parameters
134
+ ----------
135
+ texts : list of str
136
+ One sentence/line per inventory row.
137
+ model_name : str
138
+ Hugging Face model id (e.g. ``sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2``)
139
+ or path to a local model directory.
140
+ batch_size : int
141
+ ``model.encode`` batch size (GPU memory vs throughput).
142
+ device : str, default ``"cpu"``
143
+ ``"cpu"``, ``"cuda"``, or another torch device string.
144
+
145
+ Returns
146
+ -------
147
+ numpy.ndarray
148
+ Shape ``(n_rows, dim)``; rows are unit vectors if the backend normalizes.
149
+
150
+ Notes
151
+ -----
152
+ Local paths are resolved with :func:`os.path.abspath` when the path exists on disk.
153
+ """
93
154
  from sentence_transformers import SentenceTransformer
94
155
 
95
156
  # Resolve local paths to absolute (avoids cwd issues)
@@ -115,7 +176,22 @@ def generate_embeddings(texts, model_name, batch_size, device="cpu"):
115
176
 
116
177
 
117
178
  def cluster_direct(embeddings, threshold):
118
- """Run agglomerative clustering directly (for smaller datasets)."""
179
+ """Agglomerative clustering with cosine distance and average linkage.
180
+
181
+ Parameters
182
+ ----------
183
+ embeddings : numpy.ndarray
184
+ Shape ``(n, dim)``; rows are typically L2-normalized.
185
+ threshold : float
186
+ ``distance_threshold`` in ``AgglomerativeClustering`` (cosine metric). Smaller
187
+ values yield **more** clusters (tighter merges); larger values yield **fewer**
188
+ clusters. Typical range ~0.2–0.5 for normalized vectors.
189
+
190
+ Returns
191
+ -------
192
+ numpy.ndarray
193
+ Integer labels, shape ``(n,)``, not necessarily starting at 0 or contiguous.
194
+ """
119
195
  print(f"🔗 Clustering {len(embeddings):,} rows with distance_threshold={threshold}...")
120
196
  t0 = time.time()
121
197
  clustering = AgglomerativeClustering(
@@ -132,10 +208,24 @@ def cluster_direct(embeddings, threshold):
132
208
 
133
209
 
134
210
  def cluster_twophase(embeddings, threshold, n_pre_clusters=None):
135
- """
136
- Two-phase clustering for large datasets:
137
- Phase 1: MiniBatchKMeans to create manageable pre-groups
138
- Phase 2: Agglomerative clustering within each pre-group
211
+ """Scale to large *n*: KMeans pre-groups, then agglomerative within each group.
212
+
213
+ Global cluster ids are formed by offsetting each group’s local labels so they do not
214
+ collide across groups.
215
+
216
+ Parameters
217
+ ----------
218
+ embeddings : numpy.ndarray
219
+ Shape ``(n, dim)``.
220
+ threshold : float
221
+ Same meaning as in :func:`cluster_direct` (cosine / average linkage).
222
+ n_pre_clusters : int, optional
223
+ Number of KMeans buckets. If omitted, uses ``max(10, n // 5000)`` capped at ``n``.
224
+
225
+ Returns
226
+ -------
227
+ numpy.ndarray
228
+ Integer labels, shape ``(n,)``.
139
229
  """
140
230
  n = len(embeddings)
141
231
  if n_pre_clusters is None:
@@ -190,7 +280,25 @@ def cluster_twophase(embeddings, threshold, n_pre_clusters=None):
190
280
 
191
281
 
192
282
  def cluster_embeddings(embeddings, threshold, n_pre_clusters=None):
193
- """Choose clustering strategy based on dataset size."""
283
+ """Dispatch to :func:`cluster_direct` or :func:`cluster_twophase` by row count.
284
+
285
+ If ``len(embeddings) <= DIRECT_CLUSTERING_LIMIT`` uses direct agglomerative
286
+ clustering; otherwise uses the two-phase pipeline.
287
+
288
+ Parameters
289
+ ----------
290
+ embeddings : numpy.ndarray
291
+ Row embedding matrix.
292
+ threshold : float
293
+ Cosine distance threshold for agglomerative steps.
294
+ n_pre_clusters : int, optional
295
+ Forwarded to :func:`cluster_twophase` when the two-phase path is used.
296
+
297
+ Returns
298
+ -------
299
+ numpy.ndarray
300
+ Cluster label per row.
301
+ """
194
302
  n = len(embeddings)
195
303
  if n <= DIRECT_CLUSTERING_LIMIT:
196
304
  return cluster_direct(embeddings, threshold)
@@ -200,7 +308,21 @@ def cluster_embeddings(embeddings, threshold, n_pre_clusters=None):
200
308
 
201
309
 
202
310
  def print_summary(df, max_clusters=30, samples_per_cluster=3):
203
- """Print a summary of the clustering results."""
311
+ """Print cluster sizes and sample rows to stdout (requires ``cluster_id`` column).
312
+
313
+ Parameters
314
+ ----------
315
+ df : pandas.DataFrame
316
+ Must include a ``cluster_id`` column (e.g. after assigning labels).
317
+ max_clusters : int, default 30
318
+ Maximum number of largest clusters to list.
319
+ samples_per_cluster : int, default 3
320
+ Rows shown per cluster (excluding ``cluster_id`` in the display).
321
+
322
+ Returns
323
+ -------
324
+ None
325
+ """
204
326
  cluster_sizes = df.groupby("cluster_id").size().sort_values(ascending=False)
205
327
  n_clusters = len(cluster_sizes)
206
328
 
@@ -1,3 +1,13 @@
1
+ """Interactive clustering UI: FastAPI backend + static ``cluster_viz`` frontend.
2
+
3
+ Endpoints (JSON under ``/api``) load or upload a CSV, track embedding status, re-cluster
4
+ by threshold, and export Excel. Static files are served from the package
5
+ ``cluster_viz/`` directory. Run with ``python -m pegasource.dataset_clustering.server``
6
+ or the ``pegasource-cluster-viz`` console script (requires ``[clustering]`` extras).
7
+
8
+ The module exposes ``app`` (FastAPI instance) for ASGI servers and ``run()`` for CLI.
9
+ """
10
+
1
11
  import argparse
2
12
  import asyncio
3
13
  import json
@@ -0,0 +1,45 @@
1
+ """Trajectory reconstruction from asynchronous observations (local ENU meters).
2
+
3
+ Re-exports :func:`run_evaluation`, :func:`evaluate_path_estimation`, and
4
+ :func:`estimate_paths_only` from :mod:`pegasource.path_estimation.evaluate`, same pattern
5
+ as :mod:`pegasource.pcap` / :mod:`pegasource.geo`. Requires the optional **path_estimation**
6
+ dependencies (filterpy, torch, …); install with ``pip install -e ".[path_estimation]"``.
7
+
8
+ Because Python loads this file when importing any submodule (e.g.
9
+ ``import pegasource.path_estimation.metrics``), loading the package also pulls in
10
+ ``evaluate`` and those dependencies. Previously, lazy :func:`__getattr__` deferred that
11
+ until you accessed ``run_evaluation``; explicit imports match the rest of pegasource at
12
+ the cost of a heavier first import.
13
+
14
+ Quick start::
15
+
16
+ from pegasource.path_estimation import run_evaluation
17
+ from pathlib import Path
18
+
19
+ run_evaluation(
20
+ Path("obs.csv"),
21
+ Path("true.csv"),
22
+ Path("out"),
23
+ methods=["kf", "dijkstra"],
24
+ )
25
+
26
+ Examples
27
+ --------
28
+ >>> from pegasource.path_estimation.metrics import compute_all_metrics
29
+ >>> import numpy as np
30
+ >>> t = np.zeros((10, 2))
31
+ >>> out = compute_all_metrics(t, t + 0.1)
32
+ >>> assert "rmse_m" in out
33
+ """
34
+
35
+ from .evaluate import (
36
+ estimate_paths_only,
37
+ evaluate_path_estimation,
38
+ run_evaluation,
39
+ )
40
+
41
+ __all__ = [
42
+ "estimate_paths_only",
43
+ "evaluate_path_estimation",
44
+ "run_evaluation",
45
+ ]
@@ -1,4 +1,21 @@
1
- """Run estimators, compute metrics, save JSON summary and figures."""
1
+ """High-level evaluation API: run estimators, compare to ground truth, save artifacts.
2
+
3
+ **Method registry** (``METHOD_REGISTRY``): graph stitchers ``dijkstra`` / ``astar``,
4
+ ``hmm``, Kalman-family filters ``kf`` / ``ekf`` / ``ukf`` / ``particle``, ``gnn``, plus
5
+ ``lstm`` / ``transformer`` implemented in this file. Supervised keys ``lstm``,
6
+ ``transformer``, ``gnn`` require a real ``*_true_path.csv`` (see ``METHODS_REQUIRING_GROUND_TRUTH``).
7
+
8
+ **Typical entry points**
9
+
10
+ - :func:`run_evaluation` — loads CSVs and the default projected OSM graph from
11
+ :func:`~pegasource.path_estimation.graph_utils.get_projected_graph`, writes
12
+ ``metrics.json`` and figures.
13
+ - :func:`evaluate_path_estimation` — same pipeline but you pass the ``road_graph``.
14
+ - :func:`estimate_paths_only` — no true path file; cannot run supervised methods.
15
+
16
+ Output dicts map method name → either metric scores (see :func:`pegasource.path_estimation.metrics.compute_all_metrics`)
17
+ plus ``meta``, or ``{"error": "..."}`` if the estimator raised.
18
+ """
2
19
 
3
20
  from __future__ import annotations
4
21
 
@@ -85,6 +102,18 @@ def _estimate_transformer(
85
102
 
86
103
 
87
104
  def torch_device(name: Optional[str] = None):
105
+ """Return a ``torch.device``, defaulting to CUDA when available.
106
+
107
+ Parameters
108
+ ----------
109
+ name : str, optional
110
+ If given (e.g. ``"cuda"``, ``"cpu"``), that device is returned. If ``None``,
111
+ uses CUDA when :func:`torch.cuda.is_available` returns True, otherwise CPU.
112
+
113
+ Returns
114
+ -------
115
+ torch.device
116
+ """
88
117
  import torch as _torch
89
118
 
90
119
  if name:
@@ -199,23 +228,43 @@ def estimate_paths_only(
199
228
  ) -> Dict[str, Any]:
200
229
  """Estimate paths from observations only (no ground-truth CSV).
201
230
 
202
- Builds an internal time grid from the observation span at ``output_hz`` (default 1 Hz).
203
- Methods that require supervised training (**lstm**, **transformer**, **gnn**) are not
204
- supported here; use :func:`evaluate_path_estimation` with a true path file.
205
-
206
- Args:
207
- observations_csv: Path to ``*_observations.csv``.
208
- road_graph: Road network graph for map-based methods (ignored by pure filters).
209
- methods: e.g. ``["kf", "ukf", "dijkstra"]``.
210
- output_hz: Sample rate for the output trajectory timeline (Hz).
211
- output_dir: If set and ``plot`` is True, writes ``figures/<method>_path_enu.png``.
212
- plot: ENU figure with **estimate + observations only** (no true path line).
213
- plot_map: Not supported without real lon/lat ground truth; raises if True.
214
- device: Torch device (unused unless extended).
215
- seed: RNG seed.
216
-
217
- Returns:
218
- Per method: :class:`EstimationResult`, or ``{"error": "..."}`` on failure.
231
+ Builds a **stub** ground-truth frame with :func:`pegasource.path_estimation.io.stub_true_path_from_observations`
232
+ so filter/graph code paths see a consistent time axis; **no real positions** exist,
233
+ so metrics against ``true_x``/``true_y`` are meaningless. Do **not** use for
234
+ ``lstm``, ``transformer``, or ``gnn``.
235
+
236
+ Parameters
237
+ ----------
238
+ observations_csv : pathlib.Path
239
+ ``*_observations.csv`` with ``timestamp_s`` and source columns (see IO loaders).
240
+ road_graph : networkx.MultiDiGraph or compatible
241
+ Projected street graph for map-matching methods; ignored by pure filters.
242
+ methods : list of str
243
+ Method names, e.g. ``["kf", "dijkstra", "hmm"]`` (stripped and lower-cased before use). Raises if any of
244
+ ``lstm``, ``transformer``, ``gnn`` appear.
245
+ output_hz : float, default 1.0
246
+ Stub timeline sampling rate (Hz).
247
+ output_dir : pathlib.Path, optional
248
+ If set and ``plot`` is True, creates ``output_dir/figures/<method>_path_enu.png``.
249
+ plot : bool, default False
250
+ If True and ``output_dir`` is set, writes ENU overlays **without** a true path polyline.
251
+ plot_map : bool, default False
252
+ Must stay False (no real lon/lat); raises ``ValueError`` otherwise.
253
+ device : str, optional
254
+ Torch device string; has no practical effect here because the torch-based methods (``lstm``, ``transformer``, ``gnn``) are rejected.
255
+ seed : int, default 0
256
+ RNG seed for stochastic methods.
257
+
258
+ Returns
259
+ -------
260
+ dict
261
+ Keys are method names. Values are :class:`~pegasource.path_estimation.types.EstimationResult`
262
+ or ``{"error": "<message>"}``.
263
+
264
+ Raises
265
+ ------
266
+ ValueError
267
+ If ``plot_map`` is True, or if supervised method names are requested.
219
268
  """
220
269
  names = [m.strip().lower() for m in methods]
221
270
  forbidden = sorted({m for m in names if m in METHODS_REQUIRING_GROUND_TRUTH})
@@ -261,26 +310,39 @@ def evaluate_path_estimation(
261
310
  device: Optional[str] = None,
262
311
  seed: int = 0,
263
312
  ) -> Dict[str, Dict]:
264
- """Run path estimation for CSV paths and a caller-supplied road graph.
265
-
266
- Loads observations and ground truth from disk, then runs each method in
267
- ``methods`` using ``road_graph`` for graph-based estimators (Dijkstra, A*,
268
- HMM, GNN, …). Filter methods (KF, EKF, …) ignore the graph.
269
-
270
- Args:
271
- observations_csv: Path to ``*_observations.csv`` (mixed GPS / circle / cell).
272
- true_path_csv: Path to ``*_true_path.csv`` (1 Hz ground truth).
273
- road_graph: Projected road network (e.g. OSM ``MultiDiGraph`` with ``x``, ``y``,
274
- ``crs`` as expected by ``graph_stitch`` / GNN).
275
- methods: Method names (e.g. ``["kf", "dijkstra", "gnn"]``). Unknown names raise.
276
- output_dir: If set, writes ``metrics.json`` and, when plotting, ``figures/``.
277
- plot: Write ENU overlay PNGs under ``output_dir/figures`` when ``output_dir`` is set.
278
- plot_map: Write Web Mercator map PNGs when ``output_dir`` is set.
279
- device: Torch device string for LSTM / Transformer / GNN.
280
- seed: RNG seed for stochastic estimators.
281
-
282
- Returns:
283
- Per-method dicts: metric scores and ``meta``, or ``{"error": "..."}`` on failure.
313
+ """Load CSVs and run estimators with a **caller-supplied** projected road graph.
314
+
315
+ Graph-based methods (``dijkstra``, ``astar``, ``hmm``, ``gnn``) use ``road_graph``;
316
+ filters (``kf``, ``ekf``, ``ukf``, ``particle``) ignore it. Neural sequence models
317
+ train against ``true_path_csv``.
318
+
319
+ Parameters
320
+ ----------
321
+ observations_csv : pathlib.Path
322
+ Event table: must include ``timestamp_s`` and columns per ``source_type``.
323
+ true_path_csv : pathlib.Path
324
+ 1 Hz (or regular) ground truth: ``timestamp_s``, ``true_x``, ``true_y``, etc.
325
+ road_graph : networkx.MultiDiGraph
326
+ Projected OSM-style graph (``x``, ``y`` node attrs, ``crs`` on ``G.graph``).
327
+ methods : list of str
328
+ Subset of registered method names; unknown names cause errors inside :func:`_run_methods`.
329
+ output_dir : pathlib.Path, optional
330
+ If set, writes ``metrics.json`` and, when ``plot`` or ``plot_map`` is True, PNGs under ``figures/``.
331
+ plot : bool, default False
332
+ Save ENU matplotlib overlays (estimate vs true + observations).
333
+ plot_map : bool, default False
334
+ Save Web-Mercator basemap figures (may download tiles; slower).
335
+ device : str, optional
336
+ Torch device for ``lstm``, ``transformer``, ``gnn``.
337
+ seed : int, default 0
338
+ Base seed for RNGs inside stochastic estimators.
339
+
340
+ Returns
341
+ -------
342
+ dict[str, dict]
343
+ Each key is a method name. On success, value is a flat dict of metrics from
344
+ :func:`pegasource.path_estimation.metrics.compute_all_metrics` plus a ``meta`` field;
345
+ on failure, ``{"error": "<message>"}``.
284
346
  """
285
347
  obs_df = load_observations_csv(observations_csv)
286
348
  true_df = load_true_path_csv(true_path_csv)
@@ -309,7 +371,40 @@ def run_evaluation(
309
371
  device: Optional[str] = None,
310
372
  seed: int = 0,
311
373
  ) -> Dict[str, Dict]:
312
- """Run selected methods; write ``metrics.json`` and figures under ``output_dir``."""
374
+ """Convenience wrapper: load CSVs, obtain default graph, evaluate, write ``output_dir``.
375
+
376
+ Calls :func:`pegasource.path_estimation.graph_utils.get_projected_graph` for the same
377
+ walkable OSM graph used by synthetic data generation (cached under the package).
378
+
379
+ Parameters
380
+ ----------
381
+ observations_csv : pathlib.Path
382
+ See :func:`evaluate_path_estimation`.
383
+ true_path_csv : pathlib.Path
384
+ See :func:`evaluate_path_estimation`.
385
+ output_dir : pathlib.Path
386
+ Created if needed; receives ``metrics.json`` and ``figures/`` when plotting.
387
+ methods : list of str, optional
388
+ If ``None``, uses ``list(METHOD_REGISTRY.keys())`` (dijkstra, astar, hmm, kf, ekf,
389
+ ukf, particle, gnn). Pass explicit names to add ``lstm`` or ``transformer``.
390
+ plot : bool, default True
391
+ Write ENU figures to ``output_dir/figures``.
392
+ plot_map : bool, default False
393
+ If True, also writes basemap figures (contextily needs network access to download map tiles).
394
+ device : str, optional
395
+ Torch device for neural estimators.
396
+ seed : int, default 0
397
+ RNG seed forwarded to estimators.
398
+
399
+ Returns
400
+ -------
401
+ dict[str, dict]
402
+ Same structure as :func:`evaluate_path_estimation`.
403
+
404
+ See Also
405
+ --------
406
+ evaluate_path_estimation : supply your own ``road_graph``.
407
+ """
313
408
  obs_df = load_observations_csv(observations_csv)
314
409
  true_df = load_true_path_csv(true_path_csv)
315
410
  G = get_projected_graph()
@@ -330,6 +425,19 @@ def run_evaluation(
330
425
 
331
426
 
332
427
  def print_summary(summary: Dict[str, Dict]) -> None:
428
+ """Print one line per method: RMSE/MAE or an error string (CLI helper).
429
+
430
+ Parameters
431
+ ----------
432
+ summary : dict[str, dict]
433
+ Mapping produced by :func:`run_evaluation` / :func:`evaluate_path_estimation`
434
+ (or compatible metric dicts).
435
+
436
+ Returns
437
+ -------
438
+ None
439
+ Writes to stdout only.
440
+ """
333
441
  for k, v in summary.items():
334
442
  if "error" in v:
335
443
  print(f"{k}: ERROR — {v['error']}")
@@ -1005,5 +1005,3 @@ def main() -> None:
1005
1005
  print(f"Output folder: {args.output_dir.resolve()}")
1006
1006
 
1007
1007
 
1008
- if __name__ == "__main__":
1009
- main()