guidepost 0.2.18__tar.gz → 0.2.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {guidepost-0.2.18/guidepost.egg-info → guidepost-0.2.19}/PKG-INFO +2 -2
- {guidepost-0.2.18 → guidepost-0.2.19}/guidepost/__init__.py +1 -1
- guidepost-0.2.19/guidepost/guidepost.py +80 -0
- guidepost-0.2.19/guidepost/trailmark.py +65 -0
- guidepost-0.2.19/guidepost/utils.py +178 -0
- guidepost-0.2.19/guidepost/version.py +2 -0
- {guidepost-0.2.18 → guidepost-0.2.19/guidepost.egg-info}/PKG-INFO +2 -2
- {guidepost-0.2.18 → guidepost-0.2.19}/guidepost.egg-info/SOURCES.txt +2 -1
- {guidepost-0.2.18 → guidepost-0.2.19}/setup.py +2 -2
- guidepost-0.2.18/guidepost/guidepost.js +0 -2340
- guidepost-0.2.18/guidepost/guidepost.py +0 -107
- guidepost-0.2.18/guidepost/version.py +0 -2
- {guidepost-0.2.18 → guidepost-0.2.19}/LICENSE +0 -0
- {guidepost-0.2.18 → guidepost-0.2.19}/MANIFEST.in +0 -0
- {guidepost-0.2.18 → guidepost-0.2.19}/README.md +0 -0
- {guidepost-0.2.18 → guidepost-0.2.19}/guidepost.egg-info/dependency_links.txt +0 -0
- {guidepost-0.2.18 → guidepost-0.2.19}/guidepost.egg-info/requires.txt +0 -0
- {guidepost-0.2.18 → guidepost-0.2.19}/guidepost.egg-info/top_level.txt +0 -0
- {guidepost-0.2.18 → guidepost-0.2.19}/pyproject.toml +0 -0
- {guidepost-0.2.18 → guidepost-0.2.19}/setup.cfg +0 -0
- {guidepost-0.2.18 → guidepost-0.2.19}/tutorials/__init__.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: guidepost
|
|
3
|
-
Version: 0.2.18
|
|
3
|
+
Version: 0.2.19
|
|
4
4
|
Summary: Guidepost. An overview visualization for understanding supercomputer queue data.
|
|
5
5
|
Home-page: https://github.com/cscully-allison/guidepost
|
|
6
6
|
Author: Connor Scully-Allison
|
|
@@ -8,7 +8,7 @@ Author-email: cscullyallison@sci.utah.edu
|
|
|
8
8
|
Classifier: Programming Language :: Python :: 3
|
|
9
9
|
Classifier: License :: OSI Approved :: MIT License
|
|
10
10
|
Classifier: Operating System :: OS Independent
|
|
11
|
-
Requires-Python: >=3.
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENSE
|
|
14
14
|
Requires-Dist: numpy
|
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
from .guidepost import Guidepost
|
|
2
|
-
|
|
2
|
+
from .trailmark import Trailmark
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import anywidget
|
|
2
|
+
import traitlets
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
from .utils import validate_and_clean_dataframe, extract_summary_statistics
|
|
9
|
+
|
|
10
|
+
class Guidepost(anywidget.AnyWidget):
    """Widget that sends a cleaned DataFrame and its summary to the front end.

    Assigning a DataFrame to ``records`` (or calling ``load_data``) cleans the
    data, publishes it through the synced ``_vis_data`` / ``_summary_stats``
    traits, and caches an indexed copy so that ``selection`` can map the ids
    stored in ``selected_records`` back to the original rows.
    """

    # Front-end module path, resolved relative to this file.
    _esm = os.path.join(os.path.dirname(__file__), "static", "guidepost.js")

    # Traits synced with the front end.
    _vis_data = traitlets.Dict({}).tag(sync=True)
    _vis_configs = traitlets.Unicode("{}").tag(sync=True)  # JSON-encoded dict
    selected_records = traitlets.Unicode("[]").tag(sync=True)  # JSON list of gp_idx values
    _summary_stats = traitlets.Dict({}).tag(sync=True)

    # Python-side state.
    cached_records_df = None  # indexed copy of the last loaded DataFrame
    records_df = pd.DataFrame()  # rows matched by the last selection lookup
    suppress_warnings = False

    @property
    def vis_configs(self):
        """Return the visualization configuration decoded from its JSON trait."""
        return json.loads(self._vis_configs)

    @vis_configs.setter
    def vis_configs(self, config_dict):
        self._vis_configs = json.dumps(config_dict)

    @property
    def records(self):
        """Return the column-oriented dict currently published to the front end."""
        return self._vis_data

    @records.setter
    def records(self, df):
        # load_data publishes the cleaned data to ``_vis_data`` itself.
        self.load_data(df)

    def load_data(self, in_df):
        """Clean *in_df*, publish it to the front end and return the payload.

        A copy of *in_df* with a synthetic ``gp_idx`` column (0..n-1) is cached
        on the instance. The copy is passed through
        ``validate_and_clean_dataframe``, its summary statistics are stored in
        ``_summary_stats``, and the cleaned data is stored in ``_vis_data``.

        Returns:
            The dict produced by ``DataFrame.to_dict()`` on the cleaned data,
            with missing values replaced by ``None``.
        """
        indexed = in_df.copy()
        # Synthetic row id used to map front-end selections back to rows.
        indexed.insert(0, 'gp_idx', range(len(indexed)))
        self.cached_records_df = indexed

        o_df, _report = validate_and_clean_dataframe(indexed, self.suppress_warnings)

        self._summary_stats = extract_summary_statistics(o_df)
        # Replace NaN with None so JSON serialization yields null (valid JSON), not NaN
        safe_df = o_df.astype(object).where(pd.notna(o_df), None)
        payload = safe_df.to_dict()
        self._vis_data = payload
        return payload

    @property
    def selection(self):
        """Return the originally loaded rows that are currently selected."""
        return self.retrieve_selected_data()

    def retrieve_selected_data(self):
        """Return the cached rows whose ``gp_idx`` is listed in ``selected_records``.

        Returns:
            A DataFrame of the selected rows without the synthetic ``gp_idx``
            column, or an empty DataFrame when ``selected_records`` is an
            empty string.

        Raises:
            ValueError: if no data has been loaded yet.
        """
        if self.cached_records_df is None:
            raise ValueError("No data has been loaded yet. Please call load_data() first.")
        if not self.selected_records:
            return pd.DataFrame()  # Return empty DataFrame if no selection

        selected_ids = json.loads(self.selected_records)
        cached = self.cached_records_df
        self.records_df = cached[cached['gp_idx'].isin(selected_ids)]

        # remove synthetic index
        return self.records_df.drop('gp_idx', axis=1)
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import anywidget
|
|
2
|
+
import traitlets
|
|
3
|
+
import pandas as pd
|
|
4
|
+
import numpy as np
|
|
5
|
+
import warnings
|
|
6
|
+
import json
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
from pandas.api import types as ptypes
|
|
10
|
+
from .utils import convert_to_float, validate_and_clean_dataframe, extract_summary_statistics
|
|
11
|
+
|
|
12
|
+
class Trailmark(anywidget.AnyWidget):
    """Widget that publishes per-column summary statistics to the front end.

    Assigning data to ``records`` (or calling ``load_data``) cleans it,
    stores the summary returned by ``extract_summary_statistics`` in the
    synced ``_vis_data`` trait, and stores dataset-level counts in the synced
    ``vis_configs`` trait.
    """

    # Front-end module path, resolved relative to this file.
    _esm = os.path.join(os.path.dirname(__file__), "static", "trailmark.js")
    _vis_data = traitlets.Dict({}).tag(sync=True)
    vis_configs = traitlets.Dict({}).tag(sync=True)

    suppress_warnings = False

    @property
    def records(self):
        """Return the summary dict currently published to the front end."""
        return self._vis_data

    @records.setter
    def records(self, df):
        # load_data publishes the summary to ``_vis_data`` itself.
        self.load_data(df)

    def load_data(self, in_df):
        """Load a DataFrame and publish its summary statistics.

        Args:
            in_df: a pandas DataFrame, or anything ``pd.DataFrame`` accepts.

        Returns:
            The per-column summary dict, which is also stored in ``_vis_data``.

        Raises:
            ValueError: if *in_df* cannot be converted to a DataFrame.
        """
        # validate / coerce dataframe
        if not isinstance(in_df, pd.DataFrame):
            try:
                in_df = pd.DataFrame(in_df)
            except Exception as err:
                raise ValueError("in_df must be a pandas DataFrame or convertible to one") from err

        if not self.suppress_warnings and in_df.empty:
            warnings.warn("load_data called with an empty DataFrame")

        o_df, _report = validate_and_clean_dataframe(in_df, self.suppress_warnings)

        summary = extract_summary_statistics(o_df)

        # Tally columns per semantic type from the summary entries.
        type_counts = {"continuous": 0, "ordinal": 0, "categorical": 0}
        for col_summary in summary.values():
            type_counts[col_summary["semantic_type"]] += 1

        # Write to the synced trait so the front end actually receives the data.
        self._vis_data = summary
        self.vis_configs = {
            "n_rows": len(o_df),
            "n_columns": len(o_df.columns),
            "type_counts": type_counts,
        }

        return summary
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# def retrieve_configuration_selection(self):
|
|
65
|
+
#
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
import pandas as pd
|
|
2
|
+
import numpy as np
|
|
3
|
+
import warnings
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
from pandas.api import types as ptypes
|
|
7
|
+
|
|
8
|
+
def convert_to_float(value):
    """Convert a number, or a string with a K/M/B magnitude suffix, to a float.

    Args:
        value: a missing value, a number, or a string such as ``"12"``,
            ``"1.5K"``, ``"2M"`` or ``"3B"``. Surrounding whitespace in
            strings is ignored.

    Returns:
        ``np.nan`` for missing values, otherwise the float value with the
        suffix applied (K = 1e3, M = 1e6, B = 1e9).

    Raises:
        ValueError: if the text cannot be parsed as a float.
    """
    if pd.isna(value):
        return np.nan
    # Numbers have no suffix to interpret; str methods would fail on them.
    if not isinstance(value, str):
        return float(value)

    text = value.strip()
    multipliers = {'K': 1e3, 'M': 1e6, 'B': 1e9}
    # text[-1:] is '' for an empty string, so the lookup cannot raise.
    scale = multipliers.get(text[-1:])
    if scale is not None:
        return float(text[:-1]) * scale
    return float(text)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def validate_and_clean_dataframe(in_cpy, supress_warnings=False):
    """Drop columns that cannot be displayed and report what was found.

    The following steps run in order:
      1. columns that contain only NA values are dropped;
      2. per-column NA counts are recorded (the rows are kept);
      3. timedelta columns are dropped;
      4. columns whose first cell is a NumPy array are dropped;
      5. a notice is issued when more than 250,000 rows remain.

    Args:
        in_cpy: the DataFrame to clean. It is not modified.
        supress_warnings: when true, no notices are printed.

    Returns:
        A ``(cleaned_df, report)`` tuple. ``report`` is a dict that may hold
        the keys ``"na_columns"``, ``"na_column_counts"``,
        ``"timedelta_columns"`` and ``"array_columns"``; it is empty when
        nothing was found. Dropped columns are given as lists of labels.
    """
    _warn_skips = (os.path.dirname('.'),)
    # The warnings.warn(..., skip_file_prefixes=...) path is switched off
    # here, so notices are always printed.
    warn_supported_version = False

    def _notify(message):
        # Emit one user-facing notice unless the caller asked for silence.
        if supress_warnings:
            return
        if warn_supported_version:
            warnings.warn(message, skip_file_prefixes=_warn_skips)
        else:
            print("Warning: " + message)

    def _names(columns):
        # Column labels are not necessarily strings.
        return ', '.join(str(label) for label in columns)

    error_report = {}

    # remove columns with only nans
    o_df = in_cpy.dropna(axis=1, how='all')
    na_cols = list(in_cpy.columns.difference(o_df.columns))
    if na_cols:
        error_report["na_columns"] = na_cols
        _notify("The following columns were dropped because they contained entirely 'na' values which guidepost does not support:[{}]".format(_names(na_cols)))

    # report per-column null counts but keep the rows; downstream JS handles nulls per-axis
    na_per_column = o_df.isna().sum()
    null_counts = {col: int(count) for col, count in na_per_column.items() if count > 0}
    if null_counts:
        error_report["na_column_counts"] = null_counts
        if not supress_warnings:
            print("Warning: The following columns contain null values which will be skipped per-axis at render time: {}".format(null_counts))

    # drop columns which are timedelta type
    cols_before = o_df.columns
    o_df = o_df.select_dtypes(exclude=['timedelta64[ns]'])
    timedelta_cols = list(cols_before.difference(o_df.columns))
    if timedelta_cols:
        error_report["timedelta_columns"] = timedelta_cols
        _notify("The following columns were dropped because they contained 'timedelta' values which guidepost does not support:[{}]. Consider converting these to an interger representation.".format(_names(timedelta_cols)))

    # drop arrays/complex datatypes (decided from the first row only)
    array_cols = [
        col for col in o_df.columns
        if len(o_df) > 0 and isinstance(o_df[col].iloc[0], np.ndarray)
    ]
    if array_cols:
        o_df = o_df.drop(columns=array_cols)
        error_report["array_columns"] = array_cols
        _notify("The following columns were dropped because they contained array values in cells which guidepost does not support:[{}]".format(_names(array_cols)))

    # very large frames still load, but the user is told about the cost
    if o_df.shape[0] > 250_000:
        _notify("Your dataframe is very large. You may experience performance issues. Consider subsampling or reducing the data down to below 200,000 rows to enhance performance.")

    return o_df, error_report
|
|
91
|
+
|
|
92
|
+
def _native_scalar(value):
    """Return *value* as a built-in Python scalar when it exposes ``.item()``."""
    try:
        return value.item() if hasattr(value, "item") else value
    except Exception:
        return str(value)


def _float_or_none(value):
    """Return *value* as a float, or None when it is NaN (not valid JSON)."""
    number = float(value)
    return None if pd.isna(number) else number


def extract_summary_statistics(o_df):
    """Build a per-column summary of *o_df*.

    Each column is given a semantic type:
      * categorical dtype -> "ordinal" when ordered, else "categorical";
      * bool dtype -> "categorical";
      * numeric dtype -> "ordinal" when it is an integer dtype or has fewer
        than 20 distinct values, else "continuous";
      * datetime / timedelta dtype -> "continuous";
      * anything else -> "continuous" when every non-null value looks like a
        number with a K, M or B suffix (the values are then converted with
        ``convert_to_float``), else "categorical".

    Args:
        o_df: the DataFrame to summarize.

    Returns:
        A dict keyed by column label. Every entry has ``dtype``,
        ``semantic_type``, ``n_rows``, ``n_missing``, ``pct_missing`` and
        ``n_unique``. Continuous columns add ``count``, ``mean``, ``std``,
        ``min``, ``25%``, ``50%``, ``75%``, ``IQR``, ``max`` and ``var``
        (None when undefined). Other columns add ``top``, ``top_freq`` and
        ``top_values`` (the 10 most frequent values with their counts).
    """
    summary = {}

    for col in o_df.columns:
        s = o_df[col]
        n_rows = len(s)
        n_missing = int(s.isna().sum())
        pct_missing = float(n_missing) / n_rows if n_rows > 0 else 0.0
        n_unique = int(s.nunique(dropna=True))

        # determine semantic type
        if isinstance(s.dtype, pd.CategoricalDtype):
            semantic = "ordinal" if getattr(s.dtype, "ordered", False) else "categorical"
        elif ptypes.is_bool_dtype(s.dtype):
            semantic = "categorical"
        elif ptypes.is_numeric_dtype(s.dtype):
            if ptypes.is_integer_dtype(s.dtype) or n_unique < 20:
                semantic = "ordinal"
            else:
                semantic = "continuous"
        elif ptypes.is_datetime64_any_dtype(s.dtype) or ptypes.is_timedelta64_dtype(s.dtype):
            semantic = "continuous"
        elif s.dropna().astype(str).str.fullmatch(r'\d+(\.\d+)?[MKB]').all():
            # Values are numbers with suffixes M, K, or B: convert them.
            s = s.map(convert_to_float)
            semantic = "continuous"
        else:
            semantic = "categorical"

        col_summary = {
            "dtype": str(s.dtype),
            "semantic_type": semantic,
            "n_rows": n_rows,
            "n_missing": n_missing,
            "pct_missing": pct_missing,
            "n_unique": n_unique,
        }

        if semantic == "continuous":
            # numeric summaries; values that cannot be parsed become NA and are skipped
            ser = pd.to_numeric(s, errors="coerce")
            count = int(ser.count())
            if count == 0:
                mean = std = low = q25 = q50 = q75 = iqr = high = var = None
            else:
                q25 = float(ser.quantile(0.25))
                q50 = float(ser.quantile(0.5))
                q75 = float(ser.quantile(0.75))
                iqr = _float_or_none(q75 - q25)
                q25, q50, q75 = _float_or_none(q25), _float_or_none(q50), _float_or_none(q75)
                mean = _float_or_none(ser.mean())
                # std / var are NaN for a single observation
                std = _float_or_none(ser.std())
                var = _float_or_none(ser.var())
                low = _float_or_none(ser.min())
                high = _float_or_none(ser.max())
            col_summary.update({
                "count": count,
                "mean": mean,
                "std": std,
                "min": low,
                "25%": q25,
                "50%": q50,
                "75%": q75,
                "IQR": iqr,
                "max": high,
                "var": var,
            })
        else:
            # categorical / ordinal: top categories and frequencies
            vc = s.astype(object).value_counts(dropna=True)
            has_values = len(vc) > 0
            top = _native_scalar(vc.index[0]) if has_values else None
            top_freq = int(vc.iloc[0]) if has_values else 0

            # include up to 10 most frequent values, as native Python scalars
            top_items = [
                {"value": _native_scalar(k), "count": int(v)}
                for k, v in vc.iloc[:10].items()
            ]
            col_summary.update({
                "top": top,
                "top_freq": top_freq,
                "top_values": top_items,
            })

        summary[col] = col_summary

    return summary
|
|
178
|
+
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: guidepost
|
|
3
|
-
Version: 0.2.18
|
|
3
|
+
Version: 0.2.19
|
|
4
4
|
Summary: Guidepost. An overview visualization for understanding supercomputer queue data.
|
|
5
5
|
Home-page: https://github.com/cscully-allison/guidepost
|
|
6
6
|
Author: Connor Scully-Allison
|
|
@@ -8,7 +8,7 @@ Author-email: cscullyallison@sci.utah.edu
|
|
|
8
8
|
Classifier: Programming Language :: Python :: 3
|
|
9
9
|
Classifier: License :: OSI Approved :: MIT License
|
|
10
10
|
Classifier: Operating System :: OS Independent
|
|
11
|
-
Requires-Python: >=3.
|
|
11
|
+
Requires-Python: >=3.10
|
|
12
12
|
Description-Content-Type: text/markdown
|
|
13
13
|
License-File: LICENSE
|
|
14
14
|
Requires-Dist: numpy
|
|
@@ -28,14 +28,14 @@ setup(
|
|
|
28
28
|
'pandas',
|
|
29
29
|
'scikit-learn',
|
|
30
30
|
'anywidget',
|
|
31
|
-
'traitlets'
|
|
31
|
+
'traitlets',
|
|
32
32
|
],
|
|
33
33
|
classifiers=[
|
|
34
34
|
'Programming Language :: Python :: 3',
|
|
35
35
|
'License :: OSI Approved :: MIT License',
|
|
36
36
|
'Operating System :: OS Independent',
|
|
37
37
|
],
|
|
38
|
-
python_requires='>=3.
|
|
38
|
+
python_requires='>=3.10',
|
|
39
39
|
include_package_data=True,
|
|
40
40
|
package_data={
|
|
41
41
|
'guidepost': ['guidepost.js'],
|