guidepost 0.2.17__tar.gz → 0.2.19__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: guidepost
3
- Version: 0.2.17
3
+ Version: 0.2.19
4
4
  Summary: Guidepost. An overview visualization for understanding supercomputer queue data.
5
5
  Home-page: https://github.com/cscully-allison/guidepost
6
6
  Author: Connor Scully-Allison
@@ -8,7 +8,7 @@ Author-email: cscullyallison@sci.utah.edu
8
8
  Classifier: Programming Language :: Python :: 3
9
9
  Classifier: License :: OSI Approved :: MIT License
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.6
11
+ Requires-Python: >=3.10
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: numpy
@@ -1,2 +1,2 @@
1
1
  from .guidepost import Guidepost
2
-
2
+ from .trailmark import Trailmark
@@ -0,0 +1,80 @@
1
+ import anywidget
2
+ import traitlets
3
+ import pandas as pd
4
+ import numpy as np
5
+ import json
6
+ import os
7
+ import sys
8
+ from .utils import validate_and_clean_dataframe, extract_summary_statistics
9
+
10
class Guidepost(anywidget.AnyWidget):
    """Widget wrapper around the ``static/guidepost.js`` front-end module.

    A pandas DataFrame is handed over through ``records`` (or ``load_data``).
    The cleaned rows, the per-column summary statistics and the visualization
    configuration are exposed to the front end through synced traitlets.
    ``selected_records`` holds a JSON list of ``gp_idx`` values (presumably
    written by the front end -- confirm against guidepost.js); ``selection``
    resolves those indices back to rows of the loaded DataFrame.
    """

    # Front-end module shipped next to this file.
    _esm = os.path.join(os.path.dirname(__file__), "static", "guidepost.js")

    # Traits synced with the front end.
    _vis_data = traitlets.Dict({}).tag(sync=True)
    _vis_configs = traitlets.Unicode("{}").tag(sync=True)
    _summary_stats = traitlets.Dict({}).tag(sync=True)
    selected_records = traitlets.Unicode("[]").tag(sync=True)

    # Python-side state (not synced).
    cached_records_df = None      # copy of the loaded frame, incl. 'gp_idx'
    records_df = pd.DataFrame()   # rows matched by the latest selection
    suppress_warnings = False     # forwarded to validate_and_clean_dataframe

    @property
    def vis_configs(self):
        """Visualization configuration, decoded from its JSON trait."""
        return json.loads(self._vis_configs)

    @vis_configs.setter
    def vis_configs(self, config_dict):
        self._vis_configs = json.dumps(config_dict)

    @property
    def records(self):
        """The cleaned data currently held in the synced ``_vis_data`` trait."""
        return self._vis_data

    @records.setter
    def records(self, df):
        # load_data already stores the result in ``_vis_data``.
        self.load_data(df)

    def load_data(self, in_df):
        """Clean ``in_df``, publish it to the front end and return it as a dict.

        A copy of ``in_df`` gets a synthetic ``gp_idx`` column (0..n-1) and is
        cached so that selections can be mapped back to rows later.  The copy
        is then passed through ``validate_and_clean_dataframe``; summary
        statistics of the cleaned frame are stored in ``_summary_stats``.

        Parameters
        ----------
        in_df : pandas.DataFrame
            Data to visualize.  It is not modified.

        Returns
        -------
        dict
            ``DataFrame.to_dict()`` of the cleaned frame with missing values
            replaced by ``None``; the same value is stored in ``_vis_data``.
        """
        indexed = in_df.copy()
        indexed.insert(0, 'gp_idx', range(len(indexed)))
        self.cached_records_df = indexed

        o_df, _report = validate_and_clean_dataframe(indexed, self.suppress_warnings)

        self._summary_stats = extract_summary_statistics(o_df)

        # Replace NaN with None so JSON serialization yields null (valid
        # JSON), not NaN.
        safe_df = o_df.astype(object).where(pd.notna(o_df), None)
        vis_data = safe_df.to_dict()
        self._vis_data = vis_data
        return vis_data

    @property
    def selection(self):
        """Rows of the loaded DataFrame matching ``selected_records``."""
        return self.retrieve_selected_data()

    def retrieve_selected_data(self):
        """Return the cached rows whose ``gp_idx`` is in ``selected_records``.

        Returns
        -------
        pandas.DataFrame
            The matching rows without the synthetic ``gp_idx`` column, or an
            empty DataFrame when ``selected_records`` is an empty string.

        Raises
        ------
        ValueError
            If no data has been loaded yet.
        """
        if self.cached_records_df is None:
            raise ValueError("No data has been loaded yet. Please call load_data() first.")
        if not self.selected_records:
            return pd.DataFrame()  # Return empty DataFrame if no selection

        selected_idx = json.loads(self.selected_records)
        cached = self.cached_records_df
        self.records_df = cached[cached['gp_idx'].isin(selected_idx)]

        # remove synthetic index
        return self.records_df.drop('gp_idx', axis=1)
@@ -0,0 +1,65 @@
1
+ import anywidget
2
+ import traitlets
3
+ import pandas as pd
4
+ import numpy as np
5
+ import warnings
6
+ import json
7
+ import os
8
+ import sys
9
+ from pandas.api import types as ptypes
10
+ from .utils import convert_to_float, validate_and_clean_dataframe, extract_summary_statistics
11
+
12
class Trailmark(anywidget.AnyWidget):
    """Widget wrapper around the ``static/trailmark.js`` front-end module.

    Loading a DataFrame publishes its per-column summary statistics through
    the synced ``_vis_data`` trait and a few dataset-level counts through the
    synced ``vis_configs`` trait.
    """

    # Front-end module shipped next to this file.
    _esm = os.path.join(os.path.dirname(__file__), "static", "trailmark.js")

    # Traits synced with the front end.
    _vis_data = traitlets.Dict({}).tag(sync=True)
    vis_configs = traitlets.Dict({}).tag(sync=True)

    # Forwarded to validate_and_clean_dataframe; also silences the
    # empty-input warning in load_data.
    suppress_warnings = False

    @property
    def records(self):
        """The summary statistics currently held in ``_vis_data``."""
        return self._vis_data

    @records.setter
    def records(self, df):
        # load_data already stores the result in ``_vis_data``.
        self.load_data(df)

    def load_data(self, in_df):
        """Load a dataframe and extract summary statistics for visualization.

        Parameters
        ----------
        in_df : pandas.DataFrame or anything accepted by ``pandas.DataFrame``
            Data to summarize.

        Returns
        -------
        dict
            The per-column summary produced by ``extract_summary_statistics``;
            the same value is stored in ``_vis_data``.

        Raises
        ------
        ValueError
            If ``in_df`` cannot be converted to a DataFrame.
        """
        # validate / coerce dataframe
        if not isinstance(in_df, pd.DataFrame):
            try:
                in_df = pd.DataFrame(in_df)
            except Exception as err:
                raise ValueError(
                    "in_df must be a pandas DataFrame or convertible to one"
                ) from err

        if not self.suppress_warnings and in_df.empty:
            warnings.warn("load_data called with an empty DataFrame")

        o_df, _report = validate_and_clean_dataframe(in_df, self.suppress_warnings)
        summary = extract_summary_statistics(o_df)

        # Tally how many columns fall into each semantic type, based on the
        # 'semantic_type' entry of every column summary.
        type_counts = {"continuous": 0, "ordinal": 0, "categorical": 0}
        for col_summary in summary.values():
            type_counts[col_summary["semantic_type"]] += 1

        # Publish through the synced traits so the front end sees the data
        # even when load_data is called directly rather than via ``records``.
        self._vis_data = summary
        self.vis_configs = {
            "n_rows": len(o_df),
            "n_columns": len(o_df.columns),
            "type_counts": type_counts,
        }

        return summary
62
+
63
+
64
+ # def retrieve_configuration_selection(self):
65
+ #
@@ -0,0 +1,178 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import warnings
4
+ import os
5
+ import sys
6
+ from pandas.api import types as ptypes
7
+
8
def convert_to_float(value):
    """Convert a value, optionally carrying a K/M/B suffix, to ``float``.

    Parameters
    ----------
    value : str, number or missing value
        A string such as ``"1.5K"``, ``"2M"``, ``"3B"`` or ``"42"``
        (surrounding whitespace is ignored), a plain number, or a missing
        value as recognised by ``pandas.isna``.

    Returns
    -------
    float
        The numeric value scaled by 1e3 (``K``), 1e6 (``M``) or 1e9 (``B``);
        ``numpy.nan`` for missing input.

    Raises
    ------
    ValueError
        If the text is not a valid number.
    """
    if pd.isna(value):
        return np.nan

    # Numbers have no suffix to inspect (and no ``endswith`` method).
    if not isinstance(value, str):
        return float(value)

    multipliers = {'K': 1e3, 'M': 1e6, 'B': 1e9}
    text = value.strip()
    scale = multipliers.get(text[-1:])
    if scale is None:
        return float(text)
    return float(text[:-1]) * scale
18
+
19
+
20
def _print_cleaning_warning(message, supress_warnings):
    """Print ``message`` prefixed with 'Warning: ' unless suppressed."""
    if not supress_warnings:
        print("Warning: " + message)


def _join_column_names(columns):
    """Render column labels (of any type) as a comma separated string."""
    return ', '.join(str(col) for col in columns)


def _first_cell_is_array(series):
    """Tell whether the first cell of ``series`` holds a numpy array."""
    return len(series) > 0 and isinstance(series.iloc[0], np.ndarray)


def validate_and_clean_dataframe(in_cpy, supress_warnings=False):
    """Drop columns guidepost cannot display and report what was found.

    The following columns are removed, in this order: columns containing
    only NA values, ``timedelta64[ns]`` columns, and columns whose first cell
    is a numpy array.  Rows are never removed; columns that contain some
    null values are kept and only counted.  Unless ``supress_warnings`` is
    true, a notice is printed for each finding and for frames with more than
    250,000 rows.

    NOTE: the original code also carried a ``warnings.warn(...,
    skip_file_prefixes=...)`` path that was permanently disabled; notices are
    still printed, exactly as before.

    Parameters
    ----------
    in_cpy : pandas.DataFrame
        Frame to clean.  It is not modified.
    supress_warnings : bool, default False
        When true, nothing is printed.

    Returns
    -------
    tuple of (pandas.DataFrame, dict)
        The cleaned frame and a report.  The report only contains keys for
        findings that occurred: ``na_columns``, ``timedelta_columns`` and
        ``array_columns`` (lists of column labels) and ``na_column_counts``
        (column label -> number of nulls).  It is empty when nothing was
        found.
    """
    report = {}

    # remove columns with only nans
    o_df = in_cpy.dropna(axis=1, how='all')
    na_cols = list(in_cpy.columns.difference(o_df.columns))
    if na_cols:
        report["na_columns"] = na_cols
        _print_cleaning_warning(
            "The following columns were dropped because they contained entirely 'na' values which guidepost does not support:[{}]".format(_join_column_names(na_cols)),
            supress_warnings,
        )

    # report per-column null counts but keep the rows; downstream JS handles
    # nulls per-axis
    null_counts = {}
    for col in o_df.columns:
        n_null = int(o_df[col].isna().sum())
        if n_null:
            null_counts[col] = n_null
    if null_counts:
        report["na_column_counts"] = null_counts
        _print_cleaning_warning(
            "The following columns contain null values which will be skipped per-axis at render time: {}".format(null_counts),
            supress_warnings,
        )

    # drop columns which are timedelta type
    cols_before = o_df.columns
    o_df = o_df.select_dtypes(exclude=['timedelta64[ns]'])
    timedelta_cols = list(cols_before.difference(o_df.columns))
    if timedelta_cols:
        report["timedelta_columns"] = timedelta_cols
        _print_cleaning_warning(
            "The following columns were dropped because they contained 'timedelta' values which guidepost does not support:[{}]. Consider converting these to an interger representation.".format(_join_column_names(timedelta_cols)),
            supress_warnings,
        )

    # drop arrays/complex datatypes (only the first cell is inspected)
    array_cols = [col for col in o_df.columns if _first_cell_is_array(o_df[col])]
    if array_cols:
        o_df = o_df.drop(columns=array_cols)
        report["array_columns"] = array_cols
        _print_cleaning_warning(
            "The following columns were dropped because they contained array values in cells which guidepost does not support:[{}]".format(_join_column_names(array_cols)),
            supress_warnings,
        )

    # warn about very large frames
    # NOTE(review): the threshold is 250,000 rows while the message advises
    # 200,000 -- confirm which figure is intended.
    if o_df.shape[0] > 250_000:
        _print_cleaning_warning(
            "Your dataframe is very large. You may experience performance issues. Consider subsampling or reducing the data down to below 200,000 rows to enhance performance.",
            supress_warnings,
        )

    return o_df, report
91
+
92
def _to_native(value):
    """Convert numpy scalars to native Python objects for JSON serialization."""
    try:
        return value.item() if hasattr(value, "item") else value
    except Exception:
        return str(value)


def _finite_or_none(value):
    """Return ``value`` as a float, or None when it is NaN or infinite.

    NaN/inf have no valid JSON representation, so they are reported as null.
    """
    number = float(value)
    return number if np.isfinite(number) else None


def extract_summary_statistics(o_df, top_n=10):
    """Summarize every column of ``o_df`` for the front end.

    Each column is assigned a semantic type:

    * categorical dtype -> ``"ordinal"`` when ordered, else ``"categorical"``
    * bool -> ``"categorical"``
    * numeric -> ``"ordinal"`` for integer dtypes or fewer than 20 unique
      values, else ``"continuous"``
    * datetime / timedelta -> ``"continuous"``
    * other columns whose non-null values all look like ``<number>[MKB]``
      are converted with ``convert_to_float`` and treated as
      ``"continuous"``; everything else is ``"categorical"``

    Parameters
    ----------
    o_df : pandas.DataFrame
        Frame to summarize.
    top_n : int, default 10
        Maximum number of most frequent values listed under ``top_values``
        for categorical / ordinal columns.

    Returns
    -------
    dict
        Column label -> summary dict.  Every summary has ``dtype``,
        ``semantic_type``, ``n_rows``, ``n_missing``, ``pct_missing`` and
        ``n_unique``.  Continuous columns add ``count``, ``mean``, ``std``,
        ``min``, ``25%``, ``50%``, ``75%``, ``IQR``, ``max`` and ``var``
        (``None`` when undefined, e.g. ``std`` of a single observation);
        other columns add ``top``, ``top_freq`` and ``top_values``.
    """
    summary = {}

    for col in o_df.columns:
        s = o_df[col]
        n_rows = len(s)
        n_missing = int(s.isna().sum())
        pct_missing = float(n_missing) / n_rows if n_rows > 0 else 0.0
        n_unique = int(s.nunique(dropna=True))

        # determine semantic type
        if isinstance(s.dtype, pd.CategoricalDtype):
            semantic = "ordinal" if s.dtype.ordered else "categorical"
        elif ptypes.is_bool_dtype(s.dtype):
            semantic = "categorical"
        elif ptypes.is_numeric_dtype(s.dtype):
            # NOTE(review): every integer column and every numeric column
            # with fewer than 20 unique values counts as ordinal; confirm
            # that 'or' (rather than 'and') is the intended heuristic.
            if ptypes.is_integer_dtype(s.dtype) or n_unique < 20:
                semantic = "ordinal"
            else:
                semantic = "continuous"
        elif ptypes.is_datetime64_any_dtype(s.dtype) or ptypes.is_timedelta64_dtype(s.dtype):
            # treat datetimes as continuous-like
            semantic = "continuous"
        elif s.dropna().astype(str).str.fullmatch(r'\d+(\.\d+)?[MKB]').all():
            # values are numbers with suffixes M, K, or B
            s = s.map(convert_to_float)
            semantic = "continuous"
        else:
            # object, string, etc.
            semantic = "categorical"

        col_summary = {
            "dtype": str(s.dtype),
            "semantic_type": semantic,
            "n_rows": n_rows,
            "n_missing": n_missing,
            "pct_missing": pct_missing,
            "n_unique": n_unique,
        }

        if semantic == "continuous":
            # numeric summaries over the non-NA values
            ser = pd.to_numeric(s, errors="coerce")
            count = int(ser.count())
            if count == 0:
                stats = dict.fromkeys(
                    ("mean", "std", "min", "25%", "50%", "75%", "IQR", "max", "var")
                )
            else:
                q25 = ser.quantile(0.25)
                q50 = ser.quantile(0.5)
                q75 = ser.quantile(0.75)
                stats = {
                    "mean": _finite_or_none(ser.mean()),
                    "std": _finite_or_none(ser.std()),
                    "min": _finite_or_none(ser.min()),
                    "25%": _finite_or_none(q25),
                    "50%": _finite_or_none(q50),
                    "75%": _finite_or_none(q75),
                    "IQR": _finite_or_none(q75 - q25),
                    "max": _finite_or_none(ser.max()),
                    "var": _finite_or_none(ser.var()),
                }
            col_summary["count"] = count
            col_summary.update(stats)
        else:
            # categorical / ordinal: top categories and frequencies
            vc = s.astype(object).value_counts(dropna=True)
            has_values = len(vc) > 0

            # include up to ``top_n`` most frequent values
            top_items = [
                {"value": _to_native(k), "count": int(v)}
                for k, v in vc.iloc[:top_n].items()
            ]
            col_summary.update({
                "top": _to_native(vc.index[0]) if has_values else None,
                "top_freq": int(vc.iloc[0]) if has_values else 0,
                "top_values": top_items,
            })

        summary[col] = col_summary

    return summary
178
+
@@ -0,0 +1,2 @@
1
# Package version, kept as a tuple of string components and as the
# dot-joined string derived from it.
__version_info__ = ("0", "2", "19")
__version__ = "{}.{}.{}".format(*__version_info__)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: guidepost
3
- Version: 0.2.17
3
+ Version: 0.2.19
4
4
  Summary: Guidepost. An overview visualization for understanding supercomputer queue data.
5
5
  Home-page: https://github.com/cscully-allison/guidepost
6
6
  Author: Connor Scully-Allison
@@ -8,7 +8,7 @@ Author-email: cscullyallison@sci.utah.edu
8
8
  Classifier: Programming Language :: Python :: 3
9
9
  Classifier: License :: OSI Approved :: MIT License
10
10
  Classifier: Operating System :: OS Independent
11
- Requires-Python: >=3.6
11
+ Requires-Python: >=3.10
12
12
  Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: numpy
@@ -4,8 +4,9 @@ README.md
4
4
  pyproject.toml
5
5
  setup.py
6
6
  guidepost/__init__.py
7
- guidepost/guidepost.js
8
7
  guidepost/guidepost.py
8
+ guidepost/trailmark.py
9
+ guidepost/utils.py
9
10
  guidepost/version.py
10
11
  guidepost.egg-info/PKG-INFO
11
12
  guidepost.egg-info/SOURCES.txt
@@ -28,14 +28,14 @@ setup(
28
28
  'pandas',
29
29
  'scikit-learn',
30
30
  'anywidget',
31
- 'traitlets'
31
+ 'traitlets',
32
32
  ],
33
33
  classifiers=[
34
34
  'Programming Language :: Python :: 3',
35
35
  'License :: OSI Approved :: MIT License',
36
36
  'Operating System :: OS Independent',
37
37
  ],
38
- python_requires='>=3.6',
38
+ python_requires='>=3.10',
39
39
  include_package_data=True,
40
40
  package_data={
41
41
  'guidepost': ['guidepost.js'],