datamapplot 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Tutte Institute for Mathematics and Computing
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,111 @@
1
+ Metadata-Version: 2.1
2
+ Name: datamapplot
3
+ Version: 0.1.0
4
+ Summary: A library for presentation and publication ready plots of data maps
5
+ Home-page: https://github.com/TutteInstitute/datamapplot
6
+ Author: Leland McInnes
7
+ Author-email: leland.mcinnes@gmail.com
8
+ Maintainer: Leland McInnes
9
+ Maintainer-email: leland.mcinnes@gmail.com
10
+ License: MIT License
11
+ Keywords: data map,visualization,topic modelling,cluster,clustering
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3.9
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Development Status :: 4 - Beta
16
+ Classifier: Operating System :: OS Independent
17
+ Requires-Python: >=3.9
18
+ License-File: LICENSE
19
+
20
+ .. -*- mode: rst -*-
21
+
22
+ .. image:: doc/datamapplot_text_horizontal.png
23
+ :width: 600
24
+ :alt: DataMapPlot logo
25
+ :align: center
26
+
27
+ ===========
28
+ DataMapPlot
29
+ ===========
30
+
31
+ Creating beautiful plots of data maps. DataMapPlot is a small library designed to help you make beautiful data map
32
+ plots for inclusion in presentations, posters and papers. The focus is on producing static plots that are great
33
+ looking with as little work for you as possible. All you need to do is label clusters of points in the data map and
34
+ DataMapPlot will take care of the rest. While this involves automating most of the aesthetic choices, the library
35
+ provides a wide variety of ways to customize the resulting plot to your needs.
36
+
37
+ --------
38
+ Examples
39
+ --------
40
+
41
+ Some examples of the kind of output that DataMapPlot can provide.
42
+
43
+ A basic plot, with some highlighted labels:
44
+
45
+ .. image:: examples/plot_cord19.png
46
+ :width: 1024
47
+ :alt: A data map plot of the CORD-19 dataset
48
+ :align: center
49
+
50
+ Using darkmode and some custom font choices:
51
+
52
+ .. image:: examples/plot_arxiv_ml.png
53
+ :width: 1024
54
+ :alt: A data map plot of papers from ArXiv ML
55
+ :align: center
56
+
57
+ Alternative custom styling:
58
+
59
+ .. image:: examples/plot_wikipedia.png
60
+ :width: 1024
61
+ :alt: A data map plot of Simple Wikipedia
62
+ :align: center
63
+
64
+ Custom arrow styles, fonts, and colour maps:
65
+
66
+ .. image:: examples/plot_simple_arxiv.png
67
+ :width: 1024
68
+ :alt: A styled data map plot of papers from ArXiv ML
69
+ :align: center
70
+
71
+ ------------
72
+ Installation
73
+ ------------
74
+
75
+ DataMapPlot requires a few libraries, but all are widely available and easy to install:
76
+
77
+ * Numpy
78
+ * Matplotlib
79
+ * Scikit-learn
80
+ * Pandas
81
+ * Datashader
82
+ * Scikit-image
83
+ * Numba
84
+
85
+ To install DataMapPlot you can use pip:
86
+
87
+ .. code:: bash
88
+
89
+ pip install datamapplot
90
+
91
+ or use conda with conda-forge
92
+
93
+ .. code:: bash
94
+
95
+ conda install -c conda-forge datamapplot
96
+
97
+
98
+ -------
99
+ License
100
+ -------
101
+
102
+ DataMapPlot is MIT licensed. See the LICENSE file for details.
103
+
104
+ ------------
105
+ Contributing
106
+ ------------
107
+
108
+ Contributions are more than welcome! If you have ideas for features or projects please get in touch. Everything from
109
+ code to notebooks to examples and documentation are all *equally valuable* so please don't feel you can't contribute.
110
+ To contribute please `fork the project <https://github.com/TutteInstitute/datamapplot/issues#fork-destination-box>`_, make your
111
+ changes and submit a pull request. We will do our best to work through any issues with you and get your code merged in.
@@ -0,0 +1,92 @@
1
+ .. -*- mode: rst -*-
2
+
3
+ .. image:: doc/datamapplot_text_horizontal.png
4
+ :width: 600
5
+ :alt: DataMapPlot logo
6
+ :align: center
7
+
8
+ ===========
9
+ DataMapPlot
10
+ ===========
11
+
12
+ Creating beautiful plots of data maps. DataMapPlot is a small library designed to help you make beautiful data map
13
+ plots for inclusion in presentations, posters and papers. The focus is on producing static plots that are great
14
+ looking with as little work for you as possible. All you need to do is label clusters of points in the data map and
15
+ DataMapPlot will take care of the rest. While this involves automating most of the aesthetic choices, the library
16
+ provides a wide variety of ways to customize the resulting plot to your needs.
17
+
18
+ --------
19
+ Examples
20
+ --------
21
+
22
+ Some examples of the kind of output that DataMapPlot can provide.
23
+
24
+ A basic plot, with some highlighted labels:
25
+
26
+ .. image:: examples/plot_cord19.png
27
+ :width: 1024
28
+ :alt: A data map plot of the CORD-19 dataset
29
+ :align: center
30
+
31
+ Using darkmode and some custom font choices:
32
+
33
+ .. image:: examples/plot_arxiv_ml.png
34
+ :width: 1024
35
+ :alt: A data map plot of papers from ArXiv ML
36
+ :align: center
37
+
38
+ Alternative custom styling:
39
+
40
+ .. image:: examples/plot_wikipedia.png
41
+ :width: 1024
42
+ :alt: A data map plot of Simple Wikipedia
43
+ :align: center
44
+
45
+ Custom arrow styles, fonts, and colour maps:
46
+
47
+ .. image:: examples/plot_simple_arxiv.png
48
+ :width: 1024
49
+ :alt: A styled data map plot of papers from ArXiv ML
50
+ :align: center
51
+
52
+ ------------
53
+ Installation
54
+ ------------
55
+
56
+ DataMapPlot requires a few libraries, but all are widely available and easy to install:
57
+
58
+ * Numpy
59
+ * Matplotlib
60
+ * Scikit-learn
61
+ * Pandas
62
+ * Datashader
63
+ * Scikit-image
64
+ * Numba
65
+
66
+ To install DataMapPlot you can use pip:
67
+
68
+ .. code:: bash
69
+
70
+ pip install datamapplot
71
+
72
+ or use conda with conda-forge
73
+
74
+ .. code:: bash
75
+
76
+ conda install -c conda-forge datamapplot
77
+
78
+
79
+ -------
80
+ License
81
+ -------
82
+
83
+ DataMapPlot is MIT licensed. See the LICENSE file for details.
84
+
85
+ ------------
86
+ Contributing
87
+ ------------
88
+
89
+ Contributions are more than welcome! If you have ideas for features or projects please get in touch. Everything from
90
+ code to notebooks to examples and documentation are all *equally valuable* so please don't feel you can't contribute.
91
+ To contribute please `fork the project <https://github.com/TutteInstitute/datamapplot/issues#fork-destination-box>`_, make your
92
+ changes and submit a pull request. We will do our best to work through any issues with you and get your code merged in.
@@ -0,0 +1,277 @@
1
+ import numpy as np
2
+ import pandas as pd
3
+ import textwrap
4
+
5
+ from matplotlib import pyplot as plt
6
+
7
+ from datamapplot.palette_handling import (
8
+ palette_from_datamap,
9
+ palette_from_cmap_and_datamap,
10
+ deep_palette,
11
+ pastel_palette,
12
+ )
13
+ from datamapplot.plot_rendering import render_plot
14
+ from datamapplot.medoids import medoid
15
+
16
+
17
def create_plot(
    data_map_coords,
    labels,
    *,
    title=None,
    sub_title=None,
    noise_label="Unlabelled",
    noise_color="#999999",
    color_label_text=True,
    label_wrap_width=16,
    label_color_map=None,
    figsize=(12, 12),
    dynamic_label_size=False,
    dpi=plt.rcParams["figure.dpi"],
    force_matplotlib=False,
    darkmode=False,
    highlight_labels=None,
    palette_hue_shift=0.0,
    palette_hue_radius_dependence=1.0,
    use_medoids=False,
    cmap=None,
    **render_plot_kwds,
):
    """Create a static plot from ``data_map_coords`` with text labels provided by ``labels``.

    This is the primary function for DataMapPlot and provides the easiest interface to the
    static plotting functionality. This function provides a number of options, but also
    passes any further keyword options through to the lower level ``render_plot`` function
    so be sure to check the documentation for ``render_plot`` to discover further keyword
    arguments that can be used here as well.

    Parameters
    ----------
    data_map_coords: ndarray of floats of shape (n_samples, 2)
        The 2D coordinates for the data map. Usually this is produced via a
        dimension reduction technique such as UMAP, t-SNE, PacMAP, PyMDE etc.
        Any array-like input is converted with ``np.asarray``.

    labels: ndarray of strings (object) of shape (n_samples,)
        A string label for each data point in the data map. There should ideally be
        only up to 64 unique labels. Noise or unlabelled points should have the
        same label as ``noise_label``, which is "Unlabelled" by default.

    title: str or None (optional, default=None)
        A title for the plot. If ``None`` then no title is used for the plot.
        The title should be succinct; three to seven words.

    sub_title: str or None (optional, default=None)
        A sub-title for the plot. If ``None`` then no sub-title is used for the plot.
        The sub-title can be significantly longer than the title and provide more
        information about the plot and data sources.

    noise_label: str (optional, default="Unlabelled")
        The string used in the ``labels`` array to identify the unlabelled or noise points
        in the dataset.

    noise_color: str (optional, default="#999999")
        The colour to use for unlabelled or noise points in the data map. This should usually
        be a muted or neutral colour to distinguish background points from the labelled clusters.

    color_label_text: bool (optional, default=True)
        Whether to use colours for the text labels generated in the plot. If ``False`` then
        the text labels will default to either black or white depending on ``darkmode``.

    label_wrap_width: int (optional, default=16)
        The number of characters to apply text-wrapping at when creating text labels for
        display in the plot. Note that long words will not be broken, so you can choose
        relatively small values if you want tight text-wrapping.

    label_color_map: dict or None (optional, default=None)
        A colour mapping to use to colour points/clusters in the data map. The mapping should
        be keyed by the unique cluster labels in ``labels`` and take values that are hex-string
        representations of colours. If ``None`` then a colour mapping will be auto-generated.

    figsize: (int, int) (optional, default=(12,12))
        How big to make the figure in inches (actual pixel size will depend on ``dpi``).

    dynamic_label_size: bool (optional, default=False)
        Whether to dynamically resize the text labels based on the relative sizes of the
        clusters. This can be useful to help highlight larger clusters. If every label
        group has the same size there is nothing to scale by, and no size adjustment is
        applied.

    dpi: int (optional, default=plt.rcParams["figure.dpi"])
        The dots-per-inch setting used when rendering the plot. Note that the default is
        read from ``plt.rcParams`` once, when this function is defined.

    force_matplotlib: bool (optional, default=False)
        Force using matplotlib instead of datashader for rendering the scatterplot of the
        data map. This can be useful if you wish to have a different marker_type, or variably
        sized markers based on a marker_size_array, neither of which are supported by the
        datashader based renderer.

    darkmode: bool (optional, default=False)
        Whether to render the plot in darkmode (with a dark background) or not.

    highlight_labels: list of str or None (optional, default=None)
        A list of unique labels that should have their text highlighted in the resulting plot.
        Arguments supported by ``render_plot`` can allow for control over how highlighted labels
        are rendered. By default they are simply rendered in bold text.

    palette_hue_shift: float (optional, default=0.0)
        A setting, in degrees clockwise, to shift the hue channel when generating a colour
        palette and color_mapping for the labels.

    palette_hue_radius_dependence: float (optional, default=1.0)
        A setting that determines how dependent on the radius the hue channel is. Larger
        values will result in more hue variation where there are more outlying points.

    use_medoids: bool (optional, default=False)
        Whether to use medoids instead of centroids to determine the "location" of the cluster,
        both for the label indicator line, and for palette colouring. Note that medoids are
        more computationally expensive, especially for large plots, so use with some caution.

    cmap: matplotlib cmap or None (optional, default=None)
        A linear matplotlib cmap colour map to use as the base for a generated colour mapping.
        This *should* be a matplotlib cmap that is smooth and linear, and cyclic
        (see the colorcet package for some good options). If not a cyclic cmap it will be
        "made" cyclic by reflecting it. If ``None`` then a custom method will be used instead.

    **render_plot_kwds
        All other keyword arguments are passed through to ``render_plot`` which provides
        significant further control over the aesthetics of the plot. ``point_size`` and
        ``alpha``, if given, override the values chosen heuristically here.

    Returns
    -------

    fig: matplotlib.Figure
        The figure that the resulting plot is rendered to.

    ax: matplotlib.Axes
        The axes contained within the figure that the plot is rendered to.

    """
    # Boolean-mask indexing and ``.shape`` below need real arrays, so coerce
    # array-likes (lists, tuples) up front; ndarrays pass through unchanged.
    data_map_coords = np.asarray(data_map_coords)
    cluster_label_vector = np.asarray(labels)
    unique_labels = np.unique(cluster_label_vector)
    unique_non_noise_labels = [
        label for label in unique_labels if label != noise_label
    ]

    # One representative location per labelled cluster.
    if use_medoids:
        # The distance function used by ``medoid`` is compiled only for
        # float32 C-contiguous arrays, so hand it exactly that.
        label_locations = np.asarray(
            [
                medoid(
                    np.ascontiguousarray(
                        data_map_coords[cluster_label_vector == label],
                        dtype=np.float32,
                    )
                )
                for label in unique_non_noise_labels
            ]
        )
    else:
        label_locations = np.asarray(
            [
                data_map_coords[cluster_label_vector == label].mean(axis=0)
                for label in unique_non_noise_labels
            ]
        )

    label_text = [
        textwrap.fill(x, width=label_wrap_width, break_long_words=False)
        for x in unique_non_noise_labels
    ]
    if highlight_labels is not None:
        # Wrap identically to ``label_text`` so that highlighted labels still
        # match the rendered label strings.
        highlight_labels = [
            textwrap.fill(x, width=label_wrap_width, break_long_words=False)
            for x in highlight_labels
        ]

    # If we don't have a color map, generate one
    if label_color_map is None:
        if cmap is None:
            palette = palette_from_datamap(
                data_map_coords,
                label_locations,
                hue_shift=palette_hue_shift,
                radius_weight_power=palette_hue_radius_dependence,
            )
        else:
            palette = palette_from_cmap_and_datamap(
                cmap,
                data_map_coords,
                label_locations,
                radius_weight_power=palette_hue_radius_dependence,
            )
        label_to_index_map = {
            name: index for index, name in enumerate(unique_non_noise_labels)
        }
        # Every label present in the data gets an entry; anything that is not
        # a labelled cluster (i.e. the noise label) maps to ``noise_color``.
        label_color_map = {
            x: (
                palette[label_to_index_map[x]]
                if x in label_to_index_map
                else noise_color
            )
            for x in unique_labels
        }
        color_list = [label_color_map[x] for x in cluster_label_vector]
    else:
        color_list = [
            label_color_map[x] if x != noise_label else noise_color
            for x in cluster_label_vector
        ]

    # Darken (or lighten, in darkmode) and reduce chroma of label colors to
    # get text label colors
    if color_label_text:
        text_palette = pastel_palette if darkmode else deep_palette
        label_text_colors = text_palette(
            [label_color_map[x] for x in unique_non_noise_labels]
        )
    else:
        label_text_colors = None

    # Per-label font size adjustments; zero means "use the base font size".
    label_size_adjustments = [0.0] * len(unique_non_noise_labels)
    if dynamic_label_size:
        font_scale_factor = np.sqrt(figsize[0] * figsize[1])
        cluster_sizes = np.sqrt(pd.Series(cluster_label_vector).value_counts())
        size_offsets = cluster_sizes - cluster_sizes.min()
        size_range = size_offsets.max()
        # When all groups are the same size the range is zero; dividing by it
        # would produce NaN font sizes, so keep the zero adjustments instead.
        if size_range > 0:
            base_font_size = render_plot_kwds.get(
                "label_font_size", font_scale_factor
            )
            scaled = (size_offsets / size_range) * (base_font_size + 2)
            adjustment_by_label = dict(scaled - 2)
            label_size_adjustments = [
                adjustment_by_label[x] for x in unique_non_noise_labels
            ]

    # Heuristics for point size and alpha values
    n_points = data_map_coords.shape[0]
    if n_points < 100_000 or force_matplotlib:
        magic_number = np.clip(128 * 4 ** (-np.log10(n_points)), 0.05, 64)
        point_scale_factor = np.sqrt(figsize[0] * figsize[1])
        point_size = magic_number * (point_scale_factor / 2)
        alpha = np.clip(magic_number, 0.05, 1)
    else:
        point_size = int(np.sqrt(figsize[0] * figsize[1]) * dpi) // 2048
        alpha = 1.0

    # Explicit user choices win over the heuristics; pop them so they are not
    # passed to ``render_plot`` twice.
    point_size = render_plot_kwds.pop("point_size", point_size)
    alpha = render_plot_kwds.pop("alpha", alpha)

    fig, ax = render_plot(
        data_map_coords,
        color_list,
        label_text,
        label_locations,
        title=title,
        sub_title=sub_title,
        point_size=point_size,
        alpha=alpha,
        label_colors=label_text_colors,
        highlight_colors=[label_color_map[x] for x in unique_non_noise_labels],
        figsize=figsize,
        noise_color=noise_color,
        label_size_adjustments=label_size_adjustments,
        dpi=dpi,
        force_matplotlib=force_matplotlib,
        darkmode=darkmode,
        highlight_labels=highlight_labels,
        **render_plot_kwds,
    )

    return fig, ax
@@ -0,0 +1,103 @@
1
+ import numpy as np
2
+ import numba
3
+
4
+
5
@numba.njit(
    [
        "f4(f4[::1],f4[::1])",
        numba.types.float32(
            numba.types.Array(numba.types.float32, 1, "C", readonly=True),
            numba.types.Array(numba.types.float32, 1, "C", readonly=True),
        ),
    ],
    fastmath=True,
    locals={
        "result": numba.types.float32,
        "diff": numba.types.float32,
        "dim": numba.types.intp,
        # Same width as ``dim`` so the index cannot be truncated for long vectors.
        "i": numba.types.intp,
    },
)
def euclidean(x, y):
    r"""Euclidean distance between two float32 vectors.

    .. math::
        D(x, y) = \sqrt{\sum_i (x_i - y_i)^2}

    Parameters
    ----------
    x: ndarray of float32 of shape (dim,)
        First vector; must be C-contiguous (writable or read-only).

    y: ndarray of float32 of shape (dim,)
        Second vector. Only the first ``x.shape[0]`` entries are read.

    Returns
    -------
    distance: float32
        The (non-squared) euclidean distance between ``x`` and ``y``.
    """
    result = 0.0
    dim = x.shape[0]
    for i in range(dim):
        diff = x[i] - y[i]
        result += diff * diff

    return np.sqrt(result)
34
+
35
+
36
@numba.njit(parallel=True, nogil=True)
def chunked_parallel_pairwise_distances(X, Y=None, metric=euclidean, chunk_size=16):
    """Compute a dense matrix of pairwise distances, in parallel over row chunks.

    Parameters
    ----------
    X: ndarray of shape (n_rows, dim)
        The points indexing the rows of the result.

    Y: ndarray of shape (n_cols, dim) or None (optional, default=None)
        The points indexing the columns of the result. If ``None`` the
        distances are computed between the rows of ``X`` and themselves, and
        ``metric`` is assumed to be symmetric so that each off-diagonal chunk
        is evaluated only once.

    metric: numba-compiled callable (optional, default=euclidean)
        Function taking two rows and returning their distance.

    chunk_size: int (optional, default=16)
        Side length of the square tiles the result is processed in.

    Returns
    -------
    result: ndarray of float32 of shape (n_rows, n_cols)
        ``result[i, j]`` is ``metric(X[i], Y[j])`` (or ``metric(X[i], X[j])``
        when ``Y`` is ``None``). The full matrix is populated in both cases.
    """
    if Y is None:
        XX, symmetrical = X, True
        row_size = col_size = X.shape[0]
    else:
        XX, symmetrical = Y, False
        row_size, col_size = X.shape[0], Y.shape[0]

    result = np.zeros((row_size, col_size), dtype=np.float32)
    # The ``+ 1`` covers a trailing partial chunk; when ``row_size`` is an
    # exact multiple the extra chunk is simply empty.
    n_row_chunks = (row_size // chunk_size) + 1
    for chunk_idx in numba.prange(n_row_chunks):
        n = chunk_idx * chunk_size
        chunk_end_n = min(n + chunk_size, row_size)
        # In the symmetric case only tiles on or right of the diagonal are
        # evaluated; tiles strictly right of it are mirrored below.
        m_start = n if symmetrical else 0
        for m in range(m_start, col_size, chunk_size):
            chunk_end_m = min(m + chunk_size, col_size)
            for i in range(n, chunk_end_n):
                for j in range(m, chunk_end_m):
                    distance = metric(X[i], XX[j])
                    result[i, j] = distance
                    if symmetrical and m > n:
                        # Fill the transposed cell so the lower triangle is
                        # not left at zero. Column ``i`` lies in this
                        # iteration's own chunk, so no other parallel
                        # iteration writes to ``result[j, i]``.
                        result[j, i] = distance
    return result
57
+
58
+
59
@numba.njit()
def pull_arms(data, arms, num_pulls_per_arm, estimates, pull_counts):
    """Refine the mean-distance estimate of each arm using a fresh random sample.

    ``num_pulls_per_arm`` distinct rows of ``data`` are drawn (without
    replacement), the distance from every arm to every drawn row is computed,
    and the per-arm running means held in ``estimates`` are updated.

    Parameters
    ----------
    data: ndarray of shape (n_samples, dim)
        All points under consideration.

    arms: ndarray of int
        Row indices into ``data`` of the candidate points being evaluated.

    num_pulls_per_arm: int
        How many reference rows to sample; must not exceed ``data.shape[0]``
        since sampling is without replacement.

    estimates: ndarray of float, one entry per arm
        Running mean distance for each arm. Updated in place.

    pull_counts: ndarray of int, one entry per arm
        Number of samples already folded into each estimate. Updated in place.

    Returns
    -------
    None
    """
    n_samples = data.shape[0]
    reference_rows = np.random.choice(
        n_samples, size=num_pulls_per_arm, replace=False
    ).astype(np.int32)

    arm_to_reference = chunked_parallel_pairwise_distances(
        data[arms], data[reference_rows]
    )
    new_distance_totals = np.sum(arm_to_reference, axis=1)

    # Running-mean update: turn the means back into totals, add the new
    # distances, then divide by the enlarged sample counts.
    estimates *= pull_counts
    estimates += new_distance_totals
    pull_counts += num_pulls_per_arm
    estimates /= pull_counts
75
+
76
+
77
@numba.njit()
def medoid(data, arm_budget=20):
    """Approximate the medoid of ``data`` by randomized successive elimination.

    Every row starts as a candidate ("arm"). Each round, the mean distance
    from every remaining candidate to a random sample of rows is estimated
    via ``pull_arms``, and candidates whose estimate is above the median
    estimate are discarded. This repeats until one candidate remains.

    Parameters
    ----------
    data: ndarray of shape (n_samples, dim)
        The points to find a medoid of. Must contain at least one row.

    arm_budget: int (optional, default=20)
        Scales the total sampling budget (``arm_budget * n_samples``) that is
        spread across candidates and rounds to pick the per-round sample size.

    Returns
    -------
    medoid: ndarray of shape (dim,)
        The row of ``data`` selected as the (approximate) medoid.

    Raises
    ------
    ValueError
        If ``data`` has no rows.
    """
    n_samples = data.shape[0]
    if n_samples == 0:
        raise ValueError("medoid requires at least one data point")

    pull_counts = np.zeros(n_samples, dtype=np.int32)
    pull_budget = arm_budget * n_samples
    estimates = np.zeros(n_samples, dtype=np.float32)
    current_active_arms = np.arange(n_samples)
    n_rounds = int(np.ceil(np.log2(n_samples)))

    while current_active_arms.shape[0] > 1:
        # Sampling is without replacement, hence the cap at ``n_samples``;
        # always pull at least once so the estimates keep being refined.
        num_pulls_per_arm = max(
            1,
            int(
                min(
                    n_samples,
                    np.floor(pull_budget / (current_active_arms.shape[0] * n_rounds)),
                )
            ),
        )
        pull_arms(data, current_active_arms, num_pulls_per_arm, estimates, pull_counts)

        median = np.median(estimates)
        mask = estimates <= median
        if np.all(mask):
            # No candidate is strictly worse than the median (e.g. a
            # two-point cluster, where both estimates are identical), so
            # filtering would eliminate nothing and the loop would never
            # end. Settle for the best current estimate instead.
            return data[current_active_arms[np.argmin(estimates)]]

        current_active_arms = current_active_arms[mask]
        estimates = estimates[mask]
        pull_counts = pull_counts[mask]

    return data[current_active_arms[0]]