datamapplot 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datamapplot-0.1.0/LICENSE +21 -0
- datamapplot-0.1.0/PKG-INFO +111 -0
- datamapplot-0.1.0/README.rst +92 -0
- datamapplot-0.1.0/datamapplot/__init__.py +277 -0
- datamapplot-0.1.0/datamapplot/medoids.py +103 -0
- datamapplot-0.1.0/datamapplot/overlap_computations.py +160 -0
- datamapplot-0.1.0/datamapplot/palette_handling.py +224 -0
- datamapplot-0.1.0/datamapplot/plot_rendering.py +587 -0
- datamapplot-0.1.0/datamapplot/text_placement.py +324 -0
- datamapplot-0.1.0/datamapplot.egg-info/PKG-INFO +111 -0
- datamapplot-0.1.0/datamapplot.egg-info/SOURCES.txt +16 -0
- datamapplot-0.1.0/datamapplot.egg-info/dependency_links.txt +1 -0
- datamapplot-0.1.0/datamapplot.egg-info/not-zip-safe +1 -0
- datamapplot-0.1.0/datamapplot.egg-info/requires.txt +8 -0
- datamapplot-0.1.0/datamapplot.egg-info/top_level.txt +1 -0
- datamapplot-0.1.0/setup.cfg +38 -0
- datamapplot-0.1.0/setup.py +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2023 Tutte Institute for Mathematics and Computing
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: datamapplot
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A library for presentation and publication ready plots of data maps
|
|
5
|
+
Home-page: https://github.com/TutteInstitute/datamapplot
|
|
6
|
+
Author: Leland McInnes
|
|
7
|
+
Author-email: leland.mcinnes@gmail.com
|
|
8
|
+
Maintainer: Leland McInnes
|
|
9
|
+
Maintainer-email: leland.mcinnes@gmail.com
|
|
10
|
+
License: MIT License
|
|
11
|
+
Keywords: data map,visualization,topic modelling,cluster,clustering
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Development Status :: 4 - Beta
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Requires-Python: >=3.9
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
|
|
20
|
+
.. -*- mode: rst -*-
|
|
21
|
+
|
|
22
|
+
.. image:: doc/datamapplot_text_horizontal.png
|
|
23
|
+
:width: 600
|
|
24
|
+
:alt: DataMapPlot logo
|
|
25
|
+
:align: center
|
|
26
|
+
|
|
27
|
+
===========
|
|
28
|
+
DataMapPlot
|
|
29
|
+
===========
|
|
30
|
+
|
|
31
|
+
Creating beautiful plots of data maps. DataMapPlot is a small library designed to help you make beautiful data map
|
|
32
|
+
plots for inclusion in presentations, posters and papers. The focus is on producing static plots that are great
|
|
33
|
+
looking with as little work for you as possible. All you need to do is label clusters of points in the data map and
|
|
34
|
+
DataMapPlot will take care of the rest. While this involves automating most of the aesthetic choices, the library
|
|
35
|
+
provides a wide variety of ways to customize the resulting plot to your needs.
|
|
36
|
+
|
|
37
|
+
--------
|
|
38
|
+
Examples
|
|
39
|
+
--------
|
|
40
|
+
|
|
41
|
+
Some examples of the kind of output that DataMapPlot can provide.
|
|
42
|
+
|
|
43
|
+
A basic plot, with some highlighted labels:
|
|
44
|
+
|
|
45
|
+
.. image:: examples/plot_cord19.png
|
|
46
|
+
:width: 1024
|
|
47
|
+
:alt: A data map plot of the CORD-19 dataset
|
|
48
|
+
:align: center
|
|
49
|
+
|
|
50
|
+
Using darkmode and some custom font choices:
|
|
51
|
+
|
|
52
|
+
.. image:: examples/plot_arxiv_ml.png
|
|
53
|
+
:width: 1024
|
|
54
|
+
:alt: A data map plot of papers from ArXiv ML
|
|
55
|
+
:align: center
|
|
56
|
+
|
|
57
|
+
Alternative custom styling:
|
|
58
|
+
|
|
59
|
+
.. image:: examples/plot_wikipedia.png
|
|
60
|
+
:width: 1024
|
|
61
|
+
:alt: A data map plot of Simple Wikipedia
|
|
62
|
+
:align: center
|
|
63
|
+
|
|
64
|
+
Custom arrow styles, fonts, and colour maps:
|
|
65
|
+
|
|
66
|
+
.. image:: examples/plot_simple_arxiv.png
|
|
67
|
+
:width: 1024
|
|
68
|
+
:alt: A styled data map plot of papers from ArXiv ML
|
|
69
|
+
:align: center
|
|
70
|
+
|
|
71
|
+
------------
|
|
72
|
+
Installation
|
|
73
|
+
------------
|
|
74
|
+
|
|
75
|
+
DataMapPlot requires a few libraries, but all are widely available and easy to install:
|
|
76
|
+
|
|
77
|
+
* Numpy
|
|
78
|
+
* Matplotlib
|
|
79
|
+
* Scikit-learn
|
|
80
|
+
* Pandas
|
|
81
|
+
* Datashader
|
|
82
|
+
* Scikit-image
|
|
83
|
+
* Numba
|
|
84
|
+
|
|
85
|
+
To install DataMapPlot you can use pip:
|
|
86
|
+
|
|
87
|
+
.. code:: bash
|
|
88
|
+
|
|
89
|
+
pip install datamapplot
|
|
90
|
+
|
|
91
|
+
or use conda with conda-forge
|
|
92
|
+
|
|
93
|
+
.. code:: bash
|
|
94
|
+
|
|
95
|
+
conda install -c conda-forge datamapplot
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
-------
|
|
99
|
+
License
|
|
100
|
+
-------
|
|
101
|
+
|
|
102
|
+
DataMapPlot is MIT licensed. See the LICENSE file for details.
|
|
103
|
+
|
|
104
|
+
------------
|
|
105
|
+
Contributing
|
|
106
|
+
------------
|
|
107
|
+
|
|
108
|
+
Contributions are more than welcome! If you have ideas for features or projects please get in touch. Everything from
|
|
109
|
+
code to notebooks to examples and documentation are all *equally valuable* so please don't feel you can't contribute.
|
|
110
|
+
To contribute please `fork the project <https://github.com/TutteInstitute/datamapplot/issues#fork-destination-box>`_, make your
|
|
111
|
+
changes and submit a pull request. We will do our best to work through any issues with you and get your code merged in.
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
.. -*- mode: rst -*-
|
|
2
|
+
|
|
3
|
+
.. image:: doc/datamapplot_text_horizontal.png
|
|
4
|
+
:width: 600
|
|
5
|
+
:alt: DataMapPlot logo
|
|
6
|
+
:align: center
|
|
7
|
+
|
|
8
|
+
===========
|
|
9
|
+
DataMapPlot
|
|
10
|
+
===========
|
|
11
|
+
|
|
12
|
+
Creating beautiful plots of data maps. DataMapPlot is a small library designed to help you make beautiful data map
|
|
13
|
+
plots for inclusion in presentations, posters and papers. The focus is on producing static plots that are great
|
|
14
|
+
looking with as little work for you as possible. All you need to do is label clusters of points in the data map and
|
|
15
|
+
DataMapPlot will take care of the rest. While this involves automating most of the aesthetic choices, the library
|
|
16
|
+
provides a wide variety of ways to customize the resulting plot to your needs.
|
|
17
|
+
|
|
18
|
+
--------
|
|
19
|
+
Examples
|
|
20
|
+
--------
|
|
21
|
+
|
|
22
|
+
Some examples of the kind of output that DataMapPlot can provide.
|
|
23
|
+
|
|
24
|
+
A basic plot, with some highlighted labels:
|
|
25
|
+
|
|
26
|
+
.. image:: examples/plot_cord19.png
|
|
27
|
+
:width: 1024
|
|
28
|
+
:alt: A data map plot of the CORD-19 dataset
|
|
29
|
+
:align: center
|
|
30
|
+
|
|
31
|
+
Using darkmode and some custom font choices:
|
|
32
|
+
|
|
33
|
+
.. image:: examples/plot_arxiv_ml.png
|
|
34
|
+
:width: 1024
|
|
35
|
+
:alt: A data map plot of papers from ArXiv ML
|
|
36
|
+
:align: center
|
|
37
|
+
|
|
38
|
+
Alternative custom styling:
|
|
39
|
+
|
|
40
|
+
.. image:: examples/plot_wikipedia.png
|
|
41
|
+
:width: 1024
|
|
42
|
+
:alt: A data map plot of Simple Wikipedia
|
|
43
|
+
:align: center
|
|
44
|
+
|
|
45
|
+
Custom arrow styles, fonts, and colour maps:
|
|
46
|
+
|
|
47
|
+
.. image:: examples/plot_simple_arxiv.png
|
|
48
|
+
:width: 1024
|
|
49
|
+
:alt: A styled data map plot of papers from ArXiv ML
|
|
50
|
+
:align: center
|
|
51
|
+
|
|
52
|
+
------------
|
|
53
|
+
Installation
|
|
54
|
+
------------
|
|
55
|
+
|
|
56
|
+
DataMapPlot requires a few libraries, but all are widely available and easy to install:
|
|
57
|
+
|
|
58
|
+
* Numpy
|
|
59
|
+
* Matplotlib
|
|
60
|
+
* Scikit-learn
|
|
61
|
+
* Pandas
|
|
62
|
+
* Datashader
|
|
63
|
+
* Scikit-image
|
|
64
|
+
* Numba
|
|
65
|
+
|
|
66
|
+
To install DataMapPlot you can use pip:
|
|
67
|
+
|
|
68
|
+
.. code:: bash
|
|
69
|
+
|
|
70
|
+
pip install datamapplot
|
|
71
|
+
|
|
72
|
+
or use conda with conda-forge
|
|
73
|
+
|
|
74
|
+
.. code:: bash
|
|
75
|
+
|
|
76
|
+
conda install -c conda-forge datamapplot
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
-------
|
|
80
|
+
License
|
|
81
|
+
-------
|
|
82
|
+
|
|
83
|
+
DataMapPlot is MIT licensed. See the LICENSE file for details.
|
|
84
|
+
|
|
85
|
+
------------
|
|
86
|
+
Contributing
|
|
87
|
+
------------
|
|
88
|
+
|
|
89
|
+
Contributions are more than welcome! If you have ideas for features or projects please get in touch. Everything from
|
|
90
|
+
code to notebooks to examples and documentation are all *equally valuable* so please don't feel you can't contribute.
|
|
91
|
+
To contribute please `fork the project <https://github.com/TutteInstitute/datamapplot/issues#fork-destination-box>`_, make your
|
|
92
|
+
changes and submit a pull request. We will do our best to work through any issues with you and get your code merged in.
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import pandas as pd
|
|
3
|
+
import textwrap
|
|
4
|
+
|
|
5
|
+
from matplotlib import pyplot as plt
|
|
6
|
+
|
|
7
|
+
from datamapplot.palette_handling import (
|
|
8
|
+
palette_from_datamap,
|
|
9
|
+
palette_from_cmap_and_datamap,
|
|
10
|
+
deep_palette,
|
|
11
|
+
pastel_palette,
|
|
12
|
+
)
|
|
13
|
+
from datamapplot.plot_rendering import render_plot
|
|
14
|
+
from datamapplot.medoids import medoid
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def create_plot(
    data_map_coords,
    labels,
    *,
    title=None,
    sub_title=None,
    noise_label="Unlabelled",
    noise_color="#999999",
    color_label_text=True,
    label_wrap_width=16,
    label_color_map=None,
    figsize=(12, 12),
    dynamic_label_size=False,
    dpi=plt.rcParams["figure.dpi"],
    force_matplotlib=False,
    darkmode=False,
    highlight_labels=None,
    palette_hue_shift=0.0,
    palette_hue_radius_dependence=1.0,
    use_medoids=False,
    cmap=None,
    **render_plot_kwds,
):
    """Create a static plot from ``data_map_coords`` with text labels provided by ``labels``.
    This is the primary function for DataMapPlot and provides the easiest interface to the
    static plotting functionality. This function provides a number of options, but also
    passes any further keyword options through to the lower level ``render_plot`` function
    so be sure to check the documentation for ``render_plot`` to discover further keyword
    arguments that can be used here as well.

    Parameters
    ----------
    data_map_coords: ndarray of floats of shape (n_samples, 2)
        The 2D coordinates for the data map. Usually this is produced via a
        dimension reduction technique such as UMAP, t-SNE, PacMAP, PyMDE etc.

    labels: ndarray of strings (object) of shape (n_samples,)
        A string label for each data point in the data map. There should ideally be
        only up to 64 unique labels. Noise or unlabelled points should have the
        same label as ``noise_label``, which is "Unlabelled" by default.

    title: str or None (optional, default=None)
        A title for the plot. If ``None`` then no title is used for the plot.
        The title should be succinct; three to seven words.

    sub_title: str or None (optional, default=None)
        A sub-title for the plot. If ``None`` then no sub-title is used for the plot.
        The sub-title can be significantly longer than the title and provide more information
        about the plot and data sources.

    noise_label: str (optional, default="Unlabelled")
        The string used in the ``labels`` array to identify the unlabelled or noise points
        in the dataset.

    noise_color: str (optional, default="#999999")
        The colour to use for unlabelled or noise points in the data map. This should usually
        be a muted or neutral colour to distinguish background points from the labelled clusters.

    color_label_text: bool (optional, default=True)
        Whether to use colours for the text labels generated in the plot. If ``False`` then
        the text labels will default to either black or white depending on ``darkmode``.

    label_wrap_width: int (optional, default=16)
        The number of characters to apply text-wrapping at when creating text labels for
        display in the plot. Note that long words will not be broken, so you can choose
        relatively small values if you want tight text-wrapping.

    label_color_map: dict or None (optional, default=None)
        A colour mapping to use to colour points/clusters in the data map. The mapping should
        be keyed by the unique cluster labels in ``labels`` and take values that are hex-string
        representations of colours. If ``None`` then a colour mapping will be auto-generated.

    figsize: (int, int) (optional, default=(12,12))
        How big to make the figure in inches (actual pixel size will depend on ``dpi``).

    dynamic_label_size: bool (optional, default=False)
        Whether to dynamically resize the text labels based on the relative sizes of the
        clusters. This can be useful to help highlight larger clusters. If every label
        (including the noise label) covers the same number of points there is nothing to
        scale by, and no size adjustment is applied.

    dpi: int (optional, default=plt.rcParams["figure.dpi"])
        The dots-per-inch setting used when rendering the plot. The default is read from
        ``plt.rcParams`` once, when this function is defined.

    force_matplotlib: bool (optional, default=False)
        Force using matplotlib instead of datashader for rendering the scatterplot of the
        data map. This can be useful if you wish to have a different marker_type, or variably
        sized markers based on a marker_size_array, neither of which are supported by the
        datashader based renderer.

    darkmode: bool (optional, default=False)
        Whether to render the plot in darkmode (with a dark background) or not.

    highlight_labels: list of str or None (optional, default=None)
        A list of unique labels that should have their text highlighted in the resulting plot.
        Arguments supported by ``render_plot`` can allow for control over how highlighted labels
        are rendered. By default they are simply rendered in bold text.

    palette_hue_shift: float (optional, default=0.0)
        A setting, in degrees clockwise, to shift the hue channel when generating a colour
        palette and color_mapping for the labels.

    palette_hue_radius_dependence: float (optional, default=1.0)
        A setting that determines how dependent on the radius the hue channel is. Larger
        values will result in more hue variation where there are more outlying points.

    use_medoids: bool (optional, default=False)
        Whether to use medoids instead of centroids to determine the "location" of the cluster,
        both for the label indicator line, and for palette colouring. Note that medoids are
        more computationally expensive, especially for large plots, so use with some caution.

    cmap: matplotlib cmap or None (optional, default=None)
        A linear matplotlib cmap colour map to use as the base for a generated colour mapping.
        This *should* be a matplotlib cmap that is smooth and linear, and cyclic
        (see the colorcet package for some good options). If not a cyclic cmap it will be
        "made" cyclic by reflecting it. If ``None`` then a custom method will be used instead.

    **render_plot_kwds
        All other keyword arguments are passed through to ``render_plot`` which provides
        significant further control over the aesthetics of the plot. ``point_size`` and
        ``alpha``, if given here, override the values this function would otherwise choose
        heuristically.

    Returns
    -------

    fig: matplotlib.Figure
        The figure that the resulting plot is rendered to.

    ax: matplotlib.Axes
        The axes contained within the figure that the plot is rendered to.

    """
    cluster_label_vector = np.asarray(labels)
    unique_non_noise_labels = [
        label for label in np.unique(cluster_label_vector) if label != noise_label
    ]
    if use_medoids:
        label_locations = np.asarray(
            [
                medoid(data_map_coords[cluster_label_vector == i])
                for i in unique_non_noise_labels
            ]
        )
    else:
        label_locations = np.asarray(
            [
                data_map_coords[cluster_label_vector == i].mean(axis=0)
                for i in unique_non_noise_labels
            ]
        )
    label_text = [
        textwrap.fill(x, width=label_wrap_width, break_long_words=False)
        for x in unique_non_noise_labels
    ]
    if highlight_labels is not None:
        # Wrap the highlight labels the same way as label_text so that the
        # wrapped strings handed to render_plot can match each other.
        highlight_labels = [
            textwrap.fill(x, width=label_wrap_width, break_long_words=False)
            for x in highlight_labels
        ]

    # If we don't have a color map, generate one
    if label_color_map is None:
        if cmap is None:
            palette = palette_from_datamap(
                data_map_coords,
                label_locations,
                hue_shift=palette_hue_shift,
                radius_weight_power=palette_hue_radius_dependence,
            )
        else:
            palette = palette_from_cmap_and_datamap(
                cmap,
                data_map_coords,
                label_locations,
                radius_weight_power=palette_hue_radius_dependence,
            )
        label_to_index_map = {
            name: index for index, name in enumerate(unique_non_noise_labels)
        }
        color_list = [
            palette[label_to_index_map[x]] if x in label_to_index_map else noise_color
            for x in cluster_label_vector
        ]
        label_color_map = {
            x: (
                palette[label_to_index_map[x]]
                if x in label_to_index_map
                else noise_color
            )
            for x in np.unique(cluster_label_vector)
        }
    else:
        color_list = [
            label_color_map[x] if x != noise_label else noise_color
            for x in cluster_label_vector
        ]

    # Darken and reduce chroma of label colors to get text labels
    if color_label_text:
        if darkmode:
            label_text_colors = pastel_palette(
                [label_color_map[x] for x in unique_non_noise_labels]
            )
        else:
            label_text_colors = deep_palette(
                [label_color_map[x] for x in unique_non_noise_labels]
            )
    else:
        label_text_colors = None

    if dynamic_label_size:
        font_scale_factor = np.sqrt(figsize[0] * figsize[1])
        # Counts are taken over every label present, the noise label included.
        cluster_sizes = np.sqrt(pd.Series(cluster_label_vector).value_counts())
        label_size_adjustments = cluster_sizes - cluster_sizes.min()
        size_spread = label_size_adjustments.max()
        if size_spread > 0:
            label_size_adjustments /= size_spread
            label_size_adjustments *= (
                render_plot_kwds.get("label_font_size", font_scale_factor) + 2
            )
            label_size_adjustments = dict(label_size_adjustments - 2)
            label_size_adjustments = [
                label_size_adjustments[x] for x in unique_non_noise_labels
            ]
        else:
            # All labels have the same count, so normalising would divide
            # zero by zero and yield NaN sizes; apply no adjustment instead.
            label_size_adjustments = [0.0] * len(unique_non_noise_labels)
    else:
        label_size_adjustments = [0.0] * len(unique_non_noise_labels)

    # Heuristics for point size and alpha values
    n_points = data_map_coords.shape[0]
    if n_points < 100_000 or force_matplotlib:
        magic_number = np.clip(128 * 4 ** (-np.log10(n_points)), 0.05, 64)
        point_scale_factor = np.sqrt(figsize[0] * figsize[1])
        point_size = magic_number * (point_scale_factor / 2)
        alpha = np.clip(magic_number, 0.05, 1)
    else:
        point_size = int(np.sqrt(figsize[0] * figsize[1]) * dpi) // 2048
        alpha = 1.0

    # Explicit user values win over the heuristics; they are popped so they
    # are not passed to render_plot twice.
    point_size = render_plot_kwds.pop("point_size", point_size)
    alpha = render_plot_kwds.pop("alpha", alpha)

    fig, ax = render_plot(
        data_map_coords,
        color_list,
        label_text,
        label_locations,
        title=title,
        sub_title=sub_title,
        point_size=point_size,
        alpha=alpha,
        # Already None when color_label_text is False.
        label_colors=label_text_colors,
        highlight_colors=[label_color_map[x] for x in unique_non_noise_labels],
        figsize=figsize,
        noise_color=noise_color,
        label_size_adjustments=label_size_adjustments,
        dpi=dpi,
        force_matplotlib=force_matplotlib,
        darkmode=darkmode,
        highlight_labels=highlight_labels,
        **render_plot_kwds,
    )

    return fig, ax
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
import numba
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@numba.njit(
    [
        "f4(f4[::1],f4[::1])",
        numba.types.float32(
            numba.types.Array(numba.types.float32, 1, "C", readonly=True),
            numba.types.Array(numba.types.float32, 1, "C", readonly=True),
        ),
    ],
    fastmath=True,
    locals={
        "result": numba.types.float32,
        "diff": numba.types.float32,
        "dim": numba.types.intp,
        # Same integer type as ``dim`` so the index cannot wrap around for
        # vectors longer than a 16-bit counter could address.
        "i": numba.types.intp,
    },
)
def euclidean(x, y):
    r"""Euclidean distance between two float32 vectors.

    .. math::
        D(x, y) = \sqrt{\sum_i (x_i - y_i)^2}

    Only the explicit signatures above are compiled: both arguments must be
    one-dimensional, C-contiguous float32 arrays (writable or read-only).
    The length is taken from ``x``; ``y`` is indexed over the same range.

    Parameters
    ----------
    x: ndarray of float32 of shape (dim,)
        The first vector.

    y: ndarray of float32 of shape (dim,)
        The second vector.

    Returns
    -------
    distance: float32
        The square root of the summed squared coordinate differences.
    """
    result = 0.0
    dim = x.shape[0]
    for i in range(dim):
        diff = x[i] - y[i]
        result += diff * diff

    return np.sqrt(result)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@numba.njit(parallel=True, nogil=True)
def chunked_parallel_pairwise_distances(X, Y=None, metric=euclidean, chunk_size=16):
    """Compute a dense matrix of pairwise distances, in chunks of rows.

    Rows of ``X`` are split into blocks of ``chunk_size`` which are handed
    out via ``numba.prange``; within each row block the columns are walked in
    blocks of ``chunk_size`` as well.

    Parameters
    ----------
    X: ndarray of shape (n_rows, dim)
        The points that index the rows of the result.

    Y: ndarray of shape (n_cols, dim) or None (optional, default=None)
        The points that index the columns of the result. If ``None`` the
        distances are computed between the rows of ``X`` themselves.

    metric: numba jitted callable (optional, default=euclidean)
        Called as ``metric(X[i], Y[j])`` for each pair of rows. When ``Y`` is
        ``None`` each off-diagonal chunk is evaluated once and copied to its
        transposed position, so the metric is assumed to be symmetric.

    chunk_size: int (optional, default=16)
        The number of rows (and columns) per block.

    Returns
    -------
    result: ndarray of float32 of shape (n_rows, n_cols)
        ``result[i, j]`` holds ``metric(X[i], Y[j])`` (or ``metric(X[i], X[j])``
        when ``Y`` is ``None``).
    """
    if Y is None:
        XX, symmetrical = X, True
        row_size = col_size = X.shape[0]
    else:
        XX, symmetrical = Y, False
        row_size, col_size = X.shape[0], Y.shape[0]

    result = np.zeros((row_size, col_size), dtype=np.float32)
    n_row_chunks = (row_size // chunk_size) + 1
    for chunk_idx in numba.prange(n_row_chunks):
        n = chunk_idx * chunk_size
        chunk_end_n = min(n + chunk_size, row_size)
        # In the symmetric case only chunks on or above the diagonal are
        # evaluated; the chunks below it are filled by mirroring.
        m_start = n if symmetrical else 0
        for m in range(m_start, col_size, chunk_size):
            chunk_end_m = min(m + chunk_size, col_size)
            # The diagonal chunk (m == n) already covers both (i, j) and
            # (j, i), so only strictly off-diagonal chunks need mirroring.
            # The mirrored cells lie in a row block that starts its own
            # column walk after them, so no other iteration writes them.
            mirror = symmetrical and m > n
            for i in range(n, chunk_end_n):
                for j in range(m, chunk_end_m):
                    result[i, j] = metric(X[i], XX[j])
                    if mirror:
                        result[j, i] = result[i, j]
    return result
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@numba.njit()
def pull_arms(data, arms, num_pulls_per_arm, estimates, pull_counts):
    """Refine the running mean-distance estimate of each arm, in place.

    A single set of ``num_pulls_per_arm`` distinct row indices is drawn from
    ``data`` and shared by all arms. The distance from every arm to every
    sampled row is computed, and the per-arm sums are folded into the running
    means held in ``estimates``.

    Parameters
    ----------
    data: ndarray of shape (n_samples, dim)
        The full set of points.

    arms: ndarray of int
        Row indices into ``data`` of the arms to update.

    num_pulls_per_arm: int
        How many reference rows to sample (without replacement).

    estimates: ndarray of float, one entry per arm
        Running mean distance for each arm; updated in place.

    pull_counts: ndarray of int, one entry per arm
        How many distances have gone into each running mean; updated in place.

    Returns
    -------
    None
    """
    reference_indices = np.random.choice(
        data.shape[0], size=num_pulls_per_arm, replace=False
    ).astype(np.int32)

    arm_to_reference = chunked_parallel_pairwise_distances(
        data[arms], data[reference_indices]
    )
    distance_sums = np.sum(arm_to_reference, axis=1)

    # Turn each running mean back into a running total, add the freshly
    # sampled distances, then divide by the updated counts. The steps are
    # kept as separate in-place operations so the caller's arrays are the
    # ones that get modified.
    estimates *= pull_counts
    estimates += distance_sums
    pull_counts += num_pulls_per_arm
    estimates /= pull_counts
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@numba.njit()
def medoid(data, arm_budget=20):
    """Approximate the medoid of ``data`` by repeated sampling and halving.

    Every row of ``data`` starts as a candidate ("arm"). Each round estimates
    every remaining candidate's mean distance to a random sample of rows
    (see ``pull_arms``) and keeps only the candidates whose estimate is at
    most the median estimate. This repeats until one candidate is left or no
    further candidates can be separated.

    Parameters
    ----------
    data: ndarray of shape (n_samples, dim)
        The points to find a medoid of. Must contain at least one row.
        Distances are computed on a float32 copy, because the default
        ``euclidean`` metric is compiled for float32 input only.

    arm_budget: int (optional, default=20)
        Scales the total sampling budget, which is ``arm_budget * n_samples``
        distance evaluations per candidate-round split.

    Returns
    -------
    medoid: ndarray of shape (dim,)
        The selected row of ``data`` (in the dtype of ``data``). If several
        remaining candidates have identical estimates, the first of them is
        returned.
    """
    n_points = data.shape[0]
    # The distance kernel only accepts float32 rows; sample on a float32 copy
    # and index the caller's array for the return value.
    data32 = data.astype(np.float32)

    pull_counts = np.zeros(n_points, dtype=np.int32)
    pull_budget = arm_budget * n_points
    estimates = np.zeros(n_points, dtype=np.float32)
    current_active_arms = np.arange(n_points)
    n_rounds = int(np.ceil(np.log2(n_points)))

    while current_active_arms.shape[0] > 1:
        num_pulls_per_arm = max(
            1,
            int(
                min(
                    n_points,
                    np.floor(pull_budget / (current_active_arms.shape[0] * n_rounds)),
                )
            ),
        )
        pull_arms(
            data32, current_active_arms, num_pulls_per_arm, estimates, pull_counts
        )

        median = np.median(estimates)
        mask = estimates <= median
        n_kept = np.sum(mask)
        # If every candidate is kept they all tie at the median (this always
        # happens for two points, whose mutual distance is the same both
        # ways), and looping again would never shrink the set. If none is
        # kept the estimates are not comparable (e.g. NaN). Stop either way.
        if n_kept == 0 or n_kept == current_active_arms.shape[0]:
            break
        current_active_arms = current_active_arms[mask]
        estimates = estimates[mask]
        pull_counts = pull_counts[mask]

    # With one candidate left this is index 0; after an early stop it is the
    # first candidate with the smallest estimate.
    return data[current_active_arms[np.argmin(estimates)]]
|