ProteinClusterTools 1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- proteinclustertools-1.0/.gitignore +5 -0
- proteinclustertools-1.0/License +21 -0
- proteinclustertools-1.0/PKG-INFO +11 -0
- proteinclustertools-1.0/README.md +0 -0
- proteinclustertools-1.0/pyproject.toml +23 -0
- proteinclustertools-1.0/src/proteinclustertools/__init__.py +0 -0
- proteinclustertools-1.0/src/proteinclustertools/layout/__init__.py +0 -0
- proteinclustertools-1.0/src/proteinclustertools/layout/circle_collision.py +196 -0
- proteinclustertools-1.0/src/proteinclustertools/layout/circle_layout_tools.py +226 -0
- proteinclustertools-1.0/src/proteinclustertools/layout/draw_circles.py +23 -0
- proteinclustertools-1.0/src/proteinclustertools/pipeline.py +330 -0
- proteinclustertools-1.0/src/proteinclustertools/tools/__init__.py +0 -0
- proteinclustertools-1.0/src/proteinclustertools/tools/cluster_merging.py +135 -0
- proteinclustertools-1.0/src/proteinclustertools/tools/cluster_stream.py +313 -0
- proteinclustertools-1.0/src/proteinclustertools/tools/esm_extract.py +140 -0
- proteinclustertools-1.0/src/proteinclustertools/tools/esm_wrapper.py +52 -0
- proteinclustertools-1.0/src/proteinclustertools/tools/hierarchical_clustering.py +64 -0
- proteinclustertools-1.0/src/proteinclustertools/tools/hmm_cluster_rep.py +168 -0
- proteinclustertools-1.0/src/proteinclustertools/tools/kmeans.py +43 -0
- proteinclustertools-1.0/src/proteinclustertools/tools/mmseqs_wrapper.py +21 -0
- proteinclustertools-1.0/src/proteinclustertools/tools/sanitize_fasta_headers.py +23 -0
- proteinclustertools-1.0/src/proteinclustertools/tools/summarize_distribution.py +41 -0
- proteinclustertools-1.0/src/proteinclustertools/tools/vector_cluster_rep.py +23 -0
- proteinclustertools-1.0/src/proteinclustertools/visuals/__init__.py +0 -0
- proteinclustertools-1.0/src/proteinclustertools/visuals/annotate.py +274 -0
- proteinclustertools-1.0/src/proteinclustertools/visuals/circle_plot.py +338 -0
- proteinclustertools-1.0/src/proteinclustertools/visuals/tree_utils.py +214 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 John Chen, Barnabas Gall
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: ProteinClusterTools
|
|
3
|
+
Version: 1.0
|
|
4
|
+
Summary: Pipeline for clustering protein sequences in large scale datasets, using homology or pLM representation.
|
|
5
|
+
Project-URL: Homepage, https://github.com/johnchen93/ProteinClusterTools
|
|
6
|
+
Project-URL: Issues, https://github.com/johnchen93/ProteinClusterTools/issues
|
|
7
|
+
Author-email: John Chen <jo.chn.93@gmail.com>, Barnabas Gall <barnabas.gall@anu.edu.au>
|
|
8
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
9
|
+
Classifier: Operating System :: OS Independent
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Requires-Python: >=3.6
|
|
File without changes
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ProteinClusterTools"
|
|
7
|
+
version = "1.0"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="John Chen", email="jo.chn.93@gmail.com" },
|
|
10
|
+
{ name="Barnabas Gall", email="barnabas.gall@anu.edu.au"}
|
|
11
|
+
]
|
|
12
|
+
description = "Pipeline for clustering protein sequences in large scale datasets, using homology or pLM representation."
|
|
13
|
+
readme = "README.md"
|
|
14
|
+
requires-python = ">=3.6"
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[project.urls]
|
|
22
|
+
Homepage = "https://github.com/johnchen93/ProteinClusterTools"
|
|
23
|
+
Issues = "https://github.com/johnchen93/ProteinClusterTools/issues"
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
import time
|
|
3
|
+
from typing import Optional
|
|
4
|
+
import math
|
|
5
|
+
import Box2D
|
|
6
|
+
import matplotlib.pyplot as plt
|
|
7
|
+
import random
|
|
8
|
+
from tqdm import tqdm
|
|
9
|
+
import pandas as pd
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
# Set up the Box2D world
|
|
13
|
+
# world = Box2D.b2World(gravity=(0,0))
|
|
14
|
+
|
|
15
|
+
@dataclass
class Circle:
    '''
    One circle of a layout.

    x, y: coordinates of the centre
    r: radius
    id: identifier of the circle
    parent: optional circle this one is nested in
    size: optional size value attached to the circle
    '''
    x: float
    y: float
    r: float
    id: str
    parent: Optional['Circle'] = None
    size: Optional[float] = None
|
|
23
|
+
|
|
24
|
+
# Define a function to create a circle body
|
|
25
|
+
def create_circle(world, x, y, radius, density=1.0):
    '''
    Add a dynamic circular body to a Box2D world and return it.

    world: Box2D world the body is created in
    x, y: initial position of the body
    radius: radius of the circle shape
    density: density given to the fixture
    '''
    definition = Box2D.b2BodyDef()
    definition.type = Box2D.b2_dynamicBody
    definition.position = (x, y)
    new_body = world.CreateBody(definition)

    # a single circular fixture carries both the shape and the density
    circle_shape = Box2D.b2CircleShape(radius=radius)
    new_body.CreateFixture(Box2D.b2FixtureDef(shape=circle_shape, density=density))

    return new_body
|
|
36
|
+
|
|
37
|
+
def GetRadii(sizes, min_size=0, scale=1, force=None):
    '''
    Convert cluster sizes into circle radii, treating size*scale as the circle area.

    sizes: pandas Series of cluster sizes
    min_size: entries smaller than this are dropped
    scale: multiplier applied to each size before conversion
    force: optional boolean mask aligned with sizes; masked entries are kept even if below min_size

    Returns a Series named 'radius' that shares the index of the kept entries.
    '''
    keep = sizes >= min_size
    if force is not None:
        keep = keep | (force)
    sel = sizes[keep]
    # area = pi * r**2  ->  r = sqrt(area / pi)
    values = [(s * scale / math.pi) ** 0.5 for s in sel]
    return pd.Series(values, index=sel.index, name='radius')
|
|
48
|
+
|
|
49
|
+
def MakeLayout(radii, radius_pad=1, space_mult=1.2, cycles=10_000, time_step=1, velocity_iterations=6, position_iterations=2,
               parent:Circle=None, seed=None, pull=0, min_radius_frac=.05, seed_coords=None, expand_mult=1, metadata=None, silence=True):
    '''
    Lay out circles with a Box2D simulation (zero gravity) and return them centred on the parent (or the origin).

    radii: pandas Series of radii (index indicates the id of the circle)
    radius_pad: added to every radius while simulating; returned circles carry the unpadded radius
    space_mult: multiplier for the spread of the random starting positions
    cycles: number of simulation steps
    time_step: base step size; the first two thirds of the cycles run with twice this step
    velocity_iterations, position_iterations: passed to the Box2D world step
    parent: parent circle to nest current circles within (stored on each circle, and used as the centre)
    seed: random seed (re-seeds the global `random` module when given)
    pull: if non-zero, each cycle applies a force towards the origin scaled by body density and this value
    min_radius_frac: fraction of the largest radius used as a floor in the starting-position formula
    seed_coords: Series of (x,y) coordinates to use as starting points for circles, indexed by the id which is expected to match radii
    expand_mult: multiplier for the expansion applied to seeded starting positions
    metadata: returned unchanged alongside the circles
    silence: when False, progress/timing messages are printed for larger inputs

    Returns (circles, metadata), where circles is a list of Circle in the same order as radii.
    '''
    if seed is not None:
        random.seed(seed)

    # if parent is None:
    area=sum(math.pi*(r+radius_pad)**2 for r in radii)
    spacing=area**0.5 * space_mult

    world = Box2D.b2World(gravity=(0,0))
    rmax = max(radii)
    min_effective_r = rmax * min_radius_frac
    total_area=0
    start_circles=[]
    N=len(radii)
    lastx, lasty, lastr=0, 0, 0
    distances=[]
    radii_sum=[]
    for i, r in radii.items():
        if seed_coords is not None:
            row=seed_coords.loc[i] # match by index
            x, y=row['x'], row['y']
        else:
            # random direction; larger circles start closer to the origin
            angle=random.random() * 2 * math.pi
            pos_r=2**-math.log(max(min_effective_r, r)) * spacing + random.random() * spacing * .01
            x=pos_r*math.cos(angle)
            y=pos_r*math.sin(angle)
        total_area+=math.pi*(r+radius_pad)**2
        start_circles.append(Circle(x, y, r+radius_pad, i))
        # track distance and summed radii between consecutive circles (skips the first one)
        if len(start_circles)>1:
            distances.append(((x-lastx)**2+(y-lasty)**2)**0.5)
            radii_sum.append(r+lastr)
        lastx, lasty, lastr=x, y, r
    if len(start_circles)>1:
        avg_dist=sum(distances)/len(distances)
        avg_radii_overlap=sum(rs-d for rs, d in zip(radii_sum, distances))/len(radii_sum)
    else:
        avg_dist=0
        avg_radii_overlap=0

    # get boundaries (GetLimits returns xmin, xmax, ymin, ymax)
    xmin, xmax, ymin, ymax=GetLimits(start_circles)
    # random starts are left as they are; seeded starts are rescaled about their centroid
    if seed_coords is None:
        expand_factor=1
        pull_mod=1
    else:
        # NOTE(review): avg_radii_overlap is <= 0 when consecutive seeded circles do not overlap on
        # average, which makes expand_factor (and pull_mod) zero or negative -- confirm this is intended
        expand_factor=(total_area/(xmax-xmin)/(ymax-ymin)) * expand_mult * len(radii)**.5 * avg_radii_overlap
        pull_mod=expand_factor/cycles

    # get centroid of xy
    cx, cy=0, 0
    for c in start_circles:
        cx+=c.x
        cy+=c.y
    cx/=len(start_circles)
    cy/=len(start_circles)

    start=time.time()
    if N >100 and not silence:
        print(f' Average distance: {avg_dist}, average overlap: {avg_radii_overlap}')
        print(f' Expanding by {expand_factor} for {len(radii)} circles')

    # scale distance from centroid
    for c in start_circles:
        c.x=(c.x-cx)*expand_factor+cx
        c.y=(c.y-cy)*expand_factor+cy

    if N >100 and not silence:
        print(f' Expanding took {time.time()-start} seconds')

    # one dynamic body per circle; density is the squared (padded) radius
    bodies=[create_circle(world, c.x, c.y, c.r, c.r**2) for c in start_circles]

    for cycle in range(cycles):
        # double step size for the first two thirds of the run
        cycle_mod=2 if cycle<cycles*2/3 else 1
        if N>1000:
            cycle_start=time.time()
        if pull!=0:
            for body in bodies:
                x,y=body.position
                mod=body.fixtures[0].density * pull * pull_mod
                body.ApplyForceToCenter((-x*mod,-y*mod), True)
        world.Step(time_step*cycle_mod, velocity_iterations, position_iterations)
        if N>1000 and not silence:
            print(f' Cycle took {time.time()-cycle_start} seconds')

    if N>100 and not silence:
        print(f' Layout took {time.time()-start} seconds')

    # read final positions back; bodies were created in the same order as radii,
    # so match them by position rather than by label
    circles=[]
    for pos, body in enumerate(bodies):
        x,y=body.position
        circles.append(Circle(x, y, radii.iloc[pos], radii.index[pos], parent))
    xmin, xmax, ymin, ymax=GetLimits(circles)

    # get center to off set grouping
    cx=(xmax+xmin)/2
    cy=(ymax+ymin)/2

    xoff, yoff= 0, 0
    if parent is not None:
        xoff, yoff=parent.x, parent.y
    for c in circles:
        c.x=c.x-cx+xoff
        c.y=c.y-cy+yoff

    return circles, metadata
|
|
170
|
+
|
|
171
|
+
def GetLimits(circles, pad=0):
    '''
    Bounding box of a non-empty list of circles.

    circles: objects with x, y and r attributes
    pad: extra margin added on every side

    Returns (xmin, xmax, ymin, ymax).
    '''
    first = circles[0]
    # the first centre seeds every bound, then each circle's extent is folded in
    right = max([first.x] + [c.x + c.r for c in circles])
    left = min([first.x] + [c.x - c.r for c in circles])
    top = max([first.y] + [c.y + c.r for c in circles])
    bottom = min([first.y] + [c.y - c.r for c in circles])
    return left - pad, right + pad, bottom - pad, top + pad
|
|
179
|
+
|
|
180
|
+
if __name__=='__main__':

    # test code: lay out random circles and show them
    n=10_000
    radii=GetRadii(pd.Series([random.random()*100+1 for _ in range(n)]))
    # MakeLayout returns (circles, metadata); only the circles are needed here
    circles, _=MakeLayout(radii)
    xmin, xmax, ymin, ymax=GetLimits(circles)

    f, ax=plt.subplots(figsize=(10,10))
    ax.axis('off')

    ax.set_xlim(xmin, xmax)
    ax.set_ylim(ymin, ymax)
    for c in circles:
        ax.add_patch(plt.Circle((c.x, c.y), c.r, alpha=.2, linewidth=1, fill=False))
    plt.show()
|
|
196
|
+
|
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
|
|
2
|
+
import math
|
|
3
|
+
from multiprocessing import Pool
|
|
4
|
+
import os
|
|
5
|
+
from types import SimpleNamespace
|
|
6
|
+
from .circle_collision import Circle, GetRadii, MakeLayout, GetLimits
|
|
7
|
+
import pandas as pd
|
|
8
|
+
import yaml
|
|
9
|
+
|
|
10
|
+
def ReadConfig(config_file):
    '''
    Load a YAML config file into a SimpleNamespace (one attribute per top-level key).

    An empty file gives an empty namespace.
    '''
    # context manager so the file handle is closed after parsing
    with open(config_file) as f:
        config_dict=yaml.safe_load(f)
    # safe_load returns None for an empty document
    config=SimpleNamespace(**(config_dict or {}))
    return config
|
|
14
|
+
|
|
15
|
+
def CommentedConfigString(config:SimpleNamespace):
    '''
    Dump a config namespace as YAML with '#' placed in front of every line.
    '''
    dumped = yaml.safe_dump(vars(config))
    # prefix the first line, then every piece that follows a newline
    return '#' + '\n#'.join(dumped.split('\n'))
|
|
19
|
+
|
|
20
|
+
def ConfigFromComments(text):
    '''
    Rebuild a config namespace from '#'-prefixed YAML text (inverse of CommentedConfigString).

    text: string whose lines each start with '#'
    Text that holds no YAML content gives an empty namespace.
    '''
    # drop the leading '#' and the '#' that follows every newline
    cstr=text[1:].replace('\n#','\n')
    # safe_load returns None when there is nothing to parse
    config=SimpleNamespace(**(yaml.safe_load(cstr) or {}))
    return config
|
|
24
|
+
|
|
25
|
+
def process_child_clusters(args):
    '''Lay out one group of child clusters; takes a single tuple so it can be mapped over a Pool

    The tuple carries the MakeLayout inputs, with child_clusters (labels selected from radii) in front.
    radius_pad is part of the tuple but the layout is always run with a padding of 0.
    '''
    (child_clusters, radii, radius_pad, space_scale, seed, seed_pos, cycles, parent,
     pull, expand_mult, velocity_iterations, position_iterations, time_step, metadata) = args
    child_radii = radii.loc[child_clusters]
    layout_kwargs = dict(
        radius_pad=0,
        space_mult=space_scale*.8,
        seed=seed,
        seed_coords=seed_pos,
        time_step=time_step,
        cycles=cycles,
        parent=parent,
        pull=pull,
        expand_mult=expand_mult,
        velocity_iterations=velocity_iterations,
        position_iterations=position_iterations,
        metadata=metadata,
    )
    return MakeLayout(child_radii, **layout_kwargs)
|
|
33
|
+
|
|
34
|
+
def CenterOfMass(circles): # not used
    '''Return the unweighted mean (x, y) of the circle centres.'''
    sum_x, sum_y = 0, 0
    for circle in circles:
        sum_x += circle.x
        sum_y += circle.y
    count = len(circles)
    return sum_x / count, sum_y / count
|
|
42
|
+
|
|
43
|
+
def Box2DLayout(clusters, levels, minsize=0, space_scale=1.1, pull=1, radius_pad=3, force_inclusion=None, velocity_iterations=16, position_iterations=6,
                cycle_scale=1000, max_cycles=100_000, time_step=1, pull_base=False, seed=None, seed_coords=None, expand_mult=1):

    '''
    Function to turn a table of cluster definitions into a hierarchical layout of circles

    Args:
        General:
            clusters (pd.DataFrame): table of cluster definitions, with columns 'id' for each sequence one column per "level" of clustering
            levels (list): list of column names in clusters to use as levels of clustering. Must be in ascending order (lowest cut-off first) of hierarchy
            minsize (int): minimum size of a cluster to be included in the layout
            force_inclusion (list): list of sequence ids to force inclusion in the layout
        Tuning layout:
            pull (float): multiplier for the pull between circles when packing
            radius_pad (float): padding between circles in the base (lowest) level
            seed (int): random seed for layout. Use same seed to reproduce layout, assuming same input data and cycles of simulation
            seed_coords (pd.DataFrame): table of seed coordinates for each cluster in the base level. Must have columns 'id', 'x', 'y'
            pull_base (bool): whether to allow circles in the base level to pull each other. Still respects radius_pad, just makes the base level more compact.
        Niche:
            space_scale (float): multiplier for the space between circles on initial randomized layout.
            expand_mult (float): multiplier for the expansion of circles. Somewhat unpredictable, can increase speed if too many circles start overlapping, but quite situational.
        Box2D simulation:
            velocity_iterations (int): number of velocity iterations in the Box2D simulation
            position_iterations (int): number of position iterations in the Box2D simulation
            cycle_scale (int): multiplier for the number of cycles in the Box2D simulation per 50 child clusters
            max_cycles (int): maximum number of cycles in the Box2D simulation
            time_step (int): time step in the Box2D simulation

    Returns:
        dict: dictionary of circles at each level of clustering. Each level is a dictionary of circles, with the id of the circle as the key.
              Each circle object contains its xy position, radius, and size (number of sequences in the cluster)
              A child level for which no cluster was laid out has no entry.
    '''

    ### Process child levels in parallel

    size_dicts={}
    if len(levels)>1:
        process_args = []
        for lv, level_id in enumerate(levels[1:], start=1):
            df=clusters.loc[:,['id', level_id]]
            size=df.groupby(level_id).size().rename('size')#.reset_index()
            size_dicts[level_id]=dict(size)
            # boolean mask over the clusters of this level that hold a forced sequence
            forced=size.index.isin(df[df['id'].isin(force_inclusion)][level_id].unique()) if force_inclusion is not None else None

            radii=GetRadii(size, minsize, 1, forced)
            print(f'Level {level_id}, clusters: {len(radii)}')
            seed_pos=None
            if seed_coords is not None:
                # starting point of a cluster = mean seed coordinate of its sequences
                seed_pos=seed_coords.merge(df, on='id')
                seed_pos=seed_pos[seed_pos[level_id].isin(radii.index)]
                seed_pos=seed_pos.groupby(level_id)[['x','y']].mean()

            parent_level=levels[lv-1]
            subset=clusters[clusters[level_id].isin(radii.index)]

            print('preparing child clusters')
            # one layout job per parent cluster, covering the child clusters found in it
            for cluster, group in subset.groupby(parent_level):
                child_clusters = group[level_id].unique()
                cycles=min( math.ceil(max(len(child_clusters)/50, 1) * cycle_scale), max_cycles )
                process_args.append((child_clusters, radii, radius_pad, space_scale, seed, seed_pos,
                                     cycles, None, pull, expand_mult,
                                     velocity_iterations, position_iterations, time_step, {'plevel':parent_level, 'parent':cluster,'level':level_id}))

        print('starting pool')
        with Pool() as pool:
            results = list(pool.imap_unordered(process_child_clusters, process_args))
    else:
        results=[]

    ### Process base level to avoid collisions
    level_id=levels[0]
    df=clusters.loc[:,['id', level_id]]
    size=df.groupby(level_id).size().rename('size')#.reset_index()
    size_dicts[level_id]=dict(size)
    forced=size.index.isin(df[df['id'].isin(force_inclusion)][level_id].unique()) if force_inclusion is not None else None

    radii=GetRadii(size, minsize, 1, forced)
    radius_override={}
    if len(levels)>1:
        child_level=levels[1]
        children={}
        for circles, meta in results:
            if meta['level']==child_level:
                children[meta['parent']]=circles

        for cluster, radius in radii.items():
            if cluster not in children:
                continue
            child_circles=children[cluster]
            # centre of the children's bounding box
            x0, x1, y0, y1=GetLimits(child_circles)
            cx=(x0+x1)/2
            cy=(y0+y1)/2
            max_dist=0
            # distance to each child circle + radius
            for c in child_circles:
                dist=((c.x-cx)**2+(c.y-cy)**2)**0.5 + c.r
                if dist>max_dist:
                    max_dist=dist
            if max_dist>radius:
                # lay the parent out with the larger radius; the original radius is
                # remembered and written back onto the circle after the layout
                radii.loc[cluster]=max_dist
                radius_override[cluster]=radius

    print(f'Level {level_id}, clusters: {len(radii)}')
    seed_pos=None
    if seed_coords is not None:
        seed_pos=seed_coords.merge(df, on='id')
        seed_pos=seed_pos[seed_pos[level_id].isin(radii.index)]
        seed_pos=seed_pos.groupby(level_id)[['x','y']].mean()
    base_circles, _ =MakeLayout(radii, radius_pad=radius_pad, space_mult=space_scale, seed=seed,
                                seed_coords=seed_pos, pull=pull if pull_base else 0, expand_mult=expand_mult,
                                velocity_iterations=velocity_iterations, position_iterations=position_iterations, time_step=time_step)

    for c in base_circles:
        if c.id in radius_override:
            c.r=radius_override[c.id]
        c.size=size_dicts[level_id][c.id]
    # collapse all data
    data={}
    data[level_id]=dict((c.id, c) for c in base_circles)
    grouped={}
    for circles, meta in results:
        for c in circles:
            c.size=size_dicts[meta['level']][c.id]
        # grouped: level -> parent cluster -> its child circles; data: level -> circle id -> circle
        grouped.setdefault(meta['level'], {})[meta['parent']]={'plevel':meta['plevel'], 'data':circles}
        data.setdefault(meta['level'], {}).update((c.id, c) for c in circles)
    # for each level, recenter the circles into the parent circle
    # (levels are walked top-down, so a parent has already been moved when its children are placed)
    for level_id in levels[1:]:
        # a level without any laid-out cluster has no entry in grouped
        for parent, info in grouped.get(level_id, {}).items():
            parent_circle=data[info['plevel']][parent]
            child_circles=info['data']
            x0, x1, y0, y1=GetLimits(child_circles)
            cx=(x0+x1)/2
            cy=(y0+y1)/2

            px, py=parent_circle.x, parent_circle.y
            for c in child_circles:
                c.x=c.x-cx+px
                c.y=c.y-cy+py
    return data
|
|
190
|
+
|
|
191
|
+
def SaveLayout(data, outfile, config:SimpleNamespace=None):
    '''
    Write a layout to a CSV file with columns level, cluster, x, y, r.

    data: dict of level -> dict of circles (each circle provides id, x, y, r)
    outfile: path of the file to write (overwritten)
    config: optional config namespace; when given it is written first as '#'-commented YAML
    '''
    rows=[[lv, c.id, c.x, c.y, c.r] for lv, c_dict in data.items() for c in c_dict.values()]
    tb=pd.DataFrame(rows, columns=['level','cluster','x','y','r'])
    # header and table go through the same handle so the header is always first in the file;
    # newline='' leaves line endings to the csv writer
    with open(outfile, 'w', newline='') as f:
        if config is not None:
            print(CommentedConfigString(config), file=f)
        tb.to_csv(f, index=False)
|
|
202
|
+
|
|
203
|
+
def ReadLayout(file, config_check:SimpleNamespace=None):
    '''
    Load a layout written by SaveLayout.

    file: path of the layout file
    config_check: optional config namespace; when given, the layout is only re-used if the
                  commented config stored at the top of the file equals it

    Returns a dict of level -> dict of cluster -> Circle (x, y, r and id only), or None when
    the file does not exist, has no stored config to compare, or the stored config differs.
    '''
    if not os.path.exists(file):
        return None

    if config_check is not None:
        # collect the leading '#' lines, which hold the saved config
        header_lines=[]
        with open(file) as f:
            for line in f:
                if not line.startswith('#'):
                    break
                header_lines.append(line)
        saved_cstr=''.join(header_lines)
        # no stored config (file was saved without one): it cannot be verified, so do not re-use
        if not saved_cstr.replace('#','').strip():
            return None
        saved_config=ConfigFromComments(saved_cstr)
        if saved_config!=config_check:
            return None

    # re-use old layout
    tb=pd.read_csv(file, comment='#')
    data={}
    for lv, group in tb.groupby('level'):
        data[lv]={row.cluster: Circle(row.x,row.y,row.r,row.cluster) for row in group.itertuples()}
    return data
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import matplotlib.pyplot as plt
|
|
2
|
+
|
|
3
|
+
from layout.circle_collision import GetLimits
|
|
4
|
+
|
|
5
|
+
def DrawCircles(layout, out_label):
    '''Debug function to draw a layout of circles to a file

    layout: dict of level -> dict of circles (each circle provides x, y, r)
    out_label: prefix of the output file; the image is saved as '<out_label>_layout.png'
    '''
    all_c=[]
    for c_dict in layout.values():
        all_c+=list(c_dict.values())
    xmin, xmax, ymin, ymax=GetLimits(all_c, 5)

    f, ax=plt.subplots(figsize=(10,10))
    ax.axis('off')
    ax.set_xlim(xmin, xmax)
    ax.set_ylim(ymin, ymax)
    ax.set_aspect('equal', adjustable='box')
    for position, (lv, c_dict) in enumerate(layout.items()):
        # numeric level keys set the opacity directly; other keys (e.g. column names)
        # fall back to the position of the level in the layout
        try:
            lv_frac=float(lv)/len(layout)
        except (TypeError, ValueError):
            lv_frac=position/len(layout)
        # matplotlib only accepts alpha within 0-1
        alpha=min(1.0, max(0.0, .2+lv_frac*.8))
        for c in c_dict.values():
            # draw circle
            circle=plt.Circle((c.x, c.y), c.r, color='b', lw=.5, alpha=alpha, fill=False)
            ax.add_patch(circle)
    f.savefig(f'{out_label}_layout.png', dpi=300, bbox_inches='tight')
|