qumin 3.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
qumin/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "3.1.0"
@@ -0,0 +1,85 @@
1
+ #!usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """author: Sacha Beniamine.
4
+
5
+ Compute conditional entropies in inflectional patterns.
6
+ """
7
+
8
+ import logging
9
+
10
+ from hydra.core.hydra_config import HydraConfig
11
+
12
+ from .utils import adjust_cpus
13
+ from .entropy.distribution import PatternDistribution
14
+ from .representations import segments, create_features
15
+
16
+ log = logging.getLogger()
17
+
18
+
19
def H_command(cfg, md, patterns_md):
    r"""Compute entropies of flexional paradigms' distributions.

    Unary entropies are computed when ``1`` is among the requested numbers of
    predictors; every other requested number goes through the n-ary
    computation. Results are exported to ``entropies.csv`` and the file is
    registered in the metadata of this run.

    Arguments:
        cfg (omegaconf.dictconfig.DictConfig): Configuration for this run.
        md (qumin.utils.Metadata): Metadata handler for this run.
        patterns_md (qumin.utils.Metadata): Metadata handler for the patterns run.
    """

    verbose = HydraConfig.get().verbose is not False
    sounds_file_name = md.get_table_path("sounds")

    # cfg.entropy.n is either a single int or a collection of ints.
    n_preds = cfg.entropy.n
    preds = [n_preds] if isinstance(n_preds, int) else sorted(n_preds)
    # After sorting, a request for unary entropies can only be in first position.
    onePred = preds[0] == 1
    if onePred:
        preds.pop(0)

    # Initialize segment inventory for phonological computations
    segments.Inventory.initialize(sounds_file_name)

    # Inflectional paradigms: rows are forms, with lexeme and cell..
    paradigms = patterns_md.get_paradigms(md, segcheck=True)
    # Patterns: built on the paradigms
    patterns = patterns_md.get_patterns(paradigms)

    if verbose and len(patterns.cells) > 10:
        log.warning("Using verbose mode is strongly "
                    "discouraged on large (>10 cells) datasets. "
                    "You should probably stop this process now.")

    if cfg.entropy.features is not None:
        features = create_features(md, cfg.entropy.features)
    else:
        features = None

    num_cpus = adjust_cpus(cfg.cpus)
    patterns.find_applicable(cpus=num_cpus)
    patterns.info()

    distrib = PatternDistribution(patterns,
                                  md.paralex,
                                  features=features)

    if onePred:
        if verbose:
            # Verbose runs make an extra pass with debug enabled
            # before the regular computation.
            distrib.one_pred_entropy(debug=verbose)
        distrib.one_pred_entropy()
        mean = distrib.get_mean()
        mean.name = "H(c1 -> c2)"
        log.info(mean.to_markdown())

    if preds:
        if cfg.entropy.importResults:
            distrib.import_file(cfg.entropy.importResults)
        for n in preds:
            if verbose:
                distrib.n_preds_entropy(n, paradigms, debug=verbose)
            distrib.n_preds_entropy(n, paradigms)
            mean = distrib.get_mean(n=n)
            # The number of predictors must be interpolated in the label.
            mean.name = "H(c1, ..., c{n} -> c)".format(n=n)
            log.info(mean.to_markdown())

    ent_file = md.get_path('entropies.csv')
    log.info("Writing to: %s", ent_file)
    distrib.export_file(ent_file)
    # Only the means of the last computation performed above are registered.
    md.register_file('entropies.csv', description="Entropy computation results",
                     custom={"mean_measures": mean.to_dict()})
qumin/cli.py ADDED
@@ -0,0 +1,57 @@
1
+ import logging
2
+ import hydra
3
+
4
+ from .calc_paradigm_entropy import H_command
5
+ from .find_patterns import pat_command
6
+ from .find_macroclasses import macroclasses_command
7
+ from .make_lattice import lattice_command
8
+ from .microclass_heatmap import heatmap_command
9
+ from .entropy_heatmap import ent_heatmap_command
10
+ from .utils.metadata import Metadata
11
+
12
+ log = logging.getLogger()
13
+
14
+
15
@hydra.main(version_base=None, config_path="config", config_name="qumin")
def qumin_command(cfg):
    """Entry point: run the action requested in the configuration.

    Patterns are computed first when no existing patterns run is given or
    when the action is "patterns" (never for "ent_heatmap"). The requested
    action is then dispatched, and the metadata of the run is saved.

    Arguments:
        cfg: Configuration for this run, as provided by hydra.
    """
    log.info(cfg)
    md = Metadata(cfg=cfg)
    action = cfg.action

    # Step 1: compute patterns if they are needed and not provided.
    needs_patterns = cfg.patterns is None or action == "patterns"
    if needs_patterns and action != 'ent_heatmap':
        check_pat_config(action, md)
        pat_command(cfg, md)

    # Step 2: actions which rely on a patterns computation.
    pattern_based_commands = {
        'H': H_command,
        'macroclasses': macroclasses_command,
        'lattice': lattice_command,
        'heatmap': heatmap_command,
    }
    if action in pattern_based_commands:
        # Fall back on the current run when no previous patterns run is given.
        patterns_md = Metadata(path=cfg.patterns) if cfg.patterns else md
        check_pat_config(action, patterns_md)
        pattern_based_commands[action](cfg, md, patterns_md)

    # Step 3: entropy heatmap, either standalone or as a follow-up to "H".
    if action == 'ent_heatmap' or (action == "H" and cfg.entropy.vis):
        ent_heatmap_command(cfg, md)

    md.save_metadata()
42
+
43
+
44
def check_pat_config(action, patterns_md):
    """
    Checks that the patterns are appropriate for this action.

    The checks are raised explicitly rather than written as ``assert``
    statements, so that they are still enforced when Python runs with ``-O``.

    Arguments:
        action (str): Action for this run.
        patterns_md: Metadata of the patterns computation.
            Its ``cfg.pats`` configuration section is read.

    Raises:
        AssertionError: if overabundance was kept and the action is
            "H" or "macroclasses", or if defective entries were kept and
            the action is "macroclasses".
    """
    pats_cfg = patterns_md.cfg.pats
    keeps_overabundant = pats_cfg.overabundant.keep
    keeps_defective = pats_cfg.defective

    if keeps_overabundant and action in ("H", "macroclasses"):
        raise AssertionError("For this calculation, overabundant.keep must be False")
    if keeps_defective and action == "macroclasses":
        raise AssertionError("For this calculation, defective must be False")
@@ -0,0 +1,62 @@
1
+ # -*- coding: utf-8 -*-
2
+ # !/usr/bin/env python3
3
+
4
+ import numpy as np
5
+ import pandas as pd
6
+
7
+
8
def find_microclasses(paradigms, patterns, freqs=None):
    """Find microclasses in a paradigm (lexemes with identical patterns).

    This is useful to identify an exemplar of each inflection microclass,
    and limit further computation to the collection of these exemplars.

    Arguments:
        paradigms: paradigms object; the unique values of the ``lexeme``
            column of its ``data`` attribute are the lexemes to classify.
        patterns (dict): maps each pair of cells to a
            :class:`pandas.DataFrame` with (at least) ``lexeme`` and
            ``pattern`` columns.
        freqs (pandas.Series): a series of frequencies indexed by lexeme.
            If given, the exemplar of a microclass is its most frequent
            member. Otherwise, it is the member with the shortest name.

    Return:
        microclasses (dict).
            Its keys are exemplars,
            its values are lists of the lexemes which behave like the exemplar.
            Each exemplar represents a microclass. ::

                {"a":["a","A","aa"], "b":["b","B","BBB"]}

    """
    lexemes = pd.Series(index=paradigms.data.lexeme.unique())

    # One grouping key per pair of cells: for each lexeme,
    # the sorted tuple of its patterns (as strings) for this pair.
    signatures = [
        df.groupby('lexeme', observed=False).pattern.apply(
            lambda x: tuple(sorted(str(p) for p in x if p is not None)))
        for df in patterns.values()]

    # Lexemes with the same signature for all pairs form a microclass.
    grouped = lexemes.groupby(signatures)
    mc = {}

    for _, group in grouped:
        members = list(group.index)
        if freqs is not None:
            freq_subset = freqs[group.index]
            exemplar = freq_subset.index[freq_subset.argmax()]
        else:
            exemplar = min(members, key=len)
        mc[exemplar] = members

    return mc
45
+
46
+
47
def find_min_attribute(tree, attr):
    """Find the minimum value for an attribute in a tree.

    Values are converted with :func:`float` before being compared, and the
    converted value is what gets stored and returned. This keeps comparisons
    valid when attributes are stored as strings.

    Arguments:
        tree (node.Node): The tree in which to find the minimum attribute.
        attr (str): the attribute's key.

    Returns:
        float: the smallest value found, or ``np.inf`` if no node
        of the tree has this attribute.
    """
    agenda = [tree]
    mini = np.inf
    while agenda:
        # Visiting order is irrelevant for a minimum: pop from the end.
        node = agenda.pop()
        if node.children:
            agenda.extend(node.children)
        if attr in node.attributes:
            value = float(node.attributes[attr])
            if value < mini:
                mini = value

    return mini
@@ -0,0 +1,79 @@
1
+ # !usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """Algorithms for inflection classes clustering.
4
+
5
+ Author: Sacha Beniamine
6
+ """
7
+ import numpy as np
8
+ from . import find_microclasses
9
+ import logging
10
+
11
+ log = logging.getLogger()
12
+
13
+
14
def choose(iterable):
    """Choose a random element in a sequence.

    Arguments:
        iterable: a non-empty sequence (it must support ``len`` and indexing).

    Returns:
        one element of the sequence, drawn with ``np.random.choice``.
    """
    i = np.random.choice(len(iterable), 1)
    # i is an array with a single element: extract it explicitly,
    # since converting a non 0-d array with int() is deprecated in numpy.
    return iterable[int(i[0])]
18
+
19
+
20
def log_classes(classes, md, suffix):
    """Write classes to a text file and register this file in the metadata.

    Classes are written from the smallest to the largest; each entry gives
    the class label, its number of members, and the members themselves.

    Arguments:
        classes (dict): maps class labels to collections of member names (str).
        md: Metadata handler; provides the path of the file and registers it.
        suffix (str): name of the output file, without the ".txt" extension.
    """
    basename = suffix + ".txt"
    filename = md.get_path(basename)
    log.info("Found %s %s", len(classes), suffix)
    log.info("Printing log to %s", filename)

    with open(filename, "w", encoding="utf-8") as flow:
        by_size = sorted(classes, key=lambda label: len(classes[label]))
        for label in by_size:
            members = classes[label]
            header = "\n\n{} ({}) \n\t".format(label, len(members))
            flow.write(header + ", ".join(members))

    md.register_file(basename, description="Log of the macroclass computation")
31
+
32
+
33
def hierarchical_clustering(patterns, paradigms, Clusters, **kwargs):
    """Perform hierarchical clustering on patterns, using a given cluster builder.

    This function ::
        Finds microclasses.
        Performs the clustering,
        Finds the macroclasses (and exports them),
        Returns the inflection class tree.

    The clustering algorithm is the following::

        Begin with one cluster per microclass.
        While there is more than one cluster :
            Find the best possible merges of two clusters, among all possible pairs.
            Pick one of them at random and perform this merge.

    Scoring, finding the best merges, merging nodes depends on the Clusters class.

    Arguments:
        patterns (patterns.ParadigmPatterns): alternation patterns
        paradigms (paradigms.Paradigms): paradigms of forms
        Clusters : a cluster class to use in clustering.
        kwargs: any keywords arguments to pass to Clusters.
            "md" is mandatory whenever macroclasses are found:
            it should be the Metadata register.

    Returns:
        the root node of the inflection class tree.
    """
    microclasses = find_microclasses(paradigms, patterns)
    clusters = Clusters(microclasses, patterns, **kwargs)

    # Merge until a single cluster remains.
    while len(clusters.nodes) > 1:
        log.info("number of classes = %s", len(clusters.nodes))
        candidates = clusters.find_ordered_merges()
        left, right, _ = choose(candidates)
        clusters.merge(left, right)

    root = clusters.rootnode()

    # Export macroclasses
    macroclasses = root.macroclasses()
    if not macroclasses:
        log.warning("No macroclasses could be found "
                    " this is not necessarily a bug, but it is surprising !")
    else:
        log_classes(macroclasses, kwargs['md'], "macroclasses")

    return root
@@ -0,0 +1,359 @@
1
+ # !usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """Classes to make clustering decisions and build inflection class trees according to description length.
4
+
5
+ Author: Sacha Beniamine
6
+ """
7
+ import logging
8
+ from collections import defaultdict, Counter
9
+ from itertools import combinations
10
+
11
+ import numpy as np
12
+ from tqdm import tqdm
13
+
14
+ from .node import Node
15
+
16
+ log = logging.getLogger()
17
+
18
+
19
class Cluster(object):
    """A single cluster in MDL clustering.

    A Cluster is iterable. Itering on a cluster is itering on the keys
    (pairs of cells) of its patterns.
    Clusters can be merged or separated by adding or substracting them.

    Attributes:
        patterns (:class:`collections.defaultdict`): For each pair of cell in the paradigms under consideration,
            it holds a counter of the number of microclass using each pattern in this cluster and pair of cells.::

                { str: Counter({Pattern: int }) }
                pairs of cells -> pattern -> number of microclasses using this pattern for this cell

            Note that the Counter's length is written on a .length attribute, to avoid calling len() repeatedly.

        labels (set): the set of all exemplars representing the microclasses in this cluster.
        size (int): The size of this cluster. Depending on external parameters,
            this can be the number of microclasses or the number of lexemes belonging to the cluster.
        totalsize (int): The size of the whole system of clusters, either number of microclasses in the system, or number of lexemes in the system.
        R : The cost in bits to disambiguate for each pair of cells which pattern is to be used with which microclass.
        C : The contribution of this cluster to the cost of mapping from microclasses to clusters.
    """

    def __init__(self, *args):
        """Initialize single cluster.

        Arguments:
            args (str): Names (exemplar) of each microclass belonging to the cluster.
        """
        # pair of cells : Counter(patterns)
        self.patterns = defaultdict(Counter)
        self.labels = set(args)
        self.size = self.R = self.C = self.totalsize = 0

    def init_from_paradigm(self, class_size, patterns, size):
        """Populate fields according to the patterns of the exemplar.

        This assumes an initialization with only one microclass.

        Arguments:
            class_size (int): the size of the microclass
            patterns (patterns.ParadigmPatterns): patterns
            size (int): total size
        """
        self.size = class_size
        self.totalsize = size
        self.R = 0
        self.C = weighted_log(self.size, self.totalsize)
        exemplar = next(iter(self.labels))

        for pair in patterns:
            df = patterns[pair]
            counts = self[pair]
            # might be multiple patterns if overabundance
            for pattern in df.loc[df.lexeme == exemplar, "pattern"]:
                counts[pattern] = self.size
            # Keep the invariant .length == len(counter),
            # whichever the number of patterns found for the exemplar.
            counts.length = len(counts)
            self.R += sum(weighted_log(counts[p], self.size) for p in counts)

    def __copy(self):
        """Return a new Cluster with the same content and total size."""
        new = Cluster()
        new.totalsize = self.totalsize
        new += self
        return new

    def __update_attributes(self, other, update_action):
        """Recompute size, C, R and patterns after combining with another cluster.

        Arguments:
            other (Cluster): the cluster to combine with this one.
            update_action (Callable): binary operation (addition or subtraction),
                applied both to sizes and to counters of patterns.
        """
        self.size = update_action(self.size, other.size)
        self.C = weighted_log(self.size, self.totalsize)
        self.R = 0
        for cell in set(self).union(other):
            self[cell] = update_action(self[cell], other[cell])
            self[cell].length = len(self[cell])
            self.R += sum(weighted_log(self[cell][p], self.size) for p in self[cell])

    def __str__(self):
        template = "<Cluster {} size={}; C={}; R={}; Pattern={}>"
        return template.format(self.labels, self.size, self.C, self.R, self.patterns)

    def __iter__(self):
        return iter(self.patterns)

    def __getitem__(self, key):
        return self.patterns[key]

    def __setitem__(self, key, item):
        self.patterns[key] = item

    def __radd__(self, other):
        # Adding to 0 is supported, which is what the builtin sum() starts from.
        if other == 0:
            return self + Cluster()
        else:
            return self + other

    def __add__(self, other):
        new = self.__copy()
        new += other
        return new

    def __sub__(self, other):
        new = self.__copy()
        new -= other
        return new

    def __iadd__(self, other):
        self.labels = self.labels | other.labels
        self.__update_attributes(other, lambda a, b: a + b)
        return self

    def __isub__(self, other):
        self.labels = self.labels - other.labels
        self.__update_attributes(other, lambda a, b: a - b)
        return self
129
+
130
+
131
class BUDLClustersBuilder(object):
    """Builder for hierarchical clusters of inflection classes with description length based decisions.

    This class holds two representations of the clusters it builds. On one hand, the class
    Cluster represents the informations needed to compute the description length of a cluster.
    On the other hand, the class Node represents the inflection classes being built.
    A Node can have children and a parent, a Cluster can be splitted or merged.

    Attributes:
        microclasses (dict[str, list]): mapping of microclasses exemplars to microclasses inventories.
        nodes (dict[frozenset, Node]): Maps frozensets of microclass exemplars to Nodes representing clusters.
        preferences (dict): Configuration parameters.
        attr (str): (class attribute) always have the value "DL", as the nodes of the Inflection class tree have a "DL" attribute.
        DL (float): A description length DL, with DL(system) = DL(M) + DL(C) + DL(P) + DL(R)
        M (float): DL(M), the cost in bits to express the mapping between lexemes and microclasses.
        C (float): DL(C), the cost in bits to express the mapping between microclasses and clusters.
        P (float): DL(P), the cost in bits to express the relation between clusters and patterns.
        R (float): DL(R), the cost in bits to disambiguiate which pattern to use in each cluster for each microclasses.
        clusters (dict[frozenset, Cluster]): Clusters, indexed by a frozenset of microclass examplars.
        patterns (dict[str, collections.Counter]): A dict of pairs of cells to a count of patterns
            to the number of clusters presenting this pattern for this cell.::

                { str: Counter({Pattern: int }) }
                pairs of cells -> pattern -> number of clusters with this pattern for this cell

            Note that the Counter's length is written on a .length attribute, to avoid calling len() repeatedly.
            Remark that the count is not the same as in the class Cluster.
        size (int): The size of the whole system in microclasses.
    """

    attr = "DL"

    def __init__(self, microclasses, patterns, **kwargs):
        """Constructor.

        Arguments:
            microclasses (dict[str, list]): mapping of microclasses exemplars to microclasses inventories.
            patterns (patterns.ParadigmPatterns): patterns
            kwargs : keyword arguments to be used as configuration.
        """
        self.preferences = kwargs
        self.microclasses = microclasses
        # Initially, one leaf node per microclass.
        self.nodes = {
            frozenset([m]): Node([m], size=len(self.microclasses[m]), macroclass=False)
            for m in self.microclasses}

        self.P = self.M = self.C = self.R = self.DL = 0
        self.initialize_clusters(patterns)
        self.initialize_patterns()
        self.compute_DL(M=True)
        current_partition = " - ".join(", ".join(c) for c in self.clusters)
        log.debug("\t".join(["Partition", "M", "C", "P", "R", "DL"]))
        log.debug(" ".join([current_partition, ":\t", "\t".join(
            (str(self.M), str(self.C), str(self.P), str(self.R), str(self.DL)))]))

    def initialize_clusters(self, patterns):
        """Create one Cluster per microclass, each of them with size 1.

        Arguments:
            patterns (patterns.ParadigmPatterns): patterns
        """
        self.clusters = {}
        classes_size = {m: 1 for m in self.microclasses}
        self.size = sum(classes_size.values())

        for microclass in self.microclasses:
            cluster = Cluster(microclass)
            cluster.init_from_paradigm(classes_size[microclass], patterns, self.size)
            self.clusters[frozenset([microclass])] = cluster

    def initialize_patterns(self):
        """Count, for each pair of cells, the number of clusters using each pattern."""
        self.patterns = defaultdict(Counter)
        # The pairs of cells are read from the first cluster.
        for cell in next(iter(self.clusters.values())):
            for label in self.nodes:
                # Counter(list(...)) counts each pattern of the cluster once,
                # whichever its count inside the cluster.
                self.patterns[cell] += Counter(list(self.clusters[label][cell]))
            self.patterns[cell].length = sum(self.patterns[cell].values())

    def compute_DL(self, M=False):
        """Compute the description length of the current system from scratch.

        P, C and R are reset before being computed, so that calling
        this method several times does not accumulate previous values.

        Arguments:
            M (bool): whether to (re)compute DL(M). If False, the current value is kept.
        """
        values = [len(self.microclasses[m]) for m in self.microclasses]
        if M:
            total = sum(values)
            self.M = sum(weighted_log(val, total) for val in values)

        self.size = len(self.microclasses)
        self.P = self.C = self.R = 0
        for cell in self.patterns:

            # This is P_p
            for pattern in self.patterns[cell]:
                self.P += weighted_log(self.patterns[cell][pattern],
                                       self.patterns[cell].length)

            # This is P_c
            cluster_patterns = [len(self.clusters[cluster][cell]) for cluster in
                                self.nodes]
            total = sum(cluster_patterns)
            self.P += sum(weighted_log(count, total) for count in cluster_patterns)

        for label in self.nodes:
            self.C += self.clusters[label].C
            self.R += self.clusters[label].R

        self.DL = (self.M + self.C + self.P + self.R)

    def _simulate_merge(self, a, b):
        """Simulate merging two clusters, return parameters for the DL.

        The state of the builder is not updated by this method.

        Parameters:
            a (frozenset): the label of a cluster to merge.
            b (frozenset): the label of a cluster to merge.

        Returns:
            tuple: R, C, P (the costs after the merge), the updated counts
            of patterns for the system, and the merged Cluster.
        """
        g1 = self.clusters[a]
        g2 = self.clusters[b]
        new = g1 + g2
        C = self.C - g1.C - g2.C + new.C
        P = 0
        patterns = defaultdict(Counter)

        for cell in g1:
            # This is P_p
            # The merged cluster replaces both merged clusters in the counts.
            patterns[cell] = self.patterns[cell] + \
                             Counter(list(new[cell])) - \
                             Counter(list(g1[cell])) - \
                             Counter(list(g2[cell]))

            patterns[cell].length = self.patterns[cell].length + new[cell].length - g1[
                cell].length - g2[cell].length

            for pattern in patterns[cell]:
                P += weighted_log(patterns[cell][pattern], patterns[cell].length)

            # This is P_c
            cluster_patterns = [new[cell].length]
            for cluster in self.nodes:
                if cluster not in [a, b]:
                    cluster_patterns.append(self.clusters[cluster][cell].length)
            total = sum(cluster_patterns)
            P += sum(weighted_log(count, total) for count in cluster_patterns)

        R = self.R - g1.R - g2.R + new.R

        return R, C, P, patterns, new

    def merge(self, a, b):
        """Merge two Clusters, build a Node to represent the result, update the DL.

        If the description length does not decrease with this merge,
        the new node is colored "r" and is not flagged as a macroclass.

        Parameters:
            a (frozenset): the label of a cluster to merge.
            b (frozenset): the label of a cluster to merge.
        """
        labels = a | b
        # Merged clusters are kept in self.clusters: only self.nodes
        # reflects the current partition.
        self.R, self.C, self.P, self.patterns, self.clusters[
            labels] = self._simulate_merge(a, b)

        prev_DL = self.DL
        self.DL = (self.R + self.C + self.P + self.M)

        left = self.nodes.pop(a)
        right = self.nodes.pop(b)
        leaves = list(labels)
        size = left.attributes["size"] + right.attributes["size"]
        color = "c"
        if self.DL >= prev_DL:
            log.info("DL stopped improving: prev = %s, current best = %s",
                     prev_DL, self.DL)
            color = "r"

        self.nodes[labels] = Node(leaves, size=size, children=[left, right],
                                  DL=self.DL, color=color, macroclass=color != "r")

        log.debug("Merging %s and %s with DL %s", ", ".join(a), ", ".join(b), self.DL)

        current_partition = " - ".join(
            [", ".join(self.nodes[c].labels) for c in self.nodes])
        log.debug(" ".join(
            [current_partition, ":\t", "\t".join(
                (str(self.M), str(self.C), str(self.P), str(self.R), str(self.DL)))]))

    def find_ordered_merges(self):
        """Find the list of all best merges of two clusters.

        The list is a list of tuples of length 3 containing two frozensets representing the
        labels of the clusters to merge and the description length of the resulting system.
        All merges in the list lead to the same (lowest) description length.
        """
        best_merges = []
        best = np.inf
        pairs = combinations(sorted(self.nodes), 2)
        tot = (len(self.nodes) * (len(self.nodes) - 1)) // 2

        for g1, g2 in tqdm(pairs, leave=False, total=tot):
            R, C, P, *_ = self._simulate_merge(g1, g2)
            DL = self.M + R + C + P
            if DL < best:
                best_merges = [(g1, g2, DL)]
                best = DL
            elif DL == best:
                best_merges.append((g1, g2, DL))

        if len(best_merges) > 1:
            choices = ", ".join(
                ["({}, {})".format("-".join(a), "-".join(b)) for a, b, _ in best_merges])
            log.warning("There were %s equivalent choices: %s",
                        len(best_merges), choices)

        return best_merges

    def rootnode(self):
        """Return the root of the Inflection Class tree, if it exists."""
        assert len(self.nodes) == 1
        return next(iter(self.nodes.values()))
338
+
339
+
340
def weighted_log(symbol_count, message_length):
    r"""Compute :math:`-log_{2}(symbol_count/message_length) * symbol_count`.

    This corresponds to the product inside the sum
    of the description length formula
    when probabilities are estimated on frequencies.

    Arguments:
        symbol_count (int): a count of symbols.
        message_length (int): the size of the message.

    Returns:
        (float): the weighted log. It is 0 if either
        the count of symbols or the size of the message is 0.
    """
    # Check for zeros explicitly rather than catching ZeroDivisionError:
    # numpy integers do not raise on division by zero.
    if symbol_count == 0 or message_length == 0:
        return 0
    return symbol_count * -np.log2(symbol_count / message_length)