qumin 3.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- qumin/__init__.py +1 -0
- qumin/calc_paradigm_entropy.py +85 -0
- qumin/cli.py +57 -0
- qumin/clustering/__init__.py +62 -0
- qumin/clustering/algorithms.py +79 -0
- qumin/clustering/descriptionlength.py +359 -0
- qumin/clustering/node.py +623 -0
- qumin/config/__init__.py +0 -0
- qumin/config/help.yaml +47 -0
- qumin/config/qumin.yaml +95 -0
- qumin/entropy/__init__.py +69 -0
- qumin/entropy/distribution.py +808 -0
- qumin/entropy_heatmap.py +475 -0
- qumin/find_macroclasses.py +80 -0
- qumin/find_patterns.py +56 -0
- qumin/lattice/HighlightSubTrees.js +52 -0
- qumin/lattice/__init__.py +2 -0
- qumin/lattice/lattice.py +415 -0
- qumin/lattice/table.css +33 -0
- qumin/make_lattice.py +86 -0
- qumin/microclass_heatmap.py +107 -0
- qumin/representations/__init__.py +20 -0
- qumin/representations/alignment.py +265 -0
- qumin/representations/contexts.py +331 -0
- qumin/representations/frequencies.py +416 -0
- qumin/representations/generalize.py +88 -0
- qumin/representations/paradigms.py +417 -0
- qumin/representations/patterns.py +1391 -0
- qumin/representations/quantity.py +118 -0
- qumin/representations/segments.py +717 -0
- qumin/utils/__init__.py +37 -0
- qumin/utils/metadata.py +310 -0
- qumin-3.1.0.dist-info/METADATA +710 -0
- qumin-3.1.0.dist-info/RECORD +38 -0
- qumin-3.1.0.dist-info/WHEEL +5 -0
- qumin-3.1.0.dist-info/entry_points.txt +2 -0
- qumin-3.1.0.dist-info/licenses/LICENCE +675 -0
- qumin-3.1.0.dist-info/top_level.txt +1 -0
qumin/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "3.1.0"
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
#!usr/bin/python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""author: Sacha Beniamine.
|
|
4
|
+
|
|
5
|
+
Compute conditional entropies in inflectional patterns.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
|
|
10
|
+
from hydra.core.hydra_config import HydraConfig
|
|
11
|
+
|
|
12
|
+
from .utils import adjust_cpus
|
|
13
|
+
from .entropy.distribution import PatternDistribution
|
|
14
|
+
from .representations import segments, create_features
|
|
15
|
+
|
|
16
|
+
log = logging.getLogger()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def H_command(cfg, md, patterns_md):
    r"""Compute entropies of flexional paradigms' distributions.

    The number(s) of predictors are read from ``cfg.entropy.n`` (an int or
    an iterable of ints). Unary entropies (n == 1) are computed first, then
    each remaining n in increasing order. Results are exported to
    ``entropies.csv`` and registered in the metadata.

    Arguments:
        cfg (omegaconf.dictconfig.DictConfig): Configuration for this run.
        md (qumin.utils.Metadata): Metadata handler for this run.
        patterns_md (qumin.utils.Metadata): Metadata handler for the patterns run.
    """

    verbose = HydraConfig.get().verbose is not False
    sounds_file_name = md.get_table_path("sounds")

    preds = [cfg.entropy.n] if type(cfg.entropy.n) is int else sorted(cfg.entropy.n)
    onePred = preds[0] == 1
    if onePred:
        # n == 1 is handled by a dedicated method, keep only n > 1 in the list.
        preds.pop(0)

    # Initialize segment inventory for phonological computations
    segments.Inventory.initialize(sounds_file_name)

    # Inflectional paradigms: rows are forms, with lexeme and cell.
    paradigms = patterns_md.get_paradigms(md, segcheck=True)
    # Patterns: built on the paradigms
    patterns = patterns_md.get_patterns(paradigms)

    if verbose and len(patterns.cells) > 10:
        # The adjacent literals are concatenated: each one needs its own
        # trailing space, otherwise the sentences run together.
        log.warning("Using verbose mode is strongly "
                    "discouraged on large (>10 cells) datasets. "
                    "You should probably stop this process now.")

    if cfg.entropy.features is not None:
        features = create_features(md, cfg.entropy.features)
    else:
        features = None

    num_cpus = adjust_cpus(cfg.cpus)
    patterns.find_applicable(cpus=num_cpus)
    patterns.info()

    distrib = PatternDistribution(patterns,
                                  md.paralex,
                                  features=features)

    if onePred:
        if verbose:
            # Extra debugging pass; the regular computation still runs below.
            distrib.one_pred_entropy(debug=verbose)
        distrib.one_pred_entropy()
        mean = distrib.get_mean()
        mean.name = "H(c1 -> c2)"
        log.info(mean.to_markdown())

    if preds:
        if cfg.entropy.importResults:
            distrib.import_file(cfg.entropy.importResults)
        for n in preds:
            if verbose:
                # Extra debugging pass; the regular computation still runs below.
                distrib.n_preds_entropy(n, paradigms, debug=verbose)
            distrib.n_preds_entropy(n, paradigms)
            mean = distrib.get_mean(n=n)
            # f-string: the label must show the actual number of predictors.
            mean.name = f"H(c1, ..., c{n} -> c)"
            log.info(mean.to_markdown())

    ent_file = md.get_path('entropies.csv')
    log.info("Writing to: {}".format(ent_file))
    distrib.export_file(ent_file)
    # Only the last computed mean is stored in the metadata.
    md.register_file('entropies.csv', description="Entropy computation results",
                     custom={"mean_measures": mean.to_dict()})
|
qumin/cli.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
import hydra
|
|
3
|
+
|
|
4
|
+
from .calc_paradigm_entropy import H_command
|
|
5
|
+
from .find_patterns import pat_command
|
|
6
|
+
from .find_macroclasses import macroclasses_command
|
|
7
|
+
from .make_lattice import lattice_command
|
|
8
|
+
from .microclass_heatmap import heatmap_command
|
|
9
|
+
from .entropy_heatmap import ent_heatmap_command
|
|
10
|
+
from .utils.metadata import Metadata
|
|
11
|
+
|
|
12
|
+
log = logging.getLogger()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@hydra.main(version_base=None, config_path="config", config_name="qumin")
def qumin_command(cfg):
    """Run the action selected by ``cfg.action`` and save the run metadata.

    Patterns are computed first when no pre-computed patterns are given
    (or when the action is "patterns"), except for "ent_heatmap". The
    pattern-based actions are then dispatched, followed by the optional
    entropy heatmap.

    Arguments:
        cfg (omegaconf.dictconfig.DictConfig): Configuration for this run.
    """
    log.info(cfg)
    md = Metadata(cfg=cfg)
    action = cfg.action

    must_compute_patterns = cfg.patterns is None or action == "patterns"
    if must_compute_patterns and action != 'ent_heatmap':
        check_pat_config(action, md)
        pat_command(cfg, md)

    # Actions which consume patterns, either just computed or imported.
    pattern_actions = {
        'H': H_command,
        'macroclasses': macroclasses_command,
        'lattice': lattice_command,
        'heatmap': heatmap_command,
    }
    command = pattern_actions.get(action)
    if command is not None:
        patterns_md = Metadata(path=cfg.patterns) if cfg.patterns else md
        check_pat_config(action, patterns_md)
        command(cfg, md, patterns_md)

    if action == 'ent_heatmap' or (action == "H" and cfg.entropy.vis):
        ent_heatmap_command(cfg, md)

    md.save_metadata()
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def check_pat_config(action, patterns_md):
    """
    Checks that the patterns are appropriate for this action.

    Arguments:
        action (str): Action for this run (e.g. "H" or "macroclasses").
        patterns_md: Metadata of the patterns computation. Its
            ``cfg.pats.overabundant.keep`` and ``cfg.pats.defective``
            settings are read.

    Raises:
        AssertionError: if the action is "H" or "macroclasses" and
            overabundant forms were kept, or if the action is
            "macroclasses" and defective entries were kept.
    """
    pats = patterns_md.cfg.pats
    # Read both settings up front, whatever the action is.
    keeps_overabundant = bool(pats.overabundant.keep)
    keeps_defective = bool(pats.defective)

    # Raised explicitly rather than through ``assert`` statements, so that the
    # checks are not stripped when Python runs with -O. The exception type is
    # the one callers already get.
    if keeps_overabundant and action in ("H", "macroclasses"):
        raise AssertionError("For this calculation, overabundant.keep must be False")
    if keeps_defective and action == "macroclasses":
        raise AssertionError("For this calculation, defective must be False")
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# !/usr/bin/env python3
|
|
3
|
+
|
|
4
|
+
import numpy as np
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def find_microclasses(paradigms, patterns, freqs=None):
    """Group lexemes which share exactly the same patterns into microclasses.

    Two lexemes belong to the same microclass when, for every table in
    `patterns`, they have the same sorted tuple of (stringified, non-None)
    patterns. One exemplar is elected for each microclass, so that further
    computation can be limited to the exemplars.

    Arguments:
        paradigms: paradigms of forms; ``paradigms.data.lexeme`` provides
            the lexeme identifiers.
        patterns: a mapping whose values are dataframes with
            a ``lexeme`` and a ``pattern`` column.
        freqs (pandas.Series): optional frequencies, indexed by lexeme.
            If given, the most frequent member is the exemplar;
            otherwise the member with the shortest name is.

    Return:
        microclasses (dict).
            Keys are exemplars, values are the lists of the lexemes which
            belong to the exemplar's microclass. ::

                {"a":["a","A","aa"], "b":["b","B","BBB"]}

    """
    def signature(series):
        # Order-insensitive, hashable summary of a lexeme's patterns.
        return tuple(sorted([str(p) for p in series if p is not None]))

    lexemes = pd.Series(index=paradigms.data.lexeme.unique())
    # One grouping key per table of patterns.
    keys = [df.groupby('lexeme', observed=False).pattern.apply(signature)
            for df in patterns.values()]

    mc = {}
    for _, group in lexemes.groupby(keys):
        members = list(group.index)
        if freqs is None:
            exemplar = min(members, key=len)
        else:
            freq_subset = freqs[group.index]
            exemplar = freq_subset.index[freq_subset.argmax()]
        mc[exemplar] = members

    return mc
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def find_min_attribute(tree, attr):
    """Find the minimum value for an attribute in a tree.

    The tree is traversed breadth-first. Values are compared after
    conversion to float, but the value is returned as it is stored
    on the node.

    Arguments:
        tree (node.Node): The tree in which to find the minimum attribute.
        attr (str): the attribute's key.

    Returns:
        The smallest value found for `attr`, or ``numpy.inf`` if no node
        has this attribute.
    """
    from collections import deque  # local: only this function needs it

    agenda = deque([tree])
    mini = np.inf
    # Float view of `mini`. Comparing against the stored value itself would
    # fail as soon as two nodes hold string values.
    mini_as_float = np.inf
    while agenda:
        node = agenda.popleft()
        if node.children:
            agenda.extend(node.children)
        if attr in node.attributes:
            candidate = float(node.attributes[attr])
            if candidate < mini_as_float:
                mini_as_float = candidate
                mini = node.attributes[attr]

    return mini
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
# !usr/bin/python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""Algorithms for inflection classes clustering.
|
|
4
|
+
|
|
5
|
+
Author: Sacha Beniamine
|
|
6
|
+
"""
|
|
7
|
+
import numpy as np
|
|
8
|
+
from . import find_microclasses
|
|
9
|
+
import logging
|
|
10
|
+
|
|
11
|
+
log = logging.getLogger()
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def choose(iterable):
    """Choose a random element in a sequence.

    Arguments:
        iterable: a non-empty sequence supporting ``len`` and integer
            indexing (e.g. a list of candidate merges).

    Returns:
        One element of the sequence, drawn with ``numpy.random.choice``.
    """
    # Draw a scalar index: converting a 1-element array with int() is
    # deprecated in recent NumPy versions.
    i = np.random.choice(len(iterable))
    return iterable[int(i)]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def log_classes(classes, md, suffix):
    """Write classes to a text file, smallest first, and register the file.

    Arguments:
        classes (dict): maps each class label to the list of its members.
        md: Metadata handler, used to get the output path
            and to register the written file.
        suffix (str): name of the output file, without the ".txt" extension.
    """
    basename = suffix + ".txt"
    filename = md.get_path(basename)
    log.info("Found %s %s", len(classes), suffix)
    log.info("Printing log to %s", filename)

    with open(filename, "w", encoding="utf-8") as flow:
        for label in sorted(classes, key=lambda x: len(classes[x])):
            members = classes[label]
            header = "\n\n{} ({}) \n\t".format(label, len(members))
            flow.write(header + ", ".join(members))

    md.register_file(basename, description="Log of the macroclass computation")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def hierarchical_clustering(patterns, paradigms, Clusters, **kwargs):
    """Cluster microclasses bottom-up and return the inflection class tree.

    Steps:

    - find the microclasses;
    - start with one cluster per microclass;
    - while more than one cluster remains, ask the builder for its best
      merges, pick one of them at random, and perform it;
    - export the macroclasses of the resulting tree, if there are any.

    Scoring, finding the best merges and merging nodes all depend
    on the `Clusters` class.

    Arguments:
        patterns (patterns.ParadigmPatterns): alternation patterns
        paradigms (paradigms.Paradigms): paradigms of forms
        Clusters : a cluster class to use in clustering.
        kwargs: any keywords arguments to pass to Clusters.
            "md" is mandatory: it is the Metadata register used
            to export the macroclasses.

    Returns:
        The root node of the inflection class tree.
    """
    builder = Clusters(find_microclasses(paradigms, patterns), patterns, **kwargs)

    while len(builder.nodes) > 1:
        log.info("number of classes = %s", len(builder.nodes))
        left, right, _ = choose(builder.find_ordered_merges())
        builder.merge(left, right)

    root = builder.rootnode()

    # Export macroclasses
    macroclasses = root.macroclasses()
    if not macroclasses:
        log.warning("No macroclasses could be found "
                    " this is not necessarily a bug, but it is surprising !")
    else:
        log_classes(macroclasses, kwargs['md'], "macroclasses")

    return root
|
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
# !usr/bin/python3
|
|
2
|
+
# -*- coding: utf-8 -*-
|
|
3
|
+
"""Classes to make clustering decisions and build inflection class trees according to description length.
|
|
4
|
+
|
|
5
|
+
Author: Sacha Beniamine
|
|
6
|
+
"""
|
|
7
|
+
import logging
|
|
8
|
+
from collections import defaultdict, Counter
|
|
9
|
+
from itertools import combinations
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
from tqdm import tqdm
|
|
13
|
+
|
|
14
|
+
from .node import Node
|
|
15
|
+
|
|
16
|
+
log = logging.getLogger()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Cluster(object):
    """A single cluster of microclasses, for description length clustering.

    Iterating on a Cluster iterates on the keys of its patterns (pairs of
    cells); indexing a Cluster gives the Counter of patterns for a pair.
    Clusters are merged with ``+`` / ``+=`` and separated with ``-`` / ``-=``.

    Attributes:
        patterns (:class:`collections.defaultdict`): For each pair of cells,
            a Counter of the patterns used in this cluster::

                { pair of cells: Counter({Pattern: int }) }

            Each Counter also carries a ``.length`` attribute
            (its number of patterns), to avoid calling len() repeatedly.

        labels (set): the exemplars of the microclasses in this cluster.
        size (int): The size of this cluster.
        totalsize (int): The size of the whole system of clusters.
        R : The cost in bits to disambiguate, for each pair of cells,
            which pattern is to be used with which microclass.
        C : The contribution of this cluster to the cost of mapping
            from microclasses to clusters.
    """

    def __init__(self, *args):
        """Create an empty cluster with the given labels.

        Arguments:
            args (str): Names (exemplar) of each microclass belonging to the cluster.
        """
        # pair of cells -> Counter(patterns)
        self.patterns = defaultdict(Counter)
        self.labels = set(args)
        self.size = 0
        self.R = 0
        self.C = 0
        self.totalsize = 0

    def init_from_paradigm(self, class_size, patterns, size):
        """Fill this cluster from the patterns of its exemplar.

        This assumes that the cluster holds a single microclass.

        Arguments:
            class_size (int): the size of the microclass
            patterns (patterns.ParadigmPatterns): patterns
            size (int): total size
        """
        self.size = class_size
        self.totalsize = size
        self.R = 0
        self.C = weighted_log(self.size, self.totalsize)
        exemplar = next(iter(self.labels))

        for pair in patterns:
            table = patterns[pair]
            # There might be multiple patterns in case of overabundance.
            used = table.loc[table.lexeme == exemplar, "pattern"]
            for pattern in used:
                self[pair][pattern] = self.size
            self[pair].length = 1
            self.R += sum(weighted_log(self[pair][p], self.size) for p in self[pair])

    def __copy(self):
        """Return a new cluster with the same content as this one."""
        duplicate = Cluster()
        duplicate.totalsize = self.totalsize
        duplicate += self
        return duplicate

    def __update_attributes(self, other, update_action):
        """Combine sizes and pattern counts with `other`, then recompute C and R."""
        self.size = update_action(self.size, other.size)
        self.C = weighted_log(self.size, self.totalsize)
        self.R = 0
        for cell in set(self).union(other):
            self[cell] = update_action(self[cell], other[cell])
            self[cell].length = len(self[cell])
            self.R += sum(weighted_log(self[cell][p], self.size) for p in self[cell])

    def __str__(self):
        return "<Cluster {} size={}; C={}; R={}; Pattern={}>".format(
            self.labels, self.size, self.C, self.R, self.patterns)

    def __iter__(self):
        return iter(self.patterns)

    def __getitem__(self, key):
        return self.patterns[key]

    def __setitem__(self, key, item):
        self.patterns[key] = item

    def __radd__(self, other):
        # A left operand equal to 0 (as when summing clusters with sum())
        # is replaced by an empty cluster.
        if other == 0:
            return self + Cluster()
        return self + other

    def __add__(self, other):
        merged = self.__copy()
        merged += other
        return merged

    def __sub__(self, other):
        remainder = self.__copy()
        remainder -= other
        return remainder

    def __iadd__(self, other):
        self.labels = self.labels.union(other.labels)
        self.__update_attributes(other, lambda a, b: a + b)
        return self

    def __isub__(self, other):
        self.labels = self.labels.difference(other.labels)
        self.__update_attributes(other, lambda a, b: a - b)
        return self
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
class BUDLClustersBuilder(object):
    """Builder for hierarchical clusters of inflection classes with description length based decisions.

    This class holds two representations of the clusters it builds. On one hand, the class
    Cluster represents the information needed to compute the description length of a cluster.
    On the other hand, the class Node represents the inflection classes being built.
    A Node can have children, a Cluster can be split or merged.

    Attributes:
        microclasses (dict[str, list]): mapping of microclasses exemplars to microclasses inventories.
        nodes (dict[frozenset, Node]): Maps frozensets of microclass exemplars to Nodes representing
            the current clusters. Merged nodes are removed from this mapping.
        preferences (dict): Configuration parameters (the constructor's keyword arguments).
        attr (str): (class attribute) always has the value "DL", as the nodes created by merging
            have a "DL" attribute.
        DL (float): A description length DL, with DL(system) = DL(M) + DL(C) + DL(P) + DL(R)
        M (float): DL(M), the cost in bits to express the mapping between lexemes and microclasses.
        C (float): DL(C), the cost in bits to express the mapping between microclasses and clusters.
        P (float): DL(P), the cost in bits to express the relation between clusters and patterns.
        R (float): DL(R), the cost in bits to disambiguate which pattern to use in each cluster for each microclass.
        clusters (dict[frozenset, Cluster]): Clusters, indexed by a frozenset of microclass exemplars.
            Clusters which have been merged are kept in this mapping.
        patterns (dict[str, collections.Counter]): A dict of pairs of cells to a count of patterns
            to the number of clusters presenting this pattern for this cell.::

                { str: Counter({Pattern: int }) }
                pairs of cells -> pattern -> number of clusters with this pattern for this cell

            Note that each Counter also has a .length attribute, to avoid recomputing it repeatedly.
            Remark that the count is not the same as in the class Cluster.
        size (int): The size of the whole system in microclasses.
    """

    attr = "DL"

    def __init__(self, microclasses, patterns, **kwargs):
        """Constructor.

        Creates one leaf Node and one Cluster per microclass,
        then computes the initial description length.

        Arguments:
            microclasses (dict[str, list]): mapping of microclasses exemplars to microclasses inventories.
            patterns (patterns.ParadigmPatterns): patterns
            kwargs : keyword arguments to be used as configuration.
        """
        self.preferences = kwargs
        self.microclasses = microclasses
        # One leaf node per microclass, sized by its number of members.
        self.nodes = {
            frozenset([m]): Node([m], size=len(self.microclasses[m]), macroclass=False)
            for m in
            self.microclasses}

        self.P = self.M = self.C = self.R = self.DL = 0
        self.initialize_clusters(patterns)
        self.initialize_patterns()
        self.compute_DL(M=True)
        current_partition = " - ".join(", ".join(c) for c in self.clusters)
        log.debug("\t".join(["Partition", "M", "C", "P", "R", "DL"]))
        log.debug(" ".join([current_partition, ":\t", "\t".join(
            (str(self.M), str(self.C), str(self.P), str(self.R), str(self.DL)))]))

    def initialize_clusters(self, patterns):
        """Create one Cluster per microclass, each with a size of 1.

        Arguments:
            patterns (patterns.ParadigmPatterns): patterns
        """
        self.clusters = {}
        # Every microclass counts for 1: the system size is the number of microclasses.
        classes_size = {m: 1 for m in self.microclasses}
        self.size = sum(classes_size.values())

        for microclass in self.microclasses:
            cluster = Cluster(microclass)
            cluster.init_from_paradigm(classes_size[microclass], patterns, self.size)
            self.clusters[frozenset([microclass])] = cluster

    def initialize_patterns(self):
        """Count, for each pair of cells, how many clusters present each pattern."""
        self.patterns = defaultdict(Counter)
        # The pairs of cells are read from the first cluster.
        for cell in next(iter(self.clusters.values())):
            for label in self.nodes:
                # Each cluster counts once for each of its patterns.
                self.patterns[cell] += Counter(list(self.clusters[label][cell]))
            self.patterns[cell].length = sum(self.patterns[cell].values())

    def compute_DL(self, M=False):
        """Compute the description length of the current system.

        P, C and R are accumulated on top of their current values
        (they are not reset here): calling this method a second time
        would count them twice.

        Arguments:
            M (bool): whether to (re)compute DL(M).
        """
        values = [len(self.microclasses[m]) for m in self.microclasses]
        if M:
            total = sum(values)
            self.M = sum(weighted_log(val, total) for val in values)

        self.size = len(self.microclasses)
        for cell in self.patterns:

            # This is P_p
            for pattern in self.patterns[cell]:
                self.P += weighted_log(self.patterns[cell][pattern],
                                       self.patterns[cell].length)

            # This is P_c
            cluster_patterns = [len(self.clusters[cluster][cell]) for cluster in
                                self.nodes]
            total = sum(cluster_patterns)
            self.P += sum(weighted_log(a, total) for a in cluster_patterns)

        for label in self.nodes:
            self.C += self.clusters[label].C
            self.R += self.clusters[label].R

        self.DL = (self.M + self.C + self.P + self.R)

    def _simulate_merge(self, a, b):
        """Simulate merging two clusters, return parameters for the DL.

        The builder itself is not updated.

        Parameters:
            a (frozenset): the label of a cluster to merge.
            b (frozenset): the label of a cluster to merge.

        Returns:
            tuple: R, C and P for the system after the merge, the updated
            counts of patterns, and the merged Cluster.
        """
        g1 = self.clusters[a]
        g2 = self.clusters[b]
        new = g1 + g2
        # Replace the contributions of the two clusters by that of the merged one.
        C = self.C - g1.C - g2.C + new.C
        P = 0
        patterns = defaultdict(Counter)

        for cell in g1:
            # This is P_p
            patterns[cell] = self.patterns[cell] + \
                             Counter(list(new[cell])) - \
                             Counter(list(g1[cell])) - \
                             Counter(list(g2[cell]))

            patterns[cell].length = self.patterns[cell].length + new[cell].length - g1[
                cell].length - g2[cell].length

            for pattern in patterns[cell]:
                P += weighted_log(patterns[cell][pattern], patterns[cell].length)

            # This is P_c
            cluster_patterns = [new[cell].length]
            for cluster in self.nodes:
                if cluster not in [a, b]:
                    cluster_patterns.append(self.clusters[cluster][cell].length)
            total = sum(cluster_patterns)
            P += sum(weighted_log(a, total) for a in cluster_patterns)

        R = self.R - g1.R - g2.R + new.R

        return R, C, P, patterns, new

    def merge(self, a, b):
        """Merge two Clusters, build a Node to represent the result, update the DL.

        If the description length does not decrease, the new node is
        colored "r" and is not flagged as a macroclass.

        Parameters:
            a (frozenset): the label of a cluster to merge.
            b (frozenset): the label of a cluster to merge."""
        labels = a | b
        self.R, self.C, self.P, self.patterns, self.clusters[
            labels] = self._simulate_merge(a, b)
        # del self.clusters[b]
        # del self.clusters[a]

        prev_DL = self.DL
        self.DL = (self.R + self.C + self.P + self.M)

        left = self.nodes.pop(a)
        right = self.nodes.pop(b)
        leaves = list(labels)
        size = left.attributes["size"] + right.attributes["size"]
        color = "c"
        if self.DL >= prev_DL:
            log.info("DL stopped improving: prev = {}, current best = {}".format(prev_DL,
                                                                                 self.DL))
            color = "r"

        self.nodes[labels] = Node(leaves, size=size, children=[left, right],
                                  DL=self.DL, color=color, macroclass=color != "r")

        log.debug("Merging %s and %s with DL %s", ", ".join(a), ", ".join(b), self.DL)

        current_partition = " - ".join(
            [", ".join(self.nodes[c].labels) for c in self.nodes])
        log.debug(" ".join(
            [current_partition, ":\t", "\t".join(
                (str(self.M), str(self.C), str(self.P), str(self.R), str(self.DL)))]))

    def find_ordered_merges(self):
        """Find the list of all best merges of two clusters.

        Every pair of current clusters is simulated; the pairs which give
        the smallest description length are returned (ties are compared
        with exact equality).

        The list is a list of tuples of length 3 containing two frozensets representing the
        labels of the clusters to merge and the description length of the resulting system.
        """
        best_merges = []
        best = np.inf
        pairs = combinations(sorted(self.nodes), 2)
        # Number of pairs, only used to size the progress bar.
        tot = (len(self.nodes) * (len(self.nodes) - 1)) // 2

        for g1, g2 in tqdm(pairs, leave=False, total=tot):
            R, C, P, *_ = self._simulate_merge(g1, g2)
            DL = self.M + R + C + P
            if DL < best:
                best_merges = [(g1, g2, DL)]
                best = DL
            elif DL == best:
                best_merges.append((g1, g2, DL))

        if len(best_merges) > 1:
            choices = ", ".join(
                ["({}, {})".format("-".join(a), "-".join(b)) for a, b, _ in best_merges])
            # The count is filled by .format, the %s placeholder by the logger.
            log.warning("There were {} equivalent choices: %s"
                        .format(len(best_merges)), choices)

        return best_merges

    def rootnode(self):
        """Return the root of the Inflection Class tree, if it exists.

        Fails on the assertion unless exactly one node remains.
        """
        assert len(self.nodes) == 1
        return next(iter(self.nodes.values()))
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def weighted_log(symbol_count, message_length):
    r"""Compute :math:`-log_{2}(symbol_count/message_length) * symbol_count`.

    This corresponds to the product inside the sum
    of the description length formula
    when probabilities are estimated on frequencies.

    Arguments:
        symbol_count (int): a count of symbols.
        message_length (int): the size of the message.

    Returns:
        (float): the weighted log. It is 0 if either
        `symbol_count` or `message_length` is 0.
    """
    # Explicit guards rather than catching ZeroDivisionError: NumPy scalars
    # do not raise on a division by zero, they yield inf.
    if symbol_count == 0 or message_length == 0:
        return 0
    return symbol_count * -np.log2(symbol_count / message_length)
|