chaine 3.13.1__cp311-cp311-musllinux_1_2_i686.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of chaine might be problematic. Click here for more details.

Files changed (70) hide show
  1. chaine/__init__.py +2 -0
  2. chaine/_core/crf.cpp +19854 -0
  3. chaine/_core/crf.cpython-311-i386-linux-musl.so +0 -0
  4. chaine/_core/crf.pyx +271 -0
  5. chaine/_core/crfsuite/COPYING +27 -0
  6. chaine/_core/crfsuite/README +183 -0
  7. chaine/_core/crfsuite/include/crfsuite.h +1077 -0
  8. chaine/_core/crfsuite/include/crfsuite.hpp +649 -0
  9. chaine/_core/crfsuite/include/crfsuite_api.hpp +406 -0
  10. chaine/_core/crfsuite/include/os.h +65 -0
  11. chaine/_core/crfsuite/lib/cqdb/COPYING +28 -0
  12. chaine/_core/crfsuite/lib/cqdb/include/cqdb.h +518 -0
  13. chaine/_core/crfsuite/lib/cqdb/src/cqdb.c +639 -0
  14. chaine/_core/crfsuite/lib/cqdb/src/lookup3.c +1271 -0
  15. chaine/_core/crfsuite/lib/cqdb/src/main.c +184 -0
  16. chaine/_core/crfsuite/lib/crf/src/crf1d.h +354 -0
  17. chaine/_core/crfsuite/lib/crf/src/crf1d_context.c +788 -0
  18. chaine/_core/crfsuite/lib/crf/src/crf1d_encode.c +1020 -0
  19. chaine/_core/crfsuite/lib/crf/src/crf1d_feature.c +382 -0
  20. chaine/_core/crfsuite/lib/crf/src/crf1d_model.c +1085 -0
  21. chaine/_core/crfsuite/lib/crf/src/crf1d_tag.c +582 -0
  22. chaine/_core/crfsuite/lib/crf/src/crfsuite.c +500 -0
  23. chaine/_core/crfsuite/lib/crf/src/crfsuite_internal.h +233 -0
  24. chaine/_core/crfsuite/lib/crf/src/crfsuite_train.c +302 -0
  25. chaine/_core/crfsuite/lib/crf/src/dataset.c +115 -0
  26. chaine/_core/crfsuite/lib/crf/src/dictionary.c +127 -0
  27. chaine/_core/crfsuite/lib/crf/src/holdout.c +83 -0
  28. chaine/_core/crfsuite/lib/crf/src/json.c +1497 -0
  29. chaine/_core/crfsuite/lib/crf/src/json.h +120 -0
  30. chaine/_core/crfsuite/lib/crf/src/logging.c +85 -0
  31. chaine/_core/crfsuite/lib/crf/src/logging.h +49 -0
  32. chaine/_core/crfsuite/lib/crf/src/params.c +370 -0
  33. chaine/_core/crfsuite/lib/crf/src/params.h +84 -0
  34. chaine/_core/crfsuite/lib/crf/src/quark.c +180 -0
  35. chaine/_core/crfsuite/lib/crf/src/quark.h +46 -0
  36. chaine/_core/crfsuite/lib/crf/src/rumavl.c +1178 -0
  37. chaine/_core/crfsuite/lib/crf/src/rumavl.h +144 -0
  38. chaine/_core/crfsuite/lib/crf/src/train_arow.c +409 -0
  39. chaine/_core/crfsuite/lib/crf/src/train_averaged_perceptron.c +237 -0
  40. chaine/_core/crfsuite/lib/crf/src/train_l2sgd.c +491 -0
  41. chaine/_core/crfsuite/lib/crf/src/train_lbfgs.c +323 -0
  42. chaine/_core/crfsuite/lib/crf/src/train_passive_aggressive.c +442 -0
  43. chaine/_core/crfsuite/lib/crf/src/vecmath.h +360 -0
  44. chaine/_core/crfsuite/swig/crfsuite.cpp +1 -0
  45. chaine/_core/crfsuite_api.pxd +67 -0
  46. chaine/_core/liblbfgs/COPYING +22 -0
  47. chaine/_core/liblbfgs/README +71 -0
  48. chaine/_core/liblbfgs/include/lbfgs.h +745 -0
  49. chaine/_core/liblbfgs/lib/arithmetic_ansi.h +142 -0
  50. chaine/_core/liblbfgs/lib/arithmetic_sse_double.h +303 -0
  51. chaine/_core/liblbfgs/lib/arithmetic_sse_float.h +312 -0
  52. chaine/_core/liblbfgs/lib/lbfgs.c +1531 -0
  53. chaine/_core/tagger_wrapper.hpp +58 -0
  54. chaine/_core/trainer_wrapper.cpp +32 -0
  55. chaine/_core/trainer_wrapper.hpp +26 -0
  56. chaine/crf.py +505 -0
  57. chaine/logging.py +214 -0
  58. chaine/optimization/__init__.py +10 -0
  59. chaine/optimization/metrics.py +129 -0
  60. chaine/optimization/spaces.py +394 -0
  61. chaine/optimization/trial.py +103 -0
  62. chaine/optimization/utils.py +119 -0
  63. chaine/training.py +184 -0
  64. chaine/typing.py +18 -0
  65. chaine/validation.py +43 -0
  66. chaine-3.13.1.dist-info/METADATA +348 -0
  67. chaine-3.13.1.dist-info/RECORD +70 -0
  68. chaine-3.13.1.dist-info/WHEEL +4 -0
  69. chaine.libs/libgcc_s-1257a076.so.1 +0 -0
  70. chaine.libs/libstdc++-0530927c.so.6.0.32 +0 -0
@@ -0,0 +1,394 @@
1
+ """
2
+ chaine.optimization.spaces
3
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~
4
+
5
+ This module implements hyperparameter search spaces for the different training methods.
6
+ """
7
+
8
+ import random
9
+ from abc import ABC, abstractmethod
10
+
11
+ from chaine.optimization.utils import NumberSeries
12
+
13
+
14
class SearchSpace(ABC):
    """Abstract base class for hyperparameter search spaces.

    Concrete subclasses describe, for one training algorithm, the candidate
    values of every tunable hyperparameter and know how to draw a random
    combination from those candidates.
    """

    @property
    @abstractmethod
    def algorithm(self) -> str:
        """Identifier of the training algorithm this space belongs to."""

    @abstractmethod
    def random_hyperparameters(self) -> dict[str, int | float | bool | str]:
        """Draw one random hyperparameter combination from the space."""
23
+
24
+
25
class LBFGSSearchSpace(SearchSpace):
    """Search space with hyperparameter candidates for the L-BFGS trainer."""

    def __init__(
        self,
        min_freq: NumberSeries = NumberSeries(start=0, stop=5, step=1),
        num_memories: NumberSeries = NumberSeries(start=1, stop=10, step=1),
        c1: NumberSeries = NumberSeries(start=0.0, stop=2.0, step=0.01),
        c2: NumberSeries = NumberSeries(start=0.0, stop=2.0, step=0.01),
        epsilon: NumberSeries = NumberSeries(start=0.00001, stop=0.001, step=0.00001),
        period: NumberSeries = NumberSeries(start=1, stop=20, step=1),
        delta: NumberSeries = NumberSeries(start=0.00001, stop=0.001, step=0.00001),
        max_linesearch: NumberSeries = NumberSeries(start=0, stop=50, step=1),
        linesearch: set[str] = {"MoreThuente", "Backtracking", "StrongBacktracking"},
        all_possible_states: set[bool] = {True, False},
        all_possible_transitions: set[bool] = {True, False},
    ):
        """Hyperparameter search space for Limited-Memory BFGS.

        Parameters
        ----------
        min_freq : NumberSeries, optional
            Candidate thresholds for the minimum frequency of a feature in the
            training data, by default NumberSeries(start=0, stop=5, step=1).
        num_memories : NumberSeries, optional
            Candidate numbers of limited memories used to approximate the
            inverse hessian matrix, by default NumberSeries(start=1, stop=10, step=1).
        c1 : NumberSeries, optional
            Candidate coefficients for L1 regularization,
            by default NumberSeries(start=0.0, stop=2.0, step=0.01).
        c2 : NumberSeries, optional
            Candidate coefficients for L2 regularization,
            by default NumberSeries(start=0.0, stop=2.0, step=0.01).
        epsilon : NumberSeries, optional
            Candidate values for the convergence condition,
            by default NumberSeries(start=0.00001, stop=0.001, step=0.00001).
        period : NumberSeries, optional
            Candidate numbers of iterations after which the stopping criterion
            is tested, by default NumberSeries(start=1, stop=20, step=1).
        delta : NumberSeries, optional
            Candidate minimum log likelihood improvements before training
            stops, by default NumberSeries(start=0.00001, stop=0.001, step=0.00001).
        max_linesearch : NumberSeries, optional
            Candidate maximum numbers of trials for the line search algorithm,
            by default NumberSeries(start=0, stop=50, step=1).
        linesearch : set[str], optional
            Candidate line search algorithms used in updates,
            by default {"MoreThuente", "Backtracking", "StrongBacktracking"}.
        all_possible_states : set[bool], optional
            Whether to generate state features that never occur in the
            training data, by default {True, False}.
        all_possible_transitions : set[bool], optional
            Whether to generate transition features that never occur in the
            training data, by default {True, False}.
        """
        self.min_freq = min_freq
        self.num_memories = num_memories
        self.c1 = c1
        self.c2 = c2
        self.epsilon = epsilon
        self.period = period
        self.delta = delta
        self.max_linesearch = max_linesearch
        self.linesearch = linesearch
        self.all_possible_states = all_possible_states
        self.all_possible_transitions = all_possible_transitions

    @property
    def algorithm(self) -> str:
        """Identifier of the training algorithm."""
        return "lbfgs"

    @staticmethod
    def _pick(values) -> int | float | bool | str:
        # draw one uniformly random candidate from an iterable of values
        return random.choice(list(values))

    def random_hyperparameters(self) -> dict[str, int | float | bool | str]:
        """Draw one random hyperparameter combination from this space.

        Returns
        -------
        dict[str, int | float | bool | str]
            Randomly selected hyperparameters.
        """
        return {
            "algorithm": self.algorithm,
            "min_freq": self._pick(self.min_freq),
            "all_possible_states": self._pick(self.all_possible_states),
            "all_possible_transitions": self._pick(self.all_possible_transitions),
            "num_memories": self._pick(self.num_memories),
            "c1": self._pick(self.c1),
            "c2": self._pick(self.c2),
            "epsilon": self._pick(self.epsilon),
            "period": self._pick(self.period),
            "delta": self._pick(self.delta),
            "linesearch": self._pick(self.linesearch),
            "max_linesearch": self._pick(self.max_linesearch),
        }
116
+
117
+
118
class L2SGDSearchSpace(SearchSpace):
    """Search space with hyperparameter candidates for SGD with L2 regularization."""

    def __init__(
        self,
        min_freq: NumberSeries = NumberSeries(start=0, stop=5, step=1),
        all_possible_states: set[bool] = {True, False},
        all_possible_transitions: set[bool] = {True, False},
        c2: NumberSeries = NumberSeries(start=0.0, stop=2.0, step=0.01),
        period: NumberSeries = NumberSeries(start=1, stop=20, step=1),
        delta: NumberSeries = NumberSeries(start=0.00001, stop=0.001, step=0.00001),
        calibration_eta: NumberSeries = NumberSeries(start=0.00001, stop=0.001, step=0.00001),
        calibration_rate: NumberSeries = NumberSeries(start=0.5, stop=5.0, step=0.1),
        calibration_samples: NumberSeries = NumberSeries(start=100, stop=3000, step=10),
        calibration_candidates: NumberSeries = NumberSeries(start=1, stop=30, step=1),
        calibration_max_trials: NumberSeries = NumberSeries(start=1, stop=30, step=1),
    ):
        """Hyperparameter search space for SGD with L2 parameters.

        Parameters
        ----------
        min_freq : NumberSeries, optional
            Candidate thresholds for the minimum frequency of a feature in the
            training data, by default NumberSeries(start=0, stop=5, step=1).
        all_possible_states : set[bool], optional
            Whether to generate state features that never occur in the
            training data, by default {True, False}.
        all_possible_transitions : set[bool], optional
            Whether to generate transition features that never occur in the
            training data, by default {True, False}.
        c2 : NumberSeries, optional
            Candidate coefficients for L2 regularization,
            by default NumberSeries(start=0.0, stop=2.0, step=0.01).
        period : NumberSeries, optional
            Candidate numbers of iterations after which the stopping criterion
            is tested, by default NumberSeries(start=1, stop=20, step=1).
        delta : NumberSeries, optional
            Candidate minimum log likelihood improvements before training
            stops, by default NumberSeries(start=0.00001, stop=0.001, step=0.00001).
        calibration_eta : NumberSeries, optional
            Candidate initial learning rates (eta) used for calibration,
            by default NumberSeries(start=0.00001, stop=0.001, step=0.00001).
        calibration_rate : NumberSeries, optional
            Candidate rates of increase/decrease of the learning rate during
            calibration, by default NumberSeries(start=0.5, stop=5.0, step=0.1).
        calibration_samples : NumberSeries, optional
            Candidate numbers of instances used for calibration,
            by default NumberSeries(start=100, stop=3000, step=10).
        calibration_candidates : NumberSeries, optional
            Candidate numbers of learning rate candidates,
            by default NumberSeries(start=1, stop=30, step=1).
        calibration_max_trials : NumberSeries, optional
            Candidate maximum numbers of learning rate trials during
            calibration, by default NumberSeries(start=1, stop=30, step=1).
        """
        self.min_freq = min_freq
        self.all_possible_states = all_possible_states
        self.all_possible_transitions = all_possible_transitions
        self.c2 = c2
        self.period = period
        self.delta = delta
        self.calibration_eta = calibration_eta
        self.calibration_rate = calibration_rate
        self.calibration_samples = calibration_samples
        self.calibration_candidates = calibration_candidates
        self.calibration_max_trials = calibration_max_trials

    @property
    def algorithm(self) -> str:
        """Identifier of the training algorithm."""
        return "l2sgd"

    @staticmethod
    def _pick(values) -> int | float | bool | str:
        # draw one uniformly random candidate from an iterable of values
        return random.choice(list(values))

    def random_hyperparameters(self) -> dict[str, int | float | bool | str]:
        """Draw one random hyperparameter combination from this space.

        Returns
        -------
        dict[str, int | float | bool | str]
            Randomly selected hyperparameters.
        """
        return {
            "algorithm": self.algorithm,
            "min_freq": self._pick(self.min_freq),
            "all_possible_states": self._pick(self.all_possible_states),
            "all_possible_transitions": self._pick(self.all_possible_transitions),
            "c2": self._pick(self.c2),
            "period": self._pick(self.period),
            "delta": self._pick(self.delta),
            "calibration_eta": self._pick(self.calibration_eta),
            "calibration_rate": self._pick(self.calibration_rate),
            "calibration_samples": self._pick(self.calibration_samples),
            "calibration_candidates": self._pick(self.calibration_candidates),
            "calibration_max_trials": self._pick(self.calibration_max_trials),
        }
209
+
210
+
211
class APSearchSpace(SearchSpace):
    """Search space with hyperparameter candidates for the Averaged Perceptron."""

    def __init__(
        self,
        min_freq: NumberSeries = NumberSeries(start=0, stop=5, step=1),
        all_possible_states: set[bool] = {True, False},
        all_possible_transitions: set[bool] = {True, False},
        epsilon: NumberSeries = NumberSeries(start=0.00001, stop=0.001, step=0.00001),
    ):
        """Hyperparameter search space for Averaged Perceptron.

        Parameters
        ----------
        min_freq : NumberSeries, optional
            Candidate thresholds for the minimum frequency of a feature in the
            training data, by default NumberSeries(start=0, stop=5, step=1).
        all_possible_states : set[bool], optional
            Whether to generate state features that never occur in the
            training data, by default {True, False}.
        all_possible_transitions : set[bool], optional
            Whether to generate transition features that never occur in the
            training data, by default {True, False}.
        epsilon : NumberSeries, optional
            Candidate values for the convergence condition,
            by default NumberSeries(start=0.00001, stop=0.001, step=0.00001).
        """
        self.min_freq = min_freq
        self.all_possible_states = all_possible_states
        self.all_possible_transitions = all_possible_transitions
        self.epsilon = epsilon

    @property
    def algorithm(self) -> str:
        """Identifier of the training algorithm."""
        return "ap"

    @staticmethod
    def _pick(values) -> int | float | bool | str:
        # draw one uniformly random candidate from an iterable of values
        return random.choice(list(values))

    def random_hyperparameters(self) -> dict[str, int | float | bool | str]:
        """Draw one random hyperparameter combination from this space.

        Returns
        -------
        dict[str, int | float | bool | str]
            Randomly selected hyperparameters.
        """
        return {
            "algorithm": self.algorithm,
            "min_freq": self._pick(self.min_freq),
            "all_possible_states": self._pick(self.all_possible_states),
            "all_possible_transitions": self._pick(self.all_possible_transitions),
            "epsilon": self._pick(self.epsilon),
        }
260
+
261
+
262
class PASearchSpace(SearchSpace):
    def __init__(
        self,
        min_freq: NumberSeries = NumberSeries(start=0, stop=5, step=1),
        all_possible_states: set[bool] = {True, False},
        all_possible_transitions: set[bool] = {True, False},
        epsilon: NumberSeries = NumberSeries(start=0.00001, stop=0.001, step=0.00001),
        # fix: the default is a plain set of strategy ids, not a NumberSeries,
        # so the annotation must be set[int]
        pa_type: set[int] = {0, 1, 2},
        c: NumberSeries = NumberSeries(start=0.0, stop=2.0, step=0.01),
        error_sensitive: set[bool] = {True, False},
        averaging: set[bool] = {True, False},
    ):
        """Hyperparameter search space for Passive Aggressive.

        Parameters
        ----------
        min_freq : NumberSeries, optional
            Threshold value for minimum frequency of a feature occurring in training data,
            by default NumberSeries(start=0, stop=5, step=1).
        all_possible_states : set[bool], optional
            Generate state features that do not even occur in the training data,
            by default {True, False}.
        all_possible_transitions : set[bool], optional
            Generate transition features that do not even occur in the training data,
            by default {True, False}.
        epsilon : NumberSeries, optional
            Parameter that determines the condition of convergence,
            by default NumberSeries(start=0.00001, stop=0.001, step=0.00001).
        pa_type : set[int], optional
            Strategy for updating feature weights, by default {0, 1, 2}.
        c : NumberSeries, optional
            Aggressiveness parameter, by default NumberSeries(start=0.0, stop=2.0, step=0.01).
        error_sensitive : set[bool], optional
            Include square root of predicted incorrect labels into optimization routine,
            by default {True, False}.
        averaging : set[bool], optional
            Compute average of feature weights at all updates, by default {True, False}.
        """
        self.min_freq = min_freq
        self.all_possible_states = all_possible_states
        self.all_possible_transitions = all_possible_transitions
        self.epsilon = epsilon
        self.pa_type = pa_type
        self.c = c
        self.error_sensitive = error_sensitive
        self.averaging = averaging

    @property
    def algorithm(self) -> str:
        """Identifier of the training algorithm."""
        return "pa"

    def random_hyperparameters(self) -> dict[str, int | float | bool | str]:
        """Select random hyperparameters from the search space.

        Returns
        -------
        dict[str, int | float | bool | str]
            Randomly selected hyperparameters.
        """
        return {
            "algorithm": self.algorithm,
            "min_freq": random.choice(list(self.min_freq)),
            "all_possible_states": random.choice(list(self.all_possible_states)),
            "all_possible_transitions": random.choice(list(self.all_possible_transitions)),
            "epsilon": random.choice(list(self.epsilon)),
            "pa_type": random.choice(list(self.pa_type)),
            "c": random.choice(list(self.c)),
            "error_sensitive": random.choice(list(self.error_sensitive)),
            "averaging": random.choice(list(self.averaging)),
        }
332
+
333
+
334
class AROWSearchSpace(SearchSpace):
    """Search space with hyperparameter candidates for AROW."""

    def __init__(
        self,
        min_freq: NumberSeries = NumberSeries(start=0, stop=5, step=1),
        all_possible_states: set[bool] = {True, False},
        all_possible_transitions: set[bool] = {True, False},
        epsilon: NumberSeries = NumberSeries(start=0.00001, stop=0.001, step=0.00001),
        variance: NumberSeries = NumberSeries(start=0.00001, stop=0.001, step=0.00001),
        gamma: NumberSeries = NumberSeries(start=0.00001, stop=0.001, step=0.00001),
    ):
        """Hyperparameter search space for AROW.

        Parameters
        ----------
        min_freq : NumberSeries, optional
            Candidate thresholds for the minimum frequency of a feature in the
            training data, by default NumberSeries(start=0, stop=5, step=1).
        all_possible_states : set[bool], optional
            Whether to generate state features that never occur in the
            training data, by default {True, False}.
        all_possible_transitions : set[bool], optional
            Whether to generate transition features that never occur in the
            training data, by default {True, False}.
        epsilon : NumberSeries, optional
            Candidate values for the convergence condition,
            by default NumberSeries(start=0.00001, stop=0.001, step=0.00001).
        variance : NumberSeries, optional
            Candidate initial variances of every feature weight,
            by default NumberSeries(start=0.00001, stop=0.001, step=0.00001).
        gamma : NumberSeries, optional
            Candidate trade-offs between the loss function and changes of
            feature weights, by default NumberSeries(start=0.00001, stop=0.001, step=0.00001).
        """
        self.min_freq = min_freq
        self.all_possible_states = all_possible_states
        self.all_possible_transitions = all_possible_transitions
        self.epsilon = epsilon
        self.variance = variance
        self.gamma = gamma

    @property
    def algorithm(self) -> str:
        """Identifier of the training algorithm."""
        return "arow"

    @staticmethod
    def _pick(values) -> int | float | bool | str:
        # draw one uniformly random candidate from an iterable of values
        return random.choice(list(values))

    def random_hyperparameters(self) -> dict[str, int | float | bool | str]:
        """Draw one random hyperparameter combination from this space.

        Returns
        -------
        dict[str, int | float | bool | str]
            Randomly selected hyperparameters.
        """
        return {
            "algorithm": self.algorithm,
            "min_freq": self._pick(self.min_freq),
            "all_possible_states": self._pick(self.all_possible_states),
            "all_possible_transitions": self._pick(self.all_possible_transitions),
            "epsilon": self._pick(self.epsilon),
            "variance": self._pick(self.variance),
            "gamma": self._pick(self.gamma),
        }
@@ -0,0 +1,103 @@
1
+ """
2
+ chaine.optimization.trial
3
+ ~~~~~~~~~~~~~~~~~~~~~~~~~
4
+
5
+ This module implements a class for a hyperparameter optimization trial.
6
+ """
7
+
8
+ import statistics
9
+ import tempfile
10
+ import time
11
+ import uuid
12
+ from pathlib import Path
13
+
14
+ from chaine.optimization.metrics import evaluate_predictions
15
+ from chaine.optimization.spaces import SearchSpace
16
+ from chaine.typing import Iterable, Iterator, Labels, Sequence
17
+
18
+
19
class OptimizationTrial:
    """Context manager that trains and evaluates one hyperparameter candidate."""

    def __init__(
        self,
        splits: Iterator[tuple[tuple[Iterable[Sequence], Iterable[Labels]]]],
        space: SearchSpace,
        *,
        is_baseline: bool
    ):
        """Hyperparameter optimization trial.

        Parameters
        ----------
        splits : Iterator[tuple[tuple[Iterable[Sequence], Iterable[Labels]]]]
            K-fold split data set.
        space : SearchSpace
            Search space for hyperparameter optimization.
        is_baseline : bool
            True if trial is a baseline (i.e. default hyperparameters to be used).
        """
        self.splits = splits
        self.space = space
        self.is_baseline = is_baseline
        # unique temporary file the trained model is written to
        self.model_filepath = Path(tempfile.gettempdir(), str(uuid.uuid4()))
        # per-fold scores, filled in by __enter__
        self.precision = []
        self.recall = []
        self.f1 = []
        self.time = []

    @staticmethod
    def _describe(values: list) -> tuple:
        # mean and standard deviation of the collected per-fold values,
        # (None, None) when no fold was evaluated
        if not values:
            return None, None
        return statistics.mean(values), statistics.stdev(values)

    def __enter__(self) -> dict[str, dict]:
        """Train and evaluate a model on every fold.

        Returns
        -------
        dict[str, dict]
            Selected hyperparameters and evaluation scores.
        """
        # NOTE(review): imported locally, presumably to avoid a circular
        # import between chaine.crf and the optimization package — confirm
        from chaine.crf import Model, Trainer

        # baseline trials use the algorithm's defaults, everything else
        # draws a random combination from the search space
        if self.is_baseline:
            params = {"algorithm": self.space.algorithm}
        else:
            params = self.space.random_hyperparameters()

        for (train_dataset, train_labels), (test_dataset, test_labels) in self.splits:
            # train a model on this fold and measure the wall-clock time
            started = time.time()
            trainer = Trainer(max_iterations=100, **params)
            trainer.train(train_dataset, train_labels, model_filepath=self.model_filepath)
            finished = time.time()

            # score the freshly trained model on the held-out fold
            predictions = Model(self.model_filepath).predict(test_dataset)
            scores = evaluate_predictions(test_labels, predictions)

            self.precision.append(scores["precision"])
            self.recall.append(scores["recall"])
            self.f1.append(scores["f1"])
            self.time.append(finished - started)

        mean_precision, stdev_precision = self._describe(self.precision)
        mean_recall, stdev_recall = self._describe(self.recall)
        mean_f1, stdev_f1 = self._describe(self.f1)
        mean_time, stdev_time = self._describe(self.time)

        # report both the hyperparameters and the aggregated metrics
        return {
            "hyperparameters": params,
            "stats": {
                "mean_precision": mean_precision,
                "stdev_precision": stdev_precision,
                "mean_recall": mean_recall,
                "stdev_recall": stdev_recall,
                "mean_f1": mean_f1,
                "stdev_f1": stdev_f1,
                "mean_time": mean_time,
                "stdev_time": stdev_time,
            },
        }

    def __exit__(self, *args) -> bool:
        # remove the temporary model file if training ever created it
        if self.model_filepath.exists():
            self.model_filepath.unlink()

        # returning True suppresses any exception raised inside the trial
        return True
@@ -0,0 +1,119 @@
1
+ """
2
+ chaine.optimization.utils
3
+ ~~~~~~~~~~~~~~~~~~~~~~~~~
4
+
5
+ This module implements utility functions for hyperparameter optimization.
6
+ """
7
+
8
+ import random
9
+ from collections.abc import Iterable, Iterator
10
+ from dataclasses import dataclass
11
+
12
+ from chaine.typing import Labels, Sequence
13
+
14
+
15
@dataclass
class NumberSeries(Iterable):
    """Inclusive arithmetic series from ``start`` to ``stop`` in ``step`` increments."""

    # lower bound of the series (included)
    start: int
    # upper bound of the series (included)
    stop: int
    # distance between consecutive values
    step: int | float

    def __repr__(self) -> str:
        return f"<NumberSeries (start={self.start}, stop={self.stop}, step={self.step})>"

    def __iter__(self) -> Iterator[int | float]:
        """Yield ``start``, ``start + step``, ..., ``stop``.

        An empty series (``start == stop``) yields nothing.
        """
        # number of steps between start and stop
        n = int(round((self.stop - self.start) / float(self.step)))
        # fix: the previous implementation yielded only ``start`` when the
        # series contained exactly two values (n == 1), silently dropping the
        # endpoint; now the endpoint is included for every non-empty series
        if n >= 1:
            yield from (self.start + self.step * i for i in range(n + 1))
30
+
31
+
32
def cross_validation(
    dataset: Iterable[Sequence], labels: Iterable[Labels], k: int, seed: int | None = None
) -> Iterator[tuple[tuple[Iterable[Sequence], Iterable[Labels]]]]:
    """K-fold cross validation.

    Parameters
    ----------
    dataset : Iterable[Sequence]
        Data set to split into k folds.
    labels : Iterable[Labels]
        Labels to split into k folds.
    k : int
        Number of folds.
    seed : int | None, optional
        Random seed for shuffling the data set, by default None.

    Yields
    ------
    Iterator[tuple[tuple[Iterable[Sequence], Iterable[Labels]]]]
        Train and test set.
    """
    # materialize so that plain generators can be split and indexed as well
    dataset = list(dataset)
    labels = list(labels)

    # shuffle the example indices (deterministically if a seed is given)
    indices = list(range(len(dataset)))
    random.seed(seed)
    random.shuffle(indices)

    # split the shuffled indices into k folds
    folds = [indices[i::k] for i in range(k)]

    for i, fold in enumerate(folds):
        # fix: exclude the test fold by position instead of by content
        # (content comparison misbehaves when folds are equal, e.g. empty
        # folds for k > len(dataset)), and use sets for O(1) membership
        # tests instead of O(n) list scans
        test = set(fold)
        train = {index for j, other in enumerate(folds) if j != i for index in other}

        # yield train and test split
        yield (
            [d for index, d in enumerate(dataset) if index in train],
            [l for index, l in enumerate(labels) if index in train],
        ), (
            [d for index, d in enumerate(dataset) if index in test],
            [l for index, l in enumerate(labels) if index in test],
        )
77
+
78
+
79
def downsample(
    dataset: Iterable[Sequence], labels: Iterable[Labels], n: int, seed: int | None = None
) -> tuple[Iterable[Sequence], Iterable[Labels]]:
    """Downsample the given data set to the specified size.

    Parameters
    ----------
    dataset : Iterable[Sequence]
        Data set to downsample.
    labels : Iterable[Labels]
        Labels for the data set.
    n : int
        Number of samples to keep.
    seed : int | None, optional
        Random seed, by default None.

    Returns
    -------
    tuple[Iterable[Sequence], Iterable[Labels]]
        Downsampled data set and labels.

    Raises
    ------
    ValueError
        If number of instances in the data set is smaller than specified size.
    """
    if len(dataset) < n:
        raise ValueError("Data set is too small")

    # draw a random sample of positions (deterministic if a seed is given)
    random.seed(seed)
    keep = set(random.sample(range(len(dataset)), n))

    # retain only those instances whose position was sampled
    sampled_dataset = [instance for position, instance in enumerate(dataset) if position in keep]
    sampled_labels = [label for position, label in enumerate(labels) if position in keep]

    return sampled_dataset, sampled_labels