chaine 4.0.0b2__cp314-cp314-musllinux_1_2_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. chaine/__init__.py +2 -0
  2. chaine/_core/crf.cpp +19496 -0
  3. chaine/_core/crf.cpython-314-x86_64-linux-musl.so +0 -0
  4. chaine/_core/crfsuite/include/crfsuite.h +1077 -0
  5. chaine/_core/crfsuite/include/crfsuite_api.hpp +406 -0
  6. chaine/_core/crfsuite/lib/cqdb/src/cqdb.c +639 -0
  7. chaine/_core/crfsuite/lib/cqdb/src/lookup3.c +1271 -0
  8. chaine/_core/crfsuite/lib/crf/src/crf1d_context.c +788 -0
  9. chaine/_core/crfsuite/lib/crf/src/crf1d_encode.c +1020 -0
  10. chaine/_core/crfsuite/lib/crf/src/crf1d_feature.c +382 -0
  11. chaine/_core/crfsuite/lib/crf/src/crf1d_model.c +1085 -0
  12. chaine/_core/crfsuite/lib/crf/src/crf1d_tag.c +582 -0
  13. chaine/_core/crfsuite/lib/crf/src/crfsuite.c +500 -0
  14. chaine/_core/crfsuite/lib/crf/src/crfsuite_train.c +302 -0
  15. chaine/_core/crfsuite/lib/crf/src/dataset.c +115 -0
  16. chaine/_core/crfsuite/lib/crf/src/dictionary.c +127 -0
  17. chaine/_core/crfsuite/lib/crf/src/holdout.c +83 -0
  18. chaine/_core/crfsuite/lib/crf/src/json.c +1497 -0
  19. chaine/_core/crfsuite/lib/crf/src/logging.c +85 -0
  20. chaine/_core/crfsuite/lib/crf/src/params.c +370 -0
  21. chaine/_core/crfsuite/lib/crf/src/quark.c +180 -0
  22. chaine/_core/crfsuite/lib/crf/src/rumavl.c +1178 -0
  23. chaine/_core/crfsuite/lib/crf/src/train_arow.c +409 -0
  24. chaine/_core/crfsuite/lib/crf/src/train_averaged_perceptron.c +237 -0
  25. chaine/_core/crfsuite/lib/crf/src/train_l2sgd.c +491 -0
  26. chaine/_core/crfsuite/lib/crf/src/train_lbfgs.c +323 -0
  27. chaine/_core/crfsuite/lib/crf/src/train_passive_aggressive.c +442 -0
  28. chaine/_core/crfsuite/swig/crfsuite.cpp +1 -0
  29. chaine/_core/liblbfgs/lib/lbfgs.c +1531 -0
  30. chaine/_core/tagger_wrapper.hpp +58 -0
  31. chaine/_core/trainer_wrapper.cpp +32 -0
  32. chaine/_core/trainer_wrapper.hpp +26 -0
  33. chaine/crf.py +505 -0
  34. chaine/logging.py +214 -0
  35. chaine/optimization/__init__.py +10 -0
  36. chaine/optimization/metrics.py +129 -0
  37. chaine/optimization/spaces.py +394 -0
  38. chaine/optimization/trial.py +103 -0
  39. chaine/optimization/utils.py +119 -0
  40. chaine/training.py +184 -0
  41. chaine/typing.py +18 -0
  42. chaine/validation.py +43 -0
  43. chaine-4.0.0b2.dist-info/METADATA +343 -0
  44. chaine-4.0.0b2.dist-info/RECORD +50 -0
  45. chaine-4.0.0b2.dist-info/WHEEL +5 -0
  46. chaine-4.0.0b2.dist-info/licenses/LICENSE +22 -0
  47. chaine-4.0.0b2.dist-info/sboms/auditwheel.cdx.json +1 -0
  48. chaine-4.0.0b2.dist-info/top_level.txt +1 -0
  49. chaine.libs/libgcc_s-0cd532bd.so.1 +0 -0
  50. chaine.libs/libstdc++-5d72f927.so.6.0.33 +0 -0
chaine/training.py ADDED
@@ -0,0 +1,184 @@
1
+ """
2
+ chaine.training
3
+ ~~~~~~~~~~~~~~~
4
+
5
+ This module implements the high-level API to train a conditional random field.
6
+ """
7
+
8
+
9
+ from chaine.crf import HyperparameterOptimizer, Model, Trainer
10
+ from chaine.logging import Logger, set_verbosity
11
+ from chaine.typing import Filepath, Iterable, Labels, Sequence
12
+
13
+ LOGGER = Logger(__name__)
14
+
15
+
16
def train(
    dataset: Iterable[Sequence],
    labels: Iterable[Labels],
    *,
    model_filepath: Filepath = "model.chaine",
    optimize_hyperparameters: bool = False,
    optimization_sample_size: int | None = None,
    verbose: int = 1,
    **hyperparameters,
) -> Model:
    """Train a conditional random field.

    Parameters
    ----------
    dataset : Iterable[Sequence]
        Data set consisting of sequences of feature sets.
    labels : Iterable[Labels]
        Labels corresponding to each instance in the data set.
    model_filepath : Filepath, optional (default=model.chaine)
        Path to model location.
    optimize_hyperparameters : bool
        If True, optimize hyperparameters first. Explicitly given hyperparameters are
        overwritten by the best ones found. One-shot iterators (e.g. generators) passed as
        `dataset` or `labels` are loaded into memory in this case, because the data is needed
        for both the optimization and the final training run.
    optimization_sample_size : int | None
        Number of instances to sample from the data set for hyperparameter optimization.
    verbose : int
        Controls the verbosity: the higher, the more messages.
    algorithm : str
        The following optimization algorithms are available:
            * lbfgs: Limited-memory BFGS with L1/L2 regularization
            * l2sgd: Stochastic gradient descent with L2 regularization
            * ap: Averaged perceptron
            * pa: Passive aggressive
            * arow: Adaptive regularization of weights

    Limited-memory BFGS Parameters (lbfgs)
    --------------------------------------
    min_freq : float, optional (default=0)
        Threshold value for minimum frequency of a feature occurring in training data.
    all_possible_states : bool, optional (default=False)
        Generate state features that do not even occur in the training data.
    all_possible_transitions : bool, optional (default=False)
        Generate transition features that do not even occur in the training data.
    max_iterations : int, optional (default=None)
        Maximum number of iterations (unlimited by default).
    num_memories : int, optional (default=6)
        Number of limited memories for approximating the inverse hessian matrix.
    c1 : float, optional (default=0)
        Coefficient for L1 regularization.
    c2 : float, optional (default=1.0)
        Coefficient for L2 regularization.
    epsilon : float, optional (default=1e-5)
        Parameter that determines the condition of convergence.
    period : int, optional (default=10)
        Threshold value for iterations to test the stopping criterion.
    delta : float, optional (default=1e-5)
        Stop iteration when log likelihood is not greater than this.
    linesearch : str, optional (default="MoreThuente")
        Line search algorithm used in updates:
            * MoreThuente: More and Thuente's method
            * Backtracking: Backtracking method with regular Wolfe condition
            * StrongBacktracking: Backtracking method with strong Wolfe condition
    max_linesearch : int, optional (default=20)
        Maximum number of trials for the line search algorithm.

    SGD with L2 Parameters (l2sgd)
    ------------------------------
    min_freq : float, optional (default=0)
        Threshold value for minimum frequency of a feature occurring in training data.
    all_possible_states : bool, optional (default=False)
        Generate state features that do not even occur in the training data.
    all_possible_transitions : bool, optional (default=False)
        Generate transition features that do not even occur in the training data.
    max_iterations : int, optional (default=None)
        Maximum number of iterations (1000 by default).
    c2 : float, optional (default=1.0)
        Coefficient for L2 regularization.
    period : int, optional (default=10)
        Threshold value for iterations to test the stopping criterion.
    delta : float, optional (default=1e-5)
        Stop iteration when log likelihood is not greater than this.
    calibration_eta : float, optional (default=0.1)
        Initial value of learning rate (eta) used for calibration.
    calibration_rate : float, optional (default=2.0)
        Rate of increase/decrease of learning rate for calibration.
    calibration_samples : int, optional (default=1000)
        Number of instances used for calibration.
    calibration_candidates : int, optional (default=10)
        Number of candidates of learning rate.
    calibration_max_trials : int, optional (default=20)
        Maximum number of trials of learning rates for calibration.

    Averaged Perceptron Parameters (ap)
    -----------------------------------
    min_freq : float, optional (default=0)
        Threshold value for minimum frequency of a feature occurring in training data.
    all_possible_states : bool, optional (default=False)
        Generate state features that do not even occur in the training data.
    all_possible_transitions : bool, optional (default=False)
        Generate transition features that do not even occur in the training data.
    max_iterations : int, optional (default=None)
        Maximum number of iterations (100 by default).
    epsilon : float, optional (default=1e-5)
        Parameter that determines the condition of convergence.

    Passive Aggressive Parameters (pa)
    ----------------------------------
    min_freq : float, optional (default=0)
        Threshold value for minimum frequency of a feature occurring in training data.
    all_possible_states : bool, optional (default=False)
        Generate state features that do not even occur in the training data.
    all_possible_transitions : bool, optional (default=False)
        Generate transition features that do not even occur in the training data.
    max_iterations : int, optional (default=None)
        Maximum number of iterations (100 by default).
    epsilon : float, optional (default=1e-5)
        Parameter that determines the condition of convergence.
    pa_type : int, optional (default=1)
        Strategy for updating feature weights:
            * 0: PA without slack variables
            * 1: PA type I
            * 2: PA type II
    c : float, optional (default=1)
        Aggressiveness parameter (used only for PA-I and PA-II).
    error_sensitive : bool, optional (default=True)
        Include square root of predicted incorrect labels into optimization routine.
    averaging : bool, optional (default=True)
        Compute average of feature weights at all updates.

    Adaptive Regularization of Weights Parameters (arow)
    ----------------------------------------------------
    min_freq : float, optional (default=0)
        Threshold value for minimum frequency of a feature occurring in training data.
    all_possible_states : bool, optional (default=False)
        Generate state features that do not even occur in the training data.
    all_possible_transitions : bool, optional (default=False)
        Generate transition features that do not even occur in the training data.
    max_iterations : int, optional (default=None)
        Maximum number of iterations (100 by default).
    epsilon : float, optional (default=1e-5)
        Parameter that determines the condition of convergence.
    variance : float, optional (default=1)
        Initial variance of every feature weight.
    gamma : float, optional (default=1)
        Trade-off between loss function and changes of feature weights.

    Returns
    -------
    Model
        A conditional random field trained on the dataset.
    """
    set_verbosity(verbose)

    if optimize_hyperparameters:
        if hyperparameters:
            LOGGER.warning(f"Specified hyperparameters will be overwritten: {hyperparameters}")

        # The data is handed to the optimizer first and to the trainer afterwards. An
        # iterator (e.g. a generator) can only be traversed once, so it has to be
        # materialized here; otherwise the final training run would get exhausted input.
        # Re-iterable containers are passed through untouched.
        if iter(dataset) is dataset:
            dataset = list(dataset)
        if iter(labels) is labels:
            labels = list(labels)

        # optionally tune hyperparameters first
        optimizer = HyperparameterOptimizer()
        results = optimizer.optimize_hyperparameters(dataset, labels, optimization_sample_size)

        # use hyperparameters of the best run
        hyperparameters = results[0]["hyperparameters"]

    # initialize trainer and start training
    trainer = Trainer(**hyperparameters)
    trainer.train(dataset, labels, model_filepath=str(model_filepath))

    # load and return the trained model
    return Model(model_filepath)
chaine/typing.py ADDED
@@ -0,0 +1,18 @@
1
+ """
2
+ chaine.typing
3
+ ~~~~~~~~~~~~~
4
+
5
+ A collection of type hints.
6
+ """
7
+
8
+ from os import PathLike
9
+ from pathlib import Path
10
+ from typing import Any, Iterable, Iterator
11
+
12
+ Sequence = Iterable[dict[str, str | int | float | bool]]
13
+ Labels = Iterable[str]
14
+ Filepath = Path | PathLike | str
15
+ Sentence = list[str]
16
+ Tags = list[str]
17
+ Features = dict[str, float | int | str | bool]
18
+ Dataset = dict[str, dict[str, Any]]
chaine/validation.py ADDED
@@ -0,0 +1,43 @@
1
+ """
2
+ chaine.validation
3
+ ~~~~~~~~~~~~~~~~~
4
+
5
+ This module implements functions to validate input sequences (either for training or inference).
6
+ """
7
+
8
+ from chaine.typing import Sequence
9
+
10
# Supported feature value data types. The tuple is passed as-is to `isinstance()`
# when validating the values of a token. `bool` is listed explicitly for
# readability, although it is already covered by `int` (its base class).
TYPES = (str, int, float, bool)
12
+
13
+
14
def is_valid_sequence(sequence: Sequence) -> bool:
    """Tell whether a sequence is in the expected input format.

    A sequence is accepted if it is a list and each of its items passes
    `is_valid_token()`. An empty list is accepted as well.

    Parameters
    ----------
    sequence : Sequence
        Sequence to validate.

    Returns
    -------
    bool
        True if sequence is valid, False otherwise.
    """
    # anything that is not a list is rejected without looking at its items
    if not isinstance(sequence, list):
        return False

    # stop at the first token that is malformed
    for item in sequence:
        if not is_valid_token(item):
            return False
    return True
28
+
29
+
30
def is_valid_token(token: dict) -> bool:
    """Tell whether a token is in the expected input format.

    A token is accepted if it is a dictionary and each of its values is an
    instance of one of the supported feature value types (`TYPES`). Keys are
    not inspected. An empty dictionary is accepted as well.

    Parameters
    ----------
    token : dict
        Token to validate.

    Returns
    -------
    bool
        True if token is valid, False otherwise.
    """
    # anything that is not a dictionary is rejected without looking at its content
    if not isinstance(token, dict):
        return False

    # stop at the first feature value of an unsupported type
    for feature_value in token.values():
        if not isinstance(feature_value, TYPES):
            return False
    return True
@@ -0,0 +1,343 @@
1
+ Metadata-Version: 2.4
2
+ Name: chaine
3
+ Version: 4.0.0b2
4
+ Summary: Linear-chain conditional random fields for natural language processing
5
+ Author-email: Severin Simmler <s.simmler@snapaddy.com>
6
+ Requires-Python: >=3.11
7
+ Description-Content-Type: text/markdown
8
+ License-File: LICENSE
9
+ Dynamic: license-file
10
+
11
+ # Chaine
12
+
13
+ [![downloads](https://static.pepy.tech/personalized-badge/chaine?period=total&units=international_system&left_color=black&right_color=black&left_text=downloads)](https://pepy.tech/project/chaine)
14
+ [![downloads/month](https://static.pepy.tech/personalized-badge/chaine?period=month&units=abbreviation&left_color=black&right_color=black&left_text=downloads/month)](https://pepy.tech/project/chaine)
15
+ [![downloads/week](https://static.pepy.tech/personalized-badge/chaine?period=week&units=abbreviation&left_color=black&right_color=black&left_text=downloads/week)](https://pepy.tech/project/chaine)
16
+
17
+ Chaine is a modern, fast and lightweight Python library implementing **linear-chain conditional random fields**. Use it for sequence labeling tasks like [named entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition) or [part-of-speech tagging](https://en.wikipedia.org/wiki/Part-of-speech_tagging).
18
+
19
+ The main goals of this project are:
20
+
21
+ - **Usability**: Designed with special focus on usability and a beautiful high-level API.
22
+ - **Efficiency**: Performance critical parts are written in C and thus [blazingly fast](http://www.chokkan.org/software/crfsuite/benchmark.html). Loading a model from disk and retrieving feature weights for inference is optimized for both [speed and memory](http://www.chokkan.org/software/cqdb/).
23
+ - **Persistency**: No `pickle` or `joblib` is used for serialization. A trained model will be compatible with all versions for eternity, because the underlying C library will not change. I promise.
24
+ - **Compatibility**: There are wheels for Linux, macOS and Windows. No compiler needed.
25
+ - **Minimalism**: No code bloat, no external dependencies.
26
+
27
+ Install the latest stable version from [PyPI](https://pypi.org/project/chaine):
28
+
29
+ ```
30
+ pip install chaine
31
+ ```
32
+
33
+ ### Table of contents
34
+
35
+ - [Algorithms](#algorithms)
36
+ - [Usage](#usage)
37
+ - [Features](#features)
38
+ - [Training](#training)
39
+ - [Hyperparameters](#hyperparameters)
40
+ - [Inference](#inference)
41
+ - [Weights](#weights)
42
+ - [Credits](#credits)
43
+
44
+ ## Algorithms
45
+
46
+ You can train models using the following methods:
47
+
48
+ - Limited-Memory BFGS ([Nocedal 1980](https://www.jstor.org/stable/2006193))
49
+ - Orthant-Wise Limited-Memory Quasi-Newton ([Andrew et al. 2007](https://www.microsoft.com/en-us/research/publication/scalable-training-of-l1-regularized-log-linear-models/))
50
+ - Stochastic Gradient Descent ([Shalev et al. 2007](https://www.google.com/url?q=https://www.cs.huji.ac.il/~shais/papers/ShalevSiSr07.pdf))
51
+ - Averaged Perceptron ([Collins 2002](https://aclanthology.org/W02-1001.pdf))
52
+ - Passive Aggressive ([Crammer et al. 2006](https://jmlr.csail.mit.edu/papers/v7/crammer06a.html))
53
+ - Adaptive Regularization of Weight Vectors ([Mejer et al. 2010](https://aclanthology.org/D10-1095.pdf))
54
+
55
+ Please refer to the paper by [Lafferty et al.](https://repository.upenn.edu/cgi/viewcontent.cgi?article=1162&context=cis_papers) for a general introduction to **conditional random fields** or the respective chapter in [Speech and Language Processing](https://web.stanford.edu/~jurafsky/slp3/8.pdf).
56
+
57
+ ## Usage
58
+
59
+ Training and using a **conditional random field** for inference is as easy as:
60
+
61
+ ```python
62
+ >>> import chaine
63
+ >>> tokens = [[{"index": 0, "text": "John"}, {"index": 1, "text": "Lennon"}]]
64
+ >>> labels = [["B-PER", "I-PER"]]
65
+ >>> model = chaine.train(tokens, labels)
66
+ >>> model.predict(tokens)
67
+ [['B-PER', 'I-PER']]
68
+ ```
69
+
70
+ > You can control verbosity with the argument `verbose`, where `0` will set the log level to `ERROR`, `1` to `INFO` (which is the default) and `2` to `DEBUG`.
71
+
72
+ ### Features
73
+
74
+ One token in a sequence is represented as a dictionary with descriptive feature names as keys and respective values of type string, integer, float or boolean:
75
+
76
+ ```python
77
+ {
78
+ "text": "John",
79
+ "num_characters": 4,
80
+ "relative_index": 0.0,
81
+ "is_number": False,
82
+ }
83
+ ```
84
+
85
+ One sequence is represented as a list of feature dictionaries:
86
+
87
+ ```python
88
+ [
89
+ {"text": "John", "num_characters": 4},
90
+ {"text": "Lennon", "num_characters": 6}
91
+ ]
92
+ ```
93
+
94
+ One data set is represented as an iterable of lists of feature dictionaries:
95
+
96
+ ```python
97
+ [
98
+ [
99
+ {"text": "John", "num_characters": 4},
100
+ {"text": "Lennon", "num_characters": 6}
101
+ ],
102
+ [
103
+ {"text": "Paul", "num_characters": 4},
104
+ {"text": "McCartney", "num_characters": 9}
105
+ ],
106
+ ...
107
+ ]
108
+ ```
109
+
110
+ This is the expected input format for training. For inference, you can also process a single sequence rather than a batch of multiple sequences.
111
+
112
+ #### Generators
113
+
114
+ Depending on the size of your data set, it probably makes sense to use generators. Something like this would be totally fine for both training and inference:
115
+
116
+ ```python
117
+ ([extract_features(token) for token in tokens] for tokens in dataset)
118
+ ```
119
+
120
+ Assuming `dataset` is a generator as well, only one sequence is loaded into memory at a time.
121
+
122
+ ### Training
123
+
124
+ You can either use the high-level function to train a model (which also loads and returns it):
125
+
126
+ ```python
127
+ >>> import chaine
128
+ >>> chaine.train(tokens, labels)
129
+ ```
130
+
131
+ or the lower-level `Trainer` class:
132
+
133
+ ```python
134
+ >>> from chaine import Trainer
135
+ >>> trainer = Trainer()
136
+ ```
137
+
138
+ A `Trainer` object has a method `train()` to learn states and transitions from the given data set. You have to provide a filepath to serialize the model to:
139
+
140
+ ```python
141
+ >>> trainer.train(tokens, labels, model_filepath="model.chaine")
142
+ ```
143
+
144
+ ### Hyperparameters
145
+
146
+ Before training a model, you might want to find out the ideal hyperparameters first. You can just set the respective argument to `True`:
147
+
148
+ ```python
149
+ >>> import chaine
150
+ >>> model = chaine.train(tokens, labels, optimize_hyperparameters=True)
151
+ ```
152
+
153
+ > This might be very memory and time consuming, because 5-fold cross validation for each of the 10 trials for each of the algorithms is performed.
154
+
155
+ or use the `HyperparameterOptimizer` class and have more control over the optimization process:
156
+
157
+ ```python
158
+ >>> from chaine import HyperparameterOptimizer
159
+ >>> from chaine.optimization import L2SGDSearchSpace
160
+ >>> optimizer = HyperparameterOptimizer(trials=50, folds=3, spaces=[L2SGDSearchSpace()])
161
+ >>> optimizer.optimize_hyperparameters(tokens, labels, sample_size=1000)
162
+ ```
163
+
164
+ This will make 50 trials with 3-fold cross validation for the Stochastic Gradient Descent algorithm and return a sorted list of hyperparameters with evaluation stats. The given data set is downsampled to 1000 instances.
165
+
166
+ <details>
167
+ <summary>Example of a hyperparameter optimization report</summary>
168
+
169
+ ```json
170
+ [
171
+ {
172
+ "hyperparameters": {
173
+ "algorithm": "lbfgs",
174
+ "min_freq": 0,
175
+ "all_possible_states": true,
176
+ "all_possible_transitions": true,
177
+ "num_memories": 8,
178
+ "c1": 0.9,
179
+ "c2": 0.31,
180
+ "epsilon": 0.00011,
181
+ "period": 17,
182
+ "delta": 0.00051,
183
+ "linesearch": "Backtracking",
184
+ "max_linesearch": 31
185
+ },
186
+ "stats": {
187
+ "mean_precision": 0.4490952380952381,
188
+ "stdev_precision": 0.16497993418839532,
189
+ "mean_recall": 0.4554858934169279,
190
+ "stdev_recall": 0.20082402876210334,
191
+ "mean_f1": 0.45041435392087253,
192
+ "stdev_f1": 0.17914435056760908,
193
+ "mean_time": 0.3920876979827881,
194
+ "stdev_time": 0.0390961164333519
195
+ }
196
+ },
197
+ {
198
+ "hyperparameters": {
199
+ "algorithm": "lbfgs",
200
+ "min_freq": 5,
201
+ "all_possible_states": true,
202
+ "all_possible_transitions": false,
203
+ "num_memories": 9,
204
+ "c1": 1.74,
205
+ "c2": 0.09,
206
+ "epsilon": 0.0008600000000000001,
207
+ "period": 1,
208
+ "delta": 0.00045000000000000004,
209
+ "linesearch": "StrongBacktracking",
210
+ "max_linesearch": 34
211
+ },
212
+ "stats": {
213
+ "mean_precision": 0.4344436335328176,
214
+ "stdev_precision": 0.15542689556199216,
215
+ "mean_recall": 0.4385174258109041,
216
+ "stdev_recall": 0.19873733310765845,
217
+ "mean_f1": 0.43386496201052716,
218
+ "stdev_f1": 0.17225578421967264,
219
+ "mean_time": 0.12209572792053222,
220
+ "stdev_time": 0.0236177196325414
221
+ }
222
+ },
223
+ {
224
+ "hyperparameters": {
225
+ "algorithm": "lbfgs",
226
+ "min_freq": 2,
227
+ "all_possible_states": true,
228
+ "all_possible_transitions": true,
229
+ "num_memories": 1,
230
+ "c1": 0.91,
231
+ "c2": 0.4,
232
+ "epsilon": 0.0008400000000000001,
233
+ "period": 13,
234
+ "delta": 0.00018,
235
+ "linesearch": "MoreThuente",
236
+ "max_linesearch": 43
237
+ },
238
+ "stats": {
239
+ "mean_precision": 0.41963433149859447,
240
+ "stdev_precision": 0.16363544501259455,
241
+ "mean_recall": 0.4331173486012196,
242
+ "stdev_recall": 0.21344965207006913,
243
+ "mean_f1": 0.422038027332145,
244
+ "stdev_f1": 0.18245844823319127,
245
+ "mean_time": 0.2586916446685791,
246
+ "stdev_time": 0.04341208573100539
247
+ }
248
+ },
249
+ {
250
+ "hyperparameters": {
251
+ "algorithm": "l2sgd",
252
+ "min_freq": 5,
253
+ "all_possible_states": true,
254
+ "all_possible_transitions": true,
255
+ "c2": 1.68,
256
+ "period": 2,
257
+ "delta": 0.00047000000000000004,
258
+ "calibration_eta": 0.0006900000000000001,
259
+ "calibration_rate": 2.9000000000000004,
260
+ "calibration_samples": 1400,
261
+ "calibration_candidates": 25,
262
+ "calibration_max_trials": 23
263
+ },
264
+ "stats": {
265
+ "mean_precision": 0.2571428571428571,
266
+ "stdev_precision": 0.43330716823151716,
267
+ "mean_recall": 0.01,
268
+ "stdev_recall": 0.022360679774997897,
269
+ "mean_f1": 0.01702127659574468,
270
+ "stdev_f1": 0.038060731531911314,
271
+ "mean_time": 0.15442829132080077,
272
+ "stdev_time": 0.051750737506044905
273
+ }
274
+ }
275
+ ]
276
+ ```
277
+ </details>
278
+
279
+ ### Inference
280
+
281
+ The high-level function `chaine.train()` returns a `Model` object. You can load an already trained model from disk by initializing a `Model` object with the model's filepath:
282
+
283
+ ```python
284
+ >>> from chaine import Model
285
+ >>> model = Model("model.chaine")
286
+ ```
287
+
288
+ You can predict labels for a batch of sequences:
289
+
290
+ ```python
291
+ >>> tokens = [
292
+ ... [{"index": 0, "text": "John"}, {"index": 1, "text": "Lennon"}],
293
+ ... [{"index": 0, "text": "Paul"}, {"index": 1, "text": "McCartney"}],
294
+ ... [{"index": 0, "text": "George"}, {"index": 1, "text": "Harrison"}],
295
+ ... [{"index": 0, "text": "Ringo"}, {"index": 1, "text": "Starr"}]
296
+ ... ]
297
+ >>> model.predict(tokens)
298
+ [['B-PER', 'I-PER'], ['B-PER', 'I-PER'], ['B-PER', 'I-PER'], ['B-PER', 'I-PER']]
299
+ ```
300
+
301
+ or only for a single sequence:
302
+
303
+ ```python
304
+ >>> model.predict_single(tokens[0])
305
+ ['B-PER', 'I-PER']
306
+ ```
307
+
308
+ If you are interested in the model's probability distribution for a given sequence, you can:
309
+
310
+ ```python
311
+ >>> model.predict_proba_single(tokens[0])
312
+ [[{'B-PER': 0.99, 'I-PER': 0.01}, {'B-PER': 0.01, 'I-PER': 0.99}]]
313
+ ```
314
+
315
+ > Use the `model.predict_proba()` method for a batch of sequences.
316
+
317
+ ### Weights
318
+
319
+ After loading a trained model, you can inspect the learned transition and state weights:
320
+
321
+ ```python
322
+ >>> model = Model("model.chaine")
323
+ >>> model.transitions
324
+ [{'from': 'B-PER', 'to': 'I-PER', 'weight': 1.430506540616852e-06}]
325
+ >>> model.states
326
+ [{'feature': 'text:John', 'label': 'B-PER', 'weight': 9.536710877105517e-07}, ...]
327
+ ```
328
+
329
+ You can also dump both transition and state weights as JSON:
330
+
331
+ ```python
332
+ >>> model.dump_states("states.json")
333
+ >>> model.dump_transitions("transitions.json")
334
+ ```
335
+
336
+ ## Credits
337
+
338
+ This project makes use of and is partially based on:
339
+
340
+ - [CRFsuite](https://github.com/chokkan/crfsuite)
341
+ - [libLBFGS](https://github.com/chokkan/liblbfgs)
342
+ - [python-crfsuite](https://github.com/scrapinghub/python-crfsuite)
343
+ - [sklearn-crfsuite](https://github.com/TeamHG-Memex/sklearn-crfsuite)
@@ -0,0 +1,50 @@
1
+ chaine/__init__.py,sha256=TzYwbQ05GnL1zGl5Y981lpn1SJZNs-9g-H1rDSsQQQI,97
2
+ chaine/crf.py,sha256=N6s6BhuQtzslHvbItP58kT-TygSkue5nmyGcgO4SEyA,18003
3
+ chaine/logging.py,sha256=ecMug4UjT-tRwjNqFNlrAaElJFUyzncbPzEz1-EKDg8,5256
4
+ chaine/training.py,sha256=l9gRS29jAAbPLvPbxrxMuJ89sPYg5B5cJJc7vl71bgU,8401
5
+ chaine/typing.py,sha256=F5YEabzjTGcWdT1oVvc7kOoQgrF9YM28IWKt7cSv7TA,393
6
+ chaine/validation.py,sha256=_kL7gELOpGAtSZk_MCAB0OP2iaMNNXOYVfnAn71u5dM,995
7
+ chaine/_core/crf.cpp,sha256=IF_wPi49qD-rqrHKjlM2Yg-ePnu49MF6JShdOFsNnBw,832756
8
+ chaine/_core/crf.cpython-314-x86_64-linux-musl.so,sha256=AU9vXVBsAjmOWL8u51RzDQfjWiBblbugo2ryBnVpECQ,2388513
9
+ chaine/_core/tagger_wrapper.hpp,sha256=Kp3-WuWptf_x1asr33PHFiuCFDKAjBE7Ux_Y-jms5-A,1312
10
+ chaine/_core/trainer_wrapper.cpp,sha256=cWch9SUpmvU6ob70witv9JGZBo3Ffk8FyYZmOSYfoyw,745
11
+ chaine/_core/trainer_wrapper.hpp,sha256=k0qxNF1Zc1jUWSl8jGsk6lEV87HvrcX368Ld0X2QpCU,573
12
+ chaine/_core/crfsuite/include/crfsuite.h,sha256=vFWIxhig8SblpqI9LtB8UbijMPKcz6fZTpi6N2kCAPU,38631
13
+ chaine/_core/crfsuite/include/crfsuite_api.hpp,sha256=bT7PenKkHZEO6C4YwH7K-531JoRBc6ZYUb8xna-WZ3w,14098
14
+ chaine/_core/crfsuite/lib/cqdb/src/cqdb.c,sha256=3FRltjneWVXB1xcO5SLB0XBD2iH8EjWoWiKCtkt5N1w,17268
15
+ chaine/_core/crfsuite/lib/cqdb/src/lookup3.c,sha256=vItwg4xz2L-ETuNdjp-ciX2Y4s7BeQq7poltS7iWQkQ,36564
16
+ chaine/_core/crfsuite/lib/crf/src/crf1d_context.c,sha256=KyKlCgG2XZQ-ve16J78C2qBPKs2KUuhBrGbBWue23f4,22618
17
+ chaine/_core/crfsuite/lib/crf/src/crf1d_encode.c,sha256=ouVuCPfrY6Q1UzJDRxcxRnIGaw6EEBxGm2i5CfvPpkI,29224
18
+ chaine/_core/crfsuite/lib/crf/src/crf1d_feature.c,sha256=NzK63VSV9iWLLcIWxYpj2US4kqhBkDm_HpGVeSJef7M,10589
19
+ chaine/_core/crfsuite/lib/crf/src/crf1d_model.c,sha256=TC-MNEnvphQYSsaRYATun5J9_Fmrn9zolfmn3xYpLHg,26694
20
+ chaine/_core/crfsuite/lib/crf/src/crf1d_tag.c,sha256=KKj_8rS9C8FMKrmi0C5I2q9p8UYvKAG2IMPJl9r3tAU,16748
21
+ chaine/_core/crfsuite/lib/crf/src/crfsuite.c,sha256=NbIIYrMAmQGoy0-vPmUZRiLJUN0yVRSRqbH2iCbYN6Y,14080
22
+ chaine/_core/crfsuite/lib/crf/src/crfsuite_train.c,sha256=uqlp55WD4ke60j4V52f8nMtHEFzqTvx5Wougffad2kk,8549
23
+ chaine/_core/crfsuite/lib/crf/src/dataset.c,sha256=Mtq3NmVjtGaCMGW3akiDOEOw8Z-JqYkZqykTRsDUbbA,3254
24
+ chaine/_core/crfsuite/lib/crf/src/dictionary.c,sha256=GlCNRabrGVZdh-SA6btiblt7ecDefy88QtHbdFBXW1M,3878
25
+ chaine/_core/crfsuite/lib/crf/src/holdout.c,sha256=60rLL5VCzH7ULgR_ZhtjmEkj8Zc_YGSayzCRrc5J8hE,2865
26
+ chaine/_core/crfsuite/lib/crf/src/json.c,sha256=x6x-POaCJTHeVHoOGo_-3C_uUHDW2neHVVZnGyGcVMs,29211
27
+ chaine/_core/crfsuite/lib/crf/src/logging.c,sha256=m4s_RGdtRttzx6UhTVxOtK7Aq3c3XCG0SK4efF6i7J4,2640
28
+ chaine/_core/crfsuite/lib/crf/src/params.c,sha256=eCrpuNmYSKQBE0qZkUaO4e1ZAqBo7hVHP4uUH0c9dlM,10019
29
+ chaine/_core/crfsuite/lib/crf/src/quark.c,sha256=HIwp_nWyUkEvalvtxDTLTD2ivzyRTEJcs2SQTbr_7J4,4842
30
+ chaine/_core/crfsuite/lib/crf/src/rumavl.c,sha256=oJwD9UETztAWh24F336WQ1RYZu_1rg818MEfpHWk68E,33987
31
+ chaine/_core/crfsuite/lib/crf/src/train_arow.c,sha256=p5Qf33ugcmee-z-diCcqSI9bWu4u445RvLxrG9NVejs,10858
32
+ chaine/_core/crfsuite/lib/crf/src/train_averaged_perceptron.c,sha256=sAzgkJuOj9lv_3hSpu_lwIBkGM3dM8XHg04l4z87Xgw,6704
33
+ chaine/_core/crfsuite/lib/crf/src/train_l2sgd.c,sha256=RVuMLL-mp8llsZAiKS0yXvOdLkusux4wcGkxQCrl0aw,13947
34
+ chaine/_core/crfsuite/lib/crf/src/train_lbfgs.c,sha256=faXl6eX5iVhuXOAvpuwBKxMkRwN_bE-96SQwWJGRtCQ,9447
35
+ chaine/_core/crfsuite/lib/crf/src/train_passive_aggressive.c,sha256=8e17ZrO3AwiSVuDtKmag_0piGIzNwMRUImv5ogMJUqM,11565
36
+ chaine/_core/crfsuite/swig/crfsuite.cpp,sha256=mlIpS4rUgAqoC92rmnY62H-eOOcvIeyGeelpZlfXfOM,24
37
+ chaine/_core/liblbfgs/lib/lbfgs.c,sha256=NXJaMFR3msc-fvZGshul3urGNhCcMrmVtupF5fZFo34,43705
38
+ chaine/optimization/__init__.py,sha256=VO4uQB6WjE3YSQyYkjIT_MvyifxoEGcwVMqj86Sbq_o,257
39
+ chaine/optimization/metrics.py,sha256=m1csn5S5TrKYpvj6xGMlgIqoWqN9U-YTNwS5oXka4VQ,3716
40
+ chaine/optimization/spaces.py,sha256=V9B2YtO4borD6UgC55_vuchxrM0E5gVqiW_Ky5g1WaM,17797
41
+ chaine/optimization/trial.py,sha256=Hcee5DfOp91oQt7VKHPw80XQpuIWfykmFCBXr1-UONY,3566
42
+ chaine/optimization/utils.py,sha256=2IdNfH0amKND6XQvwTCH3iMz3GPRVeCrrKkJ4cJklRo,3297
43
+ chaine.libs/libgcc_s-0cd532bd.so.1,sha256=yPk0-VjyKzucjnkP3mvC0vVaua6Ln17qZUJbICcXgtA,181737
44
+ chaine.libs/libstdc++-5d72f927.so.6.0.33,sha256=fogxHsmB1_D6C-a_-uHh8Ei_6Qh52a8vLlicJRM3ehk,3562401
45
+ chaine-4.0.0b2.dist-info/METADATA,sha256=Ky6kQ7QRB0dp-luVG0NHZUtBzC80_rnC_HafuQ4SUws,12246
46
+ chaine-4.0.0b2.dist-info/WHEEL,sha256=K2_TehZnioJBDmm5baDVfhoCxaclJzJsPrng3hg7WD0,112
47
+ chaine-4.0.0b2.dist-info/top_level.txt,sha256=E8ELV_w3Ec0xoSIoOxUHdz_lNB5rXl4rsnRRTWe5Rto,7
48
+ chaine-4.0.0b2.dist-info/RECORD,,
49
+ chaine-4.0.0b2.dist-info/licenses/LICENSE,sha256=6SMANnM0AQzMghahD-3tb0_seUOghEikuXKu_XYbDq0,1136
50
+ chaine-4.0.0b2.dist-info/sboms/auditwheel.cdx.json,sha256=72lG00XC8mVeUyAmR6RKhP46FhD308u4cc8bXVKznTI,1690
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: false
4
+ Tag: cp314-cp314-musllinux_1_2_x86_64
5
+
@@ -0,0 +1,22 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2014-2017 ScrapingHub Inc. and contributors.
4
+ Copyright (c) 2020-2025 Severin Simmler
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ {"bomFormat": "CycloneDX", "specVersion": "1.4", "version": 1, "metadata": {"component": {"type": "library", "bom-ref": "pkg:pypi/chaine@4.0.0b2?file_name=chaine-4.0.0b2-cp314-cp314-musllinux_1_2_x86_64.whl", "name": "chaine", "version": "4.0.0b2", "purl": "pkg:pypi/chaine@4.0.0b2?file_name=chaine-4.0.0b2-cp314-cp314-musllinux_1_2_x86_64.whl"}, "tools": [{"name": "auditwheel", "version": "6.6.0"}]}, "components": [{"type": "library", "bom-ref": "pkg:pypi/chaine@4.0.0b2?file_name=chaine-4.0.0b2-cp314-cp314-musllinux_1_2_x86_64.whl", "name": "chaine", "version": "4.0.0b2", "purl": "pkg:pypi/chaine@4.0.0b2?file_name=chaine-4.0.0b2-cp314-cp314-musllinux_1_2_x86_64.whl"}, {"type": "library", "bom-ref": "pkg:apk/alpine/libgcc@14.2.0-r6#933a623c9e323e83b1734e630a342f206999adc096732a3fea9896fa0181ea29", "name": "libgcc", "version": "14.2.0-r6", "purl": "pkg:apk/alpine/libgcc@14.2.0-r6"}, {"type": "library", "bom-ref": "pkg:apk/alpine/libstdc%2B%2B@14.2.0-r6#76f023cbc3d7b369d6008f354e88aa983d13e4bd8f06e4e49a01686039fe1509", "name": "libstdc++", "version": "14.2.0-r6", "purl": "pkg:apk/alpine/libstdc%2B%2B@14.2.0-r6"}], "dependencies": [{"ref": "pkg:pypi/chaine@4.0.0b2?file_name=chaine-4.0.0b2-cp314-cp314-musllinux_1_2_x86_64.whl", "dependsOn": ["pkg:apk/alpine/libgcc@14.2.0-r6#933a623c9e323e83b1734e630a342f206999adc096732a3fea9896fa0181ea29", "pkg:apk/alpine/libstdc%2B%2B@14.2.0-r6#76f023cbc3d7b369d6008f354e88aa983d13e4bd8f06e4e49a01686039fe1509"]}, {"ref": "pkg:apk/alpine/libgcc@14.2.0-r6#933a623c9e323e83b1734e630a342f206999adc096732a3fea9896fa0181ea29"}, {"ref": "pkg:apk/alpine/libstdc%2B%2B@14.2.0-r6#76f023cbc3d7b369d6008f354e88aa983d13e4bd8f06e4e49a01686039fe1509"}]}