@mailwoman/neural-weights-fr-fr 2.0.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -13
- package/model-card.json +102 -50
- package/model.onnx +0 -0
- package/package.json +18 -18
- package/tokenizer.model +0 -0
package/README.md
CHANGED
|
@@ -1,30 +1,33 @@
|
|
|
1
1
|
# @mailwoman/neural-weights-fr-fr
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Stage 2 (coarse + venue/street/house_number) Mailwoman neural-classifier weights.
|
|
4
4
|
|
|
5
5
|
- locale: **fr-fr**
|
|
6
|
-
- corpus: **0.
|
|
7
|
-
- training steps: **
|
|
6
|
+
- corpus: **0.3.0**
|
|
7
|
+
- training steps: **2200**
|
|
8
8
|
- hardware: **AMD Radeon 780M (gfx1103) bf16 ~14.6 GiB GTT**
|
|
9
9
|
|
|
10
|
-
##
|
|
10
|
+
## Per-component F1 targets
|
|
11
11
|
|
|
12
|
-
**⚠ Below
|
|
12
|
+
**⚠ Below per-component F1 targets:**
|
|
13
13
|
|
|
14
|
-
- `country` F1 = **0.
|
|
15
|
-
- `region` F1 = **0.
|
|
16
|
-
- `locality` F1 = **0.
|
|
17
|
-
- `postcode` F1 = **0.
|
|
14
|
+
- `country` F1 = **0.2112** (target ≥0.95)
|
|
15
|
+
- `region` F1 = **0.1883** (target ≥0.95)
|
|
16
|
+
- `locality` F1 = **0.2736** (target ≥0.95)
|
|
17
|
+
- `postcode` F1 = **0.6916** (target ≥0.95)
|
|
18
|
+
- `venue` F1 = **0.3886** (target ≥0.60)
|
|
19
|
+
- `street` F1 = **0.3016** (target ≥0.70)
|
|
20
|
+
- `house_number` F1 = **0.7866** (target ≥0.80)
|
|
18
21
|
|
|
19
22
|
## Eval (golden set)
|
|
20
23
|
|
|
21
|
-
- entries: **
|
|
22
|
-
- full-parse exact match: **0.
|
|
23
|
-
- mean token confidence: **0.
|
|
24
|
+
- entries: **4535**
|
|
25
|
+
- full-parse exact match: **0.0818**
|
|
26
|
+
- mean token confidence: **0.8063**
|
|
24
27
|
|
|
25
28
|
## Components supported
|
|
26
29
|
|
|
27
|
-
Stage
|
|
30
|
+
Stage 2 ships coarse (country / region / locality / dependent_locality / postcode / subregion / cedex) plus fine-grained venue / street / house_number. Token classifier emits 21 BIO labels.
|
|
28
31
|
|
|
29
32
|
## Files
|
|
30
33
|
|
package/model-card.json
CHANGED
|
@@ -1,53 +1,87 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "neural-weights-fr-fr",
|
|
3
|
-
"version": "0.
|
|
4
|
-
"phase": "Stage
|
|
3
|
+
"version": "0.4.0",
|
|
4
|
+
"phase": "Stage 2 (coarse + venue/street/house_number)",
|
|
5
5
|
"license": "AGPL-3.0-only",
|
|
6
6
|
"locale": "fr-fr",
|
|
7
7
|
"training": {
|
|
8
|
-
"corpus_version": "0.
|
|
8
|
+
"corpus_version": "0.3.0",
|
|
9
9
|
"tokenizer_version": "0.1.0",
|
|
10
|
-
"steps":
|
|
10
|
+
"steps": 2200,
|
|
11
11
|
"hardware": "AMD Radeon 780M (gfx1103) bf16 ~14.6 GiB GTT",
|
|
12
|
-
"duration_seconds":
|
|
12
|
+
"duration_seconds": 1146.0,
|
|
13
13
|
"started_at": null,
|
|
14
|
-
"completed_at": "2026-05-
|
|
14
|
+
"completed_at": "2026-05-23T06:21:51.190078Z"
|
|
15
15
|
},
|
|
16
|
-
"components_supported": [
|
|
16
|
+
"components_supported": [
|
|
17
|
+
"country",
|
|
18
|
+
"region",
|
|
19
|
+
"locality",
|
|
20
|
+
"dependent_locality",
|
|
21
|
+
"postcode",
|
|
22
|
+
"subregion",
|
|
23
|
+
"cedex",
|
|
24
|
+
"venue",
|
|
25
|
+
"street",
|
|
26
|
+
"house_number"
|
|
27
|
+
],
|
|
28
|
+
"labels": [
|
|
29
|
+
"O",
|
|
30
|
+
"B-country",
|
|
31
|
+
"I-country",
|
|
32
|
+
"B-region",
|
|
33
|
+
"I-region",
|
|
34
|
+
"B-locality",
|
|
35
|
+
"I-locality",
|
|
36
|
+
"B-dependent_locality",
|
|
37
|
+
"I-dependent_locality",
|
|
38
|
+
"B-postcode",
|
|
39
|
+
"I-postcode",
|
|
40
|
+
"B-subregion",
|
|
41
|
+
"I-subregion",
|
|
42
|
+
"B-cedex",
|
|
43
|
+
"I-cedex",
|
|
44
|
+
"B-venue",
|
|
45
|
+
"I-venue",
|
|
46
|
+
"B-street",
|
|
47
|
+
"I-street",
|
|
48
|
+
"B-house_number",
|
|
49
|
+
"I-house_number"
|
|
50
|
+
],
|
|
17
51
|
"eval": {
|
|
18
|
-
"n_entries":
|
|
19
|
-
"full_parse_exact_match": 0.
|
|
20
|
-
"mean_token_confidence": 0.
|
|
52
|
+
"n_entries": 4535,
|
|
53
|
+
"full_parse_exact_match": 0.08180815876515987,
|
|
54
|
+
"mean_token_confidence": 0.8062812768727202,
|
|
21
55
|
"per_component": {
|
|
22
56
|
"country": {
|
|
23
|
-
"precision": 0.
|
|
24
|
-
"recall": 0.
|
|
25
|
-
"f1": 0.
|
|
26
|
-
"support":
|
|
57
|
+
"precision": 0.21428571428481394,
|
|
58
|
+
"recall": 0.20816326530527282,
|
|
59
|
+
"f1": 0.21118012372283307,
|
|
60
|
+
"support": 245
|
|
27
61
|
},
|
|
28
62
|
"region": {
|
|
29
|
-
"precision": 0.
|
|
30
|
-
"recall": 0.
|
|
31
|
-
"f1": 0.
|
|
32
|
-
"support":
|
|
63
|
+
"precision": 0.342951360263526,
|
|
64
|
+
"recall": 0.129797191887635,
|
|
65
|
+
"f1": 0.18832050661831204,
|
|
66
|
+
"support": 3205
|
|
33
67
|
},
|
|
34
68
|
"locality": {
|
|
35
|
-
"precision": 0.
|
|
36
|
-
"recall": 0.
|
|
37
|
-
"f1": 0.
|
|
38
|
-
"support":
|
|
69
|
+
"precision": 0.24782398452605223,
|
|
70
|
+
"recall": 0.30533214179317686,
|
|
71
|
+
"f1": 0.2735886822759171,
|
|
72
|
+
"support": 3357
|
|
39
73
|
},
|
|
40
74
|
"dependent_locality": {
|
|
41
|
-
"precision": 0.
|
|
42
|
-
"recall": 0.
|
|
43
|
-
"f1": 0.
|
|
44
|
-
"support":
|
|
75
|
+
"precision": 0.005044136191670815,
|
|
76
|
+
"recall": 0.0999999999975,
|
|
77
|
+
"f1": 0.009603841445164863,
|
|
78
|
+
"support": 40
|
|
45
79
|
},
|
|
46
80
|
"postcode": {
|
|
47
|
-
"precision": 0.
|
|
48
|
-
"recall": 0.
|
|
49
|
-
"f1": 0.
|
|
50
|
-
"support":
|
|
81
|
+
"precision": 0.8323890462696731,
|
|
82
|
+
"recall": 0.591610738254835,
|
|
83
|
+
"f1": 0.6916437813892687,
|
|
84
|
+
"support": 2980
|
|
51
85
|
},
|
|
52
86
|
"subregion": {
|
|
53
87
|
"precision": 0.0,
|
|
@@ -60,6 +94,24 @@
|
|
|
60
94
|
"recall": 0.0,
|
|
61
95
|
"f1": 0.0,
|
|
62
96
|
"support": 1
|
|
97
|
+
},
|
|
98
|
+
"venue": {
|
|
99
|
+
"precision": 0.37649063032335905,
|
|
100
|
+
"recall": 0.4014532243411431,
|
|
101
|
+
"f1": 0.38857142807160183,
|
|
102
|
+
"support": 1101
|
|
103
|
+
},
|
|
104
|
+
"street": {
|
|
105
|
+
"precision": 0.35594795539016916,
|
|
106
|
+
"recall": 0.26161202185783416,
|
|
107
|
+
"f1": 0.3015748026611547,
|
|
108
|
+
"support": 2928
|
|
109
|
+
},
|
|
110
|
+
"house_number": {
|
|
111
|
+
"precision": 0.7446153846150028,
|
|
112
|
+
"recall": 0.8335246842704744,
|
|
113
|
+
"f1": 0.7865655466300883,
|
|
114
|
+
"support": 1742
|
|
63
115
|
}
|
|
64
116
|
},
|
|
65
117
|
"calibration": [
|
|
@@ -72,56 +124,56 @@
|
|
|
72
124
|
{
|
|
73
125
|
"low": 0.1,
|
|
74
126
|
"high": 0.2,
|
|
75
|
-
"n":
|
|
76
|
-
"acc": 0.
|
|
127
|
+
"n": 21,
|
|
128
|
+
"acc": 0.23809523809523808
|
|
77
129
|
},
|
|
78
130
|
{
|
|
79
131
|
"low": 0.2,
|
|
80
132
|
"high": 0.3,
|
|
81
|
-
"n":
|
|
82
|
-
"acc": 0.
|
|
133
|
+
"n": 666,
|
|
134
|
+
"acc": 0.2912912912912913
|
|
83
135
|
},
|
|
84
136
|
{
|
|
85
137
|
"low": 0.3,
|
|
86
138
|
"high": 0.4,
|
|
87
|
-
"n":
|
|
88
|
-
"acc": 0.
|
|
139
|
+
"n": 2416,
|
|
140
|
+
"acc": 0.3265728476821192
|
|
89
141
|
},
|
|
90
142
|
{
|
|
91
143
|
"low": 0.4,
|
|
92
144
|
"high": 0.5,
|
|
93
|
-
"n":
|
|
94
|
-
"acc": 0.
|
|
145
|
+
"n": 4308,
|
|
146
|
+
"acc": 0.3152274837511606
|
|
95
147
|
},
|
|
96
148
|
{
|
|
97
149
|
"low": 0.5,
|
|
98
150
|
"high": 0.6,
|
|
99
|
-
"n":
|
|
100
|
-
"acc": 0.
|
|
151
|
+
"n": 4777,
|
|
152
|
+
"acc": 0.3382876282185472
|
|
101
153
|
},
|
|
102
154
|
{
|
|
103
155
|
"low": 0.6,
|
|
104
156
|
"high": 0.7,
|
|
105
|
-
"n":
|
|
106
|
-
"acc": 0.
|
|
157
|
+
"n": 4943,
|
|
158
|
+
"acc": 0.3568682986040866
|
|
107
159
|
},
|
|
108
160
|
{
|
|
109
161
|
"low": 0.7,
|
|
110
162
|
"high": 0.8,
|
|
111
|
-
"n":
|
|
112
|
-
"acc": 0.
|
|
163
|
+
"n": 5534,
|
|
164
|
+
"acc": 0.382544271774485
|
|
113
165
|
},
|
|
114
166
|
{
|
|
115
167
|
"low": 0.8,
|
|
116
168
|
"high": 0.9,
|
|
117
|
-
"n":
|
|
118
|
-
"acc": 0.
|
|
169
|
+
"n": 8066,
|
|
170
|
+
"acc": 0.43627572526655095
|
|
119
171
|
},
|
|
120
172
|
{
|
|
121
173
|
"low": 0.9,
|
|
122
174
|
"high": 1.0,
|
|
123
|
-
"n":
|
|
124
|
-
"acc": 0.
|
|
175
|
+
"n": 30517,
|
|
176
|
+
"acc": 0.6440344725890488
|
|
125
177
|
}
|
|
126
178
|
]
|
|
127
179
|
},
|
|
@@ -130,7 +182,7 @@
|
|
|
130
182
|
"particle-honorific kryptonite (e.g. FR 'Saint-Just-Saint-Rambert') if not in synth set",
|
|
131
183
|
"non-Latin scripts (CJK, Cyrillic) fall through to byte-fallback tokens; F1 unknown"
|
|
132
184
|
],
|
|
133
|
-
"notes": "
|
|
185
|
+
"notes": "v0.4.0 \u2014 issue #116. Same encoder geometry as v0.3.0 (8.87M params, 6L/256H/4-heads, 21 BIO labels, linear-chain CRF). Issue proposed (1) per-token CRF NLL normalization + (3) class-weighted CE biased toward coarse labels + (4) source-weight rebalance. Empirical iteration found that \u00a71 and \u00a73 destabilize sustained training at every LR tested (5e-4, 3e-4, 1.5e-4 \u2014 even the v0.3.0-safe LR), and on the golden v0.1.2 eval (4535 entries) they slightly REGRESS country/postcode F1 vs v0.3.0. SHIPPED recipe is the \u00a74-only ablation (v0.3.0 dual-loss + v0.4.0 source-weight rebalance) at lr=1.5e-4, step 2200. Modest fine-label gains (street +0.03, house_number +0.01), modest coarse-F1 regression (country -0.07, postcode -0.07). \u00a71/\u00a73 deferred to v0.4.1 corpus-side investigation per issue's '2K divergence' clause. Full ablation matrix retrospective in LOG.md.",
|
|
134
186
|
"format": {
|
|
135
187
|
"model": "ONNX int8 dynamic",
|
|
136
188
|
"tokenizer": "SentencePiece unigram, byte_fallback=true, vocab_size=16000",
|
|
@@ -142,5 +194,5 @@
|
|
|
142
194
|
"tokenizer": "tokenizer.model",
|
|
143
195
|
"model_card": "model-card.json"
|
|
144
196
|
},
|
|
145
|
-
"base_relpath": "/data/models/checkpoints/
|
|
197
|
+
"base_relpath": "/data/models/checkpoints/v0_4_0-stableLR-source-only/step-002200"
|
|
146
198
|
}
|
package/model.onnx
CHANGED
|
Binary file
|
package/package.json
CHANGED
|
@@ -1,19 +1,19 @@
|
|
|
1
1
|
{
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
}
|
|
2
|
+
"name": "@mailwoman/neural-weights-fr-fr",
|
|
3
|
+
"version": "2.2.0",
|
|
4
|
+
"description": "Mailwoman neural-classifier weights for locale 'fr-fr'. Data-only package — loaded by @mailwoman/neural at runtime.",
|
|
5
|
+
"license": "AGPL-3.0-only",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "https://github.com/sister-software/mailwoman"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"model.onnx",
|
|
12
|
+
"tokenizer.model",
|
|
13
|
+
"model-card.json",
|
|
14
|
+
"README.md"
|
|
15
|
+
],
|
|
16
|
+
"publishConfig": {
|
|
17
|
+
"access": "public"
|
|
18
|
+
}
|
|
19
|
+
}
|
package/tokenizer.model
CHANGED
|
Binary file
|