@mailwoman/neural-weights-fr-fr 3.0.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -10
- package/model-card.json +74 -50
- package/model.onnx +0 -0
- package/package.json +18 -19
- package/tokenizer.model +0 -0
package/README.md
CHANGED
|
@@ -4,26 +4,26 @@ Stage 2 (coarse + venue/street/house_number) Mailwoman neural-classifier weights
|
|
|
4
4
|
|
|
5
5
|
- locale: **fr-fr**
|
|
6
6
|
- corpus: **0.3.0**
|
|
7
|
-
- training steps: **
|
|
7
|
+
- training steps: **2200**
|
|
8
8
|
- hardware: **AMD Radeon 780M (gfx1103) bf16 ~14.6 GiB GTT**
|
|
9
9
|
|
|
10
10
|
## Per-component F1 targets
|
|
11
11
|
|
|
12
12
|
**⚠ Below per-component F1 targets:**
|
|
13
13
|
|
|
14
|
-
- `country` F1 = **0.
|
|
15
|
-
- `region` F1 = **0.
|
|
16
|
-
- `locality` F1 = **0.
|
|
17
|
-
- `postcode` F1 = **0.
|
|
18
|
-
- `venue` F1 = **0.
|
|
19
|
-
- `street` F1 = **0.
|
|
20
|
-
- `house_number` F1 = **0.
|
|
14
|
+
- `country` F1 = **0.2112** (target ≥0.95)
|
|
15
|
+
- `region` F1 = **0.1883** (target ≥0.95)
|
|
16
|
+
- `locality` F1 = **0.2736** (target ≥0.95)
|
|
17
|
+
- `postcode` F1 = **0.6916** (target ≥0.95)
|
|
18
|
+
- `venue` F1 = **0.3886** (target ≥0.60)
|
|
19
|
+
- `street` F1 = **0.3016** (target ≥0.70)
|
|
20
|
+
- `house_number` F1 = **0.7866** (target ≥0.80)
|
|
21
21
|
|
|
22
22
|
## Eval (golden set)
|
|
23
23
|
|
|
24
24
|
- entries: **4535**
|
|
25
|
-
- full-parse exact match: **0.
|
|
26
|
-
- mean token confidence: **0.
|
|
25
|
+
- full-parse exact match: **0.0818**
|
|
26
|
+
- mean token confidence: **0.8063**
|
|
27
27
|
|
|
28
28
|
## Components supported
|
|
29
29
|
|
package/model-card.json
CHANGED
|
@@ -1,17 +1,18 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "neural-weights-fr-fr",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "4.0.0",
|
|
4
|
+
"model_lineage": "shares the en-us Stage 3 / step 100000 model (formerly v0.6.0) — relabeled to the unified 4.0.0 release version; tokenizer 0.6.0-a0",
|
|
4
5
|
"phase": "Stage 2 (coarse + venue/street/house_number)",
|
|
5
6
|
"license": "AGPL-3.0-only",
|
|
6
7
|
"locale": "fr-fr",
|
|
7
8
|
"training": {
|
|
8
9
|
"corpus_version": "0.3.0",
|
|
9
10
|
"tokenizer_version": "0.1.0",
|
|
10
|
-
"steps":
|
|
11
|
+
"steps": 2200,
|
|
11
12
|
"hardware": "AMD Radeon 780M (gfx1103) bf16 ~14.6 GiB GTT",
|
|
12
|
-
"duration_seconds":
|
|
13
|
+
"duration_seconds": 1146.0,
|
|
13
14
|
"started_at": null,
|
|
14
|
-
"completed_at": "2026-05-
|
|
15
|
+
"completed_at": "2026-05-23T06:21:51.190078Z"
|
|
15
16
|
},
|
|
16
17
|
"components_supported": [
|
|
17
18
|
"country",
|
|
@@ -25,39 +26,62 @@
|
|
|
25
26
|
"street",
|
|
26
27
|
"house_number"
|
|
27
28
|
],
|
|
29
|
+
"labels": [
|
|
30
|
+
"O",
|
|
31
|
+
"B-country",
|
|
32
|
+
"I-country",
|
|
33
|
+
"B-region",
|
|
34
|
+
"I-region",
|
|
35
|
+
"B-locality",
|
|
36
|
+
"I-locality",
|
|
37
|
+
"B-dependent_locality",
|
|
38
|
+
"I-dependent_locality",
|
|
39
|
+
"B-postcode",
|
|
40
|
+
"I-postcode",
|
|
41
|
+
"B-subregion",
|
|
42
|
+
"I-subregion",
|
|
43
|
+
"B-cedex",
|
|
44
|
+
"I-cedex",
|
|
45
|
+
"B-venue",
|
|
46
|
+
"I-venue",
|
|
47
|
+
"B-street",
|
|
48
|
+
"I-street",
|
|
49
|
+
"B-house_number",
|
|
50
|
+
"I-house_number"
|
|
51
|
+
],
|
|
28
52
|
"eval": {
|
|
29
53
|
"n_entries": 4535,
|
|
30
|
-
"full_parse_exact_match": 0.
|
|
31
|
-
"mean_token_confidence": 0.
|
|
54
|
+
"full_parse_exact_match": 0.08180815876515987,
|
|
55
|
+
"mean_token_confidence": 0.8062812768727202,
|
|
32
56
|
"per_component": {
|
|
33
57
|
"country": {
|
|
34
|
-
"precision": 0.
|
|
35
|
-
"recall": 0.
|
|
36
|
-
"f1": 0.
|
|
58
|
+
"precision": 0.21428571428481394,
|
|
59
|
+
"recall": 0.20816326530527282,
|
|
60
|
+
"f1": 0.21118012372283307,
|
|
37
61
|
"support": 245
|
|
38
62
|
},
|
|
39
63
|
"region": {
|
|
40
|
-
"precision": 0.
|
|
41
|
-
"recall": 0.
|
|
42
|
-
"f1": 0.
|
|
64
|
+
"precision": 0.342951360263526,
|
|
65
|
+
"recall": 0.129797191887635,
|
|
66
|
+
"f1": 0.18832050661831204,
|
|
43
67
|
"support": 3205
|
|
44
68
|
},
|
|
45
69
|
"locality": {
|
|
46
|
-
"precision": 0.
|
|
47
|
-
"recall": 0.
|
|
48
|
-
"f1": 0.
|
|
70
|
+
"precision": 0.24782398452605223,
|
|
71
|
+
"recall": 0.30533214179317686,
|
|
72
|
+
"f1": 0.2735886822759171,
|
|
49
73
|
"support": 3357
|
|
50
74
|
},
|
|
51
75
|
"dependent_locality": {
|
|
52
|
-
"precision": 0.
|
|
53
|
-
"recall": 0.
|
|
54
|
-
"f1": 0.
|
|
76
|
+
"precision": 0.005044136191670815,
|
|
77
|
+
"recall": 0.0999999999975,
|
|
78
|
+
"f1": 0.009603841445164863,
|
|
55
79
|
"support": 40
|
|
56
80
|
},
|
|
57
81
|
"postcode": {
|
|
58
|
-
"precision": 0.
|
|
59
|
-
"recall": 0.
|
|
60
|
-
"f1": 0.
|
|
82
|
+
"precision": 0.8323890462696731,
|
|
83
|
+
"recall": 0.591610738254835,
|
|
84
|
+
"f1": 0.6916437813892687,
|
|
61
85
|
"support": 2980
|
|
62
86
|
},
|
|
63
87
|
"subregion": {
|
|
@@ -73,21 +97,21 @@
|
|
|
73
97
|
"support": 1
|
|
74
98
|
},
|
|
75
99
|
"venue": {
|
|
76
|
-
"precision": 0.
|
|
77
|
-
"recall": 0.
|
|
78
|
-
"f1": 0.
|
|
100
|
+
"precision": 0.37649063032335905,
|
|
101
|
+
"recall": 0.4014532243411431,
|
|
102
|
+
"f1": 0.38857142807160183,
|
|
79
103
|
"support": 1101
|
|
80
104
|
},
|
|
81
105
|
"street": {
|
|
82
|
-
"precision": 0.
|
|
83
|
-
"recall": 0.
|
|
84
|
-
"f1": 0.
|
|
106
|
+
"precision": 0.35594795539016916,
|
|
107
|
+
"recall": 0.26161202185783416,
|
|
108
|
+
"f1": 0.3015748026611547,
|
|
85
109
|
"support": 2928
|
|
86
110
|
},
|
|
87
111
|
"house_number": {
|
|
88
|
-
"precision": 0.
|
|
89
|
-
"recall": 0.
|
|
90
|
-
"f1": 0.
|
|
112
|
+
"precision": 0.7446153846150028,
|
|
113
|
+
"recall": 0.8335246842704744,
|
|
114
|
+
"f1": 0.7865655466300883,
|
|
91
115
|
"support": 1742
|
|
92
116
|
}
|
|
93
117
|
},
|
|
@@ -101,56 +125,56 @@
|
|
|
101
125
|
{
|
|
102
126
|
"low": 0.1,
|
|
103
127
|
"high": 0.2,
|
|
104
|
-
"n":
|
|
105
|
-
"acc": 0.
|
|
128
|
+
"n": 21,
|
|
129
|
+
"acc": 0.23809523809523808
|
|
106
130
|
},
|
|
107
131
|
{
|
|
108
132
|
"low": 0.2,
|
|
109
133
|
"high": 0.3,
|
|
110
|
-
"n":
|
|
111
|
-
"acc": 0.
|
|
134
|
+
"n": 666,
|
|
135
|
+
"acc": 0.2912912912912913
|
|
112
136
|
},
|
|
113
137
|
{
|
|
114
138
|
"low": 0.3,
|
|
115
139
|
"high": 0.4,
|
|
116
|
-
"n":
|
|
117
|
-
"acc": 0.
|
|
140
|
+
"n": 2416,
|
|
141
|
+
"acc": 0.3265728476821192
|
|
118
142
|
},
|
|
119
143
|
{
|
|
120
144
|
"low": 0.4,
|
|
121
145
|
"high": 0.5,
|
|
122
|
-
"n":
|
|
123
|
-
"acc": 0.
|
|
146
|
+
"n": 4308,
|
|
147
|
+
"acc": 0.3152274837511606
|
|
124
148
|
},
|
|
125
149
|
{
|
|
126
150
|
"low": 0.5,
|
|
127
151
|
"high": 0.6,
|
|
128
|
-
"n":
|
|
129
|
-
"acc": 0.
|
|
152
|
+
"n": 4777,
|
|
153
|
+
"acc": 0.3382876282185472
|
|
130
154
|
},
|
|
131
155
|
{
|
|
132
156
|
"low": 0.6,
|
|
133
157
|
"high": 0.7,
|
|
134
|
-
"n":
|
|
135
|
-
"acc": 0.
|
|
158
|
+
"n": 4943,
|
|
159
|
+
"acc": 0.3568682986040866
|
|
136
160
|
},
|
|
137
161
|
{
|
|
138
162
|
"low": 0.7,
|
|
139
163
|
"high": 0.8,
|
|
140
|
-
"n":
|
|
141
|
-
"acc": 0.
|
|
164
|
+
"n": 5534,
|
|
165
|
+
"acc": 0.382544271774485
|
|
142
166
|
},
|
|
143
167
|
{
|
|
144
168
|
"low": 0.8,
|
|
145
169
|
"high": 0.9,
|
|
146
|
-
"n":
|
|
147
|
-
"acc": 0.
|
|
170
|
+
"n": 8066,
|
|
171
|
+
"acc": 0.43627572526655095
|
|
148
172
|
},
|
|
149
173
|
{
|
|
150
174
|
"low": 0.9,
|
|
151
175
|
"high": 1.0,
|
|
152
|
-
"n":
|
|
153
|
-
"acc": 0.
|
|
176
|
+
"n": 30517,
|
|
177
|
+
"acc": 0.6440344725890488
|
|
154
178
|
}
|
|
155
179
|
]
|
|
156
180
|
},
|
|
@@ -159,7 +183,7 @@
|
|
|
159
183
|
"particle-honorific kryptonite (e.g. FR 'Saint-Just-Saint-Rambert') if not in synth set",
|
|
160
184
|
"non-Latin scripts (CJK, Cyrillic) fall through to byte-fallback tokens; F1 unknown"
|
|
161
185
|
],
|
|
162
|
-
"notes": "
|
|
186
|
+
"notes": "v0.4.0 \u2014 issue #116. Same encoder geometry as v0.3.0 (8.87M params, 6L/256H/4-heads, 21 BIO labels, linear-chain CRF). Issue proposed (1) per-token CRF NLL normalization + (3) class-weighted CE biased toward coarse labels + (4) source-weight rebalance. Empirical iteration found that \u00a71 and \u00a73 destabilize sustained training at every LR tested (5e-4, 3e-4, 1.5e-4 \u2014 even the v0.3.0-safe LR), and on the golden v0.1.2 eval (4535 entries) they slightly REGRESS country/postcode F1 vs v0.3.0. SHIPPED recipe is the \u00a74-only ablation (v0.3.0 dual-loss + v0.4.0 source-weight rebalance) at lr=1.5e-4, step 2200. Modest fine-label gains (street +0.03, house_number +0.01), modest coarse-F1 regression (country -0.07, postcode -0.07). \u00a71/\u00a73 deferred to v0.4.1 corpus-side investigation per issue's '2K divergence' clause. Full ablation matrix retrospective in LOG.md.",
|
|
163
187
|
"format": {
|
|
164
188
|
"model": "ONNX int8 dynamic",
|
|
165
189
|
"tokenizer": "SentencePiece unigram, byte_fallback=true, vocab_size=16000",
|
|
@@ -171,5 +195,5 @@
|
|
|
171
195
|
"tokenizer": "tokenizer.model",
|
|
172
196
|
"model_card": "model-card.json"
|
|
173
197
|
},
|
|
174
|
-
"base_relpath": "/data/models/checkpoints/
|
|
198
|
+
"base_relpath": "/data/models/checkpoints/v0_4_0-stableLR-source-only/step-002200"
|
|
175
199
|
}
|
package/model.onnx
CHANGED
|
Binary file
|
package/package.json
CHANGED
|
@@ -1,20 +1,19 @@
|
|
|
1
1
|
{
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
}
|
|
2
|
+
"name": "@mailwoman/neural-weights-fr-fr",
|
|
3
|
+
"version": "4.0.0",
|
|
4
|
+
"description": "Mailwoman neural-classifier weights for locale 'fr-fr'. Data-only package — loaded by @mailwoman/neural at runtime.",
|
|
5
|
+
"license": "AGPL-3.0-only",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "https://github.com/sister-software/mailwoman"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"model.onnx",
|
|
12
|
+
"tokenizer.model",
|
|
13
|
+
"model-card.json",
|
|
14
|
+
"README.md"
|
|
15
|
+
],
|
|
16
|
+
"publishConfig": {
|
|
17
|
+
"access": "public"
|
|
18
|
+
}
|
|
19
|
+
}
|
package/tokenizer.model
CHANGED
|
Binary file
|