@mailwoman/neural-weights-en-us 3.0.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -10
- package/model-card.json +71 -149
- package/model.onnx +0 -0
- package/package.json +18 -19
- package/tokenizer.model +0 -0
package/README.md
CHANGED
|
@@ -4,26 +4,26 @@ Stage 2 (coarse + venue/street/house_number) Mailwoman neural-classifier weights
|
|
|
4
4
|
|
|
5
5
|
- locale: **en-us**
|
|
6
6
|
- corpus: **0.3.0**
|
|
7
|
-
- training steps: **
|
|
7
|
+
- training steps: **2200**
|
|
8
8
|
- hardware: **AMD Radeon 780M (gfx1103) bf16 ~14.6 GiB GTT**
|
|
9
9
|
|
|
10
10
|
## Per-component F1 targets
|
|
11
11
|
|
|
12
12
|
**⚠ Below per-component F1 targets:**
|
|
13
13
|
|
|
14
|
-
- `country` F1 = **0.
|
|
15
|
-
- `region` F1 = **0.
|
|
16
|
-
- `locality` F1 = **0.
|
|
17
|
-
- `postcode` F1 = **0.
|
|
18
|
-
- `venue` F1 = **0.
|
|
19
|
-
- `street` F1 = **0.
|
|
20
|
-
- `house_number` F1 = **0.
|
|
14
|
+
- `country` F1 = **0.2112** (target ≥0.95)
|
|
15
|
+
- `region` F1 = **0.1883** (target ≥0.95)
|
|
16
|
+
- `locality` F1 = **0.2736** (target ≥0.95)
|
|
17
|
+
- `postcode` F1 = **0.6916** (target ≥0.95)
|
|
18
|
+
- `venue` F1 = **0.3886** (target ≥0.60)
|
|
19
|
+
- `street` F1 = **0.3016** (target ≥0.70)
|
|
20
|
+
- `house_number` F1 = **0.7866** (target ≥0.80)
|
|
21
21
|
|
|
22
22
|
## Eval (golden set)
|
|
23
23
|
|
|
24
24
|
- entries: **4535**
|
|
25
|
-
- full-parse exact match: **0.
|
|
26
|
-
- mean token confidence: **0.
|
|
25
|
+
- full-parse exact match: **0.0818**
|
|
26
|
+
- mean token confidence: **0.8063**
|
|
27
27
|
|
|
28
28
|
## Components supported
|
|
29
29
|
|
package/model-card.json
CHANGED
|
@@ -1,18 +1,66 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "neural-weights-en-us",
|
|
3
|
-
"version": "
|
|
4
|
-
"
|
|
3
|
+
"version": "4.0.0",
|
|
4
|
+
"model_lineage": "Stage 3 / step 100000 (formerly v0.6.0) \u2014 relabeled to the unified 4.0.0 release version; tokenizer 0.6.0-a0",
|
|
5
|
+
"phase": "Stage 3 \u2014 street decomposition + PO box + intersection",
|
|
5
6
|
"license": "AGPL-3.0-only",
|
|
6
7
|
"locale": "en-us",
|
|
7
8
|
"training": {
|
|
8
|
-
"corpus_version": "0.
|
|
9
|
-
"tokenizer_version": "0.
|
|
10
|
-
"steps":
|
|
11
|
-
"
|
|
12
|
-
"
|
|
13
|
-
"
|
|
14
|
-
"completed_at": "2026-05-22T10:22:18.727364Z"
|
|
9
|
+
"corpus_version": "0.4.0+stage3",
|
|
10
|
+
"tokenizer_version": "0.6.0-a0",
|
|
11
|
+
"steps": 100000,
|
|
12
|
+
"best_step": 100000,
|
|
13
|
+
"hardware": "NVIDIA A100-SXM4-40GB (Modal cloud)",
|
|
14
|
+
"recipe": "v0.5.1 base + STAGE3 (33 BIO labels) + synth-po-box source @ 1.5x. CE-only (crf_loss_weight=0.0 after two NaN attempts with crf>0; the 33x33 transition table + bf16 was numerically unstable). lr=1.5e-4 constant, warmup=1000."
|
|
15
15
|
},
|
|
16
|
+
"architecture": {
|
|
17
|
+
"hidden_size": 384,
|
|
18
|
+
"num_hidden_layers": 6,
|
|
19
|
+
"num_attention_heads": 6,
|
|
20
|
+
"intermediate_size": 1536,
|
|
21
|
+
"max_position_embeddings": 128,
|
|
22
|
+
"vocab_size": 48000,
|
|
23
|
+
"num_labels": 33,
|
|
24
|
+
"params": "29.3M (29M encoder + 9M embedding from 48K vocab)",
|
|
25
|
+
"crf_at_training": false,
|
|
26
|
+
"crf_at_inference": true,
|
|
27
|
+
"phrase_priors": true
|
|
28
|
+
},
|
|
29
|
+
"labels": [
|
|
30
|
+
"O",
|
|
31
|
+
"B-country",
|
|
32
|
+
"I-country",
|
|
33
|
+
"B-region",
|
|
34
|
+
"I-region",
|
|
35
|
+
"B-locality",
|
|
36
|
+
"I-locality",
|
|
37
|
+
"B-dependent_locality",
|
|
38
|
+
"I-dependent_locality",
|
|
39
|
+
"B-postcode",
|
|
40
|
+
"I-postcode",
|
|
41
|
+
"B-subregion",
|
|
42
|
+
"I-subregion",
|
|
43
|
+
"B-cedex",
|
|
44
|
+
"I-cedex",
|
|
45
|
+
"B-venue",
|
|
46
|
+
"I-venue",
|
|
47
|
+
"B-street",
|
|
48
|
+
"I-street",
|
|
49
|
+
"B-house_number",
|
|
50
|
+
"I-house_number",
|
|
51
|
+
"B-street_prefix",
|
|
52
|
+
"I-street_prefix",
|
|
53
|
+
"B-street_suffix",
|
|
54
|
+
"I-street_suffix",
|
|
55
|
+
"B-unit",
|
|
56
|
+
"I-unit",
|
|
57
|
+
"B-po_box",
|
|
58
|
+
"I-po_box",
|
|
59
|
+
"B-intersection_a",
|
|
60
|
+
"I-intersection_a",
|
|
61
|
+
"B-intersection_b",
|
|
62
|
+
"I-intersection_b"
|
|
63
|
+
],
|
|
16
64
|
"components_supported": [
|
|
17
65
|
"country",
|
|
18
66
|
"region",
|
|
@@ -23,153 +71,27 @@
|
|
|
23
71
|
"cedex",
|
|
24
72
|
"venue",
|
|
25
73
|
"street",
|
|
26
|
-
"house_number"
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
"
|
|
30
|
-
"
|
|
31
|
-
"
|
|
32
|
-
"
|
|
33
|
-
"country": {
|
|
34
|
-
"precision": 0.2954545454532025,
|
|
35
|
-
"recall": 0.2653061224478967,
|
|
36
|
-
"f1": 0.2795698919733611,
|
|
37
|
-
"support": 245
|
|
38
|
-
},
|
|
39
|
-
"region": {
|
|
40
|
-
"precision": 0.4411027568916778,
|
|
41
|
-
"recall": 0.10982839313569116,
|
|
42
|
-
"f1": 0.17586809860649805,
|
|
43
|
-
"support": 3205
|
|
44
|
-
},
|
|
45
|
-
"locality": {
|
|
46
|
-
"precision": 0.23303370786511615,
|
|
47
|
-
"recall": 0.3089067619897799,
|
|
48
|
-
"f1": 0.26565902346259507,
|
|
49
|
-
"support": 3357
|
|
50
|
-
},
|
|
51
|
-
"dependent_locality": {
|
|
52
|
-
"precision": 0.0,
|
|
53
|
-
"recall": 0.0,
|
|
54
|
-
"f1": 0.0,
|
|
55
|
-
"support": 40
|
|
56
|
-
},
|
|
57
|
-
"postcode": {
|
|
58
|
-
"precision": 0.8426270136303831,
|
|
59
|
-
"recall": 0.684563758389032,
|
|
60
|
-
"f1": 0.7554156632710455,
|
|
61
|
-
"support": 2980
|
|
62
|
-
},
|
|
63
|
-
"subregion": {
|
|
64
|
-
"precision": 0.0,
|
|
65
|
-
"recall": 0.0,
|
|
66
|
-
"f1": 0.0,
|
|
67
|
-
"support": 0
|
|
68
|
-
},
|
|
69
|
-
"cedex": {
|
|
70
|
-
"precision": 0.0,
|
|
71
|
-
"recall": 0.0,
|
|
72
|
-
"f1": 0.0,
|
|
73
|
-
"support": 1
|
|
74
|
-
},
|
|
75
|
-
"venue": {
|
|
76
|
-
"precision": 0.38622493461169466,
|
|
77
|
-
"recall": 0.4023614895545846,
|
|
78
|
-
"f1": 0.39412811337886233,
|
|
79
|
-
"support": 1101
|
|
80
|
-
},
|
|
81
|
-
"street": {
|
|
82
|
-
"precision": 0.3326499231161801,
|
|
83
|
-
"recall": 0.22165300546440517,
|
|
84
|
-
"f1": 0.2660381220860398,
|
|
85
|
-
"support": 2928
|
|
86
|
-
},
|
|
87
|
-
"house_number": {
|
|
88
|
-
"precision": 0.7315737051789185,
|
|
89
|
-
"recall": 0.8432835820890682,
|
|
90
|
-
"f1": 0.7834666661687646,
|
|
91
|
-
"support": 1742
|
|
92
|
-
}
|
|
93
|
-
},
|
|
94
|
-
"calibration": [
|
|
95
|
-
{
|
|
96
|
-
"low": 0.0,
|
|
97
|
-
"high": 0.1,
|
|
98
|
-
"n": 0,
|
|
99
|
-
"acc": 0.0
|
|
100
|
-
},
|
|
101
|
-
{
|
|
102
|
-
"low": 0.1,
|
|
103
|
-
"high": 0.2,
|
|
104
|
-
"n": 19,
|
|
105
|
-
"acc": 0.5789473684210527
|
|
106
|
-
},
|
|
107
|
-
{
|
|
108
|
-
"low": 0.2,
|
|
109
|
-
"high": 0.3,
|
|
110
|
-
"n": 316,
|
|
111
|
-
"acc": 0.3322784810126582
|
|
112
|
-
},
|
|
113
|
-
{
|
|
114
|
-
"low": 0.3,
|
|
115
|
-
"high": 0.4,
|
|
116
|
-
"n": 1213,
|
|
117
|
-
"acc": 0.37757625721352017
|
|
118
|
-
},
|
|
119
|
-
{
|
|
120
|
-
"low": 0.4,
|
|
121
|
-
"high": 0.5,
|
|
122
|
-
"n": 2754,
|
|
123
|
-
"acc": 0.34132171387073346
|
|
124
|
-
},
|
|
125
|
-
{
|
|
126
|
-
"low": 0.5,
|
|
127
|
-
"high": 0.6,
|
|
128
|
-
"n": 3647,
|
|
129
|
-
"acc": 0.35481217438990953
|
|
130
|
-
},
|
|
131
|
-
{
|
|
132
|
-
"low": 0.6,
|
|
133
|
-
"high": 0.7,
|
|
134
|
-
"n": 3659,
|
|
135
|
-
"acc": 0.3883574747198688
|
|
136
|
-
},
|
|
137
|
-
{
|
|
138
|
-
"low": 0.7,
|
|
139
|
-
"high": 0.8,
|
|
140
|
-
"n": 4610,
|
|
141
|
-
"acc": 0.40629067245119305
|
|
142
|
-
},
|
|
143
|
-
{
|
|
144
|
-
"low": 0.8,
|
|
145
|
-
"high": 0.9,
|
|
146
|
-
"n": 7716,
|
|
147
|
-
"acc": 0.43960601347848627
|
|
148
|
-
},
|
|
149
|
-
{
|
|
150
|
-
"low": 0.9,
|
|
151
|
-
"high": 1.0,
|
|
152
|
-
"n": 37314,
|
|
153
|
-
"acc": 0.5987564989012167
|
|
154
|
-
}
|
|
155
|
-
]
|
|
156
|
-
},
|
|
157
|
-
"known_failure_modes": [
|
|
158
|
-
"underperforms on Hawaiian addresses (sparse in training corpus)",
|
|
159
|
-
"particle-honorific kryptonite (e.g. FR 'Saint-Just-Saint-Rambert') if not in synth set",
|
|
160
|
-
"non-Latin scripts (CJK, Cyrillic) fall through to byte-fallback tokens; F1 unknown"
|
|
74
|
+
"house_number",
|
|
75
|
+
"street_prefix",
|
|
76
|
+
"street_suffix",
|
|
77
|
+
"unit",
|
|
78
|
+
"po_box",
|
|
79
|
+
"intersection_a",
|
|
80
|
+
"intersection_b"
|
|
161
81
|
],
|
|
162
|
-
"notes": "
|
|
82
|
+
"notes": "v0.6.0 \u2014 Stage 3 ships. Schema expanded from 10 to 16 tags / 21 to 33 BIO labels. STAGE2 label IDs preserved exactly (new tags at IDs 21-32). TIGER, NAD, and BAN adapters now emit street_prefix/street/street_suffix/unit from existing structured input. New synth-po-box corpus source provides 50K PO box training examples across en-US/CA/GB/AU, fr-FR/CA, es-ES/MX/AR.",
|
|
163
83
|
"format": {
|
|
164
|
-
"model": "ONNX int8 dynamic",
|
|
165
|
-
"tokenizer": "SentencePiece unigram, byte_fallback=true, vocab_size=
|
|
84
|
+
"model": "ONNX int8 dynamic (quantized from fp32)",
|
|
85
|
+
"tokenizer": "SentencePiece unigram, byte_fallback=true, vocab_size=48000",
|
|
166
86
|
"max_sequence_length": 128,
|
|
167
|
-
"opset": 17
|
|
87
|
+
"opset": 17,
|
|
88
|
+
"fp32_size_mb": 111.8,
|
|
89
|
+
"int8_size_mb": 28.1
|
|
168
90
|
},
|
|
169
91
|
"files": {
|
|
170
92
|
"model": "model.onnx",
|
|
171
93
|
"tokenizer": "tokenizer.model",
|
|
172
94
|
"model_card": "model-card.json"
|
|
173
95
|
},
|
|
174
|
-
"base_relpath": "/data/
|
|
96
|
+
"base_relpath": "/data/output-v060/checkpoints/step-100000"
|
|
175
97
|
}
|
package/model.onnx
CHANGED
|
Binary file
|
package/package.json
CHANGED
|
@@ -1,20 +1,19 @@
|
|
|
1
1
|
{
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
}
|
|
2
|
+
"name": "@mailwoman/neural-weights-en-us",
|
|
3
|
+
"version": "4.0.0",
|
|
4
|
+
"description": "Mailwoman neural-classifier weights for locale 'en-us'. Data-only package — loaded by @mailwoman/neural at runtime.",
|
|
5
|
+
"license": "AGPL-3.0-only",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "https://github.com/sister-software/mailwoman"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"model.onnx",
|
|
12
|
+
"tokenizer.model",
|
|
13
|
+
"model-card.json",
|
|
14
|
+
"README.md"
|
|
15
|
+
],
|
|
16
|
+
"publishConfig": {
|
|
17
|
+
"access": "public"
|
|
18
|
+
}
|
|
19
|
+
}
|
package/tokenizer.model
CHANGED
|
Binary file
|