@mailwoman/neural-weights-en-us 3.0.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -4,26 +4,26 @@ Stage 2 (coarse + venue/street/house_number) Mailwoman neural-classifier weights
4
4
 
5
5
  - locale: **en-us**
6
6
  - corpus: **0.3.0**
7
- - training steps: **1800**
7
+ - training steps: **2200**
8
8
  - hardware: **AMD Radeon 780M (gfx1103) bf16 ~14.6 GiB GTT**
9
9
 
10
10
  ## Per-component F1 targets
11
11
 
12
12
  **⚠ Below per-component F1 targets:**
13
13
 
14
- - `country` F1 = **0.2796** (target ≥0.95)
15
- - `region` F1 = **0.1759** (target ≥0.95)
16
- - `locality` F1 = **0.2657** (target ≥0.95)
17
- - `postcode` F1 = **0.7554** (target ≥0.95)
18
- - `venue` F1 = **0.3941** (target ≥0.60)
19
- - `street` F1 = **0.2660** (target ≥0.70)
20
- - `house_number` F1 = **0.7835** (target ≥0.80)
14
+ - `country` F1 = **0.2112** (target ≥0.95)
15
+ - `region` F1 = **0.1883** (target ≥0.95)
16
+ - `locality` F1 = **0.2736** (target ≥0.95)
17
+ - `postcode` F1 = **0.6916** (target ≥0.95)
18
+ - `venue` F1 = **0.3886** (target ≥0.60)
19
+ - `street` F1 = **0.3016** (target ≥0.70)
20
+ - `house_number` F1 = **0.7866** (target ≥0.80)
21
21
 
22
22
  ## Eval (golden set)
23
23
 
24
24
  - entries: **4535**
25
- - full-parse exact match: **0.1074**
26
- - mean token confidence: **0.8566**
25
+ - full-parse exact match: **0.0818**
26
+ - mean token confidence: **0.8063**
27
27
 
28
28
  ## Components supported
29
29
 
package/model-card.json CHANGED
@@ -1,18 +1,66 @@
1
1
  {
2
2
  "name": "neural-weights-en-us",
3
- "version": "3.0.0",
4
- "phase": "Stage 2 (coarse + venue/street/house_number)",
3
+ "version": "4.0.0",
4
+ "model_lineage": "Stage 3 / step 100000 (formerly v0.6.0) \u2014 relabeled to the unified 4.0.0 release version; tokenizer 0.6.0-a0",
5
+ "phase": "Stage 3 \u2014 street decomposition + PO box + intersection",
5
6
  "license": "AGPL-3.0-only",
6
7
  "locale": "en-us",
7
8
  "training": {
8
- "corpus_version": "0.3.0",
9
- "tokenizer_version": "0.1.0",
10
- "steps": 1800,
11
- "hardware": "AMD Radeon 780M (gfx1103) bf16 ~14.6 GiB GTT",
12
- "duration_seconds": 1067.0,
13
- "started_at": null,
14
- "completed_at": "2026-05-22T10:22:18.727364Z"
9
+ "corpus_version": "0.4.0+stage3",
10
+ "tokenizer_version": "0.6.0-a0",
11
+ "steps": 100000,
12
+ "best_step": 100000,
13
+ "hardware": "NVIDIA A100-SXM4-40GB (Modal cloud)",
14
+ "recipe": "v0.5.1 base + STAGE3 (33 BIO labels) + synth-po-box source @ 1.5x. CE-only (crf_loss_weight=0.0 after two NaN attempts with crf>0; the 33x33 transition table + bf16 was numerically unstable). lr=1.5e-4 constant, warmup=1000."
15
15
  },
16
+ "architecture": {
17
+ "hidden_size": 384,
18
+ "num_hidden_layers": 6,
19
+ "num_attention_heads": 6,
20
+ "intermediate_size": 1536,
21
+ "max_position_embeddings": 128,
22
+ "vocab_size": 48000,
23
+ "num_labels": 33,
24
+ "params": "29.3M (~10.9M encoder + ~18.4M embedding from 48K vocab)",
25
+ "crf_at_training": false,
26
+ "crf_at_inference": true,
27
+ "phrase_priors": true
28
+ },
29
+ "labels": [
30
+ "O",
31
+ "B-country",
32
+ "I-country",
33
+ "B-region",
34
+ "I-region",
35
+ "B-locality",
36
+ "I-locality",
37
+ "B-dependent_locality",
38
+ "I-dependent_locality",
39
+ "B-postcode",
40
+ "I-postcode",
41
+ "B-subregion",
42
+ "I-subregion",
43
+ "B-cedex",
44
+ "I-cedex",
45
+ "B-venue",
46
+ "I-venue",
47
+ "B-street",
48
+ "I-street",
49
+ "B-house_number",
50
+ "I-house_number",
51
+ "B-street_prefix",
52
+ "I-street_prefix",
53
+ "B-street_suffix",
54
+ "I-street_suffix",
55
+ "B-unit",
56
+ "I-unit",
57
+ "B-po_box",
58
+ "I-po_box",
59
+ "B-intersection_a",
60
+ "I-intersection_a",
61
+ "B-intersection_b",
62
+ "I-intersection_b"
63
+ ],
16
64
  "components_supported": [
17
65
  "country",
18
66
  "region",
@@ -23,153 +71,27 @@
23
71
  "cedex",
24
72
  "venue",
25
73
  "street",
26
- "house_number"
27
- ],
28
- "eval": {
29
- "n_entries": 4535,
30
- "full_parse_exact_match": 0.1073869900771775,
31
- "mean_token_confidence": 0.856554333787935,
32
- "per_component": {
33
- "country": {
34
- "precision": 0.2954545454532025,
35
- "recall": 0.2653061224478967,
36
- "f1": 0.2795698919733611,
37
- "support": 245
38
- },
39
- "region": {
40
- "precision": 0.4411027568916778,
41
- "recall": 0.10982839313569116,
42
- "f1": 0.17586809860649805,
43
- "support": 3205
44
- },
45
- "locality": {
46
- "precision": 0.23303370786511615,
47
- "recall": 0.3089067619897799,
48
- "f1": 0.26565902346259507,
49
- "support": 3357
50
- },
51
- "dependent_locality": {
52
- "precision": 0.0,
53
- "recall": 0.0,
54
- "f1": 0.0,
55
- "support": 40
56
- },
57
- "postcode": {
58
- "precision": 0.8426270136303831,
59
- "recall": 0.684563758389032,
60
- "f1": 0.7554156632710455,
61
- "support": 2980
62
- },
63
- "subregion": {
64
- "precision": 0.0,
65
- "recall": 0.0,
66
- "f1": 0.0,
67
- "support": 0
68
- },
69
- "cedex": {
70
- "precision": 0.0,
71
- "recall": 0.0,
72
- "f1": 0.0,
73
- "support": 1
74
- },
75
- "venue": {
76
- "precision": 0.38622493461169466,
77
- "recall": 0.4023614895545846,
78
- "f1": 0.39412811337886233,
79
- "support": 1101
80
- },
81
- "street": {
82
- "precision": 0.3326499231161801,
83
- "recall": 0.22165300546440517,
84
- "f1": 0.2660381220860398,
85
- "support": 2928
86
- },
87
- "house_number": {
88
- "precision": 0.7315737051789185,
89
- "recall": 0.8432835820890682,
90
- "f1": 0.7834666661687646,
91
- "support": 1742
92
- }
93
- },
94
- "calibration": [
95
- {
96
- "low": 0.0,
97
- "high": 0.1,
98
- "n": 0,
99
- "acc": 0.0
100
- },
101
- {
102
- "low": 0.1,
103
- "high": 0.2,
104
- "n": 19,
105
- "acc": 0.5789473684210527
106
- },
107
- {
108
- "low": 0.2,
109
- "high": 0.3,
110
- "n": 316,
111
- "acc": 0.3322784810126582
112
- },
113
- {
114
- "low": 0.3,
115
- "high": 0.4,
116
- "n": 1213,
117
- "acc": 0.37757625721352017
118
- },
119
- {
120
- "low": 0.4,
121
- "high": 0.5,
122
- "n": 2754,
123
- "acc": 0.34132171387073346
124
- },
125
- {
126
- "low": 0.5,
127
- "high": 0.6,
128
- "n": 3647,
129
- "acc": 0.35481217438990953
130
- },
131
- {
132
- "low": 0.6,
133
- "high": 0.7,
134
- "n": 3659,
135
- "acc": 0.3883574747198688
136
- },
137
- {
138
- "low": 0.7,
139
- "high": 0.8,
140
- "n": 4610,
141
- "acc": 0.40629067245119305
142
- },
143
- {
144
- "low": 0.8,
145
- "high": 0.9,
146
- "n": 7716,
147
- "acc": 0.43960601347848627
148
- },
149
- {
150
- "low": 0.9,
151
- "high": 1.0,
152
- "n": 37314,
153
- "acc": 0.5987564989012167
154
- }
155
- ]
156
- },
157
- "known_failure_modes": [
158
- "underperforms on Hawaiian addresses (sparse in training corpus)",
159
- "particle-honorific kryptonite (e.g. FR 'Saint-Just-Saint-Rambert') if not in synth set",
160
- "non-Latin scripts (CJK, Cyrillic) fall through to byte-fallback tokens; F1 unknown"
74
+ "house_number",
75
+ "street_prefix",
76
+ "street_suffix",
77
+ "unit",
78
+ "po_box",
79
+ "intersection_a",
80
+ "intersection_b"
161
81
  ],
162
- "notes": "Stage 2 v3.0.0 \u2014 same encoder geometry as v2.0.x (8.87M params, 6L/256H/4-heads) plus a linear-chain CRF decoder (+~500 params with a frozen BIO transition mask), label smoothing on the per-token CE leg (disabled in the shipped hparams after iteration; see ship notes), and a 21-label classifier head (was 15) that adds venue / street / house_number BIO classes. Trained on corpus-v0.3.0 which adds the US DOT NAD source (~97M structured 911-grade address points). The CRF transition mask makes orphan-I sequences (e.g. \"Saint Petersburg \u2192 Petersburg\" clipping visible on the v0.2.0 demo) structurally impossible. See evals/scores-by-version.json for the v2.0.x \u2192 v3.0.0 deltas + the per-component F1 on the new fine labels.",
82
+ "notes": "v0.6.0 (published as 4.0.0) \u2014 Stage 3 ships. Schema expanded from 10 to 16 tags / 21 to 33 BIO labels. STAGE2 label IDs preserved exactly (new tags at IDs 21-32). TIGER, NAD, and BAN adapters now emit street_prefix/street/street_suffix/unit from existing structured input. New synth-po-box corpus source provides 50K PO box training examples across en-US/CA/GB/AU, fr-FR/CA, es-ES/MX/AR.",
163
83
  "format": {
164
- "model": "ONNX int8 dynamic",
165
- "tokenizer": "SentencePiece unigram, byte_fallback=true, vocab_size=16000",
84
+ "model": "ONNX int8 dynamic (quantized from fp32)",
85
+ "tokenizer": "SentencePiece unigram, byte_fallback=true, vocab_size=48000",
166
86
  "max_sequence_length": 128,
167
- "opset": 17
87
+ "opset": 17,
88
+ "fp32_size_mb": 111.8,
89
+ "int8_size_mb": 28.1
168
90
  },
169
91
  "files": {
170
92
  "model": "model.onnx",
171
93
  "tokenizer": "tokenizer.model",
172
94
  "model_card": "model-card.json"
173
95
  },
174
- "base_relpath": "/data/models/checkpoints/stage2/step-001800"
96
+ "base_relpath": "/data/output-v060/checkpoints/step-100000"
175
97
  }
package/model.onnx CHANGED
Binary file
package/package.json CHANGED
@@ -1,20 +1,19 @@
1
1
  {
2
- "name": "@mailwoman/neural-weights-en-us",
3
- "version": "3.0.0",
4
- "description": "Mailwoman neural-classifier weights for locale 'en-us'. Data-only package — loaded by @mailwoman/neural at runtime.",
5
- "license": "AGPL-3.0-only",
6
- "private": false,
7
- "repository": {
8
- "type": "git",
9
- "url": "https://github.com/sister-software/mailwoman"
10
- },
11
- "files": [
12
- "model.onnx",
13
- "tokenizer.model",
14
- "model-card.json",
15
- "README.md"
16
- ],
17
- "publishConfig": {
18
- "access": "public"
19
- }
20
- }
2
+ "name": "@mailwoman/neural-weights-en-us",
3
+ "version": "4.0.0",
4
+ "description": "Mailwoman neural-classifier weights for locale 'en-us'. Data-only package — loaded by @mailwoman/neural at runtime.",
5
+ "license": "AGPL-3.0-only",
6
+ "repository": {
7
+ "type": "git",
8
+ "url": "https://github.com/sister-software/mailwoman"
9
+ },
10
+ "files": [
11
+ "model.onnx",
12
+ "tokenizer.model",
13
+ "model-card.json",
14
+ "README.md"
15
+ ],
16
+ "publishConfig": {
17
+ "access": "public"
18
+ }
19
+ }
package/tokenizer.model CHANGED
Binary file