@mailwoman/neural-weights-en-us 2.2.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/model-card.json CHANGED
@@ -1,18 +1,17 @@
1
1
  {
2
2
  "name": "neural-weights-en-us",
3
- "version": "0.5.1",
4
- "phase": "Stage 2 (coarse + venue/street/house_number) CE-only, unchained",
3
+ "version": "4.0.0",
4
+ "model_lineage": "Stage 3 / step 100000 (formerly v0.6.0) \u2014 relabeled to the unified 4.0.0 release version; tokenizer 0.6.0-a0",
5
+ "phase": "Stage 3 \u2014 street decomposition + PO box + intersection",
5
6
  "license": "AGPL-3.0-only",
6
7
  "locale": "en-us",
7
8
  "training": {
8
- "corpus_version": "0.4.0",
9
- "tokenizer_version": "0.5.0-a1",
10
- "steps": 95000,
9
+ "corpus_version": "0.4.0+stage3",
10
+ "tokenizer_version": "0.6.0-a0",
11
+ "steps": 100000,
12
+ "best_step": 100000,
11
13
  "hardware": "NVIDIA A100-SXM4-40GB (Modal cloud)",
12
- "duration_seconds": 2100,
13
- "started_at": "2026-05-25T06:04:00Z",
14
- "completed_at": "2026-05-25T06:39:00Z",
15
- "recipe": "CE-only (crf_loss_weight=0.0), h384, batch=128 direct, constant LR=1.5e-4, phrase_priors=ON, class_weights tuned"
14
+ "recipe": "v0.5.1 base + STAGE3 (33 BIO labels) + synth-po-box source @ 1.5x. CE-only (crf_loss_weight=0.0 after two NaN attempts with crf>0; the 33x33 transition table + bf16 was numerically unstable). lr=1.5e-4 constant, warmup=1000."
16
15
  },
17
16
  "architecture": {
18
17
  "hidden_size": 384,
@@ -20,23 +19,13 @@
20
19
  "num_attention_heads": 6,
21
20
  "intermediate_size": 1536,
22
21
  "max_position_embeddings": 128,
23
- "params": "29M",
22
+ "vocab_size": 48000,
23
+ "num_labels": 33,
24
+ "params": "29.3M (29M encoder + 9M embedding from 48K vocab)",
24
25
  "crf_at_training": false,
25
26
  "crf_at_inference": true,
26
27
  "phrase_priors": true
27
28
  },
28
- "components_supported": [
29
- "country",
30
- "region",
31
- "locality",
32
- "dependent_locality",
33
- "postcode",
34
- "subregion",
35
- "cedex",
36
- "venue",
37
- "street",
38
- "house_number"
39
- ],
40
29
  "labels": [
41
30
  "O",
42
31
  "B-country",
@@ -58,37 +47,51 @@
58
47
  "B-street",
59
48
  "I-street",
60
49
  "B-house_number",
61
- "I-house_number"
50
+ "I-house_number",
51
+ "B-street_prefix",
52
+ "I-street_prefix",
53
+ "B-street_suffix",
54
+ "I-street_suffix",
55
+ "B-unit",
56
+ "I-unit",
57
+ "B-po_box",
58
+ "I-po_box",
59
+ "B-intersection_a",
60
+ "I-intersection_a",
61
+ "B-intersection_b",
62
+ "I-intersection_b"
62
63
  ],
63
- "eval": {
64
- "val_macro_f1": 0.638,
65
- "val_loss": 0.281,
66
- "golden_eval": {
67
- "n_entries": 4535,
68
- "hybrid_joint_exact_match": 0.102,
69
- "hybrid_joint_macro_f1": 0.17,
70
- "hybrid_joint_empty_parse": 0.0,
71
- "rule_only_exact_match": 0.308,
72
- "neural_macro_f1": 0.078
73
- }
74
- },
75
- "known_failure_modes": [
76
- "54.5% overconfident-wrong in neural-only mode (addressed by reconciler: 0.1%)",
77
- "dependent_locality hallucination reduced by class_weights=0.3 but not eliminated",
78
- "non-Latin scripts: A1 tokenizer has 18.2% byte-fallback (vs v0.1.0's 36.7%); model not trained on non-Latin addresses yet",
79
- "particle-honorific kryptonite (e.g. FR 'Saint-Just-Saint-Rambert')"
64
+ "components_supported": [
65
+ "country",
66
+ "region",
67
+ "locality",
68
+ "dependent_locality",
69
+ "postcode",
70
+ "subregion",
71
+ "cedex",
72
+ "venue",
73
+ "street",
74
+ "house_number",
75
+ "street_prefix",
76
+ "street_suffix",
77
+ "unit",
78
+ "po_box",
79
+ "intersection_a",
80
+ "intersection_b"
80
81
  ],
81
- "notes": "v0.5.1 'unchained' iteration. Removes all hardware constraints from v0.5.0 (h256→h384, grad_accum→direct batch, 50K→100K steps, phrase priors OFF→ON, class weights uniform→tuned). CE-only training fix (crf_loss_weight=0) carries from v0.5.0 — nine dual-loss runs diverged, CE-only is stable. Val_loss oscillates every ~20K steps (hard-cluster cycling, not overfitting). Best checkpoint at step-95K. +77% over v0.4.0 on training eval; +70% over v0.5.0 on golden exact-match (hybrid-joint 6.0%→10.2%).",
82
+ "notes": "v0.6.0 \u2014 Stage 3 ships. Schema expanded from 10 to 16 tags / 21 to 33 BIO labels. STAGE2 label IDs preserved exactly (new tags at IDs 21-32). TIGER, NAD, and BAN adapters now emit street_prefix/street/street_suffix/unit from existing structured input. New synth-po-box corpus source provides 50K PO box training examples across en-US/CA/GB/AU, fr-FR/CA, es-ES/MX/AR.",
82
83
  "format": {
83
- "model": "ONNX fp32 dynamic",
84
+ "model": "ONNX int8 dynamic (quantized from fp32)",
84
85
  "tokenizer": "SentencePiece unigram, byte_fallback=true, vocab_size=48000",
85
86
  "max_sequence_length": 128,
86
- "opset": 17
87
+ "opset": 17,
88
+ "fp32_size_mb": 111.8,
89
+ "int8_size_mb": 28.1
87
90
  },
88
91
  "files": {
89
92
  "model": "model.onnx",
90
93
  "tokenizer": "tokenizer.model",
91
94
  "model_card": "model-card.json"
92
95
  },
93
- "base_relpath": "/data/output/checkpoints/step-095000"
96
+ "base_relpath": "/data/output-v060/checkpoints/step-100000"
94
97
  }
package/model.onnx CHANGED
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mailwoman/neural-weights-en-us",
3
- "version": "2.2.0",
3
+ "version": "4.0.0",
4
4
  "description": "Mailwoman neural-classifier weights for locale 'en-us'. Data-only package — loaded by @mailwoman/neural at runtime.",
5
5
  "license": "AGPL-3.0-only",
6
6
  "repository": {
package/tokenizer.model CHANGED
Binary file