@mailwoman/core 4.9.0 → 4.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +90 -0
- package/data/coarse-placer/meta.json +17 -48
- package/out/decoder/arbitrate-tree.d.ts +45 -0
- package/out/decoder/arbitrate-tree.d.ts.map +1 -0
- package/out/decoder/arbitrate-tree.js +97 -0
- package/out/decoder/arbitrate-tree.js.map +1 -0
- package/out/decoder/index.d.ts +2 -0
- package/out/decoder/index.d.ts.map +1 -1
- package/out/decoder/index.js +2 -0
- package/out/decoder/index.js.map +1 -1
- package/out/decoder/proposals-to-tree.d.ts +20 -1
- package/out/decoder/proposals-to-tree.d.ts.map +1 -1
- package/out/decoder/proposals-to-tree.js +37 -0
- package/out/decoder/proposals-to-tree.js.map +1 -1
- package/out/decoder/resolve-proposal-overlaps.d.ts +52 -0
- package/out/decoder/resolve-proposal-overlaps.d.ts.map +1 -0
- package/out/decoder/resolve-proposal-overlaps.js +74 -0
- package/out/decoder/resolve-proposal-overlaps.js.map +1 -0
- package/out/parser/index.d.ts +1 -0
- package/out/parser/index.d.ts.map +1 -1
- package/out/parser/index.js +1 -0
- package/out/parser/index.js.map +1 -1
- package/out/parser/proposal-pipeline.d.ts +16 -3
- package/out/parser/proposal-pipeline.d.ts.map +1 -1
- package/out/parser/proposal-pipeline.js +18 -6
- package/out/parser/proposal-pipeline.js.map +1 -1
- package/out/parser/solution-to-proposals.d.ts +28 -0
- package/out/parser/solution-to-proposals.d.ts.map +1 -0
- package/out/parser/solution-to-proposals.js +44 -0
- package/out/parser/solution-to-proposals.js.map +1 -0
- package/out/pipeline/runtime-pipeline.d.ts.map +1 -1
- package/out/pipeline/runtime-pipeline.js +32 -4
- package/out/pipeline/runtime-pipeline.js.map +1 -1
- package/out/pipeline/types.d.ts +30 -1
- package/out/pipeline/types.d.ts.map +1 -1
- package/out/policy/defaults.d.ts +11 -6
- package/out/policy/defaults.d.ts.map +1 -1
- package/out/policy/defaults.js +12 -7
- package/out/policy/defaults.js.map +1 -1
- package/out/policy/from-config.d.ts +14 -4
- package/out/policy/from-config.d.ts.map +1 -1
- package/out/policy/from-config.js +16 -5
- package/out/policy/from-config.js.map +1 -1
- package/out/policy/index.d.ts +1 -0
- package/out/policy/index.d.ts.map +1 -1
- package/out/policy/index.js +1 -0
- package/out/policy/index.js.map +1 -1
- package/out/policy/input-shape-router.d.ts +104 -0
- package/out/policy/input-shape-router.d.ts.map +1 -0
- package/out/policy/input-shape-router.js +88 -0
- package/out/policy/input-shape-router.js.map +1 -0
- package/out/policy/registry.d.ts +7 -3
- package/out/policy/registry.d.ts.map +1 -1
- package/out/policy/registry.js +7 -3
- package/out/policy/registry.js.map +1 -1
- package/out/resolver/remote-resolver.d.ts +4 -2
- package/out/resolver/remote-resolver.d.ts.map +1 -1
- package/out/resolver/remote-resolver.js.map +1 -1
- package/out/resolver/resolve.d.ts.map +1 -1
- package/out/resolver/resolve.js +74 -4
- package/out/resolver/resolve.js.map +1 -1
- package/out/resolver/types.d.ts +48 -9
- package/out/resolver/types.d.ts.map +1 -1
- package/out/resolver/types.js +56 -9
- package/out/resolver/types.js.map +1 -1
- package/out/utils/repo.d.ts.map +1 -1
- package/out/utils/repo.js +5 -4
- package/out/utils/repo.js.map +1 -1
- package/package.json +1 -1
package/README.md
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# @mailwoman/core
|
|
2
|
+
|
|
3
|
+
**The foundation of the Mailwoman address parser** — types, tokenization,
|
|
4
|
+
classification primitives, solver, decoder, and the staged pipeline coordinator.
|
|
5
|
+
Ships ~9 MB of provenance-tracked reference dictionaries (libpostal, Who's On
|
|
6
|
+
First, chromium-i18n) consumed by the resolver and classifiers.
|
|
7
|
+
|
|
8
|
+
```ts
|
|
9
|
+
import { createRuntimePipeline, AddressParser, ComponentTag, Classification, Span } from "@mailwoman/core"
|
|
10
|
+
|
|
11
|
+
const pipeline = createRuntimePipeline({ locale: "en-US" })
|
|
12
|
+
const result = pipeline.parse("1600 Amphitheatre Parkway, Mountain View, CA 94043")
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## What's inside
|
|
16
|
+
|
|
17
|
+
| Module | Purpose |
|
|
18
|
+
| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------- |
|
|
19
|
+
| **`types/`** | Core type system: `ComponentTag`, `Span`, `Classification`, `ClassificationMap`, `LocaleTag` |
|
|
20
|
+
| **`tokenization/`** | Tokenizer primitives, whitespace/punctuation rules, token classification |
|
|
21
|
+
| **`classification/`** | `Classification` data structure, `ClassificationMap`, span overlap resolution |
|
|
22
|
+
| **`decoder/`** | Span proposal → tree projection, BIO decoding, reconcile/merge strategies, confidence calibration |
|
|
23
|
+
| **`pipeline/`** | `createRuntimePipeline` — the staged pipeline coordinator that wires normalize → query-shape → locale-gate → ... → classifier → decoder |
|
|
24
|
+
| **`solver/`** | Rule-based solver (the v0 rules engine), `Solution`, `Solver` |
|
|
25
|
+
| **`parser/`** | `AddressParser` — high-level parse entry point (consumed by `mailwoman` CLI) |
|
|
26
|
+
| **`resources/`** | ~9 MB of shipped reference data: libpostal dictionaries, WOF place data, chromium-i18n address formats |
|
|
27
|
+
|
|
28
|
+
## Key exports
|
|
29
|
+
|
|
30
|
+
```ts
|
|
31
|
+
// Types
|
|
32
|
+
export type { ComponentTag, Span, Classification, ClassificationMap, LocaleTag }
|
|
33
|
+
|
|
34
|
+
// Pipeline
|
|
35
|
+
export { createRuntimePipeline, type RuntimePipeline, type PipelineOpts }
|
|
36
|
+
|
|
37
|
+
// Classification
|
|
38
|
+
export { Classification, ClassificationMap }
|
|
39
|
+
export { treeToClassification, classificationToTree }
|
|
40
|
+
|
|
41
|
+
// Decoder
|
|
42
|
+
export { decodeBioSpans, viterbiDecode, reconcileSpans }
|
|
43
|
+
export { createCalibrator, type Calibrator } // isotonic confidence calibration
|
|
44
|
+
|
|
45
|
+
// Solver (v0 rules)
|
|
46
|
+
export { Solver, Solution }
|
|
47
|
+
|
|
48
|
+
// Tokenization
|
|
49
|
+
export { tokenize, Token, TokenClass }
|
|
50
|
+
|
|
51
|
+
// Resources
|
|
52
|
+
export { loadDictionary, getAvailableLanguages }
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Pipeline architecture
|
|
56
|
+
|
|
57
|
+
Mailwoman's runtime pipeline is a staged coordinator that chains pure-function
|
|
58
|
+
stages with typed handoffs:
|
|
59
|
+
|
|
60
|
+
```
|
|
61
|
+
normalize → query-shape → locale-gate → kind-classifier → phrase-grouper → classifier → decoder
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
Each stage is published as its own `@mailwoman/*` package and wired together by
|
|
65
|
+
the pipeline coordinator in this package. The design ensures every stage is
|
|
66
|
+
independently testable, benchmarkable, and replaceable.
|
|
67
|
+
|
|
68
|
+
## Reference data
|
|
69
|
+
|
|
70
|
+
This package ships immutable, provenance-tracked dictionaries consumed by the
|
|
71
|
+
resolver and rule-based classifiers:
|
|
72
|
+
|
|
73
|
+
- **libpostal** — multilingual street types, place names, directional/ordinal tokens
|
|
74
|
+
- **Who's On First** — place hierarchy and geography
|
|
75
|
+
- **chromium-i18n** — per-country address format templates
|
|
76
|
+
|
|
77
|
+
The dictionaries are ~9 MB total and are loaded lazily.
|
|
78
|
+
|
|
79
|
+
## Related
|
|
80
|
+
|
|
81
|
+
- [`mailwoman`](../mailwoman) — the user-facing CLI + `AddressParser`
|
|
82
|
+
- [`@mailwoman/normalize`](../normalize) — Stage 1 of the pipeline
|
|
83
|
+
- [`@mailwoman/neural`](../neural) — neural classifier (ONNX runtime)
|
|
84
|
+
- [`@mailwoman/classifiers`](../classifiers) — rule-based classifiers
|
|
85
|
+
- [What Mailwoman Is](https://mailwoman.sister.software/articles/concepts/what-mailwoman-is/)
|
|
86
|
+
- [Staged Pipeline Contract](https://mailwoman.sister.software/articles/plan/reference/STAGES/)
|
|
87
|
+
|
|
88
|
+
## License
|
|
89
|
+
|
|
90
|
+
[AGPL-3.0-only](https://www.gnu.org/licenses/agpl-3.0.html)
|
|
@@ -1,49 +1,18 @@
|
|
|
1
1
|
{
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
-2.534797191619873,
|
|
20
|
-
-2.1968510150909424,
|
|
21
|
-
-1.1691420078277588,
|
|
22
|
-
-0.1516338437795639,
|
|
23
|
-
-1.0911738872528076,
|
|
24
|
-
0.4597792327404022,
|
|
25
|
-
1.1227235794067383,
|
|
26
|
-
0.8241579532623291,
|
|
27
|
-
1.009660005569458,
|
|
28
|
-
0.47969329357147217,
|
|
29
|
-
0.34428685903549194,
|
|
30
|
-
3.046959161758423
|
|
31
|
-
],
|
|
32
|
-
"trainedAt": null,
|
|
33
|
-
"trainRows": 612888,
|
|
34
|
-
"quantization": "int8-per-row",
|
|
35
|
-
"scales": [
|
|
36
|
-
0.01638676988796925,
|
|
37
|
-
0.018822564853457954,
|
|
38
|
-
0.024230125382190614,
|
|
39
|
-
0.0221641082463302,
|
|
40
|
-
0.04191271714338168,
|
|
41
|
-
0.019635750552800698,
|
|
42
|
-
0.022043046050184353,
|
|
43
|
-
0.02781985688397265,
|
|
44
|
-
0.02106240227466493,
|
|
45
|
-
0.022867627031221166,
|
|
46
|
-
0.029368993804210753,
|
|
47
|
-
0.04316849220456101
|
|
48
|
-
]
|
|
49
|
-
}
|
|
2
|
+
"classes": ["US", "FR", "GB", "CN", "NL", "IT", "DE", "JP", "ES", "KR", "TW", "OTHER"],
|
|
3
|
+
"featureDim": 65536,
|
|
4
|
+
"temperature": 1.2,
|
|
5
|
+
"bias": [
|
|
6
|
+
-2.534797191619873, -2.1968510150909424, -1.1691420078277588, -0.1516338437795639, -1.0911738872528076,
|
|
7
|
+
0.4597792327404022, 1.1227235794067383, 0.8241579532623291, 1.009660005569458, 0.47969329357147217,
|
|
8
|
+
0.34428685903549194, 3.046959161758423
|
|
9
|
+
],
|
|
10
|
+
"trainedAt": null,
|
|
11
|
+
"trainRows": 612888,
|
|
12
|
+
"quantization": "int8-per-row",
|
|
13
|
+
"scales": [
|
|
14
|
+
0.01638676988796925, 0.018822564853457954, 0.024230125382190614, 0.0221641082463302, 0.04191271714338168,
|
|
15
|
+
0.019635750552800698, 0.022043046050184353, 0.02781985688397265, 0.02106240227466493, 0.022867627031221166,
|
|
16
|
+
0.029368993804210753, 0.04316849220456101
|
|
17
|
+
]
|
|
18
|
+
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Containment-preserving arbitration (#478 inc 3, fix-v1).
|
|
7
|
+
*
|
|
8
|
+
* The first arbitration implementation flattened the neural parse to proposals, unioned the solved
|
|
9
|
+
* v0 proposals, filtered per-component, resolved span overlaps, and rebuilt a FLAT tree. That
|
|
10
|
+
* lost containment two ways (diagnosed in `2026-06-17-478-arbitration-arena-gate.md`): the
|
|
11
|
+
* overlap pass evicted a `street` for the `street_suffix` sitting inside it (street dropped on
|
|
12
|
+
* 42% of rows), and the flat tree lost the region→locality structure the resolver needs
|
|
13
|
+
* (wrong-state namesakes, coord p50 3.3 km → 1069 km).
|
|
14
|
+
*
|
|
15
|
+
* This applies arbitration as **edits on the nested neural argmax tree** — never flattening, never
|
|
16
|
+
* restructuring — so the neural tree's containment is preserved by construction. Used only on the
|
|
17
|
+
* `rule_preferred` route; `neural_preferred` / `abstain` pass the neural tree through untouched.
|
|
18
|
+
*
|
|
19
|
+
* The edits (DeepSeek-coordinated, 2026-06-17):
|
|
20
|
+
*
|
|
21
|
+
* 1. **Relabel** — when a rule proposal covers the EXACT span of a neural node but assigns a different
|
|
22
|
+
* tag, take the rule's tag (the genuine same-span disagreement; rule wins under
|
|
23
|
+
* `rule_preferred`). Structure unchanged — only the node's tag/provenance.
|
|
24
|
+
* 2. **Add missing tags** — a rule proposal whose tag is absent from the neural tree AND whose span
|
|
25
|
+
* doesn't overlap any neural node is added as a new root (a component neural missed
|
|
26
|
+
* entirely).
|
|
27
|
+
*
|
|
28
|
+
* What it deliberately does NOT do: replace a neural node with a differently-spanned rule node,
|
|
29
|
+
* drop neural's sub-component decomposition (`street_suffix`/`street_prefix`), or add an
|
|
30
|
+
* overlapping rule node. So a clean address — where neural and v0 agree on tags+spans and differ
|
|
31
|
+
* only in street decomposition — is a **no-op**. The cost is losing pure-decomposition wins (low
|
|
32
|
+
* value); the gate re-run is the arbiter.
|
|
33
|
+
*/
|
|
34
|
+
import type { ClassificationProposal } from "../types/index.js";
|
|
35
|
+
import type { AddressTree } from "./types.js";
|
|
36
|
+
/**
|
|
37
|
+
* Edit the nested neural argmax tree with the solved v0 (rule) parse under the `rule_preferred`
|
|
38
|
+
* route — relabel same-span tag disagreements toward rule, add rule-only non-overlapping missing
|
|
39
|
+
* tags. Containment-preserving (no flatten, no restructure). Input is not mutated.
|
|
40
|
+
*
|
|
41
|
+
* @param tree The neural argmax `AddressTree`.
|
|
42
|
+
* @param ruleProposals Proposals from the solved v0 parse (`solutionToProposals`).
|
|
43
|
+
*/
|
|
44
|
+
export declare function applyRuleArbitration(tree: AddressTree, ruleProposals: readonly ClassificationProposal[]): AddressTree;
|
|
45
|
+
//# sourceMappingURL=arbitrate-tree.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"arbitrate-tree.d.ts","sourceRoot":"","sources":["../../decoder/arbitrate-tree.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AAEH,OAAO,KAAK,EAAE,sBAAsB,EAAE,MAAM,mBAAmB,CAAA;AAC/D,OAAO,KAAK,EAAe,WAAW,EAAE,MAAM,YAAY,CAAA;AAU1D;;;;;;;GAOG;AACH,wBAAgB,oBAAoB,CAAC,IAAI,EAAE,WAAW,EAAE,aAAa,EAAE,SAAS,sBAAsB,EAAE,GAAG,WAAW,CAgDrH"}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Containment-preserving arbitration (#478 inc 3, fix-v1).
|
|
7
|
+
*
|
|
8
|
+
* The first arbitration implementation flattened the neural parse to proposals, unioned the solved
|
|
9
|
+
* v0 proposals, filtered per-component, resolved span overlaps, and rebuilt a FLAT tree. That
|
|
10
|
+
* lost containment two ways (diagnosed in `2026-06-17-478-arbitration-arena-gate.md`): the
|
|
11
|
+
* overlap pass evicted a `street` for the `street_suffix` sitting inside it (street dropped on
|
|
12
|
+
* 42% of rows), and the flat tree lost the region→locality structure the resolver needs
|
|
13
|
+
* (wrong-state namesakes, coord p50 3.3 km → 1069 km).
|
|
14
|
+
*
|
|
15
|
+
* This applies arbitration as **edits on the nested neural argmax tree** — never flattening, never
|
|
16
|
+
* restructuring — so the neural tree's containment is preserved by construction. Used only on the
|
|
17
|
+
* `rule_preferred` route; `neural_preferred` / `abstain` pass the neural tree through untouched.
|
|
18
|
+
*
|
|
19
|
+
* The edits (DeepSeek-coordinated, 2026-06-17):
|
|
20
|
+
*
|
|
21
|
+
* 1. **Relabel** — when a rule proposal covers the EXACT span of a neural node but assigns a different
|
|
22
|
+
* tag, take the rule's tag (the genuine same-span disagreement; rule wins under
|
|
23
|
+
* `rule_preferred`). Structure unchanged — only the node's tag/provenance.
|
|
24
|
+
* 2. **Add missing tags** — a rule proposal whose tag is absent from the neural tree AND whose span
|
|
25
|
+
* doesn't overlap any neural node is added as a new root (a component neural missed
|
|
26
|
+
* entirely).
|
|
27
|
+
*
|
|
28
|
+
* What it deliberately does NOT do: replace a neural node with a differently-spanned rule node,
|
|
29
|
+
* drop neural's sub-component decomposition (`street_suffix`/`street_prefix`), or add an
|
|
30
|
+
* overlapping rule node. So a clean address — where neural and v0 agree on tags+spans and differ
|
|
31
|
+
* only in street decomposition — is a **no-op**. The cost is losing pure-decomposition wins (low
|
|
32
|
+
* value); the gate re-run is the arbiter.
|
|
33
|
+
*/
|
|
34
|
+
/**
 * Deep-copy a tree node so that arbitration edits never touch the caller's tree.
 *
 * Own fields are copied shallowly; `children` is rebuilt recursively. A node that carries no
 * `children` array is treated as a leaf and comes back with `children: []`.
 */
function cloneNode(node) {
    const children = Array.isArray(node.children) ? node.children : [];
    return { ...node, children: children.map((child) => cloneNode(child)) };
}
/**
 * Interval overlap test on `start`/`end` offsets. Spans that merely touch (one ends exactly
 * where the other starts) do NOT count as overlapping.
 */
function spansOverlap(aStart, aEnd, bStart, bEnd) {
    return aStart < bEnd && bStart < aEnd;
}
/**
 * Edit the nested neural argmax tree with the solved v0 (rule) parse under the `rule_preferred`
 * route. Containment-preserving: nodes are never flattened, moved, or removed. The input tree is
 * not mutated; a cloned tree is returned with its roots sorted by `start`.
 *
 * Two edits are applied:
 *
 * 1. **Relabel** — a node whose EXACT span is covered by rule proposals, none of which carries the
 *    node's current tag, takes the tag, confidence and `source_id` of the first such proposal and
 *    is stamped `source: "rule"`. If any rule proposal on that span already agrees with the node's
 *    tag there is no genuine disagreement, and the node is left alone.
 * 2. **Add missing tags** — a rule proposal whose tag is absent from the (post-relabel) tree AND
 *    whose span overlaps no node in the tree is appended as a new childless root. Each tag is
 *    added at most once, and an added span blocks later overlapping additions.
 *
 * @param tree The neural argmax `AddressTree`.
 * @param ruleProposals Proposals from the solved v0 parse (`solutionToProposals`).
 * @returns A new tree: `tree`'s own fields with the edited `roots`.
 */
export function applyRuleArbitration(tree, ruleProposals) {
    const roots = tree.roots.map((root) => cloneNode(root));
    // Inventory of the tree AFTER relabeling: which tags exist, and every node span (overlap guard).
    const presentTags = new Set();
    const occupiedSpans = [];
    // Single pre-order pass: relabel a node, then record its (possibly new) tag and its span.
    const visit = (node) => {
        const sameSpan = ruleProposals.filter((p) => p.span.start === node.start && p.span.end === node.end);
        // Only a genuine disagreement relabels: rule covers this exact span and never proposes
        // the tag the node already has.
        const ruleAgrees = sameSpan.some((p) => p.component === node.tag);
        if (!ruleAgrees && sameSpan.length > 0) {
            const winner = sameSpan[0];
            node.tag = winner.component;
            node.source = "rule";
            node.confidence = winner.confidence;
            node.sourceId = winner.source_id;
        }
        presentTags.add(node.tag);
        occupiedSpans.push({ start: node.start, end: node.end });
        for (const child of node.children) {
            visit(child);
        }
    };
    for (const root of roots) {
        visit(root);
    }
    // Add: a rule tag the tree lacks entirely, on a span that overlaps no existing node.
    for (const proposal of ruleProposals) {
        const { start, end, body } = proposal.span;
        if (presentTags.has(proposal.component)) {
            continue;
        }
        if (occupiedSpans.some((s) => spansOverlap(s.start, s.end, start, end))) {
            continue;
        }
        roots.push({
            tag: proposal.component,
            value: body,
            start,
            end,
            confidence: proposal.confidence,
            children: [],
            source: proposal.source,
            sourceId: proposal.source_id,
        });
        presentTags.add(proposal.component); // a tag is added at most once
        occupiedSpans.push({ start, end });
    }
    roots.sort((a, b) => a.start - b.start);
    return { ...tree, roots };
}
|
|
97
|
+
//# sourceMappingURL=arbitrate-tree.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"arbitrate-tree.js","sourceRoot":"","sources":["../../decoder/arbitrate-tree.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AAKH,SAAS,SAAS,CAAC,IAAiB;IACnC,OAAO,EAAE,GAAG,IAAI,EAAE,QAAQ,EAAE,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,SAAS,CAAC,EAAE,CAAA;AAC3D,CAAC;AAED,SAAS,YAAY,CAAC,MAAc,EAAE,IAAY,EAAE,MAAc,EAAE,IAAY;IAC/E,OAAO,MAAM,GAAG,IAAI,IAAI,MAAM,GAAG,IAAI,CAAA;AACtC,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,oBAAoB,CAAC,IAAiB,EAAE,aAAgD;IACvG,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,SAAS,CAAC,CAAA;IAEvC,oGAAoG;IACpG,MAAM,OAAO,GAAG,CAAC,IAAiB,EAAQ,EAAE;QAC3C,MAAM,GAAG,GAAG,aAAa,CAAC,IAAI,CAC7B,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,KAAK,IAAI,CAAC,CAAC,IAAI,CAAC,GAAG,KAAK,IAAI,CAAC,GAAG,IAAI,CAAC,CAAC,SAAS,KAAK,IAAI,CAAC,GAAG,CACzF,CAAA;QACD,IAAI,GAAG,EAAE,CAAC;YACT,IAAI,CAAC,GAAG,GAAG,GAAG,CAAC,SAAS,CAAA;YACxB,IAAI,CAAC,MAAM,GAAG,MAAM,CAAA;YACpB,IAAI,CAAC,UAAU,GAAG,GAAG,CAAC,UAAU,CAAA;YAChC,IAAI,CAAC,QAAQ,GAAG,GAAG,CAAC,SAAS,CAAA;QAC9B,CAAC;QACD,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,QAAQ;YAAE,OAAO,CAAC,KAAK,CAAC,CAAA;IAClD,CAAC,CAAA;IACD,KAAK,MAAM,IAAI,IAAI,KAAK;QAAE,OAAO,CAAC,IAAI,CAAC,CAAA;IAEvC,yFAAyF;IACzF,MAAM,UAAU,GAAG,IAAI,GAAG,EAAU,CAAA;IACpC,MAAM,WAAW,GAA0C,EAAE,CAAA;IAC7D,MAAM,OAAO,GAAG,CAAC,IAAiB,EAAQ,EAAE;QAC3C,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;QACxB,WAAW,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,GAAG,EAAE,IAAI,CAAC,GAAG,EAAE,CAAC,CAAA;QACtD,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,QAAQ;YAAE,OAAO,CAAC,KAAK,CAAC,CAAA;IAClD,CAAC,CAAA;IACD,KAAK,MAAM,IAAI,IAAI,KAAK;QAAE,OAAO,CAAC,IAAI,CAAC,CAAA;IAEvC,6FAA6F;IAC7F,KAAK,MAAM,CAAC,IAAI,aAAa,EAAE,CAAC;QAC/B,IAAI,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC;YAAE,SAAQ;QACzC,IAAI,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,YAAY,CAAC,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YAAE,SAAQ;QAC7F,KAAK,CAAC,IAAI,CAAC;YACV,GAAG,EAAE,CAAC,CAAC,SAAS;YAChB,KAAK,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI;YAClB,KAAK,EAAE,CAA
C,CAAC,IAAI,CAAC,KAAK;YACnB,GAAG,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG;YACf,UAAU,EAAE,CAAC,CAAC,UAAU;YACxB,QAAQ,EAAE,EAAE;YACZ,MAAM,EAAE,CAAC,CAAC,MAAM;YAChB,QAAQ,EAAE,CAAC,CAAC,SAAS;SACrB,CAAC,CAAA;QACF,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC,SAAS,CAAC,CAAA,CAAC,8BAA8B;QAC1D,WAAW,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC,CAAA;IAC3D,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAA;IACvC,OAAO,EAAE,GAAG,IAAI,EAAE,KAAK,EAAE,CAAA;AAC1B,CAAC"}
|
package/out/decoder/index.d.ts
CHANGED
|
@@ -3,10 +3,12 @@
|
|
|
3
3
|
* @license AGPL-3.0
|
|
4
4
|
* @author Teffen Ellis, et al.
|
|
5
5
|
*/
|
|
6
|
+
export * from "./arbitrate-tree.js";
|
|
6
7
|
export * from "./build-tree.js";
|
|
7
8
|
export * from "./calibration.js";
|
|
8
9
|
export * from "./containment.js";
|
|
9
10
|
export * from "./proposals-to-tree.js";
|
|
11
|
+
export * from "./resolve-proposal-overlaps.js";
|
|
10
12
|
export * from "./serialize-json.js";
|
|
11
13
|
export * from "./serialize-tuples.js";
|
|
12
14
|
export * from "./serialize-xml.js";
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../decoder/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,kBAAkB,CAAA;AAChC,cAAc,kBAAkB,CAAA;AAChC,cAAc,wBAAwB,CAAA;AACtC,cAAc,qBAAqB,CAAA;AACnC,cAAc,uBAAuB,CAAA;AACrC,cAAc,oBAAoB,CAAA;AAClC,cAAc,YAAY,CAAA;AAC1B,cAAc,oBAAoB,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../decoder/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,qBAAqB,CAAA;AACnC,cAAc,iBAAiB,CAAA;AAC/B,cAAc,kBAAkB,CAAA;AAChC,cAAc,kBAAkB,CAAA;AAChC,cAAc,wBAAwB,CAAA;AACtC,cAAc,gCAAgC,CAAA;AAC9C,cAAc,qBAAqB,CAAA;AACnC,cAAc,uBAAuB,CAAA;AACrC,cAAc,oBAAoB,CAAA;AAClC,cAAc,YAAY,CAAA;AAC1B,cAAc,oBAAoB,CAAA"}
|
package/out/decoder/index.js
CHANGED
|
@@ -3,10 +3,12 @@
|
|
|
3
3
|
* @license AGPL-3.0
|
|
4
4
|
* @author Teffen Ellis, et al.
|
|
5
5
|
*/
|
|
6
|
+
export * from "./arbitrate-tree.js";
|
|
6
7
|
export * from "./build-tree.js";
|
|
7
8
|
export * from "./calibration.js";
|
|
8
9
|
export * from "./containment.js";
|
|
9
10
|
export * from "./proposals-to-tree.js";
|
|
11
|
+
export * from "./resolve-proposal-overlaps.js";
|
|
10
12
|
export * from "./serialize-json.js";
|
|
11
13
|
export * from "./serialize-tuples.js";
|
|
12
14
|
export * from "./serialize-xml.js";
|
package/out/decoder/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../decoder/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,kBAAkB,CAAA;AAChC,cAAc,kBAAkB,CAAA;AAChC,cAAc,wBAAwB,CAAA;AACtC,cAAc,qBAAqB,CAAA;AACnC,cAAc,uBAAuB,CAAA;AACrC,cAAc,oBAAoB,CAAA;AAClC,cAAc,YAAY,CAAA;AAC1B,cAAc,oBAAoB,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../decoder/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,qBAAqB,CAAA;AACnC,cAAc,iBAAiB,CAAA;AAC/B,cAAc,kBAAkB,CAAA;AAChC,cAAc,kBAAkB,CAAA;AAChC,cAAc,wBAAwB,CAAA;AACtC,cAAc,gCAAgC,CAAA;AAC9C,cAAc,qBAAqB,CAAA;AACnC,cAAc,uBAAuB,CAAA;AACrC,cAAc,oBAAoB,CAAA;AAClC,cAAc,YAAY,CAAA;AAC1B,cAAc,oBAAoB,CAAA"}
|
|
@@ -13,7 +13,26 @@
|
|
|
13
13
|
* For consumers that need containment back, re-tokenize the input and run the full decoder
|
|
14
14
|
* pipeline.
|
|
15
15
|
*/
|
|
16
|
-
import type { ClassificationProposal } from "../types/index.js";
|
|
16
|
+
import type { ClassificationProposal, ClassificationProposalSource, ComponentTag } from "../types/index.js";
|
|
17
17
|
import type { AddressTree } from "./types.js";
|
|
18
18
|
export declare function proposalsToTree(raw: string, proposals: readonly ClassificationProposal[]): AddressTree;
|
|
19
|
+
/**
|
|
20
|
+
* The inverse of {@link proposalsToTree}: walk an `AddressTree` into a flat list of
|
|
21
|
+
* `ClassificationProposal`s (one per node, depth-first), tagged with the given `source` (#478
|
|
22
|
+
* increment 3). Used to bring the whole-text neural parse into the arbitration layer's proposal
|
|
23
|
+
* currency so it can be unioned with rule proposals and filtered by the policy registry.
|
|
24
|
+
*
|
|
25
|
+
* The spans are structural (`{ start, end, body }`) — we intentionally avoid `Span.from(...)`
|
|
26
|
+
* (which forces the tokenization module's filesystem-bound init); downstream proposal consumers
|
|
27
|
+
* read only `start` / `end` / `body`. Same convention as the neural proposal-classifier adapter.
|
|
28
|
+
*
|
|
29
|
+
* @param tree The parsed tree (e.g. the neural argmax tree).
|
|
30
|
+
* @param source Provenance stamped on every emitted proposal (`"neural"` here).
|
|
31
|
+
* @param opts.sourceId Optional stable id surfaced as `source_id`.
|
|
32
|
+
* @param opts.emits Optional tag allow-list; when set, only nodes with these tags are emitted.
|
|
33
|
+
*/
|
|
34
|
+
export declare function treeToProposals(tree: AddressTree, source: ClassificationProposalSource, opts?: {
|
|
35
|
+
sourceId?: string;
|
|
36
|
+
emits?: ReadonlySet<ComponentTag>;
|
|
37
|
+
}): ClassificationProposal[];
|
|
19
38
|
//# sourceMappingURL=proposals-to-tree.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"proposals-to-tree.d.ts","sourceRoot":"","sources":["../../decoder/proposals-to-tree.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;
|
|
1
|
+
{"version":3,"file":"proposals-to-tree.d.ts","sourceRoot":"","sources":["../../decoder/proposals-to-tree.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAGH,OAAO,KAAK,EAAE,sBAAsB,EAAE,4BAA4B,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAA;AAC3G,OAAO,KAAK,EAAe,WAAW,EAAE,MAAM,YAAY,CAAA;AAE1D,wBAAgB,eAAe,CAAC,GAAG,EAAE,MAAM,EAAE,SAAS,EAAE,SAAS,sBAAsB,EAAE,GAAG,WAAW,CAatG;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,eAAe,CAC9B,IAAI,EAAE,WAAW,EACjB,MAAM,EAAE,4BAA4B,EACpC,IAAI,GAAE;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,WAAW,CAAC,YAAY,CAAC,CAAA;CAAO,GACjE,sBAAsB,EAAE,CAqB1B"}
|
|
@@ -27,4 +27,41 @@ export function proposalsToTree(raw, proposals) {
|
|
|
27
27
|
roots.sort((a, b) => a.start - b.start);
|
|
28
28
|
return { raw, roots };
|
|
29
29
|
}
|
|
30
|
+
/**
 * Flatten an `AddressTree` into `ClassificationProposal`s — the inverse direction of
 * {@link proposalsToTree} (#478 increment 3).
 *
 * Nodes are visited depth-first in pre-order (a node before its children, siblings left to
 * right) and each visited node yields at most one proposal. A node excluded by `opts.emits` is
 * skipped, but its children are still visited.
 *
 * Emitted spans are plain `{ start, end, body }` objects built from the node's `start`, `end`
 * and `value`; no `Span.from(...)` is involved. Every proposal carries `penalty: 0`.
 *
 * @param tree The parsed tree (e.g. the neural argmax tree).
 * @param source Provenance stamped on every emitted proposal.
 * @param opts.sourceId Optional id used as `source_id` for every proposal. When absent, the
 *   node's own `sourceId` is used, falling back to `source`.
 * @param opts.emits Optional tag allow-list; when set, only nodes with these tags are emitted.
 * @returns The proposals, in visit order.
 */
export function treeToProposals(tree, source, opts = {}) {
    const allowedTags = opts.emits;
    const fixedSourceId = opts.sourceId;
    const collected = [];
    // Explicit stack instead of recursion; entries are pushed in reverse so they pop in order.
    const pending = [...tree.roots].reverse();
    while (pending.length > 0) {
        const current = pending.pop();
        const wanted = allowedTags ? allowedTags.has(current.tag) : true;
        if (wanted) {
            collected.push({
                span: { start: current.start, end: current.end, body: current.value },
                component: current.tag,
                confidence: current.confidence,
                source,
                source_id: fixedSourceId ?? current.sourceId ?? source,
                penalty: 0,
            });
        }
        for (let i = current.children.length - 1; i >= 0; i -= 1) {
            pending.push(current.children[i]);
        }
    }
    return collected;
}
|
|
30
67
|
//# sourceMappingURL=proposals-to-tree.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"proposals-to-tree.js","sourceRoot":"","sources":["../../decoder/proposals-to-tree.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;
|
|
1
|
+
{"version":3,"file":"proposals-to-tree.js","sourceRoot":"","sources":["../../decoder/proposals-to-tree.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAMH,MAAM,UAAU,eAAe,CAAC,GAAW,EAAE,SAA4C;IACxF,MAAM,KAAK,GAAkB,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QAClD,GAAG,EAAE,CAAC,CAAC,SAAyB;QAChC,KAAK,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI;QAClB,KAAK,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK;QACnB,GAAG,EAAE,CAAC,CAAC,IAAI,CAAC,GAAG;QACf,UAAU,EAAE,CAAC,CAAC,UAAU;QACxB,QAAQ,EAAE,EAAE;QACZ,MAAM,EAAE,CAAC,CAAC,MAAM;QAChB,QAAQ,EAAE,CAAC,CAAC,SAAS;KACrB,CAAC,CAAC,CAAA;IACH,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAA;IACvC,OAAO,EAAE,GAAG,EAAE,KAAK,EAAE,CAAA;AACtB,CAAC;AAED;;;;;;;;;;;;;;GAcG;AACH,MAAM,UAAU,eAAe,CAC9B,IAAiB,EACjB,MAAoC,EACpC,OAAiE,EAAE;IAEnE,MAAM,SAAS,GAA6B,EAAE,CAAA;IAC9C,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,GAAG,IAAI,CAAA;IAEhC,MAAM,KAAK,GAAG,CAAC,IAAiB,EAAQ,EAAE;QACzC,IAAI,CAAC,KAAK,IAAI,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;YACnC,MAAM,IAAI,GAAG,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,GAAG,EAAE,IAAI,CAAC,GAAG,EAAE,IAAI,EAAE,IAAI,CAAC,KAAK,EAAqB,CAAA;YACtF,SAAS,CAAC,IAAI,CAAC;gBACd,IAAI;gBACJ,SAAS,EAAE,IAAI,CAAC,GAAG;gBACnB,UAAU,EAAE,IAAI,CAAC,UAAU;gBAC3B,MAAM;gBACN,SAAS,EAAE,QAAQ,IAAI,IAAI,CAAC,QAAQ,IAAI,MAAM;gBAC9C,OAAO,EAAE,CAAC;aACV,CAAC,CAAA;QACH,CAAC;QACD,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,QAAQ;YAAE,KAAK,CAAC,KAAK,CAAC,CAAA;IAChD,CAAC,CAAA;IAED,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK;QAAE,KAAK,CAAC,IAAI,CAAC,CAAA;IAC1C,OAAO,SAAS,CAAA;AACjB,CAAC"}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Coherence pass for arbitrated proposals (#478 increment 3).
|
|
7
|
+
*
|
|
8
|
+
* The arbitration layer unions proposals from multiple sources (whole-text `neural`, per-section
|
|
9
|
+
* `rule`) and filters them per-component via the policy registry. That per-_tag_ filter is blind
|
|
10
|
+
* to cross-_tag_ span overlap: a `neural` street span `[0,11]` ("350 5th Ave") and a `rule`
|
|
11
|
+
* house_number `[0,3]` ("350") can both survive — different tags, overlapping spans. Fed straight
|
|
12
|
+
* into {@link proposalsToTree} (which emits one flat root node per proposal, no overlap handling)
|
|
13
|
+
* that yields an incoherent tree with overlapping nodes, which degrades or breaks the resolver.
|
|
14
|
+
*
|
|
15
|
+
* This pass guarantees the invariant {@link proposalsToTree} needs: **no two surviving proposals
|
|
16
|
+
* have overlapping spans.** It is a greedy interval selection — accept proposals in priority
|
|
17
|
+
* order, skip any that overlap an already-accepted span.
|
|
18
|
+
*
|
|
19
|
+
* ## The selection policy (the gate-tunable lever)
|
|
20
|
+
*
|
|
21
|
+
* Priority is **confidence desc, then shorter span first, then earlier start**:
|
|
22
|
+
*
|
|
23
|
+
* - _Confidence primary_ respects the arbitration that already happened — a source the registry kept
|
|
24
|
+
* at high confidence wins its span.
|
|
25
|
+
* - _Shorter-span-first on ties_ preserves finer decompositions: given equal-confidence
|
|
26
|
+
* `street[0,11]` vs `{house_number[0,3], street[4,11]}`, the two finer spans are accepted and
|
|
27
|
+
* the coarse subsuming span is dropped — keeping the street+house_number precondition intact
|
|
28
|
+
* (the thing #566 broke). The neural argmax path labels per-token, so it normally emits the
|
|
29
|
+
* finer decomposition itself; this tiebreak is the safety net when a coarse rule span
|
|
30
|
+
* competes.
|
|
31
|
+
*
|
|
32
|
+
* This policy is deliberately simple and deterministic. It is the lever the inc-3 assembled gate
|
|
33
|
+
* validates: if it drops too many house numbers (precondition regression) the comparator is where
|
|
34
|
+
* to look. (An alternative — earliest-end-first maximal-tiling, ignoring confidence — maximizes
|
|
35
|
+
* the _count_ of non-overlapping spans but can let a spurious tiny span evict a correct large
|
|
36
|
+
* one; confidence-primary guards against that.)
|
|
37
|
+
*
|
|
38
|
+
* Pure module: reads only `span.{start,end}` + `confidence`. Safe to import anywhere.
|
|
39
|
+
*/
|
|
40
|
+
import type { ClassificationProposal } from "../types/index.js";
|
|
41
|
+
/**
|
|
42
|
+
* Reduce a set of (possibly overlapping) arbitrated proposals to a coherent, non-overlapping set
|
|
43
|
+
* via greedy interval selection. The output is sorted by span start (the order
|
|
44
|
+
* {@link proposalsToTree} expects). Input is not mutated.
|
|
45
|
+
*
|
|
46
|
+
* @param proposals Arbitrated proposals (post policy-registry filter), any source, possibly
|
|
47
|
+
* overlapping.
|
|
48
|
+
*
|
|
49
|
+
* @returns A subset with no two spans overlapping, in span-start order.
|
|
50
|
+
*/
|
|
51
|
+
export declare function resolveProposalOverlaps(proposals: readonly ClassificationProposal[]): ClassificationProposal[];
|
|
52
|
+
//# sourceMappingURL=resolve-proposal-overlaps.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"resolve-proposal-overlaps.d.ts","sourceRoot":"","sources":["../../decoder/resolve-proposal-overlaps.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAsCG;AAEH,OAAO,KAAK,EAAE,sBAAsB,EAAE,MAAM,mBAAmB,CAAA;AAO/D;;;;;;;;;GASG;AACH,wBAAgB,uBAAuB,CAAC,SAAS,EAAE,SAAS,sBAAsB,EAAE,GAAG,sBAAsB,EAAE,CAmB9G"}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Coherence pass for arbitrated proposals (#478 increment 3).
|
|
7
|
+
*
|
|
8
|
+
* The arbitration layer unions proposals from multiple sources (whole-text `neural`, per-section
|
|
9
|
+
* `rule`) and filters them per-component via the policy registry. That per-_tag_ filter is blind
|
|
10
|
+
* to cross-_tag_ span overlap: a `neural` street span `[0,11]` ("350 5th Ave") and a `rule`
|
|
11
|
+
* house_number `[0,3]` ("350") can both survive — different tags, overlapping spans. Fed straight
|
|
12
|
+
* into {@link proposalsToTree} (which emits one flat root node per proposal, no overlap handling)
|
|
13
|
+
* that yields an incoherent tree with overlapping nodes, which degrades or breaks the resolver.
|
|
14
|
+
*
|
|
15
|
+
* This pass guarantees the invariant {@link proposalsToTree} needs: **no two surviving proposals
|
|
16
|
+
* have overlapping spans.** It is a greedy interval selection — accept proposals in priority
|
|
17
|
+
* order, skip any that overlap an already-accepted span.
|
|
18
|
+
*
|
|
19
|
+
* ## The selection policy (the gate-tunable lever)
|
|
20
|
+
*
|
|
21
|
+
* Priority is **confidence desc, then shorter span first, then earlier start**:
|
|
22
|
+
*
|
|
23
|
+
* - _Confidence primary_ respects the arbitration that already happened — a source the registry kept
|
|
24
|
+
* at high confidence wins its span.
|
|
25
|
+
* - _Shorter-span-first on ties_ preserves finer decompositions: given equal-confidence
|
|
26
|
+
* `street[0,11]` vs `{house_number[0,3], street[4,11]}`, the two finer spans are accepted and
|
|
27
|
+
* the coarse subsuming span is dropped — keeping the street+house_number precondition intact
|
|
28
|
+
* (the thing #566 broke). The neural argmax path labels per-token, so it normally emits the
|
|
29
|
+
* finer decomposition itself; this tiebreak is the safety net when a coarse rule span
|
|
30
|
+
* competes.
|
|
31
|
+
*
|
|
32
|
+
* This policy is deliberately simple and deterministic. It is the lever the inc-3 assembled gate
|
|
33
|
+
* validates: if it drops too many house numbers (precondition regression) the comparator is where
|
|
34
|
+
* to look. (An alternative — earliest-end-first maximal-tiling, ignoring confidence — maximizes
|
|
35
|
+
* the _count_ of non-overlapping spans but can let a spurious tiny span evict a correct large
|
|
36
|
+
* one; confidence-primary guards against that.)
|
|
37
|
+
*
|
|
38
|
+
* Pure module: reads only `span.{start,end}` + `confidence`. Safe to import anywhere.
|
|
39
|
+
*/
|
|
40
|
+
/**
 * Overlap test for two spans treated as half-open ranges `[start, end)`.
 *
 * Returns `true` exactly when `a.start < b.end` and `b.start < a.end`. For non-empty ranges
 * that means the two share at least one position; ranges that merely touch
 * (`a.end === b.start`) are reported as not overlapping.
 *
 * @param a First span; only `start` and `end` are read.
 * @param b Second span; only `start` and `end` are read.
 *
 * @returns Whether the two ranges intersect.
 */
function spansOverlap(a, b) {
    // `a` must begin before `b` finishes...
    const aBeginsBeforeBEnds = a.start < b.end;
    if (!aBeginsBeforeBEnds) {
        return false;
    }
    // ...and `b` must begin before `a` finishes.
    return b.start < a.end;
}
|
|
44
|
+
/**
 * Pick a non-overlapping subset of arbitrated proposals by greedy interval selection.
 *
 * Candidates are ranked by descending `confidence`, then by shorter span, then by earlier
 * `span.start`. They are visited in that order, and a candidate is accepted only if its span
 * does not overlap any span accepted before it. The accepted proposals are returned sorted by
 * ascending `span.start` (the order handed on to {@link proposalsToTree}).
 *
 * The caller's array is never reordered: ranking is done on a shallow copy, and a fresh array
 * is returned even for the trivial zero/one-element case.
 *
 * @param proposals Arbitrated proposals (post policy-registry filter), any source, possibly
 *   overlapping.
 *
 * @returns A subset with no two spans overlapping, in span-start order.
 */
export function resolveProposalOverlaps(proposals) {
    // Nothing can conflict with fewer than two proposals; still return a copy.
    if (proposals.length <= 1) {
        return [...proposals];
    }
    const spanLength = (candidate) => candidate.span.end - candidate.span.start;
    const byPriority = (left, right) => {
        // 1. Stronger confidence is ranked ahead.
        if (right.confidence !== left.confidence) {
            return right.confidence - left.confidence;
        }
        // 2. On equal confidence the finer (shorter) span is ranked ahead, so a decomposition
        //    beats the coarse span that would subsume it.
        const leftLength = spanLength(left);
        const rightLength = spanLength(right);
        if (leftLength !== rightLength) {
            return leftLength - rightLength;
        }
        // 3. Final tiebreak: whichever span begins first.
        return left.span.start - right.span.start;
    };
    const candidates = [...proposals];
    candidates.sort(byPriority);
    const accepted = [];
    for (const candidate of candidates) {
        const collides = accepted.some((winner) => spansOverlap(winner.span, candidate.span));
        if (collides) {
            continue; // a higher-priority proposal already owns part of this range
        }
        accepted.push(candidate);
    }
    // Restore textual order for the caller.
    accepted.sort((left, right) => left.span.start - right.span.start);
    return accepted;
}
|
|
74
|
+
//# sourceMappingURL=resolve-proposal-overlaps.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"resolve-proposal-overlaps.js","sourceRoot":"","sources":["../../decoder/resolve-proposal-overlaps.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAsCG;AAIH,mGAAmG;AACnG,SAAS,YAAY,CAAC,CAAiC,EAAE,CAAiC;IACzF,OAAO,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,CAAA;AAC1C,CAAC;AAED;;;;;;;;;GASG;AACH,MAAM,UAAU,uBAAuB,CAAC,SAA4C;IACnF,IAAI,SAAS,CAAC,MAAM,IAAI,CAAC;QAAE,OAAO,CAAC,GAAG,SAAS,CAAC,CAAA;IAEhD,MAAM,MAAM,GAAG,CAAC,GAAG,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QAC3C,IAAI,CAAC,CAAC,UAAU,KAAK,CAAC,CAAC,UAAU;YAAE,OAAO,CAAC,CAAC,UAAU,GAAG,CAAC,CAAC,UAAU,CAAA,CAAC,0BAA0B;QAChG,MAAM,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,CAAC,KAAK,CAAA;QACtC,MAAM,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,CAAC,KAAK,CAAA;QACtC,IAAI,IAAI,KAAK,IAAI;YAAE,OAAO,IAAI,GAAG,IAAI,CAAA,CAAC,uDAAuD;QAC7F,OAAO,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,IAAI,CAAC,KAAK,CAAA,CAAC,8CAA8C;IAClF,CAAC,CAAC,CAAA;IAEF,MAAM,IAAI,GAA6B,EAAE,CAAA;IACzC,KAAK,MAAM,QAAQ,IAAI,MAAM,EAAE,CAAC;QAC/B,IAAI,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,IAAI,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC,EAAE,CAAC;YAC7D,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;QACpB,CAAC;IACF,CAAC;IAED,OAAO,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;AACxD,CAAC"}
|
package/out/parser/index.d.ts
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../parser/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,oBAAoB,CAAA;AAClC,cAAc,wBAAwB,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../parser/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,oBAAoB,CAAA;AAClC,cAAc,wBAAwB,CAAA;AACtC,cAAc,4BAA4B,CAAA"}
|
package/out/parser/index.js
CHANGED
package/out/parser/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../parser/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,oBAAoB,CAAA;AAClC,cAAc,wBAAwB,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../parser/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,oBAAoB,CAAA;AAClC,cAAc,wBAAwB,CAAA;AACtC,cAAc,4BAA4B,CAAA"}
|
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
*
|
|
15
15
|
* Pure module: no resource imports, no top-level await. Safe to import from anywhere.
|
|
16
16
|
*/
|
|
17
|
+
import type { InputShapeRoute } from "../policy/input-shape-router.js";
|
|
17
18
|
import type { PolicyRegistry } from "../policy/policy.js";
|
|
18
19
|
import type { Span, TokenContext } from "../tokenization/index.js";
|
|
19
20
|
import { type ClassificationProposal, type ClassifierContext, type ProposalClassifier } from "../types/index.js";
|
|
@@ -26,10 +27,20 @@ import { type ClassificationProposal, type ClassifierContext, type ProposalClass
|
|
|
26
27
|
*/
|
|
27
28
|
export declare function collectProposals(sections: readonly Span[], classifiers: readonly ProposalClassifier[], context?: ClassifierContext): Promise<ClassificationProposal[]>;
|
|
28
29
|
/**
|
|
29
|
-
* Optional policy filter.
|
|
30
|
-
*
|
|
30
|
+
* Optional policy filter.
|
|
31
|
+
*
|
|
32
|
+
* Resolution order:
|
|
33
|
+
*
|
|
34
|
+
* 1. An explicit `policy` registry is authoritative (the operator's config wins entirely).
|
|
35
|
+
* 2. Otherwise, when an input-shape `routerPrior` is supplied (#478 increment 2), build a registry
|
|
36
|
+
* whose default mode is the routed prior and apply that.
|
|
37
|
+
* 3. Otherwise return the input unchanged.
|
|
38
|
+
*
|
|
39
|
+
* Increment 2 ships with no production caller passing `routerPrior` (the production `runPipeline`
|
|
40
|
+
* does not yet feed live signals — that is increment 3), so this stays byte-stable by default. The
|
|
41
|
+
* seam is exercised by the router/proposal-pipeline tests.
|
|
31
42
|
*/
|
|
32
|
-
export declare function filterByPolicy(proposals: readonly ClassificationProposal[], policy: PolicyRegistry | undefined, locale: string | undefined): ClassificationProposal[];
|
|
43
|
+
export declare function filterByPolicy(proposals: readonly ClassificationProposal[], policy: PolicyRegistry | undefined, locale: string | undefined, routerPrior?: InputShapeRoute): ClassificationProposal[];
|
|
33
44
|
/**
|
|
34
45
|
* Locate the context Span whose [start, end] best matches the given char range.
|
|
35
46
|
*
|
|
@@ -60,6 +71,8 @@ export declare function runProposalPipeline(context: TokenContext, classifiers:
|
|
|
60
71
|
policy?: PolicyRegistry;
|
|
61
72
|
locale?: string;
|
|
62
73
|
classifierContext?: ClassifierContext;
|
|
74
|
+
/** Input-shape routed prior (#478 increment 2). Applied only when `policy` is absent. */
|
|
75
|
+
routerPrior?: InputShapeRoute;
|
|
63
76
|
}): Promise<{
|
|
64
77
|
proposals: ClassificationProposal[];
|
|
65
78
|
writeback: WritebackResult;
|