@silupanda/label-score 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +694 -0
- package/dist/__tests__/cohen-kappa.test.d.ts +2 -0
- package/dist/__tests__/cohen-kappa.test.d.ts.map +1 -0
- package/dist/__tests__/cohen-kappa.test.js +110 -0
- package/dist/__tests__/cohen-kappa.test.js.map +1 -0
- package/dist/__tests__/fleiss-kappa.test.d.ts +2 -0
- package/dist/__tests__/fleiss-kappa.test.d.ts.map +1 -0
- package/dist/__tests__/fleiss-kappa.test.js +84 -0
- package/dist/__tests__/fleiss-kappa.test.js.map +1 -0
- package/dist/__tests__/gwet-ac1.test.d.ts +2 -0
- package/dist/__tests__/gwet-ac1.test.d.ts.map +1 -0
- package/dist/__tests__/gwet-ac1.test.js +74 -0
- package/dist/__tests__/gwet-ac1.test.js.map +1 -0
- package/dist/__tests__/interpret.test.d.ts +2 -0
- package/dist/__tests__/interpret.test.d.ts.map +1 -0
- package/dist/__tests__/interpret.test.js +66 -0
- package/dist/__tests__/interpret.test.js.map +1 -0
- package/dist/__tests__/krippendorff-alpha.test.d.ts +2 -0
- package/dist/__tests__/krippendorff-alpha.test.d.ts.map +1 -0
- package/dist/__tests__/krippendorff-alpha.test.js +98 -0
- package/dist/__tests__/krippendorff-alpha.test.js.map +1 -0
- package/dist/__tests__/scott-pi.test.d.ts +2 -0
- package/dist/__tests__/scott-pi.test.d.ts.map +1 -0
- package/dist/__tests__/scott-pi.test.js +60 -0
- package/dist/__tests__/scott-pi.test.js.map +1 -0
- package/dist/__tests__/types.test.d.ts +2 -0
- package/dist/__tests__/types.test.d.ts.map +1 -0
- package/dist/__tests__/types.test.js +92 -0
- package/dist/__tests__/types.test.js.map +1 -0
- package/dist/__tests__/validate.test.d.ts +2 -0
- package/dist/__tests__/validate.test.d.ts.map +1 -0
- package/dist/__tests__/validate.test.js +121 -0
- package/dist/__tests__/validate.test.js.map +1 -0
- package/dist/index.d.ts +9 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +24 -0
- package/dist/index.js.map +1 -0
- package/dist/interpret.d.ts +23 -0
- package/dist/interpret.d.ts.map +1 -0
- package/dist/interpret.js +48 -0
- package/dist/interpret.js.map +1 -0
- package/dist/metrics/cohen-kappa.d.ts +16 -0
- package/dist/metrics/cohen-kappa.d.ts.map +1 -0
- package/dist/metrics/cohen-kappa.js +130 -0
- package/dist/metrics/cohen-kappa.js.map +1 -0
- package/dist/metrics/fleiss-kappa.d.ts +17 -0
- package/dist/metrics/fleiss-kappa.d.ts.map +1 -0
- package/dist/metrics/fleiss-kappa.js +63 -0
- package/dist/metrics/fleiss-kappa.js.map +1 -0
- package/dist/metrics/gwet-ac1.d.ts +19 -0
- package/dist/metrics/gwet-ac1.d.ts.map +1 -0
- package/dist/metrics/gwet-ac1.js +82 -0
- package/dist/metrics/gwet-ac1.js.map +1 -0
- package/dist/metrics/krippendorff-alpha.d.ts +21 -0
- package/dist/metrics/krippendorff-alpha.d.ts.map +1 -0
- package/dist/metrics/krippendorff-alpha.js +162 -0
- package/dist/metrics/krippendorff-alpha.js.map +1 -0
- package/dist/metrics/scott-pi.d.ts +18 -0
- package/dist/metrics/scott-pi.d.ts.map +1 -0
- package/dist/metrics/scott-pi.js +73 -0
- package/dist/metrics/scott-pi.js.map +1 -0
- package/dist/types.d.ts +119 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/validate.d.ts +33 -0
- package/dist/validate.d.ts.map +1 -0
- package/dist/validate.js +83 -0
- package/dist/validate.js.map +1 -0
- package/package.json +33 -0
package/README.md
ADDED
|
@@ -0,0 +1,694 @@
|
|
|
1
|
+
# label-score
|
|
2
|
+
|
|
3
|
+
Inter-annotator agreement metrics for JavaScript and TypeScript. Zero dependencies.
|
|
4
|
+
|
|
5
|
+
[](https://www.npmjs.com/package/label-score)
|
|
6
|
+
[](https://www.npmjs.com/package/label-score)
|
|
7
|
+
[](https://github.com/SiluPanda/label-score/blob/master/LICENSE)
|
|
8
|
+
[](https://nodejs.org)
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## Description
|
|
13
|
+
|
|
14
|
+
`label-score` computes chance-corrected agreement metrics for annotation data -- labels assigned to items by multiple annotators (human or machine). It implements five standard inter-annotator agreement (IAA) metrics, each returning a structured result object with the computed value, an interpretation label, and all intermediate quantities (observed agreement, expected agreement, category lists, annotator/item counts).
|
|
15
|
+
|
|
16
|
+
Use cases include validating human annotation quality before model training, measuring LLM-as-judge consistency across multiple model evaluators, building gold-standard evaluation datasets, and reporting IAA in research papers. All computations are implemented in pure TypeScript with zero runtime dependencies. Numerical outputs are verified against established Python implementations (scikit-learn, NLTK, krippendorff).
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
npm install label-score
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
---
|
|
27
|
+
|
|
28
|
+
## Quick Start
|
|
29
|
+
|
|
30
|
+
```ts
|
|
31
|
+
import {
|
|
32
|
+
cohenKappa,
|
|
33
|
+
fleissKappa,
|
|
34
|
+
scottPi,
|
|
35
|
+
krippendorffAlpha,
|
|
36
|
+
gwetAC1,
|
|
37
|
+
} from 'label-score';
|
|
38
|
+
|
|
39
|
+
// Cohen's Kappa -- two raters, categorical labels
|
|
40
|
+
const kappa = cohenKappa(['A', 'B', 'C', 'A'], ['A', 'B', 'A', 'A']);
|
|
41
|
+
console.log(kappa.value); // 0.5555...
|
|
42
|
+
console.log(kappa.interpretation); // 'moderate'
|
|
43
|
+
|
|
44
|
+
// Fleiss' Kappa -- multiple raters via category-count matrix
|
|
45
|
+
const fleiss = fleissKappa([
|
|
46
|
+
[4, 0, 0], // all 4 raters chose category 0
|
|
47
|
+
[0, 3, 1], // 3 raters chose category 1, 1 chose category 2
|
|
48
|
+
[0, 0, 4], // all 4 raters chose category 2
|
|
49
|
+
]);
|
|
50
|
+
console.log(fleiss.value); // 0.7446...
|
|
51
|
+
console.log(fleiss.annotatorCount); // 4
|
|
52
|
+
|
|
53
|
+
// Scott's Pi -- two raters, joint marginals
|
|
54
|
+
const pi = scottPi(['A', 'B', 'C'], ['A', 'B', 'A']);
|
|
55
|
+
console.log(pi.value);
|
|
56
|
+
|
|
57
|
+
// Krippendorff's Alpha -- multiple raters, handles missing data
|
|
58
|
+
const alpha = krippendorffAlpha([
|
|
59
|
+
['A', null, 'C', 'A'],
|
|
60
|
+
['A', 'B', 'C', null],
|
|
61
|
+
]);
|
|
62
|
+
console.log(alpha.value);
|
|
63
|
+
console.log(alpha.missingCount); // 2
|
|
64
|
+
|
|
65
|
+
// Gwet's AC1 -- robust to prevalence effects
|
|
66
|
+
const ac1 = gwetAC1(['Y', 'Y', 'Y', 'N'], ['Y', 'Y', 'N', 'N']);
|
|
67
|
+
console.log(ac1.value);
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## Features
|
|
73
|
+
|
|
74
|
+
- **Five agreement metrics** -- Cohen's Kappa, Fleiss' Kappa, Scott's Pi, Krippendorff's Alpha, and Gwet's AC1.
|
|
75
|
+
- **Weighted kappa** -- Linear and quadratic weighting schemes for ordinal data in Cohen's Kappa.
|
|
76
|
+
- **Missing data support** -- Krippendorff's Alpha excludes or rejects missing annotations via configuration.
|
|
77
|
+
- **Multiple measurement levels** -- Nominal, ordinal, interval, and ratio distance functions for Krippendorff's Alpha.
|
|
78
|
+
- **Automatic interpretation** -- Every result includes a human-readable interpretation label based on published scales (Landis & Koch for kappa-family; Krippendorff's thresholds for alpha).
|
|
79
|
+
- **Input validation** -- Descriptive errors for mismatched array lengths, empty inputs, inconsistent matrices, and insufficient annotators.
|
|
80
|
+
- **Duplicate detection** -- Utility to find repeated (item, annotator) pairs in annotation triple data.
|
|
81
|
+
- **Full TypeScript support** -- All functions, options, and result types are exported and fully typed.
|
|
82
|
+
- **Zero dependencies** -- Pure TypeScript, no runtime dependencies.
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## API Reference
|
|
87
|
+
|
|
88
|
+
### Metric Functions
|
|
89
|
+
|
|
90
|
+
#### `cohenKappa(rater1, rater2, options?)`
|
|
91
|
+
|
|
92
|
+
Computes Cohen's Kappa for two raters. Supports unweighted (nominal) and weighted (ordinal) variants.
|
|
93
|
+
|
|
94
|
+
**Parameters:**
|
|
95
|
+
|
|
96
|
+
| Parameter | Type | Description |
|
|
97
|
+
|-----------|------|-------------|
|
|
98
|
+
| `rater1` | `Label[]` | Labels assigned by the first rater. |
|
|
99
|
+
| `rater2` | `Label[]` | Labels assigned by the second rater. Must have the same length as `rater1`. |
|
|
100
|
+
| `options` | `CohensKappaOptions` | Optional. Configuration for weighting and confidence intervals. |
|
|
101
|
+
|
|
102
|
+
**`CohensKappaOptions`:**
|
|
103
|
+
|
|
104
|
+
| Field | Type | Default | Description |
|
|
105
|
+
|-------|------|---------|-------------|
|
|
106
|
+
| `weighted` | `boolean` | `false` | Enable weighted kappa for ordinal data. |
|
|
107
|
+
| `weights` | `'linear' \| 'quadratic'` | `'linear'` | Weight scheme when `weighted` is `true`. |
|
|
108
|
+
| `ci` | `boolean` | -- | Reserved for future confidence interval support. |
|
|
109
|
+
| `ciLevel` | `number` | -- | Confidence level (e.g., `0.95`). |
|
|
110
|
+
| `ciBootstrapSamples` | `number` | -- | Number of bootstrap resamples. |
|
|
111
|
+
| `seed` | `number` | -- | Seed for reproducible bootstrap sampling. |
|
|
112
|
+
|
|
113
|
+
**Returns:** `KappaResult`
|
|
114
|
+
|
|
115
|
+
| Field | Type | Description |
|
|
116
|
+
|-------|------|-------------|
|
|
117
|
+
| `metric` | `MetricName` | Always `'cohens-kappa'`. |
|
|
118
|
+
| `value` | `number` | The computed kappa coefficient. |
|
|
119
|
+
| `observed` | `number` | Observed agreement proportion (Po). |
|
|
120
|
+
| `expected` | `number` | Expected agreement by chance (Pe). |
|
|
121
|
+
| `interpretation` | `Interpretation` | Landis & Koch interpretation label. |
|
|
122
|
+
| `categories` | `Label[]` | Sorted list of unique categories found in the data. |
|
|
123
|
+
| `ci` | `ConfidenceInterval` | Optional. Confidence interval if requested. |
|
|
124
|
+
|
|
125
|
+
**Example:**
|
|
126
|
+
|
|
127
|
+
```ts
|
|
128
|
+
// Unweighted kappa
|
|
129
|
+
const result = cohenKappa(['A', 'B', 'C'], ['A', 'B', 'A']);
|
|
130
|
+
// { metric: 'cohens-kappa', value: 0.5, observed: 0.6666..., ... }
|
|
131
|
+
|
|
132
|
+
// Weighted kappa for ordinal ratings
|
|
133
|
+
const weighted = cohenKappa([1, 2, 3, 4], [1, 2, 4, 4], {
|
|
134
|
+
weighted: true,
|
|
135
|
+
weights: 'quadratic',
|
|
136
|
+
});
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
---
|
|
140
|
+
|
|
141
|
+
#### `fleissKappa(matrix)`
|
|
142
|
+
|
|
143
|
+
Computes Fleiss' Kappa for multiple raters (N >= 2).
|
|
144
|
+
|
|
145
|
+
**Parameters:**
|
|
146
|
+
|
|
147
|
+
| Parameter | Type | Description |
|
|
148
|
+
|-----------|------|-------------|
|
|
149
|
+
| `matrix` | `number[][]` | Category-count matrix. `matrix[i][j]` is the number of raters who assigned category `j` to subject `i`. All rows must sum to the same value (the number of raters per subject). |
|
|
150
|
+
|
|
151
|
+
**Returns:** `FleissKappaResult`
|
|
152
|
+
|
|
153
|
+
| Field | Type | Description |
|
|
154
|
+
|-------|------|-------------|
|
|
155
|
+
| `metric` | `'fleiss-kappa'` | Always `'fleiss-kappa'`. |
|
|
156
|
+
| `value` | `number` | The computed Fleiss' Kappa coefficient. |
|
|
157
|
+
| `observed` | `number` | Mean observed agreement across subjects. |
|
|
158
|
+
| `expected` | `number` | Expected agreement by chance. |
|
|
159
|
+
| `interpretation` | `Interpretation` | Landis & Koch interpretation label. |
|
|
160
|
+
| `annotatorCount` | `number` | Number of raters per subject (row sum). |
|
|
161
|
+
| `itemCount` | `number` | Number of subjects (rows). |
|
|
162
|
+
| `ci` | `ConfidenceInterval` | Optional. Confidence interval if requested. |
|
|
163
|
+
|
|
164
|
+
**Example:**
|
|
165
|
+
|
|
166
|
+
```ts
|
|
167
|
+
const result = fleissKappa([
|
|
168
|
+
[4, 0, 0],
|
|
169
|
+
[0, 3, 1],
|
|
170
|
+
[0, 0, 4],
|
|
171
|
+
[1, 3, 0],
|
|
172
|
+
]);
|
|
173
|
+
// result.value: 0.6235...
|
|
174
|
+
// result.annotatorCount: 4
|
|
175
|
+
// result.itemCount: 4
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
**Throws:**
|
|
179
|
+
- If the matrix is empty.
|
|
180
|
+
- If rows have inconsistent lengths.
|
|
181
|
+
- If row sums are not equal.
|
|
182
|
+
- If fewer than 2 raters per subject.
|
|
183
|
+
- If fewer than 2 categories.
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
#### `scottPi(rater1, rater2, options?)`
|
|
188
|
+
|
|
189
|
+
Computes Scott's Pi for two raters. Uses joint (pooled) marginal proportions to compute expected agreement, making it more robust than Cohen's Kappa when rater biases differ.
|
|
190
|
+
|
|
191
|
+
**Parameters:**
|
|
192
|
+
|
|
193
|
+
| Parameter | Type | Description |
|
|
194
|
+
|-----------|------|-------------|
|
|
195
|
+
| `rater1` | `Label[]` | Labels assigned by the first rater. |
|
|
196
|
+
| `rater2` | `Label[]` | Labels assigned by the second rater. Must have the same length as `rater1`. |
|
|
197
|
+
| `options` | `ScottsPiOptions` | Optional. Reserved for future confidence interval support. |
|
|
198
|
+
|
|
199
|
+
**`ScottsPiOptions`:**
|
|
200
|
+
|
|
201
|
+
| Field | Type | Default | Description |
|
|
202
|
+
|-------|------|---------|-------------|
|
|
203
|
+
| `ci` | `boolean` | -- | Reserved for future confidence interval support. |
|
|
204
|
+
| `ciLevel` | `number` | -- | Confidence level. |
|
|
205
|
+
| `ciBootstrapSamples` | `number` | -- | Number of bootstrap resamples. |
|
|
206
|
+
| `seed` | `number` | -- | Seed for reproducible bootstrap sampling. |
|
|
207
|
+
|
|
208
|
+
**Returns:** `PiResult`
|
|
209
|
+
|
|
210
|
+
| Field | Type | Description |
|
|
211
|
+
|-------|------|-------------|
|
|
212
|
+
| `metric` | `'scotts-pi'` | Always `'scotts-pi'`. |
|
|
213
|
+
| `value` | `number` | The computed Pi coefficient. |
|
|
214
|
+
| `observed` | `number` | Observed agreement proportion. |
|
|
215
|
+
| `expected` | `number` | Expected agreement from joint marginals. |
|
|
216
|
+
| `interpretation` | `Interpretation` | Landis & Koch interpretation label. |
|
|
217
|
+
| `categories` | `Label[]` | Sorted list of unique categories. |
|
|
218
|
+
| `ci` | `ConfidenceInterval` | Optional. Confidence interval if requested. |
|
|
219
|
+
|
|
220
|
+
**Example:**
|
|
221
|
+
|
|
222
|
+
```ts
|
|
223
|
+
const result = scottPi(['A', 'B', 'A', 'C'], ['A', 'B', 'B', 'C']);
|
|
224
|
+
// result.value: 0.6190...
|
|
225
|
+
// result.observed: 0.75
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
#### `krippendorffAlpha(matrix, options?)`
|
|
231
|
+
|
|
232
|
+
Computes Krippendorff's Alpha for multiple raters with support for missing data and multiple measurement levels.
|
|
233
|
+
|
|
234
|
+
**Parameters:**
|
|
235
|
+
|
|
236
|
+
| Parameter | Type | Description |
|
|
237
|
+
|-----------|------|-------------|
|
|
238
|
+
| `matrix` | `(string \| number \| null \| undefined)[][]` | Rater-by-item matrix. `matrix[r][c]` is the label assigned by rater `r` to item `c`. Use `null` or `undefined` for missing annotations. |
|
|
239
|
+
| `options` | `KrippendorffOptions` | Optional. Configuration for measurement level and missing data handling. |
|
|
240
|
+
|
|
241
|
+
**`KrippendorffOptions`:**
|
|
242
|
+
|
|
243
|
+
| Field | Type | Default | Description |
|
|
244
|
+
|-------|------|---------|-------------|
|
|
245
|
+
| `level` | `MeasurementLevel` | `'nominal'` | Measurement level: `'nominal'`, `'ordinal'`, `'interval'`, or `'ratio'`. Determines the disagreement function. |
|
|
246
|
+
| `missingData` | `'exclude' \| 'error'` | `'exclude'` | How to handle missing values. `'exclude'` skips them; `'error'` throws. |
|
|
247
|
+
| `ci` | `boolean` | -- | Reserved for future confidence interval support. |
|
|
248
|
+
| `ciLevel` | `number` | -- | Confidence level. |
|
|
249
|
+
| `ciBootstrapSamples` | `number` | -- | Number of bootstrap resamples. |
|
|
250
|
+
| `seed` | `number` | -- | Seed for reproducible bootstrap sampling. |
|
|
251
|
+
|
|
252
|
+
**Returns:** `AlphaResult`
|
|
253
|
+
|
|
254
|
+
| Field | Type | Description |
|
|
255
|
+
|-------|------|-------------|
|
|
256
|
+
| `metric` | `'krippendorff-alpha'` | Always `'krippendorff-alpha'`. |
|
|
257
|
+
| `value` | `number` | The computed alpha coefficient. |
|
|
258
|
+
| `interpretation` | `AlphaInterpretation` | Krippendorff interpretation: `'unreliable'`, `'tentative'`, or `'reliable'`. |
|
|
259
|
+
| `level` | `MeasurementLevel` | The measurement level used. |
|
|
260
|
+
| `itemCount` | `number` | Number of items (columns). |
|
|
261
|
+
| `annotatorCount` | `number` | Number of raters (rows). |
|
|
262
|
+
| `missingCount` | `number` | Total number of missing annotations. |
|
|
263
|
+
| `ci` | `ConfidenceInterval` | Optional. Confidence interval if requested. |
|
|
264
|
+
|
|
265
|
+
**Example:**
|
|
266
|
+
|
|
267
|
+
```ts
|
|
268
|
+
// Nominal data with missing values
|
|
269
|
+
const result = krippendorffAlpha(
|
|
270
|
+
[
|
|
271
|
+
['A', null, 'C', 'A'],
|
|
272
|
+
['A', 'B', 'C', null],
|
|
273
|
+
['B', 'B', 'C', 'A'],
|
|
274
|
+
],
|
|
275
|
+
);
|
|
276
|
+
// result.value: 0.7272...
|
|
277
|
+
// result.missingCount: 2
|
|
278
|
+
// result.annotatorCount: 3
|
|
279
|
+
|
|
280
|
+
// Interval-level numeric data
|
|
281
|
+
const interval = krippendorffAlpha(
|
|
282
|
+
[
|
|
283
|
+
[1, 2, 3, 4],
|
|
284
|
+
[1, 2, 4, 4],
|
|
285
|
+
],
|
|
286
|
+
{ level: 'interval' },
|
|
287
|
+
);
|
|
288
|
+
// interval.level: 'interval'
|
|
289
|
+
```
|
|
290
|
+
|
|
291
|
+
**Disagreement functions by measurement level:**
|
|
292
|
+
|
|
293
|
+
| Level | Function | Description |
|
|
294
|
+
|-------|----------|-------------|
|
|
295
|
+
| `nominal` | `d = v === v' ? 0 : 1` | Binary: same or different. |
|
|
296
|
+
| `ordinal` | `d = v === v' ? 0 : 1` | Treated as nominal (rank-based extension planned). |
|
|
297
|
+
| `interval` | `d = (v - v')^2` | Squared numeric difference. |
|
|
298
|
+
| `ratio` | `d = (v - v')^2` | Squared numeric difference. |
|
|
299
|
+
|
|
300
|
+
---
|
|
301
|
+
|
|
302
|
+
#### `gwetAC1(rater1, rater2, options?)`
|
|
303
|
+
|
|
304
|
+
Computes Gwet's AC1 for two raters. Designed to be robust to the prevalence and bias paradox that causes Cohen's Kappa to produce misleadingly low values when one category dominates.
|
|
305
|
+
|
|
306
|
+
**Parameters:**
|
|
307
|
+
|
|
308
|
+
| Parameter | Type | Description |
|
|
309
|
+
|-----------|------|-------------|
|
|
310
|
+
| `rater1` | `Label[]` | Labels assigned by the first rater. |
|
|
311
|
+
| `rater2` | `Label[]` | Labels assigned by the second rater. Must have the same length as `rater1`. |
|
|
312
|
+
| `options` | `AC1Options` | Optional. Reserved for future confidence interval support. |
|
|
313
|
+
|
|
314
|
+
**`AC1Options`:**
|
|
315
|
+
|
|
316
|
+
| Field | Type | Default | Description |
|
|
317
|
+
|-------|------|---------|-------------|
|
|
318
|
+
| `ci` | `boolean` | -- | Reserved for future confidence interval support. |
|
|
319
|
+
| `ciLevel` | `number` | -- | Confidence level. |
|
|
320
|
+
| `ciBootstrapSamples` | `number` | -- | Number of bootstrap resamples. |
|
|
321
|
+
| `seed` | `number` | -- | Seed for reproducible bootstrap sampling. |
|
|
322
|
+
|
|
323
|
+
**Returns:** `AC1Result`
|
|
324
|
+
|
|
325
|
+
| Field | Type | Description |
|
|
326
|
+
|-------|------|-------------|
|
|
327
|
+
| `metric` | `'gwets-ac1'` | Always `'gwets-ac1'`. |
|
|
328
|
+
| `value` | `number` | The computed AC1 coefficient. |
|
|
329
|
+
| `observed` | `number` | Observed agreement proportion. |
|
|
330
|
+
| `expected` | `number` | Gwet's expected agreement by chance. |
|
|
331
|
+
| `interpretation` | `Interpretation` | Landis & Koch interpretation label. |
|
|
332
|
+
| `ci` | `ConfidenceInterval` | Optional. Confidence interval if requested. |
|
|
333
|
+
|
|
334
|
+
**Example:**
|
|
335
|
+
|
|
336
|
+
```ts
|
|
337
|
+
const result = gwetAC1(
|
|
338
|
+
['Y', 'Y', 'Y', 'Y', 'Y', 'N'],
|
|
339
|
+
['Y', 'Y', 'Y', 'Y', 'N', 'N'],
|
|
340
|
+
);
|
|
341
|
+
// result.value: 0.7333...
|
|
342
|
+
// result.observed: 0.8333...
|
|
343
|
+
```
|
|
344
|
+
|
|
345
|
+
---
|
|
346
|
+
|
|
347
|
+
### Interpretation Functions
|
|
348
|
+
|
|
349
|
+
#### `interpretKappa(value)`
|
|
350
|
+
|
|
351
|
+
Classifies a kappa-family metric value using the Landis & Koch (1977) scale.
|
|
352
|
+
|
|
353
|
+
**Parameters:**
|
|
354
|
+
|
|
355
|
+
| Parameter | Type | Description |
|
|
356
|
+
|-----------|------|-------------|
|
|
357
|
+
| `value` | `number` | The kappa coefficient to interpret. |
|
|
358
|
+
|
|
359
|
+
**Returns:** `Interpretation` -- one of `'poor'`, `'slight'`, `'fair'`, `'moderate'`, `'substantial'`, `'almost-perfect'`.
|
|
360
|
+
|
|
361
|
+
**Scale:**
|
|
362
|
+
|
|
363
|
+
| Range | Interpretation |
|
|
364
|
+
|-------|---------------|
|
|
365
|
+
| < 0.00 | `'poor'` |
|
|
366
|
+
| 0.00 -- 0.20 | `'slight'` |
|
|
367
|
+
| 0.20 -- 0.40 | `'fair'` |
|
|
368
|
+
| 0.40 -- 0.60 | `'moderate'` |
|
|
369
|
+
| 0.60 -- 0.80 | `'substantial'` |
|
|
370
|
+
| >= 0.80 | `'almost-perfect'` |
|
|
371
|
+
|
|
372
|
+
```ts
|
|
373
|
+
interpretKappa(0.75); // 'substantial'
|
|
374
|
+
interpretKappa(-0.1); // 'poor'
|
|
375
|
+
```
|
|
376
|
+
|
|
377
|
+
---
|
|
378
|
+
|
|
379
|
+
#### `interpretAlpha(value)`
|
|
380
|
+
|
|
381
|
+
Classifies a Krippendorff's Alpha value using Krippendorff's recommended thresholds.
|
|
382
|
+
|
|
383
|
+
**Parameters:**
|
|
384
|
+
|
|
385
|
+
| Parameter | Type | Description |
|
|
386
|
+
|-----------|------|-------------|
|
|
387
|
+
| `value` | `number` | The alpha coefficient to interpret. |
|
|
388
|
+
|
|
389
|
+
**Returns:** `AlphaInterpretation` -- one of `'unreliable'`, `'tentative'`, `'reliable'`.
|
|
390
|
+
|
|
391
|
+
**Scale:**
|
|
392
|
+
|
|
393
|
+
| Range | Interpretation |
|
|
394
|
+
|-------|---------------|
|
|
395
|
+
| < 0.667 | `'unreliable'` |
|
|
396
|
+
| 0.667 -- 0.800 | `'tentative'` |
|
|
397
|
+
| >= 0.800 | `'reliable'` |
|
|
398
|
+
|
|
399
|
+
```ts
|
|
400
|
+
interpretAlpha(0.85); // 'reliable'
|
|
401
|
+
interpretAlpha(0.70); // 'tentative'
|
|
402
|
+
```
|
|
403
|
+
|
|
404
|
+
---
|
|
405
|
+
|
|
406
|
+
### Validation Functions
|
|
407
|
+
|
|
408
|
+
#### `assertEqualLength(a, b, label?)`
|
|
409
|
+
|
|
410
|
+
Throws if arrays `a` and `b` do not have the same length.
|
|
411
|
+
|
|
412
|
+
```ts
|
|
413
|
+
assertEqualLength([1, 2], [3, 4]); // passes
|
|
414
|
+
assertEqualLength([1, 2], [3], 'raters'); // throws: "Arrays must have equal length: got 2 and 1 raters"
|
|
415
|
+
```
|
|
416
|
+
|
|
417
|
+
---
|
|
418
|
+
|
|
419
|
+
#### `assertNonEmpty(arr, label?)`
|
|
420
|
+
|
|
421
|
+
Throws if `arr` is empty.
|
|
422
|
+
|
|
423
|
+
```ts
|
|
424
|
+
assertNonEmpty([1, 2, 3]); // passes
|
|
425
|
+
assertNonEmpty([], 'subjects'); // throws: "Array must be non-empty subjects"
|
|
426
|
+
```
|
|
427
|
+
|
|
428
|
+
---
|
|
429
|
+
|
|
430
|
+
#### `assertConsistentRowLengths(matrix)`
|
|
431
|
+
|
|
432
|
+
Throws if the rows of a 2D matrix do not all have the same length.
|
|
433
|
+
|
|
434
|
+
```ts
|
|
435
|
+
assertConsistentRowLengths([[1, 2], [3, 4]]); // passes
|
|
436
|
+
assertConsistentRowLengths([[1, 2], [3, 4, 5]]); // throws: "Matrix has inconsistent row lengths"
|
|
437
|
+
```
|
|
438
|
+
|
|
439
|
+
---
|
|
440
|
+
|
|
441
|
+
#### `assertConstantRowSums(matrix)`
|
|
442
|
+
|
|
443
|
+
Throws if rows of a numeric matrix do not all sum to the same value (within floating-point tolerance of 1e-9).
|
|
444
|
+
|
|
445
|
+
```ts
|
|
446
|
+
assertConstantRowSums([[1, 2, 3], [2, 2, 2]]); // passes (both sum to 6)
|
|
447
|
+
assertConstantRowSums([[1, 2], [3, 4]]); // throws: "Category-count matrix rows must have equal sums"
|
|
448
|
+
```
|
|
449
|
+
|
|
450
|
+
---
|
|
451
|
+
|
|
452
|
+
#### `assertMinAnnotators(count)`
|
|
453
|
+
|
|
454
|
+
Throws if `count` is less than 2.
|
|
455
|
+
|
|
456
|
+
```ts
|
|
457
|
+
assertMinAnnotators(3); // passes
|
|
458
|
+
assertMinAnnotators(1); // throws: "At least 2 annotators required, got 1"
|
|
459
|
+
```
|
|
460
|
+
|
|
461
|
+
---
|
|
462
|
+
|
|
463
|
+
#### `detectDuplicates(triples)`
|
|
464
|
+
|
|
465
|
+
Returns `(item, annotator)` pairs that appear more than once in an array of annotation triples. Each duplicate pair is returned only once regardless of how many times it appears.
|
|
466
|
+
|
|
467
|
+
**Parameters:**
|
|
468
|
+
|
|
469
|
+
| Parameter | Type | Description |
|
|
470
|
+
|-----------|------|-------------|
|
|
471
|
+
| `triples` | `Array<{ item: unknown; annotator: unknown }>` | Array of annotation triples. |
|
|
472
|
+
|
|
473
|
+
**Returns:** `Array<{ item: unknown; annotator: unknown }>` -- the duplicate pairs.
|
|
474
|
+
|
|
475
|
+
```ts
|
|
476
|
+
const dupes = detectDuplicates([
|
|
477
|
+
{ item: 1, annotator: 'A' },
|
|
478
|
+
{ item: 1, annotator: 'A' },
|
|
479
|
+
{ item: 2, annotator: 'B' },
|
|
480
|
+
]);
|
|
481
|
+
// [{ item: 1, annotator: 'A' }]
|
|
482
|
+
```
|
|
483
|
+
|
|
484
|
+
---
|
|
485
|
+
|
|
486
|
+
### Types
|
|
487
|
+
|
|
488
|
+
All TypeScript types are exported from the package entry point.
|
|
489
|
+
|
|
490
|
+
#### Core Types
|
|
491
|
+
|
|
492
|
+
| Type | Description |
|
|
493
|
+
|------|-------------|
|
|
494
|
+
| `Label` | `string \| number` -- a single annotation label. |
|
|
495
|
+
| `MeasurementLevel` | `'nominal' \| 'ordinal' \| 'interval' \| 'ratio'` |
|
|
496
|
+
| `MetricName` | `'cohens-kappa' \| 'fleiss-kappa' \| 'krippendorff-alpha' \| 'scotts-pi' \| 'gwets-ac1' \| 'percent-agreement'` |
|
|
497
|
+
| `Interpretation` | `'poor' \| 'slight' \| 'fair' \| 'moderate' \| 'substantial' \| 'almost-perfect'` |
|
|
498
|
+
| `AlphaInterpretation` | `'unreliable' \| 'tentative' \| 'reliable'` |
|
|
499
|
+
|
|
500
|
+
#### Data Types
|
|
501
|
+
|
|
502
|
+
| Type | Description |
|
|
503
|
+
|------|-------------|
|
|
504
|
+
| `AnnotationTriple` | `{ item: string \| number; annotator: string \| number; label: Label }` |
|
|
505
|
+
| `ConfusionMatrix` | `{ labels: Label[]; matrix: number[][] }` -- `matrix[i][j]` = count where rater 1 said `labels[i]` and rater 2 said `labels[j]`. |
|
|
506
|
+
| `ConfidenceInterval` | `{ lower: number; upper: number; level: number }` |
|
|
507
|
+
|
|
508
|
+
#### Result Types
|
|
509
|
+
|
|
510
|
+
| Type | Description |
|
|
511
|
+
|------|-------------|
|
|
512
|
+
| `KappaResult` | Result from `cohenKappa`. Fields: `metric`, `value`, `observed`, `expected`, `interpretation`, `categories?`, `ci?`. |
|
|
513
|
+
| `FleissKappaResult` | Result from `fleissKappa`. Fields: `metric`, `value`, `observed`, `expected`, `interpretation`, `annotatorCount`, `itemCount`, `ci?`. |
|
|
514
|
+
| `PiResult` | Result from `scottPi`. Fields: `metric`, `value`, `observed`, `expected`, `interpretation`, `categories?`, `ci?`. |
|
|
515
|
+
| `AlphaResult` | Result from `krippendorffAlpha`. Fields: `metric`, `value`, `interpretation`, `level`, `itemCount`, `annotatorCount`, `missingCount`, `ci?`. |
|
|
516
|
+
| `AC1Result` | Result from `gwetAC1`. Fields: `metric`, `value`, `observed`, `expected`, `interpretation`, `ci?`. |
|
|
517
|
+
| `AgreementReport` | `{ metric: MetricName; value: number; interpretation: Interpretation \| AlphaInterpretation; ci?: ConfidenceInterval }` |
|
|
518
|
+
|
|
519
|
+
#### Options Types
|
|
520
|
+
|
|
521
|
+
| Type | Description |
|
|
522
|
+
|------|-------------|
|
|
523
|
+
| `CohensKappaOptions` | `{ weighted?, weights?, ci?, ciLevel?, ciBootstrapSamples?, seed? }` |
|
|
524
|
+
| `FleissKappaOptions` | `{ ci?, ciLevel?, ciBootstrapSamples?, seed? }` |
|
|
525
|
+
| `KrippendorffOptions` | `{ level?, ci?, ciLevel?, ciBootstrapSamples?, seed?, missingData? }` |
|
|
526
|
+
| `ScottsPiOptions` | `{ ci?, ciLevel?, ciBootstrapSamples?, seed? }` |
|
|
527
|
+
| `AC1Options` | `{ ci?, ciLevel?, ciBootstrapSamples?, seed? }` |
|
|
528
|
+
| `AgreementOptions` | `{ metric?, level?, missingData?, ci?, ciLevel?, seed? }` |
|
|
529
|
+
| `CIOptions` | `{ level?, bootstrapSamples?, seed? }` |
|
|
530
|
+
|
|
531
|
+
---
|
|
532
|
+
|
|
533
|
+
## Configuration
|
|
534
|
+
|
|
535
|
+
Each metric function accepts an optional options object as its last parameter. All options fields are optional and have sensible defaults.
|
|
536
|
+
|
|
537
|
+
**Cohen's Kappa weighting:**
|
|
538
|
+
|
|
539
|
+
```ts
|
|
540
|
+
// Unweighted (default) -- for nominal/categorical data
|
|
541
|
+
cohenKappa(rater1, rater2);
|
|
542
|
+
|
|
543
|
+
// Linear weights -- penalizes disagreements proportionally to distance
|
|
544
|
+
cohenKappa(rater1, rater2, { weighted: true, weights: 'linear' });
|
|
545
|
+
|
|
546
|
+
// Quadratic weights -- penalizes far disagreements more heavily
|
|
547
|
+
cohenKappa(rater1, rater2, { weighted: true, weights: 'quadratic' });
|
|
548
|
+
```
|
|
549
|
+
|
|
550
|
+
**Krippendorff's Alpha measurement level:**
|
|
551
|
+
|
|
552
|
+
```ts
|
|
553
|
+
// Nominal (default) -- categories with no order
|
|
554
|
+
krippendorffAlpha(matrix);
|
|
555
|
+
|
|
556
|
+
// Interval -- numeric data with meaningful distances
|
|
557
|
+
krippendorffAlpha(matrix, { level: 'interval' });
|
|
558
|
+
|
|
559
|
+
// Ratio -- numeric data with a true zero
|
|
560
|
+
krippendorffAlpha(matrix, { level: 'ratio' });
|
|
561
|
+
```
|
|
562
|
+
|
|
563
|
+
**Missing data handling:**
|
|
564
|
+
|
|
565
|
+
```ts
|
|
566
|
+
// Exclude missing values (default)
|
|
567
|
+
krippendorffAlpha(matrix, { missingData: 'exclude' });
|
|
568
|
+
|
|
569
|
+
// Throw an error if any value is missing
|
|
570
|
+
krippendorffAlpha(matrix, { missingData: 'error' });
|
|
571
|
+
```
|
|
572
|
+
|
|
573
|
+
---
|
|
574
|
+
|
|
575
|
+
## Error Handling
|
|
576
|
+
|
|
577
|
+
All metric functions validate their inputs and throw descriptive `Error` instances on invalid data.
|
|
578
|
+
|
|
579
|
+
| Condition | Error Message |
|
|
580
|
+
|-----------|--------------|
|
|
581
|
+
| Empty input array | `"Array must be non-empty"` |
|
|
582
|
+
| Rater arrays of different lengths | `"Arrays must have equal length: got X and Y"` |
|
|
583
|
+
| Matrix rows with inconsistent lengths | `"Matrix has inconsistent row lengths"` |
|
|
584
|
+
| Matrix rows with different sums | `"Category-count matrix rows must have equal sums"` |
|
|
585
|
+
| Fewer than 2 annotators | `"At least 2 annotators required, got N"` |
|
|
586
|
+
| Fewer than 2 categories (Fleiss) | `"Fleiss Kappa requires at least 2 categories"` |
|
|
587
|
+
| Missing data with `missingData: 'error'` | `"Missing data found at rater R, item C"` |
|
|
588
|
+
|
|
589
|
+
All errors are synchronous and thrown immediately during input validation, before any computation begins.
|
|
590
|
+
|
|
591
|
+
---
|
|
592
|
+
|
|
593
|
+
## Advanced Usage
|
|
594
|
+
|
|
595
|
+
### Choosing Between Metrics
|
|
596
|
+
|
|
597
|
+
| Scenario | Recommended Metric |
|
|
598
|
+
|----------|-------------------|
|
|
599
|
+
| Two raters, nominal categories | `cohenKappa` (unweighted) |
|
|
600
|
+
| Two raters, ordinal scale | `cohenKappa` with `weighted: true` |
|
|
601
|
+
| Two raters, possible rater bias | `scottPi` (pooled marginals) |
|
|
602
|
+
| Two raters, skewed category distribution | `gwetAC1` (prevalence-robust) |
|
|
603
|
+
| Three or more raters, nominal | `fleissKappa` |
|
|
604
|
+
| Any number of raters, missing data | `krippendorffAlpha` |
|
|
605
|
+
| Any number of raters, interval/ratio data | `krippendorffAlpha` with `level` option |
|
|
606
|
+
|
|
607
|
+
### Comparing Cohen's Kappa, Scott's Pi, and Gwet's AC1
|
|
608
|
+
|
|
609
|
+
When both raters have identical marginal distributions, Scott's Pi and Cohen's Kappa produce the same value. They diverge when raters have different biases (e.g., one rater assigns "positive" more frequently).
|
|
610
|
+
|
|
611
|
+
```ts
|
|
612
|
+
// Symmetric marginals -- Pi and Kappa agree
|
|
613
|
+
const r1 = ['A', 'A', 'B', 'B', 'C', 'C'];
|
|
614
|
+
const r2 = ['A', 'B', 'A', 'B', 'C', 'C'];
|
|
615
|
+
scottPi(r1, r2).value; // same as cohenKappa(r1, r2).value
|
|
616
|
+
|
|
617
|
+
// Skewed data -- AC1 is more stable than Kappa
|
|
618
|
+
const r1Skew = ['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N'];
|
|
619
|
+
const r2Skew = ['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N'];
|
|
620
|
+
gwetAC1(r1Skew, r2Skew).value; // higher than cohenKappa
|
|
621
|
+
cohenKappa(r1Skew, r2Skew).value; // suppressed by prevalence effect
|
|
622
|
+
```
|
|
623
|
+
|
|
624
|
+
### Handling Edge Cases
|
|
625
|
+
|
|
626
|
+
When all items fall into a single category (Pe = 1), the kappa formula `(Po - Pe) / (1 - Pe)` would produce a division by zero. In this case, all metrics return `1.0` (perfect agreement) since every item receives the same label from every rater.
|
|
627
|
+
|
|
628
|
+
```ts
|
|
629
|
+
const allSame = cohenKappa(['A', 'A', 'A'], ['A', 'A', 'A']);
|
|
630
|
+
// allSame.value === 1.0
|
|
631
|
+
```
|
|
632
|
+
|
|
633
|
+
### Pre-validating Annotation Data
|
|
634
|
+
|
|
635
|
+
Use the validation utilities to check data integrity before computing metrics.
|
|
636
|
+
|
|
637
|
+
```ts
|
|
638
|
+
import {
|
|
639
|
+
assertEqualLength,
|
|
640
|
+
assertNonEmpty,
|
|
641
|
+
detectDuplicates,
|
|
642
|
+
} from 'label-score';
|
|
643
|
+
|
|
644
|
+
const triples = [
|
|
645
|
+
{ item: 1, annotator: 'A', label: 'pos' },
|
|
646
|
+
{ item: 1, annotator: 'A', label: 'neg' }, // duplicate!
|
|
647
|
+
{ item: 2, annotator: 'B', label: 'pos' },
|
|
648
|
+
];
|
|
649
|
+
|
|
650
|
+
const dupes = detectDuplicates(triples);
|
|
651
|
+
if (dupes.length > 0) {
|
|
652
|
+
console.error('Duplicate annotations found:', dupes);
|
|
653
|
+
}
|
|
654
|
+
```
|
|
655
|
+
|
|
656
|
+
---
|
|
657
|
+
|
|
658
|
+
## TypeScript
|
|
659
|
+
|
|
660
|
+
`label-score` is written in TypeScript and ships type declarations (`dist/index.d.ts`) alongside the compiled JavaScript. All public types are available as named imports.
|
|
661
|
+
|
|
662
|
+
```ts
|
|
663
|
+
import type {
|
|
664
|
+
Label,
|
|
665
|
+
KappaResult,
|
|
666
|
+
FleissKappaResult,
|
|
667
|
+
AlphaResult,
|
|
668
|
+
PiResult,
|
|
669
|
+
AC1Result,
|
|
670
|
+
Interpretation,
|
|
671
|
+
AlphaInterpretation,
|
|
672
|
+
MeasurementLevel,
|
|
673
|
+
MetricName,
|
|
674
|
+
AnnotationTriple,
|
|
675
|
+
ConfusionMatrix,
|
|
676
|
+
ConfidenceInterval,
|
|
677
|
+
CohensKappaOptions,
|
|
678
|
+
FleissKappaOptions,
|
|
679
|
+
KrippendorffOptions,
|
|
680
|
+
ScottsPiOptions,
|
|
681
|
+
AC1Options,
|
|
682
|
+
AgreementOptions,
|
|
683
|
+
CIOptions,
|
|
684
|
+
AgreementReport,
|
|
685
|
+
} from 'label-score';
|
|
686
|
+
```
|
|
687
|
+
|
|
688
|
+
The package targets ES2022 and uses CommonJS module format. Compiler options include `strict: true`, `declaration: true`, and `declarationMap: true` for full IDE support.
|
|
689
|
+
|
|
690
|
+
---
|
|
691
|
+
|
|
692
|
+
## License
|
|
693
|
+
|
|
694
|
+
MIT
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"cohen-kappa.test.d.ts","sourceRoot":"","sources":["../../src/__tests__/cohen-kappa.test.ts"],"names":[],"mappings":""}
|