@silupanda/label-score 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/README.md +694 -0
  2. package/dist/__tests__/cohen-kappa.test.d.ts +2 -0
  3. package/dist/__tests__/cohen-kappa.test.d.ts.map +1 -0
  4. package/dist/__tests__/cohen-kappa.test.js +110 -0
  5. package/dist/__tests__/cohen-kappa.test.js.map +1 -0
  6. package/dist/__tests__/fleiss-kappa.test.d.ts +2 -0
  7. package/dist/__tests__/fleiss-kappa.test.d.ts.map +1 -0
  8. package/dist/__tests__/fleiss-kappa.test.js +84 -0
  9. package/dist/__tests__/fleiss-kappa.test.js.map +1 -0
  10. package/dist/__tests__/gwet-ac1.test.d.ts +2 -0
  11. package/dist/__tests__/gwet-ac1.test.d.ts.map +1 -0
  12. package/dist/__tests__/gwet-ac1.test.js +74 -0
  13. package/dist/__tests__/gwet-ac1.test.js.map +1 -0
  14. package/dist/__tests__/interpret.test.d.ts +2 -0
  15. package/dist/__tests__/interpret.test.d.ts.map +1 -0
  16. package/dist/__tests__/interpret.test.js +66 -0
  17. package/dist/__tests__/interpret.test.js.map +1 -0
  18. package/dist/__tests__/krippendorff-alpha.test.d.ts +2 -0
  19. package/dist/__tests__/krippendorff-alpha.test.d.ts.map +1 -0
  20. package/dist/__tests__/krippendorff-alpha.test.js +98 -0
  21. package/dist/__tests__/krippendorff-alpha.test.js.map +1 -0
  22. package/dist/__tests__/scott-pi.test.d.ts +2 -0
  23. package/dist/__tests__/scott-pi.test.d.ts.map +1 -0
  24. package/dist/__tests__/scott-pi.test.js +60 -0
  25. package/dist/__tests__/scott-pi.test.js.map +1 -0
  26. package/dist/__tests__/types.test.d.ts +2 -0
  27. package/dist/__tests__/types.test.d.ts.map +1 -0
  28. package/dist/__tests__/types.test.js +92 -0
  29. package/dist/__tests__/types.test.js.map +1 -0
  30. package/dist/__tests__/validate.test.d.ts +2 -0
  31. package/dist/__tests__/validate.test.d.ts.map +1 -0
  32. package/dist/__tests__/validate.test.js +121 -0
  33. package/dist/__tests__/validate.test.js.map +1 -0
  34. package/dist/index.d.ts +9 -0
  35. package/dist/index.d.ts.map +1 -0
  36. package/dist/index.js +24 -0
  37. package/dist/index.js.map +1 -0
  38. package/dist/interpret.d.ts +23 -0
  39. package/dist/interpret.d.ts.map +1 -0
  40. package/dist/interpret.js +48 -0
  41. package/dist/interpret.js.map +1 -0
  42. package/dist/metrics/cohen-kappa.d.ts +16 -0
  43. package/dist/metrics/cohen-kappa.d.ts.map +1 -0
  44. package/dist/metrics/cohen-kappa.js +130 -0
  45. package/dist/metrics/cohen-kappa.js.map +1 -0
  46. package/dist/metrics/fleiss-kappa.d.ts +17 -0
  47. package/dist/metrics/fleiss-kappa.d.ts.map +1 -0
  48. package/dist/metrics/fleiss-kappa.js +63 -0
  49. package/dist/metrics/fleiss-kappa.js.map +1 -0
  50. package/dist/metrics/gwet-ac1.d.ts +19 -0
  51. package/dist/metrics/gwet-ac1.d.ts.map +1 -0
  52. package/dist/metrics/gwet-ac1.js +82 -0
  53. package/dist/metrics/gwet-ac1.js.map +1 -0
  54. package/dist/metrics/krippendorff-alpha.d.ts +21 -0
  55. package/dist/metrics/krippendorff-alpha.d.ts.map +1 -0
  56. package/dist/metrics/krippendorff-alpha.js +162 -0
  57. package/dist/metrics/krippendorff-alpha.js.map +1 -0
  58. package/dist/metrics/scott-pi.d.ts +18 -0
  59. package/dist/metrics/scott-pi.d.ts.map +1 -0
  60. package/dist/metrics/scott-pi.js +73 -0
  61. package/dist/metrics/scott-pi.js.map +1 -0
  62. package/dist/types.d.ts +119 -0
  63. package/dist/types.d.ts.map +1 -0
  64. package/dist/types.js +3 -0
  65. package/dist/types.js.map +1 -0
  66. package/dist/validate.d.ts +33 -0
  67. package/dist/validate.d.ts.map +1 -0
  68. package/dist/validate.js +83 -0
  69. package/dist/validate.js.map +1 -0
  70. package/package.json +33 -0
package/README.md ADDED
@@ -0,0 +1,694 @@
1
+ # label-score
2
+
3
+ Inter-annotator agreement metrics for JavaScript and TypeScript. Zero dependencies.
4
+
5
+ [![npm version](https://img.shields.io/npm/v/label-score.svg)](https://www.npmjs.com/package/label-score)
6
+ [![npm downloads](https://img.shields.io/npm/dt/label-score.svg)](https://www.npmjs.com/package/label-score)
7
+ [![license](https://img.shields.io/npm/l/label-score.svg)](https://github.com/SiluPanda/label-score/blob/master/LICENSE)
8
+ [![node](https://img.shields.io/node/v/label-score.svg)](https://nodejs.org)
9
+
10
+ ---
11
+
12
+ ## Description
13
+
14
+ `label-score` computes chance-corrected agreement metrics for annotation data -- labels assigned to items by multiple annotators (human or machine). It implements five standard inter-annotator agreement (IAA) metrics, each returning a structured result object with the computed value, an interpretation label, and all intermediate quantities (observed agreement, expected agreement, category lists, annotator/item counts).
15
+
16
+ Use cases include validating human annotation quality before model training, measuring LLM-as-judge consistency across multiple model evaluators, building gold-standard evaluation datasets, and reporting IAA in research papers. All computations are implemented in pure TypeScript with zero runtime dependencies. Numerical outputs are verified against established Python implementations (scikit-learn, NLTK, krippendorff).
17
+
18
+ ---
19
+
20
+ ## Installation
21
+
22
+ ```bash
23
+ npm install @silupanda/label-score
24
+ ```
25
+
26
+ ---
27
+
28
+ ## Quick Start
29
+
30
+ ```ts
31
+ import {
32
+ cohenKappa,
33
+ fleissKappa,
34
+ scottPi,
35
+ krippendorffAlpha,
36
+ gwetAC1,
37
+ } from 'label-score';
38
+
39
+ // Cohen's Kappa -- two raters, categorical labels
40
+ const kappa = cohenKappa(['A', 'B', 'C', 'A'], ['A', 'B', 'A', 'A']);
41
+ console.log(kappa.value); // 0.5555...
42
+ console.log(kappa.interpretation); // 'moderate'
43
+
44
+ // Fleiss' Kappa -- multiple raters via category-count matrix
45
+ const fleiss = fleissKappa([
46
+ [4, 0, 0], // all 4 raters chose category 0
47
+ [0, 3, 1], // 3 raters chose category 1, 1 chose category 2
48
+ [0, 0, 4], // all 4 raters chose category 2
49
+ ]);
50
+ console.log(fleiss.value); // 0.7446...
51
+ console.log(fleiss.annotatorCount); // 4
52
+
53
+ // Scott's Pi -- two raters, joint marginals
54
+ const pi = scottPi(['A', 'B', 'C'], ['A', 'B', 'A']);
55
+ console.log(pi.value);
56
+
57
+ // Krippendorff's Alpha -- multiple raters, handles missing data
58
+ const alpha = krippendorffAlpha([
59
+ ['A', null, 'C', 'A'],
60
+ ['A', 'B', 'C', null],
61
+ ]);
62
+ console.log(alpha.value);
63
+ console.log(alpha.missingCount); // 2
64
+
65
+ // Gwet's AC1 -- robust to prevalence effects
66
+ const ac1 = gwetAC1(['Y', 'Y', 'Y', 'N'], ['Y', 'Y', 'N', 'N']);
67
+ console.log(ac1.value);
68
+ ```
69
+
70
+ ---
71
+
72
+ ## Features
73
+
74
+ - **Five agreement metrics** -- Cohen's Kappa, Fleiss' Kappa, Scott's Pi, Krippendorff's Alpha, and Gwet's AC1.
75
+ - **Weighted kappa** -- Linear and quadratic weighting schemes for ordinal data in Cohen's Kappa.
76
+ - **Missing data support** -- Krippendorff's Alpha excludes or rejects missing annotations via configuration.
77
+ - **Multiple measurement levels** -- Nominal, ordinal, interval, and ratio distance functions for Krippendorff's Alpha.
78
+ - **Automatic interpretation** -- Every result includes a human-readable interpretation label based on published scales (Landis & Koch for kappa-family; Krippendorff's thresholds for alpha).
79
+ - **Input validation** -- Descriptive errors for mismatched array lengths, empty inputs, inconsistent matrices, and insufficient annotators.
80
+ - **Duplicate detection** -- Utility to find repeated (item, annotator) pairs in annotation triple data.
81
+ - **Full TypeScript support** -- All functions, options, and result types are exported and fully typed.
82
+ - **Zero dependencies** -- Pure TypeScript, no runtime dependencies.
83
+
84
+ ---
85
+
86
+ ## API Reference
87
+
88
+ ### Metric Functions
89
+
90
+ #### `cohenKappa(rater1, rater2, options?)`
91
+
92
+ Computes Cohen's Kappa for two raters. Supports unweighted (nominal) and weighted (ordinal) variants.
93
+
94
+ **Parameters:**
95
+
96
+ | Parameter | Type | Description |
97
+ |-----------|------|-------------|
98
+ | `rater1` | `Label[]` | Labels assigned by the first rater. |
99
+ | `rater2` | `Label[]` | Labels assigned by the second rater. Must have the same length as `rater1`. |
100
+ | `options` | `CohensKappaOptions` | Optional. Configuration for weighting and confidence intervals. |
101
+
102
+ **`CohensKappaOptions`:**
103
+
104
+ | Field | Type | Default | Description |
105
+ |-------|------|---------|-------------|
106
+ | `weighted` | `boolean` | `false` | Enable weighted kappa for ordinal data. |
107
+ | `weights` | `'linear' \| 'quadratic'` | `'linear'` | Weight scheme when `weighted` is `true`. |
108
+ | `ci` | `boolean` | -- | Reserved for future confidence interval support. |
109
+ | `ciLevel` | `number` | -- | Confidence level (e.g., `0.95`). |
110
+ | `ciBootstrapSamples` | `number` | -- | Number of bootstrap resamples. |
111
+ | `seed` | `number` | -- | Seed for reproducible bootstrap sampling. |
112
+
113
+ **Returns:** `KappaResult`
114
+
115
+ | Field | Type | Description |
116
+ |-------|------|-------------|
117
+ | `metric` | `MetricName` | Always `'cohens-kappa'`. |
118
+ | `value` | `number` | The computed kappa coefficient. |
119
+ | `observed` | `number` | Observed agreement proportion (Po). |
120
+ | `expected` | `number` | Expected agreement by chance (Pe). |
121
+ | `interpretation` | `Interpretation` | Landis & Koch interpretation label. |
122
+ | `categories` | `Label[]` | Sorted list of unique categories found in the data. |
123
+ | `ci` | `ConfidenceInterval` | Optional. Confidence interval if requested. |
124
+
125
+ **Example:**
126
+
127
+ ```ts
128
+ // Unweighted kappa
129
+ const result = cohenKappa(['A', 'B', 'C'], ['A', 'B', 'A']);
130
+ // { metric: 'cohens-kappa', value: 0.5, observed: 0.6666..., ... }
131
+
132
+ // Weighted kappa for ordinal ratings
133
+ const weighted = cohenKappa([1, 2, 3, 4], [1, 2, 4, 4], {
134
+ weighted: true,
135
+ weights: 'quadratic',
136
+ });
137
+ ```
138
+
139
+ ---
140
+
141
+ #### `fleissKappa(matrix)`
142
+
143
+ Computes Fleiss' Kappa for multiple raters (N >= 2).
144
+
145
+ **Parameters:**
146
+
147
+ | Parameter | Type | Description |
148
+ |-----------|------|-------------|
149
+ | `matrix` | `number[][]` | Category-count matrix. `matrix[i][j]` is the number of raters who assigned category `j` to subject `i`. All rows must sum to the same value (the number of raters per subject). |
150
+
151
+ **Returns:** `FleissKappaResult`
152
+
153
+ | Field | Type | Description |
154
+ |-------|------|-------------|
155
+ | `metric` | `'fleiss-kappa'` | Always `'fleiss-kappa'`. |
156
+ | `value` | `number` | The computed Fleiss' Kappa coefficient. |
157
+ | `observed` | `number` | Mean observed agreement across subjects. |
158
+ | `expected` | `number` | Expected agreement by chance. |
159
+ | `interpretation` | `Interpretation` | Landis & Koch interpretation label. |
160
+ | `annotatorCount` | `number` | Number of raters per subject (row sum). |
161
+ | `itemCount` | `number` | Number of subjects (rows). |
162
+ | `ci` | `ConfidenceInterval` | Optional. Confidence interval if requested. |
163
+
164
+ **Example:**
165
+
166
+ ```ts
167
+ const result = fleissKappa([
168
+ [4, 0, 0],
169
+ [0, 3, 1],
170
+ [0, 0, 4],
171
+ [1, 3, 0],
172
+ ]);
173
+ // result.value: 0.6235...
174
+ // result.annotatorCount: 4
175
+ // result.itemCount: 4
176
+ ```
177
+
178
+ **Throws:**
179
+ - If the matrix is empty.
180
+ - If rows have inconsistent lengths.
181
+ - If row sums are not equal.
182
+ - If fewer than 2 raters per subject.
183
+ - If fewer than 2 categories.
184
+
185
+ ---
186
+
187
+ #### `scottPi(rater1, rater2, options?)`
188
+
189
+ Computes Scott's Pi for two raters. Uses joint (pooled) marginal proportions to compute expected agreement, making it more robust than Cohen's Kappa when rater biases differ.
190
+
191
+ **Parameters:**
192
+
193
+ | Parameter | Type | Description |
194
+ |-----------|------|-------------|
195
+ | `rater1` | `Label[]` | Labels assigned by the first rater. |
196
+ | `rater2` | `Label[]` | Labels assigned by the second rater. Must have the same length as `rater1`. |
197
+ | `options` | `ScottsPiOptions` | Optional. Reserved for future confidence interval support. |
198
+
199
+ **`ScottsPiOptions`:**
200
+
201
+ | Field | Type | Default | Description |
202
+ |-------|------|---------|-------------|
203
+ | `ci` | `boolean` | -- | Reserved for future confidence interval support. |
204
+ | `ciLevel` | `number` | -- | Confidence level. |
205
+ | `ciBootstrapSamples` | `number` | -- | Number of bootstrap resamples. |
206
+ | `seed` | `number` | -- | Seed for reproducible bootstrap sampling. |
207
+
208
+ **Returns:** `PiResult`
209
+
210
+ | Field | Type | Description |
211
+ |-------|------|-------------|
212
+ | `metric` | `'scotts-pi'` | Always `'scotts-pi'`. |
213
+ | `value` | `number` | The computed Pi coefficient. |
214
+ | `observed` | `number` | Observed agreement proportion. |
215
+ | `expected` | `number` | Expected agreement from joint marginals. |
216
+ | `interpretation` | `Interpretation` | Landis & Koch interpretation label. |
217
+ | `categories` | `Label[]` | Sorted list of unique categories. |
218
+ | `ci` | `ConfidenceInterval` | Optional. Confidence interval if requested. |
219
+
220
+ **Example:**
221
+
222
+ ```ts
223
+ const result = scottPi(['A', 'B', 'A', 'C'], ['A', 'B', 'B', 'C']);
224
+ // result.value: 0.6190...
225
+ // result.observed: 0.75
226
+ ```
227
+
228
+ ---
229
+
230
+ #### `krippendorffAlpha(matrix, options?)`
231
+
232
+ Computes Krippendorff's Alpha for multiple raters with support for missing data and multiple measurement levels.
233
+
234
+ **Parameters:**
235
+
236
+ | Parameter | Type | Description |
237
+ |-----------|------|-------------|
238
+ | `matrix` | `(string \| number \| null \| undefined)[][]` | Rater-by-item matrix. `matrix[r][c]` is the label assigned by rater `r` to item `c`. Use `null` or `undefined` for missing annotations. |
239
+ | `options` | `KrippendorffOptions` | Optional. Configuration for measurement level and missing data handling. |
240
+
241
+ **`KrippendorffOptions`:**
242
+
243
+ | Field | Type | Default | Description |
244
+ |-------|------|---------|-------------|
245
+ | `level` | `MeasurementLevel` | `'nominal'` | Measurement level: `'nominal'`, `'ordinal'`, `'interval'`, or `'ratio'`. Determines the disagreement function. |
246
+ | `missingData` | `'exclude' \| 'error'` | `'exclude'` | How to handle missing values. `'exclude'` skips them; `'error'` throws. |
247
+ | `ci` | `boolean` | -- | Reserved for future confidence interval support. |
248
+ | `ciLevel` | `number` | -- | Confidence level. |
249
+ | `ciBootstrapSamples` | `number` | -- | Number of bootstrap resamples. |
250
+ | `seed` | `number` | -- | Seed for reproducible bootstrap sampling. |
251
+
252
+ **Returns:** `AlphaResult`
253
+
254
+ | Field | Type | Description |
255
+ |-------|------|-------------|
256
+ | `metric` | `'krippendorff-alpha'` | Always `'krippendorff-alpha'`. |
257
+ | `value` | `number` | The computed alpha coefficient. |
258
+ | `interpretation` | `AlphaInterpretation` | Krippendorff interpretation: `'unreliable'`, `'tentative'`, or `'reliable'`. |
259
+ | `level` | `MeasurementLevel` | The measurement level used. |
260
+ | `itemCount` | `number` | Number of items (columns). |
261
+ | `annotatorCount` | `number` | Number of raters (rows). |
262
+ | `missingCount` | `number` | Total number of missing annotations. |
263
+ | `ci` | `ConfidenceInterval` | Optional. Confidence interval if requested. |
264
+
265
+ **Example:**
266
+
267
+ ```ts
268
+ // Nominal data with missing values
269
+ const result = krippendorffAlpha(
270
+ [
271
+ ['A', null, 'C', 'A'],
272
+ ['A', 'B', 'C', null],
273
+ ['B', 'B', 'C', 'A'],
274
+ ],
275
+ );
276
+ // result.value: 0.7272...
277
+ // result.missingCount: 2
278
+ // result.annotatorCount: 3
279
+
280
+ // Interval-level numeric data
281
+ const interval = krippendorffAlpha(
282
+ [
283
+ [1, 2, 3, 4],
284
+ [1, 2, 4, 4],
285
+ ],
286
+ { level: 'interval' },
287
+ );
288
+ // interval.level: 'interval'
289
+ ```
290
+
291
+ **Disagreement functions by measurement level:**
292
+
293
+ | Level | Function | Description |
294
+ |-------|----------|-------------|
295
+ | `nominal` | `d = v === v' ? 0 : 1` | Binary: same or different. |
296
+ | `ordinal` | `d = v === v' ? 0 : 1` | Treated as nominal (rank-based extension planned). |
297
+ | `interval` | `d = (v - v')^2` | Squared numeric difference. |
298
+ | `ratio` | `d = (v - v')^2` | Squared numeric difference. |
299
+
300
+ ---
301
+
302
+ #### `gwetAC1(rater1, rater2, options?)`
303
+
304
+ Computes Gwet's AC1 for two raters. Designed to be robust to the prevalence and bias paradox that causes Cohen's Kappa to produce misleadingly low values when one category dominates.
305
+
306
+ **Parameters:**
307
+
308
+ | Parameter | Type | Description |
309
+ |-----------|------|-------------|
310
+ | `rater1` | `Label[]` | Labels assigned by the first rater. |
311
+ | `rater2` | `Label[]` | Labels assigned by the second rater. Must have the same length as `rater1`. |
312
+ | `options` | `AC1Options` | Optional. Reserved for future confidence interval support. |
313
+
314
+ **`AC1Options`:**
315
+
316
+ | Field | Type | Default | Description |
317
+ |-------|------|---------|-------------|
318
+ | `ci` | `boolean` | -- | Reserved for future confidence interval support. |
319
+ | `ciLevel` | `number` | -- | Confidence level. |
320
+ | `ciBootstrapSamples` | `number` | -- | Number of bootstrap resamples. |
321
+ | `seed` | `number` | -- | Seed for reproducible bootstrap sampling. |
322
+
323
+ **Returns:** `AC1Result`
324
+
325
+ | Field | Type | Description |
326
+ |-------|------|-------------|
327
+ | `metric` | `'gwets-ac1'` | Always `'gwets-ac1'`. |
328
+ | `value` | `number` | The computed AC1 coefficient. |
329
+ | `observed` | `number` | Observed agreement proportion. |
330
+ | `expected` | `number` | Gwet's expected agreement by chance. |
331
+ | `interpretation` | `Interpretation` | Landis & Koch interpretation label. |
332
+ | `ci` | `ConfidenceInterval` | Optional. Confidence interval if requested. |
333
+
334
+ **Example:**
335
+
336
+ ```ts
337
+ const result = gwetAC1(
338
+ ['Y', 'Y', 'Y', 'Y', 'Y', 'N'],
339
+ ['Y', 'Y', 'Y', 'Y', 'N', 'N'],
340
+ );
341
+ // result.value: 0.7333...
342
+ // result.observed: 0.8333...
343
+ ```
344
+
345
+ ---
346
+
347
+ ### Interpretation Functions
348
+
349
+ #### `interpretKappa(value)`
350
+
351
+ Classifies a kappa-family metric value using the Landis & Koch (1977) scale.
352
+
353
+ **Parameters:**
354
+
355
+ | Parameter | Type | Description |
356
+ |-----------|------|-------------|
357
+ | `value` | `number` | The kappa coefficient to interpret. |
358
+
359
+ **Returns:** `Interpretation` -- one of `'poor'`, `'slight'`, `'fair'`, `'moderate'`, `'substantial'`, `'almost-perfect'`.
360
+
361
+ **Scale:**
362
+
363
+ | Range | Interpretation |
364
+ |-------|---------------|
365
+ | < 0.00 | `'poor'` |
366
+ | 0.00 -- 0.20 | `'slight'` |
367
+ | 0.20 -- 0.40 | `'fair'` |
368
+ | 0.40 -- 0.60 | `'moderate'` |
369
+ | 0.60 -- 0.80 | `'substantial'` |
370
+ | >= 0.80 | `'almost-perfect'` |
371
+
372
+ ```ts
373
+ interpretKappa(0.75); // 'substantial'
374
+ interpretKappa(-0.1); // 'poor'
375
+ ```
376
+
377
+ ---
378
+
379
+ #### `interpretAlpha(value)`
380
+
381
+ Classifies a Krippendorff's Alpha value using Krippendorff's recommended thresholds.
382
+
383
+ **Parameters:**
384
+
385
+ | Parameter | Type | Description |
386
+ |-----------|------|-------------|
387
+ | `value` | `number` | The alpha coefficient to interpret. |
388
+
389
+ **Returns:** `AlphaInterpretation` -- one of `'unreliable'`, `'tentative'`, `'reliable'`.
390
+
391
+ **Scale:**
392
+
393
+ | Range | Interpretation |
394
+ |-------|---------------|
395
+ | < 0.667 | `'unreliable'` |
396
+ | 0.667 -- 0.800 | `'tentative'` |
397
+ | >= 0.800 | `'reliable'` |
398
+
399
+ ```ts
400
+ interpretAlpha(0.85); // 'reliable'
401
+ interpretAlpha(0.70); // 'tentative'
402
+ ```
403
+
404
+ ---
405
+
406
+ ### Validation Functions
407
+
408
+ #### `assertEqualLength(a, b, label?)`
409
+
410
+ Throws if arrays `a` and `b` do not have the same length.
411
+
412
+ ```ts
413
+ assertEqualLength([1, 2], [3, 4]); // passes
414
+ assertEqualLength([1, 2], [3], 'raters'); // throws: "Arrays must have equal length: got 2 and 1 raters"
415
+ ```
416
+
417
+ ---
418
+
419
+ #### `assertNonEmpty(arr, label?)`
420
+
421
+ Throws if `arr` is empty.
422
+
423
+ ```ts
424
+ assertNonEmpty([1, 2, 3]); // passes
425
+ assertNonEmpty([], 'subjects'); // throws: "Array must be non-empty subjects"
426
+ ```
427
+
428
+ ---
429
+
430
+ #### `assertConsistentRowLengths(matrix)`
431
+
432
+ Throws if the rows of a 2D matrix do not all have the same length.
433
+
434
+ ```ts
435
+ assertConsistentRowLengths([[1, 2], [3, 4]]); // passes
436
+ assertConsistentRowLengths([[1, 2], [3, 4, 5]]); // throws: "Matrix has inconsistent row lengths"
437
+ ```
438
+
439
+ ---
440
+
441
+ #### `assertConstantRowSums(matrix)`
442
+
443
+ Throws if rows of a numeric matrix do not all sum to the same value (within floating-point tolerance of 1e-9).
444
+
445
+ ```ts
446
+ assertConstantRowSums([[1, 2, 3], [2, 2, 2]]); // passes (both sum to 6)
447
+ assertConstantRowSums([[1, 2], [3, 4]]); // throws: "Category-count matrix rows must have equal sums"
448
+ ```
449
+
450
+ ---
451
+
452
+ #### `assertMinAnnotators(count)`
453
+
454
+ Throws if `count` is less than 2.
455
+
456
+ ```ts
457
+ assertMinAnnotators(3); // passes
458
+ assertMinAnnotators(1); // throws: "At least 2 annotators required, got 1"
459
+ ```
460
+
461
+ ---
462
+
463
+ #### `detectDuplicates(triples)`
464
+
465
+ Returns `(item, annotator)` pairs that appear more than once in an array of annotation triples. Each duplicate pair is returned only once regardless of how many times it appears.
466
+
467
+ **Parameters:**
468
+
469
+ | Parameter | Type | Description |
470
+ |-----------|------|-------------|
471
+ | `triples` | `Array<{ item: unknown; annotator: unknown }>` | Array of annotation triples. |
472
+
473
+ **Returns:** `Array<{ item: unknown; annotator: unknown }>` -- the duplicate pairs.
474
+
475
+ ```ts
476
+ const dupes = detectDuplicates([
477
+ { item: 1, annotator: 'A' },
478
+ { item: 1, annotator: 'A' },
479
+ { item: 2, annotator: 'B' },
480
+ ]);
481
+ // [{ item: 1, annotator: 'A' }]
482
+ ```
483
+
484
+ ---
485
+
486
+ ### Types
487
+
488
+ All TypeScript types are exported from the package entry point.
489
+
490
+ #### Core Types
491
+
492
+ | Type | Description |
493
+ |------|-------------|
494
+ | `Label` | `string \| number` -- a single annotation label. |
495
+ | `MeasurementLevel` | `'nominal' \| 'ordinal' \| 'interval' \| 'ratio'` |
496
+ | `MetricName` | `'cohens-kappa' \| 'fleiss-kappa' \| 'krippendorff-alpha' \| 'scotts-pi' \| 'gwets-ac1' \| 'percent-agreement'` |
497
+ | `Interpretation` | `'poor' \| 'slight' \| 'fair' \| 'moderate' \| 'substantial' \| 'almost-perfect'` |
498
+ | `AlphaInterpretation` | `'unreliable' \| 'tentative' \| 'reliable'` |
499
+
500
+ #### Data Types
501
+
502
+ | Type | Description |
503
+ |------|-------------|
504
+ | `AnnotationTriple` | `{ item: string \| number; annotator: string \| number; label: Label }` |
505
+ | `ConfusionMatrix` | `{ labels: Label[]; matrix: number[][] }` -- `matrix[i][j]` = count where rater 1 said `labels[i]` and rater 2 said `labels[j]`. |
506
+ | `ConfidenceInterval` | `{ lower: number; upper: number; level: number }` |
507
+
508
+ #### Result Types
509
+
510
+ | Type | Description |
511
+ |------|-------------|
512
+ | `KappaResult` | Result from `cohenKappa`. Fields: `metric`, `value`, `observed`, `expected`, `interpretation`, `categories?`, `ci?`. |
513
+ | `FleissKappaResult` | Result from `fleissKappa`. Fields: `metric`, `value`, `observed`, `expected`, `interpretation`, `annotatorCount`, `itemCount`, `ci?`. |
514
+ | `PiResult` | Result from `scottPi`. Fields: `metric`, `value`, `observed`, `expected`, `interpretation`, `categories?`, `ci?`. |
515
+ | `AlphaResult` | Result from `krippendorffAlpha`. Fields: `metric`, `value`, `interpretation`, `level`, `itemCount`, `annotatorCount`, `missingCount`, `ci?`. |
516
+ | `AC1Result` | Result from `gwetAC1`. Fields: `metric`, `value`, `observed`, `expected`, `interpretation`, `ci?`. |
517
+ | `AgreementReport` | `{ metric: MetricName; value: number; interpretation: Interpretation \| AlphaInterpretation; ci?: ConfidenceInterval }` |
518
+
519
+ #### Options Types
520
+
521
+ | Type | Description |
522
+ |------|-------------|
523
+ | `CohensKappaOptions` | `{ weighted?, weights?, ci?, ciLevel?, ciBootstrapSamples?, seed? }` |
524
+ | `FleissKappaOptions` | `{ ci?, ciLevel?, ciBootstrapSamples?, seed? }` |
525
+ | `KrippendorffOptions` | `{ level?, ci?, ciLevel?, ciBootstrapSamples?, seed?, missingData? }` |
526
+ | `ScottsPiOptions` | `{ ci?, ciLevel?, ciBootstrapSamples?, seed? }` |
527
+ | `AC1Options` | `{ ci?, ciLevel?, ciBootstrapSamples?, seed? }` |
528
+ | `AgreementOptions` | `{ metric?, level?, missingData?, ci?, ciLevel?, seed? }` |
529
+ | `CIOptions` | `{ level?, bootstrapSamples?, seed? }` |
530
+
531
+ ---
532
+
533
+ ## Configuration
534
+
535
+ Each metric function accepts an optional options object as its last parameter. All options fields are optional and have sensible defaults.
536
+
537
+ **Cohen's Kappa weighting:**
538
+
539
+ ```ts
540
+ // Unweighted (default) -- for nominal/categorical data
541
+ cohenKappa(rater1, rater2);
542
+
543
+ // Linear weights -- penalizes disagreements proportionally to distance
544
+ cohenKappa(rater1, rater2, { weighted: true, weights: 'linear' });
545
+
546
+ // Quadratic weights -- penalizes far disagreements more heavily
547
+ cohenKappa(rater1, rater2, { weighted: true, weights: 'quadratic' });
548
+ ```
549
+
550
+ **Krippendorff's Alpha measurement level:**
551
+
552
+ ```ts
553
+ // Nominal (default) -- categories with no order
554
+ krippendorffAlpha(matrix);
555
+
556
+ // Interval -- numeric data with meaningful distances
557
+ krippendorffAlpha(matrix, { level: 'interval' });
558
+
559
+ // Ratio -- numeric data with a true zero
560
+ krippendorffAlpha(matrix, { level: 'ratio' });
561
+ ```
562
+
563
+ **Missing data handling:**
564
+
565
+ ```ts
566
+ // Exclude missing values (default)
567
+ krippendorffAlpha(matrix, { missingData: 'exclude' });
568
+
569
+ // Throw an error if any value is missing
570
+ krippendorffAlpha(matrix, { missingData: 'error' });
571
+ ```
572
+
573
+ ---
574
+
575
+ ## Error Handling
576
+
577
+ All metric functions validate their inputs and throw descriptive `Error` instances on invalid data.
578
+
579
+ | Condition | Error Message |
580
+ |-----------|--------------|
581
+ | Empty input array | `"Array must be non-empty"` |
582
+ | Rater arrays of different lengths | `"Arrays must have equal length: got X and Y"` |
583
+ | Matrix rows with inconsistent lengths | `"Matrix has inconsistent row lengths"` |
584
+ | Matrix rows with different sums | `"Category-count matrix rows must have equal sums"` |
585
+ | Fewer than 2 annotators | `"At least 2 annotators required, got N"` |
586
+ | Fewer than 2 categories (Fleiss) | `"Fleiss Kappa requires at least 2 categories"` |
587
+ | Missing data with `missingData: 'error'` | `"Missing data found at rater R, item C"` |
588
+
589
+ All errors are synchronous and thrown immediately during input validation, before any computation begins.
590
+
591
+ ---
592
+
593
+ ## Advanced Usage
594
+
595
+ ### Choosing Between Metrics
596
+
597
+ | Scenario | Recommended Metric |
598
+ |----------|-------------------|
599
+ | Two raters, nominal categories | `cohenKappa` (unweighted) |
600
+ | Two raters, ordinal scale | `cohenKappa` with `weighted: true` |
601
+ | Two raters, possible rater bias | `scottPi` (pooled marginals) |
602
+ | Two raters, skewed category distribution | `gwetAC1` (prevalence-robust) |
603
+ | Three or more raters, nominal | `fleissKappa` |
604
+ | Any number of raters, missing data | `krippendorffAlpha` |
605
+ | Any number of raters, interval/ratio data | `krippendorffAlpha` with `level` option |
606
+
607
+ ### Comparing Cohen's Kappa, Scott's Pi, and Gwet's AC1
608
+
609
+ When both raters have identical marginal distributions, Scott's Pi and Cohen's Kappa produce the same value. They diverge when raters have different biases (e.g., one rater assigns "positive" more frequently).
610
+
611
+ ```ts
612
+ // Symmetric marginals -- Pi and Kappa agree
613
+ const r1 = ['A', 'A', 'B', 'B', 'C', 'C'];
614
+ const r2 = ['A', 'B', 'A', 'B', 'C', 'C'];
615
+ scottPi(r1, r2).value; // same as cohenKappa(r1, r2).value
616
+
617
+ // Skewed data -- AC1 is more stable than Kappa
618
+ const r1Skew = ['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N'];
619
+ const r2Skew = ['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N'];
620
+ gwetAC1(r1Skew, r2Skew).value; // higher than cohenKappa
621
+ cohenKappa(r1Skew, r2Skew).value; // suppressed by prevalence effect
622
+ ```
623
+
624
+ ### Handling Edge Cases
625
+
626
+ When all items fall into a single category (Pe = 1), the kappa formula `(Po - Pe) / (1 - Pe)` would produce a division by zero. In this case, all metrics return `1.0` (perfect agreement) since every item receives the same label from every rater.
627
+
628
+ ```ts
629
+ const allSame = cohenKappa(['A', 'A', 'A'], ['A', 'A', 'A']);
630
+ // allSame.value === 1.0
631
+ ```
632
+
633
+ ### Pre-validating Annotation Data
634
+
635
+ Use the validation utilities to check data integrity before computing metrics.
636
+
637
+ ```ts
638
+ import {
639
+ assertEqualLength,
640
+ assertNonEmpty,
641
+ detectDuplicates,
642
+ } from 'label-score';
643
+
644
+ const triples = [
645
+ { item: 1, annotator: 'A', label: 'pos' },
646
+ { item: 1, annotator: 'A', label: 'neg' }, // duplicate!
647
+ { item: 2, annotator: 'B', label: 'pos' },
648
+ ];
649
+
650
+ const dupes = detectDuplicates(triples);
651
+ if (dupes.length > 0) {
652
+ console.error('Duplicate annotations found:', dupes);
653
+ }
654
+ ```
655
+
656
+ ---
657
+
658
+ ## TypeScript
659
+
660
+ `label-score` is written in TypeScript and ships type declarations (`dist/index.d.ts`) alongside the compiled JavaScript. All public types are available as named imports.
661
+
662
+ ```ts
663
+ import type {
664
+ Label,
665
+ KappaResult,
666
+ FleissKappaResult,
667
+ AlphaResult,
668
+ PiResult,
669
+ AC1Result,
670
+ Interpretation,
671
+ AlphaInterpretation,
672
+ MeasurementLevel,
673
+ MetricName,
674
+ AnnotationTriple,
675
+ ConfusionMatrix,
676
+ ConfidenceInterval,
677
+ CohensKappaOptions,
678
+ FleissKappaOptions,
679
+ KrippendorffOptions,
680
+ ScottsPiOptions,
681
+ AC1Options,
682
+ AgreementOptions,
683
+ CIOptions,
684
+ AgreementReport,
685
+ } from 'label-score';
686
+ ```
687
+
688
+ The package targets ES2022 and uses CommonJS module format. Compiler options include `strict: true`, `declaration: true`, and `declarationMap: true` for full IDE support.
689
+
690
+ ---
691
+
692
+ ## License
693
+
694
+ MIT
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=cohen-kappa.test.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cohen-kappa.test.d.ts","sourceRoot":"","sources":["../../src/__tests__/cohen-kappa.test.ts"],"names":[],"mappings":""}