puzlink 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +21 -0
- package/README.md +35 -0
- package/dist/data/answerLengths.d.ts +10 -0
- package/dist/data/answerLengths.d.ts.map +1 -0
- package/dist/data/answerLengths.js +63 -0
- package/dist/data/answerLengths.js.map +1 -0
- package/dist/data/categories/compass.d.ts +3 -0
- package/dist/data/categories/compass.d.ts.map +1 -0
- package/dist/data/categories/compass.js +11 -0
- package/dist/data/categories/compass.js.map +1 -0
- package/dist/data/categories/countryAlpha2.d.ts +3 -0
- package/dist/data/categories/countryAlpha2.d.ts.map +1 -0
- package/dist/data/categories/countryAlpha2.js +252 -0
- package/dist/data/categories/countryAlpha2.js.map +1 -0
- package/dist/data/categories/countryAlpha3.d.ts +3 -0
- package/dist/data/categories/countryAlpha3.d.ts.map +1 -0
- package/dist/data/categories/countryAlpha3.js +252 -0
- package/dist/data/categories/countryAlpha3.js.map +1 -0
- package/dist/data/categories/daysOfTheWeek.d.ts +3 -0
- package/dist/data/categories/daysOfTheWeek.d.ts.map +1 -0
- package/dist/data/categories/daysOfTheWeek.js +10 -0
- package/dist/data/categories/daysOfTheWeek.js.map +1 -0
- package/dist/data/categories/elementSymbols.d.ts +3 -0
- package/dist/data/categories/elementSymbols.d.ts.map +1 -0
- package/dist/data/categories/elementSymbols.js +121 -0
- package/dist/data/categories/elementSymbols.js.map +1 -0
- package/dist/data/categories/greekLetters.d.ts +3 -0
- package/dist/data/categories/greekLetters.d.ts.map +1 -0
- package/dist/data/categories/greekLetters.js +27 -0
- package/dist/data/categories/greekLetters.js.map +1 -0
- package/dist/data/categories/months.d.ts +3 -0
- package/dist/data/categories/months.d.ts.map +1 -0
- package/dist/data/categories/months.js +15 -0
- package/dist/data/categories/months.js.map +1 -0
- package/dist/data/categories/natoAlphabet.d.ts +3 -0
- package/dist/data/categories/natoAlphabet.d.ts.map +1 -0
- package/dist/data/categories/natoAlphabet.js +29 -0
- package/dist/data/categories/natoAlphabet.js.map +1 -0
- package/dist/data/categories/numbers.d.ts +3 -0
- package/dist/data/categories/numbers.d.ts.map +1 -0
- package/dist/data/categories/numbers.js +16 -0
- package/dist/data/categories/numbers.js.map +1 -0
- package/dist/data/categories/romanNumerals.d.ts +3 -0
- package/dist/data/categories/romanNumerals.d.ts.map +1 -0
- package/dist/data/categories/romanNumerals.js +134 -0
- package/dist/data/categories/romanNumerals.js.map +1 -0
- package/dist/data/categories/solfege.d.ts +3 -0
- package/dist/data/categories/solfege.d.ts.map +1 -0
- package/dist/data/categories/solfege.js +11 -0
- package/dist/data/categories/solfege.js.map +1 -0
- package/dist/data/categories/usStateAbbreviations.d.ts +3 -0
- package/dist/data/categories/usStateAbbreviations.d.ts.map +1 -0
- package/dist/data/categories/usStateAbbreviations.js +53 -0
- package/dist/data/categories/usStateAbbreviations.js.map +1 -0
- package/dist/data/categories.d.ts +10 -0
- package/dist/data/categories.d.ts.map +1 -0
- package/dist/data/categories.js +31 -0
- package/dist/data/categories.js.map +1 -0
- package/dist/data/knownLogProbs.d.ts +6 -0
- package/dist/data/knownLogProbs.d.ts.map +1 -0
- package/dist/data/knownLogProbs.js +2975 -0
- package/dist/data/knownLogProbs.js.map +1 -0
- package/dist/data/morse.d.ts +2 -0
- package/dist/data/morse.d.ts.map +1 -0
- package/dist/data/morse.js +29 -0
- package/dist/data/morse.js.map +1 -0
- package/dist/data/scrabble.d.ts +2 -0
- package/dist/data/scrabble.d.ts.map +1 -0
- package/dist/data/scrabble.js +29 -0
- package/dist/data/scrabble.js.map +1 -0
- package/dist/features/index.d.ts +32 -0
- package/dist/features/index.d.ts.map +1 -0
- package/dist/features/index.js +79 -0
- package/dist/features/index.js.map +1 -0
- package/dist/features/letterCount.d.ts +7 -0
- package/dist/features/letterCount.d.ts.map +1 -0
- package/dist/features/letterCount.js +121 -0
- package/dist/features/letterCount.js.map +1 -0
- package/dist/features/letterSequence.d.ts +7 -0
- package/dist/features/letterSequence.d.ts.map +1 -0
- package/dist/features/letterSequence.js +155 -0
- package/dist/features/letterSequence.js.map +1 -0
- package/dist/features/logProbCache.d.ts +16 -0
- package/dist/features/logProbCache.d.ts.map +1 -0
- package/dist/features/logProbCache.js +36 -0
- package/dist/features/logProbCache.js.map +1 -0
- package/dist/features/other.d.ts +4 -0
- package/dist/features/other.d.ts.map +1 -0
- package/dist/features/other.js +190 -0
- package/dist/features/other.js.map +1 -0
- package/dist/features/substring.d.ts +3 -0
- package/dist/features/substring.d.ts.map +1 -0
- package/dist/features/substring.js +146 -0
- package/dist/features/substring.js.map +1 -0
- package/dist/features/wordplay.d.ts +7 -0
- package/dist/features/wordplay.d.ts.map +1 -0
- package/dist/features/wordplay.js +387 -0
- package/dist/features/wordplay.js.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +3 -0
- package/dist/index.js.map +1 -0
- package/dist/lib/affixDistribution.d.ts +26 -0
- package/dist/lib/affixDistribution.d.ts.map +1 -0
- package/dist/lib/affixDistribution.js +105 -0
- package/dist/lib/affixDistribution.js.map +1 -0
- package/dist/lib/counter.d.ts +23 -0
- package/dist/lib/counter.d.ts.map +1 -0
- package/dist/lib/counter.js +55 -0
- package/dist/lib/counter.js.map +1 -0
- package/dist/lib/distribution.d.ts +40 -0
- package/dist/lib/distribution.d.ts.map +1 -0
- package/dist/lib/distribution.js +176 -0
- package/dist/lib/distribution.js.map +1 -0
- package/dist/lib/lengthDistribution.d.ts +30 -0
- package/dist/lib/lengthDistribution.d.ts.map +1 -0
- package/dist/lib/lengthDistribution.js +137 -0
- package/dist/lib/lengthDistribution.js.map +1 -0
- package/dist/lib/letterBitset.d.ts +49 -0
- package/dist/lib/letterBitset.d.ts.map +1 -0
- package/dist/lib/letterBitset.js +101 -0
- package/dist/lib/letterBitset.js.map +1 -0
- package/dist/lib/letterDistribution.d.ts +60 -0
- package/dist/lib/letterDistribution.d.ts.map +1 -0
- package/dist/lib/letterDistribution.js +230 -0
- package/dist/lib/letterDistribution.js.map +1 -0
- package/dist/lib/letterIndices.d.ts +13 -0
- package/dist/lib/letterIndices.d.ts.map +1 -0
- package/dist/lib/letterIndices.js +41 -0
- package/dist/lib/letterIndices.js.map +1 -0
- package/dist/lib/logCounter.d.ts +23 -0
- package/dist/lib/logCounter.d.ts.map +1 -0
- package/dist/lib/logCounter.js +49 -0
- package/dist/lib/logCounter.js.map +1 -0
- package/dist/lib/logNum.d.ts +36 -0
- package/dist/lib/logNum.d.ts.map +1 -0
- package/dist/lib/logNum.js +193 -0
- package/dist/lib/logNum.js.map +1 -0
- package/dist/lib/memoize.d.ts +5 -0
- package/dist/lib/memoize.d.ts.map +1 -0
- package/dist/lib/memoize.js +104 -0
- package/dist/lib/memoize.js.map +1 -0
- package/dist/lib/util.d.ts +30 -0
- package/dist/lib/util.d.ts.map +1 -0
- package/dist/lib/util.js +111 -0
- package/dist/lib/util.js.map +1 -0
- package/dist/lib/wordlist.d.ts +66 -0
- package/dist/lib/wordlist.d.ts.map +1 -0
- package/dist/lib/wordlist.js +166 -0
- package/dist/lib/wordlist.js.map +1 -0
- package/dist/linkers/index.d.ts +34 -0
- package/dist/linkers/index.d.ts.map +1 -0
- package/dist/linkers/index.js +25 -0
- package/dist/linkers/index.js.map +1 -0
- package/dist/linkers/indexing.d.ts +5 -0
- package/dist/linkers/indexing.d.ts.map +1 -0
- package/dist/linkers/indexing.js +152 -0
- package/dist/linkers/indexing.js.map +1 -0
- package/dist/linkers/length.d.ts +5 -0
- package/dist/linkers/length.d.ts.map +1 -0
- package/dist/linkers/length.js +101 -0
- package/dist/linkers/length.js.map +1 -0
- package/dist/linkers/letterDistribution.d.ts +4 -0
- package/dist/linkers/letterDistribution.d.ts.map +1 -0
- package/dist/linkers/letterDistribution.js +46 -0
- package/dist/linkers/letterDistribution.js.map +1 -0
- package/dist/linkers/other.d.ts +5 -0
- package/dist/linkers/other.d.ts.map +1 -0
- package/dist/linkers/other.js +90 -0
- package/dist/linkers/other.js.map +1 -0
- package/dist/parse.d.ts +8 -0
- package/dist/parse.d.ts.map +1 -0
- package/dist/parse.js +23 -0
- package/dist/parse.js.map +1 -0
- package/dist/puzlink.d.ts +84 -0
- package/dist/puzlink.d.ts.map +1 -0
- package/dist/puzlink.js +59 -0
- package/dist/puzlink.js.map +1 -0
- package/package.json +57 -0
- package/src/data/answerLengths.ts +63 -0
- package/src/data/categories/README.md +3 -0
- package/src/data/categories/compass.ts +1 -0
- package/src/data/categories/countryAlpha2.ts +251 -0
- package/src/data/categories/countryAlpha3.ts +251 -0
- package/src/data/categories/daysOfTheWeek.ts +1 -0
- package/src/data/categories/elementSymbols.ts +120 -0
- package/src/data/categories/greekLetters.ts +26 -0
- package/src/data/categories/months.ts +14 -0
- package/src/data/categories/natoAlphabet.ts +28 -0
- package/src/data/categories/numbers.ts +15 -0
- package/src/data/categories/romanNumerals.ts +133 -0
- package/src/data/categories/solfege.ts +1 -0
- package/src/data/categories/txt/compass.txt +8 -0
- package/src/data/categories/txt/daysOfTheWeek.txt +7 -0
- package/src/data/categories/txt/elementSymbols.txt +118 -0
- package/src/data/categories/txt/greekLetters.txt +24 -0
- package/src/data/categories/txt/months.txt +12 -0
- package/src/data/categories/txt/natoAlphabet.txt +26 -0
- package/src/data/categories/txt/numbers.txt +13 -0
- package/src/data/categories/txt/solfege.txt +8 -0
- package/src/data/categories/txt/usStateAbbreviations.txt +50 -0
- package/src/data/categories/usStateAbbreviations.ts +52 -0
- package/src/data/categories.ts +42 -0
- package/src/data/knownLogProbs.ts +2992 -0
- package/src/data/morse.ts +28 -0
- package/src/data/scrabble.ts +28 -0
- package/src/features/index.ts +120 -0
- package/src/features/letterCount.ts +174 -0
- package/src/features/letterSequence.ts +222 -0
- package/src/features/logProbCache.ts +48 -0
- package/src/features/other.ts +214 -0
- package/src/features/substring.ts +173 -0
- package/src/features/wordplay.ts +428 -0
- package/src/index.ts +3 -0
- package/src/lib/affixDistribution.ts +70 -0
- package/src/lib/counter.ts +71 -0
- package/src/lib/distribution.ts +162 -0
- package/src/lib/lengthDistribution.ts +108 -0
- package/src/lib/letterBitset.ts +123 -0
- package/src/lib/letterDistribution.ts +236 -0
- package/src/lib/letterIndices.ts +51 -0
- package/src/lib/logCounter.ts +74 -0
- package/src/lib/logNum.ts +193 -0
- package/src/lib/memoize.ts +136 -0
- package/src/lib/testUtils.ts +1 -0
- package/src/lib/util.ts +150 -0
- package/src/lib/wordlist.ts +162 -0
- package/src/linkers/index.ts +56 -0
- package/src/linkers/indexing.ts +194 -0
- package/src/linkers/length.ts +122 -0
- package/src/linkers/other.ts +117 -0
- package/src/parse.ts +20 -0
- package/src/puzlink.ts +141 -0
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
import { cumulativeStdNormalProbability as normCdf } from "simple-statistics";
|
|
2
|
+
import { LogCounter } from "./logCounter.js";
|
|
3
|
+
import { LogNum } from "./logNum.js";
|
|
4
|
+
import { memoize } from "./memoize.js";
|
|
5
|
+
import { interval } from "./util.js";
|
|
6
|
+
|
|
7
|
+
/**
 * A discrete probability distribution over items of type `T`.
 *
 * Each item maps to its frequency, stored as a `LogNum`. Items that are absent
 * from the map are treated as having frequency zero.
 */
export class Distribution<T extends PropertyKey> {
  private readonly frequencies: ReadonlyMap<T, LogNum>;

  /** @param frequencies Map from each item to its frequency. */
  constructor(frequencies: ReadonlyMap<T, LogNum>) {
    this.frequencies = frequencies;
  }

  /**
   * Build a distribution from a string, an array of items, or an existing
   * `LogCounter`; anything that is not already a counter is counted first.
   */
  static from(data: string): Distribution<string>;
  static from<T extends PropertyKey>(data: readonly T[]): Distribution<T>;
  static from<T extends PropertyKey>(counter: LogCounter<T>): Distribution<T>;
  static from(data: string | readonly PropertyKey[] | LogCounter<PropertyKey>) {
    const counter =
      data instanceof LogCounter ? data : LogCounter.from(data as string);
    return new Distribution(new Map(counter.frequencies()));
  }

  /** Get the frequency of the given item (zero if the item is unknown). */
  get(item: T): LogNum {
    return this.frequencies.get(item) ?? LogNum.from(0);
  }

  /** Returns an iterable of [item, log probability] pairs. */
  entries(): IterableIterator<[T, LogNum]> {
    return this.frequencies.entries();
  }

  /** k-th moment of the distribution: the sum of `freq^k` over all items. */
  @memoize()
  moment(k: number): LogNum {
    return LogNum.sum(
      Array.from(this.frequencies.values(), (freq) => freq.pow(k)),
    );
  }

  /**
   * Log probability that k items drawn from the distribution are all equal.
   * Zero or fewer items are vacuously equal (probability 1).
   */
  probEqual(k: number): LogNum {
    if (k <= 0) {
      return LogNum.from(1);
    }
    return this.moment(k);
  }

  /**
   * Log probability that k items drawn from the distribution have two distinct
   * values.
   */
  @memoize()
  probTwoDistinct(k: number): LogNum {
    // For every split of the k draws into i of one value and k - i of another,
    // count the ways to choose the positions times the probability that each
    // side is internally equal.
    const probs = [];
    for (const i of interval(0, k)) {
      const j = k - i;
      probs.push(
        LogNum.fromBinomial(k, i).mul(this.probEqual(i)).mul(this.probEqual(j)),
      );
    }
    // Case where all are equal is counted 2^k times; others are counted twice:
    return LogNum.sum(probs)
      .sub(LogNum.from(2).pow(k).mul(this.probEqual(k)))
      .div(LogNum.from(2));
  }

  /**
   * Log probability that k items drawn from the distribution are all equal,
   * with exactly one exception.
   *
   * For k < 2 there is no majority for an item to be the exception to, so the
   * result is 0. For k = 2 the event is simply "the two items differ".
   */
  @memoize()
  probAlmostEqual(k: number): LogNum {
    if (k < 2) {
      return LogNum.from(0);
    }
    if (k === 2) {
      // With two items either one could be called "the exception", so the
      // general formula below would count every outcome twice.
      return LogNum.from(1).sub(this.moment(2));
    }

    const probs = [];
    for (const [, freq] of this.frequencies) {
      // The exception takes this item's value (freq); the other k - 1 draws
      // share a different value (moment(k - 1) minus this item's own term);
      // the exception may sit at any of the k positions.
      probs.push(
        this.moment(k - 1)
          .sub(freq.pow(k - 1))
          .mul(freq)
          .mul(LogNum.from(k)),
      );
    }
    return LogNum.sum(probs);
  }

  /**
   * Map the items of the distribution, returning the new distribution.
   * Items that map to the same value have their frequencies added together.
   */
  map<U extends PropertyKey>(fn: (item: T) => U): Distribution<U> {
    const frequencies = new Map<U, LogNum>();
    for (const [item, freq] of this.frequencies) {
      const mapped = fn(item);
      frequencies.set(
        mapped,
        (frequencies.get(mapped) ?? LogNum.from(0)).add(freq),
      );
    }
    return new Distribution(frequencies);
  }

  /**
   * Chi-squared test statistic against an observed distribution.
   *
   * Returns infinity when `observed` contains an item this distribution does
   * not know about.
   */
  chi2(observed: LogCounter<T>): LogNum {
    for (const [item] of observed.entries()) {
      if (!this.frequencies.has(item)) {
        return LogNum.from(Infinity);
      }
    }
    const n = observed.total;
    const partials = [];
    for (const [item, freq] of this.frequencies) {
      const expected = n.mul(freq);
      const actual = observed.get(item);
      // Under chi-squared assumptions, the (expected - actual)^2/expected
      // should be iid N(0, 1)^2.
      partials.push(expected.absSub(actual).pow(2).div(expected));
    }
    return LogNum.sum(partials);
  }

  /**
   * Log probability that an unordered distribution is drawn from this.
   *
   * The chi-squared statistic with `df` degrees of freedom is approximated by
   * a normal distribution with mean `df` and variance `2 * df`.
   */
  probUnordered(observed: LogCounter<T>): LogNum {
    const df = this.frequencies.size - 1;
    const chi2 = this.chi2(observed).toNum();
    if (df <= 0) {
      // With at most one category the normal approximation would divide by
      // zero. The observation either fits exactly or contains an unknown item.
      return LogNum.from(chi2 > 0 ? 0 : 1);
    }
    const z = (chi2 - df) / Math.sqrt(2 * df);
    return LogNum.from(1 - normCdf(z));
  }

  /**
   * Over- and under-represented items, at the given sigma.
   *
   * @param observed Observed counts.
   * @param sigma Number of standard deviations beyond which an item counts as
   *   an outlier.
   * @returns `high` maps over-represented items to `actual - expected`; `low`
   *   maps under-represented items to `expected - actual`.
   */
  outliers(
    observed: LogCounter<T>,
    sigma = 2,
  ): {
    high: Record<T, LogNum>;
    low: Record<T, LogNum>;
  } {
    const n = observed.total;
    const low = {} as Record<T, LogNum>;
    const high = {} as Record<T, LogNum>;

    // Consider every known item plus any observed item we have no frequency
    // for (its expected count is zero).
    const keys = [
      ...this.frequencies.keys(),
      ...observed.filterKeys((key) => !this.frequencies.has(key)),
    ];
    const threshold = LogNum.from(sigma ** 2);

    for (const item of keys) {
      const freq = this.frequencies.get(item) ?? LogNum.from(0);
      const expected = n.mul(freq);
      const actual = observed.get(item);

      // We assume (expected - actual)^2/expected should be distributed as
      // N(0, 1)^2; thus if it's over sigma^2, it's an outlier.
      if (expected.absSub(actual).pow(2).div(expected).gt(threshold)) {
        if (expected.gt(actual)) {
          low[item] = expected.sub(actual);
        } else {
          high[item] = actual.sub(expected);
        }
      }
    }

    return { high, low };
  }
}
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
import { Distribution } from "./distribution.js";
|
|
2
|
+
import { LogNum } from "./logNum.js";
|
|
3
|
+
import { memoize } from "./memoize.js";
|
|
4
|
+
import { interval, windows } from "./util.js";
|
|
5
|
+
|
|
6
|
+
/**
 * Probabilities about answer lengths, derived from a distribution over
 * lengths.
 */
export class LengthDistribution {
  readonly distribution: Distribution<number>;
  private readonly distMod2: Distribution<number>;
  private readonly distMod3: Distribution<number>;
  /**
   * Smallest length such that more than 99.9% of answers have length at most
   * this (or the largest known length if that threshold is never crossed).
   */
  private readonly maxLength: number;

  constructor(distribution: Distribution<number>) {
    this.distribution = distribution;
    this.distMod2 = this.distribution.map((length) => length % 2);
    this.distMod3 = this.distribution.map((length) => length % 3);

    // Accumulate in ascending length order so the cutoff does not depend on
    // the insertion order of the underlying map.
    const byLength = [...distribution.entries()].sort(([a], [b]) => a - b);
    const threshold = LogNum.from(0.999);

    // Fall back to the largest known length (0 when empty) so that maxLength
    // is always finite; probConsecutive builds a range up to it.
    let maxLength = byLength[byLength.length - 1]?.[0] ?? 0;
    let totalProb = LogNum.from(0);
    for (const [length, freq] of byLength) {
      totalProb = totalProb.add(freq);
      if (totalProb.gt(threshold)) {
        maxLength = length;
        break;
      }
    }
    this.maxLength = maxLength;
  }

  /** Build a length distribution from a map of length to frequency. */
  static from(data: Map<number, LogNum>): LengthDistribution {
    return new LengthDistribution(new Distribution(data));
  }

  /** Log probability that k words have the same length. */
  probEqual(k: number): LogNum {
    return this.distribution.probEqual(k);
  }

  /** Log probability that k words have the same length, except for one. */
  probAlmostEqual(k: number): LogNum {
    return this.distribution.probAlmostEqual(k);
  }

  /** Log probability that k words have the same length modulo 2. */
  probEqualMod2(k: number): LogNum {
    return this.distMod2.probEqual(k);
  }

  /** Log probability that k words have the same length modulo 3. */
  probEqualMod3(k: number): LogNum {
    return this.distMod3.probEqual(k);
  }

  /**
   * Log probability that k words have consecutive lengths.
   *
   * Only lengths from 1 up to `maxLength` are considered; asking for more
   * words than that yields 0.
   */
  @memoize()
  probConsecutive(k: number): LogNum {
    if (k <= 1) {
      return LogNum.from(1);
    } else if (k > this.maxLength) {
      return LogNum.from(0);
    }

    const range = interval(1, this.maxLength).map((i) =>
      this.distribution.get(i),
    );
    // One partial per run of k adjacent lengths.
    const partials = Array.from(windows(range, k), (window) =>
      LogNum.prod(window),
    );

    // The k words may appear in any order.
    return LogNum.fromFactorial(k).mul(LogNum.sum(partials));
  }

  /** Log probability that k words have exactly two distinct lengths. */
  probTwoDistinct(k: number): LogNum {
    return this.distribution.probTwoDistinct(k);
  }

  /**
   * Log probability that k words have distinct lengths, all at least min.
   */
  @memoize(2)
  probDistinct(k: number, min = 0): LogNum {
    if (k <= 0) {
      return LogNum.from(1);
    } else if (min > this.maxLength) {
      return LogNum.from(0);
    }

    // Pick the smallest length, then require the remaining k - 1 words to have
    // distinct, strictly larger lengths.
    const probs = [];
    for (const [length, freq] of this.distribution.entries()) {
      if (length < min) {
        continue;
      }
      probs.push(freq.mul(this.probDistinct(k - 1, length + 1)));
    }

    // The factor k accounts for which of the k words holds the smallest length.
    return probs.length === 0
      ? LogNum.from(0)
      : LogNum.from(k).mul(LogNum.sum(probs));
  }

  /**
   * Log probability that k words can be paired by length. Odd k yields 0.
   *
   * NOTE(review): for even k this returns `probDistinct(k / 2)` unchanged —
   * confirm that this is the intended estimate.
   */
  probPaired(k: number): LogNum {
    if (k % 2 !== 0) {
      return LogNum.from(0);
    }
    return this.probDistinct(Math.floor(k / 2));
  }
}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
/**
 * A class that uses a single bigint to store counts for each lowercase letter,
 * for fast-ish comparison.
 *
 * Each letter gets a 5-bit field, so a count above 31 for one letter carries
 * into the next letter's field. Characters outside a-z are ignored.
 *
 * Idea stolen from Collective.jl:
 * https://github.com/rdeits/Collective.jl/blob/master/src/bitstally.jl
 */
export class LetterBitset {
  private static readonly bits = 5n;
  private static readonly mask = (1n << LetterBitset.bits) - 1n;
  private static readonly offsets = Array(26)
    .fill(0)
    .map((_, i) => LetterBitset.bits * BigInt(i));
  private static readonly letterMasks = LetterBitset.offsets.map(
    (x) => 1n << x,
  );

  /**
   * This is a (26 * 5)-bit integer; each 5-bit block is a count for a letter.
   * Unclear how efficient this'll be for different engines...
   *
   * Note that 26 * 5 = 130, so these *mostly* fit in 128-bit integers, unless
   * there's more than 7 'z's.
   */
  readonly data: bigint;

  constructor(data: bigint) {
    this.data = data;
  }

  /** Index of a letter's field: 0 for "a" through 25 for "z". */
  private static toIndex(letter: string) {
    return letter.charCodeAt(0) - 97;
  }

  /** Inverse of `toIndex`: 0 gives "a", 25 gives "z". */
  private static fromIndex(index: number) {
    return String.fromCharCode(97 + index);
  }

  /**
   * Create a new LetterBitset from a slug. Characters outside a-z are skipped,
   * matching the behavior of `add` and `sub`.
   */
  static from(slug: string) {
    let data = 0n;
    for (const char of slug) {
      data += LetterBitset.letterMasks[LetterBitset.toIndex(char)] ?? 0n;
    }
    return new LetterBitset(data);
  }

  /**
   * Count the number of times the given letter appears in this bitset.
   * Returns 0 for anything that is not a letter a-z.
   */
  index(letter: string): number {
    const offset = LetterBitset.offsets[LetterBitset.toIndex(letter)];
    if (offset === undefined) {
      return 0;
    }
    return Number((this.data >> offset) & LetterBitset.mask);
  }

  /** Whether both bitsets hold exactly the same letter counts. */
  equals(other: LetterBitset): boolean {
    return this.data === other.data;
  }

  /** Return a new bitset with one more of `char` (no-op for non-letters). */
  add(char: string): LetterBitset {
    return new LetterBitset(
      this.data + (LetterBitset.letterMasks[LetterBitset.toIndex(char)] ?? 0n),
    );
  }

  /** Return a new bitset with one fewer of `char` (no-op for non-letters). */
  sub(char: string): LetterBitset {
    return new LetterBitset(
      this.data - (LetterBitset.letterMasks[LetterBitset.toIndex(char)] ?? 0n),
    );
  }

  /**
   * If this == other + result for a single letter, return that letter; else
   * null. In other words: is `this` a transaddition of `other`?
   */
  transaddOf(other: LetterBitset): string | null {
    const diff = this.data - other.data;
    // A single extra letter shows up as exactly one letter's unit mask.
    const index = LetterBitset.letterMasks.findIndex((mask) => diff === mask);
    if (index === -1) {
      return null;
    }
    return LetterBitset.fromIndex(index);
  }

  /**
   * If this == other - result for a single letter, return that letter; else
   * null. In other words: is `this` a transdeletion of `other`?
   */
  transdeleteOf(other: LetterBitset): string | null {
    return other.transaddOf(this);
  }
}
|
|
87
|
+
|
|
88
|
+
/** Index of a wordlist keyed by letter bitset, for anagram lookups. */
export class LetterBitsets {
  private letterCounters = new Map<bigint, string[]>();
  private lengths = new Set<number>();

  /** Group every word by its bitset and record which word lengths occur. */
  constructor(wordlist: string[]) {
    for (const word of wordlist) {
      const key = LetterBitset.from(word).data;
      const bucket = this.letterCounters.get(key);
      if (bucket === undefined) {
        this.letterCounters.set(key, [word]);
      } else {
        bucket.push(word);
      }
      this.lengths.add(word.length);
    }
  }

  /** Words whose letter counts equal those of the slug (empty if none). */
  get(slug: string): string[] {
    const key = LetterBitset.from(slug).data;
    return this.letterCounters.get(key) ?? [];
  }

  /**
   * Yield every window of the slug whose letters anagram to at least one
   * word, trying each word length present in the wordlist.
   */
  *matchSubstring(slug: string): Generator<{ start: number; words: string[] }> {
    for (const length of this.lengths) {
      const lastStart = slug.length - length;
      let window = LetterBitset.from(slug.slice(0, length));
      for (let start = 0; start <= lastStart; start++) {
        const words = this.letterCounters.get(window.data);
        if (words !== undefined && words.length > 0) {
          yield { start, words };
        }
        // Slide the window: drop the leftmost letter, take the next one in.
        const outgoing = slug[start] ?? "";
        const incoming = slug[start + length] ?? "";
        window = window.sub(outgoing).add(incoming);
      }
    }
  }
}
|
|
@@ -0,0 +1,236 @@
|
|
|
1
|
+
import { Distribution } from "./distribution.js";
|
|
2
|
+
import { LogCounter } from "./logCounter.js";
|
|
3
|
+
import { LogNum } from "./logNum.js";
|
|
4
|
+
import { memoize } from "./memoize.js";
|
|
5
|
+
import { caesar, interval } from "./util.js";
|
|
6
|
+
|
|
7
|
+
export const LETTERS = "abcdefghijklmnopqrstuvwxyz";
|
|
8
|
+
export const VOWELS = "aeiou";
|
|
9
|
+
export const CONSONANTS = "bcdfghjklmnpqrstvwxyz";
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* Info about the letter distribution of a wordlist.
|
|
13
|
+
*
|
|
14
|
+
* All of these methods rely on letters being drawn iid. This is the case for
|
|
15
|
+
* letters that come from random indexing. This is *not* the case for things
|
|
16
|
+
* like random substrings.
|
|
17
|
+
*/
|
|
18
|
+
export class LetterDistribution {
|
|
19
|
+
readonly distribution: Distribution<string>;
|
|
20
|
+
private readonly lengthToProbs: Map<
|
|
21
|
+
number,
|
|
22
|
+
{
|
|
23
|
+
/** Log probability to get a word of this length. */
|
|
24
|
+
word: LogNum;
|
|
25
|
+
/** Log probability to get an anagram of a word of this length. */
|
|
26
|
+
anagram: LogNum;
|
|
27
|
+
}
|
|
28
|
+
>;
|
|
29
|
+
|
|
30
|
+
/**
 * Compute the letter frequencies of the wordlist, then, per word length, the
 * probability that iid-drawn letters spell a word or anagram to one.
 */
constructor(wordlist: string[]) {
  // Pass 1: overall letter frequencies.
  const letterCount = new Map<string, number>();
  let total = 0;

  for (const word of wordlist) {
    for (const letter of word) {
      letterCount.set(letter, (letterCount.get(letter) ?? 0) + 1);
      total++;
    }
  }

  this.distribution = new Distribution(
    new Map(
      Array.from(letterCount.entries(), ([letter, count]) => [
        letter,
        LogNum.fromFraction(count, total),
      ]),
    ),
  );

  // Pass 2: per-length word and anagram probabilities.
  this.lengthToProbs = new Map<number, { word: LogNum; anagram: LogNum }>();
  // Sorted-letter keys of the anagram classes already counted.
  const seenAnagrams = new Set<string>();

  for (const word of wordlist) {
    // Probability of drawing exactly this word, letter by letter.
    const prob = LogNum.prod(
      Array.from(word, (letter) => this.distribution.get(letter)),
    );

    let probs = this.lengthToProbs.get(word.length);
    if (probs === undefined) {
      probs = { word: LogNum.from(0), anagram: LogNum.from(0) };
      this.lengthToProbs.set(word.length, probs);
    }
    probs.word = probs.word.add(prob);

    // Words that are anagrams of each other cover the same set of
    // arrangements, so each anagram class is added only once.
    const anagramKey = Array.from(word).sort().join("");
    if (seenAnagrams.has(anagramKey)) {
      continue;
    }
    seenAnagrams.add(anagramKey);

    const counts = new Map<string, number>();
    for (const letter of word) {
      counts.set(letter, (counts.get(letter) ?? 0) + 1);
    }
    // Number of distinct arrangements of the word's letters.
    const perms = LogNum.fromFactorial(word.length).div(
      LogNum.prod(
        Array.from(counts.values(), (count) => LogNum.fromFactorial(count)),
      ),
    );
    probs.anagram = probs.anagram.add(prob.mul(perms));
  }
}
|
|
81
|
+
|
|
82
|
+
/**
 * Log probability, estimated via chi-squared, that the given letters were
 * drawn iid from this distribution.
 */
probUnordered(letters: string[]): LogNum {
  return this.distribution.probUnordered(LogCounter.from(letters));
}
|
|
90
|
+
|
|
91
|
+
/**
 * Letters that appear notably more (`high`) or less (`low`) often than this
 * distribution predicts, using the default 2-sigma cutoff.
 */
outliers(letters: string[]): {
  high: string[];
  low: string[];
} {
  const flagged = this.distribution.outliers(LogCounter.from(letters));
  return {
    high: Object.keys(flagged.high),
    low: Object.keys(flagged.low),
  };
}
|
|
100
|
+
|
|
101
|
+
/**
 * Log probability that words of the given lengths share `common` letters, in
 * the same order.
 *
 * Computed as `1 - exp(-p * combos)`, where `p` is the chance that `common`
 * independent positions agree across all words and `combos` is a product of
 * `(length - k) / (k + 1)` factors over each `k` and each length.
 */
probCommonOrdered(common: number, lengths: number[]): LogNum {
  if (common === 0) {
    return LogNum.from(1);
  }

  const factors = [];
  for (const k of interval(0, common - 1)) {
    for (const length of lengths) {
      factors.push(LogNum.fromFraction(length - k, k + 1));
    }
  }
  const combos = LogNum.prod(factors);

  // Chance that one letter is equal across every word, raised to `common`.
  const p = this.distribution.moment(lengths.length).pow(common);

  const exponent = -p.toNum() * combos.toNum();
  return LogNum.from(1).sub(LogNum.fromExp(exponent));
}
|
|
119
|
+
|
|
120
|
+
/** Log probability that k iid letters are all the same letter. */
probEqual(k: number): LogNum {
  const { distribution } = this;
  return distribution.probEqual(k);
}
|
|
124
|
+
|
|
125
|
+
/** Log probability that k iid letters agree, apart from a single exception. */
probAlmostEqual(k: number): LogNum {
  const { distribution } = this;
  return distribution.probAlmostEqual(k);
}
|
|
129
|
+
|
|
130
|
+
/** Log probability that k iid letters take exactly two different values. */
probTwoDistinct(k: number): LogNum {
  const { distribution } = this;
  return distribution.probTwoDistinct(k);
}
|
|
134
|
+
|
|
135
|
+
/**
 * Log probability that k letters are consecutive, i.e. that in some order they
 * form a run of k adjacent letters of the alphabet.
 *
 * A single letter (or none) is trivially consecutive; more than 26 letters
 * can never be.
 */
@memoize()
probConsecutive(k: number): LogNum {
  if (k <= 1) {
    return LogNum.from(1);
  } else if (k > LETTERS.length) {
    return LogNum.from(0);
  }

  const freqs = Array.from(LETTERS, (letter) =>
    this.distribution.get(letter),
  );

  // One partial per run of k adjacent letters, from the run starting at "a"
  // (index 0) through the run ending at "z".
  const partials = [];
  for (let start = 0; start + k <= freqs.length; start++) {
    partials.push(LogNum.prod(freqs.slice(start, start + k)));
  }

  // The k letters of a run may be drawn in any order.
  return LogNum.fromFactorial(k).mul(LogNum.sum(partials));
}
|
|
158
|
+
|
|
159
|
+
/**
 * Log probability that k letters have distinct values, all at least min.
 */
@memoize(2)
probDistinct(k: number, min = "a"): LogNum {
  if (k <= 0) {
    return LogNum.from(1);
  } else if (min > "z") {
    return LogNum.from(0);
  }

  // Pick the smallest letter, then require the remaining k - 1 letters to be
  // distinct and strictly larger.
  const probs = [];
  for (const [letter, freq] of this.distribution.entries()) {
    if (letter < min) {
      continue;
    }
    // Nothing comes after "z": the remainder succeeds only if no letters are
    // left to place. Handled explicitly so the result does not depend on
    // whether caesar("z", 1) wraps back to "a" (caesar is defined elsewhere).
    const rest =
      letter === "z"
        ? LogNum.from(k - 1 <= 0 ? 1 : 0)
        : this.probDistinct(k - 1, caesar(letter, 1));
    probs.push(freq.mul(rest));
  }

  // The factor k accounts for which of the k draws holds the smallest letter.
  return probs.length === 0
    ? LogNum.from(0)
    : LogNum.from(k).mul(LogNum.sum(probs));
}
|
|
182
|
+
|
|
183
|
+
/**
 * Log probability that k letters can be grouped in equal pairs.
 *
 * Any k that is not evenly divisible by 2 yields probability 0; otherwise
 * the result is `probDistinct` evaluated for k / 2 letters.
 */
probPaired(k: number): LogNum {
  const isEven = k % 2 === 0;
  // k / 2 is already an integer whenever the evenness check passes.
  return isEven ? this.probDistinct(k / 2) : LogNum.from(0);
}
|
|
190
|
+
|
|
191
|
+
/**
 * Log probability that k letters form a word.
 *
 * Looked up in the per-length table; lengths without an entry yield
 * probability 0.
 */
probWord(k: number): LogNum {
  const entry = this.lengthToProbs.get(k);
  return entry?.word ?? LogNum.from(0);
}
|
|
195
|
+
|
|
196
|
+
/**
 * Log probability that k letters form an anagram of a word.
 *
 * Looked up in the per-length table; lengths without an entry yield
 * probability 0.
 */
probAnagram(k: number): LogNum {
  const entry = this.lengthToProbs.get(k);
  return entry?.anagram ?? LogNum.from(0);
}
|
|
200
|
+
|
|
201
|
+
/**
 * Log probability that k letters are all vowels.
 *
 * The single-letter case sums the frequencies of every entry of VOWELS;
 * longer cases raise that (memoized) single-letter value to the k-th power.
 */
@memoize()
probVowels(k: number): LogNum {
  switch (k) {
    case 0:
      return LogNum.from(1);
    case 1: {
      const vowelFreqs = Array.from(VOWELS, (v) => this.distribution.get(v));
      return LogNum.sum(vowelFreqs);
    }
    default:
      return this.probVowels(1).pow(k);
  }
}
|
|
214
|
+
|
|
215
|
+
/**
 * Log probability that k letters are all consonants.
 *
 * The single-letter case sums the frequencies of every entry of CONSONANTS;
 * longer cases raise that (memoized) single-letter value to the k-th power.
 */
@memoize()
probConsonants(k: number): LogNum {
  switch (k) {
    case 0:
      return LogNum.from(1);
    case 1: {
      const consonantFreqs = Array.from(CONSONANTS, (c) =>
        this.distribution.get(c),
      );
      return LogNum.sum(consonantFreqs);
    }
    default:
      return this.probConsonants(1).pow(k);
  }
}
|
|
228
|
+
|
|
229
|
+
/**
 * Log probability that n words, each of length k, have the same pattern of
 * vowels and consonants.
 *
 * The words share a pattern when, at every one of the k positions, the n
 * letters in that position are either all vowels or all consonants. For one
 * position that is probVowels(n) + probConsonants(n); the k positions are
 * treated as independent, so the result is that sum raised to the k-th
 * power.
 *
 * @param n - Number of words being compared.
 * @param k - Length of each word.
 */
probEqualVowelPattern(n: number, k: number): LogNum {
  if (n <= 0) {
    // No words at all trivially share a pattern.
    return LogNum.from(1);
  }

  // Previously the roles of n and k were swapped, which instead gave the
  // probability that each word is entirely vowels or entirely consonants.
  const perPosition = this.probVowels(n).add(this.probConsonants(n));
  return perPosition.pow(k);
}
|
|
236
|
+
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import { enumerate } from "./util.js";
|
|
2
|
+
|
|
3
|
+
/**
 * A map from letters to their indices in a given slug.
 *
 * Each key is a letter and each value lists the positions at which that
 * letter occurs, in the order they were encountered.
 */
export class LetterIndices {
  constructor(private readonly indices: ReadonlyMap<string, number[]>) {}

  /** Builds the letter-to-positions map by enumerating the slug. */
  static from(slug: string): LetterIndices {
    const positionsByLetter = new Map<string, number[]>();

    for (const [position, letter] of enumerate(slug)) {
      const bucket = positionsByLetter.get(letter);
      if (bucket === undefined) {
        positionsByLetter.set(letter, [position]);
      } else {
        bucket.push(position);
      }
    }

    return new LetterIndices(positionsByLetter);
  }

  /** Lazily yields `[letter, number of occurrences]` pairs. */
  *counts(): IterableIterator<[string, number]> {
    for (const [letter, positions] of this.indices) {
      yield [letter, positions.length];
    }
  }

  /** The set of distinct occurrence counts across all letters. */
  countSet(): Set<number> {
    const distinctCounts = new Set<number>();
    for (const [, count] of this.counts()) {
      distinctCounts.add(count);
    }
    return distinctCounts;
  }

  /** Iterates `[letter, positions]` pairs of the underlying map. */
  entries(): IterableIterator<[string, number[]]> {
    return this.indices.entries();
  }

  /** Letters whose `(letter, positions)` pair satisfies the predicate. */
  filterKeys(fn: (letter: string, indices: number[]) => boolean): string[] {
    const kept: string[] = [];
    for (const [letter, positions] of this.indices.entries()) {
      if (fn(letter, positions)) {
        kept.push(letter);
      }
    }
    return kept;
  }

  /** Positions of the given letter, or an empty array if it is absent. */
  get(letter: string): number[] {
    const positions = this.indices.get(letter);
    return positions ?? [];
  }

  /** Iterates the letters present in the map. */
  keys(): IterableIterator<string> {
    return this.indices.keys();
  }
}
|