sparse-encode 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +102 -0
- package/dist/__tests__/bm25.test.d.ts +2 -0
- package/dist/__tests__/bm25.test.d.ts.map +1 -0
- package/dist/__tests__/bm25.test.js +89 -0
- package/dist/__tests__/bm25.test.js.map +1 -0
- package/dist/__tests__/tfidf.test.d.ts +2 -0
- package/dist/__tests__/tfidf.test.d.ts.map +1 -0
- package/dist/__tests__/tfidf.test.js +110 -0
- package/dist/__tests__/tfidf.test.js.map +1 -0
- package/dist/__tests__/tokenizer.test.d.ts +2 -0
- package/dist/__tests__/tokenizer.test.d.ts.map +1 -0
- package/dist/__tests__/tokenizer.test.js +51 -0
- package/dist/__tests__/tokenizer.test.js.map +1 -0
- package/dist/bm25.d.ts +3 -0
- package/dist/bm25.d.ts.map +1 -0
- package/dist/bm25.js +99 -0
- package/dist/bm25.js.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +9 -0
- package/dist/index.js.map +1 -0
- package/dist/porter-stemmer.d.ts +2 -0
- package/dist/porter-stemmer.d.ts.map +1 -0
- package/dist/porter-stemmer.js +195 -0
- package/dist/porter-stemmer.js.map +1 -0
- package/dist/tfidf.d.ts +3 -0
- package/dist/tfidf.d.ts.map +1 -0
- package/dist/tfidf.js +98 -0
- package/dist/tfidf.js.map +1 -0
- package/dist/tokenizer.d.ts +8 -0
- package/dist/tokenizer.d.ts.map +1 -0
- package/dist/tokenizer.js +43 -0
- package/dist/tokenizer.js.map +1 -0
- package/dist/types.d.ts +41 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/vocab.d.ts +11 -0
- package/dist/vocab.d.ts.map +1 -0
- package/dist/vocab.js +38 -0
- package/dist/vocab.js.map +1 -0
- package/package.json +33 -0
package/README.md
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# sparse-encode
|
|
2
|
+
|
|
3
|
+
Generate BM25 and TF-IDF sparse vectors in JavaScript/TypeScript. Designed for use with sparse vector search engines (e.g., Pinecone sparse indexes, Qdrant sparse vectors).
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install sparse-encode
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
```typescript
|
|
14
|
+
import { createBM25, createTFIDF } from 'sparse-encode'
|
|
15
|
+
|
|
16
|
+
const docs = [
|
|
17
|
+
'the quick brown fox jumps over the lazy dog',
|
|
18
|
+
'the dog barked loudly at the fox',
|
|
19
|
+
'cats and dogs are common pets',
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
// BM25
|
|
23
|
+
const bm25 = createBM25({ k1: 1.5, b: 0.75 })
|
|
24
|
+
bm25.fit(docs)
|
|
25
|
+
const vec = bm25.encode('quick fox')
|
|
26
|
+
// { indices: [2, 5, ...], values: [0.82, 1.23, ...] }
|
|
27
|
+
|
|
28
|
+
const queryVec = bm25.encodeQuery('fox')
|
|
29
|
+
|
|
30
|
+
// TF-IDF
|
|
31
|
+
const tfidf = createTFIDF({ sublinearTf: false })
|
|
32
|
+
tfidf.fit(docs)
|
|
33
|
+
const tVec = tfidf.encode('quick fox')
|
|
34
|
+
// L2-normalized: sum of squares ≈ 1.0
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## BM25 Encoder
|
|
38
|
+
|
|
39
|
+
```typescript
|
|
40
|
+
const enc = createBM25(options?)
|
|
41
|
+
enc.fit(documents: string[]) // build vocabulary + IDF statistics
|
|
42
|
+
enc.encode(text: string): SparseVector // encode a document
|
|
43
|
+
enc.encodeQuery(text: string): SparseVector // encode a query (no length norm)
|
|
44
|
+
enc.encodeBatch(texts: string[]): SparseVector[]
|
|
45
|
+
enc.getStats(): FitStats // { N, avgdl, vocabSize, totalTokens }
|
|
46
|
+
enc.serialize(): string // JSON snapshot (vocab + df + options)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Options
|
|
50
|
+
|
|
51
|
+
| Option | Default | Description |
|
|
52
|
+
|--------|---------|-------------|
|
|
53
|
+
| `k1` | `1.5` | Term frequency saturation parameter |
|
|
54
|
+
| `b` | `0.75` | Length normalization parameter |
|
|
55
|
+
| `stem` | `true` | Apply Porter stemmer |
|
|
56
|
+
| `stopwords` | `[]` | Additional stopwords to remove |
|
|
57
|
+
| `tokenizer` | built-in | Custom tokenizer function |
|
|
58
|
+
|
|
59
|
+
## TF-IDF Encoder
|
|
60
|
+
|
|
61
|
+
```typescript
|
|
62
|
+
const enc = createTFIDF(options?)
|
|
63
|
+
enc.fit(documents: string[])
|
|
64
|
+
enc.encode(text: string): SparseVector // L2-normalized TF-IDF vector
|
|
65
|
+
enc.encodeQuery(text: string): SparseVector
|
|
66
|
+
enc.encodeBatch(texts: string[]): SparseVector[]
|
|
67
|
+
enc.getStats(): FitStats
|
|
68
|
+
enc.serialize(): string
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Options
|
|
72
|
+
|
|
73
|
+
| Option | Default | Description |
|
|
74
|
+
|--------|---------|-------------|
|
|
75
|
+
| `stem` | `true` | Apply Porter stemmer |
|
|
76
|
+
| `stopwords` | `[]` | Additional stopwords to remove |
|
|
77
|
+
| `sublinearTf` | `false` | Use `1 + log(tf)` instead of `tf / doc_length` |
|
|
78
|
+
| `tokenizer` | built-in | Custom tokenizer function |
|
|
79
|
+
|
|
80
|
+
## SparseVector Format
|
|
81
|
+
|
|
82
|
+
```typescript
|
|
83
|
+
interface SparseVector {
|
|
84
|
+
indices: number[] // term IDs (sorted ascending)
|
|
85
|
+
values: number[] // corresponding scores
|
|
86
|
+
}
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
For each position `i`, `values[i]` is the score of the term whose ID is `indices[i]`. Terms with a zero score are omitted.
|
|
90
|
+
|
|
91
|
+
## Tokenizer Pipeline
|
|
92
|
+
|
|
93
|
+
The built-in tokenizer:
|
|
94
|
+
1. Lowercases the input
|
|
95
|
+
2. Splits on non-word characters
|
|
96
|
+
3. Removes pure numbers
|
|
97
|
+
4. Removes common English stopwords
|
|
98
|
+
5. Applies the Porter stemmer (can be disabled with `stem: false`)
|
|
99
|
+
|
|
100
|
+
## License
|
|
101
|
+
|
|
102
|
+
MIT
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"bm25.test.d.ts","sourceRoot":"","sources":["../../src/__tests__/bm25.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
const vitest_1 = require("vitest");
|
|
4
|
+
const bm25_1 = require("../bm25");
|
|
5
|
+
(0, vitest_1.describe)('createBM25', () => {
    // Three-document corpus shared by every test in this suite.
    const docs = [
        'the quick brown fox jumps over the lazy dog',
        'the dog barked loudly at the fox',
        'cats and dogs are common pets',
    ];
    // Create an encoder with the given options and fit it on the shared corpus.
    const fitEncoder = (options) => {
        const enc = (0, bm25_1.createBM25)(options);
        enc.fit(docs);
        return enc;
    };
    // Sum of all scores in a sparse vector (0 for an empty vector).
    const totalScore = (vec) => vec.values.reduce((a, b) => a + b, 0);
    (0, vitest_1.it)('throws if encode called before fit', () => {
        const enc = (0, bm25_1.createBM25)();
        (0, vitest_1.expect)(() => enc.encode('test')).toThrow();
    });
    (0, vitest_1.it)('returns a SparseVector with indices and values', () => {
        const vec = fitEncoder().encode('fox');
        (0, vitest_1.expect)(Array.isArray(vec.indices)).toBe(true);
        (0, vitest_1.expect)(Array.isArray(vec.values)).toBe(true);
        (0, vitest_1.expect)(vec.indices.length).toBe(vec.values.length);
    });
    (0, vitest_1.it)('encodes to non-empty vector for known term', () => {
        const vec = fitEncoder().encode('fox');
        (0, vitest_1.expect)(vec.indices.length).toBeGreaterThan(0);
    });
    (0, vitest_1.it)('indices are sorted ascending', () => {
        const vec = fitEncoder().encode('quick brown fox');
        for (let i = 1; i < vec.indices.length; i++) {
            (0, vitest_1.expect)(vec.indices[i]).toBeGreaterThan(vec.indices[i - 1]);
        }
    });
    (0, vitest_1.it)('term in doc scores higher than term not in corpus', () => {
        const enc = fitEncoder();
        const known = enc.encode('fox');
        // 'zebra' never occurs in the corpus, so it has no vocabulary entry.
        const unknown = enc.encode('zebra');
        // The known term must actually produce positive scores...
        (0, vitest_1.expect)(known.values.length).toBeGreaterThan(0);
        for (const value of known.values) {
            (0, vitest_1.expect)(value).toBeGreaterThan(0);
        }
        // ...and outscore the out-of-corpus term, which is what the test name promises.
        (0, vitest_1.expect)(totalScore(known)).toBeGreaterThan(totalScore(unknown));
    });
    (0, vitest_1.it)('getStats returns correct N and vocabSize', () => {
        const stats = fitEncoder().getStats();
        (0, vitest_1.expect)(stats.N).toBe(3);
        (0, vitest_1.expect)(stats.vocabSize).toBeGreaterThan(0);
        (0, vitest_1.expect)(stats.avgdl).toBeGreaterThan(0);
        (0, vitest_1.expect)(stats.totalTokens).toBeGreaterThan(0);
    });
    (0, vitest_1.it)('encodeBatch returns one vector per text', () => {
        const vecs = fitEncoder().encodeBatch(['fox', 'dog', 'cat']);
        (0, vitest_1.expect)(vecs).toHaveLength(3);
    });
    (0, vitest_1.it)('encodeQuery returns a SparseVector', () => {
        const vec = fitEncoder().encodeQuery('brown fox');
        (0, vitest_1.expect)(Array.isArray(vec.indices)).toBe(true);
        (0, vitest_1.expect)(Array.isArray(vec.values)).toBe(true);
    });
    (0, vitest_1.it)('serialize returns valid JSON', () => {
        const json = fitEncoder().serialize();
        (0, vitest_1.expect)(() => JSON.parse(json)).not.toThrow();
        const parsed = JSON.parse(json);
        (0, vitest_1.expect)(parsed.N).toBe(3);
        (0, vitest_1.expect)(parsed.vocab).toBeDefined();
    });
    (0, vitest_1.it)('custom k1/b options affect scores', () => {
        const v1 = fitEncoder({ k1: 1.0, b: 0.5 }).encode('fox');
        const v2 = fitEncoder({ k1: 2.0, b: 0.9 }).encode('fox');
        // Scores differ with different parameters
        (0, vitest_1.expect)(totalScore(v1)).not.toBeCloseTo(totalScore(v2), 5);
    });
});
|
|
89
|
+
//# sourceMappingURL=bm25.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"bm25.test.js","sourceRoot":"","sources":["../../src/__tests__/bm25.test.ts"],"names":[],"mappings":";;AAAA,mCAA6C;AAC7C,kCAAoC;AAEpC,IAAA,iBAAQ,EAAC,YAAY,EAAE,GAAG,EAAE;IAC1B,MAAM,IAAI,GAAG;QACX,6CAA6C;QAC7C,kCAAkC;QAClC,+BAA+B;KAChC,CAAA;IAED,IAAA,WAAE,EAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,GAAG,GAAG,IAAA,iBAAU,GAAE,CAAA;QACxB,IAAA,eAAM,EAAC,GAAG,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,EAAE,CAAA;IAC5C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,gDAAgD,EAAE,GAAG,EAAE;QACxD,MAAM,GAAG,GAAG,IAAA,iBAAU,GAAE,CAAA;QACxB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;QAC7B,IAAA,eAAM,EAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;QAC7C,IAAA,eAAM,EAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;QAC5C,IAAA,eAAM,EAAC,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,CAAA;IACpD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,4CAA4C,EAAE,GAAG,EAAE;QACpD,MAAM,GAAG,GAAG,IAAA,iBAAU,GAAE,CAAA;QACxB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;QAC7B,IAAA,eAAM,EAAC,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC/C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,8BAA8B,EAAE,GAAG,EAAE;QACtC,MAAM,GAAG,GAAG,IAAA,iBAAU,GAAE,CAAA;QACxB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAA;QACzC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5C,IAAA,eAAM,EAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;QAC5D,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,mDAAmD,EAAE,GAAG,EAAE;QAC3D,MAAM,GAAG,GAAG,IAAA,iBAAU,GAAE,CAAA;QACxB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;QAC7B,MAAM,MAAM,GAAG,GAAG,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAA;QACjD,iDAAiD;QACjD,IAAA,eAAM,EAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAA
C,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC/C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,0CAA0C,EAAE,GAAG,EAAE;QAClD,MAAM,GAAG,GAAG,IAAA,iBAAU,GAAE,CAAA;QACxB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,KAAK,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAA;QAC5B,IAAA,eAAM,EAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACvB,IAAA,eAAM,EAAC,KAAK,CAAC,SAAS,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;QAC1C,IAAA,eAAM,EAAC,KAAK,CAAC,KAAK,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;QACtC,IAAA,eAAM,EAAC,KAAK,CAAC,WAAW,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC9C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,yCAAyC,EAAE,GAAG,EAAE;QACjD,MAAM,GAAG,GAAG,IAAA,iBAAU,GAAE,CAAA;QACxB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,IAAI,GAAG,GAAG,CAAC,WAAW,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC,CAAA;QACnD,IAAA,eAAM,EAAC,IAAI,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAA;IAC9B,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,GAAG,GAAG,IAAA,iBAAU,GAAE,CAAA;QACxB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,GAAG,GAAG,GAAG,CAAC,WAAW,CAAC,WAAW,CAAC,CAAA;QACxC,IAAA,eAAM,EAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;QAC7C,IAAA,eAAM,EAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IAC9C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,8BAA8B,EAAE,GAAG,EAAE;QACtC,MAAM,GAAG,GAAG,IAAA,iBAAU,GAAE,CAAA;QACxB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,IAAI,GAAG,GAAG,CAAC,SAAS,EAAE,CAAA;QAC5B,IAAA,eAAM,EAAC,GAAG,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,EAAE,CAAA;QAC5C,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAA;QAC/B,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACxB,IAAA,eAAM,EAAC,MAAM,CAAC,KAAK,CAAC,CAAC,WAAW,EAAE,CAAA;IACpC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,MAAM,IAAI,GAAG,IAAA,iBAAU,EAAC,EAAE,EAAE,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,CAAA;QAC5C,MAAM,IAAI,GAAG,IAAA,iBAAU,EAAC,EAAE,EAAE,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,CAAA;QAC5C,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACd,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,
CAAA;QACd,MAAM,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;QAC7B,MAAM,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;QAC7B,0CAA0C;QAC1C,MAAM,IAAI,GAAG,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAA;QACjD,MAAM,IAAI,GAAG,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAA;QACjD,IAAA,eAAM,EAAC,IAAI,CAAC,CAAC,GAAG,CAAC,WAAW,CAAC,IAAI,EAAE,CAAC,CAAC,CAAA;IACvC,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tfidf.test.d.ts","sourceRoot":"","sources":["../../src/__tests__/tfidf.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
const vitest_1 = require("vitest");
|
|
4
|
+
const tfidf_1 = require("../tfidf");
|
|
5
|
+
(0, vitest_1.describe)('createTFIDF', () => {
    // Three-document corpus shared by every test in this suite.
    const docs = [
        'the quick brown fox jumps over the lazy dog',
        'the dog barked loudly at the fox',
        'cats and dogs are common pets',
    ];
    // Create an encoder with the given options and fit it on the shared corpus.
    const fitEncoder = (options) => {
        const enc = (0, tfidf_1.createTFIDF)(options);
        enc.fit(docs);
        return enc;
    };
    // Sum of squared values; 1.0 for an L2-normalized vector.
    const squaredNorm = (vec) => vec.values.reduce((acc, v) => acc + v * v, 0);
    // Assert the vector is non-empty and unit-length. Checking non-emptiness
    // first keeps the norm assertion from being skipped on an empty vector.
    const expectUnitLength = (vec) => {
        (0, vitest_1.expect)(vec.values.length).toBeGreaterThan(0);
        (0, vitest_1.expect)(squaredNorm(vec)).toBeCloseTo(1.0, 5);
    };
    (0, vitest_1.it)('throws if encode called before fit', () => {
        const enc = (0, tfidf_1.createTFIDF)();
        (0, vitest_1.expect)(() => enc.encode('test')).toThrow();
    });
    (0, vitest_1.it)('returns a SparseVector with indices and values', () => {
        const vec = fitEncoder().encode('fox');
        (0, vitest_1.expect)(Array.isArray(vec.indices)).toBe(true);
        (0, vitest_1.expect)(Array.isArray(vec.values)).toBe(true);
        (0, vitest_1.expect)(vec.indices.length).toBe(vec.values.length);
    });
    (0, vitest_1.it)('encodes to non-empty vector for known term', () => {
        const vec = fitEncoder().encode('fox');
        (0, vitest_1.expect)(vec.indices.length).toBeGreaterThan(0);
    });
    (0, vitest_1.it)('indices are sorted ascending', () => {
        const vec = fitEncoder().encode('quick brown fox');
        for (let i = 1; i < vec.indices.length; i++) {
            (0, vitest_1.expect)(vec.indices[i]).toBeGreaterThan(vec.indices[i - 1]);
        }
    });
    (0, vitest_1.it)('output vector is unit-length (L2 norm ≈ 1)', () => {
        const vec = fitEncoder().encode('quick brown fox');
        (0, vitest_1.expect)(squaredNorm(vec)).toBeCloseTo(1.0, 5);
    });
    (0, vitest_1.it)('unit-length holds for single-term query', () => {
        expectUnitLength(fitEncoder().encode('fox'));
    });
    (0, vitest_1.it)('getStats returns correct N and vocabSize', () => {
        const stats = fitEncoder().getStats();
        (0, vitest_1.expect)(stats.N).toBe(3);
        (0, vitest_1.expect)(stats.vocabSize).toBeGreaterThan(0);
        (0, vitest_1.expect)(stats.totalTokens).toBeGreaterThan(0);
    });
    (0, vitest_1.it)('encodeBatch returns one vector per text', () => {
        const vecs = fitEncoder().encodeBatch(['fox', 'dog', 'cat']);
        (0, vitest_1.expect)(vecs).toHaveLength(3);
    });
    (0, vitest_1.it)('encodeQuery returns unit-length SparseVector', () => {
        expectUnitLength(fitEncoder().encodeQuery('brown fox'));
    });
    (0, vitest_1.it)('serialize returns valid JSON', () => {
        const json = fitEncoder().serialize();
        (0, vitest_1.expect)(() => JSON.parse(json)).not.toThrow();
        const parsed = JSON.parse(json);
        (0, vitest_1.expect)(parsed.N).toBe(3);
        (0, vitest_1.expect)(parsed.vocab).toBeDefined();
    });
    (0, vitest_1.it)('sublinearTf option changes scores', () => {
        // Use a text with repeated terms to make sublinear difference visible
        const text = 'fox fox fox quick';
        const v1 = fitEncoder({ sublinearTf: false }).encode(text);
        const v2 = fitEncoder({ sublinearTf: true }).encode(text);
        // Both should be unit vectors...
        expectUnitLength(v1);
        expectUnitLength(v2);
        // ...but with different distributions: damping the repeated term
        // shifts weight between the two terms.
        (0, vitest_1.expect)(v1.values).not.toEqual(v2.values);
    });
    (0, vitest_1.it)('empty text returns empty vector', () => {
        const vec = fitEncoder().encode('');
        (0, vitest_1.expect)(vec.indices).toHaveLength(0);
        (0, vitest_1.expect)(vec.values).toHaveLength(0);
    });
});
|
|
110
|
+
//# sourceMappingURL=tfidf.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tfidf.test.js","sourceRoot":"","sources":["../../src/__tests__/tfidf.test.ts"],"names":[],"mappings":";;AAAA,mCAA6C;AAC7C,oCAAsC;AAEtC,IAAA,iBAAQ,EAAC,aAAa,EAAE,GAAG,EAAE;IAC3B,MAAM,IAAI,GAAG;QACX,6CAA6C;QAC7C,kCAAkC;QAClC,+BAA+B;KAChC,CAAA;IAED,IAAA,WAAE,EAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,GAAG,GAAG,IAAA,mBAAW,GAAE,CAAA;QACzB,IAAA,eAAM,EAAC,GAAG,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,EAAE,CAAA;IAC5C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,gDAAgD,EAAE,GAAG,EAAE;QACxD,MAAM,GAAG,GAAG,IAAA,mBAAW,GAAE,CAAA;QACzB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;QAC7B,IAAA,eAAM,EAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;QAC7C,IAAA,eAAM,EAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;QAC5C,IAAA,eAAM,EAAC,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,CAAA;IACpD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,4CAA4C,EAAE,GAAG,EAAE;QACpD,MAAM,GAAG,GAAG,IAAA,mBAAW,GAAE,CAAA;QACzB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;QAC7B,IAAA,eAAM,EAAC,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC/C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,8BAA8B,EAAE,GAAG,EAAE;QACtC,MAAM,GAAG,GAAG,IAAA,mBAAW,GAAE,CAAA;QACzB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAA;QACzC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5C,IAAA,eAAM,EAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;QAC5D,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,4CAA4C,EAAE,GAAG,EAAE;QACpD,MAAM,GAAG,GAAG,IAAA,mBAAW,GAAE,CAAA;QACzB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,iBAAiB,CAAC,CAAA;QACzC,MAAM,KAAK,GAAG,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAA;QAC3D,IAAA,eAAM,EAAC,KA
AK,CAAC,CAAC,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC,CAAA;IACnC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,yCAAyC,EAAE,GAAG,EAAE;QACjD,MAAM,GAAG,GAAG,IAAA,mBAAW,GAAE,CAAA;QACzB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;QAC7B,IAAI,GAAG,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC1B,MAAM,KAAK,GAAG,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAA;YAC3D,IAAA,eAAM,EAAC,KAAK,CAAC,CAAC,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC,CAAA;QACnC,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,0CAA0C,EAAE,GAAG,EAAE;QAClD,MAAM,GAAG,GAAG,IAAA,mBAAW,GAAE,CAAA;QACzB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,KAAK,GAAG,GAAG,CAAC,QAAQ,EAAE,CAAA;QAC5B,IAAA,eAAM,EAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACvB,IAAA,eAAM,EAAC,KAAK,CAAC,SAAS,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;QAC1C,IAAA,eAAM,EAAC,KAAK,CAAC,WAAW,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC9C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,yCAAyC,EAAE,GAAG,EAAE;QACjD,MAAM,GAAG,GAAG,IAAA,mBAAW,GAAE,CAAA;QACzB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,IAAI,GAAG,GAAG,CAAC,WAAW,CAAC,CAAC,KAAK,EAAE,KAAK,EAAE,KAAK,CAAC,CAAC,CAAA;QACnD,IAAA,eAAM,EAAC,IAAI,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAA;IAC9B,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,8CAA8C,EAAE,GAAG,EAAE;QACtD,MAAM,GAAG,GAAG,IAAA,mBAAW,GAAE,CAAA;QACzB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,GAAG,GAAG,GAAG,CAAC,WAAW,CAAC,WAAW,CAAC,CAAA;QACxC,IAAI,GAAG,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC1B,MAAM,KAAK,GAAG,GAAG,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAA;YAC3D,IAAA,eAAM,EAAC,KAAK,CAAC,CAAC,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC,CAAA;QACnC,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,8BAA8B,EAAE,GAAG,EAAE;QACtC,MAAM,GAAG,GAAG,IAAA,mBAAW,GAAE,CAAA;QACzB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,IAAI,GAAG,GAAG,CAAC,SAAS,EAAE,CAAA;QAC5B,IAAA,eAAM,EAAC,GAAG,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,EAAE,CAAA;QAC5C,MAAM,MAAM,GAAG,IAAI,
CAAC,KAAK,CAAC,IAAI,CAAC,CAAA;QAC/B,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QACxB,IAAA,eAAM,EAAC,MAAM,CAAC,KAAK,CAAC,CAAC,WAAW,EAAE,CAAA;IACpC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,MAAM,IAAI,GAAG,IAAA,mBAAW,EAAC,EAAE,WAAW,EAAE,KAAK,EAAE,CAAC,CAAA;QAChD,MAAM,IAAI,GAAG,IAAA,mBAAW,EAAC,EAAE,WAAW,EAAE,IAAI,EAAE,CAAC,CAAA;QAC/C,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACd,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACd,sEAAsE;QACtE,MAAM,IAAI,GAAG,mBAAmB,CAAA;QAChC,MAAM,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;QAC5B,MAAM,EAAE,GAAG,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;QAC5B,+DAA+D;QAC/D,IAAI,EAAE,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACjD,MAAM,EAAE,GAAG,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAA;YACnD,MAAM,EAAE,GAAG,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAA;YACnD,IAAA,eAAM,EAAC,EAAE,CAAC,CAAC,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC,CAAA;YAC9B,IAAA,eAAM,EAAC,EAAE,CAAC,CAAC,WAAW,CAAC,GAAG,EAAE,CAAC,CAAC,CAAA;QAChC,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,iCAAiC,EAAE,GAAG,EAAE;QACzC,MAAM,GAAG,GAAG,IAAA,mBAAW,GAAE,CAAA;QACzB,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QACb,MAAM,GAAG,GAAG,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAA;QAC1B,IAAA,eAAM,EAAC,GAAG,CAAC,OAAO,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAA;QACnC,IAAA,eAAM,EAAC,GAAG,CAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAA;IACpC,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tokenizer.test.d.ts","sourceRoot":"","sources":["../../src/__tests__/tokenizer.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
const vitest_1 = require("vitest");
|
|
4
|
+
const tokenizer_1 = require("../tokenizer");
|
|
5
|
+
(0, vitest_1.describe)('defaultTokenizer', () => {
    // Each case feeds one short string through the built-in tokenizer and
    // inspects the resulting token list.
    (0, vitest_1.it)('lowercases input', () => {
        const result = (0, tokenizer_1.defaultTokenizer)('Hello World');
        const allLowercase = result.every(tok => tok.toLowerCase() === tok);
        (0, vitest_1.expect)(allLowercase).toBe(true);
    });
    (0, vitest_1.it)('removes stopwords', () => {
        const result = (0, tokenizer_1.defaultTokenizer)('the cat sat on the mat');
        for (const stopword of ['the', 'on']) {
            (0, vitest_1.expect)(result).not.toContain(stopword);
        }
    });
    (0, vitest_1.it)('removes pure numbers', () => {
        const result = (0, tokenizer_1.defaultTokenizer)('I have 42 cats');
        (0, vitest_1.expect)(result).not.toContain('42');
    });
    (0, vitest_1.it)('splits on non-word characters', () => {
        const tokenCount = (0, tokenizer_1.defaultTokenizer)('hello-world foo.bar').length;
        (0, vitest_1.expect)(tokenCount).toBeGreaterThanOrEqual(2);
    });
    (0, vitest_1.it)('applies Porter stemmer (running → run)', () => {
        // The stemmed form 'run' is expected in the output for 'running'.
        const result = (0, tokenizer_1.defaultTokenizer)('running');
        (0, vitest_1.expect)(result).toContain('run');
    });
    (0, vitest_1.it)('applies Porter stemmer (cats → cat)', () => {
        const result = (0, tokenizer_1.defaultTokenizer)('cats');
        (0, vitest_1.expect)(result).toContain('cat');
    });
});
|
|
33
|
+
(0, vitest_1.describe)('tokenize', () => {
    // Covers the option-driven entry point: custom tokenizer, extra
    // stopwords, and the stem toggle.
    (0, vitest_1.it)('uses custom tokenizer when provided', () => {
        const splitOnCommas = (input) => input.split(',');
        const result = (0, tokenizer_1.tokenize)('a,b,c', { tokenizer: splitOnCommas });
        (0, vitest_1.expect)(result).toEqual(['a', 'b', 'c']);
    });
    (0, vitest_1.it)('applies extra stopwords', () => {
        const opts = { stem: false, stopwords: ['dog'] };
        const result = (0, tokenizer_1.tokenize)('cat dog bird', opts);
        (0, vitest_1.expect)(result).not.toContain('dog');
        for (const kept of ['cat', 'bird']) {
            (0, vitest_1.expect)(result).toContain(kept);
        }
    });
    (0, vitest_1.it)('skips stemming when stem=false', () => {
        const result = (0, tokenizer_1.tokenize)('running cats', { stem: false });
        // With stemming off the surface forms must appear unchanged.
        for (const word of ['running', 'cats']) {
            (0, vitest_1.expect)(result).toContain(word);
        }
    });
});
|
|
51
|
+
//# sourceMappingURL=tokenizer.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tokenizer.test.js","sourceRoot":"","sources":["../../src/__tests__/tokenizer.test.ts"],"names":[],"mappings":";;AAAA,mCAA6C;AAC7C,4CAAyD;AAEzD,IAAA,iBAAQ,EAAC,kBAAkB,EAAE,GAAG,EAAE;IAChC,IAAA,WAAE,EAAC,kBAAkB,EAAE,GAAG,EAAE;QAC1B,MAAM,MAAM,GAAG,IAAA,4BAAgB,EAAC,aAAa,CAAC,CAAA;QAC9C,IAAA,eAAM,EAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IAC7D,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,mBAAmB,EAAE,GAAG,EAAE;QAC3B,MAAM,MAAM,GAAG,IAAA,4BAAgB,EAAC,wBAAwB,CAAC,CAAA;QACzD,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAA;QACnC,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,CAAC,CAAA;IACpC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,sBAAsB,EAAE,GAAG,EAAE;QAC9B,MAAM,MAAM,GAAG,IAAA,4BAAgB,EAAC,gBAAgB,CAAC,CAAA;QACjD,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,IAAI,CAAC,CAAA;IACpC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,+BAA+B,EAAE,GAAG,EAAE;QACvC,MAAM,MAAM,GAAG,IAAA,4BAAgB,EAAC,qBAAqB,CAAC,CAAA;QACtD,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAA;IACjD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,wCAAwC,EAAE,GAAG,EAAE;QAChD,MAAM,MAAM,GAAG,IAAA,4BAAgB,EAAC,SAAS,CAAC,CAAA;QAC1C,oCAAoC;QACpC,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,SAAS,CAAC,KAAK,CAAC,CAAA;IACjC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,qCAAqC,EAAE,GAAG,EAAE;QAC7C,MAAM,MAAM,GAAG,IAAA,4BAAgB,EAAC,MAAM,CAAC,CAAA;QACvC,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,SAAS,CAAC,KAAK,CAAC,CAAA;IACjC,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,IAAA,iBAAQ,EAAC,UAAU,EAAE,GAAG,EAAE;IACxB,IAAA,WAAE,EAAC,qCAAqC,EAAE,GAAG,EAAE;QAC7C,MAAM,MAAM,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA;QAC1C,MAAM,MAAM,GAAG,IAAA,oBAAQ,EAAC,OAAO,EAAE,EAAE,SAAS,EAAE,MAAM,EAAE,CAAC,CAAA;QACvD,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAA;IACzC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,yBAAyB,EAAE,GAAG,EAAE;QACjC,MAAM,MAAM,GAAG,IAAA,oBAAQ,EAAC,cAAc,EAAE,EAAE,IAAI,EAAE,KAAK,EAAE,SAAS,EAAE,CAAC,KAAK,CAAC,EAAE,CAAC,CAAA;QAC5E,IAAA
,eAAM,EAAC,MAAM,CAAC,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAA;QACnC,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,SAAS,CAAC,KAAK,CAAC,CAAA;QAC/B,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,SAAS,CAAC,MAAM,CAAC,CAAA;IAClC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,gCAAgC,EAAE,GAAG,EAAE;QACxC,MAAM,MAAM,GAAG,IAAA,oBAAQ,EAAC,cAAc,EAAE,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,CAAA;QACxD,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,SAAS,CAAC,SAAS,CAAC,CAAA;QACnC,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,SAAS,CAAC,MAAM,CAAC,CAAA;IAClC,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA"}
|
package/dist/bm25.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"bm25.d.ts","sourceRoot":"","sources":["../src/bm25.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,WAAW,EAAE,WAAW,EAA0B,MAAM,SAAS,CAAA;AAE/E,wBAAgB,UAAU,CAAC,OAAO,CAAC,EAAE,WAAW,GAAG,WAAW,CA0G7D"}
|
package/dist/bm25.js
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.createBM25 = createBM25;
|
|
4
|
+
const tokenizer_1 = require("./tokenizer");
|
|
5
|
+
const vocab_1 = require("./vocab");
|
|
6
|
+
/**
 * Creates a BM25 sparse-vector encoder.
 *
 * Recognised options (all optional):
 *   - k1        term-frequency saturation parameter (default 1.5)
 *   - b         document-length normalisation strength (default 0.75)
 *   - stem      forwarded to tokenize() (default true)
 *   - tokenizer custom tokenizer function forwarded to tokenize()
 *   - stopwords extra stopwords forwarded to tokenize() (default [])
 *
 * @param options encoder options, may be omitted
 * @returns an object exposing fit, encode, encodeBatch, encodeQuery,
 *          serialize and getStats
 */
function createBM25(options) {
    const k1 = options?.k1 ?? 1.5;
    const b = options?.b ?? 0.75;
    const stemOpt = options?.stem ?? true;
    const tokenizerFn = options?.tokenizer;
    const extraStopwords = options?.stopwords ?? [];
    // Reassigned by fit() so that every fit starts from an empty vocabulary.
    let vocab = new vocab_1.Vocabulary();
    // df[termId] = number of documents containing that term
    const df = new Map();
    let N = 0;
    let avgdl = 0;
    let totalTokens = 0;
    let fitted = false;
    /** Tokenizes `text` with the options captured at construction time. */
    function tokenizeText(text) {
        return (0, tokenizer_1.tokenize)(text, { stem: stemOpt, tokenizer: tokenizerFn, stopwords: extraStopwords });
    }
    /**
     * Learns corpus statistics (vocabulary, document frequencies, average
     * document length) from `documents`.
     *
     * Each call replaces the previously fitted state. Without the reset,
     * N would describe only the latest corpus while df and totalTokens kept
     * counts from earlier ones, which can make df exceed N and break the IDF.
     */
    function fit(documents) {
        vocab = new vocab_1.Vocabulary();
        df.clear();
        totalTokens = 0;
        N = documents.length;
        for (const doc of documents) {
            const tokens = tokenizeText(doc);
            totalTokens += tokens.length;
            // Count each term at most once per document.
            const seen = new Set();
            for (const token of tokens) {
                const id = vocab.getOrAdd(token);
                if (!seen.has(id)) {
                    seen.add(id);
                    df.set(id, (df.get(id) ?? 0) + 1);
                }
            }
        }
        avgdl = N > 0 ? totalTokens / N : 0;
        fitted = true;
    }
    /**
     * Scores the known terms in `tokens` with BM25, using `dl` as the
     * document length for length normalisation.
     *
     * @returns `{ indices, values }` sorted by ascending term id
     * @throws Error when fit() has not been called yet
     */
    function computeBM25(tokens, dl) {
        if (!fitted)
            throw new Error('BM25Encoder must be fit() before encode()');
        // Count term frequencies; tokens missing from the vocabulary are skipped.
        const tf = new Map();
        for (const token of tokens) {
            const id = vocab.getId(token);
            if (id !== undefined) {
                tf.set(id, (tf.get(id) ?? 0) + 1);
            }
        }
        // tf is non-empty only when the vocabulary is, which implies avgdl > 0,
        // so the dl / avgdl division below never sees a zero denominator.
        const entries = [];
        for (const [termId, termTf] of tf) {
            const termDf = df.get(termId) ?? 0;
            const idf = Math.log((N - termDf + 0.5) / (termDf + 0.5) + 1);
            const score = idf * (termTf * (k1 + 1)) / (termTf + k1 * (1 - b + b * dl / avgdl));
            if (score > 0) {
                entries.push({ idx: termId, val: score });
            }
        }
        entries.sort((a, c) => a.idx - c.idx);
        return {
            indices: entries.map(e => e.idx),
            values: entries.map(e => e.val),
        };
    }
    /** Encodes one document, using its own token count as the length. */
    function encode(text) {
        const tokens = tokenizeText(text);
        return computeBM25(tokens, tokens.length);
    }
    /** Encodes each text with encode(), preserving input order. */
    function encodeBatch(texts) {
        return texts.map(t => encode(t));
    }
    /**
     * Encodes a query. avgdl is passed as the document length so the
     * length-normalisation factor evaluates to 1.
     *
     * @throws Error when fit() has not been called yet
     */
    function encodeQuery(text) {
        if (!fitted)
            throw new Error('BM25Encoder must be fit() before encodeQuery()');
        const tokens = tokenizeText(text);
        return computeBM25(tokens, avgdl);
    }
    /** Returns the fitted statistics, vocabulary and options as a JSON string. */
    function serialize() {
        return JSON.stringify({
            N,
            avgdl,
            totalTokens,
            df: Object.fromEntries(df),
            vocab: vocab.serialize(),
            options: { k1, b, stem: stemOpt, stopwords: extraStopwords },
        });
    }
    /** Returns summary statistics of the last fit. */
    function getStats() {
        return { N, avgdl, vocabSize: vocab.size, totalTokens };
    }
    return { fit, encode, encodeBatch, encodeQuery, serialize, getStats };
}
|
|
99
|
+
//# sourceMappingURL=bm25.js.map
|
package/dist/bm25.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"bm25.js","sourceRoot":"","sources":["../src/bm25.ts"],"names":[],"mappings":";;AAIA,gCA0GC;AA9GD,2CAAsC;AACtC,mCAAoC;AAGpC,SAAgB,UAAU,CAAC,OAAqB;IAC9C,MAAM,EAAE,GAAG,OAAO,EAAE,EAAE,IAAI,GAAG,CAAA;IAC7B,MAAM,CAAC,GAAG,OAAO,EAAE,CAAC,IAAI,IAAI,CAAA;IAC5B,MAAM,OAAO,GAAG,OAAO,EAAE,IAAI,IAAI,IAAI,CAAA;IACrC,MAAM,WAAW,GAAG,OAAO,EAAE,SAAS,CAAA;IACtC,MAAM,cAAc,GAAG,OAAO,EAAE,SAAS,IAAI,EAAE,CAAA;IAE/C,MAAM,KAAK,GAAG,IAAI,kBAAU,EAAE,CAAA;IAC9B,wDAAwD;IACxD,MAAM,EAAE,GAAG,IAAI,GAAG,EAAkB,CAAA;IACpC,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,IAAI,WAAW,GAAG,CAAC,CAAA;IACnB,IAAI,MAAM,GAAG,KAAK,CAAA;IAElB,SAAS,YAAY,CAAC,IAAY;QAChC,OAAO,IAAA,oBAAQ,EAAC,IAAI,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,SAAS,EAAE,WAAW,EAAE,SAAS,EAAE,cAAc,EAAE,CAAC,CAAA;IAC7F,CAAC;IAED,SAAS,GAAG,CAAC,SAAmB;QAC9B,CAAC,GAAG,SAAS,CAAC,MAAM,CAAA;QACpB,IAAI,QAAQ,GAAG,CAAC,CAAA;QAChB,MAAM,aAAa,GAAe,EAAE,CAAA;QAEpC,KAAK,MAAM,GAAG,IAAI,SAAS,EAAE,CAAC;YAC5B,MAAM,MAAM,GAAG,YAAY,CAAC,GAAG,CAAC,CAAA;YAChC,aAAa,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;YAC1B,QAAQ,IAAI,MAAM,CAAC,MAAM,CAAA;YACzB,WAAW,IAAI,MAAM,CAAC,MAAM,CAAA;YAE5B,oCAAoC;YACpC,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAA;YAC9B,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;gBAC3B,MAAM,EAAE,GAAG,KAAK,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAA;gBAChC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC;oBAClB,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAA;oBACZ,EAAE,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;gBACnC,CAAC;YACH,CAAC;QACH,CAAC;QAED,KAAK,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;QAChC,MAAM,GAAG,IAAI,CAAA;IACf,CAAC;IAED,SAAS,WAAW,CAAC,MAAgB,EAAE,EAAU;QAC/C,IAAI,CAAC,MAAM;YAAE,MAAM,IAAI,KAAK,CAAC,2CAA2C,CAAC,CAAA;QAEzE,yBAAyB;QACzB,MAAM,EAAE,GAAG,IAAI,GAAG,EAAkB,CAAA;QACpC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,MAAM,EAAE,GAAG,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,CAAA;YAC7B,IAAI,EAAE,KAAK,SAAS,EAAE,CAAC;gBACrB,EAAE,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAA
C,CAAA;YACnC,CAAC;QACH,CAAC;QAED,MAAM,OAAO,GAAmC,EAAE,CAAA;QAClD,KAAK,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,IAAI,EAAE,EAAE,CAAC;YAClC,MAAM,MAAM,GAAG,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;YAClC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,MAAM,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,CAAA;YAC7D,MAAM,KAAK,GAAG,GAAG,GAAG,CAAC,MAAM,GAAG,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,MAAM,GAAG,EAAE,GAAG,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,EAAE,GAAG,KAAK,CAAC,CAAC,CAAA;YAClF,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;gBACd,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE,KAAK,EAAE,CAAC,CAAA;YAC3C,CAAC;QACH,CAAC;QAED,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,CAAC,CAAA;QACrC,OAAO;YACL,OAAO,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC;YAChC,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC;SAChC,CAAA;IACH,CAAC;IAED,SAAS,MAAM,CAAC,IAAY;QAC1B,MAAM,MAAM,GAAG,YAAY,CAAC,IAAI,CAAC,CAAA;QACjC,OAAO,WAAW,CAAC,MAAM,EAAE,MAAM,CAAC,MAAM,CAAC,CAAA;IAC3C,CAAC;IAED,SAAS,WAAW,CAAC,KAAe;QAClC,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAA;IAClC,CAAC;IAED,SAAS,WAAW,CAAC,IAAY;QAC/B,sEAAsE;QACtE,IAAI,CAAC,MAAM;YAAE,MAAM,IAAI,KAAK,CAAC,gDAAgD,CAAC,CAAA;QAC9E,MAAM,MAAM,GAAG,YAAY,CAAC,IAAI,CAAC,CAAA;QACjC,OAAO,WAAW,CAAC,MAAM,EAAE,KAAK,CAAC,CAAA;IACnC,CAAC;IAED,SAAS,SAAS;QAChB,OAAO,IAAI,CAAC,SAAS,CAAC;YACpB,CAAC;YACD,KAAK;YACL,WAAW;YACX,EAAE,EAAE,MAAM,CAAC,WAAW,CAAC,EAAE,CAAC;YAC1B,KAAK,EAAE,KAAK,CAAC,SAAS,EAAE;YACxB,OAAO,EAAE,EAAE,EAAE,EAAE,CAAC,EAAE,IAAI,EAAE,OAAO,EAAE,SAAS,EAAE,cAAc,EAAE;SAC7D,CAAC,CAAA;IACJ,CAAC;IAED,SAAS,QAAQ;QACf,OAAO,EAAE,CAAC,EAAE,KAAK,EAAE,SAAS,EAAE,KAAK,CAAC,IAAI,EAAE,WAAW,EAAE,CAAA;IACzD,CAAC;IAED,OAAO,EAAE,GAAG,EAAE,MAAM,EAAE,WAAW,EAAE,WAAW,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAA;AACvE,CAAC"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAA;AACnC,OAAO,EAAE,WAAW,EAAE,MAAM,SAAS,CAAA;AACrC,YAAY,EACV,YAAY,EACZ,QAAQ,EACR,WAAW,EACX,YAAY,EACZ,WAAW,EACX,WAAW,EACX,YAAY,GACb,MAAM,SAAS,CAAA"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.createTFIDF = exports.createBM25 = void 0;
|
|
4
|
+
// sparse-encode - Generate BM25 and TF-IDF sparse vectors in JavaScript
|
|
5
|
+
var bm25_1 = require("./bm25");
|
|
6
|
+
Object.defineProperty(exports, "createBM25", { enumerable: true, get: function () { return bm25_1.createBM25; } });
|
|
7
|
+
var tfidf_1 = require("./tfidf");
|
|
8
|
+
Object.defineProperty(exports, "createTFIDF", { enumerable: true, get: function () { return tfidf_1.createTFIDF; } });
|
|
9
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;AAAA,wEAAwE;AACxE,+BAAmC;AAA1B,kGAAA,UAAU,OAAA;AACnB,iCAAqC;AAA5B,oGAAA,WAAW,OAAA"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"porter-stemmer.d.ts","sourceRoot":"","sources":["../src/porter-stemmer.ts"],"names":[],"mappings":"AA+CA,wBAAgB,IAAI,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CA+IzC"}
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Porter Stemmer — classic algorithm (Porter 1980)
|
|
3
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
4
|
+
exports.stem = stem;
|
|
5
|
+
/**
 * Porter's "*o" condition: the word ends consonant-vowel-consonant and the
 * final consonant is not w, x or y.
 *
 * The letter y counts as a consonant at the start of the word or after a
 * vowel, and as a vowel after a consonant, so "typ" satisfies the condition.
 *
 * @param word lower-case word or stem
 * @returns true when the last three letters form the pattern
 */
function hasCVC(word) {
    const n = word.length;
    if (n < 3)
        return false;
    if ('wxy'.includes(word[n - 1]))
        return false;
    // Whether the letter at index i acts as a consonant; for y this depends
    // on the letter before it.
    const isConsonantAt = (i) => {
        const ch = word[i];
        if ('aeiou'.includes(ch))
            return false;
        if (ch === 'y')
            return i === 0 || !isConsonantAt(i - 1);
        return true;
    };
    return isConsonantAt(n - 1) && !isConsonantAt(n - 2) && isConsonantAt(n - 3);
}
|
|
16
|
+
/**
 * Reports whether `word` contains a vowel: any of a, e, i, o, u, or a y
 * that is not the first letter.
 *
 * The position test uses the index of the letter being examined. Looking up
 * the first y in the word instead misses a later y when the word also
 * starts with y (for example "yby").
 *
 * @param word lower-case word or stem
 * @returns true when at least one vowel is present
 */
function containsVowel(word) {
    for (let i = 0; i < word.length; i++) {
        const ch = word[i];
        if ('aeiou'.includes(ch))
            return true;
        if (ch === 'y' && i > 0)
            return true;
    }
    return false;
}
|
|
25
|
+
/**
 * Computes Porter's measure m: the number of vowel-run to consonant-run
 * transitions in `word`, i.e. m in the form [C](VC)^m[V].
 *
 * The letter y is a vowel only when it follows a consonant; at the start of
 * the word or after a vowel it is a consonant. So "toy" has m = 1 and "by"
 * has m = 0.
 *
 * @param word lower-case word or stem
 * @returns the measure, 0 for the empty string
 */
function measure(word) {
    let m = 0;
    let prevIsVowel = false;
    for (let i = 0; i < word.length; i++) {
        const ch = word[i];
        const isVowel = 'aeiou'.includes(ch) || (ch === 'y' && i > 0 && !prevIsVowel);
        // A consonant directly after a vowel closes one VC sequence.
        if (prevIsVowel && !isVowel) {
            m++;
        }
        prevIsVowel = isVowel;
    }
    return m;
}
|
|
45
|
+
/**
 * Reports whether `word` ends in the same letter twice and that letter is
 * not one of a, e, i, o, u.
 *
 * @param word lower-case word or stem
 * @returns true for endings such as "ll" or "pp", false for "ee"
 */
function endsDoubleConsonant(word) {
    const n = word.length;
    if (n < 2) {
        return false;
    }
    const tail = word[n - 1];
    if (tail !== word[n - 2]) {
        return false;
    }
    return !'aeiou'.includes(tail);
}
|
|
52
|
+
// Step 2 rewrites, applied when the remaining stem has measure > 0.
// The first matching entry wins, so a longer suffix must precede any shorter
// suffix that shares its ending (e.g. 'ational' before 'tional').
const STEP2_RULES = [
    ['ational', 'ate'],
    ['tional', 'tion'],
    ['enci', 'ence'],
    ['anci', 'ance'],
    ['izer', 'ize'],
    ['abli', 'able'],
    ['alli', 'al'],
    ['entli', 'ent'],
    ['eli', 'e'],
    ['ousli', 'ous'],
    ['ization', 'ize'],
    ['ation', 'ate'],
    ['ator', 'ate'],
    ['alism', 'al'],
    ['iveness', 'ive'],
    ['fulness', 'ful'],
    ['ousness', 'ous'],
    ['aliti', 'al'],
    ['iviti', 'ive'],
    ['biliti', 'ble'],
];
// Step 3 rewrites, same matching rule as Step 2.
const STEP3_RULES = [
    ['icate', 'ic'],
    ['ative', ''],
    ['alize', 'al'],
    ['iciti', 'ic'],
    ['ical', 'ic'],
    ['ful', ''],
    ['ness', ''],
];
// Step 4 suffixes, removed when the remaining stem has measure > 1.
// 'ou' is part of Porter's Step 4 list.
const STEP4_SUFFIXES = [
    'ement', 'ment', 'ance', 'ence', 'able', 'ible', 'ant', 'ent',
    'ion', 'ism', 'ate', 'iti', 'ous', 'ou', 'ive', 'ize', 'al', 'er', 'ic',
];
/**
 * Applies the first rule in `rules` whose suffix ends `w`. The suffix is
 * replaced only when the stem before it has measure > 0; otherwise `w` is
 * returned unchanged and no further rule is tried.
 */
function rewriteSuffix(w, rules) {
    const rule = rules.find(([suffix]) => w.endsWith(suffix));
    if (rule === undefined)
        return w;
    const [suffix, replacement] = rule;
    const base = w.slice(0, -suffix.length);
    return measure(base) > 0 ? base + replacement : w;
}
/**
 * Reduces an English word to its stem with the Porter (1980) algorithm.
 *
 * The result is always lower-case. Words of one or two characters are
 * returned without any suffix stripping.
 *
 * @param word the word to stem
 * @returns the lower-cased stem
 */
function stem(word) {
    // Short words are not stemmed, but are still lower-cased so the output
    // case does not depend on word length.
    if (word.length <= 2)
        return word.toLowerCase();
    let w = word.toLowerCase();
    // Step 1a: plurals. sses -> ss, ies -> i, ss -> ss, s -> ''.
    if (w.endsWith('sses') || w.endsWith('ies')) {
        w = w.slice(0, -2);
    }
    else if (w.endsWith('s') && !w.endsWith('ss')) {
        w = w.slice(0, -1);
    }
    // Step 1b: -eed, -ed, -ing.
    let step1bTriggered = false;
    if (w.endsWith('eed')) {
        // eed -> ee only when the stem has measure > 0. A word ending in
        // 'eed' is never also handled by the plain -ed rule.
        if (measure(w.slice(0, -3)) > 0) {
            w = w.slice(0, -1);
        }
    }
    else {
        const suffix = w.endsWith('ed') ? 'ed' : (w.endsWith('ing') ? 'ing' : '');
        if (suffix !== '') {
            const base = w.slice(0, -suffix.length);
            if (containsVowel(base)) {
                w = base;
                step1bTriggered = true;
            }
        }
    }
    // Step 1b clean-up, only after -ed / -ing was removed.
    if (step1bTriggered) {
        if (w.endsWith('at') || w.endsWith('bl') || w.endsWith('iz')) {
            w = w + 'e';
        }
        else if (endsDoubleConsonant(w) && !w.endsWith('l') && !w.endsWith('s') && !w.endsWith('z')) {
            // Undouble the final consonant (hopp -> hop), except l, s, z.
            w = w.slice(0, -1);
        }
        else if (measure(w) === 1 && hasCVC(w)) {
            w = w + 'e';
        }
    }
    // Step 1c: terminal y -> i when the stem contains a vowel.
    if (w.length > 2 && w.endsWith('y')) {
        const before = w.slice(0, -1);
        if (containsVowel(before)) {
            w = before + 'i';
        }
    }
    // Steps 2 and 3: suffix rewrites.
    w = rewriteSuffix(w, STEP2_RULES);
    w = rewriteSuffix(w, STEP3_RULES);
    // Step 4: strip a residual suffix when the stem has measure > 1.
    const step4Suffix = STEP4_SUFFIXES.find(s => w.endsWith(s));
    if (step4Suffix !== undefined) {
        const base = w.slice(0, -step4Suffix.length);
        // 'ion' is removed only after s or t.
        const allowed = step4Suffix !== 'ion' || base.endsWith('s') || base.endsWith('t');
        if (allowed && measure(base) > 1) {
            w = base;
        }
    }
    // Step 5a: drop a final e.
    if (w.endsWith('e')) {
        const base = w.slice(0, -1);
        const m = measure(base);
        if (m > 1 || (m === 1 && !hasCVC(base))) {
            w = base;
        }
    }
    // Step 5b: ll -> l when the measure is > 1.
    if (w.endsWith('ll') && measure(w.slice(0, -1)) > 1) {
        w = w.slice(0, -1);
    }
    return w;
}
|
|
195
|
+
//# sourceMappingURL=porter-stemmer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"porter-stemmer.js","sourceRoot":"","sources":["../src/porter-stemmer.ts"],"names":[],"mappings":";AAAA,mDAAmD;;AA+CnD,oBA+IC;AA5LD,SAAS,MAAM,CAAC,IAAY;IAC1B,yEAAyE;IACzE,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,KAAK,CAAA;IACjC,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAClC,IAAI,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC;QAAE,OAAO,KAAK,CAAA;IACtC,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAA;IACjD,MAAM,CAAC,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,CAAA;IACzB,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAA;AACpF,CAAC;AAED,SAAS,aAAa,CAAC,IAAY;IACjC,KAAK,MAAM,EAAE,IAAI,IAAI,EAAE,CAAC;QACtB,IAAI,OAAO,CAAC,QAAQ,CAAC,EAAE,CAAC;YAAE,OAAO,IAAI,CAAA;QACrC,IAAI,EAAE,KAAK,GAAG,IAAI,IAAI,CAAC,OAAO,CAAC,EAAE,CAAC,GAAG,CAAC;YAAE,OAAO,IAAI,CAAA;IACrD,CAAC;IACD,OAAO,KAAK,CAAA;AACd,CAAC;AAED,SAAS,OAAO,CAAC,IAAY;IAC3B,qBAAqB;IACrB,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,CAAA;IACjD,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,IAAI,OAAO,GAAG,KAAK,CAAA;IACnB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC,CAAA;QAClB,MAAM,OAAO,GAAG,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,EAAE,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;QACvD,IAAI,OAAO,EAAE,CAAC;YACZ,OAAO,GAAG,IAAI,CAAA;QAChB,CAAC;aAAM,CAAC;YACN,IAAI,OAAO,EAAE,CAAC;gBACZ,CAAC,EAAE,CAAA;gBACH,OAAO,GAAG,KAAK,CAAA;YACjB,CAAC;QACH,CAAC;IACH,CAAC;IACD,OAAO,CAAC,CAAA;AACV,CAAC;AAED,SAAS,mBAAmB,CAAC,IAAY;IACvC,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,KAAK,CAAA;IACjC,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAClC,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAClC,OAAO,IAAI,KAAK,IAAI,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAA;AACjD,CAAC;AAED,SAAgB,IAAI,CAAC,IAAY;IAC/B,IAAI,IAAI,CAAC,MAAM,IAAI,CAAC;QAAE,OAAO,IAAI,CA
AA;IAEjC,IAAI,CAAC,GAAG,IAAI,CAAC,WAAW,EAAE,CAAA;IAE1B,UAAU;IACV,IAAI,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;QACvB,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;IACpB,CAAC;SAAM,IAAI,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC7B,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA,CAAC,UAAU;IAC/B,CAAC;SAAM,IAAI,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QAC5B,OAAO;IACT,CAAC;SAAM,IAAI,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QAC3B,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;IACpB,CAAC;IAED,UAAU;IACV,IAAI,eAAe,GAAG,KAAK,CAAA;IAC3B,IAAI,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QACtB,MAAM,MAAM,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;QAC7B,IAAI,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC;YACxB,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA,CAAC,WAAW;QAChC,CAAC;IACH,CAAC;SAAM,IAAI,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QAC5B,MAAM,MAAM,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;QAC7B,IAAI,aAAa,CAAC,MAAM,CAAC,EAAE,CAAC;YAC1B,CAAC,GAAG,MAAM,CAAA;YACV,eAAe,GAAG,IAAI,CAAA;QACxB,CAAC;IACH,CAAC;SAAM,IAAI,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC7B,MAAM,MAAM,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;QAC7B,IAAI,aAAa,CAAC,MAAM,CAAC,EAAE,CAAC;YAC1B,CAAC,GAAG,MAAM,CAAA;YACV,eAAe,GAAG,IAAI,CAAA;QACxB,CAAC;IACH,CAAC;IAED,IAAI,eAAe,EAAE,CAAC;QACpB,IAAI,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;YAC7D,CAAC,GAAG,CAAC,GAAG,GAAG,CAAA;QACb,CAAC;aAAM,IAAI,mBAAmB,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;YAC9F,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;QACpB,CAAC;aAAM,IAAI,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,IAAI,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC;YACzC,CAAC,GAAG,CAAC,GAAG,GAAG,CAAA;QACb,CAAC;IACH,CAAC;IAED,UAAU;IACV,IAAI,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACpC,MAAM,MAAM,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EA
AE,CAAC,CAAC,CAAC,CAAA;QAC7B,IAAI,aAAa,CAAC,MAAM,CAAC,EAAE,CAAC;YAC1B,CAAC,GAAG,MAAM,GAAG,GAAG,CAAA;QAClB,CAAC;IACH,CAAC;IAED,SAAS;IACT,MAAM,QAAQ,GAAuB;QACnC,CAAC,SAAS,EAAE,KAAK,CAAC;QAClB,CAAC,QAAQ,EAAE,MAAM,CAAC;QAClB,CAAC,MAAM,EAAE,MAAM,CAAC;QAChB,CAAC,MAAM,EAAE,MAAM,CAAC;QAChB,CAAC,MAAM,EAAE,KAAK,CAAC;QACf,CAAC,MAAM,EAAE,MAAM,CAAC;QAChB,CAAC,MAAM,EAAE,IAAI,CAAC;QACd,CAAC,OAAO,EAAE,KAAK,CAAC;QAChB,CAAC,KAAK,EAAE,GAAG,CAAC;QACZ,CAAC,OAAO,EAAE,KAAK,CAAC;QAChB,CAAC,SAAS,EAAE,KAAK,CAAC;QAClB,CAAC,OAAO,EAAE,KAAK,CAAC;QAChB,CAAC,MAAM,EAAE,KAAK,CAAC;QACf,CAAC,OAAO,EAAE,IAAI,CAAC;QACf,CAAC,SAAS,EAAE,KAAK,CAAC;QAClB,CAAC,SAAS,EAAE,KAAK,CAAC;QAClB,CAAC,SAAS,EAAE,KAAK,CAAC;QAClB,CAAC,OAAO,EAAE,IAAI,CAAC;QACf,CAAC,OAAO,EAAE,KAAK,CAAC;QAChB,CAAC,QAAQ,EAAE,KAAK,CAAC;KAClB,CAAA;IACD,KAAK,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,IAAI,QAAQ,EAAE,CAAC;QAC7C,IAAI,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;YACvB,MAAM,IAAI,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAA;YACvC,IAAI,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBACtB,CAAC,GAAG,IAAI,GAAG,WAAW,CAAA;YACxB,CAAC;YACD,MAAK;QACP,CAAC;IACH,CAAC;IAED,SAAS;IACT,MAAM,QAAQ,GAAuB;QACnC,CAAC,OAAO,EAAE,IAAI,CAAC;QACf,CAAC,OAAO,EAAE,EAAE,CAAC;QACb,CAAC,OAAO,EAAE,IAAI,CAAC;QACf,CAAC,OAAO,EAAE,IAAI,CAAC;QACf,CAAC,MAAM,EAAE,IAAI,CAAC;QACd,CAAC,KAAK,EAAE,EAAE,CAAC;QACX,CAAC,MAAM,EAAE,EAAE,CAAC;KACb,CAAA;IACD,KAAK,MAAM,CAAC,MAAM,EAAE,WAAW,CAAC,IAAI,QAAQ,EAAE,CAAC;QAC7C,IAAI,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;YACvB,MAAM,IAAI,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAA;YACvC,IAAI,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBACtB,CAAC,GAAG,IAAI,GAAG,WAAW,CAAA;YACxB,CAAC;YACD,MAAK;QACP,CAAC;IACH,CAAC;IAED,SAAS;IACT,MAAM,aAAa,GAAG;QACpB,OAAO,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,MAAM,EAAE,KAAK,EAAE,KAAK;QAC7D,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI;KAClE,CAAA;IACD,KAAK,MAAM,MAAM,IAAI,aAAa,EAAE,CAAC;QACnC,IAAI,CAAC,CAAC,QAAQ,CAAC,MAAM,CAAC,EAAE,CAAC;YACvB,MAAM,IAAI
,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,MAAM,CAAC,CAAA;YACvC,IAAI,MAAM,KAAK,KAAK,EAAE,CAAC;gBACrB,IAAI,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC;oBACpE,CAAC,GAAG,IAAI,CAAA;gBACV,CAAC;YACH,CAAC;iBAAM,IAAI,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC7B,CAAC,GAAG,IAAI,CAAA;YACV,CAAC;YACD,MAAK;QACP,CAAC;IACH,CAAC;IAED,UAAU;IACV,IAAI,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QACpB,MAAM,IAAI,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;QAC3B,IAAI,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;YACtB,CAAC,GAAG,IAAI,CAAA;QACV,CAAC;aAAM,IAAI,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC;YAChD,CAAC,GAAG,IAAI,CAAA;QACV,CAAC;IACH,CAAC;IAED,UAAU;IACV,IAAI,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,OAAO,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC;QACpD,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;IACpB,CAAC;IAED,OAAO,CAAC,CAAA;AACV,CAAC"}
|
package/dist/tfidf.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tfidf.d.ts","sourceRoot":"","sources":["../src/tfidf.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,YAAY,EAAE,YAAY,EAA0B,MAAM,SAAS,CAAA;AAEjF,wBAAgB,WAAW,CAAC,OAAO,CAAC,EAAE,YAAY,GAAG,YAAY,CAyGhE"}
|
package/dist/tfidf.js
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.createTFIDF = createTFIDF;
|
|
4
|
+
const tokenizer_1 = require("./tokenizer");
|
|
5
|
+
const vocab_1 = require("./vocab");
|
|
6
|
+
/**
 * Creates a TF-IDF sparse-vector encoder whose output is L2-normalised.
 *
 * Recognised options (all optional):
 *   - stem        forwarded to tokenize() (default true)
 *   - tokenizer   custom tokenizer function forwarded to tokenize()
 *   - stopwords   extra stopwords forwarded to tokenize() (default [])
 *   - sublinearTf use 1 + ln(count) instead of count / length (default false)
 *
 * @param options encoder options, may be omitted
 * @returns an object exposing fit, encode, encodeBatch, encodeQuery,
 *          serialize and getStats
 */
function createTFIDF(options) {
    const stemOpt = options?.stem ?? true;
    const tokenizerFn = options?.tokenizer;
    const extraStopwords = options?.stopwords ?? [];
    const sublinearTf = options?.sublinearTf ?? false;
    // Reassigned by fit() so that every fit starts from an empty vocabulary.
    let vocab = new vocab_1.Vocabulary();
    // df[termId] = number of documents containing that term
    const df = new Map();
    let N = 0;
    let totalTokens = 0;
    let avgdl = 0;
    let fitted = false;
    /** Tokenizes `text` with the options captured at construction time. */
    function tokenizeText(text) {
        return (0, tokenizer_1.tokenize)(text, { stem: stemOpt, tokenizer: tokenizerFn, stopwords: extraStopwords });
    }
    /**
     * Learns the vocabulary and document frequencies from `documents`.
     *
     * Each call replaces the previously fitted state. Without the reset,
     * N would describe only the latest corpus while df and totalTokens kept
     * counts from earlier ones.
     */
    function fit(documents) {
        vocab = new vocab_1.Vocabulary();
        df.clear();
        totalTokens = 0;
        N = documents.length;
        for (const doc of documents) {
            const tokens = tokenizeText(doc);
            totalTokens += tokens.length;
            // Count each term at most once per document.
            const seen = new Set();
            for (const token of tokens) {
                const id = vocab.getOrAdd(token);
                if (!seen.has(id)) {
                    seen.add(id);
                    df.set(id, (df.get(id) ?? 0) + 1);
                }
            }
        }
        avgdl = N > 0 ? totalTokens / N : 0;
        fitted = true;
    }
    /**
     * Encodes `text` as an L2-normalised TF-IDF vector over known terms.
     *
     * @returns `{ indices, values }` sorted by ascending term id; both are
     *          empty when the text yields no tokens
     * @throws Error when fit() has not been called yet
     */
    function encode(text) {
        if (!fitted)
            throw new Error('TFIDFEncoder must be fit() before encode()');
        const tokens = tokenizeText(text);
        const dl = tokens.length;
        if (dl === 0)
            return { indices: [], values: [] };
        // Count raw term frequencies; tokens missing from the vocabulary are skipped.
        const rawTf = new Map();
        for (const token of tokens) {
            const id = vocab.getId(token);
            if (id !== undefined) {
                rawTf.set(id, (rawTf.get(id) ?? 0) + 1);
            }
        }
        const entries = [];
        for (const [termId, count] of rawTf) {
            const tf = sublinearTf ? 1 + Math.log(count) : count / dl;
            const termDf = df.get(termId) ?? 0;
            // Smoothed IDF: the +1 terms keep the ratio finite and the result >= 1.
            const idf = Math.log((N + 1) / (termDf + 1)) + 1;
            const score = tf * idf;
            if (score > 0) {
                entries.push({ idx: termId, val: score });
            }
        }
        // L2 normalize
        const norm = Math.sqrt(entries.reduce((acc, e) => acc + e.val * e.val, 0));
        if (norm > 0) {
            for (const e of entries)
                e.val /= norm;
        }
        entries.sort((a, c) => a.idx - c.idx);
        return {
            indices: entries.map(e => e.idx),
            values: entries.map(e => e.val),
        };
    }
    /** Encodes each text with encode(), preserving input order. */
    function encodeBatch(texts) {
        return texts.map(t => encode(t));
    }
    /** Encodes a query; identical to encode() for TF-IDF. */
    function encodeQuery(text) {
        return encode(text);
    }
    /** Returns the fitted statistics, vocabulary and options as a JSON string. */
    function serialize() {
        return JSON.stringify({
            N,
            avgdl,
            totalTokens,
            df: Object.fromEntries(df),
            vocab: vocab.serialize(),
            options: { stem: stemOpt, sublinearTf, stopwords: extraStopwords },
        });
    }
    /** Returns summary statistics of the last fit. */
    function getStats() {
        return { N, avgdl, vocabSize: vocab.size, totalTokens };
    }
    return { fit, encode, encodeBatch, encodeQuery, serialize, getStats };
}
|
|
98
|
+
//# sourceMappingURL=tfidf.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tfidf.js","sourceRoot":"","sources":["../src/tfidf.ts"],"names":[],"mappings":";;AAIA,kCAyGC;AA7GD,2CAAsC;AACtC,mCAAoC;AAGpC,SAAgB,WAAW,CAAC,OAAsB;IAChD,MAAM,OAAO,GAAG,OAAO,EAAE,IAAI,IAAI,IAAI,CAAA;IACrC,MAAM,WAAW,GAAG,OAAO,EAAE,SAAS,CAAA;IACtC,MAAM,cAAc,GAAG,OAAO,EAAE,SAAS,IAAI,EAAE,CAAA;IAC/C,MAAM,WAAW,GAAG,OAAO,EAAE,WAAW,IAAI,KAAK,CAAA;IAEjD,MAAM,KAAK,GAAG,IAAI,kBAAU,EAAE,CAAA;IAC9B,MAAM,EAAE,GAAG,IAAI,GAAG,EAAkB,CAAA;IACpC,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,IAAI,WAAW,GAAG,CAAC,CAAA;IACnB,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,IAAI,MAAM,GAAG,KAAK,CAAA;IAElB,SAAS,YAAY,CAAC,IAAY;QAChC,OAAO,IAAA,oBAAQ,EAAC,IAAI,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,SAAS,EAAE,WAAW,EAAE,SAAS,EAAE,cAAc,EAAE,CAAC,CAAA;IAC7F,CAAC;IAED,SAAS,GAAG,CAAC,SAAmB;QAC9B,CAAC,GAAG,SAAS,CAAC,MAAM,CAAA;QACpB,IAAI,QAAQ,GAAG,CAAC,CAAA;QAEhB,KAAK,MAAM,GAAG,IAAI,SAAS,EAAE,CAAC;YAC5B,MAAM,MAAM,GAAG,YAAY,CAAC,GAAG,CAAC,CAAA;YAChC,QAAQ,IAAI,MAAM,CAAC,MAAM,CAAA;YACzB,WAAW,IAAI,MAAM,CAAC,MAAM,CAAA;YAE5B,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAA;YAC9B,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;gBAC3B,MAAM,EAAE,GAAG,KAAK,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAA;gBAChC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC;oBAClB,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,CAAA;oBACZ,EAAE,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;gBACnC,CAAC;YACH,CAAC;QACH,CAAC;QAED,KAAK,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;QAChC,MAAM,GAAG,IAAI,CAAA;IACf,CAAC;IAED,SAAS,MAAM,CAAC,IAAY;QAC1B,IAAI,CAAC,MAAM;YAAE,MAAM,IAAI,KAAK,CAAC,4CAA4C,CAAC,CAAA;QAE1E,MAAM,MAAM,GAAG,YAAY,CAAC,IAAI,CAAC,CAAA;QACjC,MAAM,EAAE,GAAG,MAAM,CAAC,MAAM,CAAA;QACxB,IAAI,EAAE,KAAK,CAAC;YAAE,OAAO,EAAE,OAAO,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,CAAA;QAEhD,6BAA6B;QAC7B,MAAM,KAAK,GAAG,IAAI,GAAG,EAAkB,CAAA;QACvC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,MAAM,EAAE,GAAG,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,CAAA;YAC7B,IAAI,EAAE,KAAK,SAAS,EAAE,CAAC;gBACrB,KAAK,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,CAAC,CAA
C,GAAG,CAAC,CAAC,CAAA;YACzC,CAAC;QACH,CAAC;QAED,MAAM,OAAO,GAAmC,EAAE,CAAA;QAClD,KAAK,MAAM,CAAC,MAAM,EAAE,KAAK,CAAC,IAAI,KAAK,EAAE,CAAC;YACpC,MAAM,EAAE,GAAG,WAAW,CAAC,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,EAAE,CAAA;YACzD,MAAM,MAAM,GAAG,EAAE,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;YAClC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAA;YAChD,MAAM,KAAK,GAAG,EAAE,GAAG,GAAG,CAAA;YACtB,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;gBACd,OAAO,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE,KAAK,EAAE,CAAC,CAAA;YAC3C,CAAC;QACH,CAAC;QAED,eAAe;QACf,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAA;QAC1E,IAAI,IAAI,GAAG,CAAC,EAAE,CAAC;YACb,KAAK,MAAM,CAAC,IAAI,OAAO;gBAAE,CAAC,CAAC,GAAG,IAAI,IAAI,CAAA;QACxC,CAAC;QAED,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,GAAG,CAAC,CAAA;QACrC,OAAO;YACL,OAAO,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC;YAChC,MAAM,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC;SAChC,CAAA;IACH,CAAC;IAED,SAAS,WAAW,CAAC,KAAe;QAClC,OAAO,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAA;IAClC,CAAC;IAED,SAAS,WAAW,CAAC,IAAY;QAC/B,8BAA8B;QAC9B,OAAO,MAAM,CAAC,IAAI,CAAC,CAAA;IACrB,CAAC;IAED,SAAS,SAAS;QAChB,OAAO,IAAI,CAAC,SAAS,CAAC;YACpB,CAAC;YACD,KAAK;YACL,WAAW;YACX,EAAE,EAAE,MAAM,CAAC,WAAW,CAAC,EAAE,CAAC;YAC1B,KAAK,EAAE,KAAK,CAAC,SAAS,EAAE;YACxB,OAAO,EAAE,EAAE,IAAI,EAAE,OAAO,EAAE,WAAW,EAAE,SAAS,EAAE,cAAc,EAAE;SACnE,CAAC,CAAA;IACJ,CAAC;IAED,SAAS,QAAQ;QACf,OAAO,EAAE,CAAC,EAAE,KAAK,EAAE,SAAS,EAAE,KAAK,CAAC,IAAI,EAAE,WAAW,EAAE,CAAA;IACzD,CAAC;IAED,OAAO,EAAE,GAAG,EAAE,MAAM,EAAE,WAAW,EAAE,WAAW,EAAE,SAAS,EAAE,QAAQ,EAAE,CAAA;AACvE,CAAC"}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
import type { TokenizerFn } from './types';
|
|
2
|
+
/**
 * Built-in tokenizer: lower-cases `text`, splits it on runs of non-word
 * characters, drops empty and digit-only tokens, removes the default
 * stopwords and stems every remaining token.
 *
 * @param text the text to tokenize
 * @returns the stemmed tokens in order of appearance
 */
export declare function defaultTokenizer(text: string): string[];
|
|
3
|
+
/**
 * Tokenizes `text`.
 *
 * When `options.tokenizer` is supplied its output is used as the token list;
 * otherwise the built-in pipeline runs and `options.stem` (default true)
 * controls whether tokens are stemmed. `options.stopwords` are extra
 * stopwords, lower-cased before comparison, removed from the result.
 *
 * @param text the text to tokenize
 * @param options optional tokenizer settings
 * @returns the resulting tokens
 */
export declare function tokenize(text: string, options?: {
|
|
4
|
+
stopwords?: string[];
|
|
5
|
+
stem?: boolean;
|
|
6
|
+
tokenizer?: TokenizerFn;
|
|
7
|
+
}): string[];
|
|
8
|
+
//# sourceMappingURL=tokenizer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../src/tokenizer.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,SAAS,CAAA;AAS1C,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE,CAOvD;AAED,wBAAgB,QAAQ,CACtB,IAAI,EAAE,MAAM,EACZ,OAAO,CAAC,EAAE;IACR,SAAS,CAAC,EAAE,MAAM,EAAE,CAAA;IACpB,IAAI,CAAC,EAAE,OAAO,CAAA;IACd,SAAS,CAAC,EAAE,WAAW,CAAA;CACxB,GACA,MAAM,EAAE,CAyBV"}
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.defaultTokenizer = defaultTokenizer;
|
|
4
|
+
exports.tokenize = tokenize;
|
|
5
|
+
const porter_stemmer_1 = require("./porter-stemmer");
|
|
6
|
+
// Stopwords removed by the built-in tokenizer pipeline. All entries are
// lower-case because tokens are lower-cased before the lookup.
const DEFAULT_STOPWORDS = new Set([
    // articles and conjunctions
    'a', 'an', 'the', 'and', 'or', 'but',
    // prepositions
    'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
    // forms of "to be"
    'is', 'are', 'was', 'were', 'be', 'been', 'being',
    // forms of "to have" and "to do"
    'have', 'has', 'had', 'do', 'does', 'did',
    // modal verbs
    'will', 'would', 'could', 'should', 'may', 'might', 'shall', 'can',
]);
|
|
12
|
+
/**
 * Built-in tokenizer: lower-cases the text, splits on runs of non-word
 * characters, discards empty and digit-only pieces and default stopwords,
 * then stems what is left.
 *
 * @param text the text to tokenize
 * @returns the stemmed tokens in order of appearance
 */
function defaultTokenizer(text) {
    const stemmed = [];
    for (const piece of text.toLowerCase().split(/[^\w]+/)) {
        // Skip the empty strings produced by leading/trailing separators
        // and tokens made up of digits only.
        if (piece.length === 0 || /^\d+$/.test(piece)) {
            continue;
        }
        if (DEFAULT_STOPWORDS.has(piece)) {
            continue;
        }
        stemmed.push((0, porter_stemmer_1.stem)(piece));
    }
    return stemmed;
}
|
|
20
|
+
/**
 * Tokenizes `text`.
 *
 * With `options.tokenizer` the custom function produces the tokens and only
 * the extra stopwords are filtered from its output. Otherwise the built-in
 * pipeline runs: lower-case, split on non-word characters, drop empty and
 * digit-only tokens, drop default and extra stopwords, then stem unless
 * `options.stem` is false.
 *
 * Extra stopwords are matched against the unstemmed tokens, so a stopword
 * such as "quickly" removes the word even though its stem is "quickli".
 * They are matched once more after stemming so stopwords supplied in
 * stemmed form keep working.
 *
 * @param text the text to tokenize
 * @param options optional `{ stopwords, stem, tokenizer }`
 * @returns the resulting tokens
 */
function tokenize(text, options) {
    const { tokenizer, stopwords, stem: doStem = true } = options ?? {};
    const extra = new Set((stopwords ?? []).map(s => s.toLowerCase()));
    const isNotExtra = (t) => !extra.has(t);
    if (tokenizer) {
        // A custom tokenizer owns normalisation and stemming; only the
        // caller's stopwords are applied to its output.
        const custom = tokenizer(text);
        return extra.size > 0 ? custom.filter(isNotExtra) : custom;
    }
    let tokens = text
        .toLowerCase()
        .split(/[^\w]+/)
        .filter(t => t.length > 0 && !/^\d+$/.test(t))
        .filter(t => !DEFAULT_STOPWORDS.has(t) && !extra.has(t));
    if (doStem) {
        tokens = tokens.map(t => (0, porter_stemmer_1.stem)(t));
        // Second pass for stopwords given in already-stemmed form.
        if (extra.size > 0) {
            tokens = tokens.filter(isNotExtra);
        }
    }
    return tokens;
}
|
|
43
|
+
//# sourceMappingURL=tokenizer.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../src/tokenizer.ts"],"names":[],"mappings":";;AAUA,4CAOC;AAED,4BAgCC;AAnDD,qDAAuC;AAGvC,MAAM,iBAAiB,GAAG,IAAI,GAAG,CAAC;IAChC,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK;IACnE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,KAAK,EAAE,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO;IACrE,MAAM,EAAE,KAAK,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO;IACnE,QAAQ,EAAE,KAAK,EAAE,OAAO,EAAE,OAAO,EAAE,KAAK;CACzC,CAAC,CAAA;AAEF,SAAgB,gBAAgB,CAAC,IAAY;IAC3C,OAAO,IAAI;SACR,WAAW,EAAE;SACb,KAAK,CAAC,QAAQ,CAAC;SACf,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;SAC7C,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,iBAAiB,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;SACtC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,IAAA,qBAAI,EAAC,CAAC,CAAC,CAAC,CAAA;AACtB,CAAC;AAED,SAAgB,QAAQ,CACtB,IAAY,EACZ,OAIC;IAED,MAAM,EAAE,SAAS,EAAE,SAAS,EAAE,IAAI,EAAE,MAAM,GAAG,IAAI,EAAE,GAAG,OAAO,IAAI,EAAE,CAAA;IAEnE,IAAI,MAAgB,CAAA;IACpB,IAAI,SAAS,EAAE,CAAC;QACd,MAAM,GAAG,SAAS,CAAC,IAAI,CAAC,CAAA;IAC1B,CAAC;SAAM,CAAC;QACN,8CAA8C;QAC9C,MAAM,GAAG,IAAI;aACV,WAAW,EAAE;aACb,KAAK,CAAC,QAAQ,CAAC;aACf,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;aAC7C,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,iBAAiB,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAA;QAEzC,IAAI,MAAM,EAAE,CAAC;YACX,MAAM,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,IAAA,qBAAI,EAAC,CAAC,CAAC,CAAC,CAAA;QACnC,CAAC;IACH,CAAC;IAED,IAAI,SAAS,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACtC,MAAM,KAAK,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,CAAA;QAC1D,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAA;IAC5C,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC"}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
 * Sparse vector produced by the encoders' `encode`, `encodeBatch` and `encodeQuery` methods,
 * stored as two plain number arrays.
 *
 * NOTE(review): `indices[i]` presumably pairs with `values[i]` (a term id and its weight) —
 * confirm against the encoder implementations.
 */
export interface SparseVector {
|
|
2
|
+
indices: number[];
|
|
3
|
+
values: number[];
|
|
4
|
+
}
|
|
5
|
+
/**
 * Summary numbers returned by an encoder's `getStats()`.
 *
 * NOTE(review): judging by the names only, `N` is presumably the number of fitted documents,
 * `avgdl` the average document length in tokens, `vocabSize` the number of distinct terms and
 * `totalTokens` the token count over the whole corpus — confirm against the encoders.
 */
export interface FitStats {
|
|
6
|
+
N: number;
|
|
7
|
+
avgdl: number;
|
|
8
|
+
vocabSize: number;
|
|
9
|
+
totalTokens: number;
|
|
10
|
+
}
|
|
11
|
+
/**
 * Signature of a pluggable tokenizer: receives the raw text and returns its tokens.
 * This is the type of the `tokenizer` option accepted by `tokenize` and by the encoder options.
 */
export type TokenizerFn = (text: string) => string[];
|
|
12
|
+
/**
 * Optional settings for the BM25 encoder; every field may be omitted.
 *
 * NOTE(review): `k1` and `b` are presumably the usual BM25 term-frequency saturation and
 * length-normalisation parameters, and `tokenizer` / `stopwords` / `stem` presumably carry the
 * same meaning as the identically named `tokenize` options — confirm against bm25.js, which
 * also defines the defaults.
 */
export interface BM25Options {
|
|
13
|
+
k1?: number;
|
|
14
|
+
b?: number;
|
|
15
|
+
tokenizer?: TokenizerFn;
|
|
16
|
+
stopwords?: string[];
|
|
17
|
+
stem?: boolean;
|
|
18
|
+
}
|
|
19
|
+
/**
 * Optional settings for the TF-IDF encoder; every field may be omitted.
 *
 * NOTE(review): `tokenizer` / `stopwords` / `stem` presumably carry the same meaning as the
 * identically named `tokenize` options, and `sublinearTf` presumably switches the term
 * frequency to a logarithmic scale — confirm against tfidf.js, which also defines the defaults.
 */
export interface TFIDFOptions {
|
|
20
|
+
tokenizer?: TokenizerFn;
|
|
21
|
+
stopwords?: string[];
|
|
22
|
+
stem?: boolean;
|
|
23
|
+
sublinearTf?: boolean;
|
|
24
|
+
}
|
|
25
|
+
/**
 * Public surface of a BM25 encoder: `fit` takes the corpus as an array of strings, the
 * `encode*` methods turn text into `SparseVector`s (`encodeBatch` returns one per input),
 * `serialize` returns a string and `getStats` returns a `FitStats` object.
 *
 * NOTE(review): `fit` presumably has to be called before encoding, `encodeQuery` presumably
 * weights query text differently from `encode`, and `serialize` presumably emits JSON —
 * confirm against bm25.js.
 */
export interface BM25Encoder {
|
|
26
|
+
fit(documents: string[]): void;
|
|
27
|
+
encode(text: string): SparseVector;
|
|
28
|
+
encodeBatch(texts: string[]): SparseVector[];
|
|
29
|
+
encodeQuery(text: string): SparseVector;
|
|
30
|
+
serialize(): string;
|
|
31
|
+
getStats(): FitStats;
|
|
32
|
+
}
|
|
33
|
+
/**
 * Public surface of a TF-IDF encoder; it declares the same six methods, with the same
 * signatures, as `BM25Encoder`: `fit` takes the corpus as an array of strings, the `encode*`
 * methods return `SparseVector`s, `serialize` returns a string and `getStats` a `FitStats`.
 *
 * NOTE(review): `fit` presumably has to be called before encoding, and `serialize` presumably
 * emits JSON — confirm against tfidf.js.
 */
export interface TFIDFEncoder {
|
|
34
|
+
fit(documents: string[]): void;
|
|
35
|
+
encode(text: string): SparseVector;
|
|
36
|
+
encodeBatch(texts: string[]): SparseVector[];
|
|
37
|
+
encodeQuery(text: string): SparseVector;
|
|
38
|
+
serialize(): string;
|
|
39
|
+
getStats(): FitStats;
|
|
40
|
+
}
|
|
41
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,YAAY;IAC3B,OAAO,EAAE,MAAM,EAAE,CAAA;IACjB,MAAM,EAAE,MAAM,EAAE,CAAA;CACjB;AAED,MAAM,WAAW,QAAQ;IACvB,CAAC,EAAE,MAAM,CAAA;IACT,KAAK,EAAE,MAAM,CAAA;IACb,SAAS,EAAE,MAAM,CAAA;IACjB,WAAW,EAAE,MAAM,CAAA;CACpB;AAED,MAAM,MAAM,WAAW,GAAG,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,EAAE,CAAA;AAEpD,MAAM,WAAW,WAAW;IAC1B,EAAE,CAAC,EAAE,MAAM,CAAA;IACX,CAAC,CAAC,EAAE,MAAM,CAAA;IACV,SAAS,CAAC,EAAE,WAAW,CAAA;IACvB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAA;IACpB,IAAI,CAAC,EAAE,OAAO,CAAA;CACf;AAED,MAAM,WAAW,YAAY;IAC3B,SAAS,CAAC,EAAE,WAAW,CAAA;IACvB,SAAS,CAAC,EAAE,MAAM,EAAE,CAAA;IACpB,IAAI,CAAC,EAAE,OAAO,CAAA;IACd,WAAW,CAAC,EAAE,OAAO,CAAA;CACtB;AAED,MAAM,WAAW,WAAW;IAC1B,GAAG,CAAC,SAAS,EAAE,MAAM,EAAE,GAAG,IAAI,CAAA;IAC9B,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY,CAAA;IAClC,WAAW,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,YAAY,EAAE,CAAA;IAC5C,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY,CAAA;IACvC,SAAS,IAAI,MAAM,CAAA;IACnB,QAAQ,IAAI,QAAQ,CAAA;CACrB;AAED,MAAM,WAAW,YAAY;IAC3B,GAAG,CAAC,SAAS,EAAE,MAAM,EAAE,GAAG,IAAI,CAAA;IAC9B,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY,CAAA;IAClC,WAAW,CAAC,KAAK,EAAE,MAAM,EAAE,GAAG,YAAY,EAAE,CAAA;IAC5C,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY,CAAA;IACvC,SAAS,IAAI,MAAM,CAAA;IACnB,QAAQ,IAAI,QAAQ,CAAA;CACrB"}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":""}
|
package/dist/vocab.d.ts
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
/**
 * Two-way table between terms and integer ids.
 *
 * - `getOrAdd(term)` returns the term's id, assigning the next sequential id (the current
 *   number of terms, so ids start at 0) when the term has not been seen before.
 * - `getId(term)` looks a term up without adding it; `undefined` means "unknown term".
 * - `size` is the number of registered terms.
 * - `terms()` returns a copy of the term list in id order.
 * - `serialize()` returns an object holding the term list under the key `terms`.
 * - `deserialize(data)` builds a new vocabulary by re-adding every entry of `data.terms`
 *   in order.
 */
export declare class Vocabulary {
|
|
2
|
+
private termToId;
|
|
3
|
+
private idToTerm;
|
|
4
|
+
getOrAdd(term: string): number;
|
|
5
|
+
getId(term: string): number | undefined;
|
|
6
|
+
get size(): number;
|
|
7
|
+
terms(): string[];
|
|
8
|
+
serialize(): object;
|
|
9
|
+
static deserialize(data: object): Vocabulary;
|
|
10
|
+
}
|
|
11
|
+
//# sourceMappingURL=vocab.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"vocab.d.ts","sourceRoot":"","sources":["../src/vocab.ts"],"names":[],"mappings":"AAAA,qBAAa,UAAU;IACrB,OAAO,CAAC,QAAQ,CAA4B;IAC5C,OAAO,CAAC,QAAQ,CAAe;IAE/B,QAAQ,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM;IAU9B,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS;IAIvC,IAAI,IAAI,IAAI,MAAM,CAEjB;IAED,KAAK,IAAI,MAAM,EAAE;IAIjB,SAAS,IAAI,MAAM;IAInB,MAAM,CAAC,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,UAAU;CAQ7C"}
|
package/dist/vocab.js
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.Vocabulary = void 0;
|
|
4
|
+
/**
 * Two-way table between terms and dense integer ids.
 *
 * Ids are handed out in insertion order starting at 0, so `idToTerm[id]` is the term that owns
 * `id` and `termToId` is the inverse lookup. The two structures must always stay in sync.
 */
class Vocabulary {
    termToId = new Map();
    idToTerm = [];
    /**
     * Returns the id of `term`, registering it under the next free id if it is new.
     *
     * @param {string} term
     * @returns {number}
     */
    getOrAdd(term) {
        let id = this.termToId.get(term);
        if (id === undefined) {
            // The next free id is always the current number of terms.
            id = this.idToTerm.length;
            this.termToId.set(term, id);
            this.idToTerm.push(term);
        }
        return id;
    }
    /**
     * Looks up `term` without registering it.
     *
     * @param {string} term
     * @returns {number | undefined} The id, or `undefined` for an unknown term.
     */
    getId(term) {
        return this.termToId.get(term);
    }
    /** Number of registered terms. */
    get size() {
        return this.idToTerm.length;
    }
    /**
     * @returns {string[]} A copy of all terms in id order.
     */
    terms() {
        return [...this.idToTerm];
    }
    /**
     * @returns {{ terms: string[] }} A JSON-friendly snapshot of the vocabulary.
     */
    serialize() {
        // Hand out a copy: exposing the live array would let a caller mutate it and silently
        // desynchronise `idToTerm` from `termToId`.
        return { terms: [...this.idToTerm] };
    }
    /**
     * Rebuilds a vocabulary from the output of `serialize()`.
     *
     * Terms are re-added in order, so ids are reproduced exactly; a repeated entry collapses
     * onto its first occurrence, just like repeated `getOrAdd` calls.
     *
     * @param {{ terms: string[] }} data
     * @returns {Vocabulary}
     * @throws {TypeError} If `data.terms` is not an array of strings.
     */
    static deserialize(data) {
        const terms = data?.terms;
        if (!Array.isArray(terms) || terms.some(t => typeof t !== 'string')) {
            throw new TypeError('Vocabulary.deserialize: expected an object of the form { terms: string[] }');
        }
        const v = new Vocabulary();
        for (const term of terms) {
            v.getOrAdd(term);
        }
        return v;
    }
}
|
|
37
|
+
exports.Vocabulary = Vocabulary;
|
|
38
|
+
//# sourceMappingURL=vocab.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"vocab.js","sourceRoot":"","sources":["../src/vocab.ts"],"names":[],"mappings":";;;AAAA,MAAa,UAAU;IACb,QAAQ,GAAG,IAAI,GAAG,EAAkB,CAAA;IACpC,QAAQ,GAAa,EAAE,CAAA;IAE/B,QAAQ,CAAC,IAAY;QACnB,IAAI,EAAE,GAAG,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QAChC,IAAI,EAAE,KAAK,SAAS,EAAE,CAAC;YACrB,EAAE,GAAG,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAA;YACzB,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAA;YAC3B,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;QAC1B,CAAC;QACD,OAAO,EAAE,CAAA;IACX,CAAC;IAED,KAAK,CAAC,IAAY;QAChB,OAAO,IAAI,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;IAChC,CAAC;IAED,IAAI,IAAI;QACN,OAAO,IAAI,CAAC,QAAQ,CAAC,MAAM,CAAA;IAC7B,CAAC;IAED,KAAK;QACH,OAAO,CAAC,GAAG,IAAI,CAAC,QAAQ,CAAC,CAAA;IAC3B,CAAC;IAED,SAAS;QACP,OAAO,EAAE,KAAK,EAAE,IAAI,CAAC,QAAQ,EAAE,CAAA;IACjC,CAAC;IAED,MAAM,CAAC,WAAW,CAAC,IAAY;QAC7B,MAAM,CAAC,GAAG,IAAI,UAAU,EAAE,CAAA;QAC1B,MAAM,CAAC,GAAG,IAA2B,CAAA;QACrC,KAAK,MAAM,IAAI,IAAI,CAAC,CAAC,KAAK,EAAE,CAAC;YAC3B,CAAC,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAA;QAClB,CAAC;QACD,OAAO,CAAC,CAAA;IACV,CAAC;CACF;AAtCD,gCAsCC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "sparse-encode",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Generate BM25 and TF-IDF sparse vectors in JavaScript",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"types": "dist/index.d.ts",
|
|
7
|
+
"files": [
|
|
8
|
+
"dist"
|
|
9
|
+
],
|
|
10
|
+
"scripts": {
|
|
11
|
+
"build": "tsc",
|
|
12
|
+
"test": "vitest run",
|
|
13
|
+
"lint": "eslint src/",
|
|
14
|
+
"prepublishOnly": "npm run build"
|
|
15
|
+
},
|
|
16
|
+
"keywords": [],
|
|
17
|
+
"author": "",
|
|
18
|
+
"license": "MIT",
|
|
19
|
+
"engines": {
|
|
20
|
+
"node": ">=18"
|
|
21
|
+
},
|
|
22
|
+
"publishConfig": {
|
|
23
|
+
"access": "public"
|
|
24
|
+
},
|
|
25
|
+
"devDependencies": {
|
|
26
|
+
"@types/node": "^25.5.0",
|
|
27
|
+
"@typescript-eslint/eslint-plugin": "^8.57.1",
|
|
28
|
+
"@typescript-eslint/parser": "^8.57.1",
|
|
29
|
+
"eslint": "^10.1.0",
|
|
30
|
+
"typescript": "^5.9.3",
|
|
31
|
+
"vitest": "^4.1.0"
|
|
32
|
+
}
|
|
33
|
+
}
|