@thi.ng/text-analysis 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -1
- package/README.md +1 -1
- package/frequencies.d.ts +40 -0
- package/frequencies.js +10 -0
- package/package.json +8 -8
package/CHANGELOG.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Change Log
|
|
2
2
|
|
|
3
|
-
- **Last updated**: 2025-06-
|
|
3
|
+
- **Last updated**: 2025-06-18T12:01:21Z
|
|
4
4
|
- **Generator**: [thi.ng/monopub](https://thi.ng/monopub)
|
|
5
5
|
|
|
6
6
|
All notable changes to this project will be documented in this file.
|
|
@@ -11,6 +11,17 @@ See [Conventional Commits](https://conventionalcommits.org/) for commit guidelin
|
|
|
11
11
|
**Note:** Unlisted _patch_ versions only involve non-code or otherwise excluded changes
|
|
12
12
|
and/or version bumps of transitive dependencies.
|
|
13
13
|
|
|
14
|
+
## [0.4.0](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.4.0) (2025-06-18)
|
|
15
|
+
|
|
16
|
+
#### 🚀 Features
|
|
17
|
+
|
|
18
|
+
- add `filterDocsFrequency()` ([6ac1f90](https://github.com/thi-ng/umbrella/commit/6ac1f90))
|
|
19
|
+
|
|
20
|
+
#### ⏱ Performance improvements
|
|
21
|
+
|
|
22
|
+
- minor update kmeansDense() ([ebd5618](https://github.com/thi-ng/umbrella/commit/ebd5618))
|
|
23
|
+
- internal use `lookupUnsafe()`
|
|
24
|
+
|
|
14
25
|
### [0.3.1](https://github.com/thi-ng/umbrella/tree/@thi.ng/text-analysis@0.3.1) (2025-06-15)
|
|
15
26
|
|
|
16
27
|
#### 🩹 Bug fixes
|
package/README.md
CHANGED
package/frequencies.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import type { Fn, Fn2 } from "@thi.ng/api";
|
|
1
2
|
import { frequencies as $freq } from "@thi.ng/transducers/frequencies";
|
|
2
3
|
import { normFrequenciesAuto as $norm } from "@thi.ng/transducers/norm-frequencies-auto";
|
|
3
4
|
import { sortedFrequencies as $sorted } from "@thi.ng/transducers/sorted-frequencies";
|
|
@@ -59,4 +60,43 @@ export declare const normFrequencies: typeof $norm;
|
|
|
59
60
|
* ```
|
|
60
61
|
*/
|
|
61
62
|
export declare const sortedFrequencies: typeof $sorted;
|
|
63
|
+
/**
|
|
64
|
+
* Takes an array of tokenized documents, a histogram function (`frequencies`)
|
|
65
|
+
* and a predicate function (`pred`). First computes the combined histogram of
|
|
66
|
+
* terms/words in all given docs using `frequencies`, then filters each document
|
|
67
|
+
* using supplied predicate, which is called with a single word/token and its
|
|
68
|
+
* computed frequency. Only words are kept for which the predicate succeeds.
|
|
69
|
+
*
|
|
70
|
+
* @remarks
|
|
71
|
+
* See {@link frequencies} and {@link normFrequencies} for histogram creation.
|
|
72
|
+
*
|
|
73
|
+
* @example
|
|
74
|
+
* ```ts tangle:../export/filter-docs-frequency.ts
|
|
75
|
+
* import { filterDocsFrequency, frequencies } from "@thi.ng/text-analysis";
|
|
76
|
+
*
|
|
77
|
+
* const docs = [
|
|
78
|
+
* ["a", "b", "c"],
|
|
79
|
+
* ["a", "b", "d", "e"],
|
|
80
|
+
* ["b", "f", "g"],
|
|
81
|
+
* ["a", "b", "c", "f"],
|
|
82
|
+
* ["a", "g", "h"]
|
|
83
|
+
* ];
|
|
84
|
+
*
|
|
85
|
+
* // only keep words which occur more than once
|
|
86
|
+
* const filtered = filterDocsFrequency(docs, frequencies, (_, x) => x > 1);
|
|
87
|
+
*
|
|
88
|
+
* // show before & after
|
|
89
|
+
* for(let i = 0; i < docs.length; i++) console.log(docs[i], "=>", filtered[i]);
|
|
90
|
+
* // [ "a", "b", "c" ] => [ "a", "b", "c" ]
|
|
91
|
+
* // [ "a", "b", "d", "e" ] => [ "a", "b" ]
|
|
92
|
+
* // [ "b", "f", "g" ] => [ "b", "f", "g" ]
|
|
93
|
+
* // [ "a", "b", "c", "f" ] => [ "a", "b", "c", "f" ]
|
|
94
|
+
* // [ "a", "g", "h" ] => [ "a", "g" ]
|
|
95
|
+
* ```
|
|
96
|
+
*
|
|
97
|
+
* @param docs
|
|
98
|
+
* @param frequencies
|
|
99
|
+
* @param pred
|
|
100
|
+
*/
|
|
101
|
+
export declare const filterDocsFrequency: (docs: string[][], frequencies: Fn<Iterable<string>, Map<string, number>>, pred: Fn2<string, number, boolean>) => string[][];
|
|
62
102
|
//# sourceMappingURL=frequencies.d.ts.map
|
package/frequencies.js
CHANGED
|
@@ -4,7 +4,17 @@ import { sortedFrequencies as $sorted } from "@thi.ng/transducers/sorted-frequen
|
|
|
4
4
|
const frequencies = $freq;
|
|
5
5
|
const normFrequencies = $norm;
|
|
6
6
|
const sortedFrequencies = $sorted;
|
|
7
|
+
const filterDocsFrequency = (docs, frequencies2, pred) => {
|
|
8
|
+
const histogram = frequencies2(docs.flat());
|
|
9
|
+
return docs.map(
|
|
10
|
+
(doc) => doc.filter((word) => {
|
|
11
|
+
const freq = histogram.get(word);
|
|
12
|
+
return freq !== void 0 && pred(word, freq);
|
|
13
|
+
})
|
|
14
|
+
);
|
|
15
|
+
};
|
|
7
16
|
export {
|
|
17
|
+
filterDocsFrequency,
|
|
8
18
|
frequencies,
|
|
9
19
|
normFrequencies,
|
|
10
20
|
sortedFrequencies
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@thi.ng/text-analysis",
|
|
3
|
-
"version": "0.3.1",
|
|
3
|
+
"version": "0.4.0",
|
|
4
4
|
"description": "Text tokenization, transformation & analysis transducers, utilities, stop words, porter stemming, vector encodings, similarities",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"module": "./index.js",
|
|
@@ -40,14 +40,14 @@
|
|
|
40
40
|
},
|
|
41
41
|
"dependencies": {
|
|
42
42
|
"@thi.ng/api": "^8.11.29",
|
|
43
|
-
"@thi.ng/arrays": "^2.
|
|
43
|
+
"@thi.ng/arrays": "^2.13.0",
|
|
44
44
|
"@thi.ng/bidir-index": "^1.3.0",
|
|
45
45
|
"@thi.ng/checks": "^3.7.9",
|
|
46
|
-
"@thi.ng/distance": "^3.0.
|
|
47
|
-
"@thi.ng/k-means": "^1.1.
|
|
48
|
-
"@thi.ng/strings": "^3.9.
|
|
49
|
-
"@thi.ng/transducers": "^9.4.
|
|
50
|
-
"@thi.ng/vectors": "^8.3.
|
|
46
|
+
"@thi.ng/distance": "^3.0.1",
|
|
47
|
+
"@thi.ng/k-means": "^1.1.1",
|
|
48
|
+
"@thi.ng/strings": "^3.9.15",
|
|
49
|
+
"@thi.ng/transducers": "^9.4.2",
|
|
50
|
+
"@thi.ng/vectors": "^8.3.1"
|
|
51
51
|
},
|
|
52
52
|
"devDependencies": {
|
|
53
53
|
"esbuild": "^0.25.5",
|
|
@@ -141,5 +141,5 @@
|
|
|
141
141
|
"status": "alpha",
|
|
142
142
|
"year": 2021
|
|
143
143
|
},
|
|
144
|
-
"gitHead": "
|
|
144
|
+
"gitHead": "b076434a497b291ad33e81b1a15f6a71e2c82cc2\n"
|
|
145
145
|
}
|