@sepiariver/unique-set 2.0.3 → 3.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/PERF.md +30 -140
- package/README.md +47 -59
- package/dist/index.d.mts +12 -38
- package/dist/index.d.ts +15 -0
- package/dist/index.js +183 -375
- package/dist/index.mjs +128 -146
- package/index.ts +139 -167
- package/package.json +7 -6
package/PERF.md
CHANGED
|
@@ -1,153 +1,43 @@
|
|
|
1
1
|
# Performance
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Benchmarks run with `npm run bench` on Node.js v20.18.1, Apple M2 Pro.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
## Flat Data (`bench.spec.ts`)
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
Mixed strings, flat objects (2 keys), and 2-element arrays with ~10-15% duplicate rate.
|
|
8
8
|
|
|
9
|
-
|
|
9
|
+
| Items | MapSet `add()` | Native Set | Overhead |
|
|
10
|
+
| ------: | -------------: | ---------: | -------: |
|
|
11
|
+
| 400 | 1.23 ms | 0.04 ms | ~31x |
|
|
12
|
+
| 1,000 | 1.46 ms | 0.18 ms | ~8x |
|
|
13
|
+
| 20,000 | 13.09 ms | 1.46 ms | ~9x |
|
|
14
|
+
| 100,000 | 54.08 ms | 6.44 ms | ~8x |
|
|
10
15
|
|
|
11
|
-
|
|
16
|
+
## Nested Data (`bench-nested.spec.ts`)
|
|
12
17
|
|
|
13
|
-
|
|
18
|
+
Deeply nested objects (3-4 levels), nested arrays with objects, and mixed structures.
|
|
14
19
|
|
|
15
|
-
|
|
20
|
+
### `add()` - insert all items
|
|
16
21
|
|
|
17
|
-
|
|
|
18
|
-
|
|
|
19
|
-
|
|
|
20
|
-
|
|
|
21
|
-
|
|
|
22
|
+
| Items | MapSet | Native Set | Overhead |
|
|
23
|
+
| ------: | -------: | ---------: | -------: |
|
|
24
|
+
| 400 | 1.80 ms | 0.03 ms | ~60x |
|
|
25
|
+
| 1,000 | 3.01 ms | 0.05 ms | ~60x |
|
|
26
|
+
| 20,000 | 14.23 ms | 0.89 ms | ~16x |
|
|
27
|
+
| 100,000 | 80.60 ms | 3.80 ms | ~21x |
|
|
22
28
|
|
|
23
|
-
|
|
29
|
+
### `has()` - query all items (50% hits, 50% misses)
|
|
24
30
|
|
|
25
|
-
|
|
|
26
|
-
|
|
|
27
|
-
|
|
|
28
|
-
|
|
|
29
|
-
|
|
|
31
|
+
| Items | `has()` time | Queries | Hits | per query |
|
|
32
|
+
| ------: | -----------: | ------: | -----: | --------: |
|
|
33
|
+
| 400 | 0.96 ms | 457 | 228 | ~2.1 us |
|
|
34
|
+
| 1,000 | 0.95 ms | 1,144 | 572 | ~0.8 us |
|
|
35
|
+
| 20,000 | 16.86 ms | 22,892 | 11,446 | ~0.7 us |
|
|
36
|
+
| 100,000 | 73.89 ms | 114,458 | 57,229 | ~0.6 us |
|
|
30
37
|
|
|
31
|
-
|
|
38
|
+
## Notes
|
|
32
39
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
| Count | Unique | Bloom |
|
|
38
|
-
| ----- | -------- | ------- |
|
|
39
|
-
| 400 | 10.53 | 9.65 |
|
|
40
|
-
| 1000 | 48.60 | 10.39 |
|
|
41
|
-
| 20000 | 19242.54 | 2490.88 |
|
|
42
|
-
|
|
43
|
-
Trial 2
|
|
44
|
-
|
|
45
|
-
| Count | Unique | Bloom |
|
|
46
|
-
| ----- | -------- | ------- |
|
|
47
|
-
| 400 | 9.79 | 9.02 |
|
|
48
|
-
| 1000 | 48.85 | 10.49 |
|
|
49
|
-
| 20000 | 19255.17 | 2489.50 |
|
|
50
|
-
|
|
51
|
-
Performance is fairly stable and predictable with small datasets of shallow objects, regardless of hashCount.
|
|
52
|
-
|
|
53
|
-
## Nested Data
|
|
54
|
-
|
|
55
|
-
Plain objects and Arrays nested 1 or 2 levels deep, with 10-20% duplicates.
|
|
56
|
-
|
|
57
|
-
### BloomSet hashCount: 1
|
|
58
|
-
|
|
59
|
-
Trial 1
|
|
60
|
-
|
|
61
|
-
| Count | Unique | Bloom |
|
|
62
|
-
| ----- | -------- | ------- |
|
|
63
|
-
| 400 | 26.32 | 12.78 |
|
|
64
|
-
| 1000 | 91.30 | 16.86 |
|
|
65
|
-
| 20000 | 37671.41 | 5116.11 |
|
|
66
|
-
|
|
67
|
-
Trial 2
|
|
68
|
-
|
|
69
|
-
| Count | Unique | Bloom |
|
|
70
|
-
| ----- | -------- | ------- |
|
|
71
|
-
| 400 | 21.15 | 12.65 |
|
|
72
|
-
| 1000 | 115.2 | 16.33 |
|
|
73
|
-
| 20000 | 37169.66 | 5031.50 |
|
|
74
|
-
|
|
75
|
-
UniqueSet starts to suffer on > 1000 elements. It gets exponentially worse, especially with nested objects. Whereas BloomSet's optimizations keep it in the realm of usable at 20k elements. Subjectively I feel I'm willing to spend 5 seconds processing 20k elements if I need guaranteed uniqueness-by-value (but not 30 seconds).
|
|
76
|
-
|
|
77
|
-
### BloomSet hashCount: 7
|
|
78
|
-
|
|
79
|
-
Trial 1
|
|
80
|
-
|
|
81
|
-
| Count | Unique | Bloom |
|
|
82
|
-
| ----- | -------- | ------- |
|
|
83
|
-
| 400 | 20.58 | 13.57 |
|
|
84
|
-
| 1000 | 91.23 | 16.81 |
|
|
85
|
-
| 20000 | 37639.03 | 5151.90 |
|
|
86
|
-
|
|
87
|
-
Running 7 hashes doesn't add a lot of clock time for BloomSet, even with nested objects. Rather than recalculating the hash over the entire serialized value, BloomSet does some bit-mixing to distribute the value's representation across the bit array.
|
|
88
|
-
|
|
89
|
-
Trial 2
|
|
90
|
-
|
|
91
|
-
| Count | Unique | Bloom |
|
|
92
|
-
| ----- | -------- | ------- |
|
|
93
|
-
| 400 | 22.86 | 13.48 |
|
|
94
|
-
| 1000 | 94.64 | 17.80 |
|
|
95
|
-
| 20000 | 37673.08 | 5276.09 |
|
|
96
|
-
|
|
97
|
-
## Large (relatively)
|
|
98
|
-
|
|
99
|
-
Still using the nested dataset. Very roughly 15% duplicates, distributed in a contrived manner using modulo.
|
|
100
|
-
|
|
101
|
-
### hashCount: 7, size: 6553577
|
|
102
|
-
|
|
103
|
-
Trial 1
|
|
104
|
-
|
|
105
|
-
| Count | Unique | Bloom |
|
|
106
|
-
| ------ | ---------- | ---------- |
|
|
107
|
-
| 100000 | 982,727.79 | 142,716.46 |
|
|
108
|
-
|
|
109
|
-
```txt
|
|
110
|
-
UniqueSet size: 100000 Expected size: 100000
|
|
111
|
-
BloomSet size: 100000 Expected size: 100000
|
|
112
|
-
Native Set size: 114458 Expected size: 114458
|
|
113
|
-
```
|
|
114
|
-
|
|
115
|
-
With a (relatively) large dataset, UniqueSet is slow enough to make me not want to test it again. It might be possible to squeeze extra performance from BloomSet by tweaking the config options.
|
|
116
|
-
|
|
117
|
-
Trial 2
|
|
118
|
-
|
|
119
|
-
| Count | Unique | Bloom |
|
|
120
|
-
| ------ | ---------- | ---------- |
|
|
121
|
-
| 100000 | n/a | 149600.27 |
|
|
122
|
-
|
|
123
|
-
### hashCount: 1, size: 6553577
|
|
124
|
-
|
|
125
|
-
Trial 1
|
|
126
|
-
|
|
127
|
-
| Count | Unique | Bloom |
|
|
128
|
-
| ------ | ---------- | ---------- |
|
|
129
|
-
| 100000 | n/a | 135919.53 |
|
|
130
|
-
|
|
131
|
-
Trial 2
|
|
132
|
-
|
|
133
|
-
| Count | Unique | Bloom |
|
|
134
|
-
| ------ | ---------- | ---------- |
|
|
135
|
-
| 100000 | n/a | 135913.43 |
|
|
136
|
-
|
|
137
|
-
Reducing the hashCount predictably improves performance by 5-10%. Collisions fallback to `fast-deep-equal`, so we can tolerate false positives unless performance degrades.
|
|
138
|
-
|
|
139
|
-
#### hashCount: 1, size: 28755000
|
|
140
|
-
|
|
141
|
-
Trial 1
|
|
142
|
-
|
|
143
|
-
| Count | Unique | Bloom |
|
|
144
|
-
| ------ | ---------- | ---------- |
|
|
145
|
-
| 100000 | n/a | 128660.39 |
|
|
146
|
-
|
|
147
|
-
Trial 2
|
|
148
|
-
|
|
149
|
-
| Count | Unique | Bloom |
|
|
150
|
-
| ------ | ---------- | ---------- |
|
|
151
|
-
| 100000 | n/a | 127663.77 |
|
|
152
|
-
|
|
153
|
-
Using a larger bit array requires more memory: ~3.5Mb in this case, still extremely memory-efficient for what it's doing. It seems to yield something like 5% clock time improvement over a smaller array, possibly due to decreased false positives, leading to less invocations of `fast-deep-equal` for deep comparison.
|
|
40
|
+
- Native `Set` uses reference equality and cannot deduplicate objects/arrays by value. The overhead shown is the cost of deep value comparison.
|
|
41
|
+
- MapSet uses a streaming 32-bit FNV-1a structural hash with `fast-deep-equal` fallback for hash collisions.
|
|
42
|
+
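- At 20,000 items, only ~0.05 hash collisions are expected for a well-distributed 32-bit hash (birthday bound: n²/2³³ colliding pairs); roughly one collision is expected at 100,000 items. Any collisions are handled correctly with no impact on results.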
|
|
43
|
+
- The `has()` cost per query decreases at larger sizes due to V8 JIT warmup.
|
package/README.md
CHANGED
|
@@ -1,86 +1,71 @@
|
|
|
1
1
|
# @sepiariver/unique-set
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Uses a streaming structural hash to optimize deep equality checks in a `Set`-like class. Falls back to a deep comparison with [fast-deep-equal](https://www.npmjs.com/package/fast-deep-equal) when hash collisions occur.
|
|
4
4
|
|
|
5
5
|
Supports ESM and CommonJS. Thanks [@sakgoyal](https://github.com/sakgoyal) for contributing to and instigating ESM support.
|
|
6
6
|
|
|
7
7
|
```js
|
|
8
|
-
import {
|
|
8
|
+
import { MapSet, UniqueSet } from '@sepiariver/unique-set';
|
|
9
9
|
```
|
|
10
10
|
|
|
11
11
|
```js
|
|
12
|
-
const {
|
|
12
|
+
const { MapSet, UniqueSet } = require('@sepiariver/unique-set');
|
|
13
13
|
```
|
|
14
14
|
|
|
15
|
-
WARNING:
|
|
15
|
+
WARNING: Version 3 includes breaking changes. Older versions are deprecated.
|
|
16
16
|
|
|
17
|
-
|
|
17
|
+
Configuration options from previous versions are no longer supported. Usage is identical to the native `Set` class.
|
|
18
18
|
|
|
19
|
-
|
|
19
|
+
IMPORTANT: `MapSet` and `UniqueSet` are the same class (`UniqueSet` is an alias). The `delete` method uses deep equality, so `delete({a: 1})` will remove a previously added `{a: 1}` even if it's a different reference.
|
|
20
20
|
|
|
21
|
-
|
|
21
|
+
## API
|
|
22
22
|
|
|
23
|
-
|
|
23
|
+
### Constructor
|
|
24
24
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
### Constructor Signature
|
|
30
|
-
|
|
31
|
-
`new BloomSet(iterable = [], options = { size, hashCount });`
|
|
32
|
-
|
|
33
|
-
### Options
|
|
34
|
-
|
|
35
|
-
The options object allows you to customize the behavior and performance of the BloomSet. The following properties can be configured:
|
|
36
|
-
|
|
37
|
-
#### 1. size (number)
|
|
38
|
-
|
|
39
|
-
Description: Specifies the size of the bit array used internally by the Bloom filter. This directly impacts the memory usage and false positive probability.
|
|
40
|
-
|
|
41
|
-
Default: 6,553,577 (a prime number using roughly 800 KB of memory).
|
|
42
|
-
|
|
43
|
-
Recommendations:
|
|
44
|
-
|
|
45
|
-
For datasets with ~100,000 elements, this default size provides excellent performance (compared against `UniqueSet`) with minimal (< 1) false positives.
|
|
46
|
-
|
|
47
|
-
Larger datasets may require increasing the size for lower false positive rates. Remember though, false positives are mitigated by a fallback to `fast-deep-equal`, so you may be able to squeeze more performance from a higher tolerance for false positives, depending on your dataset.
|
|
25
|
+
```js
|
|
26
|
+
new MapSet(iterable?)
|
|
27
|
+
new UniqueSet(iterable?)
|
|
28
|
+
```
|
|
48
29
|
|
|
49
|
-
|
|
30
|
+
Accepts any iterable (array, Set, generator, etc.). Duplicates by value are discarded during construction.
|
|
50
31
|
|
|
51
|
-
|
|
32
|
+
### Methods
|
|
52
33
|
|
|
53
|
-
|
|
34
|
+
| Method | Description |
|
|
35
|
+
|---|---|
|
|
36
|
+
| `add(value)` | Adds `value` if no deeply-equal value exists. Returns `this`. |
|
|
37
|
+
| `has(value)` | Returns `true` if a deeply-equal value is in the set. |
|
|
38
|
+
| `delete(value)` | Removes the first deeply-equal value. Returns `true` if found. |
|
|
39
|
+
| `clear()` | Removes all values. |
|
|
40
|
+
| `forEach(cb, thisArg?)` | Calls `cb(value, value, set)` for each value. |
|
|
41
|
+
| `values()` | Returns an iterator over all values. |
|
|
42
|
+
| `[Symbol.iterator]()` | Makes the set iterable (e.g., `for...of`, spread). |
|
|
43
|
+
| `size` | Getter property (not a method): the number of unique values in the set. |
|
|
54
44
|
|
|
55
45
|
### Examples
|
|
56
46
|
|
|
57
|
-
Default Configuration:
|
|
58
|
-
|
|
59
47
|
```js
|
|
60
|
-
const
|
|
61
|
-
bloomSet.add("example");
|
|
62
|
-
console.log(bloomSet.has("example")); // true
|
|
63
|
-
```
|
|
48
|
+
const set = new UniqueSet();
|
|
64
49
|
|
|
65
|
-
|
|
50
|
+
set.add({ a: 1, b: 2 });
|
|
51
|
+
set.add({ b: 2, a: 1 }); // same value, different key order: not added
|
|
52
|
+
set.size; // 1
|
|
66
53
|
|
|
67
|
-
|
|
54
|
+
set.has({ a: 1, b: 2 }); // true (deep equality, not reference)
|
|
68
55
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
56
|
+
set.add([1, [2, 3]]);
|
|
57
|
+
set.add([1, [2, 3]]); // duplicate nested array: not added
|
|
58
|
+
set.size; // 2
|
|
59
|
+
|
|
60
|
+
set.delete({ a: 1, b: 2 }); // true
|
|
61
|
+
set.size; // 1
|
|
73
62
|
```
|
|
74
63
|
|
|
75
64
|
### Considerations
|
|
76
65
|
|
|
77
|
-
- Memory
|
|
78
|
-
-
|
|
79
|
-
|
|
80
|
-
#### Further Tuning
|
|
81
|
-
|
|
82
|
-
- Use a larger size for datasets exceeding 100,000 elements.
|
|
83
|
-
- Reduce hashCount if performance is critical and your dataset contains very few duplicates.
|
|
66
|
+
- **Memory**: Each unique value is stored once, bucketed by a 32-bit structural hash. Overhead is minimal: one `Map` entry plus a small array per hash bucket, with >99% of buckets containing exactly one item at typical sizes.
|
|
67
|
+
- **Collisions**: With a well-distributed 32-bit hash, the birthday bound (n²/2³³ colliding pairs) predicts roughly 0.05 collisions at 20,000 items and about 1 at 100,000 items. Collisions are handled correctly via `fast-deep-equal`. They add a small cost but never affect correctness.
|
|
68
|
+
- **Equality semantics**: Object key order is ignored. Array element order matters. `NaN` is treated as equal to `NaN` (unlike JavaScript's `===`). `0` and `-0` are treated as equal. Functions and symbols are compared by reference.
|
|
84
69
|
|
|
85
70
|
## Installation
|
|
86
71
|
|
|
@@ -91,7 +76,7 @@ npm install @sepiariver/unique-set
|
|
|
91
76
|
## Usage
|
|
92
77
|
|
|
93
78
|
```js
|
|
94
|
-
|
|
79
|
+
import { MapSet, UniqueSet } from "./dist/index.mjs";
|
|
95
80
|
|
|
96
81
|
const data = [
|
|
97
82
|
"string",
|
|
@@ -114,21 +99,24 @@ const data = [
|
|
|
114
99
|
[1, 2, 3],
|
|
115
100
|
];
|
|
116
101
|
|
|
102
|
+
const norm = new Set();
|
|
117
103
|
const unique1 = new UniqueSet();
|
|
118
104
|
data.forEach((el) => {
|
|
119
105
|
unique1.add(el);
|
|
106
|
+
norm.add(el);
|
|
120
107
|
});
|
|
121
108
|
const unique2 = new UniqueSet(data);
|
|
122
109
|
console.log(unique1.size); // 6 instead of 8 with Set
|
|
123
110
|
console.log(unique2.size); // 6
|
|
111
|
+
console.log(norm.size); // 8 with Set
|
|
124
112
|
|
|
125
|
-
const
|
|
113
|
+
const map1 = new MapSet();
|
|
126
114
|
data.forEach((el) => {
|
|
127
|
-
|
|
115
|
+
map1.add(el);
|
|
128
116
|
});
|
|
129
|
-
const
|
|
130
|
-
console.log(
|
|
131
|
-
console.log(
|
|
117
|
+
const map2 = new MapSet(data);
|
|
118
|
+
console.log(map1.size); // 6 instead of 8 with Set
|
|
119
|
+
console.log(map2.size); // 6
|
|
132
120
|
```
|
|
133
121
|
|
|
134
122
|
## Testing
|
package/dist/index.d.mts
CHANGED
|
@@ -1,41 +1,15 @@
|
|
|
1
|
-
|
|
2
|
-
declare class
|
|
3
|
-
/*** @throws TypeError If the input is not iterable. */
|
|
4
|
-
constructor(iterable?: Iterable<T>);
|
|
5
|
-
/**
|
|
6
|
-
* Determines whether an object is in the UniqueSet using deep equality.
|
|
7
|
-
* @param o The object to check for presence in the UniqueSet.
|
|
8
|
-
* @returns `true` if the object is found, `false` otherwise.
|
|
9
|
-
*/
|
|
10
|
-
has(o: T): boolean;
|
|
11
|
-
/**
|
|
12
|
-
* Adds a new object to the UniqueSet if it is not already present.
|
|
13
|
-
* @param o The object to add to the UniqueSet.
|
|
14
|
-
* @returns The `UniqueSet` instance, allowing for chaining.
|
|
15
|
-
*/
|
|
16
|
-
add(o: T): this;
|
|
17
|
-
}
|
|
18
|
-
/** A `Set` extension that uses a Bloom filter for fast existence checks combined with deep equality for accuracy. */
|
|
19
|
-
declare class BloomSet<T> extends Set<T> {
|
|
1
|
+
declare const structuralHash: (value: unknown) => number;
|
|
2
|
+
declare class MapSet<T> {
|
|
20
3
|
#private;
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
size?: number;
|
|
31
|
-
hashCount?: number;
|
|
32
|
-
});
|
|
33
|
-
/** Determines existence of an object in the BloomSet using the Bloom filter and deep equality */
|
|
34
|
-
has(o: T): boolean;
|
|
35
|
-
/** Adds a new object to the BloomSet if it is not already present.
|
|
36
|
-
* @returns The `BloomSet` instance, allowing for chaining.
|
|
37
|
-
*/
|
|
38
|
-
add(o: T): this;
|
|
4
|
+
constructor(iterable?: Iterable<T>);
|
|
5
|
+
add(value: T): this;
|
|
6
|
+
has(value: T): boolean;
|
|
7
|
+
delete(value: T): boolean;
|
|
8
|
+
get size(): number;
|
|
9
|
+
clear(): void;
|
|
10
|
+
forEach(callback: (value: T, valueAgain: T, set: this) => void, thisArg?: any): void;
|
|
11
|
+
values(): IterableIterator<T>;
|
|
12
|
+
[Symbol.iterator](): IterableIterator<T>;
|
|
39
13
|
}
|
|
40
14
|
|
|
41
|
-
export {
|
|
15
|
+
export { MapSet, MapSet as UniqueSet, structuralHash };
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
declare const structuralHash: (value: unknown) => number;
|
|
2
|
+
declare class MapSet<T> {
|
|
3
|
+
#private;
|
|
4
|
+
constructor(iterable?: Iterable<T>);
|
|
5
|
+
add(value: T): this;
|
|
6
|
+
has(value: T): boolean;
|
|
7
|
+
delete(value: T): boolean;
|
|
8
|
+
get size(): number;
|
|
9
|
+
clear(): void;
|
|
10
|
+
forEach(callback: (value: T, valueAgain: T, set: this) => void, thisArg?: any): void;
|
|
11
|
+
values(): IterableIterator<T>;
|
|
12
|
+
[Symbol.iterator](): IterableIterator<T>;
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
export { MapSet, MapSet as UniqueSet, structuralHash };
|