@sepiariver/unique-set 1.1.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +85 -7
- package/dist/index.js +312 -13
- package/index.d.ts +60 -0
- package/package.json +11 -5
- package/src/index.js +166 -5
package/README.md
CHANGED
|
@@ -1,10 +1,76 @@
|
|
|
1
1
|
# @sepiariver/unique-set
|
|
2
2
|
|
|
3
|
-
Extends the
|
|
3
|
+
Extends the native `Set` class to deeply compare using [fast-deep-equal](https://www.npmjs.com/package/fast-deep-equal), with optional Bloom filter optimization.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
Supports ESM and CommonJS.
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
WARNING: This version exports two named classes instead of a single default class, which breaks backward compatibility with version 1.
|
|
8
|
+
|
|
9
|
+
The extended methods iterate through the elements of the `UniqueSet` until equality is found. If no elements match, the entire `UniqueSet` would have been iterated. However fast `fast-deep-equal` is, calling it in a loop like this makes performance many, many times worse than the native `Set`. For datasets greater than a thousand elements, there is probably a better way to achieve what you're trying to do. Otherwise, `UniqueSet` is convenient.
|
|
10
|
+
|
|
11
|
+
UPDATE: Version 2 ships with `BloomSet`, in which equality checks are optimized with a Bloom filter. This class is useful for larger datasets, performing about 3-10 times faster than `UniqueSet` for datasets greater than 1000 elements. With fewer than a few hundred (~400) elements, `UniqueSet` is faster. `BloomSet`'s probabilistic false positives are covered by a fallback to `fast-deep-equal`. `BloomSet` is still orders of magnitude slower than the native `Set`, but if deep equality is required, this is a decent option.
|
|
12
|
+
|
|
13
|
+
Experiment with configurations to find the best performance for your use case.
|
|
14
|
+
|
|
15
|
+
NOTE: The `delete` method is unmodified. In the case of duplicate objects that are equivalent but have different references, the results of `delete` operations may be unexpected.
|
|
16
|
+
|
|
17
|
+
## Config Options
|
|
18
|
+
|
|
19
|
+
### Constructor Signature
|
|
20
|
+
|
|
21
|
+
`new BloomSet(iterable = [], options = { size, hashCount });`
|
|
22
|
+
|
|
23
|
+
### Options
|
|
24
|
+
|
|
25
|
+
The options object allows you to customize the behavior and performance of the BloomSet. The following properties can be configured:
|
|
26
|
+
|
|
27
|
+
#### 1. size (number)
|
|
28
|
+
|
|
29
|
+
Description: Specifies the size of the bit array used internally by the Bloom filter. This directly impacts the memory usage and false positive probability.
|
|
30
|
+
|
|
31
|
+
Default: 6,553,577 (a prime number using roughly 800 KB of memory).
|
|
32
|
+
|
|
33
|
+
Recommendations:
|
|
34
|
+
|
|
35
|
+
For datasets with ~100,000 elements, this default size provides excellent performance (compared against `UniqueSet`) with minimal (< 1) false positives.
|
|
36
|
+
|
|
37
|
+
Larger datasets may require increasing the size for lower false positive rates. Remember though, false positives are mitigated by a fallback to `fast-deep-equal`, so you may be able to squeeze more performance from a higher tolerance for false positives, depending on your dataset.
|
|
38
|
+
|
|
39
|
+
#### 2. hashCount (number)
|
|
40
|
+
|
|
41
|
+
Description: Specifies the number of hash functions used by the Bloom filter. This impacts both the false positive probability and the computational cost of adding/checking elements.
|
|
42
|
+
|
|
43
|
+
Default: 7
|
|
44
|
+
|
|
45
|
+
### Examples
|
|
46
|
+
|
|
47
|
+
Default Configuration:
|
|
48
|
+
|
|
49
|
+
```js
|
|
50
|
+
const bloomSet = new BloomSet();
|
|
51
|
+
bloomSet.add("example");
|
|
52
|
+
console.log(bloomSet.has("example")); // true
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Custom Configuration for Larger Datasets:
|
|
56
|
+
|
|
57
|
+
A 28,755,000-bit array uses roughly 3.5 MB of memory, and this configuration is robust for datasets of around 1M elements. In practice, using BloomSet with that many elements is of limited use, because of the performance cost of the deep equality checks.
|
|
58
|
+
|
|
59
|
+
```js
|
|
60
|
+
const bloomSet = new BloomSet([], { size: 28755000, hashCount: 20 });
|
|
61
|
+
bloomSet.add("custom");
|
|
62
|
+
console.log(bloomSet.has("custom")); // true
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Considerations
|
|
66
|
+
|
|
67
|
+
- Memory Usage: The bit array uses size / 8 bytes of memory. Even at 800 KB, initializing 1250 BloomSets in the same scope would use a gigabyte of memory.
|
|
68
|
+
- False Positive Rate: The probability of a false positive is influenced by size, hashCount, and the number of elements. Adjust these values to balance performance and accuracy for your dataset.
|
|
69
|
+
|
|
70
|
+
#### Further Tuning
|
|
71
|
+
|
|
72
|
+
- Use a larger size for datasets exceeding 100,000 elements.
|
|
73
|
+
- Reduce hashCount if performance is critical and your dataset contains very few duplicates.
|
|
8
74
|
|
|
9
75
|
## Installation
|
|
10
76
|
|
|
@@ -15,7 +81,7 @@ npm install @sepiariver/unique-set
|
|
|
15
81
|
## Usage
|
|
16
82
|
|
|
17
83
|
```js
|
|
18
|
-
const UniqueSet = require('@sepiariver/unique-set');
|
|
84
|
+
const { BloomSet, UniqueSet } = require('@sepiariver/unique-set');
|
|
19
85
|
|
|
20
86
|
const data = [
|
|
21
87
|
"string",
|
|
@@ -38,13 +104,21 @@ const data = [
|
|
|
38
104
|
[1, 2, 3],
|
|
39
105
|
];
|
|
40
106
|
|
|
41
|
-
|
|
107
|
+
const unique1 = new UniqueSet();
|
|
42
108
|
data.forEach((el) => {
|
|
43
109
|
unique1.add(el);
|
|
44
110
|
});
|
|
45
|
-
|
|
111
|
+
const unique2 = new UniqueSet(data);
|
|
46
112
|
console.log(unique1.size); // 6 instead of 8 with Set
|
|
47
113
|
console.log(unique2.size); // 6
|
|
114
|
+
|
|
115
|
+
const bloom1 = new BloomSet();
|
|
116
|
+
data.forEach((el) => {
|
|
117
|
+
bloom1.add(el);
|
|
118
|
+
});
|
|
119
|
+
const bloom2 = new BloomSet(data);
|
|
120
|
+
console.log(bloom1.size); // 6 instead of 8 with Set
|
|
121
|
+
console.log(bloom2.size); // 6
|
|
48
122
|
```
|
|
49
123
|
|
|
50
124
|
## Testing
|
|
@@ -55,4 +129,8 @@ console.log(unique2.size); // 6
|
|
|
55
129
|
|
|
56
130
|
## Contributing
|
|
57
131
|
|
|
58
|
-
Submit pull requests to https://github.com/sepiariver/unique-set/pulls
|
|
132
|
+
Submit pull requests to <https://github.com/sepiariver/unique-set/pulls>
|
|
133
|
+
|
|
134
|
+
## Issues
|
|
135
|
+
|
|
136
|
+
Issue reporting is encouraged: <https://github.com/sepiariver/unique-set/issues>
|
package/dist/index.js
CHANGED
|
@@ -1,12 +1,27 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Object.defineProperty(exports, "__esModule", {
|
|
4
|
+
value: true
|
|
5
|
+
});
|
|
6
|
+
exports.UniqueSet = exports.BloomSet = void 0;
|
|
4
7
|
|
|
5
8
|
var _fastDeepEqual = _interopRequireDefault(require("fast-deep-equal"));
|
|
6
9
|
|
|
7
10
|
function _interopRequireDefault(obj) { return obj && obj.__esModule ? obj : { "default": obj }; }
|
|
8
11
|
|
|
9
|
-
function
|
|
12
|
+
function _slicedToArray(arr, i) { return _arrayWithHoles(arr) || _iterableToArrayLimit(arr, i) || _unsupportedIterableToArray(arr, i) || _nonIterableRest(); }
|
|
13
|
+
|
|
14
|
+
function _nonIterableRest() { throw new TypeError("Invalid attempt to destructure non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method."); }
|
|
15
|
+
|
|
16
|
+
function _iterableToArrayLimit(arr, i) { var _i = arr == null ? null : typeof Symbol !== "undefined" && arr[Symbol.iterator] || arr["@@iterator"]; if (_i == null) return; var _arr = []; var _n = true; var _d = false; var _s, _e; try { for (_i = _i.call(arr); !(_n = (_s = _i.next()).done); _n = true) { _arr.push(_s.value); if (i && _arr.length === i) break; } } catch (err) { _d = true; _e = err; } finally { try { if (!_n && _i["return"] != null) _i["return"](); } finally { if (_d) throw _e; } } return _arr; }
|
|
17
|
+
|
|
18
|
+
function _arrayWithHoles(arr) { if (Array.isArray(arr)) return arr; }
|
|
19
|
+
|
|
20
|
+
function _typeof(obj) { "@babel/helpers - typeof"; return _typeof = "function" == typeof Symbol && "symbol" == typeof Symbol.iterator ? function (obj) { return typeof obj; } : function (obj) { return obj && "function" == typeof Symbol && obj.constructor === Symbol && obj !== Symbol.prototype ? "symbol" : typeof obj; }, _typeof(obj); }
|
|
21
|
+
|
|
22
|
+
function _readOnlyError(name) { throw new TypeError("\"" + name + "\" is read-only"); }
|
|
23
|
+
|
|
24
|
+
function _createForOfIteratorHelper(o, allowArrayLike) { var it = typeof Symbol !== "undefined" && o[Symbol.iterator] || o["@@iterator"]; if (!it) { if (Array.isArray(o) || (it = _unsupportedIterableToArray(o)) || allowArrayLike && o && typeof o.length === "number") { if (it) o = it; var i = 0; var F = function F() {}; return { s: F, n: function n() { if (i >= o.length) return { done: true }; return { done: false, value: o[i++] }; }, e: function e(_e2) { throw _e2; }, f: F }; } throw new TypeError("Invalid attempt to iterate non-iterable instance.\nIn order to be iterable, non-array objects must have a [Symbol.iterator]() method."); } var normalCompletion = true, didErr = false, err; return { s: function s() { it = it.call(o); }, n: function n() { var step = it.next(); normalCompletion = step.done; return step; }, e: function e(_e3) { didErr = true; err = _e3; }, f: function f() { try { if (!normalCompletion && it["return"] != null) it["return"](); } finally { if (didErr) throw err; } } }; }
|
|
10
25
|
|
|
11
26
|
function _unsupportedIterableToArray(o, minLen) { if (!o) return; if (typeof o === "string") return _arrayLikeToArray(o, minLen); var n = Object.prototype.toString.call(o).slice(8, -1); if (n === "Object" && o.constructor) n = o.constructor.name; if (n === "Map" || n === "Set") return Array.from(o); if (n === "Arguments" || /^(?:Ui|I)nt(?:8|16|32)(?:Clamped)?Array$/.test(n)) return _arrayLikeToArray(o, minLen); }
|
|
12
27
|
|
|
@@ -18,6 +33,10 @@ function _defineProperties(target, props) { for (var i = 0; i < props.length; i+
|
|
|
18
33
|
|
|
19
34
|
function _createClass(Constructor, protoProps, staticProps) { if (protoProps) _defineProperties(Constructor.prototype, protoProps); if (staticProps) _defineProperties(Constructor, staticProps); Object.defineProperty(Constructor, "prototype", { writable: false }); return Constructor; }
|
|
20
35
|
|
|
36
|
+
function _get() { if (typeof Reflect !== "undefined" && Reflect.get) { _get = Reflect.get; } else { _get = function _get(target, property, receiver) { var base = _superPropBase(target, property); if (!base) return; var desc = Object.getOwnPropertyDescriptor(base, property); if (desc.get) { return desc.get.call(arguments.length < 3 ? target : receiver); } return desc.value; }; } return _get.apply(this, arguments); }
|
|
37
|
+
|
|
38
|
+
function _superPropBase(object, property) { while (!Object.prototype.hasOwnProperty.call(object, property)) { object = _getPrototypeOf(object); if (object === null) break; } return object; }
|
|
39
|
+
|
|
21
40
|
function _inherits(subClass, superClass) { if (typeof superClass !== "function" && superClass !== null) { throw new TypeError("Super expression must either be null or a function"); } subClass.prototype = Object.create(superClass && superClass.prototype, { constructor: { value: subClass, writable: true, configurable: true } }); Object.defineProperty(subClass, "prototype", { writable: false }); if (superClass) _setPrototypeOf(subClass, superClass); }
|
|
22
41
|
|
|
23
42
|
function _createSuper(Derived) { var hasNativeReflectConstruct = _isNativeReflectConstruct(); return function _createSuperInternal() { var Super = _getPrototypeOf(Derived), result; if (hasNativeReflectConstruct) { var NewTarget = _getPrototypeOf(this).constructor; result = Reflect.construct(Super, arguments, NewTarget); } else { result = Super.apply(this, arguments); } return _possibleConstructorReturn(this, result); }; }
|
|
@@ -44,33 +63,54 @@ var UniqueSet = /*#__PURE__*/function (_Set) {
|
|
|
44
63
|
var _super = _createSuper(UniqueSet);
|
|
45
64
|
|
|
46
65
|
function UniqueSet() {
|
|
66
|
+
var _this;
|
|
67
|
+
|
|
68
|
+
var iterable = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : [];
|
|
69
|
+
|
|
47
70
|
_classCallCheck(this, UniqueSet);
|
|
48
71
|
|
|
49
|
-
|
|
50
|
-
|
|
72
|
+
if (!Array.isArray(iterable) && !iterable[Symbol.iterator]) {
|
|
73
|
+
throw new TypeError("UniqueSet requires an iterable");
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
_this = _super.call(this);
|
|
77
|
+
|
|
78
|
+
var _iterator = _createForOfIteratorHelper(iterable),
|
|
79
|
+
_step;
|
|
80
|
+
|
|
81
|
+
try {
|
|
82
|
+
for (_iterator.s(); !(_step = _iterator.n()).done;) {
|
|
83
|
+
var item = _step.value;
|
|
84
|
+
|
|
85
|
+
_this.add(item);
|
|
86
|
+
}
|
|
87
|
+
} catch (err) {
|
|
88
|
+
_iterator.e(err);
|
|
89
|
+
} finally {
|
|
90
|
+
_iterator.f();
|
|
51
91
|
}
|
|
52
92
|
|
|
53
|
-
return
|
|
93
|
+
return _this;
|
|
54
94
|
}
|
|
55
95
|
|
|
56
96
|
_createClass(UniqueSet, [{
|
|
57
97
|
key: "has",
|
|
58
98
|
value: function has(o) {
|
|
59
|
-
var
|
|
60
|
-
|
|
99
|
+
var _iterator2 = _createForOfIteratorHelper(this),
|
|
100
|
+
_step2;
|
|
61
101
|
|
|
62
102
|
try {
|
|
63
|
-
for (
|
|
64
|
-
var i =
|
|
103
|
+
for (_iterator2.s(); !(_step2 = _iterator2.n()).done;) {
|
|
104
|
+
var i = _step2.value;
|
|
65
105
|
|
|
66
106
|
if ((0, _fastDeepEqual["default"])(o, i)) {
|
|
67
107
|
return true;
|
|
68
108
|
}
|
|
69
109
|
}
|
|
70
110
|
} catch (err) {
|
|
71
|
-
|
|
111
|
+
_iterator2.e(err);
|
|
72
112
|
} finally {
|
|
73
|
-
|
|
113
|
+
_iterator2.f();
|
|
74
114
|
}
|
|
75
115
|
|
|
76
116
|
return false;
|
|
@@ -79,12 +119,271 @@ var UniqueSet = /*#__PURE__*/function (_Set) {
|
|
|
79
119
|
key: "add",
|
|
80
120
|
value: function add(o) {
|
|
81
121
|
if (!this.has(o)) {
|
|
82
|
-
|
|
122
|
+
_get(_getPrototypeOf(UniqueSet.prototype), "add", this).call(this, o);
|
|
83
123
|
}
|
|
124
|
+
|
|
125
|
+
return this;
|
|
84
126
|
}
|
|
85
127
|
}]);
|
|
86
128
|
|
|
87
129
|
return UniqueSet;
|
|
88
130
|
}( /*#__PURE__*/_wrapNativeSuper(Set));
|
|
89
131
|
|
|
90
|
-
|
|
132
|
+
exports.UniqueSet = UniqueSet;
|
|
133
|
+
|
|
134
|
+
var BloomSet = /*#__PURE__*/function (_Set2) {
|
|
135
|
+
_inherits(BloomSet, _Set2);
|
|
136
|
+
|
|
137
|
+
var _super2 = _createSuper(BloomSet);
|
|
138
|
+
|
|
139
|
+
function BloomSet() {
|
|
140
|
+
var _this2;
|
|
141
|
+
|
|
142
|
+
var iterable = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : [];
|
|
143
|
+
var options = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : {};
|
|
144
|
+
|
|
145
|
+
_classCallCheck(this, BloomSet);
|
|
146
|
+
|
|
147
|
+
if (!Array.isArray(iterable) && !iterable[Symbol.iterator]) {
|
|
148
|
+
throw new TypeError("BloomSet requires an iterable");
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
_this2 = _super2.call(this);
|
|
152
|
+
|
|
153
|
+
if (!options || _typeof(options) !== "object") {
|
|
154
|
+
options = {};
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
var _options = options,
|
|
158
|
+
_options$size = _options.size,
|
|
159
|
+
size = _options$size === void 0 ? 6553577 : _options$size,
|
|
160
|
+
_options$hashCount = _options.hashCount,
|
|
161
|
+
hashCount = _options$hashCount === void 0 ? 7 : _options$hashCount;
|
|
162
|
+
|
|
163
|
+
if (typeof size !== "number" || size <= 0) {
|
|
164
|
+
6553577, _readOnlyError("size"); // Targeting < 1 collision per 100,000 elements, ~819 KB memory, needs 7 hashes
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
_this2.aSize = _this2._findNextPrime(size);
|
|
168
|
+
|
|
169
|
+
if (typeof hashCount !== "number" || hashCount <= 0) {
|
|
170
|
+
7, _readOnlyError("hashCount");
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
_this2.hashCount = hashCount;
|
|
174
|
+
_this2.bitArray = new Uint8Array(Math.ceil(size / 8));
|
|
175
|
+
|
|
176
|
+
var _iterator3 = _createForOfIteratorHelper(iterable),
|
|
177
|
+
_step3;
|
|
178
|
+
|
|
179
|
+
try {
|
|
180
|
+
for (_iterator3.s(); !(_step3 = _iterator3.n()).done;) {
|
|
181
|
+
var item = _step3.value;
|
|
182
|
+
|
|
183
|
+
_this2.add(item);
|
|
184
|
+
}
|
|
185
|
+
} catch (err) {
|
|
186
|
+
_iterator3.e(err);
|
|
187
|
+
} finally {
|
|
188
|
+
_iterator3.f();
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
return _this2;
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
_createClass(BloomSet, [{
|
|
195
|
+
key: "_findNextPrime",
|
|
196
|
+
value: function _findNextPrime(num) {
|
|
197
|
+
if (num < 2) return 2;
|
|
198
|
+
if (num % 2 === 0) num++; // Odd numbers only
|
|
199
|
+
|
|
200
|
+
while (!this._isPrime(num)) {
|
|
201
|
+
num += 2; // Odd numbers only
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
return num;
|
|
205
|
+
}
|
|
206
|
+
}, {
|
|
207
|
+
key: "_isPrime",
|
|
208
|
+
value: function _isPrime(num) {
|
|
209
|
+
if (num < 2) return false;
|
|
210
|
+
if (num === 2 || num === 3) return true;
|
|
211
|
+
if (num % 2 === 0 || num % 3 === 0) return false;
|
|
212
|
+
var sqrt = Math.floor(Math.sqrt(num));
|
|
213
|
+
|
|
214
|
+
for (var i = 5; i <= sqrt; i += 6) {
|
|
215
|
+
if (num % i === 0 || num % (i + 2) === 0) return false;
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
return true;
|
|
219
|
+
}
|
|
220
|
+
}, {
|
|
221
|
+
key: "_serialize",
|
|
222
|
+
value: function _serialize(item) {
|
|
223
|
+
if (typeof item === "number" && isNaN(item)) {
|
|
224
|
+
return "NaN";
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
if (item && _typeof(item) === "object") {
|
|
228
|
+
var serialize = this._serialize.bind(this);
|
|
229
|
+
|
|
230
|
+
if (Array.isArray(item)) {
|
|
231
|
+
return "[".concat(item.map(serialize).join(","), "]");
|
|
232
|
+
} else {
|
|
233
|
+
return "{".concat(Object.entries(item).sort(function (_ref, _ref2) {
|
|
234
|
+
var _ref3 = _slicedToArray(_ref, 1),
|
|
235
|
+
a = _ref3[0];
|
|
236
|
+
|
|
237
|
+
var _ref4 = _slicedToArray(_ref2, 1),
|
|
238
|
+
b = _ref4[0];
|
|
239
|
+
|
|
240
|
+
return a.localeCompare(b);
|
|
241
|
+
}).map(function (_ref5) {
|
|
242
|
+
var _ref6 = _slicedToArray(_ref5, 2),
|
|
243
|
+
k = _ref6[0],
|
|
244
|
+
v = _ref6[1];
|
|
245
|
+
|
|
246
|
+
return "".concat(k, ":").concat(serialize(v));
|
|
247
|
+
}).join(","), "}");
|
|
248
|
+
}
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
return String(item);
|
|
252
|
+
}
|
|
253
|
+
}, {
|
|
254
|
+
key: "_hashes",
|
|
255
|
+
value: function _hashes(item) {
|
|
256
|
+
var hashes = [];
|
|
257
|
+
|
|
258
|
+
var str = this._serialize(item);
|
|
259
|
+
|
|
260
|
+
var hash = this._fnv1a(str); // Base hash
|
|
261
|
+
// Bloom into hashCount hash values
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
for (var i = 0; i < this.hashCount; i++) {
|
|
265
|
+
hash %= this.aSize; // Ensure within bounds
|
|
266
|
+
// Track
|
|
267
|
+
|
|
268
|
+
hashes.push(hash); // Modify
|
|
269
|
+
|
|
270
|
+
hash = (hash ^ hash >>> 13) * 0xc2b2ae35;
|
|
271
|
+
hash >>>= 0; // Ensure unsigned 32-bit integer
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
return hashes;
|
|
275
|
+
}
|
|
276
|
+
}, {
|
|
277
|
+
key: "_fnv1a",
|
|
278
|
+
value: function _fnv1a(str) {
|
|
279
|
+
if (typeof str !== "string") {
|
|
280
|
+
str = String(str);
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
var hash = 2166136261; // FNV offset basis for 32-bit
|
|
284
|
+
|
|
285
|
+
for (var i = 0; i < str.length; i++) {
|
|
286
|
+
hash ^= str.charCodeAt(i);
|
|
287
|
+
hash = hash * 16777619 >>> 0; // Multiply by the FNV prime and ensure 32-bit unsigned
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
return hash >>> 0;
|
|
291
|
+
}
|
|
292
|
+
}, {
|
|
293
|
+
key: "_setBits",
|
|
294
|
+
value: function _setBits(hashes) {
|
|
295
|
+
var _iterator4 = _createForOfIteratorHelper(hashes),
|
|
296
|
+
_step4;
|
|
297
|
+
|
|
298
|
+
try {
|
|
299
|
+
for (_iterator4.s(); !(_step4 = _iterator4.n()).done;) {
|
|
300
|
+
var hash = _step4.value;
|
|
301
|
+
var index = Math.floor(hash / 8);
|
|
302
|
+
var bit = hash % 8;
|
|
303
|
+
this.bitArray[index] |= 1 << bit;
|
|
304
|
+
}
|
|
305
|
+
} catch (err) {
|
|
306
|
+
_iterator4.e(err);
|
|
307
|
+
} finally {
|
|
308
|
+
_iterator4.f();
|
|
309
|
+
}
|
|
310
|
+
}
|
|
311
|
+
}, {
|
|
312
|
+
key: "_checkBits",
|
|
313
|
+
value: function _checkBits(hashes) {
|
|
314
|
+
var _iterator5 = _createForOfIteratorHelper(hashes),
|
|
315
|
+
_step5;
|
|
316
|
+
|
|
317
|
+
try {
|
|
318
|
+
for (_iterator5.s(); !(_step5 = _iterator5.n()).done;) {
|
|
319
|
+
var hash = _step5.value;
|
|
320
|
+
var index = Math.floor(hash / 8);
|
|
321
|
+
var bit = hash % 8;
|
|
322
|
+
|
|
323
|
+
if (!(this.bitArray[index] & 1 << bit)) {
|
|
324
|
+
return false;
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
} catch (err) {
|
|
328
|
+
_iterator5.e(err);
|
|
329
|
+
} finally {
|
|
330
|
+
_iterator5.f();
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
return true;
|
|
334
|
+
}
|
|
335
|
+
}, {
|
|
336
|
+
key: "has",
|
|
337
|
+
value: function has(o) {
|
|
338
|
+
var hashes = this._hashes(o);
|
|
339
|
+
|
|
340
|
+
if (!this._checkBits(hashes)) {
|
|
341
|
+
return false; // Definitely not in the set
|
|
342
|
+
} // Fall back to fast-deep-equal for false positives
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
var _iterator6 = _createForOfIteratorHelper(this),
|
|
346
|
+
_step6;
|
|
347
|
+
|
|
348
|
+
try {
|
|
349
|
+
for (_iterator6.s(); !(_step6 = _iterator6.n()).done;) {
|
|
350
|
+
var i = _step6.value;
|
|
351
|
+
|
|
352
|
+
if ((0, _fastDeepEqual["default"])(o, i)) {
|
|
353
|
+
return true;
|
|
354
|
+
}
|
|
355
|
+
}
|
|
356
|
+
} catch (err) {
|
|
357
|
+
_iterator6.e(err);
|
|
358
|
+
} finally {
|
|
359
|
+
_iterator6.f();
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
return false;
|
|
363
|
+
}
|
|
364
|
+
}, {
|
|
365
|
+
key: "add",
|
|
366
|
+
value: function add(o) {
|
|
367
|
+
if (!this.has(o)) {
|
|
368
|
+
var hashes = this._hashes(o);
|
|
369
|
+
|
|
370
|
+
this._setBits(hashes);
|
|
371
|
+
|
|
372
|
+
_get(_getPrototypeOf(BloomSet.prototype), "add", this).call(this, o);
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
return this;
|
|
376
|
+
}
|
|
377
|
+
}]);
|
|
378
|
+
|
|
379
|
+
return BloomSet;
|
|
380
|
+
}( /*#__PURE__*/_wrapNativeSuper(Set));
|
|
381
|
+
|
|
382
|
+
exports.BloomSet = BloomSet;
|
|
383
|
+
|
|
384
|
+
if (typeof module !== "undefined" && module.exports) {
|
|
385
|
+
module.exports = {
|
|
386
|
+
UniqueSet: UniqueSet,
|
|
387
|
+
BloomSet: BloomSet
|
|
388
|
+
};
|
|
389
|
+
}
|
package/index.d.ts
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
// The ambient module name must match the specifier consumers import from,
// i.e. the published package name, otherwise these typings never apply.
declare module "@sepiariver/unique-set" {
  /**
   * A `Set` extension that ensures uniqueness of items using deep equality checks.
   */
  export class UniqueSet<T> extends Set<T> {
    /**
     * Creates a new `UniqueSet` instance.
     * @param iterable Optional: an iterable with which to initialize the UniqueSet.
     * @throws TypeError If the input is not iterable.
     */
    constructor(iterable?: Iterable<T>);

    /**
     * Determines whether an object is in the UniqueSet using deep equality.
     * @param o The object to check for presence in the UniqueSet.
     * @returns `true` if the object is found, `false` otherwise.
     */
    has(o: T): boolean;

    /**
     * Adds a new object to the UniqueSet if it is not already present.
     * @param o The object to add to the UniqueSet.
     * @returns The `UniqueSet` instance, allowing for chaining.
     */
    add(o: T): this;
  }

  /**
   * A `Set` extension that uses a Bloom filter for fast existence checks combined with deep equality for accuracy.
   */
  export class BloomSet<T> extends Set<T> {
    /**
     * Creates a new `BloomSet` instance.
     * @param iterable Optional: an iterable object with which to initialize the BloomSet.
     * @param options Bloom filter configuration options.
     * @param options.size The size of the Bloom filter's bit array. Defaults to 6553577.
     * @param options.hashCount The number of hash functions to use. Defaults to 7.
     * @throws TypeError If the input is not iterable.
     */
    constructor(
      iterable?: Iterable<T>,
      options?: { size?: number; hashCount?: number }
    );

    /**
     * Determines existence of an object in the BloomSet using the Bloom filter and deep equality.
     * @param o The object to check for presence in the BloomSet.
     * @returns `true` if the object is found, `false` otherwise.
     */
    has(o: T): boolean;

    /**
     * Adds a new object to the BloomSet if it is not already present.
     * @param o The object to add to the BloomSet.
     * @returns The `BloomSet` instance, allowing for chaining.
     */
    add(o: T): this;
  }
}
|
package/package.json
CHANGED
|
@@ -1,8 +1,14 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sepiariver/unique-set",
|
|
3
|
-
"version": "
|
|
4
|
-
"description": "Extends the
|
|
3
|
+
"version": "2.0.0",
|
|
4
|
+
"description": "Extends the native Set class to deeply compare using fast-deep-equal, with optional Bloom filter optimization. This version exports 2 classes instead of a default, breaking b/c with version 1.",
|
|
5
5
|
"main": "index.js",
|
|
6
|
+
"module": "dist/index.js",
|
|
7
|
+
"types": "index.d.ts",
|
|
8
|
+
"exports": {
|
|
9
|
+
"require": "./index.js",
|
|
10
|
+
"import": "./dist/index.js"
|
|
11
|
+
},
|
|
6
12
|
"scripts": {
|
|
7
13
|
"test": "jest",
|
|
8
14
|
"build": "babel src -d dist"
|
|
@@ -14,7 +20,7 @@
|
|
|
14
20
|
"keywords": [
|
|
15
21
|
"Set",
|
|
16
22
|
"unique",
|
|
17
|
-
"
|
|
23
|
+
"bloom",
|
|
18
24
|
"deep",
|
|
19
25
|
"compare",
|
|
20
26
|
"equal"
|
|
@@ -29,8 +35,8 @@
|
|
|
29
35
|
"fast-deep-equal": "^3.1.3"
|
|
30
36
|
},
|
|
31
37
|
"devDependencies": {
|
|
32
|
-
"@babel/cli": "^7.17.
|
|
33
|
-
"@babel/core": "^7.17.
|
|
38
|
+
"@babel/cli": "^7.17.6",
|
|
39
|
+
"@babel/core": "^7.17.8",
|
|
34
40
|
"@babel/preset-env": "^7.16.11",
|
|
35
41
|
"babel-jest": "^27.5.1",
|
|
36
42
|
"jest": "^27.5.1"
|
package/src/index.js
CHANGED
|
@@ -1,9 +1,16 @@
|
|
|
1
1
|
import equal from "fast-deep-equal";
|
|
2
2
|
|
|
3
|
-
class UniqueSet extends Set {
|
|
4
|
-
constructor(
|
|
5
|
-
|
|
3
|
+
export class UniqueSet extends Set {
|
|
4
|
+
constructor(iterable = []) {
|
|
5
|
+
if (!Array.isArray(iterable) && !iterable[Symbol.iterator]) {
|
|
6
|
+
throw new TypeError("UniqueSet requires an iterable");
|
|
7
|
+
}
|
|
8
|
+
super();
|
|
9
|
+
for (const item of iterable) {
|
|
10
|
+
this.add(item);
|
|
11
|
+
}
|
|
6
12
|
}
|
|
13
|
+
|
|
7
14
|
has(o) {
|
|
8
15
|
for (const i of this) {
|
|
9
16
|
if (equal(o, i)) {
|
|
@@ -12,11 +19,165 @@ class UniqueSet extends Set {
|
|
|
12
19
|
}
|
|
13
20
|
return false;
|
|
14
21
|
}
|
|
22
|
+
|
|
15
23
|
add(o) {
|
|
16
24
|
if (!this.has(o)) {
|
|
17
|
-
|
|
25
|
+
super.add(o);
|
|
18
26
|
}
|
|
27
|
+
return this;
|
|
19
28
|
}
|
|
20
29
|
}
|
|
21
30
|
|
|
22
|
-
|
|
31
|
+
/**
 * A `Set` that refuses values deeply equal to one it already holds.
 *
 * A Bloom filter (a bit array indexed by several hashes of the value's
 * serialized form) answers "definitely not present" cheaply. Only when the
 * filter reports a possible match does the class fall back to a linear
 * `fast-deep-equal` scan over the stored values.
 *
 * `delete` and `clear` are not overridden, so bits set for removed values
 * stay set. That can only cause extra fallback scans, never a wrong answer.
 */
export class BloomSet extends Set {
  /**
   * @param {Iterable<*>} [iterable=[]] Initial values, inserted in order via `add`.
   * @param {{size?: number, hashCount?: number}} [options={}] Filter tuning.
   *   `size` is the requested number of bits (rounded up to the next prime);
   *   `hashCount` is the number of bit positions derived per value. Values
   *   that are not finite positive numbers fall back to the defaults
   *   (6553577 bits, 7 hashes); fractional values are rounded up.
   * @throws {TypeError} If `iterable` is not iterable.
   */
  constructor(iterable = [], options = {}) {
    if (iterable == null || typeof iterable[Symbol.iterator] !== "function") {
      throw new TypeError("BloomSet requires an iterable");
    }
    super();

    // Targeting < 1 collision per 100,000 elements, ~819 KB memory, needs 7 hashes
    const defaultSize = 6553577;
    const defaultHashCount = 7;

    const { size, hashCount } =
      options && typeof options === "object" ? options : {};

    // Rejects NaN/Infinity/non-numbers/non-positive values; rounds fractions up
    // so the modulus and the loop bound are always whole numbers.
    const positiveIntegerOr = (value, fallback) =>
      typeof value === "number" && Number.isFinite(value) && value > 0
        ? Math.ceil(value)
        : fallback;

    this.aSize = this._findNextPrime(positiveIntegerOr(size, defaultSize));
    this.hashCount = positiveIntegerOr(hashCount, defaultHashCount);

    // Size the storage from aSize, not the requested size: indices are taken
    // modulo aSize, which can be larger than the requested size.
    this.bitArray = new Uint8Array(Math.ceil(this.aSize / 8));

    for (const item of iterable) {
      this.add(item);
    }
  }

  /**
   * Returns the smallest prime that is greater than or equal to `num`.
   * @param {number} num Lower bound.
   * @returns {number} A prime >= `num` (2 when `num` < 2).
   */
  _findNextPrime(num) {
    if (num < 2) return 2;
    if (num % 2 === 0) num++; // Odd numbers only

    while (!this._isPrime(num)) {
      num += 2; // Odd numbers only
    }

    return num;
  }

  /**
   * Trial-division primality test.
   * @param {number} num Candidate.
   * @returns {boolean} `true` when `num` is prime.
   */
  _isPrime(num) {
    if (num < 2) return false;
    if (num === 2 || num === 3) return true;
    if (num % 2 === 0 || num % 3 === 0) return false;

    // Every prime above 3 is of the form 6k ± 1, so test i and i + 2 in steps of 6.
    const sqrt = Math.floor(Math.sqrt(num));
    for (let i = 5; i <= sqrt; i += 6) {
      if (num % i === 0 || num % (i + 2) === 0) return false;
    }

    return true;
  }

  /**
   * Builds the string that is hashed for `item`. The filter relies on
   * deeply-equal values producing the same string; distinct values mapping to
   * the same string only cost an extra deep-equality scan. Recurses into
   * arrays and objects without cycle detection.
   * @param {*} item Value to serialize.
   * @returns {string} Serialized form.
   */
  _serialize(item) {
    if (typeof item === "number" && Number.isNaN(item)) {
      return "NaN";
    }

    if (item && typeof item === "object") {
      const serialize = this._serialize.bind(this);
      if (Array.isArray(item)) {
        return `[${item.map(serialize).join(",")}]`;
      }
      // Order keys by UTF-16 code units rather than localeCompare: a locale
      // collation can rank distinct keys as equal, which would let insertion
      // order leak into the output and give equal objects different hashes.
      const byKey = ([a], [b]) => (a < b ? -1 : a > b ? 1 : 0);
      return `{${Object.entries(item)
        .sort(byKey)
        .map(([k, v]) => `${k}:${serialize(v)}`)
        .join(",")}}`;
    }

    return String(item);
  }

  /**
   * Derives `hashCount` bit positions for `item`.
   * @param {*} item Value to hash.
   * @returns {number[]} Integers in the range [0, aSize).
   */
  _hashes(item) {
    const hashes = [];
    let state = this._fnv1a(this._serialize(item)); // Base hash

    for (let i = 0; i < this.hashCount; i++) {
      hashes.push(state % this.aSize);
      // Mix the full 32-bit state, not the reduced index, so later positions
      // are not determined by the first one.
      state = Math.imul(state ^ (state >>> 13), 0xc2b2ae35) >>> 0;
    }

    return hashes;
  }

  /**
   * 32-bit FNV-1a hash over the UTF-16 code units of `str`.
   * @param {*} str Input; non-strings are converted with `String`.
   * @returns {number} Unsigned 32-bit integer.
   */
  _fnv1a(str) {
    if (typeof str !== "string") {
      str = String(str);
    }
    let hash = 2166136261; // FNV offset basis for 32-bit
    for (let i = 0; i < str.length; i++) {
      // Math.imul gives the exact low 32 bits of the product; a plain `*`
      // can exceed 2^53 and lose them.
      hash = Math.imul(hash ^ str.charCodeAt(i), 16777619) >>> 0;
    }
    return hash;
  }

  /**
   * Sets the bit for every position in `hashes`.
   * @param {number[]} hashes Bit positions.
   */
  _setBits(hashes) {
    for (const hash of hashes) {
      const index = Math.floor(hash / 8);
      const bit = hash % 8;
      this.bitArray[index] |= 1 << bit;
    }
  }

  /**
   * Tests whether every position in `hashes` is set.
   * @param {number[]} hashes Bit positions.
   * @returns {boolean} `false` as soon as one bit is clear, otherwise `true`.
   */
  _checkBits(hashes) {
    for (const hash of hashes) {
      const index = Math.floor(hash / 8);
      const bit = hash % 8;
      if (!(this.bitArray[index] & (1 << bit))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Membership test for a value whose bit positions are already computed.
   * @param {*} o Value to look for.
   * @param {number[]} hashes Result of `_hashes(o)`.
   * @returns {boolean} `true` when a deeply-equal value is stored.
   */
  _isMember(o, hashes) {
    if (!this._checkBits(hashes)) {
      return false; // Definitely not in the set
    }
    // Fall back to fast-deep-equal for false positives
    for (const stored of this) {
      if (equal(o, stored)) {
        return true;
      }
    }
    return false;
  }

  /**
   * Determines whether a value deeply equal to `o` is in the set.
   * @param {*} o Value to look for.
   * @returns {boolean} `true` if found, `false` otherwise.
   */
  has(o) {
    return this._isMember(o, this._hashes(o));
  }

  /**
   * Adds `o` unless a deeply-equal value is already stored.
   * @param {*} o Value to add.
   * @returns {this} The set, for chaining.
   */
  add(o) {
    // Hash once and reuse the positions for both the lookup and the insert.
    const hashes = this._hashes(o);
    if (!this._isMember(o, hashes)) {
      this._setBits(hashes);
      super.add(o);
    }
    return this;
  }
}
|
|
177
|
+
|
|
178
|
+
// CommonJS interop: when a `module.exports` object is available, replace it
// with an object exposing both classes. The `typeof` guard avoids a
// ReferenceError where `module` is not defined.
if (typeof module !== "undefined" && module.exports) {
  module.exports = {
    UniqueSet,
    BloomSet,
  };
}
|