databonk 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/column.d.ts +31 -1
- package/dist/core/column.d.ts.map +1 -1
- package/dist/core/dataframe.d.ts +40 -0
- package/dist/core/dataframe.d.ts.map +1 -1
- package/dist/core/index-cache.d.ts +44 -0
- package/dist/core/index-cache.d.ts.map +1 -0
- package/dist/index.esm.js +719 -119
- package/dist/index.esm.js.map +1 -1
- package/dist/index.js +719 -119
- package/dist/index.js.map +1 -1
- package/dist/operations/groupby.d.ts +8 -1
- package/dist/operations/groupby.d.ts.map +1 -1
- package/dist/operations/join.d.ts +3 -0
- package/dist/operations/join.d.ts.map +1 -1
- package/dist/utils/aggregation-engine.d.ts +84 -0
- package/dist/utils/aggregation-engine.d.ts.map +1 -0
- package/dist/utils/bitset.d.ts +18 -0
- package/dist/utils/bitset.d.ts.map +1 -1
- package/dist/utils/hash.d.ts +79 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/package.json +1 -1
package/dist/index.js
CHANGED
@@ -88,6 +88,60 @@ class BitSet {
         n = (n & 0x33333333) + ((n >>> 2) & 0x33333333);
         return (((n + (n >>> 4)) & 0x0f0f0f0f) * 0x01010101) >>> 24;
     }
+    /**
+     * Get a batch of null flags as a bitmask.
+     * Useful for SIMD-style batch null checking.
+     * @param startIndex The starting index (must be aligned to 32 for optimal performance)
+     * @param count Number of bits to get (max 32)
+     * @returns A number where bit i is set if index (startIndex + i) is null
+     */
+    getNullMaskBatch(startIndex, count) {
+        if (count <= 0 || count > 32) {
+            throw new Error('Count must be between 1 and 32');
+        }
+        const arrayIndex = Math.floor(startIndex / 32);
+        const bitOffset = startIndex % 32;
+        if (bitOffset === 0 && count === 32) {
+            // Aligned access - fast path
+            return this.data[arrayIndex] >>> 0;
+        }
+        // Extract bits across word boundaries if needed
+        let result = this.data[arrayIndex] >>> bitOffset;
+        if (bitOffset + count > 32 && arrayIndex + 1 < this.data.length) {
+            // Need bits from next word
+            const bitsFromFirst = 32 - bitOffset;
+            const bitsFromSecond = count - bitsFromFirst;
+            const nextWord = this.data[arrayIndex + 1];
+            result |= (nextWord & ((1 << bitsFromSecond) - 1)) << bitsFromFirst;
+        }
+        // Mask to requested count
+        return result & ((1 << count) - 1);
+    }
+    /**
+     * Check if any bit in a range is set.
+     * Faster than checking each bit individually.
+     */
+    anySet(startIndex, count) {
+        const endIndex = Math.min(startIndex + count, this.length);
+        for (let i = startIndex; i < endIndex;) {
+            const arrayIndex = Math.floor(i / 32);
+            const bitOffset = i % 32;
+            const bitsToCheck = Math.min(32 - bitOffset, endIndex - i);
+            const mask = ((1 << bitsToCheck) - 1) << bitOffset;
+            if ((this.data[arrayIndex] & mask) !== 0) {
+                return true;
+            }
+            i += bitsToCheck;
+        }
+        return false;
+    }
+    /**
+     * Get direct access to the underlying data array.
+     * @internal
+     */
+    getDataRef() {
+        return this.data;
+    }
     *[Symbol.iterator]() {
         for (let i = 0; i < this.length; i++) {
             yield this.get(i);
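The two new BitSet helpers read null flags a word at a time. A small worked sketch (illustrative only, not part of the diff; it assumes the internal BitSet class is reachable and that set(index, true) marks an index, as the code above does):

    const bits = new BitSet(64);
    bits.set(3, true);
    bits.set(35, true);
    bits.getNullMaskBatch(0, 32);  // => 8 (0b1000: bit 3 corresponds to index 3)
    bits.getNullMaskBatch(32, 32); // => 8 (bit 3 of the second word, i.e. index 35)
    bits.anySet(0, 3);             // => false (indices 0-2 are clear)
    bits.anySet(0, 4);             // => true  (index 3 is set)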
@@ -130,6 +184,30 @@ class Column {
         }
         return this.data[index];
     }
+    /**
+     * Get a value without bounds checking or null handling.
+     * Use only when caller ensures valid index and handles nulls separately.
+     * @internal
+     */
+    getRaw(index) {
+        return this.data[index];
+    }
+    /**
+     * Get direct reference to the underlying data array.
+     * Use for batch operations that need raw access.
+     * @internal
+     */
+    getDataRef() {
+        return this.data;
+    }
+    /**
+     * Get direct reference to the null bitmap.
+     * Use for batch null checking.
+     * @internal
+     */
+    getNullBitmapRef() {
+        return this.nullBitmap;
+    }
     isNull(index) {
         return this.nullBitmap.get(index);
     }
@@ -240,8 +318,105 @@ class Column {
     static from(name, values, dataType) {
         return new Column(name, values, dataType);
     }
+    /**
+     * Create a Column directly from raw data without copying.
+     * Use for optimized construction when data is already in the correct format.
+     * @internal
+     */
+    static fromRaw(name, data, nullBitmap, dataType) {
+        // Create an instance without going through the normal constructor
+        const column = Object.create(Column.prototype);
+        column.name = name;
+        column.dataType = dataType;
+        column.data = data;
+        column.nullBitmap = nullBitmap;
+        column.length = data.length;
+        return column;
+    }
+    /**
+     * Select rows by indices with optimized batch copying.
+     * Much faster than calling get() for each index.
+     */
+    selectIndices(indices) {
+        const newLength = indices.length;
+        const Constructor = TYPE_CONSTRUCTORS[this.dataType];
+        if (Constructor) {
+            // TypedArray fast path - batch copy
+            const newData = new Constructor(newLength);
+            const newNullBitmap = new BitSet(newLength);
+            const srcData = this.data;
+            for (let i = 0; i < newLength; i++) {
+                const srcIdx = indices[i];
+                newData[i] = srcData[srcIdx];
+                if (this.nullBitmap.get(srcIdx)) {
+                    newNullBitmap.set(i, true);
+                }
+            }
+            return Column.fromRaw(this.name, newData, newNullBitmap, this.dataType);
+        }
+        // Regular array fallback
+        const newData = new Array(newLength);
+        const newNullBitmap = new BitSet(newLength);
+        for (let i = 0; i < newLength; i++) {
+            const srcIdx = indices[i];
+            newData[i] = this.data[srcIdx];
+            if (this.nullBitmap.get(srcIdx)) {
+                newNullBitmap.set(i, true);
+            }
+        }
+        return Column.fromRaw(this.name, newData, newNullBitmap, this.dataType);
+    }
 }
 
+/**
+ * RowProxy provides zero-allocation row access for iteration.
+ * Reuses a single object while iterating, avoiding object creation per row.
+ */
+class RowProxy {
+    constructor(df) {
+        this.columnCache = new Map();
+        this.index = 0;
+        for (const name of df.columnNames) {
+            this.columnCache.set(name, df.column(name));
+        }
+    }
+    /**
+     * Set the current row index.
+     * @returns this for chaining
+     */
+    setIndex(i) {
+        this.index = i;
+        return this;
+    }
+    /**
+     * Get a value from the current row.
+     */
+    get(col) {
+        const column = this.columnCache.get(col);
+        if (!column) {
+            throw new Error(`Column '${col}' not found`);
+        }
+        return column.get(this.index);
+    }
+    /**
+     * Get a value without null checking (faster for non-null columns).
+     */
+    getRaw(col) {
+        return this.columnCache.get(col).getRaw(this.index);
+    }
+    /**
+     * Check if a column value is null at the current row.
+     */
+    isNull(col) {
+        return this.columnCache.get(col).isNull(this.index);
+    }
+    /**
+     * Get the current row index.
+     */
+    getIndex() {
+        return this.index;
+    }
+}
 class DataFrame {
     constructor(data) {
         this.columns = new Map();
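A quick illustration of the batched row selection (not part of the diff; the values are made up, and it assumes Column.from accepts a plain array plus a dataType, as the context line above suggests):

    const col = Column.from('age', [21, 34, 35, 42], 'float64');
    const picked = col.selectIndices([2, 0]); // one batch copy, null bitmap carried over
    picked.get(0); // 35
    picked.get(1); // 21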
@@ -306,8 +481,16 @@ class DataFrame {
     }
     filter(predicate) {
         const indices = [];
+        // Cache column references for the predicate
+        const columnRefs = [];
+        this.columns.forEach((col, name) => columnRefs.push([name, col]));
+        // Reuse a single row object to reduce allocations
+        const row = {};
         for (let i = 0; i < this.length; i++) {
-
+            // Populate row object using cached column references
+            for (const [name, col] of columnRefs) {
+                row[name] = col.get(i);
+            }
             if (predicate(row, i)) {
                 indices.push(i);
             }
@@ -326,11 +509,32 @@ class DataFrame {
     selectRows(indices) {
         const selectedColumns = {};
         this.columns.forEach((column, name) => {
-
-            selectedColumns[name] =
+            // Use optimized batch selection instead of individual get() calls
+            selectedColumns[name] = column.selectIndices(indices);
         });
         return new DataFrame(selectedColumns);
     }
+    /**
+     * Filter rows using a predicate function that receives a RowProxy.
+     * More efficient than filter() as it avoids creating a new object per row.
+     */
+    filterByIndex(predicate) {
+        const proxy = new RowProxy(this);
+        const indices = [];
+        for (let i = 0; i < this.length; i++) {
+            if (predicate(i, proxy.setIndex(i))) {
+                indices.push(i);
+            }
+        }
+        return this.selectRows(indices);
+    }
+    /**
+     * Create a RowProxy for efficient iteration.
+     * Use this when you need to access multiple columns per row without allocation.
+     */
+    createRowProxy() {
+        return new RowProxy(this);
+    }
     getRow(index) {
         if (index < 0 || index >= this.length) {
             throw new Error('Index out of bounds');
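The difference between the two filter paths, as an illustrative sketch (not part of the diff; it assumes the DataFrame constructor accepts plain arrays per column, which this hunk does not show):

    const df = new DataFrame({ city: ['a', 'a', 'b'], temp: [10, 20, 30] });
    df.filter(row => row.temp > 15);                     // builds a row object per iteration
    df.filterByIndex((i, row) => row.get('temp') > 15);  // reuses one RowProxy for every row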
@@ -438,66 +642,310 @@ class DataFrame {
     }
 }
 
+/**
+ * StatAccumulator tracks multiple statistics in a single pass through the data.
+ * Uses Welford's online algorithm for numerically stable variance computation.
+ */
+class StatAccumulator {
+    constructor() {
+        this.count = 0;
+        this.sum = 0;
+        this.min = Infinity;
+        this.max = -Infinity;
+        this.mean_ = 0;
+        this.m2 = 0; // Sum of squares of differences from current mean
+    }
+    /**
+     * Add a single value to the accumulator.
+     * Updates all statistics in O(1) time.
+     */
+    add(value) {
+        this.count++;
+        this.sum += value;
+        if (value < this.min)
+            this.min = value;
+        if (value > this.max)
+            this.max = value;
+        // Welford's online algorithm for stable variance
+        const delta = value - this.mean_;
+        this.mean_ += delta / this.count;
+        const delta2 = value - this.mean_;
+        this.m2 += delta * delta2;
+    }
+    /**
+     * Get the mean of all added values.
+     */
+    getMean() {
+        return this.count > 0 ? this.mean_ : 0;
+    }
+    /**
+     * Get the sample variance (n-1 denominator).
+     */
+    getVariance() {
+        return this.count > 1 ? this.m2 / (this.count - 1) : 0;
+    }
+    /**
+     * Get the sample standard deviation.
+     */
+    getStd() {
+        return Math.sqrt(this.getVariance());
+    }
+    /**
+     * Get a specific aggregate value by function name.
+     */
+    getValue(fn) {
+        switch (fn) {
+            case 'sum':
+                return this.sum;
+            case 'mean':
+                return this.getMean();
+            case 'min':
+                return this.count > 0 ? this.min : NaN;
+            case 'max':
+                return this.count > 0 ? this.max : NaN;
+            case 'count':
+                return this.count;
+            case 'var':
+                return this.getVariance();
+            case 'std':
+                return this.getStd();
+            default:
+                throw new Error(`Unknown aggregate function: ${fn}`);
+        }
+    }
+    /**
+     * Merge another accumulator into this one.
+     * Useful for parallel aggregation.
+     */
+    merge(other) {
+        if (other.count === 0)
+            return;
+        if (this.count === 0) {
+            this.count = other.count;
+            this.sum = other.sum;
+            this.min = other.min;
+            this.max = other.max;
+            this.mean_ = other.mean_;
+            this.m2 = other.m2;
+            return;
+        }
+        const totalCount = this.count + other.count;
+        const delta = other.mean_ - this.mean_;
+        // Combined mean
+        this.mean_ = (this.count * this.mean_ + other.count * other.mean_) / totalCount;
+        // Combined M2 using parallel algorithm
+        this.m2 = this.m2 + other.m2 + delta * delta * this.count * other.count / totalCount;
+        this.sum += other.sum;
+        this.count = totalCount;
+        if (other.min < this.min)
+            this.min = other.min;
+        if (other.max > this.max)
+            this.max = other.max;
+    }
+    /**
+     * Reset the accumulator for reuse.
+     */
+    reset() {
+        this.count = 0;
+        this.sum = 0;
+        this.min = Infinity;
+        this.max = -Infinity;
+        this.mean_ = 0;
+        this.m2 = 0;
+    }
+}
+/**
+ * GroupedAccumulators manages StatAccumulators for multiple groups and columns.
+ * Enables single-pass aggregation across all groups and aggregate functions.
+ */
+class GroupedAccumulators {
+    constructor(columns) {
+        // Map of groupKey -> columnName -> StatAccumulator
+        this.accumulators = new Map();
+        this.columns = columns;
+    }
+    /**
+     * Get or create the accumulator for a group and column.
+     */
+    getAccumulator(groupKey, columnName) {
+        let groupAccs = this.accumulators.get(groupKey);
+        if (!groupAccs) {
+            groupAccs = new Map();
+            this.accumulators.set(groupKey, groupAccs);
+        }
+        let acc = groupAccs.get(columnName);
+        if (!acc) {
+            acc = new StatAccumulator();
+            groupAccs.set(columnName, acc);
+        }
+        return acc;
+    }
+    /**
+     * Add a value for a specific group and column.
+     */
+    add(groupKey, columnName, value) {
+        if (value !== null && !isNaN(value)) {
+            this.getAccumulator(groupKey, columnName).add(value);
+        }
+    }
+    /**
+     * Get all group keys.
+     */
+    getGroups() {
+        return Array.from(this.accumulators.keys());
+    }
+    /**
+     * Get the aggregate value for a group and column.
+     */
+    getValue(groupKey, columnName, fn) {
+        const acc = this.accumulators.get(groupKey)?.get(columnName);
+        if (!acc) {
+            return fn === 'count' ? 0 : NaN;
+        }
+        return acc.getValue(fn);
+    }
+    /**
+     * Get the count for a group (same across all columns).
+     */
+    getGroupCount(groupKey) {
+        const groupAccs = this.accumulators.get(groupKey);
+        if (!groupAccs)
+            return 0;
+        // Return count from the first column accumulator
+        for (const acc of groupAccs.values()) {
+            return acc.count;
+        }
+        return 0;
+    }
+}
+/**
+ * Creates an aggregation plan from a spec object.
+ */
+function createAggregationPlan(spec) {
+    const columns = [];
+    const functions = new Map();
+    for (const [colName, fns] of Object.entries(spec)) {
+        const fnArray = Array.isArray(fns) ? fns : [fns];
+        columns.push(colName);
+        functions.set(colName, fnArray);
+    }
+    return { columns, functions };
+}
+
 class GroupBy {
     constructor(df, columns) {
-        this.
+        this.groupOrder = []; // Track insertion order for consistent output
         this.df = df;
         this.groupColumns = columns;
+        // Cache column references once
+        this.cachedGroupCols = columns.map(c => df.column(c));
+        this.groups = new Map();
         this.computeGroups();
     }
     computeGroups() {
         for (let i = 0; i < this.df.length; i++) {
             const key = this.createGroupKey(i);
-
-
+            const existingIndices = this.groups.get(key);
+            if (existingIndices) {
+                existingIndices.push(i);
+            }
+            else {
+                this.groups.set(key, [i]);
+                this.groupOrder.push({
+                    key,
+                    firstRowIndex: i
+                });
             }
-            this.groups.get(key).push(i);
         }
     }
+    /**
+     * Create a simple string key for a row using '||' separator.
+     */
     createGroupKey(rowIndex) {
-
-
-
-
-
-
-
-
-        return key.split('||').map(part => part === '__NULL__' ? null : part);
+        let key = '';
+        for (let i = 0; i < this.cachedGroupCols.length; i++) {
+            if (i > 0)
+                key += '||';
+            const val = this.cachedGroupCols[i].get(rowIndex);
+            key += val === null ? '\0' : String(val);
+        }
+        return key;
     }
+    /**
+     * Perform aggregation using single-pass algorithm for efficiency.
+     */
     agg(spec) {
         const resultColumns = {};
-
+        // Build list of columns to aggregate and their functions
+        const aggPlan = createAggregationPlan(spec);
+        // Separate count-only columns from columns that need actual data
+        const countOnlyColumns = new Set();
+        const dataColumns = [];
+        for (const [colName, fns] of aggPlan.functions) {
+            const fnArray = Array.isArray(fns) ? fns : [fns];
+            const hasOnlyCount = fnArray.every(fn => fn === 'count');
+            if (hasOnlyCount && !this.df.hasColumn(colName)) {
+                // This is a count-only column (like { count: 'count' })
+                countOnlyColumns.add(colName);
+            }
+            else {
+                dataColumns.push(colName);
+            }
+        }
+        // Cache column references for aggregation columns (excluding count-only)
+        const aggColumnRefs = new Map();
+        for (const colName of dataColumns) {
+            aggColumnRefs.set(colName, this.df.column(colName));
+        }
+        // Single-pass aggregation: create accumulators for each group
+        const groupedAccs = new GroupedAccumulators(dataColumns);
+        // Iterate through data once, accumulating all stats
+        for (let i = 0; i < this.df.length; i++) {
+            // Compute group key
+            const key = this.createGroupKey(i);
+            // Add values to accumulators for each aggregation column
+            for (const colName of dataColumns) {
+                const value = aggColumnRefs.get(colName).get(i);
+                if (value !== null && typeof value === 'number' && !isNaN(value)) {
+                    groupedAccs.getAccumulator(key, colName).add(value);
+                }
+                else if (value !== null) {
+                    // For count, we still need to track non-null values
+                    // Use a dummy add for tracking count
+                    groupedAccs.getAccumulator(key, colName);
+                }
+            }
+        }
+        // Build result columns for group keys (preserve original order)
+        this.groupColumns.forEach((colName, colIdx) => {
             const groupValues = [];
-            const column = this.
-            for (const
-
-
+            const column = this.cachedGroupCols[colIdx];
+            for (const entry of this.groupOrder) {
+                const indices = this.groups.get(entry.key);
+                if (indices && indices.length > 0) {
+                    groupValues.push(column.get(indices[0]));
+                }
             }
             resultColumns[colName] = new Column(colName, groupValues);
         });
-
-
-
+        // Build result columns for aggregated values
+        for (const [colName, fns] of aggPlan.functions) {
+            for (const fn of fns) {
                 const aggValues = [];
-                for (const
-
-
-
-
+                for (const entry of this.groupOrder) {
+                    if (fn === 'count' || countOnlyColumns.has(colName)) {
+                        // For count, return number of rows in group
+                        const indices = this.groups.get(entry.key);
+                        aggValues.push(indices ? indices.length : 0);
                     }
                     else {
-
-                        const groupValues = indices.map(i => this.df.column(colName).get(i));
-                        const groupColumn = new Column(`temp_${colName}`, groupValues);
-                        value = this.computeAggregateValue(groupColumn, fn);
+                        aggValues.push(groupedAccs.getValue(entry.key, colName, fn));
                     }
-                aggValues.push(value);
                 }
-            const resultColName =
+                const resultColName = fns.length === 1 ? colName : `${colName}_${fn}`;
                 resultColumns[resultColName] = new Column(resultColName, aggValues, 'float64');
-            }
-        }
+            }
+        }
         return new DataFrame(resultColumns);
     }
     computeAggregateValue(column, fn) {
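For reference, the single-pass plan reads each row once and every requested statistic comes from the same StatAccumulator; for the values 1, 2, 3 Welford's updates leave mean = 2 and M2 = 2, so the sample variance is 2 / (3 - 1) = 1. An illustrative call shape (not part of the diff; column and group names are made up):

    df.groupBy(['city']).agg({ temp: ['mean', 'std'], count: 'count' });
    // -> result columns: city, temp_mean, temp_std, count
    df.groupBy(['city']).agg({ temp: 'sum' });
    // -> a single function keeps the original column name 'temp'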
@@ -540,7 +988,27 @@ class GroupBy {
         return Math.sqrt(this.computeVar(column));
     }
     count() {
-
+        const resultColumns = {};
+        // Build group key columns
+        this.groupColumns.forEach((colName, colIdx) => {
+            const column = this.cachedGroupCols[colIdx];
+            const values = [];
+            for (const entry of this.groupOrder) {
+                const indices = this.groups.get(entry.key);
+                if (indices && indices.length > 0) {
+                    values.push(column.get(indices[0]));
+                }
+            }
+            resultColumns[colName] = new Column(colName, values);
+        });
+        // Add count column
+        const counts = [];
+        for (const entry of this.groupOrder) {
+            const indices = this.groups.get(entry.key);
+            counts.push(indices ? indices.length : 0);
+        }
+        resultColumns['count'] = new Column('count', counts, 'int32');
+        return new DataFrame(resultColumns);
     }
     sum(columns) {
         const spec = {};
@@ -572,19 +1040,17 @@ class GroupBy {
     }
     first() {
         const resultColumns = {};
+        // Cache all column references
+        const colRefs = new Map();
+        for (const colName of this.df.columnNames) {
+            colRefs.set(colName, this.df.column(colName));
+        }
        this.df.columnNames.forEach(colName => {
             const values = [];
-
-
-
-
-                    values.push(keyValues[colIndex]);
-                }
-            }
-            else {
-                for (const indices of this.groups.values()) {
-                    const firstIndex = indices[0];
-                    values.push(this.df.column(colName).get(firstIndex));
+            for (const entry of this.groupOrder) {
+                const indices = this.groups.get(entry.key);
+                if (indices && indices.length > 0) {
+                    values.push(colRefs.get(colName).get(indices[0]));
                 }
             }
             resultColumns[colName] = new Column(colName, values);
@@ -593,19 +1059,17 @@ class GroupBy {
     }
     last() {
         const resultColumns = {};
+        // Cache all column references
+        const colRefs = new Map();
+        for (const colName of this.df.columnNames) {
+            colRefs.set(colName, this.df.column(colName));
+        }
        this.df.columnNames.forEach(colName => {
             const values = [];
-
-
-
-
-                    values.push(keyValues[colIndex]);
-                }
-            }
-            else {
-                for (const indices of this.groups.values()) {
-                    const lastIndex = indices[indices.length - 1];
-                    values.push(this.df.column(colName).get(lastIndex));
+            for (const entry of this.groupOrder) {
+                const indices = this.groups.get(entry.key);
+                if (indices && indices.length > 0) {
+                    values.push(colRefs.get(colName).get(indices[indices.length - 1]));
                 }
             }
             resultColumns[colName] = new Column(colName, values);
@@ -613,15 +1077,22 @@ class GroupBy {
         return new DataFrame(resultColumns);
     }
     size() {
-        Array.from(this.groups.keys());
-        const groupSizes = Array.from(this.groups.values()).map(indices => indices.length);
         const resultColumns = {};
-
-
-
-
-
-
+        const groupSizes = [];
+        // Build group key columns and sizes
+        this.groupColumns.forEach((colName, colIdx) => {
+            const column = this.cachedGroupCols[colIdx];
+            const values = [];
+            for (const entry of this.groupOrder) {
+                const indices = this.groups.get(entry.key);
+                if (indices && indices.length > 0) {
+                    values.push(column.get(indices[0]));
+                    // Only add to groupSizes on first column iteration
+                    if (colIdx === 0) {
+                        groupSizes.push(indices.length);
+                    }
+                }
+            }
             resultColumns[colName] = new Column(colName, values);
         });
         resultColumns['size'] = new Column('size', groupSizes, 'int32');
@@ -632,6 +1103,75 @@ DataFrame.prototype.groupBy = function (columns) {
     return new GroupBy(this, columns);
 };
 
+/**
+ * IndexCache provides caching for hash indices used in join and groupBy operations.
+ * Uses WeakMap to allow garbage collection of DataFrames.
+ */
+class IndexCache {
+    constructor(maxAge = 60000) {
+        this.cache = new WeakMap();
+        this.maxAge = maxAge;
+    }
+    /**
+     * Generate a cache key from column names.
+     */
+    getCacheKey(columns) {
+        return columns.slice().sort().join('\x00');
+    }
+    /**
+     * Get a cached index for the given DataFrame and columns.
+     * Returns null if not cached or expired.
+     */
+    getIndex(df, columns) {
+        const dfCache = this.cache.get(df);
+        if (!dfCache)
+            return null;
+        const key = this.getCacheKey(columns);
+        const cached = dfCache.get(key);
+        if (!cached)
+            return null;
+        // Check if expired
+        if (Date.now() - cached.createdAt > this.maxAge) {
+            dfCache.delete(key);
+            return null;
+        }
+        return cached.index;
+    }
+    /**
+     * Store an index in the cache.
+     */
+    setIndex(df, columns, index) {
+        let dfCache = this.cache.get(df);
+        if (!dfCache) {
+            dfCache = new Map();
+            this.cache.set(df, dfCache);
+        }
+        const key = this.getCacheKey(columns);
+        dfCache.set(key, {
+            columns: columns.slice(),
+            index,
+            createdAt: Date.now()
+        });
+    }
+    /**
+     * Invalidate all cached indices for a DataFrame.
+     */
+    invalidate(df) {
+        this.cache.delete(df);
+    }
+    /**
+     * Clear all cached indices.
+     */
+    clear() {
+        // WeakMap doesn't have a clear method, so we create a new one
+        this.cache = new WeakMap();
+    }
+}
+/**
+ * Global index cache instance for shared use across operations.
+ */
+const globalIndexCache = new IndexCache();
+
 class Joiner {
     static join(left, right, on, how = 'inner', suffixes = ['_x', '_y']) {
         const joinKeys = Array.isArray(on) ? on : [on];
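The cache is keyed by the DataFrame object itself (a WeakMap entry disappears once the frame is garbage-collected) and each stored index expires after maxAge milliseconds, 60000 by default. An illustrative sketch (not part of the diff; `users` is a hypothetical DataFrame, and it assumes these internals are reachable from calling code):

    const first = Joiner.buildHashIndex(users, ['id']);  // builds the Map and caches it
    const again = Joiner.buildHashIndex(users, ['id']);  // served from globalIndexCache
    globalIndexCache.invalidate(users);                  // drop cached indices after mutating users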
@@ -661,33 +1201,67 @@ class Joiner {
             }
         });
     }
-    static buildHashIndex(df, keys) {
+    static buildHashIndex(df, keys, useCache = true) {
+        // Check cache first
+        if (useCache) {
+            const cached = globalIndexCache.getIndex(df, keys);
+            if (cached) {
+                return cached;
+            }
+        }
         const index = new Map();
+        // Cache column references once before the loop
+        const columns = keys.map(k => df.column(k));
         for (let i = 0; i < df.length; i++) {
-            const
-
-
+            const key = this.createJoinKey(columns, i);
+            const indices = index.get(key);
+            if (indices) {
+                indices.push(i);
+            }
+            else {
+                index.set(key, [i]);
             }
-
+        }
+        // Store in cache
+        if (useCache) {
+            globalIndexCache.setIndex(df, keys, index);
         }
         return index;
     }
-
-
-
-
-
-
-
+    /**
+     * Create a simple string key for a row using '||' separator.
+     */
+    static createJoinKey(columns, rowIndex) {
+        let key = '';
+        for (let i = 0; i < columns.length; i++) {
+            if (i > 0)
+                key += '||';
+            const val = columns[i].get(rowIndex);
+            key += val === null ? '\0' : String(val);
+        }
+        return key;
+    }
     static innerJoin(left, right, leftIndex, rightIndex, joinKeys, suffixes) {
         const matches = [];
-
+        // Cache column references for key lookups
+        const leftColumns = joinKeys.map(k => left.column(k));
+        // Track which left rows have been processed to avoid duplicates
+        const processedLeft = new Set();
+        // Iterate through left rows in original order
+        for (let leftIdx = 0; leftIdx < left.length; leftIdx++) {
+            if (processedLeft.has(leftIdx))
+                continue;
+            const key = this.createJoinKey(leftColumns, leftIdx);
             const rightIndices = rightIndex.get(key);
             if (rightIndices) {
-
-
-
+                // Get all left rows with the same key
+                const leftIndices = leftIndex.get(key);
+                if (leftIndices) {
+                    for (const lIdx of leftIndices) {
+                        processedLeft.add(lIdx);
+                        for (const rightIdx of rightIndices) {
+                            matches.push([lIdx, rightIdx]);
+                        }
                     }
                 }
             }
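The static join entry point itself is unchanged in shape, so the cached indices and key builder above are exercised through calls like the following illustrative sketch (not part of the diff; `users` and `orders` are hypothetical DataFrames):

    Joiner.join(users, orders, 'userId');                         // inner join on one key
    Joiner.join(users, orders, ['userId', 'region'], 'left');     // multi-column left join
    Joiner.join(users, orders, 'userId', 'outer', ['_u', '_o']);  // custom suffixes for overlapping columns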
@@ -696,18 +1270,26 @@ class Joiner {
     }
     static leftJoin(left, right, leftIndex, rightIndex, joinKeys, suffixes) {
         const matches = [];
-
+        const leftColumns = joinKeys.map(k => left.column(k));
+        const processedLeft = new Set();
+        // Iterate through left rows in original order
+        for (let leftIdx = 0; leftIdx < left.length; leftIdx++) {
+            if (processedLeft.has(leftIdx))
+                continue;
+            const key = this.createJoinKey(leftColumns, leftIdx);
             const rightIndices = rightIndex.get(key);
-
-
-
-
+            const leftIndices = leftIndex.get(key);
+            if (leftIndices) {
+                for (const lIdx of leftIndices) {
+                    processedLeft.add(lIdx);
+                    if (rightIndices) {
+                        for (const rightIdx of rightIndices) {
+                            matches.push([lIdx, rightIdx]);
+                        }
+                    }
+                    else {
+                        matches.push([lIdx, null]);
                     }
-                }
-            }
-            else {
-                for (const leftIdx of leftIndices) {
-                    matches.push([leftIdx, null]);
                 }
             }
         }
@@ -715,18 +1297,26 @@ class Joiner {
     }
     static rightJoin(left, right, leftIndex, rightIndex, joinKeys, suffixes) {
         const matches = [];
-
+        const rightColumns = joinKeys.map(k => right.column(k));
+        const processedRight = new Set();
+        // Iterate through right rows in original order
+        for (let rightIdx = 0; rightIdx < right.length; rightIdx++) {
+            if (processedRight.has(rightIdx))
+                continue;
+            const key = this.createJoinKey(rightColumns, rightIdx);
             const leftIndices = leftIndex.get(key);
-
-
-
-
+            const rightIndices = rightIndex.get(key);
+            if (rightIndices) {
+                for (const rIdx of rightIndices) {
+                    processedRight.add(rIdx);
+                    if (leftIndices) {
+                        for (const leftIdx of leftIndices) {
+                            matches.push([leftIdx, rIdx]);
+                        }
+                    }
+                    else {
+                        matches.push([null, rIdx]);
                     }
-                }
-            }
-            else {
-                for (const rightIdx of rightIndices) {
-                    matches.push([null, rightIdx]);
                 }
             }
         }
@@ -735,27 +1325,37 @@ class Joiner {
     static outerJoin(left, right, leftIndex, rightIndex, joinKeys, suffixes) {
         const matches = [];
         const processedRightKeys = new Set();
-
+        const processedLeft = new Set();
+        const leftColumns = joinKeys.map(k => left.column(k));
+        const rightColumns = joinKeys.map(k => right.column(k));
+        // Process left side first (in original order)
+        for (let leftIdx = 0; leftIdx < left.length; leftIdx++) {
+            if (processedLeft.has(leftIdx))
+                continue;
+            const key = this.createJoinKey(leftColumns, leftIdx);
             const rightIndices = rightIndex.get(key);
-
-
-            for (const
-
-
+            const leftIndices = leftIndex.get(key);
+            if (leftIndices) {
+                for (const lIdx of leftIndices) {
+                    processedLeft.add(lIdx);
+                    if (rightIndices) {
+                        processedRightKeys.add(key);
+                        for (const rightIdx of rightIndices) {
+                            matches.push([lIdx, rightIdx]);
+                        }
+                    }
+                    else {
+                        matches.push([lIdx, null]);
                     }
-                }
-            }
-            else {
-                for (const leftIdx of leftIndices) {
-                    matches.push([leftIdx, null]);
                 }
             }
         }
-
+        // Add unmatched right rows (in original order)
+        for (let rightIdx = 0; rightIdx < right.length; rightIdx++) {
+            const key = this.createJoinKey(rightColumns, rightIdx);
             if (!processedRightKeys.has(key)) {
-
-
-            }
+                matches.push([null, rightIdx]);
+                processedRightKeys.add(key); // Mark this key as processed
             }
         }
         return this.buildJoinedDataFrame(left, right, matches, joinKeys, suffixes);