databonk 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/column.d.ts +31 -1
- package/dist/core/column.d.ts.map +1 -1
- package/dist/core/dataframe.d.ts +40 -0
- package/dist/core/dataframe.d.ts.map +1 -1
- package/dist/core/index-cache.d.ts +44 -0
- package/dist/core/index-cache.d.ts.map +1 -0
- package/dist/index.esm.js +719 -119
- package/dist/index.esm.js.map +1 -1
- package/dist/index.js +719 -119
- package/dist/index.js.map +1 -1
- package/dist/operations/groupby.d.ts +8 -1
- package/dist/operations/groupby.d.ts.map +1 -1
- package/dist/operations/join.d.ts +3 -0
- package/dist/operations/join.d.ts.map +1 -1
- package/dist/utils/aggregation-engine.d.ts +84 -0
- package/dist/utils/aggregation-engine.d.ts.map +1 -0
- package/dist/utils/bitset.d.ts +18 -0
- package/dist/utils/bitset.d.ts.map +1 -1
- package/dist/utils/hash.d.ts +79 -0
- package/dist/utils/hash.d.ts.map +1 -0
- package/package.json +1 -1
package/dist/index.esm.js
CHANGED
@@ -86,6 +86,60 @@ class BitSet {
         n = (n & 0x33333333) + ((n >>> 2) & 0x33333333);
         return (((n + (n >>> 4)) & 0x0f0f0f0f) * 0x01010101) >>> 24;
     }
+    /**
+     * Get a batch of null flags as a bitmask.
+     * Useful for SIMD-style batch null checking.
+     * @param startIndex The starting index (must be aligned to 32 for optimal performance)
+     * @param count Number of bits to get (max 32)
+     * @returns A number where bit i is set if index (startIndex + i) is null
+     */
+    getNullMaskBatch(startIndex, count) {
+        if (count <= 0 || count > 32) {
+            throw new Error('Count must be between 1 and 32');
+        }
+        const arrayIndex = Math.floor(startIndex / 32);
+        const bitOffset = startIndex % 32;
+        if (bitOffset === 0 && count === 32) {
+            // Aligned access - fast path
+            return this.data[arrayIndex] >>> 0;
+        }
+        // Extract bits across word boundaries if needed
+        let result = this.data[arrayIndex] >>> bitOffset;
+        if (bitOffset + count > 32 && arrayIndex + 1 < this.data.length) {
+            // Need bits from next word
+            const bitsFromFirst = 32 - bitOffset;
+            const bitsFromSecond = count - bitsFromFirst;
+            const nextWord = this.data[arrayIndex + 1];
+            result |= (nextWord & ((1 << bitsFromSecond) - 1)) << bitsFromFirst;
+        }
+        // Mask to requested count
+        return result & ((1 << count) - 1);
+    }
+    /**
+     * Check if any bit in a range is set.
+     * Faster than checking each bit individually.
+     */
+    anySet(startIndex, count) {
+        const endIndex = Math.min(startIndex + count, this.length);
+        for (let i = startIndex; i < endIndex;) {
+            const arrayIndex = Math.floor(i / 32);
+            const bitOffset = i % 32;
+            const bitsToCheck = Math.min(32 - bitOffset, endIndex - i);
+            const mask = ((1 << bitsToCheck) - 1) << bitOffset;
+            if ((this.data[arrayIndex] & mask) !== 0) {
+                return true;
+            }
+            i += bitsToCheck;
+        }
+        return false;
+    }
+    /**
+     * Get direct access to the underlying data array.
+     * @internal
+     */
+    getDataRef() {
+        return this.data;
+    }
     *[Symbol.iterator]() {
         for (let i = 0; i < this.length; i++) {
             yield this.get(i);
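The new getNullMaskBatch packs up to 32 null flags into a single integer so callers can test a whole block of rows with one mask instead of 32 bitmap lookups. A minimal standalone sketch of the same cross-word bit extraction (illustrative only, not part of the package; `words` stands in for the BitSet's internal Uint32Array):

```js
// Extract `count` bits (1-32) starting at bit `start` from a Uint32Array of words.
// Mirrors the unaligned path of getNullMaskBatch above.
function sliceBits(words, start, count) {
  const wordIdx = Math.floor(start / 32);
  const offset = start % 32;
  let result = words[wordIdx] >>> offset;
  if (offset + count > 32 && wordIdx + 1 < words.length) {
    const fromFirst = 32 - offset;        // bits taken from the first word
    const fromSecond = count - fromFirst; // bits still needed from the next word
    result |= (words[wordIdx + 1] & ((1 << fromSecond) - 1)) << fromFirst;
  }
  return count === 32 ? result >>> 0 : result & ((1 << count) - 1);
}

// Example: bits 30 and 33 are set, so reading 8 bits from index 28 yields 0b00100100.
const words = new Uint32Array(2);
words[0] = 1 << 30;
words[1] = 1 << 1; // global bit index 33
console.log(sliceBits(words, 28, 8).toString(2)); // "100100"
```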
@@ -128,6 +182,30 @@ class Column {
         }
         return this.data[index];
     }
+    /**
+     * Get a value without bounds checking or null handling.
+     * Use only when caller ensures valid index and handles nulls separately.
+     * @internal
+     */
+    getRaw(index) {
+        return this.data[index];
+    }
+    /**
+     * Get direct reference to the underlying data array.
+     * Use for batch operations that need raw access.
+     * @internal
+     */
+    getDataRef() {
+        return this.data;
+    }
+    /**
+     * Get direct reference to the null bitmap.
+     * Use for batch null checking.
+     * @internal
+     */
+    getNullBitmapRef() {
+        return this.nullBitmap;
+    }
     isNull(index) {
         return this.nullBitmap.get(index);
     }
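These accessors are marked @internal: they hand out the raw storage and the null bitmap so batch code inside the library can skip per-element bounds and null checks. A hedged sketch of the pattern they enable, assuming `col` is a numeric Column instance from this build (external callers should keep using get()/isNull()):

```js
// Hypothetical batch sum over a Column's raw storage, skipping nulls via the bitmap.
function sumColumn(col) {
  const data = col.getDataRef();        // direct reference, no copying
  const nulls = col.getNullBitmapRef(); // BitSet of null flags
  let total = 0;
  for (let i = 0; i < col.length; i++) {
    if (!nulls.get(i)) total += data[i];
  }
  return total;
}
```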
@@ -238,8 +316,105 @@ class Column {
     static from(name, values, dataType) {
         return new Column(name, values, dataType);
     }
+    /**
+     * Create a Column directly from raw data without copying.
+     * Use for optimized construction when data is already in the correct format.
+     * @internal
+     */
+    static fromRaw(name, data, nullBitmap, dataType) {
+        // Create an instance without going through the normal constructor
+        const column = Object.create(Column.prototype);
+        column.name = name;
+        column.dataType = dataType;
+        column.data = data;
+        column.nullBitmap = nullBitmap;
+        column.length = data.length;
+        return column;
+    }
+    /**
+     * Select rows by indices with optimized batch copying.
+     * Much faster than calling get() for each index.
+     */
+    selectIndices(indices) {
+        const newLength = indices.length;
+        const Constructor = TYPE_CONSTRUCTORS[this.dataType];
+        if (Constructor) {
+            // TypedArray fast path - batch copy
+            const newData = new Constructor(newLength);
+            const newNullBitmap = new BitSet(newLength);
+            const srcData = this.data;
+            for (let i = 0; i < newLength; i++) {
+                const srcIdx = indices[i];
+                newData[i] = srcData[srcIdx];
+                if (this.nullBitmap.get(srcIdx)) {
+                    newNullBitmap.set(i, true);
+                }
+            }
+            return Column.fromRaw(this.name, newData, newNullBitmap, this.dataType);
+        }
+        // Regular array fallback
+        const newData = new Array(newLength);
+        const newNullBitmap = new BitSet(newLength);
+        for (let i = 0; i < newLength; i++) {
+            const srcIdx = indices[i];
+            newData[i] = this.data[srcIdx];
+            if (this.nullBitmap.get(srcIdx)) {
+                newNullBitmap.set(i, true);
+            }
+        }
+        return Column.fromRaw(this.name, newData, newNullBitmap, this.dataType);
+    }
 }
 
+/**
+ * RowProxy provides zero-allocation row access for iteration.
+ * Reuses a single object while iterating, avoiding object creation per row.
+ */
+class RowProxy {
+    constructor(df) {
+        this.columnCache = new Map();
+        this.index = 0;
+        for (const name of df.columnNames) {
+            this.columnCache.set(name, df.column(name));
+        }
+    }
+    /**
+     * Set the current row index.
+     * @returns this for chaining
+     */
+    setIndex(i) {
+        this.index = i;
+        return this;
+    }
+    /**
+     * Get a value from the current row.
+     */
+    get(col) {
+        const column = this.columnCache.get(col);
+        if (!column) {
+            throw new Error(`Column '${col}' not found`);
+        }
+        return column.get(this.index);
+    }
+    /**
+     * Get a value without null checking (faster for non-null columns).
+     */
+    getRaw(col) {
+        return this.columnCache.get(col).getRaw(this.index);
+    }
+    /**
+     * Check if a column value is null at the current row.
+     */
+    isNull(col) {
+        return this.columnCache.get(col).isNull(this.index);
+    }
+    /**
+     * Get the current row index.
+     */
+    getIndex() {
+        return this.index;
+    }
+}
 class DataFrame {
     constructor(data) {
         this.columns = new Map();
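RowProxy caches the Column objects once and then only moves an index, so iterating a million rows allocates one object instead of a million. A hedged usage sketch using the createRowProxy() helper added to DataFrame later in this diff (the plain-object constructor shape shown here is an assumption; the diff only exposes the compiled output, not the public constructor contract):

```js
// Zero-allocation row iteration via the shared proxy.
const df = new DataFrame({ price: [10, 20, 30], qty: [1, null, 3] });
const row = df.createRowProxy();
let revenue = 0;
for (let i = 0; i < df.length; i++) {
  row.setIndex(i); // reposition the shared proxy, no per-row object
  if (!row.isNull('qty')) revenue += row.get('price') * row.get('qty');
}
console.log(revenue); // 10*1 + 30*3 = 100, nulls skipped
```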
@@ -304,8 +479,16 @@ class DataFrame {
     }
     filter(predicate) {
         const indices = [];
+        // Cache column references for the predicate
+        const columnRefs = [];
+        this.columns.forEach((col, name) => columnRefs.push([name, col]));
+        // Reuse a single row object to reduce allocations
+        const row = {};
         for (let i = 0; i < this.length; i++) {
-
+            // Populate row object using cached column references
+            for (const [name, col] of columnRefs) {
+                row[name] = col.get(i);
+            }
             if (predicate(row, i)) {
                 indices.push(i);
             }
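Because filter() now refills one shared row object from cached column references on every iteration, predicates should read values inside the callback rather than hold on to the row. A hedged sketch (construction shape assumed, as above):

```js
const people = new DataFrame({ name: ['Ada', 'Bo'], age: [36, 12] });
// The row object handed to the predicate is reused across iterations;
// copy it (e.g. { ...row }) if you need to keep a row around.
const adults = people.filter((row) => row.age >= 18);
console.log(adults.length); // 1
```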
@@ -324,11 +507,32 @@ class DataFrame {
     selectRows(indices) {
         const selectedColumns = {};
         this.columns.forEach((column, name) => {
-
-            selectedColumns[name] =
+            // Use optimized batch selection instead of individual get() calls
+            selectedColumns[name] = column.selectIndices(indices);
         });
         return new DataFrame(selectedColumns);
     }
+    /**
+     * Filter rows using a predicate function that receives a RowProxy.
+     * More efficient than filter() as it avoids creating a new object per row.
+     */
+    filterByIndex(predicate) {
+        const proxy = new RowProxy(this);
+        const indices = [];
+        for (let i = 0; i < this.length; i++) {
+            if (predicate(i, proxy.setIndex(i))) {
+                indices.push(i);
+            }
+        }
+        return this.selectRows(indices);
+    }
+    /**
+     * Create a RowProxy for efficient iteration.
+     * Use this when you need to access multiple columns per row without allocation.
+     */
+    createRowProxy() {
+        return new RowProxy(this);
+    }
     getRow(index) {
         if (index < 0 || index >= this.length) {
             throw new Error('Index out of bounds');
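filterByIndex is the allocation-free counterpart to filter(): the predicate receives the row index and the shared RowProxy. A hedged usage sketch (construction shape assumed):

```js
const sales = new DataFrame({ region: ['EU', 'US', 'EU'], amount: [100, 250, 80] });
// (i, row) -> boolean; `row` is the reused proxy positioned at row i.
const bigEu = sales.filterByIndex((i, row) =>
  row.get('region') === 'EU' && row.get('amount') >= 100);
console.log(bigEu.length); // 1
```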
@@ -436,66 +640,310 @@ class DataFrame {
     }
 }
 
+/**
+ * StatAccumulator tracks multiple statistics in a single pass through the data.
+ * Uses Welford's online algorithm for numerically stable variance computation.
+ */
+class StatAccumulator {
+    constructor() {
+        this.count = 0;
+        this.sum = 0;
+        this.min = Infinity;
+        this.max = -Infinity;
+        this.mean_ = 0;
+        this.m2 = 0; // Sum of squares of differences from current mean
+    }
+    /**
+     * Add a single value to the accumulator.
+     * Updates all statistics in O(1) time.
+     */
+    add(value) {
+        this.count++;
+        this.sum += value;
+        if (value < this.min)
+            this.min = value;
+        if (value > this.max)
+            this.max = value;
+        // Welford's online algorithm for stable variance
+        const delta = value - this.mean_;
+        this.mean_ += delta / this.count;
+        const delta2 = value - this.mean_;
+        this.m2 += delta * delta2;
+    }
+    /**
+     * Get the mean of all added values.
+     */
+    getMean() {
+        return this.count > 0 ? this.mean_ : 0;
+    }
+    /**
+     * Get the sample variance (n-1 denominator).
+     */
+    getVariance() {
+        return this.count > 1 ? this.m2 / (this.count - 1) : 0;
+    }
+    /**
+     * Get the sample standard deviation.
+     */
+    getStd() {
+        return Math.sqrt(this.getVariance());
+    }
+    /**
+     * Get a specific aggregate value by function name.
+     */
+    getValue(fn) {
+        switch (fn) {
+            case 'sum':
+                return this.sum;
+            case 'mean':
+                return this.getMean();
+            case 'min':
+                return this.count > 0 ? this.min : NaN;
+            case 'max':
+                return this.count > 0 ? this.max : NaN;
+            case 'count':
+                return this.count;
+            case 'var':
+                return this.getVariance();
+            case 'std':
+                return this.getStd();
+            default:
+                throw new Error(`Unknown aggregate function: ${fn}`);
+        }
+    }
+    /**
+     * Merge another accumulator into this one.
+     * Useful for parallel aggregation.
+     */
+    merge(other) {
+        if (other.count === 0)
+            return;
+        if (this.count === 0) {
+            this.count = other.count;
+            this.sum = other.sum;
+            this.min = other.min;
+            this.max = other.max;
+            this.mean_ = other.mean_;
+            this.m2 = other.m2;
+            return;
+        }
+        const totalCount = this.count + other.count;
+        const delta = other.mean_ - this.mean_;
+        // Combined mean
+        this.mean_ = (this.count * this.mean_ + other.count * other.mean_) / totalCount;
+        // Combined M2 using parallel algorithm
+        this.m2 = this.m2 + other.m2 + delta * delta * this.count * other.count / totalCount;
+        this.sum += other.sum;
+        this.count = totalCount;
+        if (other.min < this.min)
+            this.min = other.min;
+        if (other.max > this.max)
+            this.max = other.max;
+    }
+    /**
+     * Reset the accumulator for reuse.
+     */
+    reset() {
+        this.count = 0;
+        this.sum = 0;
+        this.min = Infinity;
+        this.max = -Infinity;
+        this.mean_ = 0;
+        this.m2 = 0;
+    }
+}
+/**
+ * GroupedAccumulators manages StatAccumulators for multiple groups and columns.
+ * Enables single-pass aggregation across all groups and aggregate functions.
+ */
+class GroupedAccumulators {
+    constructor(columns) {
+        // Map of groupKey -> columnName -> StatAccumulator
+        this.accumulators = new Map();
+        this.columns = columns;
+    }
+    /**
+     * Get or create the accumulator for a group and column.
+     */
+    getAccumulator(groupKey, columnName) {
+        let groupAccs = this.accumulators.get(groupKey);
+        if (!groupAccs) {
+            groupAccs = new Map();
+            this.accumulators.set(groupKey, groupAccs);
+        }
+        let acc = groupAccs.get(columnName);
+        if (!acc) {
+            acc = new StatAccumulator();
+            groupAccs.set(columnName, acc);
+        }
+        return acc;
+    }
+    /**
+     * Add a value for a specific group and column.
+     */
+    add(groupKey, columnName, value) {
+        if (value !== null && !isNaN(value)) {
+            this.getAccumulator(groupKey, columnName).add(value);
+        }
+    }
+    /**
+     * Get all group keys.
+     */
+    getGroups() {
+        return Array.from(this.accumulators.keys());
+    }
+    /**
+     * Get the aggregate value for a group and column.
+     */
+    getValue(groupKey, columnName, fn) {
+        const acc = this.accumulators.get(groupKey)?.get(columnName);
+        if (!acc) {
+            return fn === 'count' ? 0 : NaN;
+        }
+        return acc.getValue(fn);
+    }
+    /**
+     * Get the count for a group (same across all columns).
+     */
+    getGroupCount(groupKey) {
+        const groupAccs = this.accumulators.get(groupKey);
+        if (!groupAccs)
+            return 0;
+        // Return count from the first column accumulator
+        for (const acc of groupAccs.values()) {
+            return acc.count;
+        }
+        return 0;
+    }
+}
+/**
+ * Creates an aggregation plan from a spec object.
+ */
+function createAggregationPlan(spec) {
+    const columns = [];
+    const functions = new Map();
+    for (const [colName, fns] of Object.entries(spec)) {
+        const fnArray = Array.isArray(fns) ? fns : [fns];
+        columns.push(colName);
+        functions.set(colName, fnArray);
+    }
+    return { columns, functions };
+}
+
 class GroupBy {
     constructor(df, columns) {
-        this.
+        this.groupOrder = []; // Track insertion order for consistent output
         this.df = df;
         this.groupColumns = columns;
+        // Cache column references once
+        this.cachedGroupCols = columns.map(c => df.column(c));
+        this.groups = new Map();
         this.computeGroups();
     }
     computeGroups() {
         for (let i = 0; i < this.df.length; i++) {
             const key = this.createGroupKey(i);
-
-
+            const existingIndices = this.groups.get(key);
+            if (existingIndices) {
+                existingIndices.push(i);
+            }
+            else {
+                this.groups.set(key, [i]);
+                this.groupOrder.push({
+                    key,
+                    firstRowIndex: i
+                });
             }
-            this.groups.get(key).push(i);
         }
     }
+    /**
+     * Create a simple string key for a row using '||' separator.
+     */
     createGroupKey(rowIndex) {
-
-
-
-
-
-
-
-
-        return key.split('||').map(part => part === '__NULL__' ? null : part);
+        let key = '';
+        for (let i = 0; i < this.cachedGroupCols.length; i++) {
+            if (i > 0)
+                key += '||';
+            const val = this.cachedGroupCols[i].get(rowIndex);
+            key += val === null ? '\0' : String(val);
+        }
+        return key;
     }
+    /**
+     * Perform aggregation using single-pass algorithm for efficiency.
+     */
     agg(spec) {
         const resultColumns = {};
-
+        // Build list of columns to aggregate and their functions
+        const aggPlan = createAggregationPlan(spec);
+        // Separate count-only columns from columns that need actual data
+        const countOnlyColumns = new Set();
+        const dataColumns = [];
+        for (const [colName, fns] of aggPlan.functions) {
+            const fnArray = Array.isArray(fns) ? fns : [fns];
+            const hasOnlyCount = fnArray.every(fn => fn === 'count');
+            if (hasOnlyCount && !this.df.hasColumn(colName)) {
+                // This is a count-only column (like { count: 'count' })
+                countOnlyColumns.add(colName);
+            }
+            else {
+                dataColumns.push(colName);
+            }
+        }
+        // Cache column references for aggregation columns (excluding count-only)
+        const aggColumnRefs = new Map();
+        for (const colName of dataColumns) {
+            aggColumnRefs.set(colName, this.df.column(colName));
+        }
+        // Single-pass aggregation: create accumulators for each group
+        const groupedAccs = new GroupedAccumulators(dataColumns);
+        // Iterate through data once, accumulating all stats
+        for (let i = 0; i < this.df.length; i++) {
+            // Compute group key
+            const key = this.createGroupKey(i);
+            // Add values to accumulators for each aggregation column
+            for (const colName of dataColumns) {
+                const value = aggColumnRefs.get(colName).get(i);
+                if (value !== null && typeof value === 'number' && !isNaN(value)) {
+                    groupedAccs.getAccumulator(key, colName).add(value);
+                }
+                else if (value !== null) {
+                    // For count, we still need to track non-null values
+                    // Use a dummy add for tracking count
+                    groupedAccs.getAccumulator(key, colName);
+                }
+            }
+        }
+        // Build result columns for group keys (preserve original order)
+        this.groupColumns.forEach((colName, colIdx) => {
             const groupValues = [];
-            const column = this.
-            for (const
-
-
+            const column = this.cachedGroupCols[colIdx];
+            for (const entry of this.groupOrder) {
+                const indices = this.groups.get(entry.key);
+                if (indices && indices.length > 0) {
+                    groupValues.push(column.get(indices[0]));
+                }
             }
             resultColumns[colName] = new Column(colName, groupValues);
         });
-
-
-
+        // Build result columns for aggregated values
+        for (const [colName, fns] of aggPlan.functions) {
+            for (const fn of fns) {
                 const aggValues = [];
-                for (const
-
-
-
-
+                for (const entry of this.groupOrder) {
+                    if (fn === 'count' || countOnlyColumns.has(colName)) {
+                        // For count, return number of rows in group
+                        const indices = this.groups.get(entry.key);
+                        aggValues.push(indices ? indices.length : 0);
                     }
                     else {
-
-                        const groupValues = indices.map(i => this.df.column(colName).get(i));
-                        const groupColumn = new Column(`temp_${colName}`, groupValues);
-                        value = this.computeAggregateValue(groupColumn, fn);
+                        aggValues.push(groupedAccs.getValue(entry.key, colName, fn));
                     }
-                    aggValues.push(value);
                 }
-                const resultColName =
+                const resultColName = fns.length === 1 ? colName : `${colName}_${fn}`;
                 resultColumns[resultColName] = new Column(resultColName, aggValues, 'float64');
-            }
-        }
+            }
+        }
         return new DataFrame(resultColumns);
     }
     computeAggregateValue(column, fn) {
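StatAccumulator's merge() uses the parallel form of Welford's algorithm: for two partitions A and B the combined M2 is M2_A + M2_B + delta^2 * n_A * n_B / (n_A + n_B), where delta is the difference of the partition means. A standalone sketch (not the package class) showing that merging two partial accumulators reproduces the single-pass result:

```js
// Minimal Welford accumulator, mirroring the add()/merge() logic in the diff above.
function makeAcc() { return { n: 0, mean: 0, m2: 0 }; }
function add(a, x) {
  a.n++;
  const d = x - a.mean;
  a.mean += d / a.n;
  a.m2 += d * (x - a.mean);
}
function merge(a, b) {
  if (b.n === 0) return;
  const n = a.n + b.n;
  const delta = b.mean - a.mean;
  a.mean = (a.n * a.mean + b.n * b.mean) / n;
  a.m2 = a.m2 + b.m2 + delta * delta * a.n * b.n / n;
  a.n = n;
}

const xs = [2, 4, 4, 4, 5, 5, 7, 9];
const whole = makeAcc(), left = makeAcc(), right = makeAcc();
xs.forEach(x => add(whole, x));
xs.slice(0, 4).forEach(x => add(left, x));
xs.slice(4).forEach(x => add(right, x));
merge(left, right);
console.log(whole.m2 / (whole.n - 1), left.m2 / (left.n - 1)); // both ~= 4.571 (sample variance)
```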
@@ -538,7 +986,27 @@ class GroupBy {
         return Math.sqrt(this.computeVar(column));
     }
     count() {
-
+        const resultColumns = {};
+        // Build group key columns
+        this.groupColumns.forEach((colName, colIdx) => {
+            const column = this.cachedGroupCols[colIdx];
+            const values = [];
+            for (const entry of this.groupOrder) {
+                const indices = this.groups.get(entry.key);
+                if (indices && indices.length > 0) {
+                    values.push(column.get(indices[0]));
+                }
+            }
+            resultColumns[colName] = new Column(colName, values);
+        });
+        // Add count column
+        const counts = [];
+        for (const entry of this.groupOrder) {
+            const indices = this.groups.get(entry.key);
+            counts.push(indices ? indices.length : 0);
+        }
+        resultColumns['count'] = new Column('count', counts, 'int32');
+        return new DataFrame(resultColumns);
     }
     sum(columns) {
         const spec = {};
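The agg() rewrite computes every requested statistic in one pass over the frame, and names result columns after the source column when a single function is requested or as `${col}_${fn}` when several are. A hedged usage sketch (construction shape assumed; groupBy() itself pre-dates this diff):

```js
const orders = new DataFrame({
  region: ['EU', 'EU', 'US'],
  amount: [100, 50, 200],
});
const byRegion = orders.groupBy(['region']).agg({ amount: ['sum', 'mean'] });
// Expected columns: region, amount_sum, amount_mean - one row per group,
// groups emitted in first-seen order thanks to the new groupOrder tracking.
```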
@@ -570,19 +1038,17 @@ class GroupBy {
     }
     first() {
         const resultColumns = {};
+        // Cache all column references
+        const colRefs = new Map();
+        for (const colName of this.df.columnNames) {
+            colRefs.set(colName, this.df.column(colName));
+        }
         this.df.columnNames.forEach(colName => {
             const values = [];
-
-
-
-
-                    values.push(keyValues[colIndex]);
-                }
-            }
-            else {
-                for (const indices of this.groups.values()) {
-                    const firstIndex = indices[0];
-                    values.push(this.df.column(colName).get(firstIndex));
+            for (const entry of this.groupOrder) {
+                const indices = this.groups.get(entry.key);
+                if (indices && indices.length > 0) {
+                    values.push(colRefs.get(colName).get(indices[0]));
                 }
             }
             resultColumns[colName] = new Column(colName, values);
@@ -591,19 +1057,17 @@ class GroupBy {
     }
     last() {
         const resultColumns = {};
+        // Cache all column references
+        const colRefs = new Map();
+        for (const colName of this.df.columnNames) {
+            colRefs.set(colName, this.df.column(colName));
+        }
         this.df.columnNames.forEach(colName => {
             const values = [];
-
-
-
-
-                    values.push(keyValues[colIndex]);
-                }
-            }
-            else {
-                for (const indices of this.groups.values()) {
-                    const lastIndex = indices[indices.length - 1];
-                    values.push(this.df.column(colName).get(lastIndex));
+            for (const entry of this.groupOrder) {
+                const indices = this.groups.get(entry.key);
+                if (indices && indices.length > 0) {
+                    values.push(colRefs.get(colName).get(indices[indices.length - 1]));
                 }
             }
             resultColumns[colName] = new Column(colName, values);
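first() and last() now walk groups in first-seen order (via groupOrder) and reuse cached column references instead of re-resolving each column per group. An illustrative usage sketch, construction shape assumed:

```js
const events = new DataFrame({
  user: ['a', 'b', 'a'],
  action: ['login', 'login', 'logout'],
});
const firstPerUser = events.groupBy(['user']).first(); // rows: (a, login), (b, login)
const lastPerUser = events.groupBy(['user']).last();   // rows: (a, logout), (b, login)
```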
@@ -611,15 +1075,22 @@ class GroupBy {
         return new DataFrame(resultColumns);
     }
     size() {
-        Array.from(this.groups.keys());
-        const groupSizes = Array.from(this.groups.values()).map(indices => indices.length);
         const resultColumns = {};
-
-
-
-
-
-
+        const groupSizes = [];
+        // Build group key columns and sizes
+        this.groupColumns.forEach((colName, colIdx) => {
+            const column = this.cachedGroupCols[colIdx];
+            const values = [];
+            for (const entry of this.groupOrder) {
+                const indices = this.groups.get(entry.key);
+                if (indices && indices.length > 0) {
+                    values.push(column.get(indices[0]));
+                    // Only add to groupSizes on first column iteration
+                    if (colIdx === 0) {
+                        groupSizes.push(indices.length);
+                    }
+                }
+            }
             resultColumns[colName] = new Column(colName, values);
         });
         resultColumns['size'] = new Column('size', groupSizes, 'int32');
@@ -630,6 +1101,75 @@ DataFrame.prototype.groupBy = function (columns) {
     return new GroupBy(this, columns);
 };
 
+/**
+ * IndexCache provides caching for hash indices used in join and groupBy operations.
+ * Uses WeakMap to allow garbage collection of DataFrames.
+ */
+class IndexCache {
+    constructor(maxAge = 60000) {
+        this.cache = new WeakMap();
+        this.maxAge = maxAge;
+    }
+    /**
+     * Generate a cache key from column names.
+     */
+    getCacheKey(columns) {
+        return columns.slice().sort().join('\x00');
+    }
+    /**
+     * Get a cached index for the given DataFrame and columns.
+     * Returns null if not cached or expired.
+     */
+    getIndex(df, columns) {
+        const dfCache = this.cache.get(df);
+        if (!dfCache)
+            return null;
+        const key = this.getCacheKey(columns);
+        const cached = dfCache.get(key);
+        if (!cached)
+            return null;
+        // Check if expired
+        if (Date.now() - cached.createdAt > this.maxAge) {
+            dfCache.delete(key);
+            return null;
+        }
+        return cached.index;
+    }
+    /**
+     * Store an index in the cache.
+     */
+    setIndex(df, columns, index) {
+        let dfCache = this.cache.get(df);
+        if (!dfCache) {
+            dfCache = new Map();
+            this.cache.set(df, dfCache);
+        }
+        const key = this.getCacheKey(columns);
+        dfCache.set(key, {
+            columns: columns.slice(),
+            index,
+            createdAt: Date.now()
+        });
+    }
+    /**
+     * Invalidate all cached indices for a DataFrame.
+     */
+    invalidate(df) {
+        this.cache.delete(df);
+    }
+    /**
+     * Clear all cached indices.
+     */
+    clear() {
+        // WeakMap doesn't have a clear method, so we create a new one
+        this.cache = new WeakMap();
+    }
+}
+/**
+ * Global index cache instance for shared use across operations.
+ */
+const globalIndexCache = new IndexCache();
+
 class Joiner {
     static join(left, right, on, how = 'inner', suffixes = ['_x', '_y']) {
         const joinKeys = Array.isArray(on) ? on : [on];
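IndexCache keys a WeakMap by the DataFrame object itself, so cached hash indices disappear when the frame is garbage collected, and each entry carries createdAt so indices older than maxAge (60 s by default) are rebuilt. A standalone sketch of the same WeakMap-plus-TTL memoization pattern (illustrative, not the package class):

```js
// WeakMap-keyed memoization with a time-to-live, mirroring IndexCache above.
const cache = new WeakMap(); // owner object -> Map(key -> { value, createdAt })
function getOrBuild(owner, key, build, maxAgeMs = 60000) {
  let perOwner = cache.get(owner);
  if (!perOwner) {
    perOwner = new Map();
    cache.set(owner, perOwner);
  }
  const hit = perOwner.get(key);
  if (hit && Date.now() - hit.createdAt <= maxAgeMs) return hit.value;
  const value = build();
  perOwner.set(key, { value, createdAt: Date.now() });
  return value;
}

// Usage: the expensive index is built once per owner/key until it expires.
const owner = {};
const idx1 = getOrBuild(owner, 'a,b', () => new Map([['k', [0, 1]]]));
const idx2 = getOrBuild(owner, 'a,b', () => new Map());
console.log(idx1 === idx2); // true - second call is served from the cache
```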
@@ -659,33 +1199,67 @@ class Joiner {
             }
         });
     }
-    static buildHashIndex(df, keys) {
+    static buildHashIndex(df, keys, useCache = true) {
+        // Check cache first
+        if (useCache) {
+            const cached = globalIndexCache.getIndex(df, keys);
+            if (cached) {
+                return cached;
+            }
+        }
         const index = new Map();
+        // Cache column references once before the loop
+        const columns = keys.map(k => df.column(k));
         for (let i = 0; i < df.length; i++) {
-            const
-
-
+            const key = this.createJoinKey(columns, i);
+            const indices = index.get(key);
+            if (indices) {
+                indices.push(i);
+            }
+            else {
+                index.set(key, [i]);
             }
-
+        }
+        // Store in cache
+        if (useCache) {
+            globalIndexCache.setIndex(df, keys, index);
         }
         return index;
     }
-
-
-
-
-
-
-
+    /**
+     * Create a simple string key for a row using '||' separator.
+     */
+    static createJoinKey(columns, rowIndex) {
+        let key = '';
+        for (let i = 0; i < columns.length; i++) {
+            if (i > 0)
+                key += '||';
+            const val = columns[i].get(rowIndex);
+            key += val === null ? '\0' : String(val);
+        }
+        return key;
     }
     static innerJoin(left, right, leftIndex, rightIndex, joinKeys, suffixes) {
         const matches = [];
-
+        // Cache column references for key lookups
+        const leftColumns = joinKeys.map(k => left.column(k));
+        // Track which left rows have been processed to avoid duplicates
+        const processedLeft = new Set();
+        // Iterate through left rows in original order
+        for (let leftIdx = 0; leftIdx < left.length; leftIdx++) {
+            if (processedLeft.has(leftIdx))
+                continue;
+            const key = this.createJoinKey(leftColumns, leftIdx);
             const rightIndices = rightIndex.get(key);
             if (rightIndices) {
-
-
-
+                // Get all left rows with the same key
+                const leftIndices = leftIndex.get(key);
+                if (leftIndices) {
+                    for (const lIdx of leftIndices) {
+                        processedLeft.add(lIdx);
+                        for (const rightIdx of rightIndices) {
+                            matches.push([lIdx, rightIdx]);
+                        }
                     }
                 }
             }
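buildHashIndex maps each composite key (values joined with '||', nulls encoded as '\0') to the list of row indices carrying it, and innerJoin probes the right-side index with keys built from left rows. A minimal standalone sketch of that hash-join probe over plain arrays (illustrative only):

```js
// Build key -> [row indices] for an array of rows, then probe it from the other side.
const keyOf = (row, keys) =>
  keys.map(k => (row[k] === null ? '\0' : String(row[k]))).join('||');

function buildIndex(rows, keys) {
  const index = new Map();
  rows.forEach((row, i) => {
    const key = keyOf(row, keys);
    const bucket = index.get(key);
    if (bucket) bucket.push(i); else index.set(key, [i]);
  });
  return index;
}

const left = [{ id: 1, v: 'a' }, { id: 2, v: 'b' }];
const right = [{ id: 2, w: 'x' }, { id: 2, w: 'y' }];
const rightIndex = buildIndex(right, ['id']);
const matches = [];
left.forEach((row, li) => {
  for (const ri of rightIndex.get(keyOf(row, ['id'])) ?? []) matches.push([li, ri]);
});
console.log(matches); // [[1, 0], [1, 1]] - left row 1 matches both right rows
```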
@@ -694,18 +1268,26 @@ class Joiner {
     }
     static leftJoin(left, right, leftIndex, rightIndex, joinKeys, suffixes) {
         const matches = [];
-
+        const leftColumns = joinKeys.map(k => left.column(k));
+        const processedLeft = new Set();
+        // Iterate through left rows in original order
+        for (let leftIdx = 0; leftIdx < left.length; leftIdx++) {
+            if (processedLeft.has(leftIdx))
+                continue;
+            const key = this.createJoinKey(leftColumns, leftIdx);
             const rightIndices = rightIndex.get(key);
-
-
-
-
+            const leftIndices = leftIndex.get(key);
+            if (leftIndices) {
+                for (const lIdx of leftIndices) {
+                    processedLeft.add(lIdx);
+                    if (rightIndices) {
+                        for (const rightIdx of rightIndices) {
+                            matches.push([lIdx, rightIdx]);
+                        }
+                    }
+                    else {
+                        matches.push([lIdx, null]);
                     }
-                }
-            }
-            else {
-                for (const leftIdx of leftIndices) {
-                    matches.push([leftIdx, null]);
                 }
             }
         }
@@ -713,18 +1295,26 @@ class Joiner {
     }
     static rightJoin(left, right, leftIndex, rightIndex, joinKeys, suffixes) {
         const matches = [];
-
+        const rightColumns = joinKeys.map(k => right.column(k));
+        const processedRight = new Set();
+        // Iterate through right rows in original order
+        for (let rightIdx = 0; rightIdx < right.length; rightIdx++) {
+            if (processedRight.has(rightIdx))
+                continue;
+            const key = this.createJoinKey(rightColumns, rightIdx);
             const leftIndices = leftIndex.get(key);
-
-
-
-
+            const rightIndices = rightIndex.get(key);
+            if (rightIndices) {
+                for (const rIdx of rightIndices) {
+                    processedRight.add(rIdx);
+                    if (leftIndices) {
+                        for (const leftIdx of leftIndices) {
+                            matches.push([leftIdx, rIdx]);
+                        }
+                    }
+                    else {
+                        matches.push([null, rIdx]);
                     }
-                }
-            }
-            else {
-                for (const rightIdx of rightIndices) {
-                    matches.push([null, rightIdx]);
                 }
             }
         }
@@ -733,27 +1323,37 @@ class Joiner {
     static outerJoin(left, right, leftIndex, rightIndex, joinKeys, suffixes) {
         const matches = [];
         const processedRightKeys = new Set();
-
+        const processedLeft = new Set();
+        const leftColumns = joinKeys.map(k => left.column(k));
+        const rightColumns = joinKeys.map(k => right.column(k));
+        // Process left side first (in original order)
+        for (let leftIdx = 0; leftIdx < left.length; leftIdx++) {
+            if (processedLeft.has(leftIdx))
+                continue;
+            const key = this.createJoinKey(leftColumns, leftIdx);
             const rightIndices = rightIndex.get(key);
-
-
-            for (const
-
-
+            const leftIndices = leftIndex.get(key);
+            if (leftIndices) {
+                for (const lIdx of leftIndices) {
+                    processedLeft.add(lIdx);
+                    if (rightIndices) {
+                        processedRightKeys.add(key);
+                        for (const rightIdx of rightIndices) {
+                            matches.push([lIdx, rightIdx]);
+                        }
+                    }
+                    else {
+                        matches.push([lIdx, null]);
                     }
-                }
-            }
-            else {
-                for (const leftIdx of leftIndices) {
-                    matches.push([leftIdx, null]);
                 }
             }
         }
-
+        // Add unmatched right rows (in original order)
+        for (let rightIdx = 0; rightIdx < right.length; rightIdx++) {
+            const key = this.createJoinKey(rightColumns, rightIdx);
             if (!processedRightKeys.has(key)) {
-
-
-            }
+                matches.push([null, rightIdx]);
+                processedRightKeys.add(key); // Mark this key as processed
            }
         }
         return this.buildJoinedDataFrame(left, right, matches, joinKeys, suffixes);