databonk 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -88,6 +88,60 @@ class BitSet {
88
88
  n = (n & 0x33333333) + ((n >>> 2) & 0x33333333);
89
89
  return (((n + (n >>> 4)) & 0x0f0f0f0f) * 0x01010101) >>> 24;
90
90
  }
91
+ /**
92
+ * Get a batch of null flags as a bitmask.
93
+ * Useful for SIMD-style batch null checking.
94
+ * @param startIndex The starting index (must be aligned to 32 for optimal performance)
95
+ * @param count Number of bits to get (max 32)
96
+ * @returns A number where bit i is set if index (startIndex + i) is null
97
+ */
98
+ getNullMaskBatch(startIndex, count) {
99
+ if (count <= 0 || count > 32) {
100
+ throw new Error('Count must be between 1 and 32');
101
+ }
102
+ const arrayIndex = Math.floor(startIndex / 32);
103
+ const bitOffset = startIndex % 32;
104
+ if (bitOffset === 0 && count === 32) {
105
+ // Aligned access - fast path
106
+ return this.data[arrayIndex] >>> 0;
107
+ }
108
+ // Extract bits across word boundaries if needed
109
+ let result = this.data[arrayIndex] >>> bitOffset;
110
+ if (bitOffset + count > 32 && arrayIndex + 1 < this.data.length) {
111
+ // Need bits from next word
112
+ const bitsFromFirst = 32 - bitOffset;
113
+ const bitsFromSecond = count - bitsFromFirst;
114
+ const nextWord = this.data[arrayIndex + 1];
115
+ result |= (nextWord & ((1 << bitsFromSecond) - 1)) << bitsFromFirst;
116
+ }
117
+ // Mask to requested count
118
+ return result & ((1 << count) - 1);
119
+ }
120
+ /**
121
+ * Check if any bit in a range is set.
122
+ * Faster than checking each bit individually.
123
+ */
124
+ anySet(startIndex, count) {
125
+ const endIndex = Math.min(startIndex + count, this.length);
126
+ for (let i = startIndex; i < endIndex;) {
127
+ const arrayIndex = Math.floor(i / 32);
128
+ const bitOffset = i % 32;
129
+ const bitsToCheck = Math.min(32 - bitOffset, endIndex - i);
130
+ const mask = ((1 << bitsToCheck) - 1) << bitOffset;
131
+ if ((this.data[arrayIndex] & mask) !== 0) {
132
+ return true;
133
+ }
134
+ i += bitsToCheck;
135
+ }
136
+ return false;
137
+ }
138
+ /**
139
+ * Get direct access to the underlying data array.
140
+ * @internal
141
+ */
142
+ getDataRef() {
143
+ return this.data;
144
+ }
91
145
  *[Symbol.iterator]() {
92
146
  for (let i = 0; i < this.length; i++) {
93
147
  yield this.get(i);
@@ -130,6 +184,30 @@ class Column {
130
184
  }
131
185
  return this.data[index];
132
186
  }
187
+ /**
188
+ * Get a value without bounds checking or null handling.
189
+ * Use only when caller ensures valid index and handles nulls separately.
190
+ * @internal
191
+ */
192
+ getRaw(index) {
193
+ return this.data[index];
194
+ }
195
+ /**
196
+ * Get direct reference to the underlying data array.
197
+ * Use for batch operations that need raw access.
198
+ * @internal
199
+ */
200
+ getDataRef() {
201
+ return this.data;
202
+ }
203
+ /**
204
+ * Get direct reference to the null bitmap.
205
+ * Use for batch null checking.
206
+ * @internal
207
+ */
208
+ getNullBitmapRef() {
209
+ return this.nullBitmap;
210
+ }
133
211
  isNull(index) {
134
212
  return this.nullBitmap.get(index);
135
213
  }
@@ -240,8 +318,105 @@ class Column {
240
318
  static from(name, values, dataType) {
241
319
  return new Column(name, values, dataType);
242
320
  }
321
+ /**
322
+ * Create a Column directly from raw data without copying.
323
+ * Use for optimized construction when data is already in the correct format.
324
+ * @internal
325
+ */
326
+ static fromRaw(name, data, nullBitmap, dataType) {
327
+ // Create an instance without going through the normal constructor
328
+ const column = Object.create(Column.prototype);
329
+ column.name = name;
330
+ column.dataType = dataType;
331
+ column.data = data;
332
+ column.nullBitmap = nullBitmap;
333
+ column.length = data.length;
334
+ return column;
335
+ }
336
+ /**
337
+ * Select rows by indices with optimized batch copying.
338
+ * Much faster than calling get() for each index.
339
+ */
340
+ selectIndices(indices) {
341
+ const newLength = indices.length;
342
+ const Constructor = TYPE_CONSTRUCTORS[this.dataType];
343
+ if (Constructor) {
344
+ // TypedArray fast path - batch copy
345
+ const newData = new Constructor(newLength);
346
+ const newNullBitmap = new BitSet(newLength);
347
+ const srcData = this.data;
348
+ for (let i = 0; i < newLength; i++) {
349
+ const srcIdx = indices[i];
350
+ newData[i] = srcData[srcIdx];
351
+ if (this.nullBitmap.get(srcIdx)) {
352
+ newNullBitmap.set(i, true);
353
+ }
354
+ }
355
+ return Column.fromRaw(this.name, newData, newNullBitmap, this.dataType);
356
+ }
357
+ // Regular array fallback
358
+ const newData = new Array(newLength);
359
+ const newNullBitmap = new BitSet(newLength);
360
+ for (let i = 0; i < newLength; i++) {
361
+ const srcIdx = indices[i];
362
+ newData[i] = this.data[srcIdx];
363
+ if (this.nullBitmap.get(srcIdx)) {
364
+ newNullBitmap.set(i, true);
365
+ }
366
+ }
367
+ return Column.fromRaw(this.name, newData, newNullBitmap, this.dataType);
368
+ }
243
369
  }
244
370
 
371
/**
 * RowProxy is a reusable cursor over a single row of a DataFrame.
 * Column objects are looked up once at construction; moving the cursor only
 * changes an index, so no per-row object is created while iterating.
 */
class RowProxy {
    /**
     * @param df Source frame; must expose an iterable `columnNames` and a
     *           `column(name)` accessor.
     */
    constructor(df) {
        this.columnCache = new Map();
        this.index = 0;
        for (const name of df.columnNames) {
            this.columnCache.set(name, df.column(name));
        }
    }
    /**
     * Look up a cached column, failing with the same message for every accessor.
     * @throws Error if no column with that name was cached.
     * @internal
     */
    resolveColumn(col) {
        const column = this.columnCache.get(col);
        if (!column) {
            throw new Error(`Column '${col}' not found`);
        }
        return column;
    }
    /**
     * Set the current row index.
     * @returns this for chaining
     */
    setIndex(i) {
        this.index = i;
        return this;
    }
    /**
     * Get a value from the current row via the column's get().
     * @throws Error if the column does not exist.
     */
    get(col) {
        return this.resolveColumn(col).get(this.index);
    }
    /**
     * Get a value from the current row via the column's getRaw().
     * @throws Error if the column does not exist.
     */
    getRaw(col) {
        return this.resolveColumn(col).getRaw(this.index);
    }
    /**
     * Check if a column value is null at the current row.
     * @throws Error if the column does not exist.
     */
    isNull(col) {
        return this.resolveColumn(col).isNull(this.index);
    }
    /**
     * Get the current row index.
     */
    getIndex() {
        return this.index;
    }
}
245
420
  class DataFrame {
246
421
  constructor(data) {
247
422
  this.columns = new Map();
@@ -306,8 +481,16 @@ class DataFrame {
306
481
  }
307
482
  filter(predicate) {
308
483
  const indices = [];
484
+ // Cache column references for the predicate
485
+ const columnRefs = [];
486
+ this.columns.forEach((col, name) => columnRefs.push([name, col]));
487
+ // Reuse a single row object to reduce allocations
488
+ const row = {};
309
489
  for (let i = 0; i < this.length; i++) {
310
- const row = this.getRow(i);
490
+ // Populate row object using cached column references
491
+ for (const [name, col] of columnRefs) {
492
+ row[name] = col.get(i);
493
+ }
311
494
  if (predicate(row, i)) {
312
495
  indices.push(i);
313
496
  }
@@ -326,11 +509,32 @@ class DataFrame {
326
509
  selectRows(indices) {
327
510
  const selectedColumns = {};
328
511
  this.columns.forEach((column, name) => {
329
- const values = indices.map(i => column.get(i));
330
- selectedColumns[name] = new Column(name, values, column.dataType);
512
+ // Use optimized batch selection instead of individual get() calls
513
+ selectedColumns[name] = column.selectIndices(indices);
331
514
  });
332
515
  return new DataFrame(selectedColumns);
333
516
  }
517
+ /**
518
+ * Filter rows using a predicate function that receives a RowProxy.
519
+ * More efficient than filter() as it avoids creating a new object per row.
520
+ */
521
+ filterByIndex(predicate) {
522
+ const proxy = new RowProxy(this);
523
+ const indices = [];
524
+ for (let i = 0; i < this.length; i++) {
525
+ if (predicate(i, proxy.setIndex(i))) {
526
+ indices.push(i);
527
+ }
528
+ }
529
+ return this.selectRows(indices);
530
+ }
531
+ /**
532
+ * Create a RowProxy for efficient iteration.
533
+ * Use this when you need to access multiple columns per row without allocation.
534
+ */
535
+ createRowProxy() {
536
+ return new RowProxy(this);
537
+ }
334
538
  getRow(index) {
335
539
  if (index < 0 || index >= this.length) {
336
540
  throw new Error('Index out of bounds');
@@ -438,66 +642,310 @@ class DataFrame {
438
642
  }
439
643
  }
440
644
 
645
/**
 * StatAccumulator tracks count, sum, min, max, mean and variance in a single
 * pass through the data. The mean and the sum of squared deviations (m2) are
 * maintained with Welford's online algorithm.
 */
class StatAccumulator {
    constructor() {
        this.reset();
    }
    /**
     * Add a single value to the accumulator.
     * Every statistic is updated with a constant amount of work.
     * @param value Number to include; the value is not validated here.
     */
    add(value) {
        this.count++;
        this.sum += value;
        if (value < this.min)
            this.min = value;
        if (value > this.max)
            this.max = value;
        // Welford update: delta uses the old mean, delta2 the new one.
        const delta = value - this.mean_;
        this.mean_ += delta / this.count;
        const delta2 = value - this.mean_;
        this.m2 += delta * delta2;
    }
    /**
     * Get the mean of all added values (0 when nothing has been added).
     */
    getMean() {
        return this.count > 0 ? this.mean_ : 0;
    }
    /**
     * Get the sample variance (n-1 denominator); 0 with fewer than two values.
     */
    getVariance() {
        return this.count > 1 ? this.m2 / (this.count - 1) : 0;
    }
    /**
     * Get the sample standard deviation.
     */
    getStd() {
        return Math.sqrt(this.getVariance());
    }
    /**
     * Get a specific aggregate value by function name.
     * @param fn One of 'sum', 'mean', 'min', 'max', 'count', 'var', 'std'.
     * @returns The aggregate; 'min' and 'max' are NaN when nothing has been added.
     * @throws Error for an unknown function name.
     */
    getValue(fn) {
        switch (fn) {
            case 'sum':
                return this.sum;
            case 'mean':
                return this.getMean();
            case 'min':
                return this.count > 0 ? this.min : NaN;
            case 'max':
                return this.count > 0 ? this.max : NaN;
            case 'count':
                return this.count;
            case 'var':
                return this.getVariance();
            case 'std':
                return this.getStd();
            default:
                throw new Error(`Unknown aggregate function: ${fn}`);
        }
    }
    /**
     * Merge another accumulator into this one, as if every value added to
     * `other` had been added here. `other` is not modified.
     */
    merge(other) {
        if (other.count === 0)
            return;
        if (this.count === 0) {
            this.count = other.count;
            this.sum = other.sum;
            this.min = other.min;
            this.max = other.max;
            this.mean_ = other.mean_;
            this.m2 = other.m2;
            return;
        }
        const totalCount = this.count + other.count;
        const delta = other.mean_ - this.mean_;
        const otherWeight = other.count / totalCount;
        // Pairwise combination of m2; must use the counts from before the merge.
        this.m2 += other.m2 + delta * delta * this.count * otherWeight;
        // Shift the mean by the weighted difference instead of forming
        // count * mean products, which can overflow for large magnitudes.
        this.mean_ += delta * otherWeight;
        this.sum += other.sum;
        this.count = totalCount;
        if (other.min < this.min)
            this.min = other.min;
        if (other.max > this.max)
            this.max = other.max;
    }
    /**
     * Reset the accumulator to its empty state for reuse.
     */
    reset() {
        this.count = 0;
        this.sum = 0;
        this.min = Infinity;
        this.max = -Infinity;
        this.mean_ = 0;
        this.m2 = 0; // Sum of squared differences from the current mean
    }
}
757
/**
 * GroupedAccumulators keeps one StatAccumulator per (group key, column name)
 * pair, created lazily on first access.
 */
class GroupedAccumulators {
    /**
     * @param columns Column names this instance is associated with; stored
     *                as-is and not otherwise used by this class.
     */
    constructor(columns) {
        // Map of groupKey -> columnName -> StatAccumulator
        this.accumulators = new Map();
        this.columns = columns;
    }
    /**
     * Get the accumulator for a group and column, creating it if needed.
     * Note that merely calling this registers the group and column.
     */
    getAccumulator(groupKey, columnName) {
        let groupAccs = this.accumulators.get(groupKey);
        if (!groupAccs) {
            groupAccs = new Map();
            this.accumulators.set(groupKey, groupAccs);
        }
        let acc = groupAccs.get(columnName);
        if (!acc) {
            acc = new StatAccumulator();
            groupAccs.set(columnName, acc);
        }
        return acc;
    }
    /**
     * Add a value for a specific group and column.
     * Only real numbers are accumulated: null, undefined, NaN and any
     * non-number value (strings, bigint, objects) are ignored, so a numeric
     * string can no longer turn the running sum into a string.
     */
    add(groupKey, columnName, value) {
        if (typeof value === 'number' && !Number.isNaN(value)) {
            this.getAccumulator(groupKey, columnName).add(value);
        }
    }
    /**
     * Get all group keys in the order they were first registered.
     */
    getGroups() {
        return Array.from(this.accumulators.keys());
    }
    /**
     * Get the aggregate value for a group and column.
     * @returns The accumulator's value, or for an unknown group/column
     *          0 when `fn` is 'count' and NaN otherwise.
     */
    getValue(groupKey, columnName, fn) {
        const acc = this.accumulators.get(groupKey)?.get(columnName);
        if (!acc) {
            return fn === 'count' ? 0 : NaN;
        }
        return acc.getValue(fn);
    }
    /**
     * Get the count held by the first column accumulator registered for a group.
     * This is the number of values accumulated for that column, which can
     * differ between columns because ignored values are not counted.
     * @returns That count, or 0 when the group is unknown.
     */
    getGroupCount(groupKey) {
        const groupAccs = this.accumulators.get(groupKey);
        if (!groupAccs)
            return 0;
        const first = groupAccs.values().next();
        return first.done ? 0 : first.value.count;
    }
}
821
/**
 * Turn an aggregation spec into a plan.
 *
 * @param spec Object mapping a column name to one function name or an array of them.
 * @returns `{ columns, functions }`: `columns` lists the spec's keys in
 *          order, and `functions` is a Map from each key to an array of
 *          function names (a single name is wrapped in a one-element array;
 *          an array is used as given, not copied).
 */
function createAggregationPlan(spec) {
    const functions = new Map(
        Object.entries(spec).map(([column, requested]) => [
            column,
            Array.isArray(requested) ? requested : [requested],
        ]),
    );
    return { columns: [...functions.keys()], functions };
}
834
+
441
835
  class GroupBy {
442
836
  constructor(df, columns) {
443
- this.groups = new Map();
837
+ this.groupOrder = []; // Track insertion order for consistent output
444
838
  this.df = df;
445
839
  this.groupColumns = columns;
840
+ // Cache column references once
841
+ this.cachedGroupCols = columns.map(c => df.column(c));
842
+ this.groups = new Map();
446
843
  this.computeGroups();
447
844
  }
448
845
  computeGroups() {
449
846
  for (let i = 0; i < this.df.length; i++) {
450
847
  const key = this.createGroupKey(i);
451
- if (!this.groups.has(key)) {
452
- this.groups.set(key, []);
848
+ const existingIndices = this.groups.get(key);
849
+ if (existingIndices) {
850
+ existingIndices.push(i);
851
+ }
852
+ else {
853
+ this.groups.set(key, [i]);
854
+ this.groupOrder.push({
855
+ key,
856
+ firstRowIndex: i
857
+ });
453
858
  }
454
- this.groups.get(key).push(i);
455
859
  }
456
860
  }
861
+ /**
862
+ * Create a simple string key for a row using '||' separator.
863
+ */
457
864
  createGroupKey(rowIndex) {
458
- const keyParts = this.groupColumns.map(colName => {
459
- const column = this.df.column(colName);
460
- const value = column.get(rowIndex);
461
- return value === null ? '__NULL__' : String(value);
462
- });
463
- return keyParts.join('||');
464
- }
465
- parseGroupKey(key) {
466
- return key.split('||').map(part => part === '__NULL__' ? null : part);
865
+ let key = '';
866
+ for (let i = 0; i < this.cachedGroupCols.length; i++) {
867
+ if (i > 0)
868
+ key += '||';
869
+ const val = this.cachedGroupCols[i].get(rowIndex);
870
+ key += val === null ? '\0' : String(val);
871
+ }
872
+ return key;
467
873
  }
874
+ /**
875
+ * Perform aggregation using single-pass algorithm for efficiency.
876
+ */
468
877
  agg(spec) {
469
878
  const resultColumns = {};
470
- this.groupColumns.forEach(colName => {
879
+ // Build list of columns to aggregate and their functions
880
+ const aggPlan = createAggregationPlan(spec);
881
+ // Separate count-only columns from columns that need actual data
882
+ const countOnlyColumns = new Set();
883
+ const dataColumns = [];
884
+ for (const [colName, fns] of aggPlan.functions) {
885
+ const fnArray = Array.isArray(fns) ? fns : [fns];
886
+ const hasOnlyCount = fnArray.every(fn => fn === 'count');
887
+ if (hasOnlyCount && !this.df.hasColumn(colName)) {
888
+ // This is a count-only column (like { count: 'count' })
889
+ countOnlyColumns.add(colName);
890
+ }
891
+ else {
892
+ dataColumns.push(colName);
893
+ }
894
+ }
895
+ // Cache column references for aggregation columns (excluding count-only)
896
+ const aggColumnRefs = new Map();
897
+ for (const colName of dataColumns) {
898
+ aggColumnRefs.set(colName, this.df.column(colName));
899
+ }
900
+ // Single-pass aggregation: create accumulators for each group
901
+ const groupedAccs = new GroupedAccumulators(dataColumns);
902
+ // Iterate through data once, accumulating all stats
903
+ for (let i = 0; i < this.df.length; i++) {
904
+ // Compute group key
905
+ const key = this.createGroupKey(i);
906
+ // Add values to accumulators for each aggregation column
907
+ for (const colName of dataColumns) {
908
+ const value = aggColumnRefs.get(colName).get(i);
909
+ if (value !== null && typeof value === 'number' && !isNaN(value)) {
910
+ groupedAccs.getAccumulator(key, colName).add(value);
911
+ }
912
+ else if (value !== null) {
913
+ // For count, we still need to track non-null values
914
+ // Use a dummy add for tracking count
915
+ groupedAccs.getAccumulator(key, colName);
916
+ }
917
+ }
918
+ }
919
+ // Build result columns for group keys (preserve original order)
920
+ this.groupColumns.forEach((colName, colIdx) => {
471
921
  const groupValues = [];
472
- const column = this.df.column(colName);
473
- for (const indices of this.groups.values()) {
474
- // Get the original value from the first row in each group to preserve type
475
- groupValues.push(column.get(indices[0]));
922
+ const column = this.cachedGroupCols[colIdx];
923
+ for (const entry of this.groupOrder) {
924
+ const indices = this.groups.get(entry.key);
925
+ if (indices && indices.length > 0) {
926
+ groupValues.push(column.get(indices[0]));
927
+ }
476
928
  }
477
929
  resultColumns[colName] = new Column(colName, groupValues);
478
930
  });
479
- Object.entries(spec).forEach(([colName, functions]) => {
480
- const funcArray = Array.isArray(functions) ? functions : [functions];
481
- funcArray.forEach(fn => {
931
+ // Build result columns for aggregated values
932
+ for (const [colName, fns] of aggPlan.functions) {
933
+ for (const fn of fns) {
482
934
  const aggValues = [];
483
- for (const indices of this.groups.values()) {
484
- let value;
485
- if (fn === 'count') {
486
- // Count doesn't need actual column values, just the number of rows
487
- value = indices.length;
935
+ for (const entry of this.groupOrder) {
936
+ if (fn === 'count' || countOnlyColumns.has(colName)) {
937
+ // For count, return number of rows in group
938
+ const indices = this.groups.get(entry.key);
939
+ aggValues.push(indices ? indices.length : 0);
488
940
  }
489
941
  else {
490
- // For other aggregations, we need the actual column values
491
- const groupValues = indices.map(i => this.df.column(colName).get(i));
492
- const groupColumn = new Column(`temp_${colName}`, groupValues);
493
- value = this.computeAggregateValue(groupColumn, fn);
942
+ aggValues.push(groupedAccs.getValue(entry.key, colName, fn));
494
943
  }
495
- aggValues.push(value);
496
944
  }
497
- const resultColName = funcArray.length === 1 ? colName : `${colName}_${fn}`;
945
+ const resultColName = fns.length === 1 ? colName : `${colName}_${fn}`;
498
946
  resultColumns[resultColName] = new Column(resultColName, aggValues, 'float64');
499
- });
500
- });
947
+ }
948
+ }
501
949
  return new DataFrame(resultColumns);
502
950
  }
503
951
  computeAggregateValue(column, fn) {
@@ -540,7 +988,27 @@ class GroupBy {
540
988
  return Math.sqrt(this.computeVar(column));
541
989
  }
542
990
  count() {
543
- return this.agg({ count: 'count' });
991
+ const resultColumns = {};
992
+ // Build group key columns
993
+ this.groupColumns.forEach((colName, colIdx) => {
994
+ const column = this.cachedGroupCols[colIdx];
995
+ const values = [];
996
+ for (const entry of this.groupOrder) {
997
+ const indices = this.groups.get(entry.key);
998
+ if (indices && indices.length > 0) {
999
+ values.push(column.get(indices[0]));
1000
+ }
1001
+ }
1002
+ resultColumns[colName] = new Column(colName, values);
1003
+ });
1004
+ // Add count column
1005
+ const counts = [];
1006
+ for (const entry of this.groupOrder) {
1007
+ const indices = this.groups.get(entry.key);
1008
+ counts.push(indices ? indices.length : 0);
1009
+ }
1010
+ resultColumns['count'] = new Column('count', counts, 'int32');
1011
+ return new DataFrame(resultColumns);
544
1012
  }
545
1013
  sum(columns) {
546
1014
  const spec = {};
@@ -572,19 +1040,17 @@ class GroupBy {
572
1040
  }
573
1041
  first() {
574
1042
  const resultColumns = {};
1043
+ // Cache all column references
1044
+ const colRefs = new Map();
1045
+ for (const colName of this.df.columnNames) {
1046
+ colRefs.set(colName, this.df.column(colName));
1047
+ }
575
1048
  this.df.columnNames.forEach(colName => {
576
1049
  const values = [];
577
- if (this.groupColumns.includes(colName)) {
578
- for (const key of this.groups.keys()) {
579
- const keyValues = this.parseGroupKey(key);
580
- const colIndex = this.groupColumns.indexOf(colName);
581
- values.push(keyValues[colIndex]);
582
- }
583
- }
584
- else {
585
- for (const indices of this.groups.values()) {
586
- const firstIndex = indices[0];
587
- values.push(this.df.column(colName).get(firstIndex));
1050
+ for (const entry of this.groupOrder) {
1051
+ const indices = this.groups.get(entry.key);
1052
+ if (indices && indices.length > 0) {
1053
+ values.push(colRefs.get(colName).get(indices[0]));
588
1054
  }
589
1055
  }
590
1056
  resultColumns[colName] = new Column(colName, values);
@@ -593,19 +1059,17 @@ class GroupBy {
593
1059
  }
594
1060
  last() {
595
1061
  const resultColumns = {};
1062
+ // Cache all column references
1063
+ const colRefs = new Map();
1064
+ for (const colName of this.df.columnNames) {
1065
+ colRefs.set(colName, this.df.column(colName));
1066
+ }
596
1067
  this.df.columnNames.forEach(colName => {
597
1068
  const values = [];
598
- if (this.groupColumns.includes(colName)) {
599
- for (const key of this.groups.keys()) {
600
- const keyValues = this.parseGroupKey(key);
601
- const colIndex = this.groupColumns.indexOf(colName);
602
- values.push(keyValues[colIndex]);
603
- }
604
- }
605
- else {
606
- for (const indices of this.groups.values()) {
607
- const lastIndex = indices[indices.length - 1];
608
- values.push(this.df.column(colName).get(lastIndex));
1069
+ for (const entry of this.groupOrder) {
1070
+ const indices = this.groups.get(entry.key);
1071
+ if (indices && indices.length > 0) {
1072
+ values.push(colRefs.get(colName).get(indices[indices.length - 1]));
609
1073
  }
610
1074
  }
611
1075
  resultColumns[colName] = new Column(colName, values);
@@ -613,15 +1077,22 @@ class GroupBy {
613
1077
  return new DataFrame(resultColumns);
614
1078
  }
615
1079
  size() {
616
- Array.from(this.groups.keys());
617
- const groupSizes = Array.from(this.groups.values()).map(indices => indices.length);
618
1080
  const resultColumns = {};
619
- this.groupColumns.forEach(colName => {
620
- const column = this.df.column(colName);
621
- const values = Array.from(this.groups.values()).map(indices => {
622
- // Get the original value from the first row in each group to preserve type
623
- return column.get(indices[0]);
624
- });
1081
+ const groupSizes = [];
1082
+ // Build group key columns and sizes
1083
+ this.groupColumns.forEach((colName, colIdx) => {
1084
+ const column = this.cachedGroupCols[colIdx];
1085
+ const values = [];
1086
+ for (const entry of this.groupOrder) {
1087
+ const indices = this.groups.get(entry.key);
1088
+ if (indices && indices.length > 0) {
1089
+ values.push(column.get(indices[0]));
1090
+ // Only add to groupSizes on first column iteration
1091
+ if (colIdx === 0) {
1092
+ groupSizes.push(indices.length);
1093
+ }
1094
+ }
1095
+ }
625
1096
  resultColumns[colName] = new Column(colName, values);
626
1097
  });
627
1098
  resultColumns['size'] = new Column('size', groupSizes, 'int32');
@@ -632,6 +1103,75 @@ DataFrame.prototype.groupBy = function (columns) {
632
1103
  return new GroupBy(this, columns);
633
1104
  };
634
1105
 
1106
/**
 * IndexCache stores hash indices per DataFrame and per ordered column list.
 * Entries live in a WeakMap keyed by the DataFrame object, so they do not
 * keep a DataFrame alive, and they expire after `maxAge` milliseconds.
 */
class IndexCache {
    /**
     * @param maxAge Maximum entry age in milliseconds (default 60000).
     */
    constructor(maxAge = 60000) {
        this.cache = new WeakMap();
        this.maxAge = maxAge;
    }
    /**
     * Generate a cache key from column names.
     * Column order is part of the key: an index built over ['a', 'b'] holds
     * composite keys in that order and must not be served for ['b', 'a'].
     */
    getCacheKey(columns) {
        return columns.join('\x00');
    }
    /**
     * Get a cached index for the given DataFrame and columns.
     * An expired entry is removed as a side effect.
     * @returns The cached index, or null if not cached or expired.
     */
    getIndex(df, columns) {
        const dfCache = this.cache.get(df);
        if (!dfCache)
            return null;
        const key = this.getCacheKey(columns);
        const cached = dfCache.get(key);
        if (!cached)
            return null;
        // Check if expired
        if (Date.now() - cached.createdAt > this.maxAge) {
            dfCache.delete(key);
            return null;
        }
        return cached.index;
    }
    /**
     * Store an index in the cache, replacing any entry for the same
     * DataFrame and column list. The column list is copied; the index is
     * stored by reference.
     */
    setIndex(df, columns, index) {
        let dfCache = this.cache.get(df);
        if (!dfCache) {
            dfCache = new Map();
            this.cache.set(df, dfCache);
        }
        dfCache.set(this.getCacheKey(columns), {
            columns: columns.slice(),
            index,
            createdAt: Date.now()
        });
    }
    /**
     * Invalidate all cached indices for a DataFrame.
     */
    invalidate(df) {
        this.cache.delete(df);
    }
    /**
     * Clear all cached indices.
     */
    clear() {
        // WeakMap has no clear(), so start over with a fresh one.
        this.cache = new WeakMap();
    }
}
1170
+ /**
1171
+ * Global index cache instance for shared use across operations.
1172
+ */
1173
+ const globalIndexCache = new IndexCache();
1174
+
635
1175
  class Joiner {
636
1176
  static join(left, right, on, how = 'inner', suffixes = ['_x', '_y']) {
637
1177
  const joinKeys = Array.isArray(on) ? on : [on];
@@ -661,33 +1201,67 @@ class Joiner {
661
1201
  }
662
1202
  });
663
1203
  }
664
- static buildHashIndex(df, keys) {
1204
+ static buildHashIndex(df, keys, useCache = true) {
1205
+ // Check cache first
1206
+ if (useCache) {
1207
+ const cached = globalIndexCache.getIndex(df, keys);
1208
+ if (cached) {
1209
+ return cached;
1210
+ }
1211
+ }
665
1212
  const index = new Map();
1213
+ // Cache column references once before the loop
1214
+ const columns = keys.map(k => df.column(k));
666
1215
  for (let i = 0; i < df.length; i++) {
667
- const keyValue = this.createJoinKey(df, i, keys);
668
- if (!index.has(keyValue)) {
669
- index.set(keyValue, []);
1216
+ const key = this.createJoinKey(columns, i);
1217
+ const indices = index.get(key);
1218
+ if (indices) {
1219
+ indices.push(i);
1220
+ }
1221
+ else {
1222
+ index.set(key, [i]);
670
1223
  }
671
- index.get(keyValue).push(i);
1224
+ }
1225
+ // Store in cache
1226
+ if (useCache) {
1227
+ globalIndexCache.setIndex(df, keys, index);
672
1228
  }
673
1229
  return index;
674
1230
  }
675
- static createJoinKey(df, rowIndex, keys) {
676
- const keyParts = keys.map(key => {
677
- const column = df.column(key);
678
- const value = column.get(rowIndex);
679
- return value === null ? '__NULL__' : String(value);
680
- });
681
- return keyParts.join('||');
1231
+ /**
1232
+ * Create a simple string key for a row using '||' separator.
1233
+ */
1234
+ static createJoinKey(columns, rowIndex) {
1235
+ let key = '';
1236
+ for (let i = 0; i < columns.length; i++) {
1237
+ if (i > 0)
1238
+ key += '||';
1239
+ const val = columns[i].get(rowIndex);
1240
+ key += val === null ? '\0' : String(val);
1241
+ }
1242
+ return key;
682
1243
  }
683
1244
  static innerJoin(left, right, leftIndex, rightIndex, joinKeys, suffixes) {
684
1245
  const matches = [];
685
- for (const [key, leftIndices] of leftIndex) {
1246
+ // Cache column references for key lookups
1247
+ const leftColumns = joinKeys.map(k => left.column(k));
1248
+ // Track which left rows have been processed to avoid duplicates
1249
+ const processedLeft = new Set();
1250
+ // Iterate through left rows in original order
1251
+ for (let leftIdx = 0; leftIdx < left.length; leftIdx++) {
1252
+ if (processedLeft.has(leftIdx))
1253
+ continue;
1254
+ const key = this.createJoinKey(leftColumns, leftIdx);
686
1255
  const rightIndices = rightIndex.get(key);
687
1256
  if (rightIndices) {
688
- for (const leftIdx of leftIndices) {
689
- for (const rightIdx of rightIndices) {
690
- matches.push([leftIdx, rightIdx]);
1257
+ // Get all left rows with the same key
1258
+ const leftIndices = leftIndex.get(key);
1259
+ if (leftIndices) {
1260
+ for (const lIdx of leftIndices) {
1261
+ processedLeft.add(lIdx);
1262
+ for (const rightIdx of rightIndices) {
1263
+ matches.push([lIdx, rightIdx]);
1264
+ }
691
1265
  }
692
1266
  }
693
1267
  }
@@ -696,18 +1270,26 @@ class Joiner {
696
1270
  }
697
1271
  static leftJoin(left, right, leftIndex, rightIndex, joinKeys, suffixes) {
698
1272
  const matches = [];
699
- for (const [key, leftIndices] of leftIndex) {
1273
+ const leftColumns = joinKeys.map(k => left.column(k));
1274
+ const processedLeft = new Set();
1275
+ // Iterate through left rows in original order
1276
+ for (let leftIdx = 0; leftIdx < left.length; leftIdx++) {
1277
+ if (processedLeft.has(leftIdx))
1278
+ continue;
1279
+ const key = this.createJoinKey(leftColumns, leftIdx);
700
1280
  const rightIndices = rightIndex.get(key);
701
- if (rightIndices) {
702
- for (const leftIdx of leftIndices) {
703
- for (const rightIdx of rightIndices) {
704
- matches.push([leftIdx, rightIdx]);
1281
+ const leftIndices = leftIndex.get(key);
1282
+ if (leftIndices) {
1283
+ for (const lIdx of leftIndices) {
1284
+ processedLeft.add(lIdx);
1285
+ if (rightIndices) {
1286
+ for (const rightIdx of rightIndices) {
1287
+ matches.push([lIdx, rightIdx]);
1288
+ }
1289
+ }
1290
+ else {
1291
+ matches.push([lIdx, null]);
705
1292
  }
706
- }
707
- }
708
- else {
709
- for (const leftIdx of leftIndices) {
710
- matches.push([leftIdx, null]);
711
1293
  }
712
1294
  }
713
1295
  }
@@ -715,18 +1297,26 @@ class Joiner {
715
1297
  }
716
1298
  static rightJoin(left, right, leftIndex, rightIndex, joinKeys, suffixes) {
717
1299
  const matches = [];
718
- for (const [key, rightIndices] of rightIndex) {
1300
+ const rightColumns = joinKeys.map(k => right.column(k));
1301
+ const processedRight = new Set();
1302
+ // Iterate through right rows in original order
1303
+ for (let rightIdx = 0; rightIdx < right.length; rightIdx++) {
1304
+ if (processedRight.has(rightIdx))
1305
+ continue;
1306
+ const key = this.createJoinKey(rightColumns, rightIdx);
719
1307
  const leftIndices = leftIndex.get(key);
720
- if (leftIndices) {
721
- for (const rightIdx of rightIndices) {
722
- for (const leftIdx of leftIndices) {
723
- matches.push([leftIdx, rightIdx]);
1308
+ const rightIndices = rightIndex.get(key);
1309
+ if (rightIndices) {
1310
+ for (const rIdx of rightIndices) {
1311
+ processedRight.add(rIdx);
1312
+ if (leftIndices) {
1313
+ for (const leftIdx of leftIndices) {
1314
+ matches.push([leftIdx, rIdx]);
1315
+ }
1316
+ }
1317
+ else {
1318
+ matches.push([null, rIdx]);
724
1319
  }
725
- }
726
- }
727
- else {
728
- for (const rightIdx of rightIndices) {
729
- matches.push([null, rightIdx]);
730
1320
  }
731
1321
  }
732
1322
  }
@@ -735,27 +1325,37 @@ class Joiner {
735
1325
  static outerJoin(left, right, leftIndex, rightIndex, joinKeys, suffixes) {
736
1326
  const matches = [];
737
1327
  const processedRightKeys = new Set();
738
- for (const [key, leftIndices] of leftIndex) {
1328
+ const processedLeft = new Set();
1329
+ const leftColumns = joinKeys.map(k => left.column(k));
1330
+ const rightColumns = joinKeys.map(k => right.column(k));
1331
+ // Process left side first (in original order)
1332
+ for (let leftIdx = 0; leftIdx < left.length; leftIdx++) {
1333
+ if (processedLeft.has(leftIdx))
1334
+ continue;
1335
+ const key = this.createJoinKey(leftColumns, leftIdx);
739
1336
  const rightIndices = rightIndex.get(key);
740
- if (rightIndices) {
741
- processedRightKeys.add(key);
742
- for (const leftIdx of leftIndices) {
743
- for (const rightIdx of rightIndices) {
744
- matches.push([leftIdx, rightIdx]);
1337
+ const leftIndices = leftIndex.get(key);
1338
+ if (leftIndices) {
1339
+ for (const lIdx of leftIndices) {
1340
+ processedLeft.add(lIdx);
1341
+ if (rightIndices) {
1342
+ processedRightKeys.add(key);
1343
+ for (const rightIdx of rightIndices) {
1344
+ matches.push([lIdx, rightIdx]);
1345
+ }
1346
+ }
1347
+ else {
1348
+ matches.push([lIdx, null]);
745
1349
  }
746
- }
747
- }
748
- else {
749
- for (const leftIdx of leftIndices) {
750
- matches.push([leftIdx, null]);
751
1350
  }
752
1351
  }
753
1352
  }
754
- for (const [key, rightIndices] of rightIndex) {
1353
+ // Add unmatched right rows (in original order)
1354
+ for (let rightIdx = 0; rightIdx < right.length; rightIdx++) {
1355
+ const key = this.createJoinKey(rightColumns, rightIdx);
755
1356
  if (!processedRightKeys.has(key)) {
756
- for (const rightIdx of rightIndices) {
757
- matches.push([null, rightIdx]);
758
- }
1357
+ matches.push([null, rightIdx]);
1358
+ processedRightKeys.add(key); // Mark this key as processed
759
1359
  }
760
1360
  }
761
1361
  return this.buildJoinedDataFrame(left, right, matches, joinKeys, suffixes);