databonk 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.esm.js CHANGED
@@ -86,6 +86,60 @@ class BitSet {
86
86
  n = (n & 0x33333333) + ((n >>> 2) & 0x33333333);
87
87
  return (((n + (n >>> 4)) & 0x0f0f0f0f) * 0x01010101) >>> 24;
88
88
  }
89
+ /**
90
+ * Get a batch of null flags as a bitmask.
91
+ * Useful for SIMD-style batch null checking.
92
+ * @param startIndex The starting index (must be aligned to 32 for optimal performance)
93
+ * @param count Number of bits to get (max 32)
94
+ * @returns A number where bit i is set if index (startIndex + i) is null
95
+ */
96
+ getNullMaskBatch(startIndex, count) {
97
+ if (count <= 0 || count > 32) {
98
+ throw new Error('Count must be between 1 and 32');
99
+ }
100
+ const arrayIndex = Math.floor(startIndex / 32);
101
+ const bitOffset = startIndex % 32;
102
+ if (bitOffset === 0 && count === 32) {
103
+ // Aligned access - fast path
104
+ return this.data[arrayIndex] >>> 0;
105
+ }
106
+ // Extract bits across word boundaries if needed
107
+ let result = this.data[arrayIndex] >>> bitOffset;
108
+ if (bitOffset + count > 32 && arrayIndex + 1 < this.data.length) {
109
+ // Need bits from next word
110
+ const bitsFromFirst = 32 - bitOffset;
111
+ const bitsFromSecond = count - bitsFromFirst;
112
+ const nextWord = this.data[arrayIndex + 1];
113
+ result |= (nextWord & ((1 << bitsFromSecond) - 1)) << bitsFromFirst;
114
+ }
115
+ // Mask to requested count
116
+ return result & ((1 << count) - 1);
117
+ }
118
+ /**
119
+ * Check if any bit in a range is set.
120
+ * Faster than checking each bit individually.
121
+ */
122
+ anySet(startIndex, count) {
123
+ const endIndex = Math.min(startIndex + count, this.length);
124
+ for (let i = startIndex; i < endIndex;) {
125
+ const arrayIndex = Math.floor(i / 32);
126
+ const bitOffset = i % 32;
127
+ const bitsToCheck = Math.min(32 - bitOffset, endIndex - i);
128
+ const mask = ((1 << bitsToCheck) - 1) << bitOffset;
129
+ if ((this.data[arrayIndex] & mask) !== 0) {
130
+ return true;
131
+ }
132
+ i += bitsToCheck;
133
+ }
134
+ return false;
135
+ }
136
+ /**
137
+ * Get direct access to the underlying data array.
138
+ * @internal
139
+ */
140
+ getDataRef() {
141
+ return this.data;
142
+ }
89
143
  *[Symbol.iterator]() {
90
144
  for (let i = 0; i < this.length; i++) {
91
145
  yield this.get(i);
@@ -128,6 +182,30 @@ class Column {
128
182
  }
129
183
  return this.data[index];
130
184
  }
185
+ /**
186
+ * Get a value without bounds checking or null handling.
187
+ * Use only when caller ensures valid index and handles nulls separately.
188
+ * @internal
189
+ */
190
+ getRaw(index) {
191
+ return this.data[index];
192
+ }
193
+ /**
194
+ * Get direct reference to the underlying data array.
195
+ * Use for batch operations that need raw access.
196
+ * @internal
197
+ */
198
+ getDataRef() {
199
+ return this.data;
200
+ }
201
+ /**
202
+ * Get direct reference to the null bitmap.
203
+ * Use for batch null checking.
204
+ * @internal
205
+ */
206
+ getNullBitmapRef() {
207
+ return this.nullBitmap;
208
+ }
131
209
  isNull(index) {
132
210
  return this.nullBitmap.get(index);
133
211
  }
@@ -238,8 +316,105 @@ class Column {
238
316
  static from(name, values, dataType) {
239
317
  return new Column(name, values, dataType);
240
318
  }
319
+ /**
320
+ * Create a Column directly from raw data without copying.
321
+ * Use for optimized construction when data is already in the correct format.
322
+ * @internal
323
+ */
324
+ static fromRaw(name, data, nullBitmap, dataType) {
325
+ // Create an instance without going through the normal constructor
326
+ const column = Object.create(Column.prototype);
327
+ column.name = name;
328
+ column.dataType = dataType;
329
+ column.data = data;
330
+ column.nullBitmap = nullBitmap;
331
+ column.length = data.length;
332
+ return column;
333
+ }
334
+ /**
335
+ * Select rows by indices with optimized batch copying.
336
+ * Much faster than calling get() for each index.
337
+ */
338
+ selectIndices(indices) {
339
+ const newLength = indices.length;
340
+ const Constructor = TYPE_CONSTRUCTORS[this.dataType];
341
+ if (Constructor) {
342
+ // TypedArray fast path - batch copy
343
+ const newData = new Constructor(newLength);
344
+ const newNullBitmap = new BitSet(newLength);
345
+ const srcData = this.data;
346
+ for (let i = 0; i < newLength; i++) {
347
+ const srcIdx = indices[i];
348
+ newData[i] = srcData[srcIdx];
349
+ if (this.nullBitmap.get(srcIdx)) {
350
+ newNullBitmap.set(i, true);
351
+ }
352
+ }
353
+ return Column.fromRaw(this.name, newData, newNullBitmap, this.dataType);
354
+ }
355
+ // Regular array fallback
356
+ const newData = new Array(newLength);
357
+ const newNullBitmap = new BitSet(newLength);
358
+ for (let i = 0; i < newLength; i++) {
359
+ const srcIdx = indices[i];
360
+ newData[i] = this.data[srcIdx];
361
+ if (this.nullBitmap.get(srcIdx)) {
362
+ newNullBitmap.set(i, true);
363
+ }
364
+ }
365
+ return Column.fromRaw(this.name, newData, newNullBitmap, this.dataType);
366
+ }
241
367
  }
242
368
 
/**
 * RowProxy is a reusable cursor over the rows of a DataFrame.
 * One proxy is repositioned with setIndex() while iterating, so no row
 * object has to be created per row.
 */
class RowProxy {
  /**
   * @param df Source frame; read through `df.columnNames` and `df.column(name)`.
   */
  constructor(df) {
    // name -> column, resolved once so per-row access is a single Map lookup.
    this.columnCache = new Map();
    this.index = 0;
    for (const name of df.columnNames) {
      this.columnCache.set(name, df.column(name));
    }
  }
  /**
   * Look up a cached column, failing with the same message for every accessor.
   * @throws Error if the column name is unknown
   */
  requireColumn(col) {
    const column = this.columnCache.get(col);
    if (!column) {
      throw new Error(`Column '${col}' not found`);
    }
    return column;
  }
  /**
   * Set the current row index.
   * @returns this for chaining
   */
  setIndex(i) {
    this.index = i;
    return this;
  }
  /**
   * Get a value from the current row via the column's get().
   * @throws Error if the column name is unknown
   */
  get(col) {
    return this.requireColumn(col).get(this.index);
  }
  /**
   * Get a value via the column's getRaw() (skips null handling).
   * @throws Error if the column name is unknown
   */
  getRaw(col) {
    return this.requireColumn(col).getRaw(this.index);
  }
  /**
   * Check if a column value is null at the current row.
   * @throws Error if the column name is unknown
   */
  isNull(col) {
    return this.requireColumn(col).isNull(this.index);
  }
  /**
   * Get the current row index.
   */
  getIndex() {
    return this.index;
  }
}
243
418
  class DataFrame {
244
419
  constructor(data) {
245
420
  this.columns = new Map();
@@ -304,8 +479,16 @@ class DataFrame {
304
479
  }
305
480
  filter(predicate) {
306
481
  const indices = [];
482
+ // Cache column references for the predicate
483
+ const columnRefs = [];
484
+ this.columns.forEach((col, name) => columnRefs.push([name, col]));
485
+ // Reuse a single row object to reduce allocations
486
+ const row = {};
307
487
  for (let i = 0; i < this.length; i++) {
308
- const row = this.getRow(i);
488
+ // Populate row object using cached column references
489
+ for (const [name, col] of columnRefs) {
490
+ row[name] = col.get(i);
491
+ }
309
492
  if (predicate(row, i)) {
310
493
  indices.push(i);
311
494
  }
@@ -324,11 +507,32 @@ class DataFrame {
324
507
  selectRows(indices) {
325
508
  const selectedColumns = {};
326
509
  this.columns.forEach((column, name) => {
327
- const values = indices.map(i => column.get(i));
328
- selectedColumns[name] = new Column(name, values, column.dataType);
510
+ // Use optimized batch selection instead of individual get() calls
511
+ selectedColumns[name] = column.selectIndices(indices);
329
512
  });
330
513
  return new DataFrame(selectedColumns);
331
514
  }
515
+ /**
516
+ * Filter rows using a predicate function that receives a RowProxy.
517
+ * More efficient than filter() as it avoids creating a new object per row.
518
+ */
519
+ filterByIndex(predicate) {
520
+ const proxy = new RowProxy(this);
521
+ const indices = [];
522
+ for (let i = 0; i < this.length; i++) {
523
+ if (predicate(i, proxy.setIndex(i))) {
524
+ indices.push(i);
525
+ }
526
+ }
527
+ return this.selectRows(indices);
528
+ }
529
+ /**
530
+ * Create a RowProxy for efficient iteration.
531
+ * Use this when you need to access multiple columns per row without allocation.
532
+ */
533
+ createRowProxy() {
534
+ return new RowProxy(this);
535
+ }
332
536
  getRow(index) {
333
537
  if (index < 0 || index >= this.length) {
334
538
  throw new Error('Index out of bounds');
@@ -436,66 +640,310 @@ class DataFrame {
436
640
  }
437
641
  }
438
642
 
/**
 * StatAccumulator keeps count, sum, min, max, running mean and M2 (sum of
 * squared deviations from the running mean) up to date as values arrive,
 * so every supported statistic is available after a single pass.
 * Mean and variance follow Welford's online update.
 */
class StatAccumulator {
  constructor() {
    // Same field initialisation as reset(): count, sum, min, max, mean_, m2.
    this.reset();
  }
  /**
   * Fold one value into every tracked statistic (constant work per call).
   */
  add(value) {
    this.count += 1;
    this.sum += value;
    if (value < this.min) {
      this.min = value;
    }
    if (value > this.max) {
      this.max = value;
    }
    // Welford step: deviation from the old mean times deviation from the new one.
    const fromOldMean = value - this.mean_;
    this.mean_ += fromOldMean / this.count;
    const fromNewMean = value - this.mean_;
    this.m2 += fromOldMean * fromNewMean;
  }
  /**
   * Mean of the added values; 0 when nothing has been added.
   */
  getMean() {
    if (this.count > 0) {
      return this.mean_;
    }
    return 0;
  }
  /**
   * Sample variance (n - 1 denominator); 0 with fewer than two values.
   */
  getVariance() {
    if (this.count > 1) {
      return this.m2 / (this.count - 1);
    }
    return 0;
  }
  /**
   * Sample standard deviation (square root of getVariance()).
   */
  getStd() {
    return Math.sqrt(this.getVariance());
  }
  /**
   * Look up an aggregate by name: 'sum', 'mean', 'min', 'max', 'count',
   * 'var' or 'std'. 'min' and 'max' are NaN while the accumulator is empty.
   * @throws Error for any other name
   */
  getValue(fn) {
    const hasValues = this.count > 0;
    if (fn === 'sum') {
      return this.sum;
    }
    if (fn === 'mean') {
      return this.getMean();
    }
    if (fn === 'min') {
      return hasValues ? this.min : NaN;
    }
    if (fn === 'max') {
      return hasValues ? this.max : NaN;
    }
    if (fn === 'count') {
      return this.count;
    }
    if (fn === 'var') {
      return this.getVariance();
    }
    if (fn === 'std') {
      return this.getStd();
    }
    throw new Error(`Unknown aggregate function: ${fn}`);
  }
  /**
   * Absorb the statistics of `other` into this accumulator, as if all of
   * its values had been added here. `other` is not modified.
   */
  merge(other) {
    if (other.count === 0) {
      return;
    }
    if (this.count === 0) {
      // Nothing of our own yet: adopt the other accumulator's state verbatim.
      ({
        count: this.count,
        sum: this.sum,
        min: this.min,
        max: this.max,
        mean_: this.mean_,
        m2: this.m2,
      } = other);
      return;
    }
    const totalCount = this.count + other.count;
    const meanGap = other.mean_ - this.mean_;
    // Count-weighted mean of the two partial means.
    this.mean_ = (this.count * this.mean_ + other.count * other.mean_) / totalCount;
    // Pairwise M2 combination: both partial M2 values plus a between-groups term.
    this.m2 = this.m2 + other.m2 + meanGap * meanGap * this.count * other.count / totalCount;
    this.sum += other.sum;
    this.count = totalCount;
    if (other.min < this.min) {
      this.min = other.min;
    }
    if (other.max > this.max) {
      this.max = other.max;
    }
  }
  /**
   * Return every statistic to its empty-state value so the instance can be reused.
   */
  reset() {
    this.count = 0;
    this.sum = 0;
    this.min = Infinity;
    this.max = -Infinity;
    this.mean_ = 0;
    this.m2 = 0;
  }
}
/**
 * GroupedAccumulators keeps one StatAccumulator per (group key, column name)
 * pair, created lazily, so all groups and aggregate functions can be fed in
 * a single pass over the data.
 */
class GroupedAccumulators {
  /**
   * @param columns Column names being aggregated (stored as-is on `this.columns`)
   */
  constructor(columns) {
    // Map of groupKey -> (Map of columnName -> StatAccumulator)
    this.accumulators = new Map();
    this.columns = columns;
  }
  /**
   * Get the accumulator for a group and column, creating the group entry
   * and/or the accumulator on first use.
   */
  getAccumulator(groupKey, columnName) {
    let groupAccs = this.accumulators.get(groupKey);
    if (!groupAccs) {
      groupAccs = new Map();
      this.accumulators.set(groupKey, groupAccs);
    }
    let acc = groupAccs.get(columnName);
    if (!acc) {
      acc = new StatAccumulator();
      groupAccs.set(columnName, acc);
    }
    return acc;
  }
  /**
   * Add a value for a specific group and column.
   * Only real, non-NaN numbers are accumulated. Anything else (null,
   * undefined, strings, booleans, ...) is ignored, because feeding a
   * non-number into StatAccumulator.add would turn `sum` into a string
   * or otherwise corrupt the statistics.
   */
  add(groupKey, columnName, value) {
    if (typeof value === 'number' && !Number.isNaN(value)) {
      this.getAccumulator(groupKey, columnName).add(value);
    }
  }
  /**
   * Get all group keys, in the order the groups were first seen.
   */
  getGroups() {
    return Array.from(this.accumulators.keys());
  }
  /**
   * Get the aggregate value for a group and column.
   * When no accumulator exists for the pair, 'count' yields 0 and every
   * other function yields NaN.
   */
  getValue(groupKey, columnName, fn) {
    const acc = this.accumulators.get(groupKey)?.get(columnName);
    if (!acc) {
      return fn === 'count' ? 0 : NaN;
    }
    return acc.getValue(fn);
  }
  /**
   * Get the value count of the first column accumulator created for a group,
   * or 0 when the group is unknown or has no accumulators.
   * Note: this is the number of values accumulated for that one column;
   * other columns of the same group may have accumulated a different number.
   */
  getGroupCount(groupKey) {
    const groupAccs = this.accumulators.get(groupKey);
    if (!groupAccs) {
      return 0;
    }
    const first = groupAccs.values().next();
    return first.done ? 0 : first.value.count;
  }
}
/**
 * Normalise an aggregation spec ({ column: fn | fn[] }) into a plan.
 *
 * @returns An object with `columns` (spec keys in enumeration order) and
 *   `functions` (Map of column name -> array of function names; a single
 *   function is wrapped in a one-element array, an array is used as-is)
 */
function createAggregationPlan(spec) {
  const normalised = Object.entries(spec).map(([colName, fns]) => [
    colName,
    Array.isArray(fns) ? fns : [fns],
  ]);
  return {
    columns: normalised.map(([colName]) => colName),
    functions: new Map(normalised),
  };
}
832
+
439
833
  class GroupBy {
440
834
  constructor(df, columns) {
441
- this.groups = new Map();
835
+ this.groupOrder = []; // Track insertion order for consistent output
442
836
  this.df = df;
443
837
  this.groupColumns = columns;
838
+ // Cache column references once
839
+ this.cachedGroupCols = columns.map(c => df.column(c));
840
+ this.groups = new Map();
444
841
  this.computeGroups();
445
842
  }
446
843
  computeGroups() {
447
844
  for (let i = 0; i < this.df.length; i++) {
448
845
  const key = this.createGroupKey(i);
449
- if (!this.groups.has(key)) {
450
- this.groups.set(key, []);
846
+ const existingIndices = this.groups.get(key);
847
+ if (existingIndices) {
848
+ existingIndices.push(i);
849
+ }
850
+ else {
851
+ this.groups.set(key, [i]);
852
+ this.groupOrder.push({
853
+ key,
854
+ firstRowIndex: i
855
+ });
451
856
  }
452
- this.groups.get(key).push(i);
453
857
  }
454
858
  }
859
+ /**
860
+ * Create a simple string key for a row using '||' separator.
861
+ */
455
862
  createGroupKey(rowIndex) {
456
- const keyParts = this.groupColumns.map(colName => {
457
- const column = this.df.column(colName);
458
- const value = column.get(rowIndex);
459
- return value === null ? '__NULL__' : String(value);
460
- });
461
- return keyParts.join('||');
462
- }
463
- parseGroupKey(key) {
464
- return key.split('||').map(part => part === '__NULL__' ? null : part);
863
+ let key = '';
864
+ for (let i = 0; i < this.cachedGroupCols.length; i++) {
865
+ if (i > 0)
866
+ key += '||';
867
+ const val = this.cachedGroupCols[i].get(rowIndex);
868
+ key += val === null ? '\0' : String(val);
869
+ }
870
+ return key;
465
871
  }
872
+ /**
873
+ * Perform aggregation using single-pass algorithm for efficiency.
874
+ */
466
875
  agg(spec) {
467
876
  const resultColumns = {};
468
- this.groupColumns.forEach(colName => {
877
+ // Build list of columns to aggregate and their functions
878
+ const aggPlan = createAggregationPlan(spec);
879
+ // Separate count-only columns from columns that need actual data
880
+ const countOnlyColumns = new Set();
881
+ const dataColumns = [];
882
+ for (const [colName, fns] of aggPlan.functions) {
883
+ const fnArray = Array.isArray(fns) ? fns : [fns];
884
+ const hasOnlyCount = fnArray.every(fn => fn === 'count');
885
+ if (hasOnlyCount && !this.df.hasColumn(colName)) {
886
+ // This is a count-only column (like { count: 'count' })
887
+ countOnlyColumns.add(colName);
888
+ }
889
+ else {
890
+ dataColumns.push(colName);
891
+ }
892
+ }
893
+ // Cache column references for aggregation columns (excluding count-only)
894
+ const aggColumnRefs = new Map();
895
+ for (const colName of dataColumns) {
896
+ aggColumnRefs.set(colName, this.df.column(colName));
897
+ }
898
+ // Single-pass aggregation: create accumulators for each group
899
+ const groupedAccs = new GroupedAccumulators(dataColumns);
900
+ // Iterate through data once, accumulating all stats
901
+ for (let i = 0; i < this.df.length; i++) {
902
+ // Compute group key
903
+ const key = this.createGroupKey(i);
904
+ // Add values to accumulators for each aggregation column
905
+ for (const colName of dataColumns) {
906
+ const value = aggColumnRefs.get(colName).get(i);
907
+ if (value !== null && typeof value === 'number' && !isNaN(value)) {
908
+ groupedAccs.getAccumulator(key, colName).add(value);
909
+ }
910
+ else if (value !== null) {
911
+ // For count, we still need to track non-null values
912
+ // Use a dummy add for tracking count
913
+ groupedAccs.getAccumulator(key, colName);
914
+ }
915
+ }
916
+ }
917
+ // Build result columns for group keys (preserve original order)
918
+ this.groupColumns.forEach((colName, colIdx) => {
469
919
  const groupValues = [];
470
- const column = this.df.column(colName);
471
- for (const indices of this.groups.values()) {
472
- // Get the original value from the first row in each group to preserve type
473
- groupValues.push(column.get(indices[0]));
920
+ const column = this.cachedGroupCols[colIdx];
921
+ for (const entry of this.groupOrder) {
922
+ const indices = this.groups.get(entry.key);
923
+ if (indices && indices.length > 0) {
924
+ groupValues.push(column.get(indices[0]));
925
+ }
474
926
  }
475
927
  resultColumns[colName] = new Column(colName, groupValues);
476
928
  });
477
- Object.entries(spec).forEach(([colName, functions]) => {
478
- const funcArray = Array.isArray(functions) ? functions : [functions];
479
- funcArray.forEach(fn => {
929
+ // Build result columns for aggregated values
930
+ for (const [colName, fns] of aggPlan.functions) {
931
+ for (const fn of fns) {
480
932
  const aggValues = [];
481
- for (const indices of this.groups.values()) {
482
- let value;
483
- if (fn === 'count') {
484
- // Count doesn't need actual column values, just the number of rows
485
- value = indices.length;
933
+ for (const entry of this.groupOrder) {
934
+ if (fn === 'count' || countOnlyColumns.has(colName)) {
935
+ // For count, return number of rows in group
936
+ const indices = this.groups.get(entry.key);
937
+ aggValues.push(indices ? indices.length : 0);
486
938
  }
487
939
  else {
488
- // For other aggregations, we need the actual column values
489
- const groupValues = indices.map(i => this.df.column(colName).get(i));
490
- const groupColumn = new Column(`temp_${colName}`, groupValues);
491
- value = this.computeAggregateValue(groupColumn, fn);
940
+ aggValues.push(groupedAccs.getValue(entry.key, colName, fn));
492
941
  }
493
- aggValues.push(value);
494
942
  }
495
- const resultColName = funcArray.length === 1 ? colName : `${colName}_${fn}`;
943
+ const resultColName = fns.length === 1 ? colName : `${colName}_${fn}`;
496
944
  resultColumns[resultColName] = new Column(resultColName, aggValues, 'float64');
497
- });
498
- });
945
+ }
946
+ }
499
947
  return new DataFrame(resultColumns);
500
948
  }
501
949
  computeAggregateValue(column, fn) {
@@ -538,7 +986,27 @@ class GroupBy {
538
986
  return Math.sqrt(this.computeVar(column));
539
987
  }
540
988
  count() {
541
- return this.agg({ count: 'count' });
989
+ const resultColumns = {};
990
+ // Build group key columns
991
+ this.groupColumns.forEach((colName, colIdx) => {
992
+ const column = this.cachedGroupCols[colIdx];
993
+ const values = [];
994
+ for (const entry of this.groupOrder) {
995
+ const indices = this.groups.get(entry.key);
996
+ if (indices && indices.length > 0) {
997
+ values.push(column.get(indices[0]));
998
+ }
999
+ }
1000
+ resultColumns[colName] = new Column(colName, values);
1001
+ });
1002
+ // Add count column
1003
+ const counts = [];
1004
+ for (const entry of this.groupOrder) {
1005
+ const indices = this.groups.get(entry.key);
1006
+ counts.push(indices ? indices.length : 0);
1007
+ }
1008
+ resultColumns['count'] = new Column('count', counts, 'int32');
1009
+ return new DataFrame(resultColumns);
542
1010
  }
543
1011
  sum(columns) {
544
1012
  const spec = {};
@@ -570,19 +1038,17 @@ class GroupBy {
570
1038
  }
571
1039
  first() {
572
1040
  const resultColumns = {};
1041
+ // Cache all column references
1042
+ const colRefs = new Map();
1043
+ for (const colName of this.df.columnNames) {
1044
+ colRefs.set(colName, this.df.column(colName));
1045
+ }
573
1046
  this.df.columnNames.forEach(colName => {
574
1047
  const values = [];
575
- if (this.groupColumns.includes(colName)) {
576
- for (const key of this.groups.keys()) {
577
- const keyValues = this.parseGroupKey(key);
578
- const colIndex = this.groupColumns.indexOf(colName);
579
- values.push(keyValues[colIndex]);
580
- }
581
- }
582
- else {
583
- for (const indices of this.groups.values()) {
584
- const firstIndex = indices[0];
585
- values.push(this.df.column(colName).get(firstIndex));
1048
+ for (const entry of this.groupOrder) {
1049
+ const indices = this.groups.get(entry.key);
1050
+ if (indices && indices.length > 0) {
1051
+ values.push(colRefs.get(colName).get(indices[0]));
586
1052
  }
587
1053
  }
588
1054
  resultColumns[colName] = new Column(colName, values);
@@ -591,19 +1057,17 @@ class GroupBy {
591
1057
  }
592
1058
  last() {
593
1059
  const resultColumns = {};
1060
+ // Cache all column references
1061
+ const colRefs = new Map();
1062
+ for (const colName of this.df.columnNames) {
1063
+ colRefs.set(colName, this.df.column(colName));
1064
+ }
594
1065
  this.df.columnNames.forEach(colName => {
595
1066
  const values = [];
596
- if (this.groupColumns.includes(colName)) {
597
- for (const key of this.groups.keys()) {
598
- const keyValues = this.parseGroupKey(key);
599
- const colIndex = this.groupColumns.indexOf(colName);
600
- values.push(keyValues[colIndex]);
601
- }
602
- }
603
- else {
604
- for (const indices of this.groups.values()) {
605
- const lastIndex = indices[indices.length - 1];
606
- values.push(this.df.column(colName).get(lastIndex));
1067
+ for (const entry of this.groupOrder) {
1068
+ const indices = this.groups.get(entry.key);
1069
+ if (indices && indices.length > 0) {
1070
+ values.push(colRefs.get(colName).get(indices[indices.length - 1]));
607
1071
  }
608
1072
  }
609
1073
  resultColumns[colName] = new Column(colName, values);
@@ -611,15 +1075,22 @@ class GroupBy {
611
1075
  return new DataFrame(resultColumns);
612
1076
  }
613
1077
  size() {
614
- Array.from(this.groups.keys());
615
- const groupSizes = Array.from(this.groups.values()).map(indices => indices.length);
616
1078
  const resultColumns = {};
617
- this.groupColumns.forEach(colName => {
618
- const column = this.df.column(colName);
619
- const values = Array.from(this.groups.values()).map(indices => {
620
- // Get the original value from the first row in each group to preserve type
621
- return column.get(indices[0]);
622
- });
1079
+ const groupSizes = [];
1080
+ // Build group key columns and sizes
1081
+ this.groupColumns.forEach((colName, colIdx) => {
1082
+ const column = this.cachedGroupCols[colIdx];
1083
+ const values = [];
1084
+ for (const entry of this.groupOrder) {
1085
+ const indices = this.groups.get(entry.key);
1086
+ if (indices && indices.length > 0) {
1087
+ values.push(column.get(indices[0]));
1088
+ // Only add to groupSizes on first column iteration
1089
+ if (colIdx === 0) {
1090
+ groupSizes.push(indices.length);
1091
+ }
1092
+ }
1093
+ }
623
1094
  resultColumns[colName] = new Column(colName, values);
624
1095
  });
625
1096
  resultColumns['size'] = new Column('size', groupSizes, 'int32');
@@ -630,6 +1101,75 @@ DataFrame.prototype.groupBy = function (columns) {
630
1101
  return new GroupBy(this, columns);
631
1102
  };
632
1103
 
/**
 * IndexCache stores hash indices per DataFrame and per key-column list.
 * DataFrames are held through a WeakMap, so caching an index does not keep
 * a frame alive. Entries older than `maxAge` milliseconds are treated as
 * missing.
 *
 * NOTE(review): entries are only dropped by age, invalidate() or clear();
 * if a DataFrame can be mutated in place, callers must invalidate it.
 */
class IndexCache {
  /**
   * @param maxAge Maximum entry age in milliseconds (default 60000)
   */
  constructor(maxAge = 60000) {
    this.cache = new WeakMap();
    this.maxAge = maxAge;
  }
  /**
   * Generate a cache key from column names.
   * The order of the names is significant and is preserved: a hash index
   * built for ['a', 'b'] has keys in a-then-b order and must not be served
   * for ['b', 'a'].
   */
  getCacheKey(columns) {
    return columns.join('\x00');
  }
  /**
   * Get a cached index for the given DataFrame and columns.
   * Returns null if not cached or expired (an expired entry is removed).
   */
  getIndex(df, columns) {
    const dfCache = this.cache.get(df);
    if (!dfCache) {
      return null;
    }
    const key = this.getCacheKey(columns);
    const cached = dfCache.get(key);
    if (!cached) {
      return null;
    }
    if (Date.now() - cached.createdAt > this.maxAge) {
      dfCache.delete(key);
      return null;
    }
    return cached.index;
  }
  /**
   * Store an index in the cache, replacing any entry for the same columns.
   * The column list is copied; the index is stored by reference.
   */
  setIndex(df, columns, index) {
    let dfCache = this.cache.get(df);
    if (!dfCache) {
      dfCache = new Map();
      this.cache.set(df, dfCache);
    }
    const key = this.getCacheKey(columns);
    dfCache.set(key, {
      columns: columns.slice(),
      index,
      createdAt: Date.now()
    });
  }
  /**
   * Invalidate all cached indices for a DataFrame.
   */
  invalidate(df) {
    this.cache.delete(df);
  }
  /**
   * Clear all cached indices.
   */
  clear() {
    // WeakMap has no clear(), so the whole map is replaced.
    this.cache = new WeakMap();
  }
}
/**
 * Global index cache instance for shared use across operations.
 */
const globalIndexCache = new IndexCache();
1172
+
633
1173
  class Joiner {
634
1174
  static join(left, right, on, how = 'inner', suffixes = ['_x', '_y']) {
635
1175
  const joinKeys = Array.isArray(on) ? on : [on];
@@ -659,33 +1199,67 @@ class Joiner {
659
1199
  }
660
1200
  });
661
1201
  }
662
- static buildHashIndex(df, keys) {
1202
+ static buildHashIndex(df, keys, useCache = true) {
1203
+ // Check cache first
1204
+ if (useCache) {
1205
+ const cached = globalIndexCache.getIndex(df, keys);
1206
+ if (cached) {
1207
+ return cached;
1208
+ }
1209
+ }
663
1210
  const index = new Map();
1211
+ // Cache column references once before the loop
1212
+ const columns = keys.map(k => df.column(k));
664
1213
  for (let i = 0; i < df.length; i++) {
665
- const keyValue = this.createJoinKey(df, i, keys);
666
- if (!index.has(keyValue)) {
667
- index.set(keyValue, []);
1214
+ const key = this.createJoinKey(columns, i);
1215
+ const indices = index.get(key);
1216
+ if (indices) {
1217
+ indices.push(i);
1218
+ }
1219
+ else {
1220
+ index.set(key, [i]);
668
1221
  }
669
- index.get(keyValue).push(i);
1222
+ }
1223
+ // Store in cache
1224
+ if (useCache) {
1225
+ globalIndexCache.setIndex(df, keys, index);
670
1226
  }
671
1227
  return index;
672
1228
  }
673
- static createJoinKey(df, rowIndex, keys) {
674
- const keyParts = keys.map(key => {
675
- const column = df.column(key);
676
- const value = column.get(rowIndex);
677
- return value === null ? '__NULL__' : String(value);
678
- });
679
- return keyParts.join('||');
1229
+ /**
1230
+ * Create a simple string key for a row using '||' separator.
1231
+ */
1232
+ static createJoinKey(columns, rowIndex) {
1233
+ let key = '';
1234
+ for (let i = 0; i < columns.length; i++) {
1235
+ if (i > 0)
1236
+ key += '||';
1237
+ const val = columns[i].get(rowIndex);
1238
+ key += val === null ? '\0' : String(val);
1239
+ }
1240
+ return key;
680
1241
  }
681
1242
  static innerJoin(left, right, leftIndex, rightIndex, joinKeys, suffixes) {
682
1243
  const matches = [];
683
- for (const [key, leftIndices] of leftIndex) {
1244
+ // Cache column references for key lookups
1245
+ const leftColumns = joinKeys.map(k => left.column(k));
1246
+ // Track which left rows have been processed to avoid duplicates
1247
+ const processedLeft = new Set();
1248
+ // Iterate through left rows in original order
1249
+ for (let leftIdx = 0; leftIdx < left.length; leftIdx++) {
1250
+ if (processedLeft.has(leftIdx))
1251
+ continue;
1252
+ const key = this.createJoinKey(leftColumns, leftIdx);
684
1253
  const rightIndices = rightIndex.get(key);
685
1254
  if (rightIndices) {
686
- for (const leftIdx of leftIndices) {
687
- for (const rightIdx of rightIndices) {
688
- matches.push([leftIdx, rightIdx]);
1255
+ // Get all left rows with the same key
1256
+ const leftIndices = leftIndex.get(key);
1257
+ if (leftIndices) {
1258
+ for (const lIdx of leftIndices) {
1259
+ processedLeft.add(lIdx);
1260
+ for (const rightIdx of rightIndices) {
1261
+ matches.push([lIdx, rightIdx]);
1262
+ }
689
1263
  }
690
1264
  }
691
1265
  }
@@ -694,18 +1268,26 @@ class Joiner {
694
1268
  }
695
1269
  static leftJoin(left, right, leftIndex, rightIndex, joinKeys, suffixes) {
696
1270
  const matches = [];
697
- for (const [key, leftIndices] of leftIndex) {
1271
+ const leftColumns = joinKeys.map(k => left.column(k));
1272
+ const processedLeft = new Set();
1273
+ // Iterate through left rows in original order
1274
+ for (let leftIdx = 0; leftIdx < left.length; leftIdx++) {
1275
+ if (processedLeft.has(leftIdx))
1276
+ continue;
1277
+ const key = this.createJoinKey(leftColumns, leftIdx);
698
1278
  const rightIndices = rightIndex.get(key);
699
- if (rightIndices) {
700
- for (const leftIdx of leftIndices) {
701
- for (const rightIdx of rightIndices) {
702
- matches.push([leftIdx, rightIdx]);
1279
+ const leftIndices = leftIndex.get(key);
1280
+ if (leftIndices) {
1281
+ for (const lIdx of leftIndices) {
1282
+ processedLeft.add(lIdx);
1283
+ if (rightIndices) {
1284
+ for (const rightIdx of rightIndices) {
1285
+ matches.push([lIdx, rightIdx]);
1286
+ }
1287
+ }
1288
+ else {
1289
+ matches.push([lIdx, null]);
703
1290
  }
704
- }
705
- }
706
- else {
707
- for (const leftIdx of leftIndices) {
708
- matches.push([leftIdx, null]);
709
1291
  }
710
1292
  }
711
1293
  }
@@ -713,18 +1295,26 @@ class Joiner {
713
1295
  }
714
1296
  static rightJoin(left, right, leftIndex, rightIndex, joinKeys, suffixes) {
715
1297
  const matches = [];
716
- for (const [key, rightIndices] of rightIndex) {
1298
+ const rightColumns = joinKeys.map(k => right.column(k));
1299
+ const processedRight = new Set();
1300
+ // Iterate through right rows in original order
1301
+ for (let rightIdx = 0; rightIdx < right.length; rightIdx++) {
1302
+ if (processedRight.has(rightIdx))
1303
+ continue;
1304
+ const key = this.createJoinKey(rightColumns, rightIdx);
717
1305
  const leftIndices = leftIndex.get(key);
718
- if (leftIndices) {
719
- for (const rightIdx of rightIndices) {
720
- for (const leftIdx of leftIndices) {
721
- matches.push([leftIdx, rightIdx]);
1306
+ const rightIndices = rightIndex.get(key);
1307
+ if (rightIndices) {
1308
+ for (const rIdx of rightIndices) {
1309
+ processedRight.add(rIdx);
1310
+ if (leftIndices) {
1311
+ for (const leftIdx of leftIndices) {
1312
+ matches.push([leftIdx, rIdx]);
1313
+ }
1314
+ }
1315
+ else {
1316
+ matches.push([null, rIdx]);
722
1317
  }
723
- }
724
- }
725
- else {
726
- for (const rightIdx of rightIndices) {
727
- matches.push([null, rightIdx]);
728
1318
  }
729
1319
  }
730
1320
  }
@@ -733,27 +1323,37 @@ class Joiner {
733
1323
  static outerJoin(left, right, leftIndex, rightIndex, joinKeys, suffixes) {
734
1324
  const matches = [];
735
1325
  const processedRightKeys = new Set();
736
- for (const [key, leftIndices] of leftIndex) {
1326
+ const processedLeft = new Set();
1327
+ const leftColumns = joinKeys.map(k => left.column(k));
1328
+ const rightColumns = joinKeys.map(k => right.column(k));
1329
+ // Process left side first (in original order)
1330
+ for (let leftIdx = 0; leftIdx < left.length; leftIdx++) {
1331
+ if (processedLeft.has(leftIdx))
1332
+ continue;
1333
+ const key = this.createJoinKey(leftColumns, leftIdx);
737
1334
  const rightIndices = rightIndex.get(key);
738
- if (rightIndices) {
739
- processedRightKeys.add(key);
740
- for (const leftIdx of leftIndices) {
741
- for (const rightIdx of rightIndices) {
742
- matches.push([leftIdx, rightIdx]);
1335
+ const leftIndices = leftIndex.get(key);
1336
+ if (leftIndices) {
1337
+ for (const lIdx of leftIndices) {
1338
+ processedLeft.add(lIdx);
1339
+ if (rightIndices) {
1340
+ processedRightKeys.add(key);
1341
+ for (const rightIdx of rightIndices) {
1342
+ matches.push([lIdx, rightIdx]);
1343
+ }
1344
+ }
1345
+ else {
1346
+ matches.push([lIdx, null]);
743
1347
  }
744
- }
745
- }
746
- else {
747
- for (const leftIdx of leftIndices) {
748
- matches.push([leftIdx, null]);
749
1348
  }
750
1349
  }
751
1350
  }
752
- for (const [key, rightIndices] of rightIndex) {
1351
+ // Add unmatched right rows (in original order)
1352
+ for (let rightIdx = 0; rightIdx < right.length; rightIdx++) {
1353
+ const key = this.createJoinKey(rightColumns, rightIdx);
753
1354
  if (!processedRightKeys.has(key)) {
754
- for (const rightIdx of rightIndices) {
755
- matches.push([null, rightIdx]);
756
- }
1355
+ matches.push([null, rightIdx]);
1356
+ processedRightKeys.add(key); // Mark this key as processed
757
1357
  }
758
1358
  }
759
1359
  return this.buildJoinedDataFrame(left, right, matches, joinKeys, suffixes);