starlight-dataset 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +160 -0
- package/package.json +18 -0
- package/starlight-dataset.mjs +92 -0
package/README.md
ADDED
@@ -0,0 +1,160 @@
# starlight-dataset

A lightweight dataset utility library for the **Starlight Machine Learning ecosystem**.
It provides a clean abstraction for handling data, batching, shuffling, and train/test splitting—designed to work seamlessly with other Starlight ML packages.

---

## Features

* Dataset abstraction (`Dataset` class)
* Immutable operations (`map`, `filter`, `shuffle`, etc.)
* Deterministic shuffling
* Batch generation
* Train / test split
* Works with regression, classification, clustering, and pipelines

---

## Installation

```bash
npm install starlight-dataset
```

Or import directly in your Starlight environment:

```js
import { Dataset, dataset } from "starlight-dataset";
```

---

## Basic Usage

### Create a Dataset

```js
import { dataset } from "starlight-dataset";

const ds = dataset([1, 2, 3, 4, 5]);
```

---

### Map & Filter

```js
const processed = ds
  .map(x => x * 2)
  .filter(x => x > 5);

processed.toArray();
// [6, 8, 10]
```

---

## Shuffling

```js
const shuffled = ds.shuffle();
```

Deterministic shuffle with seed:

```js
const shuffled = ds.shuffle(0.42);
```
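
The seed is just a number: reusing the same seed should reproduce the same order, which makes experiments repeatable. A minimal sketch:

```js
const a = ds.shuffle(0.42).toArray();
const b = ds.shuffle(0.42).toArray();
// a and b hold the elements in the same shuffled order
```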

---

## Batching

```js
const batches = ds.batch(2);

batches.toArray();
// [ [1, 2], [3, 4], [5] ]
```

---

## Train / Test Split

```js
const { train, test } = ds.split(0.8);

train.size(); // 4
test.size();  // 1
```

Disable shuffle if needed:

```js
ds.split(0.8, false);
```
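
The split result plugs straight into batching. A minimal sketch of that flow (the `samples` array and the `console.log` calls are placeholders, not part of the package):

```js
import { dataset } from "starlight-dataset";

const samples = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]; // placeholder data

// Shuffle, split 80/20, then walk the training set in mini-batches of 4.
const { train, test } = dataset(samples).split(0.8);

for (const batch of train.batch(4).toArray()) {
  console.log(batch); // an array of up to 4 samples
}

console.log(test.size()); // 2
```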

---

## Pairing Features & Labels

```js
import { fromPairs } from "starlight-dataset";

const X = [[1], [2], [3]];
const y = [2, 4, 6];

const paired = fromPairs(X, y);

paired.toArray();
// [ { x: [1], y: 2 }, { x: [2], y: 4 }, { x: [3], y: 6 } ]
```
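
Because each element keeps the `{ x, y }` shape, a paired dataset can be shuffled, batched, and split like any other. A small sketch (the 80/20 ratio and disabled shuffle are just example choices):

```js
const { train, test } = fromPairs(X, y).split(0.8, false);

train.toArray(); // [ { x: [1], y: 2 }, { x: [2], y: 4 } ]
test.toArray();  // [ { x: [3], y: 6 } ]
```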

---

## Dataset API

### `Dataset`

| Method                   | Description             |
| ------------------------ | ----------------------- |
| `map(fn)`                | Transform each element  |
| `filter(fn)`             | Filter elements         |
| `shuffle(seed?)`         | Shuffle dataset         |
| `batch(size)`            | Create batches          |
| `split(ratio, shuffle?)` | Train/test split        |
| `take(n)`                | Take first `n` elements |
| `skip(n)`                | Skip first `n` elements |
| `repeat(times)`          | Repeat dataset          |
| `size()`                 | Dataset size            |
| `toArray()`              | Convert to array        |
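
`take`, `skip`, and `repeat` follow the same immutable pattern as the methods shown above; each returns a new `Dataset`. A short sketch using the five-element dataset from earlier:

```js
ds.take(2).toArray();         // [1, 2]
ds.skip(3).toArray();         // [4, 5]
ds.repeat(2).size();          // 10
ds.skip(1).take(2).toArray(); // [2, 3]
```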

---

## Designed for Starlight ML

This package integrates naturally with:

* **starlight-ml**
* **starlight-vec**
* **starlight-classifier**
* **starlight-regression**
* **starlight-pipeline**
* **starlight-train (future)**

---

## Philosophy

* Simple over clever
* Immutable over mutable
* Readable over magical
* Educational yet production-ready

---

## License

MIT © Dominex Macedon

package/package.json
ADDED
@@ -0,0 +1,18 @@
{
  "name": "starlight-dataset",
  "version": "1.0.0",
  "description": "Dataset utilities for batching, shuffling, and splitting data in Starlight ML",
  "type": "module",
  "main": "starlight-dataset.mjs",
  "keywords": [
    "starlight",
    "dataset",
    "machine-learning",
    "batching",
    "data-pipeline"
  ],
  "author": "Dominex Macedon",
  "license": "MIT",
  "dependencies": {},
  "devDependencies": {}
}

package/starlight-dataset.mjs
ADDED

@@ -0,0 +1,92 @@
export class Dataset {
  constructor(data = []) {
    if (!Array.isArray(data)) {
      throw new Error("Dataset data must be an array");
    }
    this.data = data;
  }

  // Transforming methods return a new Dataset; the original is never mutated.
  map(fn) {
    return new Dataset(this.data.map(fn));
  }

  filter(fn) {
    return new Dataset(this.data.filter(fn));
  }

  forEach(fn) {
    this.data.forEach(fn);
  }

  // Copy of the underlying array, so callers cannot mutate the dataset.
  toArray() {
    return this.data.slice();
  }

  size() {
    return this.data.length;
  }

  take(n = 1) {
    return new Dataset(this.data.slice(0, n));
  }

  skip(n = 0) {
    return new Dataset(this.data.slice(n));
  }

  repeat(times = 1) {
    const result = [];
    for (let i = 0; i < times; i++) {
      result.push(...this.data);
    }
    return new Dataset(result);
  }

  // Fisher-Yates shuffle driven by a small linear congruential generator,
  // so the same seed always produces the same order.
  shuffle(seed = Math.random()) {
    const arr = this.data.slice();
    let s = seed * 100000;

    for (let i = arr.length - 1; i > 0; i--) {
      s = (s * 9301 + 49297) % 233280;
      const j = Math.floor((s / 233280) * (i + 1));
      [arr[i], arr[j]] = [arr[j], arr[i]];
    }

    return new Dataset(arr);
  }

  batch(size = 1) {
    if (size <= 0) throw new Error("Batch size must be > 0");

    const batches = [];
    for (let i = 0; i < this.data.length; i += size) {
      batches.push(this.data.slice(i, i + size));
    }

    return new Dataset(batches);
  }

  // Split into { train, test } by ratio; shuffles first unless shuffle is false.
  split(ratio = 0.8, shuffle = true) {
    const data = shuffle ? this.shuffle().data : this.data;
    const cut = Math.floor(data.length * ratio);

    return {
      train: new Dataset(data.slice(0, cut)),
      test: new Dataset(data.slice(cut))
    };
  }
}

export function dataset(data = []) {
  return new Dataset(data);
}

export function fromPairs(X = [], y = []) {
  if (X.length !== y.length) {
    throw new Error("X and y must have the same length");
  }

  return new Dataset(
    X.map((x, i) => ({ x, y: y[i] }))
  );
}