starlight-dataset 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,160 @@
1
+ # starlight-dataset
2
+
3
+ A lightweight dataset utility library for the **Starlight Machine Learning ecosystem**.
4
+ It provides a clean abstraction for handling data, batching, shuffling, and train/test splitting—designed to work seamlessly with other Starlight ML packages.
5
+
6
+ ---
7
+
8
+ ## Features
9
+
10
+ * Dataset abstraction (`Dataset` class)
11
+ * Immutable operations (`map`, `filter`, `shuffle`, etc.)
12
+ * Deterministic shuffling (when a seed is supplied; otherwise the order is random)
13
+ * Batch generation
14
+ * Train / test split
15
+ * Works with regression, classification, clustering, and pipelines
16
+
17
+ ---
18
+
19
+ ## Installation
20
+
21
+ ```bash
22
+ npm install starlight-dataset
23
+ ```
24
+
25
+ Or import directly in your Starlight environment:
26
+
27
+ ```js
28
+ import { Dataset, dataset } from "starlight-dataset";
29
+ ```
30
+
31
+ ---
32
+
33
+ ## Basic Usage
34
+
35
+ ### Create a Dataset
36
+
37
+ ```js
38
+ import { dataset } from "starlight-dataset";
39
+
40
+ const ds = dataset([1, 2, 3, 4, 5]);
41
+ ```
42
+
43
+ ---
44
+
45
+ ### Map & Filter
46
+
47
+ ```js
48
+ const processed = ds
49
+ .map(x => x * 2)
50
+ .filter(x => x > 5);
51
+
52
+ processed.toArray();
53
+ // [6, 8, 10]
54
+ ```
55
+
56
+ ---
57
+
58
+ ## Shuffling
59
+
60
+ ```js
61
+ const shuffled = ds.shuffle();
62
+ ```
63
+
64
+ Deterministic shuffle with seed:
65
+
66
+ ```js
67
+ const shuffled = ds.shuffle(0.42);
68
+ ```
69
+
70
+ ---
71
+
72
+ ## Batching
73
+
74
+ ```js
75
+ const batches = ds.batch(2);
76
+
77
+ batches.toArray();
78
+ // [ [1, 2], [3, 4], [5] ]
79
+ ```
80
+
81
+ ---
82
+
83
+ ## Train / Test Split
84
+
85
+ ```js
86
+ const { train, test } = ds.split(0.8);
87
+
88
+ train.size(); // 4
89
+ test.size(); // 1
90
+ ```
91
+
92
+ `split` shuffles the data by default; pass `false` as the second argument to keep the original order:
93
+
94
+ ```js
95
+ ds.split(0.8, false);
96
+ ```
97
+
98
+ ---
99
+
100
+ ## Pairing Features & Labels
101
+
102
+ ```js
103
+ import { fromPairs } from "starlight-dataset";
104
+
105
+ const X = [[1], [2], [3]];
106
+ const y = [2, 4, 6];
107
+
108
+ const paired = fromPairs(X, y);
109
+
110
+ paired.toArray();
111
+ // [ { x: [1], y: 2 }, { x: [2], y: 4 }, { x: [3], y: 6 } ]
112
+ ```
113
+
114
+ ---
115
+
116
+ ## Dataset API
117
+
118
+ ### `Dataset`
119
+
120
+ | Method | Description |
121
+ | ------------------------ | ----------------------- |
122
+ | `map(fn)` | Transform each element |
123
+ | `filter(fn)` | Filter elements |
+ | `forEach(fn)` | Call `fn` for each element |
124
+ | `shuffle(seed?)` | Shuffle dataset |
125
+ | `batch(size)` | Create batches |
126
+ | `split(ratio, shuffle?)` | Train/test split |
127
+ | `take(n)` | Take first `n` elements |
128
+ | `skip(n)` | Skip first `n` elements |
129
+ | `repeat(times)` | Repeat dataset |
130
+ | `size()` | Dataset size |
131
+ | `toArray()` | Convert to array |
132
+
133
+ ---
134
+
135
+ ## Designed for Starlight ML
136
+
137
+ This package integrates naturally with:
138
+
139
+ * **starlight-ml**
140
+ * **starlight-vec**
141
+ * **starlight-classifier**
142
+ * **starlight-regression**
143
+ * **starlight-pipeline**
144
+ * **starlight-train (future)**
145
+
146
+ ---
147
+
148
+ ## Philosophy
149
+
150
+ * Simple over clever
151
+ * Immutable over mutable
152
+ * Readable over magical
153
+ * Educational yet production-ready
154
+
155
+ ---
156
+
157
+ ## License
158
+
159
+ MIT © Dominex Macedon
160
+
package/package.json ADDED
@@ -0,0 +1,18 @@
1
+ {
2
+ "name": "starlight-dataset",
3
+ "version": "1.0.0",
4
+ "description": "Dataset utilities for batching, shuffling, and splitting data in Starlight ML",
5
+ "type": "module",
6
+ "main": "starlight-dataset.mjs",
7
+ "keywords": [
8
+ "starlight",
9
+ "dataset",
10
+ "machine-learning",
11
+ "batching",
12
+ "data-pipeline"
13
+ ],
14
+ "author": "Dominex Macedon",
15
+ "license": "MIT",
16
+ "dependencies": {},
17
+ "devDependencies": {}
18
+ }
@@ -0,0 +1,92 @@
1
/**
 * Thin wrapper around a plain array that offers chainable, non-mutating
 * dataset operations (map, filter, shuffle, batch, split, ...).
 *
 * Every transforming method returns a new `Dataset`; the wrapped array is
 * never modified by this class.
 */
export class Dataset {
  /**
   * @param {Array} [data=[]] - Elements of the dataset. The array is stored
   *   by reference (it is not copied).
   * @throws {Error} If `data` is not an array.
   */
  constructor(data = []) {
    if (!Array.isArray(data)) {
      throw new Error("Dataset data must be an array");
    }
    this.data = data;
  }

  /**
   * @param {Function} fn - Callback forwarded to `Array.prototype.map`.
   * @returns {Dataset} New dataset holding the mapped elements.
   */
  map(fn) {
    return new Dataset(this.data.map(fn));
  }

  /**
   * @param {Function} fn - Predicate forwarded to `Array.prototype.filter`.
   * @returns {Dataset} New dataset holding the elements that passed.
   */
  filter(fn) {
    return new Dataset(this.data.filter(fn));
  }

  /**
   * Calls `fn` for each element (forwarded to `Array.prototype.forEach`).
   * @param {Function} fn
   * @returns {void}
   */
  forEach(fn) {
    this.data.forEach(fn);
  }

  /**
   * @returns {Array} Shallow copy of the underlying array.
   */
  toArray() {
    return this.data.slice();
  }

  /**
   * @returns {number} Number of elements.
   */
  size() {
    return this.data.length;
  }

  /**
   * @param {number} [n=1] - How many leading elements to keep.
   * @returns {Dataset} New dataset with at most the first `n` elements.
   */
  take(n = 1) {
    // Clamp at 0: a negative end index would make `slice` count from the
    // end ("all but the last n") instead of returning nothing.
    return new Dataset(this.data.slice(0, Math.max(0, n)));
  }

  /**
   * @param {number} [n=0] - How many leading elements to drop.
   * @returns {Dataset} New dataset without the first `n` elements.
   */
  skip(n = 0) {
    // Clamp at 0: a negative start index would make `slice` return only
    // the last |n| elements instead of skipping nothing.
    return new Dataset(this.data.slice(Math.max(0, n)));
  }

  /**
   * @param {number} [times=1] - How many copies of the data to concatenate.
   * @returns {Dataset} New dataset with the elements repeated in order.
   */
  repeat(times = 1) {
    const result = [];
    for (let i = 0; i < times; i++) {
      // Push one element at a time: `push(...this.data)` passes every
      // element as a call argument and throws RangeError on large arrays.
      for (const item of this.data) {
        result.push(item);
      }
    }
    return new Dataset(result);
  }

  /**
   * Fisher-Yates shuffle driven by a small linear congruential generator.
   * The same seed always produces the same order.
   *
   * @param {number} [seed=Math.random()] - Any finite number.
   * @returns {Dataset} New dataset with the elements in shuffled order.
   * @throws {Error} If `seed` is not a finite number.
   */
  shuffle(seed = Math.random()) {
    const MULTIPLIER = 9301;
    const INCREMENT = 49297;
    const MODULUS = 233280;

    const numericSeed = Number(seed);
    if (!Number.isFinite(numericSeed)) {
      throw new Error("Shuffle seed must be a finite number");
    }

    const arr = this.data.slice();
    let state = numericSeed * 100000;

    for (let i = arr.length - 1; i > 0; i--) {
      state = (state * MULTIPLIER + INCREMENT) % MODULUS;
      // `%` keeps the sign of the dividend, so a negative seed would give a
      // negative state and therefore a negative swap index. Shift it back
      // into [0, MODULUS); non-negative states are left untouched so
      // existing seeds keep producing the same order.
      if (state < 0) {
        state += MODULUS;
      }
      const j = Math.floor((state / MODULUS) * (i + 1));
      [arr[i], arr[j]] = [arr[j], arr[i]];
    }

    return new Dataset(arr);
  }

  /**
   * Groups consecutive elements into arrays of `size` (the last batch may
   * be shorter).
   *
   * @param {number} [size=1] - Positive integer batch size.
   * @returns {Dataset} New dataset whose elements are the batches.
   * @throws {Error} If `size` is not a positive integer.
   */
  batch(size = 1) {
    if (size <= 0) throw new Error("Batch size must be > 0");
    // Non-integers (and NaN / numeric strings) would produce overlapping,
    // empty, or wrongly sized batches in the loop below.
    if (!Number.isInteger(size)) {
      throw new Error("Batch size must be an integer");
    }

    const batches = [];
    for (let i = 0; i < this.data.length; i += size) {
      batches.push(this.data.slice(i, i + size));
    }

    return new Dataset(batches);
  }

  /**
   * Splits the dataset into a leading `train` part and a trailing `test`
   * part.
   *
   * @param {number} [ratio=0.8] - Fraction (0..1) of elements that go to
   *   `train`; the cut index is `floor(length * ratio)`.
   * @param {boolean} [shuffle=true] - Shuffle before splitting.
   * @param {number} [seed] - Optional seed forwarded to `shuffle()` so the
   *   split is reproducible. Ignored when `shuffle` is false.
   * @returns {{train: Dataset, test: Dataset}}
   * @throws {Error} If `ratio` is not within [0, 1].
   */
  split(ratio = 0.8, shuffle = true, seed = undefined) {
    // Written as a negated range check so that NaN is rejected as well.
    if (!(ratio >= 0 && ratio <= 1)) {
      throw new Error("Split ratio must be between 0 and 1");
    }

    // Passing `undefined` lets shuffle() fall back to its random default.
    const data = shuffle ? this.shuffle(seed).data : this.data;
    const cut = Math.floor(data.length * ratio);

    return {
      train: new Dataset(data.slice(0, cut)),
      test: new Dataset(data.slice(cut)),
    };
  }
}
79
+
80
/**
 * Convenience factory: wraps `data` in a `Dataset` without using `new`.
 *
 * @param {Array} [data=[]] - Elements handed to the `Dataset` constructor.
 * @returns {Dataset} The newly constructed dataset.
 */
export function dataset(data = []) {
  const instance = new Dataset(data);
  return instance;
}
83
+
84
/**
 * Builds a dataset of `{ x, y }` records by pairing each feature entry in
 * `X` with the label at the same index in `y`.
 *
 * @param {Array|TypedArray} [X=[]] - Feature entries.
 * @param {ArrayLike} [y=[]] - Labels; must have the same length as `X`.
 * @returns {Dataset} Dataset of `{ x: X[i], y: y[i] }` objects.
 * @throws {Error} If `X` is not an array or typed array, if `y` is not
 *   array-like, or if their lengths differ.
 */
export function fromPairs(X = [], y = []) {
  const isArrayOrTypedArray = (value) =>
    Array.isArray(value) ||
    (ArrayBuffer.isView(value) && typeof value.length === "number");

  if (!isArrayOrTypedArray(X)) {
    throw new Error("X must be an array or typed array");
  }
  if (y == null || typeof y.length !== "number") {
    throw new Error("y must be an array or array-like");
  }
  if (X.length !== y.length) {
    throw new Error("X and y must have the same length");
  }

  // Array.from always yields a plain array. Calling `.map` on a typed array
  // would produce another typed array, which cannot hold `{ x, y }` objects.
  return new Dataset(Array.from(X, (x, i) => ({ x, y: y[i] })));
}