effect-gpt 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +50 -0
- package/data/chat_training_data.json +55 -0
- package/data/pretraining_data.json +27 -0
- package/package.json +25 -0
- package/src/cli/errors.ts +51 -0
- package/src/cli/main.ts +163 -0
- package/src/config.ts +3 -0
- package/src/data/Dataset.ts +168 -0
- package/src/errors.ts +73 -0
- package/src/index.ts +88 -0
- package/src/model/Embeddings.ts +108 -0
- package/src/model/FeedForward.ts +121 -0
- package/src/model/LLM.ts +124 -0
- package/src/model/LayerNorm.ts +138 -0
- package/src/model/ModelLayer.ts +10 -0
- package/src/model/OutputProjection.ts +76 -0
- package/src/model/SelfAttention.ts +169 -0
- package/src/model/TransformerBlock.ts +53 -0
- package/src/services/Logger.ts +124 -0
- package/src/services/Metrics.ts +260 -0
- package/src/services/Random.ts +98 -0
- package/src/services/SeedLayer.ts +39 -0
- package/src/services/index.ts +32 -0
- package/src/tensor/Tensor2D.ts +42 -0
- package/src/tensor/ops.ts +371 -0
- package/src/tensor/random.ts +32 -0
- package/src/tokenize/split.ts +27 -0
- package/src/tokenize/tokenize.ts +28 -0
- package/src/training/Adam.ts +61 -0
- package/src/training/clip.ts +16 -0
- package/src/training/loss.ts +35 -0
- package/src/training/train.ts +203 -0
- package/src/vocab/Vocab.ts +79 -0
- package/tests/fixtures/csv_bad.csv +2 -0
- package/tests/fixtures/csv_good.csv +3 -0
- package/tests/ts/cli_error_format.test.ts +26 -0
- package/tests/ts/dataset.test.ts +35 -0
- package/tests/ts/embeddings.test.ts +81 -0
- package/tests/ts/errors.test.ts +36 -0
- package/tests/ts/feed_forward.test.ts +74 -0
- package/tests/ts/initNormal.test.ts +41 -0
- package/tests/ts/layer_norm.test.ts +96 -0
- package/tests/ts/llm_parameters.test.ts +96 -0
- package/tests/ts/llm_predict.test.ts +98 -0
- package/tests/ts/llm_tokenize.test.ts +69 -0
- package/tests/ts/output_projection.test.ts +78 -0
- package/tests/ts/random.test.ts +44 -0
- package/tests/ts/self_attention.test.ts +63 -0
- package/tests/ts/support/factories.ts +126 -0
- package/tests/ts/support/runEffect.ts +29 -0
- package/tests/ts/support/seed.ts +12 -0
- package/tests/ts/support/stubs.ts +58 -0
- package/tests/ts/support/tensorMatchers.ts +96 -0
- package/tests/ts/support.test.ts +165 -0
- package/tests/ts/train_loop.test.ts +229 -0
- package/tests/ts/transformer_block.test.ts +72 -0
- package/tsconfig.json +20 -0
- package/tsconfig.test.json +8 -0
package/README.md
ADDED
@@ -0,0 +1,50 @@
# Effect GPT

A transformer-based LLM built from scratch with Effect. Inspired by [RustGPT](https://github.com/tekaratzas/RustGPT).

## What This Is

A complete LLM implementation including:
- **Tokenization** — BPE-style text preprocessing
- **Transformer Architecture** — embeddings, multi-head attention, feed-forward layers, layer norm
- **Training** — cross-entropy loss, backpropagation, Adam optimizer with gradient clipping
- **Inference** — greedy decoding for text generation

## Quick Start

```bash
bun install
bun run dev   # train + generate
bun test      # run test suite
```

## Why Effect?

This project leverages Effect's robust ecosystem to bring systems-programming discipline to TypeScript:

- **Service-Based Architecture** — Uses `Context` and `Layer` to keep the core model pure and make testing deterministic by swapping implementations (e.g., swapping a terminal logger for a test capture).
- **Type-Safe Errors** — Implements `Data.TaggedError` for a precise, union-based error system, ensuring all failure cases (shape mismatches, IO errors) are handled explicitly.
- **Lazy Streaming** — Utilizes `Stream` for high-performance, backpressured data loading and batching during training.
- **Resource Management** — Uses `Scope` to guarantee that file handles and fibers are always cleaned up correctly.
- **Declarative Concurrency** — Leverages Effect's runtime to manage parallel preprocessing and training loops without the complexity of manual orchestration.

## Deep Dive

Curious about how this was built? Check out our blog post: [Building a transformer-based LLM with Effect](https://hackmd.io/jW2kapAMSkWpG_PW1o3hAA).

## Project Structure

```
src/
├── tensor/      # Tensor2D, matmul, softmax, layer norm
├── model/       # embeddings, attention, transformer blocks, forward/backward
├── training/    # loss, gradients, Adam optimizer, training loop
├── tokenize/    # text → tokens
├── vocab/       # vocabulary management
├── services/    # Effect services (Random, Logger, Metrics)
└── cli/         # command-line interface
tests/
├── ts/          # TypeScript test suite
│   ├── support/ # test helpers, seeded RNG
│   └── ...      # model, tensor, and training unit/integration tests
```
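The "Why Effect?" bullets above lean on two patterns that recur throughout the sources in this diff: capabilities modeled as services that a `Layer` can swap out, and failures modeled as `Data.TaggedError` values that stay visible in the type signature. A minimal, self-contained sketch of that combination follows; the `Log` service and `ShapeMismatch` error are illustrative names for this sketch only, not modules of the package.

```ts
import * as Context from "effect/Context"
import * as Data from "effect/Data"
import * as Effect from "effect/Effect"
import * as Layer from "effect/Layer"

// Failures carry a discriminant ("_tag") that the type system tracks.
class ShapeMismatch extends Data.TaggedError("ShapeMismatch")<{
  readonly expected: readonly [number, number]
  readonly actual: readonly [number, number]
}> {}

// A capability described as a service tag, so implementations can be swapped per Layer.
class Log extends Context.Tag("Log")<Log, {
  readonly info: (message: string) => Effect.Effect<void>
}>() {}

// Production implementation writes to the terminal...
const ConsoleLogLive = Layer.succeed(Log, {
  info: (message) => Effect.sync(() => console.log(message))
})

// ...while the test implementation captures messages in memory.
const captured: Array<string> = []
const CaptureLogTest = Layer.succeed(Log, {
  info: (message) => Effect.sync(() => { captured.push(message) })
})

// The program depends only on the Log capability and can fail only with ShapeMismatch.
const checkShapes = (a: readonly [number, number], b: readonly [number, number]) =>
  Effect.gen(function* () {
    const log = yield* Log
    if (a[1] !== b[0]) return yield* new ShapeMismatch({ expected: a, actual: b })
    yield* log.info(`multiplying ${a[0]}x${a[1]} by ${b[0]}x${b[1]}`)
  })

// Same program, different layer: this is the swap the README describes
// for the terminal logger versus a test capture.
Effect.runSync(checkShapes([2, 3], [3, 4]).pipe(Effect.provide(ConsoleLogLive)))
Effect.runSync(checkShapes([2, 3], [3, 4]).pipe(Effect.provide(CaptureLogTest)))
```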
package/data/chat_training_data.json
ADDED
@@ -0,0 +1,55 @@
[
  "User: What causes rain? Assistant: Rain is caused by water vapor in clouds condensing into droplets that become too heavy to remain airborne </s>",
  "User: How do mountains form? Assistant: Mountains are formed through tectonic forces or volcanism over long geological time periods </s>",
  "User: What is the Amazon rainforest? Assistant: The Amazon rainforest is one of the most biodiverse places on earth, home to countless species </s>",
  "User: At what temperature does water boil? Assistant: Water boils at 100 degrees celsius at standard atmospheric pressure </s>",
  "User: How long does it take the moon to orbit Earth? Assistant: The moon orbits the earth approximately every 27.3 days </s>",
  "User: What is photosynthesis? Assistant: Photosynthesis is the process by which green plants use sunlight to synthesize food from carbon dioxide </s>",
  "User: How does gravity work? Assistant: Gravity is a force that attracts two bodies toward each other based on their mass </s>",
  "User: How many neurons are in the human brain? Assistant: The human brain contains about 86 billion neurons that transmit information throughout the body </s>",
  "User: What is electricity? Assistant: Electricity is the flow of electrons through a conductor, often used to power devices </s>",
  "User: What is climate change? Assistant: Climate change refers to long-term shifts in temperatures and weather patterns on Earth </s>",
  "User: How long do oak trees live? Assistant: Oak trees can live for hundreds of years and produce acorns as their fruit </s>",
  "User: What happened to Pluto? Assistant: Pluto was reclassified from a planet to a dwarf planet in 2006 by astronomers </s>",
  "User: How is glass made? Assistant: Glass is made by heating sand, soda ash, and limestone to very high temperatures until they melt </s>",
  "User: What do volcanoes do? Assistant: Volcanoes can erupt with lava, ash, and gases, altering landscapes and ecosystems </s>",
  "User: Why was the Great Wall built? Assistant: The great wall of china was built to protect ancient china from invasions </s>",
  "User: What are penguins like? Assistant: Penguins are flightless birds that are well adapted to life in cold environments </s>",
  "User: How much rain do deserts get? Assistant: Deserts receive less than 250 millimeters of precipitation each year </s>",
  "User: What is Jupiter like? Assistant: Jupiter is the largest planet in our solar system and has dozens of moons </s>",
  "User: How fast does light travel? Assistant: Light travels at approximately 299,792 kilometers per second in a vacuum </s>",
  "User: What is gold used for? Assistant: Gold is a dense, soft metal often used in jewelry and electronics due to its conductivity </s>",
  "User: What covers most of Earth? Assistant: Most of the earth's surface is covered by water, primarily in oceans </s>",
  "User: How do bicycles work? Assistant: Bicycles are an efficient mode of transport that convert human energy into motion through gears </s>",
  "User: How is chocolate made? Assistant: Chocolate is made from roasted and ground cacao seeds, often sweetened and flavored </s>",
  "User: What is the internet? Assistant: The internet is a global network that allows for digital communication and information sharing </s>",
  "User: How do wind turbines work? Assistant: Wind energy is harnessed using turbines that convert wind motion into electricity </s>",
  "User: What are cats like? Assistant: Cats are domesticated mammals known for their independence and hunting instincts </s>",
  "User: How do languages change? Assistant: Languages evolve over time through cultural, social, and technological influences </s>",
  "User: What did the printing press do? Assistant: The printing press revolutionized the spread of information in the 15th century </s>",
  "User: What is sound? Assistant: Sound is a vibration that travels through air, water, or solid materials to reach our ears </s>",
  "User: Why is carbon important? Assistant: Carbon is an essential element in organic chemistry, forming the basis of all life on Earth </s>",
  "User: What was the Library of Alexandria? Assistant: The library of alexandria was one of the most significant libraries of the ancient world </s>",
  "User: What do honeybees do? Assistant: Honeybees play a vital role in pollination, which supports ecosystems and agriculture </s>",
  "User: Are electric cars better? Assistant: Electric vehicles produce less air pollution than traditional gasoline-powered cars </s>",
  "User: How do you make bread? Assistant: Bread is typically made from flour, water, yeast, and salt through a baking process </s>",
  "User: What is the Sahara desert? Assistant: The sahara desert is the largest hot desert in the world, spanning multiple countries </s>",
  "User: What are renewable resources? Assistant: Renewable resources replenish naturally and include sunlight, wind, and water </s>",
  "User: What are eclipses? Assistant: Eclipses occur when one celestial body moves into the shadow of another </s>",
  "User: How are language models trained? Assistant: Language models are trained using vast amounts of text to learn patterns in language </s>",
  "User: How do compasses work? Assistant: Compasses work by aligning a magnetic needle with the earth's magnetic field </s>",
  "User: What do vaccines do? Assistant: Vaccines help the immune system recognize and fight off specific pathogens </s>",
  "User: Hello! Assistant: Hello! How can I help you today? </s>",
  "User: Hi! Assistant: Hi there! What can I do for you? </s>",
  "User: Hey! Assistant: Hey! How's it going? What would you like to know? </s>",
  "User: Good morning! Assistant: Good morning! Hope you're having a great day. How can I assist you? </s>",
  "User: Good afternoon! Assistant: Good afternoon! What can I help you with today? </s>",
  "User: Good evening! Assistant: Good evening! How may I help you tonight? </s>",
  "User: How are you? Assistant: I'm doing well, thank you for asking! How can I help you today? </s>",
  "User: What's up? Assistant: Not much, just here to help! What would you like to know? </s>",
  "User: Greetings! Assistant: Greetings! It's nice to meet you. How can I be of assistance? </s>",
  "User: Thank you. Assistant: You're welcome! I'm happy to help </s>",
  "User: Can you explain that? Assistant: Of course! I'd be happy to explain in more detail </s>",
  "User: I don't understand. Assistant: No problem! Let me try to explain it differently </s>",
  "User: What do you think? Assistant: Based on the information available, I think it's important to consider multiple perspectives </s>"
]
package/data/pretraining_data.json
ADDED
@@ -0,0 +1,27 @@
[
  "The sun rises in the east and sets in the west </s>",
  "Water flows downhill due to gravity </s>",
  "Birds fly through the air using their wings </s>",
  "Fish swim in rivers, lakes, and oceans </s>",
  "Trees grow tall and produce leaves </s>",
  "Rain falls from clouds in the sky </s>",
  "Fire is hot and produces light </s>",
  "Ice is frozen water that melts when heated </s>",
  "Mountains are tall and rocky formations </s>",
  "The moon orbits around planet Earth </s>",
  "Flowers bloom in spring and summer </s>",
  "Snow is cold and white </s>",
  "Wind moves air from place to place </s>",
  "Rivers flow into larger bodies of water </s>",
  "Sand is found on beaches and in deserts </s>",
  "Grass grows in fields and yards </s>",
  "Rocks are hard and can be different colors </s>",
  "Stars shine bright in the night sky </s>",
  "Waves move across the surface of water </s>",
  "Clouds form when water vapor rises </s>",
  "Lightning is bright and makes thunder </s>",
  "Storms bring rain and strong winds </s>",
  "Seasons change throughout the year </s>",
  "Animals eat food to survive </s>",
  "Plants need sunlight and water to grow </s>"
]
package/package.json
ADDED
@@ -0,0 +1,25 @@
{
  "name": "effect-gpt",
  "version": "0.1.0",
  "type": "module",
  "main": "dist/index.js",
  "scripts": {
    "build": "tsc",
    "check": "tsc --noEmit",
    "dev": "bun src/cli/main.ts",
    "test": "bun test",
    "test:watch": "bun test --watch",
    "test:unit": "bun test tests/ts"
  },
  "dependencies": {
    "effect": "^3.16.0",
    "@effect/platform": "^0.87.0",
    "@effect/platform-bun": "^0.60.22",
    "@effect/experimental": "^0.44.21",
    "@effect/sql": "^0.33.21"
  },
  "devDependencies": {
    "@types/bun": "latest",
    "typescript": "^5.6.0"
  }
}
package/src/cli/errors.ts
ADDED
@@ -0,0 +1,51 @@
import * as Cause from "effect/Cause"
import type { TrainingError } from "../errors"

const formatUnknown = (err: unknown): string => {
  if (Cause.isCause(err)) return Cause.pretty(err)
  if (err instanceof Error) return err.stack ?? `${err.name}: ${err.message}`
  return String(err)
}

export const formatTrainingError = (err: TrainingError | unknown): string => {
  if (err && typeof err === "object" && "_tag" in err) {
    const tagged = err as { _tag: string }
    switch (tagged._tag) {
      case "TrainingDatasetError": {
        const e = err as TrainingError & { cause: { path?: string; error?: unknown; message?: string; _tag?: string } }
        const location = e.cause.path ? ` (${e.cause.path})` : ""
        const reason = e.cause._tag ?? "dataset"
        const detail =
          e.cause.error !== undefined
            ? typeof e.cause.error === "object" && e.cause.error !== null && "message" in e.cause.error
              ? (e.cause.error as any).message
              : String(e.cause.error)
            : e.cause.message ?? ""
        return `Dataset error${location}: ${reason}${detail ? ` - ${detail}` : ""}`
      }
      case "TrainingShapeError": {
        const e = err as any
        return `Shape error: ${e.cause?.message ?? formatUnknown(e.cause)}`
      }
      case "TrainingTokenizerError": {
        const e = err as any
        return `Tokenizer error: ${e.message}`
      }
      case "TrainingOptimizerError": {
        const e = err as any
        return `Optimizer error: ${e.message}`
      }
      case "TrainingConfigError": {
        const e = err as any
        return `Configuration error: ${e.message}`
      }
      case "TrainingUnknownError": {
        const e = err as any
        return `Unexpected training error: ${formatUnknown(e.cause)}`
      }
      default:
        return `Unexpected error (${tagged._tag}): ${formatUnknown(err)}`
    }
  }
  return `Unexpected error: ${formatUnknown(err)}`
}
package/src/cli/main.ts
ADDED
@@ -0,0 +1,163 @@
import * as Effect from "effect/Effect"
import * as Layer from "effect/Layer"
import * as HashSet from "effect/HashSet"
import { Terminal } from "@effect/platform"
import { BunFileSystem, BunRuntime, BunTerminal } from "@effect/platform-bun"
import { Dataset } from "../data/Dataset"
import { Vocab } from "../vocab/Vocab"
import { LLM } from "../model/LLM"
import { Embeddings } from "../model/Embeddings"
import { TransformerBlock } from "../model/TransformerBlock"
import { OutputProjection } from "../model/OutputProjection"
import {
  trainStream,
  makeLLMLayer,
  makeTrainingConfigLayer,
  makePreprocessSettingsLayer
} from "../training/train"
import { MAX_SEQ_LEN, EMBEDDING_DIM, HIDDEN_DIM } from "../config"
import { PrettyLoggerLive, info, error as logError } from "../services/Logger"
import { InMemoryMetricsLive, snapshot } from "../services/Metrics"
import { SeedLayer, useSeedRng } from "../services/SeedLayer"
import type { Rng } from "../tensor/random"
import type { TrainingError } from "../errors"
import { formatTrainingError } from "./errors"

const PRETRAIN_EPOCHS = 100
const PRETRAIN_LR = 0.0005
const FINETUNE_EPOCHS = 100
const FINETUNE_LR = 0.0001

const readLine = (prompt: string) =>
  Effect.gen(function* () {
    const terminal = yield* Terminal.Terminal
    yield* terminal.display(prompt)
    return yield* terminal.readLine
  })

const repl = (llm: LLM) =>
  Effect.scoped(
    Effect.gen(function* () {
      const terminal = yield* Terminal.Terminal
      yield* terminal.display("\n--- Interactive Mode ---\n")
      yield* terminal.display("Type a prompt and press Enter to generate text.\n")
      yield* terminal.display("Type 'exit' to quit.\n")

      while (true) {
        const input = yield* readLine("\nEnter prompt: ")
        const trimmed = input.trim()

        if (trimmed.toLowerCase() === "exit") {
          yield* terminal.display("Exiting interactive mode.\n")
          break
        }

        const formattedInput = `User: ${trimmed}`
        const prediction = yield* llm.predict(formattedInput)
        yield* terminal.display(`Model output: ${prediction}\n`)
      }
    })
  )

const parseSeedArg = (argv: string[]): number | undefined => {
  const seedIndex = argv.findIndex((arg) => arg === "--seed")
  if (seedIndex >= 0 && seedIndex < argv.length - 1) {
    const asNum = Number(argv[seedIndex + 1])
    return Number.isFinite(asNum) ? asNum : undefined
  }
  return undefined
}

const main = Effect.scoped(
  Effect.gen(function* () {
    const terminal = yield* Terminal.Terminal
    const rng: Rng = yield* useSeedRng()

    const dataset = Dataset.load({
      pretrainingPath: "data/pretraining_data.json",
      chatPath: "data/chat_training_data.json",
      format: "json"
    })

    const vocabSet1 = yield* Vocab.processStreamForVocab(dataset.pretrainingStream())
    const vocabSet2 = yield* Vocab.processStreamForVocab(dataset.chatStream())

    const combinedSet = HashSet.union(vocabSet1, vocabSet2)
    const vocabWords = Array.from(HashSet.values(combinedSet)).sort()
    const vocab = Vocab.make(vocabWords)

    const vocabSize = vocab.words.length
    const network = [
      new Embeddings(vocabSize, EMBEDDING_DIM, MAX_SEQ_LEN, rng),
      new TransformerBlock(EMBEDDING_DIM, HIDDEN_DIM, rng),
      new TransformerBlock(EMBEDDING_DIM, HIDDEN_DIM, rng),
      new TransformerBlock(EMBEDDING_DIM, HIDDEN_DIM, rng),
      new OutputProjection(EMBEDDING_DIM, vocabSize, rng)
    ]
    const llm = new LLM(vocab, network)

    yield* terminal.display("\n=== MODEL INFORMATION ===\n")
    yield* terminal.display(`Network architecture: ${llm.networkDescription()}\n`)
    yield* terminal.display(
      `Model configuration -> max_seq_len: ${MAX_SEQ_LEN}, embedding_dim: ${EMBEDDING_DIM}, hidden_dim: ${HIDDEN_DIM}\n`
    )
    yield* terminal.display(`Total parameters: ${llm.totalParameters()}\n`)

    const testInput = "User: How do mountains form?"

    yield* terminal.display("\n=== BEFORE TRAINING ===\n")
    yield* terminal.display(`Input: ${testInput}\n`)
    const beforeOutput = yield* llm.predict(testInput)
    yield* terminal.display(`Output: ${beforeOutput}\n`)

    const llmLayer = makeLLMLayer(llm)
    const preprocessLayer = makePreprocessSettingsLayer({ concurrency: "unbounded", batchSize: 1 })

    yield* info("\n=== PRE-TRAINING MODEL ===")
    yield* info(`Pre-training for ${PRETRAIN_EPOCHS} epochs with learning rate ${PRETRAIN_LR}`)
    yield* trainStream(dataset.pretrainingStream).pipe(
      Effect.provide(llmLayer),
      Effect.provide(makeTrainingConfigLayer({ epochs: PRETRAIN_EPOCHS, learningRate: PRETRAIN_LR })),
      Effect.provide(preprocessLayer)
    )

    yield* info("\n=== INSTRUCTION TUNING ===")
    yield* info(`Instruction tuning for ${FINETUNE_EPOCHS} epochs with learning rate ${FINETUNE_LR}`)
    yield* trainStream(dataset.chatStream).pipe(
      Effect.provide(llmLayer),
      Effect.provide(makeTrainingConfigLayer({ epochs: FINETUNE_EPOCHS, learningRate: FINETUNE_LR })),
      Effect.provide(preprocessLayer)
    )

    const metrics = yield* snapshot()
    yield* info("Training complete", {
      epochsCompleted: metrics.counters.find((c) => c.name === "epochs_completed")?.value,
      totalExamples: metrics.counters.find((c) => c.name === "examples_processed")?.value,
      finalLoss: metrics.gauges.find((g) => g.name === "epoch_loss")?.value
    })

    yield* terminal.display("\n=== AFTER TRAINING ===\n")
    yield* terminal.display(`Input: ${testInput}\n`)
    const afterOutput = yield* llm.predict(testInput)
    yield* terminal.display(`Output: ${afterOutput}\n`)
    yield* terminal.display("======================\n")

    yield* repl(llm)
  })
)

const LoggerLayer = PrettyLoggerLive("info")

const seedValue = parseSeedArg(process.argv)
const SeedLayerLive = SeedLayer(seedValue)

const AppLayer = Layer.mergeAll(BunFileSystem.layer, BunTerminal.layer, LoggerLayer, InMemoryMetricsLive, SeedLayerLive)

const program = Effect.scoped(
  main.pipe(
    Effect.provide(AppLayer),
    Effect.catchAll((err) => logError(formatTrainingError(err as TrainingError)).pipe(Effect.provide(AppLayer)))
  )
)

BunRuntime.runMain(program)
package/src/data/Dataset.ts
ADDED
@@ -0,0 +1,168 @@
import * as Data from "effect/Data"
import * as Effect from "effect/Effect"
import * as Stream from "effect/Stream"
import * as Option from "effect/Option"
import * as Schema from "effect/Schema"
import { FileSystem } from "@effect/platform"

export type DatasetFormat = "json" | "csv"

export interface DatasetLoadOptions {
  readonly pretrainingPath: string
  readonly chatPath: string
  readonly format: DatasetFormat
}

export interface DatasetStreams {
  /**
   * Fresh stream producer for pretraining data.
   * Each invocation re-opens the file, so it is safe to call once per epoch.
   */
  readonly pretrainingStream: () => Stream.Stream<string, DatasetLoadError | DatasetParseError, FileSystem.FileSystem>
  /**
   * Fresh stream producer for chat training data.
   * Each invocation re-opens the file, so it is safe to call once per epoch.
   */
  readonly chatStream: () => Stream.Stream<string, DatasetLoadError | DatasetParseError, FileSystem.FileSystem>
}

export interface DatasetArrays {
  readonly pretrainingData: ReadonlyArray<string>
  readonly chatTrainingData: ReadonlyArray<string>
}

export class DatasetLoadError extends Data.TaggedError("DatasetLoadError")<{
  readonly path: string
  readonly error: unknown
}> {}

export class DatasetParseError extends Data.TaggedError("DatasetParseError")<{
  readonly path: string
  readonly error: unknown
}> {}

const TrainingItemSchema = Schema.String
const decodeTrainingItem = Schema.decodeUnknown(TrainingItemSchema)

const makeFileStream = (path: string) =>
  Effect.flatMap(FileSystem.FileSystem, (fs) => Effect.succeed(fs.stream(path)))

const linesFromFile = (path: string) =>
  Stream.splitLines(
    Stream.decodeText(
      Stream.mapError(
        Stream.unwrap(makeFileStream(path)),
        (error) => new DatasetLoadError({ path, error })
      )
    )
  )

const parseJsonLine = (path: string, rawLine: string): Effect.Effect<Option.Option<string>, DatasetParseError> =>
  Effect.gen(function* () {
    const trimmed = rawLine.trim()
    if (trimmed === "[" || trimmed === "]" || trimmed.length === 0) {
      return yield* Effect.succeed<Option.Option<string>>(Option.none())
    }

    const withoutComma = trimmed.endsWith(",") ? trimmed.slice(0, -1) : trimmed

    const parsed: unknown = yield* Effect.try({
      try: () => JSON.parse(withoutComma),
      catch: (error) => new DatasetParseError({ path, error })
    })

    const decoded: string = yield* decodeTrainingItem(parsed).pipe(
      Effect.mapError((error) => new DatasetParseError({ path, error }))
    )

    return yield* Effect.succeed<Option.Option<string>>(Option.some(decoded))
  })

const jsonStream = (path: string) =>
  Stream.mapEffect(linesFromFile(path), (line) => parseJsonLine(path, line)).pipe(
    Stream.filterMap((option) => option)
  )

const splitCsvLine = (line: string): Array<string> => {
  const fields: Array<string> = []
  let current = ""
  let inQuotes = false

  for (let i = 0; i < line.length; i++) {
    const ch = line[i]!
    if (ch === '"') {
      if (inQuotes && line[i + 1] === '"') {
        // Escaped quote
        current += '"'
        i++ // skip next
      } else {
        inQuotes = !inQuotes
      }
    } else if (ch === "," && !inQuotes) {
      fields.push(current)
      current = ""
    } else {
      current += ch
    }
  }

  if (inQuotes) {
    throw new Error("Unclosed quote in CSV line")
  }

  fields.push(current)
  return fields
}

const parseCsvLine = (path: string, rawLine: string): Effect.Effect<Option.Option<string>, DatasetParseError> =>
  Effect.gen(function* () {
    const trimmed = rawLine.trim()
    if (trimmed.length === 0) {
      return yield* Effect.succeed<Option.Option<string>>(Option.none())
    }

    const joined = yield* Effect.try({
      try: () => splitCsvLine(rawLine).join(","),
      catch: (error) => new DatasetParseError({ path, error })
    })

    return yield* Effect.succeed<Option.Option<string>>(Option.some(joined))
  })

const csvStream = (path: string) =>
  Stream.mapEffect(linesFromFile(path), (line) => parseCsvLine(path, line)).pipe(
    Stream.filterMap((option) => option)
  )

const streamForFormat = (path: string, format: DatasetFormat) =>
  format === "json" ? jsonStream(path) : csvStream(path)

const collectAll = (
  stream: Stream.Stream<string, DatasetLoadError | DatasetParseError, FileSystem.FileSystem>
) =>
  Stream.runCollect(stream).pipe(
    Effect.map((chunk) => Array.from(chunk)),
    Effect.catchAll((error) => Effect.fail(error))
  )

export const Dataset = {
  /**
   * Returns dataset streams that can be re-opened for each consumption.
   */
  load: (options: DatasetLoadOptions): DatasetStreams => ({
    pretrainingStream: () => streamForFormat(options.pretrainingPath, options.format),
    chatStream: () => streamForFormat(options.chatPath, options.format)
  }),

  /**
   * Convenience helper to materialize both streams into arrays.
   * Useful for small datasets or legacy call sites.
   */
  collect: (
    streams: DatasetStreams
  ): Effect.Effect<DatasetArrays, DatasetLoadError | DatasetParseError, FileSystem.FileSystem> =>
    Effect.all({
      pretrainingData: collectAll(streams.pretrainingStream()),
      chatTrainingData: collectAll(streams.chatStream())
    })
}
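`Dataset.load` above is purely descriptive: it records paths and a format, and no file is touched until one of the stream thunks is consumed, which is why each epoch can safely re-open them. Below is a usage sketch under the same assumptions as `src/cli/main.ts` (the paths and the Bun `FileSystem` layer come from that file); the sketch itself is not part of the package.

```ts
import * as Effect from "effect/Effect"
import { BunFileSystem, BunRuntime } from "@effect/platform-bun"
import { Dataset } from "./data/Dataset"

const program = Effect.gen(function* () {
  // Describe where the data lives; nothing is read yet.
  const streams = Dataset.load({
    pretrainingPath: "data/pretraining_data.json",
    chatPath: "data/chat_training_data.json",
    format: "json"
  })

  // Materialize both streams into arrays (each thunk re-opens its file).
  const { pretrainingData, chatTrainingData } = yield* Dataset.collect(streams)
  yield* Effect.log(`pretraining: ${pretrainingData.length} examples, chat: ${chatTrainingData.length} examples`)
})

// Dataset streams require a FileSystem implementation; Bun's layer provides it.
BunRuntime.runMain(program.pipe(Effect.provide(BunFileSystem.layer)))
```

`collect` materializes everything eagerly, which suits the small JSON datasets shipped here; `src/cli/main.ts` instead hands the stream producers directly to `trainStream`.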
package/src/errors.ts
ADDED
@@ -0,0 +1,73 @@
import * as Data from "effect/Data"
import type { DatasetLoadError, DatasetParseError } from "./data/Dataset"
import type { ShapeError } from "./tensor/ops"

/**
 * Narrow, discriminated error channel for the training pipeline.
 * Keeps original domain errors attached for rich reporting at the CLI boundary.
 */
export class TrainingDatasetError extends Data.TaggedError("TrainingDatasetError")<{
  readonly cause: DatasetLoadError | DatasetParseError
}> {}

export class TrainingShapeError extends Data.TaggedError("TrainingShapeError")<{
  readonly cause: ShapeError
}> {}

export class TrainingTokenizerError extends Data.TaggedError("TrainingTokenizerError")<{
  readonly message: string
  readonly cause?: unknown
}> {}

export class TrainingOptimizerError extends Data.TaggedError("TrainingOptimizerError")<{
  readonly message: string
  readonly cause?: unknown
}> {}

export class TrainingConfigError extends Data.TaggedError("TrainingConfigError")<{
  readonly message: string
  readonly cause?: unknown
}> {}

export class TrainingUnknownError extends Data.TaggedError("TrainingUnknownError")<{
  readonly cause: unknown
}> {}

export type TrainingError =
  | TrainingDatasetError
  | TrainingShapeError
  | TrainingTokenizerError
  | TrainingOptimizerError
  | TrainingConfigError
  | TrainingUnknownError

export const TrainingError = {
  dataset: (cause: DatasetLoadError | DatasetParseError): TrainingDatasetError =>
    new TrainingDatasetError({ cause }),
  shape: (cause: ShapeError): TrainingShapeError => new TrainingShapeError({ cause }),
  tokenizer: (message: string, cause?: unknown): TrainingTokenizerError =>
    new TrainingTokenizerError({ message, cause }),
  optimizer: (message: string, cause?: unknown): TrainingOptimizerError =>
    new TrainingOptimizerError({ message, cause }),
  config: (message: string, cause?: unknown): TrainingConfigError =>
    new TrainingConfigError({ message, cause }),
  unknown: (cause: unknown): TrainingUnknownError => new TrainingUnknownError({ cause }),
  fromUnknown: (error: unknown): TrainingError => {
    if (error instanceof TrainingDatasetError) return error
    if (error instanceof TrainingShapeError) return error
    if (error instanceof TrainingTokenizerError) return error
    if (error instanceof TrainingOptimizerError) return error
    if (error instanceof TrainingConfigError) return error
    // Upstream domain errors
    if (error && typeof error === "object") {
      const candidate = error as { _tag?: string }
      if (candidate._tag === "DatasetLoadError" || candidate._tag === "DatasetParseError") {
        return new TrainingDatasetError({ cause: error as DatasetLoadError | DatasetParseError })
      }
      if (candidate._tag === "ShapeError") {
        return new TrainingShapeError({ cause: error as ShapeError })
      }
    }
    return new TrainingUnknownError({ cause: error })
  }
}
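`src/training/train.ts` is not included in this section, so the exact call sites for these constructors are not shown here. The sketch below only illustrates how the typed constructors and `TrainingError.fromUnknown` are meant to be combined at a pipeline boundary; `collectForTraining` is a hypothetical helper for this sketch, not an export of the package.

```ts
import * as Effect from "effect/Effect"
import { Dataset, type DatasetStreams } from "./data/Dataset"
import { TrainingError } from "./errors"

// Known dataset failures are wrapped with the typed constructor, so the error
// channel narrows to the TrainingError union; unexpected defects are normalized
// with fromUnknown instead of escaping untyped.
const collectForTraining = (streams: DatasetStreams) =>
  Dataset.collect(streams).pipe(
    Effect.mapError(TrainingError.dataset),
    Effect.catchAllDefect((defect) => Effect.fail(TrainingError.fromUnknown(defect)))
  )
```

Whichever member of the union survives to the edge of the program is then rendered by `formatTrainingError` in `src/cli/errors.ts` and printed by the `catchAll` in `src/cli/main.ts`.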