skylar 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
skylar/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ """Skylar — local, sovereign, from-scratch LLMs (COBOL specialist & more)."""
2
+ from .core import Skylar, load, DEFAULT_MODEL, COBOL_SYSTEM
3
+ from .config import NanoTransformerConfig
4
+ from .decoder import NanoTransformer
5
+ from .chatml import encode_chatml
6
+
7
+ __version__ = "0.1.0"
8
+ __all__ = ["Skylar", "load", "NanoTransformer", "NanoTransformerConfig",
9
+ "encode_chatml", "DEFAULT_MODEL", "COBOL_SYSTEM", "__version__"]
10
+
11
+
12
+ def _register_auto():
13
+ """Make AutoConfig/AutoModelForCausalLM aware of the custom arch, so
14
+ `AutoModelForCausalLM.from_pretrained(repo)` works after `import skylar`.
15
+ Best-effort: never break import if transformers internals change."""
16
+ try:
17
+ from transformers import AutoConfig, AutoModelForCausalLM
18
+ AutoConfig.register("nano-transformer", NanoTransformerConfig)
19
+ AutoModelForCausalLM.register(NanoTransformerConfig, NanoTransformer)
20
+ except Exception:
21
+ pass
22
+
23
+
24
+ _register_auto()
skylar/chatml.py ADDED
@@ -0,0 +1,112 @@
1
+ """
2
+ Chat dataset utilities for Supervised Fine-Tuning (SFT).
3
+
4
+ Supports:
5
+ - ChatML format (same as Qwen, Mistral, OpenAI)
6
+ The key insight: a "chat model" is just a base model fine-tuned on
7
+ structured conversations. The structure is enforced by special tokens.
8
+ """
9
+
10
+ import json
11
+
12
+
13
+ # ─────────────────────────────────────────────────────────────
14
+ # CHAT FORMAT (ChatML)
15
+ # ─────────────────────────────────────────────────────────────
16
+ #
17
+ # ChatML is the standard used by Qwen, Mistral, OpenAI, etc.
18
+ # It wraps each message in special tokens:
19
+ #
20
+ # <|im_start|>system
21
+ # You are a helpful assistant.<|im_end|>
22
+ # <|im_start|>user
23
+ # What is 2+2?<|im_end|>
24
+ # <|im_start|>assistant
25
+ # 2+2 equals 4.<|im_end|>
26
+ #
27
+ # During training, we only compute loss on the ASSISTANT tokens.
28
+ # The model learns: "given this conversation so far, what should
29
+ # the assistant say next?"
30
+ # ─────────────────────────────────────────────────────────────
31
+
32
def get_chatml_ids(tokenizer):
    """Return the token-id sequences for the two ChatML markers.

    Works with both special-token and sub-token tokenizers.

    Args:
        tokenizer: object exposing ``token_to_id(str)`` (returning ``None``
            for unknown tokens) and ``encode(str, add_special_tokens=False)``
            whose result has an ``.ids`` list.

    Returns:
        A pair ``(im_start_ids, im_end_ids)`` of integer lists. When BOTH
        markers exist as single vocabulary entries each list has one id;
        otherwise both markers are tokenized with ``encode``.
    """
    ims_id = tokenizer.token_to_id("<|im_start|>")
    ime_id = tokenizer.token_to_id("<|im_end|>")
    # Only take the single-id path when both lookups succeed; previously a
    # vocabulary holding just "<|im_start|>" produced [None] for the end marker.
    if ims_id is not None and ime_id is not None:
        return [ims_id], [ime_id]
    return (
        tokenizer.encode("<|im_start|>", add_special_tokens=False).ids,
        tokenizer.encode("<|im_end|>", add_special_tokens=False).ids,
    )
42
+
43
+
44
def create_loss_mask(messages, tokenizer):
    """Tokenize a ChatML conversation and build the matching SFT label list.

    Works with both special-token and sub-token tokenizers.

    Args:
        messages: sequence of dicts with "role" and "content" keys.
        tokenizer: tokenizer accepted by ``get_chatml_ids``.

    Returns:
        ``(token_ids, labels)`` of equal length. Labels copy the token id for
        assistant content and the assistant's closing ``<|im_end|>``; every
        other position (start markers, role headers, non-assistant content,
        separators) is -100.
    """
    ims_ids, ime_ids = get_chatml_ids(tokenizer)

    def _encode(text):
        return tokenizer.encode(text, add_special_tokens=False).ids

    # The separator is identical for every message: tokenize it once instead
    # of once per loop iteration (and not at all when it is never emitted).
    sep_ids = _encode("\n") if len(messages) > 1 else []

    all_token_ids = []
    all_labels = []

    def _emit(ids, supervised):
        # Append ids to the stream; labels mirror them only when supervised.
        all_token_ids.extend(ids)
        all_labels.extend(list(ids) if supervised else [-100] * len(ids))

    last_index = len(messages) - 1
    for i, msg in enumerate(messages):
        is_assistant = msg["role"] == "assistant"

        _emit(ims_ids, False)
        _emit(_encode(msg["role"] + "\n"), False)
        _emit(_encode(msg["content"]), is_assistant)
        _emit(ime_ids, is_assistant)

        if i < last_index:
            _emit(sep_ids, False)

    return all_token_ids, all_labels
80
+
81
+
82
def encode_chatml(messages, tokenizer, add_generation_prompt=False):
    """Encode a conversation as a flat list of ChatML token ids.

    Works with both special-token and sub-token tokenizers.

    Args:
        messages: sequence of dicts with "role" and "content" keys.
        tokenizer: tokenizer accepted by ``get_chatml_ids``.
        add_generation_prompt: when true, append a newline, the start marker
            and the "assistant\\n" header after the last message.

    Returns:
        List of token ids: each message is start marker + role header +
        content + end marker, with a newline between consecutive messages.
    """
    ims_ids, ime_ids = get_chatml_ids(tokenizer)

    def _encode(text):
        return tokenizer.encode(text, add_special_tokens=False).ids

    # Loop-invariant: tokenize the newline separator once, not per message.
    newline_ids = _encode("\n")

    all_ids = []
    last_index = len(messages) - 1
    for i, msg in enumerate(messages):
        all_ids.extend(ims_ids)
        all_ids.extend(_encode(msg["role"] + "\n"))
        all_ids.extend(_encode(msg["content"]))
        all_ids.extend(ime_ids)
        if i < last_index:
            all_ids.extend(newline_ids)

    if add_generation_prompt:
        all_ids.extend(newline_ids)
        all_ids.extend(ims_ids)
        all_ids.extend(_encode("assistant\n"))

    return all_ids
101
+
102
+
103
def load_dataset_jsonl(filepath):
    """Read a UTF-8 JSONL file and return its records as a list.

    Each line is stripped; empty lines are skipped and every remaining line
    is parsed with ``json.loads``. A one-line summary is printed to stdout.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        records = [json.loads(text) for text in stripped_lines if text]
    print(f" Loaded {len(records)} examples from {filepath}")
    return records
skylar/cli.py ADDED
@@ -0,0 +1,171 @@
1
+ """`skylar` command-line interface: chat · generate · serve."""
2
+ import argparse
3
+ import sys
4
+
5
+ DEFAULT_MODEL = "Sophia-AI/SkylarCobol-390M"
6
+
7
+
8
def _add_common(sp):
    """Register on parser ``sp`` the options shared by the subcommands.

    Adds --model, --device, --system, --max-new (stored as ``max_new``) and
    --temperature, each with the defaults listed below.
    """
    shared_options = [
        (["--model"], dict(default=DEFAULT_MODEL,
                           help="HF repo id (es. Sophia-AI/SkylarCobol-390M) o cartella locale")),
        (["--device"], dict(default=None, help="cuda | cpu (auto se omesso)")),
        (["--system"], dict(default=None, help="system prompt (default: esperto COBOL)")),
        (["--max-new"], dict(dest="max_new", type=int, default=512)),
        (["--temperature"], dict(type=float, default=0.0,
                                 help="0.0 = greedy deterministico (default)")),
    ]
    for flags, settings in shared_options:
        sp.add_argument(*flags, **settings)
16
+
17
+
18
def cmd_generate(args):
    """Handle `skylar generate`: print one completion for ``args.prompt``.

    Uses ``args.system`` as the system prompt when given, otherwise
    ``COBOL_SYSTEM``.
    """
    from .core import Skylar, COBOL_SYSTEM

    sk = Skylar.load(args.model, device=args.device)
    system = COBOL_SYSTEM if args.system is None else args.system
    completion = sk.generate(
        args.prompt,
        system=system,
        max_new_tokens=args.max_new,
        temperature=args.temperature,
        seed=args.seed,
    )
    print(completion)
24
+
25
+
26
def cmd_chat(args):
    """Handle `skylar chat`: an interactive REPL that streams each reply.

    A banner is shown first (inside a rich panel when rich can be imported,
    plain text otherwise). Each non-empty input line is passed to
    ``sk.stream`` and the deltas are written to stdout as they arrive.
    The loop ends on 'exit', 'quit', ':q', EOF or Ctrl-C at the prompt.
    """
    from .core import Skylar, COBOL_SYSTEM

    console = None
    try:
        from rich.console import Console
        from rich.panel import Panel
        console = Console()
    except Exception:
        # rich is optional: any failure just means the plain-text banner.
        console = None

    print(f"Carico {args.model} ...", file=sys.stderr)
    sk = Skylar.load(args.model, device=args.device)
    system = COBOL_SYSTEM if args.system is None else args.system
    head = f"Skylar · {args.model}\ndevice: {sk.device} · system: {system!r}\n('exit' o Ctrl-D per uscire)"
    if not console:
        print(head)
    else:
        console.print(Panel(head, title="skylar chat", border_style="cyan"))

    quit_words = ("exit", "quit", ":q")
    out = sys.stdout
    while True:
        try:
            user = input("\n\033[1m›\033[0m ").strip()
        except (EOFError, KeyboardInterrupt):
            print()
            return
        if user.lower() in quit_words:
            return
        if not user:
            continue
        deltas = sk.stream(user, system=system, max_new_tokens=args.max_new,
                           temperature=args.temperature)
        for delta in deltas:
            out.write(delta)
            out.flush()
        print()
59
+
60
+
61
def cmd_cobol(args):
    """Handle `skylar cobol`: complete a COBOL stub into a whole program.

    The stub is the bundled example (``--example``) or the contents of
    ``--stub-file``; with neither, a usage hint is printed to stderr and the
    process exits with status 1. The completed program is printed to stdout.
    With ``--compile`` the result of ``syntax_ok`` is reported on stderr.
    """
    from .core import Skylar
    from .cobol import EXAMPLE_STUB, complete_cobol, syntax_ok

    if args.example:
        stub = EXAMPLE_STUB
    elif args.stub_file:
        # Context manager so the handle is closed deterministically
        # (the original left it to the garbage collector).
        with open(args.stub_file) as stub_handle:
            stub = stub_handle.read()
    else:
        print("usa --example oppure --stub-file FILE", file=sys.stderr)
        sys.exit(1)

    print(f"Carico {args.model} ...", file=sys.stderr)
    sk = Skylar.load(args.model, device=args.device)
    prog = complete_cobol(sk, stub, max_new_tokens=args.max_new, temperature=args.temperature)
    print(prog)

    if args.compile:
        ok = syntax_ok(prog)
        if ok is None:
            # syntax_ok returns None when no cobc executable could be found.
            print("\n[cobc non installato — salto il check sintassi]", file=sys.stderr)
        else:
            print(f"\n[cobc -fsyntax-only: {'OK, compila ✓' if ok else 'errori di sintassi ✗'}]",
                  file=sys.stderr)
82
+
83
+
84
def cmd_serve(args):
    """Handle `skylar serve`: expose the model over HTTP with FastAPI/uvicorn.

    Endpoints:
        GET  /health               -> status, model name and device
        POST /generate             -> {"completion": ...} for a GenReq body
        POST /v1/chat/completions  -> minimal chat-completion style response

    If uvicorn, fastapi or pydantic cannot be imported, an install hint is
    printed to stderr and the process exits with status 1.
    """
    try:
        import uvicorn
        from fastapi import FastAPI
        from pydantic import BaseModel
    except Exception:
        print("`skylar serve` richiede gli extra: pip install 'skylar[serve]'", file=sys.stderr)
        sys.exit(1)
    from .core import Skylar, COBOL_SYSTEM

    sk = Skylar.load(args.model, device=args.device)
    app = FastAPI(title="Skylar", version="0.1.0")

    class GenReq(BaseModel):
        prompt: str
        system: str = COBOL_SYSTEM
        max_new_tokens: int = 512
        temperature: float = 0.0

    def _or_default(value, default):
        # JSON null arrives as None; treat it the same as an absent key.
        return default if value is None else value

    @app.get("/health")
    def health():
        return {"status": "ok", "model": args.model, "device": sk.device}

    @app.post("/generate")
    def generate(r: GenReq):
        return {"completion": sk.generate(r.prompt, system=r.system,
                                          max_new_tokens=r.max_new_tokens,
                                          temperature=r.temperature)}

    @app.post("/v1/chat/completions")
    def chat_completions(body: dict):
        # Tolerate "messages": null and messages lacking a "content" key
        # instead of failing with TypeError/KeyError.
        msgs = body.get("messages") or []
        # First system message wins; the LAST user message is the prompt.
        system = next((_or_default(m.get("content"), "") for m in msgs
                       if m.get("role") == "system"), COBOL_SYSTEM)
        user = next((_or_default(m.get("content"), "") for m in reversed(msgs)
                     if m.get("role") == "user"), "")
        text = sk.generate(user, system=system,
                           max_new_tokens=_or_default(body.get("max_tokens"), 512),
                           temperature=_or_default(body.get("temperature"), 0.0))
        return {"object": "chat.completion", "model": args.model,
                "choices": [{"index": 0, "message": {"role": "assistant", "content": text},
                             "finish_reason": "stop"}]}

    print(f"Skylar serve su http://{args.host}:{args.port} (POST /generate, /v1/chat/completions)")
    uvicorn.run(app, host=args.host, port=args.port)
127
+
128
+
129
def main(argv=None):
    """Entry point of the `skylar` CLI.

    Builds the parser with the generate / chat / cobol / serve subcommands,
    parses ``argv`` (``None`` lets argparse read ``sys.argv``) and dispatches
    to the handler stored by ``set_defaults(func=...)``. ``--version`` prints
    the package version; with no subcommand the help text is printed.
    """
    parser = argparse.ArgumentParser(
        prog="skylar",
        description="Skylar — LLM locali e sovrani, from-scratch (specialista COBOL & altri).")
    parser.add_argument("--version", action="store_true", help="stampa la versione ed esci")
    subcommands = parser.add_subparsers(dest="cmd")

    generate_parser = subcommands.add_parser("generate", help="una risposta singola a un prompt")
    _add_common(generate_parser)
    generate_parser.add_argument("--prompt", required=True)
    generate_parser.add_argument("--seed", type=int, default=None)
    generate_parser.set_defaults(func=cmd_generate)

    chat_parser = subcommands.add_parser("chat", help="REPL interattiva in streaming")
    _add_common(chat_parser)
    chat_parser.set_defaults(func=cmd_chat)

    cobol_parser = subcommands.add_parser(
        "cobol", help="completa uno stub COBOL in un programma intero")
    _add_common(cobol_parser)
    cobol_parser.add_argument("--stub-file", default=None, help="file con uno stub COBOLEval-style")
    cobol_parser.add_argument("--example", action="store_true", help="usa lo stub d'esempio incluso")
    cobol_parser.add_argument("--compile", action="store_true", help="verifica la sintassi con GnuCOBOL")
    # Parser-level default: this subcommand uses 900 for max_new.
    cobol_parser.set_defaults(func=cmd_cobol, max_new=900)

    serve_parser = subcommands.add_parser(
        "serve", help="server HTTP (OpenAI-compatibile) — extra [serve]")
    _add_common(serve_parser)
    serve_parser.add_argument("--host", default="127.0.0.1")
    serve_parser.add_argument("--port", type=int, default=8000)
    serve_parser.set_defaults(func=cmd_serve)

    args = parser.parse_args(argv)
    if args.version:
        from . import __version__
        print(f"skylar {__version__}")
        return

    handler = getattr(args, "func", None)
    if not handler:
        parser.print_help()
        return
    handler(args)
168
+
169
+
170
+ if __name__ == "__main__":
171
+ main()
skylar/cobol.py ADDED
@@ -0,0 +1,115 @@
1
+ """COBOL-aware helpers for the SkylarCobol model.
2
+
3
+ The model is trained to *complete* a COBOLEval-style stub: given a fixed-format COBOL skeleton
4
+ (IDENTIFICATION/ENVIRONMENT/DATA/LINKAGE divisions + the task as comments, ending at
5
+ `WORKING-STORAGE SECTION.`), it emits the WORKING-STORAGE entries + PROCEDURE DIVISION as a
6
+ fenced ```cobol block. `complete_cobol()` wraps the prompt exactly like the training/eval harness
7
+ and reassembles a full, compilable program — so you get real COBOL, not a fragment.
8
+ """
9
+ import re
10
+
11
+ from .core import COBOL_SYSTEM
12
+
13
+ _EVAL_USER = (
14
+ "Complete the following COBOL subprogram. Output ONLY the WORKING-STORAGE SECTION "
15
+ "entries and the PROCEDURE DIVISION USING LINKED-ITEMS (do NOT repeat IDENTIFICATION/"
16
+ "ENVIRONMENT/DATA/LINKAGE), storing the answer in RESULT, ending with END PROGRAM.\n"
17
+ "```cobol\n{stub}\n```"
18
+ )
19
+
20
+
21
def extract_code_block(src):
    """Return the body of the first fenced code block in ``src``.

    The fence may be bare or tagged ``cobol`` (case-insensitive). If the
    closing fence is missing — e.g. generation stopped at the token limit —
    everything after the opening fence is returned, so the fence line itself
    never ends up in the program text. Without any opening fence ``src`` is
    returned unchanged.
    """
    # (?:```|\Z) accepts either a closing fence or end-of-text.
    m = re.search(r"```(?:cobol)?\s*\n(.*?)(?:```|\Z)", src, re.DOTALL | re.IGNORECASE)
    return m.group(1) if m else src
24
+
25
+
26
def swap_sections(src):
    """Reorder a COBOL source so WORKING-STORAGE precedes LINKAGE.

    Lines are routed into four buckets depending on the most recent header
    seen (matched case-insensitively on the stripped line): the preamble,
    WORKING-STORAGE SECTION, LINKAGE SECTION and PROCEDURE DIVISION. The
    PROCEDURE DIVISION header line itself is replaced by a fixed
    "USING LINKED-ITEMS" header. The result is preamble + working-storage +
    linkage + procedure, joined with newlines.
    """
    preamble, working, linkage, procedure = [], [], [], []
    target = preamble
    for raw_line in src.split("\n"):
        head = raw_line.strip().upper()
        emitted = raw_line
        if head.startswith("WORKING-STORAGE SECTION."):
            target = working
        elif head.startswith("LINKAGE SECTION."):
            target = linkage
        elif head.startswith("PROCEDURE DIVISION"):
            target = procedure
            # NOTE(review): leading whitespace of this literal is reproduced
            # as it appears in the reviewed text — confirm the column layout.
            emitted = " PROCEDURE DIVISION USING LINKED-ITEMS."
        target.append(emitted)
    return "\n".join([*preamble, *working, *linkage, *procedure])
40
+
41
+
42
+ def _program_id(stub):
43
+ m = re.search(r"(?im)^\s*PROGRAM-ID\.\s*([A-Za-z0-9-]+)", stub)
44
+ return m.group(1) if m else "SOLUTION"
45
+
46
+
47
def construct(stub, completion, entry_point):
    """Assemble a full COBOL program from a stub and a model completion.

    Args:
        stub: the COBOL skeleton that was given to the model.
        completion: the model output (code only, fence already removed).
        entry_point: program name; upper-cased with "_" replaced by "-" and
            used in the final ``END PROGRAM`` line.

    Returns:
        The program text: if ``completion`` already contains an
        IDENTIFICATION DIVISION it is used as-is, otherwise it is appended to
        ``stub`` (dropping a leading duplicate WORKING-STORAGE header). The
        sections are then reordered with ``swap_sections``, any existing
        ``END PROGRAM`` lines are removed and a single one is appended.
    """
    if "IDENTIFICATION DIVISION" in completion.upper():
        prog = completion
    else:
        # The stub already ends with the WORKING-STORAGE header; drop a
        # repeated one at the start of the completion. Matched
        # case-insensitively, like every other header check in this module
        # (the original only recognised the upper-case spelling).
        sol = re.sub(r"(?i)\A(\s*)WORKING-STORAGE SECTION\.", r"\1", completion, count=1)
        prog = f"{stub}\n{sol}"
    # NOTE(review): applied to both branches; indentation was not visible in
    # the reviewed text — confirm against the packaged module.
    prog = swap_sections(prog)
    name = entry_point.upper().replace("_", "-")
    # Remove whatever END PROGRAM line(s) exist, then append exactly one.
    prog = re.sub(r"(?im)^[ \t]*END[ \t]+PROGRAM\b.*$", "", prog).rstrip()
    prog += f"\n END PROGRAM {name}.\n"
    return prog
60
+
61
+
62
def complete_cobol(sk, stub, entry_point=None, max_new_tokens=900, temperature=0.0):
    """Generate and reassemble a full COBOL program for a COBOLEval-style stub.

    The stub is wrapped in the ``_EVAL_USER`` prompt, sent to ``sk.generate``
    with ``COBOL_SYSTEM`` as system prompt, and the fenced code of the reply
    is combined with the stub via ``construct``. The program name is
    ``entry_point`` when given, otherwise the stub's PROGRAM-ID.
    """
    program_name = entry_point or _program_id(stub)
    user_prompt = _EVAL_USER.format(stub=stub)
    reply = sk.generate(
        user_prompt,
        system=COBOL_SYSTEM,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
    )
    completion = extract_code_block(reply)
    return construct(stub, completion, program_name)
68
+
69
+
70
def syntax_ok(program_text):
    """Best-effort: does GnuCOBOL accept the program syntactically?

    Runs ``cobc -fsyntax-only -fformat=fixed -w`` on a temporary copy of
    ``program_text``. The executable is taken from the COBC environment
    variable (default "cobc") so a build outside PATH can be pointed at
    explicitly.

    Returns:
        True if cobc exits with status 0, False otherwise, and None when the
        executable is neither an existing file nor found on PATH.
    """
    import os
    import shutil
    import subprocess
    import tempfile

    cobc = os.environ.get("COBC", "cobc")
    if not (os.path.isfile(cobc) or shutil.which(cobc)):
        return None
    with tempfile.TemporaryDirectory() as workdir:
        source_path = os.path.join(workdir, "prog.cbl")
        # Close (and therefore flush) the file before cobc reads it; the
        # original relied on the garbage collector to do so.
        with open(source_path, "w") as handle:
            handle.write(program_text)
        result = subprocess.run(
            [cobc, "-fsyntax-only", "-fformat=fixed", "-w", source_path],
            capture_output=True, text=True)
    return result.returncode == 0
85
+
86
+
87
+ # sample stub for `skylar cobol --example` — a task the model handles well (increment a list).
88
+ # (COBOLEval-style fixed format; the model emits WORKING-STORAGE + PROCEDURE, we reassemble.)
89
+ EXAMPLE_STUB = """\
90
+ IDENTIFICATION DIVISION.
91
+ PROGRAM-ID. INCR-LIST.
92
+
93
+ ENVIRONMENT DIVISION.
94
+
95
+ INPUT-OUTPUT SECTION.
96
+
97
+ DATA DIVISION.
98
+
99
+ LINKAGE SECTION.
100
+
101
+ 01 LINKED-ITEMS.
102
+ 05 L-L OCCURS 3 TIMES INDEXED BY NI PIC S9(10).
103
+ 05 RESULT OCCURS 100 TIMES INDEXED BY NJ PIC S9(10).
104
+
105
+ * Return list with elements incremented by 1.
106
+ * >>> incr_list([1, 2, 3])
107
+ * [2, 3, 4]
108
+ * >>> incr_list([5, 3, 5])
109
+ * [6, 4, 6]
110
+
111
+ * Complete the WORKING-STORAGE SECTION and the PROCEDURE DIVISION
112
+ * Store the result in the RESULT variable and mark the end of your program with END PROGRAM
113
+
114
+ WORKING-STORAGE SECTION.
115
+ """