topolm 0.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
topolm/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ from .config import Config
2
+ from .core import TopoLM, Tokenizer, Corpus, NGram, evaluate, eval_examples
3
+ from .datasets import hf_dataset_texts, load_hf_dataset
4
+
5
# Public API of the package, grouped by the submodule each name is
# imported from at the top of this file.
__all__ = [
    # from .config
    "Config",
    # from .core
    "TopoLM",
    "Tokenizer",
    "Corpus",
    "NGram",
    "evaluate",
    "eval_examples",
    # from .datasets
    "load_hf_dataset",
    "hf_dataset_texts",
]

# Package version string.
__version__ = "0.0.4"
topolm/cli.py ADDED
@@ -0,0 +1,36 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ from .core import TopoLM
5
+ from .config import Config
6
+
7
# Small built-in corpus that build_demo_model() fits the model on.
# The value starts with a newline and every sentence ends with one.
DEMO = (
    "\n"
    "The cat sat on the mat.\n"
    "The dog sat on the floor.\n"
    "The attacker used CVE-2024-1234 to access the admin panel.\n"
    "CYP3A4 inhibition increases drug exposure.\n"
    "Clarithromycin inhibits CYP3A4.\n"
    "Clarithromycin may increase simvastatin exposure.\n"
)
15
+
16
def build_demo_model():
    """Build a ``TopoLM`` with a default ``Config`` and fit it on ``DEMO``.

    Returns:
        Whatever ``TopoLM.fit`` returns for the demo corpus. ``main`` calls
        ``distribution`` and ``generate`` on the result, so this is
        presumably the fitted model itself -- confirm against ``core``.
    """
    model = TopoLM(Config())
    return model.fit(DEMO)
18
+
19
def main(argv=None):
    """Entry point for the ``topolm`` command-line interface.

    Sub-commands:
        demo      Print a beam-decoded generation for a fixed prompt.
        predict   Print up to the requested 5 predictions for ``context``,
                  one per line as tab-separated text, probability, score.
        generate  Print a generation for ``prompt`` using ``--decoding``
                  (``beam`` by default, or ``nucleus`` / ``greedy``).

    Running without a sub-command behaves like ``demo``.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(prog="topolm")
    commands = parser.add_subparsers(dest="cmd")
    commands.add_parser("demo")

    predict_cmd = commands.add_parser("predict")
    predict_cmd.add_argument("context")

    generate_cmd = commands.add_parser("generate")
    generate_cmd.add_argument("prompt")
    generate_cmd.add_argument(
        "--decoding",
        default="beam",
        choices=["beam", "nucleus", "greedy"],
    )

    options = parser.parse_args(argv)

    # The model is built only after parsing, so argparse exits (--help,
    # invalid arguments) happen before any fitting work.
    demo_model = build_demo_model()

    if options.cmd == "predict":
        for pred in demo_model.distribution(options.context, 5):
            print(f"{pred.text}\t{pred.probability:.3f}\t{pred.score:.3f}")
        return

    if options.cmd == "generate":
        print(demo_model.generate(options.prompt, decoding=options.decoding))
        return

    # "demo" and the no-sub-command case share the same fixed prompt.
    print(demo_model.generate("clarithromycin inhibits", decoding="beam"))


if __name__ == "__main__":
    main()
topolm/config.py ADDED
@@ -0,0 +1,23 @@
1
+ from dataclasses import dataclass
2
+
3
# Punctuation marks treated as a group.
PUNCT = {
    ".", ",", ";", ":", "?", "!",
}

# Sequence boundary marker tokens.
BOUNDARY = {
    "<bos>",
    "<eos>",
}

# Very common function words plus "." and ",".
# NOTE(review): presumably the tokens that Config.hub_penalty applies to --
# the consuming code is not in this module; confirm against core.
HUBS = {
    "the", "a", "an",
    "and", "or",
    "of", "to", "in", "on", "with", "when",
    ".", ",",
}
6
+
7
@dataclass
class Config:
    """Settings object for the model.

    Every field has a default, so ``Config()`` is a complete configuration.
    Field order is part of the interface: the dataclass-generated
    ``__init__`` accepts positional arguments in exactly this order.

    NOTE(review): the per-field notes below are inferred from the field
    names alone; the code that reads them is not in this module, so
    confirm each against its consumer before relying on it.
    """

    # --- representation / fitting (presumed roles, TODO confirm) ---
    dim: int = 1024  # presumably the vector dimensionality
    seed: int = 42  # presumably the random seed
    window: int = 8  # presumably the context window size
    phrase_lengths: tuple[int, ...] = (2, 3, 4, 5)  # presumably n-gram sizes

    # --- candidate handling and caching (presumed roles, TODO confirm) ---
    max_candidates: int = 96
    inference_candidates: int = 48
    prediction_cache_max: int = 4096

    # --- decoding (presumed roles, TODO confirm) ---
    temperature: float = 0.75
    max_runtime_seconds: float = 5.0
    default_top_p: float = 0.88
    default_beam_width: int = 4

    # --- fitting shortcuts and scoring (presumed roles, TODO confirm) ---
    fast_dev_mode: bool = True
    max_reranker_sentences: int = 80
    negatives_per_positive: int = 2
    hub_penalty: float = 0.10