PhantomReason 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- phantomreason/__init__.py +20 -0
- phantomreason/corpus.py +119 -0
- phantomreason/evaluate.py +89 -0
- phantomreason/model.py +1532 -0
- phantomreason/py.typed +0 -0
- phantomreason/service.py +329 -0
- phantomreason/storage.py +55 -0
- phantomreason/stores.py +161 -0
- phantomreason/traces.py +93 -0
- phantomreason-0.1.2.dist-info/METADATA +283 -0
- phantomreason-0.1.2.dist-info/RECORD +15 -0
- phantomreason-0.1.2.dist-info/WHEEL +5 -0
- phantomreason-0.1.2.dist-info/entry_points.txt +3 -0
- phantomreason-0.1.2.dist-info/licenses/LICENSE +21 -0
- phantomreason-0.1.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Public package surface for phantomreason.

Re-exports the model, evaluation, service, and storage entry points so
callers can import everything from the package root.
"""

from .model import PhantomLanguageModel, run_interactive_test
from .evaluate import run_evaluation
from .service import PhantomAgentService, build_server
from .storage import EXTRA_WORDS_PATH, GLOBAL_INDEX, GLOBAL_VOCAB, MODEL_STATE_PATH, vocabadder

# NOTE(review): keep in sync with the version declared in package metadata.
__version__ = "0.1.2"

__all__ = [
    "EXTRA_WORDS_PATH",
    "GLOBAL_INDEX",
    "GLOBAL_VOCAB",
    "MODEL_STATE_PATH",
    "PhantomAgentService",
    "PhantomLanguageModel",
    "__version__",
    "build_server",
    "run_evaluation",
    "run_interactive_test",
    "vocabadder",
]
|
phantomreason/corpus.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import html
|
|
4
|
+
import re
|
|
5
|
+
import urllib.error
|
|
6
|
+
import urllib.request
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# User-Agent header sent with corpus-fetch HTTP requests (identifies this package).
DEFAULT_USER_AGENT = "phantomreason/1.0 (+symbolic-trace-ingest)"
# Default socket timeout, in seconds, for corpus URL fetches.
DEFAULT_TIMEOUT_SECONDS = 15.0
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def normalize_text(raw_text: str) -> str:
    """Strip HTML markup from *raw_text* and collapse whitespace.

    Entities are unescaped, script/style elements are removed along
    with their contents, remaining tags become spaces, and runs of
    horizontal whitespace and blank lines are squeezed.
    """
    text = html.unescape(raw_text)
    # Remove script/style elements together with everything inside them.
    text = re.sub(r"(?is)<(script|style).*?>.*?</\1>", " ", text)
    # Any remaining tag becomes a space so adjacent words stay separated.
    text = re.sub(r"(?s)<[^>]+>", " ", text)
    # Treat carriage returns as newlines before whitespace squeezing.
    text = text.replace("\r", "\n")
    # Collapse horizontal whitespace runs, then cap blank lines at one.
    text = re.sub(r"[^\S\n]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def tokenize_words(text: str) -> list[str]:
    """Lowercase *text* and return its alphabetic word tokens.

    Apostrophes are treated as word characters so contractions such as
    ``don't`` survive tokenization intact.
    """
    lowered = text.lower()
    return re.findall(r"[a-zA-Z']+", lowered)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def iter_sentences(
    text: str,
    min_words: int = 3,
    max_words: int = 24,
    max_sentences: int | None = None,
) -> list[str]:
    """Split *text* into deduplicated, length-bounded word chunks.

    The text is normalized, split on sentence-ending punctuation and
    newlines, and each piece is re-emitted as space-joined lowercase
    token runs of at most *max_words* words. Pieces shorter than
    *min_words* are skipped, as is a trailing chunk that falls below
    the minimum. At most *max_sentences* chunks are returned when that
    limit is given.
    """
    body = normalize_text(text)
    if not body:
        return []
    results: list[str] = []
    emitted: set[str] = set()
    for piece in re.split(r"(?<=[.!?])\s+|\n+", body):
        tokens = tokenize_words(piece)
        if len(tokens) < min_words:
            continue
        # Walk the token list in fixed-size windows of max_words.
        for start in range(0, len(tokens), max_words):
            chunk = tokens[start:start + max_words]
            # Only the final window can be short; drop it if too small.
            if len(chunk) < min_words:
                break
            candidate = " ".join(chunk)
            if candidate in emitted:
                continue
            emitted.add(candidate)
            results.append(candidate)
            if max_sentences is not None and len(results) >= max_sentences:
                return results
    return results
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def load_text_file(path: str | Path) -> str:
    """Read and return the UTF-8 contents of *path*."""
    source = Path(path)
    return source.read_text(encoding="utf-8")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def fetch_url_text(
    url: str,
    timeout: float = DEFAULT_TIMEOUT_SECONDS,
    user_agent: str = DEFAULT_USER_AGENT,
) -> str:
    """Download *url* and return its body decoded to text.

    The response's declared charset is honored when present, with a
    UTF-8 fallback; undecodable bytes are replaced rather than raising.
    """
    headers = {"User-Agent": user_agent}
    request = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw = response.read()
        encoding = response.headers.get_content_charset() or "utf-8"
    return raw.decode(encoding, errors="replace")
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def extract_dictionary_facts(text: str, max_entries: int = 256) -> list[str]:
    """Mine ``term means definition`` sentences from glossary-style text.

    Lines shaped like ``term: definition`` or ``term - definition`` are
    converted into normalized fact sentences. Definitions are truncated
    to ten tokens, duplicates are skipped, and at most *max_entries*
    facts are returned.
    """
    sentences: list[str] = []
    emitted: set[str] = set()
    entry_pattern = re.compile(r"^([A-Za-z][A-Za-z' -]{1,48})\s*[:\-]\s*(.+)$")
    for raw_line in normalize_text(text).splitlines():
        candidate = raw_line.strip(" -:\t")
        if not candidate:
            continue
        matched = entry_pattern.match(candidate)
        if matched is None:
            continue
        term_tokens = tokenize_words(matched.group(1))
        body_tokens = tokenize_words(matched.group(2))
        # Require a non-empty term and at least two definition tokens.
        if not term_tokens or len(body_tokens) < 2:
            continue
        fact = f"{' '.join(term_tokens)} means {' '.join(body_tokens[:10])}"
        if fact in emitted:
            continue
        emitted.add(fact)
        sentences.append(fact)
        if len(sentences) >= max_entries:
            break
    return sentences
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def load_local_corpus_candidates(root: Path) -> list[Path]:
    """Return existing local corpus files under *root*.

    Checks the conventional relative names ``words.txt`` and ``words``
    and returns, in that order, the ones that exist as regular files.
    Directories (or missing paths) with those names are skipped.
    """
    candidates: list[Path] = []
    for relative in ("words.txt", "words"):
        path = root / relative
        # Path.is_file() already returns False for nonexistent paths,
        # so the former separate exists() check was redundant.
        if path.is_file():
            candidates.append(path)
    return candidates
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
# Public API of the corpus-ingestion helpers, sorted alphabetically.
__all__ = [
    "DEFAULT_TIMEOUT_SECONDS",
    "DEFAULT_USER_AGENT",
    "extract_dictionary_facts",
    "fetch_url_text",
    "iter_sentences",
    "load_local_corpus_candidates",
    "load_text_file",
    "normalize_text",
    "tokenize_words",
]
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from .model import PhantomLanguageModel
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def require(condition: bool, label: str, failures: list[str]) -> None:
    """Record *label* in *failures* when *condition* does not hold."""
    if condition:
        return
    failures.append(label)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def run_evaluation() -> dict[str, object]:
    """Run the built-in smoke evaluation and report PASS/FAIL.

    Trains a small in-memory model on a fixed snippet, then checks
    parsing, fact recall, contradiction handling, and fact decay.
    Returns a report dict with the status, failure labels, and a few
    diagnostic values.
    """
    model = PhantomLanguageModel(dim=512, sparsity=47)
    corpus = (
        "vector means an ordered list used for state. "
        "artist is a person who creates stories. "
        "sun rises east. "
        "north star guides sailors nightly. "
        "gardeners water orchids daily."
    )
    model.train_on_text(corpus, epochs=1, persist=False)
    failures: list[str] = []

    # Two-word noun-phrase parsing.
    parsed = model.understand_input("north star guides sailors nightly")
    require(parsed["subject"] == "north star", "parse subject for two-word noun phrase", failures)
    require(parsed["predicate"] == "guides", "parse predicate for two-word noun phrase", failures)

    # Copular question parsing.
    parsed = model.understand_input("what is vector?")
    require(parsed["subject"] == "vector", "parse copular question subject", failures)
    require(parsed["predicate"] == "is", "parse copular question predicate", failures)

    # Fact recall for a definition learned from the training text.
    routed = model.route_prompt("what is vector?")
    require(routed["fact_answer"] is not None, "fact recall for learned definition", failures)

    # Contradiction handling: registering a newer fact should retire
    # the older one while the new one stays active and dominates recall.
    model.register_fact("vector", "means", ["symbolic", "state"])
    require(
        not model._fact_is_active("fact:vector|means|ordered list"),
        "older contradictory fact should be inactive",
        failures,
    )
    require(
        model._fact_is_active("fact:vector|means|symbolic state"),
        "new contradictory fact should stay active",
        failures,
    )
    latest = model._direct_fact_match(subject="vector", predicate="means")
    require(
        latest is not None and latest["object_tokens"] == ["symbolic", "state"],
        "latest contradictory fact should dominate recall",
        failures,
    )

    # Decay must never strengthen a fact.
    before_decay = model._fact_strength_value("fact:vector|means|symbolic state")
    model.apply_fact_decay(keep_recent=0, amount=1)
    after_decay = model._fact_strength_value("fact:vector|means|symbolic state")
    require(after_decay <= before_decay, "decay should not increase fact strength", failures)

    # Single report shape for both outcomes; "failures" is [] on PASS.
    return {
        "status": "FAIL" if failures else "PASS",
        "failures": failures,
        "fact_count": model.fact_count(),
        "fact_answer": routed["fact_answer"],
        "parse": model.render_understanding(model.understand_input("north star guides sailors nightly")),
    }
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def main() -> int:
    """CLI entry point: print the evaluation report; exit 0 on PASS."""
    report = run_evaluation()
    print(report["status"])
    for failure in report["failures"]:
        print(f"- {failure}")
    # Guard clause: diagnostics are only meaningful on a passing run.
    if report["status"] != "PASS":
        return 1
    print(f"fact_count={report['fact_count']}")
    print(f"fact_answer={report['fact_answer']}")
    print(f"parse={report['parse']}")
    return 0
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# Allow direct execution (e.g. `python -m phantomreason.evaluate`);
# the process exit code mirrors main()'s return value.
if __name__ == "__main__":
    raise SystemExit(main())
|