tokenmizer 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tokenmizer/__init__.py +21 -0
- tokenmizer/agents/__init__.py +0 -0
- tokenmizer/analytics/__init__.py +0 -0
- tokenmizer/analytics/engine.py +188 -0
- tokenmizer/api/__init__.py +0 -0
- tokenmizer/api/app.py +958 -0
- tokenmizer/api/rate_limiter.py +110 -0
- tokenmizer/checkpoints/__init__.py +0 -0
- tokenmizer/checkpoints/manager.py +383 -0
- tokenmizer/cli.py +153 -0
- tokenmizer/compression/__init__.py +0 -0
- tokenmizer/compression/engine.py +669 -0
- tokenmizer/compression/output_trimmer.py +95 -0
- tokenmizer/compression/window.py +104 -0
- tokenmizer/config/__init__.py +0 -0
- tokenmizer/config/settings.py +170 -0
- tokenmizer/core/__init__.py +0 -0
- tokenmizer/core/dto.py +196 -0
- tokenmizer/core/errors.py +35 -0
- tokenmizer/core/tokenizer.py +96 -0
- tokenmizer/dashboard/__init__.py +0 -0
- tokenmizer/dashboard/page.py +267 -0
- tokenmizer/filters/__init__.py +0 -0
- tokenmizer/filters/file_intelligence.py +960 -0
- tokenmizer/graph_memory/__init__.py +0 -0
- tokenmizer/graph_memory/decision_tracker.py +225 -0
- tokenmizer/graph_memory/graph.py +1287 -0
- tokenmizer/graph_memory/helpers.py +121 -0
- tokenmizer/graph_memory/hybrid_extractor.py +703 -0
- tokenmizer/graph_memory/types.py +134 -0
- tokenmizer/graph_memory/validator.py +304 -0
- tokenmizer/graph_memory/visualization.py +228 -0
- tokenmizer/mcp/__init__.py +0 -0
- tokenmizer/mcp/server.py +368 -0
- tokenmizer/providers/__init__.py +0 -0
- tokenmizer/providers/providers.py +456 -0
- tokenmizer/security/__init__.py +0 -0
- tokenmizer/security/auth.py +95 -0
- tokenmizer/security/middleware.py +138 -0
- tokenmizer/security/redaction.py +126 -0
- tokenmizer/semantic_cache/__init__.py +0 -0
- tokenmizer/semantic_cache/cache.py +383 -0
- tokenmizer/state/__init__.py +0 -0
- tokenmizer/state/backend.py +137 -0
- tokenmizer/storage/__init__.py +56 -0
- tokenmizer-0.2.4.dist-info/METADATA +529 -0
- tokenmizer-0.2.4.dist-info/RECORD +50 -0
- tokenmizer-0.2.4.dist-info/WHEEL +4 -0
- tokenmizer-0.2.4.dist-info/entry_points.txt +2 -0
- tokenmizer-0.2.4.dist-info/licenses/LICENSE +21 -0
|
File without changes
|
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Decision Topic Classifier
|
|
3
|
+
tokenmizer/graph_memory/decision_tracker.py
|
|
4
|
+
|
|
5
|
+
Problem being solved:
|
|
6
|
+
User says "use PostgreSQL" → Decision node created
|
|
7
|
+
Later says "actually use MySQL instead" → NEW Decision node created
|
|
8
|
+
Now graph has BOTH. Resume shows BOTH. LLM gets confused.
|
|
9
|
+
|
|
10
|
+
Solution:
|
|
11
|
+
Every new decision is classified into a topic bucket.
|
|
12
|
+
If an existing decision covers the same topic → mark it MODIFIED (superseded).
|
|
13
|
+
Resume shows only ACTIVE decisions. History preserved in graph for rollback.
|
|
14
|
+
|
|
15
|
+
Topic detection approach:
|
|
16
|
+
1. Keyword matching on known tech categories (fast, no LLM needed)
|
|
17
|
+
2. Word overlap for unknown topics (fallback)
|
|
18
|
+
|
|
19
|
+
This runs on every add_node(NodeType.DECISION, ...) call.
|
|
20
|
+
Zero external dependencies. Zero LLM calls. ~0.1ms per check.
|
|
21
|
+
"""
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import re
|
|
25
|
+
from typing import Optional
|
|
26
|
+
|
|
27
|
+
# ── Topic taxonomy ────────────────────────────────────────────────────────────
|
|
28
|
+
# Maps keywords → topic bucket name
|
|
29
|
+
# When two decisions share a bucket → one supersedes the other
|
|
30
|
+
|
|
31
|
+
_TOPIC_KEYWORDS: dict[str, list[str]] = {
|
|
32
|
+
# Databases
|
|
33
|
+
"database": ["postgresql", "postgres", "mysql", "sqlite", "mongodb",
|
|
34
|
+
"dynamodb", "cassandra", "cockroachdb", "mariadb",
|
|
35
|
+
"database", "db choice", "storage backend", "data store"],
|
|
36
|
+
"cache_backend": ["redis", "memcached", "valkey", "dragonfly",
|
|
37
|
+
"cache backend", "caching layer", "session store"],
|
|
38
|
+
"search": ["elasticsearch", "opensearch", "meilisearch", "typesense",
|
|
39
|
+
"algolia", "search engine", "full-text"],
|
|
40
|
+
|
|
41
|
+
# Auth
|
|
42
|
+
"auth_mechanism": ["jwt", "session", "cookie", "oauth", "saml", "paseto",
|
|
43
|
+
"auth token", "authentication method", "token type"],
|
|
44
|
+
"password_hashing": ["bcrypt", "argon2", "scrypt", "pbkdf2", "password hash"],
|
|
45
|
+
|
|
46
|
+
# Frameworks
|
|
47
|
+
"web_framework": ["fastapi", "flask", "django", "express", "hono", "gin", "fiber",
|
|
48
|
+
"rails", "laravel", "spring", "nestjs", "nest.js",
|
|
49
|
+
"web framework", "backend framework", "api framework"],
|
|
50
|
+
"frontend": ["react", "vue", "angular", "svelte", "nextjs", "next.js",
|
|
51
|
+
"nuxt", "remix", "astro", "gatsby", "vite",
|
|
52
|
+
"frontend framework", "ui framework", "frontend", "client side"],
|
|
53
|
+
"orm": ["sqlalchemy", "tortoise", "peewee", "prisma", "typeorm",
|
|
54
|
+
"sequelize", "orm", "query builder"],
|
|
55
|
+
|
|
56
|
+
# Infrastructure
|
|
57
|
+
"deployment": ["docker", "kubernetes", "k8s", "railway", "render",
|
|
58
|
+
"heroku", "fly.io", "aws", "gcp", "azure", "vercel",
|
|
59
|
+
"netlify", "deployment platform", "hosting"],
|
|
60
|
+
"queue": ["celery", "arq", "rq", "kafka", "rabbitmq", "sqs",
|
|
61
|
+
"task queue", "message queue", "job queue"],
|
|
62
|
+
"storage": ["s3", "cloudinary", "gcs", "azure blob", "minio",
|
|
63
|
+
"file storage", "object storage", "media storage"],
|
|
64
|
+
|
|
65
|
+
# Language / runtime
|
|
66
|
+
"language": ["python", "typescript", "javascript", "go", "rust",
|
|
67
|
+
"java", "kotlin", "programming language"],
|
|
68
|
+
"runtime": ["node", "deno", "bun", "python version", "runtime"],
|
|
69
|
+
|
|
70
|
+
# Architecture
|
|
71
|
+
"api_style": ["rest", "graphql", "grpc", "trpc", "websocket",
|
|
72
|
+
"api design", "api style"],
|
|
73
|
+
"architecture": ["monolith", "microservice", "serverless", "modular",
|
|
74
|
+
"architecture", "system design"],
|
|
75
|
+
"testing": ["pytest", "jest", "vitest", "cypress", "playwright",
|
|
76
|
+
"testing framework", "test runner"],
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
# Reverse lookup built once at import time: lower-cased keyword → topic bucket.
# Topics are visited in declaration order, so if a keyword were ever listed
# under two topics the later topic would win.
_KEYWORD_TO_TOPIC: dict[str, str] = {
    _kw.lower(): _topic
    for _topic, _keywords in _TOPIC_KEYWORDS.items()
    for _kw in _keywords
}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def classify_topic(label: str, summary: str = "") -> Optional[str]:
    """
    Classify a decision into a topic bucket by keyword lookup.

    The label and summary are joined, lower-cased and tokenised. The tokens
    are then scanned left to right; at each position a two-word phrase is
    tried before the single word, so the earliest mention wins and a phrase
    such as "task queue" or "azure blob" is not shadowed by a single-word
    keyword that appears in it or after it.

    Args:
        label: short decision label, e.g. "Use PostgreSQL for storage".
        summary: optional rationale; searched after the label.

    Returns:
        The topic bucket name, or None when no keyword matches.

    Examples:
        "Use PostgreSQL for storage"        → "database"
        "JWT over sessions"                 → "auth_mechanism"
        "Deploy on Railway"                 → "deployment"
        "Use Next.js."                      → "frontend"
        "Use a task queue backed by Redis"  → "queue"
        "Some custom thing"                 → None
    """
    text = f"{label} {summary}".lower()
    # Punctuation becomes whitespace, except dots: they are kept so that
    # dotted names ("fly.io", "nest.js") and version numbers stay in one token.
    text = re.sub(r"[^\w\s.]", " ", text)

    # Spelling variants folded onto the form used as a keyword.
    aliases = {"next.js": "nextjs", "node.js": "node", "nodejs": "node"}

    words = []
    for raw in text.split():
        # Drop sentence punctuation glued to a token ("postgres." → "postgres")
        # while leaving inner dots untouched.
        token = raw.strip(".")
        if token:
            words.append(aliases.get(token, token))

    last = len(words) - 1
    for i, word in enumerate(words):
        if i < last:
            following = words[i + 1]
            # Hyphens were turned into spaces above, so the hyphen-joined
            # spelling is tried too; this keeps keywords such as "full-text"
            # reachable.
            for phrase in (f"{word} {following}", f"{word}-{following}"):
                topic = _KEYWORD_TO_TOPIC.get(phrase)
                if topic is not None:
                    return topic
        topic = _KEYWORD_TO_TOPIC.get(word)
        if topic is not None:
            return topic

    return None
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def find_contradicting_decisions(
    new_label: str,
    new_summary: str,
    existing_nodes: dict,  # dict[str, MemoryNode]
) -> list[str]:
    """
    Collect the IDs of existing decisions that the incoming decision replaces.

    A node is selected when all of the following hold:
      1. it is a DECISION node,
      2. its status is COMPLETED,
      3. classify_topic() puts it in the same bucket as the incoming decision,
      4. _is_same_decision() does not consider it a duplicate of the incoming
         label.

    When the incoming decision cannot be classified, the lookup is delegated
    to _find_by_word_overlap() instead.

    Args:
        new_label: label of the incoming decision
        new_summary: rationale of the incoming decision
        existing_nodes: current graph nodes, keyed by node ID

    Returns:
        Node IDs to supersede, in the iteration order of existing_nodes.
        Nothing is modified or removed here.
    """
    # Imported at call time rather than at module top, as in the original;
    # presumably to avoid a circular import with the graph module.
    from tokenmizer.graph_memory.graph import NodeStatus, NodeType

    incoming_topic = classify_topic(new_label, new_summary)
    if incoming_topic is None:
        # Unknown category: fall back to plain word overlap on the labels.
        return _find_by_word_overlap(new_label, existing_nodes)

    def _conflicts(node) -> bool:
        """True when `node` is an active decision on the same topic."""
        if node.type != NodeType.DECISION:
            return False
        if node.status != NodeStatus.COMPLETED:
            return False  # not active any more, nothing to supersede
        if classify_topic(node.label, node.summary) != incoming_topic:
            return False
        # Same topic; a restatement of the same decision is not a conflict.
        return not _is_same_decision(new_label, node.label)

    return [
        node_id
        for node_id, node in existing_nodes.items()
        if _conflicts(node)
    ]
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def _find_by_word_overlap(
    new_label: str,
    existing_nodes: dict,
    overlap_threshold: float = 0.6,
) -> list[str]:
    """
    Fallback matcher for decisions whose topic could not be classified.

    Each label is reduced to a set of content words (lower-cased, split on
    non-word characters, stop words and words of two characters or fewer
    removed). An existing COMPLETED decision is selected when the number of
    shared words divided by the size of the larger set reaches
    `overlap_threshold` and _is_same_decision() does not consider it a
    duplicate of the incoming label.

    Args:
        new_label: label of the incoming decision
        existing_nodes: current graph nodes, keyed by node ID
        overlap_threshold: minimum overlap ratio required for a match

    Returns:
        Node IDs to supersede; empty when the incoming label has no content
        words.
    """
    from tokenmizer.graph_memory.graph import NodeStatus, NodeType

    stop_words = frozenset({"use", "using", "the", "a", "an", "for", "to", "in",
                            "on", "with", "and", "or", "of", "is", "are", "we",
                            "our", "this", "that", "it", "be", "have", "will"})

    def _content_words(label: str) -> set:
        """Lower-cased words of `label` that carry meaning for comparison."""
        tokens = re.sub(r"[^\w]", " ", label.lower()).split()
        return {tok for tok in tokens if tok not in stop_words and len(tok) > 2}

    incoming = _content_words(new_label)
    if not incoming:
        return []

    matches = []
    for node_id, node in existing_nodes.items():
        # Only active decisions are candidates.
        if node.type != NodeType.DECISION or node.status != NodeStatus.COMPLETED:
            continue

        candidate = _content_words(node.label)
        if not candidate:
            continue

        ratio = len(incoming & candidate) / max(len(incoming), len(candidate))
        if ratio < overlap_threshold:
            continue
        if not _is_same_decision(new_label, node.label):
            matches.append(node_id)

    return matches
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _is_same_decision(label_a: str, label_b: str) -> bool:
|
|
208
|
+
"""True if two labels are essentially the same decision (dedup check)."""
|
|
209
|
+
def _norm(s: str) -> str:
|
|
210
|
+
s = s.lower().rstrip(".,!?;:")
|
|
211
|
+
# Normalize common tech name variants
|
|
212
|
+
s = re.sub(r"[^\w\s]", " ", s)
|
|
213
|
+
s = s.replace("next js", "nextjs").replace("node js", "nodejs")
|
|
214
|
+
s = s.replace("type script", "typescript").replace("java script", "javascript")
|
|
215
|
+
return re.sub(r"\s+", " ", s).strip()
|
|
216
|
+
|
|
217
|
+
a, b = _norm(label_a), _norm(label_b)
|
|
218
|
+
if a == b:
|
|
219
|
+
return True
|
|
220
|
+
words_a = set(a.split())
|
|
221
|
+
words_b = set(b.split())
|
|
222
|
+
if not words_a or not words_b:
|
|
223
|
+
return False
|
|
224
|
+
overlap = len(words_a & words_b) / max(len(words_a), len(words_b))
|
|
225
|
+
return overlap >= 0.82
|