tokenmizer 0.2.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. tokenmizer/__init__.py +21 -0
  2. tokenmizer/agents/__init__.py +0 -0
  3. tokenmizer/analytics/__init__.py +0 -0
  4. tokenmizer/analytics/engine.py +188 -0
  5. tokenmizer/api/__init__.py +0 -0
  6. tokenmizer/api/app.py +958 -0
  7. tokenmizer/api/rate_limiter.py +110 -0
  8. tokenmizer/checkpoints/__init__.py +0 -0
  9. tokenmizer/checkpoints/manager.py +383 -0
  10. tokenmizer/cli.py +153 -0
  11. tokenmizer/compression/__init__.py +0 -0
  12. tokenmizer/compression/engine.py +669 -0
  13. tokenmizer/compression/output_trimmer.py +95 -0
  14. tokenmizer/compression/window.py +104 -0
  15. tokenmizer/config/__init__.py +0 -0
  16. tokenmizer/config/settings.py +170 -0
  17. tokenmizer/core/__init__.py +0 -0
  18. tokenmizer/core/dto.py +196 -0
  19. tokenmizer/core/errors.py +35 -0
  20. tokenmizer/core/tokenizer.py +96 -0
  21. tokenmizer/dashboard/__init__.py +0 -0
  22. tokenmizer/dashboard/page.py +267 -0
  23. tokenmizer/filters/__init__.py +0 -0
  24. tokenmizer/filters/file_intelligence.py +960 -0
  25. tokenmizer/graph_memory/__init__.py +0 -0
  26. tokenmizer/graph_memory/decision_tracker.py +225 -0
  27. tokenmizer/graph_memory/graph.py +1287 -0
  28. tokenmizer/graph_memory/helpers.py +121 -0
  29. tokenmizer/graph_memory/hybrid_extractor.py +703 -0
  30. tokenmizer/graph_memory/types.py +134 -0
  31. tokenmizer/graph_memory/validator.py +304 -0
  32. tokenmizer/graph_memory/visualization.py +228 -0
  33. tokenmizer/mcp/__init__.py +0 -0
  34. tokenmizer/mcp/server.py +368 -0
  35. tokenmizer/providers/__init__.py +0 -0
  36. tokenmizer/providers/providers.py +456 -0
  37. tokenmizer/security/__init__.py +0 -0
  38. tokenmizer/security/auth.py +95 -0
  39. tokenmizer/security/middleware.py +138 -0
  40. tokenmizer/security/redaction.py +126 -0
  41. tokenmizer/semantic_cache/__init__.py +0 -0
  42. tokenmizer/semantic_cache/cache.py +383 -0
  43. tokenmizer/state/__init__.py +0 -0
  44. tokenmizer/state/backend.py +137 -0
  45. tokenmizer/storage/__init__.py +56 -0
  46. tokenmizer-0.2.4.dist-info/METADATA +529 -0
  47. tokenmizer-0.2.4.dist-info/RECORD +50 -0
  48. tokenmizer-0.2.4.dist-info/WHEEL +4 -0
  49. tokenmizer-0.2.4.dist-info/entry_points.txt +2 -0
  50. tokenmizer-0.2.4.dist-info/licenses/LICENSE +21 -0
File without changes
@@ -0,0 +1,225 @@
1
+ """
2
+ Decision Topic Classifier
3
+ tokenmizer/graph_memory/decision_tracker.py
4
+
5
+ Problem being solved:
6
+ User says "use PostgreSQL" → Decision node created
7
+ Later says "actually use MySQL instead" → NEW Decision node created
8
+ Now graph has BOTH. Resume shows BOTH. LLM gets confused.
9
+
10
+ Solution:
11
+ Every new decision is classified into a topic bucket.
12
+ If an existing decision covers the same topic → mark it MODIFIED (superseded).
13
+ Resume shows only ACTIVE decisions. History preserved in graph for rollback.
14
+
15
+ Topic detection approach:
16
+ 1. Keyword matching on known tech categories (fast, no LLM needed)
17
+ 2. Word overlap for unknown topics (fallback)
18
+
19
+ This runs on every add_node(NodeType.DECISION, ...) call.
20
+ Zero external dependencies. Zero LLM calls. ~0.1ms per check.
21
+ """
22
+ from __future__ import annotations
23
+
24
+ import re
25
+ from typing import Optional
26
+
27
+ # ── Topic taxonomy ────────────────────────────────────────────────────────────
28
+ # Maps keywords → topic bucket name
29
+ # When two decisions share a bucket → one supersedes the other
30
+
31
+ _TOPIC_KEYWORDS: dict[str, list[str]] = {
32
+ # Databases
33
+ "database": ["postgresql", "postgres", "mysql", "sqlite", "mongodb",
34
+ "dynamodb", "cassandra", "cockroachdb", "mariadb",
35
+ "database", "db choice", "storage backend", "data store"],
36
+ "cache_backend": ["redis", "memcached", "valkey", "dragonfly",
37
+ "cache backend", "caching layer", "session store"],
38
+ "search": ["elasticsearch", "opensearch", "meilisearch", "typesense",
39
+ "algolia", "search engine", "full-text"],
40
+
41
+ # Auth
42
+ "auth_mechanism": ["jwt", "session", "cookie", "oauth", "saml", "paseto",
43
+ "auth token", "authentication method", "token type"],
44
+ "password_hashing": ["bcrypt", "argon2", "scrypt", "pbkdf2", "password hash"],
45
+
46
+ # Frameworks
47
+ "web_framework": ["fastapi", "flask", "django", "express", "hono", "gin", "fiber",
48
+ "rails", "laravel", "spring", "nestjs", "nest.js",
49
+ "web framework", "backend framework", "api framework"],
50
+ "frontend": ["react", "vue", "angular", "svelte", "nextjs", "next.js",
51
+ "nuxt", "remix", "astro", "gatsby", "vite",
52
+ "frontend framework", "ui framework", "frontend", "client side"],
53
+ "orm": ["sqlalchemy", "tortoise", "peewee", "prisma", "typeorm",
54
+ "sequelize", "orm", "query builder"],
55
+
56
+ # Infrastructure
57
+ "deployment": ["docker", "kubernetes", "k8s", "railway", "render",
58
+ "heroku", "fly.io", "aws", "gcp", "azure", "vercel",
59
+ "netlify", "deployment platform", "hosting"],
60
+ "queue": ["celery", "arq", "rq", "kafka", "rabbitmq", "sqs",
61
+ "task queue", "message queue", "job queue"],
62
+ "storage": ["s3", "cloudinary", "gcs", "azure blob", "minio",
63
+ "file storage", "object storage", "media storage"],
64
+
65
+ # Language / runtime
66
+ "language": ["python", "typescript", "javascript", "go", "rust",
67
+ "java", "kotlin", "programming language"],
68
+ "runtime": ["node", "deno", "bun", "python version", "runtime"],
69
+
70
+ # Architecture
71
+ "api_style": ["rest", "graphql", "grpc", "trpc", "websocket",
72
+ "api design", "api style"],
73
+ "architecture": ["monolith", "microservice", "serverless", "modular",
74
+ "architecture", "system design"],
75
+ "testing": ["pytest", "jest", "vitest", "cypress", "playwright",
76
+ "testing framework", "test runner"],
77
+ }
78
+
79
+ # Reverse map: keyword → topic
80
+ _KEYWORD_TO_TOPIC: dict[str, str] = {}
81
+ for _topic, _keywords in _TOPIC_KEYWORDS.items():
82
+ for _kw in _keywords:
83
+ _KEYWORD_TO_TOPIC[_kw.lower()] = _topic
84
+
85
+
86
+ def classify_topic(label: str, summary: str = "") -> Optional[str]:
87
+ """
88
+ Classify a decision label into a topic bucket.
89
+ Returns topic string if matched, None if unknown.
90
+
91
+ Examples:
92
+ "Use PostgreSQL for storage" → "database"
93
+ "JWT over sessions" → "auth_mechanism"
94
+ "Deploy on Railway" → "deployment"
95
+ "Use Next.js." → "frontend" (trailing period stripped)
96
+ "Some custom thing" → None
97
+ """
98
+ # Strip trailing punctuation before classification
99
+ label = label.rstrip(".,!?;:")
100
+ text = (label + " " + summary).lower()
101
+ # Remove punctuation for matching — but preserve version numbers
102
+ text = re.sub(r"[^\w\s\.]", " ", text)
103
+ # "next.js" → "nextjs" for matching
104
+ text = text.replace("next.js", "nextjs").replace("node.js", "nodejs")
105
+ words = text.split()
106
+
107
+ # Check single words first
108
+ for word in words:
109
+ if word in _KEYWORD_TO_TOPIC:
110
+ return _KEYWORD_TO_TOPIC[word]
111
+
112
+ # Check bigrams (e.g. "task queue", "message broker")
113
+ for i in range(len(words) - 1):
114
+ bigram = words[i] + " " + words[i + 1]
115
+ if bigram in _KEYWORD_TO_TOPIC:
116
+ return _KEYWORD_TO_TOPIC[bigram]
117
+
118
+ return None
119
+
120
+
121
+ def find_contradicting_decisions(
122
+ new_label: str,
123
+ new_summary: str,
124
+ existing_nodes: dict, # dict[str, MemoryNode]
125
+ ) -> list[str]:
126
+ """
127
+ Find existing decision nodes that cover the same topic as the new decision.
128
+ Returns list of node IDs to mark as SUPERSEDED (history preserved, never deleted).
129
+
130
+ Only returns decisions that:
131
+ 1. Are currently COMPLETED (active)
132
+ 2. Cover the same topic bucket
133
+ 3. Are NOT the same decision (not a duplicate)
134
+
135
+ Args:
136
+ new_label: label of the incoming decision
137
+ new_summary: rationale of the incoming decision
138
+ existing_nodes: current graph nodes
139
+
140
+ Returns:
141
+ List of node IDs to supersede
142
+ """
143
+ from tokenmizer.graph_memory.graph import NodeStatus, NodeType
144
+
145
+ new_topic = classify_topic(new_label, new_summary)
146
+
147
+ # If topic unknown, use word overlap as fallback
148
+ if new_topic is None:
149
+ return _find_by_word_overlap(new_label, existing_nodes)
150
+
151
+ to_supersede = []
152
+ for node_id, node in existing_nodes.items():
153
+ if node.type != NodeType.DECISION:
154
+ continue
155
+ if node.status not in (NodeStatus.COMPLETED,):
156
+ continue # already superseded/archived/invalidated — skip
157
+
158
+ existing_topic = classify_topic(node.label, node.summary)
159
+ if existing_topic == new_topic:
160
+ # Same topic — check it's not the same decision
161
+ if not _is_same_decision(new_label, node.label):
162
+ to_supersede.append(node_id)
163
+
164
+ return to_supersede
165
+
166
+
167
+ def _find_by_word_overlap(
168
+ new_label: str,
169
+ existing_nodes: dict,
170
+ overlap_threshold: float = 0.6,
171
+ ) -> list[str]:
172
+ """
173
+ Fallback: find decisions with high word overlap (same topic, unknown category).
174
+ Only used when topic classification returns None.
175
+ """
176
+ from tokenmizer.graph_memory.graph import NodeStatus, NodeType
177
+
178
+ _STOP = frozenset({"use", "using", "the", "a", "an", "for", "to", "in",
179
+ "on", "with", "and", "or", "of", "is", "are", "we",
180
+ "our", "this", "that", "it", "be", "have", "will"})
181
+
182
+ new_words = {w for w in re.sub(r"[^\w]", " ", new_label.lower()).split()
183
+ if w not in _STOP and len(w) > 2}
184
+
185
+ if not new_words:
186
+ return []
187
+
188
+ to_supersede = []
189
+ for node_id, node in existing_nodes.items():
190
+ if node.type != NodeType.DECISION:
191
+ continue
192
+ if node.status != NodeStatus.COMPLETED:
193
+ continue
194
+
195
+ existing_words = {w for w in re.sub(r"[^\w]", " ", node.label.lower()).split()
196
+ if w not in _STOP and len(w) > 2}
197
+ if not existing_words:
198
+ continue
199
+
200
+ overlap = len(new_words & existing_words) / max(len(new_words), len(existing_words))
201
+ if overlap >= overlap_threshold and not _is_same_decision(new_label, node.label):
202
+ to_supersede.append(node_id)
203
+
204
+ return to_supersede
205
+
206
+
207
+ def _is_same_decision(label_a: str, label_b: str) -> bool:
208
+ """True if two labels are essentially the same decision (dedup check)."""
209
+ def _norm(s: str) -> str:
210
+ s = s.lower().rstrip(".,!?;:")
211
+ # Normalize common tech name variants
212
+ s = re.sub(r"[^\w\s]", " ", s)
213
+ s = s.replace("next js", "nextjs").replace("node js", "nodejs")
214
+ s = s.replace("type script", "typescript").replace("java script", "javascript")
215
+ return re.sub(r"\s+", " ", s).strip()
216
+
217
+ a, b = _norm(label_a), _norm(label_b)
218
+ if a == b:
219
+ return True
220
+ words_a = set(a.split())
221
+ words_b = set(b.split())
222
+ if not words_a or not words_b:
223
+ return False
224
+ overlap = len(words_a & words_b) / max(len(words_a), len(words_b))
225
+ return overlap >= 0.82