memplex 3.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memnex/__init__.py +31 -0
- memnex/__main__.py +6 -0
- memnex/_plugin/.claude-plugin/plugin.json +24 -0
- memnex/_plugin/.mcp.json +9 -0
- memnex/_plugin/__init__.py +0 -0
- memnex/_plugin/hooks/hooks.json +43 -0
- memnex/_plugin/scripts/hook-runner.py +166 -0
- memnex/_plugin/skills/mem-explore/SKILL.md +83 -0
- memnex/_plugin/skills/mem-manage/SKILL.md +92 -0
- memnex/_plugin/skills/mem-search/SKILL.md +85 -0
- memnex/_plugin/skills/mem-write/SKILL.md +78 -0
- memnex/adapters/__init__.py +14 -0
- memnex/adapters/claude_skill.py +169 -0
- memnex/adapters/cli.py +525 -0
- memnex/adapters/http_api.py +314 -0
- memnex/adapters/mcp_server.py +448 -0
- memnex/compaction.py +563 -0
- memnex/config.py +366 -0
- memnex/core/__init__.py +13 -0
- memnex/core/associator/__init__.py +8 -0
- memnex/core/associator/domain_classifier.py +75 -0
- memnex/core/associator/entity_aligner.py +127 -0
- memnex/core/associator/ref_linker.py +197 -0
- memnex/core/associator/term_mapper.py +77 -0
- memnex/core/dictionaries/__init__.py +50 -0
- memnex/core/engine.py +667 -0
- memnex/core/extractors/__init__.py +15 -0
- memnex/core/extractors/docx.py +97 -0
- memnex/core/extractors/image.py +233 -0
- memnex/core/extractors/markdown.py +139 -0
- memnex/core/extractors/pdf.py +133 -0
- memnex/core/extractors/vision_mapper.py +131 -0
- memnex/core/handlers/__init__.py +7 -0
- memnex/core/handlers/clipboard.py +40 -0
- memnex/core/handlers/file_handler.py +62 -0
- memnex/core/handlers/url_handler.py +132 -0
- memnex/llm/__init__.py +25 -0
- memnex/llm/enhancer.py +226 -0
- memnex/llm/fallback_chain.py +87 -0
- memnex/llm/injection_guard.py +178 -0
- memnex/llm/provider.py +130 -0
- memnex/llm/providers/__init__.py +22 -0
- memnex/llm/providers/anthropic.py +135 -0
- memnex/llm/providers/local.py +135 -0
- memnex/llm/providers/rule_based.py +68 -0
- memnex/llm/sanitizer.py +67 -0
- memnex/models/__init__.py +68 -0
- memnex/models/feedback.py +42 -0
- memnex/models/graph.py +33 -0
- memnex/models/memory.py +102 -0
- memnex/models/misc.py +185 -0
- memnex/models/paragraph.py +45 -0
- memnex/models/search.py +51 -0
- memnex/models/source.py +23 -0
- memnex/models/task.py +62 -0
- memnex/processing/__init__.py +1 -0
- memnex/processing/graph_builder.py +278 -0
- memnex/processing/merger/__init__.py +6 -0
- memnex/processing/merger/confidence_calculator.py +127 -0
- memnex/processing/merger/conflict_resolver.py +116 -0
- memnex/retrieval/__init__.py +1 -0
- memnex/retrieval/dedup.py +386 -0
- memnex/retrieval/embedding.py +289 -0
- memnex/retrieval/reranker.py +299 -0
- memnex/service.py +902 -0
- memnex/storage/__init__.py +65 -0
- memnex/storage/base.py +132 -0
- memnex/storage/changelog.py +106 -0
- memnex/storage/feedback.py +486 -0
- memnex/storage/lite/__init__.py +5 -0
- memnex/storage/lite/store.py +606 -0
- memnex/storage/vector.py +265 -0
- memnex/wiki/__init__.py +11 -0
- memnex/wiki/community.py +221 -0
- memnex/wiki/compiler.py +545 -0
- memnex/wiki/generator.py +270 -0
- memnex/wiki/search.py +282 -0
- memnex/worker.py +412 -0
- memplex-3.2.0.dist-info/METADATA +37 -0
- memplex-3.2.0.dist-info/RECORD +83 -0
- memplex-3.2.0.dist-info/WHEEL +5 -0
- memplex-3.2.0.dist-info/entry_points.txt +2 -0
- memplex-3.2.0.dist-info/top_level.txt +1 -0
memnex/config.py
ADDED
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
"""MemNex configuration system.
|
|
2
|
+
|
|
3
|
+
Supports loading from YAML files, environment variable overrides (MEMNEX_*),
|
|
4
|
+
and sensible defaults when no configuration is found.
|
|
5
|
+
|
|
6
|
+
Priority: MEMNEX_* env vars > config.yaml > defaults
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import logging
|
|
10
|
+
import os
|
|
11
|
+
from dataclasses import dataclass, field, fields
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any, Dict, List, Optional
|
|
14
|
+
|
|
15
|
+
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
# ── Sub-configurations ──────────────────────────────────────────
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class StorageConfig:
    """Defaults describing which storage backend to use and where it lives.

    Attributes:
        backend: Backend name; the inline comment lists the names the
            source mentions.
        path: Path string kept exactly as given (the ``~`` in the default
            is not expanded by this class).
    """

    backend: str = "standard"  # lite | standard | enterprise
    path: str = "~/.memnex"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class EmbeddingConfig:
    """Defaults for the embedding model settings.

    This module only stores these values and lets the YAML file or
    ``MEMNEX_EMBEDDING_*`` environment variables override them.
    """

    model: str = "default"  # default | bge-m3 | bge-small | openai
    dimension: int = 384
    batch_size: int = 32
    contextual_retrieval: bool = True
    hyde_enabled: bool = True
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class RerankerConfig:
    """Defaults for reranker scoring.

    Attributes:
        weights: Per-signal weight table. A fresh dict is built for every
            instance, so mutating one config never affects another.
        cross_encoder_enabled: Off by default.
        cross_encoder_model: Model identifier string for the cross encoder.
    """

    weights: Dict[str, float] = field(
        default_factory=lambda: {
            "raw_relevance": 0.25,
            "semantic_similarity": 0.30,
            "recency_decay": 0.15,
            "source_authority": 0.15,
            "frequency": 0.15,
        }
    )
    cross_encoder_enabled: bool = False
    cross_encoder_model: str = "BAAI/bge-reranker-v2-m3"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class CompactionConfig:
    """Defaults for the compaction pipeline.

    Holds thresholds and limits only; this module does not interpret them
    beyond type coercion of ``MEMNEX_COMPACTION_*`` overrides.
    """

    # Deduplication / size thresholds.
    dedup_threshold: float = 0.95
    chunk_threshold: int = 20000
    warn_threshold: int = 50000
    hard_limit: int = 500000
    field_max_values: int = 20
    needs_review_ttl_days: int = 30

    # Pruning knobs.
    prune_confidence_threshold: float = 0.3
    prune_max_age_days: int = 180
    prune_min_access_count: int = 0

    dedup_use_faiss: bool = True
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@dataclass
class GraphConfig:
    """Defaults for graph construction settings.

    Two groups of options: the ``semantic_similar_*`` edge settings and the
    ``community_*`` settings.
    """

    # semantic_similar edge settings
    semantic_similar_threshold: float = 0.85
    semantic_similar_max_edges: int = 10
    semantic_similar_ttl_days: int = 30
    semantic_similar_sync_on_merge: bool = False

    # community settings
    community_detection_enabled: bool = True
    community_min_size: int = 3
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@dataclass
class RetrievalConfig:
    """Defaults for retrieval: two token budgets and the injection-scan switch."""

    default_max_tokens: int = 4000
    skill_max_tokens: int = 2000
    injection_scan_enabled: bool = True
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
@dataclass
class LLMConfig:
    """LLM provider configuration.

    Attributes:
        semantic_extraction, query_enhancement, conflict_resolution,
        summarization, reranking: Feature switches, all on by default.
        provider: Name of the primary provider.
        anthropic_api_key: Secret credential. It is excluded from the
            generated ``repr`` so that logging or printing a config object
            does not leak the key; equality and the constructor signature
            are unaffected.
        local_endpoint, local_model: Optional settings for a local provider.
        fallback_chain: Ordered provider names; a fresh list per instance.
        max_input_length: Integer input-length limit.
    """

    semantic_extraction: bool = True
    query_enhancement: bool = True
    conflict_resolution: bool = True
    summarization: bool = True
    reranking: bool = True
    provider: str = "anthropic"
    # repr=False keeps the secret out of repr()/log output.
    anthropic_api_key: Optional[str] = field(default=None, repr=False)
    local_endpoint: Optional[str] = None
    local_model: Optional[str] = None
    fallback_chain: List[str] = field(default_factory=lambda: ["anthropic"])
    max_input_length: int = 10000
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@dataclass
class ObservationConfig:
    """Defaults for observation rate limiting (a single per-minute cap)."""

    max_per_minute: int = 20
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@dataclass
class LoggingConfig:
    """Defaults for logging: a level name and the sensitive-data sanitising switch."""

    level: str = "INFO"
    sanitize_sensitive: bool = True
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@dataclass
class EncryptionConfig:
    """Defaults for encryption: disabled, with a key path under ``~/.memnex``.

    ``key_path`` is stored as given; this class does not expand ``~``.
    """

    enabled: bool = False
    key_path: str = "~/.memnex/.enc_key"
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# ── Top-level configuration ─────────────────────────────────────
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@dataclass
class MemNexConfig:
    """Root configuration object aggregating every sub-configuration.

    Each attribute is one section; a section omitted at construction time is
    populated with that section's own defaults. Instances are normally
    obtained from ``load_config()`` rather than built by hand.
    """

    # One field per section; each default_factory builds an independent
    # sub-config so instances never share mutable state.
    storage: StorageConfig = field(default_factory=StorageConfig)
    embedding: EmbeddingConfig = field(default_factory=EmbeddingConfig)
    reranker: RerankerConfig = field(default_factory=RerankerConfig)
    compaction: CompactionConfig = field(default_factory=CompactionConfig)
    graph: GraphConfig = field(default_factory=GraphConfig)
    retrieval: RetrievalConfig = field(default_factory=RetrievalConfig)
    llm: LLMConfig = field(default_factory=LLMConfig)
    observation: ObservationConfig = field(default_factory=ObservationConfig)
    logging: LoggingConfig = field(default_factory=LoggingConfig)
    encryption: EncryptionConfig = field(default_factory=EncryptionConfig)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# ── Env-var mapping ─────────────────────────────────────────────
|
|
155
|
+
|
|
156
|
+
# Maps each overridable ``"<section>.<key>"`` path to the type used to coerce
|
|
157
|
+
# the string value of its ``MEMNEX_<SECTION>_<KEY>`` environment variable.
|
|
158
|
+
#
|
|
159
|
+
# Example: ``MEMNEX_STORAGE_BACKEND=enterprise`` sets
|
|
160
|
+
# ``config.storage.backend = "enterprise"``.
|
|
161
|
+
#
|
|
162
|
+
# Only the keys listed here are overridable; ``_apply_env_overrides()`` also
|
|
163
|
+
# special-cases ``reranker.weights`` entries and ``llm.fallback_chain``.
|
|
164
|
+
|
|
165
|
+
_ENV_TYPE_COERCIONS: Dict[str, type] = {
|
|
166
|
+
# StorageConfig
|
|
167
|
+
"storage.backend": str,
|
|
168
|
+
"storage.path": str,
|
|
169
|
+
# EmbeddingConfig
|
|
170
|
+
"embedding.model": str,
|
|
171
|
+
"embedding.dimension": int,
|
|
172
|
+
"embedding.batch_size": int,
|
|
173
|
+
"embedding.contextual_retrieval": bool,
|
|
174
|
+
"embedding.hyde_enabled": bool,
|
|
175
|
+
# RerankerConfig
|
|
176
|
+
"reranker.cross_encoder_enabled": bool,
|
|
177
|
+
"reranker.cross_encoder_model": str,
|
|
178
|
+
# CompactionConfig
|
|
179
|
+
"compaction.dedup_threshold": float,
|
|
180
|
+
"compaction.chunk_threshold": int,
|
|
181
|
+
"compaction.warn_threshold": int,
|
|
182
|
+
"compaction.hard_limit": int,
|
|
183
|
+
"compaction.field_max_values": int,
|
|
184
|
+
"compaction.needs_review_ttl_days": int,
|
|
185
|
+
"compaction.prune_confidence_threshold": float,
|
|
186
|
+
"compaction.prune_max_age_days": int,
|
|
187
|
+
"compaction.prune_min_access_count": int,
|
|
188
|
+
"compaction.dedup_use_faiss": bool,
|
|
189
|
+
# GraphConfig
|
|
190
|
+
"graph.semantic_similar_threshold": float,
|
|
191
|
+
"graph.semantic_similar_max_edges": int,
|
|
192
|
+
"graph.semantic_similar_ttl_days": int,
|
|
193
|
+
"graph.semantic_similar_sync_on_merge": bool,
|
|
194
|
+
"graph.community_detection_enabled": bool,
|
|
195
|
+
"graph.community_min_size": int,
|
|
196
|
+
# RetrievalConfig
|
|
197
|
+
"retrieval.default_max_tokens": int,
|
|
198
|
+
"retrieval.skill_max_tokens": int,
|
|
199
|
+
"retrieval.injection_scan_enabled": bool,
|
|
200
|
+
# LLMConfig
|
|
201
|
+
"llm.semantic_extraction": bool,
|
|
202
|
+
"llm.query_enhancement": bool,
|
|
203
|
+
"llm.conflict_resolution": bool,
|
|
204
|
+
"llm.summarization": bool,
|
|
205
|
+
"llm.reranking": bool,
|
|
206
|
+
"llm.provider": str,
|
|
207
|
+
"llm.anthropic_api_key": str,
|
|
208
|
+
"llm.local_endpoint": str,
|
|
209
|
+
"llm.local_model": str,
|
|
210
|
+
"llm.max_input_length": int,
|
|
211
|
+
# ObservationConfig
|
|
212
|
+
"observation.max_per_minute": int,
|
|
213
|
+
# LoggingConfig
|
|
214
|
+
"logging.level": str,
|
|
215
|
+
"logging.sanitize_sensitive": bool,
|
|
216
|
+
# EncryptionConfig
|
|
217
|
+
"encryption.enabled": bool,
|
|
218
|
+
"encryption.key_path": str,
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _coerce(value: str, target_type: type) -> Any:
|
|
223
|
+
"""Coerce a string value to the target type."""
|
|
224
|
+
if target_type is bool:
|
|
225
|
+
return value.lower() in ("true", "1", "yes", "on")
|
|
226
|
+
if target_type is int:
|
|
227
|
+
return int(value)
|
|
228
|
+
if target_type is float:
|
|
229
|
+
return float(value)
|
|
230
|
+
return value
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _apply_env_overrides(config: MemNexConfig) -> None:
    """Apply ``MEMNEX_*`` environment variable overrides to *config* in place.

    Convention: ``MEMNEX_{SECTION}_{KEY}`` (upper-cased), for every
    ``section.key`` listed in ``_ENV_TYPE_COERCIONS``.

    Two settings are handled outside that table:

    * ``reranker.weights``: individual entries are set via
      ``MEMNEX_RERANKER_WEIGHTS_<KEY>``, e.g.
      ``MEMNEX_RERANKER_WEIGHTS_RAW_RELEVANCE=0.3``.
    * ``llm.fallback_chain``: ``MEMNEX_LLM_FALLBACK_CHAIN`` is a
      comma-separated list; blank items are dropped.

    A value that cannot be converted to the expected type is logged as a
    warning and skipped, leaving the previous value untouched.
    """
    for dotted_path, target_type in _ENV_TYPE_COERCIONS.items():
        section_name, key_name = dotted_path.split(".", 1)
        env_key = f"MEMNEX_{section_name.upper()}_{key_name.upper()}"
        env_value = os.environ.get(env_key)
        if env_value is None:
            continue
        try:
            coerced = _coerce(env_value, target_type)
        except ValueError:
            # Same policy as the weights loop below: one malformed variable
            # must not abort configuration loading.
            logger.warning("Invalid value for %s: %s", env_key, env_value)
            continue
        setattr(getattr(config, section_name), key_name, coerced)

    # Handle reranker.weights sub-dict via MEMNEX_RERANKER_WEIGHTS_<KEY>
    weights_env_prefix = "MEMNEX_RERANKER_WEIGHTS_"
    for env_key, env_value in os.environ.items():
        if not env_key.startswith(weights_env_prefix):
            continue
        weight_key = env_key[len(weights_env_prefix):].lower()
        if not weight_key:
            # The bare prefix names no weight; don't create an "" entry.
            continue
        try:
            config.reranker.weights[weight_key] = float(env_value)
        except (ValueError, TypeError):
            logger.warning("Invalid weight value for %s: %s", env_key, env_value)

    # Handle LLM fallback_chain via MEMNEX_LLM_FALLBACK_CHAIN (comma-separated)
    fallback_env = os.environ.get("MEMNEX_LLM_FALLBACK_CHAIN")
    if fallback_env is not None:
        config.llm.fallback_chain = [
            item.strip() for item in fallback_env.split(",") if item.strip()
        ]
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
# ── YAML loading helpers ────────────────────────────────────────
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
def _parse_yaml(path: Path) -> Optional[Dict[str, Any]]:
    """Read *path* as YAML and return its top-level mapping.

    Returns ``None`` when PyYAML cannot be imported, when *path* does not
    exist, or when the document's top level is not a mapping (this includes
    an empty file). Parse errors raised by ``yaml.safe_load`` are not caught.
    """
    # PyYAML is optional; the availability check deliberately comes before
    # any filesystem access.
    try:
        import yaml
    except ImportError:
        logger.debug("PyYAML not installed, skipping config file: %s", path)
        return None

    if not path.exists():
        return None

    with path.open("r", encoding="utf-8") as handle:
        parsed = yaml.safe_load(handle)

    if isinstance(parsed, dict):
        return parsed
    return None
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def _dict_to_dataclass(cls: type, data: Dict[str, Any]) -> Any:
|
|
288
|
+
"""Recursively convert a plain dict to a dataclass instance.
|
|
289
|
+
|
|
290
|
+
Only keys that match known fields are used; unknown keys are silently
|
|
291
|
+
ignored so that config files with extra sections don't crash.
|
|
292
|
+
"""
|
|
293
|
+
known_fields = {f.name for f in fields(cls)}
|
|
294
|
+
kwargs: Dict[str, Any] = {}
|
|
295
|
+
|
|
296
|
+
for f in fields(cls):
|
|
297
|
+
if f.name not in data:
|
|
298
|
+
continue
|
|
299
|
+
value = data[f.name]
|
|
300
|
+
field_type = f.type
|
|
301
|
+
|
|
302
|
+
# Resolve string annotations to actual types
|
|
303
|
+
if isinstance(field_type, str):
|
|
304
|
+
# field.type is a string like "StorageConfig"
|
|
305
|
+
# Try to resolve from module globals
|
|
306
|
+
import sys
|
|
307
|
+
field_type = sys.modules.get(cls.__module__, None)
|
|
308
|
+
if field_type is not None:
|
|
309
|
+
field_type = getattr(field_type, f.type, None)
|
|
310
|
+
|
|
311
|
+
# Handle nested dataclasses
|
|
312
|
+
if isinstance(value, dict) and hasattr(field_type, "__dataclass_fields__"):
|
|
313
|
+
kwargs[f.name] = _dict_to_dataclass(field_type, value)
|
|
314
|
+
else:
|
|
315
|
+
kwargs[f.name] = value
|
|
316
|
+
|
|
317
|
+
# Filter out keys not in known_fields to avoid TypeError
|
|
318
|
+
kwargs = {k: v for k, v in kwargs.items() if k in known_fields}
|
|
319
|
+
return cls(**kwargs)
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
# ── Public API ──────────────────────────────────────────────────
|
|
323
|
+
|
|
324
|
+
# Default config file location; ``~`` is expanded once, when this module is imported.
_DEFAULT_CONFIG_PATH = Path("~/.memnex/config.yaml").expanduser()
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def load_config(path: Optional[str] = None) -> MemNexConfig:
    """Build the effective MemNex configuration.

    Sources, from highest to lowest priority:

    1. ``MEMNEX_*`` environment variables
    2. the YAML file at *path* (or ``~/.memnex/config.yaml``)
    3. built-in defaults

    Parameters
    ----------
    path:
        Explicit path to a YAML config file; ``~`` is expanded. When
        ``None`` (or empty) the default location is tried. If no usable
        mapping can be read from the file, defaults are used.

    Returns
    -------
    MemNexConfig
        Fully resolved configuration object.
    """
    # Pick the file to read.
    if path:
        config_path = Path(path).expanduser()
    else:
        config_path = _DEFAULT_CONFIG_PATH

    # Start from the YAML overlay when one is available, else from defaults.
    yaml_data = _parse_yaml(config_path)
    if yaml_data:
        config = _dict_to_dataclass(MemNexConfig, yaml_data)
        logger.debug("Loaded config from %s", config_path)
    else:
        config = MemNexConfig()
        logger.debug(
            "No config file found at %s, using defaults",
            config_path,
        )

    # Environment variables win over everything else.
    _apply_env_overrides(config)
    return config
|
memnex/core/associator/__init__.py
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Association and linking modules."""
|
|
2
|
+
|
|
3
|
+
from .term_mapper import TermMapper
|
|
4
|
+
from .ref_linker import RefLinker
|
|
5
|
+
from .entity_aligner import EntityAligner
|
|
6
|
+
from .domain_classifier import DomainClassifier
|
|
7
|
+
|
|
8
|
+
# Names exported by ``from <package> import *``.
__all__ = ["TermMapper", "RefLinker", "EntityAligner", "DomainClassifier"]
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""DomainClassifier - classifies functions into domain categories."""
|
|
2
|
+
|
|
3
|
+
from typing import Dict, List, Optional
|
|
4
|
+
from memnex.models.memory import Function
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class DomainClassifier:
    """Classifies functions into domain categories by keyword matching.

    Matching is a case-insensitive substring test of every keyword against
    the function's name and the ``desc`` of each entry in its ``trigger``,
    ``condition``, ``action`` and ``benefit`` lists.
    """

    # Returned when no keyword matches at all.
    DEFAULT_DOMAIN: str = "通用"

    DOMAIN_KEYWORDS: Dict[str, List[str]] = {
        "认证模块": [
            "登录", "登出", "注册", "密码", "验证码", "认证",
            "OAuth", "login", "logout", "register", "password", "2FA",
        ],
        "账户模块": ["账户", "账号", "资料", "设置", "偏好", "profile", "account", "settings"],
        "首页模块": ["首页", "仪表盘", "概览", "指标", "展示", "home", "dashboard", "metrics", "KPI"],
        "订单模块": ["订单", "下单", "购物车", "order", "cart", "purchase"],
        "支付模块": ["支付", "付款", "银行卡", "账单", "发票", "payment", "billing", "invoice"],
        "通知模块": ["通知", "邮件", "短信", "消息", "提醒", "notification", "email", "SMS"],
        "报表模块": ["报表", "统计", "导出", "分析", "report", "analytics", "export"],
        "搜索模块": ["搜索", "查询", "过滤", "筛选", "推荐", "search", "filter", "recommendation"],
        "安全模块": ["安全", "权限", "访问", "角色", "加密", "security", "permission", "access", "role"],
        "配置模块": ["配置", "设置项", "参数", "开关", "config", "settings", "feature flag"],
    }

    def classify(self, func: Function) -> str:
        """Classify a function into a domain category.

        Each keyword hit adds a weight (2.0 for a hit in the name, 1.0 for a
        hit in any other text) plus a bonus of ``len(keyword) / 10``. The
        domain with the highest total wins; on a tie the domain declared
        first in ``DOMAIN_KEYWORDS`` is kept.

        Args:
            func: Function to classify. Field lists that are ``None`` and
                entries whose ``desc`` is empty or ``None`` are skipped.

        Returns:
            Domain name (e.g., "认证模块", "支付模块"), or "通用" when
            nothing matches.
        """
        # (weight, lowercased text) pairs; lowercasing once per text instead
        # of once per domain.
        weighted_texts = []
        if func.name:
            weighted_texts.append((2.0, func.name.lower()))
        for field_name in ("trigger", "condition", "action", "benefit"):
            for fv in getattr(func, field_name) or ():
                # A missing description cannot match anything; skipping it
                # also avoids calling .lower() on None.
                if fv.desc:
                    weighted_texts.append((1.0, fv.desc.lower()))

        best_domain = self.DEFAULT_DOMAIN
        best_score = 0.0
        for domain, keywords in self.DOMAIN_KEYWORDS.items():
            score = 0.0
            for weight, text in weighted_texts:
                for keyword in keywords:
                    if keyword.lower() in text:
                        score += weight + len(keyword) / 10.0
            # Strict ">" keeps the earliest domain on ties and requires a
            # positive score to leave the default.
            if score > best_score:
                best_domain, best_score = domain, score

        return best_domain

    def classify_with_llm_fallback(self, func: Function) -> str:
        """Classify a function; intended to fall back to an LLM when ambiguous.

        The LLM fallback is a placeholder: no LLM is called, so the result is
        always exactly what ``classify`` returns (including "通用").
        """
        return self.classify(func)
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Entity alignment using fuzzy matching and semantic similarity."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import List, Dict, Any, Optional, Tuple
|
|
5
|
+
from difflib import SequenceMatcher
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class EntityAligner:
    """Aligns and merges entities from multiple sources.

    Names are compared after normalization: lowercased, stripped of every
    character other than ``a-z``, ``0-9`` and CJK ideographs
    (U+4E00-U+9FFF), and with known synonyms rewritten to a canonical
    English term.
    """

    # canonical term -> alternative spellings (Chinese and English)
    TERM_EQUIVALENCES = {
        "login": ["登录", "登入", "认证", "authenticate"],
        "logout": ["登出", "退出", "signout"],
        "register": ["注册", "登记", "signup"],
        "user": ["用户", "user", "users", "member", "会员"],
        "password": ["密码", "password", "pwd"],
        "order": ["订单", "order", "订购"],
        "payment": ["支付", "payment", "pay", "付款"],
    }

    def __init__(self):
        """Build the reverse mapping from Chinese/alt terms to canonical forms."""
        self._chinese_to_english = {}
        for english, chinese_list in self.TERM_EQUIVALENCES.items():
            for ch in chinese_list:
                self._chinese_to_english[ch] = english

        # One alternation, longest term first, so that "payment" is matched
        # as a whole before its prefix "pay" gets a chance.
        terms = sorted(self._chinese_to_english, key=len, reverse=True)
        self._term_pattern = (
            re.compile("|".join(re.escape(term) for term in terms)) if terms else None
        )

    @staticmethod
    def _entity_name(entity: Any) -> str:
        """Return an entity's name from a ``name`` key or attribute ('' if absent)."""
        if isinstance(entity, dict):
            return entity.get('name') or ''
        return getattr(entity, 'name', '') or ''

    def normalize(self, text: str) -> str:
        """Normalize text for comparison, translating known terms to English.

        The substitution is a single left-to-right pass, so replaced text is
        never scanned again. Chained ``str.replace`` calls would turn
        ``"payment"`` into ``"paymentment"`` because the ``"pay"`` rule
        re-matches inside its own output.
        """
        normalized = re.sub(r'[^a-z0-9\u4e00-\u9fff]', '', text.lower())
        if self._term_pattern is None:
            return normalized
        return self._term_pattern.sub(
            lambda match: self._chinese_to_english[match.group(0)], normalized
        )

    def calculate_similarity(self, str1: str, str2: str) -> float:
        """Calculate similarity between two strings (0-1).

        Returns 1.0 when the normalized forms are identical, which covers
        known synonyms because they normalize to the same canonical term;
        otherwise returns the ``difflib.SequenceMatcher`` ratio of the
        normalized forms.
        """
        norm1 = self.normalize(str1)
        norm2 = self.normalize(str2)

        if norm1 == norm2:
            return 1.0

        # NOTE(review): with the built-in table this branch looks
        # unreachable, since synonyms already normalize to the same string
        # and return 1.0 above. Kept for tables customised by subclasses.
        for base, equivalents in self.TERM_EQUIVALENCES.items():
            if norm1 in equivalents or norm1 == base:
                if norm2 in equivalents or norm2 == base:
                    return 0.85

        return SequenceMatcher(None, norm1, norm2).ratio()

    def find_similar(
        self,
        target: str,
        entities: List,
        threshold: float = 0.6
    ) -> List[Tuple[Any, float]]:
        """Find entities similar to target.

        Args:
            target: Name to search for.
            entities: Objects with a ``name`` attribute or dicts with a
                ``"name"`` key; a missing or empty name is treated as ``""``.
            threshold: Minimum similarity (inclusive) for a match.

        Returns:
            ``(entity, score)`` pairs sorted by descending score.
        """
        results = []

        for entity in entities:
            score = self.calculate_similarity(target, self._entity_name(entity))
            if score >= threshold:
                results.append((entity, score))

        results.sort(key=lambda pair: pair[1], reverse=True)
        return results

    def find_merge_candidates(
        self,
        entities: List[Dict],
        threshold: float = 0.9
    ) -> List[List[Dict]]:
        """Find groups of entities that should be merged.

        Entities are first bucketed ("blocked") by the first character of
        their normalized name, and only entities in the same bucket are
        compared. This reduces the number of comparisons but remains
        quadratic within a bucket, and entities whose normalized names start
        with different characters are never grouped.

        Within a bucket, grouping is greedy: each not-yet-used entity seeds a
        group and collects every later unused entity whose similarity to the
        seed is at least *threshold*.

        Args:
            entities: Dicts with ``"id"`` and ``"name"`` keys.
            threshold: Minimum similarity (inclusive) to join a group.

        Returns:
            Groups containing two or more entities.
        """
        groups = []
        used = set()

        blocks: Dict[str, List[Dict]] = {}
        for entity in entities:
            normalized = self.normalize(entity['name'])
            # '#' collects names that normalize to the empty string.
            first_char = normalized[0] if normalized else '#'
            blocks.setdefault(first_char, []).append(entity)

        for block_entities in blocks.values():
            for i, entity in enumerate(block_entities):
                if entity['id'] in used:
                    continue

                group = [entity]
                used.add(entity['id'])

                for other in block_entities[i + 1:]:
                    if other['id'] in used:
                        continue

                    score = self.calculate_similarity(entity['name'], other['name'])
                    if score >= threshold:
                        group.append(other)
                        used.add(other['id'])

                if len(group) > 1:
                    groups.append(group)

        return groups

    def suggest_merged_name(self, entities: List[Dict]) -> str:
        """Suggest a merged name from multiple entities.

        Returns ``""`` for an empty list and the only name for a single
        entity. Otherwise the first name containing a CJK ideograph wins; if
        there is none, the longest name is returned.
        """
        if not entities:
            return ""

        if len(entities) == 1:
            return entities[0]['name']

        for entity in entities:
            if re.search(r'[\u4e00-\u9fff]', entity['name']):
                return entity['name']

        return max(entities, key=lambda x: len(x['name']))['name']
|