memplex 3.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. memnex/__init__.py +31 -0
  2. memnex/__main__.py +6 -0
  3. memnex/_plugin/.claude-plugin/plugin.json +24 -0
  4. memnex/_plugin/.mcp.json +9 -0
  5. memnex/_plugin/__init__.py +0 -0
  6. memnex/_plugin/hooks/hooks.json +43 -0
  7. memnex/_plugin/scripts/hook-runner.py +166 -0
  8. memnex/_plugin/skills/mem-explore/SKILL.md +83 -0
  9. memnex/_plugin/skills/mem-manage/SKILL.md +92 -0
  10. memnex/_plugin/skills/mem-search/SKILL.md +85 -0
  11. memnex/_plugin/skills/mem-write/SKILL.md +78 -0
  12. memnex/adapters/__init__.py +14 -0
  13. memnex/adapters/claude_skill.py +169 -0
  14. memnex/adapters/cli.py +525 -0
  15. memnex/adapters/http_api.py +314 -0
  16. memnex/adapters/mcp_server.py +448 -0
  17. memnex/compaction.py +563 -0
  18. memnex/config.py +366 -0
  19. memnex/core/__init__.py +13 -0
  20. memnex/core/associator/__init__.py +8 -0
  21. memnex/core/associator/domain_classifier.py +75 -0
  22. memnex/core/associator/entity_aligner.py +127 -0
  23. memnex/core/associator/ref_linker.py +197 -0
  24. memnex/core/associator/term_mapper.py +77 -0
  25. memnex/core/dictionaries/__init__.py +50 -0
  26. memnex/core/engine.py +667 -0
  27. memnex/core/extractors/__init__.py +15 -0
  28. memnex/core/extractors/docx.py +97 -0
  29. memnex/core/extractors/image.py +233 -0
  30. memnex/core/extractors/markdown.py +139 -0
  31. memnex/core/extractors/pdf.py +133 -0
  32. memnex/core/extractors/vision_mapper.py +131 -0
  33. memnex/core/handlers/__init__.py +7 -0
  34. memnex/core/handlers/clipboard.py +40 -0
  35. memnex/core/handlers/file_handler.py +62 -0
  36. memnex/core/handlers/url_handler.py +132 -0
  37. memnex/llm/__init__.py +25 -0
  38. memnex/llm/enhancer.py +226 -0
  39. memnex/llm/fallback_chain.py +87 -0
  40. memnex/llm/injection_guard.py +178 -0
  41. memnex/llm/provider.py +130 -0
  42. memnex/llm/providers/__init__.py +22 -0
  43. memnex/llm/providers/anthropic.py +135 -0
  44. memnex/llm/providers/local.py +135 -0
  45. memnex/llm/providers/rule_based.py +68 -0
  46. memnex/llm/sanitizer.py +67 -0
  47. memnex/models/__init__.py +68 -0
  48. memnex/models/feedback.py +42 -0
  49. memnex/models/graph.py +33 -0
  50. memnex/models/memory.py +102 -0
  51. memnex/models/misc.py +185 -0
  52. memnex/models/paragraph.py +45 -0
  53. memnex/models/search.py +51 -0
  54. memnex/models/source.py +23 -0
  55. memnex/models/task.py +62 -0
  56. memnex/processing/__init__.py +1 -0
  57. memnex/processing/graph_builder.py +278 -0
  58. memnex/processing/merger/__init__.py +6 -0
  59. memnex/processing/merger/confidence_calculator.py +127 -0
  60. memnex/processing/merger/conflict_resolver.py +116 -0
  61. memnex/retrieval/__init__.py +1 -0
  62. memnex/retrieval/dedup.py +386 -0
  63. memnex/retrieval/embedding.py +289 -0
  64. memnex/retrieval/reranker.py +299 -0
  65. memnex/service.py +902 -0
  66. memnex/storage/__init__.py +65 -0
  67. memnex/storage/base.py +132 -0
  68. memnex/storage/changelog.py +106 -0
  69. memnex/storage/feedback.py +486 -0
  70. memnex/storage/lite/__init__.py +5 -0
  71. memnex/storage/lite/store.py +606 -0
  72. memnex/storage/vector.py +265 -0
  73. memnex/wiki/__init__.py +11 -0
  74. memnex/wiki/community.py +221 -0
  75. memnex/wiki/compiler.py +545 -0
  76. memnex/wiki/generator.py +270 -0
  77. memnex/wiki/search.py +282 -0
  78. memnex/worker.py +412 -0
  79. memplex-3.2.0.dist-info/METADATA +37 -0
  80. memplex-3.2.0.dist-info/RECORD +83 -0
  81. memplex-3.2.0.dist-info/WHEEL +5 -0
  82. memplex-3.2.0.dist-info/entry_points.txt +2 -0
  83. memplex-3.2.0.dist-info/top_level.txt +1 -0
memnex/config.py ADDED
@@ -0,0 +1,366 @@
1
+ """MemNex configuration system.
2
+
3
+ Supports loading from YAML files, environment variable overrides (MEMNEX_*),
4
+ and sensible defaults when no configuration is found.
5
+
6
+ Priority: MEMNEX_* env vars > config.yaml > defaults
7
+ """
8
+
9
+ import logging
10
+ import os
11
+ from dataclasses import dataclass, field, fields
12
+ from pathlib import Path
13
+ from typing import Any, Dict, List, Optional
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # ── Sub-configurations ──────────────────────────────────────────
18
+
19
+
20
@dataclass
class StorageConfig:
    """Selects the storage backend and where it keeps its data.

    Both fields can be overridden with ``MEMNEX_STORAGE_BACKEND`` and
    ``MEMNEX_STORAGE_PATH``.
    """

    # Documented values: lite | standard | enterprise (not validated here).
    backend: str = "standard"
    # Kept verbatim; the leading ``~`` is not expanded by this class.
    path: str = "~/.memnex"
26
+
27
+
28
@dataclass
class EmbeddingConfig:
    """Settings for the embedding model.

    Every field can be overridden with ``MEMNEX_EMBEDDING_<FIELD>``.
    """

    # Documented values: default | bge-m3 | bge-small | openai
    # (not validated here).
    model: str = "default"
    dimension: int = 384
    batch_size: int = 32
    contextual_retrieval: bool = True
    hyde_enabled: bool = True
37
+
38
+
39
def _default_reranker_weights() -> Dict[str, float]:
    """Return a fresh dict holding the built-in reranker weights."""
    return {
        "raw_relevance": 0.25,
        "semantic_similarity": 0.30,
        "recency_decay": 0.15,
        "source_authority": 0.15,
        "frequency": 0.15,
    }


@dataclass
class RerankerConfig:
    """Settings for reranker scoring.

    Individual weights can be overridden with
    ``MEMNEX_RERANKER_WEIGHTS_<KEY>``; the two cross-encoder fields with
    ``MEMNEX_RERANKER_CROSS_ENCODER_ENABLED`` / ``..._MODEL``.
    """

    # Each instance gets its own dict, so mutating one config never
    # leaks into another.
    weights: Dict[str, float] = field(default_factory=_default_reranker_weights)
    cross_encoder_enabled: bool = False
    cross_encoder_model: str = "BAAI/bge-reranker-v2-m3"
52
+
53
+
54
@dataclass
class CompactionConfig:
    """Settings for the compaction pipeline.

    Every field can be overridden with ``MEMNEX_COMPACTION_<FIELD>``.
    """

    dedup_threshold: float = 0.95
    chunk_threshold: int = 20000
    warn_threshold: int = 50000
    hard_limit: int = 500000
    field_max_values: int = 20
    needs_review_ttl_days: int = 30
    prune_confidence_threshold: float = 0.3
    prune_max_age_days: int = 180
    prune_min_access_count: int = 0
    dedup_use_faiss: bool = True
68
+
69
+
70
@dataclass
class GraphConfig:
    """Settings for the graph layer.

    Every field can be overridden with ``MEMNEX_GRAPH_<FIELD>``.
    """

    semantic_similar_threshold: float = 0.85
    semantic_similar_max_edges: int = 10
    semantic_similar_ttl_days: int = 30
    semantic_similar_sync_on_merge: bool = False
    community_detection_enabled: bool = True
    community_min_size: int = 3
80
+
81
+
82
@dataclass
class RetrievalConfig:
    """Settings for retrieval.

    Every field can be overridden with ``MEMNEX_RETRIEVAL_<FIELD>``.
    """

    default_max_tokens: int = 4000
    skill_max_tokens: int = 2000
    injection_scan_enabled: bool = True
89
+
90
+
91
@dataclass
class LLMConfig:
    """Settings for the LLM provider and the features that use it.

    Scalar fields can be overridden with ``MEMNEX_LLM_<FIELD>``.
    ``fallback_chain`` is overridden as a comma-separated list through
    ``MEMNEX_LLM_FALLBACK_CHAIN``.
    """

    # Per-feature switches.
    semantic_extraction: bool = True
    query_enhancement: bool = True
    conflict_resolution: bool = True
    summarization: bool = True
    reranking: bool = True

    # Provider selection and credentials / endpoints.
    provider: str = "anthropic"
    anthropic_api_key: Optional[str] = None
    local_endpoint: Optional[str] = None
    local_model: Optional[str] = None

    # A new list per instance, so configs never share the chain.
    fallback_chain: List[str] = field(default_factory=lambda: ["anthropic"])
    max_input_length: int = 10000
106
+
107
+
108
@dataclass
class ObservationConfig:
    """Rate-limiting settings for observations.

    Overridable with ``MEMNEX_OBSERVATION_MAX_PER_MINUTE``.
    """

    max_per_minute: int = 20
113
+
114
+
115
@dataclass
class LoggingConfig:
    """Logging settings.

    Overridable with ``MEMNEX_LOGGING_LEVEL`` and
    ``MEMNEX_LOGGING_SANITIZE_SENSITIVE``.
    """

    level: str = "INFO"
    sanitize_sensitive: bool = True
121
+
122
+
123
@dataclass
class EncryptionConfig:
    """Encryption settings.

    Overridable with ``MEMNEX_ENCRYPTION_ENABLED`` and
    ``MEMNEX_ENCRYPTION_KEY_PATH``.
    """

    enabled: bool = False
    # Kept verbatim; the leading ``~`` is not expanded by this class.
    key_path: str = "~/.memnex/.enc_key"
129
+
130
+
131
+ # ── Top-level configuration ─────────────────────────────────────
132
+
133
+
134
@dataclass
class MemNexConfig:
    """Root configuration object aggregating every sub-configuration.

    Each attribute holds one section; when omitted, a default-constructed
    instance of that section is created per ``MemNexConfig``.  Prefer
    ``load_config()`` over direct construction so that the YAML file and
    ``MEMNEX_*`` environment overrides are applied.
    """

    storage: StorageConfig = field(default_factory=StorageConfig)
    embedding: EmbeddingConfig = field(default_factory=EmbeddingConfig)
    reranker: RerankerConfig = field(default_factory=RerankerConfig)
    compaction: CompactionConfig = field(default_factory=CompactionConfig)
    graph: GraphConfig = field(default_factory=GraphConfig)
    retrieval: RetrievalConfig = field(default_factory=RetrievalConfig)
    llm: LLMConfig = field(default_factory=LLMConfig)
    observation: ObservationConfig = field(default_factory=ObservationConfig)
    logging: LoggingConfig = field(default_factory=LoggingConfig)
    encryption: EncryptionConfig = field(default_factory=EncryptionConfig)
152
+
153
+
154
+ # ── Env-var mapping ─────────────────────────────────────────────
155
+
156
# Maps a dotted ``"<section>.<key>"`` path to the Python type that the
# matching environment-variable value is coerced to.
# ``_apply_env_overrides()`` derives the variable name from the path as
# ``MEMNEX_<SECTION>_<KEY>`` (upper-cased).
#
# Example: ``MEMNEX_STORAGE_BACKEND=enterprise`` sets
# ``config.storage.backend = "enterprise"``.
#
# Only the paths listed here can be overridden this way.  The non-scalar
# fields ``reranker.weights`` and ``llm.fallback_chain`` are handled
# separately inside ``_apply_env_overrides()``.

_ENV_TYPE_COERCIONS: Dict[str, type] = {
    # StorageConfig
    "storage.backend": str,
    "storage.path": str,
    # EmbeddingConfig
    "embedding.model": str,
    "embedding.dimension": int,
    "embedding.batch_size": int,
    "embedding.contextual_retrieval": bool,
    "embedding.hyde_enabled": bool,
    # RerankerConfig
    "reranker.cross_encoder_enabled": bool,
    "reranker.cross_encoder_model": str,
    # CompactionConfig
    "compaction.dedup_threshold": float,
    "compaction.chunk_threshold": int,
    "compaction.warn_threshold": int,
    "compaction.hard_limit": int,
    "compaction.field_max_values": int,
    "compaction.needs_review_ttl_days": int,
    "compaction.prune_confidence_threshold": float,
    "compaction.prune_max_age_days": int,
    "compaction.prune_min_access_count": int,
    "compaction.dedup_use_faiss": bool,
    # GraphConfig
    "graph.semantic_similar_threshold": float,
    "graph.semantic_similar_max_edges": int,
    "graph.semantic_similar_ttl_days": int,
    "graph.semantic_similar_sync_on_merge": bool,
    "graph.community_detection_enabled": bool,
    "graph.community_min_size": int,
    # RetrievalConfig
    "retrieval.default_max_tokens": int,
    "retrieval.skill_max_tokens": int,
    "retrieval.injection_scan_enabled": bool,
    # LLMConfig
    "llm.semantic_extraction": bool,
    "llm.query_enhancement": bool,
    "llm.conflict_resolution": bool,
    "llm.summarization": bool,
    "llm.reranking": bool,
    "llm.provider": str,
    "llm.anthropic_api_key": str,
    "llm.local_endpoint": str,
    "llm.local_model": str,
    "llm.max_input_length": int,
    # ObservationConfig
    "observation.max_per_minute": int,
    # LoggingConfig
    "logging.level": str,
    "logging.sanitize_sensitive": bool,
    # EncryptionConfig
    "encryption.enabled": bool,
    "encryption.key_path": str,
}
220
+
221
+
222
+ def _coerce(value: str, target_type: type) -> Any:
223
+ """Coerce a string value to the target type."""
224
+ if target_type is bool:
225
+ return value.lower() in ("true", "1", "yes", "on")
226
+ if target_type is int:
227
+ return int(value)
228
+ if target_type is float:
229
+ return float(value)
230
+ return value
231
+
232
+
233
def _apply_env_overrides(config: MemNexConfig) -> None:
    """Apply ``MEMNEX_*`` environment variable overrides to *config* in place.

    Three kinds of variables are honoured:

    * ``MEMNEX_{SECTION}_{KEY}`` (upper-cased) for every path registered in
      ``_ENV_TYPE_COERCIONS``; the value is coerced to the registered type.
    * ``MEMNEX_RERANKER_WEIGHTS_<KEY>`` sets one entry of
      ``config.reranker.weights`` (key lower-cased, value parsed as float),
      e.g. ``MEMNEX_RERANKER_WEIGHTS_RAW_RELEVANCE=0.3``.
    * ``MEMNEX_LLM_FALLBACK_CHAIN`` is a comma-separated list that replaces
      ``config.llm.fallback_chain``; blank items are dropped.

    A value that cannot be coerced is logged at WARNING level and skipped,
    leaving the previous setting untouched, so one malformed variable does
    not abort configuration loading.
    """
    for dotted_path, target_type in _ENV_TYPE_COERCIONS.items():
        section_name, key_name = dotted_path.split(".", 1)
        env_key = f"MEMNEX_{section_name.upper()}_{key_name.upper()}"
        env_value = os.environ.get(env_key)
        if env_value is None:
            continue
        try:
            coerced = _coerce(env_value, target_type)
        except (ValueError, TypeError):
            # Same policy as the weights handling below: warn and keep the
            # existing value instead of crashing on e.g. DIMENSION=abc.
            logger.warning("Invalid value for %s: %s", env_key, env_value)
            continue
        setattr(getattr(config, section_name), key_name, coerced)

    # Handle reranker.weights sub-dict via MEMNEX_RERANKER_WEIGHTS_<KEY>
    weights_env_prefix = "MEMNEX_RERANKER_WEIGHTS_"
    for env_key, env_value in os.environ.items():
        if not env_key.startswith(weights_env_prefix):
            continue
        weight_key = env_key[len(weights_env_prefix):].lower()
        if not weight_key:
            # The bare prefix names no weight; do not create a "" entry.
            continue
        try:
            config.reranker.weights[weight_key] = float(env_value)
        except (ValueError, TypeError):
            logger.warning("Invalid weight value for %s: %s", env_key, env_value)

    # Handle LLM fallback_chain via MEMNEX_LLM_FALLBACK_CHAIN (comma-separated)
    fallback_env = os.environ.get("MEMNEX_LLM_FALLBACK_CHAIN")
    if fallback_env is not None:
        config.llm.fallback_chain = [
            item.strip() for item in fallback_env.split(",") if item.strip()
        ]
265
+
266
+
267
+ # ── YAML loading helpers ────────────────────────────────────────
268
+
269
+
270
def _parse_yaml(path: Path) -> Optional[Dict[str, Any]]:
    """Load *path* as YAML and return its top-level mapping.

    Returns ``None`` when PyYAML is not importable, when *path* does not
    exist, or when the document's top level is not a mapping (this includes
    an empty file).  Parsing uses ``yaml.safe_load``.
    """
    try:
        import yaml  # optional dependency
    except ImportError:
        logger.debug("PyYAML not installed, skipping config file: %s", path)
        return None

    if not path.exists():
        return None

    with path.open("r", encoding="utf-8") as stream:
        document = yaml.safe_load(stream)

    if isinstance(document, dict):
        return document
    return None
285
+
286
+
287
+ def _dict_to_dataclass(cls: type, data: Dict[str, Any]) -> Any:
288
+ """Recursively convert a plain dict to a dataclass instance.
289
+
290
+ Only keys that match known fields are used; unknown keys are silently
291
+ ignored so that config files with extra sections don't crash.
292
+ """
293
+ known_fields = {f.name for f in fields(cls)}
294
+ kwargs: Dict[str, Any] = {}
295
+
296
+ for f in fields(cls):
297
+ if f.name not in data:
298
+ continue
299
+ value = data[f.name]
300
+ field_type = f.type
301
+
302
+ # Resolve string annotations to actual types
303
+ if isinstance(field_type, str):
304
+ # field.type is a string like "StorageConfig"
305
+ # Try to resolve from module globals
306
+ import sys
307
+ field_type = sys.modules.get(cls.__module__, None)
308
+ if field_type is not None:
309
+ field_type = getattr(field_type, f.type, None)
310
+
311
+ # Handle nested dataclasses
312
+ if isinstance(value, dict) and hasattr(field_type, "__dataclass_fields__"):
313
+ kwargs[f.name] = _dict_to_dataclass(field_type, value)
314
+ else:
315
+ kwargs[f.name] = value
316
+
317
+ # Filter out keys not in known_fields to avoid TypeError
318
+ kwargs = {k: v for k, v in kwargs.items() if k in known_fields}
319
+ return cls(**kwargs)
320
+
321
+
322
+ # ── Public API ──────────────────────────────────────────────────
323
+
324
_DEFAULT_CONFIG_PATH = Path("~/.memnex/config.yaml").expanduser()


def load_config(path: Optional[str] = None) -> MemNexConfig:
    """Build the effective MemNex configuration.

    Sources, from highest to lowest priority:

    1. ``MEMNEX_*`` environment variables
    2. The YAML file at *path* (or ``~/.memnex/config.yaml``)
    3. Built-in defaults

    Parameters
    ----------
    path:
        Location of a YAML config file; ``~`` is expanded.  When ``None``
        (or empty) the default location ``~/.memnex/config.yaml`` is used.
        If no usable mapping can be read from the file, defaults apply.

    Returns
    -------
    MemNexConfig
        Fully resolved configuration object.
    """
    if path:
        config_path = Path(path).expanduser()
    else:
        config_path = _DEFAULT_CONFIG_PATH

    overlay = _parse_yaml(config_path)
    if not overlay:
        logger.debug(
            "No config file found at %s, using defaults",
            config_path,
        )
        resolved = MemNexConfig()
    else:
        resolved = _dict_to_dataclass(MemNexConfig, overlay)
        logger.debug("Loaded config from %s", config_path)

    # Environment variables win over both the file and the defaults.
    _apply_env_overrides(resolved)
    return resolved
@@ -0,0 +1,13 @@
1
+ """memnex.core -- pure computation layer.
2
+
3
+ Primary entry point::
4
+
5
+ from memnex.core import CoreEngine
6
+
7
+ engine = CoreEngine()
8
+ extracted = engine.extract(source)
9
+ """
10
+
11
+ from memnex.core.engine import CoreEngine
12
+
13
+ __all__ = ["CoreEngine"]
@@ -0,0 +1,8 @@
1
+ """Association and linking modules."""
2
+
3
+ from .term_mapper import TermMapper
4
+ from .ref_linker import RefLinker
5
+ from .entity_aligner import EntityAligner
6
+ from .domain_classifier import DomainClassifier
7
+
8
+ __all__ = ["TermMapper", "RefLinker", "EntityAligner", "DomainClassifier"]
@@ -0,0 +1,75 @@
1
+ """DomainClassifier - classifies functions into domain categories."""
2
+
3
+ from typing import Dict, List, Optional
4
+ from memnex.models.memory import Function
5
+
6
+
7
class DomainClassifier:
    """Classifies functions into domain categories based on keyword matching.

    Matching is a case-insensitive substring test of every keyword against
    the function's name and the ``desc`` of each entry in its ``trigger``,
    ``condition``, ``action`` and ``benefit`` lists.
    """

    DOMAIN_KEYWORDS: Dict[str, List[str]] = {
        "认证模块": ["登录", "登出", "注册", "密码", "验证码", "认证", "OAuth", "login", "logout", "register", "password", "2FA"],
        "账户模块": ["账户", "账号", "资料", "设置", "偏好", "profile", "account", "settings"],
        "首页模块": ["首页", "仪表盘", "概览", "指标", "展示", "home", "dashboard", "metrics", "KPI"],
        "订单模块": ["订单", "下单", "购物车", "order", "cart", "purchase"],
        "支付模块": ["支付", "付款", "银行卡", "账单", "发票", "payment", "billing", "invoice"],
        "通知模块": ["通知", "邮件", "短信", "消息", "提醒", "notification", "email", "SMS"],
        "报表模块": ["报表", "统计", "导出", "分析", "report", "analytics", "export"],
        "搜索模块": ["搜索", "查询", "过滤", "筛选", "推荐", "search", "filter", "recommendation"],
        "安全模块": ["安全", "权限", "访问", "角色", "加密", "security", "permission", "access", "role"],
        "配置模块": ["配置", "设置项", "参数", "开关", "config", "settings", "feature flag"],
    }

    # Multi-value fields whose entries expose a ``desc`` text.
    _DESC_FIELDS = ("trigger", "condition", "action", "benefit")

    def classify(self, func: "Function") -> str:
        """
        Classify a function into a domain category.

        Every keyword hit adds ``weight + len(keyword) / 10`` to its
        domain's score, where ``weight`` is 2.0 for a hit in the name and
        1.0 for a hit in any description.  The highest-scoring domain wins;
        on a tie the domain listed first in ``DOMAIN_KEYWORDS`` is kept.

        Args:
            func: Function to classify

        Returns:
            Domain name (e.g., "认证模块", "支付模块"), or "通用" when no
            keyword matches at all.
        """
        # (weight, lower-cased text) pairs; lower-case once up front.
        weighted_texts = []
        if func.name:
            weighted_texts.append((2.0, func.name.lower()))
        for field_name in self._DESC_FIELDS:
            for fv in getattr(func, field_name) or ():
                # Skip missing/empty descriptions, mirroring the name guard
                # above; an empty text could never match a keyword anyway.
                if fv.desc:
                    weighted_texts.append((1.0, fv.desc.lower()))

        best_domain = "通用"
        best_score = 0.0
        for domain, keywords in self.DOMAIN_KEYWORDS.items():
            # Hoisted out of the text loop: needle and its length bonus.
            needles = [(kw.lower(), len(kw) / 10.0) for kw in keywords]
            score = 0.0
            for weight, text in weighted_texts:
                for needle, length_bonus in needles:
                    if needle in text:
                        score += weight + length_bonus
            # Strict ">" keeps the first domain on ties and leaves "通用"
            # in place when nothing scored above zero.
            if score > best_score:
                best_domain, best_score = domain, score

        return best_domain

    def classify_with_llm_fallback(self, func: "Function") -> str:
        """
        Classify a function with LLM fallback for ambiguous cases.

        The LLM fallback is not implemented yet, so this currently returns
        exactly what ``classify`` returns (including "通用").
        """
        return self.classify(func)
@@ -0,0 +1,127 @@
1
+ """Entity alignment using fuzzy matching and semantic similarity."""
2
+
3
+ import re
4
+ from typing import List, Dict, Any, Optional, Tuple
5
+ from difflib import SequenceMatcher
6
+
7
+
8
class EntityAligner:
    """Aligns and merges entities from multiple sources.

    Names are compared after normalisation: lower-casing, stripping
    everything except ASCII letters/digits and CJK ideographs, and
    rewriting known alternate terms (Chinese or English) to their
    canonical English form from ``TERM_EQUIVALENCES``.
    """

    TERM_EQUIVALENCES = {
        "login": ["登录", "登入", "认证", "authenticate"],
        "logout": ["登出", "退出", "signout"],
        "register": ["注册", "登记", "signup"],
        "user": ["用户", "user", "users", "member", "会员"],
        "password": ["密码", "password", "pwd"],
        "order": ["订单", "order", "订购"],
        "payment": ["支付", "payment", "pay", "付款"],
    }

    def __init__(self):
        """Build the reverse mapping from alternate terms to canonical forms."""
        self._chinese_to_english = {}
        for english, alternates in self.TERM_EQUIVALENCES.items():
            for alternate in alternates:
                self._chinese_to_english[alternate] = english

        # One alternation, longest term first, so "payment" is preferred
        # over its prefix "pay" and "users" over "user".
        ordered = sorted(self._chinese_to_english, key=len, reverse=True)
        self._term_pattern = (
            re.compile("|".join(re.escape(term) for term in ordered))
            if ordered else None
        )

    def normalize(self, text: str) -> str:
        """Normalize text for comparison, translating known terms to English.

        Substitution happens in a single left-to-right pass, so text that
        was just produced by a replacement is never rewritten again.
        (Sequential ``str.replace`` calls turned "payment" into
        "paymentment" because "pay" re-matched inside the canonical form.)
        """
        normalized = re.sub(r'[^a-z0-9一-鿿]', '', text.lower())
        if self._term_pattern is None:
            return normalized
        return self._term_pattern.sub(
            lambda match: self._chinese_to_english[match.group(0)],
            normalized,
        )

    def calculate_similarity(self, str1: str, str2: str) -> float:
        """Calculate similarity between two strings (0-1).

        Returns 1.0 when the normalized forms are identical, 0.85 when both
        normalized forms belong to the same equivalence entry, and the
        ``difflib.SequenceMatcher`` ratio of the normalized forms otherwise.
        """
        norm1 = self.normalize(str1)
        norm2 = self.normalize(str2)

        if norm1 == norm2:
            return 1.0

        for base, equivalents in self.TERM_EQUIVALENCES.items():
            if norm1 in equivalents or norm1 == base:
                if norm2 in equivalents or norm2 == base:
                    return 0.85

        return SequenceMatcher(None, norm1, norm2).ratio()

    def find_similar(
        self,
        target: str,
        entities: List,
        threshold: float = 0.6
    ) -> List[Tuple[Any, float]]:
        """Find entities similar to target.

        Each entity's ``name`` attribute (missing or falsy is treated as
        ``''``) is scored against *target*.  Entities scoring at least
        *threshold* are returned as ``(entity, score)`` pairs, best first.
        """
        results = []

        for entity in entities:
            entity_name = getattr(entity, 'name', '') or ''
            score = self.calculate_similarity(target, entity_name)
            if score >= threshold:
                results.append((entity, score))

        results.sort(key=lambda pair: pair[1], reverse=True)
        return results

    def find_merge_candidates(
        self,
        entities: List[Dict],
        threshold: float = 0.9
    ) -> List[List[Dict]]:
        """
        Find groups of entities that should be merged.

        Entities are dicts with ``'id'`` and ``'name'`` keys.  They are
        first bucketed by the first character of their normalized name
        ('#' for an empty name) and only compared pairwise inside a bucket.
        This blocking reduces the number of comparisons but the work within
        each bucket is still quadratic in the bucket size.  Entities whose
        normalized names start with different characters are never grouped.

        Returns:
            Groups with at least two members; each entity appears in at
            most one group.
        """
        groups = []
        used = set()

        blocks: Dict[str, List[Dict]] = {}
        for entity in entities:
            normalized = self.normalize(entity['name'])
            first_char = normalized[0] if normalized else '#'
            blocks.setdefault(first_char, []).append(entity)

        for block_entities in blocks.values():
            for i, entity in enumerate(block_entities):
                if entity['id'] in used:
                    continue

                group = [entity]
                used.add(entity['id'])

                for other in block_entities[i + 1:]:
                    if other['id'] in used:
                        continue

                    score = self.calculate_similarity(entity['name'], other['name'])
                    if score >= threshold:
                        group.append(other)
                        used.add(other['id'])

                if len(group) > 1:
                    groups.append(group)

        return groups

    def suggest_merged_name(self, entities: List[Dict]) -> str:
        """Suggest a merged name from multiple entities.

        Returns ``""`` for no entities and the sole name for one entity.
        Otherwise the first name containing a CJK ideograph is preferred;
        failing that, the longest name (first one on ties).
        """
        if not entities:
            return ""

        if len(entities) == 1:
            return entities[0]['name']

        for entity in entities:
            if re.search(r'[一-鿿]', entity['name']):
                return entity['name']

        return max(entities, key=lambda item: len(item['name']))['name']