memplex 3.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- memnex/__init__.py +31 -0
- memnex/__main__.py +6 -0
- memnex/_plugin/.claude-plugin/plugin.json +24 -0
- memnex/_plugin/.mcp.json +9 -0
- memnex/_plugin/__init__.py +0 -0
- memnex/_plugin/hooks/hooks.json +43 -0
- memnex/_plugin/scripts/hook-runner.py +166 -0
- memnex/_plugin/skills/mem-explore/SKILL.md +83 -0
- memnex/_plugin/skills/mem-manage/SKILL.md +92 -0
- memnex/_plugin/skills/mem-search/SKILL.md +85 -0
- memnex/_plugin/skills/mem-write/SKILL.md +78 -0
- memnex/adapters/__init__.py +14 -0
- memnex/adapters/claude_skill.py +169 -0
- memnex/adapters/cli.py +525 -0
- memnex/adapters/http_api.py +314 -0
- memnex/adapters/mcp_server.py +448 -0
- memnex/compaction.py +563 -0
- memnex/config.py +366 -0
- memnex/core/__init__.py +13 -0
- memnex/core/associator/__init__.py +8 -0
- memnex/core/associator/domain_classifier.py +75 -0
- memnex/core/associator/entity_aligner.py +127 -0
- memnex/core/associator/ref_linker.py +197 -0
- memnex/core/associator/term_mapper.py +77 -0
- memnex/core/dictionaries/__init__.py +50 -0
- memnex/core/engine.py +667 -0
- memnex/core/extractors/__init__.py +15 -0
- memnex/core/extractors/docx.py +97 -0
- memnex/core/extractors/image.py +233 -0
- memnex/core/extractors/markdown.py +139 -0
- memnex/core/extractors/pdf.py +133 -0
- memnex/core/extractors/vision_mapper.py +131 -0
- memnex/core/handlers/__init__.py +7 -0
- memnex/core/handlers/clipboard.py +40 -0
- memnex/core/handlers/file_handler.py +62 -0
- memnex/core/handlers/url_handler.py +132 -0
- memnex/llm/__init__.py +25 -0
- memnex/llm/enhancer.py +226 -0
- memnex/llm/fallback_chain.py +87 -0
- memnex/llm/injection_guard.py +178 -0
- memnex/llm/provider.py +130 -0
- memnex/llm/providers/__init__.py +22 -0
- memnex/llm/providers/anthropic.py +135 -0
- memnex/llm/providers/local.py +135 -0
- memnex/llm/providers/rule_based.py +68 -0
- memnex/llm/sanitizer.py +67 -0
- memnex/models/__init__.py +68 -0
- memnex/models/feedback.py +42 -0
- memnex/models/graph.py +33 -0
- memnex/models/memory.py +102 -0
- memnex/models/misc.py +185 -0
- memnex/models/paragraph.py +45 -0
- memnex/models/search.py +51 -0
- memnex/models/source.py +23 -0
- memnex/models/task.py +62 -0
- memnex/processing/__init__.py +1 -0
- memnex/processing/graph_builder.py +278 -0
- memnex/processing/merger/__init__.py +6 -0
- memnex/processing/merger/confidence_calculator.py +127 -0
- memnex/processing/merger/conflict_resolver.py +116 -0
- memnex/retrieval/__init__.py +1 -0
- memnex/retrieval/dedup.py +386 -0
- memnex/retrieval/embedding.py +289 -0
- memnex/retrieval/reranker.py +299 -0
- memnex/service.py +902 -0
- memnex/storage/__init__.py +65 -0
- memnex/storage/base.py +132 -0
- memnex/storage/changelog.py +106 -0
- memnex/storage/feedback.py +486 -0
- memnex/storage/lite/__init__.py +5 -0
- memnex/storage/lite/store.py +606 -0
- memnex/storage/vector.py +265 -0
- memnex/wiki/__init__.py +11 -0
- memnex/wiki/community.py +221 -0
- memnex/wiki/compiler.py +545 -0
- memnex/wiki/generator.py +270 -0
- memnex/wiki/search.py +282 -0
- memnex/worker.py +412 -0
- memplex-3.2.0.dist-info/METADATA +37 -0
- memplex-3.2.0.dist-info/RECORD +83 -0
- memplex-3.2.0.dist-info/WHEEL +5 -0
- memplex-3.2.0.dist-info/entry_points.txt +2 -0
- memplex-3.2.0.dist-info/top_level.txt +1 -0
memnex/models/task.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Task and compaction types."""
|
|
2
|
+
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import List, Optional, Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class BackgroundTask(Enum):
    """Kinds of background task; each member's value is its string identifier."""

    # Document ingestion and indexing.
    EXTRACT_DOCUMENT = "extract_document"
    BUILD_INDEX = "build_index"

    # Derived artefacts.
    COMPILE_WIKI = "compile_wiki"
    REFRESH_VECTOR = "refresh_vector"

    # Maintenance.
    COMPACTION = "compaction"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TaskStatus(Enum):
    """Status values for a task; each member's value is its lowercase name."""

    # Not finished yet.
    PENDING = "pending"
    RUNNING = "running"

    # Finished, one way or another.
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@dataclass
class TaskInfo:
    """Record describing one background task.

    The first four fields are required; the remaining ones default to
    ``None`` (or a number for the retry counters) and can be filled in
    later.
    """

    # --- required ---
    task_id: str
    task_type: BackgroundTask
    status: TaskStatus
    created_at: datetime

    # --- optional, all default to "not set" ---
    completed_at: Optional[datetime] = None
    payload: Optional[dict] = None
    result: Any = None
    error: Optional[str] = None

    # --- retry counters ---
    retry_count: int = 0
    max_retries: int = 3
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class CompactionScope(Enum):
    """Scope selector for compaction: session, project or global."""

    SESSION = "session"
    PROJECT = "project"
    GLOBAL = "global"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass
class CompactionStageResult:
    """Counters reported for a single compaction stage."""

    # Name of the stage these counters belong to.
    stage: str

    # Item counters for the stage.
    processed: int
    removed: int
    merged: int

    # Stage duration in milliseconds.
    duration_ms: int

    # Abort flag for the stage; defaults to False.
    abort: bool = False
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class CompactionResult:
    """Overall totals for a compaction, plus optional per-stage results."""

    # Totals.
    total_processed: int
    total_removed: int
    total_merged: int

    # Duration in milliseconds.
    duration_ms: int

    # Per-stage results; every instance gets its own fresh list.
    stages: List[CompactionStageResult] = field(default_factory=list)

    # Skip flag; defaults to False.
    skipped: bool = False
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Data processing pipeline: association, merging, graph building."""
|
|
@@ -0,0 +1,278 @@
|
|
|
1
|
+
"""GraphBuilder -- construct graph edges from Function nodes.
|
|
2
|
+
|
|
3
|
+
Detects REFERENCES, DEPENDS_ON, CONFLICTS_WITH, ASSOCIATED_WITH, and
|
|
4
|
+
BELONGS_TO edges by analysing cross-references, name patterns, and
|
|
5
|
+
domain membership.
|
|
6
|
+
|
|
7
|
+
Works with :class:`MemoryStore` for persistence, unlike the legacy
|
|
8
|
+
``merger/graph_builder.py`` which was single-run only.
|
|
9
|
+
|
|
10
|
+
Usage::
|
|
11
|
+
|
|
12
|
+
builder = GraphBuilder(store, config)
|
|
13
|
+
edges = builder.process(func, existing_graph)
|
|
14
|
+
builder.build_from_batch(functions)
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
import re
|
|
21
|
+
from datetime import datetime
|
|
22
|
+
from typing import List, Optional, Set, TYPE_CHECKING
|
|
23
|
+
|
|
24
|
+
from memnex.models import (
|
|
25
|
+
EdgeType,
|
|
26
|
+
Function,
|
|
27
|
+
GraphData,
|
|
28
|
+
GraphEdge,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
if TYPE_CHECKING:
|
|
32
|
+
from memnex.config import MemNexConfig, GraphConfig
|
|
33
|
+
from memnex.storage.base import MemoryStore
|
|
34
|
+
|
|
35
|
+
logger = logging.getLogger(__name__)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class GraphBuilder:
    """Build graph edges for Function nodes.

    Parameters
    ----------
    store:
        Active :class:`MemoryStore` backend (used for name lookups).
    config:
        Optional :class:`MemNexConfig` (reads ``graph`` sub-config).

    Notes
    -----
    The list of stored Functions is fetched from the store once and
    cached on the instance.  :meth:`build_from_batch` refreshes that
    cache at the start of every batch.  Callers that only use
    :meth:`process` must call :meth:`invalidate_cache` themselves after
    the store contents change.
    """

    # Upper bound handed to ``store.list_functions`` when loading "all"
    # functions.
    _LIST_LIMIT = 100000

    def __init__(
        self,
        store: MemoryStore,
        config: Optional[MemNexConfig] = None,
    ) -> None:
        self._store = store
        self._config = config
        self._graph_config: Optional[GraphConfig] = (
            config.graph if config else None
        )
        # ``None`` means "not loaded yet"; see ``_get_all_funcs``.
        self._funcs_cache: Optional[List[Function]] = None

    # ── Public API ──────────────────────────────────────────────────

    def process(
        self,
        func: Function,
        existing_graph: Optional[GraphData] = None,
    ) -> List[GraphEdge]:
        """Detect and return edges for a single Function.

        Edges are emitted in a fixed order: REFERENCES, DEPENDS_ON,
        CONFLICTS_WITH, BELONGS_TO, ASSOCIATED_WITH.  An edge is skipped
        when an edge with the same ``(source, target, edge_type)`` is
        already in *existing_graph* or was emitted earlier in this call.
        CONFLICTS_WITH and ASSOCIATED_WITH are treated as symmetric, so
        the reverse direction also counts as a duplicate.

        Parameters
        ----------
        func:
            The Function to analyse.
        existing_graph:
            Current graph state (used to avoid duplicate edges).  If
            ``None``, edges are computed from scratch.

        Returns
        -------
        List[GraphEdge]
            Only the newly detected edges; *existing_graph* is not
            modified.
        """
        edges: List[GraphEdge] = []
        seen = self._edge_set(existing_graph)

        # 1. REFERENCES -- from cross_references field
        for ref in func.cross_references or []:
            if isinstance(ref, dict):
                target_id = ref.get("target_id", "")
                target_name = ref.get("target", "")
            else:
                # Non-dict entries are treated as a bare target name.
                target_id = ""
                target_name = str(ref)

            if target_id:
                evidence = f"cross_reference from {func.name}"
            elif target_name:
                target_id = self._resolve_by_name(target_name)
                if not target_id:
                    # Name could not be resolved to a stored Function.
                    continue
                evidence = f"cross_reference: {func.name} -> {target_name}"
            else:
                continue

            self._emit_edge(
                edges, seen,
                source=func.id,
                target=target_id,
                edge_type=EdgeType.REFERENCES.value,
                evidence=[evidence],
            )

        # Every stored Function except the one being analysed.
        others = [o for o in self._get_all_funcs() if o.id != func.id]

        # 2. DEPENDS_ON -- from action/trigger text mentioning another name
        for other in others:
            if self._has_name_reference(func, other):
                self._emit_edge(
                    edges, seen,
                    source=func.id,
                    target=other.id,
                    edge_type=EdgeType.DEPENDS_ON.value,
                    evidence=[f"{func.name} references {other.name}"],
                )

        # 3. CONFLICTS_WITH -- same domain, identical trigger description
        for other in others:
            if self._detect_conflict(func, other):
                self._emit_edge(
                    edges, seen,
                    source=func.id,
                    target=other.id,
                    edge_type=EdgeType.CONFLICTS_WITH.value,
                    evidence=[f"conflicting definitions in domain {func.domain or 'unknown'}"],
                    symmetric=True,
                )

        if func.domain:
            # 4. BELONGS_TO -- domain membership
            domain_id = f"domain_{func.domain.replace(' ', '_').lower()}"
            self._emit_edge(
                edges, seen,
                source=func.id,
                target=domain_id,
                edge_type=EdgeType.BELONGS_TO.value,
                evidence=[f"{func.name} belongs to {func.domain}"],
            )

            # 5. ASSOCIATED_WITH -- shared domain with other functions
            for other in others:
                if other.domain == func.domain:
                    self._emit_edge(
                        edges, seen,
                        source=func.id,
                        target=other.id,
                        edge_type=EdgeType.ASSOCIATED_WITH.value,
                        weight=0.5,
                        evidence=[f"shared domain: {func.domain}"],
                        symmetric=True,
                    )

        return edges

    def build_from_batch(
        self,
        funcs: List[Function],
    ) -> List[GraphEdge]:
        """Build edges for a batch of Functions.

        The graph is built incrementally: each Function sees edges
        from previously processed Functions in the same batch, so no
        duplicate (or reversed symmetric) edge is emitted twice.

        The cached function list is refreshed from the store at the
        start of the batch.  Candidate targets come from the store, not
        from *funcs*.

        NOTE(review): batch members presumably need to be persisted
        before this call to be linked to each other -- confirm against
        the caller.
        """
        # Drop any list cached by an earlier batch so this batch works
        # on current store contents.
        self.invalidate_cache()

        all_edges: List[GraphEdge] = []
        accumulated_graph = GraphData(nodes=[], edges=[])

        for func in funcs:
            accumulated_graph.nodes.append(func)
            new_edges = self.process(func, accumulated_graph)
            all_edges.extend(new_edges)
            accumulated_graph.edges.extend(new_edges)

        return all_edges

    # ── Edge detection helpers ──────────────────────────────────────

    def _emit_edge(
        self,
        edges: List[GraphEdge],
        seen: Set[tuple],
        *,
        source: str,
        target: str,
        edge_type: str,
        evidence: List[str],
        weight: float = 1.0,
        symmetric: bool = False,
    ) -> None:
        """Append a new edge to *edges* unless its key is already in *seen*.

        With ``symmetric=True`` the reversed ``(target, source)`` key also
        counts as already present.  *seen* is updated in place.
        """
        key = (source, target, edge_type)
        if key in seen:
            return
        if symmetric and (target, source, edge_type) in seen:
            return
        edges.append(self._make_edge(
            source=source,
            target=target,
            edge_type=edge_type,
            weight=weight,
            evidence=evidence,
        ))
        seen.add(key)

    def _has_name_reference(self, source: Function, target: Function) -> bool:
        """Check if *source* mentions *target*'s name in its action or trigger text.

        The match is a case-insensitive substring test against each
        ``desc``.  An empty or missing target name never matches.
        """
        target_name = (target.name or "").lower()
        if not target_name:
            return False
        for field_values in (source.action, source.trigger):
            if any(target_name in fv.desc.lower() for fv in field_values):
                return True
        return False

    def _detect_conflict(self, a: Function, b: Function) -> bool:
        """Detect if two Functions in the same domain conflict.

        Conflict heuristic:
        - Same domain (non-empty)
        - At least one trigger description that is identical in both
          Functions after lower-casing (exact match, not substring)
        """
        if not a.domain or a.domain != b.domain:
            return False
        a_triggers = {fv.desc.lower() for fv in a.trigger}
        b_triggers = {fv.desc.lower() for fv in b.trigger}
        return not a_triggers.isdisjoint(b_triggers)

    # ── Utility helpers ─────────────────────────────────────────────

    @staticmethod
    def _make_edge(
        source: str,
        target: str,
        edge_type: str,
        weight: float = 1.0,
        evidence: Optional[List[str]] = None,
    ) -> GraphEdge:
        """Create a :class:`GraphEdge` stamped with the current local time."""
        return GraphEdge(
            source=source,
            target=target,
            edge_type=edge_type,
            weight=weight,
            evidence=evidence or [],
            created_at=datetime.now(),
        )

    @staticmethod
    def _edge_key(edge: GraphEdge) -> tuple:
        """Return the ``(source, target, edge_type)`` identity of *edge*."""
        return (edge.source, edge.target, edge.edge_type)

    @staticmethod
    def _edge_set(graph: Optional[GraphData]) -> Set[tuple]:
        """Return the set of edge keys in *graph* (empty for ``None``)."""
        if graph is None:
            return set()
        return {
            (e.source, e.target, e.edge_type)
            for e in graph.edges
        }

    def _resolve_by_name(self, name: str) -> Optional[str]:
        """Look up a Function ID by its exact name via the store.

        Best effort: any store error is logged and reported as "not
        found" (``None``) so that edge building can continue.
        """
        try:
            funcs = self._store.list_functions(limit=self._LIST_LIMIT)
            return next((f.id for f in funcs if f.name == name), None)
        except Exception:
            logger.warning(
                "Could not resolve function name %r via the store",
                name,
                exc_info=True,
            )
            return None

    def _get_all_funcs(self) -> List[Function]:
        """Retrieve all stored Functions (cached until invalidated).

        If the store raises, the error is logged and an empty list is
        cached, so the failure is not retried until
        :meth:`invalidate_cache` is called.
        """
        if self._funcs_cache is None:
            try:
                loaded = self._store.list_functions(limit=self._LIST_LIMIT)
                # Materialise so the cache can be iterated repeatedly.
                self._funcs_cache = list(loaded) if loaded is not None else []
            except Exception:
                logger.warning(
                    "Could not load functions from the store; "
                    "continuing with an empty list",
                    exc_info=True,
                )
                self._funcs_cache = []
        return self._funcs_cache

    def invalidate_cache(self) -> None:
        """Clear the internal function list cache."""
        self._funcs_cache = None
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
"""Confidence calculation based on extraction quality signals."""
|
|
2
|
+
|
|
3
|
+
from typing import List, TYPE_CHECKING
|
|
4
|
+
|
|
5
|
+
if TYPE_CHECKING:
|
|
6
|
+
from memnex.models.paragraph import Paragraph, Sentence
|
|
7
|
+
from memnex.models.memory import Function
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ConfidenceCalculator:
    """Calculates confidence scores based on extraction quality signals.

    A score is a per-source base value plus a sum of small additive
    adjustments, clamped to a fixed range.
    """

    # Base confidence per source kind.  Keys are matched as substrings of
    # the lower-cased source hint, in this order.
    SOURCE_BASE = {
        "text": 0.95,
        "markdown": 0.95,
        "pdf": 0.90,
        "docx": 0.90,
        "image": 0.85,
        "vision": 0.80,
        "url": 0.90,
    }

    # Alias substring -> canonical SOURCE_BASE key.  "file" has no entry
    # in SOURCE_BASE, so "file:" hints resolve to the default base.
    SOURCE_ALIASES = {
        "clipboard": "text",
        "file:": "file",
    }

    # Base confidence used when a hint matches no known source kind.
    _DEFAULT_BASE = 0.9

    # Sentence roles that count as structured Function fields.
    _FIELD_ROLES = ("trigger", "condition", "action", "result")

    def calculate_paragraph_confidence(
        self,
        para: "Paragraph",
        source_hint: str = "text"
    ) -> float:
        """
        Calculate confidence for a paragraph -> Function conversion.

        Args:
            para: The paragraph to evaluate (reads ``sentences``,
                ``section`` and ``raw_text``)
            source_hint: Hint about source type

        Returns:
            Confidence score clamped to the range 0.5 .. 0.99
        """
        base = self._get_base_confidence(source_hint)
        sentences = para.sentences or []

        adjustments = []

        # Sentence count: 2-10 is rewarded most, a single sentence is
        # penalised.
        if sentences:
            sent_count = len(sentences)
            if 2 <= sent_count <= 10:
                adjustments.append(0.02)
            elif sent_count > 10:
                adjustments.append(0.01)
            else:
                adjustments.append(-0.05)

        # A paragraph with a section set gets a bonus.
        if para.section:
            adjustments.append(0.03)

        # Very short raw text is penalised, 50+ characters is rewarded.
        text_len = len(para.raw_text) if para.raw_text else 0
        if text_len < 10:
            adjustments.append(-0.05)
        elif text_len >= 50:
            adjustments.append(0.02)

        # Number of sentences carrying one of the structured roles.
        roles = [s.role for s in sentences]
        field_count = sum(1 for r in roles if r in self._FIELD_ROLES)
        if field_count >= 3:
            adjustments.append(0.05)
        elif field_count == 1:
            adjustments.append(-0.02)

        # Bonuses for role pairs that appear together.
        unique_roles = set(roles)
        if "trigger" in unique_roles and "action" in unique_roles:
            adjustments.append(0.03)
        if "condition" in unique_roles and "action" in unique_roles:
            adjustments.append(0.02)

        confidence = base + sum(adjustments)
        return max(0.5, min(0.99, confidence))

    def calculate_vision_confidence(
        self,
        page_type: str,
        component_count: int
    ) -> float:
        """
        Calculate confidence for Vision-derived functions.

        Args:
            page_type: Type of page; empty, "Unknown" and "Other" are
                penalised
            component_count: Number of UI components detected; zero or a
                negative value is treated as "none detected"

        Returns:
            Confidence score clamped to the range 0.5 .. 0.95
        """
        base = self.SOURCE_BASE["vision"]

        adjustments = []

        if page_type and page_type not in ("Unknown", "Other"):
            adjustments.append(0.05)
        else:
            adjustments.append(-0.05)

        if component_count <= 0:
            # Nothing detected (a negative count is invalid input and is
            # handled the same way instead of being rewarded).
            adjustments.append(-0.10)
        elif component_count <= 10:
            adjustments.append(0.03)
        elif component_count > 20:
            adjustments.append(-0.02)
        # 11-20 components: no adjustment.

        confidence = base + sum(adjustments)
        return max(0.5, min(0.95, confidence))

    def _get_base_confidence(self, source_hint: str) -> float:
        """Get base confidence for a source hint.

        SOURCE_BASE keys are tried first, then SOURCE_ALIASES, both as
        case-insensitive substring matches in definition order.  An
        empty, ``None`` or unrecognised hint yields the default base.
        """
        if not source_hint:
            return self._DEFAULT_BASE
        hint_lower = source_hint.lower()

        for key, val in self.SOURCE_BASE.items():
            if key in hint_lower:
                return val

        for alias, canonical in self.SOURCE_ALIASES.items():
            if alias in hint_lower:
                return self.SOURCE_BASE.get(canonical, self._DEFAULT_BASE)

        return self._DEFAULT_BASE
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Detect and resolve conflicts between extracted data.
|
|
3
|
+
|
|
4
|
+
Design principle (v3.2 §1.6): Field multi-value coexistence (non-authority arbitration).
|
|
5
|
+
When conflicts occur, ALL values are preserved and needs_review is set to True.
|
|
6
|
+
Only when user manually resolves does one value become the "final" value.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import List, Dict, Optional
|
|
10
|
+
from dataclasses import dataclass, field
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class Conflict:
    """One detected conflict together with all of its competing values."""

    # Identifier of the conflict.
    id: str
    # Conflict kind: field_value, missing_field, etc.
    type: str
    # Severity: high, medium, low.
    severity: str
    # Name of the field the conflict is about.
    field: str
    # Competing values: [{"source": ..., "content": ..., "authority": ...}]
    values: List[Dict]

    # Resolution state; a new conflict is unresolved and needs a human.
    resolved: bool = False
    final_value: Optional[str] = None
    needs_human: bool = True
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ConflictResolver:
    """
    Detects and resolves conflicts in extracted data.

    Follows v3.2 §1.6 design: multi-value coexistence (no authority arbitration).
    All conflicting values are preserved, needs_human=True by default.
    """

    def detect_conflicts(self, functions: List) -> List[Conflict]:
        """Detect conflicts between functions that share a normalized name.

        Functions are grouped by ``name_normalized`` and every pair
        within a group is compared.  Conflict IDs are numbered from 1 in
        detection order.  ``None`` or an empty list yields no conflicts.
        """
        grouped = {}
        for func in functions or []:
            grouped.setdefault(func.name_normalized, []).append(func)

        conflicts = []
        for group in grouped.values():
            # Compare each unordered pair exactly once.
            for index, first in enumerate(group):
                for second in group[index + 1:]:
                    conflict = self._compare_functions(
                        first, second, len(conflicts) + 1
                    )
                    if conflict:
                        conflicts.append(conflict)

        return conflicts

    def _compare_functions(self, func1, func2, conflict_id: int) -> Optional[Conflict]:
        """Compare the conditions of two functions.

        Returns a medium-severity ``field_value`` conflict when both
        functions have conditions and their description lists differ
        (the comparison is order-sensitive); otherwise ``None``.
        """
        # Compare conditions (adapted for List[FieldValue])
        cond1_descs = [fv.desc for fv in func1.condition] if func1.condition else []
        cond2_descs = [fv.desc for fv in func2.condition] if func2.condition else []

        # A side without conditions cannot conflict; equal lists agree.
        if not cond1_descs or not cond2_descs or cond1_descs == cond2_descs:
            return None

        def describe(func, descs):
            # One entry of Conflict.values for a single function.
            return {
                "source": func.source_paragraphs[0] if func.source_paragraphs else "unknown",
                "content": ", ".join(descs),
                "authority": func.source_authority or "unknown",
            }

        return Conflict(
            id=f"conflict_{conflict_id:03d}",
            type="field_value",
            severity="medium",
            field="condition",
            values=[describe(func1, cond1_descs), describe(func2, cond2_descs)],
            needs_human=True,
        )

    def get_all_values(self, conflict: Conflict) -> List[str]:
        """Get the ``content`` of every conflicting value."""
        if not conflict.values:
            return []
        return [v["content"] for v in conflict.values]

    def mark_for_human_review(self, conflict: Conflict, suggestion: Optional[str] = None):
        """Mark conflict for human review.

        *suggestion* is accepted for API compatibility but is currently
        not stored or used.
        """
        conflict.needs_human = True
        conflict.resolved = False

    def apply_resolution(self, conflict: Conflict, value: str):
        """Apply human resolution.

        Raises:
            ValueError: if *value* is not one of the conflict's values.
        """
        if value not in self.get_all_values(conflict):
            raise ValueError(f"Resolution value '{value}' not in conflict values")
        conflict.final_value = value
        conflict.resolved = True
        conflict.needs_human = False

    def resolve_conflicts(self, conflicts: List[Conflict]) -> tuple:
        """Split conflicts into unresolved and resolved ones.

        A conflict counts as resolved when it is flagged ``resolved``
        and has a ``final_value`` set (an empty string is a valid final
        value).  Every other conflict is marked ``needs_human=True``.

        Returns:
            ``(unresolved, resolved)`` lists, each in input order.
        """
        unresolved = []
        resolved = []

        for conflict in conflicts:
            if conflict.resolved and conflict.final_value is not None:
                resolved.append(conflict)
            else:
                conflict.needs_human = True
                unresolved.append(conflict)

        return unresolved, resolved
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Search and ranking: embedding, reranking, deduplication."""
|