alita-sdk 0.3.462__py3-none-any.whl → 0.3.627__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +15 -3
- alita_sdk/cli/agent_loader.py +56 -8
- alita_sdk/cli/agent_ui.py +93 -31
- alita_sdk/cli/agents.py +2274 -230
- alita_sdk/cli/callbacks.py +96 -25
- alita_sdk/cli/cli.py +10 -1
- alita_sdk/cli/config.py +162 -9
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1073 -0
- alita_sdk/cli/testcases/__init__.py +94 -0
- alita_sdk/cli/testcases/data_generation.py +119 -0
- alita_sdk/cli/testcases/discovery.py +96 -0
- alita_sdk/cli/testcases/executor.py +84 -0
- alita_sdk/cli/testcases/logger.py +85 -0
- alita_sdk/cli/testcases/parser.py +172 -0
- alita_sdk/cli/testcases/prompts.py +91 -0
- alita_sdk/cli/testcases/reporting.py +125 -0
- alita_sdk/cli/testcases/setup.py +108 -0
- alita_sdk/cli/testcases/test_runner.py +282 -0
- alita_sdk/cli/testcases/utils.py +39 -0
- alita_sdk/cli/testcases/validation.py +90 -0
- alita_sdk/cli/testcases/workflow.py +196 -0
- alita_sdk/cli/toolkit.py +14 -17
- alita_sdk/cli/toolkit_loader.py +35 -5
- alita_sdk/cli/tools/__init__.py +36 -2
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +910 -64
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +72 -12
- alita_sdk/community/inventory/__init__.py +236 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +173 -0
- alita_sdk/community/inventory/toolkit_utils.py +176 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/__init__.py +1 -1
- alita_sdk/configurations/ado.py +141 -20
- alita_sdk/configurations/bitbucket.py +0 -3
- alita_sdk/configurations/confluence.py +76 -42
- alita_sdk/configurations/figma.py +76 -0
- alita_sdk/configurations/gitlab.py +17 -5
- alita_sdk/configurations/openapi.py +329 -0
- alita_sdk/configurations/qtest.py +72 -1
- alita_sdk/configurations/report_portal.py +96 -0
- alita_sdk/configurations/sharepoint.py +148 -0
- alita_sdk/configurations/testio.py +83 -0
- alita_sdk/runtime/clients/artifact.py +3 -3
- alita_sdk/runtime/clients/client.py +353 -48
- alita_sdk/runtime/clients/sandbox_client.py +0 -21
- alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
- alita_sdk/runtime/langchain/assistant.py +123 -26
- alita_sdk/runtime/langchain/constants.py +642 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +6 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +226 -7
- alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +5 -2
- alita_sdk/runtime/langchain/document_loaders/constants.py +12 -7
- alita_sdk/runtime/langchain/langraph_agent.py +279 -73
- alita_sdk/runtime/langchain/utils.py +82 -15
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/skills/__init__.py +91 -0
- alita_sdk/runtime/skills/callbacks.py +498 -0
- alita_sdk/runtime/skills/discovery.py +540 -0
- alita_sdk/runtime/skills/executor.py +610 -0
- alita_sdk/runtime/skills/input_builder.py +371 -0
- alita_sdk/runtime/skills/models.py +330 -0
- alita_sdk/runtime/skills/registry.py +355 -0
- alita_sdk/runtime/skills/skill_runner.py +330 -0
- alita_sdk/runtime/toolkits/__init__.py +7 -0
- alita_sdk/runtime/toolkits/application.py +21 -9
- alita_sdk/runtime/toolkits/artifact.py +15 -5
- alita_sdk/runtime/toolkits/datasource.py +13 -6
- alita_sdk/runtime/toolkits/mcp.py +139 -251
- alita_sdk/runtime/toolkits/mcp_config.py +1048 -0
- alita_sdk/runtime/toolkits/planning.py +178 -0
- alita_sdk/runtime/toolkits/skill_router.py +238 -0
- alita_sdk/runtime/toolkits/subgraph.py +251 -6
- alita_sdk/runtime/toolkits/tools.py +238 -32
- alita_sdk/runtime/toolkits/vectorstore.py +11 -5
- alita_sdk/runtime/tools/__init__.py +3 -1
- alita_sdk/runtime/tools/application.py +20 -6
- alita_sdk/runtime/tools/artifact.py +511 -28
- alita_sdk/runtime/tools/data_analysis.py +183 -0
- alita_sdk/runtime/tools/function.py +43 -15
- alita_sdk/runtime/tools/image_generation.py +50 -44
- alita_sdk/runtime/tools/llm.py +852 -67
- alita_sdk/runtime/tools/loop.py +3 -1
- alita_sdk/runtime/tools/loop_output.py +3 -1
- alita_sdk/runtime/tools/mcp_remote_tool.py +25 -10
- alita_sdk/runtime/tools/mcp_server_tool.py +7 -6
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/router.py +2 -4
- alita_sdk/runtime/tools/sandbox.py +9 -6
- alita_sdk/runtime/tools/skill_router.py +776 -0
- alita_sdk/runtime/tools/tool.py +3 -1
- alita_sdk/runtime/tools/vectorstore.py +7 -2
- alita_sdk/runtime/tools/vectorstore_base.py +51 -11
- alita_sdk/runtime/utils/AlitaCallback.py +137 -21
- alita_sdk/runtime/utils/constants.py +5 -1
- alita_sdk/runtime/utils/mcp_client.py +492 -0
- alita_sdk/runtime/utils/mcp_oauth.py +202 -5
- alita_sdk/runtime/utils/mcp_sse_client.py +36 -7
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/serialization.py +155 -0
- alita_sdk/runtime/utils/streamlit.py +6 -10
- alita_sdk/runtime/utils/toolkit_utils.py +16 -5
- alita_sdk/runtime/utils/utils.py +36 -0
- alita_sdk/tools/__init__.py +113 -29
- alita_sdk/tools/ado/repos/__init__.py +51 -33
- alita_sdk/tools/ado/repos/repos_wrapper.py +148 -89
- alita_sdk/tools/ado/test_plan/__init__.py +25 -9
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +23 -1
- alita_sdk/tools/ado/utils.py +1 -18
- alita_sdk/tools/ado/wiki/__init__.py +25 -8
- alita_sdk/tools/ado/wiki/ado_wrapper.py +291 -22
- alita_sdk/tools/ado/work_item/__init__.py +26 -9
- alita_sdk/tools/ado/work_item/ado_wrapper.py +56 -3
- alita_sdk/tools/advanced_jira_mining/__init__.py +11 -8
- alita_sdk/tools/aws/delta_lake/__init__.py +13 -9
- alita_sdk/tools/aws/delta_lake/tool.py +5 -1
- alita_sdk/tools/azure_ai/search/__init__.py +11 -8
- alita_sdk/tools/azure_ai/search/api_wrapper.py +1 -1
- alita_sdk/tools/base/tool.py +5 -1
- alita_sdk/tools/base_indexer_toolkit.py +170 -45
- alita_sdk/tools/bitbucket/__init__.py +17 -12
- alita_sdk/tools/bitbucket/api_wrapper.py +59 -11
- alita_sdk/tools/bitbucket/cloud_api_wrapper.py +49 -35
- alita_sdk/tools/browser/__init__.py +5 -4
- alita_sdk/tools/carrier/__init__.py +5 -6
- alita_sdk/tools/carrier/backend_reports_tool.py +6 -6
- alita_sdk/tools/carrier/run_ui_test_tool.py +6 -6
- alita_sdk/tools/carrier/ui_reports_tool.py +5 -5
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/code/treesitter/treesitter.py +37 -13
- alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/cloud/aws/__init__.py +10 -7
- alita_sdk/tools/cloud/azure/__init__.py +10 -7
- alita_sdk/tools/cloud/gcp/__init__.py +10 -7
- alita_sdk/tools/cloud/k8s/__init__.py +10 -7
- alita_sdk/tools/code/linter/__init__.py +10 -8
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code/sonar/__init__.py +10 -7
- alita_sdk/tools/code_indexer_toolkit.py +73 -23
- alita_sdk/tools/confluence/__init__.py +21 -15
- alita_sdk/tools/confluence/api_wrapper.py +78 -23
- alita_sdk/tools/confluence/loader.py +4 -2
- alita_sdk/tools/custom_open_api/__init__.py +12 -5
- alita_sdk/tools/elastic/__init__.py +11 -8
- alita_sdk/tools/elitea_base.py +493 -30
- alita_sdk/tools/figma/__init__.py +58 -11
- alita_sdk/tools/figma/api_wrapper.py +1235 -143
- alita_sdk/tools/figma/figma_client.py +73 -0
- alita_sdk/tools/figma/toon_tools.py +2748 -0
- alita_sdk/tools/github/__init__.py +13 -14
- alita_sdk/tools/github/github_client.py +224 -100
- alita_sdk/tools/github/graphql_client_wrapper.py +119 -33
- alita_sdk/tools/github/schemas.py +14 -5
- alita_sdk/tools/github/tool.py +5 -1
- alita_sdk/tools/github/tool_prompts.py +9 -22
- alita_sdk/tools/gitlab/__init__.py +15 -11
- alita_sdk/tools/gitlab/api_wrapper.py +207 -41
- alita_sdk/tools/gitlab_org/__init__.py +10 -8
- alita_sdk/tools/gitlab_org/api_wrapper.py +63 -64
- alita_sdk/tools/google/bigquery/__init__.py +13 -12
- alita_sdk/tools/google/bigquery/tool.py +5 -1
- alita_sdk/tools/google_places/__init__.py +10 -8
- alita_sdk/tools/google_places/api_wrapper.py +1 -1
- alita_sdk/tools/jira/__init__.py +17 -11
- alita_sdk/tools/jira/api_wrapper.py +91 -40
- alita_sdk/tools/keycloak/__init__.py +11 -8
- alita_sdk/tools/localgit/__init__.py +9 -3
- alita_sdk/tools/localgit/local_git.py +62 -54
- alita_sdk/tools/localgit/tool.py +5 -1
- alita_sdk/tools/memory/__init__.py +11 -3
- alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
- alita_sdk/tools/ocr/__init__.py +11 -8
- alita_sdk/tools/openapi/__init__.py +490 -114
- alita_sdk/tools/openapi/api_wrapper.py +1368 -0
- alita_sdk/tools/openapi/tool.py +20 -0
- alita_sdk/tools/pandas/__init__.py +20 -12
- alita_sdk/tools/pandas/api_wrapper.py +38 -25
- alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
- alita_sdk/tools/postman/__init__.py +11 -11
- alita_sdk/tools/pptx/__init__.py +10 -9
- alita_sdk/tools/pptx/pptx_wrapper.py +1 -1
- alita_sdk/tools/qtest/__init__.py +30 -10
- alita_sdk/tools/qtest/api_wrapper.py +430 -13
- alita_sdk/tools/rally/__init__.py +10 -8
- alita_sdk/tools/rally/api_wrapper.py +1 -1
- alita_sdk/tools/report_portal/__init__.py +12 -9
- alita_sdk/tools/salesforce/__init__.py +10 -9
- alita_sdk/tools/servicenow/__init__.py +17 -14
- alita_sdk/tools/servicenow/api_wrapper.py +1 -1
- alita_sdk/tools/sharepoint/__init__.py +10 -8
- alita_sdk/tools/sharepoint/api_wrapper.py +4 -4
- alita_sdk/tools/slack/__init__.py +10 -8
- alita_sdk/tools/slack/api_wrapper.py +2 -2
- alita_sdk/tools/sql/__init__.py +11 -9
- alita_sdk/tools/testio/__init__.py +10 -8
- alita_sdk/tools/testrail/__init__.py +11 -8
- alita_sdk/tools/testrail/api_wrapper.py +1 -1
- alita_sdk/tools/utils/__init__.py +9 -4
- alita_sdk/tools/utils/content_parser.py +77 -3
- alita_sdk/tools/utils/text_operations.py +410 -0
- alita_sdk/tools/utils/tool_prompts.py +79 -0
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +17 -13
- alita_sdk/tools/xray/__init__.py +12 -9
- alita_sdk/tools/yagmail/__init__.py +9 -3
- alita_sdk/tools/zephyr/__init__.py +9 -7
- alita_sdk/tools/zephyr_enterprise/__init__.py +11 -8
- alita_sdk/tools/zephyr_essential/__init__.py +10 -8
- alita_sdk/tools/zephyr_essential/api_wrapper.py +30 -13
- alita_sdk/tools/zephyr_essential/client.py +2 -2
- alita_sdk/tools/zephyr_scale/__init__.py +11 -9
- alita_sdk/tools/zephyr_scale/api_wrapper.py +2 -2
- alita_sdk/tools/zephyr_squad/__init__.py +10 -8
- {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/METADATA +147 -7
- alita_sdk-0.3.627.dist-info/RECORD +468 -0
- alita_sdk-0.3.627.dist-info/entry_points.txt +2 -0
- alita_sdk-0.3.462.dist-info/RECORD +0 -384
- alita_sdk-0.3.462.dist-info/entry_points.txt +0 -2
- {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/top_level.txt +0 -0
alita_sdk/community/inventory/enrichment.py
@@ -0,0 +1,2137 @@
+"""
+Knowledge Graph Enrichment Utilities.
+
+Post-processing tools to improve graph connectivity by:
+1. Soft entity deduplication (merging same/similar entities with different types)
+2. Linking semantically similar entities across sources
+3. Creating cross-reference relationships (implements, documents, etc.)
+4. Connecting orphan nodes to parent concepts
+
+Usage:
+    from alita_sdk.community.inventory.enrichment import GraphEnricher
+
+    enricher = GraphEnricher(graph_path="./graph.json")
+    enricher.enrich()
+    enricher.save()
+"""
+
+import json
+import logging
+import re
+import hashlib
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Set, Tuple, Optional, Any
+from difflib import SequenceMatcher
+
+logger = logging.getLogger(__name__)
+
+
+# ============================================================================
+# TYPE NORMALIZATION FOR ENRICHMENT
+# ============================================================================
+
+# Comprehensive type consolidation map
+# Maps many ad-hoc LLM types to a smaller set of canonical types
+# NOTE: All keys should be lowercase - normalize_type() lowercases input first
+TYPE_NORMALIZATION_MAP = {
+    # ==========================================================================
+    # IDENTITY MAPPINGS - Types that MUST be preserved as-is
+    # ==========================================================================
+    "fact": "fact",
+    "source_file": "source_file",
+    "feature": "feature",
+    "module": "module",
+    "constant": "constant",
+    "rule": "rule",
+    "parameter": "parameter",
+    "error_handling": "error_handling",
+    "todo": "todo",
+    "property": "property",
+    "configuration": "configuration",
+    "process": "process",
+    "integration": "integration",
+    "interface": "interface",
+    "user_story": "user_story",
+    "test": "test",
+    "variable": "variable",
+    "function": "function",
+
+    # ==========================================================================
+    # CODE STRUCTURE FAMILY → map to preserved types
+    # ==========================================================================
+    "named": "export",
+    "default": "export",
+    "business_rule": "rule",
+    "domain_concept": "concept",
+    "business_concept": "concept",
+    "integration_point": "integration",
+    "user_interface_element": "interface",
+    "user_interface_component": "interface",
+    "user_interaction": "interface",
+    "user_action": "interface",
+    "api_contract": "rest_api",
+    "technical_debt": "todo",
+    "test_scenario": "test",
+    "test_case": "test",
+    "tooltype": "tool",
+
+    # ==========================================================================
+    # TOOL & TOOLKIT FAMILY → tool, toolkit
+    # ==========================================================================
+    "tool": "tool",
+    "tools": "tool",
+    "tool_used": "tool",
+    "tool_example": "tool",
+    "tool_category": "tool",
+    "internal_tool": "tool",
+    "documentationtool": "tool",
+    "toolkit": "toolkit",
+    "toolkits": "toolkit",
+    "toolkit_type": "toolkit",
+
+    # ==========================================================================
+    # FEATURE & CAPABILITY FAMILY → feature
+    # ==========================================================================
+    "features": "feature",
+    "functionality": "feature",
+    "capability": "feature",
+    "benefit": "feature",
+    "characteristic": "feature",
+
+    # ==========================================================================
+    # PROCESS & WORKFLOW FAMILY → process
+    # ==========================================================================
+    "processes": "process",
+    "procedure": "process",
+    "workflow": "workflow",
+    "flow": "process",
+    "pipeline": "process",
+
+    # ==========================================================================
+    # CONCEPT & ENTITY FAMILY → concept
+    # ==========================================================================
+    "concept": "concept",
+    "concepts": "concept",
+    "entity": "entity",
+    "entities": "entity",
+    "entity_type": "entity",
+    "entitytype": "entity",
+    "domain_entity": "entity",
+    "domain": "concept",
+    "topic": "concept",
+    "term": "concept",
+    "glossary_term": "concept",
+    "key_concept": "concept",
+
+    # ==========================================================================
+    # CONFIGURATION FAMILY → configuration
+    # ==========================================================================
+    "config": "configuration",
+    "configuration_section": "configuration",
+    "configuration_field": "configuration",
+    "configuration_option": "configuration",
+    "configuration_file": "configuration",
+    "configurationfile": "configuration",
+    "configurationchange": "configuration",
+    "configuration_command": "configuration",
+    "setting": "configuration",
+    "environment": "configuration",
+
+    # ==========================================================================
+    # DOCUMENTATION & GUIDE FAMILY → documentation
+    # ==========================================================================
+    "documentation": "documentation",
+    "documentation_section": "documentation",
+    "documentation_template": "documentation",
+    "guide": "documentation",
+    "guideline": "documentation",
+    "instruction": "documentation",
+    "tip": "documentation",
+    "note": "documentation",
+    "faq": "documentation",
+    "overview": "documentation",
+    "summary": "documentation",
+    "best_practice": "documentation",
+
+    # ==========================================================================
+    # SECTION & STRUCTURE FAMILY → section
+    # ==========================================================================
+    "section": "section",
+    "sections": "section",
+    "interface_section": "section",
+    "navigation_structure": "section",
+    "navigation_group": "section",
+    "navigation": "section",
+
+    # ==========================================================================
+    # COMPONENT & UI FAMILY → component
+    # ==========================================================================
+    "component": "component",
+    "components": "component",
+    "ui_component": "component",
+    "ui_element": "component",
+    "ui_layout": "component",
+    "interface_element": "component",
+    "button": "component",
+    "menu": "component",
+    "tab": "component",
+    "panel": "component",
+    "editor": "component",
+    "view": "component",
+
+    # ==========================================================================
+    # ISSUE & PROBLEM FAMILY → issue
+    # ==========================================================================
+    "issue": "issue",
+    "issues": "issue",
+    "issue_type": "issue",
+    "issuetype": "issue",
+    "known_issue": "issue",
+    "fixed_issue": "issue",
+    "limitation": "issue",
+    "challenge": "issue",
+    "problem": "issue",
+    "error_message": "issue",
+    "troubleshooting": "issue",
+    "compatibilityissue": "issue",
+
+    # ==========================================================================
+    # ACTION & COMMAND FAMILY → action
+    # ==========================================================================
+    "action": "action",
+    "actions": "action",
+    "command": "action",
+    "operation": "action",
+    "task": "action",
+    "trigger": "action",
+    "automation_rule": "action",
+
+    # ==========================================================================
+    # PARAMETER & FIELD FAMILY → parameter
+    # ==========================================================================
+    "parameters": "parameter",
+    "field": "parameter",
+    "field_identifier": "parameter",
+    "placeholder": "parameter",
+    "value": "parameter",
+    "label": "parameter",
+    "tag": "parameter",
+
+    # ==========================================================================
+    # CREDENTIAL & AUTH FAMILY → credential
+    # ==========================================================================
+    "credential": "credential",
+    "credential_type": "credential",
+    "secret": "credential",
+    "token": "credential",
+    "api_key": "credential",
+    "api_token": "credential",
+    "key": "credential",
+    "authentication": "credential",
+    "authentication_method": "credential",
+    "permission": "credential",
+    "access_control": "credential",
+    "access_requirement": "credential",
+
+    # ==========================================================================
+    # RESOURCE & FILE FAMILY → resource
+    # ==========================================================================
+    "resource": "resource",
+    "resources": "resource",
+    "file": "resource",
+    "file_type": "resource",
+    "file_format": "resource",
+    "file_path": "resource",
+    "folder": "resource",
+    "artifact": "resource",
+    "artifact_type": "resource",
+    "document": "resource",
+    "template": "resource",
+    "script": "resource",
+
+    # ==========================================================================
+    # PLATFORM & SOFTWARE FAMILY → platform
+    # ==========================================================================
+    "platform": "platform",
+    "platforms": "platform",
+    "software": "platform",
+    "softwareversion": "platform",
+    "application": "platform",
+    "app": "platform",
+    "system": "platform",
+    "framework": "platform",
+    "library": "platform",
+    "technology": "platform",
+    "product": "platform",
+
+    # ==========================================================================
+    # SERVICE & API FAMILY → Keep distinct types for different communication patterns
+    # ==========================================================================
+    "service": "service",
+    "services": "service",
+    "microservice": "service",
+    "web_service": "service",
+    "server": "service",
+    "client": "service",
+    "hostingservice": "service",
+
+    # REST API (do NOT normalize to generic 'service')
+    "rest api": "rest_api",
+    "rest_api": "rest_api",
+    "restapi": "rest_api",
+    "rest": "rest_api",
+    "api": "rest_api",
+    "openapi": "rest_api",
+    "swagger": "rest_api",
+    "rest endpoint": "rest_endpoint",
+    "rest_endpoint": "rest_endpoint",
+    "endpoint": "rest_endpoint",
+    "api_endpoint": "rest_endpoint",
+    "http_endpoint": "rest_endpoint",
+    "rest_resource": "rest_resource",
+
+    # GraphQL (do NOT normalize to 'service')
+    "graphql api": "graphql_api",
+    "graphql_api": "graphql_api",
+    "graphql": "graphql_api",
+    "graphql_schema": "graphql_api",
+    "graphql query": "graphql_query",
+    "graphql_query": "graphql_query",
+    "query": "graphql_query",
+    "graphql mutation": "graphql_mutation",
+    "graphql_mutation": "graphql_mutation",
+    "mutation": "graphql_mutation",
+    "graphql subscription": "graphql_subscription",
+    "graphql_subscription": "graphql_subscription",
+    "subscription": "graphql_subscription",
+    "graphql type": "graphql_type",
+    "graphql_type": "graphql_type",
+
+    # gRPC (do NOT normalize to 'service')
+    "grpc service": "grpc_service",
+    "grpc_service": "grpc_service",
+    "grpc": "grpc_service",
+    "grpc method": "grpc_method",
+    "grpc_method": "grpc_method",
+    "rpc_method": "grpc_method",
+    "protobuf_message": "protobuf_message",
+    "protobuf": "protobuf_message",
+    "proto_message": "protobuf_message",
+    "protocol buffer": "protobuf_message",
+
+    # Event-Driven Architecture (do NOT normalize to 'service')
+    "event bus": "event_bus",
+    "event_bus": "event_bus",
+    "message_broker": "event_bus",
+    "message_queue": "event_bus",
+    "kafka": "event_bus",
+    "rabbitmq": "event_bus",
+    "event type": "event_type",
+    "event_type": "event_type",
+    "event": "event_type",
+    "message_type": "event_type",
+    "event producer": "event_producer",
+    "event_producer": "event_producer",
+    "publisher": "event_producer",
+    "event consumer": "event_consumer",
+    "event_consumer": "event_consumer",
+    "subscriber": "event_consumer",
+    "listener": "event_consumer",
+    "event handler": "event_handler",
+    "event_handler": "event_handler",
+    "message_handler": "event_handler",
+    "handler": "event_handler",
+
+    # ==========================================================================
+    # INTEGRATION & CONNECTION FAMILY → integration
+    # ==========================================================================
+    "integrations": "integration",
+    "connection": "integration",
+    "connection_type": "integration",
+    "connector": "integration",
+    "adapter": "integration",
+    "datasource": "integration",
+    "database": "integration",
+
+    # ==========================================================================
+    # EXAMPLE & USE CASE FAMILY → example
+    # ==========================================================================
+    "example": "example",
+    "examples": "example",
+    "example_type": "example",
+    "example_request": "example",
+    "use_case": "example",
+    "use_case_category": "example",
+    "code_sample": "example",
+    "sample_prompt": "example",
+
+    # ==========================================================================
+    # NODE & GRAPH FAMILY → node
+    # ==========================================================================
+    "node": "node",
+    "nodetype": "node",
+    "node_type": "node",
+    "execution_node": "node",
+    "iteration_node": "node",
+    "interaction_node": "node",
+    "utilitynode": "node",
+
+    # ==========================================================================
+    # STEP & PROCEDURE FAMILY → step
+    # ==========================================================================
+    "step": "step",
+    "steps": "step",
+    "number_of_step": "step",
+    "prerequisite": "step",
+
+    # ==========================================================================
+    # STATUS & STATE FAMILY → status
+    # ==========================================================================
+    "status": "status",
+    "state": "status",
+    "state_type": "status",
+    "mode": "status",
+    "session_mode": "status",
+
+    # ==========================================================================
+    # PROJECT & WORKSPACE FAMILY → project
+    # ==========================================================================
+    "project": "project",
+    "workspace": "project",
+    "project_scope": "project",
+    "repository": "project",
+    "space": "project",
+
+    # ==========================================================================
+    # ROLE & USER FAMILY → role
+    # ==========================================================================
+    "role": "role",
+    "user_role": "role",
+    "team": "role",
+    "person": "role",
+    "audience": "role",
+    "stakeholder": "role",
+    "owner": "role",
+
+    # ==========================================================================
+    # AGENT FAMILY → agent
+    # ==========================================================================
+    "agent": "agent",
+    "agents": "agent",
+    "agent_type": "agent",
+    "agent_configuration": "agent",
+    "ai_agent": "agent",
+    "public_agent": "agent",
+
+    # ==========================================================================
+    # DATA & TYPE FAMILY → data_type
+    # ==========================================================================
+    "data_type": "data_type",
+    "datatype": "data_type",
+    "data_structure": "data_type",
+    "schema": "data_type",
+    "format": "data_type",
+    "content_type": "data_type",
+    "collection": "data_type",
+    "collectiontype": "data_type",
+    "list": "data_type",
+    "table": "data_type",
+
+    # ==========================================================================
+    # RELEASE & VERSION FAMILY → release
+    # ==========================================================================
+    "release": "release",
+    "version": "release",
+    "change": "release",
+    "feature_change": "release",
+    "migration": "release",
+    "deployment": "release",
+    "fix": "release",
+
+    # ==========================================================================
+    # REFERENCE & LINK FAMILY → reference
+    # ==========================================================================
+    "reference": "reference",
+    "related_page": "reference",
+    "url": "reference",
+    "webpage": "reference",
+    "website": "reference",
+    "page": "reference",
+    "link": "reference",
+
+    # ==========================================================================
+    # RULE & POLICY FAMILY → rule
+    # ==========================================================================
+    "rules": "rule",
+    "policy": "rule",
+    "formatting_rule": "rule",
+    "directive": "rule",
+    "requirement": "rule",
+    "specification": "rule",
+
+    # ==========================================================================
+    # MCP FAMILY → mcp_server
+    # ==========================================================================
+    "mcp server": "mcp_server",
+    "mcp_server": "mcp_server",
+    "mcp tool": "mcp_tool",
+    "mcp_tool": "mcp_tool",
+    "mcp resource": "mcp_resource",
+    "mcp_resource": "mcp_resource",
+    "mcp_type": "mcp_server",
+    "transport": "mcp_server",
+
+    # ==========================================================================
+    # MISCELLANEOUS → map to closest canonical type
+    # ==========================================================================
+    "method": "method",
+    "model": "concept",
+    "category": "concept",
+    "metric": "parameter",
+    "identifier": "parameter",
+    "port": "parameter",
+    "protocol": "service",
+    "security": "credential",
+    "support": "documentation",
+    "community": "documentation",
+    "contact": "reference",
+    "contactmethod": "reference",
+    "contact_information": "reference",
+    "contactinfo": "reference",
+    "building_block": "component",
+    "container": "component",
+    "instance": "entity",
+    "object": "entity",
+    "sourcetype": "data_type",
+    "input_mapping_type": "data_type",
+    "control_flow_feature": "feature",
+    "export_option": "action",
+    "export_format": "data_type",
+    "conversion": "action",
+    "customization": "configuration",
+    "viewing_option": "configuration",
+    "review_outcome": "status",
+    "goal": "feature",
+    "engagement": "action",
+    "output": "data_type",
+    "effect": "action",
+    "solution": "documentation",
+    "cause": "issue",
+    "indicator": "status",
+    "date": "parameter",
+    "screenshot": "resource",
+    "open_question": "issue",
+    "static_site_generator": "platform",
+    "theme": "configuration",
+    "theme_convention": "rule",
+    "file_naming_convention": "rule",
+    "metadata_guideline": "rule",
+    "linking_guideline": "rule",
+    "media_guideline": "rule",
+    "accessibility_guideline": "rule",
+    "page_type": "section",
+    "document_category": "section",
+    "prompt": "example",
+    "chat": "feature",
+    "ide": "platform",
+    "tagging": "action",
+    "account": "credential",
+    "installation_command": "action",
+    "usage": "documentation",
+    "mechanism": "concept",
+    "ai_component": "component",
+    "communication_method": "integration",
+    "dns_record": "configuration",
+    "tone": "rule",
+    "voice": "rule",
+
+    # ==========================================================================
+    # FACT & KNOWLEDGE FAMILY → fact (semantic facts extracted by LLM)
+    # ==========================================================================
+    "facts": "fact",
+    "algorithm": "fact",
+    "behavior": "fact",
+    "validation": "fact",
+    "decision": "fact",
+    "definition": "fact",
+
+    # ==========================================================================
+    # FILE & STRUCTURE FAMILY → file types (container nodes for entities)
+    # ==========================================================================
+    "document_file": "document_file",
+    "config_file": "config_file",
+    "web_file": "web_file",
+    "directory": "directory",
+    "package": "package",
+}
+
+# Types that should NEVER be normalized - they pass through as-is
+PRESERVED_TYPES = {
+    "fact", "source_file", "feature", "module", "constant", "rule",
+    "parameter", "error_handling", "todo", "property", "configuration",
+    "process", "integration", "interface", "user_story", "test",
+    "export", "rest_api", "concept", "component", "workflow",
+    "document_file", "config_file", "web_file", "directory", "package",
+    "variable", "function",  # Code entities - preserve for impact analysis
+}
+
+def normalize_type(entity_type: str) -> str:
+    """
+    Normalize entity type to canonical lowercase form.
+
+    Aggressively consolidates types to a small set of ~25 canonical types:
+    - feature, tool, toolkit, process, concept, entity
+    - section, component, issue, action, parameter, credential
+    - resource, platform, service, integration, example, node
+    - step, status, project, role, agent, data_type, release
+    - reference, rule, documentation, configuration, mcp_server
+
+    Args:
+        entity_type: Raw entity type
+
+    Returns:
+        Canonical lowercase entity type
+    """
+    if not entity_type:
+        return "concept"  # Default to concept for unknown
+
+    # Normalize to lowercase first - all checks are case-insensitive
+    normalized = entity_type.lower().strip().replace(" ", "_").replace("-", "_")
+
+    # First: check if type should be preserved as-is (25+ canonical types)
+    if normalized in PRESERVED_TYPES:
+        return normalized
+
+    # Check explicit mapping (all keys are lowercase now)
+    if normalized in TYPE_NORMALIZATION_MAP:
+        return TYPE_NORMALIZATION_MAP[normalized]
+
+    # Handle plural forms
+    if normalized.endswith('s') and not normalized.endswith('ss') and len(normalized) > 3:
+        singular = normalized[:-1]
+        if singular in PRESERVED_TYPES:
+            return singular
+        if singular in TYPE_NORMALIZATION_MAP:
+            return TYPE_NORMALIZATION_MAP[singular]
+
+    # Fallback heuristics based on common suffixes/patterns
+    if '_type' in normalized or normalized.endswith('type'):
+        return "data_type"
+    if '_section' in normalized or normalized.endswith('section'):
+        return "section"
+    if '_field' in normalized or normalized.endswith('field'):
+        return "parameter"
+    if '_node' in normalized or normalized.endswith('node'):
+        return "node"
+    if '_issue' in normalized or normalized.endswith('issue'):
+        return "issue"
+    if '_guide' in normalized or normalized.endswith('guide'):
+        return "documentation"
+    if '_config' in normalized or normalized.endswith('config'):
+        return "configuration"
+    if '_tool' in normalized or normalized.endswith('tool'):
+        return "tool"
+    if '_service' in normalized or normalized.endswith('service'):
+        return "service"
+
+    # If still unknown, map to concept (generic catch-all)
+    return "concept"
+
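As a quick sanity check, the behavior of `normalize_type` can be read straight off the tables and fallback rules above. A minimal sketch (the import path follows the module's own docstring; this is illustrative, not part of the diff):

```python
from alita_sdk.community.inventory.enrichment import normalize_type

# Explicit map entry after lowercasing: "tools" -> "tool"
assert normalize_type("Tools") == "tool"
# Spaces become underscores, then the map applies: "api_endpoint" -> "rest_endpoint"
assert normalize_type("API Endpoint") == "rest_endpoint"
# Plural handling: "known_issues" -> singular "known_issue", which maps to "issue"
assert normalize_type("known_issues") == "issue"
# Suffix heuristic: anything ending in "config" collapses to "configuration"
assert normalize_type("payment_config") == "configuration"
# Unknown or empty types fall through to the generic "concept"
assert normalize_type("widget") == "concept"
assert normalize_type("") == "concept"
```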
+# Relationship types for cross-source linking
+CROSS_SOURCE_RELATIONS = {
+    # (source_type, target_type): relation_type
+    ("class", "concept"): "implements",
+    ("module", "concept"): "implements",
+    ("function", "concept"): "implements",
+    ("method", "concept"): "implements",
+    ("class", "entity"): "implements",
+    ("module", "feature"): "implements",
+    ("command", "feature"): "provides",
+    ("toolkit", "toolkit_type"): "is_type_of",
+    ("source_toolkit", "toolkit_type"): "is_type_of",
+    ("SourceToolkit", "toolkit_type"): "is_type_of",
+    ("import", "module"): "imports",
+    ("import", "class"): "imports",
+}
+
+# Types that represent code vs documentation
+CODE_TYPES = {
+    "class", "module", "function", "method", "variable", "constant",
+    "import", "attribute", "property", "command", "command_group",
+    "SourceToolkit", "source_toolkit", "toolkit"
+}
+
+DOC_TYPES = {
+    "concept", "entity", "feature", "Feature", "guide", "section",
+    "step", "process", "guideline", "tutorial", "example", "overview",
+    "toolkit_type", "platform", "software", "integration"
+}
+
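A small illustration of how these three tables cooperate when the enricher links code to documentation. The linking pass itself sits past the end of this excerpt, so treat this as a sketch of the lookup, not the shipped routine:

```python
# A code-layer "class" node and a doc-layer "concept" node with related names
# would be connected with the relation looked up from CROSS_SOURCE_RELATIONS.
src_type, dst_type = "class", "concept"

assert src_type in CODE_TYPES
assert dst_type in DOC_TYPES
assert CROSS_SOURCE_RELATIONS[(src_type, dst_type)] == "implements"
```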
+# Type priority for deduplication - higher priority types are preferred
+# When merging entities with different types, the higher priority type wins
+TYPE_PRIORITY = {
+    # Code layer - highest priority (most specific)
+    "class": 100,
+    "function": 99,
+    "method": 98,
+    "module": 97,
+    "interface": 96,
+    "constant": 95,
+    "variable": 94,
+    "configuration": 93,
+
+    # Service layer - specific communication patterns have higher priority than generic
+    "service": 90,
+
+    # REST API types
+    "rest_api": 89,
+    "rest_endpoint": 88,
+    "rest_resource": 87,
+
+    # GraphQL types
+    "graphql_api": 89,
+    "graphql_mutation": 88,
+    "graphql_query": 87,
+    "graphql_subscription": 86,
+    "graphql_type": 85,
+
+    # gRPC types
+    "grpc_service": 89,
+    "grpc_method": 88,
+    "protobuf_message": 87,
+
+    # Event-driven types
+    "event_bus": 89,
+    "event_type": 88,
+    "event_producer": 87,
+    "event_consumer": 87,
+    "event_handler": 86,
+
+    # Generic fallbacks (lower priority)
+    "integration": 84,
+    "payload": 83,
+
+    # Data layer
+    "database": 85,
+    "table": 84,
+    "column": 83,
+    "constraint": 82,
+    "index": 81,
+    "migration": 80,
+    "enum": 79,
+
+    # Product layer
+    "feature": 75,
+    "epic": 74,
+    "user_story": 73,
+    "screen": 72,
+    "ux_flow": 71,
+    "ui_component": 70,
+    "ui_field": 69,
+
+    # Domain layer
+    "domain_entity": 65,
+    "attribute": 64,
+    "business_rule": 63,
+    "business_event": 62,
+    "glossary_term": 61,
+    "workflow": 60,
+
+    # Testing layer
+    "test_suite": 55,
+    "test_case": 54,
+    "test_step": 53,
+    "assertion": 52,
+    "test_data": 51,
+    "defect": 50,
+    "incident": 49,
+
+    # Delivery layer
+    "release": 45,
+    "sprint": 44,
+    "commit": 43,
+    "pull_request": 42,
+    "ticket": 41,
+    "deployment": 40,
+
+    # Organization layer
+    "team": 35,
+    "owner": 34,
+    "stakeholder": 33,
+    "repository": 32,
+    "documentation": 31,
+
+    # Toolkits (specific types)
+    "toolkit": 28,
+    "source_toolkit": 27,
+    "SourceToolkit": 26,
+    "command": 25,
+    "command_group": 24,
+
+    # Generic types - lowest priority
+    "concept": 15,
+    "entity": 14,
+    "component": 13,
+    "object": 12,
+    "item": 11,
+    "element": 10,
+    "thing": 5,
+    "unknown": 1,
+}
+
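The `deduplicate_entities` docstring further down says merges "select the best type based on TYPE_PRIORITY". A minimal sketch of that selection rule, mirroring `_get_type_priority` defined later in the class; the shipped merge routine itself lies beyond this excerpt:

```python
from typing import Dict, List

def pick_winning_type(merge_group: List[Dict]) -> str:
    """Highest-priority type survives a merge; unknown types score 0 (sketch)."""
    return max(
        (node.get("type", "unknown") for node in merge_group),
        key=lambda t: TYPE_PRIORITY.get(t.lower(), 0),
    )

group = [{"type": "concept"}, {"type": "class"}, {"type": "documentation"}]
assert pick_winning_type(group) == "class"  # 100 beats 31 (documentation) and 15 (concept)
```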
+# Types that should NOT be merged even with same name
+# These represent fundamentally different concepts
+NON_MERGEABLE_TYPES = {
+    # Don't merge tests with the things they test
+    ("test_case", "function"),
+    ("test_case", "class"),
+    ("test_case", "endpoint"),
+    ("test_suite", "module"),
+
+    # Don't merge documentation with code
+    ("documentation", "module"),
+    ("documentation", "class"),
+
+    # Don't merge defects with features
+    ("defect", "feature"),
+    ("incident", "feature"),
+
+    # Don't merge owners with owned items
+    ("owner", "module"),
+    ("owner", "service"),
+    ("team", "repository"),
+}
+
+# Types that should NEVER be deduplicated even with exact same name
+# These are context-dependent - same name in different files means different things
+# e.g., "Get Tests" tool in Xray toolkit != "Get Tests" tool in Zephyr toolkit
+NEVER_DEDUPLICATE_TYPES = {
+    "tool",  # Tools belong to specific toolkits
+    "property",  # Properties belong to specific entities
+    "properties",  # Same as above
+    "parameter",  # Parameters belong to specific functions/methods
+    "argument",  # Arguments belong to specific functions
+    "field",  # Fields belong to specific tables/forms
+    "column",  # Columns belong to specific tables
+    "attribute",  # Attributes belong to specific entities
+    "option",  # Options belong to specific settings
+    "setting",  # Settings may have same name in different contexts
+    "step",  # Steps belong to specific workflows/processes
+    "test_step",  # Test steps belong to specific test cases
+    "ui_field",  # UI fields belong to specific screens
+    "method",  # Methods belong to specific classes
+
+    # API types - same name can exist in different API contexts
+    "rest_endpoint",  # /users endpoint in API A != /users in API B
+    "rest_resource",  # Same resource name in different REST APIs
+    "graphql_query",  # Same query name in different GraphQL schemas
+    "graphql_mutation",  # Same mutation name in different GraphQL schemas
+    "graphql_subscription",  # Same subscription in different GraphQL schemas
+    "graphql_type",  # Same type name in different GraphQL schemas
+    "grpc_method",  # Same method name in different gRPC services
+    "protobuf_message",  # Same message name in different proto files
+    "event_type",  # Same event name in different event busses
+    "event_handler",  # Same handler name in different services
+}
+
+
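Taken together, the two guard tables imply a merge gate like the following sketch. Note the class applies them in two separate places (NEVER_DEDUPLICATE_TYPES as a filter inside `deduplicate_entities`, NON_MERGEABLE_TYPES inside `_are_types_mergeable`); this just condenses both checks into one predicate for illustration:

```python
def can_merge(type1: str, type2: str) -> bool:
    """Sketch combining both guard tables into a single predicate."""
    t1, t2 = type1.lower(), type2.lower()
    # Context-dependent types are filtered out before grouping ever starts
    if t1 in NEVER_DEDUPLICATE_TYPES or t2 in NEVER_DEDUPLICATE_TYPES:
        return False
    # Ordered pairs are checked in both directions
    return (t1, t2) not in NON_MERGEABLE_TYPES and (t2, t1) not in NON_MERGEABLE_TYPES

assert can_merge("concept", "class")           # allowed; TYPE_PRIORITY picks the survivor
assert not can_merge("test_case", "function")  # blocked: tests never merge with their subjects
assert not can_merge("tool", "tool")           # tools are toolkit-scoped, never deduplicated
```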
+class GraphEnricher:
+    """
+    Enriches a knowledge graph with cross-source relationships.
+    """
+
+    def __init__(self, graph_path: str):
+        """
+        Initialize enricher with a graph file.
+
+        Args:
+            graph_path: Path to the graph JSON file
+        """
+        self.graph_path = Path(graph_path)
+        self.graph_data: Dict[str, Any] = {}
+        self.nodes_by_id: Dict[str, Dict] = {}
+        self.nodes_by_name: Dict[str, List[Dict]] = defaultdict(list)
+        self.existing_links: Set[Tuple[str, str]] = set()
+        self.new_links: List[Dict] = []
+        self.id_mapping: Dict[str, str] = {}  # old_id -> new_id for merged nodes
+        self.merged_nodes: List[Dict] = []  # Track merged node info
+        self.stats = {
+            "cross_source_links": 0,
+            "orphan_links": 0,
+            "similarity_links": 0,
+            "entities_merged": 0,
+            "merge_groups": 0,
+        }
+
+        self._load_graph()
+
+    def _load_graph(self):
+        """Load graph from JSON file."""
+        with open(self.graph_path) as f:
+            self.graph_data = json.load(f)
+
+        # Build indices
+        for node in self.graph_data.get("nodes", []):
+            self.nodes_by_id[node["id"]] = node
+            name_key = self._normalize_name(node.get("name", ""))
+            self.nodes_by_name[name_key].append(node)
+
+        # Track existing links
+        for link in self.graph_data.get("links", []):
+            self.existing_links.add((link["source"], link["target"]))
+            self.existing_links.add((link["target"], link["source"]))  # bidirectional check
+
+        logger.info(f"Loaded graph: {len(self.nodes_by_id)} nodes, {len(self.existing_links)//2} links")
+
+    def normalize_entity_types(self):
+        """
+        Normalize all entity types in the graph to canonical lowercase forms.
+
+        This fixes inconsistencies like Tool/tool/Tools all becoming 'tool'.
+        Should be run before other enrichment steps.
+        """
+        logger.info("Normalizing entity types...")
+        types_normalized = 0
+        type_changes: Dict[str, str] = {}  # original -> normalized
+
+        for node in self.graph_data.get("nodes", []):
+            original_type = node.get("type", "")
+            normalized = normalize_type(original_type)
+
+            if normalized != original_type:
+                if original_type not in type_changes:
+                    type_changes[original_type] = normalized
+                node["type"] = normalized
+                types_normalized += 1
+
+        # Log what was changed
+        if type_changes:
+            logger.info(f"Normalized {types_normalized} entity types:")
+            for orig, norm in sorted(type_changes.items()):
+                logger.debug(f"  {orig} -> {norm}")
+
+        self.stats["types_normalized"] = types_normalized
+        self.stats["type_changes"] = len(type_changes)
+
+        # Rebuild indices after type normalization
+        self.nodes_by_id.clear()
+        self.nodes_by_name.clear()
+        for node in self.graph_data.get("nodes", []):
+            self.nodes_by_id[node["id"]] = node
+            name_key = self._normalize_name(node.get("name", ""))
+            self.nodes_by_name[name_key].append(node)
+
+        logger.info(f"Normalized {len(type_changes)} distinct type variations")
+
+    def _normalize_name(self, name: str) -> str:
+        """Normalize entity name for matching."""
+        # Convert to lowercase, replace separators with spaces
+        name = name.lower().strip()
+        name = re.sub(r'[_\-\.]+', ' ', name)
+        name = re.sub(r'\s+', ' ', name)
+        return name
+
+    def _tokenize_name(self, name: str) -> Set[str]:
+        """Tokenize name into significant words."""
+        normalized = self._normalize_name(name)
+        # Remove common stop words
+        stop_words = {'the', 'a', 'an', 'and', 'or', 'of', 'to', 'in', 'for', 'on', 'with', 'by', 'is', 'it'}
+        words = set(normalized.split())
+        return words - stop_words
+
+    def _get_source(self, node: Dict) -> str:
+        """Determine source category for a node."""
+        citations = node.get("citations", [])
+        if node.get("citation"):
+            citations = [node["citation"]]
+
+        if not citations:
+            return "unknown"
+
+        fp = citations[0].get("file_path", "")
+        if "alita-sdk" in fp or "alita_sdk" in fp:
+            return "sdk"
+        elif "elitea_core" in fp:
+            return "core"
+        elif "AlitaUI" in fp:
+            return "ui"
+        elif "docs/" in fp or fp.endswith(".md"):
+            return "docs"
+        else:
+            return "other"
+
+    def _is_code_type(self, entity_type: str) -> bool:
+        """Check if entity type represents code."""
+        return entity_type.lower() in {t.lower() for t in CODE_TYPES}
+
+    def _is_doc_type(self, entity_type: str) -> bool:
+        """Check if entity type represents documentation."""
+        return entity_type.lower() in {t.lower() for t in DOC_TYPES}
+
+    def _get_type_priority(self, entity_type: str) -> int:
+        """Get priority score for entity type."""
+        return TYPE_PRIORITY.get(entity_type.lower(), TYPE_PRIORITY.get(entity_type, 0))
+
+    def _are_types_mergeable(self, type1: str, type2: str) -> bool:
+        """Check if two entity types can be merged."""
+        t1, t2 = type1.lower(), type2.lower()
+        pair1 = (t1, t2)
+        pair2 = (t2, t1)
+        return pair1 not in NON_MERGEABLE_TYPES and pair2 not in NON_MERGEABLE_TYPES
+
+    def _generate_merged_id(self, name: str, entity_type: str) -> str:
+        """Generate a consistent ID for merged entity."""
+        normalized = self._normalize_name(name)
+        key = f"{entity_type}:{normalized}"
+        return hashlib.md5(key.encode()).hexdigest()[:16]
+
+    def _add_link(self, source_id: str, target_id: str, relation_type: str, reason: str):
+        """Add a new link if it doesn't exist."""
+        # Apply ID mapping for merged nodes
+        source_id = self.id_mapping.get(source_id, source_id)
+        target_id = self.id_mapping.get(target_id, target_id)
+
+        if source_id == target_id:
+            return False
+        if (source_id, target_id) in self.existing_links:
+            return False
+
+        self.new_links.append({
+            "source": source_id,
+            "target": target_id,
+            "relation_type": relation_type,
+            "enrichment_reason": reason,
+        })
+        self.existing_links.add((source_id, target_id))
+        self.existing_links.add((target_id, source_id))
+        return True
+
+    def _similarity(self, s1: str, s2: str) -> float:
+        """Calculate string similarity ratio."""
+        return SequenceMatcher(None, s1.lower(), s2.lower()).ratio()
+
+    def _word_overlap_score(self, name1: str, name2: str) -> float:
+        """Calculate word overlap score between two names."""
+        words1 = self._tokenize_name(name1)
+        words2 = self._tokenize_name(name2)
+        if not words1 or not words2:
+            return 0.0
+        overlap = len(words1 & words2)
+        return overlap / max(len(words1), len(words2))
+
|
|
1023
|
+
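    # Illustrative example (tokenization assumed as above): "Create Toolkit"
    # vs "Toolkit Creation Guide" gives {'create', 'toolkit'} and
    # {'toolkit', 'creation', 'guide'}; one shared word out of max(2, 3)
    # yields a score of 1/3 ~= 0.33.
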
    def deduplicate_entities(self,
                             name_similarity_threshold: float = 0.95,
                             require_exact_match: bool = True) -> int:
        """
        Soft entity deduplication - merge entities that represent the same concept.

        CONSERVATIVE APPROACH: Only merges entities with EXACT same name (after normalization).
        This prevents incorrectly merging related but distinct concepts like:
        - "Artifact Toolkit" vs "Artifact Toolkit Guide"
        - "Feature X" vs "Configure Feature X"

        Entities with different names but similar concepts should be LINKED, not merged.

        When merging, it:
        - Selects the best type based on TYPE_PRIORITY
        - Consolidates all citations from merged entities
        - Preserves all properties/attributes
        - Updates all links to point to the merged entity

        Args:
            name_similarity_threshold: Min similarity for fuzzy matching (only if require_exact_match=False)
            require_exact_match: If True (default), only merge exact name matches

        Returns:
            Number of entities merged
        """
        logger.info("Starting soft entity deduplication (exact match only)...")

        nodes = self.graph_data.get("nodes", [])
        if not nodes:
            return 0

        # Group entities by normalized name for exact matches
        name_groups: Dict[str, List[Dict]] = defaultdict(list)
        for node in nodes:
            name_key = self._normalize_name(node.get("name", ""))
            if len(name_key) >= 2:  # Skip very short names
                name_groups[name_key].append(node)

        # Find merge candidates - ONLY exact name matches
        merge_groups: List[List[Dict]] = []
        processed_ids: Set[str] = set()

        for name_key, group_nodes in name_groups.items():
            if len(group_nodes) < 2:
                continue

            # Skip types that should NEVER be deduplicated (context-dependent)
            # e.g., "Get Tests" tool in Xray != "Get Tests" tool in Zephyr
            group_nodes = [
                n for n in group_nodes
                if n.get("type", "").lower() not in NEVER_DEDUPLICATE_TYPES
            ]
            if len(group_nodes) < 2:
                continue

            # Filter to only mergeable types within exact name matches
            mergeable_groups: List[List[Dict]] = []
            for node in group_nodes:
                if node["id"] in processed_ids:
                    continue

                # Try to add to an existing group if types are compatible
                added = False
                for mg in mergeable_groups:
                    if all(self._are_types_mergeable(node.get("type", ""), m.get("type", "")) for m in mg):
                        mg.append(node)
                        added = True
                        break

                if not added:
                    mergeable_groups.append([node])

            # Add groups with multiple nodes
            for mg in mergeable_groups:
                if len(mg) >= 2:
                    merge_groups.append(mg)
                    for node in mg:
                        processed_ids.add(node["id"])

        # Optional: Phase 2 - Very high similarity fuzzy matches (disabled by default)
        if not require_exact_match:
            remaining_nodes = [n for n in nodes if n["id"] not in processed_ids]

            for i, node1 in enumerate(remaining_nodes):
                if node1["id"] in processed_ids:
                    continue

                name1 = self._normalize_name(node1.get("name", ""))
                if len(name1) < 3:
                    continue

                candidates = [node1]

                for node2 in remaining_nodes[i+1:]:
                    if node2["id"] in processed_ids:
                        continue

                    name2 = self._normalize_name(node2.get("name", ""))
                    if len(name2) < 3:
                        continue

                    # Check if types are mergeable
                    if not self._are_types_mergeable(node1.get("type", ""), node2.get("type", "")):
                        continue

                    # Only merge on VERY high similarity (almost identical names)
                    str_sim = self._similarity(name1, name2)
                    if str_sim >= name_similarity_threshold:
                        candidates.append(node2)

                if len(candidates) >= 2:
                    merge_groups.append(candidates)
                    for node in candidates:
                        processed_ids.add(node["id"])

        # Execute merges
        logger.info(f"Found {len(merge_groups)} merge groups")

        nodes_to_remove: Set[str] = set()
        nodes_to_add: List[Dict] = []

        for group in merge_groups:
            merged = self._merge_entity_group(group)
            if merged:
                nodes_to_add.append(merged["new_node"])
                nodes_to_remove.update(merged["removed_ids"])
                self.merged_nodes.append(merged)
                self.stats["entities_merged"] += len(merged["removed_ids"])

        self.stats["merge_groups"] = len(merge_groups)

        # Update nodes list
        self.graph_data["nodes"] = [n for n in nodes if n["id"] not in nodes_to_remove]
        self.graph_data["nodes"].extend(nodes_to_add)

        # Update links to use new IDs
        self._update_links_after_merge()

        # Rebuild indices
        self._rebuild_indices()

        logger.info(f"Deduplication complete: {self.stats['entities_merged']} entities merged into {self.stats['merge_groups']} groups")

        return self.stats["entities_merged"]

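    # Usage sketch (hypothetical data): with two nodes both named
    # "Artifact Toolkit", one typed "toolkit" and one typed "concept",
    # deduplicate_entities() collapses them into a single node, while a node
    # named "Artifact Toolkit Guide" survives untouched because only exact
    # normalized-name matches are merged by default.
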
    def _merge_entity_group(self, group: List[Dict]) -> Optional[Dict]:
        """
        Merge a group of entities into a single entity.

        Returns merge info dict or None if merge failed.
        """
        if len(group) < 2:
            return None

        # Select best type based on priority
        best_node = max(group, key=lambda n: self._get_type_priority(n.get("type", "")))
        best_type = best_node.get("type", "entity")

        # Use the name from the highest priority node
        best_name = best_node.get("name", "")

        # Generate merged ID
        new_id = self._generate_merged_id(best_name, best_type)

        # Collect all citations
        all_citations = []
        all_sources = set()
        for node in group:
            if "citations" in node:
                all_citations.extend(node["citations"])
            if "citation" in node:
                all_citations.append(node["citation"])
            all_sources.add(self._get_source(node))

        # Remove duplicate citations
        seen_citations = set()
        unique_citations = []
        for cit in all_citations:
            cit_key = (cit.get("file_path", ""), cit.get("chunk_index", 0))
            if cit_key not in seen_citations:
                seen_citations.add(cit_key)
                unique_citations.append(cit)

        # Collect all properties
        all_properties = {}
        for node in group:
            if "properties" in node:
                all_properties.update(node["properties"])

        # Collect all types as alternative_types
        all_types = list(set(n.get("type", "") for n in group if n.get("type")))
        all_types = [t for t in all_types if t != best_type]

        # Create merged node
        merged_node = {
            "id": new_id,
            "name": best_name,
            "type": best_type,
            "citations": unique_citations,
            "sources": list(all_sources),
            "merged_from": [n["id"] for n in group],
            "alternative_types": all_types,
        }

        if all_properties:
            merged_node["properties"] = all_properties

        # Add description from best node
        if "description" in best_node:
            merged_node["description"] = best_node["description"]
        else:
            # Try to get description from any node
            for node in group:
                if "description" in node:
                    merged_node["description"] = node["description"]
                    break

        # Map old IDs to new ID
        removed_ids = []
        for node in group:
            old_id = node["id"]
            self.id_mapping[old_id] = new_id
            removed_ids.append(old_id)

        return {
            "new_node": merged_node,
            "removed_ids": removed_ids,
            "merged_types": [n.get("type", "") for n in group],
        }

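    # Illustrative shape of the returned merge info (values are hypothetical):
    # {
    #     "new_node": {"id": "a1b2...", "name": "Artifact Toolkit",
    #                  "type": "toolkit", "citations": [...],
    #                  "sources": ["sdk", "docs"], "merged_from": [...],
    #                  "alternative_types": ["concept"]},
    #     "removed_ids": ["old-id-1", "old-id-2"],
    #     "merged_types": ["toolkit", "concept"],
    # }
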
    def _update_links_after_merge(self):
        """Update all links to use merged node IDs."""
        updated_links = []
        seen_links = set()

        for link in self.graph_data.get("links", []):
            source = self.id_mapping.get(link["source"], link["source"])
            target = self.id_mapping.get(link["target"], link["target"])

            # Skip self-links and duplicates
            if source == target:
                continue

            link_key = (source, target, link.get("relation_type", ""))
            if link_key in seen_links:
                continue
            seen_links.add(link_key)

            updated_link = link.copy()
            updated_link["source"] = source
            updated_link["target"] = target
            updated_links.append(updated_link)

        self.graph_data["links"] = updated_links

    def _rebuild_indices(self):
        """Rebuild internal indices after modifications."""
        self.nodes_by_id.clear()
        self.nodes_by_name.clear()
        self.existing_links.clear()

        for node in self.graph_data.get("nodes", []):
            self.nodes_by_id[node["id"]] = node
            name_key = self._normalize_name(node.get("name", ""))
            self.nodes_by_name[name_key].append(node)

        for link in self.graph_data.get("links", []):
            self.existing_links.add((link["source"], link["target"]))
            self.existing_links.add((link["target"], link["source"]))

    def enrich_cross_source_links(self, min_similarity: float = 0.85):
        """
        Create links between entities with similar names across different sources.

        For example, link SDK class "Toolkit" to docs concept "Toolkit".
        """
        logger.info("Creating cross-source links...")

        for name_key, nodes in self.nodes_by_name.items():
            if len(nodes) < 2:
                continue

            # Group by source
            by_source: Dict[str, List[Dict]] = defaultdict(list)
            for node in nodes:
                source = self._get_source(node)
                by_source[source].append(node)

            if len(by_source) < 2:
                continue  # All from same source

            # Link code entities to doc entities
            code_nodes = []
            doc_nodes = []

            for source, source_nodes in by_source.items():
                for node in source_nodes:
                    if self._is_code_type(node.get("type", "")):
                        code_nodes.append(node)
                    elif self._is_doc_type(node.get("type", "")):
                        doc_nodes.append(node)

            # Create cross-links
            for code_node in code_nodes:
                for doc_node in doc_nodes:
                    code_type = code_node.get("type", "").lower()
                    doc_type = doc_node.get("type", "").lower()

                    # Determine relationship type
                    rel_type = CROSS_SOURCE_RELATIONS.get(
                        (code_type, doc_type),
                        "related_to"
                    )

                    if self._add_link(
                        code_node["id"],
                        doc_node["id"],
                        rel_type,
                        f"cross_source:{name_key}"
                    ):
                        self.stats["cross_source_links"] += 1

        logger.info(f"Created {self.stats['cross_source_links']} cross-source links")

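    # Illustrative example: an SDK class "Toolkit" and a docs concept
    # "Toolkit" share a normalized name but come from different sources, so a
    # link with enrichment_reason "cross_source:<normalized name>" is added
    # between them, typed via CROSS_SOURCE_RELATIONS or falling back to
    # "related_to".
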
    def enrich_semantic_links(self,
                              min_word_overlap: float = 0.5,
                              max_links_per_entity: int = 5):
        """
        Create semantic links between entities based on shared concepts.

        This enhanced cross-linking finds relationships by:
        1. Shared significant words in entity names
        2. Similar context (source/type combinations)
        3. Hierarchical relationships (parent-child by naming)

        Args:
            min_word_overlap: Minimum word overlap ratio
            max_links_per_entity: Maximum new links per entity
        """
        logger.info("Creating semantic cross-links...")

        nodes = self.graph_data.get("nodes", [])
        links_created = 0

        # Build word index for efficient lookup
        word_to_nodes: Dict[str, List[Dict]] = defaultdict(list)
        for node in nodes:
            words = self._tokenize_name(node.get("name", ""))
            for word in words:
                if len(word) >= 3:  # Skip very short words
                    word_to_nodes[word].append(node)

        # Find semantic relationships
        processed_pairs: Set[Tuple[str, str]] = set()
        entity_link_count: Dict[str, int] = defaultdict(int)

        for node in nodes:
            if entity_link_count[node["id"]] >= max_links_per_entity:
                continue

            node_words = self._tokenize_name(node.get("name", ""))
            if not node_words:
                continue

            # Find candidate nodes sharing words
            candidates: Dict[str, float] = {}
            for word in node_words:
                for other in word_to_nodes.get(word, []):
                    if other["id"] == node["id"]:
                        continue

                    pair = tuple(sorted([node["id"], other["id"]]))
                    if pair in processed_pairs:
                        continue
                    if pair in self.existing_links:
                        continue

                    other_words = self._tokenize_name(other.get("name", ""))
                    if not other_words:
                        continue

                    # Calculate overlap
                    overlap = len(node_words & other_words)
                    overlap_ratio = overlap / max(len(node_words), len(other_words))

                    if overlap_ratio >= min_word_overlap:
                        if other["id"] not in candidates:
                            candidates[other["id"]] = overlap_ratio
                        else:
                            candidates[other["id"]] = max(candidates[other["id"]], overlap_ratio)

            # Create links to top candidates
            sorted_candidates = sorted(candidates.items(), key=lambda x: x[1], reverse=True)

            for other_id, overlap in sorted_candidates[:max_links_per_entity]:
                if entity_link_count[node["id"]] >= max_links_per_entity:
                    break
                if entity_link_count[other_id] >= max_links_per_entity:
                    continue

                pair = tuple(sorted([node["id"], other_id]))
                processed_pairs.add(pair)

                other_node = self.nodes_by_id.get(other_id)
                if not other_node:
                    continue

                # Determine relationship type
                rel_type = self._infer_relationship_type(node, other_node)

                if self._add_link(
                    node["id"],
                    other_id,
                    rel_type,
                    f"semantic_overlap:{overlap:.2f}"
                ):
                    links_created += 1
                    entity_link_count[node["id"]] += 1
                    entity_link_count[other_id] += 1

        self.stats["semantic_links"] = links_created
        logger.info(f"Created {links_created} semantic cross-links")
        return links_created

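    # Illustrative example (tokenization assumed as above): "Artifact Toolkit"
    # and "Artifact Storage" share the word "artifact", so
    # overlap_ratio = 1 / max(2, 2) = 0.5, which meets the default
    # min_word_overlap of 0.5 and makes the pair a link candidate.
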
    def _infer_relationship_type(self, node1: Dict, node2: Dict) -> str:
        """Infer the best relationship type between two entities."""
        type1 = node1.get("type", "").lower()
        type2 = node2.get("type", "").lower()
        name1 = self._normalize_name(node1.get("name", ""))
        name2 = self._normalize_name(node2.get("name", ""))

        # Tool/Toolkit relationships - highest priority
        if type1 == "toolkit" and type2 == "tool":
            return "contains"
        if type2 == "toolkit" and type1 == "tool":
            return "part_of"
        if type1 == "mcp_server" and type2 == "mcp_tool":
            return "provides"
        if type2 == "mcp_server" and type1 == "mcp_tool":
            return "provided_by"

        # Check for hierarchical relationship (one name contains the other)
        if name1 in name2 or name2 in name1:
            if len(name1) < len(name2):
                return "part_of"
            else:
                return "contains"

        # Check for type-based relationships
        type_pairs = [
            ({"class", "function", "method", "module"}, {"feature", "concept"}, "implements"),
            ({"endpoint", "api"}, {"service"}, "part_of"),
            ({"test_case", "test_suite"}, {"feature", "function", "class"}, "tests"),
            ({"defect", "incident"}, {"feature", "component"}, "affects"),
            ({"ticket"}, {"feature", "epic", "user_story"}, "implements"),
            ({"documentation"}, {"feature", "api", "class"}, "documents"),
            ({"toolkit"}, {"feature", "capability", "function"}, "provides"),
            ({"tool"}, {"feature", "capability", "function"}, "implements"),
        ]

        for types_a, types_b, rel in type_pairs:
            if (type1 in types_a and type2 in types_b) or (type2 in types_a and type1 in types_b):
                return rel

        # Check cross-source relation map
        if (type1, type2) in CROSS_SOURCE_RELATIONS:
            return CROSS_SOURCE_RELATIONS[(type1, type2)]
        if (type2, type1) in CROSS_SOURCE_RELATIONS:
            return CROSS_SOURCE_RELATIONS[(type2, type1)]

        return "related_to"

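    # Illustrative example: a "toolkit" node and a "tool" node short-circuit
    # to "contains"/"part_of"; for a "test_case" node and a "class" node,
    # if neither name contains the other, the method falls through to the
    # type_pairs table and returns "tests".
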
    def enrich_toolkit_tool_links(self):
        """
        Create explicit links between toolkits and their tools.

        This method specifically handles the toolkit → tool relationship by:
        1. Finding all toolkit and tool entities
        2. Matching tools to toolkits based on:
           - Same file path (tools defined in toolkit's documentation)
           - Toolkit name appearing in tool's parent_toolkit property
           - Tool name containing toolkit name prefix
        """
        logger.info("Linking tools to toolkits...")

        nodes = self.graph_data.get("nodes", [])
        links_created = 0

        # Index toolkits and tools
        toolkits = [n for n in nodes if n.get("type", "").lower() == "toolkit"]
        tools = [n for n in nodes if n.get("type", "").lower() == "tool"]

        # Index toolkits by file_path and name
        toolkit_by_file: Dict[str, List[Dict]] = defaultdict(list)
        toolkit_by_name: Dict[str, Dict] = {}

        for tk in toolkits:
            file_path = tk.get("file_path", "")
            if file_path:
                toolkit_by_file[file_path].append(tk)
            name = tk.get("name", "").lower()
            if name:
                toolkit_by_name[name] = tk
                # Also index by common variations
                # e.g., "GitHub Toolkit" → "github", "github toolkit"
                short_name = name.replace(" toolkit", "").replace("_toolkit", "")
                toolkit_by_name[short_name] = tk

        for tool in tools:
            tool_id = tool["id"]
            tool_file = tool.get("file_path", "")
            tool_name = tool.get("name", "").lower()
            tool_props = tool.get("properties", {})
            parent_toolkit = tool_props.get("parent_toolkit", "").lower()

            matched_toolkit = None
            match_reason = ""

            # Strategy 1: Match by parent_toolkit property
            if parent_toolkit:
                for tk_name, tk in toolkit_by_name.items():
                    if tk_name in parent_toolkit or parent_toolkit in tk_name:
                        matched_toolkit = tk
                        match_reason = f"parent_toolkit:{parent_toolkit}"
                        break

            # Strategy 2: Match by same file path
            if not matched_toolkit and tool_file:
                if tool_file in toolkit_by_file:
                    # Pick first matching toolkit in same file
                    matched_toolkit = toolkit_by_file[tool_file][0]
                    match_reason = f"same_file:{tool_file}"

            # Strategy 3: Match by tool name containing toolkit name
            if not matched_toolkit:
                for tk_name, tk in toolkit_by_name.items():
                    if tk_name in tool_name:
                        matched_toolkit = tk
                        match_reason = f"name_match:{tk_name}"
                        break

            # Create link if matched
            if matched_toolkit:
                pair = tuple(sorted([matched_toolkit["id"], tool_id]))
                if pair not in self.existing_links:
                    if self._add_link(
                        matched_toolkit["id"],
                        tool_id,
                        "contains",
                        f"toolkit_tool:{match_reason}"
                    ):
                        links_created += 1

        self.stats["toolkit_tool_links"] = links_created
        logger.info(f"Created {links_created} toolkit → tool links")

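    # Illustrative example (hypothetical names): a toolkit "GitHub Toolkit"
    # is indexed under both "github toolkit" and the short name "github", so
    # a tool named "GitHub Create File" matches via Strategy 3 ("github"
    # appears in the tool name) and receives a "contains" link with reason
    # "toolkit_tool:name_match:github".
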
    def enrich_orphan_nodes(self, max_links_per_orphan: int = 3):
        """
        Connect orphan nodes to related entities based on name similarity.
        """
        logger.info("Connecting orphan nodes...")

        # Find orphans
        connected = set()
        for link in self.graph_data.get("links", []):
            connected.add(link["source"])
            connected.add(link["target"])
        for link in self.new_links:
            connected.add(link["source"])
            connected.add(link["target"])

        orphans = [
            node for node in self.graph_data.get("nodes", [])
            if node["id"] not in connected
        ]

        logger.info(f"Found {len(orphans)} orphan nodes")

        # For each orphan, find potential parents
        for orphan in orphans:
            orphan_name = self._normalize_name(orphan.get("name", ""))
            orphan_words = set(orphan_name.split())

            candidates = []

            for node in self.graph_data.get("nodes", []):
                if node["id"] == orphan["id"]:
                    continue
                if node["id"] not in connected:
                    continue  # Don't link orphans to orphans

                node_name = self._normalize_name(node.get("name", ""))
                node_words = set(node_name.split())

                # Check word overlap
                overlap = len(orphan_words & node_words)
                if overlap > 0:
                    # Calculate similarity score
                    sim = self._similarity(orphan_name, node_name)
                    word_score = overlap / max(len(orphan_words), 1)
                    score = (sim + word_score) / 2

                    if score > 0.3:  # Minimum threshold
                        candidates.append((node, score))

            # Sort by score and take top matches
            candidates.sort(key=lambda x: x[1], reverse=True)

            for node, score in candidates[:max_links_per_orphan]:
                if self._add_link(
                    orphan["id"],
                    node["id"],
                    "related_to",
                    f"orphan_link:score={score:.2f}"
                ):
                    self.stats["orphan_links"] += 1

        logger.info(f"Created {self.stats['orphan_links']} orphan links")

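    # Illustrative scoring (hypothetical names): for an orphan
    # "toolkit config" against a connected node "toolkit configuration guide",
    # the score averages the SequenceMatcher ratio with the word-overlap
    # fraction (here 1/2 = 0.5); only pairs scoring above 0.3 become
    # "related_to" candidates, and at most max_links_per_orphan are kept.
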
    def enrich_similarity_links(self, min_similarity: float = 0.9):
        """
        Create links between entities with very similar names.

        This catches variations like "Create Toolkit" and "Toolkit Creation".
        """
        logger.info(f"Creating similarity links (threshold={min_similarity})...")

        nodes = self.graph_data.get("nodes", [])
        processed = set()

        for i, node1 in enumerate(nodes):
            name1 = self._normalize_name(node1.get("name", ""))
            if len(name1) < 3:
                continue

            for j, node2 in enumerate(nodes[i+1:], i+1):
                pair = (node1["id"], node2["id"])
                if pair in processed:
                    continue
                processed.add(pair)

                name2 = self._normalize_name(node2.get("name", ""))
                if len(name2) < 3:
                    continue

                # Calculate similarity
                sim = self._similarity(name1, name2)

                if sim >= min_similarity:
                    if self._add_link(
                        node1["id"],
                        node2["id"],
                        "similar_to",
                        f"similarity:{sim:.2f}"
                    ):
                        self.stats["similarity_links"] += 1

        logger.info(f"Created {self.stats['similarity_links']} similarity links")

    def validate_low_confidence_relationships(
        self,
        confidence_threshold: float = 0.7,
        llm: Optional[Any] = None
    ) -> Dict[str, Any]:
        """
        Validate and re-evaluate relationships with confidence below the threshold.

        This method routes low-confidence relationships through additional validation:
        1. Gather context from both source and target entities
        2. Check if the relationship makes semantic sense given the entity types
        3. Optionally use an LLM to validate ambiguous relationships

        Args:
            confidence_threshold: Relationships below this are candidates for validation
            llm: Optional LLM for re-evaluation (if None, uses heuristics only)

        Returns:
            Dictionary with validation stats:
            - validated: Number of relationships confirmed
            - rejected: Number of relationships removed
            - upgraded: Number of relationships with increased confidence
            - downgraded: Number of relationships with decreased confidence
        """
        logger.info(f"Validating low-confidence relationships (threshold={confidence_threshold})...")

        stats = {
            "candidates": 0,
            "validated": 0,
            "rejected": 0,
            "upgraded": 0,
            "downgraded": 0,
        }

        links_to_keep = []
        links_to_remove = []

        for link in self.graph_data.get("links", []):
            confidence = link.get("confidence", 1.0)

            # Skip high-confidence links
            if confidence >= confidence_threshold:
                links_to_keep.append(link)
                continue

            # Skip parser-extracted relationships (already validated by code structure)
            if link.get("source") == "parser":
                links_to_keep.append(link)
                continue

            stats["candidates"] += 1

            # Get source and target entities
            source_id = link.get("source")
            target_id = link.get("target")
            source_node = self.nodes_by_id.get(source_id)
            target_node = self.nodes_by_id.get(target_id)

            if not source_node or not target_node:
                # Invalid link - remove
                stats["rejected"] += 1
                links_to_remove.append(link)
                continue

            # Validate using heuristics
            validation_result = self._validate_relationship_heuristic(
                source_node, target_node, link
            )

            if validation_result["action"] == "keep":
                # Update confidence if suggested
                if "new_confidence" in validation_result:
                    link["confidence"] = validation_result["new_confidence"]
                    link["validation_reason"] = validation_result.get("reason", "heuristic")
                    if validation_result["new_confidence"] > confidence:
                        stats["upgraded"] += 1
                    elif validation_result["new_confidence"] < confidence:
                        stats["downgraded"] += 1
                stats["validated"] += 1
                links_to_keep.append(link)

            elif validation_result["action"] == "remove":
                stats["rejected"] += 1
                links_to_remove.append(link)
                logger.debug(
                    f"Removing low-confidence relationship: {source_node.get('name')} "
                    f"--[{link.get('relation_type')}]--> {target_node.get('name')} "
                    f"(reason: {validation_result.get('reason', 'unknown')})"
                )

            elif validation_result["action"] == "llm_validate" and llm:
                # Use LLM for ambiguous cases
                llm_result = self._validate_relationship_with_llm(
                    source_node, target_node, link, llm
                )
                if llm_result["valid"]:
                    link["confidence"] = llm_result.get("confidence", confidence)
                    link["validation_reason"] = "llm_validated"
                    stats["validated"] += 1
                    links_to_keep.append(link)
                else:
                    stats["rejected"] += 1
                    links_to_remove.append(link)
            else:
                # Default: keep with same confidence
                links_to_keep.append(link)
                stats["validated"] += 1

        # Update links
        self.graph_data["links"] = links_to_keep

        # Log removed links for analysis
        if links_to_remove:
            logger.info(f"Removed {len(links_to_remove)} invalid low-confidence relationships")

        self.stats["low_confidence_validation"] = stats
        logger.info(
            f"Low-confidence validation: {stats['candidates']} candidates, "
            f"{stats['validated']} validated, {stats['rejected']} rejected, "
            f"{stats['upgraded']} upgraded, {stats['downgraded']} downgraded"
        )

        return stats

    def _validate_relationship_heuristic(
        self,
        source_node: Dict,
        target_node: Dict,
        link: Dict
    ) -> Dict[str, Any]:
        """
        Validate a relationship using heuristic rules.

        Returns:
            Dict with 'action' (keep/remove/llm_validate) and optional 'new_confidence'
        """
        source_type = source_node.get("type", "").lower()
        target_type = target_node.get("type", "").lower()
        relation_type = link.get("relation_type", "").lower()
        confidence = link.get("confidence", 0.5)

        # Rule 1: Invalid type combinations for specific relationships
        invalid_combinations = {
            # imports should be between code entities
            "imports": {
                "invalid_source": {"feature", "concept", "documentation", "requirement"},
                "invalid_target": {"feature", "concept", "documentation", "requirement"},
            },
            # implements should have code as source
            "implements": {
                "invalid_source": {"documentation", "concept", "glossary_term"},
            },
            # contains should have container as source
            "contains": {
                "invalid_source": {"constant", "variable", "field", "property"},
            },
            # tests should have test as source
            "tests": {
                "invalid_source": {"class", "function", "method", "module"},
            },
        }

        if relation_type in invalid_combinations:
            rules = invalid_combinations[relation_type]
            if source_type in rules.get("invalid_source", set()):
                return {"action": "remove", "reason": f"invalid_source_type:{source_type}"}
            if target_type in rules.get("invalid_target", set()):
                return {"action": "remove", "reason": f"invalid_target_type:{target_type}"}

        # Rule 2: Boost confidence for semantically valid combinations
        valid_combinations = {
            ("class", "interface", "implements"): 0.9,
            ("method", "function", "calls"): 0.85,
            ("test_case", "function", "tests"): 0.9,
            ("test_case", "class", "tests"): 0.9,
            ("documentation", "class", "documents"): 0.85,
            ("documentation", "function", "documents"): 0.85,
            ("ticket", "feature", "implements"): 0.8,
            ("feature", "requirement", "implements"): 0.85,
            ("toolkit", "tool", "contains"): 0.95,
            ("module", "class", "contains"): 0.9,
            ("class", "method", "contains"): 0.95,
        }

        combo_key = (source_type, target_type, relation_type)
        if combo_key in valid_combinations:
            suggested_confidence = valid_combinations[combo_key]
            return {
                "action": "keep",
                "new_confidence": max(confidence, suggested_confidence),
                "reason": f"valid_combination:{combo_key}"
            }

        # Rule 3: Check name overlap for related_to relationships
        if relation_type == "related_to":
            source_words = self._tokenize_name(source_node.get("name", ""))
            target_words = self._tokenize_name(target_node.get("name", ""))

            if source_words and target_words:
                overlap = len(source_words & target_words)
                if overlap >= 2:
                    # Good overlap - boost confidence
                    return {
                        "action": "keep",
                        "new_confidence": min(confidence + 0.2, 0.9),
                        "reason": f"name_overlap:{overlap}"
                    }
                elif overlap == 0 and confidence < 0.5:
                    # No overlap and low confidence - consider removal
                    return {"action": "llm_validate", "reason": "no_name_overlap"}

        # Rule 4: Very low confidence with no semantic support
        if confidence < 0.4:
            # Check if there's any semantic basis
            source_name = source_node.get("name", "").lower()
            target_name = target_node.get("name", "").lower()

            if (source_name not in target_name and
                    target_name not in source_name and
                    self._word_overlap_score(source_name, target_name) < 0.3):
                return {"action": "remove", "reason": "very_low_confidence_no_semantic_support"}

        # Default: keep with same confidence
        return {"action": "keep", "reason": "default"}

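    # Illustrative outcomes of the heuristics above: an "imports" link whose
    # source is typed "concept" is removed outright; a ("toolkit", "tool",
    # "contains") link is kept with its confidence raised to at least 0.95;
    # and a "related_to" link whose endpoint names share two words gets a
    # +0.2 confidence boost, capped at 0.9.
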
    def _validate_relationship_with_llm(
        self,
        source_node: Dict,
        target_node: Dict,
        link: Dict,
        llm: Any
    ) -> Dict[str, Any]:
        """
        Use LLM to validate an ambiguous relationship.

        Args:
            source_node: Source entity
            target_node: Target entity
            link: The relationship to validate
            llm: LLM instance for validation

        Returns:
            Dict with 'valid' (bool) and 'confidence' (float)
        """
        from langchain_core.prompts import ChatPromptTemplate
        from langchain_core.output_parsers import JsonOutputParser

        prompt_template = """Validate if the following relationship makes semantic sense.

Source Entity:
- Name: {source_name}
- Type: {source_type}
- Description: {source_desc}

Relationship: {relation_type}

Target Entity:
- Name: {target_name}
- Type: {target_type}
- Description: {target_desc}

Question: Does it make sense that "{source_name}" {relation_type} "{target_name}"?

Respond with ONLY a JSON object:
{{"valid": true/false, "confidence": 0.0-1.0, "reason": "<brief explanation>"}}
"""

        try:
            prompt = ChatPromptTemplate.from_template(prompt_template)
            parser = JsonOutputParser()
            chain = prompt | llm | parser

            result = chain.invoke({
                "source_name": source_node.get("name", ""),
                "source_type": source_node.get("type", ""),
                "source_desc": source_node.get("description", "No description"),
                "relation_type": link.get("relation_type", "related_to"),
                "target_name": target_node.get("name", ""),
                "target_type": target_node.get("type", ""),
                "target_desc": target_node.get("description", "No description"),
            })

            return {
                "valid": result.get("valid", False),
                "confidence": result.get("confidence", 0.5),
                "reason": result.get("reason", "llm_validated")
            }

        except Exception as e:
            logger.warning(f"LLM validation failed: {e}")
            # On LLM failure, keep the relationship
            return {"valid": True, "confidence": link.get("confidence", 0.5)}

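    # Usage sketch (hypothetical model; since the chain is just
    # prompt | llm | JsonOutputParser(), any LangChain-compatible chat model
    # should work):
    #   from langchain_openai import ChatOpenAI
    #   enricher.validate_low_confidence_relationships(
    #       confidence_threshold=0.7, llm=ChatOpenAI(model="gpt-4o-mini"))
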
    def enrich(
        self,
        normalize_types: bool = True,  # Normalize entity types first
        deduplicate: bool = False,  # Disabled by default - can lose semantic meaning
        cross_source: bool = True,
        semantic_links: bool = True,
        toolkit_tools: bool = True,  # Link tools to their toolkits
        orphans: bool = True,
        similarity: bool = False,  # Disabled by default - can create too many links
        validate_low_confidence: bool = True,  # Validate relationships with confidence < 0.7
        confidence_threshold: float = 0.7,  # Threshold for low-confidence validation
        min_similarity: float = 0.9,
        exact_match_only: bool = True,
        llm: Optional[Any] = None,  # Optional LLM for relationship validation
    ):
        """
        Run all enrichment steps.

        The recommended order is:
        0. Normalize entity types (Tool/tool/Tools → tool)
        1. Deduplicate entities (DISABLED by default - use with caution)
        2. Link tools to toolkits (explicit toolkit → tool relationships)
        3. Create cross-source links (code ↔ docs)
        4. Create semantic links (shared concepts) - LINKS related entities
        5. Connect orphans
        6. Similarity links (optional)
        7. Validate low-confidence relationships

        Args:
            normalize_types: Normalize entity types to canonical forms
            deduplicate: Merge entities with exact same name (DISABLED by default)
            cross_source: Link same-named entities across sources
            semantic_links: Link entities sharing significant words
            toolkit_tools: Create explicit toolkit → tool relationships
            orphans: Connect orphan nodes to related entities
            similarity: Link highly similar entity names
            validate_low_confidence: Validate relationships below confidence_threshold
            confidence_threshold: Threshold for low-confidence validation (default: 0.7)
            min_similarity: Threshold for similarity matching
            exact_match_only: Only merge exact name matches if dedup enabled
            llm: Optional LLM instance for validating ambiguous relationships
        """
        # Step 0: Normalize entity types (Tool/tool/Tools → tool)
        if normalize_types:
            self.normalize_entity_types()

        # Step 1: Deduplication (DISABLED by default - can lose semantic meaning)
        if deduplicate:
            self.deduplicate_entities(require_exact_match=exact_match_only)

        # Step 2: Link tools to their toolkits (high priority - structural)
        if toolkit_tools:
            self.enrich_toolkit_tool_links()

        # Step 3: Cross-source linking
        if cross_source:
            self.enrich_cross_source_links()

        # Step 4: Semantic cross-linking (LINKS related entities, doesn't merge)
        if semantic_links:
            self.enrich_semantic_links()

        # Step 5: Orphan connections
        if orphans:
            self.enrich_orphan_nodes()

        # Step 6: High similarity links (optional)
        if similarity:
            self.enrich_similarity_links(min_similarity)

        # Step 7: Validate low-confidence relationships
        if validate_low_confidence:
            self.validate_low_confidence_relationships(
                confidence_threshold=confidence_threshold,
                llm=llm
            )

        logger.info(f"Enrichment complete: {len(self.new_links)} new links added")
        return self.stats

    def save(self, output_path: Optional[str] = None):
        """
        Save the enriched graph.

        Args:
            output_path: Optional output path. If None, overwrites the input file.
        """
        output = Path(output_path) if output_path else self.graph_path

        # Merge new links
        all_links = self.graph_data.get("links", []) + self.new_links
        self.graph_data["links"] = all_links

        # Add enrichment metadata
        if "metadata" not in self.graph_data:
            self.graph_data["metadata"] = {}
        self.graph_data["metadata"]["enrichment_stats"] = self.stats

        with open(output, "w") as f:
            json.dump(self.graph_data, f, indent=2)

        logger.info(f"Saved enriched graph to {output}")
        return str(output)

    def get_stats(self) -> Dict[str, Any]:
        """Get enrichment statistics."""
        return {
            **self.stats,
            "total_new_links": len(self.new_links),
            "original_nodes": len(self.nodes_by_id) + self.stats.get("entities_merged", 0),
            "final_nodes": len(self.nodes_by_id),
            "original_links": len(self.graph_data.get("links", [])) - len(self.new_links),
            "final_links": len(self.graph_data.get("links", [])),
        }


def enrich_graph(
    graph_path: str,
    output_path: Optional[str] = None,
    deduplicate: bool = False,  # Disabled by default
    cross_source: bool = True,
    semantic_links: bool = True,
    toolkit_tools: bool = True,
    orphans: bool = True,
    similarity: bool = False,
    validate_low_confidence: bool = True,
    confidence_threshold: float = 0.7,
    llm: Optional[Any] = None,
) -> Dict[str, Any]:
    """
    Convenience function to enrich a graph file.

    Args:
        graph_path: Path to input graph JSON
        output_path: Path to output (default: overwrite input)
        deduplicate: Merge same/similar entities (disabled by default)
        cross_source: Create cross-source links
        semantic_links: Create semantic cross-links
        toolkit_tools: Link tools to their toolkits
        orphans: Connect orphan nodes
        similarity: Create similarity links
        validate_low_confidence: Validate relationships below confidence_threshold
        confidence_threshold: Threshold for low-confidence validation (default: 0.7)
        llm: Optional LLM instance for validating ambiguous relationships

    Returns:
        Enrichment statistics
    """
    enricher = GraphEnricher(graph_path)
    stats = enricher.enrich(
        deduplicate=deduplicate,
        cross_source=cross_source,
        semantic_links=semantic_links,
        toolkit_tools=toolkit_tools,
        orphans=orphans,
        similarity=similarity,
        validate_low_confidence=validate_low_confidence,
        confidence_threshold=confidence_threshold,
        llm=llm,
    )
    enricher.save(output_path)
    return stats

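
# Illustrative usage sketch (the file names below are hypothetical; any
# node-link graph JSON produced by the ingestion pipeline should work):
if __name__ == "__main__":
    stats = enrich_graph(
        "knowledge_graph.json",
        output_path="knowledge_graph.enriched.json",
        similarity=False,           # the pairwise similarity pass can be noisy
        confidence_threshold=0.7,   # re-check anything weaker than 0.7
    )
    print(f"Enrichment stats: {stats}")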