alita-sdk 0.3.462__py3-none-any.whl → 0.3.627__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +15 -3
- alita_sdk/cli/agent_loader.py +56 -8
- alita_sdk/cli/agent_ui.py +93 -31
- alita_sdk/cli/agents.py +2274 -230
- alita_sdk/cli/callbacks.py +96 -25
- alita_sdk/cli/cli.py +10 -1
- alita_sdk/cli/config.py +162 -9
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1073 -0
- alita_sdk/cli/testcases/__init__.py +94 -0
- alita_sdk/cli/testcases/data_generation.py +119 -0
- alita_sdk/cli/testcases/discovery.py +96 -0
- alita_sdk/cli/testcases/executor.py +84 -0
- alita_sdk/cli/testcases/logger.py +85 -0
- alita_sdk/cli/testcases/parser.py +172 -0
- alita_sdk/cli/testcases/prompts.py +91 -0
- alita_sdk/cli/testcases/reporting.py +125 -0
- alita_sdk/cli/testcases/setup.py +108 -0
- alita_sdk/cli/testcases/test_runner.py +282 -0
- alita_sdk/cli/testcases/utils.py +39 -0
- alita_sdk/cli/testcases/validation.py +90 -0
- alita_sdk/cli/testcases/workflow.py +196 -0
- alita_sdk/cli/toolkit.py +14 -17
- alita_sdk/cli/toolkit_loader.py +35 -5
- alita_sdk/cli/tools/__init__.py +36 -2
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +910 -64
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +72 -12
- alita_sdk/community/inventory/__init__.py +236 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +173 -0
- alita_sdk/community/inventory/toolkit_utils.py +176 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/__init__.py +1 -1
- alita_sdk/configurations/ado.py +141 -20
- alita_sdk/configurations/bitbucket.py +0 -3
- alita_sdk/configurations/confluence.py +76 -42
- alita_sdk/configurations/figma.py +76 -0
- alita_sdk/configurations/gitlab.py +17 -5
- alita_sdk/configurations/openapi.py +329 -0
- alita_sdk/configurations/qtest.py +72 -1
- alita_sdk/configurations/report_portal.py +96 -0
- alita_sdk/configurations/sharepoint.py +148 -0
- alita_sdk/configurations/testio.py +83 -0
- alita_sdk/runtime/clients/artifact.py +3 -3
- alita_sdk/runtime/clients/client.py +353 -48
- alita_sdk/runtime/clients/sandbox_client.py +0 -21
- alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
- alita_sdk/runtime/langchain/assistant.py +123 -26
- alita_sdk/runtime/langchain/constants.py +642 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +6 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +226 -7
- alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +5 -2
- alita_sdk/runtime/langchain/document_loaders/constants.py +12 -7
- alita_sdk/runtime/langchain/langraph_agent.py +279 -73
- alita_sdk/runtime/langchain/utils.py +82 -15
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/skills/__init__.py +91 -0
- alita_sdk/runtime/skills/callbacks.py +498 -0
- alita_sdk/runtime/skills/discovery.py +540 -0
- alita_sdk/runtime/skills/executor.py +610 -0
- alita_sdk/runtime/skills/input_builder.py +371 -0
- alita_sdk/runtime/skills/models.py +330 -0
- alita_sdk/runtime/skills/registry.py +355 -0
- alita_sdk/runtime/skills/skill_runner.py +330 -0
- alita_sdk/runtime/toolkits/__init__.py +7 -0
- alita_sdk/runtime/toolkits/application.py +21 -9
- alita_sdk/runtime/toolkits/artifact.py +15 -5
- alita_sdk/runtime/toolkits/datasource.py +13 -6
- alita_sdk/runtime/toolkits/mcp.py +139 -251
- alita_sdk/runtime/toolkits/mcp_config.py +1048 -0
- alita_sdk/runtime/toolkits/planning.py +178 -0
- alita_sdk/runtime/toolkits/skill_router.py +238 -0
- alita_sdk/runtime/toolkits/subgraph.py +251 -6
- alita_sdk/runtime/toolkits/tools.py +238 -32
- alita_sdk/runtime/toolkits/vectorstore.py +11 -5
- alita_sdk/runtime/tools/__init__.py +3 -1
- alita_sdk/runtime/tools/application.py +20 -6
- alita_sdk/runtime/tools/artifact.py +511 -28
- alita_sdk/runtime/tools/data_analysis.py +183 -0
- alita_sdk/runtime/tools/function.py +43 -15
- alita_sdk/runtime/tools/image_generation.py +50 -44
- alita_sdk/runtime/tools/llm.py +852 -67
- alita_sdk/runtime/tools/loop.py +3 -1
- alita_sdk/runtime/tools/loop_output.py +3 -1
- alita_sdk/runtime/tools/mcp_remote_tool.py +25 -10
- alita_sdk/runtime/tools/mcp_server_tool.py +7 -6
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/router.py +2 -4
- alita_sdk/runtime/tools/sandbox.py +9 -6
- alita_sdk/runtime/tools/skill_router.py +776 -0
- alita_sdk/runtime/tools/tool.py +3 -1
- alita_sdk/runtime/tools/vectorstore.py +7 -2
- alita_sdk/runtime/tools/vectorstore_base.py +51 -11
- alita_sdk/runtime/utils/AlitaCallback.py +137 -21
- alita_sdk/runtime/utils/constants.py +5 -1
- alita_sdk/runtime/utils/mcp_client.py +492 -0
- alita_sdk/runtime/utils/mcp_oauth.py +202 -5
- alita_sdk/runtime/utils/mcp_sse_client.py +36 -7
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/serialization.py +155 -0
- alita_sdk/runtime/utils/streamlit.py +6 -10
- alita_sdk/runtime/utils/toolkit_utils.py +16 -5
- alita_sdk/runtime/utils/utils.py +36 -0
- alita_sdk/tools/__init__.py +113 -29
- alita_sdk/tools/ado/repos/__init__.py +51 -33
- alita_sdk/tools/ado/repos/repos_wrapper.py +148 -89
- alita_sdk/tools/ado/test_plan/__init__.py +25 -9
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +23 -1
- alita_sdk/tools/ado/utils.py +1 -18
- alita_sdk/tools/ado/wiki/__init__.py +25 -8
- alita_sdk/tools/ado/wiki/ado_wrapper.py +291 -22
- alita_sdk/tools/ado/work_item/__init__.py +26 -9
- alita_sdk/tools/ado/work_item/ado_wrapper.py +56 -3
- alita_sdk/tools/advanced_jira_mining/__init__.py +11 -8
- alita_sdk/tools/aws/delta_lake/__init__.py +13 -9
- alita_sdk/tools/aws/delta_lake/tool.py +5 -1
- alita_sdk/tools/azure_ai/search/__init__.py +11 -8
- alita_sdk/tools/azure_ai/search/api_wrapper.py +1 -1
- alita_sdk/tools/base/tool.py +5 -1
- alita_sdk/tools/base_indexer_toolkit.py +170 -45
- alita_sdk/tools/bitbucket/__init__.py +17 -12
- alita_sdk/tools/bitbucket/api_wrapper.py +59 -11
- alita_sdk/tools/bitbucket/cloud_api_wrapper.py +49 -35
- alita_sdk/tools/browser/__init__.py +5 -4
- alita_sdk/tools/carrier/__init__.py +5 -6
- alita_sdk/tools/carrier/backend_reports_tool.py +6 -6
- alita_sdk/tools/carrier/run_ui_test_tool.py +6 -6
- alita_sdk/tools/carrier/ui_reports_tool.py +5 -5
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/code/treesitter/treesitter.py +37 -13
- alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/cloud/aws/__init__.py +10 -7
- alita_sdk/tools/cloud/azure/__init__.py +10 -7
- alita_sdk/tools/cloud/gcp/__init__.py +10 -7
- alita_sdk/tools/cloud/k8s/__init__.py +10 -7
- alita_sdk/tools/code/linter/__init__.py +10 -8
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code/sonar/__init__.py +10 -7
- alita_sdk/tools/code_indexer_toolkit.py +73 -23
- alita_sdk/tools/confluence/__init__.py +21 -15
- alita_sdk/tools/confluence/api_wrapper.py +78 -23
- alita_sdk/tools/confluence/loader.py +4 -2
- alita_sdk/tools/custom_open_api/__init__.py +12 -5
- alita_sdk/tools/elastic/__init__.py +11 -8
- alita_sdk/tools/elitea_base.py +493 -30
- alita_sdk/tools/figma/__init__.py +58 -11
- alita_sdk/tools/figma/api_wrapper.py +1235 -143
- alita_sdk/tools/figma/figma_client.py +73 -0
- alita_sdk/tools/figma/toon_tools.py +2748 -0
- alita_sdk/tools/github/__init__.py +13 -14
- alita_sdk/tools/github/github_client.py +224 -100
- alita_sdk/tools/github/graphql_client_wrapper.py +119 -33
- alita_sdk/tools/github/schemas.py +14 -5
- alita_sdk/tools/github/tool.py +5 -1
- alita_sdk/tools/github/tool_prompts.py +9 -22
- alita_sdk/tools/gitlab/__init__.py +15 -11
- alita_sdk/tools/gitlab/api_wrapper.py +207 -41
- alita_sdk/tools/gitlab_org/__init__.py +10 -8
- alita_sdk/tools/gitlab_org/api_wrapper.py +63 -64
- alita_sdk/tools/google/bigquery/__init__.py +13 -12
- alita_sdk/tools/google/bigquery/tool.py +5 -1
- alita_sdk/tools/google_places/__init__.py +10 -8
- alita_sdk/tools/google_places/api_wrapper.py +1 -1
- alita_sdk/tools/jira/__init__.py +17 -11
- alita_sdk/tools/jira/api_wrapper.py +91 -40
- alita_sdk/tools/keycloak/__init__.py +11 -8
- alita_sdk/tools/localgit/__init__.py +9 -3
- alita_sdk/tools/localgit/local_git.py +62 -54
- alita_sdk/tools/localgit/tool.py +5 -1
- alita_sdk/tools/memory/__init__.py +11 -3
- alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
- alita_sdk/tools/ocr/__init__.py +11 -8
- alita_sdk/tools/openapi/__init__.py +490 -114
- alita_sdk/tools/openapi/api_wrapper.py +1368 -0
- alita_sdk/tools/openapi/tool.py +20 -0
- alita_sdk/tools/pandas/__init__.py +20 -12
- alita_sdk/tools/pandas/api_wrapper.py +38 -25
- alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
- alita_sdk/tools/postman/__init__.py +11 -11
- alita_sdk/tools/pptx/__init__.py +10 -9
- alita_sdk/tools/pptx/pptx_wrapper.py +1 -1
- alita_sdk/tools/qtest/__init__.py +30 -10
- alita_sdk/tools/qtest/api_wrapper.py +430 -13
- alita_sdk/tools/rally/__init__.py +10 -8
- alita_sdk/tools/rally/api_wrapper.py +1 -1
- alita_sdk/tools/report_portal/__init__.py +12 -9
- alita_sdk/tools/salesforce/__init__.py +10 -9
- alita_sdk/tools/servicenow/__init__.py +17 -14
- alita_sdk/tools/servicenow/api_wrapper.py +1 -1
- alita_sdk/tools/sharepoint/__init__.py +10 -8
- alita_sdk/tools/sharepoint/api_wrapper.py +4 -4
- alita_sdk/tools/slack/__init__.py +10 -8
- alita_sdk/tools/slack/api_wrapper.py +2 -2
- alita_sdk/tools/sql/__init__.py +11 -9
- alita_sdk/tools/testio/__init__.py +10 -8
- alita_sdk/tools/testrail/__init__.py +11 -8
- alita_sdk/tools/testrail/api_wrapper.py +1 -1
- alita_sdk/tools/utils/__init__.py +9 -4
- alita_sdk/tools/utils/content_parser.py +77 -3
- alita_sdk/tools/utils/text_operations.py +410 -0
- alita_sdk/tools/utils/tool_prompts.py +79 -0
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +17 -13
- alita_sdk/tools/xray/__init__.py +12 -9
- alita_sdk/tools/yagmail/__init__.py +9 -3
- alita_sdk/tools/zephyr/__init__.py +9 -7
- alita_sdk/tools/zephyr_enterprise/__init__.py +11 -8
- alita_sdk/tools/zephyr_essential/__init__.py +10 -8
- alita_sdk/tools/zephyr_essential/api_wrapper.py +30 -13
- alita_sdk/tools/zephyr_essential/client.py +2 -2
- alita_sdk/tools/zephyr_scale/__init__.py +11 -9
- alita_sdk/tools/zephyr_scale/api_wrapper.py +2 -2
- alita_sdk/tools/zephyr_squad/__init__.py +10 -8
- {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/METADATA +147 -7
- alita_sdk-0.3.627.dist-info/RECORD +468 -0
- alita_sdk-0.3.627.dist-info/entry_points.txt +2 -0
- alita_sdk-0.3.462.dist-info/RECORD +0 -384
- alita_sdk-0.3.462.dist-info/entry_points.txt +0 -2
- {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/top_level.txt +0 -0
New file `alita_sdk/community/inventory/knowledge_graph.py` (the +1457 -0 entry above):

@@ -0,0 +1,1457 @@

```python
"""
NetworkX-based Knowledge Graph implementation.

Provides lightweight in-memory graph storage with JSON persistence.
Entities contain citations (source file, line numbers) instead of raw content.
Raw data should be retrieved on-demand using filesystem tools.
"""

import json
import logging
from datetime import datetime
from typing import Any, Optional, List, Dict, Set
from collections import defaultdict

try:
    import networkx as nx
    from networkx import DiGraph
except ImportError:
    nx = None

logger = logging.getLogger(__name__)


class Citation:
    """
    Represents a source citation for an entity.

    Citations are lightweight references to source files and line ranges.
    The actual content should be retrieved on-demand using filesystem tools.
    """

    def __init__(
        self,
        file_path: str,
        line_start: Optional[int] = None,
        line_end: Optional[int] = None,
        source_toolkit: Optional[str] = None,
        doc_id: Optional[str] = None,
        content_hash: Optional[str] = None,
    ):
        self.file_path = file_path
        self.line_start = line_start
        self.line_end = line_end
        self.source_toolkit = source_toolkit
        self.doc_id = doc_id
        self.content_hash = content_hash

    def to_dict(self) -> Dict[str, Any]:
        """Convert citation to dictionary."""
        return {
            'file_path': self.file_path,
            'line_start': self.line_start,
            'line_end': self.line_end,
            'source_toolkit': self.source_toolkit,
            'doc_id': self.doc_id,
            'content_hash': self.content_hash,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'Citation':
        """Create citation from dictionary."""
        return cls(
            file_path=data.get('file_path', ''),
            line_start=data.get('line_start'),
            line_end=data.get('line_end'),
            source_toolkit=data.get('source_toolkit'),
            doc_id=data.get('doc_id'),
            content_hash=data.get('content_hash'),
        )

    def __repr__(self) -> str:
        if self.line_start and self.line_end:
            return f"{self.file_path}:{self.line_start}-{self.line_end}"
        elif self.line_start:
            return f"{self.file_path}:{self.line_start}"
        return self.file_path
```
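For orientation, a `Citation` is a plain value object: it round-trips through `to_dict`/`from_dict` and renders as a compact `path:start-end` reference. A minimal sketch (identifiers are illustrative):

```python
cite = Citation("src/users/service.py", line_start=10, line_end=42,
                source_toolkit="github", doc_id="doc-a")
print(repr(cite))  # src/users/service.py:10-42
assert Citation.from_dict(cite.to_dict()).to_dict() == cite.to_dict()
```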
```python
class KnowledgeGraph:
    """
    Lightweight NetworkX-based knowledge graph for storing entities and relationships.

    Design principles:
    - Graph contains only entity metadata and citations (not raw content)
    - Citations reference source files and line numbers
    - Raw content is retrieved on-demand via filesystem tools
    - Graph file stays small and portable

    Features:
    - In-memory property graph using NetworkX
    - JSON persistence via node_link_data format
    - Delta update support with source document tracking
    - Entity deduplication with merge strategies
    - Impact analysis via graph traversal
    - Enhanced search with fuzzy matching, token-based search, and file path patterns
    """

    # Layer classification based on entity types
    LAYER_TYPE_MAPPING = {
        'code': {
            'class', 'function', 'method', 'module', 'import', 'variable',
            'constant', 'attribute', 'decorator', 'exception', 'enum',
            'class_reference', 'class_import', 'function_import', 'function_reference',
            'function_call', 'method_call', 'test_function', 'pydanticmodel'
        },
        'service': {
            'api_endpoint', 'rpc_method', 'route', 'service', 'handler',
            'controller', 'middleware', 'event', 'sio', 'rpc'
        },
        'data': {
            'model', 'schema', 'field', 'table', 'database', 'migration',
            'entity', 'pydantic_model', 'dictionary', 'list', 'object'
        },
        'product': {
            'feature', 'capability', 'platform', 'product', 'application',
            'menu', 'ui_element', 'ui_component', 'interface_element'
        },
        'domain': {
            'concept', 'process', 'action', 'use_case', 'workflow',
            'requirement', 'guideline', 'best_practice'
        },
        'documentation': {
            'document', 'guide', 'section', 'subsection', 'tip',
            'example', 'resource', 'reference', 'documentation'
        },
        'configuration': {
            'configuration', 'configuration_option', 'configuration_section',
            'setting', 'credential', 'secret', 'integration'
        },
        'testing': {
            'test', 'test_case', 'test_function', 'fixture', 'mock'
        },
        'tooling': {
            'tool', 'toolkit', 'command', 'node_type', 'node'
        },
        'knowledge': {
            # Facts extracted from code and documentation
            'fact',
            # Code-specific fact types
            'algorithm', 'behavior', 'validation', 'dependency', 'error_handling',
            # Text-specific fact types
            'decision', 'definition', 'date', 'contact',
        },
        'structure': {
            # File-level container nodes
            'file', 'source_file', 'document_file', 'config_file', 'web_file',
            # Directory/package structure
            'directory', 'package',
        }
    }

    # Reverse mapping: type -> layer
    TYPE_TO_LAYER = {}
    for layer, types in LAYER_TYPE_MAPPING.items():
        for t in types:
            TYPE_TO_LAYER[t] = layer
```
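One detail worth noting in the reverse mapping: `TYPE_TO_LAYER` is built with a plain loop, so a type listed under two layers resolves to whichever layer is declared later in the dict. `'test_function'` appears under both `'code'` and `'testing'`, and dict insertion order means `'testing'` wins:

```python
print(KnowledgeGraph.TYPE_TO_LAYER['test_function'])  # 'testing', not 'code'
print(KnowledgeGraph.TYPE_TO_LAYER['api_endpoint'])   # 'service'
```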
```python
    def __init__(self):
        """Initialize an empty knowledge graph."""
        if nx is None:
            raise ImportError("networkx is required for KnowledgeGraph. Install with: pip install networkx>=3.0")

        self._graph: DiGraph = DiGraph()
        self._entity_index: Dict[str, Set[str]] = defaultdict(set)  # name -> set of node_ids (handles duplicates)
        self._type_index: Dict[str, Set[str]] = defaultdict(set)  # type (lowercase) -> node_ids
        self._file_index: Dict[str, Set[str]] = defaultdict(set)  # file_path -> node_ids
        self._source_doc_index: Dict[str, Set[str]] = defaultdict(set)  # source_doc_id -> node_ids
        self._metadata: Dict[str, Any] = {}  # Graph metadata (sources, timestamps)
        self._schema: Optional[Dict[str, Any]] = None  # Discovered entity schema

    # ========== Entity Operations ==========

    def add_entity(
        self,
        entity_id: str,
        name: str,
        entity_type: str,
        citation: Optional[Citation] = None,
        properties: Optional[Dict[str, Any]] = None,
    ) -> str:
        """
        Add an entity to the graph with optional citation.

        If an entity with this ID already exists, the citation is merged
        into the existing entity's citations list (enabling same-named
        entities from different files to be unified).

        Args:
            entity_id: Unique identifier for the entity
            name: Human-readable entity name
            entity_type: Type classification (e.g., 'Class', 'Function', 'Service')
            citation: Source citation (file path, line numbers)
            properties: Additional properties (no raw content, only metadata)

        Returns:
            The entity_id (node ID in graph)
        """
        # Check if entity already exists (for merging citations)
        existing = self._graph.nodes.get(entity_id)

        if existing:
            # Entity exists - merge the new citation
            if citation:
                new_citation_dict = citation.to_dict()
                existing_citations = existing.get('citations', [])

                # Migrate legacy single 'citation' to list
                if 'citation' in existing and existing['citation']:
                    legacy = existing['citation']
                    if legacy not in existing_citations:
                        existing_citations.append(legacy)

                # Add new citation if not duplicate
                if new_citation_dict not in existing_citations:
                    existing_citations.append(new_citation_dict)

                # Update node with merged citations
                self._graph.nodes[entity_id]['citations'] = existing_citations
                self._graph.nodes[entity_id].pop('citation', None)  # Remove legacy field

                # Track source document
                if citation.doc_id:
                    self._source_doc_index[citation.doc_id].add(entity_id)

            logger.debug(f"Merged citation into existing entity: {entity_type} '{name}' ({entity_id})")
            return entity_id

        # New entity - prepare node data
        node_data = {
            'id': entity_id,
            'name': name,
            'type': entity_type,
        }

        # Auto-assign layer based on entity type
        inferred_layer = self.TYPE_TO_LAYER.get(entity_type.lower())
        if inferred_layer:
            node_data['layer'] = inferred_layer

        # Store citation in list format from the start
        if citation:
            node_data['citations'] = [citation.to_dict()]
            # Track source document
            if citation.doc_id:
                self._source_doc_index[citation.doc_id].add(entity_id)
            # Track file index
            if citation.file_path:
                self._file_index[citation.file_path].add(entity_id)

        # Add other properties (excluding any large content)
        if properties:
            # Filter out raw content fields
            excluded_keys = {'content', 'text', 'raw', 'body', 'source_content'}
            for key, value in properties.items():
                if key not in excluded_keys:
                    # Only store if serializable and reasonably sized
                    if isinstance(value, (str, int, float, bool, list, dict)) and \
                            (not isinstance(value, str) or len(value) < 1000):
                        node_data[key] = value

        # Add new node
        self._graph.add_node(entity_id, **node_data)

        # Update indices - store ALL entities with this name (not just one)
        self._entity_index[name.lower()].add(entity_id)
        self._type_index[entity_type.lower()].add(entity_id)

        logger.debug(f"Added entity: {entity_type} '{name}' ({entity_id})")
        return entity_id
```
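Calling `add_entity` twice with the same `entity_id` exercises the merge path described in the docstring: the second citation is appended rather than overwriting the first. A sketch with hypothetical identifiers:

```python
kg = KnowledgeGraph()
kg.add_entity("cls:UserService", "UserService", "class",
              citation=Citation("src/users/service.py", 10, 80, doc_id="doc-a"))
kg.add_entity("cls:UserService", "UserService", "class",
              citation=Citation("docs/architecture.md", 5, 12, doc_id="doc-b"))

entity = kg.get_entity("cls:UserService")
print(entity['layer'])           # 'code', inferred from type 'class'
print(len(entity['citations']))  # 2: citations merged, not replaced
```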
```python
    def get_entity(self, entity_id: str) -> Optional[Dict[str, Any]]:
        """Get entity by ID."""
        if self._graph.has_node(entity_id):
            return dict(self._graph.nodes[entity_id])
        return None

    def find_entity_by_name(self, name: str) -> Optional[Dict[str, Any]]:
        """
        Find entity by name (case-insensitive).

        If multiple entities have the same name, returns the first one found.
        Use find_all_entities_by_name to get all matches.
        """
        node_ids = self._entity_index.get(name.lower(), set())
        if node_ids:
            # Return first match
            return self.get_entity(next(iter(node_ids)))
        return None

    def find_all_entities_by_name(self, name: str) -> List[Dict[str, Any]]:
        """
        Find all entities with the given name (case-insensitive).

        Returns all entities if multiple have the same name but different types.
        """
        node_ids = self._entity_index.get(name.lower(), set())
        return [self.get_entity(nid) for nid in node_ids if nid]

    def get_entities_by_type(self, entity_type: str, limit: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Get all entities of a specific type (case-insensitive).

        Also checks layer-based type groups. For example, searching for 'code'
        will return classes, functions, methods, etc.
        """
        entity_type_lower = entity_type.lower()

        # Check if this is a layer name
        if entity_type_lower in self.LAYER_TYPE_MAPPING:
            # Get all types in this layer
            results = []
            for t in self.LAYER_TYPE_MAPPING[entity_type_lower]:
                node_ids = self._type_index.get(t, set())
                for nid in node_ids:
                    entity = self.get_entity(nid)
                    if entity:
                        results.append(entity)
            if limit:
                return results[:limit]
            return results

        # Use type index for fast lookup
        node_ids = self._type_index.get(entity_type_lower, set())
        if node_ids:
            results = [self.get_entity(nid) for nid in node_ids if nid]
            if limit:
                return results[:limit]
            return results

        # Fallback: linear scan (for types not in index)
        results = [
            dict(data)
            for _, data in self._graph.nodes(data=True)
            if data.get('type', '').lower() == entity_type_lower
        ]
        if limit:
            return results[:limit]
        return results

    def get_entities_by_layer(self, layer: str, limit: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Get all entities in a specific layer (product, domain, service, code, data, etc.).

        Layer is inferred from entity type if not explicitly set on the entity.
        """
        layer_lower = layer.lower()

        # Get types that belong to this layer
        layer_types = self.LAYER_TYPE_MAPPING.get(layer_lower, set())

        results = []
        for _, data in self._graph.nodes(data=True):
            # Check explicit layer
            if data.get('layer', '').lower() == layer_lower:
                results.append(dict(data))
                continue

            # Check if type belongs to this layer
            entity_type = data.get('type', '').lower()
            if entity_type in layer_types:
                results.append(dict(data))

        if limit:
            return results[:limit]
        return results

    def get_all_entities(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
        """Get all entities in the graph."""
        results = [
            {'id': node_id, **dict(data)}
            for node_id, data in self._graph.nodes(data=True)
        ]
        if limit:
            return results[:limit]
        return results

    def get_all_entity_types(self) -> List[str]:
        """Get list of all entity types in the graph."""
        types = set()
        for _, data in self._graph.nodes(data=True):
            if 'type' in data:
                types.add(data['type'])
        return sorted(types)

    def update_entity(self, entity_id: str, updates: Dict[str, Any]) -> bool:
        """
        Update entity properties.

        Args:
            entity_id: Entity to update
            updates: Properties to update (merged with existing)

        Returns:
            True if entity exists and was updated
        """
        if not self._graph.has_node(entity_id):
            return False

        # Filter out raw content
        excluded_keys = {'content', 'text', 'raw', 'body', 'source_content'}
        filtered_updates = {
            k: v for k, v in updates.items()
            if k not in excluded_keys
        }

        current = dict(self._graph.nodes[entity_id])
        current.update(filtered_updates)

        for key, value in current.items():
            self._graph.nodes[entity_id][key] = value

        return True

    def remove_entity(self, entity_id: str) -> bool:
        """Remove entity and its edges from the graph."""
        if not self._graph.has_node(entity_id):
            return False

        # Remove from all indices
        entity = self.get_entity(entity_id)
        if entity:
            # Remove from name index
            name = entity.get('name', '').lower()
            if name in self._entity_index:
                self._entity_index[name].discard(entity_id)
                if not self._entity_index[name]:
                    del self._entity_index[name]

            # Remove from type index
            entity_type = entity.get('type', '').lower()
            if entity_type in self._type_index:
                self._type_index[entity_type].discard(entity_id)
                if not self._type_index[entity_type]:
                    del self._type_index[entity_type]

            # Remove from file index
            file_path = entity.get('file_path', '')
            if file_path in self._file_index:
                self._file_index[file_path].discard(entity_id)
                if not self._file_index[file_path]:
                    del self._file_index[file_path]

            # Remove from source doc index
            for citation in entity.get('citations', []):
                if isinstance(citation, dict):
                    doc_id = citation.get('doc_id')
                    if doc_id and entity_id in self._source_doc_index.get(doc_id, set()):
                        self._source_doc_index[doc_id].discard(entity_id)

        self._graph.remove_node(entity_id)
        return True

    # ========== Relation Operations ==========
```
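`get_entities_by_type` doubles as a layer query: passing a layer name such as `'code'` expands to every type in that layer. Continuing the sketch:

```python
kg.add_entity("fn:create_user", "create_user", "function",
              citation=Citation("src/users/service.py", 90, 120, doc_id="doc-a"))

print(len(kg.get_entities_by_type("function")))  # 1: exact type lookup
print(len(kg.get_entities_by_type("code")))      # 2: expands to the whole 'code' layer
```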
```python
    def add_relation(
        self,
        source_id: str,
        target_id: str,
        relation_type: str,
        properties: Optional[Dict[str, Any]] = None,
    ) -> bool:
        """
        Add a directed relation between entities.

        Args:
            source_id: Source entity ID
            target_id: Target entity ID
            relation_type: Type of relationship (e.g., 'CALLS', 'IMPORTS', 'INHERITS')
            properties: Additional edge properties

        Returns:
            True if relation was added
        """
        if not self._graph.has_node(source_id):
            logger.warning(f"Source entity {source_id} not found")
            return False
        if not self._graph.has_node(target_id):
            logger.warning(f"Target entity {target_id} not found")
            return False

        edge_data = {'relation_type': relation_type}
        if properties:
            edge_data.update(properties)

        self._graph.add_edge(source_id, target_id, **edge_data)
        logger.debug(f"Added relation: {source_id} --[{relation_type}]--> {target_id}")
        return True

    def get_relations(self, entity_id: str, direction: str = 'both') -> List[Dict[str, Any]]:
        """
        Get relations for an entity.

        Args:
            entity_id: Entity ID
            direction: 'outgoing', 'incoming', or 'both'

        Returns:
            List of relation dicts with source, target, type, properties
        """
        relations = []

        if direction in ('outgoing', 'both'):
            for _, target, data in self._graph.out_edges(entity_id, data=True):
                relations.append({
                    'source': entity_id,
                    'target': target,
                    'relation_type': data.get('relation_type'),
                    'properties': {k: v for k, v in data.items() if k != 'relation_type'}
                })

        if direction in ('incoming', 'both'):
            for source, _, data in self._graph.in_edges(entity_id, data=True):
                relations.append({
                    'source': source,
                    'target': entity_id,
                    'relation_type': data.get('relation_type'),
                    'properties': {k: v for k, v in data.items() if k != 'relation_type'}
                })

        return relations

    def remove_relation(self, source_id: str, target_id: str) -> bool:
        """Remove a relation between entities."""
        if self._graph.has_edge(source_id, target_id):
            self._graph.remove_edge(source_id, target_id)
            return True
        return False

    def get_relations_by_source(
        self,
        source_toolkit: str,
        relation_type: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """
        Get all relations from a specific source toolkit.

        Args:
            source_toolkit: Name of source toolkit (e.g., 'github', 'jira')
            relation_type: Optional filter by relation type

        Returns:
            List of relations with their properties
        """
        relations = []

        for source, target, data in self._graph.edges(data=True):
            # Check if this relation is from the specified source
            rel_source = data.get('source_toolkit')
            if rel_source == source_toolkit:
                # Filter by relation type if specified
                if relation_type is None or data.get('relation_type') == relation_type:
                    relations.append({
                        'source': source,
                        'target': target,
                        'relation_type': data.get('relation_type'),
                        'source_toolkit': rel_source,
                        'properties': {k: v for k, v in data.items()
                                       if k not in ('relation_type', 'source_toolkit')}
                    })

        return relations

    def get_cross_source_relations(self) -> List[Dict[str, Any]]:
        """
        Get relations that connect entities from different sources.

        These are particularly valuable for understanding how different
        data sources relate to each other (e.g., Jira ticket references GitHub PR).

        Returns:
            List of cross-source relations
        """
        cross_source = []

        for source, target, data in self._graph.edges(data=True):
            source_node = self._graph.nodes.get(source, {})
            target_node = self._graph.nodes.get(target, {})

            # Get source toolkits from entity citations
            source_citations = source_node.get('citations', [])
            target_citations = target_node.get('citations', [])

            if not source_citations or not target_citations:
                continue

            # Get unique source toolkits for each entity
            source_toolkits = set()
            target_toolkits = set()

            for citation in source_citations:
                if isinstance(citation, dict):
                    toolkit = citation.get('source_toolkit')
                elif hasattr(citation, 'source_toolkit'):
                    toolkit = citation.source_toolkit
                else:
                    toolkit = None
                if toolkit:
                    source_toolkits.add(toolkit)

            for citation in target_citations:
                if isinstance(citation, dict):
                    toolkit = citation.get('source_toolkit')
                elif hasattr(citation, 'source_toolkit'):
                    toolkit = citation.source_toolkit
                else:
                    toolkit = None
                if toolkit:
                    target_toolkits.add(toolkit)

            # Check if entities come from different sources
            if source_toolkits and target_toolkits and source_toolkits != target_toolkits:
                cross_source.append({
                    'source': source,
                    'target': target,
                    'source_toolkits': list(source_toolkits),
                    'target_toolkits': list(target_toolkits),
                    'relation_type': data.get('relation_type'),
                    'relation_source': data.get('source_toolkit'),
                    'properties': {k: v for k, v in data.items()
                                   if k not in ('relation_type', 'source_toolkit')}
                })

        return cross_source

    # ========== Graph Analysis ==========
```
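Relations are plain directed edges with `relation_type` stored as an edge attribute; extra properties such as `source_toolkit` ride along and feed the per-source queries above. Continuing the sketch:

```python
kg.add_relation("fn:create_user", "cls:UserService", "CALLS",
                properties={"source_toolkit": "github"})

rels = kg.get_relations("cls:UserService", direction="incoming")
print(rels[0]['relation_type'])                            # 'CALLS'
print(len(kg.get_relations_by_source("github", "CALLS")))  # 1
```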
```python
    def get_neighbors(
        self,
        entity_id: str,
        max_depth: int = 1,
        relation_types: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """
        Get neighboring entities up to a certain depth.

        Args:
            entity_id: Starting entity
            max_depth: How many hops to traverse
            relation_types: Filter by relation types

        Returns:
            Dict with entities and relations
        """
        if not self._graph.has_node(entity_id):
            return {'entities': [], 'relations': []}

        visited = {entity_id}
        entities = [self.get_entity(entity_id)]
        relations = []

        current_level = [entity_id]

        for _ in range(max_depth):
            next_level = []

            for node in current_level:
                # Outgoing edges
                for _, target, data in self._graph.out_edges(node, data=True):
                    rel_type = data.get('relation_type')
                    if relation_types and rel_type not in relation_types:
                        continue

                    relations.append({
                        'source': node,
                        'target': target,
                        'relation_type': rel_type,
                    })

                    if target not in visited:
                        visited.add(target)
                        next_level.append(target)
                        entities.append(self.get_entity(target))

                # Incoming edges
                for source, _, data in self._graph.in_edges(node, data=True):
                    rel_type = data.get('relation_type')
                    if relation_types and rel_type not in relation_types:
                        continue

                    relations.append({
                        'source': source,
                        'target': node,
                        'relation_type': rel_type,
                    })

                    if source not in visited:
                        visited.add(source)
                        next_level.append(source)
                        entities.append(self.get_entity(source))

            current_level = next_level

        return {'entities': entities, 'relations': relations}

    def find_path(self, source_id: str, target_id: str) -> Optional[List[str]]:
        """Find shortest path between two entities."""
        if not self._graph.has_node(source_id) or not self._graph.has_node(target_id):
            return None

        try:
            path = nx.shortest_path(self._graph, source_id, target_id)
            return path
        except nx.NetworkXNoPath:
            return None

    def impact_analysis(
        self,
        entity_id: str,
        direction: str = 'downstream',
        max_depth: int = 3,
    ) -> Dict[str, Any]:
        """
        Analyze impact of changes to an entity.

        Args:
            entity_id: Entity to analyze
            direction: 'downstream' (what depends on this) or 'upstream' (what this depends on)
            max_depth: Maximum traversal depth

        Returns:
            Dict with impacted entities and paths
        """
        if not self._graph.has_node(entity_id):
            return {'impacted': [], 'paths': []}

        impacted = []
        paths = []

        # Use BFS for level-by-level analysis
        visited = {entity_id}
        queue = [(entity_id, [entity_id], 0)]

        while queue:
            current, path, depth = queue.pop(0)

            if depth >= max_depth:
                continue

            # Get edges based on direction
            if direction == 'downstream':
                edges = self._graph.in_edges(current, data=True)
            else:  # upstream
                edges = self._graph.out_edges(current, data=True)

            for edge in edges:
                if direction == 'downstream':
                    neighbor = edge[0]
                else:
                    neighbor = edge[1]

                if neighbor not in visited:
                    visited.add(neighbor)
                    new_path = path + [neighbor]

                    entity = self.get_entity(neighbor)
                    impacted.append({
                        'entity': entity,
                        'depth': depth + 1,
                        'path': new_path,
                    })
                    paths.append(new_path)

                    queue.append((neighbor, new_path, depth + 1))

        return {'impacted': impacted, 'paths': paths}
```
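`impact_analysis` reads 'downstream' as 'follow incoming edges', so callers and importers of an entity surface when it changes. Continuing the sketch, editing `UserService` flags `create_user`:

```python
result = kg.impact_analysis("cls:UserService", direction="downstream", max_depth=2)
print([(hit['depth'], hit['entity']['name']) for hit in result['impacted']])
# [(1, 'create_user')]
```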
```python
    # ========== Search Operations ==========

    def _tokenize(self, text: str) -> Set[str]:
        """Tokenize text into searchable tokens (handles camelCase, snake_case, etc.)."""
        import re
        if not text:
            return set()

        # Split on non-alphanumeric
        words = re.split(r'[^a-zA-Z0-9]+', text.lower())

        # Also split camelCase
        tokens = set()
        for word in words:
            if word:
                tokens.add(word)
                # Split camelCase: "ChatMessageHandler" -> ["chat", "message", "handler"]
                camel_parts = re.findall(r'[a-z]+|[A-Z][a-z]*|[0-9]+', word)
                tokens.update(p.lower() for p in camel_parts if p)

        return tokens

    def _calculate_match_score(
        self,
        query_tokens: Set[str],
        query_lower: str,
        name: str,
        entity_type: str,
        description: str,
        file_path: str,
    ) -> tuple:
        """
        Calculate match score for an entity.

        Returns (score, match_field) tuple.
        Higher scores mean better matches.
        """
        name_lower = name.lower()
        name_tokens = self._tokenize(name)

        # Exact name match (highest priority)
        if query_lower == name_lower:
            return (1.0, 'name_exact')

        # Exact substring in name
        if query_lower in name_lower:
            # Prefer matches at word boundaries
            score = 0.85 if name_lower.startswith(query_lower) else 0.75
            return (score, 'name_contains')

        # Token overlap in name (for camelCase matching)
        if query_tokens and name_tokens:
            overlap = len(query_tokens & name_tokens)
            if overlap > 0:
                # Score based on percentage of query tokens matched
                score = 0.6 * (overlap / len(query_tokens))
                if overlap == len(query_tokens):  # All query tokens found
                    score = 0.7
                return (score, 'name_tokens')

        # Check file path
        if file_path and query_lower in file_path.lower():
            return (0.55, 'file_path')

        # Check description
        if description:
            desc_lower = description.lower()
            if query_lower in desc_lower:
                return (0.5, 'description')
            # Token match in description
            desc_tokens = self._tokenize(description)
            if query_tokens and desc_tokens:
                overlap = len(query_tokens & desc_tokens)
                if overlap > 0:
                    score = 0.35 * (overlap / len(query_tokens))
                    return (score, 'description_tokens')

        # Check entity type
        if query_lower in entity_type.lower():
            return (0.3, 'type')

        return (0.0, None)
```
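A note on `_tokenize`: the text is lowercased before the camelCase regex runs, so in practice tokens come from the non-alphanumeric split, and camelCase identifiers collapse to a single token. A quick illustration:

```python
print(sorted(kg._tokenize("chat_message_handler")))  # ['chat', 'handler', 'message']
print(kg._tokenize("ChatMessageHandler"))            # {'chatmessagehandler'}
```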
```python
    def search(
        self,
        query: str,
        top_k: int = 10,
        entity_type: Optional[str] = None,
        layer: Optional[str] = None,
        file_pattern: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """
        Search entities with enhanced matching capabilities.

        Supports:
        - Exact and partial name matching
        - Token-based matching (handles camelCase, snake_case)
        - Description and property search
        - File path pattern matching
        - Type and layer filtering

        Args:
            query: Search query string
            top_k: Maximum results to return
            entity_type: Filter by entity type (case-insensitive)
            layer: Filter by layer (code, service, data, product, etc.)
            file_pattern: Filter by file path pattern (glob-like)

        Returns:
            List of matching entities with scores
        """
        import re

        results = []
        query_lower = query.lower().strip()
        query_tokens = self._tokenize(query)

        # Get layer types for filtering
        layer_types = set()
        if layer:
            layer_types = self.LAYER_TYPE_MAPPING.get(layer.lower(), set())

        # Compile file pattern if provided
        file_regex = None
        if file_pattern:
            # Convert glob pattern to regex
            pattern = file_pattern.replace('.', r'\.').replace('*', '.*').replace('?', '.')
            try:
                file_regex = re.compile(pattern, re.IGNORECASE)
            except re.error:
                pass

        for node_id, data in self._graph.nodes(data=True):
            # Type filter (case-insensitive)
            data_type = data.get('type', '').lower()
            if entity_type and data_type != entity_type.lower():
                continue

            # Layer filter
            if layer:
                entity_layer = data.get('layer', '').lower()
                if entity_layer != layer.lower() and data_type not in layer_types:
                    continue

            # File pattern filter
            citations = data.get('citations', [])
            if not citations and 'citation' in data:
                citations = [data['citation']]

            file_paths = [c.get('file_path', '') for c in citations if isinstance(c, dict)]
            primary_file = file_paths[0] if file_paths else data.get('file_path', '')

            if file_regex and primary_file:
                if not file_regex.search(primary_file):
                    continue

            # Calculate match score
            name = data.get('name', '')
            description = data.get('description', '')
            if isinstance(data.get('properties'), dict):
                description = description or data['properties'].get('description', '')

            score, match_field = self._calculate_match_score(
                query_tokens, query_lower, name, data_type, description, primary_file
            )

            if score > 0:
                results.append({
                    'entity': dict(data),
                    'score': score,
                    'match_field': match_field,
                })

        # Sort by score (descending), then by name
        results.sort(key=lambda x: (-x['score'], x['entity'].get('name', '').lower()))
        return results[:top_k]
```
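`search` returns scored hits rather than bare entities, with prefix matches outranking inner-substring matches. Continuing the sketch:

```python
for hit in kg.search("user", layer="code", top_k=5):
    print(f"{hit['score']:.2f} {hit['match_field']} {hit['entity']['name']}")
# 0.85 name_contains UserService   (name starts with the query)
# 0.75 name_contains create_user   (query is an inner substring)
```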
```python
    def search_by_file(self, file_path_pattern: str, limit: int = 50) -> List[Dict[str, Any]]:
        """
        Search entities by file path pattern.

        Args:
            file_path_pattern: Glob-like pattern (e.g., "api/*.py", "**/chat*.py")
            limit: Maximum results

        Returns:
            List of entities from matching files
        """
        import re

        # Convert glob to regex
        pattern = file_path_pattern.replace('.', r'\.').replace('**', '.*').replace('*', '[^/]*').replace('?', '.')
        try:
            file_regex = re.compile(pattern, re.IGNORECASE)
        except re.error:
            return []

        results = []
        for file_path, node_ids in self._file_index.items():
            if file_regex.search(file_path):
                for nid in node_ids:
                    entity = self.get_entity(nid)
                    if entity:
                        results.append(entity)
                        if len(results) >= limit:
                            return results

        # Also check entities with file_path attribute (backup)
        if not results:
            for _, data in self._graph.nodes(data=True):
                fp = data.get('file_path', '')
                if fp and file_regex.search(fp):
                    results.append(dict(data))
                    if len(results) >= limit:
                        break

        return results

    def search_advanced(
        self,
        query: Optional[str] = None,
        entity_types: Optional[List[str]] = None,
        layers: Optional[List[str]] = None,
        file_patterns: Optional[List[str]] = None,
        has_relations: Optional[bool] = None,
        min_citations: Optional[int] = None,
        top_k: int = 20,
    ) -> List[Dict[str, Any]]:
        """
        Advanced search with multiple filter criteria.

        Args:
            query: Text search query (optional)
            entity_types: List of types to include (OR logic)
            layers: List of layers to include (OR logic)
            file_patterns: List of file patterns to include (OR logic)
            has_relations: If True, only entities with relations; if False, isolated entities
            min_citations: Minimum number of citations required
            top_k: Maximum results

        Returns:
            List of matching entities
        """
        import re

        # Build type filter set
        type_filter = set()
        if entity_types:
            for t in entity_types:
                type_filter.add(t.lower())
                # Expand layer names to types
                if t.lower() in self.LAYER_TYPE_MAPPING:
                    type_filter.update(self.LAYER_TYPE_MAPPING[t.lower()])

        # Build layer filter set
        layer_filter = set()
        if layers:
            for l in layers:
                layer_filter.add(l.lower())

        # Build file regex patterns
        file_regexes = []
        if file_patterns:
            for fp in file_patterns:
                pattern = fp.replace('.', r'\.').replace('**', '.*').replace('*', '[^/]*')
                try:
                    file_regexes.append(re.compile(pattern, re.IGNORECASE))
                except re.error:
                    pass

        query_tokens = self._tokenize(query) if query else set()
        query_lower = query.lower().strip() if query else ''

        results = []

        for node_id, data in self._graph.nodes(data=True):
            data_type = data.get('type', '').lower()
            data_layer = data.get('layer', '').lower() or self.TYPE_TO_LAYER.get(data_type, '')

            # Type filter
            if type_filter and data_type not in type_filter:
                continue

            # Layer filter
            if layer_filter and data_layer not in layer_filter:
                continue

            # File pattern filter
            file_path = data.get('file_path', '')
            if file_regexes:
                if not any(rx.search(file_path) for rx in file_regexes):
                    continue

            # Relations filter
            if has_relations is not None:
                has_edges = (
                    self._graph.in_degree(node_id) > 0 or
                    self._graph.out_degree(node_id) > 0
                )
                if has_relations and not has_edges:
                    continue
                if not has_relations and has_edges:
                    continue

            # Citations filter
            if min_citations:
                citations = data.get('citations', [])
                if len(citations) < min_citations:
                    continue

            # Text search
            score = 1.0
            match_field = 'filter'

            if query:
                name = data.get('name', '')
                description = data.get('description', '')
                if isinstance(data.get('properties'), dict):
                    description = description or data['properties'].get('description', '')

                score, match_field = self._calculate_match_score(
                    query_tokens, query_lower, name, data_type, description, file_path
                )

                if score == 0:
                    continue

            results.append({
                'entity': dict(data),
                'score': score,
                'match_field': match_field,
            })

        results.sort(key=lambda x: (-x['score'], x['entity'].get('name', '').lower()))
        return results[:top_k]
```
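The two glob translations differ: `search()` turns `*` into `.*`, which can cross `/`, while `search_by_file` turns `*` into `[^/]*`, confining it to one path segment. Continuing the sketch:

```python
kg.search("service", file_pattern="src/*.py")  # '*' -> '.*', matches src/users/service.py
kg.search_by_file("src/users/*.py")            # '*' -> '[^/]*', stays in one directory
```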
+    def get_entities_by_source(self, doc_id: str) -> List[Dict[str, Any]]:
+        """Get all entities from a specific source document."""
+        node_ids = self._source_doc_index.get(doc_id, set())
+        return [self.get_entity(nid) for nid in node_ids if nid]
+
+    def get_entities_by_file(self, file_path: str) -> List[Dict[str, Any]]:
+        """Get all entities with citations from a specific file."""
+        # First try the file index
+        node_ids = self._file_index.get(file_path, set())
+        if node_ids:
+            return [self.get_entity(nid) for nid in node_ids if nid]
+
+        # Fall back to a linear scan over node attributes and citations
+        # (covers graphs whose file index has not been built)
+        results = []
+        for _, data in self._graph.nodes(data=True):
+            # Check the file_path attribute
+            if data.get('file_path') == file_path:
+                results.append(dict(data))
+                continue
+
+            # Check citations
+            for citation in data.get('citations', []):
+                if isinstance(citation, dict) and citation.get('file_path') == file_path:
+                    results.append(dict(data))
+                    break
+
+        return results
+
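Both lookups return plain entity dicts keyed the same way as `search()` results, so they compose directly with the rest of the API. A short sketch, assuming a loaded graph; the path and doc_id are illustrative.

```python
# Sketch: list everything the graph attributes to one source file.
for entity in kg.get_entities_by_file("src/services/payment.py"):  # illustrative path
    print(entity.get("type"), entity.get("name"))

# Everything ingested from one source document (doc_id is hypothetical):
doc_entities = kg.get_entities_by_source("doc-123")
```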
+    # ========== Delta Operations ==========
+
+    def remove_entities_by_source(self, doc_id: str) -> int:
+        """
+        Remove all entities from a specific source document.
+        Used for delta updates to clean stale entities.
+
+        Returns:
+            Number of entities removed
+        """
+        node_ids = list(self._source_doc_index.get(doc_id, set()))
+        for node_id in node_ids:
+            self.remove_entity(node_id)
+        return len(node_ids)
+
+    def remove_entities_by_file(self, file_path: str) -> int:
+        """
+        Remove all entities with citations from a specific file.
+        Used for delta updates when a file changes.
+
+        Returns:
+            Number of entities removed
+        """
+        # Collect first so the graph isn't mutated while iterating over it
+        to_remove = []
+        for node_id, data in self._graph.nodes(data=True):
+            citation = data.get('citation', {})
+            if isinstance(citation, dict) and citation.get('file_path') == file_path:
+                to_remove.append(node_id)
+
+        for node_id in to_remove:
+            self.remove_entity(node_id)
+
+        return len(to_remove)
+
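These two methods are the cleanup half of a delta update: when a file changes, stale entities are removed before the file is re-ingested. A sketch of that loop; `parse_file` and `add_entity` are stand-ins for the ingestion path defined elsewhere in this module, not confirmed APIs.

```python
# Hypothetical delta-update loop. parse_file and add_entity are assumed
# stand-ins for the ingestion path; only remove_entities_by_file is confirmed.
def refresh_file(kg: "KnowledgeGraph", file_path: str, parse_file) -> None:
    removed = kg.remove_entities_by_file(file_path)
    added = 0
    for entity in parse_file(file_path):
        kg.add_entity(entity)  # assumed ingestion API
        added += 1
    print(f"{file_path}: -{removed} stale, +{added} fresh entities")
```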
+    # ========== Schema Operations ==========
+
+    def set_schema(self, schema: Dict[str, Any]) -> None:
+        """Store the discovered entity schema."""
+        self._schema = schema
+
+    def get_schema(self) -> Optional[Dict[str, Any]]:
+        """Get the discovered schema."""
+        return self._schema
+
+    # ========== Statistics ==========
+
+    def get_stats(self) -> Dict[str, Any]:
+        """Get graph statistics."""
+        entity_types = defaultdict(int)
+        relation_types = defaultdict(int)
+        sources = set()
+        relations_by_source = defaultdict(int)
+
+        for _, data in self._graph.nodes(data=True):
+            if 'type' in data:
+                entity_types[data['type']] += 1
+            citation = data.get('citation', {})
+            if isinstance(citation, dict) and citation.get('source_toolkit'):
+                sources.add(citation['source_toolkit'])
+
+        for _, _, data in self._graph.edges(data=True):
+            if 'relation_type' in data:
+                relation_types[data['relation_type']] += 1
+            # Track relations by source
+            rel_source = data.get('source_toolkit')
+            if rel_source:
+                relations_by_source[rel_source] += 1
+
+        return {
+            'node_count': self._graph.number_of_nodes(),
+            'edge_count': self._graph.number_of_edges(),
+            'entity_types': dict(entity_types),
+            'relation_types': dict(relation_types),
+            'source_toolkits': sorted(sources),
+            'relations_by_source': dict(relations_by_source),
+            'cross_source_relations': len(self.get_cross_source_relations()),
+            'last_saved': self._metadata.get('last_saved'),
+        }
+
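`get_stats()` aggregates everything in one pass over nodes and one over edges, so it stays cheap even on large graphs. A sketch of the result shape; the keys come from the return statement above, while all values are illustrative.

```python
stats = kg.get_stats()
# Illustrative shape (values invented):
# {'node_count': 1412, 'edge_count': 3890,
#  'entity_types': {'class': 310, 'function': 902},
#  'relation_types': {'calls': 2100, 'imports': 640},
#  'source_toolkits': ['confluence', 'github'],
#  'relations_by_source': {'github': 3700},
#  'cross_source_relations': 12,
#  'last_saved': '2025-01-01T12:00:00'}
print(f"{stats['node_count']} entities, {stats['edge_count']} relations")
```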
+    # ========== Persistence ==========
+
+    def dump_to_json(self, path: str) -> None:
+        """
+        Export graph to JSON file using node_link format.
+
+        The graph file is lightweight - it contains only:
+        - Entity metadata and citations (no raw content)
+        - Relationships
+        - Schema and indices
+
+        Args:
+            path: File path to write JSON to
+        """
+        # Use edges="links" explicitly for NetworkX 3.5+ compatibility.
+        # This ensures the consistent format that visualize.py and load_from_json expect.
+        data = nx.node_link_data(self._graph, edges="links")
+
+        # Add index data for persistence
+        data['_indices'] = {
+            'entity_index': {k: list(v) for k, v in self._entity_index.items()},
+            'type_index': {k: list(v) for k, v in self._type_index.items()},
+            'file_index': {k: list(v) for k, v in self._file_index.items()},
+            'source_doc_index': {k: list(v) for k, v in self._source_doc_index.items()}
+        }
+
+        # Add schema if discovered
+        if self._schema:
+            data['_schema'] = self._schema
+
+        # Add metadata
+        self._metadata['last_saved'] = datetime.now().isoformat()
+        self._metadata['version'] = '2.1'  # Enhanced indices version
+        data['_metadata'] = self._metadata
+
+        with open(path, 'w', encoding='utf-8') as f:
+            json.dump(data, f, indent=2, default=str)
+
+        logger.info(f"Saved graph to {path} ({self._graph.number_of_nodes()} entities, {self._graph.number_of_edges()} relations)")
+
+    def load_from_json(self, path: str) -> None:
+        """
+        Load graph from JSON file.
+
+        Args:
+            path: File path to read JSON from
+
+        Raises:
+            FileNotFoundError: If the file doesn't exist
+        """
+        with open(path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+
+        # Restore indices
+        indices = data.pop('_indices', {})
+
+        # Entity index - convert to set (handles both the old string format and the new list format)
+        self._entity_index = defaultdict(set)
+        for k, v in indices.get('entity_index', {}).items():
+            if isinstance(v, list):
+                self._entity_index[k] = set(v)
+            elif isinstance(v, str):
+                self._entity_index[k] = {v}  # Legacy format
+
+        # Type index
+        self._type_index = defaultdict(set)
+        for k, v in indices.get('type_index', {}).items():
+            self._type_index[k] = set(v) if isinstance(v, list) else set()
+
+        # File index
+        self._file_index = defaultdict(set)
+        for k, v in indices.get('file_index', {}).items():
+            self._file_index[k] = set(v) if isinstance(v, list) else set()
+
+        # Source doc index
+        self._source_doc_index = defaultdict(set)
+        for k, v in indices.get('source_doc_index', {}).items():
+            self._source_doc_index[k] = set(v) if isinstance(v, list) else set()
+
+        # Restore schema
+        self._schema = data.pop('_schema', None)
+
+        # Restore metadata
+        self._metadata = data.pop('_metadata', {})
+
+        # Restore graph - handle both "links" and "edges" keys for compatibility.
+        # NetworkX 3.5+ defaults to "edges", but we write "links" for visualization compatibility.
+        if 'edges' in data and 'links' not in data:
+            # Data uses the NetworkX 3.5+ default "edges" key - rename to "links" for node_link_graph
+            data['links'] = data.pop('edges')
+
+        self._graph = nx.node_link_graph(data, edges="links")
+
+        # Rebuild missing indices if needed (for legacy graphs)
+        if not self._type_index or not self._file_index:
+            self._rebuild_indices()
+
+        logger.info(f"Loaded graph from {path} ({self._graph.number_of_nodes()} entities, {self._graph.number_of_edges()} relations)")
+
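The pair is written to round-trip across NetworkX versions: files are always saved with a "links" key, and the loader renames a newer "edges" key back before calling `node_link_graph`. A round-trip sketch; the snapshot path is illustrative.

```python
import os
import tempfile

# Round-trip sketch: save, then reload into a fresh instance.
path = os.path.join(tempfile.gettempdir(), "kg_snapshot.json")  # illustrative path
kg.dump_to_json(path)

restored = KnowledgeGraph()
restored.load_from_json(path)
assert restored.get_stats()['node_count'] == kg.get_stats()['node_count']
```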
+    def _rebuild_indices(self) -> None:
+        """Rebuild all indices from graph data (for legacy graph files)."""
+        self._entity_index = defaultdict(set)
+        self._type_index = defaultdict(set)
+        self._file_index = defaultdict(set)
+        self._source_doc_index = defaultdict(set)
+
+        for node_id, data in self._graph.nodes(data=True):
+            # Name index
+            name = data.get('name', '').lower()
+            if name:
+                self._entity_index[name].add(node_id)
+
+            # Type index
+            entity_type = data.get('type', '').lower()
+            if entity_type:
+                self._type_index[entity_type].add(node_id)
+
+            # File index (from the file_path attribute)
+            file_path = data.get('file_path', '')
+            if file_path:
+                self._file_index[file_path].add(node_id)
+
+            # Also index from citations
+            for citation in data.get('citations', []):
+                if isinstance(citation, dict):
+                    fp = citation.get('file_path', '')
+                    if fp:
+                        self._file_index[fp].add(node_id)
+                    doc_id = citation.get('doc_id', '')
+                    if doc_id:
+                        self._source_doc_index[doc_id].add(node_id)
+
+        logger.info(f"Rebuilt indices: {len(self._entity_index)} names, {len(self._type_index)} types, {len(self._file_index)} files")
+
+    def clear(self) -> None:
+        """Clear all data from the graph."""
+        self._graph.clear()
+        self._entity_index.clear()
+        self._type_index.clear()
+        self._file_index.clear()
+        self._source_doc_index.clear()
+        self._schema = None
+        self._metadata = {}
+
+    # ========== Subgraph Operations ==========
+
+    def get_subgraph(self, node_ids: List[str]) -> 'KnowledgeGraph':
+        """
+        Get a subgraph containing only the specified nodes and their edges.
+
+        Args:
+            node_ids: List of node IDs to include
+
+        Returns:
+            New KnowledgeGraph instance with the subgraph
+        """
+        subgraph = KnowledgeGraph()
+        subgraph._graph = self._graph.subgraph(node_ids).copy()
+
+        # Rebuild indices for the subgraph
+        for node_id, data in subgraph._graph.nodes(data=True):
+            name = data.get('name', '').lower()
+            if name:
+                subgraph._entity_index[name].add(node_id)
+
+            citation = data.get('citation', {})
+            if isinstance(citation, dict):
+                doc_id = citation.get('doc_id')
+                if doc_id:
+                    subgraph._source_doc_index[doc_id].add(node_id)
+
+        return subgraph
+
+    def get_connected_component(self, node_id: str) -> List[str]:
+        """
+        Get all nodes in the same connected component as the given node.
+
+        Args:
+            node_id: Starting node ID
+
+        Returns:
+            List of node IDs in the connected component
+        """
+        if not self._graph.has_node(node_id):
+            return []
+
+        # For directed graphs, use weakly connected components
+        undirected = self._graph.to_undirected()
+        component = nx.node_connected_component(undirected, node_id)
+        return list(component)
+
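`get_connected_component` and `get_subgraph` compose naturally: find everything reachable from an entity (ignoring edge direction), then materialize it as a standalone graph. A sketch, assuming a known node ID; the ID format is hypothetical.

```python
# Sketch: carve out the cluster around one known entity ID.
entity_id = "class:OrderService"  # hypothetical node ID
component = kg.get_connected_component(entity_id)
cluster = kg.get_subgraph(component)
print(cluster.get_stats()['node_count'], "entities in this cluster")
```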
+    # ========== Citation Helpers ==========
+
+    def get_citation(self, entity_id: str) -> Optional[Citation]:
+        """Get the citation for an entity."""
+        entity = self.get_entity(entity_id)
+        if entity and 'citation' in entity:
+            return Citation.from_dict(entity['citation'])
+        return None
+
+    def get_citations_for_query(self, query: str, top_k: int = 5) -> List[Citation]:
+        """
+        Get citations for entities matching a query.
+
+        Useful for the LLM to retrieve source content on demand.
+
+        Args:
+            query: Search query
+            top_k: Maximum citations to return
+
+        Returns:
+            List of Citation objects
+        """
+        results = self.search(query, top_k=top_k)
+        citations = []
+
+        for result in results:
+            entity = result['entity']
+            if 'citation' in entity:
+                citations.append(Citation.from_dict(entity['citation']))
+
+        return citations
+
+    def export_citations_summary(self) -> Dict[str, List[Dict[str, Any]]]:
+        """
+        Export a summary of all citations grouped by file.
+
+        Returns:
+            Dict mapping file paths to lists of entity summaries
+        """
+        by_file: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
+
+        for node_id, data in self._graph.nodes(data=True):
+            citation = data.get('citation', {})
+            if isinstance(citation, dict) and citation.get('file_path'):
+                by_file[citation['file_path']].append({
+                    'entity_id': node_id,
+                    'name': data.get('name'),
+                    'type': data.get('type'),
+                    'line_start': citation.get('line_start'),
+                    'line_end': citation.get('line_end'),
+                })
+
+        # Sort entities within each file by line number
+        for file_path in by_file:
+            by_file[file_path].sort(key=lambda x: x.get('line_start') or 0)
+
+        return dict(by_file)
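These helpers close the loop between graph search and source retrieval: the graph stores only citations (file path plus line span), and raw content is fetched on demand. A sketch of query-driven lookup; attribute names on `Citation` are assumed to mirror the dict keys used above.

```python
# Sketch: resolve a question to source locations.
for citation in kg.get_citations_for_query("retry policy", top_k=3):
    # Attribute names are assumed to mirror the citation dict keys.
    print(citation.file_path, citation.line_start, citation.line_end)

# Or summarize every cited location, grouped by file:
for file_path, entities in sorted(kg.export_citations_summary().items()):
    print(file_path, "->", len(entities), "entities")
```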