alita-sdk 0.3.379__py3-none-any.whl → 0.3.627__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +156 -0
- alita_sdk/cli/agent_loader.py +245 -0
- alita_sdk/cli/agent_ui.py +228 -0
- alita_sdk/cli/agents.py +3113 -0
- alita_sdk/cli/callbacks.py +647 -0
- alita_sdk/cli/cli.py +168 -0
- alita_sdk/cli/config.py +306 -0
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1073 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/testcases/__init__.py +94 -0
- alita_sdk/cli/testcases/data_generation.py +119 -0
- alita_sdk/cli/testcases/discovery.py +96 -0
- alita_sdk/cli/testcases/executor.py +84 -0
- alita_sdk/cli/testcases/logger.py +85 -0
- alita_sdk/cli/testcases/parser.py +172 -0
- alita_sdk/cli/testcases/prompts.py +91 -0
- alita_sdk/cli/testcases/reporting.py +125 -0
- alita_sdk/cli/testcases/setup.py +108 -0
- alita_sdk/cli/testcases/test_runner.py +282 -0
- alita_sdk/cli/testcases/utils.py +39 -0
- alita_sdk/cli/testcases/validation.py +90 -0
- alita_sdk/cli/testcases/workflow.py +196 -0
- alita_sdk/cli/toolkit.py +327 -0
- alita_sdk/cli/toolkit_loader.py +85 -0
- alita_sdk/cli/tools/__init__.py +43 -0
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +1751 -0
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +72 -12
- alita_sdk/community/inventory/__init__.py +236 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +173 -0
- alita_sdk/community/inventory/toolkit_utils.py +176 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/__init__.py +1 -1
- alita_sdk/configurations/ado.py +141 -20
- alita_sdk/configurations/bitbucket.py +94 -2
- alita_sdk/configurations/confluence.py +130 -1
- alita_sdk/configurations/figma.py +76 -0
- alita_sdk/configurations/gitlab.py +91 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/openapi.py +329 -0
- alita_sdk/configurations/qtest.py +72 -1
- alita_sdk/configurations/report_portal.py +96 -0
- alita_sdk/configurations/sharepoint.py +148 -0
- alita_sdk/configurations/testio.py +83 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +93 -0
- alita_sdk/configurations/zephyr_enterprise.py +93 -0
- alita_sdk/configurations/zephyr_essential.py +75 -0
- alita_sdk/runtime/clients/artifact.py +3 -3
- alita_sdk/runtime/clients/client.py +388 -46
- alita_sdk/runtime/clients/mcp_discovery.py +342 -0
- alita_sdk/runtime/clients/mcp_manager.py +262 -0
- alita_sdk/runtime/clients/sandbox_client.py +8 -21
- alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
- alita_sdk/runtime/langchain/assistant.py +157 -39
- alita_sdk/runtime/langchain/constants.py +647 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -4
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +226 -7
- alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +5 -2
- alita_sdk/runtime/langchain/document_loaders/constants.py +40 -19
- alita_sdk/runtime/langchain/langraph_agent.py +405 -84
- alita_sdk/runtime/langchain/utils.py +106 -7
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/models/mcp_models.py +61 -0
- alita_sdk/runtime/skills/__init__.py +91 -0
- alita_sdk/runtime/skills/callbacks.py +498 -0
- alita_sdk/runtime/skills/discovery.py +540 -0
- alita_sdk/runtime/skills/executor.py +610 -0
- alita_sdk/runtime/skills/input_builder.py +371 -0
- alita_sdk/runtime/skills/models.py +330 -0
- alita_sdk/runtime/skills/registry.py +355 -0
- alita_sdk/runtime/skills/skill_runner.py +330 -0
- alita_sdk/runtime/toolkits/__init__.py +31 -0
- alita_sdk/runtime/toolkits/application.py +29 -10
- alita_sdk/runtime/toolkits/artifact.py +20 -11
- alita_sdk/runtime/toolkits/datasource.py +13 -6
- alita_sdk/runtime/toolkits/mcp.py +783 -0
- alita_sdk/runtime/toolkits/mcp_config.py +1048 -0
- alita_sdk/runtime/toolkits/planning.py +178 -0
- alita_sdk/runtime/toolkits/skill_router.py +238 -0
- alita_sdk/runtime/toolkits/subgraph.py +251 -6
- alita_sdk/runtime/toolkits/tools.py +356 -69
- alita_sdk/runtime/toolkits/vectorstore.py +11 -5
- alita_sdk/runtime/tools/__init__.py +10 -3
- alita_sdk/runtime/tools/application.py +27 -6
- alita_sdk/runtime/tools/artifact.py +511 -28
- alita_sdk/runtime/tools/data_analysis.py +183 -0
- alita_sdk/runtime/tools/function.py +67 -35
- alita_sdk/runtime/tools/graph.py +10 -4
- alita_sdk/runtime/tools/image_generation.py +148 -46
- alita_sdk/runtime/tools/llm.py +1003 -128
- alita_sdk/runtime/tools/loop.py +3 -1
- alita_sdk/runtime/tools/loop_output.py +3 -1
- alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
- alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
- alita_sdk/runtime/tools/mcp_server_tool.py +8 -5
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/router.py +2 -4
- alita_sdk/runtime/tools/sandbox.py +65 -48
- alita_sdk/runtime/tools/skill_router.py +776 -0
- alita_sdk/runtime/tools/tool.py +3 -1
- alita_sdk/runtime/tools/vectorstore.py +9 -3
- alita_sdk/runtime/tools/vectorstore_base.py +70 -14
- alita_sdk/runtime/utils/AlitaCallback.py +137 -21
- alita_sdk/runtime/utils/constants.py +5 -1
- alita_sdk/runtime/utils/mcp_client.py +492 -0
- alita_sdk/runtime/utils/mcp_oauth.py +361 -0
- alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/serialization.py +155 -0
- alita_sdk/runtime/utils/streamlit.py +40 -13
- alita_sdk/runtime/utils/toolkit_utils.py +30 -9
- alita_sdk/runtime/utils/utils.py +36 -0
- alita_sdk/tools/__init__.py +134 -35
- alita_sdk/tools/ado/repos/__init__.py +51 -32
- alita_sdk/tools/ado/repos/repos_wrapper.py +148 -89
- alita_sdk/tools/ado/test_plan/__init__.py +25 -9
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +23 -1
- alita_sdk/tools/ado/utils.py +1 -18
- alita_sdk/tools/ado/wiki/__init__.py +25 -12
- alita_sdk/tools/ado/wiki/ado_wrapper.py +291 -22
- alita_sdk/tools/ado/work_item/__init__.py +26 -13
- alita_sdk/tools/ado/work_item/ado_wrapper.py +73 -11
- alita_sdk/tools/advanced_jira_mining/__init__.py +11 -8
- alita_sdk/tools/aws/delta_lake/__init__.py +13 -9
- alita_sdk/tools/aws/delta_lake/tool.py +5 -1
- alita_sdk/tools/azure_ai/search/__init__.py +11 -8
- alita_sdk/tools/azure_ai/search/api_wrapper.py +1 -1
- alita_sdk/tools/base/tool.py +5 -1
- alita_sdk/tools/base_indexer_toolkit.py +271 -84
- alita_sdk/tools/bitbucket/__init__.py +17 -11
- alita_sdk/tools/bitbucket/api_wrapper.py +59 -11
- alita_sdk/tools/bitbucket/cloud_api_wrapper.py +49 -35
- alita_sdk/tools/browser/__init__.py +5 -4
- alita_sdk/tools/carrier/__init__.py +5 -6
- alita_sdk/tools/carrier/backend_reports_tool.py +6 -6
- alita_sdk/tools/carrier/run_ui_test_tool.py +6 -6
- alita_sdk/tools/carrier/ui_reports_tool.py +5 -5
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/code/treesitter/treesitter.py +37 -13
- alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/cloud/aws/__init__.py +10 -7
- alita_sdk/tools/cloud/azure/__init__.py +10 -7
- alita_sdk/tools/cloud/gcp/__init__.py +10 -7
- alita_sdk/tools/cloud/k8s/__init__.py +10 -7
- alita_sdk/tools/code/linter/__init__.py +10 -8
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code/sonar/__init__.py +11 -8
- alita_sdk/tools/code_indexer_toolkit.py +82 -22
- alita_sdk/tools/confluence/__init__.py +22 -16
- alita_sdk/tools/confluence/api_wrapper.py +107 -30
- alita_sdk/tools/confluence/loader.py +14 -2
- alita_sdk/tools/custom_open_api/__init__.py +12 -5
- alita_sdk/tools/elastic/__init__.py +11 -8
- alita_sdk/tools/elitea_base.py +493 -30
- alita_sdk/tools/figma/__init__.py +58 -11
- alita_sdk/tools/figma/api_wrapper.py +1235 -143
- alita_sdk/tools/figma/figma_client.py +73 -0
- alita_sdk/tools/figma/toon_tools.py +2748 -0
- alita_sdk/tools/github/__init__.py +14 -15
- alita_sdk/tools/github/github_client.py +224 -100
- alita_sdk/tools/github/graphql_client_wrapper.py +119 -33
- alita_sdk/tools/github/schemas.py +14 -5
- alita_sdk/tools/github/tool.py +5 -1
- alita_sdk/tools/github/tool_prompts.py +9 -22
- alita_sdk/tools/gitlab/__init__.py +16 -11
- alita_sdk/tools/gitlab/api_wrapper.py +218 -48
- alita_sdk/tools/gitlab_org/__init__.py +10 -9
- alita_sdk/tools/gitlab_org/api_wrapper.py +63 -64
- alita_sdk/tools/google/bigquery/__init__.py +13 -12
- alita_sdk/tools/google/bigquery/tool.py +5 -1
- alita_sdk/tools/google_places/__init__.py +11 -8
- alita_sdk/tools/google_places/api_wrapper.py +1 -1
- alita_sdk/tools/jira/__init__.py +17 -10
- alita_sdk/tools/jira/api_wrapper.py +92 -41
- alita_sdk/tools/keycloak/__init__.py +11 -8
- alita_sdk/tools/localgit/__init__.py +9 -3
- alita_sdk/tools/localgit/local_git.py +62 -54
- alita_sdk/tools/localgit/tool.py +5 -1
- alita_sdk/tools/memory/__init__.py +12 -4
- alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
- alita_sdk/tools/ocr/__init__.py +11 -8
- alita_sdk/tools/openapi/__init__.py +491 -106
- alita_sdk/tools/openapi/api_wrapper.py +1368 -0
- alita_sdk/tools/openapi/tool.py +20 -0
- alita_sdk/tools/pandas/__init__.py +20 -12
- alita_sdk/tools/pandas/api_wrapper.py +38 -25
- alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
- alita_sdk/tools/postman/__init__.py +10 -9
- alita_sdk/tools/pptx/__init__.py +11 -10
- alita_sdk/tools/pptx/pptx_wrapper.py +1 -1
- alita_sdk/tools/qtest/__init__.py +31 -11
- alita_sdk/tools/qtest/api_wrapper.py +2135 -86
- alita_sdk/tools/rally/__init__.py +10 -9
- alita_sdk/tools/rally/api_wrapper.py +1 -1
- alita_sdk/tools/report_portal/__init__.py +12 -8
- alita_sdk/tools/salesforce/__init__.py +10 -8
- alita_sdk/tools/servicenow/__init__.py +17 -15
- alita_sdk/tools/servicenow/api_wrapper.py +1 -1
- alita_sdk/tools/sharepoint/__init__.py +10 -7
- alita_sdk/tools/sharepoint/api_wrapper.py +129 -38
- alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/slack/__init__.py +10 -7
- alita_sdk/tools/slack/api_wrapper.py +2 -2
- alita_sdk/tools/sql/__init__.py +12 -9
- alita_sdk/tools/testio/__init__.py +10 -7
- alita_sdk/tools/testrail/__init__.py +11 -10
- alita_sdk/tools/testrail/api_wrapper.py +1 -1
- alita_sdk/tools/utils/__init__.py +9 -4
- alita_sdk/tools/utils/content_parser.py +103 -18
- alita_sdk/tools/utils/text_operations.py +410 -0
- alita_sdk/tools/utils/tool_prompts.py +79 -0
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +30 -13
- alita_sdk/tools/xray/__init__.py +13 -9
- alita_sdk/tools/yagmail/__init__.py +9 -3
- alita_sdk/tools/zephyr/__init__.py +10 -7
- alita_sdk/tools/zephyr_enterprise/__init__.py +11 -7
- alita_sdk/tools/zephyr_essential/__init__.py +10 -7
- alita_sdk/tools/zephyr_essential/api_wrapper.py +30 -13
- alita_sdk/tools/zephyr_essential/client.py +2 -2
- alita_sdk/tools/zephyr_scale/__init__.py +11 -8
- alita_sdk/tools/zephyr_scale/api_wrapper.py +2 -2
- alita_sdk/tools/zephyr_squad/__init__.py +10 -7
- {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/METADATA +154 -8
- alita_sdk-0.3.627.dist-info/RECORD +468 -0
- alita_sdk-0.3.627.dist-info/entry_points.txt +2 -0
- alita_sdk-0.3.379.dist-info/RECORD +0 -360
- {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,3172 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Inventory Ingestion Pipeline.
|
|
3
|
+
|
|
4
|
+
This module provides a workflow/pipeline for building and updating knowledge graphs
|
|
5
|
+
from source code repositories. It is NOT a toolkit - it's a defined process that:
|
|
6
|
+
|
|
7
|
+
1. Connects to source toolkits (GitHub, ADO, LocalGit, etc.)
|
|
8
|
+
2. Fetches documents via their loader() methods
|
|
9
|
+
3. Extracts entities using LLM
|
|
10
|
+
4. Extracts relations between entities
|
|
11
|
+
5. Tracks source information for both entities (via citations) and relations
|
|
12
|
+
6. Persists the graph to JSON
|
|
13
|
+
|
|
14
|
+
The result is a graph dump that can be queried by the RetrievalToolkit.
|
|
15
|
+
|
|
16
|
+
Multi-Source Support:
|
|
17
|
+
- Entities from different sources are merged when they have the same (type, name)
|
|
18
|
+
- Each entity maintains citations from all sources that reference it
|
|
19
|
+
- Relations are tagged with source_toolkit to track which source created them
|
|
20
|
+
- Cross-source relations are automatically tracked (e.g., Jira ticket -> GitHub PR)
|
|
21
|
+
- Query relations by source: graph.get_relations_by_source('github')
|
|
22
|
+
- Find cross-source relations: graph.get_cross_source_relations()
|
|
23
|
+
|
|
24
|
+
Usage:
|
|
25
|
+
# With full configuration
|
|
26
|
+
from alita_sdk.community.inventory import IngestionConfig, IngestionPipeline
|
|
27
|
+
|
|
28
|
+
config = IngestionConfig.from_env() # or .from_yaml("config.yml")
|
|
29
|
+
pipeline = IngestionPipeline.from_config(config)
|
|
30
|
+
pipeline.register_toolkit('github', github_toolkit)
|
|
31
|
+
result = pipeline.run(source='github', branch='main')
|
|
32
|
+
|
|
33
|
+
# Or simpler approach
|
|
34
|
+
pipeline = IngestionPipeline(
|
|
35
|
+
llm=llm,
|
|
36
|
+
graph_path="/path/to/graph.json",
|
|
37
|
+
source_toolkits={'github': github_toolkit}
|
|
38
|
+
)
|
|
39
|
+
result = pipeline.run(source='github')
|
|
40
|
+
|
|
41
|
+
# Or delta update for changed files
|
|
42
|
+
result = pipeline.delta_update(
|
|
43
|
+
source='github',
|
|
44
|
+
file_paths=['src/app.py', 'src/utils.py']
|
|
45
|
+
)
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
import logging
|
|
49
|
+
import hashlib
|
|
50
|
+
import re
|
|
51
|
+
import time
|
|
52
|
+
import asyncio
|
|
53
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
54
|
+
from pathlib import Path
|
|
55
|
+
from typing import Any, Optional, List, Dict, Generator, Callable, TYPE_CHECKING, Tuple
|
|
56
|
+
from datetime import datetime
|
|
57
|
+
|
|
58
|
+
from pydantic import BaseModel, Field, PrivateAttr
|
|
59
|
+
from langchain_core.documents import Document
|
|
60
|
+
|
|
61
|
+
from .knowledge_graph import KnowledgeGraph, Citation
|
|
62
|
+
from .extractors import (
|
|
63
|
+
DocumentClassifier,
|
|
64
|
+
EntitySchemaDiscoverer,
|
|
65
|
+
EntityExtractor,
|
|
66
|
+
RelationExtractor,
|
|
67
|
+
FactExtractor,
|
|
68
|
+
ENTITY_TAXONOMY,
|
|
69
|
+
RELATIONSHIP_TAXONOMY,
|
|
70
|
+
)
|
|
71
|
+
from .parsers import (
|
|
72
|
+
parse_file as parser_parse_file,
|
|
73
|
+
get_parser_for_file,
|
|
74
|
+
ParseResult,
|
|
75
|
+
Symbol,
|
|
76
|
+
Relationship as ParserRelationship,
|
|
77
|
+
SymbolType,
|
|
78
|
+
RelationshipType as ParserRelationshipType,
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
if TYPE_CHECKING:
|
|
82
|
+
from .config import GuardrailsConfig, IngestionConfig
|
|
83
|
+
|
|
84
|
+
logger = logging.getLogger(__name__)
|
|
85
|
+
|
|
86
|
+
# ============================================================================
|
|
87
|
+
# PARSER-BASED EXTRACTION (AST/Regex - No LLM)
|
|
88
|
+
# ============================================================================
|
|
89
|
+
|
|
90
|
+
# Symbol types that parsers extract (skip LLM for these)
|
|
91
|
+
PARSER_EXTRACTED_TYPES = {
|
|
92
|
+
SymbolType.CLASS, SymbolType.FUNCTION, SymbolType.METHOD,
|
|
93
|
+
SymbolType.MODULE, SymbolType.INTERFACE, SymbolType.CONSTANT,
|
|
94
|
+
SymbolType.VARIABLE, SymbolType.IMPORT, SymbolType.PROPERTY,
|
|
95
|
+
SymbolType.FIELD, SymbolType.ENUM, SymbolType.TYPE_ALIAS,
|
|
96
|
+
SymbolType.DECORATOR, SymbolType.NAMESPACE, SymbolType.PARAMETER,
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
# Map parser SymbolType to entity type strings
|
|
100
|
+
SYMBOL_TYPE_TO_ENTITY_TYPE = {
|
|
101
|
+
SymbolType.CLASS: "class",
|
|
102
|
+
SymbolType.FUNCTION: "function",
|
|
103
|
+
SymbolType.METHOD: "method",
|
|
104
|
+
SymbolType.MODULE: "module",
|
|
105
|
+
SymbolType.INTERFACE: "interface",
|
|
106
|
+
SymbolType.CONSTANT: "constant",
|
|
107
|
+
SymbolType.VARIABLE: "variable",
|
|
108
|
+
SymbolType.IMPORT: "import",
|
|
109
|
+
SymbolType.PROPERTY: "property",
|
|
110
|
+
SymbolType.FIELD: "field",
|
|
111
|
+
SymbolType.ENUM: "enum",
|
|
112
|
+
SymbolType.TYPE_ALIAS: "type_alias",
|
|
113
|
+
SymbolType.DECORATOR: "decorator",
|
|
114
|
+
SymbolType.NAMESPACE: "namespace",
|
|
115
|
+
SymbolType.PARAMETER: "parameter",
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
# Map parser RelationshipType to relation type strings
|
|
119
|
+
PARSER_REL_TYPE_TO_STRING = {
|
|
120
|
+
ParserRelationshipType.IMPORTS: "imports",
|
|
121
|
+
ParserRelationshipType.EXPORTS: "exports",
|
|
122
|
+
ParserRelationshipType.CALLS: "calls",
|
|
123
|
+
ParserRelationshipType.RETURNS: "returns",
|
|
124
|
+
ParserRelationshipType.INHERITANCE: "extends",
|
|
125
|
+
ParserRelationshipType.IMPLEMENTATION: "implements",
|
|
126
|
+
ParserRelationshipType.COMPOSITION: "contains",
|
|
127
|
+
ParserRelationshipType.AGGREGATION: "uses",
|
|
128
|
+
ParserRelationshipType.DEFINES: "defines",
|
|
129
|
+
ParserRelationshipType.CONTAINS: "contains",
|
|
130
|
+
ParserRelationshipType.DECORATES: "decorates",
|
|
131
|
+
ParserRelationshipType.ANNOTATES: "annotates",
|
|
132
|
+
ParserRelationshipType.REFERENCES: "references",
|
|
133
|
+
ParserRelationshipType.USES: "uses",
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _is_code_file(file_path: str) -> bool:
|
|
138
|
+
"""Check if file is a code file that parsers can handle."""
|
|
139
|
+
code_extensions = {
|
|
140
|
+
'.py', '.pyx', '.pyi', # Python
|
|
141
|
+
'.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs', # JavaScript/TypeScript
|
|
142
|
+
'.java', # Java
|
|
143
|
+
'.kt', '.kts', # Kotlin
|
|
144
|
+
'.cs', # C#
|
|
145
|
+
'.rs', # Rust
|
|
146
|
+
'.swift', # Swift
|
|
147
|
+
'.go', # Go
|
|
148
|
+
}
|
|
149
|
+
ext = Path(file_path).suffix.lower()
|
|
150
|
+
return ext in code_extensions
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def _is_code_like_file(file_path: str) -> bool:
|
|
154
|
+
"""
|
|
155
|
+
Check if file looks like code but may not have a specific parser.
|
|
156
|
+
|
|
157
|
+
This includes:
|
|
158
|
+
- Supported code files (with parsers)
|
|
159
|
+
- Unsupported code files (no parser - use hybrid fallback)
|
|
160
|
+
- Script files that contain code structure
|
|
161
|
+
"""
|
|
162
|
+
# All supported code files
|
|
163
|
+
supported_extensions = {
|
|
164
|
+
'.py', '.pyx', '.pyi', # Python
|
|
165
|
+
'.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs', # JavaScript/TypeScript
|
|
166
|
+
'.java', # Java
|
|
167
|
+
'.kt', '.kts', # Kotlin
|
|
168
|
+
'.cs', # C#
|
|
169
|
+
'.rs', # Rust
|
|
170
|
+
'.swift', # Swift
|
|
171
|
+
'.go', # Go
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
# Additional code-like files that need hybrid fallback
|
|
175
|
+
unsupported_code_extensions = {
|
|
176
|
+
# Scripting languages
|
|
177
|
+
'.lua', '.pl', '.pm', '.perl', '.rb', '.php',
|
|
178
|
+
'.sh', '.bash', '.zsh', '.fish', '.ps1', '.bat', '.cmd',
|
|
179
|
+
# Other programming languages
|
|
180
|
+
'.scala', '.clj', '.cljs', '.ex', '.exs', '.erl', '.hrl',
|
|
181
|
+
'.hs', '.ml', '.fs', '.fsx', '.r', '.R', '.jl',
|
|
182
|
+
'.dart', '.nim', '.v', '.zig', '.cr', '.d',
|
|
183
|
+
'.c', '.cpp', '.cc', '.cxx', '.h', '.hpp', '.hxx',
|
|
184
|
+
'.m', '.mm', # Objective-C
|
|
185
|
+
'.groovy', '.gradle',
|
|
186
|
+
# Data/Config that may contain code
|
|
187
|
+
'.cmake', '.makefile', '.mk',
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
ext = Path(file_path).suffix.lower()
|
|
191
|
+
|
|
192
|
+
# Also check for Makefile without extension
|
|
193
|
+
file_name = Path(file_path).name.lower()
|
|
194
|
+
if file_name in {'makefile', 'gnumakefile'}:
|
|
195
|
+
return True
|
|
196
|
+
|
|
197
|
+
return ext in supported_extensions or ext in unsupported_code_extensions
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _symbol_to_entity(
|
|
201
|
+
symbol: Symbol,
|
|
202
|
+
source_toolkit: str,
|
|
203
|
+
generate_id_func: Callable[[str, str, str], str]
|
|
204
|
+
) -> Dict[str, Any]:
|
|
205
|
+
"""
|
|
206
|
+
Convert a parser Symbol to an entity dict.
|
|
207
|
+
|
|
208
|
+
Args:
|
|
209
|
+
symbol: Parsed symbol from code parser
|
|
210
|
+
source_toolkit: Source toolkit name
|
|
211
|
+
generate_id_func: Function to generate entity ID
|
|
212
|
+
|
|
213
|
+
Returns:
|
|
214
|
+
Entity dictionary compatible with graph
|
|
215
|
+
"""
|
|
216
|
+
entity_type = SYMBOL_TYPE_TO_ENTITY_TYPE.get(symbol.symbol_type, "unknown")
|
|
217
|
+
|
|
218
|
+
# Generate entity ID
|
|
219
|
+
entity_id = generate_id_func(entity_type, symbol.name, symbol.file_path)
|
|
220
|
+
|
|
221
|
+
# Build properties from symbol metadata
|
|
222
|
+
properties = {
|
|
223
|
+
'description': symbol.docstring or '',
|
|
224
|
+
'parent_symbol': symbol.parent_symbol,
|
|
225
|
+
'full_name': symbol.full_name or symbol.get_qualified_name(),
|
|
226
|
+
'visibility': symbol.visibility,
|
|
227
|
+
'is_static': symbol.is_static,
|
|
228
|
+
'is_async': symbol.is_async,
|
|
229
|
+
'is_exported': symbol.is_exported,
|
|
230
|
+
'signature': symbol.signature,
|
|
231
|
+
'return_type': symbol.return_type,
|
|
232
|
+
}
|
|
233
|
+
# Add any extra metadata
|
|
234
|
+
properties.update(symbol.metadata)
|
|
235
|
+
# Remove None values
|
|
236
|
+
properties = {k: v for k, v in properties.items() if v is not None}
|
|
237
|
+
|
|
238
|
+
# Create citation with line range
|
|
239
|
+
line_start = symbol.range.start.line if symbol.range else 1
|
|
240
|
+
line_end = symbol.range.end.line if symbol.range else line_start
|
|
241
|
+
|
|
242
|
+
citation = Citation(
|
|
243
|
+
file_path=symbol.file_path,
|
|
244
|
+
line_start=line_start,
|
|
245
|
+
line_end=line_end,
|
|
246
|
+
source_toolkit=source_toolkit,
|
|
247
|
+
doc_id=f"{source_toolkit}://{symbol.file_path}",
|
|
248
|
+
)
|
|
249
|
+
|
|
250
|
+
return {
|
|
251
|
+
'id': entity_id,
|
|
252
|
+
'name': symbol.name,
|
|
253
|
+
'type': entity_type,
|
|
254
|
+
'citation': citation,
|
|
255
|
+
'properties': properties,
|
|
256
|
+
'source': 'parser', # Mark as parser-extracted
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _parser_relationship_to_dict(
|
|
261
|
+
rel: ParserRelationship,
|
|
262
|
+
source_toolkit: str,
|
|
263
|
+
) -> Dict[str, Any]:
|
|
264
|
+
"""
|
|
265
|
+
Convert a parser Relationship to a relation dict.
|
|
266
|
+
|
|
267
|
+
Args:
|
|
268
|
+
rel: Parsed relationship from code parser
|
|
269
|
+
source_toolkit: Source toolkit name
|
|
270
|
+
|
|
271
|
+
Returns:
|
|
272
|
+
Relation dictionary compatible with graph
|
|
273
|
+
"""
|
|
274
|
+
rel_type = PARSER_REL_TYPE_TO_STRING.get(rel.relationship_type, "references")
|
|
275
|
+
|
|
276
|
+
return {
|
|
277
|
+
'source_symbol': rel.source_symbol,
|
|
278
|
+
'target_symbol': rel.target_symbol,
|
|
279
|
+
'relation_type': rel_type,
|
|
280
|
+
'source_file': rel.source_file,
|
|
281
|
+
'target_file': rel.target_file,
|
|
282
|
+
'confidence': rel.confidence,
|
|
283
|
+
'is_cross_file': rel.is_cross_file,
|
|
284
|
+
'source': 'parser', # Mark as parser-extracted
|
|
285
|
+
'source_toolkit': source_toolkit,
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
# ============================================================================
|
|
289
|
+
# ENTITY TYPE NORMALIZATION
|
|
290
|
+
# ============================================================================
|
|
291
|
+
|
|
292
|
+
# Types that should never be deduplicated (context-dependent)
|
|
293
|
+
CONTEXT_DEPENDENT_TYPES = {
|
|
294
|
+
"tool", "property", "properties", "parameter", "argument",
|
|
295
|
+
"field", "column", "attribute", "option", "setting",
|
|
296
|
+
"step", "test_step", "ui_field", "endpoint", "method",
|
|
297
|
+
"mcp_tool", "mcp_resource",
|
|
298
|
+
# File-level nodes are unique per file path
|
|
299
|
+
"file", "source_file", "document_file", "config_file", "web_file",
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
# Build canonical type set from ENTITY_TAXONOMY
|
|
303
|
+
_CANONICAL_TYPES = set()
|
|
304
|
+
for layer_data in ENTITY_TAXONOMY.values():
|
|
305
|
+
for type_def in layer_data["types"]:
|
|
306
|
+
_CANONICAL_TYPES.add(type_def["name"].lower())
|
|
307
|
+
|
|
308
|
+
# Map common variations to canonical forms
|
|
309
|
+
TYPE_NORMALIZATION_MAP = {
|
|
310
|
+
# Tool/Toolkit variations
|
|
311
|
+
"tools": "tool",
|
|
312
|
+
"Tool": "tool",
|
|
313
|
+
"Tools": "tool",
|
|
314
|
+
"Toolkit": "toolkit",
|
|
315
|
+
"toolkits": "toolkit",
|
|
316
|
+
# MCP variations
|
|
317
|
+
"MCP Server": "mcp_server",
|
|
318
|
+
"MCP Tool": "mcp_tool",
|
|
319
|
+
"MCP Resource": "mcp_resource",
|
|
320
|
+
# Common variations
|
|
321
|
+
"Feature": "feature",
|
|
322
|
+
"Features": "feature",
|
|
323
|
+
"API": "api",
|
|
324
|
+
"APIs": "api",
|
|
325
|
+
"Service": "service",
|
|
326
|
+
"Services": "service",
|
|
327
|
+
"Endpoint": "endpoint",
|
|
328
|
+
"Endpoints": "endpoint",
|
|
329
|
+
"Configuration": "configuration",
|
|
330
|
+
"Config": "configuration",
|
|
331
|
+
"Test Case": "test_case",
|
|
332
|
+
"Test Cases": "test_case",
|
|
333
|
+
"test case": "test_case",
|
|
334
|
+
"User Story": "user_story",
|
|
335
|
+
"User Stories": "user_story",
|
|
336
|
+
"user story": "user_story",
|
|
337
|
+
"Business Rule": "business_rule",
|
|
338
|
+
"business rule": "business_rule",
|
|
339
|
+
"UI Component": "ui_component",
|
|
340
|
+
"ui component": "ui_component",
|
|
341
|
+
"UI Field": "ui_field",
|
|
342
|
+
"ui field": "ui_field",
|
|
343
|
+
"Test Suite": "test_suite",
|
|
344
|
+
"test suite": "test_suite",
|
|
345
|
+
"Test Step": "test_step",
|
|
346
|
+
"test step": "test_step",
|
|
347
|
+
"Glossary Term": "glossary_term",
|
|
348
|
+
"glossary term": "glossary_term",
|
|
349
|
+
"Domain Entity": "domain_entity",
|
|
350
|
+
"domain entity": "domain_entity",
|
|
351
|
+
"Pull Request": "pull_request",
|
|
352
|
+
"pull request": "pull_request",
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
def normalize_entity_type(entity_type: str) -> str:
|
|
356
|
+
"""
|
|
357
|
+
Normalize entity type to canonical lowercase form.
|
|
358
|
+
|
|
359
|
+
Args:
|
|
360
|
+
entity_type: Raw entity type from LLM extraction
|
|
361
|
+
|
|
362
|
+
Returns:
|
|
363
|
+
Canonical lowercase entity type
|
|
364
|
+
"""
|
|
365
|
+
if not entity_type:
|
|
366
|
+
return "unknown"
|
|
367
|
+
|
|
368
|
+
# Check explicit mapping first
|
|
369
|
+
if entity_type in TYPE_NORMALIZATION_MAP:
|
|
370
|
+
return TYPE_NORMALIZATION_MAP[entity_type]
|
|
371
|
+
|
|
372
|
+
# Normalize: lowercase, replace spaces with underscores
|
|
373
|
+
normalized = entity_type.lower().strip().replace(" ", "_").replace("-", "_")
|
|
374
|
+
|
|
375
|
+
# If it's already canonical, return it
|
|
376
|
+
if normalized in _CANONICAL_TYPES:
|
|
377
|
+
return normalized
|
|
378
|
+
|
|
379
|
+
# Handle plural forms by removing trailing 's' (but not 'ss' like 'class')
|
|
380
|
+
if normalized.endswith('s') and not normalized.endswith('ss') and len(normalized) > 3:
|
|
381
|
+
singular = normalized[:-1]
|
|
382
|
+
if singular in _CANONICAL_TYPES:
|
|
383
|
+
return singular
|
|
384
|
+
|
|
385
|
+
# Return the normalized form even if not in taxonomy
|
|
386
|
+
# (allows for custom types while maintaining consistency)
|
|
387
|
+
return normalized
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
class IngestionResult(BaseModel):
|
|
391
|
+
"""Result of an ingestion run."""
|
|
392
|
+
success: bool = True
|
|
393
|
+
source: str = "unknown"
|
|
394
|
+
documents_processed: int = 0
|
|
395
|
+
documents_skipped: int = 0
|
|
396
|
+
entities_added: int = 0
|
|
397
|
+
entities_removed: int = 0
|
|
398
|
+
relations_added: int = 0
|
|
399
|
+
duration_seconds: float = 0.0
|
|
400
|
+
errors: List[str] = Field(default_factory=list)
|
|
401
|
+
failed_documents: List[str] = Field(default_factory=list)
|
|
402
|
+
graph_stats: Dict[str, Any] = Field(default_factory=dict)
|
|
403
|
+
resumed_from_checkpoint: bool = False
|
|
404
|
+
|
|
405
|
+
def __str__(self) -> str:
|
|
406
|
+
status = "✅ Success" if self.success else "❌ Failed"
|
|
407
|
+
resumed = " (resumed)" if self.resumed_from_checkpoint else ""
|
|
408
|
+
skipped_info = f"\n Documents skipped: {self.documents_skipped}" if self.documents_skipped else ""
|
|
409
|
+
failed_info = f"\n Failed documents: {len(self.failed_documents)}" if self.failed_documents else ""
|
|
410
|
+
return (
|
|
411
|
+
f"{status}: Ingestion from {self.source}{resumed}\n"
|
|
412
|
+
f" Documents processed: {self.documents_processed}{skipped_info}{failed_info}\n"
|
|
413
|
+
f" Entities added: {self.entities_added}\n"
|
|
414
|
+
f" Relations added: {self.relations_added}\n"
|
|
415
|
+
f" Duration: {self.duration_seconds:.1f}s\n"
|
|
416
|
+
f" Graph: {self.graph_stats.get('node_count', 0)} entities, "
|
|
417
|
+
f"{self.graph_stats.get('edge_count', 0)} relations"
|
|
418
|
+
)
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
class IngestionCheckpoint(BaseModel):
|
|
422
|
+
"""
|
|
423
|
+
Checkpoint for resumable ingestion.
|
|
424
|
+
|
|
425
|
+
Saved periodically during ingestion to allow recovery from failures.
|
|
426
|
+
"""
|
|
427
|
+
# Run identification
|
|
428
|
+
run_id: str = Field(description="Unique identifier for this ingestion run")
|
|
429
|
+
source: str = Field(description="Source toolkit name")
|
|
430
|
+
started_at: str = Field(description="ISO timestamp when ingestion started")
|
|
431
|
+
updated_at: str = Field(description="ISO timestamp of last checkpoint update")
|
|
432
|
+
|
|
433
|
+
# Configuration
|
|
434
|
+
branch: Optional[str] = None
|
|
435
|
+
whitelist: Optional[List[str]] = None
|
|
436
|
+
blacklist: Optional[List[str]] = None
|
|
437
|
+
extract_relations: bool = True
|
|
438
|
+
|
|
439
|
+
# Progress tracking
|
|
440
|
+
phase: str = Field(default="fetch", description="Current phase: fetch, extract, relations, complete")
|
|
441
|
+
documents_processed: int = 0
|
|
442
|
+
entities_added: int = 0
|
|
443
|
+
relations_added: int = 0
|
|
444
|
+
|
|
445
|
+
# Processed document tracking with content hashes for incremental updates
|
|
446
|
+
# Maps file_path -> content_hash (allows detecting changed files)
|
|
447
|
+
processed_files: List[str] = Field(default_factory=list) # Legacy: just paths
|
|
448
|
+
file_hashes: Dict[str, str] = Field(default_factory=dict) # New: path -> content_hash
|
|
449
|
+
|
|
450
|
+
# Failed document tracking for retry
|
|
451
|
+
failed_files: List[Dict[str, Any]] = Field(default_factory=list) # [{file_path, error, attempts}]
|
|
452
|
+
|
|
453
|
+
# Collected entities for relation extraction (stored if phase changes)
|
|
454
|
+
pending_entities: List[Dict[str, Any]] = Field(default_factory=list)
|
|
455
|
+
|
|
456
|
+
# Status
|
|
457
|
+
completed: bool = False
|
|
458
|
+
errors: List[str] = Field(default_factory=list)
|
|
459
|
+
|
|
460
|
+
@classmethod
|
|
461
|
+
def create(cls, source: str, branch: Optional[str] = None,
|
|
462
|
+
whitelist: Optional[List[str]] = None,
|
|
463
|
+
blacklist: Optional[List[str]] = None,
|
|
464
|
+
extract_relations: bool = True) -> 'IngestionCheckpoint':
|
|
465
|
+
"""Create a new checkpoint for a fresh ingestion run."""
|
|
466
|
+
import uuid
|
|
467
|
+
now = datetime.utcnow().isoformat()
|
|
468
|
+
return cls(
|
|
469
|
+
run_id=str(uuid.uuid4())[:8],
|
|
470
|
+
source=source,
|
|
471
|
+
started_at=now,
|
|
472
|
+
updated_at=now,
|
|
473
|
+
branch=branch,
|
|
474
|
+
whitelist=whitelist,
|
|
475
|
+
blacklist=blacklist,
|
|
476
|
+
extract_relations=extract_relations,
|
|
477
|
+
)
|
|
478
|
+
|
|
479
|
+
def save(self, checkpoint_path: str) -> None:
|
|
480
|
+
"""Save checkpoint to disk."""
|
|
481
|
+
import json
|
|
482
|
+
self.updated_at = datetime.utcnow().isoformat()
|
|
483
|
+
path = Path(checkpoint_path)
|
|
484
|
+
path.parent.mkdir(parents=True, exist_ok=True)
|
|
485
|
+
|
|
486
|
+
# Write to temp file first, then rename for atomicity
|
|
487
|
+
temp_path = path.with_suffix('.tmp')
|
|
488
|
+
with open(temp_path, 'w') as f:
|
|
489
|
+
json.dump(self.model_dump(), f, indent=2, default=str)
|
|
490
|
+
temp_path.rename(path)
|
|
491
|
+
|
|
492
|
+
logger.debug(f"Checkpoint saved: {self.documents_processed} docs, {self.entities_added} entities")
|
|
493
|
+
|
|
494
|
+
@classmethod
|
|
495
|
+
def load(cls, checkpoint_path: str) -> Optional['IngestionCheckpoint']:
|
|
496
|
+
"""Load checkpoint from disk. Returns None if not found."""
|
|
497
|
+
import json
|
|
498
|
+
path = Path(checkpoint_path)
|
|
499
|
+
if not path.exists():
|
|
500
|
+
return None
|
|
501
|
+
|
|
502
|
+
try:
|
|
503
|
+
with open(path) as f:
|
|
504
|
+
data = json.load(f)
|
|
505
|
+
return cls(**data)
|
|
506
|
+
except Exception as e:
|
|
507
|
+
logger.warning(f"Failed to load checkpoint: {e}")
|
|
508
|
+
return None
|
|
509
|
+
|
|
510
|
+
def mark_file_processed(self, file_path: str, content_hash: Optional[str] = None) -> None:
|
|
511
|
+
"""Mark a file as successfully processed with optional content hash."""
|
|
512
|
+
if file_path not in self.processed_files:
|
|
513
|
+
self.processed_files.append(file_path)
|
|
514
|
+
if content_hash:
|
|
515
|
+
self.file_hashes[file_path] = content_hash
|
|
516
|
+
|
|
517
|
+
def mark_file_failed(self, file_path: str, error: str) -> None:
|
|
518
|
+
"""Mark a file as failed with error details."""
|
|
519
|
+
# Check if already in failed list
|
|
520
|
+
for failed in self.failed_files:
|
|
521
|
+
if failed['file_path'] == file_path:
|
|
522
|
+
failed['attempts'] = failed.get('attempts', 1) + 1
|
|
523
|
+
failed['last_error'] = error
|
|
524
|
+
return
|
|
525
|
+
|
|
526
|
+
self.failed_files.append({
|
|
527
|
+
'file_path': file_path,
|
|
528
|
+
'error': error,
|
|
529
|
+
'attempts': 1
|
|
530
|
+
})
|
|
531
|
+
|
|
532
|
+
def is_file_processed(self, file_path: str) -> bool:
|
|
533
|
+
"""Check if a file has already been processed."""
|
|
534
|
+
return file_path in self.processed_files
|
|
535
|
+
|
|
536
|
+
def has_file_changed(self, file_path: str, content_hash: str) -> bool:
|
|
537
|
+
"""
|
|
538
|
+
Check if a file has changed since last processing.
|
|
539
|
+
|
|
540
|
+
Returns True if:
|
|
541
|
+
- File was never processed before
|
|
542
|
+
- File was processed but we don't have its hash (legacy)
|
|
543
|
+
- File content hash differs from stored hash
|
|
544
|
+
"""
|
|
545
|
+
if file_path not in self.file_hashes:
|
|
546
|
+
return True # Never seen or no hash stored
|
|
547
|
+
return self.file_hashes.get(file_path) != content_hash
|
|
548
|
+
|
|
549
|
+
def get_file_hash(self, file_path: str) -> Optional[str]:
|
|
550
|
+
"""Get stored content hash for a file."""
|
|
551
|
+
return self.file_hashes.get(file_path)
|
|
552
|
+
|
|
553
|
+
def get_retry_files(self, max_attempts: int = 3) -> List[str]:
|
|
554
|
+
"""Get files that should be retried (under max attempts)."""
|
|
555
|
+
return [
|
|
556
|
+
f['file_path'] for f in self.failed_files
|
|
557
|
+
if f.get('attempts', 1) < max_attempts
|
|
558
|
+
]
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
class IngestionPipeline(BaseModel):
|
|
562
|
+
"""
|
|
563
|
+
Pipeline for ingesting source code into a knowledge graph.
|
|
564
|
+
|
|
565
|
+
This is a workflow, not a toolkit. It processes sources and produces
|
|
566
|
+
a graph dump that can be queried by the RetrievalToolkit.
|
|
567
|
+
|
|
568
|
+
The pipeline:
|
|
569
|
+
1. Connects to source toolkits (GitHub, ADO, LocalGit, etc.)
|
|
570
|
+
2. Fetches documents via their loader() methods
|
|
571
|
+
3. Uses LLM to extract entities based on ENTITY_TAXONOMY
|
|
572
|
+
4. Uses LLM to extract relations based on RELATIONSHIP_TAXONOMY
|
|
573
|
+
5. Persists graph to JSON (auto-save after mutations)
|
|
574
|
+
|
|
575
|
+
Configuration can be provided directly or via IngestionConfig:
|
|
576
|
+
|
|
577
|
+
# Direct configuration
|
|
578
|
+
pipeline = IngestionPipeline(
|
|
579
|
+
llm=llm,
|
|
580
|
+
graph_path="./graph.json",
|
|
581
|
+
guardrails=GuardrailsConfig(max_tokens_per_doc=4000),
|
|
582
|
+
)
|
|
583
|
+
|
|
584
|
+
# From config file
|
|
585
|
+
config = IngestionConfig.from_yaml("config.yml")
|
|
586
|
+
pipeline = IngestionPipeline.from_config(config)
|
|
587
|
+
"""
|
|
588
|
+
|
|
589
|
+
# Core dependencies
|
|
590
|
+
llm: Any = None
|
|
591
|
+
alita: Any = None
|
|
592
|
+
|
|
593
|
+
# Graph persistence path
|
|
594
|
+
graph_path: str = Field(description="Path to persist the knowledge graph JSON")
|
|
595
|
+
|
|
596
|
+
# Source toolkits (injected by runtime)
|
|
597
|
+
# Maps toolkit name -> toolkit instance (e.g., {'github': GitHubApiWrapper})
|
|
598
|
+
source_toolkits: Dict[str, Any] = Field(default_factory=dict)
|
|
599
|
+
|
|
600
|
+
# Optional embedding for semantic search
|
|
601
|
+
embedding: Optional[Any] = Field(default=None, description="Embedding model instance")
|
|
602
|
+
embedding_model: Optional[str] = Field(default=None, description="Embedding model name (for Alita)")
|
|
603
|
+
|
|
604
|
+
# Guardrails configuration
|
|
605
|
+
guardrails: Optional[Any] = Field(
|
|
606
|
+
default=None,
|
|
607
|
+
description="GuardrailsConfig for rate limiting, content filtering, etc."
|
|
608
|
+
)
|
|
609
|
+
|
|
610
|
+
# Checkpoint configuration for resumable ingestion
|
|
611
|
+
checkpoint_dir: Optional[str] = Field(
|
|
612
|
+
default=None,
|
|
613
|
+
description="Directory to store checkpoints. If None, uses graph_path directory."
|
|
614
|
+
)
|
|
615
|
+
checkpoint_interval: int = Field(
|
|
616
|
+
default=10,
|
|
617
|
+
description="Save checkpoint every N documents processed"
|
|
618
|
+
)
|
|
619
|
+
|
|
620
|
+
# Parallel processing configuration
|
|
621
|
+
max_parallel_extractions: int = Field(
|
|
622
|
+
default=10,
|
|
623
|
+
description="Maximum number of parallel entity extraction requests (default: 10)"
|
|
624
|
+
)
|
|
625
|
+
batch_size: int = Field(
|
|
626
|
+
default=10,
|
|
627
|
+
description="Number of documents to process in each parallel batch (default: 10)"
|
|
628
|
+
)
|
|
629
|
+
|
|
630
|
+
# Skip trivial files configuration
|
|
631
|
+
min_file_lines: int = Field(
|
|
632
|
+
default=20,
|
|
633
|
+
description="Minimum number of lines for LLM extraction (smaller files only use parser)"
|
|
634
|
+
)
|
|
635
|
+
min_file_chars: int = Field(
|
|
636
|
+
default=300,
|
|
637
|
+
description="Minimum number of characters for LLM extraction (smaller files only use parser)"
|
|
638
|
+
)
|
|
639
|
+
|
|
640
|
+
# Progress callback (optional)
|
|
641
|
+
# Signature: callback(message: str, phase: str) -> None
|
|
642
|
+
progress_callback: Optional[Callable[[str, str], None]] = None
|
|
643
|
+
|
|
644
|
+
# Private attributes
|
|
645
|
+
_embedding: Optional[Any] = PrivateAttr(default=None)
|
|
646
|
+
_knowledge_graph: Optional[KnowledgeGraph] = PrivateAttr(default=None)
|
|
647
|
+
_document_classifier: Optional[DocumentClassifier] = PrivateAttr(default=None)
|
|
648
|
+
_schema_discoverer: Optional[EntitySchemaDiscoverer] = PrivateAttr(default=None)
|
|
649
|
+
_entity_extractor: Optional[EntityExtractor] = PrivateAttr(default=None)
|
|
650
|
+
_relation_extractor: Optional[RelationExtractor] = PrivateAttr(default=None)
|
|
651
|
+
_initialized: bool = PrivateAttr(default=False)
|
|
652
|
+
_last_request_time: float = PrivateAttr(default=0.0)
|
|
653
|
+
_request_count: int = PrivateAttr(default=0)
|
|
654
|
+
_current_checkpoint: Optional[IngestionCheckpoint] = PrivateAttr(default=None)
|
|
655
|
+
|
|
656
|
+
class Config:
|
|
657
|
+
arbitrary_types_allowed = True
|
|
658
|
+
|
|
659
|
+
def model_post_init(self, __context) -> None:
|
|
660
|
+
"""Initialize after model construction."""
|
|
661
|
+
# Initialize knowledge graph
|
|
662
|
+
self._knowledge_graph = KnowledgeGraph()
|
|
663
|
+
|
|
664
|
+
# Handle model_construct case where graph_path may not be set
|
|
665
|
+
graph_path = getattr(self, 'graph_path', None)
|
|
666
|
+
if graph_path:
|
|
667
|
+
try:
|
|
668
|
+
path = Path(graph_path)
|
|
669
|
+
if path.exists():
|
|
670
|
+
self._knowledge_graph.load_from_json(graph_path)
|
|
671
|
+
stats = self._knowledge_graph.get_stats()
|
|
672
|
+
logger.info(f"Loaded existing graph: {stats['node_count']} entities, {stats['edge_count']} relations")
|
|
673
|
+
except Exception as e:
|
|
674
|
+
logger.warning(f"Could not load existing graph: {e}")
|
|
675
|
+
|
|
676
|
+
self._init_extractors()
|
|
677
|
+
|
|
678
|
+
def _init_extractors(self) -> bool:
|
|
679
|
+
"""Initialize LLM-based extractors."""
|
|
680
|
+
if self._initialized:
|
|
681
|
+
return True
|
|
682
|
+
|
|
683
|
+
if not self.llm:
|
|
684
|
+
logger.warning("LLM not configured - extraction will fail")
|
|
685
|
+
return False
|
|
686
|
+
|
|
687
|
+
# Initialize embedding if configured (either directly or via Alita)
|
|
688
|
+
if self.embedding:
|
|
689
|
+
self._embedding = self.embedding
|
|
690
|
+
elif self.alita and self.embedding_model:
|
|
691
|
+
try:
|
|
692
|
+
self._embedding = self.alita.get_embeddings(self.embedding_model)
|
|
693
|
+
except Exception as e:
|
|
694
|
+
logger.warning(f"Could not initialize embeddings: {e}")
|
|
695
|
+
|
|
696
|
+
# Initialize extractors
|
|
697
|
+
self._document_classifier = DocumentClassifier(llm=self.llm)
|
|
698
|
+
self._schema_discoverer = EntitySchemaDiscoverer(llm=self.llm)
|
|
699
|
+
self._entity_extractor = EntityExtractor(llm=self.llm, embedding=self._embedding)
|
|
700
|
+
self._relation_extractor = RelationExtractor(llm=self.llm)
|
|
701
|
+
self._initialized = True
|
|
702
|
+
|
|
703
|
+
logger.info("Ingestion extractors initialized")
|
|
704
|
+
return True
|
|
705
|
+
|
|
706
|
+
def _apply_rate_limit(self) -> None:
|
|
707
|
+
"""Apply rate limiting if configured in guardrails."""
|
|
708
|
+
if not self.guardrails:
|
|
709
|
+
return
|
|
710
|
+
|
|
711
|
+
rpm = getattr(self.guardrails, 'rate_limit_requests_per_minute', None)
|
|
712
|
+
if not rpm:
|
|
713
|
+
return
|
|
714
|
+
|
|
715
|
+
# Calculate minimum interval between requests
|
|
716
|
+
min_interval = 60.0 / rpm
|
|
717
|
+
elapsed = time.time() - self._last_request_time
|
|
718
|
+
|
|
719
|
+
if elapsed < min_interval:
|
|
720
|
+
sleep_time = min_interval - elapsed
|
|
721
|
+
logger.debug(f"Rate limiting: sleeping {sleep_time:.2f}s")
|
|
722
|
+
time.sleep(sleep_time)
|
|
723
|
+
|
|
724
|
+
self._last_request_time = time.time()
|
|
725
|
+
self._request_count += 1
|
|
726
|
+
|
|
727
|
+
def _filter_content(self, content: str) -> str:
|
|
728
|
+
"""Apply content filtering based on guardrails."""
|
|
729
|
+
if not self.guardrails:
|
|
730
|
+
return content
|
|
731
|
+
|
|
732
|
+
if not getattr(self.guardrails, 'content_filter_enabled', False):
|
|
733
|
+
return content
|
|
734
|
+
|
|
735
|
+
filtered = content
|
|
736
|
+
patterns = getattr(self.guardrails, 'filter_patterns', [])
|
|
737
|
+
|
|
738
|
+
for pattern in patterns:
|
|
739
|
+
try:
|
|
740
|
+
filtered = re.sub(pattern, '[FILTERED]', filtered, flags=re.IGNORECASE)
|
|
741
|
+
except re.error as e:
|
|
742
|
+
logger.warning(f"Invalid filter pattern '{pattern}': {e}")
|
|
743
|
+
|
|
744
|
+
if filtered != content:
|
|
745
|
+
logger.debug("Content filtered for PII/secrets")
|
|
746
|
+
|
|
747
|
+
return filtered
|
|
748
|
+
|
|
749
|
+
def _get_max_entities(self) -> int:
|
|
750
|
+
"""Get max entities per doc from guardrails."""
|
|
751
|
+
if self.guardrails:
|
|
752
|
+
return getattr(self.guardrails, 'max_entities_per_doc', 50)
|
|
753
|
+
return 50
|
|
754
|
+
|
|
755
|
+
def _get_max_relations(self) -> int:
|
|
756
|
+
"""Get max relations per doc from guardrails."""
|
|
757
|
+
if self.guardrails:
|
|
758
|
+
return getattr(self.guardrails, 'max_relations_per_doc', 100)
|
|
759
|
+
return 100
|
|
760
|
+
|
|
761
|
+
def _get_confidence_threshold(self, for_relations: bool = False) -> float:
|
|
762
|
+
"""Get confidence threshold from guardrails."""
|
|
763
|
+
if not self.guardrails:
|
|
764
|
+
return 0.5
|
|
765
|
+
|
|
766
|
+
if for_relations:
|
|
767
|
+
return getattr(self.guardrails, 'relation_confidence_threshold', 0.5)
|
|
768
|
+
return getattr(self.guardrails, 'entity_confidence_threshold', 0.5)
|
|
769
|
+
|
|
770
|
+
def _log_progress(self, message: str, phase: str = "ingestion") -> None:
|
|
771
|
+
"""Log progress and call callback if set."""
|
|
772
|
+
logger.info(f"[{phase}] {message}")
|
|
773
|
+
if self.progress_callback:
|
|
774
|
+
try:
|
|
775
|
+
self.progress_callback(message, phase)
|
|
776
|
+
except Exception as e:
|
|
777
|
+
logger.debug(f"Progress callback failed: {e}")
|
|
778
|
+
|
|
779
|
+
def _auto_save(self) -> None:
|
|
780
|
+
"""Auto-save graph after mutations."""
|
|
781
|
+
if self.graph_path:
|
|
782
|
+
try:
|
|
783
|
+
self._knowledge_graph.dump_to_json(self.graph_path)
|
|
784
|
+
logger.debug(f"Auto-saved graph to {self.graph_path}")
|
|
785
|
+
except Exception as e:
|
|
786
|
+
logger.warning(f"Failed to auto-save: {e}")
|
|
787
|
+
|
|
788
|
+
def _get_checkpoint_path(self, source: str) -> str:
|
|
789
|
+
"""Get checkpoint file path for a source."""
|
|
790
|
+
if self.checkpoint_dir:
|
|
791
|
+
base_dir = Path(self.checkpoint_dir)
|
|
792
|
+
else:
|
|
793
|
+
base_dir = Path(self.graph_path).parent
|
|
794
|
+
|
|
795
|
+
return str(base_dir / f".ingestion-checkpoint-{source}.json")
|
|
796
|
+
|
|
797
|
+
def _save_checkpoint(self, checkpoint: IngestionCheckpoint) -> None:
|
|
798
|
+
"""Save checkpoint to disk."""
|
|
799
|
+
try:
|
|
800
|
+
checkpoint_path = self._get_checkpoint_path(checkpoint.source)
|
|
801
|
+
checkpoint.save(checkpoint_path)
|
|
802
|
+
except Exception as e:
|
|
803
|
+
logger.warning(f"Failed to save checkpoint: {e}")
|
|
804
|
+
|
|
805
|
+
def _load_checkpoint(self, source: str) -> Optional[IngestionCheckpoint]:
|
|
806
|
+
"""Load checkpoint from disk if exists."""
|
|
807
|
+
checkpoint_path = self._get_checkpoint_path(source)
|
|
808
|
+
return IngestionCheckpoint.load(checkpoint_path)
|
|
809
|
+
|
|
810
|
+
def _clear_checkpoint(self, source: str) -> None:
|
|
811
|
+
"""Clear checkpoint file after successful completion."""
|
|
812
|
+
try:
|
|
813
|
+
checkpoint_path = Path(self._get_checkpoint_path(source))
|
|
814
|
+
if checkpoint_path.exists():
|
|
815
|
+
checkpoint_path.unlink()
|
|
816
|
+
logger.debug(f"Cleared checkpoint for {source}")
|
|
817
|
+
except Exception as e:
|
|
818
|
+
logger.warning(f"Failed to clear checkpoint: {e}")
|
|
819
|
+
|
|
820
|
+
def clear_checkpoint(self, source: str) -> bool:
|
|
821
|
+
"""
|
|
822
|
+
Clear checkpoint for a source to force fresh ingestion.
|
|
823
|
+
|
|
824
|
+
Use this when you want to re-ingest everything from scratch,
|
|
825
|
+
ignoring previous file hashes and processing state.
|
|
826
|
+
|
|
827
|
+
Args:
|
|
828
|
+
source: Name of source toolkit
|
|
829
|
+
|
|
830
|
+
Returns:
|
|
831
|
+
True if checkpoint was cleared, False if no checkpoint existed
|
|
832
|
+
"""
|
|
833
|
+
checkpoint_path = Path(self._get_checkpoint_path(source))
|
|
834
|
+
if checkpoint_path.exists():
|
|
835
|
+
self._clear_checkpoint(source)
|
|
836
|
+
self._log_progress(f"🗑️ Cleared checkpoint for {source}", "reset")
|
|
837
|
+
return True
|
|
838
|
+
return False
|
|
839
|
+
|
|
840
|
+
def get_checkpoint_info(self, source: str) -> Optional[Dict[str, Any]]:
|
|
841
|
+
"""
|
|
842
|
+
Get information about existing checkpoint for a source.
|
|
843
|
+
|
|
844
|
+
Useful for checking if incremental update is available and
|
|
845
|
+
how many files are being tracked.
|
|
846
|
+
|
|
847
|
+
Args:
|
|
848
|
+
source: Name of source toolkit
|
|
849
|
+
|
|
850
|
+
Returns:
|
|
851
|
+
Dict with checkpoint info or None if no checkpoint exists
|
|
852
|
+
"""
|
|
853
|
+
checkpoint = self._load_checkpoint(source)
|
|
854
|
+
if not checkpoint:
|
|
855
|
+
return None
|
|
856
|
+
|
|
857
|
+
return {
|
|
858
|
+
'run_id': checkpoint.run_id,
|
|
859
|
+
'completed': checkpoint.completed,
|
|
860
|
+
'phase': checkpoint.phase,
|
|
861
|
+
'started_at': checkpoint.started_at,
|
|
862
|
+
'updated_at': checkpoint.updated_at,
|
|
863
|
+
'documents_processed': checkpoint.documents_processed,
|
|
864
|
+
'entities_added': checkpoint.entities_added,
|
|
865
|
+
'relations_added': checkpoint.relations_added,
|
|
866
|
+
'files_tracked': len(checkpoint.file_hashes),
|
|
867
|
+
'files_processed': len(checkpoint.processed_files),
|
|
868
|
+
'files_failed': len(checkpoint.failed_files),
|
|
869
|
+
}
|
|
870
|
+
|
|
871
|
+
def _generate_entity_id(self, entity_type: str, name: str, file_path: str = None) -> str:
|
|
872
|
+
"""
|
|
873
|
+
Generate unique entity ID.
|
|
874
|
+
|
|
875
|
+
For most entity types, IDs are based on (type, name) only - NOT file_path.
|
|
876
|
+
This enables same-named entities from different files to be merged,
|
|
877
|
+
creating a unified knowledge graph with multiple citations per entity.
|
|
878
|
+
|
|
879
|
+
HOWEVER, for context-dependent types (tools, properties, etc.), the file_path
|
|
880
|
+
IS included because the same name in different files means different things:
|
|
881
|
+
- "Get Tests" tool in Xray toolkit != "Get Tests" tool in Zephyr toolkit
|
|
882
|
+
- "name" property in User entity != "name" property in Project entity
|
|
883
|
+
"""
|
|
884
|
+
# Types that are context-dependent - same name in different files = different entities
|
|
885
|
+
CONTEXT_DEPENDENT_TYPES = {
|
|
886
|
+
"tool", "property", "properties", "parameter", "argument",
|
|
887
|
+
"field", "column", "attribute", "option", "setting",
|
|
888
|
+
"step", "test_step", "ui_field", "endpoint", "method",
|
|
889
|
+
# File-level nodes are unique per file path
|
|
890
|
+
"file", "source_file", "document_file", "config_file", "web_file",
|
|
891
|
+
}
|
|
892
|
+
|
|
893
|
+
# Normalize name for consistent hashing
|
|
894
|
+
normalized_name = name.lower().strip()
|
|
895
|
+
normalized_type = entity_type.lower().strip()
|
|
896
|
+
|
|
897
|
+
# Include file_path for context-dependent types
|
|
898
|
+
if normalized_type in CONTEXT_DEPENDENT_TYPES and file_path:
|
|
899
|
+
# Use file path to differentiate same-named entities from different contexts
|
|
900
|
+
content = f"{entity_type}:{normalized_name}:{file_path}"
|
|
901
|
+
else:
|
|
902
|
+
# Standard: merge same-named entities across files
|
|
903
|
+
content = f"{entity_type}:{normalized_name}"
|
|
904
|
+
|
|
905
|
+
return hashlib.md5(content.encode()).hexdigest()[:12]
|
|
906
|
+
|
|
907
|
+
def _normalize_document(self, doc: Any, source_toolkit: str) -> Optional[Document]:
|
|
908
|
+
"""Normalize various document formats to LangChain Document."""
|
|
909
|
+
if isinstance(doc, Document):
|
|
910
|
+
# Already a Document, ensure metadata has source_toolkit
|
|
911
|
+
doc.metadata['source_toolkit'] = source_toolkit
|
|
912
|
+
return doc
|
|
913
|
+
|
|
914
|
+
if isinstance(doc, dict):
|
|
915
|
+
# Dict from loader generator
|
|
916
|
+
content = doc.get('file_content') or doc.get('page_content', '')
|
|
917
|
+
if not content:
|
|
918
|
+
return None
|
|
919
|
+
|
|
920
|
+
metadata = {
|
|
921
|
+
'file_path': doc.get('file_name') or doc.get('source', 'unknown'),
|
|
922
|
+
'commit_hash': doc.get('commit_hash'),
|
|
923
|
+
'source_toolkit': source_toolkit,
|
|
924
|
+
}
|
|
925
|
+
# Merge additional metadata
|
|
926
|
+
for k, v in doc.items():
|
|
927
|
+
if k not in ('file_content', 'page_content', 'file_name', 'source', 'commit_hash'):
|
|
928
|
+
metadata[k] = v
|
|
929
|
+
|
|
930
|
+
return Document(page_content=content, metadata=metadata)
|
|
931
|
+
|
|
932
|
+
logger.warning(f"Unknown document type: {type(doc)}")
|
|
933
|
+
return None
|
|
934
|
+
|
|
935
|
+
    def _extract_entities_from_doc(
        self,
        doc: Document,
        source_toolkit: str,
        schema: Optional[Dict] = None
    ) -> Tuple[List[Dict[str, Any]], List[str], List[Dict[str, Any]]]:
        """Extract entities from a single document.

        Uses parser-first approach:
        1. For code files with parser: Use AST/regex parsers to extract symbols (no LLM)
        2. For code files without parser: HYBRID FALLBACK - TextParser + full LLM
        3. For non-code files: LLM extracts semantic entities
        4. For all files with parser: Also run LLM for semantic entities not in code structure

        Returns:
            Tuple of (entities, failed_file_paths, parser_relationships) where:
            - entities: List of extracted entity dicts
            - failed_file_paths: File path if extraction failed, empty list otherwise
            - parser_relationships: List of relationships from parser (for code files)
        """
        file_path = (doc.metadata.get('file_path') or
                     doc.metadata.get('file_name') or
                     doc.metadata.get('source', 'unknown'))

        entities = []
        parser_relationships = []
        failed_docs = []

        # Get chunk position info for line number adjustment
        start_line = doc.metadata.get('start_line') or doc.metadata.get('line_start')

        # ========== PARSER-FIRST EXTRACTION ==========
        # Try to use parser for code files (AST/regex - no LLM needed)
        parser = get_parser_for_file(file_path)
        parser_extracted_names = set()  # Track what parser extracted to avoid LLM duplication
        use_full_llm_extraction = False  # Flag for hybrid fallback

        if parser and _is_code_file(file_path):
            try:
                # Parse file content with language-specific parser
                parse_result = parser_parse_file(file_path, content=doc.page_content)

                # Build symbol name to entity ID mapping for containment edges
                symbol_name_to_entity_id = {}

                # Convert symbols to entities
                for symbol in parse_result.symbols:
                    entity = _symbol_to_entity(
                        symbol,
                        source_toolkit,
                        self._generate_entity_id
                    )
                    # Update citation with commit hash if available
                    if doc.metadata.get('commit_hash'):
                        entity['citation'].content_hash = doc.metadata.get('commit_hash')

                    entities.append(entity)
                    parser_extracted_names.add(symbol.name.lower())

                    # Track symbol full name to entity ID for containment edges
                    full_name = symbol.full_name or symbol.get_qualified_name() or symbol.name
                    symbol_name_to_entity_id[full_name] = entity['id']
                    # Also track by simple name for fallback matching
                    symbol_name_to_entity_id[symbol.name] = entity['id']

                # Convert relationships from parser
                for rel in parse_result.relationships:
                    parser_relationships.append(
                        _parser_relationship_to_dict(rel, source_toolkit)
                    )

                # ========== INTRA-FILE CONTAINMENT EDGES ==========
                # Create containment relationships based on Symbol.parent_symbol
                containment_count = 0
                for symbol in parse_result.symbols:
                    if symbol.parent_symbol:
                        # Find parent entity ID
                        child_full_name = symbol.full_name or symbol.get_qualified_name() or symbol.name
                        child_id = symbol_name_to_entity_id.get(child_full_name) or symbol_name_to_entity_id.get(symbol.name)

                        # Try to find parent by full name or simple name
                        parent_id = symbol_name_to_entity_id.get(symbol.parent_symbol)

                        if child_id and parent_id and child_id != parent_id:
                            parser_relationships.append({
                                'source_symbol': symbol.parent_symbol,
                                'target_symbol': child_full_name,
                                'relation_type': 'contains',
                                'source_file': file_path,
                                'target_file': file_path,
                                'confidence': 1.0,  # High confidence - structural
                                'is_cross_file': False,
                                'source': 'parser',
                                'source_toolkit': source_toolkit,
                                # Pre-resolved IDs for graph insertion
                                '_resolved_source_id': parent_id,
                                '_resolved_target_id': child_id,
                            })
                            containment_count += 1

                logger.debug(f"Parser extracted {len(entities)} entities, {len(parser_relationships)} relationships ({containment_count} containment) from {file_path}")

            except Exception as e:
                logger.warning(f"Parser failed for {file_path}: {e}, using hybrid fallback")
                use_full_llm_extraction = True  # Enable full LLM extraction

        elif _is_code_like_file(file_path) and not parser:
            # ========== HYBRID FALLBACK ==========
            # File looks like code but no parser available (e.g., .lua, .perl, .sh)
            # Use TextParser to extract textual references + full LLM extraction
            logger.info(f"Hybrid fallback for unsupported code file: {file_path}")
            use_full_llm_extraction = True

            try:
                # Use TextParser to extract textual references
                from .parsers import TextParser
                text_parser = TextParser()
                parse_result = text_parser.parse_file(file_path, content=doc.page_content)

                # Extract any textual relationships (See X, Depends on Y, etc.)
                for rel in parse_result.relationships:
                    parser_relationships.append(
                        _parser_relationship_to_dict(rel, source_toolkit)
                    )

                logger.debug(f"TextParser extracted {len(parse_result.relationships)} textual references from {file_path}")

            except Exception as e:
                logger.warning(f"TextParser failed for {file_path}: {e}")

        # ========== LLM EXTRACTION (semantic entities) ==========
        # For code files with parser: LLM extracts only semantic entities (features, requirements, etc.)
        # For hybrid fallback: LLM does full extraction including code structure
        # For non-code files: LLM does full extraction

        if self._entity_extractor:
            try:
                # Extract entities - skip_on_error=True returns (entities, failed_docs)
                extracted, llm_failed_docs = self._entity_extractor.extract_batch(
                    [doc], schema=schema, skip_on_error=True
                )
                failed_docs.extend(llm_failed_docs)

                for entity in extracted:
                    entity_name = entity.get('name', '').lower()
                    raw_type = entity.get('type', 'unknown')
                    normalized_type = normalize_entity_type(raw_type)

                    # Skip if parser already extracted this (avoid duplicates for code entities)
                    # Only skip for code_layer types that parsers handle, and only if not hybrid fallback
                    code_layer_types = {'class', 'function', 'method', 'module', 'interface',
                                        'constant', 'variable', 'import', 'property', 'field'}
                    if (not use_full_llm_extraction and
                            entity_name in parser_extracted_names and
                            normalized_type in code_layer_types):
                        continue

                    # Adjust line numbers if this is a chunk with offset
                    entity_line_start = entity.get('line_start')
                    entity_line_end = entity.get('line_end')

                    if start_line and entity_line_start:
                        entity_line_start = start_line + entity_line_start - 1
                        if entity_line_end:
                            entity_line_end = start_line + entity_line_end - 1

                    entity_id = self._generate_entity_id(
                        normalized_type,
                        entity.get('name', 'unnamed'),
                        file_path
                    )

                    # Create citation
                    citation = Citation(
                        file_path=file_path,
                        line_start=entity_line_start or entity.get('line_start'),
                        line_end=entity_line_end or entity.get('line_end'),
                        source_toolkit=source_toolkit,
                        doc_id=f"{source_toolkit}://{file_path}",
                        content_hash=doc.metadata.get('commit_hash'),
                    )

                    entities.append({
                        'id': entity_id,
                        'name': entity.get('name', 'unnamed'),
                        'type': normalized_type,
                        'citation': citation,
                        'properties': {
                            k: v for k, v in entity.items()
                            if k not in ('id', 'name', 'type', 'content', 'text', 'line_start', 'line_end')
                        },
                        'source_doc': doc,
                        'source': 'llm_hybrid' if use_full_llm_extraction else 'llm',
                    })

            except Exception as e:
                logger.error(f"LLM extraction failed for {file_path}: {e}")
                failed_docs.append(file_path)

        # =====================================================================
        # FACT EXTRACTION - Lightweight LLM for semantic insights
        # Code files: extract algorithms, behaviors, validations, dependencies
        # Text files: extract decisions, requirements, definitions, dates
        # =====================================================================
        if self.llm:
            try:
                fact_extractor = FactExtractor(self.llm)
                is_code = _is_code_file(file_path) or _is_code_like_file(file_path)

                # Use appropriate extraction method based on file type
                if is_code:
                    facts = fact_extractor.extract_code(doc)
                else:
                    facts = fact_extractor.extract(doc)

                for fact in facts:
                    fact_id = self._generate_entity_id(
                        'fact',
                        f"{fact.get('fact_type', 'unknown')}_{fact.get('subject', 'unknown')[:30]}",
                        file_path
                    )

                    # Create citation for the fact
                    citation = Citation(
                        file_path=file_path,
                        line_start=fact.get('line_start'),
                        line_end=fact.get('line_end'),
                        source_toolkit=source_toolkit,
                        doc_id=f"{source_toolkit}://{file_path}",
                        content_hash=doc.metadata.get('commit_hash'),
                    )

                    entities.append({
                        'id': fact_id,
                        'name': fact.get('subject', 'unknown fact'),
                        'type': 'fact',
                        'citation': citation,
                        'properties': {
                            'fact_type': fact.get('fact_type'),
                            'subject': fact.get('subject'),
                            'predicate': fact.get('predicate'),
                            'object': fact.get('object'),
                            'confidence': fact.get('confidence', 0.8),
                        },
                        'source_doc': doc,
                        'source': 'llm_fact',
                    })

                logger.debug(f"Extracted {len(facts)} facts from {file_path}")
            except Exception as e:
                logger.warning(f"Fact extraction failed for {file_path}: {e}")

        return entities, failed_docs, parser_relationships

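A standalone sketch (not SDK code) of the duplicate-suppression rule documented above: an LLM entity is dropped only when the parser already produced a symbol with the same lower-cased name and the entity's normalized type is a code-layer type, and never in the hybrid fallback. The helper name and sample values are hypothetical.

CODE_LAYER_TYPES = {'class', 'function', 'method', 'module', 'interface',
                    'constant', 'variable', 'import', 'property', 'field'}

def keep_llm_entity(name: str, entity_type: str,
                    parser_names: set, hybrid_fallback: bool) -> bool:
    if hybrid_fallback:
        return True                      # hybrid fallback: keep full LLM output
    if entity_type not in CODE_LAYER_TYPES:
        return True                      # semantic entities (features, requirements, ...)
    return name.lower() not in parser_names

print(keep_llm_entity('OrderService', 'class', {'orderservice'}, False))     # False: parser has it
print(keep_llm_entity('Checkout flow', 'feature', {'orderservice'}, False))  # True: semantic entity
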
    def _process_documents_batch(
        self,
        documents: List[Document],
        source_toolkit: str,
        schema: Optional[Dict] = None
    ) -> Tuple[List[Dict[str, Any]], List[str], Dict[str, str], List[Dict[str, Any]]]:
        """
        Process a batch of documents in parallel for entity extraction.

        Args:
            documents: List of documents to process
            source_toolkit: Source toolkit name
            schema: Optional schema for extraction

        Returns:
            Tuple of (all_entities, failed_files, file_hashes, parser_relationships) where:
            - all_entities: Combined list of entities from all documents
            - failed_files: List of file paths that failed extraction
            - file_hashes: Dict mapping file_path to content_hash
            - parser_relationships: List of relationships from parsers (AST/regex extracted)
        """
        all_entities = []
        failed_files = []
        file_hashes = {}
        all_parser_relationships = []

        # Use ThreadPoolExecutor for parallel extraction
        with ThreadPoolExecutor(max_workers=self.max_parallel_extractions) as executor:
            # Submit all extraction tasks
            future_to_doc = {
                executor.submit(self._extract_entities_from_doc, doc, source_toolkit, schema): doc
                for doc in documents
            }

            # Process completed tasks as they finish
            for future in as_completed(future_to_doc):
                doc = future_to_doc[future]
                file_path = (doc.metadata.get('file_path') or
                             doc.metadata.get('file_name') or
                             doc.metadata.get('source', 'unknown'))

                try:
                    entities, extraction_failures, parser_relationships = future.result()

                    # Track content hash
                    content_hash = hashlib.sha256(doc.page_content.encode()).hexdigest()
                    file_hashes[file_path] = content_hash

                    # Add entities to batch results
                    all_entities.extend(entities)

                    # Collect parser relationships
                    all_parser_relationships.extend(parser_relationships)

                    # Track failures
                    if extraction_failures:
                        failed_files.extend(extraction_failures)

                except Exception as e:
                    logger.warning(f"Failed to process document '{file_path}': {e}")
                    failed_files.append(file_path)

        return all_entities, failed_files, file_hashes, all_parser_relationships

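A self-contained sketch of the fan-out/fan-in pattern used above, with the same sha256 content hashing; `extract` is a placeholder worker, not an SDK function, and the sample documents are hypothetical.

import hashlib
from concurrent.futures import ThreadPoolExecutor, as_completed

def extract(text: str) -> int:           # placeholder for the per-document work
    return len(text.split())

docs = {'a.md': 'alpha beta', 'b.md': 'gamma'}
hashes, results = {}, {}
with ThreadPoolExecutor(max_workers=4) as pool:
    futures = {pool.submit(extract, text): path for path, text in docs.items()}
    for fut in as_completed(futures):
        path = futures[fut]
        hashes[path] = hashlib.sha256(docs[path].encode()).hexdigest()
        results[path] = fut.result()

print(results, {p: h[:8] for p, h in hashes.items()})
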
    def _process_batch_and_update_graph(
        self,
        doc_batch: List[Document],
        source: str,
        schema: Optional[Dict],
        checkpoint: IngestionCheckpoint,
        result: IngestionResult,
        all_entities: List[Dict[str, Any]],
        all_parser_relationships: List[Dict[str, Any]],
        is_incremental_update: bool
    ) -> None:
        """
        Process a batch of documents in parallel and update the graph.

        This method extracts entities from all documents in the batch concurrently,
        then adds them to the graph sequentially (graph operations are not thread-safe).

        Args:
            doc_batch: List of documents to process
            source: Source toolkit name
            schema: Optional schema for extraction
            checkpoint: Checkpoint for progress tracking
            result: IngestionResult to update
            all_entities: List to accumulate all entities
            all_parser_relationships: List to accumulate parser-extracted relationships
            is_incremental_update: Whether this is an incremental update
        """
        # Extract entities from all docs in parallel
        batch_entities, failed_files, file_hashes, parser_rels = self._process_documents_batch(
            doc_batch, source, schema
        )

        # Update graph with batch results (sequential - graph is not thread-safe)
        for entity in batch_entities:
            self._knowledge_graph.add_entity(
                entity_id=entity['id'],
                name=entity['name'],
                entity_type=entity['type'],
                citation=entity['citation'],
                properties=entity['properties']
            )
            result.entities_added += 1
            all_entities.append(entity)

        # Collect parser relationships for later processing
        all_parser_relationships.extend(parser_rels)

        # Update checkpoint with processed files and hashes
        for file_path, content_hash in file_hashes.items():
            if file_path not in failed_files:
                checkpoint.mark_file_processed(file_path, content_hash)
                result.documents_processed += 1

        # Track failed files
        for failed_file in failed_files:
            checkpoint.mark_file_failed(failed_file, "Entity extraction failed")
            if failed_file not in result.failed_documents:
                result.failed_documents.append(failed_file)

    def _process_file_batch_and_update_graph(
        self,
        file_batch: List[Tuple[str, List[Document], Document]],
        _raw_doc_by_file: Dict[str, Document],  # Deprecated, kept for compatibility
        source: str,
        schema: Optional[Dict],
        checkpoint: IngestionCheckpoint,
        result: IngestionResult,
        all_entities: List[Dict[str, Any]],
        all_parser_relationships: List[Dict[str, Any]],
        is_incremental_update: bool
    ) -> None:
        """
        Process a batch of files with their chunks and update the graph.

        For each file:
        1. Run parser on whole file (AST/regex extraction - no LLM)
        2. Run LLM on each chunk (facts + entities)
        3. Deduplicate facts/entities at file level
        4. Add to graph

        Args:
            file_batch: List of (file_path, chunks, raw_doc) tuples
            _raw_doc_by_file: DEPRECATED - raw_doc is now passed in file_batch tuple
            source: Source toolkit name
            schema: Optional schema for extraction
            checkpoint: Checkpoint for progress tracking
            result: IngestionResult to update
            all_entities: List to accumulate all entities
            all_parser_relationships: List to accumulate parser-extracted relationships
            is_incremental_update: Whether this is an incremental update
        """
        # Process files in parallel
        batch_start_time = time.time()
        logger.info(f"⏱️ [TIMING] Batch start: {len(file_batch)} files")

        with ThreadPoolExecutor(max_workers=self.max_parallel_extractions) as executor:
            future_to_file = {
                executor.submit(
                    self._process_file_with_chunks,
                    file_path, chunks, raw_doc, source, schema
                ): file_path
                for file_path, chunks, raw_doc in file_batch
            }

            for future in as_completed(future_to_file):
                file_path = future_to_file[future]

                try:
                    file_entities, parser_rels, content_hash = future.result()

                    # Update graph with file results (sequential - graph is not thread-safe)
                    for entity in file_entities:
                        self._knowledge_graph.add_entity(
                            entity_id=entity['id'],
                            name=entity['name'],
                            entity_type=entity['type'],
                            citation=entity['citation'],
                            properties=entity['properties']
                        )
                        result.entities_added += 1
                        all_entities.append(entity)

                    # Collect parser relationships
                    all_parser_relationships.extend(parser_rels)

                    # Mark file as processed
                    checkpoint.mark_file_processed(file_path, content_hash)
                    result.documents_processed += 1

                except Exception as e:
                    logger.warning(f"Failed to process file '{file_path}': {e}")
                    checkpoint.mark_file_failed(file_path, str(e))
                    if file_path not in result.failed_documents:
                        result.failed_documents.append(file_path)
                    result.documents_processed += 1

        batch_duration = time.time() - batch_start_time
        logger.info(f"⏱️ [TIMING] Batch complete: {len(file_batch)} files in {batch_duration:.3f}s ({batch_duration/len(file_batch):.3f}s/file avg)")

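A hypothetical example of the shape of one `file_batch` entry consumed above, matching the (file_path, chunks, raw_doc) tuple described in the docstring; the path, content, and metadata values are illustrative only, and langchain-core is assumed to be installed.

from langchain_core.documents import Document

raw_doc = Document(page_content='class A:\n    pass\n',
                   metadata={'file_path': 'pkg/a.py'})
chunks = [Document(page_content='class A:\n    pass\n',
                   metadata={'file_path': 'pkg/a.py', 'start_line': 1})]
file_batch = [('pkg/a.py', chunks, raw_doc)]   # (file_path, chunks, raw_doc)
print(file_batch[0][0], len(file_batch[0][1]))
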
def _process_file_with_chunks(
|
|
1393
|
+
self,
|
|
1394
|
+
file_path: str,
|
|
1395
|
+
chunks: List[Document],
|
|
1396
|
+
raw_doc: Document,
|
|
1397
|
+
source_toolkit: str,
|
|
1398
|
+
schema: Optional[Dict] = None
|
|
1399
|
+
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], str]:
|
|
1400
|
+
"""
|
|
1401
|
+
Process a single file: parser on whole file, LLM on chunks, dedupe at file level.
|
|
1402
|
+
|
|
1403
|
+
Args:
|
|
1404
|
+
file_path: Path to the file
|
|
1405
|
+
chunks: List of chunk Documents for this file
|
|
1406
|
+
raw_doc: Raw (whole file) Document for parser
|
|
1407
|
+
source_toolkit: Source toolkit name
|
|
1408
|
+
schema: Optional schema for extraction
|
|
1409
|
+
|
|
1410
|
+
Returns:
|
|
1411
|
+
Tuple of (deduplicated_entities, parser_relationships, content_hash)
|
|
1412
|
+
"""
|
|
1413
|
+
all_entities = []
|
|
1414
|
+
parser_relationships = []
|
|
1415
|
+
content_hash = hashlib.sha256(raw_doc.page_content.encode()).hexdigest()
|
|
1416
|
+
|
|
1417
|
+
# Check if file is too small or trivial for LLM extraction
|
|
1418
|
+
file_content = raw_doc.page_content
|
|
1419
|
+
line_count = file_content.count('\n') + 1
|
|
1420
|
+
char_count = len(file_content)
|
|
1421
|
+
|
|
1422
|
+
# Detect trivial/boilerplate content
|
|
1423
|
+
skip_llm = False
|
|
1424
|
+
skip_reason = ""
|
|
1425
|
+
|
|
1426
|
+
# 1. Too small
|
|
1427
|
+
if line_count < self.min_file_lines or char_count < self.min_file_chars:
|
|
1428
|
+
skip_llm = True
|
|
1429
|
+
skip_reason = f"small ({line_count} lines, {char_count} chars)"
|
|
1430
|
+
|
|
1431
|
+
# 2. License-only files or files starting with license header that's most of the content
|
|
1432
|
+
if not skip_llm:
|
|
1433
|
+
content_lower = file_content.lower()
|
|
1434
|
+
license_indicators = [
|
|
1435
|
+
'apache license', 'mit license', 'bsd license', 'gpl license',
|
|
1436
|
+
'licensed under the', 'permission is hereby granted',
|
|
1437
|
+
'copyright (c)', 'copyright 20', 'all rights reserved',
|
|
1438
|
+
'without warranties or conditions', 'provided "as is"',
|
|
1439
|
+
]
|
|
1440
|
+
license_matches = sum(1 for ind in license_indicators if ind in content_lower)
|
|
1441
|
+
|
|
1442
|
+
# If 3+ license indicators and file is mostly comments/license text
|
|
1443
|
+
if license_matches >= 3:
|
|
1444
|
+
# Count actual code lines (non-empty, non-comment)
|
|
1445
|
+
code_lines = 0
|
|
1446
|
+
for line in file_content.split('\n'):
|
|
1447
|
+
stripped = line.strip()
|
|
1448
|
+
if stripped and not stripped.startswith(('#', '//', '/*', '*', '<!--', '"""', "'''")):
|
|
1449
|
+
code_lines += 1
|
|
1450
|
+
|
|
1451
|
+
# If less than 20% is actual code, it's mostly license/boilerplate
|
|
1452
|
+
if code_lines < line_count * 0.2:
|
|
1453
|
+
skip_llm = True
|
|
1454
|
+
skip_reason = f"license/boilerplate ({code_lines} code lines of {line_count})"
|
|
1455
|
+
|
|
1456
|
+
# 3. Re-export / barrel files (e.g., index.js with only exports)
|
|
1457
|
+
if not skip_llm:
|
|
1458
|
+
content_stripped = file_content.strip()
|
|
1459
|
+
lines = [l.strip() for l in content_stripped.split('\n') if l.strip()]
|
|
1460
|
+
|
|
1461
|
+
# Check if file is mostly import/export statements
|
|
1462
|
+
export_import_lines = sum(1 for l in lines if
|
|
1463
|
+
l.startswith(('export ', 'import ', 'from ', 'module.exports', 'exports.'))
|
|
1464
|
+
or l.startswith('export {') or l.startswith('export default')
|
|
1465
|
+
or 'require(' in l)
|
|
1466
|
+
|
|
1467
|
+
if len(lines) > 0 and export_import_lines / len(lines) > 0.8:
|
|
1468
|
+
skip_llm = True
|
|
1469
|
+
skip_reason = f"barrel/re-export file ({export_import_lines}/{len(lines)} export lines)"
|
|
1470
|
+
|
|
1471
|
+
if skip_llm:
|
|
1472
|
+
logger.debug(f"Skipping LLM for {Path(file_path).name}: {skip_reason}")
|
|
1473
|
+
|
|
1474
|
+
# ========== PARSER EXTRACTION (whole file, no LLM) ==========
|
|
1475
|
+
parser_start = time.time()
|
|
1476
|
+
parser = get_parser_for_file(file_path)
|
|
1477
|
+
parser_extracted_names = set()
|
|
1478
|
+
|
|
1479
|
+
if parser and _is_code_file(file_path):
|
|
1480
|
+
try:
|
|
1481
|
+
parse_result = parser_parse_file(file_path, content=raw_doc.page_content)
|
|
1482
|
+
|
|
1483
|
+
# Build symbol name to entity ID mapping for containment edges
|
|
1484
|
+
symbol_name_to_entity_id = {}
|
|
1485
|
+
|
|
1486
|
+
# Convert symbols to entities
|
|
1487
|
+
for symbol in parse_result.symbols:
|
|
1488
|
+
entity = _symbol_to_entity(
|
|
1489
|
+
symbol,
|
|
1490
|
+
source_toolkit,
|
|
1491
|
+
self._generate_entity_id
|
|
1492
|
+
)
|
|
1493
|
+
if raw_doc.metadata.get('commit_hash'):
|
|
1494
|
+
entity['citation'].content_hash = raw_doc.metadata.get('commit_hash')
|
|
1495
|
+
|
|
1496
|
+
all_entities.append(entity)
|
|
1497
|
+
parser_extracted_names.add(symbol.name.lower())
|
|
1498
|
+
|
|
1499
|
+
full_name = symbol.full_name or symbol.get_qualified_name()
|
|
1500
|
+
if full_name:
|
|
1501
|
+
symbol_name_to_entity_id[full_name] = entity['id']
|
|
1502
|
+
|
|
1503
|
+
# Convert relationships
|
|
1504
|
+
for rel in parse_result.relationships:
|
|
1505
|
+
parser_relationships.append(
|
|
1506
|
+
_parser_relationship_to_dict(rel, source_toolkit)
|
|
1507
|
+
)
|
|
1508
|
+
|
|
1509
|
+
# Add containment edges from parent_symbol
|
|
1510
|
+
containment_count = 0
|
|
1511
|
+
for symbol in parse_result.symbols:
|
|
1512
|
+
if symbol.parent_symbol:
|
|
1513
|
+
child_name = symbol.full_name or symbol.get_qualified_name()
|
|
1514
|
+
parent_name = symbol.parent_symbol
|
|
1515
|
+
|
|
1516
|
+
child_id = symbol_name_to_entity_id.get(child_name)
|
|
1517
|
+
parent_id = symbol_name_to_entity_id.get(parent_name)
|
|
1518
|
+
|
|
1519
|
+
if child_id and parent_id:
|
|
1520
|
+
parser_relationships.append({
|
|
1521
|
+
'source_id': parent_id,
|
|
1522
|
+
'target_id': child_id,
|
|
1523
|
+
'relation_type': 'contains',
|
|
1524
|
+
'properties': {
|
|
1525
|
+
'source': 'parser',
|
|
1526
|
+
'source_toolkit': source_toolkit,
|
|
1527
|
+
'file_path': file_path,
|
|
1528
|
+
},
|
|
1529
|
+
'source': 'parser',
|
|
1530
|
+
})
|
|
1531
|
+
containment_count += 1
|
|
1532
|
+
|
|
1533
|
+
logger.debug(f"Parser extracted {len(all_entities)} symbols, {len(parser_relationships)} relationships from {file_path}")
|
|
1534
|
+
|
|
1535
|
+
except Exception as e:
|
|
1536
|
+
logger.warning(f"Parser failed for {file_path}: {e}")
|
|
1537
|
+
|
|
1538
|
+
parser_duration = time.time() - parser_start
|
|
1539
|
+
if parser_duration > 0.1: # Only log if > 100ms
|
|
1540
|
+
logger.info(f"⏱️ [TIMING] Parser: {parser_duration:.3f}s for {file_path}")
|
|
1541
|
+
|
|
1542
|
+
# ========== PARALLEL LLM EXTRACTION (Entity + Fact in parallel) ==========
|
|
1543
|
+
chunk_entities = []
|
|
1544
|
+
chunk_facts = []
|
|
1545
|
+
entity_llm_duration = 0.0
|
|
1546
|
+
fact_llm_duration = 0.0
|
|
1547
|
+
|
|
1548
|
+
# Build chunk metadata for line number adjustment
|
|
1549
|
+
chunk_offsets = []
|
|
1550
|
+
for chunk in chunks:
|
|
1551
|
+
start_line = chunk.metadata.get('start_line') or chunk.metadata.get('line_start') or 1
|
|
1552
|
+
chunk_offsets.append(start_line)
|
|
1553
|
+
|
|
1554
|
+
# Helper functions for parallel execution
|
|
1555
|
+
def extract_entities():
|
|
1556
|
+
"""Extract entities from chunks - runs in parallel thread."""
|
|
1557
|
+
entities = []
|
|
1558
|
+
if not self._entity_extractor or not chunks:
|
|
1559
|
+
return entities, 0.0
|
|
1560
|
+
|
|
1561
|
+
start = time.time()
|
|
1562
|
+
try:
|
|
1563
|
+
extracted, _ = self._entity_extractor.extract_batch(
|
|
1564
|
+
chunks, schema=schema, skip_on_error=True
|
|
1565
|
+
)
|
|
1566
|
+
|
|
1567
|
+
for entity in extracted:
|
|
1568
|
+
entity_name = entity.get('name', '').lower()
|
|
1569
|
+
raw_type = entity.get('type', 'unknown')
|
|
1570
|
+
normalized_type = normalize_entity_type(raw_type)
|
|
1571
|
+
|
|
1572
|
+
# Skip if parser already extracted this
|
|
1573
|
+
code_layer_types = {'class', 'function', 'method', 'module', 'interface',
|
|
1574
|
+
'constant', 'variable', 'import', 'property', 'field'}
|
|
1575
|
+
if (entity_name in parser_extracted_names and
|
|
1576
|
+
normalized_type in code_layer_types):
|
|
1577
|
+
continue
|
|
1578
|
+
|
|
1579
|
+
entity_id = self._generate_entity_id(
|
|
1580
|
+
normalized_type,
|
|
1581
|
+
entity.get('name', 'unnamed'),
|
|
1582
|
+
file_path
|
|
1583
|
+
)
|
|
1584
|
+
|
|
1585
|
+
citation = Citation(
|
|
1586
|
+
file_path=file_path,
|
|
1587
|
+
line_start=entity.get('line_start'),
|
|
1588
|
+
line_end=entity.get('line_end'),
|
|
1589
|
+
source_toolkit=source_toolkit,
|
|
1590
|
+
doc_id=f"{source_toolkit}://{file_path}",
|
|
1591
|
+
content_hash=raw_doc.metadata.get('commit_hash'),
|
|
1592
|
+
)
|
|
1593
|
+
|
|
1594
|
+
entities.append({
|
|
1595
|
+
'id': entity_id,
|
|
1596
|
+
'name': entity.get('name', 'unnamed'),
|
|
1597
|
+
'type': normalized_type,
|
|
1598
|
+
'citation': citation,
|
|
1599
|
+
'properties': {
|
|
1600
|
+
k: v for k, v in entity.items()
|
|
1601
|
+
if k not in ('id', 'name', 'type', 'content', 'text', 'line_start', 'line_end')
|
|
1602
|
+
},
|
|
1603
|
+
'source_doc': chunks[0] if chunks else None,
|
|
1604
|
+
'source': 'llm',
|
|
1605
|
+
})
|
|
1606
|
+
except Exception as e:
|
|
1607
|
+
logger.warning(f"Batched entity extraction failed for {file_path}: {e}")
|
|
1608
|
+
|
|
1609
|
+
return entities, time.time() - start
|
|
1610
|
+
|
|
1611
|
+
def extract_facts():
|
|
1612
|
+
"""Extract facts from chunks - runs in parallel thread."""
|
|
1613
|
+
facts = []
|
|
1614
|
+
if not self.llm or not chunks:
|
|
1615
|
+
return facts, 0.0
|
|
1616
|
+
|
|
1617
|
+
start = time.time()
|
|
1618
|
+
try:
|
|
1619
|
+
fact_extractor = FactExtractor(self.llm)
|
|
1620
|
+
is_code = _is_code_file(file_path) or _is_code_like_file(file_path)
|
|
1621
|
+
|
|
1622
|
+
if is_code:
|
|
1623
|
+
all_facts = fact_extractor.extract_batch_code(chunks)
|
|
1624
|
+
else:
|
|
1625
|
+
all_facts = fact_extractor.extract_batch(chunks)
|
|
1626
|
+
|
|
1627
|
+
for fact in all_facts:
|
|
1628
|
+
fact_id = self._generate_entity_id(
|
|
1629
|
+
'fact',
|
|
1630
|
+
f"{fact.get('fact_type', 'unknown')}_{fact.get('subject', 'unknown')[:30]}",
|
|
1631
|
+
file_path
|
|
1632
|
+
)
|
|
1633
|
+
|
|
1634
|
+
citation = Citation(
|
|
1635
|
+
file_path=file_path,
|
|
1636
|
+
line_start=fact.get('line_start'),
|
|
1637
|
+
line_end=fact.get('line_end'),
|
|
1638
|
+
source_toolkit=source_toolkit,
|
|
1639
|
+
doc_id=f"{source_toolkit}://{file_path}",
|
|
1640
|
+
content_hash=raw_doc.metadata.get('commit_hash'),
|
|
1641
|
+
)
|
|
1642
|
+
|
|
1643
|
+
facts.append({
|
|
1644
|
+
'id': fact_id,
|
|
1645
|
+
'name': fact.get('subject', 'unknown fact'),
|
|
1646
|
+
'type': 'fact',
|
|
1647
|
+
'citation': citation,
|
|
1648
|
+
'properties': {
|
|
1649
|
+
'fact_type': fact.get('fact_type'),
|
|
1650
|
+
'subject': fact.get('subject'),
|
|
1651
|
+
'predicate': fact.get('predicate'),
|
|
1652
|
+
'object': fact.get('object'),
|
|
1653
|
+
'confidence': fact.get('confidence', 0.8),
|
|
1654
|
+
},
|
|
1655
|
+
'source_doc': chunks[0] if chunks else None,
|
|
1656
|
+
'source': 'llm_fact',
|
|
1657
|
+
})
|
|
1658
|
+
except Exception as e:
|
|
1659
|
+
logger.warning(f"Batched fact extraction failed for {file_path}: {e}")
|
|
1660
|
+
|
|
1661
|
+
return facts, time.time() - start
|
|
1662
|
+
|
|
1663
|
+
# Run entity and fact extraction in PARALLEL (skip for trivial files)
|
|
1664
|
+
llm_start = time.time()
|
|
1665
|
+
if chunks and not skip_llm:
|
|
1666
|
+
with ThreadPoolExecutor(max_workers=2) as executor:
|
|
1667
|
+
entity_future = executor.submit(extract_entities)
|
|
1668
|
+
fact_future = executor.submit(extract_facts)
|
|
1669
|
+
|
|
1670
|
+
chunk_entities, entity_llm_duration = entity_future.result()
|
|
1671
|
+
chunk_facts, fact_llm_duration = fact_future.result()
|
|
1672
|
+
|
|
1673
|
+
llm_total = time.time() - llm_start
|
|
1674
|
+
logger.info(f"⏱️ [TIMING] LLM parallel: {llm_total:.3f}s (entity: {entity_llm_duration:.3f}s, fact: {fact_llm_duration:.3f}s, {len(chunks)} chunks) for {Path(file_path).name}")
|
|
1675
|
+
elif skip_llm:
|
|
1676
|
+
logger.info(f"⏱️ [TIMING] LLM skipped ({skip_reason}) for {Path(file_path).name}")
|
|
1677
|
+
|
|
1678
|
+
# ========== FILE-LEVEL DEDUPLICATION ==========
|
|
1679
|
+
# Deduplicate entities by (type, name)
|
|
1680
|
+
seen_entities = {}
|
|
1681
|
+
for entity in chunk_entities:
|
|
1682
|
+
key = (entity['type'], entity['name'].lower())
|
|
1683
|
+
if key not in seen_entities:
|
|
1684
|
+
seen_entities[key] = entity
|
|
1685
|
+
else:
|
|
1686
|
+
# Merge properties, keep first citation
|
|
1687
|
+
existing = seen_entities[key]
|
|
1688
|
+
for prop_key, prop_value in entity.get('properties', {}).items():
|
|
1689
|
+
if prop_key not in existing.get('properties', {}):
|
|
1690
|
+
existing.setdefault('properties', {})[prop_key] = prop_value
|
|
1691
|
+
|
|
1692
|
+
# Deduplicate facts by (fact_type, subject)
|
|
1693
|
+
seen_facts = {}
|
|
1694
|
+
for fact in chunk_facts:
|
|
1695
|
+
key = (fact['properties'].get('fact_type'), fact['name'].lower())
|
|
1696
|
+
if key not in seen_facts:
|
|
1697
|
+
seen_facts[key] = fact
|
|
1698
|
+
else:
|
|
1699
|
+
# Keep higher confidence
|
|
1700
|
+
existing = seen_facts[key]
|
|
1701
|
+
if fact['properties'].get('confidence', 0) > existing['properties'].get('confidence', 0):
|
|
1702
|
+
seen_facts[key] = fact
|
|
1703
|
+
|
|
1704
|
+
# Combine: parser entities + deduplicated chunk entities + deduplicated facts
|
|
1705
|
+
all_entities.extend(seen_entities.values())
|
|
1706
|
+
all_entities.extend(seen_facts.values())
|
|
1707
|
+
|
|
1708
|
+
# ========== CREATE FILE-LEVEL NODE ==========
|
|
1709
|
+
# File node acts as a container for all entities/facts from this file
|
|
1710
|
+
file_name = Path(file_path).name
|
|
1711
|
+
file_ext = Path(file_path).suffix.lower()
|
|
1712
|
+
|
|
1713
|
+
# Determine file type based on extension
|
|
1714
|
+
file_type = 'file'
|
|
1715
|
+
if file_ext in {'.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.go', '.rs', '.kt', '.swift', '.cs', '.c', '.cpp', '.h'}:
|
|
1716
|
+
file_type = 'source_file'
|
|
1717
|
+
elif file_ext in {'.md', '.rst', '.txt'}:
|
|
1718
|
+
file_type = 'document_file'
|
|
1719
|
+
elif file_ext in {'.yml', '.yaml', '.json', '.toml', '.ini', '.cfg'}:
|
|
1720
|
+
file_type = 'config_file'
|
|
1721
|
+
elif file_ext in {'.html', '.css', '.scss', '.less'}:
|
|
1722
|
+
file_type = 'web_file'
|
|
1723
|
+
|
|
1724
|
+
file_entity_id = self._generate_entity_id('file', file_path, file_path)
|
|
1725
|
+
|
|
1726
|
+
file_citation = Citation(
|
|
1727
|
+
file_path=file_path,
|
|
1728
|
+
line_start=1,
|
|
1729
|
+
line_end=raw_doc.page_content.count('\n') + 1,
|
|
1730
|
+
source_toolkit=source_toolkit,
|
|
1731
|
+
doc_id=f"{source_toolkit}://{file_path}",
|
|
1732
|
+
content_hash=content_hash,
|
|
1733
|
+
)
|
|
1734
|
+
|
|
1735
|
+
# Count entities by category for file properties
|
|
1736
|
+
code_entity_count = sum(1 for e in all_entities if e['type'] in {'class', 'function', 'method', 'module', 'interface'})
|
|
1737
|
+
fact_count = sum(1 for e in all_entities if e['type'] == 'fact')
|
|
1738
|
+
other_entity_count = len(all_entities) - code_entity_count - fact_count
|
|
1739
|
+
|
|
1740
|
+
file_entity = {
|
|
1741
|
+
'id': file_entity_id,
|
|
1742
|
+
'name': file_name,
|
|
1743
|
+
'type': file_type,
|
|
1744
|
+
'citation': file_citation,
|
|
1745
|
+
'properties': {
|
|
1746
|
+
'full_path': file_path,
|
|
1747
|
+
'extension': file_ext,
|
|
1748
|
+
'line_count': raw_doc.page_content.count('\n') + 1,
|
|
1749
|
+
'size_bytes': len(raw_doc.page_content.encode('utf-8')),
|
|
1750
|
+
'content_hash': content_hash,
|
|
1751
|
+
'entity_count': len(all_entities),
|
|
1752
|
+
'code_entity_count': code_entity_count,
|
|
1753
|
+
'fact_count': fact_count,
|
|
1754
|
+
'other_entity_count': other_entity_count,
|
|
1755
|
+
},
|
|
1756
|
+
'source': 'parser',
|
|
1757
|
+
}
|
|
1758
|
+
|
|
1759
|
+
# Add file entity to the beginning (it's the container)
|
|
1760
|
+
all_entities.insert(0, file_entity)
|
|
1761
|
+
|
|
1762
|
+
# Create DEFINED_IN relationships from all entities to file
|
|
1763
|
+
for entity in all_entities[1:]: # Skip the file entity itself
|
|
1764
|
+
parser_relationships.append({
|
|
1765
|
+
'source_id': entity['id'],
|
|
1766
|
+
'target_id': file_entity_id,
|
|
1767
|
+
'relation_type': 'defined_in',
|
|
1768
|
+
'properties': {
|
|
1769
|
+
'source': 'parser',
|
|
1770
|
+
'source_toolkit': source_toolkit,
|
|
1771
|
+
},
|
|
1772
|
+
})
|
|
1773
|
+
|
|
1774
|
+
file_total_time = (time.time() - parser_start)
|
|
1775
|
+
logger.info(f"⏱️ [TIMING] File total: {file_total_time:.3f}s (parser: {parser_duration:.3f}s, llm_max: {max(entity_llm_duration, fact_llm_duration):.3f}s) for {Path(file_path).name}")
|
|
1776
|
+
logger.debug(f"File {file_path}: {len(all_entities)} total entities ({len(seen_entities)} from LLM, {len(seen_facts)} facts)")
|
|
1777
|
+
|
|
1778
|
+
return all_entities, parser_relationships, content_hash
|
|
1779
|
+
|
|
1780
|
+
    def _extract_relations_from_file(
        self,
        file_path: str,
        file_entities: List[Dict[str, Any]],
        all_entity_dicts: List[Dict[str, Any]],
        schema: Optional[Dict] = None,
        max_retries: int = 3
    ) -> Tuple[List[Dict[str, Any]], Optional[str]]:
        """
        Extract relations from entities in a single file with retry logic.

        Args:
            file_path: Path to the file being processed
            file_entities: Entities from this file
            all_entity_dicts: All graph entities for ID resolution
            schema: Optional schema to guide extraction
            max_retries: Maximum number of retry attempts (default: 3)

        Returns:
            Tuple of (relations_list, error_message)
            error_message is None on success
        """
        # Use first entity's doc for context
        doc = file_entities[0].get('source_doc')
        if not doc or not doc.page_content:
            # Try to reload content from file if source_doc is missing
            # This happens when resuming from checkpoint (source_doc isn't serialized)
            try:
                if file_path and Path(file_path).exists():
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        content = f.read()
                    doc = Document(page_content=content, metadata={'file_path': file_path})
                    logger.debug(f"Reloaded content from file: {file_path} ({len(content)} chars)")
                else:
                    # Can't reload - return empty (no relations can be extracted without content)
                    logger.debug(f"Cannot reload content for relation extraction: {file_path}")
                    return [], None  # Return empty but not an error
            except Exception as e:
                logger.warning(f"Failed to reload content from {file_path}: {e}")
                return [], None  # Return empty but not an error

        # Convert to format expected by relation extractor
        entity_dicts = [
            {'id': e['id'], 'name': e['name'], 'type': e['type'], **e.get('properties', {})}
            for e in file_entities
        ]

        # Retry logic with exponential backoff
        last_error = None
        for attempt in range(max_retries):
            try:
                file_relations = self._relation_extractor.extract(
                    doc, entity_dicts, schema=schema, confidence_threshold=0.5,
                    all_entities=all_entity_dicts
                )

                # Add source tracking to each relation
                source_toolkit = file_entities[0].get('source_toolkit') if file_entities else None
                for rel in file_relations:
                    if source_toolkit:
                        if 'properties' not in rel:
                            rel['properties'] = {}
                        rel['properties']['source_toolkit'] = source_toolkit
                        rel['properties']['discovered_in_file'] = file_path

                return file_relations, None

            except Exception as e:
                last_error = str(e)
                logger.warning(
                    f"Relation extraction failed for '{file_path}' "
                    f"(attempt {attempt + 1}/{max_retries}): {e}"
                )

                # Exponential backoff: 1s, 2s, 4s
                if attempt < max_retries - 1:
                    wait_time = 2 ** attempt
                    time.sleep(wait_time)

        # All retries failed
        logger.error(f"Failed to extract relations from '{file_path}' after {max_retries} attempts: {last_error}")
        return [], f"Failed after {max_retries} attempts: {last_error}"

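A generic sketch (not SDK code) of the retry-with-exponential-backoff loop used above: it waits 1s, 2s, 4s between attempts and reports the last error if all attempts fail. The `with_retries` helper is hypothetical.

import time

def with_retries(task, max_retries: int = 3):
    last_error = None
    for attempt in range(max_retries):
        try:
            return task(), None
        except Exception as exc:
            last_error = str(exc)
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)          # 1s, 2s, 4s between attempts
    return None, f"Failed after {max_retries} attempts: {last_error}"

result, error = with_retries(lambda: 42)
print(result, error)   # 42 None
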
    def _extract_relations(
        self,
        entities: List[Dict[str, Any]],
        schema: Optional[Dict] = None,
        all_graph_entities: Optional[List[Dict[str, Any]]] = None
    ) -> List[Dict[str, Any]]:
        """
        Extract relations between entities in parallel with robust error handling.

        Uses ThreadPoolExecutor to process multiple files concurrently, with automatic
        retry logic for failed extractions. Progress is reported as tasks complete.

        Args:
            entities: New entities to extract relations from
            schema: Optional schema to guide extraction
            all_graph_entities: All entities in graph (for ID resolution across sources)

        Returns:
            List of extracted relations
        """
        if not self._relation_extractor or len(entities) < 2:
            return []

        extract_rel_start = time.time()
        relations = []
        failed_files = []

        # Build ID lookup from ALL graph entities (enables cross-source relations)
        all_entities_for_lookup = all_graph_entities or entities

        # Group entities by file for relation extraction
        by_file: Dict[str, List] = {}
        for ent in entities:
            citation = ent.get('citation')
            if isinstance(citation, dict):
                fpath = citation.get('file_path', '')
            elif hasattr(citation, 'file_path'):
                fpath = citation.file_path
            else:
                fpath = ent.get('file_path', '')

            if not fpath:
                continue

            if fpath not in by_file:
                by_file[fpath] = []
            by_file[fpath].append(ent)

        # Filter files with enough entities for relation extraction
        files_to_process = [(fp, ents) for fp, ents in by_file.items() if len(ents) >= 2]
        total_files = len(files_to_process)

        if total_files == 0:
            return []

        # Prepare all_entity_dicts for cross-source ID resolution
        # Use all_graph_entities if provided, otherwise use the entities we're processing
        all_entity_dicts = [
            {'id': e.get('id'), 'name': e.get('name'), 'type': e.get('type')}
            for e in all_entities_for_lookup
            if e.get('id')
        ]

        # Use ThreadPoolExecutor for parallel relation extraction
        completed_files = 0

        with ThreadPoolExecutor(max_workers=self.max_parallel_extractions) as executor:
            # Submit all extraction tasks
            future_to_file = {
                executor.submit(
                    self._extract_relations_from_file,
                    file_path,
                    file_entities,
                    all_entity_dicts,
                    schema
                ): (file_path, file_entities)
                for file_path, file_entities in files_to_process
            }

            # Process completed tasks as they finish
            for future in as_completed(future_to_file):
                file_path, file_entities = future_to_file[future]
                completed_files += 1

                try:
                    file_relations, error = future.result()

                    if error:
                        # Log failed file but continue processing
                        failed_files.append({
                            'file_path': file_path,
                            'error': error,
                            'entity_count': len(file_entities)
                        })
                    else:
                        relations.extend(file_relations)

                except Exception as e:
                    # Unexpected error (shouldn't happen since we catch in _extract_relations_from_file)
                    logger.error(f"Unexpected error processing '{file_path}': {e}")
                    failed_files.append({
                        'file_path': file_path,
                        'error': f"Unexpected error: {str(e)}",
                        'entity_count': len(file_entities)
                    })

                # Log progress periodically
                if completed_files % 10 == 0 or completed_files == total_files or completed_files == 1:
                    pct = (completed_files / total_files) * 100
                    status_msg = f"🔗 Relations: {completed_files}/{total_files} files ({pct:.0f}%) | Found {len(relations)} relations"
                    if failed_files:
                        status_msg += f" | {len(failed_files)} files failed"
                    self._log_progress(status_msg, "relations")

        # Log summary of failures if any
        if failed_files:
            self._log_progress(
                f"⚠️ Relation extraction failed for {len(failed_files)}/{total_files} files. "
                f"Successfully extracted {len(relations)} relations from {total_files - len(failed_files)} files.",
                "relations"
            )
            # Log first few failures for debugging
            for failed in failed_files[:3]:
                logger.warning(
                    f"Failed to extract relations from '{failed['file_path']}' "
                    f"({failed['entity_count']} entities): {failed['error']}"
                )

        file_rel_duration = time.time() - extract_rel_start
        logger.info(f"⏱️ [TIMING] Per-file relation extraction: {file_rel_duration:.3f}s for {total_files} files")

        # Phase 2: Extract cross-file relations (imports, dependencies between modules)
        cross_file_start = time.time()
        cross_file_relations = self._extract_cross_file_relations(entities, all_entity_dicts, by_file)
        if cross_file_relations:
            relations.extend(cross_file_relations)
            self._log_progress(
                f"🔗 Cross-file: Found {len(cross_file_relations)} inter-module relations",
                "relations"
            )
        cross_file_duration = time.time() - cross_file_start
        logger.info(f"⏱️ [TIMING] Cross-file relation extraction: {cross_file_duration:.3f}s")

        total_rel_duration = time.time() - extract_rel_start
        logger.info(f"⏱️ [TIMING] _extract_relations total: {total_rel_duration:.3f}s ({len(relations)} relations)")

        return relations

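A small sketch of the group-by-file step above: only files that end up with at least two entities are worth a relation-extraction call. The entity dicts are illustrative, carrying just the citation field the grouping relies on; `defaultdict` is used here for brevity where the method uses a plain dict.

from collections import defaultdict

entities = [
    {'id': 'e1', 'citation': {'file_path': 'a.py'}},
    {'id': 'e2', 'citation': {'file_path': 'a.py'}},
    {'id': 'e3', 'citation': {'file_path': 'b.py'}},
]
by_file = defaultdict(list)
for ent in entities:
    path = ent['citation'].get('file_path', '')
    if path:
        by_file[path].append(ent)

files_to_process = [(p, ents) for p, ents in by_file.items() if len(ents) >= 2]
print([p for p, _ in files_to_process])   # ['a.py']
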
def _extract_cross_file_relations(
|
|
2012
|
+
self,
|
|
2013
|
+
entities: List[Dict[str, Any]],
|
|
2014
|
+
all_entity_dicts: List[Dict[str, Any]],
|
|
2015
|
+
by_file: Dict[str, List[Dict[str, Any]]]
|
|
2016
|
+
) -> List[Dict[str, Any]]:
|
|
2017
|
+
"""
|
|
2018
|
+
Extract cross-file relationships by analyzing imports, references, and dependencies.
|
|
2019
|
+
|
|
2020
|
+
Uses the patterns module for extensible, language-specific pattern matching.
|
|
2021
|
+
Patterns cover:
|
|
2022
|
+
- Import statements (JS/TS, Python, Java, C#, Go, Ruby, Rust, PHP, etc.)
|
|
2023
|
+
- Documentation links (Markdown, Wiki, HTML, RST)
|
|
2024
|
+
- Text citations and references ("see X", "@see X", etc.)
|
|
2025
|
+
- Inheritance patterns
|
|
2026
|
+
- Entity name mentions in content
|
|
2027
|
+
|
|
2028
|
+
Args:
|
|
2029
|
+
entities: All entities to analyze
|
|
2030
|
+
all_entity_dicts: Entity dictionaries for lookup
|
|
2031
|
+
by_file: Entities grouped by file path
|
|
2032
|
+
|
|
2033
|
+
Returns:
|
|
2034
|
+
List of cross-file relations
|
|
2035
|
+
"""
|
|
2036
|
+
from .patterns import get_patterns_for_file, PatternCategory
|
|
2037
|
+
import re
|
|
2038
|
+
|
|
2039
|
+
cross_relations = []
|
|
2040
|
+
|
|
2041
|
+
# Build lookup tables
|
|
2042
|
+
entity_by_name: Dict[str, Dict] = {}
|
|
2043
|
+
entity_by_id: Dict[str, Dict] = {}
|
|
2044
|
+
file_to_entities: Dict[str, List[Dict]] = {}
|
|
2045
|
+
module_to_file: Dict[str, str] = {}
|
|
2046
|
+
|
|
2047
|
+
# For entity mention matching
|
|
2048
|
+
significant_entities: List[Tuple[str, Dict]] = []
|
|
2049
|
+
|
|
2050
|
+
for ent in entities:
|
|
2051
|
+
name = ent.get('name', '')
|
|
2052
|
+
ent_id = ent.get('id', '')
|
|
2053
|
+
|
|
2054
|
+
if name:
|
|
2055
|
+
name_lower = name.lower()
|
|
2056
|
+
entity_by_name[name_lower] = ent
|
|
2057
|
+
entity_by_name[name] = ent
|
|
2058
|
+
|
|
2059
|
+
# Track significant entities for mention detection
|
|
2060
|
+
ent_type = ent.get('type', '').lower()
|
|
2061
|
+
if ent_type in ('class', 'component', 'service', 'module', 'api', 'endpoint',
|
|
2062
|
+
'feature', 'epic', 'requirement', 'interface', 'schema', 'table'):
|
|
2063
|
+
if len(name) >= 3: # Min 3 chars to reduce noise
|
|
2064
|
+
significant_entities.append((name_lower, ent))
|
|
2065
|
+
|
|
2066
|
+
if ent_id:
|
|
2067
|
+
entity_by_id[ent_id] = ent
|
|
2068
|
+
|
|
2069
|
+
# Build file -> entities and module -> file mappings
|
|
2070
|
+
citation = ent.get('citation')
|
|
2071
|
+
if citation:
|
|
2072
|
+
file_path = citation.get('file_path', '') if isinstance(citation, dict) else getattr(citation, 'file_path', '')
|
|
2073
|
+
if file_path:
|
|
2074
|
+
if file_path not in file_to_entities:
|
|
2075
|
+
file_to_entities[file_path] = []
|
|
2076
|
+
file_to_entities[file_path].append(ent)
|
|
2077
|
+
|
|
2078
|
+
p = Path(file_path)
|
|
2079
|
+
stem = p.stem
|
|
2080
|
+
module_to_file[stem.lower()] = file_path
|
|
2081
|
+
if stem.lower() == 'index':
|
|
2082
|
+
module_to_file[p.parent.name.lower()] = file_path
|
|
2083
|
+
|
|
2084
|
+
# Sort significant entities by length for greedy matching
|
|
2085
|
+
significant_entities.sort(key=lambda x: len(x[0]), reverse=True)
|
|
2086
|
+
|
|
2087
|
+
# ========================================================================
|
|
2088
|
+
# PHASE 1: Pattern-based extraction from file content
|
|
2089
|
+
# ========================================================================
|
|
2090
|
+
|
|
2091
|
+
for file_path, file_ents in by_file.items():
|
|
2092
|
+
if not file_ents:
|
|
2093
|
+
continue
|
|
2094
|
+
|
|
2095
|
+
# Read file content
|
|
2096
|
+
file_content = ""
|
|
2097
|
+
try:
|
|
2098
|
+
for ent in file_ents:
|
|
2099
|
+
doc = ent.get('source_doc')
|
|
2100
|
+
if doc and hasattr(doc, 'page_content') and doc.page_content:
|
|
2101
|
+
file_content = doc.page_content
|
|
2102
|
+
break
|
|
2103
|
+
|
|
2104
|
+
if not file_content and Path(file_path).exists():
|
|
2105
|
+
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
|
2106
|
+
file_content = f.read()
|
|
2107
|
+
except Exception:
|
|
2108
|
+
pass
|
|
2109
|
+
|
|
2110
|
+
if not file_content:
|
|
2111
|
+
continue
|
|
2112
|
+
|
|
2113
|
+
# Get source entity for this file
|
|
2114
|
+
source_ent = next(
|
|
2115
|
+
(e for e in file_ents if e.get('type', '').lower() in
|
|
2116
|
+
('module', 'component', 'class', 'file', 'page', 'document')),
|
|
2117
|
+
file_ents[0] if file_ents else None
|
|
2118
|
+
)
|
|
2119
|
+
if not source_ent:
|
|
2120
|
+
continue
|
|
2121
|
+
|
|
2122
|
+
source_id = source_ent.get('id')
|
|
2123
|
+
|
|
2124
|
+
# Get patterns for this file type
|
|
2125
|
+
patterns = get_patterns_for_file(file_path)
|
|
2126
|
+
|
|
2127
|
+
# Apply each pattern
|
|
2128
|
+
for pattern in patterns:
|
|
2129
|
+
matches = pattern.match(file_content)
|
|
2130
|
+
|
|
2131
|
+
for match_value in matches:
|
|
2132
|
+
if not match_value or len(match_value) < 2:
|
|
2133
|
+
continue
|
|
2134
|
+
|
|
2135
|
+
# Try to resolve match to an entity
|
|
2136
|
+
target_ent = None
|
|
2137
|
+
match_lower = match_value.lower()
|
|
2138
|
+
|
|
2139
|
+
# For imports, try module-to-file mapping
|
|
2140
|
+
if pattern.category == PatternCategory.IMPORT:
|
|
2141
|
+
target_file = module_to_file.get(match_lower)
|
|
2142
|
+
if target_file and target_file != file_path:
|
|
2143
|
+
target_ents = file_to_entities.get(target_file, [])
|
|
2144
|
+
target_ent = next(
|
|
2145
|
+
(e for e in target_ents if e.get('type', '').lower() in
|
|
2146
|
+
('module', 'component', 'class', 'file')),
|
|
2147
|
+
target_ents[0] if target_ents else None
|
|
2148
|
+
)
|
|
2149
|
+
|
|
2150
|
+
# For links/citations, try entity name lookup
|
|
2151
|
+
if pattern.category in (PatternCategory.LINK, PatternCategory.CITATION,
|
|
2152
|
+
PatternCategory.INHERITANCE, PatternCategory.TYPE_REF):
|
|
2153
|
+
# Skip external URLs
|
|
2154
|
+
if match_value.startswith(('http://', 'https://', '#')):
|
|
2155
|
+
continue
|
|
2156
|
+
|
|
2157
|
+
# Try direct name match
|
|
2158
|
+
target_ent = entity_by_name.get(match_lower) or entity_by_name.get(match_value)
|
|
2159
|
+
|
|
2160
|
+
# Try as file path
|
|
2161
|
+
if not target_ent:
|
|
2162
|
+
target_file = module_to_file.get(Path(match_value).stem.lower())
|
|
2163
|
+
if target_file:
|
|
2164
|
+
target_ents = file_to_entities.get(target_file, [])
|
|
2165
|
+
target_ent = target_ents[0] if target_ents else None
|
|
2166
|
+
|
|
2167
|
+
if target_ent and source_id != target_ent.get('id'):
|
|
2168
|
+
target_citation = target_ent.get('citation')
|
|
2169
|
+
target_file = ''
|
|
2170
|
+
if target_citation:
|
|
2171
|
+
target_file = (target_citation.get('file_path', '')
|
|
2172
|
+
if isinstance(target_citation, dict)
|
|
2173
|
+
else getattr(target_citation, 'file_path', ''))
|
|
2174
|
+
|
|
2175
|
+
if target_file != file_path:
|
|
2176
|
+
cross_relations.append({
|
|
2177
|
+
'source_id': source_id,
|
|
2178
|
+
'target_id': target_ent.get('id'),
|
|
2179
|
+
'type': pattern.relation_type.value,
|
|
2180
|
+
'properties': {
|
|
2181
|
+
'source_file': file_path,
|
|
2182
|
+
'target_file': target_file,
|
|
2183
|
+
                        'discovered_by': f'pattern:{pattern.name}',
                        'matched_value': match_value
                    },
                    'confidence': pattern.confidence
                })

            # --- Entity mention detection ---
            content_lower = file_content.lower()
            for name_lower, target_ent in significant_entities:
                if target_ent.get('id') == source_id:
                    continue

                if name_lower in content_lower:
                    # Verify word boundary
                    if re.search(r'\b' + re.escape(name_lower) + r'\b', content_lower):
                        target_citation = target_ent.get('citation')
                        target_file = ''
                        if target_citation:
                            target_file = (target_citation.get('file_path', '')
                                           if isinstance(target_citation, dict)
                                           else getattr(target_citation, 'file_path', ''))

                        if target_file and target_file != file_path:
                            cross_relations.append({
                                'source_id': source_id,
                                'target_id': target_ent.get('id'),
                                'type': 'MENTIONS',
                                'properties': {
                                    'source_file': file_path,
                                    'target_file': target_file,
                                    'discovered_by': 'content_mention',
                                    'mentioned_name': target_ent.get('name', '')
                                },
                                'confidence': 0.7
                            })

        # ========================================================================
        # PHASE 1.5: AST-based analysis (when available)
        # ========================================================================
        # Uses deepwiki parsers for more accurate code analysis

        try:
            from .patterns import is_ast_available, extract_ast_cross_file_relations

            if is_ast_available():
                # Collect file contents for AST analysis
                ast_file_contents: Dict[str, str] = {}
                ast_file_paths = []

                for file_path, file_ents in by_file.items():
                    # Only process code files that benefit from AST
                    ext = Path(file_path).suffix.lower()
                    if ext in ('.py', '.js', '.jsx', '.ts', '.tsx', '.java'):
                        # Get content from entity or file
                        file_content = ""
                        try:
                            for ent in file_ents:
                                doc = ent.get('source_doc')
                                if doc and hasattr(doc, 'page_content') and doc.page_content:
                                    file_content = doc.page_content
                                    break

                            if not file_content and Path(file_path).exists():
                                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                                    file_content = f.read()
                        except Exception:
                            pass

                        if file_content:
                            ast_file_contents[file_path] = file_content
                            ast_file_paths.append(file_path)

                if ast_file_paths:
                    # Extract AST-based relations
                    ast_relations = extract_ast_cross_file_relations(
                        ast_file_paths,
                        ast_file_contents,
                        entities
                    )

                    # Convert to standard format and add
                    for ast_rel in ast_relations:
                        source_name = ast_rel.get('source_entity', '')
                        target_name = ast_rel.get('target_entity', '')

                        # Resolve to entity IDs
                        source_ent = entity_by_name.get(source_name.lower()) or entity_by_name.get(source_name)
                        target_ent = entity_by_name.get(target_name.lower()) or entity_by_name.get(target_name)

                        if source_ent and target_ent and source_ent.get('id') != target_ent.get('id'):
                            cross_relations.append({
                                'source_id': source_ent.get('id'),
                                'target_id': target_ent.get('id'),
                                'type': ast_rel.get('relationship_type', 'REFERENCES').upper(),
                                'properties': {
                                    'source_file': ast_rel.get('metadata', {}).get('source_file', ''),
                                    'target_file': ast_rel.get('metadata', {}).get('target_file', ''),
                                    'discovered_by': 'ast_analysis',
                                    'line': ast_rel.get('metadata', {}).get('line', 0)
                                },
                                'confidence': ast_rel.get('relationship_strength', 0.95)
                            })

                    if ast_relations:
                        self._log_progress(
                            f"🌳 AST analysis found {len(ast_relations)} relations",
                            "relations"
                        )
        except ImportError:
            pass  # AST adapter not available
        except Exception as e:
            import traceback
            self._log_progress(f"AST analysis failed: {e}", "debug")

        # ========================================================================
        # PHASE 2: Entity property analysis
        # ========================================================================

        def to_list(val):
            if isinstance(val, str):
                return [val] if val else []
            if isinstance(val, list):
                return val
            return []

        for ent in entities:
            props = ent.get('properties', {})
            ent_id = ent.get('id', '')

            citation = ent.get('citation')
            source_file = ''
            if citation:
                source_file = (citation.get('file_path', '')
                               if isinstance(citation, dict)
                               else getattr(citation, 'file_path', ''))

            # Property-based references
            all_refs = [
                (to_list(props.get('imports', [])), 'IMPORTS'),
                (to_list(props.get('dependencies', [])), 'DEPENDS_ON'),
                (to_list(props.get('extends', props.get('parent_class', ''))), 'EXTENDS'),
                (to_list(props.get('implements', [])), 'IMPLEMENTS'),
                (to_list(props.get('uses', props.get('calls', []))), 'USES'),
                (to_list(props.get('references', props.get('links', []))), 'REFERENCES'),
            ]

            for ref_list, rel_type in all_refs:
                for ref in ref_list:
                    if not ref:
                        continue

                    ref_lower = ref.lower() if isinstance(ref, str) else str(ref).lower()
                    target_ent = entity_by_name.get(ref_lower) or entity_by_name.get(ref)

                    if not target_ent and ('/' in ref_lower or '.' in ref_lower):
                        clean_ref = ref_lower.split('/')[-1].split('.')[-1]
                        target_ent = entity_by_name.get(clean_ref)

                    if target_ent and target_ent.get('id') != ent_id:
                        target_citation = target_ent.get('citation')
                        target_file = ''
                        if target_citation:
                            target_file = (target_citation.get('file_path', '')
                                           if isinstance(target_citation, dict)
                                           else getattr(target_citation, 'file_path', ''))

                        if target_file and source_file and target_file != source_file:
                            cross_relations.append({
                                'source_id': ent_id,
                                'target_id': target_ent.get('id'),
                                'type': rel_type,
                                'properties': {
                                    'source_file': source_file,
                                    'target_file': target_file,
                                    'discovered_by': 'property_analysis',
                                    'reference_name': ref
                                },
                                'confidence': 0.9
                            })

        # Deduplicate
        seen = set()
        unique_relations = []
        for rel in cross_relations:
            key = (rel['source_id'], rel['target_id'], rel['type'])
            if key not in seen:
                seen.add(key)
                unique_relations.append(rel)

        return unique_relations

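For orientation, a relation record produced by the phases above has the shape sketched below. This is an illustrative example only; the IDs, file paths, and reference name are made up, and the confidence values mirror the constants used in the code (0.7 for mentions, 0.9 for property analysis, ~0.95 for AST relations).

```python
# Hypothetical cross-file relation as appended to cross_relations above;
# IDs and paths are illustrative placeholders, not real package data.
example_relation = {
    'source_id': 'entity-123',                  # hypothetical entity ID
    'target_id': 'entity-456',                  # hypothetical entity ID
    'type': 'IMPORTS',                          # or MENTIONS / EXTENDS / USES / REFERENCES ...
    'properties': {
        'source_file': 'src/app/service.py',    # illustrative path
        'target_file': 'src/app/models.py',     # illustrative path
        'discovered_by': 'property_analysis',   # or 'pattern:<name>' / 'content_mention' / 'ast_analysis'
        'reference_name': 'models',
    },
    'confidence': 0.9,
}
```
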
    def run(
        self,
        source: str,
        branch: Optional[str] = None,
        whitelist: Optional[List[str]] = None,
        blacklist: Optional[List[str]] = None,
        extract_relations: bool = True,
        resume: bool = True,
        max_documents: Optional[int] = None,
        **loader_kwargs
    ) -> IngestionResult:
        """
        Run the full ingestion pipeline with checkpoint support for resumability.

        Args:
            source: Name of source toolkit (must be in source_toolkits)
            branch: Branch to analyze (optional, uses default if not specified)
            whitelist: File patterns to include (e.g., ['*.py', '*.js'])
            blacklist: File patterns to exclude (e.g., ['*test*', '*vendor*'])
            extract_relations: Whether to extract relations between entities
            resume: If True, try to resume from last checkpoint
            max_documents: Maximum number of documents to process (for testing)
            **loader_kwargs: Additional arguments for the toolkit's loader

        Returns:
            IngestionResult with statistics and any errors
        """
        import time
        start_time = time.time()
        result = IngestionResult(source=source)

        # Validate source toolkit
        if source not in self.source_toolkits:
            available = list(self.source_toolkits.keys()) if self.source_toolkits else ['none']
            result.success = False
            result.errors.append(f"Toolkit '{source}' not found. Available: {', '.join(available)}")
            return result

        toolkit = self.source_toolkits[source]

        # Check for loader method
        if not hasattr(toolkit, 'loader'):
            result.success = False
            result.errors.append(f"Toolkit '{source}' does not have a loader method")
            return result

        # Ensure extractors are initialized
        if not self._init_extractors():
            result.success = False
            result.errors.append("LLM not configured - cannot extract entities")
            return result

        # Try to load existing checkpoint if resume is enabled
        checkpoint = None
        is_incremental_update = False
        if resume:
            checkpoint = self._load_checkpoint(source)
            if checkpoint:
                if checkpoint.completed:
                    # Completed checkpoint - use for incremental update
                    is_incremental_update = True
                    num_tracked = len(checkpoint.file_hashes)
                    self._log_progress(
                        f"📋 Incremental update: tracking {num_tracked} files for changes",
                        "incremental"
                    )
                    # Reset counters for new run but keep file hashes
                    checkpoint.completed = False
                    checkpoint.phase = "extract"
                    checkpoint.pending_entities = []
                    checkpoint.errors = []
                else:
                    # Incomplete checkpoint - resume from failure
                    self._log_progress(
                        f"📋 Resuming from checkpoint: {checkpoint.documents_processed} docs already processed",
                        "resume"
                    )
                    result.resumed_from_checkpoint = True
                    # Restore progress from checkpoint
                    result.documents_processed = checkpoint.documents_processed
                    result.entities_added = checkpoint.entities_added

        # Create new checkpoint if no existing one
        if not checkpoint:
            checkpoint = IngestionCheckpoint.create(
                source=source,
                branch=branch,
                whitelist=whitelist,
                blacklist=blacklist,
                extract_relations=extract_relations,
            )

        self._current_checkpoint = checkpoint

        self._log_progress(f"🚀 Starting ingestion from {source}", "start")

        # Build loader kwargs
        loader_args = {**loader_kwargs}
        if branch:
            loader_args['branch'] = branch
        if whitelist:
            loader_args['whitelist'] = whitelist
        if blacklist:
            loader_args['blacklist'] = blacklist

        if loader_args:
            params_str = ", ".join(f"{k}={v}" for k, v in loader_args.items() if v is not None)
            self._log_progress(f"📋 Loader params: {params_str}", "config")

        try:
            # ========== STREAMING APPROACH ==========
            # Read files once, create raw doc + chunks on the fly
            # Process in batches to limit memory usage

            self._log_progress(f"📥 Fetching documents from {source}...", "fetch")

            # Note: We don't pre-count files to avoid iterating twice
            # The toolkit's loader() will log progress as it goes

            # Import chunker for on-the-fly chunking
            try:
                from alita_sdk.tools.chunkers.universal_chunker import chunk_single_document
                from alita_sdk.tools.chunkers.code.codeparser import parse_code_files_for_db
                from langchain.text_splitter import RecursiveCharacterTextSplitter
                has_chunker = True

                # Create text splitter for non-code files
                text_splitter = RecursiveCharacterTextSplitter(
                    chunk_size=1000,
                    chunk_overlap=100,
                    length_function=len,
                )

                # Code extensions that use tree-sitter
                CODE_EXTENSIONS = {
                    '.py', '.js', '.jsx', '.mjs', '.cjs', '.ts', '.tsx',
                    '.java', '.kt', '.rs', '.go', '.cpp', '.c', '.cs',
                    '.hs', '.rb', '.scala', '.lua'
                }

                def chunk_document_direct(doc: Document) -> List[Document]:
                    """Chunk a single document directly without buffering."""
                    file_path = doc.metadata.get('file_path', '')
                    ext = Path(file_path).suffix.lower()

                    if ext in CODE_EXTENSIONS:
                        # Use code parser directly
                        try:
                            chunks = list(parse_code_files_for_db([{
                                'file_name': file_path,
                                'file_content': doc.page_content,
                                'commit_hash': doc.metadata.get('commit_hash', ''),
                            }]))
                            # Ensure file_path is preserved
                            for chunk in chunks:
                                if 'file_path' not in chunk.metadata:
                                    chunk.metadata['file_path'] = file_path
                            return chunks if chunks else [doc]
                        except Exception as e:
                            logger.debug(f"Code chunking failed for {file_path}: {e}")
                            return [doc]
                    else:
                        # Use text splitter
                        try:
                            chunks = text_splitter.split_documents([doc])
                            for idx, chunk in enumerate(chunks, 1):
                                chunk.metadata['chunk_id'] = idx
                            return chunks if chunks else [doc]
                        except Exception:
                            return [doc]

            except ImportError:
                has_chunker = False
                chunk_document_direct = None
                logger.warning("Chunkers not available, using raw documents")

            # Get schema
            schema = self._knowledge_graph.get_schema()
            all_entities = list(checkpoint.pending_entities) if checkpoint.pending_entities else []
            all_parser_relationships = []  # Collect parser-extracted relationships

            checkpoint.phase = "extract"
            self._log_progress(
                f"🔍 Extracting entities (parallel batches of {self.batch_size}, "
                f"max {self.max_parallel_extractions} concurrent)...",
                "extract"
            )

            # ========== STREAMING FILE PROCESSING ==========
            # Process files one at a time, creating chunks on-the-fly
            file_batch = []
            total_batches_processed = 0
            files_seen = 0
            streaming_start = time.time()
            total_chunk_time = 0.0

            # Stream raw documents (read once)
            loader_args['chunked'] = False
            for raw_doc in toolkit.loader(**loader_args):
                file_path = (raw_doc.metadata.get('file_path') or
                             raw_doc.metadata.get('file_name') or
                             raw_doc.metadata.get('source', 'unknown'))
                files_seen += 1

                # Check document limit (for testing)
                if max_documents and result.documents_processed >= max_documents:
                    # Process remaining batch if any
                    if file_batch:
                        self._process_file_batch_and_update_graph(
                            file_batch, {}, source, schema, checkpoint, result,
                            all_entities, all_parser_relationships, is_incremental_update
                        )
                    self._log_progress(
                        f"⚠️ Reached document limit ({max_documents}), stopping...",
                        "limit"
                    )
                    break

                # Normalize document
                normalized = self._normalize_document(raw_doc, source)
                if not normalized:
                    continue

                # For incremental updates, check if file changed
                if is_incremental_update:
                    content_hash = hashlib.sha256(normalized.page_content.encode()).hexdigest()
                    if not checkpoint.has_file_changed(file_path, content_hash):
                        result.documents_skipped += 1
                        continue
                    else:
                        # File has changed - remove old entities before reprocessing
                        removed = self._knowledge_graph.remove_entities_by_file(file_path)
                        if removed > 0:
                            result.entities_removed += removed
                            logger.debug(f"Removed {removed} stale entities from {file_path}")

                # Skip if already processed in current run (resuming from checkpoint)
                if not is_incremental_update and checkpoint.is_file_processed(file_path):
                    result.documents_skipped += 1
                    continue

                # Create chunks on-the-fly from this single document
                chunk_start = time.time()
                if has_chunker and chunk_document_direct:
                    # Direct chunking - no buffering overhead
                    chunks = chunk_document_direct(normalized)
                else:
                    # No chunker - use raw doc as single chunk
                    chunks = [normalized]
                chunk_time = time.time() - chunk_start
                total_chunk_time += chunk_time
                if chunk_time > 0.1:  # Log if chunking takes > 100ms
                    logger.info(f"⏱️ [TIMING] Chunking: {chunk_time:.3f}s ({len(chunks)} chunks) for {Path(file_path).name}")

                # Add to current batch: (file_path, chunks, raw_doc)
                file_batch.append((file_path, chunks, normalized))

                # Process batch when it reaches batch_size
                if len(file_batch) >= self.batch_size:
                    batch_num = total_batches_processed + 1
                    self._log_progress(
                        f"⚡ Processing batch {batch_num} ({len(file_batch)} files, file #{files_seen})...",
                        "batch"
                    )

                    self._process_file_batch_and_update_graph(
                        file_batch, {}, source, schema, checkpoint, result,
                        all_entities, all_parser_relationships, is_incremental_update
                    )

                    total_batches_processed += 1
                    file_batch = []  # Reset batch

                    # Save checkpoint after each batch
                    checkpoint.documents_processed = result.documents_processed
                    checkpoint.entities_added = result.entities_added
                    self._save_checkpoint(checkpoint)
                    self._auto_save()

                    self._log_progress(
                        f"📄 Processed {result.documents_processed} files | "
                        f"📊 {result.entities_added} entities | 💾 Checkpoint saved",
                        "progress"
                    )

            # Process remaining files in final batch
            if file_batch:
                batch_num = total_batches_processed + 1
                self._log_progress(
                    f"⚡ Processing final batch {batch_num} ({len(file_batch)} files)...",
                    "batch"
                )
                self._process_file_batch_and_update_graph(
                    file_batch, {}, source, schema, checkpoint, result,
                    all_entities, all_parser_relationships, is_incremental_update
                )

            streaming_duration = time.time() - streaming_start
            logger.info(f"⏱️ [TIMING] Streaming phase complete: {streaming_duration:.3f}s total, {total_chunk_time:.3f}s chunking, {total_batches_processed + 1} batches")

            # Report skipped files before relation extraction
            if result.documents_skipped > 0:
                self._log_progress(
                    f"⏭️ Skipped {result.documents_skipped} unchanged files",
                    "progress"
                )

            # Update checkpoint before relation extraction
            checkpoint.documents_processed = result.documents_processed
            checkpoint.entities_added = result.entities_added
            checkpoint.pending_entities = [
                {'id': e['id'], 'name': e['name'], 'type': e['type'],
                 'file_path': (e['citation'].file_path if hasattr(e.get('citation'), 'file_path')
                               else e.get('citation', {}).get('file_path', e.get('file_path', ''))),
                 'properties': e.get('properties', {})}
                for e in all_entities
            ]
            self._save_checkpoint(checkpoint)

            # Extract relations
            if extract_relations and all_entities:
                checkpoint.phase = "relations"
                self._save_checkpoint(checkpoint)
                relations_phase_start = time.time()

                # Get ALL entities from graph (existing + new) for relation resolution
                # This enables cross-source relations (e.g., github entities referencing confluence entities)
                graph_entities = self._knowledge_graph.get_all_entities()

                # ========== PARSER RELATIONSHIPS (no LLM) ==========
                # Add parser-extracted relationships directly to graph
                parser_rel_start = time.time()
                if all_parser_relationships:
                    self._log_progress(
                        f"🔗 Adding {len(all_parser_relationships)} parser-extracted relationships...",
                        "relations"
                    )

                    # Build entity lookup for ID resolution
                    entity_by_name = {}
                    for e in graph_entities:
                        name_lower = e.get('name', '').lower()
                        entity_by_name[name_lower] = e.get('id')
                        # Also map full qualified names
                        full_name = e.get('properties', {}).get('full_name', '')
                        if full_name:
                            entity_by_name[full_name.lower()] = e.get('id')

                    for rel in all_parser_relationships:
                        # Check for pre-resolved IDs (used for containment edges)
                        source_id = rel.get('_resolved_source_id')
                        target_id = rel.get('_resolved_target_id')

                        # Fall back to name-based resolution if not pre-resolved
                        if not source_id or not target_id:
                            source_name = rel.get('source_symbol', '').lower()
                            target_name = rel.get('target_symbol', '').lower()

                            source_id = source_id or entity_by_name.get(source_name)
                            target_id = target_id or entity_by_name.get(target_name)

                        if source_id and target_id:
                            properties = {
                                'source_toolkit': rel.get('source_toolkit', source),
                                'confidence': rel.get('confidence', 1.0),
                                'source': 'parser',
                                'discovered_in_file': rel.get('source_file'),
                            }
                            if rel.get('is_cross_file'):
                                properties['is_cross_file'] = True

                            success = self._knowledge_graph.add_relation(
                                source_id=source_id,
                                target_id=target_id,
                                relation_type=rel.get('relation_type', 'references'),
                                properties=properties
                            )
                            if success:
                                result.relations_added += 1

                    parser_rel_duration = time.time() - parser_rel_start
                    logger.info(f"⏱️ [TIMING] Parser relations: {parser_rel_duration:.3f}s for {len(all_parser_relationships)} relationships")

                # ========== LLM RELATIONSHIPS (semantic) ==========
                llm_rel_start = time.time()
                self._log_progress(
                    f"🔗 Extracting semantic relations from {len(all_entities)} new entities "
                    f"(graph has {len(graph_entities)} total)...",
                    "relations"
                )

                # Pass all graph entities for ID resolution, but only extract from new docs
                relations = self._extract_relations(all_entities, schema, all_graph_entities=graph_entities)

                for rel in relations:
                    # Merge source information into properties
                    properties = rel.get('properties', {})
                    if 'source_toolkit' not in properties:
                        # Fallback: add current source if not already set
                        properties['source_toolkit'] = source
                    properties['source'] = 'llm'  # Mark as LLM-extracted

                    success = self._knowledge_graph.add_relation(
                        source_id=rel.get('source_id'),
                        target_id=rel.get('target_id'),
                        relation_type=rel.get('relation_type', 'RELATED_TO'),
                        properties=properties
                    )
                    if success:
                        result.relations_added += 1

                llm_rel_duration = time.time() - llm_rel_start
                relations_phase_duration = time.time() - relations_phase_start
                logger.info(f"⏱️ [TIMING] LLM relations: {llm_rel_duration:.3f}s")
                logger.info(f"⏱️ [TIMING] Relations phase total: {relations_phase_duration:.3f}s")

            # Save final graph
            self._auto_save()

            # Mark checkpoint as complete - keep it for incremental updates
            checkpoint.completed = True
            checkpoint.phase = "complete"
            checkpoint.relations_added = result.relations_added
            checkpoint.pending_entities = []  # Clear pending entities to save space
            self._save_checkpoint(checkpoint)
            # Note: We keep the checkpoint for incremental updates (file hash tracking)

            result.graph_stats = self._knowledge_graph.get_stats()
            result.duration_seconds = time.time() - start_time

            # Report any failed documents
            if result.failed_documents:
                self._log_progress(
                    f"⚠️ {len(result.failed_documents)} documents failed to process",
                    "warning"
                )

            # Build completion message
            completion_msg = (
                f"✅ Ingestion complete! {result.entities_added} entities, "
                f"{result.relations_added} relations in {result.duration_seconds:.1f}s"
            )
            if result.documents_skipped > 0:
                completion_msg += f" ({result.documents_skipped} unchanged files skipped)"

            self._log_progress(completion_msg, "complete")

        except Exception as e:
            logger.exception(f"Ingestion failed: {e}")
            result.success = False
            result.errors.append(str(e))
            result.duration_seconds = time.time() - start_time

            # Save checkpoint on failure for resume
            checkpoint.errors.append(str(e))
            checkpoint.documents_processed = result.documents_processed
            checkpoint.entities_added = result.entities_added
            self._save_checkpoint(checkpoint)
            self._auto_save()  # Save graph progress

            self._log_progress(
                f"❌ Ingestion failed. Checkpoint saved for resume. "
                f"Processed {result.documents_processed} docs before failure.",
                "error"
            )

        return result

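A minimal usage sketch of the streaming `run()` entry point, assuming a pipeline was already constructed as in `ingest_repository()` further below; the source name, patterns, and document cap are illustrative placeholders.

```python
# Hypothetical usage; `pipeline` is an IngestionPipeline wired with an LLM,
# a graph path, and a registered source toolkit (see ingest_repository below).
result = pipeline.run(
    source="github",              # must match a key in source_toolkits
    branch="main",                # optional; loader default is used otherwise
    whitelist=["*.py", "*.md"],   # illustrative include patterns
    blacklist=["*test*"],
    extract_relations=True,
    resume=True,                  # reuse the checkpoint: incremental update or crash resume
    max_documents=50,             # cap for smoke tests; omit for a full run
)

if result.success:
    print(f"{result.entities_added} entities, {result.relations_added} relations "
          f"in {result.duration_seconds:.1f}s")
else:
    print("Errors:", result.errors)
```
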
    def run_from_generator(
        self,
        documents: Generator[Document, None, None],
        source: str = "custom",
        extract_relations: bool = True
    ) -> IngestionResult:
        """
        Run ingestion from a pre-built document generator.

        Use this when you have your own document source that's not
        a standard toolkit (e.g., custom loader, S3 files, etc.).

        Args:
            documents: Generator yielding LangChain Documents
            source: Name to identify the source in citations
            extract_relations: Whether to extract relations

        Returns:
            IngestionResult with statistics
        """
        import time
        start_time = time.time()
        result = IngestionResult(source=source)

        if not self._init_extractors():
            result.success = False
            result.errors.append("LLM not configured")
            return result

        self._log_progress(f"🚀 Starting ingestion from {source} generator", "start")

        schema = self._knowledge_graph.get_schema()
        all_entities = []

        try:
            for doc in documents:
                normalized = self._normalize_document(doc, source)
                if not normalized:
                    continue

                result.documents_processed += 1
                entities, extraction_failures = self._extract_entities_from_doc(normalized, source, schema)

                # Track extraction failures
                if extraction_failures:
                    for failed_path in extraction_failures:
                        if failed_path not in result.failed_documents:
                            result.failed_documents.append(failed_path)

                for entity in entities:
                    self._knowledge_graph.add_entity(
                        entity_id=entity['id'],
                        name=entity['name'],
                        entity_type=entity['type'],
                        citation=entity['citation'],
                        properties=entity['properties']
                    )
                    result.entities_added += 1
                    all_entities.append(entity)

                if result.documents_processed % 10 == 0:
                    self._log_progress(
                        f"📄 {result.documents_processed} docs | 📊 {result.entities_added} entities",
                        "progress"
                    )

            if extract_relations and all_entities:
                graph_entities = self._knowledge_graph.get_all_entities()
                self._log_progress(
                    f"🔗 Extracting relations from {len(all_entities)} new entities "
                    f"(graph has {len(graph_entities)} total)...",
                    "relations"
                )
                relations = self._extract_relations(all_entities, schema, all_graph_entities=graph_entities)

                for rel in relations:
                    # Merge source information into properties
                    properties = rel.get('properties', {})
                    if 'source_toolkit' not in properties:
                        # Add current source if not already set
                        properties['source_toolkit'] = source

                    if self._knowledge_graph.add_relation(
                        source_id=rel.get('source_id'),
                        target_id=rel.get('target_id'),
                        relation_type=rel.get('relation_type', 'RELATED_TO'),
                        properties=properties
                    ):
                        result.relations_added += 1

            self._auto_save()
            result.graph_stats = self._knowledge_graph.get_stats()
            result.duration_seconds = time.time() - start_time

            self._log_progress(f"✅ Complete! {result}", "complete")

        except Exception as e:
            logger.exception(f"Ingestion failed: {e}")
            result.success = False
            result.errors.append(str(e))
            result.duration_seconds = time.time() - start_time

        return result

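A sketch of feeding `run_from_generator()` from a custom document source. The directory, source name, and the `Document` import location are assumptions for illustration, not part of the package.

```python
from pathlib import Path
from langchain_core.documents import Document  # assumed import location for Document

def my_docs():
    # Hypothetical generator over local markdown files.
    for p in Path("./docs").glob("*.md"):
        yield Document(
            page_content=p.read_text(encoding="utf-8"),
            metadata={"file_path": str(p)},
        )

result = pipeline.run_from_generator(
    documents=my_docs(),
    source="local-docs",       # label used in citations
    extract_relations=True,
)
print(result.graph_stats)
```
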
    def delta_update(
        self,
        source: str,
        file_paths: List[str],
        extract_relations: bool = True
    ) -> IngestionResult:
        """
        Perform delta update for changed files.

        1. Removes existing entities from the specified files
        2. Re-fetches and re-analyzes those files
        3. Adds new entities with fresh citations

        Args:
            source: Name of source toolkit
            file_paths: List of file paths that have changed
            extract_relations: Whether to extract relations

        Returns:
            IngestionResult with statistics including entities removed
        """
        import time
        start_time = time.time()
        result = IngestionResult(source=source)

        self._log_progress(f"🔄 Delta update for {len(file_paths)} files from {source}", "start")

        # Remove stale entities
        for file_path in file_paths:
            removed = self._knowledge_graph.remove_entities_by_file(file_path)
            result.entities_removed += removed

        self._log_progress(f"🗑️ Removed {result.entities_removed} stale entities", "cleanup")

        # Re-ingest the changed files
        if source not in self.source_toolkits:
            # Fall back to local file read if toolkit not available
            self._log_progress("📁 Reading files locally (toolkit not available)", "local")

            from pathlib import Path

            def local_loader():
                for file_path in file_paths:
                    try:
                        content = Path(file_path).read_text(encoding='utf-8')
                        yield Document(
                            page_content=content,
                            metadata={'file_path': file_path, 'source_toolkit': 'filesystem'}
                        )
                    except Exception as e:
                        logger.warning(f"Could not read {file_path}: {e}")

            ingest_result = self.run_from_generator(
                documents=local_loader(),
                source='filesystem',
                extract_relations=extract_relations
            )
        else:
            # Use toolkit to fetch specific files
            toolkit = self.source_toolkits[source]

            # Try to use toolkit's file-specific loader if available
            if hasattr(toolkit, 'get_files_content'):
                def file_loader():
                    for file_path in file_paths:
                        try:
                            content = toolkit.get_files_content(file_path)
                            if content:
                                yield Document(
                                    page_content=content,
                                    metadata={'file_path': file_path, 'source_toolkit': source}
                                )
                        except Exception as e:
                            logger.warning(f"Could not fetch {file_path}: {e}")

                ingest_result = self.run_from_generator(
                    documents=file_loader(),
                    source=source,
                    extract_relations=extract_relations
                )
            else:
                # Run full ingestion with whitelist
                ingest_result = self.run(
                    source=source,
                    whitelist=file_paths,
                    extract_relations=extract_relations
                )

        # Merge results
        result.documents_processed = ingest_result.documents_processed
        result.entities_added = ingest_result.entities_added
        result.relations_added = ingest_result.relations_added
        result.errors.extend(ingest_result.errors)
        result.success = ingest_result.success
        result.graph_stats = ingest_result.graph_stats
        result.duration_seconds = time.time() - start_time

        self._log_progress(
            f"✅ Delta update complete: -{result.entities_removed} +{result.entities_added}",
            "complete"
        )

        return result

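A sketch of a delta update after a handful of files changed; the file paths and the way changes were detected are illustrative assumptions.

```python
# Hypothetical list of files reported as changed, e.g. by a webhook or `git diff`.
changed = ["src/app/service.py", "src/app/models.py"]

result = pipeline.delta_update(
    source="github",           # falls back to local reads if this toolkit is not registered
    file_paths=changed,
    extract_relations=True,
)
print(f"-{result.entities_removed} stale, +{result.entities_added} fresh entities")
```
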
    def discover_schema(self, sample_file_paths: List[str]) -> Dict[str, Any]:
        """
        Discover entity types from sample files using LLM.

        Useful for customizing extraction for domain-specific codebases.

        Args:
            sample_file_paths: Paths to sample files for schema discovery

        Returns:
            Discovered schema with entity_types and relation_types
        """
        if not self._init_extractors():
            return {'error': 'LLM not configured'}

        self._log_progress(f"🔍 Discovering schema from {len(sample_file_paths)} samples", "schema")

        from pathlib import Path
        docs = []

        for file_path in sample_file_paths[:10]:
            try:
                content = Path(file_path).read_text(encoding='utf-8')
                docs.append(Document(
                    page_content=content[:5000],
                    metadata={'file_path': file_path}
                ))
            except Exception as e:
                logger.warning(f"Could not read {file_path}: {e}")

        if not docs:
            return {'error': 'Could not read any sample files'}

        schema = self._schema_discoverer.discover(docs)
        self._knowledge_graph.set_schema(schema)
        self._auto_save()

        self._log_progress(
            f"✅ Discovered {len(schema.get('entity_types', []))} entity types, "
            f"{len(schema.get('relation_types', []))} relation types",
            "schema"
        )

        return schema

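A sketch of seeding the graph schema from a few representative files before a full run; the file list is illustrative. Only the first 10 samples are read, each truncated to 5000 characters, per the code above.

```python
# Hypothetical sample files for schema discovery; paths are illustrative only.
schema = pipeline.discover_schema([
    "src/app/service.py",
    "src/app/models.py",
    "docs/architecture.md",
])
if "error" not in schema:
    print(schema.get("entity_types", []))
    print(schema.get("relation_types", []))
```
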
    def get_stats(self) -> Dict[str, Any]:
        """Get current graph statistics."""
        return self._knowledge_graph.get_stats()

    def export(self, path: Optional[str] = None) -> str:
        """Export graph to JSON."""
        export_path = path or self.graph_path
        self._knowledge_graph.dump_to_json(export_path)
        return export_path

    def register_toolkit(self, name: str, toolkit: Any) -> None:
        """Register a source toolkit for ingestion."""
        self.source_toolkits[name] = toolkit
        logger.info(f"Registered toolkit: {name}")

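A sketch of wiring a pipeline by hand instead of through the convenience function below; `my_toolkit` is a placeholder for any object exposing a `loader()` generator.

```python
# Hypothetical manual wiring; `my_toolkit` stands in for a real source toolkit.
pipeline.register_toolkit("my-source", my_toolkit)
print(pipeline.get_stats())                  # current graph statistics
saved_to = pipeline.export("./graph.json")   # dump the graph to JSON
```
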
# Convenience function for one-shot ingestion
def ingest_repository(
    llm: Any,
    graph_path: str,
    source_toolkit: Any,
    source_name: str = "repository",
    branch: Optional[str] = None,
    whitelist: Optional[List[str]] = None,
    blacklist: Optional[List[str]] = None,
    extract_relations: bool = True,
    progress_callback: Optional[Callable] = None,
) -> IngestionResult:
    """
    Convenience function for one-shot repository ingestion.

    Args:
        llm: LangChain LLM instance
        graph_path: Where to save the graph JSON
        source_toolkit: Toolkit instance with loader() method
        source_name: Name for the source in citations
        branch: Branch to analyze
        whitelist: File patterns to include
        blacklist: File patterns to exclude
        extract_relations: Whether to extract relations
        progress_callback: Optional callback for progress updates

    Returns:
        IngestionResult with statistics

    Example:
        from alita_sdk.community.github.api_wrapper import GitHubApiWrapper

        github = GitHubApiWrapper(
            api_base="...",
            api_key="...",
            repository="owner/repo"
        )

        result = ingest_repository(
            llm=llm,
            graph_path="./graph.json",
            source_toolkit=github,
            source_name="github",
            branch="main",
            whitelist=["*.py"],
            progress_callback=lambda msg, phase: print(f"[{phase}] {msg}")
        )
    """
    pipeline = IngestionPipeline(
        llm=llm,
        graph_path=graph_path,
        source_toolkits={source_name: source_toolkit},
        progress_callback=progress_callback,
    )

    return pipeline.run(
        source=source_name,
        branch=branch,
        whitelist=whitelist,
        blacklist=blacklist,
        extract_relations=extract_relations,
    )